author      André Fabian Silva Delgado <emulatorman@parabola.nu>  2015-08-05 17:04:01 -0300
committer   André Fabian Silva Delgado <emulatorman@parabola.nu>  2015-08-05 17:04:01 -0300
commit      57f0f512b273f60d52568b8c6b77e17f5636edc0 (patch)
tree        5e910f0e82173f4ef4f51111366a3f1299037a7b /drivers/infiniband
Initial import
Diffstat (limited to 'drivers/infiniband')
-rw-r--r--  drivers/infiniband/Kconfig | 78
-rw-r--r--  drivers/infiniband/Makefile | 3
-rw-r--r--  drivers/infiniband/core/Makefile | 34
-rw-r--r--  drivers/infiniband/core/addr.c | 558
-rw-r--r--  drivers/infiniband/core/agent.c | 217
-rw-r--r--  drivers/infiniband/core/agent.h | 51
-rw-r--r--  drivers/infiniband/core/cache.c | 439
-rw-r--r--  drivers/infiniband/core/cm.c | 3931
-rw-r--r--  drivers/infiniband/core/cm_msgs.h | 836
-rw-r--r--  drivers/infiniband/core/cma.c | 3720
-rw-r--r--  drivers/infiniband/core/core_priv.h | 54
-rw-r--r--  drivers/infiniband/core/device.c | 785
-rw-r--r--  drivers/infiniband/core/fmr_pool.c | 544
-rw-r--r--  drivers/infiniband/core/iwcm.c | 1069
-rw-r--r--  drivers/infiniband/core/iwcm.h | 62
-rw-r--r--  drivers/infiniband/core/iwpm_msg.c | 756
-rw-r--r--  drivers/infiniband/core/iwpm_util.c | 749
-rw-r--r--  drivers/infiniband/core/iwpm_util.h | 253
-rw-r--r--  drivers/infiniband/core/mad.c | 3176
-rw-r--r--  drivers/infiniband/core/mad_priv.h | 227
-rw-r--r--  drivers/infiniband/core/mad_rmpp.c | 953
-rw-r--r--  drivers/infiniband/core/mad_rmpp.h | 58
-rw-r--r--  drivers/infiniband/core/multicast.c | 903
-rw-r--r--  drivers/infiniband/core/netlink.c | 216
-rw-r--r--  drivers/infiniband/core/packer.c | 203
-rw-r--r--  drivers/infiniband/core/sa.h | 66
-rw-r--r--  drivers/infiniband/core/sa_query.c | 1280
-rw-r--r--  drivers/infiniband/core/smi.c | 253
-rw-r--r--  drivers/infiniband/core/smi.h | 90
-rw-r--r--  drivers/infiniband/core/sysfs.c | 922
-rw-r--r--  drivers/infiniband/core/ucm.c | 1370
-rw-r--r--  drivers/infiniband/core/ucma.c | 1635
-rw-r--r--  drivers/infiniband/core/ud_header.c | 414
-rw-r--r--  drivers/infiniband/core/umem.c | 365
-rw-r--r--  drivers/infiniband/core/umem_odp.c | 669
-rw-r--r--  drivers/infiniband/core/umem_rbtree.c | 94
-rw-r--r--  drivers/infiniband/core/user_mad.c | 1390
-rw-r--r--  drivers/infiniband/core/uverbs.h | 263
-rw-r--r--  drivers/infiniband/core/uverbs_cmd.c | 3357
-rw-r--r--  drivers/infiniband/core/uverbs_main.c | 1039
-rw-r--r--  drivers/infiniband/core/uverbs_marshall.c | 148
-rw-r--r--  drivers/infiniband/core/verbs.c | 1448
-rw-r--r--  drivers/infiniband/hw/Makefile | 12
-rw-r--r--  drivers/infiniband/hw/amso1100/Kbuild | 6
-rw-r--r--  drivers/infiniband/hw/amso1100/Kconfig | 15
-rw-r--r--  drivers/infiniband/hw/amso1100/c2.c | 1241
-rw-r--r--  drivers/infiniband/hw/amso1100/c2.h | 547
-rw-r--r--  drivers/infiniband/hw/amso1100/c2_ae.c | 327
-rw-r--r--  drivers/infiniband/hw/amso1100/c2_ae.h | 108
-rw-r--r--  drivers/infiniband/hw/amso1100/c2_alloc.c | 142
-rw-r--r--  drivers/infiniband/hw/amso1100/c2_cm.c | 461
-rw-r--r--  drivers/infiniband/hw/amso1100/c2_cq.c | 440
-rw-r--r--  drivers/infiniband/hw/amso1100/c2_intr.c | 219
-rw-r--r--  drivers/infiniband/hw/amso1100/c2_mm.c | 377
-rw-r--r--  drivers/infiniband/hw/amso1100/c2_mq.c | 174
-rw-r--r--  drivers/infiniband/hw/amso1100/c2_mq.h | 106
-rw-r--r--  drivers/infiniband/hw/amso1100/c2_pd.c | 90
-rw-r--r--  drivers/infiniband/hw/amso1100/c2_provider.c | 882
-rw-r--r--  drivers/infiniband/hw/amso1100/c2_provider.h | 182
-rw-r--r--  drivers/infiniband/hw/amso1100/c2_qp.c | 1024
-rw-r--r--  drivers/infiniband/hw/amso1100/c2_rnic.c | 655
-rw-r--r--  drivers/infiniband/hw/amso1100/c2_status.h | 158
-rw-r--r--  drivers/infiniband/hw/amso1100/c2_user.h | 82
-rw-r--r--  drivers/infiniband/hw/amso1100/c2_vq.c | 260
-rw-r--r--  drivers/infiniband/hw/amso1100/c2_vq.h | 63
-rw-r--r--  drivers/infiniband/hw/amso1100/c2_wr.h | 1520
-rw-r--r--  drivers/infiniband/hw/cxgb3/Kconfig | 27
-rw-r--r--  drivers/infiniband/hw/cxgb3/Makefile | 8
-rw-r--r--  drivers/infiniband/hw/cxgb3/cxio_dbg.c | 207
-rw-r--r--  drivers/infiniband/hw/cxgb3/cxio_hal.c | 1343
-rw-r--r--  drivers/infiniband/hw/cxgb3/cxio_hal.h | 211
-rw-r--r--  drivers/infiniband/hw/cxgb3/cxio_resource.c | 343
-rw-r--r--  drivers/infiniband/hw/cxgb3/cxio_resource.h | 69
-rw-r--r--  drivers/infiniband/hw/cxgb3/cxio_wr.h | 802
-rw-r--r--  drivers/infiniband/hw/cxgb3/iwch.c | 292
-rw-r--r--  drivers/infiniband/hw/cxgb3/iwch.h | 180
-rw-r--r--  drivers/infiniband/hw/cxgb3/iwch_cm.c | 2272
-rw-r--r--  drivers/infiniband/hw/cxgb3/iwch_cm.h | 233
-rw-r--r--  drivers/infiniband/hw/cxgb3/iwch_cq.c | 233
-rw-r--r--  drivers/infiniband/hw/cxgb3/iwch_ev.c | 232
-rw-r--r--  drivers/infiniband/hw/cxgb3/iwch_mem.c | 203
-rw-r--r--  drivers/infiniband/hw/cxgb3/iwch_provider.c | 1467
-rw-r--r--  drivers/infiniband/hw/cxgb3/iwch_provider.h | 360
-rw-r--r--  drivers/infiniband/hw/cxgb3/iwch_qp.c | 1163
-rw-r--r--  drivers/infiniband/hw/cxgb3/iwch_user.h | 74
-rw-r--r--  drivers/infiniband/hw/cxgb3/tcb.h | 632
-rw-r--r--  drivers/infiniband/hw/cxgb4/Kconfig | 18
-rw-r--r--  drivers/infiniband/hw/cxgb4/Makefile | 5
-rw-r--r--  drivers/infiniband/hw/cxgb4/cm.c | 4053
-rw-r--r--  drivers/infiniband/hw/cxgb4/cq.c | 1015
-rw-r--r--  drivers/infiniband/hw/cxgb4/device.c | 1564
-rw-r--r--  drivers/infiniband/hw/cxgb4/ev.c | 244
-rw-r--r--  drivers/infiniband/hw/cxgb4/id_table.c | 112
-rw-r--r--  drivers/infiniband/hw/cxgb4/iw_cxgb4.h | 1042
-rw-r--r--  drivers/infiniband/hw/cxgb4/mem.c | 979
-rw-r--r--  drivers/infiniband/hw/cxgb4/provider.c | 588
-rw-r--r--  drivers/infiniband/hw/cxgb4/qp.c | 1886
-rw-r--r--  drivers/infiniband/hw/cxgb4/resource.c | 453
-rw-r--r--  drivers/infiniband/hw/cxgb4/t4.h | 685
-rw-r--r--  drivers/infiniband/hw/cxgb4/t4fw_ri_api.h | 855
-rw-r--r--  drivers/infiniband/hw/cxgb4/user.h | 80
-rw-r--r--  drivers/infiniband/hw/ehca/Kconfig | 9
-rw-r--r--  drivers/infiniband/hw/ehca/Makefile | 16
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_av.c | 277
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_classes.h | 482
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_classes_pSeries.h | 208
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_cq.c | 392
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_eq.c | 189
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_hca.c | 410
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_irq.c | 870
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_irq.h | 77
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_iverbs.h | 212
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_main.c | 1100
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_mcast.c | 131
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_mrmw.c | 2593
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_mrmw.h | 132
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_pd.c | 124
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_qes.h | 260
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_qp.c | 2257
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_reqs.c | 953
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_sqp.c | 237
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_tools.h | 155
-rw-r--r--  drivers/infiniband/hw/ehca/ehca_uverbs.c | 309
-rw-r--r--  drivers/infiniband/hw/ehca/hcp_if.c | 949
-rw-r--r--  drivers/infiniband/hw/ehca/hcp_if.h | 265
-rw-r--r--  drivers/infiniband/hw/ehca/hcp_phyp.c | 82
-rw-r--r--  drivers/infiniband/hw/ehca/hcp_phyp.h | 90
-rw-r--r--  drivers/infiniband/hw/ehca/hipz_fns.h | 68
-rw-r--r--  drivers/infiniband/hw/ehca/hipz_fns_core.h | 100
-rw-r--r--  drivers/infiniband/hw/ehca/hipz_hw.h | 414
-rw-r--r--  drivers/infiniband/hw/ehca/ipz_pt_fn.c | 295
-rw-r--r--  drivers/infiniband/hw/ehca/ipz_pt_fn.h | 289
-rw-r--r--  drivers/infiniband/hw/ipath/Kconfig | 11
-rw-r--r--  drivers/infiniband/hw/ipath/Makefile | 37
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_common.h | 851
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_cq.c | 478
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_debug.h | 99
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_diag.c | 551
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_dma.c | 179
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_driver.c | 2779
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_eeprom.c | 1183
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_file_ops.c | 2620
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_fs.c | 422
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_iba6110.c | 1940
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_init_chip.c | 1066
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_intr.c | 1273
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_kernel.h | 1375
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_keys.c | 270
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_mad.c | 1513
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_mmap.c | 174
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_mr.c | 425
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_qp.c | 1080
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_rc.c | 1969
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_registers.h | 512
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_ruc.c | 734
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_sdma.c | 818
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_srq.c | 380
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_stats.c | 347
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_sysfs.c | 1238
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_uc.c | 547
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_ud.c | 580
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_user_pages.c | 229
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_user_sdma.c | 875
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_user_sdma.h | 52
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_verbs.c | 2342
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_verbs.h | 936
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_verbs_mcast.c | 364
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_wc_ppc64.c | 49
-rw-r--r--  drivers/infiniband/hw/ipath/ipath_wc_x86_64.c | 169
-rw-r--r--  drivers/infiniband/hw/mlx4/Kconfig | 10
-rw-r--r--  drivers/infiniband/hw/mlx4/Makefile | 3
-rw-r--r--  drivers/infiniband/hw/mlx4/ah.c | 178
-rw-r--r--  drivers/infiniband/hw/mlx4/alias_GUID.c | 901
-rw-r--r--  drivers/infiniband/hw/mlx4/cm.c | 478
-rw-r--r--  drivers/infiniband/hw/mlx4/cq.c | 983
-rw-r--r--  drivers/infiniband/hw/mlx4/doorbell.c | 96
-rw-r--r--  drivers/infiniband/hw/mlx4/mad.c | 2185
-rw-r--r--  drivers/infiniband/hw/mlx4/main.c | 2874
-rw-r--r--  drivers/infiniband/hw/mlx4/mcg.c | 1257
-rw-r--r--  drivers/infiniband/hw/mlx4/mlx4_ib.h | 819
-rw-r--r--  drivers/infiniband/hw/mlx4/mr.c | 525
-rw-r--r--  drivers/infiniband/hw/mlx4/qp.c | 3217
-rw-r--r--  drivers/infiniband/hw/mlx4/srq.c | 377
-rw-r--r--  drivers/infiniband/hw/mlx4/sysfs.c | 886
-rw-r--r--  drivers/infiniband/hw/mlx4/user.h | 107
-rw-r--r--  drivers/infiniband/hw/mlx5/Kconfig | 10
-rw-r--r--  drivers/infiniband/hw/mlx5/Makefile | 4
-rw-r--r--  drivers/infiniband/hw/mlx5/ah.c | 92
-rw-r--r--  drivers/infiniband/hw/mlx5/cq.c | 1189
-rw-r--r--  drivers/infiniband/hw/mlx5/doorbell.c | 98
-rw-r--r--  drivers/infiniband/hw/mlx5/mad.c | 139
-rw-r--r--  drivers/infiniband/hw/mlx5/main.c | 1397
-rw-r--r--  drivers/infiniband/hw/mlx5/mem.c | 225
-rw-r--r--  drivers/infiniband/hw/mlx5/mlx5_ib.h | 669
-rw-r--r--  drivers/infiniband/hw/mlx5/mr.c | 1479
-rw-r--r--  drivers/infiniband/hw/mlx5/odp.c | 798
-rw-r--r--  drivers/infiniband/hw/mlx5/qp.c | 3174
-rw-r--r--  drivers/infiniband/hw/mlx5/srq.c | 485
-rw-r--r--  drivers/infiniband/hw/mlx5/user.h | 133
-rw-r--r--  drivers/infiniband/hw/mthca/Kconfig | 17
-rw-r--r--  drivers/infiniband/hw/mthca/Makefile | 7
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_allocator.c | 301
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_av.c | 374
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_catas.c | 200
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_cmd.c | 1969
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_cmd.h | 325
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_config_reg.h | 48
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_cq.c | 984
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_dev.h | 596
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_doorbell.h | 109
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_eq.c | 905
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_mad.c | 341
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_main.c | 1275
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_mcg.c | 335
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_memfree.c | 760
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_memfree.h | 179
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_mr.c | 965
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_pd.c | 81
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_profile.c | 285
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_profile.h | 59
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_provider.c | 1375
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_provider.h | 344
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_qp.c | 2311
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_reset.c | 288
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_srq.c | 696
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_uar.c | 78
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_user.h | 112
-rw-r--r--  drivers/infiniband/hw/mthca/mthca_wqe.h | 131
-rw-r--r--  drivers/infiniband/hw/nes/Kconfig | 16
-rw-r--r--  drivers/infiniband/hw/nes/Makefile | 3
-rw-r--r--  drivers/infiniband/hw/nes/nes.c | 1270
-rw-r--r--  drivers/infiniband/hw/nes/nes.h | 582
-rw-r--r--  drivers/infiniband/hw/nes/nes_cm.c | 4184
-rw-r--r--  drivers/infiniband/hw/nes/nes_cm.h | 476
-rw-r--r--  drivers/infiniband/hw/nes/nes_context.h | 193
-rw-r--r--  drivers/infiniband/hw/nes/nes_hw.c | 3937
-rw-r--r--  drivers/infiniband/hw/nes/nes_hw.h | 1392
-rw-r--r--  drivers/infiniband/hw/nes/nes_mgt.c | 1160
-rw-r--r--  drivers/infiniband/hw/nes/nes_mgt.h | 97
-rw-r--r--  drivers/infiniband/hw/nes/nes_nic.c | 1883
-rw-r--r--  drivers/infiniband/hw/nes/nes_user.h | 114
-rw-r--r--  drivers/infiniband/hw/nes/nes_utils.c | 972
-rw-r--r--  drivers/infiniband/hw/nes/nes_verbs.c | 4054
-rw-r--r--  drivers/infiniband/hw/nes/nes_verbs.h | 189
-rw-r--r--  drivers/infiniband/hw/ocrdma/Kconfig | 8
-rw-r--r--  drivers/infiniband/hw/ocrdma/Makefile | 5
-rw-r--r--  drivers/infiniband/hw/ocrdma/ocrdma.h | 579
-rw-r--r--  drivers/infiniband/hw/ocrdma/ocrdma_abi.h | 134
-rw-r--r--  drivers/infiniband/hw/ocrdma/ocrdma_ah.c | 227
-rw-r--r--  drivers/infiniband/hw/ocrdma/ocrdma_ah.h | 48
-rw-r--r--  drivers/infiniband/hw/ocrdma/ocrdma_hw.c | 3172
-rw-r--r--  drivers/infiniband/hw/ocrdma/ocrdma_hw.h | 142
-rw-r--r--  drivers/infiniband/hw/ocrdma/ocrdma_main.c | 682
-rw-r--r--  drivers/infiniband/hw/ocrdma/ocrdma_sli.h | 2173
-rw-r--r--  drivers/infiniband/hw/ocrdma/ocrdma_stats.c | 857
-rw-r--r--  drivers/infiniband/hw/ocrdma/ocrdma_stats.h | 58
-rw-r--r--  drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 3194
-rw-r--r--  drivers/infiniband/hw/ocrdma/ocrdma_verbs.h | 99
-rw-r--r--  drivers/infiniband/hw/qib/Kconfig | 15
-rw-r--r--  drivers/infiniband/hw/qib/Makefile | 16
-rw-r--r--  drivers/infiniband/hw/qib/qib.h | 1543
-rw-r--r--  drivers/infiniband/hw/qib/qib_6120_regs.h | 977
-rw-r--r--  drivers/infiniband/hw/qib/qib_7220.h | 149
-rw-r--r--  drivers/infiniband/hw/qib/qib_7220_regs.h | 1496
-rw-r--r--  drivers/infiniband/hw/qib/qib_7322_regs.h | 3163
-rw-r--r--  drivers/infiniband/hw/qib/qib_common.h | 812
-rw-r--r--  drivers/infiniband/hw/qib/qib_cq.c | 540
-rw-r--r--  drivers/infiniband/hw/qib/qib_debugfs.c | 283
-rw-r--r--  drivers/infiniband/hw/qib/qib_debugfs.h | 45
-rw-r--r--  drivers/infiniband/hw/qib/qib_diag.c | 916
-rw-r--r--  drivers/infiniband/hw/qib/qib_dma.c | 169
-rw-r--r--  drivers/infiniband/hw/qib/qib_driver.c | 820
-rw-r--r--  drivers/infiniband/hw/qib/qib_eeprom.c | 276
-rw-r--r--  drivers/infiniband/hw/qib/qib_file_ops.c | 2418
-rw-r--r--  drivers/infiniband/hw/qib/qib_fs.c | 621
-rw-r--r--  drivers/infiniband/hw/qib/qib_iba6120.c | 3600
-rw-r--r--  drivers/infiniband/hw/qib/qib_iba7220.c | 4657
-rw-r--r--  drivers/infiniband/hw/qib/qib_iba7322.c | 8573
-rw-r--r--  drivers/infiniband/hw/qib/qib_init.c | 1847
-rw-r--r--  drivers/infiniband/hw/qib/qib_intr.c | 240
-rw-r--r--  drivers/infiniband/hw/qib/qib_keys.c | 387
-rw-r--r--  drivers/infiniband/hw/qib/qib_mad.c | 2533
-rw-r--r--  drivers/infiniband/hw/qib/qib_mad.h | 431
-rw-r--r--  drivers/infiniband/hw/qib/qib_mmap.c | 174
-rw-r--r--  drivers/infiniband/hw/qib/qib_mr.c | 532
-rw-r--r--  drivers/infiniband/hw/qib/qib_pcie.c | 719
-rw-r--r--  drivers/infiniband/hw/qib/qib_pio_copy.c | 64
-rw-r--r--  drivers/infiniband/hw/qib/qib_qp.c | 1376
-rw-r--r--  drivers/infiniband/hw/qib/qib_qsfp.c | 559
-rw-r--r--  drivers/infiniband/hw/qib/qib_qsfp.h | 189
-rw-r--r--  drivers/infiniband/hw/qib/qib_rc.c | 2290
-rw-r--r--  drivers/infiniband/hw/qib/qib_ruc.c | 819
-rw-r--r--  drivers/infiniband/hw/qib/qib_sd7220.c | 1454
-rw-r--r--  drivers/infiniband/hw/qib/qib_sdma.c | 1039
-rw-r--r--  drivers/infiniband/hw/qib/qib_srq.c | 380
-rw-r--r--  drivers/infiniband/hw/qib/qib_sysfs.c | 818
-rw-r--r--  drivers/infiniband/hw/qib/qib_twsi.c | 501
-rw-r--r--  drivers/infiniband/hw/qib/qib_tx.c | 572
-rw-r--r--  drivers/infiniband/hw/qib/qib_uc.c | 536
-rw-r--r--  drivers/infiniband/hw/qib/qib_ud.c | 590
-rw-r--r--  drivers/infiniband/hw/qib/qib_user_pages.c | 157
-rw-r--r--  drivers/infiniband/hw/qib/qib_user_sdma.c | 1465
-rw-r--r--  drivers/infiniband/hw/qib/qib_user_sdma.h | 52
-rw-r--r--  drivers/infiniband/hw/qib/qib_verbs.c | 2339
-rw-r--r--  drivers/infiniband/hw/qib/qib_verbs.h | 1173
-rw-r--r--  drivers/infiniband/hw/qib/qib_verbs_mcast.c | 368
-rw-r--r--  drivers/infiniband/hw/qib/qib_wc_ppc64.c | 62
-rw-r--r--  drivers/infiniband/hw/qib/qib_wc_x86_64.c | 150
-rw-r--r--  drivers/infiniband/hw/usnic/Kconfig | 10
-rw-r--r--  drivers/infiniband/hw/usnic/Makefile | 15
-rw-r--r--  drivers/infiniband/hw/usnic/usnic.h | 29
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_abi.h | 73
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h | 27
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_common_util.h | 68
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_debugfs.c | 154
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_debugfs.h | 29
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_fwd.c | 350
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_fwd.h | 113
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_ib.h | 118
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_ib_main.c | 682
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c | 761
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h | 117
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_ib_sysfs.c | 341
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_ib_sysfs.h | 29
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 768
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_ib_verbs.h | 72
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_log.h | 58
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_transport.c | 202
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_transport.h | 51
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_uiom.c | 604
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_uiom.h | 80
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c | 254
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h | 73
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_vnic.c | 467
-rw-r--r--  drivers/infiniband/hw/usnic/usnic_vnic.h | 103
-rw-r--r--  drivers/infiniband/ulp/Makefile | 5
-rw-r--r--  drivers/infiniband/ulp/ipoib/Kconfig | 49
-rw-r--r--  drivers/infiniband/ulp/ipoib/Makefile | 12
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib.h | 780
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_cm.c | 1615
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_ethtool.c | 109
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_fs.c | 297
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_ib.c | 1123
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_main.c | 1827
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 1001
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_netlink.c | 182
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_verbs.c | 317
-rw-r--r--  drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 207
-rw-r--r--  drivers/infiniband/ulp/iser/Kconfig | 12
-rw-r--r--  drivers/infiniband/ulp/iser/Makefile | 4
-rw-r--r--  drivers/infiniband/ulp/iser/iscsi_iser.c | 1089
-rw-r--r--  drivers/infiniband/ulp/iser/iscsi_iser.h | 644
-rw-r--r--  drivers/infiniband/ulp/iser/iser_initiator.c | 728
-rw-r--r--  drivers/infiniband/ulp/iser/iser_memory.c | 859
-rw-r--r--  drivers/infiniband/ulp/iser/iser_verbs.c | 1274
-rw-r--r--  drivers/infiniband/ulp/isert/Kconfig | 5
-rw-r--r--  drivers/infiniband/ulp/isert/Makefile | 2
-rw-r--r--  drivers/infiniband/ulp/isert/ib_isert.c | 3466
-rw-r--r--  drivers/infiniband/ulp/isert/ib_isert.h | 231
-rw-r--r--  drivers/infiniband/ulp/isert/isert_proto.h | 47
-rw-r--r--  drivers/infiniband/ulp/srp/Kbuild | 1
-rw-r--r--  drivers/infiniband/ulp/srp/Kconfig | 12
-rw-r--r--  drivers/infiniband/ulp/srp/ib_srp.c | 3606
-rw-r--r--  drivers/infiniband/ulp/srp/ib_srp.h | 302
-rw-r--r--  drivers/infiniband/ulp/srpt/Kconfig | 12
-rw-r--r--  drivers/infiniband/ulp/srpt/Makefile | 2
-rw-r--r--  drivers/infiniband/ulp/srpt/ib_dm_mad.h | 139
-rw-r--r--  drivers/infiniband/ulp/srpt/ib_srpt.c | 4022
-rw-r--r--  drivers/infiniband/ulp/srpt/ib_srpt.h | 443
369 files changed, 269226 insertions, 0 deletions
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
new file mode 100644
index 000000000..b89953149
--- /dev/null
+++ b/drivers/infiniband/Kconfig
@@ -0,0 +1,78 @@
+menuconfig INFINIBAND
+ tristate "InfiniBand support"
+ depends on PCI || BROKEN
+ depends on HAS_IOMEM
+ depends on NET
+ depends on INET
+ depends on m || IPV6 != m
+ ---help---
+ Core support for InfiniBand (IB). Make sure to also select
+ any protocols you wish to use as well as drivers for your
+ InfiniBand hardware.
+
+if INFINIBAND
+
+config INFINIBAND_USER_MAD
+ tristate "InfiniBand userspace MAD support"
+ depends on INFINIBAND
+ ---help---
+ Userspace InfiniBand Management Datagram (MAD) support. This
+ is the kernel side of the userspace MAD support, which allows
+ userspace processes to send and receive MADs. You will also
+ need libibumad from <http://www.openfabrics.org/downloads/management/>.
+
+config INFINIBAND_USER_ACCESS
+ tristate "InfiniBand userspace access (verbs and CM)"
+ select ANON_INODES
+ ---help---
+ Userspace InfiniBand access support. This enables the
+ kernel side of userspace verbs and the userspace
+ communication manager (CM). This allows userspace processes
+ to set up connections and directly access InfiniBand
+ hardware for fast-path operations. You will also need
+ libibverbs, libibcm and a hardware driver library from
+ <http://www.openfabrics.org/git/>.
+
+config INFINIBAND_USER_MEM
+ bool
+ depends on INFINIBAND_USER_ACCESS != n
+ default y
+
+config INFINIBAND_ON_DEMAND_PAGING
+ bool "InfiniBand on-demand paging support"
+ depends on INFINIBAND_USER_MEM
+ select MMU_NOTIFIER
+ default y
+ ---help---
+ On demand paging support for the InfiniBand subsystem.
+ Together with driver support this allows registration of
+ memory regions without pinning their pages, fetching the
+ pages on demand instead.
+
+config INFINIBAND_ADDR_TRANS
+ bool
+ depends on INFINIBAND
+ default y
+
+source "drivers/infiniband/hw/mthca/Kconfig"
+source "drivers/infiniband/hw/ipath/Kconfig"
+source "drivers/infiniband/hw/qib/Kconfig"
+source "drivers/infiniband/hw/ehca/Kconfig"
+source "drivers/infiniband/hw/amso1100/Kconfig"
+source "drivers/infiniband/hw/cxgb3/Kconfig"
+source "drivers/infiniband/hw/cxgb4/Kconfig"
+source "drivers/infiniband/hw/mlx4/Kconfig"
+source "drivers/infiniband/hw/mlx5/Kconfig"
+source "drivers/infiniband/hw/nes/Kconfig"
+source "drivers/infiniband/hw/ocrdma/Kconfig"
+source "drivers/infiniband/hw/usnic/Kconfig"
+
+source "drivers/infiniband/ulp/ipoib/Kconfig"
+
+source "drivers/infiniband/ulp/srp/Kconfig"
+source "drivers/infiniband/ulp/srpt/Kconfig"
+
+source "drivers/infiniband/ulp/iser/Kconfig"
+source "drivers/infiniband/ulp/isert/Kconfig"
+
+endif # INFINIBAND
diff --git a/drivers/infiniband/Makefile b/drivers/infiniband/Makefile
new file mode 100644
index 000000000..dc21836b5
--- /dev/null
+++ b/drivers/infiniband/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_INFINIBAND) += core/
+obj-$(CONFIG_INFINIBAND) += hw/
+obj-$(CONFIG_INFINIBAND) += ulp/
diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile
new file mode 100644
index 000000000..acf736764
--- /dev/null
+++ b/drivers/infiniband/core/Makefile
@@ -0,0 +1,34 @@
+infiniband-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_cm.o
+user_access-$(CONFIG_INFINIBAND_ADDR_TRANS) := rdma_ucm.o
+
+obj-$(CONFIG_INFINIBAND) += ib_core.o ib_mad.o ib_sa.o \
+ ib_cm.o iw_cm.o ib_addr.o \
+ $(infiniband-y)
+obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o
+obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \
+ $(user_access-y)
+
+ib_core-y := packer.o ud_header.o verbs.o sysfs.o \
+ device.o fmr_pool.o cache.o netlink.o
+ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o
+ib_core-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += umem_odp.o umem_rbtree.o
+
+ib_mad-y := mad.o smi.o agent.o mad_rmpp.o
+
+ib_sa-y := sa_query.o multicast.o
+
+ib_cm-y := cm.o
+
+iw_cm-y := iwcm.o iwpm_util.o iwpm_msg.o
+
+rdma_cm-y := cma.o
+
+rdma_ucm-y := ucma.o
+
+ib_addr-y := addr.o
+
+ib_umad-y := user_mad.o
+
+ib_ucm-y := ucm.o
+
+ib_uverbs-y := uverbs_main.o uverbs_cmd.o uverbs_marshall.o
diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c
new file mode 100644
index 000000000..38339d220
--- /dev/null
+++ b/drivers/infiniband/core/addr.c
@@ -0,0 +1,558 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc. All rights reserved.
+ * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mutex.h>
+#include <linux/inetdevice.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/module.h>
+#include <net/arp.h>
+#include <net/neighbour.h>
+#include <net/route.h>
+#include <net/netevent.h>
+#include <net/addrconf.h>
+#include <net/ip6_route.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib.h>
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("IB Address Translation");
+MODULE_LICENSE("Dual BSD/GPL");
+
+struct addr_req {
+ struct list_head list;
+ struct sockaddr_storage src_addr;
+ struct sockaddr_storage dst_addr;
+ struct rdma_dev_addr *addr;
+ struct rdma_addr_client *client;
+ void *context;
+ void (*callback)(int status, struct sockaddr *src_addr,
+ struct rdma_dev_addr *addr, void *context);
+ unsigned long timeout;
+ int status;
+};
+
+static void process_req(struct work_struct *work);
+
+static DEFINE_MUTEX(lock);
+static LIST_HEAD(req_list);
+static DECLARE_DELAYED_WORK(work, process_req);
+static struct workqueue_struct *addr_wq;
+
+int rdma_addr_size(struct sockaddr *addr)
+{
+ switch (addr->sa_family) {
+ case AF_INET:
+ return sizeof(struct sockaddr_in);
+ case AF_INET6:
+ return sizeof(struct sockaddr_in6);
+ case AF_IB:
+ return sizeof(struct sockaddr_ib);
+ default:
+ return 0;
+ }
+}
+EXPORT_SYMBOL(rdma_addr_size);
+
+static struct rdma_addr_client self;
+
+void rdma_addr_register_client(struct rdma_addr_client *client)
+{
+ atomic_set(&client->refcount, 1);
+ init_completion(&client->comp);
+}
+EXPORT_SYMBOL(rdma_addr_register_client);
+
+static inline void put_client(struct rdma_addr_client *client)
+{
+ if (atomic_dec_and_test(&client->refcount))
+ complete(&client->comp);
+}
+
+void rdma_addr_unregister_client(struct rdma_addr_client *client)
+{
+ put_client(client);
+ wait_for_completion(&client->comp);
+}
+EXPORT_SYMBOL(rdma_addr_unregister_client);
+
+int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev,
+ const unsigned char *dst_dev_addr)
+{
+ dev_addr->dev_type = dev->type;
+ memcpy(dev_addr->src_dev_addr, dev->dev_addr, MAX_ADDR_LEN);
+ memcpy(dev_addr->broadcast, dev->broadcast, MAX_ADDR_LEN);
+ if (dst_dev_addr)
+ memcpy(dev_addr->dst_dev_addr, dst_dev_addr, MAX_ADDR_LEN);
+ dev_addr->bound_dev_if = dev->ifindex;
+ return 0;
+}
+EXPORT_SYMBOL(rdma_copy_addr);
+
+int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr,
+ u16 *vlan_id)
+{
+ struct net_device *dev;
+ int ret = -EADDRNOTAVAIL;
+
+ if (dev_addr->bound_dev_if) {
+ dev = dev_get_by_index(&init_net, dev_addr->bound_dev_if);
+ if (!dev)
+ return -ENODEV;
+ ret = rdma_copy_addr(dev_addr, dev, NULL);
+ dev_put(dev);
+ return ret;
+ }
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ dev = ip_dev_find(&init_net,
+ ((struct sockaddr_in *) addr)->sin_addr.s_addr);
+
+ if (!dev)
+ return ret;
+
+ ret = rdma_copy_addr(dev_addr, dev, NULL);
+ if (vlan_id)
+ *vlan_id = rdma_vlan_dev_vlan_id(dev);
+ dev_put(dev);
+ break;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ case AF_INET6:
+ rcu_read_lock();
+ for_each_netdev_rcu(&init_net, dev) {
+ if (ipv6_chk_addr(&init_net,
+ &((struct sockaddr_in6 *) addr)->sin6_addr,
+ dev, 1)) {
+ ret = rdma_copy_addr(dev_addr, dev, NULL);
+ if (vlan_id)
+ *vlan_id = rdma_vlan_dev_vlan_id(dev);
+ break;
+ }
+ }
+ rcu_read_unlock();
+ break;
+#endif
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rdma_translate_ip);
+
+static void set_timeout(unsigned long time)
+{
+ unsigned long delay;
+
+ delay = time - jiffies;
+ if ((long)delay < 0)
+ delay = 0;
+
+ mod_delayed_work(addr_wq, &work, delay);
+}
+
+static void queue_req(struct addr_req *req)
+{
+ struct addr_req *temp_req;
+
+ mutex_lock(&lock);
+ list_for_each_entry_reverse(temp_req, &req_list, list) {
+ if (time_after_eq(req->timeout, temp_req->timeout))
+ break;
+ }
+
+ list_add(&req->list, &temp_req->list);
+
+ if (req_list.next == &req->list)
+ set_timeout(req->timeout);
+ mutex_unlock(&lock);
+}
+
+static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, void *daddr)
+{
+ struct neighbour *n;
+ int ret;
+
+ n = dst_neigh_lookup(dst, daddr);
+
+ rcu_read_lock();
+ if (!n || !(n->nud_state & NUD_VALID)) {
+ if (n)
+ neigh_event_send(n, NULL);
+ ret = -ENODATA;
+ } else {
+ ret = rdma_copy_addr(dev_addr, dst->dev, n->ha);
+ }
+ rcu_read_unlock();
+
+ if (n)
+ neigh_release(n);
+
+ return ret;
+}
+
+static int addr4_resolve(struct sockaddr_in *src_in,
+ struct sockaddr_in *dst_in,
+ struct rdma_dev_addr *addr)
+{
+ __be32 src_ip = src_in->sin_addr.s_addr;
+ __be32 dst_ip = dst_in->sin_addr.s_addr;
+ struct rtable *rt;
+ struct flowi4 fl4;
+ int ret;
+
+ memset(&fl4, 0, sizeof(fl4));
+ fl4.daddr = dst_ip;
+ fl4.saddr = src_ip;
+ fl4.flowi4_oif = addr->bound_dev_if;
+ rt = ip_route_output_key(&init_net, &fl4);
+ if (IS_ERR(rt)) {
+ ret = PTR_ERR(rt);
+ goto out;
+ }
+ src_in->sin_family = AF_INET;
+ src_in->sin_addr.s_addr = fl4.saddr;
+
+ if (rt->dst.dev->flags & IFF_LOOPBACK) {
+ ret = rdma_translate_ip((struct sockaddr *)dst_in, addr, NULL);
+ if (!ret)
+ memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN);
+ goto put;
+ }
+
+ /* If the device does ARP internally, return 'done' */
+ if (rt->dst.dev->flags & IFF_NOARP) {
+ ret = rdma_copy_addr(addr, rt->dst.dev, NULL);
+ goto put;
+ }
+
+ ret = dst_fetch_ha(&rt->dst, addr, &fl4.daddr);
+put:
+ ip_rt_put(rt);
+out:
+ return ret;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int addr6_resolve(struct sockaddr_in6 *src_in,
+ struct sockaddr_in6 *dst_in,
+ struct rdma_dev_addr *addr)
+{
+ struct flowi6 fl6;
+ struct dst_entry *dst;
+ int ret;
+
+ memset(&fl6, 0, sizeof fl6);
+ fl6.daddr = dst_in->sin6_addr;
+ fl6.saddr = src_in->sin6_addr;
+ fl6.flowi6_oif = addr->bound_dev_if;
+
+ dst = ip6_route_output(&init_net, NULL, &fl6);
+ if ((ret = dst->error))
+ goto put;
+
+ if (ipv6_addr_any(&fl6.saddr)) {
+ ret = ipv6_dev_get_saddr(&init_net, ip6_dst_idev(dst)->dev,
+ &fl6.daddr, 0, &fl6.saddr);
+ if (ret)
+ goto put;
+
+ src_in->sin6_family = AF_INET6;
+ src_in->sin6_addr = fl6.saddr;
+ }
+
+ if (dst->dev->flags & IFF_LOOPBACK) {
+ ret = rdma_translate_ip((struct sockaddr *)dst_in, addr, NULL);
+ if (!ret)
+ memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN);
+ goto put;
+ }
+
+ /* If the device does ARP internally, return 'done' */
+ if (dst->dev->flags & IFF_NOARP) {
+ ret = rdma_copy_addr(addr, dst->dev, NULL);
+ goto put;
+ }
+
+ ret = dst_fetch_ha(dst, addr, &fl6.daddr);
+put:
+ dst_release(dst);
+ return ret;
+}
+#else
+static int addr6_resolve(struct sockaddr_in6 *src_in,
+ struct sockaddr_in6 *dst_in,
+ struct rdma_dev_addr *addr)
+{
+ return -EADDRNOTAVAIL;
+}
+#endif
+
+static int addr_resolve(struct sockaddr *src_in,
+ struct sockaddr *dst_in,
+ struct rdma_dev_addr *addr)
+{
+ if (src_in->sa_family == AF_INET) {
+ return addr4_resolve((struct sockaddr_in *) src_in,
+ (struct sockaddr_in *) dst_in, addr);
+ } else
+ return addr6_resolve((struct sockaddr_in6 *) src_in,
+ (struct sockaddr_in6 *) dst_in, addr);
+}
+
+static void process_req(struct work_struct *work)
+{
+ struct addr_req *req, *temp_req;
+ struct sockaddr *src_in, *dst_in;
+ struct list_head done_list;
+
+ INIT_LIST_HEAD(&done_list);
+
+ mutex_lock(&lock);
+ list_for_each_entry_safe(req, temp_req, &req_list, list) {
+ if (req->status == -ENODATA) {
+ src_in = (struct sockaddr *) &req->src_addr;
+ dst_in = (struct sockaddr *) &req->dst_addr;
+ req->status = addr_resolve(src_in, dst_in, req->addr);
+ if (req->status && time_after_eq(jiffies, req->timeout))
+ req->status = -ETIMEDOUT;
+ else if (req->status == -ENODATA)
+ continue;
+ }
+ list_move_tail(&req->list, &done_list);
+ }
+
+ if (!list_empty(&req_list)) {
+ req = list_entry(req_list.next, struct addr_req, list);
+ set_timeout(req->timeout);
+ }
+ mutex_unlock(&lock);
+
+ list_for_each_entry_safe(req, temp_req, &done_list, list) {
+ list_del(&req->list);
+ req->callback(req->status, (struct sockaddr *) &req->src_addr,
+ req->addr, req->context);
+ put_client(req->client);
+ kfree(req);
+ }
+}
+
+int rdma_resolve_ip(struct rdma_addr_client *client,
+ struct sockaddr *src_addr, struct sockaddr *dst_addr,
+ struct rdma_dev_addr *addr, int timeout_ms,
+ void (*callback)(int status, struct sockaddr *src_addr,
+ struct rdma_dev_addr *addr, void *context),
+ void *context)
+{
+ struct sockaddr *src_in, *dst_in;
+ struct addr_req *req;
+ int ret = 0;
+
+ req = kzalloc(sizeof *req, GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ src_in = (struct sockaddr *) &req->src_addr;
+ dst_in = (struct sockaddr *) &req->dst_addr;
+
+ if (src_addr) {
+ if (src_addr->sa_family != dst_addr->sa_family) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ memcpy(src_in, src_addr, rdma_addr_size(src_addr));
+ } else {
+ src_in->sa_family = dst_addr->sa_family;
+ }
+
+ memcpy(dst_in, dst_addr, rdma_addr_size(dst_addr));
+ req->addr = addr;
+ req->callback = callback;
+ req->context = context;
+ req->client = client;
+ atomic_inc(&client->refcount);
+
+ req->status = addr_resolve(src_in, dst_in, addr);
+ switch (req->status) {
+ case 0:
+ req->timeout = jiffies;
+ queue_req(req);
+ break;
+ case -ENODATA:
+ req->timeout = msecs_to_jiffies(timeout_ms) + jiffies;
+ queue_req(req);
+ break;
+ default:
+ ret = req->status;
+ atomic_dec(&client->refcount);
+ goto err;
+ }
+ return ret;
+err:
+ kfree(req);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_resolve_ip);
+
+void rdma_addr_cancel(struct rdma_dev_addr *addr)
+{
+ struct addr_req *req, *temp_req;
+
+ mutex_lock(&lock);
+ list_for_each_entry_safe(req, temp_req, &req_list, list) {
+ if (req->addr == addr) {
+ req->status = -ECANCELED;
+ req->timeout = jiffies;
+ list_move(&req->list, &req_list);
+ set_timeout(req->timeout);
+ break;
+ }
+ }
+ mutex_unlock(&lock);
+}
+EXPORT_SYMBOL(rdma_addr_cancel);
+
+struct resolve_cb_context {
+ struct rdma_dev_addr *addr;
+ struct completion comp;
+};
+
+static void resolve_cb(int status, struct sockaddr *src_addr,
+ struct rdma_dev_addr *addr, void *context)
+{
+ memcpy(((struct resolve_cb_context *)context)->addr, addr, sizeof(struct
+ rdma_dev_addr));
+ complete(&((struct resolve_cb_context *)context)->comp);
+}
+
+int rdma_addr_find_dmac_by_grh(union ib_gid *sgid, union ib_gid *dgid, u8 *dmac,
+ u16 *vlan_id)
+{
+ int ret = 0;
+ struct rdma_dev_addr dev_addr;
+ struct resolve_cb_context ctx;
+ struct net_device *dev;
+
+ union {
+ struct sockaddr _sockaddr;
+ struct sockaddr_in _sockaddr_in;
+ struct sockaddr_in6 _sockaddr_in6;
+ } sgid_addr, dgid_addr;
+
+
+ rdma_gid2ip(&sgid_addr._sockaddr, sgid);
+ rdma_gid2ip(&dgid_addr._sockaddr, dgid);
+
+ memset(&dev_addr, 0, sizeof(dev_addr));
+
+ ctx.addr = &dev_addr;
+ init_completion(&ctx.comp);
+ ret = rdma_resolve_ip(&self, &sgid_addr._sockaddr, &dgid_addr._sockaddr,
+ &dev_addr, 1000, resolve_cb, &ctx);
+ if (ret)
+ return ret;
+
+ wait_for_completion(&ctx.comp);
+
+ memcpy(dmac, dev_addr.dst_dev_addr, ETH_ALEN);
+ dev = dev_get_by_index(&init_net, dev_addr.bound_dev_if);
+ if (!dev)
+ return -ENODEV;
+ if (vlan_id)
+ *vlan_id = rdma_vlan_dev_vlan_id(dev);
+ dev_put(dev);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_addr_find_dmac_by_grh);
+
+int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id)
+{
+ int ret = 0;
+ struct rdma_dev_addr dev_addr;
+ union {
+ struct sockaddr _sockaddr;
+ struct sockaddr_in _sockaddr_in;
+ struct sockaddr_in6 _sockaddr_in6;
+ } gid_addr;
+
+ rdma_gid2ip(&gid_addr._sockaddr, sgid);
+
+ memset(&dev_addr, 0, sizeof(dev_addr));
+ ret = rdma_translate_ip(&gid_addr._sockaddr, &dev_addr, vlan_id);
+ if (ret)
+ return ret;
+
+ memcpy(smac, dev_addr.src_dev_addr, ETH_ALEN);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_addr_find_smac_by_sgid);
+
+static int netevent_callback(struct notifier_block *self, unsigned long event,
+ void *ctx)
+{
+ if (event == NETEVENT_NEIGH_UPDATE) {
+ struct neighbour *neigh = ctx;
+
+ if (neigh->nud_state & NUD_VALID) {
+ set_timeout(jiffies);
+ }
+ }
+ return 0;
+}
+
+static struct notifier_block nb = {
+ .notifier_call = netevent_callback
+};
+
+static int __init addr_init(void)
+{
+ addr_wq = create_singlethread_workqueue("ib_addr");
+ if (!addr_wq)
+ return -ENOMEM;
+
+ register_netevent_notifier(&nb);
+ rdma_addr_register_client(&self);
+ return 0;
+}
+
+static void __exit addr_cleanup(void)
+{
+ rdma_addr_unregister_client(&self);
+ unregister_netevent_notifier(&nb);
+ destroy_workqueue(addr_wq);
+}
+
+module_init(addr_init);
+module_exit(addr_cleanup);
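
The entry point exported above, rdma_resolve_ip(), is asynchronous: it queues an addr_req and reports the result through the caller's callback, the same pattern resolve_cb() uses internally for rdma_addr_find_dmac_by_grh(). Below is a minimal consumer sketch (not part of this patch), assuming the caller has already registered its struct rdma_addr_client with rdma_addr_register_client(); the my_* names are illustrative.

#include <linux/completion.h>
#include <linux/in.h>
#include <rdma/ib_addr.h>

struct my_resolve_ctx {
	struct completion done;
	int status;
};

/* Runs from the ib_addr workqueue once resolution finishes or times out. */
static void my_resolve_done(int status, struct sockaddr *src_addr,
			    struct rdma_dev_addr *addr, void *context)
{
	struct my_resolve_ctx *ctx = context;

	ctx->status = status;
	complete(&ctx->done);
}

/* Resolve an IPv4 destination into dev_addr; returns 0 or a negative errno. */
static int my_resolve(struct rdma_addr_client *client,
		      struct sockaddr_in *dst, struct rdma_dev_addr *dev_addr)
{
	struct my_resolve_ctx ctx;
	int ret;

	init_completion(&ctx.done);
	/* A NULL source address lets addr4_resolve() pick one from the route. */
	ret = rdma_resolve_ip(client, NULL, (struct sockaddr *)dst,
			      dev_addr, 2000, my_resolve_done, &ctx);
	if (ret)
		return ret;

	wait_for_completion(&ctx.done);
	return ctx.status;
}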
diff --git a/drivers/infiniband/core/agent.c b/drivers/infiniband/core/agent.c
new file mode 100644
index 000000000..f6d29614c
--- /dev/null
+++ b/drivers/infiniband/core/agent.c
@@ -0,0 +1,217 @@
+/*
+ * Copyright (c) 2004, 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004, 2005 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/slab.h>
+#include <linux/string.h>
+
+#include "agent.h"
+#include "smi.h"
+#include "mad_priv.h"
+
+#define SPFX "ib_agent: "
+
+struct ib_agent_port_private {
+ struct list_head port_list;
+ struct ib_mad_agent *agent[2];
+};
+
+static DEFINE_SPINLOCK(ib_agent_port_list_lock);
+static LIST_HEAD(ib_agent_port_list);
+
+static struct ib_agent_port_private *
+__ib_get_agent_port(struct ib_device *device, int port_num)
+{
+ struct ib_agent_port_private *entry;
+
+ list_for_each_entry(entry, &ib_agent_port_list, port_list) {
+ if (entry->agent[1]->device == device &&
+ entry->agent[1]->port_num == port_num)
+ return entry;
+ }
+ return NULL;
+}
+
+static struct ib_agent_port_private *
+ib_get_agent_port(struct ib_device *device, int port_num)
+{
+ struct ib_agent_port_private *entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+ entry = __ib_get_agent_port(device, port_num);
+ spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+ return entry;
+}
+
+void agent_send_response(struct ib_mad *mad, struct ib_grh *grh,
+ struct ib_wc *wc, struct ib_device *device,
+ int port_num, int qpn)
+{
+ struct ib_agent_port_private *port_priv;
+ struct ib_mad_agent *agent;
+ struct ib_mad_send_buf *send_buf;
+ struct ib_ah *ah;
+ struct ib_mad_send_wr_private *mad_send_wr;
+
+ if (device->node_type == RDMA_NODE_IB_SWITCH)
+ port_priv = ib_get_agent_port(device, 0);
+ else
+ port_priv = ib_get_agent_port(device, port_num);
+
+ if (!port_priv) {
+ dev_err(&device->dev, "Unable to find port agent\n");
+ return;
+ }
+
+ agent = port_priv->agent[qpn];
+ ah = ib_create_ah_from_wc(agent->qp->pd, wc, grh, port_num);
+ if (IS_ERR(ah)) {
+ dev_err(&device->dev, "ib_create_ah_from_wc error %ld\n",
+ PTR_ERR(ah));
+ return;
+ }
+
+ send_buf = ib_create_send_mad(agent, wc->src_qp, wc->pkey_index, 0,
+ IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+ GFP_KERNEL);
+ if (IS_ERR(send_buf)) {
+ dev_err(&device->dev, "ib_create_send_mad error\n");
+ goto err1;
+ }
+
+ memcpy(send_buf->mad, mad, sizeof *mad);
+ send_buf->ah = ah;
+
+ if (device->node_type == RDMA_NODE_IB_SWITCH) {
+ mad_send_wr = container_of(send_buf,
+ struct ib_mad_send_wr_private,
+ send_buf);
+ mad_send_wr->send_wr.wr.ud.port_num = port_num;
+ }
+
+ if (ib_post_send_mad(send_buf, NULL)) {
+ dev_err(&device->dev, "ib_post_send_mad error\n");
+ goto err2;
+ }
+ return;
+err2:
+ ib_free_send_mad(send_buf);
+err1:
+ ib_destroy_ah(ah);
+}
+
+static void agent_send_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ ib_destroy_ah(mad_send_wc->send_buf->ah);
+ ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+int ib_agent_port_open(struct ib_device *device, int port_num)
+{
+ struct ib_agent_port_private *port_priv;
+ unsigned long flags;
+ int ret;
+
+ /* Create new device info */
+ port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL);
+ if (!port_priv) {
+ dev_err(&device->dev, "No memory for ib_agent_port_private\n");
+ ret = -ENOMEM;
+ goto error1;
+ }
+
+ if (rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND) {
+ /* Obtain send only MAD agent for SMI QP */
+ port_priv->agent[0] = ib_register_mad_agent(device, port_num,
+ IB_QPT_SMI, NULL, 0,
+ &agent_send_handler,
+ NULL, NULL, 0);
+ if (IS_ERR(port_priv->agent[0])) {
+ ret = PTR_ERR(port_priv->agent[0]);
+ goto error2;
+ }
+ }
+
+ /* Obtain send only MAD agent for GSI QP */
+ port_priv->agent[1] = ib_register_mad_agent(device, port_num,
+ IB_QPT_GSI, NULL, 0,
+ &agent_send_handler,
+ NULL, NULL, 0);
+ if (IS_ERR(port_priv->agent[1])) {
+ ret = PTR_ERR(port_priv->agent[1]);
+ goto error3;
+ }
+
+ spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+ list_add_tail(&port_priv->port_list, &ib_agent_port_list);
+ spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+
+ return 0;
+
+error3:
+ if (port_priv->agent[0])
+ ib_unregister_mad_agent(port_priv->agent[0]);
+error2:
+ kfree(port_priv);
+error1:
+ return ret;
+}
+
+int ib_agent_port_close(struct ib_device *device, int port_num)
+{
+ struct ib_agent_port_private *port_priv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ib_agent_port_list_lock, flags);
+ port_priv = __ib_get_agent_port(device, port_num);
+ if (port_priv == NULL) {
+ spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+ dev_err(&device->dev, "Port %d not found\n", port_num);
+ return -ENODEV;
+ }
+ list_del(&port_priv->port_list);
+ spin_unlock_irqrestore(&ib_agent_port_list_lock, flags);
+
+ ib_unregister_mad_agent(port_priv->agent[1]);
+ if (port_priv->agent[0])
+ ib_unregister_mad_agent(port_priv->agent[0]);
+
+ kfree(port_priv);
+ return 0;
+}
diff --git a/drivers/infiniband/core/agent.h b/drivers/infiniband/core/agent.h
new file mode 100644
index 000000000..666928700
--- /dev/null
+++ b/drivers/infiniband/core/agent.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __AGENT_H_
+#define __AGENT_H_
+
+#include <linux/err.h>
+#include <rdma/ib_mad.h>
+
+extern int ib_agent_port_open(struct ib_device *device, int port_num);
+
+extern int ib_agent_port_close(struct ib_device *device, int port_num);
+
+extern void agent_send_response(struct ib_mad *mad, struct ib_grh *grh,
+ struct ib_wc *wc, struct ib_device *device,
+ int port_num, int qpn);
+
+#endif /* __AGENT_H_ */
diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c
new file mode 100644
index 000000000..80f6cf244
--- /dev/null
+++ b/drivers/infiniband/core/cache.c
@@ -0,0 +1,439 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+
+#include <rdma/ib_cache.h>
+
+#include "core_priv.h"
+
+struct ib_pkey_cache {
+ int table_len;
+ u16 table[0];
+};
+
+struct ib_gid_cache {
+ int table_len;
+ union ib_gid table[0];
+};
+
+struct ib_update_work {
+ struct work_struct work;
+ struct ib_device *device;
+ u8 port_num;
+};
+
+static inline int start_port(struct ib_device *device)
+{
+ return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1;
+}
+
+static inline int end_port(struct ib_device *device)
+{
+ return (device->node_type == RDMA_NODE_IB_SWITCH) ?
+ 0 : device->phys_port_cnt;
+}
+
+int ib_get_cached_gid(struct ib_device *device,
+ u8 port_num,
+ int index,
+ union ib_gid *gid)
+{
+ struct ib_gid_cache *cache;
+ unsigned long flags;
+ int ret = 0;
+
+ if (port_num < start_port(device) || port_num > end_port(device))
+ return -EINVAL;
+
+ read_lock_irqsave(&device->cache.lock, flags);
+
+ cache = device->cache.gid_cache[port_num - start_port(device)];
+
+ if (index < 0 || index >= cache->table_len)
+ ret = -EINVAL;
+ else
+ *gid = cache->table[index];
+
+ read_unlock_irqrestore(&device->cache.lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_get_cached_gid);
+
+int ib_find_cached_gid(struct ib_device *device,
+ union ib_gid *gid,
+ u8 *port_num,
+ u16 *index)
+{
+ struct ib_gid_cache *cache;
+ unsigned long flags;
+ int p, i;
+ int ret = -ENOENT;
+
+ *port_num = -1;
+ if (index)
+ *index = -1;
+
+ read_lock_irqsave(&device->cache.lock, flags);
+
+ for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+ cache = device->cache.gid_cache[p];
+ for (i = 0; i < cache->table_len; ++i) {
+ if (!memcmp(gid, &cache->table[i], sizeof *gid)) {
+ *port_num = p + start_port(device);
+ if (index)
+ *index = i;
+ ret = 0;
+ goto found;
+ }
+ }
+ }
+found:
+ read_unlock_irqrestore(&device->cache.lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_find_cached_gid);
+
+int ib_get_cached_pkey(struct ib_device *device,
+ u8 port_num,
+ int index,
+ u16 *pkey)
+{
+ struct ib_pkey_cache *cache;
+ unsigned long flags;
+ int ret = 0;
+
+ if (port_num < start_port(device) || port_num > end_port(device))
+ return -EINVAL;
+
+ read_lock_irqsave(&device->cache.lock, flags);
+
+ cache = device->cache.pkey_cache[port_num - start_port(device)];
+
+ if (index < 0 || index >= cache->table_len)
+ ret = -EINVAL;
+ else
+ *pkey = cache->table[index];
+
+ read_unlock_irqrestore(&device->cache.lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_get_cached_pkey);
+
+int ib_find_cached_pkey(struct ib_device *device,
+ u8 port_num,
+ u16 pkey,
+ u16 *index)
+{
+ struct ib_pkey_cache *cache;
+ unsigned long flags;
+ int i;
+ int ret = -ENOENT;
+ int partial_ix = -1;
+
+ if (port_num < start_port(device) || port_num > end_port(device))
+ return -EINVAL;
+
+ read_lock_irqsave(&device->cache.lock, flags);
+
+ cache = device->cache.pkey_cache[port_num - start_port(device)];
+
+ *index = -1;
+
+ for (i = 0; i < cache->table_len; ++i)
+ if ((cache->table[i] & 0x7fff) == (pkey & 0x7fff)) {
+ if (cache->table[i] & 0x8000) {
+ *index = i;
+ ret = 0;
+ break;
+ } else
+ partial_ix = i;
+ }
+
+ if (ret && partial_ix >= 0) {
+ *index = partial_ix;
+ ret = 0;
+ }
+
+ read_unlock_irqrestore(&device->cache.lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_find_cached_pkey);
+
+int ib_find_exact_cached_pkey(struct ib_device *device,
+ u8 port_num,
+ u16 pkey,
+ u16 *index)
+{
+ struct ib_pkey_cache *cache;
+ unsigned long flags;
+ int i;
+ int ret = -ENOENT;
+
+ if (port_num < start_port(device) || port_num > end_port(device))
+ return -EINVAL;
+
+ read_lock_irqsave(&device->cache.lock, flags);
+
+ cache = device->cache.pkey_cache[port_num - start_port(device)];
+
+ *index = -1;
+
+ for (i = 0; i < cache->table_len; ++i)
+ if (cache->table[i] == pkey) {
+ *index = i;
+ ret = 0;
+ break;
+ }
+
+ read_unlock_irqrestore(&device->cache.lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_find_exact_cached_pkey);
+
+int ib_get_cached_lmc(struct ib_device *device,
+ u8 port_num,
+ u8 *lmc)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ if (port_num < start_port(device) || port_num > end_port(device))
+ return -EINVAL;
+
+ read_lock_irqsave(&device->cache.lock, flags);
+ *lmc = device->cache.lmc_cache[port_num - start_port(device)];
+ read_unlock_irqrestore(&device->cache.lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_get_cached_lmc);
+
+static void ib_cache_update(struct ib_device *device,
+ u8 port)
+{
+ struct ib_port_attr *tprops = NULL;
+ struct ib_pkey_cache *pkey_cache = NULL, *old_pkey_cache;
+ struct ib_gid_cache *gid_cache = NULL, *old_gid_cache;
+ int i;
+ int ret;
+
+ tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
+ if (!tprops)
+ return;
+
+ ret = ib_query_port(device, port, tprops);
+ if (ret) {
+ printk(KERN_WARNING "ib_query_port failed (%d) for %s\n",
+ ret, device->name);
+ goto err;
+ }
+
+ pkey_cache = kmalloc(sizeof *pkey_cache + tprops->pkey_tbl_len *
+ sizeof *pkey_cache->table, GFP_KERNEL);
+ if (!pkey_cache)
+ goto err;
+
+ pkey_cache->table_len = tprops->pkey_tbl_len;
+
+ gid_cache = kmalloc(sizeof *gid_cache + tprops->gid_tbl_len *
+ sizeof *gid_cache->table, GFP_KERNEL);
+ if (!gid_cache)
+ goto err;
+
+ gid_cache->table_len = tprops->gid_tbl_len;
+
+ for (i = 0; i < pkey_cache->table_len; ++i) {
+ ret = ib_query_pkey(device, port, i, pkey_cache->table + i);
+ if (ret) {
+ printk(KERN_WARNING "ib_query_pkey failed (%d) for %s (index %d)\n",
+ ret, device->name, i);
+ goto err;
+ }
+ }
+
+ for (i = 0; i < gid_cache->table_len; ++i) {
+ ret = ib_query_gid(device, port, i, gid_cache->table + i);
+ if (ret) {
+ printk(KERN_WARNING "ib_query_gid failed (%d) for %s (index %d)\n",
+ ret, device->name, i);
+ goto err;
+ }
+ }
+
+ write_lock_irq(&device->cache.lock);
+
+ old_pkey_cache = device->cache.pkey_cache[port - start_port(device)];
+ old_gid_cache = device->cache.gid_cache [port - start_port(device)];
+
+ device->cache.pkey_cache[port - start_port(device)] = pkey_cache;
+ device->cache.gid_cache [port - start_port(device)] = gid_cache;
+
+ device->cache.lmc_cache[port - start_port(device)] = tprops->lmc;
+
+ write_unlock_irq(&device->cache.lock);
+
+ kfree(old_pkey_cache);
+ kfree(old_gid_cache);
+ kfree(tprops);
+ return;
+
+err:
+ kfree(pkey_cache);
+ kfree(gid_cache);
+ kfree(tprops);
+}
+
+static void ib_cache_task(struct work_struct *_work)
+{
+ struct ib_update_work *work =
+ container_of(_work, struct ib_update_work, work);
+
+ ib_cache_update(work->device, work->port_num);
+ kfree(work);
+}
+
+static void ib_cache_event(struct ib_event_handler *handler,
+ struct ib_event *event)
+{
+ struct ib_update_work *work;
+
+ if (event->event == IB_EVENT_PORT_ERR ||
+ event->event == IB_EVENT_PORT_ACTIVE ||
+ event->event == IB_EVENT_LID_CHANGE ||
+ event->event == IB_EVENT_PKEY_CHANGE ||
+ event->event == IB_EVENT_SM_CHANGE ||
+ event->event == IB_EVENT_CLIENT_REREGISTER ||
+ event->event == IB_EVENT_GID_CHANGE) {
+ work = kmalloc(sizeof *work, GFP_ATOMIC);
+ if (work) {
+ INIT_WORK(&work->work, ib_cache_task);
+ work->device = event->device;
+ work->port_num = event->element.port_num;
+ queue_work(ib_wq, &work->work);
+ }
+ }
+}
+
+static void ib_cache_setup_one(struct ib_device *device)
+{
+ int p;
+
+ rwlock_init(&device->cache.lock);
+
+ device->cache.pkey_cache =
+ kmalloc(sizeof *device->cache.pkey_cache *
+ (end_port(device) - start_port(device) + 1), GFP_KERNEL);
+ device->cache.gid_cache =
+ kmalloc(sizeof *device->cache.gid_cache *
+ (end_port(device) - start_port(device) + 1), GFP_KERNEL);
+
+ device->cache.lmc_cache = kmalloc(sizeof *device->cache.lmc_cache *
+ (end_port(device) -
+ start_port(device) + 1),
+ GFP_KERNEL);
+
+ if (!device->cache.pkey_cache || !device->cache.gid_cache ||
+ !device->cache.lmc_cache) {
+ printk(KERN_WARNING "Couldn't allocate cache "
+ "for %s\n", device->name);
+ goto err;
+ }
+
+ for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+ device->cache.pkey_cache[p] = NULL;
+ device->cache.gid_cache [p] = NULL;
+ ib_cache_update(device, p + start_port(device));
+ }
+
+ INIT_IB_EVENT_HANDLER(&device->cache.event_handler,
+ device, ib_cache_event);
+ if (ib_register_event_handler(&device->cache.event_handler))
+ goto err_cache;
+
+ return;
+
+err_cache:
+ for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+ kfree(device->cache.pkey_cache[p]);
+ kfree(device->cache.gid_cache[p]);
+ }
+
+err:
+ kfree(device->cache.pkey_cache);
+ kfree(device->cache.gid_cache);
+ kfree(device->cache.lmc_cache);
+}
+
+static void ib_cache_cleanup_one(struct ib_device *device)
+{
+ int p;
+
+ ib_unregister_event_handler(&device->cache.event_handler);
+ flush_workqueue(ib_wq);
+
+ for (p = 0; p <= end_port(device) - start_port(device); ++p) {
+ kfree(device->cache.pkey_cache[p]);
+ kfree(device->cache.gid_cache[p]);
+ }
+
+ kfree(device->cache.pkey_cache);
+ kfree(device->cache.gid_cache);
+ kfree(device->cache.lmc_cache);
+}
+
+static struct ib_client cache_client = {
+ .name = "cache",
+ .add = ib_cache_setup_one,
+ .remove = ib_cache_cleanup_one
+};
+
+int __init ib_cache_setup(void)
+{
+ return ib_register_client(&cache_client);
+}
+
+void __exit ib_cache_cleanup(void)
+{
+ ib_unregister_client(&cache_client);
+}
diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c
new file mode 100644
index 000000000..0271608a5
--- /dev/null
+++ b/drivers/infiniband/core/cm.c
@@ -0,0 +1,3931 @@
+/*
+ * Copyright (c) 2004-2007 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/dma-mapping.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/err.h>
+#include <linux/idr.h>
+#include <linux/interrupt.h>
+#include <linux/random.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/workqueue.h>
+#include <linux/kdev_t.h>
+#include <linux/etherdevice.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_cm.h>
+#include "cm_msgs.h"
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("InfiniBand CM");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static void cm_add_one(struct ib_device *device);
+static void cm_remove_one(struct ib_device *device);
+
+static struct ib_client cm_client = {
+ .name = "cm",
+ .add = cm_add_one,
+ .remove = cm_remove_one
+};
+
+static struct ib_cm {
+ spinlock_t lock;
+ struct list_head device_list;
+ rwlock_t device_lock;
+ struct rb_root listen_service_table;
+ u64 listen_service_id;
+ /* struct rb_root peer_service_table; todo: fix peer to peer */
+ struct rb_root remote_qp_table;
+ struct rb_root remote_id_table;
+ struct rb_root remote_sidr_table;
+ struct idr local_id_table;
+ __be32 random_id_operand;
+ struct list_head timewait_list;
+ struct workqueue_struct *wq;
+} cm;
+
+/* Counter indexes ordered by attribute ID */
+enum {
+ CM_REQ_COUNTER,
+ CM_MRA_COUNTER,
+ CM_REJ_COUNTER,
+ CM_REP_COUNTER,
+ CM_RTU_COUNTER,
+ CM_DREQ_COUNTER,
+ CM_DREP_COUNTER,
+ CM_SIDR_REQ_COUNTER,
+ CM_SIDR_REP_COUNTER,
+ CM_LAP_COUNTER,
+ CM_APR_COUNTER,
+ CM_ATTR_COUNT,
+ CM_ATTR_ID_OFFSET = 0x0010,
+};
+
+enum {
+ CM_XMIT,
+ CM_XMIT_RETRIES,
+ CM_RECV,
+ CM_RECV_DUPLICATES,
+ CM_COUNTER_GROUPS
+};
+
+static char const counter_group_names[CM_COUNTER_GROUPS]
+ [sizeof("cm_rx_duplicates")] = {
+ "cm_tx_msgs", "cm_tx_retries",
+ "cm_rx_msgs", "cm_rx_duplicates"
+};
+
+struct cm_counter_group {
+ struct kobject obj;
+ atomic_long_t counter[CM_ATTR_COUNT];
+};
+
+struct cm_counter_attribute {
+ struct attribute attr;
+ int index;
+};
+
+#define CM_COUNTER_ATTR(_name, _index) \
+struct cm_counter_attribute cm_##_name##_counter_attr = { \
+ .attr = { .name = __stringify(_name), .mode = 0444 }, \
+ .index = _index \
+}
+
+static CM_COUNTER_ATTR(req, CM_REQ_COUNTER);
+static CM_COUNTER_ATTR(mra, CM_MRA_COUNTER);
+static CM_COUNTER_ATTR(rej, CM_REJ_COUNTER);
+static CM_COUNTER_ATTR(rep, CM_REP_COUNTER);
+static CM_COUNTER_ATTR(rtu, CM_RTU_COUNTER);
+static CM_COUNTER_ATTR(dreq, CM_DREQ_COUNTER);
+static CM_COUNTER_ATTR(drep, CM_DREP_COUNTER);
+static CM_COUNTER_ATTR(sidr_req, CM_SIDR_REQ_COUNTER);
+static CM_COUNTER_ATTR(sidr_rep, CM_SIDR_REP_COUNTER);
+static CM_COUNTER_ATTR(lap, CM_LAP_COUNTER);
+static CM_COUNTER_ATTR(apr, CM_APR_COUNTER);
+
+static struct attribute *cm_counter_default_attrs[] = {
+ &cm_req_counter_attr.attr,
+ &cm_mra_counter_attr.attr,
+ &cm_rej_counter_attr.attr,
+ &cm_rep_counter_attr.attr,
+ &cm_rtu_counter_attr.attr,
+ &cm_dreq_counter_attr.attr,
+ &cm_drep_counter_attr.attr,
+ &cm_sidr_req_counter_attr.attr,
+ &cm_sidr_rep_counter_attr.attr,
+ &cm_lap_counter_attr.attr,
+ &cm_apr_counter_attr.attr,
+ NULL
+};
+
+struct cm_port {
+ struct cm_device *cm_dev;
+ struct ib_mad_agent *mad_agent;
+ struct kobject port_obj;
+ u8 port_num;
+ struct cm_counter_group counter_group[CM_COUNTER_GROUPS];
+};
+
+struct cm_device {
+ struct list_head list;
+ struct ib_device *ib_device;
+ struct device *device;
+ u8 ack_delay;
+ struct cm_port *port[0];
+};
+
+struct cm_av {
+ struct cm_port *port;
+ union ib_gid dgid;
+ struct ib_ah_attr ah_attr;
+ u16 pkey_index;
+ u8 timeout;
+ u8 valid;
+ u8 smac[ETH_ALEN];
+};
+
+struct cm_work {
+ struct delayed_work work;
+ struct list_head list;
+ struct cm_port *port;
+ struct ib_mad_recv_wc *mad_recv_wc; /* Received MADs */
+ __be32 local_id; /* Established / timewait */
+ __be32 remote_id;
+ struct ib_cm_event cm_event;
+ struct ib_sa_path_rec path[0];
+};
+
+struct cm_timewait_info {
+ struct cm_work work; /* Must be first. */
+ struct list_head list;
+ struct rb_node remote_qp_node;
+ struct rb_node remote_id_node;
+ __be64 remote_ca_guid;
+ __be32 remote_qpn;
+ u8 inserted_remote_qp;
+ u8 inserted_remote_id;
+};
+
+struct cm_id_private {
+ struct ib_cm_id id;
+
+ struct rb_node service_node;
+ struct rb_node sidr_id_node;
+ spinlock_t lock; /* Do not acquire inside cm.lock */
+ struct completion comp;
+ atomic_t refcount;
+
+ struct ib_mad_send_buf *msg;
+ struct cm_timewait_info *timewait_info;
+ /* todo: use alternate port on send failure */
+ struct cm_av av;
+ struct cm_av alt_av;
+ struct ib_cm_compare_data *compare_data;
+
+ void *private_data;
+ __be64 tid;
+ __be32 local_qpn;
+ __be32 remote_qpn;
+ enum ib_qp_type qp_type;
+ __be32 sq_psn;
+ __be32 rq_psn;
+ int timeout_ms;
+ enum ib_mtu path_mtu;
+ __be16 pkey;
+ u8 private_data_len;
+ u8 max_cm_retries;
+ u8 peer_to_peer;
+ u8 responder_resources;
+ u8 initiator_depth;
+ u8 retry_count;
+ u8 rnr_retry_count;
+ u8 service_timeout;
+ u8 target_ack_delay;
+
+ struct list_head work_list;
+ atomic_t work_count;
+};
+
+static void cm_work_handler(struct work_struct *work);
+
+static inline void cm_deref_id(struct cm_id_private *cm_id_priv)
+{
+ if (atomic_dec_and_test(&cm_id_priv->refcount))
+ complete(&cm_id_priv->comp);
+}
+
+static int cm_alloc_msg(struct cm_id_private *cm_id_priv,
+ struct ib_mad_send_buf **msg)
+{
+ struct ib_mad_agent *mad_agent;
+ struct ib_mad_send_buf *m;
+ struct ib_ah *ah;
+
+ mad_agent = cm_id_priv->av.port->mad_agent;
+ ah = ib_create_ah(mad_agent->qp->pd, &cm_id_priv->av.ah_attr);
+ if (IS_ERR(ah))
+ return PTR_ERR(ah);
+
+ m = ib_create_send_mad(mad_agent, cm_id_priv->id.remote_cm_qpn,
+ cm_id_priv->av.pkey_index,
+ 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+ GFP_ATOMIC);
+ if (IS_ERR(m)) {
+ ib_destroy_ah(ah);
+ return PTR_ERR(m);
+ }
+
+ /* Timeout set by caller if response is expected. */
+ m->ah = ah;
+ m->retries = cm_id_priv->max_cm_retries;
+
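+	/* The send buffer holds a cm_id reference until cm_free_msg(). */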
+ atomic_inc(&cm_id_priv->refcount);
+ m->context[0] = cm_id_priv;
+ *msg = m;
+ return 0;
+}
+
+static int cm_alloc_response_msg(struct cm_port *port,
+ struct ib_mad_recv_wc *mad_recv_wc,
+ struct ib_mad_send_buf **msg)
+{
+ struct ib_mad_send_buf *m;
+ struct ib_ah *ah;
+
+ ah = ib_create_ah_from_wc(port->mad_agent->qp->pd, mad_recv_wc->wc,
+ mad_recv_wc->recv_buf.grh, port->port_num);
+ if (IS_ERR(ah))
+ return PTR_ERR(ah);
+
+ m = ib_create_send_mad(port->mad_agent, 1, mad_recv_wc->wc->pkey_index,
+ 0, IB_MGMT_MAD_HDR, IB_MGMT_MAD_DATA,
+ GFP_ATOMIC);
+ if (IS_ERR(m)) {
+ ib_destroy_ah(ah);
+ return PTR_ERR(m);
+ }
+ m->ah = ah;
+ *msg = m;
+ return 0;
+}
+
+static void cm_free_msg(struct ib_mad_send_buf *msg)
+{
+ ib_destroy_ah(msg->ah);
+ if (msg->context[0])
+ cm_deref_id(msg->context[0]);
+ ib_free_send_mad(msg);
+}
+
+static void * cm_copy_private_data(const void *private_data,
+ u8 private_data_len)
+{
+ void *data;
+
+ if (!private_data || !private_data_len)
+ return NULL;
+
+ data = kmemdup(private_data, private_data_len, GFP_KERNEL);
+ if (!data)
+ return ERR_PTR(-ENOMEM);
+
+ return data;
+}
+
+static void cm_set_private_data(struct cm_id_private *cm_id_priv,
+ void *private_data, u8 private_data_len)
+{
+ if (cm_id_priv->private_data && cm_id_priv->private_data_len)
+ kfree(cm_id_priv->private_data);
+
+ cm_id_priv->private_data = private_data;
+ cm_id_priv->private_data_len = private_data_len;
+}
+
+static void cm_init_av_for_response(struct cm_port *port, struct ib_wc *wc,
+ struct ib_grh *grh, struct cm_av *av)
+{
+ av->port = port;
+ av->pkey_index = wc->pkey_index;
+ ib_init_ah_from_wc(port->cm_dev->ib_device, port->port_num, wc,
+ grh, &av->ah_attr);
+}
+
+static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av)
+{
+ struct cm_device *cm_dev;
+ struct cm_port *port = NULL;
+ unsigned long flags;
+ int ret;
+ u8 p;
+
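+	/*
+	 * Pick the local port whose cached GID table contains the path's
+	 * source GID, then resolve the pkey index and build the address
+	 * handle attributes from the path record.
+	 */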
+ read_lock_irqsave(&cm.device_lock, flags);
+ list_for_each_entry(cm_dev, &cm.device_list, list) {
+ if (!ib_find_cached_gid(cm_dev->ib_device, &path->sgid,
+ &p, NULL)) {
+ port = cm_dev->port[p-1];
+ break;
+ }
+ }
+ read_unlock_irqrestore(&cm.device_lock, flags);
+
+ if (!port)
+ return -EINVAL;
+
+ ret = ib_find_cached_pkey(cm_dev->ib_device, port->port_num,
+ be16_to_cpu(path->pkey), &av->pkey_index);
+ if (ret)
+ return ret;
+
+ av->port = port;
+ ib_init_ah_from_path(cm_dev->ib_device, port->port_num, path,
+ &av->ah_attr);
+ av->timeout = path->packet_life_time + 1;
+ memcpy(av->smac, path->smac, sizeof(av->smac));
+
+ av->valid = 1;
+ return 0;
+}
+
+static int cm_alloc_id(struct cm_id_private *cm_id_priv)
+{
+ unsigned long flags;
+ int id;
+
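+	/*
+	 * Preload the IDR outside the spinlock so the GFP_NOWAIT
+	 * allocation below cannot sleep with cm.lock held; the returned
+	 * ID is XORed with a random operand to make local IDs less
+	 * predictable.
+	 */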
+ idr_preload(GFP_KERNEL);
+ spin_lock_irqsave(&cm.lock, flags);
+
+ id = idr_alloc_cyclic(&cm.local_id_table, cm_id_priv, 0, 0, GFP_NOWAIT);
+
+ spin_unlock_irqrestore(&cm.lock, flags);
+ idr_preload_end();
+
+ cm_id_priv->id.local_id = (__force __be32)id ^ cm.random_id_operand;
+ return id < 0 ? id : 0;
+}
+
+static void cm_free_id(__be32 local_id)
+{
+ spin_lock_irq(&cm.lock);
+ idr_remove(&cm.local_id_table,
+ (__force int) (local_id ^ cm.random_id_operand));
+ spin_unlock_irq(&cm.lock);
+}
+
+static struct cm_id_private * cm_get_id(__be32 local_id, __be32 remote_id)
+{
+ struct cm_id_private *cm_id_priv;
+
+ cm_id_priv = idr_find(&cm.local_id_table,
+ (__force int) (local_id ^ cm.random_id_operand));
+ if (cm_id_priv) {
+ if (cm_id_priv->id.remote_id == remote_id)
+ atomic_inc(&cm_id_priv->refcount);
+ else
+ cm_id_priv = NULL;
+ }
+
+ return cm_id_priv;
+}
+
+static struct cm_id_private * cm_acquire_id(__be32 local_id, __be32 remote_id)
+{
+ struct cm_id_private *cm_id_priv;
+
+ spin_lock_irq(&cm.lock);
+ cm_id_priv = cm_get_id(local_id, remote_id);
+ spin_unlock_irq(&cm.lock);
+
+ return cm_id_priv;
+}
+
+static void cm_mask_copy(u32 *dst, const u32 *src, const u32 *mask)
+{
+ int i;
+
+ for (i = 0; i < IB_CM_COMPARE_SIZE; i++)
+ dst[i] = src[i] & mask[i];
+}
+
+static int cm_compare_data(struct ib_cm_compare_data *src_data,
+ struct ib_cm_compare_data *dst_data)
+{
+ u32 src[IB_CM_COMPARE_SIZE];
+ u32 dst[IB_CM_COMPARE_SIZE];
+
+ if (!src_data || !dst_data)
+ return 0;
+
+ cm_mask_copy(src, src_data->data, dst_data->mask);
+ cm_mask_copy(dst, dst_data->data, src_data->mask);
+ return memcmp(src, dst, sizeof(src));
+}
+
+static int cm_compare_private_data(u32 *private_data,
+ struct ib_cm_compare_data *dst_data)
+{
+ u32 src[IB_CM_COMPARE_SIZE];
+
+ if (!dst_data)
+ return 0;
+
+ cm_mask_copy(src, private_data, dst_data->mask);
+ return memcmp(src, dst_data->data, sizeof(src));
+}
+
+/*
+ * Trivial helpers to strip endian annotation and compare; the
+ * endianness doesn't actually matter since we just need a stable
+ * order for the RB tree.
+ */
+static int be32_lt(__be32 a, __be32 b)
+{
+ return (__force u32) a < (__force u32) b;
+}
+
+static int be32_gt(__be32 a, __be32 b)
+{
+ return (__force u32) a > (__force u32) b;
+}
+
+static int be64_lt(__be64 a, __be64 b)
+{
+ return (__force u64) a < (__force u64) b;
+}
+
+static int be64_gt(__be64 a, __be64 b)
+{
+ return (__force u64) a > (__force u64) b;
+}
+
+static struct cm_id_private * cm_insert_listen(struct cm_id_private *cm_id_priv)
+{
+ struct rb_node **link = &cm.listen_service_table.rb_node;
+ struct rb_node *parent = NULL;
+ struct cm_id_private *cur_cm_id_priv;
+ __be64 service_id = cm_id_priv->id.service_id;
+ __be64 service_mask = cm_id_priv->id.service_mask;
+ int data_cmp;
+
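+	/*
+	 * Listens are ordered by (device, service ID, compare data); a
+	 * node whose masked service ID and compare data already match is
+	 * returned to the caller instead of inserting a duplicate.
+	 */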
+ while (*link) {
+ parent = *link;
+ cur_cm_id_priv = rb_entry(parent, struct cm_id_private,
+ service_node);
+ data_cmp = cm_compare_data(cm_id_priv->compare_data,
+ cur_cm_id_priv->compare_data);
+ if ((cur_cm_id_priv->id.service_mask & service_id) ==
+ (service_mask & cur_cm_id_priv->id.service_id) &&
+ (cm_id_priv->id.device == cur_cm_id_priv->id.device) &&
+ !data_cmp)
+ return cur_cm_id_priv;
+
+ if (cm_id_priv->id.device < cur_cm_id_priv->id.device)
+ link = &(*link)->rb_left;
+ else if (cm_id_priv->id.device > cur_cm_id_priv->id.device)
+ link = &(*link)->rb_right;
+ else if (be64_lt(service_id, cur_cm_id_priv->id.service_id))
+ link = &(*link)->rb_left;
+ else if (be64_gt(service_id, cur_cm_id_priv->id.service_id))
+ link = &(*link)->rb_right;
+ else if (data_cmp < 0)
+ link = &(*link)->rb_left;
+ else
+ link = &(*link)->rb_right;
+ }
+ rb_link_node(&cm_id_priv->service_node, parent, link);
+ rb_insert_color(&cm_id_priv->service_node, &cm.listen_service_table);
+ return NULL;
+}
+
+static struct cm_id_private * cm_find_listen(struct ib_device *device,
+ __be64 service_id,
+ u32 *private_data)
+{
+ struct rb_node *node = cm.listen_service_table.rb_node;
+ struct cm_id_private *cm_id_priv;
+ int data_cmp;
+
+ while (node) {
+ cm_id_priv = rb_entry(node, struct cm_id_private, service_node);
+ data_cmp = cm_compare_private_data(private_data,
+ cm_id_priv->compare_data);
+ if ((cm_id_priv->id.service_mask & service_id) ==
+ cm_id_priv->id.service_id &&
+ (cm_id_priv->id.device == device) && !data_cmp)
+ return cm_id_priv;
+
+ if (device < cm_id_priv->id.device)
+ node = node->rb_left;
+ else if (device > cm_id_priv->id.device)
+ node = node->rb_right;
+ else if (be64_lt(service_id, cm_id_priv->id.service_id))
+ node = node->rb_left;
+ else if (be64_gt(service_id, cm_id_priv->id.service_id))
+ node = node->rb_right;
+ else if (data_cmp < 0)
+ node = node->rb_left;
+ else
+ node = node->rb_right;
+ }
+ return NULL;
+}
+
+static struct cm_timewait_info * cm_insert_remote_id(struct cm_timewait_info
+ *timewait_info)
+{
+ struct rb_node **link = &cm.remote_id_table.rb_node;
+ struct rb_node *parent = NULL;
+ struct cm_timewait_info *cur_timewait_info;
+ __be64 remote_ca_guid = timewait_info->remote_ca_guid;
+ __be32 remote_id = timewait_info->work.remote_id;
+
+ while (*link) {
+ parent = *link;
+ cur_timewait_info = rb_entry(parent, struct cm_timewait_info,
+ remote_id_node);
+ if (be32_lt(remote_id, cur_timewait_info->work.remote_id))
+ link = &(*link)->rb_left;
+ else if (be32_gt(remote_id, cur_timewait_info->work.remote_id))
+ link = &(*link)->rb_right;
+ else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
+ link = &(*link)->rb_left;
+ else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
+ link = &(*link)->rb_right;
+ else
+ return cur_timewait_info;
+ }
+ timewait_info->inserted_remote_id = 1;
+ rb_link_node(&timewait_info->remote_id_node, parent, link);
+ rb_insert_color(&timewait_info->remote_id_node, &cm.remote_id_table);
+ return NULL;
+}
+
+static struct cm_timewait_info * cm_find_remote_id(__be64 remote_ca_guid,
+ __be32 remote_id)
+{
+ struct rb_node *node = cm.remote_id_table.rb_node;
+ struct cm_timewait_info *timewait_info;
+
+ while (node) {
+ timewait_info = rb_entry(node, struct cm_timewait_info,
+ remote_id_node);
+ if (be32_lt(remote_id, timewait_info->work.remote_id))
+ node = node->rb_left;
+ else if (be32_gt(remote_id, timewait_info->work.remote_id))
+ node = node->rb_right;
+ else if (be64_lt(remote_ca_guid, timewait_info->remote_ca_guid))
+ node = node->rb_left;
+ else if (be64_gt(remote_ca_guid, timewait_info->remote_ca_guid))
+ node = node->rb_right;
+ else
+ return timewait_info;
+ }
+ return NULL;
+}
+
+static struct cm_timewait_info * cm_insert_remote_qpn(struct cm_timewait_info
+ *timewait_info)
+{
+ struct rb_node **link = &cm.remote_qp_table.rb_node;
+ struct rb_node *parent = NULL;
+ struct cm_timewait_info *cur_timewait_info;
+ __be64 remote_ca_guid = timewait_info->remote_ca_guid;
+ __be32 remote_qpn = timewait_info->remote_qpn;
+
+ while (*link) {
+ parent = *link;
+ cur_timewait_info = rb_entry(parent, struct cm_timewait_info,
+ remote_qp_node);
+ if (be32_lt(remote_qpn, cur_timewait_info->remote_qpn))
+ link = &(*link)->rb_left;
+ else if (be32_gt(remote_qpn, cur_timewait_info->remote_qpn))
+ link = &(*link)->rb_right;
+ else if (be64_lt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
+ link = &(*link)->rb_left;
+ else if (be64_gt(remote_ca_guid, cur_timewait_info->remote_ca_guid))
+ link = &(*link)->rb_right;
+ else
+ return cur_timewait_info;
+ }
+ timewait_info->inserted_remote_qp = 1;
+ rb_link_node(&timewait_info->remote_qp_node, parent, link);
+ rb_insert_color(&timewait_info->remote_qp_node, &cm.remote_qp_table);
+ return NULL;
+}
+
+static struct cm_id_private * cm_insert_remote_sidr(struct cm_id_private
+ *cm_id_priv)
+{
+ struct rb_node **link = &cm.remote_sidr_table.rb_node;
+ struct rb_node *parent = NULL;
+ struct cm_id_private *cur_cm_id_priv;
+ union ib_gid *port_gid = &cm_id_priv->av.dgid;
+ __be32 remote_id = cm_id_priv->id.remote_id;
+
+ while (*link) {
+ parent = *link;
+ cur_cm_id_priv = rb_entry(parent, struct cm_id_private,
+ sidr_id_node);
+ if (be32_lt(remote_id, cur_cm_id_priv->id.remote_id))
+ link = &(*link)->rb_left;
+ else if (be32_gt(remote_id, cur_cm_id_priv->id.remote_id))
+ link = &(*link)->rb_right;
+ else {
+ int cmp;
+ cmp = memcmp(port_gid, &cur_cm_id_priv->av.dgid,
+ sizeof *port_gid);
+ if (cmp < 0)
+ link = &(*link)->rb_left;
+ else if (cmp > 0)
+ link = &(*link)->rb_right;
+ else
+ return cur_cm_id_priv;
+ }
+ }
+ rb_link_node(&cm_id_priv->sidr_id_node, parent, link);
+ rb_insert_color(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
+ return NULL;
+}
+
+static void cm_reject_sidr_req(struct cm_id_private *cm_id_priv,
+ enum ib_cm_sidr_status status)
+{
+ struct ib_cm_sidr_rep_param param;
+
+ memset(&param, 0, sizeof param);
+ param.status = status;
+ ib_send_cm_sidr_rep(&cm_id_priv->id, &param);
+}
+
+struct ib_cm_id *ib_create_cm_id(struct ib_device *device,
+ ib_cm_handler cm_handler,
+ void *context)
+{
+ struct cm_id_private *cm_id_priv;
+ int ret;
+
+ cm_id_priv = kzalloc(sizeof *cm_id_priv, GFP_KERNEL);
+ if (!cm_id_priv)
+ return ERR_PTR(-ENOMEM);
+
+ cm_id_priv->id.state = IB_CM_IDLE;
+ cm_id_priv->id.device = device;
+ cm_id_priv->id.cm_handler = cm_handler;
+ cm_id_priv->id.context = context;
+ cm_id_priv->id.remote_cm_qpn = 1;
+ ret = cm_alloc_id(cm_id_priv);
+ if (ret)
+ goto error;
+
+ spin_lock_init(&cm_id_priv->lock);
+ init_completion(&cm_id_priv->comp);
+ INIT_LIST_HEAD(&cm_id_priv->work_list);
+ atomic_set(&cm_id_priv->work_count, -1);
+ atomic_set(&cm_id_priv->refcount, 1);
+ return &cm_id_priv->id;
+
+error:
+ kfree(cm_id_priv);
+	return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(ib_create_cm_id);
+
+static struct cm_work * cm_dequeue_work(struct cm_id_private *cm_id_priv)
+{
+ struct cm_work *work;
+
+ if (list_empty(&cm_id_priv->work_list))
+ return NULL;
+
+ work = list_entry(cm_id_priv->work_list.next, struct cm_work, list);
+ list_del(&work->list);
+ return work;
+}
+
+static void cm_free_work(struct cm_work *work)
+{
+ if (work->mad_recv_wc)
+ ib_free_recv_mad(work->mad_recv_wc);
+ kfree(work);
+}
+
+static inline int cm_convert_to_ms(int iba_time)
+{
+ /* approximate conversion to ms from 4.096us x 2^iba_time */
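+	/* e.g. iba_time = 20 is ~4.3 s: 1 << (20 - 8) = 4096 ms. */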
+ return 1 << max(iba_time - 8, 0);
+}
+
+/*
+ * calculate: 4.096x2^ack_timeout = 4.096x2^ack_delay + 2x4.096x2^life_time
+ * Because of how ack_timeout is stored, adding one doubles the timeout.
+ * To avoid large timeouts, select the max(ack_delay, life_time + 1), and
+ * increment it (round up) only if the other is within 50%.
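+ * For example, ca_ack_delay = 15 and packet_life_time = 14 give
+ * ack_timeout = 16, which covers 2^15 units of ack delay plus
+ * 2 x 2^14 units of round-trip packet lifetime exactly.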
+ */
+static u8 cm_ack_timeout(u8 ca_ack_delay, u8 packet_life_time)
+{
+ int ack_timeout = packet_life_time + 1;
+
+ if (ack_timeout >= ca_ack_delay)
+ ack_timeout += (ca_ack_delay >= (ack_timeout - 1));
+ else
+ ack_timeout = ca_ack_delay +
+ (ack_timeout >= (ca_ack_delay - 1));
+
+ return min(31, ack_timeout);
+}
+
+static void cm_cleanup_timewait(struct cm_timewait_info *timewait_info)
+{
+ if (timewait_info->inserted_remote_id) {
+ rb_erase(&timewait_info->remote_id_node, &cm.remote_id_table);
+ timewait_info->inserted_remote_id = 0;
+ }
+
+ if (timewait_info->inserted_remote_qp) {
+ rb_erase(&timewait_info->remote_qp_node, &cm.remote_qp_table);
+ timewait_info->inserted_remote_qp = 0;
+ }
+}
+
+static struct cm_timewait_info * cm_create_timewait_info(__be32 local_id)
+{
+ struct cm_timewait_info *timewait_info;
+
+ timewait_info = kzalloc(sizeof *timewait_info, GFP_KERNEL);
+ if (!timewait_info)
+ return ERR_PTR(-ENOMEM);
+
+ timewait_info->work.local_id = local_id;
+ INIT_DELAYED_WORK(&timewait_info->work.work, cm_work_handler);
+ timewait_info->work.cm_event.event = IB_CM_TIMEWAIT_EXIT;
+ return timewait_info;
+}
+
+static void cm_enter_timewait(struct cm_id_private *cm_id_priv)
+{
+ int wait_time;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cm.lock, flags);
+ cm_cleanup_timewait(cm_id_priv->timewait_info);
+ list_add_tail(&cm_id_priv->timewait_info->list, &cm.timewait_list);
+ spin_unlock_irqrestore(&cm.lock, flags);
+
+ /*
+ * The cm_id could be destroyed by the user before we exit timewait.
+ * To protect against this, we search for the cm_id after exiting
+ * timewait before notifying the user that we've exited timewait.
+ */
+ cm_id_priv->id.state = IB_CM_TIMEWAIT;
+ wait_time = cm_convert_to_ms(cm_id_priv->av.timeout);
+ queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work,
+ msecs_to_jiffies(wait_time));
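+	/* The queued work now owns timewait_info; drop our pointer. */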
+ cm_id_priv->timewait_info = NULL;
+}
+
+static void cm_reset_to_idle(struct cm_id_private *cm_id_priv)
+{
+ unsigned long flags;
+
+ cm_id_priv->id.state = IB_CM_IDLE;
+ if (cm_id_priv->timewait_info) {
+ spin_lock_irqsave(&cm.lock, flags);
+ cm_cleanup_timewait(cm_id_priv->timewait_info);
+ spin_unlock_irqrestore(&cm.lock, flags);
+ kfree(cm_id_priv->timewait_info);
+ cm_id_priv->timewait_info = NULL;
+ }
+}
+
+static void cm_destroy_id(struct ib_cm_id *cm_id, int err)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_work *work;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
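+	/*
+	 * Tear-down depends on the current state: listens are unlinked
+	 * from the service tree, outstanding MADs are cancelled, and
+	 * connected IDs send a REJ, DREQ or DREP as appropriate before
+	 * the ID and its queued work are freed.
+	 */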
+retest:
+ spin_lock_irq(&cm_id_priv->lock);
+ switch (cm_id->state) {
+ case IB_CM_LISTEN:
+ cm_id->state = IB_CM_IDLE;
+ spin_unlock_irq(&cm_id_priv->lock);
+ spin_lock_irq(&cm.lock);
+ rb_erase(&cm_id_priv->service_node, &cm.listen_service_table);
+ spin_unlock_irq(&cm.lock);
+ break;
+ case IB_CM_SIDR_REQ_SENT:
+ cm_id->state = IB_CM_IDLE;
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ spin_unlock_irq(&cm_id_priv->lock);
+ break;
+ case IB_CM_SIDR_REQ_RCVD:
+ spin_unlock_irq(&cm_id_priv->lock);
+ cm_reject_sidr_req(cm_id_priv, IB_SIDR_REJECT);
+ break;
+ case IB_CM_REQ_SENT:
+ case IB_CM_MRA_REQ_RCVD:
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ spin_unlock_irq(&cm_id_priv->lock);
+ ib_send_cm_rej(cm_id, IB_CM_REJ_TIMEOUT,
+ &cm_id_priv->id.device->node_guid,
+ sizeof cm_id_priv->id.device->node_guid,
+ NULL, 0);
+ break;
+ case IB_CM_REQ_RCVD:
+ if (err == -ENOMEM) {
+			/* Do not reject; this allows future retries. */
+ cm_reset_to_idle(cm_id_priv);
+ spin_unlock_irq(&cm_id_priv->lock);
+ } else {
+ spin_unlock_irq(&cm_id_priv->lock);
+ ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
+ NULL, 0, NULL, 0);
+ }
+ break;
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ /* Fall through */
+ case IB_CM_MRA_REQ_SENT:
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ spin_unlock_irq(&cm_id_priv->lock);
+ ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
+ NULL, 0, NULL, 0);
+ break;
+ case IB_CM_ESTABLISHED:
+ spin_unlock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->qp_type == IB_QPT_XRC_TGT)
+ break;
+ ib_send_cm_dreq(cm_id, NULL, 0);
+ goto retest;
+ case IB_CM_DREQ_SENT:
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ cm_enter_timewait(cm_id_priv);
+ spin_unlock_irq(&cm_id_priv->lock);
+ break;
+ case IB_CM_DREQ_RCVD:
+ spin_unlock_irq(&cm_id_priv->lock);
+ ib_send_cm_drep(cm_id, NULL, 0);
+ break;
+ default:
+ spin_unlock_irq(&cm_id_priv->lock);
+ break;
+ }
+
+ cm_free_id(cm_id->local_id);
+ cm_deref_id(cm_id_priv);
+ wait_for_completion(&cm_id_priv->comp);
+ while ((work = cm_dequeue_work(cm_id_priv)) != NULL)
+ cm_free_work(work);
+ kfree(cm_id_priv->compare_data);
+ kfree(cm_id_priv->private_data);
+ kfree(cm_id_priv);
+}
+
+void ib_destroy_cm_id(struct ib_cm_id *cm_id)
+{
+ cm_destroy_id(cm_id, 0);
+}
+EXPORT_SYMBOL(ib_destroy_cm_id);
+
+int ib_cm_listen(struct ib_cm_id *cm_id, __be64 service_id, __be64 service_mask,
+ struct ib_cm_compare_data *compare_data)
+{
+ struct cm_id_private *cm_id_priv, *cur_cm_id_priv;
+ unsigned long flags;
+ int ret = 0;
+
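+	/*
+	 * A zero service_mask means match the exact service ID; IDs in
+	 * the CM-assigned range may only be requested as
+	 * IB_CM_ASSIGN_SERVICE_ID itself, in which case one is allocated
+	 * under cm.lock below.
+	 */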
+ service_mask = service_mask ? service_mask : ~cpu_to_be64(0);
+ service_id &= service_mask;
+ if ((service_id & IB_SERVICE_ID_AGN_MASK) == IB_CM_ASSIGN_SERVICE_ID &&
+ (service_id != IB_CM_ASSIGN_SERVICE_ID))
+ return -EINVAL;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ if (cm_id->state != IB_CM_IDLE)
+ return -EINVAL;
+
+ if (compare_data) {
+ cm_id_priv->compare_data = kzalloc(sizeof *compare_data,
+ GFP_KERNEL);
+ if (!cm_id_priv->compare_data)
+ return -ENOMEM;
+ cm_mask_copy(cm_id_priv->compare_data->data,
+ compare_data->data, compare_data->mask);
+ memcpy(cm_id_priv->compare_data->mask, compare_data->mask,
+ sizeof(compare_data->mask));
+ }
+
+ cm_id->state = IB_CM_LISTEN;
+
+ spin_lock_irqsave(&cm.lock, flags);
+ if (service_id == IB_CM_ASSIGN_SERVICE_ID) {
+ cm_id->service_id = cpu_to_be64(cm.listen_service_id++);
+ cm_id->service_mask = ~cpu_to_be64(0);
+ } else {
+ cm_id->service_id = service_id;
+ cm_id->service_mask = service_mask;
+ }
+ cur_cm_id_priv = cm_insert_listen(cm_id_priv);
+ spin_unlock_irqrestore(&cm.lock, flags);
+
+ if (cur_cm_id_priv) {
+ cm_id->state = IB_CM_IDLE;
+ kfree(cm_id_priv->compare_data);
+ cm_id_priv->compare_data = NULL;
+ ret = -EBUSY;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(ib_cm_listen);
+
+static __be64 cm_form_tid(struct cm_id_private *cm_id_priv,
+ enum cm_msg_sequence msg_seq)
+{
+ u64 hi_tid, low_tid;
+
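+	/*
+	 * Pack the MAD agent's hi_tid into the upper 32 bits and the
+	 * local communication ID into the lower 32 bits, with the message
+	 * sequence in bits 30-31 so different exchanges on the same cm_id
+	 * use distinct transaction IDs.
+	 */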
+ hi_tid = ((u64) cm_id_priv->av.port->mad_agent->hi_tid) << 32;
+ low_tid = (u64) ((__force u32)cm_id_priv->id.local_id |
+ (msg_seq << 30));
+ return cpu_to_be64(hi_tid | low_tid);
+}
+
+static void cm_format_mad_hdr(struct ib_mad_hdr *hdr,
+ __be16 attr_id, __be64 tid)
+{
+ hdr->base_version = IB_MGMT_BASE_VERSION;
+ hdr->mgmt_class = IB_MGMT_CLASS_CM;
+ hdr->class_version = IB_CM_CLASS_VERSION;
+ hdr->method = IB_MGMT_METHOD_SEND;
+ hdr->attr_id = attr_id;
+ hdr->tid = tid;
+}
+
+static void cm_format_req(struct cm_req_msg *req_msg,
+ struct cm_id_private *cm_id_priv,
+ struct ib_cm_req_param *param)
+{
+ struct ib_sa_path_rec *pri_path = param->primary_path;
+ struct ib_sa_path_rec *alt_path = param->alternate_path;
+
+ cm_format_mad_hdr(&req_msg->hdr, CM_REQ_ATTR_ID,
+ cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_REQ));
+
+ req_msg->local_comm_id = cm_id_priv->id.local_id;
+ req_msg->service_id = param->service_id;
+ req_msg->local_ca_guid = cm_id_priv->id.device->node_guid;
+ cm_req_set_local_qpn(req_msg, cpu_to_be32(param->qp_num));
+ cm_req_set_init_depth(req_msg, param->initiator_depth);
+ cm_req_set_remote_resp_timeout(req_msg,
+ param->remote_cm_response_timeout);
+ cm_req_set_qp_type(req_msg, param->qp_type);
+ cm_req_set_flow_ctrl(req_msg, param->flow_control);
+ cm_req_set_starting_psn(req_msg, cpu_to_be32(param->starting_psn));
+ cm_req_set_local_resp_timeout(req_msg,
+ param->local_cm_response_timeout);
+ req_msg->pkey = param->primary_path->pkey;
+ cm_req_set_path_mtu(req_msg, param->primary_path->mtu);
+ cm_req_set_max_cm_retries(req_msg, param->max_cm_retries);
+
+ if (param->qp_type != IB_QPT_XRC_INI) {
+ cm_req_set_resp_res(req_msg, param->responder_resources);
+ cm_req_set_retry_count(req_msg, param->retry_count);
+ cm_req_set_rnr_retry_count(req_msg, param->rnr_retry_count);
+ cm_req_set_srq(req_msg, param->srq);
+ }
+
+ if (pri_path->hop_limit <= 1) {
+ req_msg->primary_local_lid = pri_path->slid;
+ req_msg->primary_remote_lid = pri_path->dlid;
+ } else {
+ /* Work-around until there's a way to obtain remote LID info */
+ req_msg->primary_local_lid = IB_LID_PERMISSIVE;
+ req_msg->primary_remote_lid = IB_LID_PERMISSIVE;
+ }
+ req_msg->primary_local_gid = pri_path->sgid;
+ req_msg->primary_remote_gid = pri_path->dgid;
+ cm_req_set_primary_flow_label(req_msg, pri_path->flow_label);
+ cm_req_set_primary_packet_rate(req_msg, pri_path->rate);
+ req_msg->primary_traffic_class = pri_path->traffic_class;
+ req_msg->primary_hop_limit = pri_path->hop_limit;
+ cm_req_set_primary_sl(req_msg, pri_path->sl);
+ cm_req_set_primary_subnet_local(req_msg, (pri_path->hop_limit <= 1));
+ cm_req_set_primary_local_ack_timeout(req_msg,
+ cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
+ pri_path->packet_life_time));
+
+ if (alt_path) {
+ if (alt_path->hop_limit <= 1) {
+ req_msg->alt_local_lid = alt_path->slid;
+ req_msg->alt_remote_lid = alt_path->dlid;
+ } else {
+ req_msg->alt_local_lid = IB_LID_PERMISSIVE;
+ req_msg->alt_remote_lid = IB_LID_PERMISSIVE;
+ }
+ req_msg->alt_local_gid = alt_path->sgid;
+ req_msg->alt_remote_gid = alt_path->dgid;
+ cm_req_set_alt_flow_label(req_msg,
+ alt_path->flow_label);
+ cm_req_set_alt_packet_rate(req_msg, alt_path->rate);
+ req_msg->alt_traffic_class = alt_path->traffic_class;
+ req_msg->alt_hop_limit = alt_path->hop_limit;
+ cm_req_set_alt_sl(req_msg, alt_path->sl);
+ cm_req_set_alt_subnet_local(req_msg, (alt_path->hop_limit <= 1));
+ cm_req_set_alt_local_ack_timeout(req_msg,
+ cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
+ alt_path->packet_life_time));
+ }
+
+ if (param->private_data && param->private_data_len)
+ memcpy(req_msg->private_data, param->private_data,
+ param->private_data_len);
+}
+
+static int cm_validate_req_param(struct ib_cm_req_param *param)
+{
+ /* peer-to-peer not supported */
+ if (param->peer_to_peer)
+ return -EINVAL;
+
+ if (!param->primary_path)
+ return -EINVAL;
+
+ if (param->qp_type != IB_QPT_RC && param->qp_type != IB_QPT_UC &&
+ param->qp_type != IB_QPT_XRC_INI)
+ return -EINVAL;
+
+ if (param->private_data &&
+ param->private_data_len > IB_CM_REQ_PRIVATE_DATA_SIZE)
+ return -EINVAL;
+
+ if (param->alternate_path &&
+ (param->alternate_path->pkey != param->primary_path->pkey ||
+ param->alternate_path->mtu != param->primary_path->mtu))
+ return -EINVAL;
+
+ return 0;
+}
+
+int ib_send_cm_req(struct ib_cm_id *cm_id,
+ struct ib_cm_req_param *param)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_req_msg *req_msg;
+ unsigned long flags;
+ int ret;
+
+ ret = cm_validate_req_param(param);
+ if (ret)
+ return ret;
+
+ /* Verify that we're not in timewait. */
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state != IB_CM_IDLE) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ ret = -EINVAL;
+ goto out;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+	cm_id_priv->timewait_info =
+		cm_create_timewait_info(cm_id_priv->id.local_id);
+ if (IS_ERR(cm_id_priv->timewait_info)) {
+ ret = PTR_ERR(cm_id_priv->timewait_info);
+ goto out;
+ }
+
+ ret = cm_init_av_by_path(param->primary_path, &cm_id_priv->av);
+ if (ret)
+ goto error1;
+ if (param->alternate_path) {
+ ret = cm_init_av_by_path(param->alternate_path,
+ &cm_id_priv->alt_av);
+ if (ret)
+ goto error1;
+ }
+ cm_id->service_id = param->service_id;
+ cm_id->service_mask = ~cpu_to_be64(0);
+ cm_id_priv->timeout_ms = cm_convert_to_ms(
+ param->primary_path->packet_life_time) * 2 +
+ cm_convert_to_ms(
+ param->remote_cm_response_timeout);
+ cm_id_priv->max_cm_retries = param->max_cm_retries;
+ cm_id_priv->initiator_depth = param->initiator_depth;
+ cm_id_priv->responder_resources = param->responder_resources;
+ cm_id_priv->retry_count = param->retry_count;
+ cm_id_priv->path_mtu = param->primary_path->mtu;
+ cm_id_priv->pkey = param->primary_path->pkey;
+ cm_id_priv->qp_type = param->qp_type;
+
+ ret = cm_alloc_msg(cm_id_priv, &cm_id_priv->msg);
+ if (ret)
+ goto error1;
+
+ req_msg = (struct cm_req_msg *) cm_id_priv->msg->mad;
+ cm_format_req(req_msg, cm_id_priv, param);
+ cm_id_priv->tid = req_msg->hdr.tid;
+ cm_id_priv->msg->timeout_ms = cm_id_priv->timeout_ms;
+ cm_id_priv->msg->context[1] = (void *) (unsigned long) IB_CM_REQ_SENT;
+
+ cm_id_priv->local_qpn = cm_req_get_local_qpn(req_msg);
+ cm_id_priv->rq_psn = cm_req_get_starting_psn(req_msg);
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ ret = ib_post_send_mad(cm_id_priv->msg, NULL);
+ if (ret) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ goto error2;
+ }
+ BUG_ON(cm_id->state != IB_CM_IDLE);
+ cm_id->state = IB_CM_REQ_SENT;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return 0;
+
+error2: cm_free_msg(cm_id_priv->msg);
+error1: kfree(cm_id_priv->timewait_info);
+out: return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_req);
+
+static int cm_issue_rej(struct cm_port *port,
+ struct ib_mad_recv_wc *mad_recv_wc,
+ enum ib_cm_rej_reason reason,
+ enum cm_msg_response msg_rejected,
+ void *ari, u8 ari_length)
+{
+ struct ib_mad_send_buf *msg = NULL;
+ struct cm_rej_msg *rej_msg, *rcv_msg;
+ int ret;
+
+ ret = cm_alloc_response_msg(port, mad_recv_wc, &msg);
+ if (ret)
+ return ret;
+
+ /* We just need common CM header information. Cast to any message. */
+ rcv_msg = (struct cm_rej_msg *) mad_recv_wc->recv_buf.mad;
+ rej_msg = (struct cm_rej_msg *) msg->mad;
+
+ cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, rcv_msg->hdr.tid);
+ rej_msg->remote_comm_id = rcv_msg->local_comm_id;
+ rej_msg->local_comm_id = rcv_msg->remote_comm_id;
+ cm_rej_set_msg_rejected(rej_msg, msg_rejected);
+ rej_msg->reason = cpu_to_be16(reason);
+
+ if (ari && ari_length) {
+ cm_rej_set_reject_info_len(rej_msg, ari_length);
+ memcpy(rej_msg->ari, ari, ari_length);
+ }
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ cm_free_msg(msg);
+
+ return ret;
+}
+
+static inline int cm_is_active_peer(__be64 local_ca_guid, __be64 remote_ca_guid,
+ __be32 local_qpn, __be32 remote_qpn)
+{
+ return (be64_to_cpu(local_ca_guid) > be64_to_cpu(remote_ca_guid) ||
+ ((local_ca_guid == remote_ca_guid) &&
+ (be32_to_cpu(local_qpn) > be32_to_cpu(remote_qpn))));
+}
+
+static void cm_format_paths_from_req(struct cm_req_msg *req_msg,
+ struct ib_sa_path_rec *primary_path,
+ struct ib_sa_path_rec *alt_path)
+{
+ memset(primary_path, 0, sizeof *primary_path);
+ primary_path->dgid = req_msg->primary_local_gid;
+ primary_path->sgid = req_msg->primary_remote_gid;
+ primary_path->dlid = req_msg->primary_local_lid;
+ primary_path->slid = req_msg->primary_remote_lid;
+ primary_path->flow_label = cm_req_get_primary_flow_label(req_msg);
+ primary_path->hop_limit = req_msg->primary_hop_limit;
+ primary_path->traffic_class = req_msg->primary_traffic_class;
+ primary_path->reversible = 1;
+ primary_path->pkey = req_msg->pkey;
+ primary_path->sl = cm_req_get_primary_sl(req_msg);
+ primary_path->mtu_selector = IB_SA_EQ;
+ primary_path->mtu = cm_req_get_path_mtu(req_msg);
+ primary_path->rate_selector = IB_SA_EQ;
+ primary_path->rate = cm_req_get_primary_packet_rate(req_msg);
+ primary_path->packet_life_time_selector = IB_SA_EQ;
+ primary_path->packet_life_time =
+ cm_req_get_primary_local_ack_timeout(req_msg);
+ primary_path->packet_life_time -= (primary_path->packet_life_time > 0);
+
+ if (req_msg->alt_local_lid) {
+ memset(alt_path, 0, sizeof *alt_path);
+ alt_path->dgid = req_msg->alt_local_gid;
+ alt_path->sgid = req_msg->alt_remote_gid;
+ alt_path->dlid = req_msg->alt_local_lid;
+ alt_path->slid = req_msg->alt_remote_lid;
+ alt_path->flow_label = cm_req_get_alt_flow_label(req_msg);
+ alt_path->hop_limit = req_msg->alt_hop_limit;
+ alt_path->traffic_class = req_msg->alt_traffic_class;
+ alt_path->reversible = 1;
+ alt_path->pkey = req_msg->pkey;
+ alt_path->sl = cm_req_get_alt_sl(req_msg);
+ alt_path->mtu_selector = IB_SA_EQ;
+ alt_path->mtu = cm_req_get_path_mtu(req_msg);
+ alt_path->rate_selector = IB_SA_EQ;
+ alt_path->rate = cm_req_get_alt_packet_rate(req_msg);
+ alt_path->packet_life_time_selector = IB_SA_EQ;
+ alt_path->packet_life_time =
+ cm_req_get_alt_local_ack_timeout(req_msg);
+ alt_path->packet_life_time -= (alt_path->packet_life_time > 0);
+ }
+}
+
+static void cm_format_req_event(struct cm_work *work,
+ struct cm_id_private *cm_id_priv,
+ struct ib_cm_id *listen_id)
+{
+ struct cm_req_msg *req_msg;
+ struct ib_cm_req_event_param *param;
+
+ req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
+ param = &work->cm_event.param.req_rcvd;
+ param->listen_id = listen_id;
+ param->port = cm_id_priv->av.port->port_num;
+ param->primary_path = &work->path[0];
+ if (req_msg->alt_local_lid)
+ param->alternate_path = &work->path[1];
+ else
+ param->alternate_path = NULL;
+ param->remote_ca_guid = req_msg->local_ca_guid;
+ param->remote_qkey = be32_to_cpu(req_msg->local_qkey);
+ param->remote_qpn = be32_to_cpu(cm_req_get_local_qpn(req_msg));
+ param->qp_type = cm_req_get_qp_type(req_msg);
+ param->starting_psn = be32_to_cpu(cm_req_get_starting_psn(req_msg));
+ param->responder_resources = cm_req_get_init_depth(req_msg);
+ param->initiator_depth = cm_req_get_resp_res(req_msg);
+ param->local_cm_response_timeout =
+ cm_req_get_remote_resp_timeout(req_msg);
+ param->flow_control = cm_req_get_flow_ctrl(req_msg);
+ param->remote_cm_response_timeout =
+ cm_req_get_local_resp_timeout(req_msg);
+ param->retry_count = cm_req_get_retry_count(req_msg);
+ param->rnr_retry_count = cm_req_get_rnr_retry_count(req_msg);
+ param->srq = cm_req_get_srq(req_msg);
+ work->cm_event.private_data = &req_msg->private_data;
+}
+
+static void cm_process_work(struct cm_id_private *cm_id_priv,
+ struct cm_work *work)
+{
+ int ret;
+
+ /* We will typically only have the current event to report. */
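+	/*
+	 * A non-zero return from the consumer's handler destroys the
+	 * cm_id below; otherwise any events queued while this one was
+	 * being delivered are drained in order.
+	 */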
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &work->cm_event);
+ cm_free_work(work);
+
+ while (!ret && !atomic_add_negative(-1, &cm_id_priv->work_count)) {
+ spin_lock_irq(&cm_id_priv->lock);
+ work = cm_dequeue_work(cm_id_priv);
+ spin_unlock_irq(&cm_id_priv->lock);
+ BUG_ON(!work);
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id,
+ &work->cm_event);
+ cm_free_work(work);
+ }
+ cm_deref_id(cm_id_priv);
+ if (ret)
+ cm_destroy_id(&cm_id_priv->id, ret);
+}
+
+static void cm_format_mra(struct cm_mra_msg *mra_msg,
+ struct cm_id_private *cm_id_priv,
+ enum cm_msg_response msg_mraed, u8 service_timeout,
+ const void *private_data, u8 private_data_len)
+{
+ cm_format_mad_hdr(&mra_msg->hdr, CM_MRA_ATTR_ID, cm_id_priv->tid);
+ cm_mra_set_msg_mraed(mra_msg, msg_mraed);
+ mra_msg->local_comm_id = cm_id_priv->id.local_id;
+ mra_msg->remote_comm_id = cm_id_priv->id.remote_id;
+ cm_mra_set_service_timeout(mra_msg, service_timeout);
+
+ if (private_data && private_data_len)
+ memcpy(mra_msg->private_data, private_data, private_data_len);
+}
+
+static void cm_format_rej(struct cm_rej_msg *rej_msg,
+ struct cm_id_private *cm_id_priv,
+ enum ib_cm_rej_reason reason,
+ void *ari,
+ u8 ari_length,
+ const void *private_data,
+ u8 private_data_len)
+{
+ cm_format_mad_hdr(&rej_msg->hdr, CM_REJ_ATTR_ID, cm_id_priv->tid);
+ rej_msg->remote_comm_id = cm_id_priv->id.remote_id;
+
+	switch (cm_id_priv->id.state) {
+ case IB_CM_REQ_RCVD:
+ rej_msg->local_comm_id = 0;
+ cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REQ);
+ break;
+ case IB_CM_MRA_REQ_SENT:
+ rej_msg->local_comm_id = cm_id_priv->id.local_id;
+ cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REQ);
+ break;
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ rej_msg->local_comm_id = cm_id_priv->id.local_id;
+ cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_REP);
+ break;
+ default:
+ rej_msg->local_comm_id = cm_id_priv->id.local_id;
+ cm_rej_set_msg_rejected(rej_msg, CM_MSG_RESPONSE_OTHER);
+ break;
+ }
+
+ rej_msg->reason = cpu_to_be16(reason);
+ if (ari && ari_length) {
+ cm_rej_set_reject_info_len(rej_msg, ari_length);
+ memcpy(rej_msg->ari, ari, ari_length);
+ }
+
+ if (private_data && private_data_len)
+ memcpy(rej_msg->private_data, private_data, private_data_len);
+}
+
+static void cm_dup_req_handler(struct cm_work *work,
+ struct cm_id_private *cm_id_priv)
+{
+ struct ib_mad_send_buf *msg = NULL;
+ int ret;
+
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_REQ_COUNTER]);
+
+ /* Quick state check to discard duplicate REQs. */
+ if (cm_id_priv->id.state == IB_CM_REQ_RCVD)
+ return;
+
+ ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg);
+ if (ret)
+ return;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_MRA_REQ_SENT:
+ cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+ CM_MSG_RESPONSE_REQ, cm_id_priv->service_timeout,
+ cm_id_priv->private_data,
+ cm_id_priv->private_data_len);
+ break;
+ case IB_CM_TIMEWAIT:
+ cm_format_rej((struct cm_rej_msg *) msg->mad, cm_id_priv,
+ IB_CM_REJ_STALE_CONN, NULL, 0, NULL, 0);
+ break;
+ default:
+ goto unlock;
+ }
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ goto free;
+ return;
+
+unlock: spin_unlock_irq(&cm_id_priv->lock);
+free: cm_free_msg(msg);
+}
+
+static struct cm_id_private * cm_match_req(struct cm_work *work,
+ struct cm_id_private *cm_id_priv)
+{
+ struct cm_id_private *listen_cm_id_priv, *cur_cm_id_priv;
+ struct cm_timewait_info *timewait_info;
+ struct cm_req_msg *req_msg;
+
+ req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
+
+ /* Check for possible duplicate REQ. */
+ spin_lock_irq(&cm.lock);
+ timewait_info = cm_insert_remote_id(cm_id_priv->timewait_info);
+ if (timewait_info) {
+ cur_cm_id_priv = cm_get_id(timewait_info->work.local_id,
+ timewait_info->work.remote_id);
+ spin_unlock_irq(&cm.lock);
+ if (cur_cm_id_priv) {
+ cm_dup_req_handler(work, cur_cm_id_priv);
+ cm_deref_id(cur_cm_id_priv);
+ }
+ return NULL;
+ }
+
+ /* Check for stale connections. */
+ timewait_info = cm_insert_remote_qpn(cm_id_priv->timewait_info);
+ if (timewait_info) {
+ cm_cleanup_timewait(cm_id_priv->timewait_info);
+ spin_unlock_irq(&cm.lock);
+ cm_issue_rej(work->port, work->mad_recv_wc,
+ IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REQ,
+ NULL, 0);
+ return NULL;
+ }
+
+ /* Find matching listen request. */
+ listen_cm_id_priv = cm_find_listen(cm_id_priv->id.device,
+ req_msg->service_id,
+ req_msg->private_data);
+ if (!listen_cm_id_priv) {
+ cm_cleanup_timewait(cm_id_priv->timewait_info);
+ spin_unlock_irq(&cm.lock);
+ cm_issue_rej(work->port, work->mad_recv_wc,
+ IB_CM_REJ_INVALID_SERVICE_ID, CM_MSG_RESPONSE_REQ,
+ NULL, 0);
+ goto out;
+ }
+ atomic_inc(&listen_cm_id_priv->refcount);
+ atomic_inc(&cm_id_priv->refcount);
+ cm_id_priv->id.state = IB_CM_REQ_RCVD;
+ atomic_inc(&cm_id_priv->work_count);
+ spin_unlock_irq(&cm.lock);
+out:
+ return listen_cm_id_priv;
+}
+
+/*
+ * Work-around for inter-subnet connections. If the LIDs are permissive,
+ * we need to override the LID/SL data in the REQ with the LID information
+ * in the work completion.
+ */
+static void cm_process_routed_req(struct cm_req_msg *req_msg, struct ib_wc *wc)
+{
+ if (!cm_req_get_primary_subnet_local(req_msg)) {
+ if (req_msg->primary_local_lid == IB_LID_PERMISSIVE) {
+ req_msg->primary_local_lid = cpu_to_be16(wc->slid);
+ cm_req_set_primary_sl(req_msg, wc->sl);
+ }
+
+ if (req_msg->primary_remote_lid == IB_LID_PERMISSIVE)
+ req_msg->primary_remote_lid = cpu_to_be16(wc->dlid_path_bits);
+ }
+
+ if (!cm_req_get_alt_subnet_local(req_msg)) {
+ if (req_msg->alt_local_lid == IB_LID_PERMISSIVE) {
+ req_msg->alt_local_lid = cpu_to_be16(wc->slid);
+ cm_req_set_alt_sl(req_msg, wc->sl);
+ }
+
+ if (req_msg->alt_remote_lid == IB_LID_PERMISSIVE)
+ req_msg->alt_remote_lid = cpu_to_be16(wc->dlid_path_bits);
+ }
+}
+
+static int cm_req_handler(struct cm_work *work)
+{
+ struct ib_cm_id *cm_id;
+ struct cm_id_private *cm_id_priv, *listen_cm_id_priv;
+ struct cm_req_msg *req_msg;
+ int ret;
+
+ req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad;
+
+ cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL);
+ if (IS_ERR(cm_id))
+ return PTR_ERR(cm_id);
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ cm_id_priv->id.remote_id = req_msg->local_comm_id;
+ cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
+ work->mad_recv_wc->recv_buf.grh,
+ &cm_id_priv->av);
+	cm_id_priv->timewait_info =
+		cm_create_timewait_info(cm_id_priv->id.local_id);
+ if (IS_ERR(cm_id_priv->timewait_info)) {
+ ret = PTR_ERR(cm_id_priv->timewait_info);
+ goto destroy;
+ }
+ cm_id_priv->timewait_info->work.remote_id = req_msg->local_comm_id;
+ cm_id_priv->timewait_info->remote_ca_guid = req_msg->local_ca_guid;
+ cm_id_priv->timewait_info->remote_qpn = cm_req_get_local_qpn(req_msg);
+
+ listen_cm_id_priv = cm_match_req(work, cm_id_priv);
+ if (!listen_cm_id_priv) {
+ ret = -EINVAL;
+ kfree(cm_id_priv->timewait_info);
+ goto destroy;
+ }
+
+ cm_id_priv->id.cm_handler = listen_cm_id_priv->id.cm_handler;
+ cm_id_priv->id.context = listen_cm_id_priv->id.context;
+ cm_id_priv->id.service_id = req_msg->service_id;
+ cm_id_priv->id.service_mask = ~cpu_to_be64(0);
+
+ cm_process_routed_req(req_msg, work->mad_recv_wc->wc);
+ cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]);
+
+ memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, ETH_ALEN);
+ work->path[0].vlan_id = cm_id_priv->av.ah_attr.vlan_id;
+ ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av);
+ if (ret) {
+ ib_get_cached_gid(work->port->cm_dev->ib_device,
+ work->port->port_num, 0, &work->path[0].sgid);
+ ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID,
+ &work->path[0].sgid, sizeof work->path[0].sgid,
+ NULL, 0);
+ goto rejected;
+ }
+ if (req_msg->alt_local_lid) {
+ ret = cm_init_av_by_path(&work->path[1], &cm_id_priv->alt_av);
+ if (ret) {
+ ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_ALT_GID,
+ &work->path[0].sgid,
+ sizeof work->path[0].sgid, NULL, 0);
+ goto rejected;
+ }
+ }
+ cm_id_priv->tid = req_msg->hdr.tid;
+ cm_id_priv->timeout_ms = cm_convert_to_ms(
+ cm_req_get_local_resp_timeout(req_msg));
+ cm_id_priv->max_cm_retries = cm_req_get_max_cm_retries(req_msg);
+ cm_id_priv->remote_qpn = cm_req_get_local_qpn(req_msg);
+ cm_id_priv->initiator_depth = cm_req_get_resp_res(req_msg);
+ cm_id_priv->responder_resources = cm_req_get_init_depth(req_msg);
+ cm_id_priv->path_mtu = cm_req_get_path_mtu(req_msg);
+ cm_id_priv->pkey = req_msg->pkey;
+ cm_id_priv->sq_psn = cm_req_get_starting_psn(req_msg);
+ cm_id_priv->retry_count = cm_req_get_retry_count(req_msg);
+ cm_id_priv->rnr_retry_count = cm_req_get_rnr_retry_count(req_msg);
+ cm_id_priv->qp_type = cm_req_get_qp_type(req_msg);
+
+ cm_format_req_event(work, cm_id_priv, &listen_cm_id_priv->id);
+ cm_process_work(cm_id_priv, work);
+ cm_deref_id(listen_cm_id_priv);
+ return 0;
+
+rejected:
+ atomic_dec(&cm_id_priv->refcount);
+ cm_deref_id(listen_cm_id_priv);
+destroy:
+ ib_destroy_cm_id(cm_id);
+ return ret;
+}
+
+static void cm_format_rep(struct cm_rep_msg *rep_msg,
+ struct cm_id_private *cm_id_priv,
+ struct ib_cm_rep_param *param)
+{
+ cm_format_mad_hdr(&rep_msg->hdr, CM_REP_ATTR_ID, cm_id_priv->tid);
+ rep_msg->local_comm_id = cm_id_priv->id.local_id;
+ rep_msg->remote_comm_id = cm_id_priv->id.remote_id;
+ cm_rep_set_starting_psn(rep_msg, cpu_to_be32(param->starting_psn));
+ rep_msg->resp_resources = param->responder_resources;
+ cm_rep_set_target_ack_delay(rep_msg,
+ cm_id_priv->av.port->cm_dev->ack_delay);
+ cm_rep_set_failover(rep_msg, param->failover_accepted);
+ cm_rep_set_rnr_retry_count(rep_msg, param->rnr_retry_count);
+ rep_msg->local_ca_guid = cm_id_priv->id.device->node_guid;
+
+ if (cm_id_priv->qp_type != IB_QPT_XRC_TGT) {
+ rep_msg->initiator_depth = param->initiator_depth;
+ cm_rep_set_flow_ctrl(rep_msg, param->flow_control);
+ cm_rep_set_srq(rep_msg, param->srq);
+ cm_rep_set_local_qpn(rep_msg, cpu_to_be32(param->qp_num));
+ } else {
+ cm_rep_set_srq(rep_msg, 1);
+ cm_rep_set_local_eecn(rep_msg, cpu_to_be32(param->qp_num));
+ }
+
+ if (param->private_data && param->private_data_len)
+ memcpy(rep_msg->private_data, param->private_data,
+ param->private_data_len);
+}
+
+int ib_send_cm_rep(struct ib_cm_id *cm_id,
+ struct ib_cm_rep_param *param)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ struct cm_rep_msg *rep_msg;
+ unsigned long flags;
+ int ret;
+
+ if (param->private_data &&
+ param->private_data_len > IB_CM_REP_PRIVATE_DATA_SIZE)
+ return -EINVAL;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state != IB_CM_REQ_RCVD &&
+ cm_id->state != IB_CM_MRA_REQ_SENT) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (ret)
+ goto out;
+
+ rep_msg = (struct cm_rep_msg *) msg->mad;
+ cm_format_rep(rep_msg, cm_id_priv, param);
+ msg->timeout_ms = cm_id_priv->timeout_ms;
+ msg->context[1] = (void *) (unsigned long) IB_CM_REP_SENT;
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_free_msg(msg);
+ return ret;
+ }
+
+ cm_id->state = IB_CM_REP_SENT;
+ cm_id_priv->msg = msg;
+ cm_id_priv->initiator_depth = param->initiator_depth;
+ cm_id_priv->responder_resources = param->responder_resources;
+ cm_id_priv->rq_psn = cm_rep_get_starting_psn(rep_msg);
+ cm_id_priv->local_qpn = cpu_to_be32(param->qp_num & 0xFFFFFF);
+
+out: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_rep);
+
+static void cm_format_rtu(struct cm_rtu_msg *rtu_msg,
+ struct cm_id_private *cm_id_priv,
+ const void *private_data,
+ u8 private_data_len)
+{
+ cm_format_mad_hdr(&rtu_msg->hdr, CM_RTU_ATTR_ID, cm_id_priv->tid);
+ rtu_msg->local_comm_id = cm_id_priv->id.local_id;
+ rtu_msg->remote_comm_id = cm_id_priv->id.remote_id;
+
+ if (private_data && private_data_len)
+ memcpy(rtu_msg->private_data, private_data, private_data_len);
+}
+
+int ib_send_cm_rtu(struct ib_cm_id *cm_id,
+ const void *private_data,
+ u8 private_data_len)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ unsigned long flags;
+ void *data;
+ int ret;
+
+ if (private_data && private_data_len > IB_CM_RTU_PRIVATE_DATA_SIZE)
+ return -EINVAL;
+
+ data = cm_copy_private_data(private_data, private_data_len);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state != IB_CM_REP_RCVD &&
+ cm_id->state != IB_CM_MRA_REP_SENT) {
+ ret = -EINVAL;
+ goto error;
+ }
+
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (ret)
+ goto error;
+
+ cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv,
+ private_data, private_data_len);
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_free_msg(msg);
+ kfree(data);
+ return ret;
+ }
+
+ cm_id->state = IB_CM_ESTABLISHED;
+ cm_set_private_data(cm_id_priv, data, private_data_len);
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return 0;
+
+error: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ kfree(data);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_rtu);
+
+static void cm_format_rep_event(struct cm_work *work, enum ib_qp_type qp_type)
+{
+ struct cm_rep_msg *rep_msg;
+ struct ib_cm_rep_event_param *param;
+
+ rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad;
+ param = &work->cm_event.param.rep_rcvd;
+ param->remote_ca_guid = rep_msg->local_ca_guid;
+ param->remote_qkey = be32_to_cpu(rep_msg->local_qkey);
+ param->remote_qpn = be32_to_cpu(cm_rep_get_qpn(rep_msg, qp_type));
+ param->starting_psn = be32_to_cpu(cm_rep_get_starting_psn(rep_msg));
+ param->responder_resources = rep_msg->initiator_depth;
+ param->initiator_depth = rep_msg->resp_resources;
+ param->target_ack_delay = cm_rep_get_target_ack_delay(rep_msg);
+ param->failover_accepted = cm_rep_get_failover(rep_msg);
+ param->flow_control = cm_rep_get_flow_ctrl(rep_msg);
+ param->rnr_retry_count = cm_rep_get_rnr_retry_count(rep_msg);
+ param->srq = cm_rep_get_srq(rep_msg);
+ work->cm_event.private_data = &rep_msg->private_data;
+}
+
+static void cm_dup_rep_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_rep_msg *rep_msg;
+ struct ib_mad_send_buf *msg = NULL;
+ int ret;
+
+ rep_msg = (struct cm_rep_msg *) work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id,
+ rep_msg->local_comm_id);
+ if (!cm_id_priv)
+ return;
+
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_REP_COUNTER]);
+ ret = cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg);
+ if (ret)
+ goto deref;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state == IB_CM_ESTABLISHED)
+ cm_format_rtu((struct cm_rtu_msg *) msg->mad, cm_id_priv,
+ cm_id_priv->private_data,
+ cm_id_priv->private_data_len);
+ else if (cm_id_priv->id.state == IB_CM_MRA_REP_SENT)
+ cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+ CM_MSG_RESPONSE_REP, cm_id_priv->service_timeout,
+ cm_id_priv->private_data,
+ cm_id_priv->private_data_len);
+ else
+ goto unlock;
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ goto free;
+ goto deref;
+
+unlock: spin_unlock_irq(&cm_id_priv->lock);
+free: cm_free_msg(msg);
+deref: cm_deref_id(cm_id_priv);
+}
+
+static int cm_rep_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_rep_msg *rep_msg;
+ int ret;
+
+ rep_msg = (struct cm_rep_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(rep_msg->remote_comm_id, 0);
+ if (!cm_id_priv) {
+ cm_dup_rep_handler(work);
+ return -EINVAL;
+ }
+
+ cm_format_rep_event(work, cm_id_priv->qp_type);
+
+ spin_lock_irq(&cm_id_priv->lock);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REQ_SENT:
+ case IB_CM_MRA_REQ_RCVD:
+ break;
+ default:
+ spin_unlock_irq(&cm_id_priv->lock);
+ ret = -EINVAL;
+ goto error;
+ }
+
+ cm_id_priv->timewait_info->work.remote_id = rep_msg->local_comm_id;
+ cm_id_priv->timewait_info->remote_ca_guid = rep_msg->local_ca_guid;
+ cm_id_priv->timewait_info->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type);
+
+ spin_lock(&cm.lock);
+ /* Check for duplicate REP. */
+ if (cm_insert_remote_id(cm_id_priv->timewait_info)) {
+ spin_unlock(&cm.lock);
+ spin_unlock_irq(&cm_id_priv->lock);
+ ret = -EINVAL;
+ goto error;
+ }
+ /* Check for a stale connection. */
+ if (cm_insert_remote_qpn(cm_id_priv->timewait_info)) {
+ rb_erase(&cm_id_priv->timewait_info->remote_id_node,
+ &cm.remote_id_table);
+ cm_id_priv->timewait_info->inserted_remote_id = 0;
+ spin_unlock(&cm.lock);
+ spin_unlock_irq(&cm_id_priv->lock);
+ cm_issue_rej(work->port, work->mad_recv_wc,
+ IB_CM_REJ_STALE_CONN, CM_MSG_RESPONSE_REP,
+ NULL, 0);
+ ret = -EINVAL;
+ goto error;
+ }
+ spin_unlock(&cm.lock);
+
+ cm_id_priv->id.state = IB_CM_REP_RCVD;
+ cm_id_priv->id.remote_id = rep_msg->local_comm_id;
+ cm_id_priv->remote_qpn = cm_rep_get_qpn(rep_msg, cm_id_priv->qp_type);
+ cm_id_priv->initiator_depth = rep_msg->resp_resources;
+ cm_id_priv->responder_resources = rep_msg->initiator_depth;
+ cm_id_priv->sq_psn = cm_rep_get_starting_psn(rep_msg);
+ cm_id_priv->rnr_retry_count = cm_rep_get_rnr_retry_count(rep_msg);
+ cm_id_priv->target_ack_delay = cm_rep_get_target_ack_delay(rep_msg);
+ cm_id_priv->av.timeout =
+ cm_ack_timeout(cm_id_priv->target_ack_delay,
+ cm_id_priv->av.timeout - 1);
+ cm_id_priv->alt_av.timeout =
+ cm_ack_timeout(cm_id_priv->target_ack_delay,
+ cm_id_priv->alt_av.timeout - 1);
+
+ /* todo: handle peer_to_peer */
+
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
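+ /*
+ * Deliver the event directly only when no other event is currently being
+ * dispatched for this id; otherwise queue it on work_list for the active
+ * dispatcher to process in order.
+ */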
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+
+error:
+ cm_deref_id(cm_id_priv);
+ return ret;
+}
+
+static int cm_establish_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ int ret;
+
+ /* See comment in cm_establish about lookup. */
+ cm_id_priv = cm_acquire_id(work->local_id, work->remote_id);
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state != IB_CM_ESTABLISHED) {
+ spin_unlock_irq(&cm_id_priv->lock);
+ goto out;
+ }
+
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static int cm_rtu_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_rtu_msg *rtu_msg;
+ int ret;
+
+ rtu_msg = (struct cm_rtu_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(rtu_msg->remote_comm_id,
+ rtu_msg->local_comm_id);
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ work->cm_event.private_data = &rtu_msg->private_data;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state != IB_CM_REP_SENT &&
+ cm_id_priv->id.state != IB_CM_MRA_REP_RCVD) {
+ spin_unlock_irq(&cm_id_priv->lock);
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_RTU_COUNTER]);
+ goto out;
+ }
+ cm_id_priv->id.state = IB_CM_ESTABLISHED;
+
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static void cm_format_dreq(struct cm_dreq_msg *dreq_msg,
+ struct cm_id_private *cm_id_priv,
+ const void *private_data,
+ u8 private_data_len)
+{
+ cm_format_mad_hdr(&dreq_msg->hdr, CM_DREQ_ATTR_ID,
+ cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_DREQ));
+ dreq_msg->local_comm_id = cm_id_priv->id.local_id;
+ dreq_msg->remote_comm_id = cm_id_priv->id.remote_id;
+ cm_dreq_set_remote_qpn(dreq_msg, cm_id_priv->remote_qpn);
+
+ if (private_data && private_data_len)
+ memcpy(dreq_msg->private_data, private_data, private_data_len);
+}
+
+int ib_send_cm_dreq(struct ib_cm_id *cm_id,
+ const void *private_data,
+ u8 private_data_len)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ unsigned long flags;
+ int ret;
+
+ if (private_data && private_data_len > IB_CM_DREQ_PRIVATE_DATA_SIZE)
+ return -EINVAL;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state != IB_CM_ESTABLISHED) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (cm_id->lap_state == IB_CM_LAP_SENT ||
+ cm_id->lap_state == IB_CM_MRA_LAP_RCVD)
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (ret) {
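+ /* Unable to send a DREQ; tear the connection down locally. */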
+ cm_enter_timewait(cm_id_priv);
+ goto out;
+ }
+
+ cm_format_dreq((struct cm_dreq_msg *) msg->mad, cm_id_priv,
+ private_data, private_data_len);
+ msg->timeout_ms = cm_id_priv->timeout_ms;
+ msg->context[1] = (void *) (unsigned long) IB_CM_DREQ_SENT;
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ cm_enter_timewait(cm_id_priv);
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_free_msg(msg);
+ return ret;
+ }
+
+ cm_id->state = IB_CM_DREQ_SENT;
+ cm_id_priv->msg = msg;
+out: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_dreq);
+
+static void cm_format_drep(struct cm_drep_msg *drep_msg,
+ struct cm_id_private *cm_id_priv,
+ const void *private_data,
+ u8 private_data_len)
+{
+ cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, cm_id_priv->tid);
+ drep_msg->local_comm_id = cm_id_priv->id.local_id;
+ drep_msg->remote_comm_id = cm_id_priv->id.remote_id;
+
+ if (private_data && private_data_len)
+ memcpy(drep_msg->private_data, private_data, private_data_len);
+}
+
+int ib_send_cm_drep(struct ib_cm_id *cm_id,
+ const void *private_data,
+ u8 private_data_len)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ unsigned long flags;
+ void *data;
+ int ret;
+
+ if (private_data && private_data_len > IB_CM_DREP_PRIVATE_DATA_SIZE)
+ return -EINVAL;
+
+ data = cm_copy_private_data(private_data, private_data_len);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state != IB_CM_DREQ_RCVD) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ kfree(data);
+ return -EINVAL;
+ }
+
+ cm_set_private_data(cm_id_priv, data, private_data_len);
+ cm_enter_timewait(cm_id_priv);
+
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (ret)
+ goto out;
+
+ cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv,
+ private_data, private_data_len);
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_free_msg(msg);
+ return ret;
+ }
+
+out: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_drep);
+
+static int cm_issue_drep(struct cm_port *port,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_mad_send_buf *msg = NULL;
+ struct cm_dreq_msg *dreq_msg;
+ struct cm_drep_msg *drep_msg;
+ int ret;
+
+ ret = cm_alloc_response_msg(port, mad_recv_wc, &msg);
+ if (ret)
+ return ret;
+
+ dreq_msg = (struct cm_dreq_msg *) mad_recv_wc->recv_buf.mad;
+ drep_msg = (struct cm_drep_msg *) msg->mad;
+
+ cm_format_mad_hdr(&drep_msg->hdr, CM_DREP_ATTR_ID, dreq_msg->hdr.tid);
+ drep_msg->remote_comm_id = dreq_msg->local_comm_id;
+ drep_msg->local_comm_id = dreq_msg->remote_comm_id;
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ cm_free_msg(msg);
+
+ return ret;
+}
+
+static int cm_dreq_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_dreq_msg *dreq_msg;
+ struct ib_mad_send_buf *msg = NULL;
+ int ret;
+
+ dreq_msg = (struct cm_dreq_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(dreq_msg->remote_comm_id,
+ dreq_msg->local_comm_id);
+ if (!cm_id_priv) {
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_DREQ_COUNTER]);
+ cm_issue_drep(work->port, work->mad_recv_wc);
+ return -EINVAL;
+ }
+
+ work->cm_event.private_data = &dreq_msg->private_data;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->local_qpn != cm_dreq_get_remote_qpn(dreq_msg))
+ goto unlock;
+
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REP_SENT:
+ case IB_CM_DREQ_SENT:
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ break;
+ case IB_CM_ESTABLISHED:
+ if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT ||
+ cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD)
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ break;
+ case IB_CM_MRA_REP_RCVD:
+ break;
+ case IB_CM_TIMEWAIT:
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_DREQ_COUNTER]);
+ if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg))
+ goto unlock;
+
+ cm_format_drep((struct cm_drep_msg *) msg->mad, cm_id_priv,
+ cm_id_priv->private_data,
+ cm_id_priv->private_data_len);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ib_post_send_mad(msg, NULL))
+ cm_free_msg(msg);
+ goto deref;
+ case IB_CM_DREQ_RCVD:
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_DREQ_COUNTER]);
+ goto unlock;
+ default:
+ goto unlock;
+ }
+ cm_id_priv->id.state = IB_CM_DREQ_RCVD;
+ cm_id_priv->tid = dreq_msg->hdr.tid;
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+
+unlock: spin_unlock_irq(&cm_id_priv->lock);
+deref: cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static int cm_drep_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_drep_msg *drep_msg;
+ int ret;
+
+ drep_msg = (struct cm_drep_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(drep_msg->remote_comm_id,
+ drep_msg->local_comm_id);
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ work->cm_event.private_data = &drep_msg->private_data;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state != IB_CM_DREQ_SENT &&
+ cm_id_priv->id.state != IB_CM_DREQ_RCVD) {
+ spin_unlock_irq(&cm_id_priv->lock);
+ goto out;
+ }
+ cm_enter_timewait(cm_id_priv);
+
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+int ib_send_cm_rej(struct ib_cm_id *cm_id,
+ enum ib_cm_rej_reason reason,
+ void *ari,
+ u8 ari_length,
+ const void *private_data,
+ u8 private_data_len)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ unsigned long flags;
+ int ret;
+
+ if ((private_data && private_data_len > IB_CM_REJ_PRIVATE_DATA_SIZE) ||
+ (ari && ari_length > IB_CM_REJ_ARI_LENGTH))
+ return -EINVAL;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id->state) {
+ case IB_CM_REQ_SENT:
+ case IB_CM_MRA_REQ_RCVD:
+ case IB_CM_REQ_RCVD:
+ case IB_CM_MRA_REQ_SENT:
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (!ret)
+ cm_format_rej((struct cm_rej_msg *) msg->mad,
+ cm_id_priv, reason, ari, ari_length,
+ private_data, private_data_len);
+
+ cm_reset_to_idle(cm_id_priv);
+ break;
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (!ret)
+ cm_format_rej((struct cm_rej_msg *) msg->mad,
+ cm_id_priv, reason, ari, ari_length,
+ private_data, private_data_len);
+
+ cm_enter_timewait(cm_id_priv);
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (ret)
+ goto out;
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ cm_free_msg(msg);
+
+out: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_rej);
+
+static void cm_format_rej_event(struct cm_work *work)
+{
+ struct cm_rej_msg *rej_msg;
+ struct ib_cm_rej_event_param *param;
+
+ rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad;
+ param = &work->cm_event.param.rej_rcvd;
+ param->ari = rej_msg->ari;
+ param->ari_length = cm_rej_get_reject_info_len(rej_msg);
+ param->reason = __be16_to_cpu(rej_msg->reason);
+ work->cm_event.private_data = &rej_msg->private_data;
+}
+
+static struct cm_id_private * cm_acquire_rejected_id(struct cm_rej_msg *rej_msg)
+{
+ struct cm_timewait_info *timewait_info;
+ struct cm_id_private *cm_id_priv;
+ __be32 remote_id;
+
+ remote_id = rej_msg->local_comm_id;
+
+ if (__be16_to_cpu(rej_msg->reason) == IB_CM_REJ_TIMEOUT) {
+ spin_lock_irq(&cm.lock);
+ timewait_info = cm_find_remote_id( *((__be64 *) rej_msg->ari),
+ remote_id);
+ if (!timewait_info) {
+ spin_unlock_irq(&cm.lock);
+ return NULL;
+ }
+ cm_id_priv = idr_find(&cm.local_id_table, (__force int)
+ (timewait_info->work.local_id ^
+ cm.random_id_operand));
+ if (cm_id_priv) {
+ if (cm_id_priv->id.remote_id == remote_id)
+ atomic_inc(&cm_id_priv->refcount);
+ else
+ cm_id_priv = NULL;
+ }
+ spin_unlock_irq(&cm.lock);
+ } else if (cm_rej_get_msg_rejected(rej_msg) == CM_MSG_RESPONSE_REQ)
+ cm_id_priv = cm_acquire_id(rej_msg->remote_comm_id, 0);
+ else
+ cm_id_priv = cm_acquire_id(rej_msg->remote_comm_id, remote_id);
+
+ return cm_id_priv;
+}
+
+static int cm_rej_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_rej_msg *rej_msg;
+ int ret;
+
+ rej_msg = (struct cm_rej_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_rejected_id(rej_msg);
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ cm_format_rej_event(work);
+
+ spin_lock_irq(&cm_id_priv->lock);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REQ_SENT:
+ case IB_CM_MRA_REQ_RCVD:
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ /* fall through */
+ case IB_CM_REQ_RCVD:
+ case IB_CM_MRA_REQ_SENT:
+ if (__be16_to_cpu(rej_msg->reason) == IB_CM_REJ_STALE_CONN)
+ cm_enter_timewait(cm_id_priv);
+ else
+ cm_reset_to_idle(cm_id_priv);
+ break;
+ case IB_CM_DREQ_SENT:
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ /* fall through */
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ cm_enter_timewait(cm_id_priv);
+ break;
+ case IB_CM_ESTABLISHED:
+ if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT ||
+ cm_id_priv->id.lap_state == IB_CM_LAP_SENT) {
+ if (cm_id_priv->id.lap_state == IB_CM_LAP_SENT)
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent,
+ cm_id_priv->msg);
+ cm_enter_timewait(cm_id_priv);
+ break;
+ }
+ /* fall through */
+ default:
+ spin_unlock_irq(&cm_id_priv->lock);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+int ib_send_cm_mra(struct ib_cm_id *cm_id,
+ u8 service_timeout,
+ const void *private_data,
+ u8 private_data_len)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ enum ib_cm_state cm_state;
+ enum ib_cm_lap_state lap_state;
+ enum cm_msg_response msg_response;
+ void *data;
+ unsigned long flags;
+ int ret;
+
+ if (private_data && private_data_len > IB_CM_MRA_PRIVATE_DATA_SIZE)
+ return -EINVAL;
+
+ data = cm_copy_private_data(private_data, private_data_len);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch(cm_id_priv->id.state) {
+ case IB_CM_REQ_RCVD:
+ cm_state = IB_CM_MRA_REQ_SENT;
+ lap_state = cm_id->lap_state;
+ msg_response = CM_MSG_RESPONSE_REQ;
+ break;
+ case IB_CM_REP_RCVD:
+ cm_state = IB_CM_MRA_REP_SENT;
+ lap_state = cm_id->lap_state;
+ msg_response = CM_MSG_RESPONSE_REP;
+ break;
+ case IB_CM_ESTABLISHED:
+ if (cm_id->lap_state == IB_CM_LAP_RCVD) {
+ cm_state = cm_id->state;
+ lap_state = IB_CM_MRA_LAP_SENT;
+ msg_response = CM_MSG_RESPONSE_OTHER;
+ break;
+ }
+ default:
+ ret = -EINVAL;
+ goto error1;
+ }
+
+ if (!(service_timeout & IB_CM_MRA_FLAG_DELAY)) {
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (ret)
+ goto error1;
+
+ cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+ msg_response, service_timeout,
+ private_data, private_data_len);
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ goto error2;
+ }
+
+ cm_id->state = cm_state;
+ cm_id->lap_state = lap_state;
+ cm_id_priv->service_timeout = service_timeout;
+ cm_set_private_data(cm_id_priv, data, private_data_len);
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return 0;
+
+error1: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ kfree(data);
+ return ret;
+
+error2: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ kfree(data);
+ cm_free_msg(msg);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_mra);
+
+static struct cm_id_private * cm_acquire_mraed_id(struct cm_mra_msg *mra_msg)
+{
+ switch (cm_mra_get_msg_mraed(mra_msg)) {
+ case CM_MSG_RESPONSE_REQ:
+ return cm_acquire_id(mra_msg->remote_comm_id, 0);
+ case CM_MSG_RESPONSE_REP:
+ case CM_MSG_RESPONSE_OTHER:
+ return cm_acquire_id(mra_msg->remote_comm_id,
+ mra_msg->local_comm_id);
+ default:
+ return NULL;
+ }
+}
+
+static int cm_mra_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_mra_msg *mra_msg;
+ int timeout, ret;
+
+ mra_msg = (struct cm_mra_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_mraed_id(mra_msg);
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ work->cm_event.private_data = &mra_msg->private_data;
+ work->cm_event.param.mra_rcvd.service_timeout =
+ cm_mra_get_service_timeout(mra_msg);
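+ /*
+ * Recompute the outstanding MAD's timeout from the peer's advertised
+ * service timeout plus the path's ack timeout; ib_modify_mad() applies
+ * it below.
+ */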
+ timeout = cm_convert_to_ms(cm_mra_get_service_timeout(mra_msg)) +
+ cm_convert_to_ms(cm_id_priv->av.timeout);
+
+ spin_lock_irq(&cm_id_priv->lock);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REQ_SENT:
+ if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_REQ ||
+ ib_modify_mad(cm_id_priv->av.port->mad_agent,
+ cm_id_priv->msg, timeout))
+ goto out;
+ cm_id_priv->id.state = IB_CM_MRA_REQ_RCVD;
+ break;
+ case IB_CM_REP_SENT:
+ if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_REP ||
+ ib_modify_mad(cm_id_priv->av.port->mad_agent,
+ cm_id_priv->msg, timeout))
+ goto out;
+ cm_id_priv->id.state = IB_CM_MRA_REP_RCVD;
+ break;
+ case IB_CM_ESTABLISHED:
+ if (cm_mra_get_msg_mraed(mra_msg) != CM_MSG_RESPONSE_OTHER ||
+ cm_id_priv->id.lap_state != IB_CM_LAP_SENT ||
+ ib_modify_mad(cm_id_priv->av.port->mad_agent,
+ cm_id_priv->msg, timeout)) {
+ if (cm_id_priv->id.lap_state == IB_CM_MRA_LAP_RCVD)
+ atomic_long_inc(&work->port->
+ counter_group[CM_RECV_DUPLICATES].
+ counter[CM_MRA_COUNTER]);
+ goto out;
+ }
+ cm_id_priv->id.lap_state = IB_CM_MRA_LAP_RCVD;
+ break;
+ case IB_CM_MRA_REQ_RCVD:
+ case IB_CM_MRA_REP_RCVD:
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_MRA_COUNTER]);
+ /* fall through */
+ default:
+ goto out;
+ }
+
+ cm_id_priv->msg->context[1] = (void *) (unsigned long)
+ cm_id_priv->id.state;
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+out:
+ spin_unlock_irq(&cm_id_priv->lock);
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static void cm_format_lap(struct cm_lap_msg *lap_msg,
+ struct cm_id_private *cm_id_priv,
+ struct ib_sa_path_rec *alternate_path,
+ const void *private_data,
+ u8 private_data_len)
+{
+ cm_format_mad_hdr(&lap_msg->hdr, CM_LAP_ATTR_ID,
+ cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_LAP));
+ lap_msg->local_comm_id = cm_id_priv->id.local_id;
+ lap_msg->remote_comm_id = cm_id_priv->id.remote_id;
+ cm_lap_set_remote_qpn(lap_msg, cm_id_priv->remote_qpn);
+ /* todo: need remote CM response timeout */
+ cm_lap_set_remote_resp_timeout(lap_msg, 0x1F);
+ lap_msg->alt_local_lid = alternate_path->slid;
+ lap_msg->alt_remote_lid = alternate_path->dlid;
+ lap_msg->alt_local_gid = alternate_path->sgid;
+ lap_msg->alt_remote_gid = alternate_path->dgid;
+ cm_lap_set_flow_label(lap_msg, alternate_path->flow_label);
+ cm_lap_set_traffic_class(lap_msg, alternate_path->traffic_class);
+ lap_msg->alt_hop_limit = alternate_path->hop_limit;
+ cm_lap_set_packet_rate(lap_msg, alternate_path->rate);
+ cm_lap_set_sl(lap_msg, alternate_path->sl);
+ cm_lap_set_subnet_local(lap_msg, 1); /* local only... */
+ cm_lap_set_local_ack_timeout(lap_msg,
+ cm_ack_timeout(cm_id_priv->av.port->cm_dev->ack_delay,
+ alternate_path->packet_life_time));
+
+ if (private_data && private_data_len)
+ memcpy(lap_msg->private_data, private_data, private_data_len);
+}
+
+int ib_send_cm_lap(struct ib_cm_id *cm_id,
+ struct ib_sa_path_rec *alternate_path,
+ const void *private_data,
+ u8 private_data_len)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ unsigned long flags;
+ int ret;
+
+ if (private_data && private_data_len > IB_CM_LAP_PRIVATE_DATA_SIZE)
+ return -EINVAL;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state != IB_CM_ESTABLISHED ||
+ (cm_id->lap_state != IB_CM_LAP_UNINIT &&
+ cm_id->lap_state != IB_CM_LAP_IDLE)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = cm_init_av_by_path(alternate_path, &cm_id_priv->alt_av);
+ if (ret)
+ goto out;
+ cm_id_priv->alt_av.timeout =
+ cm_ack_timeout(cm_id_priv->target_ack_delay,
+ cm_id_priv->alt_av.timeout - 1);
+
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (ret)
+ goto out;
+
+ cm_format_lap((struct cm_lap_msg *) msg->mad, cm_id_priv,
+ alternate_path, private_data, private_data_len);
+ msg->timeout_ms = cm_id_priv->timeout_ms;
+ msg->context[1] = (void *) (unsigned long) IB_CM_ESTABLISHED;
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_free_msg(msg);
+ return ret;
+ }
+
+ cm_id->lap_state = IB_CM_LAP_SENT;
+ cm_id_priv->msg = msg;
+
+out: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_lap);
+
+static void cm_format_path_from_lap(struct cm_id_private *cm_id_priv,
+ struct ib_sa_path_rec *path,
+ struct cm_lap_msg *lap_msg)
+{
+ memset(path, 0, sizeof *path);
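+ /*
+ * The LAP describes the path from the sender's point of view, so its
+ * "local" fields become our remote (destination) side and vice versa.
+ */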
+ path->dgid = lap_msg->alt_local_gid;
+ path->sgid = lap_msg->alt_remote_gid;
+ path->dlid = lap_msg->alt_local_lid;
+ path->slid = lap_msg->alt_remote_lid;
+ path->flow_label = cm_lap_get_flow_label(lap_msg);
+ path->hop_limit = lap_msg->alt_hop_limit;
+ path->traffic_class = cm_lap_get_traffic_class(lap_msg);
+ path->reversible = 1;
+ path->pkey = cm_id_priv->pkey;
+ path->sl = cm_lap_get_sl(lap_msg);
+ path->mtu_selector = IB_SA_EQ;
+ path->mtu = cm_id_priv->path_mtu;
+ path->rate_selector = IB_SA_EQ;
+ path->rate = cm_lap_get_packet_rate(lap_msg);
+ path->packet_life_time_selector = IB_SA_EQ;
+ path->packet_life_time = cm_lap_get_local_ack_timeout(lap_msg);
+ path->packet_life_time -= (path->packet_life_time > 0);
+}
+
+static int cm_lap_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_lap_msg *lap_msg;
+ struct ib_cm_lap_event_param *param;
+ struct ib_mad_send_buf *msg = NULL;
+ int ret;
+
+ /* todo: verify LAP request and send reject APR if invalid. */
+ lap_msg = (struct cm_lap_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(lap_msg->remote_comm_id,
+ lap_msg->local_comm_id);
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ param = &work->cm_event.param.lap_rcvd;
+ param->alternate_path = &work->path[0];
+ cm_format_path_from_lap(cm_id_priv, param->alternate_path, lap_msg);
+ work->cm_event.private_data = &lap_msg->private_data;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state != IB_CM_ESTABLISHED)
+ goto unlock;
+
+ switch (cm_id_priv->id.lap_state) {
+ case IB_CM_LAP_UNINIT:
+ case IB_CM_LAP_IDLE:
+ break;
+ case IB_CM_MRA_LAP_SENT:
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_LAP_COUNTER]);
+ if (cm_alloc_response_msg(work->port, work->mad_recv_wc, &msg))
+ goto unlock;
+
+ cm_format_mra((struct cm_mra_msg *) msg->mad, cm_id_priv,
+ CM_MSG_RESPONSE_OTHER,
+ cm_id_priv->service_timeout,
+ cm_id_priv->private_data,
+ cm_id_priv->private_data_len);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ib_post_send_mad(msg, NULL))
+ cm_free_msg(msg);
+ goto deref;
+ case IB_CM_LAP_RCVD:
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_LAP_COUNTER]);
+ goto unlock;
+ default:
+ goto unlock;
+ }
+
+ cm_id_priv->id.lap_state = IB_CM_LAP_RCVD;
+ cm_id_priv->tid = lap_msg->hdr.tid;
+ cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
+ work->mad_recv_wc->recv_buf.grh,
+ &cm_id_priv->av);
+ cm_init_av_by_path(param->alternate_path, &cm_id_priv->alt_av);
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+
+unlock: spin_unlock_irq(&cm_id_priv->lock);
+deref: cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static void cm_format_apr(struct cm_apr_msg *apr_msg,
+ struct cm_id_private *cm_id_priv,
+ enum ib_cm_apr_status status,
+ void *info,
+ u8 info_length,
+ const void *private_data,
+ u8 private_data_len)
+{
+ cm_format_mad_hdr(&apr_msg->hdr, CM_APR_ATTR_ID, cm_id_priv->tid);
+ apr_msg->local_comm_id = cm_id_priv->id.local_id;
+ apr_msg->remote_comm_id = cm_id_priv->id.remote_id;
+ apr_msg->ap_status = (u8) status;
+
+ if (info && info_length) {
+ apr_msg->info_length = info_length;
+ memcpy(apr_msg->info, info, info_length);
+ }
+
+ if (private_data && private_data_len)
+ memcpy(apr_msg->private_data, private_data, private_data_len);
+}
+
+int ib_send_cm_apr(struct ib_cm_id *cm_id,
+ enum ib_cm_apr_status status,
+ void *info,
+ u8 info_length,
+ const void *private_data,
+ u8 private_data_len)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ unsigned long flags;
+ int ret;
+
+ if ((private_data && private_data_len > IB_CM_APR_PRIVATE_DATA_SIZE) ||
+ (info && info_length > IB_CM_APR_INFO_LENGTH))
+ return -EINVAL;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state != IB_CM_ESTABLISHED ||
+ (cm_id->lap_state != IB_CM_LAP_RCVD &&
+ cm_id->lap_state != IB_CM_MRA_LAP_SENT)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (ret)
+ goto out;
+
+ cm_format_apr((struct cm_apr_msg *) msg->mad, cm_id_priv, status,
+ info, info_length, private_data, private_data_len);
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_free_msg(msg);
+ return ret;
+ }
+
+ cm_id->lap_state = IB_CM_LAP_IDLE;
+out: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_apr);
+
+static int cm_apr_handler(struct cm_work *work)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_apr_msg *apr_msg;
+ int ret;
+
+ apr_msg = (struct cm_apr_msg *)work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(apr_msg->remote_comm_id,
+ apr_msg->local_comm_id);
+ if (!cm_id_priv)
+ return -EINVAL; /* Unmatched reply. */
+
+ work->cm_event.param.apr_rcvd.ap_status = apr_msg->ap_status;
+ work->cm_event.param.apr_rcvd.apr_info = &apr_msg->info;
+ work->cm_event.param.apr_rcvd.info_len = apr_msg->info_length;
+ work->cm_event.private_data = &apr_msg->private_data;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state != IB_CM_ESTABLISHED ||
+ (cm_id_priv->id.lap_state != IB_CM_LAP_SENT &&
+ cm_id_priv->id.lap_state != IB_CM_MRA_LAP_RCVD)) {
+ spin_unlock_irq(&cm_id_priv->lock);
+ goto out;
+ }
+ cm_id_priv->id.lap_state = IB_CM_LAP_IDLE;
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ cm_id_priv->msg = NULL;
+
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static int cm_timewait_handler(struct cm_work *work)
+{
+ struct cm_timewait_info *timewait_info;
+ struct cm_id_private *cm_id_priv;
+ int ret;
+
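+ /* struct cm_timewait_info embeds its cm_work as the first member,
+ * which makes this cast valid. */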
+ timewait_info = (struct cm_timewait_info *)work;
+ spin_lock_irq(&cm.lock);
+ list_del(&timewait_info->list);
+ spin_unlock_irq(&cm.lock);
+
+ cm_id_priv = cm_acquire_id(timewait_info->work.local_id,
+ timewait_info->work.remote_id);
+ if (!cm_id_priv)
+ return -EINVAL;
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state != IB_CM_TIMEWAIT ||
+ cm_id_priv->remote_qpn != timewait_info->remote_qpn) {
+ spin_unlock_irq(&cm_id_priv->lock);
+ goto out;
+ }
+ cm_id_priv->id.state = IB_CM_IDLE;
+ ret = atomic_inc_and_test(&cm_id_priv->work_count);
+ if (!ret)
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ if (ret)
+ cm_process_work(cm_id_priv, work);
+ else
+ cm_deref_id(cm_id_priv);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static void cm_format_sidr_req(struct cm_sidr_req_msg *sidr_req_msg,
+ struct cm_id_private *cm_id_priv,
+ struct ib_cm_sidr_req_param *param)
+{
+ cm_format_mad_hdr(&sidr_req_msg->hdr, CM_SIDR_REQ_ATTR_ID,
+ cm_form_tid(cm_id_priv, CM_MSG_SEQUENCE_SIDR));
+ sidr_req_msg->request_id = cm_id_priv->id.local_id;
+ sidr_req_msg->pkey = param->path->pkey;
+ sidr_req_msg->service_id = param->service_id;
+
+ if (param->private_data && param->private_data_len)
+ memcpy(sidr_req_msg->private_data, param->private_data,
+ param->private_data_len);
+}
+
+int ib_send_cm_sidr_req(struct ib_cm_id *cm_id,
+ struct ib_cm_sidr_req_param *param)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ unsigned long flags;
+ int ret;
+
+ if (!param->path || (param->private_data &&
+ param->private_data_len > IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE))
+ return -EINVAL;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ ret = cm_init_av_by_path(param->path, &cm_id_priv->av);
+ if (ret)
+ goto out;
+
+ cm_id->service_id = param->service_id;
+ cm_id->service_mask = ~cpu_to_be64(0);
+ cm_id_priv->timeout_ms = param->timeout_ms;
+ cm_id_priv->max_cm_retries = param->max_cm_retries;
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (ret)
+ goto out;
+
+ cm_format_sidr_req((struct cm_sidr_req_msg *) msg->mad, cm_id_priv,
+ param);
+ msg->timeout_ms = cm_id_priv->timeout_ms;
+ msg->context[1] = (void *) (unsigned long) IB_CM_SIDR_REQ_SENT;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state == IB_CM_IDLE)
+ ret = ib_post_send_mad(msg, NULL);
+ else
+ ret = -EINVAL;
+
+ if (ret) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_free_msg(msg);
+ goto out;
+ }
+ cm_id->state = IB_CM_SIDR_REQ_SENT;
+ cm_id_priv->msg = msg;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+out:
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_sidr_req);
+
+static void cm_format_sidr_req_event(struct cm_work *work,
+ struct ib_cm_id *listen_id)
+{
+ struct cm_sidr_req_msg *sidr_req_msg;
+ struct ib_cm_sidr_req_event_param *param;
+
+ sidr_req_msg = (struct cm_sidr_req_msg *)
+ work->mad_recv_wc->recv_buf.mad;
+ param = &work->cm_event.param.sidr_req_rcvd;
+ param->pkey = __be16_to_cpu(sidr_req_msg->pkey);
+ param->listen_id = listen_id;
+ param->port = work->port->port_num;
+ work->cm_event.private_data = &sidr_req_msg->private_data;
+}
+
+static int cm_sidr_req_handler(struct cm_work *work)
+{
+ struct ib_cm_id *cm_id;
+ struct cm_id_private *cm_id_priv, *cur_cm_id_priv;
+ struct cm_sidr_req_msg *sidr_req_msg;
+ struct ib_wc *wc;
+
+ cm_id = ib_create_cm_id(work->port->cm_dev->ib_device, NULL, NULL);
+ if (IS_ERR(cm_id))
+ return PTR_ERR(cm_id);
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+
+ /* Record SGID/SLID and request ID for lookup. */
+ sidr_req_msg = (struct cm_sidr_req_msg *)
+ work->mad_recv_wc->recv_buf.mad;
+ wc = work->mad_recv_wc->wc;
+ cm_id_priv->av.dgid.global.subnet_prefix = cpu_to_be64(wc->slid);
+ cm_id_priv->av.dgid.global.interface_id = 0;
+ cm_init_av_for_response(work->port, work->mad_recv_wc->wc,
+ work->mad_recv_wc->recv_buf.grh,
+ &cm_id_priv->av);
+ cm_id_priv->id.remote_id = sidr_req_msg->request_id;
+ cm_id_priv->tid = sidr_req_msg->hdr.tid;
+ atomic_inc(&cm_id_priv->work_count);
+
+ spin_lock_irq(&cm.lock);
+ cur_cm_id_priv = cm_insert_remote_sidr(cm_id_priv);
+ if (cur_cm_id_priv) {
+ spin_unlock_irq(&cm.lock);
+ atomic_long_inc(&work->port->counter_group[CM_RECV_DUPLICATES].
+ counter[CM_SIDR_REQ_COUNTER]);
+ goto out; /* Duplicate message. */
+ }
+ cm_id_priv->id.state = IB_CM_SIDR_REQ_RCVD;
+ cur_cm_id_priv = cm_find_listen(cm_id->device,
+ sidr_req_msg->service_id,
+ sidr_req_msg->private_data);
+ if (!cur_cm_id_priv) {
+ spin_unlock_irq(&cm.lock);
+ cm_reject_sidr_req(cm_id_priv, IB_SIDR_UNSUPPORTED);
+ goto out; /* No match. */
+ }
+ atomic_inc(&cur_cm_id_priv->refcount);
+ atomic_inc(&cm_id_priv->refcount);
+ spin_unlock_irq(&cm.lock);
+
+ cm_id_priv->id.cm_handler = cur_cm_id_priv->id.cm_handler;
+ cm_id_priv->id.context = cur_cm_id_priv->id.context;
+ cm_id_priv->id.service_id = sidr_req_msg->service_id;
+ cm_id_priv->id.service_mask = ~cpu_to_be64(0);
+
+ cm_format_sidr_req_event(work, &cur_cm_id_priv->id);
+ cm_process_work(cm_id_priv, work);
+ cm_deref_id(cur_cm_id_priv);
+ return 0;
+out:
+ ib_destroy_cm_id(&cm_id_priv->id);
+ return -EINVAL;
+}
+
+static void cm_format_sidr_rep(struct cm_sidr_rep_msg *sidr_rep_msg,
+ struct cm_id_private *cm_id_priv,
+ struct ib_cm_sidr_rep_param *param)
+{
+ cm_format_mad_hdr(&sidr_rep_msg->hdr, CM_SIDR_REP_ATTR_ID,
+ cm_id_priv->tid);
+ sidr_rep_msg->request_id = cm_id_priv->id.remote_id;
+ sidr_rep_msg->status = param->status;
+ cm_sidr_rep_set_qpn(sidr_rep_msg, cpu_to_be32(param->qp_num));
+ sidr_rep_msg->service_id = cm_id_priv->id.service_id;
+ sidr_rep_msg->qkey = cpu_to_be32(param->qkey);
+
+ if (param->info && param->info_length)
+ memcpy(sidr_rep_msg->info, param->info, param->info_length);
+
+ if (param->private_data && param->private_data_len)
+ memcpy(sidr_rep_msg->private_data, param->private_data,
+ param->private_data_len);
+}
+
+int ib_send_cm_sidr_rep(struct ib_cm_id *cm_id,
+ struct ib_cm_sidr_rep_param *param)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_mad_send_buf *msg;
+ unsigned long flags;
+ int ret;
+
+ if ((param->info && param->info_length > IB_CM_SIDR_REP_INFO_LENGTH) ||
+ (param->private_data &&
+ param->private_data_len > IB_CM_SIDR_REP_PRIVATE_DATA_SIZE))
+ return -EINVAL;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state != IB_CM_SIDR_REQ_RCVD) {
+ ret = -EINVAL;
+ goto error;
+ }
+
+ ret = cm_alloc_msg(cm_id_priv, &msg);
+ if (ret)
+ goto error;
+
+ cm_format_sidr_rep((struct cm_sidr_rep_msg *) msg->mad, cm_id_priv,
+ param);
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_free_msg(msg);
+ return ret;
+ }
+ cm_id->state = IB_CM_IDLE;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ spin_lock_irqsave(&cm.lock, flags);
+ rb_erase(&cm_id_priv->sidr_id_node, &cm.remote_sidr_table);
+ spin_unlock_irqrestore(&cm.lock, flags);
+ return 0;
+
+error: spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(ib_send_cm_sidr_rep);
+
+static void cm_format_sidr_rep_event(struct cm_work *work)
+{
+ struct cm_sidr_rep_msg *sidr_rep_msg;
+ struct ib_cm_sidr_rep_event_param *param;
+
+ sidr_rep_msg = (struct cm_sidr_rep_msg *)
+ work->mad_recv_wc->recv_buf.mad;
+ param = &work->cm_event.param.sidr_rep_rcvd;
+ param->status = sidr_rep_msg->status;
+ param->qkey = be32_to_cpu(sidr_rep_msg->qkey);
+ param->qpn = be32_to_cpu(cm_sidr_rep_get_qpn(sidr_rep_msg));
+ param->info = &sidr_rep_msg->info;
+ param->info_len = sidr_rep_msg->info_length;
+ work->cm_event.private_data = &sidr_rep_msg->private_data;
+}
+
+static int cm_sidr_rep_handler(struct cm_work *work)
+{
+ struct cm_sidr_rep_msg *sidr_rep_msg;
+ struct cm_id_private *cm_id_priv;
+
+ sidr_rep_msg = (struct cm_sidr_rep_msg *)
+ work->mad_recv_wc->recv_buf.mad;
+ cm_id_priv = cm_acquire_id(sidr_rep_msg->request_id, 0);
+ if (!cm_id_priv)
+ return -EINVAL; /* Unmatched reply. */
+
+ spin_lock_irq(&cm_id_priv->lock);
+ if (cm_id_priv->id.state != IB_CM_SIDR_REQ_SENT) {
+ spin_unlock_irq(&cm_id_priv->lock);
+ goto out;
+ }
+ cm_id_priv->id.state = IB_CM_IDLE;
+ ib_cancel_mad(cm_id_priv->av.port->mad_agent, cm_id_priv->msg);
+ spin_unlock_irq(&cm_id_priv->lock);
+
+ cm_format_sidr_rep_event(work);
+ cm_process_work(cm_id_priv, work);
+ return 0;
+out:
+ cm_deref_id(cm_id_priv);
+ return -EINVAL;
+}
+
+static void cm_process_send_error(struct ib_mad_send_buf *msg,
+ enum ib_wc_status wc_status)
+{
+ struct cm_id_private *cm_id_priv;
+ struct ib_cm_event cm_event;
+ enum ib_cm_state state;
+ int ret;
+
+ memset(&cm_event, 0, sizeof cm_event);
+ cm_id_priv = msg->context[0];
+
+ /* Discard old sends or ones without a response. */
+ spin_lock_irq(&cm_id_priv->lock);
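+ /* context[1] records the cm_id state at the time the MAD was posted. */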
+ state = (enum ib_cm_state) (unsigned long) msg->context[1];
+ if (msg != cm_id_priv->msg || state != cm_id_priv->id.state)
+ goto discard;
+
+ switch (state) {
+ case IB_CM_REQ_SENT:
+ case IB_CM_MRA_REQ_RCVD:
+ cm_reset_to_idle(cm_id_priv);
+ cm_event.event = IB_CM_REQ_ERROR;
+ break;
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ cm_reset_to_idle(cm_id_priv);
+ cm_event.event = IB_CM_REP_ERROR;
+ break;
+ case IB_CM_DREQ_SENT:
+ cm_enter_timewait(cm_id_priv);
+ cm_event.event = IB_CM_DREQ_ERROR;
+ break;
+ case IB_CM_SIDR_REQ_SENT:
+ cm_id_priv->id.state = IB_CM_IDLE;
+ cm_event.event = IB_CM_SIDR_REQ_ERROR;
+ break;
+ default:
+ goto discard;
+ }
+ spin_unlock_irq(&cm_id_priv->lock);
+ cm_event.param.send_status = wc_status;
+
+ /* No other events can occur on the cm_id at this point. */
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, &cm_event);
+ cm_free_msg(msg);
+ if (ret)
+ ib_destroy_cm_id(&cm_id_priv->id);
+ return;
+discard:
+ spin_unlock_irq(&cm_id_priv->lock);
+ cm_free_msg(msg);
+}
+
+static void cm_send_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ struct ib_mad_send_buf *msg = mad_send_wc->send_buf;
+ struct cm_port *port;
+ u16 attr_index;
+
+ port = mad_agent->context;
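+ /* Map the CM attribute ID onto its per-message counter slot. */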
+ attr_index = be16_to_cpu(((struct ib_mad_hdr *)
+ msg->mad)->attr_id) - CM_ATTR_ID_OFFSET;
+
+ /*
+ * If the send was in response to a received message (context[0] is not
+ * set to a cm_id), and is not a REJ, then it is a send that was
+ * manually retried.
+ */
+ if (!msg->context[0] && (attr_index != CM_REJ_COUNTER))
+ msg->retries = 1;
+
+ atomic_long_add(1 + msg->retries,
+ &port->counter_group[CM_XMIT].counter[attr_index]);
+ if (msg->retries)
+ atomic_long_add(msg->retries,
+ &port->counter_group[CM_XMIT_RETRIES].
+ counter[attr_index]);
+
+ switch (mad_send_wc->status) {
+ case IB_WC_SUCCESS:
+ case IB_WC_WR_FLUSH_ERR:
+ cm_free_msg(msg);
+ break;
+ default:
+ if (msg->context[0] && msg->context[1])
+ cm_process_send_error(msg, mad_send_wc->status);
+ else
+ cm_free_msg(msg);
+ break;
+ }
+}
+
+static void cm_work_handler(struct work_struct *_work)
+{
+ struct cm_work *work = container_of(_work, struct cm_work, work.work);
+ int ret;
+
+ switch (work->cm_event.event) {
+ case IB_CM_REQ_RECEIVED:
+ ret = cm_req_handler(work);
+ break;
+ case IB_CM_MRA_RECEIVED:
+ ret = cm_mra_handler(work);
+ break;
+ case IB_CM_REJ_RECEIVED:
+ ret = cm_rej_handler(work);
+ break;
+ case IB_CM_REP_RECEIVED:
+ ret = cm_rep_handler(work);
+ break;
+ case IB_CM_RTU_RECEIVED:
+ ret = cm_rtu_handler(work);
+ break;
+ case IB_CM_USER_ESTABLISHED:
+ ret = cm_establish_handler(work);
+ break;
+ case IB_CM_DREQ_RECEIVED:
+ ret = cm_dreq_handler(work);
+ break;
+ case IB_CM_DREP_RECEIVED:
+ ret = cm_drep_handler(work);
+ break;
+ case IB_CM_SIDR_REQ_RECEIVED:
+ ret = cm_sidr_req_handler(work);
+ break;
+ case IB_CM_SIDR_REP_RECEIVED:
+ ret = cm_sidr_rep_handler(work);
+ break;
+ case IB_CM_LAP_RECEIVED:
+ ret = cm_lap_handler(work);
+ break;
+ case IB_CM_APR_RECEIVED:
+ ret = cm_apr_handler(work);
+ break;
+ case IB_CM_TIMEWAIT_EXIT:
+ ret = cm_timewait_handler(work);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ if (ret)
+ cm_free_work(work);
+}
+
+static int cm_establish(struct ib_cm_id *cm_id)
+{
+ struct cm_id_private *cm_id_priv;
+ struct cm_work *work;
+ unsigned long flags;
+ int ret = 0;
+
+ work = kmalloc(sizeof *work, GFP_ATOMIC);
+ if (!work)
+ return -ENOMEM;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id->state)
+ {
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ cm_id->state = IB_CM_ESTABLISHED;
+ break;
+ case IB_CM_ESTABLISHED:
+ ret = -EISCONN;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ if (ret) {
+ kfree(work);
+ goto out;
+ }
+
+ /*
+ * The CM worker thread may try to destroy the cm_id before it
+ * can execute this work item. To prevent potential deadlock,
+ * we need to find the cm_id once we're in the context of the
+ * worker thread, rather than holding a reference on it.
+ */
+ INIT_DELAYED_WORK(&work->work, cm_work_handler);
+ work->local_id = cm_id->local_id;
+ work->remote_id = cm_id->remote_id;
+ work->mad_recv_wc = NULL;
+ work->cm_event.event = IB_CM_USER_ESTABLISHED;
+ queue_delayed_work(cm.wq, &work->work, 0);
+out:
+ return ret;
+}
+
+static int cm_migrate(struct ib_cm_id *cm_id)
+{
+ struct cm_id_private *cm_id_priv;
+ unsigned long flags;
+ int ret = 0;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id->state == IB_CM_ESTABLISHED &&
+ (cm_id->lap_state == IB_CM_LAP_UNINIT ||
+ cm_id->lap_state == IB_CM_LAP_IDLE)) {
+ cm_id->lap_state = IB_CM_LAP_IDLE;
+ cm_id_priv->av = cm_id_priv->alt_av;
+ } else
+ ret = -EINVAL;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ return ret;
+}
+
+int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event)
+{
+ int ret;
+
+ switch (event) {
+ case IB_EVENT_COMM_EST:
+ ret = cm_establish(cm_id);
+ break;
+ case IB_EVENT_PATH_MIG:
+ ret = cm_migrate(cm_id);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(ib_cm_notify);
+
+static void cm_recv_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct cm_port *port = mad_agent->context;
+ struct cm_work *work;
+ enum ib_cm_event_type event;
+ u16 attr_id;
+ int paths = 0;
+
+ switch (mad_recv_wc->recv_buf.mad->mad_hdr.attr_id) {
+ case CM_REQ_ATTR_ID:
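+ /*
+ * A REQ carries a primary path, plus an alternate path when its
+ * alternate local LID is non-zero.
+ */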
+ paths = 1 + (((struct cm_req_msg *) mad_recv_wc->recv_buf.mad)->
+ alt_local_lid != 0);
+ event = IB_CM_REQ_RECEIVED;
+ break;
+ case CM_MRA_ATTR_ID:
+ event = IB_CM_MRA_RECEIVED;
+ break;
+ case CM_REJ_ATTR_ID:
+ event = IB_CM_REJ_RECEIVED;
+ break;
+ case CM_REP_ATTR_ID:
+ event = IB_CM_REP_RECEIVED;
+ break;
+ case CM_RTU_ATTR_ID:
+ event = IB_CM_RTU_RECEIVED;
+ break;
+ case CM_DREQ_ATTR_ID:
+ event = IB_CM_DREQ_RECEIVED;
+ break;
+ case CM_DREP_ATTR_ID:
+ event = IB_CM_DREP_RECEIVED;
+ break;
+ case CM_SIDR_REQ_ATTR_ID:
+ event = IB_CM_SIDR_REQ_RECEIVED;
+ break;
+ case CM_SIDR_REP_ATTR_ID:
+ event = IB_CM_SIDR_REP_RECEIVED;
+ break;
+ case CM_LAP_ATTR_ID:
+ paths = 1;
+ event = IB_CM_LAP_RECEIVED;
+ break;
+ case CM_APR_ATTR_ID:
+ event = IB_CM_APR_RECEIVED;
+ break;
+ default:
+ ib_free_recv_mad(mad_recv_wc);
+ return;
+ }
+
+ attr_id = be16_to_cpu(mad_recv_wc->recv_buf.mad->mad_hdr.attr_id);
+ atomic_long_inc(&port->counter_group[CM_RECV].
+ counter[attr_id - CM_ATTR_ID_OFFSET]);
+
+ work = kmalloc(sizeof *work + sizeof(struct ib_sa_path_rec) * paths,
+ GFP_KERNEL);
+ if (!work) {
+ ib_free_recv_mad(mad_recv_wc);
+ return;
+ }
+
+ INIT_DELAYED_WORK(&work->work, cm_work_handler);
+ work->cm_event.event = event;
+ work->mad_recv_wc = mad_recv_wc;
+ work->port = port;
+ queue_delayed_work(cm.wq, &work->work, 0);
+}
+
+static int cm_init_qp_init_attr(struct cm_id_private *cm_id_priv,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REQ_SENT:
+ case IB_CM_MRA_REQ_RCVD:
+ case IB_CM_REQ_RCVD:
+ case IB_CM_MRA_REQ_SENT:
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ case IB_CM_ESTABLISHED:
+ *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX | IB_QP_PORT;
+ qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE;
+ if (cm_id_priv->responder_resources)
+ qp_attr->qp_access_flags |= IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_ATOMIC;
+ qp_attr->pkey_index = cm_id_priv->av.pkey_index;
+ qp_attr->port_num = cm_id_priv->av.port->port_num;
+ ret = 0;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+
+static int cm_init_qp_rtr_attr(struct cm_id_private *cm_id_priv,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->id.state) {
+ case IB_CM_REQ_RCVD:
+ case IB_CM_MRA_REQ_SENT:
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ case IB_CM_ESTABLISHED:
+ *qp_attr_mask = IB_QP_STATE | IB_QP_AV | IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN | IB_QP_RQ_PSN;
+ qp_attr->ah_attr = cm_id_priv->av.ah_attr;
+ if (!cm_id_priv->av.valid) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return -EINVAL;
+ }
+ if (cm_id_priv->av.ah_attr.vlan_id != 0xffff) {
+ qp_attr->vlan_id = cm_id_priv->av.ah_attr.vlan_id;
+ *qp_attr_mask |= IB_QP_VID;
+ }
+ if (!is_zero_ether_addr(cm_id_priv->av.smac)) {
+ memcpy(qp_attr->smac, cm_id_priv->av.smac,
+ sizeof(qp_attr->smac));
+ *qp_attr_mask |= IB_QP_SMAC;
+ }
+ if (cm_id_priv->alt_av.valid) {
+ if (cm_id_priv->alt_av.ah_attr.vlan_id != 0xffff) {
+ qp_attr->alt_vlan_id =
+ cm_id_priv->alt_av.ah_attr.vlan_id;
+ *qp_attr_mask |= IB_QP_ALT_VID;
+ }
+ if (!is_zero_ether_addr(cm_id_priv->alt_av.smac)) {
+ memcpy(qp_attr->alt_smac,
+ cm_id_priv->alt_av.smac,
+ sizeof(qp_attr->alt_smac));
+ *qp_attr_mask |= IB_QP_ALT_SMAC;
+ }
+ }
+ qp_attr->path_mtu = cm_id_priv->path_mtu;
+ qp_attr->dest_qp_num = be32_to_cpu(cm_id_priv->remote_qpn);
+ qp_attr->rq_psn = be32_to_cpu(cm_id_priv->rq_psn);
+ if (cm_id_priv->qp_type == IB_QPT_RC ||
+ cm_id_priv->qp_type == IB_QPT_XRC_TGT) {
+ *qp_attr_mask |= IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_MIN_RNR_TIMER;
+ qp_attr->max_dest_rd_atomic =
+ cm_id_priv->responder_resources;
+ qp_attr->min_rnr_timer = 0;
+ }
+ if (cm_id_priv->alt_av.ah_attr.dlid) {
+ *qp_attr_mask |= IB_QP_ALT_PATH;
+ qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num;
+ qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index;
+ qp_attr->alt_timeout = cm_id_priv->alt_av.timeout;
+ qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr;
+ }
+ ret = 0;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+
+static int cm_init_qp_rts_attr(struct cm_id_private *cm_id_priv,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->id.state) {
+ /* Allow transition to RTS before sending REP */
+ case IB_CM_REQ_RCVD:
+ case IB_CM_MRA_REQ_SENT:
+
+ case IB_CM_REP_RCVD:
+ case IB_CM_MRA_REP_SENT:
+ case IB_CM_REP_SENT:
+ case IB_CM_MRA_REP_RCVD:
+ case IB_CM_ESTABLISHED:
+ if (cm_id_priv->id.lap_state == IB_CM_LAP_UNINIT) {
+ *qp_attr_mask = IB_QP_STATE | IB_QP_SQ_PSN;
+ qp_attr->sq_psn = be32_to_cpu(cm_id_priv->sq_psn);
+ switch (cm_id_priv->qp_type) {
+ case IB_QPT_RC:
+ case IB_QPT_XRC_INI:
+ *qp_attr_mask |= IB_QP_RETRY_CNT | IB_QP_RNR_RETRY |
+ IB_QP_MAX_QP_RD_ATOMIC;
+ qp_attr->retry_cnt = cm_id_priv->retry_count;
+ qp_attr->rnr_retry = cm_id_priv->rnr_retry_count;
+ qp_attr->max_rd_atomic = cm_id_priv->initiator_depth;
+ /* fall through */
+ case IB_QPT_XRC_TGT:
+ *qp_attr_mask |= IB_QP_TIMEOUT;
+ qp_attr->timeout = cm_id_priv->av.timeout;
+ break;
+ default:
+ break;
+ }
+ if (cm_id_priv->alt_av.ah_attr.dlid) {
+ *qp_attr_mask |= IB_QP_PATH_MIG_STATE;
+ qp_attr->path_mig_state = IB_MIG_REARM;
+ }
+ } else {
+ *qp_attr_mask = IB_QP_ALT_PATH | IB_QP_PATH_MIG_STATE;
+ qp_attr->alt_port_num = cm_id_priv->alt_av.port->port_num;
+ qp_attr->alt_pkey_index = cm_id_priv->alt_av.pkey_index;
+ qp_attr->alt_timeout = cm_id_priv->alt_av.timeout;
+ qp_attr->alt_ah_attr = cm_id_priv->alt_av.ah_attr;
+ qp_attr->path_mig_state = IB_MIG_REARM;
+ }
+ ret = 0;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+
+int ib_cm_init_qp_attr(struct ib_cm_id *cm_id,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ struct cm_id_private *cm_id_priv;
+ int ret;
+
+ cm_id_priv = container_of(cm_id, struct cm_id_private, id);
+ switch (qp_attr->qp_state) {
+ case IB_QPS_INIT:
+ ret = cm_init_qp_init_attr(cm_id_priv, qp_attr, qp_attr_mask);
+ break;
+ case IB_QPS_RTR:
+ ret = cm_init_qp_rtr_attr(cm_id_priv, qp_attr, qp_attr_mask);
+ break;
+ case IB_QPS_RTS:
+ ret = cm_init_qp_rts_attr(cm_id_priv, qp_attr, qp_attr_mask);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(ib_cm_init_qp_attr);
+
+static void cm_get_ack_delay(struct cm_device *cm_dev)
+{
+ struct ib_device_attr attr;
+
+ if (ib_query_device(cm_dev->ib_device, &attr))
+ cm_dev->ack_delay = 0; /* acks will rely on packet life time */
+ else
+ cm_dev->ack_delay = attr.local_ca_ack_delay;
+}
+
+static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr,
+ char *buf)
+{
+ struct cm_counter_group *group;
+ struct cm_counter_attribute *cm_attr;
+
+ group = container_of(obj, struct cm_counter_group, obj);
+ cm_attr = container_of(attr, struct cm_counter_attribute, attr);
+
+ return sprintf(buf, "%ld\n",
+ atomic_long_read(&group->counter[cm_attr->index]));
+}
+
+static const struct sysfs_ops cm_counter_ops = {
+ .show = cm_show_counter
+};
+
+static struct kobj_type cm_counter_obj_type = {
+ .sysfs_ops = &cm_counter_ops,
+ .default_attrs = cm_counter_default_attrs
+};
+
+static void cm_release_port_obj(struct kobject *obj)
+{
+ struct cm_port *cm_port;
+
+ cm_port = container_of(obj, struct cm_port, port_obj);
+ kfree(cm_port);
+}
+
+static struct kobj_type cm_port_obj_type = {
+ .release = cm_release_port_obj
+};
+
+static char *cm_devnode(struct device *dev, umode_t *mode)
+{
+ if (mode)
+ *mode = 0666;
+ return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
+}
+
+struct class cm_class = {
+ .owner = THIS_MODULE,
+ .name = "infiniband_cm",
+ .devnode = cm_devnode,
+};
+EXPORT_SYMBOL(cm_class);
+
+static int cm_create_port_fs(struct cm_port *port)
+{
+ int i, ret;
+
+ ret = kobject_init_and_add(&port->port_obj, &cm_port_obj_type,
+ &port->cm_dev->device->kobj,
+ "%d", port->port_num);
+ if (ret) {
+ kfree(port);
+ return ret;
+ }
+
+ for (i = 0; i < CM_COUNTER_GROUPS; i++) {
+ ret = kobject_init_and_add(&port->counter_group[i].obj,
+ &cm_counter_obj_type,
+ &port->port_obj,
+ "%s", counter_group_names[i]);
+ if (ret)
+ goto error;
+ }
+
+ return 0;
+
+error:
+ while (i--)
+ kobject_put(&port->counter_group[i].obj);
+ kobject_put(&port->port_obj);
+ return ret;
+
+}
+
+static void cm_remove_port_fs(struct cm_port *port)
+{
+ int i;
+
+ for (i = 0; i < CM_COUNTER_GROUPS; i++)
+ kobject_put(&port->counter_group[i].obj);
+
+ kobject_put(&port->port_obj);
+}
+
+static void cm_add_one(struct ib_device *ib_device)
+{
+ struct cm_device *cm_dev;
+ struct cm_port *port;
+ struct ib_mad_reg_req reg_req = {
+ .mgmt_class = IB_MGMT_CLASS_CM,
+ .mgmt_class_version = IB_CM_CLASS_VERSION,
+ };
+ struct ib_port_modify port_modify = {
+ .set_port_cap_mask = IB_PORT_CM_SUP
+ };
+ unsigned long flags;
+ int ret;
+ u8 i;
+
+ if (rdma_node_get_transport(ib_device->node_type) != RDMA_TRANSPORT_IB)
+ return;
+
+ cm_dev = kzalloc(sizeof(*cm_dev) + sizeof(*port) *
+ ib_device->phys_port_cnt, GFP_KERNEL);
+ if (!cm_dev)
+ return;
+
+ cm_dev->ib_device = ib_device;
+ cm_get_ack_delay(cm_dev);
+
+ cm_dev->device = device_create(&cm_class, &ib_device->dev,
+ MKDEV(0, 0), NULL,
+ "%s", ib_device->name);
+ if (IS_ERR(cm_dev->device)) {
+ kfree(cm_dev);
+ return;
+ }
+
+ set_bit(IB_MGMT_METHOD_SEND, reg_req.method_mask);
+ for (i = 1; i <= ib_device->phys_port_cnt; i++) {
+ port = kzalloc(sizeof *port, GFP_KERNEL);
+ if (!port)
+ goto error1;
+
+ cm_dev->port[i-1] = port;
+ port->cm_dev = cm_dev;
+ port->port_num = i;
+
+ ret = cm_create_port_fs(port);
+ if (ret)
+ goto error1;
+
+ port->mad_agent = ib_register_mad_agent(ib_device, i,
+ IB_QPT_GSI,
+ &reg_req,
+ 0,
+ cm_send_handler,
+ cm_recv_handler,
+ port,
+ 0);
+ if (IS_ERR(port->mad_agent))
+ goto error2;
+
+ ret = ib_modify_port(ib_device, i, 0, &port_modify);
+ if (ret)
+ goto error3;
+ }
+ ib_set_client_data(ib_device, &cm_client, cm_dev);
+
+ write_lock_irqsave(&cm.device_lock, flags);
+ list_add_tail(&cm_dev->list, &cm.device_list);
+ write_unlock_irqrestore(&cm.device_lock, flags);
+ return;
+
+error3:
+ ib_unregister_mad_agent(port->mad_agent);
+error2:
+ cm_remove_port_fs(port);
+error1:
+ port_modify.set_port_cap_mask = 0;
+ port_modify.clr_port_cap_mask = IB_PORT_CM_SUP;
+ while (--i) {
+ port = cm_dev->port[i-1];
+ ib_modify_port(ib_device, port->port_num, 0, &port_modify);
+ ib_unregister_mad_agent(port->mad_agent);
+ cm_remove_port_fs(port);
+ }
+ device_unregister(cm_dev->device);
+ kfree(cm_dev);
+}
+
+static void cm_remove_one(struct ib_device *ib_device)
+{
+ struct cm_device *cm_dev;
+ struct cm_port *port;
+ struct ib_port_modify port_modify = {
+ .clr_port_cap_mask = IB_PORT_CM_SUP
+ };
+ unsigned long flags;
+ int i;
+
+ cm_dev = ib_get_client_data(ib_device, &cm_client);
+ if (!cm_dev)
+ return;
+
+ write_lock_irqsave(&cm.device_lock, flags);
+ list_del(&cm_dev->list);
+ write_unlock_irqrestore(&cm.device_lock, flags);
+
+ for (i = 1; i <= ib_device->phys_port_cnt; i++) {
+ port = cm_dev->port[i-1];
+ ib_modify_port(ib_device, port->port_num, 0, &port_modify);
+ ib_unregister_mad_agent(port->mad_agent);
+ flush_workqueue(cm.wq);
+ cm_remove_port_fs(port);
+ }
+ device_unregister(cm_dev->device);
+ kfree(cm_dev);
+}
+
+static int __init ib_cm_init(void)
+{
+ int ret;
+
+ memset(&cm, 0, sizeof cm);
+ INIT_LIST_HEAD(&cm.device_list);
+ rwlock_init(&cm.device_lock);
+ spin_lock_init(&cm.lock);
+ cm.listen_service_table = RB_ROOT;
+ cm.listen_service_id = be64_to_cpu(IB_CM_ASSIGN_SERVICE_ID);
+ cm.remote_id_table = RB_ROOT;
+ cm.remote_qp_table = RB_ROOT;
+ cm.remote_sidr_table = RB_ROOT;
+ idr_init(&cm.local_id_table);
+ get_random_bytes(&cm.random_id_operand, sizeof cm.random_id_operand);
+ INIT_LIST_HEAD(&cm.timewait_list);
+
+ ret = class_register(&cm_class);
+ if (ret) {
+ ret = -ENOMEM;
+ goto error1;
+ }
+
+ cm.wq = create_workqueue("ib_cm");
+ if (!cm.wq) {
+ ret = -ENOMEM;
+ goto error2;
+ }
+
+ ret = ib_register_client(&cm_client);
+ if (ret)
+ goto error3;
+
+ return 0;
+error3:
+ destroy_workqueue(cm.wq);
+error2:
+ class_unregister(&cm_class);
+error1:
+ idr_destroy(&cm.local_id_table);
+ return ret;
+}
+
+static void __exit ib_cm_cleanup(void)
+{
+ struct cm_timewait_info *timewait_info, *tmp;
+
+ spin_lock_irq(&cm.lock);
+ list_for_each_entry(timewait_info, &cm.timewait_list, list)
+ cancel_delayed_work(&timewait_info->work.work);
+ spin_unlock_irq(&cm.lock);
+
+ ib_unregister_client(&cm_client);
+ destroy_workqueue(cm.wq);
+
+ list_for_each_entry_safe(timewait_info, tmp, &cm.timewait_list, list) {
+ list_del(&timewait_info->list);
+ kfree(timewait_info);
+ }
+
+ class_unregister(&cm_class);
+ idr_destroy(&cm.local_id_table);
+}
+
+module_init(ib_cm_init);
+module_exit(ib_cm_cleanup);
+
diff --git a/drivers/infiniband/core/cm_msgs.h b/drivers/infiniband/core/cm_msgs.h
new file mode 100644
index 000000000..8b76f0ef9
--- /dev/null
+++ b/drivers/infiniband/core/cm_msgs.h
@@ -0,0 +1,836 @@
+/*
+ * Copyright (c) 2004, 2011 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if !defined(CM_MSGS_H)
+#define CM_MSGS_H
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_cm.h>
+
+/*
+ * Parameters to routines below should be in network-byte order, and values
+ * are returned in network-byte order.
+ */
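+
+/*
+ * Most wire fields below pack several values into a single byte or word;
+ * the comment next to each field gives the bit widths (e.g. "local QPN:24,
+ * responder resources:8"), and the inline cm_*_get_*()/cm_*_set_*() helpers
+ * extract or update the sub-fields with shift-and-mask operations.
+ */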
+
+#define IB_CM_CLASS_VERSION 2 /* IB specification 1.2 */
+
+enum cm_msg_sequence {
+ CM_MSG_SEQUENCE_REQ,
+ CM_MSG_SEQUENCE_LAP,
+ CM_MSG_SEQUENCE_DREQ,
+ CM_MSG_SEQUENCE_SIDR
+};
+
+struct cm_req_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 rsvd4;
+ __be64 service_id;
+ __be64 local_ca_guid;
+ __be32 rsvd24;
+ __be32 local_qkey;
+ /* local QPN:24, responder resources:8 */
+ __be32 offset32;
+ /* local EECN:24, initiator depth:8 */
+ __be32 offset36;
+ /*
+ * remote EECN:24, remote CM response timeout:5,
+ * transport service type:2, end-to-end flow control:1
+ */
+ __be32 offset40;
+ /* starting PSN:24, local CM response timeout:5, retry count:3 */
+ __be32 offset44;
+ __be16 pkey;
+ /* path MTU:4, RDC exists:1, RNR retry count:3. */
+ u8 offset50;
+ /* max CM Retries:4, SRQ:1, extended transport type:3 */
+ u8 offset51;
+
+ __be16 primary_local_lid;
+ __be16 primary_remote_lid;
+ union ib_gid primary_local_gid;
+ union ib_gid primary_remote_gid;
+ /* flow label:20, rsvd:6, packet rate:6 */
+ __be32 primary_offset88;
+ u8 primary_traffic_class;
+ u8 primary_hop_limit;
+ /* SL:4, subnet local:1, rsvd:3 */
+ u8 primary_offset94;
+ /* local ACK timeout:5, rsvd:3 */
+ u8 primary_offset95;
+
+ __be16 alt_local_lid;
+ __be16 alt_remote_lid;
+ union ib_gid alt_local_gid;
+ union ib_gid alt_remote_gid;
+ /* flow label:20, rsvd:6, packet rate:6 */
+ __be32 alt_offset132;
+ u8 alt_traffic_class;
+ u8 alt_hop_limit;
+ /* SL:4, subnet local:1, rsvd:3 */
+ u8 alt_offset138;
+ /* local ACK timeout:5, rsvd:3 */
+ u8 alt_offset139;
+
+ u32 private_data[IB_CM_REQ_PRIVATE_DATA_SIZE / sizeof(u32)];
+
+} __attribute__ ((packed));
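+
+/*
+ * Accessors for the packed REQ fields above.  For example, the local QPN
+ * occupies the upper 24 bits of offset32, so cm_req_get_local_qpn() shifts
+ * the host-order value right by 8, and cm_req_set_local_qpn() shifts the
+ * QPN left by 8 while preserving the low byte (responder resources).
+ */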
+
+static inline __be32 cm_req_get_local_qpn(struct cm_req_msg *req_msg)
+{
+ return cpu_to_be32(be32_to_cpu(req_msg->offset32) >> 8);
+}
+
+static inline void cm_req_set_local_qpn(struct cm_req_msg *req_msg, __be32 qpn)
+{
+ req_msg->offset32 = cpu_to_be32((be32_to_cpu(qpn) << 8) |
+ (be32_to_cpu(req_msg->offset32) &
+ 0x000000FF));
+}
+
+static inline u8 cm_req_get_resp_res(struct cm_req_msg *req_msg)
+{
+ return (u8) be32_to_cpu(req_msg->offset32);
+}
+
+static inline void cm_req_set_resp_res(struct cm_req_msg *req_msg, u8 resp_res)
+{
+ req_msg->offset32 = cpu_to_be32(resp_res |
+ (be32_to_cpu(req_msg->offset32) &
+ 0xFFFFFF00));
+}
+
+static inline u8 cm_req_get_init_depth(struct cm_req_msg *req_msg)
+{
+ return (u8) be32_to_cpu(req_msg->offset36);
+}
+
+static inline void cm_req_set_init_depth(struct cm_req_msg *req_msg,
+ u8 init_depth)
+{
+ req_msg->offset36 = cpu_to_be32(init_depth |
+ (be32_to_cpu(req_msg->offset36) &
+ 0xFFFFFF00));
+}
+
+static inline u8 cm_req_get_remote_resp_timeout(struct cm_req_msg *req_msg)
+{
+ return (u8) ((be32_to_cpu(req_msg->offset40) & 0xF8) >> 3);
+}
+
+static inline void cm_req_set_remote_resp_timeout(struct cm_req_msg *req_msg,
+ u8 resp_timeout)
+{
+ req_msg->offset40 = cpu_to_be32((resp_timeout << 3) |
+ (be32_to_cpu(req_msg->offset40) &
+ 0xFFFFFF07));
+}
+
+static inline enum ib_qp_type cm_req_get_qp_type(struct cm_req_msg *req_msg)
+{
+ u8 transport_type = (u8) (be32_to_cpu(req_msg->offset40) & 0x06) >> 1;
+ switch(transport_type) {
+ case 0: return IB_QPT_RC;
+ case 1: return IB_QPT_UC;
+ case 3:
+ switch (req_msg->offset51 & 0x7) {
+ case 1: return IB_QPT_XRC_TGT;
+ default: return 0;
+ }
+ default: return 0;
+ }
+}
+
+static inline void cm_req_set_qp_type(struct cm_req_msg *req_msg,
+ enum ib_qp_type qp_type)
+{
+ switch(qp_type) {
+ case IB_QPT_UC:
+ req_msg->offset40 = cpu_to_be32((be32_to_cpu(
+ req_msg->offset40) &
+ 0xFFFFFFF9) | 0x2);
+ break;
+ case IB_QPT_XRC_INI:
+ req_msg->offset40 = cpu_to_be32((be32_to_cpu(
+ req_msg->offset40) &
+ 0xFFFFFFF9) | 0x6);
+ req_msg->offset51 = (req_msg->offset51 & 0xF8) | 1;
+ break;
+ default:
+ req_msg->offset40 = cpu_to_be32(be32_to_cpu(
+ req_msg->offset40) &
+ 0xFFFFFFF9);
+ }
+}
+
+static inline u8 cm_req_get_flow_ctrl(struct cm_req_msg *req_msg)
+{
+ return be32_to_cpu(req_msg->offset40) & 0x1;
+}
+
+static inline void cm_req_set_flow_ctrl(struct cm_req_msg *req_msg,
+ u8 flow_ctrl)
+{
+ req_msg->offset40 = cpu_to_be32((flow_ctrl & 0x1) |
+ (be32_to_cpu(req_msg->offset40) &
+ 0xFFFFFFFE));
+}
+
+static inline __be32 cm_req_get_starting_psn(struct cm_req_msg *req_msg)
+{
+ return cpu_to_be32(be32_to_cpu(req_msg->offset44) >> 8);
+}
+
+static inline void cm_req_set_starting_psn(struct cm_req_msg *req_msg,
+ __be32 starting_psn)
+{
+ req_msg->offset44 = cpu_to_be32((be32_to_cpu(starting_psn) << 8) |
+ (be32_to_cpu(req_msg->offset44) & 0x000000FF));
+}
+
+static inline u8 cm_req_get_local_resp_timeout(struct cm_req_msg *req_msg)
+{
+ return (u8) ((be32_to_cpu(req_msg->offset44) & 0xF8) >> 3);
+}
+
+static inline void cm_req_set_local_resp_timeout(struct cm_req_msg *req_msg,
+ u8 resp_timeout)
+{
+ req_msg->offset44 = cpu_to_be32((resp_timeout << 3) |
+ (be32_to_cpu(req_msg->offset44) & 0xFFFFFF07));
+}
+
+static inline u8 cm_req_get_retry_count(struct cm_req_msg *req_msg)
+{
+ return (u8) (be32_to_cpu(req_msg->offset44) & 0x7);
+}
+
+static inline void cm_req_set_retry_count(struct cm_req_msg *req_msg,
+ u8 retry_count)
+{
+ req_msg->offset44 = cpu_to_be32((retry_count & 0x7) |
+ (be32_to_cpu(req_msg->offset44) & 0xFFFFFFF8));
+}
+
+static inline u8 cm_req_get_path_mtu(struct cm_req_msg *req_msg)
+{
+ return req_msg->offset50 >> 4;
+}
+
+static inline void cm_req_set_path_mtu(struct cm_req_msg *req_msg, u8 path_mtu)
+{
+ req_msg->offset50 = (u8) ((req_msg->offset50 & 0xF) | (path_mtu << 4));
+}
+
+static inline u8 cm_req_get_rnr_retry_count(struct cm_req_msg *req_msg)
+{
+ return req_msg->offset50 & 0x7;
+}
+
+static inline void cm_req_set_rnr_retry_count(struct cm_req_msg *req_msg,
+ u8 rnr_retry_count)
+{
+ req_msg->offset50 = (u8) ((req_msg->offset50 & 0xF8) |
+ (rnr_retry_count & 0x7));
+}
+
+static inline u8 cm_req_get_max_cm_retries(struct cm_req_msg *req_msg)
+{
+ return req_msg->offset51 >> 4;
+}
+
+static inline void cm_req_set_max_cm_retries(struct cm_req_msg *req_msg,
+ u8 retries)
+{
+ req_msg->offset51 = (u8) ((req_msg->offset51 & 0xF) | (retries << 4));
+}
+
+static inline u8 cm_req_get_srq(struct cm_req_msg *req_msg)
+{
+ return (req_msg->offset51 & 0x8) >> 3;
+}
+
+static inline void cm_req_set_srq(struct cm_req_msg *req_msg, u8 srq)
+{
+ req_msg->offset51 = (u8) ((req_msg->offset51 & 0xF7) |
+ ((srq & 0x1) << 3));
+}
+
+static inline __be32 cm_req_get_primary_flow_label(struct cm_req_msg *req_msg)
+{
+ return cpu_to_be32(be32_to_cpu(req_msg->primary_offset88) >> 12);
+}
+
+static inline void cm_req_set_primary_flow_label(struct cm_req_msg *req_msg,
+ __be32 flow_label)
+{
+ req_msg->primary_offset88 = cpu_to_be32(
+ (be32_to_cpu(req_msg->primary_offset88) &
+ 0x00000FFF) |
+ (be32_to_cpu(flow_label) << 12));
+}
+
+static inline u8 cm_req_get_primary_packet_rate(struct cm_req_msg *req_msg)
+{
+ return (u8) (be32_to_cpu(req_msg->primary_offset88) & 0x3F);
+}
+
+static inline void cm_req_set_primary_packet_rate(struct cm_req_msg *req_msg,
+ u8 rate)
+{
+ req_msg->primary_offset88 = cpu_to_be32(
+ (be32_to_cpu(req_msg->primary_offset88) &
+ 0xFFFFFFC0) | (rate & 0x3F));
+}
+
+static inline u8 cm_req_get_primary_sl(struct cm_req_msg *req_msg)
+{
+ return (u8) (req_msg->primary_offset94 >> 4);
+}
+
+static inline void cm_req_set_primary_sl(struct cm_req_msg *req_msg, u8 sl)
+{
+ req_msg->primary_offset94 = (u8) ((req_msg->primary_offset94 & 0x0F) |
+ (sl << 4));
+}
+
+static inline u8 cm_req_get_primary_subnet_local(struct cm_req_msg *req_msg)
+{
+ return (u8) ((req_msg->primary_offset94 & 0x08) >> 3);
+}
+
+static inline void cm_req_set_primary_subnet_local(struct cm_req_msg *req_msg,
+ u8 subnet_local)
+{
+ req_msg->primary_offset94 = (u8) ((req_msg->primary_offset94 & 0xF7) |
+ ((subnet_local & 0x1) << 3));
+}
+
+static inline u8 cm_req_get_primary_local_ack_timeout(struct cm_req_msg *req_msg)
+{
+ return (u8) (req_msg->primary_offset95 >> 3);
+}
+
+static inline void cm_req_set_primary_local_ack_timeout(struct cm_req_msg *req_msg,
+ u8 local_ack_timeout)
+{
+ req_msg->primary_offset95 = (u8) ((req_msg->primary_offset95 & 0x07) |
+ (local_ack_timeout << 3));
+}
+
+static inline __be32 cm_req_get_alt_flow_label(struct cm_req_msg *req_msg)
+{
+ return cpu_to_be32(be32_to_cpu(req_msg->alt_offset132) >> 12);
+}
+
+static inline void cm_req_set_alt_flow_label(struct cm_req_msg *req_msg,
+ __be32 flow_label)
+{
+ req_msg->alt_offset132 = cpu_to_be32(
+ (be32_to_cpu(req_msg->alt_offset132) &
+ 0x00000FFF) |
+ (be32_to_cpu(flow_label) << 12));
+}
+
+static inline u8 cm_req_get_alt_packet_rate(struct cm_req_msg *req_msg)
+{
+ return (u8) (be32_to_cpu(req_msg->alt_offset132) & 0x3F);
+}
+
+static inline void cm_req_set_alt_packet_rate(struct cm_req_msg *req_msg,
+ u8 rate)
+{
+ req_msg->alt_offset132 = cpu_to_be32(
+ (be32_to_cpu(req_msg->alt_offset132) &
+ 0xFFFFFFC0) | (rate & 0x3F));
+}
+
+static inline u8 cm_req_get_alt_sl(struct cm_req_msg *req_msg)
+{
+ return (u8) (req_msg->alt_offset138 >> 4);
+}
+
+static inline void cm_req_set_alt_sl(struct cm_req_msg *req_msg, u8 sl)
+{
+ req_msg->alt_offset138 = (u8) ((req_msg->alt_offset138 & 0x0F) |
+ (sl << 4));
+}
+
+static inline u8 cm_req_get_alt_subnet_local(struct cm_req_msg *req_msg)
+{
+ return (u8) ((req_msg->alt_offset138 & 0x08) >> 3);
+}
+
+static inline void cm_req_set_alt_subnet_local(struct cm_req_msg *req_msg,
+ u8 subnet_local)
+{
+ req_msg->alt_offset138 = (u8) ((req_msg->alt_offset138 & 0xF7) |
+ ((subnet_local & 0x1) << 3));
+}
+
+static inline u8 cm_req_get_alt_local_ack_timeout(struct cm_req_msg *req_msg)
+{
+ return (u8) (req_msg->alt_offset139 >> 3);
+}
+
+static inline void cm_req_set_alt_local_ack_timeout(struct cm_req_msg *req_msg,
+ u8 local_ack_timeout)
+{
+ req_msg->alt_offset139 = (u8) ((req_msg->alt_offset139 & 0x07) |
+ (local_ack_timeout << 3));
+}
+
+/* Message REJected or MRAed */
+enum cm_msg_response {
+ CM_MSG_RESPONSE_REQ = 0x0,
+ CM_MSG_RESPONSE_REP = 0x1,
+ CM_MSG_RESPONSE_OTHER = 0x2
+};
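+
+/*
+ * MRA and REJ messages record which message they answer (REQ, REP or other)
+ * in the top two bits of their offset8 byte; see cm_mra_set_msg_mraed() and
+ * cm_rej_set_msg_rejected() below.
+ */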
+
+struct cm_mra_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+ /* message MRAed:2, rsvd:6 */
+ u8 offset8;
+ /* service timeout:5, rsvd:3 */
+ u8 offset9;
+
+ u8 private_data[IB_CM_MRA_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+static inline u8 cm_mra_get_msg_mraed(struct cm_mra_msg *mra_msg)
+{
+ return (u8) (mra_msg->offset8 >> 6);
+}
+
+static inline void cm_mra_set_msg_mraed(struct cm_mra_msg *mra_msg, u8 msg)
+{
+ mra_msg->offset8 = (u8) ((mra_msg->offset8 & 0x3F) | (msg << 6));
+}
+
+static inline u8 cm_mra_get_service_timeout(struct cm_mra_msg *mra_msg)
+{
+ return (u8) (mra_msg->offset9 >> 3);
+}
+
+static inline void cm_mra_set_service_timeout(struct cm_mra_msg *mra_msg,
+ u8 service_timeout)
+{
+ mra_msg->offset9 = (u8) ((mra_msg->offset9 & 0x07) |
+ (service_timeout << 3));
+}
+
+struct cm_rej_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+ /* message REJected:2, rsvd:6 */
+ u8 offset8;
+ /* reject info length:7, rsvd:1. */
+ u8 offset9;
+ __be16 reason;
+ u8 ari[IB_CM_REJ_ARI_LENGTH];
+
+ u8 private_data[IB_CM_REJ_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+static inline u8 cm_rej_get_msg_rejected(struct cm_rej_msg *rej_msg)
+{
+ return (u8) (rej_msg->offset8 >> 6);
+}
+
+static inline void cm_rej_set_msg_rejected(struct cm_rej_msg *rej_msg, u8 msg)
+{
+ rej_msg->offset8 = (u8) ((rej_msg->offset8 & 0x3F) | (msg << 6));
+}
+
+static inline u8 cm_rej_get_reject_info_len(struct cm_rej_msg *rej_msg)
+{
+ return (u8) (rej_msg->offset9 >> 1);
+}
+
+static inline void cm_rej_set_reject_info_len(struct cm_rej_msg *rej_msg,
+ u8 len)
+{
+ rej_msg->offset9 = (u8) ((rej_msg->offset9 & 0x1) | (len << 1));
+}
+
+struct cm_rep_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+ __be32 local_qkey;
+ /* local QPN:24, rsvd:8 */
+ __be32 offset12;
+ /* local EECN:24, rsvd:8 */
+ __be32 offset16;
+ /* starting PSN:24 rsvd:8 */
+ __be32 offset20;
+ u8 resp_resources;
+ u8 initiator_depth;
+ /* target ACK delay:5, failover accepted:2, end-to-end flow control:1 */
+ u8 offset26;
+ /* RNR retry count:3, SRQ:1, rsvd:5 */
+ u8 offset27;
+ __be64 local_ca_guid;
+
+ u8 private_data[IB_CM_REP_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+static inline __be32 cm_rep_get_local_qpn(struct cm_rep_msg *rep_msg)
+{
+ return cpu_to_be32(be32_to_cpu(rep_msg->offset12) >> 8);
+}
+
+static inline void cm_rep_set_local_qpn(struct cm_rep_msg *rep_msg, __be32 qpn)
+{
+ rep_msg->offset12 = cpu_to_be32((be32_to_cpu(qpn) << 8) |
+ (be32_to_cpu(rep_msg->offset12) & 0x000000FF));
+}
+
+static inline __be32 cm_rep_get_local_eecn(struct cm_rep_msg *rep_msg)
+{
+ return cpu_to_be32(be32_to_cpu(rep_msg->offset16) >> 8);
+}
+
+static inline void cm_rep_set_local_eecn(struct cm_rep_msg *rep_msg, __be32 eecn)
+{
+ rep_msg->offset16 = cpu_to_be32((be32_to_cpu(eecn) << 8) |
+ (be32_to_cpu(rep_msg->offset16) & 0x000000FF));
+}
+
+static inline __be32 cm_rep_get_qpn(struct cm_rep_msg *rep_msg, enum ib_qp_type qp_type)
+{
+ return (qp_type == IB_QPT_XRC_INI) ?
+ cm_rep_get_local_eecn(rep_msg) : cm_rep_get_local_qpn(rep_msg);
+}
+
+static inline __be32 cm_rep_get_starting_psn(struct cm_rep_msg *rep_msg)
+{
+ return cpu_to_be32(be32_to_cpu(rep_msg->offset20) >> 8);
+}
+
+static inline void cm_rep_set_starting_psn(struct cm_rep_msg *rep_msg,
+ __be32 starting_psn)
+{
+ rep_msg->offset20 = cpu_to_be32((be32_to_cpu(starting_psn) << 8) |
+ (be32_to_cpu(rep_msg->offset20) & 0x000000FF));
+}
+
+static inline u8 cm_rep_get_target_ack_delay(struct cm_rep_msg *rep_msg)
+{
+ return (u8) (rep_msg->offset26 >> 3);
+}
+
+static inline void cm_rep_set_target_ack_delay(struct cm_rep_msg *rep_msg,
+ u8 target_ack_delay)
+{
+ rep_msg->offset26 = (u8) ((rep_msg->offset26 & 0x07) |
+ (target_ack_delay << 3));
+}
+
+static inline u8 cm_rep_get_failover(struct cm_rep_msg *rep_msg)
+{
+ return (u8) ((rep_msg->offset26 & 0x06) >> 1);
+}
+
+static inline void cm_rep_set_failover(struct cm_rep_msg *rep_msg, u8 failover)
+{
+ rep_msg->offset26 = (u8) ((rep_msg->offset26 & 0xF9) |
+ ((failover & 0x3) << 1));
+}
+
+static inline u8 cm_rep_get_flow_ctrl(struct cm_rep_msg *rep_msg)
+{
+ return (u8) (rep_msg->offset26 & 0x01);
+}
+
+static inline void cm_rep_set_flow_ctrl(struct cm_rep_msg *rep_msg,
+ u8 flow_ctrl)
+{
+ rep_msg->offset26 = (u8) ((rep_msg->offset26 & 0xFE) |
+ (flow_ctrl & 0x1));
+}
+
+static inline u8 cm_rep_get_rnr_retry_count(struct cm_rep_msg *rep_msg)
+{
+ return (u8) (rep_msg->offset27 >> 5);
+}
+
+static inline void cm_rep_set_rnr_retry_count(struct cm_rep_msg *rep_msg,
+ u8 rnr_retry_count)
+{
+ rep_msg->offset27 = (u8) ((rep_msg->offset27 & 0x1F) |
+ (rnr_retry_count << 5));
+}
+
+static inline u8 cm_rep_get_srq(struct cm_rep_msg *rep_msg)
+{
+ return (u8) ((rep_msg->offset27 >> 4) & 0x1);
+}
+
+static inline void cm_rep_set_srq(struct cm_rep_msg *rep_msg, u8 srq)
+{
+ rep_msg->offset27 = (u8) ((rep_msg->offset27 & 0xEF) |
+ ((srq & 0x1) << 4));
+}
+
+struct cm_rtu_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+
+ u8 private_data[IB_CM_RTU_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+struct cm_dreq_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+ /* remote QPN/EECN:24, rsvd:8 */
+ __be32 offset8;
+
+ u8 private_data[IB_CM_DREQ_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+static inline __be32 cm_dreq_get_remote_qpn(struct cm_dreq_msg *dreq_msg)
+{
+ return cpu_to_be32(be32_to_cpu(dreq_msg->offset8) >> 8);
+}
+
+static inline void cm_dreq_set_remote_qpn(struct cm_dreq_msg *dreq_msg, __be32 qpn)
+{
+ dreq_msg->offset8 = cpu_to_be32((be32_to_cpu(qpn) << 8) |
+ (be32_to_cpu(dreq_msg->offset8) & 0x000000FF));
+}
+
+struct cm_drep_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+
+ u8 private_data[IB_CM_DREP_PRIVATE_DATA_SIZE];
+
+} __attribute__ ((packed));
+
+struct cm_lap_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+
+ __be32 rsvd8;
+ /* remote QPN/EECN:24, remote CM response timeout:5, rsvd:3 */
+ __be32 offset12;
+ __be32 rsvd16;
+
+ __be16 alt_local_lid;
+ __be16 alt_remote_lid;
+ union ib_gid alt_local_gid;
+ union ib_gid alt_remote_gid;
+ /* flow label:20, rsvd:4, traffic class:8 */
+ __be32 offset56;
+ u8 alt_hop_limit;
+ /* rsvd:2, packet rate:6 */
+ u8 offset61;
+ /* SL:4, subnet local:1, rsvd:3 */
+ u8 offset62;
+ /* local ACK timeout:5, rsvd:3 */
+ u8 offset63;
+
+ u8 private_data[IB_CM_LAP_PRIVATE_DATA_SIZE];
+} __attribute__ ((packed));
+
+static inline __be32 cm_lap_get_remote_qpn(struct cm_lap_msg *lap_msg)
+{
+ return cpu_to_be32(be32_to_cpu(lap_msg->offset12) >> 8);
+}
+
+static inline void cm_lap_set_remote_qpn(struct cm_lap_msg *lap_msg, __be32 qpn)
+{
+ lap_msg->offset12 = cpu_to_be32((be32_to_cpu(qpn) << 8) |
+ (be32_to_cpu(lap_msg->offset12) &
+ 0x000000FF));
+}
+
+static inline u8 cm_lap_get_remote_resp_timeout(struct cm_lap_msg *lap_msg)
+{
+ return (u8) ((be32_to_cpu(lap_msg->offset12) & 0xF8) >> 3);
+}
+
+static inline void cm_lap_set_remote_resp_timeout(struct cm_lap_msg *lap_msg,
+ u8 resp_timeout)
+{
+ lap_msg->offset12 = cpu_to_be32((resp_timeout << 3) |
+ (be32_to_cpu(lap_msg->offset12) &
+ 0xFFFFFF07));
+}
+
+static inline __be32 cm_lap_get_flow_label(struct cm_lap_msg *lap_msg)
+{
+ return cpu_to_be32(be32_to_cpu(lap_msg->offset56) >> 12);
+}
+
+static inline void cm_lap_set_flow_label(struct cm_lap_msg *lap_msg,
+ __be32 flow_label)
+{
+ lap_msg->offset56 = cpu_to_be32(
+ (be32_to_cpu(lap_msg->offset56) & 0x00000FFF) |
+ (be32_to_cpu(flow_label) << 12));
+}
+
+static inline u8 cm_lap_get_traffic_class(struct cm_lap_msg *lap_msg)
+{
+ return (u8) be32_to_cpu(lap_msg->offset56);
+}
+
+static inline void cm_lap_set_traffic_class(struct cm_lap_msg *lap_msg,
+ u8 traffic_class)
+{
+ lap_msg->offset56 = cpu_to_be32(traffic_class |
+ (be32_to_cpu(lap_msg->offset56) &
+ 0xFFFFFF00));
+}
+
+static inline u8 cm_lap_get_packet_rate(struct cm_lap_msg *lap_msg)
+{
+ return lap_msg->offset61 & 0x3F;
+}
+
+static inline void cm_lap_set_packet_rate(struct cm_lap_msg *lap_msg,
+ u8 packet_rate)
+{
+ lap_msg->offset61 = (packet_rate & 0x3F) | (lap_msg->offset61 & 0xC0);
+}
+
+static inline u8 cm_lap_get_sl(struct cm_lap_msg *lap_msg)
+{
+ return lap_msg->offset62 >> 4;
+}
+
+static inline void cm_lap_set_sl(struct cm_lap_msg *lap_msg, u8 sl)
+{
+ lap_msg->offset62 = (sl << 4) | (lap_msg->offset62 & 0x0F);
+}
+
+static inline u8 cm_lap_get_subnet_local(struct cm_lap_msg *lap_msg)
+{
+ return (lap_msg->offset62 >> 3) & 0x1;
+}
+
+static inline void cm_lap_set_subnet_local(struct cm_lap_msg *lap_msg,
+ u8 subnet_local)
+{
+ lap_msg->offset62 = ((subnet_local & 0x1) << 3) |
+ (lap_msg->offset62 & 0xF7);
+}
+
+static inline u8 cm_lap_get_local_ack_timeout(struct cm_lap_msg *lap_msg)
+{
+ return lap_msg->offset63 >> 3;
+}
+
+static inline void cm_lap_set_local_ack_timeout(struct cm_lap_msg *lap_msg,
+ u8 local_ack_timeout)
+{
+ lap_msg->offset63 = (local_ack_timeout << 3) |
+ (lap_msg->offset63 & 0x07);
+}
+
+struct cm_apr_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+
+ u8 info_length;
+ u8 ap_status;
+ __be16 rsvd;
+ u8 info[IB_CM_APR_INFO_LENGTH];
+
+ u8 private_data[IB_CM_APR_PRIVATE_DATA_SIZE];
+} __attribute__ ((packed));
+
+struct cm_sidr_req_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 request_id;
+ __be16 pkey;
+ __be16 rsvd;
+ __be64 service_id;
+
+ u32 private_data[IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE / sizeof(u32)];
+} __attribute__ ((packed));
+
+struct cm_sidr_rep_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 request_id;
+ u8 status;
+ u8 info_length;
+ __be16 rsvd;
+ /* QPN:24, rsvd:8 */
+ __be32 offset8;
+ __be64 service_id;
+ __be32 qkey;
+ u8 info[IB_CM_SIDR_REP_INFO_LENGTH];
+
+ u8 private_data[IB_CM_SIDR_REP_PRIVATE_DATA_SIZE];
+} __attribute__ ((packed));
+
+static inline __be32 cm_sidr_rep_get_qpn(struct cm_sidr_rep_msg *sidr_rep_msg)
+{
+ return cpu_to_be32(be32_to_cpu(sidr_rep_msg->offset8) >> 8);
+}
+
+static inline void cm_sidr_rep_set_qpn(struct cm_sidr_rep_msg *sidr_rep_msg,
+ __be32 qpn)
+{
+ sidr_rep_msg->offset8 = cpu_to_be32((be32_to_cpu(qpn) << 8) |
+ (be32_to_cpu(sidr_rep_msg->offset8) &
+ 0x000000FF));
+}
+
+#endif /* CM_MSGS_H */
diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c
new file mode 100644
index 000000000..38ffe0981
--- /dev/null
+++ b/drivers/infiniband/core/cma.c
@@ -0,0 +1,3720 @@
+/*
+ * Copyright (c) 2005 Voltaire Inc. All rights reserved.
+ * Copyright (c) 2002-2005, Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 1999-2005, Mellanox Technologies, Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/mutex.h>
+#include <linux/random.h>
+#include <linux/idr.h>
+#include <linux/inetdevice.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <net/route.h>
+
+#include <net/tcp.h>
+#include <net/ipv6.h>
+
+#include <rdma/rdma_cm.h>
+#include <rdma/rdma_cm_ib.h>
+#include <rdma/rdma_netlink.h>
+#include <rdma/ib.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_cm.h>
+#include <rdma/ib_sa.h>
+#include <rdma/iw_cm.h>
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("Generic RDMA CM Agent");
+MODULE_LICENSE("Dual BSD/GPL");
+
+#define CMA_CM_RESPONSE_TIMEOUT 20
+#define CMA_MAX_CM_RETRIES 15
+#define CMA_CM_MRA_SETTING (IB_CM_MRA_FLAG_DELAY | 24)
+#define CMA_IBOE_PACKET_LIFETIME 18
+
+static void cma_add_one(struct ib_device *device);
+static void cma_remove_one(struct ib_device *device);
+
+static struct ib_client cma_client = {
+ .name = "cma",
+ .add = cma_add_one,
+ .remove = cma_remove_one
+};
+
+static struct ib_sa_client sa_client;
+static struct rdma_addr_client addr_client;
+static LIST_HEAD(dev_list);
+static LIST_HEAD(listen_any_list);
+static DEFINE_MUTEX(lock);
+static struct workqueue_struct *cma_wq;
+static DEFINE_IDR(tcp_ps);
+static DEFINE_IDR(udp_ps);
+static DEFINE_IDR(ipoib_ps);
+static DEFINE_IDR(ib_ps);
+
+struct cma_device {
+ struct list_head list;
+ struct ib_device *device;
+ struct completion comp;
+ atomic_t refcount;
+ struct list_head id_list;
+};
+
+struct rdma_bind_list {
+ struct idr *ps;
+ struct hlist_head owners;
+ unsigned short port;
+};
+
+enum {
+ CMA_OPTION_AFONLY,
+};
+
+/*
+ * Device removal can occur at anytime, so we need extra handling to
+ * serialize notifying the user of device removal with other callbacks.
+ * We do this by disabling removal notification while a callback is in process,
+ * and reporting it after the callback completes.
+ */
+struct rdma_id_private {
+ struct rdma_cm_id id;
+
+ struct rdma_bind_list *bind_list;
+ struct hlist_node node;
+ struct list_head list; /* listen_any_list or cma_device.list */
+ struct list_head listen_list; /* per device listens */
+ struct cma_device *cma_dev;
+ struct list_head mc_list;
+
+ int internal_id;
+ enum rdma_cm_state state;
+ spinlock_t lock;
+ struct mutex qp_mutex;
+
+ struct completion comp;
+ atomic_t refcount;
+ struct mutex handler_mutex;
+
+ int backlog;
+ int timeout_ms;
+ struct ib_sa_query *query;
+ int query_id;
+ union {
+ struct ib_cm_id *ib;
+ struct iw_cm_id *iw;
+ } cm_id;
+
+ u32 seq_num;
+ u32 qkey;
+ u32 qp_num;
+ pid_t owner;
+ u32 options;
+ u8 srq;
+ u8 tos;
+ u8 reuseaddr;
+ u8 afonly;
+};
+
+struct cma_multicast {
+ struct rdma_id_private *id_priv;
+ union {
+ struct ib_sa_multicast *ib;
+ } multicast;
+ struct list_head list;
+ void *context;
+ struct sockaddr_storage addr;
+ struct kref mcref;
+};
+
+struct cma_work {
+ struct work_struct work;
+ struct rdma_id_private *id;
+ enum rdma_cm_state old_state;
+ enum rdma_cm_state new_state;
+ struct rdma_cm_event event;
+};
+
+struct cma_ndev_work {
+ struct work_struct work;
+ struct rdma_id_private *id;
+ struct rdma_cm_event event;
+};
+
+struct iboe_mcast_work {
+ struct work_struct work;
+ struct rdma_id_private *id;
+ struct cma_multicast *mc;
+};
+
+union cma_ip_addr {
+ struct in6_addr ip6;
+ struct {
+ __be32 pad[3];
+ __be32 addr;
+ } ip4;
+};
+
+struct cma_hdr {
+ u8 cma_version;
+ u8 ip_version; /* IP version: 7:4 */
+ __be16 port;
+ union cma_ip_addr src_addr;
+ union cma_ip_addr dst_addr;
+};
+
+#define CMA_VERSION 0x00
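+
+/*
+ * For IPv4/IPv6 connections the cma_hdr above is carried at the start of the
+ * CM private data (see cma_user_data_offset() and cma_save_net_info()); the
+ * IP version lives in the upper nibble of ip_version and is accessed via
+ * cma_get_ip_ver()/cma_set_ip_ver() below.
+ */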
+
+static int cma_comp(struct rdma_id_private *id_priv, enum rdma_cm_state comp)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&id_priv->lock, flags);
+ ret = (id_priv->state == comp);
+ spin_unlock_irqrestore(&id_priv->lock, flags);
+ return ret;
+}
+
+static int cma_comp_exch(struct rdma_id_private *id_priv,
+ enum rdma_cm_state comp, enum rdma_cm_state exch)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&id_priv->lock, flags);
+ if ((ret = (id_priv->state == comp)))
+ id_priv->state = exch;
+ spin_unlock_irqrestore(&id_priv->lock, flags);
+ return ret;
+}
+
+static enum rdma_cm_state cma_exch(struct rdma_id_private *id_priv,
+ enum rdma_cm_state exch)
+{
+ unsigned long flags;
+ enum rdma_cm_state old;
+
+ spin_lock_irqsave(&id_priv->lock, flags);
+ old = id_priv->state;
+ id_priv->state = exch;
+ spin_unlock_irqrestore(&id_priv->lock, flags);
+ return old;
+}
+
+static inline u8 cma_get_ip_ver(struct cma_hdr *hdr)
+{
+ return hdr->ip_version >> 4;
+}
+
+static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver)
+{
+ hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF);
+}
+
+static void cma_attach_to_dev(struct rdma_id_private *id_priv,
+ struct cma_device *cma_dev)
+{
+ atomic_inc(&cma_dev->refcount);
+ id_priv->cma_dev = cma_dev;
+ id_priv->id.device = cma_dev->device;
+ id_priv->id.route.addr.dev_addr.transport =
+ rdma_node_get_transport(cma_dev->device->node_type);
+ list_add_tail(&id_priv->list, &cma_dev->id_list);
+}
+
+static inline void cma_deref_dev(struct cma_device *cma_dev)
+{
+ if (atomic_dec_and_test(&cma_dev->refcount))
+ complete(&cma_dev->comp);
+}
+
+static inline void release_mc(struct kref *kref)
+{
+ struct cma_multicast *mc = container_of(kref, struct cma_multicast, mcref);
+
+ kfree(mc->multicast.ib);
+ kfree(mc);
+}
+
+static void cma_release_dev(struct rdma_id_private *id_priv)
+{
+ mutex_lock(&lock);
+ list_del(&id_priv->list);
+ cma_deref_dev(id_priv->cma_dev);
+ id_priv->cma_dev = NULL;
+ mutex_unlock(&lock);
+}
+
+static inline struct sockaddr *cma_src_addr(struct rdma_id_private *id_priv)
+{
+ return (struct sockaddr *) &id_priv->id.route.addr.src_addr;
+}
+
+static inline struct sockaddr *cma_dst_addr(struct rdma_id_private *id_priv)
+{
+ return (struct sockaddr *) &id_priv->id.route.addr.dst_addr;
+}
+
+static inline unsigned short cma_family(struct rdma_id_private *id_priv)
+{
+ return id_priv->id.route.addr.src_addr.ss_family;
+}
+
+static int cma_set_qkey(struct rdma_id_private *id_priv, u32 qkey)
+{
+ struct ib_sa_mcmember_rec rec;
+ int ret = 0;
+
+ if (id_priv->qkey) {
+ if (qkey && id_priv->qkey != qkey)
+ return -EINVAL;
+ return 0;
+ }
+
+ if (qkey) {
+ id_priv->qkey = qkey;
+ return 0;
+ }
+
+ switch (id_priv->id.ps) {
+ case RDMA_PS_UDP:
+ case RDMA_PS_IB:
+ id_priv->qkey = RDMA_UDP_QKEY;
+ break;
+ case RDMA_PS_IPOIB:
+ ib_addr_get_mgid(&id_priv->id.route.addr.dev_addr, &rec.mgid);
+ ret = ib_sa_get_mcmember_rec(id_priv->id.device,
+ id_priv->id.port_num, &rec.mgid,
+ &rec);
+ if (!ret)
+ id_priv->qkey = be32_to_cpu(rec.qkey);
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+
+static void cma_translate_ib(struct sockaddr_ib *sib, struct rdma_dev_addr *dev_addr)
+{
+ dev_addr->dev_type = ARPHRD_INFINIBAND;
+ rdma_addr_set_sgid(dev_addr, (union ib_gid *) &sib->sib_addr);
+ ib_addr_set_pkey(dev_addr, ntohs(sib->sib_pkey));
+}
+
+static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_addr)
+{
+ int ret;
+
+ if (addr->sa_family != AF_IB) {
+ ret = rdma_translate_ip(addr, dev_addr, NULL);
+ } else {
+ cma_translate_ib((struct sockaddr_ib *) addr, dev_addr);
+ ret = 0;
+ }
+
+ return ret;
+}
+
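+/*
+ * Bind the id to an RDMA device and port: try the device/port the listen
+ * request arrived on first, then scan every registered device.  When the
+ * IB transport runs over an Ethernet link layer, the lookup uses the GID
+ * derived from the IP address (iboe_gid); otherwise it uses the GID taken
+ * from the resolved hardware address.
+ */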
+static int cma_acquire_dev(struct rdma_id_private *id_priv,
+ struct rdma_id_private *listen_id_priv)
+{
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ struct cma_device *cma_dev;
+ union ib_gid gid, iboe_gid;
+ int ret = -ENODEV;
+ u8 port, found_port;
+ enum rdma_link_layer dev_ll = dev_addr->dev_type == ARPHRD_INFINIBAND ?
+ IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET;
+
+ if (dev_ll != IB_LINK_LAYER_INFINIBAND &&
+ id_priv->id.ps == RDMA_PS_IPOIB)
+ return -EINVAL;
+
+ mutex_lock(&lock);
+ rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
+ &iboe_gid);
+
+ memcpy(&gid, dev_addr->src_dev_addr +
+ rdma_addr_gid_offset(dev_addr), sizeof gid);
+ if (listen_id_priv &&
+ rdma_port_get_link_layer(listen_id_priv->id.device,
+ listen_id_priv->id.port_num) == dev_ll) {
+ cma_dev = listen_id_priv->cma_dev;
+ port = listen_id_priv->id.port_num;
+ if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB &&
+ rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET)
+ ret = ib_find_cached_gid(cma_dev->device, &iboe_gid,
+ &found_port, NULL);
+ else
+ ret = ib_find_cached_gid(cma_dev->device, &gid,
+ &found_port, NULL);
+
+ if (!ret && (port == found_port)) {
+ id_priv->id.port_num = found_port;
+ goto out;
+ }
+ }
+ list_for_each_entry(cma_dev, &dev_list, list) {
+ for (port = 1; port <= cma_dev->device->phys_port_cnt; ++port) {
+ if (listen_id_priv &&
+ listen_id_priv->cma_dev == cma_dev &&
+ listen_id_priv->id.port_num == port)
+ continue;
+ if (rdma_port_get_link_layer(cma_dev->device, port) == dev_ll) {
+ if (rdma_node_get_transport(cma_dev->device->node_type) == RDMA_TRANSPORT_IB &&
+ rdma_port_get_link_layer(cma_dev->device, port) == IB_LINK_LAYER_ETHERNET)
+ ret = ib_find_cached_gid(cma_dev->device, &iboe_gid, &found_port, NULL);
+ else
+ ret = ib_find_cached_gid(cma_dev->device, &gid, &found_port, NULL);
+
+ if (!ret && (port == found_port)) {
+ id_priv->id.port_num = found_port;
+ goto out;
+ }
+ }
+ }
+ }
+
+out:
+ if (!ret)
+ cma_attach_to_dev(id_priv, cma_dev);
+
+ mutex_unlock(&lock);
+ return ret;
+}
+
+/*
+ * Select the source IB device and address to reach the destination IB address.
+ */
+static int cma_resolve_ib_dev(struct rdma_id_private *id_priv)
+{
+ struct cma_device *cma_dev, *cur_dev;
+ struct sockaddr_ib *addr;
+ union ib_gid gid, sgid, *dgid;
+ u16 pkey, index;
+ u8 p;
+ int i;
+
+ cma_dev = NULL;
+ addr = (struct sockaddr_ib *) cma_dst_addr(id_priv);
+ dgid = (union ib_gid *) &addr->sib_addr;
+ pkey = ntohs(addr->sib_pkey);
+
+ list_for_each_entry(cur_dev, &dev_list, list) {
+ if (rdma_node_get_transport(cur_dev->device->node_type) != RDMA_TRANSPORT_IB)
+ continue;
+
+ for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) {
+ if (ib_find_cached_pkey(cur_dev->device, p, pkey, &index))
+ continue;
+
+ for (i = 0; !ib_get_cached_gid(cur_dev->device, p, i, &gid); i++) {
+ if (!memcmp(&gid, dgid, sizeof(gid))) {
+ cma_dev = cur_dev;
+ sgid = gid;
+ id_priv->id.port_num = p;
+ goto found;
+ }
+
+ if (!cma_dev && (gid.global.subnet_prefix ==
+ dgid->global.subnet_prefix)) {
+ cma_dev = cur_dev;
+ sgid = gid;
+ id_priv->id.port_num = p;
+ }
+ }
+ }
+ }
+
+ if (!cma_dev)
+ return -ENODEV;
+
+found:
+ cma_attach_to_dev(id_priv, cma_dev);
+ addr = (struct sockaddr_ib *) cma_src_addr(id_priv);
+ memcpy(&addr->sib_addr, &sgid, sizeof sgid);
+ cma_translate_ib(addr, &id_priv->id.route.addr.dev_addr);
+ return 0;
+}
+
+static void cma_deref_id(struct rdma_id_private *id_priv)
+{
+ if (atomic_dec_and_test(&id_priv->refcount))
+ complete(&id_priv->comp);
+}
+
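+/*
+ * Note that on success cma_disable_callback() returns with
+ * id_priv->handler_mutex held; the caller releases it once the event
+ * handler has finished.
+ */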
+static int cma_disable_callback(struct rdma_id_private *id_priv,
+ enum rdma_cm_state state)
+{
+ mutex_lock(&id_priv->handler_mutex);
+ if (id_priv->state != state) {
+ mutex_unlock(&id_priv->handler_mutex);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+struct rdma_cm_id *rdma_create_id(rdma_cm_event_handler event_handler,
+ void *context, enum rdma_port_space ps,
+ enum ib_qp_type qp_type)
+{
+ struct rdma_id_private *id_priv;
+
+ id_priv = kzalloc(sizeof *id_priv, GFP_KERNEL);
+ if (!id_priv)
+ return ERR_PTR(-ENOMEM);
+
+ id_priv->owner = task_pid_nr(current);
+ id_priv->state = RDMA_CM_IDLE;
+ id_priv->id.context = context;
+ id_priv->id.event_handler = event_handler;
+ id_priv->id.ps = ps;
+ id_priv->id.qp_type = qp_type;
+ spin_lock_init(&id_priv->lock);
+ mutex_init(&id_priv->qp_mutex);
+ init_completion(&id_priv->comp);
+ atomic_set(&id_priv->refcount, 1);
+ mutex_init(&id_priv->handler_mutex);
+ INIT_LIST_HEAD(&id_priv->listen_list);
+ INIT_LIST_HEAD(&id_priv->mc_list);
+ get_random_bytes(&id_priv->seq_num, sizeof id_priv->seq_num);
+
+ return &id_priv->id;
+}
+EXPORT_SYMBOL(rdma_create_id);
+
+static int cma_init_ud_qp(struct rdma_id_private *id_priv, struct ib_qp *qp)
+{
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+
+ qp_attr.qp_state = IB_QPS_INIT;
+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+ if (ret)
+ return ret;
+
+ ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+ if (ret)
+ return ret;
+
+ qp_attr.qp_state = IB_QPS_RTR;
+ ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
+ if (ret)
+ return ret;
+
+ qp_attr.qp_state = IB_QPS_RTS;
+ qp_attr.sq_psn = 0;
+ ret = ib_modify_qp(qp, &qp_attr, IB_QP_STATE | IB_QP_SQ_PSN);
+
+ return ret;
+}
+
+static int cma_init_conn_qp(struct rdma_id_private *id_priv, struct ib_qp *qp)
+{
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+
+ qp_attr.qp_state = IB_QPS_INIT;
+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+ if (ret)
+ return ret;
+
+ return ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+}
+
+int rdma_create_qp(struct rdma_cm_id *id, struct ib_pd *pd,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ struct rdma_id_private *id_priv;
+ struct ib_qp *qp;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (id->device != pd->device)
+ return -EINVAL;
+
+ qp = ib_create_qp(pd, qp_init_attr);
+ if (IS_ERR(qp))
+ return PTR_ERR(qp);
+
+ if (id->qp_type == IB_QPT_UD)
+ ret = cma_init_ud_qp(id_priv, qp);
+ else
+ ret = cma_init_conn_qp(id_priv, qp);
+ if (ret)
+ goto err;
+
+ id->qp = qp;
+ id_priv->qp_num = qp->qp_num;
+ id_priv->srq = (qp->srq != NULL);
+ return 0;
+err:
+ ib_destroy_qp(qp);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_create_qp);
+
+void rdma_destroy_qp(struct rdma_cm_id *id)
+{
+ struct rdma_id_private *id_priv;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ mutex_lock(&id_priv->qp_mutex);
+ ib_destroy_qp(id_priv->id.qp);
+ id_priv->id.qp = NULL;
+ mutex_unlock(&id_priv->qp_mutex);
+}
+EXPORT_SYMBOL(rdma_destroy_qp);
+
+static int cma_modify_qp_rtr(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+ union ib_gid sgid;
+
+ mutex_lock(&id_priv->qp_mutex);
+ if (!id_priv->id.qp) {
+ ret = 0;
+ goto out;
+ }
+
+ /* Need to update QP attributes from default values. */
+ qp_attr.qp_state = IB_QPS_INIT;
+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+ if (ret)
+ goto out;
+
+ ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
+ if (ret)
+ goto out;
+
+ qp_attr.qp_state = IB_QPS_RTR;
+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+ if (ret)
+ goto out;
+
+ ret = ib_query_gid(id_priv->id.device, id_priv->id.port_num,
+ qp_attr.ah_attr.grh.sgid_index, &sgid);
+ if (ret)
+ goto out;
+
+ if (rdma_node_get_transport(id_priv->cma_dev->device->node_type)
+ == RDMA_TRANSPORT_IB &&
+ rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num)
+ == IB_LINK_LAYER_ETHERNET) {
+ ret = rdma_addr_find_smac_by_sgid(&sgid, qp_attr.smac, NULL);
+
+ if (ret)
+ goto out;
+ }
+ if (conn_param)
+ qp_attr.max_dest_rd_atomic = conn_param->responder_resources;
+ ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
+out:
+ mutex_unlock(&id_priv->qp_mutex);
+ return ret;
+}
+
+static int cma_modify_qp_rts(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+
+ mutex_lock(&id_priv->qp_mutex);
+ if (!id_priv->id.qp) {
+ ret = 0;
+ goto out;
+ }
+
+ qp_attr.qp_state = IB_QPS_RTS;
+ ret = rdma_init_qp_attr(&id_priv->id, &qp_attr, &qp_attr_mask);
+ if (ret)
+ goto out;
+
+ if (conn_param)
+ qp_attr.max_rd_atomic = conn_param->initiator_depth;
+ ret = ib_modify_qp(id_priv->id.qp, &qp_attr, qp_attr_mask);
+out:
+ mutex_unlock(&id_priv->qp_mutex);
+ return ret;
+}
+
+static int cma_modify_qp_err(struct rdma_id_private *id_priv)
+{
+ struct ib_qp_attr qp_attr;
+ int ret;
+
+ mutex_lock(&id_priv->qp_mutex);
+ if (!id_priv->id.qp) {
+ ret = 0;
+ goto out;
+ }
+
+ qp_attr.qp_state = IB_QPS_ERR;
+ ret = ib_modify_qp(id_priv->id.qp, &qp_attr, IB_QP_STATE);
+out:
+ mutex_unlock(&id_priv->qp_mutex);
+ return ret;
+}
+
+static int cma_ib_init_qp_attr(struct rdma_id_private *id_priv,
+ struct ib_qp_attr *qp_attr, int *qp_attr_mask)
+{
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ int ret;
+ u16 pkey;
+
+ if (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num) ==
+ IB_LINK_LAYER_INFINIBAND)
+ pkey = ib_addr_get_pkey(dev_addr);
+ else
+ pkey = 0xffff;
+
+ ret = ib_find_cached_pkey(id_priv->id.device, id_priv->id.port_num,
+ pkey, &qp_attr->pkey_index);
+ if (ret)
+ return ret;
+
+ qp_attr->port_num = id_priv->id.port_num;
+ *qp_attr_mask = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT;
+
+ if (id_priv->id.qp_type == IB_QPT_UD) {
+ ret = cma_set_qkey(id_priv, 0);
+ if (ret)
+ return ret;
+
+ qp_attr->qkey = id_priv->qkey;
+ *qp_attr_mask |= IB_QP_QKEY;
+ } else {
+ qp_attr->qp_access_flags = 0;
+ *qp_attr_mask |= IB_QP_ACCESS_FLAGS;
+ }
+ return 0;
+}
+
+int rdma_init_qp_attr(struct rdma_cm_id *id, struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ struct rdma_id_private *id_priv;
+ int ret = 0;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ switch (rdma_node_get_transport(id_priv->id.device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ if (!id_priv->cm_id.ib || (id_priv->id.qp_type == IB_QPT_UD))
+ ret = cma_ib_init_qp_attr(id_priv, qp_attr, qp_attr_mask);
+ else
+ ret = ib_cm_init_qp_attr(id_priv->cm_id.ib, qp_attr,
+ qp_attr_mask);
+
+ if (qp_attr->qp_state == IB_QPS_RTR)
+ qp_attr->rq_psn = id_priv->seq_num;
+ break;
+ case RDMA_TRANSPORT_IWARP:
+ if (!id_priv->cm_id.iw) {
+ qp_attr->qp_access_flags = 0;
+ *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS;
+ } else
+ ret = iw_cm_init_qp_attr(id_priv->cm_id.iw, qp_attr,
+ qp_attr_mask);
+ break;
+ default:
+ ret = -ENOSYS;
+ break;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(rdma_init_qp_attr);
+
+static inline int cma_zero_addr(struct sockaddr *addr)
+{
+ switch (addr->sa_family) {
+ case AF_INET:
+ return ipv4_is_zeronet(((struct sockaddr_in *)addr)->sin_addr.s_addr);
+ case AF_INET6:
+ return ipv6_addr_any(&((struct sockaddr_in6 *) addr)->sin6_addr);
+ case AF_IB:
+ return ib_addr_any(&((struct sockaddr_ib *) addr)->sib_addr);
+ default:
+ return 0;
+ }
+}
+
+static inline int cma_loopback_addr(struct sockaddr *addr)
+{
+ switch (addr->sa_family) {
+ case AF_INET:
+ return ipv4_is_loopback(((struct sockaddr_in *) addr)->sin_addr.s_addr);
+ case AF_INET6:
+ return ipv6_addr_loopback(&((struct sockaddr_in6 *) addr)->sin6_addr);
+ case AF_IB:
+ return ib_addr_loopback(&((struct sockaddr_ib *) addr)->sib_addr);
+ default:
+ return 0;
+ }
+}
+
+static inline int cma_any_addr(struct sockaddr *addr)
+{
+ return cma_zero_addr(addr) || cma_loopback_addr(addr);
+}
+
+static int cma_addr_cmp(struct sockaddr *src, struct sockaddr *dst)
+{
+ if (src->sa_family != dst->sa_family)
+ return -1;
+
+ switch (src->sa_family) {
+ case AF_INET:
+ return ((struct sockaddr_in *) src)->sin_addr.s_addr !=
+ ((struct sockaddr_in *) dst)->sin_addr.s_addr;
+ case AF_INET6:
+ return ipv6_addr_cmp(&((struct sockaddr_in6 *) src)->sin6_addr,
+ &((struct sockaddr_in6 *) dst)->sin6_addr);
+ default:
+ return ib_addr_cmp(&((struct sockaddr_ib *) src)->sib_addr,
+ &((struct sockaddr_ib *) dst)->sib_addr);
+ }
+}
+
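+/*
+ * For AF_IB addresses the port is taken from the low 16 bits of the service
+ * ID (filtered through sib_sid_mask); IPv4/IPv6 addresses carry it in the
+ * usual sockaddr port field.
+ */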
+static __be16 cma_port(struct sockaddr *addr)
+{
+ struct sockaddr_ib *sib;
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ return ((struct sockaddr_in *) addr)->sin_port;
+ case AF_INET6:
+ return ((struct sockaddr_in6 *) addr)->sin6_port;
+ case AF_IB:
+ sib = (struct sockaddr_ib *) addr;
+ return htons((u16) (be64_to_cpu(sib->sib_sid) &
+ be64_to_cpu(sib->sib_sid_mask)));
+ default:
+ return 0;
+ }
+}
+
+static inline int cma_any_port(struct sockaddr *addr)
+{
+ return !cma_port(addr);
+}
+
+static void cma_save_ib_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
+ struct ib_sa_path_rec *path)
+{
+ struct sockaddr_ib *listen_ib, *ib;
+
+ listen_ib = (struct sockaddr_ib *) &listen_id->route.addr.src_addr;
+ ib = (struct sockaddr_ib *) &id->route.addr.src_addr;
+ ib->sib_family = listen_ib->sib_family;
+ if (path) {
+ ib->sib_pkey = path->pkey;
+ ib->sib_flowinfo = path->flow_label;
+ memcpy(&ib->sib_addr, &path->sgid, 16);
+ } else {
+ ib->sib_pkey = listen_ib->sib_pkey;
+ ib->sib_flowinfo = listen_ib->sib_flowinfo;
+ ib->sib_addr = listen_ib->sib_addr;
+ }
+ ib->sib_sid = listen_ib->sib_sid;
+ ib->sib_sid_mask = cpu_to_be64(0xffffffffffffffffULL);
+ ib->sib_scope_id = listen_ib->sib_scope_id;
+
+ if (path) {
+ ib = (struct sockaddr_ib *) &id->route.addr.dst_addr;
+ ib->sib_family = listen_ib->sib_family;
+ ib->sib_pkey = path->pkey;
+ ib->sib_flowinfo = path->flow_label;
+ memcpy(&ib->sib_addr, &path->dgid, 16);
+ }
+}
+
+static __be16 ss_get_port(const struct sockaddr_storage *ss)
+{
+ if (ss->ss_family == AF_INET)
+ return ((struct sockaddr_in *)ss)->sin_port;
+ else if (ss->ss_family == AF_INET6)
+ return ((struct sockaddr_in6 *)ss)->sin6_port;
+ BUG();
+}
+
+static void cma_save_ip4_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
+ struct cma_hdr *hdr)
+{
+ struct sockaddr_in *ip4;
+
+ ip4 = (struct sockaddr_in *) &id->route.addr.src_addr;
+ ip4->sin_family = AF_INET;
+ ip4->sin_addr.s_addr = hdr->dst_addr.ip4.addr;
+ ip4->sin_port = ss_get_port(&listen_id->route.addr.src_addr);
+
+ ip4 = (struct sockaddr_in *) &id->route.addr.dst_addr;
+ ip4->sin_family = AF_INET;
+ ip4->sin_addr.s_addr = hdr->src_addr.ip4.addr;
+ ip4->sin_port = hdr->port;
+}
+
+static void cma_save_ip6_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
+ struct cma_hdr *hdr)
+{
+ struct sockaddr_in6 *ip6;
+
+ ip6 = (struct sockaddr_in6 *) &id->route.addr.src_addr;
+ ip6->sin6_family = AF_INET6;
+ ip6->sin6_addr = hdr->dst_addr.ip6;
+ ip6->sin6_port = ss_get_port(&listen_id->route.addr.src_addr);
+
+ ip6 = (struct sockaddr_in6 *) &id->route.addr.dst_addr;
+ ip6->sin6_family = AF_INET6;
+ ip6->sin6_addr = hdr->src_addr.ip6;
+ ip6->sin6_port = hdr->port;
+}
+
+static int cma_save_net_info(struct rdma_cm_id *id, struct rdma_cm_id *listen_id,
+ struct ib_cm_event *ib_event)
+{
+ struct cma_hdr *hdr;
+
+ if (listen_id->route.addr.src_addr.ss_family == AF_IB) {
+ if (ib_event->event == IB_CM_REQ_RECEIVED)
+ cma_save_ib_info(id, listen_id, ib_event->param.req_rcvd.primary_path);
+ else if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED)
+ cma_save_ib_info(id, listen_id, NULL);
+ return 0;
+ }
+
+ hdr = ib_event->private_data;
+ if (hdr->cma_version != CMA_VERSION)
+ return -EINVAL;
+
+ switch (cma_get_ip_ver(hdr)) {
+ case 4:
+ cma_save_ip4_info(id, listen_id, hdr);
+ break;
+ case 6:
+ cma_save_ip6_info(id, listen_id, hdr);
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static inline int cma_user_data_offset(struct rdma_id_private *id_priv)
+{
+ return cma_family(id_priv) == AF_IB ? 0 : sizeof(struct cma_hdr);
+}
+
+static void cma_cancel_route(struct rdma_id_private *id_priv)
+{
+ switch (rdma_port_get_link_layer(id_priv->id.device, id_priv->id.port_num)) {
+ case IB_LINK_LAYER_INFINIBAND:
+ if (id_priv->query)
+ ib_sa_cancel_query(id_priv->query_id, id_priv->query);
+ break;
+ default:
+ break;
+ }
+}
+
+static void cma_cancel_listens(struct rdma_id_private *id_priv)
+{
+ struct rdma_id_private *dev_id_priv;
+
+ /*
+ * Remove from listen_any_list to prevent added devices from spawning
+ * additional listen requests.
+ */
+ mutex_lock(&lock);
+ list_del(&id_priv->list);
+
+ while (!list_empty(&id_priv->listen_list)) {
+ dev_id_priv = list_entry(id_priv->listen_list.next,
+ struct rdma_id_private, listen_list);
+ /* sync with device removal to avoid duplicate destruction */
+ list_del_init(&dev_id_priv->list);
+ list_del(&dev_id_priv->listen_list);
+ mutex_unlock(&lock);
+
+ rdma_destroy_id(&dev_id_priv->id);
+ mutex_lock(&lock);
+ }
+ mutex_unlock(&lock);
+}
+
+static void cma_cancel_operation(struct rdma_id_private *id_priv,
+ enum rdma_cm_state state)
+{
+ switch (state) {
+ case RDMA_CM_ADDR_QUERY:
+ rdma_addr_cancel(&id_priv->id.route.addr.dev_addr);
+ break;
+ case RDMA_CM_ROUTE_QUERY:
+ cma_cancel_route(id_priv);
+ break;
+ case RDMA_CM_LISTEN:
+ if (cma_any_addr(cma_src_addr(id_priv)) && !id_priv->cma_dev)
+ cma_cancel_listens(id_priv);
+ break;
+ default:
+ break;
+ }
+}
+
+static void cma_release_port(struct rdma_id_private *id_priv)
+{
+ struct rdma_bind_list *bind_list = id_priv->bind_list;
+
+ if (!bind_list)
+ return;
+
+ mutex_lock(&lock);
+ hlist_del(&id_priv->node);
+ if (hlist_empty(&bind_list->owners)) {
+ idr_remove(bind_list->ps, bind_list->port);
+ kfree(bind_list);
+ }
+ mutex_unlock(&lock);
+}
+
+static void cma_leave_mc_groups(struct rdma_id_private *id_priv)
+{
+ struct cma_multicast *mc;
+
+ while (!list_empty(&id_priv->mc_list)) {
+ mc = container_of(id_priv->mc_list.next,
+ struct cma_multicast, list);
+ list_del(&mc->list);
+ switch (rdma_port_get_link_layer(id_priv->cma_dev->device, id_priv->id.port_num)) {
+ case IB_LINK_LAYER_INFINIBAND:
+ ib_sa_free_multicast(mc->multicast.ib);
+ kfree(mc);
+ break;
+ case IB_LINK_LAYER_ETHERNET:
+ kref_put(&mc->mcref, release_mc);
+ break;
+ default:
+ break;
+ }
+ }
+}
+
+void rdma_destroy_id(struct rdma_cm_id *id)
+{
+ struct rdma_id_private *id_priv;
+ enum rdma_cm_state state;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ state = cma_exch(id_priv, RDMA_CM_DESTROYING);
+ cma_cancel_operation(id_priv, state);
+
+ /*
+ * Wait for any active callback to finish. New callbacks will find
+ * the id_priv state set to destroying and abort.
+ */
+ mutex_lock(&id_priv->handler_mutex);
+ mutex_unlock(&id_priv->handler_mutex);
+
+ if (id_priv->cma_dev) {
+ switch (rdma_node_get_transport(id_priv->id.device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ if (id_priv->cm_id.ib)
+ ib_destroy_cm_id(id_priv->cm_id.ib);
+ break;
+ case RDMA_TRANSPORT_IWARP:
+ if (id_priv->cm_id.iw)
+ iw_destroy_cm_id(id_priv->cm_id.iw);
+ break;
+ default:
+ break;
+ }
+ cma_leave_mc_groups(id_priv);
+ cma_release_dev(id_priv);
+ }
+
+ cma_release_port(id_priv);
+ cma_deref_id(id_priv);
+ wait_for_completion(&id_priv->comp);
+
+ if (id_priv->internal_id)
+ cma_deref_id(id_priv->id.context);
+
+ kfree(id_priv->id.route.path_rec);
+ kfree(id_priv);
+}
+EXPORT_SYMBOL(rdma_destroy_id);
+
+static int cma_rep_recv(struct rdma_id_private *id_priv)
+{
+ int ret;
+
+ ret = cma_modify_qp_rtr(id_priv, NULL);
+ if (ret)
+ goto reject;
+
+ ret = cma_modify_qp_rts(id_priv, NULL);
+ if (ret)
+ goto reject;
+
+ ret = ib_send_cm_rtu(id_priv->cm_id.ib, NULL, 0);
+ if (ret)
+ goto reject;
+
+ return 0;
+reject:
+ cma_modify_qp_err(id_priv);
+ ib_send_cm_rej(id_priv->cm_id.ib, IB_CM_REJ_CONSUMER_DEFINED,
+ NULL, 0, NULL, 0);
+ return ret;
+}
+
+static void cma_set_rep_event_data(struct rdma_cm_event *event,
+ struct ib_cm_rep_event_param *rep_data,
+ void *private_data)
+{
+ event->param.conn.private_data = private_data;
+ event->param.conn.private_data_len = IB_CM_REP_PRIVATE_DATA_SIZE;
+ event->param.conn.responder_resources = rep_data->responder_resources;
+ event->param.conn.initiator_depth = rep_data->initiator_depth;
+ event->param.conn.flow_control = rep_data->flow_control;
+ event->param.conn.rnr_retry_count = rep_data->rnr_retry_count;
+ event->param.conn.srq = rep_data->srq;
+ event->param.conn.qp_num = rep_data->remote_qpn;
+}
+
+static int cma_ib_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
+{
+ struct rdma_id_private *id_priv = cm_id->context;
+ struct rdma_cm_event event;
+ int ret = 0;
+
+ if ((ib_event->event != IB_CM_TIMEWAIT_EXIT &&
+ cma_disable_callback(id_priv, RDMA_CM_CONNECT)) ||
+ (ib_event->event == IB_CM_TIMEWAIT_EXIT &&
+ cma_disable_callback(id_priv, RDMA_CM_DISCONNECT)))
+ return 0;
+
+ memset(&event, 0, sizeof event);
+ switch (ib_event->event) {
+ case IB_CM_REQ_ERROR:
+ case IB_CM_REP_ERROR:
+ event.event = RDMA_CM_EVENT_UNREACHABLE;
+ event.status = -ETIMEDOUT;
+ break;
+ case IB_CM_REP_RECEIVED:
+ if (id_priv->id.qp) {
+ event.status = cma_rep_recv(id_priv);
+ event.event = event.status ? RDMA_CM_EVENT_CONNECT_ERROR :
+ RDMA_CM_EVENT_ESTABLISHED;
+ } else {
+ event.event = RDMA_CM_EVENT_CONNECT_RESPONSE;
+ }
+ cma_set_rep_event_data(&event, &ib_event->param.rep_rcvd,
+ ib_event->private_data);
+ break;
+ case IB_CM_RTU_RECEIVED:
+ case IB_CM_USER_ESTABLISHED:
+ event.event = RDMA_CM_EVENT_ESTABLISHED;
+ break;
+ case IB_CM_DREQ_ERROR:
+ event.status = -ETIMEDOUT; /* fall through */
+ case IB_CM_DREQ_RECEIVED:
+ case IB_CM_DREP_RECEIVED:
+ if (!cma_comp_exch(id_priv, RDMA_CM_CONNECT,
+ RDMA_CM_DISCONNECT))
+ goto out;
+ event.event = RDMA_CM_EVENT_DISCONNECTED;
+ break;
+ case IB_CM_TIMEWAIT_EXIT:
+ event.event = RDMA_CM_EVENT_TIMEWAIT_EXIT;
+ break;
+ case IB_CM_MRA_RECEIVED:
+ /* ignore event */
+ goto out;
+ case IB_CM_REJ_RECEIVED:
+ cma_modify_qp_err(id_priv);
+ event.status = ib_event->param.rej_rcvd.reason;
+ event.event = RDMA_CM_EVENT_REJECTED;
+ event.param.conn.private_data = ib_event->private_data;
+ event.param.conn.private_data_len = IB_CM_REJ_PRIVATE_DATA_SIZE;
+ break;
+ default:
+ printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n",
+ ib_event->event);
+ goto out;
+ }
+
+ ret = id_priv->id.event_handler(&id_priv->id, &event);
+ if (ret) {
+ /* Destroy the CM ID by returning a non-zero value. */
+ id_priv->cm_id.ib = NULL;
+ cma_exch(id_priv, RDMA_CM_DESTROYING);
+ mutex_unlock(&id_priv->handler_mutex);
+ rdma_destroy_id(&id_priv->id);
+ return ret;
+ }
+out:
+ mutex_unlock(&id_priv->handler_mutex);
+ return ret;
+}
+
+static struct rdma_id_private *cma_new_conn_id(struct rdma_cm_id *listen_id,
+ struct ib_cm_event *ib_event)
+{
+ struct rdma_id_private *id_priv;
+ struct rdma_cm_id *id;
+ struct rdma_route *rt;
+ int ret;
+
+ id = rdma_create_id(listen_id->event_handler, listen_id->context,
+ listen_id->ps, ib_event->param.req_rcvd.qp_type);
+ if (IS_ERR(id))
+ return NULL;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (cma_save_net_info(id, listen_id, ib_event))
+ goto err;
+
+ rt = &id->route;
+ rt->num_paths = ib_event->param.req_rcvd.alternate_path ? 2 : 1;
+ rt->path_rec = kmalloc(sizeof *rt->path_rec * rt->num_paths,
+ GFP_KERNEL);
+ if (!rt->path_rec)
+ goto err;
+
+ rt->path_rec[0] = *ib_event->param.req_rcvd.primary_path;
+ if (rt->num_paths == 2)
+ rt->path_rec[1] = *ib_event->param.req_rcvd.alternate_path;
+
+ if (cma_any_addr(cma_src_addr(id_priv))) {
+ rt->addr.dev_addr.dev_type = ARPHRD_INFINIBAND;
+ rdma_addr_set_sgid(&rt->addr.dev_addr, &rt->path_rec[0].sgid);
+ ib_addr_set_pkey(&rt->addr.dev_addr, be16_to_cpu(rt->path_rec[0].pkey));
+ } else {
+ ret = cma_translate_addr(cma_src_addr(id_priv), &rt->addr.dev_addr);
+ if (ret)
+ goto err;
+ }
+ rdma_addr_set_dgid(&rt->addr.dev_addr, &rt->path_rec[0].dgid);
+
+ id_priv->state = RDMA_CM_CONNECT;
+ return id_priv;
+
+err:
+ rdma_destroy_id(id);
+ return NULL;
+}
+
+static struct rdma_id_private *cma_new_udp_id(struct rdma_cm_id *listen_id,
+ struct ib_cm_event *ib_event)
+{
+ struct rdma_id_private *id_priv;
+ struct rdma_cm_id *id;
+ int ret;
+
+ id = rdma_create_id(listen_id->event_handler, listen_id->context,
+ listen_id->ps, IB_QPT_UD);
+ if (IS_ERR(id))
+ return NULL;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (cma_save_net_info(id, listen_id, ib_event))
+ goto err;
+
+ if (!cma_any_addr((struct sockaddr *) &id->route.addr.src_addr)) {
+ ret = cma_translate_addr(cma_src_addr(id_priv), &id->route.addr.dev_addr);
+ if (ret)
+ goto err;
+ }
+
+ id_priv->state = RDMA_CM_CONNECT;
+ return id_priv;
+err:
+ rdma_destroy_id(id);
+ return NULL;
+}
+
+static void cma_set_req_event_data(struct rdma_cm_event *event,
+ struct ib_cm_req_event_param *req_data,
+ void *private_data, int offset)
+{
+ event->param.conn.private_data = private_data + offset;
+ event->param.conn.private_data_len = IB_CM_REQ_PRIVATE_DATA_SIZE - offset;
+ event->param.conn.responder_resources = req_data->responder_resources;
+ event->param.conn.initiator_depth = req_data->initiator_depth;
+ event->param.conn.flow_control = req_data->flow_control;
+ event->param.conn.retry_count = req_data->retry_count;
+ event->param.conn.rnr_retry_count = req_data->rnr_retry_count;
+ event->param.conn.srq = req_data->srq;
+ event->param.conn.qp_num = req_data->remote_qpn;
+}
+
+static int cma_check_req_qp_type(struct rdma_cm_id *id, struct ib_cm_event *ib_event)
+{
+ return (((ib_event->event == IB_CM_REQ_RECEIVED) &&
+ (ib_event->param.req_rcvd.qp_type == id->qp_type)) ||
+ ((ib_event->event == IB_CM_SIDR_REQ_RECEIVED) &&
+ (id->qp_type == IB_QPT_UD)) ||
+ (!id->qp_type));
+}
+
+static int cma_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *ib_event)
+{
+ struct rdma_id_private *listen_id, *conn_id;
+ struct rdma_cm_event event;
+ int offset, ret;
+
+ listen_id = cm_id->context;
+ if (!cma_check_req_qp_type(&listen_id->id, ib_event))
+ return -EINVAL;
+
+ if (cma_disable_callback(listen_id, RDMA_CM_LISTEN))
+ return -ECONNABORTED;
+
+ memset(&event, 0, sizeof event);
+ offset = cma_user_data_offset(listen_id);
+ event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
+ if (ib_event->event == IB_CM_SIDR_REQ_RECEIVED) {
+ conn_id = cma_new_udp_id(&listen_id->id, ib_event);
+ event.param.ud.private_data = ib_event->private_data + offset;
+ event.param.ud.private_data_len =
+ IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE - offset;
+ } else {
+ conn_id = cma_new_conn_id(&listen_id->id, ib_event);
+ cma_set_req_event_data(&event, &ib_event->param.req_rcvd,
+ ib_event->private_data, offset);
+ }
+ if (!conn_id) {
+ ret = -ENOMEM;
+ goto err1;
+ }
+
+ mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
+ ret = cma_acquire_dev(conn_id, listen_id);
+ if (ret)
+ goto err2;
+
+ conn_id->cm_id.ib = cm_id;
+ cm_id->context = conn_id;
+ cm_id->cm_handler = cma_ib_handler;
+
+ /*
+ * Protect against the user destroying conn_id from another thread
+ * until we're done accessing it.
+ */
+ atomic_inc(&conn_id->refcount);
+ ret = conn_id->id.event_handler(&conn_id->id, &event);
+ if (ret)
+ goto err3;
+ /*
+ * Acquire mutex to prevent user executing rdma_destroy_id()
+ * while we're accessing the cm_id.
+ */
+ mutex_lock(&lock);
+ if (cma_comp(conn_id, RDMA_CM_CONNECT) &&
+ (conn_id->id.qp_type != IB_QPT_UD))
+ ib_send_cm_mra(cm_id, CMA_CM_MRA_SETTING, NULL, 0);
+ mutex_unlock(&lock);
+ mutex_unlock(&conn_id->handler_mutex);
+ mutex_unlock(&listen_id->handler_mutex);
+ cma_deref_id(conn_id);
+ return 0;
+
+err3:
+ cma_deref_id(conn_id);
+ /* Destroy the CM ID by returning a non-zero value. */
+ conn_id->cm_id.ib = NULL;
+err2:
+ cma_exch(conn_id, RDMA_CM_DESTROYING);
+ mutex_unlock(&conn_id->handler_mutex);
+err1:
+ mutex_unlock(&listen_id->handler_mutex);
+ if (conn_id)
+ rdma_destroy_id(&conn_id->id);
+ return ret;
+}
+
+__be64 rdma_get_service_id(struct rdma_cm_id *id, struct sockaddr *addr)
+{
+ if (addr->sa_family == AF_IB)
+ return ((struct sockaddr_ib *) addr)->sib_sid;
+
+ return cpu_to_be64(((u64)id->ps << 16) + be16_to_cpu(cma_port(addr)));
+}
+EXPORT_SYMBOL(rdma_get_service_id);
+
+static void cma_set_compare_data(enum rdma_port_space ps, struct sockaddr *addr,
+ struct ib_cm_compare_data *compare)
+{
+ struct cma_hdr *cma_data, *cma_mask;
+ __be32 ip4_addr;
+ struct in6_addr ip6_addr;
+
+ memset(compare, 0, sizeof *compare);
+ cma_data = (void *) compare->data;
+ cma_mask = (void *) compare->mask;
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ ip4_addr = ((struct sockaddr_in *) addr)->sin_addr.s_addr;
+ cma_set_ip_ver(cma_data, 4);
+ cma_set_ip_ver(cma_mask, 0xF);
+ if (!cma_any_addr(addr)) {
+ cma_data->dst_addr.ip4.addr = ip4_addr;
+ cma_mask->dst_addr.ip4.addr = htonl(~0);
+ }
+ break;
+ case AF_INET6:
+ ip6_addr = ((struct sockaddr_in6 *) addr)->sin6_addr;
+ cma_set_ip_ver(cma_data, 6);
+ cma_set_ip_ver(cma_mask, 0xF);
+ if (!cma_any_addr(addr)) {
+ cma_data->dst_addr.ip6 = ip6_addr;
+ memset(&cma_mask->dst_addr.ip6, 0xFF,
+ sizeof cma_mask->dst_addr.ip6);
+ }
+ break;
+ default:
+ break;
+ }
+}
+
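+/*
+ * iWARP CM event handler for connected ids: translate iw_cm events
+ * (close, connect reply, established) into rdma_cm events.
+ */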
+static int cma_iw_handler(struct iw_cm_id *iw_id, struct iw_cm_event *iw_event)
+{
+ struct rdma_id_private *id_priv = iw_id->context;
+ struct rdma_cm_event event;
+ int ret = 0;
+ struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr;
+ struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr;
+
+ if (cma_disable_callback(id_priv, RDMA_CM_CONNECT))
+ return 0;
+
+ memset(&event, 0, sizeof event);
+ switch (iw_event->event) {
+ case IW_CM_EVENT_CLOSE:
+ event.event = RDMA_CM_EVENT_DISCONNECTED;
+ break;
+ case IW_CM_EVENT_CONNECT_REPLY:
+ memcpy(cma_src_addr(id_priv), laddr,
+ rdma_addr_size(laddr));
+ memcpy(cma_dst_addr(id_priv), raddr,
+ rdma_addr_size(raddr));
+ switch (iw_event->status) {
+ case 0:
+ event.event = RDMA_CM_EVENT_ESTABLISHED;
+ event.param.conn.initiator_depth = iw_event->ird;
+ event.param.conn.responder_resources = iw_event->ord;
+ break;
+ case -ECONNRESET:
+ case -ECONNREFUSED:
+ event.event = RDMA_CM_EVENT_REJECTED;
+ break;
+ case -ETIMEDOUT:
+ event.event = RDMA_CM_EVENT_UNREACHABLE;
+ break;
+ default:
+ event.event = RDMA_CM_EVENT_CONNECT_ERROR;
+ break;
+ }
+ break;
+ case IW_CM_EVENT_ESTABLISHED:
+ event.event = RDMA_CM_EVENT_ESTABLISHED;
+ event.param.conn.initiator_depth = iw_event->ird;
+ event.param.conn.responder_resources = iw_event->ord;
+ break;
+ default:
+ BUG_ON(1);
+ }
+
+ event.status = iw_event->status;
+ event.param.conn.private_data = iw_event->private_data;
+ event.param.conn.private_data_len = iw_event->private_data_len;
+ ret = id_priv->id.event_handler(&id_priv->id, &event);
+ if (ret) {
+ /* Destroy the CM ID by returning a non-zero value. */
+ id_priv->cm_id.iw = NULL;
+ cma_exch(id_priv, RDMA_CM_DESTROYING);
+ mutex_unlock(&id_priv->handler_mutex);
+ rdma_destroy_id(&id_priv->id);
+ return ret;
+ }
+
+ mutex_unlock(&id_priv->handler_mutex);
+ return ret;
+}
+
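+/*
+ * Handle an incoming iWARP connection request on a listening id by creating
+ * a child id, resolving its device and reporting a connect request event.
+ */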
+static int iw_conn_req_handler(struct iw_cm_id *cm_id,
+ struct iw_cm_event *iw_event)
+{
+ struct rdma_cm_id *new_cm_id;
+ struct rdma_id_private *listen_id, *conn_id;
+ struct rdma_cm_event event;
+ int ret;
+ struct ib_device_attr attr;
+ struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr;
+ struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr;
+
+ listen_id = cm_id->context;
+ if (cma_disable_callback(listen_id, RDMA_CM_LISTEN))
+ return -ECONNABORTED;
+
+ /* Create a new RDMA id for the new IW CM ID */
+ new_cm_id = rdma_create_id(listen_id->id.event_handler,
+ listen_id->id.context,
+ RDMA_PS_TCP, IB_QPT_RC);
+ if (IS_ERR(new_cm_id)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ conn_id = container_of(new_cm_id, struct rdma_id_private, id);
+ mutex_lock_nested(&conn_id->handler_mutex, SINGLE_DEPTH_NESTING);
+ conn_id->state = RDMA_CM_CONNECT;
+
+ ret = rdma_translate_ip(laddr, &conn_id->id.route.addr.dev_addr, NULL);
+ if (ret) {
+ mutex_unlock(&conn_id->handler_mutex);
+ rdma_destroy_id(new_cm_id);
+ goto out;
+ }
+
+ ret = cma_acquire_dev(conn_id, listen_id);
+ if (ret) {
+ mutex_unlock(&conn_id->handler_mutex);
+ rdma_destroy_id(new_cm_id);
+ goto out;
+ }
+
+ conn_id->cm_id.iw = cm_id;
+ cm_id->context = conn_id;
+ cm_id->cm_handler = cma_iw_handler;
+
+ memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr));
+ memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr));
+
+ ret = ib_query_device(conn_id->id.device, &attr);
+ if (ret) {
+ mutex_unlock(&conn_id->handler_mutex);
+ rdma_destroy_id(new_cm_id);
+ goto out;
+ }
+
+ memset(&event, 0, sizeof event);
+ event.event = RDMA_CM_EVENT_CONNECT_REQUEST;
+ event.param.conn.private_data = iw_event->private_data;
+ event.param.conn.private_data_len = iw_event->private_data_len;
+ event.param.conn.initiator_depth = iw_event->ird;
+ event.param.conn.responder_resources = iw_event->ord;
+
+ /*
+ * Protect against the user destroying conn_id from another thread
+ * until we're done accessing it.
+ */
+ atomic_inc(&conn_id->refcount);
+ ret = conn_id->id.event_handler(&conn_id->id, &event);
+ if (ret) {
+ /* User wants to destroy the CM ID */
+ conn_id->cm_id.iw = NULL;
+ cma_exch(conn_id, RDMA_CM_DESTROYING);
+ mutex_unlock(&conn_id->handler_mutex);
+ cma_deref_id(conn_id);
+ rdma_destroy_id(&conn_id->id);
+ goto out;
+ }
+
+ mutex_unlock(&conn_id->handler_mutex);
+ cma_deref_id(conn_id);
+
+out:
+ mutex_unlock(&listen_id->handler_mutex);
+ return ret;
+}
+
+static int cma_ib_listen(struct rdma_id_private *id_priv)
+{
+ struct ib_cm_compare_data compare_data;
+ struct sockaddr *addr;
+ struct ib_cm_id *id;
+ __be64 svc_id;
+ int ret;
+
+ id = ib_create_cm_id(id_priv->id.device, cma_req_handler, id_priv);
+ if (IS_ERR(id))
+ return PTR_ERR(id);
+
+ id_priv->cm_id.ib = id;
+
+ addr = cma_src_addr(id_priv);
+ svc_id = rdma_get_service_id(&id_priv->id, addr);
+ if (cma_any_addr(addr) && !id_priv->afonly) {
+ ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, NULL);
+ } else {
+ cma_set_compare_data(id_priv->id.ps, addr, &compare_data);
+ ret = ib_cm_listen(id_priv->cm_id.ib, svc_id, 0, &compare_data);
+ }
+
+ if (ret) {
+ ib_destroy_cm_id(id_priv->cm_id.ib);
+ id_priv->cm_id.ib = NULL;
+ }
+
+ return ret;
+}
+
+static int cma_iw_listen(struct rdma_id_private *id_priv, int backlog)
+{
+ int ret;
+ struct iw_cm_id *id;
+
+ id = iw_create_cm_id(id_priv->id.device,
+ iw_conn_req_handler,
+ id_priv);
+ if (IS_ERR(id))
+ return PTR_ERR(id);
+
+ id_priv->cm_id.iw = id;
+
+ memcpy(&id_priv->cm_id.iw->local_addr, cma_src_addr(id_priv),
+ rdma_addr_size(cma_src_addr(id_priv)));
+
+ ret = iw_cm_listen(id_priv->cm_id.iw, backlog);
+
+ if (ret) {
+ iw_destroy_cm_id(id_priv->cm_id.iw);
+ id_priv->cm_id.iw = NULL;
+ }
+
+ return ret;
+}
+
+static int cma_listen_handler(struct rdma_cm_id *id,
+ struct rdma_cm_event *event)
+{
+ struct rdma_id_private *id_priv = id->context;
+
+ id->context = id_priv->id.context;
+ id->event_handler = id_priv->id.event_handler;
+ return id_priv->id.event_handler(id, event);
+}
+
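+/*
+ * Create an internal, per-device listener that mirrors a wildcard listen
+ * and forwards events through cma_listen_handler.
+ */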
+static void cma_listen_on_dev(struct rdma_id_private *id_priv,
+ struct cma_device *cma_dev)
+{
+ struct rdma_id_private *dev_id_priv;
+ struct rdma_cm_id *id;
+ int ret;
+
+ if (cma_family(id_priv) == AF_IB &&
+ rdma_node_get_transport(cma_dev->device->node_type) != RDMA_TRANSPORT_IB)
+ return;
+
+ id = rdma_create_id(cma_listen_handler, id_priv, id_priv->id.ps,
+ id_priv->id.qp_type);
+ if (IS_ERR(id))
+ return;
+
+ dev_id_priv = container_of(id, struct rdma_id_private, id);
+
+ dev_id_priv->state = RDMA_CM_ADDR_BOUND;
+ memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv),
+ rdma_addr_size(cma_src_addr(id_priv)));
+
+ cma_attach_to_dev(dev_id_priv, cma_dev);
+ list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list);
+ atomic_inc(&id_priv->refcount);
+ dev_id_priv->internal_id = 1;
+ dev_id_priv->afonly = id_priv->afonly;
+
+ ret = rdma_listen(id, id_priv->backlog);
+ if (ret)
+ printk(KERN_WARNING "RDMA CMA: cma_listen_on_dev, error %d, listening on device %s\n",
+ ret, cma_dev->device->name);
+}
+
+static void cma_listen_on_all(struct rdma_id_private *id_priv)
+{
+ struct cma_device *cma_dev;
+
+ mutex_lock(&lock);
+ list_add_tail(&id_priv->list, &listen_any_list);
+ list_for_each_entry(cma_dev, &dev_list, list)
+ cma_listen_on_dev(id_priv, cma_dev);
+ mutex_unlock(&lock);
+}
+
+void rdma_set_service_type(struct rdma_cm_id *id, int tos)
+{
+ struct rdma_id_private *id_priv;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ id_priv->tos = (u8) tos;
+}
+EXPORT_SYMBOL(rdma_set_service_type);
+
+static void cma_query_handler(int status, struct ib_sa_path_rec *path_rec,
+ void *context)
+{
+ struct cma_work *work = context;
+ struct rdma_route *route;
+
+ route = &work->id->id.route;
+
+ if (!status) {
+ route->num_paths = 1;
+ *route->path_rec = *path_rec;
+ } else {
+ work->old_state = RDMA_CM_ROUTE_QUERY;
+ work->new_state = RDMA_CM_ADDR_RESOLVED;
+ work->event.event = RDMA_CM_EVENT_ROUTE_ERROR;
+ work->event.status = status;
+ }
+
+ queue_work(cma_wq, &work->work);
+}
+
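+/*
+ * Issue an SA path record query built from the resolved device address;
+ * completion is reported through cma_query_handler.
+ */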
+static int cma_query_ib_route(struct rdma_id_private *id_priv, int timeout_ms,
+ struct cma_work *work)
+{
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ struct ib_sa_path_rec path_rec;
+ ib_sa_comp_mask comp_mask;
+ struct sockaddr_in6 *sin6;
+ struct sockaddr_ib *sib;
+
+ memset(&path_rec, 0, sizeof path_rec);
+ rdma_addr_get_sgid(dev_addr, &path_rec.sgid);
+ rdma_addr_get_dgid(dev_addr, &path_rec.dgid);
+ path_rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
+ path_rec.numb_path = 1;
+ path_rec.reversible = 1;
+ path_rec.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv));
+
+ comp_mask = IB_SA_PATH_REC_DGID | IB_SA_PATH_REC_SGID |
+ IB_SA_PATH_REC_PKEY | IB_SA_PATH_REC_NUMB_PATH |
+ IB_SA_PATH_REC_REVERSIBLE | IB_SA_PATH_REC_SERVICE_ID;
+
+ switch (cma_family(id_priv)) {
+ case AF_INET:
+ path_rec.qos_class = cpu_to_be16((u16) id_priv->tos);
+ comp_mask |= IB_SA_PATH_REC_QOS_CLASS;
+ break;
+ case AF_INET6:
+ sin6 = (struct sockaddr_in6 *) cma_src_addr(id_priv);
+ path_rec.traffic_class = (u8) (be32_to_cpu(sin6->sin6_flowinfo) >> 20);
+ comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS;
+ break;
+ case AF_IB:
+ sib = (struct sockaddr_ib *) cma_src_addr(id_priv);
+ path_rec.traffic_class = (u8) (be32_to_cpu(sib->sib_flowinfo) >> 20);
+ comp_mask |= IB_SA_PATH_REC_TRAFFIC_CLASS;
+ break;
+ }
+
+ id_priv->query_id = ib_sa_path_rec_get(&sa_client, id_priv->id.device,
+ id_priv->id.port_num, &path_rec,
+ comp_mask, timeout_ms,
+ GFP_KERNEL, cma_query_handler,
+ work, &id_priv->query);
+
+ return (id_priv->query_id < 0) ? id_priv->query_id : 0;
+}
+
+static void cma_work_handler(struct work_struct *_work)
+{
+ struct cma_work *work = container_of(_work, struct cma_work, work);
+ struct rdma_id_private *id_priv = work->id;
+ int destroy = 0;
+
+ mutex_lock(&id_priv->handler_mutex);
+ if (!cma_comp_exch(id_priv, work->old_state, work->new_state))
+ goto out;
+
+ if (id_priv->id.event_handler(&id_priv->id, &work->event)) {
+ cma_exch(id_priv, RDMA_CM_DESTROYING);
+ destroy = 1;
+ }
+out:
+ mutex_unlock(&id_priv->handler_mutex);
+ cma_deref_id(id_priv);
+ if (destroy)
+ rdma_destroy_id(&id_priv->id);
+ kfree(work);
+}
+
+static void cma_ndev_work_handler(struct work_struct *_work)
+{
+ struct cma_ndev_work *work = container_of(_work, struct cma_ndev_work, work);
+ struct rdma_id_private *id_priv = work->id;
+ int destroy = 0;
+
+ mutex_lock(&id_priv->handler_mutex);
+ if (id_priv->state == RDMA_CM_DESTROYING ||
+ id_priv->state == RDMA_CM_DEVICE_REMOVAL)
+ goto out;
+
+ if (id_priv->id.event_handler(&id_priv->id, &work->event)) {
+ cma_exch(id_priv, RDMA_CM_DESTROYING);
+ destroy = 1;
+ }
+
+out:
+ mutex_unlock(&id_priv->handler_mutex);
+ cma_deref_id(id_priv);
+ if (destroy)
+ rdma_destroy_id(&id_priv->id);
+ kfree(work);
+}
+
+static int cma_resolve_ib_route(struct rdma_id_private *id_priv, int timeout_ms)
+{
+ struct rdma_route *route = &id_priv->id.route;
+ struct cma_work *work;
+ int ret;
+
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ work->id = id_priv;
+ INIT_WORK(&work->work, cma_work_handler);
+ work->old_state = RDMA_CM_ROUTE_QUERY;
+ work->new_state = RDMA_CM_ROUTE_RESOLVED;
+ work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+
+ route->path_rec = kmalloc(sizeof *route->path_rec, GFP_KERNEL);
+ if (!route->path_rec) {
+ ret = -ENOMEM;
+ goto err1;
+ }
+
+ ret = cma_query_ib_route(id_priv, timeout_ms, work);
+ if (ret)
+ goto err2;
+
+ return 0;
+err2:
+ kfree(route->path_rec);
+ route->path_rec = NULL;
+err1:
+ kfree(work);
+ return ret;
+}
+
+int rdma_set_ib_paths(struct rdma_cm_id *id,
+ struct ib_sa_path_rec *path_rec, int num_paths)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED,
+ RDMA_CM_ROUTE_RESOLVED))
+ return -EINVAL;
+
+ id->route.path_rec = kmemdup(path_rec, sizeof *path_rec * num_paths,
+ GFP_KERNEL);
+ if (!id->route.path_rec) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ id->route.num_paths = num_paths;
+ return 0;
+err:
+ cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_ADDR_RESOLVED);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_set_ib_paths);
+
+static int cma_resolve_iw_route(struct rdma_id_private *id_priv, int timeout_ms)
+{
+ struct cma_work *work;
+
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ work->id = id_priv;
+ INIT_WORK(&work->work, cma_work_handler);
+ work->old_state = RDMA_CM_ROUTE_QUERY;
+ work->new_state = RDMA_CM_ROUTE_RESOLVED;
+ work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+ queue_work(cma_wq, &work->work);
+ return 0;
+}
+
+static int iboe_tos_to_sl(struct net_device *ndev, int tos)
+{
+ int prio;
+ struct net_device *dev;
+
+ prio = rt_tos2priority(tos);
+ dev = ndev->priv_flags & IFF_802_1Q_VLAN ?
+ vlan_dev_real_dev(ndev) : ndev;
+
+ if (dev->num_tc)
+ return netdev_get_prio_tc_map(dev, prio);
+
+#if IS_ENABLED(CONFIG_VLAN_8021Q)
+ if (ndev->priv_flags & IFF_802_1Q_VLAN)
+ return (vlan_dev_get_egress_qos_mask(ndev, prio) &
+ VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT;
+#endif
+ return 0;
+}
+
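+/*
+ * For RoCE (IBoE) ports the path record is constructed locally from the
+ * bound net_device (MAC, VLAN, MTU, rate) instead of querying the SA.
+ */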
+static int cma_resolve_iboe_route(struct rdma_id_private *id_priv)
+{
+ struct rdma_route *route = &id_priv->id.route;
+ struct rdma_addr *addr = &route->addr;
+ struct cma_work *work;
+ int ret;
+ struct net_device *ndev = NULL;
+
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ work->id = id_priv;
+ INIT_WORK(&work->work, cma_work_handler);
+
+ route->path_rec = kzalloc(sizeof *route->path_rec, GFP_KERNEL);
+ if (!route->path_rec) {
+ ret = -ENOMEM;
+ goto err1;
+ }
+
+ route->num_paths = 1;
+
+ if (addr->dev_addr.bound_dev_if)
+ ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if);
+ if (!ndev) {
+ ret = -ENODEV;
+ goto err2;
+ }
+
+ route->path_rec->vlan_id = rdma_vlan_dev_vlan_id(ndev);
+ memcpy(route->path_rec->dmac, addr->dev_addr.dst_dev_addr, ETH_ALEN);
+ memcpy(route->path_rec->smac, ndev->dev_addr, ndev->addr_len);
+
+ rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
+ &route->path_rec->sgid);
+ rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr,
+ &route->path_rec->dgid);
+
+ route->path_rec->hop_limit = 1;
+ route->path_rec->reversible = 1;
+ route->path_rec->pkey = cpu_to_be16(0xffff);
+ route->path_rec->mtu_selector = IB_SA_EQ;
+ route->path_rec->sl = iboe_tos_to_sl(ndev, id_priv->tos);
+ route->path_rec->mtu = iboe_get_mtu(ndev->mtu);
+ route->path_rec->rate_selector = IB_SA_EQ;
+ route->path_rec->rate = iboe_get_rate(ndev);
+ dev_put(ndev);
+ route->path_rec->packet_life_time_selector = IB_SA_EQ;
+ route->path_rec->packet_life_time = CMA_IBOE_PACKET_LIFETIME;
+ if (!route->path_rec->mtu) {
+ ret = -EINVAL;
+ goto err2;
+ }
+
+ work->old_state = RDMA_CM_ROUTE_QUERY;
+ work->new_state = RDMA_CM_ROUTE_RESOLVED;
+ work->event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+ work->event.status = 0;
+
+ queue_work(cma_wq, &work->work);
+
+ return 0;
+
+err2:
+ kfree(route->path_rec);
+ route->path_rec = NULL;
+err1:
+ kfree(work);
+ return ret;
+}
+
+int rdma_resolve_route(struct rdma_cm_id *id, int timeout_ms)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED, RDMA_CM_ROUTE_QUERY))
+ return -EINVAL;
+
+ atomic_inc(&id_priv->refcount);
+ switch (rdma_node_get_transport(id->device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ switch (rdma_port_get_link_layer(id->device, id->port_num)) {
+ case IB_LINK_LAYER_INFINIBAND:
+ ret = cma_resolve_ib_route(id_priv, timeout_ms);
+ break;
+ case IB_LINK_LAYER_ETHERNET:
+ ret = cma_resolve_iboe_route(id_priv);
+ break;
+ default:
+ ret = -ENOSYS;
+ }
+ break;
+ case RDMA_TRANSPORT_IWARP:
+ ret = cma_resolve_iw_route(id_priv, timeout_ms);
+ break;
+ default:
+ ret = -ENOSYS;
+ break;
+ }
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ cma_comp_exch(id_priv, RDMA_CM_ROUTE_QUERY, RDMA_CM_ADDR_RESOLVED);
+ cma_deref_id(id_priv);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_resolve_route);
+
+static void cma_set_loopback(struct sockaddr *addr)
+{
+ switch (addr->sa_family) {
+ case AF_INET:
+ ((struct sockaddr_in *) addr)->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ break;
+ case AF_INET6:
+ ipv6_addr_set(&((struct sockaddr_in6 *) addr)->sin6_addr,
+ 0, 0, 0, htonl(1));
+ break;
+ default:
+ ib_addr_set(&((struct sockaddr_ib *) addr)->sib_addr,
+ 0, 0, 0, htonl(1));
+ break;
+ }
+}
+
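+/*
+ * Bind to the first active port of a suitable RDMA device and assign a
+ * loopback source address; used when the destination is the wildcard
+ * (loopback) address.
+ */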
+static int cma_bind_loopback(struct rdma_id_private *id_priv)
+{
+ struct cma_device *cma_dev, *cur_dev;
+ struct ib_port_attr port_attr;
+ union ib_gid gid;
+ u16 pkey;
+ int ret;
+ u8 p;
+
+ cma_dev = NULL;
+ mutex_lock(&lock);
+ list_for_each_entry(cur_dev, &dev_list, list) {
+ if (cma_family(id_priv) == AF_IB &&
+ rdma_node_get_transport(cur_dev->device->node_type) != RDMA_TRANSPORT_IB)
+ continue;
+
+ if (!cma_dev)
+ cma_dev = cur_dev;
+
+ for (p = 1; p <= cur_dev->device->phys_port_cnt; ++p) {
+ if (!ib_query_port(cur_dev->device, p, &port_attr) &&
+ port_attr.state == IB_PORT_ACTIVE) {
+ cma_dev = cur_dev;
+ goto port_found;
+ }
+ }
+ }
+
+ if (!cma_dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ p = 1;
+
+port_found:
+ ret = ib_get_cached_gid(cma_dev->device, p, 0, &gid);
+ if (ret)
+ goto out;
+
+ ret = ib_get_cached_pkey(cma_dev->device, p, 0, &pkey);
+ if (ret)
+ goto out;
+
+ id_priv->id.route.addr.dev_addr.dev_type =
+ (rdma_port_get_link_layer(cma_dev->device, p) == IB_LINK_LAYER_INFINIBAND) ?
+ ARPHRD_INFINIBAND : ARPHRD_ETHER;
+
+ rdma_addr_set_sgid(&id_priv->id.route.addr.dev_addr, &gid);
+ ib_addr_set_pkey(&id_priv->id.route.addr.dev_addr, pkey);
+ id_priv->id.port_num = p;
+ cma_attach_to_dev(id_priv, cma_dev);
+ cma_set_loopback(cma_src_addr(id_priv));
+out:
+ mutex_unlock(&lock);
+ return ret;
+}
+
+static void addr_handler(int status, struct sockaddr *src_addr,
+ struct rdma_dev_addr *dev_addr, void *context)
+{
+ struct rdma_id_private *id_priv = context;
+ struct rdma_cm_event event;
+
+ memset(&event, 0, sizeof event);
+ mutex_lock(&id_priv->handler_mutex);
+ if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY,
+ RDMA_CM_ADDR_RESOLVED))
+ goto out;
+
+ memcpy(cma_src_addr(id_priv), src_addr, rdma_addr_size(src_addr));
+ if (!status && !id_priv->cma_dev)
+ status = cma_acquire_dev(id_priv, NULL);
+
+ if (status) {
+ if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_RESOLVED,
+ RDMA_CM_ADDR_BOUND))
+ goto out;
+ event.event = RDMA_CM_EVENT_ADDR_ERROR;
+ event.status = status;
+ } else {
+ event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
+ }
+
+ if (id_priv->id.event_handler(&id_priv->id, &event)) {
+ cma_exch(id_priv, RDMA_CM_DESTROYING);
+ mutex_unlock(&id_priv->handler_mutex);
+ cma_deref_id(id_priv);
+ rdma_destroy_id(&id_priv->id);
+ return;
+ }
+out:
+ mutex_unlock(&id_priv->handler_mutex);
+ cma_deref_id(id_priv);
+}
+
+static int cma_resolve_loopback(struct rdma_id_private *id_priv)
+{
+ struct cma_work *work;
+ union ib_gid gid;
+ int ret;
+
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ if (!id_priv->cma_dev) {
+ ret = cma_bind_loopback(id_priv);
+ if (ret)
+ goto err;
+ }
+
+ rdma_addr_get_sgid(&id_priv->id.route.addr.dev_addr, &gid);
+ rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, &gid);
+
+ work->id = id_priv;
+ INIT_WORK(&work->work, cma_work_handler);
+ work->old_state = RDMA_CM_ADDR_QUERY;
+ work->new_state = RDMA_CM_ADDR_RESOLVED;
+ work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
+ queue_work(cma_wq, &work->work);
+ return 0;
+err:
+ kfree(work);
+ return ret;
+}
+
+static int cma_resolve_ib_addr(struct rdma_id_private *id_priv)
+{
+ struct cma_work *work;
+ int ret;
+
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ if (!id_priv->cma_dev) {
+ ret = cma_resolve_ib_dev(id_priv);
+ if (ret)
+ goto err;
+ }
+
+ rdma_addr_set_dgid(&id_priv->id.route.addr.dev_addr, (union ib_gid *)
+ &(((struct sockaddr_ib *) &id_priv->id.route.addr.dst_addr)->sib_addr));
+
+ work->id = id_priv;
+ INIT_WORK(&work->work, cma_work_handler);
+ work->old_state = RDMA_CM_ADDR_QUERY;
+ work->new_state = RDMA_CM_ADDR_RESOLVED;
+ work->event.event = RDMA_CM_EVENT_ADDR_RESOLVED;
+ queue_work(cma_wq, &work->work);
+ return 0;
+err:
+ kfree(work);
+ return ret;
+}
+
+static int cma_bind_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
+ struct sockaddr *dst_addr)
+{
+ if (!src_addr || !src_addr->sa_family) {
+ src_addr = (struct sockaddr *) &id->route.addr.src_addr;
+ src_addr->sa_family = dst_addr->sa_family;
+ if (dst_addr->sa_family == AF_INET6) {
+ ((struct sockaddr_in6 *) src_addr)->sin6_scope_id =
+ ((struct sockaddr_in6 *) dst_addr)->sin6_scope_id;
+ } else if (dst_addr->sa_family == AF_IB) {
+ ((struct sockaddr_ib *) src_addr)->sib_pkey =
+ ((struct sockaddr_ib *) dst_addr)->sib_pkey;
+ }
+ }
+ return rdma_bind_addr(id, src_addr);
+}
+
+int rdma_resolve_addr(struct rdma_cm_id *id, struct sockaddr *src_addr,
+ struct sockaddr *dst_addr, int timeout_ms)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (id_priv->state == RDMA_CM_IDLE) {
+ ret = cma_bind_addr(id, src_addr, dst_addr);
+ if (ret)
+ return ret;
+ }
+
+ if (cma_family(id_priv) != dst_addr->sa_family)
+ return -EINVAL;
+
+ if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_ADDR_QUERY))
+ return -EINVAL;
+
+ atomic_inc(&id_priv->refcount);
+ memcpy(cma_dst_addr(id_priv), dst_addr, rdma_addr_size(dst_addr));
+ if (cma_any_addr(dst_addr)) {
+ ret = cma_resolve_loopback(id_priv);
+ } else {
+ if (dst_addr->sa_family == AF_IB) {
+ ret = cma_resolve_ib_addr(id_priv);
+ } else {
+ ret = rdma_resolve_ip(&addr_client, cma_src_addr(id_priv),
+ dst_addr, &id->route.addr.dev_addr,
+ timeout_ms, addr_handler, id_priv);
+ }
+ }
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ cma_comp_exch(id_priv, RDMA_CM_ADDR_QUERY, RDMA_CM_ADDR_BOUND);
+ cma_deref_id(id_priv);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_resolve_addr);
+
+int rdma_set_reuseaddr(struct rdma_cm_id *id, int reuse)
+{
+ struct rdma_id_private *id_priv;
+ unsigned long flags;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ spin_lock_irqsave(&id_priv->lock, flags);
+ if (reuse || id_priv->state == RDMA_CM_IDLE) {
+ id_priv->reuseaddr = reuse;
+ ret = 0;
+ } else {
+ ret = -EINVAL;
+ }
+ spin_unlock_irqrestore(&id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_set_reuseaddr);
+
+int rdma_set_afonly(struct rdma_cm_id *id, int afonly)
+{
+ struct rdma_id_private *id_priv;
+ unsigned long flags;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ spin_lock_irqsave(&id_priv->lock, flags);
+ if (id_priv->state == RDMA_CM_IDLE || id_priv->state == RDMA_CM_ADDR_BOUND) {
+ id_priv->options |= (1 << CMA_OPTION_AFONLY);
+ id_priv->afonly = afonly;
+ ret = 0;
+ } else {
+ ret = -EINVAL;
+ }
+ spin_unlock_irqrestore(&id_priv->lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_set_afonly);
+
+static void cma_bind_port(struct rdma_bind_list *bind_list,
+ struct rdma_id_private *id_priv)
+{
+ struct sockaddr *addr;
+ struct sockaddr_ib *sib;
+ u64 sid, mask;
+ __be16 port;
+
+ addr = cma_src_addr(id_priv);
+ port = htons(bind_list->port);
+
+ switch (addr->sa_family) {
+ case AF_INET:
+ ((struct sockaddr_in *) addr)->sin_port = port;
+ break;
+ case AF_INET6:
+ ((struct sockaddr_in6 *) addr)->sin6_port = port;
+ break;
+ case AF_IB:
+ sib = (struct sockaddr_ib *) addr;
+ sid = be64_to_cpu(sib->sib_sid);
+ mask = be64_to_cpu(sib->sib_sid_mask);
+ sib->sib_sid = cpu_to_be64((sid & mask) | (u64) ntohs(port));
+ sib->sib_sid_mask = cpu_to_be64(~0ULL);
+ break;
+ }
+ id_priv->bind_list = bind_list;
+ hlist_add_head(&id_priv->node, &bind_list->owners);
+}
+
+static int cma_alloc_port(struct idr *ps, struct rdma_id_private *id_priv,
+ unsigned short snum)
+{
+ struct rdma_bind_list *bind_list;
+ int ret;
+
+ bind_list = kzalloc(sizeof *bind_list, GFP_KERNEL);
+ if (!bind_list)
+ return -ENOMEM;
+
+ ret = idr_alloc(ps, bind_list, snum, snum + 1, GFP_KERNEL);
+ if (ret < 0)
+ goto err;
+
+ bind_list->ps = ps;
+ bind_list->port = (unsigned short)ret;
+ cma_bind_port(bind_list, id_priv);
+ return 0;
+err:
+ kfree(bind_list);
+ return ret == -ENOSPC ? -EADDRNOTAVAIL : ret;
+}
+
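+/*
+ * Pick a random port from the local ephemeral range, skipping the most
+ * recently used port and any port already present in the idr.
+ */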
+static int cma_alloc_any_port(struct idr *ps, struct rdma_id_private *id_priv)
+{
+ static unsigned int last_used_port;
+ int low, high, remaining;
+ unsigned int rover;
+
+ inet_get_local_port_range(&init_net, &low, &high);
+ remaining = (high - low) + 1;
+ rover = prandom_u32() % remaining + low;
+retry:
+ if (last_used_port != rover &&
+ !idr_find(ps, (unsigned short) rover)) {
+ int ret = cma_alloc_port(ps, id_priv, rover);
+ /*
+ * Remember previously used port number in order to avoid
+ * re-using same port immediately after it is closed.
+ */
+ if (!ret)
+ last_used_port = rover;
+ if (ret != -EADDRNOTAVAIL)
+ return ret;
+ }
+ if (--remaining) {
+ rover++;
+ if ((rover < low) || (rover > high))
+ rover = low;
+ goto retry;
+ }
+ return -EADDRNOTAVAIL;
+}
+
+/*
+ * Check that the requested port is available. This is called when trying to
+ * bind to a specific port, or when trying to listen on a bound port. In
+ * the latter case, the provided id_priv may already be on the bind_list, but
+ * we still need to check that it's okay to start listening.
+ */
+static int cma_check_port(struct rdma_bind_list *bind_list,
+ struct rdma_id_private *id_priv, uint8_t reuseaddr)
+{
+ struct rdma_id_private *cur_id;
+ struct sockaddr *addr, *cur_addr;
+
+ addr = cma_src_addr(id_priv);
+ hlist_for_each_entry(cur_id, &bind_list->owners, node) {
+ if (id_priv == cur_id)
+ continue;
+
+ if ((cur_id->state != RDMA_CM_LISTEN) && reuseaddr &&
+ cur_id->reuseaddr)
+ continue;
+
+ cur_addr = cma_src_addr(cur_id);
+ if (id_priv->afonly && cur_id->afonly &&
+ (addr->sa_family != cur_addr->sa_family))
+ continue;
+
+ if (cma_any_addr(addr) || cma_any_addr(cur_addr))
+ return -EADDRNOTAVAIL;
+
+ if (!cma_addr_cmp(addr, cur_addr))
+ return -EADDRINUSE;
+ }
+ return 0;
+}
+
+static int cma_use_port(struct idr *ps, struct rdma_id_private *id_priv)
+{
+ struct rdma_bind_list *bind_list;
+ unsigned short snum;
+ int ret;
+
+ snum = ntohs(cma_port(cma_src_addr(id_priv)));
+ if (snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
+ return -EACCES;
+
+ bind_list = idr_find(ps, snum);
+ if (!bind_list) {
+ ret = cma_alloc_port(ps, id_priv, snum);
+ } else {
+ ret = cma_check_port(bind_list, id_priv, id_priv->reuseaddr);
+ if (!ret)
+ cma_bind_port(bind_list, id_priv);
+ }
+ return ret;
+}
+
+static int cma_bind_listen(struct rdma_id_private *id_priv)
+{
+ struct rdma_bind_list *bind_list = id_priv->bind_list;
+ int ret = 0;
+
+ mutex_lock(&lock);
+ if (bind_list->owners.first->next)
+ ret = cma_check_port(bind_list, id_priv, 0);
+ mutex_unlock(&lock);
+ return ret;
+}
+
+static struct idr *cma_select_inet_ps(struct rdma_id_private *id_priv)
+{
+ switch (id_priv->id.ps) {
+ case RDMA_PS_TCP:
+ return &tcp_ps;
+ case RDMA_PS_UDP:
+ return &udp_ps;
+ case RDMA_PS_IPOIB:
+ return &ipoib_ps;
+ case RDMA_PS_IB:
+ return &ib_ps;
+ default:
+ return NULL;
+ }
+}
+
+static struct idr *cma_select_ib_ps(struct rdma_id_private *id_priv)
+{
+ struct idr *ps = NULL;
+ struct sockaddr_ib *sib;
+ u64 sid_ps, mask, sid;
+
+ sib = (struct sockaddr_ib *) cma_src_addr(id_priv);
+ mask = be64_to_cpu(sib->sib_sid_mask) & RDMA_IB_IP_PS_MASK;
+ sid = be64_to_cpu(sib->sib_sid) & mask;
+
+ if ((id_priv->id.ps == RDMA_PS_IB) && (sid == (RDMA_IB_IP_PS_IB & mask))) {
+ sid_ps = RDMA_IB_IP_PS_IB;
+ ps = &ib_ps;
+ } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_TCP)) &&
+ (sid == (RDMA_IB_IP_PS_TCP & mask))) {
+ sid_ps = RDMA_IB_IP_PS_TCP;
+ ps = &tcp_ps;
+ } else if (((id_priv->id.ps == RDMA_PS_IB) || (id_priv->id.ps == RDMA_PS_UDP)) &&
+ (sid == (RDMA_IB_IP_PS_UDP & mask))) {
+ sid_ps = RDMA_IB_IP_PS_UDP;
+ ps = &udp_ps;
+ }
+
+ if (ps) {
+ sib->sib_sid = cpu_to_be64(sid_ps | ntohs(cma_port((struct sockaddr *) sib)));
+ sib->sib_sid_mask = cpu_to_be64(RDMA_IB_IP_PS_MASK |
+ be64_to_cpu(sib->sib_sid_mask));
+ }
+ return ps;
+}
+
+static int cma_get_port(struct rdma_id_private *id_priv)
+{
+ struct idr *ps;
+ int ret;
+
+ if (cma_family(id_priv) != AF_IB)
+ ps = cma_select_inet_ps(id_priv);
+ else
+ ps = cma_select_ib_ps(id_priv);
+ if (!ps)
+ return -EPROTONOSUPPORT;
+
+ mutex_lock(&lock);
+ if (cma_any_port(cma_src_addr(id_priv)))
+ ret = cma_alloc_any_port(ps, id_priv);
+ else
+ ret = cma_use_port(ps, id_priv);
+ mutex_unlock(&lock);
+
+ return ret;
+}
+
+static int cma_check_linklocal(struct rdma_dev_addr *dev_addr,
+ struct sockaddr *addr)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ struct sockaddr_in6 *sin6;
+
+ if (addr->sa_family != AF_INET6)
+ return 0;
+
+ sin6 = (struct sockaddr_in6 *) addr;
+
+ if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL))
+ return 0;
+
+ if (!sin6->sin6_scope_id)
+ return -EINVAL;
+
+ dev_addr->bound_dev_if = sin6->sin6_scope_id;
+#endif
+ return 0;
+}
+
+int rdma_listen(struct rdma_cm_id *id, int backlog)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (id_priv->state == RDMA_CM_IDLE) {
+ id->route.addr.src_addr.ss_family = AF_INET;
+ ret = rdma_bind_addr(id, cma_src_addr(id_priv));
+ if (ret)
+ return ret;
+ }
+
+ if (!cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_LISTEN))
+ return -EINVAL;
+
+ if (id_priv->reuseaddr) {
+ ret = cma_bind_listen(id_priv);
+ if (ret)
+ goto err;
+ }
+
+ id_priv->backlog = backlog;
+ if (id->device) {
+ switch (rdma_node_get_transport(id->device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ ret = cma_ib_listen(id_priv);
+ if (ret)
+ goto err;
+ break;
+ case RDMA_TRANSPORT_IWARP:
+ ret = cma_iw_listen(id_priv, backlog);
+ if (ret)
+ goto err;
+ break;
+ default:
+ ret = -ENOSYS;
+ goto err;
+ }
+ } else {
+ cma_listen_on_all(id_priv);
+ }
+
+ return 0;
+err:
+ id_priv->backlog = 0;
+ cma_comp_exch(id_priv, RDMA_CM_LISTEN, RDMA_CM_ADDR_BOUND);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_listen);
+
+int rdma_bind_addr(struct rdma_cm_id *id, struct sockaddr *addr)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6 &&
+ addr->sa_family != AF_IB)
+ return -EAFNOSUPPORT;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_comp_exch(id_priv, RDMA_CM_IDLE, RDMA_CM_ADDR_BOUND))
+ return -EINVAL;
+
+ ret = cma_check_linklocal(&id->route.addr.dev_addr, addr);
+ if (ret)
+ goto err1;
+
+ memcpy(cma_src_addr(id_priv), addr, rdma_addr_size(addr));
+ if (!cma_any_addr(addr)) {
+ ret = cma_translate_addr(addr, &id->route.addr.dev_addr);
+ if (ret)
+ goto err1;
+
+ ret = cma_acquire_dev(id_priv, NULL);
+ if (ret)
+ goto err1;
+ }
+
+ if (!(id_priv->options & (1 << CMA_OPTION_AFONLY))) {
+ if (addr->sa_family == AF_INET)
+ id_priv->afonly = 1;
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (addr->sa_family == AF_INET6)
+ id_priv->afonly = init_net.ipv6.sysctl.bindv6only;
+#endif
+ }
+ ret = cma_get_port(id_priv);
+ if (ret)
+ goto err2;
+
+ return 0;
+err2:
+ if (id_priv->cma_dev)
+ cma_release_dev(id_priv);
+err1:
+ cma_comp_exch(id_priv, RDMA_CM_ADDR_BOUND, RDMA_CM_IDLE);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_bind_addr);
+
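+/*
+ * Write the CMA private data header (version, IP version, addresses and
+ * port) used to convey IP addressing information over the IB CM.
+ */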
+static int cma_format_hdr(void *hdr, struct rdma_id_private *id_priv)
+{
+ struct cma_hdr *cma_hdr;
+
+ cma_hdr = hdr;
+ cma_hdr->cma_version = CMA_VERSION;
+ if (cma_family(id_priv) == AF_INET) {
+ struct sockaddr_in *src4, *dst4;
+
+ src4 = (struct sockaddr_in *) cma_src_addr(id_priv);
+ dst4 = (struct sockaddr_in *) cma_dst_addr(id_priv);
+
+ cma_set_ip_ver(cma_hdr, 4);
+ cma_hdr->src_addr.ip4.addr = src4->sin_addr.s_addr;
+ cma_hdr->dst_addr.ip4.addr = dst4->sin_addr.s_addr;
+ cma_hdr->port = src4->sin_port;
+ } else if (cma_family(id_priv) == AF_INET6) {
+ struct sockaddr_in6 *src6, *dst6;
+
+ src6 = (struct sockaddr_in6 *) cma_src_addr(id_priv);
+ dst6 = (struct sockaddr_in6 *) cma_dst_addr(id_priv);
+
+ cma_set_ip_ver(cma_hdr, 6);
+ cma_hdr->src_addr.ip6 = src6->sin6_addr;
+ cma_hdr->dst_addr.ip6 = dst6->sin6_addr;
+ cma_hdr->port = src6->sin6_port;
+ }
+ return 0;
+}
+
+static int cma_sidr_rep_handler(struct ib_cm_id *cm_id,
+ struct ib_cm_event *ib_event)
+{
+ struct rdma_id_private *id_priv = cm_id->context;
+ struct rdma_cm_event event;
+ struct ib_cm_sidr_rep_event_param *rep = &ib_event->param.sidr_rep_rcvd;
+ int ret = 0;
+
+ if (cma_disable_callback(id_priv, RDMA_CM_CONNECT))
+ return 0;
+
+ memset(&event, 0, sizeof event);
+ switch (ib_event->event) {
+ case IB_CM_SIDR_REQ_ERROR:
+ event.event = RDMA_CM_EVENT_UNREACHABLE;
+ event.status = -ETIMEDOUT;
+ break;
+ case IB_CM_SIDR_REP_RECEIVED:
+ event.param.ud.private_data = ib_event->private_data;
+ event.param.ud.private_data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE;
+ if (rep->status != IB_SIDR_SUCCESS) {
+ event.event = RDMA_CM_EVENT_UNREACHABLE;
+ event.status = ib_event->param.sidr_rep_rcvd.status;
+ break;
+ }
+ ret = cma_set_qkey(id_priv, rep->qkey);
+ if (ret) {
+ event.event = RDMA_CM_EVENT_ADDR_ERROR;
+ event.status = ret;
+ break;
+ }
+ ib_init_ah_from_path(id_priv->id.device, id_priv->id.port_num,
+ id_priv->id.route.path_rec,
+ &event.param.ud.ah_attr);
+ event.param.ud.qp_num = rep->qpn;
+ event.param.ud.qkey = rep->qkey;
+ event.event = RDMA_CM_EVENT_ESTABLISHED;
+ event.status = 0;
+ break;
+ default:
+ printk(KERN_ERR "RDMA CMA: unexpected IB CM event: %d\n",
+ ib_event->event);
+ goto out;
+ }
+
+ ret = id_priv->id.event_handler(&id_priv->id, &event);
+ if (ret) {
+ /* Destroy the CM ID by returning a non-zero value. */
+ id_priv->cm_id.ib = NULL;
+ cma_exch(id_priv, RDMA_CM_DESTROYING);
+ mutex_unlock(&id_priv->handler_mutex);
+ rdma_destroy_id(&id_priv->id);
+ return ret;
+ }
+out:
+ mutex_unlock(&id_priv->handler_mutex);
+ return ret;
+}
+
+static int cma_resolve_ib_udp(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct ib_cm_sidr_req_param req;
+ struct ib_cm_id *id;
+ void *private_data;
+ int offset, ret;
+
+ memset(&req, 0, sizeof req);
+ offset = cma_user_data_offset(id_priv);
+ req.private_data_len = offset + conn_param->private_data_len;
+ if (req.private_data_len < conn_param->private_data_len)
+ return -EINVAL;
+
+ if (req.private_data_len) {
+ private_data = kzalloc(req.private_data_len, GFP_ATOMIC);
+ if (!private_data)
+ return -ENOMEM;
+ } else {
+ private_data = NULL;
+ }
+
+ if (conn_param->private_data && conn_param->private_data_len)
+ memcpy(private_data + offset, conn_param->private_data,
+ conn_param->private_data_len);
+
+ if (private_data) {
+ ret = cma_format_hdr(private_data, id_priv);
+ if (ret)
+ goto out;
+ req.private_data = private_data;
+ }
+
+ id = ib_create_cm_id(id_priv->id.device, cma_sidr_rep_handler,
+ id_priv);
+ if (IS_ERR(id)) {
+ ret = PTR_ERR(id);
+ goto out;
+ }
+ id_priv->cm_id.ib = id;
+
+ req.path = id_priv->id.route.path_rec;
+ req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv));
+ req.timeout_ms = 1 << (CMA_CM_RESPONSE_TIMEOUT - 8);
+ req.max_cm_retries = CMA_MAX_CM_RETRIES;
+
+ ret = ib_send_cm_sidr_req(id_priv->cm_id.ib, &req);
+ if (ret) {
+ ib_destroy_cm_id(id_priv->cm_id.ib);
+ id_priv->cm_id.ib = NULL;
+ }
+out:
+ kfree(private_data);
+ return ret;
+}
+
+static int cma_connect_ib(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct ib_cm_req_param req;
+ struct rdma_route *route;
+ void *private_data;
+ struct ib_cm_id *id;
+ int offset, ret;
+
+ memset(&req, 0, sizeof req);
+ offset = cma_user_data_offset(id_priv);
+ req.private_data_len = offset + conn_param->private_data_len;
+ if (req.private_data_len < conn_param->private_data_len)
+ return -EINVAL;
+
+ if (req.private_data_len) {
+ private_data = kzalloc(req.private_data_len, GFP_ATOMIC);
+ if (!private_data)
+ return -ENOMEM;
+ } else {
+ private_data = NULL;
+ }
+
+ if (conn_param->private_data && conn_param->private_data_len)
+ memcpy(private_data + offset, conn_param->private_data,
+ conn_param->private_data_len);
+
+ id = ib_create_cm_id(id_priv->id.device, cma_ib_handler, id_priv);
+ if (IS_ERR(id)) {
+ ret = PTR_ERR(id);
+ goto out;
+ }
+ id_priv->cm_id.ib = id;
+
+ route = &id_priv->id.route;
+ if (private_data) {
+ ret = cma_format_hdr(private_data, id_priv);
+ if (ret)
+ goto out;
+ req.private_data = private_data;
+ }
+
+ req.primary_path = &route->path_rec[0];
+ if (route->num_paths == 2)
+ req.alternate_path = &route->path_rec[1];
+
+ req.service_id = rdma_get_service_id(&id_priv->id, cma_dst_addr(id_priv));
+ req.qp_num = id_priv->qp_num;
+ req.qp_type = id_priv->id.qp_type;
+ req.starting_psn = id_priv->seq_num;
+ req.responder_resources = conn_param->responder_resources;
+ req.initiator_depth = conn_param->initiator_depth;
+ req.flow_control = conn_param->flow_control;
+ req.retry_count = min_t(u8, 7, conn_param->retry_count);
+ req.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count);
+ req.remote_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT;
+ req.local_cm_response_timeout = CMA_CM_RESPONSE_TIMEOUT;
+ req.max_cm_retries = CMA_MAX_CM_RETRIES;
+ req.srq = id_priv->srq ? 1 : 0;
+
+ ret = ib_send_cm_req(id_priv->cm_id.ib, &req);
+out:
+ if (ret && !IS_ERR(id)) {
+ ib_destroy_cm_id(id);
+ id_priv->cm_id.ib = NULL;
+ }
+
+ kfree(private_data);
+ return ret;
+}
+
+static int cma_connect_iw(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct iw_cm_id *cm_id;
+ int ret;
+ struct iw_cm_conn_param iw_param;
+
+ cm_id = iw_create_cm_id(id_priv->id.device, cma_iw_handler, id_priv);
+ if (IS_ERR(cm_id))
+ return PTR_ERR(cm_id);
+
+ id_priv->cm_id.iw = cm_id;
+
+ memcpy(&cm_id->local_addr, cma_src_addr(id_priv),
+ rdma_addr_size(cma_src_addr(id_priv)));
+ memcpy(&cm_id->remote_addr, cma_dst_addr(id_priv),
+ rdma_addr_size(cma_dst_addr(id_priv)));
+
+ ret = cma_modify_qp_rtr(id_priv, conn_param);
+ if (ret)
+ goto out;
+
+ if (conn_param) {
+ iw_param.ord = conn_param->initiator_depth;
+ iw_param.ird = conn_param->responder_resources;
+ iw_param.private_data = conn_param->private_data;
+ iw_param.private_data_len = conn_param->private_data_len;
+ iw_param.qpn = id_priv->id.qp ? id_priv->qp_num : conn_param->qp_num;
+ } else {
+ memset(&iw_param, 0, sizeof iw_param);
+ iw_param.qpn = id_priv->qp_num;
+ }
+ ret = iw_cm_connect(cm_id, &iw_param);
+out:
+ if (ret) {
+ iw_destroy_cm_id(cm_id);
+ id_priv->cm_id.iw = NULL;
+ }
+ return ret;
+}
+
+int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_comp_exch(id_priv, RDMA_CM_ROUTE_RESOLVED, RDMA_CM_CONNECT))
+ return -EINVAL;
+
+ if (!id->qp) {
+ id_priv->qp_num = conn_param->qp_num;
+ id_priv->srq = conn_param->srq;
+ }
+
+ switch (rdma_node_get_transport(id->device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ if (id->qp_type == IB_QPT_UD)
+ ret = cma_resolve_ib_udp(id_priv, conn_param);
+ else
+ ret = cma_connect_ib(id_priv, conn_param);
+ break;
+ case RDMA_TRANSPORT_IWARP:
+ ret = cma_connect_iw(id_priv, conn_param);
+ break;
+ default:
+ ret = -ENOSYS;
+ break;
+ }
+ if (ret)
+ goto err;
+
+ return 0;
+err:
+ cma_comp_exch(id_priv, RDMA_CM_CONNECT, RDMA_CM_ROUTE_RESOLVED);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_connect);
+
+static int cma_accept_ib(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct ib_cm_rep_param rep;
+ int ret;
+
+ ret = cma_modify_qp_rtr(id_priv, conn_param);
+ if (ret)
+ goto out;
+
+ ret = cma_modify_qp_rts(id_priv, conn_param);
+ if (ret)
+ goto out;
+
+ memset(&rep, 0, sizeof rep);
+ rep.qp_num = id_priv->qp_num;
+ rep.starting_psn = id_priv->seq_num;
+ rep.private_data = conn_param->private_data;
+ rep.private_data_len = conn_param->private_data_len;
+ rep.responder_resources = conn_param->responder_resources;
+ rep.initiator_depth = conn_param->initiator_depth;
+ rep.failover_accepted = 0;
+ rep.flow_control = conn_param->flow_control;
+ rep.rnr_retry_count = min_t(u8, 7, conn_param->rnr_retry_count);
+ rep.srq = id_priv->srq ? 1 : 0;
+
+ ret = ib_send_cm_rep(id_priv->cm_id.ib, &rep);
+out:
+ return ret;
+}
+
+static int cma_accept_iw(struct rdma_id_private *id_priv,
+ struct rdma_conn_param *conn_param)
+{
+ struct iw_cm_conn_param iw_param;
+ int ret;
+
+ ret = cma_modify_qp_rtr(id_priv, conn_param);
+ if (ret)
+ return ret;
+
+ iw_param.ord = conn_param->initiator_depth;
+ iw_param.ird = conn_param->responder_resources;
+ iw_param.private_data = conn_param->private_data;
+ iw_param.private_data_len = conn_param->private_data_len;
+ if (id_priv->id.qp)
+ iw_param.qpn = id_priv->qp_num;
+ else
+ iw_param.qpn = conn_param->qp_num;
+
+ return iw_cm_accept(id_priv->cm_id.iw, &iw_param);
+}
+
+static int cma_send_sidr_rep(struct rdma_id_private *id_priv,
+ enum ib_cm_sidr_status status, u32 qkey,
+ const void *private_data, int private_data_len)
+{
+ struct ib_cm_sidr_rep_param rep;
+ int ret;
+
+ memset(&rep, 0, sizeof rep);
+ rep.status = status;
+ if (status == IB_SIDR_SUCCESS) {
+ ret = cma_set_qkey(id_priv, qkey);
+ if (ret)
+ return ret;
+ rep.qp_num = id_priv->qp_num;
+ rep.qkey = id_priv->qkey;
+ }
+ rep.private_data = private_data;
+ rep.private_data_len = private_data_len;
+
+ return ib_send_cm_sidr_rep(id_priv->cm_id.ib, &rep);
+}
+
+int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+
+ id_priv->owner = task_pid_nr(current);
+
+ if (!cma_comp(id_priv, RDMA_CM_CONNECT))
+ return -EINVAL;
+
+ if (!id->qp && conn_param) {
+ id_priv->qp_num = conn_param->qp_num;
+ id_priv->srq = conn_param->srq;
+ }
+
+ switch (rdma_node_get_transport(id->device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ if (id->qp_type == IB_QPT_UD) {
+ if (conn_param)
+ ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
+ conn_param->qkey,
+ conn_param->private_data,
+ conn_param->private_data_len);
+ else
+ ret = cma_send_sidr_rep(id_priv, IB_SIDR_SUCCESS,
+ 0, NULL, 0);
+ } else {
+ if (conn_param)
+ ret = cma_accept_ib(id_priv, conn_param);
+ else
+ ret = cma_rep_recv(id_priv);
+ }
+ break;
+ case RDMA_TRANSPORT_IWARP:
+ ret = cma_accept_iw(id_priv, conn_param);
+ break;
+ default:
+ ret = -ENOSYS;
+ break;
+ }
+
+ if (ret)
+ goto reject;
+
+ return 0;
+reject:
+ cma_modify_qp_err(id_priv);
+ rdma_reject(id, NULL, 0);
+ return ret;
+}
+EXPORT_SYMBOL(rdma_accept);
+
+int rdma_notify(struct rdma_cm_id *id, enum ib_event_type event)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!id_priv->cm_id.ib)
+ return -EINVAL;
+
+ switch (id->device->node_type) {
+ case RDMA_NODE_IB_CA:
+ ret = ib_cm_notify(id_priv->cm_id.ib, event);
+ break;
+ default:
+ ret = 0;
+ break;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rdma_notify);
+
+int rdma_reject(struct rdma_cm_id *id, const void *private_data,
+ u8 private_data_len)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!id_priv->cm_id.ib)
+ return -EINVAL;
+
+ switch (rdma_node_get_transport(id->device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ if (id->qp_type == IB_QPT_UD)
+ ret = cma_send_sidr_rep(id_priv, IB_SIDR_REJECT, 0,
+ private_data, private_data_len);
+ else
+ ret = ib_send_cm_rej(id_priv->cm_id.ib,
+ IB_CM_REJ_CONSUMER_DEFINED, NULL,
+ 0, private_data, private_data_len);
+ break;
+ case RDMA_TRANSPORT_IWARP:
+ ret = iw_cm_reject(id_priv->cm_id.iw,
+ private_data, private_data_len);
+ break;
+ default:
+ ret = -ENOSYS;
+ break;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rdma_reject);
+
+int rdma_disconnect(struct rdma_cm_id *id)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!id_priv->cm_id.ib)
+ return -EINVAL;
+
+ switch (rdma_node_get_transport(id->device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ ret = cma_modify_qp_err(id_priv);
+ if (ret)
+ goto out;
+ /* Initiate or respond to a disconnect. */
+ if (ib_send_cm_dreq(id_priv->cm_id.ib, NULL, 0))
+ ib_send_cm_drep(id_priv->cm_id.ib, NULL, 0);
+ break;
+ case RDMA_TRANSPORT_IWARP:
+ ret = iw_cm_disconnect(id_priv->cm_id.iw, 0);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+out:
+ return ret;
+}
+EXPORT_SYMBOL(rdma_disconnect);
+
+static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast)
+{
+ struct rdma_id_private *id_priv;
+ struct cma_multicast *mc = multicast->context;
+ struct rdma_cm_event event;
+ int ret;
+
+ id_priv = mc->id_priv;
+ if (cma_disable_callback(id_priv, RDMA_CM_ADDR_BOUND) &&
+ cma_disable_callback(id_priv, RDMA_CM_ADDR_RESOLVED))
+ return 0;
+
+ if (!status)
+ status = cma_set_qkey(id_priv, be32_to_cpu(multicast->rec.qkey));
+ mutex_lock(&id_priv->qp_mutex);
+ if (!status && id_priv->id.qp)
+ status = ib_attach_mcast(id_priv->id.qp, &multicast->rec.mgid,
+ be16_to_cpu(multicast->rec.mlid));
+ mutex_unlock(&id_priv->qp_mutex);
+
+ memset(&event, 0, sizeof event);
+ event.status = status;
+ event.param.ud.private_data = mc->context;
+ if (!status) {
+ event.event = RDMA_CM_EVENT_MULTICAST_JOIN;
+ ib_init_ah_from_mcmember(id_priv->id.device,
+ id_priv->id.port_num, &multicast->rec,
+ &event.param.ud.ah_attr);
+ event.param.ud.qp_num = 0xFFFFFF;
+ event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey);
+ } else {
+ event.event = RDMA_CM_EVENT_MULTICAST_ERROR;
+ }
+
+ ret = id_priv->id.event_handler(&id_priv->id, &event);
+ if (ret) {
+ cma_exch(id_priv, RDMA_CM_DESTROYING);
+ mutex_unlock(&id_priv->handler_mutex);
+ rdma_destroy_id(&id_priv->id);
+ return 0;
+ }
+
+ mutex_unlock(&id_priv->handler_mutex);
+ return 0;
+}
+
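+/*
+ * Derive the multicast GID from the requested address: zero for a wildcard,
+ * the address itself for SA-assigned IPv6 MGIDs and AF_IB, otherwise the
+ * standard IP-to-MGID mapping (with the RDMA CM signature for RDMA_PS_UDP).
+ */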
+static void cma_set_mgid(struct rdma_id_private *id_priv,
+ struct sockaddr *addr, union ib_gid *mgid)
+{
+ unsigned char mc_map[MAX_ADDR_LEN];
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ struct sockaddr_in *sin = (struct sockaddr_in *) addr;
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) addr;
+
+ if (cma_any_addr(addr)) {
+ memset(mgid, 0, sizeof *mgid);
+ } else if ((addr->sa_family == AF_INET6) &&
+ ((be32_to_cpu(sin6->sin6_addr.s6_addr32[0]) & 0xFFF0FFFF) ==
+ 0xFF10A01B)) {
+ /* IPv6 address is an SA assigned MGID. */
+ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid);
+ } else if (addr->sa_family == AF_IB) {
+ memcpy(mgid, &((struct sockaddr_ib *) addr)->sib_addr, sizeof *mgid);
+ } else if ((addr->sa_family == AF_INET6)) {
+ ipv6_ib_mc_map(&sin6->sin6_addr, dev_addr->broadcast, mc_map);
+ if (id_priv->id.ps == RDMA_PS_UDP)
+ mc_map[7] = 0x01; /* Use RDMA CM signature */
+ *mgid = *(union ib_gid *) (mc_map + 4);
+ } else {
+ ip_ib_mc_map(sin->sin_addr.s_addr, dev_addr->broadcast, mc_map);
+ if (id_priv->id.ps == RDMA_PS_UDP)
+ mc_map[7] = 0x01; /* Use RDMA CM signature */
+ *mgid = *(union ib_gid *) (mc_map + 4);
+ }
+}
+
+static int cma_join_ib_multicast(struct rdma_id_private *id_priv,
+ struct cma_multicast *mc)
+{
+ struct ib_sa_mcmember_rec rec;
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ ib_sa_comp_mask comp_mask;
+ int ret;
+
+ ib_addr_get_mgid(dev_addr, &rec.mgid);
+ ret = ib_sa_get_mcmember_rec(id_priv->id.device, id_priv->id.port_num,
+ &rec.mgid, &rec);
+ if (ret)
+ return ret;
+
+ ret = cma_set_qkey(id_priv, 0);
+ if (ret)
+ return ret;
+
+ cma_set_mgid(id_priv, (struct sockaddr *) &mc->addr, &rec.mgid);
+ rec.qkey = cpu_to_be32(id_priv->qkey);
+ rdma_addr_get_sgid(dev_addr, &rec.port_gid);
+ rec.pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
+ rec.join_state = 1;
+
+ comp_mask = IB_SA_MCMEMBER_REC_MGID | IB_SA_MCMEMBER_REC_PORT_GID |
+ IB_SA_MCMEMBER_REC_PKEY | IB_SA_MCMEMBER_REC_JOIN_STATE |
+ IB_SA_MCMEMBER_REC_QKEY | IB_SA_MCMEMBER_REC_SL |
+ IB_SA_MCMEMBER_REC_FLOW_LABEL |
+ IB_SA_MCMEMBER_REC_TRAFFIC_CLASS;
+
+ if (id_priv->id.ps == RDMA_PS_IPOIB)
+ comp_mask |= IB_SA_MCMEMBER_REC_RATE |
+ IB_SA_MCMEMBER_REC_RATE_SELECTOR |
+ IB_SA_MCMEMBER_REC_MTU_SELECTOR |
+ IB_SA_MCMEMBER_REC_MTU |
+ IB_SA_MCMEMBER_REC_HOP_LIMIT;
+
+ mc->multicast.ib = ib_sa_join_multicast(&sa_client, id_priv->id.device,
+ id_priv->id.port_num, &rec,
+ comp_mask, GFP_KERNEL,
+ cma_ib_mc_handler, mc);
+ return PTR_ERR_OR_ZERO(mc->multicast.ib);
+}
+
+static void iboe_mcast_work_handler(struct work_struct *work)
+{
+ struct iboe_mcast_work *mw = container_of(work, struct iboe_mcast_work, work);
+ struct cma_multicast *mc = mw->mc;
+ struct ib_sa_multicast *m = mc->multicast.ib;
+
+ mc->multicast.ib->context = mc;
+ cma_ib_mc_handler(0, m);
+ kref_put(&mc->mcref, release_mc);
+ kfree(mw);
+}
+
+static void cma_iboe_set_mgid(struct sockaddr *addr, union ib_gid *mgid)
+{
+ struct sockaddr_in *sin = (struct sockaddr_in *)addr;
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)addr;
+
+ if (cma_any_addr(addr)) {
+ memset(mgid, 0, sizeof *mgid);
+ } else if (addr->sa_family == AF_INET6) {
+ memcpy(mgid, &sin6->sin6_addr, sizeof *mgid);
+ } else {
+ mgid->raw[0] = 0xff;
+ mgid->raw[1] = 0x0e;
+ mgid->raw[2] = 0;
+ mgid->raw[3] = 0;
+ mgid->raw[4] = 0;
+ mgid->raw[5] = 0;
+ mgid->raw[6] = 0;
+ mgid->raw[7] = 0;
+ mgid->raw[8] = 0;
+ mgid->raw[9] = 0;
+ mgid->raw[10] = 0xff;
+ mgid->raw[11] = 0xff;
+ *(__be32 *)(&mgid->raw[12]) = sin->sin_addr.s_addr;
+ }
+}
+
+static int cma_iboe_join_multicast(struct rdma_id_private *id_priv,
+ struct cma_multicast *mc)
+{
+ struct iboe_mcast_work *work;
+ struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr;
+ int err;
+ struct sockaddr *addr = (struct sockaddr *)&mc->addr;
+ struct net_device *ndev = NULL;
+
+ if (cma_zero_addr((struct sockaddr *)&mc->addr))
+ return -EINVAL;
+
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ mc->multicast.ib = kzalloc(sizeof(struct ib_sa_multicast), GFP_KERNEL);
+ if (!mc->multicast.ib) {
+ err = -ENOMEM;
+ goto out1;
+ }
+
+ cma_iboe_set_mgid(addr, &mc->multicast.ib->rec.mgid);
+
+ mc->multicast.ib->rec.pkey = cpu_to_be16(0xffff);
+ if (id_priv->id.ps == RDMA_PS_UDP)
+ mc->multicast.ib->rec.qkey = cpu_to_be32(RDMA_UDP_QKEY);
+
+ if (dev_addr->bound_dev_if)
+ ndev = dev_get_by_index(&init_net, dev_addr->bound_dev_if);
+ if (!ndev) {
+ err = -ENODEV;
+ goto out2;
+ }
+ mc->multicast.ib->rec.rate = iboe_get_rate(ndev);
+ mc->multicast.ib->rec.hop_limit = 1;
+ mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu);
+ dev_put(ndev);
+ if (!mc->multicast.ib->rec.mtu) {
+ err = -EINVAL;
+ goto out2;
+ }
+ rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr,
+ &mc->multicast.ib->rec.port_gid);
+ work->id = id_priv;
+ work->mc = mc;
+ INIT_WORK(&work->work, iboe_mcast_work_handler);
+ kref_get(&mc->mcref);
+ queue_work(cma_wq, &work->work);
+
+ return 0;
+
+out2:
+ kfree(mc->multicast.ib);
+out1:
+ kfree(work);
+ return err;
+}
+
+int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr,
+ void *context)
+{
+ struct rdma_id_private *id_priv;
+ struct cma_multicast *mc;
+ int ret;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ if (!cma_comp(id_priv, RDMA_CM_ADDR_BOUND) &&
+ !cma_comp(id_priv, RDMA_CM_ADDR_RESOLVED))
+ return -EINVAL;
+
+ mc = kmalloc(sizeof *mc, GFP_KERNEL);
+ if (!mc)
+ return -ENOMEM;
+
+ memcpy(&mc->addr, addr, rdma_addr_size(addr));
+ mc->context = context;
+ mc->id_priv = id_priv;
+
+ spin_lock(&id_priv->lock);
+ list_add(&mc->list, &id_priv->mc_list);
+ spin_unlock(&id_priv->lock);
+
+ switch (rdma_node_get_transport(id->device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ switch (rdma_port_get_link_layer(id->device, id->port_num)) {
+ case IB_LINK_LAYER_INFINIBAND:
+ ret = cma_join_ib_multicast(id_priv, mc);
+ break;
+ case IB_LINK_LAYER_ETHERNET:
+ kref_init(&mc->mcref);
+ ret = cma_iboe_join_multicast(id_priv, mc);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ break;
+ default:
+ ret = -ENOSYS;
+ break;
+ }
+
+ if (ret) {
+ spin_lock_irq(&id_priv->lock);
+ list_del(&mc->list);
+ spin_unlock_irq(&id_priv->lock);
+ kfree(mc);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(rdma_join_multicast);
+
+void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr)
+{
+ struct rdma_id_private *id_priv;
+ struct cma_multicast *mc;
+
+ id_priv = container_of(id, struct rdma_id_private, id);
+ spin_lock_irq(&id_priv->lock);
+ list_for_each_entry(mc, &id_priv->mc_list, list) {
+ if (!memcmp(&mc->addr, addr, rdma_addr_size(addr))) {
+ list_del(&mc->list);
+ spin_unlock_irq(&id_priv->lock);
+
+ if (id->qp)
+ ib_detach_mcast(id->qp,
+ &mc->multicast.ib->rec.mgid,
+ be16_to_cpu(mc->multicast.ib->rec.mlid));
+ if (rdma_node_get_transport(id_priv->cma_dev->device->node_type) == RDMA_TRANSPORT_IB) {
+ switch (rdma_port_get_link_layer(id->device, id->port_num)) {
+ case IB_LINK_LAYER_INFINIBAND:
+ ib_sa_free_multicast(mc->multicast.ib);
+ kfree(mc);
+ break;
+ case IB_LINK_LAYER_ETHERNET:
+ kref_put(&mc->mcref, release_mc);
+ break;
+ default:
+ break;
+ }
+ }
+ return;
+ }
+ }
+ spin_unlock_irq(&id_priv->lock);
+}
+EXPORT_SYMBOL(rdma_leave_multicast);
+
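+/*
+ * Queue RDMA_CM_EVENT_ADDR_CHANGE work for an id whose bound net_device no
+ * longer matches its source hardware address (e.g. after bonding failover).
+ */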
+static int cma_netdev_change(struct net_device *ndev, struct rdma_id_private *id_priv)
+{
+ struct rdma_dev_addr *dev_addr;
+ struct cma_ndev_work *work;
+
+ dev_addr = &id_priv->id.route.addr.dev_addr;
+
+ if ((dev_addr->bound_dev_if == ndev->ifindex) &&
+ memcmp(dev_addr->src_dev_addr, ndev->dev_addr, ndev->addr_len)) {
+ printk(KERN_INFO "RDMA CM addr change for ndev %s used by id %p\n",
+ ndev->name, &id_priv->id);
+ work = kzalloc(sizeof *work, GFP_KERNEL);
+ if (!work)
+ return -ENOMEM;
+
+ INIT_WORK(&work->work, cma_ndev_work_handler);
+ work->id = id_priv;
+ work->event.event = RDMA_CM_EVENT_ADDR_CHANGE;
+ atomic_inc(&id_priv->refcount);
+ queue_work(cma_wq, &work->work);
+ }
+
+ return 0;
+}
+
+static int cma_netdev_callback(struct notifier_block *self, unsigned long event,
+ void *ptr)
+{
+ struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
+ struct cma_device *cma_dev;
+ struct rdma_id_private *id_priv;
+ int ret = NOTIFY_DONE;
+
+ if (dev_net(ndev) != &init_net)
+ return NOTIFY_DONE;
+
+ if (event != NETDEV_BONDING_FAILOVER)
+ return NOTIFY_DONE;
+
+ if (!(ndev->flags & IFF_MASTER) || !(ndev->priv_flags & IFF_BONDING))
+ return NOTIFY_DONE;
+
+ mutex_lock(&lock);
+ list_for_each_entry(cma_dev, &dev_list, list)
+ list_for_each_entry(id_priv, &cma_dev->id_list, list) {
+ ret = cma_netdev_change(ndev, id_priv);
+ if (ret)
+ goto out;
+ }
+
+out:
+ mutex_unlock(&lock);
+ return ret;
+}
+
+static struct notifier_block cma_nb = {
+ .notifier_call = cma_netdev_callback
+};
+
+static void cma_add_one(struct ib_device *device)
+{
+ struct cma_device *cma_dev;
+ struct rdma_id_private *id_priv;
+
+ cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL);
+ if (!cma_dev)
+ return;
+
+ cma_dev->device = device;
+
+ init_completion(&cma_dev->comp);
+ atomic_set(&cma_dev->refcount, 1);
+ INIT_LIST_HEAD(&cma_dev->id_list);
+ ib_set_client_data(device, &cma_client, cma_dev);
+
+ mutex_lock(&lock);
+ list_add_tail(&cma_dev->list, &dev_list);
+ list_for_each_entry(id_priv, &listen_any_list, list)
+ cma_listen_on_dev(id_priv, cma_dev);
+ mutex_unlock(&lock);
+}
+
+static int cma_remove_id_dev(struct rdma_id_private *id_priv)
+{
+ struct rdma_cm_event event;
+ enum rdma_cm_state state;
+ int ret = 0;
+
+ /* Record that we want to remove the device */
+ state = cma_exch(id_priv, RDMA_CM_DEVICE_REMOVAL);
+ if (state == RDMA_CM_DESTROYING)
+ return 0;
+
+ cma_cancel_operation(id_priv, state);
+ mutex_lock(&id_priv->handler_mutex);
+
+ /* Check for destruction from another callback. */
+ if (!cma_comp(id_priv, RDMA_CM_DEVICE_REMOVAL))
+ goto out;
+
+ memset(&event, 0, sizeof event);
+ event.event = RDMA_CM_EVENT_DEVICE_REMOVAL;
+ ret = id_priv->id.event_handler(&id_priv->id, &event);
+out:
+ mutex_unlock(&id_priv->handler_mutex);
+ return ret;
+}
+
+static void cma_process_remove(struct cma_device *cma_dev)
+{
+ struct rdma_id_private *id_priv;
+ int ret;
+
+ mutex_lock(&lock);
+ while (!list_empty(&cma_dev->id_list)) {
+ id_priv = list_entry(cma_dev->id_list.next,
+ struct rdma_id_private, list);
+
+ list_del(&id_priv->listen_list);
+ list_del_init(&id_priv->list);
+ atomic_inc(&id_priv->refcount);
+ mutex_unlock(&lock);
+
+ ret = id_priv->internal_id ? 1 : cma_remove_id_dev(id_priv);
+ cma_deref_id(id_priv);
+ if (ret)
+ rdma_destroy_id(&id_priv->id);
+
+ mutex_lock(&lock);
+ }
+ mutex_unlock(&lock);
+
+ cma_deref_dev(cma_dev);
+ wait_for_completion(&cma_dev->comp);
+}
+
+static void cma_remove_one(struct ib_device *device)
+{
+ struct cma_device *cma_dev;
+
+ cma_dev = ib_get_client_data(device, &cma_client);
+ if (!cma_dev)
+ return;
+
+ mutex_lock(&lock);
+ list_del(&cma_dev->list);
+ mutex_unlock(&lock);
+
+ cma_process_remove(cma_dev);
+ kfree(cma_dev);
+}
+
+static int cma_get_id_stats(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nlmsghdr *nlh;
+ struct rdma_cm_id_stats *id_stats;
+ struct rdma_id_private *id_priv;
+ struct rdma_cm_id *id = NULL;
+ struct cma_device *cma_dev;
+ int i_dev = 0, i_id = 0;
+
+ /*
+ * We export all of the IDs as a sequence of messages. Each
+ * ID gets its own netlink message.
+ */
+ mutex_lock(&lock);
+
+ list_for_each_entry(cma_dev, &dev_list, list) {
+ if (i_dev < cb->args[0]) {
+ i_dev++;
+ continue;
+ }
+
+ i_id = 0;
+ list_for_each_entry(id_priv, &cma_dev->id_list, list) {
+ if (i_id < cb->args[1]) {
+ i_id++;
+ continue;
+ }
+
+ id_stats = ibnl_put_msg(skb, &nlh, cb->nlh->nlmsg_seq,
+ sizeof *id_stats, RDMA_NL_RDMA_CM,
+ RDMA_NL_RDMA_CM_ID_STATS,
+ NLM_F_MULTI);
+ if (!id_stats)
+ goto out;
+
+ memset(id_stats, 0, sizeof *id_stats);
+ id = &id_priv->id;
+ id_stats->node_type = id->route.addr.dev_addr.dev_type;
+ id_stats->port_num = id->port_num;
+ id_stats->bound_dev_if =
+ id->route.addr.dev_addr.bound_dev_if;
+
+ if (ibnl_put_attr(skb, nlh,
+ rdma_addr_size(cma_src_addr(id_priv)),
+ cma_src_addr(id_priv),
+ RDMA_NL_RDMA_CM_ATTR_SRC_ADDR))
+ goto out;
+ if (ibnl_put_attr(skb, nlh,
+ rdma_addr_size(cma_src_addr(id_priv)),
+ cma_dst_addr(id_priv),
+ RDMA_NL_RDMA_CM_ATTR_DST_ADDR))
+ goto out;
+
+ id_stats->pid = id_priv->owner;
+ id_stats->port_space = id->ps;
+ id_stats->cm_state = id_priv->state;
+ id_stats->qp_num = id_priv->qp_num;
+ id_stats->qp_type = id->qp_type;
+
+ i_id++;
+ }
+
+ cb->args[1] = 0;
+ i_dev++;
+ }
+
+out:
+ mutex_unlock(&lock);
+ cb->args[0] = i_dev;
+ cb->args[1] = i_id;
+
+ return skb->len;
+}
+
+static const struct ibnl_client_cbs cma_cb_table[] = {
+ [RDMA_NL_RDMA_CM_ID_STATS] = { .dump = cma_get_id_stats,
+ .module = THIS_MODULE },
+};
+
+static int __init cma_init(void)
+{
+ int ret;
+
+ cma_wq = create_singlethread_workqueue("rdma_cm");
+ if (!cma_wq)
+ return -ENOMEM;
+
+ ib_sa_register_client(&sa_client);
+ rdma_addr_register_client(&addr_client);
+ register_netdevice_notifier(&cma_nb);
+
+ ret = ib_register_client(&cma_client);
+ if (ret)
+ goto err;
+
+ if (ibnl_add_client(RDMA_NL_RDMA_CM, RDMA_NL_RDMA_CM_NUM_OPS, cma_cb_table))
+ printk(KERN_WARNING "RDMA CMA: failed to add netlink callback\n");
+
+ return 0;
+
+err:
+ unregister_netdevice_notifier(&cma_nb);
+ rdma_addr_unregister_client(&addr_client);
+ ib_sa_unregister_client(&sa_client);
+ destroy_workqueue(cma_wq);
+ return ret;
+}
+
+static void __exit cma_cleanup(void)
+{
+ ibnl_remove_client(RDMA_NL_RDMA_CM);
+ ib_unregister_client(&cma_client);
+ unregister_netdevice_notifier(&cma_nb);
+ rdma_addr_unregister_client(&addr_client);
+ ib_sa_unregister_client(&sa_client);
+ destroy_workqueue(cma_wq);
+ idr_destroy(&tcp_ps);
+ idr_destroy(&udp_ps);
+ idr_destroy(&ipoib_ps);
+ idr_destroy(&ib_ps);
+}
+
+module_init(cma_init);
+module_exit(cma_cleanup);
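
For context, a minimal hypothetical sketch (not part of this patch) of how a kernel consumer of the RDMA CM might drive the multicast entry points exported above. It assumes the rdma_cm_id was created with rdma_create_id() against the handler below and has already been bound to a device (rdma_resolve_addr() completed with RDMA_CM_EVENT_ADDR_RESOLVED); the handler and function names are illustrative only.

#include <linux/printk.h>
#include <rdma/rdma_cm.h>

/* Hypothetical event handler passed to rdma_create_id() for a UD id. */
static int my_cma_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
        if (event->event == RDMA_CM_EVENT_MULTICAST_ERROR)
                pr_warn("multicast join failed, status %d\n", event->status);
        return 0;               /* a nonzero return destroys the id */
}

/* Join; successful completion arrives as RDMA_CM_EVENT_MULTICAST_JOIN. */
static int my_join_group(struct rdma_cm_id *id, struct sockaddr *mcast)
{
        return rdma_join_multicast(id, mcast, NULL);
}

/* Leave; detaches the QP and drops the membership as the code above shows. */
static void my_leave_group(struct rdma_cm_id *id, struct sockaddr *mcast)
{
        rdma_leave_multicast(id, mcast);
}

The id itself would be created with rdma_create_id(my_cma_handler, NULL, RDMA_PS_UDP, IB_QPT_UD) and torn down with rdma_destroy_id() once all groups have been left.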
diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h
new file mode 100644
index 000000000..87d1936f5
--- /dev/null
+++ b/drivers/infiniband/core/core_priv.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _CORE_PRIV_H
+#define _CORE_PRIV_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+#include <rdma/ib_verbs.h>
+
+int ib_device_register_sysfs(struct ib_device *device,
+ int (*port_callback)(struct ib_device *,
+ u8, struct kobject *));
+void ib_device_unregister_sysfs(struct ib_device *device);
+
+int ib_sysfs_setup(void);
+void ib_sysfs_cleanup(void);
+
+int ib_cache_setup(void);
+void ib_cache_cleanup(void);
+
+int ib_resolve_eth_l2_attrs(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr, int *qp_attr_mask);
+#endif /* _CORE_PRIV_H */
diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
new file mode 100644
index 000000000..18c1ece76
--- /dev/null
+++ b/drivers/infiniband/core/device.c
@@ -0,0 +1,785 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <rdma/rdma_netlink.h>
+
+#include "core_priv.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("core kernel InfiniBand API");
+MODULE_LICENSE("Dual BSD/GPL");
+
+struct ib_client_data {
+ struct list_head list;
+ struct ib_client *client;
+ void * data;
+};
+
+struct workqueue_struct *ib_wq;
+EXPORT_SYMBOL_GPL(ib_wq);
+
+static LIST_HEAD(device_list);
+static LIST_HEAD(client_list);
+
+/*
+ * device_mutex protects access to both device_list and client_list.
+ * There's no real point to using multiple locks or something fancier
+ * like an rwsem: we always access both lists, and we're always
+ * modifying one list or the other list. In any case this is not a
+ * hot path so there's no point in trying to optimize.
+ */
+static DEFINE_MUTEX(device_mutex);
+
+static int ib_device_check_mandatory(struct ib_device *device)
+{
+#define IB_MANDATORY_FUNC(x) { offsetof(struct ib_device, x), #x }
+ static const struct {
+ size_t offset;
+ char *name;
+ } mandatory_table[] = {
+ IB_MANDATORY_FUNC(query_device),
+ IB_MANDATORY_FUNC(query_port),
+ IB_MANDATORY_FUNC(query_pkey),
+ IB_MANDATORY_FUNC(query_gid),
+ IB_MANDATORY_FUNC(alloc_pd),
+ IB_MANDATORY_FUNC(dealloc_pd),
+ IB_MANDATORY_FUNC(create_ah),
+ IB_MANDATORY_FUNC(destroy_ah),
+ IB_MANDATORY_FUNC(create_qp),
+ IB_MANDATORY_FUNC(modify_qp),
+ IB_MANDATORY_FUNC(destroy_qp),
+ IB_MANDATORY_FUNC(post_send),
+ IB_MANDATORY_FUNC(post_recv),
+ IB_MANDATORY_FUNC(create_cq),
+ IB_MANDATORY_FUNC(destroy_cq),
+ IB_MANDATORY_FUNC(poll_cq),
+ IB_MANDATORY_FUNC(req_notify_cq),
+ IB_MANDATORY_FUNC(get_dma_mr),
+ IB_MANDATORY_FUNC(dereg_mr)
+ };
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(mandatory_table); ++i) {
+ if (!*(void **) ((void *) device + mandatory_table[i].offset)) {
+ printk(KERN_WARNING "Device %s is missing mandatory function %s\n",
+ device->name, mandatory_table[i].name);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+static struct ib_device *__ib_device_get_by_name(const char *name)
+{
+ struct ib_device *device;
+
+ list_for_each_entry(device, &device_list, core_list)
+ if (!strncmp(name, device->name, IB_DEVICE_NAME_MAX))
+ return device;
+
+ return NULL;
+}
+
+
+static int alloc_name(char *name)
+{
+ unsigned long *inuse;
+ char buf[IB_DEVICE_NAME_MAX];
+ struct ib_device *device;
+ int i;
+
+ inuse = (unsigned long *) get_zeroed_page(GFP_KERNEL);
+ if (!inuse)
+ return -ENOMEM;
+
+ list_for_each_entry(device, &device_list, core_list) {
+ if (!sscanf(device->name, name, &i))
+ continue;
+ if (i < 0 || i >= PAGE_SIZE * 8)
+ continue;
+ snprintf(buf, sizeof buf, name, i);
+ if (!strncmp(buf, device->name, IB_DEVICE_NAME_MAX))
+ set_bit(i, inuse);
+ }
+
+ i = find_first_zero_bit(inuse, PAGE_SIZE * 8);
+ free_page((unsigned long) inuse);
+ snprintf(buf, sizeof buf, name, i);
+
+ if (__ib_device_get_by_name(buf))
+ return -ENFILE;
+
+ strlcpy(name, buf, IB_DEVICE_NAME_MAX);
+ return 0;
+}
+
+static int start_port(struct ib_device *device)
+{
+ return (device->node_type == RDMA_NODE_IB_SWITCH) ? 0 : 1;
+}
+
+
+static int end_port(struct ib_device *device)
+{
+ return (device->node_type == RDMA_NODE_IB_SWITCH) ?
+ 0 : device->phys_port_cnt;
+}
+
+/**
+ * ib_alloc_device - allocate an IB device struct
+ * @size:size of structure to allocate
+ *
+ * Low-level drivers should use ib_alloc_device() to allocate &struct
+ * ib_device. @size is the size of the structure to be allocated,
+ * including any private data used by the low-level driver.
+ * ib_dealloc_device() must be used to free structures allocated with
+ * ib_alloc_device().
+ */
+struct ib_device *ib_alloc_device(size_t size)
+{
+ BUG_ON(size < sizeof (struct ib_device));
+
+ return kzalloc(size, GFP_KERNEL);
+}
+EXPORT_SYMBOL(ib_alloc_device);
+
+/**
+ * ib_dealloc_device - free an IB device struct
+ * @device:structure to free
+ *
+ * Free a structure allocated with ib_alloc_device().
+ */
+void ib_dealloc_device(struct ib_device *device)
+{
+ if (device->reg_state == IB_DEV_UNINITIALIZED) {
+ kfree(device);
+ return;
+ }
+
+ BUG_ON(device->reg_state != IB_DEV_UNREGISTERED);
+
+ kobject_put(&device->dev.kobj);
+}
+EXPORT_SYMBOL(ib_dealloc_device);
+
+static int add_client_context(struct ib_device *device, struct ib_client *client)
+{
+ struct ib_client_data *context;
+ unsigned long flags;
+
+ context = kmalloc(sizeof *context, GFP_KERNEL);
+ if (!context) {
+ printk(KERN_WARNING "Couldn't allocate client context for %s/%s\n",
+ device->name, client->name);
+ return -ENOMEM;
+ }
+
+ context->client = client;
+ context->data = NULL;
+
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_add(&context->list, &device->client_data_list);
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+
+ return 0;
+}
+
+static int read_port_table_lengths(struct ib_device *device)
+{
+ struct ib_port_attr *tprops = NULL;
+ int num_ports, ret = -ENOMEM;
+ u8 port_index;
+
+ tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
+ if (!tprops)
+ goto out;
+
+ num_ports = end_port(device) - start_port(device) + 1;
+
+ device->pkey_tbl_len = kmalloc(sizeof *device->pkey_tbl_len * num_ports,
+ GFP_KERNEL);
+ device->gid_tbl_len = kmalloc(sizeof *device->gid_tbl_len * num_ports,
+ GFP_KERNEL);
+ if (!device->pkey_tbl_len || !device->gid_tbl_len)
+ goto err;
+
+ for (port_index = 0; port_index < num_ports; ++port_index) {
+ ret = ib_query_port(device, port_index + start_port(device),
+ tprops);
+ if (ret)
+ goto err;
+ device->pkey_tbl_len[port_index] = tprops->pkey_tbl_len;
+ device->gid_tbl_len[port_index] = tprops->gid_tbl_len;
+ }
+
+ ret = 0;
+ goto out;
+
+err:
+ kfree(device->gid_tbl_len);
+ kfree(device->pkey_tbl_len);
+out:
+ kfree(tprops);
+ return ret;
+}
+
+/**
+ * ib_register_device - Register an IB device with IB core
+ * @device:Device to register
+ *
+ * Low-level drivers use ib_register_device() to register their
+ * devices with the IB core. All registered clients will receive a
+ * callback for each device that is added. @device must be allocated
+ * with ib_alloc_device().
+ */
+int ib_register_device(struct ib_device *device,
+ int (*port_callback)(struct ib_device *,
+ u8, struct kobject *))
+{
+ int ret;
+
+ mutex_lock(&device_mutex);
+
+ if (strchr(device->name, '%')) {
+ ret = alloc_name(device->name);
+ if (ret)
+ goto out;
+ }
+
+ if (ib_device_check_mandatory(device)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ INIT_LIST_HEAD(&device->event_handler_list);
+ INIT_LIST_HEAD(&device->client_data_list);
+ spin_lock_init(&device->event_handler_lock);
+ spin_lock_init(&device->client_data_lock);
+
+ ret = read_port_table_lengths(device);
+ if (ret) {
+ printk(KERN_WARNING "Couldn't create table lengths cache for device %s\n",
+ device->name);
+ goto out;
+ }
+
+ ret = ib_device_register_sysfs(device, port_callback);
+ if (ret) {
+ printk(KERN_WARNING "Couldn't register device %s with driver model\n",
+ device->name);
+ kfree(device->gid_tbl_len);
+ kfree(device->pkey_tbl_len);
+ goto out;
+ }
+
+ list_add_tail(&device->core_list, &device_list);
+
+ device->reg_state = IB_DEV_REGISTERED;
+
+ {
+ struct ib_client *client;
+
+ list_for_each_entry(client, &client_list, list)
+ if (client->add && !add_client_context(device, client))
+ client->add(device);
+ }
+
+ out:
+ mutex_unlock(&device_mutex);
+ return ret;
+}
+EXPORT_SYMBOL(ib_register_device);
+
+/**
+ * ib_unregister_device - Unregister an IB device
+ * @device:Device to unregister
+ *
+ * Unregister an IB device. All clients will receive a remove callback.
+ */
+void ib_unregister_device(struct ib_device *device)
+{
+ struct ib_client *client;
+ struct ib_client_data *context, *tmp;
+ unsigned long flags;
+
+ mutex_lock(&device_mutex);
+
+ list_for_each_entry_reverse(client, &client_list, list)
+ if (client->remove)
+ client->remove(device);
+
+ list_del(&device->core_list);
+
+ kfree(device->gid_tbl_len);
+ kfree(device->pkey_tbl_len);
+
+ mutex_unlock(&device_mutex);
+
+ ib_device_unregister_sysfs(device);
+
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
+ kfree(context);
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+
+ device->reg_state = IB_DEV_UNREGISTERED;
+}
+EXPORT_SYMBOL(ib_unregister_device);
+
+/**
+ * ib_register_client - Register an IB client
+ * @client:Client to register
+ *
+ * Upper level users of the IB drivers can use ib_register_client() to
+ * register callbacks for IB device addition and removal. When an IB
+ * device is added, each registered client's add method will be called
+ * (in the order the clients were registered), and when a device is
+ * removed, each client's remove method will be called (in the reverse
+ * order that clients were registered). In addition, when
+ * ib_register_client() is called, the client will receive an add
+ * callback for all devices already registered.
+ */
+int ib_register_client(struct ib_client *client)
+{
+ struct ib_device *device;
+
+ mutex_lock(&device_mutex);
+
+ list_add_tail(&client->list, &client_list);
+ list_for_each_entry(device, &device_list, core_list)
+ if (client->add && !add_client_context(device, client))
+ client->add(device);
+
+ mutex_unlock(&device_mutex);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_register_client);
+
+/**
+ * ib_unregister_client - Unregister an IB client
+ * @client:Client to unregister
+ *
+ * Upper level users use ib_unregister_client() to remove their client
+ * registration. When ib_unregister_client() is called, the client
+ * will receive a remove callback for each IB device still registered.
+ */
+void ib_unregister_client(struct ib_client *client)
+{
+ struct ib_client_data *context, *tmp;
+ struct ib_device *device;
+ unsigned long flags;
+
+ mutex_lock(&device_mutex);
+
+ list_for_each_entry(device, &device_list, core_list) {
+ if (client->remove)
+ client->remove(device);
+
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_for_each_entry_safe(context, tmp, &device->client_data_list, list)
+ if (context->client == client) {
+ list_del(&context->list);
+ kfree(context);
+ }
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+ }
+ list_del(&client->list);
+
+ mutex_unlock(&device_mutex);
+}
+EXPORT_SYMBOL(ib_unregister_client);
+
+/**
+ * ib_get_client_data - Get IB client context
+ * @device:Device to get context for
+ * @client:Client to get context for
+ *
+ * ib_get_client_data() returns client context set with
+ * ib_set_client_data().
+ */
+void *ib_get_client_data(struct ib_device *device, struct ib_client *client)
+{
+ struct ib_client_data *context;
+ void *ret = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_for_each_entry(context, &device->client_data_list, list)
+ if (context->client == client) {
+ ret = context->data;
+ break;
+ }
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_get_client_data);
+
+/**
+ * ib_set_client_data - Set IB client context
+ * @device:Device to set context for
+ * @client:Client to set context for
+ * @data:Context to set
+ *
+ * ib_set_client_data() sets client context that can be retrieved with
+ * ib_get_client_data().
+ */
+void ib_set_client_data(struct ib_device *device, struct ib_client *client,
+ void *data)
+{
+ struct ib_client_data *context;
+ unsigned long flags;
+
+ spin_lock_irqsave(&device->client_data_lock, flags);
+ list_for_each_entry(context, &device->client_data_list, list)
+ if (context->client == client) {
+ context->data = data;
+ goto out;
+ }
+
+ printk(KERN_WARNING "No client context found for %s/%s\n",
+ device->name, client->name);
+
+out:
+ spin_unlock_irqrestore(&device->client_data_lock, flags);
+}
+EXPORT_SYMBOL(ib_set_client_data);
+
+/**
+ * ib_register_event_handler - Register an IB event handler
+ * @event_handler:Handler to register
+ *
+ * ib_register_event_handler() registers an event handler that will be
+ * called back when asynchronous IB events occur (as defined in
+ * chapter 11 of the InfiniBand Architecture Specification). This
+ * callback may occur in interrupt context.
+ */
+int ib_register_event_handler(struct ib_event_handler *event_handler)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
+ list_add_tail(&event_handler->list,
+ &event_handler->device->event_handler_list);
+ spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_register_event_handler);
+
+/**
+ * ib_unregister_event_handler - Unregister an event handler
+ * @event_handler:Handler to unregister
+ *
+ * Unregister an event handler registered with
+ * ib_register_event_handler().
+ */
+int ib_unregister_event_handler(struct ib_event_handler *event_handler)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&event_handler->device->event_handler_lock, flags);
+ list_del(&event_handler->list);
+ spin_unlock_irqrestore(&event_handler->device->event_handler_lock, flags);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_unregister_event_handler);
+
+/**
+ * ib_dispatch_event - Dispatch an asynchronous event
+ * @event:Event to dispatch
+ *
+ * Low-level drivers must call ib_dispatch_event() to dispatch the
+ * event to all registered event handlers when an asynchronous event
+ * occurs.
+ */
+void ib_dispatch_event(struct ib_event *event)
+{
+ unsigned long flags;
+ struct ib_event_handler *handler;
+
+ spin_lock_irqsave(&event->device->event_handler_lock, flags);
+
+ list_for_each_entry(handler, &event->device->event_handler_list, list)
+ handler->handler(handler, event);
+
+ spin_unlock_irqrestore(&event->device->event_handler_lock, flags);
+}
+EXPORT_SYMBOL(ib_dispatch_event);
+
+/**
+ * ib_query_device - Query IB device attributes
+ * @device:Device to query
+ * @device_attr:Device attributes
+ *
+ * ib_query_device() returns the attributes of a device through the
+ * @device_attr pointer.
+ */
+int ib_query_device(struct ib_device *device,
+ struct ib_device_attr *device_attr)
+{
+ return device->query_device(device, device_attr);
+}
+EXPORT_SYMBOL(ib_query_device);
+
+/**
+ * ib_query_port - Query IB port attributes
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @port_attr:Port attributes
+ *
+ * ib_query_port() returns the attributes of a port through the
+ * @port_attr pointer.
+ */
+int ib_query_port(struct ib_device *device,
+ u8 port_num,
+ struct ib_port_attr *port_attr)
+{
+ if (port_num < start_port(device) || port_num > end_port(device))
+ return -EINVAL;
+
+ return device->query_port(device, port_num, port_attr);
+}
+EXPORT_SYMBOL(ib_query_port);
+
+/**
+ * ib_query_gid - Get GID table entry
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @index:GID table index to query
+ * @gid:Returned GID
+ *
+ * ib_query_gid() fetches the specified GID table entry.
+ */
+int ib_query_gid(struct ib_device *device,
+ u8 port_num, int index, union ib_gid *gid)
+{
+ return device->query_gid(device, port_num, index, gid);
+}
+EXPORT_SYMBOL(ib_query_gid);
+
+/**
+ * ib_query_pkey - Get P_Key table entry
+ * @device:Device to query
+ * @port_num:Port number to query
+ * @index:P_Key table index to query
+ * @pkey:Returned P_Key
+ *
+ * ib_query_pkey() fetches the specified P_Key table entry.
+ */
+int ib_query_pkey(struct ib_device *device,
+ u8 port_num, u16 index, u16 *pkey)
+{
+ return device->query_pkey(device, port_num, index, pkey);
+}
+EXPORT_SYMBOL(ib_query_pkey);
+
+/**
+ * ib_modify_device - Change IB device attributes
+ * @device:Device to modify
+ * @device_modify_mask:Mask of attributes to change
+ * @device_modify:New attribute values
+ *
+ * ib_modify_device() changes a device's attributes as specified by
+ * the @device_modify_mask and @device_modify structure.
+ */
+int ib_modify_device(struct ib_device *device,
+ int device_modify_mask,
+ struct ib_device_modify *device_modify)
+{
+ if (!device->modify_device)
+ return -ENOSYS;
+
+ return device->modify_device(device, device_modify_mask,
+ device_modify);
+}
+EXPORT_SYMBOL(ib_modify_device);
+
+/**
+ * ib_modify_port - Modifies the attributes for the specified port.
+ * @device: The device to modify.
+ * @port_num: The number of the port to modify.
+ * @port_modify_mask: Mask used to specify which attributes of the port
+ * to change.
+ * @port_modify: New attribute values for the port.
+ *
+ * ib_modify_port() changes a port's attributes as specified by the
+ * @port_modify_mask and @port_modify structure.
+ */
+int ib_modify_port(struct ib_device *device,
+ u8 port_num, int port_modify_mask,
+ struct ib_port_modify *port_modify)
+{
+ if (!device->modify_port)
+ return -ENOSYS;
+
+ if (port_num < start_port(device) || port_num > end_port(device))
+ return -EINVAL;
+
+ return device->modify_port(device, port_num, port_modify_mask,
+ port_modify);
+}
+EXPORT_SYMBOL(ib_modify_port);
+
+/**
+ * ib_find_gid - Returns the port number and GID table index where
+ * a specified GID value occurs.
+ * @device: The device to query.
+ * @gid: The GID value to search for.
+ * @port_num: The port number of the device where the GID value was found.
+ * @index: The index into the GID table where the GID was found. This
+ * parameter may be NULL.
+ */
+int ib_find_gid(struct ib_device *device, union ib_gid *gid,
+ u8 *port_num, u16 *index)
+{
+ union ib_gid tmp_gid;
+ int ret, port, i;
+
+ for (port = start_port(device); port <= end_port(device); ++port) {
+ for (i = 0; i < device->gid_tbl_len[port - start_port(device)]; ++i) {
+ ret = ib_query_gid(device, port, i, &tmp_gid);
+ if (ret)
+ return ret;
+ if (!memcmp(&tmp_gid, gid, sizeof *gid)) {
+ *port_num = port;
+ if (index)
+ *index = i;
+ return 0;
+ }
+ }
+ }
+
+ return -ENOENT;
+}
+EXPORT_SYMBOL(ib_find_gid);
+
+/**
+ * ib_find_pkey - Returns the PKey table index where a specified
+ * PKey value occurs.
+ * @device: The device to query.
+ * @port_num: The port number of the device to search for the PKey.
+ * @pkey: The PKey value to search for.
+ * @index: The index into the PKey table where the PKey was found.
+ */
+int ib_find_pkey(struct ib_device *device,
+ u8 port_num, u16 pkey, u16 *index)
+{
+ int ret, i;
+ u16 tmp_pkey;
+ int partial_ix = -1;
+
+ for (i = 0; i < device->pkey_tbl_len[port_num - start_port(device)]; ++i) {
+ ret = ib_query_pkey(device, port_num, i, &tmp_pkey);
+ if (ret)
+ return ret;
+ if ((pkey & 0x7fff) == (tmp_pkey & 0x7fff)) {
+ /* if there is a full-member pkey, take it. */
+ if (tmp_pkey & 0x8000) {
+ *index = i;
+ return 0;
+ }
+ if (partial_ix < 0)
+ partial_ix = i;
+ }
+ }
+
+ /* no full-member pkey; if a limited-member one exists, take it */
+ if (partial_ix >= 0) {
+ *index = partial_ix;
+ return 0;
+ }
+ return -ENOENT;
+}
+EXPORT_SYMBOL(ib_find_pkey);
+
+static int __init ib_core_init(void)
+{
+ int ret;
+
+ ib_wq = alloc_workqueue("infiniband", 0, 0);
+ if (!ib_wq)
+ return -ENOMEM;
+
+ ret = ib_sysfs_setup();
+ if (ret) {
+ printk(KERN_WARNING "Couldn't create InfiniBand device class\n");
+ goto err;
+ }
+
+ ret = ibnl_init();
+ if (ret) {
+ printk(KERN_WARNING "Couldn't init IB netlink interface\n");
+ goto err_sysfs;
+ }
+
+ ret = ib_cache_setup();
+ if (ret) {
+ printk(KERN_WARNING "Couldn't set up InfiniBand P_Key/GID cache\n");
+ goto err_nl;
+ }
+
+ return 0;
+
+err_nl:
+ ibnl_cleanup();
+
+err_sysfs:
+ ib_sysfs_cleanup();
+
+err:
+ destroy_workqueue(ib_wq);
+ return ret;
+}
+
+static void __exit ib_core_cleanup(void)
+{
+ ib_cache_cleanup();
+ ibnl_cleanup();
+ ib_sysfs_cleanup();
+ /* Make sure that any pending umem accounting work is done. */
+ destroy_workqueue(ib_wq);
+}
+
+module_init(ib_core_init);
+module_exit(ib_core_cleanup);
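
The client registration API above is easiest to see from the consumer side. Below is a hypothetical sketch (not part of this patch) of a minimal ib_client that attaches per-device private data with ib_set_client_data() and frees it again on removal; every name is illustrative.

#include <linux/slab.h>
#include <rdma/ib_verbs.h>

struct my_dev_ctx {
        struct ib_device *device;
        /* ... per-device state ... */
};

static void my_add_one(struct ib_device *device);
static void my_remove_one(struct ib_device *device);

static struct ib_client my_client = {
        .name   = "my_client",
        .add    = my_add_one,
        .remove = my_remove_one,
};

static void my_add_one(struct ib_device *device)
{
        struct my_dev_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

        if (!ctx)
                return;
        ctx->device = device;
        ib_set_client_data(device, &my_client, ctx);
}

static void my_remove_one(struct ib_device *device)
{
        /* kfree(NULL) is a no-op if my_add_one failed for this device */
        kfree(ib_get_client_data(device, &my_client));
}

Registration in module init is then a single ib_register_client(&my_client) call, which (per the kernel-doc above) immediately produces add callbacks for devices that are already registered; ib_unregister_client(&my_client) in module exit produces the matching remove callbacks.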
diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c
new file mode 100644
index 000000000..9f5ad7cc3
--- /dev/null
+++ b/drivers/infiniband/core/fmr_pool.c
@@ -0,0 +1,544 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/jhash.h>
+#include <linux/kthread.h>
+
+#include <rdma/ib_fmr_pool.h>
+
+#include "core_priv.h"
+
+#define PFX "fmr_pool: "
+
+enum {
+ IB_FMR_MAX_REMAPS = 32,
+
+ IB_FMR_HASH_BITS = 8,
+ IB_FMR_HASH_SIZE = 1 << IB_FMR_HASH_BITS,
+ IB_FMR_HASH_MASK = IB_FMR_HASH_SIZE - 1
+};
+
+/*
+ * If an FMR is not in use, then the list member will point to either
+ * its pool's free_list (if the FMR can be mapped again; that is,
+ * remap_count < pool->max_remaps) or its pool's dirty_list (if the
+ * FMR needs to be unmapped before being remapped). In either of
+ * these cases it is a bug if the ref_count is not 0. In other words,
+ * if ref_count is > 0, then the list member must not be linked into
+ * either free_list or dirty_list.
+ *
+ * The cache_node member is used to link the FMR into a cache bucket
+ * (if caching is enabled). This is independent of the reference
+ * count of the FMR. When a valid FMR is released, its ref_count is
+ * decremented, and if ref_count reaches 0, the FMR is placed in
+ * either free_list or dirty_list as appropriate. However, it is not
+ * removed from the cache and may be "revived" if a call to
+ * ib_fmr_pool_map_phys() occurs before the FMR is remapped. In
+ * this case we just increment the ref_count and remove the FMR from
+ * free_list/dirty_list.
+ *
+ * Before we remap an FMR from free_list, we remove it from the cache
+ * (to prevent another user from obtaining a stale FMR). When an FMR
+ * is released, we add it to the tail of the free list, so that our
+ * cache eviction policy is "least recently used."
+ *
+ * All manipulation of ref_count, list and cache_node is protected by
+ * pool_lock to maintain consistency.
+ */
+
+struct ib_fmr_pool {
+ spinlock_t pool_lock;
+
+ int pool_size;
+ int max_pages;
+ int max_remaps;
+ int dirty_watermark;
+ int dirty_len;
+ struct list_head free_list;
+ struct list_head dirty_list;
+ struct hlist_head *cache_bucket;
+
+ void (*flush_function)(struct ib_fmr_pool *pool,
+ void * arg);
+ void *flush_arg;
+
+ struct task_struct *thread;
+
+ atomic_t req_ser;
+ atomic_t flush_ser;
+
+ wait_queue_head_t force_wait;
+};
+
+static inline u32 ib_fmr_hash(u64 first_page)
+{
+ return jhash_2words((u32) first_page, (u32) (first_page >> 32), 0) &
+ (IB_FMR_HASH_SIZE - 1);
+}
+
+/* Caller must hold pool_lock */
+static inline struct ib_pool_fmr *ib_fmr_cache_lookup(struct ib_fmr_pool *pool,
+ u64 *page_list,
+ int page_list_len,
+ u64 io_virtual_address)
+{
+ struct hlist_head *bucket;
+ struct ib_pool_fmr *fmr;
+
+ if (!pool->cache_bucket)
+ return NULL;
+
+ bucket = pool->cache_bucket + ib_fmr_hash(*page_list);
+
+ hlist_for_each_entry(fmr, bucket, cache_node)
+ if (io_virtual_address == fmr->io_virtual_address &&
+ page_list_len == fmr->page_list_len &&
+ !memcmp(page_list, fmr->page_list,
+ page_list_len * sizeof *page_list))
+ return fmr;
+
+ return NULL;
+}
+
+static void ib_fmr_batch_release(struct ib_fmr_pool *pool)
+{
+ int ret;
+ struct ib_pool_fmr *fmr;
+ LIST_HEAD(unmap_list);
+ LIST_HEAD(fmr_list);
+
+ spin_lock_irq(&pool->pool_lock);
+
+ list_for_each_entry(fmr, &pool->dirty_list, list) {
+ hlist_del_init(&fmr->cache_node);
+ fmr->remap_count = 0;
+ list_add_tail(&fmr->fmr->list, &fmr_list);
+
+#ifdef DEBUG
+ if (fmr->ref_count != 0) {
+ printk(KERN_WARNING PFX "Unmapping FMR %p with ref count %d\n",
+ fmr, fmr->ref_count);
+ }
+#endif
+ }
+
+ list_splice_init(&pool->dirty_list, &unmap_list);
+ pool->dirty_len = 0;
+
+ spin_unlock_irq(&pool->pool_lock);
+
+ if (list_empty(&unmap_list))
+ return;
+
+ ret = ib_unmap_fmr(&fmr_list);
+ if (ret)
+ printk(KERN_WARNING PFX "ib_unmap_fmr returned %d\n", ret);
+
+ spin_lock_irq(&pool->pool_lock);
+ list_splice(&unmap_list, &pool->free_list);
+ spin_unlock_irq(&pool->pool_lock);
+}
+
+static int ib_fmr_cleanup_thread(void *pool_ptr)
+{
+ struct ib_fmr_pool *pool = pool_ptr;
+
+ do {
+ if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) < 0) {
+ ib_fmr_batch_release(pool);
+
+ atomic_inc(&pool->flush_ser);
+ wake_up_interruptible(&pool->force_wait);
+
+ if (pool->flush_function)
+ pool->flush_function(pool, pool->flush_arg);
+ }
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (atomic_read(&pool->flush_ser) - atomic_read(&pool->req_ser) >= 0 &&
+ !kthread_should_stop())
+ schedule();
+ __set_current_state(TASK_RUNNING);
+ } while (!kthread_should_stop());
+
+ return 0;
+}
+
+/**
+ * ib_create_fmr_pool - Create an FMR pool
+ * @pd:Protection domain for FMRs
+ * @params:FMR pool parameters
+ *
+ * Create a pool of FMRs. Return value is pointer to new pool or
+ * error code if creation failed.
+ */
+struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd,
+ struct ib_fmr_pool_param *params)
+{
+ struct ib_device *device;
+ struct ib_fmr_pool *pool;
+ struct ib_device_attr *attr;
+ int i;
+ int ret;
+ int max_remaps;
+
+ if (!params)
+ return ERR_PTR(-EINVAL);
+
+ device = pd->device;
+ if (!device->alloc_fmr || !device->dealloc_fmr ||
+ !device->map_phys_fmr || !device->unmap_fmr) {
+ printk(KERN_INFO PFX "Device %s does not support FMRs\n",
+ device->name);
+ return ERR_PTR(-ENOSYS);
+ }
+
+ attr = kmalloc(sizeof *attr, GFP_KERNEL);
+ if (!attr) {
+ printk(KERN_WARNING PFX "couldn't allocate device attr struct\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ ret = ib_query_device(device, attr);
+ if (ret) {
+ printk(KERN_WARNING PFX "couldn't query device: %d\n", ret);
+ kfree(attr);
+ return ERR_PTR(ret);
+ }
+
+ if (!attr->max_map_per_fmr)
+ max_remaps = IB_FMR_MAX_REMAPS;
+ else
+ max_remaps = attr->max_map_per_fmr;
+
+ kfree(attr);
+
+ pool = kmalloc(sizeof *pool, GFP_KERNEL);
+ if (!pool) {
+ printk(KERN_WARNING PFX "couldn't allocate pool struct\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ pool->cache_bucket = NULL;
+
+ pool->flush_function = params->flush_function;
+ pool->flush_arg = params->flush_arg;
+
+ INIT_LIST_HEAD(&pool->free_list);
+ INIT_LIST_HEAD(&pool->dirty_list);
+
+ if (params->cache) {
+ pool->cache_bucket =
+ kmalloc(IB_FMR_HASH_SIZE * sizeof *pool->cache_bucket,
+ GFP_KERNEL);
+ if (!pool->cache_bucket) {
+ printk(KERN_WARNING PFX "Failed to allocate cache in pool\n");
+ ret = -ENOMEM;
+ goto out_free_pool;
+ }
+
+ for (i = 0; i < IB_FMR_HASH_SIZE; ++i)
+ INIT_HLIST_HEAD(pool->cache_bucket + i);
+ }
+
+ pool->pool_size = 0;
+ pool->max_pages = params->max_pages_per_fmr;
+ pool->max_remaps = max_remaps;
+ pool->dirty_watermark = params->dirty_watermark;
+ pool->dirty_len = 0;
+ spin_lock_init(&pool->pool_lock);
+ atomic_set(&pool->req_ser, 0);
+ atomic_set(&pool->flush_ser, 0);
+ init_waitqueue_head(&pool->force_wait);
+
+ pool->thread = kthread_run(ib_fmr_cleanup_thread,
+ pool,
+ "ib_fmr(%s)",
+ device->name);
+ if (IS_ERR(pool->thread)) {
+ printk(KERN_WARNING PFX "couldn't start cleanup thread\n");
+ ret = PTR_ERR(pool->thread);
+ goto out_free_pool;
+ }
+
+ {
+ struct ib_pool_fmr *fmr;
+ struct ib_fmr_attr fmr_attr = {
+ .max_pages = params->max_pages_per_fmr,
+ .max_maps = pool->max_remaps,
+ .page_shift = params->page_shift
+ };
+ int bytes_per_fmr = sizeof *fmr;
+
+ if (pool->cache_bucket)
+ bytes_per_fmr += params->max_pages_per_fmr * sizeof (u64);
+
+ for (i = 0; i < params->pool_size; ++i) {
+ fmr = kmalloc(bytes_per_fmr, GFP_KERNEL);
+ if (!fmr) {
+ printk(KERN_WARNING PFX "failed to allocate fmr "
+ "struct for FMR %d\n", i);
+ goto out_fail;
+ }
+
+ fmr->pool = pool;
+ fmr->remap_count = 0;
+ fmr->ref_count = 0;
+ INIT_HLIST_NODE(&fmr->cache_node);
+
+ fmr->fmr = ib_alloc_fmr(pd, params->access, &fmr_attr);
+ if (IS_ERR(fmr->fmr)) {
+ printk(KERN_WARNING PFX "fmr_create failed "
+ "for FMR %d\n", i);
+ kfree(fmr);
+ goto out_fail;
+ }
+
+ list_add_tail(&fmr->list, &pool->free_list);
+ ++pool->pool_size;
+ }
+ }
+
+ return pool;
+
+ out_free_pool:
+ kfree(pool->cache_bucket);
+ kfree(pool);
+
+ return ERR_PTR(ret);
+
+ out_fail:
+ ib_destroy_fmr_pool(pool);
+
+ return ERR_PTR(-ENOMEM);
+}
+EXPORT_SYMBOL(ib_create_fmr_pool);
+
+/**
+ * ib_destroy_fmr_pool - Free FMR pool
+ * @pool:FMR pool to free
+ *
+ * Destroy an FMR pool and free all associated resources.
+ */
+void ib_destroy_fmr_pool(struct ib_fmr_pool *pool)
+{
+ struct ib_pool_fmr *fmr;
+ struct ib_pool_fmr *tmp;
+ LIST_HEAD(fmr_list);
+ int i;
+
+ kthread_stop(pool->thread);
+ ib_fmr_batch_release(pool);
+
+ i = 0;
+ list_for_each_entry_safe(fmr, tmp, &pool->free_list, list) {
+ if (fmr->remap_count) {
+ INIT_LIST_HEAD(&fmr_list);
+ list_add_tail(&fmr->fmr->list, &fmr_list);
+ ib_unmap_fmr(&fmr_list);
+ }
+ ib_dealloc_fmr(fmr->fmr);
+ list_del(&fmr->list);
+ kfree(fmr);
+ ++i;
+ }
+
+ if (i < pool->pool_size)
+ printk(KERN_WARNING PFX "pool still has %d regions registered\n",
+ pool->pool_size - i);
+
+ kfree(pool->cache_bucket);
+ kfree(pool);
+}
+EXPORT_SYMBOL(ib_destroy_fmr_pool);
+
+/**
+ * ib_flush_fmr_pool - Invalidate all unmapped FMRs
+ * @pool:FMR pool to flush
+ *
+ * Ensure that all unmapped FMRs are fully invalidated.
+ */
+int ib_flush_fmr_pool(struct ib_fmr_pool *pool)
+{
+ int serial;
+ struct ib_pool_fmr *fmr, *next;
+
+ /*
+ * The free_list holds FMRs that may have been used
+ * but have not been remapped enough times to be dirty.
+ * Put them on the dirty list now so that the cleanup
+ * thread will reap them too.
+ */
+ spin_lock_irq(&pool->pool_lock);
+ list_for_each_entry_safe(fmr, next, &pool->free_list, list) {
+ if (fmr->remap_count > 0)
+ list_move(&fmr->list, &pool->dirty_list);
+ }
+ spin_unlock_irq(&pool->pool_lock);
+
+ serial = atomic_inc_return(&pool->req_ser);
+ wake_up_process(pool->thread);
+
+ if (wait_event_interruptible(pool->force_wait,
+ atomic_read(&pool->flush_ser) - serial >= 0))
+ return -EINTR;
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_flush_fmr_pool);
+
+/**
+ * ib_fmr_pool_map_phys - Map an FMR from an FMR pool
+ * @pool_handle:FMR pool to allocate FMR from
+ * @page_list:List of pages to map
+ * @list_len:Number of pages in @page_list
+ * @io_virtual_address:I/O virtual address for new FMR
+ *
+ * Map an FMR from an FMR pool.
+ */
+struct ib_pool_fmr *ib_fmr_pool_map_phys(struct ib_fmr_pool *pool_handle,
+ u64 *page_list,
+ int list_len,
+ u64 io_virtual_address)
+{
+ struct ib_fmr_pool *pool = pool_handle;
+ struct ib_pool_fmr *fmr;
+ unsigned long flags;
+ int result;
+
+ if (list_len < 1 || list_len > pool->max_pages)
+ return ERR_PTR(-EINVAL);
+
+ spin_lock_irqsave(&pool->pool_lock, flags);
+ fmr = ib_fmr_cache_lookup(pool,
+ page_list,
+ list_len,
+ io_virtual_address);
+ if (fmr) {
+ /* found in cache */
+ ++fmr->ref_count;
+ if (fmr->ref_count == 1)
+ list_del(&fmr->list);
+
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+ return fmr;
+ }
+
+ if (list_empty(&pool->free_list)) {
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+ return ERR_PTR(-EAGAIN);
+ }
+
+ fmr = list_entry(pool->free_list.next, struct ib_pool_fmr, list);
+ list_del(&fmr->list);
+ hlist_del_init(&fmr->cache_node);
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+ result = ib_map_phys_fmr(fmr->fmr, page_list, list_len,
+ io_virtual_address);
+
+ if (result) {
+ spin_lock_irqsave(&pool->pool_lock, flags);
+ list_add(&fmr->list, &pool->free_list);
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+ printk(KERN_WARNING PFX "fmr_map returns %d\n", result);
+
+ return ERR_PTR(result);
+ }
+
+ ++fmr->remap_count;
+ fmr->ref_count = 1;
+
+ if (pool->cache_bucket) {
+ fmr->io_virtual_address = io_virtual_address;
+ fmr->page_list_len = list_len;
+ memcpy(fmr->page_list, page_list, list_len * sizeof(*page_list));
+
+ spin_lock_irqsave(&pool->pool_lock, flags);
+ hlist_add_head(&fmr->cache_node,
+ pool->cache_bucket + ib_fmr_hash(fmr->page_list[0]));
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+ }
+
+ return fmr;
+}
+EXPORT_SYMBOL(ib_fmr_pool_map_phys);
+
+/**
+ * ib_fmr_pool_unmap - Unmap FMR
+ * @fmr:FMR to unmap
+ *
+ * Unmap an FMR. The FMR mapping may remain valid until the FMR is
+ * reused (or until ib_flush_fmr_pool() is called).
+ */
+int ib_fmr_pool_unmap(struct ib_pool_fmr *fmr)
+{
+ struct ib_fmr_pool *pool;
+ unsigned long flags;
+
+ pool = fmr->pool;
+
+ spin_lock_irqsave(&pool->pool_lock, flags);
+
+ --fmr->ref_count;
+ if (!fmr->ref_count) {
+ if (fmr->remap_count < pool->max_remaps) {
+ list_add_tail(&fmr->list, &pool->free_list);
+ } else {
+ list_add_tail(&fmr->list, &pool->dirty_list);
+ if (++pool->dirty_len >= pool->dirty_watermark) {
+ atomic_inc(&pool->req_ser);
+ wake_up_process(pool->thread);
+ }
+ }
+ }
+
+#ifdef DEBUG
+ if (fmr->ref_count < 0)
+ printk(KERN_WARNING PFX "FMR %p has ref count %d < 0\n",
+ fmr, fmr->ref_count);
+#endif
+
+ spin_unlock_irqrestore(&pool->pool_lock, flags);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_fmr_pool_unmap);
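
To make the pool lifecycle described above concrete, here is a hypothetical sketch (not part of this patch) of a consumer creating a pool on a protection domain, mapping a DMA page list, and releasing the mapping; the parameter values and function names are illustrative, not recommendations.

#include <rdma/ib_fmr_pool.h>

static struct ib_fmr_pool *my_create_pool(struct ib_pd *pd)
{
        struct ib_fmr_pool_param params = {
                .max_pages_per_fmr = 64,
                .page_shift        = PAGE_SHIFT,
                .access            = IB_ACCESS_LOCAL_WRITE |
                                     IB_ACCESS_REMOTE_READ |
                                     IB_ACCESS_REMOTE_WRITE,
                .pool_size         = 32,
                .dirty_watermark   = 8,
                .cache             = 1,
        };

        return ib_create_fmr_pool(pd, &params);        /* ERR_PTR() on failure */
}

static int my_map_and_unmap(struct ib_fmr_pool *pool, u64 *dma_pages,
                            int npages, u64 iova)
{
        struct ib_pool_fmr *fmr;

        fmr = ib_fmr_pool_map_phys(pool, dma_pages, npages, iova);
        if (IS_ERR(fmr))
                return PTR_ERR(fmr);    /* -EAGAIN when the free list is empty */

        /* post work requests using fmr->fmr->lkey / fmr->fmr->rkey ... */

        ib_fmr_pool_unmap(fmr);         /* mapping stays valid until reuse/flush */
        return 0;
}

A caller that needs every unmapped FMR invalidated before proceeding calls ib_flush_fmr_pool(), and the whole pool is torn down with ib_destroy_fmr_pool().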
diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
new file mode 100644
index 000000000..ff9163dc1
--- /dev/null
+++ b/drivers/infiniband/core/iwcm.c
@@ -0,0 +1,1069 @@
+/*
+ * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ * Copyright (c) 2005 Network Appliance, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/idr.h>
+#include <linux/interrupt.h>
+#include <linux/rbtree.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/workqueue.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_addr.h>
+
+#include "iwcm.h"
+
+MODULE_AUTHOR("Tom Tucker");
+MODULE_DESCRIPTION("iWARP CM");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static struct workqueue_struct *iwcm_wq;
+struct iwcm_work {
+ struct work_struct work;
+ struct iwcm_id_private *cm_id;
+ struct list_head list;
+ struct iw_cm_event event;
+ struct list_head free_list;
+};
+
+static unsigned int default_backlog = 256;
+
+static struct ctl_table_header *iwcm_ctl_table_hdr;
+static struct ctl_table iwcm_ctl_table[] = {
+ {
+ .procname = "default_backlog",
+ .data = &default_backlog,
+ .maxlen = sizeof(default_backlog),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
+
+/*
+ * The following services provide a mechanism for pre-allocating iwcm_work
+ * elements. The design pre-allocates them based on the cm_id type:
+ * LISTENING IDS: Get enough elements preallocated to handle the
+ * listen backlog.
+ * ACTIVE IDS: 4: CONNECT_REPLY, ESTABLISHED, DISCONNECT, CLOSE
+ * PASSIVE IDS: 3: ESTABLISHED, DISCONNECT, CLOSE
+ *
+ * Allocating them in connect and listen avoids having to deal
+ * with allocation failures on the event upcall from the provider (which
+ * is called in the interrupt context).
+ *
+ * One exception is when creating the cm_id for incoming connection requests.
+ * There are two cases:
+ * 1) in the event upcall, cm_event_handler(), for a listening cm_id. If
+ * the backlog is exceeded, then no more connection request events will
+ * be processed. cm_event_handler() returns -ENOMEM in this case. It's up
+ * to the provider to reject the connection request.
+ * 2) in the connection request workqueue handler, cm_conn_req_handler().
+ * If work elements cannot be allocated for the new connect request cm_id,
+ * then IWCM will call the provider reject method. This is ok since
+ * cm_conn_req_handler() runs in the workqueue thread context.
+ */
+
+static struct iwcm_work *get_work(struct iwcm_id_private *cm_id_priv)
+{
+ struct iwcm_work *work;
+
+ if (list_empty(&cm_id_priv->work_free_list))
+ return NULL;
+ work = list_entry(cm_id_priv->work_free_list.next, struct iwcm_work,
+ free_list);
+ list_del_init(&work->free_list);
+ return work;
+}
+
+static void put_work(struct iwcm_work *work)
+{
+ list_add(&work->free_list, &work->cm_id->work_free_list);
+}
+
+static void dealloc_work_entries(struct iwcm_id_private *cm_id_priv)
+{
+ struct list_head *e, *tmp;
+
+ list_for_each_safe(e, tmp, &cm_id_priv->work_free_list)
+ kfree(list_entry(e, struct iwcm_work, free_list));
+}
+
+static int alloc_work_entries(struct iwcm_id_private *cm_id_priv, int count)
+{
+ struct iwcm_work *work;
+
+ BUG_ON(!list_empty(&cm_id_priv->work_free_list));
+ while (count--) {
+ work = kmalloc(sizeof(struct iwcm_work), GFP_KERNEL);
+ if (!work) {
+ dealloc_work_entries(cm_id_priv);
+ return -ENOMEM;
+ }
+ work->cm_id = cm_id_priv;
+ INIT_LIST_HEAD(&work->list);
+ put_work(work);
+ }
+ return 0;
+}
+
+/*
+ * Save private data from incoming connection requests to
+ * iw_cm_event, so the low level driver doesn't have to. Adjust
+ * the event ptr to point to the local copy.
+ */
+static int copy_private_data(struct iw_cm_event *event)
+{
+ void *p;
+
+ p = kmemdup(event->private_data, event->private_data_len, GFP_ATOMIC);
+ if (!p)
+ return -ENOMEM;
+ event->private_data = p;
+ return 0;
+}
+
+static void free_cm_id(struct iwcm_id_private *cm_id_priv)
+{
+ dealloc_work_entries(cm_id_priv);
+ kfree(cm_id_priv);
+}
+
+/*
+ * Release a reference on cm_id. If the last reference is being
+ * released, wake up the thread waiting in iw_destroy_cm_id() and
+ * return 1; otherwise return 0.
+ */
+static int iwcm_deref_id(struct iwcm_id_private *cm_id_priv)
+{
+ BUG_ON(atomic_read(&cm_id_priv->refcount) == 0);
+ if (atomic_dec_and_test(&cm_id_priv->refcount)) {
+ BUG_ON(!list_empty(&cm_id_priv->work_list));
+ complete(&cm_id_priv->destroy_comp);
+ return 1;
+ }
+
+ return 0;
+}
+
+static void add_ref(struct iw_cm_id *cm_id)
+{
+ struct iwcm_id_private *cm_id_priv;
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ atomic_inc(&cm_id_priv->refcount);
+}
+
+static void rem_ref(struct iw_cm_id *cm_id)
+{
+ struct iwcm_id_private *cm_id_priv;
+ int cb_destroy;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+ /*
+ * Test bit before deref in case the cm_id gets freed on another
+ * thread.
+ */
+ cb_destroy = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
+ if (iwcm_deref_id(cm_id_priv) && cb_destroy) {
+ BUG_ON(!list_empty(&cm_id_priv->work_list));
+ free_cm_id(cm_id_priv);
+ }
+}
+
+static int cm_event_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event);
+
+struct iw_cm_id *iw_create_cm_id(struct ib_device *device,
+ iw_cm_handler cm_handler,
+ void *context)
+{
+ struct iwcm_id_private *cm_id_priv;
+
+ cm_id_priv = kzalloc(sizeof(*cm_id_priv), GFP_KERNEL);
+ if (!cm_id_priv)
+ return ERR_PTR(-ENOMEM);
+
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ cm_id_priv->id.device = device;
+ cm_id_priv->id.cm_handler = cm_handler;
+ cm_id_priv->id.context = context;
+ cm_id_priv->id.event_handler = cm_event_handler;
+ cm_id_priv->id.add_ref = add_ref;
+ cm_id_priv->id.rem_ref = rem_ref;
+ spin_lock_init(&cm_id_priv->lock);
+ atomic_set(&cm_id_priv->refcount, 1);
+ init_waitqueue_head(&cm_id_priv->connect_wait);
+ init_completion(&cm_id_priv->destroy_comp);
+ INIT_LIST_HEAD(&cm_id_priv->work_list);
+ INIT_LIST_HEAD(&cm_id_priv->work_free_list);
+
+ return &cm_id_priv->id;
+}
+EXPORT_SYMBOL(iw_create_cm_id);
+
+
+static int iwcm_modify_qp_err(struct ib_qp *qp)
+{
+ struct ib_qp_attr qp_attr;
+
+ if (!qp)
+ return -EINVAL;
+
+ qp_attr.qp_state = IB_QPS_ERR;
+ return ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
+}
+
+/*
+ * This is really the RDMAC CLOSING state. It is most similar to the
+ * IB SQD QP state.
+ */
+static int iwcm_modify_qp_sqd(struct ib_qp *qp)
+{
+ struct ib_qp_attr qp_attr;
+
+ BUG_ON(qp == NULL);
+ qp_attr.qp_state = IB_QPS_SQD;
+ return ib_modify_qp(qp, &qp_attr, IB_QP_STATE);
+}
+
+/*
+ * CM_ID <-- CLOSING
+ *
+ * Block if a passive or active connection is currently being processed. Then
+ * process the event as follows:
+ * - If we are ESTABLISHED, move to CLOSING and modify the QP state
+ * based on the abrupt flag
+ * - If the connection is already in the CLOSING or IDLE state, the peer is
+ * disconnecting concurrently with us and we've already seen the
+ * DISCONNECT event -- ignore the request and return 0
+ * - Disconnect on a listening endpoint returns -EINVAL
+ */
+int iw_cm_disconnect(struct iw_cm_id *cm_id, int abrupt)
+{
+ struct iwcm_id_private *cm_id_priv;
+ unsigned long flags;
+ int ret = 0;
+ struct ib_qp *qp = NULL;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ /* Wait if we're currently in a connect or accept downcall */
+ wait_event(cm_id_priv->connect_wait,
+ !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags));
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_ESTABLISHED:
+ cm_id_priv->state = IW_CM_STATE_CLOSING;
+
+ /* QP could be <null> for a user-mode client */
+ if (cm_id_priv->qp)
+ qp = cm_id_priv->qp;
+ else
+ ret = -EINVAL;
+ break;
+ case IW_CM_STATE_LISTEN:
+ ret = -EINVAL;
+ break;
+ case IW_CM_STATE_CLOSING:
+ /* remote peer closed first */
+ case IW_CM_STATE_IDLE:
+ /* accept or connect returned !0 */
+ break;
+ case IW_CM_STATE_CONN_RECV:
+ /*
+ * App called disconnect before/without calling accept after
+ * the connect_request event was delivered.
+ */
+ break;
+ case IW_CM_STATE_CONN_SENT:
+ /* Can only get here if wait above fails */
+ default:
+ BUG();
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ if (qp) {
+ if (abrupt)
+ ret = iwcm_modify_qp_err(qp);
+ else
+ ret = iwcm_modify_qp_sqd(qp);
+
+ /*
+ * If both sides are disconnecting the QP could
+ * already be in ERR or SQD states
+ */
+ ret = 0;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(iw_cm_disconnect);
+
+/*
+ * CM_ID <-- DESTROYING
+ *
+ * Clean up all resources associated with the connection and release
+ * the initial reference taken by iw_create_cm_id.
+ */
+static void destroy_cm_id(struct iw_cm_id *cm_id)
+{
+ struct iwcm_id_private *cm_id_priv;
+ unsigned long flags;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ /*
+ * Wait if we're currently in a connect or accept downcall. A
+ * listening endpoint should never block here.
+ */
+ wait_event(cm_id_priv->connect_wait,
+ !test_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags));
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_LISTEN:
+ cm_id_priv->state = IW_CM_STATE_DESTROYING;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ /* destroy the listening endpoint */
+ cm_id->device->iwcm->destroy_listen(cm_id);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ break;
+ case IW_CM_STATE_ESTABLISHED:
+ cm_id_priv->state = IW_CM_STATE_DESTROYING;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ /* Abrupt close of the connection */
+ (void)iwcm_modify_qp_err(cm_id_priv->qp);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ break;
+ case IW_CM_STATE_IDLE:
+ case IW_CM_STATE_CLOSING:
+ cm_id_priv->state = IW_CM_STATE_DESTROYING;
+ break;
+ case IW_CM_STATE_CONN_RECV:
+ /*
+ * App called destroy before/without calling accept after
+ * receiving connection request event notification or
+ * returned nonzero from the event callback function.
+ * In either case, must tell the provider to reject.
+ */
+ cm_id_priv->state = IW_CM_STATE_DESTROYING;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ cm_id->device->iwcm->reject(cm_id, NULL, 0);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ break;
+ case IW_CM_STATE_CONN_SENT:
+ case IW_CM_STATE_DESTROYING:
+ default:
+ BUG();
+ break;
+ }
+ if (cm_id_priv->qp) {
+ cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+ cm_id_priv->qp = NULL;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ (void)iwcm_deref_id(cm_id_priv);
+}
+
+/*
+ * This function is only called by the application thread and cannot
+ * be called by the event thread. The function will wait for all
+ * references to be released on the cm_id and then kfree the cm_id
+ * object.
+ */
+void iw_destroy_cm_id(struct iw_cm_id *cm_id)
+{
+ struct iwcm_id_private *cm_id_priv;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ BUG_ON(test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags));
+
+ destroy_cm_id(cm_id);
+
+ wait_for_completion(&cm_id_priv->destroy_comp);
+
+ free_cm_id(cm_id_priv);
+}
+EXPORT_SYMBOL(iw_destroy_cm_id);
+
+/*
+ * CM_ID <-- LISTEN
+ *
+ * Start listening for connect requests. Generates one CONNECT_REQUEST
+ * event for each inbound connect request.
+ */
+int iw_cm_listen(struct iw_cm_id *cm_id, int backlog)
+{
+ struct iwcm_id_private *cm_id_priv;
+ unsigned long flags;
+ int ret;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+ if (!backlog)
+ backlog = default_backlog;
+
+ ret = alloc_work_entries(cm_id_priv, backlog);
+ if (ret)
+ return ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_IDLE:
+ cm_id_priv->state = IW_CM_STATE_LISTEN;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ ret = cm_id->device->iwcm->create_listen(cm_id, backlog);
+ if (ret)
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(iw_cm_listen);
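
Most consumers reach this code through the RDMA CM rather than the raw iw_cm interface, but the passive-side flow documented above can be sketched directly against it as follows. This is hypothetical and not part of this patch; the handler, backlog value, and connection parameters are illustrative, and a real consumer would supply an actual QP number.

#include <linux/err.h>
#include <rdma/iw_cm.h>

/* Hypothetical handler: accept every inbound CONNECT_REQUEST. */
static int my_iw_handler(struct iw_cm_id *cm_id, struct iw_cm_event *event)
{
        struct iw_cm_conn_param param = {
                .ord = 1,
                .ird = 1,
                .qpn = 0,       /* must be a real QP number in practice */
        };

        switch (event->event) {
        case IW_CM_EVENT_CONNECT_REQUEST:
                /* event->private_data was duplicated by copy_private_data() */
                return iw_cm_accept(cm_id, &param);
        case IW_CM_EVENT_DISCONNECT:
        case IW_CM_EVENT_CLOSE:
        default:
                return 0;       /* nonzero would tear the cm_id down */
        }
}

static struct iw_cm_id *my_start_listen(struct ib_device *device)
{
        struct iw_cm_id *cm_id;
        int ret;

        cm_id = iw_create_cm_id(device, my_iw_handler, NULL);
        if (IS_ERR(cm_id))
                return cm_id;

        /* cm_id->local_addr must be filled in before listening */
        ret = iw_cm_listen(cm_id, 8);   /* backlog 0 falls back to default_backlog */
        if (ret) {
                iw_destroy_cm_id(cm_id);
                return ERR_PTR(ret);
        }
        return cm_id;
}

On the active side the same handler would see IW_CM_EVENT_CONNECT_REPLY after iw_cm_connect(), and teardown goes through iw_cm_disconnect() and iw_destroy_cm_id() exactly as the state-transition comments above describe.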
+
+/*
+ * CM_ID <-- IDLE
+ *
+ * Rejects an inbound connection request. No events are generated.
+ */
+int iw_cm_reject(struct iw_cm_id *cm_id,
+ const void *private_data,
+ u8 private_data_len)
+{
+ struct iwcm_id_private *cm_id_priv;
+ unsigned long flags;
+ int ret;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
+ return -EINVAL;
+ }
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ ret = cm_id->device->iwcm->reject(cm_id, private_data,
+ private_data_len);
+
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
+
+ return ret;
+}
+EXPORT_SYMBOL(iw_cm_reject);
+
+/*
+ * CM_ID <-- ESTABLISHED
+ *
+ * Accepts an inbound connection request and generates an ESTABLISHED
+ * event. Callers of iw_cm_disconnect and iw_destroy_cm_id will block
+ * until the ESTABLISHED event is received from the provider.
+ */
+int iw_cm_accept(struct iw_cm_id *cm_id,
+ struct iw_cm_conn_param *iw_param)
+{
+ struct iwcm_id_private *cm_id_priv;
+ struct ib_qp *qp;
+ unsigned long flags;
+ int ret;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id_priv->state != IW_CM_STATE_CONN_RECV) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
+ return -EINVAL;
+ }
+ /* Get the ib_qp given the QPN */
+ qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
+ if (!qp) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
+ return -EINVAL;
+ }
+ cm_id->device->iwcm->add_ref(qp);
+ cm_id_priv->qp = qp;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ ret = cm_id->device->iwcm->accept(cm_id, iw_param);
+ if (ret) {
+ /* An error on accept precludes provider events */
+ BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id_priv->qp) {
+ cm_id->device->iwcm->rem_ref(qp);
+ cm_id_priv->qp = NULL;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(iw_cm_accept);
+
+/*
+ * Active Side: CM_ID <-- CONN_SENT
+ *
+ * If successful, results in the generation of a CONNECT_REPLY
+ * event. iw_cm_disconnect and iw_cm_destroy will block until the
+ * CONNECT_REPLY event is received from the provider.
+ */
+int iw_cm_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+ struct iwcm_id_private *cm_id_priv;
+ int ret;
+ unsigned long flags;
+ struct ib_qp *qp;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+ ret = alloc_work_entries(cm_id_priv, 4);
+ if (ret)
+ return ret;
+
+ set_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+ if (cm_id_priv->state != IW_CM_STATE_IDLE) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
+ return -EINVAL;
+ }
+
+ /* Get the ib_qp given the QPN */
+ qp = cm_id->device->iwcm->get_qp(cm_id->device, iw_param->qpn);
+ if (!qp) {
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
+ return -EINVAL;
+ }
+ cm_id->device->iwcm->add_ref(qp);
+ cm_id_priv->qp = qp;
+ cm_id_priv->state = IW_CM_STATE_CONN_SENT;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ ret = cm_id->device->iwcm->connect(cm_id, iw_param);
+ if (ret) {
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id_priv->qp) {
+ cm_id->device->iwcm->rem_ref(qp);
+ cm_id_priv->qp = NULL;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT);
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ wake_up_all(&cm_id_priv->connect_wait);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(iw_cm_connect);
+
+/*
+ * Passive Side: new CM_ID <-- CONN_RECV
+ *
+ * Handles an inbound connect request. The function creates a new
+ * iw_cm_id to represent the new connection and inherits the client
+ * callback function and other attributes from the listening parent.
+ *
+ * The work item contains a pointer to the listen_cm_id and the event. The
+ * listen_cm_id contains the client cm_handler, context and
+ * device. These are copied when the cm_id is cloned. The event
+ * contains the new four-tuple.
+ *
+ * An error on the child should not affect the parent, so this
+ * function does not return a value.
+ */
+static void cm_conn_req_handler(struct iwcm_id_private *listen_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ unsigned long flags;
+ struct iw_cm_id *cm_id;
+ struct iwcm_id_private *cm_id_priv;
+ int ret;
+
+ /*
+ * The provider should never generate a connection request
+ * event with a bad status.
+ */
+ BUG_ON(iw_event->status);
+
+ cm_id = iw_create_cm_id(listen_id_priv->id.device,
+ listen_id_priv->id.cm_handler,
+ listen_id_priv->id.context);
+ /* If the cm_id could not be created, ignore the request */
+ if (IS_ERR(cm_id))
+ goto out;
+
+ cm_id->provider_data = iw_event->provider_data;
+ cm_id->local_addr = iw_event->local_addr;
+ cm_id->remote_addr = iw_event->remote_addr;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ cm_id_priv->state = IW_CM_STATE_CONN_RECV;
+
+ /*
+ * We could be destroying the listening id. If so, ignore this
+ * upcall.
+ */
+ spin_lock_irqsave(&listen_id_priv->lock, flags);
+ if (listen_id_priv->state != IW_CM_STATE_LISTEN) {
+ spin_unlock_irqrestore(&listen_id_priv->lock, flags);
+ iw_cm_reject(cm_id, NULL, 0);
+ iw_destroy_cm_id(cm_id);
+ goto out;
+ }
+ spin_unlock_irqrestore(&listen_id_priv->lock, flags);
+
+ ret = alloc_work_entries(cm_id_priv, 3);
+ if (ret) {
+ iw_cm_reject(cm_id, NULL, 0);
+ iw_destroy_cm_id(cm_id);
+ goto out;
+ }
+
+ /* Call the client CM handler */
+ ret = cm_id->cm_handler(cm_id, iw_event);
+ if (ret) {
+ iw_cm_reject(cm_id, NULL, 0);
+ set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
+ destroy_cm_id(cm_id);
+ if (atomic_read(&cm_id_priv->refcount) == 0)
+ free_cm_id(cm_id_priv);
+ }
+
+out:
+ if (iw_event->private_data_len)
+ kfree(iw_event->private_data);
+}
+
+/*
+ * Passive Side: CM_ID <-- ESTABLISHED
+ *
+ * The provider generated an ESTABLISHED event which means that
+ * the MPA negotiation has completed successfully and we are now in MPA
+ * FPDU mode.
+ *
+ * This event can only be received in the CONN_RECV state. If the
+ * remote peer closed, the ESTABLISHED event would be received followed
+ * by the CLOSE event. If the app closes, it will block until we wake
+ * it up after processing this event.
+ */
+static int cm_conn_est_handler(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+ /*
+ * We clear the CONNECT_WAIT bit here to allow the callback
+ * function to call iw_cm_disconnect. Calling iw_destroy_cm_id
+ * from a callback handler is not allowed.
+ */
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_RECV);
+ cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+ wake_up_all(&cm_id_priv->connect_wait);
+
+ return ret;
+}
+
+/*
+ * Active Side: CM_ID <-- ESTABLISHED
+ *
+ * The app has called connect and is waiting for the established event
+ * before posting its requests to the server. This event will wake up
+ * anyone blocked in iw_cm_disconnect or iw_destroy_cm_id.
+ */
+static int cm_conn_rep_handler(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ /*
+ * Clear the connect wait bit so a callback function calling
+ * iw_cm_disconnect will not wait and deadlock this thread
+ */
+ clear_bit(IWCM_F_CONNECT_WAIT, &cm_id_priv->flags);
+ BUG_ON(cm_id_priv->state != IW_CM_STATE_CONN_SENT);
+ if (iw_event->status == 0) {
+ cm_id_priv->id.local_addr = iw_event->local_addr;
+ cm_id_priv->id.remote_addr = iw_event->remote_addr;
+ cm_id_priv->state = IW_CM_STATE_ESTABLISHED;
+ } else {
+ /* REJECTED or RESET */
+ cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+ cm_id_priv->qp = NULL;
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+
+ if (iw_event->private_data_len)
+ kfree(iw_event->private_data);
+
+ /* Wake up waiters on connect complete */
+ wake_up_all(&cm_id_priv->connect_wait);
+
+ return ret;
+}
+
+/*
+ * CM_ID <-- CLOSING
+ *
+ * If in the ESTABLISHED state, move to CLOSING.
+ */
+static void cm_disconnect_handler(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ if (cm_id_priv->state == IW_CM_STATE_ESTABLISHED)
+ cm_id_priv->state = IW_CM_STATE_CLOSING;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+}
+
+/*
+ * CM_ID <-- IDLE
+ *
+ * If in the ESTABLISHED or CLOSING states, the QP will have been
+ * moved by the provider to the ERR state. Disassociate the CM_ID from
+ * the QP, move to IDLE, and remove the 'connected' reference.
+ *
+ * If in some other state, the cm_id was destroyed asynchronously.
+ * This is the last reference that will result in waking up
+ * the app thread blocked in iw_destroy_cm_id.
+ */
+static int cm_close_handler(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ unsigned long flags;
+ int ret = 0;
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+
+ if (cm_id_priv->qp) {
+ cm_id_priv->id.device->iwcm->rem_ref(cm_id_priv->qp);
+ cm_id_priv->qp = NULL;
+ }
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_ESTABLISHED:
+ case IW_CM_STATE_CLOSING:
+ cm_id_priv->state = IW_CM_STATE_IDLE;
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ ret = cm_id_priv->id.cm_handler(&cm_id_priv->id, iw_event);
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ break;
+ case IW_CM_STATE_DESTROYING:
+ break;
+ default:
+ BUG();
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ return ret;
+}
+
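+/*
+ * Dispatch a dequeued event to the handler for its event type. The
+ * CONNECT_REQUEST and DISCONNECT handlers do not return a status; for
+ * the other events the client callback's return value is propagated.
+ */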
+static int process_event(struct iwcm_id_private *cm_id_priv,
+ struct iw_cm_event *iw_event)
+{
+ int ret = 0;
+
+ switch (iw_event->event) {
+ case IW_CM_EVENT_CONNECT_REQUEST:
+ cm_conn_req_handler(cm_id_priv, iw_event);
+ break;
+ case IW_CM_EVENT_CONNECT_REPLY:
+ ret = cm_conn_rep_handler(cm_id_priv, iw_event);
+ break;
+ case IW_CM_EVENT_ESTABLISHED:
+ ret = cm_conn_est_handler(cm_id_priv, iw_event);
+ break;
+ case IW_CM_EVENT_DISCONNECT:
+ cm_disconnect_handler(cm_id_priv, iw_event);
+ break;
+ case IW_CM_EVENT_CLOSE:
+ ret = cm_close_handler(cm_id_priv, iw_event);
+ break;
+ default:
+ BUG();
+ }
+
+ return ret;
+}
+
+/*
+ * Process events on the work_list for the cm_id. If the callback
+ * function requests that the cm_id be deleted, a flag is set in the
+ * cm_id flags to indicate that when the last reference is
+ * removed, the cm_id is to be destroyed. This is necessary to
+ * distinguish between an object that will be destroyed by the app
+ * thread blocked on the destroy_comp completion vs. an object destroyed
+ * here synchronously when the last reference is removed.
+ */
+static void cm_work_handler(struct work_struct *_work)
+{
+ struct iwcm_work *work = container_of(_work, struct iwcm_work, work);
+ struct iw_cm_event levent;
+ struct iwcm_id_private *cm_id_priv = work->cm_id;
+ unsigned long flags;
+ int empty;
+ int ret = 0;
+ int destroy_id;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ empty = list_empty(&cm_id_priv->work_list);
+ while (!empty) {
+ work = list_entry(cm_id_priv->work_list.next,
+ struct iwcm_work, list);
+ list_del_init(&work->list);
+ empty = list_empty(&cm_id_priv->work_list);
+ levent = work->event;
+ put_work(work);
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+
+ ret = process_event(cm_id_priv, &levent);
+ if (ret) {
+ set_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
+ destroy_cm_id(&cm_id_priv->id);
+ }
+ BUG_ON(atomic_read(&cm_id_priv->refcount) == 0);
+ destroy_id = test_bit(IWCM_F_CALLBACK_DESTROY, &cm_id_priv->flags);
+ if (iwcm_deref_id(cm_id_priv)) {
+ if (destroy_id) {
+ BUG_ON(!list_empty(&cm_id_priv->work_list));
+ free_cm_id(cm_id_priv);
+ }
+ return;
+ }
+ if (empty)
+ return;
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+}
+
+/*
+ * This function is called in interrupt context. Schedule events on
+ * the iwcm_wq thread to allow callback functions to downcall into
+ * the CM and/or block. Events are queued to a per-CM_ID
+ * work_list. If this is the first event on the work_list, the work
+ * element is also queued on the iwcm_wq thread.
+ *
+ * Each event holds a reference on the cm_id. Until the last posted
+ * event has been delivered and processed, the cm_id cannot be
+ * deleted.
+ *
+ * Returns:
+ * 0 - the event was handled.
+ * -ENOMEM - the event was not handled due to lack of resources.
+ */
+static int cm_event_handler(struct iw_cm_id *cm_id,
+ struct iw_cm_event *iw_event)
+{
+ struct iwcm_work *work;
+ struct iwcm_id_private *cm_id_priv;
+ unsigned long flags;
+ int ret = 0;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ work = get_work(cm_id_priv);
+ if (!work) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ INIT_WORK(&work->work, cm_work_handler);
+ work->cm_id = cm_id_priv;
+ work->event = *iw_event;
+
+ if ((work->event.event == IW_CM_EVENT_CONNECT_REQUEST ||
+ work->event.event == IW_CM_EVENT_CONNECT_REPLY) &&
+ work->event.private_data_len) {
+ ret = copy_private_data(&work->event);
+ if (ret) {
+ put_work(work);
+ goto out;
+ }
+ }
+
+ atomic_inc(&cm_id_priv->refcount);
+ if (list_empty(&cm_id_priv->work_list)) {
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ queue_work(iwcm_wq, &work->work);
+ } else {
+ list_add_tail(&work->list, &cm_id_priv->work_list);
+ }
+out:
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+
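+/*
+ * QP attributes for the INIT/RTR transitions: enable remote read and
+ * write access while the cm_id is in a state where a connection can
+ * exist; otherwise return -EINVAL.
+ */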
+static int iwcm_init_qp_init_attr(struct iwcm_id_private *cm_id_priv,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_IDLE:
+ case IW_CM_STATE_CONN_SENT:
+ case IW_CM_STATE_CONN_RECV:
+ case IW_CM_STATE_ESTABLISHED:
+ *qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS;
+ qp_attr->qp_access_flags = IB_ACCESS_REMOTE_WRITE|
+ IB_ACCESS_REMOTE_READ;
+ ret = 0;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+
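+/*
+ * QP attributes for the RTS transition: the iWARP CM requires no
+ * additional attributes, so an empty attribute mask is returned for
+ * any valid state.
+ */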
+static int iwcm_init_qp_rts_attr(struct iwcm_id_private *cm_id_priv,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&cm_id_priv->lock, flags);
+ switch (cm_id_priv->state) {
+ case IW_CM_STATE_IDLE:
+ case IW_CM_STATE_CONN_SENT:
+ case IW_CM_STATE_CONN_RECV:
+ case IW_CM_STATE_ESTABLISHED:
+ *qp_attr_mask = 0;
+ ret = 0;
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ spin_unlock_irqrestore(&cm_id_priv->lock, flags);
+ return ret;
+}
+
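+/*
+ * Fill in the QP attributes and attribute mask the ULP needs to move
+ * its QP to the state given in qp_attr->qp_state (INIT, RTR or RTS).
+ */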
+int iw_cm_init_qp_attr(struct iw_cm_id *cm_id,
+ struct ib_qp_attr *qp_attr,
+ int *qp_attr_mask)
+{
+ struct iwcm_id_private *cm_id_priv;
+ int ret;
+
+ cm_id_priv = container_of(cm_id, struct iwcm_id_private, id);
+ switch (qp_attr->qp_state) {
+ case IB_QPS_INIT:
+ case IB_QPS_RTR:
+ ret = iwcm_init_qp_init_attr(cm_id_priv,
+ qp_attr, qp_attr_mask);
+ break;
+ case IB_QPS_RTS:
+ ret = iwcm_init_qp_rts_attr(cm_id_priv,
+ qp_attr, qp_attr_mask);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ return ret;
+}
+EXPORT_SYMBOL(iw_cm_init_qp_attr);
+
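+/*
+ * Module init: create the single-threaded "iw_cm_wq" event workqueue
+ * and register the "net/iw_cm" sysctl table.
+ */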
+static int __init iw_cm_init(void)
+{
+ iwcm_wq = create_singlethread_workqueue("iw_cm_wq");
+ if (!iwcm_wq)
+ return -ENOMEM;
+
+ iwcm_ctl_table_hdr = register_net_sysctl(&init_net, "net/iw_cm",
+ iwcm_ctl_table);
+ if (!iwcm_ctl_table_hdr) {
+ pr_err("iw_cm: couldn't register sysctl paths\n");
+ destroy_workqueue(iwcm_wq);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void __exit iw_cm_cleanup(void)
+{
+ unregister_net_sysctl_table(iwcm_ctl_table_hdr);
+ destroy_workqueue(iwcm_wq);
+}
+
+module_init(iw_cm_init);
+module_exit(iw_cm_cleanup);
diff --git a/drivers/infiniband/core/iwcm.h b/drivers/infiniband/core/iwcm.h
new file mode 100644
index 000000000..3f6cc8256
--- /dev/null
+++ b/drivers/infiniband/core/iwcm.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2005 Network Appliance, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef IWCM_H
+#define IWCM_H
+
+enum iw_cm_state {
+ IW_CM_STATE_IDLE, /* unbound, inactive */
+ IW_CM_STATE_LISTEN, /* listen waiting for connect */
+ IW_CM_STATE_CONN_RECV, /* inbound waiting for user accept */
+ IW_CM_STATE_CONN_SENT, /* outbound waiting for peer accept */
+ IW_CM_STATE_ESTABLISHED, /* established */
+ IW_CM_STATE_CLOSING, /* disconnect */
+ IW_CM_STATE_DESTROYING /* object being deleted */
+};
+
+struct iwcm_id_private {
+ struct iw_cm_id id;
+ enum iw_cm_state state;
+ unsigned long flags;
+ struct ib_qp *qp;
+ struct completion destroy_comp;
+ wait_queue_head_t connect_wait;
+ struct list_head work_list;
+ spinlock_t lock;
+ atomic_t refcount;
+ struct list_head work_free_list;
+};
+
+#define IWCM_F_CALLBACK_DESTROY 1
+#define IWCM_F_CONNECT_WAIT 2
+
+#endif /* IWCM_H */
diff --git a/drivers/infiniband/core/iwpm_msg.c b/drivers/infiniband/core/iwpm_msg.c
new file mode 100644
index 000000000..e6ffa2e66
--- /dev/null
+++ b/drivers/infiniband/core/iwpm_msg.c
@@ -0,0 +1,756 @@
+/*
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ * Copyright (c) 2014 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "iwpm_util.h"
+
+static const char iwpm_ulib_name[IWPM_ULIBNAME_SIZE] = "iWarpPortMapperUser";
+static int iwpm_ulib_version = 3;
+static int iwpm_user_pid = IWPM_PID_UNDEFINED;
+static atomic_t echo_nlmsg_seq;
+
+int iwpm_valid_pid(void)
+{
+ return iwpm_user_pid > 0;
+}
+EXPORT_SYMBOL(iwpm_valid_pid);
+
+/*
+ * iwpm_register_pid - Send a netlink query to user space
+ * for the iwarp port mapper pid
+ *
+ * nlmsg attributes:
+ * [IWPM_NLA_REG_PID_SEQ]
+ * [IWPM_NLA_REG_IF_NAME]
+ * [IWPM_NLA_REG_IBDEV_NAME]
+ * [IWPM_NLA_REG_ULIB_NAME]
+ */
+int iwpm_register_pid(struct iwpm_dev_data *pm_msg, u8 nl_client)
+{
+ struct sk_buff *skb = NULL;
+ struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ struct nlmsghdr *nlh;
+ u32 msg_seq;
+ const char *err_str = "";
+ int ret = -EINVAL;
+
+ if (!iwpm_valid_client(nl_client)) {
+ err_str = "Invalid port mapper client";
+ goto pid_query_error;
+ }
+ if (iwpm_registered_client(nl_client))
+ return 0;
+ skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REG_PID, &nlh, nl_client);
+ if (!skb) {
+ err_str = "Unable to create a nlmsg";
+ goto pid_query_error;
+ }
+ nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+ nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, nl_client, GFP_KERNEL);
+ if (!nlmsg_request) {
+ err_str = "Unable to allocate netlink request";
+ goto pid_query_error;
+ }
+ msg_seq = atomic_read(&echo_nlmsg_seq);
+
+ /* fill in the pid request message */
+ err_str = "Unable to put attribute of the nlmsg";
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_REG_PID_SEQ);
+ if (ret)
+ goto pid_query_error;
+ ret = ibnl_put_attr(skb, nlh, IWPM_IFNAME_SIZE,
+ pm_msg->if_name, IWPM_NLA_REG_IF_NAME);
+ if (ret)
+ goto pid_query_error;
+ ret = ibnl_put_attr(skb, nlh, IWPM_DEVNAME_SIZE,
+ pm_msg->dev_name, IWPM_NLA_REG_IBDEV_NAME);
+ if (ret)
+ goto pid_query_error;
+ ret = ibnl_put_attr(skb, nlh, IWPM_ULIBNAME_SIZE,
+ (char *)iwpm_ulib_name, IWPM_NLA_REG_ULIB_NAME);
+ if (ret)
+ goto pid_query_error;
+
+ pr_debug("%s: Multicasting a nlmsg (dev = %s ifname = %s iwpm = %s)\n",
+ __func__, pm_msg->dev_name, pm_msg->if_name, iwpm_ulib_name);
+
+ ret = ibnl_multicast(skb, nlh, RDMA_NL_GROUP_IWPM, GFP_KERNEL);
+ if (ret) {
+ skb = NULL; /* skb is freed in the netlink send-op handling */
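+ /*
+ * The request could not be sent, most likely because no
+ * port mapper daemon is listening; mark the client
+ * registered and the pid unavailable so that registration
+ * is not retried.
+ */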
+ iwpm_set_registered(nl_client, 1);
+ iwpm_user_pid = IWPM_PID_UNAVAILABLE;
+ err_str = "Unable to send a nlmsg";
+ goto pid_query_error;
+ }
+ nlmsg_request->req_buffer = pm_msg;
+ ret = iwpm_wait_complete_req(nlmsg_request);
+ return ret;
+pid_query_error:
+ pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client);
+ if (skb)
+ dev_kfree_skb(skb);
+ if (nlmsg_request)
+ iwpm_free_nlmsg_request(&nlmsg_request->kref);
+ return ret;
+}
+EXPORT_SYMBOL(iwpm_register_pid);
+
+/*
+ * iwpm_add_mapping - Send a netlink add mapping message
+ * to the port mapper
+ * nlmsg attributes:
+ * [IWPM_NLA_MANAGE_MAPPING_SEQ]
+ * [IWPM_NLA_MANAGE_ADDR]
+ */
+int iwpm_add_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
+{
+ struct sk_buff *skb = NULL;
+ struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ struct nlmsghdr *nlh;
+ u32 msg_seq;
+ const char *err_str = "";
+ int ret = -EINVAL;
+
+ if (!iwpm_valid_client(nl_client)) {
+ err_str = "Invalid port mapper client";
+ goto add_mapping_error;
+ }
+ if (!iwpm_registered_client(nl_client)) {
+ err_str = "Unregistered port mapper client";
+ goto add_mapping_error;
+ }
+ if (!iwpm_valid_pid())
+ return 0;
+ skb = iwpm_create_nlmsg(RDMA_NL_IWPM_ADD_MAPPING, &nlh, nl_client);
+ if (!skb) {
+ err_str = "Unable to create a nlmsg";
+ goto add_mapping_error;
+ }
+ nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+ nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq, nl_client, GFP_KERNEL);
+ if (!nlmsg_request) {
+ err_str = "Unable to allocate netlink request";
+ goto add_mapping_error;
+ }
+ msg_seq = atomic_read(&echo_nlmsg_seq);
+ /* fill in the add mapping message */
+ err_str = "Unable to put attribute of the nlmsg";
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq,
+ IWPM_NLA_MANAGE_MAPPING_SEQ);
+ if (ret)
+ goto add_mapping_error;
+ ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage),
+ &pm_msg->loc_addr, IWPM_NLA_MANAGE_ADDR);
+ if (ret)
+ goto add_mapping_error;
+ nlmsg_request->req_buffer = pm_msg;
+
+ ret = ibnl_unicast(skb, nlh, iwpm_user_pid);
+ if (ret) {
+ skb = NULL; /* skb is freed in the netlink send-op handling */
+ iwpm_user_pid = IWPM_PID_UNDEFINED;
+ err_str = "Unable to send a nlmsg";
+ goto add_mapping_error;
+ }
+ ret = iwpm_wait_complete_req(nlmsg_request);
+ return ret;
+add_mapping_error:
+ pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client);
+ if (skb)
+ dev_kfree_skb(skb);
+ if (nlmsg_request)
+ iwpm_free_nlmsg_request(&nlmsg_request->kref);
+ return ret;
+}
+EXPORT_SYMBOL(iwpm_add_mapping);
+
+/*
+ * iwpm_add_and_query_mapping - Send a netlink add and query
+ * mapping message to the port mapper
+ * nlmsg attributes:
+ * [IWPM_NLA_QUERY_MAPPING_SEQ]
+ * [IWPM_NLA_QUERY_LOCAL_ADDR]
+ * [IWPM_NLA_QUERY_REMOTE_ADDR]
+ */
+int iwpm_add_and_query_mapping(struct iwpm_sa_data *pm_msg, u8 nl_client)
+{
+ struct sk_buff *skb = NULL;
+ struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ struct nlmsghdr *nlh;
+ u32 msg_seq;
+ const char *err_str = "";
+ int ret = -EINVAL;
+
+ if (!iwpm_valid_client(nl_client)) {
+ err_str = "Invalid port mapper client";
+ goto query_mapping_error;
+ }
+ if (!iwpm_registered_client(nl_client)) {
+ err_str = "Unregistered port mapper client";
+ goto query_mapping_error;
+ }
+ if (!iwpm_valid_pid())
+ return 0;
+ ret = -ENOMEM;
+ skb = iwpm_create_nlmsg(RDMA_NL_IWPM_QUERY_MAPPING, &nlh, nl_client);
+ if (!skb) {
+ err_str = "Unable to create a nlmsg";
+ goto query_mapping_error;
+ }
+ nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+ nlmsg_request = iwpm_get_nlmsg_request(nlh->nlmsg_seq,
+ nl_client, GFP_KERNEL);
+ if (!nlmsg_request) {
+ err_str = "Unable to allocate netlink request";
+ goto query_mapping_error;
+ }
+ msg_seq = atomic_read(&echo_nlmsg_seq);
+
+ /* fill in the query message */
+ err_str = "Unable to put attribute of the nlmsg";
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq,
+ IWPM_NLA_QUERY_MAPPING_SEQ);
+ if (ret)
+ goto query_mapping_error;
+ ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage),
+ &pm_msg->loc_addr, IWPM_NLA_QUERY_LOCAL_ADDR);
+ if (ret)
+ goto query_mapping_error;
+ ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage),
+ &pm_msg->rem_addr, IWPM_NLA_QUERY_REMOTE_ADDR);
+ if (ret)
+ goto query_mapping_error;
+ nlmsg_request->req_buffer = pm_msg;
+
+ ret = ibnl_unicast(skb, nlh, iwpm_user_pid);
+ if (ret) {
+ skb = NULL; /* skb is freed in the netlink send-op handling */
+ err_str = "Unable to send a nlmsg";
+ goto query_mapping_error;
+ }
+ ret = iwpm_wait_complete_req(nlmsg_request);
+ return ret;
+query_mapping_error:
+ pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client);
+ if (skb)
+ dev_kfree_skb(skb);
+ if (nlmsg_request)
+ iwpm_free_nlmsg_request(&nlmsg_request->kref);
+ return ret;
+}
+EXPORT_SYMBOL(iwpm_add_and_query_mapping);
+
+/*
+ * iwpm_remove_mapping - Send a netlink remove mapping message
+ * to the port mapper
+ * nlmsg attributes:
+ * [IWPM_NLA_MANAGE_MAPPING_SEQ]
+ * [IWPM_NLA_MANAGE_ADDR]
+ */
+int iwpm_remove_mapping(struct sockaddr_storage *local_addr, u8 nl_client)
+{
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ u32 msg_seq;
+ const char *err_str = "";
+ int ret = -EINVAL;
+
+ if (!iwpm_valid_client(nl_client)) {
+ err_str = "Invalid port mapper client";
+ goto remove_mapping_error;
+ }
+ if (!iwpm_registered_client(nl_client)) {
+ err_str = "Unregistered port mapper client";
+ goto remove_mapping_error;
+ }
+ if (!iwpm_valid_pid())
+ return 0;
+ skb = iwpm_create_nlmsg(RDMA_NL_IWPM_REMOVE_MAPPING, &nlh, nl_client);
+ if (!skb) {
+ ret = -ENOMEM;
+ err_str = "Unable to create a nlmsg";
+ goto remove_mapping_error;
+ }
+ msg_seq = atomic_read(&echo_nlmsg_seq);
+ nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+ err_str = "Unable to put attribute of the nlmsg";
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq,
+ IWPM_NLA_MANAGE_MAPPING_SEQ);
+ if (ret)
+ goto remove_mapping_error;
+ ret = ibnl_put_attr(skb, nlh, sizeof(struct sockaddr_storage),
+ local_addr, IWPM_NLA_MANAGE_ADDR);
+ if (ret)
+ goto remove_mapping_error;
+
+ ret = ibnl_unicast(skb, nlh, iwpm_user_pid);
+ if (ret) {
+ skb = NULL; /* skb is freed in the netlink send-op handling */
+ iwpm_user_pid = IWPM_PID_UNDEFINED;
+ err_str = "Unable to send a nlmsg";
+ goto remove_mapping_error;
+ }
+ iwpm_print_sockaddr(local_addr,
+ "remove_mapping: Local sockaddr:");
+ return 0;
+remove_mapping_error:
+ pr_info("%s: %s (client = %d)\n", __func__, err_str, nl_client);
+ if (skb)
+ dev_kfree_skb_any(skb);
+ return ret;
+}
+EXPORT_SYMBOL(iwpm_remove_mapping);
+
+/* netlink attribute policy for the received response to register pid request */
+static const struct nla_policy resp_reg_policy[IWPM_NLA_RREG_PID_MAX] = {
+ [IWPM_NLA_RREG_PID_SEQ] = { .type = NLA_U32 },
+ [IWPM_NLA_RREG_IBDEV_NAME] = { .type = NLA_STRING,
+ .len = IWPM_DEVNAME_SIZE - 1 },
+ [IWPM_NLA_RREG_ULIB_NAME] = { .type = NLA_STRING,
+ .len = IWPM_ULIBNAME_SIZE - 1 },
+ [IWPM_NLA_RREG_ULIB_VER] = { .type = NLA_U16 },
+ [IWPM_NLA_RREG_PID_ERR] = { .type = NLA_U16 }
+};
+
+/*
+ * iwpm_register_pid_cb - Process a port mapper response to
+ * iwpm_register_pid()
+ */
+int iwpm_register_pid_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ struct nlattr *nltb[IWPM_NLA_RREG_PID_MAX];
+ struct iwpm_dev_data *pm_msg;
+ char *dev_name, *iwpm_name;
+ u32 msg_seq;
+ u8 nl_client;
+ u16 iwpm_version;
+ const char *msg_type = "Register Pid response";
+
+ if (iwpm_parse_nlmsg(cb, IWPM_NLA_RREG_PID_MAX,
+ resp_reg_policy, nltb, msg_type))
+ return -EINVAL;
+
+ msg_seq = nla_get_u32(nltb[IWPM_NLA_RREG_PID_SEQ]);
+ nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
+ if (!nlmsg_request) {
+ pr_info("%s: Could not find a matching request (seq = %u)\n",
+ __func__, msg_seq);
+ return -EINVAL;
+ }
+ pm_msg = nlmsg_request->req_buffer;
+ nl_client = nlmsg_request->nl_client;
+ dev_name = (char *)nla_data(nltb[IWPM_NLA_RREG_IBDEV_NAME]);
+ iwpm_name = (char *)nla_data(nltb[IWPM_NLA_RREG_ULIB_NAME]);
+ iwpm_version = nla_get_u16(nltb[IWPM_NLA_RREG_ULIB_VER]);
+
+ /* check device name, ulib name and version */
+ if (strcmp(pm_msg->dev_name, dev_name) ||
+ strcmp(iwpm_ulib_name, iwpm_name) ||
+ iwpm_version != iwpm_ulib_version) {
+
+ pr_info("%s: Incorrect info (dev = %s name = %s version = %d)\n",
+ __func__, dev_name, iwpm_name, iwpm_version);
+ nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
+ goto register_pid_response_exit;
+ }
+ iwpm_user_pid = cb->nlh->nlmsg_pid;
+ atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+ pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n",
+ __func__, iwpm_user_pid);
+ if (iwpm_valid_client(nl_client))
+ iwpm_set_registered(nl_client, 1);
+register_pid_response_exit:
+ nlmsg_request->request_done = 1;
+ /* release the reference taken by iwpm_find_nlmsg_request() */
+ kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request);
+ barrier();
+ wake_up(&nlmsg_request->waitq);
+ return 0;
+}
+EXPORT_SYMBOL(iwpm_register_pid_cb);
+
+/* netlink attribute policy for the received response to add mapping request */
+static const struct nla_policy resp_add_policy[IWPM_NLA_RMANAGE_MAPPING_MAX] = {
+ [IWPM_NLA_MANAGE_MAPPING_SEQ] = { .type = NLA_U32 },
+ [IWPM_NLA_MANAGE_ADDR] = { .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_MANAGE_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RMANAGE_MAPPING_ERR] = { .type = NLA_U16 }
+};
+
+/*
+ * iwpm_add_mapping_cb - Process a port mapper response to
+ * iwpm_add_mapping()
+ */
+int iwpm_add_mapping_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct iwpm_sa_data *pm_msg;
+ struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ struct nlattr *nltb[IWPM_NLA_RMANAGE_MAPPING_MAX];
+ struct sockaddr_storage *local_sockaddr;
+ struct sockaddr_storage *mapped_sockaddr;
+ const char *msg_type;
+ u32 msg_seq;
+
+ msg_type = "Add Mapping response";
+ if (iwpm_parse_nlmsg(cb, IWPM_NLA_RMANAGE_MAPPING_MAX,
+ resp_add_policy, nltb, msg_type))
+ return -EINVAL;
+
+ atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+
+ msg_seq = nla_get_u32(nltb[IWPM_NLA_MANAGE_MAPPING_SEQ]);
+ nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
+ if (!nlmsg_request) {
+ pr_info("%s: Could not find a matching request (seq = %u)\n",
+ __func__, msg_seq);
+ return -EINVAL;
+ }
+ pm_msg = nlmsg_request->req_buffer;
+ local_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_MANAGE_ADDR]);
+ mapped_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_MANAGE_MAPPED_LOC_ADDR]);
+
+ if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr)) {
+ nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
+ goto add_mapping_response_exit;
+ }
+ if (mapped_sockaddr->ss_family != local_sockaddr->ss_family) {
+ pr_info("%s: Sockaddr family doesn't match the requested one\n",
+ __func__);
+ nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
+ goto add_mapping_response_exit;
+ }
+ memcpy(&pm_msg->mapped_loc_addr, mapped_sockaddr,
+ sizeof(*mapped_sockaddr));
+ iwpm_print_sockaddr(&pm_msg->loc_addr,
+ "add_mapping: Local sockaddr:");
+ iwpm_print_sockaddr(&pm_msg->mapped_loc_addr,
+ "add_mapping: Mapped local sockaddr:");
+
+add_mapping_response_exit:
+ nlmsg_request->request_done = 1;
+ /* release the reference taken by iwpm_find_nlmsg_request() */
+ kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request);
+ barrier();
+ wake_up(&nlmsg_request->waitq);
+ return 0;
+}
+EXPORT_SYMBOL(iwpm_add_mapping_cb);
+
+/* netlink attribute policy for the response to add and query mapping request
+ * and response with remote address info */
+static const struct nla_policy resp_query_policy[IWPM_NLA_RQUERY_MAPPING_MAX] = {
+ [IWPM_NLA_QUERY_MAPPING_SEQ] = { .type = NLA_U32 },
+ [IWPM_NLA_QUERY_LOCAL_ADDR] = { .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_QUERY_REMOTE_ADDR] = { .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RQUERY_MAPPED_LOC_ADDR] = { .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RQUERY_MAPPED_REM_ADDR] = { .len = sizeof(struct sockaddr_storage) },
+ [IWPM_NLA_RQUERY_MAPPING_ERR] = { .type = NLA_U16 }
+};
+
+/*
+ * iwpm_add_and_query_mapping_cb - Process a port mapper response to
+ * iwpm_add_and_query_mapping()
+ */
+int iwpm_add_and_query_mapping_cb(struct sk_buff *skb,
+ struct netlink_callback *cb)
+{
+ struct iwpm_sa_data *pm_msg;
+ struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ struct nlattr *nltb[IWPM_NLA_RQUERY_MAPPING_MAX];
+ struct sockaddr_storage *local_sockaddr, *remote_sockaddr;
+ struct sockaddr_storage *mapped_loc_sockaddr, *mapped_rem_sockaddr;
+ const char *msg_type;
+ u32 msg_seq;
+ u16 err_code;
+
+ msg_type = "Query Mapping response";
+ if (iwpm_parse_nlmsg(cb, IWPM_NLA_RQUERY_MAPPING_MAX,
+ resp_query_policy, nltb, msg_type))
+ return -EINVAL;
+ atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+
+ msg_seq = nla_get_u32(nltb[IWPM_NLA_QUERY_MAPPING_SEQ]);
+ nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
+ if (!nlmsg_request) {
+ pr_info("%s: Could not find a matching request (seq = %u)\n",
+ __func__, msg_seq);
+ return -EINVAL;
+ }
+ pm_msg = nlmsg_request->req_buffer;
+ local_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]);
+ remote_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]);
+ mapped_loc_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]);
+ mapped_rem_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_REM_ADDR]);
+
+ err_code = nla_get_u16(nltb[IWPM_NLA_RQUERY_MAPPING_ERR]);
+ if (err_code == IWPM_REMOTE_QUERY_REJECT) {
+ pr_info("%s: Received a Reject (pid = %u, echo seq = %u)\n",
+ __func__, cb->nlh->nlmsg_pid, msg_seq);
+ nlmsg_request->err_code = IWPM_REMOTE_QUERY_REJECT;
+ }
+ if (iwpm_compare_sockaddr(local_sockaddr, &pm_msg->loc_addr) ||
+ iwpm_compare_sockaddr(remote_sockaddr, &pm_msg->rem_addr)) {
+ pr_info("%s: Incorrect local sockaddr\n", __func__);
+ nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
+ goto query_mapping_response_exit;
+ }
+ if (mapped_loc_sockaddr->ss_family != local_sockaddr->ss_family ||
+ mapped_rem_sockaddr->ss_family != remote_sockaddr->ss_family) {
+ pr_info("%s: Sockaddr family doesn't match the requested one\n",
+ __func__);
+ nlmsg_request->err_code = IWPM_USER_LIB_INFO_ERR;
+ goto query_mapping_response_exit;
+ }
+ memcpy(&pm_msg->mapped_loc_addr, mapped_loc_sockaddr,
+ sizeof(*mapped_loc_sockaddr));
+ memcpy(&pm_msg->mapped_rem_addr, mapped_rem_sockaddr,
+ sizeof(*mapped_rem_sockaddr));
+
+ iwpm_print_sockaddr(&pm_msg->loc_addr,
+ "query_mapping: Local sockaddr:");
+ iwpm_print_sockaddr(&pm_msg->mapped_loc_addr,
+ "query_mapping: Mapped local sockaddr:");
+ iwpm_print_sockaddr(&pm_msg->rem_addr,
+ "query_mapping: Remote sockaddr:");
+ iwpm_print_sockaddr(&pm_msg->mapped_rem_addr,
+ "query_mapping: Mapped remote sockaddr:");
+query_mapping_response_exit:
+ nlmsg_request->request_done = 1;
+ /* release the reference taken by iwpm_find_nlmsg_request() */
+ kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request);
+ barrier();
+ wake_up(&nlmsg_request->waitq);
+ return 0;
+}
+EXPORT_SYMBOL(iwpm_add_and_query_mapping_cb);
+
+/*
+ * iwpm_remote_info_cb - Process a port mapper message, containing
+ * the remote connecting peer address info
+ */
+int iwpm_remote_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nlattr *nltb[IWPM_NLA_RQUERY_MAPPING_MAX];
+ struct sockaddr_storage *local_sockaddr, *remote_sockaddr;
+ struct sockaddr_storage *mapped_loc_sockaddr, *mapped_rem_sockaddr;
+ struct iwpm_remote_info *rem_info;
+ const char *msg_type;
+ u8 nl_client;
+ int ret = -EINVAL;
+
+ msg_type = "Remote Mapping info";
+ if (iwpm_parse_nlmsg(cb, IWPM_NLA_RQUERY_MAPPING_MAX,
+ resp_query_policy, nltb, msg_type))
+ return ret;
+
+ nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type);
+ if (!iwpm_valid_client(nl_client)) {
+ pr_info("%s: Invalid port mapper client = %d\n",
+ __func__, nl_client);
+ return ret;
+ }
+ atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+
+ local_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_QUERY_LOCAL_ADDR]);
+ remote_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_QUERY_REMOTE_ADDR]);
+ mapped_loc_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_LOC_ADDR]);
+ mapped_rem_sockaddr = (struct sockaddr_storage *)
+ nla_data(nltb[IWPM_NLA_RQUERY_MAPPED_REM_ADDR]);
+
+ if (mapped_loc_sockaddr->ss_family != local_sockaddr->ss_family ||
+ mapped_rem_sockaddr->ss_family != remote_sockaddr->ss_family) {
+ pr_info("%s: Sockaddr family doesn't match the requested one\n",
+ __func__);
+ return ret;
+ }
+ rem_info = kzalloc(sizeof(struct iwpm_remote_info), GFP_ATOMIC);
+ if (!rem_info) {
+ pr_err("%s: Unable to allocate a remote info\n", __func__);
+ ret = -ENOMEM;
+ return ret;
+ }
+ memcpy(&rem_info->mapped_loc_sockaddr, mapped_loc_sockaddr,
+ sizeof(struct sockaddr_storage));
+ memcpy(&rem_info->remote_sockaddr, remote_sockaddr,
+ sizeof(struct sockaddr_storage));
+ memcpy(&rem_info->mapped_rem_sockaddr, mapped_rem_sockaddr,
+ sizeof(struct sockaddr_storage));
+ rem_info->nl_client = nl_client;
+
+ iwpm_add_remote_info(rem_info);
+
+ iwpm_print_sockaddr(local_sockaddr,
+ "remote_info: Local sockaddr:");
+ iwpm_print_sockaddr(mapped_loc_sockaddr,
+ "remote_info: Mapped local sockaddr:");
+ iwpm_print_sockaddr(remote_sockaddr,
+ "remote_info: Remote sockaddr:");
+ iwpm_print_sockaddr(mapped_rem_sockaddr,
+ "remote_info: Mapped remote sockaddr:");
+ return ret;
+}
+EXPORT_SYMBOL(iwpm_remote_info_cb);
+
+/* netlink attribute policy for the received request for mapping info */
+static const struct nla_policy resp_mapinfo_policy[IWPM_NLA_MAPINFO_REQ_MAX] = {
+ [IWPM_NLA_MAPINFO_ULIB_NAME] = { .type = NLA_STRING,
+ .len = IWPM_ULIBNAME_SIZE - 1 },
+ [IWPM_NLA_MAPINFO_ULIB_VER] = { .type = NLA_U16 }
+};
+
+/*
+ * iwpm_mapping_info_cb - Process a port mapper request for mapping info
+ */
+int iwpm_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nlattr *nltb[IWPM_NLA_MAPINFO_REQ_MAX];
+ const char *msg_type = "Mapping Info response";
+ int iwpm_pid;
+ u8 nl_client;
+ char *iwpm_name;
+ u16 iwpm_version;
+ int ret = -EINVAL;
+
+ if (iwpm_parse_nlmsg(cb, IWPM_NLA_MAPINFO_REQ_MAX,
+ resp_mapinfo_policy, nltb, msg_type)) {
+ pr_info("%s: Unable to parse nlmsg\n", __func__);
+ return ret;
+ }
+ iwpm_name = (char *)nla_data(nltb[IWPM_NLA_MAPINFO_ULIB_NAME]);
+ iwpm_version = nla_get_u16(nltb[IWPM_NLA_MAPINFO_ULIB_VER]);
+ if (strcmp(iwpm_ulib_name, iwpm_name) ||
+ iwpm_version != iwpm_ulib_version) {
+ pr_info("%s: Invalid port mapper name = %s version = %d\n",
+ __func__, iwpm_name, iwpm_version);
+ return ret;
+ }
+ nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type);
+ if (!iwpm_valid_client(nl_client)) {
+ pr_info("%s: Invalid port mapper client = %d\n",
+ __func__, nl_client);
+ return ret;
+ }
+ iwpm_set_registered(nl_client, 0);
+ atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+ if (!iwpm_mapinfo_available())
+ return 0;
+ iwpm_pid = cb->nlh->nlmsg_pid;
+ pr_debug("%s: iWarp Port Mapper (pid = %d) is available!\n",
+ __func__, iwpm_pid);
+ ret = iwpm_send_mapinfo(nl_client, iwpm_pid);
+ return ret;
+}
+EXPORT_SYMBOL(iwpm_mapping_info_cb);
+
+/* netlink attribute policy for the received mapping info ack */
+static const struct nla_policy ack_mapinfo_policy[IWPM_NLA_MAPINFO_NUM_MAX] = {
+ [IWPM_NLA_MAPINFO_SEQ] = { .type = NLA_U32 },
+ [IWPM_NLA_MAPINFO_SEND_NUM] = { .type = NLA_U32 },
+ [IWPM_NLA_MAPINFO_ACK_NUM] = { .type = NLA_U32 }
+};
+
+/*
+ * iwpm_ack_mapping_info_cb - Process a port mapper ack for
+ * the provided mapping info records
+ */
+int iwpm_ack_mapping_info_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct nlattr *nltb[IWPM_NLA_MAPINFO_NUM_MAX];
+ u32 mapinfo_send, mapinfo_ack;
+ const char *msg_type = "Mapping Info Ack";
+
+ if (iwpm_parse_nlmsg(cb, IWPM_NLA_MAPINFO_NUM_MAX,
+ ack_mapinfo_policy, nltb, msg_type))
+ return -EINVAL;
+ mapinfo_send = nla_get_u32(nltb[IWPM_NLA_MAPINFO_SEND_NUM]);
+ mapinfo_ack = nla_get_u32(nltb[IWPM_NLA_MAPINFO_ACK_NUM]);
+ if (mapinfo_ack != mapinfo_send)
+ pr_info("%s: Invalid mapinfo number (sent = %u ack-ed = %u)\n",
+ __func__, mapinfo_send, mapinfo_ack);
+ atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+ return 0;
+}
+EXPORT_SYMBOL(iwpm_ack_mapping_info_cb);
+
+/* netlink attribute policy for the received port mapper error message */
+static const struct nla_policy map_error_policy[IWPM_NLA_ERR_MAX] = {
+ [IWPM_NLA_ERR_SEQ] = { .type = NLA_U32 },
+ [IWPM_NLA_ERR_CODE] = { .type = NLA_U16 },
+};
+
+/*
+ * iwpm_mapping_error_cb - Process a port mapper error message
+ */
+int iwpm_mapping_error_cb(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ int nl_client = RDMA_NL_GET_CLIENT(cb->nlh->nlmsg_type);
+ struct nlattr *nltb[IWPM_NLA_ERR_MAX];
+ u32 msg_seq;
+ u16 err_code;
+ const char *msg_type = "Mapping Error Msg";
+
+ if (iwpm_parse_nlmsg(cb, IWPM_NLA_ERR_MAX,
+ map_error_policy, nltb, msg_type))
+ return -EINVAL;
+
+ msg_seq = nla_get_u32(nltb[IWPM_NLA_ERR_SEQ]);
+ err_code = nla_get_u16(nltb[IWPM_NLA_ERR_CODE]);
+ pr_info("%s: Received msg seq = %u err code = %u client = %d\n",
+ __func__, msg_seq, err_code, nl_client);
+ /* look for nlmsg_request */
+ nlmsg_request = iwpm_find_nlmsg_request(msg_seq);
+ if (!nlmsg_request) {
+ /* not all errors have associated requests */
+ pr_debug("Could not find matching req (seq = %u)\n", msg_seq);
+ return 0;
+ }
+ atomic_set(&echo_nlmsg_seq, cb->nlh->nlmsg_seq);
+ nlmsg_request->err_code = err_code;
+ nlmsg_request->request_done = 1;
+ /* release the reference taken by iwpm_find_nlmsg_request() */
+ kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request);
+ barrier();
+ wake_up(&nlmsg_request->waitq);
+ return 0;
+}
+EXPORT_SYMBOL(iwpm_mapping_error_cb);
diff --git a/drivers/infiniband/core/iwpm_util.c b/drivers/infiniband/core/iwpm_util.c
new file mode 100644
index 000000000..a626795bf
--- /dev/null
+++ b/drivers/infiniband/core/iwpm_util.c
@@ -0,0 +1,749 @@
+/*
+ * Copyright (c) 2014 Chelsio, Inc. All rights reserved.
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "iwpm_util.h"
+
+#define IWPM_MAPINFO_HASH_SIZE 512
+#define IWPM_MAPINFO_HASH_MASK (IWPM_MAPINFO_HASH_SIZE - 1)
+#define IWPM_REMINFO_HASH_SIZE 64
+#define IWPM_REMINFO_HASH_MASK (IWPM_REMINFO_HASH_SIZE - 1)
+
+static LIST_HEAD(iwpm_nlmsg_req_list);
+static DEFINE_SPINLOCK(iwpm_nlmsg_req_lock);
+
+static struct hlist_head *iwpm_hash_bucket;
+static DEFINE_SPINLOCK(iwpm_mapinfo_lock);
+
+static struct hlist_head *iwpm_reminfo_bucket;
+static DEFINE_SPINLOCK(iwpm_reminfo_lock);
+
+static DEFINE_MUTEX(iwpm_admin_lock);
+static struct iwpm_admin_data iwpm_admin;
+
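+/*
+ * Mark the netlink client valid and, for the first client, allocate
+ * the mapinfo and remote-info hash tables shared by all clients.
+ */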
+int iwpm_init(u8 nl_client)
+{
+ int ret = 0;
+ if (iwpm_valid_client(nl_client))
+ return -EINVAL;
+ mutex_lock(&iwpm_admin_lock);
+ if (atomic_read(&iwpm_admin.refcount) == 0) {
+ iwpm_hash_bucket = kzalloc(IWPM_MAPINFO_HASH_SIZE *
+ sizeof(struct hlist_head), GFP_KERNEL);
+ if (!iwpm_hash_bucket) {
+ ret = -ENOMEM;
+ pr_err("%s Unable to create mapinfo hash table\n", __func__);
+ goto init_exit;
+ }
+ iwpm_reminfo_bucket = kzalloc(IWPM_REMINFO_HASH_SIZE *
+ sizeof(struct hlist_head), GFP_KERNEL);
+ if (!iwpm_reminfo_bucket) {
+ kfree(iwpm_hash_bucket);
+ ret = -ENOMEM;
+ pr_err("%s Unable to create reminfo hash table\n", __func__);
+ goto init_exit;
+ }
+ }
+ atomic_inc(&iwpm_admin.refcount);
+init_exit:
+ mutex_unlock(&iwpm_admin_lock);
+ if (!ret) {
+ iwpm_set_valid(nl_client, 1);
+ pr_debug("%s: Mapinfo and reminfo tables are created\n",
+ __func__);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(iwpm_init);
+
+static void free_hash_bucket(void);
+static void free_reminfo_bucket(void);
+
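+/*
+ * Mark the netlink client invalid and, when the last client exits,
+ * free the shared hash tables.
+ */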
+int iwpm_exit(u8 nl_client)
+{
+ if (!iwpm_valid_client(nl_client))
+ return -EINVAL;
+ mutex_lock(&iwpm_admin_lock);
+ if (atomic_read(&iwpm_admin.refcount) == 0) {
+ mutex_unlock(&iwpm_admin_lock);
+ pr_err("%s Incorrect usage - negative refcount\n", __func__);
+ return -EINVAL;
+ }
+ if (atomic_dec_and_test(&iwpm_admin.refcount)) {
+ free_hash_bucket();
+ free_reminfo_bucket();
+ pr_debug("%s: Resources are destroyed\n", __func__);
+ }
+ mutex_unlock(&iwpm_admin_lock);
+ iwpm_set_valid(nl_client, 0);
+ return 0;
+}
+EXPORT_SYMBOL(iwpm_exit);
+
+static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage *,
+ struct sockaddr_storage *);
+
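+/*
+ * Store a local/mapped sockaddr pair for nl_client in the mapinfo
+ * hash table so it can later be resent to the user space port mapper.
+ */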
+int iwpm_create_mapinfo(struct sockaddr_storage *local_sockaddr,
+ struct sockaddr_storage *mapped_sockaddr,
+ u8 nl_client)
+{
+ struct hlist_head *hash_bucket_head;
+ struct iwpm_mapping_info *map_info;
+ unsigned long flags;
+ int ret = -EINVAL;
+
+ if (!iwpm_valid_client(nl_client))
+ return ret;
+ map_info = kzalloc(sizeof(struct iwpm_mapping_info), GFP_KERNEL);
+ if (!map_info) {
+ pr_err("%s: Unable to allocate a mapping info\n", __func__);
+ return -ENOMEM;
+ }
+ memcpy(&map_info->local_sockaddr, local_sockaddr,
+ sizeof(struct sockaddr_storage));
+ memcpy(&map_info->mapped_sockaddr, mapped_sockaddr,
+ sizeof(struct sockaddr_storage));
+ map_info->nl_client = nl_client;
+
+ spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+ if (iwpm_hash_bucket) {
+ hash_bucket_head = get_mapinfo_hash_bucket(
+ &map_info->local_sockaddr,
+ &map_info->mapped_sockaddr);
+ if (hash_bucket_head) {
+ hlist_add_head(&map_info->hlist_node, hash_bucket_head);
+ ret = 0;
+ }
+ }
+ spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(iwpm_create_mapinfo);
+
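+/*
+ * Remove the mapinfo entry matching the given local/mapped sockaddr
+ * pair from the hash table, if present.
+ */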
+int iwpm_remove_mapinfo(struct sockaddr_storage *local_sockaddr,
+ struct sockaddr_storage *mapped_local_addr)
+{
+ struct hlist_node *tmp_hlist_node;
+ struct hlist_head *hash_bucket_head;
+ struct iwpm_mapping_info *map_info = NULL;
+ unsigned long flags;
+ int ret = -EINVAL;
+
+ spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+ if (iwpm_hash_bucket) {
+ hash_bucket_head = get_mapinfo_hash_bucket(
+ local_sockaddr,
+ mapped_local_addr);
+ if (!hash_bucket_head)
+ goto remove_mapinfo_exit;
+
+ hlist_for_each_entry_safe(map_info, tmp_hlist_node,
+ hash_bucket_head, hlist_node) {
+
+ if (!iwpm_compare_sockaddr(&map_info->mapped_sockaddr,
+ mapped_local_addr)) {
+
+ hlist_del_init(&map_info->hlist_node);
+ kfree(map_info);
+ ret = 0;
+ break;
+ }
+ }
+ }
+remove_mapinfo_exit:
+ spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(iwpm_remove_mapinfo);
+
+static void free_hash_bucket(void)
+{
+ struct hlist_node *tmp_hlist_node;
+ struct iwpm_mapping_info *map_info;
+ unsigned long flags;
+ int i;
+
+ /* remove all the mapinfo data from the list */
+ spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+ for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) {
+ hlist_for_each_entry_safe(map_info, tmp_hlist_node,
+ &iwpm_hash_bucket[i], hlist_node) {
+
+ hlist_del_init(&map_info->hlist_node);
+ kfree(map_info);
+ }
+ }
+ /* free the hash list */
+ kfree(iwpm_hash_bucket);
+ iwpm_hash_bucket = NULL;
+ spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
+}
+
+static void free_reminfo_bucket(void)
+{
+ struct hlist_node *tmp_hlist_node;
+ struct iwpm_remote_info *rem_info;
+ unsigned long flags;
+ int i;
+
+ /* remove all the remote info from the list */
+ spin_lock_irqsave(&iwpm_reminfo_lock, flags);
+ for (i = 0; i < IWPM_REMINFO_HASH_SIZE; i++) {
+ hlist_for_each_entry_safe(rem_info, tmp_hlist_node,
+ &iwpm_reminfo_bucket[i], hlist_node) {
+
+ hlist_del_init(&rem_info->hlist_node);
+ kfree(rem_info);
+ }
+ }
+ /* free the hash list */
+ kfree(iwpm_reminfo_bucket);
+ iwpm_reminfo_bucket = NULL;
+ spin_unlock_irqrestore(&iwpm_reminfo_lock, flags);
+}
+
+static struct hlist_head *get_reminfo_hash_bucket(struct sockaddr_storage *,
+ struct sockaddr_storage *);
+
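+/*
+ * Save the remote peer address info reported by the port mapper,
+ * hashed by its mapped local/remote sockaddr pair.
+ */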
+void iwpm_add_remote_info(struct iwpm_remote_info *rem_info)
+{
+ struct hlist_head *hash_bucket_head;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iwpm_reminfo_lock, flags);
+ if (iwpm_reminfo_bucket) {
+ hash_bucket_head = get_reminfo_hash_bucket(
+ &rem_info->mapped_loc_sockaddr,
+ &rem_info->mapped_rem_sockaddr);
+ if (hash_bucket_head)
+ hlist_add_head(&rem_info->hlist_node, hash_bucket_head);
+ }
+ spin_unlock_irqrestore(&iwpm_reminfo_lock, flags);
+}
+
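+/*
+ * Look up the remote peer address recorded for the given mapped
+ * local/remote sockaddr pair, copy it to remote_addr and remove the
+ * entry from the hash table.
+ */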
+int iwpm_get_remote_info(struct sockaddr_storage *mapped_loc_addr,
+ struct sockaddr_storage *mapped_rem_addr,
+ struct sockaddr_storage *remote_addr,
+ u8 nl_client)
+{
+ struct hlist_node *tmp_hlist_node;
+ struct hlist_head *hash_bucket_head;
+ struct iwpm_remote_info *rem_info = NULL;
+ unsigned long flags;
+ int ret = -EINVAL;
+
+ if (!iwpm_valid_client(nl_client)) {
+ pr_info("%s: Invalid client = %d\n", __func__, nl_client);
+ return ret;
+ }
+ spin_lock_irqsave(&iwpm_reminfo_lock, flags);
+ if (iwpm_reminfo_bucket) {
+ hash_bucket_head = get_reminfo_hash_bucket(
+ mapped_loc_addr,
+ mapped_rem_addr);
+ if (!hash_bucket_head)
+ goto get_remote_info_exit;
+ hlist_for_each_entry_safe(rem_info, tmp_hlist_node,
+ hash_bucket_head, hlist_node) {
+
+ if (!iwpm_compare_sockaddr(&rem_info->mapped_loc_sockaddr,
+ mapped_loc_addr) &&
+ !iwpm_compare_sockaddr(&rem_info->mapped_rem_sockaddr,
+ mapped_rem_addr)) {
+
+ memcpy(remote_addr, &rem_info->remote_sockaddr,
+ sizeof(struct sockaddr_storage));
+ iwpm_print_sockaddr(remote_addr,
+ "get_remote_info: Remote sockaddr:");
+
+ hlist_del_init(&rem_info->hlist_node);
+ kfree(rem_info);
+ ret = 0;
+ break;
+ }
+ }
+ }
+get_remote_info_exit:
+ spin_unlock_irqrestore(&iwpm_reminfo_lock, flags);
+ return ret;
+}
+EXPORT_SYMBOL(iwpm_get_remote_info);
+
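+/*
+ * Allocate and initialize a netlink request tracking structure and
+ * add it to the in-process request list.
+ */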
+struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq,
+ u8 nl_client, gfp_t gfp)
+{
+ struct iwpm_nlmsg_request *nlmsg_request = NULL;
+ unsigned long flags;
+
+ nlmsg_request = kzalloc(sizeof(struct iwpm_nlmsg_request), gfp);
+ if (!nlmsg_request) {
+ pr_err("%s Unable to allocate a nlmsg_request\n", __func__);
+ return NULL;
+ }
+ spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags);
+ list_add_tail(&nlmsg_request->inprocess_list, &iwpm_nlmsg_req_list);
+ spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags);
+
+ kref_init(&nlmsg_request->kref);
+ kref_get(&nlmsg_request->kref);
+ nlmsg_request->nlmsg_seq = nlmsg_seq;
+ nlmsg_request->nl_client = nl_client;
+ nlmsg_request->request_done = 0;
+ nlmsg_request->err_code = 0;
+ return nlmsg_request;
+}
+
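+/*
+ * kref release function: take the request off the in-process list
+ * and free it.
+ */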
+void iwpm_free_nlmsg_request(struct kref *kref)
+{
+ struct iwpm_nlmsg_request *nlmsg_request;
+ unsigned long flags;
+
+ nlmsg_request = container_of(kref, struct iwpm_nlmsg_request, kref);
+
+ spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags);
+ list_del_init(&nlmsg_request->inprocess_list);
+ spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags);
+
+ if (!nlmsg_request->request_done)
+ pr_debug("%s Freeing incomplete nlmsg request (seq = %u).\n",
+ __func__, nlmsg_request->nlmsg_seq);
+ kfree(nlmsg_request);
+}
+
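+/*
+ * Find the in-process request whose sequence number matches echo_seq;
+ * a reference is taken on the returned request (NULL if not found).
+ */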
+struct iwpm_nlmsg_request *iwpm_find_nlmsg_request(__u32 echo_seq)
+{
+ struct iwpm_nlmsg_request *nlmsg_request;
+ struct iwpm_nlmsg_request *found_request = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iwpm_nlmsg_req_lock, flags);
+ list_for_each_entry(nlmsg_request, &iwpm_nlmsg_req_list,
+ inprocess_list) {
+ if (nlmsg_request->nlmsg_seq == echo_seq) {
+ found_request = nlmsg_request;
+ kref_get(&nlmsg_request->kref);
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&iwpm_nlmsg_req_lock, flags);
+ return found_request;
+}
+
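+/*
+ * Wait, with a timeout, for the port mapper response to the request
+ * and return its error code (or -EINVAL on timeout).
+ */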
+int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request)
+{
+ int ret;
+ init_waitqueue_head(&nlmsg_request->waitq);
+
+ ret = wait_event_timeout(nlmsg_request->waitq,
+ (nlmsg_request->request_done != 0), IWPM_NL_TIMEOUT);
+ if (!ret) {
+ ret = -EINVAL;
+ pr_info("%s: Timeout %d sec for netlink request (seq = %u)\n",
+ __func__, (IWPM_NL_TIMEOUT/HZ), nlmsg_request->nlmsg_seq);
+ } else {
+ ret = nlmsg_request->err_code;
+ }
+ kref_put(&nlmsg_request->kref, iwpm_free_nlmsg_request);
+ return ret;
+}
+
+int iwpm_get_nlmsg_seq(void)
+{
+ return atomic_inc_return(&iwpm_admin.nlmsg_seq);
+}
+
+int iwpm_valid_client(u8 nl_client)
+{
+ if (nl_client >= RDMA_NL_NUM_CLIENTS)
+ return 0;
+ return iwpm_admin.client_list[nl_client];
+}
+
+void iwpm_set_valid(u8 nl_client, int valid)
+{
+ if (nl_client >= RDMA_NL_NUM_CLIENTS)
+ return;
+ iwpm_admin.client_list[nl_client] = valid;
+}
+
+/* the caller must pass a valid nl_client (no bounds check here) */
+int iwpm_registered_client(u8 nl_client)
+{
+ return iwpm_admin.reg_list[nl_client];
+}
+
+/* the caller must pass a valid nl_client (no bounds check here) */
+void iwpm_set_registered(u8 nl_client, int reg)
+{
+ iwpm_admin.reg_list[nl_client] = reg;
+}
+
+int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr,
+ struct sockaddr_storage *b_sockaddr)
+{
+ if (a_sockaddr->ss_family != b_sockaddr->ss_family)
+ return 1;
+ if (a_sockaddr->ss_family == AF_INET) {
+ struct sockaddr_in *a4_sockaddr =
+ (struct sockaddr_in *)a_sockaddr;
+ struct sockaddr_in *b4_sockaddr =
+ (struct sockaddr_in *)b_sockaddr;
+ if (!memcmp(&a4_sockaddr->sin_addr,
+ &b4_sockaddr->sin_addr, sizeof(struct in_addr))
+ && a4_sockaddr->sin_port == b4_sockaddr->sin_port)
+ return 0;
+
+ } else if (a_sockaddr->ss_family == AF_INET6) {
+ struct sockaddr_in6 *a6_sockaddr =
+ (struct sockaddr_in6 *)a_sockaddr;
+ struct sockaddr_in6 *b6_sockaddr =
+ (struct sockaddr_in6 *)b_sockaddr;
+ if (!memcmp(&a6_sockaddr->sin6_addr,
+ &b6_sockaddr->sin6_addr, sizeof(struct in6_addr))
+ && a6_sockaddr->sin6_port == b6_sockaddr->sin6_port)
+ return 0;
+
+ } else {
+ pr_err("%s: Invalid sockaddr family\n", __func__);
+ }
+ return 1;
+}
+
+struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh,
+ int nl_client)
+{
+ struct sk_buff *skb = NULL;
+
+ skb = dev_alloc_skb(NLMSG_GOODSIZE);
+ if (!skb) {
+ pr_err("%s Unable to allocate skb\n", __func__);
+ goto create_nlmsg_exit;
+ }
+ if (!(ibnl_put_msg(skb, nlh, 0, 0, nl_client, nl_op,
+ NLM_F_REQUEST))) {
+ pr_warn("%s: Unable to put the nlmsg header\n", __func__);
+ dev_kfree_skb(skb);
+ skb = NULL;
+ }
+create_nlmsg_exit:
+ return skb;
+}
+
+int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max,
+ const struct nla_policy *nlmsg_policy,
+ struct nlattr *nltb[], const char *msg_type)
+{
+ int nlh_len = 0;
+ int ret;
+ const char *err_str = "";
+
+ ret = nlmsg_validate(cb->nlh, nlh_len, policy_max-1, nlmsg_policy);
+ if (ret) {
+ err_str = "Invalid attribute";
+ goto parse_nlmsg_error;
+ }
+ ret = nlmsg_parse(cb->nlh, nlh_len, nltb, policy_max-1, nlmsg_policy);
+ if (ret) {
+ err_str = "Unable to parse the nlmsg";
+ goto parse_nlmsg_error;
+ }
+ ret = iwpm_validate_nlmsg_attr(nltb, policy_max);
+ if (ret) {
+ err_str = "Invalid NULL attribute";
+ goto parse_nlmsg_error;
+ }
+ return 0;
+parse_nlmsg_error:
+ pr_warn("%s: %s (msg type %s ret = %d)\n",
+ __func__, err_str, msg_type, ret);
+ return ret;
+}
+
+void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg)
+{
+ struct sockaddr_in6 *sockaddr_v6;
+ struct sockaddr_in *sockaddr_v4;
+
+ switch (sockaddr->ss_family) {
+ case AF_INET:
+ sockaddr_v4 = (struct sockaddr_in *)sockaddr;
+ pr_debug("%s IPV4 %pI4: %u(0x%04X)\n",
+ msg, &sockaddr_v4->sin_addr,
+ ntohs(sockaddr_v4->sin_port),
+ ntohs(sockaddr_v4->sin_port));
+ break;
+ case AF_INET6:
+ sockaddr_v6 = (struct sockaddr_in6 *)sockaddr;
+ pr_debug("%s IPV6 %pI6: %u(0x%04X)\n",
+ msg, &sockaddr_v6->sin6_addr,
+ ntohs(sockaddr_v6->sin6_port),
+ ntohs(sockaddr_v6->sin6_port));
+ break;
+ default:
+ break;
+ }
+}
+
+static u32 iwpm_ipv6_jhash(struct sockaddr_in6 *ipv6_sockaddr)
+{
+ u32 ipv6_hash = jhash(&ipv6_sockaddr->sin6_addr, sizeof(struct in6_addr), 0);
+ u32 hash = jhash_2words(ipv6_hash, (__force u32) ipv6_sockaddr->sin6_port, 0);
+ return hash;
+}
+
+static u32 iwpm_ipv4_jhash(struct sockaddr_in *ipv4_sockaddr)
+{
+ u32 ipv4_hash = jhash(&ipv4_sockaddr->sin_addr, sizeof(struct in_addr), 0);
+ u32 hash = jhash_2words(ipv4_hash, (__force u32) ipv4_sockaddr->sin_port, 0);
+ return hash;
+}
+
+static int get_hash_bucket(struct sockaddr_storage *a_sockaddr,
+ struct sockaddr_storage *b_sockaddr, u32 *hash)
+{
+ u32 a_hash, b_hash;
+
+ if (a_sockaddr->ss_family == AF_INET) {
+ a_hash = iwpm_ipv4_jhash((struct sockaddr_in *) a_sockaddr);
+ b_hash = iwpm_ipv4_jhash((struct sockaddr_in *) b_sockaddr);
+
+ } else if (a_sockaddr->ss_family == AF_INET6) {
+ a_hash = iwpm_ipv6_jhash((struct sockaddr_in6 *) a_sockaddr);
+ b_hash = iwpm_ipv6_jhash((struct sockaddr_in6 *) b_sockaddr);
+ } else {
+ pr_err("%s: Invalid sockaddr family\n", __func__);
+ return -EINVAL;
+ }
+
+	if (a_hash == b_hash) /* e.g. the port mapper isn't available, so both addresses hash the same */
+ *hash = a_hash;
+ else
+ *hash = jhash_2words(a_hash, b_hash, 0);
+ return 0;
+}
+
+static struct hlist_head *get_mapinfo_hash_bucket(struct sockaddr_storage
+ *local_sockaddr, struct sockaddr_storage
+ *mapped_sockaddr)
+{
+ u32 hash;
+ int ret;
+
+ ret = get_hash_bucket(local_sockaddr, mapped_sockaddr, &hash);
+ if (ret)
+ return NULL;
+ return &iwpm_hash_bucket[hash & IWPM_MAPINFO_HASH_MASK];
+}
+
+static struct hlist_head *get_reminfo_hash_bucket(struct sockaddr_storage
+ *mapped_loc_sockaddr, struct sockaddr_storage
+ *mapped_rem_sockaddr)
+{
+ u32 hash;
+ int ret;
+
+ ret = get_hash_bucket(mapped_loc_sockaddr, mapped_rem_sockaddr, &hash);
+ if (ret)
+ return NULL;
+ return &iwpm_reminfo_bucket[hash & IWPM_REMINFO_HASH_MASK];
+}
+
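+/* Report the number of mapping records that were sent to the userspace port mapper */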
+static int send_mapinfo_num(u32 mapping_num, u8 nl_client, int iwpm_pid)
+{
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ u32 msg_seq;
+ const char *err_str = "";
+ int ret = -EINVAL;
+
+ skb = iwpm_create_nlmsg(RDMA_NL_IWPM_MAPINFO_NUM, &nlh, nl_client);
+ if (!skb) {
+ err_str = "Unable to create a nlmsg";
+ goto mapinfo_num_error;
+ }
+ nlh->nlmsg_seq = iwpm_get_nlmsg_seq();
+ msg_seq = 0;
+ err_str = "Unable to put attribute of mapinfo number nlmsg";
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32), &msg_seq, IWPM_NLA_MAPINFO_SEQ);
+ if (ret)
+ goto mapinfo_num_error;
+ ret = ibnl_put_attr(skb, nlh, sizeof(u32),
+ &mapping_num, IWPM_NLA_MAPINFO_SEND_NUM);
+ if (ret)
+ goto mapinfo_num_error;
+ ret = ibnl_unicast(skb, nlh, iwpm_pid);
+ if (ret) {
+ skb = NULL;
+ err_str = "Unable to send a nlmsg";
+ goto mapinfo_num_error;
+ }
+ pr_debug("%s: Sent mapping number = %d\n", __func__, mapping_num);
+ return 0;
+mapinfo_num_error:
+ pr_info("%s: %s\n", __func__, err_str);
+ if (skb)
+ dev_kfree_skb(skb);
+ return ret;
+}
+
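+/* Append an NLMSG_DONE header to the skb and unicast it to the port mapper */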
+static int send_nlmsg_done(struct sk_buff *skb, u8 nl_client, int iwpm_pid)
+{
+ struct nlmsghdr *nlh = NULL;
+ int ret = 0;
+
+ if (!skb)
+ return ret;
+ if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client,
+ RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) {
+ pr_warn("%s Unable to put NLMSG_DONE\n", __func__);
+ return -ENOMEM;
+ }
+ nlh->nlmsg_type = NLMSG_DONE;
+ ret = ibnl_unicast(skb, (struct nlmsghdr *)skb->data, iwpm_pid);
+ if (ret)
+ pr_warn("%s Unable to send a nlmsg\n", __func__);
+ return ret;
+}
+
+int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid)
+{
+ struct iwpm_mapping_info *map_info;
+ struct sk_buff *skb = NULL;
+ struct nlmsghdr *nlh;
+ int skb_num = 0, mapping_num = 0;
+ int i = 0, nlmsg_bytes = 0;
+ unsigned long flags;
+ const char *err_str = "";
+	int ret = 0;
+
+ skb = dev_alloc_skb(NLMSG_GOODSIZE);
+ if (!skb) {
+ ret = -ENOMEM;
+ err_str = "Unable to allocate skb";
+ goto send_mapping_info_exit;
+ }
+ skb_num++;
+ spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+ for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) {
+ hlist_for_each_entry(map_info, &iwpm_hash_bucket[i],
+ hlist_node) {
+ if (map_info->nl_client != nl_client)
+ continue;
+ nlh = NULL;
+ if (!(ibnl_put_msg(skb, &nlh, 0, 0, nl_client,
+ RDMA_NL_IWPM_MAPINFO, NLM_F_MULTI))) {
+ ret = -ENOMEM;
+ err_str = "Unable to put the nlmsg header";
+ goto send_mapping_info_unlock;
+ }
+ err_str = "Unable to put attribute of the nlmsg";
+ ret = ibnl_put_attr(skb, nlh,
+ sizeof(struct sockaddr_storage),
+ &map_info->local_sockaddr,
+ IWPM_NLA_MAPINFO_LOCAL_ADDR);
+ if (ret)
+ goto send_mapping_info_unlock;
+
+ ret = ibnl_put_attr(skb, nlh,
+ sizeof(struct sockaddr_storage),
+ &map_info->mapped_sockaddr,
+ IWPM_NLA_MAPINFO_MAPPED_ADDR);
+ if (ret)
+ goto send_mapping_info_unlock;
+
+ iwpm_print_sockaddr(&map_info->local_sockaddr,
+ "send_mapping_info: Local sockaddr:");
+ iwpm_print_sockaddr(&map_info->mapped_sockaddr,
+ "send_mapping_info: Mapped local sockaddr:");
+ mapping_num++;
+ nlmsg_bytes += nlh->nlmsg_len;
+
+ /* check if all mappings can fit in one skb */
+ if (NLMSG_GOODSIZE - nlmsg_bytes < nlh->nlmsg_len * 2) {
+ /* and leave room for NLMSG_DONE */
+ nlmsg_bytes = 0;
+ skb_num++;
+ spin_unlock_irqrestore(&iwpm_mapinfo_lock,
+ flags);
+ /* send the skb */
+ ret = send_nlmsg_done(skb, nl_client, iwpm_pid);
+ skb = NULL;
+ if (ret) {
+ err_str = "Unable to send map info";
+ goto send_mapping_info_exit;
+ }
+ if (skb_num == IWPM_MAPINFO_SKB_COUNT) {
+ ret = -ENOMEM;
+ err_str = "Insufficient skbs for map info";
+ goto send_mapping_info_exit;
+ }
+ skb = dev_alloc_skb(NLMSG_GOODSIZE);
+ if (!skb) {
+ ret = -ENOMEM;
+ err_str = "Unable to allocate skb";
+ goto send_mapping_info_exit;
+ }
+ spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+ }
+ }
+ }
+send_mapping_info_unlock:
+ spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
+send_mapping_info_exit:
+ if (ret) {
+ pr_warn("%s: %s (ret = %d)\n", __func__, err_str, ret);
+ if (skb)
+ dev_kfree_skb(skb);
+ return ret;
+ }
+ send_nlmsg_done(skb, nl_client, iwpm_pid);
+ return send_mapinfo_num(mapping_num, nl_client, iwpm_pid);
+}
+
+int iwpm_mapinfo_available(void)
+{
+ unsigned long flags;
+ int full_bucket = 0, i = 0;
+
+ spin_lock_irqsave(&iwpm_mapinfo_lock, flags);
+ if (iwpm_hash_bucket) {
+ for (i = 0; i < IWPM_MAPINFO_HASH_SIZE; i++) {
+ if (!hlist_empty(&iwpm_hash_bucket[i])) {
+ full_bucket = 1;
+ break;
+ }
+ }
+ }
+ spin_unlock_irqrestore(&iwpm_mapinfo_lock, flags);
+ return full_bucket;
+}
diff --git a/drivers/infiniband/core/iwpm_util.h b/drivers/infiniband/core/iwpm_util.h
new file mode 100644
index 000000000..ee2d9ff09
--- /dev/null
+++ b/drivers/infiniband/core/iwpm_util.h
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2014 Intel Corporation. All rights reserved.
+ * Copyright (c) 2014 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _IWPM_UTIL_H
+#define _IWPM_UTIL_H
+
+#include <linux/module.h>
+#include <linux/io.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/netdevice.h>
+#include <linux/delay.h>
+#include <linux/workqueue.h>
+#include <linux/mutex.h>
+#include <linux/jhash.h>
+#include <linux/kref.h>
+#include <net/netlink.h>
+#include <linux/errno.h>
+#include <rdma/iw_portmap.h>
+#include <rdma/rdma_netlink.h>
+
+
+#define IWPM_NL_RETRANS 3
+#define IWPM_NL_TIMEOUT (10*HZ)
+#define IWPM_MAPINFO_SKB_COUNT 20
+
+#define IWPM_PID_UNDEFINED -1
+#define IWPM_PID_UNAVAILABLE -2
+
+struct iwpm_nlmsg_request {
+ struct list_head inprocess_list;
+ __u32 nlmsg_seq;
+ void *req_buffer;
+ u8 nl_client;
+ u8 request_done;
+ u16 err_code;
+ wait_queue_head_t waitq;
+ struct kref kref;
+};
+
+struct iwpm_mapping_info {
+ struct hlist_node hlist_node;
+ struct sockaddr_storage local_sockaddr;
+ struct sockaddr_storage mapped_sockaddr;
+ u8 nl_client;
+};
+
+struct iwpm_remote_info {
+ struct hlist_node hlist_node;
+ struct sockaddr_storage remote_sockaddr;
+ struct sockaddr_storage mapped_loc_sockaddr;
+ struct sockaddr_storage mapped_rem_sockaddr;
+ u8 nl_client;
+};
+
+struct iwpm_admin_data {
+ atomic_t refcount;
+ atomic_t nlmsg_seq;
+ int client_list[RDMA_NL_NUM_CLIENTS];
+ int reg_list[RDMA_NL_NUM_CLIENTS];
+};
+
+/**
+ * iwpm_get_nlmsg_request - Allocate and initialize netlink message request
+ * @nlmsg_seq: Sequence number of the netlink message
+ * @nl_client: The index of the netlink client
+ * @gfp: Indicates how the memory for the request should be allocated
+ *
+ * Returns the newly allocated netlink request object if successful,
+ * otherwise returns NULL
+ */
+struct iwpm_nlmsg_request *iwpm_get_nlmsg_request(__u32 nlmsg_seq,
+ u8 nl_client, gfp_t gfp);
+
+/**
+ * iwpm_free_nlmsg_request - Deallocate netlink message request
+ * @kref: Holds reference of netlink message request
+ */
+void iwpm_free_nlmsg_request(struct kref *kref);
+
+/**
+ * iwpm_find_nlmsg_request - Find netlink message request in the request list
+ * @echo_seq: Sequence number of the netlink request to find
+ *
+ * Returns the found netlink message request, or NULL
+ * if no request with that sequence number is found
+ */
+struct iwpm_nlmsg_request *iwpm_find_nlmsg_request(__u32 echo_seq);
+
+/**
+ * iwpm_wait_complete_req - Block until the netlink request is serviced
+ * @nlmsg_request: Netlink message request to wait on
+ *
+ * Wakes up after the request is completed or times out
+ * Returns 0 if the request completed without error
+ */
+int iwpm_wait_complete_req(struct iwpm_nlmsg_request *nlmsg_request);
+
+/**
+ * iwpm_get_nlmsg_seq - Get the sequence number for a netlink
+ * message to send to the port mapper
+ *
+ * Returns the sequence number for the netlink message.
+ */
+int iwpm_get_nlmsg_seq(void);
+
+/**
+ * iwpm_add_remote_info - Add remote address info of the connecting peer
+ * to the remote info hash table
+ * @reminfo: The remote info to be added
+ */
+void iwpm_add_remote_info(struct iwpm_remote_info *reminfo);
+
+/**
+ * iwpm_valid_client - Check if the port mapper client is valid
+ * @nl_client: The index of the netlink client
+ *
+ * Valid clients need to call iwpm_init() before using
+ * the port mapper
+ */
+int iwpm_valid_client(u8 nl_client);
+
+/**
+ * iwpm_set_valid - Set the port mapper client to valid or not
+ * @nl_client: The index of the netlink client
+ * @valid: 1 if valid or 0 if invalid
+ */
+void iwpm_set_valid(u8 nl_client, int valid);
+
+/**
+ * iwpm_registered_client - Check if the port mapper client is registered
+ * @nl_client: The index of the netlink client
+ *
+ * Call iwpm_register_pid() to register a client
+ */
+int iwpm_registered_client(u8 nl_client);
+
+/**
+ * iwpm_set_registered - Set the port mapper client to registered or not
+ * @nl_client: The index of the netlink client
+ * @reg: 1 if registered or 0 if not
+ */
+void iwpm_set_registered(u8 nl_client, int reg);
+
+/**
+ * iwpm_send_mapinfo - Send local and mapped IPv4/IPv6 address info of
+ * a client to the user space port mapper
+ * @nl_client: The index of the netlink client
+ * @iwpm_pid: The pid of the user space port mapper
+ *
+ * Returns 0 on success, otherwise returns a negative error code
+ */
+int iwpm_send_mapinfo(u8 nl_client, int iwpm_pid);
+
+/**
+ * iwpm_mapinfo_available - Check if any mapping info records are available
+ * in the hash table
+ *
+ * Returns 1 if mapping information is available, otherwise returns 0
+ */
+int iwpm_mapinfo_available(void);
+
+/**
+ * iwpm_compare_sockaddr - Compare two sockaddr storage structs
+ *
+ * Returns 0 if they are holding the same IP/TCP address info,
+ * otherwise returns 1
+ */
+int iwpm_compare_sockaddr(struct sockaddr_storage *a_sockaddr,
+ struct sockaddr_storage *b_sockaddr);
+
+/**
+ * iwpm_validate_nlmsg_attr - Check for NULL netlink attributes
+ * @nltb: Holds address of each netlink message attributes
+ * @nla_count: Number of netlink message attributes
+ *
+ * Returns error if any of the nla_count attributes is NULL
+ */
+static inline int iwpm_validate_nlmsg_attr(struct nlattr *nltb[],
+ int nla_count)
+{
+ int i;
+ for (i = 1; i < nla_count; i++) {
+ if (!nltb[i])
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/**
+ * iwpm_create_nlmsg - Allocate skb and form a netlink message
+ * @nl_op: Netlink message opcode
+ * @nlh: Holds address of the netlink message header in skb
+ * @nl_client: The index of the netlink client
+ *
+ * Returns the newly allocated skb, or NULL if the allocation fails or
+ * the skb tailroom is insufficient to store the message header
+ */
+struct sk_buff *iwpm_create_nlmsg(u32 nl_op, struct nlmsghdr **nlh,
+ int nl_client);
+
+/**
+ * iwpm_parse_nlmsg - Validate and parse the received netlink message
+ * @cb: Netlink callback structure
+ * @policy_max: Maximum attribute type to be expected
+ * @nlmsg_policy: Validation policy
+ * @nltb: Array to store policy_max parsed elements
+ * @msg_type: Type of netlink message
+ *
+ * Returns 0 on success or a negative error code
+ */
+int iwpm_parse_nlmsg(struct netlink_callback *cb, int policy_max,
+ const struct nla_policy *nlmsg_policy,
+ struct nlattr *nltb[], const char *msg_type);
+
+/**
+ * iwpm_print_sockaddr - Print IPv4/IPv6 address and TCP port
+ * @sockaddr: Socket address to print
+ * @msg: Message to print
+ */
+void iwpm_print_sockaddr(struct sockaddr_storage *sockaddr, char *msg);
+#endif
diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c
new file mode 100644
index 000000000..74c30f4c5
--- /dev/null
+++ b/drivers/infiniband/core/mad.c
@@ -0,0 +1,3176 @@
+/*
+ * Copyright (c) 2004-2007 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2009 HNR Consulting. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <rdma/ib_cache.h>
+
+#include "mad_priv.h"
+#include "mad_rmpp.h"
+#include "smi.h"
+#include "agent.h"
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_DESCRIPTION("kernel IB MAD API");
+MODULE_AUTHOR("Hal Rosenstock");
+MODULE_AUTHOR("Sean Hefty");
+
+static int mad_sendq_size = IB_MAD_QP_SEND_SIZE;
+static int mad_recvq_size = IB_MAD_QP_RECV_SIZE;
+
+module_param_named(send_queue_size, mad_sendq_size, int, 0444);
+MODULE_PARM_DESC(send_queue_size, "Size of send queue in number of work requests");
+module_param_named(recv_queue_size, mad_recvq_size, int, 0444);
+MODULE_PARM_DESC(recv_queue_size, "Size of receive queue in number of work requests");
+
+static struct kmem_cache *ib_mad_cache;
+
+static struct list_head ib_mad_port_list;
+static u32 ib_mad_client_id = 0;
+
+/* Port list lock */
+static DEFINE_SPINLOCK(ib_mad_port_list_lock);
+
+/* Forward declarations */
+static int method_in_use(struct ib_mad_mgmt_method_table **method,
+ struct ib_mad_reg_req *mad_reg_req);
+static void remove_mad_reg_req(struct ib_mad_agent_private *priv);
+static struct ib_mad_agent_private *find_mad_agent(
+ struct ib_mad_port_private *port_priv,
+ struct ib_mad *mad);
+static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_private *mad);
+static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv);
+static void timeout_sends(struct work_struct *work);
+static void local_completions(struct work_struct *work);
+static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+ struct ib_mad_agent_private *agent_priv,
+ u8 mgmt_class);
+static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+ struct ib_mad_agent_private *agent_priv);
+
+/*
+ * Returns an ib_mad_port_private structure or NULL for a device/port.
+ * The caller must hold ib_mad_port_list_lock.
+ */
+static inline struct ib_mad_port_private *
+__ib_get_mad_port(struct ib_device *device, int port_num)
+{
+ struct ib_mad_port_private *entry;
+
+ list_for_each_entry(entry, &ib_mad_port_list, port_list) {
+ if (entry->device == device && entry->port_num == port_num)
+ return entry;
+ }
+ return NULL;
+}
+
+/*
+ * Wrapper function to return an ib_mad_port_private structure or NULL
+ * for a device/port
+ */
+static inline struct ib_mad_port_private *
+ib_get_mad_port(struct ib_device *device, int port_num)
+{
+ struct ib_mad_port_private *entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+ entry = __ib_get_mad_port(device, port_num);
+ spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+ return entry;
+}
+
+static inline u8 convert_mgmt_class(u8 mgmt_class)
+{
+ /* Alias IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE to 0 */
+ return mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE ?
+ 0 : mgmt_class;
+}
+
+static int get_spl_qp_index(enum ib_qp_type qp_type)
+{
+ switch (qp_type)
+ {
+ case IB_QPT_SMI:
+ return 0;
+ case IB_QPT_GSI:
+ return 1;
+ default:
+ return -1;
+ }
+}
+
+static int vendor_class_index(u8 mgmt_class)
+{
+ return mgmt_class - IB_MGMT_CLASS_VENDOR_RANGE2_START;
+}
+
+static int is_vendor_class(u8 mgmt_class)
+{
+ if ((mgmt_class < IB_MGMT_CLASS_VENDOR_RANGE2_START) ||
+ (mgmt_class > IB_MGMT_CLASS_VENDOR_RANGE2_END))
+ return 0;
+ return 1;
+}
+
+static int is_vendor_oui(char *oui)
+{
+ if (oui[0] || oui[1] || oui[2])
+ return 1;
+ return 0;
+}
+
+static int is_vendor_method_in_use(
+ struct ib_mad_mgmt_vendor_class *vendor_class,
+ struct ib_mad_reg_req *mad_reg_req)
+{
+ struct ib_mad_mgmt_method_table *method;
+ int i;
+
+ for (i = 0; i < MAX_MGMT_OUI; i++) {
+ if (!memcmp(vendor_class->oui[i], mad_reg_req->oui, 3)) {
+ method = vendor_class->method_table[i];
+ if (method) {
+ if (method_in_use(&method, mad_reg_req))
+ return 1;
+ else
+ break;
+ }
+ }
+ }
+ return 0;
+}
+
+int ib_response_mad(struct ib_mad *mad)
+{
+ return ((mad->mad_hdr.method & IB_MGMT_METHOD_RESP) ||
+ (mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS) ||
+ ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_BM) &&
+ (mad->mad_hdr.attr_mod & IB_BM_ATTR_MOD_RESP)));
+}
+EXPORT_SYMBOL(ib_response_mad);
+
+/*
+ * ib_register_mad_agent - Register to send/receive MADs
+ */
+struct ib_mad_agent *ib_register_mad_agent(struct ib_device *device,
+ u8 port_num,
+ enum ib_qp_type qp_type,
+ struct ib_mad_reg_req *mad_reg_req,
+ u8 rmpp_version,
+ ib_mad_send_handler send_handler,
+ ib_mad_recv_handler recv_handler,
+ void *context,
+ u32 registration_flags)
+{
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_agent *ret = ERR_PTR(-EINVAL);
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_reg_req *reg_req = NULL;
+ struct ib_mad_mgmt_class_table *class;
+ struct ib_mad_mgmt_vendor_class_table *vendor;
+ struct ib_mad_mgmt_vendor_class *vendor_class;
+ struct ib_mad_mgmt_method_table *method;
+ int ret2, qpn;
+ unsigned long flags;
+ u8 mgmt_class, vclass;
+
+ /* Validate parameters */
+ qpn = get_spl_qp_index(qp_type);
+ if (qpn == -1) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: invalid QP Type %d\n",
+ qp_type);
+ goto error1;
+ }
+
+ if (rmpp_version && rmpp_version != IB_MGMT_RMPP_VERSION) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: invalid RMPP Version %u\n",
+ rmpp_version);
+ goto error1;
+ }
+
+ /* Validate MAD registration request if supplied */
+ if (mad_reg_req) {
+ if (mad_reg_req->mgmt_class_version >= MAX_MGMT_VERSION) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: invalid Class Version %u\n",
+ mad_reg_req->mgmt_class_version);
+ goto error1;
+ }
+ if (!recv_handler) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: no recv_handler\n");
+ goto error1;
+ }
+ if (mad_reg_req->mgmt_class >= MAX_MGMT_CLASS) {
+ /*
+ * IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE is the only
+ * one in this range currently allowed
+ */
+ if (mad_reg_req->mgmt_class !=
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: Invalid Mgmt Class 0x%x\n",
+ mad_reg_req->mgmt_class);
+ goto error1;
+ }
+ } else if (mad_reg_req->mgmt_class == 0) {
+ /*
+ * Class 0 is reserved in IBA and is used for
+ * aliasing of IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+ */
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: Invalid Mgmt Class 0\n");
+ goto error1;
+ } else if (is_vendor_class(mad_reg_req->mgmt_class)) {
+ /*
+ * If class is in "new" vendor range,
+ * ensure supplied OUI is not zero
+ */
+ if (!is_vendor_oui(mad_reg_req->oui)) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: No OUI specified for class 0x%x\n",
+ mad_reg_req->mgmt_class);
+ goto error1;
+ }
+ }
+ /* Make sure class supplied is consistent with RMPP */
+ if (!ib_is_mad_class_rmpp(mad_reg_req->mgmt_class)) {
+ if (rmpp_version) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: RMPP version for non-RMPP class 0x%x\n",
+ mad_reg_req->mgmt_class);
+ goto error1;
+ }
+ }
+
+ /* Make sure class supplied is consistent with QP type */
+ if (qp_type == IB_QPT_SMI) {
+ if ((mad_reg_req->mgmt_class !=
+ IB_MGMT_CLASS_SUBN_LID_ROUTED) &&
+ (mad_reg_req->mgmt_class !=
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: Invalid SM QP type: class 0x%x\n",
+ mad_reg_req->mgmt_class);
+ goto error1;
+ }
+ } else {
+ if ((mad_reg_req->mgmt_class ==
+ IB_MGMT_CLASS_SUBN_LID_ROUTED) ||
+ (mad_reg_req->mgmt_class ==
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: Invalid GS QP type: class 0x%x\n",
+ mad_reg_req->mgmt_class);
+ goto error1;
+ }
+ }
+ } else {
+ /* No registration request supplied */
+ if (!send_handler)
+ goto error1;
+ if (registration_flags & IB_MAD_USER_RMPP)
+ goto error1;
+ }
+
+ /* Validate device and port */
+ port_priv = ib_get_mad_port(device, port_num);
+ if (!port_priv) {
+ dev_notice(&device->dev, "ib_register_mad_agent: Invalid port\n");
+ ret = ERR_PTR(-ENODEV);
+ goto error1;
+ }
+
+ /* Verify the QP requested is supported. For example, Ethernet devices
+ * will not have QP0 */
+ if (!port_priv->qp_info[qpn].qp) {
+ dev_notice(&device->dev,
+ "ib_register_mad_agent: QP %d not supported\n", qpn);
+ ret = ERR_PTR(-EPROTONOSUPPORT);
+ goto error1;
+ }
+
+ /* Allocate structures */
+ mad_agent_priv = kzalloc(sizeof *mad_agent_priv, GFP_KERNEL);
+ if (!mad_agent_priv) {
+ ret = ERR_PTR(-ENOMEM);
+ goto error1;
+ }
+
+ mad_agent_priv->agent.mr = ib_get_dma_mr(port_priv->qp_info[qpn].qp->pd,
+ IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(mad_agent_priv->agent.mr)) {
+ ret = ERR_PTR(-ENOMEM);
+ goto error2;
+ }
+
+ if (mad_reg_req) {
+ reg_req = kmemdup(mad_reg_req, sizeof *reg_req, GFP_KERNEL);
+ if (!reg_req) {
+ ret = ERR_PTR(-ENOMEM);
+ goto error3;
+ }
+ }
+
+ /* Now, fill in the various structures */
+ mad_agent_priv->qp_info = &port_priv->qp_info[qpn];
+ mad_agent_priv->reg_req = reg_req;
+ mad_agent_priv->agent.rmpp_version = rmpp_version;
+ mad_agent_priv->agent.device = device;
+ mad_agent_priv->agent.recv_handler = recv_handler;
+ mad_agent_priv->agent.send_handler = send_handler;
+ mad_agent_priv->agent.context = context;
+ mad_agent_priv->agent.qp = port_priv->qp_info[qpn].qp;
+ mad_agent_priv->agent.port_num = port_num;
+ mad_agent_priv->agent.flags = registration_flags;
+ spin_lock_init(&mad_agent_priv->lock);
+ INIT_LIST_HEAD(&mad_agent_priv->send_list);
+ INIT_LIST_HEAD(&mad_agent_priv->wait_list);
+ INIT_LIST_HEAD(&mad_agent_priv->done_list);
+ INIT_LIST_HEAD(&mad_agent_priv->rmpp_list);
+ INIT_DELAYED_WORK(&mad_agent_priv->timed_work, timeout_sends);
+ INIT_LIST_HEAD(&mad_agent_priv->local_list);
+ INIT_WORK(&mad_agent_priv->local_work, local_completions);
+ atomic_set(&mad_agent_priv->refcount, 1);
+ init_completion(&mad_agent_priv->comp);
+
+ spin_lock_irqsave(&port_priv->reg_lock, flags);
+ mad_agent_priv->agent.hi_tid = ++ib_mad_client_id;
+
+ /*
+ * Make sure MAD registration (if supplied)
+ * is non overlapping with any existing ones
+ */
+ if (mad_reg_req) {
+ mgmt_class = convert_mgmt_class(mad_reg_req->mgmt_class);
+ if (!is_vendor_class(mgmt_class)) {
+ class = port_priv->version[mad_reg_req->
+ mgmt_class_version].class;
+ if (class) {
+ method = class->method_table[mgmt_class];
+ if (method) {
+ if (method_in_use(&method,
+ mad_reg_req))
+ goto error4;
+ }
+ }
+ ret2 = add_nonoui_reg_req(mad_reg_req, mad_agent_priv,
+ mgmt_class);
+ } else {
+ /* "New" vendor class range */
+ vendor = port_priv->version[mad_reg_req->
+ mgmt_class_version].vendor;
+ if (vendor) {
+ vclass = vendor_class_index(mgmt_class);
+ vendor_class = vendor->vendor_class[vclass];
+ if (vendor_class) {
+ if (is_vendor_method_in_use(
+ vendor_class,
+ mad_reg_req))
+ goto error4;
+ }
+ }
+ ret2 = add_oui_reg_req(mad_reg_req, mad_agent_priv);
+ }
+ if (ret2) {
+ ret = ERR_PTR(ret2);
+ goto error4;
+ }
+ }
+
+ /* Add mad agent into port's agent list */
+ list_add_tail(&mad_agent_priv->agent_list, &port_priv->agent_list);
+ spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+
+ return &mad_agent_priv->agent;
+
+error4:
+ spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+ kfree(reg_req);
+error3:
+ ib_dereg_mr(mad_agent_priv->agent.mr);
+error2:
+ kfree(mad_agent_priv);
+error1:
+ return ret;
+}
+EXPORT_SYMBOL(ib_register_mad_agent);
+
+static inline int is_snooping_sends(int mad_snoop_flags)
+{
+ return (mad_snoop_flags &
+ (/*IB_MAD_SNOOP_POSTED_SENDS |
+ IB_MAD_SNOOP_RMPP_SENDS |*/
+ IB_MAD_SNOOP_SEND_COMPLETIONS /*|
+ IB_MAD_SNOOP_RMPP_SEND_COMPLETIONS*/));
+}
+
+static inline int is_snooping_recvs(int mad_snoop_flags)
+{
+ return (mad_snoop_flags &
+ (IB_MAD_SNOOP_RECVS /*|
+ IB_MAD_SNOOP_RMPP_RECVS*/));
+}
+
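+/* Add the snoop agent to the QP's snoop table; returns the slot index or -ENOMEM */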
+static int register_snoop_agent(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_snoop_private *mad_snoop_priv)
+{
+ struct ib_mad_snoop_private **new_snoop_table;
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ /* Check for empty slot in array. */
+ for (i = 0; i < qp_info->snoop_table_size; i++)
+ if (!qp_info->snoop_table[i])
+ break;
+
+ if (i == qp_info->snoop_table_size) {
+ /* Grow table. */
+ new_snoop_table = krealloc(qp_info->snoop_table,
+ sizeof mad_snoop_priv *
+ (qp_info->snoop_table_size + 1),
+ GFP_ATOMIC);
+ if (!new_snoop_table) {
+ i = -ENOMEM;
+ goto out;
+ }
+
+ qp_info->snoop_table = new_snoop_table;
+ qp_info->snoop_table_size++;
+ }
+ qp_info->snoop_table[i] = mad_snoop_priv;
+ atomic_inc(&qp_info->snoop_count);
+out:
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+ return i;
+}
+
+struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device,
+ u8 port_num,
+ enum ib_qp_type qp_type,
+ int mad_snoop_flags,
+ ib_mad_snoop_handler snoop_handler,
+ ib_mad_recv_handler recv_handler,
+ void *context)
+{
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_agent *ret;
+ struct ib_mad_snoop_private *mad_snoop_priv;
+ int qpn;
+
+ /* Validate parameters */
+ if ((is_snooping_sends(mad_snoop_flags) && !snoop_handler) ||
+ (is_snooping_recvs(mad_snoop_flags) && !recv_handler)) {
+ ret = ERR_PTR(-EINVAL);
+ goto error1;
+ }
+ qpn = get_spl_qp_index(qp_type);
+ if (qpn == -1) {
+ ret = ERR_PTR(-EINVAL);
+ goto error1;
+ }
+ port_priv = ib_get_mad_port(device, port_num);
+ if (!port_priv) {
+ ret = ERR_PTR(-ENODEV);
+ goto error1;
+ }
+ /* Allocate structures */
+ mad_snoop_priv = kzalloc(sizeof *mad_snoop_priv, GFP_KERNEL);
+ if (!mad_snoop_priv) {
+ ret = ERR_PTR(-ENOMEM);
+ goto error1;
+ }
+
+ /* Now, fill in the various structures */
+ mad_snoop_priv->qp_info = &port_priv->qp_info[qpn];
+ mad_snoop_priv->agent.device = device;
+ mad_snoop_priv->agent.recv_handler = recv_handler;
+ mad_snoop_priv->agent.snoop_handler = snoop_handler;
+ mad_snoop_priv->agent.context = context;
+ mad_snoop_priv->agent.qp = port_priv->qp_info[qpn].qp;
+ mad_snoop_priv->agent.port_num = port_num;
+ mad_snoop_priv->mad_snoop_flags = mad_snoop_flags;
+ init_completion(&mad_snoop_priv->comp);
+ mad_snoop_priv->snoop_index = register_snoop_agent(
+ &port_priv->qp_info[qpn],
+ mad_snoop_priv);
+ if (mad_snoop_priv->snoop_index < 0) {
+ ret = ERR_PTR(mad_snoop_priv->snoop_index);
+ goto error2;
+ }
+
+ atomic_set(&mad_snoop_priv->refcount, 1);
+ return &mad_snoop_priv->agent;
+
+error2:
+ kfree(mad_snoop_priv);
+error1:
+ return ret;
+}
+EXPORT_SYMBOL(ib_register_mad_snoop);
+
+static inline void deref_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
+{
+ if (atomic_dec_and_test(&mad_agent_priv->refcount))
+ complete(&mad_agent_priv->comp);
+}
+
+static inline void deref_snoop_agent(struct ib_mad_snoop_private *mad_snoop_priv)
+{
+ if (atomic_dec_and_test(&mad_snoop_priv->refcount))
+ complete(&mad_snoop_priv->comp);
+}
+
+static void unregister_mad_agent(struct ib_mad_agent_private *mad_agent_priv)
+{
+ struct ib_mad_port_private *port_priv;
+ unsigned long flags;
+
+ /* Note that we could still be handling received MADs */
+
+ /*
+ * Canceling all sends results in dropping received response
+ * MADs, preventing us from queuing additional work
+ */
+ cancel_mads(mad_agent_priv);
+ port_priv = mad_agent_priv->qp_info->port_priv;
+ cancel_delayed_work(&mad_agent_priv->timed_work);
+
+ spin_lock_irqsave(&port_priv->reg_lock, flags);
+ remove_mad_reg_req(mad_agent_priv);
+ list_del(&mad_agent_priv->agent_list);
+ spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+
+ flush_workqueue(port_priv->wq);
+ ib_cancel_rmpp_recvs(mad_agent_priv);
+
+ deref_mad_agent(mad_agent_priv);
+ wait_for_completion(&mad_agent_priv->comp);
+
+ kfree(mad_agent_priv->reg_req);
+ ib_dereg_mr(mad_agent_priv->agent.mr);
+ kfree(mad_agent_priv);
+}
+
+static void unregister_mad_snoop(struct ib_mad_snoop_private *mad_snoop_priv)
+{
+ struct ib_mad_qp_info *qp_info;
+ unsigned long flags;
+
+ qp_info = mad_snoop_priv->qp_info;
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ qp_info->snoop_table[mad_snoop_priv->snoop_index] = NULL;
+ atomic_dec(&qp_info->snoop_count);
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+
+ deref_snoop_agent(mad_snoop_priv);
+ wait_for_completion(&mad_snoop_priv->comp);
+
+ kfree(mad_snoop_priv);
+}
+
+/*
+ * ib_unregister_mad_agent - Unregisters a client from using MAD services
+ */
+int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_snoop_private *mad_snoop_priv;
+
+ /* If the TID is zero, the agent can only snoop. */
+ if (mad_agent->hi_tid) {
+ mad_agent_priv = container_of(mad_agent,
+ struct ib_mad_agent_private,
+ agent);
+ unregister_mad_agent(mad_agent_priv);
+ } else {
+ mad_snoop_priv = container_of(mad_agent,
+ struct ib_mad_snoop_private,
+ agent);
+ unregister_mad_snoop(mad_snoop_priv);
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ib_unregister_mad_agent);
+
+static void dequeue_mad(struct ib_mad_list_head *mad_list)
+{
+ struct ib_mad_queue *mad_queue;
+ unsigned long flags;
+
+ BUG_ON(!mad_list->mad_queue);
+ mad_queue = mad_list->mad_queue;
+ spin_lock_irqsave(&mad_queue->lock, flags);
+ list_del(&mad_list->list);
+ mad_queue->count--;
+ spin_unlock_irqrestore(&mad_queue->lock, flags);
+}
+
+static void snoop_send(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_send_buf *send_buf,
+ struct ib_mad_send_wc *mad_send_wc,
+ int mad_snoop_flags)
+{
+ struct ib_mad_snoop_private *mad_snoop_priv;
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ for (i = 0; i < qp_info->snoop_table_size; i++) {
+ mad_snoop_priv = qp_info->snoop_table[i];
+ if (!mad_snoop_priv ||
+ !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags))
+ continue;
+
+ atomic_inc(&mad_snoop_priv->refcount);
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+ mad_snoop_priv->agent.snoop_handler(&mad_snoop_priv->agent,
+ send_buf, mad_send_wc);
+ deref_snoop_agent(mad_snoop_priv);
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ }
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+}
+
+static void snoop_recv(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_recv_wc *mad_recv_wc,
+ int mad_snoop_flags)
+{
+ struct ib_mad_snoop_private *mad_snoop_priv;
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ for (i = 0; i < qp_info->snoop_table_size; i++) {
+ mad_snoop_priv = qp_info->snoop_table[i];
+ if (!mad_snoop_priv ||
+ !(mad_snoop_priv->mad_snoop_flags & mad_snoop_flags))
+ continue;
+
+ atomic_inc(&mad_snoop_priv->refcount);
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+ mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent,
+ mad_recv_wc);
+ deref_snoop_agent(mad_snoop_priv);
+ spin_lock_irqsave(&qp_info->snoop_lock, flags);
+ }
+ spin_unlock_irqrestore(&qp_info->snoop_lock, flags);
+}
+
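+/*
+ * Build a synthetic receive work completion for an SMP that is handled
+ * locally instead of being sent out on the wire.
+ */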
+static void build_smp_wc(struct ib_qp *qp,
+ u64 wr_id, u16 slid, u16 pkey_index, u8 port_num,
+ struct ib_wc *wc)
+{
+ memset(wc, 0, sizeof *wc);
+ wc->wr_id = wr_id;
+ wc->status = IB_WC_SUCCESS;
+ wc->opcode = IB_WC_RECV;
+ wc->pkey_index = pkey_index;
+ wc->byte_len = sizeof(struct ib_mad) + sizeof(struct ib_grh);
+ wc->src_qp = IB_QP0;
+ wc->qp = qp;
+ wc->slid = slid;
+ wc->sl = 0;
+ wc->dlid_path_bits = 0;
+ wc->port_num = port_num;
+}
+
+/*
+ * Return 0 if SMP is to be sent
+ * Return 1 if SMP was consumed locally (whether or not solicited)
+ * Return < 0 if error
+ */
+static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_send_wr_private *mad_send_wr)
+{
+ int ret = 0;
+ struct ib_smp *smp = mad_send_wr->send_buf.mad;
+ unsigned long flags;
+ struct ib_mad_local_private *local;
+ struct ib_mad_private *mad_priv;
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_agent_private *recv_mad_agent = NULL;
+ struct ib_device *device = mad_agent_priv->agent.device;
+ u8 port_num;
+ struct ib_wc mad_wc;
+ struct ib_send_wr *send_wr = &mad_send_wr->send_wr;
+
+ if (device->node_type == RDMA_NODE_IB_SWITCH &&
+ smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ port_num = send_wr->wr.ud.port_num;
+ else
+ port_num = mad_agent_priv->agent.port_num;
+
+ /*
+ * Directed route handling starts if the initial LID routed part of
+ * a request or the ending LID routed part of a response is empty.
+ * If we are at the start of the LID routed part, don't update the
+ * hop_ptr or hop_cnt. See section 14.2.2, Vol 1 IB spec.
+ */
+ if ((ib_get_smp_direction(smp) ? smp->dr_dlid : smp->dr_slid) ==
+ IB_LID_PERMISSIVE &&
+ smi_handle_dr_smp_send(smp, device->node_type, port_num) ==
+ IB_SMI_DISCARD) {
+ ret = -EINVAL;
+ dev_err(&device->dev, "Invalid directed route\n");
+ goto out;
+ }
+
+ /* Check to post send on QP or process locally */
+ if (smi_check_local_smp(smp, device) == IB_SMI_DISCARD &&
+ smi_check_local_returning_smp(smp, device) == IB_SMI_DISCARD)
+ goto out;
+
+ local = kmalloc(sizeof *local, GFP_ATOMIC);
+ if (!local) {
+ ret = -ENOMEM;
+ dev_err(&device->dev, "No memory for ib_mad_local_private\n");
+ goto out;
+ }
+ local->mad_priv = NULL;
+ local->recv_mad_agent = NULL;
+ mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_ATOMIC);
+ if (!mad_priv) {
+ ret = -ENOMEM;
+ dev_err(&device->dev, "No memory for local response MAD\n");
+ kfree(local);
+ goto out;
+ }
+
+ build_smp_wc(mad_agent_priv->agent.qp,
+ send_wr->wr_id, be16_to_cpu(smp->dr_slid),
+ send_wr->wr.ud.pkey_index,
+ send_wr->wr.ud.port_num, &mad_wc);
+
+ /* No GRH for DR SMP */
+ ret = device->process_mad(device, 0, port_num, &mad_wc, NULL,
+ (struct ib_mad *)smp,
+ (struct ib_mad *)&mad_priv->mad);
+ switch (ret)
+ {
+ case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY:
+ if (ib_response_mad(&mad_priv->mad.mad) &&
+ mad_agent_priv->agent.recv_handler) {
+ local->mad_priv = mad_priv;
+ local->recv_mad_agent = mad_agent_priv;
+ /*
+ * Reference MAD agent until receive
+ * side of local completion handled
+ */
+ atomic_inc(&mad_agent_priv->refcount);
+ } else
+ kmem_cache_free(ib_mad_cache, mad_priv);
+ break;
+ case IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED:
+ kmem_cache_free(ib_mad_cache, mad_priv);
+ break;
+ case IB_MAD_RESULT_SUCCESS:
+ /* Treat like an incoming receive MAD */
+ port_priv = ib_get_mad_port(mad_agent_priv->agent.device,
+ mad_agent_priv->agent.port_num);
+ if (port_priv) {
+ memcpy(&mad_priv->mad.mad, smp, sizeof(struct ib_mad));
+ recv_mad_agent = find_mad_agent(port_priv,
+ &mad_priv->mad.mad);
+ }
+ if (!port_priv || !recv_mad_agent) {
+ /*
+ * No receiving agent so drop packet and
+ * generate send completion.
+ */
+ kmem_cache_free(ib_mad_cache, mad_priv);
+ break;
+ }
+ local->mad_priv = mad_priv;
+ local->recv_mad_agent = recv_mad_agent;
+ break;
+ default:
+ kmem_cache_free(ib_mad_cache, mad_priv);
+ kfree(local);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ local->mad_send_wr = mad_send_wr;
+ /* Reference MAD agent until send side of local completion handled */
+ atomic_inc(&mad_agent_priv->refcount);
+ /* Queue local completion to local list */
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ list_add_tail(&local->completion_list, &mad_agent_priv->local_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ queue_work(mad_agent_priv->qp_info->port_priv->wq,
+ &mad_agent_priv->local_work);
+ ret = 1;
+out:
+ return ret;
+}
+
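+/* Padding needed so the data fills out whole MAD/RMPP data segments */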
+static int get_pad_size(int hdr_len, int data_len)
+{
+ int seg_size, pad;
+
+ seg_size = sizeof(struct ib_mad) - hdr_len;
+ if (data_len && seg_size) {
+ pad = seg_size - data_len % seg_size;
+ return pad == seg_size ? 0 : pad;
+ } else
+ return seg_size;
+}
+
+static void free_send_rmpp_list(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_rmpp_segment *s, *t;
+
+ list_for_each_entry_safe(s, t, &mad_send_wr->rmpp_list, list) {
+ list_del(&s->list);
+ kfree(s);
+ }
+}
+
+static int alloc_send_rmpp_list(struct ib_mad_send_wr_private *send_wr,
+ gfp_t gfp_mask)
+{
+ struct ib_mad_send_buf *send_buf = &send_wr->send_buf;
+ struct ib_rmpp_mad *rmpp_mad = send_buf->mad;
+ struct ib_rmpp_segment *seg = NULL;
+ int left, seg_size, pad;
+
+ send_buf->seg_size = sizeof (struct ib_mad) - send_buf->hdr_len;
+ seg_size = send_buf->seg_size;
+ pad = send_wr->pad;
+
+ /* Allocate data segments. */
+ for (left = send_buf->data_len + pad; left > 0; left -= seg_size) {
+ seg = kmalloc(sizeof (*seg) + seg_size, gfp_mask);
+ if (!seg) {
+ dev_err(&send_buf->mad_agent->device->dev,
+ "alloc_send_rmpp_segs: RMPP mem alloc failed for len %zd, gfp %#x\n",
+ sizeof (*seg) + seg_size, gfp_mask);
+ free_send_rmpp_list(send_wr);
+ return -ENOMEM;
+ }
+ seg->num = ++send_buf->seg_count;
+ list_add_tail(&seg->list, &send_wr->rmpp_list);
+ }
+
+ /* Zero any padding */
+ if (pad)
+ memset(seg->data + seg_size - pad, 0, pad);
+
+ rmpp_mad->rmpp_hdr.rmpp_version = send_wr->mad_agent_priv->
+ agent.rmpp_version;
+ rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_DATA;
+ ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+
+ send_wr->cur_seg = container_of(send_wr->rmpp_list.next,
+ struct ib_rmpp_segment, list);
+ send_wr->last_ack_seg = send_wr->cur_seg;
+ return 0;
+}
+
+int ib_mad_kernel_rmpp_agent(struct ib_mad_agent *agent)
+{
+ return agent->rmpp_version && !(agent->flags & IB_MAD_USER_RMPP);
+}
+EXPORT_SYMBOL(ib_mad_kernel_rmpp_agent);
+
+struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent,
+ u32 remote_qpn, u16 pkey_index,
+ int rmpp_active,
+ int hdr_len, int data_len,
+ gfp_t gfp_mask)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ int pad, message_size, ret, size;
+ void *buf;
+
+ mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private,
+ agent);
+ pad = get_pad_size(hdr_len, data_len);
+ message_size = hdr_len + data_len + pad;
+
+ if (ib_mad_kernel_rmpp_agent(mad_agent)) {
+ if (!rmpp_active && message_size > sizeof(struct ib_mad))
+ return ERR_PTR(-EINVAL);
+ } else
+ if (rmpp_active || message_size > sizeof(struct ib_mad))
+ return ERR_PTR(-EINVAL);
+
+ size = rmpp_active ? hdr_len : sizeof(struct ib_mad);
+ buf = kzalloc(sizeof *mad_send_wr + size, gfp_mask);
+ if (!buf)
+ return ERR_PTR(-ENOMEM);
+
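+	/* The MAD buffer occupies the first 'size' bytes of the allocation;
+	 * the private send WR structure immediately follows it.
+	 */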
+ mad_send_wr = buf + size;
+ INIT_LIST_HEAD(&mad_send_wr->rmpp_list);
+ mad_send_wr->send_buf.mad = buf;
+ mad_send_wr->send_buf.hdr_len = hdr_len;
+ mad_send_wr->send_buf.data_len = data_len;
+ mad_send_wr->pad = pad;
+
+ mad_send_wr->mad_agent_priv = mad_agent_priv;
+ mad_send_wr->sg_list[0].length = hdr_len;
+ mad_send_wr->sg_list[0].lkey = mad_agent->mr->lkey;
+ mad_send_wr->sg_list[1].length = sizeof(struct ib_mad) - hdr_len;
+ mad_send_wr->sg_list[1].lkey = mad_agent->mr->lkey;
+
+ mad_send_wr->send_wr.wr_id = (unsigned long) mad_send_wr;
+ mad_send_wr->send_wr.sg_list = mad_send_wr->sg_list;
+ mad_send_wr->send_wr.num_sge = 2;
+ mad_send_wr->send_wr.opcode = IB_WR_SEND;
+ mad_send_wr->send_wr.send_flags = IB_SEND_SIGNALED;
+ mad_send_wr->send_wr.wr.ud.remote_qpn = remote_qpn;
+ mad_send_wr->send_wr.wr.ud.remote_qkey = IB_QP_SET_QKEY;
+ mad_send_wr->send_wr.wr.ud.pkey_index = pkey_index;
+
+ if (rmpp_active) {
+ ret = alloc_send_rmpp_list(mad_send_wr, gfp_mask);
+ if (ret) {
+ kfree(buf);
+ return ERR_PTR(ret);
+ }
+ }
+
+ mad_send_wr->send_buf.mad_agent = mad_agent;
+ atomic_inc(&mad_agent_priv->refcount);
+ return &mad_send_wr->send_buf;
+}
+EXPORT_SYMBOL(ib_create_send_mad);
+
+int ib_get_mad_data_offset(u8 mgmt_class)
+{
+ if (mgmt_class == IB_MGMT_CLASS_SUBN_ADM)
+ return IB_MGMT_SA_HDR;
+ else if ((mgmt_class == IB_MGMT_CLASS_DEVICE_MGMT) ||
+ (mgmt_class == IB_MGMT_CLASS_DEVICE_ADM) ||
+ (mgmt_class == IB_MGMT_CLASS_BIS))
+ return IB_MGMT_DEVICE_HDR;
+ else if ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) &&
+ (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END))
+ return IB_MGMT_VENDOR_HDR;
+ else
+ return IB_MGMT_MAD_HDR;
+}
+EXPORT_SYMBOL(ib_get_mad_data_offset);
+
+int ib_is_mad_class_rmpp(u8 mgmt_class)
+{
+ if ((mgmt_class == IB_MGMT_CLASS_SUBN_ADM) ||
+ (mgmt_class == IB_MGMT_CLASS_DEVICE_MGMT) ||
+ (mgmt_class == IB_MGMT_CLASS_DEVICE_ADM) ||
+ (mgmt_class == IB_MGMT_CLASS_BIS) ||
+ ((mgmt_class >= IB_MGMT_CLASS_VENDOR_RANGE2_START) &&
+ (mgmt_class <= IB_MGMT_CLASS_VENDOR_RANGE2_END)))
+ return 1;
+ return 0;
+}
+EXPORT_SYMBOL(ib_is_mad_class_rmpp);
+
+void *ib_get_rmpp_segment(struct ib_mad_send_buf *send_buf, int seg_num)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct list_head *list;
+
+ mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private,
+ send_buf);
+ list = &mad_send_wr->cur_seg->list;
+
+ if (mad_send_wr->cur_seg->num < seg_num) {
+ list_for_each_entry(mad_send_wr->cur_seg, list, list)
+ if (mad_send_wr->cur_seg->num == seg_num)
+ break;
+ } else if (mad_send_wr->cur_seg->num > seg_num) {
+ list_for_each_entry_reverse(mad_send_wr->cur_seg, list, list)
+ if (mad_send_wr->cur_seg->num == seg_num)
+ break;
+ }
+ return mad_send_wr->cur_seg->data;
+}
+EXPORT_SYMBOL(ib_get_rmpp_segment);
+
+static inline void *ib_get_payload(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ if (mad_send_wr->send_buf.seg_count)
+ return ib_get_rmpp_segment(&mad_send_wr->send_buf,
+ mad_send_wr->seg_num);
+ else
+ return mad_send_wr->send_buf.mad +
+ mad_send_wr->send_buf.hdr_len;
+}
+
+void ib_free_send_mad(struct ib_mad_send_buf *send_buf)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_wr_private *mad_send_wr;
+
+ mad_agent_priv = container_of(send_buf->mad_agent,
+ struct ib_mad_agent_private, agent);
+ mad_send_wr = container_of(send_buf, struct ib_mad_send_wr_private,
+ send_buf);
+
+ free_send_rmpp_list(mad_send_wr);
+ kfree(send_buf->mad);
+ deref_mad_agent(mad_agent_priv);
+}
+EXPORT_SYMBOL(ib_free_send_mad);
+
+int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_mad_qp_info *qp_info;
+ struct list_head *list;
+ struct ib_send_wr *bad_send_wr;
+ struct ib_mad_agent *mad_agent;
+ struct ib_sge *sge;
+ unsigned long flags;
+ int ret;
+
+ /* Set WR ID to find mad_send_wr upon completion */
+ qp_info = mad_send_wr->mad_agent_priv->qp_info;
+ mad_send_wr->send_wr.wr_id = (unsigned long)&mad_send_wr->mad_list;
+ mad_send_wr->mad_list.mad_queue = &qp_info->send_queue;
+
+ mad_agent = mad_send_wr->send_buf.mad_agent;
+ sge = mad_send_wr->sg_list;
+ sge[0].addr = ib_dma_map_single(mad_agent->device,
+ mad_send_wr->send_buf.mad,
+ sge[0].length,
+ DMA_TO_DEVICE);
+ if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[0].addr)))
+ return -ENOMEM;
+
+ mad_send_wr->header_mapping = sge[0].addr;
+
+ sge[1].addr = ib_dma_map_single(mad_agent->device,
+ ib_get_payload(mad_send_wr),
+ sge[1].length,
+ DMA_TO_DEVICE);
+ if (unlikely(ib_dma_mapping_error(mad_agent->device, sge[1].addr))) {
+ ib_dma_unmap_single(mad_agent->device,
+ mad_send_wr->header_mapping,
+ sge[0].length, DMA_TO_DEVICE);
+ return -ENOMEM;
+ }
+ mad_send_wr->payload_mapping = sge[1].addr;
+
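+	/* Post the work request now if the send queue has room; otherwise
+	 * queue it on the overflow list until the send queue drains.
+	 */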
+ spin_lock_irqsave(&qp_info->send_queue.lock, flags);
+ if (qp_info->send_queue.count < qp_info->send_queue.max_active) {
+ ret = ib_post_send(mad_agent->qp, &mad_send_wr->send_wr,
+ &bad_send_wr);
+ list = &qp_info->send_queue.list;
+ } else {
+ ret = 0;
+ list = &qp_info->overflow_list;
+ }
+
+ if (!ret) {
+ qp_info->send_queue.count++;
+ list_add_tail(&mad_send_wr->mad_list.list, list);
+ }
+ spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
+ if (ret) {
+ ib_dma_unmap_single(mad_agent->device,
+ mad_send_wr->header_mapping,
+ sge[0].length, DMA_TO_DEVICE);
+ ib_dma_unmap_single(mad_agent->device,
+ mad_send_wr->payload_mapping,
+ sge[1].length, DMA_TO_DEVICE);
+ }
+ return ret;
+}
+
+/*
+ * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated
+ * with the registered client
+ */
+int ib_post_send_mad(struct ib_mad_send_buf *send_buf,
+ struct ib_mad_send_buf **bad_send_buf)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_buf *next_send_buf;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ unsigned long flags;
+ int ret = -EINVAL;
+
+ /* Walk list of send WRs and post each on send list */
+ for (; send_buf; send_buf = next_send_buf) {
+
+ mad_send_wr = container_of(send_buf,
+ struct ib_mad_send_wr_private,
+ send_buf);
+ mad_agent_priv = mad_send_wr->mad_agent_priv;
+
+ if (!send_buf->mad_agent->send_handler ||
+ (send_buf->timeout_ms &&
+ !send_buf->mad_agent->recv_handler)) {
+ ret = -EINVAL;
+ goto error;
+ }
+
+ if (!ib_is_mad_class_rmpp(((struct ib_mad_hdr *) send_buf->mad)->mgmt_class)) {
+ if (mad_agent_priv->agent.rmpp_version) {
+ ret = -EINVAL;
+ goto error;
+ }
+ }
+
+ /*
+ * Save pointer to next work request to post in case the
+ * current one completes, and the user modifies the work
+ * request associated with the completion
+ */
+ next_send_buf = send_buf->next;
+ mad_send_wr->send_wr.wr.ud.ah = send_buf->ah;
+
+ if (((struct ib_mad_hdr *) send_buf->mad)->mgmt_class ==
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ ret = handle_outgoing_dr_smp(mad_agent_priv,
+ mad_send_wr);
+ if (ret < 0) /* error */
+ goto error;
+ else if (ret == 1) /* locally consumed */
+ continue;
+ }
+
+ mad_send_wr->tid = ((struct ib_mad_hdr *) send_buf->mad)->tid;
+ /* Timeout will be updated after send completes */
+ mad_send_wr->timeout = msecs_to_jiffies(send_buf->timeout_ms);
+ mad_send_wr->max_retries = send_buf->retries;
+ mad_send_wr->retries_left = send_buf->retries;
+ send_buf->retries = 0;
+ /* Reference for work request to QP + response */
+ mad_send_wr->refcount = 1 + (mad_send_wr->timeout > 0);
+ mad_send_wr->status = IB_WC_SUCCESS;
+
+ /* Reference MAD agent until send completes */
+ atomic_inc(&mad_agent_priv->refcount);
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ list_add_tail(&mad_send_wr->agent_list,
+ &mad_agent_priv->send_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) {
+ ret = ib_send_rmpp_mad(mad_send_wr);
+ if (ret >= 0 && ret != IB_RMPP_RESULT_CONSUMED)
+ ret = ib_send_mad(mad_send_wr);
+ } else
+ ret = ib_send_mad(mad_send_wr);
+ if (ret < 0) {
+ /* Fail send request */
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ list_del(&mad_send_wr->agent_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ atomic_dec(&mad_agent_priv->refcount);
+ goto error;
+ }
+ }
+ return 0;
+error:
+ if (bad_send_buf)
+ *bad_send_buf = send_buf;
+ return ret;
+}
+EXPORT_SYMBOL(ib_post_send_mad);
+
+/*
+ * ib_free_recv_mad - Returns data buffers used to receive
+ * a MAD to the access layer
+ */
+void ib_free_recv_mad(struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_mad_recv_buf *mad_recv_buf, *temp_recv_buf;
+ struct ib_mad_private_header *mad_priv_hdr;
+ struct ib_mad_private *priv;
+ struct list_head free_list;
+
+ INIT_LIST_HEAD(&free_list);
+ list_splice_init(&mad_recv_wc->rmpp_list, &free_list);
+
+ list_for_each_entry_safe(mad_recv_buf, temp_recv_buf,
+ &free_list, list) {
+ mad_recv_wc = container_of(mad_recv_buf, struct ib_mad_recv_wc,
+ recv_buf);
+ mad_priv_hdr = container_of(mad_recv_wc,
+ struct ib_mad_private_header,
+ recv_wc);
+ priv = container_of(mad_priv_hdr, struct ib_mad_private,
+ header);
+ kmem_cache_free(ib_mad_cache, priv);
+ }
+}
+EXPORT_SYMBOL(ib_free_recv_mad);
+
+struct ib_mad_agent *ib_redirect_mad_qp(struct ib_qp *qp,
+ u8 rmpp_version,
+ ib_mad_send_handler send_handler,
+ ib_mad_recv_handler recv_handler,
+ void *context)
+{
+ return ERR_PTR(-EINVAL); /* XXX: for now */
+}
+EXPORT_SYMBOL(ib_redirect_mad_qp);
+
+int ib_process_mad_wc(struct ib_mad_agent *mad_agent,
+ struct ib_wc *wc)
+{
+ dev_err(&mad_agent->device->dev,
+ "ib_process_mad_wc() not implemented yet\n");
+ return 0;
+}
+EXPORT_SYMBOL(ib_process_mad_wc);
+
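+/* Check whether any method in the registration request already has an agent */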
+static int method_in_use(struct ib_mad_mgmt_method_table **method,
+ struct ib_mad_reg_req *mad_reg_req)
+{
+ int i;
+
+ for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS) {
+ if ((*method)->agent[i]) {
+ pr_err("Method %d already in use\n", i);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
+static int allocate_method_table(struct ib_mad_mgmt_method_table **method)
+{
+ /* Allocate management method table */
+ *method = kzalloc(sizeof **method, GFP_ATOMIC);
+ if (!*method) {
+ pr_err("No memory for ib_mad_mgmt_method_table\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/*
+ * Check to see if there are any methods still in use
+ */
+static int check_method_table(struct ib_mad_mgmt_method_table *method)
+{
+ int i;
+
+ for (i = 0; i < IB_MGMT_MAX_METHODS; i++)
+ if (method->agent[i])
+ return 1;
+ return 0;
+}
+
+/*
+ * Check to see if there are any method tables for this class still in use
+ */
+static int check_class_table(struct ib_mad_mgmt_class_table *class)
+{
+ int i;
+
+ for (i = 0; i < MAX_MGMT_CLASS; i++)
+ if (class->method_table[i])
+ return 1;
+ return 0;
+}
+
+static int check_vendor_class(struct ib_mad_mgmt_vendor_class *vendor_class)
+{
+ int i;
+
+ for (i = 0; i < MAX_MGMT_OUI; i++)
+ if (vendor_class->method_table[i])
+ return 1;
+ return 0;
+}
+
+static int find_vendor_oui(struct ib_mad_mgmt_vendor_class *vendor_class,
+ char *oui)
+{
+ int i;
+
+ for (i = 0; i < MAX_MGMT_OUI; i++)
+		/* Is there a matching OUI for this vendor class? */
+ if (!memcmp(vendor_class->oui[i], oui, 3))
+ return i;
+
+ return -1;
+}
+
+static int check_vendor_table(struct ib_mad_mgmt_vendor_class_table *vendor)
+{
+ int i;
+
+ for (i = 0; i < MAX_MGMT_VENDOR_RANGE2; i++)
+ if (vendor->vendor_class[i])
+ return 1;
+
+ return 0;
+}
+
+static void remove_methods_mad_agent(struct ib_mad_mgmt_method_table *method,
+ struct ib_mad_agent_private *agent)
+{
+ int i;
+
+ /* Remove any methods for this mad agent */
+ for (i = 0; i < IB_MGMT_MAX_METHODS; i++) {
+ if (method->agent[i] == agent) {
+ method->agent[i] = NULL;
+ }
+ }
+}
+
+static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+ struct ib_mad_agent_private *agent_priv,
+ u8 mgmt_class)
+{
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_mgmt_class_table **class;
+ struct ib_mad_mgmt_method_table **method;
+ int i, ret;
+
+ port_priv = agent_priv->qp_info->port_priv;
+ class = &port_priv->version[mad_reg_req->mgmt_class_version].class;
+ if (!*class) {
+ /* Allocate management class table for "new" class version */
+ *class = kzalloc(sizeof **class, GFP_ATOMIC);
+ if (!*class) {
+ dev_err(&agent_priv->agent.device->dev,
+ "No memory for ib_mad_mgmt_class_table\n");
+ ret = -ENOMEM;
+ goto error1;
+ }
+
+ /* Allocate method table for this management class */
+ method = &(*class)->method_table[mgmt_class];
+ if ((ret = allocate_method_table(method)))
+ goto error2;
+ } else {
+ method = &(*class)->method_table[mgmt_class];
+ if (!*method) {
+ /* Allocate method table for this management class */
+ if ((ret = allocate_method_table(method)))
+ goto error1;
+ }
+ }
+
+ /* Now, make sure methods are not already in use */
+ if (method_in_use(method, mad_reg_req))
+ goto error3;
+
+ /* Finally, add in methods being registered */
+ for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS)
+ (*method)->agent[i] = agent_priv;
+
+ return 0;
+
+error3:
+ /* Remove any methods for this mad agent */
+ remove_methods_mad_agent(*method, agent_priv);
+ /* Now, check to see if there are any methods in use */
+ if (!check_method_table(*method)) {
+ /* If not, release management method table */
+ kfree(*method);
+ *method = NULL;
+ }
+ ret = -EINVAL;
+ goto error1;
+error2:
+ kfree(*class);
+ *class = NULL;
+error1:
+ return ret;
+}
+
+static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req,
+ struct ib_mad_agent_private *agent_priv)
+{
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_mgmt_vendor_class_table **vendor_table;
+ struct ib_mad_mgmt_vendor_class_table *vendor = NULL;
+ struct ib_mad_mgmt_vendor_class *vendor_class = NULL;
+ struct ib_mad_mgmt_method_table **method;
+ int i, ret = -ENOMEM;
+ u8 vclass;
+
+ /* "New" vendor (with OUI) class */
+ vclass = vendor_class_index(mad_reg_req->mgmt_class);
+ port_priv = agent_priv->qp_info->port_priv;
+ vendor_table = &port_priv->version[
+ mad_reg_req->mgmt_class_version].vendor;
+ if (!*vendor_table) {
+ /* Allocate mgmt vendor class table for "new" class version */
+ vendor = kzalloc(sizeof *vendor, GFP_ATOMIC);
+ if (!vendor) {
+ dev_err(&agent_priv->agent.device->dev,
+ "No memory for ib_mad_mgmt_vendor_class_table\n");
+ goto error1;
+ }
+
+ *vendor_table = vendor;
+ }
+ if (!(*vendor_table)->vendor_class[vclass]) {
+ /* Allocate table for this management vendor class */
+ vendor_class = kzalloc(sizeof *vendor_class, GFP_ATOMIC);
+ if (!vendor_class) {
+ dev_err(&agent_priv->agent.device->dev,
+ "No memory for ib_mad_mgmt_vendor_class\n");
+ goto error2;
+ }
+
+ (*vendor_table)->vendor_class[vclass] = vendor_class;
+ }
+ for (i = 0; i < MAX_MGMT_OUI; i++) {
+ /* Is there a matching OUI for this vendor class? */
+ if (!memcmp((*vendor_table)->vendor_class[vclass]->oui[i],
+ mad_reg_req->oui, 3)) {
+ method = &(*vendor_table)->vendor_class[
+ vclass]->method_table[i];
+ BUG_ON(!*method);
+ goto check_in_use;
+ }
+ }
+ for (i = 0; i < MAX_MGMT_OUI; i++) {
+ /* Is an OUI slot available? */
+ if (!is_vendor_oui((*vendor_table)->vendor_class[
+ vclass]->oui[i])) {
+ method = &(*vendor_table)->vendor_class[
+ vclass]->method_table[i];
+ BUG_ON(*method);
+ /* Allocate method table for this OUI */
+ if ((ret = allocate_method_table(method)))
+ goto error3;
+ memcpy((*vendor_table)->vendor_class[vclass]->oui[i],
+ mad_reg_req->oui, 3);
+ goto check_in_use;
+ }
+ }
+ dev_err(&agent_priv->agent.device->dev, "All OUI slots in use\n");
+ goto error3;
+
+check_in_use:
+ /* Now, make sure methods are not already in use */
+ if (method_in_use(method, mad_reg_req))
+ goto error4;
+
+ /* Finally, add in methods being registered */
+ for_each_set_bit(i, mad_reg_req->method_mask, IB_MGMT_MAX_METHODS)
+ (*method)->agent[i] = agent_priv;
+
+ return 0;
+
+error4:
+ /* Remove any methods for this mad agent */
+ remove_methods_mad_agent(*method, agent_priv);
+ /* Now, check to see if there are any methods in use */
+ if (!check_method_table(*method)) {
+ /* If not, release management method table */
+ kfree(*method);
+ *method = NULL;
+ }
+ ret = -EINVAL;
+error3:
+ if (vendor_class) {
+ (*vendor_table)->vendor_class[vclass] = NULL;
+ kfree(vendor_class);
+ }
+error2:
+ if (vendor) {
+ *vendor_table = NULL;
+ kfree(vendor);
+ }
+error1:
+ return ret;
+}
+
+static void remove_mad_reg_req(struct ib_mad_agent_private *agent_priv)
+{
+ struct ib_mad_port_private *port_priv;
+ struct ib_mad_mgmt_class_table *class;
+ struct ib_mad_mgmt_method_table *method;
+ struct ib_mad_mgmt_vendor_class_table *vendor;
+ struct ib_mad_mgmt_vendor_class *vendor_class;
+ int index;
+ u8 mgmt_class;
+
+ /*
+ * Was a MAD registration request supplied
+ * with the original registration?
+ */
+ if (!agent_priv->reg_req) {
+ goto out;
+ }
+
+ port_priv = agent_priv->qp_info->port_priv;
+ mgmt_class = convert_mgmt_class(agent_priv->reg_req->mgmt_class);
+ class = port_priv->version[
+ agent_priv->reg_req->mgmt_class_version].class;
+ if (!class)
+ goto vendor_check;
+
+ method = class->method_table[mgmt_class];
+ if (method) {
+ /* Remove any methods for this mad agent */
+ remove_methods_mad_agent(method, agent_priv);
+ /* Now, check to see if there are any methods still in use */
+ if (!check_method_table(method)) {
+ /* If not, release management method table */
+ kfree(method);
+ class->method_table[mgmt_class] = NULL;
+ /* Any management classes left? */
+ if (!check_class_table(class)) {
+ /* If not, release management class table */
+ kfree(class);
+ port_priv->version[
+ agent_priv->reg_req->
+ mgmt_class_version].class = NULL;
+ }
+ }
+ }
+
+vendor_check:
+ if (!is_vendor_class(mgmt_class))
+ goto out;
+
+ /* normalize mgmt_class to vendor range 2 */
+ mgmt_class = vendor_class_index(agent_priv->reg_req->mgmt_class);
+ vendor = port_priv->version[
+ agent_priv->reg_req->mgmt_class_version].vendor;
+
+ if (!vendor)
+ goto out;
+
+ vendor_class = vendor->vendor_class[mgmt_class];
+ if (vendor_class) {
+ index = find_vendor_oui(vendor_class, agent_priv->reg_req->oui);
+ if (index < 0)
+ goto out;
+ method = vendor_class->method_table[index];
+ if (method) {
+ /* Remove any methods for this mad agent */
+ remove_methods_mad_agent(method, agent_priv);
+ /*
+ * Now, check to see if there are
+ * any methods still in use
+ */
+ if (!check_method_table(method)) {
+ /* If not, release management method table */
+ kfree(method);
+ vendor_class->method_table[index] = NULL;
+ memset(vendor_class->oui[index], 0, 3);
+ /* Any OUIs left? */
+ if (!check_vendor_class(vendor_class)) {
+ /* If not, release vendor class table */
+ kfree(vendor_class);
+ vendor->vendor_class[mgmt_class] = NULL;
+ /* Any other vendor classes left? */
+ if (!check_vendor_table(vendor)) {
+ kfree(vendor);
+ port_priv->version[
+ agent_priv->reg_req->
+ mgmt_class_version].
+ vendor = NULL;
+ }
+ }
+ }
+ }
+ }
+
+out:
+ return;
+}
+
+static struct ib_mad_agent_private *
+find_mad_agent(struct ib_mad_port_private *port_priv,
+ struct ib_mad *mad)
+{
+ struct ib_mad_agent_private *mad_agent = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&port_priv->reg_lock, flags);
+ if (ib_response_mad(mad)) {
+ u32 hi_tid;
+ struct ib_mad_agent_private *entry;
+
+ /*
+ * Routing is based on the high 32 bits of the MAD's
+ * transaction ID.
+ */
+ hi_tid = be64_to_cpu(mad->mad_hdr.tid) >> 32;
+ list_for_each_entry(entry, &port_priv->agent_list, agent_list) {
+ if (entry->agent.hi_tid == hi_tid) {
+ mad_agent = entry;
+ break;
+ }
+ }
+ } else {
+ struct ib_mad_mgmt_class_table *class;
+ struct ib_mad_mgmt_method_table *method;
+ struct ib_mad_mgmt_vendor_class_table *vendor;
+ struct ib_mad_mgmt_vendor_class *vendor_class;
+ struct ib_vendor_mad *vendor_mad;
+ int index;
+
+ /*
+ * Routing is based on version, class, and method.
+ * For "newer" vendor MADs, it is also based on OUI.
+ */
+ if (mad->mad_hdr.class_version >= MAX_MGMT_VERSION)
+ goto out;
+ if (!is_vendor_class(mad->mad_hdr.mgmt_class)) {
+ class = port_priv->version[
+ mad->mad_hdr.class_version].class;
+ if (!class)
+ goto out;
+ if (convert_mgmt_class(mad->mad_hdr.mgmt_class) >=
+ IB_MGMT_MAX_METHODS)
+ goto out;
+ method = class->method_table[convert_mgmt_class(
+ mad->mad_hdr.mgmt_class)];
+ if (method)
+ mad_agent = method->agent[mad->mad_hdr.method &
+ ~IB_MGMT_METHOD_RESP];
+ } else {
+ vendor = port_priv->version[
+ mad->mad_hdr.class_version].vendor;
+ if (!vendor)
+ goto out;
+ vendor_class = vendor->vendor_class[vendor_class_index(
+ mad->mad_hdr.mgmt_class)];
+ if (!vendor_class)
+ goto out;
+ /* Find matching OUI */
+ vendor_mad = (struct ib_vendor_mad *)mad;
+ index = find_vendor_oui(vendor_class, vendor_mad->oui);
+ if (index == -1)
+ goto out;
+ method = vendor_class->method_table[index];
+ if (method) {
+ mad_agent = method->agent[mad->mad_hdr.method &
+ ~IB_MGMT_METHOD_RESP];
+ }
+ }
+ }
+
+ if (mad_agent) {
+ if (mad_agent->agent.recv_handler)
+ atomic_inc(&mad_agent->refcount);
+ else {
+ dev_notice(&port_priv->device->dev,
+ "No receive handler for client %p on port %d\n",
+ &mad_agent->agent, port_priv->port_num);
+ mad_agent = NULL;
+ }
+ }
+out:
+ spin_unlock_irqrestore(&port_priv->reg_lock, flags);
+
+ return mad_agent;
+}
+
+static int validate_mad(struct ib_mad *mad, u32 qp_num)
+{
+ int valid = 0;
+
+ /* Make sure MAD base version is understood */
+ if (mad->mad_hdr.base_version != IB_MGMT_BASE_VERSION) {
+ pr_err("MAD received with unsupported base version %d\n",
+ mad->mad_hdr.base_version);
+ goto out;
+ }
+
+ /* Filter SMI packets sent to other than QP0 */
+ if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED) ||
+ (mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)) {
+ if (qp_num == 0)
+ valid = 1;
+ } else {
+ /* Filter GSI packets sent to QP0 */
+ if (qp_num != 0)
+ valid = 1;
+ }
+
+out:
+ return valid;
+}
+
+static int is_data_mad(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_hdr *mad_hdr)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+
+ rmpp_mad = (struct ib_rmpp_mad *)mad_hdr;
+ return !mad_agent_priv->agent.rmpp_version ||
+ !ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent) ||
+ !(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+ IB_MGMT_RMPP_FLAG_ACTIVE) ||
+ (rmpp_mad->rmpp_hdr.rmpp_type == IB_MGMT_RMPP_TYPE_DATA);
+}
+
+static inline int rcv_has_same_class(struct ib_mad_send_wr_private *wr,
+ struct ib_mad_recv_wc *rwc)
+{
+ return ((struct ib_mad *)(wr->send_buf.mad))->mad_hdr.mgmt_class ==
+ rwc->recv_buf.mad->mad_hdr.mgmt_class;
+}
+
+static inline int rcv_has_same_gid(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_send_wr_private *wr,
+ struct ib_mad_recv_wc *rwc)
+{
+ struct ib_ah_attr attr;
+ u8 send_resp, rcv_resp;
+ union ib_gid sgid;
+ struct ib_device *device = mad_agent_priv->agent.device;
+ u8 port_num = mad_agent_priv->agent.port_num;
+ u8 lmc;
+
+ send_resp = ib_response_mad((struct ib_mad *)wr->send_buf.mad);
+ rcv_resp = ib_response_mad(rwc->recv_buf.mad);
+
+ if (send_resp == rcv_resp)
+ /* both requests or both responses: assume GIDs differ */
+ return 0;
+
+ if (ib_query_ah(wr->send_buf.ah, &attr))
+ /* Assume not equal, to avoid false positives. */
+ return 0;
+
+ if (!!(attr.ah_flags & IB_AH_GRH) !=
+ !!(rwc->wc->wc_flags & IB_WC_GRH))
+ /* one has GID, other does not. Assume different */
+ return 0;
+
+ if (!send_resp && rcv_resp) {
+ /* sent a request, received a response */
+ if (!(attr.ah_flags & IB_AH_GRH)) {
+ if (ib_get_cached_lmc(device, port_num, &lmc))
+ return 0;
+ return (!lmc || !((attr.src_path_bits ^
+ rwc->wc->dlid_path_bits) &
+ ((1 << lmc) - 1)));
+ } else {
+ if (ib_get_cached_gid(device, port_num,
+ attr.grh.sgid_index, &sgid))
+ return 0;
+ return !memcmp(sgid.raw, rwc->recv_buf.grh->dgid.raw,
+ 16);
+ }
+ }
+
+ if (!(attr.ah_flags & IB_AH_GRH))
+ return attr.dlid == rwc->wc->slid;
+ else
+ return !memcmp(attr.grh.dgid.raw, rwc->recv_buf.grh->sgid.raw,
+ 16);
+}
+
+static inline int is_direct(u8 class)
+{
+ return (class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE);
+}
+
+struct ib_mad_send_wr_private*
+ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_recv_wc *wc)
+{
+ struct ib_mad_send_wr_private *wr;
+ struct ib_mad *mad;
+
+ mad = (struct ib_mad *)wc->recv_buf.mad;
+
+ list_for_each_entry(wr, &mad_agent_priv->wait_list, agent_list) {
+ if ((wr->tid == mad->mad_hdr.tid) &&
+ rcv_has_same_class(wr, wc) &&
+ /*
+ * Don't check GID for direct routed MADs.
+ * These might have permissive LIDs.
+ */
+ (is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) ||
+ rcv_has_same_gid(mad_agent_priv, wr, wc)))
+ return (wr->status == IB_WC_SUCCESS) ? wr : NULL;
+ }
+
+ /*
+ * It's possible to receive the response before we've
+ * been notified that the send has completed
+ */
+ list_for_each_entry(wr, &mad_agent_priv->send_list, agent_list) {
+ if (is_data_mad(mad_agent_priv, wr->send_buf.mad) &&
+ wr->tid == mad->mad_hdr.tid &&
+ wr->timeout &&
+ rcv_has_same_class(wr, wc) &&
+ /*
+ * Don't check GID for direct routed MADs.
+ * These might have permissive LIDs.
+ */
+ (is_direct(wc->recv_buf.mad->mad_hdr.mgmt_class) ||
+ rcv_has_same_gid(mad_agent_priv, wr, wc)))
+ /* Verify request has not been canceled */
+ return (wr->status == IB_WC_SUCCESS) ? wr : NULL;
+ }
+ return NULL;
+}
+
+void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ mad_send_wr->timeout = 0;
+ if (mad_send_wr->refcount == 1)
+ list_move_tail(&mad_send_wr->agent_list,
+ &mad_send_wr->mad_agent_priv->done_list);
+}
+
+static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_mad_send_wc mad_send_wc;
+ unsigned long flags;
+
+ INIT_LIST_HEAD(&mad_recv_wc->rmpp_list);
+ list_add(&mad_recv_wc->recv_buf.list, &mad_recv_wc->rmpp_list);
+ if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) {
+ mad_recv_wc = ib_process_rmpp_recv_wc(mad_agent_priv,
+ mad_recv_wc);
+ if (!mad_recv_wc) {
+ deref_mad_agent(mad_agent_priv);
+ return;
+ }
+ }
+
+ /* Complete corresponding request */
+ if (ib_response_mad(mad_recv_wc->recv_buf.mad)) {
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ mad_send_wr = ib_find_send_mad(mad_agent_priv, mad_recv_wc);
+ if (!mad_send_wr) {
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ if (!ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)
+ && ib_is_mad_class_rmpp(mad_recv_wc->recv_buf.mad->mad_hdr.mgmt_class)
+ && (ib_get_rmpp_flags(&((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr)
+ & IB_MGMT_RMPP_FLAG_ACTIVE)) {
+ /* user RMPP is in effect
+ * and this is an active RMPP MAD
+ */
+ mad_recv_wc->wc->wr_id = 0;
+ mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
+ mad_recv_wc);
+ atomic_dec(&mad_agent_priv->refcount);
+ } else {
+ /* not user RMPP; revert to normal behavior and
+ * drop the MAD */
+ ib_free_recv_mad(mad_recv_wc);
+ deref_mad_agent(mad_agent_priv);
+ return;
+ }
+ } else {
+ ib_mark_mad_done(mad_send_wr);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ /* Defined behavior is to complete response before request */
+ mad_recv_wc->wc->wr_id = (unsigned long) &mad_send_wr->send_buf;
+ mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
+ mad_recv_wc);
+ atomic_dec(&mad_agent_priv->refcount);
+
+ mad_send_wc.status = IB_WC_SUCCESS;
+ mad_send_wc.vendor_err = 0;
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+ }
+ } else {
+ mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent,
+ mad_recv_wc);
+ deref_mad_agent(mad_agent_priv);
+ }
+}
+
+static bool generate_unmatched_resp(struct ib_mad_private *recv,
+ struct ib_mad_private *response)
+{
+ if (recv->mad.mad.mad_hdr.method == IB_MGMT_METHOD_GET ||
+ recv->mad.mad.mad_hdr.method == IB_MGMT_METHOD_SET) {
+ memcpy(response, recv, sizeof *response);
+ response->header.recv_wc.wc = &response->header.wc;
+ response->header.recv_wc.recv_buf.mad = &response->mad.mad;
+ response->header.recv_wc.recv_buf.grh = &response->grh;
+ response->mad.mad.mad_hdr.method = IB_MGMT_METHOD_GET_RESP;
+ response->mad.mad.mad_hdr.status =
+ cpu_to_be16(IB_MGMT_MAD_STATUS_UNSUPPORTED_METHOD_ATTRIB);
+ if (recv->mad.mad.mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ response->mad.mad.mad_hdr.status |= IB_SMP_DIRECTION;
+
+ return true;
+ } else {
+ return false;
+ }
+}
+
+static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv,
+ struct ib_wc *wc)
+{
+ struct ib_mad_qp_info *qp_info;
+ struct ib_mad_private_header *mad_priv_hdr;
+ struct ib_mad_private *recv, *response = NULL;
+ struct ib_mad_list_head *mad_list;
+ struct ib_mad_agent_private *mad_agent;
+ int port_num;
+ int ret = IB_MAD_RESULT_SUCCESS;
+
+ mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+ qp_info = mad_list->mad_queue->qp_info;
+ dequeue_mad(mad_list);
+
+ mad_priv_hdr = container_of(mad_list, struct ib_mad_private_header,
+ mad_list);
+ recv = container_of(mad_priv_hdr, struct ib_mad_private, header);
+ ib_dma_unmap_single(port_priv->device,
+ recv->header.mapping,
+ sizeof(struct ib_mad_private) -
+ sizeof(struct ib_mad_private_header),
+ DMA_FROM_DEVICE);
+
+ /* Set up the MAD receive work completion from the "normal" work completion */
+ recv->header.wc = *wc;
+ recv->header.recv_wc.wc = &recv->header.wc;
+ recv->header.recv_wc.mad_len = sizeof(struct ib_mad);
+ recv->header.recv_wc.recv_buf.mad = &recv->mad.mad;
+ recv->header.recv_wc.recv_buf.grh = &recv->grh;
+
+ if (atomic_read(&qp_info->snoop_count))
+ snoop_recv(qp_info, &recv->header.recv_wc, IB_MAD_SNOOP_RECVS);
+
+ /* Validate MAD */
+ if (!validate_mad(&recv->mad.mad, qp_info->qp->qp_num))
+ goto out;
+
+ response = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL);
+ if (!response) {
+ dev_err(&port_priv->device->dev,
+ "ib_mad_recv_done_handler no memory for response buffer\n");
+ goto out;
+ }
+
+ if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH)
+ port_num = wc->port_num;
+ else
+ port_num = port_priv->port_num;
+
+ if (recv->mad.mad.mad_hdr.mgmt_class ==
+ IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ enum smi_forward_action retsmi;
+
+ if (smi_handle_dr_smp_recv(&recv->mad.smp,
+ port_priv->device->node_type,
+ port_num,
+ port_priv->device->phys_port_cnt) ==
+ IB_SMI_DISCARD)
+ goto out;
+
+ retsmi = smi_check_forward_dr_smp(&recv->mad.smp);
+ if (retsmi == IB_SMI_LOCAL)
+ goto local;
+
+ if (retsmi == IB_SMI_SEND) { /* don't forward */
+ if (smi_handle_dr_smp_send(&recv->mad.smp,
+ port_priv->device->node_type,
+ port_num) == IB_SMI_DISCARD)
+ goto out;
+
+ if (smi_check_local_smp(&recv->mad.smp, port_priv->device) == IB_SMI_DISCARD)
+ goto out;
+ } else if (port_priv->device->node_type == RDMA_NODE_IB_SWITCH) {
+ /* forward case for switches */
+ memcpy(response, recv, sizeof(*response));
+ response->header.recv_wc.wc = &response->header.wc;
+ response->header.recv_wc.recv_buf.mad = &response->mad.mad;
+ response->header.recv_wc.recv_buf.grh = &response->grh;
+
+ agent_send_response(&response->mad.mad,
+ &response->grh, wc,
+ port_priv->device,
+ smi_get_fwd_port(&recv->mad.smp),
+ qp_info->qp->qp_num);
+
+ goto out;
+ }
+ }
+
+local:
+ /* Give driver "right of first refusal" on incoming MAD */
+ if (port_priv->device->process_mad) {
+ ret = port_priv->device->process_mad(port_priv->device, 0,
+ port_priv->port_num,
+ wc, &recv->grh,
+ &recv->mad.mad,
+ &response->mad.mad);
+ if (ret & IB_MAD_RESULT_SUCCESS) {
+ if (ret & IB_MAD_RESULT_CONSUMED)
+ goto out;
+ if (ret & IB_MAD_RESULT_REPLY) {
+ agent_send_response(&response->mad.mad,
+ &recv->grh, wc,
+ port_priv->device,
+ port_num,
+ qp_info->qp->qp_num);
+ goto out;
+ }
+ }
+ }
+
+ mad_agent = find_mad_agent(port_priv, &recv->mad.mad);
+ if (mad_agent) {
+ ib_mad_complete_recv(mad_agent, &recv->header.recv_wc);
+ /*
+ * recv is freed up in error cases in ib_mad_complete_recv()
+ * or via the recv_handler it invokes
+ */
+ recv = NULL;
+ } else if ((ret & IB_MAD_RESULT_SUCCESS) &&
+ generate_unmatched_resp(recv, response)) {
+ agent_send_response(&response->mad.mad, &recv->grh, wc,
+ port_priv->device, port_num, qp_info->qp->qp_num);
+ }
+
+out:
+ /* Post another receive request for this QP */
+ if (response) {
+ ib_mad_post_receive_mads(qp_info, response);
+ if (recv)
+ kmem_cache_free(ib_mad_cache, recv);
+ } else
+ ib_mad_post_receive_mads(qp_info, recv);
+}
+
+static void adjust_timeout(struct ib_mad_agent_private *mad_agent_priv)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ unsigned long delay;
+
+ if (list_empty(&mad_agent_priv->wait_list)) {
+ cancel_delayed_work(&mad_agent_priv->timed_work);
+ } else {
+ mad_send_wr = list_entry(mad_agent_priv->wait_list.next,
+ struct ib_mad_send_wr_private,
+ agent_list);
+
+ if (time_after(mad_agent_priv->timeout,
+ mad_send_wr->timeout)) {
+ mad_agent_priv->timeout = mad_send_wr->timeout;
+ delay = mad_send_wr->timeout - jiffies;
+ if ((long)delay <= 0)
+ delay = 1;
+ mod_delayed_work(mad_agent_priv->qp_info->port_priv->wq,
+ &mad_agent_priv->timed_work, delay);
+ }
+ }
+}
+
+static void wait_for_response(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_wr_private *temp_mad_send_wr;
+ struct list_head *list_item;
+ unsigned long delay;
+
+ mad_agent_priv = mad_send_wr->mad_agent_priv;
+ list_del(&mad_send_wr->agent_list);
+
+ delay = mad_send_wr->timeout;
+ mad_send_wr->timeout += jiffies;
+
+ if (delay) {
+ list_for_each_prev(list_item, &mad_agent_priv->wait_list) {
+ temp_mad_send_wr = list_entry(list_item,
+ struct ib_mad_send_wr_private,
+ agent_list);
+ if (time_after(mad_send_wr->timeout,
+ temp_mad_send_wr->timeout))
+ break;
+ }
+ } else
+ list_item = &mad_agent_priv->wait_list;
+ list_add(&mad_send_wr->agent_list, list_item);
+
+ /* Reschedule a work item if we have a shorter timeout */
+ if (mad_agent_priv->wait_list.next == &mad_send_wr->agent_list)
+ mod_delayed_work(mad_agent_priv->qp_info->port_priv->wq,
+ &mad_agent_priv->timed_work, delay);
+}
+
+void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr,
+ int timeout_ms)
+{
+ mad_send_wr->timeout = msecs_to_jiffies(timeout_ms);
+ wait_for_response(mad_send_wr);
+}
+
+/*
+ * Process a send work completion
+ */
+void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ unsigned long flags;
+ int ret;
+
+ mad_agent_priv = mad_send_wr->mad_agent_priv;
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ if (ib_mad_kernel_rmpp_agent(&mad_agent_priv->agent)) {
+ ret = ib_process_rmpp_send_wc(mad_send_wr, mad_send_wc);
+ if (ret == IB_RMPP_RESULT_CONSUMED)
+ goto done;
+ } else
+ ret = IB_RMPP_RESULT_UNHANDLED;
+
+ if (mad_send_wc->status != IB_WC_SUCCESS &&
+ mad_send_wr->status == IB_WC_SUCCESS) {
+ mad_send_wr->status = mad_send_wc->status;
+ mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
+ }
+
+ if (--mad_send_wr->refcount > 0) {
+ if (mad_send_wr->refcount == 1 && mad_send_wr->timeout &&
+ mad_send_wr->status == IB_WC_SUCCESS) {
+ wait_for_response(mad_send_wr);
+ }
+ goto done;
+ }
+
+ /* Remove send from MAD agent and notify client of completion */
+ list_del(&mad_send_wr->agent_list);
+ adjust_timeout(mad_agent_priv);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ if (mad_send_wr->status != IB_WC_SUCCESS)
+ mad_send_wc->status = mad_send_wr->status;
+ if (ret == IB_RMPP_RESULT_INTERNAL)
+ ib_rmpp_send_handler(mad_send_wc);
+ else
+ mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+ mad_send_wc);
+
+ /* Release reference on agent taken when sending */
+ deref_mad_agent(mad_agent_priv);
+ return;
+done:
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+static void ib_mad_send_done_handler(struct ib_mad_port_private *port_priv,
+ struct ib_wc *wc)
+{
+ struct ib_mad_send_wr_private *mad_send_wr, *queued_send_wr;
+ struct ib_mad_list_head *mad_list;
+ struct ib_mad_qp_info *qp_info;
+ struct ib_mad_queue *send_queue;
+ struct ib_send_wr *bad_send_wr;
+ struct ib_mad_send_wc mad_send_wc;
+ unsigned long flags;
+ int ret;
+
+ mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+ mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private,
+ mad_list);
+ send_queue = mad_list->mad_queue;
+ qp_info = send_queue->qp_info;
+
+retry:
+ ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device,
+ mad_send_wr->header_mapping,
+ mad_send_wr->sg_list[0].length, DMA_TO_DEVICE);
+ ib_dma_unmap_single(mad_send_wr->send_buf.mad_agent->device,
+ mad_send_wr->payload_mapping,
+ mad_send_wr->sg_list[1].length, DMA_TO_DEVICE);
+ queued_send_wr = NULL;
+ spin_lock_irqsave(&send_queue->lock, flags);
+ list_del(&mad_list->list);
+
+ /* Move queued send to the send queue */
+ if (send_queue->count-- > send_queue->max_active) {
+ mad_list = container_of(qp_info->overflow_list.next,
+ struct ib_mad_list_head, list);
+ queued_send_wr = container_of(mad_list,
+ struct ib_mad_send_wr_private,
+ mad_list);
+ list_move_tail(&mad_list->list, &send_queue->list);
+ }
+ spin_unlock_irqrestore(&send_queue->lock, flags);
+
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ mad_send_wc.status = wc->status;
+ mad_send_wc.vendor_err = wc->vendor_err;
+ if (atomic_read(&qp_info->snoop_count))
+ snoop_send(qp_info, &mad_send_wr->send_buf, &mad_send_wc,
+ IB_MAD_SNOOP_SEND_COMPLETIONS);
+ ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc);
+
+ if (queued_send_wr) {
+ ret = ib_post_send(qp_info->qp, &queued_send_wr->send_wr,
+ &bad_send_wr);
+ if (ret) {
+ dev_err(&port_priv->device->dev,
+ "ib_post_send failed: %d\n", ret);
+ mad_send_wr = queued_send_wr;
+ wc->status = IB_WC_LOC_QP_OP_ERR;
+ goto retry;
+ }
+ }
+}
+
+static void mark_sends_for_retry(struct ib_mad_qp_info *qp_info)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_mad_list_head *mad_list;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp_info->send_queue.lock, flags);
+ list_for_each_entry(mad_list, &qp_info->send_queue.list, list) {
+ mad_send_wr = container_of(mad_list,
+ struct ib_mad_send_wr_private,
+ mad_list);
+ mad_send_wr->retry = 1;
+ }
+ spin_unlock_irqrestore(&qp_info->send_queue.lock, flags);
+}
+
+static void mad_error_handler(struct ib_mad_port_private *port_priv,
+ struct ib_wc *wc)
+{
+ struct ib_mad_list_head *mad_list;
+ struct ib_mad_qp_info *qp_info;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ int ret;
+
+ /* Determine if failure was a send or receive */
+ mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id;
+ qp_info = mad_list->mad_queue->qp_info;
+ if (mad_list->mad_queue == &qp_info->recv_queue)
+ /*
+ * Receive errors indicate that the QP has entered the error
+ * state - error handling/shutdown code will clean up
+ */
+ return;
+
+ /*
+ * Send errors will transition the QP to SQE - move
+ * QP to RTS and repost flushed work requests
+ */
+ mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private,
+ mad_list);
+ if (wc->status == IB_WC_WR_FLUSH_ERR) {
+ if (mad_send_wr->retry) {
+ /* Repost send */
+ struct ib_send_wr *bad_send_wr;
+
+ mad_send_wr->retry = 0;
+ ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr,
+ &bad_send_wr);
+ if (ret)
+ ib_mad_send_done_handler(port_priv, wc);
+ } else
+ ib_mad_send_done_handler(port_priv, wc);
+ } else {
+ struct ib_qp_attr *attr;
+
+ /* Transition QP to RTS and fail offending send */
+ attr = kmalloc(sizeof *attr, GFP_KERNEL);
+ if (attr) {
+ attr->qp_state = IB_QPS_RTS;
+ attr->cur_qp_state = IB_QPS_SQE;
+ ret = ib_modify_qp(qp_info->qp, attr,
+ IB_QP_STATE | IB_QP_CUR_STATE);
+ kfree(attr);
+ if (ret)
+ dev_err(&port_priv->device->dev,
+ "mad_error_handler - ib_modify_qp to RTS : %d\n",
+ ret);
+ else
+ mark_sends_for_retry(qp_info);
+ }
+ ib_mad_send_done_handler(port_priv, wc);
+ }
+}
+
+/*
+ * IB MAD completion callback
+ */
+static void ib_mad_completion_handler(struct work_struct *work)
+{
+ struct ib_mad_port_private *port_priv;
+ struct ib_wc wc;
+
+ port_priv = container_of(work, struct ib_mad_port_private, work);
+ ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
+
+ while (ib_poll_cq(port_priv->cq, 1, &wc) == 1) {
+ if (wc.status == IB_WC_SUCCESS) {
+ switch (wc.opcode) {
+ case IB_WC_SEND:
+ ib_mad_send_done_handler(port_priv, &wc);
+ break;
+ case IB_WC_RECV:
+ ib_mad_recv_done_handler(port_priv, &wc);
+ break;
+ default:
+ BUG_ON(1);
+ break;
+ }
+ } else
+ mad_error_handler(port_priv, &wc);
+ }
+}
+
+static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv)
+{
+ unsigned long flags;
+ struct ib_mad_send_wr_private *mad_send_wr, *temp_mad_send_wr;
+ struct ib_mad_send_wc mad_send_wc;
+ struct list_head cancel_list;
+
+ INIT_LIST_HEAD(&cancel_list);
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
+ &mad_agent_priv->send_list, agent_list) {
+ if (mad_send_wr->status == IB_WC_SUCCESS) {
+ mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
+ mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
+ }
+ }
+
+ /* Empty wait list to prevent receives from finding a request */
+ list_splice_init(&mad_agent_priv->wait_list, &cancel_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ /* Report all cancelled requests */
+ mad_send_wc.status = IB_WC_WR_FLUSH_ERR;
+ mad_send_wc.vendor_err = 0;
+
+ list_for_each_entry_safe(mad_send_wr, temp_mad_send_wr,
+ &cancel_list, agent_list) {
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ list_del(&mad_send_wr->agent_list);
+ mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+ &mad_send_wc);
+ atomic_dec(&mad_agent_priv->refcount);
+ }
+}
+
+static struct ib_mad_send_wr_private*
+find_send_wr(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_send_buf *send_buf)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+
+ list_for_each_entry(mad_send_wr, &mad_agent_priv->wait_list,
+ agent_list) {
+ if (&mad_send_wr->send_buf == send_buf)
+ return mad_send_wr;
+ }
+
+ list_for_each_entry(mad_send_wr, &mad_agent_priv->send_list,
+ agent_list) {
+ if (is_data_mad(mad_agent_priv, mad_send_wr->send_buf.mad) &&
+ &mad_send_wr->send_buf == send_buf)
+ return mad_send_wr;
+ }
+ return NULL;
+}
+
+int ib_modify_mad(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_buf *send_buf, u32 timeout_ms)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ unsigned long flags;
+ int active;
+
+ mad_agent_priv = container_of(mad_agent, struct ib_mad_agent_private,
+ agent);
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ mad_send_wr = find_send_wr(mad_agent_priv, send_buf);
+ if (!mad_send_wr || mad_send_wr->status != IB_WC_SUCCESS) {
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ return -EINVAL;
+ }
+
+ active = (!mad_send_wr->timeout || mad_send_wr->refcount > 1);
+ if (!timeout_ms) {
+ mad_send_wr->status = IB_WC_WR_FLUSH_ERR;
+ mad_send_wr->refcount -= (mad_send_wr->timeout > 0);
+ }
+
+ mad_send_wr->send_buf.timeout_ms = timeout_ms;
+ if (active)
+ mad_send_wr->timeout = msecs_to_jiffies(timeout_ms);
+ else
+ ib_reset_mad_timeout(mad_send_wr, timeout_ms);
+
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ return 0;
+}
+EXPORT_SYMBOL(ib_modify_mad);
+
+void ib_cancel_mad(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_buf *send_buf)
+{
+ ib_modify_mad(mad_agent, send_buf, 0);
+}
+EXPORT_SYMBOL(ib_cancel_mad);
+
+static void local_completions(struct work_struct *work)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_local_private *local;
+ struct ib_mad_agent_private *recv_mad_agent;
+ unsigned long flags;
+ int free_mad;
+ struct ib_wc wc;
+ struct ib_mad_send_wc mad_send_wc;
+
+ mad_agent_priv =
+ container_of(work, struct ib_mad_agent_private, local_work);
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ while (!list_empty(&mad_agent_priv->local_list)) {
+ local = list_entry(mad_agent_priv->local_list.next,
+ struct ib_mad_local_private,
+ completion_list);
+ list_del(&local->completion_list);
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+ free_mad = 0;
+ if (local->mad_priv) {
+ recv_mad_agent = local->recv_mad_agent;
+ if (!recv_mad_agent) {
+ dev_err(&mad_agent_priv->agent.device->dev,
+ "No receive MAD agent for local completion\n");
+ free_mad = 1;
+ goto local_send_completion;
+ }
+
+ /*
+ * Defined behavior is to complete response
+ * before request
+ */
+ build_smp_wc(recv_mad_agent->agent.qp,
+ (unsigned long) local->mad_send_wr,
+ be16_to_cpu(IB_LID_PERMISSIVE),
+ 0, recv_mad_agent->agent.port_num, &wc);
+
+ local->mad_priv->header.recv_wc.wc = &wc;
+ local->mad_priv->header.recv_wc.mad_len =
+ sizeof(struct ib_mad);
+ INIT_LIST_HEAD(&local->mad_priv->header.recv_wc.rmpp_list);
+ list_add(&local->mad_priv->header.recv_wc.recv_buf.list,
+ &local->mad_priv->header.recv_wc.rmpp_list);
+ local->mad_priv->header.recv_wc.recv_buf.grh = NULL;
+ local->mad_priv->header.recv_wc.recv_buf.mad =
+ &local->mad_priv->mad.mad;
+ if (atomic_read(&recv_mad_agent->qp_info->snoop_count))
+ snoop_recv(recv_mad_agent->qp_info,
+ &local->mad_priv->header.recv_wc,
+ IB_MAD_SNOOP_RECVS);
+ recv_mad_agent->agent.recv_handler(
+ &recv_mad_agent->agent,
+ &local->mad_priv->header.recv_wc);
+ spin_lock_irqsave(&recv_mad_agent->lock, flags);
+ atomic_dec(&recv_mad_agent->refcount);
+ spin_unlock_irqrestore(&recv_mad_agent->lock, flags);
+ }
+
+local_send_completion:
+ /* Complete send */
+ mad_send_wc.status = IB_WC_SUCCESS;
+ mad_send_wc.vendor_err = 0;
+ mad_send_wc.send_buf = &local->mad_send_wr->send_buf;
+ if (atomic_read(&mad_agent_priv->qp_info->snoop_count))
+ snoop_send(mad_agent_priv->qp_info,
+ &local->mad_send_wr->send_buf,
+ &mad_send_wc, IB_MAD_SNOOP_SEND_COMPLETIONS);
+ mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+ &mad_send_wc);
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ atomic_dec(&mad_agent_priv->refcount);
+ if (free_mad)
+ kmem_cache_free(ib_mad_cache, local->mad_priv);
+ kfree(local);
+ }
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+static int retry_send(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ int ret;
+
+ if (!mad_send_wr->retries_left)
+ return -ETIMEDOUT;
+
+ mad_send_wr->retries_left--;
+ mad_send_wr->send_buf.retries++;
+
+ mad_send_wr->timeout = msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+
+ if (ib_mad_kernel_rmpp_agent(&mad_send_wr->mad_agent_priv->agent)) {
+ ret = ib_retry_rmpp(mad_send_wr);
+ switch (ret) {
+ case IB_RMPP_RESULT_UNHANDLED:
+ ret = ib_send_mad(mad_send_wr);
+ break;
+ case IB_RMPP_RESULT_CONSUMED:
+ ret = 0;
+ break;
+ default:
+ ret = -ECOMM;
+ break;
+ }
+ } else
+ ret = ib_send_mad(mad_send_wr);
+
+ if (!ret) {
+ mad_send_wr->refcount++;
+ list_add_tail(&mad_send_wr->agent_list,
+ &mad_send_wr->mad_agent_priv->send_list);
+ }
+ return ret;
+}
+
+static void timeout_sends(struct work_struct *work)
+{
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_mad_send_wc mad_send_wc;
+ unsigned long flags, delay;
+
+ mad_agent_priv = container_of(work, struct ib_mad_agent_private,
+ timed_work.work);
+ mad_send_wc.vendor_err = 0;
+
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ while (!list_empty(&mad_agent_priv->wait_list)) {
+ mad_send_wr = list_entry(mad_agent_priv->wait_list.next,
+ struct ib_mad_send_wr_private,
+ agent_list);
+
+ if (time_after(mad_send_wr->timeout, jiffies)) {
+ delay = mad_send_wr->timeout - jiffies;
+ if ((long)delay <= 0)
+ delay = 1;
+ queue_delayed_work(mad_agent_priv->qp_info->
+ port_priv->wq,
+ &mad_agent_priv->timed_work, delay);
+ break;
+ }
+
+ list_del(&mad_send_wr->agent_list);
+ if (mad_send_wr->status == IB_WC_SUCCESS &&
+ !retry_send(mad_send_wr))
+ continue;
+
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+
+ if (mad_send_wr->status == IB_WC_SUCCESS)
+ mad_send_wc.status = IB_WC_RESP_TIMEOUT_ERR;
+ else
+ mad_send_wc.status = mad_send_wr->status;
+ mad_send_wc.send_buf = &mad_send_wr->send_buf;
+ mad_agent_priv->agent.send_handler(&mad_agent_priv->agent,
+ &mad_send_wc);
+
+ atomic_dec(&mad_agent_priv->refcount);
+ spin_lock_irqsave(&mad_agent_priv->lock, flags);
+ }
+ spin_unlock_irqrestore(&mad_agent_priv->lock, flags);
+}
+
+static void ib_mad_thread_completion_handler(struct ib_cq *cq, void *arg)
+{
+ struct ib_mad_port_private *port_priv = cq->cq_context;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+ if (!list_empty(&port_priv->port_list))
+ queue_work(port_priv->wq, &port_priv->work);
+ spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+}
+
+/*
+ * Allocate receive MADs and post receive WRs for them
+ */
+static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_private *mad)
+{
+ unsigned long flags;
+ int post, ret;
+ struct ib_mad_private *mad_priv;
+ struct ib_sge sg_list;
+ struct ib_recv_wr recv_wr, *bad_recv_wr;
+ struct ib_mad_queue *recv_queue = &qp_info->recv_queue;
+
+ /* Initialize common scatter list fields */
+ sg_list.length = sizeof *mad_priv - sizeof mad_priv->header;
+ sg_list.lkey = (*qp_info->port_priv->mr).lkey;
+
+ /* Initialize common receive WR fields */
+ recv_wr.next = NULL;
+ recv_wr.sg_list = &sg_list;
+ recv_wr.num_sge = 1;
+
+ do {
+ /* Allocate and map receive buffer */
+ if (mad) {
+ mad_priv = mad;
+ mad = NULL;
+ } else {
+ mad_priv = kmem_cache_alloc(ib_mad_cache, GFP_KERNEL);
+ if (!mad_priv) {
+ dev_err(&qp_info->port_priv->device->dev,
+ "No memory for receive buffer\n");
+ ret = -ENOMEM;
+ break;
+ }
+ }
+ sg_list.addr = ib_dma_map_single(qp_info->port_priv->device,
+ &mad_priv->grh,
+ sizeof *mad_priv -
+ sizeof mad_priv->header,
+ DMA_FROM_DEVICE);
+ if (unlikely(ib_dma_mapping_error(qp_info->port_priv->device,
+ sg_list.addr))) {
+ ret = -ENOMEM;
+ break;
+ }
+ mad_priv->header.mapping = sg_list.addr;
+ recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list;
+ mad_priv->header.mad_list.mad_queue = recv_queue;
+
+ /* Post receive WR */
+ spin_lock_irqsave(&recv_queue->lock, flags);
+ post = (++recv_queue->count < recv_queue->max_active);
+ list_add_tail(&mad_priv->header.mad_list.list, &recv_queue->list);
+ spin_unlock_irqrestore(&recv_queue->lock, flags);
+ ret = ib_post_recv(qp_info->qp, &recv_wr, &bad_recv_wr);
+ if (ret) {
+ spin_lock_irqsave(&recv_queue->lock, flags);
+ list_del(&mad_priv->header.mad_list.list);
+ recv_queue->count--;
+ spin_unlock_irqrestore(&recv_queue->lock, flags);
+ ib_dma_unmap_single(qp_info->port_priv->device,
+ mad_priv->header.mapping,
+ sizeof *mad_priv -
+ sizeof mad_priv->header,
+ DMA_FROM_DEVICE);
+ kmem_cache_free(ib_mad_cache, mad_priv);
+ dev_err(&qp_info->port_priv->device->dev,
+ "ib_post_recv failed: %d\n", ret);
+ break;
+ }
+ } while (post);
+
+ return ret;
+}
+
+/*
+ * Return all the posted receive MADs
+ */
+static void cleanup_recv_queue(struct ib_mad_qp_info *qp_info)
+{
+ struct ib_mad_private_header *mad_priv_hdr;
+ struct ib_mad_private *recv;
+ struct ib_mad_list_head *mad_list;
+
+ if (!qp_info->qp)
+ return;
+
+ while (!list_empty(&qp_info->recv_queue.list)) {
+
+ mad_list = list_entry(qp_info->recv_queue.list.next,
+ struct ib_mad_list_head, list);
+ mad_priv_hdr = container_of(mad_list,
+ struct ib_mad_private_header,
+ mad_list);
+ recv = container_of(mad_priv_hdr, struct ib_mad_private,
+ header);
+
+ /* Remove from posted receive MAD list */
+ list_del(&mad_list->list);
+
+ ib_dma_unmap_single(qp_info->port_priv->device,
+ recv->header.mapping,
+ sizeof(struct ib_mad_private) -
+ sizeof(struct ib_mad_private_header),
+ DMA_FROM_DEVICE);
+ kmem_cache_free(ib_mad_cache, recv);
+ }
+
+ qp_info->recv_queue.count = 0;
+}
+
+/*
+ * Start the port
+ */
+static int ib_mad_port_start(struct ib_mad_port_private *port_priv)
+{
+ int ret, i;
+ struct ib_qp_attr *attr;
+ struct ib_qp *qp;
+ u16 pkey_index;
+
+ attr = kmalloc(sizeof *attr, GFP_KERNEL);
+ if (!attr) {
+ dev_err(&port_priv->device->dev,
+ "Couldn't kmalloc ib_qp_attr\n");
+ return -ENOMEM;
+ }
+
+ ret = ib_find_pkey(port_priv->device, port_priv->port_num,
+ IB_DEFAULT_PKEY_FULL, &pkey_index);
+ if (ret)
+ pkey_index = 0;
+
+ for (i = 0; i < IB_MAD_QPS_CORE; i++) {
+ qp = port_priv->qp_info[i].qp;
+ if (!qp)
+ continue;
+
+ /*
+ * PKey index for QP1 is irrelevant but
+ * one is needed for the Reset to Init transition
+ */
+ attr->qp_state = IB_QPS_INIT;
+ attr->pkey_index = pkey_index;
+ attr->qkey = (qp->qp_num == 0) ? 0 : IB_QP1_QKEY;
+ ret = ib_modify_qp(qp, attr, IB_QP_STATE |
+ IB_QP_PKEY_INDEX | IB_QP_QKEY);
+ if (ret) {
+ dev_err(&port_priv->device->dev,
+ "Couldn't change QP%d state to INIT: %d\n",
+ i, ret);
+ goto out;
+ }
+
+ attr->qp_state = IB_QPS_RTR;
+ ret = ib_modify_qp(qp, attr, IB_QP_STATE);
+ if (ret) {
+ dev_err(&port_priv->device->dev,
+ "Couldn't change QP%d state to RTR: %d\n",
+ i, ret);
+ goto out;
+ }
+
+ attr->qp_state = IB_QPS_RTS;
+ attr->sq_psn = IB_MAD_SEND_Q_PSN;
+ ret = ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_SQ_PSN);
+ if (ret) {
+ dev_err(&port_priv->device->dev,
+ "Couldn't change QP%d state to RTS: %d\n",
+ i, ret);
+ goto out;
+ }
+ }
+
+ ret = ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP);
+ if (ret) {
+ dev_err(&port_priv->device->dev,
+ "Failed to request completion notification: %d\n",
+ ret);
+ goto out;
+ }
+
+ for (i = 0; i < IB_MAD_QPS_CORE; i++) {
+ if (!port_priv->qp_info[i].qp)
+ continue;
+
+ ret = ib_mad_post_receive_mads(&port_priv->qp_info[i], NULL);
+ if (ret) {
+ dev_err(&port_priv->device->dev,
+ "Couldn't post receive WRs\n");
+ goto out;
+ }
+ }
+out:
+ kfree(attr);
+ return ret;
+}
+
+static void qp_event_handler(struct ib_event *event, void *qp_context)
+{
+ struct ib_mad_qp_info *qp_info = qp_context;
+
+ /* It's worse than that! He's dead, Jim! */
+ dev_err(&qp_info->port_priv->device->dev,
+ "Fatal error (%d) on MAD QP (%d)\n",
+ event->event, qp_info->qp->qp_num);
+}
+
+static void init_mad_queue(struct ib_mad_qp_info *qp_info,
+ struct ib_mad_queue *mad_queue)
+{
+ mad_queue->qp_info = qp_info;
+ mad_queue->count = 0;
+ spin_lock_init(&mad_queue->lock);
+ INIT_LIST_HEAD(&mad_queue->list);
+}
+
+static void init_mad_qp(struct ib_mad_port_private *port_priv,
+ struct ib_mad_qp_info *qp_info)
+{
+ qp_info->port_priv = port_priv;
+ init_mad_queue(qp_info, &qp_info->send_queue);
+ init_mad_queue(qp_info, &qp_info->recv_queue);
+ INIT_LIST_HEAD(&qp_info->overflow_list);
+ spin_lock_init(&qp_info->snoop_lock);
+ qp_info->snoop_table = NULL;
+ qp_info->snoop_table_size = 0;
+ atomic_set(&qp_info->snoop_count, 0);
+}
+
+static int create_mad_qp(struct ib_mad_qp_info *qp_info,
+ enum ib_qp_type qp_type)
+{
+ struct ib_qp_init_attr qp_init_attr;
+ int ret;
+
+ memset(&qp_init_attr, 0, sizeof qp_init_attr);
+ qp_init_attr.send_cq = qp_info->port_priv->cq;
+ qp_init_attr.recv_cq = qp_info->port_priv->cq;
+ qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+ qp_init_attr.cap.max_send_wr = mad_sendq_size;
+ qp_init_attr.cap.max_recv_wr = mad_recvq_size;
+ qp_init_attr.cap.max_send_sge = IB_MAD_SEND_REQ_MAX_SG;
+ qp_init_attr.cap.max_recv_sge = IB_MAD_RECV_REQ_MAX_SG;
+ qp_init_attr.qp_type = qp_type;
+ qp_init_attr.port_num = qp_info->port_priv->port_num;
+ qp_init_attr.qp_context = qp_info;
+ qp_init_attr.event_handler = qp_event_handler;
+ qp_info->qp = ib_create_qp(qp_info->port_priv->pd, &qp_init_attr);
+ if (IS_ERR(qp_info->qp)) {
+ dev_err(&qp_info->port_priv->device->dev,
+ "Couldn't create ib_mad QP%d\n",
+ get_spl_qp_index(qp_type));
+ ret = PTR_ERR(qp_info->qp);
+ goto error;
+ }
+ /* Use minimum queue sizes unless the CQ is resized */
+ qp_info->send_queue.max_active = mad_sendq_size;
+ qp_info->recv_queue.max_active = mad_recvq_size;
+ return 0;
+
+error:
+ return ret;
+}
+
+static void destroy_mad_qp(struct ib_mad_qp_info *qp_info)
+{
+ if (!qp_info->qp)
+ return;
+
+ ib_destroy_qp(qp_info->qp);
+ kfree(qp_info->snoop_table);
+}
+
+/*
+ * Open the port
+ * Create the QP, PD, MR, and CQ if needed
+ */
+static int ib_mad_port_open(struct ib_device *device,
+ int port_num)
+{
+ int ret, cq_size;
+ struct ib_mad_port_private *port_priv;
+ unsigned long flags;
+ char name[sizeof "ib_mad123"];
+ int has_smi;
+
+ /* Create new device info */
+ port_priv = kzalloc(sizeof *port_priv, GFP_KERNEL);
+ if (!port_priv) {
+ dev_err(&device->dev, "No memory for ib_mad_port_private\n");
+ return -ENOMEM;
+ }
+
+ port_priv->device = device;
+ port_priv->port_num = port_num;
+ spin_lock_init(&port_priv->reg_lock);
+ INIT_LIST_HEAD(&port_priv->agent_list);
+ init_mad_qp(port_priv, &port_priv->qp_info[0]);
+ init_mad_qp(port_priv, &port_priv->qp_info[1]);
+
+ cq_size = mad_sendq_size + mad_recvq_size;
+ has_smi = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_INFINIBAND;
+ if (has_smi)
+ cq_size *= 2;
+
+ port_priv->cq = ib_create_cq(port_priv->device,
+ ib_mad_thread_completion_handler,
+ NULL, port_priv, cq_size, 0);
+ if (IS_ERR(port_priv->cq)) {
+ dev_err(&device->dev, "Couldn't create ib_mad CQ\n");
+ ret = PTR_ERR(port_priv->cq);
+ goto error3;
+ }
+
+ port_priv->pd = ib_alloc_pd(device);
+ if (IS_ERR(port_priv->pd)) {
+ dev_err(&device->dev, "Couldn't create ib_mad PD\n");
+ ret = PTR_ERR(port_priv->pd);
+ goto error4;
+ }
+
+ port_priv->mr = ib_get_dma_mr(port_priv->pd, IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(port_priv->mr)) {
+ dev_err(&device->dev, "Couldn't get ib_mad DMA MR\n");
+ ret = PTR_ERR(port_priv->mr);
+ goto error5;
+ }
+
+ if (has_smi) {
+ ret = create_mad_qp(&port_priv->qp_info[0], IB_QPT_SMI);
+ if (ret)
+ goto error6;
+ }
+ ret = create_mad_qp(&port_priv->qp_info[1], IB_QPT_GSI);
+ if (ret)
+ goto error7;
+
+ snprintf(name, sizeof name, "ib_mad%d", port_num);
+ port_priv->wq = create_singlethread_workqueue(name);
+ if (!port_priv->wq) {
+ ret = -ENOMEM;
+ goto error8;
+ }
+ INIT_WORK(&port_priv->work, ib_mad_completion_handler);
+
+ spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+ list_add_tail(&port_priv->port_list, &ib_mad_port_list);
+ spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+ ret = ib_mad_port_start(port_priv);
+ if (ret) {
+ dev_err(&device->dev, "Couldn't start port\n");
+ goto error9;
+ }
+
+ return 0;
+
+error9:
+ spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+ list_del_init(&port_priv->port_list);
+ spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+ destroy_workqueue(port_priv->wq);
+error8:
+ destroy_mad_qp(&port_priv->qp_info[1]);
+error7:
+ destroy_mad_qp(&port_priv->qp_info[0]);
+error6:
+ ib_dereg_mr(port_priv->mr);
+error5:
+ ib_dealloc_pd(port_priv->pd);
+error4:
+ ib_destroy_cq(port_priv->cq);
+ cleanup_recv_queue(&port_priv->qp_info[1]);
+ cleanup_recv_queue(&port_priv->qp_info[0]);
+error3:
+ kfree(port_priv);
+
+ return ret;
+}
+
+/*
+ * Close the port
+ * If there are no classes using the port, free the port
+ * resources (CQ, MR, PD, QP) and remove the port's info structure
+ */
+static int ib_mad_port_close(struct ib_device *device, int port_num)
+{
+ struct ib_mad_port_private *port_priv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ib_mad_port_list_lock, flags);
+ port_priv = __ib_get_mad_port(device, port_num);
+ if (port_priv == NULL) {
+ spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+ dev_err(&device->dev, "Port %d not found\n", port_num);
+ return -ENODEV;
+ }
+ list_del_init(&port_priv->port_list);
+ spin_unlock_irqrestore(&ib_mad_port_list_lock, flags);
+
+ destroy_workqueue(port_priv->wq);
+ destroy_mad_qp(&port_priv->qp_info[1]);
+ destroy_mad_qp(&port_priv->qp_info[0]);
+ ib_dereg_mr(port_priv->mr);
+ ib_dealloc_pd(port_priv->pd);
+ ib_destroy_cq(port_priv->cq);
+ cleanup_recv_queue(&port_priv->qp_info[1]);
+ cleanup_recv_queue(&port_priv->qp_info[0]);
+ /* XXX: Handle deallocation of MAD registration tables */
+
+ kfree(port_priv);
+
+ return 0;
+}
+
+static void ib_mad_init_device(struct ib_device *device)
+{
+ int start, end, i;
+
+ if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+ return;
+
+ if (device->node_type == RDMA_NODE_IB_SWITCH) {
+ start = 0;
+ end = 0;
+ } else {
+ start = 1;
+ end = device->phys_port_cnt;
+ }
+
+ for (i = start; i <= end; i++) {
+ if (ib_mad_port_open(device, i)) {
+ dev_err(&device->dev, "Couldn't open port %d\n", i);
+ goto error;
+ }
+ if (ib_agent_port_open(device, i)) {
+ dev_err(&device->dev,
+ "Couldn't open port %d for agents\n", i);
+ goto error_agent;
+ }
+ }
+ return;
+
+error_agent:
+ if (ib_mad_port_close(device, i))
+ dev_err(&device->dev, "Couldn't close port %d\n", i);
+
+error:
+ i--;
+
+ while (i >= start) {
+ if (ib_agent_port_close(device, i))
+ dev_err(&device->dev,
+ "Couldn't close port %d for agents\n", i);
+ if (ib_mad_port_close(device, i))
+ dev_err(&device->dev, "Couldn't close port %d\n", i);
+ i--;
+ }
+}
+
+static void ib_mad_remove_device(struct ib_device *device)
+{
+ int i, num_ports, cur_port;
+
+ if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+ return;
+
+ if (device->node_type == RDMA_NODE_IB_SWITCH) {
+ num_ports = 1;
+ cur_port = 0;
+ } else {
+ num_ports = device->phys_port_cnt;
+ cur_port = 1;
+ }
+ for (i = 0; i < num_ports; i++, cur_port++) {
+ if (ib_agent_port_close(device, cur_port))
+ dev_err(&device->dev,
+ "Couldn't close port %d for agents\n",
+ cur_port);
+ if (ib_mad_port_close(device, cur_port))
+ dev_err(&device->dev, "Couldn't close port %d\n",
+ cur_port);
+ }
+}
+
+static struct ib_client mad_client = {
+ .name = "mad",
+ .add = ib_mad_init_device,
+ .remove = ib_mad_remove_device
+};
+
+static int __init ib_mad_init_module(void)
+{
+ int ret;
+
+ mad_recvq_size = min(mad_recvq_size, IB_MAD_QP_MAX_SIZE);
+ mad_recvq_size = max(mad_recvq_size, IB_MAD_QP_MIN_SIZE);
+
+ mad_sendq_size = min(mad_sendq_size, IB_MAD_QP_MAX_SIZE);
+ mad_sendq_size = max(mad_sendq_size, IB_MAD_QP_MIN_SIZE);
+
+ ib_mad_cache = kmem_cache_create("ib_mad",
+ sizeof(struct ib_mad_private),
+ 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!ib_mad_cache) {
+ pr_err("Couldn't create ib_mad cache\n");
+ ret = -ENOMEM;
+ goto error1;
+ }
+
+ INIT_LIST_HEAD(&ib_mad_port_list);
+
+ if (ib_register_client(&mad_client)) {
+ pr_err("Couldn't register ib_mad client\n");
+ ret = -EINVAL;
+ goto error2;
+ }
+
+ return 0;
+
+error2:
+ kmem_cache_destroy(ib_mad_cache);
+error1:
+ return ret;
+}
+
+static void __exit ib_mad_cleanup_module(void)
+{
+ ib_unregister_client(&mad_client);
+ kmem_cache_destroy(ib_mad_cache);
+}
+
+module_init(ib_mad_init_module);
+module_exit(ib_mad_cleanup_module);
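A minimal sketch (not part of the imported file) of how an in-kernel MAD consumer might drive the ib_modify_mad() and ib_cancel_mad() entry points exported above. The agent, send buffer, and helper name are assumptions; both are taken to have come from the usual registration and ib_post_send_mad() paths:

    #include <linux/kernel.h>
    #include <rdma/ib_mad.h>

    /* Hypothetical helper: shorten or abort an outstanding MAD request. */
    static void retune_pending_send(struct ib_mad_agent *agent,
                                    struct ib_mad_send_buf *send_buf,
                                    bool shutting_down)
    {
            if (shutting_down) {
                    /* The send is completed with IB_WC_WR_FLUSH_ERR status */
                    ib_cancel_mad(agent, send_buf);
                    return;
            }

            /* Shorten the response timeout to 500 ms; ib_modify_mad()
             * returns -EINVAL if the send has already completed.
             */
            if (ib_modify_mad(agent, send_buf, 500))
                    pr_debug("send already completed, nothing to modify\n");
    }

As the code above shows, ib_cancel_mad() is simply ib_modify_mad() with a zero timeout, so a cancelled send is still reported to the agent's send_handler, with IB_WC_WR_FLUSH_ERR status, rather than being dropped silently.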
diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h
new file mode 100644
index 000000000..d1a0b0ee9
--- /dev/null
+++ b/drivers/infiniband/core/mad_priv.h
@@ -0,0 +1,227 @@
+/*
+ * Copyright (c) 2004, 2005, Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2009 HNR Consulting. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __IB_MAD_PRIV_H__
+#define __IB_MAD_PRIV_H__
+
+#include <linux/completion.h>
+#include <linux/err.h>
+#include <linux/workqueue.h>
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+
+#define IB_MAD_QPS_CORE 2 /* Always QP0 and QP1 as a minimum */
+
+/* QP and CQ parameters */
+#define IB_MAD_QP_SEND_SIZE 128
+#define IB_MAD_QP_RECV_SIZE 512
+#define IB_MAD_QP_MIN_SIZE 64
+#define IB_MAD_QP_MAX_SIZE 8192
+#define IB_MAD_SEND_REQ_MAX_SG 2
+#define IB_MAD_RECV_REQ_MAX_SG 1
+
+#define IB_MAD_SEND_Q_PSN 0
+
+/* Registration table sizes */
+#define MAX_MGMT_CLASS 80
+#define MAX_MGMT_VERSION 8
+#define MAX_MGMT_OUI 8
+#define MAX_MGMT_VENDOR_RANGE2 (IB_MGMT_CLASS_VENDOR_RANGE2_END - \
+ IB_MGMT_CLASS_VENDOR_RANGE2_START + 1)
+
+struct ib_mad_list_head {
+ struct list_head list;
+ struct ib_mad_queue *mad_queue;
+};
+
+struct ib_mad_private_header {
+ struct ib_mad_list_head mad_list;
+ struct ib_mad_recv_wc recv_wc;
+ struct ib_wc wc;
+ u64 mapping;
+} __attribute__ ((packed));
+
+struct ib_mad_private {
+ struct ib_mad_private_header header;
+ struct ib_grh grh;
+ union {
+ struct ib_mad mad;
+ struct ib_rmpp_mad rmpp_mad;
+ struct ib_smp smp;
+ } mad;
+} __attribute__ ((packed));
+
+struct ib_rmpp_segment {
+ struct list_head list;
+ u32 num;
+ u8 data[0];
+};
+
+struct ib_mad_agent_private {
+ struct list_head agent_list;
+ struct ib_mad_agent agent;
+ struct ib_mad_reg_req *reg_req;
+ struct ib_mad_qp_info *qp_info;
+
+ spinlock_t lock;
+ struct list_head send_list;
+ struct list_head wait_list;
+ struct list_head done_list;
+ struct delayed_work timed_work;
+ unsigned long timeout;
+ struct list_head local_list;
+ struct work_struct local_work;
+ struct list_head rmpp_list;
+
+ atomic_t refcount;
+ struct completion comp;
+};
+
+struct ib_mad_snoop_private {
+ struct ib_mad_agent agent;
+ struct ib_mad_qp_info *qp_info;
+ int snoop_index;
+ int mad_snoop_flags;
+ atomic_t refcount;
+ struct completion comp;
+};
+
+struct ib_mad_send_wr_private {
+ struct ib_mad_list_head mad_list;
+ struct list_head agent_list;
+ struct ib_mad_agent_private *mad_agent_priv;
+ struct ib_mad_send_buf send_buf;
+ u64 header_mapping;
+ u64 payload_mapping;
+ struct ib_send_wr send_wr;
+ struct ib_sge sg_list[IB_MAD_SEND_REQ_MAX_SG];
+ __be64 tid;
+ unsigned long timeout;
+ int max_retries;
+ int retries_left;
+ int retry;
+ int refcount;
+ enum ib_wc_status status;
+
+ /* RMPP control */
+ struct list_head rmpp_list;
+ struct ib_rmpp_segment *last_ack_seg;
+ struct ib_rmpp_segment *cur_seg;
+ int last_ack;
+ int seg_num;
+ int newwin;
+ int pad;
+};
+
+struct ib_mad_local_private {
+ struct list_head completion_list;
+ struct ib_mad_private *mad_priv;
+ struct ib_mad_agent_private *recv_mad_agent;
+ struct ib_mad_send_wr_private *mad_send_wr;
+};
+
+struct ib_mad_mgmt_method_table {
+ struct ib_mad_agent_private *agent[IB_MGMT_MAX_METHODS];
+};
+
+struct ib_mad_mgmt_class_table {
+ struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_CLASS];
+};
+
+struct ib_mad_mgmt_vendor_class {
+ u8 oui[MAX_MGMT_OUI][3];
+ struct ib_mad_mgmt_method_table *method_table[MAX_MGMT_OUI];
+};
+
+struct ib_mad_mgmt_vendor_class_table {
+ struct ib_mad_mgmt_vendor_class *vendor_class[MAX_MGMT_VENDOR_RANGE2];
+};
+
+struct ib_mad_mgmt_version_table {
+ struct ib_mad_mgmt_class_table *class;
+ struct ib_mad_mgmt_vendor_class_table *vendor;
+};
+
+struct ib_mad_queue {
+ spinlock_t lock;
+ struct list_head list;
+ int count;
+ int max_active;
+ struct ib_mad_qp_info *qp_info;
+};
+
+struct ib_mad_qp_info {
+ struct ib_mad_port_private *port_priv;
+ struct ib_qp *qp;
+ struct ib_mad_queue send_queue;
+ struct ib_mad_queue recv_queue;
+ struct list_head overflow_list;
+ spinlock_t snoop_lock;
+ struct ib_mad_snoop_private **snoop_table;
+ int snoop_table_size;
+ atomic_t snoop_count;
+};
+
+struct ib_mad_port_private {
+ struct list_head port_list;
+ struct ib_device *device;
+ int port_num;
+ struct ib_cq *cq;
+ struct ib_pd *pd;
+ struct ib_mr *mr;
+
+ spinlock_t reg_lock;
+ struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION];
+ struct list_head agent_list;
+ struct workqueue_struct *wq;
+ struct work_struct work;
+ struct ib_mad_qp_info qp_info[IB_MAD_QPS_CORE];
+};
+
+int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr);
+
+struct ib_mad_send_wr_private *
+ib_find_send_mad(struct ib_mad_agent_private *mad_agent_priv,
+ struct ib_mad_recv_wc *mad_recv_wc);
+
+void ib_mad_complete_send_wr(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_send_wc *mad_send_wc);
+
+void ib_mark_mad_done(struct ib_mad_send_wr_private *mad_send_wr);
+
+void ib_reset_mad_timeout(struct ib_mad_send_wr_private *mad_send_wr,
+ int timeout_ms);
+
+#endif /* __IB_MAD_PRIV_H__ */
diff --git a/drivers/infiniband/core/mad_rmpp.c b/drivers/infiniband/core/mad_rmpp.c
new file mode 100644
index 000000000..f37878c9c
--- /dev/null
+++ b/drivers/infiniband/core/mad_rmpp.c
@@ -0,0 +1,953 @@
+/*
+ * Copyright (c) 2005 Intel Inc. All rights reserved.
+ * Copyright (c) 2005-2006 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+
+#include "mad_priv.h"
+#include "mad_rmpp.h"
+
+enum rmpp_state {
+ RMPP_STATE_ACTIVE,
+ RMPP_STATE_TIMEOUT,
+ RMPP_STATE_COMPLETE,
+ RMPP_STATE_CANCELING
+};
+
+struct mad_rmpp_recv {
+ struct ib_mad_agent_private *agent;
+ struct list_head list;
+ struct delayed_work timeout_work;
+ struct delayed_work cleanup_work;
+ struct completion comp;
+ enum rmpp_state state;
+ spinlock_t lock;
+ atomic_t refcount;
+
+ struct ib_ah *ah;
+ struct ib_mad_recv_wc *rmpp_wc;
+ struct ib_mad_recv_buf *cur_seg_buf;
+ int last_ack;
+ int seg_num;
+ int newwin;
+ int repwin;
+
+ __be64 tid;
+ u32 src_qp;
+ u16 slid;
+ u8 mgmt_class;
+ u8 class_version;
+ u8 method;
+};
+
+static inline void deref_rmpp_recv(struct mad_rmpp_recv *rmpp_recv)
+{
+ if (atomic_dec_and_test(&rmpp_recv->refcount))
+ complete(&rmpp_recv->comp);
+}
+
+static void destroy_rmpp_recv(struct mad_rmpp_recv *rmpp_recv)
+{
+ deref_rmpp_recv(rmpp_recv);
+ wait_for_completion(&rmpp_recv->comp);
+ ib_destroy_ah(rmpp_recv->ah);
+ kfree(rmpp_recv);
+}
+
+void ib_cancel_rmpp_recvs(struct ib_mad_agent_private *agent)
+{
+ struct mad_rmpp_recv *rmpp_recv, *temp_rmpp_recv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&agent->lock, flags);
+ list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
+ if (rmpp_recv->state != RMPP_STATE_COMPLETE)
+ ib_free_recv_mad(rmpp_recv->rmpp_wc);
+ rmpp_recv->state = RMPP_STATE_CANCELING;
+ }
+ spin_unlock_irqrestore(&agent->lock, flags);
+
+ list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
+ cancel_delayed_work(&rmpp_recv->timeout_work);
+ cancel_delayed_work(&rmpp_recv->cleanup_work);
+ }
+
+ flush_workqueue(agent->qp_info->port_priv->wq);
+
+ list_for_each_entry_safe(rmpp_recv, temp_rmpp_recv,
+ &agent->rmpp_list, list) {
+ list_del(&rmpp_recv->list);
+ destroy_rmpp_recv(rmpp_recv);
+ }
+}
+
+static void format_ack(struct ib_mad_send_buf *msg,
+ struct ib_rmpp_mad *data,
+ struct mad_rmpp_recv *rmpp_recv)
+{
+ struct ib_rmpp_mad *ack = msg->mad;
+ unsigned long flags;
+
+ memcpy(ack, &data->mad_hdr, msg->hdr_len);
+
+ ack->mad_hdr.method ^= IB_MGMT_METHOD_RESP;
+ ack->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_ACK;
+ ib_set_rmpp_flags(&ack->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+
+ spin_lock_irqsave(&rmpp_recv->lock, flags);
+ rmpp_recv->last_ack = rmpp_recv->seg_num;
+ ack->rmpp_hdr.seg_num = cpu_to_be32(rmpp_recv->seg_num);
+ ack->rmpp_hdr.paylen_newwin = cpu_to_be32(rmpp_recv->newwin);
+ spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+}
+
+static void ack_recv(struct mad_rmpp_recv *rmpp_recv,
+ struct ib_mad_recv_wc *recv_wc)
+{
+ struct ib_mad_send_buf *msg;
+ int ret, hdr_len;
+
+ hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
+ msg = ib_create_send_mad(&rmpp_recv->agent->agent, recv_wc->wc->src_qp,
+ recv_wc->wc->pkey_index, 1, hdr_len,
+ 0, GFP_KERNEL);
+ if (IS_ERR(msg))
+ return;
+
+ format_ack(msg, (struct ib_rmpp_mad *) recv_wc->recv_buf.mad, rmpp_recv);
+ msg->ah = rmpp_recv->ah;
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret)
+ ib_free_send_mad(msg);
+}
+
+static struct ib_mad_send_buf *alloc_response_msg(struct ib_mad_agent *agent,
+ struct ib_mad_recv_wc *recv_wc)
+{
+ struct ib_mad_send_buf *msg;
+ struct ib_ah *ah;
+ int hdr_len;
+
+ ah = ib_create_ah_from_wc(agent->qp->pd, recv_wc->wc,
+ recv_wc->recv_buf.grh, agent->port_num);
+ if (IS_ERR(ah))
+ return (void *) ah;
+
+ hdr_len = ib_get_mad_data_offset(recv_wc->recv_buf.mad->mad_hdr.mgmt_class);
+ msg = ib_create_send_mad(agent, recv_wc->wc->src_qp,
+ recv_wc->wc->pkey_index, 1,
+ hdr_len, 0, GFP_KERNEL);
+ if (IS_ERR(msg))
+ ib_destroy_ah(ah);
+ else {
+ msg->ah = ah;
+ msg->context[0] = ah;
+ }
+
+ return msg;
+}
+
+static void ack_ds_ack(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *recv_wc)
+{
+ struct ib_mad_send_buf *msg;
+ struct ib_rmpp_mad *rmpp_mad;
+ int ret;
+
+ msg = alloc_response_msg(&agent->agent, recv_wc);
+ if (IS_ERR(msg))
+ return;
+
+ rmpp_mad = msg->mad;
+ memcpy(rmpp_mad, recv_wc->recv_buf.mad, msg->hdr_len);
+
+ rmpp_mad->mad_hdr.method ^= IB_MGMT_METHOD_RESP;
+ ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+ rmpp_mad->rmpp_hdr.seg_num = 0;
+ rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(1);
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ ib_destroy_ah(msg->ah);
+ ib_free_send_mad(msg);
+ }
+}
+
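+/*
+ * Only messages built by alloc_response_msg() own their address handle;
+ * they mark this by storing the AH in context[0], which tells the send
+ * handler below which AHs it must destroy.
+ */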
+void ib_rmpp_send_handler(struct ib_mad_send_wc *mad_send_wc)
+{
+ if (mad_send_wc->send_buf->context[0] == mad_send_wc->send_buf->ah)
+ ib_destroy_ah(mad_send_wc->send_buf->ah);
+ ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+static void nack_recv(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *recv_wc, u8 rmpp_status)
+{
+ struct ib_mad_send_buf *msg;
+ struct ib_rmpp_mad *rmpp_mad;
+ int ret;
+
+ msg = alloc_response_msg(&agent->agent, recv_wc);
+ if (IS_ERR(msg))
+ return;
+
+ rmpp_mad = msg->mad;
+ memcpy(rmpp_mad, recv_wc->recv_buf.mad, msg->hdr_len);
+
+ rmpp_mad->mad_hdr.method ^= IB_MGMT_METHOD_RESP;
+ rmpp_mad->rmpp_hdr.rmpp_version = IB_MGMT_RMPP_VERSION;
+ rmpp_mad->rmpp_hdr.rmpp_type = IB_MGMT_RMPP_TYPE_ABORT;
+ ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+ rmpp_mad->rmpp_hdr.rmpp_status = rmpp_status;
+ rmpp_mad->rmpp_hdr.seg_num = 0;
+ rmpp_mad->rmpp_hdr.paylen_newwin = 0;
+
+ ret = ib_post_send_mad(msg, NULL);
+ if (ret) {
+ ib_destroy_ah(msg->ah);
+ ib_free_send_mad(msg);
+ }
+}
+
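+/*
+ * An active reassembly that stops receiving segments is aborted by
+ * timeout_work with a T2L NACK; a completed reassembly is kept around
+ * briefly by cleanup_work so that late duplicate segments can still be
+ * ACKed before the state is freed.
+ */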
+static void recv_timeout_handler(struct work_struct *work)
+{
+ struct mad_rmpp_recv *rmpp_recv =
+ container_of(work, struct mad_rmpp_recv, timeout_work.work);
+ struct ib_mad_recv_wc *rmpp_wc;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rmpp_recv->agent->lock, flags);
+ if (rmpp_recv->state != RMPP_STATE_ACTIVE) {
+ spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+ return;
+ }
+ rmpp_recv->state = RMPP_STATE_TIMEOUT;
+ list_del(&rmpp_recv->list);
+ spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+
+ rmpp_wc = rmpp_recv->rmpp_wc;
+ nack_recv(rmpp_recv->agent, rmpp_wc, IB_MGMT_RMPP_STATUS_T2L);
+ destroy_rmpp_recv(rmpp_recv);
+ ib_free_recv_mad(rmpp_wc);
+}
+
+static void recv_cleanup_handler(struct work_struct *work)
+{
+ struct mad_rmpp_recv *rmpp_recv =
+ container_of(work, struct mad_rmpp_recv, cleanup_work.work);
+ unsigned long flags;
+
+ spin_lock_irqsave(&rmpp_recv->agent->lock, flags);
+ if (rmpp_recv->state == RMPP_STATE_CANCELING) {
+ spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+ return;
+ }
+ list_del(&rmpp_recv->list);
+ spin_unlock_irqrestore(&rmpp_recv->agent->lock, flags);
+ destroy_rmpp_recv(rmpp_recv);
+}
+
+static struct mad_rmpp_recv *
+create_rmpp_recv(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct mad_rmpp_recv *rmpp_recv;
+ struct ib_mad_hdr *mad_hdr;
+
+ rmpp_recv = kmalloc(sizeof *rmpp_recv, GFP_KERNEL);
+ if (!rmpp_recv)
+ return NULL;
+
+ rmpp_recv->ah = ib_create_ah_from_wc(agent->agent.qp->pd,
+ mad_recv_wc->wc,
+ mad_recv_wc->recv_buf.grh,
+ agent->agent.port_num);
+ if (IS_ERR(rmpp_recv->ah))
+ goto error;
+
+ rmpp_recv->agent = agent;
+ init_completion(&rmpp_recv->comp);
+ INIT_DELAYED_WORK(&rmpp_recv->timeout_work, recv_timeout_handler);
+ INIT_DELAYED_WORK(&rmpp_recv->cleanup_work, recv_cleanup_handler);
+ spin_lock_init(&rmpp_recv->lock);
+ rmpp_recv->state = RMPP_STATE_ACTIVE;
+ atomic_set(&rmpp_recv->refcount, 1);
+
+ rmpp_recv->rmpp_wc = mad_recv_wc;
+ rmpp_recv->cur_seg_buf = &mad_recv_wc->recv_buf;
+ rmpp_recv->newwin = 1;
+ rmpp_recv->seg_num = 1;
+ rmpp_recv->last_ack = 0;
+ rmpp_recv->repwin = 1;
+
+ mad_hdr = &mad_recv_wc->recv_buf.mad->mad_hdr;
+ rmpp_recv->tid = mad_hdr->tid;
+ rmpp_recv->src_qp = mad_recv_wc->wc->src_qp;
+ rmpp_recv->slid = mad_recv_wc->wc->slid;
+ rmpp_recv->mgmt_class = mad_hdr->mgmt_class;
+ rmpp_recv->class_version = mad_hdr->class_version;
+ rmpp_recv->method = mad_hdr->method;
+ return rmpp_recv;
+
+error: kfree(rmpp_recv);
+ return NULL;
+}
+
+static struct mad_rmpp_recv *
+find_rmpp_recv(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct mad_rmpp_recv *rmpp_recv;
+ struct ib_mad_hdr *mad_hdr = &mad_recv_wc->recv_buf.mad->mad_hdr;
+
+ list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
+ if (rmpp_recv->tid == mad_hdr->tid &&
+ rmpp_recv->src_qp == mad_recv_wc->wc->src_qp &&
+ rmpp_recv->slid == mad_recv_wc->wc->slid &&
+ rmpp_recv->mgmt_class == mad_hdr->mgmt_class &&
+ rmpp_recv->class_version == mad_hdr->class_version &&
+ rmpp_recv->method == mad_hdr->method)
+ return rmpp_recv;
+ }
+ return NULL;
+}
+
+static struct mad_rmpp_recv *
+acquire_rmpp_recv(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct mad_rmpp_recv *rmpp_recv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&agent->lock, flags);
+ rmpp_recv = find_rmpp_recv(agent, mad_recv_wc);
+ if (rmpp_recv)
+ atomic_inc(&rmpp_recv->refcount);
+ spin_unlock_irqrestore(&agent->lock, flags);
+ return rmpp_recv;
+}
+
+static struct mad_rmpp_recv *
+insert_rmpp_recv(struct ib_mad_agent_private *agent,
+ struct mad_rmpp_recv *rmpp_recv)
+{
+ struct mad_rmpp_recv *cur_rmpp_recv;
+
+ cur_rmpp_recv = find_rmpp_recv(agent, rmpp_recv->rmpp_wc);
+ if (!cur_rmpp_recv)
+ list_add_tail(&rmpp_recv->list, &agent->rmpp_list);
+
+ return cur_rmpp_recv;
+}
+
+static inline int get_last_flag(struct ib_mad_recv_buf *seg)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+
+ rmpp_mad = (struct ib_rmpp_mad *) seg->mad;
+ return ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_LAST;
+}
+
+static inline int get_seg_num(struct ib_mad_recv_buf *seg)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+
+ rmpp_mad = (struct ib_rmpp_mad *) seg->mad;
+ return be32_to_cpu(rmpp_mad->rmpp_hdr.seg_num);
+}
+
+static inline struct ib_mad_recv_buf * get_next_seg(struct list_head *rmpp_list,
+ struct ib_mad_recv_buf *seg)
+{
+ if (seg->list.next == rmpp_list)
+ return NULL;
+
+ return container_of(seg->list.next, struct ib_mad_recv_buf, list);
+}
+
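+/*
+ * The receive window advertised back to the sender is one eighth of the
+ * receive queue depth, but never less than one segment.
+ */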
+static inline int window_size(struct ib_mad_agent_private *agent)
+{
+ return max(agent->qp_info->recv_queue.max_active >> 3, 1);
+}
+
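+/*
+ * Segments may arrive out of order. Walk the list backwards to find the
+ * already-received segment that the given segment number should follow,
+ * or return NULL if that segment is already present.
+ */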
+static struct ib_mad_recv_buf * find_seg_location(struct list_head *rmpp_list,
+ int seg_num)
+{
+ struct ib_mad_recv_buf *seg_buf;
+ int cur_seg_num;
+
+ list_for_each_entry_reverse(seg_buf, rmpp_list, list) {
+ cur_seg_num = get_seg_num(seg_buf);
+ if (seg_num > cur_seg_num)
+ return seg_buf;
+ if (seg_num == cur_seg_num)
+ break;
+ }
+ return NULL;
+}
+
+static void update_seg_num(struct mad_rmpp_recv *rmpp_recv,
+ struct ib_mad_recv_buf *new_buf)
+{
+ struct list_head *rmpp_list = &rmpp_recv->rmpp_wc->rmpp_list;
+
+ while (new_buf && (get_seg_num(new_buf) == rmpp_recv->seg_num + 1)) {
+ rmpp_recv->cur_seg_buf = new_buf;
+ rmpp_recv->seg_num++;
+ new_buf = get_next_seg(rmpp_list, new_buf);
+ }
+}
+
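+/*
+ * The reassembled MAD length is the class header plus one segment's
+ * worth of payload per segment received, minus the padding implied by
+ * the last segment's PayloadLength field.
+ */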
+static inline int get_mad_len(struct mad_rmpp_recv *rmpp_recv)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+ int hdr_size, data_size, pad;
+
+ rmpp_mad = (struct ib_rmpp_mad *)rmpp_recv->cur_seg_buf->mad;
+
+ hdr_size = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class);
+ data_size = sizeof(struct ib_rmpp_mad) - hdr_size;
+ pad = IB_MGMT_RMPP_DATA - be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin);
+ if (pad > IB_MGMT_RMPP_DATA || pad < 0)
+ pad = 0;
+
+ return hdr_size + rmpp_recv->seg_num * data_size - pad;
+}
+
+static struct ib_mad_recv_wc * complete_rmpp(struct mad_rmpp_recv *rmpp_recv)
+{
+ struct ib_mad_recv_wc *rmpp_wc;
+
+ ack_recv(rmpp_recv, rmpp_recv->rmpp_wc);
+ if (rmpp_recv->seg_num > 1)
+ cancel_delayed_work(&rmpp_recv->timeout_work);
+
+ rmpp_wc = rmpp_recv->rmpp_wc;
+ rmpp_wc->mad_len = get_mad_len(rmpp_recv);
+ /* Use a fixed 10 second cleanup delay until we can derive it from the packet lifetime */
FIXME_REMOVE
+ queue_delayed_work(rmpp_recv->agent->qp_info->port_priv->wq,
+ &rmpp_recv->cleanup_work, msecs_to_jiffies(10000));
+ return rmpp_wc;
+}
+
+static struct ib_mad_recv_wc *
+continue_rmpp(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct mad_rmpp_recv *rmpp_recv;
+ struct ib_mad_recv_buf *prev_buf;
+ struct ib_mad_recv_wc *done_wc;
+ int seg_num;
+ unsigned long flags;
+
+ rmpp_recv = acquire_rmpp_recv(agent, mad_recv_wc);
+ if (!rmpp_recv)
+ goto drop1;
+
+ seg_num = get_seg_num(&mad_recv_wc->recv_buf);
+
+ spin_lock_irqsave(&rmpp_recv->lock, flags);
+ if ((rmpp_recv->state == RMPP_STATE_TIMEOUT) ||
+ (seg_num > rmpp_recv->newwin))
+ goto drop3;
+
+ if ((seg_num <= rmpp_recv->last_ack) ||
+ (rmpp_recv->state == RMPP_STATE_COMPLETE)) {
+ spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+ ack_recv(rmpp_recv, mad_recv_wc);
+ goto drop2;
+ }
+
+ prev_buf = find_seg_location(&rmpp_recv->rmpp_wc->rmpp_list, seg_num);
+ if (!prev_buf)
+ goto drop3;
+
+ done_wc = NULL;
+ list_add(&mad_recv_wc->recv_buf.list, &prev_buf->list);
+ if (rmpp_recv->cur_seg_buf == prev_buf) {
+ update_seg_num(rmpp_recv, &mad_recv_wc->recv_buf);
+ if (get_last_flag(rmpp_recv->cur_seg_buf)) {
+ rmpp_recv->state = RMPP_STATE_COMPLETE;
+ spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+ done_wc = complete_rmpp(rmpp_recv);
+ goto out;
+ } else if (rmpp_recv->seg_num == rmpp_recv->newwin) {
+ rmpp_recv->newwin += window_size(agent);
+ spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+ ack_recv(rmpp_recv, mad_recv_wc);
+ goto out;
+ }
+ }
+ spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+out:
+ deref_rmpp_recv(rmpp_recv);
+ return done_wc;
+
+drop3: spin_unlock_irqrestore(&rmpp_recv->lock, flags);
+drop2: deref_rmpp_recv(rmpp_recv);
+drop1: ib_free_recv_mad(mad_recv_wc);
+ return NULL;
+}
+
+static struct ib_mad_recv_wc *
+start_rmpp(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct mad_rmpp_recv *rmpp_recv;
+ unsigned long flags;
+
+ rmpp_recv = create_rmpp_recv(agent, mad_recv_wc);
+ if (!rmpp_recv) {
+ ib_free_recv_mad(mad_recv_wc);
+ return NULL;
+ }
+
+ spin_lock_irqsave(&agent->lock, flags);
+ if (insert_rmpp_recv(agent, rmpp_recv)) {
+ spin_unlock_irqrestore(&agent->lock, flags);
+ /* duplicate first MAD */
+ destroy_rmpp_recv(rmpp_recv);
+ return continue_rmpp(agent, mad_recv_wc);
+ }
+ atomic_inc(&rmpp_recv->refcount);
+
+ if (get_last_flag(&mad_recv_wc->recv_buf)) {
+ rmpp_recv->state = RMPP_STATE_COMPLETE;
+ spin_unlock_irqrestore(&agent->lock, flags);
+ complete_rmpp(rmpp_recv);
+ } else {
+ spin_unlock_irqrestore(&agent->lock, flags);
+ /* Use a fixed 40 second timeout until we can derive it from the packet lifetimes */
+ queue_delayed_work(agent->qp_info->port_priv->wq,
+ &rmpp_recv->timeout_work,
+ msecs_to_jiffies(40000));
+ rmpp_recv->newwin += window_size(agent);
+ ack_recv(rmpp_recv, mad_recv_wc);
+ mad_recv_wc = NULL;
+ }
+ deref_rmpp_recv(rmpp_recv);
+ return mad_recv_wc;
+}
+
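+/*
+ * paylen_newwin carries the total payload length of the transfer in the
+ * first segment and the residual payload length in the last segment; it
+ * is left at zero for the segments in between.
+ */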
+static int send_next_seg(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+ int timeout;
+ u32 paylen = 0;
+
+ rmpp_mad = mad_send_wr->send_buf.mad;
+ ib_set_rmpp_flags(&rmpp_mad->rmpp_hdr, IB_MGMT_RMPP_FLAG_ACTIVE);
+ rmpp_mad->rmpp_hdr.seg_num = cpu_to_be32(++mad_send_wr->seg_num);
+
+ if (mad_send_wr->seg_num == 1) {
+ rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_FIRST;
+ paylen = mad_send_wr->send_buf.seg_count * IB_MGMT_RMPP_DATA -
+ mad_send_wr->pad;
+ }
+
+ if (mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count) {
+ rmpp_mad->rmpp_hdr.rmpp_rtime_flags |= IB_MGMT_RMPP_FLAG_LAST;
+ paylen = IB_MGMT_RMPP_DATA - mad_send_wr->pad;
+ }
+ rmpp_mad->rmpp_hdr.paylen_newwin = cpu_to_be32(paylen);
+
+ /* Allow 2 seconds for an ACK until we can derive the timeout from the packet lifetime */
+ timeout = mad_send_wr->send_buf.timeout_ms;
+ if (!timeout || timeout > 2000)
+ mad_send_wr->timeout = msecs_to_jiffies(2000);
+
+ return ib_send_mad(mad_send_wr);
+}
+
+static void abort_send(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc, u8 rmpp_status)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_mad_send_wc wc;
+ unsigned long flags;
+
+ spin_lock_irqsave(&agent->lock, flags);
+ mad_send_wr = ib_find_send_mad(agent, mad_recv_wc);
+ if (!mad_send_wr)
+ goto out; /* Unmatched send */
+
+ if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) ||
+ (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS))
+ goto out; /* Send is already done */
+
+ ib_mark_mad_done(mad_send_wr);
+ spin_unlock_irqrestore(&agent->lock, flags);
+
+ wc.status = IB_WC_REM_ABORT_ERR;
+ wc.vendor_err = rmpp_status;
+ wc.send_buf = &mad_send_wr->send_buf;
+ ib_mad_complete_send_wr(mad_send_wr, &wc);
+ return;
+out:
+ spin_unlock_irqrestore(&agent->lock, flags);
+}
+
+static inline void adjust_last_ack(struct ib_mad_send_wr_private *wr,
+ int seg_num)
+{
+ struct list_head *list;
+
+ wr->last_ack = seg_num;
+ list = &wr->last_ack_seg->list;
+ list_for_each_entry(wr->last_ack_seg, list, list)
+ if (wr->last_ack_seg->num == seg_num)
+ break;
+}
+
+static void process_ds_ack(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc, int newwin)
+{
+ struct mad_rmpp_recv *rmpp_recv;
+
+ rmpp_recv = find_rmpp_recv(agent, mad_recv_wc);
+ if (rmpp_recv && rmpp_recv->state == RMPP_STATE_COMPLETE)
+ rmpp_recv->repwin = newwin;
+}
+
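+/*
+ * An ACK for an outstanding send advances last_ack and the send window;
+ * an ACK with segment number zero that matches no send is a dual-sided
+ * (DS) RMPP ACK and only records the peer's advertised window.
+ */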
+static void process_rmpp_ack(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_mad_send_wr_private *mad_send_wr;
+ struct ib_rmpp_mad *rmpp_mad;
+ unsigned long flags;
+ int seg_num, newwin, ret;
+
+ rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+ if (rmpp_mad->rmpp_hdr.rmpp_status) {
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+ return;
+ }
+
+ seg_num = be32_to_cpu(rmpp_mad->rmpp_hdr.seg_num);
+ newwin = be32_to_cpu(rmpp_mad->rmpp_hdr.paylen_newwin);
+ if (newwin < seg_num) {
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_W2S);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_W2S);
+ return;
+ }
+
+ spin_lock_irqsave(&agent->lock, flags);
+ mad_send_wr = ib_find_send_mad(agent, mad_recv_wc);
+ if (!mad_send_wr) {
+ if (!seg_num)
+ process_ds_ack(agent, mad_recv_wc, newwin);
+ goto out; /* Unmatched or DS RMPP ACK */
+ }
+
+ if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) &&
+ (mad_send_wr->timeout)) {
+ spin_unlock_irqrestore(&agent->lock, flags);
+ ack_ds_ack(agent, mad_recv_wc);
+ return; /* Repeated ACK for DS RMPP transaction */
+ }
+
+ if ((mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) ||
+ (!mad_send_wr->timeout) || (mad_send_wr->status != IB_WC_SUCCESS))
+ goto out; /* Send is already done */
+
+ if (seg_num > mad_send_wr->send_buf.seg_count ||
+ seg_num > mad_send_wr->newwin) {
+ spin_unlock_irqrestore(&agent->lock, flags);
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_S2B);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_S2B);
+ return;
+ }
+
+ if (newwin < mad_send_wr->newwin || seg_num < mad_send_wr->last_ack)
+ goto out; /* Old ACK */
+
+ if (seg_num > mad_send_wr->last_ack) {
+ adjust_last_ack(mad_send_wr, seg_num);
+ mad_send_wr->retries_left = mad_send_wr->max_retries;
+ }
+ mad_send_wr->newwin = newwin;
+ if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) {
+ /* If no response is expected, the ACK completes the send */
+ if (!mad_send_wr->send_buf.timeout_ms) {
+ struct ib_mad_send_wc wc;
+
+ ib_mark_mad_done(mad_send_wr);
+ spin_unlock_irqrestore(&agent->lock, flags);
+
+ wc.status = IB_WC_SUCCESS;
+ wc.vendor_err = 0;
+ wc.send_buf = &mad_send_wr->send_buf;
+ ib_mad_complete_send_wr(mad_send_wr, &wc);
+ return;
+ }
+ if (mad_send_wr->refcount == 1)
+ ib_reset_mad_timeout(mad_send_wr,
+ mad_send_wr->send_buf.timeout_ms);
+ spin_unlock_irqrestore(&agent->lock, flags);
+ ack_ds_ack(agent, mad_recv_wc);
+ return;
+ } else if (mad_send_wr->refcount == 1 &&
+ mad_send_wr->seg_num < mad_send_wr->newwin &&
+ mad_send_wr->seg_num < mad_send_wr->send_buf.seg_count) {
+ /* Send failure will just result in a timeout/retry */
+ ret = send_next_seg(mad_send_wr);
+ if (ret)
+ goto out;
+
+ mad_send_wr->refcount++;
+ list_move_tail(&mad_send_wr->agent_list,
+ &mad_send_wr->mad_agent_priv->send_list);
+ }
+out:
+ spin_unlock_irqrestore(&agent->lock, flags);
+}
+
+static struct ib_mad_recv_wc *
+process_rmpp_data(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_rmpp_hdr *rmpp_hdr;
+ u8 rmpp_status;
+
+ rmpp_hdr = &((struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad)->rmpp_hdr;
+
+ if (rmpp_hdr->rmpp_status) {
+ rmpp_status = IB_MGMT_RMPP_STATUS_BAD_STATUS;
+ goto bad;
+ }
+
+ if (rmpp_hdr->seg_num == cpu_to_be32(1)) {
+ if (!(ib_get_rmpp_flags(rmpp_hdr) & IB_MGMT_RMPP_FLAG_FIRST)) {
+ rmpp_status = IB_MGMT_RMPP_STATUS_BAD_SEG;
+ goto bad;
+ }
+ return start_rmpp(agent, mad_recv_wc);
+ } else {
+ if (ib_get_rmpp_flags(rmpp_hdr) & IB_MGMT_RMPP_FLAG_FIRST) {
+ rmpp_status = IB_MGMT_RMPP_STATUS_BAD_SEG;
+ goto bad;
+ }
+ return continue_rmpp(agent, mad_recv_wc);
+ }
+bad:
+ nack_recv(agent, mad_recv_wc, rmpp_status);
+ ib_free_recv_mad(mad_recv_wc);
+ return NULL;
+}
+
+static void process_rmpp_stop(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+
+ rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+
+ if (rmpp_mad->rmpp_hdr.rmpp_status != IB_MGMT_RMPP_STATUS_RESX) {
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+ } else
+ abort_send(agent, mad_recv_wc, rmpp_mad->rmpp_hdr.rmpp_status);
+}
+
+static void process_rmpp_abort(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+
+ rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+
+ if (rmpp_mad->rmpp_hdr.rmpp_status < IB_MGMT_RMPP_STATUS_ABORT_MIN ||
+ rmpp_mad->rmpp_hdr.rmpp_status > IB_MGMT_RMPP_STATUS_ABORT_MAX) {
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BAD_STATUS);
+ } else
+ abort_send(agent, mad_recv_wc, rmpp_mad->rmpp_hdr.rmpp_status);
+}
+
+struct ib_mad_recv_wc *
+ib_process_rmpp_recv_wc(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+
+ rmpp_mad = (struct ib_rmpp_mad *)mad_recv_wc->recv_buf.mad;
+ if (!(rmpp_mad->rmpp_hdr.rmpp_rtime_flags & IB_MGMT_RMPP_FLAG_ACTIVE))
+ return mad_recv_wc;
+
+ if (rmpp_mad->rmpp_hdr.rmpp_version != IB_MGMT_RMPP_VERSION) {
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_UNV);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_UNV);
+ goto out;
+ }
+
+ switch (rmpp_mad->rmpp_hdr.rmpp_type) {
+ case IB_MGMT_RMPP_TYPE_DATA:
+ return process_rmpp_data(agent, mad_recv_wc);
+ case IB_MGMT_RMPP_TYPE_ACK:
+ process_rmpp_ack(agent, mad_recv_wc);
+ break;
+ case IB_MGMT_RMPP_TYPE_STOP:
+ process_rmpp_stop(agent, mad_recv_wc);
+ break;
+ case IB_MGMT_RMPP_TYPE_ABORT:
+ process_rmpp_abort(agent, mad_recv_wc);
+ break;
+ default:
+ abort_send(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BADT);
+ nack_recv(agent, mad_recv_wc, IB_MGMT_RMPP_STATUS_BADT);
+ break;
+ }
+out:
+ ib_free_recv_mad(mad_recv_wc);
+ return NULL;
+}
+
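+/*
+ * When sending a response, reuse the window the peer advertised in its
+ * final ACK of the request (repwin) instead of starting at one segment.
+ */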
+static int init_newwin(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_mad_agent_private *agent = mad_send_wr->mad_agent_priv;
+ struct ib_mad_hdr *mad_hdr = mad_send_wr->send_buf.mad;
+ struct mad_rmpp_recv *rmpp_recv;
+ struct ib_ah_attr ah_attr;
+ unsigned long flags;
+ int newwin = 1;
+
+ if (!(mad_hdr->method & IB_MGMT_METHOD_RESP))
+ goto out;
+
+ spin_lock_irqsave(&agent->lock, flags);
+ list_for_each_entry(rmpp_recv, &agent->rmpp_list, list) {
+ if (rmpp_recv->tid != mad_hdr->tid ||
+ rmpp_recv->mgmt_class != mad_hdr->mgmt_class ||
+ rmpp_recv->class_version != mad_hdr->class_version ||
+ (rmpp_recv->method & IB_MGMT_METHOD_RESP))
+ continue;
+
+ if (ib_query_ah(mad_send_wr->send_buf.ah, &ah_attr))
+ continue;
+
+ if (rmpp_recv->slid == ah_attr.dlid) {
+ newwin = rmpp_recv->repwin;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&agent->lock, flags);
+out:
+ return newwin;
+}
+
+int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+ int ret;
+
+ rmpp_mad = mad_send_wr->send_buf.mad;
+ if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+ IB_MGMT_RMPP_FLAG_ACTIVE))
+ return IB_RMPP_RESULT_UNHANDLED;
+
+ if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA) {
+ mad_send_wr->seg_num = 1;
+ return IB_RMPP_RESULT_INTERNAL;
+ }
+
+ mad_send_wr->newwin = init_newwin(mad_send_wr);
+
+ /* We need to wait for the final ACK even if there isn't a response */
+ mad_send_wr->refcount += (mad_send_wr->timeout == 0);
+ ret = send_next_seg(mad_send_wr);
+ if (!ret)
+ return IB_RMPP_RESULT_CONSUMED;
+ return ret;
+}
+
+int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+ int ret;
+
+ rmpp_mad = mad_send_wr->send_buf.mad;
+ if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+ IB_MGMT_RMPP_FLAG_ACTIVE))
+ return IB_RMPP_RESULT_UNHANDLED; /* RMPP not active */
+
+ if (rmpp_mad->rmpp_hdr.rmpp_type != IB_MGMT_RMPP_TYPE_DATA)
+ return IB_RMPP_RESULT_INTERNAL; /* ACK, STOP, or ABORT */
+
+ if (mad_send_wc->status != IB_WC_SUCCESS ||
+ mad_send_wr->status != IB_WC_SUCCESS)
+ return IB_RMPP_RESULT_PROCESSED; /* Canceled or send error */
+
+ if (!mad_send_wr->timeout)
+ return IB_RMPP_RESULT_PROCESSED; /* Response received */
+
+ if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count) {
+ mad_send_wr->timeout =
+ msecs_to_jiffies(mad_send_wr->send_buf.timeout_ms);
+ return IB_RMPP_RESULT_PROCESSED; /* Send done */
+ }
+
+ if (mad_send_wr->seg_num == mad_send_wr->newwin ||
+ mad_send_wr->seg_num == mad_send_wr->send_buf.seg_count)
+ return IB_RMPP_RESULT_PROCESSED; /* Wait for ACK */
+
+ ret = send_next_seg(mad_send_wr);
+ if (ret) {
+ mad_send_wc->status = IB_WC_GENERAL_ERR;
+ return IB_RMPP_RESULT_PROCESSED;
+ }
+ return IB_RMPP_RESULT_CONSUMED;
+}
+
+int ib_retry_rmpp(struct ib_mad_send_wr_private *mad_send_wr)
+{
+ struct ib_rmpp_mad *rmpp_mad;
+ int ret;
+
+ rmpp_mad = mad_send_wr->send_buf.mad;
+ if (!(ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+ IB_MGMT_RMPP_FLAG_ACTIVE))
+ return IB_RMPP_RESULT_UNHANDLED; /* RMPP not active */
+
+ if (mad_send_wr->last_ack == mad_send_wr->send_buf.seg_count)
+ return IB_RMPP_RESULT_PROCESSED;
+
+ mad_send_wr->seg_num = mad_send_wr->last_ack;
+ mad_send_wr->cur_seg = mad_send_wr->last_ack_seg;
+
+ ret = send_next_seg(mad_send_wr);
+ if (ret)
+ return IB_RMPP_RESULT_PROCESSED;
+
+ return IB_RMPP_RESULT_CONSUMED;
+}
diff --git a/drivers/infiniband/core/mad_rmpp.h b/drivers/infiniband/core/mad_rmpp.h
new file mode 100644
index 000000000..3d336bff1
--- /dev/null
+++ b/drivers/infiniband/core/mad_rmpp.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2005 Intel Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __MAD_RMPP_H__
+#define __MAD_RMPP_H__
+
+enum {
+ IB_RMPP_RESULT_PROCESSED,
+ IB_RMPP_RESULT_CONSUMED,
+ IB_RMPP_RESULT_INTERNAL,
+ IB_RMPP_RESULT_UNHANDLED
+};
+
+int ib_send_rmpp_mad(struct ib_mad_send_wr_private *mad_send_wr);
+
+struct ib_mad_recv_wc *
+ib_process_rmpp_recv_wc(struct ib_mad_agent_private *agent,
+ struct ib_mad_recv_wc *mad_recv_wc);
+
+int ib_process_rmpp_send_wc(struct ib_mad_send_wr_private *mad_send_wr,
+ struct ib_mad_send_wc *mad_send_wc);
+
+void ib_rmpp_send_handler(struct ib_mad_send_wc *mad_send_wc);
+
+void ib_cancel_rmpp_recvs(struct ib_mad_agent_private *agent);
+
+int ib_retry_rmpp(struct ib_mad_send_wr_private *mad_send_wr);
+
+#endif /* __MAD_RMPP_H__ */
diff --git a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c
new file mode 100644
index 000000000..fa17b552f
--- /dev/null
+++ b/drivers/infiniband/core/multicast.c
@@ -0,0 +1,903 @@
+/*
+ * Copyright (c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/dma-mapping.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/export.h>
+#include <linux/slab.h>
+#include <linux/bitops.h>
+#include <linux/random.h>
+
+#include <rdma/ib_cache.h>
+#include "sa.h"
+
+static void mcast_add_one(struct ib_device *device);
+static void mcast_remove_one(struct ib_device *device);
+
+static struct ib_client mcast_client = {
+ .name = "ib_multicast",
+ .add = mcast_add_one,
+ .remove = mcast_remove_one
+};
+
+static struct ib_sa_client sa_client;
+static struct workqueue_struct *mcast_wq;
+static union ib_gid mgid0;
+
+struct mcast_device;
+
+struct mcast_port {
+ struct mcast_device *dev;
+ spinlock_t lock;
+ struct rb_root table;
+ atomic_t refcount;
+ struct completion comp;
+ u8 port_num;
+};
+
+struct mcast_device {
+ struct ib_device *device;
+ struct ib_event_handler event_handler;
+ int start_port;
+ int end_port;
+ struct mcast_port port[0];
+};
+
+enum mcast_state {
+ MCAST_JOINING,
+ MCAST_MEMBER,
+ MCAST_ERROR,
+};
+
+enum mcast_group_state {
+ MCAST_IDLE,
+ MCAST_BUSY,
+ MCAST_GROUP_ERROR,
+ MCAST_PKEY_EVENT
+};
+
+enum {
+ MCAST_INVALID_PKEY_INDEX = 0xFFFF
+};
+
+struct mcast_member;
+
+struct mcast_group {
+ struct ib_sa_mcmember_rec rec;
+ struct rb_node node;
+ struct mcast_port *port;
+ spinlock_t lock;
+ struct work_struct work;
+ struct list_head pending_list;
+ struct list_head active_list;
+ struct mcast_member *last_join;
+ int members[3];
+ atomic_t refcount;
+ enum mcast_group_state state;
+ struct ib_sa_query *query;
+ int query_id;
+ u16 pkey_index;
+ u8 leave_state;
+ int retries;
+};
+
+struct mcast_member {
+ struct ib_sa_multicast multicast;
+ struct ib_sa_client *client;
+ struct mcast_group *group;
+ struct list_head list;
+ enum mcast_state state;
+ atomic_t refcount;
+ struct completion comp;
+};
+
+static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
+ void *context);
+static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
+ void *context);
+
+static struct mcast_group *mcast_find(struct mcast_port *port,
+ union ib_gid *mgid)
+{
+ struct rb_node *node = port->table.rb_node;
+ struct mcast_group *group;
+ int ret;
+
+ while (node) {
+ group = rb_entry(node, struct mcast_group, node);
+ ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid);
+ if (!ret)
+ return group;
+
+ if (ret < 0)
+ node = node->rb_left;
+ else
+ node = node->rb_right;
+ }
+ return NULL;
+}
+
+static struct mcast_group *mcast_insert(struct mcast_port *port,
+ struct mcast_group *group,
+ int allow_duplicates)
+{
+ struct rb_node **link = &port->table.rb_node;
+ struct rb_node *parent = NULL;
+ struct mcast_group *cur_group;
+ int ret;
+
+ while (*link) {
+ parent = *link;
+ cur_group = rb_entry(parent, struct mcast_group, node);
+
+ ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw,
+ sizeof group->rec.mgid);
+ if (ret < 0)
+ link = &(*link)->rb_left;
+ else if (ret > 0)
+ link = &(*link)->rb_right;
+ else if (allow_duplicates)
+ link = &(*link)->rb_left;
+ else
+ return cur_group;
+ }
+ rb_link_node(&group->node, parent, link);
+ rb_insert_color(&group->node, &port->table);
+ return NULL;
+}
+
+static void deref_port(struct mcast_port *port)
+{
+ if (atomic_dec_and_test(&port->refcount))
+ complete(&port->comp);
+}
+
+static void release_group(struct mcast_group *group)
+{
+ struct mcast_port *port = group->port;
+ unsigned long flags;
+
+ spin_lock_irqsave(&port->lock, flags);
+ if (atomic_dec_and_test(&group->refcount)) {
+ rb_erase(&group->node, &port->table);
+ spin_unlock_irqrestore(&port->lock, flags);
+ kfree(group);
+ deref_port(port);
+ } else
+ spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static void deref_member(struct mcast_member *member)
+{
+ if (atomic_dec_and_test(&member->refcount))
+ complete(&member->comp);
+}
+
+static void queue_join(struct mcast_member *member)
+{
+ struct mcast_group *group = member->group;
+ unsigned long flags;
+
+ spin_lock_irqsave(&group->lock, flags);
+ list_add_tail(&member->list, &group->pending_list);
+ if (group->state == MCAST_IDLE) {
+ group->state = MCAST_BUSY;
+ atomic_inc(&group->refcount);
+ queue_work(mcast_wq, &group->work);
+ }
+ spin_unlock_irqrestore(&group->lock, flags);
+}
+
+/*
+ * A multicast group has three types of members: full member, non member, and
+ * send only member. We need to keep track of the number of members of each
+ * type based on their join state. Adjust the number of members that belong to
+ * the specified join states.
+ */
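+/*
+ * members[0] counts full members, members[1] non members and members[2]
+ * send-only members; e.g. a join_state of 0x1 adjusts only the full
+ * member count, while 0x5 adjusts both the full and send-only counts.
+ */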
+static void adjust_membership(struct mcast_group *group, u8 join_state, int inc)
+{
+ int i;
+
+ for (i = 0; i < 3; i++, join_state >>= 1)
+ if (join_state & 0x1)
+ group->members[i] += inc;
+}
+
+/*
+ * If a multicast group has zero members left for a particular join state, but
+ * the group is still a member with the SA, we need to leave that join state.
+ * Determine which join states we still belong to, but that do not have any
+ * active members.
+ */
+static u8 get_leave_state(struct mcast_group *group)
+{
+ u8 leave_state = 0;
+ int i;
+
+ for (i = 0; i < 3; i++)
+ if (!group->members[i])
+ leave_state |= (0x1 << i);
+
+ return leave_state & group->rec.join_state;
+}
+
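+/*
+ * Apply a component selector from the request: GT requires the group's
+ * value to exceed the requested value, LT requires it to be below it,
+ * and EQ requires an exact match. A nonzero return means the group does
+ * not satisfy the request.
+ */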
+static int check_selector(ib_sa_comp_mask comp_mask,
+ ib_sa_comp_mask selector_mask,
+ ib_sa_comp_mask value_mask,
+ u8 selector, u8 src_value, u8 dst_value)
+{
+ int err;
+
+ if (!(comp_mask & selector_mask) || !(comp_mask & value_mask))
+ return 0;
+
+ switch (selector) {
+ case IB_SA_GT:
+ err = (src_value <= dst_value);
+ break;
+ case IB_SA_LT:
+ err = (src_value >= dst_value);
+ break;
+ case IB_SA_EQ:
+ err = (src_value != dst_value);
+ break;
+ default:
+ err = 0;
+ break;
+ }
+
+ return err;
+}
+
+static int cmp_rec(struct ib_sa_mcmember_rec *src,
+ struct ib_sa_mcmember_rec *dst, ib_sa_comp_mask comp_mask)
+{
+ /* MGID must already match */
+
+ if (comp_mask & IB_SA_MCMEMBER_REC_PORT_GID &&
+ memcmp(&src->port_gid, &dst->port_gid, sizeof src->port_gid))
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey)
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid)
+ return -EINVAL;
+ if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR,
+ IB_SA_MCMEMBER_REC_MTU, dst->mtu_selector,
+ src->mtu, dst->mtu))
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS &&
+ src->traffic_class != dst->traffic_class)
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey)
+ return -EINVAL;
+ if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR,
+ IB_SA_MCMEMBER_REC_RATE, dst->rate_selector,
+ src->rate, dst->rate))
+ return -EINVAL;
+ if (check_selector(comp_mask,
+ IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR,
+ IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME,
+ dst->packet_life_time_selector,
+ src->packet_life_time, dst->packet_life_time))
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_SL && src->sl != dst->sl)
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL &&
+ src->flow_label != dst->flow_label)
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT &&
+ src->hop_limit != dst->hop_limit)
+ return -EINVAL;
+ if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE && src->scope != dst->scope)
+ return -EINVAL;
+
+ /* join_state checked separately, proxy_join ignored */
+
+ return 0;
+}
+
+static int send_join(struct mcast_group *group, struct mcast_member *member)
+{
+ struct mcast_port *port = group->port;
+ int ret;
+
+ group->last_join = member;
+ ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device,
+ port->port_num, IB_MGMT_METHOD_SET,
+ &member->multicast.rec,
+ member->multicast.comp_mask,
+ 3000, GFP_KERNEL, join_handler, group,
+ &group->query);
+ if (ret >= 0) {
+ group->query_id = ret;
+ ret = 0;
+ }
+ return ret;
+}
+
+static int send_leave(struct mcast_group *group, u8 leave_state)
+{
+ struct mcast_port *port = group->port;
+ struct ib_sa_mcmember_rec rec;
+ int ret;
+
+ rec = group->rec;
+ rec.join_state = leave_state;
+ group->leave_state = leave_state;
+
+ ret = ib_sa_mcmember_rec_query(&sa_client, port->dev->device,
+ port->port_num, IB_SA_METHOD_DELETE, &rec,
+ IB_SA_MCMEMBER_REC_MGID |
+ IB_SA_MCMEMBER_REC_PORT_GID |
+ IB_SA_MCMEMBER_REC_JOIN_STATE,
+ 3000, GFP_KERNEL, leave_handler,
+ group, &group->query);
+ if (ret >= 0) {
+ group->query_id = ret;
+ ret = 0;
+ }
+ return ret;
+}
+
+static void join_group(struct mcast_group *group, struct mcast_member *member,
+ u8 join_state)
+{
+ member->state = MCAST_MEMBER;
+ adjust_membership(group, join_state, 1);
+ group->rec.join_state |= join_state;
+ member->multicast.rec = group->rec;
+ member->multicast.rec.join_state = join_state;
+ list_move(&member->list, &group->active_list);
+}
+
+static int fail_join(struct mcast_group *group, struct mcast_member *member,
+ int status)
+{
+ spin_lock_irq(&group->lock);
+ list_del_init(&member->list);
+ spin_unlock_irq(&group->lock);
+ return member->multicast.callback(status, &member->multicast);
+}
+
+static void process_group_error(struct mcast_group *group)
+{
+ struct mcast_member *member;
+ int ret = 0;
+ u16 pkey_index;
+
+ if (group->state == MCAST_PKEY_EVENT)
+ ret = ib_find_pkey(group->port->dev->device,
+ group->port->port_num,
+ be16_to_cpu(group->rec.pkey), &pkey_index);
+
+ spin_lock_irq(&group->lock);
+ if (group->state == MCAST_PKEY_EVENT && !ret &&
+ group->pkey_index == pkey_index)
+ goto out;
+
+ while (!list_empty(&group->active_list)) {
+ member = list_entry(group->active_list.next,
+ struct mcast_member, list);
+ atomic_inc(&member->refcount);
+ list_del_init(&member->list);
+ adjust_membership(group, member->multicast.rec.join_state, -1);
+ member->state = MCAST_ERROR;
+ spin_unlock_irq(&group->lock);
+
+ ret = member->multicast.callback(-ENETRESET,
+ &member->multicast);
+ deref_member(member);
+ if (ret)
+ ib_sa_free_multicast(&member->multicast);
+ spin_lock_irq(&group->lock);
+ }
+
+ group->rec.join_state = 0;
+out:
+ group->state = MCAST_BUSY;
+ spin_unlock_irq(&group->lock);
+}
+
+static void mcast_work_handler(struct work_struct *work)
+{
+ struct mcast_group *group;
+ struct mcast_member *member;
+ struct ib_sa_multicast *multicast;
+ int status, ret;
+ u8 join_state;
+
+ group = container_of(work, typeof(*group), work);
+retest:
+ spin_lock_irq(&group->lock);
+ while (!list_empty(&group->pending_list) ||
+ (group->state != MCAST_BUSY)) {
+
+ if (group->state != MCAST_BUSY) {
+ spin_unlock_irq(&group->lock);
+ process_group_error(group);
+ goto retest;
+ }
+
+ member = list_entry(group->pending_list.next,
+ struct mcast_member, list);
+ multicast = &member->multicast;
+ join_state = multicast->rec.join_state;
+ atomic_inc(&member->refcount);
+
+ if (join_state == (group->rec.join_state & join_state)) {
+ status = cmp_rec(&group->rec, &multicast->rec,
+ multicast->comp_mask);
+ if (!status)
+ join_group(group, member, join_state);
+ else
+ list_del_init(&member->list);
+ spin_unlock_irq(&group->lock);
+ ret = multicast->callback(status, multicast);
+ } else {
+ spin_unlock_irq(&group->lock);
+ status = send_join(group, member);
+ if (!status) {
+ deref_member(member);
+ return;
+ }
+ ret = fail_join(group, member, status);
+ }
+
+ deref_member(member);
+ if (ret)
+ ib_sa_free_multicast(&member->multicast);
+ spin_lock_irq(&group->lock);
+ }
+
+ join_state = get_leave_state(group);
+ if (join_state) {
+ group->rec.join_state &= ~join_state;
+ spin_unlock_irq(&group->lock);
+ if (send_leave(group, join_state))
+ goto retest;
+ } else {
+ group->state = MCAST_IDLE;
+ spin_unlock_irq(&group->lock);
+ release_group(group);
+ }
+}
+
+/*
+ * Fail a join request if it is still active, i.e. at the head of the pending queue.
+ */
+static void process_join_error(struct mcast_group *group, int status)
+{
+ struct mcast_member *member;
+ int ret;
+
+ spin_lock_irq(&group->lock);
+ member = list_entry(group->pending_list.next,
+ struct mcast_member, list);
+ if (group->last_join == member) {
+ atomic_inc(&member->refcount);
+ list_del_init(&member->list);
+ spin_unlock_irq(&group->lock);
+ ret = member->multicast.callback(status, &member->multicast);
+ deref_member(member);
+ if (ret)
+ ib_sa_free_multicast(&member->multicast);
+ } else
+ spin_unlock_irq(&group->lock);
+}
+
+static void join_handler(int status, struct ib_sa_mcmember_rec *rec,
+ void *context)
+{
+ struct mcast_group *group = context;
+ u16 pkey_index = MCAST_INVALID_PKEY_INDEX;
+
+ if (status)
+ process_join_error(group, status);
+ else {
+ int mgids_changed, is_mgid0;
+ ib_find_pkey(group->port->dev->device, group->port->port_num,
+ be16_to_cpu(rec->pkey), &pkey_index);
+
+ spin_lock_irq(&group->port->lock);
+ if (group->state == MCAST_BUSY &&
+ group->pkey_index == MCAST_INVALID_PKEY_INDEX)
+ group->pkey_index = pkey_index;
+ mgids_changed = memcmp(&rec->mgid, &group->rec.mgid,
+ sizeof(group->rec.mgid));
+ group->rec = *rec;
+ if (mgids_changed) {
+ rb_erase(&group->node, &group->port->table);
+ is_mgid0 = !memcmp(&mgid0, &group->rec.mgid,
+ sizeof(mgid0));
+ mcast_insert(group->port, group, is_mgid0);
+ }
+ spin_unlock_irq(&group->port->lock);
+ }
+ mcast_work_handler(&group->work);
+}
+
+static void leave_handler(int status, struct ib_sa_mcmember_rec *rec,
+ void *context)
+{
+ struct mcast_group *group = context;
+
+ if (status && group->retries > 0 &&
+ !send_leave(group, group->leave_state))
+ group->retries--;
+ else
+ mcast_work_handler(&group->work);
+}
+
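+/*
+ * A zero MGID requests that the SA assign one, so such joins are
+ * inserted with duplicates allowed and are re-keyed in join_handler()
+ * once the SA returns the real MGID.
+ */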
+static struct mcast_group *acquire_group(struct mcast_port *port,
+ union ib_gid *mgid, gfp_t gfp_mask)
+{
+ struct mcast_group *group, *cur_group;
+ unsigned long flags;
+ int is_mgid0;
+
+ is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0);
+ if (!is_mgid0) {
+ spin_lock_irqsave(&port->lock, flags);
+ group = mcast_find(port, mgid);
+ if (group)
+ goto found;
+ spin_unlock_irqrestore(&port->lock, flags);
+ }
+
+ group = kzalloc(sizeof *group, gfp_mask);
+ if (!group)
+ return NULL;
+
+ group->retries = 3;
+ group->port = port;
+ group->rec.mgid = *mgid;
+ group->pkey_index = MCAST_INVALID_PKEY_INDEX;
+ INIT_LIST_HEAD(&group->pending_list);
+ INIT_LIST_HEAD(&group->active_list);
+ INIT_WORK(&group->work, mcast_work_handler);
+ spin_lock_init(&group->lock);
+
+ spin_lock_irqsave(&port->lock, flags);
+ cur_group = mcast_insert(port, group, is_mgid0);
+ if (cur_group) {
+ kfree(group);
+ group = cur_group;
+ } else
+ atomic_inc(&port->refcount);
+found:
+ atomic_inc(&group->refcount);
+ spin_unlock_irqrestore(&port->lock, flags);
+ return group;
+}
+
+/*
+ * We serialize all join requests to a single group to make our lives much
+ * easier. Otherwise, two users could try to join the same group
+ * simultaneously, with different configurations, one could leave while the
+ * join is in progress, etc., which makes locking around error recovery
+ * difficult.
+ */
+struct ib_sa_multicast *
+ib_sa_join_multicast(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ struct ib_sa_mcmember_rec *rec,
+ ib_sa_comp_mask comp_mask, gfp_t gfp_mask,
+ int (*callback)(int status,
+ struct ib_sa_multicast *multicast),
+ void *context)
+{
+ struct mcast_device *dev;
+ struct mcast_member *member;
+ struct ib_sa_multicast *multicast;
+ int ret;
+
+ dev = ib_get_client_data(device, &mcast_client);
+ if (!dev)
+ return ERR_PTR(-ENODEV);
+
+ member = kmalloc(sizeof *member, gfp_mask);
+ if (!member)
+ return ERR_PTR(-ENOMEM);
+
+ ib_sa_client_get(client);
+ member->client = client;
+ member->multicast.rec = *rec;
+ member->multicast.comp_mask = comp_mask;
+ member->multicast.callback = callback;
+ member->multicast.context = context;
+ init_completion(&member->comp);
+ atomic_set(&member->refcount, 1);
+ member->state = MCAST_JOINING;
+
+ member->group = acquire_group(&dev->port[port_num - dev->start_port],
+ &rec->mgid, gfp_mask);
+ if (!member->group) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ /*
+ * The user will get the multicast structure in their callback. They
+ * could then free the multicast structure before we can return from
+ * this routine. So we save the pointer to return before queuing
+ * any callback.
+ */
+ multicast = &member->multicast;
+ queue_join(member);
+ return multicast;
+
+err:
+ ib_sa_client_put(client);
+ kfree(member);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(ib_sa_join_multicast);
+
+void ib_sa_free_multicast(struct ib_sa_multicast *multicast)
+{
+ struct mcast_member *member;
+ struct mcast_group *group;
+
+ member = container_of(multicast, struct mcast_member, multicast);
+ group = member->group;
+
+ spin_lock_irq(&group->lock);
+ if (member->state == MCAST_MEMBER)
+ adjust_membership(group, multicast->rec.join_state, -1);
+
+ list_del_init(&member->list);
+
+ if (group->state == MCAST_IDLE) {
+ group->state = MCAST_BUSY;
+ spin_unlock_irq(&group->lock);
+ /* Continue to hold reference on group until callback */
+ queue_work(mcast_wq, &group->work);
+ } else {
+ spin_unlock_irq(&group->lock);
+ release_group(group);
+ }
+
+ deref_member(member);
+ wait_for_completion(&member->comp);
+ ib_sa_client_put(member->client);
+ kfree(member);
+}
+EXPORT_SYMBOL(ib_sa_free_multicast);
+
+int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num,
+ union ib_gid *mgid, struct ib_sa_mcmember_rec *rec)
+{
+ struct mcast_device *dev;
+ struct mcast_port *port;
+ struct mcast_group *group;
+ unsigned long flags;
+ int ret = 0;
+
+ dev = ib_get_client_data(device, &mcast_client);
+ if (!dev)
+ return -ENODEV;
+
+ port = &dev->port[port_num - dev->start_port];
+ spin_lock_irqsave(&port->lock, flags);
+ group = mcast_find(port, mgid);
+ if (group)
+ *rec = group->rec;
+ else
+ ret = -EADDRNOTAVAIL;
+ spin_unlock_irqrestore(&port->lock, flags);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_sa_get_mcmember_rec);
+
+int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num,
+ struct ib_sa_mcmember_rec *rec,
+ struct ib_ah_attr *ah_attr)
+{
+ int ret;
+ u16 gid_index;
+ u8 p;
+
+ ret = ib_find_cached_gid(device, &rec->port_gid, &p, &gid_index);
+ if (ret)
+ return ret;
+
+ memset(ah_attr, 0, sizeof *ah_attr);
+ ah_attr->dlid = be16_to_cpu(rec->mlid);
+ ah_attr->sl = rec->sl;
+ ah_attr->port_num = port_num;
+ ah_attr->static_rate = rec->rate;
+
+ ah_attr->ah_flags = IB_AH_GRH;
+ ah_attr->grh.dgid = rec->mgid;
+
+ ah_attr->grh.sgid_index = (u8) gid_index;
+ ah_attr->grh.flow_label = be32_to_cpu(rec->flow_label);
+ ah_attr->grh.hop_limit = rec->hop_limit;
+ ah_attr->grh.traffic_class = rec->traffic_class;
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_init_ah_from_mcmember);
+
+static void mcast_groups_event(struct mcast_port *port,
+ enum mcast_group_state state)
+{
+ struct mcast_group *group;
+ struct rb_node *node;
+ unsigned long flags;
+
+ spin_lock_irqsave(&port->lock, flags);
+ for (node = rb_first(&port->table); node; node = rb_next(node)) {
+ group = rb_entry(node, struct mcast_group, node);
+ spin_lock(&group->lock);
+ if (group->state == MCAST_IDLE) {
+ atomic_inc(&group->refcount);
+ queue_work(mcast_wq, &group->work);
+ }
+ if (group->state != MCAST_GROUP_ERROR)
+ group->state = state;
+ spin_unlock(&group->lock);
+ }
+ spin_unlock_irqrestore(&port->lock, flags);
+}
+
+static void mcast_event_handler(struct ib_event_handler *handler,
+ struct ib_event *event)
+{
+ struct mcast_device *dev;
+ int index;
+
+ dev = container_of(handler, struct mcast_device, event_handler);
+ if (rdma_port_get_link_layer(dev->device, event->element.port_num) !=
+ IB_LINK_LAYER_INFINIBAND)
+ return;
+
+ index = event->element.port_num - dev->start_port;
+
+ switch (event->event) {
+ case IB_EVENT_PORT_ERR:
+ case IB_EVENT_LID_CHANGE:
+ case IB_EVENT_SM_CHANGE:
+ case IB_EVENT_CLIENT_REREGISTER:
+ mcast_groups_event(&dev->port[index], MCAST_GROUP_ERROR);
+ break;
+ case IB_EVENT_PKEY_CHANGE:
+ mcast_groups_event(&dev->port[index], MCAST_PKEY_EVENT);
+ break;
+ default:
+ break;
+ }
+}
+
+static void mcast_add_one(struct ib_device *device)
+{
+ struct mcast_device *dev;
+ struct mcast_port *port;
+ int i;
+ int count = 0;
+
+ if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+ return;
+
+ dev = kmalloc(sizeof *dev + device->phys_port_cnt * sizeof *port,
+ GFP_KERNEL);
+ if (!dev)
+ return;
+
+ if (device->node_type == RDMA_NODE_IB_SWITCH)
+ dev->start_port = dev->end_port = 0;
+ else {
+ dev->start_port = 1;
+ dev->end_port = device->phys_port_cnt;
+ }
+
+ for (i = 0; i <= dev->end_port - dev->start_port; i++) {
+ if (rdma_port_get_link_layer(device, dev->start_port + i) !=
+ IB_LINK_LAYER_INFINIBAND)
+ continue;
+ port = &dev->port[i];
+ port->dev = dev;
+ port->port_num = dev->start_port + i;
+ spin_lock_init(&port->lock);
+ port->table = RB_ROOT;
+ init_completion(&port->comp);
+ atomic_set(&port->refcount, 1);
+ ++count;
+ }
+
+ if (!count) {
+ kfree(dev);
+ return;
+ }
+
+ dev->device = device;
+ ib_set_client_data(device, &mcast_client, dev);
+
+ INIT_IB_EVENT_HANDLER(&dev->event_handler, device, mcast_event_handler);
+ ib_register_event_handler(&dev->event_handler);
+}
+
+static void mcast_remove_one(struct ib_device *device)
+{
+ struct mcast_device *dev;
+ struct mcast_port *port;
+ int i;
+
+ dev = ib_get_client_data(device, &mcast_client);
+ if (!dev)
+ return;
+
+ ib_unregister_event_handler(&dev->event_handler);
+ flush_workqueue(mcast_wq);
+
+ for (i = 0; i <= dev->end_port - dev->start_port; i++) {
+ if (rdma_port_get_link_layer(device, dev->start_port + i) ==
+ IB_LINK_LAYER_INFINIBAND) {
+ port = &dev->port[i];
+ deref_port(port);
+ wait_for_completion(&port->comp);
+ }
+ }
+
+ kfree(dev);
+}
+
+int mcast_init(void)
+{
+ int ret;
+
+ mcast_wq = create_singlethread_workqueue("ib_mcast");
+ if (!mcast_wq)
+ return -ENOMEM;
+
+ ib_sa_register_client(&sa_client);
+
+ ret = ib_register_client(&mcast_client);
+ if (ret)
+ goto err;
+ return 0;
+
+err:
+ ib_sa_unregister_client(&sa_client);
+ destroy_workqueue(mcast_wq);
+ return ret;
+}
+
+void mcast_cleanup(void)
+{
+ ib_unregister_client(&mcast_client);
+ ib_sa_unregister_client(&sa_client);
+ destroy_workqueue(mcast_wq);
+}
diff --git a/drivers/infiniband/core/netlink.c b/drivers/infiniband/core/netlink.c
new file mode 100644
index 000000000..23dd5a5c7
--- /dev/null
+++ b/drivers/infiniband/core/netlink.c
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2010 Voltaire Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define pr_fmt(fmt) "%s:%s: " fmt, KBUILD_MODNAME, __func__
+
+#include <linux/export.h>
+#include <net/netlink.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <rdma/rdma_netlink.h>
+
+struct ibnl_client {
+ struct list_head list;
+ int index;
+ int nops;
+ const struct ibnl_client_cbs *cb_table;
+};
+
+static DEFINE_MUTEX(ibnl_mutex);
+static struct sock *nls;
+static LIST_HEAD(client_list);
+
+int ibnl_add_client(int index, int nops,
+ const struct ibnl_client_cbs cb_table[])
+{
+ struct ibnl_client *cur;
+ struct ibnl_client *nl_client;
+
+ nl_client = kmalloc(sizeof *nl_client, GFP_KERNEL);
+ if (!nl_client)
+ return -ENOMEM;
+
+ nl_client->index = index;
+ nl_client->nops = nops;
+ nl_client->cb_table = cb_table;
+
+ mutex_lock(&ibnl_mutex);
+
+ list_for_each_entry(cur, &client_list, list) {
+ if (cur->index == index) {
+ pr_warn("Client for %d already exists\n", index);
+ mutex_unlock(&ibnl_mutex);
+ kfree(nl_client);
+ return -EINVAL;
+ }
+ }
+
+ list_add_tail(&nl_client->list, &client_list);
+
+ mutex_unlock(&ibnl_mutex);
+
+ return 0;
+}
+EXPORT_SYMBOL(ibnl_add_client);
+
+int ibnl_remove_client(int index)
+{
+ struct ibnl_client *cur, *next;
+
+ mutex_lock(&ibnl_mutex);
+ list_for_each_entry_safe(cur, next, &client_list, list) {
+ if (cur->index == index) {
+ list_del(&(cur->list));
+ mutex_unlock(&ibnl_mutex);
+ kfree(cur);
+ return 0;
+ }
+ }
+ pr_warn("Can't remove callback for client idx %d. Not found\n", index);
+ mutex_unlock(&ibnl_mutex);
+
+ return -EINVAL;
+}
+EXPORT_SYMBOL(ibnl_remove_client);
+
+void *ibnl_put_msg(struct sk_buff *skb, struct nlmsghdr **nlh, int seq,
+ int len, int client, int op, int flags)
+{
+ unsigned char *prev_tail;
+
+ prev_tail = skb_tail_pointer(skb);
+ *nlh = nlmsg_put(skb, 0, seq, RDMA_NL_GET_TYPE(client, op),
+ len, flags);
+ if (!*nlh)
+ goto out_nlmsg_trim;
+ (*nlh)->nlmsg_len = skb_tail_pointer(skb) - prev_tail;
+ return nlmsg_data(*nlh);
+
+out_nlmsg_trim:
+ nlmsg_trim(skb, prev_tail);
+ return NULL;
+}
+EXPORT_SYMBOL(ibnl_put_msg);
+
+int ibnl_put_attr(struct sk_buff *skb, struct nlmsghdr *nlh,
+ int len, void *data, int type)
+{
+ unsigned char *prev_tail;
+
+ prev_tail = skb_tail_pointer(skb);
+ if (nla_put(skb, type, len, data))
+ goto nla_put_failure;
+ nlh->nlmsg_len += skb_tail_pointer(skb) - prev_tail;
+ return 0;
+
+nla_put_failure:
+ nlmsg_trim(skb, prev_tail - nlh->nlmsg_len);
+ return -EMSGSIZE;
+}
+EXPORT_SYMBOL(ibnl_put_attr);
+
+static int ibnl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct ibnl_client *client;
+ int type = nlh->nlmsg_type;
+ int index = RDMA_NL_GET_CLIENT(type);
+ int op = RDMA_NL_GET_OP(type);
+
+ list_for_each_entry(client, &client_list, list) {
+ if (client->index == index) {
+ if (op < 0 || op >= client->nops ||
+ !client->cb_table[op].dump)
+ return -EINVAL;
+
+ {
+ struct netlink_dump_control c = {
+ .dump = client->cb_table[op].dump,
+ .module = client->cb_table[op].module,
+ };
+ return netlink_dump_start(nls, skb, nlh, &c);
+ }
+ }
+ }
+
+ pr_info("Index %d wasn't found in client list\n", index);
+ return -EINVAL;
+}
+
+static void ibnl_rcv(struct sk_buff *skb)
+{
+ mutex_lock(&ibnl_mutex);
+ netlink_rcv_skb(skb, &ibnl_rcv_msg);
+ mutex_unlock(&ibnl_mutex);
+}
+
+int ibnl_unicast(struct sk_buff *skb, struct nlmsghdr *nlh,
+ __u32 pid)
+{
+ return nlmsg_unicast(nls, skb, pid);
+}
+EXPORT_SYMBOL(ibnl_unicast);
+
+int ibnl_multicast(struct sk_buff *skb, struct nlmsghdr *nlh,
+ unsigned int group, gfp_t flags)
+{
+ return nlmsg_multicast(nls, skb, 0, group, flags);
+}
+EXPORT_SYMBOL(ibnl_multicast);
+
+int __init ibnl_init(void)
+{
+ struct netlink_kernel_cfg cfg = {
+ .input = ibnl_rcv,
+ };
+
+ nls = netlink_kernel_create(&init_net, NETLINK_RDMA, &cfg);
+ if (!nls) {
+ pr_warn("Failed to create netlink socket\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+void ibnl_cleanup(void)
+{
+ struct ibnl_client *cur, *next;
+
+ mutex_lock(&ibnl_mutex);
+ list_for_each_entry_safe(cur, next, &client_list, list) {
+ list_del(&(cur->list));
+ kfree(cur);
+ }
+ mutex_unlock(&ibnl_mutex);
+
+ netlink_kernel_release(nls);
+}
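
For context, a minimal sketch of how a kernel-side consumer might hook into this RDMA netlink layer: register a client index with a table of dump ops, and tear it down on module exit. The op enum and function names below are hypothetical, and RDMA_NL_RDMA_CM is borrowed purely for illustration (a real client defines its own index in rdma_netlink.h); only ibnl_add_client()/ibnl_remove_client() and the ibnl_client_cbs layout come from the code above.

    #include <linux/module.h>
    #include <net/netlink.h>
    #include <rdma/rdma_netlink.h>

    enum { EXAMPLE_NL_OP_DUMP, EXAMPLE_NL_NUM_OPS };   /* hypothetical ops */

    static int example_dump(struct sk_buff *skb, struct netlink_callback *cb)
    {
            /* a real dump op would fill skb via ibnl_put_msg()/ibnl_put_attr() */
            return skb->len;
    }

    static const struct ibnl_client_cbs example_cbs[EXAMPLE_NL_NUM_OPS] = {
            [EXAMPLE_NL_OP_DUMP] = { .dump = example_dump, .module = THIS_MODULE },
    };

    static int __init example_nl_init(void)
    {
            /* RDMA_NL_RDMA_CM reused here only for illustration */
            return ibnl_add_client(RDMA_NL_RDMA_CM, EXAMPLE_NL_NUM_OPS, example_cbs);
    }

    static void __exit example_nl_exit(void)
    {
            ibnl_remove_client(RDMA_NL_RDMA_CM);
    }

    module_init(example_nl_init);
    module_exit(example_nl_exit);
    MODULE_LICENSE("GPL");
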
diff --git a/drivers/infiniband/core/packer.c b/drivers/infiniband/core/packer.c
new file mode 100644
index 000000000..1b65986c0
--- /dev/null
+++ b/drivers/infiniband/core/packer.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/export.h>
+#include <linux/string.h>
+
+#include <rdma/ib_pack.h>
+
+static u64 value_read(int offset, int size, void *structure)
+{
+ switch (size) {
+ case 1: return *(u8 *) (structure + offset);
+ case 2: return be16_to_cpup((__be16 *) (structure + offset));
+ case 4: return be32_to_cpup((__be32 *) (structure + offset));
+ case 8: return be64_to_cpup((__be64 *) (structure + offset));
+ default:
+ printk(KERN_WARNING "Field size %d bits not handled\n", size * 8);
+ return 0;
+ }
+}
+
+/**
+ * ib_pack - Pack a structure into a buffer
+ * @desc:Array of structure field descriptions
+ * @desc_len:Number of entries in @desc
+ * @structure:Structure to pack from
+ * @buf:Buffer to pack into
+ *
+ * ib_pack() packs a list of structure fields into a buffer,
+ * controlled by the array of fields in @desc.
+ */
+void ib_pack(const struct ib_field *desc,
+ int desc_len,
+ void *structure,
+ void *buf)
+{
+ int i;
+
+ for (i = 0; i < desc_len; ++i) {
+ if (desc[i].size_bits <= 32) {
+ int shift;
+ u32 val;
+ __be32 mask;
+ __be32 *addr;
+
+ shift = 32 - desc[i].offset_bits - desc[i].size_bits;
+ if (desc[i].struct_size_bytes)
+ val = value_read(desc[i].struct_offset_bytes,
+ desc[i].struct_size_bytes,
+ structure) << shift;
+ else
+ val = 0;
+
+ mask = cpu_to_be32(((1ull << desc[i].size_bits) - 1) << shift);
+ addr = (__be32 *) buf + desc[i].offset_words;
+ *addr = (*addr & ~mask) | (cpu_to_be32(val) & mask);
+ } else if (desc[i].size_bits <= 64) {
+ int shift;
+ u64 val;
+ __be64 mask;
+ __be64 *addr;
+
+ shift = 64 - desc[i].offset_bits - desc[i].size_bits;
+ if (desc[i].struct_size_bytes)
+ val = value_read(desc[i].struct_offset_bytes,
+ desc[i].struct_size_bytes,
+ structure) << shift;
+ else
+ val = 0;
+
+ mask = cpu_to_be64((~0ull >> (64 - desc[i].size_bits)) << shift);
+ addr = (__be64 *) ((__be32 *) buf + desc[i].offset_words);
+ *addr = (*addr & ~mask) | (cpu_to_be64(val) & mask);
+ } else {
+ if (desc[i].offset_bits % 8 ||
+ desc[i].size_bits % 8) {
+ printk(KERN_WARNING "Structure field %s of size %d "
+ "bits is not byte-aligned\n",
+ desc[i].field_name, desc[i].size_bits);
+ }
+
+ if (desc[i].struct_size_bytes)
+ memcpy(buf + desc[i].offset_words * 4 +
+ desc[i].offset_bits / 8,
+ structure + desc[i].struct_offset_bytes,
+ desc[i].size_bits / 8);
+ else
+ memset(buf + desc[i].offset_words * 4 +
+ desc[i].offset_bits / 8,
+ 0,
+ desc[i].size_bits / 8);
+ }
+ }
+}
+EXPORT_SYMBOL(ib_pack);
+
+static void value_write(int offset, int size, u64 val, void *structure)
+{
+ switch (size * 8) {
+ case 8: *( u8 *) (structure + offset) = val; break;
+ case 16: *(__be16 *) (structure + offset) = cpu_to_be16(val); break;
+ case 32: *(__be32 *) (structure + offset) = cpu_to_be32(val); break;
+ case 64: *(__be64 *) (structure + offset) = cpu_to_be64(val); break;
+ default:
+ printk(KERN_WARNING "Field size %d bits not handled\n", size * 8);
+ }
+}
+
+/**
+ * ib_unpack - Unpack a buffer into a structure
+ * @desc:Array of structure field descriptions
+ * @desc_len:Number of entries in @desc
+ * @buf:Buffer to unpack from
+ * @structure:Structure to unpack into
+ *
+ * ib_unpack() unpacks a list of structure fields from a buffer,
+ * controlled by the array of fields in @desc.
+ */
+void ib_unpack(const struct ib_field *desc,
+ int desc_len,
+ void *buf,
+ void *structure)
+{
+ int i;
+
+ for (i = 0; i < desc_len; ++i) {
+ if (!desc[i].struct_size_bytes)
+ continue;
+
+ if (desc[i].size_bits <= 32) {
+ int shift;
+ u32 val;
+ u32 mask;
+ __be32 *addr;
+
+ shift = 32 - desc[i].offset_bits - desc[i].size_bits;
+ mask = ((1ull << desc[i].size_bits) - 1) << shift;
+ addr = (__be32 *) buf + desc[i].offset_words;
+ val = (be32_to_cpup(addr) & mask) >> shift;
+ value_write(desc[i].struct_offset_bytes,
+ desc[i].struct_size_bytes,
+ val,
+ structure);
+ } else if (desc[i].size_bits <= 64) {
+ int shift;
+ u64 val;
+ u64 mask;
+ __be64 *addr;
+
+ shift = 64 - desc[i].offset_bits - desc[i].size_bits;
+ mask = (~0ull >> (64 - desc[i].size_bits)) << shift;
+ addr = (__be64 *) buf + desc[i].offset_words;
+ val = (be64_to_cpup(addr) & mask) >> shift;
+ value_write(desc[i].struct_offset_bytes,
+ desc[i].struct_size_bytes,
+ val,
+ structure);
+ } else {
+ if (desc[i].offset_bits % 8 ||
+ desc[i].size_bits % 8) {
+ printk(KERN_WARNING "Structure field %s of size %d "
+ "bits is not byte-aligned\n",
+ desc[i].field_name, desc[i].size_bits);
+ }
+
+ memcpy(structure + desc[i].struct_offset_bytes,
+ buf + desc[i].offset_words * 4 +
+ desc[i].offset_bits / 8,
+ desc[i].size_bits / 8);
+ }
+ }
+}
+EXPORT_SYMBOL(ib_unpack);
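
The ib_field tables consumed by ib_pack()/ib_unpack() map structure members onto word/bit offsets in the wire layout (the PATH_REC_FIELD and MCMEMBER_REC_FIELD tables in sa_query.c below are real examples). As a hedged illustration with an invented record type and offsets: note that multi-byte structure members are kept big-endian, because value_read()/value_write() byte-swap them, and that the wire buffer must start zeroed since ib_pack() only touches the bits named in the table.

    #include <linux/kernel.h>
    #include <asm/byteorder.h>
    #include <rdma/ib_pack.h>

    struct example_rec {                /* hypothetical record */
            __be32  key;
            u8      flags;
    };

    #define EXAMPLE_REC_FIELD(field)                                          \
            .struct_offset_bytes = offsetof(struct example_rec, field),       \
            .struct_size_bytes   = sizeof ((struct example_rec *) 0)->field,  \
            .field_name          = "example_rec:" #field

    static const struct ib_field example_rec_table[] = {
            { EXAMPLE_REC_FIELD(key),
              .offset_words = 0,
              .offset_bits  = 0,
              .size_bits    = 32 },
            { EXAMPLE_REC_FIELD(flags),
              .offset_words = 1,
              .offset_bits  = 0,
              .size_bits    = 8 },
    };

    static void example_roundtrip(void)
    {
            struct example_rec in  = { .key = cpu_to_be32(0x12345678), .flags = 3 };
            struct example_rec out = {};
            u8 wire[8] = {};        /* two 32-bit words; must start zeroed */

            ib_pack(example_rec_table, ARRAY_SIZE(example_rec_table), &in, wire);
            ib_unpack(example_rec_table, ARRAY_SIZE(example_rec_table), wire, &out);
            /* out.key == in.key and out.flags == in.flags after the round trip */
    }
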
diff --git a/drivers/infiniband/core/sa.h b/drivers/infiniband/core/sa.h
new file mode 100644
index 000000000..b1d4bbf4c
--- /dev/null
+++ b/drivers/infiniband/core/sa.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef SA_H
+#define SA_H
+
+#include <rdma/ib_sa.h>
+
+static inline void ib_sa_client_get(struct ib_sa_client *client)
+{
+ atomic_inc(&client->users);
+}
+
+static inline void ib_sa_client_put(struct ib_sa_client *client)
+{
+ if (atomic_dec_and_test(&client->users))
+ complete(&client->comp);
+}
+
+int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ u8 method,
+ struct ib_sa_mcmember_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_mcmember_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query);
+
+int mcast_init(void);
+void mcast_cleanup(void);
+
+#endif /* SA_H */
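
The get/put inlines above pair with ib_sa_register_client()/ib_sa_unregister_client() (defined in sa_query.c, next) to give each SA consumer a completion-based lifetime: unregister drops the initial reference and then blocks until every query has released its own. A minimal, hypothetical consumer sketch:

    #include <linux/module.h>
    #include <rdma/ib_sa.h>

    static struct ib_sa_client example_sa_client;   /* hypothetical consumer */

    static int __init example_init(void)
    {
            ib_sa_register_client(&example_sa_client);      /* users = 1 */
            return 0;
    }

    static void __exit example_exit(void)
    {
            /*
             * Drops the initial reference, then waits on the completion
             * until every ib_sa_client_get() taken for an outstanding
             * query has been balanced by ib_sa_client_put().
             */
            ib_sa_unregister_client(&example_sa_client);
    }
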
diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c
new file mode 100644
index 000000000..c38f030f0
--- /dev/null
+++ b/drivers/infiniband/core/sa_query.c
@@ -0,0 +1,1280 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/err.h>
+#include <linux/random.h>
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/dma-mapping.h>
+#include <linux/kref.h>
+#include <linux/idr.h>
+#include <linux/workqueue.h>
+#include <uapi/linux/if_ether.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_cache.h>
+#include "sa.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand subnet administration query support");
+MODULE_LICENSE("Dual BSD/GPL");
+
+struct ib_sa_sm_ah {
+ struct ib_ah *ah;
+ struct kref ref;
+ u16 pkey_index;
+ u8 src_path_mask;
+};
+
+struct ib_sa_port {
+ struct ib_mad_agent *agent;
+ struct ib_sa_sm_ah *sm_ah;
+ struct work_struct update_task;
+ spinlock_t ah_lock;
+ u8 port_num;
+};
+
+struct ib_sa_device {
+ int start_port, end_port;
+ struct ib_event_handler event_handler;
+ struct ib_sa_port port[0];
+};
+
+struct ib_sa_query {
+ void (*callback)(struct ib_sa_query *, int, struct ib_sa_mad *);
+ void (*release)(struct ib_sa_query *);
+ struct ib_sa_client *client;
+ struct ib_sa_port *port;
+ struct ib_mad_send_buf *mad_buf;
+ struct ib_sa_sm_ah *sm_ah;
+ int id;
+};
+
+struct ib_sa_service_query {
+ void (*callback)(int, struct ib_sa_service_rec *, void *);
+ void *context;
+ struct ib_sa_query sa_query;
+};
+
+struct ib_sa_path_query {
+ void (*callback)(int, struct ib_sa_path_rec *, void *);
+ void *context;
+ struct ib_sa_query sa_query;
+};
+
+struct ib_sa_guidinfo_query {
+ void (*callback)(int, struct ib_sa_guidinfo_rec *, void *);
+ void *context;
+ struct ib_sa_query sa_query;
+};
+
+struct ib_sa_mcmember_query {
+ void (*callback)(int, struct ib_sa_mcmember_rec *, void *);
+ void *context;
+ struct ib_sa_query sa_query;
+};
+
+static void ib_sa_add_one(struct ib_device *device);
+static void ib_sa_remove_one(struct ib_device *device);
+
+static struct ib_client sa_client = {
+ .name = "sa",
+ .add = ib_sa_add_one,
+ .remove = ib_sa_remove_one
+};
+
+static DEFINE_SPINLOCK(idr_lock);
+static DEFINE_IDR(query_idr);
+
+static DEFINE_SPINLOCK(tid_lock);
+static u32 tid;
+
+#define PATH_REC_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct ib_sa_path_rec, field), \
+ .struct_size_bytes = sizeof ((struct ib_sa_path_rec *) 0)->field, \
+ .field_name = "sa_path_rec:" #field
+
+static const struct ib_field path_rec_table[] = {
+ { PATH_REC_FIELD(service_id),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 64 },
+ { PATH_REC_FIELD(dgid),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { PATH_REC_FIELD(sgid),
+ .offset_words = 6,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { PATH_REC_FIELD(dlid),
+ .offset_words = 10,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { PATH_REC_FIELD(slid),
+ .offset_words = 10,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { PATH_REC_FIELD(raw_traffic),
+ .offset_words = 11,
+ .offset_bits = 0,
+ .size_bits = 1 },
+ { RESERVED,
+ .offset_words = 11,
+ .offset_bits = 1,
+ .size_bits = 3 },
+ { PATH_REC_FIELD(flow_label),
+ .offset_words = 11,
+ .offset_bits = 4,
+ .size_bits = 20 },
+ { PATH_REC_FIELD(hop_limit),
+ .offset_words = 11,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { PATH_REC_FIELD(traffic_class),
+ .offset_words = 12,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { PATH_REC_FIELD(reversible),
+ .offset_words = 12,
+ .offset_bits = 8,
+ .size_bits = 1 },
+ { PATH_REC_FIELD(numb_path),
+ .offset_words = 12,
+ .offset_bits = 9,
+ .size_bits = 7 },
+ { PATH_REC_FIELD(pkey),
+ .offset_words = 12,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { PATH_REC_FIELD(qos_class),
+ .offset_words = 13,
+ .offset_bits = 0,
+ .size_bits = 12 },
+ { PATH_REC_FIELD(sl),
+ .offset_words = 13,
+ .offset_bits = 12,
+ .size_bits = 4 },
+ { PATH_REC_FIELD(mtu_selector),
+ .offset_words = 13,
+ .offset_bits = 16,
+ .size_bits = 2 },
+ { PATH_REC_FIELD(mtu),
+ .offset_words = 13,
+ .offset_bits = 18,
+ .size_bits = 6 },
+ { PATH_REC_FIELD(rate_selector),
+ .offset_words = 13,
+ .offset_bits = 24,
+ .size_bits = 2 },
+ { PATH_REC_FIELD(rate),
+ .offset_words = 13,
+ .offset_bits = 26,
+ .size_bits = 6 },
+ { PATH_REC_FIELD(packet_life_time_selector),
+ .offset_words = 14,
+ .offset_bits = 0,
+ .size_bits = 2 },
+ { PATH_REC_FIELD(packet_life_time),
+ .offset_words = 14,
+ .offset_bits = 2,
+ .size_bits = 6 },
+ { PATH_REC_FIELD(preference),
+ .offset_words = 14,
+ .offset_bits = 8,
+ .size_bits = 8 },
+ { RESERVED,
+ .offset_words = 14,
+ .offset_bits = 16,
+ .size_bits = 48 },
+};
+
+#define MCMEMBER_REC_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct ib_sa_mcmember_rec, field), \
+ .struct_size_bytes = sizeof ((struct ib_sa_mcmember_rec *) 0)->field, \
+ .field_name = "sa_mcmember_rec:" #field
+
+static const struct ib_field mcmember_rec_table[] = {
+ { MCMEMBER_REC_FIELD(mgid),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { MCMEMBER_REC_FIELD(port_gid),
+ .offset_words = 4,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { MCMEMBER_REC_FIELD(qkey),
+ .offset_words = 8,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { MCMEMBER_REC_FIELD(mlid),
+ .offset_words = 9,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { MCMEMBER_REC_FIELD(mtu_selector),
+ .offset_words = 9,
+ .offset_bits = 16,
+ .size_bits = 2 },
+ { MCMEMBER_REC_FIELD(mtu),
+ .offset_words = 9,
+ .offset_bits = 18,
+ .size_bits = 6 },
+ { MCMEMBER_REC_FIELD(traffic_class),
+ .offset_words = 9,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { MCMEMBER_REC_FIELD(pkey),
+ .offset_words = 10,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { MCMEMBER_REC_FIELD(rate_selector),
+ .offset_words = 10,
+ .offset_bits = 16,
+ .size_bits = 2 },
+ { MCMEMBER_REC_FIELD(rate),
+ .offset_words = 10,
+ .offset_bits = 18,
+ .size_bits = 6 },
+ { MCMEMBER_REC_FIELD(packet_life_time_selector),
+ .offset_words = 10,
+ .offset_bits = 24,
+ .size_bits = 2 },
+ { MCMEMBER_REC_FIELD(packet_life_time),
+ .offset_words = 10,
+ .offset_bits = 26,
+ .size_bits = 6 },
+ { MCMEMBER_REC_FIELD(sl),
+ .offset_words = 11,
+ .offset_bits = 0,
+ .size_bits = 4 },
+ { MCMEMBER_REC_FIELD(flow_label),
+ .offset_words = 11,
+ .offset_bits = 4,
+ .size_bits = 20 },
+ { MCMEMBER_REC_FIELD(hop_limit),
+ .offset_words = 11,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { MCMEMBER_REC_FIELD(scope),
+ .offset_words = 12,
+ .offset_bits = 0,
+ .size_bits = 4 },
+ { MCMEMBER_REC_FIELD(join_state),
+ .offset_words = 12,
+ .offset_bits = 4,
+ .size_bits = 4 },
+ { MCMEMBER_REC_FIELD(proxy_join),
+ .offset_words = 12,
+ .offset_bits = 8,
+ .size_bits = 1 },
+ { RESERVED,
+ .offset_words = 12,
+ .offset_bits = 9,
+ .size_bits = 23 },
+};
+
+#define SERVICE_REC_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct ib_sa_service_rec, field), \
+ .struct_size_bytes = sizeof ((struct ib_sa_service_rec *) 0)->field, \
+ .field_name = "sa_service_rec:" #field
+
+static const struct ib_field service_rec_table[] = {
+ { SERVICE_REC_FIELD(id),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 64 },
+ { SERVICE_REC_FIELD(gid),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { SERVICE_REC_FIELD(pkey),
+ .offset_words = 6,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { SERVICE_REC_FIELD(lease),
+ .offset_words = 7,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { SERVICE_REC_FIELD(key),
+ .offset_words = 8,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { SERVICE_REC_FIELD(name),
+ .offset_words = 12,
+ .offset_bits = 0,
+ .size_bits = 64*8 },
+ { SERVICE_REC_FIELD(data8),
+ .offset_words = 28,
+ .offset_bits = 0,
+ .size_bits = 16*8 },
+ { SERVICE_REC_FIELD(data16),
+ .offset_words = 32,
+ .offset_bits = 0,
+ .size_bits = 8*16 },
+ { SERVICE_REC_FIELD(data32),
+ .offset_words = 36,
+ .offset_bits = 0,
+ .size_bits = 4*32 },
+ { SERVICE_REC_FIELD(data64),
+ .offset_words = 40,
+ .offset_bits = 0,
+ .size_bits = 2*64 },
+};
+
+#define GUIDINFO_REC_FIELD(field) \
+ .struct_offset_bytes = offsetof(struct ib_sa_guidinfo_rec, field), \
+ .struct_size_bytes = sizeof((struct ib_sa_guidinfo_rec *) 0)->field, \
+ .field_name = "sa_guidinfo_rec:" #field
+
+static const struct ib_field guidinfo_rec_table[] = {
+ { GUIDINFO_REC_FIELD(lid),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { GUIDINFO_REC_FIELD(block_num),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 8 },
+ { GUIDINFO_REC_FIELD(res1),
+ .offset_words = 0,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { GUIDINFO_REC_FIELD(res2),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { GUIDINFO_REC_FIELD(guid_info_list),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 512 },
+};
+
+static void free_sm_ah(struct kref *kref)
+{
+ struct ib_sa_sm_ah *sm_ah = container_of(kref, struct ib_sa_sm_ah, ref);
+
+ ib_destroy_ah(sm_ah->ah);
+ kfree(sm_ah);
+}
+
+static void update_sm_ah(struct work_struct *work)
+{
+ struct ib_sa_port *port =
+ container_of(work, struct ib_sa_port, update_task);
+ struct ib_sa_sm_ah *new_ah;
+ struct ib_port_attr port_attr;
+ struct ib_ah_attr ah_attr;
+
+ if (ib_query_port(port->agent->device, port->port_num, &port_attr)) {
+ printk(KERN_WARNING "Couldn't query port\n");
+ return;
+ }
+
+ new_ah = kmalloc(sizeof *new_ah, GFP_KERNEL);
+ if (!new_ah) {
+ printk(KERN_WARNING "Couldn't allocate new SM AH\n");
+ return;
+ }
+
+ kref_init(&new_ah->ref);
+ new_ah->src_path_mask = (1 << port_attr.lmc) - 1;
+
+ new_ah->pkey_index = 0;
+ if (ib_find_pkey(port->agent->device, port->port_num,
+ IB_DEFAULT_PKEY_FULL, &new_ah->pkey_index))
+ printk(KERN_ERR "Couldn't find index for default PKey\n");
+
+ memset(&ah_attr, 0, sizeof ah_attr);
+ ah_attr.dlid = port_attr.sm_lid;
+ ah_attr.sl = port_attr.sm_sl;
+ ah_attr.port_num = port->port_num;
+
+ new_ah->ah = ib_create_ah(port->agent->qp->pd, &ah_attr);
+ if (IS_ERR(new_ah->ah)) {
+ printk(KERN_WARNING "Couldn't create new SM AH\n");
+ kfree(new_ah);
+ return;
+ }
+
+ spin_lock_irq(&port->ah_lock);
+ if (port->sm_ah)
+ kref_put(&port->sm_ah->ref, free_sm_ah);
+ port->sm_ah = new_ah;
+ spin_unlock_irq(&port->ah_lock);
+}
+
+static void ib_sa_event(struct ib_event_handler *handler, struct ib_event *event)
+{
+ if (event->event == IB_EVENT_PORT_ERR ||
+ event->event == IB_EVENT_PORT_ACTIVE ||
+ event->event == IB_EVENT_LID_CHANGE ||
+ event->event == IB_EVENT_PKEY_CHANGE ||
+ event->event == IB_EVENT_SM_CHANGE ||
+ event->event == IB_EVENT_CLIENT_REREGISTER) {
+ unsigned long flags;
+ struct ib_sa_device *sa_dev =
+ container_of(handler, typeof(*sa_dev), event_handler);
+ struct ib_sa_port *port =
+ &sa_dev->port[event->element.port_num - sa_dev->start_port];
+
+ if (rdma_port_get_link_layer(handler->device, port->port_num) != IB_LINK_LAYER_INFINIBAND)
+ return;
+
+ spin_lock_irqsave(&port->ah_lock, flags);
+ if (port->sm_ah)
+ kref_put(&port->sm_ah->ref, free_sm_ah);
+ port->sm_ah = NULL;
+ spin_unlock_irqrestore(&port->ah_lock, flags);
+
+ queue_work(ib_wq, &sa_dev->port[event->element.port_num -
+ sa_dev->start_port].update_task);
+ }
+}
+
+void ib_sa_register_client(struct ib_sa_client *client)
+{
+ atomic_set(&client->users, 1);
+ init_completion(&client->comp);
+}
+EXPORT_SYMBOL(ib_sa_register_client);
+
+void ib_sa_unregister_client(struct ib_sa_client *client)
+{
+ ib_sa_client_put(client);
+ wait_for_completion(&client->comp);
+}
+EXPORT_SYMBOL(ib_sa_unregister_client);
+
+/**
+ * ib_sa_cancel_query - try to cancel an SA query
+ * @id:ID of query to cancel
+ * @query:query pointer to cancel
+ *
+ * Try to cancel an SA query. If the id and query don't match up or
+ * the query has already completed, nothing is done. Otherwise the
+ * query is canceled and will complete with a status of -EINTR.
+ */
+void ib_sa_cancel_query(int id, struct ib_sa_query *query)
+{
+ unsigned long flags;
+ struct ib_mad_agent *agent;
+ struct ib_mad_send_buf *mad_buf;
+
+ spin_lock_irqsave(&idr_lock, flags);
+ if (idr_find(&query_idr, id) != query) {
+ spin_unlock_irqrestore(&idr_lock, flags);
+ return;
+ }
+ agent = query->port->agent;
+ mad_buf = query->mad_buf;
+ spin_unlock_irqrestore(&idr_lock, flags);
+
+ ib_cancel_mad(agent, mad_buf);
+}
+EXPORT_SYMBOL(ib_sa_cancel_query);
+
+static u8 get_src_path_mask(struct ib_device *device, u8 port_num)
+{
+ struct ib_sa_device *sa_dev;
+ struct ib_sa_port *port;
+ unsigned long flags;
+ u8 src_path_mask;
+
+ sa_dev = ib_get_client_data(device, &sa_client);
+ if (!sa_dev)
+ return 0x7f;
+
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+ spin_lock_irqsave(&port->ah_lock, flags);
+ src_path_mask = port->sm_ah ? port->sm_ah->src_path_mask : 0x7f;
+ spin_unlock_irqrestore(&port->ah_lock, flags);
+
+ return src_path_mask;
+}
+
+int ib_init_ah_from_path(struct ib_device *device, u8 port_num,
+ struct ib_sa_path_rec *rec, struct ib_ah_attr *ah_attr)
+{
+ int ret;
+ u16 gid_index;
+ int force_grh;
+
+ memset(ah_attr, 0, sizeof *ah_attr);
+ ah_attr->dlid = be16_to_cpu(rec->dlid);
+ ah_attr->sl = rec->sl;
+ ah_attr->src_path_bits = be16_to_cpu(rec->slid) &
+ get_src_path_mask(device, port_num);
+ ah_attr->port_num = port_num;
+ ah_attr->static_rate = rec->rate;
+
+ force_grh = rdma_port_get_link_layer(device, port_num) == IB_LINK_LAYER_ETHERNET;
+
+ if (rec->hop_limit > 1 || force_grh) {
+ ah_attr->ah_flags = IB_AH_GRH;
+ ah_attr->grh.dgid = rec->dgid;
+
+ ret = ib_find_cached_gid(device, &rec->sgid, &port_num,
+ &gid_index);
+ if (ret)
+ return ret;
+
+ ah_attr->grh.sgid_index = gid_index;
+ ah_attr->grh.flow_label = be32_to_cpu(rec->flow_label);
+ ah_attr->grh.hop_limit = rec->hop_limit;
+ ah_attr->grh.traffic_class = rec->traffic_class;
+ }
+ if (force_grh) {
+ memcpy(ah_attr->dmac, rec->dmac, ETH_ALEN);
+ ah_attr->vlan_id = rec->vlan_id;
+ } else {
+ ah_attr->vlan_id = 0xffff;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_init_ah_from_path);
+
+static int alloc_mad(struct ib_sa_query *query, gfp_t gfp_mask)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&query->port->ah_lock, flags);
+ if (!query->port->sm_ah) {
+ spin_unlock_irqrestore(&query->port->ah_lock, flags);
+ return -EAGAIN;
+ }
+ kref_get(&query->port->sm_ah->ref);
+ query->sm_ah = query->port->sm_ah;
+ spin_unlock_irqrestore(&query->port->ah_lock, flags);
+
+ query->mad_buf = ib_create_send_mad(query->port->agent, 1,
+ query->sm_ah->pkey_index,
+ 0, IB_MGMT_SA_HDR, IB_MGMT_SA_DATA,
+ gfp_mask);
+ if (IS_ERR(query->mad_buf)) {
+ kref_put(&query->sm_ah->ref, free_sm_ah);
+ return -ENOMEM;
+ }
+
+ query->mad_buf->ah = query->sm_ah->ah;
+
+ return 0;
+}
+
+static void free_mad(struct ib_sa_query *query)
+{
+ ib_free_send_mad(query->mad_buf);
+ kref_put(&query->sm_ah->ref, free_sm_ah);
+}
+
+static void init_mad(struct ib_sa_mad *mad, struct ib_mad_agent *agent)
+{
+ unsigned long flags;
+
+ memset(mad, 0, sizeof *mad);
+
+ mad->mad_hdr.base_version = IB_MGMT_BASE_VERSION;
+ mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
+ mad->mad_hdr.class_version = IB_SA_CLASS_VERSION;
+
+ spin_lock_irqsave(&tid_lock, flags);
+ mad->mad_hdr.tid =
+ cpu_to_be64(((u64) agent->hi_tid) << 32 | tid++);
+ spin_unlock_irqrestore(&tid_lock, flags);
+}
+
+static int send_mad(struct ib_sa_query *query, int timeout_ms, gfp_t gfp_mask)
+{
+ bool preload = !!(gfp_mask & __GFP_WAIT);
+ unsigned long flags;
+ int ret, id;
+
+ if (preload)
+ idr_preload(gfp_mask);
+ spin_lock_irqsave(&idr_lock, flags);
+
+ id = idr_alloc(&query_idr, query, 0, 0, GFP_NOWAIT);
+
+ spin_unlock_irqrestore(&idr_lock, flags);
+ if (preload)
+ idr_preload_end();
+ if (id < 0)
+ return id;
+
+ query->mad_buf->timeout_ms = timeout_ms;
+ query->mad_buf->context[0] = query;
+ query->id = id;
+
+ ret = ib_post_send_mad(query->mad_buf, NULL);
+ if (ret) {
+ spin_lock_irqsave(&idr_lock, flags);
+ idr_remove(&query_idr, id);
+ spin_unlock_irqrestore(&idr_lock, flags);
+ }
+
+ /*
+ * It's not safe to dereference query any more, because the
+ * send may already have completed and freed the query in
+ * another context.
+ */
+ return ret ? ret : id;
+}
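
send_mad() uses the standard idr_preload()/GFP_NOWAIT idiom: the allocation under the spinlock never sleeps, while callers passing a blocking gfp_mask still benefit from the preloaded per-cpu cache. A stripped-down sketch of the same pattern (names are hypothetical; __GFP_WAIT is the blocking flag of this kernel generation, and <linux/idr.h>, <linux/spinlock.h>, <linux/gfp.h> are already included by this file):

    static DEFINE_SPINLOCK(example_lock);
    static DEFINE_IDR(example_idr);

    static int example_alloc_id(void *ptr, gfp_t gfp_mask)
    {
            bool preload = !!(gfp_mask & __GFP_WAIT);
            unsigned long flags;
            int id;

            if (preload)
                    idr_preload(gfp_mask);  /* may sleep; fills per-cpu cache */
            spin_lock_irqsave(&example_lock, flags);
            id = idr_alloc(&example_idr, ptr, 0, 0, GFP_NOWAIT);
            spin_unlock_irqrestore(&example_lock, flags);
            if (preload)
                    idr_preload_end();

            return id;      /* negative errno on failure */
    }
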
+
+void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec)
+{
+ ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table), attribute, rec);
+}
+EXPORT_SYMBOL(ib_sa_unpack_path);
+
+void ib_sa_pack_path(struct ib_sa_path_rec *rec, void *attribute)
+{
+ ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), rec, attribute);
+}
+EXPORT_SYMBOL(ib_sa_pack_path);
+
+static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query,
+ int status,
+ struct ib_sa_mad *mad)
+{
+ struct ib_sa_path_query *query =
+ container_of(sa_query, struct ib_sa_path_query, sa_query);
+
+ if (mad) {
+ struct ib_sa_path_rec rec;
+
+ ib_unpack(path_rec_table, ARRAY_SIZE(path_rec_table),
+ mad->data, &rec);
+ rec.vlan_id = 0xffff;
+ memset(rec.dmac, 0, ETH_ALEN);
+ memset(rec.smac, 0, ETH_ALEN);
+ query->callback(status, &rec, query->context);
+ } else
+ query->callback(status, NULL, query->context);
+}
+
+static void ib_sa_path_rec_release(struct ib_sa_query *sa_query)
+{
+ kfree(container_of(sa_query, struct ib_sa_path_query, sa_query));
+}
+
+/**
+ * ib_sa_path_rec_get - Start a Path get query
+ * @client:SA client
+ * @device:device to send query on
+ * @port_num: port number to send query on
+ * @rec:Path Record to send in query
+ * @comp_mask:component mask to send in query
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when query completes, times out or is
+ * canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:query context, used to cancel query
+ *
+ * Send a Path Record Get query to the SA to look up a path. The
+ * callback function will be called when the query completes (or
+ * fails); status is 0 for a successful response, -EINTR if the query
+ * is canceled, -ETIMEDOUT if the query timed out, or -EIO if an error
+ * occurred sending the query. The resp parameter of the callback is
+ * only valid if status is 0.
+ *
+ * If the return value of ib_sa_path_rec_get() is negative, it is an
+ * error code. Otherwise it is a query ID that can be used to cancel
+ * the query.
+ */
+int ib_sa_path_rec_get(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ struct ib_sa_path_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_path_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query)
+{
+ struct ib_sa_path_query *query;
+ struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_port *port;
+ struct ib_mad_agent *agent;
+ struct ib_sa_mad *mad;
+ int ret;
+
+ if (!sa_dev)
+ return -ENODEV;
+
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+ agent = port->agent;
+
+ query = kmalloc(sizeof *query, gfp_mask);
+ if (!query)
+ return -ENOMEM;
+
+ query->sa_query.port = port;
+ ret = alloc_mad(&query->sa_query, gfp_mask);
+ if (ret)
+ goto err1;
+
+ ib_sa_client_get(client);
+ query->sa_query.client = client;
+ query->callback = callback;
+ query->context = context;
+
+ mad = query->sa_query.mad_buf->mad;
+ init_mad(mad, agent);
+
+ query->sa_query.callback = callback ? ib_sa_path_rec_callback : NULL;
+ query->sa_query.release = ib_sa_path_rec_release;
+ mad->mad_hdr.method = IB_MGMT_METHOD_GET;
+ mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_PATH_REC);
+ mad->sa_hdr.comp_mask = comp_mask;
+
+ ib_pack(path_rec_table, ARRAY_SIZE(path_rec_table), rec, mad->data);
+
+ *sa_query = &query->sa_query;
+
+ ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+ if (ret < 0)
+ goto err2;
+
+ return ret;
+
+err2:
+ *sa_query = NULL;
+ ib_sa_client_put(query->sa_query.client);
+ free_mad(&query->sa_query);
+
+err1:
+ kfree(query);
+ return ret;
+}
+EXPORT_SYMBOL(ib_sa_path_rec_get);
+
+static void ib_sa_service_rec_callback(struct ib_sa_query *sa_query,
+ int status,
+ struct ib_sa_mad *mad)
+{
+ struct ib_sa_service_query *query =
+ container_of(sa_query, struct ib_sa_service_query, sa_query);
+
+ if (mad) {
+ struct ib_sa_service_rec rec;
+
+ ib_unpack(service_rec_table, ARRAY_SIZE(service_rec_table),
+ mad->data, &rec);
+ query->callback(status, &rec, query->context);
+ } else
+ query->callback(status, NULL, query->context);
+}
+
+static void ib_sa_service_rec_release(struct ib_sa_query *sa_query)
+{
+ kfree(container_of(sa_query, struct ib_sa_service_query, sa_query));
+}
+
+/**
+ * ib_sa_service_rec_query - Start Service Record operation
+ * @client:SA client
+ * @device:device to send request on
+ * @port_num: port number to send request on
+ * @method:SA method - should be get, set, or delete
+ * @rec:Service Record to send in request
+ * @comp_mask:component mask to send in request
+ * @timeout_ms:time to wait for response
+ * @gfp_mask:GFP mask to use for internal allocations
+ * @callback:function called when request completes, times out or is
+ * canceled
+ * @context:opaque user context passed to callback
+ * @sa_query:request context, used to cancel request
+ *
+ * Send a Service Record set/get/delete to the SA to register,
+ * unregister or query a service record.
+ * The callback function will be called when the request completes (or
+ * fails); status is 0 for a successful response, -EINTR if the query
+ * is canceled, -ETIMEDOUT if the query timed out, or -EIO if an error
+ * occurred sending the query. The resp parameter of the callback is
+ * only valid if status is 0.
+ *
+ * If the return value of ib_sa_service_rec_query() is negative, it is an
+ * error code. Otherwise it is a request ID that can be used to cancel
+ * the query.
+ */
+int ib_sa_service_rec_query(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num, u8 method,
+ struct ib_sa_service_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_service_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query)
+{
+ struct ib_sa_service_query *query;
+ struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_port *port;
+ struct ib_mad_agent *agent;
+ struct ib_sa_mad *mad;
+ int ret;
+
+ if (!sa_dev)
+ return -ENODEV;
+
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+ agent = port->agent;
+
+ if (method != IB_MGMT_METHOD_GET &&
+ method != IB_MGMT_METHOD_SET &&
+ method != IB_SA_METHOD_DELETE)
+ return -EINVAL;
+
+ query = kmalloc(sizeof *query, gfp_mask);
+ if (!query)
+ return -ENOMEM;
+
+ query->sa_query.port = port;
+ ret = alloc_mad(&query->sa_query, gfp_mask);
+ if (ret)
+ goto err1;
+
+ ib_sa_client_get(client);
+ query->sa_query.client = client;
+ query->callback = callback;
+ query->context = context;
+
+ mad = query->sa_query.mad_buf->mad;
+ init_mad(mad, agent);
+
+ query->sa_query.callback = callback ? ib_sa_service_rec_callback : NULL;
+ query->sa_query.release = ib_sa_service_rec_release;
+ mad->mad_hdr.method = method;
+ mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_SERVICE_REC);
+ mad->sa_hdr.comp_mask = comp_mask;
+
+ ib_pack(service_rec_table, ARRAY_SIZE(service_rec_table),
+ rec, mad->data);
+
+ *sa_query = &query->sa_query;
+
+ ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+ if (ret < 0)
+ goto err2;
+
+ return ret;
+
+err2:
+ *sa_query = NULL;
+ ib_sa_client_put(query->sa_query.client);
+ free_mad(&query->sa_query);
+
+err1:
+ kfree(query);
+ return ret;
+}
+EXPORT_SYMBOL(ib_sa_service_rec_query);
+
+static void ib_sa_mcmember_rec_callback(struct ib_sa_query *sa_query,
+ int status,
+ struct ib_sa_mad *mad)
+{
+ struct ib_sa_mcmember_query *query =
+ container_of(sa_query, struct ib_sa_mcmember_query, sa_query);
+
+ if (mad) {
+ struct ib_sa_mcmember_rec rec;
+
+ ib_unpack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table),
+ mad->data, &rec);
+ query->callback(status, &rec, query->context);
+ } else
+ query->callback(status, NULL, query->context);
+}
+
+static void ib_sa_mcmember_rec_release(struct ib_sa_query *sa_query)
+{
+ kfree(container_of(sa_query, struct ib_sa_mcmember_query, sa_query));
+}
+
+int ib_sa_mcmember_rec_query(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ u8 method,
+ struct ib_sa_mcmember_rec *rec,
+ ib_sa_comp_mask comp_mask,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_mcmember_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query)
+{
+ struct ib_sa_mcmember_query *query;
+ struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_port *port;
+ struct ib_mad_agent *agent;
+ struct ib_sa_mad *mad;
+ int ret;
+
+ if (!sa_dev)
+ return -ENODEV;
+
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+ agent = port->agent;
+
+ query = kmalloc(sizeof *query, gfp_mask);
+ if (!query)
+ return -ENOMEM;
+
+ query->sa_query.port = port;
+ ret = alloc_mad(&query->sa_query, gfp_mask);
+ if (ret)
+ goto err1;
+
+ ib_sa_client_get(client);
+ query->sa_query.client = client;
+ query->callback = callback;
+ query->context = context;
+
+ mad = query->sa_query.mad_buf->mad;
+ init_mad(mad, agent);
+
+ query->sa_query.callback = callback ? ib_sa_mcmember_rec_callback : NULL;
+ query->sa_query.release = ib_sa_mcmember_rec_release;
+ mad->mad_hdr.method = method;
+ mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC);
+ mad->sa_hdr.comp_mask = comp_mask;
+
+ ib_pack(mcmember_rec_table, ARRAY_SIZE(mcmember_rec_table),
+ rec, mad->data);
+
+ *sa_query = &query->sa_query;
+
+ ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+ if (ret < 0)
+ goto err2;
+
+ return ret;
+
+err2:
+ *sa_query = NULL;
+ ib_sa_client_put(query->sa_query.client);
+ free_mad(&query->sa_query);
+
+err1:
+ kfree(query);
+ return ret;
+}
+
+/* Support GuidInfoRecord */
+static void ib_sa_guidinfo_rec_callback(struct ib_sa_query *sa_query,
+ int status,
+ struct ib_sa_mad *mad)
+{
+ struct ib_sa_guidinfo_query *query =
+ container_of(sa_query, struct ib_sa_guidinfo_query, sa_query);
+
+ if (mad) {
+ struct ib_sa_guidinfo_rec rec;
+
+ ib_unpack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table),
+ mad->data, &rec);
+ query->callback(status, &rec, query->context);
+ } else
+ query->callback(status, NULL, query->context);
+}
+
+static void ib_sa_guidinfo_rec_release(struct ib_sa_query *sa_query)
+{
+ kfree(container_of(sa_query, struct ib_sa_guidinfo_query, sa_query));
+}
+
+int ib_sa_guid_info_rec_query(struct ib_sa_client *client,
+ struct ib_device *device, u8 port_num,
+ struct ib_sa_guidinfo_rec *rec,
+ ib_sa_comp_mask comp_mask, u8 method,
+ int timeout_ms, gfp_t gfp_mask,
+ void (*callback)(int status,
+ struct ib_sa_guidinfo_rec *resp,
+ void *context),
+ void *context,
+ struct ib_sa_query **sa_query)
+{
+ struct ib_sa_guidinfo_query *query;
+ struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ struct ib_sa_port *port;
+ struct ib_mad_agent *agent;
+ struct ib_sa_mad *mad;
+ int ret;
+
+ if (!sa_dev)
+ return -ENODEV;
+
+ if (method != IB_MGMT_METHOD_GET &&
+ method != IB_MGMT_METHOD_SET &&
+ method != IB_SA_METHOD_DELETE) {
+ return -EINVAL;
+ }
+
+ port = &sa_dev->port[port_num - sa_dev->start_port];
+ agent = port->agent;
+
+ query = kmalloc(sizeof *query, gfp_mask);
+ if (!query)
+ return -ENOMEM;
+
+ query->sa_query.port = port;
+ ret = alloc_mad(&query->sa_query, gfp_mask);
+ if (ret)
+ goto err1;
+
+ ib_sa_client_get(client);
+ query->sa_query.client = client;
+ query->callback = callback;
+ query->context = context;
+
+ mad = query->sa_query.mad_buf->mad;
+ init_mad(mad, agent);
+
+ query->sa_query.callback = callback ? ib_sa_guidinfo_rec_callback : NULL;
+ query->sa_query.release = ib_sa_guidinfo_rec_release;
+
+ mad->mad_hdr.method = method;
+ mad->mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_GUID_INFO_REC);
+ mad->sa_hdr.comp_mask = comp_mask;
+
+ ib_pack(guidinfo_rec_table, ARRAY_SIZE(guidinfo_rec_table), rec,
+ mad->data);
+
+ *sa_query = &query->sa_query;
+
+ ret = send_mad(&query->sa_query, timeout_ms, gfp_mask);
+ if (ret < 0)
+ goto err2;
+
+ return ret;
+
+err2:
+ *sa_query = NULL;
+ ib_sa_client_put(query->sa_query.client);
+ free_mad(&query->sa_query);
+
+err1:
+ kfree(query);
+ return ret;
+}
+EXPORT_SYMBOL(ib_sa_guid_info_rec_query);
+
+static void send_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ struct ib_sa_query *query = mad_send_wc->send_buf->context[0];
+ unsigned long flags;
+
+ if (query->callback)
+ switch (mad_send_wc->status) {
+ case IB_WC_SUCCESS:
+			/* no callback here -- the reply was already delivered via recv_handler */
+ break;
+ case IB_WC_RESP_TIMEOUT_ERR:
+ query->callback(query, -ETIMEDOUT, NULL);
+ break;
+ case IB_WC_WR_FLUSH_ERR:
+ query->callback(query, -EINTR, NULL);
+ break;
+ default:
+ query->callback(query, -EIO, NULL);
+ break;
+ }
+
+ spin_lock_irqsave(&idr_lock, flags);
+ idr_remove(&query_idr, query->id);
+ spin_unlock_irqrestore(&idr_lock, flags);
+
+ free_mad(query);
+ ib_sa_client_put(query->client);
+ query->release(query);
+}
+
+static void recv_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_sa_query *query;
+ struct ib_mad_send_buf *mad_buf;
+
+ mad_buf = (void *) (unsigned long) mad_recv_wc->wc->wr_id;
+ query = mad_buf->context[0];
+
+ if (query->callback) {
+ if (mad_recv_wc->wc->status == IB_WC_SUCCESS)
+ query->callback(query,
+ mad_recv_wc->recv_buf.mad->mad_hdr.status ?
+ -EINVAL : 0,
+ (struct ib_sa_mad *) mad_recv_wc->recv_buf.mad);
+ else
+ query->callback(query, -EIO, NULL);
+ }
+
+ ib_free_recv_mad(mad_recv_wc);
+}
+
+static void ib_sa_add_one(struct ib_device *device)
+{
+ struct ib_sa_device *sa_dev;
+ int s, e, i;
+
+ if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+ return;
+
+ if (device->node_type == RDMA_NODE_IB_SWITCH)
+ s = e = 0;
+ else {
+ s = 1;
+ e = device->phys_port_cnt;
+ }
+
+ sa_dev = kzalloc(sizeof *sa_dev +
+ (e - s + 1) * sizeof (struct ib_sa_port),
+ GFP_KERNEL);
+ if (!sa_dev)
+ return;
+
+ sa_dev->start_port = s;
+ sa_dev->end_port = e;
+
+ for (i = 0; i <= e - s; ++i) {
+ spin_lock_init(&sa_dev->port[i].ah_lock);
+ if (rdma_port_get_link_layer(device, i + 1) != IB_LINK_LAYER_INFINIBAND)
+ continue;
+
+ sa_dev->port[i].sm_ah = NULL;
+ sa_dev->port[i].port_num = i + s;
+
+ sa_dev->port[i].agent =
+ ib_register_mad_agent(device, i + s, IB_QPT_GSI,
+ NULL, 0, send_handler,
+ recv_handler, sa_dev, 0);
+ if (IS_ERR(sa_dev->port[i].agent))
+ goto err;
+
+ INIT_WORK(&sa_dev->port[i].update_task, update_sm_ah);
+ }
+
+ ib_set_client_data(device, &sa_client, sa_dev);
+
+ /*
+ * We register our event handler after everything is set up,
+ * and then update our cached info after the event handler is
+ * registered to avoid any problems if a port changes state
+ * during our initialization.
+ */
+
+ INIT_IB_EVENT_HANDLER(&sa_dev->event_handler, device, ib_sa_event);
+ if (ib_register_event_handler(&sa_dev->event_handler))
+ goto err;
+
+ for (i = 0; i <= e - s; ++i)
+ if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND)
+ update_sm_ah(&sa_dev->port[i].update_task);
+
+ return;
+
+err:
+ while (--i >= 0)
+ if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND)
+ ib_unregister_mad_agent(sa_dev->port[i].agent);
+
+ kfree(sa_dev);
+
+ return;
+}
+
+static void ib_sa_remove_one(struct ib_device *device)
+{
+ struct ib_sa_device *sa_dev = ib_get_client_data(device, &sa_client);
+ int i;
+
+ if (!sa_dev)
+ return;
+
+ ib_unregister_event_handler(&sa_dev->event_handler);
+
+ flush_workqueue(ib_wq);
+
+ for (i = 0; i <= sa_dev->end_port - sa_dev->start_port; ++i) {
+ if (rdma_port_get_link_layer(device, i + 1) == IB_LINK_LAYER_INFINIBAND) {
+ ib_unregister_mad_agent(sa_dev->port[i].agent);
+ if (sa_dev->port[i].sm_ah)
+ kref_put(&sa_dev->port[i].sm_ah->ref, free_sm_ah);
+ }
+ }
+
+ kfree(sa_dev);
+}
+
+static int __init ib_sa_init(void)
+{
+ int ret;
+
+ get_random_bytes(&tid, sizeof tid);
+
+ ret = ib_register_client(&sa_client);
+ if (ret) {
+ printk(KERN_ERR "Couldn't register ib_sa client\n");
+ goto err1;
+ }
+
+ ret = mcast_init();
+ if (ret) {
+ printk(KERN_ERR "Couldn't initialize multicast handling\n");
+ goto err2;
+ }
+
+ return 0;
+err2:
+ ib_unregister_client(&sa_client);
+err1:
+ return ret;
+}
+
+static void __exit ib_sa_cleanup(void)
+{
+ mcast_cleanup();
+ ib_unregister_client(&sa_client);
+ idr_destroy(&query_idr);
+}
+
+module_init(ib_sa_init);
+module_exit(ib_sa_cleanup);
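
To make the exported query API concrete, here is a hedged caller sketch. It assumes a client already registered with ib_sa_register_client() and a device/port known to the caller; the function and variable names are invented, while the comp-mask bits, the callback signature and ib_sa_path_rec_get() itself come from this file and <rdma/ib_sa.h>. A non-negative return is the query id (usable with ib_sa_cancel_query()); a real callback would typically feed its result into ib_init_ah_from_path().

    #include <rdma/ib_sa.h>

    static struct ib_sa_client example_sa_client;   /* registered elsewhere */
    static struct ib_sa_query *example_query;

    static void example_path_done(int status, struct ib_sa_path_rec *resp,
                                  void *context)
    {
            if (!status)
                    pr_info("path resolved, dlid 0x%x\n", be16_to_cpu(resp->dlid));
    }

    static int example_resolve_path(struct ib_device *device, u8 port_num,
                                    union ib_gid *sgid, union ib_gid *dgid)
    {
            struct ib_sa_path_rec rec = {};
            int ret;

            rec.sgid      = *sgid;
            rec.dgid      = *dgid;
            rec.numb_path = 1;

            ret = ib_sa_path_rec_get(&example_sa_client, device, port_num, &rec,
                                     IB_SA_PATH_REC_SGID | IB_SA_PATH_REC_DGID |
                                     IB_SA_PATH_REC_NUMB_PATH,
                                     1000, GFP_KERNEL,
                                     example_path_done, NULL, &example_query);
            return ret < 0 ? ret : 0;   /* >= 0 is the query id */
    }
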
diff --git a/drivers/infiniband/core/smi.c b/drivers/infiniband/core/smi.c
new file mode 100644
index 000000000..5855e4405
--- /dev/null
+++ b/drivers/infiniband/core/smi.c
@@ -0,0 +1,253 @@
+/*
+ * Copyright (c) 2004, 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004, 2005 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004, 2005 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <rdma/ib_smi.h>
+#include "smi.h"
+
+/*
+ * Fix up a directed route SMP for sending.
+ * Return IB_SMI_DISCARD if the SMP should be discarded.
+ */
+enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,
+ u8 node_type, int port_num)
+{
+ u8 hop_ptr, hop_cnt;
+
+ hop_ptr = smp->hop_ptr;
+ hop_cnt = smp->hop_cnt;
+
+ /* See section 14.2.2.2, Vol 1 IB spec */
+ /* C14-6 -- valid hop_cnt values are from 0 to 63 */
+ if (hop_cnt >= IB_SMP_MAX_PATH_HOPS)
+ return IB_SMI_DISCARD;
+
+ if (!ib_get_smp_direction(smp)) {
+ /* C14-9:1 */
+ if (hop_cnt && hop_ptr == 0) {
+ smp->hop_ptr++;
+ return (smp->initial_path[smp->hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:2 */
+ if (hop_ptr && hop_ptr < hop_cnt) {
+ if (node_type != RDMA_NODE_IB_SWITCH)
+ return IB_SMI_DISCARD;
+
+ /* smp->return_path set when received */
+ smp->hop_ptr++;
+ return (smp->initial_path[smp->hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:3 -- We're at the end of the DR segment of path */
+ if (hop_ptr == hop_cnt) {
+ /* smp->return_path set when received */
+ smp->hop_ptr++;
+ return (node_type == RDMA_NODE_IB_SWITCH ||
+ smp->dr_dlid == IB_LID_PERMISSIVE ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+ /* C14-9:5 -- Fail unreasonable hop pointer */
+ return (hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+
+ } else {
+ /* C14-13:1 */
+ if (hop_cnt && hop_ptr == hop_cnt + 1) {
+ smp->hop_ptr--;
+ return (smp->return_path[smp->hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:2 */
+ if (2 <= hop_ptr && hop_ptr <= hop_cnt) {
+ if (node_type != RDMA_NODE_IB_SWITCH)
+ return IB_SMI_DISCARD;
+
+ smp->hop_ptr--;
+ return (smp->return_path[smp->hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:3 -- at the end of the DR segment of path */
+ if (hop_ptr == 1) {
+ smp->hop_ptr--;
+ /* C14-13:3 -- SMPs destined for SM shouldn't be here */
+ return (node_type == RDMA_NODE_IB_SWITCH ||
+ smp->dr_slid == IB_LID_PERMISSIVE ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:4 -- hop_ptr = 0 -> should have gone to SM */
+ if (hop_ptr == 0)
+ return IB_SMI_HANDLE;
+
+ /* C14-13:5 -- Check for unreasonable hop pointer */
+ return IB_SMI_DISCARD;
+ }
+}
+
+/*
+ * Adjust information for a received SMP.
+ * Return IB_SMI_DISCARD if the SMP should be dropped.
+ */
+enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type,
+ int port_num, int phys_port_cnt)
+{
+ u8 hop_ptr, hop_cnt;
+
+ hop_ptr = smp->hop_ptr;
+ hop_cnt = smp->hop_cnt;
+
+ /* See section 14.2.2.2, Vol 1 IB spec */
+ /* C14-6 -- valid hop_cnt values are from 0 to 63 */
+ if (hop_cnt >= IB_SMP_MAX_PATH_HOPS)
+ return IB_SMI_DISCARD;
+
+ if (!ib_get_smp_direction(smp)) {
+ /* C14-9:1 -- sender should have incremented hop_ptr */
+ if (hop_cnt && hop_ptr == 0)
+ return IB_SMI_DISCARD;
+
+ /* C14-9:2 -- intermediate hop */
+ if (hop_ptr && hop_ptr < hop_cnt) {
+ if (node_type != RDMA_NODE_IB_SWITCH)
+ return IB_SMI_DISCARD;
+
+ smp->return_path[hop_ptr] = port_num;
+ /* smp->hop_ptr updated when sending */
+ return (smp->initial_path[hop_ptr+1] <= phys_port_cnt ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:3 -- We're at the end of the DR segment of path */
+ if (hop_ptr == hop_cnt) {
+ if (hop_cnt)
+ smp->return_path[hop_ptr] = port_num;
+ /* smp->hop_ptr updated when sending */
+
+ return (node_type == RDMA_NODE_IB_SWITCH ||
+ smp->dr_dlid == IB_LID_PERMISSIVE ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+ /* C14-9:5 -- fail unreasonable hop pointer */
+ return (hop_ptr == hop_cnt + 1 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+
+ } else {
+
+ /* C14-13:1 */
+ if (hop_cnt && hop_ptr == hop_cnt + 1) {
+ smp->hop_ptr--;
+ return (smp->return_path[smp->hop_ptr] ==
+ port_num ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:2 */
+ if (2 <= hop_ptr && hop_ptr <= hop_cnt) {
+ if (node_type != RDMA_NODE_IB_SWITCH)
+ return IB_SMI_DISCARD;
+
+ /* smp->hop_ptr updated when sending */
+ return (smp->return_path[hop_ptr-1] <= phys_port_cnt ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:3 -- We're at the end of the DR segment of path */
+ if (hop_ptr == 1) {
+ if (smp->dr_slid == IB_LID_PERMISSIVE) {
+ /* giving SMP to SM - update hop_ptr */
+ smp->hop_ptr--;
+ return IB_SMI_HANDLE;
+ }
+ /* smp->hop_ptr updated when sending */
+ return (node_type == RDMA_NODE_IB_SWITCH ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+
+ /* C14-13:4 -- hop_ptr = 0 -> give to SM */
+ /* C14-13:5 -- Check for unreasonable hop pointer */
+ return (hop_ptr == 0 ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+ }
+}
+
+enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp)
+{
+ u8 hop_ptr, hop_cnt;
+
+ hop_ptr = smp->hop_ptr;
+ hop_cnt = smp->hop_cnt;
+
+ if (!ib_get_smp_direction(smp)) {
+ /* C14-9:2 -- intermediate hop */
+ if (hop_ptr && hop_ptr < hop_cnt)
+ return IB_SMI_FORWARD;
+
+ /* C14-9:3 -- at the end of the DR segment of path */
+ if (hop_ptr == hop_cnt)
+ return (smp->dr_dlid == IB_LID_PERMISSIVE ?
+ IB_SMI_SEND : IB_SMI_LOCAL);
+
+ /* C14-9:4 -- hop_ptr = hop_cnt + 1 -> give to SMA/SM */
+ if (hop_ptr == hop_cnt + 1)
+ return IB_SMI_SEND;
+ } else {
+ /* C14-13:2 -- intermediate hop */
+ if (2 <= hop_ptr && hop_ptr <= hop_cnt)
+ return IB_SMI_FORWARD;
+
+ /* C14-13:3 -- at the end of the DR segment of path */
+ if (hop_ptr == 1)
+ return (smp->dr_slid != IB_LID_PERMISSIVE ?
+ IB_SMI_SEND : IB_SMI_LOCAL);
+ }
+ return IB_SMI_LOCAL;
+}
+
+/*
+ * Return the forwarding port number from initial_path for outgoing SMP and
+ * from return_path for returning SMP
+ */
+int smi_get_fwd_port(struct ib_smp *smp)
+{
+ return (!ib_get_smp_direction(smp) ? smp->initial_path[smp->hop_ptr+1] :
+ smp->return_path[smp->hop_ptr-1]);
+}
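
As a rough sketch of how these helpers fit together on the receive side (a simplification for illustration, not the actual dispatch in mad.c): first validate and adjust the DR SMP, then ask whether it terminates here or must travel on via smi_get_fwd_port().

    static bool example_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type,
                                           int port_num, int phys_port_cnt)
    {
            if (smi_handle_dr_smp_recv(smp, node_type, port_num,
                                       phys_port_cnt) == IB_SMI_DISCARD)
                    return false;               /* malformed: drop it */

            switch (smi_check_forward_dr_smp(smp)) {
            case IB_SMI_LOCAL:
                    /* terminates here: hand the SMP to the local SM/SMA */
                    return true;
            case IB_SMI_SEND:
                    /* end of the DR segment: queue the SMP for sending */
                    return true;
            case IB_SMI_FORWARD:
                    /* switch case: forward out of port smi_get_fwd_port(smp) */
                    return true;
            }
            return false;
    }
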
diff --git a/drivers/infiniband/core/smi.h b/drivers/infiniband/core/smi.h
new file mode 100644
index 000000000..aff96bac4
--- /dev/null
+++ b/drivers/infiniband/core/smi.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004-2007 Voltaire Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef __SMI_H_
+#define __SMI_H_
+
+#include <rdma/ib_smi.h>
+
+enum smi_action {
+ IB_SMI_DISCARD,
+ IB_SMI_HANDLE
+};
+
+enum smi_forward_action {
+ IB_SMI_LOCAL, /* SMP should be completed up the stack */
+ IB_SMI_SEND, /* received DR SMP should be forwarded to the send queue */
+ IB_SMI_FORWARD /* SMP should be forwarded (for switches only) */
+};
+
+enum smi_action smi_handle_dr_smp_recv(struct ib_smp *smp, u8 node_type,
+ int port_num, int phys_port_cnt);
+int smi_get_fwd_port(struct ib_smp *smp);
+extern enum smi_forward_action smi_check_forward_dr_smp(struct ib_smp *smp);
+extern enum smi_action smi_handle_dr_smp_send(struct ib_smp *smp,
+ u8 node_type, int port_num);
+
+/*
+ * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
+ * via process_mad
+ */
+static inline enum smi_action smi_check_local_smp(struct ib_smp *smp,
+ struct ib_device *device)
+{
+ /* C14-9:3 -- We're at the end of the DR segment of path */
+ /* C14-9:4 -- Hop Pointer = Hop Count + 1 -> give to SMA/SM */
+ return ((device->process_mad &&
+ !ib_get_smp_direction(smp) &&
+ (smp->hop_ptr == smp->hop_cnt + 1)) ?
+ IB_SMI_HANDLE : IB_SMI_DISCARD);
+}
+
+/*
+ * Return IB_SMI_HANDLE if the SMP should be handled by the local SMA/SM
+ * via process_mad
+ */
+static inline enum smi_action smi_check_local_returning_smp(struct ib_smp *smp,
+ struct ib_device *device)
+{
+ /* C14-13:3 -- We're at the end of the DR segment of path */
+ /* C14-13:4 -- Hop Pointer == 0 -> give to SM */
+ return ((device->process_mad &&
+ ib_get_smp_direction(smp) &&
+ !smp->hop_ptr) ? IB_SMI_HANDLE : IB_SMI_DISCARD);
+}
+
+#endif /* __SMI_H_ */
diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c
new file mode 100644
index 000000000..cbd0383f6
--- /dev/null
+++ b/drivers/infiniband/core/sysfs.c
@@ -0,0 +1,922 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "core_priv.h"
+
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/string.h>
+
+#include <rdma/ib_mad.h>
+
+struct ib_port {
+ struct kobject kobj;
+ struct ib_device *ibdev;
+ struct attribute_group gid_group;
+ struct attribute_group pkey_group;
+ u8 port_num;
+};
+
+struct port_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct ib_port *, struct port_attribute *, char *buf);
+ ssize_t (*store)(struct ib_port *, struct port_attribute *,
+ const char *buf, size_t count);
+};
+
+#define PORT_ATTR(_name, _mode, _show, _store) \
+struct port_attribute port_attr_##_name = __ATTR(_name, _mode, _show, _store)
+
+#define PORT_ATTR_RO(_name) \
+struct port_attribute port_attr_##_name = __ATTR_RO(_name)
+
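+/*
+ * One entry of the per-port gid, pkey or counter tables; the index
+ * selects which table entry (or which encoded PMA counter) show() reports.
+ */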
+struct port_table_attribute {
+ struct port_attribute attr;
+ char name[8];
+ int index;
+};
+
+static ssize_t port_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct port_attribute *port_attr =
+ container_of(attr, struct port_attribute, attr);
+ struct ib_port *p = container_of(kobj, struct ib_port, kobj);
+
+ if (!port_attr->show)
+ return -EIO;
+
+ return port_attr->show(p, port_attr, buf);
+}
+
+static const struct sysfs_ops port_sysfs_ops = {
+ .show = port_attr_show
+};
+
+static ssize_t state_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ static const char *state_name[] = {
+ [IB_PORT_NOP] = "NOP",
+ [IB_PORT_DOWN] = "DOWN",
+ [IB_PORT_INIT] = "INIT",
+ [IB_PORT_ARMED] = "ARMED",
+ [IB_PORT_ACTIVE] = "ACTIVE",
+ [IB_PORT_ACTIVE_DEFER] = "ACTIVE_DEFER"
+ };
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "%d: %s\n", attr.state,
+ attr.state >= 0 && attr.state < ARRAY_SIZE(state_name) ?
+ state_name[attr.state] : "UNKNOWN");
+}
+
+static ssize_t lid_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "0x%x\n", attr.lid);
+}
+
+static ssize_t lid_mask_count_show(struct ib_port *p,
+ struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "%d\n", attr.lmc);
+}
+
+static ssize_t sm_lid_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "0x%x\n", attr.sm_lid);
+}
+
+static ssize_t sm_sl_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "%d\n", attr.sm_sl);
+}
+
+static ssize_t cap_mask_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "0x%08x\n", attr.port_cap_flags);
+}
+
+static ssize_t rate_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+ char *speed = "";
+ int rate; /* in deci-Gb/sec */
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ switch (attr.active_speed) {
+ case IB_SPEED_DDR:
+ speed = " DDR";
+ rate = 50;
+ break;
+ case IB_SPEED_QDR:
+ speed = " QDR";
+ rate = 100;
+ break;
+ case IB_SPEED_FDR10:
+ speed = " FDR10";
+ rate = 100;
+ break;
+ case IB_SPEED_FDR:
+ speed = " FDR";
+ rate = 140;
+ break;
+ case IB_SPEED_EDR:
+ speed = " EDR";
+ rate = 250;
+ break;
+ case IB_SPEED_SDR:
+ default: /* default to SDR for invalid rates */
+ rate = 25;
+ break;
+ }
+
+ rate *= ib_width_enum_to_int(attr.active_width);
+ if (rate < 0)
+ return -EINVAL;
+
+ return sprintf(buf, "%d%s Gb/sec (%dX%s)\n",
+ rate / 10, rate % 10 ? ".5" : "",
+ ib_width_enum_to_int(attr.active_width), speed);
+}
+
+static ssize_t phys_state_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ struct ib_port_attr attr;
+
+ ssize_t ret;
+
+ ret = ib_query_port(p->ibdev, p->port_num, &attr);
+ if (ret)
+ return ret;
+
+ switch (attr.phys_state) {
+ case 1: return sprintf(buf, "1: Sleep\n");
+ case 2: return sprintf(buf, "2: Polling\n");
+ case 3: return sprintf(buf, "3: Disabled\n");
+ case 4: return sprintf(buf, "4: PortConfigurationTraining\n");
+ case 5: return sprintf(buf, "5: LinkUp\n");
+ case 6: return sprintf(buf, "6: LinkErrorRecovery\n");
+ case 7: return sprintf(buf, "7: Phy Test\n");
+ default: return sprintf(buf, "%d: <unknown>\n", attr.phys_state);
+ }
+}
+
+static ssize_t link_layer_show(struct ib_port *p, struct port_attribute *unused,
+ char *buf)
+{
+ switch (rdma_port_get_link_layer(p->ibdev, p->port_num)) {
+ case IB_LINK_LAYER_INFINIBAND:
+ return sprintf(buf, "%s\n", "InfiniBand");
+ case IB_LINK_LAYER_ETHERNET:
+ return sprintf(buf, "%s\n", "Ethernet");
+ default:
+ return sprintf(buf, "%s\n", "Unknown");
+ }
+}
+
+static PORT_ATTR_RO(state);
+static PORT_ATTR_RO(lid);
+static PORT_ATTR_RO(lid_mask_count);
+static PORT_ATTR_RO(sm_lid);
+static PORT_ATTR_RO(sm_sl);
+static PORT_ATTR_RO(cap_mask);
+static PORT_ATTR_RO(rate);
+static PORT_ATTR_RO(phys_state);
+static PORT_ATTR_RO(link_layer);
+
+static struct attribute *port_default_attrs[] = {
+ &port_attr_state.attr,
+ &port_attr_lid.attr,
+ &port_attr_lid_mask_count.attr,
+ &port_attr_sm_lid.attr,
+ &port_attr_sm_sl.attr,
+ &port_attr_cap_mask.attr,
+ &port_attr_rate.attr,
+ &port_attr_phys_state.attr,
+ &port_attr_link_layer.attr,
+ NULL
+};
+
+static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr,
+ char *buf)
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ union ib_gid gid;
+ ssize_t ret;
+
+ ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "%pI6\n", gid.raw);
+}
+
+static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr,
+ char *buf)
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ u16 pkey;
+ ssize_t ret;
+
+ ret = ib_query_pkey(p->ibdev, p->port_num, tab_attr->index, &pkey);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "0x%04x\n", pkey);
+}
+
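+/*
+ * Each PMA counter attribute encodes its PortCounters bit offset, field
+ * width and counter number into a single index, which show_pma_counter()
+ * decodes again below.
+ */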
+#define PORT_PMA_ATTR(_name, _counter, _width, _offset) \
+struct port_table_attribute port_pma_attr_##_name = { \
+ .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \
+ .index = (_offset) | ((_width) << 16) | ((_counter) << 24) \
+}
+
+static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr,
+ char *buf)
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ int offset = tab_attr->index & 0xffff;
+ int width = (tab_attr->index >> 16) & 0xff;
+ struct ib_mad *in_mad = NULL;
+ struct ib_mad *out_mad = NULL;
+ ssize_t ret;
+
+ if (!p->ibdev->process_mad)
+ return sprintf(buf, "N/A (no PMA)\n");
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ in_mad->mad_hdr.base_version = 1;
+ in_mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_PERF_MGMT;
+ in_mad->mad_hdr.class_version = 1;
+ in_mad->mad_hdr.method = IB_MGMT_METHOD_GET;
+ in_mad->mad_hdr.attr_id = cpu_to_be16(0x12); /* PortCounters */
+
+ in_mad->data[41] = p->port_num; /* PortSelect field */
+
+ if ((p->ibdev->process_mad(p->ibdev, IB_MAD_IGNORE_MKEY,
+ p->port_num, NULL, NULL, in_mad, out_mad) &
+ (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) !=
+ (IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
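+ /* counter fields are given as bit offsets into the MAD data, starting at byte 40 */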
+ switch (width) {
+ case 4:
+ ret = sprintf(buf, "%u\n", (out_mad->data[40 + offset / 8] >>
+ (4 - (offset % 8))) & 0xf);
+ break;
+ case 8:
+ ret = sprintf(buf, "%u\n", out_mad->data[40 + offset / 8]);
+ break;
+ case 16:
+ ret = sprintf(buf, "%u\n",
+ be16_to_cpup((__be16 *)(out_mad->data + 40 + offset / 8)));
+ break;
+ case 32:
+ ret = sprintf(buf, "%u\n",
+ be32_to_cpup((__be32 *)(out_mad->data + 40 + offset / 8)));
+ break;
+ default:
+ ret = 0;
+ }
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+
+ return ret;
+}
+
+static PORT_PMA_ATTR(symbol_error , 0, 16, 32);
+static PORT_PMA_ATTR(link_error_recovery , 1, 8, 48);
+static PORT_PMA_ATTR(link_downed , 2, 8, 56);
+static PORT_PMA_ATTR(port_rcv_errors , 3, 16, 64);
+static PORT_PMA_ATTR(port_rcv_remote_physical_errors, 4, 16, 80);
+static PORT_PMA_ATTR(port_rcv_switch_relay_errors , 5, 16, 96);
+static PORT_PMA_ATTR(port_xmit_discards , 6, 16, 112);
+static PORT_PMA_ATTR(port_xmit_constraint_errors , 7, 8, 128);
+static PORT_PMA_ATTR(port_rcv_constraint_errors , 8, 8, 136);
+static PORT_PMA_ATTR(local_link_integrity_errors , 9, 4, 152);
+static PORT_PMA_ATTR(excessive_buffer_overrun_errors, 10, 4, 156);
+static PORT_PMA_ATTR(VL15_dropped , 11, 16, 176);
+static PORT_PMA_ATTR(port_xmit_data , 12, 32, 192);
+static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224);
+static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256);
+static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288);
+
+static struct attribute *pma_attrs[] = {
+ &port_pma_attr_symbol_error.attr.attr,
+ &port_pma_attr_link_error_recovery.attr.attr,
+ &port_pma_attr_link_downed.attr.attr,
+ &port_pma_attr_port_rcv_errors.attr.attr,
+ &port_pma_attr_port_rcv_remote_physical_errors.attr.attr,
+ &port_pma_attr_port_rcv_switch_relay_errors.attr.attr,
+ &port_pma_attr_port_xmit_discards.attr.attr,
+ &port_pma_attr_port_xmit_constraint_errors.attr.attr,
+ &port_pma_attr_port_rcv_constraint_errors.attr.attr,
+ &port_pma_attr_local_link_integrity_errors.attr.attr,
+ &port_pma_attr_excessive_buffer_overrun_errors.attr.attr,
+ &port_pma_attr_VL15_dropped.attr.attr,
+ &port_pma_attr_port_xmit_data.attr.attr,
+ &port_pma_attr_port_rcv_data.attr.attr,
+ &port_pma_attr_port_xmit_packets.attr.attr,
+ &port_pma_attr_port_rcv_packets.attr.attr,
+ NULL
+};
+
+static struct attribute_group pma_group = {
+ .name = "counters",
+ .attrs = pma_attrs
+};
+
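+/* Free the dynamically allocated gid and pkey attribute arrays. */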
+static void ib_port_release(struct kobject *kobj)
+{
+ struct ib_port *p = container_of(kobj, struct ib_port, kobj);
+ struct attribute *a;
+ int i;
+
+ if (p->gid_group.attrs) {
+ for (i = 0; (a = p->gid_group.attrs[i]); ++i)
+ kfree(a);
+
+ kfree(p->gid_group.attrs);
+ }
+
+ if (p->pkey_group.attrs) {
+ for (i = 0; (a = p->pkey_group.attrs[i]); ++i)
+ kfree(a);
+
+ kfree(p->pkey_group.attrs);
+ }
+
+ kfree(p);
+}
+
+static struct kobj_type port_type = {
+ .release = ib_port_release,
+ .sysfs_ops = &port_sysfs_ops,
+ .default_attrs = port_default_attrs
+};
+
+static void ib_device_release(struct device *device)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ kfree(dev);
+}
+
+static int ib_device_uevent(struct device *device,
+ struct kobj_uevent_env *env)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ if (add_uevent_var(env, "NAME=%s", dev->name))
+ return -ENOMEM;
+
+ /*
+ * It would be nice to pass the node GUID with the event...
+ */
+
+ return 0;
+}
+
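+/*
+ * Allocate a NULL-terminated array of attributes named "0".."len - 1",
+ * one per table entry, all backed by the same show() routine.
+ */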
+static struct attribute **
+alloc_group_attrs(ssize_t (*show)(struct ib_port *,
+ struct port_attribute *, char *buf),
+ int len)
+{
+ struct attribute **tab_attr;
+ struct port_table_attribute *element;
+ int i;
+
+ tab_attr = kcalloc(1 + len, sizeof(struct attribute *), GFP_KERNEL);
+ if (!tab_attr)
+ return NULL;
+
+ for (i = 0; i < len; i++) {
+ element = kzalloc(sizeof(struct port_table_attribute),
+ GFP_KERNEL);
+ if (!element)
+ goto err;
+
+ if (snprintf(element->name, sizeof(element->name),
+ "%d", i) >= sizeof(element->name)) {
+ kfree(element);
+ goto err;
+ }
+
+ element->attr.attr.name = element->name;
+ element->attr.attr.mode = S_IRUGO;
+ element->attr.show = show;
+ element->index = i;
+ sysfs_attr_init(&element->attr.attr);
+
+ tab_attr[i] = &element->attr.attr;
+ }
+
+ return tab_attr;
+
+err:
+ while (--i >= 0)
+ kfree(tab_attr[i]);
+ kfree(tab_attr);
+ return NULL;
+}
+
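+/*
+ * Create the sysfs directory for one port, including its counters, gids
+ * and pkeys groups; on failure everything added so far is unwound in
+ * reverse order.
+ */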
+static int add_port(struct ib_device *device, int port_num,
+ int (*port_callback)(struct ib_device *,
+ u8, struct kobject *))
+{
+ struct ib_port *p;
+ struct ib_port_attr attr;
+ int i;
+ int ret;
+
+ ret = ib_query_port(device, port_num, &attr);
+ if (ret)
+ return ret;
+
+ p = kzalloc(sizeof *p, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ p->ibdev = device;
+ p->port_num = port_num;
+
+ ret = kobject_init_and_add(&p->kobj, &port_type,
+ device->ports_parent,
+ "%d", port_num);
+ if (ret) {
+ kfree(p);
+ return ret;
+ }
+
+ ret = sysfs_create_group(&p->kobj, &pma_group);
+ if (ret)
+ goto err_put;
+
+ p->gid_group.name = "gids";
+ p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len);
+ if (!p->gid_group.attrs) {
+ ret = -ENOMEM;
+ goto err_remove_pma;
+ }
+
+ ret = sysfs_create_group(&p->kobj, &p->gid_group);
+ if (ret)
+ goto err_free_gid;
+
+ p->pkey_group.name = "pkeys";
+ p->pkey_group.attrs = alloc_group_attrs(show_port_pkey,
+ attr.pkey_tbl_len);
+ if (!p->pkey_group.attrs) {
+ ret = -ENOMEM;
+ goto err_remove_gid;
+ }
+
+ ret = sysfs_create_group(&p->kobj, &p->pkey_group);
+ if (ret)
+ goto err_free_pkey;
+
+ if (port_callback) {
+ ret = port_callback(device, port_num, &p->kobj);
+ if (ret)
+ goto err_remove_pkey;
+ }
+
+ list_add_tail(&p->kobj.entry, &device->port_list);
+
+ kobject_uevent(&p->kobj, KOBJ_ADD);
+ return 0;
+
+err_remove_pkey:
+ sysfs_remove_group(&p->kobj, &p->pkey_group);
+
+err_free_pkey:
+ for (i = 0; i < attr.pkey_tbl_len; ++i)
+ kfree(p->pkey_group.attrs[i]);
+
+ kfree(p->pkey_group.attrs);
+ p->pkey_group.attrs = NULL;
+
+err_remove_gid:
+ sysfs_remove_group(&p->kobj, &p->gid_group);
+
+err_free_gid:
+ for (i = 0; i < attr.gid_tbl_len; ++i)
+ kfree(p->gid_group.attrs[i]);
+
+ kfree(p->gid_group.attrs);
+ p->gid_group.attrs = NULL;
+
+err_remove_pma:
+ sysfs_remove_group(&p->kobj, &pma_group);
+
+err_put:
+ kobject_put(&p->kobj);
+ return ret;
+}
+
+static ssize_t show_node_type(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ switch (dev->node_type) {
+ case RDMA_NODE_IB_CA: return sprintf(buf, "%d: CA\n", dev->node_type);
+ case RDMA_NODE_RNIC: return sprintf(buf, "%d: RNIC\n", dev->node_type);
+ case RDMA_NODE_USNIC: return sprintf(buf, "%d: usNIC\n", dev->node_type);
+ case RDMA_NODE_USNIC_UDP: return sprintf(buf, "%d: usNIC UDP\n", dev->node_type);
+ case RDMA_NODE_IB_SWITCH: return sprintf(buf, "%d: switch\n", dev->node_type);
+ case RDMA_NODE_IB_ROUTER: return sprintf(buf, "%d: router\n", dev->node_type);
+ default: return sprintf(buf, "%d: <unknown>\n", dev->node_type);
+ }
+}
+
+static ssize_t show_sys_image_guid(struct device *device,
+ struct device_attribute *dev_attr, char *buf)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+ struct ib_device_attr attr;
+ ssize_t ret;
+
+ ret = ib_query_device(dev, &attr);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "%04x:%04x:%04x:%04x\n",
+ be16_to_cpu(((__be16 *) &attr.sys_image_guid)[0]),
+ be16_to_cpu(((__be16 *) &attr.sys_image_guid)[1]),
+ be16_to_cpu(((__be16 *) &attr.sys_image_guid)[2]),
+ be16_to_cpu(((__be16 *) &attr.sys_image_guid)[3]));
+}
+
+static ssize_t show_node_guid(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ return sprintf(buf, "%04x:%04x:%04x:%04x\n",
+ be16_to_cpu(((__be16 *) &dev->node_guid)[0]),
+ be16_to_cpu(((__be16 *) &dev->node_guid)[1]),
+ be16_to_cpu(((__be16 *) &dev->node_guid)[2]),
+ be16_to_cpu(((__be16 *) &dev->node_guid)[3]));
+}
+
+static ssize_t show_node_desc(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+
+ return sprintf(buf, "%.64s\n", dev->node_desc);
+}
+
+static ssize_t set_node_desc(struct device *device,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+ struct ib_device_modify desc = {};
+ int ret;
+
+ if (!dev->modify_device)
+ return -EIO;
+
+ memcpy(desc.node_desc, buf, min_t(int, count, 64));
+ ret = ib_modify_device(dev, IB_DEVICE_MODIFY_NODE_DESC, &desc);
+ if (ret)
+ return ret;
+
+ return count;
+}
+
+static DEVICE_ATTR(node_type, S_IRUGO, show_node_type, NULL);
+static DEVICE_ATTR(sys_image_guid, S_IRUGO, show_sys_image_guid, NULL);
+static DEVICE_ATTR(node_guid, S_IRUGO, show_node_guid, NULL);
+static DEVICE_ATTR(node_desc, S_IRUGO | S_IWUSR, show_node_desc, set_node_desc);
+
+static struct device_attribute *ib_class_attributes[] = {
+ &dev_attr_node_type,
+ &dev_attr_sys_image_guid,
+ &dev_attr_node_guid,
+ &dev_attr_node_desc
+};
+
+static struct class ib_class = {
+ .name = "infiniband",
+ .dev_release = ib_device_release,
+ .dev_uevent = ib_device_uevent,
+};
+
+/* Show a given attribute in the statistics group */
+static ssize_t show_protocol_stat(const struct device *device,
+ struct device_attribute *attr, char *buf,
+ unsigned offset)
+{
+ struct ib_device *dev = container_of(device, struct ib_device, dev);
+ union rdma_protocol_stats stats;
+ ssize_t ret;
+
+ ret = dev->get_protocol_stats(dev, &stats);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "%llu\n",
+ (unsigned long long) ((u64 *) &stats)[offset]);
+}
+
+/* generate a read-only iwarp statistics attribute */
+#define IW_STATS_ENTRY(name) \
+static ssize_t show_##name(struct device *device, \
+ struct device_attribute *attr, char *buf) \
+{ \
+ return show_protocol_stat(device, attr, buf, \
+ offsetof(struct iw_protocol_stats, name) / \
+ sizeof (u64)); \
+} \
+static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL)
+
+IW_STATS_ENTRY(ipInReceives);
+IW_STATS_ENTRY(ipInHdrErrors);
+IW_STATS_ENTRY(ipInTooBigErrors);
+IW_STATS_ENTRY(ipInNoRoutes);
+IW_STATS_ENTRY(ipInAddrErrors);
+IW_STATS_ENTRY(ipInUnknownProtos);
+IW_STATS_ENTRY(ipInTruncatedPkts);
+IW_STATS_ENTRY(ipInDiscards);
+IW_STATS_ENTRY(ipInDelivers);
+IW_STATS_ENTRY(ipOutForwDatagrams);
+IW_STATS_ENTRY(ipOutRequests);
+IW_STATS_ENTRY(ipOutDiscards);
+IW_STATS_ENTRY(ipOutNoRoutes);
+IW_STATS_ENTRY(ipReasmTimeout);
+IW_STATS_ENTRY(ipReasmReqds);
+IW_STATS_ENTRY(ipReasmOKs);
+IW_STATS_ENTRY(ipReasmFails);
+IW_STATS_ENTRY(ipFragOKs);
+IW_STATS_ENTRY(ipFragFails);
+IW_STATS_ENTRY(ipFragCreates);
+IW_STATS_ENTRY(ipInMcastPkts);
+IW_STATS_ENTRY(ipOutMcastPkts);
+IW_STATS_ENTRY(ipInBcastPkts);
+IW_STATS_ENTRY(ipOutBcastPkts);
+IW_STATS_ENTRY(tcpRtoAlgorithm);
+IW_STATS_ENTRY(tcpRtoMin);
+IW_STATS_ENTRY(tcpRtoMax);
+IW_STATS_ENTRY(tcpMaxConn);
+IW_STATS_ENTRY(tcpActiveOpens);
+IW_STATS_ENTRY(tcpPassiveOpens);
+IW_STATS_ENTRY(tcpAttemptFails);
+IW_STATS_ENTRY(tcpEstabResets);
+IW_STATS_ENTRY(tcpCurrEstab);
+IW_STATS_ENTRY(tcpInSegs);
+IW_STATS_ENTRY(tcpOutSegs);
+IW_STATS_ENTRY(tcpRetransSegs);
+IW_STATS_ENTRY(tcpInErrs);
+IW_STATS_ENTRY(tcpOutRsts);
+
+static struct attribute *iw_proto_stats_attrs[] = {
+ &dev_attr_ipInReceives.attr,
+ &dev_attr_ipInHdrErrors.attr,
+ &dev_attr_ipInTooBigErrors.attr,
+ &dev_attr_ipInNoRoutes.attr,
+ &dev_attr_ipInAddrErrors.attr,
+ &dev_attr_ipInUnknownProtos.attr,
+ &dev_attr_ipInTruncatedPkts.attr,
+ &dev_attr_ipInDiscards.attr,
+ &dev_attr_ipInDelivers.attr,
+ &dev_attr_ipOutForwDatagrams.attr,
+ &dev_attr_ipOutRequests.attr,
+ &dev_attr_ipOutDiscards.attr,
+ &dev_attr_ipOutNoRoutes.attr,
+ &dev_attr_ipReasmTimeout.attr,
+ &dev_attr_ipReasmReqds.attr,
+ &dev_attr_ipReasmOKs.attr,
+ &dev_attr_ipReasmFails.attr,
+ &dev_attr_ipFragOKs.attr,
+ &dev_attr_ipFragFails.attr,
+ &dev_attr_ipFragCreates.attr,
+ &dev_attr_ipInMcastPkts.attr,
+ &dev_attr_ipOutMcastPkts.attr,
+ &dev_attr_ipInBcastPkts.attr,
+ &dev_attr_ipOutBcastPkts.attr,
+ &dev_attr_tcpRtoAlgorithm.attr,
+ &dev_attr_tcpRtoMin.attr,
+ &dev_attr_tcpRtoMax.attr,
+ &dev_attr_tcpMaxConn.attr,
+ &dev_attr_tcpActiveOpens.attr,
+ &dev_attr_tcpPassiveOpens.attr,
+ &dev_attr_tcpAttemptFails.attr,
+ &dev_attr_tcpEstabResets.attr,
+ &dev_attr_tcpCurrEstab.attr,
+ &dev_attr_tcpInSegs.attr,
+ &dev_attr_tcpOutSegs.attr,
+ &dev_attr_tcpRetransSegs.attr,
+ &dev_attr_tcpInErrs.attr,
+ &dev_attr_tcpOutRsts.attr,
+ NULL
+};
+
+static struct attribute_group iw_stats_group = {
+ .name = "proto_stats",
+ .attrs = iw_proto_stats_attrs,
+};
+
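+/* Tear down every per-port kobject along with its sysfs groups. */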
+static void free_port_list_attributes(struct ib_device *device)
+{
+ struct kobject *p, *t;
+
+ list_for_each_entry_safe(p, t, &device->port_list, entry) {
+ struct ib_port *port = container_of(p, struct ib_port, kobj);
+ list_del(&p->entry);
+ sysfs_remove_group(p, &pma_group);
+ sysfs_remove_group(p, &port->pkey_group);
+ sysfs_remove_group(p, &port->gid_group);
+ kobject_put(p);
+ }
+
+ kobject_put(device->ports_parent);
+}
+
+int ib_device_register_sysfs(struct ib_device *device,
+ int (*port_callback)(struct ib_device *,
+ u8, struct kobject *))
+{
+ struct device *class_dev = &device->dev;
+ int ret;
+ int i;
+
+ class_dev->class = &ib_class;
+ class_dev->parent = device->dma_device;
+ dev_set_name(class_dev, "%s", device->name);
+ dev_set_drvdata(class_dev, device);
+
+ INIT_LIST_HEAD(&device->port_list);
+
+ ret = device_register(class_dev);
+ if (ret)
+ goto err;
+
+ for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i) {
+ ret = device_create_file(class_dev, ib_class_attributes[i]);
+ if (ret)
+ goto err_unregister;
+ }
+
+ device->ports_parent = kobject_create_and_add("ports",
+ &class_dev->kobj);
+ if (!device->ports_parent) {
+ ret = -ENOMEM;
+ goto err_put;
+ }
+
+ if (device->node_type == RDMA_NODE_IB_SWITCH) {
+ ret = add_port(device, 0, port_callback);
+ if (ret)
+ goto err_put;
+ } else {
+ for (i = 1; i <= device->phys_port_cnt; ++i) {
+ ret = add_port(device, i, port_callback);
+ if (ret)
+ goto err_put;
+ }
+ }
+
+ if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats) {
+ ret = sysfs_create_group(&class_dev->kobj, &iw_stats_group);
+ if (ret)
+ goto err_put;
+ }
+
+ return 0;
+
+err_put:
+ free_port_list_attributes(device);
+
+err_unregister:
+ device_unregister(class_dev);
+
+err:
+ return ret;
+}
+
+void ib_device_unregister_sysfs(struct ib_device *device)
+{
+ /* Hold kobject until ib_dealloc_device() */
+ struct kobject *kobj_dev = kobject_get(&device->dev.kobj);
+ int i;
+
+ if (device->node_type == RDMA_NODE_RNIC && device->get_protocol_stats)
+ sysfs_remove_group(kobj_dev, &iw_stats_group);
+
+ free_port_list_attributes(device);
+
+ for (i = 0; i < ARRAY_SIZE(ib_class_attributes); ++i)
+ device_remove_file(&device->dev, ib_class_attributes[i]);
+
+ device_unregister(&device->dev);
+}
+
+int ib_sysfs_setup(void)
+{
+ return class_register(&ib_class);
+}
+
+void ib_sysfs_cleanup(void)
+{
+ class_unregister(&ib_class);
+}
diff --git a/drivers/infiniband/core/ucm.c b/drivers/infiniband/core/ucm.c
new file mode 100644
index 000000000..f2f63933e
--- /dev/null
+++ b/drivers/infiniband/core/ucm.c
@@ -0,0 +1,1370 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/cdev.h>
+#include <linux/idr.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+
+#include <rdma/ib_cm.h>
+#include <rdma/ib_user_cm.h>
+#include <rdma/ib_marshall.h>
+
+MODULE_AUTHOR("Libor Michalek");
+MODULE_DESCRIPTION("InfiniBand userspace Connection Manager access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+struct ib_ucm_device {
+ int devnum;
+ struct cdev cdev;
+ struct device dev;
+ struct ib_device *ib_dev;
+};
+
+struct ib_ucm_file {
+ struct mutex file_mutex;
+ struct file *filp;
+ struct ib_ucm_device *device;
+
+ struct list_head ctxs;
+ struct list_head events;
+ wait_queue_head_t poll_wait;
+};
+
+struct ib_ucm_context {
+ int id;
+ struct completion comp;
+ atomic_t ref;
+ int events_reported;
+
+ struct ib_ucm_file *file;
+ struct ib_cm_id *cm_id;
+ __u64 uid;
+
+ struct list_head events; /* list of pending events. */
+ struct list_head file_list; /* member in file ctx list */
+};
+
+struct ib_ucm_event {
+ struct ib_ucm_context *ctx;
+ struct list_head file_list; /* member in file event list */
+ struct list_head ctx_list; /* member in ctx event list */
+
+ struct ib_cm_id *cm_id;
+ struct ib_ucm_event_resp resp;
+ void *data;
+ void *info;
+ int data_len;
+ int info_len;
+};
+
+enum {
+ IB_UCM_MAJOR = 231,
+ IB_UCM_BASE_MINOR = 224,
+ IB_UCM_MAX_DEVICES = 32
+};
+
+#define IB_UCM_BASE_DEV MKDEV(IB_UCM_MAJOR, IB_UCM_BASE_MINOR)
+
+static void ib_ucm_add_one(struct ib_device *device);
+static void ib_ucm_remove_one(struct ib_device *device);
+
+static struct ib_client ucm_client = {
+ .name = "ucm",
+ .add = ib_ucm_add_one,
+ .remove = ib_ucm_remove_one
+};
+
+static DEFINE_MUTEX(ctx_id_mutex);
+static DEFINE_IDR(ctx_id_table);
+static DECLARE_BITMAP(dev_map, IB_UCM_MAX_DEVICES);
+
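+/*
+ * Look up a context by id and take a reference; ib_ucm_ctx_put()
+ * completes ctx->comp when the last reference is dropped, which
+ * ib_ucm_destroy_id() waits on before tearing the context down.
+ */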
+static struct ib_ucm_context *ib_ucm_ctx_get(struct ib_ucm_file *file, int id)
+{
+ struct ib_ucm_context *ctx;
+
+ mutex_lock(&ctx_id_mutex);
+ ctx = idr_find(&ctx_id_table, id);
+ if (!ctx)
+ ctx = ERR_PTR(-ENOENT);
+ else if (ctx->file != file)
+ ctx = ERR_PTR(-EINVAL);
+ else
+ atomic_inc(&ctx->ref);
+ mutex_unlock(&ctx_id_mutex);
+
+ return ctx;
+}
+
+static void ib_ucm_ctx_put(struct ib_ucm_context *ctx)
+{
+ if (atomic_dec_and_test(&ctx->ref))
+ complete(&ctx->comp);
+}
+
+static inline int ib_ucm_new_cm_id(int event)
+{
+ return event == IB_CM_REQ_RECEIVED || event == IB_CM_SIDR_REQ_RECEIVED;
+}
+
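+/*
+ * Discard all events still queued for a context; cm_ids created for
+ * connection requests that were never reported are destroyed as well.
+ */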
+static void ib_ucm_cleanup_events(struct ib_ucm_context *ctx)
+{
+ struct ib_ucm_event *uevent;
+
+ mutex_lock(&ctx->file->file_mutex);
+ list_del(&ctx->file_list);
+ while (!list_empty(&ctx->events)) {
+
+ uevent = list_entry(ctx->events.next,
+ struct ib_ucm_event, ctx_list);
+ list_del(&uevent->file_list);
+ list_del(&uevent->ctx_list);
+ mutex_unlock(&ctx->file->file_mutex);
+
+ /* clear incoming connections. */
+ if (ib_ucm_new_cm_id(uevent->resp.event))
+ ib_destroy_cm_id(uevent->cm_id);
+
+ kfree(uevent);
+ mutex_lock(&ctx->file->file_mutex);
+ }
+ mutex_unlock(&ctx->file->file_mutex);
+}
+
+static struct ib_ucm_context *ib_ucm_ctx_alloc(struct ib_ucm_file *file)
+{
+ struct ib_ucm_context *ctx;
+
+ ctx = kzalloc(sizeof *ctx, GFP_KERNEL);
+ if (!ctx)
+ return NULL;
+
+ atomic_set(&ctx->ref, 1);
+ init_completion(&ctx->comp);
+ ctx->file = file;
+ INIT_LIST_HEAD(&ctx->events);
+
+ mutex_lock(&ctx_id_mutex);
+ ctx->id = idr_alloc(&ctx_id_table, ctx, 0, 0, GFP_KERNEL);
+ mutex_unlock(&ctx_id_mutex);
+ if (ctx->id < 0)
+ goto error;
+
+ list_add_tail(&ctx->file_list, &file->ctxs);
+ return ctx;
+
+error:
+ kfree(ctx);
+ return NULL;
+}
+
+static void ib_ucm_event_req_get(struct ib_ucm_req_event_resp *ureq,
+ struct ib_cm_req_event_param *kreq)
+{
+ ureq->remote_ca_guid = kreq->remote_ca_guid;
+ ureq->remote_qkey = kreq->remote_qkey;
+ ureq->remote_qpn = kreq->remote_qpn;
+ ureq->qp_type = kreq->qp_type;
+ ureq->starting_psn = kreq->starting_psn;
+ ureq->responder_resources = kreq->responder_resources;
+ ureq->initiator_depth = kreq->initiator_depth;
+ ureq->local_cm_response_timeout = kreq->local_cm_response_timeout;
+ ureq->flow_control = kreq->flow_control;
+ ureq->remote_cm_response_timeout = kreq->remote_cm_response_timeout;
+ ureq->retry_count = kreq->retry_count;
+ ureq->rnr_retry_count = kreq->rnr_retry_count;
+ ureq->srq = kreq->srq;
+ ureq->port = kreq->port;
+
+ ib_copy_path_rec_to_user(&ureq->primary_path, kreq->primary_path);
+ if (kreq->alternate_path)
+ ib_copy_path_rec_to_user(&ureq->alternate_path,
+ kreq->alternate_path);
+}
+
+static void ib_ucm_event_rep_get(struct ib_ucm_rep_event_resp *urep,
+ struct ib_cm_rep_event_param *krep)
+{
+ urep->remote_ca_guid = krep->remote_ca_guid;
+ urep->remote_qkey = krep->remote_qkey;
+ urep->remote_qpn = krep->remote_qpn;
+ urep->starting_psn = krep->starting_psn;
+ urep->responder_resources = krep->responder_resources;
+ urep->initiator_depth = krep->initiator_depth;
+ urep->target_ack_delay = krep->target_ack_delay;
+ urep->failover_accepted = krep->failover_accepted;
+ urep->flow_control = krep->flow_control;
+ urep->rnr_retry_count = krep->rnr_retry_count;
+ urep->srq = krep->srq;
+}
+
+static void ib_ucm_event_sidr_rep_get(struct ib_ucm_sidr_rep_event_resp *urep,
+ struct ib_cm_sidr_rep_event_param *krep)
+{
+ urep->status = krep->status;
+ urep->qkey = krep->qkey;
+ urep->qpn = krep->qpn;
+}
+
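+/*
+ * Translate a kernel CM event into its userspace representation,
+ * duplicating any private data and extra info so they can be handed
+ * out later by ib_ucm_event().
+ */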
+static int ib_ucm_event_process(struct ib_cm_event *evt,
+ struct ib_ucm_event *uvt)
+{
+ void *info = NULL;
+
+ switch (evt->event) {
+ case IB_CM_REQ_RECEIVED:
+ ib_ucm_event_req_get(&uvt->resp.u.req_resp,
+ &evt->param.req_rcvd);
+ uvt->data_len = IB_CM_REQ_PRIVATE_DATA_SIZE;
+ uvt->resp.present = IB_UCM_PRES_PRIMARY;
+ uvt->resp.present |= (evt->param.req_rcvd.alternate_path ?
+ IB_UCM_PRES_ALTERNATE : 0);
+ break;
+ case IB_CM_REP_RECEIVED:
+ ib_ucm_event_rep_get(&uvt->resp.u.rep_resp,
+ &evt->param.rep_rcvd);
+ uvt->data_len = IB_CM_REP_PRIVATE_DATA_SIZE;
+ break;
+ case IB_CM_RTU_RECEIVED:
+ uvt->data_len = IB_CM_RTU_PRIVATE_DATA_SIZE;
+ uvt->resp.u.send_status = evt->param.send_status;
+ break;
+ case IB_CM_DREQ_RECEIVED:
+ uvt->data_len = IB_CM_DREQ_PRIVATE_DATA_SIZE;
+ uvt->resp.u.send_status = evt->param.send_status;
+ break;
+ case IB_CM_DREP_RECEIVED:
+ uvt->data_len = IB_CM_DREP_PRIVATE_DATA_SIZE;
+ uvt->resp.u.send_status = evt->param.send_status;
+ break;
+ case IB_CM_MRA_RECEIVED:
+ uvt->resp.u.mra_resp.timeout =
+ evt->param.mra_rcvd.service_timeout;
+ uvt->data_len = IB_CM_MRA_PRIVATE_DATA_SIZE;
+ break;
+ case IB_CM_REJ_RECEIVED:
+ uvt->resp.u.rej_resp.reason = evt->param.rej_rcvd.reason;
+ uvt->data_len = IB_CM_REJ_PRIVATE_DATA_SIZE;
+ uvt->info_len = evt->param.rej_rcvd.ari_length;
+ info = evt->param.rej_rcvd.ari;
+ break;
+ case IB_CM_LAP_RECEIVED:
+ ib_copy_path_rec_to_user(&uvt->resp.u.lap_resp.path,
+ evt->param.lap_rcvd.alternate_path);
+ uvt->data_len = IB_CM_LAP_PRIVATE_DATA_SIZE;
+ uvt->resp.present = IB_UCM_PRES_ALTERNATE;
+ break;
+ case IB_CM_APR_RECEIVED:
+ uvt->resp.u.apr_resp.status = evt->param.apr_rcvd.ap_status;
+ uvt->data_len = IB_CM_APR_PRIVATE_DATA_SIZE;
+ uvt->info_len = evt->param.apr_rcvd.info_len;
+ info = evt->param.apr_rcvd.apr_info;
+ break;
+ case IB_CM_SIDR_REQ_RECEIVED:
+ uvt->resp.u.sidr_req_resp.pkey =
+ evt->param.sidr_req_rcvd.pkey;
+ uvt->resp.u.sidr_req_resp.port =
+ evt->param.sidr_req_rcvd.port;
+ uvt->data_len = IB_CM_SIDR_REQ_PRIVATE_DATA_SIZE;
+ break;
+ case IB_CM_SIDR_REP_RECEIVED:
+ ib_ucm_event_sidr_rep_get(&uvt->resp.u.sidr_rep_resp,
+ &evt->param.sidr_rep_rcvd);
+ uvt->data_len = IB_CM_SIDR_REP_PRIVATE_DATA_SIZE;
+ uvt->info_len = evt->param.sidr_rep_rcvd.info_len;
+ info = evt->param.sidr_rep_rcvd.info;
+ break;
+ default:
+ uvt->resp.u.send_status = evt->param.send_status;
+ break;
+ }
+
+ if (uvt->data_len) {
+ uvt->data = kmemdup(evt->private_data, uvt->data_len, GFP_KERNEL);
+ if (!uvt->data)
+ goto err1;
+
+ uvt->resp.present |= IB_UCM_PRES_DATA;
+ }
+
+ if (uvt->info_len) {
+ uvt->info = kmemdup(info, uvt->info_len, GFP_KERNEL);
+ if (!uvt->info)
+ goto err2;
+
+ uvt->resp.present |= IB_UCM_PRES_INFO;
+ }
+ return 0;
+
+err2:
+ kfree(uvt->data);
+err1:
+ return -ENOMEM;
+}
+
+static int ib_ucm_event_handler(struct ib_cm_id *cm_id,
+ struct ib_cm_event *event)
+{
+ struct ib_ucm_event *uevent;
+ struct ib_ucm_context *ctx;
+ int result = 0;
+
+ ctx = cm_id->context;
+
+ uevent = kzalloc(sizeof *uevent, GFP_KERNEL);
+ if (!uevent)
+ goto err1;
+
+ uevent->ctx = ctx;
+ uevent->cm_id = cm_id;
+ uevent->resp.uid = ctx->uid;
+ uevent->resp.id = ctx->id;
+ uevent->resp.event = event->event;
+
+ result = ib_ucm_event_process(event, uevent);
+ if (result)
+ goto err2;
+
+ mutex_lock(&ctx->file->file_mutex);
+ list_add_tail(&uevent->file_list, &ctx->file->events);
+ list_add_tail(&uevent->ctx_list, &ctx->events);
+ wake_up_interruptible(&ctx->file->poll_wait);
+ mutex_unlock(&ctx->file->file_mutex);
+ return 0;
+
+err2:
+ kfree(uevent);
+err1:
+ /* Destroy newly created cm_ids */
+ return ib_ucm_new_cm_id(event->event);
+}
+
+static ssize_t ib_ucm_event(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_ucm_context *ctx;
+ struct ib_ucm_event_get cmd;
+ struct ib_ucm_event *uevent;
+ int result = 0;
+
+ if (out_len < sizeof(struct ib_ucm_event_resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ mutex_lock(&file->file_mutex);
+ while (list_empty(&file->events)) {
+ mutex_unlock(&file->file_mutex);
+
+ if (file->filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ if (wait_event_interruptible(file->poll_wait,
+ !list_empty(&file->events)))
+ return -ERESTARTSYS;
+
+ mutex_lock(&file->file_mutex);
+ }
+
+ uevent = list_entry(file->events.next, struct ib_ucm_event, file_list);
+
+ if (ib_ucm_new_cm_id(uevent->resp.event)) {
+ ctx = ib_ucm_ctx_alloc(file);
+ if (!ctx) {
+ result = -ENOMEM;
+ goto done;
+ }
+
+ ctx->cm_id = uevent->cm_id;
+ ctx->cm_id->context = ctx;
+ uevent->resp.id = ctx->id;
+ }
+
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &uevent->resp, sizeof(uevent->resp))) {
+ result = -EFAULT;
+ goto done;
+ }
+
+ if (uevent->data) {
+ if (cmd.data_len < uevent->data_len) {
+ result = -ENOMEM;
+ goto done;
+ }
+ if (copy_to_user((void __user *)(unsigned long)cmd.data,
+ uevent->data, uevent->data_len)) {
+ result = -EFAULT;
+ goto done;
+ }
+ }
+
+ if (uevent->info) {
+ if (cmd.info_len < uevent->info_len) {
+ result = -ENOMEM;
+ goto done;
+ }
+ if (copy_to_user((void __user *)(unsigned long)cmd.info,
+ uevent->info, uevent->info_len)) {
+ result = -EFAULT;
+ goto done;
+ }
+ }
+
+ list_del(&uevent->file_list);
+ list_del(&uevent->ctx_list);
+ uevent->ctx->events_reported++;
+
+ kfree(uevent->data);
+ kfree(uevent->info);
+ kfree(uevent);
+done:
+ mutex_unlock(&file->file_mutex);
+ return result;
+}
+
+static ssize_t ib_ucm_create_id(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_ucm_create_id cmd;
+ struct ib_ucm_create_id_resp resp;
+ struct ib_ucm_context *ctx;
+ int result;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ mutex_lock(&file->file_mutex);
+ ctx = ib_ucm_ctx_alloc(file);
+ mutex_unlock(&file->file_mutex);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->uid = cmd.uid;
+ ctx->cm_id = ib_create_cm_id(file->device->ib_dev,
+ ib_ucm_event_handler, ctx);
+ if (IS_ERR(ctx->cm_id)) {
+ result = PTR_ERR(ctx->cm_id);
+ goto err1;
+ }
+
+ resp.id = ctx->id;
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp))) {
+ result = -EFAULT;
+ goto err2;
+ }
+ return 0;
+
+err2:
+ ib_destroy_cm_id(ctx->cm_id);
+err1:
+ mutex_lock(&ctx_id_mutex);
+ idr_remove(&ctx_id_table, ctx->id);
+ mutex_unlock(&ctx_id_mutex);
+ kfree(ctx);
+ return result;
+}
+
+static ssize_t ib_ucm_destroy_id(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_ucm_destroy_id cmd;
+ struct ib_ucm_destroy_id_resp resp;
+ struct ib_ucm_context *ctx;
+ int result = 0;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ mutex_lock(&ctx_id_mutex);
+ ctx = idr_find(&ctx_id_table, cmd.id);
+ if (!ctx)
+ ctx = ERR_PTR(-ENOENT);
+ else if (ctx->file != file)
+ ctx = ERR_PTR(-EINVAL);
+ else
+ idr_remove(&ctx_id_table, ctx->id);
+ mutex_unlock(&ctx_id_mutex);
+
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ib_ucm_ctx_put(ctx);
+ wait_for_completion(&ctx->comp);
+
+ /* No new events will be generated after destroying the cm_id. */
+ ib_destroy_cm_id(ctx->cm_id);
+ /* Cleanup events not yet reported to the user. */
+ ib_ucm_cleanup_events(ctx);
+
+ resp.events_reported = ctx->events_reported;
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp)))
+ result = -EFAULT;
+
+ kfree(ctx);
+ return result;
+}
+
+static ssize_t ib_ucm_attr_id(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_ucm_attr_id_resp resp;
+ struct ib_ucm_attr_id cmd;
+ struct ib_ucm_context *ctx;
+ int result = 0;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ resp.service_id = ctx->cm_id->service_id;
+ resp.service_mask = ctx->cm_id->service_mask;
+ resp.local_id = ctx->cm_id->local_id;
+ resp.remote_id = ctx->cm_id->remote_id;
+
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp)))
+ result = -EFAULT;
+
+ ib_ucm_ctx_put(ctx);
+ return result;
+}
+
+static ssize_t ib_ucm_init_qp_attr(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_uverbs_qp_attr resp;
+ struct ib_ucm_init_qp_attr cmd;
+ struct ib_ucm_context *ctx;
+ struct ib_qp_attr qp_attr;
+ int result = 0;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ resp.qp_attr_mask = 0;
+ memset(&qp_attr, 0, sizeof qp_attr);
+ qp_attr.qp_state = cmd.qp_state;
+ result = ib_cm_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask);
+ if (result)
+ goto out;
+
+ ib_copy_qp_attr_to_user(&resp, &qp_attr);
+
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp)))
+ result = -EFAULT;
+
+out:
+ ib_ucm_ctx_put(ctx);
+ return result;
+}
+
+static int ucm_validate_listen(__be64 service_id, __be64 service_mask)
+{
+ service_id &= service_mask;
+
+ if (((service_id & IB_CMA_SERVICE_ID_MASK) == IB_CMA_SERVICE_ID) ||
+ ((service_id & IB_SDP_SERVICE_ID_MASK) == IB_SDP_SERVICE_ID))
+ return -EINVAL;
+
+ return 0;
+}
+
+static ssize_t ib_ucm_listen(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_ucm_listen cmd;
+ struct ib_ucm_context *ctx;
+ int result;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ result = ucm_validate_listen(cmd.service_id, cmd.service_mask);
+ if (result)
+ goto out;
+
+ result = ib_cm_listen(ctx->cm_id, cmd.service_id, cmd.service_mask,
+ NULL);
+out:
+ ib_ucm_ctx_put(ctx);
+ return result;
+}
+
+static ssize_t ib_ucm_notify(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_ucm_notify cmd;
+ struct ib_ucm_context *ctx;
+ int result;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ result = ib_cm_notify(ctx->cm_id, (enum ib_event_type) cmd.event);
+ ib_ucm_ctx_put(ctx);
+ return result;
+}
+
+static int ib_ucm_alloc_data(const void **dest, u64 src, u32 len)
+{
+ void *data;
+
+ *dest = NULL;
+
+ if (!len)
+ return 0;
+
+ data = memdup_user((void __user *)(unsigned long)src, len);
+ if (IS_ERR(data))
+ return PTR_ERR(data);
+
+ *dest = data;
+ return 0;
+}
+
+static int ib_ucm_path_get(struct ib_sa_path_rec **path, u64 src)
+{
+ struct ib_user_path_rec upath;
+ struct ib_sa_path_rec *sa_path;
+
+ *path = NULL;
+
+ if (!src)
+ return 0;
+
+ sa_path = kmalloc(sizeof(*sa_path), GFP_KERNEL);
+ if (!sa_path)
+ return -ENOMEM;
+
+ if (copy_from_user(&upath, (void __user *)(unsigned long)src,
+ sizeof(upath))) {
+
+ kfree(sa_path);
+ return -EFAULT;
+ }
+
+ ib_copy_path_rec_from_user(sa_path, &upath);
+ *path = sa_path;
+ return 0;
+}
+
+static ssize_t ib_ucm_send_req(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_cm_req_param param;
+ struct ib_ucm_context *ctx;
+ struct ib_ucm_req cmd;
+ int result;
+
+ param.private_data = NULL;
+ param.primary_path = NULL;
+ param.alternate_path = NULL;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
+ if (result)
+ goto done;
+
+ result = ib_ucm_path_get(&param.primary_path, cmd.primary_path);
+ if (result)
+ goto done;
+
+ result = ib_ucm_path_get(&param.alternate_path, cmd.alternate_path);
+ if (result)
+ goto done;
+
+ param.private_data_len = cmd.len;
+ param.service_id = cmd.sid;
+ param.qp_num = cmd.qpn;
+ param.qp_type = cmd.qp_type;
+ param.starting_psn = cmd.psn;
+ param.peer_to_peer = cmd.peer_to_peer;
+ param.responder_resources = cmd.responder_resources;
+ param.initiator_depth = cmd.initiator_depth;
+ param.remote_cm_response_timeout = cmd.remote_cm_response_timeout;
+ param.flow_control = cmd.flow_control;
+ param.local_cm_response_timeout = cmd.local_cm_response_timeout;
+ param.retry_count = cmd.retry_count;
+ param.rnr_retry_count = cmd.rnr_retry_count;
+ param.max_cm_retries = cmd.max_cm_retries;
+ param.srq = cmd.srq;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (!IS_ERR(ctx)) {
+ result = ib_send_cm_req(ctx->cm_id, &param);
+ ib_ucm_ctx_put(ctx);
+ } else
+ result = PTR_ERR(ctx);
+
+done:
+ kfree(param.private_data);
+ kfree(param.primary_path);
+ kfree(param.alternate_path);
+ return result;
+}
+
+static ssize_t ib_ucm_send_rep(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_cm_rep_param param;
+ struct ib_ucm_context *ctx;
+ struct ib_ucm_rep cmd;
+ int result;
+
+ param.private_data = NULL;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
+ if (result)
+ return result;
+
+ param.qp_num = cmd.qpn;
+ param.starting_psn = cmd.psn;
+ param.private_data_len = cmd.len;
+ param.responder_resources = cmd.responder_resources;
+ param.initiator_depth = cmd.initiator_depth;
+ param.failover_accepted = cmd.failover_accepted;
+ param.flow_control = cmd.flow_control;
+ param.rnr_retry_count = cmd.rnr_retry_count;
+ param.srq = cmd.srq;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (!IS_ERR(ctx)) {
+ ctx->uid = cmd.uid;
+ result = ib_send_cm_rep(ctx->cm_id, &param);
+ ib_ucm_ctx_put(ctx);
+ } else
+ result = PTR_ERR(ctx);
+
+ kfree(param.private_data);
+ return result;
+}
+
+static ssize_t ib_ucm_send_private_data(struct ib_ucm_file *file,
+ const char __user *inbuf, int in_len,
+ int (*func)(struct ib_cm_id *cm_id,
+ const void *private_data,
+ u8 private_data_len))
+{
+ struct ib_ucm_private_data cmd;
+ struct ib_ucm_context *ctx;
+ const void *private_data = NULL;
+ int result;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ result = ib_ucm_alloc_data(&private_data, cmd.data, cmd.len);
+ if (result)
+ return result;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (!IS_ERR(ctx)) {
+ result = func(ctx->cm_id, private_data, cmd.len);
+ ib_ucm_ctx_put(ctx);
+ } else
+ result = PTR_ERR(ctx);
+
+ kfree(private_data);
+ return result;
+}
+
+static ssize_t ib_ucm_send_rtu(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_rtu);
+}
+
+static ssize_t ib_ucm_send_dreq(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_dreq);
+}
+
+static ssize_t ib_ucm_send_drep(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ return ib_ucm_send_private_data(file, inbuf, in_len, ib_send_cm_drep);
+}
+
+static ssize_t ib_ucm_send_info(struct ib_ucm_file *file,
+ const char __user *inbuf, int in_len,
+ int (*func)(struct ib_cm_id *cm_id,
+ int status,
+ const void *info,
+ u8 info_len,
+ const void *data,
+ u8 data_len))
+{
+ struct ib_ucm_context *ctx;
+ struct ib_ucm_info cmd;
+ const void *data = NULL;
+ const void *info = NULL;
+ int result;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ result = ib_ucm_alloc_data(&data, cmd.data, cmd.data_len);
+ if (result)
+ goto done;
+
+ result = ib_ucm_alloc_data(&info, cmd.info, cmd.info_len);
+ if (result)
+ goto done;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (!IS_ERR(ctx)) {
+ result = func(ctx->cm_id, cmd.status, info, cmd.info_len,
+ data, cmd.data_len);
+ ib_ucm_ctx_put(ctx);
+ } else
+ result = PTR_ERR(ctx);
+
+done:
+ kfree(data);
+ kfree(info);
+ return result;
+}
+
+static ssize_t ib_ucm_send_rej(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_rej);
+}
+
+static ssize_t ib_ucm_send_apr(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ return ib_ucm_send_info(file, inbuf, in_len, (void *)ib_send_cm_apr);
+}
+
+static ssize_t ib_ucm_send_mra(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_ucm_context *ctx;
+ struct ib_ucm_mra cmd;
+ const void *data = NULL;
+ int result;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ result = ib_ucm_alloc_data(&data, cmd.data, cmd.len);
+ if (result)
+ return result;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (!IS_ERR(ctx)) {
+ result = ib_send_cm_mra(ctx->cm_id, cmd.timeout, data, cmd.len);
+ ib_ucm_ctx_put(ctx);
+ } else
+ result = PTR_ERR(ctx);
+
+ kfree(data);
+ return result;
+}
+
+static ssize_t ib_ucm_send_lap(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_ucm_context *ctx;
+ struct ib_sa_path_rec *path = NULL;
+ struct ib_ucm_lap cmd;
+ const void *data = NULL;
+ int result;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ result = ib_ucm_alloc_data(&data, cmd.data, cmd.len);
+ if (result)
+ goto done;
+
+ result = ib_ucm_path_get(&path, cmd.path);
+ if (result)
+ goto done;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (!IS_ERR(ctx)) {
+ result = ib_send_cm_lap(ctx->cm_id, path, data, cmd.len);
+ ib_ucm_ctx_put(ctx);
+ } else
+ result = PTR_ERR(ctx);
+
+done:
+ kfree(data);
+ kfree(path);
+ return result;
+}
+
+static ssize_t ib_ucm_send_sidr_req(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_cm_sidr_req_param param;
+ struct ib_ucm_context *ctx;
+ struct ib_ucm_sidr_req cmd;
+ int result;
+
+ param.private_data = NULL;
+ param.path = NULL;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ result = ib_ucm_alloc_data(&param.private_data, cmd.data, cmd.len);
+ if (result)
+ goto done;
+
+ result = ib_ucm_path_get(&param.path, cmd.path);
+ if (result)
+ goto done;
+
+ param.private_data_len = cmd.len;
+ param.service_id = cmd.sid;
+ param.timeout_ms = cmd.timeout;
+ param.max_cm_retries = cmd.max_cm_retries;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (!IS_ERR(ctx)) {
+ result = ib_send_cm_sidr_req(ctx->cm_id, &param);
+ ib_ucm_ctx_put(ctx);
+ } else
+ result = PTR_ERR(ctx);
+
+done:
+ kfree(param.private_data);
+ kfree(param.path);
+ return result;
+}
+
+static ssize_t ib_ucm_send_sidr_rep(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ib_cm_sidr_rep_param param;
+ struct ib_ucm_sidr_rep cmd;
+ struct ib_ucm_context *ctx;
+ int result;
+
+ param.info = NULL;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ result = ib_ucm_alloc_data(&param.private_data,
+ cmd.data, cmd.data_len);
+ if (result)
+ goto done;
+
+ result = ib_ucm_alloc_data(&param.info, cmd.info, cmd.info_len);
+ if (result)
+ goto done;
+
+ param.qp_num = cmd.qpn;
+ param.qkey = cmd.qkey;
+ param.status = cmd.status;
+ param.info_length = cmd.info_len;
+ param.private_data_len = cmd.data_len;
+
+ ctx = ib_ucm_ctx_get(file, cmd.id);
+ if (!IS_ERR(ctx)) {
+ result = ib_send_cm_sidr_rep(ctx->cm_id, &param);
+ ib_ucm_ctx_put(ctx);
+ } else
+ result = PTR_ERR(ctx);
+
+done:
+ kfree(param.private_data);
+ kfree(param.info);
+ return result;
+}
+
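+/* Write commands are dispatched by the cmd field of struct ib_ucm_cmd_hdr. */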
+static ssize_t (*ucm_cmd_table[])(struct ib_ucm_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len) = {
+ [IB_USER_CM_CMD_CREATE_ID] = ib_ucm_create_id,
+ [IB_USER_CM_CMD_DESTROY_ID] = ib_ucm_destroy_id,
+ [IB_USER_CM_CMD_ATTR_ID] = ib_ucm_attr_id,
+ [IB_USER_CM_CMD_LISTEN] = ib_ucm_listen,
+ [IB_USER_CM_CMD_NOTIFY] = ib_ucm_notify,
+ [IB_USER_CM_CMD_SEND_REQ] = ib_ucm_send_req,
+ [IB_USER_CM_CMD_SEND_REP] = ib_ucm_send_rep,
+ [IB_USER_CM_CMD_SEND_RTU] = ib_ucm_send_rtu,
+ [IB_USER_CM_CMD_SEND_DREQ] = ib_ucm_send_dreq,
+ [IB_USER_CM_CMD_SEND_DREP] = ib_ucm_send_drep,
+ [IB_USER_CM_CMD_SEND_REJ] = ib_ucm_send_rej,
+ [IB_USER_CM_CMD_SEND_MRA] = ib_ucm_send_mra,
+ [IB_USER_CM_CMD_SEND_LAP] = ib_ucm_send_lap,
+ [IB_USER_CM_CMD_SEND_APR] = ib_ucm_send_apr,
+ [IB_USER_CM_CMD_SEND_SIDR_REQ] = ib_ucm_send_sidr_req,
+ [IB_USER_CM_CMD_SEND_SIDR_REP] = ib_ucm_send_sidr_rep,
+ [IB_USER_CM_CMD_EVENT] = ib_ucm_event,
+ [IB_USER_CM_CMD_INIT_QP_ATTR] = ib_ucm_init_qp_attr,
+};
+
+static ssize_t ib_ucm_write(struct file *filp, const char __user *buf,
+ size_t len, loff_t *pos)
+{
+ struct ib_ucm_file *file = filp->private_data;
+ struct ib_ucm_cmd_hdr hdr;
+ ssize_t result;
+
+ if (len < sizeof(hdr))
+ return -EINVAL;
+
+ if (copy_from_user(&hdr, buf, sizeof(hdr)))
+ return -EFAULT;
+
+ if (hdr.cmd >= ARRAY_SIZE(ucm_cmd_table))
+ return -EINVAL;
+
+ if (hdr.in + sizeof(hdr) > len)
+ return -EINVAL;
+
+ result = ucm_cmd_table[hdr.cmd](file, buf + sizeof(hdr),
+ hdr.in, hdr.out);
+ if (!result)
+ result = len;
+
+ return result;
+}
+
+static unsigned int ib_ucm_poll(struct file *filp,
+ struct poll_table_struct *wait)
+{
+ struct ib_ucm_file *file = filp->private_data;
+ unsigned int mask = 0;
+
+ poll_wait(filp, &file->poll_wait, wait);
+
+ if (!list_empty(&file->events))
+ mask = POLLIN | POLLRDNORM;
+
+ return mask;
+}
+
+/*
+ * ib_ucm_open() does not need the BKL:
+ *
+ * - no global state is referred to;
+ * - there is no ioctl method to race against;
+ * - no further module initialization is required for open to work
+ * after the device is registered.
+ */
+static int ib_ucm_open(struct inode *inode, struct file *filp)
+{
+ struct ib_ucm_file *file;
+
+ file = kmalloc(sizeof(*file), GFP_KERNEL);
+ if (!file)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&file->events);
+ INIT_LIST_HEAD(&file->ctxs);
+ init_waitqueue_head(&file->poll_wait);
+
+ mutex_init(&file->file_mutex);
+
+ filp->private_data = file;
+ file->filp = filp;
+ file->device = container_of(inode->i_cdev, struct ib_ucm_device, cdev);
+
+ return nonseekable_open(inode, filp);
+}
+
+static int ib_ucm_close(struct inode *inode, struct file *filp)
+{
+ struct ib_ucm_file *file = filp->private_data;
+ struct ib_ucm_context *ctx;
+
+ mutex_lock(&file->file_mutex);
+ while (!list_empty(&file->ctxs)) {
+ ctx = list_entry(file->ctxs.next,
+ struct ib_ucm_context, file_list);
+ mutex_unlock(&file->file_mutex);
+
+ mutex_lock(&ctx_id_mutex);
+ idr_remove(&ctx_id_table, ctx->id);
+ mutex_unlock(&ctx_id_mutex);
+
+ ib_destroy_cm_id(ctx->cm_id);
+ ib_ucm_cleanup_events(ctx);
+ kfree(ctx);
+
+ mutex_lock(&file->file_mutex);
+ }
+ mutex_unlock(&file->file_mutex);
+ kfree(file);
+ return 0;
+}
+
+static void ib_ucm_release_dev(struct device *dev)
+{
+ struct ib_ucm_device *ucm_dev;
+
+ ucm_dev = container_of(dev, struct ib_ucm_device, dev);
+ cdev_del(&ucm_dev->cdev);
+ if (ucm_dev->devnum < IB_UCM_MAX_DEVICES)
+ clear_bit(ucm_dev->devnum, dev_map);
+ else
+ clear_bit(ucm_dev->devnum - IB_UCM_MAX_DEVICES, overflow_map);
+ kfree(ucm_dev);
+}
+
+static const struct file_operations ucm_fops = {
+ .owner = THIS_MODULE,
+ .open = ib_ucm_open,
+ .release = ib_ucm_close,
+ .write = ib_ucm_write,
+ .poll = ib_ucm_poll,
+ .llseek = no_llseek,
+};
+
+static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct ib_ucm_device *ucm_dev;
+
+ ucm_dev = container_of(dev, struct ib_ucm_device, dev);
+ return sprintf(buf, "%s\n", ucm_dev->ib_dev->name);
+}
+static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+
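+/*
+ * Once the fixed minor range is exhausted, switch to a dynamically
+ * allocated major and hand out minors from the overflow bitmap.
+ */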
+static dev_t overflow_maj;
+static DECLARE_BITMAP(overflow_map, IB_UCM_MAX_DEVICES);
+static int find_overflow_devnum(void)
+{
+ int ret;
+
+ if (!overflow_maj) {
+ ret = alloc_chrdev_region(&overflow_maj, 0, IB_UCM_MAX_DEVICES,
+ "infiniband_cm");
+ if (ret) {
+ printk(KERN_ERR "ucm: couldn't register dynamic device number\n");
+ return ret;
+ }
+ }
+
+ ret = find_first_zero_bit(overflow_map, IB_UCM_MAX_DEVICES);
+ if (ret >= IB_UCM_MAX_DEVICES)
+ return -1;
+
+ return ret;
+}
+
+static void ib_ucm_add_one(struct ib_device *device)
+{
+ int devnum;
+ dev_t base;
+ struct ib_ucm_device *ucm_dev;
+
+ if (!device->alloc_ucontext ||
+ rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+ return;
+
+ ucm_dev = kzalloc(sizeof *ucm_dev, GFP_KERNEL);
+ if (!ucm_dev)
+ return;
+
+ ucm_dev->ib_dev = device;
+
+ devnum = find_first_zero_bit(dev_map, IB_UCM_MAX_DEVICES);
+ if (devnum >= IB_UCM_MAX_DEVICES) {
+ devnum = find_overflow_devnum();
+ if (devnum < 0)
+ goto err;
+
+ ucm_dev->devnum = devnum + IB_UCM_MAX_DEVICES;
+ base = devnum + overflow_maj;
+ set_bit(devnum, overflow_map);
+ } else {
+ ucm_dev->devnum = devnum;
+ base = devnum + IB_UCM_BASE_DEV;
+ set_bit(devnum, dev_map);
+ }
+
+ cdev_init(&ucm_dev->cdev, &ucm_fops);
+ ucm_dev->cdev.owner = THIS_MODULE;
+ kobject_set_name(&ucm_dev->cdev.kobj, "ucm%d", ucm_dev->devnum);
+ if (cdev_add(&ucm_dev->cdev, base, 1))
+ goto err;
+
+ ucm_dev->dev.class = &cm_class;
+ ucm_dev->dev.parent = device->dma_device;
+ ucm_dev->dev.devt = ucm_dev->cdev.dev;
+ ucm_dev->dev.release = ib_ucm_release_dev;
+ dev_set_name(&ucm_dev->dev, "ucm%d", ucm_dev->devnum);
+ if (device_register(&ucm_dev->dev))
+ goto err_cdev;
+
+ if (device_create_file(&ucm_dev->dev, &dev_attr_ibdev))
+ goto err_dev;
+
+ ib_set_client_data(device, &ucm_client, ucm_dev);
+ return;
+
+err_dev:
+ device_unregister(&ucm_dev->dev);
+err_cdev:
+ cdev_del(&ucm_dev->cdev);
+ if (ucm_dev->devnum < IB_UCM_MAX_DEVICES)
+ clear_bit(devnum, dev_map);
+ else
+ clear_bit(devnum, overflow_map);
+err:
+ kfree(ucm_dev);
+ return;
+}
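+
+/*
+ * Minor-number allocation summary: the first IB_UCM_MAX_DEVICES devices get
+ * fixed minors under IB_UCM_BASE_DEV and are tracked in dev_map; any devices
+ * beyond that fall back to the dynamically allocated overflow_maj major and
+ * are tracked in overflow_map, with ucm_dev->devnum biased by
+ * IB_UCM_MAX_DEVICES so the cleanup paths can tell the two ranges apart.
+ */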
+
+static void ib_ucm_remove_one(struct ib_device *device)
+{
+ struct ib_ucm_device *ucm_dev = ib_get_client_data(device, &ucm_client);
+
+ if (!ucm_dev)
+ return;
+
+ device_unregister(&ucm_dev->dev);
+}
+
+static CLASS_ATTR_STRING(abi_version, S_IRUGO,
+ __stringify(IB_USER_CM_ABI_VERSION));
+
+static int __init ib_ucm_init(void)
+{
+ int ret;
+
+ ret = register_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES,
+ "infiniband_cm");
+ if (ret) {
+ printk(KERN_ERR "ucm: couldn't register device number\n");
+ goto error1;
+ }
+
+ ret = class_create_file(&cm_class, &class_attr_abi_version.attr);
+ if (ret) {
+ printk(KERN_ERR "ucm: couldn't create abi_version attribute\n");
+ goto error2;
+ }
+
+ ret = ib_register_client(&ucm_client);
+ if (ret) {
+ printk(KERN_ERR "ucm: couldn't register client\n");
+ goto error3;
+ }
+ return 0;
+
+error3:
+ class_remove_file(&cm_class, &class_attr_abi_version.attr);
+error2:
+ unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES);
+error1:
+ return ret;
+}
+
+static void __exit ib_ucm_cleanup(void)
+{
+ ib_unregister_client(&ucm_client);
+ class_remove_file(&cm_class, &class_attr_abi_version.attr);
+ unregister_chrdev_region(IB_UCM_BASE_DEV, IB_UCM_MAX_DEVICES);
+ if (overflow_maj)
+ unregister_chrdev_region(overflow_maj, IB_UCM_MAX_DEVICES);
+ idr_destroy(&ctx_id_table);
+}
+
+module_init(ib_ucm_init);
+module_exit(ib_ucm_cleanup);
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
new file mode 100644
index 000000000..45d67e922
--- /dev/null
+++ b/drivers/infiniband/core/ucma.c
@@ -0,0 +1,1635 @@
+/*
+ * Copyright (c) 2005-2006 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/file.h>
+#include <linux/mutex.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+#include <linux/idr.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <linux/miscdevice.h>
+#include <linux/slab.h>
+#include <linux/sysctl.h>
+#include <linux/module.h>
+
+#include <rdma/rdma_user_cm.h>
+#include <rdma/ib_marshall.h>
+#include <rdma/rdma_cm.h>
+#include <rdma/rdma_cm_ib.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib.h>
+
+MODULE_AUTHOR("Sean Hefty");
+MODULE_DESCRIPTION("RDMA Userspace Connection Manager Access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static unsigned int max_backlog = 1024;
+
+static struct ctl_table_header *ucma_ctl_table_hdr;
+static struct ctl_table ucma_ctl_table[] = {
+ {
+ .procname = "max_backlog",
+ .data = &max_backlog,
+ .maxlen = sizeof max_backlog,
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ { }
+};
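+
+/*
+ * The table above is registered in ucma_init() under "net/rdma_ucm", so the
+ * tunable normally shows up as /proc/sys/net/rdma_ucm/max_backlog; it caps
+ * the listen backlog that ucma_listen() will accept from userspace.
+ */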
+
+struct ucma_file {
+ struct mutex mut;
+ struct file *filp;
+ struct list_head ctx_list;
+ struct list_head event_list;
+ wait_queue_head_t poll_wait;
+};
+
+struct ucma_context {
+ int id;
+ struct completion comp;
+ atomic_t ref;
+ int events_reported;
+ int backlog;
+
+ struct ucma_file *file;
+ struct rdma_cm_id *cm_id;
+ u64 uid;
+
+ struct list_head list;
+ struct list_head mc_list;
+};
+
+struct ucma_multicast {
+ struct ucma_context *ctx;
+ int id;
+ int events_reported;
+
+ u64 uid;
+ struct list_head list;
+ struct sockaddr_storage addr;
+};
+
+struct ucma_event {
+ struct ucma_context *ctx;
+ struct ucma_multicast *mc;
+ struct list_head list;
+ struct rdma_cm_id *cm_id;
+ struct rdma_ucm_event_resp resp;
+};
+
+static DEFINE_MUTEX(mut);
+static DEFINE_IDR(ctx_idr);
+static DEFINE_IDR(multicast_idr);
+
+static inline struct ucma_context *_ucma_find_context(int id,
+ struct ucma_file *file)
+{
+ struct ucma_context *ctx;
+
+ ctx = idr_find(&ctx_idr, id);
+ if (!ctx)
+ ctx = ERR_PTR(-ENOENT);
+ else if (ctx->file != file)
+ ctx = ERR_PTR(-EINVAL);
+ return ctx;
+}
+
+static struct ucma_context *ucma_get_ctx(struct ucma_file *file, int id)
+{
+ struct ucma_context *ctx;
+
+ mutex_lock(&mut);
+ ctx = _ucma_find_context(id, file);
+ if (!IS_ERR(ctx))
+ atomic_inc(&ctx->ref);
+ mutex_unlock(&mut);
+ return ctx;
+}
+
+static void ucma_put_ctx(struct ucma_context *ctx)
+{
+ if (atomic_dec_and_test(&ctx->ref))
+ complete(&ctx->comp);
+}
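+
+/*
+ * Context lifetime: ucma_alloc_ctx() starts the reference count at one,
+ * ucma_get_ctx() takes an additional reference under the global mutex, and
+ * ucma_put_ctx() completes ctx->comp when the last reference is dropped.
+ * ucma_destroy_id() removes the id from the idr, drops the initial reference
+ * and then waits on the completion, so no command handler can still be using
+ * the context when ucma_free_ctx() runs.
+ */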
+
+static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file)
+{
+ struct ucma_context *ctx;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return NULL;
+
+ atomic_set(&ctx->ref, 1);
+ init_completion(&ctx->comp);
+ INIT_LIST_HEAD(&ctx->mc_list);
+ ctx->file = file;
+
+ mutex_lock(&mut);
+ ctx->id = idr_alloc(&ctx_idr, ctx, 0, 0, GFP_KERNEL);
+ mutex_unlock(&mut);
+ if (ctx->id < 0)
+ goto error;
+
+ list_add_tail(&ctx->list, &file->ctx_list);
+ return ctx;
+
+error:
+ kfree(ctx);
+ return NULL;
+}
+
+static struct ucma_multicast *ucma_alloc_multicast(struct ucma_context *ctx)
+{
+ struct ucma_multicast *mc;
+
+ mc = kzalloc(sizeof(*mc), GFP_KERNEL);
+ if (!mc)
+ return NULL;
+
+ mutex_lock(&mut);
+ mc->id = idr_alloc(&multicast_idr, mc, 0, 0, GFP_KERNEL);
+ mutex_unlock(&mut);
+ if (mc->id < 0)
+ goto error;
+
+ mc->ctx = ctx;
+ list_add_tail(&mc->list, &ctx->mc_list);
+ return mc;
+
+error:
+ kfree(mc);
+ return NULL;
+}
+
+static void ucma_copy_conn_event(struct rdma_ucm_conn_param *dst,
+ struct rdma_conn_param *src)
+{
+ if (src->private_data_len)
+ memcpy(dst->private_data, src->private_data,
+ src->private_data_len);
+ dst->private_data_len = src->private_data_len;
+	dst->responder_resources = src->responder_resources;
+ dst->initiator_depth = src->initiator_depth;
+ dst->flow_control = src->flow_control;
+ dst->retry_count = src->retry_count;
+ dst->rnr_retry_count = src->rnr_retry_count;
+ dst->srq = src->srq;
+ dst->qp_num = src->qp_num;
+}
+
+static void ucma_copy_ud_event(struct rdma_ucm_ud_param *dst,
+ struct rdma_ud_param *src)
+{
+ if (src->private_data_len)
+ memcpy(dst->private_data, src->private_data,
+ src->private_data_len);
+ dst->private_data_len = src->private_data_len;
+ ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr);
+ dst->qp_num = src->qp_num;
+ dst->qkey = src->qkey;
+}
+
+static void ucma_set_event_context(struct ucma_context *ctx,
+ struct rdma_cm_event *event,
+ struct ucma_event *uevent)
+{
+ uevent->ctx = ctx;
+ switch (event->event) {
+ case RDMA_CM_EVENT_MULTICAST_JOIN:
+ case RDMA_CM_EVENT_MULTICAST_ERROR:
+ uevent->mc = (struct ucma_multicast *)
+ event->param.ud.private_data;
+ uevent->resp.uid = uevent->mc->uid;
+ uevent->resp.id = uevent->mc->id;
+ break;
+ default:
+ uevent->resp.uid = ctx->uid;
+ uevent->resp.id = ctx->id;
+ break;
+ }
+}
+
+static int ucma_event_handler(struct rdma_cm_id *cm_id,
+ struct rdma_cm_event *event)
+{
+ struct ucma_event *uevent;
+ struct ucma_context *ctx = cm_id->context;
+ int ret = 0;
+
+ uevent = kzalloc(sizeof(*uevent), GFP_KERNEL);
+ if (!uevent)
+ return event->event == RDMA_CM_EVENT_CONNECT_REQUEST;
+
+ mutex_lock(&ctx->file->mut);
+ uevent->cm_id = cm_id;
+ ucma_set_event_context(ctx, event, uevent);
+ uevent->resp.event = event->event;
+ uevent->resp.status = event->status;
+ if (cm_id->qp_type == IB_QPT_UD)
+ ucma_copy_ud_event(&uevent->resp.param.ud, &event->param.ud);
+ else
+ ucma_copy_conn_event(&uevent->resp.param.conn,
+ &event->param.conn);
+
+ if (event->event == RDMA_CM_EVENT_CONNECT_REQUEST) {
+ if (!ctx->backlog) {
+ ret = -ENOMEM;
+ kfree(uevent);
+ goto out;
+ }
+ ctx->backlog--;
+ } else if (!ctx->uid || ctx->cm_id != cm_id) {
+ /*
+ * We ignore events for new connections until userspace has set
+ * their context. This can only happen if an error occurs on a
+ * new connection before the user accepts it. This is okay,
+ * since the accept will just fail later.
+ */
+ kfree(uevent);
+ goto out;
+ }
+
+ list_add_tail(&uevent->list, &ctx->file->event_list);
+ wake_up_interruptible(&ctx->file->poll_wait);
+out:
+ mutex_unlock(&ctx->file->mut);
+ return ret;
+}
+
+static ssize_t ucma_get_event(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct ucma_context *ctx;
+ struct rdma_ucm_get_event cmd;
+ struct ucma_event *uevent;
+ int ret = 0;
+
+ if (out_len < sizeof uevent->resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ mutex_lock(&file->mut);
+ while (list_empty(&file->event_list)) {
+ mutex_unlock(&file->mut);
+
+ if (file->filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ if (wait_event_interruptible(file->poll_wait,
+ !list_empty(&file->event_list)))
+ return -ERESTARTSYS;
+
+ mutex_lock(&file->mut);
+ }
+
+ uevent = list_entry(file->event_list.next, struct ucma_event, list);
+
+ if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST) {
+ ctx = ucma_alloc_ctx(file);
+ if (!ctx) {
+ ret = -ENOMEM;
+ goto done;
+ }
+ uevent->ctx->backlog++;
+ ctx->cm_id = uevent->cm_id;
+ ctx->cm_id->context = ctx;
+ uevent->resp.id = ctx->id;
+ }
+
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &uevent->resp, sizeof uevent->resp)) {
+ ret = -EFAULT;
+ goto done;
+ }
+
+ list_del(&uevent->list);
+ uevent->ctx->events_reported++;
+ if (uevent->mc)
+ uevent->mc->events_reported++;
+ kfree(uevent);
+done:
+ mutex_unlock(&file->mut);
+ return ret;
+}
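+
+/*
+ * Event delivery model: userspace typically poll()s the file (see ucma_poll()
+ * below) and then issues RDMA_USER_CM_CMD_GET_EVENT to dequeue one event at a
+ * time. On a non-blocking fd an empty queue returns -EAGAIN; otherwise the
+ * reader sleeps until ucma_event_handler() queues an event and wakes
+ * poll_wait.
+ */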
+
+static int ucma_get_qp_type(struct rdma_ucm_create_id *cmd, enum ib_qp_type *qp_type)
+{
+ switch (cmd->ps) {
+ case RDMA_PS_TCP:
+ *qp_type = IB_QPT_RC;
+ return 0;
+ case RDMA_PS_UDP:
+ case RDMA_PS_IPOIB:
+ *qp_type = IB_QPT_UD;
+ return 0;
+ case RDMA_PS_IB:
+ *qp_type = cmd->qp_type;
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
+static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_create_id cmd;
+ struct rdma_ucm_create_id_resp resp;
+ struct ucma_context *ctx;
+ enum ib_qp_type qp_type;
+ int ret;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ret = ucma_get_qp_type(&cmd, &qp_type);
+ if (ret)
+ return ret;
+
+ mutex_lock(&file->mut);
+ ctx = ucma_alloc_ctx(file);
+ mutex_unlock(&file->mut);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->uid = cmd.uid;
+ ctx->cm_id = rdma_create_id(ucma_event_handler, ctx, cmd.ps, qp_type);
+ if (IS_ERR(ctx->cm_id)) {
+ ret = PTR_ERR(ctx->cm_id);
+ goto err1;
+ }
+
+ resp.id = ctx->id;
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp))) {
+ ret = -EFAULT;
+ goto err2;
+ }
+ return 0;
+
+err2:
+ rdma_destroy_id(ctx->cm_id);
+err1:
+ mutex_lock(&mut);
+ idr_remove(&ctx_idr, ctx->id);
+ mutex_unlock(&mut);
+ kfree(ctx);
+ return ret;
+}
+
+static void ucma_cleanup_multicast(struct ucma_context *ctx)
+{
+ struct ucma_multicast *mc, *tmp;
+
+ mutex_lock(&mut);
+ list_for_each_entry_safe(mc, tmp, &ctx->mc_list, list) {
+ list_del(&mc->list);
+ idr_remove(&multicast_idr, mc->id);
+ kfree(mc);
+ }
+ mutex_unlock(&mut);
+}
+
+static void ucma_cleanup_mc_events(struct ucma_multicast *mc)
+{
+ struct ucma_event *uevent, *tmp;
+
+ list_for_each_entry_safe(uevent, tmp, &mc->ctx->file->event_list, list) {
+ if (uevent->mc != mc)
+ continue;
+
+ list_del(&uevent->list);
+ kfree(uevent);
+ }
+}
+
+/*
+ * We cannot hold file->mut when calling rdma_destroy_id() or we can
+ * deadlock. We also acquire file->mut in ucma_event_handler(), and
+ * rdma_destroy_id() will wait until all callbacks have completed.
+ */
+static int ucma_free_ctx(struct ucma_context *ctx)
+{
+ int events_reported;
+ struct ucma_event *uevent, *tmp;
+ LIST_HEAD(list);
+
+ /* No new events will be generated after destroying the id. */
+ rdma_destroy_id(ctx->cm_id);
+
+ ucma_cleanup_multicast(ctx);
+
+ /* Cleanup events not yet reported to the user. */
+ mutex_lock(&ctx->file->mut);
+ list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) {
+ if (uevent->ctx == ctx)
+ list_move_tail(&uevent->list, &list);
+ }
+ list_del(&ctx->list);
+ mutex_unlock(&ctx->file->mut);
+
+ list_for_each_entry_safe(uevent, tmp, &list, list) {
+ list_del(&uevent->list);
+ if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST)
+ rdma_destroy_id(uevent->cm_id);
+ kfree(uevent);
+ }
+
+ events_reported = ctx->events_reported;
+ kfree(ctx);
+ return events_reported;
+}
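+
+/*
+ * Sketch of the interleaving that the comment above guards against
+ * (illustrative only, not part of the original source):
+ *
+ *   thread A (ucma_free_ctx)           thread B (cm event callback)
+ *   ------------------------           ----------------------------
+ *   mutex_lock(&ctx->file->mut)
+ *                                      ucma_event_handler()
+ *                                        mutex_lock(&ctx->file->mut) blocks
+ *   rdma_destroy_id(ctx->cm_id)
+ *     waits for B's callback to return -> neither thread can make progress
+ *
+ * Hence rdma_destroy_id() is called first, without file->mut held.
+ */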
+
+static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_destroy_id cmd;
+ struct rdma_ucm_destroy_id_resp resp;
+ struct ucma_context *ctx;
+ int ret = 0;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ mutex_lock(&mut);
+ ctx = _ucma_find_context(cmd.id, file);
+ if (!IS_ERR(ctx))
+ idr_remove(&ctx_idr, ctx->id);
+ mutex_unlock(&mut);
+
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ucma_put_ctx(ctx);
+ wait_for_completion(&ctx->comp);
+ resp.events_reported = ucma_free_ctx(ctx);
+
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static ssize_t ucma_bind_ip(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_bind_ip cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ret = rdma_bind_addr(ctx->cm_id, (struct sockaddr *) &cmd.addr);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_bind(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_bind cmd;
+ struct sockaddr *addr;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ addr = (struct sockaddr *) &cmd.addr;
+ if (cmd.reserved || !cmd.addr_size || (cmd.addr_size != rdma_addr_size(addr)))
+ return -EINVAL;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ret = rdma_bind_addr(ctx->cm_id, addr);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_resolve_ip(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_resolve_ip cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ret = rdma_resolve_addr(ctx->cm_id, (struct sockaddr *) &cmd.src_addr,
+ (struct sockaddr *) &cmd.dst_addr,
+ cmd.timeout_ms);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_resolve_addr(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_resolve_addr cmd;
+ struct sockaddr *src, *dst;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ src = (struct sockaddr *) &cmd.src_addr;
+ dst = (struct sockaddr *) &cmd.dst_addr;
+ if (cmd.reserved || (cmd.src_size && (cmd.src_size != rdma_addr_size(src))) ||
+ !cmd.dst_size || (cmd.dst_size != rdma_addr_size(dst)))
+ return -EINVAL;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ret = rdma_resolve_addr(ctx->cm_id, src, dst, cmd.timeout_ms);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_resolve_route(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_resolve_route cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ret = rdma_resolve_route(ctx->cm_id, cmd.timeout_ms);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static void ucma_copy_ib_route(struct rdma_ucm_query_route_resp *resp,
+ struct rdma_route *route)
+{
+ struct rdma_dev_addr *dev_addr;
+
+ resp->num_paths = route->num_paths;
+ switch (route->num_paths) {
+ case 0:
+ dev_addr = &route->addr.dev_addr;
+ rdma_addr_get_dgid(dev_addr,
+ (union ib_gid *) &resp->ib_route[0].dgid);
+ rdma_addr_get_sgid(dev_addr,
+ (union ib_gid *) &resp->ib_route[0].sgid);
+ resp->ib_route[0].pkey = cpu_to_be16(ib_addr_get_pkey(dev_addr));
+ break;
+ case 2:
+ ib_copy_path_rec_to_user(&resp->ib_route[1],
+ &route->path_rec[1]);
+ /* fall through */
+ case 1:
+ ib_copy_path_rec_to_user(&resp->ib_route[0],
+ &route->path_rec[0]);
+ break;
+ default:
+ break;
+ }
+}
+
+static void ucma_copy_iboe_route(struct rdma_ucm_query_route_resp *resp,
+ struct rdma_route *route)
+{
+ resp->num_paths = route->num_paths;
+ switch (route->num_paths) {
+ case 0:
+ rdma_ip2gid((struct sockaddr *)&route->addr.dst_addr,
+ (union ib_gid *)&resp->ib_route[0].dgid);
+ rdma_ip2gid((struct sockaddr *)&route->addr.src_addr,
+ (union ib_gid *)&resp->ib_route[0].sgid);
+ resp->ib_route[0].pkey = cpu_to_be16(0xffff);
+ break;
+ case 2:
+ ib_copy_path_rec_to_user(&resp->ib_route[1],
+ &route->path_rec[1]);
+ /* fall through */
+ case 1:
+ ib_copy_path_rec_to_user(&resp->ib_route[0],
+ &route->path_rec[0]);
+ break;
+ default:
+ break;
+ }
+}
+
+static void ucma_copy_iw_route(struct rdma_ucm_query_route_resp *resp,
+ struct rdma_route *route)
+{
+ struct rdma_dev_addr *dev_addr;
+
+ dev_addr = &route->addr.dev_addr;
+ rdma_addr_get_dgid(dev_addr, (union ib_gid *) &resp->ib_route[0].dgid);
+ rdma_addr_get_sgid(dev_addr, (union ib_gid *) &resp->ib_route[0].sgid);
+}
+
+static ssize_t ucma_query_route(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_query cmd;
+ struct rdma_ucm_query_route_resp resp;
+ struct ucma_context *ctx;
+ struct sockaddr *addr;
+ int ret = 0;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ memset(&resp, 0, sizeof resp);
+ addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr;
+ memcpy(&resp.src_addr, addr, addr->sa_family == AF_INET ?
+ sizeof(struct sockaddr_in) :
+ sizeof(struct sockaddr_in6));
+ addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr;
+ memcpy(&resp.dst_addr, addr, addr->sa_family == AF_INET ?
+ sizeof(struct sockaddr_in) :
+ sizeof(struct sockaddr_in6));
+ if (!ctx->cm_id->device)
+ goto out;
+
+ resp.node_guid = (__force __u64) ctx->cm_id->device->node_guid;
+ resp.port_num = ctx->cm_id->port_num;
+ switch (rdma_node_get_transport(ctx->cm_id->device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ switch (rdma_port_get_link_layer(ctx->cm_id->device,
+ ctx->cm_id->port_num)) {
+ case IB_LINK_LAYER_INFINIBAND:
+ ucma_copy_ib_route(&resp, &ctx->cm_id->route);
+ break;
+ case IB_LINK_LAYER_ETHERNET:
+ ucma_copy_iboe_route(&resp, &ctx->cm_id->route);
+ break;
+ default:
+ break;
+ }
+ break;
+ case RDMA_TRANSPORT_IWARP:
+ ucma_copy_iw_route(&resp, &ctx->cm_id->route);
+ break;
+ default:
+ break;
+ }
+
+out:
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp)))
+ ret = -EFAULT;
+
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static void ucma_query_device_addr(struct rdma_cm_id *cm_id,
+ struct rdma_ucm_query_addr_resp *resp)
+{
+ if (!cm_id->device)
+ return;
+
+ resp->node_guid = (__force __u64) cm_id->device->node_guid;
+ resp->port_num = cm_id->port_num;
+ resp->pkey = (__force __u16) cpu_to_be16(
+ ib_addr_get_pkey(&cm_id->route.addr.dev_addr));
+}
+
+static ssize_t ucma_query_addr(struct ucma_context *ctx,
+ void __user *response, int out_len)
+{
+ struct rdma_ucm_query_addr_resp resp;
+ struct sockaddr *addr;
+ int ret = 0;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ memset(&resp, 0, sizeof resp);
+
+ addr = (struct sockaddr *) &ctx->cm_id->route.addr.src_addr;
+ resp.src_size = rdma_addr_size(addr);
+ memcpy(&resp.src_addr, addr, resp.src_size);
+
+ addr = (struct sockaddr *) &ctx->cm_id->route.addr.dst_addr;
+ resp.dst_size = rdma_addr_size(addr);
+ memcpy(&resp.dst_addr, addr, resp.dst_size);
+
+ ucma_query_device_addr(ctx->cm_id, &resp);
+
+ if (copy_to_user(response, &resp, sizeof(resp)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static ssize_t ucma_query_path(struct ucma_context *ctx,
+ void __user *response, int out_len)
+{
+ struct rdma_ucm_query_path_resp *resp;
+ int i, ret = 0;
+
+ if (out_len < sizeof(*resp))
+ return -ENOSPC;
+
+ resp = kzalloc(out_len, GFP_KERNEL);
+ if (!resp)
+ return -ENOMEM;
+
+ resp->num_paths = ctx->cm_id->route.num_paths;
+ for (i = 0, out_len -= sizeof(*resp);
+ i < resp->num_paths && out_len > sizeof(struct ib_path_rec_data);
+ i++, out_len -= sizeof(struct ib_path_rec_data)) {
+
+ resp->path_data[i].flags = IB_PATH_GMP | IB_PATH_PRIMARY |
+ IB_PATH_BIDIRECTIONAL;
+ ib_sa_pack_path(&ctx->cm_id->route.path_rec[i],
+ &resp->path_data[i].path_rec);
+ }
+
+ if (copy_to_user(response, resp,
+ sizeof(*resp) + (i * sizeof(struct ib_path_rec_data))))
+ ret = -EFAULT;
+
+ kfree(resp);
+ return ret;
+}
+
+static ssize_t ucma_query_gid(struct ucma_context *ctx,
+ void __user *response, int out_len)
+{
+ struct rdma_ucm_query_addr_resp resp;
+ struct sockaddr_ib *addr;
+ int ret = 0;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ memset(&resp, 0, sizeof resp);
+
+ ucma_query_device_addr(ctx->cm_id, &resp);
+
+ addr = (struct sockaddr_ib *) &resp.src_addr;
+ resp.src_size = sizeof(*addr);
+ if (ctx->cm_id->route.addr.src_addr.ss_family == AF_IB) {
+ memcpy(addr, &ctx->cm_id->route.addr.src_addr, resp.src_size);
+ } else {
+ addr->sib_family = AF_IB;
+ addr->sib_pkey = (__force __be16) resp.pkey;
+ rdma_addr_get_sgid(&ctx->cm_id->route.addr.dev_addr,
+ (union ib_gid *) &addr->sib_addr);
+ addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *)
+ &ctx->cm_id->route.addr.src_addr);
+ }
+
+ addr = (struct sockaddr_ib *) &resp.dst_addr;
+ resp.dst_size = sizeof(*addr);
+ if (ctx->cm_id->route.addr.dst_addr.ss_family == AF_IB) {
+ memcpy(addr, &ctx->cm_id->route.addr.dst_addr, resp.dst_size);
+ } else {
+ addr->sib_family = AF_IB;
+ addr->sib_pkey = (__force __be16) resp.pkey;
+ rdma_addr_get_dgid(&ctx->cm_id->route.addr.dev_addr,
+ (union ib_gid *) &addr->sib_addr);
+ addr->sib_sid = rdma_get_service_id(ctx->cm_id, (struct sockaddr *)
+ &ctx->cm_id->route.addr.dst_addr);
+ }
+
+ if (copy_to_user(response, &resp, sizeof(resp)))
+ ret = -EFAULT;
+
+ return ret;
+}
+
+static ssize_t ucma_query(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_query cmd;
+ struct ucma_context *ctx;
+ void __user *response;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ response = (void __user *)(unsigned long) cmd.response;
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ switch (cmd.option) {
+ case RDMA_USER_CM_QUERY_ADDR:
+ ret = ucma_query_addr(ctx, response, out_len);
+ break;
+ case RDMA_USER_CM_QUERY_PATH:
+ ret = ucma_query_path(ctx, response, out_len);
+ break;
+ case RDMA_USER_CM_QUERY_GID:
+ ret = ucma_query_gid(ctx, response, out_len);
+ break;
+ default:
+ ret = -ENOSYS;
+ break;
+ }
+
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static void ucma_copy_conn_param(struct rdma_cm_id *id,
+ struct rdma_conn_param *dst,
+ struct rdma_ucm_conn_param *src)
+{
+ dst->private_data = src->private_data;
+ dst->private_data_len = src->private_data_len;
+	dst->responder_resources = src->responder_resources;
+ dst->initiator_depth = src->initiator_depth;
+ dst->flow_control = src->flow_control;
+ dst->retry_count = src->retry_count;
+ dst->rnr_retry_count = src->rnr_retry_count;
+ dst->srq = src->srq;
+ dst->qp_num = src->qp_num;
+ dst->qkey = (id->route.addr.src_addr.ss_family == AF_IB) ? src->qkey : 0;
+}
+
+static ssize_t ucma_connect(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_connect cmd;
+ struct rdma_conn_param conn_param;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ if (!cmd.conn_param.valid)
+ return -EINVAL;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param);
+ ret = rdma_connect(ctx->cm_id, &conn_param);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_listen(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_listen cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ctx->backlog = cmd.backlog > 0 && cmd.backlog < max_backlog ?
+ cmd.backlog : max_backlog;
+ ret = rdma_listen(ctx->cm_id, ctx->backlog);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_accept(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_accept cmd;
+ struct rdma_conn_param conn_param;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ if (cmd.conn_param.valid) {
+ ucma_copy_conn_param(ctx->cm_id, &conn_param, &cmd.conn_param);
+ mutex_lock(&file->mut);
+ ret = rdma_accept(ctx->cm_id, &conn_param);
+ if (!ret)
+ ctx->uid = cmd.uid;
+ mutex_unlock(&file->mut);
+ } else
+ ret = rdma_accept(ctx->cm_id, NULL);
+
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_reject(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_reject cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ret = rdma_reject(ctx->cm_id, cmd.private_data, cmd.private_data_len);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_disconnect(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_disconnect cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ret = rdma_disconnect(ctx->cm_id);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_init_qp_attr(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_init_qp_attr cmd;
+ struct ib_uverbs_qp_attr resp;
+ struct ucma_context *ctx;
+ struct ib_qp_attr qp_attr;
+ int ret;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ resp.qp_attr_mask = 0;
+ memset(&qp_attr, 0, sizeof qp_attr);
+ qp_attr.qp_state = cmd.qp_state;
+ ret = rdma_init_qp_attr(ctx->cm_id, &qp_attr, &resp.qp_attr_mask);
+ if (ret)
+ goto out;
+
+ ib_copy_qp_attr_to_user(&resp, &qp_attr);
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp)))
+ ret = -EFAULT;
+
+out:
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static int ucma_set_option_id(struct ucma_context *ctx, int optname,
+ void *optval, size_t optlen)
+{
+ int ret = 0;
+
+ switch (optname) {
+ case RDMA_OPTION_ID_TOS:
+ if (optlen != sizeof(u8)) {
+ ret = -EINVAL;
+ break;
+ }
+ rdma_set_service_type(ctx->cm_id, *((u8 *) optval));
+ break;
+ case RDMA_OPTION_ID_REUSEADDR:
+ if (optlen != sizeof(int)) {
+ ret = -EINVAL;
+ break;
+ }
+ ret = rdma_set_reuseaddr(ctx->cm_id, *((int *) optval) ? 1 : 0);
+ break;
+ case RDMA_OPTION_ID_AFONLY:
+ if (optlen != sizeof(int)) {
+ ret = -EINVAL;
+ break;
+ }
+ ret = rdma_set_afonly(ctx->cm_id, *((int *) optval) ? 1 : 0);
+ break;
+ default:
+ ret = -ENOSYS;
+ }
+
+ return ret;
+}
+
+static int ucma_set_ib_path(struct ucma_context *ctx,
+ struct ib_path_rec_data *path_data, size_t optlen)
+{
+ struct ib_sa_path_rec sa_path;
+ struct rdma_cm_event event;
+ int ret;
+
+ if (optlen % sizeof(*path_data))
+ return -EINVAL;
+
+ for (; optlen; optlen -= sizeof(*path_data), path_data++) {
+ if (path_data->flags == (IB_PATH_GMP | IB_PATH_PRIMARY |
+ IB_PATH_BIDIRECTIONAL))
+ break;
+ }
+
+ if (!optlen)
+ return -EINVAL;
+
+ memset(&sa_path, 0, sizeof(sa_path));
+ sa_path.vlan_id = 0xffff;
+
+ ib_sa_unpack_path(path_data->path_rec, &sa_path);
+ ret = rdma_set_ib_paths(ctx->cm_id, &sa_path, 1);
+ if (ret)
+ return ret;
+
+ memset(&event, 0, sizeof event);
+ event.event = RDMA_CM_EVENT_ROUTE_RESOLVED;
+ return ucma_event_handler(ctx->cm_id, &event);
+}
+
+static int ucma_set_option_ib(struct ucma_context *ctx, int optname,
+ void *optval, size_t optlen)
+{
+ int ret;
+
+ switch (optname) {
+ case RDMA_OPTION_IB_PATH:
+ ret = ucma_set_ib_path(ctx, optval, optlen);
+ break;
+ default:
+ ret = -ENOSYS;
+ }
+
+ return ret;
+}
+
+static int ucma_set_option_level(struct ucma_context *ctx, int level,
+ int optname, void *optval, size_t optlen)
+{
+ int ret;
+
+ switch (level) {
+ case RDMA_OPTION_ID:
+ ret = ucma_set_option_id(ctx, optname, optval, optlen);
+ break;
+ case RDMA_OPTION_IB:
+ ret = ucma_set_option_ib(ctx, optname, optval, optlen);
+ break;
+ default:
+ ret = -ENOSYS;
+ }
+
+ return ret;
+}
+
+static ssize_t ucma_set_option(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_set_option cmd;
+ struct ucma_context *ctx;
+ void *optval;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ optval = memdup_user((void __user *) (unsigned long) cmd.optval,
+ cmd.optlen);
+ if (IS_ERR(optval)) {
+ ret = PTR_ERR(optval);
+ goto out;
+ }
+
+ ret = ucma_set_option_level(ctx, cmd.level, cmd.optname, optval,
+ cmd.optlen);
+ kfree(optval);
+
+out:
+ ucma_put_ctx(ctx);
+ return ret;
+}
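+
+/*
+ * Illustrative sketch of how userspace frames a set-option request (hedged
+ * example, not part of the original source; applications normally go through
+ * librdmacm's rdma_set_option()). Setting the type of service would look
+ * roughly like:
+ *
+ *   struct rdma_ucm_set_option cmd;
+ *   u8 tos = 0x10;
+ *
+ *   cmd.id      = ctx_id;              // id returned by CREATE_ID
+ *   cmd.level   = RDMA_OPTION_ID;
+ *   cmd.optname = RDMA_OPTION_ID_TOS;
+ *   cmd.optval  = (uintptr_t) &tos;    // user pointer, copied via memdup_user()
+ *   cmd.optlen  = sizeof(tos);         // must match the check in ucma_set_option_id()
+ */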
+
+static ssize_t ucma_notify(struct ucma_file *file, const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_notify cmd;
+ struct ucma_context *ctx;
+ int ret;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ ctx = ucma_get_ctx(file, cmd.id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ ret = rdma_notify(ctx->cm_id, (enum ib_event_type) cmd.event);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_process_join(struct ucma_file *file,
+ struct rdma_ucm_join_mcast *cmd, int out_len)
+{
+ struct rdma_ucm_create_id_resp resp;
+ struct ucma_context *ctx;
+ struct ucma_multicast *mc;
+ struct sockaddr *addr;
+ int ret;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ addr = (struct sockaddr *) &cmd->addr;
+ if (cmd->reserved || !cmd->addr_size || (cmd->addr_size != rdma_addr_size(addr)))
+ return -EINVAL;
+
+ ctx = ucma_get_ctx(file, cmd->id);
+ if (IS_ERR(ctx))
+ return PTR_ERR(ctx);
+
+ mutex_lock(&file->mut);
+ mc = ucma_alloc_multicast(ctx);
+ if (!mc) {
+ ret = -ENOMEM;
+ goto err1;
+ }
+
+ mc->uid = cmd->uid;
+ memcpy(&mc->addr, addr, cmd->addr_size);
+ ret = rdma_join_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr, mc);
+ if (ret)
+ goto err2;
+
+ resp.id = mc->id;
+ if (copy_to_user((void __user *)(unsigned long) cmd->response,
+ &resp, sizeof(resp))) {
+ ret = -EFAULT;
+ goto err3;
+ }
+
+ mutex_unlock(&file->mut);
+ ucma_put_ctx(ctx);
+ return 0;
+
+err3:
+ rdma_leave_multicast(ctx->cm_id, (struct sockaddr *) &mc->addr);
+ ucma_cleanup_mc_events(mc);
+err2:
+ mutex_lock(&mut);
+ idr_remove(&multicast_idr, mc->id);
+ mutex_unlock(&mut);
+ list_del(&mc->list);
+ kfree(mc);
+err1:
+ mutex_unlock(&file->mut);
+ ucma_put_ctx(ctx);
+ return ret;
+}
+
+static ssize_t ucma_join_ip_multicast(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_join_ip_mcast cmd;
+ struct rdma_ucm_join_mcast join_cmd;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ join_cmd.response = cmd.response;
+ join_cmd.uid = cmd.uid;
+ join_cmd.id = cmd.id;
+ join_cmd.addr_size = rdma_addr_size((struct sockaddr *) &cmd.addr);
+ join_cmd.reserved = 0;
+ memcpy(&join_cmd.addr, &cmd.addr, join_cmd.addr_size);
+
+ return ucma_process_join(file, &join_cmd, out_len);
+}
+
+static ssize_t ucma_join_multicast(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_join_mcast cmd;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ return ucma_process_join(file, &cmd, out_len);
+}
+
+static ssize_t ucma_leave_multicast(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_destroy_id cmd;
+ struct rdma_ucm_destroy_id_resp resp;
+ struct ucma_multicast *mc;
+ int ret = 0;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ mutex_lock(&mut);
+ mc = idr_find(&multicast_idr, cmd.id);
+ if (!mc)
+ mc = ERR_PTR(-ENOENT);
+ else if (mc->ctx->file != file)
+ mc = ERR_PTR(-EINVAL);
+ else {
+ idr_remove(&multicast_idr, mc->id);
+ atomic_inc(&mc->ctx->ref);
+ }
+ mutex_unlock(&mut);
+
+ if (IS_ERR(mc)) {
+ ret = PTR_ERR(mc);
+ goto out;
+ }
+
+ rdma_leave_multicast(mc->ctx->cm_id, (struct sockaddr *) &mc->addr);
+ mutex_lock(&mc->ctx->file->mut);
+ ucma_cleanup_mc_events(mc);
+ list_del(&mc->list);
+ mutex_unlock(&mc->ctx->file->mut);
+
+ ucma_put_ctx(mc->ctx);
+ resp.events_reported = mc->events_reported;
+ kfree(mc);
+
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp)))
+ ret = -EFAULT;
+out:
+ return ret;
+}
+
+static void ucma_lock_files(struct ucma_file *file1, struct ucma_file *file2)
+{
+	/* Acquire the two mutexes in a fixed (pointer) order to prevent deadlock. */
+ if (file1 < file2) {
+ mutex_lock(&file1->mut);
+ mutex_lock(&file2->mut);
+ } else {
+ mutex_lock(&file2->mut);
+ mutex_lock(&file1->mut);
+ }
+}
+
+static void ucma_unlock_files(struct ucma_file *file1, struct ucma_file *file2)
+{
+ if (file1 < file2) {
+ mutex_unlock(&file2->mut);
+ mutex_unlock(&file1->mut);
+ } else {
+ mutex_unlock(&file1->mut);
+ mutex_unlock(&file2->mut);
+ }
+}
+
+static void ucma_move_events(struct ucma_context *ctx, struct ucma_file *file)
+{
+ struct ucma_event *uevent, *tmp;
+
+ list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list)
+ if (uevent->ctx == ctx)
+ list_move_tail(&uevent->list, &file->event_list);
+}
+
+static ssize_t ucma_migrate_id(struct ucma_file *new_file,
+ const char __user *inbuf,
+ int in_len, int out_len)
+{
+ struct rdma_ucm_migrate_id cmd;
+ struct rdma_ucm_migrate_resp resp;
+ struct ucma_context *ctx;
+ struct fd f;
+ struct ucma_file *cur_file;
+ int ret = 0;
+
+ if (copy_from_user(&cmd, inbuf, sizeof(cmd)))
+ return -EFAULT;
+
+ /* Get current fd to protect against it being closed */
+ f = fdget(cmd.fd);
+ if (!f.file)
+ return -ENOENT;
+
+ /* Validate current fd and prevent destruction of id. */
+ ctx = ucma_get_ctx(f.file->private_data, cmd.id);
+ if (IS_ERR(ctx)) {
+ ret = PTR_ERR(ctx);
+ goto file_put;
+ }
+
+ cur_file = ctx->file;
+ if (cur_file == new_file) {
+ resp.events_reported = ctx->events_reported;
+ goto response;
+ }
+
+ /*
+	 * Migrate events between fds, maintaining order and preventing new
+	 * events from being queued ahead of the existing ones.
+ */
+ ucma_lock_files(cur_file, new_file);
+ mutex_lock(&mut);
+
+ list_move_tail(&ctx->list, &new_file->ctx_list);
+ ucma_move_events(ctx, new_file);
+ ctx->file = new_file;
+ resp.events_reported = ctx->events_reported;
+
+ mutex_unlock(&mut);
+ ucma_unlock_files(cur_file, new_file);
+
+response:
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp)))
+ ret = -EFAULT;
+
+ ucma_put_ctx(ctx);
+file_put:
+ fdput(f);
+ return ret;
+}
+
+static ssize_t (*ucma_cmd_table[])(struct ucma_file *file,
+ const char __user *inbuf,
+ int in_len, int out_len) = {
+ [RDMA_USER_CM_CMD_CREATE_ID] = ucma_create_id,
+ [RDMA_USER_CM_CMD_DESTROY_ID] = ucma_destroy_id,
+ [RDMA_USER_CM_CMD_BIND_IP] = ucma_bind_ip,
+ [RDMA_USER_CM_CMD_RESOLVE_IP] = ucma_resolve_ip,
+ [RDMA_USER_CM_CMD_RESOLVE_ROUTE] = ucma_resolve_route,
+ [RDMA_USER_CM_CMD_QUERY_ROUTE] = ucma_query_route,
+ [RDMA_USER_CM_CMD_CONNECT] = ucma_connect,
+ [RDMA_USER_CM_CMD_LISTEN] = ucma_listen,
+ [RDMA_USER_CM_CMD_ACCEPT] = ucma_accept,
+ [RDMA_USER_CM_CMD_REJECT] = ucma_reject,
+ [RDMA_USER_CM_CMD_DISCONNECT] = ucma_disconnect,
+ [RDMA_USER_CM_CMD_INIT_QP_ATTR] = ucma_init_qp_attr,
+ [RDMA_USER_CM_CMD_GET_EVENT] = ucma_get_event,
+ [RDMA_USER_CM_CMD_GET_OPTION] = NULL,
+ [RDMA_USER_CM_CMD_SET_OPTION] = ucma_set_option,
+ [RDMA_USER_CM_CMD_NOTIFY] = ucma_notify,
+ [RDMA_USER_CM_CMD_JOIN_IP_MCAST] = ucma_join_ip_multicast,
+ [RDMA_USER_CM_CMD_LEAVE_MCAST] = ucma_leave_multicast,
+ [RDMA_USER_CM_CMD_MIGRATE_ID] = ucma_migrate_id,
+ [RDMA_USER_CM_CMD_QUERY] = ucma_query,
+ [RDMA_USER_CM_CMD_BIND] = ucma_bind,
+ [RDMA_USER_CM_CMD_RESOLVE_ADDR] = ucma_resolve_addr,
+ [RDMA_USER_CM_CMD_JOIN_MCAST] = ucma_join_multicast
+};
+
+static ssize_t ucma_write(struct file *filp, const char __user *buf,
+ size_t len, loff_t *pos)
+{
+ struct ucma_file *file = filp->private_data;
+ struct rdma_ucm_cmd_hdr hdr;
+ ssize_t ret;
+
+ if (len < sizeof(hdr))
+ return -EINVAL;
+
+ if (copy_from_user(&hdr, buf, sizeof(hdr)))
+ return -EFAULT;
+
+ if (hdr.cmd >= ARRAY_SIZE(ucma_cmd_table))
+ return -EINVAL;
+
+ if (hdr.in + sizeof(hdr) > len)
+ return -EINVAL;
+
+ if (!ucma_cmd_table[hdr.cmd])
+ return -ENOSYS;
+
+ ret = ucma_cmd_table[hdr.cmd](file, buf + sizeof(hdr), hdr.in, hdr.out);
+ if (!ret)
+ ret = len;
+
+ return ret;
+}
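+
+/*
+ * Illustrative sketch of the write() ABI implemented above (hedged example,
+ * not part of the original source; real applications use librdmacm). Each
+ * request is a struct rdma_ucm_cmd_hdr immediately followed by the command
+ * payload; hdr.in is the payload length and hdr.out the size of the response
+ * buffer that the payload's response field points at:
+ *
+ *   struct rdma_ucm_cmd_hdr hdr = {
+ *           .cmd = RDMA_USER_CM_CMD_CREATE_ID,
+ *           .in  = sizeof(struct rdma_ucm_create_id),
+ *           .out = sizeof(struct rdma_ucm_create_id_resp),
+ *   };
+ *
+ * The header and the rdma_ucm_create_id payload are copied into one buffer
+ * and written to the rdma_cm device with a single write(); the handler
+ * returns its reply through the user pointer in the payload's response field.
+ */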
+
+static unsigned int ucma_poll(struct file *filp, struct poll_table_struct *wait)
+{
+ struct ucma_file *file = filp->private_data;
+ unsigned int mask = 0;
+
+ poll_wait(filp, &file->poll_wait, wait);
+
+ if (!list_empty(&file->event_list))
+ mask = POLLIN | POLLRDNORM;
+
+ return mask;
+}
+
+/*
+ * ucma_open() does not need the BKL:
+ *
+ * - no global state is referred to;
+ * - there is no ioctl method to race against;
+ * - no further module initialization is required for open to work
+ * after the device is registered.
+ */
+static int ucma_open(struct inode *inode, struct file *filp)
+{
+ struct ucma_file *file;
+
+ file = kmalloc(sizeof *file, GFP_KERNEL);
+ if (!file)
+ return -ENOMEM;
+
+ INIT_LIST_HEAD(&file->event_list);
+ INIT_LIST_HEAD(&file->ctx_list);
+ init_waitqueue_head(&file->poll_wait);
+ mutex_init(&file->mut);
+
+ filp->private_data = file;
+ file->filp = filp;
+
+ return nonseekable_open(inode, filp);
+}
+
+static int ucma_close(struct inode *inode, struct file *filp)
+{
+ struct ucma_file *file = filp->private_data;
+ struct ucma_context *ctx, *tmp;
+
+ mutex_lock(&file->mut);
+ list_for_each_entry_safe(ctx, tmp, &file->ctx_list, list) {
+ mutex_unlock(&file->mut);
+
+ mutex_lock(&mut);
+ idr_remove(&ctx_idr, ctx->id);
+ mutex_unlock(&mut);
+
+ ucma_free_ctx(ctx);
+ mutex_lock(&file->mut);
+ }
+ mutex_unlock(&file->mut);
+ kfree(file);
+ return 0;
+}
+
+static const struct file_operations ucma_fops = {
+ .owner = THIS_MODULE,
+ .open = ucma_open,
+ .release = ucma_close,
+ .write = ucma_write,
+ .poll = ucma_poll,
+ .llseek = no_llseek,
+};
+
+static struct miscdevice ucma_misc = {
+ .minor = MISC_DYNAMIC_MINOR,
+ .name = "rdma_cm",
+ .nodename = "infiniband/rdma_cm",
+ .mode = 0666,
+ .fops = &ucma_fops,
+};
+
+static ssize_t show_abi_version(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%d\n", RDMA_USER_CM_ABI_VERSION);
+}
+static DEVICE_ATTR(abi_version, S_IRUGO, show_abi_version, NULL);
+
+static int __init ucma_init(void)
+{
+ int ret;
+
+ ret = misc_register(&ucma_misc);
+ if (ret)
+ return ret;
+
+ ret = device_create_file(ucma_misc.this_device, &dev_attr_abi_version);
+ if (ret) {
+ printk(KERN_ERR "rdma_ucm: couldn't create abi_version attr\n");
+ goto err1;
+ }
+
+ ucma_ctl_table_hdr = register_net_sysctl(&init_net, "net/rdma_ucm", ucma_ctl_table);
+ if (!ucma_ctl_table_hdr) {
+ printk(KERN_ERR "rdma_ucm: couldn't register sysctl paths\n");
+ ret = -ENOMEM;
+ goto err2;
+ }
+ return 0;
+err2:
+ device_remove_file(ucma_misc.this_device, &dev_attr_abi_version);
+err1:
+ misc_deregister(&ucma_misc);
+ return ret;
+}
+
+static void __exit ucma_cleanup(void)
+{
+ unregister_net_sysctl_table(ucma_ctl_table_hdr);
+ device_remove_file(ucma_misc.this_device, &dev_attr_abi_version);
+ misc_deregister(&ucma_misc);
+ idr_destroy(&ctx_idr);
+}
+
+module_init(ucma_init);
+module_exit(ucma_cleanup);
diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c
new file mode 100644
index 000000000..72feee620
--- /dev/null
+++ b/drivers/infiniband/core/ud_header.c
@@ -0,0 +1,414 @@
+/*
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/export.h>
+#include <linux/if_ether.h>
+
+#include <rdma/ib_pack.h>
+
+#define STRUCT_FIELD(header, field) \
+ .struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field), \
+ .struct_size_bytes = sizeof ((struct ib_unpacked_ ## header *) 0)->field, \
+ .field_name = #header ":" #field
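+
+/*
+ * Each entry in the tables below pairs a member of the corresponding
+ * struct ib_unpacked_* structure with its position on the wire. For example
+ * the bth_table entry generated by STRUCT_FIELD(bth, pkey) tells ib_pack()
+ * and ib_unpack() that the pkey member occupies bits 16..31 of 32-bit word 0
+ * of the BTH.
+ */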
+
+static const struct ib_field lrh_table[] = {
+ { STRUCT_FIELD(lrh, virtual_lane),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 4 },
+ { STRUCT_FIELD(lrh, link_version),
+ .offset_words = 0,
+ .offset_bits = 4,
+ .size_bits = 4 },
+ { STRUCT_FIELD(lrh, service_level),
+ .offset_words = 0,
+ .offset_bits = 8,
+ .size_bits = 4 },
+ { RESERVED,
+ .offset_words = 0,
+ .offset_bits = 12,
+ .size_bits = 2 },
+ { STRUCT_FIELD(lrh, link_next_header),
+ .offset_words = 0,
+ .offset_bits = 14,
+ .size_bits = 2 },
+ { STRUCT_FIELD(lrh, destination_lid),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { RESERVED,
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 5 },
+ { STRUCT_FIELD(lrh, packet_length),
+ .offset_words = 1,
+ .offset_bits = 5,
+ .size_bits = 11 },
+ { STRUCT_FIELD(lrh, source_lid),
+ .offset_words = 1,
+ .offset_bits = 16,
+ .size_bits = 16 }
+};
+
+static const struct ib_field eth_table[] = {
+ { STRUCT_FIELD(eth, dmac_h),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { STRUCT_FIELD(eth, dmac_l),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { STRUCT_FIELD(eth, smac_h),
+ .offset_words = 1,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { STRUCT_FIELD(eth, smac_l),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { STRUCT_FIELD(eth, type),
+ .offset_words = 3,
+ .offset_bits = 0,
+ .size_bits = 16 }
+};
+
+static const struct ib_field vlan_table[] = {
+ { STRUCT_FIELD(vlan, tag),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { STRUCT_FIELD(vlan, type),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 }
+};
+
+static const struct ib_field grh_table[] = {
+ { STRUCT_FIELD(grh, ip_version),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 4 },
+ { STRUCT_FIELD(grh, traffic_class),
+ .offset_words = 0,
+ .offset_bits = 4,
+ .size_bits = 8 },
+ { STRUCT_FIELD(grh, flow_label),
+ .offset_words = 0,
+ .offset_bits = 12,
+ .size_bits = 20 },
+ { STRUCT_FIELD(grh, payload_length),
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 16 },
+ { STRUCT_FIELD(grh, next_header),
+ .offset_words = 1,
+ .offset_bits = 16,
+ .size_bits = 8 },
+ { STRUCT_FIELD(grh, hop_limit),
+ .offset_words = 1,
+ .offset_bits = 24,
+ .size_bits = 8 },
+ { STRUCT_FIELD(grh, source_gid),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 128 },
+ { STRUCT_FIELD(grh, destination_gid),
+ .offset_words = 6,
+ .offset_bits = 0,
+ .size_bits = 128 }
+};
+
+static const struct ib_field bth_table[] = {
+ { STRUCT_FIELD(bth, opcode),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { STRUCT_FIELD(bth, solicited_event),
+ .offset_words = 0,
+ .offset_bits = 8,
+ .size_bits = 1 },
+ { STRUCT_FIELD(bth, mig_req),
+ .offset_words = 0,
+ .offset_bits = 9,
+ .size_bits = 1 },
+ { STRUCT_FIELD(bth, pad_count),
+ .offset_words = 0,
+ .offset_bits = 10,
+ .size_bits = 2 },
+ { STRUCT_FIELD(bth, transport_header_version),
+ .offset_words = 0,
+ .offset_bits = 12,
+ .size_bits = 4 },
+ { STRUCT_FIELD(bth, pkey),
+ .offset_words = 0,
+ .offset_bits = 16,
+ .size_bits = 16 },
+ { RESERVED,
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { STRUCT_FIELD(bth, destination_qpn),
+ .offset_words = 1,
+ .offset_bits = 8,
+ .size_bits = 24 },
+ { STRUCT_FIELD(bth, ack_req),
+ .offset_words = 2,
+ .offset_bits = 0,
+ .size_bits = 1 },
+ { RESERVED,
+ .offset_words = 2,
+ .offset_bits = 1,
+ .size_bits = 7 },
+ { STRUCT_FIELD(bth, psn),
+ .offset_words = 2,
+ .offset_bits = 8,
+ .size_bits = 24 }
+};
+
+static const struct ib_field deth_table[] = {
+ { STRUCT_FIELD(deth, qkey),
+ .offset_words = 0,
+ .offset_bits = 0,
+ .size_bits = 32 },
+ { RESERVED,
+ .offset_words = 1,
+ .offset_bits = 0,
+ .size_bits = 8 },
+ { STRUCT_FIELD(deth, source_qpn),
+ .offset_words = 1,
+ .offset_bits = 8,
+ .size_bits = 24 }
+};
+
+/**
+ * ib_ud_header_init - Initialize UD header structure
+ * @payload_bytes: Length of packet payload
+ * @lrh_present: specify if LRH is present
+ * @eth_present: specify if Eth header is present
+ * @vlan_present: specify if the packet is VLAN tagged
+ * @grh_present: specify if GRH is present (if non-zero, GRH will be included)
+ * @immediate_present: specify if immediate data is present
+ * @header: Structure to initialize
+ */
+void ib_ud_header_init(int payload_bytes,
+ int lrh_present,
+ int eth_present,
+ int vlan_present,
+ int grh_present,
+ int immediate_present,
+ struct ib_ud_header *header)
+{
+ memset(header, 0, sizeof *header);
+
+ if (lrh_present) {
+ u16 packet_length;
+
+ header->lrh.link_version = 0;
+ header->lrh.link_next_header =
+ grh_present ? IB_LNH_IBA_GLOBAL : IB_LNH_IBA_LOCAL;
+ packet_length = (IB_LRH_BYTES +
+ IB_BTH_BYTES +
+ IB_DETH_BYTES +
+ (grh_present ? IB_GRH_BYTES : 0) +
+ payload_bytes +
+ 4 + /* ICRC */
+ 3) / 4; /* round up */
+ header->lrh.packet_length = cpu_to_be16(packet_length);
+ }
+
+ if (vlan_present)
+ header->eth.type = cpu_to_be16(ETH_P_8021Q);
+
+ if (grh_present) {
+ header->grh.ip_version = 6;
+ header->grh.payload_length =
+ cpu_to_be16((IB_BTH_BYTES +
+ IB_DETH_BYTES +
+ payload_bytes +
+ 4 + /* ICRC */
+ 3) & ~3); /* round up */
+ header->grh.next_header = 0x1b;
+ }
+
+ if (immediate_present)
+ header->bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+ else
+ header->bth.opcode = IB_OPCODE_UD_SEND_ONLY;
+ header->bth.pad_count = (4 - payload_bytes) & 3;
+ header->bth.transport_header_version = 0;
+
+ header->lrh_present = lrh_present;
+ header->eth_present = eth_present;
+ header->vlan_present = vlan_present;
+ header->grh_present = grh_present;
+ header->immediate_present = immediate_present;
+}
+EXPORT_SYMBOL(ib_ud_header_init);
+
+/**
+ * ib_ud_header_pack - Pack UD header struct into wire format
+ * @header: UD header struct
+ * @buf: Buffer to pack into
+ *
+ * ib_ud_header_pack() packs the UD header structure @header into wire
+ * format in the buffer @buf.
+ */
+int ib_ud_header_pack(struct ib_ud_header *header,
+ void *buf)
+{
+ int len = 0;
+
+ if (header->lrh_present) {
+ ib_pack(lrh_table, ARRAY_SIZE(lrh_table),
+ &header->lrh, buf + len);
+ len += IB_LRH_BYTES;
+ }
+ if (header->eth_present) {
+ ib_pack(eth_table, ARRAY_SIZE(eth_table),
+ &header->eth, buf + len);
+ len += IB_ETH_BYTES;
+ }
+ if (header->vlan_present) {
+ ib_pack(vlan_table, ARRAY_SIZE(vlan_table),
+ &header->vlan, buf + len);
+ len += IB_VLAN_BYTES;
+ }
+ if (header->grh_present) {
+ ib_pack(grh_table, ARRAY_SIZE(grh_table),
+ &header->grh, buf + len);
+ len += IB_GRH_BYTES;
+ }
+
+ ib_pack(bth_table, ARRAY_SIZE(bth_table),
+ &header->bth, buf + len);
+ len += IB_BTH_BYTES;
+
+ ib_pack(deth_table, ARRAY_SIZE(deth_table),
+ &header->deth, buf + len);
+ len += IB_DETH_BYTES;
+
+ if (header->immediate_present) {
+ memcpy(buf + len, &header->immediate_data, sizeof header->immediate_data);
+ len += sizeof header->immediate_data;
+ }
+
+ return len;
+}
+EXPORT_SYMBOL(ib_ud_header_pack);
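+
+/*
+ * Illustrative usage sketch (hedged, not part of the original source; the
+ * structure members follow the ib_unpacked_* definitions in <rdma/ib_pack.h>,
+ * all other identifiers are placeholders). A driver building a plain IB UD
+ * send-only packet would do roughly:
+ *
+ *   struct ib_ud_header hdr;
+ *   u8 wire[IB_LRH_BYTES + IB_GRH_BYTES + IB_BTH_BYTES + IB_DETH_BYTES];
+ *   int hdr_len;
+ *
+ *   ib_ud_header_init(payload_len, 1, 0, 0, want_grh, 0, &hdr);
+ *   hdr.lrh.destination_lid = remote_lid;
+ *   hdr.lrh.source_lid      = local_lid;
+ *   hdr.bth.pkey            = pkey;
+ *   hdr.bth.destination_qpn = remote_qpn;
+ *   hdr.deth.qkey            = qkey;
+ *   hdr.deth.source_qpn      = local_qpn;
+ *   hdr_len = ib_ud_header_pack(&hdr, wire);   // bytes written to "wire"
+ */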
+
+/**
+ * ib_ud_header_unpack - Unpack UD header struct from wire format
+ * @buf: Buffer to unpack from
+ * @header: UD header struct to fill in
+ *
+ * ib_ud_header_unpack() unpacks the UD header structure @header from the
+ * wire-format buffer @buf.
+ */
+int ib_ud_header_unpack(void *buf,
+ struct ib_ud_header *header)
+{
+ ib_unpack(lrh_table, ARRAY_SIZE(lrh_table),
+ buf, &header->lrh);
+ buf += IB_LRH_BYTES;
+
+ if (header->lrh.link_version != 0) {
+ printk(KERN_WARNING "Invalid LRH.link_version %d\n",
+ header->lrh.link_version);
+ return -EINVAL;
+ }
+
+ switch (header->lrh.link_next_header) {
+ case IB_LNH_IBA_LOCAL:
+ header->grh_present = 0;
+ break;
+
+ case IB_LNH_IBA_GLOBAL:
+ header->grh_present = 1;
+ ib_unpack(grh_table, ARRAY_SIZE(grh_table),
+ buf, &header->grh);
+ buf += IB_GRH_BYTES;
+
+ if (header->grh.ip_version != 6) {
+ printk(KERN_WARNING "Invalid GRH.ip_version %d\n",
+ header->grh.ip_version);
+ return -EINVAL;
+ }
+ if (header->grh.next_header != 0x1b) {
+ printk(KERN_WARNING "Invalid GRH.next_header 0x%02x\n",
+ header->grh.next_header);
+ return -EINVAL;
+ }
+ break;
+
+ default:
+ printk(KERN_WARNING "Invalid LRH.link_next_header %d\n",
+ header->lrh.link_next_header);
+ return -EINVAL;
+ }
+
+ ib_unpack(bth_table, ARRAY_SIZE(bth_table),
+ buf, &header->bth);
+ buf += IB_BTH_BYTES;
+
+ switch (header->bth.opcode) {
+ case IB_OPCODE_UD_SEND_ONLY:
+ header->immediate_present = 0;
+ break;
+ case IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE:
+ header->immediate_present = 1;
+ break;
+ default:
+ printk(KERN_WARNING "Invalid BTH.opcode 0x%02x\n",
+ header->bth.opcode);
+ return -EINVAL;
+ }
+
+ if (header->bth.transport_header_version != 0) {
+ printk(KERN_WARNING "Invalid BTH.transport_header_version %d\n",
+ header->bth.transport_header_version);
+ return -EINVAL;
+ }
+
+ ib_unpack(deth_table, ARRAY_SIZE(deth_table),
+ buf, &header->deth);
+ buf += IB_DETH_BYTES;
+
+ if (header->immediate_present)
+ memcpy(&header->immediate_data, buf, sizeof header->immediate_data);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_ud_header_unpack);
diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c
new file mode 100644
index 000000000..38acb3cfc
--- /dev/null
+++ b/drivers/infiniband/core/umem.c
@@ -0,0 +1,365 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/sched.h>
+#include <linux/export.h>
+#include <linux/hugetlb.h>
+#include <linux/dma-attrs.h>
+#include <linux/slab.h>
+#include <rdma/ib_umem_odp.h>
+
+#include "uverbs.h"
+
+
+static void __ib_umem_release(struct ib_device *dev, struct ib_umem *umem, int dirty)
+{
+ struct scatterlist *sg;
+ struct page *page;
+ int i;
+
+ if (umem->nmap > 0)
+ ib_dma_unmap_sg(dev, umem->sg_head.sgl,
+ umem->nmap,
+ DMA_BIDIRECTIONAL);
+
+ for_each_sg(umem->sg_head.sgl, sg, umem->npages, i) {
+
+ page = sg_page(sg);
+ if (umem->writable && dirty)
+ set_page_dirty_lock(page);
+ put_page(page);
+ }
+
+ sg_free_table(&umem->sg_head);
+}
+
+/**
+ * ib_umem_get - Pin and DMA map userspace memory.
+ *
+ * If the access flags indicate ODP memory, pinning is avoided; instead, the
+ * mm is stored for future page-fault handling in conjunction with MMU
+ * notifiers.
+ *
+ * @context: userspace context to pin memory for
+ * @addr: userspace virtual address to start at
+ * @size: length of region to pin
+ * @access: IB_ACCESS_xxx flags for memory being pinned
+ * @dmasync: flush in-flight DMA when the memory region is written
+ */
+struct ib_umem *ib_umem_get(struct ib_ucontext *context, unsigned long addr,
+ size_t size, int access, int dmasync)
+{
+ struct ib_umem *umem;
+ struct page **page_list;
+ struct vm_area_struct **vma_list;
+ unsigned long locked;
+ unsigned long lock_limit;
+ unsigned long cur_base;
+ unsigned long npages;
+ int ret;
+ int i;
+ DEFINE_DMA_ATTRS(attrs);
+ struct scatterlist *sg, *sg_list_start;
+ int need_release = 0;
+
+ if (dmasync)
+ dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs);
+
+ if (!size)
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * If the combination of the addr and size requested for this memory
+ * region causes an integer overflow, return error.
+ */
+ if (((addr + size) < addr) ||
+ PAGE_ALIGN(addr + size) < (addr + size))
+ return ERR_PTR(-EINVAL);
+
+ if (!can_do_mlock())
+ return ERR_PTR(-EPERM);
+
+ umem = kzalloc(sizeof *umem, GFP_KERNEL);
+ if (!umem)
+ return ERR_PTR(-ENOMEM);
+
+ umem->context = context;
+ umem->length = size;
+ umem->address = addr;
+ umem->page_size = PAGE_SIZE;
+ umem->pid = get_task_pid(current, PIDTYPE_PID);
+ /*
+ * We ask for writable memory if any of the following
+ * access flags are set. "Local write" and "remote write"
+ * obviously require write access. "Remote atomic" can do
+ * things like fetch and add, which will modify memory, and
+ * "MW bind" can change permissions by binding a window.
+ */
+ umem->writable = !!(access &
+ (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_ATOMIC | IB_ACCESS_MW_BIND));
+
+ if (access & IB_ACCESS_ON_DEMAND) {
+ ret = ib_umem_odp_get(context, umem);
+ if (ret) {
+ kfree(umem);
+ return ERR_PTR(ret);
+ }
+ return umem;
+ }
+
+ umem->odp_data = NULL;
+
+ /* We assume the memory is from hugetlb until proved otherwise */
+ umem->hugetlb = 1;
+
+ page_list = (struct page **) __get_free_page(GFP_KERNEL);
+ if (!page_list) {
+ kfree(umem);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /*
+ * if we can't alloc the vma_list, it's not so bad;
+ * just assume the memory is not hugetlb memory
+ */
+ vma_list = (struct vm_area_struct **) __get_free_page(GFP_KERNEL);
+ if (!vma_list)
+ umem->hugetlb = 0;
+
+ npages = ib_umem_num_pages(umem);
+
+ down_write(&current->mm->mmap_sem);
+
+ locked = npages + current->mm->pinned_vm;
+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+ if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ cur_base = addr & PAGE_MASK;
+
+ if (npages == 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = sg_alloc_table(&umem->sg_head, npages, GFP_KERNEL);
+ if (ret)
+ goto out;
+
+ need_release = 1;
+ sg_list_start = umem->sg_head.sgl;
+
+ while (npages) {
+ ret = get_user_pages(current, current->mm, cur_base,
+ min_t(unsigned long, npages,
+ PAGE_SIZE / sizeof (struct page *)),
+ 1, !umem->writable, page_list, vma_list);
+
+ if (ret < 0)
+ goto out;
+
+ umem->npages += ret;
+ cur_base += ret * PAGE_SIZE;
+ npages -= ret;
+
+ for_each_sg(sg_list_start, sg, ret, i) {
+ if (vma_list && !is_vm_hugetlb_page(vma_list[i]))
+ umem->hugetlb = 0;
+
+ sg_set_page(sg, page_list[i], PAGE_SIZE, 0);
+ }
+
+ /* preparing for next loop */
+ sg_list_start = sg;
+ }
+
+ umem->nmap = ib_dma_map_sg_attrs(context->device,
+ umem->sg_head.sgl,
+ umem->npages,
+ DMA_BIDIRECTIONAL,
+ &attrs);
+
+ if (umem->nmap <= 0) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = 0;
+
+out:
+ if (ret < 0) {
+ if (need_release)
+ __ib_umem_release(context->device, umem, 0);
+ put_pid(umem->pid);
+ kfree(umem);
+ } else
+ current->mm->pinned_vm = locked;
+
+ up_write(&current->mm->mmap_sem);
+ if (vma_list)
+ free_page((unsigned long) vma_list);
+ free_page((unsigned long) page_list);
+
+ return ret < 0 ? ERR_PTR(ret) : umem;
+}
+EXPORT_SYMBOL(ib_umem_get);
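+
+/*
+ * Illustrative usage sketch (not part of this file): a verbs provider's
+ * memory-registration path would typically pin the region with
+ * ib_umem_get() and drop it again with ib_umem_release(). "ucontext",
+ * "start", "length" and "access_flags" are hypothetical caller variables.
+ *
+ *	umem = ib_umem_get(ucontext, start, length, access_flags, 0);
+ *	if (IS_ERR(umem))
+ *		return PTR_ERR(umem);
+ *	...
+ *	ib_umem_release(umem);
+ */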
+
+static void ib_umem_account(struct work_struct *work)
+{
+ struct ib_umem *umem = container_of(work, struct ib_umem, work);
+
+ down_write(&umem->mm->mmap_sem);
+ umem->mm->pinned_vm -= umem->diff;
+ up_write(&umem->mm->mmap_sem);
+ mmput(umem->mm);
+ kfree(umem);
+}
+
+/**
+ * ib_umem_release - release memory pinned with ib_umem_get
+ * @umem: umem struct to release
+ */
+void ib_umem_release(struct ib_umem *umem)
+{
+ struct ib_ucontext *context = umem->context;
+ struct mm_struct *mm;
+ struct task_struct *task;
+ unsigned long diff;
+
+ if (umem->odp_data) {
+ ib_umem_odp_release(umem);
+ return;
+ }
+
+ __ib_umem_release(umem->context->device, umem, 1);
+
+ task = get_pid_task(umem->pid, PIDTYPE_PID);
+ put_pid(umem->pid);
+ if (!task)
+ goto out;
+ mm = get_task_mm(task);
+ put_task_struct(task);
+ if (!mm)
+ goto out;
+
+ diff = ib_umem_num_pages(umem);
+
+ /*
+ * We may be called with the mm's mmap_sem already held. This
+ * can happen when a userspace munmap() is the call that drops
+ * the last reference to our file and calls our release
+ * method. If there are memory regions to destroy, we'll end
+ * up here and not be able to take the mmap_sem. In that case
+	 * we defer the pinned_vm accounting to the system workqueue.
+ */
+ if (context->closing) {
+ if (!down_write_trylock(&mm->mmap_sem)) {
+ INIT_WORK(&umem->work, ib_umem_account);
+ umem->mm = mm;
+ umem->diff = diff;
+
+ queue_work(ib_wq, &umem->work);
+ return;
+ }
+ } else
+ down_write(&mm->mmap_sem);
+
+ mm->pinned_vm -= diff;
+ up_write(&mm->mmap_sem);
+ mmput(mm);
+out:
+ kfree(umem);
+}
+EXPORT_SYMBOL(ib_umem_release);
+
+int ib_umem_page_count(struct ib_umem *umem)
+{
+ int shift;
+ int i;
+ int n;
+ struct scatterlist *sg;
+
+ if (umem->odp_data)
+ return ib_umem_num_pages(umem);
+
+ shift = ilog2(umem->page_size);
+
+ n = 0;
+ for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i)
+ n += sg_dma_len(sg) >> shift;
+
+ return n;
+}
+EXPORT_SYMBOL(ib_umem_page_count);
+
+/*
+ * Copy from the given ib_umem's pages to the given buffer.
+ *
+ * dst - destination buffer
+ * umem - the umem to copy from
+ * offset - offset in the umem to start copying from
+ * length - number of bytes to copy
+ *
+ * Returns 0 on success, or an error code.
+ */
+int ib_umem_copy_from(void *dst, struct ib_umem *umem, size_t offset,
+ size_t length)
+{
+ size_t end = offset + length;
+ int ret;
+
+ if (offset > umem->length || length > umem->length - offset) {
+		pr_err("ib_umem_copy_from not in range. offset: %zu umem length: %zu end: %zu\n",
+ offset, umem->length, end);
+ return -EINVAL;
+ }
+
+ ret = sg_pcopy_to_buffer(umem->sg_head.sgl, umem->nmap, dst, length,
+ offset + ib_umem_offset(umem));
+
+ if (ret < 0)
+ return ret;
+ else if (ret != length)
+ return -EINVAL;
+ else
+ return 0;
+}
+EXPORT_SYMBOL(ib_umem_copy_from);
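+
+/*
+ * Illustrative usage sketch (not part of this file): copying the first
+ * bytes of a pinned region into a kernel buffer. "umem" is a hypothetical
+ * caller variable.
+ *
+ *	char buf[64];
+ *	int err = ib_umem_copy_from(buf, umem, 0, sizeof(buf));
+ *	if (err)
+ *		return err;
+ */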
diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c
new file mode 100644
index 000000000..40becdb31
--- /dev/null
+++ b/drivers/infiniband/core/umem_odp.c
@@ -0,0 +1,669 @@
+/*
+ * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/pid.h>
+#include <linux/slab.h>
+#include <linux/export.h>
+#include <linux/vmalloc.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_umem_odp.h>
+
+static void ib_umem_notifier_start_account(struct ib_umem *item)
+{
+ mutex_lock(&item->odp_data->umem_mutex);
+
+ /* Only update private counters for this umem if it has them.
+ * Otherwise skip it. All page faults will be delayed for this umem. */
+ if (item->odp_data->mn_counters_active) {
+ int notifiers_count = item->odp_data->notifiers_count++;
+
+ if (notifiers_count == 0)
+ /* Initialize the completion object for waiting on
+ * notifiers. Since notifier_count is zero, no one
+ * should be waiting right now. */
+ reinit_completion(&item->odp_data->notifier_completion);
+ }
+ mutex_unlock(&item->odp_data->umem_mutex);
+}
+
+static void ib_umem_notifier_end_account(struct ib_umem *item)
+{
+ mutex_lock(&item->odp_data->umem_mutex);
+
+ /* Only update private counters for this umem if it has them.
+ * Otherwise skip it. All page faults will be delayed for this umem. */
+ if (item->odp_data->mn_counters_active) {
+ /*
+		 * This sequence increase notifies the QP page fault handler
+		 * that the page about to be mapped in the spte could have
+		 * been freed.
+ */
+ ++item->odp_data->notifiers_seq;
+ if (--item->odp_data->notifiers_count == 0)
+ complete_all(&item->odp_data->notifier_completion);
+ }
+ mutex_unlock(&item->odp_data->umem_mutex);
+}
+
+/* Account for a new mmu notifier in an ib_ucontext. */
+static void ib_ucontext_notifier_start_account(struct ib_ucontext *context)
+{
+ atomic_inc(&context->notifier_count);
+}
+
+/* Account for a terminating mmu notifier in an ib_ucontext.
+ *
+ * Must be called with the ib_ucontext->umem_rwsem semaphore unlocked, since
+ * the function takes the semaphore itself. */
+static void ib_ucontext_notifier_end_account(struct ib_ucontext *context)
+{
+ int zero_notifiers = atomic_dec_and_test(&context->notifier_count);
+
+ if (zero_notifiers &&
+ !list_empty(&context->no_private_counters)) {
+ /* No currently running mmu notifiers. Now is the chance to
+ * add private accounting to all previously added umems. */
+ struct ib_umem_odp *odp_data, *next;
+
+ /* Prevent concurrent mmu notifiers from working on the
+ * no_private_counters list. */
+ down_write(&context->umem_rwsem);
+
+ /* Read the notifier_count again, with the umem_rwsem
+ * semaphore taken for write. */
+ if (!atomic_read(&context->notifier_count)) {
+ list_for_each_entry_safe(odp_data, next,
+ &context->no_private_counters,
+ no_private_counters) {
+ mutex_lock(&odp_data->umem_mutex);
+ odp_data->mn_counters_active = true;
+ list_del(&odp_data->no_private_counters);
+ complete_all(&odp_data->notifier_completion);
+ mutex_unlock(&odp_data->umem_mutex);
+ }
+ }
+
+ up_write(&context->umem_rwsem);
+ }
+}
+
+static int ib_umem_notifier_release_trampoline(struct ib_umem *item, u64 start,
+ u64 end, void *cookie) {
+ /*
+ * Increase the number of notifiers running, to
+ * prevent any further fault handling on this MR.
+ */
+ ib_umem_notifier_start_account(item);
+ item->odp_data->dying = 1;
+	/* Make sure that the fact that the umem is dying is visible before
+	 * we release all pending page faults. */
+ smp_wmb();
+ complete_all(&item->odp_data->notifier_completion);
+ item->context->invalidate_range(item, ib_umem_start(item),
+ ib_umem_end(item));
+ return 0;
+}
+
+static void ib_umem_notifier_release(struct mmu_notifier *mn,
+ struct mm_struct *mm)
+{
+ struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
+
+ if (!context->invalidate_range)
+ return;
+
+ ib_ucontext_notifier_start_account(context);
+ down_read(&context->umem_rwsem);
+ rbt_ib_umem_for_each_in_range(&context->umem_tree, 0,
+ ULLONG_MAX,
+ ib_umem_notifier_release_trampoline,
+ NULL);
+ up_read(&context->umem_rwsem);
+}
+
+static int invalidate_page_trampoline(struct ib_umem *item, u64 start,
+ u64 end, void *cookie)
+{
+ ib_umem_notifier_start_account(item);
+ item->context->invalidate_range(item, start, start + PAGE_SIZE);
+ ib_umem_notifier_end_account(item);
+ return 0;
+}
+
+static void ib_umem_notifier_invalidate_page(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long address)
+{
+ struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
+
+ if (!context->invalidate_range)
+ return;
+
+ ib_ucontext_notifier_start_account(context);
+ down_read(&context->umem_rwsem);
+ rbt_ib_umem_for_each_in_range(&context->umem_tree, address,
+ address + PAGE_SIZE,
+ invalidate_page_trampoline, NULL);
+ up_read(&context->umem_rwsem);
+ ib_ucontext_notifier_end_account(context);
+}
+
+static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start,
+ u64 end, void *cookie)
+{
+ ib_umem_notifier_start_account(item);
+ item->context->invalidate_range(item, start, end);
+ return 0;
+}
+
+static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
+
+ if (!context->invalidate_range)
+ return;
+
+ ib_ucontext_notifier_start_account(context);
+ down_read(&context->umem_rwsem);
+ rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
+ end,
+ invalidate_range_start_trampoline, NULL);
+ up_read(&context->umem_rwsem);
+}
+
+static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start,
+ u64 end, void *cookie)
+{
+ ib_umem_notifier_end_account(item);
+ return 0;
+}
+
+static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long end)
+{
+ struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn);
+
+ if (!context->invalidate_range)
+ return;
+
+ down_read(&context->umem_rwsem);
+ rbt_ib_umem_for_each_in_range(&context->umem_tree, start,
+ end,
+ invalidate_range_end_trampoline, NULL);
+ up_read(&context->umem_rwsem);
+ ib_ucontext_notifier_end_account(context);
+}
+
+static struct mmu_notifier_ops ib_umem_notifiers = {
+ .release = ib_umem_notifier_release,
+ .invalidate_page = ib_umem_notifier_invalidate_page,
+ .invalidate_range_start = ib_umem_notifier_invalidate_range_start,
+ .invalidate_range_end = ib_umem_notifier_invalidate_range_end,
+};
+
+int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem)
+{
+ int ret_val;
+ struct pid *our_pid;
+ struct mm_struct *mm = get_task_mm(current);
+
+ if (!mm)
+ return -EINVAL;
+
+ /* Prevent creating ODP MRs in child processes */
+ rcu_read_lock();
+ our_pid = get_task_pid(current->group_leader, PIDTYPE_PID);
+ rcu_read_unlock();
+ put_pid(our_pid);
+ if (context->tgid != our_pid) {
+ ret_val = -EINVAL;
+ goto out_mm;
+ }
+
+ umem->hugetlb = 0;
+ umem->odp_data = kzalloc(sizeof(*umem->odp_data), GFP_KERNEL);
+ if (!umem->odp_data) {
+ ret_val = -ENOMEM;
+ goto out_mm;
+ }
+ umem->odp_data->umem = umem;
+
+ mutex_init(&umem->odp_data->umem_mutex);
+
+ init_completion(&umem->odp_data->notifier_completion);
+
+ umem->odp_data->page_list = vzalloc(ib_umem_num_pages(umem) *
+ sizeof(*umem->odp_data->page_list));
+ if (!umem->odp_data->page_list) {
+ ret_val = -ENOMEM;
+ goto out_odp_data;
+ }
+
+ umem->odp_data->dma_list = vzalloc(ib_umem_num_pages(umem) *
+ sizeof(*umem->odp_data->dma_list));
+ if (!umem->odp_data->dma_list) {
+ ret_val = -ENOMEM;
+ goto out_page_list;
+ }
+
+ /*
+ * When using MMU notifiers, we will get a
+ * notification before the "current" task (and MM) is
+ * destroyed. We use the umem_rwsem semaphore to synchronize.
+ */
+ down_write(&context->umem_rwsem);
+ context->odp_mrs_count++;
+ if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
+ rbt_ib_umem_insert(&umem->odp_data->interval_tree,
+ &context->umem_tree);
+ if (likely(!atomic_read(&context->notifier_count)) ||
+ context->odp_mrs_count == 1)
+ umem->odp_data->mn_counters_active = true;
+ else
+ list_add(&umem->odp_data->no_private_counters,
+ &context->no_private_counters);
+ downgrade_write(&context->umem_rwsem);
+
+ if (context->odp_mrs_count == 1) {
+ /*
+ * Note that at this point, no MMU notifier is running
+ * for this context!
+ */
+ atomic_set(&context->notifier_count, 0);
+ INIT_HLIST_NODE(&context->mn.hlist);
+ context->mn.ops = &ib_umem_notifiers;
+ /*
+		 * Lockdep reports a false positive for mmap_sem vs.
+		 * umem_rwsem because it does not model downgrade_write()
+		 * correctly.
+ */
+ lockdep_off();
+ ret_val = mmu_notifier_register(&context->mn, mm);
+ lockdep_on();
+ if (ret_val) {
+ pr_err("Failed to register mmu_notifier %d\n", ret_val);
+ ret_val = -EBUSY;
+ goto out_mutex;
+ }
+ }
+
+ up_read(&context->umem_rwsem);
+
+ /*
+ * Note that doing an mmput can cause a notifier for the relevant mm.
+ * If the notifier is called while we hold the umem_rwsem, this will
+	 * cause a deadlock. Therefore, we release the reference only after
+	 * we have released the semaphore.
+ */
+ mmput(mm);
+ return 0;
+
+out_mutex:
+ up_read(&context->umem_rwsem);
+ vfree(umem->odp_data->dma_list);
+out_page_list:
+ vfree(umem->odp_data->page_list);
+out_odp_data:
+ kfree(umem->odp_data);
+out_mm:
+ mmput(mm);
+ return ret_val;
+}
+
+void ib_umem_odp_release(struct ib_umem *umem)
+{
+ struct ib_ucontext *context = umem->context;
+
+ /*
+ * Ensure that no more pages are mapped in the umem.
+ *
+ * It is the driver's responsibility to ensure, before calling us,
+ * that the hardware will not attempt to access the MR any more.
+ */
+ ib_umem_odp_unmap_dma_pages(umem, ib_umem_start(umem),
+ ib_umem_end(umem));
+
+ down_write(&context->umem_rwsem);
+ if (likely(ib_umem_start(umem) != ib_umem_end(umem)))
+ rbt_ib_umem_remove(&umem->odp_data->interval_tree,
+ &context->umem_tree);
+ context->odp_mrs_count--;
+ if (!umem->odp_data->mn_counters_active) {
+ list_del(&umem->odp_data->no_private_counters);
+ complete_all(&umem->odp_data->notifier_completion);
+ }
+
+ /*
+	 * Downgrade the lock to a read lock. This ensures that the notifiers
+	 * (which take the semaphore for reading) will be able to finish, and
+	 * we will be able to eventually obtain the mmu notifiers' SRCU. Note
+ * that since we are doing it atomically, no other user could register
+ * and unregister while we do the check.
+ */
+ downgrade_write(&context->umem_rwsem);
+ if (!context->odp_mrs_count) {
+ struct task_struct *owning_process = NULL;
+ struct mm_struct *owning_mm = NULL;
+
+ owning_process = get_pid_task(context->tgid,
+ PIDTYPE_PID);
+ if (owning_process == NULL)
+ /*
+			 * The process is already dead; the notifiers were
+			 * already removed.
+ */
+ goto out;
+
+ owning_mm = get_task_mm(owning_process);
+ if (owning_mm == NULL)
+ /*
+			 * The process's mm is already dead; the notifiers
+			 * were already removed.
+ */
+ goto out_put_task;
+ mmu_notifier_unregister(&context->mn, owning_mm);
+
+ mmput(owning_mm);
+
+out_put_task:
+ put_task_struct(owning_process);
+ }
+out:
+ up_read(&context->umem_rwsem);
+
+ vfree(umem->odp_data->dma_list);
+ vfree(umem->odp_data->page_list);
+ kfree(umem->odp_data);
+ kfree(umem);
+}
+
+/*
+ * Map for DMA and insert a single page into the on-demand paging page tables.
+ *
+ * @umem: the umem to insert the page to.
+ * @page_index: index in the umem to add the page to.
+ * @page: the page struct to map and add.
+ * @access_mask: access permissions needed for this page.
+ * @current_seq: sequence number for synchronization with invalidations.
+ * the sequence number is taken from
+ * umem->odp_data->notifiers_seq.
+ *
+ * The function returns -EFAULT if the DMA mapping operation fails. It returns
+ * -EAGAIN if a concurrent invalidation prevents us from updating the page.
+ *
+ * The page is released via put_page even if the operation failed. For
+ * on-demand pinning, the page is released whenever it isn't stored in the
+ * umem.
+ */
+static int ib_umem_odp_map_dma_single_page(
+ struct ib_umem *umem,
+ int page_index,
+ u64 base_virt_addr,
+ struct page *page,
+ u64 access_mask,
+ unsigned long current_seq)
+{
+ struct ib_device *dev = umem->context->device;
+ dma_addr_t dma_addr;
+ int stored_page = 0;
+ int remove_existing_mapping = 0;
+ int ret = 0;
+
+ /*
+	 * Note: we avoid writing if seq is different from the initial seq, to
+	 * handle the case of a racing notifier. This check also allows us to
+	 * bail early if we have a notifier running in parallel with us.
+ */
+ if (ib_umem_mmu_notifier_retry(umem, current_seq)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ if (!(umem->odp_data->dma_list[page_index])) {
+ dma_addr = ib_dma_map_page(dev,
+ page,
+ 0, PAGE_SIZE,
+ DMA_BIDIRECTIONAL);
+ if (ib_dma_mapping_error(dev, dma_addr)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ umem->odp_data->dma_list[page_index] = dma_addr | access_mask;
+ umem->odp_data->page_list[page_index] = page;
+ stored_page = 1;
+ } else if (umem->odp_data->page_list[page_index] == page) {
+ umem->odp_data->dma_list[page_index] |= access_mask;
+ } else {
+ pr_err("error: got different pages in IB device and from get_user_pages. IB device page: %p, gup page: %p\n",
+ umem->odp_data->page_list[page_index], page);
+ /* Better remove the mapping now, to prevent any further
+ * damage. */
+ remove_existing_mapping = 1;
+ }
+
+out:
+ /* On Demand Paging - avoid pinning the page */
+ if (umem->context->invalidate_range || !stored_page)
+ put_page(page);
+
+ if (remove_existing_mapping && umem->context->invalidate_range) {
+ invalidate_page_trampoline(
+ umem,
+ base_virt_addr + (page_index * PAGE_SIZE),
+ base_virt_addr + ((page_index+1)*PAGE_SIZE),
+ NULL);
+ ret = -EAGAIN;
+ }
+
+ return ret;
+}
+
+/**
+ * ib_umem_odp_map_dma_pages - Pin and DMA map userspace memory in an ODP MR.
+ *
+ * Pins the range of pages passed in the argument, and maps them to
+ * DMA addresses. The DMA addresses of the mapped pages are updated in
+ * umem->odp_data->dma_list.
+ *
+ * Returns the number of pages mapped on success, or a negative error
+ * code on failure.
+ * An -EAGAIN error code is returned when a concurrent mmu notifier prevents
+ * the function from completing its task.
+ *
+ * @umem: the umem to map and pin
+ * @user_virt: the address from which we need to map.
+ * @bcnt: the minimal number of bytes to pin and map. The mapping might be
+ * bigger due to alignment, and may also be smaller in case of an error
+ *        pinning or mapping a page. The actual number of pages mapped is
+ *        returned in the return value.
+ * @access_mask: bit mask of the requested access permissions for the given
+ * range.
+ * @current_seq: the MMU notifiers sequence value for synchronization with
+ *               invalidations. The sequence number is read from
+ *               umem->odp_data->notifiers_seq before calling this function.
+ */
+int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt,
+ u64 access_mask, unsigned long current_seq)
+{
+ struct task_struct *owning_process = NULL;
+ struct mm_struct *owning_mm = NULL;
+ struct page **local_page_list = NULL;
+ u64 off;
+ int j, k, ret = 0, start_idx, npages = 0;
+ u64 base_virt_addr;
+
+ if (access_mask == 0)
+ return -EINVAL;
+
+ if (user_virt < ib_umem_start(umem) ||
+ user_virt + bcnt > ib_umem_end(umem))
+ return -EFAULT;
+
+ local_page_list = (struct page **)__get_free_page(GFP_KERNEL);
+ if (!local_page_list)
+ return -ENOMEM;
+
+ off = user_virt & (~PAGE_MASK);
+ user_virt = user_virt & PAGE_MASK;
+ base_virt_addr = user_virt;
+ bcnt += off; /* Charge for the first page offset as well. */
+
+ owning_process = get_pid_task(umem->context->tgid, PIDTYPE_PID);
+ if (owning_process == NULL) {
+ ret = -EINVAL;
+ goto out_no_task;
+ }
+
+ owning_mm = get_task_mm(owning_process);
+ if (owning_mm == NULL) {
+ ret = -EINVAL;
+ goto out_put_task;
+ }
+
+ start_idx = (user_virt - ib_umem_start(umem)) >> PAGE_SHIFT;
+ k = start_idx;
+
+ while (bcnt > 0) {
+ const size_t gup_num_pages =
+ min_t(size_t, ALIGN(bcnt, PAGE_SIZE) / PAGE_SIZE,
+ PAGE_SIZE / sizeof(struct page *));
+
+ down_read(&owning_mm->mmap_sem);
+ /*
+		 * Note: this might result in redundant page getting. We can
+		 * avoid this by checking that dma_list is 0 before calling
+		 * get_user_pages. However, this makes the code much more
+ * complex (and doesn't gain us much performance in most use
+ * cases).
+ */
+ npages = get_user_pages(owning_process, owning_mm, user_virt,
+ gup_num_pages,
+ access_mask & ODP_WRITE_ALLOWED_BIT, 0,
+ local_page_list, NULL);
+ up_read(&owning_mm->mmap_sem);
+
+ if (npages < 0)
+ break;
+
+ bcnt -= min_t(size_t, npages << PAGE_SHIFT, bcnt);
+ user_virt += npages << PAGE_SHIFT;
+ mutex_lock(&umem->odp_data->umem_mutex);
+ for (j = 0; j < npages; ++j) {
+ ret = ib_umem_odp_map_dma_single_page(
+ umem, k, base_virt_addr, local_page_list[j],
+ access_mask, current_seq);
+ if (ret < 0)
+ break;
+ k++;
+ }
+ mutex_unlock(&umem->odp_data->umem_mutex);
+
+ if (ret < 0) {
+ /* Release left over pages when handling errors. */
+ for (++j; j < npages; ++j)
+ put_page(local_page_list[j]);
+ break;
+ }
+ }
+
+ if (ret >= 0) {
+ if (npages < 0 && k == start_idx)
+ ret = npages;
+ else
+ ret = k - start_idx;
+ }
+
+ mmput(owning_mm);
+out_put_task:
+ put_task_struct(owning_process);
+out_no_task:
+ free_page((unsigned long)local_page_list);
+ return ret;
+}
+EXPORT_SYMBOL(ib_umem_odp_map_dma_pages);
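+
+/*
+ * Illustrative usage sketch (not part of this file): a provider's ODP
+ * page-fault path would typically sample the notifier sequence number,
+ * map the faulting range, and retry when a concurrent invalidation is
+ * reported via -EAGAIN. "umem", "va", "len" and "access_mask" are
+ * hypothetical caller variables.
+ *
+ *	unsigned long seq = umem->odp_data->notifiers_seq;
+ *	int npages = ib_umem_odp_map_dma_pages(umem, va, len,
+ *						access_mask, seq);
+ *	if (npages == -EAGAIN)
+ *		goto retry;
+ */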
+
+void ib_umem_odp_unmap_dma_pages(struct ib_umem *umem, u64 virt,
+ u64 bound)
+{
+ int idx;
+ u64 addr;
+ struct ib_device *dev = umem->context->device;
+
+ virt = max_t(u64, virt, ib_umem_start(umem));
+ bound = min_t(u64, bound, ib_umem_end(umem));
+ /* Note that during the run of this function, the
+	 * notifiers_count of the MR is > 0, preventing any racing
+	 * faults from completing. We might be racing with other
+ * invalidations, so we must make sure we free each page only
+ * once. */
+ mutex_lock(&umem->odp_data->umem_mutex);
+ for (addr = virt; addr < bound; addr += (u64)umem->page_size) {
+ idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
+ if (umem->odp_data->page_list[idx]) {
+ struct page *page = umem->odp_data->page_list[idx];
+ dma_addr_t dma = umem->odp_data->dma_list[idx];
+ dma_addr_t dma_addr = dma & ODP_DMA_ADDR_MASK;
+
+ WARN_ON(!dma_addr);
+
+ ib_dma_unmap_page(dev, dma_addr, PAGE_SIZE,
+ DMA_BIDIRECTIONAL);
+ if (dma & ODP_WRITE_ALLOWED_BIT) {
+ struct page *head_page = compound_head(page);
+ /*
+ * set_page_dirty prefers being called with
+ * the page lock. However, MMU notifiers are
+ * called sometimes with and sometimes without
+ * the lock. We rely on the umem_mutex instead
+ * to prevent other mmu notifiers from
+ * continuing and allowing the page mapping to
+ * be removed.
+ */
+ set_page_dirty(head_page);
+ }
+ /* on demand pinning support */
+ if (!umem->context->invalidate_range)
+ put_page(page);
+ umem->odp_data->page_list[idx] = NULL;
+ umem->odp_data->dma_list[idx] = 0;
+ }
+ }
+ mutex_unlock(&umem->odp_data->umem_mutex);
+}
+EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages);
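+
+/*
+ * Illustrative sketch (not part of this file): a provider that sets
+ * ib_ucontext->invalidate_range would typically tear down the affected
+ * mappings from that callback, e.g. in its ->invalidate_range(umem,
+ * start, end) hook:
+ *
+ *	ib_umem_odp_unmap_dma_pages(umem, start, end);
+ */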
diff --git a/drivers/infiniband/core/umem_rbtree.c b/drivers/infiniband/core/umem_rbtree.c
new file mode 100644
index 000000000..727d78844
--- /dev/null
+++ b/drivers/infiniband/core/umem_rbtree.c
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2014 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/interval_tree_generic.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <rdma/ib_umem_odp.h>
+
+/*
+ * The ib_umem list keeps track of memory regions for which the HW
+ * device requests to receive a notification when the related memory
+ * mapping is changed.
+ *
+ * ib_umem_lock protects the list.
+ */
+
+static inline u64 node_start(struct umem_odp_node *n)
+{
+ struct ib_umem_odp *umem_odp =
+ container_of(n, struct ib_umem_odp, interval_tree);
+
+ return ib_umem_start(umem_odp->umem);
+}
+
+/* Note that the representation of the intervals in the interval tree
+ * considers the ending point as contained in the interval, while the
+ * function ib_umem_end returns the first address which is not contained
+ * in the umem.
+ */
+static inline u64 node_last(struct umem_odp_node *n)
+{
+ struct ib_umem_odp *umem_odp =
+ container_of(n, struct ib_umem_odp, interval_tree);
+
+ return ib_umem_end(umem_odp->umem) - 1;
+}
+
+INTERVAL_TREE_DEFINE(struct umem_odp_node, rb, u64, __subtree_last,
+ node_start, node_last, , rbt_ib_umem)
+
+/* @last is not a part of the interval. See comment for function
+ * node_last.
+ */
+int rbt_ib_umem_for_each_in_range(struct rb_root *root,
+ u64 start, u64 last,
+ umem_call_back cb,
+ void *cookie)
+{
+ int ret_val = 0;
+ struct umem_odp_node *node;
+ struct ib_umem_odp *umem;
+
+ if (unlikely(start == last))
+ return ret_val;
+
+ for (node = rbt_ib_umem_iter_first(root, start, last - 1); node;
+ node = rbt_ib_umem_iter_next(node, start, last - 1)) {
+ umem = container_of(node, struct ib_umem_odp, interval_tree);
+ ret_val = cb(umem->umem, start, last, cookie) || ret_val;
+ }
+
+ return ret_val;
+}
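+
+/*
+ * Illustrative usage sketch (not part of this file): the mmu notifier
+ * handlers in umem_odp.c walk the tree over a half-open range
+ * [start, end) and invoke a callback for each overlapping umem.
+ * "invalidate_trampoline" is a hypothetical callback.
+ *
+ *	down_read(&context->umem_rwsem);
+ *	rbt_ib_umem_for_each_in_range(&context->umem_tree, start, end,
+ *				      invalidate_trampoline, NULL);
+ *	up_read(&context->umem_rwsem);
+ */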
diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c
new file mode 100644
index 000000000..928cdd20e
--- /dev/null
+++ b/drivers/infiniband/core/user_mad.c
@@ -0,0 +1,1390 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2008 Cisco. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define pr_fmt(fmt) "user_mad: " fmt
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/cdev.h>
+#include <linux/dma-mapping.h>
+#include <linux/poll.h>
+#include <linux/mutex.h>
+#include <linux/kref.h>
+#include <linux/compat.h>
+#include <linux/sched.h>
+#include <linux/semaphore.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_user_mad.h>
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand userspace MAD packet access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+enum {
+ IB_UMAD_MAX_PORTS = 64,
+ IB_UMAD_MAX_AGENTS = 32,
+
+ IB_UMAD_MAJOR = 231,
+ IB_UMAD_MINOR_BASE = 0
+};
+
+/*
+ * Our lifetime rules for these structs are the following: each time a
+ * device special file is opened, we take a reference on the
+ * ib_umad_port's struct ib_umad_device. We drop these
+ * references in the corresponding close().
+ *
+ * In addition to references coming from open character devices, there
+ * is one more reference to each ib_umad_device representing the
+ * module's reference taken when allocating the ib_umad_device in
+ * ib_umad_add_one().
+ *
+ * When destroying an ib_umad_device, we drop the module's reference.
+ */
+
+struct ib_umad_port {
+ struct cdev cdev;
+ struct device *dev;
+
+ struct cdev sm_cdev;
+ struct device *sm_dev;
+ struct semaphore sm_sem;
+
+ struct mutex file_mutex;
+ struct list_head file_list;
+
+ struct ib_device *ib_dev;
+ struct ib_umad_device *umad_dev;
+ int dev_num;
+ u8 port_num;
+};
+
+struct ib_umad_device {
+ int start_port, end_port;
+ struct kobject kobj;
+ struct ib_umad_port port[0];
+};
+
+struct ib_umad_file {
+ struct mutex mutex;
+ struct ib_umad_port *port;
+ struct list_head recv_list;
+ struct list_head send_list;
+ struct list_head port_list;
+ spinlock_t send_lock;
+ wait_queue_head_t recv_wait;
+ struct ib_mad_agent *agent[IB_UMAD_MAX_AGENTS];
+ int agents_dead;
+ u8 use_pkey_index;
+ u8 already_used;
+};
+
+struct ib_umad_packet {
+ struct ib_mad_send_buf *msg;
+ struct ib_mad_recv_wc *recv_wc;
+ struct list_head list;
+ int length;
+ struct ib_user_mad mad;
+};
+
+static struct class *umad_class;
+
+static const dev_t base_dev = MKDEV(IB_UMAD_MAJOR, IB_UMAD_MINOR_BASE);
+
+static DEFINE_SPINLOCK(port_lock);
+static DECLARE_BITMAP(dev_map, IB_UMAD_MAX_PORTS);
+
+static void ib_umad_add_one(struct ib_device *device);
+static void ib_umad_remove_one(struct ib_device *device);
+
+static void ib_umad_release_dev(struct kobject *kobj)
+{
+ struct ib_umad_device *dev =
+ container_of(kobj, struct ib_umad_device, kobj);
+
+ kfree(dev);
+}
+
+static struct kobj_type ib_umad_dev_ktype = {
+ .release = ib_umad_release_dev,
+};
+
+static int hdr_size(struct ib_umad_file *file)
+{
+ return file->use_pkey_index ? sizeof (struct ib_user_mad_hdr) :
+ sizeof (struct ib_user_mad_hdr_old);
+}
+
+/* caller must hold file->mutex */
+static struct ib_mad_agent *__get_agent(struct ib_umad_file *file, int id)
+{
+ return file->agents_dead ? NULL : file->agent[id];
+}
+
+static int queue_packet(struct ib_umad_file *file,
+ struct ib_mad_agent *agent,
+ struct ib_umad_packet *packet)
+{
+ int ret = 1;
+
+ mutex_lock(&file->mutex);
+
+ for (packet->mad.hdr.id = 0;
+ packet->mad.hdr.id < IB_UMAD_MAX_AGENTS;
+ packet->mad.hdr.id++)
+ if (agent == __get_agent(file, packet->mad.hdr.id)) {
+ list_add_tail(&packet->list, &file->recv_list);
+ wake_up_interruptible(&file->recv_wait);
+ ret = 0;
+ break;
+ }
+
+ mutex_unlock(&file->mutex);
+
+ return ret;
+}
+
+static void dequeue_send(struct ib_umad_file *file,
+ struct ib_umad_packet *packet)
+{
+ spin_lock_irq(&file->send_lock);
+ list_del(&packet->list);
+ spin_unlock_irq(&file->send_lock);
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *send_wc)
+{
+ struct ib_umad_file *file = agent->context;
+ struct ib_umad_packet *packet = send_wc->send_buf->context[0];
+
+ dequeue_send(file, packet);
+ ib_destroy_ah(packet->msg->ah);
+ ib_free_send_mad(packet->msg);
+
+ if (send_wc->status == IB_WC_RESP_TIMEOUT_ERR) {
+ packet->length = IB_MGMT_MAD_HDR;
+ packet->mad.hdr.status = ETIMEDOUT;
+ if (!queue_packet(file, agent, packet))
+ return;
+ }
+ kfree(packet);
+}
+
+static void recv_handler(struct ib_mad_agent *agent,
+ struct ib_mad_recv_wc *mad_recv_wc)
+{
+ struct ib_umad_file *file = agent->context;
+ struct ib_umad_packet *packet;
+
+ if (mad_recv_wc->wc->status != IB_WC_SUCCESS)
+ goto err1;
+
+ packet = kzalloc(sizeof *packet, GFP_KERNEL);
+ if (!packet)
+ goto err1;
+
+ packet->length = mad_recv_wc->mad_len;
+ packet->recv_wc = mad_recv_wc;
+
+ packet->mad.hdr.status = 0;
+ packet->mad.hdr.length = hdr_size(file) + mad_recv_wc->mad_len;
+ packet->mad.hdr.qpn = cpu_to_be32(mad_recv_wc->wc->src_qp);
+ packet->mad.hdr.lid = cpu_to_be16(mad_recv_wc->wc->slid);
+ packet->mad.hdr.sl = mad_recv_wc->wc->sl;
+ packet->mad.hdr.path_bits = mad_recv_wc->wc->dlid_path_bits;
+ packet->mad.hdr.pkey_index = mad_recv_wc->wc->pkey_index;
+ packet->mad.hdr.grh_present = !!(mad_recv_wc->wc->wc_flags & IB_WC_GRH);
+ if (packet->mad.hdr.grh_present) {
+ struct ib_ah_attr ah_attr;
+
+ ib_init_ah_from_wc(agent->device, agent->port_num,
+ mad_recv_wc->wc, mad_recv_wc->recv_buf.grh,
+ &ah_attr);
+
+ packet->mad.hdr.gid_index = ah_attr.grh.sgid_index;
+ packet->mad.hdr.hop_limit = ah_attr.grh.hop_limit;
+ packet->mad.hdr.traffic_class = ah_attr.grh.traffic_class;
+ memcpy(packet->mad.hdr.gid, &ah_attr.grh.dgid, 16);
+ packet->mad.hdr.flow_label = cpu_to_be32(ah_attr.grh.flow_label);
+ }
+
+ if (queue_packet(file, agent, packet))
+ goto err2;
+ return;
+
+err2:
+ kfree(packet);
+err1:
+ ib_free_recv_mad(mad_recv_wc);
+}
+
+static ssize_t copy_recv_mad(struct ib_umad_file *file, char __user *buf,
+ struct ib_umad_packet *packet, size_t count)
+{
+ struct ib_mad_recv_buf *recv_buf;
+ int left, seg_payload, offset, max_seg_payload;
+
+ /* We need enough room to copy the first (or only) MAD segment. */
+ recv_buf = &packet->recv_wc->recv_buf;
+ if ((packet->length <= sizeof (*recv_buf->mad) &&
+ count < hdr_size(file) + packet->length) ||
+ (packet->length > sizeof (*recv_buf->mad) &&
+ count < hdr_size(file) + sizeof (*recv_buf->mad)))
+ return -EINVAL;
+
+ if (copy_to_user(buf, &packet->mad, hdr_size(file)))
+ return -EFAULT;
+
+ buf += hdr_size(file);
+ seg_payload = min_t(int, packet->length, sizeof (*recv_buf->mad));
+ if (copy_to_user(buf, recv_buf->mad, seg_payload))
+ return -EFAULT;
+
+ if (seg_payload < packet->length) {
+ /*
+ * Multipacket RMPP MAD message. Copy remainder of message.
+		 * Note that the last segment may have a shorter payload.
+ */
+ if (count < hdr_size(file) + packet->length) {
+ /*
+ * The buffer is too small, return the first RMPP segment,
+ * which includes the RMPP message length.
+ */
+ return -ENOSPC;
+ }
+ offset = ib_get_mad_data_offset(recv_buf->mad->mad_hdr.mgmt_class);
+ max_seg_payload = sizeof (struct ib_mad) - offset;
+
+ for (left = packet->length - seg_payload, buf += seg_payload;
+ left; left -= seg_payload, buf += seg_payload) {
+ recv_buf = container_of(recv_buf->list.next,
+ struct ib_mad_recv_buf, list);
+ seg_payload = min(left, max_seg_payload);
+ if (copy_to_user(buf, ((void *) recv_buf->mad) + offset,
+ seg_payload))
+ return -EFAULT;
+ }
+ }
+ return hdr_size(file) + packet->length;
+}
+
+static ssize_t copy_send_mad(struct ib_umad_file *file, char __user *buf,
+ struct ib_umad_packet *packet, size_t count)
+{
+ ssize_t size = hdr_size(file) + packet->length;
+
+ if (count < size)
+ return -EINVAL;
+
+ if (copy_to_user(buf, &packet->mad, hdr_size(file)))
+ return -EFAULT;
+
+ buf += hdr_size(file);
+
+ if (copy_to_user(buf, packet->mad.data, packet->length))
+ return -EFAULT;
+
+ return size;
+}
+
+static ssize_t ib_umad_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct ib_umad_file *file = filp->private_data;
+ struct ib_umad_packet *packet;
+ ssize_t ret;
+
+ if (count < hdr_size(file))
+ return -EINVAL;
+
+ mutex_lock(&file->mutex);
+
+ while (list_empty(&file->recv_list)) {
+ mutex_unlock(&file->mutex);
+
+ if (filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ if (wait_event_interruptible(file->recv_wait,
+ !list_empty(&file->recv_list)))
+ return -ERESTARTSYS;
+
+ mutex_lock(&file->mutex);
+ }
+
+ packet = list_entry(file->recv_list.next, struct ib_umad_packet, list);
+ list_del(&packet->list);
+
+ mutex_unlock(&file->mutex);
+
+ if (packet->recv_wc)
+ ret = copy_recv_mad(file, buf, packet, count);
+ else
+ ret = copy_send_mad(file, buf, packet, count);
+
+ if (ret < 0) {
+ /* Requeue packet */
+ mutex_lock(&file->mutex);
+ list_add(&packet->list, &file->recv_list);
+ mutex_unlock(&file->mutex);
+ } else {
+ if (packet->recv_wc)
+ ib_free_recv_mad(packet->recv_wc);
+ kfree(packet);
+ }
+ return ret;
+}
+
+static int copy_rmpp_mad(struct ib_mad_send_buf *msg, const char __user *buf)
+{
+ int left, seg;
+
+ /* Copy class specific header */
+ if ((msg->hdr_len > IB_MGMT_RMPP_HDR) &&
+ copy_from_user(msg->mad + IB_MGMT_RMPP_HDR, buf + IB_MGMT_RMPP_HDR,
+ msg->hdr_len - IB_MGMT_RMPP_HDR))
+ return -EFAULT;
+
+ /* All headers are in place. Copy data segments. */
+ for (seg = 1, left = msg->data_len, buf += msg->hdr_len; left > 0;
+ seg++, left -= msg->seg_size, buf += msg->seg_size) {
+ if (copy_from_user(ib_get_rmpp_segment(msg, seg), buf,
+ min(left, msg->seg_size)))
+ return -EFAULT;
+ }
+ return 0;
+}
+
+static int same_destination(struct ib_user_mad_hdr *hdr1,
+ struct ib_user_mad_hdr *hdr2)
+{
+ if (!hdr1->grh_present && !hdr2->grh_present)
+ return (hdr1->lid == hdr2->lid);
+
+ if (hdr1->grh_present && hdr2->grh_present)
+ return !memcmp(hdr1->gid, hdr2->gid, 16);
+
+ return 0;
+}
+
+static int is_duplicate(struct ib_umad_file *file,
+ struct ib_umad_packet *packet)
+{
+ struct ib_umad_packet *sent_packet;
+ struct ib_mad_hdr *sent_hdr, *hdr;
+
+ hdr = (struct ib_mad_hdr *) packet->mad.data;
+ list_for_each_entry(sent_packet, &file->send_list, list) {
+ sent_hdr = (struct ib_mad_hdr *) sent_packet->mad.data;
+
+ if ((hdr->tid != sent_hdr->tid) ||
+ (hdr->mgmt_class != sent_hdr->mgmt_class))
+ continue;
+
+ /*
+ * No need to be overly clever here. If two new operations have
+ * the same TID, reject the second as a duplicate. This is more
+ * restrictive than required by the spec.
+ */
+ if (!ib_response_mad((struct ib_mad *) hdr)) {
+ if (!ib_response_mad((struct ib_mad *) sent_hdr))
+ return 1;
+ continue;
+ } else if (!ib_response_mad((struct ib_mad *) sent_hdr))
+ continue;
+
+ if (same_destination(&packet->mad.hdr, &sent_packet->mad.hdr))
+ return 1;
+ }
+
+ return 0;
+}
+
+static ssize_t ib_umad_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct ib_umad_file *file = filp->private_data;
+ struct ib_umad_packet *packet;
+ struct ib_mad_agent *agent;
+ struct ib_ah_attr ah_attr;
+ struct ib_ah *ah;
+ struct ib_rmpp_mad *rmpp_mad;
+ __be64 *tid;
+ int ret, data_len, hdr_len, copy_offset, rmpp_active;
+
+ if (count < hdr_size(file) + IB_MGMT_RMPP_HDR)
+ return -EINVAL;
+
+ packet = kzalloc(sizeof *packet + IB_MGMT_RMPP_HDR, GFP_KERNEL);
+ if (!packet)
+ return -ENOMEM;
+
+ if (copy_from_user(&packet->mad, buf, hdr_size(file))) {
+ ret = -EFAULT;
+ goto err;
+ }
+
+ if (packet->mad.hdr.id >= IB_UMAD_MAX_AGENTS) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ buf += hdr_size(file);
+
+ if (copy_from_user(packet->mad.data, buf, IB_MGMT_RMPP_HDR)) {
+ ret = -EFAULT;
+ goto err;
+ }
+
+ mutex_lock(&file->mutex);
+
+ agent = __get_agent(file, packet->mad.hdr.id);
+ if (!agent) {
+ ret = -EINVAL;
+ goto err_up;
+ }
+
+ memset(&ah_attr, 0, sizeof ah_attr);
+ ah_attr.dlid = be16_to_cpu(packet->mad.hdr.lid);
+ ah_attr.sl = packet->mad.hdr.sl;
+ ah_attr.src_path_bits = packet->mad.hdr.path_bits;
+ ah_attr.port_num = file->port->port_num;
+ if (packet->mad.hdr.grh_present) {
+ ah_attr.ah_flags = IB_AH_GRH;
+ memcpy(ah_attr.grh.dgid.raw, packet->mad.hdr.gid, 16);
+ ah_attr.grh.sgid_index = packet->mad.hdr.gid_index;
+ ah_attr.grh.flow_label = be32_to_cpu(packet->mad.hdr.flow_label);
+ ah_attr.grh.hop_limit = packet->mad.hdr.hop_limit;
+ ah_attr.grh.traffic_class = packet->mad.hdr.traffic_class;
+ }
+
+ ah = ib_create_ah(agent->qp->pd, &ah_attr);
+ if (IS_ERR(ah)) {
+ ret = PTR_ERR(ah);
+ goto err_up;
+ }
+
+ rmpp_mad = (struct ib_rmpp_mad *) packet->mad.data;
+ hdr_len = ib_get_mad_data_offset(rmpp_mad->mad_hdr.mgmt_class);
+
+ if (ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)
+ && ib_mad_kernel_rmpp_agent(agent)) {
+ copy_offset = IB_MGMT_RMPP_HDR;
+ rmpp_active = ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) &
+ IB_MGMT_RMPP_FLAG_ACTIVE;
+ } else {
+ copy_offset = IB_MGMT_MAD_HDR;
+ rmpp_active = 0;
+ }
+
+ data_len = count - hdr_size(file) - hdr_len;
+ packet->msg = ib_create_send_mad(agent,
+ be32_to_cpu(packet->mad.hdr.qpn),
+ packet->mad.hdr.pkey_index, rmpp_active,
+ hdr_len, data_len, GFP_KERNEL);
+ if (IS_ERR(packet->msg)) {
+ ret = PTR_ERR(packet->msg);
+ goto err_ah;
+ }
+
+ packet->msg->ah = ah;
+ packet->msg->timeout_ms = packet->mad.hdr.timeout_ms;
+ packet->msg->retries = packet->mad.hdr.retries;
+ packet->msg->context[0] = packet;
+
+ /* Copy MAD header. Any RMPP header is already in place. */
+ memcpy(packet->msg->mad, packet->mad.data, IB_MGMT_MAD_HDR);
+
+ if (!rmpp_active) {
+ if (copy_from_user(packet->msg->mad + copy_offset,
+ buf + copy_offset,
+ hdr_len + data_len - copy_offset)) {
+ ret = -EFAULT;
+ goto err_msg;
+ }
+ } else {
+ ret = copy_rmpp_mad(packet->msg, buf);
+ if (ret)
+ goto err_msg;
+ }
+
+ /*
+ * Set the high-order part of the transaction ID to make MADs from
+ * different agents unique, and allow routing responses back to the
+ * original requestor.
+ */
+ if (!ib_response_mad(packet->msg->mad)) {
+ tid = &((struct ib_mad_hdr *) packet->msg->mad)->tid;
+ *tid = cpu_to_be64(((u64) agent->hi_tid) << 32 |
+ (be64_to_cpup(tid) & 0xffffffff));
+ rmpp_mad->mad_hdr.tid = *tid;
+ }
+
+ if (!ib_mad_kernel_rmpp_agent(agent)
+ && ib_is_mad_class_rmpp(rmpp_mad->mad_hdr.mgmt_class)
+ && (ib_get_rmpp_flags(&rmpp_mad->rmpp_hdr) & IB_MGMT_RMPP_FLAG_ACTIVE)) {
+ spin_lock_irq(&file->send_lock);
+ list_add_tail(&packet->list, &file->send_list);
+ spin_unlock_irq(&file->send_lock);
+ } else {
+ spin_lock_irq(&file->send_lock);
+ ret = is_duplicate(file, packet);
+ if (!ret)
+ list_add_tail(&packet->list, &file->send_list);
+ spin_unlock_irq(&file->send_lock);
+ if (ret) {
+ ret = -EINVAL;
+ goto err_msg;
+ }
+ }
+
+ ret = ib_post_send_mad(packet->msg, NULL);
+ if (ret)
+ goto err_send;
+
+ mutex_unlock(&file->mutex);
+ return count;
+
+err_send:
+ dequeue_send(file, packet);
+err_msg:
+ ib_free_send_mad(packet->msg);
+err_ah:
+ ib_destroy_ah(ah);
+err_up:
+ mutex_unlock(&file->mutex);
+err:
+ kfree(packet);
+ return ret;
+}
+
+static unsigned int ib_umad_poll(struct file *filp, struct poll_table_struct *wait)
+{
+ struct ib_umad_file *file = filp->private_data;
+
+ /* we will always be able to post a MAD send */
+ unsigned int mask = POLLOUT | POLLWRNORM;
+
+ poll_wait(filp, &file->recv_wait, wait);
+
+ if (!list_empty(&file->recv_list))
+ mask |= POLLIN | POLLRDNORM;
+
+ return mask;
+}
+
+static int ib_umad_reg_agent(struct ib_umad_file *file, void __user *arg,
+ int compat_method_mask)
+{
+ struct ib_user_mad_reg_req ureq;
+ struct ib_mad_reg_req req;
+ struct ib_mad_agent *agent = NULL;
+ int agent_id;
+ int ret;
+
+ mutex_lock(&file->port->file_mutex);
+ mutex_lock(&file->mutex);
+
+ if (!file->port->ib_dev) {
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent: invalid device\n");
+ ret = -EPIPE;
+ goto out;
+ }
+
+ if (copy_from_user(&ureq, arg, sizeof ureq)) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (ureq.qpn != 0 && ureq.qpn != 1) {
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent: invalid QPN %d specified\n",
+ ureq.qpn);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id)
+ if (!__get_agent(file, agent_id))
+ goto found;
+
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent: Max Agents (%u) reached\n",
+ IB_UMAD_MAX_AGENTS);
+ ret = -ENOMEM;
+ goto out;
+
+found:
+ if (ureq.mgmt_class) {
+ memset(&req, 0, sizeof(req));
+ req.mgmt_class = ureq.mgmt_class;
+ req.mgmt_class_version = ureq.mgmt_class_version;
+ memcpy(req.oui, ureq.oui, sizeof req.oui);
+
+ if (compat_method_mask) {
+ u32 *umm = (u32 *) ureq.method_mask;
+ int i;
+
+ for (i = 0; i < BITS_TO_LONGS(IB_MGMT_MAX_METHODS); ++i)
+ req.method_mask[i] =
+ umm[i * 2] | ((u64) umm[i * 2 + 1] << 32);
+ } else
+ memcpy(req.method_mask, ureq.method_mask,
+ sizeof req.method_mask);
+ }
+
+ agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num,
+ ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI,
+ ureq.mgmt_class ? &req : NULL,
+ ureq.rmpp_version,
+ send_handler, recv_handler, file, 0);
+ if (IS_ERR(agent)) {
+ ret = PTR_ERR(agent);
+ agent = NULL;
+ goto out;
+ }
+
+ if (put_user(agent_id,
+ (u32 __user *) (arg + offsetof(struct ib_user_mad_reg_req, id)))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (!file->already_used) {
+ file->already_used = 1;
+ if (!file->use_pkey_index) {
+ dev_warn(file->port->dev,
+ "process %s did not enable P_Key index support.\n",
+ current->comm);
+ dev_warn(file->port->dev,
+ " Documentation/infiniband/user_mad.txt has info on the new ABI.\n");
+ }
+ }
+
+ file->agent[agent_id] = agent;
+ ret = 0;
+
+out:
+ mutex_unlock(&file->mutex);
+
+ if (ret && agent)
+ ib_unregister_mad_agent(agent);
+
+ mutex_unlock(&file->port->file_mutex);
+
+ return ret;
+}
+
+static int ib_umad_reg_agent2(struct ib_umad_file *file, void __user *arg)
+{
+ struct ib_user_mad_reg_req2 ureq;
+ struct ib_mad_reg_req req;
+ struct ib_mad_agent *agent = NULL;
+ int agent_id;
+ int ret;
+
+ mutex_lock(&file->port->file_mutex);
+ mutex_lock(&file->mutex);
+
+ if (!file->port->ib_dev) {
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent2: invalid device\n");
+ ret = -EPIPE;
+ goto out;
+ }
+
+ if (copy_from_user(&ureq, arg, sizeof(ureq))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (ureq.qpn != 0 && ureq.qpn != 1) {
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent2: invalid QPN %d specified\n",
+ ureq.qpn);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (ureq.flags & ~IB_USER_MAD_REG_FLAGS_CAP) {
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent2 failed: invalid registration flags specified 0x%x; supported 0x%x\n",
+ ureq.flags, IB_USER_MAD_REG_FLAGS_CAP);
+ ret = -EINVAL;
+
+ if (put_user((u32)IB_USER_MAD_REG_FLAGS_CAP,
+ (u32 __user *) (arg + offsetof(struct
+ ib_user_mad_reg_req2, flags))))
+ ret = -EFAULT;
+
+ goto out;
+ }
+
+ for (agent_id = 0; agent_id < IB_UMAD_MAX_AGENTS; ++agent_id)
+ if (!__get_agent(file, agent_id))
+ goto found;
+
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent2: Max Agents (%u) reached\n",
+ IB_UMAD_MAX_AGENTS);
+ ret = -ENOMEM;
+ goto out;
+
+found:
+ if (ureq.mgmt_class) {
+ memset(&req, 0, sizeof(req));
+ req.mgmt_class = ureq.mgmt_class;
+ req.mgmt_class_version = ureq.mgmt_class_version;
+ if (ureq.oui & 0xff000000) {
+ dev_notice(file->port->dev,
+ "ib_umad_reg_agent2 failed: oui invalid 0x%08x\n",
+ ureq.oui);
+ ret = -EINVAL;
+ goto out;
+ }
+ req.oui[2] = ureq.oui & 0x0000ff;
+ req.oui[1] = (ureq.oui & 0x00ff00) >> 8;
+ req.oui[0] = (ureq.oui & 0xff0000) >> 16;
+ memcpy(req.method_mask, ureq.method_mask,
+ sizeof(req.method_mask));
+ }
+
+ agent = ib_register_mad_agent(file->port->ib_dev, file->port->port_num,
+ ureq.qpn ? IB_QPT_GSI : IB_QPT_SMI,
+ ureq.mgmt_class ? &req : NULL,
+ ureq.rmpp_version,
+ send_handler, recv_handler, file,
+ ureq.flags);
+ if (IS_ERR(agent)) {
+ ret = PTR_ERR(agent);
+ agent = NULL;
+ goto out;
+ }
+
+ if (put_user(agent_id,
+ (u32 __user *)(arg +
+ offsetof(struct ib_user_mad_reg_req2, id)))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ if (!file->already_used) {
+ file->already_used = 1;
+ file->use_pkey_index = 1;
+ }
+
+ file->agent[agent_id] = agent;
+ ret = 0;
+
+out:
+ mutex_unlock(&file->mutex);
+
+ if (ret && agent)
+ ib_unregister_mad_agent(agent);
+
+ mutex_unlock(&file->port->file_mutex);
+
+ return ret;
+}
+
+static int ib_umad_unreg_agent(struct ib_umad_file *file, u32 __user *arg)
+{
+ struct ib_mad_agent *agent = NULL;
+ u32 id;
+ int ret = 0;
+
+ if (get_user(id, arg))
+ return -EFAULT;
+
+ mutex_lock(&file->port->file_mutex);
+ mutex_lock(&file->mutex);
+
+ if (id >= IB_UMAD_MAX_AGENTS || !__get_agent(file, id)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ agent = file->agent[id];
+ file->agent[id] = NULL;
+
+out:
+ mutex_unlock(&file->mutex);
+
+ if (agent)
+ ib_unregister_mad_agent(agent);
+
+ mutex_unlock(&file->port->file_mutex);
+
+ return ret;
+}
+
+static long ib_umad_enable_pkey(struct ib_umad_file *file)
+{
+ int ret = 0;
+
+ mutex_lock(&file->mutex);
+ if (file->already_used)
+ ret = -EINVAL;
+ else
+ file->use_pkey_index = 1;
+ mutex_unlock(&file->mutex);
+
+ return ret;
+}
+
+static long ib_umad_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ switch (cmd) {
+ case IB_USER_MAD_REGISTER_AGENT:
+ return ib_umad_reg_agent(filp->private_data, (void __user *) arg, 0);
+ case IB_USER_MAD_UNREGISTER_AGENT:
+ return ib_umad_unreg_agent(filp->private_data, (__u32 __user *) arg);
+ case IB_USER_MAD_ENABLE_PKEY:
+ return ib_umad_enable_pkey(filp->private_data);
+ case IB_USER_MAD_REGISTER_AGENT2:
+ return ib_umad_reg_agent2(filp->private_data, (void __user *) arg);
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+
+#ifdef CONFIG_COMPAT
+static long ib_umad_compat_ioctl(struct file *filp, unsigned int cmd,
+ unsigned long arg)
+{
+ switch (cmd) {
+ case IB_USER_MAD_REGISTER_AGENT:
+ return ib_umad_reg_agent(filp->private_data, compat_ptr(arg), 1);
+ case IB_USER_MAD_UNREGISTER_AGENT:
+ return ib_umad_unreg_agent(filp->private_data, compat_ptr(arg));
+ case IB_USER_MAD_ENABLE_PKEY:
+ return ib_umad_enable_pkey(filp->private_data);
+ case IB_USER_MAD_REGISTER_AGENT2:
+ return ib_umad_reg_agent2(filp->private_data, compat_ptr(arg));
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+#endif
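+
+/*
+ * Illustrative userspace sketch (not part of this file): a MAD consumer
+ * typically opens the umad character device (usually exposed as
+ * /dev/infiniband/umadN), registers an agent, and then exchanges MADs
+ * with write()/read() of struct ib_user_mad. "fd" and the field values
+ * are hypothetical.
+ *
+ *	struct ib_user_mad_reg_req req = { .qpn = 1, .rmpp_version = 0 };
+ *	if (ioctl(fd, IB_USER_MAD_REGISTER_AGENT, &req) < 0)
+ *		return -1;
+ *	(then write()/read() struct ib_user_mad plus MAD payload on fd)
+ */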
+
+/*
+ * ib_umad_open() does not need the BKL:
+ *
+ * - the ib_umad_port structures are properly reference counted, and
+ * everything else is purely local to the file being created, so
+ * races against other open calls are not a problem;
+ * - the ioctl method does not affect any global state outside of the
+ * file structure being operated on;
+ */
+static int ib_umad_open(struct inode *inode, struct file *filp)
+{
+ struct ib_umad_port *port;
+ struct ib_umad_file *file;
+ int ret = -ENXIO;
+
+ port = container_of(inode->i_cdev, struct ib_umad_port, cdev);
+
+ mutex_lock(&port->file_mutex);
+
+ if (!port->ib_dev)
+ goto out;
+
+ ret = -ENOMEM;
+ file = kzalloc(sizeof *file, GFP_KERNEL);
+ if (!file)
+ goto out;
+
+ mutex_init(&file->mutex);
+ spin_lock_init(&file->send_lock);
+ INIT_LIST_HEAD(&file->recv_list);
+ INIT_LIST_HEAD(&file->send_list);
+ init_waitqueue_head(&file->recv_wait);
+
+ file->port = port;
+ filp->private_data = file;
+
+ list_add_tail(&file->port_list, &port->file_list);
+
+ ret = nonseekable_open(inode, filp);
+ if (ret) {
+ list_del(&file->port_list);
+ kfree(file);
+ goto out;
+ }
+
+ kobject_get(&port->umad_dev->kobj);
+
+out:
+ mutex_unlock(&port->file_mutex);
+ return ret;
+}
+
+static int ib_umad_close(struct inode *inode, struct file *filp)
+{
+ struct ib_umad_file *file = filp->private_data;
+ struct ib_umad_device *dev = file->port->umad_dev;
+ struct ib_umad_packet *packet, *tmp;
+ int already_dead;
+ int i;
+
+ mutex_lock(&file->port->file_mutex);
+ mutex_lock(&file->mutex);
+
+ already_dead = file->agents_dead;
+ file->agents_dead = 1;
+
+ list_for_each_entry_safe(packet, tmp, &file->recv_list, list) {
+ if (packet->recv_wc)
+ ib_free_recv_mad(packet->recv_wc);
+ kfree(packet);
+ }
+
+ list_del(&file->port_list);
+
+ mutex_unlock(&file->mutex);
+
+ if (!already_dead)
+ for (i = 0; i < IB_UMAD_MAX_AGENTS; ++i)
+ if (file->agent[i])
+ ib_unregister_mad_agent(file->agent[i]);
+
+ mutex_unlock(&file->port->file_mutex);
+
+ kfree(file);
+ kobject_put(&dev->kobj);
+
+ return 0;
+}
+
+static const struct file_operations umad_fops = {
+ .owner = THIS_MODULE,
+ .read = ib_umad_read,
+ .write = ib_umad_write,
+ .poll = ib_umad_poll,
+ .unlocked_ioctl = ib_umad_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = ib_umad_compat_ioctl,
+#endif
+ .open = ib_umad_open,
+ .release = ib_umad_close,
+ .llseek = no_llseek,
+};
+
+static int ib_umad_sm_open(struct inode *inode, struct file *filp)
+{
+ struct ib_umad_port *port;
+ struct ib_port_modify props = {
+ .set_port_cap_mask = IB_PORT_SM
+ };
+ int ret;
+
+ port = container_of(inode->i_cdev, struct ib_umad_port, sm_cdev);
+
+ if (filp->f_flags & O_NONBLOCK) {
+ if (down_trylock(&port->sm_sem)) {
+ ret = -EAGAIN;
+ goto fail;
+ }
+ } else {
+ if (down_interruptible(&port->sm_sem)) {
+ ret = -ERESTARTSYS;
+ goto fail;
+ }
+ }
+
+ ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props);
+ if (ret)
+ goto err_up_sem;
+
+ filp->private_data = port;
+
+ ret = nonseekable_open(inode, filp);
+ if (ret)
+ goto err_clr_sm_cap;
+
+ kobject_get(&port->umad_dev->kobj);
+
+ return 0;
+
+err_clr_sm_cap:
+ swap(props.set_port_cap_mask, props.clr_port_cap_mask);
+ ib_modify_port(port->ib_dev, port->port_num, 0, &props);
+
+err_up_sem:
+ up(&port->sm_sem);
+
+fail:
+ return ret;
+}
+
+static int ib_umad_sm_close(struct inode *inode, struct file *filp)
+{
+ struct ib_umad_port *port = filp->private_data;
+ struct ib_port_modify props = {
+ .clr_port_cap_mask = IB_PORT_SM
+ };
+ int ret = 0;
+
+ mutex_lock(&port->file_mutex);
+ if (port->ib_dev)
+ ret = ib_modify_port(port->ib_dev, port->port_num, 0, &props);
+ mutex_unlock(&port->file_mutex);
+
+ up(&port->sm_sem);
+
+ kobject_put(&port->umad_dev->kobj);
+
+ return ret;
+}
+
+static const struct file_operations umad_sm_fops = {
+ .owner = THIS_MODULE,
+ .open = ib_umad_sm_open,
+ .release = ib_umad_sm_close,
+ .llseek = no_llseek,
+};
+
+static struct ib_client umad_client = {
+ .name = "umad",
+ .add = ib_umad_add_one,
+ .remove = ib_umad_remove_one
+};
+
+static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct ib_umad_port *port = dev_get_drvdata(dev);
+
+ if (!port)
+ return -ENODEV;
+
+ return sprintf(buf, "%s\n", port->ib_dev->name);
+}
+static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+
+static ssize_t show_port(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct ib_umad_port *port = dev_get_drvdata(dev);
+
+ if (!port)
+ return -ENODEV;
+
+ return sprintf(buf, "%d\n", port->port_num);
+}
+static DEVICE_ATTR(port, S_IRUGO, show_port, NULL);
+
+static CLASS_ATTR_STRING(abi_version, S_IRUGO,
+ __stringify(IB_USER_MAD_ABI_VERSION));
+
+static dev_t overflow_maj;
+static DECLARE_BITMAP(overflow_map, IB_UMAD_MAX_PORTS);
+static int find_overflow_devnum(struct ib_device *device)
+{
+ int ret;
+
+ if (!overflow_maj) {
+ ret = alloc_chrdev_region(&overflow_maj, 0, IB_UMAD_MAX_PORTS * 2,
+ "infiniband_mad");
+ if (ret) {
+ dev_err(&device->dev,
+ "couldn't register dynamic device number\n");
+ return ret;
+ }
+ }
+
+ ret = find_first_zero_bit(overflow_map, IB_UMAD_MAX_PORTS);
+ if (ret >= IB_UMAD_MAX_PORTS)
+ return -1;
+
+ return ret;
+}
+
+static int ib_umad_init_port(struct ib_device *device, int port_num,
+ struct ib_umad_device *umad_dev,
+ struct ib_umad_port *port)
+{
+ int devnum;
+ dev_t base;
+
+ spin_lock(&port_lock);
+ devnum = find_first_zero_bit(dev_map, IB_UMAD_MAX_PORTS);
+ if (devnum >= IB_UMAD_MAX_PORTS) {
+ spin_unlock(&port_lock);
+ devnum = find_overflow_devnum(device);
+ if (devnum < 0)
+ return -1;
+
+ spin_lock(&port_lock);
+ port->dev_num = devnum + IB_UMAD_MAX_PORTS;
+ base = devnum + overflow_maj;
+ set_bit(devnum, overflow_map);
+ } else {
+ port->dev_num = devnum;
+ base = devnum + base_dev;
+ set_bit(devnum, dev_map);
+ }
+ spin_unlock(&port_lock);
+
+ port->ib_dev = device;
+ port->port_num = port_num;
+ sema_init(&port->sm_sem, 1);
+ mutex_init(&port->file_mutex);
+ INIT_LIST_HEAD(&port->file_list);
+
+ cdev_init(&port->cdev, &umad_fops);
+ port->cdev.owner = THIS_MODULE;
+ port->cdev.kobj.parent = &umad_dev->kobj;
+ kobject_set_name(&port->cdev.kobj, "umad%d", port->dev_num);
+ if (cdev_add(&port->cdev, base, 1))
+ goto err_cdev;
+
+ port->dev = device_create(umad_class, device->dma_device,
+ port->cdev.dev, port,
+ "umad%d", port->dev_num);
+ if (IS_ERR(port->dev))
+ goto err_cdev;
+
+ if (device_create_file(port->dev, &dev_attr_ibdev))
+ goto err_dev;
+ if (device_create_file(port->dev, &dev_attr_port))
+ goto err_dev;
+
+ base += IB_UMAD_MAX_PORTS;
+ cdev_init(&port->sm_cdev, &umad_sm_fops);
+ port->sm_cdev.owner = THIS_MODULE;
+ port->sm_cdev.kobj.parent = &umad_dev->kobj;
+ kobject_set_name(&port->sm_cdev.kobj, "issm%d", port->dev_num);
+ if (cdev_add(&port->sm_cdev, base, 1))
+ goto err_sm_cdev;
+
+ port->sm_dev = device_create(umad_class, device->dma_device,
+ port->sm_cdev.dev, port,
+ "issm%d", port->dev_num);
+ if (IS_ERR(port->sm_dev))
+ goto err_sm_cdev;
+
+ if (device_create_file(port->sm_dev, &dev_attr_ibdev))
+ goto err_sm_dev;
+ if (device_create_file(port->sm_dev, &dev_attr_port))
+ goto err_sm_dev;
+
+ return 0;
+
+err_sm_dev:
+ device_destroy(umad_class, port->sm_cdev.dev);
+
+err_sm_cdev:
+ cdev_del(&port->sm_cdev);
+
+err_dev:
+ device_destroy(umad_class, port->cdev.dev);
+
+err_cdev:
+ cdev_del(&port->cdev);
+ if (port->dev_num < IB_UMAD_MAX_PORTS)
+ clear_bit(devnum, dev_map);
+ else
+ clear_bit(devnum, overflow_map);
+
+ return -1;
+}
+
+static void ib_umad_kill_port(struct ib_umad_port *port)
+{
+ struct ib_umad_file *file;
+ int id;
+
+ dev_set_drvdata(port->dev, NULL);
+ dev_set_drvdata(port->sm_dev, NULL);
+
+ device_destroy(umad_class, port->cdev.dev);
+ device_destroy(umad_class, port->sm_cdev.dev);
+
+ cdev_del(&port->cdev);
+ cdev_del(&port->sm_cdev);
+
+ mutex_lock(&port->file_mutex);
+
+ port->ib_dev = NULL;
+
+ list_for_each_entry(file, &port->file_list, port_list) {
+ mutex_lock(&file->mutex);
+ file->agents_dead = 1;
+ mutex_unlock(&file->mutex);
+
+ for (id = 0; id < IB_UMAD_MAX_AGENTS; ++id)
+ if (file->agent[id])
+ ib_unregister_mad_agent(file->agent[id]);
+ }
+
+ mutex_unlock(&port->file_mutex);
+
+ if (port->dev_num < IB_UMAD_MAX_PORTS)
+ clear_bit(port->dev_num, dev_map);
+ else
+ clear_bit(port->dev_num - IB_UMAD_MAX_PORTS, overflow_map);
+}
+
+static void ib_umad_add_one(struct ib_device *device)
+{
+ struct ib_umad_device *umad_dev;
+ int s, e, i;
+
+ if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+ return;
+
+ if (device->node_type == RDMA_NODE_IB_SWITCH)
+ s = e = 0;
+ else {
+ s = 1;
+ e = device->phys_port_cnt;
+ }
+
+ umad_dev = kzalloc(sizeof *umad_dev +
+ (e - s + 1) * sizeof (struct ib_umad_port),
+ GFP_KERNEL);
+ if (!umad_dev)
+ return;
+
+ kobject_init(&umad_dev->kobj, &ib_umad_dev_ktype);
+
+ umad_dev->start_port = s;
+ umad_dev->end_port = e;
+
+ for (i = s; i <= e; ++i) {
+ umad_dev->port[i - s].umad_dev = umad_dev;
+
+ if (ib_umad_init_port(device, i, umad_dev,
+ &umad_dev->port[i - s]))
+ goto err;
+ }
+
+ ib_set_client_data(device, &umad_client, umad_dev);
+
+ return;
+
+err:
+ while (--i >= s)
+ ib_umad_kill_port(&umad_dev->port[i - s]);
+
+ kobject_put(&umad_dev->kobj);
+}
+
+static void ib_umad_remove_one(struct ib_device *device)
+{
+ struct ib_umad_device *umad_dev = ib_get_client_data(device, &umad_client);
+ int i;
+
+ if (!umad_dev)
+ return;
+
+ for (i = 0; i <= umad_dev->end_port - umad_dev->start_port; ++i)
+ ib_umad_kill_port(&umad_dev->port[i]);
+
+ kobject_put(&umad_dev->kobj);
+}
+
+static char *umad_devnode(struct device *dev, umode_t *mode)
+{
+ return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
+}
+
+static int __init ib_umad_init(void)
+{
+ int ret;
+
+ ret = register_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2,
+ "infiniband_mad");
+ if (ret) {
+ pr_err("couldn't register device number\n");
+ goto out;
+ }
+
+ umad_class = class_create(THIS_MODULE, "infiniband_mad");
+ if (IS_ERR(umad_class)) {
+ ret = PTR_ERR(umad_class);
+ pr_err("couldn't create class infiniband_mad\n");
+ goto out_chrdev;
+ }
+
+ umad_class->devnode = umad_devnode;
+
+ ret = class_create_file(umad_class, &class_attr_abi_version.attr);
+ if (ret) {
+ pr_err("couldn't create abi_version attribute\n");
+ goto out_class;
+ }
+
+ ret = ib_register_client(&umad_client);
+ if (ret) {
+ pr_err("couldn't register ib_umad client\n");
+ goto out_class;
+ }
+
+ return 0;
+
+out_class:
+ class_destroy(umad_class);
+
+out_chrdev:
+ unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2);
+
+out:
+ return ret;
+}
+
+static void __exit ib_umad_cleanup(void)
+{
+ ib_unregister_client(&umad_client);
+ class_destroy(umad_class);
+ unregister_chrdev_region(base_dev, IB_UMAD_MAX_PORTS * 2);
+ if (overflow_maj)
+ unregister_chrdev_region(overflow_maj, IB_UMAD_MAX_PORTS * 2);
+}
+
+module_init(ib_umad_init);
+module_exit(ib_umad_cleanup);
diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h
new file mode 100644
index 000000000..b716b0815
--- /dev/null
+++ b/drivers/infiniband/core/uverbs.h
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef UVERBS_H
+#define UVERBS_H
+
+#include <linux/kref.h>
+#include <linux/idr.h>
+#include <linux/mutex.h>
+#include <linux/completion.h>
+#include <linux/cdev.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+
+#define INIT_UDATA(udata, ibuf, obuf, ilen, olen) \
+ do { \
+ (udata)->inbuf = (const void __user *) (ibuf); \
+ (udata)->outbuf = (void __user *) (obuf); \
+ (udata)->inlen = (ilen); \
+ (udata)->outlen = (olen); \
+ } while (0)
+
+#define INIT_UDATA_BUF_OR_NULL(udata, ibuf, obuf, ilen, olen) \
+ do { \
+ (udata)->inbuf = (ilen) ? (const void __user *) (ibuf) : NULL; \
+ (udata)->outbuf = (olen) ? (void __user *) (obuf) : NULL; \
+ (udata)->inlen = (ilen); \
+ (udata)->outlen = (olen); \
+ } while (0)
+
+/*
+ * Our lifetime rules for these structs are the following:
+ *
+ * struct ib_uverbs_device: One reference is held by the module and
+ * released in ib_uverbs_remove_one(). Another reference is taken by
+ * ib_uverbs_open() each time the character special file is opened,
+ * and released in ib_uverbs_release_file() when the file is released.
+ *
+ * struct ib_uverbs_file: One reference is held by the VFS and
+ * released when the file is closed. Another reference is taken when
+ * an asynchronous event queue file is created and released when the
+ * event file is closed.
+ *
+ * struct ib_uverbs_event_file: One reference is held by the VFS and
+ * released when the file is closed. For asynchronous event files,
+ * another reference is held by the corresponding main context file
+ * and released when that file is closed. For completion event files,
+ * a reference is taken when a CQ is created that uses the file, and
+ * released when the CQ is destroyed.
+ */
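/*
 * A minimal, hypothetical sketch (invented names) of the kref pattern the
 * lifetime rules above describe: each holder takes one reference, and the
 * underlying storage is freed only when the last holder drops its reference.
 */
#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/slab.h>

struct demo_obj {				/* stands in for one of the structs above */
	struct kref ref;
};

static void demo_obj_release(struct kref *ref)
{
	kfree(container_of(ref, struct demo_obj, ref));
}

static struct demo_obj *demo_obj_create(void)	/* e.g. at device-add time */
{
	struct demo_obj *obj = kzalloc(sizeof(*obj), GFP_KERNEL);

	if (obj)
		kref_init(&obj->ref);		/* the creator's reference */
	return obj;
}

static void demo_obj_get(struct demo_obj *obj)	/* e.g. at open() time */
{
	kref_get(&obj->ref);
}

static void demo_obj_put(struct demo_obj *obj)	/* last put frees the object */
{
	kref_put(&obj->ref, demo_obj_release);
}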
+
+struct ib_uverbs_device {
+ struct kref ref;
+ int num_comp_vectors;
+ struct completion comp;
+ struct device *dev;
+ struct ib_device *ib_dev;
+ int devnum;
+ struct cdev cdev;
+ struct rb_root xrcd_tree;
+ struct mutex xrcd_tree_mutex;
+};
+
+struct ib_uverbs_event_file {
+ struct kref ref;
+ int is_async;
+ struct ib_uverbs_file *uverbs_file;
+ spinlock_t lock;
+ int is_closed;
+ wait_queue_head_t poll_wait;
+ struct fasync_struct *async_queue;
+ struct list_head event_list;
+};
+
+struct ib_uverbs_file {
+ struct kref ref;
+ struct mutex mutex;
+ struct ib_uverbs_device *device;
+ struct ib_ucontext *ucontext;
+ struct ib_event_handler event_handler;
+ struct ib_uverbs_event_file *async_file;
+};
+
+struct ib_uverbs_event {
+ union {
+ struct ib_uverbs_async_event_desc async;
+ struct ib_uverbs_comp_event_desc comp;
+ } desc;
+ struct list_head list;
+ struct list_head obj_list;
+ u32 *counter;
+};
+
+struct ib_uverbs_mcast_entry {
+ struct list_head list;
+ union ib_gid gid;
+ u16 lid;
+};
+
+struct ib_uevent_object {
+ struct ib_uobject uobject;
+ struct list_head event_list;
+ u32 events_reported;
+};
+
+struct ib_uxrcd_object {
+ struct ib_uobject uobject;
+ atomic_t refcnt;
+};
+
+struct ib_usrq_object {
+ struct ib_uevent_object uevent;
+ struct ib_uxrcd_object *uxrcd;
+};
+
+struct ib_uqp_object {
+ struct ib_uevent_object uevent;
+ struct list_head mcast_list;
+ struct ib_uxrcd_object *uxrcd;
+};
+
+struct ib_ucq_object {
+ struct ib_uobject uobject;
+ struct ib_uverbs_file *uverbs_file;
+ struct list_head comp_list;
+ struct list_head async_list;
+ u32 comp_events_reported;
+ u32 async_events_reported;
+};
+
+extern spinlock_t ib_uverbs_idr_lock;
+extern struct idr ib_uverbs_pd_idr;
+extern struct idr ib_uverbs_mr_idr;
+extern struct idr ib_uverbs_mw_idr;
+extern struct idr ib_uverbs_ah_idr;
+extern struct idr ib_uverbs_cq_idr;
+extern struct idr ib_uverbs_qp_idr;
+extern struct idr ib_uverbs_srq_idr;
+extern struct idr ib_uverbs_xrcd_idr;
+extern struct idr ib_uverbs_rule_idr;
+
+void idr_remove_uobj(struct idr *idp, struct ib_uobject *uobj);
+
+struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+ int is_async);
+struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd);
+
+void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
+ struct ib_uverbs_event_file *ev_file,
+ struct ib_ucq_object *uobj);
+void ib_uverbs_release_uevent(struct ib_uverbs_file *file,
+ struct ib_uevent_object *uobj);
+
+void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context);
+void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr);
+void ib_uverbs_event_handler(struct ib_event_handler *handler,
+ struct ib_event *event);
+void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd);
+
+struct ib_uverbs_flow_spec {
+ union {
+ union {
+ struct ib_uverbs_flow_spec_hdr hdr;
+ struct {
+ __u32 type;
+ __u16 size;
+ __u16 reserved;
+ };
+ };
+ struct ib_uverbs_flow_spec_eth eth;
+ struct ib_uverbs_flow_spec_ipv4 ipv4;
+ struct ib_uverbs_flow_spec_tcp_udp tcp_udp;
+ };
+};
+
+#define IB_UVERBS_DECLARE_CMD(name) \
+ ssize_t ib_uverbs_##name(struct ib_uverbs_file *file, \
+ const char __user *buf, int in_len, \
+ int out_len)
+
+IB_UVERBS_DECLARE_CMD(get_context);
+IB_UVERBS_DECLARE_CMD(query_device);
+IB_UVERBS_DECLARE_CMD(query_port);
+IB_UVERBS_DECLARE_CMD(alloc_pd);
+IB_UVERBS_DECLARE_CMD(dealloc_pd);
+IB_UVERBS_DECLARE_CMD(reg_mr);
+IB_UVERBS_DECLARE_CMD(rereg_mr);
+IB_UVERBS_DECLARE_CMD(dereg_mr);
+IB_UVERBS_DECLARE_CMD(alloc_mw);
+IB_UVERBS_DECLARE_CMD(dealloc_mw);
+IB_UVERBS_DECLARE_CMD(create_comp_channel);
+IB_UVERBS_DECLARE_CMD(create_cq);
+IB_UVERBS_DECLARE_CMD(resize_cq);
+IB_UVERBS_DECLARE_CMD(poll_cq);
+IB_UVERBS_DECLARE_CMD(req_notify_cq);
+IB_UVERBS_DECLARE_CMD(destroy_cq);
+IB_UVERBS_DECLARE_CMD(create_qp);
+IB_UVERBS_DECLARE_CMD(open_qp);
+IB_UVERBS_DECLARE_CMD(query_qp);
+IB_UVERBS_DECLARE_CMD(modify_qp);
+IB_UVERBS_DECLARE_CMD(destroy_qp);
+IB_UVERBS_DECLARE_CMD(post_send);
+IB_UVERBS_DECLARE_CMD(post_recv);
+IB_UVERBS_DECLARE_CMD(post_srq_recv);
+IB_UVERBS_DECLARE_CMD(create_ah);
+IB_UVERBS_DECLARE_CMD(destroy_ah);
+IB_UVERBS_DECLARE_CMD(attach_mcast);
+IB_UVERBS_DECLARE_CMD(detach_mcast);
+IB_UVERBS_DECLARE_CMD(create_srq);
+IB_UVERBS_DECLARE_CMD(modify_srq);
+IB_UVERBS_DECLARE_CMD(query_srq);
+IB_UVERBS_DECLARE_CMD(destroy_srq);
+IB_UVERBS_DECLARE_CMD(create_xsrq);
+IB_UVERBS_DECLARE_CMD(open_xrcd);
+IB_UVERBS_DECLARE_CMD(close_xrcd);
+
+#define IB_UVERBS_DECLARE_EX_CMD(name) \
+ int ib_uverbs_ex_##name(struct ib_uverbs_file *file, \
+ struct ib_udata *ucore, \
+ struct ib_udata *uhw)
+
+IB_UVERBS_DECLARE_EX_CMD(create_flow);
+IB_UVERBS_DECLARE_EX_CMD(destroy_flow);
+IB_UVERBS_DECLARE_EX_CMD(query_device);
+
+#endif /* UVERBS_H */
diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c
new file mode 100644
index 000000000..a9f048990
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_cmd.c
@@ -0,0 +1,3357 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006, 2007 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ * Copyright (c) 2006 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+#include <asm/uaccess.h>
+
+#include "uverbs.h"
+#include "core_priv.h"
+
+struct uverbs_lock_class {
+ struct lock_class_key key;
+ char name[16];
+};
+
+static struct uverbs_lock_class pd_lock_class = { .name = "PD-uobj" };
+static struct uverbs_lock_class mr_lock_class = { .name = "MR-uobj" };
+static struct uverbs_lock_class mw_lock_class = { .name = "MW-uobj" };
+static struct uverbs_lock_class cq_lock_class = { .name = "CQ-uobj" };
+static struct uverbs_lock_class qp_lock_class = { .name = "QP-uobj" };
+static struct uverbs_lock_class ah_lock_class = { .name = "AH-uobj" };
+static struct uverbs_lock_class srq_lock_class = { .name = "SRQ-uobj" };
+static struct uverbs_lock_class xrcd_lock_class = { .name = "XRCD-uobj" };
+static struct uverbs_lock_class rule_lock_class = { .name = "RULE-uobj" };
+
+/*
+ * The ib_uobject locking scheme is as follows:
+ *
+ * - ib_uverbs_idr_lock protects the uverbs idrs themselves, so it
+ * needs to be held during all idr operations. When an object is
+ * looked up, a reference must be taken on the object's kref before
+ * dropping this lock.
+ *
+ * - Each object also has an rwsem. This rwsem must be held for
+ * reading while an operation that uses the object is performed.
+ * For example, while registering an MR, the associated PD's
+ * uobject.mutex must be held for reading. The rwsem must be held
+ * for writing while initializing or destroying an object.
+ *
+ * - In addition, each object has a "live" flag. If this flag is not
+ * set, then lookups of the object will fail even if it is found in
+ * the idr. This handles a reader that blocks and does not acquire
+ * the rwsem until after the object is destroyed. The destroy
+ * operation will set the live flag to 0 and then drop the rwsem;
+ * this will allow the reader to acquire the rwsem, see that the
+ * live flag is 0, and then drop the rwsem and its reference to the
+ * object. The underlying storage will not be freed until the last
+ * reference to the object is dropped.
+ */
+
+static void init_uobj(struct ib_uobject *uobj, u64 user_handle,
+ struct ib_ucontext *context, struct uverbs_lock_class *c)
+{
+ uobj->user_handle = user_handle;
+ uobj->context = context;
+ kref_init(&uobj->ref);
+ init_rwsem(&uobj->mutex);
+ lockdep_set_class_and_name(&uobj->mutex, &c->key, c->name);
+ uobj->live = 0;
+}
+
+static void release_uobj(struct kref *kref)
+{
+ kfree(container_of(kref, struct ib_uobject, ref));
+}
+
+static void put_uobj(struct ib_uobject *uobj)
+{
+ kref_put(&uobj->ref, release_uobj);
+}
+
+static void put_uobj_read(struct ib_uobject *uobj)
+{
+ up_read(&uobj->mutex);
+ put_uobj(uobj);
+}
+
+static void put_uobj_write(struct ib_uobject *uobj)
+{
+ up_write(&uobj->mutex);
+ put_uobj(uobj);
+}
+
+static int idr_add_uobj(struct idr *idr, struct ib_uobject *uobj)
+{
+ int ret;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock(&ib_uverbs_idr_lock);
+
+ ret = idr_alloc(idr, uobj, 0, 0, GFP_NOWAIT);
+ if (ret >= 0)
+ uobj->id = ret;
+
+ spin_unlock(&ib_uverbs_idr_lock);
+ idr_preload_end();
+
+ return ret < 0 ? ret : 0;
+}
+
+void idr_remove_uobj(struct idr *idr, struct ib_uobject *uobj)
+{
+ spin_lock(&ib_uverbs_idr_lock);
+ idr_remove(idr, uobj->id);
+ spin_unlock(&ib_uverbs_idr_lock);
+}
+
+static struct ib_uobject *__idr_get_uobj(struct idr *idr, int id,
+ struct ib_ucontext *context)
+{
+ struct ib_uobject *uobj;
+
+ spin_lock(&ib_uverbs_idr_lock);
+ uobj = idr_find(idr, id);
+ if (uobj) {
+ if (uobj->context == context)
+ kref_get(&uobj->ref);
+ else
+ uobj = NULL;
+ }
+ spin_unlock(&ib_uverbs_idr_lock);
+
+ return uobj;
+}
+
+static struct ib_uobject *idr_read_uobj(struct idr *idr, int id,
+ struct ib_ucontext *context, int nested)
+{
+ struct ib_uobject *uobj;
+
+ uobj = __idr_get_uobj(idr, id, context);
+ if (!uobj)
+ return NULL;
+
+ if (nested)
+ down_read_nested(&uobj->mutex, SINGLE_DEPTH_NESTING);
+ else
+ down_read(&uobj->mutex);
+ if (!uobj->live) {
+ put_uobj_read(uobj);
+ return NULL;
+ }
+
+ return uobj;
+}
+
+static struct ib_uobject *idr_write_uobj(struct idr *idr, int id,
+ struct ib_ucontext *context)
+{
+ struct ib_uobject *uobj;
+
+ uobj = __idr_get_uobj(idr, id, context);
+ if (!uobj)
+ return NULL;
+
+ down_write(&uobj->mutex);
+ if (!uobj->live) {
+ put_uobj_write(uobj);
+ return NULL;
+ }
+
+ return uobj;
+}
+
+static void *idr_read_obj(struct idr *idr, int id, struct ib_ucontext *context,
+ int nested)
+{
+ struct ib_uobject *uobj;
+
+ uobj = idr_read_uobj(idr, id, context, nested);
+ return uobj ? uobj->object : NULL;
+}
+
+static struct ib_pd *idr_read_pd(int pd_handle, struct ib_ucontext *context)
+{
+ return idr_read_obj(&ib_uverbs_pd_idr, pd_handle, context, 0);
+}
+
+static void put_pd_read(struct ib_pd *pd)
+{
+ put_uobj_read(pd->uobject);
+}
+
+static struct ib_cq *idr_read_cq(int cq_handle, struct ib_ucontext *context, int nested)
+{
+ return idr_read_obj(&ib_uverbs_cq_idr, cq_handle, context, nested);
+}
+
+static void put_cq_read(struct ib_cq *cq)
+{
+ put_uobj_read(cq->uobject);
+}
+
+static struct ib_ah *idr_read_ah(int ah_handle, struct ib_ucontext *context)
+{
+ return idr_read_obj(&ib_uverbs_ah_idr, ah_handle, context, 0);
+}
+
+static void put_ah_read(struct ib_ah *ah)
+{
+ put_uobj_read(ah->uobject);
+}
+
+static struct ib_qp *idr_read_qp(int qp_handle, struct ib_ucontext *context)
+{
+ return idr_read_obj(&ib_uverbs_qp_idr, qp_handle, context, 0);
+}
+
+static struct ib_qp *idr_write_qp(int qp_handle, struct ib_ucontext *context)
+{
+ struct ib_uobject *uobj;
+
+ uobj = idr_write_uobj(&ib_uverbs_qp_idr, qp_handle, context);
+ return uobj ? uobj->object : NULL;
+}
+
+static void put_qp_read(struct ib_qp *qp)
+{
+ put_uobj_read(qp->uobject);
+}
+
+static void put_qp_write(struct ib_qp *qp)
+{
+ put_uobj_write(qp->uobject);
+}
+
+static struct ib_srq *idr_read_srq(int srq_handle, struct ib_ucontext *context)
+{
+ return idr_read_obj(&ib_uverbs_srq_idr, srq_handle, context, 0);
+}
+
+static void put_srq_read(struct ib_srq *srq)
+{
+ put_uobj_read(srq->uobject);
+}
+
+static struct ib_xrcd *idr_read_xrcd(int xrcd_handle, struct ib_ucontext *context,
+ struct ib_uobject **uobj)
+{
+ *uobj = idr_read_uobj(&ib_uverbs_xrcd_idr, xrcd_handle, context, 0);
+ return *uobj ? (*uobj)->object : NULL;
+}
+
+static void put_xrcd_read(struct ib_uobject *uobj)
+{
+ put_uobj_read(uobj);
+}
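/*
 * A hypothetical command-handler fragment (demo_use_pd() is invented for
 * illustration) showing the reader-side protocol from the locking comment
 * near the top of this file, composed from the idr_read_pd()/put_pd_read()
 * helpers defined above: the lookup pins the uobject with a kref and takes
 * its rwsem for reading, and the put releases both.
 */
static int demo_use_pd(struct ib_ucontext *context, int pd_handle)
{
	struct ib_pd *pd = idr_read_pd(pd_handle, context);

	if (!pd)		/* not found, wrong context, or no longer live */
		return -EINVAL;

	/* ... use pd here; its uobject rwsem is held for reading ... */

	put_pd_read(pd);	/* up_read() + kref_put() */
	return 0;
}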
+
+ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file,
+ const char __user *buf,
+ int in_len, int out_len)
+{
+ struct ib_uverbs_get_context cmd;
+ struct ib_uverbs_get_context_resp resp;
+ struct ib_udata udata;
+ struct ib_device *ibdev = file->device->ib_dev;
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+ struct ib_device_attr dev_attr;
+#endif
+ struct ib_ucontext *ucontext;
+ struct file *filp;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ mutex_lock(&file->mutex);
+
+ if (file->ucontext) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ ucontext = ibdev->alloc_ucontext(ibdev, &udata);
+ if (IS_ERR(ucontext)) {
+ ret = PTR_ERR(ucontext);
+ goto err;
+ }
+
+ ucontext->device = ibdev;
+ INIT_LIST_HEAD(&ucontext->pd_list);
+ INIT_LIST_HEAD(&ucontext->mr_list);
+ INIT_LIST_HEAD(&ucontext->mw_list);
+ INIT_LIST_HEAD(&ucontext->cq_list);
+ INIT_LIST_HEAD(&ucontext->qp_list);
+ INIT_LIST_HEAD(&ucontext->srq_list);
+ INIT_LIST_HEAD(&ucontext->ah_list);
+ INIT_LIST_HEAD(&ucontext->xrcd_list);
+ INIT_LIST_HEAD(&ucontext->rule_list);
+ rcu_read_lock();
+ ucontext->tgid = get_task_pid(current->group_leader, PIDTYPE_PID);
+ rcu_read_unlock();
+ ucontext->closing = 0;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+ ucontext->umem_tree = RB_ROOT;
+ init_rwsem(&ucontext->umem_rwsem);
+ ucontext->odp_mrs_count = 0;
+ INIT_LIST_HEAD(&ucontext->no_private_counters);
+
+ ret = ib_query_device(ibdev, &dev_attr);
+ if (ret)
+ goto err_free;
+ if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING))
+ ucontext->invalidate_range = NULL;
+
+#endif
+
+ resp.num_comp_vectors = file->device->num_comp_vectors;
+
+ ret = get_unused_fd_flags(O_CLOEXEC);
+ if (ret < 0)
+ goto err_free;
+ resp.async_fd = ret;
+
+ filp = ib_uverbs_alloc_event_file(file, 1);
+ if (IS_ERR(filp)) {
+ ret = PTR_ERR(filp);
+ goto err_fd;
+ }
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_file;
+ }
+
+ file->async_file = filp->private_data;
+
+ INIT_IB_EVENT_HANDLER(&file->event_handler, file->device->ib_dev,
+ ib_uverbs_event_handler);
+ ret = ib_register_event_handler(&file->event_handler);
+ if (ret)
+ goto err_file;
+
+ kref_get(&file->async_file->ref);
+ kref_get(&file->ref);
+ file->ucontext = ucontext;
+
+ fd_install(resp.async_fd, filp);
+
+ mutex_unlock(&file->mutex);
+
+ return in_len;
+
+err_file:
+ fput(filp);
+
+err_fd:
+ put_unused_fd(resp.async_fd);
+
+err_free:
+ put_pid(ucontext->tgid);
+ ibdev->dealloc_ucontext(ucontext);
+
+err:
+ mutex_unlock(&file->mutex);
+ return ret;
+}
+
+static void copy_query_dev_fields(struct ib_uverbs_file *file,
+ struct ib_uverbs_query_device_resp *resp,
+ struct ib_device_attr *attr)
+{
+ resp->fw_ver = attr->fw_ver;
+ resp->node_guid = file->device->ib_dev->node_guid;
+ resp->sys_image_guid = attr->sys_image_guid;
+ resp->max_mr_size = attr->max_mr_size;
+ resp->page_size_cap = attr->page_size_cap;
+ resp->vendor_id = attr->vendor_id;
+ resp->vendor_part_id = attr->vendor_part_id;
+ resp->hw_ver = attr->hw_ver;
+ resp->max_qp = attr->max_qp;
+ resp->max_qp_wr = attr->max_qp_wr;
+ resp->device_cap_flags = attr->device_cap_flags;
+ resp->max_sge = attr->max_sge;
+ resp->max_sge_rd = attr->max_sge_rd;
+ resp->max_cq = attr->max_cq;
+ resp->max_cqe = attr->max_cqe;
+ resp->max_mr = attr->max_mr;
+ resp->max_pd = attr->max_pd;
+ resp->max_qp_rd_atom = attr->max_qp_rd_atom;
+ resp->max_ee_rd_atom = attr->max_ee_rd_atom;
+ resp->max_res_rd_atom = attr->max_res_rd_atom;
+ resp->max_qp_init_rd_atom = attr->max_qp_init_rd_atom;
+ resp->max_ee_init_rd_atom = attr->max_ee_init_rd_atom;
+ resp->atomic_cap = attr->atomic_cap;
+ resp->max_ee = attr->max_ee;
+ resp->max_rdd = attr->max_rdd;
+ resp->max_mw = attr->max_mw;
+ resp->max_raw_ipv6_qp = attr->max_raw_ipv6_qp;
+ resp->max_raw_ethy_qp = attr->max_raw_ethy_qp;
+ resp->max_mcast_grp = attr->max_mcast_grp;
+ resp->max_mcast_qp_attach = attr->max_mcast_qp_attach;
+ resp->max_total_mcast_qp_attach = attr->max_total_mcast_qp_attach;
+ resp->max_ah = attr->max_ah;
+ resp->max_fmr = attr->max_fmr;
+ resp->max_map_per_fmr = attr->max_map_per_fmr;
+ resp->max_srq = attr->max_srq;
+ resp->max_srq_wr = attr->max_srq_wr;
+ resp->max_srq_sge = attr->max_srq_sge;
+ resp->max_pkeys = attr->max_pkeys;
+ resp->local_ca_ack_delay = attr->local_ca_ack_delay;
+ resp->phys_port_cnt = file->device->ib_dev->phys_port_cnt;
+}
+
+ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file,
+ const char __user *buf,
+ int in_len, int out_len)
+{
+ struct ib_uverbs_query_device cmd;
+ struct ib_uverbs_query_device_resp resp;
+ struct ib_device_attr attr;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ ret = ib_query_device(file->device->ib_dev, &attr);
+ if (ret)
+ return ret;
+
+ memset(&resp, 0, sizeof resp);
+ copy_query_dev_fields(file, &resp, &attr);
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
+ return -EFAULT;
+
+ return in_len;
+}
+
+ssize_t ib_uverbs_query_port(struct ib_uverbs_file *file,
+ const char __user *buf,
+ int in_len, int out_len)
+{
+ struct ib_uverbs_query_port cmd;
+ struct ib_uverbs_query_port_resp resp;
+ struct ib_port_attr attr;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ ret = ib_query_port(file->device->ib_dev, cmd.port_num, &attr);
+ if (ret)
+ return ret;
+
+ memset(&resp, 0, sizeof resp);
+
+ resp.state = attr.state;
+ resp.max_mtu = attr.max_mtu;
+ resp.active_mtu = attr.active_mtu;
+ resp.gid_tbl_len = attr.gid_tbl_len;
+ resp.port_cap_flags = attr.port_cap_flags;
+ resp.max_msg_sz = attr.max_msg_sz;
+ resp.bad_pkey_cntr = attr.bad_pkey_cntr;
+ resp.qkey_viol_cntr = attr.qkey_viol_cntr;
+ resp.pkey_tbl_len = attr.pkey_tbl_len;
+ resp.lid = attr.lid;
+ resp.sm_lid = attr.sm_lid;
+ resp.lmc = attr.lmc;
+ resp.max_vl_num = attr.max_vl_num;
+ resp.sm_sl = attr.sm_sl;
+ resp.subnet_timeout = attr.subnet_timeout;
+ resp.init_type_reply = attr.init_type_reply;
+ resp.active_width = attr.active_width;
+ resp.active_speed = attr.active_speed;
+ resp.phys_state = attr.phys_state;
+ resp.link_layer = rdma_port_get_link_layer(file->device->ib_dev,
+ cmd.port_num);
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
+ return -EFAULT;
+
+ return in_len;
+}
+
+ssize_t ib_uverbs_alloc_pd(struct ib_uverbs_file *file,
+ const char __user *buf,
+ int in_len, int out_len)
+{
+ struct ib_uverbs_alloc_pd cmd;
+ struct ib_uverbs_alloc_pd_resp resp;
+ struct ib_udata udata;
+ struct ib_uobject *uobj;
+ struct ib_pd *pd;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
+ if (!uobj)
+ return -ENOMEM;
+
+ init_uobj(uobj, 0, file->ucontext, &pd_lock_class);
+ down_write(&uobj->mutex);
+
+ pd = file->device->ib_dev->alloc_pd(file->device->ib_dev,
+ file->ucontext, &udata);
+ if (IS_ERR(pd)) {
+ ret = PTR_ERR(pd);
+ goto err;
+ }
+
+ pd->device = file->device->ib_dev;
+ pd->uobject = uobj;
+ atomic_set(&pd->usecnt, 0);
+
+ uobj->object = pd;
+ ret = idr_add_uobj(&ib_uverbs_pd_idr, uobj);
+ if (ret)
+ goto err_idr;
+
+ memset(&resp, 0, sizeof resp);
+ resp.pd_handle = uobj->id;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&uobj->list, &file->ucontext->pd_list);
+ mutex_unlock(&file->mutex);
+
+ uobj->live = 1;
+
+ up_write(&uobj->mutex);
+
+ return in_len;
+
+err_copy:
+ idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
+
+err_idr:
+ ib_dealloc_pd(pd);
+
+err:
+ put_uobj_write(uobj);
+ return ret;
+}
+
+ssize_t ib_uverbs_dealloc_pd(struct ib_uverbs_file *file,
+ const char __user *buf,
+ int in_len, int out_len)
+{
+ struct ib_uverbs_dealloc_pd cmd;
+ struct ib_uobject *uobj;
+ int ret;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ uobj = idr_write_uobj(&ib_uverbs_pd_idr, cmd.pd_handle, file->ucontext);
+ if (!uobj)
+ return -EINVAL;
+
+ ret = ib_dealloc_pd(uobj->object);
+ if (!ret)
+ uobj->live = 0;
+
+ put_uobj_write(uobj);
+
+ if (ret)
+ return ret;
+
+ idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
+
+ mutex_lock(&file->mutex);
+ list_del(&uobj->list);
+ mutex_unlock(&file->mutex);
+
+ put_uobj(uobj);
+
+ return in_len;
+}
+
+struct xrcd_table_entry {
+ struct rb_node node;
+ struct ib_xrcd *xrcd;
+ struct inode *inode;
+};
+
+static int xrcd_table_insert(struct ib_uverbs_device *dev,
+ struct inode *inode,
+ struct ib_xrcd *xrcd)
+{
+ struct xrcd_table_entry *entry, *scan;
+ struct rb_node **p = &dev->xrcd_tree.rb_node;
+ struct rb_node *parent = NULL;
+
+ entry = kmalloc(sizeof *entry, GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+
+ entry->xrcd = xrcd;
+ entry->inode = inode;
+
+ while (*p) {
+ parent = *p;
+ scan = rb_entry(parent, struct xrcd_table_entry, node);
+
+ if (inode < scan->inode) {
+ p = &(*p)->rb_left;
+ } else if (inode > scan->inode) {
+ p = &(*p)->rb_right;
+ } else {
+ kfree(entry);
+ return -EEXIST;
+ }
+ }
+
+ rb_link_node(&entry->node, parent, p);
+ rb_insert_color(&entry->node, &dev->xrcd_tree);
+ igrab(inode);
+ return 0;
+}
+
+static struct xrcd_table_entry *xrcd_table_search(struct ib_uverbs_device *dev,
+ struct inode *inode)
+{
+ struct xrcd_table_entry *entry;
+ struct rb_node *p = dev->xrcd_tree.rb_node;
+
+ while (p) {
+ entry = rb_entry(p, struct xrcd_table_entry, node);
+
+ if (inode < entry->inode)
+ p = p->rb_left;
+ else if (inode > entry->inode)
+ p = p->rb_right;
+ else
+ return entry;
+ }
+
+ return NULL;
+}
+
+static struct ib_xrcd *find_xrcd(struct ib_uverbs_device *dev, struct inode *inode)
+{
+ struct xrcd_table_entry *entry;
+
+ entry = xrcd_table_search(dev, inode);
+ if (!entry)
+ return NULL;
+
+ return entry->xrcd;
+}
+
+static void xrcd_table_delete(struct ib_uverbs_device *dev,
+ struct inode *inode)
+{
+ struct xrcd_table_entry *entry;
+
+ entry = xrcd_table_search(dev, inode);
+ if (entry) {
+ iput(inode);
+ rb_erase(&entry->node, &dev->xrcd_tree);
+ kfree(entry);
+ }
+}
+
+ssize_t ib_uverbs_open_xrcd(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_open_xrcd cmd;
+ struct ib_uverbs_open_xrcd_resp resp;
+ struct ib_udata udata;
+ struct ib_uxrcd_object *obj;
+ struct ib_xrcd *xrcd = NULL;
+ struct fd f = {NULL, 0};
+ struct inode *inode = NULL;
+ int ret = 0;
+ int new_xrcd = 0;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ mutex_lock(&file->device->xrcd_tree_mutex);
+
+ if (cmd.fd != -1) {
+ /* search for file descriptor */
+ f = fdget(cmd.fd);
+ if (!f.file) {
+ ret = -EBADF;
+ goto err_tree_mutex_unlock;
+ }
+
+ inode = file_inode(f.file);
+ xrcd = find_xrcd(file->device, inode);
+ if (!xrcd && !(cmd.oflags & O_CREAT)) {
+ /* no XRCD exists for this inode yet; O_CREAT is required to create one */
+ ret = -EAGAIN;
+ goto err_tree_mutex_unlock;
+ }
+
+ if (xrcd && cmd.oflags & O_EXCL) {
+ ret = -EINVAL;
+ goto err_tree_mutex_unlock;
+ }
+ }
+
+ obj = kmalloc(sizeof *obj, GFP_KERNEL);
+ if (!obj) {
+ ret = -ENOMEM;
+ goto err_tree_mutex_unlock;
+ }
+
+ init_uobj(&obj->uobject, 0, file->ucontext, &xrcd_lock_class);
+
+ down_write(&obj->uobject.mutex);
+
+ if (!xrcd) {
+ xrcd = file->device->ib_dev->alloc_xrcd(file->device->ib_dev,
+ file->ucontext, &udata);
+ if (IS_ERR(xrcd)) {
+ ret = PTR_ERR(xrcd);
+ goto err;
+ }
+
+ xrcd->inode = inode;
+ xrcd->device = file->device->ib_dev;
+ atomic_set(&xrcd->usecnt, 0);
+ mutex_init(&xrcd->tgt_qp_mutex);
+ INIT_LIST_HEAD(&xrcd->tgt_qp_list);
+ new_xrcd = 1;
+ }
+
+ atomic_set(&obj->refcnt, 0);
+ obj->uobject.object = xrcd;
+ ret = idr_add_uobj(&ib_uverbs_xrcd_idr, &obj->uobject);
+ if (ret)
+ goto err_idr;
+
+ memset(&resp, 0, sizeof resp);
+ resp.xrcd_handle = obj->uobject.id;
+
+ if (inode) {
+ if (new_xrcd) {
+ /* create new inode/xrcd table entry */
+ ret = xrcd_table_insert(file->device, inode, xrcd);
+ if (ret)
+ goto err_insert_xrcd;
+ }
+ atomic_inc(&xrcd->usecnt);
+ }
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ if (f.file)
+ fdput(f);
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&obj->uobject.list, &file->ucontext->xrcd_list);
+ mutex_unlock(&file->mutex);
+
+ obj->uobject.live = 1;
+ up_write(&obj->uobject.mutex);
+
+ mutex_unlock(&file->device->xrcd_tree_mutex);
+ return in_len;
+
+err_copy:
+ if (inode) {
+ if (new_xrcd)
+ xrcd_table_delete(file->device, inode);
+ atomic_dec(&xrcd->usecnt);
+ }
+
+err_insert_xrcd:
+ idr_remove_uobj(&ib_uverbs_xrcd_idr, &obj->uobject);
+
+err_idr:
+ ib_dealloc_xrcd(xrcd);
+
+err:
+ put_uobj_write(&obj->uobject);
+
+err_tree_mutex_unlock:
+ if (f.file)
+ fdput(f);
+
+ mutex_unlock(&file->device->xrcd_tree_mutex);
+
+ return ret;
+}
+
+ssize_t ib_uverbs_close_xrcd(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_close_xrcd cmd;
+ struct ib_uobject *uobj;
+ struct ib_xrcd *xrcd = NULL;
+ struct inode *inode = NULL;
+ struct ib_uxrcd_object *obj;
+ int live;
+ int ret = 0;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ mutex_lock(&file->device->xrcd_tree_mutex);
+ uobj = idr_write_uobj(&ib_uverbs_xrcd_idr, cmd.xrcd_handle, file->ucontext);
+ if (!uobj) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ xrcd = uobj->object;
+ inode = xrcd->inode;
+ obj = container_of(uobj, struct ib_uxrcd_object, uobject);
+ if (atomic_read(&obj->refcnt)) {
+ put_uobj_write(uobj);
+ ret = -EBUSY;
+ goto out;
+ }
+
+ if (!inode || atomic_dec_and_test(&xrcd->usecnt)) {
+ ret = ib_dealloc_xrcd(uobj->object);
+ if (!ret)
+ uobj->live = 0;
+ }
+
+ live = uobj->live;
+ if (inode && ret)
+ atomic_inc(&xrcd->usecnt);
+
+ put_uobj_write(uobj);
+
+ if (ret)
+ goto out;
+
+ if (inode && !live)
+ xrcd_table_delete(file->device, inode);
+
+ idr_remove_uobj(&ib_uverbs_xrcd_idr, uobj);
+ mutex_lock(&file->mutex);
+ list_del(&uobj->list);
+ mutex_unlock(&file->mutex);
+
+ put_uobj(uobj);
+ ret = in_len;
+
+out:
+ mutex_unlock(&file->device->xrcd_tree_mutex);
+ return ret;
+}
+
+void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev,
+ struct ib_xrcd *xrcd)
+{
+ struct inode *inode;
+
+ inode = xrcd->inode;
+ if (inode && !atomic_dec_and_test(&xrcd->usecnt))
+ return;
+
+ ib_dealloc_xrcd(xrcd);
+
+ if (inode)
+ xrcd_table_delete(dev, inode);
+}
+
+ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_reg_mr cmd;
+ struct ib_uverbs_reg_mr_resp resp;
+ struct ib_udata udata;
+ struct ib_uobject *uobj;
+ struct ib_pd *pd;
+ struct ib_mr *mr;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ if ((cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK))
+ return -EINVAL;
+
+ ret = ib_check_mr_access(cmd.access_flags);
+ if (ret)
+ return ret;
+
+ uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
+ if (!uobj)
+ return -ENOMEM;
+
+ init_uobj(uobj, 0, file->ucontext, &mr_lock_class);
+ down_write(&uobj->mutex);
+
+ pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+ if (!pd) {
+ ret = -EINVAL;
+ goto err_free;
+ }
+
+ if (cmd.access_flags & IB_ACCESS_ON_DEMAND) {
+ struct ib_device_attr attr;
+
+ ret = ib_query_device(pd->device, &attr);
+ if (ret || !(attr.device_cap_flags &
+ IB_DEVICE_ON_DEMAND_PAGING)) {
+ pr_debug("ODP support not available\n");
+ ret = -EINVAL;
+ goto err_put;
+ }
+ }
+
+ mr = pd->device->reg_user_mr(pd, cmd.start, cmd.length, cmd.hca_va,
+ cmd.access_flags, &udata);
+ if (IS_ERR(mr)) {
+ ret = PTR_ERR(mr);
+ goto err_put;
+ }
+
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->uobject = uobj;
+ atomic_inc(&pd->usecnt);
+ atomic_set(&mr->usecnt, 0);
+
+ uobj->object = mr;
+ ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj);
+ if (ret)
+ goto err_unreg;
+
+ memset(&resp, 0, sizeof resp);
+ resp.lkey = mr->lkey;
+ resp.rkey = mr->rkey;
+ resp.mr_handle = uobj->id;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ put_pd_read(pd);
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&uobj->list, &file->ucontext->mr_list);
+ mutex_unlock(&file->mutex);
+
+ uobj->live = 1;
+
+ up_write(&uobj->mutex);
+
+ return in_len;
+
+err_copy:
+ idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
+
+err_unreg:
+ ib_dereg_mr(mr);
+
+err_put:
+ put_pd_read(pd);
+
+err_free:
+ put_uobj_write(uobj);
+ return ret;
+}
+
+ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_rereg_mr cmd;
+ struct ib_uverbs_rereg_mr_resp resp;
+ struct ib_udata udata;
+ struct ib_pd *pd = NULL;
+ struct ib_mr *mr;
+ struct ib_pd *old_pd;
+ int ret;
+ struct ib_uobject *uobj;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof(cmd)))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof(cmd),
+ (unsigned long) cmd.response + sizeof(resp),
+ in_len - sizeof(cmd), out_len - sizeof(resp));
+
+ if (cmd.flags & ~IB_MR_REREG_SUPPORTED || !cmd.flags)
+ return -EINVAL;
+
+ if ((cmd.flags & IB_MR_REREG_TRANS) &&
+ (!cmd.start || !cmd.hca_va || 0 >= cmd.length ||
+ (cmd.start & ~PAGE_MASK) != (cmd.hca_va & ~PAGE_MASK)))
+ return -EINVAL;
+
+ uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle,
+ file->ucontext);
+
+ if (!uobj)
+ return -EINVAL;
+
+ mr = uobj->object;
+
+ if (cmd.flags & IB_MR_REREG_ACCESS) {
+ ret = ib_check_mr_access(cmd.access_flags);
+ if (ret)
+ goto put_uobjs;
+ }
+
+ if (cmd.flags & IB_MR_REREG_PD) {
+ pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+ if (!pd) {
+ ret = -EINVAL;
+ goto put_uobjs;
+ }
+ }
+
+ if (atomic_read(&mr->usecnt)) {
+ ret = -EBUSY;
+ goto put_uobj_pd;
+ }
+
+ old_pd = mr->pd;
+ ret = mr->device->rereg_user_mr(mr, cmd.flags, cmd.start,
+ cmd.length, cmd.hca_va,
+ cmd.access_flags, pd, &udata);
+ if (!ret) {
+ if (cmd.flags & IB_MR_REREG_PD) {
+ atomic_inc(&pd->usecnt);
+ mr->pd = pd;
+ atomic_dec(&old_pd->usecnt);
+ }
+ } else {
+ goto put_uobj_pd;
+ }
+
+ memset(&resp, 0, sizeof(resp));
+ resp.lkey = mr->lkey;
+ resp.rkey = mr->rkey;
+
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp)))
+ ret = -EFAULT;
+ else
+ ret = in_len;
+
+put_uobj_pd:
+ if (cmd.flags & IB_MR_REREG_PD)
+ put_pd_read(pd);
+
+put_uobjs:
+
+ put_uobj_write(mr->uobject);
+
+ return ret;
+}
+
+ssize_t ib_uverbs_dereg_mr(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_dereg_mr cmd;
+ struct ib_mr *mr;
+ struct ib_uobject *uobj;
+ int ret = -EINVAL;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ uobj = idr_write_uobj(&ib_uverbs_mr_idr, cmd.mr_handle, file->ucontext);
+ if (!uobj)
+ return -EINVAL;
+
+ mr = uobj->object;
+
+ ret = ib_dereg_mr(mr);
+ if (!ret)
+ uobj->live = 0;
+
+ put_uobj_write(uobj);
+
+ if (ret)
+ return ret;
+
+ idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
+
+ mutex_lock(&file->mutex);
+ list_del(&uobj->list);
+ mutex_unlock(&file->mutex);
+
+ put_uobj(uobj);
+
+ return in_len;
+}
+
+ssize_t ib_uverbs_alloc_mw(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_alloc_mw cmd;
+ struct ib_uverbs_alloc_mw_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_pd *pd;
+ struct ib_mw *mw;
+ int ret;
+
+ if (out_len < sizeof(resp))
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof(cmd)))
+ return -EFAULT;
+
+ uobj = kmalloc(sizeof(*uobj), GFP_KERNEL);
+ if (!uobj)
+ return -ENOMEM;
+
+ init_uobj(uobj, 0, file->ucontext, &mw_lock_class);
+ down_write(&uobj->mutex);
+
+ pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+ if (!pd) {
+ ret = -EINVAL;
+ goto err_free;
+ }
+
+ mw = pd->device->alloc_mw(pd, cmd.mw_type);
+ if (IS_ERR(mw)) {
+ ret = PTR_ERR(mw);
+ goto err_put;
+ }
+
+ mw->device = pd->device;
+ mw->pd = pd;
+ mw->uobject = uobj;
+ atomic_inc(&pd->usecnt);
+
+ uobj->object = mw;
+ ret = idr_add_uobj(&ib_uverbs_mw_idr, uobj);
+ if (ret)
+ goto err_unalloc;
+
+ memset(&resp, 0, sizeof(resp));
+ resp.rkey = mw->rkey;
+ resp.mw_handle = uobj->id;
+
+ if (copy_to_user((void __user *)(unsigned long)cmd.response,
+ &resp, sizeof(resp))) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ put_pd_read(pd);
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&uobj->list, &file->ucontext->mw_list);
+ mutex_unlock(&file->mutex);
+
+ uobj->live = 1;
+
+ up_write(&uobj->mutex);
+
+ return in_len;
+
+err_copy:
+ idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
+
+err_unalloc:
+ ib_dealloc_mw(mw);
+
+err_put:
+ put_pd_read(pd);
+
+err_free:
+ put_uobj_write(uobj);
+ return ret;
+}
+
+ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_dealloc_mw cmd;
+ struct ib_mw *mw;
+ struct ib_uobject *uobj;
+ int ret = -EINVAL;
+
+ if (copy_from_user(&cmd, buf, sizeof(cmd)))
+ return -EFAULT;
+
+ uobj = idr_write_uobj(&ib_uverbs_mw_idr, cmd.mw_handle, file->ucontext);
+ if (!uobj)
+ return -EINVAL;
+
+ mw = uobj->object;
+
+ ret = ib_dealloc_mw(mw);
+ if (!ret)
+ uobj->live = 0;
+
+ put_uobj_write(uobj);
+
+ if (ret)
+ return ret;
+
+ idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
+
+ mutex_lock(&file->mutex);
+ list_del(&uobj->list);
+ mutex_unlock(&file->mutex);
+
+ put_uobj(uobj);
+
+ return in_len;
+}
+
+ssize_t ib_uverbs_create_comp_channel(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_create_comp_channel cmd;
+ struct ib_uverbs_create_comp_channel_resp resp;
+ struct file *filp;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ ret = get_unused_fd_flags(O_CLOEXEC);
+ if (ret < 0)
+ return ret;
+ resp.fd = ret;
+
+ filp = ib_uverbs_alloc_event_file(file, 0);
+ if (IS_ERR(filp)) {
+ put_unused_fd(resp.fd);
+ return PTR_ERR(filp);
+ }
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ put_unused_fd(resp.fd);
+ fput(filp);
+ return -EFAULT;
+ }
+
+ fd_install(resp.fd, filp);
+ return in_len;
+}
+
+ssize_t ib_uverbs_create_cq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_create_cq cmd;
+ struct ib_uverbs_create_cq_resp resp;
+ struct ib_udata udata;
+ struct ib_ucq_object *obj;
+ struct ib_uverbs_event_file *ev_file = NULL;
+ struct ib_cq *cq;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ if (cmd.comp_vector >= file->device->num_comp_vectors)
+ return -EINVAL;
+
+ obj = kmalloc(sizeof *obj, GFP_KERNEL);
+ if (!obj)
+ return -ENOMEM;
+
+ init_uobj(&obj->uobject, cmd.user_handle, file->ucontext, &cq_lock_class);
+ down_write(&obj->uobject.mutex);
+
+ if (cmd.comp_channel >= 0) {
+ ev_file = ib_uverbs_lookup_comp_file(cmd.comp_channel);
+ if (!ev_file) {
+ ret = -EINVAL;
+ goto err;
+ }
+ }
+
+ obj->uverbs_file = file;
+ obj->comp_events_reported = 0;
+ obj->async_events_reported = 0;
+ INIT_LIST_HEAD(&obj->comp_list);
+ INIT_LIST_HEAD(&obj->async_list);
+
+ cq = file->device->ib_dev->create_cq(file->device->ib_dev, cmd.cqe,
+ cmd.comp_vector,
+ file->ucontext, &udata);
+ if (IS_ERR(cq)) {
+ ret = PTR_ERR(cq);
+ goto err_file;
+ }
+
+ cq->device = file->device->ib_dev;
+ cq->uobject = &obj->uobject;
+ cq->comp_handler = ib_uverbs_comp_handler;
+ cq->event_handler = ib_uverbs_cq_event_handler;
+ cq->cq_context = ev_file;
+ atomic_set(&cq->usecnt, 0);
+
+ obj->uobject.object = cq;
+ ret = idr_add_uobj(&ib_uverbs_cq_idr, &obj->uobject);
+ if (ret)
+ goto err_free;
+
+ memset(&resp, 0, sizeof resp);
+ resp.cq_handle = obj->uobject.id;
+ resp.cqe = cq->cqe;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&obj->uobject.list, &file->ucontext->cq_list);
+ mutex_unlock(&file->mutex);
+
+ obj->uobject.live = 1;
+
+ up_write(&obj->uobject.mutex);
+
+ return in_len;
+
+err_copy:
+ idr_remove_uobj(&ib_uverbs_cq_idr, &obj->uobject);
+
+err_free:
+ ib_destroy_cq(cq);
+
+err_file:
+ if (ev_file)
+ ib_uverbs_release_ucq(file, ev_file, obj);
+
+err:
+ put_uobj_write(&obj->uobject);
+ return ret;
+}
+
+ssize_t ib_uverbs_resize_cq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_resize_cq cmd;
+ struct ib_uverbs_resize_cq_resp resp;
+ struct ib_udata udata;
+ struct ib_cq *cq;
+ int ret = -EINVAL;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+ if (!cq)
+ return -EINVAL;
+
+ ret = cq->device->resize_cq(cq, cmd.cqe, &udata);
+ if (ret)
+ goto out;
+
+ resp.cqe = cq->cqe;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp.cqe))
+ ret = -EFAULT;
+
+out:
+ put_cq_read(cq);
+
+ return ret ? ret : in_len;
+}
+
+static int copy_wc_to_user(void __user *dest, struct ib_wc *wc)
+{
+ struct ib_uverbs_wc tmp;
+
+ tmp.wr_id = wc->wr_id;
+ tmp.status = wc->status;
+ tmp.opcode = wc->opcode;
+ tmp.vendor_err = wc->vendor_err;
+ tmp.byte_len = wc->byte_len;
+ tmp.ex.imm_data = (__u32 __force) wc->ex.imm_data;
+ tmp.qp_num = wc->qp->qp_num;
+ tmp.src_qp = wc->src_qp;
+ tmp.wc_flags = wc->wc_flags;
+ tmp.pkey_index = wc->pkey_index;
+ tmp.slid = wc->slid;
+ tmp.sl = wc->sl;
+ tmp.dlid_path_bits = wc->dlid_path_bits;
+ tmp.port_num = wc->port_num;
+ tmp.reserved = 0;
+
+ if (copy_to_user(dest, &tmp, sizeof tmp))
+ return -EFAULT;
+
+ return 0;
+}
+
+ssize_t ib_uverbs_poll_cq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_poll_cq cmd;
+ struct ib_uverbs_poll_cq_resp resp;
+ u8 __user *header_ptr;
+ u8 __user *data_ptr;
+ struct ib_cq *cq;
+ struct ib_wc wc;
+ int ret;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+ if (!cq)
+ return -EINVAL;
+
+ /* we copy a struct ib_uverbs_poll_cq_resp to user space */
+ header_ptr = (void __user *)(unsigned long) cmd.response;
+ data_ptr = header_ptr + sizeof resp;
+
+ memset(&resp, 0, sizeof resp);
+ while (resp.count < cmd.ne) {
+ ret = ib_poll_cq(cq, 1, &wc);
+ if (ret < 0)
+ goto out_put;
+ if (!ret)
+ break;
+
+ ret = copy_wc_to_user(data_ptr, &wc);
+ if (ret)
+ goto out_put;
+
+ data_ptr += sizeof(struct ib_uverbs_wc);
+ ++resp.count;
+ }
+
+ if (copy_to_user(header_ptr, &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto out_put;
+ }
+
+ ret = in_len;
+
+out_put:
+ put_cq_read(cq);
+ return ret;
+}
+
+ssize_t ib_uverbs_req_notify_cq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_req_notify_cq cmd;
+ struct ib_cq *cq;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ cq = idr_read_cq(cmd.cq_handle, file->ucontext, 0);
+ if (!cq)
+ return -EINVAL;
+
+ ib_req_notify_cq(cq, cmd.solicited_only ?
+ IB_CQ_SOLICITED : IB_CQ_NEXT_COMP);
+
+ put_cq_read(cq);
+
+ return in_len;
+}
+
+ssize_t ib_uverbs_destroy_cq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_destroy_cq cmd;
+ struct ib_uverbs_destroy_cq_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_cq *cq;
+ struct ib_ucq_object *obj;
+ struct ib_uverbs_event_file *ev_file;
+ int ret = -EINVAL;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ uobj = idr_write_uobj(&ib_uverbs_cq_idr, cmd.cq_handle, file->ucontext);
+ if (!uobj)
+ return -EINVAL;
+ cq = uobj->object;
+ ev_file = cq->cq_context;
+ obj = container_of(cq->uobject, struct ib_ucq_object, uobject);
+
+ ret = ib_destroy_cq(cq);
+ if (!ret)
+ uobj->live = 0;
+
+ put_uobj_write(uobj);
+
+ if (ret)
+ return ret;
+
+ idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
+
+ mutex_lock(&file->mutex);
+ list_del(&uobj->list);
+ mutex_unlock(&file->mutex);
+
+ ib_uverbs_release_ucq(file, ev_file, obj);
+
+ memset(&resp, 0, sizeof resp);
+ resp.comp_events_reported = obj->comp_events_reported;
+ resp.async_events_reported = obj->async_events_reported;
+
+ put_uobj(uobj);
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
+ return -EFAULT;
+
+ return in_len;
+}
+
+ssize_t ib_uverbs_create_qp(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_create_qp cmd;
+ struct ib_uverbs_create_qp_resp resp;
+ struct ib_udata udata;
+ struct ib_uqp_object *obj;
+ struct ib_device *device;
+ struct ib_pd *pd = NULL;
+ struct ib_xrcd *xrcd = NULL;
+ struct ib_uobject *uninitialized_var(xrcd_uobj);
+ struct ib_cq *scq = NULL, *rcq = NULL;
+ struct ib_srq *srq = NULL;
+ struct ib_qp *qp;
+ struct ib_qp_init_attr attr;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ if (cmd.qp_type == IB_QPT_RAW_PACKET && !capable(CAP_NET_RAW))
+ return -EPERM;
+
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ obj = kzalloc(sizeof *obj, GFP_KERNEL);
+ if (!obj)
+ return -ENOMEM;
+
+ init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_class);
+ down_write(&obj->uevent.uobject.mutex);
+
+ if (cmd.qp_type == IB_QPT_XRC_TGT) {
+ xrcd = idr_read_xrcd(cmd.pd_handle, file->ucontext, &xrcd_uobj);
+ if (!xrcd) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+ device = xrcd->device;
+ } else {
+ if (cmd.qp_type == IB_QPT_XRC_INI) {
+ cmd.max_recv_wr = cmd.max_recv_sge = 0;
+ } else {
+ if (cmd.is_srq) {
+ srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+ if (!srq || srq->srq_type != IB_SRQT_BASIC) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+ }
+
+ if (cmd.recv_cq_handle != cmd.send_cq_handle) {
+ rcq = idr_read_cq(cmd.recv_cq_handle, file->ucontext, 0);
+ if (!rcq) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+ }
+ }
+
+ scq = idr_read_cq(cmd.send_cq_handle, file->ucontext, !!rcq);
+ rcq = rcq ?: scq;
+ pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+ if (!pd || !scq) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+
+ device = pd->device;
+ }
+
+ attr.event_handler = ib_uverbs_qp_event_handler;
+ attr.qp_context = file;
+ attr.send_cq = scq;
+ attr.recv_cq = rcq;
+ attr.srq = srq;
+ attr.xrcd = xrcd;
+ attr.sq_sig_type = cmd.sq_sig_all ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
+ attr.qp_type = cmd.qp_type;
+ attr.create_flags = 0;
+
+ attr.cap.max_send_wr = cmd.max_send_wr;
+ attr.cap.max_recv_wr = cmd.max_recv_wr;
+ attr.cap.max_send_sge = cmd.max_send_sge;
+ attr.cap.max_recv_sge = cmd.max_recv_sge;
+ attr.cap.max_inline_data = cmd.max_inline_data;
+
+ obj->uevent.events_reported = 0;
+ INIT_LIST_HEAD(&obj->uevent.event_list);
+ INIT_LIST_HEAD(&obj->mcast_list);
+
+ if (cmd.qp_type == IB_QPT_XRC_TGT)
+ qp = ib_create_qp(pd, &attr);
+ else
+ qp = device->create_qp(pd, &attr, &udata);
+
+ if (IS_ERR(qp)) {
+ ret = PTR_ERR(qp);
+ goto err_put;
+ }
+
+ if (cmd.qp_type != IB_QPT_XRC_TGT) {
+ qp->real_qp = qp;
+ qp->device = device;
+ qp->pd = pd;
+ qp->send_cq = attr.send_cq;
+ qp->recv_cq = attr.recv_cq;
+ qp->srq = attr.srq;
+ qp->event_handler = attr.event_handler;
+ qp->qp_context = attr.qp_context;
+ qp->qp_type = attr.qp_type;
+ atomic_set(&qp->usecnt, 0);
+ atomic_inc(&pd->usecnt);
+ atomic_inc(&attr.send_cq->usecnt);
+ if (attr.recv_cq)
+ atomic_inc(&attr.recv_cq->usecnt);
+ if (attr.srq)
+ atomic_inc(&attr.srq->usecnt);
+ }
+ qp->uobject = &obj->uevent.uobject;
+
+ obj->uevent.uobject.object = qp;
+ ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
+ if (ret)
+ goto err_destroy;
+
+ memset(&resp, 0, sizeof resp);
+ resp.qpn = qp->qp_num;
+ resp.qp_handle = obj->uevent.uobject.id;
+ resp.max_recv_sge = attr.cap.max_recv_sge;
+ resp.max_send_sge = attr.cap.max_send_sge;
+ resp.max_recv_wr = attr.cap.max_recv_wr;
+ resp.max_send_wr = attr.cap.max_send_wr;
+ resp.max_inline_data = attr.cap.max_inline_data;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ if (xrcd) {
+ obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object,
+ uobject);
+ atomic_inc(&obj->uxrcd->refcnt);
+ put_xrcd_read(xrcd_uobj);
+ }
+
+ if (pd)
+ put_pd_read(pd);
+ if (scq)
+ put_cq_read(scq);
+ if (rcq && rcq != scq)
+ put_cq_read(rcq);
+ if (srq)
+ put_srq_read(srq);
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list);
+ mutex_unlock(&file->mutex);
+
+ obj->uevent.uobject.live = 1;
+
+ up_write(&obj->uevent.uobject.mutex);
+
+ return in_len;
+
+err_copy:
+ idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
+
+err_destroy:
+ ib_destroy_qp(qp);
+
+err_put:
+ if (xrcd)
+ put_xrcd_read(xrcd_uobj);
+ if (pd)
+ put_pd_read(pd);
+ if (scq)
+ put_cq_read(scq);
+ if (rcq && rcq != scq)
+ put_cq_read(rcq);
+ if (srq)
+ put_srq_read(srq);
+
+ put_uobj_write(&obj->uevent.uobject);
+ return ret;
+}
+
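+/*
+ * OPEN_QP: attach to an existing shareable XRC target QP, identified by
+ * QP number within the given XRCD, via ib_open_qp().  A new uobject is
+ * created but the underlying real QP is reused rather than reallocated.
+ */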
+ssize_t ib_uverbs_open_qp(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len, int out_len)
+{
+ struct ib_uverbs_open_qp cmd;
+ struct ib_uverbs_create_qp_resp resp;
+ struct ib_udata udata;
+ struct ib_uqp_object *obj;
+ struct ib_xrcd *xrcd;
+ struct ib_uobject *uninitialized_var(xrcd_uobj);
+ struct ib_qp *qp;
+ struct ib_qp_open_attr attr;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ obj = kmalloc(sizeof *obj, GFP_KERNEL);
+ if (!obj)
+ return -ENOMEM;
+
+ init_uobj(&obj->uevent.uobject, cmd.user_handle, file->ucontext, &qp_lock_class);
+ down_write(&obj->uevent.uobject.mutex);
+
+ xrcd = idr_read_xrcd(cmd.pd_handle, file->ucontext, &xrcd_uobj);
+ if (!xrcd) {
+ ret = -EINVAL;
+ goto err_put;
+ }
+
+ attr.event_handler = ib_uverbs_qp_event_handler;
+ attr.qp_context = file;
+ attr.qp_num = cmd.qpn;
+ attr.qp_type = cmd.qp_type;
+
+ obj->uevent.events_reported = 0;
+ INIT_LIST_HEAD(&obj->uevent.event_list);
+ INIT_LIST_HEAD(&obj->mcast_list);
+
+ qp = ib_open_qp(xrcd, &attr);
+ if (IS_ERR(qp)) {
+ ret = PTR_ERR(qp);
+ goto err_put;
+ }
+
+ qp->uobject = &obj->uevent.uobject;
+
+ obj->uevent.uobject.object = qp;
+ ret = idr_add_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
+ if (ret)
+ goto err_destroy;
+
+ memset(&resp, 0, sizeof resp);
+ resp.qpn = qp->qp_num;
+ resp.qp_handle = obj->uevent.uobject.id;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_remove;
+ }
+
+ obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject);
+ atomic_inc(&obj->uxrcd->refcnt);
+ put_xrcd_read(xrcd_uobj);
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&obj->uevent.uobject.list, &file->ucontext->qp_list);
+ mutex_unlock(&file->mutex);
+
+ obj->uevent.uobject.live = 1;
+
+ up_write(&obj->uevent.uobject.mutex);
+
+ return in_len;
+
+err_remove:
+ idr_remove_uobj(&ib_uverbs_qp_idr, &obj->uevent.uobject);
+
+err_destroy:
+ ib_destroy_qp(qp);
+
+err_put:
+ put_xrcd_read(xrcd_uobj);
+ put_uobj_write(&obj->uevent.uobject);
+ return ret;
+}
+
+ssize_t ib_uverbs_query_qp(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_query_qp cmd;
+ struct ib_uverbs_query_qp_resp resp;
+ struct ib_qp *qp;
+ struct ib_qp_attr *attr;
+ struct ib_qp_init_attr *init_attr;
+ int ret;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ attr = kmalloc(sizeof *attr, GFP_KERNEL);
+ init_attr = kmalloc(sizeof *init_attr, GFP_KERNEL);
+ if (!attr || !init_attr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+ if (!qp) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = ib_query_qp(qp, attr, cmd.attr_mask, init_attr);
+
+ put_qp_read(qp);
+
+ if (ret)
+ goto out;
+
+ memset(&resp, 0, sizeof resp);
+
+ resp.qp_state = attr->qp_state;
+ resp.cur_qp_state = attr->cur_qp_state;
+ resp.path_mtu = attr->path_mtu;
+ resp.path_mig_state = attr->path_mig_state;
+ resp.qkey = attr->qkey;
+ resp.rq_psn = attr->rq_psn;
+ resp.sq_psn = attr->sq_psn;
+ resp.dest_qp_num = attr->dest_qp_num;
+ resp.qp_access_flags = attr->qp_access_flags;
+ resp.pkey_index = attr->pkey_index;
+ resp.alt_pkey_index = attr->alt_pkey_index;
+ resp.sq_draining = attr->sq_draining;
+ resp.max_rd_atomic = attr->max_rd_atomic;
+ resp.max_dest_rd_atomic = attr->max_dest_rd_atomic;
+ resp.min_rnr_timer = attr->min_rnr_timer;
+ resp.port_num = attr->port_num;
+ resp.timeout = attr->timeout;
+ resp.retry_cnt = attr->retry_cnt;
+ resp.rnr_retry = attr->rnr_retry;
+ resp.alt_port_num = attr->alt_port_num;
+ resp.alt_timeout = attr->alt_timeout;
+
+ memcpy(resp.dest.dgid, attr->ah_attr.grh.dgid.raw, 16);
+ resp.dest.flow_label = attr->ah_attr.grh.flow_label;
+ resp.dest.sgid_index = attr->ah_attr.grh.sgid_index;
+ resp.dest.hop_limit = attr->ah_attr.grh.hop_limit;
+ resp.dest.traffic_class = attr->ah_attr.grh.traffic_class;
+ resp.dest.dlid = attr->ah_attr.dlid;
+ resp.dest.sl = attr->ah_attr.sl;
+ resp.dest.src_path_bits = attr->ah_attr.src_path_bits;
+ resp.dest.static_rate = attr->ah_attr.static_rate;
+ resp.dest.is_global = !!(attr->ah_attr.ah_flags & IB_AH_GRH);
+ resp.dest.port_num = attr->ah_attr.port_num;
+
+ memcpy(resp.alt_dest.dgid, attr->alt_ah_attr.grh.dgid.raw, 16);
+ resp.alt_dest.flow_label = attr->alt_ah_attr.grh.flow_label;
+ resp.alt_dest.sgid_index = attr->alt_ah_attr.grh.sgid_index;
+ resp.alt_dest.hop_limit = attr->alt_ah_attr.grh.hop_limit;
+ resp.alt_dest.traffic_class = attr->alt_ah_attr.grh.traffic_class;
+ resp.alt_dest.dlid = attr->alt_ah_attr.dlid;
+ resp.alt_dest.sl = attr->alt_ah_attr.sl;
+ resp.alt_dest.src_path_bits = attr->alt_ah_attr.src_path_bits;
+ resp.alt_dest.static_rate = attr->alt_ah_attr.static_rate;
+ resp.alt_dest.is_global = !!(attr->alt_ah_attr.ah_flags & IB_AH_GRH);
+ resp.alt_dest.port_num = attr->alt_ah_attr.port_num;
+
+ resp.max_send_wr = init_attr->cap.max_send_wr;
+ resp.max_recv_wr = init_attr->cap.max_recv_wr;
+ resp.max_send_sge = init_attr->cap.max_send_sge;
+ resp.max_recv_sge = init_attr->cap.max_recv_sge;
+ resp.max_inline_data = init_attr->cap.max_inline_data;
+ resp.sq_sig_all = init_attr->sq_sig_type == IB_SIGNAL_ALL_WR;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
+ ret = -EFAULT;
+
+out:
+ kfree(attr);
+ kfree(init_attr);
+
+ return ret ? ret : in_len;
+}
+
+/* Remove attribute-mask bits that are ignored for the given QP type */
+static int modify_qp_mask(enum ib_qp_type qp_type, int mask)
+{
+ switch (qp_type) {
+ case IB_QPT_XRC_INI:
+ return mask & ~(IB_QP_MAX_DEST_RD_ATOMIC | IB_QP_MIN_RNR_TIMER);
+ case IB_QPT_XRC_TGT:
+ return mask & ~(IB_QP_MAX_QP_RD_ATOMIC | IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY);
+ default:
+ return mask;
+ }
+}
+
+ssize_t ib_uverbs_modify_qp(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_modify_qp cmd;
+ struct ib_udata udata;
+ struct ib_qp *qp;
+ struct ib_qp_attr *attr;
+ int ret;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd,
+ out_len);
+
+ attr = kmalloc(sizeof *attr, GFP_KERNEL);
+ if (!attr)
+ return -ENOMEM;
+
+ qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+ if (!qp) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ attr->qp_state = cmd.qp_state;
+ attr->cur_qp_state = cmd.cur_qp_state;
+ attr->path_mtu = cmd.path_mtu;
+ attr->path_mig_state = cmd.path_mig_state;
+ attr->qkey = cmd.qkey;
+ attr->rq_psn = cmd.rq_psn;
+ attr->sq_psn = cmd.sq_psn;
+ attr->dest_qp_num = cmd.dest_qp_num;
+ attr->qp_access_flags = cmd.qp_access_flags;
+ attr->pkey_index = cmd.pkey_index;
+ attr->alt_pkey_index = cmd.alt_pkey_index;
+ attr->en_sqd_async_notify = cmd.en_sqd_async_notify;
+ attr->max_rd_atomic = cmd.max_rd_atomic;
+ attr->max_dest_rd_atomic = cmd.max_dest_rd_atomic;
+ attr->min_rnr_timer = cmd.min_rnr_timer;
+ attr->port_num = cmd.port_num;
+ attr->timeout = cmd.timeout;
+ attr->retry_cnt = cmd.retry_cnt;
+ attr->rnr_retry = cmd.rnr_retry;
+ attr->alt_port_num = cmd.alt_port_num;
+ attr->alt_timeout = cmd.alt_timeout;
+
+ memcpy(attr->ah_attr.grh.dgid.raw, cmd.dest.dgid, 16);
+ attr->ah_attr.grh.flow_label = cmd.dest.flow_label;
+ attr->ah_attr.grh.sgid_index = cmd.dest.sgid_index;
+ attr->ah_attr.grh.hop_limit = cmd.dest.hop_limit;
+ attr->ah_attr.grh.traffic_class = cmd.dest.traffic_class;
+ attr->ah_attr.dlid = cmd.dest.dlid;
+ attr->ah_attr.sl = cmd.dest.sl;
+ attr->ah_attr.src_path_bits = cmd.dest.src_path_bits;
+ attr->ah_attr.static_rate = cmd.dest.static_rate;
+ attr->ah_attr.ah_flags = cmd.dest.is_global ? IB_AH_GRH : 0;
+ attr->ah_attr.port_num = cmd.dest.port_num;
+
+ memcpy(attr->alt_ah_attr.grh.dgid.raw, cmd.alt_dest.dgid, 16);
+ attr->alt_ah_attr.grh.flow_label = cmd.alt_dest.flow_label;
+ attr->alt_ah_attr.grh.sgid_index = cmd.alt_dest.sgid_index;
+ attr->alt_ah_attr.grh.hop_limit = cmd.alt_dest.hop_limit;
+ attr->alt_ah_attr.grh.traffic_class = cmd.alt_dest.traffic_class;
+ attr->alt_ah_attr.dlid = cmd.alt_dest.dlid;
+ attr->alt_ah_attr.sl = cmd.alt_dest.sl;
+ attr->alt_ah_attr.src_path_bits = cmd.alt_dest.src_path_bits;
+ attr->alt_ah_attr.static_rate = cmd.alt_dest.static_rate;
+ attr->alt_ah_attr.ah_flags = cmd.alt_dest.is_global ? IB_AH_GRH : 0;
+ attr->alt_ah_attr.port_num = cmd.alt_dest.port_num;
+
+ if (qp->real_qp == qp) {
+ ret = ib_resolve_eth_l2_attrs(qp, attr, &cmd.attr_mask);
+ if (ret)
+ goto release_qp;
+ ret = qp->device->modify_qp(qp, attr,
+ modify_qp_mask(qp->qp_type, cmd.attr_mask), &udata);
+ } else {
+ ret = ib_modify_qp(qp, attr, modify_qp_mask(qp->qp_type, cmd.attr_mask));
+ }
+
+ if (ret)
+ goto release_qp;
+
+ ret = in_len;
+
+release_qp:
+ put_qp_read(qp);
+
+out:
+ kfree(attr);
+
+ return ret;
+}
+
+ssize_t ib_uverbs_destroy_qp(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_destroy_qp cmd;
+ struct ib_uverbs_destroy_qp_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_qp *qp;
+ struct ib_uqp_object *obj;
+ int ret = -EINVAL;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ memset(&resp, 0, sizeof resp);
+
+ uobj = idr_write_uobj(&ib_uverbs_qp_idr, cmd.qp_handle, file->ucontext);
+ if (!uobj)
+ return -EINVAL;
+ qp = uobj->object;
+ obj = container_of(uobj, struct ib_uqp_object, uevent.uobject);
+
+ if (!list_empty(&obj->mcast_list)) {
+ put_uobj_write(uobj);
+ return -EBUSY;
+ }
+
+ ret = ib_destroy_qp(qp);
+ if (!ret)
+ uobj->live = 0;
+
+ put_uobj_write(uobj);
+
+ if (ret)
+ return ret;
+
+ if (obj->uxrcd)
+ atomic_dec(&obj->uxrcd->refcnt);
+
+ idr_remove_uobj(&ib_uverbs_qp_idr, uobj);
+
+ mutex_lock(&file->mutex);
+ list_del(&uobj->list);
+ mutex_unlock(&file->mutex);
+
+ ib_uverbs_release_uevent(file, &obj->uevent);
+
+ resp.events_reported = obj->uevent.events_reported;
+
+ put_uobj(uobj);
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
+ return -EFAULT;
+
+ return in_len;
+}
+
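+/*
+ * POST_SEND: copy wr_count work requests (each cmd.wqe_size bytes,
+ * followed by the packed SGE array) from user space, rebuild them as a
+ * kernel ib_send_wr chain and post them; on failure resp.bad_wr reports
+ * the 1-based index of the work request that failed.
+ */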
+ssize_t ib_uverbs_post_send(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_post_send cmd;
+ struct ib_uverbs_post_send_resp resp;
+ struct ib_uverbs_send_wr *user_wr;
+ struct ib_send_wr *wr = NULL, *last, *next, *bad_wr;
+ struct ib_qp *qp;
+ int i, sg_ind;
+ int is_ud;
+ ssize_t ret = -EINVAL;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ if (in_len < sizeof cmd + cmd.wqe_size * cmd.wr_count +
+ cmd.sge_count * sizeof (struct ib_uverbs_sge))
+ return -EINVAL;
+
+ if (cmd.wqe_size < sizeof (struct ib_uverbs_send_wr))
+ return -EINVAL;
+
+ user_wr = kmalloc(cmd.wqe_size, GFP_KERNEL);
+ if (!user_wr)
+ return -ENOMEM;
+
+ qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+ if (!qp)
+ goto out;
+
+ is_ud = qp->qp_type == IB_QPT_UD;
+ sg_ind = 0;
+ last = NULL;
+ for (i = 0; i < cmd.wr_count; ++i) {
+ if (copy_from_user(user_wr,
+ buf + sizeof cmd + i * cmd.wqe_size,
+ cmd.wqe_size)) {
+ ret = -EFAULT;
+ goto out_put;
+ }
+
+ if (user_wr->num_sge + sg_ind > cmd.sge_count) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+
+ next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) +
+ user_wr->num_sge * sizeof (struct ib_sge),
+ GFP_KERNEL);
+ if (!next) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+
+ if (!last)
+ wr = next;
+ else
+ last->next = next;
+ last = next;
+
+ next->next = NULL;
+ next->wr_id = user_wr->wr_id;
+ next->num_sge = user_wr->num_sge;
+ next->opcode = user_wr->opcode;
+ next->send_flags = user_wr->send_flags;
+
+ if (is_ud) {
+ next->wr.ud.ah = idr_read_ah(user_wr->wr.ud.ah,
+ file->ucontext);
+ if (!next->wr.ud.ah) {
+ ret = -EINVAL;
+ goto out_put;
+ }
+ next->wr.ud.remote_qpn = user_wr->wr.ud.remote_qpn;
+ next->wr.ud.remote_qkey = user_wr->wr.ud.remote_qkey;
+ if (next->opcode == IB_WR_SEND_WITH_IMM)
+ next->ex.imm_data =
+ (__be32 __force) user_wr->ex.imm_data;
+ } else {
+ switch (next->opcode) {
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ next->ex.imm_data =
+ (__be32 __force) user_wr->ex.imm_data;
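+				/* fall through: WRITE_WITH_IMM also needs the rdma fields */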
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_READ:
+ next->wr.rdma.remote_addr =
+ user_wr->wr.rdma.remote_addr;
+ next->wr.rdma.rkey =
+ user_wr->wr.rdma.rkey;
+ break;
+ case IB_WR_SEND_WITH_IMM:
+ next->ex.imm_data =
+ (__be32 __force) user_wr->ex.imm_data;
+ break;
+ case IB_WR_SEND_WITH_INV:
+ next->ex.invalidate_rkey =
+ user_wr->ex.invalidate_rkey;
+ break;
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ next->wr.atomic.remote_addr =
+ user_wr->wr.atomic.remote_addr;
+ next->wr.atomic.compare_add =
+ user_wr->wr.atomic.compare_add;
+ next->wr.atomic.swap = user_wr->wr.atomic.swap;
+ next->wr.atomic.rkey = user_wr->wr.atomic.rkey;
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (next->num_sge) {
+ next->sg_list = (void *) next +
+ ALIGN(sizeof *next, sizeof (struct ib_sge));
+ if (copy_from_user(next->sg_list,
+ buf + sizeof cmd +
+ cmd.wr_count * cmd.wqe_size +
+ sg_ind * sizeof (struct ib_sge),
+ next->num_sge * sizeof (struct ib_sge))) {
+ ret = -EFAULT;
+ goto out_put;
+ }
+ sg_ind += next->num_sge;
+ } else
+ next->sg_list = NULL;
+ }
+
+ resp.bad_wr = 0;
+ ret = qp->device->post_send(qp->real_qp, wr, &bad_wr);
+ if (ret)
+ for (next = wr; next; next = next->next) {
+ ++resp.bad_wr;
+ if (next == bad_wr)
+ break;
+ }
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
+ ret = -EFAULT;
+
+out_put:
+ put_qp_read(qp);
+
+ while (wr) {
+ if (is_ud && wr->wr.ud.ah)
+ put_ah_read(wr->wr.ud.ah);
+ next = wr->next;
+ kfree(wr);
+ wr = next;
+ }
+
+out:
+ kfree(user_wr);
+
+ return ret ? ret : in_len;
+}
+
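+/*
+ * Copy a user-space array of receive work requests (wr_count entries of
+ * wqe_size bytes, followed by sge_count SGEs) into a kernel ib_recv_wr
+ * chain; returns an ERR_PTR on failure.
+ */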
+static struct ib_recv_wr *ib_uverbs_unmarshall_recv(const char __user *buf,
+ int in_len,
+ u32 wr_count,
+ u32 sge_count,
+ u32 wqe_size)
+{
+ struct ib_uverbs_recv_wr *user_wr;
+ struct ib_recv_wr *wr = NULL, *last, *next;
+ int sg_ind;
+ int i;
+ int ret;
+
+ if (in_len < wqe_size * wr_count +
+ sge_count * sizeof (struct ib_uverbs_sge))
+ return ERR_PTR(-EINVAL);
+
+ if (wqe_size < sizeof (struct ib_uverbs_recv_wr))
+ return ERR_PTR(-EINVAL);
+
+ user_wr = kmalloc(wqe_size, GFP_KERNEL);
+ if (!user_wr)
+ return ERR_PTR(-ENOMEM);
+
+ sg_ind = 0;
+ last = NULL;
+ for (i = 0; i < wr_count; ++i) {
+ if (copy_from_user(user_wr, buf + i * wqe_size,
+ wqe_size)) {
+ ret = -EFAULT;
+ goto err;
+ }
+
+ if (user_wr->num_sge + sg_ind > sge_count) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ next = kmalloc(ALIGN(sizeof *next, sizeof (struct ib_sge)) +
+ user_wr->num_sge * sizeof (struct ib_sge),
+ GFP_KERNEL);
+ if (!next) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ if (!last)
+ wr = next;
+ else
+ last->next = next;
+ last = next;
+
+ next->next = NULL;
+ next->wr_id = user_wr->wr_id;
+ next->num_sge = user_wr->num_sge;
+
+ if (next->num_sge) {
+ next->sg_list = (void *) next +
+ ALIGN(sizeof *next, sizeof (struct ib_sge));
+ if (copy_from_user(next->sg_list,
+ buf + wr_count * wqe_size +
+ sg_ind * sizeof (struct ib_sge),
+ next->num_sge * sizeof (struct ib_sge))) {
+ ret = -EFAULT;
+ goto err;
+ }
+ sg_ind += next->num_sge;
+ } else
+ next->sg_list = NULL;
+ }
+
+ kfree(user_wr);
+ return wr;
+
+err:
+ kfree(user_wr);
+
+ while (wr) {
+ next = wr->next;
+ kfree(wr);
+ wr = next;
+ }
+
+ return ERR_PTR(ret);
+}
+
+ssize_t ib_uverbs_post_recv(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_post_recv cmd;
+ struct ib_uverbs_post_recv_resp resp;
+ struct ib_recv_wr *wr, *next, *bad_wr;
+ struct ib_qp *qp;
+ ssize_t ret = -EINVAL;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ wr = ib_uverbs_unmarshall_recv(buf + sizeof cmd,
+ in_len - sizeof cmd, cmd.wr_count,
+ cmd.sge_count, cmd.wqe_size);
+ if (IS_ERR(wr))
+ return PTR_ERR(wr);
+
+ qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+ if (!qp)
+ goto out;
+
+ resp.bad_wr = 0;
+ ret = qp->device->post_recv(qp->real_qp, wr, &bad_wr);
+
+ put_qp_read(qp);
+
+ if (ret)
+ for (next = wr; next; next = next->next) {
+ ++resp.bad_wr;
+ if (next == bad_wr)
+ break;
+ }
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
+ ret = -EFAULT;
+
+out:
+ while (wr) {
+ next = wr->next;
+ kfree(wr);
+ wr = next;
+ }
+
+ return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_post_srq_recv(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_post_srq_recv cmd;
+ struct ib_uverbs_post_srq_recv_resp resp;
+ struct ib_recv_wr *wr, *next, *bad_wr;
+ struct ib_srq *srq;
+ ssize_t ret = -EINVAL;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ wr = ib_uverbs_unmarshall_recv(buf + sizeof cmd,
+ in_len - sizeof cmd, cmd.wr_count,
+ cmd.sge_count, cmd.wqe_size);
+ if (IS_ERR(wr))
+ return PTR_ERR(wr);
+
+ srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+ if (!srq)
+ goto out;
+
+ resp.bad_wr = 0;
+ ret = srq->device->post_srq_recv(srq, wr, &bad_wr);
+
+ put_srq_read(srq);
+
+ if (ret)
+ for (next = wr; next; next = next->next) {
+ ++resp.bad_wr;
+ if (next == bad_wr)
+ break;
+ }
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
+ ret = -EFAULT;
+
+out:
+ while (wr) {
+ next = wr->next;
+ kfree(wr);
+ wr = next;
+ }
+
+ return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_create_ah(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_create_ah cmd;
+ struct ib_uverbs_create_ah_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_pd *pd;
+ struct ib_ah *ah;
+ struct ib_ah_attr attr;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ uobj = kmalloc(sizeof *uobj, GFP_KERNEL);
+ if (!uobj)
+ return -ENOMEM;
+
+ init_uobj(uobj, cmd.user_handle, file->ucontext, &ah_lock_class);
+ down_write(&uobj->mutex);
+
+ pd = idr_read_pd(cmd.pd_handle, file->ucontext);
+ if (!pd) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ attr.dlid = cmd.attr.dlid;
+ attr.sl = cmd.attr.sl;
+ attr.src_path_bits = cmd.attr.src_path_bits;
+ attr.static_rate = cmd.attr.static_rate;
+ attr.ah_flags = cmd.attr.is_global ? IB_AH_GRH : 0;
+ attr.port_num = cmd.attr.port_num;
+ attr.grh.flow_label = cmd.attr.grh.flow_label;
+ attr.grh.sgid_index = cmd.attr.grh.sgid_index;
+ attr.grh.hop_limit = cmd.attr.grh.hop_limit;
+ attr.grh.traffic_class = cmd.attr.grh.traffic_class;
+ attr.vlan_id = 0;
+ memset(&attr.dmac, 0, sizeof(attr.dmac));
+ memcpy(attr.grh.dgid.raw, cmd.attr.grh.dgid, 16);
+
+ ah = ib_create_ah(pd, &attr);
+ if (IS_ERR(ah)) {
+ ret = PTR_ERR(ah);
+ goto err_put;
+ }
+
+ ah->uobject = uobj;
+ uobj->object = ah;
+
+ ret = idr_add_uobj(&ib_uverbs_ah_idr, uobj);
+ if (ret)
+ goto err_destroy;
+
+ resp.ah_handle = uobj->id;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ put_pd_read(pd);
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&uobj->list, &file->ucontext->ah_list);
+ mutex_unlock(&file->mutex);
+
+ uobj->live = 1;
+
+ up_write(&uobj->mutex);
+
+ return in_len;
+
+err_copy:
+ idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
+
+err_destroy:
+ ib_destroy_ah(ah);
+
+err_put:
+ put_pd_read(pd);
+
+err:
+ put_uobj_write(uobj);
+ return ret;
+}
+
+ssize_t ib_uverbs_destroy_ah(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len, int out_len)
+{
+ struct ib_uverbs_destroy_ah cmd;
+ struct ib_ah *ah;
+ struct ib_uobject *uobj;
+ int ret;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ uobj = idr_write_uobj(&ib_uverbs_ah_idr, cmd.ah_handle, file->ucontext);
+ if (!uobj)
+ return -EINVAL;
+ ah = uobj->object;
+
+ ret = ib_destroy_ah(ah);
+ if (!ret)
+ uobj->live = 0;
+
+ put_uobj_write(uobj);
+
+ if (ret)
+ return ret;
+
+ idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
+
+ mutex_lock(&file->mutex);
+ list_del(&uobj->list);
+ mutex_unlock(&file->mutex);
+
+ put_uobj(uobj);
+
+ return in_len;
+}
+
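+/*
+ * ATTACH_MCAST: attach the QP to a multicast group and remember the
+ * (GID, MLID) pair on the QP's mcast_list so it can be detached on
+ * destroy/cleanup; attaching an already-attached group succeeds silently.
+ */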
+ssize_t ib_uverbs_attach_mcast(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_attach_mcast cmd;
+ struct ib_qp *qp;
+ struct ib_uqp_object *obj;
+ struct ib_uverbs_mcast_entry *mcast;
+ int ret;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ qp = idr_write_qp(cmd.qp_handle, file->ucontext);
+ if (!qp)
+ return -EINVAL;
+
+ obj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject);
+
+ list_for_each_entry(mcast, &obj->mcast_list, list)
+ if (cmd.mlid == mcast->lid &&
+ !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) {
+ ret = 0;
+ goto out_put;
+ }
+
+ mcast = kmalloc(sizeof *mcast, GFP_KERNEL);
+ if (!mcast) {
+ ret = -ENOMEM;
+ goto out_put;
+ }
+
+ mcast->lid = cmd.mlid;
+ memcpy(mcast->gid.raw, cmd.gid, sizeof mcast->gid.raw);
+
+ ret = ib_attach_mcast(qp, &mcast->gid, cmd.mlid);
+ if (!ret)
+ list_add_tail(&mcast->list, &obj->mcast_list);
+ else
+ kfree(mcast);
+
+out_put:
+ put_qp_write(qp);
+
+ return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_detach_mcast(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_detach_mcast cmd;
+ struct ib_uqp_object *obj;
+ struct ib_qp *qp;
+ struct ib_uverbs_mcast_entry *mcast;
+ int ret = -EINVAL;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ qp = idr_write_qp(cmd.qp_handle, file->ucontext);
+ if (!qp)
+ return -EINVAL;
+
+ ret = ib_detach_mcast(qp, (union ib_gid *) cmd.gid, cmd.mlid);
+ if (ret)
+ goto out_put;
+
+ obj = container_of(qp->uobject, struct ib_uqp_object, uevent.uobject);
+
+ list_for_each_entry(mcast, &obj->mcast_list, list)
+ if (cmd.mlid == mcast->lid &&
+ !memcmp(cmd.gid, mcast->gid.raw, sizeof mcast->gid.raw)) {
+ list_del(&mcast->list);
+ kfree(mcast);
+ break;
+ }
+
+out_put:
+ put_qp_write(qp);
+
+ return ret ? ret : in_len;
+}
+
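+/*
+ * Convert a user-space flow spec into the kernel union ib_flow_spec,
+ * checking that the user-provided size matches the expected size for
+ * the spec type.
+ */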
+static int kern_spec_to_ib_spec(struct ib_uverbs_flow_spec *kern_spec,
+ union ib_flow_spec *ib_spec)
+{
+ if (kern_spec->reserved)
+ return -EINVAL;
+
+ ib_spec->type = kern_spec->type;
+
+ switch (ib_spec->type) {
+ case IB_FLOW_SPEC_ETH:
+ ib_spec->eth.size = sizeof(struct ib_flow_spec_eth);
+ if (ib_spec->eth.size != kern_spec->eth.size)
+ return -EINVAL;
+ memcpy(&ib_spec->eth.val, &kern_spec->eth.val,
+ sizeof(struct ib_flow_eth_filter));
+ memcpy(&ib_spec->eth.mask, &kern_spec->eth.mask,
+ sizeof(struct ib_flow_eth_filter));
+ break;
+ case IB_FLOW_SPEC_IPV4:
+ ib_spec->ipv4.size = sizeof(struct ib_flow_spec_ipv4);
+ if (ib_spec->ipv4.size != kern_spec->ipv4.size)
+ return -EINVAL;
+ memcpy(&ib_spec->ipv4.val, &kern_spec->ipv4.val,
+ sizeof(struct ib_flow_ipv4_filter));
+ memcpy(&ib_spec->ipv4.mask, &kern_spec->ipv4.mask,
+ sizeof(struct ib_flow_ipv4_filter));
+ break;
+ case IB_FLOW_SPEC_TCP:
+ case IB_FLOW_SPEC_UDP:
+ ib_spec->tcp_udp.size = sizeof(struct ib_flow_spec_tcp_udp);
+ if (ib_spec->tcp_udp.size != kern_spec->tcp_udp.size)
+ return -EINVAL;
+ memcpy(&ib_spec->tcp_udp.val, &kern_spec->tcp_udp.val,
+ sizeof(struct ib_flow_tcp_udp_filter));
+ memcpy(&ib_spec->tcp_udp.mask, &kern_spec->tcp_udp.mask,
+ sizeof(struct ib_flow_tcp_udp_filter));
+ break;
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
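+/*
+ * EX_CREATE_FLOW: validate privileges and sizes, copy the flow attribute
+ * and its specs from user space, convert each spec with
+ * kern_spec_to_ib_spec() and create the steering rule via ib_create_flow().
+ */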
+int ib_uverbs_ex_create_flow(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_create_flow cmd;
+ struct ib_uverbs_create_flow_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_flow *flow_id;
+ struct ib_uverbs_flow_attr *kern_flow_attr;
+ struct ib_flow_attr *flow_attr;
+ struct ib_qp *qp;
+ int err = 0;
+ void *kern_spec;
+ void *ib_spec;
+ int i;
+
+ if (ucore->inlen < sizeof(cmd))
+ return -EINVAL;
+
+ if (ucore->outlen < sizeof(resp))
+ return -ENOSPC;
+
+ err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+ if (err)
+ return err;
+
+ ucore->inbuf += sizeof(cmd);
+ ucore->inlen -= sizeof(cmd);
+
+ if (cmd.comp_mask)
+ return -EINVAL;
+
+ if ((cmd.flow_attr.type == IB_FLOW_ATTR_SNIFFER &&
+ !capable(CAP_NET_ADMIN)) || !capable(CAP_NET_RAW))
+ return -EPERM;
+
+ if (cmd.flow_attr.num_of_specs > IB_FLOW_SPEC_SUPPORT_LAYERS)
+ return -EINVAL;
+
+ if (cmd.flow_attr.size > ucore->inlen ||
+ cmd.flow_attr.size >
+ (cmd.flow_attr.num_of_specs * sizeof(struct ib_uverbs_flow_spec)))
+ return -EINVAL;
+
+ if (cmd.flow_attr.reserved[0] ||
+ cmd.flow_attr.reserved[1])
+ return -EINVAL;
+
+ if (cmd.flow_attr.num_of_specs) {
+ kern_flow_attr = kmalloc(sizeof(*kern_flow_attr) + cmd.flow_attr.size,
+ GFP_KERNEL);
+ if (!kern_flow_attr)
+ return -ENOMEM;
+
+ memcpy(kern_flow_attr, &cmd.flow_attr, sizeof(*kern_flow_attr));
+ err = ib_copy_from_udata(kern_flow_attr + 1, ucore,
+ cmd.flow_attr.size);
+ if (err)
+ goto err_free_attr;
+ } else {
+ kern_flow_attr = &cmd.flow_attr;
+ }
+
+ uobj = kmalloc(sizeof(*uobj), GFP_KERNEL);
+ if (!uobj) {
+ err = -ENOMEM;
+ goto err_free_attr;
+ }
+ init_uobj(uobj, 0, file->ucontext, &rule_lock_class);
+ down_write(&uobj->mutex);
+
+ qp = idr_read_qp(cmd.qp_handle, file->ucontext);
+ if (!qp) {
+ err = -EINVAL;
+ goto err_uobj;
+ }
+
+ flow_attr = kmalloc(sizeof(*flow_attr) + cmd.flow_attr.size, GFP_KERNEL);
+ if (!flow_attr) {
+ err = -ENOMEM;
+ goto err_put;
+ }
+
+ flow_attr->type = kern_flow_attr->type;
+ flow_attr->priority = kern_flow_attr->priority;
+ flow_attr->num_of_specs = kern_flow_attr->num_of_specs;
+ flow_attr->port = kern_flow_attr->port;
+ flow_attr->flags = kern_flow_attr->flags;
+ flow_attr->size = sizeof(*flow_attr);
+
+ kern_spec = kern_flow_attr + 1;
+ ib_spec = flow_attr + 1;
+ for (i = 0; i < flow_attr->num_of_specs &&
+ cmd.flow_attr.size > offsetof(struct ib_uverbs_flow_spec, reserved) &&
+ cmd.flow_attr.size >=
+ ((struct ib_uverbs_flow_spec *)kern_spec)->size; i++) {
+ err = kern_spec_to_ib_spec(kern_spec, ib_spec);
+ if (err)
+ goto err_free;
+ flow_attr->size +=
+ ((union ib_flow_spec *) ib_spec)->size;
+ cmd.flow_attr.size -= ((struct ib_uverbs_flow_spec *)kern_spec)->size;
+ kern_spec += ((struct ib_uverbs_flow_spec *) kern_spec)->size;
+ ib_spec += ((union ib_flow_spec *) ib_spec)->size;
+ }
+ if (cmd.flow_attr.size || (i != flow_attr->num_of_specs)) {
+		pr_warn("create flow failed, flow %d: %d bytes left from uverbs cmd\n",
+ i, cmd.flow_attr.size);
+ err = -EINVAL;
+ goto err_free;
+ }
+ flow_id = ib_create_flow(qp, flow_attr, IB_FLOW_DOMAIN_USER);
+ if (IS_ERR(flow_id)) {
+ err = PTR_ERR(flow_id);
+ goto err_free;
+ }
+ flow_id->qp = qp;
+ flow_id->uobject = uobj;
+ uobj->object = flow_id;
+
+ err = idr_add_uobj(&ib_uverbs_rule_idr, uobj);
+ if (err)
+ goto destroy_flow;
+
+ memset(&resp, 0, sizeof(resp));
+ resp.flow_handle = uobj->id;
+
+ err = ib_copy_to_udata(ucore,
+ &resp, sizeof(resp));
+ if (err)
+ goto err_copy;
+
+ put_qp_read(qp);
+ mutex_lock(&file->mutex);
+ list_add_tail(&uobj->list, &file->ucontext->rule_list);
+ mutex_unlock(&file->mutex);
+
+ uobj->live = 1;
+
+ up_write(&uobj->mutex);
+ kfree(flow_attr);
+ if (cmd.flow_attr.num_of_specs)
+ kfree(kern_flow_attr);
+ return 0;
+err_copy:
+ idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
+destroy_flow:
+ ib_destroy_flow(flow_id);
+err_free:
+ kfree(flow_attr);
+err_put:
+ put_qp_read(qp);
+err_uobj:
+ put_uobj_write(uobj);
+err_free_attr:
+ if (cmd.flow_attr.num_of_specs)
+ kfree(kern_flow_attr);
+ return err;
+}
+
+int ib_uverbs_ex_destroy_flow(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_destroy_flow cmd;
+ struct ib_flow *flow_id;
+ struct ib_uobject *uobj;
+ int ret;
+
+ if (ucore->inlen < sizeof(cmd))
+ return -EINVAL;
+
+ ret = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+ if (ret)
+ return ret;
+
+ if (cmd.comp_mask)
+ return -EINVAL;
+
+ uobj = idr_write_uobj(&ib_uverbs_rule_idr, cmd.flow_handle,
+ file->ucontext);
+ if (!uobj)
+ return -EINVAL;
+ flow_id = uobj->object;
+
+ ret = ib_destroy_flow(flow_id);
+ if (!ret)
+ uobj->live = 0;
+
+ put_uobj_write(uobj);
+
+ idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
+
+ mutex_lock(&file->mutex);
+ list_del(&uobj->list);
+ mutex_unlock(&file->mutex);
+
+ put_uobj(uobj);
+
+ return ret;
+}
+
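+/*
+ * Common helper for CREATE_SRQ and CREATE_XSRQ; for an XRC SRQ the XRCD
+ * and CQ handles are resolved as well before the provider's create_srq()
+ * is called.
+ */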
+static int __uverbs_create_xsrq(struct ib_uverbs_file *file,
+ struct ib_uverbs_create_xsrq *cmd,
+ struct ib_udata *udata)
+{
+ struct ib_uverbs_create_srq_resp resp;
+ struct ib_usrq_object *obj;
+ struct ib_pd *pd;
+ struct ib_srq *srq;
+ struct ib_uobject *uninitialized_var(xrcd_uobj);
+ struct ib_srq_init_attr attr;
+ int ret;
+
+ obj = kmalloc(sizeof *obj, GFP_KERNEL);
+ if (!obj)
+ return -ENOMEM;
+
+ init_uobj(&obj->uevent.uobject, cmd->user_handle, file->ucontext, &srq_lock_class);
+ down_write(&obj->uevent.uobject.mutex);
+
+ if (cmd->srq_type == IB_SRQT_XRC) {
+ attr.ext.xrc.xrcd = idr_read_xrcd(cmd->xrcd_handle, file->ucontext, &xrcd_uobj);
+ if (!attr.ext.xrc.xrcd) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ obj->uxrcd = container_of(xrcd_uobj, struct ib_uxrcd_object, uobject);
+ atomic_inc(&obj->uxrcd->refcnt);
+
+ attr.ext.xrc.cq = idr_read_cq(cmd->cq_handle, file->ucontext, 0);
+ if (!attr.ext.xrc.cq) {
+ ret = -EINVAL;
+ goto err_put_xrcd;
+ }
+ }
+
+ pd = idr_read_pd(cmd->pd_handle, file->ucontext);
+ if (!pd) {
+ ret = -EINVAL;
+ goto err_put_cq;
+ }
+
+ attr.event_handler = ib_uverbs_srq_event_handler;
+ attr.srq_context = file;
+ attr.srq_type = cmd->srq_type;
+ attr.attr.max_wr = cmd->max_wr;
+ attr.attr.max_sge = cmd->max_sge;
+ attr.attr.srq_limit = cmd->srq_limit;
+
+ obj->uevent.events_reported = 0;
+ INIT_LIST_HEAD(&obj->uevent.event_list);
+
+ srq = pd->device->create_srq(pd, &attr, udata);
+ if (IS_ERR(srq)) {
+ ret = PTR_ERR(srq);
+ goto err_put;
+ }
+
+ srq->device = pd->device;
+ srq->pd = pd;
+ srq->srq_type = cmd->srq_type;
+ srq->uobject = &obj->uevent.uobject;
+ srq->event_handler = attr.event_handler;
+ srq->srq_context = attr.srq_context;
+
+ if (cmd->srq_type == IB_SRQT_XRC) {
+ srq->ext.xrc.cq = attr.ext.xrc.cq;
+ srq->ext.xrc.xrcd = attr.ext.xrc.xrcd;
+ atomic_inc(&attr.ext.xrc.cq->usecnt);
+ atomic_inc(&attr.ext.xrc.xrcd->usecnt);
+ }
+
+ atomic_inc(&pd->usecnt);
+ atomic_set(&srq->usecnt, 0);
+
+ obj->uevent.uobject.object = srq;
+ ret = idr_add_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject);
+ if (ret)
+ goto err_destroy;
+
+ memset(&resp, 0, sizeof resp);
+ resp.srq_handle = obj->uevent.uobject.id;
+ resp.max_wr = attr.attr.max_wr;
+ resp.max_sge = attr.attr.max_sge;
+ if (cmd->srq_type == IB_SRQT_XRC)
+ resp.srqn = srq->ext.xrc.srq_num;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd->response,
+ &resp, sizeof resp)) {
+ ret = -EFAULT;
+ goto err_copy;
+ }
+
+ if (cmd->srq_type == IB_SRQT_XRC) {
+ put_uobj_read(xrcd_uobj);
+ put_cq_read(attr.ext.xrc.cq);
+ }
+ put_pd_read(pd);
+
+ mutex_lock(&file->mutex);
+ list_add_tail(&obj->uevent.uobject.list, &file->ucontext->srq_list);
+ mutex_unlock(&file->mutex);
+
+ obj->uevent.uobject.live = 1;
+
+ up_write(&obj->uevent.uobject.mutex);
+
+ return 0;
+
+err_copy:
+ idr_remove_uobj(&ib_uverbs_srq_idr, &obj->uevent.uobject);
+
+err_destroy:
+ ib_destroy_srq(srq);
+
+err_put:
+ put_pd_read(pd);
+
+err_put_cq:
+ if (cmd->srq_type == IB_SRQT_XRC)
+ put_cq_read(attr.ext.xrc.cq);
+
+err_put_xrcd:
+ if (cmd->srq_type == IB_SRQT_XRC) {
+ atomic_dec(&obj->uxrcd->refcnt);
+ put_uobj_read(xrcd_uobj);
+ }
+
+err:
+ put_uobj_write(&obj->uevent.uobject);
+ return ret;
+}
+
+ssize_t ib_uverbs_create_srq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_create_srq cmd;
+ struct ib_uverbs_create_xsrq xcmd;
+ struct ib_uverbs_create_srq_resp resp;
+ struct ib_udata udata;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ xcmd.response = cmd.response;
+ xcmd.user_handle = cmd.user_handle;
+ xcmd.srq_type = IB_SRQT_BASIC;
+ xcmd.pd_handle = cmd.pd_handle;
+ xcmd.max_wr = cmd.max_wr;
+ xcmd.max_sge = cmd.max_sge;
+ xcmd.srq_limit = cmd.srq_limit;
+
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ ret = __uverbs_create_xsrq(file, &xcmd, &udata);
+ if (ret)
+ return ret;
+
+ return in_len;
+}
+
+ssize_t ib_uverbs_create_xsrq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len, int out_len)
+{
+ struct ib_uverbs_create_xsrq cmd;
+ struct ib_uverbs_create_srq_resp resp;
+ struct ib_udata udata;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof cmd,
+ (unsigned long) cmd.response + sizeof resp,
+ in_len - sizeof cmd, out_len - sizeof resp);
+
+ ret = __uverbs_create_xsrq(file, &cmd, &udata);
+ if (ret)
+ return ret;
+
+ return in_len;
+}
+
+ssize_t ib_uverbs_modify_srq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_modify_srq cmd;
+ struct ib_udata udata;
+ struct ib_srq *srq;
+ struct ib_srq_attr attr;
+ int ret;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ INIT_UDATA(&udata, buf + sizeof cmd, NULL, in_len - sizeof cmd,
+ out_len);
+
+ srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+ if (!srq)
+ return -EINVAL;
+
+ attr.max_wr = cmd.max_wr;
+ attr.srq_limit = cmd.srq_limit;
+
+ ret = srq->device->modify_srq(srq, &attr, cmd.attr_mask, &udata);
+
+ put_srq_read(srq);
+
+ return ret ? ret : in_len;
+}
+
+ssize_t ib_uverbs_query_srq(struct ib_uverbs_file *file,
+ const char __user *buf,
+ int in_len, int out_len)
+{
+ struct ib_uverbs_query_srq cmd;
+ struct ib_uverbs_query_srq_resp resp;
+ struct ib_srq_attr attr;
+ struct ib_srq *srq;
+ int ret;
+
+ if (out_len < sizeof resp)
+ return -ENOSPC;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ srq = idr_read_srq(cmd.srq_handle, file->ucontext);
+ if (!srq)
+ return -EINVAL;
+
+ ret = ib_query_srq(srq, &attr);
+
+ put_srq_read(srq);
+
+ if (ret)
+ return ret;
+
+ memset(&resp, 0, sizeof resp);
+
+ resp.max_wr = attr.max_wr;
+ resp.max_sge = attr.max_sge;
+ resp.srq_limit = attr.srq_limit;
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
+ return -EFAULT;
+
+ return in_len;
+}
+
+ssize_t ib_uverbs_destroy_srq(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len)
+{
+ struct ib_uverbs_destroy_srq cmd;
+ struct ib_uverbs_destroy_srq_resp resp;
+ struct ib_uobject *uobj;
+ struct ib_srq *srq;
+ struct ib_uevent_object *obj;
+ int ret = -EINVAL;
+ struct ib_usrq_object *us;
+ enum ib_srq_type srq_type;
+
+ if (copy_from_user(&cmd, buf, sizeof cmd))
+ return -EFAULT;
+
+ uobj = idr_write_uobj(&ib_uverbs_srq_idr, cmd.srq_handle, file->ucontext);
+ if (!uobj)
+ return -EINVAL;
+ srq = uobj->object;
+ obj = container_of(uobj, struct ib_uevent_object, uobject);
+ srq_type = srq->srq_type;
+
+ ret = ib_destroy_srq(srq);
+ if (!ret)
+ uobj->live = 0;
+
+ put_uobj_write(uobj);
+
+ if (ret)
+ return ret;
+
+ if (srq_type == IB_SRQT_XRC) {
+ us = container_of(obj, struct ib_usrq_object, uevent);
+ atomic_dec(&us->uxrcd->refcnt);
+ }
+
+ idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
+
+ mutex_lock(&file->mutex);
+ list_del(&uobj->list);
+ mutex_unlock(&file->mutex);
+
+ ib_uverbs_release_uevent(file, obj);
+
+ memset(&resp, 0, sizeof resp);
+ resp.events_reported = obj->events_reported;
+
+ put_uobj(uobj);
+
+ if (copy_to_user((void __user *) (unsigned long) cmd.response,
+ &resp, sizeof resp))
+ ret = -EFAULT;
+
+ return ret ? ret : in_len;
+}
+
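+/*
+ * EX_QUERY_DEVICE: return the base device attributes and, when the
+ * user's response buffer is large enough, the ODP capabilities;
+ * resp.response_length reflects how much was actually filled in.
+ */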
+int ib_uverbs_ex_query_device(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw)
+{
+ struct ib_uverbs_ex_query_device_resp resp;
+ struct ib_uverbs_ex_query_device cmd;
+ struct ib_device_attr attr;
+ struct ib_device *device;
+ int err;
+
+ device = file->device->ib_dev;
+ if (ucore->inlen < sizeof(cmd))
+ return -EINVAL;
+
+ err = ib_copy_from_udata(&cmd, ucore, sizeof(cmd));
+ if (err)
+ return err;
+
+ if (cmd.comp_mask)
+ return -EINVAL;
+
+ if (cmd.reserved)
+ return -EINVAL;
+
+ resp.response_length = offsetof(typeof(resp), odp_caps);
+
+ if (ucore->outlen < resp.response_length)
+ return -ENOSPC;
+
+ err = device->query_device(device, &attr);
+ if (err)
+ return err;
+
+ copy_query_dev_fields(file, &resp.base, &attr);
+ resp.comp_mask = 0;
+
+ if (ucore->outlen < resp.response_length + sizeof(resp.odp_caps))
+ goto end;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+ resp.odp_caps.general_caps = attr.odp_caps.general_caps;
+ resp.odp_caps.per_transport_caps.rc_odp_caps =
+ attr.odp_caps.per_transport_caps.rc_odp_caps;
+ resp.odp_caps.per_transport_caps.uc_odp_caps =
+ attr.odp_caps.per_transport_caps.uc_odp_caps;
+ resp.odp_caps.per_transport_caps.ud_odp_caps =
+ attr.odp_caps.per_transport_caps.ud_odp_caps;
+ resp.odp_caps.reserved = 0;
+#else
+ memset(&resp.odp_caps, 0, sizeof(resp.odp_caps));
+#endif
+ resp.response_length += sizeof(resp.odp_caps);
+
+end:
+ err = ib_copy_to_udata(ucore, &resp, resp.response_length);
+ if (err)
+ return err;
+
+ return 0;
+}
diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c
new file mode 100644
index 000000000..88cce9bb7
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_main.c
@@ -0,0 +1,1039 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/fs.h>
+#include <linux/poll.h>
+#include <linux/sched.h>
+#include <linux/file.h>
+#include <linux/cdev.h>
+#include <linux/anon_inodes.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+
+#include "uverbs.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand userspace verbs access");
+MODULE_LICENSE("Dual BSD/GPL");
+
+enum {
+ IB_UVERBS_MAJOR = 231,
+ IB_UVERBS_BASE_MINOR = 192,
+ IB_UVERBS_MAX_DEVICES = 32
+};
+
+#define IB_UVERBS_BASE_DEV MKDEV(IB_UVERBS_MAJOR, IB_UVERBS_BASE_MINOR)
+
+static struct class *uverbs_class;
+
+DEFINE_SPINLOCK(ib_uverbs_idr_lock);
+DEFINE_IDR(ib_uverbs_pd_idr);
+DEFINE_IDR(ib_uverbs_mr_idr);
+DEFINE_IDR(ib_uverbs_mw_idr);
+DEFINE_IDR(ib_uverbs_ah_idr);
+DEFINE_IDR(ib_uverbs_cq_idr);
+DEFINE_IDR(ib_uverbs_qp_idr);
+DEFINE_IDR(ib_uverbs_srq_idr);
+DEFINE_IDR(ib_uverbs_xrcd_idr);
+DEFINE_IDR(ib_uverbs_rule_idr);
+
+static DEFINE_SPINLOCK(map_lock);
+static DECLARE_BITMAP(dev_map, IB_UVERBS_MAX_DEVICES);
+
+static ssize_t (*uverbs_cmd_table[])(struct ib_uverbs_file *file,
+ const char __user *buf, int in_len,
+ int out_len) = {
+ [IB_USER_VERBS_CMD_GET_CONTEXT] = ib_uverbs_get_context,
+ [IB_USER_VERBS_CMD_QUERY_DEVICE] = ib_uverbs_query_device,
+ [IB_USER_VERBS_CMD_QUERY_PORT] = ib_uverbs_query_port,
+ [IB_USER_VERBS_CMD_ALLOC_PD] = ib_uverbs_alloc_pd,
+ [IB_USER_VERBS_CMD_DEALLOC_PD] = ib_uverbs_dealloc_pd,
+ [IB_USER_VERBS_CMD_REG_MR] = ib_uverbs_reg_mr,
+ [IB_USER_VERBS_CMD_REREG_MR] = ib_uverbs_rereg_mr,
+ [IB_USER_VERBS_CMD_DEREG_MR] = ib_uverbs_dereg_mr,
+ [IB_USER_VERBS_CMD_ALLOC_MW] = ib_uverbs_alloc_mw,
+ [IB_USER_VERBS_CMD_DEALLOC_MW] = ib_uverbs_dealloc_mw,
+ [IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL] = ib_uverbs_create_comp_channel,
+ [IB_USER_VERBS_CMD_CREATE_CQ] = ib_uverbs_create_cq,
+ [IB_USER_VERBS_CMD_RESIZE_CQ] = ib_uverbs_resize_cq,
+ [IB_USER_VERBS_CMD_POLL_CQ] = ib_uverbs_poll_cq,
+ [IB_USER_VERBS_CMD_REQ_NOTIFY_CQ] = ib_uverbs_req_notify_cq,
+ [IB_USER_VERBS_CMD_DESTROY_CQ] = ib_uverbs_destroy_cq,
+ [IB_USER_VERBS_CMD_CREATE_QP] = ib_uverbs_create_qp,
+ [IB_USER_VERBS_CMD_QUERY_QP] = ib_uverbs_query_qp,
+ [IB_USER_VERBS_CMD_MODIFY_QP] = ib_uverbs_modify_qp,
+ [IB_USER_VERBS_CMD_DESTROY_QP] = ib_uverbs_destroy_qp,
+ [IB_USER_VERBS_CMD_POST_SEND] = ib_uverbs_post_send,
+ [IB_USER_VERBS_CMD_POST_RECV] = ib_uverbs_post_recv,
+ [IB_USER_VERBS_CMD_POST_SRQ_RECV] = ib_uverbs_post_srq_recv,
+ [IB_USER_VERBS_CMD_CREATE_AH] = ib_uverbs_create_ah,
+ [IB_USER_VERBS_CMD_DESTROY_AH] = ib_uverbs_destroy_ah,
+ [IB_USER_VERBS_CMD_ATTACH_MCAST] = ib_uverbs_attach_mcast,
+ [IB_USER_VERBS_CMD_DETACH_MCAST] = ib_uverbs_detach_mcast,
+ [IB_USER_VERBS_CMD_CREATE_SRQ] = ib_uverbs_create_srq,
+ [IB_USER_VERBS_CMD_MODIFY_SRQ] = ib_uverbs_modify_srq,
+ [IB_USER_VERBS_CMD_QUERY_SRQ] = ib_uverbs_query_srq,
+ [IB_USER_VERBS_CMD_DESTROY_SRQ] = ib_uverbs_destroy_srq,
+ [IB_USER_VERBS_CMD_OPEN_XRCD] = ib_uverbs_open_xrcd,
+ [IB_USER_VERBS_CMD_CLOSE_XRCD] = ib_uverbs_close_xrcd,
+ [IB_USER_VERBS_CMD_CREATE_XSRQ] = ib_uverbs_create_xsrq,
+ [IB_USER_VERBS_CMD_OPEN_QP] = ib_uverbs_open_qp,
+};
+
+static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file,
+ struct ib_udata *ucore,
+ struct ib_udata *uhw) = {
+ [IB_USER_VERBS_EX_CMD_CREATE_FLOW] = ib_uverbs_ex_create_flow,
+ [IB_USER_VERBS_EX_CMD_DESTROY_FLOW] = ib_uverbs_ex_destroy_flow,
+ [IB_USER_VERBS_EX_CMD_QUERY_DEVICE] = ib_uverbs_ex_query_device,
+};
+
+static void ib_uverbs_add_one(struct ib_device *device);
+static void ib_uverbs_remove_one(struct ib_device *device);
+
+static void ib_uverbs_release_dev(struct kref *ref)
+{
+ struct ib_uverbs_device *dev =
+ container_of(ref, struct ib_uverbs_device, ref);
+
+ complete(&dev->comp);
+}
+
+static void ib_uverbs_release_event_file(struct kref *ref)
+{
+ struct ib_uverbs_event_file *file =
+ container_of(ref, struct ib_uverbs_event_file, ref);
+
+ kfree(file);
+}
+
+void ib_uverbs_release_ucq(struct ib_uverbs_file *file,
+ struct ib_uverbs_event_file *ev_file,
+ struct ib_ucq_object *uobj)
+{
+ struct ib_uverbs_event *evt, *tmp;
+
+ if (ev_file) {
+ spin_lock_irq(&ev_file->lock);
+ list_for_each_entry_safe(evt, tmp, &uobj->comp_list, obj_list) {
+ list_del(&evt->list);
+ kfree(evt);
+ }
+ spin_unlock_irq(&ev_file->lock);
+
+ kref_put(&ev_file->ref, ib_uverbs_release_event_file);
+ }
+
+ spin_lock_irq(&file->async_file->lock);
+ list_for_each_entry_safe(evt, tmp, &uobj->async_list, obj_list) {
+ list_del(&evt->list);
+ kfree(evt);
+ }
+ spin_unlock_irq(&file->async_file->lock);
+}
+
+void ib_uverbs_release_uevent(struct ib_uverbs_file *file,
+ struct ib_uevent_object *uobj)
+{
+ struct ib_uverbs_event *evt, *tmp;
+
+ spin_lock_irq(&file->async_file->lock);
+ list_for_each_entry_safe(evt, tmp, &uobj->event_list, obj_list) {
+ list_del(&evt->list);
+ kfree(evt);
+ }
+ spin_unlock_irq(&file->async_file->lock);
+}
+
+static void ib_uverbs_detach_umcast(struct ib_qp *qp,
+ struct ib_uqp_object *uobj)
+{
+ struct ib_uverbs_mcast_entry *mcast, *tmp;
+
+ list_for_each_entry_safe(mcast, tmp, &uobj->mcast_list, list) {
+ ib_detach_mcast(qp, &mcast->gid, mcast->lid);
+ list_del(&mcast->list);
+ kfree(mcast);
+ }
+}
+
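+/*
+ * Release every user object still attached to the context, in dependency
+ * order: AHs, MWs, flow rules, QPs, SRQs, CQs, MRs, XRCDs and finally PDs.
+ */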
+static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file,
+ struct ib_ucontext *context)
+{
+ struct ib_uobject *uobj, *tmp;
+
+ if (!context)
+ return 0;
+
+ context->closing = 1;
+
+ list_for_each_entry_safe(uobj, tmp, &context->ah_list, list) {
+ struct ib_ah *ah = uobj->object;
+
+ idr_remove_uobj(&ib_uverbs_ah_idr, uobj);
+ ib_destroy_ah(ah);
+ kfree(uobj);
+ }
+
+ /* Remove MWs before QPs, in order to support type 2A MWs. */
+ list_for_each_entry_safe(uobj, tmp, &context->mw_list, list) {
+ struct ib_mw *mw = uobj->object;
+
+ idr_remove_uobj(&ib_uverbs_mw_idr, uobj);
+ ib_dealloc_mw(mw);
+ kfree(uobj);
+ }
+
+ list_for_each_entry_safe(uobj, tmp, &context->rule_list, list) {
+ struct ib_flow *flow_id = uobj->object;
+
+ idr_remove_uobj(&ib_uverbs_rule_idr, uobj);
+ ib_destroy_flow(flow_id);
+ kfree(uobj);
+ }
+
+ list_for_each_entry_safe(uobj, tmp, &context->qp_list, list) {
+ struct ib_qp *qp = uobj->object;
+ struct ib_uqp_object *uqp =
+ container_of(uobj, struct ib_uqp_object, uevent.uobject);
+
+ idr_remove_uobj(&ib_uverbs_qp_idr, uobj);
+ if (qp != qp->real_qp) {
+ ib_close_qp(qp);
+ } else {
+ ib_uverbs_detach_umcast(qp, uqp);
+ ib_destroy_qp(qp);
+ }
+ ib_uverbs_release_uevent(file, &uqp->uevent);
+ kfree(uqp);
+ }
+
+ list_for_each_entry_safe(uobj, tmp, &context->srq_list, list) {
+ struct ib_srq *srq = uobj->object;
+ struct ib_uevent_object *uevent =
+ container_of(uobj, struct ib_uevent_object, uobject);
+
+ idr_remove_uobj(&ib_uverbs_srq_idr, uobj);
+ ib_destroy_srq(srq);
+ ib_uverbs_release_uevent(file, uevent);
+ kfree(uevent);
+ }
+
+ list_for_each_entry_safe(uobj, tmp, &context->cq_list, list) {
+ struct ib_cq *cq = uobj->object;
+ struct ib_uverbs_event_file *ev_file = cq->cq_context;
+ struct ib_ucq_object *ucq =
+ container_of(uobj, struct ib_ucq_object, uobject);
+
+ idr_remove_uobj(&ib_uverbs_cq_idr, uobj);
+ ib_destroy_cq(cq);
+ ib_uverbs_release_ucq(file, ev_file, ucq);
+ kfree(ucq);
+ }
+
+ list_for_each_entry_safe(uobj, tmp, &context->mr_list, list) {
+ struct ib_mr *mr = uobj->object;
+
+ idr_remove_uobj(&ib_uverbs_mr_idr, uobj);
+ ib_dereg_mr(mr);
+ kfree(uobj);
+ }
+
+ mutex_lock(&file->device->xrcd_tree_mutex);
+ list_for_each_entry_safe(uobj, tmp, &context->xrcd_list, list) {
+ struct ib_xrcd *xrcd = uobj->object;
+ struct ib_uxrcd_object *uxrcd =
+ container_of(uobj, struct ib_uxrcd_object, uobject);
+
+ idr_remove_uobj(&ib_uverbs_xrcd_idr, uobj);
+ ib_uverbs_dealloc_xrcd(file->device, xrcd);
+ kfree(uxrcd);
+ }
+ mutex_unlock(&file->device->xrcd_tree_mutex);
+
+ list_for_each_entry_safe(uobj, tmp, &context->pd_list, list) {
+ struct ib_pd *pd = uobj->object;
+
+ idr_remove_uobj(&ib_uverbs_pd_idr, uobj);
+ ib_dealloc_pd(pd);
+ kfree(uobj);
+ }
+
+ put_pid(context->tgid);
+
+ return context->device->dealloc_ucontext(context);
+}
+
+static void ib_uverbs_release_file(struct kref *ref)
+{
+ struct ib_uverbs_file *file =
+ container_of(ref, struct ib_uverbs_file, ref);
+
+ module_put(file->device->ib_dev->owner);
+ kref_put(&file->device->ref, ib_uverbs_release_dev);
+
+ kfree(file);
+}
+
+static ssize_t ib_uverbs_event_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct ib_uverbs_event_file *file = filp->private_data;
+ struct ib_uverbs_event *event;
+ int eventsz;
+ int ret = 0;
+
+ spin_lock_irq(&file->lock);
+
+ while (list_empty(&file->event_list)) {
+ spin_unlock_irq(&file->lock);
+
+ if (filp->f_flags & O_NONBLOCK)
+ return -EAGAIN;
+
+ if (wait_event_interruptible(file->poll_wait,
+ !list_empty(&file->event_list)))
+ return -ERESTARTSYS;
+
+ spin_lock_irq(&file->lock);
+ }
+
+ event = list_entry(file->event_list.next, struct ib_uverbs_event, list);
+
+ if (file->is_async)
+ eventsz = sizeof (struct ib_uverbs_async_event_desc);
+ else
+ eventsz = sizeof (struct ib_uverbs_comp_event_desc);
+
+ if (eventsz > count) {
+ ret = -EINVAL;
+ event = NULL;
+ } else {
+ list_del(file->event_list.next);
+ if (event->counter) {
+ ++(*event->counter);
+ list_del(&event->obj_list);
+ }
+ }
+
+ spin_unlock_irq(&file->lock);
+
+ if (event) {
+ if (copy_to_user(buf, event, eventsz))
+ ret = -EFAULT;
+ else
+ ret = eventsz;
+ }
+
+ kfree(event);
+
+ return ret;
+}
+
+static unsigned int ib_uverbs_event_poll(struct file *filp,
+ struct poll_table_struct *wait)
+{
+ unsigned int pollflags = 0;
+ struct ib_uverbs_event_file *file = filp->private_data;
+
+ poll_wait(filp, &file->poll_wait, wait);
+
+ spin_lock_irq(&file->lock);
+ if (!list_empty(&file->event_list))
+ pollflags = POLLIN | POLLRDNORM;
+ spin_unlock_irq(&file->lock);
+
+ return pollflags;
+}
+
+static int ib_uverbs_event_fasync(int fd, struct file *filp, int on)
+{
+ struct ib_uverbs_event_file *file = filp->private_data;
+
+ return fasync_helper(fd, filp, on, &file->async_queue);
+}
+
+static int ib_uverbs_event_close(struct inode *inode, struct file *filp)
+{
+ struct ib_uverbs_event_file *file = filp->private_data;
+ struct ib_uverbs_event *entry, *tmp;
+
+ spin_lock_irq(&file->lock);
+ file->is_closed = 1;
+ list_for_each_entry_safe(entry, tmp, &file->event_list, list) {
+ if (entry->counter)
+ list_del(&entry->obj_list);
+ kfree(entry);
+ }
+ spin_unlock_irq(&file->lock);
+
+ if (file->is_async) {
+ ib_unregister_event_handler(&file->uverbs_file->event_handler);
+ kref_put(&file->uverbs_file->ref, ib_uverbs_release_file);
+ }
+ kref_put(&file->ref, ib_uverbs_release_event_file);
+
+ return 0;
+}
+
+static const struct file_operations uverbs_event_fops = {
+ .owner = THIS_MODULE,
+ .read = ib_uverbs_event_read,
+ .poll = ib_uverbs_event_poll,
+ .release = ib_uverbs_event_close,
+ .fasync = ib_uverbs_event_fasync,
+ .llseek = no_llseek,
+};
+
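+/*
+ * CQ completion callback: queue a completion event on the CQ's completion
+ * channel and wake up any readers/pollers.
+ */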
+void ib_uverbs_comp_handler(struct ib_cq *cq, void *cq_context)
+{
+ struct ib_uverbs_event_file *file = cq_context;
+ struct ib_ucq_object *uobj;
+ struct ib_uverbs_event *entry;
+ unsigned long flags;
+
+ if (!file)
+ return;
+
+ spin_lock_irqsave(&file->lock, flags);
+ if (file->is_closed) {
+ spin_unlock_irqrestore(&file->lock, flags);
+ return;
+ }
+
+ entry = kmalloc(sizeof *entry, GFP_ATOMIC);
+ if (!entry) {
+ spin_unlock_irqrestore(&file->lock, flags);
+ return;
+ }
+
+ uobj = container_of(cq->uobject, struct ib_ucq_object, uobject);
+
+ entry->desc.comp.cq_handle = cq->uobject->user_handle;
+ entry->counter = &uobj->comp_events_reported;
+
+ list_add_tail(&entry->list, &file->event_list);
+ list_add_tail(&entry->obj_list, &uobj->comp_list);
+ spin_unlock_irqrestore(&file->lock, flags);
+
+ wake_up_interruptible(&file->poll_wait);
+ kill_fasync(&file->async_queue, SIGIO, POLL_IN);
+}
+
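+/*
+ * Queue an asynchronous event on the file's async event queue (and on the
+ * owning object's list, if any) and wake up readers/pollers.
+ */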
+static void ib_uverbs_async_handler(struct ib_uverbs_file *file,
+ __u64 element, __u64 event,
+ struct list_head *obj_list,
+ u32 *counter)
+{
+ struct ib_uverbs_event *entry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&file->async_file->lock, flags);
+ if (file->async_file->is_closed) {
+ spin_unlock_irqrestore(&file->async_file->lock, flags);
+ return;
+ }
+
+ entry = kmalloc(sizeof *entry, GFP_ATOMIC);
+ if (!entry) {
+ spin_unlock_irqrestore(&file->async_file->lock, flags);
+ return;
+ }
+
+ entry->desc.async.element = element;
+ entry->desc.async.event_type = event;
+ entry->desc.async.reserved = 0;
+ entry->counter = counter;
+
+ list_add_tail(&entry->list, &file->async_file->event_list);
+ if (obj_list)
+ list_add_tail(&entry->obj_list, obj_list);
+ spin_unlock_irqrestore(&file->async_file->lock, flags);
+
+ wake_up_interruptible(&file->async_file->poll_wait);
+ kill_fasync(&file->async_file->async_queue, SIGIO, POLL_IN);
+}
+
+void ib_uverbs_cq_event_handler(struct ib_event *event, void *context_ptr)
+{
+ struct ib_ucq_object *uobj = container_of(event->element.cq->uobject,
+ struct ib_ucq_object, uobject);
+
+ ib_uverbs_async_handler(uobj->uverbs_file, uobj->uobject.user_handle,
+ event->event, &uobj->async_list,
+ &uobj->async_events_reported);
+}
+
+void ib_uverbs_qp_event_handler(struct ib_event *event, void *context_ptr)
+{
+ struct ib_uevent_object *uobj;
+
+	/* for XRC target QPs, check that the QP is live */
+ if (!event->element.qp->uobject || !event->element.qp->uobject->live)
+ return;
+
+ uobj = container_of(event->element.qp->uobject,
+ struct ib_uevent_object, uobject);
+
+ ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle,
+ event->event, &uobj->event_list,
+ &uobj->events_reported);
+}
+
+void ib_uverbs_srq_event_handler(struct ib_event *event, void *context_ptr)
+{
+ struct ib_uevent_object *uobj;
+
+ uobj = container_of(event->element.srq->uobject,
+ struct ib_uevent_object, uobject);
+
+ ib_uverbs_async_handler(context_ptr, uobj->uobject.user_handle,
+ event->event, &uobj->event_list,
+ &uobj->events_reported);
+}
+
+void ib_uverbs_event_handler(struct ib_event_handler *handler,
+ struct ib_event *event)
+{
+ struct ib_uverbs_file *file =
+ container_of(handler, struct ib_uverbs_file, event_handler);
+
+ ib_uverbs_async_handler(file, event->element.port_num, event->event,
+ NULL, NULL);
+}
+
+struct file *ib_uverbs_alloc_event_file(struct ib_uverbs_file *uverbs_file,
+ int is_async)
+{
+ struct ib_uverbs_event_file *ev_file;
+ struct file *filp;
+
+ ev_file = kmalloc(sizeof *ev_file, GFP_KERNEL);
+ if (!ev_file)
+ return ERR_PTR(-ENOMEM);
+
+ kref_init(&ev_file->ref);
+ spin_lock_init(&ev_file->lock);
+ INIT_LIST_HEAD(&ev_file->event_list);
+ init_waitqueue_head(&ev_file->poll_wait);
+ ev_file->uverbs_file = uverbs_file;
+ ev_file->async_queue = NULL;
+ ev_file->is_async = is_async;
+ ev_file->is_closed = 0;
+
+ filp = anon_inode_getfile("[infinibandevent]", &uverbs_event_fops,
+ ev_file, O_RDONLY);
+ if (IS_ERR(filp))
+ kfree(ev_file);
+
+ return filp;
+}
+
+/*
+ * Look up a completion event file by FD. If lookup is successful,
+ * takes a ref to the event file struct that it returns; if
+ * unsuccessful, returns NULL.
+ */
+struct ib_uverbs_event_file *ib_uverbs_lookup_comp_file(int fd)
+{
+ struct ib_uverbs_event_file *ev_file = NULL;
+ struct fd f = fdget(fd);
+
+ if (!f.file)
+ return NULL;
+
+ if (f.file->f_op != &uverbs_event_fops)
+ goto out;
+
+ ev_file = f.file->private_data;
+ if (ev_file->is_async) {
+ ev_file = NULL;
+ goto out;
+ }
+
+ kref_get(&ev_file->ref);
+
+out:
+ fdput(f);
+ return ev_file;
+}
+
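+/*
+ * write() entry point: parse the command header and dispatch.  Legacy
+ * commands go through uverbs_cmd_table with lengths in 4-byte words;
+ * extended commands carry an extra ib_uverbs_ex_cmd_hdr, use 8-byte
+ * words and are dispatched through uverbs_ex_cmd_table with separate
+ * core (ucore) and provider (uhw) udata buffers.
+ */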
+static ssize_t ib_uverbs_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct ib_uverbs_file *file = filp->private_data;
+ struct ib_uverbs_cmd_hdr hdr;
+ __u32 flags;
+
+ if (count < sizeof hdr)
+ return -EINVAL;
+
+ if (copy_from_user(&hdr, buf, sizeof hdr))
+ return -EFAULT;
+
+ flags = (hdr.command &
+ IB_USER_VERBS_CMD_FLAGS_MASK) >> IB_USER_VERBS_CMD_FLAGS_SHIFT;
+
+ if (!flags) {
+ __u32 command;
+
+ if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
+ IB_USER_VERBS_CMD_COMMAND_MASK))
+ return -EINVAL;
+
+ command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
+
+ if (command >= ARRAY_SIZE(uverbs_cmd_table) ||
+ !uverbs_cmd_table[command])
+ return -EINVAL;
+
+ if (!file->ucontext &&
+ command != IB_USER_VERBS_CMD_GET_CONTEXT)
+ return -EINVAL;
+
+ if (!(file->device->ib_dev->uverbs_cmd_mask & (1ull << command)))
+ return -ENOSYS;
+
+ if (hdr.in_words * 4 != count)
+ return -EINVAL;
+
+ return uverbs_cmd_table[command](file,
+ buf + sizeof(hdr),
+ hdr.in_words * 4,
+ hdr.out_words * 4);
+
+ } else if (flags == IB_USER_VERBS_CMD_FLAG_EXTENDED) {
+ __u32 command;
+
+ struct ib_uverbs_ex_cmd_hdr ex_hdr;
+ struct ib_udata ucore;
+ struct ib_udata uhw;
+ int err;
+ size_t written_count = count;
+
+ if (hdr.command & ~(__u32)(IB_USER_VERBS_CMD_FLAGS_MASK |
+ IB_USER_VERBS_CMD_COMMAND_MASK))
+ return -EINVAL;
+
+ command = hdr.command & IB_USER_VERBS_CMD_COMMAND_MASK;
+
+ if (command >= ARRAY_SIZE(uverbs_ex_cmd_table) ||
+ !uverbs_ex_cmd_table[command])
+ return -ENOSYS;
+
+ if (!file->ucontext)
+ return -EINVAL;
+
+ if (!(file->device->ib_dev->uverbs_ex_cmd_mask & (1ull << command)))
+ return -ENOSYS;
+
+ if (count < (sizeof(hdr) + sizeof(ex_hdr)))
+ return -EINVAL;
+
+ if (copy_from_user(&ex_hdr, buf + sizeof(hdr), sizeof(ex_hdr)))
+ return -EFAULT;
+
+ count -= sizeof(hdr) + sizeof(ex_hdr);
+ buf += sizeof(hdr) + sizeof(ex_hdr);
+
+ if ((hdr.in_words + ex_hdr.provider_in_words) * 8 != count)
+ return -EINVAL;
+
+ if (ex_hdr.cmd_hdr_reserved)
+ return -EINVAL;
+
+ if (ex_hdr.response) {
+ if (!hdr.out_words && !ex_hdr.provider_out_words)
+ return -EINVAL;
+
+ if (!access_ok(VERIFY_WRITE,
+ (void __user *) (unsigned long) ex_hdr.response,
+ (hdr.out_words + ex_hdr.provider_out_words) * 8))
+ return -EFAULT;
+ } else {
+ if (hdr.out_words || ex_hdr.provider_out_words)
+ return -EINVAL;
+ }
+
+ INIT_UDATA_BUF_OR_NULL(&ucore, buf, (unsigned long) ex_hdr.response,
+ hdr.in_words * 8, hdr.out_words * 8);
+
+ INIT_UDATA_BUF_OR_NULL(&uhw,
+ buf + ucore.inlen,
+ (unsigned long) ex_hdr.response + ucore.outlen,
+ ex_hdr.provider_in_words * 8,
+ ex_hdr.provider_out_words * 8);
+
+ err = uverbs_ex_cmd_table[command](file,
+ &ucore,
+ &uhw);
+
+ if (err)
+ return err;
+
+ return written_count;
+ }
+
+ return -ENOSYS;
+}
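
For the non-extended path above, the byte count passed to write() must equal hdr.in_words * 4, with in_words covering the 8-byte header as well as the command body. A userspace-side sketch under those assumptions; the "foo" structures, the command constant, and uverbs_fd are placeholders rather than part of the real ABI:

        /* Hypothetical userspace helper illustrating the legacy header layout. */
        static int issue_foo_cmd(int uverbs_fd, const struct foo_cmd *body,
                                 size_t resp_len)
        {
                struct {
                        struct ib_uverbs_cmd_hdr hdr;
                        struct foo_cmd body;
                } req;

                req.hdr.command   = IB_USER_VERBS_CMD_FOO;  /* placeholder */
                req.hdr.in_words  = sizeof(req) / 4;        /* header included */
                req.hdr.out_words = resp_len / 4;
                req.body          = *body;
                /* for commands that return data, the response buffer address
                 * is carried inside the command body itself */

                if (write(uverbs_fd, &req, sizeof(req)) != sizeof(req))
                        return -1;
                return 0;
        }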
+
+static int ib_uverbs_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+ struct ib_uverbs_file *file = filp->private_data;
+
+ if (!file->ucontext)
+ return -ENODEV;
+ else
+ return file->device->ib_dev->mmap(file->ucontext, vma);
+}
+
+/*
+ * ib_uverbs_open() does not need the BKL:
+ *
+ * - the ib_uverbs_device structures are properly reference counted and
+ * everything else is purely local to the file being created, so
+ * races against other open calls are not a problem;
+ * - there is no ioctl method to race against;
+ * - the open method will either immediately return -ENXIO, or all
+ * required initialization will be done.
+ */
+static int ib_uverbs_open(struct inode *inode, struct file *filp)
+{
+ struct ib_uverbs_device *dev;
+ struct ib_uverbs_file *file;
+ int ret;
+
+ dev = container_of(inode->i_cdev, struct ib_uverbs_device, cdev);
+ if (dev)
+ kref_get(&dev->ref);
+ else
+ return -ENXIO;
+
+ if (!try_module_get(dev->ib_dev->owner)) {
+ ret = -ENODEV;
+ goto err;
+ }
+
+ file = kmalloc(sizeof *file, GFP_KERNEL);
+ if (!file) {
+ ret = -ENOMEM;
+ goto err_module;
+ }
+
+ file->device = dev;
+ file->ucontext = NULL;
+ file->async_file = NULL;
+ kref_init(&file->ref);
+ mutex_init(&file->mutex);
+
+ filp->private_data = file;
+
+ return nonseekable_open(inode, filp);
+
+err_module:
+ module_put(dev->ib_dev->owner);
+
+err:
+ kref_put(&dev->ref, ib_uverbs_release_dev);
+ return ret;
+}
+
+static int ib_uverbs_close(struct inode *inode, struct file *filp)
+{
+ struct ib_uverbs_file *file = filp->private_data;
+
+ ib_uverbs_cleanup_ucontext(file, file->ucontext);
+
+ if (file->async_file)
+ kref_put(&file->async_file->ref, ib_uverbs_release_event_file);
+
+ kref_put(&file->ref, ib_uverbs_release_file);
+
+ return 0;
+}
+
+static const struct file_operations uverbs_fops = {
+ .owner = THIS_MODULE,
+ .write = ib_uverbs_write,
+ .open = ib_uverbs_open,
+ .release = ib_uverbs_close,
+ .llseek = no_llseek,
+};
+
+static const struct file_operations uverbs_mmap_fops = {
+ .owner = THIS_MODULE,
+ .write = ib_uverbs_write,
+ .mmap = ib_uverbs_mmap,
+ .open = ib_uverbs_open,
+ .release = ib_uverbs_close,
+ .llseek = no_llseek,
+};
+
+static struct ib_client uverbs_client = {
+ .name = "uverbs",
+ .add = ib_uverbs_add_one,
+ .remove = ib_uverbs_remove_one
+};
+
+static ssize_t show_ibdev(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct ib_uverbs_device *dev = dev_get_drvdata(device);
+
+ if (!dev)
+ return -ENODEV;
+
+ return sprintf(buf, "%s\n", dev->ib_dev->name);
+}
+static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+
+static ssize_t show_dev_abi_version(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct ib_uverbs_device *dev = dev_get_drvdata(device);
+
+ if (!dev)
+ return -ENODEV;
+
+ return sprintf(buf, "%d\n", dev->ib_dev->uverbs_abi_ver);
+}
+static DEVICE_ATTR(abi_version, S_IRUGO, show_dev_abi_version, NULL);
+
+static CLASS_ATTR_STRING(abi_version, S_IRUGO,
+ __stringify(IB_USER_VERBS_ABI_VERSION));
+
+static dev_t overflow_maj;
+static DECLARE_BITMAP(overflow_map, IB_UVERBS_MAX_DEVICES);
+
+/*
+ * If we have more than IB_UVERBS_MAX_DEVICES, dynamically overflow by
+ * requesting a new major number and doubling the number of max devices we
+ * support. It's stupid, but simple.
+ */
+static int find_overflow_devnum(void)
+{
+ int ret;
+
+ if (!overflow_maj) {
+ ret = alloc_chrdev_region(&overflow_maj, 0, IB_UVERBS_MAX_DEVICES,
+ "infiniband_verbs");
+ if (ret) {
+ printk(KERN_ERR "user_verbs: couldn't register dynamic device number\n");
+ return ret;
+ }
+ }
+
+ ret = find_first_zero_bit(overflow_map, IB_UVERBS_MAX_DEVICES);
+ if (ret >= IB_UVERBS_MAX_DEVICES)
+ return -1;
+
+ return ret;
+}
+
+static void ib_uverbs_add_one(struct ib_device *device)
+{
+ int devnum;
+ dev_t base;
+ struct ib_uverbs_device *uverbs_dev;
+
+ if (!device->alloc_ucontext)
+ return;
+
+ uverbs_dev = kzalloc(sizeof *uverbs_dev, GFP_KERNEL);
+ if (!uverbs_dev)
+ return;
+
+ kref_init(&uverbs_dev->ref);
+ init_completion(&uverbs_dev->comp);
+ uverbs_dev->xrcd_tree = RB_ROOT;
+ mutex_init(&uverbs_dev->xrcd_tree_mutex);
+
+ spin_lock(&map_lock);
+ devnum = find_first_zero_bit(dev_map, IB_UVERBS_MAX_DEVICES);
+ if (devnum >= IB_UVERBS_MAX_DEVICES) {
+ spin_unlock(&map_lock);
+ devnum = find_overflow_devnum();
+ if (devnum < 0)
+ goto err;
+
+ spin_lock(&map_lock);
+ uverbs_dev->devnum = devnum + IB_UVERBS_MAX_DEVICES;
+ base = devnum + overflow_maj;
+ set_bit(devnum, overflow_map);
+ } else {
+ uverbs_dev->devnum = devnum;
+ base = devnum + IB_UVERBS_BASE_DEV;
+ set_bit(devnum, dev_map);
+ }
+ spin_unlock(&map_lock);
+
+ uverbs_dev->ib_dev = device;
+ uverbs_dev->num_comp_vectors = device->num_comp_vectors;
+
+ cdev_init(&uverbs_dev->cdev, NULL);
+ uverbs_dev->cdev.owner = THIS_MODULE;
+ uverbs_dev->cdev.ops = device->mmap ? &uverbs_mmap_fops : &uverbs_fops;
+ kobject_set_name(&uverbs_dev->cdev.kobj, "uverbs%d", uverbs_dev->devnum);
+ if (cdev_add(&uverbs_dev->cdev, base, 1))
+ goto err_cdev;
+
+ uverbs_dev->dev = device_create(uverbs_class, device->dma_device,
+ uverbs_dev->cdev.dev, uverbs_dev,
+ "uverbs%d", uverbs_dev->devnum);
+ if (IS_ERR(uverbs_dev->dev))
+ goto err_cdev;
+
+ if (device_create_file(uverbs_dev->dev, &dev_attr_ibdev))
+ goto err_class;
+ if (device_create_file(uverbs_dev->dev, &dev_attr_abi_version))
+ goto err_class;
+
+ ib_set_client_data(device, &uverbs_client, uverbs_dev);
+
+ return;
+
+err_class:
+ device_destroy(uverbs_class, uverbs_dev->cdev.dev);
+
+err_cdev:
+ cdev_del(&uverbs_dev->cdev);
+ if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES)
+ clear_bit(devnum, dev_map);
+ else
+ clear_bit(devnum, overflow_map);
+
+err:
+ kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
+ wait_for_completion(&uverbs_dev->comp);
+ kfree(uverbs_dev);
+ return;
+}
+
+static void ib_uverbs_remove_one(struct ib_device *device)
+{
+ struct ib_uverbs_device *uverbs_dev = ib_get_client_data(device, &uverbs_client);
+
+ if (!uverbs_dev)
+ return;
+
+ dev_set_drvdata(uverbs_dev->dev, NULL);
+ device_destroy(uverbs_class, uverbs_dev->cdev.dev);
+ cdev_del(&uverbs_dev->cdev);
+
+ if (uverbs_dev->devnum < IB_UVERBS_MAX_DEVICES)
+ clear_bit(uverbs_dev->devnum, dev_map);
+ else
+ clear_bit(uverbs_dev->devnum - IB_UVERBS_MAX_DEVICES, overflow_map);
+
+ kref_put(&uverbs_dev->ref, ib_uverbs_release_dev);
+ wait_for_completion(&uverbs_dev->comp);
+ kfree(uverbs_dev);
+}
+
+static char *uverbs_devnode(struct device *dev, umode_t *mode)
+{
+ if (mode)
+ *mode = 0666;
+ return kasprintf(GFP_KERNEL, "infiniband/%s", dev_name(dev));
+}
+
+static int __init ib_uverbs_init(void)
+{
+ int ret;
+
+ ret = register_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES,
+ "infiniband_verbs");
+ if (ret) {
+ printk(KERN_ERR "user_verbs: couldn't register device number\n");
+ goto out;
+ }
+
+ uverbs_class = class_create(THIS_MODULE, "infiniband_verbs");
+ if (IS_ERR(uverbs_class)) {
+ ret = PTR_ERR(uverbs_class);
+ printk(KERN_ERR "user_verbs: couldn't create class infiniband_verbs\n");
+ goto out_chrdev;
+ }
+
+ uverbs_class->devnode = uverbs_devnode;
+
+ ret = class_create_file(uverbs_class, &class_attr_abi_version.attr);
+ if (ret) {
+ printk(KERN_ERR "user_verbs: couldn't create abi_version attribute\n");
+ goto out_class;
+ }
+
+ ret = ib_register_client(&uverbs_client);
+ if (ret) {
+ printk(KERN_ERR "user_verbs: couldn't register client\n");
+ goto out_class;
+ }
+
+ return 0;
+
+out_class:
+ class_destroy(uverbs_class);
+
+out_chrdev:
+ unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES);
+
+out:
+ return ret;
+}
+
+static void __exit ib_uverbs_cleanup(void)
+{
+ ib_unregister_client(&uverbs_client);
+ class_destroy(uverbs_class);
+ unregister_chrdev_region(IB_UVERBS_BASE_DEV, IB_UVERBS_MAX_DEVICES);
+ if (overflow_maj)
+ unregister_chrdev_region(overflow_maj, IB_UVERBS_MAX_DEVICES);
+ idr_destroy(&ib_uverbs_pd_idr);
+ idr_destroy(&ib_uverbs_mr_idr);
+ idr_destroy(&ib_uverbs_mw_idr);
+ idr_destroy(&ib_uverbs_ah_idr);
+ idr_destroy(&ib_uverbs_cq_idr);
+ idr_destroy(&ib_uverbs_qp_idr);
+ idr_destroy(&ib_uverbs_srq_idr);
+}
+
+module_init(ib_uverbs_init);
+module_exit(ib_uverbs_cleanup);
diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c
new file mode 100644
index 000000000..abd972474
--- /dev/null
+++ b/drivers/infiniband/core/uverbs_marshall.c
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2005 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/export.h>
+#include <rdma/ib_marshall.h>
+
+void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst,
+ struct ib_ah_attr *src)
+{
+ memcpy(dst->grh.dgid, src->grh.dgid.raw, sizeof src->grh.dgid);
+ dst->grh.flow_label = src->grh.flow_label;
+ dst->grh.sgid_index = src->grh.sgid_index;
+ dst->grh.hop_limit = src->grh.hop_limit;
+ dst->grh.traffic_class = src->grh.traffic_class;
+ memset(&dst->grh.reserved, 0, sizeof(dst->grh.reserved));
+ dst->dlid = src->dlid;
+ dst->sl = src->sl;
+ dst->src_path_bits = src->src_path_bits;
+ dst->static_rate = src->static_rate;
+ dst->is_global = src->ah_flags & IB_AH_GRH ? 1 : 0;
+ dst->port_num = src->port_num;
+ dst->reserved = 0;
+}
+EXPORT_SYMBOL(ib_copy_ah_attr_to_user);
+
+void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst,
+ struct ib_qp_attr *src)
+{
+ dst->qp_state = src->qp_state;
+ dst->cur_qp_state = src->cur_qp_state;
+ dst->path_mtu = src->path_mtu;
+ dst->path_mig_state = src->path_mig_state;
+ dst->qkey = src->qkey;
+ dst->rq_psn = src->rq_psn;
+ dst->sq_psn = src->sq_psn;
+ dst->dest_qp_num = src->dest_qp_num;
+ dst->qp_access_flags = src->qp_access_flags;
+
+ dst->max_send_wr = src->cap.max_send_wr;
+ dst->max_recv_wr = src->cap.max_recv_wr;
+ dst->max_send_sge = src->cap.max_send_sge;
+ dst->max_recv_sge = src->cap.max_recv_sge;
+ dst->max_inline_data = src->cap.max_inline_data;
+
+ ib_copy_ah_attr_to_user(&dst->ah_attr, &src->ah_attr);
+ ib_copy_ah_attr_to_user(&dst->alt_ah_attr, &src->alt_ah_attr);
+
+ dst->pkey_index = src->pkey_index;
+ dst->alt_pkey_index = src->alt_pkey_index;
+ dst->en_sqd_async_notify = src->en_sqd_async_notify;
+ dst->sq_draining = src->sq_draining;
+ dst->max_rd_atomic = src->max_rd_atomic;
+ dst->max_dest_rd_atomic = src->max_dest_rd_atomic;
+ dst->min_rnr_timer = src->min_rnr_timer;
+ dst->port_num = src->port_num;
+ dst->timeout = src->timeout;
+ dst->retry_cnt = src->retry_cnt;
+ dst->rnr_retry = src->rnr_retry;
+ dst->alt_port_num = src->alt_port_num;
+ dst->alt_timeout = src->alt_timeout;
+ memset(dst->reserved, 0, sizeof(dst->reserved));
+}
+EXPORT_SYMBOL(ib_copy_qp_attr_to_user);
+
+void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst,
+ struct ib_sa_path_rec *src)
+{
+ memcpy(dst->dgid, src->dgid.raw, sizeof src->dgid);
+ memcpy(dst->sgid, src->sgid.raw, sizeof src->sgid);
+
+ dst->dlid = src->dlid;
+ dst->slid = src->slid;
+ dst->raw_traffic = src->raw_traffic;
+ dst->flow_label = src->flow_label;
+ dst->hop_limit = src->hop_limit;
+ dst->traffic_class = src->traffic_class;
+ dst->reversible = src->reversible;
+ dst->numb_path = src->numb_path;
+ dst->pkey = src->pkey;
+ dst->sl = src->sl;
+ dst->mtu_selector = src->mtu_selector;
+ dst->mtu = src->mtu;
+ dst->rate_selector = src->rate_selector;
+ dst->rate = src->rate;
+ dst->packet_life_time = src->packet_life_time;
+ dst->preference = src->preference;
+ dst->packet_life_time_selector = src->packet_life_time_selector;
+}
+EXPORT_SYMBOL(ib_copy_path_rec_to_user);
+
+void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst,
+ struct ib_user_path_rec *src)
+{
+ memcpy(dst->dgid.raw, src->dgid, sizeof dst->dgid);
+ memcpy(dst->sgid.raw, src->sgid, sizeof dst->sgid);
+
+ dst->dlid = src->dlid;
+ dst->slid = src->slid;
+ dst->raw_traffic = src->raw_traffic;
+ dst->flow_label = src->flow_label;
+ dst->hop_limit = src->hop_limit;
+ dst->traffic_class = src->traffic_class;
+ dst->reversible = src->reversible;
+ dst->numb_path = src->numb_path;
+ dst->pkey = src->pkey;
+ dst->sl = src->sl;
+ dst->mtu_selector = src->mtu_selector;
+ dst->mtu = src->mtu;
+ dst->rate_selector = src->rate_selector;
+ dst->rate = src->rate;
+ dst->packet_life_time = src->packet_life_time;
+ dst->preference = src->preference;
+ dst->packet_life_time_selector = src->packet_life_time_selector;
+
+ memset(dst->smac, 0, sizeof(dst->smac));
+ memset(dst->dmac, 0, sizeof(dst->dmac));
+ dst->vlan_id = 0xffff;
+}
+EXPORT_SYMBOL(ib_copy_path_rec_from_user);
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
new file mode 100644
index 000000000..f93eb8da7
--- /dev/null
+++ b/drivers/infiniband/core/verbs.c
@@ -0,0 +1,1448 @@
+/*
+ * Copyright (c) 2004 Mellanox Technologies Ltd. All rights reserved.
+ * Copyright (c) 2004 Infinicon Corporation. All rights reserved.
+ * Copyright (c) 2004 Intel Corporation. All rights reserved.
+ * Copyright (c) 2004 Topspin Corporation. All rights reserved.
+ * Copyright (c) 2004 Voltaire Corporation. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/err.h>
+#include <linux/export.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_addr.h>
+
+#include "core_priv.h"
+
+__attribute_const__ int ib_rate_to_mult(enum ib_rate rate)
+{
+ switch (rate) {
+ case IB_RATE_2_5_GBPS: return 1;
+ case IB_RATE_5_GBPS: return 2;
+ case IB_RATE_10_GBPS: return 4;
+ case IB_RATE_20_GBPS: return 8;
+ case IB_RATE_30_GBPS: return 12;
+ case IB_RATE_40_GBPS: return 16;
+ case IB_RATE_60_GBPS: return 24;
+ case IB_RATE_80_GBPS: return 32;
+ case IB_RATE_120_GBPS: return 48;
+ default: return -1;
+ }
+}
+EXPORT_SYMBOL(ib_rate_to_mult);
+
+__attribute_const__ enum ib_rate mult_to_ib_rate(int mult)
+{
+ switch (mult) {
+ case 1: return IB_RATE_2_5_GBPS;
+ case 2: return IB_RATE_5_GBPS;
+ case 4: return IB_RATE_10_GBPS;
+ case 8: return IB_RATE_20_GBPS;
+ case 12: return IB_RATE_30_GBPS;
+ case 16: return IB_RATE_40_GBPS;
+ case 24: return IB_RATE_60_GBPS;
+ case 32: return IB_RATE_80_GBPS;
+ case 48: return IB_RATE_120_GBPS;
+ default: return IB_RATE_PORT_CURRENT;
+ }
+}
+EXPORT_SYMBOL(mult_to_ib_rate);
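
The multiplier used by this pair is expressed in 2.5 Gb/s SDR lanes, so the discrete IB rates round-trip cleanly; a minimal illustration (hypothetical snippet):

        int mult = ib_rate_to_mult(IB_RATE_40_GBPS);    /* 40 / 2.5 = 16 */
        enum ib_rate rate = mult_to_ib_rate(mult);      /* IB_RATE_40_GBPS again */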
+
+__attribute_const__ int ib_rate_to_mbps(enum ib_rate rate)
+{
+ switch (rate) {
+ case IB_RATE_2_5_GBPS: return 2500;
+ case IB_RATE_5_GBPS: return 5000;
+ case IB_RATE_10_GBPS: return 10000;
+ case IB_RATE_20_GBPS: return 20000;
+ case IB_RATE_30_GBPS: return 30000;
+ case IB_RATE_40_GBPS: return 40000;
+ case IB_RATE_60_GBPS: return 60000;
+ case IB_RATE_80_GBPS: return 80000;
+ case IB_RATE_120_GBPS: return 120000;
+ case IB_RATE_14_GBPS: return 14062;
+ case IB_RATE_56_GBPS: return 56250;
+ case IB_RATE_112_GBPS: return 112500;
+ case IB_RATE_168_GBPS: return 168750;
+ case IB_RATE_25_GBPS: return 25781;
+ case IB_RATE_100_GBPS: return 103125;
+ case IB_RATE_200_GBPS: return 206250;
+ case IB_RATE_300_GBPS: return 309375;
+ default: return -1;
+ }
+}
+EXPORT_SYMBOL(ib_rate_to_mbps);
+
+__attribute_const__ enum rdma_transport_type
+rdma_node_get_transport(enum rdma_node_type node_type)
+{
+ switch (node_type) {
+ case RDMA_NODE_IB_CA:
+ case RDMA_NODE_IB_SWITCH:
+ case RDMA_NODE_IB_ROUTER:
+ return RDMA_TRANSPORT_IB;
+ case RDMA_NODE_RNIC:
+ return RDMA_TRANSPORT_IWARP;
+ case RDMA_NODE_USNIC:
+ return RDMA_TRANSPORT_USNIC;
+ case RDMA_NODE_USNIC_UDP:
+ return RDMA_TRANSPORT_USNIC_UDP;
+ default:
+ BUG();
+ return 0;
+ }
+}
+EXPORT_SYMBOL(rdma_node_get_transport);
+
+enum rdma_link_layer rdma_port_get_link_layer(struct ib_device *device, u8 port_num)
+{
+ if (device->get_link_layer)
+ return device->get_link_layer(device, port_num);
+
+ switch (rdma_node_get_transport(device->node_type)) {
+ case RDMA_TRANSPORT_IB:
+ return IB_LINK_LAYER_INFINIBAND;
+ case RDMA_TRANSPORT_IWARP:
+ case RDMA_TRANSPORT_USNIC:
+ case RDMA_TRANSPORT_USNIC_UDP:
+ return IB_LINK_LAYER_ETHERNET;
+ default:
+ return IB_LINK_LAYER_UNSPECIFIED;
+ }
+}
+EXPORT_SYMBOL(rdma_port_get_link_layer);
+
+/* Protection domains */
+
+struct ib_pd *ib_alloc_pd(struct ib_device *device)
+{
+ struct ib_pd *pd;
+
+ pd = device->alloc_pd(device, NULL, NULL);
+
+ if (!IS_ERR(pd)) {
+ pd->device = device;
+ pd->uobject = NULL;
+ atomic_set(&pd->usecnt, 0);
+ }
+
+ return pd;
+}
+EXPORT_SYMBOL(ib_alloc_pd);
+
+int ib_dealloc_pd(struct ib_pd *pd)
+{
+ if (atomic_read(&pd->usecnt))
+ return -EBUSY;
+
+ return pd->device->dealloc_pd(pd);
+}
+EXPORT_SYMBOL(ib_dealloc_pd);
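
A minimal consumer-side sketch of the two calls above, assuming a hypothetical ULP that already holds an ib_device pointer; ib_alloc_pd() follows the ERR_PTR convention and ib_dealloc_pd() refuses to free a PD that still has users:

        static int example_pd_usage(struct ib_device *device)
        {
                struct ib_pd *pd;

                pd = ib_alloc_pd(device);
                if (IS_ERR(pd))
                        return PTR_ERR(pd);

                /* ... create QPs, CQs, MRs, etc. against pd ... */

                /* fails with -EBUSY while pd->usecnt is still nonzero */
                return ib_dealloc_pd(pd);
        }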
+
+/* Address handles */
+
+struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+ struct ib_ah *ah;
+
+ ah = pd->device->create_ah(pd, ah_attr);
+
+ if (!IS_ERR(ah)) {
+ ah->device = pd->device;
+ ah->pd = pd;
+ ah->uobject = NULL;
+ atomic_inc(&pd->usecnt);
+ }
+
+ return ah;
+}
+EXPORT_SYMBOL(ib_create_ah);
+
+int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, struct ib_wc *wc,
+ struct ib_grh *grh, struct ib_ah_attr *ah_attr)
+{
+ u32 flow_class;
+ u16 gid_index;
+ int ret;
+ int is_eth = (rdma_port_get_link_layer(device, port_num) ==
+ IB_LINK_LAYER_ETHERNET);
+
+ memset(ah_attr, 0, sizeof *ah_attr);
+ if (is_eth) {
+ if (!(wc->wc_flags & IB_WC_GRH))
+ return -EPROTOTYPE;
+
+ if (wc->wc_flags & IB_WC_WITH_SMAC &&
+ wc->wc_flags & IB_WC_WITH_VLAN) {
+ memcpy(ah_attr->dmac, wc->smac, ETH_ALEN);
+ ah_attr->vlan_id = wc->vlan_id;
+ } else {
+ ret = rdma_addr_find_dmac_by_grh(&grh->dgid, &grh->sgid,
+ ah_attr->dmac, &ah_attr->vlan_id);
+ if (ret)
+ return ret;
+ }
+ } else {
+ ah_attr->vlan_id = 0xffff;
+ }
+
+ ah_attr->dlid = wc->slid;
+ ah_attr->sl = wc->sl;
+ ah_attr->src_path_bits = wc->dlid_path_bits;
+ ah_attr->port_num = port_num;
+
+ if (wc->wc_flags & IB_WC_GRH) {
+ ah_attr->ah_flags = IB_AH_GRH;
+ ah_attr->grh.dgid = grh->sgid;
+
+ ret = ib_find_cached_gid(device, &grh->dgid, &port_num,
+ &gid_index);
+ if (ret)
+ return ret;
+
+ ah_attr->grh.sgid_index = (u8) gid_index;
+ flow_class = be32_to_cpu(grh->version_tclass_flow);
+ ah_attr->grh.flow_label = flow_class & 0xFFFFF;
+ ah_attr->grh.hop_limit = 0xFF;
+ ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF;
+ }
+ return 0;
+}
+EXPORT_SYMBOL(ib_init_ah_from_wc);
+
+struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, struct ib_wc *wc,
+ struct ib_grh *grh, u8 port_num)
+{
+ struct ib_ah_attr ah_attr;
+ int ret;
+
+ ret = ib_init_ah_from_wc(pd->device, port_num, wc, grh, &ah_attr);
+ if (ret)
+ return ERR_PTR(ret);
+
+ return ib_create_ah(pd, &ah_attr);
+}
+EXPORT_SYMBOL(ib_create_ah_from_wc);
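
The usual consumer pattern for this helper is a UD responder building a reply path from a received completion. A hedged sketch, assuming grh points at the GRH that precedes the payload when IB_WC_GRH is set (the function name is hypothetical):

        static int example_reply_ah(struct ib_pd *pd, struct ib_wc *wc,
                                    struct ib_grh *grh, u8 port_num)
        {
                struct ib_ah *ah;

                ah = ib_create_ah_from_wc(pd, wc, grh, port_num);
                if (IS_ERR(ah))
                        return PTR_ERR(ah);

                /* ... post the reply send using ah ... */

                return ib_destroy_ah(ah);
        }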
+
+int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+ return ah->device->modify_ah ?
+ ah->device->modify_ah(ah, ah_attr) :
+ -ENOSYS;
+}
+EXPORT_SYMBOL(ib_modify_ah);
+
+int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+ return ah->device->query_ah ?
+ ah->device->query_ah(ah, ah_attr) :
+ -ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_ah);
+
+int ib_destroy_ah(struct ib_ah *ah)
+{
+ struct ib_pd *pd;
+ int ret;
+
+ pd = ah->pd;
+ ret = ah->device->destroy_ah(ah);
+ if (!ret)
+ atomic_dec(&pd->usecnt);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_destroy_ah);
+
+/* Shared receive queues */
+
+struct ib_srq *ib_create_srq(struct ib_pd *pd,
+ struct ib_srq_init_attr *srq_init_attr)
+{
+ struct ib_srq *srq;
+
+ if (!pd->device->create_srq)
+ return ERR_PTR(-ENOSYS);
+
+ srq = pd->device->create_srq(pd, srq_init_attr, NULL);
+
+ if (!IS_ERR(srq)) {
+ srq->device = pd->device;
+ srq->pd = pd;
+ srq->uobject = NULL;
+ srq->event_handler = srq_init_attr->event_handler;
+ srq->srq_context = srq_init_attr->srq_context;
+ srq->srq_type = srq_init_attr->srq_type;
+ if (srq->srq_type == IB_SRQT_XRC) {
+ srq->ext.xrc.xrcd = srq_init_attr->ext.xrc.xrcd;
+ srq->ext.xrc.cq = srq_init_attr->ext.xrc.cq;
+ atomic_inc(&srq->ext.xrc.xrcd->usecnt);
+ atomic_inc(&srq->ext.xrc.cq->usecnt);
+ }
+ atomic_inc(&pd->usecnt);
+ atomic_set(&srq->usecnt, 0);
+ }
+
+ return srq;
+}
+EXPORT_SYMBOL(ib_create_srq);
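
A short sketch of filling ib_srq_init_attr for the basic (non-XRC) case handled above; the queue sizes are arbitrary and the caller is hypothetical:

        static struct ib_srq *example_make_srq(struct ib_pd *pd)
        {
                struct ib_srq_init_attr attr = {
                        .attr     = { .max_wr = 256, .max_sge = 1 },
                        .srq_type = IB_SRQT_BASIC,
                };

                return ib_create_srq(pd, &attr);        /* ERR_PTR on failure */
        }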
+
+int ib_modify_srq(struct ib_srq *srq,
+ struct ib_srq_attr *srq_attr,
+ enum ib_srq_attr_mask srq_attr_mask)
+{
+ return srq->device->modify_srq ?
+ srq->device->modify_srq(srq, srq_attr, srq_attr_mask, NULL) :
+ -ENOSYS;
+}
+EXPORT_SYMBOL(ib_modify_srq);
+
+int ib_query_srq(struct ib_srq *srq,
+ struct ib_srq_attr *srq_attr)
+{
+ return srq->device->query_srq ?
+ srq->device->query_srq(srq, srq_attr) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_srq);
+
+int ib_destroy_srq(struct ib_srq *srq)
+{
+ struct ib_pd *pd;
+ enum ib_srq_type srq_type;
+ struct ib_xrcd *uninitialized_var(xrcd);
+ struct ib_cq *uninitialized_var(cq);
+ int ret;
+
+ if (atomic_read(&srq->usecnt))
+ return -EBUSY;
+
+ pd = srq->pd;
+ srq_type = srq->srq_type;
+ if (srq_type == IB_SRQT_XRC) {
+ xrcd = srq->ext.xrc.xrcd;
+ cq = srq->ext.xrc.cq;
+ }
+
+ ret = srq->device->destroy_srq(srq);
+ if (!ret) {
+ atomic_dec(&pd->usecnt);
+ if (srq_type == IB_SRQT_XRC) {
+ atomic_dec(&xrcd->usecnt);
+ atomic_dec(&cq->usecnt);
+ }
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_destroy_srq);
+
+/* Queue pairs */
+
+static void __ib_shared_qp_event_handler(struct ib_event *event, void *context)
+{
+ struct ib_qp *qp = context;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->device->event_handler_lock, flags);
+ list_for_each_entry(event->element.qp, &qp->open_list, open_list)
+ if (event->element.qp->event_handler)
+ event->element.qp->event_handler(event, event->element.qp->qp_context);
+ spin_unlock_irqrestore(&qp->device->event_handler_lock, flags);
+}
+
+static void __ib_insert_xrcd_qp(struct ib_xrcd *xrcd, struct ib_qp *qp)
+{
+ mutex_lock(&xrcd->tgt_qp_mutex);
+ list_add(&qp->xrcd_list, &xrcd->tgt_qp_list);
+ mutex_unlock(&xrcd->tgt_qp_mutex);
+}
+
+static struct ib_qp *__ib_open_qp(struct ib_qp *real_qp,
+ void (*event_handler)(struct ib_event *, void *),
+ void *qp_context)
+{
+ struct ib_qp *qp;
+ unsigned long flags;
+
+ qp = kzalloc(sizeof *qp, GFP_KERNEL);
+ if (!qp)
+ return ERR_PTR(-ENOMEM);
+
+ qp->real_qp = real_qp;
+ atomic_inc(&real_qp->usecnt);
+ qp->device = real_qp->device;
+ qp->event_handler = event_handler;
+ qp->qp_context = qp_context;
+ qp->qp_num = real_qp->qp_num;
+ qp->qp_type = real_qp->qp_type;
+
+ spin_lock_irqsave(&real_qp->device->event_handler_lock, flags);
+ list_add(&qp->open_list, &real_qp->open_list);
+ spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags);
+
+ return qp;
+}
+
+struct ib_qp *ib_open_qp(struct ib_xrcd *xrcd,
+ struct ib_qp_open_attr *qp_open_attr)
+{
+ struct ib_qp *qp, *real_qp;
+
+ if (qp_open_attr->qp_type != IB_QPT_XRC_TGT)
+ return ERR_PTR(-EINVAL);
+
+ qp = ERR_PTR(-EINVAL);
+ mutex_lock(&xrcd->tgt_qp_mutex);
+ list_for_each_entry(real_qp, &xrcd->tgt_qp_list, xrcd_list) {
+ if (real_qp->qp_num == qp_open_attr->qp_num) {
+ qp = __ib_open_qp(real_qp, qp_open_attr->event_handler,
+ qp_open_attr->qp_context);
+ break;
+ }
+ }
+ mutex_unlock(&xrcd->tgt_qp_mutex);
+ return qp;
+}
+EXPORT_SYMBOL(ib_open_qp);
+
+struct ib_qp *ib_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ struct ib_qp *qp, *real_qp;
+ struct ib_device *device;
+
+ device = pd ? pd->device : qp_init_attr->xrcd->device;
+ qp = device->create_qp(pd, qp_init_attr, NULL);
+
+ if (!IS_ERR(qp)) {
+ qp->device = device;
+ qp->real_qp = qp;
+ qp->uobject = NULL;
+ qp->qp_type = qp_init_attr->qp_type;
+
+ atomic_set(&qp->usecnt, 0);
+ if (qp_init_attr->qp_type == IB_QPT_XRC_TGT) {
+ qp->event_handler = __ib_shared_qp_event_handler;
+ qp->qp_context = qp;
+ qp->pd = NULL;
+ qp->send_cq = qp->recv_cq = NULL;
+ qp->srq = NULL;
+ qp->xrcd = qp_init_attr->xrcd;
+ atomic_inc(&qp_init_attr->xrcd->usecnt);
+ INIT_LIST_HEAD(&qp->open_list);
+
+ real_qp = qp;
+ qp = __ib_open_qp(real_qp, qp_init_attr->event_handler,
+ qp_init_attr->qp_context);
+ if (!IS_ERR(qp))
+ __ib_insert_xrcd_qp(qp_init_attr->xrcd, real_qp);
+ else
+ real_qp->device->destroy_qp(real_qp);
+ } else {
+ qp->event_handler = qp_init_attr->event_handler;
+ qp->qp_context = qp_init_attr->qp_context;
+ if (qp_init_attr->qp_type == IB_QPT_XRC_INI) {
+ qp->recv_cq = NULL;
+ qp->srq = NULL;
+ } else {
+ qp->recv_cq = qp_init_attr->recv_cq;
+ atomic_inc(&qp_init_attr->recv_cq->usecnt);
+ qp->srq = qp_init_attr->srq;
+ if (qp->srq)
+ atomic_inc(&qp_init_attr->srq->usecnt);
+ }
+
+ qp->pd = pd;
+ qp->send_cq = qp_init_attr->send_cq;
+ qp->xrcd = NULL;
+
+ atomic_inc(&pd->usecnt);
+ atomic_inc(&qp_init_attr->send_cq->usecnt);
+ }
+ }
+
+ return qp;
+}
+EXPORT_SYMBOL(ib_create_qp);
+
+static const struct {
+ int valid;
+ enum ib_qp_attr_mask req_param[IB_QPT_MAX];
+ enum ib_qp_attr_mask req_param_add_eth[IB_QPT_MAX];
+ enum ib_qp_attr_mask opt_param[IB_QPT_MAX];
+ enum ib_qp_attr_mask opt_param_add_eth[IB_QPT_MAX];
+} qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
+ [IB_QPS_RESET] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_INIT] = {
+ .valid = 1,
+ .req_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_QKEY),
+ [IB_QPT_RAW_PACKET] = IB_QP_PORT,
+ [IB_QPT_UC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_RC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ }
+ },
+ },
+ [IB_QPS_INIT] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_INIT] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_RC] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_XRC_INI] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_XRC_TGT] = (IB_QP_PKEY_INDEX |
+ IB_QP_PORT |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ }
+ },
+ [IB_QPS_RTR] = {
+ .valid = 1,
+ .req_param = {
+ [IB_QPT_UC] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN),
+ [IB_QPT_RC] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_MIN_RNR_TIMER),
+ [IB_QPT_XRC_INI] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN),
+ [IB_QPT_XRC_TGT] = (IB_QP_AV |
+ IB_QP_PATH_MTU |
+ IB_QP_DEST_QPN |
+ IB_QP_RQ_PSN |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_MIN_RNR_TIMER),
+ },
+ .req_param_add_eth = {
+ [IB_QPT_RC] = (IB_QP_SMAC),
+ [IB_QPT_UC] = (IB_QP_SMAC),
+ [IB_QPT_XRC_INI] = (IB_QP_SMAC),
+ [IB_QPT_XRC_TGT] = (IB_QP_SMAC)
+ },
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [IB_QPT_RC] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [IB_QPT_XRC_INI] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [IB_QPT_XRC_TGT] = (IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ },
+ .opt_param_add_eth = {
+ [IB_QPT_RC] = (IB_QP_ALT_SMAC |
+ IB_QP_VID |
+ IB_QP_ALT_VID),
+ [IB_QPT_UC] = (IB_QP_ALT_SMAC |
+ IB_QP_VID |
+ IB_QP_ALT_VID),
+ [IB_QPT_XRC_INI] = (IB_QP_ALT_SMAC |
+ IB_QP_VID |
+ IB_QP_ALT_VID),
+ [IB_QPT_XRC_TGT] = (IB_QP_ALT_SMAC |
+ IB_QP_VID |
+ IB_QP_ALT_VID)
+ }
+ }
+ },
+ [IB_QPS_RTR] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .req_param = {
+ [IB_QPT_UD] = IB_QP_SQ_PSN,
+ [IB_QPT_UC] = IB_QP_SQ_PSN,
+ [IB_QPT_RC] = (IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_SQ_PSN |
+ IB_QP_MAX_QP_RD_ATOMIC),
+ [IB_QPT_XRC_INI] = (IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_SQ_PSN |
+ IB_QP_MAX_QP_RD_ATOMIC),
+ [IB_QPT_XRC_TGT] = (IB_QP_TIMEOUT |
+ IB_QP_SQ_PSN),
+ [IB_QPT_SMI] = IB_QP_SQ_PSN,
+ [IB_QPT_GSI] = IB_QP_SQ_PSN,
+ },
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_RTS] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE |
+ IB_QP_MIN_RNR_TIMER),
+ [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_ALT_PATH |
+ IB_QP_PATH_MIG_STATE |
+ IB_QP_MIN_RNR_TIMER),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ }
+ },
+ [IB_QPS_SQD] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_UC] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_RC] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_XRC_INI] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_XRC_TGT] = IB_QP_EN_SQD_ASYNC_NOTIFY, /* ??? */
+ [IB_QPT_SMI] = IB_QP_EN_SQD_ASYNC_NOTIFY,
+ [IB_QPT_GSI] = IB_QP_EN_SQD_ASYNC_NOTIFY
+ }
+ },
+ },
+ [IB_QPS_SQD] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_INI] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_TGT] = (IB_QP_CUR_STATE |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ }
+ },
+ [IB_QPS_SQD] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_AV |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_RC] = (IB_QP_PORT |
+ IB_QP_AV |
+ IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_MAX_QP_RD_ATOMIC |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_INI] = (IB_QP_PORT |
+ IB_QP_AV |
+ IB_QP_TIMEOUT |
+ IB_QP_RETRY_CNT |
+ IB_QP_RNR_RETRY |
+ IB_QP_MAX_QP_RD_ATOMIC |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_XRC_TGT] = (IB_QP_PORT |
+ IB_QP_AV |
+ IB_QP_TIMEOUT |
+ IB_QP_MAX_DEST_RD_ATOMIC |
+ IB_QP_ALT_PATH |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PKEY_INDEX |
+ IB_QP_MIN_RNR_TIMER |
+ IB_QP_PATH_MIG_STATE),
+ [IB_QPT_SMI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_PKEY_INDEX |
+ IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_SQE] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 },
+ [IB_QPS_RTS] = {
+ .valid = 1,
+ .opt_param = {
+ [IB_QPT_UD] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_UC] = (IB_QP_CUR_STATE |
+ IB_QP_ACCESS_FLAGS),
+ [IB_QPT_SMI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ [IB_QPT_GSI] = (IB_QP_CUR_STATE |
+ IB_QP_QKEY),
+ }
+ }
+ },
+ [IB_QPS_ERR] = {
+ [IB_QPS_RESET] = { .valid = 1 },
+ [IB_QPS_ERR] = { .valid = 1 }
+ }
+};
+
+int ib_modify_qp_is_ok(enum ib_qp_state cur_state, enum ib_qp_state next_state,
+ enum ib_qp_type type, enum ib_qp_attr_mask mask,
+ enum rdma_link_layer ll)
+{
+ enum ib_qp_attr_mask req_param, opt_param;
+
+ if (cur_state < 0 || cur_state > IB_QPS_ERR ||
+ next_state < 0 || next_state > IB_QPS_ERR)
+ return 0;
+
+ if (mask & IB_QP_CUR_STATE &&
+ cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS &&
+ cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE)
+ return 0;
+
+ if (!qp_state_table[cur_state][next_state].valid)
+ return 0;
+
+ req_param = qp_state_table[cur_state][next_state].req_param[type];
+ opt_param = qp_state_table[cur_state][next_state].opt_param[type];
+
+ if (ll == IB_LINK_LAYER_ETHERNET) {
+ req_param |= qp_state_table[cur_state][next_state].
+ req_param_add_eth[type];
+ opt_param |= qp_state_table[cur_state][next_state].
+ opt_param_add_eth[type];
+ }
+
+ if ((mask & req_param) != req_param)
+ return 0;
+
+ if (mask & ~(req_param | opt_param | IB_QP_STATE))
+ return 0;
+
+ return 1;
+}
+EXPORT_SYMBOL(ib_modify_qp_is_ok);
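
Low-level drivers consult the state table through this helper from their modify_qp entry point. A simplified sketch (hypothetical driver function; port selection, locking, and the actual hardware programming are omitted):

        static int example_drv_modify_qp(struct ib_qp *qp, int attr_mask,
                                         enum ib_qp_state cur_state,
                                         enum ib_qp_state new_state, u8 port)
        {
                enum rdma_link_layer ll =
                        rdma_port_get_link_layer(qp->device, port);

                if (!ib_modify_qp_is_ok(cur_state, new_state, qp->qp_type,
                                        attr_mask, ll))
                        return -EINVAL;

                /* ... program the hardware for the cur_state -> new_state move ... */
                return 0;
        }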
+
+int ib_resolve_eth_l2_attrs(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr, int *qp_attr_mask)
+{
+ int ret = 0;
+ union ib_gid sgid;
+
+ if ((*qp_attr_mask & IB_QP_AV) &&
+ (rdma_port_get_link_layer(qp->device, qp_attr->ah_attr.port_num) == IB_LINK_LAYER_ETHERNET)) {
+ ret = ib_query_gid(qp->device, qp_attr->ah_attr.port_num,
+ qp_attr->ah_attr.grh.sgid_index, &sgid);
+ if (ret)
+ goto out;
+ if (rdma_link_local_addr((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw)) {
+ rdma_get_ll_mac((struct in6_addr *)qp_attr->ah_attr.grh.dgid.raw, qp_attr->ah_attr.dmac);
+ rdma_get_ll_mac((struct in6_addr *)sgid.raw, qp_attr->smac);
+ if (!(*qp_attr_mask & IB_QP_VID))
+ qp_attr->vlan_id = rdma_get_vlan_id(&sgid);
+ } else {
+ ret = rdma_addr_find_dmac_by_grh(&sgid, &qp_attr->ah_attr.grh.dgid,
+ qp_attr->ah_attr.dmac, &qp_attr->vlan_id);
+ if (ret)
+ goto out;
+ ret = rdma_addr_find_smac_by_sgid(&sgid, qp_attr->smac, NULL);
+ if (ret)
+ goto out;
+ }
+ *qp_attr_mask |= IB_QP_SMAC;
+ if (qp_attr->vlan_id < 0xFFFF)
+ *qp_attr_mask |= IB_QP_VID;
+ }
+out:
+ return ret;
+}
+EXPORT_SYMBOL(ib_resolve_eth_l2_attrs);
+
+
+int ib_modify_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask)
+{
+ int ret;
+
+ ret = ib_resolve_eth_l2_attrs(qp, qp_attr, &qp_attr_mask);
+ if (ret)
+ return ret;
+
+ return qp->device->modify_qp(qp->real_qp, qp_attr, qp_attr_mask, NULL);
+}
+EXPORT_SYMBOL(ib_modify_qp);
+
+int ib_query_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ return qp->device->query_qp ?
+ qp->device->query_qp(qp->real_qp, qp_attr, qp_attr_mask, qp_init_attr) :
+ -ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_qp);
+
+int ib_close_qp(struct ib_qp *qp)
+{
+ struct ib_qp *real_qp;
+ unsigned long flags;
+
+ real_qp = qp->real_qp;
+ if (real_qp == qp)
+ return -EINVAL;
+
+ spin_lock_irqsave(&real_qp->device->event_handler_lock, flags);
+ list_del(&qp->open_list);
+ spin_unlock_irqrestore(&real_qp->device->event_handler_lock, flags);
+
+ atomic_dec(&real_qp->usecnt);
+ kfree(qp);
+
+ return 0;
+}
+EXPORT_SYMBOL(ib_close_qp);
+
+static int __ib_destroy_shared_qp(struct ib_qp *qp)
+{
+ struct ib_xrcd *xrcd;
+ struct ib_qp *real_qp;
+ int ret;
+
+ real_qp = qp->real_qp;
+ xrcd = real_qp->xrcd;
+
+ mutex_lock(&xrcd->tgt_qp_mutex);
+ ib_close_qp(qp);
+ if (atomic_read(&real_qp->usecnt) == 0)
+ list_del(&real_qp->xrcd_list);
+ else
+ real_qp = NULL;
+ mutex_unlock(&xrcd->tgt_qp_mutex);
+
+ if (real_qp) {
+ ret = ib_destroy_qp(real_qp);
+ if (!ret)
+ atomic_dec(&xrcd->usecnt);
+ else
+ __ib_insert_xrcd_qp(xrcd, real_qp);
+ }
+
+ return 0;
+}
+
+int ib_destroy_qp(struct ib_qp *qp)
+{
+ struct ib_pd *pd;
+ struct ib_cq *scq, *rcq;
+ struct ib_srq *srq;
+ int ret;
+
+ if (atomic_read(&qp->usecnt))
+ return -EBUSY;
+
+ if (qp->real_qp != qp)
+ return __ib_destroy_shared_qp(qp);
+
+ pd = qp->pd;
+ scq = qp->send_cq;
+ rcq = qp->recv_cq;
+ srq = qp->srq;
+
+ ret = qp->device->destroy_qp(qp);
+ if (!ret) {
+ if (pd)
+ atomic_dec(&pd->usecnt);
+ if (scq)
+ atomic_dec(&scq->usecnt);
+ if (rcq)
+ atomic_dec(&rcq->usecnt);
+ if (srq)
+ atomic_dec(&srq->usecnt);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_destroy_qp);
+
+/* Completion queues */
+
+struct ib_cq *ib_create_cq(struct ib_device *device,
+ ib_comp_handler comp_handler,
+ void (*event_handler)(struct ib_event *, void *),
+ void *cq_context, int cqe, int comp_vector)
+{
+ struct ib_cq *cq;
+
+ cq = device->create_cq(device, cqe, comp_vector, NULL, NULL);
+
+ if (!IS_ERR(cq)) {
+ cq->device = device;
+ cq->uobject = NULL;
+ cq->comp_handler = comp_handler;
+ cq->event_handler = event_handler;
+ cq->cq_context = cq_context;
+ atomic_set(&cq->usecnt, 0);
+ }
+
+ return cq;
+}
+EXPORT_SYMBOL(ib_create_cq);
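
A minimal sketch of the allocation above from a consumer's side; the callbacks are optional, and the CQE count and completion vector here are arbitrary choices, not requirements:

        static struct ib_cq *example_make_cq(struct ib_device *device, void *ctx,
                                             ib_comp_handler handler)
        {
                /* 1024 entries on completion vector 0; ERR_PTR on failure */
                return ib_create_cq(device, handler, NULL, ctx, 1024, 0);
        }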
+
+int ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
+{
+ return cq->device->modify_cq ?
+ cq->device->modify_cq(cq, cq_count, cq_period) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_modify_cq);
+
+int ib_destroy_cq(struct ib_cq *cq)
+{
+ if (atomic_read(&cq->usecnt))
+ return -EBUSY;
+
+ return cq->device->destroy_cq(cq);
+}
+EXPORT_SYMBOL(ib_destroy_cq);
+
+int ib_resize_cq(struct ib_cq *cq, int cqe)
+{
+ return cq->device->resize_cq ?
+ cq->device->resize_cq(cq, cqe, NULL) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_resize_cq);
+
+/* Memory regions */
+
+struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
+{
+ struct ib_mr *mr;
+ int err;
+
+ err = ib_check_mr_access(mr_access_flags);
+ if (err)
+ return ERR_PTR(err);
+
+ mr = pd->device->get_dma_mr(pd, mr_access_flags);
+
+ if (!IS_ERR(mr)) {
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->uobject = NULL;
+ atomic_inc(&pd->usecnt);
+ atomic_set(&mr->usecnt, 0);
+ }
+
+ return mr;
+}
+EXPORT_SYMBOL(ib_get_dma_mr);
+
+struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ int mr_access_flags,
+ u64 *iova_start)
+{
+ struct ib_mr *mr;
+ int err;
+
+ err = ib_check_mr_access(mr_access_flags);
+ if (err)
+ return ERR_PTR(err);
+
+ if (!pd->device->reg_phys_mr)
+ return ERR_PTR(-ENOSYS);
+
+ mr = pd->device->reg_phys_mr(pd, phys_buf_array, num_phys_buf,
+ mr_access_flags, iova_start);
+
+ if (!IS_ERR(mr)) {
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->uobject = NULL;
+ atomic_inc(&pd->usecnt);
+ atomic_set(&mr->usecnt, 0);
+ }
+
+ return mr;
+}
+EXPORT_SYMBOL(ib_reg_phys_mr);
+
+int ib_rereg_phys_mr(struct ib_mr *mr,
+ int mr_rereg_mask,
+ struct ib_pd *pd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ int mr_access_flags,
+ u64 *iova_start)
+{
+ struct ib_pd *old_pd;
+ int ret;
+
+ ret = ib_check_mr_access(mr_access_flags);
+ if (ret)
+ return ret;
+
+ if (!mr->device->rereg_phys_mr)
+ return -ENOSYS;
+
+ if (atomic_read(&mr->usecnt))
+ return -EBUSY;
+
+ old_pd = mr->pd;
+
+ ret = mr->device->rereg_phys_mr(mr, mr_rereg_mask, pd,
+ phys_buf_array, num_phys_buf,
+ mr_access_flags, iova_start);
+
+ if (!ret && (mr_rereg_mask & IB_MR_REREG_PD)) {
+ atomic_dec(&old_pd->usecnt);
+ atomic_inc(&pd->usecnt);
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_rereg_phys_mr);
+
+int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
+{
+ return mr->device->query_mr ?
+ mr->device->query_mr(mr, mr_attr) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_query_mr);
+
+int ib_dereg_mr(struct ib_mr *mr)
+{
+ struct ib_pd *pd;
+ int ret;
+
+ if (atomic_read(&mr->usecnt))
+ return -EBUSY;
+
+ pd = mr->pd;
+ ret = mr->device->dereg_mr(mr);
+ if (!ret)
+ atomic_dec(&pd->usecnt);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_dereg_mr);
+
+struct ib_mr *ib_create_mr(struct ib_pd *pd,
+ struct ib_mr_init_attr *mr_init_attr)
+{
+ struct ib_mr *mr;
+
+ if (!pd->device->create_mr)
+ return ERR_PTR(-ENOSYS);
+
+ mr = pd->device->create_mr(pd, mr_init_attr);
+
+ if (!IS_ERR(mr)) {
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->uobject = NULL;
+ atomic_inc(&pd->usecnt);
+ atomic_set(&mr->usecnt, 0);
+ }
+
+ return mr;
+}
+EXPORT_SYMBOL(ib_create_mr);
+
+int ib_destroy_mr(struct ib_mr *mr)
+{
+ struct ib_pd *pd;
+ int ret;
+
+ if (atomic_read(&mr->usecnt))
+ return -EBUSY;
+
+ pd = mr->pd;
+ ret = mr->device->destroy_mr(mr);
+ if (!ret)
+ atomic_dec(&pd->usecnt);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_destroy_mr);
+
+struct ib_mr *ib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
+{
+ struct ib_mr *mr;
+
+ if (!pd->device->alloc_fast_reg_mr)
+ return ERR_PTR(-ENOSYS);
+
+ mr = pd->device->alloc_fast_reg_mr(pd, max_page_list_len);
+
+ if (!IS_ERR(mr)) {
+ mr->device = pd->device;
+ mr->pd = pd;
+ mr->uobject = NULL;
+ atomic_inc(&pd->usecnt);
+ atomic_set(&mr->usecnt, 0);
+ }
+
+ return mr;
+}
+EXPORT_SYMBOL(ib_alloc_fast_reg_mr);
+
+struct ib_fast_reg_page_list *ib_alloc_fast_reg_page_list(struct ib_device *device,
+ int max_page_list_len)
+{
+ struct ib_fast_reg_page_list *page_list;
+
+ if (!device->alloc_fast_reg_page_list)
+ return ERR_PTR(-ENOSYS);
+
+ page_list = device->alloc_fast_reg_page_list(device, max_page_list_len);
+
+ if (!IS_ERR(page_list)) {
+ page_list->device = device;
+ page_list->max_page_list_len = max_page_list_len;
+ }
+
+ return page_list;
+}
+EXPORT_SYMBOL(ib_alloc_fast_reg_page_list);
+
+void ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
+{
+ page_list->device->free_fast_reg_page_list(page_list);
+}
+EXPORT_SYMBOL(ib_free_fast_reg_page_list);
+
+/* Memory windows */
+
+struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
+{
+ struct ib_mw *mw;
+
+ if (!pd->device->alloc_mw)
+ return ERR_PTR(-ENOSYS);
+
+ mw = pd->device->alloc_mw(pd, type);
+ if (!IS_ERR(mw)) {
+ mw->device = pd->device;
+ mw->pd = pd;
+ mw->uobject = NULL;
+ mw->type = type;
+ atomic_inc(&pd->usecnt);
+ }
+
+ return mw;
+}
+EXPORT_SYMBOL(ib_alloc_mw);
+
+int ib_dealloc_mw(struct ib_mw *mw)
+{
+ struct ib_pd *pd;
+ int ret;
+
+ pd = mw->pd;
+ ret = mw->device->dealloc_mw(mw);
+ if (!ret)
+ atomic_dec(&pd->usecnt);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_dealloc_mw);
+
+/* "Fast" memory regions */
+
+struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd,
+ int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr)
+{
+ struct ib_fmr *fmr;
+
+ if (!pd->device->alloc_fmr)
+ return ERR_PTR(-ENOSYS);
+
+ fmr = pd->device->alloc_fmr(pd, mr_access_flags, fmr_attr);
+ if (!IS_ERR(fmr)) {
+ fmr->device = pd->device;
+ fmr->pd = pd;
+ atomic_inc(&pd->usecnt);
+ }
+
+ return fmr;
+}
+EXPORT_SYMBOL(ib_alloc_fmr);
+
+int ib_unmap_fmr(struct list_head *fmr_list)
+{
+ struct ib_fmr *fmr;
+
+ if (list_empty(fmr_list))
+ return 0;
+
+ fmr = list_entry(fmr_list->next, struct ib_fmr, list);
+ return fmr->device->unmap_fmr(fmr_list);
+}
+EXPORT_SYMBOL(ib_unmap_fmr);
+
+int ib_dealloc_fmr(struct ib_fmr *fmr)
+{
+ struct ib_pd *pd;
+ int ret;
+
+ pd = fmr->pd;
+ ret = fmr->device->dealloc_fmr(fmr);
+ if (!ret)
+ atomic_dec(&pd->usecnt);
+
+ return ret;
+}
+EXPORT_SYMBOL(ib_dealloc_fmr);
+
+/* Multicast groups */
+
+int ib_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
+{
+ int ret;
+
+ if (!qp->device->attach_mcast)
+ return -ENOSYS;
+ if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD)
+ return -EINVAL;
+
+ ret = qp->device->attach_mcast(qp, gid, lid);
+ if (!ret)
+ atomic_inc(&qp->usecnt);
+ return ret;
+}
+EXPORT_SYMBOL(ib_attach_mcast);
+
+int ib_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid)
+{
+ int ret;
+
+ if (!qp->device->detach_mcast)
+ return -ENOSYS;
+ if (gid->raw[0] != 0xff || qp->qp_type != IB_QPT_UD)
+ return -EINVAL;
+
+ ret = qp->device->detach_mcast(qp, gid, lid);
+ if (!ret)
+ atomic_dec(&qp->usecnt);
+ return ret;
+}
+EXPORT_SYMBOL(ib_detach_mcast);
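
From a consumer's point of view, the two checks above mean that only UD QPs may join and only true multicast GIDs (first byte 0xff) are accepted; anything else fails with -EINVAL before the driver is called. A small hypothetical join/leave sketch:

        static int example_join(struct ib_qp *qp, union ib_gid *mgid, u16 mlid)
        {
                int ret;

                ret = ib_attach_mcast(qp, mgid, mlid);
                if (ret)
                        return ret;

                /* ... receive multicast traffic on qp ... */

                return ib_detach_mcast(qp, mgid, mlid);
        }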
+
+struct ib_xrcd *ib_alloc_xrcd(struct ib_device *device)
+{
+ struct ib_xrcd *xrcd;
+
+ if (!device->alloc_xrcd)
+ return ERR_PTR(-ENOSYS);
+
+ xrcd = device->alloc_xrcd(device, NULL, NULL);
+ if (!IS_ERR(xrcd)) {
+ xrcd->device = device;
+ xrcd->inode = NULL;
+ atomic_set(&xrcd->usecnt, 0);
+ mutex_init(&xrcd->tgt_qp_mutex);
+ INIT_LIST_HEAD(&xrcd->tgt_qp_list);
+ }
+
+ return xrcd;
+}
+EXPORT_SYMBOL(ib_alloc_xrcd);
+
+int ib_dealloc_xrcd(struct ib_xrcd *xrcd)
+{
+ struct ib_qp *qp;
+ int ret;
+
+ if (atomic_read(&xrcd->usecnt))
+ return -EBUSY;
+
+ while (!list_empty(&xrcd->tgt_qp_list)) {
+ qp = list_entry(xrcd->tgt_qp_list.next, struct ib_qp, xrcd_list);
+ ret = ib_destroy_qp(qp);
+ if (ret)
+ return ret;
+ }
+
+ return xrcd->device->dealloc_xrcd(xrcd);
+}
+EXPORT_SYMBOL(ib_dealloc_xrcd);
+
+struct ib_flow *ib_create_flow(struct ib_qp *qp,
+ struct ib_flow_attr *flow_attr,
+ int domain)
+{
+ struct ib_flow *flow_id;
+ if (!qp->device->create_flow)
+ return ERR_PTR(-ENOSYS);
+
+ flow_id = qp->device->create_flow(qp, flow_attr, domain);
+ if (!IS_ERR(flow_id))
+ atomic_inc(&qp->usecnt);
+ return flow_id;
+}
+EXPORT_SYMBOL(ib_create_flow);
+
+int ib_destroy_flow(struct ib_flow *flow_id)
+{
+ int err;
+ struct ib_qp *qp = flow_id->qp;
+
+ err = qp->device->destroy_flow(flow_id);
+ if (!err)
+ atomic_dec(&qp->usecnt);
+ return err;
+}
+EXPORT_SYMBOL(ib_destroy_flow);
+
+int ib_check_mr_status(struct ib_mr *mr, u32 check_mask,
+ struct ib_mr_status *mr_status)
+{
+ return mr->device->check_mr_status ?
+ mr->device->check_mr_status(mr, check_mask, mr_status) : -ENOSYS;
+}
+EXPORT_SYMBOL(ib_check_mr_status);
diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile
new file mode 100644
index 000000000..e900b0353
--- /dev/null
+++ b/drivers/infiniband/hw/Makefile
@@ -0,0 +1,12 @@
+obj-$(CONFIG_INFINIBAND_MTHCA) += mthca/
+obj-$(CONFIG_INFINIBAND_IPATH) += ipath/
+obj-$(CONFIG_INFINIBAND_QIB) += qib/
+obj-$(CONFIG_INFINIBAND_EHCA) += ehca/
+obj-$(CONFIG_INFINIBAND_AMSO1100) += amso1100/
+obj-$(CONFIG_INFINIBAND_CXGB3) += cxgb3/
+obj-$(CONFIG_INFINIBAND_CXGB4) += cxgb4/
+obj-$(CONFIG_MLX4_INFINIBAND) += mlx4/
+obj-$(CONFIG_MLX5_INFINIBAND) += mlx5/
+obj-$(CONFIG_INFINIBAND_NES) += nes/
+obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma/
+obj-$(CONFIG_INFINIBAND_USNIC) += usnic/
diff --git a/drivers/infiniband/hw/amso1100/Kbuild b/drivers/infiniband/hw/amso1100/Kbuild
new file mode 100644
index 000000000..950dfabcd
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/Kbuild
@@ -0,0 +1,6 @@
+ccflags-$(CONFIG_INFINIBAND_AMSO1100_DEBUG) := -DDEBUG
+
+obj-$(CONFIG_INFINIBAND_AMSO1100) += iw_c2.o
+
+iw_c2-y := c2.o c2_provider.o c2_rnic.o c2_alloc.o c2_mq.o c2_ae.o c2_vq.o \
+ c2_intr.o c2_cq.o c2_qp.o c2_cm.o c2_mm.o c2_pd.o
diff --git a/drivers/infiniband/hw/amso1100/Kconfig b/drivers/infiniband/hw/amso1100/Kconfig
new file mode 100644
index 000000000..e6ce5f209
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/Kconfig
@@ -0,0 +1,15 @@
+config INFINIBAND_AMSO1100
+ tristate "Ammasso 1100 HCA support"
+ depends on PCI && INET
+ ---help---
+ This is a low-level driver for the Ammasso 1100 host
+ channel adapter (HCA).
+
+config INFINIBAND_AMSO1100_DEBUG
+ bool "Verbose debugging output"
+ depends on INFINIBAND_AMSO1100
+ default n
+ ---help---
+ This option causes the amso1100 driver to produce a bunch of
+ debug messages. Select this if you are developing the driver
+ or trying to diagnose a problem.
diff --git a/drivers/infiniband/hw/amso1100/c2.c b/drivers/infiniband/hw/amso1100/c2.c
new file mode 100644
index 000000000..766a71cce
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2.c
@@ -0,0 +1,1241 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/ethtool.h>
+#include <linux/mii.h>
+#include <linux/if_vlan.h>
+#include <linux/crc32.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/init.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/prefetch.h>
+
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/byteorder.h>
+
+#include <rdma/ib_smi.h>
+#include "c2.h"
+#include "c2_provider.h"
+
+MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
+MODULE_DESCRIPTION("Ammasso AMSO1100 Low-level iWARP Driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK
+ | NETIF_MSG_IFUP | NETIF_MSG_IFDOWN;
+
+static int debug = -1; /* defaults above */
+module_param(debug, int, 0);
+MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all)");
+
+static int c2_up(struct net_device *netdev);
+static int c2_down(struct net_device *netdev);
+static int c2_xmit_frame(struct sk_buff *skb, struct net_device *netdev);
+static void c2_tx_interrupt(struct net_device *netdev);
+static void c2_rx_interrupt(struct net_device *netdev);
+static irqreturn_t c2_interrupt(int irq, void *dev_id);
+static void c2_tx_timeout(struct net_device *netdev);
+static int c2_change_mtu(struct net_device *netdev, int new_mtu);
+static void c2_reset(struct c2_port *c2_port);
+
+static struct pci_device_id c2_pci_table[] = {
+ { PCI_DEVICE(0x18b8, 0xb001) },
+ { 0 }
+};
+
+MODULE_DEVICE_TABLE(pci, c2_pci_table);
+
+static void c2_print_macaddr(struct net_device *netdev)
+{
+ pr_debug("%s: MAC %pM, IRQ %u\n", netdev->name, netdev->dev_addr, netdev->irq);
+}
+
+static void c2_set_rxbufsize(struct c2_port *c2_port)
+{
+ struct net_device *netdev = c2_port->netdev;
+
+ if (netdev->mtu > RX_BUF_SIZE)
+ c2_port->rx_buf_size =
+ netdev->mtu + ETH_HLEN + sizeof(struct c2_rxp_hdr) +
+ NET_IP_ALIGN;
+ else
+ c2_port->rx_buf_size = sizeof(struct c2_rxp_hdr) + RX_BUF_SIZE;
+}
+
+/*
+ * Allocate TX ring elements and chain them together.
+ * One-to-one association of adapter descriptors with ring elements.
+ */
+static int c2_tx_ring_alloc(struct c2_ring *tx_ring, void *vaddr,
+ dma_addr_t base, void __iomem * mmio_txp_ring)
+{
+ struct c2_tx_desc *tx_desc;
+ struct c2_txp_desc __iomem *txp_desc;
+ struct c2_element *elem;
+ int i;
+
+ tx_ring->start = kmalloc(sizeof(*elem) * tx_ring->count, GFP_KERNEL);
+ if (!tx_ring->start)
+ return -ENOMEM;
+
+ elem = tx_ring->start;
+ tx_desc = vaddr;
+ txp_desc = mmio_txp_ring;
+ for (i = 0; i < tx_ring->count; i++, elem++, tx_desc++, txp_desc++) {
+ tx_desc->len = 0;
+ tx_desc->status = 0;
+
+ /* Set TXP_HTXD_UNINIT */
+ __raw_writeq((__force u64) cpu_to_be64(0x1122334455667788ULL),
+ (void __iomem *) txp_desc + C2_TXP_ADDR);
+ __raw_writew(0, (void __iomem *) txp_desc + C2_TXP_LEN);
+ __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_UNINIT),
+ (void __iomem *) txp_desc + C2_TXP_FLAGS);
+
+ elem->skb = NULL;
+ elem->ht_desc = tx_desc;
+ elem->hw_desc = txp_desc;
+
+ if (i == tx_ring->count - 1) {
+ elem->next = tx_ring->start;
+ tx_desc->next_offset = base;
+ } else {
+ elem->next = elem + 1;
+ tx_desc->next_offset =
+ base + (i + 1) * sizeof(*tx_desc);
+ }
+ }
+
+ tx_ring->to_use = tx_ring->to_clean = tx_ring->start;
+
+ return 0;
+}
+
+/*
+ * Allocate RX ring elements and chain them together.
+ * One-to-one association of adapter descriptors with ring elements.
+ */
+static int c2_rx_ring_alloc(struct c2_ring *rx_ring, void *vaddr,
+ dma_addr_t base, void __iomem * mmio_rxp_ring)
+{
+ struct c2_rx_desc *rx_desc;
+ struct c2_rxp_desc __iomem *rxp_desc;
+ struct c2_element *elem;
+ int i;
+
+ rx_ring->start = kmalloc(sizeof(*elem) * rx_ring->count, GFP_KERNEL);
+ if (!rx_ring->start)
+ return -ENOMEM;
+
+ elem = rx_ring->start;
+ rx_desc = vaddr;
+ rxp_desc = mmio_rxp_ring;
+ for (i = 0; i < rx_ring->count; i++, elem++, rx_desc++, rxp_desc++) {
+ rx_desc->len = 0;
+ rx_desc->status = 0;
+
+ /* Set RXP_HRXD_UNINIT */
+ __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_OK),
+ (void __iomem *) rxp_desc + C2_RXP_STATUS);
+ __raw_writew(0, (void __iomem *) rxp_desc + C2_RXP_COUNT);
+ __raw_writew(0, (void __iomem *) rxp_desc + C2_RXP_LEN);
+ __raw_writeq((__force u64) cpu_to_be64(0x99aabbccddeeffULL),
+ (void __iomem *) rxp_desc + C2_RXP_ADDR);
+ __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_UNINIT),
+ (void __iomem *) rxp_desc + C2_RXP_FLAGS);
+
+ elem->skb = NULL;
+ elem->ht_desc = rx_desc;
+ elem->hw_desc = rxp_desc;
+
+ if (i == rx_ring->count - 1) {
+ elem->next = rx_ring->start;
+ rx_desc->next_offset = base;
+ } else {
+ elem->next = elem + 1;
+ rx_desc->next_offset =
+ base + (i + 1) * sizeof(*rx_desc);
+ }
+ }
+
+ rx_ring->to_use = rx_ring->to_clean = rx_ring->start;
+
+ return 0;
+}
+
+/* Setup buffer for receiving */
+static inline int c2_rx_alloc(struct c2_port *c2_port, struct c2_element *elem)
+{
+ struct c2_dev *c2dev = c2_port->c2dev;
+ struct c2_rx_desc *rx_desc = elem->ht_desc;
+ struct sk_buff *skb;
+ dma_addr_t mapaddr;
+ u32 maplen;
+ struct c2_rxp_hdr *rxp_hdr;
+
+ skb = dev_alloc_skb(c2_port->rx_buf_size);
+ if (unlikely(!skb)) {
+ pr_debug("%s: out of memory for receive\n",
+ c2_port->netdev->name);
+ return -ENOMEM;
+ }
+
+ /* Zero out the rxp hdr in the sk_buff */
+ memset(skb->data, 0, sizeof(*rxp_hdr));
+
+ skb->dev = c2_port->netdev;
+
+ maplen = c2_port->rx_buf_size;
+ mapaddr =
+ pci_map_single(c2dev->pcidev, skb->data, maplen,
+ PCI_DMA_FROMDEVICE);
+
+ /* Set the sk_buff RXP_header to RXP_HRXD_READY */
+ rxp_hdr = (struct c2_rxp_hdr *) skb->data;
+ rxp_hdr->flags = RXP_HRXD_READY;
+
+ __raw_writew(0, elem->hw_desc + C2_RXP_STATUS);
+ __raw_writew((__force u16) cpu_to_be16((u16) maplen - sizeof(*rxp_hdr)),
+ elem->hw_desc + C2_RXP_LEN);
+ __raw_writeq((__force u64) cpu_to_be64(mapaddr), elem->hw_desc + C2_RXP_ADDR);
+ __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY),
+ elem->hw_desc + C2_RXP_FLAGS);
+
+ elem->skb = skb;
+ elem->mapaddr = mapaddr;
+ elem->maplen = maplen;
+ rx_desc->len = maplen;
+
+ return 0;
+}
+
+/*
+ * Allocate buffers for the Rx ring
+ * For receive: rx_ring.to_clean is next received frame
+ */
+static int c2_rx_fill(struct c2_port *c2_port)
+{
+ struct c2_ring *rx_ring = &c2_port->rx_ring;
+ struct c2_element *elem;
+ int ret = 0;
+
+ elem = rx_ring->start;
+ do {
+ if (c2_rx_alloc(c2_port, elem)) {
+ ret = 1;
+ break;
+ }
+ } while ((elem = elem->next) != rx_ring->start);
+
+ rx_ring->to_clean = rx_ring->start;
+ return ret;
+}
+
+/* Free all buffers in RX ring, assumes receiver stopped */
+static void c2_rx_clean(struct c2_port *c2_port)
+{
+ struct c2_dev *c2dev = c2_port->c2dev;
+ struct c2_ring *rx_ring = &c2_port->rx_ring;
+ struct c2_element *elem;
+ struct c2_rx_desc *rx_desc;
+
+ elem = rx_ring->start;
+ do {
+ rx_desc = elem->ht_desc;
+ rx_desc->len = 0;
+
+ __raw_writew(0, elem->hw_desc + C2_RXP_STATUS);
+ __raw_writew(0, elem->hw_desc + C2_RXP_COUNT);
+ __raw_writew(0, elem->hw_desc + C2_RXP_LEN);
+ __raw_writeq((__force u64) cpu_to_be64(0x99aabbccddeeffULL),
+ elem->hw_desc + C2_RXP_ADDR);
+ __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_UNINIT),
+ elem->hw_desc + C2_RXP_FLAGS);
+
+ if (elem->skb) {
+ pci_unmap_single(c2dev->pcidev, elem->mapaddr,
+ elem->maplen, PCI_DMA_FROMDEVICE);
+ dev_kfree_skb(elem->skb);
+ elem->skb = NULL;
+ }
+ } while ((elem = elem->next) != rx_ring->start);
+}
+
+static inline int c2_tx_free(struct c2_dev *c2dev, struct c2_element *elem)
+{
+ struct c2_tx_desc *tx_desc = elem->ht_desc;
+
+ tx_desc->len = 0;
+
+ pci_unmap_single(c2dev->pcidev, elem->mapaddr, elem->maplen,
+ PCI_DMA_TODEVICE);
+
+ if (elem->skb) {
+ dev_kfree_skb_any(elem->skb);
+ elem->skb = NULL;
+ }
+
+ return 0;
+}
+
+/* Free all buffers in TX ring, assumes transmitter stopped */
+static void c2_tx_clean(struct c2_port *c2_port)
+{
+ struct c2_ring *tx_ring = &c2_port->tx_ring;
+ struct c2_element *elem;
+ struct c2_txp_desc txp_htxd;
+ int retry;
+ unsigned long flags;
+
+ spin_lock_irqsave(&c2_port->tx_lock, flags);
+
+ elem = tx_ring->start;
+
+ do {
+ retry = 0;
+ do {
+ txp_htxd.flags =
+ readw(elem->hw_desc + C2_TXP_FLAGS);
+
+ if (txp_htxd.flags == TXP_HTXD_READY) {
+ retry = 1;
+ __raw_writew(0,
+ elem->hw_desc + C2_TXP_LEN);
+ __raw_writeq(0,
+ elem->hw_desc + C2_TXP_ADDR);
+ __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_DONE),
+ elem->hw_desc + C2_TXP_FLAGS);
+ c2_port->netdev->stats.tx_dropped++;
+ break;
+ } else {
+ __raw_writew(0,
+ elem->hw_desc + C2_TXP_LEN);
+ __raw_writeq((__force u64) cpu_to_be64(0x1122334455667788ULL),
+ elem->hw_desc + C2_TXP_ADDR);
+ __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_UNINIT),
+ elem->hw_desc + C2_TXP_FLAGS);
+ }
+
+ c2_tx_free(c2_port->c2dev, elem);
+
+ } while ((elem = elem->next) != tx_ring->start);
+ } while (retry);
+
+ c2_port->tx_avail = c2_port->tx_ring.count - 1;
+ c2_port->c2dev->cur_tx = tx_ring->to_use - tx_ring->start;
+
+ if (c2_port->tx_avail > MAX_SKB_FRAGS + 1)
+ netif_wake_queue(c2_port->netdev);
+
+ spin_unlock_irqrestore(&c2_port->tx_lock, flags);
+}
+
+/*
+ * Process transmit descriptors marked 'DONE' by the firmware,
+ * freeing up their unneeded sk_buffs.
+ */
+static void c2_tx_interrupt(struct net_device *netdev)
+{
+ struct c2_port *c2_port = netdev_priv(netdev);
+ struct c2_dev *c2dev = c2_port->c2dev;
+ struct c2_ring *tx_ring = &c2_port->tx_ring;
+ struct c2_element *elem;
+ struct c2_txp_desc txp_htxd;
+
+ spin_lock(&c2_port->tx_lock);
+
+ for (elem = tx_ring->to_clean; elem != tx_ring->to_use;
+ elem = elem->next) {
+ txp_htxd.flags =
+ be16_to_cpu((__force __be16) readw(elem->hw_desc + C2_TXP_FLAGS));
+
+ if (txp_htxd.flags != TXP_HTXD_DONE)
+ break;
+
+ if (netif_msg_tx_done(c2_port)) {
+ /* PCI reads are expensive in fast path */
+ txp_htxd.len =
+ be16_to_cpu((__force __be16) readw(elem->hw_desc + C2_TXP_LEN));
+ pr_debug("%s: tx done slot %3Zu status 0x%x len "
+ "%5u bytes\n",
+ netdev->name, elem - tx_ring->start,
+ txp_htxd.flags, txp_htxd.len);
+ }
+
+ c2_tx_free(c2dev, elem);
+ ++(c2_port->tx_avail);
+ }
+
+ tx_ring->to_clean = elem;
+
+ if (netif_queue_stopped(netdev)
+ && c2_port->tx_avail > MAX_SKB_FRAGS + 1)
+ netif_wake_queue(netdev);
+
+ spin_unlock(&c2_port->tx_lock);
+}
+
+static void c2_rx_error(struct c2_port *c2_port, struct c2_element *elem)
+{
+ struct c2_rx_desc *rx_desc = elem->ht_desc;
+ struct c2_rxp_hdr *rxp_hdr = (struct c2_rxp_hdr *) elem->skb->data;
+
+ if (rxp_hdr->status != RXP_HRXD_OK ||
+ rxp_hdr->len > (rx_desc->len - sizeof(*rxp_hdr))) {
+ pr_debug("BAD RXP_HRXD\n");
+ pr_debug(" rx_desc : %p\n", rx_desc);
+ pr_debug(" index : %Zu\n",
+ elem - c2_port->rx_ring.start);
+ pr_debug(" len : %u\n", rx_desc->len);
+ pr_debug(" rxp_hdr : %p [PA %p]\n", rxp_hdr,
+ (void *) __pa((unsigned long) rxp_hdr));
+ pr_debug(" flags : 0x%x\n", rxp_hdr->flags);
+ pr_debug(" status: 0x%x\n", rxp_hdr->status);
+ pr_debug(" len : %u\n", rxp_hdr->len);
+ pr_debug(" rsvd : 0x%x\n", rxp_hdr->rsvd);
+ }
+
+ /* Setup the skb for reuse since we're dropping this pkt */
+ elem->skb->data = elem->skb->head;
+ skb_reset_tail_pointer(elem->skb);
+
+ /* Zero out the rxp hdr in the sk_buff */
+ memset(elem->skb->data, 0, sizeof(*rxp_hdr));
+
+ /* Write the descriptor to the adapter's rx ring */
+ __raw_writew(0, elem->hw_desc + C2_RXP_STATUS);
+ __raw_writew(0, elem->hw_desc + C2_RXP_COUNT);
+ __raw_writew((__force u16) cpu_to_be16((u16) elem->maplen - sizeof(*rxp_hdr)),
+ elem->hw_desc + C2_RXP_LEN);
+ __raw_writeq((__force u64) cpu_to_be64(elem->mapaddr),
+ elem->hw_desc + C2_RXP_ADDR);
+ __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY),
+ elem->hw_desc + C2_RXP_FLAGS);
+
+ pr_debug("packet dropped\n");
+ c2_port->netdev->stats.rx_dropped++;
+}
+
+static void c2_rx_interrupt(struct net_device *netdev)
+{
+ struct c2_port *c2_port = netdev_priv(netdev);
+ struct c2_dev *c2dev = c2_port->c2dev;
+ struct c2_ring *rx_ring = &c2_port->rx_ring;
+ struct c2_element *elem;
+ struct c2_rx_desc *rx_desc;
+ struct c2_rxp_hdr *rxp_hdr;
+ struct sk_buff *skb;
+ dma_addr_t mapaddr;
+ u32 maplen, buflen;
+ unsigned long flags;
+
+ spin_lock_irqsave(&c2dev->lock, flags);
+
+ /* Begin where we left off */
+ rx_ring->to_clean = rx_ring->start + c2dev->cur_rx;
+
+ for (elem = rx_ring->to_clean; elem->next != rx_ring->to_clean;
+ elem = elem->next) {
+ rx_desc = elem->ht_desc;
+ mapaddr = elem->mapaddr;
+ maplen = elem->maplen;
+ skb = elem->skb;
+ rxp_hdr = (struct c2_rxp_hdr *) skb->data;
+
+ if (rxp_hdr->flags != RXP_HRXD_DONE)
+ break;
+ buflen = rxp_hdr->len;
+
+ /* Sanity check the RXP header */
+ if (rxp_hdr->status != RXP_HRXD_OK ||
+ buflen > (rx_desc->len - sizeof(*rxp_hdr))) {
+ c2_rx_error(c2_port, elem);
+ continue;
+ }
+
+ /*
+ * Allocate and map a new skb for replenishing the host
+ * RX desc
+ */
+ if (c2_rx_alloc(c2_port, elem)) {
+ c2_rx_error(c2_port, elem);
+ continue;
+ }
+
+ /* Unmap the old skb */
+ pci_unmap_single(c2dev->pcidev, mapaddr, maplen,
+ PCI_DMA_FROMDEVICE);
+
+ prefetch(skb->data);
+
+ /*
+		 * Skip past the leading 8 bytes comprising the
+		 * "struct c2_rxp_hdr", which the adapter prepends
+		 * to the usual Ethernet header ("struct ethhdr"),
+		 * to reach the start of the raw Ethernet packet.
+ *
+ * Fix up the various fields in the sk_buff before
+ * passing it up to netif_rx(). The transfer size
+ * (in bytes) specified by the adapter len field of
+ * the "struct rxp_hdr_t" does NOT include the
+ * "sizeof(struct c2_rxp_hdr)".
+ */
+ skb->data += sizeof(*rxp_hdr);
+ skb_set_tail_pointer(skb, buflen);
+ skb->len = buflen;
+ skb->protocol = eth_type_trans(skb, netdev);
+
+ netif_rx(skb);
+
+ netdev->stats.rx_packets++;
+ netdev->stats.rx_bytes += buflen;
+ }
+
+ /* Save where we left off */
+ rx_ring->to_clean = elem;
+ c2dev->cur_rx = elem - rx_ring->start;
+ C2_SET_CUR_RX(c2dev, c2dev->cur_rx);
+
+ spin_unlock_irqrestore(&c2dev->lock, flags);
+}
+
+/*
+ * Handle netisr0 TX & RX interrupts.
+ */
+static irqreturn_t c2_interrupt(int irq, void *dev_id)
+{
+ unsigned int netisr0, dmaisr;
+ int handled = 0;
+ struct c2_dev *c2dev = (struct c2_dev *) dev_id;
+
+ /* Process CCILNET interrupts */
+ netisr0 = readl(c2dev->regs + C2_NISR0);
+ if (netisr0) {
+
+ /*
+ * There is an issue with the firmware that always
+ * provides the status of RX for both TX & RX
+ * interrupts. So process both queues here.
+ */
+ c2_rx_interrupt(c2dev->netdev);
+ c2_tx_interrupt(c2dev->netdev);
+
+ /* Clear the interrupt */
+ writel(netisr0, c2dev->regs + C2_NISR0);
+ handled++;
+ }
+
+ /* Process RNIC interrupts */
+ dmaisr = readl(c2dev->regs + C2_DISR);
+ if (dmaisr) {
+ writel(dmaisr, c2dev->regs + C2_DISR);
+ c2_rnic_interrupt(c2dev);
+ handled++;
+ }
+
+ if (handled) {
+ return IRQ_HANDLED;
+ } else {
+ return IRQ_NONE;
+ }
+}
+
+static int c2_up(struct net_device *netdev)
+{
+ struct c2_port *c2_port = netdev_priv(netdev);
+ struct c2_dev *c2dev = c2_port->c2dev;
+ struct c2_element *elem;
+ struct c2_rxp_hdr *rxp_hdr;
+ struct in_device *in_dev;
+ size_t rx_size, tx_size;
+ int ret, i;
+ unsigned int netimr0;
+
+ if (netif_msg_ifup(c2_port))
+ pr_debug("%s: enabling interface\n", netdev->name);
+
+ /* Set the Rx buffer size based on MTU */
+ c2_set_rxbufsize(c2_port);
+
+ /* Allocate DMA'able memory for Tx/Rx host descriptor rings */
+ rx_size = c2_port->rx_ring.count * sizeof(struct c2_rx_desc);
+ tx_size = c2_port->tx_ring.count * sizeof(struct c2_tx_desc);
+
+ c2_port->mem_size = tx_size + rx_size;
+ c2_port->mem = pci_zalloc_consistent(c2dev->pcidev, c2_port->mem_size,
+ &c2_port->dma);
+ if (c2_port->mem == NULL) {
+ pr_debug("Unable to allocate memory for "
+ "host descriptor rings\n");
+ return -ENOMEM;
+ }
+
+ /* Create the Rx host descriptor ring */
+ if ((ret =
+ c2_rx_ring_alloc(&c2_port->rx_ring, c2_port->mem, c2_port->dma,
+ c2dev->mmio_rxp_ring))) {
+ pr_debug("Unable to create RX ring\n");
+ goto bail0;
+ }
+
+ /* Allocate Rx buffers for the host descriptor ring */
+	if (c2_rx_fill(c2_port)) {
+		pr_debug("Unable to fill RX ring\n");
+		ret = -ENOMEM;
+		goto bail1;
+	}
+
+ /* Create the Tx host descriptor ring */
+ if ((ret = c2_tx_ring_alloc(&c2_port->tx_ring, c2_port->mem + rx_size,
+ c2_port->dma + rx_size,
+ c2dev->mmio_txp_ring))) {
+ pr_debug("Unable to create TX ring\n");
+ goto bail1;
+ }
+
+ /* Set the TX pointer to where we left off */
+ c2_port->tx_avail = c2_port->tx_ring.count - 1;
+ c2_port->tx_ring.to_use = c2_port->tx_ring.to_clean =
+ c2_port->tx_ring.start + c2dev->cur_tx;
+
+ /* missing: Initialize MAC */
+
+ BUG_ON(c2_port->tx_ring.to_use != c2_port->tx_ring.to_clean);
+
+ /* Reset the adapter, ensures the driver is in sync with the RXP */
+ c2_reset(c2_port);
+
+ /* Reset the READY bit in the sk_buff RXP headers & adapter HRXDQ */
+ for (i = 0, elem = c2_port->rx_ring.start; i < c2_port->rx_ring.count;
+ i++, elem++) {
+ rxp_hdr = (struct c2_rxp_hdr *) elem->skb->data;
+ rxp_hdr->flags = 0;
+ __raw_writew((__force u16) cpu_to_be16(RXP_HRXD_READY),
+ elem->hw_desc + C2_RXP_FLAGS);
+ }
+
+ /* Enable network packets */
+ netif_start_queue(netdev);
+
+ /* Enable IRQ */
+ writel(0, c2dev->regs + C2_IDIS);
+ netimr0 = readl(c2dev->regs + C2_NIMR0);
+ netimr0 &= ~(C2_PCI_HTX_INT | C2_PCI_HRX_INT);
+ writel(netimr0, c2dev->regs + C2_NIMR0);
+
+	/* Tell the stack to ignore ARP requests for IP addresses bound to
+	 * other interfaces.  This is needed to prevent the host stack
+	 * from responding to ARP requests for the IP address bound on the
+	 * RDMA interface.
+	 */
+ in_dev = in_dev_get(netdev);
+ IN_DEV_CONF_SET(in_dev, ARP_IGNORE, 1);
+ in_dev_put(in_dev);
+
+ return 0;
+
+ bail1:
+ c2_rx_clean(c2_port);
+ kfree(c2_port->rx_ring.start);
+
+ bail0:
+ pci_free_consistent(c2dev->pcidev, c2_port->mem_size, c2_port->mem,
+ c2_port->dma);
+
+ return ret;
+}
+
+static int c2_down(struct net_device *netdev)
+{
+ struct c2_port *c2_port = netdev_priv(netdev);
+ struct c2_dev *c2dev = c2_port->c2dev;
+
+ if (netif_msg_ifdown(c2_port))
+ pr_debug("%s: disabling interface\n",
+ netdev->name);
+
+ /* Wait for all the queued packets to get sent */
+ c2_tx_interrupt(netdev);
+
+ /* Disable network packets */
+ netif_stop_queue(netdev);
+
+ /* Disable IRQs by clearing the interrupt mask */
+ writel(1, c2dev->regs + C2_IDIS);
+ writel(0, c2dev->regs + C2_NIMR0);
+
+ /* missing: Stop transmitter */
+
+ /* missing: Stop receiver */
+
+ /* Reset the adapter, ensures the driver is in sync with the RXP */
+ c2_reset(c2_port);
+
+ /* missing: Turn off LEDs here */
+
+ /* Free all buffers in the host descriptor rings */
+ c2_tx_clean(c2_port);
+ c2_rx_clean(c2_port);
+
+ /* Free the host descriptor rings */
+ kfree(c2_port->rx_ring.start);
+ kfree(c2_port->tx_ring.start);
+ pci_free_consistent(c2dev->pcidev, c2_port->mem_size, c2_port->mem,
+ c2_port->dma);
+
+ return 0;
+}
+
+static void c2_reset(struct c2_port *c2_port)
+{
+ struct c2_dev *c2dev = c2_port->c2dev;
+ unsigned int cur_rx = c2dev->cur_rx;
+
+ /* Tell the hardware to quiesce */
+ C2_SET_CUR_RX(c2dev, cur_rx | C2_PCI_HRX_QUI);
+
+ /*
+ * The hardware will reset the C2_PCI_HRX_QUI bit once
+ * the RXP is quiesced. Wait 2 seconds for this.
+ */
+ ssleep(2);
+
+ cur_rx = C2_GET_CUR_RX(c2dev);
+
+ if (cur_rx & C2_PCI_HRX_QUI)
+ pr_debug("c2_reset: failed to quiesce the hardware!\n");
+
+ cur_rx &= ~C2_PCI_HRX_QUI;
+
+ c2dev->cur_rx = cur_rx;
+
+ pr_debug("Current RX: %u\n", c2dev->cur_rx);
+}
+
+static int c2_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
+{
+ struct c2_port *c2_port = netdev_priv(netdev);
+ struct c2_dev *c2dev = c2_port->c2dev;
+ struct c2_ring *tx_ring = &c2_port->tx_ring;
+ struct c2_element *elem;
+ dma_addr_t mapaddr;
+ u32 maplen;
+ unsigned long flags;
+ unsigned int i;
+
+ spin_lock_irqsave(&c2_port->tx_lock, flags);
+
+ if (unlikely(c2_port->tx_avail < (skb_shinfo(skb)->nr_frags + 1))) {
+ netif_stop_queue(netdev);
+ spin_unlock_irqrestore(&c2_port->tx_lock, flags);
+
+ pr_debug("%s: Tx ring full when queue awake!\n",
+ netdev->name);
+ return NETDEV_TX_BUSY;
+ }
+
+ maplen = skb_headlen(skb);
+ mapaddr =
+ pci_map_single(c2dev->pcidev, skb->data, maplen, PCI_DMA_TODEVICE);
+
+ elem = tx_ring->to_use;
+ elem->skb = skb;
+ elem->mapaddr = mapaddr;
+ elem->maplen = maplen;
+
+ /* Tell HW to xmit */
+ __raw_writeq((__force u64) cpu_to_be64(mapaddr),
+ elem->hw_desc + C2_TXP_ADDR);
+ __raw_writew((__force u16) cpu_to_be16(maplen),
+ elem->hw_desc + C2_TXP_LEN);
+ __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_READY),
+ elem->hw_desc + C2_TXP_FLAGS);
+
+ netdev->stats.tx_packets++;
+ netdev->stats.tx_bytes += maplen;
+
+ /* Loop thru additional data fragments and queue them */
+ if (skb_shinfo(skb)->nr_frags) {
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ maplen = skb_frag_size(frag);
+ mapaddr = skb_frag_dma_map(&c2dev->pcidev->dev, frag,
+ 0, maplen, DMA_TO_DEVICE);
+ elem = elem->next;
+ elem->skb = NULL;
+ elem->mapaddr = mapaddr;
+ elem->maplen = maplen;
+
+ /* Tell HW to xmit */
+ __raw_writeq((__force u64) cpu_to_be64(mapaddr),
+ elem->hw_desc + C2_TXP_ADDR);
+ __raw_writew((__force u16) cpu_to_be16(maplen),
+ elem->hw_desc + C2_TXP_LEN);
+ __raw_writew((__force u16) cpu_to_be16(TXP_HTXD_READY),
+ elem->hw_desc + C2_TXP_FLAGS);
+
+ netdev->stats.tx_packets++;
+ netdev->stats.tx_bytes += maplen;
+ }
+ }
+
+ tx_ring->to_use = elem->next;
+ c2_port->tx_avail -= (skb_shinfo(skb)->nr_frags + 1);
+
+ if (c2_port->tx_avail <= MAX_SKB_FRAGS + 1) {
+ netif_stop_queue(netdev);
+ if (netif_msg_tx_queued(c2_port))
+ pr_debug("%s: transmit queue full\n",
+ netdev->name);
+ }
+
+ spin_unlock_irqrestore(&c2_port->tx_lock, flags);
+
+ netdev->trans_start = jiffies;
+
+ return NETDEV_TX_OK;
+}
+
+static void c2_tx_timeout(struct net_device *netdev)
+{
+ struct c2_port *c2_port = netdev_priv(netdev);
+
+ if (netif_msg_timer(c2_port))
+ pr_debug("%s: tx timeout\n", netdev->name);
+
+ c2_tx_clean(c2_port);
+}
+
+static int c2_change_mtu(struct net_device *netdev, int new_mtu)
+{
+ int ret = 0;
+
+ if (new_mtu < ETH_ZLEN || new_mtu > ETH_JUMBO_MTU)
+ return -EINVAL;
+
+ netdev->mtu = new_mtu;
+
+ if (netif_running(netdev)) {
+ c2_down(netdev);
+
+ c2_up(netdev);
+ }
+
+ return ret;
+}
+
+static const struct net_device_ops c2_netdev = {
+ .ndo_open = c2_up,
+ .ndo_stop = c2_down,
+ .ndo_start_xmit = c2_xmit_frame,
+ .ndo_tx_timeout = c2_tx_timeout,
+ .ndo_change_mtu = c2_change_mtu,
+ .ndo_set_mac_address = eth_mac_addr,
+ .ndo_validate_addr = eth_validate_addr,
+};
+
+/* Initialize network device */
+static struct net_device *c2_devinit(struct c2_dev *c2dev,
+ void __iomem * mmio_addr)
+{
+ struct c2_port *c2_port = NULL;
+ struct net_device *netdev = alloc_etherdev(sizeof(*c2_port));
+
+ if (!netdev) {
+ pr_debug("c2_port etherdev alloc failed");
+ return NULL;
+ }
+
+ SET_NETDEV_DEV(netdev, &c2dev->pcidev->dev);
+
+ netdev->netdev_ops = &c2_netdev;
+ netdev->watchdog_timeo = C2_TX_TIMEOUT;
+ netdev->irq = c2dev->pcidev->irq;
+
+ c2_port = netdev_priv(netdev);
+ c2_port->netdev = netdev;
+ c2_port->c2dev = c2dev;
+ c2_port->msg_enable = netif_msg_init(debug, default_msg);
+ c2_port->tx_ring.count = C2_NUM_TX_DESC;
+ c2_port->rx_ring.count = C2_NUM_RX_DESC;
+
+ spin_lock_init(&c2_port->tx_lock);
+
+ /* Copy our 48-bit ethernet hardware address */
+ memcpy_fromio(netdev->dev_addr, mmio_addr + C2_REGS_ENADDR, 6);
+
+ /* Validate the MAC address */
+ if (!is_valid_ether_addr(netdev->dev_addr)) {
+ pr_debug("Invalid MAC Address\n");
+ c2_print_macaddr(netdev);
+ free_netdev(netdev);
+ return NULL;
+ }
+
+ c2dev->netdev = netdev;
+
+ return netdev;
+}
+
+static int c2_probe(struct pci_dev *pcidev, const struct pci_device_id *ent)
+{
+ int ret = 0, i;
+ unsigned long reg0_start, reg0_flags, reg0_len;
+ unsigned long reg2_start, reg2_flags, reg2_len;
+ unsigned long reg4_start, reg4_flags, reg4_len;
+ unsigned kva_map_size;
+ struct net_device *netdev = NULL;
+ struct c2_dev *c2dev = NULL;
+ void __iomem *mmio_regs = NULL;
+
+ printk(KERN_INFO PFX "AMSO1100 Gigabit Ethernet driver v%s loaded\n",
+ DRV_VERSION);
+
+ /* Enable PCI device */
+ ret = pci_enable_device(pcidev);
+ if (ret) {
+ printk(KERN_ERR PFX "%s: Unable to enable PCI device\n",
+ pci_name(pcidev));
+ goto bail0;
+ }
+
+ reg0_start = pci_resource_start(pcidev, BAR_0);
+ reg0_len = pci_resource_len(pcidev, BAR_0);
+ reg0_flags = pci_resource_flags(pcidev, BAR_0);
+
+ reg2_start = pci_resource_start(pcidev, BAR_2);
+ reg2_len = pci_resource_len(pcidev, BAR_2);
+ reg2_flags = pci_resource_flags(pcidev, BAR_2);
+
+ reg4_start = pci_resource_start(pcidev, BAR_4);
+ reg4_len = pci_resource_len(pcidev, BAR_4);
+ reg4_flags = pci_resource_flags(pcidev, BAR_4);
+
+ pr_debug("BAR0 size = 0x%lX bytes\n", reg0_len);
+ pr_debug("BAR2 size = 0x%lX bytes\n", reg2_len);
+ pr_debug("BAR4 size = 0x%lX bytes\n", reg4_len);
+
+ /* Make sure PCI base addr are MMIO */
+ if (!(reg0_flags & IORESOURCE_MEM) ||
+ !(reg2_flags & IORESOURCE_MEM) || !(reg4_flags & IORESOURCE_MEM)) {
+ printk(KERN_ERR PFX "PCI regions not an MMIO resource\n");
+ ret = -ENODEV;
+ goto bail1;
+ }
+
+ /* Check for weird/broken PCI region reporting */
+ if ((reg0_len < C2_REG0_SIZE) ||
+ (reg2_len < C2_REG2_SIZE) || (reg4_len < C2_REG4_SIZE)) {
+ printk(KERN_ERR PFX "Invalid PCI region sizes\n");
+ ret = -ENODEV;
+ goto bail1;
+ }
+
+ /* Reserve PCI I/O and memory resources */
+ ret = pci_request_regions(pcidev, DRV_NAME);
+ if (ret) {
+ printk(KERN_ERR PFX "%s: Unable to request regions\n",
+ pci_name(pcidev));
+ goto bail1;
+ }
+
+	if (sizeof(dma_addr_t) > 4) {
+ ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(64));
+ if (ret < 0) {
+ printk(KERN_ERR PFX "64b DMA configuration failed\n");
+ goto bail2;
+ }
+ } else {
+ ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(32));
+ if (ret < 0) {
+ printk(KERN_ERR PFX "32b DMA configuration failed\n");
+ goto bail2;
+ }
+ }
+
+ /* Enables bus-mastering on the device */
+ pci_set_master(pcidev);
+
+ /* Remap the adapter PCI registers in BAR4 */
+ mmio_regs = ioremap_nocache(reg4_start + C2_PCI_REGS_OFFSET,
+ sizeof(struct c2_adapter_pci_regs));
+ if (!mmio_regs) {
+ printk(KERN_ERR PFX
+ "Unable to remap adapter PCI registers in BAR4\n");
+ ret = -EIO;
+ goto bail2;
+ }
+
+ /* Validate PCI regs magic */
+ for (i = 0; i < sizeof(c2_magic); i++) {
+ if (c2_magic[i] != readb(mmio_regs + C2_REGS_MAGIC + i)) {
+ printk(KERN_ERR PFX "Downlevel Firmware boot loader "
+ "[%d/%Zd: got 0x%x, exp 0x%x]. Use the cc_flash "
+ "utility to update your boot loader\n",
+ i + 1, sizeof(c2_magic),
+ readb(mmio_regs + C2_REGS_MAGIC + i),
+ c2_magic[i]);
+ printk(KERN_ERR PFX "Adapter not claimed\n");
+ iounmap(mmio_regs);
+ ret = -EIO;
+ goto bail2;
+ }
+ }
+
+ /* Validate the adapter version */
+ if (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_VERS)) != C2_VERSION) {
+ printk(KERN_ERR PFX "Version mismatch "
+ "[fw=%u, c2=%u], Adapter not claimed\n",
+ be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_VERS)),
+ C2_VERSION);
+ ret = -EINVAL;
+ iounmap(mmio_regs);
+ goto bail2;
+ }
+
+ /* Validate the adapter IVN */
+ if (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_IVN)) != C2_IVN) {
+		printk(KERN_ERR PFX "Downlevel firmware level. You should be using "
+ "the OpenIB device support kit. "
+ "[fw=0x%x, c2=0x%x], Adapter not claimed\n",
+ be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_IVN)),
+ C2_IVN);
+ ret = -EINVAL;
+ iounmap(mmio_regs);
+ goto bail2;
+ }
+
+ /* Allocate hardware structure */
+ c2dev = (struct c2_dev *) ib_alloc_device(sizeof(*c2dev));
+ if (!c2dev) {
+ printk(KERN_ERR PFX "%s: Unable to alloc hardware struct\n",
+ pci_name(pcidev));
+ ret = -ENOMEM;
+ iounmap(mmio_regs);
+ goto bail2;
+ }
+
+ memset(c2dev, 0, sizeof(*c2dev));
+ spin_lock_init(&c2dev->lock);
+ c2dev->pcidev = pcidev;
+ c2dev->cur_tx = 0;
+
+ /* Get the last RX index */
+ c2dev->cur_rx =
+ (be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_HRX_CUR)) -
+ 0xffffc000) / sizeof(struct c2_rxp_desc);
+
+ /* Request an interrupt line for the driver */
+ ret = request_irq(pcidev->irq, c2_interrupt, IRQF_SHARED, DRV_NAME, c2dev);
+ if (ret) {
+ printk(KERN_ERR PFX "%s: requested IRQ %u is busy\n",
+ pci_name(pcidev), pcidev->irq);
+ iounmap(mmio_regs);
+ goto bail3;
+ }
+
+ /* Set driver specific data */
+ pci_set_drvdata(pcidev, c2dev);
+
+ /* Initialize network device */
+ if ((netdev = c2_devinit(c2dev, mmio_regs)) == NULL) {
+ ret = -ENOMEM;
+ iounmap(mmio_regs);
+ goto bail4;
+ }
+
+ /* Save off the actual size prior to unmapping mmio_regs */
+ kva_map_size = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_PCI_WINSIZE));
+
+ /* Unmap the adapter PCI registers in BAR4 */
+ iounmap(mmio_regs);
+
+ /* Register network device */
+ ret = register_netdev(netdev);
+ if (ret) {
+ printk(KERN_ERR PFX "Unable to register netdev, ret = %d\n",
+ ret);
+ goto bail5;
+ }
+
+ /* Disable network packets */
+ netif_stop_queue(netdev);
+
+ /* Remap the adapter HRXDQ PA space to kernel VA space */
+ c2dev->mmio_rxp_ring = ioremap_nocache(reg4_start + C2_RXP_HRXDQ_OFFSET,
+ C2_RXP_HRXDQ_SIZE);
+ if (!c2dev->mmio_rxp_ring) {
+ printk(KERN_ERR PFX "Unable to remap MMIO HRXDQ region\n");
+ ret = -EIO;
+ goto bail6;
+ }
+
+ /* Remap the adapter HTXDQ PA space to kernel VA space */
+ c2dev->mmio_txp_ring = ioremap_nocache(reg4_start + C2_TXP_HTXDQ_OFFSET,
+ C2_TXP_HTXDQ_SIZE);
+ if (!c2dev->mmio_txp_ring) {
+ printk(KERN_ERR PFX "Unable to remap MMIO HTXDQ region\n");
+ ret = -EIO;
+ goto bail7;
+ }
+
+ /* Save off the current RX index in the last 4 bytes of the TXP Ring */
+ C2_SET_CUR_RX(c2dev, c2dev->cur_rx);
+
+ /* Remap the PCI registers in adapter BAR0 to kernel VA space */
+ c2dev->regs = ioremap_nocache(reg0_start, reg0_len);
+ if (!c2dev->regs) {
+ printk(KERN_ERR PFX "Unable to remap BAR0\n");
+ ret = -EIO;
+ goto bail8;
+ }
+
+ /* Remap the PCI registers in adapter BAR4 to kernel VA space */
+ c2dev->pa = reg4_start + C2_PCI_REGS_OFFSET;
+ c2dev->kva = ioremap_nocache(reg4_start + C2_PCI_REGS_OFFSET,
+ kva_map_size);
+ if (!c2dev->kva) {
+ printk(KERN_ERR PFX "Unable to remap BAR4\n");
+ ret = -EIO;
+ goto bail9;
+ }
+
+ /* Print out the MAC address */
+ c2_print_macaddr(netdev);
+
+ ret = c2_rnic_init(c2dev);
+ if (ret) {
+ printk(KERN_ERR PFX "c2_rnic_init failed: %d\n", ret);
+ goto bail10;
+ }
+
+ ret = c2_register_device(c2dev);
+ if (ret)
+ goto bail10;
+
+ return 0;
+
+ bail10:
+ iounmap(c2dev->kva);
+
+ bail9:
+ iounmap(c2dev->regs);
+
+ bail8:
+ iounmap(c2dev->mmio_txp_ring);
+
+ bail7:
+ iounmap(c2dev->mmio_rxp_ring);
+
+ bail6:
+ unregister_netdev(netdev);
+
+ bail5:
+ free_netdev(netdev);
+
+ bail4:
+ free_irq(pcidev->irq, c2dev);
+
+ bail3:
+ ib_dealloc_device(&c2dev->ibdev);
+
+ bail2:
+ pci_release_regions(pcidev);
+
+ bail1:
+ pci_disable_device(pcidev);
+
+ bail0:
+ return ret;
+}
+
+static void c2_remove(struct pci_dev *pcidev)
+{
+ struct c2_dev *c2dev = pci_get_drvdata(pcidev);
+ struct net_device *netdev = c2dev->netdev;
+
+ /* Unregister with OpenIB */
+ c2_unregister_device(c2dev);
+
+ /* Clean up the RNIC resources */
+ c2_rnic_term(c2dev);
+
+ /* Remove network device from the kernel */
+ unregister_netdev(netdev);
+
+ /* Free network device */
+ free_netdev(netdev);
+
+ /* Free the interrupt line */
+ free_irq(pcidev->irq, c2dev);
+
+ /* missing: Turn LEDs off here */
+
+ /* Unmap adapter PA space */
+ iounmap(c2dev->kva);
+ iounmap(c2dev->regs);
+ iounmap(c2dev->mmio_txp_ring);
+ iounmap(c2dev->mmio_rxp_ring);
+
+ /* Free the hardware structure */
+ ib_dealloc_device(&c2dev->ibdev);
+
+ /* Release reserved PCI I/O and memory resources */
+ pci_release_regions(pcidev);
+
+ /* Disable PCI device */
+ pci_disable_device(pcidev);
+
+ /* Clear driver specific data */
+ pci_set_drvdata(pcidev, NULL);
+}
+
+static struct pci_driver c2_pci_driver = {
+ .name = DRV_NAME,
+ .id_table = c2_pci_table,
+ .probe = c2_probe,
+ .remove = c2_remove,
+};
+
+module_pci_driver(c2_pci_driver);
diff --git a/drivers/infiniband/hw/amso1100/c2.h b/drivers/infiniband/hw/amso1100/c2.h
new file mode 100644
index 000000000..d619d7358
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2.h
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __C2_H
+#define __C2_H
+
+#include <linux/netdevice.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/idr.h>
+
+#include "c2_provider.h"
+#include "c2_mq.h"
+#include "c2_status.h"
+
+#define DRV_NAME "c2"
+#define DRV_VERSION "1.1"
+#define PFX DRV_NAME ": "
+
+#define BAR_0 0
+#define BAR_2 2
+#define BAR_4 4
+
+#define RX_BUF_SIZE (1536 + 8)
+#define ETH_JUMBO_MTU 9000
+#define C2_MAGIC "CEPHEUS"
+#define C2_VERSION 4
+#define C2_IVN (18 & 0x7fffffff)
+
+#define C2_REG0_SIZE (16 * 1024)
+#define C2_REG2_SIZE (2 * 1024 * 1024)
+#define C2_REG4_SIZE (256 * 1024 * 1024)
+#define C2_NUM_TX_DESC 341
+#define C2_NUM_RX_DESC 256
+#define C2_PCI_REGS_OFFSET (0x10000)
+#define C2_RXP_HRXDQ_OFFSET (((C2_REG4_SIZE)/2))
+#define C2_RXP_HRXDQ_SIZE (4096)
+#define C2_TXP_HTXDQ_OFFSET (((C2_REG4_SIZE)/2) + C2_RXP_HRXDQ_SIZE)
+#define C2_TXP_HTXDQ_SIZE (4096)
+#define C2_TX_TIMEOUT (6*HZ)
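+
+/*
+ * BAR4 layout implied by the offsets above: the adapter PCI register block
+ * is mapped at offset 0x10000, the 4 KB host RX descriptor queue (HRXDQ)
+ * at the midpoint of the 256 MB window, and the 4 KB host TX descriptor
+ * queue (HTXDQ) immediately after it.
+ */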
+
+/* CEPHEUS */
+static const u8 c2_magic[] = {
+ 0x43, 0x45, 0x50, 0x48, 0x45, 0x55, 0x53
+};
+
+enum adapter_pci_regs {
+ C2_REGS_MAGIC = 0x0000,
+ C2_REGS_VERS = 0x0008,
+ C2_REGS_IVN = 0x000C,
+ C2_REGS_PCI_WINSIZE = 0x0010,
+ C2_REGS_Q0_QSIZE = 0x0014,
+ C2_REGS_Q0_MSGSIZE = 0x0018,
+ C2_REGS_Q0_POOLSTART = 0x001C,
+ C2_REGS_Q0_SHARED = 0x0020,
+ C2_REGS_Q1_QSIZE = 0x0024,
+ C2_REGS_Q1_MSGSIZE = 0x0028,
+ C2_REGS_Q1_SHARED = 0x0030,
+ C2_REGS_Q2_QSIZE = 0x0034,
+ C2_REGS_Q2_MSGSIZE = 0x0038,
+ C2_REGS_Q2_SHARED = 0x0040,
+ C2_REGS_ENADDR = 0x004C,
+ C2_REGS_RDMA_ENADDR = 0x0054,
+ C2_REGS_HRX_CUR = 0x006C,
+};
+
+struct c2_adapter_pci_regs {
+ char reg_magic[8];
+ u32 version;
+ u32 ivn;
+ u32 pci_window_size;
+ u32 q0_q_size;
+ u32 q0_msg_size;
+ u32 q0_pool_start;
+ u32 q0_shared;
+ u32 q1_q_size;
+ u32 q1_msg_size;
+ u32 q1_pool_start;
+ u32 q1_shared;
+ u32 q2_q_size;
+ u32 q2_msg_size;
+ u32 q2_pool_start;
+ u32 q2_shared;
+ u32 log_start;
+ u32 log_size;
+ u8 host_enaddr[8];
+ u8 rdma_enaddr[8];
+ u32 crash_entry;
+ u32 crash_ready[2];
+ u32 fw_txd_cur;
+ u32 fw_hrxd_cur;
+ u32 fw_rxd_cur;
+};
+
+enum pci_regs {
+ C2_HISR = 0x0000,
+ C2_DISR = 0x0004,
+ C2_HIMR = 0x0008,
+ C2_DIMR = 0x000C,
+ C2_NISR0 = 0x0010,
+ C2_NISR1 = 0x0014,
+ C2_NIMR0 = 0x0018,
+ C2_NIMR1 = 0x001C,
+ C2_IDIS = 0x0020,
+};
+
+enum {
+ C2_PCI_HRX_INT = 1 << 8,
+ C2_PCI_HTX_INT = 1 << 17,
+ C2_PCI_HRX_QUI = 1 << 31,
+};
+
+/*
+ * Cepheus registers in BAR0.
+ */
+struct c2_pci_regs {
+ u32 hostisr;
+ u32 dmaisr;
+ u32 hostimr;
+ u32 dmaimr;
+ u32 netisr0;
+ u32 netisr1;
+ u32 netimr0;
+ u32 netimr1;
+ u32 int_disable;
+};
+
+/* TXP flags */
+enum c2_txp_flags {
+ TXP_HTXD_DONE = 0,
+ TXP_HTXD_READY = 1 << 0,
+ TXP_HTXD_UNINIT = 1 << 1,
+};
+
+/* RXP flags */
+enum c2_rxp_flags {
+ RXP_HRXD_UNINIT = 0,
+ RXP_HRXD_READY = 1 << 0,
+ RXP_HRXD_DONE = 1 << 1,
+};
+
+/* RXP status */
+enum c2_rxp_status {
+ RXP_HRXD_ZERO = 0,
+ RXP_HRXD_OK = 1 << 0,
+ RXP_HRXD_BUF_OV = 1 << 1,
+};
+
+/* TXP descriptor fields */
+enum txp_desc {
+ C2_TXP_FLAGS = 0x0000,
+ C2_TXP_LEN = 0x0002,
+ C2_TXP_ADDR = 0x0004,
+};
+
+/* RXP descriptor fields */
+enum rxp_desc {
+ C2_RXP_FLAGS = 0x0000,
+ C2_RXP_STATUS = 0x0002,
+ C2_RXP_COUNT = 0x0004,
+ C2_RXP_LEN = 0x0006,
+ C2_RXP_ADDR = 0x0008,
+};
+
+struct c2_txp_desc {
+ u16 flags;
+ u16 len;
+ u64 addr;
+} __attribute__ ((packed));
+
+struct c2_rxp_desc {
+ u16 flags;
+ u16 status;
+ u16 count;
+ u16 len;
+ u64 addr;
+} __attribute__ ((packed));
+
+struct c2_rxp_hdr {
+ u16 flags;
+ u16 status;
+ u16 len;
+ u16 rsvd;
+} __attribute__ ((packed));
+
+struct c2_tx_desc {
+ u32 len;
+ u32 status;
+ dma_addr_t next_offset;
+};
+
+struct c2_rx_desc {
+ u32 len;
+ u32 status;
+ dma_addr_t next_offset;
+};
+
+struct c2_alloc {
+ u32 last;
+ u32 max;
+ spinlock_t lock;
+ unsigned long *table;
+};
+
+struct c2_array {
+ struct {
+ void **page;
+ int used;
+ } *page_list;
+};
+
+/*
+ * The MQ shared pointer pool is organized as a linked list of
+ * chunks. Each chunk contains a linked list of free shared pointers
+ * that can be allocated to a given user mode client.
+ *
+ */
+struct sp_chunk {
+ struct sp_chunk *next;
+ dma_addr_t dma_addr;
+ DEFINE_DMA_UNMAP_ADDR(mapping);
+ u16 head;
+ u16 shared_ptr[0];
+};
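+
+/*
+ * Free-list encoding used by c2_alloc.c: head is the index of the first
+ * free slot, each shared_ptr[i] entry holds the index of the next free
+ * slot, and the value 0xFFFF terminates the list.
+ */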
+
+struct c2_pd_table {
+ u32 last;
+ u32 max;
+ spinlock_t lock;
+ unsigned long *table;
+};
+
+struct c2_qp_table {
+ struct idr idr;
+ spinlock_t lock;
+};
+
+struct c2_element {
+ struct c2_element *next;
+ void *ht_desc; /* host descriptor */
+ void __iomem *hw_desc; /* hardware descriptor */
+ struct sk_buff *skb;
+ dma_addr_t mapaddr;
+ u32 maplen;
+};
+
+struct c2_ring {
+ struct c2_element *to_clean;
+ struct c2_element *to_use;
+ struct c2_element *start;
+ unsigned long count;
+};
+
+struct c2_dev {
+ struct ib_device ibdev;
+ void __iomem *regs;
+ void __iomem *mmio_txp_ring; /* remapped adapter memory for hw rings */
+ void __iomem *mmio_rxp_ring;
+ spinlock_t lock;
+ struct pci_dev *pcidev;
+ struct net_device *netdev;
+ struct net_device *pseudo_netdev;
+ unsigned int cur_tx;
+ unsigned int cur_rx;
+ u32 adapter_handle;
+ int device_cap_flags;
+ void __iomem *kva; /* KVA device memory */
+ unsigned long pa; /* PA device memory */
+ void **qptr_array;
+
+ struct kmem_cache *host_msg_cache;
+
+ struct list_head cca_link; /* adapter list */
+ struct list_head eh_wakeup_list; /* event wakeup list */
+ wait_queue_head_t req_vq_wo;
+
+ /* Cached RNIC properties */
+ struct ib_device_attr props;
+
+ struct c2_pd_table pd_table;
+ struct c2_qp_table qp_table;
+ int ports; /* num of GigE ports */
+ int devnum;
+ spinlock_t vqlock; /* sync vbs req MQ */
+
+ /* Verbs Queues */
+ struct c2_mq req_vq; /* Verbs Request MQ */
+ struct c2_mq rep_vq; /* Verbs Reply MQ */
+ struct c2_mq aeq; /* Async Events MQ */
+
+ /* Kernel client MQs */
+ struct sp_chunk *kern_mqsp_pool;
+
+ /* Device updates these values when posting messages to a host
+ * target queue */
+ u16 req_vq_shared;
+ u16 rep_vq_shared;
+ u16 aeq_shared;
+ u16 irq_claimed;
+
+ /*
+ * Shared host target pages for user-accessible MQs.
+ */
+ int hthead; /* index of first free entry */
+ void *htpages; /* kernel vaddr */
+ int htlen; /* length of htpages memory */
+ void *htuva; /* user mapped vaddr */
+ spinlock_t htlock; /* serialize allocation */
+
+ u64 adapter_hint_uva; /* access to the activity FIFO */
+
+ // spinlock_t aeq_lock;
+ // spinlock_t rnic_lock;
+
+ __be16 *hint_count;
+ dma_addr_t hint_count_dma;
+ u16 hints_read;
+
+ int init; /* TRUE if it's ready */
+ char ae_cache_name[16];
+ char vq_cache_name[16];
+};
+
+struct c2_port {
+ u32 msg_enable;
+ struct c2_dev *c2dev;
+ struct net_device *netdev;
+
+ spinlock_t tx_lock;
+ u32 tx_avail;
+ struct c2_ring tx_ring;
+ struct c2_ring rx_ring;
+
+ void *mem; /* PCI memory for host rings */
+ dma_addr_t dma;
+ unsigned long mem_size;
+
+ u32 rx_buf_size;
+};
+
+/*
+ * Activity FIFO registers in BAR0.
+ */
+#define PCI_BAR0_HOST_HINT 0x100
+#define PCI_BAR0_ADAPTER_HINT 0x2000
+
+/*
+ * Completion queue (CQ) armed-state flags.
+ */
+#define CQ_ARMED 0x01
+#define CQ_WAIT_FOR_DMA 0x80
+
+/*
+ * The format of a hint is as follows:
+ * Lower 16 bits are the count of hints for the queue.
+ * Next 15 bits are the qp_index
+ * Upper most bit depends on who reads it:
+ * If read by producer, then it means Full (1) or Not-Full (0)
+ * If read by consumer, then it means Empty (1) or Not-Empty (0)
+ */
+#define C2_HINT_MAKE(q_index, hint_count) (((q_index) << 16) | hint_count)
+#define C2_HINT_GET_INDEX(hint) (((hint) & 0x7FFF0000) >> 16)
+#define C2_HINT_GET_COUNT(hint) ((hint) & 0x0000FFFF)
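+/*
+ * Worked example (illustrative): C2_HINT_MAKE(3, 5) == 0x00030005,
+ * C2_HINT_GET_INDEX(0x00030005) == 3, C2_HINT_GET_COUNT(0x00030005) == 5.
+ */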
+
+
+/*
+ * The following defines the offset in SDRAM for the
+ * struct c2_adapter_pci_regs.
+ */
+#define C2_ADAPTER_PCI_REGS_OFFSET 0x10000
+
+#ifndef readq
+static inline u64 readq(const void __iomem * addr)
+{
+ u64 ret = readl(addr + 4);
+ ret <<= 32;
+ ret |= readl(addr);
+
+ return ret;
+}
+#endif
+
+#ifndef writeq
+static inline void __raw_writeq(u64 val, void __iomem * addr)
+{
+ __raw_writel((u32) (val), addr);
+ __raw_writel((u32) (val >> 32), (addr + 4));
+}
+#endif
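+
+/*
+ * The helpers above are for platforms without native 64-bit MMIO
+ * accessors: a 64-bit access is composed of two 32-bit accesses, with the
+ * low word at the given offset and the high word at offset + 4.
+ */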
+
+#define C2_SET_CUR_RX(c2dev, cur_rx) \
+ __raw_writel((__force u32) cpu_to_be32(cur_rx), c2dev->mmio_txp_ring + 4092)
+
+#define C2_GET_CUR_RX(c2dev) \
+ be32_to_cpu((__force __be32) readl(c2dev->mmio_txp_ring + 4092))
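+
+/*
+ * The current RX index is kept big-endian in the last 4 bytes (offset
+ * 4092) of the 4 KB host TX descriptor queue window; see the matching
+ * comment in c2_probe().
+ */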
+
+static inline struct c2_dev *to_c2dev(struct ib_device *ibdev)
+{
+ return container_of(ibdev, struct c2_dev, ibdev);
+}
+
+static inline int c2_errno(void *reply)
+{
+ switch (c2_wr_get_result(reply)) {
+ case C2_OK:
+ return 0;
+ case CCERR_NO_BUFS:
+ case CCERR_INSUFFICIENT_RESOURCES:
+ case CCERR_ZERO_RDMA_READ_RESOURCES:
+ return -ENOMEM;
+ case CCERR_MR_IN_USE:
+ case CCERR_QP_IN_USE:
+ return -EBUSY;
+ case CCERR_ADDR_IN_USE:
+ return -EADDRINUSE;
+ case CCERR_ADDR_NOT_AVAIL:
+ return -EADDRNOTAVAIL;
+ case CCERR_CONN_RESET:
+ return -ECONNRESET;
+ case CCERR_NOT_IMPLEMENTED:
+ case CCERR_INVALID_WQE:
+ return -ENOSYS;
+ case CCERR_QP_NOT_PRIVILEGED:
+ return -EPERM;
+ case CCERR_STACK_ERROR:
+ return -EPROTO;
+ case CCERR_ACCESS_VIOLATION:
+ case CCERR_BASE_AND_BOUNDS_VIOLATION:
+ return -EFAULT;
+ case CCERR_STAG_STATE_NOT_INVALID:
+ case CCERR_INVALID_ADDRESS:
+ case CCERR_INVALID_CQ:
+ case CCERR_INVALID_EP:
+ case CCERR_INVALID_MODIFIER:
+ case CCERR_INVALID_MTU:
+ case CCERR_INVALID_PD_ID:
+ case CCERR_INVALID_QP:
+ case CCERR_INVALID_RNIC:
+ case CCERR_INVALID_STAG:
+ return -EINVAL;
+ default:
+ return -EAGAIN;
+ }
+}
+
+/* Device */
+extern int c2_register_device(struct c2_dev *c2dev);
+extern void c2_unregister_device(struct c2_dev *c2dev);
+extern int c2_rnic_init(struct c2_dev *c2dev);
+extern void c2_rnic_term(struct c2_dev *c2dev);
+extern void c2_rnic_interrupt(struct c2_dev *c2dev);
+extern int c2_del_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask);
+extern int c2_add_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask);
+
+/* QPs */
+extern int c2_alloc_qp(struct c2_dev *c2dev, struct c2_pd *pd,
+ struct ib_qp_init_attr *qp_attrs, struct c2_qp *qp);
+extern void c2_free_qp(struct c2_dev *c2dev, struct c2_qp *qp);
+extern struct ib_qp *c2_get_qp(struct ib_device *device, int qpn);
+extern int c2_qp_modify(struct c2_dev *c2dev, struct c2_qp *qp,
+ struct ib_qp_attr *attr, int attr_mask);
+extern int c2_qp_set_read_limits(struct c2_dev *c2dev, struct c2_qp *qp,
+ int ord, int ird);
+extern int c2_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr,
+ struct ib_send_wr **bad_wr);
+extern int c2_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr,
+ struct ib_recv_wr **bad_wr);
+extern void c2_init_qp_table(struct c2_dev *c2dev);
+extern void c2_cleanup_qp_table(struct c2_dev *c2dev);
+extern void c2_set_qp_state(struct c2_qp *, int);
+extern struct c2_qp *c2_find_qpn(struct c2_dev *c2dev, int qpn);
+
+/* PDs */
+extern int c2_pd_alloc(struct c2_dev *c2dev, int privileged, struct c2_pd *pd);
+extern void c2_pd_free(struct c2_dev *c2dev, struct c2_pd *pd);
+extern int c2_init_pd_table(struct c2_dev *c2dev);
+extern void c2_cleanup_pd_table(struct c2_dev *c2dev);
+
+/* CQs */
+extern int c2_init_cq(struct c2_dev *c2dev, int entries,
+ struct c2_ucontext *ctx, struct c2_cq *cq);
+extern void c2_free_cq(struct c2_dev *c2dev, struct c2_cq *cq);
+extern void c2_cq_event(struct c2_dev *c2dev, u32 mq_index);
+extern void c2_cq_clean(struct c2_dev *c2dev, struct c2_qp *qp, u32 mq_index);
+extern int c2_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
+extern int c2_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
+
+/* CM */
+extern int c2_llp_connect(struct iw_cm_id *cm_id,
+ struct iw_cm_conn_param *iw_param);
+extern int c2_llp_accept(struct iw_cm_id *cm_id,
+ struct iw_cm_conn_param *iw_param);
+extern int c2_llp_reject(struct iw_cm_id *cm_id, const void *pdata,
+ u8 pdata_len);
+extern int c2_llp_service_create(struct iw_cm_id *cm_id, int backlog);
+extern int c2_llp_service_destroy(struct iw_cm_id *cm_id);
+
+/* MM */
+extern int c2_nsmr_register_phys_kern(struct c2_dev *c2dev, u64 *addr_list,
+ int page_size, int pbl_depth, u32 length,
+ u32 off, u64 *va, enum c2_acf acf,
+ struct c2_mr *mr);
+extern int c2_stag_dealloc(struct c2_dev *c2dev, u32 stag_index);
+
+/* AE */
+extern void c2_ae_event(struct c2_dev *c2dev, u32 mq_index);
+
+/* MQSP Allocator */
+extern int c2_init_mqsp_pool(struct c2_dev *c2dev, gfp_t gfp_mask,
+ struct sp_chunk **root);
+extern void c2_free_mqsp_pool(struct c2_dev *c2dev, struct sp_chunk *root);
+extern __be16 *c2_alloc_mqsp(struct c2_dev *c2dev, struct sp_chunk *head,
+ dma_addr_t *dma_addr, gfp_t gfp_mask);
+extern void c2_free_mqsp(__be16* mqsp);
+#endif
diff --git a/drivers/infiniband/hw/amso1100/c2_ae.c b/drivers/infiniband/hw/amso1100/c2_ae.c
new file mode 100644
index 000000000..cedda2523
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_ae.c
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "c2.h"
+#include <rdma/iw_cm.h>
+#include "c2_status.h"
+#include "c2_ae.h"
+
+static int c2_convert_cm_status(u32 c2_status)
+{
+ switch (c2_status) {
+ case C2_CONN_STATUS_SUCCESS:
+ return 0;
+ case C2_CONN_STATUS_REJECTED:
+ return -ENETRESET;
+ case C2_CONN_STATUS_REFUSED:
+ return -ECONNREFUSED;
+ case C2_CONN_STATUS_TIMEDOUT:
+ return -ETIMEDOUT;
+ case C2_CONN_STATUS_NETUNREACH:
+ return -ENETUNREACH;
+ case C2_CONN_STATUS_HOSTUNREACH:
+ return -EHOSTUNREACH;
+ case C2_CONN_STATUS_INVALID_RNIC:
+ return -EINVAL;
+ case C2_CONN_STATUS_INVALID_QP:
+ return -EINVAL;
+ case C2_CONN_STATUS_INVALID_QP_STATE:
+ return -EINVAL;
+ case C2_CONN_STATUS_ADDR_NOT_AVAIL:
+ return -EADDRNOTAVAIL;
+ default:
+ printk(KERN_ERR PFX
+ "%s - Unable to convert CM status: %d\n",
+ __func__, c2_status);
+ return -EIO;
+ }
+}
+
+static const char* to_event_str(int event)
+{
+ static const char* event_str[] = {
+ "CCAE_REMOTE_SHUTDOWN",
+ "CCAE_ACTIVE_CONNECT_RESULTS",
+ "CCAE_CONNECTION_REQUEST",
+ "CCAE_LLP_CLOSE_COMPLETE",
+ "CCAE_TERMINATE_MESSAGE_RECEIVED",
+ "CCAE_LLP_CONNECTION_RESET",
+ "CCAE_LLP_CONNECTION_LOST",
+ "CCAE_LLP_SEGMENT_SIZE_INVALID",
+ "CCAE_LLP_INVALID_CRC",
+ "CCAE_LLP_BAD_FPDU",
+ "CCAE_INVALID_DDP_VERSION",
+ "CCAE_INVALID_RDMA_VERSION",
+ "CCAE_UNEXPECTED_OPCODE",
+ "CCAE_INVALID_DDP_QUEUE_NUMBER",
+ "CCAE_RDMA_READ_NOT_ENABLED",
+ "CCAE_RDMA_WRITE_NOT_ENABLED",
+ "CCAE_RDMA_READ_TOO_SMALL",
+ "CCAE_NO_L_BIT",
+ "CCAE_TAGGED_INVALID_STAG",
+ "CCAE_TAGGED_BASE_BOUNDS_VIOLATION",
+ "CCAE_TAGGED_ACCESS_RIGHTS_VIOLATION",
+ "CCAE_TAGGED_INVALID_PD",
+ "CCAE_WRAP_ERROR",
+ "CCAE_BAD_CLOSE",
+ "CCAE_BAD_LLP_CLOSE",
+ "CCAE_INVALID_MSN_RANGE",
+ "CCAE_INVALID_MSN_GAP",
+ "CCAE_IRRQ_OVERFLOW",
+ "CCAE_IRRQ_MSN_GAP",
+ "CCAE_IRRQ_MSN_RANGE",
+ "CCAE_IRRQ_INVALID_STAG",
+ "CCAE_IRRQ_BASE_BOUNDS_VIOLATION",
+ "CCAE_IRRQ_ACCESS_RIGHTS_VIOLATION",
+ "CCAE_IRRQ_INVALID_PD",
+ "CCAE_IRRQ_WRAP_ERROR",
+ "CCAE_CQ_SQ_COMPLETION_OVERFLOW",
+ "CCAE_CQ_RQ_COMPLETION_ERROR",
+ "CCAE_QP_SRQ_WQE_ERROR",
+ "CCAE_QP_LOCAL_CATASTROPHIC_ERROR",
+ "CCAE_CQ_OVERFLOW",
+ "CCAE_CQ_OPERATION_ERROR",
+ "CCAE_SRQ_LIMIT_REACHED",
+ "CCAE_QP_RQ_LIMIT_REACHED",
+ "CCAE_SRQ_CATASTROPHIC_ERROR",
+ "CCAE_RNIC_CATASTROPHIC_ERROR"
+ };
+
+ if (event < CCAE_REMOTE_SHUTDOWN ||
+ event > CCAE_RNIC_CATASTROPHIC_ERROR)
+ return "<invalid event>";
+
+ event -= CCAE_REMOTE_SHUTDOWN;
+ return event_str[event];
+}
+
+static const char *to_qp_state_str(int state)
+{
+ switch (state) {
+ case C2_QP_STATE_IDLE:
+ return "C2_QP_STATE_IDLE";
+ case C2_QP_STATE_CONNECTING:
+ return "C2_QP_STATE_CONNECTING";
+ case C2_QP_STATE_RTS:
+ return "C2_QP_STATE_RTS";
+ case C2_QP_STATE_CLOSING:
+ return "C2_QP_STATE_CLOSING";
+ case C2_QP_STATE_TERMINATE:
+ return "C2_QP_STATE_TERMINATE";
+ case C2_QP_STATE_ERROR:
+ return "C2_QP_STATE_ERROR";
+ default:
+ return "<invalid QP state>";
+ }
+}
+
+void c2_ae_event(struct c2_dev *c2dev, u32 mq_index)
+{
+ struct c2_mq *mq = c2dev->qptr_array[mq_index];
+ union c2wr *wr;
+ void *resource_user_context;
+ struct iw_cm_event cm_event;
+ struct ib_event ib_event;
+ enum c2_resource_indicator resource_indicator;
+ enum c2_event_id event_id;
+ unsigned long flags;
+ int status;
+ struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_event.local_addr;
+ struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_event.remote_addr;
+
+ /*
+ * retrieve the message
+ */
+ wr = c2_mq_consume(mq);
+ if (!wr)
+ return;
+
+ memset(&ib_event, 0, sizeof(ib_event));
+ memset(&cm_event, 0, sizeof(cm_event));
+
+ event_id = c2_wr_get_id(wr);
+ resource_indicator = be32_to_cpu(wr->ae.ae_generic.resource_type);
+ resource_user_context =
+ (void *) (unsigned long) wr->ae.ae_generic.user_context;
+
+ status = cm_event.status = c2_convert_cm_status(c2_wr_get_result(wr));
+
+ pr_debug("event received c2_dev=%p, event_id=%d, "
+ "resource_indicator=%d, user_context=%p, status = %d\n",
+ c2dev, event_id, resource_indicator, resource_user_context,
+ status);
+
+ switch (resource_indicator) {
+ case C2_RES_IND_QP:{
+
+ struct c2_qp *qp = (struct c2_qp *)resource_user_context;
+ struct iw_cm_id *cm_id = qp->cm_id;
+ struct c2wr_ae_active_connect_results *res;
+
+ if (!cm_id) {
+ pr_debug("event received, but cm_id is <nul>, qp=%p!\n",
+ qp);
+ goto ignore_it;
+ }
+ pr_debug("%s: event = %s, user_context=%llx, "
+ "resource_type=%x, "
+ "resource=%x, qp_state=%s\n",
+ __func__,
+ to_event_str(event_id),
+ (unsigned long long) wr->ae.ae_generic.user_context,
+ be32_to_cpu(wr->ae.ae_generic.resource_type),
+ be32_to_cpu(wr->ae.ae_generic.resource),
+ to_qp_state_str(be32_to_cpu(wr->ae.ae_generic.qp_state)));
+
+ c2_set_qp_state(qp, be32_to_cpu(wr->ae.ae_generic.qp_state));
+
+ switch (event_id) {
+ case CCAE_ACTIVE_CONNECT_RESULTS:
+ res = &wr->ae.ae_active_connect_results;
+ cm_event.event = IW_CM_EVENT_CONNECT_REPLY;
+ laddr->sin_addr.s_addr = res->laddr;
+ raddr->sin_addr.s_addr = res->raddr;
+ laddr->sin_port = res->lport;
+ raddr->sin_port = res->rport;
+ if (status == 0) {
+ cm_event.private_data_len =
+ be32_to_cpu(res->private_data_length);
+ cm_event.private_data = res->private_data;
+ } else {
+ spin_lock_irqsave(&qp->lock, flags);
+ if (qp->cm_id) {
+ qp->cm_id->rem_ref(qp->cm_id);
+ qp->cm_id = NULL;
+ }
+ spin_unlock_irqrestore(&qp->lock, flags);
+ cm_event.private_data_len = 0;
+ cm_event.private_data = NULL;
+ }
+ if (cm_id->event_handler)
+ cm_id->event_handler(cm_id, &cm_event);
+ break;
+ case CCAE_TERMINATE_MESSAGE_RECEIVED:
+ case CCAE_CQ_SQ_COMPLETION_OVERFLOW:
+ ib_event.device = &c2dev->ibdev;
+ ib_event.element.qp = &qp->ibqp;
+ ib_event.event = IB_EVENT_QP_REQ_ERR;
+
+ if (qp->ibqp.event_handler)
+ qp->ibqp.event_handler(&ib_event,
+						       qp->ibqp.qp_context);
+ break;
+ case CCAE_BAD_CLOSE:
+ case CCAE_LLP_CLOSE_COMPLETE:
+ case CCAE_LLP_CONNECTION_RESET:
+ case CCAE_LLP_CONNECTION_LOST:
+ BUG_ON(cm_id->event_handler==(void*)0x6b6b6b6b);
+
+ spin_lock_irqsave(&qp->lock, flags);
+ if (qp->cm_id) {
+ qp->cm_id->rem_ref(qp->cm_id);
+ qp->cm_id = NULL;
+ }
+ spin_unlock_irqrestore(&qp->lock, flags);
+ cm_event.event = IW_CM_EVENT_CLOSE;
+ cm_event.status = 0;
+ if (cm_id->event_handler)
+ cm_id->event_handler(cm_id, &cm_event);
+ break;
+ default:
+ BUG_ON(1);
+ pr_debug("%s:%d Unexpected event_id=%d on QP=%p, "
+ "CM_ID=%p\n",
+ __func__, __LINE__,
+ event_id, qp, cm_id);
+ break;
+ }
+ break;
+ }
+
+ case C2_RES_IND_EP:{
+
+ struct c2wr_ae_connection_request *req =
+ &wr->ae.ae_connection_request;
+ struct iw_cm_id *cm_id =
+ (struct iw_cm_id *)resource_user_context;
+
+ pr_debug("C2_RES_IND_EP event_id=%d\n", event_id);
+ if (event_id != CCAE_CONNECTION_REQUEST) {
+ pr_debug("%s: Invalid event_id: %d\n",
+ __func__, event_id);
+ break;
+ }
+ cm_event.event = IW_CM_EVENT_CONNECT_REQUEST;
+ cm_event.provider_data = (void*)(unsigned long)req->cr_handle;
+ laddr->sin_addr.s_addr = req->laddr;
+ raddr->sin_addr.s_addr = req->raddr;
+ laddr->sin_port = req->lport;
+ raddr->sin_port = req->rport;
+ cm_event.private_data_len =
+ be32_to_cpu(req->private_data_length);
+ cm_event.private_data = req->private_data;
+ /*
+ * Until ird/ord negotiation via MPAv2 support is added, send
+ * max supported values
+ */
+ cm_event.ird = cm_event.ord = 128;
+
+ if (cm_id->event_handler)
+ cm_id->event_handler(cm_id, &cm_event);
+ break;
+ }
+
+ case C2_RES_IND_CQ:{
+ struct c2_cq *cq =
+ (struct c2_cq *) resource_user_context;
+
+ pr_debug("IB_EVENT_CQ_ERR\n");
+ ib_event.device = &c2dev->ibdev;
+ ib_event.element.cq = &cq->ibcq;
+ ib_event.event = IB_EVENT_CQ_ERR;
+
+ if (cq->ibcq.event_handler)
+ cq->ibcq.event_handler(&ib_event,
+ cq->ibcq.cq_context);
+ break;
+ }
+
+ default:
+ printk("Bad resource indicator = %d\n",
+ resource_indicator);
+ break;
+ }
+
+ ignore_it:
+ c2_mq_free(mq);
+}
diff --git a/drivers/infiniband/hw/amso1100/c2_ae.h b/drivers/infiniband/hw/amso1100/c2_ae.h
new file mode 100644
index 000000000..3a065c33b
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_ae.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _C2_AE_H_
+#define _C2_AE_H_
+
+/*
+ * WARNING: If you change this file, also bump C2_IVN_BASE
+ * in common/include/clustercore/c2_ivn.h.
+ */
+
+/*
+ * Asynchronous Event Identifiers
+ *
+ * These start at 0x80 only so it's obvious from inspection that
+ * they are not work-request statuses. This isn't critical.
+ *
+ * NOTE: these event id's must fit in eight bits.
+ */
+enum c2_event_id {
+ CCAE_REMOTE_SHUTDOWN = 0x80,
+ CCAE_ACTIVE_CONNECT_RESULTS,
+ CCAE_CONNECTION_REQUEST,
+ CCAE_LLP_CLOSE_COMPLETE,
+ CCAE_TERMINATE_MESSAGE_RECEIVED,
+ CCAE_LLP_CONNECTION_RESET,
+ CCAE_LLP_CONNECTION_LOST,
+ CCAE_LLP_SEGMENT_SIZE_INVALID,
+ CCAE_LLP_INVALID_CRC,
+ CCAE_LLP_BAD_FPDU,
+ CCAE_INVALID_DDP_VERSION,
+ CCAE_INVALID_RDMA_VERSION,
+ CCAE_UNEXPECTED_OPCODE,
+ CCAE_INVALID_DDP_QUEUE_NUMBER,
+ CCAE_RDMA_READ_NOT_ENABLED,
+ CCAE_RDMA_WRITE_NOT_ENABLED,
+ CCAE_RDMA_READ_TOO_SMALL,
+ CCAE_NO_L_BIT,
+ CCAE_TAGGED_INVALID_STAG,
+ CCAE_TAGGED_BASE_BOUNDS_VIOLATION,
+ CCAE_TAGGED_ACCESS_RIGHTS_VIOLATION,
+ CCAE_TAGGED_INVALID_PD,
+ CCAE_WRAP_ERROR,
+ CCAE_BAD_CLOSE,
+ CCAE_BAD_LLP_CLOSE,
+ CCAE_INVALID_MSN_RANGE,
+ CCAE_INVALID_MSN_GAP,
+ CCAE_IRRQ_OVERFLOW,
+ CCAE_IRRQ_MSN_GAP,
+ CCAE_IRRQ_MSN_RANGE,
+ CCAE_IRRQ_INVALID_STAG,
+ CCAE_IRRQ_BASE_BOUNDS_VIOLATION,
+ CCAE_IRRQ_ACCESS_RIGHTS_VIOLATION,
+ CCAE_IRRQ_INVALID_PD,
+ CCAE_IRRQ_WRAP_ERROR,
+ CCAE_CQ_SQ_COMPLETION_OVERFLOW,
+ CCAE_CQ_RQ_COMPLETION_ERROR,
+ CCAE_QP_SRQ_WQE_ERROR,
+ CCAE_QP_LOCAL_CATASTROPHIC_ERROR,
+ CCAE_CQ_OVERFLOW,
+ CCAE_CQ_OPERATION_ERROR,
+ CCAE_SRQ_LIMIT_REACHED,
+ CCAE_QP_RQ_LIMIT_REACHED,
+ CCAE_SRQ_CATASTROPHIC_ERROR,
+ CCAE_RNIC_CATASTROPHIC_ERROR
+/* WARNING: if you add more IDs, make sure their values fit in eight bits. */
+};
+
+/*
+ * Resource Indicators and Identifiers
+ */
+enum c2_resource_indicator {
+ C2_RES_IND_QP = 1,
+ C2_RES_IND_EP,
+ C2_RES_IND_CQ,
+ C2_RES_IND_SRQ,
+};
+
+#endif /* _C2_AE_H_ */
diff --git a/drivers/infiniband/hw/amso1100/c2_alloc.c b/drivers/infiniband/hw/amso1100/c2_alloc.c
new file mode 100644
index 000000000..78d247ec6
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_alloc.c
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/bitmap.h>
+
+#include "c2.h"
+
+static int c2_alloc_mqsp_chunk(struct c2_dev *c2dev, gfp_t gfp_mask,
+ struct sp_chunk **head)
+{
+ int i;
+ struct sp_chunk *new_head;
+ dma_addr_t dma_addr;
+
+ new_head = dma_alloc_coherent(&c2dev->pcidev->dev, PAGE_SIZE,
+ &dma_addr, gfp_mask);
+ if (new_head == NULL)
+ return -ENOMEM;
+
+ new_head->dma_addr = dma_addr;
+ dma_unmap_addr_set(new_head, mapping, new_head->dma_addr);
+
+ new_head->next = NULL;
+ new_head->head = 0;
+
+ /* build list where each index is the next free slot */
+ for (i = 0;
+ i < (PAGE_SIZE - sizeof(struct sp_chunk) -
+ sizeof(u16)) / sizeof(u16) - 1;
+ i++) {
+ new_head->shared_ptr[i] = i + 1;
+ }
+ /* terminate list */
+ new_head->shared_ptr[i] = 0xFFFF;
+
+ *head = new_head;
+ return 0;
+}
+
+int c2_init_mqsp_pool(struct c2_dev *c2dev, gfp_t gfp_mask,
+ struct sp_chunk **root)
+{
+ return c2_alloc_mqsp_chunk(c2dev, gfp_mask, root);
+}
+
+void c2_free_mqsp_pool(struct c2_dev *c2dev, struct sp_chunk *root)
+{
+ struct sp_chunk *next;
+
+ while (root) {
+ next = root->next;
+ dma_free_coherent(&c2dev->pcidev->dev, PAGE_SIZE, root,
+ dma_unmap_addr(root, mapping));
+ root = next;
+ }
+}
+
+__be16 *c2_alloc_mqsp(struct c2_dev *c2dev, struct sp_chunk *head,
+ dma_addr_t *dma_addr, gfp_t gfp_mask)
+{
+ u16 mqsp;
+
+ while (head) {
+ mqsp = head->head;
+ if (mqsp != 0xFFFF) {
+ head->head = head->shared_ptr[mqsp];
+ break;
+ } else if (head->next == NULL) {
+ if (c2_alloc_mqsp_chunk(c2dev, gfp_mask, &head->next) ==
+ 0) {
+ head = head->next;
+ mqsp = head->head;
+ head->head = head->shared_ptr[mqsp];
+ break;
+ } else
+ return NULL;
+ } else
+ head = head->next;
+ }
+ if (head) {
+ *dma_addr = head->dma_addr +
+ ((unsigned long) &(head->shared_ptr[mqsp]) -
+ (unsigned long) head);
+ pr_debug("%s addr %p dma_addr %llx\n", __func__,
+ &(head->shared_ptr[mqsp]), (unsigned long long) *dma_addr);
+ return (__force __be16 *) &(head->shared_ptr[mqsp]);
+ }
+ return NULL;
+}
+
+void c2_free_mqsp(__be16 *mqsp)
+{
+ struct sp_chunk *head;
+ u16 idx;
+
+ /* The chunk containing this ptr begins at the page boundary */
+ head = (struct sp_chunk *) ((unsigned long) mqsp & PAGE_MASK);
+
+ /* Link head to new mqsp */
+ *mqsp = (__force __be16) head->head;
+
+ /* Compute the shared_ptr index */
+ idx = ((unsigned long) mqsp & ~PAGE_MASK) >> 1;
+ idx -= (unsigned long) &(((struct sp_chunk *) 0)->shared_ptr[0]) >> 1;
+
+ /* Point this index at the head */
+ head->shared_ptr[idx] = head->head;
+
+ /* Point head at this index */
+ head->head = idx;
+}
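
The pool above chains free 16-bit slots through the page itself: each slot
stores the index of the next free slot, 0xFFFF terminates the chain,
allocation pops from the head and freeing pushes the slot back on. A minimal
userspace sketch of that index-chained free list; the names, NSLOTS and the
static storage are illustrative, not the driver's DMA-coherent pages:

    /* Index-chained free list: slot[i] holds the next free index. */
    #include <stdint.h>
    #include <stdio.h>

    #define NSLOTS 8
    #define END    0xFFFF

    static uint16_t slot[NSLOTS];
    static uint16_t head;

    static void pool_init(void)
    {
        for (int i = 0; i < NSLOTS - 1; i++)
            slot[i] = i + 1;        /* each slot points at the next free one */
        slot[NSLOTS - 1] = END;     /* terminate the chain */
        head = 0;
    }

    static int pool_alloc(void)
    {
        if (head == END)
            return -1;              /* pool exhausted */
        int idx = head;
        head = slot[idx];           /* pop: head follows the chain */
        return idx;
    }

    static void pool_free(int idx)
    {
        slot[idx] = head;           /* push: freed slot points at old head */
        head = idx;
    }

    int main(void)
    {
        pool_init();
        int a = pool_alloc(), b = pool_alloc();
        printf("got %d and %d\n", a, b);
        pool_free(a);
        printf("reused %d\n", pool_alloc());    /* hands back the freed slot */
        return 0;
    }

Keeping the links inside the slots themselves is what lets the driver carve
shared-pointer words out of a single DMA page with no separate bookkeeping.
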
diff --git a/drivers/infiniband/hw/amso1100/c2_cm.c b/drivers/infiniband/hw/amso1100/c2_cm.c
new file mode 100644
index 000000000..23bfa94fb
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_cm.c
@@ -0,0 +1,461 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/slab.h>
+
+#include "c2.h"
+#include "c2_wr.h"
+#include "c2_vq.h"
+#include <rdma/iw_cm.h>
+
+int c2_llp_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+ struct c2_dev *c2dev = to_c2dev(cm_id->device);
+ struct ib_qp *ibqp;
+ struct c2_qp *qp;
+ struct c2wr_qp_connect_req *wr; /* variable size needs a malloc. */
+ struct c2_vq_req *vq_req;
+ int err;
+ struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr;
+
+ if (cm_id->remote_addr.ss_family != AF_INET)
+ return -ENOSYS;
+
+ ibqp = c2_get_qp(cm_id->device, iw_param->qpn);
+ if (!ibqp)
+ return -EINVAL;
+ qp = to_c2qp(ibqp);
+
+ /* Associate QP <--> CM_ID */
+ cm_id->provider_data = qp;
+ cm_id->add_ref(cm_id);
+ qp->cm_id = cm_id;
+
+ /*
+ * Only private_data lengths up to C2_MAX_PRIVATE_DATA_SIZE are supported.
+ */
+ if (iw_param->private_data_len > C2_MAX_PRIVATE_DATA_SIZE) {
+ err = -EINVAL;
+ goto bail0;
+ }
+ /*
+ * Set the rdma read limits
+ */
+ err = c2_qp_set_read_limits(c2dev, qp, iw_param->ord, iw_param->ird);
+ if (err)
+ goto bail0;
+
+ /*
+ * Create and send a WR_QP_CONNECT...
+ */
+ wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+ if (!wr) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req) {
+ err = -ENOMEM;
+ goto bail1;
+ }
+
+ c2_wr_set_id(wr, CCWR_QP_CONNECT);
+ wr->hdr.context = 0;
+ wr->rnic_handle = c2dev->adapter_handle;
+ wr->qp_handle = qp->adapter_handle;
+
+ wr->remote_addr = raddr->sin_addr.s_addr;
+ wr->remote_port = raddr->sin_port;
+
+ /*
+ * Move any private data from the caller's buf into
+ * the WR.
+ */
+ if (iw_param->private_data) {
+ wr->private_data_length =
+ cpu_to_be32(iw_param->private_data_len);
+ memcpy(&wr->private_data[0], iw_param->private_data,
+ iw_param->private_data_len);
+ } else
+ wr->private_data_length = 0;
+
+ /*
+ * Send WR to adapter. NOTE: There is no synchronous reply from
+ * the adapter.
+ */
+ err = vq_send_wr(c2dev, (union c2wr *) wr);
+ vq_req_free(c2dev, vq_req);
+
+ bail1:
+ kfree(wr);
+ bail0:
+ if (err) {
+ /*
+ * If we fail, release reference on QP and
+ * disassociate QP from CM_ID
+ */
+ cm_id->provider_data = NULL;
+ qp->cm_id = NULL;
+ cm_id->rem_ref(cm_id);
+ }
+ return err;
+}
+
+int c2_llp_service_create(struct iw_cm_id *cm_id, int backlog)
+{
+ struct c2_dev *c2dev;
+ struct c2wr_ep_listen_create_req wr;
+ struct c2wr_ep_listen_create_rep *reply;
+ struct c2_vq_req *vq_req;
+ int err;
+ struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr;
+
+ if (cm_id->local_addr.ss_family != AF_INET)
+ return -ENOSYS;
+
+ c2dev = to_c2dev(cm_id->device);
+ if (c2dev == NULL)
+ return -EINVAL;
+
+ /*
+ * Allocate verbs request.
+ */
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req)
+ return -ENOMEM;
+
+ /*
+ * Build the WR
+ */
+ c2_wr_set_id(&wr, CCWR_EP_LISTEN_CREATE);
+ wr.hdr.context = (u64) (unsigned long) vq_req;
+ wr.rnic_handle = c2dev->adapter_handle;
+ wr.local_addr = laddr->sin_addr.s_addr;
+ wr.local_port = laddr->sin_port;
+ wr.backlog = cpu_to_be32(backlog);
+ wr.user_context = (u64) (unsigned long) cm_id;
+
+ /*
+ * Reference the request struct. Dereferenced in the int handler.
+ */
+ vq_req_get(c2dev, vq_req);
+
+ /*
+ * Send WR to adapter
+ */
+ err = vq_send_wr(c2dev, (union c2wr *) &wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail0;
+ }
+
+ /*
+ * Wait for reply from adapter
+ */
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err)
+ goto bail0;
+
+ /*
+ * Process reply
+ */
+ reply =
+ (struct c2wr_ep_listen_create_rep *) (unsigned long) vq_req->reply_msg;
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail1;
+ }
+
+ if ((err = c2_errno(reply)) != 0)
+ goto bail1;
+
+ /*
+ * Keep the adapter handle. Used in subsequent destroy
+ */
+ cm_id->provider_data = (void *) (unsigned long) reply->ep_handle;
+
+ /*
+ * free vq stuff
+ */
+ vq_repbuf_free(c2dev, reply);
+ vq_req_free(c2dev, vq_req);
+
+ return 0;
+
+ bail1:
+ vq_repbuf_free(c2dev, reply);
+ bail0:
+ vq_req_free(c2dev, vq_req);
+ return err;
+}
+
+
+int c2_llp_service_destroy(struct iw_cm_id *cm_id)
+{
+
+ struct c2_dev *c2dev;
+ struct c2wr_ep_listen_destroy_req wr;
+ struct c2wr_ep_listen_destroy_rep *reply;
+ struct c2_vq_req *vq_req;
+ int err;
+
+ c2dev = to_c2dev(cm_id->device);
+ if (c2dev == NULL)
+ return -EINVAL;
+
+ /*
+ * Allocate verbs request.
+ */
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req)
+ return -ENOMEM;
+
+ /*
+ * Build the WR
+ */
+ c2_wr_set_id(&wr, CCWR_EP_LISTEN_DESTROY);
+ wr.hdr.context = (unsigned long) vq_req;
+ wr.rnic_handle = c2dev->adapter_handle;
+ wr.ep_handle = (u32) (unsigned long) cm_id->provider_data;
+
+ /*
+ * reference the request struct. dereferenced in the int handler.
+ */
+ vq_req_get(c2dev, vq_req);
+
+ /*
+ * Send WR to adapter
+ */
+ err = vq_send_wr(c2dev, (union c2wr *) &wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail0;
+ }
+
+ /*
+ * Wait for reply from adapter
+ */
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err)
+ goto bail0;
+
+ /*
+ * Process reply
+ */
+ reply = (struct c2wr_ep_listen_destroy_rep *) (unsigned long) vq_req->reply_msg;
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+ if ((err = c2_errno(reply)) != 0)
+ goto bail1;
+
+ bail1:
+ vq_repbuf_free(c2dev, reply);
+ bail0:
+ vq_req_free(c2dev, vq_req);
+ return err;
+}
+
+int c2_llp_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+ struct c2_dev *c2dev = to_c2dev(cm_id->device);
+ struct c2_qp *qp;
+ struct ib_qp *ibqp;
+ struct c2wr_cr_accept_req *wr; /* variable length WR */
+ struct c2_vq_req *vq_req;
+ struct c2wr_cr_accept_rep *reply; /* VQ Reply msg ptr. */
+ int err;
+
+ ibqp = c2_get_qp(cm_id->device, iw_param->qpn);
+ if (!ibqp)
+ return -EINVAL;
+ qp = to_c2qp(ibqp);
+
+ /* Set the RDMA read limits */
+ err = c2_qp_set_read_limits(c2dev, qp, iw_param->ord, iw_param->ird);
+ if (err)
+ goto bail0;
+
+ /* Allocate verbs request. */
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+ vq_req->qp = qp;
+ vq_req->cm_id = cm_id;
+ vq_req->event = IW_CM_EVENT_ESTABLISHED;
+
+ wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+ if (!wr) {
+ err = -ENOMEM;
+ goto bail1;
+ }
+
+ /* Build the WR */
+ c2_wr_set_id(wr, CCWR_CR_ACCEPT);
+ wr->hdr.context = (unsigned long) vq_req;
+ wr->rnic_handle = c2dev->adapter_handle;
+ wr->ep_handle = (u32) (unsigned long) cm_id->provider_data;
+ wr->qp_handle = qp->adapter_handle;
+
+ /* Replace the cr_handle with the QP after accept */
+ cm_id->provider_data = qp;
+ cm_id->add_ref(cm_id);
+ qp->cm_id = cm_id;
+
+ /* Validate private_data length */
+ if (iw_param->private_data_len > C2_MAX_PRIVATE_DATA_SIZE) {
+ err = -EINVAL;
+ goto bail1;
+ }
+
+ if (iw_param->private_data) {
+ wr->private_data_length = cpu_to_be32(iw_param->private_data_len);
+ memcpy(&wr->private_data[0],
+ iw_param->private_data, iw_param->private_data_len);
+ } else
+ wr->private_data_length = 0;
+
+ /* Reference the request struct. Dereferenced in the int handler. */
+ vq_req_get(c2dev, vq_req);
+
+ /* Send WR to adapter */
+ err = vq_send_wr(c2dev, (union c2wr *) wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail1;
+ }
+
+ /* Wait for reply from adapter */
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err)
+ goto bail1;
+
+ /* Check that reply is present */
+ reply = (struct c2wr_cr_accept_rep *) (unsigned long) vq_req->reply_msg;
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail1;
+ }
+
+ err = c2_errno(reply);
+ vq_repbuf_free(c2dev, reply);
+
+ if (!err)
+ c2_set_qp_state(qp, C2_QP_STATE_RTS);
+ bail1:
+ kfree(wr);
+ vq_req_free(c2dev, vq_req);
+ bail0:
+ if (err) {
+ /*
+ * If we fail, release reference on QP and
+ * disassociate QP from CM_ID
+ */
+ cm_id->provider_data = NULL;
+ qp->cm_id = NULL;
+ cm_id->rem_ref(cm_id);
+ }
+ return err;
+}
+
+int c2_llp_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
+{
+ struct c2_dev *c2dev;
+ struct c2wr_cr_reject_req wr;
+ struct c2_vq_req *vq_req;
+ struct c2wr_cr_reject_rep *reply;
+ int err;
+
+ c2dev = to_c2dev(cm_id->device);
+
+ /*
+ * Allocate verbs request.
+ */
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req)
+ return -ENOMEM;
+
+ /*
+ * Build the WR
+ */
+ c2_wr_set_id(&wr, CCWR_CR_REJECT);
+ wr.hdr.context = (unsigned long) vq_req;
+ wr.rnic_handle = c2dev->adapter_handle;
+ wr.ep_handle = (u32) (unsigned long) cm_id->provider_data;
+
+ /*
+ * reference the request struct. dereferenced in the int handler.
+ */
+ vq_req_get(c2dev, vq_req);
+
+ /*
+ * Send WR to adapter
+ */
+ err = vq_send_wr(c2dev, (union c2wr *) &wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail0;
+ }
+
+ /*
+ * Wait for reply from adapter
+ */
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err)
+ goto bail0;
+
+ /*
+ * Process reply
+ */
+ reply = (struct c2wr_cr_reject_rep *) (unsigned long)
+ vq_req->reply_msg;
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+ err = c2_errno(reply);
+ /*
+ * free vq stuff
+ */
+ vq_repbuf_free(c2dev, reply);
+
+ bail0:
+ vq_req_free(c2dev, vq_req);
+ return err;
+}
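
Every CM call above follows the same verbs-queue pattern: allocate a vq_req,
build a WR whose context points back at it, take an extra reference before
sending so the interrupt handler and the waiting caller can each drop one
safely, wait for the reply, then clean up. A small userspace sketch of just
the ownership half of that pattern, with C11 atomics standing in for
vq_req_get()/vq_req_put(); demo_req and fake_completion are illustrative
names, not driver symbols:

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct demo_req {
        atomic_int refcount;
        int reply;                  /* filled in by the "interrupt" side */
    };

    static struct demo_req *req_alloc(void)
    {
        struct demo_req *r = calloc(1, sizeof(*r));
        if (r)
            atomic_init(&r->refcount, 1);
        return r;
    }

    static void req_get(struct demo_req *r) { atomic_fetch_add(&r->refcount, 1); }

    static void req_put(struct demo_req *r)
    {
        if (atomic_fetch_sub(&r->refcount, 1) == 1)
            free(r);                /* last reference frees the request */
    }

    /* Stands in for the reply-VQ interrupt handler completing the request. */
    static void fake_completion(struct demo_req *r)
    {
        r->reply = 42;
        req_put(r);                 /* drop the reference taken before send */
    }

    int main(void)
    {
        struct demo_req *r = req_alloc();
        if (!r)
            return 1;
        req_get(r);                 /* extra ref covers the in-flight request */
        fake_completion(r);         /* in the driver this runs in irq context */
        printf("reply=%d\n", r->reply);
        req_put(r);                 /* waiter drops its own reference */
        return 0;
    }

The extra reference is what makes the error paths above safe: whichever of
the two sides finishes last is the one that actually releases the request.
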
diff --git a/drivers/infiniband/hw/amso1100/c2_cq.c b/drivers/infiniband/hw/amso1100/c2_cq.c
new file mode 100644
index 000000000..1b63185b4
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_cq.c
@@ -0,0 +1,440 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/gfp.h>
+
+#include "c2.h"
+#include "c2_vq.h"
+#include "c2_status.h"
+
+#define C2_CQ_MSG_SIZE ((sizeof(struct c2wr_ce) + 32-1) & ~(32-1))
+
+static struct c2_cq *c2_cq_get(struct c2_dev *c2dev, int cqn)
+{
+ struct c2_cq *cq;
+ unsigned long flags;
+
+ spin_lock_irqsave(&c2dev->lock, flags);
+ cq = c2dev->qptr_array[cqn];
+ if (!cq) {
+ spin_unlock_irqrestore(&c2dev->lock, flags);
+ return NULL;
+ }
+ atomic_inc(&cq->refcount);
+ spin_unlock_irqrestore(&c2dev->lock, flags);
+ return cq;
+}
+
+static void c2_cq_put(struct c2_cq *cq)
+{
+ if (atomic_dec_and_test(&cq->refcount))
+ wake_up(&cq->wait);
+}
+
+void c2_cq_event(struct c2_dev *c2dev, u32 mq_index)
+{
+ struct c2_cq *cq;
+
+ cq = c2_cq_get(c2dev, mq_index);
+ if (!cq) {
+ printk("discarding events on destroyed CQN=%d\n", mq_index);
+ return;
+ }
+
+ (*cq->ibcq.comp_handler) (&cq->ibcq, cq->ibcq.cq_context);
+ c2_cq_put(cq);
+}
+
+void c2_cq_clean(struct c2_dev *c2dev, struct c2_qp *qp, u32 mq_index)
+{
+ struct c2_cq *cq;
+ struct c2_mq *q;
+
+ cq = c2_cq_get(c2dev, mq_index);
+ if (!cq)
+ return;
+
+ spin_lock_irq(&cq->lock);
+ q = &cq->mq;
+ if (q && !c2_mq_empty(q)) {
+ u16 priv = q->priv;
+ struct c2wr_ce *msg;
+
+ while (priv != be16_to_cpu(*q->shared)) {
+ msg = (struct c2wr_ce *)
+ (q->msg_pool.host + priv * q->msg_size);
+ if (msg->qp_user_context == (u64) (unsigned long) qp) {
+ msg->qp_user_context = (u64) 0;
+ }
+ priv = (priv + 1) % q->q_size;
+ }
+ }
+ spin_unlock_irq(&cq->lock);
+ c2_cq_put(cq);
+}
+
+static inline enum ib_wc_status c2_cqe_status_to_openib(u8 status)
+{
+ switch (status) {
+ case C2_OK:
+ return IB_WC_SUCCESS;
+ case CCERR_FLUSHED:
+ return IB_WC_WR_FLUSH_ERR;
+ case CCERR_BASE_AND_BOUNDS_VIOLATION:
+ return IB_WC_LOC_PROT_ERR;
+ case CCERR_ACCESS_VIOLATION:
+ return IB_WC_LOC_ACCESS_ERR;
+ case CCERR_TOTAL_LENGTH_TOO_BIG:
+ return IB_WC_LOC_LEN_ERR;
+ case CCERR_INVALID_WINDOW:
+ return IB_WC_MW_BIND_ERR;
+ default:
+ return IB_WC_GENERAL_ERR;
+ }
+}
+
+
+static inline int c2_poll_one(struct c2_dev *c2dev,
+ struct c2_cq *cq, struct ib_wc *entry)
+{
+ struct c2wr_ce *ce;
+ struct c2_qp *qp;
+ int is_recv = 0;
+
+ ce = c2_mq_consume(&cq->mq);
+ if (!ce) {
+ return -EAGAIN;
+ }
+
+ /*
+ * If the qp returned is null then this qp has already
+ * been freed and we are unable to process the completion.
+ * Try pulling the next message.
+ */
+ while ((qp =
+ (struct c2_qp *) (unsigned long) ce->qp_user_context) == NULL) {
+ c2_mq_free(&cq->mq);
+ ce = c2_mq_consume(&cq->mq);
+ if (!ce)
+ return -EAGAIN;
+ }
+
+ entry->status = c2_cqe_status_to_openib(c2_wr_get_result(ce));
+ entry->wr_id = ce->hdr.context;
+ entry->qp = &qp->ibqp;
+ entry->wc_flags = 0;
+ entry->slid = 0;
+ entry->sl = 0;
+ entry->src_qp = 0;
+ entry->dlid_path_bits = 0;
+ entry->pkey_index = 0;
+
+ switch (c2_wr_get_id(ce)) {
+ case C2_WR_TYPE_SEND:
+ entry->opcode = IB_WC_SEND;
+ break;
+ case C2_WR_TYPE_RDMA_WRITE:
+ entry->opcode = IB_WC_RDMA_WRITE;
+ break;
+ case C2_WR_TYPE_RDMA_READ:
+ entry->opcode = IB_WC_RDMA_READ;
+ break;
+ case C2_WR_TYPE_BIND_MW:
+ entry->opcode = IB_WC_BIND_MW;
+ break;
+ case C2_WR_TYPE_RECV:
+ entry->byte_len = be32_to_cpu(ce->bytes_rcvd);
+ entry->opcode = IB_WC_RECV;
+ is_recv = 1;
+ break;
+ default:
+ break;
+ }
+
+ /* consume the WQEs */
+ if (is_recv)
+ c2_mq_lconsume(&qp->rq_mq, 1);
+ else
+ c2_mq_lconsume(&qp->sq_mq,
+ be32_to_cpu(c2_wr_get_wqe_count(ce)) + 1);
+
+ /* free the message */
+ c2_mq_free(&cq->mq);
+
+ return 0;
+}
+
+int c2_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
+{
+ struct c2_dev *c2dev = to_c2dev(ibcq->device);
+ struct c2_cq *cq = to_c2cq(ibcq);
+ unsigned long flags;
+ int npolled, err;
+
+ spin_lock_irqsave(&cq->lock, flags);
+
+ for (npolled = 0; npolled < num_entries; ++npolled) {
+
+ err = c2_poll_one(c2dev, cq, entry + npolled);
+ if (err)
+ break;
+ }
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+ return npolled;
+}
+
+int c2_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
+{
+ struct c2_mq_shared __iomem *shared;
+ struct c2_cq *cq;
+ unsigned long flags;
+ int ret = 0;
+
+ cq = to_c2cq(ibcq);
+ shared = cq->mq.peer;
+
+ if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_NEXT_COMP)
+ writeb(C2_CQ_NOTIFICATION_TYPE_NEXT, &shared->notification_type);
+ else if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
+ writeb(C2_CQ_NOTIFICATION_TYPE_NEXT_SE, &shared->notification_type);
+ else
+ return -EINVAL;
+
+ writeb(CQ_WAIT_FOR_DMA | CQ_ARMED, &shared->armed);
+
+ /*
+ * Now read back shared->armed to make the PCI
+ * write synchronous. This is necessary for
+ * correct cq notification semantics.
+ */
+ readb(&shared->armed);
+
+ if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) {
+ spin_lock_irqsave(&cq->lock, flags);
+ ret = !c2_mq_empty(&cq->mq);
+ spin_unlock_irqrestore(&cq->lock, flags);
+ }
+
+ return ret;
+}
+
+static void c2_free_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq)
+{
+ dma_free_coherent(&c2dev->pcidev->dev, mq->q_size * mq->msg_size,
+ mq->msg_pool.host, dma_unmap_addr(mq, mapping));
+}
+
+static int c2_alloc_cq_buf(struct c2_dev *c2dev, struct c2_mq *mq,
+ size_t q_size, size_t msg_size)
+{
+ u8 *pool_start;
+
+ if (q_size > SIZE_MAX / msg_size)
+ return -EINVAL;
+
+ pool_start = dma_alloc_coherent(&c2dev->pcidev->dev, q_size * msg_size,
+ &mq->host_dma, GFP_KERNEL);
+ if (!pool_start)
+ return -ENOMEM;
+
+ c2_mq_rep_init(mq,
+ 0, /* index (currently unknown) */
+ q_size,
+ msg_size,
+ pool_start,
+ NULL, /* peer (currently unknown) */
+ C2_MQ_HOST_TARGET);
+
+ dma_unmap_addr_set(mq, mapping, mq->host_dma);
+
+ return 0;
+}
+
+int c2_init_cq(struct c2_dev *c2dev, int entries,
+ struct c2_ucontext *ctx, struct c2_cq *cq)
+{
+ struct c2wr_cq_create_req wr;
+ struct c2wr_cq_create_rep *reply;
+ unsigned long peer_pa;
+ struct c2_vq_req *vq_req;
+ int err;
+
+ might_sleep();
+
+ cq->ibcq.cqe = entries - 1;
+ cq->is_kernel = !ctx;
+
+ /* Allocate a shared pointer */
+ cq->mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+ &cq->mq.shared_dma, GFP_KERNEL);
+ if (!cq->mq.shared)
+ return -ENOMEM;
+
+ /* Allocate pages for the message pool */
+ err = c2_alloc_cq_buf(c2dev, &cq->mq, entries + 1, C2_CQ_MSG_SIZE);
+ if (err)
+ goto bail0;
+
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req) {
+ err = -ENOMEM;
+ goto bail1;
+ }
+
+ memset(&wr, 0, sizeof(wr));
+ c2_wr_set_id(&wr, CCWR_CQ_CREATE);
+ wr.hdr.context = (unsigned long) vq_req;
+ wr.rnic_handle = c2dev->adapter_handle;
+ wr.msg_size = cpu_to_be32(cq->mq.msg_size);
+ wr.depth = cpu_to_be32(cq->mq.q_size);
+ wr.shared_ht = cpu_to_be64(cq->mq.shared_dma);
+ wr.msg_pool = cpu_to_be64(cq->mq.host_dma);
+ wr.user_context = (u64) (unsigned long) (cq);
+
+ vq_req_get(c2dev, vq_req);
+
+ err = vq_send_wr(c2dev, (union c2wr *) &wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail2;
+ }
+
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err)
+ goto bail2;
+
+ reply = (struct c2wr_cq_create_rep *) (unsigned long) (vq_req->reply_msg);
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail2;
+ }
+
+ if ((err = c2_errno(reply)) != 0)
+ goto bail3;
+
+ cq->adapter_handle = reply->cq_handle;
+ cq->mq.index = be32_to_cpu(reply->mq_index);
+
+ peer_pa = c2dev->pa + be32_to_cpu(reply->adapter_shared);
+ cq->mq.peer = ioremap_nocache(peer_pa, PAGE_SIZE);
+ if (!cq->mq.peer) {
+ err = -ENOMEM;
+ goto bail3;
+ }
+
+ vq_repbuf_free(c2dev, reply);
+ vq_req_free(c2dev, vq_req);
+
+ spin_lock_init(&cq->lock);
+ atomic_set(&cq->refcount, 1);
+ init_waitqueue_head(&cq->wait);
+
+ /*
+ * Use the MQ index allocated by the adapter to
+ * store the CQ in the qptr_array
+ */
+ cq->cqn = cq->mq.index;
+ c2dev->qptr_array[cq->cqn] = cq;
+
+ return 0;
+
+ bail3:
+ vq_repbuf_free(c2dev, reply);
+ bail2:
+ vq_req_free(c2dev, vq_req);
+ bail1:
+ c2_free_cq_buf(c2dev, &cq->mq);
+ bail0:
+ c2_free_mqsp(cq->mq.shared);
+
+ return err;
+}
+
+void c2_free_cq(struct c2_dev *c2dev, struct c2_cq *cq)
+{
+ int err;
+ struct c2_vq_req *vq_req;
+ struct c2wr_cq_destroy_req wr;
+ struct c2wr_cq_destroy_rep *reply;
+
+ might_sleep();
+
+ /* Clear CQ from the qptr array */
+ spin_lock_irq(&c2dev->lock);
+ c2dev->qptr_array[cq->mq.index] = NULL;
+ atomic_dec(&cq->refcount);
+ spin_unlock_irq(&c2dev->lock);
+
+ wait_event(cq->wait, !atomic_read(&cq->refcount));
+
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req) {
+ goto bail0;
+ }
+
+ memset(&wr, 0, sizeof(wr));
+ c2_wr_set_id(&wr, CCWR_CQ_DESTROY);
+ wr.hdr.context = (unsigned long) vq_req;
+ wr.rnic_handle = c2dev->adapter_handle;
+ wr.cq_handle = cq->adapter_handle;
+
+ vq_req_get(c2dev, vq_req);
+
+ err = vq_send_wr(c2dev, (union c2wr *) &wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail1;
+ }
+
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err)
+ goto bail1;
+
+ reply = (struct c2wr_cq_destroy_rep *) (unsigned long) (vq_req->reply_msg);
+ if (reply)
+ vq_repbuf_free(c2dev, reply);
+ bail1:
+ vq_req_free(c2dev, vq_req);
+ bail0:
+ if (cq->is_kernel) {
+ c2_free_cq_buf(c2dev, &cq->mq);
+ }
+}
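
C2_CQ_MSG_SIZE above rounds the completion-message size up to a multiple of
32 with the usual (x + a - 1) & ~(a - 1) idiom, which works for any
power-of-two alignment. A tiny standalone illustration; ALIGN_UP is a local
helper, not a driver macro:

    #include <stdio.h>

    /* Round x up to the next multiple of a power-of-two alignment a. */
    #define ALIGN_UP(x, a)  (((x) + (a) - 1) & ~((a) - 1))

    int main(void)
    {
        /* 1..32 all round to 32, 33 rounds to 64, exact multiples stay put. */
        printf("%d %d %d\n", ALIGN_UP(1, 32), ALIGN_UP(32, 32), ALIGN_UP(33, 32));
        return 0;
    }
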
diff --git a/drivers/infiniband/hw/amso1100/c2_intr.c b/drivers/infiniband/hw/amso1100/c2_intr.c
new file mode 100644
index 000000000..3a17d9b36
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_intr.c
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "c2.h"
+#include <rdma/iw_cm.h>
+#include "c2_vq.h"
+
+static void handle_mq(struct c2_dev *c2dev, u32 index);
+static void handle_vq(struct c2_dev *c2dev, u32 mq_index);
+
+/*
+ * Handle RNIC interrupts
+ */
+void c2_rnic_interrupt(struct c2_dev *c2dev)
+{
+ unsigned int mq_index;
+
+ while (c2dev->hints_read != be16_to_cpu(*c2dev->hint_count)) {
+ mq_index = readl(c2dev->regs + PCI_BAR0_HOST_HINT);
+ if (mq_index & 0x80000000) {
+ break;
+ }
+
+ c2dev->hints_read++;
+ handle_mq(c2dev, mq_index);
+ }
+
+}
+
+/*
+ * Top level MQ handler
+ */
+static void handle_mq(struct c2_dev *c2dev, u32 mq_index)
+{
+ if (c2dev->qptr_array[mq_index] == NULL) {
+ pr_debug("handle_mq: stray activity for mq_index=%d\n",
+ mq_index);
+ return;
+ }
+
+ switch (mq_index) {
+ case (0):
+ /*
+ * An index of 0 in the activity queue
+ * indicates the req vq now has messages
+ * available...
+ *
+ * Wake up any waiters waiting on req VQ
+ * message availability.
+ */
+ wake_up(&c2dev->req_vq_wo);
+ break;
+ case (1):
+ handle_vq(c2dev, mq_index);
+ break;
+ case (2):
+ /* We have to purge the VQ in case there are pending
+ * accept reply requests that would result in the
+ * generation of an ESTABLISHED event. If we don't
+ * generate these first, a CLOSE event could end up
+ * being delivered before the ESTABLISHED event.
+ */
+ handle_vq(c2dev, 1);
+
+ c2_ae_event(c2dev, mq_index);
+ break;
+ default:
+ /* There is no event synchronization between CQ events
+ * and AE or CM events. In fact, CQE could be
+ * delivered for all of the I/O up to and including the
+ * FLUSH for a peer disconnect prior to the ESTABLISHED
+ * event being delivered to the app. The reason for this
+ * is that CM events are delivered on a thread, while AE
+ * and CQ events are delivered in interrupt context.
+ */
+ c2_cq_event(c2dev, mq_index);
+ break;
+ }
+}
+
+/*
+ * Handles verbs WR replies.
+ */
+static void handle_vq(struct c2_dev *c2dev, u32 mq_index)
+{
+ void *adapter_msg, *reply_msg;
+ struct c2wr_hdr *host_msg;
+ struct c2wr_hdr tmp;
+ struct c2_mq *reply_vq;
+ struct c2_vq_req *req;
+ struct iw_cm_event cm_event;
+ int err;
+
+ reply_vq = (struct c2_mq *) c2dev->qptr_array[mq_index];
+
+ /*
+ * get next msg from mq_index into adapter_msg.
+ * don't free it yet.
+ */
+ adapter_msg = c2_mq_consume(reply_vq);
+ if (adapter_msg == NULL) {
+ return;
+ }
+
+ host_msg = vq_repbuf_alloc(c2dev);
+
+ /*
+ * If we can't get a host buffer, then we'll still
+ * wake up the waiter; we just won't give it the msg.
+ * It is assumed the waiter will deal with this...
+ */
+ if (!host_msg) {
+ pr_debug("handle_vq: no repbufs!\n");
+
+ /*
+ * Just copy the WR header into a local variable.
+ * This allows us to still demux on the context.
+ */
+ host_msg = &tmp;
+ memcpy(host_msg, adapter_msg, sizeof(tmp));
+ reply_msg = NULL;
+ } else {
+ memcpy(host_msg, adapter_msg, reply_vq->msg_size);
+ reply_msg = host_msg;
+ }
+
+ /*
+ * consume the msg from the MQ
+ */
+ c2_mq_free(reply_vq);
+
+ /*
+ * wakeup the waiter.
+ */
+ req = (struct c2_vq_req *) (unsigned long) host_msg->context;
+ if (req == NULL) {
+ /*
+ * We should never get here, as the adapter should
+ * never send us a reply that we're not expecting.
+ */
+ if (reply_msg != NULL)
+ vq_repbuf_free(c2dev, host_msg);
+ pr_debug("handle_vq: UNEXPECTEDLY got NULL req\n");
+ return;
+ }
+
+ if (reply_msg)
+ err = c2_errno(reply_msg);
+ else
+ err = -ENOMEM;
+
+ if (!err) switch (req->event) {
+ case IW_CM_EVENT_ESTABLISHED:
+ c2_set_qp_state(req->qp,
+ C2_QP_STATE_RTS);
+ /*
+ * Until ird/ord negotiation via MPAv2 support is added, send
+ * max supported values
+ */
+ cm_event.ird = cm_event.ord = 128;
+ case IW_CM_EVENT_CLOSE:
+
+ /*
+ * Build the CM event and deliver it to the application.
+ * The ESTABLISHED case above falls through to here.
+ */
+ cm_event.event = req->event;
+ cm_event.status = 0;
+ cm_event.local_addr = req->cm_id->local_addr;
+ cm_event.remote_addr = req->cm_id->remote_addr;
+ cm_event.private_data = NULL;
+ cm_event.private_data_len = 0;
+ req->cm_id->event_handler(req->cm_id, &cm_event);
+ break;
+ default:
+ break;
+ }
+
+ req->reply_msg = (u64) (unsigned long) (reply_msg);
+ atomic_set(&req->reply_ready, 1);
+ wake_up(&req->wait_object);
+
+ /*
+ * If the request was cancelled, then this put will
+ * free the vq_req memory...and reply_msg!!!
+ */
+ vq_req_put(c2dev, req);
+}
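
handle_mq() above routes on the MQ index: 0 only wakes request-queue waiters,
1 is the verbs reply queue, 2 is the asynchronous event queue (drained of
pending verbs replies first so an ESTABLISHED event cannot trail its CLOSE),
and anything else is a completion queue. A compact standalone restatement of
that routing with stub handlers in place of the driver functions; every name
below is illustrative:

    #include <stdio.h>

    static void wake_req_waiters(void)    { puts("wake req-VQ waiters"); }
    static void handle_reply_vq(void)     { puts("handle verbs replies"); }
    static void handle_async_event(void)  { puts("handle async event"); }
    static void handle_cq(unsigned idx)   { printf("handle CQ %u\n", idx); }

    static void demux(unsigned mq_index)
    {
        switch (mq_index) {
        case 0:
            wake_req_waiters();
            break;
        case 1:
            handle_reply_vq();
            break;
        case 2:
            /* flush verbs replies first so ESTABLISHED precedes CLOSE */
            handle_reply_vq();
            handle_async_event();
            break;
        default:
            handle_cq(mq_index);
            break;
        }
    }

    int main(void)
    {
        for (unsigned i = 0; i < 4; i++)
            demux(i);
        return 0;
    }
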
diff --git a/drivers/infiniband/hw/amso1100/c2_mm.c b/drivers/infiniband/hw/amso1100/c2_mm.c
new file mode 100644
index 000000000..119c4f3d9
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_mm.c
@@ -0,0 +1,377 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/slab.h>
+
+#include "c2.h"
+#include "c2_vq.h"
+
+#define PBL_VIRT 1
+#define PBL_PHYS 2
+
+/*
+ * Send all the PBL messages to convey the remainder of the PBL.
+ * Wait for the adapter's reply on the last one; that message is
+ * marked by setting MEM_PBL_COMPLETE in the flags.
+ *
+ * NOTE: vq_req is _not_ freed by this function. The VQ Host
+ * Reply buffer _is_ freed by this function.
+ */
+static int
+send_pbl_messages(struct c2_dev *c2dev, __be32 stag_index,
+ unsigned long va, u32 pbl_depth,
+ struct c2_vq_req *vq_req, int pbl_type)
+{
+ u32 pbe_count; /* number of PBEs that fit in a PBL msg */
+ u32 count; /* number of PBEs in this PBL msg */
+ struct c2wr_nsmr_pbl_req *wr; /* PBL WR ptr */
+ struct c2wr_nsmr_pbl_rep *reply; /* reply ptr */
+ int err, pbl_virt, pbl_index, i;
+
+ switch (pbl_type) {
+ case PBL_VIRT:
+ pbl_virt = 1;
+ break;
+ case PBL_PHYS:
+ pbl_virt = 0;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ pbe_count = (c2dev->req_vq.msg_size -
+ sizeof(struct c2wr_nsmr_pbl_req)) / sizeof(u64);
+ wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+ if (!wr) {
+ return -ENOMEM;
+ }
+ c2_wr_set_id(wr, CCWR_NSMR_PBL);
+
+ /*
+ * Only the last PBL message will generate a reply from the verbs,
+ * so we set the context to 0 indicating there is no kernel verbs
+ * handler blocked awaiting this reply.
+ */
+ wr->hdr.context = 0;
+ wr->rnic_handle = c2dev->adapter_handle;
+ wr->stag_index = stag_index; /* already swapped */
+ wr->flags = 0;
+ pbl_index = 0;
+ while (pbl_depth) {
+ count = min(pbe_count, pbl_depth);
+ wr->addrs_length = cpu_to_be32(count);
+
+ /*
+ * If this is the last message, then reference the
+ * vq request struct because we're going to wait for a reply.
+ * Also mark this PBL msg as the last one.
+ */
+ if (count == pbl_depth) {
+ /*
+ * reference the request struct. dereferenced in the
+ * int handler.
+ */
+ vq_req_get(c2dev, vq_req);
+ wr->flags = cpu_to_be32(MEM_PBL_COMPLETE);
+
+ /*
+ * This is the last PBL message.
+ * Set the context to our VQ Request Object so we can
+ * wait for the reply.
+ */
+ wr->hdr.context = (unsigned long) vq_req;
+ }
+
+ /*
+ * If pbl_virt is set then va is a virtual address
+ * that describes a virtually contiguous memory
+ * allocation. The wr needs the start of each virtual page
+ * to be converted to the corresponding physical address
+ * of the page. If pbl_virt is not set then va is an array
+ * of physical addresses and there is no conversion to do.
+ * Just fill in the wr with what is in the array.
+ */
+ for (i = 0; i < count; i++) {
+ if (pbl_virt) {
+ va += PAGE_SIZE;
+ } else {
+ wr->paddrs[i] =
+ cpu_to_be64(((u64 *)va)[pbl_index + i]);
+ }
+ }
+
+ /*
+ * Send WR to adapter
+ */
+ err = vq_send_wr(c2dev, (union c2wr *) wr);
+ if (err) {
+ if (count <= pbe_count) {
+ vq_req_put(c2dev, vq_req);
+ }
+ goto bail0;
+ }
+ pbl_depth -= count;
+ pbl_index += count;
+ }
+
+ /*
+ * Now wait for the reply...
+ */
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err) {
+ goto bail0;
+ }
+
+ /*
+ * Process reply
+ */
+ reply = (struct c2wr_nsmr_pbl_rep *) (unsigned long) vq_req->reply_msg;
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+
+ err = c2_errno(reply);
+
+ vq_repbuf_free(c2dev, reply);
+ bail0:
+ kfree(wr);
+ return err;
+}
+
+#define C2_PBL_MAX_DEPTH 131072
+int
+c2_nsmr_register_phys_kern(struct c2_dev *c2dev, u64 *addr_list,
+ int page_size, int pbl_depth, u32 length,
+ u32 offset, u64 *va, enum c2_acf acf,
+ struct c2_mr *mr)
+{
+ struct c2_vq_req *vq_req;
+ struct c2wr_nsmr_register_req *wr;
+ struct c2wr_nsmr_register_rep *reply;
+ u16 flags;
+ int i, pbe_count, count;
+ int err;
+
+ if (!va || !length || !addr_list || !pbl_depth)
+ return -EINTR;
+
+ /*
+ * Verify PBL depth is within rnic max
+ */
+ if (pbl_depth > C2_PBL_MAX_DEPTH) {
+ return -EINTR;
+ }
+
+ /*
+ * allocate verbs request object
+ */
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req)
+ return -ENOMEM;
+
+ wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+ if (!wr) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+
+ /*
+ * build the WR
+ */
+ c2_wr_set_id(wr, CCWR_NSMR_REGISTER);
+ wr->hdr.context = (unsigned long) vq_req;
+ wr->rnic_handle = c2dev->adapter_handle;
+
+ flags = (acf | MEM_VA_BASED | MEM_REMOTE);
+
+ /*
+ * compute how many pbes can fit in the message
+ */
+ pbe_count = (c2dev->req_vq.msg_size -
+ sizeof(struct c2wr_nsmr_register_req)) / sizeof(u64);
+
+ if (pbl_depth <= pbe_count) {
+ flags |= MEM_PBL_COMPLETE;
+ }
+ wr->flags = cpu_to_be16(flags);
+ wr->stag_key = 0; /* stag_key */
+ wr->va = cpu_to_be64(*va);
+ wr->pd_id = mr->pd->pd_id;
+ wr->pbe_size = cpu_to_be32(page_size);
+ wr->length = cpu_to_be32(length);
+ wr->pbl_depth = cpu_to_be32(pbl_depth);
+ wr->fbo = cpu_to_be32(offset);
+ count = min(pbl_depth, pbe_count);
+ wr->addrs_length = cpu_to_be32(count);
+
+ /*
+ * fill out the PBL for this message
+ */
+ for (i = 0; i < count; i++) {
+ wr->paddrs[i] = cpu_to_be64(addr_list[i]);
+ }
+
+ /*
+ * Reference the request struct. Dereferenced in the int handler.
+ */
+ vq_req_get(c2dev, vq_req);
+
+ /*
+ * send the WR to the adapter
+ */
+ err = vq_send_wr(c2dev, (union c2wr *) wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail1;
+ }
+
+ /*
+ * wait for reply from adapter
+ */
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err) {
+ goto bail1;
+ }
+
+ /*
+ * process reply
+ */
+ reply =
+ (struct c2wr_nsmr_register_rep *) (unsigned long) (vq_req->reply_msg);
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail1;
+ }
+ if ((err = c2_errno(reply))) {
+ goto bail2;
+ }
+ /* *p_pb_entries = be32_to_cpu(reply->pbl_depth); */
+ mr->ibmr.lkey = mr->ibmr.rkey = be32_to_cpu(reply->stag_index);
+ vq_repbuf_free(c2dev, reply);
+
+ /*
+ * if there are still more PBEs we need to send them to
+ * the adapter and wait for a reply on the final one.
+ * reuse vq_req for this purpose.
+ */
+ pbl_depth -= count;
+ if (pbl_depth) {
+
+ vq_req->reply_msg = (unsigned long) NULL;
+ atomic_set(&vq_req->reply_ready, 0);
+ err = send_pbl_messages(c2dev,
+ cpu_to_be32(mr->ibmr.lkey),
+ (unsigned long) &addr_list[i],
+ pbl_depth, vq_req, PBL_PHYS);
+ if (err) {
+ goto bail1;
+ }
+ }
+
+ vq_req_free(c2dev, vq_req);
+ kfree(wr);
+
+ return err;
+
+ bail2:
+ vq_repbuf_free(c2dev, reply);
+ bail1:
+ kfree(wr);
+ bail0:
+ vq_req_free(c2dev, vq_req);
+ return err;
+}
+
+int c2_stag_dealloc(struct c2_dev *c2dev, u32 stag_index)
+{
+ struct c2_vq_req *vq_req; /* verbs request object */
+ struct c2wr_stag_dealloc_req wr; /* work request */
+ struct c2wr_stag_dealloc_rep *reply; /* WR reply */
+ int err;
+
+ /*
+ * allocate verbs request object
+ */
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req) {
+ return -ENOMEM;
+ }
+
+ /*
+ * Build the WR
+ */
+ c2_wr_set_id(&wr, CCWR_STAG_DEALLOC);
+ wr.hdr.context = (u64) (unsigned long) vq_req;
+ wr.rnic_handle = c2dev->adapter_handle;
+ wr.stag_index = cpu_to_be32(stag_index);
+
+ /*
+ * reference the request struct. dereferenced in the int handler.
+ */
+ vq_req_get(c2dev, vq_req);
+
+ /*
+ * Send WR to adapter
+ */
+ err = vq_send_wr(c2dev, (union c2wr *) &wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail0;
+ }
+
+ /*
+ * Wait for reply from adapter
+ */
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err) {
+ goto bail0;
+ }
+
+ /*
+ * Process reply
+ */
+ reply = (struct c2wr_stag_dealloc_rep *) (unsigned long) vq_req->reply_msg;
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+
+ err = c2_errno(reply);
+
+ vq_repbuf_free(c2dev, reply);
+ bail0:
+ vq_req_free(c2dev, vq_req);
+ return err;
+}
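
c2_nsmr_register_phys_kern() above splits the page buffer list across work
requests: the first chunk rides in the register WR, and send_pbl_messages()
ships the remainder in pbe_count-sized chunks, flagging only the final one
MEM_PBL_COMPLETE. A small model of that split using one nominal chunk size;
the msg_size and header values are made-up numbers, not adapter constants:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        unsigned msg_size = 1024;       /* assumed request-VQ message size */
        unsigned hdr = 96;              /* assumed fixed WR header size */
        unsigned pbe_count = (msg_size - hdr) / sizeof(uint64_t);
        unsigned pbl_depth = 1000;      /* pages in the region being registered */

        unsigned first = pbl_depth < pbe_count ? pbl_depth : pbe_count;
        unsigned rest = pbl_depth - first;
        unsigned follow_ups = (rest + pbe_count - 1) / pbe_count;

        printf("%u PBEs per message: 1 register WR + %u PBL WR(s)\n",
               pbe_count, follow_ups);
        return 0;
    }
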
diff --git a/drivers/infiniband/hw/amso1100/c2_mq.c b/drivers/infiniband/hw/amso1100/c2_mq.c
new file mode 100644
index 000000000..0cddc49be
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_mq.c
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "c2.h"
+#include "c2_mq.h"
+
+void *c2_mq_alloc(struct c2_mq *q)
+{
+ BUG_ON(q->magic != C2_MQ_MAGIC);
+ BUG_ON(q->type != C2_MQ_ADAPTER_TARGET);
+
+ if (c2_mq_full(q)) {
+ return NULL;
+ } else {
+#ifdef DEBUG
+ struct c2wr_hdr *m =
+ (struct c2wr_hdr *) (q->msg_pool.host + q->priv * q->msg_size);
+#ifdef CCMSGMAGIC
+ BUG_ON(m->magic != be32_to_cpu(~CCWR_MAGIC));
+ m->magic = cpu_to_be32(CCWR_MAGIC);
+#endif
+ return m;
+#else
+ return q->msg_pool.host + q->priv * q->msg_size;
+#endif
+ }
+}
+
+void c2_mq_produce(struct c2_mq *q)
+{
+ BUG_ON(q->magic != C2_MQ_MAGIC);
+ BUG_ON(q->type != C2_MQ_ADAPTER_TARGET);
+
+ if (!c2_mq_full(q)) {
+ q->priv = (q->priv + 1) % q->q_size;
+ q->hint_count++;
+ /* Update peer's offset. */
+ __raw_writew((__force u16) cpu_to_be16(q->priv), &q->peer->shared);
+ }
+}
+
+void *c2_mq_consume(struct c2_mq *q)
+{
+ BUG_ON(q->magic != C2_MQ_MAGIC);
+ BUG_ON(q->type != C2_MQ_HOST_TARGET);
+
+ if (c2_mq_empty(q)) {
+ return NULL;
+ } else {
+#ifdef DEBUG
+ struct c2wr_hdr *m = (struct c2wr_hdr *)
+ (q->msg_pool.host + q->priv * q->msg_size);
+#ifdef CCMSGMAGIC
+ BUG_ON(m->magic != be32_to_cpu(CCWR_MAGIC));
+#endif
+ return m;
+#else
+ return q->msg_pool.host + q->priv * q->msg_size;
+#endif
+ }
+}
+
+void c2_mq_free(struct c2_mq *q)
+{
+ BUG_ON(q->magic != C2_MQ_MAGIC);
+ BUG_ON(q->type != C2_MQ_HOST_TARGET);
+
+ if (!c2_mq_empty(q)) {
+
+#ifdef CCMSGMAGIC
+ {
+ struct c2wr_hdr __iomem *m = (struct c2wr_hdr __iomem *)
+ (q->msg_pool.adapter + q->priv * q->msg_size);
+ __raw_writel(cpu_to_be32(~CCWR_MAGIC), &m->magic);
+ }
+#endif
+ q->priv = (q->priv + 1) % q->q_size;
+ /* Update peer's offset. */
+ __raw_writew((__force u16) cpu_to_be16(q->priv), &q->peer->shared);
+ }
+}
+
+
+void c2_mq_lconsume(struct c2_mq *q, u32 wqe_count)
+{
+ BUG_ON(q->magic != C2_MQ_MAGIC);
+ BUG_ON(q->type != C2_MQ_ADAPTER_TARGET);
+
+ while (wqe_count--) {
+ BUG_ON(c2_mq_empty(q));
+ *q->shared = cpu_to_be16((be16_to_cpu(*q->shared)+1) % q->q_size);
+ }
+}
+
+#if 0
+u32 c2_mq_count(struct c2_mq *q)
+{
+ s32 count;
+
+ if (q->type == C2_MQ_HOST_TARGET)
+ count = be16_to_cpu(*q->shared) - q->priv;
+ else
+ count = q->priv - be16_to_cpu(*q->shared);
+
+ if (count < 0)
+ count += q->q_size;
+
+ return (u32) count;
+}
+#endif /* 0 */
+
+void c2_mq_req_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
+ u8 __iomem *pool_start, u16 __iomem *peer, u32 type)
+{
+ BUG_ON(!q->shared);
+
+ /* This code assumes the byte swapping has already been done! */
+ q->index = index;
+ q->q_size = q_size;
+ q->msg_size = msg_size;
+ q->msg_pool.adapter = pool_start;
+ q->peer = (struct c2_mq_shared __iomem *) peer;
+ q->magic = C2_MQ_MAGIC;
+ q->type = type;
+ q->priv = 0;
+ q->hint_count = 0;
+}
+
+void c2_mq_rep_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
+ u8 *pool_start, u16 __iomem *peer, u32 type)
+{
+ BUG_ON(!q->shared);
+
+ /* This code assumes the byte swapping has already been done! */
+ q->index = index;
+ q->q_size = q_size;
+ q->msg_size = msg_size;
+ q->msg_pool.host = pool_start;
+ q->peer = (struct c2_mq_shared __iomem *) peer;
+ q->magic = C2_MQ_MAGIC;
+ q->type = type;
+ q->priv = 0;
+ q->hint_count = 0;
+}
diff --git a/drivers/infiniband/hw/amso1100/c2_mq.h b/drivers/infiniband/hw/amso1100/c2_mq.h
new file mode 100644
index 000000000..fc1b9a7ce
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_mq.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _C2_MQ_H_
+#define _C2_MQ_H_
+#include <linux/kernel.h>
+#include <linux/dma-mapping.h>
+#include "c2_wr.h"
+
+enum c2_shared_regs {
+
+ C2_SHARED_ARMED = 0x10,
+ C2_SHARED_NOTIFY = 0x18,
+ C2_SHARED_SHARED = 0x40,
+};
+
+struct c2_mq_shared {
+ u16 unused1;
+ u8 armed;
+ u8 notification_type;
+ u32 unused2;
+ u16 shared;
+ /* Pad to 64 bytes. */
+ u8 pad[64 - sizeof(u16) - 2 * sizeof(u8) - sizeof(u32) - sizeof(u16)];
+};
+
+enum c2_mq_type {
+ C2_MQ_HOST_TARGET = 1,
+ C2_MQ_ADAPTER_TARGET = 2,
+};
+
+/*
+ * c2_mq_t is for kernel-mode MQs like the VQs and the AEQ.
+ * c2_user_mq_t (which is the same format) is for user-mode MQs...
+ */
+#define C2_MQ_MAGIC 0x4d512020 /* 'MQ ' */
+struct c2_mq {
+ u32 magic;
+ union {
+ u8 *host;
+ u8 __iomem *adapter;
+ } msg_pool;
+ dma_addr_t host_dma;
+ DEFINE_DMA_UNMAP_ADDR(mapping);
+ u16 hint_count;
+ u16 priv;
+ struct c2_mq_shared __iomem *peer;
+ __be16 *shared;
+ dma_addr_t shared_dma;
+ u32 q_size;
+ u32 msg_size;
+ u32 index;
+ enum c2_mq_type type;
+};
+
+static inline int c2_mq_empty(struct c2_mq *q)
+{
+ return q->priv == be16_to_cpu(*q->shared);
+}
+
+static inline int c2_mq_full(struct c2_mq *q)
+{
+ return q->priv == (be16_to_cpu(*q->shared) + q->q_size - 1) % q->q_size;
+}
+
+extern void c2_mq_lconsume(struct c2_mq *q, u32 wqe_count);
+extern void *c2_mq_alloc(struct c2_mq *q);
+extern void c2_mq_produce(struct c2_mq *q);
+extern void *c2_mq_consume(struct c2_mq *q);
+extern void c2_mq_free(struct c2_mq *q);
+extern void c2_mq_req_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
+ u8 __iomem *pool_start, u16 __iomem *peer, u32 type);
+extern void c2_mq_rep_init(struct c2_mq *q, u32 index, u32 q_size, u32 msg_size,
+ u8 *pool_start, u16 __iomem *peer, u32 type);
+
+#endif /* _C2_MQ_H_ */
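
c2_mq_empty() and c2_mq_full() above implement the classic one-slot-open ring
convention: the queue is empty when the two indices match and full when the
producer sits one slot behind the consumer modulo q_size. A standalone
illustration with plain integers in place of the shared/peer pointers; struct
ring and its fields are illustrative names:

    #include <stdio.h>

    struct ring {
        unsigned prod;      /* like q->priv: next slot this side will use */
        unsigned cons;      /* like *q->shared: the peer's index */
        unsigned size;
    };

    static int ring_empty(const struct ring *r)
    {
        return r->prod == r->cons;
    }

    static int ring_full(const struct ring *r)
    {
        return r->prod == (r->cons + r->size - 1) % r->size;
    }

    int main(void)
    {
        struct ring r = { .prod = 0, .cons = 0, .size = 4 };

        /* produce until full: only size - 1 slots are usable */
        while (!ring_full(&r))
            r.prod = (r.prod + 1) % r.size;
        printf("full after %u entries (size %u)\n", r.prod, r.size);

        r.cons = (r.cons + 1) % r.size;     /* consumer frees one slot */
        printf("full now? %d, empty now? %d\n", ring_full(&r), ring_empty(&r));
        return 0;
    }

Leaving one slot unused is what lets a single pair of indices distinguish
full from empty without an extra element counter.
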
diff --git a/drivers/infiniband/hw/amso1100/c2_pd.c b/drivers/infiniband/hw/amso1100/c2_pd.c
new file mode 100644
index 000000000..f3e81dc35
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_pd.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+
+#include "c2.h"
+#include "c2_provider.h"
+
+int c2_pd_alloc(struct c2_dev *c2dev, int privileged, struct c2_pd *pd)
+{
+ u32 obj;
+ int ret = 0;
+
+ spin_lock(&c2dev->pd_table.lock);
+ obj = find_next_zero_bit(c2dev->pd_table.table, c2dev->pd_table.max,
+ c2dev->pd_table.last);
+ if (obj >= c2dev->pd_table.max)
+ obj = find_first_zero_bit(c2dev->pd_table.table,
+ c2dev->pd_table.max);
+ if (obj < c2dev->pd_table.max) {
+ pd->pd_id = obj;
+ __set_bit(obj, c2dev->pd_table.table);
+ c2dev->pd_table.last = obj+1;
+ if (c2dev->pd_table.last >= c2dev->pd_table.max)
+ c2dev->pd_table.last = 0;
+ } else
+ ret = -ENOMEM;
+ spin_unlock(&c2dev->pd_table.lock);
+ return ret;
+}
+
+void c2_pd_free(struct c2_dev *c2dev, struct c2_pd *pd)
+{
+ spin_lock(&c2dev->pd_table.lock);
+ __clear_bit(pd->pd_id, c2dev->pd_table.table);
+ spin_unlock(&c2dev->pd_table.lock);
+}
+
+int c2_init_pd_table(struct c2_dev *c2dev)
+{
+
+ c2dev->pd_table.last = 0;
+ c2dev->pd_table.max = c2dev->props.max_pd;
+ spin_lock_init(&c2dev->pd_table.lock);
+ c2dev->pd_table.table = kmalloc(BITS_TO_LONGS(c2dev->props.max_pd) *
+ sizeof(long), GFP_KERNEL);
+ if (!c2dev->pd_table.table)
+ return -ENOMEM;
+ bitmap_zero(c2dev->pd_table.table, c2dev->props.max_pd);
+ return 0;
+}
+
+void c2_cleanup_pd_table(struct c2_dev *c2dev)
+{
+ kfree(c2dev->pd_table.table);
+}
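
c2_pd_alloc() above hands out PD ids with a round-robin bitmap scan: search
from just past the last id allocated, wrap to the start on failure, and
report exhaustion only when both passes come up empty. A byte-per-slot
userspace sketch of the same policy (the real code uses find_next_zero_bit()
on a word bitmap; MAX_PD here is arbitrary):

    #include <stdio.h>

    #define MAX_PD 4

    static unsigned char used[MAX_PD];
    static unsigned last;

    static int pd_alloc(void)
    {
        unsigned i;

        /* first pass: from just after the previous allocation */
        for (i = last; i < MAX_PD; i++)
            if (!used[i])
                goto found;
        /* second pass: wrap around to the beginning */
        for (i = 0; i < last; i++)
            if (!used[i])
                goto found;
        return -1;                      /* table exhausted */
    found:
        used[i] = 1;
        last = (i + 1) % MAX_PD;        /* next search starts after this id */
        return (int)i;
    }

    static void pd_free(int id)
    {
        used[id] = 0;
    }

    int main(void)
    {
        int a = pd_alloc(), b = pd_alloc(), c = pd_alloc();
        printf("%d %d %d\n", a, b, c);  /* 0 1 2 */
        pd_free(b);
        int d = pd_alloc();             /* continues past 2, returns 3 */
        int e = pd_alloc();             /* wraps and reuses the freed id 1 */
        printf("%d %d\n", d, e);
        return 0;
    }

The rotating start point spreads reuse across the table instead of handing
back the same low ids over and over.
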
diff --git a/drivers/infiniband/hw/amso1100/c2_provider.c b/drivers/infiniband/hw/amso1100/c2_provider.c
new file mode 100644
index 000000000..bdf350781
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_provider.c
@@ -0,0 +1,882 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/delay.h>
+#include <linux/ethtool.h>
+#include <linux/mii.h>
+#include <linux/if_vlan.h>
+#include <linux/crc32.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/init.h>
+#include <linux/dma-mapping.h>
+#include <linux/if_arp.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/byteorder.h>
+
+#include <rdma/ib_smi.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+#include "c2.h"
+#include "c2_provider.h"
+#include "c2_user.h"
+
+static int c2_query_device(struct ib_device *ibdev,
+ struct ib_device_attr *props)
+{
+ struct c2_dev *c2dev = to_c2dev(ibdev);
+
+ pr_debug("%s:%u\n", __func__, __LINE__);
+
+ *props = c2dev->props;
+ return 0;
+}
+
+static int c2_query_port(struct ib_device *ibdev,
+ u8 port, struct ib_port_attr *props)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+
+ props->max_mtu = IB_MTU_4096;
+ props->lid = 0;
+ props->lmc = 0;
+ props->sm_lid = 0;
+ props->sm_sl = 0;
+ props->state = IB_PORT_ACTIVE;
+ props->phys_state = 0;
+ props->port_cap_flags =
+ IB_PORT_CM_SUP |
+ IB_PORT_REINIT_SUP |
+ IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP;
+ props->gid_tbl_len = 1;
+ props->pkey_tbl_len = 1;
+ props->qkey_viol_cntr = 0;
+ props->active_width = 1;
+ props->active_speed = IB_SPEED_SDR;
+
+ return 0;
+}
+
+static int c2_query_pkey(struct ib_device *ibdev,
+ u8 port, u16 index, u16 * pkey)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ *pkey = 0;
+ return 0;
+}
+
+static int c2_query_gid(struct ib_device *ibdev, u8 port,
+ int index, union ib_gid *gid)
+{
+ struct c2_dev *c2dev = to_c2dev(ibdev);
+
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ memset(&(gid->raw[0]), 0, sizeof(gid->raw));
+ memcpy(&(gid->raw[0]), c2dev->pseudo_netdev->dev_addr, 6);
+
+ return 0;
+}
+
+/* Allocate the user context data structure. This keeps track
+ * of all objects associated with a particular user-mode client.
+ */
+static struct ib_ucontext *c2_alloc_ucontext(struct ib_device *ibdev,
+ struct ib_udata *udata)
+{
+ struct c2_ucontext *context;
+
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ context = kmalloc(sizeof(*context), GFP_KERNEL);
+ if (!context)
+ return ERR_PTR(-ENOMEM);
+
+ return &context->ibucontext;
+}
+
+static int c2_dealloc_ucontext(struct ib_ucontext *context)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ kfree(context);
+ return 0;
+}
+
+static int c2_mmap_uar(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ return -ENOSYS;
+}
+
+static struct ib_pd *c2_alloc_pd(struct ib_device *ibdev,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct c2_pd *pd;
+ int err;
+
+ pr_debug("%s:%u\n", __func__, __LINE__);
+
+ pd = kmalloc(sizeof(*pd), GFP_KERNEL);
+ if (!pd)
+ return ERR_PTR(-ENOMEM);
+
+ err = c2_pd_alloc(to_c2dev(ibdev), !context, pd);
+ if (err) {
+ kfree(pd);
+ return ERR_PTR(err);
+ }
+
+ if (context) {
+ if (ib_copy_to_udata(udata, &pd->pd_id, sizeof(__u32))) {
+ c2_pd_free(to_c2dev(ibdev), pd);
+ kfree(pd);
+ return ERR_PTR(-EFAULT);
+ }
+ }
+
+ return &pd->ibpd;
+}
+
+static int c2_dealloc_pd(struct ib_pd *pd)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ c2_pd_free(to_c2dev(pd->device), to_c2pd(pd));
+ kfree(pd);
+
+ return 0;
+}
+
+static struct ib_ah *c2_ah_create(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ return ERR_PTR(-ENOSYS);
+}
+
+static int c2_ah_destroy(struct ib_ah *ah)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ return -ENOSYS;
+}
+
+static void c2_add_ref(struct ib_qp *ibqp)
+{
+ struct c2_qp *qp;
+ BUG_ON(!ibqp);
+ qp = to_c2qp(ibqp);
+ atomic_inc(&qp->refcount);
+}
+
+static void c2_rem_ref(struct ib_qp *ibqp)
+{
+ struct c2_qp *qp;
+ BUG_ON(!ibqp);
+ qp = to_c2qp(ibqp);
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+}
+
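+/* iWCM helper: look up the c2_qp for @qpn and return its ib_qp (or NULL). */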
+struct ib_qp *c2_get_qp(struct ib_device *device, int qpn)
+{
+ struct c2_dev* c2dev = to_c2dev(device);
+ struct c2_qp *qp;
+
+ qp = c2_find_qpn(c2dev, qpn);
+ pr_debug("%s Returning QP=%p for QPN=%d, device=%p, refcount=%d\n",
+ __func__, qp, qpn, device,
+ (qp?atomic_read(&qp->refcount):0));
+
+ return (qp?&qp->ibqp:NULL);
+}
+
+static struct ib_qp *c2_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct c2_qp *qp;
+ int err;
+
+ pr_debug("%s:%u\n", __func__, __LINE__);
+
+ if (init_attr->create_flags)
+ return ERR_PTR(-EINVAL);
+
+ switch (init_attr->qp_type) {
+ case IB_QPT_RC:
+ qp = kzalloc(sizeof(*qp), GFP_KERNEL);
+ if (!qp) {
+ pr_debug("%s: Unable to allocate QP\n", __func__);
+ return ERR_PTR(-ENOMEM);
+ }
+ spin_lock_init(&qp->lock);
+ if (pd->uobject) {
+ /* userspace specific */
+ }
+
+ err = c2_alloc_qp(to_c2dev(pd->device),
+ to_c2pd(pd), init_attr, qp);
+
+ if (err && pd->uobject) {
+ /* userspace specific */
+ }
+
+ break;
+ default:
+ pr_debug("%s: Invalid QP type: %d\n", __func__,
+ init_attr->qp_type);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (err) {
+ kfree(qp);
+ return ERR_PTR(err);
+ }
+
+ return &qp->ibqp;
+}
+
+static int c2_destroy_qp(struct ib_qp *ib_qp)
+{
+ struct c2_qp *qp = to_c2qp(ib_qp);
+
+ pr_debug("%s:%u qp=%p,qp->state=%d\n",
+ __func__, __LINE__, ib_qp, qp->state);
+ c2_free_qp(to_c2dev(ib_qp->device), qp);
+ kfree(qp);
+ return 0;
+}
+
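+/* Create a completion queue on the adapter; the vector and user context
+ * arguments are not used by this driver.
+ */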
+static struct ib_cq *c2_create_cq(struct ib_device *ibdev, int entries, int vector,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct c2_cq *cq;
+ int err;
+
+ cq = kmalloc(sizeof(*cq), GFP_KERNEL);
+ if (!cq) {
+ pr_debug("%s: Unable to allocate CQ\n", __func__);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ err = c2_init_cq(to_c2dev(ibdev), entries, NULL, cq);
+ if (err) {
+ pr_debug("%s: error initializing CQ\n", __func__);
+ kfree(cq);
+ return ERR_PTR(err);
+ }
+
+ return &cq->ibcq;
+}
+
+static int c2_destroy_cq(struct ib_cq *ib_cq)
+{
+ struct c2_cq *cq = to_c2cq(ib_cq);
+
+ pr_debug("%s:%u\n", __func__, __LINE__);
+
+ c2_free_cq(to_c2dev(ib_cq->device), cq);
+ kfree(cq);
+
+ return 0;
+}
+
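+/* Translate IB access flags into the adapter's C2_ACF_* bits; local read
+ * and window-bind access are always granted.
+ */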
+static inline u32 c2_convert_access(int acc)
+{
+ return (acc & IB_ACCESS_REMOTE_WRITE ? C2_ACF_REMOTE_WRITE : 0) |
+ (acc & IB_ACCESS_REMOTE_READ ? C2_ACF_REMOTE_READ : 0) |
+ (acc & IB_ACCESS_LOCAL_WRITE ? C2_ACF_LOCAL_WRITE : 0) |
+ C2_ACF_LOCAL_READ | C2_ACF_WINDOW_BIND;
+}
+
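+/* Register a set of physical buffers as one memory region: build a flat
+ * page list covering every buffer and hand it to the adapter.
+ */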
+static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd,
+ struct ib_phys_buf *buffer_list,
+ int num_phys_buf, int acc, u64 * iova_start)
+{
+ struct c2_mr *mr;
+ u64 *page_list;
+ u32 total_len;
+ int err, i, j, k, page_shift, pbl_depth;
+
+ pbl_depth = 0;
+ total_len = 0;
+
+ page_shift = PAGE_SHIFT;
+ /*
+ * If there is only one buffer, assume it could be a mapping of
+ * all physical memory, so use a larger page size (PAGE_SHIFT + 3,
+ * i.e. 32KB with 4KB base pages).
+ */
+ if (num_phys_buf == 1)
+ page_shift += 3;
+
+ for (i = 0; i < num_phys_buf; i++) {
+
+ if (buffer_list[i].addr & ~PAGE_MASK) {
+ pr_debug("Unaligned Memory Buffer: 0x%x\n",
+ (unsigned int) buffer_list[i].addr);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (!buffer_list[i].size) {
+ pr_debug("Invalid Buffer Size\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ total_len += buffer_list[i].size;
+ pbl_depth += ALIGN(buffer_list[i].size,
+ (1 << page_shift)) >> page_shift;
+ }
+
+ page_list = vmalloc(sizeof(u64) * pbl_depth);
+ if (!page_list) {
+ pr_debug("couldn't vmalloc page_list of size %zd\n",
+ (sizeof(u64) * pbl_depth));
+ return ERR_PTR(-ENOMEM);
+ }
+
+ for (i = 0, j = 0; i < num_phys_buf; i++) {
+
+ int naddrs;
+
+ naddrs = ALIGN(buffer_list[i].size,
+ (1 << page_shift)) >> page_shift;
+ for (k = 0; k < naddrs; k++)
+ page_list[j++] = (buffer_list[i].addr +
+ (k << page_shift));
+ }
+
+ mr = kmalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr) {
+ vfree(page_list);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ mr->pd = to_c2pd(ib_pd);
+ mr->umem = NULL;
+ pr_debug("%s - page shift %d, pbl_depth %d, total_len %u, "
+ "*iova_start %llx, first pa %llx, last pa %llx\n",
+ __func__, page_shift, pbl_depth, total_len,
+ (unsigned long long) *iova_start,
+ (unsigned long long) page_list[0],
+ (unsigned long long) page_list[pbl_depth-1]);
+ err = c2_nsmr_register_phys_kern(to_c2dev(ib_pd->device), page_list,
+ (1 << page_shift), pbl_depth,
+ total_len, 0, iova_start,
+ c2_convert_access(acc), mr);
+ vfree(page_list);
+ if (err) {
+ kfree(mr);
+ return ERR_PTR(err);
+ }
+
+ return &mr->ibmr;
+}
+
+static struct ib_mr *c2_get_dma_mr(struct ib_pd *pd, int acc)
+{
+ struct ib_phys_buf bl;
+ u64 kva = 0;
+
+ pr_debug("%s:%u\n", __func__, __LINE__);
+
+ /* AMSO1100 limit */
+ bl.size = 0xffffffff;
+ bl.addr = 0;
+ return c2_reg_phys_mr(pd, &bl, 1, acc, &kva);
+}
+
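+/* Register a user memory region: pin the pages with ib_umem_get(), flatten
+ * the scatterlist into a page array and register it with the adapter.
+ */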
+static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt, int acc, struct ib_udata *udata)
+{
+ u64 *pages;
+ u64 kva = 0;
+ int shift, n, len;
+ int i, k, entry;
+ int err = 0;
+ struct scatterlist *sg;
+ struct c2_pd *c2pd = to_c2pd(pd);
+ struct c2_mr *c2mr;
+
+ pr_debug("%s:%u\n", __func__, __LINE__);
+
+ c2mr = kmalloc(sizeof(*c2mr), GFP_KERNEL);
+ if (!c2mr)
+ return ERR_PTR(-ENOMEM);
+ c2mr->pd = c2pd;
+
+ c2mr->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0);
+ if (IS_ERR(c2mr->umem)) {
+ err = PTR_ERR(c2mr->umem);
+ kfree(c2mr);
+ return ERR_PTR(err);
+ }
+
+ shift = ffs(c2mr->umem->page_size) - 1;
+ n = c2mr->umem->nmap;
+
+ pages = kmalloc(n * sizeof(u64), GFP_KERNEL);
+ if (!pages) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ i = 0;
+ for_each_sg(c2mr->umem->sg_head.sgl, sg, c2mr->umem->nmap, entry) {
+ len = sg_dma_len(sg) >> shift;
+ for (k = 0; k < len; ++k) {
+ pages[i++] =
+ sg_dma_address(sg) +
+ (c2mr->umem->page_size * k);
+ }
+ }
+
+ kva = virt;
+ err = c2_nsmr_register_phys_kern(to_c2dev(pd->device),
+ pages,
+ c2mr->umem->page_size,
+ i,
+ length,
+ ib_umem_offset(c2mr->umem),
+ &kva,
+ c2_convert_access(acc),
+ c2mr);
+ kfree(pages);
+ if (err)
+ goto err;
+ return &c2mr->ibmr;
+
+err:
+ ib_umem_release(c2mr->umem);
+ kfree(c2mr);
+ return ERR_PTR(err);
+}
+
+static int c2_dereg_mr(struct ib_mr *ib_mr)
+{
+ struct c2_mr *mr = to_c2mr(ib_mr);
+ int err;
+
+ pr_debug("%s:%u\n", __func__, __LINE__);
+
+ err = c2_stag_dealloc(to_c2dev(ib_mr->device), ib_mr->lkey);
+ if (err)
+ pr_debug("c2_stag_dealloc failed: %d\n", err);
+ else {
+ if (mr->umem)
+ ib_umem_release(mr->umem);
+ kfree(mr);
+ }
+
+ return err;
+}
+
+static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct c2_dev *c2dev = container_of(dev, struct c2_dev, ibdev.dev);
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ return sprintf(buf, "%x\n", c2dev->props.hw_ver);
+}
+
+static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct c2_dev *c2dev = container_of(dev, struct c2_dev, ibdev.dev);
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ return sprintf(buf, "%x.%x.%x\n",
+ (int) (c2dev->props.fw_ver >> 32),
+ (int) (c2dev->props.fw_ver >> 16) & 0xffff,
+ (int) (c2dev->props.fw_ver & 0xffff));
+}
+
+static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ return sprintf(buf, "AMSO1100\n");
+}
+
+static ssize_t show_board(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ return sprintf(buf, "%.*s\n", 32, "AMSO1100 Board ID");
+}
+
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
+
+static struct device_attribute *c2_dev_attributes[] = {
+ &dev_attr_hw_rev,
+ &dev_attr_fw_ver,
+ &dev_attr_hca_type,
+ &dev_attr_board_id
+};
+
+static int c2_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ int err;
+
+ err =
+ c2_qp_modify(to_c2dev(ibqp->device), to_c2qp(ibqp), attr,
+ attr_mask);
+
+ return err;
+}
+
+static int c2_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ return -ENOSYS;
+}
+
+static int c2_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ return -ENOSYS;
+}
+
+static int c2_process_mad(struct ib_device *ibdev,
+ int mad_flags,
+ u8 port_num,
+ struct ib_wc *in_wc,
+ struct ib_grh *in_grh,
+ struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ return -ENOSYS;
+}
+
+static int c2_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+
+ /* Request a connection */
+ return c2_llp_connect(cm_id, iw_param);
+}
+
+static int c2_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *iw_param)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+
+ /* Accept the new connection */
+ return c2_llp_accept(cm_id, iw_param);
+}
+
+static int c2_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
+{
+ int err;
+
+ pr_debug("%s:%u\n", __func__, __LINE__);
+
+ err = c2_llp_reject(cm_id, pdata, pdata_len);
+ return err;
+}
+
+static int c2_service_create(struct iw_cm_id *cm_id, int backlog)
+{
+ int err;
+
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ err = c2_llp_service_create(cm_id, backlog);
+ pr_debug("%s:%u err=%d\n",
+ __func__, __LINE__,
+ err);
+ return err;
+}
+
+static int c2_service_destroy(struct iw_cm_id *cm_id)
+{
+ int err;
+ pr_debug("%s:%u\n", __func__, __LINE__);
+
+ err = c2_llp_service_destroy(cm_id);
+
+ return err;
+}
+
+static int c2_pseudo_up(struct net_device *netdev)
+{
+ struct in_device *ind;
+ struct c2_dev *c2dev = netdev->ml_priv;
+
+ ind = in_dev_get(netdev);
+ if (!ind)
+ return 0;
+
+ pr_debug("adding...\n");
+ for_ifa(ind) {
+#ifdef DEBUG
+ u8 *ip = (u8 *) & ifa->ifa_address;
+
+ pr_debug("%s: %d.%d.%d.%d\n",
+ ifa->ifa_label, ip[0], ip[1], ip[2], ip[3]);
+#endif
+ c2_add_addr(c2dev, ifa->ifa_address, ifa->ifa_mask);
+ }
+ endfor_ifa(ind);
+ in_dev_put(ind);
+
+ return 0;
+}
+
+static int c2_pseudo_down(struct net_device *netdev)
+{
+ struct in_device *ind;
+ struct c2_dev *c2dev = netdev->ml_priv;
+
+ ind = in_dev_get(netdev);
+ if (!ind)
+ return 0;
+
+ pr_debug("deleting...\n");
+ for_ifa(ind) {
+#ifdef DEBUG
+ u8 *ip = (u8 *) & ifa->ifa_address;
+
+ pr_debug("%s: %d.%d.%d.%d\n",
+ ifa->ifa_label, ip[0], ip[1], ip[2], ip[3]);
+#endif
+ c2_del_addr(c2dev, ifa->ifa_address, ifa->ifa_mask);
+ }
+ endfor_ifa(ind);
+ in_dev_put(ind);
+
+ return 0;
+}
+
+static int c2_pseudo_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
+{
+ kfree_skb(skb);
+ return NETDEV_TX_OK;
+}
+
+static int c2_pseudo_change_mtu(struct net_device *netdev, int new_mtu)
+{
+ if (new_mtu < ETH_ZLEN || new_mtu > ETH_JUMBO_MTU)
+ return -EINVAL;
+
+ netdev->mtu = new_mtu;
+
+ /* TODO: Tell the RNIC about the new RDMA interface MTU */
+ return 0;
+}
+
+static const struct net_device_ops c2_pseudo_netdev_ops = {
+ .ndo_open = c2_pseudo_up,
+ .ndo_stop = c2_pseudo_down,
+ .ndo_start_xmit = c2_pseudo_xmit_frame,
+ .ndo_change_mtu = c2_pseudo_change_mtu,
+ .ndo_validate_addr = eth_validate_addr,
+};
+
+static void setup(struct net_device *netdev)
+{
+ netdev->netdev_ops = &c2_pseudo_netdev_ops;
+
+ netdev->watchdog_timeo = 0;
+ netdev->type = ARPHRD_ETHER;
+ netdev->mtu = 1500;
+ netdev->hard_header_len = ETH_HLEN;
+ netdev->addr_len = ETH_ALEN;
+ netdev->tx_queue_len = 0;
+ netdev->flags |= IFF_NOARP;
+}
+
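+/* Create the "iwXXX" pseudo net_device that mirrors the adapter's ethernet
+ * interface and carries the RDMA MAC address.
+ */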
+static struct net_device *c2_pseudo_netdev_init(struct c2_dev *c2dev)
+{
+ char name[IFNAMSIZ];
+ struct net_device *netdev;
+
+ /* change ethxxx to iwxxx */
+ strcpy(name, "iw");
+ strcat(name, &c2dev->netdev->name[3]);
+ netdev = alloc_netdev(0, name, NET_NAME_UNKNOWN, setup);
+ if (!netdev) {
+ printk(KERN_ERR PFX "%s - etherdev alloc failed\n",
+ __func__);
+ return NULL;
+ }
+
+ netdev->ml_priv = c2dev;
+
+ SET_NETDEV_DEV(netdev, &c2dev->pcidev->dev);
+
+ memcpy_fromio(netdev->dev_addr, c2dev->kva + C2_REGS_RDMA_ENADDR, 6);
+
+ /* Print out the MAC address */
+ pr_debug("%s: MAC %pM\n", netdev->name, netdev->dev_addr);
+
+#if 0
+ /* Disable network packets */
+ netif_stop_queue(netdev);
+#endif
+ return netdev;
+}
+
+int c2_register_device(struct c2_dev *dev)
+{
+ int ret = -ENOMEM;
+ int i;
+
+ /* Register pseudo network device */
+ dev->pseudo_netdev = c2_pseudo_netdev_init(dev);
+ if (!dev->pseudo_netdev)
+ goto out;
+
+ ret = register_netdev(dev->pseudo_netdev);
+ if (ret)
+ goto out_free_netdev;
+
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ strlcpy(dev->ibdev.name, "amso%d", IB_DEVICE_NAME_MAX);
+ dev->ibdev.owner = THIS_MODULE;
+ dev->ibdev.uverbs_cmd_mask =
+ (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+ (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_REG_MR) |
+ (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+ (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+ (1ull << IB_USER_VERBS_CMD_POST_SEND) |
+ (1ull << IB_USER_VERBS_CMD_POST_RECV);
+
+ dev->ibdev.node_type = RDMA_NODE_RNIC;
+ memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
+ memcpy(&dev->ibdev.node_guid, dev->pseudo_netdev->dev_addr, 6);
+ dev->ibdev.phys_port_cnt = 1;
+ dev->ibdev.num_comp_vectors = 1;
+ dev->ibdev.dma_device = &dev->pcidev->dev;
+ dev->ibdev.query_device = c2_query_device;
+ dev->ibdev.query_port = c2_query_port;
+ dev->ibdev.query_pkey = c2_query_pkey;
+ dev->ibdev.query_gid = c2_query_gid;
+ dev->ibdev.alloc_ucontext = c2_alloc_ucontext;
+ dev->ibdev.dealloc_ucontext = c2_dealloc_ucontext;
+ dev->ibdev.mmap = c2_mmap_uar;
+ dev->ibdev.alloc_pd = c2_alloc_pd;
+ dev->ibdev.dealloc_pd = c2_dealloc_pd;
+ dev->ibdev.create_ah = c2_ah_create;
+ dev->ibdev.destroy_ah = c2_ah_destroy;
+ dev->ibdev.create_qp = c2_create_qp;
+ dev->ibdev.modify_qp = c2_modify_qp;
+ dev->ibdev.destroy_qp = c2_destroy_qp;
+ dev->ibdev.create_cq = c2_create_cq;
+ dev->ibdev.destroy_cq = c2_destroy_cq;
+ dev->ibdev.poll_cq = c2_poll_cq;
+ dev->ibdev.get_dma_mr = c2_get_dma_mr;
+ dev->ibdev.reg_phys_mr = c2_reg_phys_mr;
+ dev->ibdev.reg_user_mr = c2_reg_user_mr;
+ dev->ibdev.dereg_mr = c2_dereg_mr;
+
+ dev->ibdev.alloc_fmr = NULL;
+ dev->ibdev.unmap_fmr = NULL;
+ dev->ibdev.dealloc_fmr = NULL;
+ dev->ibdev.map_phys_fmr = NULL;
+
+ dev->ibdev.attach_mcast = c2_multicast_attach;
+ dev->ibdev.detach_mcast = c2_multicast_detach;
+ dev->ibdev.process_mad = c2_process_mad;
+
+ dev->ibdev.req_notify_cq = c2_arm_cq;
+ dev->ibdev.post_send = c2_post_send;
+ dev->ibdev.post_recv = c2_post_receive;
+
+ dev->ibdev.iwcm = kmalloc(sizeof(*dev->ibdev.iwcm), GFP_KERNEL);
+ if (dev->ibdev.iwcm == NULL) {
+ ret = -ENOMEM;
+ goto out_unregister_netdev;
+ }
+ dev->ibdev.iwcm->add_ref = c2_add_ref;
+ dev->ibdev.iwcm->rem_ref = c2_rem_ref;
+ dev->ibdev.iwcm->get_qp = c2_get_qp;
+ dev->ibdev.iwcm->connect = c2_connect;
+ dev->ibdev.iwcm->accept = c2_accept;
+ dev->ibdev.iwcm->reject = c2_reject;
+ dev->ibdev.iwcm->create_listen = c2_service_create;
+ dev->ibdev.iwcm->destroy_listen = c2_service_destroy;
+
+ ret = ib_register_device(&dev->ibdev, NULL);
+ if (ret)
+ goto out_free_iwcm;
+
+ for (i = 0; i < ARRAY_SIZE(c2_dev_attributes); ++i) {
+ ret = device_create_file(&dev->ibdev.dev,
+ c2_dev_attributes[i]);
+ if (ret)
+ goto out_unregister_ibdev;
+ }
+ goto out;
+
+out_unregister_ibdev:
+ ib_unregister_device(&dev->ibdev);
+out_free_iwcm:
+ kfree(dev->ibdev.iwcm);
+out_unregister_netdev:
+ unregister_netdev(dev->pseudo_netdev);
+out_free_netdev:
+ free_netdev(dev->pseudo_netdev);
+out:
+ pr_debug("%s:%u ret=%d\n", __func__, __LINE__, ret);
+ return ret;
+}
+
+void c2_unregister_device(struct c2_dev *dev)
+{
+ pr_debug("%s:%u\n", __func__, __LINE__);
+ unregister_netdev(dev->pseudo_netdev);
+ free_netdev(dev->pseudo_netdev);
+ ib_unregister_device(&dev->ibdev);
+}
diff --git a/drivers/infiniband/hw/amso1100/c2_provider.h b/drivers/infiniband/hw/amso1100/c2_provider.h
new file mode 100644
index 000000000..bf1899877
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_provider.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef C2_PROVIDER_H
+#define C2_PROVIDER_H
+#include <linux/inetdevice.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_pack.h>
+
+#include "c2_mq.h"
+#include <rdma/iw_cm.h>
+
+#define C2_MPT_FLAG_ATOMIC (1 << 14)
+#define C2_MPT_FLAG_REMOTE_WRITE (1 << 13)
+#define C2_MPT_FLAG_REMOTE_READ (1 << 12)
+#define C2_MPT_FLAG_LOCAL_WRITE (1 << 11)
+#define C2_MPT_FLAG_LOCAL_READ (1 << 10)
+
+struct c2_buf_list {
+ void *buf;
+ DEFINE_DMA_UNMAP_ADDR(mapping);
+};
+
+
+/* The user context keeps track of objects allocated for a
+ * particular user-mode client. */
+struct c2_ucontext {
+ struct ib_ucontext ibucontext;
+};
+
+struct c2_mtt;
+
+/* All objects associated with a PD are kept in the
+ * associated user context if present.
+ */
+struct c2_pd {
+ struct ib_pd ibpd;
+ u32 pd_id;
+};
+
+struct c2_mr {
+ struct ib_mr ibmr;
+ struct c2_pd *pd;
+ struct ib_umem *umem;
+};
+
+struct c2_av;
+
+enum c2_ah_type {
+ C2_AH_ON_HCA,
+ C2_AH_PCI_POOL,
+ C2_AH_KMALLOC
+};
+
+struct c2_ah {
+ struct ib_ah ibah;
+};
+
+struct c2_cq {
+ struct ib_cq ibcq;
+ spinlock_t lock;
+ atomic_t refcount;
+ int cqn;
+ int is_kernel;
+ wait_queue_head_t wait;
+
+ u32 adapter_handle;
+ struct c2_mq mq;
+};
+
+struct c2_wq {
+ spinlock_t lock;
+};
+struct iw_cm_id;
+struct c2_qp {
+ struct ib_qp ibqp;
+ struct iw_cm_id *cm_id;
+ spinlock_t lock;
+ atomic_t refcount;
+ wait_queue_head_t wait;
+ int qpn;
+
+ u32 adapter_handle;
+ u32 send_sgl_depth;
+ u32 recv_sgl_depth;
+ u32 rdma_write_sgl_depth;
+ u8 state;
+
+ struct c2_mq sq_mq;
+ struct c2_mq rq_mq;
+};
+
+struct c2_cr_query_attrs {
+ u32 local_addr;
+ u32 remote_addr;
+ u16 local_port;
+ u16 remote_port;
+};
+
+static inline struct c2_pd *to_c2pd(struct ib_pd *ibpd)
+{
+ return container_of(ibpd, struct c2_pd, ibpd);
+}
+
+static inline struct c2_ucontext *to_c2ucontext(struct ib_ucontext *ibucontext)
+{
+ return container_of(ibucontext, struct c2_ucontext, ibucontext);
+}
+
+static inline struct c2_mr *to_c2mr(struct ib_mr *ibmr)
+{
+ return container_of(ibmr, struct c2_mr, ibmr);
+}
+
+
+static inline struct c2_ah *to_c2ah(struct ib_ah *ibah)
+{
+ return container_of(ibah, struct c2_ah, ibah);
+}
+
+static inline struct c2_cq *to_c2cq(struct ib_cq *ibcq)
+{
+ return container_of(ibcq, struct c2_cq, ibcq);
+}
+
+static inline struct c2_qp *to_c2qp(struct ib_qp *ibqp)
+{
+ return container_of(ibqp, struct c2_qp, ibqp);
+}
+
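+/* Return nonzero if @addr is one of the IPv4 addresses configured on @netdev. */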
+static inline int is_rnic_addr(struct net_device *netdev, u32 addr)
+{
+ struct in_device *ind;
+ int ret = 0;
+
+ ind = in_dev_get(netdev);
+ if (!ind)
+ return 0;
+
+ for_ifa(ind) {
+ if (ifa->ifa_address == addr) {
+ ret = 1;
+ break;
+ }
+ }
+ endfor_ifa(ind);
+ in_dev_put(ind);
+ return ret;
+}
+#endif /* C2_PROVIDER_H */
diff --git a/drivers/infiniband/hw/amso1100/c2_qp.c b/drivers/infiniband/hw/amso1100/c2_qp.c
new file mode 100644
index 000000000..86708dee5
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_qp.c
@@ -0,0 +1,1024 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/delay.h>
+#include <linux/gfp.h>
+
+#include "c2.h"
+#include "c2_vq.h"
+#include "c2_status.h"
+
+#define C2_MAX_ORD_PER_QP 128
+#define C2_MAX_IRD_PER_QP 128
+
+#define C2_HINT_MAKE(q_index, hint_count) (((q_index) << 16) | hint_count)
+#define C2_HINT_GET_INDEX(hint) (((hint) & 0x7FFF0000) >> 16)
+#define C2_HINT_GET_COUNT(hint) ((hint) & 0x0000FFFF)
+
+#define NO_SUPPORT -1
+static const u8 c2_opcode[] = {
+ [IB_WR_SEND] = C2_WR_TYPE_SEND,
+ [IB_WR_SEND_WITH_IMM] = NO_SUPPORT,
+ [IB_WR_RDMA_WRITE] = C2_WR_TYPE_RDMA_WRITE,
+ [IB_WR_RDMA_WRITE_WITH_IMM] = NO_SUPPORT,
+ [IB_WR_RDMA_READ] = C2_WR_TYPE_RDMA_READ,
+ [IB_WR_ATOMIC_CMP_AND_SWP] = NO_SUPPORT,
+ [IB_WR_ATOMIC_FETCH_AND_ADD] = NO_SUPPORT,
+};
+
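+/* Map IB QP states onto the closest adapter (C2) QP states; states the
+ * adapter cannot represent return -1.
+ */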
+static int to_c2_state(enum ib_qp_state ib_state)
+{
+ switch (ib_state) {
+ case IB_QPS_RESET:
+ return C2_QP_STATE_IDLE;
+ case IB_QPS_RTS:
+ return C2_QP_STATE_RTS;
+ case IB_QPS_SQD:
+ return C2_QP_STATE_CLOSING;
+ case IB_QPS_SQE:
+ return C2_QP_STATE_CLOSING;
+ case IB_QPS_ERR:
+ return C2_QP_STATE_ERROR;
+ default:
+ return -1;
+ }
+}
+
+static int to_ib_state(enum c2_qp_state c2_state)
+{
+ switch (c2_state) {
+ case C2_QP_STATE_IDLE:
+ return IB_QPS_RESET;
+ case C2_QP_STATE_CONNECTING:
+ return IB_QPS_RTR;
+ case C2_QP_STATE_RTS:
+ return IB_QPS_RTS;
+ case C2_QP_STATE_CLOSING:
+ return IB_QPS_SQD;
+ case C2_QP_STATE_ERROR:
+ return IB_QPS_ERR;
+ case C2_QP_STATE_TERMINATE:
+ return IB_QPS_SQE;
+ default:
+ return -1;
+ }
+}
+
+static const char *to_ib_state_str(int ib_state)
+{
+ static const char *state_str[] = {
+ "IB_QPS_RESET",
+ "IB_QPS_INIT",
+ "IB_QPS_RTR",
+ "IB_QPS_RTS",
+ "IB_QPS_SQD",
+ "IB_QPS_SQE",
+ "IB_QPS_ERR"
+ };
+ if (ib_state < IB_QPS_RESET ||
+ ib_state > IB_QPS_ERR)
+ return "<invalid IB QP state>";
+
+ ib_state -= IB_QPS_RESET;
+ return state_str[ib_state];
+}
+
+void c2_set_qp_state(struct c2_qp *qp, int c2_state)
+{
+ int new_state = to_ib_state(c2_state);
+
+ pr_debug("%s: qp[%p] state modify %s --> %s\n",
+ __func__,
+ qp,
+ to_ib_state_str(qp->state),
+ to_ib_state_str(new_state));
+ qp->state = new_state;
+}
+
+#define C2_QP_NO_ATTR_CHANGE 0xFFFFFFFF
+
+int c2_qp_modify(struct c2_dev *c2dev, struct c2_qp *qp,
+ struct ib_qp_attr *attr, int attr_mask)
+{
+ struct c2wr_qp_modify_req wr;
+ struct c2wr_qp_modify_rep *reply;
+ struct c2_vq_req *vq_req;
+ unsigned long flags;
+ u8 next_state;
+ int err;
+
+ pr_debug("%s:%d qp=%p, %s --> %s\n",
+ __func__, __LINE__,
+ qp,
+ to_ib_state_str(qp->state),
+ to_ib_state_str(attr->qp_state));
+
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req)
+ return -ENOMEM;
+
+ c2_wr_set_id(&wr, CCWR_QP_MODIFY);
+ wr.hdr.context = (unsigned long) vq_req;
+ wr.rnic_handle = c2dev->adapter_handle;
+ wr.qp_handle = qp->adapter_handle;
+ wr.ord = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+ wr.ird = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+ wr.sq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+ wr.rq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+
+ if (attr_mask & IB_QP_STATE) {
+ /* Ensure the state is valid */
+ if (attr->qp_state < 0 || attr->qp_state > IB_QPS_ERR) {
+ err = -EINVAL;
+ goto bail0;
+ }
+
+ wr.next_qp_state = cpu_to_be32(to_c2_state(attr->qp_state));
+
+ if (attr->qp_state == IB_QPS_ERR) {
+ spin_lock_irqsave(&qp->lock, flags);
+ if (qp->cm_id && qp->state == IB_QPS_RTS) {
+ pr_debug("Generating CLOSE event for QP-->ERR, "
+ "qp=%p, cm_id=%p\n",qp,qp->cm_id);
+ /* Generate a CLOSE event */
+ vq_req->cm_id = qp->cm_id;
+ vq_req->event = IW_CM_EVENT_CLOSE;
+ }
+ spin_unlock_irqrestore(&qp->lock, flags);
+ }
+ next_state = attr->qp_state;
+
+ } else if (attr_mask & IB_QP_CUR_STATE) {
+
+ if (attr->cur_qp_state != IB_QPS_RTR &&
+ attr->cur_qp_state != IB_QPS_RTS &&
+ attr->cur_qp_state != IB_QPS_SQD &&
+ attr->cur_qp_state != IB_QPS_SQE) {
+ err = -EINVAL;
+ goto bail0;
+ } else
+ wr.next_qp_state =
+ cpu_to_be32(to_c2_state(attr->cur_qp_state));
+
+ next_state = attr->cur_qp_state;
+
+ } else {
+ err = 0;
+ goto bail0;
+ }
+
+ /* reference the request struct */
+ vq_req_get(c2dev, vq_req);
+
+ err = vq_send_wr(c2dev, (union c2wr *) & wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail0;
+ }
+
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err)
+ goto bail0;
+
+ reply = (struct c2wr_qp_modify_rep *) (unsigned long) vq_req->reply_msg;
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+
+ err = c2_errno(reply);
+ if (!err)
+ qp->state = next_state;
+#ifdef DEBUG
+ else
+ pr_debug("%s: c2_errno=%d\n", __func__, err);
+#endif
+ /*
+ * If we're going to error and generating the event here, then
+ * we need to remove the reference because there will be no
+ * close event generated by the adapter
+ */
+ spin_lock_irqsave(&qp->lock, flags);
+ if (vq_req->event==IW_CM_EVENT_CLOSE && qp->cm_id) {
+ qp->cm_id->rem_ref(qp->cm_id);
+ qp->cm_id = NULL;
+ }
+ spin_unlock_irqrestore(&qp->lock, flags);
+
+ vq_repbuf_free(c2dev, reply);
+ bail0:
+ vq_req_free(c2dev, vq_req);
+
+ pr_debug("%s:%d qp=%p, cur_state=%s\n",
+ __func__, __LINE__,
+ qp,
+ to_ib_state_str(qp->state));
+ return err;
+}
+
+int c2_qp_set_read_limits(struct c2_dev *c2dev, struct c2_qp *qp,
+ int ord, int ird)
+{
+ struct c2wr_qp_modify_req wr;
+ struct c2wr_qp_modify_rep *reply;
+ struct c2_vq_req *vq_req;
+ int err;
+
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req)
+ return -ENOMEM;
+
+ c2_wr_set_id(&wr, CCWR_QP_MODIFY);
+ wr.hdr.context = (unsigned long) vq_req;
+ wr.rnic_handle = c2dev->adapter_handle;
+ wr.qp_handle = qp->adapter_handle;
+ wr.ord = cpu_to_be32(ord);
+ wr.ird = cpu_to_be32(ird);
+ wr.sq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+ wr.rq_depth = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+ wr.next_qp_state = cpu_to_be32(C2_QP_NO_ATTR_CHANGE);
+
+ /* reference the request struct */
+ vq_req_get(c2dev, vq_req);
+
+ err = vq_send_wr(c2dev, (union c2wr *) & wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail0;
+ }
+
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err)
+ goto bail0;
+
+ reply = (struct c2wr_qp_modify_rep *) (unsigned long)
+ vq_req->reply_msg;
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+
+ err = c2_errno(reply);
+ vq_repbuf_free(c2dev, reply);
+ bail0:
+ vq_req_free(c2dev, vq_req);
+ return err;
+}
+
+static int destroy_qp(struct c2_dev *c2dev, struct c2_qp *qp)
+{
+ struct c2_vq_req *vq_req;
+ struct c2wr_qp_destroy_req wr;
+ struct c2wr_qp_destroy_rep *reply;
+ unsigned long flags;
+ int err;
+
+ /*
+ * Allocate a verb request message
+ */
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req) {
+ return -ENOMEM;
+ }
+
+ /*
+ * Initialize the WR
+ */
+ c2_wr_set_id(&wr, CCWR_QP_DESTROY);
+ wr.hdr.context = (unsigned long) vq_req;
+ wr.rnic_handle = c2dev->adapter_handle;
+ wr.qp_handle = qp->adapter_handle;
+
+ /*
+ * reference the request struct. dereferenced in the int handler.
+ */
+ vq_req_get(c2dev, vq_req);
+
+ spin_lock_irqsave(&qp->lock, flags);
+ if (qp->cm_id && qp->state == IB_QPS_RTS) {
+ pr_debug("destroy_qp: generating CLOSE event for QP-->ERR, "
+ "qp=%p, cm_id=%p\n",qp,qp->cm_id);
+ /* Generate a CLOSE event */
+ vq_req->qp = qp;
+ vq_req->cm_id = qp->cm_id;
+ vq_req->event = IW_CM_EVENT_CLOSE;
+ }
+ spin_unlock_irqrestore(&qp->lock, flags);
+
+ /*
+ * Send WR to adapter
+ */
+ err = vq_send_wr(c2dev, (union c2wr *) & wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail0;
+ }
+
+ /*
+ * Wait for reply from adapter
+ */
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err) {
+ goto bail0;
+ }
+
+ /*
+ * Process reply
+ */
+ reply = (struct c2wr_qp_destroy_rep *) (unsigned long) (vq_req->reply_msg);
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+
+ spin_lock_irqsave(&qp->lock, flags);
+ if (qp->cm_id) {
+ qp->cm_id->rem_ref(qp->cm_id);
+ qp->cm_id = NULL;
+ }
+ spin_unlock_irqrestore(&qp->lock, flags);
+
+ vq_repbuf_free(c2dev, reply);
+ bail0:
+ vq_req_free(c2dev, vq_req);
+ return err;
+}
+
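+/* Allocate a QP number from the device idr, cycling so that QP numbers
+ * are not immediately reused.
+ */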
+static int c2_alloc_qpn(struct c2_dev *c2dev, struct c2_qp *qp)
+{
+ int ret;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock_irq(&c2dev->qp_table.lock);
+
+ ret = idr_alloc_cyclic(&c2dev->qp_table.idr, qp, 0, 0, GFP_NOWAIT);
+ if (ret >= 0)
+ qp->qpn = ret;
+
+ spin_unlock_irq(&c2dev->qp_table.lock);
+ idr_preload_end();
+ return ret < 0 ? ret : 0;
+}
+
+static void c2_free_qpn(struct c2_dev *c2dev, int qpn)
+{
+ spin_lock_irq(&c2dev->qp_table.lock);
+ idr_remove(&c2dev->qp_table.idr, qpn);
+ spin_unlock_irq(&c2dev->qp_table.lock);
+}
+
+struct c2_qp *c2_find_qpn(struct c2_dev *c2dev, int qpn)
+{
+ unsigned long flags;
+ struct c2_qp *qp;
+
+ spin_lock_irqsave(&c2dev->qp_table.lock, flags);
+ qp = idr_find(&c2dev->qp_table.idr, qpn);
+ spin_unlock_irqrestore(&c2dev->qp_table.lock, flags);
+ return qp;
+}
+
+int c2_alloc_qp(struct c2_dev *c2dev,
+ struct c2_pd *pd,
+ struct ib_qp_init_attr *qp_attrs, struct c2_qp *qp)
+{
+ struct c2wr_qp_create_req wr;
+ struct c2wr_qp_create_rep *reply;
+ struct c2_vq_req *vq_req;
+ struct c2_cq *send_cq = to_c2cq(qp_attrs->send_cq);
+ struct c2_cq *recv_cq = to_c2cq(qp_attrs->recv_cq);
+ unsigned long peer_pa;
+ u32 q_size, msg_size, mmap_size;
+ void __iomem *mmap;
+ int err;
+
+ err = c2_alloc_qpn(c2dev, qp);
+ if (err)
+ return err;
+ qp->ibqp.qp_num = qp->qpn;
+ qp->ibqp.qp_type = IB_QPT_RC;
+
+ /* Allocate the SQ and RQ shared pointers */
+ qp->sq_mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+ &qp->sq_mq.shared_dma, GFP_KERNEL);
+ if (!qp->sq_mq.shared) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+
+ qp->rq_mq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+ &qp->rq_mq.shared_dma, GFP_KERNEL);
+ if (!qp->rq_mq.shared) {
+ err = -ENOMEM;
+ goto bail1;
+ }
+
+ /* Allocate the verbs request */
+ vq_req = vq_req_alloc(c2dev);
+ if (vq_req == NULL) {
+ err = -ENOMEM;
+ goto bail2;
+ }
+
+ /* Initialize the work request */
+ memset(&wr, 0, sizeof(wr));
+ c2_wr_set_id(&wr, CCWR_QP_CREATE);
+ wr.hdr.context = (unsigned long) vq_req;
+ wr.rnic_handle = c2dev->adapter_handle;
+ wr.sq_cq_handle = send_cq->adapter_handle;
+ wr.rq_cq_handle = recv_cq->adapter_handle;
+ wr.sq_depth = cpu_to_be32(qp_attrs->cap.max_send_wr + 1);
+ wr.rq_depth = cpu_to_be32(qp_attrs->cap.max_recv_wr + 1);
+ wr.srq_handle = 0;
+ wr.flags = cpu_to_be32(QP_RDMA_READ | QP_RDMA_WRITE | QP_MW_BIND |
+ QP_ZERO_STAG | QP_RDMA_READ_RESPONSE);
+ wr.send_sgl_depth = cpu_to_be32(qp_attrs->cap.max_send_sge);
+ wr.recv_sgl_depth = cpu_to_be32(qp_attrs->cap.max_recv_sge);
+ wr.rdma_write_sgl_depth = cpu_to_be32(qp_attrs->cap.max_send_sge);
+ wr.shared_sq_ht = cpu_to_be64(qp->sq_mq.shared_dma);
+ wr.shared_rq_ht = cpu_to_be64(qp->rq_mq.shared_dma);
+ wr.ord = cpu_to_be32(C2_MAX_ORD_PER_QP);
+ wr.ird = cpu_to_be32(C2_MAX_IRD_PER_QP);
+ wr.pd_id = pd->pd_id;
+ wr.user_context = (unsigned long) qp;
+
+ vq_req_get(c2dev, vq_req);
+
+ /* Send the WR to the adapter */
+ err = vq_send_wr(c2dev, (union c2wr *) & wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail3;
+ }
+
+ /* Wait for the verb reply */
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err) {
+ goto bail3;
+ }
+
+ /* Process the reply */
+ reply = (struct c2wr_qp_create_rep *) (unsigned long) (vq_req->reply_msg);
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail3;
+ }
+
+ if ((err = c2_wr_get_result(reply)) != 0) {
+ goto bail4;
+ }
+
+ /* Fill in the kernel QP struct */
+ atomic_set(&qp->refcount, 1);
+ qp->adapter_handle = reply->qp_handle;
+ qp->state = IB_QPS_RESET;
+ qp->send_sgl_depth = qp_attrs->cap.max_send_sge;
+ qp->rdma_write_sgl_depth = qp_attrs->cap.max_send_sge;
+ qp->recv_sgl_depth = qp_attrs->cap.max_recv_sge;
+ init_waitqueue_head(&qp->wait);
+
+ /* Initialize the SQ MQ */
+ q_size = be32_to_cpu(reply->sq_depth);
+ msg_size = be32_to_cpu(reply->sq_msg_size);
+ peer_pa = c2dev->pa + be32_to_cpu(reply->sq_mq_start);
+ mmap_size = PAGE_ALIGN(sizeof(struct c2_mq_shared) + msg_size * q_size);
+ mmap = ioremap_nocache(peer_pa, mmap_size);
+ if (!mmap) {
+ err = -ENOMEM;
+ goto bail5;
+ }
+
+ c2_mq_req_init(&qp->sq_mq,
+ be32_to_cpu(reply->sq_mq_index),
+ q_size,
+ msg_size,
+ mmap + sizeof(struct c2_mq_shared), /* pool start */
+ mmap, /* peer */
+ C2_MQ_ADAPTER_TARGET);
+
+ /* Initialize the RQ mq */
+ q_size = be32_to_cpu(reply->rq_depth);
+ msg_size = be32_to_cpu(reply->rq_msg_size);
+ peer_pa = c2dev->pa + be32_to_cpu(reply->rq_mq_start);
+ mmap_size = PAGE_ALIGN(sizeof(struct c2_mq_shared) + msg_size * q_size);
+ mmap = ioremap_nocache(peer_pa, mmap_size);
+ if (!mmap) {
+ err = -ENOMEM;
+ goto bail6;
+ }
+
+ c2_mq_req_init(&qp->rq_mq,
+ be32_to_cpu(reply->rq_mq_index),
+ q_size,
+ msg_size,
+ mmap + sizeof(struct c2_mq_shared), /* pool start */
+ mmap, /* peer */
+ C2_MQ_ADAPTER_TARGET);
+
+ vq_repbuf_free(c2dev, reply);
+ vq_req_free(c2dev, vq_req);
+
+ return 0;
+
+ bail6:
+ iounmap(qp->sq_mq.peer);
+ bail5:
+ destroy_qp(c2dev, qp);
+ bail4:
+ vq_repbuf_free(c2dev, reply);
+ bail3:
+ vq_req_free(c2dev, vq_req);
+ bail2:
+ c2_free_mqsp(qp->rq_mq.shared);
+ bail1:
+ c2_free_mqsp(qp->sq_mq.shared);
+ bail0:
+ c2_free_qpn(c2dev, qp->qpn);
+ return err;
+}
+
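+/* Take both CQ locks in a fixed (address) order so that concurrent callers
+ * locking the same pair of CQs cannot deadlock.
+ */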
+static inline void c2_lock_cqs(struct c2_cq *send_cq, struct c2_cq *recv_cq)
+{
+ if (send_cq == recv_cq)
+ spin_lock_irq(&send_cq->lock);
+ else if (send_cq > recv_cq) {
+ spin_lock_irq(&send_cq->lock);
+ spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
+ } else {
+ spin_lock_irq(&recv_cq->lock);
+ spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING);
+ }
+}
+
+static inline void c2_unlock_cqs(struct c2_cq *send_cq, struct c2_cq *recv_cq)
+{
+ if (send_cq == recv_cq)
+ spin_unlock_irq(&send_cq->lock);
+ else if (send_cq > recv_cq) {
+ spin_unlock(&recv_cq->lock);
+ spin_unlock_irq(&send_cq->lock);
+ } else {
+ spin_unlock(&send_cq->lock);
+ spin_unlock_irq(&recv_cq->lock);
+ }
+}
+
+void c2_free_qp(struct c2_dev *c2dev, struct c2_qp *qp)
+{
+ struct c2_cq *send_cq;
+ struct c2_cq *recv_cq;
+
+ send_cq = to_c2cq(qp->ibqp.send_cq);
+ recv_cq = to_c2cq(qp->ibqp.recv_cq);
+
+ /*
+ * Lock CQs here, so that CQ polling code can do QP lookup
+ * without taking a lock.
+ */
+ c2_lock_cqs(send_cq, recv_cq);
+ c2_free_qpn(c2dev, qp->qpn);
+ c2_unlock_cqs(send_cq, recv_cq);
+
+ /*
+ * Destroy qp in the rnic...
+ */
+ destroy_qp(c2dev, qp);
+
+ /*
+ * Mark any unreaped CQEs as null and void.
+ */
+ c2_cq_clean(c2dev, qp, send_cq->cqn);
+ if (send_cq != recv_cq)
+ c2_cq_clean(c2dev, qp, recv_cq->cqn);
+ /*
+ * Unmap the MQs and return the shared pointers
+ * to the message pool.
+ */
+ iounmap(qp->sq_mq.peer);
+ iounmap(qp->rq_mq.peer);
+ c2_free_mqsp(qp->sq_mq.shared);
+ c2_free_mqsp(qp->rq_mq.shared);
+
+ atomic_dec(&qp->refcount);
+ wait_event(qp->wait, !atomic_read(&qp->refcount));
+}
+
+/*
+ * Function: move_sgl
+ *
+ * Description:
+ * Move an SGL from the user's work request struct into a CCIL Work Request
+ * message, swapping to WR byte order and ensuring the total length doesn't
+ * overflow.
+ *
+ * IN:
+ * dst - ptr to CCIL Work Request message SGL memory.
+ * src - ptr to the consumer's SGL memory.
+ *
+ * OUT: none
+ *
+ * Return:
+ * CCIL status codes.
+ */
+static int
+move_sgl(struct c2_data_addr * dst, struct ib_sge *src, int count, u32 * p_len,
+ u8 * actual_count)
+{
+ u32 tot = 0; /* running total */
+ u8 acount = 0; /* running count of non-zero-length SGEs */
+
+ while (count > 0) {
+ /*
+ * If the addition of this SGE causes the
+ * total SGL length to exceed 2^32-1, then
+ * fail-n-bail.
+ *
+ * If the current total plus the next element length
+ * wraps, then it will go negative and be less than the
+ * current total...
+ */
+ if ((tot + src->length) < tot) {
+ return -EINVAL;
+ }
+ /*
+ * Bug: 1456 (as well as 1498 & 1643)
+ * Skip over any sge's supplied with len=0
+ */
+ if (src->length) {
+ tot += src->length;
+ dst->stag = cpu_to_be32(src->lkey);
+ dst->to = cpu_to_be64(src->addr);
+ dst->length = cpu_to_be32(src->length);
+ dst++;
+ acount++;
+ }
+ src++;
+ count--;
+ }
+
+ if (acount == 0) {
+ /*
+ * Bug: 1476 (as well as 1498, 1456 and 1643)
+ * Set up the SGL in the WR to make it easier for the RNIC.
+ * This way, the FW doesn't have to deal with special cases.
+ * Setting length=0 should be sufficient.
+ */
+ dst->stag = 0;
+ dst->to = 0;
+ dst->length = 0;
+ }
+
+ *p_len = tot;
+ *actual_count = acount;
+ return 0;
+}
+
+/*
+ * Function: c2_activity (private function)
+ *
+ * Description:
+ * Post an mq index to the host->adapter activity fifo.
+ *
+ * IN:
+ * c2dev - ptr to c2dev structure
+ * mq_index - mq index to post
+ * shared - value most recently written to shared
+ *
+ * OUT:
+ *
+ * Return:
+ * none
+ */
+static inline void c2_activity(struct c2_dev *c2dev, u32 mq_index, u16 shared)
+{
+ /*
+ * First read the register to see if the FIFO is full, and if so,
+ * spin until it's not. This isn't perfect -- there is no
+ * synchronization among the clients of the register, but in
+ * practice it prevents multiple CPUs from hammering the bus
+ * with PCI RETRY. Note that when this does happen, the card
+ * cannot get on the bus and the card and system hang in a
+ * deadlock -- thus the need for this code. [TOT]
+ */
+ while (readl(c2dev->regs + PCI_BAR0_ADAPTER_HINT) & 0x80000000)
+ udelay(10);
+
+ __raw_writel(C2_HINT_MAKE(mq_index, shared),
+ c2dev->regs + PCI_BAR0_ADAPTER_HINT);
+}
+
+/*
+ * Function: qp_wr_post
+ *
+ * Description:
+ * This inline function allocates an MQ message, copies the host copy of
+ * the WR into it and then posts the message.
+ *
+ * IN:
+ * q - ptr to user MQ.
+ * wr - ptr to host-copy of the WR.
+ * qp - ptr to user qp
+ * size - Number of bytes to post. Assumed to be divisible by 4.
+ *
+ * OUT: none
+ *
+ * Return:
+ * CCIL status codes.
+ */
+static int qp_wr_post(struct c2_mq *q, union c2wr * wr, struct c2_qp *qp, u32 size)
+{
+ union c2wr *msg;
+
+ msg = c2_mq_alloc(q);
+ if (msg == NULL) {
+ return -EINVAL;
+ }
+#ifdef CCMSGMAGIC
+ ((c2wr_hdr_t *) wr)->magic = cpu_to_be32(CCWR_MAGIC);
+#endif
+
+ /*
+ * Since all header fields in the WR are the same as the
+ * CQE, set the following so the adapter need not.
+ */
+ c2_wr_set_result(wr, CCERR_PENDING);
+
+ /*
+ * Copy the wr down to the adapter
+ */
+ memcpy((void *) msg, (void *) wr, size);
+
+ c2_mq_produce(q);
+ return 0;
+}
+
+
+int c2_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr,
+ struct ib_send_wr **bad_wr)
+{
+ struct c2_dev *c2dev = to_c2dev(ibqp->device);
+ struct c2_qp *qp = to_c2qp(ibqp);
+ union c2wr wr;
+ unsigned long lock_flags;
+ int err = 0;
+
+ u32 flags;
+ u32 tot_len;
+ u8 actual_sge_count;
+ u32 msg_size;
+
+ if (qp->state > IB_QPS_RTS) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ while (ib_wr) {
+
+ flags = 0;
+ wr.sqwr.sq_hdr.user_hdr.hdr.context = ib_wr->wr_id;
+ if (ib_wr->send_flags & IB_SEND_SIGNALED) {
+ flags |= SQ_SIGNALED;
+ }
+
+ switch (ib_wr->opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_INV:
+ if (ib_wr->opcode == IB_WR_SEND) {
+ if (ib_wr->send_flags & IB_SEND_SOLICITED)
+ c2_wr_set_id(&wr, C2_WR_TYPE_SEND_SE);
+ else
+ c2_wr_set_id(&wr, C2_WR_TYPE_SEND);
+ wr.sqwr.send.remote_stag = 0;
+ } else {
+ if (ib_wr->send_flags & IB_SEND_SOLICITED)
+ c2_wr_set_id(&wr, C2_WR_TYPE_SEND_SE_INV);
+ else
+ c2_wr_set_id(&wr, C2_WR_TYPE_SEND_INV);
+ wr.sqwr.send.remote_stag =
+ cpu_to_be32(ib_wr->ex.invalidate_rkey);
+ }
+
+ msg_size = sizeof(struct c2wr_send_req) +
+ sizeof(struct c2_data_addr) * ib_wr->num_sge;
+ if (ib_wr->num_sge > qp->send_sgl_depth) {
+ err = -EINVAL;
+ break;
+ }
+ if (ib_wr->send_flags & IB_SEND_FENCE) {
+ flags |= SQ_READ_FENCE;
+ }
+ err = move_sgl((struct c2_data_addr *) & (wr.sqwr.send.data),
+ ib_wr->sg_list,
+ ib_wr->num_sge,
+ &tot_len, &actual_sge_count);
+ wr.sqwr.send.sge_len = cpu_to_be32(tot_len);
+ c2_wr_set_sge_count(&wr, actual_sge_count);
+ break;
+ case IB_WR_RDMA_WRITE:
+ c2_wr_set_id(&wr, C2_WR_TYPE_RDMA_WRITE);
+ msg_size = sizeof(struct c2wr_rdma_write_req) +
+ (sizeof(struct c2_data_addr) * ib_wr->num_sge);
+ if (ib_wr->num_sge > qp->rdma_write_sgl_depth) {
+ err = -EINVAL;
+ break;
+ }
+ if (ib_wr->send_flags & IB_SEND_FENCE) {
+ flags |= SQ_READ_FENCE;
+ }
+ wr.sqwr.rdma_write.remote_stag =
+ cpu_to_be32(ib_wr->wr.rdma.rkey);
+ wr.sqwr.rdma_write.remote_to =
+ cpu_to_be64(ib_wr->wr.rdma.remote_addr);
+ err = move_sgl((struct c2_data_addr *)
+ & (wr.sqwr.rdma_write.data),
+ ib_wr->sg_list,
+ ib_wr->num_sge,
+ &tot_len, &actual_sge_count);
+ wr.sqwr.rdma_write.sge_len = cpu_to_be32(tot_len);
+ c2_wr_set_sge_count(&wr, actual_sge_count);
+ break;
+ case IB_WR_RDMA_READ:
+ c2_wr_set_id(&wr, C2_WR_TYPE_RDMA_READ);
+ msg_size = sizeof(struct c2wr_rdma_read_req);
+
+ /* iWARP only supports 1 SGE for RDMA reads */
+ if (ib_wr->num_sge > 1) {
+ err = -EINVAL;
+ break;
+ }
+
+ /*
+ * Move the local and remote stag/to/len into the WR.
+ */
+ wr.sqwr.rdma_read.local_stag =
+ cpu_to_be32(ib_wr->sg_list->lkey);
+ wr.sqwr.rdma_read.local_to =
+ cpu_to_be64(ib_wr->sg_list->addr);
+ wr.sqwr.rdma_read.remote_stag =
+ cpu_to_be32(ib_wr->wr.rdma.rkey);
+ wr.sqwr.rdma_read.remote_to =
+ cpu_to_be64(ib_wr->wr.rdma.remote_addr);
+ wr.sqwr.rdma_read.length =
+ cpu_to_be32(ib_wr->sg_list->length);
+ break;
+ default:
+ /* error */
+ msg_size = 0;
+ err = -EINVAL;
+ break;
+ }
+
+ /*
+ * If we had an error on the last wr build, then
+ * break out. Possible errors include bogus WR
+ * type, and a bogus SGL length...
+ */
+ if (err) {
+ break;
+ }
+
+ /*
+ * Store flags
+ */
+ c2_wr_set_flags(&wr, flags);
+
+ /*
+ * Post the puppy!
+ */
+ spin_lock_irqsave(&qp->lock, lock_flags);
+ err = qp_wr_post(&qp->sq_mq, &wr, qp, msg_size);
+ if (err) {
+ spin_unlock_irqrestore(&qp->lock, lock_flags);
+ break;
+ }
+
+ /*
+ * Enqueue mq index to activity FIFO.
+ */
+ c2_activity(c2dev, qp->sq_mq.index, qp->sq_mq.hint_count);
+ spin_unlock_irqrestore(&qp->lock, lock_flags);
+
+ ib_wr = ib_wr->next;
+ }
+
+out:
+ if (err)
+ *bad_wr = ib_wr;
+ return err;
+}
+
+int c2_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct c2_dev *c2dev = to_c2dev(ibqp->device);
+ struct c2_qp *qp = to_c2qp(ibqp);
+ union c2wr wr;
+ unsigned long lock_flags;
+ int err = 0;
+
+ if (qp->state > IB_QPS_RTS) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /*
+ * Try and post each work request
+ */
+ while (ib_wr) {
+ u32 tot_len;
+ u8 actual_sge_count;
+
+ if (ib_wr->num_sge > qp->recv_sgl_depth) {
+ err = -EINVAL;
+ break;
+ }
+
+ /*
+ * Create local host-copy of the WR
+ */
+ wr.rqwr.rq_hdr.user_hdr.hdr.context = ib_wr->wr_id;
+ c2_wr_set_id(&wr, CCWR_RECV);
+ c2_wr_set_flags(&wr, 0);
+
+ /* sge_count is limited to eight bits. */
+ BUG_ON(ib_wr->num_sge >= 256);
+ err = move_sgl((struct c2_data_addr *) & (wr.rqwr.data),
+ ib_wr->sg_list,
+ ib_wr->num_sge, &tot_len, &actual_sge_count);
+ c2_wr_set_sge_count(&wr, actual_sge_count);
+
+ /*
+ * If we had an error on the last wr build, then
+ * break out. Possible errors include bogus WR
+ * type, and a bogus SGL length...
+ */
+ if (err) {
+ break;
+ }
+
+ spin_lock_irqsave(&qp->lock, lock_flags);
+ err = qp_wr_post(&qp->rq_mq, &wr, qp, qp->rq_mq.msg_size);
+ if (err) {
+ spin_unlock_irqrestore(&qp->lock, lock_flags);
+ break;
+ }
+
+ /*
+ * Enqueue mq index to activity FIFO
+ */
+ c2_activity(c2dev, qp->rq_mq.index, qp->rq_mq.hint_count);
+ spin_unlock_irqrestore(&qp->lock, lock_flags);
+
+ ib_wr = ib_wr->next;
+ }
+
+out:
+ if (err)
+ *bad_wr = ib_wr;
+ return err;
+}
+
+void c2_init_qp_table(struct c2_dev *c2dev)
+{
+ spin_lock_init(&c2dev->qp_table.lock);
+ idr_init(&c2dev->qp_table.idr);
+}
+
+void c2_cleanup_qp_table(struct c2_dev *c2dev)
+{
+ idr_destroy(&c2dev->qp_table.idr);
+}
diff --git a/drivers/infiniband/hw/amso1100/c2_rnic.c b/drivers/infiniband/hw/amso1100/c2_rnic.c
new file mode 100644
index 000000000..d2a6d9613
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_rnic.c
@@ -0,0 +1,655 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/delay.h>
+#include <linux/ethtool.h>
+#include <linux/mii.h>
+#include <linux/if_vlan.h>
+#include <linux/crc32.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/init.h>
+#include <linux/dma-mapping.h>
+#include <linux/mm.h>
+#include <linux/inet.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+
+#include <linux/route.h>
+
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/byteorder.h>
+#include <rdma/ib_smi.h>
+#include "c2.h"
+#include "c2_vq.h"
+
+/* Device capabilities */
+#define C2_MIN_PAGESIZE 1024
+
+#define C2_MAX_MRS 32768
+#define C2_MAX_QPS 16000
+#define C2_MAX_WQE_SZ 256
+#define C2_MAX_QP_WR ((128*1024)/C2_MAX_WQE_SZ)
+#define C2_MAX_SGES 4
+#define C2_MAX_SGE_RD 1
+#define C2_MAX_CQS 32768
+#define C2_MAX_CQES 4096
+#define C2_MAX_PDS 16384
+
+/*
+ * Send the adapter INIT message to the amso1100
+ */
+static int c2_adapter_init(struct c2_dev *c2dev)
+{
+ struct c2wr_init_req wr;
+ int err;
+
+ memset(&wr, 0, sizeof(wr));
+ c2_wr_set_id(&wr, CCWR_INIT);
+ wr.hdr.context = 0;
+ wr.hint_count = cpu_to_be64(c2dev->hint_count_dma);
+ wr.q0_host_shared = cpu_to_be64(c2dev->req_vq.shared_dma);
+ wr.q1_host_shared = cpu_to_be64(c2dev->rep_vq.shared_dma);
+ wr.q1_host_msg_pool = cpu_to_be64(c2dev->rep_vq.host_dma);
+ wr.q2_host_shared = cpu_to_be64(c2dev->aeq.shared_dma);
+ wr.q2_host_msg_pool = cpu_to_be64(c2dev->aeq.host_dma);
+
+ /* Post the init message */
+ err = vq_send_wr(c2dev, (union c2wr *) & wr);
+
+ return err;
+}
+
+/*
+ * Send the adapter TERM message to the amso1100
+ */
+static void c2_adapter_term(struct c2_dev *c2dev)
+{
+ struct c2wr_init_req wr;
+
+ memset(&wr, 0, sizeof(wr));
+ c2_wr_set_id(&wr, CCWR_TERM);
+ wr.hdr.context = 0;
+
+ /* Post the term message */
+ vq_send_wr(c2dev, (union c2wr *) & wr);
+ c2dev->init = 0;
+
+ return;
+}
+
+/*
+ * Query the adapter
+ */
+static int c2_rnic_query(struct c2_dev *c2dev, struct ib_device_attr *props)
+{
+ struct c2_vq_req *vq_req;
+ struct c2wr_rnic_query_req wr;
+ struct c2wr_rnic_query_rep *reply;
+ int err;
+
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req)
+ return -ENOMEM;
+
+ c2_wr_set_id(&wr, CCWR_RNIC_QUERY);
+ wr.hdr.context = (unsigned long) vq_req;
+ wr.rnic_handle = c2dev->adapter_handle;
+
+ vq_req_get(c2dev, vq_req);
+
+ err = vq_send_wr(c2dev, (union c2wr *) &wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail1;
+ }
+
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err)
+ goto bail1;
+
+ reply =
+ (struct c2wr_rnic_query_rep *) (unsigned long) (vq_req->reply_msg);
+ if (!reply)
+ err = -ENOMEM;
+ else
+ err = c2_errno(reply);
+ if (err)
+ goto bail2;
+
+ props->fw_ver =
+ ((u64)be32_to_cpu(reply->fw_ver_major) << 32) |
+ ((be32_to_cpu(reply->fw_ver_minor) & 0xFFFF) << 16) |
+ (be32_to_cpu(reply->fw_ver_patch) & 0xFFFF);
+ memcpy(&props->sys_image_guid, c2dev->netdev->dev_addr, 6);
+ props->max_mr_size = 0xFFFFFFFF;
+ props->page_size_cap = ~(C2_MIN_PAGESIZE-1);
+ props->vendor_id = be32_to_cpu(reply->vendor_id);
+ props->vendor_part_id = be32_to_cpu(reply->part_number);
+ props->hw_ver = be32_to_cpu(reply->hw_version);
+ props->max_qp = be32_to_cpu(reply->max_qps);
+ props->max_qp_wr = be32_to_cpu(reply->max_qp_depth);
+ props->device_cap_flags = c2dev->device_cap_flags;
+ props->max_sge = C2_MAX_SGES;
+ props->max_sge_rd = C2_MAX_SGE_RD;
+ props->max_cq = be32_to_cpu(reply->max_cqs);
+ props->max_cqe = be32_to_cpu(reply->max_cq_depth);
+ props->max_mr = be32_to_cpu(reply->max_mrs);
+ props->max_pd = be32_to_cpu(reply->max_pds);
+ props->max_qp_rd_atom = be32_to_cpu(reply->max_qp_ird);
+ props->max_ee_rd_atom = 0;
+ props->max_res_rd_atom = be32_to_cpu(reply->max_global_ird);
+ props->max_qp_init_rd_atom = be32_to_cpu(reply->max_qp_ord);
+ props->max_ee_init_rd_atom = 0;
+ props->atomic_cap = IB_ATOMIC_NONE;
+ props->max_ee = 0;
+ props->max_rdd = 0;
+ props->max_mw = be32_to_cpu(reply->max_mws);
+ props->max_raw_ipv6_qp = 0;
+ props->max_raw_ethy_qp = 0;
+ props->max_mcast_grp = 0;
+ props->max_mcast_qp_attach = 0;
+ props->max_total_mcast_qp_attach = 0;
+ props->max_ah = 0;
+ props->max_fmr = 0;
+ props->max_map_per_fmr = 0;
+ props->max_srq = 0;
+ props->max_srq_wr = 0;
+ props->max_srq_sge = 0;
+ props->max_pkeys = 0;
+ props->local_ca_ack_delay = 0;
+
+ bail2:
+ vq_repbuf_free(c2dev, reply);
+
+ bail1:
+ vq_req_free(c2dev, vq_req);
+ return err;
+}
+
+/*
+ * Add an IP address to the RNIC interface
+ */
+int c2_add_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask)
+{
+ struct c2_vq_req *vq_req;
+ struct c2wr_rnic_setconfig_req *wr;
+ struct c2wr_rnic_setconfig_rep *reply;
+ struct c2_netaddr netaddr;
+ int err, len;
+
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req)
+ return -ENOMEM;
+
+ len = sizeof(struct c2_netaddr);
+ wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+ if (!wr) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+
+ c2_wr_set_id(wr, CCWR_RNIC_SETCONFIG);
+ wr->hdr.context = (unsigned long) vq_req;
+ wr->rnic_handle = c2dev->adapter_handle;
+ wr->option = cpu_to_be32(C2_CFG_ADD_ADDR);
+
+ netaddr.ip_addr = inaddr;
+ netaddr.netmask = inmask;
+ netaddr.mtu = 0;
+
+ memcpy(wr->data, &netaddr, len);
+
+ vq_req_get(c2dev, vq_req);
+
+ err = vq_send_wr(c2dev, (union c2wr *) wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail1;
+ }
+
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err)
+ goto bail1;
+
+ reply =
+ (struct c2wr_rnic_setconfig_rep *) (unsigned long) (vq_req->reply_msg);
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail1;
+ }
+
+ err = c2_errno(reply);
+ vq_repbuf_free(c2dev, reply);
+
+ bail1:
+ kfree(wr);
+ bail0:
+ vq_req_free(c2dev, vq_req);
+ return err;
+}
+
+/*
+ * Delete an IP address from the RNIC interface
+ */
+int c2_del_addr(struct c2_dev *c2dev, __be32 inaddr, __be32 inmask)
+{
+ struct c2_vq_req *vq_req;
+ struct c2wr_rnic_setconfig_req *wr;
+ struct c2wr_rnic_setconfig_rep *reply;
+ struct c2_netaddr netaddr;
+ int err, len;
+
+ vq_req = vq_req_alloc(c2dev);
+ if (!vq_req)
+ return -ENOMEM;
+
+ len = sizeof(struct c2_netaddr);
+ wr = kmalloc(c2dev->req_vq.msg_size, GFP_KERNEL);
+ if (!wr) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+
+ c2_wr_set_id(wr, CCWR_RNIC_SETCONFIG);
+ wr->hdr.context = (unsigned long) vq_req;
+ wr->rnic_handle = c2dev->adapter_handle;
+ wr->option = cpu_to_be32(C2_CFG_DEL_ADDR);
+
+ netaddr.ip_addr = inaddr;
+ netaddr.netmask = inmask;
+ netaddr.mtu = 0;
+
+ memcpy(wr->data, &netaddr, len);
+
+ vq_req_get(c2dev, vq_req);
+
+ err = vq_send_wr(c2dev, (union c2wr *) wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail1;
+ }
+
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err)
+ goto bail1;
+
+ reply =
+ (struct c2wr_rnic_setconfig_rep *) (unsigned long) (vq_req->reply_msg);
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail1;
+ }
+
+ err = c2_errno(reply);
+ vq_repbuf_free(c2dev, reply);
+
+ bail1:
+ kfree(wr);
+ bail0:
+ vq_req_free(c2dev, vq_req);
+ return err;
+}
+
+/*
+ * Open a single RNIC instance to use with all
+ * low-level OpenIB calls.
+ */
+static int c2_rnic_open(struct c2_dev *c2dev)
+{
+ struct c2_vq_req *vq_req;
+ union c2wr wr;
+ struct c2wr_rnic_open_rep *reply;
+ int err;
+
+ vq_req = vq_req_alloc(c2dev);
+ if (vq_req == NULL) {
+ return -ENOMEM;
+ }
+
+ memset(&wr, 0, sizeof(wr));
+ c2_wr_set_id(&wr, CCWR_RNIC_OPEN);
+ wr.rnic_open.req.hdr.context = (unsigned long) (vq_req);
+ wr.rnic_open.req.flags = cpu_to_be16(RNIC_PRIV_MODE);
+ wr.rnic_open.req.port_num = cpu_to_be16(0);
+ wr.rnic_open.req.user_context = (unsigned long) c2dev;
+
+ vq_req_get(c2dev, vq_req);
+
+ err = vq_send_wr(c2dev, &wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail0;
+ }
+
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err) {
+ goto bail0;
+ }
+
+ reply = (struct c2wr_rnic_open_rep *) (unsigned long) (vq_req->reply_msg);
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+
+ if ((err = c2_errno(reply)) != 0) {
+ goto bail1;
+ }
+
+ c2dev->adapter_handle = reply->rnic_handle;
+
+ bail1:
+ vq_repbuf_free(c2dev, reply);
+ bail0:
+ vq_req_free(c2dev, vq_req);
+ return err;
+}
+
+/*
+ * Close the RNIC instance
+ */
+static int c2_rnic_close(struct c2_dev *c2dev)
+{
+ struct c2_vq_req *vq_req;
+ union c2wr wr;
+ struct c2wr_rnic_close_rep *reply;
+ int err;
+
+ vq_req = vq_req_alloc(c2dev);
+ if (vq_req == NULL) {
+ return -ENOMEM;
+ }
+
+ memset(&wr, 0, sizeof(wr));
+ c2_wr_set_id(&wr, CCWR_RNIC_CLOSE);
+ wr.rnic_close.req.hdr.context = (unsigned long) vq_req;
+ wr.rnic_close.req.rnic_handle = c2dev->adapter_handle;
+
+ vq_req_get(c2dev, vq_req);
+
+ err = vq_send_wr(c2dev, &wr);
+ if (err) {
+ vq_req_put(c2dev, vq_req);
+ goto bail0;
+ }
+
+ err = vq_wait_for_reply(c2dev, vq_req);
+ if (err) {
+ goto bail0;
+ }
+
+ reply = (struct c2wr_rnic_close_rep *) (unsigned long) (vq_req->reply_msg);
+ if (!reply) {
+ err = -ENOMEM;
+ goto bail0;
+ }
+
+ if ((err = c2_errno(reply)) != 0) {
+ goto bail1;
+ }
+
+ c2dev->adapter_handle = 0;
+
+ bail1:
+ vq_repbuf_free(c2dev, reply);
+ bail0:
+ vq_req_free(c2dev, vq_req);
+ return err;
+}
+
+/*
+ * Called by c2_probe to initialize the RNIC. This principally
+ * involves initializing the various limits and resource pools that
+ * comprise the RNIC instance.
+ */
+int c2_rnic_init(struct c2_dev *c2dev)
+{
+ int err;
+ u32 qsize, msgsize;
+ void *q1_pages;
+ void *q2_pages;
+ void __iomem *mmio_regs;
+
+ /* Device capabilities */
+ c2dev->device_cap_flags =
+ (IB_DEVICE_RESIZE_MAX_WR |
+ IB_DEVICE_CURR_QP_STATE_MOD |
+ IB_DEVICE_SYS_IMAGE_GUID |
+ IB_DEVICE_LOCAL_DMA_LKEY |
+ IB_DEVICE_MEM_WINDOW);
+
+ /* Allocate the qptr_array */
+ c2dev->qptr_array = vzalloc(C2_MAX_CQS * sizeof(void *));
+ if (!c2dev->qptr_array) {
+ return -ENOMEM;
+ }
+
+ /* Initialize the qptr_array */
+ c2dev->qptr_array[0] = (void *) &c2dev->req_vq;
+ c2dev->qptr_array[1] = (void *) &c2dev->rep_vq;
+ c2dev->qptr_array[2] = (void *) &c2dev->aeq;
+
+ /* Initialize data structures */
+ init_waitqueue_head(&c2dev->req_vq_wo);
+ spin_lock_init(&c2dev->vqlock);
+ spin_lock_init(&c2dev->lock);
+
+ /* Allocate MQ shared pointer pool for kernel clients. User
+ * mode client pools are hung off the user context
+ */
+ err = c2_init_mqsp_pool(c2dev, GFP_KERNEL, &c2dev->kern_mqsp_pool);
+ if (err) {
+ goto bail0;
+ }
+
+ /* Allocate shared pointers for Q0, Q1, and Q2 from
+ * the shared pointer pool.
+ */
+
+ c2dev->hint_count = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+ &c2dev->hint_count_dma,
+ GFP_KERNEL);
+ c2dev->req_vq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+ &c2dev->req_vq.shared_dma,
+ GFP_KERNEL);
+ c2dev->rep_vq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+ &c2dev->rep_vq.shared_dma,
+ GFP_KERNEL);
+ c2dev->aeq.shared = c2_alloc_mqsp(c2dev, c2dev->kern_mqsp_pool,
+ &c2dev->aeq.shared_dma, GFP_KERNEL);
+ if (!c2dev->hint_count || !c2dev->req_vq.shared ||
+ !c2dev->rep_vq.shared || !c2dev->aeq.shared) {
+ err = -ENOMEM;
+ goto bail1;
+ }
+
+ mmio_regs = c2dev->kva;
+ /* Initialize the Verbs Request Queue */
+ c2_mq_req_init(&c2dev->req_vq, 0,
+ be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_QSIZE)),
+ be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_MSGSIZE)),
+ mmio_regs +
+ be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_POOLSTART)),
+ mmio_regs +
+ be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q0_SHARED)),
+ C2_MQ_ADAPTER_TARGET);
+
+ /* Initialize the Verbs Reply Queue */
+ qsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_QSIZE));
+ msgsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_MSGSIZE));
+ q1_pages = dma_alloc_coherent(&c2dev->pcidev->dev, qsize * msgsize,
+ &c2dev->rep_vq.host_dma, GFP_KERNEL);
+ if (!q1_pages) {
+ err = -ENOMEM;
+ goto bail1;
+ }
+ dma_unmap_addr_set(&c2dev->rep_vq, mapping, c2dev->rep_vq.host_dma);
+ pr_debug("%s rep_vq va %p dma %llx\n", __func__, q1_pages,
+ (unsigned long long) c2dev->rep_vq.host_dma);
+ c2_mq_rep_init(&c2dev->rep_vq,
+ 1,
+ qsize,
+ msgsize,
+ q1_pages,
+ mmio_regs +
+ be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q1_SHARED)),
+ C2_MQ_HOST_TARGET);
+
+ /* Initialize the Asynchronous Event Queue */
+ qsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_QSIZE));
+ msgsize = be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_MSGSIZE));
+ q2_pages = dma_alloc_coherent(&c2dev->pcidev->dev, qsize * msgsize,
+ &c2dev->aeq.host_dma, GFP_KERNEL);
+ if (!q2_pages) {
+ err = -ENOMEM;
+ goto bail2;
+ }
+ dma_unmap_addr_set(&c2dev->aeq, mapping, c2dev->aeq.host_dma);
+ pr_debug("%s aeq va %p dma %llx\n", __func__, q2_pages,
+ (unsigned long long) c2dev->aeq.host_dma);
+ c2_mq_rep_init(&c2dev->aeq,
+ 2,
+ qsize,
+ msgsize,
+ q2_pages,
+ mmio_regs +
+ be32_to_cpu((__force __be32) readl(mmio_regs + C2_REGS_Q2_SHARED)),
+ C2_MQ_HOST_TARGET);
+
+ /* Initialize the verbs request allocator */
+ err = vq_init(c2dev);
+ if (err)
+ goto bail3;
+
+ /* Enable interrupts on the adapter */
+ writel(0, c2dev->regs + C2_IDIS);
+
+ /* create the WR init message */
+ err = c2_adapter_init(c2dev);
+ if (err)
+ goto bail4;
+ c2dev->init++;
+
+ /* open an adapter instance */
+ err = c2_rnic_open(c2dev);
+ if (err)
+ goto bail4;
+
+ /* Initialize the cached adapter limits */
+ err = c2_rnic_query(c2dev, &c2dev->props);
+ if (err)
+ goto bail5;
+
+ /* Initialize the PD pool */
+ err = c2_init_pd_table(c2dev);
+ if (err)
+ goto bail5;
+
+ /* Initialize the QP pool */
+ c2_init_qp_table(c2dev);
+ return 0;
+
+ bail5:
+ c2_rnic_close(c2dev);
+ bail4:
+ vq_term(c2dev);
+ bail3:
+ dma_free_coherent(&c2dev->pcidev->dev,
+ c2dev->aeq.q_size * c2dev->aeq.msg_size,
+ q2_pages, dma_unmap_addr(&c2dev->aeq, mapping));
+ bail2:
+ dma_free_coherent(&c2dev->pcidev->dev,
+ c2dev->rep_vq.q_size * c2dev->rep_vq.msg_size,
+ q1_pages, dma_unmap_addr(&c2dev->rep_vq, mapping));
+ bail1:
+ c2_free_mqsp_pool(c2dev, c2dev->kern_mqsp_pool);
+ bail0:
+ vfree(c2dev->qptr_array);
+
+ return err;
+}
+
+/*
+ * Called by c2_remove to cleanup the RNIC resources.
+ */
+void c2_rnic_term(struct c2_dev *c2dev)
+{
+
+ /* Close the open adapter instance */
+ c2_rnic_close(c2dev);
+
+ /* Send the TERM message to the adapter */
+ c2_adapter_term(c2dev);
+
+ /* Disable interrupts on the adapter */
+ writel(1, c2dev->regs + C2_IDIS);
+
+ /* Free the QP pool */
+ c2_cleanup_qp_table(c2dev);
+
+ /* Free the PD pool */
+ c2_cleanup_pd_table(c2dev);
+
+ /* Free the verbs request allocator */
+ vq_term(c2dev);
+
+ /* Free the asynchronous event queue */
+ dma_free_coherent(&c2dev->pcidev->dev,
+ c2dev->aeq.q_size * c2dev->aeq.msg_size,
+ c2dev->aeq.msg_pool.host,
+ dma_unmap_addr(&c2dev->aeq, mapping));
+
+ /* Free the verbs reply queue */
+ dma_free_coherent(&c2dev->pcidev->dev,
+ c2dev->rep_vq.q_size * c2dev->rep_vq.msg_size,
+ c2dev->rep_vq.msg_pool.host,
+ dma_unmap_addr(&c2dev->rep_vq, mapping));
+
+ /* Free the MQ shared pointer pool */
+ c2_free_mqsp_pool(c2dev, c2dev->kern_mqsp_pool);
+
+ /* Free the qptr_array */
+ vfree(c2dev->qptr_array);
+
+ return;
+}
diff --git a/drivers/infiniband/hw/amso1100/c2_status.h b/drivers/infiniband/hw/amso1100/c2_status.h
new file mode 100644
index 000000000..6ee4aa92d
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_status.h
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _C2_STATUS_H_
+#define _C2_STATUS_H_
+
+/*
+ * Verbs Status Codes
+ */
+enum c2_status {
+ C2_OK = 0, /* This must be zero */
+ CCERR_INSUFFICIENT_RESOURCES = 1,
+ CCERR_INVALID_MODIFIER = 2,
+ CCERR_INVALID_MODE = 3,
+ CCERR_IN_USE = 4,
+ CCERR_INVALID_RNIC = 5,
+ CCERR_INTERRUPTED_OPERATION = 6,
+ CCERR_INVALID_EH = 7,
+ CCERR_INVALID_CQ = 8,
+ CCERR_CQ_EMPTY = 9,
+ CCERR_NOT_IMPLEMENTED = 10,
+ CCERR_CQ_DEPTH_TOO_SMALL = 11,
+ CCERR_PD_IN_USE = 12,
+ CCERR_INVALID_PD = 13,
+ CCERR_INVALID_SRQ = 14,
+ CCERR_INVALID_ADDRESS = 15,
+ CCERR_INVALID_NETMASK = 16,
+ CCERR_INVALID_QP = 17,
+ CCERR_INVALID_QP_STATE = 18,
+ CCERR_TOO_MANY_WRS_POSTED = 19,
+ CCERR_INVALID_WR_TYPE = 20,
+ CCERR_INVALID_SGL_LENGTH = 21,
+ CCERR_INVALID_SQ_DEPTH = 22,
+ CCERR_INVALID_RQ_DEPTH = 23,
+ CCERR_INVALID_ORD = 24,
+ CCERR_INVALID_IRD = 25,
+ CCERR_QP_ATTR_CANNOT_CHANGE = 26,
+ CCERR_INVALID_STAG = 27,
+ CCERR_QP_IN_USE = 28,
+ CCERR_OUTSTANDING_WRS = 29,
+ CCERR_STAG_IN_USE = 30,
+ CCERR_INVALID_STAG_INDEX = 31,
+ CCERR_INVALID_SGL_FORMAT = 32,
+ CCERR_ADAPTER_TIMEOUT = 33,
+ CCERR_INVALID_CQ_DEPTH = 34,
+ CCERR_INVALID_PRIVATE_DATA_LENGTH = 35,
+ CCERR_INVALID_EP = 36,
+ CCERR_MR_IN_USE = CCERR_STAG_IN_USE,
+ CCERR_FLUSHED = 38,
+ CCERR_INVALID_WQE = 39,
+ CCERR_LOCAL_QP_CATASTROPHIC_ERROR = 40,
+ CCERR_REMOTE_TERMINATION_ERROR = 41,
+ CCERR_BASE_AND_BOUNDS_VIOLATION = 42,
+ CCERR_ACCESS_VIOLATION = 43,
+ CCERR_INVALID_PD_ID = 44,
+ CCERR_WRAP_ERROR = 45,
+ CCERR_INV_STAG_ACCESS_ERROR = 46,
+ CCERR_ZERO_RDMA_READ_RESOURCES = 47,
+ CCERR_QP_NOT_PRIVILEGED = 48,
+ CCERR_STAG_STATE_NOT_INVALID = 49,
+ CCERR_INVALID_PAGE_SIZE = 50,
+ CCERR_INVALID_BUFFER_SIZE = 51,
+ CCERR_INVALID_PBE = 52,
+ CCERR_INVALID_FBO = 53,
+ CCERR_INVALID_LENGTH = 54,
+ CCERR_INVALID_ACCESS_RIGHTS = 55,
+ CCERR_PBL_TOO_BIG = 56,
+ CCERR_INVALID_VA = 57,
+ CCERR_INVALID_REGION = 58,
+ CCERR_INVALID_WINDOW = 59,
+ CCERR_TOTAL_LENGTH_TOO_BIG = 60,
+ CCERR_INVALID_QP_ID = 61,
+ CCERR_ADDR_IN_USE = 62,
+ CCERR_ADDR_NOT_AVAIL = 63,
+ CCERR_NET_DOWN = 64,
+ CCERR_NET_UNREACHABLE = 65,
+ CCERR_CONN_ABORTED = 66,
+ CCERR_CONN_RESET = 67,
+ CCERR_NO_BUFS = 68,
+ CCERR_CONN_TIMEDOUT = 69,
+ CCERR_CONN_REFUSED = 70,
+ CCERR_HOST_UNREACHABLE = 71,
+ CCERR_INVALID_SEND_SGL_DEPTH = 72,
+ CCERR_INVALID_RECV_SGL_DEPTH = 73,
+ CCERR_INVALID_RDMA_WRITE_SGL_DEPTH = 74,
+ CCERR_INSUFFICIENT_PRIVILEGES = 75,
+ CCERR_STACK_ERROR = 76,
+ CCERR_INVALID_VERSION = 77,
+ CCERR_INVALID_MTU = 78,
+ CCERR_INVALID_IMAGE = 79,
+ CCERR_PENDING = 98, /* not an error; used internally by adapter */
+ CCERR_DEFER = 99, /* not an error; used internally by adapter */
+ CCERR_FAILED_WRITE = 100,
+ CCERR_FAILED_ERASE = 101,
+ CCERR_FAILED_VERIFICATION = 102,
+ CCERR_NOT_FOUND = 103,
+
+};
+
+/*
+ * CCAE_ACTIVE_CONNECT_RESULTS status result codes.
+ */
+enum c2_connect_status {
+ C2_CONN_STATUS_SUCCESS = C2_OK,
+ C2_CONN_STATUS_NO_MEM = CCERR_INSUFFICIENT_RESOURCES,
+ C2_CONN_STATUS_TIMEDOUT = CCERR_CONN_TIMEDOUT,
+ C2_CONN_STATUS_REFUSED = CCERR_CONN_REFUSED,
+ C2_CONN_STATUS_NETUNREACH = CCERR_NET_UNREACHABLE,
+ C2_CONN_STATUS_HOSTUNREACH = CCERR_HOST_UNREACHABLE,
+ C2_CONN_STATUS_INVALID_RNIC = CCERR_INVALID_RNIC,
+ C2_CONN_STATUS_INVALID_QP = CCERR_INVALID_QP,
+ C2_CONN_STATUS_INVALID_QP_STATE = CCERR_INVALID_QP_STATE,
+ C2_CONN_STATUS_REJECTED = CCERR_CONN_RESET,
+ C2_CONN_STATUS_ADDR_NOT_AVAIL = CCERR_ADDR_NOT_AVAIL,
+};
+
+/*
+ * Flash programming status codes.
+ */
+enum c2_flash_status {
+ C2_FLASH_STATUS_SUCCESS = 0x0000,
+ C2_FLASH_STATUS_VERIFY_ERR = 0x0002,
+ C2_FLASH_STATUS_IMAGE_ERR = 0x0004,
+ C2_FLASH_STATUS_ECLBS = 0x0400,
+ C2_FLASH_STATUS_PSLBS = 0x0800,
+ C2_FLASH_STATUS_VPENS = 0x1000,
+};
+
+#endif /* _C2_STATUS_H_ */
diff --git a/drivers/infiniband/hw/amso1100/c2_user.h b/drivers/infiniband/hw/amso1100/c2_user.h
new file mode 100644
index 000000000..7e9e7ad65
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_user.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef C2_USER_H
+#define C2_USER_H
+
+#include <linux/types.h>
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
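+
+/*
+ * Illustrative sketch of the usual idiom (an assumption, not code taken
+ * from this driver): a pointer is passed through one of the __u64 fields
+ * below with a double cast so the layout stays 32/64-bit clean;
+ * db_page_va is a hypothetical local used only for illustration:
+ *
+ *	cmd.sq_db_page = (__u64) (unsigned long) db_page_va;
+ */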
+
+struct c2_alloc_ucontext_resp {
+ __u32 qp_tab_size;
+ __u32 uarc_size;
+};
+
+struct c2_alloc_pd_resp {
+ __u32 pdn;
+ __u32 reserved;
+};
+
+struct c2_create_cq {
+ __u32 lkey;
+ __u32 pdn;
+ __u64 arm_db_page;
+ __u64 set_db_page;
+ __u32 arm_db_index;
+ __u32 set_db_index;
+};
+
+struct c2_create_cq_resp {
+ __u32 cqn;
+ __u32 reserved;
+};
+
+struct c2_create_qp {
+ __u32 lkey;
+ __u32 reserved;
+ __u64 sq_db_page;
+ __u64 rq_db_page;
+ __u32 sq_db_index;
+ __u32 rq_db_index;
+};
+
+#endif /* C2_USER_H */
diff --git a/drivers/infiniband/hw/amso1100/c2_vq.c b/drivers/infiniband/hw/amso1100/c2_vq.c
new file mode 100644
index 000000000..2ec716fb2
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_vq.c
@@ -0,0 +1,260 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#include "c2_vq.h"
+#include "c2_provider.h"
+
+/*
+ * Verbs Request Objects:
+ *
+ * VQ Request Objects are allocated by the kernel verbs handlers.
+ * They contain a wait object, a refcnt, an atomic bool indicating that the
+ * adapter has replied, and a copy of the verb reply work request.
+ * A pointer to the VQ Request Object is passed down in the context
+ * field of the work request message, and reflected back by the adapter
+ * in the verbs reply message. The function handle_vq() in the interrupt
+ * path will use this pointer to:
+ * 1) append a copy of the verbs reply message
+ * 2) mark that the reply is ready
+ * 3) wake up the kernel verbs handler blocked awaiting the reply.
+ *
+ *
+ * The kernel verbs handlers do a "get" to put a 2nd reference on the
+ * VQ Request object. If the kernel verbs handler exits before the adapter
+ * can respond, this extra reference will keep the VQ Request object around
+ * until the adapter's reply can be processed. The reason we need this is
+ * because a pointer to this object is stuffed into the context field of
+ * the verbs work request message, and reflected back in the reply message.
+ * It is used in the interrupt handler (handle_vq()) to wake up the appropriate
+ * kernel verb handler that is blocked awaiting the verb reply.
+ * So handle_vq() will do a "put" on the object when it's done accessing it.
+ * NOTE: If we guarantee that the kernel verb handler will never bail before
+ * getting the reply, then we don't need these refcnts.
+ *
+ *
+ * VQ Request objects are freed by the kernel verbs handlers only
+ * after the verb has been processed, or when the adapter fails and
+ * does not reply.
+ *
+ *
+ * Verbs Reply Buffers:
+ *
+ * VQ Reply bufs are local host memory copies of an
+ * outstanding Verb Request reply message.
+ * They are always allocated by the kernel verbs handlers, and _may_ be
+ * freed by either the kernel verbs handler -or- the interrupt handler. The
+ * kernel verbs handler _must_ free the repbuf, then free the vq request object
+ * in that order.
+ */
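+
+/*
+ * Illustrative sketch of the typical calling sequence (this mirrors the
+ * verbs handlers in c2_rnic.c; it is not additional driver code):
+ *
+ *	vq_req = vq_req_alloc(c2dev);
+ *	wr->hdr.context = (unsigned long) vq_req;
+ *	vq_req_get(c2dev, vq_req);
+ *	err = vq_send_wr(c2dev, (union c2wr *) wr);
+ *	if (err)
+ *		vq_req_put(c2dev, vq_req);	/* send failed: drop extra ref */
+ *	else
+ *		err = vq_wait_for_reply(c2dev, vq_req);
+ *	...
+ *	vq_repbuf_free(c2dev, reply);	/* free the reply buf first... */
+ *	vq_req_free(c2dev, vq_req);	/* ...then the request object */
+ */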
+
+int vq_init(struct c2_dev *c2dev)
+{
+ sprintf(c2dev->vq_cache_name, "c2-vq:dev%c",
+ (char) ('0' + c2dev->devnum));
+ c2dev->host_msg_cache =
+ kmem_cache_create(c2dev->vq_cache_name, c2dev->rep_vq.msg_size, 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (c2dev->host_msg_cache == NULL) {
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+void vq_term(struct c2_dev *c2dev)
+{
+ kmem_cache_destroy(c2dev->host_msg_cache);
+}
+
+/* vq_req_alloc - allocate a VQ Request Object and initialize it.
+ * The refcnt is set to 1.
+ */
+struct c2_vq_req *vq_req_alloc(struct c2_dev *c2dev)
+{
+ struct c2_vq_req *r;
+
+ r = kmalloc(sizeof(struct c2_vq_req), GFP_KERNEL);
+ if (r) {
+ init_waitqueue_head(&r->wait_object);
+ r->reply_msg = 0;
+ r->event = 0;
+ r->cm_id = NULL;
+ r->qp = NULL;
+ atomic_set(&r->refcnt, 1);
+ atomic_set(&r->reply_ready, 0);
+ }
+ return r;
+}
+
+
+/* vq_req_free - free the VQ Request Object. It is assumed the verbs handler
+ * has already freed the VQ Reply Buffer if it existed.
+ */
+void vq_req_free(struct c2_dev *c2dev, struct c2_vq_req *r)
+{
+ r->reply_msg = 0;
+ if (atomic_dec_and_test(&r->refcnt)) {
+ kfree(r);
+ }
+}
+
+/* vq_req_get - reference a VQ Request Object. Done
+ * only in the kernel verbs handlers.
+ */
+void vq_req_get(struct c2_dev *c2dev, struct c2_vq_req *r)
+{
+ atomic_inc(&r->refcnt);
+}
+
+
+/* vq_req_put - dereference and potentially free a VQ Request Object.
+ *
+ * This is only called by handle_vq() on the
+ * interrupt when it is done processing
+ * a verb reply message. If the associated
+ * kernel verbs handler has already bailed,
+ * then this put will actually free the VQ
+ * Request object _and_ the VQ Reply Buffer
+ * if it exists.
+ */
+void vq_req_put(struct c2_dev *c2dev, struct c2_vq_req *r)
+{
+ if (atomic_dec_and_test(&r->refcnt)) {
+ if (r->reply_msg != 0)
+ vq_repbuf_free(c2dev,
+ (void *) (unsigned long) r->reply_msg);
+ kfree(r);
+ }
+}
+
+
+/*
+ * vq_repbuf_alloc - allocate a VQ Reply Buffer.
+ */
+void *vq_repbuf_alloc(struct c2_dev *c2dev)
+{
+ return kmem_cache_alloc(c2dev->host_msg_cache, GFP_ATOMIC);
+}
+
+/*
+ * vq_send_wr - post a verbs request message to the Verbs Request Queue.
+ * If a message is not available in the MQ, then block until one is available.
+ * NOTE: handle_mq() in the interrupt context will wake up threads blocked here.
+ * When the adapter drains the Verbs Request Queue, it inserts MQ index 0
+ * into the adapter->host activity fifo and interrupts the host.
+ */
+int vq_send_wr(struct c2_dev *c2dev, union c2wr *wr)
+{
+ void *msg;
+ wait_queue_t __wait;
+
+ /*
+ * grab adapter vq lock
+ */
+ spin_lock(&c2dev->vqlock);
+
+ /*
+ * allocate msg
+ */
+ msg = c2_mq_alloc(&c2dev->req_vq);
+
+ /*
+ * If we cannot get a msg, then we'll wait.
+ * When a message becomes available, the interrupt handler will wake_up()
+ * any waiters.
+ */
+ while (msg == NULL) {
+ pr_debug("%s:%d no available msg in VQ, waiting...\n",
+ __func__, __LINE__);
+ init_waitqueue_entry(&__wait, current);
+ add_wait_queue(&c2dev->req_vq_wo, &__wait);
+ spin_unlock(&c2dev->vqlock);
+ for (;;) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (!c2_mq_full(&c2dev->req_vq)) {
+ break;
+ }
+ if (!signal_pending(current)) {
+ schedule_timeout(1 * HZ); /* 1 second... */
+ continue;
+ }
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(&c2dev->req_vq_wo, &__wait);
+ return -EINTR;
+ }
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(&c2dev->req_vq_wo, &__wait);
+ spin_lock(&c2dev->vqlock);
+ msg = c2_mq_alloc(&c2dev->req_vq);
+ }
+
+ /*
+ * copy wr into adapter msg
+ */
+ memcpy(msg, wr, c2dev->req_vq.msg_size);
+
+ /*
+ * post msg
+ */
+ c2_mq_produce(&c2dev->req_vq);
+
+ /*
+ * release adapter vq lock
+ */
+ spin_unlock(&c2dev->vqlock);
+ return 0;
+}
+
+
+/*
+ * vq_wait_for_reply - block until the adapter posts a Verb Reply Message.
+ */
+int vq_wait_for_reply(struct c2_dev *c2dev, struct c2_vq_req *req)
+{
+ if (!wait_event_timeout(req->wait_object,
+ atomic_read(&req->reply_ready),
+ 60*HZ))
+ return -ETIMEDOUT;
+
+ return 0;
+}
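+
+/*
+ * For reference, an illustrative sketch of the interrupt-path side of the
+ * handshake described above (the real code is handle_vq(); roughly):
+ *
+ *	req->reply_msg = (u64) (unsigned long) reply_copy;
+ *	atomic_set(&req->reply_ready, 1);
+ *	wake_up(&req->wait_object);
+ *	vq_req_put(c2dev, req);		/* may free req if the handler bailed */
+ */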
+
+/*
+ * vq_repbuf_free - Free a Verbs Reply Buffer.
+ */
+void vq_repbuf_free(struct c2_dev *c2dev, void *reply)
+{
+ kmem_cache_free(c2dev->host_msg_cache, reply);
+}
diff --git a/drivers/infiniband/hw/amso1100/c2_vq.h b/drivers/infiniband/hw/amso1100/c2_vq.h
new file mode 100644
index 000000000..33805627a
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_vq.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _C2_VQ_H_
+#define _C2_VQ_H_
+#include <linux/sched.h>
+#include "c2.h"
+#include "c2_wr.h"
+#include "c2_provider.h"
+
+struct c2_vq_req {
+ u64 reply_msg; /* ptr to reply msg */
+ wait_queue_head_t wait_object; /* wait object for vq reqs */
+ atomic_t reply_ready; /* set when reply is ready */
+ atomic_t refcnt; /* used to cancel WRs... */
+ int event;
+ struct iw_cm_id *cm_id;
+ struct c2_qp *qp;
+};
+
+extern int vq_init(struct c2_dev *c2dev);
+extern void vq_term(struct c2_dev *c2dev);
+
+extern struct c2_vq_req *vq_req_alloc(struct c2_dev *c2dev);
+extern void vq_req_free(struct c2_dev *c2dev, struct c2_vq_req *req);
+extern void vq_req_get(struct c2_dev *c2dev, struct c2_vq_req *req);
+extern void vq_req_put(struct c2_dev *c2dev, struct c2_vq_req *req);
+extern int vq_send_wr(struct c2_dev *c2dev, union c2wr * wr);
+
+extern void *vq_repbuf_alloc(struct c2_dev *c2dev);
+extern void vq_repbuf_free(struct c2_dev *c2dev, void *reply);
+
+extern int vq_wait_for_reply(struct c2_dev *c2dev, struct c2_vq_req *req);
+#endif /* _C2_VQ_H_ */
diff --git a/drivers/infiniband/hw/amso1100/c2_wr.h b/drivers/infiniband/hw/amso1100/c2_wr.h
new file mode 100644
index 000000000..8d4b4ca46
--- /dev/null
+++ b/drivers/infiniband/hw/amso1100/c2_wr.h
@@ -0,0 +1,1520 @@
+/*
+ * Copyright (c) 2005 Ammasso, Inc. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _C2_WR_H_
+#define _C2_WR_H_
+
+#ifdef CCDEBUG
+#define CCWR_MAGIC 0xb07700b0
+#endif
+
+#define C2_QP_NO_ATTR_CHANGE 0xFFFFFFFF
+
+/* Maximum allowed size in bytes of private_data exchange
+ * on connect.
+ */
+#define C2_MAX_PRIVATE_DATA_SIZE 200
+
+/*
+ * These types are shared among the adapter, host, and CCIL consumer.
+ */
+enum c2_cq_notification_type {
+ C2_CQ_NOTIFICATION_TYPE_NONE = 1,
+ C2_CQ_NOTIFICATION_TYPE_NEXT,
+ C2_CQ_NOTIFICATION_TYPE_NEXT_SE
+};
+
+enum c2_setconfig_cmd {
+ C2_CFG_ADD_ADDR = 1,
+ C2_CFG_DEL_ADDR = 2,
+ C2_CFG_ADD_ROUTE = 3,
+ C2_CFG_DEL_ROUTE = 4
+};
+
+enum c2_getconfig_cmd {
+ C2_GETCONFIG_ROUTES = 1,
+ C2_GETCONFIG_ADDRS
+};
+
+/*
+ * CCIL Work Request Identifiers
+ */
+enum c2wr_ids {
+ CCWR_RNIC_OPEN = 1,
+ CCWR_RNIC_QUERY,
+ CCWR_RNIC_SETCONFIG,
+ CCWR_RNIC_GETCONFIG,
+ CCWR_RNIC_CLOSE,
+ CCWR_CQ_CREATE,
+ CCWR_CQ_QUERY,
+ CCWR_CQ_MODIFY,
+ CCWR_CQ_DESTROY,
+ CCWR_QP_CONNECT,
+ CCWR_PD_ALLOC,
+ CCWR_PD_DEALLOC,
+ CCWR_SRQ_CREATE,
+ CCWR_SRQ_QUERY,
+ CCWR_SRQ_MODIFY,
+ CCWR_SRQ_DESTROY,
+ CCWR_QP_CREATE,
+ CCWR_QP_QUERY,
+ CCWR_QP_MODIFY,
+ CCWR_QP_DESTROY,
+ CCWR_NSMR_STAG_ALLOC,
+ CCWR_NSMR_REGISTER,
+ CCWR_NSMR_PBL,
+ CCWR_STAG_DEALLOC,
+ CCWR_NSMR_REREGISTER,
+ CCWR_SMR_REGISTER,
+ CCWR_MR_QUERY,
+ CCWR_MW_ALLOC,
+ CCWR_MW_QUERY,
+ CCWR_EP_CREATE,
+ CCWR_EP_GETOPT,
+ CCWR_EP_SETOPT,
+ CCWR_EP_DESTROY,
+ CCWR_EP_BIND,
+ CCWR_EP_CONNECT,
+ CCWR_EP_LISTEN,
+ CCWR_EP_SHUTDOWN,
+ CCWR_EP_LISTEN_CREATE,
+ CCWR_EP_LISTEN_DESTROY,
+ CCWR_EP_QUERY,
+ CCWR_CR_ACCEPT,
+ CCWR_CR_REJECT,
+ CCWR_CONSOLE,
+ CCWR_TERM,
+ CCWR_FLASH_INIT,
+ CCWR_FLASH,
+ CCWR_BUF_ALLOC,
+ CCWR_BUF_FREE,
+ CCWR_FLASH_WRITE,
+ CCWR_INIT, /* WARNING: Don't move this ever again! */
+
+
+
+ /* Add new IDs here */
+
+
+
+ /*
+ * WARNING: CCWR_LAST must always be the last verbs id defined!
+ * All the preceding IDs are fixed, and must not change.
+ * You can add new IDs, but must not remove or reorder
+ * any IDs. If you do, YOU will ruin any hope of
+ * compatibility between versions.
+ */
+ CCWR_LAST,
+
+ /*
+ * Start over at 1 so that arrays indexed by user wr id's
+ * begin at 1. This is OK since the verbs and user wr id's
+ * are always used on disjoint sets of queues.
+ */
+ /*
+ * The order of the CCWR_SEND_XX verbs must
+ * match the order of the RDMA_OPs
+ */
+ CCWR_SEND = 1,
+ CCWR_SEND_INV,
+ CCWR_SEND_SE,
+ CCWR_SEND_SE_INV,
+ CCWR_RDMA_WRITE,
+ CCWR_RDMA_READ,
+ CCWR_RDMA_READ_INV,
+ CCWR_MW_BIND,
+ CCWR_NSMR_FASTREG,
+ CCWR_STAG_INVALIDATE,
+ CCWR_RECV,
+ CCWR_NOP,
+ CCWR_UNIMPL,
+/* WARNING: This must always be the last user wr id defined! */
+};
+#define RDMA_SEND_OPCODE_FROM_WR_ID(x) (x+2)
+
+/*
+ * SQ/RQ Work Request Types
+ */
+enum c2_wr_type {
+ C2_WR_TYPE_SEND = CCWR_SEND,
+ C2_WR_TYPE_SEND_SE = CCWR_SEND_SE,
+ C2_WR_TYPE_SEND_INV = CCWR_SEND_INV,
+ C2_WR_TYPE_SEND_SE_INV = CCWR_SEND_SE_INV,
+ C2_WR_TYPE_RDMA_WRITE = CCWR_RDMA_WRITE,
+ C2_WR_TYPE_RDMA_READ = CCWR_RDMA_READ,
+ C2_WR_TYPE_RDMA_READ_INV_STAG = CCWR_RDMA_READ_INV,
+ C2_WR_TYPE_BIND_MW = CCWR_MW_BIND,
+ C2_WR_TYPE_FASTREG_NSMR = CCWR_NSMR_FASTREG,
+ C2_WR_TYPE_INV_STAG = CCWR_STAG_INVALIDATE,
+ C2_WR_TYPE_RECV = CCWR_RECV,
+ C2_WR_TYPE_NOP = CCWR_NOP,
+};
+
+struct c2_netaddr {
+ __be32 ip_addr;
+ __be32 netmask;
+ u32 mtu;
+};
+
+struct c2_route {
+ u32 ip_addr; /* 0 indicates the default route */
+ u32 netmask; /* netmask associated with dst */
+ u32 flags;
+ union {
+ u32 ipaddr; /* address of the nexthop interface */
+ u8 enaddr[6];
+ } nexthop;
+};
+
+/*
+ * A Scatter Gather Entry.
+ */
+struct c2_data_addr {
+ __be32 stag;
+ __be32 length;
+ __be64 to;
+};
+
+/*
+ * MR and MW flags used by the consumer, RI, and RNIC.
+ */
+enum c2_mm_flags {
+ MEM_REMOTE = 0x0001, /* allow mw binds with remote access. */
+ MEM_VA_BASED = 0x0002, /* Not Zero-based */
+ MEM_PBL_COMPLETE = 0x0004, /* PBL array is complete in this msg */
+ MEM_LOCAL_READ = 0x0008, /* allow local reads */
+ MEM_LOCAL_WRITE = 0x0010, /* allow local writes */
+ MEM_REMOTE_READ = 0x0020, /* allow remote reads */
+ MEM_REMOTE_WRITE = 0x0040, /* allow remote writes */
+ MEM_WINDOW_BIND = 0x0080, /* binds allowed */
+ MEM_SHARED = 0x0100, /* set if MR is shared */
+ MEM_STAG_VALID = 0x0200 /* set if STAG is in valid state */
+};
+
+/*
+ * CCIL API ACF flags defined in terms of the low level mem flags.
+ * This minimizes translation needed in the user API
+ */
+enum c2_acf {
+ C2_ACF_LOCAL_READ = MEM_LOCAL_READ,
+ C2_ACF_LOCAL_WRITE = MEM_LOCAL_WRITE,
+ C2_ACF_REMOTE_READ = MEM_REMOTE_READ,
+ C2_ACF_REMOTE_WRITE = MEM_REMOTE_WRITE,
+ C2_ACF_WINDOW_BIND = MEM_WINDOW_BIND
+};
+
+/*
+ * Image types of objects written to flash
+ */
+#define C2_FLASH_IMG_BITFILE 1
+#define C2_FLASH_IMG_OPTION_ROM 2
+#define C2_FLASH_IMG_VPD 3
+
+/*
+ * To fix bug 1815 we define the max allowable size of the
+ * terminate message (per the IETF spec; refer to the IETF
+ * protocol specification, section 12.1.6, page 64).
+ * The message is prefixed by 20 bytes of DDP info.
+ *
+ * Then the message has 6 bytes for the terminate control
+ * and DDP segment length info plus a DDP header (either
+ * 14 or 18 bytes) plus 28 bytes for the RDMA header.
+ * Thus the max size is:
+ * 20 + (6 + 18 + 28) = 72
+ */
+#define C2_MAX_TERMINATE_MESSAGE_SIZE (72)
+
+/*
+ * Build String Length. It must be the same as C2_BUILD_STR_LEN in ccil_api.h
+ */
+#define WR_BUILD_STR_LEN 64
+
+/*
+ * WARNING: All of these structs need to align any 64bit types on
+ * 64 bit boundaries! 64bit types include u64 and __be64.
+ */
+
+/*
+ * Clustercore Work Request Header. Be sensitive to field layout
+ * and alignment.
+ */
+struct c2wr_hdr {
+ /* wqe_count is part of the cqe. It is put here so the
+ * adapter can write to it while the wr is pending without
+ * clobbering part of the wr. This word need not be dma'd
+ * from the host to adapter by libccil, but we copy it anyway
+ * to make the memcpy to the adapter better aligned.
+ */
+ __be32 wqe_count;
+
+ /* Put these fields next so that later 32- and 64-bit
+ * quantities are naturally aligned.
+ */
+ u8 id;
+ u8 result; /* adapter -> host */
+ u8 sge_count; /* host -> adapter */
+ u8 flags; /* host -> adapter */
+
+ u64 context;
+#ifdef CCMSGMAGIC
+ u32 magic;
+ u32 pad;
+#endif
+} __attribute__((packed));
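+
+/*
+ * Note (illustrative, based on the usage in c2_rnic.c/c2_vq.c): the host
+ * verbs path stores a pointer in hdr.context before posting a request,
+ * e.g.
+ *
+ *	wr->hdr.context = (unsigned long) vq_req;
+ *
+ * and the adapter reflects the same value back in the reply header so
+ * that handle_vq() can locate the waiting struct c2_vq_req.
+ */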
+
+/*
+ *------------------------ RNIC ------------------------
+ */
+
+/*
+ * WR_RNIC_OPEN
+ */
+
+/*
+ * Flags for the RNIC WRs
+ */
+enum c2_rnic_flags {
+ RNIC_IRD_STATIC = 0x0001,
+ RNIC_ORD_STATIC = 0x0002,
+ RNIC_QP_STATIC = 0x0004,
+ RNIC_SRQ_SUPPORTED = 0x0008,
+ RNIC_PBL_BLOCK_MODE = 0x0010,
+ RNIC_SRQ_MODEL_ARRIVAL = 0x0020,
+ RNIC_CQ_OVF_DETECTED = 0x0040,
+ RNIC_PRIV_MODE = 0x0080
+};
+
+struct c2wr_rnic_open_req {
+ struct c2wr_hdr hdr;
+ u64 user_context;
+ __be16 flags; /* See enum c2_rnic_flags */
+ __be16 port_num;
+} __attribute__((packed));
+
+struct c2wr_rnic_open_rep {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+} __attribute__((packed));
+
+union c2wr_rnic_open {
+ struct c2wr_rnic_open_req req;
+ struct c2wr_rnic_open_rep rep;
+} __attribute__((packed));
+
+struct c2wr_rnic_query_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+} __attribute__((packed));
+
+/*
+ * WR_RNIC_QUERY
+ */
+struct c2wr_rnic_query_rep {
+ struct c2wr_hdr hdr;
+ u64 user_context;
+ __be32 vendor_id;
+ __be32 part_number;
+ __be32 hw_version;
+ __be32 fw_ver_major;
+ __be32 fw_ver_minor;
+ __be32 fw_ver_patch;
+ char fw_ver_build_str[WR_BUILD_STR_LEN];
+ __be32 max_qps;
+ __be32 max_qp_depth;
+ u32 max_srq_depth;
+ u32 max_send_sgl_depth;
+ u32 max_rdma_sgl_depth;
+ __be32 max_cqs;
+ __be32 max_cq_depth;
+ u32 max_cq_event_handlers;
+ __be32 max_mrs;
+ u32 max_pbl_depth;
+ __be32 max_pds;
+ __be32 max_global_ird;
+ u32 max_global_ord;
+ __be32 max_qp_ird;
+ __be32 max_qp_ord;
+ u32 flags;
+ __be32 max_mws;
+ u32 pbe_range_low;
+ u32 pbe_range_high;
+ u32 max_srqs;
+ u32 page_size;
+} __attribute__((packed));
+
+union c2wr_rnic_query {
+ struct c2wr_rnic_query_req req;
+ struct c2wr_rnic_query_rep rep;
+} __attribute__((packed));
+
+/*
+ * WR_RNIC_GETCONFIG
+ */
+
+struct c2wr_rnic_getconfig_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 option; /* see c2_getconfig_cmd_t */
+ u64 reply_buf;
+ u32 reply_buf_len;
+} __attribute__((packed)) ;
+
+struct c2wr_rnic_getconfig_rep {
+ struct c2wr_hdr hdr;
+ u32 option; /* see c2_getconfig_cmd_t */
+ u32 count_len; /* length of the number of addresses configured */
+} __attribute__((packed)) ;
+
+union c2wr_rnic_getconfig {
+ struct c2wr_rnic_getconfig_req req;
+ struct c2wr_rnic_getconfig_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ * WR_RNIC_SETCONFIG
+ */
+struct c2wr_rnic_setconfig_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ __be32 option; /* See c2_setconfig_cmd_t */
+ /* variable data and pad. See c2_netaddr and c2_route */
+ u8 data[0];
+} __attribute__((packed)) ;
+
+struct c2wr_rnic_setconfig_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_rnic_setconfig {
+ struct c2wr_rnic_setconfig_req req;
+ struct c2wr_rnic_setconfig_rep rep;
+} __attribute__((packed)) ;
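+
+/*
+ * Illustrative sketch: c2_add_addr()/c2_del_addr() in c2_rnic.c carry a
+ * struct c2_netaddr in the variable-length data[] of this request, e.g.
+ *
+ *	wr->option = cpu_to_be32(C2_CFG_ADD_ADDR);
+ *	netaddr.ip_addr = inaddr;
+ *	netaddr.netmask = inmask;
+ *	netaddr.mtu = 0;
+ *	memcpy(wr->data, &netaddr, sizeof(netaddr));
+ */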
+
+/*
+ * WR_RNIC_CLOSE
+ */
+struct c2wr_rnic_close_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_rnic_close_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_rnic_close {
+ struct c2wr_rnic_close_req req;
+ struct c2wr_rnic_close_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ *------------------------ CQ ------------------------
+ */
+struct c2wr_cq_create_req {
+ struct c2wr_hdr hdr;
+ __be64 shared_ht;
+ u64 user_context;
+ __be64 msg_pool;
+ u32 rnic_handle;
+ __be32 msg_size;
+ __be32 depth;
+} __attribute__((packed)) ;
+
+struct c2wr_cq_create_rep {
+ struct c2wr_hdr hdr;
+ __be32 mq_index;
+ __be32 adapter_shared;
+ u32 cq_handle;
+} __attribute__((packed)) ;
+
+union c2wr_cq_create {
+ struct c2wr_cq_create_req req;
+ struct c2wr_cq_create_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_cq_modify_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 cq_handle;
+ u32 new_depth;
+ u64 new_msg_pool;
+} __attribute__((packed)) ;
+
+struct c2wr_cq_modify_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_cq_modify {
+ struct c2wr_cq_modify_req req;
+ struct c2wr_cq_modify_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_cq_destroy_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 cq_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_cq_destroy_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_cq_destroy {
+ struct c2wr_cq_destroy_req req;
+ struct c2wr_cq_destroy_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ *------------------------ PD ------------------------
+ */
+struct c2wr_pd_alloc_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_pd_alloc_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_pd_alloc {
+ struct c2wr_pd_alloc_req req;
+ struct c2wr_pd_alloc_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_pd_dealloc_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_pd_dealloc_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_pd_dealloc {
+ struct c2wr_pd_dealloc_req req;
+ struct c2wr_pd_dealloc_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ *------------------------ SRQ ------------------------
+ */
+struct c2wr_srq_create_req {
+ struct c2wr_hdr hdr;
+ u64 shared_ht;
+ u64 user_context;
+ u32 rnic_handle;
+ u32 srq_depth;
+ u32 srq_limit;
+ u32 sgl_depth;
+ u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_srq_create_rep {
+ struct c2wr_hdr hdr;
+ u32 srq_depth;
+ u32 sgl_depth;
+ u32 msg_size;
+ u32 mq_index;
+ u32 mq_start;
+ u32 srq_handle;
+} __attribute__((packed)) ;
+
+union c2wr_srq_create {
+ struct c2wr_srq_create_req req;
+ struct c2wr_srq_create_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_srq_destroy_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 srq_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_srq_destroy_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_srq_destroy {
+ struct c2wr_srq_destroy_req req;
+ struct c2wr_srq_destroy_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ *------------------------ QP ------------------------
+ */
+enum c2wr_qp_flags {
+ QP_RDMA_READ = 0x00000001, /* RDMA read enabled? */
+ QP_RDMA_WRITE = 0x00000002, /* RDMA write enabled? */
+ QP_MW_BIND = 0x00000004, /* MWs enabled */
+ QP_ZERO_STAG = 0x00000008, /* enabled? */
+ QP_REMOTE_TERMINATION = 0x00000010, /* remote end terminated */
+ QP_RDMA_READ_RESPONSE = 0x00000020 /* Remote RDMA read */
+ /* enabled? */
+};
+
+struct c2wr_qp_create_req {
+ struct c2wr_hdr hdr;
+ __be64 shared_sq_ht;
+ __be64 shared_rq_ht;
+ u64 user_context;
+ u32 rnic_handle;
+ u32 sq_cq_handle;
+ u32 rq_cq_handle;
+ __be32 sq_depth;
+ __be32 rq_depth;
+ u32 srq_handle;
+ u32 srq_limit;
+ __be32 flags; /* see enum c2wr_qp_flags */
+ __be32 send_sgl_depth;
+ __be32 recv_sgl_depth;
+ __be32 rdma_write_sgl_depth;
+ __be32 ord;
+ __be32 ird;
+ u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_create_rep {
+ struct c2wr_hdr hdr;
+ __be32 sq_depth;
+ __be32 rq_depth;
+ u32 send_sgl_depth;
+ u32 recv_sgl_depth;
+ u32 rdma_write_sgl_depth;
+ u32 ord;
+ u32 ird;
+ __be32 sq_msg_size;
+ __be32 sq_mq_index;
+ __be32 sq_mq_start;
+ __be32 rq_msg_size;
+ __be32 rq_mq_index;
+ __be32 rq_mq_start;
+ u32 qp_handle;
+} __attribute__((packed)) ;
+
+union c2wr_qp_create {
+ struct c2wr_qp_create_req req;
+ struct c2wr_qp_create_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_query_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 qp_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_query_rep {
+ struct c2wr_hdr hdr;
+ u64 user_context;
+ u32 rnic_handle;
+ u32 sq_depth;
+ u32 rq_depth;
+ u32 send_sgl_depth;
+ u32 rdma_write_sgl_depth;
+ u32 recv_sgl_depth;
+ u32 ord;
+ u32 ird;
+ u16 qp_state;
+ u16 flags; /* see c2wr_qp_flags_t */
+ u32 qp_id;
+ u32 local_addr;
+ u32 remote_addr;
+ u16 local_port;
+ u16 remote_port;
+ u32 terminate_msg_length; /* 0 if not present */
+ u8 data[0];
+ /* Terminate Message in-line here. */
+} __attribute__((packed)) ;
+
+union c2wr_qp_query {
+ struct c2wr_qp_query_req req;
+ struct c2wr_qp_query_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_modify_req {
+ struct c2wr_hdr hdr;
+ u64 stream_msg;
+ u32 stream_msg_length;
+ u32 rnic_handle;
+ u32 qp_handle;
+ __be32 next_qp_state;
+ __be32 ord;
+ __be32 ird;
+ __be32 sq_depth;
+ __be32 rq_depth;
+ u32 llp_ep_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_modify_rep {
+ struct c2wr_hdr hdr;
+ u32 ord;
+ u32 ird;
+ u32 sq_depth;
+ u32 rq_depth;
+ u32 sq_msg_size;
+ u32 sq_mq_index;
+ u32 sq_mq_start;
+ u32 rq_msg_size;
+ u32 rq_mq_index;
+ u32 rq_mq_start;
+} __attribute__((packed)) ;
+
+union c2wr_qp_modify {
+ struct c2wr_qp_modify_req req;
+ struct c2wr_qp_modify_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_destroy_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 qp_handle;
+} __attribute__((packed)) ;
+
+struct c2wr_qp_destroy_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_qp_destroy {
+ struct c2wr_qp_destroy_req req;
+ struct c2wr_qp_destroy_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ * The CCWR_QP_CONNECT msg is posted on the verbs request queue. It can
+ * only be posted when a QP is in IDLE state. After the connect request is
+ * submitted to the LLP, the adapter moves the QP to CONNECT_PENDING state.
+ * No synchronous reply from adapter to this WR. The results of
+ * connection are passed back in an async event CCAE_ACTIVE_CONNECT_RESULTS
+ * See c2wr_ae_active_connect_results_t
+ */
+struct c2wr_qp_connect_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 qp_handle;
+ __be32 remote_addr;
+ __be16 remote_port;
+ u16 pad;
+ __be32 private_data_length;
+ u8 private_data[0]; /* Private data in-line. */
+} __attribute__((packed)) ;
+
+struct c2wr_qp_connect {
+ struct c2wr_qp_connect_req req;
+ /* no synchronous reply. */
+} __attribute__((packed)) ;
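+
+/*
+ * Illustrative sketch of an assumed caller pattern (not code from this
+ * file): the active side fills in the request and posts it on the verbs
+ * request queue; there is no synchronous reply, and the outcome arrives
+ * later as a CCAE_ACTIVE_CONNECT_RESULTS AE:
+ *
+ *	wr->remote_addr = raddr;	/* __be32, network byte order */
+ *	wr->remote_port = rport;	/* __be16, network byte order */
+ *	wr->private_data_length = cpu_to_be32(plen);
+ *	memcpy(&wr->private_data[0], pdata, plen);
+ */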
+
+
+/*
+ *------------------------ MM ------------------------
+ */
+
+struct c2wr_nsmr_stag_alloc_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 pbl_depth;
+ u32 pd_id;
+ u32 flags;
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_stag_alloc_rep {
+ struct c2wr_hdr hdr;
+ u32 pbl_depth;
+ u32 stag_index;
+} __attribute__((packed)) ;
+
+union c2wr_nsmr_stag_alloc {
+ struct c2wr_nsmr_stag_alloc_req req;
+ struct c2wr_nsmr_stag_alloc_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_register_req {
+ struct c2wr_hdr hdr;
+ __be64 va;
+ u32 rnic_handle;
+ __be16 flags;
+ u8 stag_key;
+ u8 pad;
+ u32 pd_id;
+ __be32 pbl_depth;
+ __be32 pbe_size;
+ __be32 fbo;
+ __be32 length;
+ __be32 addrs_length;
+ /* array of paddrs (must be aligned on a 64bit boundary) */
+ __be64 paddrs[0];
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_register_rep {
+ struct c2wr_hdr hdr;
+ u32 pbl_depth;
+ __be32 stag_index;
+} __attribute__((packed)) ;
+
+union c2wr_nsmr_register {
+ struct c2wr_nsmr_register_req req;
+ struct c2wr_nsmr_register_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_pbl_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ __be32 flags;
+ __be32 stag_index;
+ __be32 addrs_length;
+ /* array of paddrs (must be aligned on a 64bit boundary) */
+ __be64 paddrs[0];
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_pbl_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_nsmr_pbl {
+ struct c2wr_nsmr_pbl_req req;
+ struct c2wr_nsmr_pbl_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_mr_query_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 stag_index;
+} __attribute__((packed)) ;
+
+struct c2wr_mr_query_rep {
+ struct c2wr_hdr hdr;
+ u8 stag_key;
+ u8 pad[3];
+ u32 pd_id;
+ u32 flags;
+ u32 pbl_depth;
+} __attribute__((packed)) ;
+
+union c2wr_mr_query {
+ struct c2wr_mr_query_req req;
+ struct c2wr_mr_query_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_mw_query_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 stag_index;
+} __attribute__((packed)) ;
+
+struct c2wr_mw_query_rep {
+ struct c2wr_hdr hdr;
+ u8 stag_key;
+ u8 pad[3];
+ u32 pd_id;
+ u32 flags;
+} __attribute__((packed)) ;
+
+union c2wr_mw_query {
+ struct c2wr_mw_query_req req;
+ struct c2wr_mw_query_rep rep;
+} __attribute__((packed)) ;
+
+
+struct c2wr_stag_dealloc_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ __be32 stag_index;
+} __attribute__((packed)) ;
+
+struct c2wr_stag_dealloc_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed)) ;
+
+union c2wr_stag_dealloc {
+ struct c2wr_stag_dealloc_req req;
+ struct c2wr_stag_dealloc_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_reregister_req {
+ struct c2wr_hdr hdr;
+ u64 va;
+ u32 rnic_handle;
+ u16 flags;
+ u8 stag_key;
+ u8 pad;
+ u32 stag_index;
+ u32 pd_id;
+ u32 pbl_depth;
+ u32 pbe_size;
+ u32 fbo;
+ u32 length;
+ u32 addrs_length;
+ u32 pad1;
+ /* array of paddrs (must be aligned on a 64bit boundary) */
+ u64 paddrs[0];
+} __attribute__((packed)) ;
+
+struct c2wr_nsmr_reregister_rep {
+ struct c2wr_hdr hdr;
+ u32 pbl_depth;
+ u32 stag_index;
+} __attribute__((packed)) ;
+
+union c2wr_nsmr_reregister {
+ struct c2wr_nsmr_reregister_req req;
+ struct c2wr_nsmr_reregister_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_smr_register_req {
+ struct c2wr_hdr hdr;
+ u64 va;
+ u32 rnic_handle;
+ u16 flags;
+ u8 stag_key;
+ u8 pad;
+ u32 stag_index;
+ u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_smr_register_rep {
+ struct c2wr_hdr hdr;
+ u32 stag_index;
+} __attribute__((packed)) ;
+
+union c2wr_smr_register {
+ struct c2wr_smr_register_req req;
+ struct c2wr_smr_register_rep rep;
+} __attribute__((packed)) ;
+
+struct c2wr_mw_alloc_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 pd_id;
+} __attribute__((packed)) ;
+
+struct c2wr_mw_alloc_rep {
+ struct c2wr_hdr hdr;
+ u32 stag_index;
+} __attribute__((packed)) ;
+
+union c2wr_mw_alloc {
+ struct c2wr_mw_alloc_req req;
+ struct c2wr_mw_alloc_rep rep;
+} __attribute__((packed)) ;
+
+/*
+ *------------------------ WRs -----------------------
+ */
+
+struct c2wr_user_hdr {
+ struct c2wr_hdr hdr; /* Has status and WR Type */
+} __attribute__((packed)) ;
+
+enum c2_qp_state {
+ C2_QP_STATE_IDLE = 0x01,
+ C2_QP_STATE_CONNECTING = 0x02,
+ C2_QP_STATE_RTS = 0x04,
+ C2_QP_STATE_CLOSING = 0x08,
+ C2_QP_STATE_TERMINATE = 0x10,
+ C2_QP_STATE_ERROR = 0x20,
+};
+
+/* Completion queue entry. */
+struct c2wr_ce {
+ struct c2wr_hdr hdr; /* Has status and WR Type */
+ u64 qp_user_context; /* c2_user_qp_t * */
+ u32 qp_state; /* Current QP State */
+ u32 handle; /* QPID or EP Handle */
+ __be32 bytes_rcvd; /* valid for RECV WCs */
+ u32 stag;
+} __attribute__((packed)) ;
+
+
+/*
+ * Flags used for all post-sq WRs. These must fit in the flags
+ * field of the struct c2wr_hdr (eight bits).
+ */
+enum {
+ SQ_SIGNALED = 0x01,
+ SQ_READ_FENCE = 0x02,
+ SQ_FENCE = 0x04,
+};
+
+/*
+ * Common fields for all post-sq WRs. Namely the standard header and a
+ * secondary header with fields common to all post-sq WRs.
+ */
+struct c2_sq_hdr {
+ struct c2wr_user_hdr user_hdr;
+} __attribute__((packed));
+
+/*
+ * Same as above but for post-rq WRs.
+ */
+struct c2_rq_hdr {
+ struct c2wr_user_hdr user_hdr;
+} __attribute__((packed));
+
+/*
+ * use the same struct for all sends.
+ */
+struct c2wr_send_req {
+ struct c2_sq_hdr sq_hdr;
+ __be32 sge_len;
+ __be32 remote_stag;
+ u8 data[0]; /* SGE array */
+} __attribute__((packed));
+
+union c2wr_send {
+ struct c2wr_send_req req;
+ struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_rdma_write_req {
+ struct c2_sq_hdr sq_hdr;
+ __be64 remote_to;
+ __be32 remote_stag;
+ __be32 sge_len;
+ u8 data[0]; /* SGE array */
+} __attribute__((packed));
+
+union c2wr_rdma_write {
+ struct c2wr_rdma_write_req req;
+ struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_rdma_read_req {
+ struct c2_sq_hdr sq_hdr;
+ __be64 local_to;
+ __be64 remote_to;
+ __be32 local_stag;
+ __be32 remote_stag;
+ __be32 length;
+} __attribute__((packed));
+
+union c2wr_rdma_read {
+ struct c2wr_rdma_read_req req;
+ struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_mw_bind_req {
+ struct c2_sq_hdr sq_hdr;
+ u64 va;
+ u8 stag_key;
+ u8 pad[3];
+ u32 mw_stag_index;
+ u32 mr_stag_index;
+ u32 length;
+ u32 flags;
+} __attribute__((packed));
+
+union c2wr_mw_bind {
+ struct c2wr_mw_bind_req req;
+ struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_nsmr_fastreg_req {
+ struct c2_sq_hdr sq_hdr;
+ u64 va;
+ u8 stag_key;
+ u8 pad[3];
+ u32 stag_index;
+ u32 pbe_size;
+ u32 fbo;
+ u32 length;
+ u32 addrs_length;
+ /* array of paddrs (must be aligned on a 64bit boundary) */
+ u64 paddrs[0];
+} __attribute__((packed));
+
+union c2wr_nsmr_fastreg {
+ struct c2wr_nsmr_fastreg_req req;
+ struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_stag_invalidate_req {
+ struct c2_sq_hdr sq_hdr;
+ u8 stag_key;
+ u8 pad[3];
+ u32 stag_index;
+} __attribute__((packed));
+
+union c2wr_stag_invalidate {
+ struct c2wr_stag_invalidate_req req;
+ struct c2wr_ce rep;
+} __attribute__((packed));
+
+union c2wr_sqwr {
+ struct c2_sq_hdr sq_hdr;
+ struct c2wr_send_req send;
+ struct c2wr_send_req send_se;
+ struct c2wr_send_req send_inv;
+ struct c2wr_send_req send_se_inv;
+ struct c2wr_rdma_write_req rdma_write;
+ struct c2wr_rdma_read_req rdma_read;
+ struct c2wr_mw_bind_req mw_bind;
+ struct c2wr_nsmr_fastreg_req nsmr_fastreg;
+ struct c2wr_stag_invalidate_req stag_inv;
+} __attribute__((packed));
+
+
+/*
+ * RQ WRs
+ */
+struct c2wr_rqwr {
+ struct c2_rq_hdr rq_hdr;
+ u8 data[0]; /* array of SGEs */
+} __attribute__((packed));
+
+union c2wr_recv {
+ struct c2wr_rqwr req;
+ struct c2wr_ce rep;
+} __attribute__((packed));
+
+/*
+ * All AEs start with this header. Most AEs only need to convey the
+ * information in the header. Some, like LLP connection events, need
+ * more info. The union c2wr_ae below has all the possible AEs.
+ *
+ * hdr.context is the user_context from the rnic_open WR. NULL if this
+ * is not affiliated with an rnic.
+ *
+ * hdr.id is the AE identifier (e.g. CCAE_REMOTE_SHUTDOWN,
+ * CCAE_LLP_CLOSE_COMPLETE)
+ *
+ * resource_type is one of: C2_RES_IND_QP, C2_RES_IND_CQ, C2_RES_IND_SRQ
+ *
+ * user_context is the context passed down when the host created the resource.
+ */
+struct c2wr_ae_hdr {
+ struct c2wr_hdr hdr;
+ u64 user_context; /* user context for this res. */
+ __be32 resource_type; /* see enum c2_resource_indicator */
+ __be32 resource; /* handle for resource */
+ __be32 qp_state; /* current QP State */
+} __attribute__((packed));
+
+/*
+ * After submitting the CCAE_ACTIVE_CONNECT_RESULTS message on the AEQ,
+ * the adapter moves the QP into RTS state
+ */
+struct c2wr_ae_active_connect_results {
+ struct c2wr_ae_hdr ae_hdr;
+ __be32 laddr;
+ __be32 raddr;
+ __be16 lport;
+ __be16 rport;
+ __be32 private_data_length;
+ u8 private_data[0]; /* data is in-line in the msg. */
+} __attribute__((packed));
+
+/*
+ * When connections are established by the stack (and the private data
+ * MPA frame is received), the adapter will generate an event to the host.
+ * The details of the connection, any private data, and the new connection
+ * request handle is passed up via the CCAE_CONNECTION_REQUEST msg on the
+ * AE queue:
+ */
+struct c2wr_ae_connection_request {
+ struct c2wr_ae_hdr ae_hdr;
+ u32 cr_handle; /* connreq handle (sock ptr) */
+ __be32 laddr;
+ __be32 raddr;
+ __be16 lport;
+ __be16 rport;
+ __be32 private_data_length;
+ u8 private_data[0]; /* data is in-line in the msg. */
+} __attribute__((packed));
+
+union c2wr_ae {
+ struct c2wr_ae_hdr ae_generic;
+ struct c2wr_ae_active_connect_results ae_active_connect_results;
+ struct c2wr_ae_connection_request ae_connection_request;
+} __attribute__((packed));
+
+struct c2wr_init_req {
+ struct c2wr_hdr hdr;
+ __be64 hint_count;
+ __be64 q0_host_shared;
+ __be64 q1_host_shared;
+ __be64 q1_host_msg_pool;
+ __be64 q2_host_shared;
+ __be64 q2_host_msg_pool;
+} __attribute__((packed));
+
+struct c2wr_init_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed));
+
+union c2wr_init {
+ struct c2wr_init_req req;
+ struct c2wr_init_rep rep;
+} __attribute__((packed));
+
+/*
+ * For upgrading flash.
+ */
+
+struct c2wr_flash_init_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+} __attribute__((packed));
+
+struct c2wr_flash_init_rep {
+ struct c2wr_hdr hdr;
+ u32 adapter_flash_buf_offset;
+ u32 adapter_flash_len;
+} __attribute__((packed));
+
+union c2wr_flash_init {
+ struct c2wr_flash_init_req req;
+ struct c2wr_flash_init_rep rep;
+} __attribute__((packed));
+
+struct c2wr_flash_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 len;
+} __attribute__((packed));
+
+struct c2wr_flash_rep {
+ struct c2wr_hdr hdr;
+ u32 status;
+} __attribute__((packed));
+
+union c2wr_flash {
+ struct c2wr_flash_req req;
+ struct c2wr_flash_rep rep;
+} __attribute__((packed));
+
+struct c2wr_buf_alloc_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 size;
+} __attribute__((packed));
+
+struct c2wr_buf_alloc_rep {
+ struct c2wr_hdr hdr;
+ u32 offset; /* 0 if mem not available */
+ u32 size; /* 0 if mem not available */
+} __attribute__((packed));
+
+union c2wr_buf_alloc {
+ struct c2wr_buf_alloc_req req;
+ struct c2wr_buf_alloc_rep rep;
+} __attribute__((packed));
+
+struct c2wr_buf_free_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 offset; /* Must match value from alloc */
+ u32 size; /* Must match value from alloc */
+} __attribute__((packed));
+
+struct c2wr_buf_free_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed));
+
+union c2wr_buf_free {
+ struct c2wr_buf_free_req req;
+ struct c2wr_ce rep;
+} __attribute__((packed));
+
+struct c2wr_flash_write_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 offset;
+ u32 size;
+ u32 type;
+ u32 flags;
+} __attribute__((packed));
+
+struct c2wr_flash_write_rep {
+ struct c2wr_hdr hdr;
+ u32 status;
+} __attribute__((packed));
+
+union c2wr_flash_write {
+ struct c2wr_flash_write_req req;
+ struct c2wr_flash_write_rep rep;
+} __attribute__((packed));
+
+/*
+ * Messages for LLP connection setup.
+ */
+
+/*
+ * Listen Request. This allocates a listening endpoint to allow passive
+ * connection setup. Newly established LLP connections are passed up
+ * via an AE. See c2wr_ae_connection_request_t
+ */
+struct c2wr_ep_listen_create_req {
+ struct c2wr_hdr hdr;
+ u64 user_context; /* returned in AEs. */
+ u32 rnic_handle;
+ __be32 local_addr; /* local addr, or 0 */
+ __be16 local_port; /* 0 means "pick one" */
+ u16 pad;
+ __be32 backlog; /* traditional tcp listen backlog */
+} __attribute__((packed));
+
+struct c2wr_ep_listen_create_rep {
+ struct c2wr_hdr hdr;
+ u32 ep_handle; /* handle to new listening ep */
+ u16 local_port; /* resulting port... */
+ u16 pad;
+} __attribute__((packed));
+
+union c2wr_ep_listen_create {
+ struct c2wr_ep_listen_create_req req;
+ struct c2wr_ep_listen_create_rep rep;
+} __attribute__((packed));
+
+struct c2wr_ep_listen_destroy_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 ep_handle;
+} __attribute__((packed));
+
+struct c2wr_ep_listen_destroy_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed));
+
+union c2wr_ep_listen_destroy {
+ struct c2wr_ep_listen_destroy_req req;
+ struct c2wr_ep_listen_destroy_rep rep;
+} __attribute__((packed));
+
+struct c2wr_ep_query_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 ep_handle;
+} __attribute__((packed));
+
+struct c2wr_ep_query_rep {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 local_addr;
+ u32 remote_addr;
+ u16 local_port;
+ u16 remote_port;
+} __attribute__((packed));
+
+union c2wr_ep_query {
+ struct c2wr_ep_query_req req;
+ struct c2wr_ep_query_rep rep;
+} __attribute__((packed));
+
+
+/*
+ * The host passes this down to indicate acceptance of a pending iWARP
+ * connection. The cr_handle was obtained from the CONNECTION_REQUEST
+ * AE passed up by the adapter. See c2wr_ae_connection_request_t.
+ */
+struct c2wr_cr_accept_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 qp_handle; /* QP to bind to this LLP conn */
+ u32 ep_handle; /* LLP handle to accept */
+ __be32 private_data_length;
+ u8 private_data[0]; /* data in-line in msg. */
+} __attribute__((packed));
+
+/*
+ * adapter sends reply when private data is successfully submitted to
+ * the LLP.
+ */
+struct c2wr_cr_accept_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed));
+
+union c2wr_cr_accept {
+ struct c2wr_cr_accept_req req;
+ struct c2wr_cr_accept_rep rep;
+} __attribute__((packed));
+
+/*
+ * The host sends this down if a given iWARP connection request was
+ * rejected by the consumer. The cr_handle was obtained from a
+ * previous c2wr_ae_connection_request_t AE sent by the adapter.
+ */
+struct c2wr_cr_reject_req {
+ struct c2wr_hdr hdr;
+ u32 rnic_handle;
+ u32 ep_handle; /* LLP handle to reject */
+} __attribute__((packed));
+
+/*
+ * Dunno if this is needed, but we'll add it for now. The adapter will
+ * send the reject_reply after the LLP endpoint has been destroyed.
+ */
+struct c2wr_cr_reject_rep {
+ struct c2wr_hdr hdr;
+} __attribute__((packed));
+
+union c2wr_cr_reject {
+ struct c2wr_cr_reject_req req;
+ struct c2wr_cr_reject_rep rep;
+} __attribute__((packed));
+
+/*
+ * console command. Used to implement a debug console over the verbs
+ * request and reply queues.
+ */
+
+/*
+ * Console request message. It contains:
+ * - message hdr with id = CCWR_CONSOLE
+ * - the physaddr/len of host memory to be used for the reply.
+ * - the command string. eg: "netstat -s" or "zoneinfo"
+ */
+struct c2wr_console_req {
+ struct c2wr_hdr hdr; /* id = CCWR_CONSOLE */
+ u64 reply_buf; /* pinned host buf for reply */
+ u32 reply_buf_len; /* length of reply buffer */
+ u8 command[0]; /* NUL terminated ascii string */
+ /* containing the command req */
+} __attribute__((packed));
+
+/*
+ * flags used in the console reply.
+ */
+enum c2_console_flags {
+ CONS_REPLY_TRUNCATED = 0x00000001 /* reply was truncated */
+} __attribute__((packed));
+
+/*
+ * Console reply message.
+ * hdr.result contains the c2_status_t error if the reply was _not_ generated,
+ * or C2_OK if the reply was generated.
+ */
+struct c2wr_console_rep {
+ struct c2wr_hdr hdr; /* id = CCWR_CONSOLE */
+ u32 flags;
+} __attribute__((packed));
+
+union c2wr_console {
+ struct c2wr_console_req req;
+ struct c2wr_console_rep rep;
+} __attribute__((packed));
+
+
+/*
+ * Giant union with all WRs. Makes life easier...
+ */
+union c2wr {
+ struct c2wr_hdr hdr;
+ struct c2wr_user_hdr user_hdr;
+ union c2wr_rnic_open rnic_open;
+ union c2wr_rnic_query rnic_query;
+ union c2wr_rnic_getconfig rnic_getconfig;
+ union c2wr_rnic_setconfig rnic_setconfig;
+ union c2wr_rnic_close rnic_close;
+ union c2wr_cq_create cq_create;
+ union c2wr_cq_modify cq_modify;
+ union c2wr_cq_destroy cq_destroy;
+ union c2wr_pd_alloc pd_alloc;
+ union c2wr_pd_dealloc pd_dealloc;
+ union c2wr_srq_create srq_create;
+ union c2wr_srq_destroy srq_destroy;
+ union c2wr_qp_create qp_create;
+ union c2wr_qp_query qp_query;
+ union c2wr_qp_modify qp_modify;
+ union c2wr_qp_destroy qp_destroy;
+ struct c2wr_qp_connect qp_connect;
+ union c2wr_nsmr_stag_alloc nsmr_stag_alloc;
+ union c2wr_nsmr_register nsmr_register;
+ union c2wr_nsmr_pbl nsmr_pbl;
+ union c2wr_mr_query mr_query;
+ union c2wr_mw_query mw_query;
+ union c2wr_stag_dealloc stag_dealloc;
+ union c2wr_sqwr sqwr;
+ struct c2wr_rqwr rqwr;
+ struct c2wr_ce ce;
+ union c2wr_ae ae;
+ union c2wr_init init;
+ union c2wr_ep_listen_create ep_listen_create;
+ union c2wr_ep_listen_destroy ep_listen_destroy;
+ union c2wr_cr_accept cr_accept;
+ union c2wr_cr_reject cr_reject;
+ union c2wr_console console;
+ union c2wr_flash_init flash_init;
+ union c2wr_flash flash;
+ union c2wr_buf_alloc buf_alloc;
+ union c2wr_buf_free buf_free;
+ union c2wr_flash_write flash_write;
+} __attribute__((packed));
+
+
+/*
+ * Accessors for the wr fields that are packed together tightly to
+ * reduce the wr message size. The wr arguments are void* so that
+ * either a union c2wr *, a struct c2wr_hdr *, or a pointer to any of the
+ * types in the c2wr union can be passed in.
+ */
+static __inline__ u8 c2_wr_get_id(void *wr)
+{
+ return ((struct c2wr_hdr *) wr)->id;
+}
+static __inline__ void c2_wr_set_id(void *wr, u8 id)
+{
+ ((struct c2wr_hdr *) wr)->id = id;
+}
+static __inline__ u8 c2_wr_get_result(void *wr)
+{
+ return ((struct c2wr_hdr *) wr)->result;
+}
+static __inline__ void c2_wr_set_result(void *wr, u8 result)
+{
+ ((struct c2wr_hdr *) wr)->result = result;
+}
+static __inline__ u8 c2_wr_get_flags(void *wr)
+{
+ return ((struct c2wr_hdr *) wr)->flags;
+}
+static __inline__ void c2_wr_set_flags(void *wr, u8 flags)
+{
+ ((struct c2wr_hdr *) wr)->flags = flags;
+}
+static __inline__ u8 c2_wr_get_sge_count(void *wr)
+{
+ return ((struct c2wr_hdr *) wr)->sge_count;
+}
+static __inline__ void c2_wr_set_sge_count(void *wr, u8 sge_count)
+{
+ ((struct c2wr_hdr *) wr)->sge_count = sge_count;
+}
+static __inline__ __be32 c2_wr_get_wqe_count(void *wr)
+{
+ return ((struct c2wr_hdr *) wr)->wqe_count;
+}
+static __inline__ void c2_wr_set_wqe_count(void *wr, u32 wqe_count)
+{
+ ((struct c2wr_hdr *) wr)->wqe_count = wqe_count;
+}
+
+#endif /* _C2_WR_H_ */
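Editorial aside: the inline accessors above exist so callers never poke at the packed header layout directly. Below is a minimal illustrative sketch (not part of the driver) of stamping a console request header with them; the c2wr_console_req fields and the CCWR_CONSOLE id come from the definitions and comments in this header, while the caller-supplied DMA buffer is assumed.

/* Illustrative sketch only -- not part of the driver. */
static void example_fill_console_req(struct c2wr_console_req *req,
				     u64 reply_dma, u32 reply_len)
{
	c2_wr_set_id(req, CCWR_CONSOLE);	/* header id per the comment above */
	c2_wr_set_flags(req, 0);		/* no header flags */
	c2_wr_set_sge_count(req, 0);		/* command string is in-line */
	req->reply_buf = reply_dma;		/* pinned host buffer for the reply */
	req->reply_buf_len = reply_len;
}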
diff --git a/drivers/infiniband/hw/cxgb3/Kconfig b/drivers/infiniband/hw/cxgb3/Kconfig
new file mode 100644
index 000000000..2b6352b85
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/Kconfig
@@ -0,0 +1,27 @@
+config INFINIBAND_CXGB3
+ tristate "Chelsio RDMA Driver"
+ depends on CHELSIO_T3 && INET
+ select GENERIC_ALLOCATOR
+ ---help---
+ This is an iWARP/RDMA driver for the Chelsio T3 1GbE and
+ 10GbE adapters.
+
+ For general information about Chelsio and our products, visit
+ our website at <http://www.chelsio.com>.
+
+ For customer support, please visit our customer support page at
+ <http://www.chelsio.com/support.html>.
+
+ Please send feedback to <linux-bugs@chelsio.com>.
+
+ To compile this driver as a module, choose M here: the module
+ will be called iw_cxgb3.
+
+config INFINIBAND_CXGB3_DEBUG
+ bool "Verbose debugging output"
+ depends on INFINIBAND_CXGB3
+ default n
+ ---help---
+ This option causes the Chelsio RDMA driver to produce copious
+ amounts of debug messages. Select this if you are developing
+ the driver or trying to diagnose a problem.
diff --git a/drivers/infiniband/hw/cxgb3/Makefile b/drivers/infiniband/hw/cxgb3/Makefile
new file mode 100644
index 000000000..276136418
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/Makefile
@@ -0,0 +1,8 @@
+ccflags-y := -Idrivers/net/ethernet/chelsio/cxgb3
+
+obj-$(CONFIG_INFINIBAND_CXGB3) += iw_cxgb3.o
+
+iw_cxgb3-y := iwch_cm.o iwch_ev.o iwch_cq.o iwch_qp.o iwch_mem.o \
+ iwch_provider.o iwch.o cxio_hal.o cxio_resource.o
+
+ccflags-$(CONFIG_INFINIBAND_CXGB3_DEBUG) += -DDEBUG
diff --git a/drivers/infiniband/hw/cxgb3/cxio_dbg.c b/drivers/infiniband/hw/cxgb3/cxio_dbg.c
new file mode 100644
index 000000000..8bca6b4ec
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/cxio_dbg.c
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifdef DEBUG
+#include <linux/types.h>
+#include <linux/slab.h>
+#include "common.h"
+#include "cxgb3_ioctl.h"
+#include "cxio_hal.h"
+#include "cxio_wr.h"
+
+void cxio_dump_tpt(struct cxio_rdev *rdev, u32 stag)
+{
+ struct ch_mem_range *m;
+ u64 *data;
+ int rc;
+ int size = 32;
+
+ m = kmalloc(sizeof(*m) + size, GFP_ATOMIC);
+ if (!m) {
+ PDBG("%s couldn't allocate memory.\n", __func__);
+ return;
+ }
+ m->mem_id = MEM_PMRX;
+ m->addr = (stag>>8) * 32 + rdev->rnic_info.tpt_base;
+ m->len = size;
+ PDBG("%s TPT addr 0x%x len %d\n", __func__, m->addr, m->len);
+ rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m);
+ if (rc) {
+ PDBG("%s toectl returned error %d\n", __func__, rc);
+ kfree(m);
+ return;
+ }
+
+ data = (u64 *)m->buf;
+ while (size > 0) {
+ PDBG("TPT %08x: %016llx\n", m->addr, (unsigned long long) *data);
+ size -= 8;
+ data++;
+ m->addr += 8;
+ }
+ kfree(m);
+}
+
+void cxio_dump_pbl(struct cxio_rdev *rdev, u32 pbl_addr, uint len, u8 shift)
+{
+ struct ch_mem_range *m;
+ u64 *data;
+ int rc;
+ int size, npages;
+
+ shift += 12;
+ npages = (len + (1ULL << shift) - 1) >> shift;
+ size = npages * sizeof(u64);
+
+ m = kmalloc(sizeof(*m) + size, GFP_ATOMIC);
+ if (!m) {
+ PDBG("%s couldn't allocate memory.\n", __func__);
+ return;
+ }
+ m->mem_id = MEM_PMRX;
+ m->addr = pbl_addr;
+ m->len = size;
+ PDBG("%s PBL addr 0x%x len %d depth %d\n",
+ __func__, m->addr, m->len, npages);
+ rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m);
+ if (rc) {
+ PDBG("%s toectl returned error %d\n", __func__, rc);
+ kfree(m);
+ return;
+ }
+
+ data = (u64 *)m->buf;
+ while (size > 0) {
+ PDBG("PBL %08x: %016llx\n", m->addr, (unsigned long long) *data);
+ size -= 8;
+ data++;
+ m->addr += 8;
+ }
+ kfree(m);
+}
+
+void cxio_dump_wqe(union t3_wr *wqe)
+{
+ __be64 *data = (__be64 *)wqe;
+ uint size = (uint)(be64_to_cpu(*data) & 0xff);
+
+ if (size == 0)
+ size = 8;
+ while (size > 0) {
+ PDBG("WQE %p: %016llx\n", data,
+ (unsigned long long) be64_to_cpu(*data));
+ size--;
+ data++;
+ }
+}
+
+void cxio_dump_wce(struct t3_cqe *wce)
+{
+ __be64 *data = (__be64 *)wce;
+ int size = sizeof(*wce);
+
+ while (size > 0) {
+ PDBG("WCE %p: %016llx\n", data,
+ (unsigned long long) be64_to_cpu(*data));
+ size -= 8;
+ data++;
+ }
+}
+
+void cxio_dump_rqt(struct cxio_rdev *rdev, u32 hwtid, int nents)
+{
+ struct ch_mem_range *m;
+ int size = nents * 64;
+ u64 *data;
+ int rc;
+
+ m = kmalloc(sizeof(*m) + size, GFP_ATOMIC);
+ if (!m) {
+ PDBG("%s couldn't allocate memory.\n", __func__);
+ return;
+ }
+ m->mem_id = MEM_PMRX;
+ m->addr = ((hwtid)<<10) + rdev->rnic_info.rqt_base;
+ m->len = size;
+ PDBG("%s RQT addr 0x%x len %d\n", __func__, m->addr, m->len);
+ rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m);
+ if (rc) {
+ PDBG("%s toectl returned error %d\n", __func__, rc);
+ kfree(m);
+ return;
+ }
+
+ data = (u64 *)m->buf;
+ while (size > 0) {
+ PDBG("RQT %08x: %016llx\n", m->addr, (unsigned long long) *data);
+ size -= 8;
+ data++;
+ m->addr += 8;
+ }
+ kfree(m);
+}
+
+void cxio_dump_tcb(struct cxio_rdev *rdev, u32 hwtid)
+{
+ struct ch_mem_range *m;
+ int size = TCB_SIZE;
+ u32 *data;
+ int rc;
+
+ m = kmalloc(sizeof(*m) + size, GFP_ATOMIC);
+ if (!m) {
+ PDBG("%s couldn't allocate memory.\n", __func__);
+ return;
+ }
+ m->mem_id = MEM_CM;
+ m->addr = hwtid * size;
+ m->len = size;
+ PDBG("%s TCB %d len %d\n", __func__, m->addr, m->len);
+ rc = rdev->t3cdev_p->ctl(rdev->t3cdev_p, RDMA_GET_MEM, m);
+ if (rc) {
+ PDBG("%s toectl returned error %d\n", __func__, rc);
+ kfree(m);
+ return;
+ }
+
+ data = (u32 *)m->buf;
+ while (size > 0) {
+ printk("%2u: %08x %08x %08x %08x %08x %08x %08x %08x\n",
+ m->addr,
+ *(data+2), *(data+3), *(data),*(data+1),
+ *(data+6), *(data+7), *(data+4), *(data+5));
+ size -= 32;
+ data += 8;
+ m->addr += 32;
+ }
+ kfree(m);
+}
+#endif
diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.c b/drivers/infiniband/hw/cxgb3/cxio_hal.c
new file mode 100644
index 000000000..de1c61b41
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/cxio_hal.c
@@ -0,0 +1,1343 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <asm/delay.h>
+
+#include <linux/mutex.h>
+#include <linux/netdevice.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <net/net_namespace.h>
+
+#include "cxio_resource.h"
+#include "cxio_hal.h"
+#include "cxgb3_offload.h"
+#include "sge_defs.h"
+
+static LIST_HEAD(rdev_list);
+static cxio_hal_ev_callback_func_t cxio_ev_cb = NULL;
+
+static struct cxio_rdev *cxio_hal_find_rdev_by_name(char *dev_name)
+{
+ struct cxio_rdev *rdev;
+
+ list_for_each_entry(rdev, &rdev_list, entry)
+ if (!strcmp(rdev->dev_name, dev_name))
+ return rdev;
+ return NULL;
+}
+
+static struct cxio_rdev *cxio_hal_find_rdev_by_t3cdev(struct t3cdev *tdev)
+{
+ struct cxio_rdev *rdev;
+
+ list_for_each_entry(rdev, &rdev_list, entry)
+ if (rdev->t3cdev_p == tdev)
+ return rdev;
+ return NULL;
+}
+
+int cxio_hal_cq_op(struct cxio_rdev *rdev_p, struct t3_cq *cq,
+ enum t3_cq_opcode op, u32 credit)
+{
+ int ret;
+ struct t3_cqe *cqe;
+ u32 rptr;
+
+ struct rdma_cq_op setup;
+ setup.id = cq->cqid;
+ setup.credits = (op == CQ_CREDIT_UPDATE) ? credit : 0;
+ setup.op = op;
+ ret = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_OP, &setup);
+
+ if ((ret < 0) || (op == CQ_CREDIT_UPDATE))
+ return ret;
+
+ /*
+ * If the rearm returned an index other than our current index,
+ * then there might be CQE's in flight (being DMA'd). We must wait
+ * here for them to complete or the consumer can miss a notification.
+ */
+ if (Q_PTR2IDX((cq->rptr), cq->size_log2) != ret) {
+ int i=0;
+
+ rptr = cq->rptr;
+
+ /*
+ * Keep the generation correct by bumping rptr until it
+ * matches the index returned by the rearm - 1.
+ */
+ while (Q_PTR2IDX((rptr+1), cq->size_log2) != ret)
+ rptr++;
+
+ /*
+ * Now rptr is the index for the (last) cqe that was
+ * in-flight at the time the HW rearmed the CQ. We
+ * spin until that CQE is valid.
+ */
+ cqe = cq->queue + Q_PTR2IDX(rptr, cq->size_log2);
+ while (!CQ_VLD_ENTRY(rptr, cq->size_log2, cqe)) {
+ udelay(1);
+ if (i++ > 1000000) {
+ printk(KERN_ERR "%s: stalled rnic\n",
+ rdev_p->dev_name);
+ return -EIO;
+ }
+ }
+
+ return 1;
+ }
+
+ return 0;
+}
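Editorial aside: cxio_hal_cq_op() folds two behaviours together. A plain credit update returns as soon as the SGE control call completes, while a re-arm additionally returns 1 when CQEs were still in flight at the time of the rearm, meaning the consumer should poll again rather than wait for the next notification. A hedged caller sketch (not from the driver; the re-poll callback is hypothetical):

/* Illustrative sketch only; example_poll_cq_again() is hypothetical. */
static void example_rearm_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq)
{
	int ret = cxio_hal_cq_op(rdev_p, cq, CQ_ARM_AN, 0);

	if (ret < 0)
		return;				/* RDMA_CQ_OP control call failed */
	if (ret == 1)
		example_poll_cq_again(cq);	/* CQEs completed during the rearm */
}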
+
+static int cxio_hal_clear_cq_ctx(struct cxio_rdev *rdev_p, u32 cqid)
+{
+ struct rdma_cq_setup setup;
+ setup.id = cqid;
+ setup.base_addr = 0; /* NULL address */
+ setup.size = 0; /* disable the CQ */
+ setup.credits = 0;
+ setup.credit_thres = 0;
+ setup.ovfl_mode = 0;
+ return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup));
+}
+
+static int cxio_hal_clear_qp_ctx(struct cxio_rdev *rdev_p, u32 qpid)
+{
+ u64 sge_cmd;
+ struct t3_modify_qp_wr *wqe;
+ struct sk_buff *skb = alloc_skb(sizeof(*wqe), GFP_KERNEL);
+ if (!skb) {
+ PDBG("%s alloc_skb failed\n", __func__);
+ return -ENOMEM;
+ }
+ wqe = (struct t3_modify_qp_wr *) skb_put(skb, sizeof(*wqe));
+ memset(wqe, 0, sizeof(*wqe));
+ build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD,
+ T3_COMPLETION_FLAG | T3_NOTIFY_FLAG, 0, qpid, 7,
+ T3_SOPEOP);
+ wqe->flags = cpu_to_be32(MODQP_WRITE_EC);
+ sge_cmd = qpid << 8 | 3;
+ wqe->sge_cmd = cpu_to_be64(sge_cmd);
+ skb->priority = CPL_PRIORITY_CONTROL;
+ return iwch_cxgb3_ofld_send(rdev_p->t3cdev_p, skb);
+}
+
+int cxio_create_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq, int kernel)
+{
+ struct rdma_cq_setup setup;
+ int size = (1UL << (cq->size_log2)) * sizeof(struct t3_cqe);
+
+ size += 1; /* one extra page for storing cq-in-err state */
+ cq->cqid = cxio_hal_get_cqid(rdev_p->rscp);
+ if (!cq->cqid)
+ return -ENOMEM;
+ if (kernel) {
+ cq->sw_queue = kzalloc(size, GFP_KERNEL);
+ if (!cq->sw_queue)
+ return -ENOMEM;
+ }
+ cq->queue = dma_alloc_coherent(&(rdev_p->rnic_info.pdev->dev), size,
+ &(cq->dma_addr), GFP_KERNEL);
+ if (!cq->queue) {
+ kfree(cq->sw_queue);
+ return -ENOMEM;
+ }
+ dma_unmap_addr_set(cq, mapping, cq->dma_addr);
+ memset(cq->queue, 0, size);
+ setup.id = cq->cqid;
+ setup.base_addr = (u64) (cq->dma_addr);
+ setup.size = 1UL << cq->size_log2;
+ setup.credits = 65535;
+ setup.credit_thres = 1;
+ if (rdev_p->t3cdev_p->type != T3A)
+ setup.ovfl_mode = 0;
+ else
+ setup.ovfl_mode = 1;
+ return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup));
+}
+
+#ifdef notyet
+int cxio_resize_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq)
+{
+ struct rdma_cq_setup setup;
+ setup.id = cq->cqid;
+ setup.base_addr = (u64) (cq->dma_addr);
+ setup.size = 1UL << cq->size_log2;
+ setup.credits = setup.size;
+ setup.credit_thres = setup.size; /* TBD: overflow recovery */
+ setup.ovfl_mode = 1;
+ return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup));
+}
+#endif
+
+static u32 get_qpid(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx)
+{
+ struct cxio_qpid_list *entry;
+ u32 qpid;
+ int i;
+
+ mutex_lock(&uctx->lock);
+ if (!list_empty(&uctx->qpids)) {
+ entry = list_entry(uctx->qpids.next, struct cxio_qpid_list,
+ entry);
+ list_del(&entry->entry);
+ qpid = entry->qpid;
+ kfree(entry);
+ } else {
+ qpid = cxio_hal_get_qpid(rdev_p->rscp);
+ if (!qpid)
+ goto out;
+ for (i = qpid+1; i & rdev_p->qpmask; i++) {
+ entry = kmalloc(sizeof *entry, GFP_KERNEL);
+ if (!entry)
+ break;
+ entry->qpid = i;
+ list_add_tail(&entry->entry, &uctx->qpids);
+ }
+ }
+out:
+ mutex_unlock(&uctx->lock);
+ PDBG("%s qpid 0x%x\n", __func__, qpid);
+ return qpid;
+}
+
+static void put_qpid(struct cxio_rdev *rdev_p, u32 qpid,
+ struct cxio_ucontext *uctx)
+{
+ struct cxio_qpid_list *entry;
+
+ entry = kmalloc(sizeof *entry, GFP_KERNEL);
+ if (!entry)
+ return;
+ PDBG("%s qpid 0x%x\n", __func__, qpid);
+ entry->qpid = qpid;
+ mutex_lock(&uctx->lock);
+ list_add_tail(&entry->entry, &uctx->qpids);
+ mutex_unlock(&uctx->lock);
+}
+
+void cxio_release_ucontext(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx)
+{
+ struct list_head *pos, *nxt;
+ struct cxio_qpid_list *entry;
+
+ mutex_lock(&uctx->lock);
+ list_for_each_safe(pos, nxt, &uctx->qpids) {
+ entry = list_entry(pos, struct cxio_qpid_list, entry);
+ list_del_init(&entry->entry);
+ if (!(entry->qpid & rdev_p->qpmask))
+ cxio_hal_put_qpid(rdev_p->rscp, entry->qpid);
+ kfree(entry);
+ }
+ mutex_unlock(&uctx->lock);
+}
+
+void cxio_init_ucontext(struct cxio_rdev *rdev_p, struct cxio_ucontext *uctx)
+{
+ INIT_LIST_HEAD(&uctx->qpids);
+ mutex_init(&uctx->lock);
+}
+
+int cxio_create_qp(struct cxio_rdev *rdev_p, u32 kernel_domain,
+ struct t3_wq *wq, struct cxio_ucontext *uctx)
+{
+ int depth = 1UL << wq->size_log2;
+ int rqsize = 1UL << wq->rq_size_log2;
+
+ wq->qpid = get_qpid(rdev_p, uctx);
+ if (!wq->qpid)
+ return -ENOMEM;
+
+ wq->rq = kzalloc(depth * sizeof(struct t3_swrq), GFP_KERNEL);
+ if (!wq->rq)
+ goto err1;
+
+ wq->rq_addr = cxio_hal_rqtpool_alloc(rdev_p, rqsize);
+ if (!wq->rq_addr)
+ goto err2;
+
+ wq->sq = kzalloc(depth * sizeof(struct t3_swsq), GFP_KERNEL);
+ if (!wq->sq)
+ goto err3;
+
+ wq->queue = dma_alloc_coherent(&(rdev_p->rnic_info.pdev->dev),
+ depth * sizeof(union t3_wr),
+ &(wq->dma_addr), GFP_KERNEL);
+ if (!wq->queue)
+ goto err4;
+
+ memset(wq->queue, 0, depth * sizeof(union t3_wr));
+ dma_unmap_addr_set(wq, mapping, wq->dma_addr);
+ wq->doorbell = (void __iomem *)rdev_p->rnic_info.kdb_addr;
+ if (!kernel_domain)
+ wq->udb = (u64)rdev_p->rnic_info.udbell_physbase +
+ (wq->qpid << rdev_p->qpshift);
+ wq->rdev = rdev_p;
+ PDBG("%s qpid 0x%x doorbell 0x%p udb 0x%llx\n", __func__,
+ wq->qpid, wq->doorbell, (unsigned long long) wq->udb);
+ return 0;
+err4:
+ kfree(wq->sq);
+err3:
+ cxio_hal_rqtpool_free(rdev_p, wq->rq_addr, rqsize);
+err2:
+ kfree(wq->rq);
+err1:
+ put_qpid(rdev_p, wq->qpid, uctx);
+ return -ENOMEM;
+}
+
+int cxio_destroy_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq)
+{
+ int err;
+ err = cxio_hal_clear_cq_ctx(rdev_p, cq->cqid);
+ kfree(cq->sw_queue);
+ dma_free_coherent(&(rdev_p->rnic_info.pdev->dev),
+ (1UL << (cq->size_log2))
+ * sizeof(struct t3_cqe), cq->queue,
+ dma_unmap_addr(cq, mapping));
+ cxio_hal_put_cqid(rdev_p->rscp, cq->cqid);
+ return err;
+}
+
+int cxio_destroy_qp(struct cxio_rdev *rdev_p, struct t3_wq *wq,
+ struct cxio_ucontext *uctx)
+{
+ dma_free_coherent(&(rdev_p->rnic_info.pdev->dev),
+ (1UL << (wq->size_log2))
+ * sizeof(union t3_wr), wq->queue,
+ dma_unmap_addr(wq, mapping));
+ kfree(wq->sq);
+ cxio_hal_rqtpool_free(rdev_p, wq->rq_addr, (1UL << wq->rq_size_log2));
+ kfree(wq->rq);
+ put_qpid(rdev_p, wq->qpid, uctx);
+ return 0;
+}
+
+static void insert_recv_cqe(struct t3_wq *wq, struct t3_cq *cq)
+{
+ struct t3_cqe cqe;
+
+ PDBG("%s wq %p cq %p sw_rptr 0x%x sw_wptr 0x%x\n", __func__,
+ wq, cq, cq->sw_rptr, cq->sw_wptr);
+ memset(&cqe, 0, sizeof(cqe));
+ cqe.header = cpu_to_be32(V_CQE_STATUS(TPT_ERR_SWFLUSH) |
+ V_CQE_OPCODE(T3_SEND) |
+ V_CQE_TYPE(0) |
+ V_CQE_SWCQE(1) |
+ V_CQE_QPID(wq->qpid) |
+ V_CQE_GENBIT(Q_GENBIT(cq->sw_wptr,
+ cq->size_log2)));
+ *(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) = cqe;
+ cq->sw_wptr++;
+}
+
+int cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count)
+{
+ u32 ptr;
+ int flushed = 0;
+
+ PDBG("%s wq %p cq %p\n", __func__, wq, cq);
+
+ /* flush RQ */
+ PDBG("%s rq_rptr %u rq_wptr %u skip count %u\n", __func__,
+ wq->rq_rptr, wq->rq_wptr, count);
+ ptr = wq->rq_rptr + count;
+ while (ptr++ != wq->rq_wptr) {
+ insert_recv_cqe(wq, cq);
+ flushed++;
+ }
+ return flushed;
+}
+
+static void insert_sq_cqe(struct t3_wq *wq, struct t3_cq *cq,
+ struct t3_swsq *sqp)
+{
+ struct t3_cqe cqe;
+
+ PDBG("%s wq %p cq %p sw_rptr 0x%x sw_wptr 0x%x\n", __func__,
+ wq, cq, cq->sw_rptr, cq->sw_wptr);
+ memset(&cqe, 0, sizeof(cqe));
+ cqe.header = cpu_to_be32(V_CQE_STATUS(TPT_ERR_SWFLUSH) |
+ V_CQE_OPCODE(sqp->opcode) |
+ V_CQE_TYPE(1) |
+ V_CQE_SWCQE(1) |
+ V_CQE_QPID(wq->qpid) |
+ V_CQE_GENBIT(Q_GENBIT(cq->sw_wptr,
+ cq->size_log2)));
+ cqe.u.scqe.wrid_hi = sqp->sq_wptr;
+
+ *(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2)) = cqe;
+ cq->sw_wptr++;
+}
+
+int cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count)
+{
+ __u32 ptr;
+ int flushed = 0;
+ struct t3_swsq *sqp = wq->sq + Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2);
+
+ ptr = wq->sq_rptr + count;
+ sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2);
+ while (ptr != wq->sq_wptr) {
+ sqp->signaled = 0;
+ insert_sq_cqe(wq, cq, sqp);
+ ptr++;
+ sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2);
+ flushed++;
+ }
+ return flushed;
+}
+
+/*
+ * Move all CQEs from the HWCQ into the SWCQ.
+ */
+void cxio_flush_hw_cq(struct t3_cq *cq)
+{
+ struct t3_cqe *cqe, *swcqe;
+
+ PDBG("%s cq %p cqid 0x%x\n", __func__, cq, cq->cqid);
+ cqe = cxio_next_hw_cqe(cq);
+ while (cqe) {
+ PDBG("%s flushing hwcq rptr 0x%x to swcq wptr 0x%x\n",
+ __func__, cq->rptr, cq->sw_wptr);
+ swcqe = cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2);
+ *swcqe = *cqe;
+ swcqe->header |= cpu_to_be32(V_CQE_SWCQE(1));
+ cq->sw_wptr++;
+ cq->rptr++;
+ cqe = cxio_next_hw_cqe(cq);
+ }
+}
+
+static int cqe_completes_wr(struct t3_cqe *cqe, struct t3_wq *wq)
+{
+ if (CQE_OPCODE(*cqe) == T3_TERMINATE)
+ return 0;
+
+ if ((CQE_OPCODE(*cqe) == T3_RDMA_WRITE) && RQ_TYPE(*cqe))
+ return 0;
+
+ if ((CQE_OPCODE(*cqe) == T3_READ_RESP) && SQ_TYPE(*cqe))
+ return 0;
+
+ if (CQE_SEND_OPCODE(*cqe) && RQ_TYPE(*cqe) &&
+ Q_EMPTY(wq->rq_rptr, wq->rq_wptr))
+ return 0;
+
+ return 1;
+}
+
+void cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count)
+{
+ struct t3_cqe *cqe;
+ u32 ptr;
+
+ *count = 0;
+ ptr = cq->sw_rptr;
+ while (!Q_EMPTY(ptr, cq->sw_wptr)) {
+ cqe = cq->sw_queue + (Q_PTR2IDX(ptr, cq->size_log2));
+ if ((SQ_TYPE(*cqe) ||
+ ((CQE_OPCODE(*cqe) == T3_READ_RESP) && wq->oldest_read)) &&
+ (CQE_QPID(*cqe) == wq->qpid))
+ (*count)++;
+ ptr++;
+ }
+ PDBG("%s cq %p count %d\n", __func__, cq, *count);
+}
+
+void cxio_count_rcqes(struct t3_cq *cq, struct t3_wq *wq, int *count)
+{
+ struct t3_cqe *cqe;
+ u32 ptr;
+
+ *count = 0;
+ PDBG("%s count zero %d\n", __func__, *count);
+ ptr = cq->sw_rptr;
+ while (!Q_EMPTY(ptr, cq->sw_wptr)) {
+ cqe = cq->sw_queue + (Q_PTR2IDX(ptr, cq->size_log2));
+ if (RQ_TYPE(*cqe) && (CQE_OPCODE(*cqe) != T3_READ_RESP) &&
+ (CQE_QPID(*cqe) == wq->qpid) && cqe_completes_wr(cqe, wq))
+ (*count)++;
+ ptr++;
+ }
+ PDBG("%s cq %p count %d\n", __func__, cq, *count);
+}
+
+static int cxio_hal_init_ctrl_cq(struct cxio_rdev *rdev_p)
+{
+ struct rdma_cq_setup setup;
+ setup.id = 0;
+ setup.base_addr = 0; /* NULL address */
+ setup.size = 1; /* enable the CQ */
+ setup.credits = 0;
+
+ /* force SGE to redirect to RspQ and interrupt */
+ setup.credit_thres = 0;
+ setup.ovfl_mode = 1;
+ return (rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_CQ_SETUP, &setup));
+}
+
+static int cxio_hal_init_ctrl_qp(struct cxio_rdev *rdev_p)
+{
+ int err;
+ u64 sge_cmd, ctx0, ctx1;
+ u64 base_addr;
+ struct t3_modify_qp_wr *wqe;
+ struct sk_buff *skb;
+
+ skb = alloc_skb(sizeof(*wqe), GFP_KERNEL);
+ if (!skb) {
+ PDBG("%s alloc_skb failed\n", __func__);
+ return -ENOMEM;
+ }
+ err = cxio_hal_init_ctrl_cq(rdev_p);
+ if (err) {
+ PDBG("%s err %d initializing ctrl_cq\n", __func__, err);
+ goto err;
+ }
+ rdev_p->ctrl_qp.workq = dma_alloc_coherent(
+ &(rdev_p->rnic_info.pdev->dev),
+ (1 << T3_CTRL_QP_SIZE_LOG2) *
+ sizeof(union t3_wr),
+ &(rdev_p->ctrl_qp.dma_addr),
+ GFP_KERNEL);
+ if (!rdev_p->ctrl_qp.workq) {
+ PDBG("%s dma_alloc_coherent failed\n", __func__);
+ err = -ENOMEM;
+ goto err;
+ }
+ dma_unmap_addr_set(&rdev_p->ctrl_qp, mapping,
+ rdev_p->ctrl_qp.dma_addr);
+ rdev_p->ctrl_qp.doorbell = (void __iomem *)rdev_p->rnic_info.kdb_addr;
+ memset(rdev_p->ctrl_qp.workq, 0,
+ (1 << T3_CTRL_QP_SIZE_LOG2) * sizeof(union t3_wr));
+
+ mutex_init(&rdev_p->ctrl_qp.lock);
+ init_waitqueue_head(&rdev_p->ctrl_qp.waitq);
+
+ /* update HW Ctrl QP context */
+ base_addr = rdev_p->ctrl_qp.dma_addr;
+ base_addr >>= 12;
+ ctx0 = (V_EC_SIZE((1 << T3_CTRL_QP_SIZE_LOG2)) |
+ V_EC_BASE_LO((u32) base_addr & 0xffff));
+ ctx0 <<= 32;
+ ctx0 |= V_EC_CREDITS(FW_WR_NUM);
+ base_addr >>= 16;
+ ctx1 = (u32) base_addr;
+ base_addr >>= 32;
+ ctx1 |= ((u64) (V_EC_BASE_HI((u32) base_addr & 0xf) | V_EC_RESPQ(0) |
+ V_EC_TYPE(0) | V_EC_GEN(1) |
+ V_EC_UP_TOKEN(T3_CTL_QP_TID) | F_EC_VALID)) << 32;
+ wqe = (struct t3_modify_qp_wr *) skb_put(skb, sizeof(*wqe));
+ memset(wqe, 0, sizeof(*wqe));
+ build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_QP_MOD, 0, 0,
+ T3_CTL_QP_TID, 7, T3_SOPEOP);
+ wqe->flags = cpu_to_be32(MODQP_WRITE_EC);
+ sge_cmd = (3ULL << 56) | FW_RI_SGEEC_START << 8 | 3;
+ wqe->sge_cmd = cpu_to_be64(sge_cmd);
+ wqe->ctx1 = cpu_to_be64(ctx1);
+ wqe->ctx0 = cpu_to_be64(ctx0);
+ PDBG("CtrlQP dma_addr 0x%llx workq %p size %d\n",
+ (unsigned long long) rdev_p->ctrl_qp.dma_addr,
+ rdev_p->ctrl_qp.workq, 1 << T3_CTRL_QP_SIZE_LOG2);
+ skb->priority = CPL_PRIORITY_CONTROL;
+ return iwch_cxgb3_ofld_send(rdev_p->t3cdev_p, skb);
+err:
+ kfree_skb(skb);
+ return err;
+}
+
+static int cxio_hal_destroy_ctrl_qp(struct cxio_rdev *rdev_p)
+{
+ dma_free_coherent(&(rdev_p->rnic_info.pdev->dev),
+ (1UL << T3_CTRL_QP_SIZE_LOG2)
+ * sizeof(union t3_wr), rdev_p->ctrl_qp.workq,
+ dma_unmap_addr(&rdev_p->ctrl_qp, mapping));
+ return cxio_hal_clear_qp_ctx(rdev_p, T3_CTRL_QP_ID);
+}
+
+/* write len bytes of data into addr (32B aligned address)
+ * If data is NULL, clear len byte of memory to zero.
+ * caller acquires the ctrl_qp lock before the call
+ */
+static int cxio_hal_ctrl_qp_write_mem(struct cxio_rdev *rdev_p, u32 addr,
+ u32 len, void *data)
+{
+ u32 i, nr_wqe, copy_len;
+ u8 *copy_data;
+ u8 wr_len, utx_len; /* length in 8 byte flit */
+ enum t3_wr_flags flag;
+ __be64 *wqe;
+ u64 utx_cmd;
+ addr &= 0x7FFFFFF;
+ nr_wqe = len % 96 ? len / 96 + 1 : len / 96; /* 96B max per WQE */
+ PDBG("%s wptr 0x%x rptr 0x%x len %d, nr_wqe %d data %p addr 0x%0x\n",
+ __func__, rdev_p->ctrl_qp.wptr, rdev_p->ctrl_qp.rptr, len,
+ nr_wqe, data, addr);
+ utx_len = 3; /* in 32B unit */
+ for (i = 0; i < nr_wqe; i++) {
+ if (Q_FULL(rdev_p->ctrl_qp.rptr, rdev_p->ctrl_qp.wptr,
+ T3_CTRL_QP_SIZE_LOG2)) {
+ PDBG("%s ctrl_qp full wtpr 0x%0x rptr 0x%0x, "
+ "wait for more space i %d\n", __func__,
+ rdev_p->ctrl_qp.wptr, rdev_p->ctrl_qp.rptr, i);
+ if (wait_event_interruptible(rdev_p->ctrl_qp.waitq,
+ !Q_FULL(rdev_p->ctrl_qp.rptr,
+ rdev_p->ctrl_qp.wptr,
+ T3_CTRL_QP_SIZE_LOG2))) {
+ PDBG("%s ctrl_qp workq interrupted\n",
+ __func__);
+ return -ERESTARTSYS;
+ }
+ PDBG("%s ctrl_qp wakeup, continue posting work request "
+ "i %d\n", __func__, i);
+ }
+ wqe = (__be64 *)(rdev_p->ctrl_qp.workq + (rdev_p->ctrl_qp.wptr %
+ (1 << T3_CTRL_QP_SIZE_LOG2)));
+ flag = 0;
+ if (i == (nr_wqe - 1)) {
+ /* last WQE */
+ flag = T3_COMPLETION_FLAG;
+ if (len % 32)
+ utx_len = len / 32 + 1;
+ else
+ utx_len = len / 32;
+ }
+
+ /*
+ * Force a CQE to return the credit to the workq in case
+ * we posted more than half the max QP size of WRs
+ */
+ if ((i != 0) &&
+ (i % (((1 << T3_CTRL_QP_SIZE_LOG2)) >> 1) == 0)) {
+ flag = T3_COMPLETION_FLAG;
+ PDBG("%s force completion at i %d\n", __func__, i);
+ }
+
+ /* build the utx mem command */
+ wqe += (sizeof(struct t3_bypass_wr) >> 3);
+ utx_cmd = (T3_UTX_MEM_WRITE << 28) | (addr + i * 3);
+ utx_cmd <<= 32;
+ utx_cmd |= (utx_len << 28) | ((utx_len << 2) + 1);
+ *wqe = cpu_to_be64(utx_cmd);
+ wqe++;
+ copy_data = (u8 *) data + i * 96;
+ copy_len = len > 96 ? 96 : len;
+
+ /* clear memory content if data is NULL */
+ if (data)
+ memcpy(wqe, copy_data, copy_len);
+ else
+ memset(wqe, 0, copy_len);
+ if (copy_len % 32)
+ memset(((u8 *) wqe) + copy_len, 0,
+ 32 - (copy_len % 32));
+ wr_len = ((sizeof(struct t3_bypass_wr)) >> 3) + 1 +
+ (utx_len << 2);
+ wqe = (__be64 *)(rdev_p->ctrl_qp.workq + (rdev_p->ctrl_qp.wptr %
+ (1 << T3_CTRL_QP_SIZE_LOG2)));
+
+ /* wptr in the WRID[31:0] */
+ ((union t3_wrid *)(wqe+1))->id0.low = rdev_p->ctrl_qp.wptr;
+
+ /*
+ * This must be the last write with a memory barrier
+ * for the genbit
+ */
+ build_fw_riwrh((struct fw_riwrh *) wqe, T3_WR_BP, flag,
+ Q_GENBIT(rdev_p->ctrl_qp.wptr,
+ T3_CTRL_QP_SIZE_LOG2), T3_CTRL_QP_ID,
+ wr_len, T3_SOPEOP);
+ if (flag == T3_COMPLETION_FLAG)
+ ring_doorbell(rdev_p->ctrl_qp.doorbell, T3_CTRL_QP_ID);
+ len -= 96;
+ rdev_p->ctrl_qp.wptr++;
+ }
+ return 0;
+}
+
+/* IN: stag key, pdid, perm, zbva, to, len, page_size, pbl_size and pbl_addr
+ * OUT: stag index
+ * TBD: shared memory region support
+ */
+static int __cxio_tpt_op(struct cxio_rdev *rdev_p, u32 reset_tpt_entry,
+ u32 *stag, u8 stag_state, u32 pdid,
+ enum tpt_mem_type type, enum tpt_mem_perm perm,
+ u32 zbva, u64 to, u32 len, u8 page_size,
+ u32 pbl_size, u32 pbl_addr)
+{
+ int err;
+ struct tpt_entry tpt;
+ u32 stag_idx;
+ u32 wptr;
+
+ if (cxio_fatal_error(rdev_p))
+ return -EIO;
+
+ stag_state = stag_state > 0;
+ stag_idx = (*stag) >> 8;
+
+ if ((!reset_tpt_entry) && !(*stag != T3_STAG_UNSET)) {
+ stag_idx = cxio_hal_get_stag(rdev_p->rscp);
+ if (!stag_idx)
+ return -ENOMEM;
+ *stag = (stag_idx << 8) | ((*stag) & 0xFF);
+ }
+ PDBG("%s stag_state 0x%0x type 0x%0x pdid 0x%0x, stag_idx 0x%x\n",
+ __func__, stag_state, type, pdid, stag_idx);
+
+ mutex_lock(&rdev_p->ctrl_qp.lock);
+
+ /* write TPT entry */
+ if (reset_tpt_entry)
+ memset(&tpt, 0, sizeof(tpt));
+ else {
+ tpt.valid_stag_pdid = cpu_to_be32(F_TPT_VALID |
+ V_TPT_STAG_KEY((*stag) & M_TPT_STAG_KEY) |
+ V_TPT_STAG_STATE(stag_state) |
+ V_TPT_STAG_TYPE(type) | V_TPT_PDID(pdid));
+ BUG_ON(page_size >= 28);
+ tpt.flags_pagesize_qpid = cpu_to_be32(V_TPT_PERM(perm) |
+ ((perm & TPT_MW_BIND) ? F_TPT_MW_BIND_ENABLE : 0) |
+ V_TPT_ADDR_TYPE((zbva ? TPT_ZBTO : TPT_VATO)) |
+ V_TPT_PAGE_SIZE(page_size));
+ tpt.rsvd_pbl_addr = cpu_to_be32(V_TPT_PBL_ADDR(PBL_OFF(rdev_p, pbl_addr)>>3));
+ tpt.len = cpu_to_be32(len);
+ tpt.va_hi = cpu_to_be32((u32) (to >> 32));
+ tpt.va_low_or_fbo = cpu_to_be32((u32) (to & 0xFFFFFFFFULL));
+ tpt.rsvd_bind_cnt_or_pstag = 0;
+ tpt.rsvd_pbl_size = cpu_to_be32(V_TPT_PBL_SIZE(pbl_size >> 2));
+ }
+ err = cxio_hal_ctrl_qp_write_mem(rdev_p,
+ stag_idx +
+ (rdev_p->rnic_info.tpt_base >> 5),
+ sizeof(tpt), &tpt);
+
+ /* release the stag index to free pool */
+ if (reset_tpt_entry)
+ cxio_hal_put_stag(rdev_p->rscp, stag_idx);
+
+ wptr = rdev_p->ctrl_qp.wptr;
+ mutex_unlock(&rdev_p->ctrl_qp.lock);
+ if (!err)
+ if (wait_event_interruptible(rdev_p->ctrl_qp.waitq,
+ SEQ32_GE(rdev_p->ctrl_qp.rptr,
+ wptr)))
+ return -ERESTARTSYS;
+ return err;
+}
+
+int cxio_write_pbl(struct cxio_rdev *rdev_p, __be64 *pbl,
+ u32 pbl_addr, u32 pbl_size)
+{
+ u32 wptr;
+ int err;
+
+ PDBG("%s *pdb_addr 0x%x, pbl_base 0x%x, pbl_size %d\n",
+ __func__, pbl_addr, rdev_p->rnic_info.pbl_base,
+ pbl_size);
+
+ mutex_lock(&rdev_p->ctrl_qp.lock);
+ err = cxio_hal_ctrl_qp_write_mem(rdev_p, pbl_addr >> 5, pbl_size << 3,
+ pbl);
+ wptr = rdev_p->ctrl_qp.wptr;
+ mutex_unlock(&rdev_p->ctrl_qp.lock);
+ if (err)
+ return err;
+
+ if (wait_event_interruptible(rdev_p->ctrl_qp.waitq,
+ SEQ32_GE(rdev_p->ctrl_qp.rptr,
+ wptr)))
+ return -ERESTARTSYS;
+
+ return 0;
+}
+
+int cxio_register_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid,
+ enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len,
+ u8 page_size, u32 pbl_size, u32 pbl_addr)
+{
+ *stag = T3_STAG_UNSET;
+ return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm,
+ zbva, to, len, page_size, pbl_size, pbl_addr);
+}
+
+int cxio_reregister_phys_mem(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid,
+ enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len,
+ u8 page_size, u32 pbl_size, u32 pbl_addr)
+{
+ return __cxio_tpt_op(rdev_p, 0, stag, 1, pdid, TPT_NON_SHARED_MR, perm,
+ zbva, to, len, page_size, pbl_size, pbl_addr);
+}
+
+int cxio_dereg_mem(struct cxio_rdev *rdev_p, u32 stag, u32 pbl_size,
+ u32 pbl_addr)
+{
+ return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0,
+ pbl_size, pbl_addr);
+}
+
+int cxio_allocate_window(struct cxio_rdev *rdev_p, u32 * stag, u32 pdid)
+{
+ *stag = T3_STAG_UNSET;
+ return __cxio_tpt_op(rdev_p, 0, stag, 0, pdid, TPT_MW, 0, 0, 0ULL, 0, 0,
+ 0, 0);
+}
+
+int cxio_deallocate_window(struct cxio_rdev *rdev_p, u32 stag)
+{
+ return __cxio_tpt_op(rdev_p, 1, &stag, 0, 0, 0, 0, 0, 0ULL, 0, 0,
+ 0, 0);
+}
+
+int cxio_allocate_stag(struct cxio_rdev *rdev_p, u32 *stag, u32 pdid, u32 pbl_size, u32 pbl_addr)
+{
+ *stag = T3_STAG_UNSET;
+ return __cxio_tpt_op(rdev_p, 0, stag, 0, pdid, TPT_NON_SHARED_MR,
+ 0, 0, 0ULL, 0, 0, pbl_size, pbl_addr);
+}
+
+int cxio_rdma_init(struct cxio_rdev *rdev_p, struct t3_rdma_init_attr *attr)
+{
+ struct t3_rdma_init_wr *wqe;
+ struct sk_buff *skb = alloc_skb(sizeof(*wqe), GFP_ATOMIC);
+ if (!skb)
+ return -ENOMEM;
+ PDBG("%s rdev_p %p\n", __func__, rdev_p);
+ wqe = (struct t3_rdma_init_wr *) __skb_put(skb, sizeof(*wqe));
+ wqe->wrh.op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(T3_WR_INIT));
+ wqe->wrh.gen_tid_len = cpu_to_be32(V_FW_RIWR_TID(attr->tid) |
+ V_FW_RIWR_LEN(sizeof(*wqe) >> 3));
+ wqe->wrid.id1 = 0;
+ wqe->qpid = cpu_to_be32(attr->qpid);
+ wqe->pdid = cpu_to_be32(attr->pdid);
+ wqe->scqid = cpu_to_be32(attr->scqid);
+ wqe->rcqid = cpu_to_be32(attr->rcqid);
+ wqe->rq_addr = cpu_to_be32(attr->rq_addr - rdev_p->rnic_info.rqt_base);
+ wqe->rq_size = cpu_to_be32(attr->rq_size);
+ wqe->mpaattrs = attr->mpaattrs;
+ wqe->qpcaps = attr->qpcaps;
+ wqe->ulpdu_size = cpu_to_be16(attr->tcp_emss);
+ wqe->rqe_count = cpu_to_be16(attr->rqe_count);
+ wqe->flags_rtr_type = cpu_to_be16(attr->flags |
+ V_RTR_TYPE(attr->rtr_type) |
+ V_CHAN(attr->chan));
+ wqe->ord = cpu_to_be32(attr->ord);
+ wqe->ird = cpu_to_be32(attr->ird);
+ wqe->qp_dma_addr = cpu_to_be64(attr->qp_dma_addr);
+ wqe->qp_dma_size = cpu_to_be32(attr->qp_dma_size);
+ wqe->irs = cpu_to_be32(attr->irs);
+ skb->priority = 0; /* 0=>ToeQ; 1=>CtrlQ */
+ return iwch_cxgb3_ofld_send(rdev_p->t3cdev_p, skb);
+}
+
+void cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb)
+{
+ cxio_ev_cb = ev_cb;
+}
+
+void cxio_unregister_ev_cb(cxio_hal_ev_callback_func_t ev_cb)
+{
+ cxio_ev_cb = NULL;
+}
+
+static int cxio_hal_ev_handler(struct t3cdev *t3cdev_p, struct sk_buff *skb)
+{
+ static int cnt;
+ struct cxio_rdev *rdev_p = NULL;
+ struct respQ_msg_t *rsp_msg = (struct respQ_msg_t *) skb->data;
+ PDBG("%d: %s cq_id 0x%x cq_ptr 0x%x genbit %0x overflow %0x an %0x"
+ " se %0x notify %0x cqbranch %0x creditth %0x\n",
+ cnt, __func__, RSPQ_CQID(rsp_msg), RSPQ_CQPTR(rsp_msg),
+ RSPQ_GENBIT(rsp_msg), RSPQ_OVERFLOW(rsp_msg), RSPQ_AN(rsp_msg),
+ RSPQ_SE(rsp_msg), RSPQ_NOTIFY(rsp_msg), RSPQ_CQBRANCH(rsp_msg),
+ RSPQ_CREDIT_THRESH(rsp_msg));
+ PDBG("CQE: QPID 0x%0x genbit %0x type 0x%0x status 0x%0x opcode %d "
+ "len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x\n",
+ CQE_QPID(rsp_msg->cqe), CQE_GENBIT(rsp_msg->cqe),
+ CQE_TYPE(rsp_msg->cqe), CQE_STATUS(rsp_msg->cqe),
+ CQE_OPCODE(rsp_msg->cqe), CQE_LEN(rsp_msg->cqe),
+ CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe));
+ rdev_p = (struct cxio_rdev *)t3cdev_p->ulp;
+ if (!rdev_p) {
+ PDBG("%s called by t3cdev %p with null ulp\n", __func__,
+ t3cdev_p);
+ return 0;
+ }
+ if (CQE_QPID(rsp_msg->cqe) == T3_CTRL_QP_ID) {
+ rdev_p->ctrl_qp.rptr = CQE_WRID_LOW(rsp_msg->cqe) + 1;
+ wake_up_interruptible(&rdev_p->ctrl_qp.waitq);
+ dev_kfree_skb_irq(skb);
+ } else if (CQE_QPID(rsp_msg->cqe) == 0xfff8)
+ dev_kfree_skb_irq(skb);
+ else if (cxio_ev_cb)
+ (*cxio_ev_cb) (rdev_p, skb);
+ else
+ dev_kfree_skb_irq(skb);
+ cnt++;
+ return 0;
+}
+
+/* Caller takes care of locking if needed */
+int cxio_rdev_open(struct cxio_rdev *rdev_p)
+{
+ struct net_device *netdev_p = NULL;
+ int err = 0;
+ if (strlen(rdev_p->dev_name)) {
+ if (cxio_hal_find_rdev_by_name(rdev_p->dev_name)) {
+ return -EBUSY;
+ }
+ netdev_p = dev_get_by_name(&init_net, rdev_p->dev_name);
+ if (!netdev_p) {
+ return -EINVAL;
+ }
+ dev_put(netdev_p);
+ } else if (rdev_p->t3cdev_p) {
+ if (cxio_hal_find_rdev_by_t3cdev(rdev_p->t3cdev_p)) {
+ return -EBUSY;
+ }
+ netdev_p = rdev_p->t3cdev_p->lldev;
+ strncpy(rdev_p->dev_name, rdev_p->t3cdev_p->name,
+ T3_MAX_DEV_NAME_LEN);
+ } else {
+ PDBG("%s t3cdev_p or dev_name must be set\n", __func__);
+ return -EINVAL;
+ }
+
+ list_add_tail(&rdev_p->entry, &rdev_list);
+
+ PDBG("%s opening rnic dev %s\n", __func__, rdev_p->dev_name);
+ memset(&rdev_p->ctrl_qp, 0, sizeof(rdev_p->ctrl_qp));
+ if (!rdev_p->t3cdev_p)
+ rdev_p->t3cdev_p = dev2t3cdev(netdev_p);
+ rdev_p->t3cdev_p->ulp = (void *) rdev_p;
+
+ err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, GET_EMBEDDED_INFO,
+ &(rdev_p->fw_info));
+ if (err) {
+ printk(KERN_ERR "%s t3cdev_p(%p)->ctl returned error %d.\n",
+ __func__, rdev_p->t3cdev_p, err);
+ goto err1;
+ }
+ if (G_FW_VERSION_MAJOR(rdev_p->fw_info.fw_vers) != CXIO_FW_MAJ) {
+ printk(KERN_ERR MOD "fatal firmware version mismatch: "
+ "need version %u but adapter has version %u\n",
+ CXIO_FW_MAJ,
+ G_FW_VERSION_MAJOR(rdev_p->fw_info.fw_vers));
+ err = -EINVAL;
+ goto err1;
+ }
+
+ err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, RDMA_GET_PARAMS,
+ &(rdev_p->rnic_info));
+ if (err) {
+ printk(KERN_ERR "%s t3cdev_p(%p)->ctl returned error %d.\n",
+ __func__, rdev_p->t3cdev_p, err);
+ goto err1;
+ }
+ err = rdev_p->t3cdev_p->ctl(rdev_p->t3cdev_p, GET_PORTS,
+ &(rdev_p->port_info));
+ if (err) {
+ printk(KERN_ERR "%s t3cdev_p(%p)->ctl returned error %d.\n",
+ __func__, rdev_p->t3cdev_p, err);
+ goto err1;
+ }
+
+ /*
+ * qpshift is the number of bits to shift the qpid left in order
+ * to get the correct address of the doorbell for that qp.
+ */
+ cxio_init_ucontext(rdev_p, &rdev_p->uctx);
+ rdev_p->qpshift = PAGE_SHIFT -
+ ilog2(65536 >>
+ ilog2(rdev_p->rnic_info.udbell_len >>
+ PAGE_SHIFT));
+ rdev_p->qpnr = rdev_p->rnic_info.udbell_len >> PAGE_SHIFT;
+ rdev_p->qpmask = (65536 >> ilog2(rdev_p->qpnr)) - 1;
+ PDBG("%s rnic %s info: tpt_base 0x%0x tpt_top 0x%0x num stags %d "
+ "pbl_base 0x%0x pbl_top 0x%0x rqt_base 0x%0x, rqt_top 0x%0x\n",
+ __func__, rdev_p->dev_name, rdev_p->rnic_info.tpt_base,
+ rdev_p->rnic_info.tpt_top, cxio_num_stags(rdev_p),
+ rdev_p->rnic_info.pbl_base,
+ rdev_p->rnic_info.pbl_top, rdev_p->rnic_info.rqt_base,
+ rdev_p->rnic_info.rqt_top);
+ PDBG("udbell_len 0x%0x udbell_physbase 0x%lx kdb_addr %p qpshift %lu "
+ "qpnr %d qpmask 0x%x\n",
+ rdev_p->rnic_info.udbell_len,
+ rdev_p->rnic_info.udbell_physbase, rdev_p->rnic_info.kdb_addr,
+ rdev_p->qpshift, rdev_p->qpnr, rdev_p->qpmask);
+
+ err = cxio_hal_init_ctrl_qp(rdev_p);
+ if (err) {
+ printk(KERN_ERR "%s error %d initializing ctrl_qp.\n",
+ __func__, err);
+ goto err1;
+ }
+ err = cxio_hal_init_resource(rdev_p, cxio_num_stags(rdev_p), 0,
+ 0, T3_MAX_NUM_QP, T3_MAX_NUM_CQ,
+ T3_MAX_NUM_PD);
+ if (err) {
+ printk(KERN_ERR "%s error %d initializing hal resources.\n",
+ __func__, err);
+ goto err2;
+ }
+ err = cxio_hal_pblpool_create(rdev_p);
+ if (err) {
+ printk(KERN_ERR "%s error %d initializing pbl mem pool.\n",
+ __func__, err);
+ goto err3;
+ }
+ err = cxio_hal_rqtpool_create(rdev_p);
+ if (err) {
+ printk(KERN_ERR "%s error %d initializing rqt mem pool.\n",
+ __func__, err);
+ goto err4;
+ }
+ return 0;
+err4:
+ cxio_hal_pblpool_destroy(rdev_p);
+err3:
+ cxio_hal_destroy_resource(rdev_p->rscp);
+err2:
+ cxio_hal_destroy_ctrl_qp(rdev_p);
+err1:
+ rdev_p->t3cdev_p->ulp = NULL;
+ list_del(&rdev_p->entry);
+ return err;
+}
+
+void cxio_rdev_close(struct cxio_rdev *rdev_p)
+{
+ if (rdev_p) {
+ cxio_hal_pblpool_destroy(rdev_p);
+ cxio_hal_rqtpool_destroy(rdev_p);
+ list_del(&rdev_p->entry);
+ cxio_hal_destroy_ctrl_qp(rdev_p);
+ cxio_hal_destroy_resource(rdev_p->rscp);
+ rdev_p->t3cdev_p->ulp = NULL;
+ }
+}
+
+int __init cxio_hal_init(void)
+{
+ if (cxio_hal_init_rhdl_resource(T3_MAX_NUM_RI))
+ return -ENOMEM;
+ t3_register_cpl_handler(CPL_ASYNC_NOTIF, cxio_hal_ev_handler);
+ return 0;
+}
+
+void __exit cxio_hal_exit(void)
+{
+ struct cxio_rdev *rdev, *tmp;
+
+ t3_register_cpl_handler(CPL_ASYNC_NOTIF, NULL);
+ list_for_each_entry_safe(rdev, tmp, &rdev_list, entry)
+ cxio_rdev_close(rdev);
+ cxio_hal_destroy_rhdl_resource();
+}
+
+static void flush_completed_wrs(struct t3_wq *wq, struct t3_cq *cq)
+{
+ struct t3_swsq *sqp;
+ __u32 ptr = wq->sq_rptr;
+ int count = Q_COUNT(wq->sq_rptr, wq->sq_wptr);
+
+ sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2);
+ while (count--)
+ if (!sqp->signaled) {
+ ptr++;
+ sqp = wq->sq + Q_PTR2IDX(ptr, wq->sq_size_log2);
+ } else if (sqp->complete) {
+
+ /*
+ * Insert this completed cqe into the swcq.
+ */
+ PDBG("%s moving cqe into swcq sq idx %ld cq idx %ld\n",
+ __func__, Q_PTR2IDX(ptr, wq->sq_size_log2),
+ Q_PTR2IDX(cq->sw_wptr, cq->size_log2));
+ sqp->cqe.header |= htonl(V_CQE_SWCQE(1));
+ *(cq->sw_queue + Q_PTR2IDX(cq->sw_wptr, cq->size_log2))
+ = sqp->cqe;
+ cq->sw_wptr++;
+ sqp->signaled = 0;
+ break;
+ } else
+ break;
+}
+
+static void create_read_req_cqe(struct t3_wq *wq, struct t3_cqe *hw_cqe,
+ struct t3_cqe *read_cqe)
+{
+ read_cqe->u.scqe.wrid_hi = wq->oldest_read->sq_wptr;
+ read_cqe->len = wq->oldest_read->read_len;
+ read_cqe->header = htonl(V_CQE_QPID(CQE_QPID(*hw_cqe)) |
+ V_CQE_SWCQE(SW_CQE(*hw_cqe)) |
+ V_CQE_OPCODE(T3_READ_REQ) |
+ V_CQE_TYPE(1));
+}
+
+/*
+ * Return a ptr to the next read wr in the SWSQ or NULL.
+ */
+static void advance_oldest_read(struct t3_wq *wq)
+{
+
+ u32 rptr = wq->oldest_read - wq->sq + 1;
+ u32 wptr = Q_PTR2IDX(wq->sq_wptr, wq->sq_size_log2);
+
+ while (Q_PTR2IDX(rptr, wq->sq_size_log2) != wptr) {
+ wq->oldest_read = wq->sq + Q_PTR2IDX(rptr, wq->sq_size_log2);
+
+ if (wq->oldest_read->opcode == T3_READ_REQ)
+ return;
+ rptr++;
+ }
+ wq->oldest_read = NULL;
+}
+
+/*
+ * cxio_poll_cq
+ *
+ * Caller must:
+ * check the validity of the first CQE,
+ * supply the wq associated with the qpid.
+ *
+ * credit: cq credit to return to sge.
+ * cqe_flushed: 1 iff the CQE is flushed.
+ * cqe: copy of the polled CQE.
+ *
+ * return value:
+ * 0 CQE returned,
+ * -1 CQE skipped, try again.
+ */
+int cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe,
+ u8 *cqe_flushed, u64 *cookie, u32 *credit)
+{
+ int ret = 0;
+ struct t3_cqe *hw_cqe, read_cqe;
+
+ *cqe_flushed = 0;
+ *credit = 0;
+ hw_cqe = cxio_next_cqe(cq);
+
+ PDBG("%s CQE OOO %d qpid 0x%0x genbit %d type %d status 0x%0x"
+ " opcode 0x%0x len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x\n",
+ __func__, CQE_OOO(*hw_cqe), CQE_QPID(*hw_cqe),
+ CQE_GENBIT(*hw_cqe), CQE_TYPE(*hw_cqe), CQE_STATUS(*hw_cqe),
+ CQE_OPCODE(*hw_cqe), CQE_LEN(*hw_cqe), CQE_WRID_HI(*hw_cqe),
+ CQE_WRID_LOW(*hw_cqe));
+
+ /*
+ * skip cqe's not affiliated with a QP.
+ */
+ if (wq == NULL) {
+ ret = -1;
+ goto skip_cqe;
+ }
+
+ /*
+ * Gotta tweak READ completions:
+ * 1) the cqe doesn't contain the sq_wptr from the wr.
+ * 2) opcode not reflected from the wr.
+ * 3) read_len not reflected from the wr.
+ * 4) cq_type is RQ_TYPE not SQ_TYPE.
+ */
+ if (RQ_TYPE(*hw_cqe) && (CQE_OPCODE(*hw_cqe) == T3_READ_RESP)) {
+
+ /*
+ * If this is an unsolicited read response, then the read
+ * was generated by the kernel driver as part of peer-2-peer
+ * connection setup. So ignore the completion.
+ */
+ if (!wq->oldest_read) {
+ if (CQE_STATUS(*hw_cqe))
+ wq->error = 1;
+ ret = -1;
+ goto skip_cqe;
+ }
+
+ /*
+ * Don't write to the HWCQ, so create a new read req CQE
+ * in local memory.
+ */
+ create_read_req_cqe(wq, hw_cqe, &read_cqe);
+ hw_cqe = &read_cqe;
+ advance_oldest_read(wq);
+ }
+
+ /*
+ * T3A: Discard TERMINATE CQEs.
+ */
+ if (CQE_OPCODE(*hw_cqe) == T3_TERMINATE) {
+ ret = -1;
+ wq->error = 1;
+ goto skip_cqe;
+ }
+
+ if (CQE_STATUS(*hw_cqe) || wq->error) {
+ *cqe_flushed = wq->error;
+ wq->error = 1;
+
+ /*
+ * T3A inserts errors into the CQE. We cannot return
+ * these as work completions.
+ */
+ /* incoming write failures */
+ if ((CQE_OPCODE(*hw_cqe) == T3_RDMA_WRITE)
+ && RQ_TYPE(*hw_cqe)) {
+ ret = -1;
+ goto skip_cqe;
+ }
+ /* incoming read request failures */
+ if ((CQE_OPCODE(*hw_cqe) == T3_READ_RESP) && SQ_TYPE(*hw_cqe)) {
+ ret = -1;
+ goto skip_cqe;
+ }
+
+ /* incoming SEND with no receive posted failures */
+ if (CQE_SEND_OPCODE(*hw_cqe) && RQ_TYPE(*hw_cqe) &&
+ Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) {
+ ret = -1;
+ goto skip_cqe;
+ }
+ BUG_ON((*cqe_flushed == 0) && !SW_CQE(*hw_cqe));
+ goto proc_cqe;
+ }
+
+ /*
+ * RECV completion.
+ */
+ if (RQ_TYPE(*hw_cqe)) {
+
+ /*
+ * HW only validates 4 bits of MSN. So we must validate that
+ * the MSN in the SEND is the next expected MSN. If it's not,
+ * then we complete this with TPT_ERR_MSN and mark the wq in
+ * error.
+ */
+
+ if (Q_EMPTY(wq->rq_rptr, wq->rq_wptr)) {
+ wq->error = 1;
+ ret = -1;
+ goto skip_cqe;
+ }
+
+ if (unlikely((CQE_WRID_MSN(*hw_cqe) != (wq->rq_rptr + 1)))) {
+ wq->error = 1;
+ hw_cqe->header |= htonl(V_CQE_STATUS(TPT_ERR_MSN));
+ goto proc_cqe;
+ }
+ goto proc_cqe;
+ }
+
+ /*
+ * If we get here its a send completion.
+ *
+ * Handle out of order completion. These get stuffed
+ * in the SW SQ. Then the SW SQ is walked to move any
+ * now in-order completions into the SW CQ. This handles
+ * 2 cases:
+ * 1) reaping unsignaled WRs when the first subsequent
+ * signaled WR is completed.
+ * 2) out of order read completions.
+ */
+ if (!SW_CQE(*hw_cqe) && (CQE_WRID_SQ_WPTR(*hw_cqe) != wq->sq_rptr)) {
+ struct t3_swsq *sqp;
+
+ PDBG("%s out of order completion going in swsq at idx %ld\n",
+ __func__,
+ Q_PTR2IDX(CQE_WRID_SQ_WPTR(*hw_cqe), wq->sq_size_log2));
+ sqp = wq->sq +
+ Q_PTR2IDX(CQE_WRID_SQ_WPTR(*hw_cqe), wq->sq_size_log2);
+ sqp->cqe = *hw_cqe;
+ sqp->complete = 1;
+ ret = -1;
+ goto flush_wq;
+ }
+
+proc_cqe:
+ *cqe = *hw_cqe;
+
+ /*
+ * Reap the associated WR(s) that are freed up with this
+ * completion.
+ */
+ if (SQ_TYPE(*hw_cqe)) {
+ wq->sq_rptr = CQE_WRID_SQ_WPTR(*hw_cqe);
+ PDBG("%s completing sq idx %ld\n", __func__,
+ Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2));
+ *cookie = wq->sq[Q_PTR2IDX(wq->sq_rptr, wq->sq_size_log2)].wr_id;
+ wq->sq_rptr++;
+ } else {
+ PDBG("%s completing rq idx %ld\n", __func__,
+ Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2));
+ *cookie = wq->rq[Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)].wr_id;
+ if (wq->rq[Q_PTR2IDX(wq->rq_rptr, wq->rq_size_log2)].pbl_addr)
+ cxio_hal_pblpool_free(wq->rdev,
+ wq->rq[Q_PTR2IDX(wq->rq_rptr,
+ wq->rq_size_log2)].pbl_addr, T3_STAG0_PBL_SIZE);
+ BUG_ON(Q_EMPTY(wq->rq_rptr, wq->rq_wptr));
+ wq->rq_rptr++;
+ }
+
+flush_wq:
+ /*
+ * Flush any completed cqes that are now in-order.
+ */
+ flush_completed_wrs(wq, cq);
+
+skip_cqe:
+ if (SW_CQE(*hw_cqe)) {
+ PDBG("%s cq %p cqid 0x%x skip sw cqe sw_rptr 0x%x\n",
+ __func__, cq, cq->cqid, cq->sw_rptr);
+ ++cq->sw_rptr;
+ } else {
+ PDBG("%s cq %p cqid 0x%x skip hw cqe rptr 0x%x\n",
+ __func__, cq, cq->cqid, cq->rptr);
+ ++cq->rptr;
+
+ /*
+ * T3A: compute credits.
+ */
+ if (((cq->rptr - cq->wptr) > (1 << (cq->size_log2 - 1)))
+ || ((cq->rptr - cq->wptr) >= 128)) {
+ *credit = cq->rptr - cq->wptr;
+ cq->wptr = cq->rptr;
+ }
+ }
+ return ret;
+}
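Editorial aside: the contract documented above cxio_poll_cq() leaves the validity check of the first CQE and the qpid-to-wq lookup to the caller, and distinguishes a returned CQE (0) from a skipped one (-1). A hedged sketch of a drain loop built on that contract follows; it is not the iwch provider's actual poll path, example_get_wq() and example_consume() are hypothetical, and cxio_next_cqe() is used the same way it is inside cxio_poll_cq() above.

/* Illustrative sketch only; example_get_wq()/example_consume() are hypothetical. */
static void example_drain_cq(struct cxio_rdev *rdev_p, struct t3_cq *cq)
{
	struct t3_cqe cqe;
	u64 cookie;
	u32 credit;
	u8 flushed;

	while (cxio_next_cqe(cq)) {		/* caller checks CQE validity */
		/* wq for CQE_QPID(); may be NULL for unaffiliated CQEs */
		struct t3_wq *wq = example_get_wq(cq);

		if (cxio_poll_cq(wq, cq, &cqe, &flushed, &cookie, &credit) == 0)
			example_consume(&cqe, cookie, flushed);
		/* a return of -1 means the CQE was skipped; just poll again */
		if (credit)
			cxio_hal_cq_op(rdev_p, cq, CQ_CREDIT_UPDATE, credit);
	}
}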
diff --git a/drivers/infiniband/hw/cxgb3/cxio_hal.h b/drivers/infiniband/hw/cxgb3/cxio_hal.h
new file mode 100644
index 000000000..78fbe9ffe
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/cxio_hal.h
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __CXIO_HAL_H__
+#define __CXIO_HAL_H__
+
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/kfifo.h>
+
+#include "t3_cpl.h"
+#include "t3cdev.h"
+#include "cxgb3_ctl_defs.h"
+#include "cxio_wr.h"
+
+#define T3_CTRL_QP_ID FW_RI_SGEEC_START
+#define T3_CTL_QP_TID FW_RI_TID_START
+#define T3_CTRL_QP_SIZE_LOG2 8
+#define T3_CTRL_CQ_ID 0
+
+#define T3_MAX_NUM_RI (1<<15)
+#define T3_MAX_NUM_QP (1<<15)
+#define T3_MAX_NUM_CQ (1<<15)
+#define T3_MAX_NUM_PD (1<<15)
+#define T3_MAX_PBL_SIZE 256
+#define T3_MAX_RQ_SIZE 1024
+#define T3_MAX_QP_DEPTH (T3_MAX_RQ_SIZE-1)
+#define T3_MAX_CQ_DEPTH 65536
+#define T3_MAX_NUM_STAG (1<<15)
+#define T3_MAX_MR_SIZE 0x100000000ULL
+#define T3_PAGESIZE_MASK 0xffff000 /* 4KB-128MB */
+
+#define T3_STAG_UNSET 0xffffffff
+
+#define T3_MAX_DEV_NAME_LEN 32
+
+#define CXIO_FW_MAJ 7
+
+struct cxio_hal_ctrl_qp {
+ u32 wptr;
+ u32 rptr;
+ struct mutex lock; /* for the wptr, can sleep */
+ wait_queue_head_t waitq;/* wait for RspQ/CQE msg */
+ union t3_wr *workq; /* the work request queue */
+ dma_addr_t dma_addr; /* pci bus address of the workq */
+ DEFINE_DMA_UNMAP_ADDR(mapping);
+ void __iomem *doorbell;
+};
+
+struct cxio_hal_resource {
+ struct kfifo tpt_fifo;
+ spinlock_t tpt_fifo_lock;
+ struct kfifo qpid_fifo;
+ spinlock_t qpid_fifo_lock;
+ struct kfifo cqid_fifo;
+ spinlock_t cqid_fifo_lock;
+ struct kfifo pdid_fifo;
+ spinlock_t pdid_fifo_lock;
+};
+
+struct cxio_qpid_list {
+ struct list_head entry;
+ u32 qpid;
+};
+
+struct cxio_ucontext {
+ struct list_head qpids;
+ struct mutex lock;
+};
+
+struct cxio_rdev {
+ char dev_name[T3_MAX_DEV_NAME_LEN];
+ struct t3cdev *t3cdev_p;
+ struct rdma_info rnic_info;
+ struct adap_ports port_info;
+ struct cxio_hal_resource *rscp;
+ struct cxio_hal_ctrl_qp ctrl_qp;
+ void *ulp;
+ unsigned long qpshift;
+ u32 qpnr;
+ u32 qpmask;
+ struct cxio_ucontext uctx;
+ struct gen_pool *pbl_pool;
+ struct gen_pool *rqt_pool;
+ struct list_head entry;
+ struct ch_embedded_info fw_info;
+ u32 flags;
+#define CXIO_ERROR_FATAL 1
+};
+
+static inline int cxio_fatal_error(struct cxio_rdev *rdev_p)
+{
+ return rdev_p->flags & CXIO_ERROR_FATAL;
+}
+
+static inline int cxio_num_stags(struct cxio_rdev *rdev_p)
+{
+ return min((int)T3_MAX_NUM_STAG, (int)((rdev_p->rnic_info.tpt_top - rdev_p->rnic_info.tpt_base) >> 5));
+}
+
+typedef void (*cxio_hal_ev_callback_func_t) (struct cxio_rdev * rdev_p,
+ struct sk_buff * skb);
+
+#define RSPQ_CQID(rsp) (be32_to_cpu(rsp->cq_ptrid) & 0xffff)
+#define RSPQ_CQPTR(rsp) ((be32_to_cpu(rsp->cq_ptrid) >> 16) & 0xffff)
+#define RSPQ_GENBIT(rsp) ((be32_to_cpu(rsp->flags) >> 16) & 1)
+#define RSPQ_OVERFLOW(rsp) ((be32_to_cpu(rsp->flags) >> 17) & 1)
+#define RSPQ_AN(rsp) ((be32_to_cpu(rsp->flags) >> 18) & 1)
+#define RSPQ_SE(rsp) ((be32_to_cpu(rsp->flags) >> 19) & 1)
+#define RSPQ_NOTIFY(rsp) ((be32_to_cpu(rsp->flags) >> 20) & 1)
+#define RSPQ_CQBRANCH(rsp) ((be32_to_cpu(rsp->flags) >> 21) & 1)
+#define RSPQ_CREDIT_THRESH(rsp) ((be32_to_cpu(rsp->flags) >> 22) & 1)
+
+struct respQ_msg_t {
+ __be32 flags; /* flit 0 */
+ __be32 cq_ptrid;
+ __be64 rsvd; /* flit 1 */
+ struct t3_cqe cqe; /* flits 2-3 */
+};
+
+enum t3_cq_opcode {
+ CQ_ARM_AN = 0x2,
+ CQ_ARM_SE = 0x6,
+ CQ_FORCE_AN = 0x3,
+ CQ_CREDIT_UPDATE = 0x7
+};
+
+int cxio_rdev_open(struct cxio_rdev *rdev);
+void cxio_rdev_close(struct cxio_rdev *rdev);
+int cxio_hal_cq_op(struct cxio_rdev *rdev, struct t3_cq *cq,
+ enum t3_cq_opcode op, u32 credit);
+int cxio_create_cq(struct cxio_rdev *rdev, struct t3_cq *cq, int kernel);
+int cxio_destroy_cq(struct cxio_rdev *rdev, struct t3_cq *cq);
+int cxio_resize_cq(struct cxio_rdev *rdev, struct t3_cq *cq);
+void cxio_release_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx);
+void cxio_init_ucontext(struct cxio_rdev *rdev, struct cxio_ucontext *uctx);
+int cxio_create_qp(struct cxio_rdev *rdev, u32 kernel_domain, struct t3_wq *wq,
+ struct cxio_ucontext *uctx);
+int cxio_destroy_qp(struct cxio_rdev *rdev, struct t3_wq *wq,
+ struct cxio_ucontext *uctx);
+int cxio_peek_cq(struct t3_wq *wr, struct t3_cq *cq, int opcode);
+int cxio_write_pbl(struct cxio_rdev *rdev_p, __be64 *pbl,
+ u32 pbl_addr, u32 pbl_size);
+int cxio_register_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid,
+ enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len,
+ u8 page_size, u32 pbl_size, u32 pbl_addr);
+int cxio_reregister_phys_mem(struct cxio_rdev *rdev, u32 * stag, u32 pdid,
+ enum tpt_mem_perm perm, u32 zbva, u64 to, u32 len,
+ u8 page_size, u32 pbl_size, u32 pbl_addr);
+int cxio_dereg_mem(struct cxio_rdev *rdev, u32 stag, u32 pbl_size,
+ u32 pbl_addr);
+int cxio_allocate_window(struct cxio_rdev *rdev, u32 * stag, u32 pdid);
+int cxio_allocate_stag(struct cxio_rdev *rdev, u32 *stag, u32 pdid, u32 pbl_size, u32 pbl_addr);
+int cxio_deallocate_window(struct cxio_rdev *rdev, u32 stag);
+int cxio_rdma_init(struct cxio_rdev *rdev, struct t3_rdma_init_attr *attr);
+void cxio_register_ev_cb(cxio_hal_ev_callback_func_t ev_cb);
+void cxio_unregister_ev_cb(cxio_hal_ev_callback_func_t ev_cb);
+u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp);
+void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid);
+int __init cxio_hal_init(void);
+void __exit cxio_hal_exit(void);
+int cxio_flush_rq(struct t3_wq *wq, struct t3_cq *cq, int count);
+int cxio_flush_sq(struct t3_wq *wq, struct t3_cq *cq, int count);
+void cxio_count_rcqes(struct t3_cq *cq, struct t3_wq *wq, int *count);
+void cxio_count_scqes(struct t3_cq *cq, struct t3_wq *wq, int *count);
+void cxio_flush_hw_cq(struct t3_cq *cq);
+int cxio_poll_cq(struct t3_wq *wq, struct t3_cq *cq, struct t3_cqe *cqe,
+ u8 *cqe_flushed, u64 *cookie, u32 *credit);
+int iwch_cxgb3_ofld_send(struct t3cdev *tdev, struct sk_buff *skb);
+
+#define MOD "iw_cxgb3: "
+#define PDBG(fmt, args...) pr_debug(MOD fmt, ## args)
+
+#ifdef DEBUG
+void cxio_dump_tpt(struct cxio_rdev *rev, u32 stag);
+void cxio_dump_pbl(struct cxio_rdev *rev, u32 pbl_addr, uint len, u8 shift);
+void cxio_dump_wqe(union t3_wr *wqe);
+void cxio_dump_wce(struct t3_cqe *wce);
+void cxio_dump_rqt(struct cxio_rdev *rdev, u32 hwtid, int nents);
+void cxio_dump_tcb(struct cxio_rdev *rdev, u32 hwtid);
+#endif
+
+#endif
diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.c b/drivers/infiniband/hw/cxgb3/cxio_resource.c
new file mode 100644
index 000000000..c40088ecf
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/cxio_resource.c
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/* Crude resource management */
+#include <linux/kernel.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/kfifo.h>
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include "cxio_resource.h"
+#include "cxio_hal.h"
+
+static struct kfifo rhdl_fifo;
+static spinlock_t rhdl_fifo_lock;
+
+#define RANDOM_SIZE 16
+
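+/*
+ * Fill @fifo with the ids [skip_low, nr - skip_high), optionally in a
+ * shuffled order (the shuffled variant is used for the TPT/STAG fifo set
+ * up below).  The skip_low + skip_high dummy entries pushed up front are
+ * popped again at the end, so only the usable ids remain queued.
+ */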
+static int __cxio_init_resource_fifo(struct kfifo *fifo,
+ spinlock_t *fifo_lock,
+ u32 nr, u32 skip_low,
+ u32 skip_high,
+ int random)
+{
+ u32 i, j, entry = 0, idx;
+ u32 random_bytes;
+ u32 rarray[16];
+ spin_lock_init(fifo_lock);
+
+ if (kfifo_alloc(fifo, nr * sizeof(u32), GFP_KERNEL))
+ return -ENOMEM;
+
+ for (i = 0; i < skip_low + skip_high; i++)
+ kfifo_in(fifo, (unsigned char *) &entry, sizeof(u32));
+ if (random) {
+ j = 0;
+ random_bytes = prandom_u32();
+ for (i = 0; i < RANDOM_SIZE; i++)
+ rarray[i] = i + skip_low;
+ for (i = skip_low + RANDOM_SIZE; i < nr - skip_high; i++) {
+ if (j >= RANDOM_SIZE) {
+ j = 0;
+ random_bytes = prandom_u32();
+ }
+ idx = (random_bytes >> (j * 2)) & 0xF;
+ kfifo_in(fifo,
+ (unsigned char *) &rarray[idx],
+ sizeof(u32));
+ rarray[idx] = i;
+ j++;
+ }
+ for (i = 0; i < RANDOM_SIZE; i++)
+ kfifo_in(fifo,
+ (unsigned char *) &rarray[i],
+ sizeof(u32));
+ } else
+ for (i = skip_low; i < nr - skip_high; i++)
+ kfifo_in(fifo, (unsigned char *) &i, sizeof(u32));
+
+ for (i = 0; i < skip_low + skip_high; i++)
+ if (kfifo_out_locked(fifo, (unsigned char *) &entry,
+ sizeof(u32), fifo_lock) != sizeof(u32))
+ break;
+ return 0;
+}
+
+static int cxio_init_resource_fifo(struct kfifo *fifo, spinlock_t * fifo_lock,
+ u32 nr, u32 skip_low, u32 skip_high)
+{
+ return (__cxio_init_resource_fifo(fifo, fifo_lock, nr, skip_low,
+ skip_high, 0));
+}
+
+static int cxio_init_resource_fifo_random(struct kfifo *fifo,
+ spinlock_t * fifo_lock,
+ u32 nr, u32 skip_low, u32 skip_high)
+{
+
+ return (__cxio_init_resource_fifo(fifo, fifo_lock, nr, skip_low,
+ skip_high, 1));
+}
+
+static int cxio_init_qpid_fifo(struct cxio_rdev *rdev_p)
+{
+ u32 i;
+
+ spin_lock_init(&rdev_p->rscp->qpid_fifo_lock);
+
+ if (kfifo_alloc(&rdev_p->rscp->qpid_fifo, T3_MAX_NUM_QP * sizeof(u32),
+ GFP_KERNEL))
+ return -ENOMEM;
+
+ for (i = 16; i < T3_MAX_NUM_QP; i++)
+ if (!(i & rdev_p->qpmask))
+ kfifo_in(&rdev_p->rscp->qpid_fifo,
+ (unsigned char *) &i, sizeof(u32));
+ return 0;
+}
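+
+/*
+ * Only qpids of 16 or above that are aligned to (qpmask + 1) are queued
+ * above, so ids below 16 are never handed out and consecutive QPs are
+ * spaced apart; qpshift and qpmask in struct cxio_rdev describe the same
+ * spacing, which is what keeps each QP's user doorbell on its own page.
+ */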
+
+int cxio_hal_init_rhdl_resource(u32 nr_rhdl)
+{
+ return cxio_init_resource_fifo(&rhdl_fifo, &rhdl_fifo_lock, nr_rhdl, 1,
+ 0);
+}
+
+void cxio_hal_destroy_rhdl_resource(void)
+{
+ kfifo_free(&rhdl_fifo);
+}
+
+/* nr_* must be power of 2 */
+int cxio_hal_init_resource(struct cxio_rdev *rdev_p,
+ u32 nr_tpt, u32 nr_pbl,
+ u32 nr_rqt, u32 nr_qpid, u32 nr_cqid, u32 nr_pdid)
+{
+ int err = 0;
+ struct cxio_hal_resource *rscp;
+
+ rscp = kmalloc(sizeof(*rscp), GFP_KERNEL);
+ if (!rscp)
+ return -ENOMEM;
+ rdev_p->rscp = rscp;
+ err = cxio_init_resource_fifo_random(&rscp->tpt_fifo,
+ &rscp->tpt_fifo_lock,
+ nr_tpt, 1, 0);
+ if (err)
+ goto tpt_err;
+ err = cxio_init_qpid_fifo(rdev_p);
+ if (err)
+ goto qpid_err;
+ err = cxio_init_resource_fifo(&rscp->cqid_fifo, &rscp->cqid_fifo_lock,
+ nr_cqid, 1, 0);
+ if (err)
+ goto cqid_err;
+ err = cxio_init_resource_fifo(&rscp->pdid_fifo, &rscp->pdid_fifo_lock,
+ nr_pdid, 1, 0);
+ if (err)
+ goto pdid_err;
+ return 0;
+pdid_err:
+ kfifo_free(&rscp->cqid_fifo);
+cqid_err:
+ kfifo_free(&rscp->qpid_fifo);
+qpid_err:
+ kfifo_free(&rscp->tpt_fifo);
+tpt_err:
+ return -ENOMEM;
+}
+
+/*
+ * returns 0 if no resource available
+ */
+static u32 cxio_hal_get_resource(struct kfifo *fifo, spinlock_t * lock)
+{
+ u32 entry;
+ if (kfifo_out_locked(fifo, (unsigned char *) &entry, sizeof(u32), lock))
+ return entry;
+ else
+ return 0; /* fifo empty */
+}
+
+static void cxio_hal_put_resource(struct kfifo *fifo, spinlock_t * lock,
+ u32 entry)
+{
+ BUG_ON(
+ kfifo_in_locked(fifo, (unsigned char *) &entry, sizeof(u32), lock)
+ == 0);
+}
+
+u32 cxio_hal_get_stag(struct cxio_hal_resource *rscp)
+{
+ return cxio_hal_get_resource(&rscp->tpt_fifo, &rscp->tpt_fifo_lock);
+}
+
+void cxio_hal_put_stag(struct cxio_hal_resource *rscp, u32 stag)
+{
+ cxio_hal_put_resource(&rscp->tpt_fifo, &rscp->tpt_fifo_lock, stag);
+}
+
+u32 cxio_hal_get_qpid(struct cxio_hal_resource *rscp)
+{
+ u32 qpid = cxio_hal_get_resource(&rscp->qpid_fifo,
+ &rscp->qpid_fifo_lock);
+ PDBG("%s qpid 0x%x\n", __func__, qpid);
+ return qpid;
+}
+
+void cxio_hal_put_qpid(struct cxio_hal_resource *rscp, u32 qpid)
+{
+ PDBG("%s qpid 0x%x\n", __func__, qpid);
+ cxio_hal_put_resource(&rscp->qpid_fifo, &rscp->qpid_fifo_lock, qpid);
+}
+
+u32 cxio_hal_get_cqid(struct cxio_hal_resource *rscp)
+{
+ return cxio_hal_get_resource(&rscp->cqid_fifo, &rscp->cqid_fifo_lock);
+}
+
+void cxio_hal_put_cqid(struct cxio_hal_resource *rscp, u32 cqid)
+{
+ cxio_hal_put_resource(&rscp->cqid_fifo, &rscp->cqid_fifo_lock, cqid);
+}
+
+u32 cxio_hal_get_pdid(struct cxio_hal_resource *rscp)
+{
+ return cxio_hal_get_resource(&rscp->pdid_fifo, &rscp->pdid_fifo_lock);
+}
+
+void cxio_hal_put_pdid(struct cxio_hal_resource *rscp, u32 pdid)
+{
+ cxio_hal_put_resource(&rscp->pdid_fifo, &rscp->pdid_fifo_lock, pdid);
+}
+
+void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp)
+{
+ kfifo_free(&rscp->tpt_fifo);
+ kfifo_free(&rscp->cqid_fifo);
+ kfifo_free(&rscp->qpid_fifo);
+ kfifo_free(&rscp->pdid_fifo);
+ kfree(rscp);
+}
+
+/*
+ * PBL Memory Manager. Uses Linux generic allocator.
+ */
+
+#define MIN_PBL_SHIFT 8 /* 256B == min PBL size (32 entries) */
+
+u32 cxio_hal_pblpool_alloc(struct cxio_rdev *rdev_p, int size)
+{
+ unsigned long addr = gen_pool_alloc(rdev_p->pbl_pool, size);
+ PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size);
+ return (u32)addr;
+}
+
+void cxio_hal_pblpool_free(struct cxio_rdev *rdev_p, u32 addr, int size)
+{
+ PDBG("%s addr 0x%x size %d\n", __func__, addr, size);
+ gen_pool_free(rdev_p->pbl_pool, (unsigned long)addr, size);
+}
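+
+/*
+ * Illustrative usage (a sketch, not driver code; npages is a placeholder
+ * for the number of 8-byte page-list entries a caller needs):
+ *
+ *	u32 pbl_addr = cxio_hal_pblpool_alloc(rdev_p, npages << 3);
+ *	if (!pbl_addr)
+ *		return -ENOMEM;
+ *	... write the entries, e.g. via cxio_write_pbl() ...
+ *	cxio_hal_pblpool_free(rdev_p, pbl_addr, npages << 3);
+ *
+ * gen_pool_alloc() returns 0 when the pool is exhausted, which is why 0
+ * doubles as the failure value here.
+ */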
+
+int cxio_hal_pblpool_create(struct cxio_rdev *rdev_p)
+{
+ unsigned pbl_start, pbl_chunk;
+
+ rdev_p->pbl_pool = gen_pool_create(MIN_PBL_SHIFT, -1);
+ if (!rdev_p->pbl_pool)
+ return -ENOMEM;
+
+ pbl_start = rdev_p->rnic_info.pbl_base;
+ pbl_chunk = rdev_p->rnic_info.pbl_top - pbl_start + 1;
+
+ while (pbl_start < rdev_p->rnic_info.pbl_top) {
+ pbl_chunk = min(rdev_p->rnic_info.pbl_top - pbl_start + 1,
+ pbl_chunk);
+ if (gen_pool_add(rdev_p->pbl_pool, pbl_start, pbl_chunk, -1)) {
+ PDBG("%s failed to add PBL chunk (%x/%x)\n",
+ __func__, pbl_start, pbl_chunk);
+ if (pbl_chunk <= 1024 << MIN_PBL_SHIFT) {
+ printk(KERN_WARNING MOD "%s: Failed to add all PBL chunks (%x/%x)\n",
+ __func__, pbl_start, rdev_p->rnic_info.pbl_top - pbl_start);
+ return 0;
+ }
+ pbl_chunk >>= 1;
+ } else {
+ PDBG("%s added PBL chunk (%x/%x)\n",
+ __func__, pbl_start, pbl_chunk);
+ pbl_start += pbl_chunk;
+ }
+ }
+
+ return 0;
+}
+
+void cxio_hal_pblpool_destroy(struct cxio_rdev *rdev_p)
+{
+ gen_pool_destroy(rdev_p->pbl_pool);
+}
+
+/*
+ * RQT Memory Manager. Uses Linux generic allocator.
+ */
+
+#define MIN_RQT_SHIFT 10 /* 1KB == min RQT size (16 entries) */
+#define RQT_CHUNK (2 * 1024 * 1024)
+
+u32 cxio_hal_rqtpool_alloc(struct cxio_rdev *rdev_p, int size)
+{
+ unsigned long addr = gen_pool_alloc(rdev_p->rqt_pool, size << 6);
+ PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size << 6);
+ return (u32)addr;
+}
+
+void cxio_hal_rqtpool_free(struct cxio_rdev *rdev_p, u32 addr, int size)
+{
+ PDBG("%s addr 0x%x size %d\n", __func__, addr, size << 6);
+ gen_pool_free(rdev_p->rqt_pool, (unsigned long)addr, size << 6);
+}
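+
+/*
+ * The shift by 6 above treats "size" as a count of 64-byte RQ entries
+ * rather than bytes (consistent with the MIN_RQT_SHIFT comment: 1KB for
+ * 16 entries).  A hypothetical pairing, with rqsize as a placeholder:
+ *
+ *	u32 rq_addr = cxio_hal_rqtpool_alloc(rdev_p, rqsize);
+ *	if (!rq_addr)
+ *		return -ENOMEM;
+ *	...
+ *	cxio_hal_rqtpool_free(rdev_p, rq_addr, rqsize);
+ */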
+
+int cxio_hal_rqtpool_create(struct cxio_rdev *rdev_p)
+{
+ unsigned long i;
+ rdev_p->rqt_pool = gen_pool_create(MIN_RQT_SHIFT, -1);
+ if (rdev_p->rqt_pool)
+ for (i = rdev_p->rnic_info.rqt_base;
+ i <= rdev_p->rnic_info.rqt_top - RQT_CHUNK + 1;
+ i += RQT_CHUNK)
+ gen_pool_add(rdev_p->rqt_pool, i, RQT_CHUNK, -1);
+ return rdev_p->rqt_pool ? 0 : -ENOMEM;
+}
+
+void cxio_hal_rqtpool_destroy(struct cxio_rdev *rdev_p)
+{
+ gen_pool_destroy(rdev_p->rqt_pool);
+}
diff --git a/drivers/infiniband/hw/cxgb3/cxio_resource.h b/drivers/infiniband/hw/cxgb3/cxio_resource.h
new file mode 100644
index 000000000..a2703a3d8
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/cxio_resource.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __CXIO_RESOURCE_H__
+#define __CXIO_RESOURCE_H__
+
+#include <linux/kernel.h>
+#include <linux/random.h>
+#include <linux/slab.h>
+#include <linux/kfifo.h>
+#include <linux/spinlock.h>
+#include <linux/errno.h>
+#include <linux/genalloc.h>
+#include "cxio_hal.h"
+
+extern int cxio_hal_init_rhdl_resource(u32 nr_rhdl);
+extern void cxio_hal_destroy_rhdl_resource(void);
+extern int cxio_hal_init_resource(struct cxio_rdev *rdev_p,
+ u32 nr_tpt, u32 nr_pbl,
+ u32 nr_rqt, u32 nr_qpid, u32 nr_cqid,
+ u32 nr_pdid);
+extern u32 cxio_hal_get_stag(struct cxio_hal_resource *rscp);
+extern void cxio_hal_put_stag(struct cxio_hal_resource *rscp, u32 stag);
+extern u32 cxio_hal_get_qpid(struct cxio_hal_resource *rscp);
+extern void cxio_hal_put_qpid(struct cxio_hal_resource *rscp, u32 qpid);
+extern u32 cxio_hal_get_cqid(struct cxio_hal_resource *rscp);
+extern void cxio_hal_put_cqid(struct cxio_hal_resource *rscp, u32 cqid);
+extern void cxio_hal_destroy_resource(struct cxio_hal_resource *rscp);
+
+#define PBL_OFF(rdev_p, a) ( (a) - (rdev_p)->rnic_info.pbl_base )
+extern int cxio_hal_pblpool_create(struct cxio_rdev *rdev_p);
+extern void cxio_hal_pblpool_destroy(struct cxio_rdev *rdev_p);
+extern u32 cxio_hal_pblpool_alloc(struct cxio_rdev *rdev_p, int size);
+extern void cxio_hal_pblpool_free(struct cxio_rdev *rdev_p, u32 addr, int size);
+
+#define RQT_OFF(rdev_p, a) ( (a) - (rdev_p)->rnic_info.rqt_base )
+extern int cxio_hal_rqtpool_create(struct cxio_rdev *rdev_p);
+extern void cxio_hal_rqtpool_destroy(struct cxio_rdev *rdev_p);
+extern u32 cxio_hal_rqtpool_alloc(struct cxio_rdev *rdev_p, int size);
+extern void cxio_hal_rqtpool_free(struct cxio_rdev *rdev_p, u32 addr, int size);
+#endif
diff --git a/drivers/infiniband/hw/cxgb3/cxio_wr.h b/drivers/infiniband/hw/cxgb3/cxio_wr.h
new file mode 100644
index 000000000..83d2e19d3
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/cxio_wr.h
@@ -0,0 +1,802 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __CXIO_WR_H__
+#define __CXIO_WR_H__
+
+#include <asm/io.h>
+#include <linux/pci.h>
+#include <linux/timer.h>
+#include "firmware_exports.h"
+
+#define T3_MAX_SGE 4
+#define T3_MAX_INLINE 64
+#define T3_STAG0_PBL_SIZE (2 * T3_MAX_SGE << 3)
+#define T3_STAG0_MAX_PBE_LEN (128 * 1024 * 1024)
+#define T3_STAG0_PAGE_SHIFT 15
+
+#define Q_EMPTY(rptr,wptr) ((rptr)==(wptr))
+#define Q_FULL(rptr,wptr,size_log2) ( (((wptr)-(rptr))>>(size_log2)) && \
+ ((rptr)!=(wptr)) )
+#define Q_GENBIT(ptr,size_log2) (!(((ptr)>>size_log2)&0x1))
+#define Q_FREECNT(rptr,wptr,size_log2) ((1UL<<size_log2)-((wptr)-(rptr)))
+#define Q_COUNT(rptr,wptr) ((wptr)-(rptr))
+#define Q_PTR2IDX(ptr,size_log2) (ptr & ((1UL<<size_log2)-1))
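+
+/*
+ * Illustrative sketch (not driver code): rptr and wptr are free-running
+ * 32-bit counters over a queue of 2^size_log2 slots, so no explicit
+ * "full" flag is needed.  A hypothetical producer of one slot:
+ *
+ *	if (!Q_FREECNT(rptr, wptr, size_log2))
+ *		return -ENOMEM;
+ *	idx = Q_PTR2IDX(wptr, size_log2);
+ *	... build the entry at queue[idx] ..., then wptr++;
+ *
+ * Q_GENBIT(ptr, size_log2) flips every time the pointer wraps, which is
+ * how valid hardware CQEs are told apart from stale ones later in this
+ * file (see CQ_VLD_ENTRY).
+ */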
+
+static inline void ring_doorbell(void __iomem *doorbell, u32 qpid)
+{
+ writel(((1<<31) | qpid), doorbell);
+}
+
+#define SEQ32_GE(x,y) (!( (((u32) (x)) - ((u32) (y))) & 0x80000000 ))
+
+enum t3_wr_flags {
+ T3_COMPLETION_FLAG = 0x01,
+ T3_NOTIFY_FLAG = 0x02,
+ T3_SOLICITED_EVENT_FLAG = 0x04,
+ T3_READ_FENCE_FLAG = 0x08,
+ T3_LOCAL_FENCE_FLAG = 0x10
+} __attribute__ ((packed));
+
+enum t3_wr_opcode {
+ T3_WR_BP = FW_WROPCODE_RI_BYPASS,
+ T3_WR_SEND = FW_WROPCODE_RI_SEND,
+ T3_WR_WRITE = FW_WROPCODE_RI_RDMA_WRITE,
+ T3_WR_READ = FW_WROPCODE_RI_RDMA_READ,
+ T3_WR_INV_STAG = FW_WROPCODE_RI_LOCAL_INV,
+ T3_WR_BIND = FW_WROPCODE_RI_BIND_MW,
+ T3_WR_RCV = FW_WROPCODE_RI_RECEIVE,
+ T3_WR_INIT = FW_WROPCODE_RI_RDMA_INIT,
+ T3_WR_QP_MOD = FW_WROPCODE_RI_MODIFY_QP,
+ T3_WR_FASTREG = FW_WROPCODE_RI_FASTREGISTER_MR
+} __attribute__ ((packed));
+
+enum t3_rdma_opcode {
+ T3_RDMA_WRITE, /* IETF RDMAP v1.0 ... */
+ T3_READ_REQ,
+ T3_READ_RESP,
+ T3_SEND,
+ T3_SEND_WITH_INV,
+ T3_SEND_WITH_SE,
+ T3_SEND_WITH_SE_INV,
+ T3_TERMINATE,
+ T3_RDMA_INIT, /* CHELSIO RI specific ... */
+ T3_BIND_MW,
+ T3_FAST_REGISTER,
+ T3_LOCAL_INV,
+ T3_QP_MOD,
+ T3_BYPASS,
+ T3_RDMA_READ_REQ_WITH_INV,
+} __attribute__ ((packed));
+
+static inline enum t3_rdma_opcode wr2opcode(enum t3_wr_opcode wrop)
+{
+ switch (wrop) {
+ case T3_WR_BP: return T3_BYPASS;
+ case T3_WR_SEND: return T3_SEND;
+ case T3_WR_WRITE: return T3_RDMA_WRITE;
+ case T3_WR_READ: return T3_READ_REQ;
+ case T3_WR_INV_STAG: return T3_LOCAL_INV;
+ case T3_WR_BIND: return T3_BIND_MW;
+ case T3_WR_INIT: return T3_RDMA_INIT;
+ case T3_WR_QP_MOD: return T3_QP_MOD;
+ case T3_WR_FASTREG: return T3_FAST_REGISTER;
+ default: break;
+ }
+ return -1;
+}
+
+
+/* Work request id */
+union t3_wrid {
+ struct {
+ u32 hi;
+ u32 low;
+ } id0;
+ u64 id1;
+};
+
+#define WRID(wrid) (wrid.id1)
+#define WRID_GEN(wrid) (wrid.id0.wr_gen)
+#define WRID_IDX(wrid) (wrid.id0.wr_idx)
+#define WRID_LO(wrid) (wrid.id0.wr_lo)
+
+struct fw_riwrh {
+ __be32 op_seop_flags;
+ __be32 gen_tid_len;
+};
+
+#define S_FW_RIWR_OP 24
+#define M_FW_RIWR_OP 0xff
+#define V_FW_RIWR_OP(x) ((x) << S_FW_RIWR_OP)
+#define G_FW_RIWR_OP(x) ((((x) >> S_FW_RIWR_OP)) & M_FW_RIWR_OP)
+
+#define S_FW_RIWR_SOPEOP 22
+#define M_FW_RIWR_SOPEOP 0x3
+#define V_FW_RIWR_SOPEOP(x) ((x) << S_FW_RIWR_SOPEOP)
+
+#define S_FW_RIWR_FLAGS 8
+#define M_FW_RIWR_FLAGS 0x3fffff
+#define V_FW_RIWR_FLAGS(x) ((x) << S_FW_RIWR_FLAGS)
+#define G_FW_RIWR_FLAGS(x) ((((x) >> S_FW_RIWR_FLAGS)) & M_FW_RIWR_FLAGS)
+
+#define S_FW_RIWR_TID 8
+#define V_FW_RIWR_TID(x) ((x) << S_FW_RIWR_TID)
+
+#define S_FW_RIWR_LEN 0
+#define V_FW_RIWR_LEN(x) ((x) << S_FW_RIWR_LEN)
+
+#define S_FW_RIWR_GEN 31
+#define V_FW_RIWR_GEN(x) ((x) << S_FW_RIWR_GEN)
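+
+/*
+ * Field-macro convention used throughout this header: S_FOO is the bit
+ * offset of field FOO, M_FOO its width mask, V_FOO(x) shifts a value into
+ * place and G_FOO(x) extracts it again.  For example, recovering the
+ * opcode from a work request header:
+ *
+ *	op = G_FW_RIWR_OP(be32_to_cpu(wqe->op_seop_flags));
+ *
+ * which is exactly what fw_riwrh_opcode() further down does.
+ */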
+
+struct t3_sge {
+ __be32 stag;
+ __be32 len;
+ __be64 to;
+};
+
+/* If num_sgle is zero, flit 5+ contains immediate data. */
+struct t3_send_wr {
+ struct fw_riwrh wrh; /* 0 */
+ union t3_wrid wrid; /* 1 */
+
+ u8 rdmaop; /* 2 */
+ u8 reserved[3];
+ __be32 rem_stag;
+ __be32 plen; /* 3 */
+ __be32 num_sgle;
+ struct t3_sge sgl[T3_MAX_SGE]; /* 4+ */
+};
+
+#define T3_MAX_FASTREG_DEPTH 10
+#define T3_MAX_FASTREG_FRAG 10
+
+struct t3_fastreg_wr {
+ struct fw_riwrh wrh; /* 0 */
+ union t3_wrid wrid; /* 1 */
+ __be32 stag; /* 2 */
+ __be32 len;
+ __be32 va_base_hi; /* 3 */
+ __be32 va_base_lo_fbo;
+ __be32 page_type_perms; /* 4 */
+ __be32 reserved1;
+ __be64 pbl_addrs[0]; /* 5+ */
+};
+
+/*
+ * If a fastreg wr spans multiple wqes, then the 2nd fragment looks like this.
+ */
+struct t3_pbl_frag {
+ struct fw_riwrh wrh; /* 0 */
+ __be64 pbl_addrs[14]; /* 1..14 */
+};
+
+#define S_FR_PAGE_COUNT 24
+#define M_FR_PAGE_COUNT 0xff
+#define V_FR_PAGE_COUNT(x) ((x) << S_FR_PAGE_COUNT)
+#define G_FR_PAGE_COUNT(x) ((((x) >> S_FR_PAGE_COUNT)) & M_FR_PAGE_COUNT)
+
+#define S_FR_PAGE_SIZE 16
+#define M_FR_PAGE_SIZE 0x1f
+#define V_FR_PAGE_SIZE(x) ((x) << S_FR_PAGE_SIZE)
+#define G_FR_PAGE_SIZE(x) ((((x) >> S_FR_PAGE_SIZE)) & M_FR_PAGE_SIZE)
+
+#define S_FR_TYPE 8
+#define M_FR_TYPE 0x1
+#define V_FR_TYPE(x) ((x) << S_FR_TYPE)
+#define G_FR_TYPE(x) ((((x) >> S_FR_TYPE)) & M_FR_TYPE)
+
+#define S_FR_PERMS 0
+#define M_FR_PERMS 0xff
+#define V_FR_PERMS(x) ((x) << S_FR_PERMS)
+#define G_FR_PERMS(x) ((((x) >> S_FR_PERMS)) & M_FR_PERMS)
+
+struct t3_local_inv_wr {
+ struct fw_riwrh wrh; /* 0 */
+ union t3_wrid wrid; /* 1 */
+ __be32 stag; /* 2 */
+ __be32 reserved;
+};
+
+struct t3_rdma_write_wr {
+ struct fw_riwrh wrh; /* 0 */
+ union t3_wrid wrid; /* 1 */
+ u8 rdmaop; /* 2 */
+ u8 reserved[3];
+ __be32 stag_sink;
+ __be64 to_sink; /* 3 */
+ __be32 plen; /* 4 */
+ __be32 num_sgle;
+ struct t3_sge sgl[T3_MAX_SGE]; /* 5+ */
+};
+
+struct t3_rdma_read_wr {
+ struct fw_riwrh wrh; /* 0 */
+ union t3_wrid wrid; /* 1 */
+ u8 rdmaop; /* 2 */
+ u8 local_inv;
+ u8 reserved[2];
+ __be32 rem_stag;
+ __be64 rem_to; /* 3 */
+ __be32 local_stag; /* 4 */
+ __be32 local_len;
+ __be64 local_to; /* 5 */
+};
+
+struct t3_bind_mw_wr {
+ struct fw_riwrh wrh; /* 0 */
+ union t3_wrid wrid; /* 1 */
+ u16 reserved; /* 2 */
+ u8 type;
+ u8 perms;
+ __be32 mr_stag;
+ __be32 mw_stag; /* 3 */
+ __be32 mw_len;
+ __be64 mw_va; /* 4 */
+ __be32 mr_pbl_addr; /* 5 */
+ u8 reserved2[3];
+ u8 mr_pagesz;
+};
+
+struct t3_receive_wr {
+ struct fw_riwrh wrh; /* 0 */
+ union t3_wrid wrid; /* 1 */
+ u8 pagesz[T3_MAX_SGE];
+ __be32 num_sgle; /* 2 */
+ struct t3_sge sgl[T3_MAX_SGE]; /* 3+ */
+ __be32 pbl_addr[T3_MAX_SGE];
+};
+
+struct t3_bypass_wr {
+ struct fw_riwrh wrh;
+ union t3_wrid wrid; /* 1 */
+};
+
+struct t3_modify_qp_wr {
+ struct fw_riwrh wrh; /* 0 */
+ union t3_wrid wrid; /* 1 */
+ __be32 flags; /* 2 */
+ __be32 quiesce; /* 2 */
+ __be32 max_ird; /* 3 */
+ __be32 max_ord; /* 3 */
+ __be64 sge_cmd; /* 4 */
+ __be64 ctx1; /* 5 */
+ __be64 ctx0; /* 6 */
+};
+
+enum t3_modify_qp_flags {
+ MODQP_QUIESCE = 0x01,
+ MODQP_MAX_IRD = 0x02,
+ MODQP_MAX_ORD = 0x04,
+ MODQP_WRITE_EC = 0x08,
+ MODQP_READ_EC = 0x10,
+};
+
+
+enum t3_mpa_attrs {
+ uP_RI_MPA_RX_MARKER_ENABLE = 0x1,
+ uP_RI_MPA_TX_MARKER_ENABLE = 0x2,
+ uP_RI_MPA_CRC_ENABLE = 0x4,
+ uP_RI_MPA_IETF_ENABLE = 0x8
+} __attribute__ ((packed));
+
+enum t3_qp_caps {
+ uP_RI_QP_RDMA_READ_ENABLE = 0x01,
+ uP_RI_QP_RDMA_WRITE_ENABLE = 0x02,
+ uP_RI_QP_BIND_ENABLE = 0x04,
+ uP_RI_QP_FAST_REGISTER_ENABLE = 0x08,
+ uP_RI_QP_STAG0_ENABLE = 0x10
+} __attribute__ ((packed));
+
+enum rdma_init_rtr_types {
+ RTR_READ = 1,
+ RTR_WRITE = 2,
+ RTR_SEND = 3,
+};
+
+#define S_RTR_TYPE 2
+#define M_RTR_TYPE 0x3
+#define V_RTR_TYPE(x) ((x) << S_RTR_TYPE)
+#define G_RTR_TYPE(x) ((((x) >> S_RTR_TYPE)) & M_RTR_TYPE)
+
+#define S_CHAN 4
+#define M_CHAN 0x3
+#define V_CHAN(x) ((x) << S_CHAN)
+#define G_CHAN(x) ((((x) >> S_CHAN)) & M_CHAN)
+
+struct t3_rdma_init_attr {
+ u32 tid;
+ u32 qpid;
+ u32 pdid;
+ u32 scqid;
+ u32 rcqid;
+ u32 rq_addr;
+ u32 rq_size;
+ enum t3_mpa_attrs mpaattrs;
+ enum t3_qp_caps qpcaps;
+ u16 tcp_emss;
+ u32 ord;
+ u32 ird;
+ u64 qp_dma_addr;
+ u32 qp_dma_size;
+ enum rdma_init_rtr_types rtr_type;
+ u16 flags;
+ u16 rqe_count;
+ u32 irs;
+ u32 chan;
+};
+
+struct t3_rdma_init_wr {
+ struct fw_riwrh wrh; /* 0 */
+ union t3_wrid wrid; /* 1 */
+ __be32 qpid; /* 2 */
+ __be32 pdid;
+ __be32 scqid; /* 3 */
+ __be32 rcqid;
+ __be32 rq_addr; /* 4 */
+ __be32 rq_size;
+ u8 mpaattrs; /* 5 */
+ u8 qpcaps;
+ __be16 ulpdu_size;
+ __be16 flags_rtr_type;
+ __be16 rqe_count;
+ __be32 ord; /* 6 */
+ __be32 ird;
+ __be64 qp_dma_addr; /* 7 */
+ __be32 qp_dma_size; /* 8 */
+ __be32 irs;
+};
+
+struct t3_genbit {
+ u64 flit[15];
+ __be64 genbit;
+};
+
+struct t3_wq_in_err {
+ u64 flit[13];
+ u64 err;
+};
+
+enum rdma_init_wr_flags {
+ MPA_INITIATOR = (1<<0),
+ PRIV_QP = (1<<1),
+};
+
+union t3_wr {
+ struct t3_send_wr send;
+ struct t3_rdma_write_wr write;
+ struct t3_rdma_read_wr read;
+ struct t3_receive_wr recv;
+ struct t3_fastreg_wr fastreg;
+ struct t3_pbl_frag pbl_frag;
+ struct t3_local_inv_wr local_inv;
+ struct t3_bind_mw_wr bind;
+ struct t3_bypass_wr bypass;
+ struct t3_rdma_init_wr init;
+ struct t3_modify_qp_wr qp_mod;
+ struct t3_genbit genbit;
+ struct t3_wq_in_err wq_in_err;
+ __be64 flit[16];
+};
+
+#define T3_SQ_CQE_FLIT 13
+#define T3_SQ_COOKIE_FLIT 14
+
+#define T3_RQ_COOKIE_FLIT 13
+#define T3_RQ_CQE_FLIT 14
+
+static inline enum t3_wr_opcode fw_riwrh_opcode(struct fw_riwrh *wqe)
+{
+ return G_FW_RIWR_OP(be32_to_cpu(wqe->op_seop_flags));
+}
+
+enum t3_wr_hdr_bits {
+ T3_EOP = 1,
+ T3_SOP = 2,
+ T3_SOPEOP = T3_EOP|T3_SOP,
+};
+
+static inline void build_fw_riwrh(struct fw_riwrh *wqe, enum t3_wr_opcode op,
+ enum t3_wr_flags flags, u8 genbit, u32 tid,
+ u8 len, u8 sopeop)
+{
+ wqe->op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(op) |
+ V_FW_RIWR_SOPEOP(sopeop) |
+ V_FW_RIWR_FLAGS(flags));
+ wmb();
+ wqe->gen_tid_len = cpu_to_be32(V_FW_RIWR_GEN(genbit) |
+ V_FW_RIWR_TID(tid) |
+ V_FW_RIWR_LEN(len));
+ /* 2nd gen bit... */
+ ((union t3_wr *)wqe)->genbit.genbit = cpu_to_be64(genbit);
+}
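+
+/*
+ * Hypothetical caller (sketch only; wqe, wptr, size_log2, tid and
+ * flit_cnt are placeholders): the WR body is filled in first and the
+ * header is stamped last, the wmb() above ensuring the opcode/flags word
+ * is visible before the generation/length word:
+ *
+ *	... build the union t3_wr body ...
+ *	build_fw_riwrh((struct fw_riwrh *)wqe, T3_WR_SEND, T3_COMPLETION_FLAG,
+ *		       Q_GENBIT(wptr, size_log2), tid, flit_cnt, T3_SOPEOP);
+ */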
+
+/*
+ * T3 ULP2_TX commands
+ */
+enum t3_utx_mem_op {
+ T3_UTX_MEM_READ = 2,
+ T3_UTX_MEM_WRITE = 3
+};
+
+/* T3 MC7 RDMA TPT entry format */
+
+enum tpt_mem_type {
+ TPT_NON_SHARED_MR = 0x0,
+ TPT_SHARED_MR = 0x1,
+ TPT_MW = 0x2,
+ TPT_MW_RELAXED_PROTECTION = 0x3
+};
+
+enum tpt_addr_type {
+ TPT_ZBTO = 0,
+ TPT_VATO = 1
+};
+
+enum tpt_mem_perm {
+ TPT_MW_BIND = 0x10,
+ TPT_LOCAL_READ = 0x8,
+ TPT_LOCAL_WRITE = 0x4,
+ TPT_REMOTE_READ = 0x2,
+ TPT_REMOTE_WRITE = 0x1
+};
+
+struct tpt_entry {
+ __be32 valid_stag_pdid;
+ __be32 flags_pagesize_qpid;
+
+ __be32 rsvd_pbl_addr;
+ __be32 len;
+ __be32 va_hi;
+ __be32 va_low_or_fbo;
+
+ __be32 rsvd_bind_cnt_or_pstag;
+ __be32 rsvd_pbl_size;
+};
+
+#define S_TPT_VALID 31
+#define V_TPT_VALID(x) ((x) << S_TPT_VALID)
+#define F_TPT_VALID V_TPT_VALID(1U)
+
+#define S_TPT_STAG_KEY 23
+#define M_TPT_STAG_KEY 0xFF
+#define V_TPT_STAG_KEY(x) ((x) << S_TPT_STAG_KEY)
+#define G_TPT_STAG_KEY(x) (((x) >> S_TPT_STAG_KEY) & M_TPT_STAG_KEY)
+
+#define S_TPT_STAG_STATE 22
+#define V_TPT_STAG_STATE(x) ((x) << S_TPT_STAG_STATE)
+#define F_TPT_STAG_STATE V_TPT_STAG_STATE(1U)
+
+#define S_TPT_STAG_TYPE 20
+#define M_TPT_STAG_TYPE 0x3
+#define V_TPT_STAG_TYPE(x) ((x) << S_TPT_STAG_TYPE)
+#define G_TPT_STAG_TYPE(x) (((x) >> S_TPT_STAG_TYPE) & M_TPT_STAG_TYPE)
+
+#define S_TPT_PDID 0
+#define M_TPT_PDID 0xFFFFF
+#define V_TPT_PDID(x) ((x) << S_TPT_PDID)
+#define G_TPT_PDID(x) (((x) >> S_TPT_PDID) & M_TPT_PDID)
+
+#define S_TPT_PERM 28
+#define M_TPT_PERM 0xF
+#define V_TPT_PERM(x) ((x) << S_TPT_PERM)
+#define G_TPT_PERM(x) (((x) >> S_TPT_PERM) & M_TPT_PERM)
+
+#define S_TPT_REM_INV_DIS 27
+#define V_TPT_REM_INV_DIS(x) ((x) << S_TPT_REM_INV_DIS)
+#define F_TPT_REM_INV_DIS V_TPT_REM_INV_DIS(1U)
+
+#define S_TPT_ADDR_TYPE 26
+#define V_TPT_ADDR_TYPE(x) ((x) << S_TPT_ADDR_TYPE)
+#define F_TPT_ADDR_TYPE V_TPT_ADDR_TYPE(1U)
+
+#define S_TPT_MW_BIND_ENABLE 25
+#define V_TPT_MW_BIND_ENABLE(x) ((x) << S_TPT_MW_BIND_ENABLE)
+#define F_TPT_MW_BIND_ENABLE V_TPT_MW_BIND_ENABLE(1U)
+
+#define S_TPT_PAGE_SIZE 20
+#define M_TPT_PAGE_SIZE 0x1F
+#define V_TPT_PAGE_SIZE(x) ((x) << S_TPT_PAGE_SIZE)
+#define G_TPT_PAGE_SIZE(x) (((x) >> S_TPT_PAGE_SIZE) & M_TPT_PAGE_SIZE)
+
+#define S_TPT_PBL_ADDR 0
+#define M_TPT_PBL_ADDR 0x1FFFFFFF
+#define V_TPT_PBL_ADDR(x) ((x) << S_TPT_PBL_ADDR)
+#define G_TPT_PBL_ADDR(x) (((x) >> S_TPT_PBL_ADDR) & M_TPT_PBL_ADDR)
+
+#define S_TPT_QPID 0
+#define M_TPT_QPID 0xFFFFF
+#define V_TPT_QPID(x) ((x) << S_TPT_QPID)
+#define G_TPT_QPID(x) (((x) >> S_TPT_QPID) & M_TPT_QPID)
+
+#define S_TPT_PSTAG 0
+#define M_TPT_PSTAG 0xFFFFFF
+#define V_TPT_PSTAG(x) ((x) << S_TPT_PSTAG)
+#define G_TPT_PSTAG(x) (((x) >> S_TPT_PSTAG) & M_TPT_PSTAG)
+
+#define S_TPT_PBL_SIZE 0
+#define M_TPT_PBL_SIZE 0xFFFFF
+#define V_TPT_PBL_SIZE(x) ((x) << S_TPT_PBL_SIZE)
+#define G_TPT_PBL_SIZE(x) (((x) >> S_TPT_PBL_SIZE) & M_TPT_PBL_SIZE)
+
+/*
+ * CQE defs
+ */
+struct t3_cqe {
+ __be32 header;
+ __be32 len;
+ union {
+ struct {
+ __be32 stag;
+ __be32 msn;
+ } rcqe;
+ struct {
+ u32 wrid_hi;
+ u32 wrid_low;
+ } scqe;
+ } u;
+};
+
+#define S_CQE_OOO 31
+#define M_CQE_OOO 0x1
+#define G_CQE_OOO(x) ((((x) >> S_CQE_OOO)) & M_CQE_OOO)
+#define V_CEQ_OOO(x) ((x)<<S_CQE_OOO)
+
+#define S_CQE_QPID 12
+#define M_CQE_QPID 0x7FFFF
+#define G_CQE_QPID(x) ((((x) >> S_CQE_QPID)) & M_CQE_QPID)
+#define V_CQE_QPID(x) ((x)<<S_CQE_QPID)
+
+#define S_CQE_SWCQE 11
+#define M_CQE_SWCQE 0x1
+#define G_CQE_SWCQE(x) ((((x) >> S_CQE_SWCQE)) & M_CQE_SWCQE)
+#define V_CQE_SWCQE(x) ((x)<<S_CQE_SWCQE)
+
+#define S_CQE_GENBIT 10
+#define M_CQE_GENBIT 0x1
+#define G_CQE_GENBIT(x) (((x) >> S_CQE_GENBIT) & M_CQE_GENBIT)
+#define V_CQE_GENBIT(x) ((x)<<S_CQE_GENBIT)
+
+#define S_CQE_STATUS 5
+#define M_CQE_STATUS 0x1F
+#define G_CQE_STATUS(x) ((((x) >> S_CQE_STATUS)) & M_CQE_STATUS)
+#define V_CQE_STATUS(x) ((x)<<S_CQE_STATUS)
+
+#define S_CQE_TYPE 4
+#define M_CQE_TYPE 0x1
+#define G_CQE_TYPE(x) ((((x) >> S_CQE_TYPE)) & M_CQE_TYPE)
+#define V_CQE_TYPE(x) ((x)<<S_CQE_TYPE)
+
+#define S_CQE_OPCODE 0
+#define M_CQE_OPCODE 0xF
+#define G_CQE_OPCODE(x) ((((x) >> S_CQE_OPCODE)) & M_CQE_OPCODE)
+#define V_CQE_OPCODE(x) ((x)<<S_CQE_OPCODE)
+
+#define SW_CQE(x) (G_CQE_SWCQE(be32_to_cpu((x).header)))
+#define CQE_OOO(x) (G_CQE_OOO(be32_to_cpu((x).header)))
+#define CQE_QPID(x) (G_CQE_QPID(be32_to_cpu((x).header)))
+#define CQE_GENBIT(x) (G_CQE_GENBIT(be32_to_cpu((x).header)))
+#define CQE_TYPE(x) (G_CQE_TYPE(be32_to_cpu((x).header)))
+#define SQ_TYPE(x) (CQE_TYPE((x)))
+#define RQ_TYPE(x) (!CQE_TYPE((x)))
+#define CQE_STATUS(x) (G_CQE_STATUS(be32_to_cpu((x).header)))
+#define CQE_OPCODE(x) (G_CQE_OPCODE(be32_to_cpu((x).header)))
+
+#define CQE_SEND_OPCODE(x)( \
+ (G_CQE_OPCODE(be32_to_cpu((x).header)) == T3_SEND) || \
+ (G_CQE_OPCODE(be32_to_cpu((x).header)) == T3_SEND_WITH_SE) || \
+ (G_CQE_OPCODE(be32_to_cpu((x).header)) == T3_SEND_WITH_INV) || \
+ (G_CQE_OPCODE(be32_to_cpu((x).header)) == T3_SEND_WITH_SE_INV))
+
+#define CQE_LEN(x) (be32_to_cpu((x).len))
+
+/* used for RQ completion processing */
+#define CQE_WRID_STAG(x) (be32_to_cpu((x).u.rcqe.stag))
+#define CQE_WRID_MSN(x) (be32_to_cpu((x).u.rcqe.msn))
+
+/* used for SQ completion processing */
+#define CQE_WRID_SQ_WPTR(x) ((x).u.scqe.wrid_hi)
+#define CQE_WRID_WPTR(x) ((x).u.scqe.wrid_low)
+
+/* generic accessor macros */
+#define CQE_WRID_HI(x) ((x).u.scqe.wrid_hi)
+#define CQE_WRID_LOW(x) ((x).u.scqe.wrid_low)
+
+#define TPT_ERR_SUCCESS 0x0
+#define TPT_ERR_STAG 0x1 /* STAG invalid: either the */
+ /* STAG is off limit, is 0, */
+ /* or STAG_key mismatch */
+#define TPT_ERR_PDID 0x2 /* PDID mismatch */
+#define TPT_ERR_QPID 0x3 /* QPID mismatch */
+#define TPT_ERR_ACCESS 0x4 /* Invalid access right */
+#define TPT_ERR_WRAP 0x5 /* Wrap error */
+#define TPT_ERR_BOUND 0x6 /* base and bounds violation */
+#define TPT_ERR_INVALIDATE_SHARED_MR 0x7 /* attempt to invalidate a */
+ /* shared memory region */
+#define TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND 0x8 /* attempt to invalidate an */
+ /* MR with a bound memory window */
+#define TPT_ERR_ECC 0x9 /* ECC error detected */
+#define TPT_ERR_ECC_PSTAG 0xA /* ECC error detected when */
+ /* reading PSTAG for a MW */
+ /* Invalidate */
+#define TPT_ERR_PBL_ADDR_BOUND 0xB /* pbl addr out of bounds: */
+ /* software error */
+#define TPT_ERR_SWFLUSH 0xC /* SW FLUSHED */
+#define TPT_ERR_CRC 0x10 /* CRC error */
+#define TPT_ERR_MARKER 0x11 /* Marker error */
+#define TPT_ERR_PDU_LEN_ERR 0x12 /* invalid PDU length */
+#define TPT_ERR_OUT_OF_RQE 0x13 /* out of RQE */
+#define TPT_ERR_DDP_VERSION 0x14 /* wrong DDP version */
+#define TPT_ERR_RDMA_VERSION 0x15 /* wrong RDMA version */
+#define TPT_ERR_OPCODE 0x16 /* invalid rdma opcode */
+#define TPT_ERR_DDP_QUEUE_NUM 0x17 /* invalid ddp queue number */
+#define TPT_ERR_MSN 0x18 /* MSN error */
+#define TPT_ERR_TBIT 0x19 /* tag bit not set correctly */
+#define TPT_ERR_MO 0x1A /* MO not 0 for TERMINATE */
+ /* or READ_REQ */
+#define TPT_ERR_MSN_GAP 0x1B
+#define TPT_ERR_MSN_RANGE 0x1C
+#define TPT_ERR_IRD_OVERFLOW 0x1D
+#define TPT_ERR_RQE_ADDR_BOUND 0x1E /* RQE addr out of bounds: */
+ /* software error */
+#define TPT_ERR_INTERNAL_ERR 0x1F /* internal error (opcode */
+ /* mismatch) */
+
+struct t3_swsq {
+ __u64 wr_id;
+ struct t3_cqe cqe;
+ __u32 sq_wptr;
+ __be32 read_len;
+ int opcode;
+ int complete;
+ int signaled;
+};
+
+struct t3_swrq {
+ __u64 wr_id;
+ __u32 pbl_addr;
+};
+
+/*
+ * A T3 WQ implements both the SQ and RQ.
+ */
+struct t3_wq {
+ union t3_wr *queue; /* DMA accessible memory */
+ dma_addr_t dma_addr; /* DMA address for HW */
+ DEFINE_DMA_UNMAP_ADDR(mapping); /* unmap cruft */
+ u32 error; /* 1 once we go to ERROR */
+ u32 qpid;
+ u32 wptr; /* idx to next available WR slot */
+ u32 size_log2; /* total wq size */
+ struct t3_swsq *sq; /* SW SQ */
+ struct t3_swsq *oldest_read; /* tracks oldest pending read */
+ u32 sq_wptr; /* sq_wptr - sq_rptr == count of */
+ u32 sq_rptr; /* pending wrs */
+ u32 sq_size_log2; /* sq size */
+ struct t3_swrq *rq; /* SW RQ (holds consumer wr_ids) */
+ u32 rq_wptr; /* rq_wptr - rq_rptr == count of */
+ u32 rq_rptr; /* pending wrs */
+ struct t3_swrq *rq_oldest_wr; /* oldest wr on the SW RQ */
+ u32 rq_size_log2; /* rq size */
+ u32 rq_addr; /* rq adapter address */
+ void __iomem *doorbell; /* kernel db */
+ u64 udb; /* user db if any */
+ struct cxio_rdev *rdev;
+};
+
+struct t3_cq {
+ u32 cqid;
+ u32 rptr;
+ u32 wptr;
+ u32 size_log2;
+ dma_addr_t dma_addr;
+ DEFINE_DMA_UNMAP_ADDR(mapping);
+ struct t3_cqe *queue;
+ struct t3_cqe *sw_queue;
+ u32 sw_rptr;
+ u32 sw_wptr;
+};
+
+#define CQ_VLD_ENTRY(ptr,size_log2,cqe) (Q_GENBIT(ptr,size_log2) == \
+ CQE_GENBIT(*cqe))
+
+struct t3_cq_status_page {
+ u32 cq_err;
+};
+
+static inline int cxio_cq_in_error(struct t3_cq *cq)
+{
+ return ((struct t3_cq_status_page *)
+ &cq->queue[1 << cq->size_log2])->cq_err;
+}
+
+static inline void cxio_set_cq_in_error(struct t3_cq *cq)
+{
+ ((struct t3_cq_status_page *)
+ &cq->queue[1 << cq->size_log2])->cq_err = 1;
+}
+
+static inline void cxio_set_wq_in_error(struct t3_wq *wq)
+{
+ wq->queue->wq_in_err.err |= 1;
+}
+
+static inline void cxio_disable_wq_db(struct t3_wq *wq)
+{
+ wq->queue->wq_in_err.err |= 2;
+}
+
+static inline void cxio_enable_wq_db(struct t3_wq *wq)
+{
+ wq->queue->wq_in_err.err &= ~2;
+}
+
+static inline int cxio_wq_db_enabled(struct t3_wq *wq)
+{
+ return !(wq->queue->wq_in_err.err & 2);
+}
+
+static inline struct t3_cqe *cxio_next_hw_cqe(struct t3_cq *cq)
+{
+ struct t3_cqe *cqe;
+
+ cqe = cq->queue + (Q_PTR2IDX(cq->rptr, cq->size_log2));
+ if (CQ_VLD_ENTRY(cq->rptr, cq->size_log2, cqe))
+ return cqe;
+ return NULL;
+}
+
+static inline struct t3_cqe *cxio_next_sw_cqe(struct t3_cq *cq)
+{
+ struct t3_cqe *cqe;
+
+ if (!Q_EMPTY(cq->sw_rptr, cq->sw_wptr)) {
+ cqe = cq->sw_queue + (Q_PTR2IDX(cq->sw_rptr, cq->size_log2));
+ return cqe;
+ }
+ return NULL;
+}
+
+static inline struct t3_cqe *cxio_next_cqe(struct t3_cq *cq)
+{
+ struct t3_cqe *cqe;
+
+ if (!Q_EMPTY(cq->sw_rptr, cq->sw_wptr)) {
+ cqe = cq->sw_queue + (Q_PTR2IDX(cq->sw_rptr, cq->size_log2));
+ return cqe;
+ }
+ cqe = cq->queue + (Q_PTR2IDX(cq->rptr, cq->size_log2));
+ if (CQ_VLD_ENTRY(cq->rptr, cq->size_log2, cqe))
+ return cqe;
+ return NULL;
+}
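+
+/*
+ * Illustrative consumer loop (a sketch, not driver code): software CQEs
+ * are always drained ahead of hardware CQEs, and consuming an entry just
+ * advances the matching read pointer:
+ *
+ *	while ((cqe = cxio_next_cqe(cq)) != NULL) {
+ *		... handle *cqe ...
+ *		if (SW_CQE(*cqe))
+ *			cq->sw_rptr++;
+ *		else
+ *			cq->rptr++;
+ *	}
+ *
+ * The real consumer is cxio_poll_cq(), declared in cxio_hal.h.
+ */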
+
+#endif
diff --git a/drivers/infiniband/hw/cxgb3/iwch.c b/drivers/infiniband/hw/cxgb3/iwch.c
new file mode 100644
index 000000000..8e77dc543
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/iwch.c
@@ -0,0 +1,292 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "cxgb3_offload.h"
+#include "iwch_provider.h"
+#include "iwch_user.h"
+#include "iwch.h"
+#include "iwch_cm.h"
+
+#define DRV_VERSION "1.1"
+
+MODULE_AUTHOR("Boyd Faulkner, Steve Wise");
+MODULE_DESCRIPTION("Chelsio T3 RDMA Driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+static void open_rnic_dev(struct t3cdev *);
+static void close_rnic_dev(struct t3cdev *);
+static void iwch_event_handler(struct t3cdev *, u32, u32);
+
+struct cxgb3_client t3c_client = {
+ .name = "iw_cxgb3",
+ .add = open_rnic_dev,
+ .remove = close_rnic_dev,
+ .handlers = t3c_handlers,
+ .redirect = iwch_ep_redirect,
+ .event_handler = iwch_event_handler
+};
+
+static LIST_HEAD(dev_list);
+static DEFINE_MUTEX(dev_mutex);
+
+static int disable_qp_db(int id, void *p, void *data)
+{
+ struct iwch_qp *qhp = p;
+
+ cxio_disable_wq_db(&qhp->wq);
+ return 0;
+}
+
+static int enable_qp_db(int id, void *p, void *data)
+{
+ struct iwch_qp *qhp = p;
+
+ if (data)
+ ring_doorbell(qhp->rhp->rdev.ctrl_qp.doorbell, qhp->wq.qpid);
+ cxio_enable_wq_db(&qhp->wq);
+ return 0;
+}
+
+static void disable_dbs(struct iwch_dev *rnicp)
+{
+ spin_lock_irq(&rnicp->lock);
+ idr_for_each(&rnicp->qpidr, disable_qp_db, NULL);
+ spin_unlock_irq(&rnicp->lock);
+}
+
+static void enable_dbs(struct iwch_dev *rnicp, int ring_db)
+{
+ spin_lock_irq(&rnicp->lock);
+ idr_for_each(&rnicp->qpidr, enable_qp_db,
+ (void *)(unsigned long)ring_db);
+ spin_unlock_irq(&rnicp->lock);
+}
+
+static void iwch_db_drop_task(struct work_struct *work)
+{
+ struct iwch_dev *rnicp = container_of(work, struct iwch_dev,
+ db_drop_task.work);
+ enable_dbs(rnicp, 1);
+}
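+
+/*
+ * Doorbell flow control: OFFLOAD_DB_FULL stops the driver from ringing
+ * per-QP doorbells, OFFLOAD_DB_EMPTY re-enables them (and re-rings each
+ * QP so queued work is noticed), and OFFLOAD_DB_DROP does the same after
+ * a short randomized delay via db_drop_task.  See iwch_event_handler()
+ * below.
+ */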
+
+static void rnic_init(struct iwch_dev *rnicp)
+{
+ PDBG("%s iwch_dev %p\n", __func__, rnicp);
+ idr_init(&rnicp->cqidr);
+ idr_init(&rnicp->qpidr);
+ idr_init(&rnicp->mmidr);
+ spin_lock_init(&rnicp->lock);
+ INIT_DELAYED_WORK(&rnicp->db_drop_task, iwch_db_drop_task);
+
+ rnicp->attr.max_qps = T3_MAX_NUM_QP - 32;
+ rnicp->attr.max_wrs = T3_MAX_QP_DEPTH;
+ rnicp->attr.max_sge_per_wr = T3_MAX_SGE;
+ rnicp->attr.max_sge_per_rdma_write_wr = T3_MAX_SGE;
+ rnicp->attr.max_cqs = T3_MAX_NUM_CQ - 1;
+ rnicp->attr.max_cqes_per_cq = T3_MAX_CQ_DEPTH;
+ rnicp->attr.max_mem_regs = cxio_num_stags(&rnicp->rdev);
+ rnicp->attr.max_phys_buf_entries = T3_MAX_PBL_SIZE;
+ rnicp->attr.max_pds = T3_MAX_NUM_PD - 1;
+ rnicp->attr.mem_pgsizes_bitmask = T3_PAGESIZE_MASK;
+ rnicp->attr.max_mr_size = T3_MAX_MR_SIZE;
+ rnicp->attr.can_resize_wq = 0;
+ rnicp->attr.max_rdma_reads_per_qp = 8;
+ rnicp->attr.max_rdma_read_resources =
+ rnicp->attr.max_rdma_reads_per_qp * rnicp->attr.max_qps;
+ rnicp->attr.max_rdma_read_qp_depth = 8; /* IRD */
+ rnicp->attr.max_rdma_read_depth =
+ rnicp->attr.max_rdma_read_qp_depth * rnicp->attr.max_qps;
+ rnicp->attr.rq_overflow_handled = 0;
+ rnicp->attr.can_modify_ird = 0;
+ rnicp->attr.can_modify_ord = 0;
+ rnicp->attr.max_mem_windows = rnicp->attr.max_mem_regs - 1;
+ rnicp->attr.stag0_value = 1;
+ rnicp->attr.zbva_support = 1;
+ rnicp->attr.local_invalidate_fence = 1;
+ rnicp->attr.cq_overflow_detection = 1;
+ return;
+}
+
+static void open_rnic_dev(struct t3cdev *tdev)
+{
+ struct iwch_dev *rnicp;
+
+ PDBG("%s t3cdev %p\n", __func__, tdev);
+ printk_once(KERN_INFO MOD "Chelsio T3 RDMA Driver - version %s\n",
+ DRV_VERSION);
+ rnicp = (struct iwch_dev *)ib_alloc_device(sizeof(*rnicp));
+ if (!rnicp) {
+ printk(KERN_ERR MOD "Cannot allocate ib device\n");
+ return;
+ }
+ rnicp->rdev.ulp = rnicp;
+ rnicp->rdev.t3cdev_p = tdev;
+
+ mutex_lock(&dev_mutex);
+
+ if (cxio_rdev_open(&rnicp->rdev)) {
+ mutex_unlock(&dev_mutex);
+ printk(KERN_ERR MOD "Unable to open CXIO rdev\n");
+ ib_dealloc_device(&rnicp->ibdev);
+ return;
+ }
+
+ rnic_init(rnicp);
+
+ list_add_tail(&rnicp->entry, &dev_list);
+ mutex_unlock(&dev_mutex);
+
+ if (iwch_register_device(rnicp)) {
+ printk(KERN_ERR MOD "Unable to register device\n");
+ close_rnic_dev(tdev);
+ }
+ printk(KERN_INFO MOD "Initialized device %s\n",
+ pci_name(rnicp->rdev.rnic_info.pdev));
+ return;
+}
+
+static void close_rnic_dev(struct t3cdev *tdev)
+{
+ struct iwch_dev *dev, *tmp;
+ PDBG("%s t3cdev %p\n", __func__, tdev);
+ mutex_lock(&dev_mutex);
+ list_for_each_entry_safe(dev, tmp, &dev_list, entry) {
+ if (dev->rdev.t3cdev_p == tdev) {
+ dev->rdev.flags = CXIO_ERROR_FATAL;
+ synchronize_net();
+ cancel_delayed_work_sync(&dev->db_drop_task);
+ list_del(&dev->entry);
+ iwch_unregister_device(dev);
+ cxio_rdev_close(&dev->rdev);
+ idr_destroy(&dev->cqidr);
+ idr_destroy(&dev->qpidr);
+ idr_destroy(&dev->mmidr);
+ ib_dealloc_device(&dev->ibdev);
+ break;
+ }
+ }
+ mutex_unlock(&dev_mutex);
+}
+
+static void iwch_event_handler(struct t3cdev *tdev, u32 evt, u32 port_id)
+{
+ struct cxio_rdev *rdev = tdev->ulp;
+ struct iwch_dev *rnicp;
+ struct ib_event event;
+ u32 portnum = port_id + 1;
+ int dispatch = 0;
+
+ if (!rdev)
+ return;
+ rnicp = rdev_to_iwch_dev(rdev);
+ switch (evt) {
+ case OFFLOAD_STATUS_DOWN: {
+ rdev->flags = CXIO_ERROR_FATAL;
+ synchronize_net();
+ event.event = IB_EVENT_DEVICE_FATAL;
+ dispatch = 1;
+ break;
+ }
+ case OFFLOAD_PORT_DOWN: {
+ event.event = IB_EVENT_PORT_ERR;
+ dispatch = 1;
+ break;
+ }
+ case OFFLOAD_PORT_UP: {
+ event.event = IB_EVENT_PORT_ACTIVE;
+ dispatch = 1;
+ break;
+ }
+ case OFFLOAD_DB_FULL: {
+ disable_dbs(rnicp);
+ break;
+ }
+ case OFFLOAD_DB_EMPTY: {
+ enable_dbs(rnicp, 1);
+ break;
+ }
+ case OFFLOAD_DB_DROP: {
+ unsigned long delay = 1000;
+ unsigned short r;
+
+ disable_dbs(rnicp);
+ get_random_bytes(&r, 2);
+ delay += r & 1023;
+
+ /*
+ * delay is between 1000 and 2023 usecs.
+ */
+ schedule_delayed_work(&rnicp->db_drop_task,
+ usecs_to_jiffies(delay));
+ break;
+ }
+ }
+
+ if (dispatch) {
+ event.device = &rnicp->ibdev;
+ event.element.port_num = portnum;
+ ib_dispatch_event(&event);
+ }
+
+ return;
+}
+
+static int __init iwch_init_module(void)
+{
+ int err;
+
+ err = cxio_hal_init();
+ if (err)
+ return err;
+ err = iwch_cm_init();
+ if (err)
+ return err;
+ cxio_register_ev_cb(iwch_ev_dispatch);
+ cxgb3_register_client(&t3c_client);
+ return 0;
+}
+
+static void __exit iwch_exit_module(void)
+{
+ cxgb3_unregister_client(&t3c_client);
+ cxio_unregister_ev_cb(iwch_ev_dispatch);
+ iwch_cm_term();
+ cxio_hal_exit();
+}
+
+module_init(iwch_init_module);
+module_exit(iwch_exit_module);
diff --git a/drivers/infiniband/hw/cxgb3/iwch.h b/drivers/infiniband/hw/cxgb3/iwch.h
new file mode 100644
index 000000000..837862287
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/iwch.h
@@ -0,0 +1,180 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __IWCH_H__
+#define __IWCH_H__
+
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+#include <linux/workqueue.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "cxio_hal.h"
+#include "cxgb3_offload.h"
+
+struct iwch_pd;
+struct iwch_cq;
+struct iwch_qp;
+struct iwch_mr;
+
+struct iwch_rnic_attributes {
+ u32 max_qps;
+ u32 max_wrs; /* Max for any SQ/RQ */
+ u32 max_sge_per_wr;
+ u32 max_sge_per_rdma_write_wr; /* for RDMA Write WR */
+ u32 max_cqs;
+ u32 max_cqes_per_cq;
+ u32 max_mem_regs;
+ u32 max_phys_buf_entries; /* for phys buf list */
+ u32 max_pds;
+
+ /*
+ * The memory page sizes supported by this RNIC.
+ * Bit position i in bitmap indicates page of
+ * size (4k)^i. Phys block list mode unsupported.
+ */
+ u32 mem_pgsizes_bitmask;
+ u64 max_mr_size;
+ u8 can_resize_wq;
+
+ /*
+ * The maximum number of RDMA Reads that can be outstanding
+ * per QP with this RNIC as the target.
+ */
+ u32 max_rdma_reads_per_qp;
+
+ /*
+ * The maximum number of resources used for RDMA Reads
+ * by this RNIC with this RNIC as the target.
+ */
+ u32 max_rdma_read_resources;
+
+ /*
+ * The max depth per QP for initiation of RDMA Read
+ * by this RNIC.
+ */
+ u32 max_rdma_read_qp_depth;
+
+ /*
+ * The maximum depth for initiation of RDMA Read
+ * operations by this RNIC on all QPs
+ */
+ u32 max_rdma_read_depth;
+ u8 rq_overflow_handled;
+ u32 can_modify_ird;
+ u32 can_modify_ord;
+ u32 max_mem_windows;
+ u32 stag0_value;
+ u8 zbva_support;
+ u8 local_invalidate_fence;
+ u32 cq_overflow_detection;
+};
+
+struct iwch_dev {
+ struct ib_device ibdev;
+ struct cxio_rdev rdev;
+ u32 device_cap_flags;
+ struct iwch_rnic_attributes attr;
+ struct idr cqidr;
+ struct idr qpidr;
+ struct idr mmidr;
+ spinlock_t lock;
+ struct list_head entry;
+ struct delayed_work db_drop_task;
+};
+
+static inline struct iwch_dev *to_iwch_dev(struct ib_device *ibdev)
+{
+ return container_of(ibdev, struct iwch_dev, ibdev);
+}
+
+static inline struct iwch_dev *rdev_to_iwch_dev(struct cxio_rdev *rdev)
+{
+ return container_of(rdev, struct iwch_dev, rdev);
+}
+
+static inline int t3b_device(const struct iwch_dev *rhp)
+{
+ return rhp->rdev.t3cdev_p->type == T3B;
+}
+
+static inline int t3a_device(const struct iwch_dev *rhp)
+{
+ return rhp->rdev.t3cdev_p->type == T3A;
+}
+
+static inline struct iwch_cq *get_chp(struct iwch_dev *rhp, u32 cqid)
+{
+ return idr_find(&rhp->cqidr, cqid);
+}
+
+static inline struct iwch_qp *get_qhp(struct iwch_dev *rhp, u32 qpid)
+{
+ return idr_find(&rhp->qpidr, qpid);
+}
+
+static inline struct iwch_mr *get_mhp(struct iwch_dev *rhp, u32 mmid)
+{
+ return idr_find(&rhp->mmidr, mmid);
+}
+
+static inline int insert_handle(struct iwch_dev *rhp, struct idr *idr,
+ void *handle, u32 id)
+{
+ int ret;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock_irq(&rhp->lock);
+
+ ret = idr_alloc(idr, handle, id, id + 1, GFP_NOWAIT);
+
+ spin_unlock_irq(&rhp->lock);
+ idr_preload_end();
+
+ BUG_ON(ret == -ENOSPC);
+ return ret < 0 ? ret : 0;
+}
+
+static inline void remove_handle(struct iwch_dev *rhp, struct idr *idr, u32 id)
+{
+ spin_lock_irq(&rhp->lock);
+ idr_remove(idr, id);
+ spin_unlock_irq(&rhp->lock);
+}
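+
+/*
+ * Illustrative pairing (a sketch, not driver code): objects are keyed by
+ * their hardware id, so a hypothetical QP is tracked roughly like this:
+ *
+ *	err = insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.qpid);
+ *	...
+ *	qhp = get_qhp(rhp, qpid);
+ *	...
+ *	remove_handle(rhp, &rhp->qpidr, qhp->wq.qpid);
+ */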
+
+extern struct cxgb3_client t3c_client;
+extern cxgb3_cpl_handler_func t3c_handlers[NUM_CPL_CMDS];
+extern void iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct sk_buff *skb);
+
+#endif
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c
new file mode 100644
index 000000000..cb78b1e9b
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c
@@ -0,0 +1,2272 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include <linux/skbuff.h>
+#include <linux/timer.h>
+#include <linux/notifier.h>
+#include <linux/inetdevice.h>
+
+#include <net/neighbour.h>
+#include <net/netevent.h>
+#include <net/route.h>
+
+#include "tcb.h"
+#include "cxgb3_offload.h"
+#include "iwch.h"
+#include "iwch_provider.h"
+#include "iwch_cm.h"
+
+static char *states[] = {
+ "idle",
+ "listen",
+ "connecting",
+ "mpa_wait_req",
+ "mpa_req_sent",
+ "mpa_req_rcvd",
+ "mpa_rep_sent",
+ "fpdu_mode",
+ "aborting",
+ "closing",
+ "moribund",
+ "dead",
+ NULL,
+};
+
+int peer2peer = 0;
+module_param(peer2peer, int, 0644);
+MODULE_PARM_DESC(peer2peer, "Support peer2peer ULPs (default=0)");
+
+static int ep_timeout_secs = 60;
+module_param(ep_timeout_secs, int, 0644);
+MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout "
+ "in seconds (default=60)");
+
+static int mpa_rev = 1;
+module_param(mpa_rev, int, 0644);
+MODULE_PARM_DESC(mpa_rev, "MPA Revision, 0 supports amso1100, "
+ "1 is spec compliant. (default=1)");
+
+static int markers_enabled = 0;
+module_param(markers_enabled, int, 0644);
+MODULE_PARM_DESC(markers_enabled, "Enable MPA MARKERS (default(0)=disabled)");
+
+static int crc_enabled = 1;
+module_param(crc_enabled, int, 0644);
+MODULE_PARM_DESC(crc_enabled, "Enable MPA CRC (default(1)=enabled)");
+
+static int rcv_win = 256 * 1024;
+module_param(rcv_win, int, 0644);
+MODULE_PARM_DESC(rcv_win, "TCP receive window in bytes (default=256KB)");
+
+static int snd_win = 32 * 1024;
+module_param(snd_win, int, 0644);
+MODULE_PARM_DESC(snd_win, "TCP send window in bytes (default=32KB)");
+
+static unsigned int nocong = 0;
+module_param(nocong, uint, 0644);
+MODULE_PARM_DESC(nocong, "Turn off congestion control (default=0)");
+
+static unsigned int cong_flavor = 1;
+module_param(cong_flavor, uint, 0644);
+MODULE_PARM_DESC(cong_flavor, "TCP Congestion control flavor (default=1)");
+
+static struct workqueue_struct *workq;
+
+static struct sk_buff_head rxq;
+
+static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp);
+static void ep_timeout(unsigned long arg);
+static void connect_reply_upcall(struct iwch_ep *ep, int status);
+
+static void start_ep_timer(struct iwch_ep *ep)
+{
+ PDBG("%s ep %p\n", __func__, ep);
+ if (timer_pending(&ep->timer)) {
+ PDBG("%s stopped / restarted timer ep %p\n", __func__, ep);
+ del_timer_sync(&ep->timer);
+ } else
+ get_ep(&ep->com);
+ ep->timer.expires = jiffies + ep_timeout_secs * HZ;
+ ep->timer.data = (unsigned long)ep;
+ ep->timer.function = ep_timeout;
+ add_timer(&ep->timer);
+}
+
+static void stop_ep_timer(struct iwch_ep *ep)
+{
+ PDBG("%s ep %p\n", __func__, ep);
+ if (!timer_pending(&ep->timer)) {
+ WARN(1, "%s timer stopped when it's not running! ep %p state %u\n",
+ __func__, ep, ep->com.state);
+ return;
+ }
+ del_timer_sync(&ep->timer);
+ put_ep(&ep->com);
+}
+
+static int iwch_l2t_send(struct t3cdev *tdev, struct sk_buff *skb, struct l2t_entry *l2e)
+{
+ int error = 0;
+ struct cxio_rdev *rdev;
+
+ rdev = (struct cxio_rdev *)tdev->ulp;
+ if (cxio_fatal_error(rdev)) {
+ kfree_skb(skb);
+ return -EIO;
+ }
+ error = l2t_send(tdev, skb, l2e);
+ if (error < 0)
+ kfree_skb(skb);
+ return error;
+}
+
+int iwch_cxgb3_ofld_send(struct t3cdev *tdev, struct sk_buff *skb)
+{
+ int error = 0;
+ struct cxio_rdev *rdev;
+
+ rdev = (struct cxio_rdev *)tdev->ulp;
+ if (cxio_fatal_error(rdev)) {
+ kfree_skb(skb);
+ return -EIO;
+ }
+ error = cxgb3_ofld_send(tdev, skb);
+ if (error < 0)
+ kfree_skb(skb);
+ return error;
+}
+
+static void release_tid(struct t3cdev *tdev, u32 hwtid, struct sk_buff *skb)
+{
+ struct cpl_tid_release *req;
+
+ skb = get_skb(skb, sizeof *req, GFP_KERNEL);
+ if (!skb)
+ return;
+ req = (struct cpl_tid_release *) skb_put(skb, sizeof(*req));
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_TID_RELEASE, hwtid));
+ skb->priority = CPL_PRIORITY_SETUP;
+ iwch_cxgb3_ofld_send(tdev, skb);
+ return;
+}
+
+int iwch_quiesce_tid(struct iwch_ep *ep)
+{
+ struct cpl_set_tcb_field *req;
+ struct sk_buff *skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
+
+ if (!skb)
+ return -ENOMEM;
+ req = (struct cpl_set_tcb_field *) skb_put(skb, sizeof(*req));
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid));
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid));
+ req->reply = 0;
+ req->cpu_idx = 0;
+ req->word = htons(W_TCB_RX_QUIESCE);
+ req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE);
+ req->val = cpu_to_be64(1 << S_TCB_RX_QUIESCE);
+
+ skb->priority = CPL_PRIORITY_DATA;
+ return iwch_cxgb3_ofld_send(ep->com.tdev, skb);
+}
+
+int iwch_resume_tid(struct iwch_ep *ep)
+{
+ struct cpl_set_tcb_field *req;
+ struct sk_buff *skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
+
+ if (!skb)
+ return -ENOMEM;
+ req = (struct cpl_set_tcb_field *) skb_put(skb, sizeof(*req));
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid));
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_SET_TCB_FIELD, ep->hwtid));
+ req->reply = 0;
+ req->cpu_idx = 0;
+ req->word = htons(W_TCB_RX_QUIESCE);
+ req->mask = cpu_to_be64(1ULL << S_TCB_RX_QUIESCE);
+ req->val = 0;
+
+ skb->priority = CPL_PRIORITY_DATA;
+ return iwch_cxgb3_ofld_send(ep->com.tdev, skb);
+}
+
+static void set_emss(struct iwch_ep *ep, u16 opt)
+{
+ PDBG("%s ep %p opt %u\n", __func__, ep, opt);
+ ep->emss = T3C_DATA(ep->com.tdev)->mtus[G_TCPOPT_MSS(opt)] - 40;
+ if (G_TCPOPT_TSTAMP(opt))
+ ep->emss -= 12;
+ if (ep->emss < 128)
+ ep->emss = 128;
+ PDBG("emss=%d\n", ep->emss);
+}
+
+static enum iwch_ep_state state_read(struct iwch_ep_common *epc)
+{
+ unsigned long flags;
+ enum iwch_ep_state state;
+
+ spin_lock_irqsave(&epc->lock, flags);
+ state = epc->state;
+ spin_unlock_irqrestore(&epc->lock, flags);
+ return state;
+}
+
+static void __state_set(struct iwch_ep_common *epc, enum iwch_ep_state new)
+{
+ epc->state = new;
+}
+
+static void state_set(struct iwch_ep_common *epc, enum iwch_ep_state new)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&epc->lock, flags);
+ PDBG("%s - %s -> %s\n", __func__, states[epc->state], states[new]);
+ __state_set(epc, new);
+ spin_unlock_irqrestore(&epc->lock, flags);
+ return;
+}
+
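+/*
+ * Allocate a zeroed endpoint of the given size and initialize its kref,
+ * lock and wait queue.
+ */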
+static void *alloc_ep(int size, gfp_t gfp)
+{
+ struct iwch_ep_common *epc;
+
+ epc = kzalloc(size, gfp);
+ if (epc) {
+ kref_init(&epc->kref);
+ spin_lock_init(&epc->lock);
+ init_waitqueue_head(&epc->waitq);
+ }
+ PDBG("%s alloc ep %p\n", __func__, epc);
+ return epc;
+}
+
+void __free_ep(struct kref *kref)
+{
+ struct iwch_ep *ep;
+ ep = container_of(container_of(kref, struct iwch_ep_common, kref),
+ struct iwch_ep, com);
+ PDBG("%s ep %p state %s\n", __func__, ep, states[state_read(&ep->com)]);
+ if (test_bit(RELEASE_RESOURCES, &ep->com.flags)) {
+ cxgb3_remove_tid(ep->com.tdev, (void *)ep, ep->hwtid);
+ dst_release(ep->dst);
+ l2t_release(ep->com.tdev, ep->l2t);
+ }
+ kfree(ep);
+}
+
+static void release_ep_resources(struct iwch_ep *ep)
+{
+ PDBG("%s ep %p tid %d\n", __func__, ep, ep->hwtid);
+ set_bit(RELEASE_RESOURCES, &ep->com.flags);
+ put_ep(&ep->com);
+}
+
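+/* Map a CPL error status to a negative errno. */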
+static int status2errno(int status)
+{
+ switch (status) {
+ case CPL_ERR_NONE:
+ return 0;
+ case CPL_ERR_CONN_RESET:
+ return -ECONNRESET;
+ case CPL_ERR_ARP_MISS:
+ return -EHOSTUNREACH;
+ case CPL_ERR_CONN_TIMEDOUT:
+ return -ETIMEDOUT;
+ case CPL_ERR_TCAM_FULL:
+ return -ENOMEM;
+ case CPL_ERR_CONN_EXIST:
+ return -EADDRINUSE;
+ default:
+ return -EIO;
+ }
+}
+
+/*
+ * Try to reuse skbs already allocated...
+ */
+static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp)
+{
+ if (skb && !skb_is_nonlinear(skb) && !skb_cloned(skb)) {
+ skb_trim(skb, 0);
+ skb_get(skb);
+ } else {
+ skb = alloc_skb(len, gfp);
+ }
+ return skb;
+}
+
+static struct rtable *find_route(struct t3cdev *dev, __be32 local_ip,
+ __be32 peer_ip, __be16 local_port,
+ __be16 peer_port, u8 tos)
+{
+ struct rtable *rt;
+ struct flowi4 fl4;
+
+ rt = ip_route_output_ports(&init_net, &fl4, NULL, peer_ip, local_ip,
+ peer_port, local_port, IPPROTO_TCP,
+ tos, 0);
+ if (IS_ERR(rt))
+ return NULL;
+ return rt;
+}
+
+static unsigned int find_best_mtu(const struct t3c_data *d, unsigned short mtu)
+{
+ int i = 0;
+
+ while (i < d->nmtus - 1 && d->mtus[i + 1] <= mtu)
+ ++i;
+ return i;
+}
+
+static void arp_failure_discard(struct t3cdev *dev, struct sk_buff *skb)
+{
+ PDBG("%s t3cdev %p\n", __func__, dev);
+ kfree_skb(skb);
+}
+
+/*
+ * Handle an ARP failure for an active open.
+ */
+static void act_open_req_arp_failure(struct t3cdev *dev, struct sk_buff *skb)
+{
+	printk(KERN_ERR MOD "ARP failure during connect\n");
+ kfree_skb(skb);
+}
+
+/*
+ * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant
+ * and send it along.
+ */
+static void abort_arp_failure(struct t3cdev *dev, struct sk_buff *skb)
+{
+ struct cpl_abort_req *req = cplhdr(skb);
+
+ PDBG("%s t3cdev %p\n", __func__, dev);
+ req->cmd = CPL_ABORT_NO_RST;
+ iwch_cxgb3_ofld_send(dev, skb);
+}
+
+static int send_halfclose(struct iwch_ep *ep, gfp_t gfp)
+{
+ struct cpl_close_con_req *req;
+ struct sk_buff *skb;
+
+ PDBG("%s ep %p\n", __func__, ep);
+ skb = get_skb(NULL, sizeof(*req), gfp);
+ if (!skb) {
+ printk(KERN_ERR MOD "%s - failed to alloc skb\n", __func__);
+ return -ENOMEM;
+ }
+ skb->priority = CPL_PRIORITY_DATA;
+ set_arp_failure_handler(skb, arp_failure_discard);
+ req = (struct cpl_close_con_req *) skb_put(skb, sizeof(*req));
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_CLOSE_CON));
+ req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid));
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_CON_REQ, ep->hwtid));
+ return iwch_l2t_send(ep->com.tdev, skb, ep->l2t);
+}
+
+static int send_abort(struct iwch_ep *ep, struct sk_buff *skb, gfp_t gfp)
+{
+ struct cpl_abort_req *req;
+
+ PDBG("%s ep %p\n", __func__, ep);
+ skb = get_skb(skb, sizeof(*req), gfp);
+ if (!skb) {
+ printk(KERN_ERR MOD "%s - failed to alloc skb.\n",
+ __func__);
+ return -ENOMEM;
+ }
+ skb->priority = CPL_PRIORITY_DATA;
+ set_arp_failure_handler(skb, abort_arp_failure);
+ req = (struct cpl_abort_req *) skb_put(skb, sizeof(*req));
+ memset(req, 0, sizeof(*req));
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_REQ));
+ req->wr.wr_lo = htonl(V_WR_TID(ep->hwtid));
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ABORT_REQ, ep->hwtid));
+ req->cmd = CPL_ABORT_SEND_RST;
+ return iwch_l2t_send(ep->com.tdev, skb, ep->l2t);
+}
+
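+/*
+ * Build and send a CPL_ACT_OPEN_REQ to initiate the active TCP connection.
+ */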
+static int send_connect(struct iwch_ep *ep)
+{
+ struct cpl_act_open_req *req;
+ struct sk_buff *skb;
+ u32 opt0h, opt0l, opt2;
+ unsigned int mtu_idx;
+ int wscale;
+
+ PDBG("%s ep %p\n", __func__, ep);
+
+ skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
+ if (!skb) {
+ printk(KERN_ERR MOD "%s - failed to alloc skb.\n",
+ __func__);
+ return -ENOMEM;
+ }
+ mtu_idx = find_best_mtu(T3C_DATA(ep->com.tdev), dst_mtu(ep->dst));
+ wscale = compute_wscale(rcv_win);
+ opt0h = V_NAGLE(0) |
+ V_NO_CONG(nocong) |
+ V_KEEP_ALIVE(1) |
+ F_TCAM_BYPASS |
+ V_WND_SCALE(wscale) |
+ V_MSS_IDX(mtu_idx) |
+ V_L2T_IDX(ep->l2t->idx) | V_TX_CHANNEL(ep->l2t->smt_idx);
+ opt0l = V_TOS((ep->tos >> 2) & M_TOS) | V_RCV_BUFSIZ(rcv_win>>10);
+ opt2 = F_RX_COALESCE_VALID | V_RX_COALESCE(0) | V_FLAVORS_VALID(1) |
+ V_CONG_CONTROL_FLAVOR(cong_flavor);
+ skb->priority = CPL_PRIORITY_SETUP;
+ set_arp_failure_handler(skb, act_open_req_arp_failure);
+
+ req = (struct cpl_act_open_req *) skb_put(skb, sizeof(*req));
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_ACT_OPEN_REQ, ep->atid));
+ req->local_port = ep->com.local_addr.sin_port;
+ req->peer_port = ep->com.remote_addr.sin_port;
+ req->local_ip = ep->com.local_addr.sin_addr.s_addr;
+ req->peer_ip = ep->com.remote_addr.sin_addr.s_addr;
+ req->opt0h = htonl(opt0h);
+ req->opt0l = htonl(opt0l);
+ req->params = 0;
+ req->opt2 = htonl(opt2);
+ return iwch_l2t_send(ep->com.tdev, skb, ep->l2t);
+}
+
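+/*
+ * Send the MPA start request (with any private data) wrapped in a
+ * TX_DATA_WR, start the endpoint timer and move to MPA_REQ_SENT.
+ */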
+static void send_mpa_req(struct iwch_ep *ep, struct sk_buff *skb)
+{
+ int mpalen;
+ struct tx_data_wr *req;
+ struct mpa_message *mpa;
+ int len;
+
+ PDBG("%s ep %p pd_len %d\n", __func__, ep, ep->plen);
+
+ BUG_ON(skb_cloned(skb));
+
+ mpalen = sizeof(*mpa) + ep->plen;
+ if (skb->data + mpalen + sizeof(*req) > skb_end_pointer(skb)) {
+ kfree_skb(skb);
+		skb = alloc_skb(mpalen + sizeof(*req), GFP_KERNEL);
+ if (!skb) {
+ connect_reply_upcall(ep, -ENOMEM);
+ return;
+ }
+ }
+ skb_trim(skb, 0);
+ skb_reserve(skb, sizeof(*req));
+ skb_put(skb, mpalen);
+ skb->priority = CPL_PRIORITY_DATA;
+ mpa = (struct mpa_message *) skb->data;
+ memset(mpa, 0, sizeof(*mpa));
+ memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key));
+ mpa->flags = (crc_enabled ? MPA_CRC : 0) |
+ (markers_enabled ? MPA_MARKERS : 0);
+ mpa->private_data_size = htons(ep->plen);
+ mpa->revision = mpa_rev;
+
+ if (ep->plen)
+ memcpy(mpa->private_data, ep->mpa_pkt + sizeof(*mpa), ep->plen);
+
+ /*
+ * Reference the mpa skb. This ensures the data area
+ * will remain in memory until the hw acks the tx.
+ * Function tx_ack() will deref it.
+ */
+ skb_get(skb);
+ set_arp_failure_handler(skb, arp_failure_discard);
+ skb_reset_transport_header(skb);
+ len = skb->len;
+ req = (struct tx_data_wr *) skb_push(skb, sizeof(*req));
+ req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)|F_WR_COMPL);
+ req->wr_lo = htonl(V_WR_TID(ep->hwtid));
+ req->len = htonl(len);
+ req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) |
+ V_TX_SNDBUF(snd_win>>15));
+ req->flags = htonl(F_TX_INIT);
+ req->sndseq = htonl(ep->snd_seq);
+ BUG_ON(ep->mpa_skb);
+ ep->mpa_skb = skb;
+ iwch_l2t_send(ep->com.tdev, skb, ep->l2t);
+ start_ep_timer(ep);
+ state_set(&ep->com, MPA_REQ_SENT);
+ return;
+}
+
+static int send_mpa_reject(struct iwch_ep *ep, const void *pdata, u8 plen)
+{
+ int mpalen;
+ struct tx_data_wr *req;
+ struct mpa_message *mpa;
+ struct sk_buff *skb;
+
+ PDBG("%s ep %p plen %d\n", __func__, ep, plen);
+
+ mpalen = sizeof(*mpa) + plen;
+
+ skb = get_skb(NULL, mpalen + sizeof(*req), GFP_KERNEL);
+ if (!skb) {
+ printk(KERN_ERR MOD "%s - cannot alloc skb!\n", __func__);
+ return -ENOMEM;
+ }
+ skb_reserve(skb, sizeof(*req));
+ mpa = (struct mpa_message *) skb_put(skb, mpalen);
+ memset(mpa, 0, sizeof(*mpa));
+ memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key));
+ mpa->flags = MPA_REJECT;
+ mpa->revision = mpa_rev;
+ mpa->private_data_size = htons(plen);
+ if (plen)
+ memcpy(mpa->private_data, pdata, plen);
+
+ /*
+ * Reference the mpa skb again. This ensures the data area
+ * will remain in memory until the hw acks the tx.
+ * Function tx_ack() will deref it.
+ */
+ skb_get(skb);
+ skb->priority = CPL_PRIORITY_DATA;
+ set_arp_failure_handler(skb, arp_failure_discard);
+ skb_reset_transport_header(skb);
+ req = (struct tx_data_wr *) skb_push(skb, sizeof(*req));
+ req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)|F_WR_COMPL);
+ req->wr_lo = htonl(V_WR_TID(ep->hwtid));
+ req->len = htonl(mpalen);
+ req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) |
+ V_TX_SNDBUF(snd_win>>15));
+ req->flags = htonl(F_TX_INIT);
+ req->sndseq = htonl(ep->snd_seq);
+ BUG_ON(ep->mpa_skb);
+ ep->mpa_skb = skb;
+ return iwch_l2t_send(ep->com.tdev, skb, ep->l2t);
+}
+
+static int send_mpa_reply(struct iwch_ep *ep, const void *pdata, u8 plen)
+{
+ int mpalen;
+ struct tx_data_wr *req;
+ struct mpa_message *mpa;
+ int len;
+ struct sk_buff *skb;
+
+ PDBG("%s ep %p plen %d\n", __func__, ep, plen);
+
+ mpalen = sizeof(*mpa) + plen;
+
+ skb = get_skb(NULL, mpalen + sizeof(*req), GFP_KERNEL);
+ if (!skb) {
+ printk(KERN_ERR MOD "%s - cannot alloc skb!\n", __func__);
+ return -ENOMEM;
+ }
+ skb->priority = CPL_PRIORITY_DATA;
+ skb_reserve(skb, sizeof(*req));
+ mpa = (struct mpa_message *) skb_put(skb, mpalen);
+ memset(mpa, 0, sizeof(*mpa));
+ memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key));
+ mpa->flags = (ep->mpa_attr.crc_enabled ? MPA_CRC : 0) |
+ (markers_enabled ? MPA_MARKERS : 0);
+ mpa->revision = mpa_rev;
+ mpa->private_data_size = htons(plen);
+ if (plen)
+ memcpy(mpa->private_data, pdata, plen);
+
+ /*
+ * Reference the mpa skb. This ensures the data area
+ * will remain in memory until the hw acks the tx.
+ * Function tx_ack() will deref it.
+ */
+ skb_get(skb);
+ set_arp_failure_handler(skb, arp_failure_discard);
+ skb_reset_transport_header(skb);
+ len = skb->len;
+ req = (struct tx_data_wr *) skb_push(skb, sizeof(*req));
+ req->wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_TX_DATA)|F_WR_COMPL);
+ req->wr_lo = htonl(V_WR_TID(ep->hwtid));
+ req->len = htonl(len);
+ req->param = htonl(V_TX_PORT(ep->l2t->smt_idx) |
+ V_TX_SNDBUF(snd_win>>15));
+ req->flags = htonl(F_TX_INIT);
+ req->sndseq = htonl(ep->snd_seq);
+ ep->mpa_skb = skb;
+ state_set(&ep->com, MPA_REP_SENT);
+ return iwch_l2t_send(ep->com.tdev, skb, ep->l2t);
+}
+
+static int act_establish(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+ struct iwch_ep *ep = ctx;
+ struct cpl_act_establish *req = cplhdr(skb);
+ unsigned int tid = GET_TID(req);
+
+ PDBG("%s ep %p tid %d\n", __func__, ep, tid);
+
+ dst_confirm(ep->dst);
+
+ /* setup the hwtid for this connection */
+ ep->hwtid = tid;
+ cxgb3_insert_tid(ep->com.tdev, &t3c_client, ep, tid);
+
+ ep->snd_seq = ntohl(req->snd_isn);
+ ep->rcv_seq = ntohl(req->rcv_isn);
+
+ set_emss(ep, ntohs(req->tcp_opt));
+
+ /* dealloc the atid */
+ cxgb3_free_atid(ep->com.tdev, ep->atid);
+
+ /* start MPA negotiation */
+ send_mpa_req(ep, skb);
+
+ return 0;
+}
+
+static void abort_connection(struct iwch_ep *ep, struct sk_buff *skb, gfp_t gfp)
+{
+	PDBG("%s ep %p\n", __func__, ep);
+ state_set(&ep->com, ABORTING);
+ send_abort(ep, skb, gfp);
+}
+
+static void close_complete_upcall(struct iwch_ep *ep)
+{
+ struct iw_cm_event event;
+
+ PDBG("%s ep %p\n", __func__, ep);
+ memset(&event, 0, sizeof(event));
+ event.event = IW_CM_EVENT_CLOSE;
+ if (ep->com.cm_id) {
+ PDBG("close complete delivered ep %p cm_id %p tid %d\n",
+ ep, ep->com.cm_id, ep->hwtid);
+ ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+ ep->com.cm_id->rem_ref(ep->com.cm_id);
+ ep->com.cm_id = NULL;
+ ep->com.qp = NULL;
+ }
+}
+
+static void peer_close_upcall(struct iwch_ep *ep)
+{
+ struct iw_cm_event event;
+
+ PDBG("%s ep %p\n", __func__, ep);
+ memset(&event, 0, sizeof(event));
+ event.event = IW_CM_EVENT_DISCONNECT;
+ if (ep->com.cm_id) {
+ PDBG("peer close delivered ep %p cm_id %p tid %d\n",
+ ep, ep->com.cm_id, ep->hwtid);
+ ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+ }
+}
+
+static void peer_abort_upcall(struct iwch_ep *ep)
+{
+ struct iw_cm_event event;
+
+ PDBG("%s ep %p\n", __func__, ep);
+ memset(&event, 0, sizeof(event));
+ event.event = IW_CM_EVENT_CLOSE;
+ event.status = -ECONNRESET;
+ if (ep->com.cm_id) {
+ PDBG("abort delivered ep %p cm_id %p tid %d\n", ep,
+ ep->com.cm_id, ep->hwtid);
+ ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+ ep->com.cm_id->rem_ref(ep->com.cm_id);
+ ep->com.cm_id = NULL;
+ ep->com.qp = NULL;
+ }
+}
+
+static void connect_reply_upcall(struct iwch_ep *ep, int status)
+{
+ struct iw_cm_event event;
+
+ PDBG("%s ep %p status %d\n", __func__, ep, status);
+ memset(&event, 0, sizeof(event));
+ event.event = IW_CM_EVENT_CONNECT_REPLY;
+ event.status = status;
+ memcpy(&event.local_addr, &ep->com.local_addr,
+ sizeof(ep->com.local_addr));
+ memcpy(&event.remote_addr, &ep->com.remote_addr,
+ sizeof(ep->com.remote_addr));
+
+ if ((status == 0) || (status == -ECONNREFUSED)) {
+ event.private_data_len = ep->plen;
+ event.private_data = ep->mpa_pkt + sizeof(struct mpa_message);
+ }
+ if (ep->com.cm_id) {
+ PDBG("%s ep %p tid %d status %d\n", __func__, ep,
+ ep->hwtid, status);
+ ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+ }
+ if (status < 0) {
+ ep->com.cm_id->rem_ref(ep->com.cm_id);
+ ep->com.cm_id = NULL;
+ ep->com.qp = NULL;
+ }
+}
+
+static void connect_request_upcall(struct iwch_ep *ep)
+{
+ struct iw_cm_event event;
+
+ PDBG("%s ep %p tid %d\n", __func__, ep, ep->hwtid);
+ memset(&event, 0, sizeof(event));
+ event.event = IW_CM_EVENT_CONNECT_REQUEST;
+ memcpy(&event.local_addr, &ep->com.local_addr,
+ sizeof(ep->com.local_addr));
+ memcpy(&event.remote_addr, &ep->com.remote_addr,
+ sizeof(ep->com.local_addr));
+ event.private_data_len = ep->plen;
+ event.private_data = ep->mpa_pkt + sizeof(struct mpa_message);
+ event.provider_data = ep;
+ /*
+ * Until ird/ord negotiation via MPAv2 support is added, send max
+ * supported values
+ */
+ event.ird = event.ord = 8;
+ if (state_read(&ep->parent_ep->com) != DEAD) {
+ get_ep(&ep->com);
+ ep->parent_ep->com.cm_id->event_handler(
+ ep->parent_ep->com.cm_id,
+ &event);
+ }
+ put_ep(&ep->parent_ep->com);
+ ep->parent_ep = NULL;
+}
+
+static void established_upcall(struct iwch_ep *ep)
+{
+ struct iw_cm_event event;
+
+ PDBG("%s ep %p\n", __func__, ep);
+ memset(&event, 0, sizeof(event));
+ event.event = IW_CM_EVENT_ESTABLISHED;
+ /*
+ * Until ird/ord negotiation via MPAv2 support is added, send max
+ * supported values
+ */
+ event.ird = event.ord = 8;
+ if (ep->com.cm_id) {
+ PDBG("%s ep %p tid %d\n", __func__, ep, ep->hwtid);
+ ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+ }
+}
+
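+/*
+ * Return RX credits to the hardware via CPL_RX_DATA_ACK.  Returns the
+ * credits granted, or 0 if the skb allocation fails.
+ */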
+static int update_rx_credits(struct iwch_ep *ep, u32 credits)
+{
+ struct cpl_rx_data_ack *req;
+ struct sk_buff *skb;
+
+ PDBG("%s ep %p credits %u\n", __func__, ep, credits);
+ skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
+ if (!skb) {
+ printk(KERN_ERR MOD "update_rx_credits - cannot alloc skb!\n");
+ return 0;
+ }
+
+ req = (struct cpl_rx_data_ack *) skb_put(skb, sizeof(*req));
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_RX_DATA_ACK, ep->hwtid));
+ req->credit_dack = htonl(V_RX_CREDITS(credits) | V_RX_FORCE_ACK(1));
+ skb->priority = CPL_PRIORITY_ACK;
+ iwch_cxgb3_ofld_send(ep->com.tdev, skb);
+ return credits;
+}
+
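+/*
+ * Process an incoming MPA start reply on the active side: accumulate the
+ * data, validate the MPA header, move the QP to RTS on success, and report
+ * the result to the ULP via the connect reply upcall.
+ */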
+static void process_mpa_reply(struct iwch_ep *ep, struct sk_buff *skb)
+{
+ struct mpa_message *mpa;
+ u16 plen;
+ struct iwch_qp_attributes attrs;
+ enum iwch_qp_attr_mask mask;
+ int err;
+
+ PDBG("%s ep %p\n", __func__, ep);
+
+ /*
+ * Stop mpa timer. If it expired, then the state has
+ * changed and we bail since ep_timeout already aborted
+ * the connection.
+ */
+ stop_ep_timer(ep);
+ if (state_read(&ep->com) != MPA_REQ_SENT)
+ return;
+
+ /*
+ * If we get more than the supported amount of private data
+ * then we must fail this connection.
+ */
+ if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ /*
+ * copy the new data into our accumulation buffer.
+ */
+ skb_copy_from_linear_data(skb, &(ep->mpa_pkt[ep->mpa_pkt_len]),
+ skb->len);
+ ep->mpa_pkt_len += skb->len;
+
+ /*
+ * if we don't even have the mpa message, then bail.
+ */
+ if (ep->mpa_pkt_len < sizeof(*mpa))
+ return;
+ mpa = (struct mpa_message *) ep->mpa_pkt;
+
+ /* Validate MPA header. */
+ if (mpa->revision != mpa_rev) {
+ err = -EPROTO;
+ goto err;
+ }
+ if (memcmp(mpa->key, MPA_KEY_REP, sizeof(mpa->key))) {
+ err = -EPROTO;
+ goto err;
+ }
+
+ plen = ntohs(mpa->private_data_size);
+
+ /*
+ * Fail if there's too much private data.
+ */
+ if (plen > MPA_MAX_PRIVATE_DATA) {
+ err = -EPROTO;
+ goto err;
+ }
+
+ /*
+	 * Fail if plen does not account for the received packet size.
+ */
+ if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
+ err = -EPROTO;
+ goto err;
+ }
+
+ ep->plen = (u8) plen;
+
+ /*
+ * If we don't have all the pdata yet, then bail.
+	 * We'll continue processing when more data arrives.
+ */
+ if (ep->mpa_pkt_len < (sizeof(*mpa) + plen))
+ return;
+
+ if (mpa->flags & MPA_REJECT) {
+ err = -ECONNREFUSED;
+ goto err;
+ }
+
+ /*
+	 * If we get here we have accumulated the entire MPA
+	 * start reply message, including private data, and
+	 * the MPA header is valid.
+ */
+ state_set(&ep->com, FPDU_MODE);
+ ep->mpa_attr.initiator = 1;
+ ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0;
+ ep->mpa_attr.recv_marker_enabled = markers_enabled;
+ ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0;
+ ep->mpa_attr.version = mpa_rev;
+ PDBG("%s - crc_enabled=%d, recv_marker_enabled=%d, "
+ "xmit_marker_enabled=%d, version=%d\n", __func__,
+ ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled,
+ ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version);
+
+ attrs.mpa_attr = ep->mpa_attr;
+ attrs.max_ird = ep->ird;
+ attrs.max_ord = ep->ord;
+ attrs.llp_stream_handle = ep;
+ attrs.next_state = IWCH_QP_STATE_RTS;
+
+ mask = IWCH_QP_ATTR_NEXT_STATE |
+ IWCH_QP_ATTR_LLP_STREAM_HANDLE | IWCH_QP_ATTR_MPA_ATTR |
+ IWCH_QP_ATTR_MAX_IRD | IWCH_QP_ATTR_MAX_ORD;
+
+ /* bind QP and TID with INIT_WR */
+ err = iwch_modify_qp(ep->com.qp->rhp,
+ ep->com.qp, mask, &attrs, 1);
+ if (err)
+ goto err;
+
+ if (peer2peer && iwch_rqes_posted(ep->com.qp) == 0) {
+ iwch_post_zb_read(ep);
+ }
+
+ goto out;
+err:
+ abort_connection(ep, skb, GFP_KERNEL);
+out:
+ connect_reply_upcall(ep, err);
+ return;
+}
+
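+/*
+ * Process an incoming MPA start request on the passive side: accumulate
+ * the data, validate the MPA header, and deliver a connect request upcall
+ * to the listening ULP.
+ */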
+static void process_mpa_request(struct iwch_ep *ep, struct sk_buff *skb)
+{
+ struct mpa_message *mpa;
+ u16 plen;
+
+ PDBG("%s ep %p\n", __func__, ep);
+
+ /*
+ * Stop mpa timer. If it expired, then the state has
+ * changed and we bail since ep_timeout already aborted
+ * the connection.
+ */
+ stop_ep_timer(ep);
+ if (state_read(&ep->com) != MPA_REQ_WAIT)
+ return;
+
+ /*
+ * If we get more than the supported amount of private data
+ * then we must fail this connection.
+ */
+ if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) {
+ abort_connection(ep, skb, GFP_KERNEL);
+ return;
+ }
+
+ PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__);
+
+ /*
+ * Copy the new data into our accumulation buffer.
+ */
+ skb_copy_from_linear_data(skb, &(ep->mpa_pkt[ep->mpa_pkt_len]),
+ skb->len);
+ ep->mpa_pkt_len += skb->len;
+
+ /*
+ * If we don't even have the mpa message, then bail.
+	 * We'll continue processing when more data arrives.
+ */
+ if (ep->mpa_pkt_len < sizeof(*mpa))
+ return;
+ PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__);
+ mpa = (struct mpa_message *) ep->mpa_pkt;
+
+ /*
+ * Validate MPA Header.
+ */
+ if (mpa->revision != mpa_rev) {
+ abort_connection(ep, skb, GFP_KERNEL);
+ return;
+ }
+
+ if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) {
+ abort_connection(ep, skb, GFP_KERNEL);
+ return;
+ }
+
+ plen = ntohs(mpa->private_data_size);
+
+ /*
+ * Fail if there's too much private data.
+ */
+ if (plen > MPA_MAX_PRIVATE_DATA) {
+ abort_connection(ep, skb, GFP_KERNEL);
+ return;
+ }
+
+ /*
+	 * Fail if plen does not account for the received packet size.
+ */
+ if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
+ abort_connection(ep, skb, GFP_KERNEL);
+ return;
+ }
+ ep->plen = (u8) plen;
+
+ /*
+ * If we don't have all the pdata yet, then bail.
+ */
+ if (ep->mpa_pkt_len < (sizeof(*mpa) + plen))
+ return;
+
+ /*
+	 * If we get here we have accumulated the entire MPA
+	 * start request message, including private data.
+ */
+ ep->mpa_attr.initiator = 0;
+ ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0;
+ ep->mpa_attr.recv_marker_enabled = markers_enabled;
+ ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0;
+ ep->mpa_attr.version = mpa_rev;
+ PDBG("%s - crc_enabled=%d, recv_marker_enabled=%d, "
+ "xmit_marker_enabled=%d, version=%d\n", __func__,
+ ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled,
+ ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version);
+
+ state_set(&ep->com, MPA_REQ_RCVD);
+
+ /* drive upcall */
+ connect_request_upcall(ep);
+ return;
+}
+
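+/*
+ * Handle streaming-mode RX data.  Only MPA negotiation traffic is expected
+ * here; unexpected data is logged and left for ep_timeout() to clean up.
+ * RX credits are returned to the hardware in either case.
+ */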
+static int rx_data(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+ struct iwch_ep *ep = ctx;
+ struct cpl_rx_data *hdr = cplhdr(skb);
+ unsigned int dlen = ntohs(hdr->len);
+
+ PDBG("%s ep %p dlen %u\n", __func__, ep, dlen);
+
+ skb_pull(skb, sizeof(*hdr));
+ skb_trim(skb, dlen);
+
+ ep->rcv_seq += dlen;
+ BUG_ON(ep->rcv_seq != (ntohl(hdr->seq) + dlen));
+
+ switch (state_read(&ep->com)) {
+ case MPA_REQ_SENT:
+ process_mpa_reply(ep, skb);
+ break;
+ case MPA_REQ_WAIT:
+ process_mpa_request(ep, skb);
+ break;
+ case MPA_REP_SENT:
+ break;
+ default:
+ printk(KERN_ERR MOD "%s Unexpected streaming data."
+ " ep %p state %d tid %d\n",
+ __func__, ep, state_read(&ep->com), ep->hwtid);
+
+ /*
+ * The ep will timeout and inform the ULP of the failure.
+ * See ep_timeout().
+ */
+ break;
+ }
+
+ /* update RX credits */
+ update_rx_credits(ep, dlen);
+
+ return CPL_RET_BUF_DONE;
+}
+
+/*
+ * Upcall from the adapter indicating data has been transmitted.
+ * For us it's just the single MPA request or reply.  We can now free
+ * the skb holding the mpa message.
+ */
+static int tx_ack(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+ struct iwch_ep *ep = ctx;
+ struct cpl_wr_ack *hdr = cplhdr(skb);
+ unsigned int credits = ntohs(hdr->credits);
+ unsigned long flags;
+ int post_zb = 0;
+
+ PDBG("%s ep %p credits %u\n", __func__, ep, credits);
+
+ if (credits == 0) {
+ PDBG("%s 0 credit ack ep %p state %u\n",
+ __func__, ep, state_read(&ep->com));
+ return CPL_RET_BUF_DONE;
+ }
+
+ spin_lock_irqsave(&ep->com.lock, flags);
+ BUG_ON(credits != 1);
+ dst_confirm(ep->dst);
+ if (!ep->mpa_skb) {
+ PDBG("%s rdma_init wr_ack ep %p state %u\n",
+ __func__, ep, ep->com.state);
+ if (ep->mpa_attr.initiator) {
+ PDBG("%s initiator ep %p state %u\n",
+ __func__, ep, ep->com.state);
+ if (peer2peer && ep->com.state == FPDU_MODE)
+ post_zb = 1;
+ } else {
+ PDBG("%s responder ep %p state %u\n",
+ __func__, ep, ep->com.state);
+ if (ep->com.state == MPA_REQ_RCVD) {
+ ep->com.rpl_done = 1;
+ wake_up(&ep->com.waitq);
+ }
+ }
+ } else {
+ PDBG("%s lsm ack ep %p state %u freeing skb\n",
+ __func__, ep, ep->com.state);
+ kfree_skb(ep->mpa_skb);
+ ep->mpa_skb = NULL;
+ }
+ spin_unlock_irqrestore(&ep->com.lock, flags);
+ if (post_zb)
+ iwch_post_zb_read(ep);
+ return CPL_RET_BUF_DONE;
+}
+
+static int abort_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+ struct iwch_ep *ep = ctx;
+ unsigned long flags;
+ int release = 0;
+
+ PDBG("%s ep %p\n", __func__, ep);
+ BUG_ON(!ep);
+
+ /*
+ * We get 2 abort replies from the HW. The first one must
+ * be ignored except for scribbling that we need one more.
+ */
+ if (!test_and_set_bit(ABORT_REQ_IN_PROGRESS, &ep->com.flags)) {
+ return CPL_RET_BUF_DONE;
+ }
+
+ spin_lock_irqsave(&ep->com.lock, flags);
+ switch (ep->com.state) {
+ case ABORTING:
+ close_complete_upcall(ep);
+ __state_set(&ep->com, DEAD);
+ release = 1;
+ break;
+ default:
+ printk(KERN_ERR "%s ep %p state %d\n",
+ __func__, ep, ep->com.state);
+ break;
+ }
+ spin_unlock_irqrestore(&ep->com.lock, flags);
+
+ if (release)
+ release_ep_resources(ep);
+ return CPL_RET_BUF_DONE;
+}
+
+/*
+ * Return whether a failed active open has allocated a TID
+ */
+static inline int act_open_has_tid(int status)
+{
+ return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
+ status != CPL_ERR_ARP_MISS;
+}
+
+static int act_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+ struct iwch_ep *ep = ctx;
+ struct cpl_act_open_rpl *rpl = cplhdr(skb);
+
+ PDBG("%s ep %p status %u errno %d\n", __func__, ep, rpl->status,
+ status2errno(rpl->status));
+ connect_reply_upcall(ep, status2errno(rpl->status));
+ state_set(&ep->com, DEAD);
+ if (ep->com.tdev->type != T3A && act_open_has_tid(rpl->status))
+ release_tid(ep->com.tdev, GET_TID(rpl), NULL);
+ cxgb3_free_atid(ep->com.tdev, ep->atid);
+ dst_release(ep->dst);
+ l2t_release(ep->com.tdev, ep->l2t);
+ put_ep(&ep->com);
+ return CPL_RET_BUF_DONE;
+}
+
+static int listen_start(struct iwch_listen_ep *ep)
+{
+ struct sk_buff *skb;
+ struct cpl_pass_open_req *req;
+
+ PDBG("%s ep %p\n", __func__, ep);
+ skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
+ if (!skb) {
+ printk(KERN_ERR MOD "t3c_listen_start failed to alloc skb!\n");
+ return -ENOMEM;
+ }
+
+ req = (struct cpl_pass_open_req *) skb_put(skb, sizeof(*req));
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_OPEN_REQ, ep->stid));
+ req->local_port = ep->com.local_addr.sin_port;
+ req->local_ip = ep->com.local_addr.sin_addr.s_addr;
+ req->peer_port = 0;
+ req->peer_ip = 0;
+ req->peer_netmask = 0;
+ req->opt0h = htonl(F_DELACK | F_TCAM_BYPASS);
+ req->opt0l = htonl(V_RCV_BUFSIZ(rcv_win>>10));
+ req->opt1 = htonl(V_CONN_POLICY(CPL_CONN_POLICY_ASK));
+
+ skb->priority = 1;
+ return iwch_cxgb3_ofld_send(ep->com.tdev, skb);
+}
+
+static int pass_open_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+ struct iwch_listen_ep *ep = ctx;
+ struct cpl_pass_open_rpl *rpl = cplhdr(skb);
+
+ PDBG("%s ep %p status %d error %d\n", __func__, ep,
+ rpl->status, status2errno(rpl->status));
+ ep->com.rpl_err = status2errno(rpl->status);
+ ep->com.rpl_done = 1;
+ wake_up(&ep->com.waitq);
+
+ return CPL_RET_BUF_DONE;
+}
+
+static int listen_stop(struct iwch_listen_ep *ep)
+{
+ struct sk_buff *skb;
+ struct cpl_close_listserv_req *req;
+
+ PDBG("%s ep %p\n", __func__, ep);
+ skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
+ if (!skb) {
+ printk(KERN_ERR MOD "%s - failed to alloc skb\n", __func__);
+ return -ENOMEM;
+ }
+ req = (struct cpl_close_listserv_req *) skb_put(skb, sizeof(*req));
+ req->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ req->cpu_idx = 0;
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_CLOSE_LISTSRV_REQ, ep->stid));
+ skb->priority = 1;
+ return iwch_cxgb3_ofld_send(ep->com.tdev, skb);
+}
+
+static int close_listsrv_rpl(struct t3cdev *tdev, struct sk_buff *skb,
+ void *ctx)
+{
+ struct iwch_listen_ep *ep = ctx;
+ struct cpl_close_listserv_rpl *rpl = cplhdr(skb);
+
+ PDBG("%s ep %p\n", __func__, ep);
+ ep->com.rpl_err = status2errno(rpl->status);
+ ep->com.rpl_done = 1;
+ wake_up(&ep->com.waitq);
+ return CPL_RET_BUF_DONE;
+}
+
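+/*
+ * Reuse the incoming skb to send a CPL_PASS_ACCEPT_RPL accepting the
+ * connection request.
+ */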
+static void accept_cr(struct iwch_ep *ep, __be32 peer_ip, struct sk_buff *skb)
+{
+ struct cpl_pass_accept_rpl *rpl;
+ unsigned int mtu_idx;
+ u32 opt0h, opt0l, opt2;
+ int wscale;
+
+ PDBG("%s ep %p\n", __func__, ep);
+ BUG_ON(skb_cloned(skb));
+ skb_trim(skb, sizeof(*rpl));
+ skb_get(skb);
+ mtu_idx = find_best_mtu(T3C_DATA(ep->com.tdev), dst_mtu(ep->dst));
+ wscale = compute_wscale(rcv_win);
+ opt0h = V_NAGLE(0) |
+ V_NO_CONG(nocong) |
+ V_KEEP_ALIVE(1) |
+ F_TCAM_BYPASS |
+ V_WND_SCALE(wscale) |
+ V_MSS_IDX(mtu_idx) |
+ V_L2T_IDX(ep->l2t->idx) | V_TX_CHANNEL(ep->l2t->smt_idx);
+ opt0l = V_TOS((ep->tos >> 2) & M_TOS) | V_RCV_BUFSIZ(rcv_win>>10);
+ opt2 = F_RX_COALESCE_VALID | V_RX_COALESCE(0) | V_FLAVORS_VALID(1) |
+ V_CONG_CONTROL_FLAVOR(cong_flavor);
+
+ rpl = cplhdr(skb);
+ rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL, ep->hwtid));
+ rpl->peer_ip = peer_ip;
+ rpl->opt0h = htonl(opt0h);
+ rpl->opt0l_status = htonl(opt0l | CPL_PASS_OPEN_ACCEPT);
+ rpl->opt2 = htonl(opt2);
+ rpl->rsvd = rpl->opt2; /* workaround for HW bug */
+ skb->priority = CPL_PRIORITY_SETUP;
+ iwch_l2t_send(ep->com.tdev, skb, ep->l2t);
+
+ return;
+}
+
+static void reject_cr(struct t3cdev *tdev, u32 hwtid, __be32 peer_ip,
+ struct sk_buff *skb)
+{
+ PDBG("%s t3cdev %p tid %u peer_ip %x\n", __func__, tdev, hwtid,
+ peer_ip);
+ BUG_ON(skb_cloned(skb));
+ skb_trim(skb, sizeof(struct cpl_tid_release));
+ skb_get(skb);
+
+ if (tdev->type != T3A)
+ release_tid(tdev, hwtid, skb);
+ else {
+ struct cpl_pass_accept_rpl *rpl;
+
+ rpl = cplhdr(skb);
+ skb->priority = CPL_PRIORITY_SETUP;
+ rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_FORWARD));
+ OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL,
+ hwtid));
+ rpl->peer_ip = peer_ip;
+ rpl->opt0h = htonl(F_TCAM_BYPASS);
+ rpl->opt0l_status = htonl(CPL_PASS_OPEN_REJECT);
+ rpl->opt2 = 0;
+ rpl->rsvd = rpl->opt2;
+ iwch_cxgb3_ofld_send(tdev, skb);
+ }
+}
+
+static int pass_accept_req(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+ struct iwch_ep *child_ep, *parent_ep = ctx;
+ struct cpl_pass_accept_req *req = cplhdr(skb);
+ unsigned int hwtid = GET_TID(req);
+ struct dst_entry *dst;
+ struct l2t_entry *l2t;
+ struct rtable *rt;
+ struct iff_mac tim;
+
+ PDBG("%s parent ep %p tid %u\n", __func__, parent_ep, hwtid);
+
+ if (state_read(&parent_ep->com) != LISTEN) {
+ printk(KERN_ERR "%s - listening ep not in LISTEN\n",
+ __func__);
+ goto reject;
+ }
+
+ /*
+ * Find the netdev for this connection request.
+ */
+ tim.mac_addr = req->dst_mac;
+ tim.vlan_tag = ntohs(req->vlan_tag);
+ if (tdev->ctl(tdev, GET_IFF_FROM_MAC, &tim) < 0 || !tim.dev) {
+ printk(KERN_ERR "%s bad dst mac %pM\n",
+ __func__, req->dst_mac);
+ goto reject;
+ }
+
+ /* Find output route */
+ rt = find_route(tdev,
+ req->local_ip,
+ req->peer_ip,
+ req->local_port,
+ req->peer_port, G_PASS_OPEN_TOS(ntohl(req->tos_tid)));
+ if (!rt) {
+ printk(KERN_ERR MOD "%s - failed to find dst entry!\n",
+ __func__);
+ goto reject;
+ }
+ dst = &rt->dst;
+ l2t = t3_l2t_get(tdev, dst, NULL, &req->peer_ip);
+ if (!l2t) {
+ printk(KERN_ERR MOD "%s - failed to allocate l2t entry!\n",
+ __func__);
+ dst_release(dst);
+ goto reject;
+ }
+ child_ep = alloc_ep(sizeof(*child_ep), GFP_KERNEL);
+ if (!child_ep) {
+ printk(KERN_ERR MOD "%s - failed to allocate ep entry!\n",
+ __func__);
+ l2t_release(tdev, l2t);
+ dst_release(dst);
+ goto reject;
+ }
+ state_set(&child_ep->com, CONNECTING);
+ child_ep->com.tdev = tdev;
+ child_ep->com.cm_id = NULL;
+ child_ep->com.local_addr.sin_family = PF_INET;
+ child_ep->com.local_addr.sin_port = req->local_port;
+ child_ep->com.local_addr.sin_addr.s_addr = req->local_ip;
+ child_ep->com.remote_addr.sin_family = PF_INET;
+ child_ep->com.remote_addr.sin_port = req->peer_port;
+ child_ep->com.remote_addr.sin_addr.s_addr = req->peer_ip;
+ get_ep(&parent_ep->com);
+ child_ep->parent_ep = parent_ep;
+ child_ep->tos = G_PASS_OPEN_TOS(ntohl(req->tos_tid));
+ child_ep->l2t = l2t;
+ child_ep->dst = dst;
+ child_ep->hwtid = hwtid;
+ init_timer(&child_ep->timer);
+ cxgb3_insert_tid(tdev, &t3c_client, child_ep, hwtid);
+ accept_cr(child_ep, req->peer_ip, skb);
+ goto out;
+reject:
+ reject_cr(tdev, hwtid, req->peer_ip, skb);
+out:
+ return CPL_RET_BUF_DONE;
+}
+
+static int pass_establish(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+ struct iwch_ep *ep = ctx;
+ struct cpl_pass_establish *req = cplhdr(skb);
+
+ PDBG("%s ep %p\n", __func__, ep);
+ ep->snd_seq = ntohl(req->snd_isn);
+ ep->rcv_seq = ntohl(req->rcv_isn);
+
+ set_emss(ep, ntohs(req->tcp_opt));
+
+ dst_confirm(ep->dst);
+ state_set(&ep->com, MPA_REQ_WAIT);
+ start_ep_timer(ep);
+
+ return CPL_RET_BUF_DONE;
+}
+
+static int peer_close(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+ struct iwch_ep *ep = ctx;
+ struct iwch_qp_attributes attrs;
+ unsigned long flags;
+ int disconnect = 1;
+ int release = 0;
+
+ PDBG("%s ep %p\n", __func__, ep);
+ dst_confirm(ep->dst);
+
+ spin_lock_irqsave(&ep->com.lock, flags);
+ switch (ep->com.state) {
+ case MPA_REQ_WAIT:
+ __state_set(&ep->com, CLOSING);
+ break;
+ case MPA_REQ_SENT:
+ __state_set(&ep->com, CLOSING);
+ connect_reply_upcall(ep, -ECONNRESET);
+ break;
+ case MPA_REQ_RCVD:
+
+ /*
+ * We're gonna mark this puppy DEAD, but keep
+ * the reference on it until the ULP accepts or
+ * rejects the CR. Also wake up anyone waiting
+ * in rdma connection migration (see iwch_accept_cr()).
+ */
+ __state_set(&ep->com, CLOSING);
+ ep->com.rpl_done = 1;
+ ep->com.rpl_err = -ECONNRESET;
+ PDBG("waking up ep %p\n", ep);
+ wake_up(&ep->com.waitq);
+ break;
+ case MPA_REP_SENT:
+ __state_set(&ep->com, CLOSING);
+ ep->com.rpl_done = 1;
+ ep->com.rpl_err = -ECONNRESET;
+ PDBG("waking up ep %p\n", ep);
+ wake_up(&ep->com.waitq);
+ break;
+ case FPDU_MODE:
+ start_ep_timer(ep);
+ __state_set(&ep->com, CLOSING);
+ attrs.next_state = IWCH_QP_STATE_CLOSING;
+ iwch_modify_qp(ep->com.qp->rhp, ep->com.qp,
+ IWCH_QP_ATTR_NEXT_STATE, &attrs, 1);
+ peer_close_upcall(ep);
+ break;
+ case ABORTING:
+ disconnect = 0;
+ break;
+ case CLOSING:
+ __state_set(&ep->com, MORIBUND);
+ disconnect = 0;
+ break;
+ case MORIBUND:
+ stop_ep_timer(ep);
+ if (ep->com.cm_id && ep->com.qp) {
+ attrs.next_state = IWCH_QP_STATE_IDLE;
+ iwch_modify_qp(ep->com.qp->rhp, ep->com.qp,
+ IWCH_QP_ATTR_NEXT_STATE, &attrs, 1);
+ }
+ close_complete_upcall(ep);
+ __state_set(&ep->com, DEAD);
+ release = 1;
+ disconnect = 0;
+ break;
+ case DEAD:
+ disconnect = 0;
+ break;
+ default:
+ BUG_ON(1);
+ }
+ spin_unlock_irqrestore(&ep->com.lock, flags);
+ if (disconnect)
+ iwch_ep_disconnect(ep, 0, GFP_KERNEL);
+ if (release)
+ release_ep_resources(ep);
+ return CPL_RET_BUF_DONE;
+}
+
+/*
+ * Returns whether an ABORT_REQ_RSS message is a negative advice.
+ */
+static int is_neg_adv_abort(unsigned int status)
+{
+ return status == CPL_ERR_RTX_NEG_ADVICE ||
+ status == CPL_ERR_PERSIST_NEG_ADVICE;
+}
+
+static int peer_abort(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+ struct cpl_abort_req_rss *req = cplhdr(skb);
+ struct iwch_ep *ep = ctx;
+ struct cpl_abort_rpl *rpl;
+ struct sk_buff *rpl_skb;
+ struct iwch_qp_attributes attrs;
+ int ret;
+ int release = 0;
+ unsigned long flags;
+
+ if (is_neg_adv_abort(req->status)) {
+ PDBG("%s neg_adv_abort ep %p tid %d\n", __func__, ep,
+ ep->hwtid);
+ t3_l2t_send_event(ep->com.tdev, ep->l2t);
+ return CPL_RET_BUF_DONE;
+ }
+
+ /*
+ * We get 2 peer aborts from the HW. The first one must
+ * be ignored except for scribbling that we need one more.
+ */
+ if (!test_and_set_bit(PEER_ABORT_IN_PROGRESS, &ep->com.flags)) {
+ return CPL_RET_BUF_DONE;
+ }
+
+ spin_lock_irqsave(&ep->com.lock, flags);
+ PDBG("%s ep %p state %u\n", __func__, ep, ep->com.state);
+ switch (ep->com.state) {
+ case CONNECTING:
+ break;
+ case MPA_REQ_WAIT:
+ stop_ep_timer(ep);
+ break;
+ case MPA_REQ_SENT:
+ stop_ep_timer(ep);
+ connect_reply_upcall(ep, -ECONNRESET);
+ break;
+ case MPA_REP_SENT:
+ ep->com.rpl_done = 1;
+ ep->com.rpl_err = -ECONNRESET;
+ PDBG("waking up ep %p\n", ep);
+ wake_up(&ep->com.waitq);
+ break;
+ case MPA_REQ_RCVD:
+
+ /*
+ * We're gonna mark this puppy DEAD, but keep
+ * the reference on it until the ULP accepts or
+ * rejects the CR. Also wake up anyone waiting
+ * in rdma connection migration (see iwch_accept_cr()).
+ */
+ ep->com.rpl_done = 1;
+ ep->com.rpl_err = -ECONNRESET;
+ PDBG("waking up ep %p\n", ep);
+ wake_up(&ep->com.waitq);
+ break;
+ case MORIBUND:
+ case CLOSING:
+ stop_ep_timer(ep);
+ /*FALLTHROUGH*/
+ case FPDU_MODE:
+ if (ep->com.cm_id && ep->com.qp) {
+ attrs.next_state = IWCH_QP_STATE_ERROR;
+ ret = iwch_modify_qp(ep->com.qp->rhp,
+ ep->com.qp, IWCH_QP_ATTR_NEXT_STATE,
+ &attrs, 1);
+ if (ret)
+ printk(KERN_ERR MOD
+ "%s - qp <- error failed!\n",
+ __func__);
+ }
+ peer_abort_upcall(ep);
+ break;
+ case ABORTING:
+ break;
+ case DEAD:
+ PDBG("%s PEER_ABORT IN DEAD STATE!!!!\n", __func__);
+ spin_unlock_irqrestore(&ep->com.lock, flags);
+ return CPL_RET_BUF_DONE;
+ default:
+ BUG_ON(1);
+ break;
+ }
+ dst_confirm(ep->dst);
+ if (ep->com.state != ABORTING) {
+ __state_set(&ep->com, DEAD);
+ release = 1;
+ }
+ spin_unlock_irqrestore(&ep->com.lock, flags);
+
+ rpl_skb = get_skb(skb, sizeof(*rpl), GFP_KERNEL);
+ if (!rpl_skb) {
+ printk(KERN_ERR MOD "%s - cannot allocate skb!\n",
+ __func__);
+ release = 1;
+ goto out;
+ }
+ rpl_skb->priority = CPL_PRIORITY_DATA;
+ rpl = (struct cpl_abort_rpl *) skb_put(rpl_skb, sizeof(*rpl));
+ rpl->wr.wr_hi = htonl(V_WR_OP(FW_WROPCODE_OFLD_HOST_ABORT_CON_RPL));
+ rpl->wr.wr_lo = htonl(V_WR_TID(ep->hwtid));
+ OPCODE_TID(rpl) = htonl(MK_OPCODE_TID(CPL_ABORT_RPL, ep->hwtid));
+ rpl->cmd = CPL_ABORT_NO_RST;
+ iwch_cxgb3_ofld_send(ep->com.tdev, rpl_skb);
+out:
+ if (release)
+ release_ep_resources(ep);
+ return CPL_RET_BUF_DONE;
+}
+
+static int close_con_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+ struct iwch_ep *ep = ctx;
+ struct iwch_qp_attributes attrs;
+ unsigned long flags;
+ int release = 0;
+
+ PDBG("%s ep %p\n", __func__, ep);
+ BUG_ON(!ep);
+
+ /* The cm_id may be null if we failed to connect */
+ spin_lock_irqsave(&ep->com.lock, flags);
+ switch (ep->com.state) {
+ case CLOSING:
+ __state_set(&ep->com, MORIBUND);
+ break;
+ case MORIBUND:
+ stop_ep_timer(ep);
+ if ((ep->com.cm_id) && (ep->com.qp)) {
+ attrs.next_state = IWCH_QP_STATE_IDLE;
+ iwch_modify_qp(ep->com.qp->rhp,
+ ep->com.qp,
+ IWCH_QP_ATTR_NEXT_STATE,
+ &attrs, 1);
+ }
+ close_complete_upcall(ep);
+ __state_set(&ep->com, DEAD);
+ release = 1;
+ break;
+ case ABORTING:
+ case DEAD:
+ break;
+ default:
+ BUG_ON(1);
+ break;
+ }
+ spin_unlock_irqrestore(&ep->com.lock, flags);
+ if (release)
+ release_ep_resources(ep);
+ return CPL_RET_BUF_DONE;
+}
+
+/*
+ * T3A does 3 things when a TERM is received:
+ * 1) send up a CPL_RDMA_TERMINATE message with the TERM packet
+ * 2) generate an async event on the QP with the TERMINATE opcode
+ * 3) post a TERMINATE opcode cqe into the associated CQ.
+ *
+ * For (1), we save the message in the qp for the consumer to retrieve later.
+ * For (2), we move the QP into TERMINATE, post a QP event and disconnect.
+ * For (3), we toss the CQE in cxio_poll_cq().
+ *
+ * terminate() handles case (1)...
+ */
+static int terminate(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+ struct iwch_ep *ep = ctx;
+
+ if (state_read(&ep->com) != FPDU_MODE)
+ return CPL_RET_BUF_DONE;
+
+ PDBG("%s ep %p\n", __func__, ep);
+ skb_pull(skb, sizeof(struct cpl_rdma_terminate));
+ PDBG("%s saving %d bytes of term msg\n", __func__, skb->len);
+ skb_copy_from_linear_data(skb, ep->com.qp->attr.terminate_buffer,
+ skb->len);
+ ep->com.qp->attr.terminate_msg_len = skb->len;
+ ep->com.qp->attr.is_terminate_local = 0;
+ return CPL_RET_BUF_DONE;
+}
+
+static int ec_status(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+ struct cpl_rdma_ec_status *rep = cplhdr(skb);
+ struct iwch_ep *ep = ctx;
+
+ PDBG("%s ep %p tid %u status %d\n", __func__, ep, ep->hwtid,
+ rep->status);
+ if (rep->status) {
+ struct iwch_qp_attributes attrs;
+
+ printk(KERN_ERR MOD "%s BAD CLOSE - Aborting tid %u\n",
+ __func__, ep->hwtid);
+ stop_ep_timer(ep);
+ attrs.next_state = IWCH_QP_STATE_ERROR;
+ iwch_modify_qp(ep->com.qp->rhp,
+ ep->com.qp, IWCH_QP_ATTR_NEXT_STATE,
+ &attrs, 1);
+ abort_connection(ep, NULL, GFP_KERNEL);
+ }
+ return CPL_RET_BUF_DONE;
+}
+
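+/*
+ * Endpoint timer expiry: abort the connection and, depending on the
+ * state, report the timeout to the ULP.
+ */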
+static void ep_timeout(unsigned long arg)
+{
+ struct iwch_ep *ep = (struct iwch_ep *)arg;
+ struct iwch_qp_attributes attrs;
+ unsigned long flags;
+ int abort = 1;
+
+ spin_lock_irqsave(&ep->com.lock, flags);
+ PDBG("%s ep %p tid %u state %d\n", __func__, ep, ep->hwtid,
+ ep->com.state);
+ switch (ep->com.state) {
+ case MPA_REQ_SENT:
+ __state_set(&ep->com, ABORTING);
+ connect_reply_upcall(ep, -ETIMEDOUT);
+ break;
+ case MPA_REQ_WAIT:
+ __state_set(&ep->com, ABORTING);
+ break;
+ case CLOSING:
+ case MORIBUND:
+ if (ep->com.cm_id && ep->com.qp) {
+ attrs.next_state = IWCH_QP_STATE_ERROR;
+ iwch_modify_qp(ep->com.qp->rhp,
+ ep->com.qp, IWCH_QP_ATTR_NEXT_STATE,
+ &attrs, 1);
+ }
+ __state_set(&ep->com, ABORTING);
+ break;
+ default:
+ WARN(1, "%s unexpected state ep %p state %u\n",
+ __func__, ep, ep->com.state);
+ abort = 0;
+ }
+ spin_unlock_irqrestore(&ep->com.lock, flags);
+ if (abort)
+ abort_connection(ep, NULL, GFP_ATOMIC);
+ put_ep(&ep->com);
+}
+
+int iwch_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
+{
+ int err;
+ struct iwch_ep *ep = to_ep(cm_id);
+ PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+
+ if (state_read(&ep->com) == DEAD) {
+ put_ep(&ep->com);
+ return -ECONNRESET;
+ }
+ BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD);
+ if (mpa_rev == 0)
+ abort_connection(ep, NULL, GFP_KERNEL);
+ else {
+ err = send_mpa_reject(ep, pdata, pdata_len);
+ err = iwch_ep_disconnect(ep, 0, GFP_KERNEL);
+ }
+ put_ep(&ep->com);
+ return 0;
+}
+
+int iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
+{
+ int err;
+ struct iwch_qp_attributes attrs;
+ enum iwch_qp_attr_mask mask;
+ struct iwch_ep *ep = to_ep(cm_id);
+ struct iwch_dev *h = to_iwch_dev(cm_id->device);
+ struct iwch_qp *qp = get_qhp(h, conn_param->qpn);
+
+ PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+ if (state_read(&ep->com) == DEAD) {
+ err = -ECONNRESET;
+ goto err;
+ }
+
+ BUG_ON(state_read(&ep->com) != MPA_REQ_RCVD);
+ BUG_ON(!qp);
+
+ if ((conn_param->ord > qp->rhp->attr.max_rdma_read_qp_depth) ||
+ (conn_param->ird > qp->rhp->attr.max_rdma_reads_per_qp)) {
+ abort_connection(ep, NULL, GFP_KERNEL);
+ err = -EINVAL;
+ goto err;
+ }
+
+ cm_id->add_ref(cm_id);
+ ep->com.cm_id = cm_id;
+ ep->com.qp = qp;
+
+ ep->ird = conn_param->ird;
+ ep->ord = conn_param->ord;
+
+ if (peer2peer && ep->ird == 0)
+ ep->ird = 1;
+
+ PDBG("%s %d ird %d ord %d\n", __func__, __LINE__, ep->ird, ep->ord);
+
+ /* bind QP to EP and move to RTS */
+ attrs.mpa_attr = ep->mpa_attr;
+ attrs.max_ird = ep->ird;
+ attrs.max_ord = ep->ord;
+ attrs.llp_stream_handle = ep;
+ attrs.next_state = IWCH_QP_STATE_RTS;
+
+ /* bind QP and TID with INIT_WR */
+ mask = IWCH_QP_ATTR_NEXT_STATE |
+ IWCH_QP_ATTR_LLP_STREAM_HANDLE |
+ IWCH_QP_ATTR_MPA_ATTR |
+ IWCH_QP_ATTR_MAX_IRD |
+ IWCH_QP_ATTR_MAX_ORD;
+
+ err = iwch_modify_qp(ep->com.qp->rhp,
+ ep->com.qp, mask, &attrs, 1);
+ if (err)
+ goto err1;
+
+ /* if needed, wait for wr_ack */
+ if (iwch_rqes_posted(qp)) {
+ wait_event(ep->com.waitq, ep->com.rpl_done);
+ err = ep->com.rpl_err;
+ if (err)
+ goto err1;
+ }
+
+ err = send_mpa_reply(ep, conn_param->private_data,
+ conn_param->private_data_len);
+ if (err)
+ goto err1;
+
+ state_set(&ep->com, FPDU_MODE);
+ established_upcall(ep);
+ put_ep(&ep->com);
+ return 0;
+err1:
+ ep->com.cm_id = NULL;
+ ep->com.qp = NULL;
+ cm_id->rem_ref(cm_id);
+err:
+ put_ep(&ep->com);
+ return err;
+}
+
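+/*
+ * Return nonzero if the peer address is owned by a local interface
+ * (loopback connections are not supported).
+ */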
+static int is_loopback_dst(struct iw_cm_id *cm_id)
+{
+ struct net_device *dev;
+ struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr;
+
+ dev = ip_dev_find(&init_net, raddr->sin_addr.s_addr);
+ if (!dev)
+ return 0;
+ dev_put(dev);
+ return 1;
+}
+
+int iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
+{
+ struct iwch_dev *h = to_iwch_dev(cm_id->device);
+ struct iwch_ep *ep;
+ struct rtable *rt;
+ int err = 0;
+ struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr;
+ struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr;
+
+ if (cm_id->remote_addr.ss_family != PF_INET) {
+ err = -ENOSYS;
+ goto out;
+ }
+
+ if (is_loopback_dst(cm_id)) {
+ err = -ENOSYS;
+ goto out;
+ }
+
+ ep = alloc_ep(sizeof(*ep), GFP_KERNEL);
+ if (!ep) {
+ printk(KERN_ERR MOD "%s - cannot alloc ep.\n", __func__);
+ err = -ENOMEM;
+ goto out;
+ }
+ init_timer(&ep->timer);
+ ep->plen = conn_param->private_data_len;
+ if (ep->plen)
+ memcpy(ep->mpa_pkt + sizeof(struct mpa_message),
+ conn_param->private_data, ep->plen);
+ ep->ird = conn_param->ird;
+ ep->ord = conn_param->ord;
+
+ if (peer2peer && ep->ord == 0)
+ ep->ord = 1;
+
+ ep->com.tdev = h->rdev.t3cdev_p;
+
+ cm_id->add_ref(cm_id);
+ ep->com.cm_id = cm_id;
+ ep->com.qp = get_qhp(h, conn_param->qpn);
+ BUG_ON(!ep->com.qp);
+ PDBG("%s qpn 0x%x qp %p cm_id %p\n", __func__, conn_param->qpn,
+ ep->com.qp, cm_id);
+
+ /*
+ * Allocate an active TID to initiate a TCP connection.
+ */
+ ep->atid = cxgb3_alloc_atid(h->rdev.t3cdev_p, &t3c_client, ep);
+ if (ep->atid == -1) {
+ printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __func__);
+ err = -ENOMEM;
+ goto fail2;
+ }
+
+ /* find a route */
+ rt = find_route(h->rdev.t3cdev_p, laddr->sin_addr.s_addr,
+ raddr->sin_addr.s_addr, laddr->sin_port,
+ raddr->sin_port, IPTOS_LOWDELAY);
+ if (!rt) {
+ printk(KERN_ERR MOD "%s - cannot find route.\n", __func__);
+ err = -EHOSTUNREACH;
+ goto fail3;
+ }
+ ep->dst = &rt->dst;
+ ep->l2t = t3_l2t_get(ep->com.tdev, ep->dst, NULL,
+ &raddr->sin_addr.s_addr);
+ if (!ep->l2t) {
+ printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__);
+ err = -ENOMEM;
+ goto fail4;
+ }
+
+ state_set(&ep->com, CONNECTING);
+ ep->tos = IPTOS_LOWDELAY;
+ memcpy(&ep->com.local_addr, &cm_id->local_addr,
+ sizeof(ep->com.local_addr));
+ memcpy(&ep->com.remote_addr, &cm_id->remote_addr,
+ sizeof(ep->com.remote_addr));
+
+ /* send connect request to rnic */
+ err = send_connect(ep);
+ if (!err)
+ goto out;
+
+ l2t_release(h->rdev.t3cdev_p, ep->l2t);
+fail4:
+ dst_release(ep->dst);
+fail3:
+ cxgb3_free_atid(ep->com.tdev, ep->atid);
+fail2:
+ cm_id->rem_ref(cm_id);
+ put_ep(&ep->com);
+out:
+ return err;
+}
+
+int iwch_create_listen(struct iw_cm_id *cm_id, int backlog)
+{
+ int err = 0;
+ struct iwch_dev *h = to_iwch_dev(cm_id->device);
+ struct iwch_listen_ep *ep;
+
+ might_sleep();
+
+ if (cm_id->local_addr.ss_family != PF_INET) {
+ err = -ENOSYS;
+ goto fail1;
+ }
+
+ ep = alloc_ep(sizeof(*ep), GFP_KERNEL);
+ if (!ep) {
+ printk(KERN_ERR MOD "%s - cannot alloc ep.\n", __func__);
+ err = -ENOMEM;
+ goto fail1;
+ }
+ PDBG("%s ep %p\n", __func__, ep);
+ ep->com.tdev = h->rdev.t3cdev_p;
+ cm_id->add_ref(cm_id);
+ ep->com.cm_id = cm_id;
+ ep->backlog = backlog;
+ memcpy(&ep->com.local_addr, &cm_id->local_addr,
+ sizeof(ep->com.local_addr));
+
+ /*
+ * Allocate a server TID.
+ */
+ ep->stid = cxgb3_alloc_stid(h->rdev.t3cdev_p, &t3c_client, ep);
+ if (ep->stid == -1) {
+		printk(KERN_ERR MOD "%s - cannot alloc stid.\n", __func__);
+ err = -ENOMEM;
+ goto fail2;
+ }
+
+ state_set(&ep->com, LISTEN);
+ err = listen_start(ep);
+ if (err)
+ goto fail3;
+
+ /* wait for pass_open_rpl */
+ wait_event(ep->com.waitq, ep->com.rpl_done);
+ err = ep->com.rpl_err;
+ if (!err) {
+ cm_id->provider_data = ep;
+ goto out;
+ }
+fail3:
+ cxgb3_free_stid(ep->com.tdev, ep->stid);
+fail2:
+ cm_id->rem_ref(cm_id);
+ put_ep(&ep->com);
+fail1:
+out:
+ return err;
+}
+
+int iwch_destroy_listen(struct iw_cm_id *cm_id)
+{
+ int err;
+ struct iwch_listen_ep *ep = to_listen_ep(cm_id);
+
+ PDBG("%s ep %p\n", __func__, ep);
+
+ might_sleep();
+ state_set(&ep->com, DEAD);
+ ep->com.rpl_done = 0;
+ ep->com.rpl_err = 0;
+ err = listen_stop(ep);
+ if (err)
+ goto done;
+ wait_event(ep->com.waitq, ep->com.rpl_done);
+ cxgb3_free_stid(ep->com.tdev, ep->stid);
+done:
+ err = ep->com.rpl_err;
+ cm_id->rem_ref(cm_id);
+ put_ep(&ep->com);
+ return err;
+}
+
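+/*
+ * Initiate a connection close: an abortive reset if 'abrupt' is set,
+ * otherwise a graceful half-close.  Resources are released immediately
+ * on fatal adapter errors or send failures.
+ */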
+int iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, gfp_t gfp)
+{
+	int ret = 0;
+ unsigned long flags;
+ int close = 0;
+ int fatal = 0;
+ struct t3cdev *tdev;
+ struct cxio_rdev *rdev;
+
+ spin_lock_irqsave(&ep->com.lock, flags);
+
+ PDBG("%s ep %p state %s, abrupt %d\n", __func__, ep,
+ states[ep->com.state], abrupt);
+
+ tdev = (struct t3cdev *)ep->com.tdev;
+ rdev = (struct cxio_rdev *)tdev->ulp;
+ if (cxio_fatal_error(rdev)) {
+ fatal = 1;
+ close_complete_upcall(ep);
+ ep->com.state = DEAD;
+ }
+ switch (ep->com.state) {
+ case MPA_REQ_WAIT:
+ case MPA_REQ_SENT:
+ case MPA_REQ_RCVD:
+ case MPA_REP_SENT:
+ case FPDU_MODE:
+ close = 1;
+ if (abrupt)
+ ep->com.state = ABORTING;
+ else {
+ ep->com.state = CLOSING;
+ start_ep_timer(ep);
+ }
+ set_bit(CLOSE_SENT, &ep->com.flags);
+ break;
+ case CLOSING:
+ if (!test_and_set_bit(CLOSE_SENT, &ep->com.flags)) {
+ close = 1;
+ if (abrupt) {
+ stop_ep_timer(ep);
+ ep->com.state = ABORTING;
+ } else
+ ep->com.state = MORIBUND;
+ }
+ break;
+ case MORIBUND:
+ case ABORTING:
+ case DEAD:
+ PDBG("%s ignoring disconnect ep %p state %u\n",
+ __func__, ep, ep->com.state);
+ break;
+ default:
+ BUG();
+ break;
+ }
+
+ spin_unlock_irqrestore(&ep->com.lock, flags);
+ if (close) {
+ if (abrupt)
+ ret = send_abort(ep, NULL, gfp);
+ else
+ ret = send_halfclose(ep, gfp);
+ if (ret)
+ fatal = 1;
+ }
+ if (fatal)
+ release_ep_resources(ep);
+ return ret;
+}
+
+int iwch_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new,
+ struct l2t_entry *l2t)
+{
+ struct iwch_ep *ep = ctx;
+
+ if (ep->dst != old)
+ return 0;
+
+ PDBG("%s ep %p redirect to dst %p l2t %p\n", __func__, ep, new,
+ l2t);
+ dst_hold(new);
+ l2t_release(ep->com.tdev, ep->l2t);
+ ep->l2t = l2t;
+ dst_release(old);
+ ep->dst = new;
+ return 1;
+}
+
+/*
+ * All the CM events are handled on a work queue to have a safe context.
+ * These are the real handlers that are called from the work queue.
+ */
+static const cxgb3_cpl_handler_func work_handlers[NUM_CPL_CMDS] = {
+ [CPL_ACT_ESTABLISH] = act_establish,
+ [CPL_ACT_OPEN_RPL] = act_open_rpl,
+ [CPL_RX_DATA] = rx_data,
+ [CPL_TX_DMA_ACK] = tx_ack,
+ [CPL_ABORT_RPL_RSS] = abort_rpl,
+ [CPL_ABORT_RPL] = abort_rpl,
+ [CPL_PASS_OPEN_RPL] = pass_open_rpl,
+ [CPL_CLOSE_LISTSRV_RPL] = close_listsrv_rpl,
+ [CPL_PASS_ACCEPT_REQ] = pass_accept_req,
+ [CPL_PASS_ESTABLISH] = pass_establish,
+ [CPL_PEER_CLOSE] = peer_close,
+ [CPL_ABORT_REQ_RSS] = peer_abort,
+ [CPL_CLOSE_CON_RPL] = close_con_rpl,
+ [CPL_RDMA_TERMINATE] = terminate,
+ [CPL_RDMA_EC_STATUS] = ec_status,
+};
+
+static void process_work(struct work_struct *work)
+{
+ struct sk_buff *skb = NULL;
+ void *ep;
+ struct t3cdev *tdev;
+ int ret;
+
+ while ((skb = skb_dequeue(&rxq))) {
+ ep = *((void **) (skb->cb));
+ tdev = *((struct t3cdev **) (skb->cb + sizeof(void *)));
+ ret = work_handlers[G_OPCODE(ntohl((__force __be32)skb->csum))](tdev, skb, ep);
+ if (ret & CPL_RET_BUF_DONE)
+ kfree_skb(skb);
+
+ /*
+ * ep was referenced in sched(), and is freed here.
+ */
+ put_ep((struct iwch_ep_common *)ep);
+ }
+}
+
+static DECLARE_WORK(skb_work, process_work);
+
+static int sched(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+ struct iwch_ep_common *epc = ctx;
+
+ get_ep(epc);
+
+ /*
+ * Save ctx and tdev in the skb->cb area.
+ */
+ *((void **) skb->cb) = ctx;
+ *((struct t3cdev **) (skb->cb + sizeof(void *))) = tdev;
+
+ /*
+ * Queue the skb and schedule the worker thread.
+ */
+ skb_queue_tail(&rxq, skb);
+ queue_work(workq, &skb_work);
+ return 0;
+}
+
+static int set_tcb_rpl(struct t3cdev *tdev, struct sk_buff *skb, void *ctx)
+{
+ struct cpl_set_tcb_rpl *rpl = cplhdr(skb);
+
+ if (rpl->status != CPL_ERR_NONE) {
+ printk(KERN_ERR MOD "Unexpected SET_TCB_RPL status %u "
+ "for tid %u\n", rpl->status, GET_TID(rpl));
+ }
+ return CPL_RET_BUF_DONE;
+}
+
+/*
+ * All upcalls from the T3 Core go to sched() to schedule the
+ * processing on a work queue.
+ */
+cxgb3_cpl_handler_func t3c_handlers[NUM_CPL_CMDS] = {
+ [CPL_ACT_ESTABLISH] = sched,
+ [CPL_ACT_OPEN_RPL] = sched,
+ [CPL_RX_DATA] = sched,
+ [CPL_TX_DMA_ACK] = sched,
+ [CPL_ABORT_RPL_RSS] = sched,
+ [CPL_ABORT_RPL] = sched,
+ [CPL_PASS_OPEN_RPL] = sched,
+ [CPL_CLOSE_LISTSRV_RPL] = sched,
+ [CPL_PASS_ACCEPT_REQ] = sched,
+ [CPL_PASS_ESTABLISH] = sched,
+ [CPL_PEER_CLOSE] = sched,
+ [CPL_CLOSE_CON_RPL] = sched,
+ [CPL_ABORT_REQ_RSS] = sched,
+ [CPL_RDMA_TERMINATE] = sched,
+ [CPL_RDMA_EC_STATUS] = sched,
+ [CPL_SET_TCB_RPL] = set_tcb_rpl,
+};
+
+int __init iwch_cm_init(void)
+{
+ skb_queue_head_init(&rxq);
+
+ workq = create_singlethread_workqueue("iw_cxgb3");
+ if (!workq)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void __exit iwch_cm_term(void)
+{
+ flush_workqueue(workq);
+ destroy_workqueue(workq);
+}
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.h b/drivers/infiniband/hw/cxgb3/iwch_cm.h
new file mode 100644
index 000000000..b9efadfff
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/iwch_cm.h
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _IWCH_CM_H_
+#define _IWCH_CM_H_
+
+#include <linux/inet.h>
+#include <linux/wait.h>
+#include <linux/spinlock.h>
+#include <linux/kref.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/iw_cm.h>
+
+#include "cxgb3_offload.h"
+#include "iwch_provider.h"
+
+#define MPA_KEY_REQ "MPA ID Req Frame"
+#define MPA_KEY_REP "MPA ID Rep Frame"
+
+#define MPA_MAX_PRIVATE_DATA 256
+#define MPA_REV 0 /* XXX - amso1100 uses rev 0 ! */
+#define MPA_REJECT 0x20
+#define MPA_CRC 0x40
+#define MPA_MARKERS 0x80
+#define MPA_FLAGS_MASK 0xE0
+
+#define put_ep(ep) { \
+ PDBG("put_ep (via %s:%u) ep %p refcnt %d\n", __func__, __LINE__, \
+ ep, atomic_read(&((ep)->kref.refcount))); \
+ WARN_ON(atomic_read(&((ep)->kref.refcount)) < 1); \
+ kref_put(&((ep)->kref), __free_ep); \
+}
+
+#define get_ep(ep) { \
+ PDBG("get_ep (via %s:%u) ep %p, refcnt %d\n", __func__, __LINE__, \
+ ep, atomic_read(&((ep)->kref.refcount))); \
+ kref_get(&((ep)->kref)); \
+}
+
+struct mpa_message {
+ u8 key[16];
+ u8 flags;
+ u8 revision;
+ __be16 private_data_size;
+ u8 private_data[0];
+};
+
+struct terminate_message {
+ u8 layer_etype;
+ u8 ecode;
+ __be16 hdrct_rsvd;
+ u8 len_hdrs[0];
+};
+
+#define TERM_MAX_LENGTH (sizeof(struct terminate_message) + 2 + 18 + 28)
+
+enum iwch_layers_types {
+ LAYER_RDMAP = 0x00,
+ LAYER_DDP = 0x10,
+ LAYER_MPA = 0x20,
+ RDMAP_LOCAL_CATA = 0x00,
+ RDMAP_REMOTE_PROT = 0x01,
+ RDMAP_REMOTE_OP = 0x02,
+ DDP_LOCAL_CATA = 0x00,
+ DDP_TAGGED_ERR = 0x01,
+ DDP_UNTAGGED_ERR = 0x02,
+ DDP_LLP = 0x03
+};
+
+enum iwch_rdma_ecodes {
+ RDMAP_INV_STAG = 0x00,
+ RDMAP_BASE_BOUNDS = 0x01,
+ RDMAP_ACC_VIOL = 0x02,
+ RDMAP_STAG_NOT_ASSOC = 0x03,
+ RDMAP_TO_WRAP = 0x04,
+ RDMAP_INV_VERS = 0x05,
+ RDMAP_INV_OPCODE = 0x06,
+ RDMAP_STREAM_CATA = 0x07,
+ RDMAP_GLOBAL_CATA = 0x08,
+ RDMAP_CANT_INV_STAG = 0x09,
+ RDMAP_UNSPECIFIED = 0xff
+};
+
+enum iwch_ddp_ecodes {
+ DDPT_INV_STAG = 0x00,
+ DDPT_BASE_BOUNDS = 0x01,
+ DDPT_STAG_NOT_ASSOC = 0x02,
+ DDPT_TO_WRAP = 0x03,
+ DDPT_INV_VERS = 0x04,
+ DDPU_INV_QN = 0x01,
+ DDPU_INV_MSN_NOBUF = 0x02,
+ DDPU_INV_MSN_RANGE = 0x03,
+ DDPU_INV_MO = 0x04,
+ DDPU_MSG_TOOBIG = 0x05,
+ DDPU_INV_VERS = 0x06
+};
+
+enum iwch_mpa_ecodes {
+ MPA_CRC_ERR = 0x02,
+ MPA_MARKER_ERR = 0x03
+};
+
+enum iwch_ep_state {
+ IDLE = 0,
+ LISTEN,
+ CONNECTING,
+ MPA_REQ_WAIT,
+ MPA_REQ_SENT,
+ MPA_REQ_RCVD,
+ MPA_REP_SENT,
+ FPDU_MODE,
+ ABORTING,
+ CLOSING,
+ MORIBUND,
+ DEAD,
+};
+
+enum iwch_ep_flags {
+ PEER_ABORT_IN_PROGRESS = 0,
+ ABORT_REQ_IN_PROGRESS = 1,
+ RELEASE_RESOURCES = 2,
+ CLOSE_SENT = 3,
+};
+
+struct iwch_ep_common {
+ struct iw_cm_id *cm_id;
+ struct iwch_qp *qp;
+ struct t3cdev *tdev;
+ enum iwch_ep_state state;
+ struct kref kref;
+ spinlock_t lock;
+ struct sockaddr_in local_addr;
+ struct sockaddr_in remote_addr;
+ wait_queue_head_t waitq;
+ int rpl_done;
+ int rpl_err;
+ unsigned long flags;
+};
+
+struct iwch_listen_ep {
+ struct iwch_ep_common com;
+ unsigned int stid;
+ int backlog;
+};
+
+struct iwch_ep {
+ struct iwch_ep_common com;
+ struct iwch_ep *parent_ep;
+ struct timer_list timer;
+ unsigned int atid;
+ u32 hwtid;
+ u32 snd_seq;
+ u32 rcv_seq;
+ struct l2t_entry *l2t;
+ struct dst_entry *dst;
+ struct sk_buff *mpa_skb;
+ struct iwch_mpa_attributes mpa_attr;
+ unsigned int mpa_pkt_len;
+ u8 mpa_pkt[sizeof(struct mpa_message) + MPA_MAX_PRIVATE_DATA];
+ u8 tos;
+ u16 emss;
+ u16 plen;
+ u32 ird;
+ u32 ord;
+};
+
+static inline struct iwch_ep *to_ep(struct iw_cm_id *cm_id)
+{
+ return cm_id->provider_data;
+}
+
+static inline struct iwch_listen_ep *to_listen_ep(struct iw_cm_id *cm_id)
+{
+ return cm_id->provider_data;
+}
+
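+/*
+ * Smallest TCP window-scale shift needed for a 16-bit window field to
+ * cover 'win' bytes, capped at 14.
+ */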
+static inline int compute_wscale(int win)
+{
+ int wscale = 0;
+
+ while (wscale < 14 && (65535<<wscale) < win)
+ wscale++;
+ return wscale;
+}
+
+/* CM prototypes */
+
+int iwch_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param);
+int iwch_create_listen(struct iw_cm_id *cm_id, int backlog);
+int iwch_destroy_listen(struct iw_cm_id *cm_id);
+int iwch_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len);
+int iwch_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param);
+int iwch_ep_disconnect(struct iwch_ep *ep, int abrupt, gfp_t gfp);
+int iwch_quiesce_tid(struct iwch_ep *ep);
+int iwch_resume_tid(struct iwch_ep *ep);
+void __free_ep(struct kref *kref);
+void iwch_rearp(struct iwch_ep *ep);
+int iwch_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new, struct l2t_entry *l2t);
+
+int __init iwch_cm_init(void);
+void __exit iwch_cm_term(void);
+extern int peer2peer;
+
+#endif /* _IWCH_CM_H_ */
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cq.c b/drivers/infiniband/hw/cxgb3/iwch_cq.c
new file mode 100644
index 000000000..cf5474ae6
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/iwch_cq.c
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "iwch_provider.h"
+#include "iwch.h"
+
+/*
+ * Get one cq entry from cxio and map it to openib.
+ *
+ * Returns:
+ * 0 cq empty
+ * 1 cqe returned
+ * -EAGAIN caller must try again
+ * any other -errno fatal error
+ */
+static int iwch_poll_cq_one(struct iwch_dev *rhp, struct iwch_cq *chp,
+ struct ib_wc *wc)
+{
+ struct iwch_qp *qhp = NULL;
+ struct t3_cqe cqe, *rd_cqe;
+ struct t3_wq *wq;
+ u32 credit = 0;
+ u8 cqe_flushed;
+ u64 cookie;
+ int ret = 1;
+
+ rd_cqe = cxio_next_cqe(&chp->cq);
+
+ if (!rd_cqe)
+ return 0;
+
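+ /* The QP may already be gone; if so, poll with a NULL wq. */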
+ qhp = get_qhp(rhp, CQE_QPID(*rd_cqe));
+ if (!qhp)
+ wq = NULL;
+ else {
+ spin_lock(&qhp->lock);
+ wq = &(qhp->wq);
+ }
+ ret = cxio_poll_cq(wq, &(chp->cq), &cqe, &cqe_flushed, &cookie,
+ &credit);
+ if (t3a_device(chp->rhp) && credit) {
+ PDBG("%s updating %d cq credits on id %d\n", __func__,
+ credit, chp->cq.cqid);
+ cxio_hal_cq_op(&rhp->rdev, &chp->cq, CQ_CREDIT_UPDATE, credit);
+ }
+
+ if (ret) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ ret = 1;
+
+ wc->wr_id = cookie;
+ wc->qp = &qhp->ibqp;
+ wc->vendor_err = CQE_STATUS(cqe);
+ wc->wc_flags = 0;
+
+ PDBG("%s qpid 0x%x type %d opcode %d status 0x%x wrid hi 0x%x "
+ "lo 0x%x cookie 0x%llx\n", __func__,
+ CQE_QPID(cqe), CQE_TYPE(cqe),
+ CQE_OPCODE(cqe), CQE_STATUS(cqe), CQE_WRID_HI(cqe),
+ CQE_WRID_LOW(cqe), (unsigned long long) cookie);
+
+ if (CQE_TYPE(cqe) == 0) {
+ if (!CQE_STATUS(cqe))
+ wc->byte_len = CQE_LEN(cqe);
+ else
+ wc->byte_len = 0;
+ wc->opcode = IB_WC_RECV;
+ if (CQE_OPCODE(cqe) == T3_SEND_WITH_INV ||
+ CQE_OPCODE(cqe) == T3_SEND_WITH_SE_INV) {
+ wc->ex.invalidate_rkey = CQE_WRID_STAG(cqe);
+ wc->wc_flags |= IB_WC_WITH_INVALIDATE;
+ }
+ } else {
+ switch (CQE_OPCODE(cqe)) {
+ case T3_RDMA_WRITE:
+ wc->opcode = IB_WC_RDMA_WRITE;
+ break;
+ case T3_READ_REQ:
+ wc->opcode = IB_WC_RDMA_READ;
+ wc->byte_len = CQE_LEN(cqe);
+ break;
+ case T3_SEND:
+ case T3_SEND_WITH_SE:
+ case T3_SEND_WITH_INV:
+ case T3_SEND_WITH_SE_INV:
+ wc->opcode = IB_WC_SEND;
+ break;
+ case T3_BIND_MW:
+ wc->opcode = IB_WC_BIND_MW;
+ break;
+
+ case T3_LOCAL_INV:
+ wc->opcode = IB_WC_LOCAL_INV;
+ break;
+ case T3_FAST_REGISTER:
+ wc->opcode = IB_WC_FAST_REG_MR;
+ break;
+ default:
+ printk(KERN_ERR MOD "Unexpected opcode %d "
+ "in the CQE received for QPID=0x%0x\n",
+ CQE_OPCODE(cqe), CQE_QPID(cqe));
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ if (cqe_flushed)
+ wc->status = IB_WC_WR_FLUSH_ERR;
+ else {
+
+ switch (CQE_STATUS(cqe)) {
+ case TPT_ERR_SUCCESS:
+ wc->status = IB_WC_SUCCESS;
+ break;
+ case TPT_ERR_STAG:
+ wc->status = IB_WC_LOC_ACCESS_ERR;
+ break;
+ case TPT_ERR_PDID:
+ wc->status = IB_WC_LOC_PROT_ERR;
+ break;
+ case TPT_ERR_QPID:
+ case TPT_ERR_ACCESS:
+ wc->status = IB_WC_LOC_ACCESS_ERR;
+ break;
+ case TPT_ERR_WRAP:
+ wc->status = IB_WC_GENERAL_ERR;
+ break;
+ case TPT_ERR_BOUND:
+ wc->status = IB_WC_LOC_LEN_ERR;
+ break;
+ case TPT_ERR_INVALIDATE_SHARED_MR:
+ case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND:
+ wc->status = IB_WC_MW_BIND_ERR;
+ break;
+ case TPT_ERR_CRC:
+ case TPT_ERR_MARKER:
+ case TPT_ERR_PDU_LEN_ERR:
+ case TPT_ERR_OUT_OF_RQE:
+ case TPT_ERR_DDP_VERSION:
+ case TPT_ERR_RDMA_VERSION:
+ case TPT_ERR_DDP_QUEUE_NUM:
+ case TPT_ERR_MSN:
+ case TPT_ERR_TBIT:
+ case TPT_ERR_MO:
+ case TPT_ERR_MSN_RANGE:
+ case TPT_ERR_IRD_OVERFLOW:
+ case TPT_ERR_OPCODE:
+ wc->status = IB_WC_FATAL_ERR;
+ break;
+ case TPT_ERR_SWFLUSH:
+ wc->status = IB_WC_WR_FLUSH_ERR;
+ break;
+ default:
+ printk(KERN_ERR MOD "Unexpected cqe_status 0x%x for "
+ "QPID=0x%0x\n", CQE_STATUS(cqe), CQE_QPID(cqe));
+ ret = -EINVAL;
+ }
+ }
+out:
+ if (wq)
+ spin_unlock(&qhp->lock);
+ return ret;
+}
+
+int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
+{
+ struct iwch_dev *rhp;
+ struct iwch_cq *chp;
+ unsigned long flags;
+ int npolled;
+ int err = 0;
+
+ chp = to_iwch_cq(ibcq);
+ rhp = chp->rhp;
+
+ spin_lock_irqsave(&chp->lock, flags);
+ for (npolled = 0; npolled < num_entries; ++npolled) {
+#ifdef DEBUG
+ int i = 0;
+#endif
+
+ /*
+ * Because T3 can post CQEs that are _not_ associated
+ * with a WR, we might have to poll again after removing
+ * one of these.
+ */
+ do {
+ err = iwch_poll_cq_one(rhp, chp, wc + npolled);
+#ifdef DEBUG
+ BUG_ON(++i > 1000);
+#endif
+ } while (err == -EAGAIN);
+ if (err <= 0)
+ break;
+ }
+ spin_unlock_irqrestore(&chp->lock, flags);
+
+ if (err < 0)
+ return err;
+ return npolled;
+}
diff --git a/drivers/infiniband/hw/cxgb3/iwch_ev.c b/drivers/infiniband/hw/cxgb3/iwch_ev.c
new file mode 100644
index 000000000..abcc9e769
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/iwch_ev.c
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/gfp.h>
+#include <linux/mman.h>
+#include <net/sock.h>
+#include "iwch_provider.h"
+#include "iwch.h"
+#include "iwch_cm.h"
+#include "cxio_hal.h"
+#include "cxio_wr.h"
+
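+/*
+ * Post an affiliated asynchronous error event: if the QP is still in
+ * RTS, move it to TERMINATE (optionally posting a TERMINATE message),
+ * then run the QP's event handler and the CQ's completion handler.
+ */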
+static void post_qp_event(struct iwch_dev *rnicp, struct iwch_cq *chp,
+ struct respQ_msg_t *rsp_msg,
+ enum ib_event_type ib_event,
+ int send_term)
+{
+ struct ib_event event;
+ struct iwch_qp_attributes attrs;
+ struct iwch_qp *qhp;
+ unsigned long flag;
+
+ spin_lock(&rnicp->lock);
+ qhp = get_qhp(rnicp, CQE_QPID(rsp_msg->cqe));
+
+ if (!qhp) {
+ printk(KERN_ERR "%s unaffiliated error 0x%x qpid 0x%x\n",
+ __func__, CQE_STATUS(rsp_msg->cqe),
+ CQE_QPID(rsp_msg->cqe));
+ spin_unlock(&rnicp->lock);
+ return;
+ }
+
+ if ((qhp->attr.state == IWCH_QP_STATE_ERROR) ||
+ (qhp->attr.state == IWCH_QP_STATE_TERMINATE)) {
+ PDBG("%s AE received after RTS - "
+ "qp state %d qpid 0x%x status 0x%x\n", __func__,
+ qhp->attr.state, qhp->wq.qpid, CQE_STATUS(rsp_msg->cqe));
+ spin_unlock(&rnicp->lock);
+ return;
+ }
+
+ printk(KERN_ERR "%s - AE qpid 0x%x opcode %d status 0x%x "
+ "type %d wrid.hi 0x%x wrid.lo 0x%x \n", __func__,
+ CQE_QPID(rsp_msg->cqe), CQE_OPCODE(rsp_msg->cqe),
+ CQE_STATUS(rsp_msg->cqe), CQE_TYPE(rsp_msg->cqe),
+ CQE_WRID_HI(rsp_msg->cqe), CQE_WRID_LOW(rsp_msg->cqe));
+
+ atomic_inc(&qhp->refcnt);
+ spin_unlock(&rnicp->lock);
+
+ if (qhp->attr.state == IWCH_QP_STATE_RTS) {
+ attrs.next_state = IWCH_QP_STATE_TERMINATE;
+ iwch_modify_qp(qhp->rhp, qhp, IWCH_QP_ATTR_NEXT_STATE,
+ &attrs, 1);
+ if (send_term)
+ iwch_post_terminate(qhp, rsp_msg);
+ }
+
+ event.event = ib_event;
+ event.device = chp->ibcq.device;
+ if (ib_event == IB_EVENT_CQ_ERR)
+ event.element.cq = &chp->ibcq;
+ else
+ event.element.qp = &qhp->ibqp;
+
+ if (qhp->ibqp.event_handler)
+ (*qhp->ibqp.event_handler)(&event, qhp->ibqp.qp_context);
+
+ spin_lock_irqsave(&chp->comp_handler_lock, flag);
+ (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context);
+ spin_unlock_irqrestore(&chp->comp_handler_lock, flag);
+
+ if (atomic_dec_and_test(&qhp->refcnt))
+ wake_up(&qhp->wait);
+}
+
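+/*
+ * Async event entry point from cxio: look up the CQ and QP named by the
+ * response-queue message, take references on both, and translate the
+ * CQE opcode/status into the appropriate action (disconnect, completion,
+ * or IB async event).
+ */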
+void iwch_ev_dispatch(struct cxio_rdev *rdev_p, struct sk_buff *skb)
+{
+ struct iwch_dev *rnicp;
+ struct respQ_msg_t *rsp_msg = (struct respQ_msg_t *) skb->data;
+ struct iwch_cq *chp;
+ struct iwch_qp *qhp;
+ u32 cqid = RSPQ_CQID(rsp_msg);
+ unsigned long flag;
+
+ rnicp = (struct iwch_dev *) rdev_p->ulp;
+ spin_lock(&rnicp->lock);
+ chp = get_chp(rnicp, cqid);
+ qhp = get_qhp(rnicp, CQE_QPID(rsp_msg->cqe));
+ if (!chp || !qhp) {
+ printk(KERN_ERR MOD "BAD AE cqid 0x%x qpid 0x%x opcode %d "
+ "status 0x%x type %d wrid.hi 0x%x wrid.lo 0x%x \n",
+ cqid, CQE_QPID(rsp_msg->cqe),
+ CQE_OPCODE(rsp_msg->cqe), CQE_STATUS(rsp_msg->cqe),
+ CQE_TYPE(rsp_msg->cqe), CQE_WRID_HI(rsp_msg->cqe),
+ CQE_WRID_LOW(rsp_msg->cqe));
+ spin_unlock(&rnicp->lock);
+ goto out;
+ }
+ iwch_qp_add_ref(&qhp->ibqp);
+ atomic_inc(&chp->refcnt);
+ spin_unlock(&rnicp->lock);
+
+ /*
+ * A TERMINATE CQE with zero status is either:
+ * 1) completion of our sending a TERMINATE, or
+ * 2) an incoming TERMINATE message.
+ */
+ if ((CQE_OPCODE(rsp_msg->cqe) == T3_TERMINATE) &&
+ (CQE_STATUS(rsp_msg->cqe) == 0)) {
+ if (SQ_TYPE(rsp_msg->cqe)) {
+ PDBG("%s QPID 0x%x ep %p disconnecting\n",
+ __func__, qhp->wq.qpid, qhp->ep);
+ iwch_ep_disconnect(qhp->ep, 0, GFP_ATOMIC);
+ } else {
+ PDBG("%s post REQ_ERR AE QPID 0x%x\n", __func__,
+ qhp->wq.qpid);
+ post_qp_event(rnicp, chp, rsp_msg,
+ IB_EVENT_QP_REQ_ERR, 0);
+ iwch_ep_disconnect(qhp->ep, 0, GFP_ATOMIC);
+ }
+ goto done;
+ }
+
+ /* Bad incoming Read request */
+ if (SQ_TYPE(rsp_msg->cqe) &&
+ (CQE_OPCODE(rsp_msg->cqe) == T3_READ_RESP)) {
+ post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_REQ_ERR, 1);
+ goto done;
+ }
+
+ /* Bad incoming write */
+ if (RQ_TYPE(rsp_msg->cqe) &&
+ (CQE_OPCODE(rsp_msg->cqe) == T3_RDMA_WRITE)) {
+ post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_REQ_ERR, 1);
+ goto done;
+ }
+
+ switch (CQE_STATUS(rsp_msg->cqe)) {
+
+ /* Completion Events */
+ case TPT_ERR_SUCCESS:
+
+ /*
+ * Confirm the destination entry on a successful SQ completion.
+ */
+ if (qhp->ep && SQ_TYPE(rsp_msg->cqe))
+ dst_confirm(qhp->ep->dst);
+ spin_lock_irqsave(&chp->comp_handler_lock, flag);
+ (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context);
+ spin_unlock_irqrestore(&chp->comp_handler_lock, flag);
+ break;
+
+ case TPT_ERR_STAG:
+ case TPT_ERR_PDID:
+ case TPT_ERR_QPID:
+ case TPT_ERR_ACCESS:
+ case TPT_ERR_WRAP:
+ case TPT_ERR_BOUND:
+ case TPT_ERR_INVALIDATE_SHARED_MR:
+ case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND:
+ post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_ACCESS_ERR, 1);
+ break;
+
+ /* Device Fatal Errors */
+ case TPT_ERR_ECC:
+ case TPT_ERR_ECC_PSTAG:
+ case TPT_ERR_INTERNAL_ERR:
+ post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_DEVICE_FATAL, 1);
+ break;
+
+ /* QP Fatal Errors */
+ case TPT_ERR_OUT_OF_RQE:
+ case TPT_ERR_PBL_ADDR_BOUND:
+ case TPT_ERR_CRC:
+ case TPT_ERR_MARKER:
+ case TPT_ERR_PDU_LEN_ERR:
+ case TPT_ERR_DDP_VERSION:
+ case TPT_ERR_RDMA_VERSION:
+ case TPT_ERR_OPCODE:
+ case TPT_ERR_DDP_QUEUE_NUM:
+ case TPT_ERR_MSN:
+ case TPT_ERR_TBIT:
+ case TPT_ERR_MO:
+ case TPT_ERR_MSN_GAP:
+ case TPT_ERR_MSN_RANGE:
+ case TPT_ERR_RQE_ADDR_BOUND:
+ case TPT_ERR_IRD_OVERFLOW:
+ post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_FATAL, 1);
+ break;
+
+ default:
+ printk(KERN_ERR MOD "Unknown T3 status 0x%x QPID 0x%x\n",
+ CQE_STATUS(rsp_msg->cqe), qhp->wq.qpid);
+ post_qp_event(rnicp, chp, rsp_msg, IB_EVENT_QP_FATAL, 1);
+ break;
+ }
+done:
+ if (atomic_dec_and_test(&chp->refcnt))
+ wake_up(&chp->wait);
+ iwch_qp_rem_ref(&qhp->ibqp);
+out:
+ dev_kfree_skb_irq(skb);
+}
diff --git a/drivers/infiniband/hw/cxgb3/iwch_mem.c b/drivers/infiniband/hw/cxgb3/iwch_mem.c
new file mode 100644
index 000000000..5c36ee280
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/iwch_mem.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/slab.h>
+#include <asm/byteorder.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+
+#include "cxio_hal.h"
+#include "cxio_resource.h"
+#include "iwch.h"
+#include "iwch_provider.h"
+
+static int iwch_finish_mem_reg(struct iwch_mr *mhp, u32 stag)
+{
+ u32 mmid;
+
+ mhp->attr.state = 1;
+ mhp->attr.stag = stag;
+ mmid = stag >> 8;
+ mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
+ PDBG("%s mmid 0x%x mhp %p\n", __func__, mmid, mhp);
+ return insert_handle(mhp->rhp, &mhp->rhp->mmidr, mhp, mmid);
+}
+
+int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php,
+ struct iwch_mr *mhp, int shift)
+{
+ u32 stag;
+ int ret;
+
+ if (cxio_register_phys_mem(&rhp->rdev,
+ &stag, mhp->attr.pdid,
+ mhp->attr.perms,
+ mhp->attr.zbva,
+ mhp->attr.va_fbo,
+ mhp->attr.len,
+ shift - 12,
+ mhp->attr.pbl_size, mhp->attr.pbl_addr))
+ return -ENOMEM;
+
+ ret = iwch_finish_mem_reg(mhp, stag);
+ if (ret)
+ cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
+ mhp->attr.pbl_addr);
+ return ret;
+}
+
+int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php,
+ struct iwch_mr *mhp,
+ int shift,
+ int npages)
+{
+ u32 stag;
+ int ret;
+
+ /* We could support this... */
+ if (npages > mhp->attr.pbl_size)
+ return -ENOMEM;
+
+ stag = mhp->attr.stag;
+ if (cxio_reregister_phys_mem(&rhp->rdev,
+ &stag, mhp->attr.pdid,
+ mhp->attr.perms,
+ mhp->attr.zbva,
+ mhp->attr.va_fbo,
+ mhp->attr.len,
+ shift - 12,
+ mhp->attr.pbl_size, mhp->attr.pbl_addr))
+ return -ENOMEM;
+
+ ret = iwch_finish_mem_reg(mhp, stag);
+ if (ret)
+ cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
+ mhp->attr.pbl_addr);
+
+ return ret;
+}
+
+int iwch_alloc_pbl(struct iwch_mr *mhp, int npages)
+{
+ mhp->attr.pbl_addr = cxio_hal_pblpool_alloc(&mhp->rhp->rdev,
+ npages << 3);
+
+ if (!mhp->attr.pbl_addr)
+ return -ENOMEM;
+
+ mhp->attr.pbl_size = npages;
+
+ return 0;
+}
+
+void iwch_free_pbl(struct iwch_mr *mhp)
+{
+ cxio_hal_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr,
+ mhp->attr.pbl_size << 3);
+}
+
+int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset)
+{
+ return cxio_write_pbl(&mhp->rhp->rdev, pages,
+ mhp->attr.pbl_addr + (offset << 3), npages);
+}
+
+int build_phys_page_list(struct ib_phys_buf *buffer_list,
+ int num_phys_buf,
+ u64 *iova_start,
+ u64 *total_size,
+ int *npages,
+ int *shift,
+ __be64 **page_list)
+{
+ u64 mask;
+ int i, j, n;
+
+ mask = 0;
+ *total_size = 0;
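+ /*
+ * Accumulate every buffer start and end address into 'mask'; its
+ * lowest set bit at or above PAGE_SHIFT gives the largest page size
+ * to which all of the buffer boundaries are aligned.
+ */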
+ for (i = 0; i < num_phys_buf; ++i) {
+ if (i != 0 && buffer_list[i].addr & ~PAGE_MASK)
+ return -EINVAL;
+ if (i != 0 && i != num_phys_buf - 1 &&
+ (buffer_list[i].size & ~PAGE_MASK))
+ return -EINVAL;
+ *total_size += buffer_list[i].size;
+ if (i > 0)
+ mask |= buffer_list[i].addr;
+ else
+ mask |= buffer_list[i].addr & PAGE_MASK;
+ if (i != num_phys_buf - 1)
+ mask |= buffer_list[i].addr + buffer_list[i].size;
+ else
+ mask |= (buffer_list[i].addr + buffer_list[i].size +
+ PAGE_SIZE - 1) & PAGE_MASK;
+ }
+
+ if (*total_size > 0xFFFFFFFFULL)
+ return -ENOMEM;
+
+ /* Find largest page shift we can use to cover buffers */
+ for (*shift = PAGE_SHIFT; *shift < 27; ++(*shift))
+ if ((1ULL << *shift) & mask)
+ break;
+
+ buffer_list[0].size += buffer_list[0].addr & ((1ULL << *shift) - 1);
+ buffer_list[0].addr &= ~0ull << *shift;
+
+ *npages = 0;
+ for (i = 0; i < num_phys_buf; ++i)
+ *npages += (buffer_list[i].size +
+ (1ULL << *shift) - 1) >> *shift;
+
+ if (!*npages)
+ return -EINVAL;
+
+ *page_list = kmalloc(sizeof(u64) * *npages, GFP_KERNEL);
+ if (!*page_list)
+ return -ENOMEM;
+
+ n = 0;
+ for (i = 0; i < num_phys_buf; ++i)
+ for (j = 0;
+ j < (buffer_list[i].size + (1ULL << *shift) - 1) >> *shift;
+ ++j)
+ (*page_list)[n++] = cpu_to_be64(buffer_list[i].addr +
+ ((u64) j << *shift));
+
+ PDBG("%s va 0x%llx mask 0x%llx shift %d len %lld pbl_size %d\n",
+ __func__, (unsigned long long) *iova_start,
+ (unsigned long long) mask, *shift, (unsigned long long) *total_size,
+ *npages);
+
+ return 0;
+
+}
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
new file mode 100644
index 000000000..811b24a53
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -0,0 +1,1467 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/device.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/ethtool.h>
+#include <linux/rtnetlink.h>
+#include <linux/inetdevice.h>
+#include <linux/slab.h>
+
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/byteorder.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "cxio_hal.h"
+#include "iwch.h"
+#include "iwch_provider.h"
+#include "iwch_cm.h"
+#include "iwch_user.h"
+#include "common.h"
+
+static struct ib_ah *iwch_ah_create(struct ib_pd *pd,
+ struct ib_ah_attr *ah_attr)
+{
+ return ERR_PTR(-ENOSYS);
+}
+
+static int iwch_ah_destroy(struct ib_ah *ah)
+{
+ return -ENOSYS;
+}
+
+static int iwch_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ return -ENOSYS;
+}
+
+static int iwch_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ return -ENOSYS;
+}
+
+static int iwch_process_mad(struct ib_device *ibdev,
+ int mad_flags,
+ u8 port_num,
+ struct ib_wc *in_wc,
+ struct ib_grh *in_grh,
+ struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+ return -ENOSYS;
+}
+
+static int iwch_dealloc_ucontext(struct ib_ucontext *context)
+{
+ struct iwch_dev *rhp = to_iwch_dev(context->device);
+ struct iwch_ucontext *ucontext = to_iwch_ucontext(context);
+ struct iwch_mm_entry *mm, *tmp;
+
+ PDBG("%s context %p\n", __func__, context);
+ list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry)
+ kfree(mm);
+ cxio_release_ucontext(&rhp->rdev, &ucontext->uctx);
+ kfree(ucontext);
+ return 0;
+}
+
+static struct ib_ucontext *iwch_alloc_ucontext(struct ib_device *ibdev,
+ struct ib_udata *udata)
+{
+ struct iwch_ucontext *context;
+ struct iwch_dev *rhp = to_iwch_dev(ibdev);
+
+ PDBG("%s ibdev %p\n", __func__, ibdev);
+ context = kzalloc(sizeof(*context), GFP_KERNEL);
+ if (!context)
+ return ERR_PTR(-ENOMEM);
+ cxio_init_ucontext(&rhp->rdev, &context->uctx);
+ INIT_LIST_HEAD(&context->mmaps);
+ spin_lock_init(&context->mmap_lock);
+ return &context->ibucontext;
+}
+
+static int iwch_destroy_cq(struct ib_cq *ib_cq)
+{
+ struct iwch_cq *chp;
+
+ PDBG("%s ib_cq %p\n", __func__, ib_cq);
+ chp = to_iwch_cq(ib_cq);
+
+ remove_handle(chp->rhp, &chp->rhp->cqidr, chp->cq.cqid);
+ atomic_dec(&chp->refcnt);
+ wait_event(chp->wait, !atomic_read(&chp->refcnt));
+
+ cxio_destroy_cq(&chp->rhp->rdev, &chp->cq);
+ kfree(chp);
+ return 0;
+}
+
+static struct ib_cq *iwch_create_cq(struct ib_device *ibdev, int entries, int vector,
+ struct ib_ucontext *ib_context,
+ struct ib_udata *udata)
+{
+ struct iwch_dev *rhp;
+ struct iwch_cq *chp;
+ struct iwch_create_cq_resp uresp;
+ struct iwch_create_cq_req ureq;
+ struct iwch_ucontext *ucontext = NULL;
+ static int warned;
+ size_t resplen;
+
+ PDBG("%s ib_dev %p entries %d\n", __func__, ibdev, entries);
+ rhp = to_iwch_dev(ibdev);
+ chp = kzalloc(sizeof(*chp), GFP_KERNEL);
+ if (!chp)
+ return ERR_PTR(-ENOMEM);
+
+ if (ib_context) {
+ ucontext = to_iwch_ucontext(ib_context);
+ if (!t3a_device(rhp)) {
+ if (ib_copy_from_udata(&ureq, udata, sizeof (ureq))) {
+ kfree(chp);
+ return ERR_PTR(-EFAULT);
+ }
+ chp->user_rptr_addr = (u32 __user *)(unsigned long)ureq.user_rptr_addr;
+ }
+ }
+
+ if (t3a_device(rhp)) {
+
+ /*
+ * T3A: Add some fluff to handle extra CQEs inserted
+ * for various errors.
+ * Additional CQE possibilities:
+ * TERMINATE,
+ * incoming RDMA WRITE failures,
+ * incoming RDMA READ REQUEST failures.
+ * NOTE: We cannot ensure the CQ won't overflow.
+ */
+ entries += 16;
+ }
+ entries = roundup_pow_of_two(entries);
+ chp->cq.size_log2 = ilog2(entries);
+
+ if (cxio_create_cq(&rhp->rdev, &chp->cq, !ucontext)) {
+ kfree(chp);
+ return ERR_PTR(-ENOMEM);
+ }
+ chp->rhp = rhp;
+ chp->ibcq.cqe = 1 << chp->cq.size_log2;
+ spin_lock_init(&chp->lock);
+ spin_lock_init(&chp->comp_handler_lock);
+ atomic_set(&chp->refcnt, 1);
+ init_waitqueue_head(&chp->wait);
+ if (insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid)) {
+ cxio_destroy_cq(&chp->rhp->rdev, &chp->cq);
+ kfree(chp);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ if (ucontext) {
+ struct iwch_mm_entry *mm;
+
+ mm = kmalloc(sizeof *mm, GFP_KERNEL);
+ if (!mm) {
+ iwch_destroy_cq(&chp->ibcq);
+ return ERR_PTR(-ENOMEM);
+ }
+ uresp.cqid = chp->cq.cqid;
+ uresp.size_log2 = chp->cq.size_log2;
+ spin_lock(&ucontext->mmap_lock);
+ uresp.key = ucontext->key;
+ ucontext->key += PAGE_SIZE;
+ spin_unlock(&ucontext->mmap_lock);
+ mm->key = uresp.key;
+ mm->addr = virt_to_phys(chp->cq.queue);
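+ /*
+ * A downlevel libcxgb3 provides a smaller response buffer; detect
+ * it via udata->outlen and fall back to the v0 response layout and
+ * the matching (smaller) mmap length.
+ */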
+ if (udata->outlen < sizeof uresp) {
+ if (!warned++)
+ printk(KERN_WARNING MOD "Warning - "
+ "downlevel libcxgb3 (non-fatal).\n");
+ mm->len = PAGE_ALIGN((1UL << uresp.size_log2) *
+ sizeof(struct t3_cqe));
+ resplen = sizeof(struct iwch_create_cq_resp_v0);
+ } else {
+ mm->len = PAGE_ALIGN(((1UL << uresp.size_log2) + 1) *
+ sizeof(struct t3_cqe));
+ uresp.memsize = mm->len;
+ uresp.reserved = 0;
+ resplen = sizeof uresp;
+ }
+ if (ib_copy_to_udata(udata, &uresp, resplen)) {
+ kfree(mm);
+ iwch_destroy_cq(&chp->ibcq);
+ return ERR_PTR(-EFAULT);
+ }
+ insert_mmap(ucontext, mm);
+ }
+ PDBG("created cqid 0x%0x chp %p size 0x%0x, dma_addr 0x%0llx\n",
+ chp->cq.cqid, chp, (1 << chp->cq.size_log2),
+ (unsigned long long) chp->cq.dma_addr);
+ return &chp->ibcq;
+}
+
+static int iwch_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata)
+{
+#ifdef notyet
+ struct iwch_cq *chp = to_iwch_cq(cq);
+ struct t3_cq oldcq, newcq;
+ int ret;
+
+ PDBG("%s ib_cq %p cqe %d\n", __func__, cq, cqe);
+
+ /* We don't downsize... */
+ if (cqe <= cq->cqe)
+ return 0;
+
+ /* create new t3_cq with new size */
+ cqe = roundup_pow_of_two(cqe+1);
+ newcq.size_log2 = ilog2(cqe);
+
+ /* Don't allow resize to less than the current wce count */
+ if (cqe < Q_COUNT(chp->cq.rptr, chp->cq.wptr)) {
+ return -ENOMEM;
+ }
+
+ /* Quiesce all QPs using this CQ */
+ ret = iwch_quiesce_qps(chp);
+ if (ret) {
+ return ret;
+ }
+
+ ret = cxio_create_cq(&chp->rhp->rdev, &newcq);
+ if (ret) {
+ return ret;
+ }
+
+ /* copy CQEs */
+ memcpy(newcq.queue, chp->cq.queue, (1 << chp->cq.size_log2) *
+ sizeof(struct t3_cqe));
+
+ /* old iwch_qp gets new t3_cq but keeps old cqid */
+ oldcq = chp->cq;
+ chp->cq = newcq;
+ chp->cq.cqid = oldcq.cqid;
+
+ /* resize new t3_cq to update the HW context */
+ ret = cxio_resize_cq(&chp->rhp->rdev, &chp->cq);
+ if (ret) {
+ chp->cq = oldcq;
+ return ret;
+ }
+ chp->ibcq.cqe = (1<<chp->cq.size_log2) - 1;
+
+ /* destroy old t3_cq */
+ oldcq.cqid = newcq.cqid;
+ ret = cxio_destroy_cq(&chp->rhp->rdev, &oldcq);
+ if (ret) {
+ printk(KERN_ERR MOD "%s - cxio_destroy_cq failed %d\n",
+ __func__, ret);
+ }
+
+ /* add user hooks here */
+
+ /* resume qps */
+ ret = iwch_resume_qps(chp);
+ return ret;
+#else
+ return -ENOSYS;
+#endif
+}
+
+static int iwch_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
+{
+ struct iwch_dev *rhp;
+ struct iwch_cq *chp;
+ enum t3_cq_opcode cq_op;
+ int err;
+ unsigned long flag;
+ u32 rptr;
+
+ chp = to_iwch_cq(ibcq);
+ rhp = chp->rhp;
+ if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
+ cq_op = CQ_ARM_SE;
+ else
+ cq_op = CQ_ARM_AN;
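+ /*
+ * For userspace CQs, refresh the kernel copy of the read pointer
+ * from the user-mapped location before arming.
+ */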
+ if (chp->user_rptr_addr) {
+ if (get_user(rptr, chp->user_rptr_addr))
+ return -EFAULT;
+ spin_lock_irqsave(&chp->lock, flag);
+ chp->cq.rptr = rptr;
+ } else
+ spin_lock_irqsave(&chp->lock, flag);
+ PDBG("%s rptr 0x%x\n", __func__, chp->cq.rptr);
+ err = cxio_hal_cq_op(&rhp->rdev, &chp->cq, cq_op, 0);
+ spin_unlock_irqrestore(&chp->lock, flag);
+ if (err < 0)
+ printk(KERN_ERR MOD "Error %d rearming CQID 0x%x\n", err,
+ chp->cq.cqid);
+ if (err > 0 && !(flags & IB_CQ_REPORT_MISSED_EVENTS))
+ err = 0;
+ return err;
+}
+
+static int iwch_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+ int len = vma->vm_end - vma->vm_start;
+ u32 key = vma->vm_pgoff << PAGE_SHIFT;
+ struct cxio_rdev *rdev_p;
+ int ret = 0;
+ struct iwch_mm_entry *mm;
+ struct iwch_ucontext *ucontext;
+ u64 addr;
+
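+ /*
+ * The mmap offset carries back a key handed out at CQ/QP create
+ * time; look it up to find the physical address and map either the
+ * T3 doorbell registers (write-only, uncached) or queue memory.
+ */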
+ PDBG("%s pgoff 0x%lx key 0x%x len %d\n", __func__, vma->vm_pgoff,
+ key, len);
+
+ if (vma->vm_start & (PAGE_SIZE-1)) {
+ return -EINVAL;
+ }
+
+ rdev_p = &(to_iwch_dev(context->device)->rdev);
+ ucontext = to_iwch_ucontext(context);
+
+ mm = remove_mmap(ucontext, key, len);
+ if (!mm)
+ return -EINVAL;
+ addr = mm->addr;
+ kfree(mm);
+
+ if ((addr >= rdev_p->rnic_info.udbell_physbase) &&
+ (addr < (rdev_p->rnic_info.udbell_physbase +
+ rdev_p->rnic_info.udbell_len))) {
+
+ /*
+ * Map T3 DB register.
+ */
+ if (vma->vm_flags & VM_READ) {
+ return -EPERM;
+ }
+
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+ vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
+ vma->vm_flags &= ~VM_MAYREAD;
+ ret = io_remap_pfn_range(vma, vma->vm_start,
+ addr >> PAGE_SHIFT,
+ len, vma->vm_page_prot);
+ } else {
+
+ /*
+ * Map WQ or CQ contig dma memory...
+ */
+ ret = remap_pfn_range(vma, vma->vm_start,
+ addr >> PAGE_SHIFT,
+ len, vma->vm_page_prot);
+ }
+
+ return ret;
+}
+
+static int iwch_deallocate_pd(struct ib_pd *pd)
+{
+ struct iwch_dev *rhp;
+ struct iwch_pd *php;
+
+ php = to_iwch_pd(pd);
+ rhp = php->rhp;
+ PDBG("%s ibpd %p pdid 0x%x\n", __func__, pd, php->pdid);
+ cxio_hal_put_pdid(rhp->rdev.rscp, php->pdid);
+ kfree(php);
+ return 0;
+}
+
+static struct ib_pd *iwch_allocate_pd(struct ib_device *ibdev,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct iwch_pd *php;
+ u32 pdid;
+ struct iwch_dev *rhp;
+
+ PDBG("%s ibdev %p\n", __func__, ibdev);
+ rhp = (struct iwch_dev *) ibdev;
+ pdid = cxio_hal_get_pdid(rhp->rdev.rscp);
+ if (!pdid)
+ return ERR_PTR(-EINVAL);
+ php = kzalloc(sizeof(*php), GFP_KERNEL);
+ if (!php) {
+ cxio_hal_put_pdid(rhp->rdev.rscp, pdid);
+ return ERR_PTR(-ENOMEM);
+ }
+ php->pdid = pdid;
+ php->rhp = rhp;
+ if (context) {
+ if (ib_copy_to_udata(udata, &php->pdid, sizeof (__u32))) {
+ iwch_deallocate_pd(&php->ibpd);
+ return ERR_PTR(-EFAULT);
+ }
+ }
+ PDBG("%s pdid 0x%0x ptr 0x%p\n", __func__, pdid, php);
+ return &php->ibpd;
+}
+
+static int iwch_dereg_mr(struct ib_mr *ib_mr)
+{
+ struct iwch_dev *rhp;
+ struct iwch_mr *mhp;
+ u32 mmid;
+
+ PDBG("%s ib_mr %p\n", __func__, ib_mr);
+ /* There can be no memory windows bound to this MR */
+ if (atomic_read(&ib_mr->usecnt))
+ return -EINVAL;
+
+ mhp = to_iwch_mr(ib_mr);
+ rhp = mhp->rhp;
+ mmid = mhp->attr.stag >> 8;
+ cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
+ mhp->attr.pbl_addr);
+ iwch_free_pbl(mhp);
+ remove_handle(rhp, &rhp->mmidr, mmid);
+ if (mhp->kva)
+ kfree((void *) (unsigned long) mhp->kva);
+ if (mhp->umem)
+ ib_umem_release(mhp->umem);
+ PDBG("%s mmid 0x%x ptr %p\n", __func__, mmid, mhp);
+ kfree(mhp);
+ return 0;
+}
+
+static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd,
+ struct ib_phys_buf *buffer_list,
+ int num_phys_buf,
+ int acc,
+ u64 *iova_start)
+{
+ __be64 *page_list;
+ int shift;
+ u64 total_size;
+ int npages;
+ struct iwch_dev *rhp;
+ struct iwch_pd *php;
+ struct iwch_mr *mhp;
+ int ret;
+
+ PDBG("%s ib_pd %p\n", __func__, pd);
+ php = to_iwch_pd(pd);
+ rhp = php->rhp;
+
+ mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
+ if (!mhp)
+ return ERR_PTR(-ENOMEM);
+
+ mhp->rhp = rhp;
+
+ /* First check that we have enough alignment */
+ if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (num_phys_buf > 1 &&
+ ((buffer_list[0].addr + buffer_list[0].size) & ~PAGE_MASK)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = build_phys_page_list(buffer_list, num_phys_buf, iova_start,
+ &total_size, &npages, &shift, &page_list);
+ if (ret)
+ goto err;
+
+ ret = iwch_alloc_pbl(mhp, npages);
+ if (ret) {
+ kfree(page_list);
+ goto err_pbl;
+ }
+
+ ret = iwch_write_pbl(mhp, page_list, npages, 0);
+ kfree(page_list);
+ if (ret)
+ goto err_pbl;
+
+ mhp->attr.pdid = php->pdid;
+ mhp->attr.zbva = 0;
+
+ mhp->attr.perms = iwch_ib_to_tpt_access(acc);
+ mhp->attr.va_fbo = *iova_start;
+ mhp->attr.page_size = shift - 12;
+
+ mhp->attr.len = (u32) total_size;
+ mhp->attr.pbl_size = npages;
+ ret = iwch_register_mem(rhp, php, mhp, shift);
+ if (ret)
+ goto err_pbl;
+
+ return &mhp->ibmr;
+
+err_pbl:
+ iwch_free_pbl(mhp);
+
+err:
+ kfree(mhp);
+ return ERR_PTR(ret);
+
+}
+
+static int iwch_reregister_phys_mem(struct ib_mr *mr,
+ int mr_rereg_mask,
+ struct ib_pd *pd,
+ struct ib_phys_buf *buffer_list,
+ int num_phys_buf,
+ int acc, u64 * iova_start)
+{
+
+ struct iwch_mr mh, *mhp;
+ struct iwch_pd *php;
+ struct iwch_dev *rhp;
+ __be64 *page_list = NULL;
+ int shift = 0;
+ u64 total_size;
+ int npages = 0;
+ int ret;
+
+ PDBG("%s ib_mr %p ib_pd %p\n", __func__, mr, pd);
+
+ /* There can be no memory windows bound to this MR */
+ if (atomic_read(&mr->usecnt))
+ return -EINVAL;
+
+ mhp = to_iwch_mr(mr);
+ rhp = mhp->rhp;
+ php = to_iwch_pd(mr->pd);
+
+ /* make sure we are on the same adapter */
+ if (rhp != php->rhp)
+ return -EINVAL;
+
+ memcpy(&mh, mhp, sizeof *mhp);
+
+ if (mr_rereg_mask & IB_MR_REREG_PD)
+ php = to_iwch_pd(pd);
+ if (mr_rereg_mask & IB_MR_REREG_ACCESS)
+ mh.attr.perms = iwch_ib_to_tpt_access(acc);
+ if (mr_rereg_mask & IB_MR_REREG_TRANS) {
+ ret = build_phys_page_list(buffer_list, num_phys_buf,
+ iova_start,
+ &total_size, &npages,
+ &shift, &page_list);
+ if (ret)
+ return ret;
+ }
+
+ ret = iwch_reregister_mem(rhp, php, &mh, shift, npages);
+ kfree(page_list);
+ if (ret) {
+ return ret;
+ }
+ if (mr_rereg_mask & IB_MR_REREG_PD)
+ mhp->attr.pdid = php->pdid;
+ if (mr_rereg_mask & IB_MR_REREG_ACCESS)
+ mhp->attr.perms = iwch_ib_to_tpt_access(acc);
+ if (mr_rereg_mask & IB_MR_REREG_TRANS) {
+ mhp->attr.zbva = 0;
+ mhp->attr.va_fbo = *iova_start;
+ mhp->attr.page_size = shift - 12;
+ mhp->attr.len = (u32) total_size;
+ mhp->attr.pbl_size = npages;
+ }
+
+ return 0;
+}
+
+
+static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt, int acc, struct ib_udata *udata)
+{
+ __be64 *pages;
+ int shift, n, len;
+ int i, k, entry;
+ int err = 0;
+ struct iwch_dev *rhp;
+ struct iwch_pd *php;
+ struct iwch_mr *mhp;
+ struct iwch_reg_user_mr_resp uresp;
+ struct scatterlist *sg;
+ PDBG("%s ib_pd %p\n", __func__, pd);
+
+ php = to_iwch_pd(pd);
+ rhp = php->rhp;
+ mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
+ if (!mhp)
+ return ERR_PTR(-ENOMEM);
+
+ mhp->rhp = rhp;
+
+ mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0);
+ if (IS_ERR(mhp->umem)) {
+ err = PTR_ERR(mhp->umem);
+ kfree(mhp);
+ return ERR_PTR(err);
+ }
+
+ shift = ffs(mhp->umem->page_size) - 1;
+
+ n = mhp->umem->nmap;
+
+ err = iwch_alloc_pbl(mhp, n);
+ if (err)
+ goto err;
+
+ pages = (__be64 *) __get_free_page(GFP_KERNEL);
+ if (!pages) {
+ err = -ENOMEM;
+ goto err_pbl;
+ }
+
+ i = n = 0;
+
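+ /*
+ * Walk the umem scatterlist and write the page DMA addresses into
+ * the PBL, one page-sized batch of entries at a time.
+ */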
+ for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) {
+ len = sg_dma_len(sg) >> shift;
+ for (k = 0; k < len; ++k) {
+ pages[i++] = cpu_to_be64(sg_dma_address(sg) +
+ mhp->umem->page_size * k);
+ if (i == PAGE_SIZE / sizeof *pages) {
+ err = iwch_write_pbl(mhp, pages, i, n);
+ if (err)
+ goto pbl_done;
+ n += i;
+ i = 0;
+ }
+ }
+ }
+
+ if (i)
+ err = iwch_write_pbl(mhp, pages, i, n);
+
+pbl_done:
+ free_page((unsigned long) pages);
+ if (err)
+ goto err_pbl;
+
+ mhp->attr.pdid = php->pdid;
+ mhp->attr.zbva = 0;
+ mhp->attr.perms = iwch_ib_to_tpt_access(acc);
+ mhp->attr.va_fbo = virt;
+ mhp->attr.page_size = shift - 12;
+ mhp->attr.len = (u32) length;
+
+ err = iwch_register_mem(rhp, php, mhp, shift);
+ if (err)
+ goto err_pbl;
+
+ if (udata && !t3a_device(rhp)) {
+ uresp.pbl_addr = (mhp->attr.pbl_addr -
+ rhp->rdev.rnic_info.pbl_base) >> 3;
+ PDBG("%s user resp pbl_addr 0x%x\n", __func__,
+ uresp.pbl_addr);
+
+ if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) {
+ iwch_dereg_mr(&mhp->ibmr);
+ err = -EFAULT;
+ goto err;
+ }
+ }
+
+ return &mhp->ibmr;
+
+err_pbl:
+ iwch_free_pbl(mhp);
+
+err:
+ ib_umem_release(mhp->umem);
+ kfree(mhp);
+ return ERR_PTR(err);
+}
+
+static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc)
+{
+ struct ib_phys_buf bl;
+ u64 kva;
+ struct ib_mr *ibmr;
+
+ PDBG("%s ib_pd %p\n", __func__, pd);
+
+ /*
+ * T3 only supports 32 bits of size.
+ */
+ bl.size = 0xffffffff;
+ bl.addr = 0;
+ kva = 0;
+ ibmr = iwch_register_phys_mem(pd, &bl, 1, acc, &kva);
+ return ibmr;
+}
+
+static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
+{
+ struct iwch_dev *rhp;
+ struct iwch_pd *php;
+ struct iwch_mw *mhp;
+ u32 mmid;
+ u32 stag = 0;
+ int ret;
+
+ if (type != IB_MW_TYPE_1)
+ return ERR_PTR(-EINVAL);
+
+ php = to_iwch_pd(pd);
+ rhp = php->rhp;
+ mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
+ if (!mhp)
+ return ERR_PTR(-ENOMEM);
+ ret = cxio_allocate_window(&rhp->rdev, &stag, php->pdid);
+ if (ret) {
+ kfree(mhp);
+ return ERR_PTR(ret);
+ }
+ mhp->rhp = rhp;
+ mhp->attr.pdid = php->pdid;
+ mhp->attr.type = TPT_MW;
+ mhp->attr.stag = stag;
+ mmid = (stag) >> 8;
+ mhp->ibmw.rkey = stag;
+ if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) {
+ cxio_deallocate_window(&rhp->rdev, mhp->attr.stag);
+ kfree(mhp);
+ return ERR_PTR(-ENOMEM);
+ }
+ PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag);
+ return &(mhp->ibmw);
+}
+
+static int iwch_dealloc_mw(struct ib_mw *mw)
+{
+ struct iwch_dev *rhp;
+ struct iwch_mw *mhp;
+ u32 mmid;
+
+ mhp = to_iwch_mw(mw);
+ rhp = mhp->rhp;
+ mmid = (mw->rkey) >> 8;
+ cxio_deallocate_window(&rhp->rdev, mhp->attr.stag);
+ remove_handle(rhp, &rhp->mmidr, mmid);
+ PDBG("%s ib_mw %p mmid 0x%x ptr %p\n", __func__, mw, mmid, mhp);
+ kfree(mhp);
+ return 0;
+}
+
+static struct ib_mr *iwch_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth)
+{
+ struct iwch_dev *rhp;
+ struct iwch_pd *php;
+ struct iwch_mr *mhp;
+ u32 mmid;
+ u32 stag = 0;
+ int ret = 0;
+
+ php = to_iwch_pd(pd);
+ rhp = php->rhp;
+ mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
+ if (!mhp)
+ goto err;
+
+ mhp->rhp = rhp;
+ ret = iwch_alloc_pbl(mhp, pbl_depth);
+ if (ret)
+ goto err1;
+ mhp->attr.pbl_size = pbl_depth;
+ ret = cxio_allocate_stag(&rhp->rdev, &stag, php->pdid,
+ mhp->attr.pbl_size, mhp->attr.pbl_addr);
+ if (ret)
+ goto err2;
+ mhp->attr.pdid = php->pdid;
+ mhp->attr.type = TPT_NON_SHARED_MR;
+ mhp->attr.stag = stag;
+ mhp->attr.state = 1;
+ mmid = (stag) >> 8;
+ mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
+ if (insert_handle(rhp, &rhp->mmidr, mhp, mmid))
+ goto err3;
+
+ PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag);
+ return &(mhp->ibmr);
+err3:
+ cxio_dereg_mem(&rhp->rdev, stag, mhp->attr.pbl_size,
+ mhp->attr.pbl_addr);
+err2:
+ iwch_free_pbl(mhp);
+err1:
+ kfree(mhp);
+err:
+ return ERR_PTR(ret);
+}
+
+static struct ib_fast_reg_page_list *iwch_alloc_fastreg_pbl(
+ struct ib_device *device,
+ int page_list_len)
+{
+ struct ib_fast_reg_page_list *page_list;
+
+ page_list = kmalloc(sizeof *page_list + page_list_len * sizeof(u64),
+ GFP_KERNEL);
+ if (!page_list)
+ return ERR_PTR(-ENOMEM);
+
+ page_list->page_list = (u64 *)(page_list + 1);
+ page_list->max_page_list_len = page_list_len;
+
+ return page_list;
+}
+
+static void iwch_free_fastreg_pbl(struct ib_fast_reg_page_list *page_list)
+{
+ kfree(page_list);
+}
+
+static int iwch_destroy_qp(struct ib_qp *ib_qp)
+{
+ struct iwch_dev *rhp;
+ struct iwch_qp *qhp;
+ struct iwch_qp_attributes attrs;
+ struct iwch_ucontext *ucontext;
+
+ qhp = to_iwch_qp(ib_qp);
+ rhp = qhp->rhp;
+
+ attrs.next_state = IWCH_QP_STATE_ERROR;
+ iwch_modify_qp(rhp, qhp, IWCH_QP_ATTR_NEXT_STATE, &attrs, 0);
+ wait_event(qhp->wait, !qhp->ep);
+
+ remove_handle(rhp, &rhp->qpidr, qhp->wq.qpid);
+
+ atomic_dec(&qhp->refcnt);
+ wait_event(qhp->wait, !atomic_read(&qhp->refcnt));
+
+ ucontext = ib_qp->uobject ? to_iwch_ucontext(ib_qp->uobject->context)
+ : NULL;
+ cxio_destroy_qp(&rhp->rdev, &qhp->wq,
+ ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
+
+ PDBG("%s ib_qp %p qpid 0x%0x qhp %p\n", __func__,
+ ib_qp, qhp->wq.qpid, qhp);
+ kfree(qhp);
+ return 0;
+}
+
+static struct ib_qp *iwch_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *attrs,
+ struct ib_udata *udata)
+{
+ struct iwch_dev *rhp;
+ struct iwch_qp *qhp;
+ struct iwch_pd *php;
+ struct iwch_cq *schp;
+ struct iwch_cq *rchp;
+ struct iwch_create_qp_resp uresp;
+ int wqsize, sqsize, rqsize;
+ struct iwch_ucontext *ucontext;
+
+ PDBG("%s ib_pd %p\n", __func__, pd);
+ if (attrs->qp_type != IB_QPT_RC)
+ return ERR_PTR(-EINVAL);
+ php = to_iwch_pd(pd);
+ rhp = php->rhp;
+ schp = get_chp(rhp, ((struct iwch_cq *) attrs->send_cq)->cq.cqid);
+ rchp = get_chp(rhp, ((struct iwch_cq *) attrs->recv_cq)->cq.cqid);
+ if (!schp || !rchp)
+ return ERR_PTR(-EINVAL);
+
+ /* The RQT size must be # of entries + 1 rounded up to a power of two */
+ rqsize = roundup_pow_of_two(attrs->cap.max_recv_wr);
+ if (rqsize == attrs->cap.max_recv_wr)
+ rqsize = roundup_pow_of_two(attrs->cap.max_recv_wr+1);
+
+ /* T3 doesn't support RQT depth < 16 */
+ if (rqsize < 16)
+ rqsize = 16;
+
+ if (rqsize > T3_MAX_RQ_SIZE)
+ return ERR_PTR(-EINVAL);
+
+ if (attrs->cap.max_inline_data > T3_MAX_INLINE)
+ return ERR_PTR(-EINVAL);
+
+ /*
+ * NOTE: The SQ and total WQ sizes don't need to be
+ * a power of two. However, all the code assumes
+ * they are. EG: Q_FREECNT() and friends.
+ */
+ sqsize = roundup_pow_of_two(attrs->cap.max_send_wr);
+ wqsize = roundup_pow_of_two(rqsize + sqsize);
+
+ /*
+ * Kernel users need more wq space for fastreg WRs which can take
+ * 2 WR fragments.
+ */
+ ucontext = pd->uobject ? to_iwch_ucontext(pd->uobject->context) : NULL;
+ if (!ucontext && wqsize < (rqsize + (2 * sqsize)))
+ wqsize = roundup_pow_of_two(rqsize +
+ roundup_pow_of_two(attrs->cap.max_send_wr * 2));
+ PDBG("%s wqsize %d sqsize %d rqsize %d\n", __func__,
+ wqsize, sqsize, rqsize);
+ qhp = kzalloc(sizeof(*qhp), GFP_KERNEL);
+ if (!qhp)
+ return ERR_PTR(-ENOMEM);
+ qhp->wq.size_log2 = ilog2(wqsize);
+ qhp->wq.rq_size_log2 = ilog2(rqsize);
+ qhp->wq.sq_size_log2 = ilog2(sqsize);
+ if (cxio_create_qp(&rhp->rdev, !udata, &qhp->wq,
+ ucontext ? &ucontext->uctx : &rhp->rdev.uctx)) {
+ kfree(qhp);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ attrs->cap.max_recv_wr = rqsize - 1;
+ attrs->cap.max_send_wr = sqsize;
+ attrs->cap.max_inline_data = T3_MAX_INLINE;
+
+ qhp->rhp = rhp;
+ qhp->attr.pd = php->pdid;
+ qhp->attr.scq = ((struct iwch_cq *) attrs->send_cq)->cq.cqid;
+ qhp->attr.rcq = ((struct iwch_cq *) attrs->recv_cq)->cq.cqid;
+ qhp->attr.sq_num_entries = attrs->cap.max_send_wr;
+ qhp->attr.rq_num_entries = attrs->cap.max_recv_wr;
+ qhp->attr.sq_max_sges = attrs->cap.max_send_sge;
+ qhp->attr.sq_max_sges_rdma_write = attrs->cap.max_send_sge;
+ qhp->attr.rq_max_sges = attrs->cap.max_recv_sge;
+ qhp->attr.state = IWCH_QP_STATE_IDLE;
+ qhp->attr.next_state = IWCH_QP_STATE_IDLE;
+
+ /*
+ * XXX - These don't get passed in from the openib user
+ * at create time. The CM sets them via a QP modify.
+ * Need to fix... I think the CM should
+ */
+ qhp->attr.enable_rdma_read = 1;
+ qhp->attr.enable_rdma_write = 1;
+ qhp->attr.enable_bind = 1;
+ qhp->attr.max_ord = 1;
+ qhp->attr.max_ird = 1;
+
+ spin_lock_init(&qhp->lock);
+ init_waitqueue_head(&qhp->wait);
+ atomic_set(&qhp->refcnt, 1);
+
+ if (insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.qpid)) {
+ cxio_destroy_qp(&rhp->rdev, &qhp->wq,
+ ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
+ kfree(qhp);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ if (udata) {
+
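+ /*
+ * Hand userspace two mmap keys: one for the work queue memory and
+ * one for its doorbell page.
+ */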
+ struct iwch_mm_entry *mm1, *mm2;
+
+ mm1 = kmalloc(sizeof *mm1, GFP_KERNEL);
+ if (!mm1) {
+ iwch_destroy_qp(&qhp->ibqp);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ mm2 = kmalloc(sizeof *mm2, GFP_KERNEL);
+ if (!mm2) {
+ kfree(mm1);
+ iwch_destroy_qp(&qhp->ibqp);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ uresp.qpid = qhp->wq.qpid;
+ uresp.size_log2 = qhp->wq.size_log2;
+ uresp.sq_size_log2 = qhp->wq.sq_size_log2;
+ uresp.rq_size_log2 = qhp->wq.rq_size_log2;
+ spin_lock(&ucontext->mmap_lock);
+ uresp.key = ucontext->key;
+ ucontext->key += PAGE_SIZE;
+ uresp.db_key = ucontext->key;
+ ucontext->key += PAGE_SIZE;
+ spin_unlock(&ucontext->mmap_lock);
+ if (ib_copy_to_udata(udata, &uresp, sizeof (uresp))) {
+ kfree(mm1);
+ kfree(mm2);
+ iwch_destroy_qp(&qhp->ibqp);
+ return ERR_PTR(-EFAULT);
+ }
+ mm1->key = uresp.key;
+ mm1->addr = virt_to_phys(qhp->wq.queue);
+ mm1->len = PAGE_ALIGN(wqsize * sizeof (union t3_wr));
+ insert_mmap(ucontext, mm1);
+ mm2->key = uresp.db_key;
+ mm2->addr = qhp->wq.udb & PAGE_MASK;
+ mm2->len = PAGE_SIZE;
+ insert_mmap(ucontext, mm2);
+ }
+ qhp->ibqp.qp_num = qhp->wq.qpid;
+ init_timer(&(qhp->timer));
+ PDBG("%s sq_num_entries %d, rq_num_entries %d "
+ "qpid 0x%0x qhp %p dma_addr 0x%llx size %d rq_addr 0x%x\n",
+ __func__, qhp->attr.sq_num_entries, qhp->attr.rq_num_entries,
+ qhp->wq.qpid, qhp, (unsigned long long) qhp->wq.dma_addr,
+ 1 << qhp->wq.size_log2, qhp->wq.rq_addr);
+ return &qhp->ibqp;
+}
+
+static int iwch_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ struct iwch_dev *rhp;
+ struct iwch_qp *qhp;
+ enum iwch_qp_attr_mask mask = 0;
+ struct iwch_qp_attributes attrs;
+
+ PDBG("%s ib_qp %p\n", __func__, ibqp);
+
+ /* iWARP does not support the RTR state */
+ if ((attr_mask & IB_QP_STATE) && (attr->qp_state == IB_QPS_RTR))
+ attr_mask &= ~IB_QP_STATE;
+
+ /* Make sure we still have something left to do */
+ if (!attr_mask)
+ return 0;
+
+ memset(&attrs, 0, sizeof attrs);
+ qhp = to_iwch_qp(ibqp);
+ rhp = qhp->rhp;
+
+ attrs.next_state = iwch_convert_state(attr->qp_state);
+ attrs.enable_rdma_read = (attr->qp_access_flags &
+ IB_ACCESS_REMOTE_READ) ? 1 : 0;
+ attrs.enable_rdma_write = (attr->qp_access_flags &
+ IB_ACCESS_REMOTE_WRITE) ? 1 : 0;
+ attrs.enable_bind = (attr->qp_access_flags & IB_ACCESS_MW_BIND) ? 1 : 0;
+
+
+ mask |= (attr_mask & IB_QP_STATE) ? IWCH_QP_ATTR_NEXT_STATE : 0;
+ mask |= (attr_mask & IB_QP_ACCESS_FLAGS) ?
+ (IWCH_QP_ATTR_ENABLE_RDMA_READ |
+ IWCH_QP_ATTR_ENABLE_RDMA_WRITE |
+ IWCH_QP_ATTR_ENABLE_RDMA_BIND) : 0;
+
+ return iwch_modify_qp(rhp, qhp, mask, &attrs, 0);
+}
+
+void iwch_qp_add_ref(struct ib_qp *qp)
+{
+ PDBG("%s ib_qp %p\n", __func__, qp);
+ atomic_inc(&(to_iwch_qp(qp)->refcnt));
+}
+
+void iwch_qp_rem_ref(struct ib_qp *qp)
+{
+ PDBG("%s ib_qp %p\n", __func__, qp);
+ if (atomic_dec_and_test(&(to_iwch_qp(qp)->refcnt)))
+ wake_up(&(to_iwch_qp(qp)->wait));
+}
+
+static struct ib_qp *iwch_get_qp(struct ib_device *dev, int qpn)
+{
+ PDBG("%s ib_dev %p qpn 0x%x\n", __func__, dev, qpn);
+ return (struct ib_qp *)get_qhp(to_iwch_dev(dev), qpn);
+}
+
+
+static int iwch_query_pkey(struct ib_device *ibdev,
+ u8 port, u16 index, u16 * pkey)
+{
+ PDBG("%s ibdev %p\n", __func__, ibdev);
+ *pkey = 0;
+ return 0;
+}
+
+static int iwch_query_gid(struct ib_device *ibdev, u8 port,
+ int index, union ib_gid *gid)
+{
+ struct iwch_dev *dev;
+
+ PDBG("%s ibdev %p, port %d, index %d, gid %p\n",
+ __func__, ibdev, port, index, gid);
+ dev = to_iwch_dev(ibdev);
+ BUG_ON(port == 0 || port > 2);
+ memset(&(gid->raw[0]), 0, sizeof(gid->raw));
+ memcpy(&(gid->raw[0]), dev->rdev.port_info.lldevs[port-1]->dev_addr, 6);
+ return 0;
+}
+
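+/*
+ * Skip the leading character of the ethtool fw_version string, parse the
+ * dot-separated major.minor.micro numbers, and pack them into a single
+ * u64 for ib_device_attr.fw_ver.
+ */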
+static u64 fw_vers_string_to_u64(struct iwch_dev *iwch_dev)
+{
+ struct ethtool_drvinfo info;
+ struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev;
+ char *cp, *next;
+ unsigned fw_maj, fw_min, fw_mic;
+
+ lldev->ethtool_ops->get_drvinfo(lldev, &info);
+
+ next = info.fw_version + 1;
+ cp = strsep(&next, ".");
+ sscanf(cp, "%i", &fw_maj);
+ cp = strsep(&next, ".");
+ sscanf(cp, "%i", &fw_min);
+ cp = strsep(&next, ".");
+ sscanf(cp, "%i", &fw_mic);
+
+ return (((u64)fw_maj & 0xffff) << 32) | ((fw_min & 0xffff) << 16) |
+ (fw_mic & 0xffff);
+}
+
+static int iwch_query_device(struct ib_device *ibdev,
+ struct ib_device_attr *props)
+{
+
+ struct iwch_dev *dev;
+ PDBG("%s ibdev %p\n", __func__, ibdev);
+
+ dev = to_iwch_dev(ibdev);
+ memset(props, 0, sizeof *props);
+ memcpy(&props->sys_image_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6);
+ props->hw_ver = dev->rdev.t3cdev_p->type;
+ props->fw_ver = fw_vers_string_to_u64(dev);
+ props->device_cap_flags = dev->device_cap_flags;
+ props->page_size_cap = dev->attr.mem_pgsizes_bitmask;
+ props->vendor_id = (u32)dev->rdev.rnic_info.pdev->vendor;
+ props->vendor_part_id = (u32)dev->rdev.rnic_info.pdev->device;
+ props->max_mr_size = dev->attr.max_mr_size;
+ props->max_qp = dev->attr.max_qps;
+ props->max_qp_wr = dev->attr.max_wrs;
+ props->max_sge = dev->attr.max_sge_per_wr;
+ props->max_sge_rd = 1;
+ props->max_qp_rd_atom = dev->attr.max_rdma_reads_per_qp;
+ props->max_qp_init_rd_atom = dev->attr.max_rdma_reads_per_qp;
+ props->max_cq = dev->attr.max_cqs;
+ props->max_cqe = dev->attr.max_cqes_per_cq;
+ props->max_mr = dev->attr.max_mem_regs;
+ props->max_pd = dev->attr.max_pds;
+ props->local_ca_ack_delay = 0;
+ props->max_fast_reg_page_list_len = T3_MAX_FASTREG_DEPTH;
+
+ return 0;
+}
+
+static int iwch_query_port(struct ib_device *ibdev,
+ u8 port, struct ib_port_attr *props)
+{
+ struct iwch_dev *dev;
+ struct net_device *netdev;
+ struct in_device *inetdev;
+
+ PDBG("%s ibdev %p\n", __func__, ibdev);
+
+ dev = to_iwch_dev(ibdev);
+ netdev = dev->rdev.port_info.lldevs[port-1];
+
+ memset(props, 0, sizeof(struct ib_port_attr));
+ props->max_mtu = IB_MTU_4096;
+ if (netdev->mtu >= 4096)
+ props->active_mtu = IB_MTU_4096;
+ else if (netdev->mtu >= 2048)
+ props->active_mtu = IB_MTU_2048;
+ else if (netdev->mtu >= 1024)
+ props->active_mtu = IB_MTU_1024;
+ else if (netdev->mtu >= 512)
+ props->active_mtu = IB_MTU_512;
+ else
+ props->active_mtu = IB_MTU_256;
+
+ if (!netif_carrier_ok(netdev))
+ props->state = IB_PORT_DOWN;
+ else {
+ inetdev = in_dev_get(netdev);
+ if (inetdev) {
+ if (inetdev->ifa_list)
+ props->state = IB_PORT_ACTIVE;
+ else
+ props->state = IB_PORT_INIT;
+ in_dev_put(inetdev);
+ } else
+ props->state = IB_PORT_INIT;
+ }
+
+ props->port_cap_flags =
+ IB_PORT_CM_SUP |
+ IB_PORT_SNMP_TUNNEL_SUP |
+ IB_PORT_REINIT_SUP |
+ IB_PORT_DEVICE_MGMT_SUP |
+ IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP;
+ props->gid_tbl_len = 1;
+ props->pkey_tbl_len = 1;
+ props->active_width = 2;
+ props->active_speed = IB_SPEED_DDR;
+ props->max_msg_sz = -1;
+
+ return 0;
+}
+
+static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev,
+ ibdev.dev);
+ PDBG("%s dev 0x%p\n", __func__, dev);
+ return sprintf(buf, "%d\n", iwch_dev->rdev.t3cdev_p->type);
+}
+
+static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr, char *buf)
+{
+ struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev,
+ ibdev.dev);
+ struct ethtool_drvinfo info;
+ struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev;
+
+ PDBG("%s dev 0x%p\n", __func__, dev);
+ lldev->ethtool_ops->get_drvinfo(lldev, &info);
+ return sprintf(buf, "%s\n", info.fw_version);
+}
+
+static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev,
+ ibdev.dev);
+ struct ethtool_drvinfo info;
+ struct net_device *lldev = iwch_dev->rdev.t3cdev_p->lldev;
+
+ PDBG("%s dev 0x%p\n", __func__, dev);
+ lldev->ethtool_ops->get_drvinfo(lldev, &info);
+ return sprintf(buf, "%s\n", info.driver);
+}
+
+static ssize_t show_board(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct iwch_dev *iwch_dev = container_of(dev, struct iwch_dev,
+ ibdev.dev);
+ PDBG("%s dev 0x%p\n", __func__, dev);
+ return sprintf(buf, "%x.%x\n", iwch_dev->rdev.rnic_info.pdev->vendor,
+ iwch_dev->rdev.rnic_info.pdev->device);
+}
+
+static int iwch_get_mib(struct ib_device *ibdev,
+ union rdma_protocol_stats *stats)
+{
+ struct iwch_dev *dev;
+ struct tp_mib_stats m;
+ int ret;
+
+ PDBG("%s ibdev %p\n", __func__, ibdev);
+ dev = to_iwch_dev(ibdev);
+ ret = dev->rdev.t3cdev_p->ctl(dev->rdev.t3cdev_p, RDMA_GET_MIB, &m);
+ if (ret)
+ return -ENOSYS;
+
+ memset(stats, 0, sizeof *stats);
+ stats->iw.ipInReceives = ((u64) m.ipInReceive_hi << 32) +
+ m.ipInReceive_lo;
+ stats->iw.ipInHdrErrors = ((u64) m.ipInHdrErrors_hi << 32) +
+ m.ipInHdrErrors_lo;
+ stats->iw.ipInAddrErrors = ((u64) m.ipInAddrErrors_hi << 32) +
+ m.ipInAddrErrors_lo;
+ stats->iw.ipInUnknownProtos = ((u64) m.ipInUnknownProtos_hi << 32) +
+ m.ipInUnknownProtos_lo;
+ stats->iw.ipInDiscards = ((u64) m.ipInDiscards_hi << 32) +
+ m.ipInDiscards_lo;
+ stats->iw.ipInDelivers = ((u64) m.ipInDelivers_hi << 32) +
+ m.ipInDelivers_lo;
+ stats->iw.ipOutRequests = ((u64) m.ipOutRequests_hi << 32) +
+ m.ipOutRequests_lo;
+ stats->iw.ipOutDiscards = ((u64) m.ipOutDiscards_hi << 32) +
+ m.ipOutDiscards_lo;
+ stats->iw.ipOutNoRoutes = ((u64) m.ipOutNoRoutes_hi << 32) +
+ m.ipOutNoRoutes_lo;
+ stats->iw.ipReasmTimeout = (u64) m.ipReasmTimeout;
+ stats->iw.ipReasmReqds = (u64) m.ipReasmReqds;
+ stats->iw.ipReasmOKs = (u64) m.ipReasmOKs;
+ stats->iw.ipReasmFails = (u64) m.ipReasmFails;
+ stats->iw.tcpActiveOpens = (u64) m.tcpActiveOpens;
+ stats->iw.tcpPassiveOpens = (u64) m.tcpPassiveOpens;
+ stats->iw.tcpAttemptFails = (u64) m.tcpAttemptFails;
+ stats->iw.tcpEstabResets = (u64) m.tcpEstabResets;
+ stats->iw.tcpOutRsts = (u64) m.tcpOutRsts;
+ stats->iw.tcpCurrEstab = (u64) m.tcpCurrEstab;
+ stats->iw.tcpInSegs = ((u64) m.tcpInSegs_hi << 32) +
+ m.tcpInSegs_lo;
+ stats->iw.tcpOutSegs = ((u64) m.tcpOutSegs_hi << 32) +
+ m.tcpOutSegs_lo;
+ stats->iw.tcpRetransSegs = ((u64) m.tcpRetransSeg_hi << 32) +
+ m.tcpRetransSeg_lo;
+ stats->iw.tcpInErrs = ((u64) m.tcpInErrs_hi << 32) +
+ m.tcpInErrs_lo;
+ stats->iw.tcpRtoMin = (u64) m.tcpRtoMin;
+ stats->iw.tcpRtoMax = (u64) m.tcpRtoMax;
+ return 0;
+}
+
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
+
+static struct device_attribute *iwch_class_attributes[] = {
+ &dev_attr_hw_rev,
+ &dev_attr_fw_ver,
+ &dev_attr_hca_type,
+ &dev_attr_board_id,
+};
+
+int iwch_register_device(struct iwch_dev *dev)
+{
+ int ret;
+ int i;
+
+ PDBG("%s iwch_dev %p\n", __func__, dev);
+ strlcpy(dev->ibdev.name, "cxgb3_%d", IB_DEVICE_NAME_MAX);
+ memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
+ memcpy(&dev->ibdev.node_guid, dev->rdev.t3cdev_p->lldev->dev_addr, 6);
+ dev->ibdev.owner = THIS_MODULE;
+ dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY |
+ IB_DEVICE_MEM_WINDOW |
+ IB_DEVICE_MEM_MGT_EXTENSIONS;
+
+ /* cxgb3 supports STag 0. */
+ dev->ibdev.local_dma_lkey = 0;
+
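+	/* Userspace verbs commands supported by this device. */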
+ dev->ibdev.uverbs_cmd_mask =
+ (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+ (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_REG_MR) |
+ (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+ (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+ (1ull << IB_USER_VERBS_CMD_POST_SEND) |
+ (1ull << IB_USER_VERBS_CMD_POST_RECV);
+ dev->ibdev.node_type = RDMA_NODE_RNIC;
+ memcpy(dev->ibdev.node_desc, IWCH_NODE_DESC, sizeof(IWCH_NODE_DESC));
+ dev->ibdev.phys_port_cnt = dev->rdev.port_info.nports;
+ dev->ibdev.num_comp_vectors = 1;
+ dev->ibdev.dma_device = &(dev->rdev.rnic_info.pdev->dev);
+ dev->ibdev.query_device = iwch_query_device;
+ dev->ibdev.query_port = iwch_query_port;
+ dev->ibdev.query_pkey = iwch_query_pkey;
+ dev->ibdev.query_gid = iwch_query_gid;
+ dev->ibdev.alloc_ucontext = iwch_alloc_ucontext;
+ dev->ibdev.dealloc_ucontext = iwch_dealloc_ucontext;
+ dev->ibdev.mmap = iwch_mmap;
+ dev->ibdev.alloc_pd = iwch_allocate_pd;
+ dev->ibdev.dealloc_pd = iwch_deallocate_pd;
+ dev->ibdev.create_ah = iwch_ah_create;
+ dev->ibdev.destroy_ah = iwch_ah_destroy;
+ dev->ibdev.create_qp = iwch_create_qp;
+ dev->ibdev.modify_qp = iwch_ib_modify_qp;
+ dev->ibdev.destroy_qp = iwch_destroy_qp;
+ dev->ibdev.create_cq = iwch_create_cq;
+ dev->ibdev.destroy_cq = iwch_destroy_cq;
+ dev->ibdev.resize_cq = iwch_resize_cq;
+ dev->ibdev.poll_cq = iwch_poll_cq;
+ dev->ibdev.get_dma_mr = iwch_get_dma_mr;
+ dev->ibdev.reg_phys_mr = iwch_register_phys_mem;
+ dev->ibdev.rereg_phys_mr = iwch_reregister_phys_mem;
+ dev->ibdev.reg_user_mr = iwch_reg_user_mr;
+ dev->ibdev.dereg_mr = iwch_dereg_mr;
+ dev->ibdev.alloc_mw = iwch_alloc_mw;
+ dev->ibdev.bind_mw = iwch_bind_mw;
+ dev->ibdev.dealloc_mw = iwch_dealloc_mw;
+ dev->ibdev.alloc_fast_reg_mr = iwch_alloc_fast_reg_mr;
+ dev->ibdev.alloc_fast_reg_page_list = iwch_alloc_fastreg_pbl;
+ dev->ibdev.free_fast_reg_page_list = iwch_free_fastreg_pbl;
+ dev->ibdev.attach_mcast = iwch_multicast_attach;
+ dev->ibdev.detach_mcast = iwch_multicast_detach;
+ dev->ibdev.process_mad = iwch_process_mad;
+ dev->ibdev.req_notify_cq = iwch_arm_cq;
+ dev->ibdev.post_send = iwch_post_send;
+ dev->ibdev.post_recv = iwch_post_receive;
+ dev->ibdev.get_protocol_stats = iwch_get_mib;
+ dev->ibdev.uverbs_abi_ver = IWCH_UVERBS_ABI_VERSION;
+
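+	/* Allocate and fill in the iWARP connection manager (iw_cm) callback table. */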
+ dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL);
+ if (!dev->ibdev.iwcm)
+ return -ENOMEM;
+
+ dev->ibdev.iwcm->connect = iwch_connect;
+ dev->ibdev.iwcm->accept = iwch_accept_cr;
+ dev->ibdev.iwcm->reject = iwch_reject_cr;
+ dev->ibdev.iwcm->create_listen = iwch_create_listen;
+ dev->ibdev.iwcm->destroy_listen = iwch_destroy_listen;
+ dev->ibdev.iwcm->add_ref = iwch_qp_add_ref;
+ dev->ibdev.iwcm->rem_ref = iwch_qp_rem_ref;
+ dev->ibdev.iwcm->get_qp = iwch_get_qp;
+
+ ret = ib_register_device(&dev->ibdev, NULL);
+ if (ret)
+ goto bail1;
+
+ for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i) {
+ ret = device_create_file(&dev->ibdev.dev,
+ iwch_class_attributes[i]);
+		if (ret)
+			goto bail2;
+ }
+ return 0;
+bail2:
+ ib_unregister_device(&dev->ibdev);
+bail1:
+ kfree(dev->ibdev.iwcm);
+ return ret;
+}
+
+void iwch_unregister_device(struct iwch_dev *dev)
+{
+ int i;
+
+ PDBG("%s iwch_dev %p\n", __func__, dev);
+ for (i = 0; i < ARRAY_SIZE(iwch_class_attributes); ++i)
+ device_remove_file(&dev->ibdev.dev,
+ iwch_class_attributes[i]);
+ ib_unregister_device(&dev->ibdev);
+ kfree(dev->ibdev.iwcm);
+ return;
+}
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.h b/drivers/infiniband/hw/cxgb3/iwch_provider.h
new file mode 100644
index 000000000..87c14b0c5
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.h
@@ -0,0 +1,360 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __IWCH_PROVIDER_H__
+#define __IWCH_PROVIDER_H__
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <rdma/ib_verbs.h>
+#include <asm/types.h>
+#include "t3cdev.h"
+#include "iwch.h"
+#include "cxio_wr.h"
+#include "cxio_hal.h"
+
+struct iwch_pd {
+ struct ib_pd ibpd;
+ u32 pdid;
+ struct iwch_dev *rhp;
+};
+
+static inline struct iwch_pd *to_iwch_pd(struct ib_pd *ibpd)
+{
+ return container_of(ibpd, struct iwch_pd, ibpd);
+}
+
+struct tpt_attributes {
+ u32 stag;
+ u32 state:1;
+ u32 type:2;
+ u32 rsvd:1;
+ enum tpt_mem_perm perms;
+ u32 remote_invaliate_disable:1;
+ u32 zbva:1;
+ u32 mw_bind_enable:1;
+ u32 page_size:5;
+
+ u32 pdid;
+ u32 qpid;
+ u32 pbl_addr;
+ u32 len;
+ u64 va_fbo;
+ u32 pbl_size;
+};
+
+struct iwch_mr {
+ struct ib_mr ibmr;
+ struct ib_umem *umem;
+ struct iwch_dev *rhp;
+ u64 kva;
+ struct tpt_attributes attr;
+};
+
+typedef struct iwch_mw iwch_mw_handle;
+
+static inline struct iwch_mr *to_iwch_mr(struct ib_mr *ibmr)
+{
+ return container_of(ibmr, struct iwch_mr, ibmr);
+}
+
+struct iwch_mw {
+ struct ib_mw ibmw;
+ struct iwch_dev *rhp;
+ u64 kva;
+ struct tpt_attributes attr;
+};
+
+static inline struct iwch_mw *to_iwch_mw(struct ib_mw *ibmw)
+{
+ return container_of(ibmw, struct iwch_mw, ibmw);
+}
+
+struct iwch_cq {
+ struct ib_cq ibcq;
+ struct iwch_dev *rhp;
+ struct t3_cq cq;
+ spinlock_t lock;
+ spinlock_t comp_handler_lock;
+ atomic_t refcnt;
+ wait_queue_head_t wait;
+ u32 __user *user_rptr_addr;
+};
+
+static inline struct iwch_cq *to_iwch_cq(struct ib_cq *ibcq)
+{
+ return container_of(ibcq, struct iwch_cq, ibcq);
+}
+
+enum IWCH_QP_FLAGS {
+ QP_QUIESCED = 0x01
+};
+
+struct iwch_mpa_attributes {
+ u8 initiator;
+ u8 recv_marker_enabled;
+	u8 xmit_marker_enabled;	/* enable MPA markers on transmit */
+ u8 crc_enabled;
+ u8 version; /* 0 or 1 */
+};
+
+struct iwch_qp_attributes {
+ u32 scq;
+ u32 rcq;
+ u32 sq_num_entries;
+ u32 rq_num_entries;
+ u32 sq_max_sges;
+ u32 sq_max_sges_rdma_write;
+ u32 rq_max_sges;
+ u32 state;
+ u8 enable_rdma_read;
+ u8 enable_rdma_write; /* enable inbound Read Resp. */
+ u8 enable_bind;
+ u8 enable_mmid0_fastreg; /* Enable STAG0 + Fast-register */
+ /*
+	 * Next QP state.  If the current state is specified, only the
+	 * QP attributes are modified.
+ */
+ u32 max_ord;
+ u32 max_ird;
+ u32 pd; /* IN */
+ u32 next_state;
+ char terminate_buffer[52];
+ u32 terminate_msg_len;
+ u8 is_terminate_local;
+ struct iwch_mpa_attributes mpa_attr; /* IN-OUT */
+ struct iwch_ep *llp_stream_handle;
+ char *stream_msg_buf; /* Last stream msg. before Idle -> RTS */
+ u32 stream_msg_buf_len; /* Only on Idle -> RTS */
+};
+
+struct iwch_qp {
+ struct ib_qp ibqp;
+ struct iwch_dev *rhp;
+ struct iwch_ep *ep;
+ struct iwch_qp_attributes attr;
+ struct t3_wq wq;
+ spinlock_t lock;
+ atomic_t refcnt;
+ wait_queue_head_t wait;
+ enum IWCH_QP_FLAGS flags;
+ struct timer_list timer;
+};
+
+static inline int qp_quiesced(struct iwch_qp *qhp)
+{
+ return qhp->flags & QP_QUIESCED;
+}
+
+static inline struct iwch_qp *to_iwch_qp(struct ib_qp *ibqp)
+{
+ return container_of(ibqp, struct iwch_qp, ibqp);
+}
+
+void iwch_qp_add_ref(struct ib_qp *qp);
+void iwch_qp_rem_ref(struct ib_qp *qp);
+
+struct iwch_ucontext {
+ struct ib_ucontext ibucontext;
+ struct cxio_ucontext uctx;
+ u32 key;
+ spinlock_t mmap_lock;
+ struct list_head mmaps;
+};
+
+static inline struct iwch_ucontext *to_iwch_ucontext(struct ib_ucontext *c)
+{
+ return container_of(c, struct iwch_ucontext, ibucontext);
+}
+
+struct iwch_mm_entry {
+ struct list_head entry;
+ u64 addr;
+ u32 key;
+ unsigned len;
+};
+
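+/*
+ * Mmap bookkeeping: each entry pairs the (key, len) cookie handed to
+ * userspace with the address to be mapped.  remove_mmap() finds and
+ * detaches a matching entry; insert_mmap() queues a new one.
+ */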
+static inline struct iwch_mm_entry *remove_mmap(struct iwch_ucontext *ucontext,
+ u32 key, unsigned len)
+{
+ struct list_head *pos, *nxt;
+ struct iwch_mm_entry *mm;
+
+ spin_lock(&ucontext->mmap_lock);
+ list_for_each_safe(pos, nxt, &ucontext->mmaps) {
+
+ mm = list_entry(pos, struct iwch_mm_entry, entry);
+ if (mm->key == key && mm->len == len) {
+ list_del_init(&mm->entry);
+ spin_unlock(&ucontext->mmap_lock);
+ PDBG("%s key 0x%x addr 0x%llx len %d\n", __func__,
+ key, (unsigned long long) mm->addr, mm->len);
+ return mm;
+ }
+ }
+ spin_unlock(&ucontext->mmap_lock);
+ return NULL;
+}
+
+static inline void insert_mmap(struct iwch_ucontext *ucontext,
+ struct iwch_mm_entry *mm)
+{
+ spin_lock(&ucontext->mmap_lock);
+ PDBG("%s key 0x%x addr 0x%llx len %d\n", __func__,
+ mm->key, (unsigned long long) mm->addr, mm->len);
+ list_add_tail(&mm->entry, &ucontext->mmaps);
+ spin_unlock(&ucontext->mmap_lock);
+}
+
+enum iwch_qp_attr_mask {
+ IWCH_QP_ATTR_NEXT_STATE = 1 << 0,
+ IWCH_QP_ATTR_ENABLE_RDMA_READ = 1 << 7,
+ IWCH_QP_ATTR_ENABLE_RDMA_WRITE = 1 << 8,
+ IWCH_QP_ATTR_ENABLE_RDMA_BIND = 1 << 9,
+ IWCH_QP_ATTR_MAX_ORD = 1 << 11,
+ IWCH_QP_ATTR_MAX_IRD = 1 << 12,
+ IWCH_QP_ATTR_LLP_STREAM_HANDLE = 1 << 22,
+ IWCH_QP_ATTR_STREAM_MSG_BUFFER = 1 << 23,
+ IWCH_QP_ATTR_MPA_ATTR = 1 << 24,
+ IWCH_QP_ATTR_QP_CONTEXT_ACTIVATE = 1 << 25,
+ IWCH_QP_ATTR_VALID_MODIFY = (IWCH_QP_ATTR_ENABLE_RDMA_READ |
+ IWCH_QP_ATTR_ENABLE_RDMA_WRITE |
+ IWCH_QP_ATTR_MAX_ORD |
+ IWCH_QP_ATTR_MAX_IRD |
+ IWCH_QP_ATTR_LLP_STREAM_HANDLE |
+ IWCH_QP_ATTR_STREAM_MSG_BUFFER |
+ IWCH_QP_ATTR_MPA_ATTR |
+ IWCH_QP_ATTR_QP_CONTEXT_ACTIVATE)
+};
+
+int iwch_modify_qp(struct iwch_dev *rhp,
+ struct iwch_qp *qhp,
+ enum iwch_qp_attr_mask mask,
+ struct iwch_qp_attributes *attrs,
+ int internal);
+
+enum iwch_qp_state {
+ IWCH_QP_STATE_IDLE,
+ IWCH_QP_STATE_RTS,
+ IWCH_QP_STATE_ERROR,
+ IWCH_QP_STATE_TERMINATE,
+ IWCH_QP_STATE_CLOSING,
+ IWCH_QP_STATE_TOT
+};
+
+static inline int iwch_convert_state(enum ib_qp_state ib_state)
+{
+ switch (ib_state) {
+ case IB_QPS_RESET:
+ case IB_QPS_INIT:
+ return IWCH_QP_STATE_IDLE;
+ case IB_QPS_RTS:
+ return IWCH_QP_STATE_RTS;
+ case IB_QPS_SQD:
+ return IWCH_QP_STATE_CLOSING;
+ case IB_QPS_SQE:
+ return IWCH_QP_STATE_TERMINATE;
+ case IB_QPS_ERR:
+ return IWCH_QP_STATE_ERROR;
+ default:
+ return -1;
+ }
+}
+
+static inline u32 iwch_ib_to_tpt_access(int acc)
+{
+ return (acc & IB_ACCESS_REMOTE_WRITE ? TPT_REMOTE_WRITE : 0) |
+ (acc & IB_ACCESS_REMOTE_READ ? TPT_REMOTE_READ : 0) |
+ (acc & IB_ACCESS_LOCAL_WRITE ? TPT_LOCAL_WRITE : 0) |
+ (acc & IB_ACCESS_MW_BIND ? TPT_MW_BIND : 0) |
+ TPT_LOCAL_READ;
+}
+
+static inline u32 iwch_ib_to_tpt_bind_access(int acc)
+{
+ return (acc & IB_ACCESS_REMOTE_WRITE ? TPT_REMOTE_WRITE : 0) |
+ (acc & IB_ACCESS_REMOTE_READ ? TPT_REMOTE_READ : 0);
+}
+
+enum iwch_mmid_state {
+ IWCH_STAG_STATE_VALID,
+ IWCH_STAG_STATE_INVALID
+};
+
+enum iwch_qp_query_flags {
+ IWCH_QP_QUERY_CONTEXT_NONE = 0x0, /* No ctx; Only attrs */
+ IWCH_QP_QUERY_CONTEXT_GET = 0x1, /* Get ctx + attrs */
+ IWCH_QP_QUERY_CONTEXT_SUSPEND = 0x2, /* Not Supported */
+
+ /*
+ * Quiesce QP context; Consumer
+ * will NOT replay outstanding WR
+ */
+ IWCH_QP_QUERY_CONTEXT_QUIESCE = 0x4,
+ IWCH_QP_QUERY_CONTEXT_REMOVE = 0x8,
+ IWCH_QP_QUERY_TEST_USERWRITE = 0x32 /* Test special */
+};
+
+u16 iwch_rqes_posted(struct iwch_qp *qhp);
+int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr);
+int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr);
+int iwch_bind_mw(struct ib_qp *qp,
+ struct ib_mw *mw,
+ struct ib_mw_bind *mw_bind);
+int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
+int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg);
+int iwch_post_zb_read(struct iwch_ep *ep);
+int iwch_register_device(struct iwch_dev *dev);
+void iwch_unregister_device(struct iwch_dev *dev);
+void stop_read_rep_timer(struct iwch_qp *qhp);
+int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php,
+ struct iwch_mr *mhp, int shift);
+int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php,
+ struct iwch_mr *mhp,
+ int shift,
+ int npages);
+int iwch_alloc_pbl(struct iwch_mr *mhp, int npages);
+void iwch_free_pbl(struct iwch_mr *mhp);
+int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset);
+int build_phys_page_list(struct ib_phys_buf *buffer_list,
+ int num_phys_buf,
+ u64 *iova_start,
+ u64 *total_size,
+ int *npages,
+ int *shift,
+ __be64 **page_list);
+
+
+#define IWCH_NODE_DESC "cxgb3 Chelsio Communications"
+
+#endif
diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c
new file mode 100644
index 000000000..b57c0befd
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c
@@ -0,0 +1,1163 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include "iwch_provider.h"
+#include "iwch.h"
+#include "iwch_cm.h"
+#include "cxio_hal.h"
+#include "cxio_resource.h"
+
+#define NO_SUPPORT -1
+
+static int build_rdma_send(union t3_wr *wqe, struct ib_send_wr *wr,
+ u8 * flit_cnt)
+{
+ int i;
+ u32 plen;
+
+ switch (wr->opcode) {
+ case IB_WR_SEND:
+ if (wr->send_flags & IB_SEND_SOLICITED)
+ wqe->send.rdmaop = T3_SEND_WITH_SE;
+ else
+ wqe->send.rdmaop = T3_SEND;
+ wqe->send.rem_stag = 0;
+ break;
+ case IB_WR_SEND_WITH_INV:
+ if (wr->send_flags & IB_SEND_SOLICITED)
+ wqe->send.rdmaop = T3_SEND_WITH_SE_INV;
+ else
+ wqe->send.rdmaop = T3_SEND_WITH_INV;
+ wqe->send.rem_stag = cpu_to_be32(wr->ex.invalidate_rkey);
+ break;
+ default:
+ return -EINVAL;
+ }
+ if (wr->num_sge > T3_MAX_SGE)
+ return -EINVAL;
+ wqe->send.reserved[0] = 0;
+ wqe->send.reserved[1] = 0;
+ wqe->send.reserved[2] = 0;
+ plen = 0;
+ for (i = 0; i < wr->num_sge; i++) {
+ if ((plen + wr->sg_list[i].length) < plen)
+ return -EMSGSIZE;
+
+ plen += wr->sg_list[i].length;
+ wqe->send.sgl[i].stag = cpu_to_be32(wr->sg_list[i].lkey);
+ wqe->send.sgl[i].len = cpu_to_be32(wr->sg_list[i].length);
+ wqe->send.sgl[i].to = cpu_to_be64(wr->sg_list[i].addr);
+ }
+ wqe->send.num_sgle = cpu_to_be32(wr->num_sge);
+ *flit_cnt = 4 + ((wr->num_sge) << 1);
+ wqe->send.plen = cpu_to_be32(plen);
+ return 0;
+}
+
+static int build_rdma_write(union t3_wr *wqe, struct ib_send_wr *wr,
+ u8 *flit_cnt)
+{
+ int i;
+ u32 plen;
+ if (wr->num_sge > T3_MAX_SGE)
+ return -EINVAL;
+ wqe->write.rdmaop = T3_RDMA_WRITE;
+ wqe->write.reserved[0] = 0;
+ wqe->write.reserved[1] = 0;
+ wqe->write.reserved[2] = 0;
+ wqe->write.stag_sink = cpu_to_be32(wr->wr.rdma.rkey);
+ wqe->write.to_sink = cpu_to_be64(wr->wr.rdma.remote_addr);
+
+ if (wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) {
+ plen = 4;
+ wqe->write.sgl[0].stag = wr->ex.imm_data;
+ wqe->write.sgl[0].len = cpu_to_be32(0);
+ wqe->write.num_sgle = cpu_to_be32(0);
+ *flit_cnt = 6;
+ } else {
+ plen = 0;
+ for (i = 0; i < wr->num_sge; i++) {
+ if ((plen + wr->sg_list[i].length) < plen) {
+ return -EMSGSIZE;
+ }
+ plen += wr->sg_list[i].length;
+ wqe->write.sgl[i].stag =
+ cpu_to_be32(wr->sg_list[i].lkey);
+ wqe->write.sgl[i].len =
+ cpu_to_be32(wr->sg_list[i].length);
+ wqe->write.sgl[i].to =
+ cpu_to_be64(wr->sg_list[i].addr);
+ }
+ wqe->write.num_sgle = cpu_to_be32(wr->num_sge);
+ *flit_cnt = 5 + ((wr->num_sge) << 1);
+ }
+ wqe->write.plen = cpu_to_be32(plen);
+ return 0;
+}
+
+static int build_rdma_read(union t3_wr *wqe, struct ib_send_wr *wr,
+ u8 *flit_cnt)
+{
+ if (wr->num_sge > 1)
+ return -EINVAL;
+ wqe->read.rdmaop = T3_READ_REQ;
+ if (wr->opcode == IB_WR_RDMA_READ_WITH_INV)
+ wqe->read.local_inv = 1;
+ else
+ wqe->read.local_inv = 0;
+ wqe->read.reserved[0] = 0;
+ wqe->read.reserved[1] = 0;
+ wqe->read.rem_stag = cpu_to_be32(wr->wr.rdma.rkey);
+ wqe->read.rem_to = cpu_to_be64(wr->wr.rdma.remote_addr);
+ wqe->read.local_stag = cpu_to_be32(wr->sg_list[0].lkey);
+ wqe->read.local_len = cpu_to_be32(wr->sg_list[0].length);
+ wqe->read.local_to = cpu_to_be64(wr->sg_list[0].addr);
+ *flit_cnt = sizeof(struct t3_rdma_read_wr) >> 3;
+ return 0;
+}
+
+static int build_fastreg(union t3_wr *wqe, struct ib_send_wr *wr,
+ u8 *flit_cnt, int *wr_cnt, struct t3_wq *wq)
+{
+ int i;
+ __be64 *p;
+
+ if (wr->wr.fast_reg.page_list_len > T3_MAX_FASTREG_DEPTH)
+ return -EINVAL;
+ *wr_cnt = 1;
+ wqe->fastreg.stag = cpu_to_be32(wr->wr.fast_reg.rkey);
+ wqe->fastreg.len = cpu_to_be32(wr->wr.fast_reg.length);
+ wqe->fastreg.va_base_hi = cpu_to_be32(wr->wr.fast_reg.iova_start >> 32);
+ wqe->fastreg.va_base_lo_fbo =
+ cpu_to_be32(wr->wr.fast_reg.iova_start & 0xffffffff);
+ wqe->fastreg.page_type_perms = cpu_to_be32(
+ V_FR_PAGE_COUNT(wr->wr.fast_reg.page_list_len) |
+ V_FR_PAGE_SIZE(wr->wr.fast_reg.page_shift-12) |
+ V_FR_TYPE(TPT_VATO) |
+ V_FR_PERMS(iwch_ib_to_tpt_access(wr->wr.fast_reg.access_flags)));
+ p = &wqe->fastreg.pbl_addrs[0];
+ for (i = 0; i < wr->wr.fast_reg.page_list_len; i++, p++) {
+
+ /* If we need a 2nd WR, then set it up */
+ if (i == T3_MAX_FASTREG_FRAG) {
+ *wr_cnt = 2;
+ wqe = (union t3_wr *)(wq->queue +
+ Q_PTR2IDX((wq->wptr+1), wq->size_log2));
+ build_fw_riwrh((void *)wqe, T3_WR_FASTREG, 0,
+ Q_GENBIT(wq->wptr + 1, wq->size_log2),
+ 0, 1 + wr->wr.fast_reg.page_list_len - T3_MAX_FASTREG_FRAG,
+ T3_EOP);
+
+ p = &wqe->pbl_frag.pbl_addrs[0];
+ }
+ *p = cpu_to_be64((u64)wr->wr.fast_reg.page_list->page_list[i]);
+ }
+ *flit_cnt = 5 + wr->wr.fast_reg.page_list_len;
+ if (*flit_cnt > 15)
+ *flit_cnt = 15;
+ return 0;
+}
+
+static int build_inv_stag(union t3_wr *wqe, struct ib_send_wr *wr,
+ u8 *flit_cnt)
+{
+ wqe->local_inv.stag = cpu_to_be32(wr->ex.invalidate_rkey);
+ wqe->local_inv.reserved = 0;
+ *flit_cnt = sizeof(struct t3_local_inv_wr) >> 3;
+ return 0;
+}
+
+static int iwch_sgl2pbl_map(struct iwch_dev *rhp, struct ib_sge *sg_list,
+ u32 num_sgle, u32 * pbl_addr, u8 * page_size)
+{
+ int i;
+ struct iwch_mr *mhp;
+ u64 offset;
+ for (i = 0; i < num_sgle; i++) {
+
+ mhp = get_mhp(rhp, (sg_list[i].lkey) >> 8);
+ if (!mhp) {
+ PDBG("%s %d\n", __func__, __LINE__);
+ return -EIO;
+ }
+ if (!mhp->attr.state) {
+ PDBG("%s %d\n", __func__, __LINE__);
+ return -EIO;
+ }
+ if (mhp->attr.zbva) {
+ PDBG("%s %d\n", __func__, __LINE__);
+ return -EIO;
+ }
+
+ if (sg_list[i].addr < mhp->attr.va_fbo) {
+ PDBG("%s %d\n", __func__, __LINE__);
+ return -EINVAL;
+ }
+ if (sg_list[i].addr + ((u64) sg_list[i].length) <
+ sg_list[i].addr) {
+ PDBG("%s %d\n", __func__, __LINE__);
+ return -EINVAL;
+ }
+ if (sg_list[i].addr + ((u64) sg_list[i].length) >
+ mhp->attr.va_fbo + ((u64) mhp->attr.len)) {
+ PDBG("%s %d\n", __func__, __LINE__);
+ return -EINVAL;
+ }
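+		/*
+		 * Convert the SGE address to an offset within the MR and then
+		 * to an index into the adapter's PBL.
+		 */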
+ offset = sg_list[i].addr - mhp->attr.va_fbo;
+ offset += mhp->attr.va_fbo &
+ ((1UL << (12 + mhp->attr.page_size)) - 1);
+ pbl_addr[i] = ((mhp->attr.pbl_addr -
+ rhp->rdev.rnic_info.pbl_base) >> 3) +
+ (offset >> (12 + mhp->attr.page_size));
+ page_size[i] = mhp->attr.page_size;
+ }
+ return 0;
+}
+
+static int build_rdma_recv(struct iwch_qp *qhp, union t3_wr *wqe,
+ struct ib_recv_wr *wr)
+{
+ int i, err = 0;
+ u32 pbl_addr[T3_MAX_SGE];
+ u8 page_size[T3_MAX_SGE];
+
+ err = iwch_sgl2pbl_map(qhp->rhp, wr->sg_list, wr->num_sge, pbl_addr,
+ page_size);
+ if (err)
+ return err;
+ wqe->recv.pagesz[0] = page_size[0];
+ wqe->recv.pagesz[1] = page_size[1];
+ wqe->recv.pagesz[2] = page_size[2];
+ wqe->recv.pagesz[3] = page_size[3];
+ wqe->recv.num_sgle = cpu_to_be32(wr->num_sge);
+ for (i = 0; i < wr->num_sge; i++) {
+ wqe->recv.sgl[i].stag = cpu_to_be32(wr->sg_list[i].lkey);
+ wqe->recv.sgl[i].len = cpu_to_be32(wr->sg_list[i].length);
+
+		/* 'to' in the WQE is the offset into the page */
+ wqe->recv.sgl[i].to = cpu_to_be64(((u32)wr->sg_list[i].addr) &
+ ((1UL << (12 + page_size[i])) - 1));
+
+		/* pbl_addr is the adapter's address in the PBL */
+ wqe->recv.pbl_addr[i] = cpu_to_be32(pbl_addr[i]);
+ }
+ for (; i < T3_MAX_SGE; i++) {
+ wqe->recv.sgl[i].stag = 0;
+ wqe->recv.sgl[i].len = 0;
+ wqe->recv.sgl[i].to = 0;
+ wqe->recv.pbl_addr[i] = 0;
+ }
+ qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr,
+ qhp->wq.rq_size_log2)].wr_id = wr->wr_id;
+ qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr,
+ qhp->wq.rq_size_log2)].pbl_addr = 0;
+ return 0;
+}
+
+static int build_zero_stag_recv(struct iwch_qp *qhp, union t3_wr *wqe,
+ struct ib_recv_wr *wr)
+{
+ int i;
+ u32 pbl_addr;
+ u32 pbl_offset;
+
+ /*
+ * The T3 HW requires the PBL in the HW recv descriptor to reference
+ * a PBL entry. So we allocate the max needed PBL memory here and pass
+ * it to the uP in the recv WR. The uP will build the PBL and setup
+ * the HW recv descriptor.
+ */
+ pbl_addr = cxio_hal_pblpool_alloc(&qhp->rhp->rdev, T3_STAG0_PBL_SIZE);
+ if (!pbl_addr)
+ return -ENOMEM;
+
+ /*
+ * Compute the 8B aligned offset.
+ */
+ pbl_offset = (pbl_addr - qhp->rhp->rdev.rnic_info.pbl_base) >> 3;
+
+ wqe->recv.num_sgle = cpu_to_be32(wr->num_sge);
+
+ for (i = 0; i < wr->num_sge; i++) {
+
+ /*
+			 * Use a 128MB page size.  This and an imposed 128MB
+			 * sge length limit allow us to require only a 2-entry
+			 * HW PBL for each SGE.  This restriction is acceptable
+			 * since it is not possible to allocate 128MB of
+			 * contiguous DMA coherent memory!
+ */
+ if (wr->sg_list[i].length > T3_STAG0_MAX_PBE_LEN)
+ return -EINVAL;
+ wqe->recv.pagesz[i] = T3_STAG0_PAGE_SHIFT;
+
+ /*
+ * T3 restricts a recv to all zero-stag or all non-zero-stag.
+ */
+ if (wr->sg_list[i].lkey != 0)
+ return -EINVAL;
+ wqe->recv.sgl[i].stag = 0;
+ wqe->recv.sgl[i].len = cpu_to_be32(wr->sg_list[i].length);
+ wqe->recv.sgl[i].to = cpu_to_be64(wr->sg_list[i].addr);
+ wqe->recv.pbl_addr[i] = cpu_to_be32(pbl_offset);
+ pbl_offset += 2;
+ }
+ for (; i < T3_MAX_SGE; i++) {
+ wqe->recv.pagesz[i] = 0;
+ wqe->recv.sgl[i].stag = 0;
+ wqe->recv.sgl[i].len = 0;
+ wqe->recv.sgl[i].to = 0;
+ wqe->recv.pbl_addr[i] = 0;
+ }
+ qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr,
+ qhp->wq.rq_size_log2)].wr_id = wr->wr_id;
+ qhp->wq.rq[Q_PTR2IDX(qhp->wq.rq_wptr,
+ qhp->wq.rq_size_log2)].pbl_addr = pbl_addr;
+ return 0;
+}
+
+int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr)
+{
+ int err = 0;
+ u8 uninitialized_var(t3_wr_flit_cnt);
+ enum t3_wr_opcode t3_wr_opcode = 0;
+ enum t3_wr_flags t3_wr_flags;
+ struct iwch_qp *qhp;
+ u32 idx;
+ union t3_wr *wqe;
+ u32 num_wrs;
+ unsigned long flag;
+ struct t3_swsq *sqp;
+ int wr_cnt = 1;
+
+ qhp = to_iwch_qp(ibqp);
+ spin_lock_irqsave(&qhp->lock, flag);
+ if (qhp->attr.state > IWCH_QP_STATE_RTS) {
+ spin_unlock_irqrestore(&qhp->lock, flag);
+ err = -EINVAL;
+ goto out;
+ }
+ num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr,
+ qhp->wq.sq_size_log2);
+ if (num_wrs == 0) {
+ spin_unlock_irqrestore(&qhp->lock, flag);
+ err = -ENOMEM;
+ goto out;
+ }
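+	/*
+	 * Translate each WR into a T3 WQE (a fast-register WR may take two
+	 * slots); the doorbell is rung once after the loop.
+	 */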
+ while (wr) {
+ if (num_wrs == 0) {
+ err = -ENOMEM;
+ break;
+ }
+ idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2);
+ wqe = (union t3_wr *) (qhp->wq.queue + idx);
+ t3_wr_flags = 0;
+ if (wr->send_flags & IB_SEND_SOLICITED)
+ t3_wr_flags |= T3_SOLICITED_EVENT_FLAG;
+ if (wr->send_flags & IB_SEND_SIGNALED)
+ t3_wr_flags |= T3_COMPLETION_FLAG;
+ sqp = qhp->wq.sq +
+ Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2);
+ switch (wr->opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_INV:
+ if (wr->send_flags & IB_SEND_FENCE)
+ t3_wr_flags |= T3_READ_FENCE_FLAG;
+ t3_wr_opcode = T3_WR_SEND;
+ err = build_rdma_send(wqe, wr, &t3_wr_flit_cnt);
+ break;
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ t3_wr_opcode = T3_WR_WRITE;
+ err = build_rdma_write(wqe, wr, &t3_wr_flit_cnt);
+ break;
+ case IB_WR_RDMA_READ:
+ case IB_WR_RDMA_READ_WITH_INV:
+ t3_wr_opcode = T3_WR_READ;
+ t3_wr_flags = 0; /* T3 reads are always signaled */
+ err = build_rdma_read(wqe, wr, &t3_wr_flit_cnt);
+ if (err)
+ break;
+ sqp->read_len = wqe->read.local_len;
+ if (!qhp->wq.oldest_read)
+ qhp->wq.oldest_read = sqp;
+ break;
+ case IB_WR_FAST_REG_MR:
+ t3_wr_opcode = T3_WR_FASTREG;
+ err = build_fastreg(wqe, wr, &t3_wr_flit_cnt,
+ &wr_cnt, &qhp->wq);
+ break;
+ case IB_WR_LOCAL_INV:
+ if (wr->send_flags & IB_SEND_FENCE)
+ t3_wr_flags |= T3_LOCAL_FENCE_FLAG;
+ t3_wr_opcode = T3_WR_INV_STAG;
+ err = build_inv_stag(wqe, wr, &t3_wr_flit_cnt);
+ break;
+ default:
+ PDBG("%s post of type=%d TBD!\n", __func__,
+ wr->opcode);
+ err = -EINVAL;
+ }
+ if (err)
+ break;
+ wqe->send.wrid.id0.hi = qhp->wq.sq_wptr;
+ sqp->wr_id = wr->wr_id;
+ sqp->opcode = wr2opcode(t3_wr_opcode);
+ sqp->sq_wptr = qhp->wq.sq_wptr;
+ sqp->complete = 0;
+ sqp->signaled = (wr->send_flags & IB_SEND_SIGNALED);
+
+ build_fw_riwrh((void *) wqe, t3_wr_opcode, t3_wr_flags,
+ Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2),
+ 0, t3_wr_flit_cnt,
+ (wr_cnt == 1) ? T3_SOPEOP : T3_SOP);
+ PDBG("%s cookie 0x%llx wq idx 0x%x swsq idx %ld opcode %d\n",
+ __func__, (unsigned long long) wr->wr_id, idx,
+ Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2),
+ sqp->opcode);
+ wr = wr->next;
+ num_wrs--;
+ qhp->wq.wptr += wr_cnt;
+ ++(qhp->wq.sq_wptr);
+ }
+ spin_unlock_irqrestore(&qhp->lock, flag);
+ if (cxio_wq_db_enabled(&qhp->wq))
+ ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid);
+
+out:
+ if (err)
+ *bad_wr = wr;
+ return err;
+}
+
+int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ int err = 0;
+ struct iwch_qp *qhp;
+ u32 idx;
+ union t3_wr *wqe;
+ u32 num_wrs;
+ unsigned long flag;
+
+ qhp = to_iwch_qp(ibqp);
+ spin_lock_irqsave(&qhp->lock, flag);
+ if (qhp->attr.state > IWCH_QP_STATE_RTS) {
+ spin_unlock_irqrestore(&qhp->lock, flag);
+ err = -EINVAL;
+ goto out;
+ }
+ num_wrs = Q_FREECNT(qhp->wq.rq_rptr, qhp->wq.rq_wptr,
+ qhp->wq.rq_size_log2) - 1;
+ if (!wr) {
+ spin_unlock_irqrestore(&qhp->lock, flag);
+ err = -ENOMEM;
+ goto out;
+ }
+ while (wr) {
+ if (wr->num_sge > T3_MAX_SGE) {
+ err = -EINVAL;
+ break;
+ }
+ idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2);
+ wqe = (union t3_wr *) (qhp->wq.queue + idx);
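+		/* A zero lkey selects the zero-STag receive path, which sets up its own PBL. */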
+		if (num_wrs) {
+			if (wr->sg_list[0].lkey)
+				err = build_rdma_recv(qhp, wqe, wr);
+			else
+				err = build_zero_stag_recv(qhp, wqe, wr);
+		} else
+			err = -ENOMEM;
+
+ if (err)
+ break;
+
+ build_fw_riwrh((void *) wqe, T3_WR_RCV, T3_COMPLETION_FLAG,
+ Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2),
+ 0, sizeof(struct t3_receive_wr) >> 3, T3_SOPEOP);
+		PDBG("%s cookie 0x%llx idx 0x%x rq_wptr 0x%x rq_rptr 0x%x "
+		     "wqe %p\n", __func__, (unsigned long long) wr->wr_id,
+ idx, qhp->wq.rq_wptr, qhp->wq.rq_rptr, wqe);
+ ++(qhp->wq.rq_wptr);
+ ++(qhp->wq.wptr);
+ wr = wr->next;
+ num_wrs--;
+ }
+ spin_unlock_irqrestore(&qhp->lock, flag);
+ if (cxio_wq_db_enabled(&qhp->wq))
+ ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid);
+
+out:
+ if (err)
+ *bad_wr = wr;
+ return err;
+}
+
+int iwch_bind_mw(struct ib_qp *qp,
+ struct ib_mw *mw,
+ struct ib_mw_bind *mw_bind)
+{
+ struct iwch_dev *rhp;
+ struct iwch_mw *mhp;
+ struct iwch_qp *qhp;
+ union t3_wr *wqe;
+ u32 pbl_addr;
+ u8 page_size;
+ u32 num_wrs;
+ unsigned long flag;
+ struct ib_sge sgl;
+	int err = 0;
+ enum t3_wr_flags t3_wr_flags;
+ u32 idx;
+ struct t3_swsq *sqp;
+
+ qhp = to_iwch_qp(qp);
+ mhp = to_iwch_mw(mw);
+ rhp = qhp->rhp;
+
+ spin_lock_irqsave(&qhp->lock, flag);
+ if (qhp->attr.state > IWCH_QP_STATE_RTS) {
+ spin_unlock_irqrestore(&qhp->lock, flag);
+ return -EINVAL;
+ }
+ num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr,
+ qhp->wq.sq_size_log2);
+ if (num_wrs == 0) {
+ spin_unlock_irqrestore(&qhp->lock, flag);
+ return -ENOMEM;
+ }
+ idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2);
+ PDBG("%s: idx 0x%0x, mw 0x%p, mw_bind 0x%p\n", __func__, idx,
+ mw, mw_bind);
+ wqe = (union t3_wr *) (qhp->wq.queue + idx);
+
+ t3_wr_flags = 0;
+ if (mw_bind->send_flags & IB_SEND_SIGNALED)
+ t3_wr_flags = T3_COMPLETION_FLAG;
+
+ sgl.addr = mw_bind->bind_info.addr;
+ sgl.lkey = mw_bind->bind_info.mr->lkey;
+ sgl.length = mw_bind->bind_info.length;
+ wqe->bind.reserved = 0;
+ wqe->bind.type = TPT_VATO;
+
+ /* TBD: check perms */
+ wqe->bind.perms = iwch_ib_to_tpt_bind_access(
+ mw_bind->bind_info.mw_access_flags);
+ wqe->bind.mr_stag = cpu_to_be32(mw_bind->bind_info.mr->lkey);
+ wqe->bind.mw_stag = cpu_to_be32(mw->rkey);
+ wqe->bind.mw_len = cpu_to_be32(mw_bind->bind_info.length);
+ wqe->bind.mw_va = cpu_to_be64(mw_bind->bind_info.addr);
+ err = iwch_sgl2pbl_map(rhp, &sgl, 1, &pbl_addr, &page_size);
+ if (err) {
+ spin_unlock_irqrestore(&qhp->lock, flag);
+ return err;
+ }
+ wqe->send.wrid.id0.hi = qhp->wq.sq_wptr;
+ sqp = qhp->wq.sq + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2);
+ sqp->wr_id = mw_bind->wr_id;
+ sqp->opcode = T3_BIND_MW;
+ sqp->sq_wptr = qhp->wq.sq_wptr;
+ sqp->complete = 0;
+ sqp->signaled = (mw_bind->send_flags & IB_SEND_SIGNALED);
+ wqe->bind.mr_pbl_addr = cpu_to_be32(pbl_addr);
+ wqe->bind.mr_pagesz = page_size;
+ build_fw_riwrh((void *)wqe, T3_WR_BIND, t3_wr_flags,
+ Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), 0,
+ sizeof(struct t3_bind_mw_wr) >> 3, T3_SOPEOP);
+ ++(qhp->wq.wptr);
+ ++(qhp->wq.sq_wptr);
+ spin_unlock_irqrestore(&qhp->lock, flag);
+
+ if (cxio_wq_db_enabled(&qhp->wq))
+ ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid);
+
+ return err;
+}
+
+static inline void build_term_codes(struct respQ_msg_t *rsp_msg,
+ u8 *layer_type, u8 *ecode)
+{
+ int status = TPT_ERR_INTERNAL_ERR;
+ int tagged = 0;
+ int opcode = -1;
+ int rqtype = 0;
+ int send_inv = 0;
+
+ if (rsp_msg) {
+ status = CQE_STATUS(rsp_msg->cqe);
+ opcode = CQE_OPCODE(rsp_msg->cqe);
+ rqtype = RQ_TYPE(rsp_msg->cqe);
+ send_inv = (opcode == T3_SEND_WITH_INV) ||
+ (opcode == T3_SEND_WITH_SE_INV);
+ tagged = (opcode == T3_RDMA_WRITE) ||
+ (rqtype && (opcode == T3_READ_RESP));
+ }
+
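+	/* Map the CQE status onto the iWARP TERMINATE layer/etype and error code. */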
+ switch (status) {
+ case TPT_ERR_STAG:
+ if (send_inv) {
+ *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP;
+ *ecode = RDMAP_CANT_INV_STAG;
+ } else {
+ *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+ *ecode = RDMAP_INV_STAG;
+ }
+ break;
+ case TPT_ERR_PDID:
+ *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+ if ((opcode == T3_SEND_WITH_INV) ||
+ (opcode == T3_SEND_WITH_SE_INV))
+ *ecode = RDMAP_CANT_INV_STAG;
+ else
+ *ecode = RDMAP_STAG_NOT_ASSOC;
+ break;
+ case TPT_ERR_QPID:
+ *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+ *ecode = RDMAP_STAG_NOT_ASSOC;
+ break;
+ case TPT_ERR_ACCESS:
+ *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+ *ecode = RDMAP_ACC_VIOL;
+ break;
+ case TPT_ERR_WRAP:
+ *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+ *ecode = RDMAP_TO_WRAP;
+ break;
+ case TPT_ERR_BOUND:
+ if (tagged) {
+ *layer_type = LAYER_DDP|DDP_TAGGED_ERR;
+ *ecode = DDPT_BASE_BOUNDS;
+ } else {
+ *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+ *ecode = RDMAP_BASE_BOUNDS;
+ }
+ break;
+ case TPT_ERR_INVALIDATE_SHARED_MR:
+ case TPT_ERR_INVALIDATE_MR_WITH_MW_BOUND:
+ *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP;
+ *ecode = RDMAP_CANT_INV_STAG;
+ break;
+ case TPT_ERR_ECC:
+ case TPT_ERR_ECC_PSTAG:
+ case TPT_ERR_INTERNAL_ERR:
+ *layer_type = LAYER_RDMAP|RDMAP_LOCAL_CATA;
+ *ecode = 0;
+ break;
+ case TPT_ERR_OUT_OF_RQE:
+ *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+ *ecode = DDPU_INV_MSN_NOBUF;
+ break;
+ case TPT_ERR_PBL_ADDR_BOUND:
+ *layer_type = LAYER_DDP|DDP_TAGGED_ERR;
+ *ecode = DDPT_BASE_BOUNDS;
+ break;
+ case TPT_ERR_CRC:
+ *layer_type = LAYER_MPA|DDP_LLP;
+ *ecode = MPA_CRC_ERR;
+ break;
+ case TPT_ERR_MARKER:
+ *layer_type = LAYER_MPA|DDP_LLP;
+ *ecode = MPA_MARKER_ERR;
+ break;
+ case TPT_ERR_PDU_LEN_ERR:
+ *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+ *ecode = DDPU_MSG_TOOBIG;
+ break;
+ case TPT_ERR_DDP_VERSION:
+ if (tagged) {
+ *layer_type = LAYER_DDP|DDP_TAGGED_ERR;
+ *ecode = DDPT_INV_VERS;
+ } else {
+ *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+ *ecode = DDPU_INV_VERS;
+ }
+ break;
+ case TPT_ERR_RDMA_VERSION:
+ *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP;
+ *ecode = RDMAP_INV_VERS;
+ break;
+ case TPT_ERR_OPCODE:
+ *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP;
+ *ecode = RDMAP_INV_OPCODE;
+ break;
+ case TPT_ERR_DDP_QUEUE_NUM:
+ *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+ *ecode = DDPU_INV_QN;
+ break;
+ case TPT_ERR_MSN:
+ case TPT_ERR_MSN_GAP:
+ case TPT_ERR_MSN_RANGE:
+ case TPT_ERR_IRD_OVERFLOW:
+ *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+ *ecode = DDPU_INV_MSN_RANGE;
+ break;
+ case TPT_ERR_TBIT:
+ *layer_type = LAYER_DDP|DDP_LOCAL_CATA;
+ *ecode = 0;
+ break;
+ case TPT_ERR_MO:
+ *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+ *ecode = DDPU_INV_MO;
+ break;
+ default:
+ *layer_type = LAYER_RDMAP|DDP_LOCAL_CATA;
+ *ecode = 0;
+ break;
+ }
+}
+
+int iwch_post_zb_read(struct iwch_ep *ep)
+{
+ union t3_wr *wqe;
+ struct sk_buff *skb;
+ u8 flit_cnt = sizeof(struct t3_rdma_read_wr) >> 3;
+
+ PDBG("%s enter\n", __func__);
+ skb = alloc_skb(40, GFP_KERNEL);
+ if (!skb) {
+ printk(KERN_ERR "%s cannot send zb_read!!\n", __func__);
+ return -ENOMEM;
+ }
+ wqe = (union t3_wr *)skb_put(skb, sizeof(struct t3_rdma_read_wr));
+ memset(wqe, 0, sizeof(struct t3_rdma_read_wr));
+ wqe->read.rdmaop = T3_READ_REQ;
+ wqe->read.reserved[0] = 0;
+ wqe->read.reserved[1] = 0;
+ wqe->read.rem_stag = cpu_to_be32(1);
+ wqe->read.rem_to = cpu_to_be64(1);
+ wqe->read.local_stag = cpu_to_be32(1);
+ wqe->read.local_len = cpu_to_be32(0);
+ wqe->read.local_to = cpu_to_be64(1);
+ wqe->send.wrh.op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(T3_WR_READ));
+ wqe->send.wrh.gen_tid_len = cpu_to_be32(V_FW_RIWR_TID(ep->hwtid)|
+ V_FW_RIWR_LEN(flit_cnt));
+ skb->priority = CPL_PRIORITY_DATA;
+ return iwch_cxgb3_ofld_send(ep->com.qp->rhp->rdev.t3cdev_p, skb);
+}
+
+/*
+ * This posts a TERMINATE with layer=RDMA, type=catastrophic.
+ */
+int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg)
+{
+ union t3_wr *wqe;
+ struct terminate_message *term;
+ struct sk_buff *skb;
+
+ PDBG("%s %d\n", __func__, __LINE__);
+ skb = alloc_skb(40, GFP_ATOMIC);
+ if (!skb) {
+ printk(KERN_ERR "%s cannot send TERMINATE!\n", __func__);
+ return -ENOMEM;
+ }
+ wqe = (union t3_wr *)skb_put(skb, 40);
+ memset(wqe, 0, 40);
+ wqe->send.rdmaop = T3_TERMINATE;
+
+ /* immediate data length */
+ wqe->send.plen = htonl(4);
+
+ /* immediate data starts here. */
+ term = (struct terminate_message *)wqe->send.sgl;
+ build_term_codes(rsp_msg, &term->layer_etype, &term->ecode);
+ wqe->send.wrh.op_seop_flags = cpu_to_be32(V_FW_RIWR_OP(T3_WR_SEND) |
+ V_FW_RIWR_FLAGS(T3_COMPLETION_FLAG | T3_NOTIFY_FLAG));
+ wqe->send.wrh.gen_tid_len = cpu_to_be32(V_FW_RIWR_TID(qhp->ep->hwtid));
+ skb->priority = CPL_PRIORITY_DATA;
+ return iwch_cxgb3_ofld_send(qhp->rhp->rdev.t3cdev_p, skb);
+}
+
+/*
+ * Assumes qhp lock is held.
+ */
+static void __flush_qp(struct iwch_qp *qhp, struct iwch_cq *rchp,
+ struct iwch_cq *schp)
+{
+ int count;
+ int flushed;
+
+ PDBG("%s qhp %p rchp %p schp %p\n", __func__, qhp, rchp, schp);
+ /* take a ref on the qhp since we must release the lock */
+ atomic_inc(&qhp->refcnt);
+ spin_unlock(&qhp->lock);
+
+ /* locking hierarchy: cq lock first, then qp lock. */
+ spin_lock(&rchp->lock);
+ spin_lock(&qhp->lock);
+ cxio_flush_hw_cq(&rchp->cq);
+ cxio_count_rcqes(&rchp->cq, &qhp->wq, &count);
+ flushed = cxio_flush_rq(&qhp->wq, &rchp->cq, count);
+ spin_unlock(&qhp->lock);
+ spin_unlock(&rchp->lock);
+ if (flushed) {
+ spin_lock(&rchp->comp_handler_lock);
+ (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context);
+ spin_unlock(&rchp->comp_handler_lock);
+ }
+
+ /* locking hierarchy: cq lock first, then qp lock. */
+ spin_lock(&schp->lock);
+ spin_lock(&qhp->lock);
+ cxio_flush_hw_cq(&schp->cq);
+ cxio_count_scqes(&schp->cq, &qhp->wq, &count);
+ flushed = cxio_flush_sq(&qhp->wq, &schp->cq, count);
+ spin_unlock(&qhp->lock);
+ spin_unlock(&schp->lock);
+ if (flushed) {
+ spin_lock(&schp->comp_handler_lock);
+ (*schp->ibcq.comp_handler)(&schp->ibcq, schp->ibcq.cq_context);
+ spin_unlock(&schp->comp_handler_lock);
+ }
+
+ /* deref */
+ if (atomic_dec_and_test(&qhp->refcnt))
+ wake_up(&qhp->wait);
+
+ spin_lock(&qhp->lock);
+}
+
+static void flush_qp(struct iwch_qp *qhp)
+{
+ struct iwch_cq *rchp, *schp;
+
+ rchp = get_chp(qhp->rhp, qhp->attr.rcq);
+ schp = get_chp(qhp->rhp, qhp->attr.scq);
+
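+	/*
+	 * For user-mode QPs, just mark the WQ and CQs in error and notify the
+	 * completion handlers; kernel QPs get a full flush below.
+	 */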
+ if (qhp->ibqp.uobject) {
+ cxio_set_wq_in_error(&qhp->wq);
+ cxio_set_cq_in_error(&rchp->cq);
+ spin_lock(&rchp->comp_handler_lock);
+ (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context);
+ spin_unlock(&rchp->comp_handler_lock);
+ if (schp != rchp) {
+ cxio_set_cq_in_error(&schp->cq);
+ spin_lock(&schp->comp_handler_lock);
+ (*schp->ibcq.comp_handler)(&schp->ibcq,
+ schp->ibcq.cq_context);
+ spin_unlock(&schp->comp_handler_lock);
+ }
+ return;
+ }
+ __flush_qp(qhp, rchp, schp);
+}
+
+/*
+ * Return count of RECV WRs posted
+ */
+u16 iwch_rqes_posted(struct iwch_qp *qhp)
+{
+ union t3_wr *wqe = qhp->wq.queue;
+ u16 count = 0;
+
+ while (count < USHRT_MAX && fw_riwrh_opcode((struct fw_riwrh *)wqe) == T3_WR_RCV) {
+ count++;
+ wqe++;
+ }
+ PDBG("%s qhp %p count %u\n", __func__, qhp, count);
+ return count;
+}
+
+static int rdma_init(struct iwch_dev *rhp, struct iwch_qp *qhp,
+ enum iwch_qp_attr_mask mask,
+ struct iwch_qp_attributes *attrs)
+{
+ struct t3_rdma_init_attr init_attr;
+ int ret;
+
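+	/* Gather the QP, MPA and connection parameters for cxio_rdma_init(). */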
+ init_attr.tid = qhp->ep->hwtid;
+ init_attr.qpid = qhp->wq.qpid;
+ init_attr.pdid = qhp->attr.pd;
+ init_attr.scqid = qhp->attr.scq;
+ init_attr.rcqid = qhp->attr.rcq;
+ init_attr.rq_addr = qhp->wq.rq_addr;
+ init_attr.rq_size = 1 << qhp->wq.rq_size_log2;
+ init_attr.mpaattrs = uP_RI_MPA_IETF_ENABLE |
+ qhp->attr.mpa_attr.recv_marker_enabled |
+ (qhp->attr.mpa_attr.xmit_marker_enabled << 1) |
+ (qhp->attr.mpa_attr.crc_enabled << 2);
+
+ init_attr.qpcaps = uP_RI_QP_RDMA_READ_ENABLE |
+ uP_RI_QP_RDMA_WRITE_ENABLE |
+ uP_RI_QP_BIND_ENABLE;
+ if (!qhp->ibqp.uobject)
+ init_attr.qpcaps |= uP_RI_QP_STAG0_ENABLE |
+ uP_RI_QP_FAST_REGISTER_ENABLE;
+
+ init_attr.tcp_emss = qhp->ep->emss;
+ init_attr.ord = qhp->attr.max_ord;
+ init_attr.ird = qhp->attr.max_ird;
+ init_attr.qp_dma_addr = qhp->wq.dma_addr;
+ init_attr.qp_dma_size = (1UL << qhp->wq.size_log2);
+ init_attr.rqe_count = iwch_rqes_posted(qhp);
+ init_attr.flags = qhp->attr.mpa_attr.initiator ? MPA_INITIATOR : 0;
+ init_attr.chan = qhp->ep->l2t->smt_idx;
+ if (peer2peer) {
+ init_attr.rtr_type = RTR_READ;
+ if (init_attr.ord == 0 && qhp->attr.mpa_attr.initiator)
+ init_attr.ord = 1;
+ if (init_attr.ird == 0 && !qhp->attr.mpa_attr.initiator)
+ init_attr.ird = 1;
+ } else
+ init_attr.rtr_type = 0;
+ init_attr.irs = qhp->ep->rcv_seq;
+ PDBG("%s init_attr.rq_addr 0x%x init_attr.rq_size = %d "
+ "flags 0x%x qpcaps 0x%x\n", __func__,
+ init_attr.rq_addr, init_attr.rq_size,
+ init_attr.flags, init_attr.qpcaps);
+ ret = cxio_rdma_init(&rhp->rdev, &init_attr);
+ PDBG("%s ret %d\n", __func__, ret);
+ return ret;
+}
+
+int iwch_modify_qp(struct iwch_dev *rhp, struct iwch_qp *qhp,
+ enum iwch_qp_attr_mask mask,
+ struct iwch_qp_attributes *attrs,
+ int internal)
+{
+ int ret = 0;
+ struct iwch_qp_attributes newattr = qhp->attr;
+ unsigned long flag;
+ int disconnect = 0;
+ int terminate = 0;
+ int abort = 0;
+ int free = 0;
+ struct iwch_ep *ep = NULL;
+
+ PDBG("%s qhp %p qpid 0x%x ep %p state %d -> %d\n", __func__,
+ qhp, qhp->wq.qpid, qhp->ep, qhp->attr.state,
+ (mask & IWCH_QP_ATTR_NEXT_STATE) ? attrs->next_state : -1);
+
+ spin_lock_irqsave(&qhp->lock, flag);
+
+ /* Process attr changes if in IDLE */
+ if (mask & IWCH_QP_ATTR_VALID_MODIFY) {
+ if (qhp->attr.state != IWCH_QP_STATE_IDLE) {
+ ret = -EIO;
+ goto out;
+ }
+ if (mask & IWCH_QP_ATTR_ENABLE_RDMA_READ)
+ newattr.enable_rdma_read = attrs->enable_rdma_read;
+ if (mask & IWCH_QP_ATTR_ENABLE_RDMA_WRITE)
+ newattr.enable_rdma_write = attrs->enable_rdma_write;
+ if (mask & IWCH_QP_ATTR_ENABLE_RDMA_BIND)
+ newattr.enable_bind = attrs->enable_bind;
+ if (mask & IWCH_QP_ATTR_MAX_ORD) {
+ if (attrs->max_ord >
+ rhp->attr.max_rdma_read_qp_depth) {
+ ret = -EINVAL;
+ goto out;
+ }
+ newattr.max_ord = attrs->max_ord;
+ }
+ if (mask & IWCH_QP_ATTR_MAX_IRD) {
+ if (attrs->max_ird >
+ rhp->attr.max_rdma_reads_per_qp) {
+ ret = -EINVAL;
+ goto out;
+ }
+ newattr.max_ird = attrs->max_ird;
+ }
+ qhp->attr = newattr;
+ }
+
+ if (!(mask & IWCH_QP_ATTR_NEXT_STATE))
+ goto out;
+ if (qhp->attr.state == attrs->next_state)
+ goto out;
+
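+	/* QP state machine: the action depends on the current state and the requested next_state. */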
+ switch (qhp->attr.state) {
+ case IWCH_QP_STATE_IDLE:
+ switch (attrs->next_state) {
+ case IWCH_QP_STATE_RTS:
+ if (!(mask & IWCH_QP_ATTR_LLP_STREAM_HANDLE)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!(mask & IWCH_QP_ATTR_MPA_ATTR)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ qhp->attr.mpa_attr = attrs->mpa_attr;
+ qhp->attr.llp_stream_handle = attrs->llp_stream_handle;
+ qhp->ep = qhp->attr.llp_stream_handle;
+ qhp->attr.state = IWCH_QP_STATE_RTS;
+
+ /*
+ * Ref the endpoint here and deref when we
+ * disassociate the endpoint from the QP. This
+ * happens in CLOSING->IDLE transition or *->ERROR
+ * transition.
+ */
+ get_ep(&qhp->ep->com);
+ spin_unlock_irqrestore(&qhp->lock, flag);
+ ret = rdma_init(rhp, qhp, mask, attrs);
+ spin_lock_irqsave(&qhp->lock, flag);
+ if (ret)
+ goto err;
+ break;
+ case IWCH_QP_STATE_ERROR:
+ qhp->attr.state = IWCH_QP_STATE_ERROR;
+ flush_qp(qhp);
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+ break;
+ case IWCH_QP_STATE_RTS:
+ switch (attrs->next_state) {
+ case IWCH_QP_STATE_CLOSING:
+ BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2);
+ qhp->attr.state = IWCH_QP_STATE_CLOSING;
+ if (!internal) {
+				abort = 0;
+ disconnect = 1;
+ ep = qhp->ep;
+ get_ep(&ep->com);
+ }
+ break;
+ case IWCH_QP_STATE_TERMINATE:
+ qhp->attr.state = IWCH_QP_STATE_TERMINATE;
+ if (qhp->ibqp.uobject)
+ cxio_set_wq_in_error(&qhp->wq);
+ if (!internal)
+ terminate = 1;
+ break;
+ case IWCH_QP_STATE_ERROR:
+ qhp->attr.state = IWCH_QP_STATE_ERROR;
+ if (!internal) {
+				abort = 1;
+ disconnect = 1;
+ ep = qhp->ep;
+ get_ep(&ep->com);
+ }
+ goto err;
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+ break;
+ case IWCH_QP_STATE_CLOSING:
+ if (!internal) {
+ ret = -EINVAL;
+ goto out;
+ }
+ switch (attrs->next_state) {
+ case IWCH_QP_STATE_IDLE:
+ flush_qp(qhp);
+ qhp->attr.state = IWCH_QP_STATE_IDLE;
+ qhp->attr.llp_stream_handle = NULL;
+ put_ep(&qhp->ep->com);
+ qhp->ep = NULL;
+ wake_up(&qhp->wait);
+ break;
+ case IWCH_QP_STATE_ERROR:
+ goto err;
+ default:
+ ret = -EINVAL;
+ goto err;
+ }
+ break;
+ case IWCH_QP_STATE_ERROR:
+ if (attrs->next_state != IWCH_QP_STATE_IDLE) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (!Q_EMPTY(qhp->wq.sq_rptr, qhp->wq.sq_wptr) ||
+ !Q_EMPTY(qhp->wq.rq_rptr, qhp->wq.rq_wptr)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ qhp->attr.state = IWCH_QP_STATE_IDLE;
+ break;
+ case IWCH_QP_STATE_TERMINATE:
+ if (!internal) {
+ ret = -EINVAL;
+ goto out;
+ }
+ goto err;
+ break;
+ default:
+ printk(KERN_ERR "%s in a bad state %d\n",
+ __func__, qhp->attr.state);
+ ret = -EINVAL;
+ goto err;
+ break;
+ }
+ goto out;
+err:
+ PDBG("%s disassociating ep %p qpid 0x%x\n", __func__, qhp->ep,
+ qhp->wq.qpid);
+
+ /* disassociate the LLP connection */
+ qhp->attr.llp_stream_handle = NULL;
+ ep = qhp->ep;
+ qhp->ep = NULL;
+ qhp->attr.state = IWCH_QP_STATE_ERROR;
+	free = 1;
+ wake_up(&qhp->wait);
+ BUG_ON(!ep);
+ flush_qp(qhp);
+out:
+ spin_unlock_irqrestore(&qhp->lock, flag);
+
+ if (terminate)
+ iwch_post_terminate(qhp, NULL);
+
+ /*
+ * If disconnect is 1, then we need to initiate a disconnect
+ * on the EP. This can be a normal close (RTS->CLOSING) or
+ * an abnormal close (RTS/CLOSING->ERROR).
+ */
+ if (disconnect) {
+ iwch_ep_disconnect(ep, abort, GFP_KERNEL);
+ put_ep(&ep->com);
+ }
+
+ /*
+ * If free is 1, then we've disassociated the EP from the QP
+ * and we need to dereference the EP.
+ */
+ if (free)
+ put_ep(&ep->com);
+
+ PDBG("%s exit state %d\n", __func__, qhp->attr.state);
+ return ret;
+}
diff --git a/drivers/infiniband/hw/cxgb3/iwch_user.h b/drivers/infiniband/hw/cxgb3/iwch_user.h
new file mode 100644
index 000000000..a277c31fc
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/iwch_user.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2006 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __IWCH_USER_H__
+#define __IWCH_USER_H__
+
+#define IWCH_UVERBS_ABI_VERSION 1
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
+struct iwch_create_cq_req {
+ __u64 user_rptr_addr;
+};
+
+struct iwch_create_cq_resp_v0 {
+ __u64 key;
+ __u32 cqid;
+ __u32 size_log2;
+};
+
+struct iwch_create_cq_resp {
+ __u64 key;
+ __u32 cqid;
+ __u32 size_log2;
+ __u32 memsize;
+ __u32 reserved;
+};
+
+struct iwch_create_qp_resp {
+ __u64 key;
+ __u64 db_key;
+ __u32 qpid;
+ __u32 size_log2;
+ __u32 sq_size_log2;
+ __u32 rq_size_log2;
+};
+
+struct iwch_reg_user_mr_resp {
+ __u32 pbl_addr;
+};
+#endif
diff --git a/drivers/infiniband/hw/cxgb3/tcb.h b/drivers/infiniband/hw/cxgb3/tcb.h
new file mode 100644
index 000000000..c702dc199
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb3/tcb.h
@@ -0,0 +1,632 @@
+/*
+ * Copyright (c) 2007 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _TCB_DEFS_H
+#define _TCB_DEFS_H
+
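+/*
+ * TCB field accessors: W_* is the 32-bit TCB word in which a field starts,
+ * S_* the field's bit offset within that word, M_* its (unshifted) mask,
+ * and V_*(x) shifts a value into position.
+ */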
+#define W_TCB_T_STATE 0
+#define S_TCB_T_STATE 0
+#define M_TCB_T_STATE 0xfULL
+#define V_TCB_T_STATE(x) ((x) << S_TCB_T_STATE)
+
+#define W_TCB_TIMER 0
+#define S_TCB_TIMER 4
+#define M_TCB_TIMER 0x1ULL
+#define V_TCB_TIMER(x) ((x) << S_TCB_TIMER)
+
+#define W_TCB_DACK_TIMER 0
+#define S_TCB_DACK_TIMER 5
+#define M_TCB_DACK_TIMER 0x1ULL
+#define V_TCB_DACK_TIMER(x) ((x) << S_TCB_DACK_TIMER)
+
+#define W_TCB_DEL_FLAG 0
+#define S_TCB_DEL_FLAG 6
+#define M_TCB_DEL_FLAG 0x1ULL
+#define V_TCB_DEL_FLAG(x) ((x) << S_TCB_DEL_FLAG)
+
+#define W_TCB_L2T_IX 0
+#define S_TCB_L2T_IX 7
+#define M_TCB_L2T_IX 0x7ffULL
+#define V_TCB_L2T_IX(x) ((x) << S_TCB_L2T_IX)
+
+#define W_TCB_SMAC_SEL 0
+#define S_TCB_SMAC_SEL 18
+#define M_TCB_SMAC_SEL 0x3ULL
+#define V_TCB_SMAC_SEL(x) ((x) << S_TCB_SMAC_SEL)
+
+#define W_TCB_TOS 0
+#define S_TCB_TOS 20
+#define M_TCB_TOS 0x3fULL
+#define V_TCB_TOS(x) ((x) << S_TCB_TOS)
+
+#define W_TCB_MAX_RT 0
+#define S_TCB_MAX_RT 26
+#define M_TCB_MAX_RT 0xfULL
+#define V_TCB_MAX_RT(x) ((x) << S_TCB_MAX_RT)
+
+#define W_TCB_T_RXTSHIFT 0
+#define S_TCB_T_RXTSHIFT 30
+#define M_TCB_T_RXTSHIFT 0xfULL
+#define V_TCB_T_RXTSHIFT(x) ((x) << S_TCB_T_RXTSHIFT)
+
+#define W_TCB_T_DUPACKS 1
+#define S_TCB_T_DUPACKS 2
+#define M_TCB_T_DUPACKS 0xfULL
+#define V_TCB_T_DUPACKS(x) ((x) << S_TCB_T_DUPACKS)
+
+#define W_TCB_T_MAXSEG 1
+#define S_TCB_T_MAXSEG 6
+#define M_TCB_T_MAXSEG 0xfULL
+#define V_TCB_T_MAXSEG(x) ((x) << S_TCB_T_MAXSEG)
+
+#define W_TCB_T_FLAGS1 1
+#define S_TCB_T_FLAGS1 10
+#define M_TCB_T_FLAGS1 0xffffffffULL
+#define V_TCB_T_FLAGS1(x) ((x) << S_TCB_T_FLAGS1)
+
+#define W_TCB_T_MIGRATION 1
+#define S_TCB_T_MIGRATION 20
+#define M_TCB_T_MIGRATION 0x1ULL
+#define V_TCB_T_MIGRATION(x) ((x) << S_TCB_T_MIGRATION)
+
+#define W_TCB_T_FLAGS2 2
+#define S_TCB_T_FLAGS2 10
+#define M_TCB_T_FLAGS2 0x7fULL
+#define V_TCB_T_FLAGS2(x) ((x) << S_TCB_T_FLAGS2)
+
+#define W_TCB_SND_SCALE 2
+#define S_TCB_SND_SCALE 17
+#define M_TCB_SND_SCALE 0xfULL
+#define V_TCB_SND_SCALE(x) ((x) << S_TCB_SND_SCALE)
+
+#define W_TCB_RCV_SCALE 2
+#define S_TCB_RCV_SCALE 21
+#define M_TCB_RCV_SCALE 0xfULL
+#define V_TCB_RCV_SCALE(x) ((x) << S_TCB_RCV_SCALE)
+
+#define W_TCB_SND_UNA_RAW 2
+#define S_TCB_SND_UNA_RAW 25
+#define M_TCB_SND_UNA_RAW 0x7ffffffULL
+#define V_TCB_SND_UNA_RAW(x) ((x) << S_TCB_SND_UNA_RAW)
+
+#define W_TCB_SND_NXT_RAW 3
+#define S_TCB_SND_NXT_RAW 20
+#define M_TCB_SND_NXT_RAW 0x7ffffffULL
+#define V_TCB_SND_NXT_RAW(x) ((x) << S_TCB_SND_NXT_RAW)
+
+#define W_TCB_RCV_NXT 4
+#define S_TCB_RCV_NXT 15
+#define M_TCB_RCV_NXT 0xffffffffULL
+#define V_TCB_RCV_NXT(x) ((x) << S_TCB_RCV_NXT)
+
+#define W_TCB_RCV_ADV 5
+#define S_TCB_RCV_ADV 15
+#define M_TCB_RCV_ADV 0xffffULL
+#define V_TCB_RCV_ADV(x) ((x) << S_TCB_RCV_ADV)
+
+#define W_TCB_SND_MAX_RAW 5
+#define S_TCB_SND_MAX_RAW 31
+#define M_TCB_SND_MAX_RAW 0x7ffffffULL
+#define V_TCB_SND_MAX_RAW(x) ((x) << S_TCB_SND_MAX_RAW)
+
+#define W_TCB_SND_CWND 6
+#define S_TCB_SND_CWND 26
+#define M_TCB_SND_CWND 0x7ffffffULL
+#define V_TCB_SND_CWND(x) ((x) << S_TCB_SND_CWND)
+
+#define W_TCB_SND_SSTHRESH 7
+#define S_TCB_SND_SSTHRESH 21
+#define M_TCB_SND_SSTHRESH 0x7ffffffULL
+#define V_TCB_SND_SSTHRESH(x) ((x) << S_TCB_SND_SSTHRESH)
+
+#define W_TCB_T_RTT_TS_RECENT_AGE 8
+#define S_TCB_T_RTT_TS_RECENT_AGE 16
+#define M_TCB_T_RTT_TS_RECENT_AGE 0xffffffffULL
+#define V_TCB_T_RTT_TS_RECENT_AGE(x) ((x) << S_TCB_T_RTT_TS_RECENT_AGE)
+
+#define W_TCB_T_RTSEQ_RECENT 9
+#define S_TCB_T_RTSEQ_RECENT 16
+#define M_TCB_T_RTSEQ_RECENT 0xffffffffULL
+#define V_TCB_T_RTSEQ_RECENT(x) ((x) << S_TCB_T_RTSEQ_RECENT)
+
+#define W_TCB_T_SRTT 10
+#define S_TCB_T_SRTT 16
+#define M_TCB_T_SRTT 0xffffULL
+#define V_TCB_T_SRTT(x) ((x) << S_TCB_T_SRTT)
+
+#define W_TCB_T_RTTVAR 11
+#define S_TCB_T_RTTVAR 0
+#define M_TCB_T_RTTVAR 0xffffULL
+#define V_TCB_T_RTTVAR(x) ((x) << S_TCB_T_RTTVAR)
+
+#define W_TCB_TS_LAST_ACK_SENT_RAW 11
+#define S_TCB_TS_LAST_ACK_SENT_RAW 16
+#define M_TCB_TS_LAST_ACK_SENT_RAW 0x7ffffffULL
+#define V_TCB_TS_LAST_ACK_SENT_RAW(x) ((x) << S_TCB_TS_LAST_ACK_SENT_RAW)
+
+#define W_TCB_DIP 12
+#define S_TCB_DIP 11
+#define M_TCB_DIP 0xffffffffULL
+#define V_TCB_DIP(x) ((x) << S_TCB_DIP)
+
+#define W_TCB_SIP 13
+#define S_TCB_SIP 11
+#define M_TCB_SIP 0xffffffffULL
+#define V_TCB_SIP(x) ((x) << S_TCB_SIP)
+
+#define W_TCB_DP 14
+#define S_TCB_DP 11
+#define M_TCB_DP 0xffffULL
+#define V_TCB_DP(x) ((x) << S_TCB_DP)
+
+#define W_TCB_SP 14
+#define S_TCB_SP 27
+#define M_TCB_SP 0xffffULL
+#define V_TCB_SP(x) ((x) << S_TCB_SP)
+
+#define W_TCB_TIMESTAMP 15
+#define S_TCB_TIMESTAMP 11
+#define M_TCB_TIMESTAMP 0xffffffffULL
+#define V_TCB_TIMESTAMP(x) ((x) << S_TCB_TIMESTAMP)
+
+#define W_TCB_TIMESTAMP_OFFSET 16
+#define S_TCB_TIMESTAMP_OFFSET 11
+#define M_TCB_TIMESTAMP_OFFSET 0xfULL
+#define V_TCB_TIMESTAMP_OFFSET(x) ((x) << S_TCB_TIMESTAMP_OFFSET)
+
+#define W_TCB_TX_MAX 16
+#define S_TCB_TX_MAX 15
+#define M_TCB_TX_MAX 0xffffffffULL
+#define V_TCB_TX_MAX(x) ((x) << S_TCB_TX_MAX)
+
+#define W_TCB_TX_HDR_PTR_RAW 17
+#define S_TCB_TX_HDR_PTR_RAW 15
+#define M_TCB_TX_HDR_PTR_RAW 0x1ffffULL
+#define V_TCB_TX_HDR_PTR_RAW(x) ((x) << S_TCB_TX_HDR_PTR_RAW)
+
+#define W_TCB_TX_LAST_PTR_RAW 18
+#define S_TCB_TX_LAST_PTR_RAW 0
+#define M_TCB_TX_LAST_PTR_RAW 0x1ffffULL
+#define V_TCB_TX_LAST_PTR_RAW(x) ((x) << S_TCB_TX_LAST_PTR_RAW)
+
+#define W_TCB_TX_COMPACT 18
+#define S_TCB_TX_COMPACT 17
+#define M_TCB_TX_COMPACT 0x1ULL
+#define V_TCB_TX_COMPACT(x) ((x) << S_TCB_TX_COMPACT)
+
+#define W_TCB_RX_COMPACT 18
+#define S_TCB_RX_COMPACT 18
+#define M_TCB_RX_COMPACT 0x1ULL
+#define V_TCB_RX_COMPACT(x) ((x) << S_TCB_RX_COMPACT)
+
+#define W_TCB_RCV_WND 18
+#define S_TCB_RCV_WND 19
+#define M_TCB_RCV_WND 0x7ffffffULL
+#define V_TCB_RCV_WND(x) ((x) << S_TCB_RCV_WND)
+
+#define W_TCB_RX_HDR_OFFSET 19
+#define S_TCB_RX_HDR_OFFSET 14
+#define M_TCB_RX_HDR_OFFSET 0x7ffffffULL
+#define V_TCB_RX_HDR_OFFSET(x) ((x) << S_TCB_RX_HDR_OFFSET)
+
+#define W_TCB_RX_FRAG0_START_IDX_RAW 20
+#define S_TCB_RX_FRAG0_START_IDX_RAW 9
+#define M_TCB_RX_FRAG0_START_IDX_RAW 0x7ffffffULL
+#define V_TCB_RX_FRAG0_START_IDX_RAW(x) ((x) << S_TCB_RX_FRAG0_START_IDX_RAW)
+
+#define W_TCB_RX_FRAG1_START_IDX_OFFSET 21
+#define S_TCB_RX_FRAG1_START_IDX_OFFSET 4
+#define M_TCB_RX_FRAG1_START_IDX_OFFSET 0x7ffffffULL
+#define V_TCB_RX_FRAG1_START_IDX_OFFSET(x) ((x) << S_TCB_RX_FRAG1_START_IDX_OFFSET)
+
+#define W_TCB_RX_FRAG0_LEN 21
+#define S_TCB_RX_FRAG0_LEN 31
+#define M_TCB_RX_FRAG0_LEN 0x7ffffffULL
+#define V_TCB_RX_FRAG0_LEN(x) ((x) << S_TCB_RX_FRAG0_LEN)
+
+#define W_TCB_RX_FRAG1_LEN 22
+#define S_TCB_RX_FRAG1_LEN 26
+#define M_TCB_RX_FRAG1_LEN 0x7ffffffULL
+#define V_TCB_RX_FRAG1_LEN(x) ((x) << S_TCB_RX_FRAG1_LEN)
+
+#define W_TCB_NEWRENO_RECOVER 23
+#define S_TCB_NEWRENO_RECOVER 21
+#define M_TCB_NEWRENO_RECOVER 0x7ffffffULL
+#define V_TCB_NEWRENO_RECOVER(x) ((x) << S_TCB_NEWRENO_RECOVER)
+
+#define W_TCB_PDU_HAVE_LEN 24
+#define S_TCB_PDU_HAVE_LEN 16
+#define M_TCB_PDU_HAVE_LEN 0x1ULL
+#define V_TCB_PDU_HAVE_LEN(x) ((x) << S_TCB_PDU_HAVE_LEN)
+
+#define W_TCB_PDU_LEN 24
+#define S_TCB_PDU_LEN 17
+#define M_TCB_PDU_LEN 0xffffULL
+#define V_TCB_PDU_LEN(x) ((x) << S_TCB_PDU_LEN)
+
+#define W_TCB_RX_QUIESCE 25
+#define S_TCB_RX_QUIESCE 1
+#define M_TCB_RX_QUIESCE 0x1ULL
+#define V_TCB_RX_QUIESCE(x) ((x) << S_TCB_RX_QUIESCE)
+
+#define W_TCB_RX_PTR_RAW 25
+#define S_TCB_RX_PTR_RAW 2
+#define M_TCB_RX_PTR_RAW 0x1ffffULL
+#define V_TCB_RX_PTR_RAW(x) ((x) << S_TCB_RX_PTR_RAW)
+
+#define W_TCB_CPU_NO 25
+#define S_TCB_CPU_NO 19
+#define M_TCB_CPU_NO 0x7fULL
+#define V_TCB_CPU_NO(x) ((x) << S_TCB_CPU_NO)
+
+#define W_TCB_ULP_TYPE 25
+#define S_TCB_ULP_TYPE 26
+#define M_TCB_ULP_TYPE 0xfULL
+#define V_TCB_ULP_TYPE(x) ((x) << S_TCB_ULP_TYPE)
+
+#define W_TCB_RX_FRAG1_PTR_RAW 25
+#define S_TCB_RX_FRAG1_PTR_RAW 30
+#define M_TCB_RX_FRAG1_PTR_RAW 0x1ffffULL
+#define V_TCB_RX_FRAG1_PTR_RAW(x) ((x) << S_TCB_RX_FRAG1_PTR_RAW)
+
+#define W_TCB_RX_FRAG2_START_IDX_OFFSET_RAW 26
+#define S_TCB_RX_FRAG2_START_IDX_OFFSET_RAW 15
+#define M_TCB_RX_FRAG2_START_IDX_OFFSET_RAW 0x7ffffffULL
+#define V_TCB_RX_FRAG2_START_IDX_OFFSET_RAW(x) ((x) << S_TCB_RX_FRAG2_START_IDX_OFFSET_RAW)
+
+#define W_TCB_RX_FRAG2_PTR_RAW 27
+#define S_TCB_RX_FRAG2_PTR_RAW 10
+#define M_TCB_RX_FRAG2_PTR_RAW 0x1ffffULL
+#define V_TCB_RX_FRAG2_PTR_RAW(x) ((x) << S_TCB_RX_FRAG2_PTR_RAW)
+
+#define W_TCB_RX_FRAG2_LEN_RAW 27
+#define S_TCB_RX_FRAG2_LEN_RAW 27
+#define M_TCB_RX_FRAG2_LEN_RAW 0x7ffffffULL
+#define V_TCB_RX_FRAG2_LEN_RAW(x) ((x) << S_TCB_RX_FRAG2_LEN_RAW)
+
+#define W_TCB_RX_FRAG3_PTR_RAW 28
+#define S_TCB_RX_FRAG3_PTR_RAW 22
+#define M_TCB_RX_FRAG3_PTR_RAW 0x1ffffULL
+#define V_TCB_RX_FRAG3_PTR_RAW(x) ((x) << S_TCB_RX_FRAG3_PTR_RAW)
+
+#define W_TCB_RX_FRAG3_LEN_RAW 29
+#define S_TCB_RX_FRAG3_LEN_RAW 7
+#define M_TCB_RX_FRAG3_LEN_RAW 0x7ffffffULL
+#define V_TCB_RX_FRAG3_LEN_RAW(x) ((x) << S_TCB_RX_FRAG3_LEN_RAW)
+
+#define W_TCB_RX_FRAG3_START_IDX_OFFSET_RAW 30
+#define S_TCB_RX_FRAG3_START_IDX_OFFSET_RAW 2
+#define M_TCB_RX_FRAG3_START_IDX_OFFSET_RAW 0x7ffffffULL
+#define V_TCB_RX_FRAG3_START_IDX_OFFSET_RAW(x) ((x) << S_TCB_RX_FRAG3_START_IDX_OFFSET_RAW)
+
+#define W_TCB_PDU_HDR_LEN 30
+#define S_TCB_PDU_HDR_LEN 29
+#define M_TCB_PDU_HDR_LEN 0xffULL
+#define V_TCB_PDU_HDR_LEN(x) ((x) << S_TCB_PDU_HDR_LEN)
+
+#define W_TCB_SLUSH1 31
+#define S_TCB_SLUSH1 5
+#define M_TCB_SLUSH1 0x7ffffULL
+#define V_TCB_SLUSH1(x) ((x) << S_TCB_SLUSH1)
+
+#define W_TCB_ULP_RAW 31
+#define S_TCB_ULP_RAW 24
+#define M_TCB_ULP_RAW 0xffULL
+#define V_TCB_ULP_RAW(x) ((x) << S_TCB_ULP_RAW)
+
+#define W_TCB_DDP_RDMAP_VERSION 25
+#define S_TCB_DDP_RDMAP_VERSION 30
+#define M_TCB_DDP_RDMAP_VERSION 0x1ULL
+#define V_TCB_DDP_RDMAP_VERSION(x) ((x) << S_TCB_DDP_RDMAP_VERSION)
+
+#define W_TCB_MARKER_ENABLE_RX 25
+#define S_TCB_MARKER_ENABLE_RX 31
+#define M_TCB_MARKER_ENABLE_RX 0x1ULL
+#define V_TCB_MARKER_ENABLE_RX(x) ((x) << S_TCB_MARKER_ENABLE_RX)
+
+#define W_TCB_MARKER_ENABLE_TX 26
+#define S_TCB_MARKER_ENABLE_TX 0
+#define M_TCB_MARKER_ENABLE_TX 0x1ULL
+#define V_TCB_MARKER_ENABLE_TX(x) ((x) << S_TCB_MARKER_ENABLE_TX)
+
+#define W_TCB_CRC_ENABLE 26
+#define S_TCB_CRC_ENABLE 1
+#define M_TCB_CRC_ENABLE 0x1ULL
+#define V_TCB_CRC_ENABLE(x) ((x) << S_TCB_CRC_ENABLE)
+
+#define W_TCB_IRS_ULP 26
+#define S_TCB_IRS_ULP 2
+#define M_TCB_IRS_ULP 0x1ffULL
+#define V_TCB_IRS_ULP(x) ((x) << S_TCB_IRS_ULP)
+
+#define W_TCB_ISS_ULP 26
+#define S_TCB_ISS_ULP 11
+#define M_TCB_ISS_ULP 0x1ffULL
+#define V_TCB_ISS_ULP(x) ((x) << S_TCB_ISS_ULP)
+
+#define W_TCB_TX_PDU_LEN 26
+#define S_TCB_TX_PDU_LEN 20
+#define M_TCB_TX_PDU_LEN 0x3fffULL
+#define V_TCB_TX_PDU_LEN(x) ((x) << S_TCB_TX_PDU_LEN)
+
+#define W_TCB_TX_PDU_OUT 27
+#define S_TCB_TX_PDU_OUT 2
+#define M_TCB_TX_PDU_OUT 0x1ULL
+#define V_TCB_TX_PDU_OUT(x) ((x) << S_TCB_TX_PDU_OUT)
+
+#define W_TCB_CQ_IDX_SQ 27
+#define S_TCB_CQ_IDX_SQ 3
+#define M_TCB_CQ_IDX_SQ 0xffffULL
+#define V_TCB_CQ_IDX_SQ(x) ((x) << S_TCB_CQ_IDX_SQ)
+
+#define W_TCB_CQ_IDX_RQ 27
+#define S_TCB_CQ_IDX_RQ 19
+#define M_TCB_CQ_IDX_RQ 0xffffULL
+#define V_TCB_CQ_IDX_RQ(x) ((x) << S_TCB_CQ_IDX_RQ)
+
+#define W_TCB_QP_ID 28
+#define S_TCB_QP_ID 3
+#define M_TCB_QP_ID 0xffffULL
+#define V_TCB_QP_ID(x) ((x) << S_TCB_QP_ID)
+
+#define W_TCB_PD_ID 28
+#define S_TCB_PD_ID 19
+#define M_TCB_PD_ID 0xffffULL
+#define V_TCB_PD_ID(x) ((x) << S_TCB_PD_ID)
+
+#define W_TCB_STAG 29
+#define S_TCB_STAG 3
+#define M_TCB_STAG 0xffffffffULL
+#define V_TCB_STAG(x) ((x) << S_TCB_STAG)
+
+#define W_TCB_RQ_START 30
+#define S_TCB_RQ_START 3
+#define M_TCB_RQ_START 0x3ffffffULL
+#define V_TCB_RQ_START(x) ((x) << S_TCB_RQ_START)
+
+#define W_TCB_RQ_MSN 30
+#define S_TCB_RQ_MSN 29
+#define M_TCB_RQ_MSN 0x3ffULL
+#define V_TCB_RQ_MSN(x) ((x) << S_TCB_RQ_MSN)
+
+#define W_TCB_RQ_MAX_OFFSET 31
+#define S_TCB_RQ_MAX_OFFSET 7
+#define M_TCB_RQ_MAX_OFFSET 0xfULL
+#define V_TCB_RQ_MAX_OFFSET(x) ((x) << S_TCB_RQ_MAX_OFFSET)
+
+#define W_TCB_RQ_WRITE_PTR 31
+#define S_TCB_RQ_WRITE_PTR 11
+#define M_TCB_RQ_WRITE_PTR 0x3ffULL
+#define V_TCB_RQ_WRITE_PTR(x) ((x) << S_TCB_RQ_WRITE_PTR)
+
+#define W_TCB_INB_WRITE_PERM 31
+#define S_TCB_INB_WRITE_PERM 21
+#define M_TCB_INB_WRITE_PERM 0x1ULL
+#define V_TCB_INB_WRITE_PERM(x) ((x) << S_TCB_INB_WRITE_PERM)
+
+#define W_TCB_INB_READ_PERM 31
+#define S_TCB_INB_READ_PERM 22
+#define M_TCB_INB_READ_PERM 0x1ULL
+#define V_TCB_INB_READ_PERM(x) ((x) << S_TCB_INB_READ_PERM)
+
+#define W_TCB_ORD_L_BIT_VLD 31
+#define S_TCB_ORD_L_BIT_VLD 23
+#define M_TCB_ORD_L_BIT_VLD 0x1ULL
+#define V_TCB_ORD_L_BIT_VLD(x) ((x) << S_TCB_ORD_L_BIT_VLD)
+
+#define W_TCB_RDMAP_OPCODE 31
+#define S_TCB_RDMAP_OPCODE 24
+#define M_TCB_RDMAP_OPCODE 0xfULL
+#define V_TCB_RDMAP_OPCODE(x) ((x) << S_TCB_RDMAP_OPCODE)
+
+#define W_TCB_TX_FLUSH 31
+#define S_TCB_TX_FLUSH 28
+#define M_TCB_TX_FLUSH 0x1ULL
+#define V_TCB_TX_FLUSH(x) ((x) << S_TCB_TX_FLUSH)
+
+#define W_TCB_TX_OOS_RXMT 31
+#define S_TCB_TX_OOS_RXMT 29
+#define M_TCB_TX_OOS_RXMT 0x1ULL
+#define V_TCB_TX_OOS_RXMT(x) ((x) << S_TCB_TX_OOS_RXMT)
+
+#define W_TCB_TX_OOS_TXMT 31
+#define S_TCB_TX_OOS_TXMT 30
+#define M_TCB_TX_OOS_TXMT 0x1ULL
+#define V_TCB_TX_OOS_TXMT(x) ((x) << S_TCB_TX_OOS_TXMT)
+
+#define W_TCB_SLUSH_AUX2 31
+#define S_TCB_SLUSH_AUX2 31
+#define M_TCB_SLUSH_AUX2 0x1ULL
+#define V_TCB_SLUSH_AUX2(x) ((x) << S_TCB_SLUSH_AUX2)
+
+#define W_TCB_RX_FRAG1_PTR_RAW2 25
+#define S_TCB_RX_FRAG1_PTR_RAW2 30
+#define M_TCB_RX_FRAG1_PTR_RAW2 0x1ffffULL
+#define V_TCB_RX_FRAG1_PTR_RAW2(x) ((x) << S_TCB_RX_FRAG1_PTR_RAW2)
+
+#define W_TCB_RX_DDP_FLAGS 26
+#define S_TCB_RX_DDP_FLAGS 15
+#define M_TCB_RX_DDP_FLAGS 0x3ffULL
+#define V_TCB_RX_DDP_FLAGS(x) ((x) << S_TCB_RX_DDP_FLAGS)
+
+#define W_TCB_SLUSH_AUX3 26
+#define S_TCB_SLUSH_AUX3 31
+#define M_TCB_SLUSH_AUX3 0x1ffULL
+#define V_TCB_SLUSH_AUX3(x) ((x) << S_TCB_SLUSH_AUX3)
+
+#define W_TCB_RX_DDP_BUF0_OFFSET 27
+#define S_TCB_RX_DDP_BUF0_OFFSET 8
+#define M_TCB_RX_DDP_BUF0_OFFSET 0x3fffffULL
+#define V_TCB_RX_DDP_BUF0_OFFSET(x) ((x) << S_TCB_RX_DDP_BUF0_OFFSET)
+
+#define W_TCB_RX_DDP_BUF0_LEN 27
+#define S_TCB_RX_DDP_BUF0_LEN 30
+#define M_TCB_RX_DDP_BUF0_LEN 0x3fffffULL
+#define V_TCB_RX_DDP_BUF0_LEN(x) ((x) << S_TCB_RX_DDP_BUF0_LEN)
+
+#define W_TCB_RX_DDP_BUF1_OFFSET 28
+#define S_TCB_RX_DDP_BUF1_OFFSET 20
+#define M_TCB_RX_DDP_BUF1_OFFSET 0x3fffffULL
+#define V_TCB_RX_DDP_BUF1_OFFSET(x) ((x) << S_TCB_RX_DDP_BUF1_OFFSET)
+
+#define W_TCB_RX_DDP_BUF1_LEN 29
+#define S_TCB_RX_DDP_BUF1_LEN 10
+#define M_TCB_RX_DDP_BUF1_LEN 0x3fffffULL
+#define V_TCB_RX_DDP_BUF1_LEN(x) ((x) << S_TCB_RX_DDP_BUF1_LEN)
+
+#define W_TCB_RX_DDP_BUF0_TAG 30
+#define S_TCB_RX_DDP_BUF0_TAG 0
+#define M_TCB_RX_DDP_BUF0_TAG 0xffffffffULL
+#define V_TCB_RX_DDP_BUF0_TAG(x) ((x) << S_TCB_RX_DDP_BUF0_TAG)
+
+#define W_TCB_RX_DDP_BUF1_TAG 31
+#define S_TCB_RX_DDP_BUF1_TAG 0
+#define M_TCB_RX_DDP_BUF1_TAG 0xffffffffULL
+#define V_TCB_RX_DDP_BUF1_TAG(x) ((x) << S_TCB_RX_DDP_BUF1_TAG)
+
+#define S_TF_DACK 10
+#define V_TF_DACK(x) ((x) << S_TF_DACK)
+
+#define S_TF_NAGLE 11
+#define V_TF_NAGLE(x) ((x) << S_TF_NAGLE)
+
+#define S_TF_RECV_SCALE 12
+#define V_TF_RECV_SCALE(x) ((x) << S_TF_RECV_SCALE)
+
+#define S_TF_RECV_TSTMP 13
+#define V_TF_RECV_TSTMP(x) ((x) << S_TF_RECV_TSTMP)
+
+#define S_TF_RECV_SACK 14
+#define V_TF_RECV_SACK(x) ((x) << S_TF_RECV_SACK)
+
+#define S_TF_TURBO 15
+#define V_TF_TURBO(x) ((x) << S_TF_TURBO)
+
+#define S_TF_KEEPALIVE 16
+#define V_TF_KEEPALIVE(x) ((x) << S_TF_KEEPALIVE)
+
+#define S_TF_TCAM_BYPASS 17
+#define V_TF_TCAM_BYPASS(x) ((x) << S_TF_TCAM_BYPASS)
+
+#define S_TF_CORE_FIN 18
+#define V_TF_CORE_FIN(x) ((x) << S_TF_CORE_FIN)
+
+#define S_TF_CORE_MORE 19
+#define V_TF_CORE_MORE(x) ((x) << S_TF_CORE_MORE)
+
+#define S_TF_MIGRATING 20
+#define V_TF_MIGRATING(x) ((x) << S_TF_MIGRATING)
+
+#define S_TF_ACTIVE_OPEN 21
+#define V_TF_ACTIVE_OPEN(x) ((x) << S_TF_ACTIVE_OPEN)
+
+#define S_TF_ASK_MODE 22
+#define V_TF_ASK_MODE(x) ((x) << S_TF_ASK_MODE)
+
+#define S_TF_NON_OFFLOAD 23
+#define V_TF_NON_OFFLOAD(x) ((x) << S_TF_NON_OFFLOAD)
+
+#define S_TF_MOD_SCHD 24
+#define V_TF_MOD_SCHD(x) ((x) << S_TF_MOD_SCHD)
+
+#define S_TF_MOD_SCHD_REASON0 25
+#define V_TF_MOD_SCHD_REASON0(x) ((x) << S_TF_MOD_SCHD_REASON0)
+
+#define S_TF_MOD_SCHD_REASON1 26
+#define V_TF_MOD_SCHD_REASON1(x) ((x) << S_TF_MOD_SCHD_REASON1)
+
+#define S_TF_MOD_SCHD_RX 27
+#define V_TF_MOD_SCHD_RX(x) ((x) << S_TF_MOD_SCHD_RX)
+
+#define S_TF_CORE_PUSH 28
+#define V_TF_CORE_PUSH(x) ((x) << S_TF_CORE_PUSH)
+
+#define S_TF_RCV_COALESCE_ENABLE 29
+#define V_TF_RCV_COALESCE_ENABLE(x) ((x) << S_TF_RCV_COALESCE_ENABLE)
+
+#define S_TF_RCV_COALESCE_PUSH 30
+#define V_TF_RCV_COALESCE_PUSH(x) ((x) << S_TF_RCV_COALESCE_PUSH)
+
+#define S_TF_RCV_COALESCE_LAST_PSH 31
+#define V_TF_RCV_COALESCE_LAST_PSH(x) ((x) << S_TF_RCV_COALESCE_LAST_PSH)
+
+#define S_TF_RCV_COALESCE_HEARTBEAT 32
+#define V_TF_RCV_COALESCE_HEARTBEAT(x) ((x) << S_TF_RCV_COALESCE_HEARTBEAT)
+
+#define S_TF_HALF_CLOSE 33
+#define V_TF_HALF_CLOSE(x) ((x) << S_TF_HALF_CLOSE)
+
+#define S_TF_DACK_MSS 34
+#define V_TF_DACK_MSS(x) ((x) << S_TF_DACK_MSS)
+
+#define S_TF_CCTRL_SEL0 35
+#define V_TF_CCTRL_SEL0(x) ((x) << S_TF_CCTRL_SEL0)
+
+#define S_TF_CCTRL_SEL1 36
+#define V_TF_CCTRL_SEL1(x) ((x) << S_TF_CCTRL_SEL1)
+
+#define S_TF_TCP_NEWRENO_FAST_RECOVERY 37
+#define V_TF_TCP_NEWRENO_FAST_RECOVERY(x) ((x) << S_TF_TCP_NEWRENO_FAST_RECOVERY)
+
+#define S_TF_TX_PACE_AUTO 38
+#define V_TF_TX_PACE_AUTO(x) ((x) << S_TF_TX_PACE_AUTO)
+
+#define S_TF_PEER_FIN_HELD 39
+#define V_TF_PEER_FIN_HELD(x) ((x) << S_TF_PEER_FIN_HELD)
+
+#define S_TF_CORE_URG 40
+#define V_TF_CORE_URG(x) ((x) << S_TF_CORE_URG)
+
+#define S_TF_RDMA_ERROR 41
+#define V_TF_RDMA_ERROR(x) ((x) << S_TF_RDMA_ERROR)
+
+#define S_TF_SSWS_DISABLED 42
+#define V_TF_SSWS_DISABLED(x) ((x) << S_TF_SSWS_DISABLED)
+
+#define S_TF_DUPACK_COUNT_ODD 43
+#define V_TF_DUPACK_COUNT_ODD(x) ((x) << S_TF_DUPACK_COUNT_ODD)
+
+#define S_TF_TX_CHANNEL 44
+#define V_TF_TX_CHANNEL(x) ((x) << S_TF_TX_CHANNEL)
+
+#define S_TF_RX_CHANNEL 45
+#define V_TF_RX_CHANNEL(x) ((x) << S_TF_RX_CHANNEL)
+
+#define S_TF_TX_PACE_FIXED 46
+#define V_TF_TX_PACE_FIXED(x) ((x) << S_TF_TX_PACE_FIXED)
+
+#define S_TF_RDMA_FLM_ERROR 47
+#define V_TF_RDMA_FLM_ERROR(x) ((x) << S_TF_RDMA_FLM_ERROR)
+
+#define S_TF_RX_FLOW_CONTROL_DISABLE 48
+#define V_TF_RX_FLOW_CONTROL_DISABLE(x) ((x) << S_TF_RX_FLOW_CONTROL_DISABLE)
+
+#endif /* _TCB_DEFS_H */
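
The W_/S_/M_/V_ quartet above repeats for every TCB field: W_* names the TCB word the field starts in, S_* its bit offset, M_* its width mask (64-bit, since several fields spill into the following word), and V_*() shifts a caller value into position. A minimal sketch of how a caller might compose the three pieces of a field update; the helper below is hypothetical and purely illustrative, not part of this header:

/* Illustrative only: build the (word, mask, value) triple for RCV_WND. */
static inline void tcb_rcv_wnd_parts(u32 *word, u64 *mask, u64 *val,
				     u32 rcv_wnd)
{
	*word = W_TCB_RCV_WND;				/* TCB word index */
	*mask = M_TCB_RCV_WND << S_TCB_RCV_WND;		/* bits to update */
	*val  = V_TCB_RCV_WND((u64)rcv_wnd & M_TCB_RCV_WND); /* new bits */
}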
diff --git a/drivers/infiniband/hw/cxgb4/Kconfig b/drivers/infiniband/hw/cxgb4/Kconfig
new file mode 100644
index 000000000..23f38cf2c
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/Kconfig
@@ -0,0 +1,18 @@
+config INFINIBAND_CXGB4
+ tristate "Chelsio T4/T5 RDMA Driver"
+ depends on CHELSIO_T4 && INET && (IPV6 || IPV6=n)
+ select GENERIC_ALLOCATOR
+ ---help---
+	  This is an iWARP/RDMA driver for Chelsio T4 and T5 1GbE and
+	  10GbE adapters, and for the T5 40GbE adapter.
+
+ For general information about Chelsio and our products, visit
+ our website at <http://www.chelsio.com>.
+
+ For customer support, please visit our customer support page at
+ <http://www.chelsio.com/support.html>.
+
+ Please send feedback to <linux-bugs@chelsio.com>.
+
+ To compile this driver as a module, choose M here: the module
+ will be called iw_cxgb4.
diff --git a/drivers/infiniband/hw/cxgb4/Makefile b/drivers/infiniband/hw/cxgb4/Makefile
new file mode 100644
index 000000000..e11cf7299
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/Makefile
@@ -0,0 +1,5 @@
+ccflags-y := -Idrivers/net/ethernet/chelsio/cxgb4
+
+obj-$(CONFIG_INFINIBAND_CXGB4) += iw_cxgb4.o
+
+iw_cxgb4-y := device.o cm.o provider.o mem.o cq.o qp.o resource.o ev.o id_table.o
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
new file mode 100644
index 000000000..3ad8dc798
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -0,0 +1,4053 @@
+/*
+ * Copyright (c) 2009-2014 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/skbuff.h>
+#include <linux/timer.h>
+#include <linux/notifier.h>
+#include <linux/inetdevice.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/if_vlan.h>
+
+#include <net/neighbour.h>
+#include <net/netevent.h>
+#include <net/route.h>
+#include <net/tcp.h>
+#include <net/ip6_route.h>
+#include <net/addrconf.h>
+
+#include <rdma/ib_addr.h>
+
+#include "iw_cxgb4.h"
+
+static char *states[] = {
+ "idle",
+ "listen",
+ "connecting",
+ "mpa_wait_req",
+ "mpa_req_sent",
+ "mpa_req_rcvd",
+ "mpa_rep_sent",
+ "fpdu_mode",
+ "aborting",
+ "closing",
+ "moribund",
+ "dead",
+ NULL,
+};
+
+static int nocong;
+module_param(nocong, int, 0644);
+MODULE_PARM_DESC(nocong, "Turn of congestion control (default=0)");
+
+static int enable_ecn;
+module_param(enable_ecn, int, 0644);
+MODULE_PARM_DESC(enable_ecn, "Enable ECN (default=0/disabled)");
+
+static int dack_mode = 1;
+module_param(dack_mode, int, 0644);
+MODULE_PARM_DESC(dack_mode, "Delayed ack mode (default=1)");
+
+uint c4iw_max_read_depth = 32;
+module_param(c4iw_max_read_depth, int, 0644);
+MODULE_PARM_DESC(c4iw_max_read_depth,
+ "Per-connection max ORD/IRD (default=32)");
+
+static int enable_tcp_timestamps;
+module_param(enable_tcp_timestamps, int, 0644);
+MODULE_PARM_DESC(enable_tcp_timestamps, "Enable tcp timestamps (default=0)");
+
+static int enable_tcp_sack;
+module_param(enable_tcp_sack, int, 0644);
+MODULE_PARM_DESC(enable_tcp_sack, "Enable tcp SACK (default=0)");
+
+static int enable_tcp_window_scaling = 1;
+module_param(enable_tcp_window_scaling, int, 0644);
+MODULE_PARM_DESC(enable_tcp_window_scaling,
+ "Enable tcp window scaling (default=1)");
+
+int c4iw_debug;
+module_param(c4iw_debug, int, 0644);
+MODULE_PARM_DESC(c4iw_debug, "Enable debug logging (default=0)");
+
+static int peer2peer = 1;
+module_param(peer2peer, int, 0644);
+MODULE_PARM_DESC(peer2peer, "Support peer2peer ULPs (default=1)");
+
+static int p2p_type = FW_RI_INIT_P2PTYPE_READ_REQ;
+module_param(p2p_type, int, 0644);
+MODULE_PARM_DESC(p2p_type, "RDMAP opcode to use for the RTR message: "
+ "1=RDMA_READ 0=RDMA_WRITE (default 1)");
+
+static int ep_timeout_secs = 60;
+module_param(ep_timeout_secs, int, 0644);
+MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout "
+ "in seconds (default=60)");
+
+static int mpa_rev = 1;
+module_param(mpa_rev, int, 0644);
+MODULE_PARM_DESC(mpa_rev, "MPA Revision, 0 supports amso1100, "
+ "1 is RFC0544 spec compliant, 2 is IETF MPA Peer Connect Draft"
+ " compliant (default=1)");
+
+static int markers_enabled;
+module_param(markers_enabled, int, 0644);
+MODULE_PARM_DESC(markers_enabled, "Enable MPA MARKERS (default(0)=disabled)");
+
+static int crc_enabled = 1;
+module_param(crc_enabled, int, 0644);
+MODULE_PARM_DESC(crc_enabled, "Enable MPA CRC (default(1)=enabled)");
+
+static int rcv_win = 256 * 1024;
+module_param(rcv_win, int, 0644);
+MODULE_PARM_DESC(rcv_win, "TCP receive window in bytes (default=256KB)");
+
+static int snd_win = 128 * 1024;
+module_param(snd_win, int, 0644);
+MODULE_PARM_DESC(snd_win, "TCP send window in bytes (default=128KB)");
+
+static struct workqueue_struct *workq;
+
+static struct sk_buff_head rxq;
+
+static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp);
+static void ep_timeout(unsigned long arg);
+static void connect_reply_upcall(struct c4iw_ep *ep, int status);
+
+static LIST_HEAD(timeout_list);
+static spinlock_t timeout_lock;
+
+static void deref_qp(struct c4iw_ep *ep)
+{
+ c4iw_qp_rem_ref(&ep->com.qp->ibqp);
+ clear_bit(QP_REFERENCED, &ep->com.flags);
+}
+
+static void ref_qp(struct c4iw_ep *ep)
+{
+ set_bit(QP_REFERENCED, &ep->com.flags);
+ c4iw_qp_add_ref(&ep->com.qp->ibqp);
+}
+
+static void start_ep_timer(struct c4iw_ep *ep)
+{
+ PDBG("%s ep %p\n", __func__, ep);
+ if (timer_pending(&ep->timer)) {
+ pr_err("%s timer already started! ep %p\n",
+ __func__, ep);
+ return;
+ }
+ clear_bit(TIMEOUT, &ep->com.flags);
+ c4iw_get_ep(&ep->com);
+ ep->timer.expires = jiffies + ep_timeout_secs * HZ;
+ ep->timer.data = (unsigned long)ep;
+ ep->timer.function = ep_timeout;
+ add_timer(&ep->timer);
+}
+
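+/*
+ * Returns 0 if this call cleared the timer and dropped the timer's ep
+ * reference; returns 1 if the TIMEOUT bit was already set (the timer
+ * already fired or was already stopped), in which case no reference is
+ * dropped and the caller should leave the cleanup to the timeout path.
+ */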
+static int stop_ep_timer(struct c4iw_ep *ep)
+{
+ PDBG("%s ep %p stopping\n", __func__, ep);
+ del_timer_sync(&ep->timer);
+ if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) {
+ c4iw_put_ep(&ep->com);
+ return 0;
+ }
+ return 1;
+}
+
+static int c4iw_l2t_send(struct c4iw_rdev *rdev, struct sk_buff *skb,
+ struct l2t_entry *l2e)
+{
+ int error = 0;
+
+ if (c4iw_fatal_error(rdev)) {
+ kfree_skb(skb);
+ PDBG("%s - device in error state - dropping\n", __func__);
+ return -EIO;
+ }
+ error = cxgb4_l2t_send(rdev->lldi.ports[0], skb, l2e);
+ if (error < 0)
+ kfree_skb(skb);
+ return error < 0 ? error : 0;
+}
+
+int c4iw_ofld_send(struct c4iw_rdev *rdev, struct sk_buff *skb)
+{
+ int error = 0;
+
+ if (c4iw_fatal_error(rdev)) {
+ kfree_skb(skb);
+ PDBG("%s - device in error state - dropping\n", __func__);
+ return -EIO;
+ }
+ error = cxgb4_ofld_send(rdev->lldi.ports[0], skb);
+ if (error < 0)
+ kfree_skb(skb);
+ return error < 0 ? error : 0;
+}
+
+static void release_tid(struct c4iw_rdev *rdev, u32 hwtid, struct sk_buff *skb)
+{
+ struct cpl_tid_release *req;
+
+ skb = get_skb(skb, sizeof *req, GFP_KERNEL);
+ if (!skb)
+ return;
+ req = (struct cpl_tid_release *) skb_put(skb, sizeof(*req));
+ INIT_TP_WR(req, hwtid);
+ OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_TID_RELEASE, hwtid));
+ set_wr_txq(skb, CPL_PRIORITY_SETUP, 0);
+ c4iw_ofld_send(rdev, skb);
+ return;
+}
+
+static void set_emss(struct c4iw_ep *ep, u16 opt)
+{
+ ep->emss = ep->com.dev->rdev.lldi.mtus[TCPOPT_MSS_G(opt)] -
+ ((AF_INET == ep->com.remote_addr.ss_family) ?
+ sizeof(struct iphdr) : sizeof(struct ipv6hdr)) -
+ sizeof(struct tcphdr);
+ ep->mss = ep->emss;
+ if (TCPOPT_TSTAMP_G(opt))
+ ep->emss -= round_up(TCPOLEN_TIMESTAMP, 4);
+ if (ep->emss < 128)
+ ep->emss = 128;
+ if (ep->emss & 7)
+ PDBG("Warning: misaligned mtu idx %u mss %u emss=%u\n",
+ TCPOPT_MSS_G(opt), ep->mss, ep->emss);
+ PDBG("%s mss_idx %u mss %u emss=%u\n", __func__, TCPOPT_MSS_G(opt),
+ ep->mss, ep->emss);
+}
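+
+/*
+ * Worked example for set_emss() above (illustrative, assuming a standard
+ * 1500-byte IPv4 MTU with TCP timestamps negotiated):
+ *
+ *   mss  = 1500 - sizeof(struct iphdr) - sizeof(struct tcphdr)
+ *        = 1500 - 20 - 20 = 1460
+ *   emss = mss - round_up(TCPOLEN_TIMESTAMP, 4) = 1460 - 12 = 1448
+ *
+ * emss is additionally clamped to a floor of 128 bytes.
+ */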
+
+static enum c4iw_ep_state state_read(struct c4iw_ep_common *epc)
+{
+ enum c4iw_ep_state state;
+
+ mutex_lock(&epc->mutex);
+ state = epc->state;
+ mutex_unlock(&epc->mutex);
+ return state;
+}
+
+static void __state_set(struct c4iw_ep_common *epc, enum c4iw_ep_state new)
+{
+ epc->state = new;
+}
+
+static void state_set(struct c4iw_ep_common *epc, enum c4iw_ep_state new)
+{
+ mutex_lock(&epc->mutex);
+ PDBG("%s - %s -> %s\n", __func__, states[epc->state], states[new]);
+ __state_set(epc, new);
+ mutex_unlock(&epc->mutex);
+ return;
+}
+
+static void *alloc_ep(int size, gfp_t gfp)
+{
+ struct c4iw_ep_common *epc;
+
+ epc = kzalloc(size, gfp);
+ if (epc) {
+ kref_init(&epc->kref);
+ mutex_init(&epc->mutex);
+ c4iw_init_wr_wait(&epc->wr_wait);
+ }
+ PDBG("%s alloc ep %p\n", __func__, epc);
+ return epc;
+}
+
+void _c4iw_free_ep(struct kref *kref)
+{
+ struct c4iw_ep *ep;
+
+ ep = container_of(kref, struct c4iw_ep, com.kref);
+ PDBG("%s ep %p state %s\n", __func__, ep, states[state_read(&ep->com)]);
+ if (test_bit(QP_REFERENCED, &ep->com.flags))
+ deref_qp(ep);
+ if (test_bit(RELEASE_RESOURCES, &ep->com.flags)) {
+ remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid);
+ cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid);
+ dst_release(ep->dst);
+ cxgb4_l2t_release(ep->l2t);
+ }
+ if (test_bit(RELEASE_MAPINFO, &ep->com.flags)) {
+ print_addr(&ep->com, __func__, "remove_mapinfo/mapping");
+ iwpm_remove_mapinfo(&ep->com.local_addr,
+ &ep->com.mapped_local_addr);
+ iwpm_remove_mapping(&ep->com.local_addr, RDMA_NL_C4IW);
+ }
+ kfree(ep);
+}
+
+static void release_ep_resources(struct c4iw_ep *ep)
+{
+ set_bit(RELEASE_RESOURCES, &ep->com.flags);
+ c4iw_put_ep(&ep->com);
+}
+
+static int status2errno(int status)
+{
+ switch (status) {
+ case CPL_ERR_NONE:
+ return 0;
+ case CPL_ERR_CONN_RESET:
+ return -ECONNRESET;
+ case CPL_ERR_ARP_MISS:
+ return -EHOSTUNREACH;
+ case CPL_ERR_CONN_TIMEDOUT:
+ return -ETIMEDOUT;
+ case CPL_ERR_TCAM_FULL:
+ return -ENOMEM;
+ case CPL_ERR_CONN_EXIST:
+ return -EADDRINUSE;
+ default:
+ return -EIO;
+ }
+}
+
+/*
+ * Try and reuse skbs already allocated...
+ */
+static struct sk_buff *get_skb(struct sk_buff *skb, int len, gfp_t gfp)
+{
+ if (skb && !skb_is_nonlinear(skb) && !skb_cloned(skb)) {
+ skb_trim(skb, 0);
+ skb_get(skb);
+ skb_reset_transport_header(skb);
+ } else {
+ skb = alloc_skb(len, gfp);
+ }
+ t4_set_arp_err_handler(skb, NULL, NULL);
+ return skb;
+}
+
+static struct net_device *get_real_dev(struct net_device *egress_dev)
+{
+ return rdma_vlan_dev_real_dev(egress_dev) ? : egress_dev;
+}
+
+static int our_interface(struct c4iw_dev *dev, struct net_device *egress_dev)
+{
+ int i;
+
+ egress_dev = get_real_dev(egress_dev);
+ for (i = 0; i < dev->rdev.lldi.nports; i++)
+ if (dev->rdev.lldi.ports[i] == egress_dev)
+ return 1;
+ return 0;
+}
+
+static struct dst_entry *find_route6(struct c4iw_dev *dev, __u8 *local_ip,
+ __u8 *peer_ip, __be16 local_port,
+ __be16 peer_port, u8 tos,
+ __u32 sin6_scope_id)
+{
+ struct dst_entry *dst = NULL;
+
+ if (IS_ENABLED(CONFIG_IPV6)) {
+ struct flowi6 fl6;
+
+ memset(&fl6, 0, sizeof(fl6));
+ memcpy(&fl6.daddr, peer_ip, 16);
+ memcpy(&fl6.saddr, local_ip, 16);
+ if (ipv6_addr_type(&fl6.daddr) & IPV6_ADDR_LINKLOCAL)
+ fl6.flowi6_oif = sin6_scope_id;
+ dst = ip6_route_output(&init_net, NULL, &fl6);
+ if (!dst)
+ goto out;
+ if (!our_interface(dev, ip6_dst_idev(dst)->dev) &&
+ !(ip6_dst_idev(dst)->dev->flags & IFF_LOOPBACK)) {
+ dst_release(dst);
+ dst = NULL;
+ }
+ }
+
+out:
+ return dst;
+}
+
+static struct dst_entry *find_route(struct c4iw_dev *dev, __be32 local_ip,
+ __be32 peer_ip, __be16 local_port,
+ __be16 peer_port, u8 tos)
+{
+ struct rtable *rt;
+ struct flowi4 fl4;
+ struct neighbour *n;
+
+ rt = ip_route_output_ports(&init_net, &fl4, NULL, peer_ip, local_ip,
+ peer_port, local_port, IPPROTO_TCP,
+ tos, 0);
+ if (IS_ERR(rt))
+ return NULL;
+ n = dst_neigh_lookup(&rt->dst, &peer_ip);
+ if (!n)
+ return NULL;
+ if (!our_interface(dev, n->dev) &&
+ !(n->dev->flags & IFF_LOOPBACK)) {
+ neigh_release(n);
+ dst_release(&rt->dst);
+ return NULL;
+ }
+ neigh_release(n);
+ return &rt->dst;
+}
+
+static void arp_failure_discard(void *handle, struct sk_buff *skb)
+{
+ PDBG("%s c4iw_dev %p\n", __func__, handle);
+ kfree_skb(skb);
+}
+
+/*
+ * Handle an ARP failure for an active open.
+ */
+static void act_open_req_arp_failure(void *handle, struct sk_buff *skb)
+{
+ struct c4iw_ep *ep = handle;
+
+ printk(KERN_ERR MOD "ARP failure duing connect\n");
+ kfree_skb(skb);
+ connect_reply_upcall(ep, -EHOSTUNREACH);
+ state_set(&ep->com, DEAD);
+ remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid);
+ cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid);
+ dst_release(ep->dst);
+ cxgb4_l2t_release(ep->l2t);
+ c4iw_put_ep(&ep->com);
+}
+
+/*
+ * Handle an ARP failure for a CPL_ABORT_REQ. Change it into a no RST variant
+ * and send it along.
+ */
+static void abort_arp_failure(void *handle, struct sk_buff *skb)
+{
+ struct c4iw_rdev *rdev = handle;
+ struct cpl_abort_req *req = cplhdr(skb);
+
+ PDBG("%s rdev %p\n", __func__, rdev);
+ req->cmd = CPL_ABORT_NO_RST;
+ c4iw_ofld_send(rdev, skb);
+}
+
+static void send_flowc(struct c4iw_ep *ep, struct sk_buff *skb)
+{
+ unsigned int flowclen = 80;
+ struct fw_flowc_wr *flowc;
+ int i;
+
+ skb = get_skb(skb, flowclen, GFP_KERNEL);
+ flowc = (struct fw_flowc_wr *)__skb_put(skb, flowclen);
+
+ flowc->op_to_nparams = cpu_to_be32(FW_WR_OP_V(FW_FLOWC_WR) |
+ FW_FLOWC_WR_NPARAMS_V(8));
+ flowc->flowid_len16 = cpu_to_be32(FW_WR_LEN16_V(DIV_ROUND_UP(flowclen,
+ 16)) | FW_WR_FLOWID_V(ep->hwtid));
+
+ flowc->mnemval[0].mnemonic = FW_FLOWC_MNEM_PFNVFN;
+ flowc->mnemval[0].val = cpu_to_be32(FW_PFVF_CMD_PFN_V
+ (ep->com.dev->rdev.lldi.pf));
+ flowc->mnemval[1].mnemonic = FW_FLOWC_MNEM_CH;
+ flowc->mnemval[1].val = cpu_to_be32(ep->tx_chan);
+ flowc->mnemval[2].mnemonic = FW_FLOWC_MNEM_PORT;
+ flowc->mnemval[2].val = cpu_to_be32(ep->tx_chan);
+ flowc->mnemval[3].mnemonic = FW_FLOWC_MNEM_IQID;
+ flowc->mnemval[3].val = cpu_to_be32(ep->rss_qid);
+ flowc->mnemval[4].mnemonic = FW_FLOWC_MNEM_SNDNXT;
+ flowc->mnemval[4].val = cpu_to_be32(ep->snd_seq);
+ flowc->mnemval[5].mnemonic = FW_FLOWC_MNEM_RCVNXT;
+ flowc->mnemval[5].val = cpu_to_be32(ep->rcv_seq);
+ flowc->mnemval[6].mnemonic = FW_FLOWC_MNEM_SNDBUF;
+ flowc->mnemval[6].val = cpu_to_be32(ep->snd_win);
+ flowc->mnemval[7].mnemonic = FW_FLOWC_MNEM_MSS;
+ flowc->mnemval[7].val = cpu_to_be32(ep->emss);
+ /* Pad WR to 16 byte boundary */
+ flowc->mnemval[8].mnemonic = 0;
+ flowc->mnemval[8].val = 0;
+ for (i = 0; i < 9; i++) {
+ flowc->mnemval[i].r4[0] = 0;
+ flowc->mnemval[i].r4[1] = 0;
+ flowc->mnemval[i].r4[2] = 0;
+ }
+
+ set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
+ c4iw_ofld_send(&ep->com.dev->rdev, skb);
+}
+
+static int send_halfclose(struct c4iw_ep *ep, gfp_t gfp)
+{
+ struct cpl_close_con_req *req;
+ struct sk_buff *skb;
+ int wrlen = roundup(sizeof *req, 16);
+
+ PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+ skb = get_skb(NULL, wrlen, gfp);
+ if (!skb) {
+ printk(KERN_ERR MOD "%s - failed to alloc skb\n", __func__);
+ return -ENOMEM;
+ }
+ set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
+ t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
+ req = (struct cpl_close_con_req *) skb_put(skb, wrlen);
+ memset(req, 0, wrlen);
+ INIT_TP_WR(req, ep->hwtid);
+ OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_CLOSE_CON_REQ,
+ ep->hwtid));
+ return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+}
+
+static int send_abort(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp)
+{
+ struct cpl_abort_req *req;
+ int wrlen = roundup(sizeof *req, 16);
+
+ PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+ skb = get_skb(skb, wrlen, gfp);
+ if (!skb) {
+ printk(KERN_ERR MOD "%s - failed to alloc skb.\n",
+ __func__);
+ return -ENOMEM;
+ }
+ set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
+ t4_set_arp_err_handler(skb, &ep->com.dev->rdev, abort_arp_failure);
+ req = (struct cpl_abort_req *) skb_put(skb, wrlen);
+ memset(req, 0, wrlen);
+ INIT_TP_WR(req, ep->hwtid);
+ OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_ABORT_REQ, ep->hwtid));
+ req->cmd = CPL_ABORT_SEND_RST;
+ return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+}
+
+/*
+ * c4iw_form_pm_msg - Form a port mapper message with mapping info
+ */
+static void c4iw_form_pm_msg(struct c4iw_ep *ep,
+ struct iwpm_sa_data *pm_msg)
+{
+ memcpy(&pm_msg->loc_addr, &ep->com.local_addr,
+ sizeof(ep->com.local_addr));
+ memcpy(&pm_msg->rem_addr, &ep->com.remote_addr,
+ sizeof(ep->com.remote_addr));
+}
+
+/*
+ * c4iw_form_reg_msg - Form a port mapper message with dev info
+ */
+static void c4iw_form_reg_msg(struct c4iw_dev *dev,
+ struct iwpm_dev_data *pm_msg)
+{
+ memcpy(pm_msg->dev_name, dev->ibdev.name, IWPM_DEVNAME_SIZE);
+ memcpy(pm_msg->if_name, dev->rdev.lldi.ports[0]->name,
+ IWPM_IFNAME_SIZE);
+}
+
+static void c4iw_record_pm_msg(struct c4iw_ep *ep,
+ struct iwpm_sa_data *pm_msg)
+{
+ memcpy(&ep->com.mapped_local_addr, &pm_msg->mapped_loc_addr,
+ sizeof(ep->com.mapped_local_addr));
+ memcpy(&ep->com.mapped_remote_addr, &pm_msg->mapped_rem_addr,
+ sizeof(ep->com.mapped_remote_addr));
+}
+
+static int get_remote_addr(struct c4iw_ep *parent_ep, struct c4iw_ep *child_ep)
+{
+ int ret;
+
+ print_addr(&parent_ep->com, __func__, "get_remote_addr parent_ep ");
+ print_addr(&child_ep->com, __func__, "get_remote_addr child_ep ");
+
+ ret = iwpm_get_remote_info(&parent_ep->com.mapped_local_addr,
+ &child_ep->com.mapped_remote_addr,
+ &child_ep->com.remote_addr, RDMA_NL_C4IW);
+ if (ret)
+ PDBG("Unable to find remote peer addr info - err %d\n", ret);
+
+ return ret;
+}
+
+static void best_mtu(const unsigned short *mtus, unsigned short mtu,
+ unsigned int *idx, int use_ts, int ipv6)
+{
+ unsigned short hdr_size = (ipv6 ?
+ sizeof(struct ipv6hdr) :
+ sizeof(struct iphdr)) +
+ sizeof(struct tcphdr) +
+ (use_ts ?
+ round_up(TCPOLEN_TIMESTAMP, 4) : 0);
+ unsigned short data_size = mtu - hdr_size;
+
+ cxgb4_best_aligned_mtu(mtus, hdr_size, data_size, 8, idx);
+}
+
+static int send_connect(struct c4iw_ep *ep)
+{
+ struct cpl_act_open_req *req;
+ struct cpl_t5_act_open_req *t5_req;
+ struct cpl_act_open_req6 *req6;
+ struct cpl_t5_act_open_req6 *t5_req6;
+ struct sk_buff *skb;
+ u64 opt0;
+ u32 opt2;
+ unsigned int mtu_idx;
+ int wscale;
+ int wrlen;
+ int sizev4 = is_t4(ep->com.dev->rdev.lldi.adapter_type) ?
+ sizeof(struct cpl_act_open_req) :
+ sizeof(struct cpl_t5_act_open_req);
+ int sizev6 = is_t4(ep->com.dev->rdev.lldi.adapter_type) ?
+ sizeof(struct cpl_act_open_req6) :
+ sizeof(struct cpl_t5_act_open_req6);
+ struct sockaddr_in *la = (struct sockaddr_in *)
+ &ep->com.mapped_local_addr;
+ struct sockaddr_in *ra = (struct sockaddr_in *)
+ &ep->com.mapped_remote_addr;
+ struct sockaddr_in6 *la6 = (struct sockaddr_in6 *)
+ &ep->com.mapped_local_addr;
+ struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *)
+ &ep->com.mapped_remote_addr;
+ int win;
+
+ wrlen = (ep->com.remote_addr.ss_family == AF_INET) ?
+ roundup(sizev4, 16) :
+ roundup(sizev6, 16);
+
+ PDBG("%s ep %p atid %u\n", __func__, ep, ep->atid);
+
+ skb = get_skb(NULL, wrlen, GFP_KERNEL);
+ if (!skb) {
+ printk(KERN_ERR MOD "%s - failed to alloc skb.\n",
+ __func__);
+ return -ENOMEM;
+ }
+ set_wr_txq(skb, CPL_PRIORITY_SETUP, ep->ctrlq_idx);
+
+ best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx,
+ enable_tcp_timestamps,
+ (AF_INET == ep->com.remote_addr.ss_family) ? 0 : 1);
+ wscale = compute_wscale(rcv_win);
+
+ /*
+ * Specify the largest window that will fit in opt0. The
+ * remainder will be specified in the rx_data_ack.
+ */
+ win = ep->rcv_win >> 10;
+ if (win > RCV_BUFSIZ_M)
+ win = RCV_BUFSIZ_M;
+
+ opt0 = (nocong ? NO_CONG_F : 0) |
+ KEEP_ALIVE_F |
+ DELACK_F |
+ WND_SCALE_V(wscale) |
+ MSS_IDX_V(mtu_idx) |
+ L2T_IDX_V(ep->l2t->idx) |
+ TX_CHAN_V(ep->tx_chan) |
+ SMAC_SEL_V(ep->smac_idx) |
+ DSCP_V(ep->tos) |
+ ULP_MODE_V(ULP_MODE_TCPDDP) |
+ RCV_BUFSIZ_V(win);
+ opt2 = RX_CHANNEL_V(0) |
+ CCTRL_ECN_V(enable_ecn) |
+ RSS_QUEUE_VALID_F | RSS_QUEUE_V(ep->rss_qid);
+ if (enable_tcp_timestamps)
+ opt2 |= TSTAMPS_EN_F;
+ if (enable_tcp_sack)
+ opt2 |= SACK_EN_F;
+ if (wscale && enable_tcp_window_scaling)
+ opt2 |= WND_SCALE_EN_F;
+ if (is_t5(ep->com.dev->rdev.lldi.adapter_type)) {
+ opt2 |= T5_OPT_2_VALID_F;
+ opt2 |= CONG_CNTRL_V(CONG_ALG_TAHOE);
+ opt2 |= T5_ISS_F;
+ }
+ t4_set_arp_err_handler(skb, ep, act_open_req_arp_failure);
+
+ if (is_t4(ep->com.dev->rdev.lldi.adapter_type)) {
+ if (ep->com.remote_addr.ss_family == AF_INET) {
+ req = (struct cpl_act_open_req *) skb_put(skb, wrlen);
+ INIT_TP_WR(req, 0);
+ OPCODE_TID(req) = cpu_to_be32(
+ MK_OPCODE_TID(CPL_ACT_OPEN_REQ,
+ ((ep->rss_qid << 14) | ep->atid)));
+ req->local_port = la->sin_port;
+ req->peer_port = ra->sin_port;
+ req->local_ip = la->sin_addr.s_addr;
+ req->peer_ip = ra->sin_addr.s_addr;
+ req->opt0 = cpu_to_be64(opt0);
+ req->params = cpu_to_be32(cxgb4_select_ntuple(
+ ep->com.dev->rdev.lldi.ports[0],
+ ep->l2t));
+ req->opt2 = cpu_to_be32(opt2);
+ } else {
+ req6 = (struct cpl_act_open_req6 *)skb_put(skb, wrlen);
+
+ INIT_TP_WR(req6, 0);
+ OPCODE_TID(req6) = cpu_to_be32(
+ MK_OPCODE_TID(CPL_ACT_OPEN_REQ6,
+ ((ep->rss_qid<<14)|ep->atid)));
+ req6->local_port = la6->sin6_port;
+ req6->peer_port = ra6->sin6_port;
+ req6->local_ip_hi = *((__be64 *)
+ (la6->sin6_addr.s6_addr));
+ req6->local_ip_lo = *((__be64 *)
+ (la6->sin6_addr.s6_addr + 8));
+ req6->peer_ip_hi = *((__be64 *)
+ (ra6->sin6_addr.s6_addr));
+ req6->peer_ip_lo = *((__be64 *)
+ (ra6->sin6_addr.s6_addr + 8));
+ req6->opt0 = cpu_to_be64(opt0);
+ req6->params = cpu_to_be32(cxgb4_select_ntuple(
+ ep->com.dev->rdev.lldi.ports[0],
+ ep->l2t));
+ req6->opt2 = cpu_to_be32(opt2);
+ }
+ } else {
+ u32 isn = (prandom_u32() & ~7UL) - 1;
+
+ if (peer2peer)
+ isn += 4;
+
+ if (ep->com.remote_addr.ss_family == AF_INET) {
+ t5_req = (struct cpl_t5_act_open_req *)
+ skb_put(skb, wrlen);
+ INIT_TP_WR(t5_req, 0);
+ OPCODE_TID(t5_req) = cpu_to_be32(
+ MK_OPCODE_TID(CPL_ACT_OPEN_REQ,
+ ((ep->rss_qid << 14) | ep->atid)));
+ t5_req->local_port = la->sin_port;
+ t5_req->peer_port = ra->sin_port;
+ t5_req->local_ip = la->sin_addr.s_addr;
+ t5_req->peer_ip = ra->sin_addr.s_addr;
+ t5_req->opt0 = cpu_to_be64(opt0);
+ t5_req->params = cpu_to_be64(FILTER_TUPLE_V(
+ cxgb4_select_ntuple(
+ ep->com.dev->rdev.lldi.ports[0],
+ ep->l2t)));
+ t5_req->rsvd = cpu_to_be32(isn);
+ PDBG("%s snd_isn %u\n", __func__,
+ be32_to_cpu(t5_req->rsvd));
+ t5_req->opt2 = cpu_to_be32(opt2);
+ } else {
+ t5_req6 = (struct cpl_t5_act_open_req6 *)
+ skb_put(skb, wrlen);
+ INIT_TP_WR(t5_req6, 0);
+ OPCODE_TID(t5_req6) = cpu_to_be32(
+ MK_OPCODE_TID(CPL_ACT_OPEN_REQ6,
+ ((ep->rss_qid<<14)|ep->atid)));
+ t5_req6->local_port = la6->sin6_port;
+ t5_req6->peer_port = ra6->sin6_port;
+ t5_req6->local_ip_hi = *((__be64 *)
+ (la6->sin6_addr.s6_addr));
+ t5_req6->local_ip_lo = *((__be64 *)
+ (la6->sin6_addr.s6_addr + 8));
+ t5_req6->peer_ip_hi = *((__be64 *)
+ (ra6->sin6_addr.s6_addr));
+ t5_req6->peer_ip_lo = *((__be64 *)
+ (ra6->sin6_addr.s6_addr + 8));
+ t5_req6->opt0 = cpu_to_be64(opt0);
+ t5_req6->params = cpu_to_be64(FILTER_TUPLE_V(
+ cxgb4_select_ntuple(
+ ep->com.dev->rdev.lldi.ports[0],
+ ep->l2t)));
+ t5_req6->rsvd = cpu_to_be32(isn);
+ PDBG("%s snd_isn %u\n", __func__,
+ be32_to_cpu(t5_req6->rsvd));
+ t5_req6->opt2 = cpu_to_be32(opt2);
+ }
+ }
+
+ set_bit(ACT_OPEN_REQ, &ep->com.history);
+ return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+}
+
+static void send_mpa_req(struct c4iw_ep *ep, struct sk_buff *skb,
+ u8 mpa_rev_to_use)
+{
+ int mpalen, wrlen;
+ struct fw_ofld_tx_data_wr *req;
+ struct mpa_message *mpa;
+ struct mpa_v2_conn_params mpa_v2_params;
+
+ PDBG("%s ep %p tid %u pd_len %d\n", __func__, ep, ep->hwtid, ep->plen);
+
+ BUG_ON(skb_cloned(skb));
+
+ mpalen = sizeof(*mpa) + ep->plen;
+ if (mpa_rev_to_use == 2)
+ mpalen += sizeof(struct mpa_v2_conn_params);
+ wrlen = roundup(mpalen + sizeof *req, 16);
+ skb = get_skb(skb, wrlen, GFP_KERNEL);
+ if (!skb) {
+ connect_reply_upcall(ep, -ENOMEM);
+ return;
+ }
+ set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
+
+ req = (struct fw_ofld_tx_data_wr *)skb_put(skb, wrlen);
+ memset(req, 0, wrlen);
+ req->op_to_immdlen = cpu_to_be32(
+ FW_WR_OP_V(FW_OFLD_TX_DATA_WR) |
+ FW_WR_COMPL_F |
+ FW_WR_IMMDLEN_V(mpalen));
+ req->flowid_len16 = cpu_to_be32(
+ FW_WR_FLOWID_V(ep->hwtid) |
+ FW_WR_LEN16_V(wrlen >> 4));
+ req->plen = cpu_to_be32(mpalen);
+ req->tunnel_to_proxy = cpu_to_be32(
+ FW_OFLD_TX_DATA_WR_FLUSH_F |
+ FW_OFLD_TX_DATA_WR_SHOVE_F);
+
+ mpa = (struct mpa_message *)(req + 1);
+ memcpy(mpa->key, MPA_KEY_REQ, sizeof(mpa->key));
+ mpa->flags = (crc_enabled ? MPA_CRC : 0) |
+ (markers_enabled ? MPA_MARKERS : 0) |
+ (mpa_rev_to_use == 2 ? MPA_ENHANCED_RDMA_CONN : 0);
+ mpa->private_data_size = htons(ep->plen);
+ mpa->revision = mpa_rev_to_use;
+ if (mpa_rev_to_use == 1) {
+ ep->tried_with_mpa_v1 = 1;
+ ep->retry_with_mpa_v1 = 0;
+ }
+
+ if (mpa_rev_to_use == 2) {
+ mpa->private_data_size = htons(ntohs(mpa->private_data_size) +
+ sizeof (struct mpa_v2_conn_params));
+ PDBG("%s initiator ird %u ord %u\n", __func__, ep->ird,
+ ep->ord);
+ mpa_v2_params.ird = htons((u16)ep->ird);
+ mpa_v2_params.ord = htons((u16)ep->ord);
+
+ if (peer2peer) {
+ mpa_v2_params.ird |= htons(MPA_V2_PEER2PEER_MODEL);
+ if (p2p_type == FW_RI_INIT_P2PTYPE_RDMA_WRITE)
+ mpa_v2_params.ord |=
+ htons(MPA_V2_RDMA_WRITE_RTR);
+ else if (p2p_type == FW_RI_INIT_P2PTYPE_READ_REQ)
+ mpa_v2_params.ord |=
+ htons(MPA_V2_RDMA_READ_RTR);
+ }
+ memcpy(mpa->private_data, &mpa_v2_params,
+ sizeof(struct mpa_v2_conn_params));
+
+ if (ep->plen)
+ memcpy(mpa->private_data +
+ sizeof(struct mpa_v2_conn_params),
+ ep->mpa_pkt + sizeof(*mpa), ep->plen);
+ } else
+ if (ep->plen)
+ memcpy(mpa->private_data,
+ ep->mpa_pkt + sizeof(*mpa), ep->plen);
+
+ /*
+ * Reference the mpa skb. This ensures the data area
+ * will remain in memory until the hw acks the tx.
+ * Function fw4_ack() will deref it.
+ */
+ skb_get(skb);
+ t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
+ BUG_ON(ep->mpa_skb);
+ ep->mpa_skb = skb;
+ c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+ start_ep_timer(ep);
+ __state_set(&ep->com, MPA_REQ_SENT);
+ ep->mpa_attr.initiator = 1;
+ ep->snd_seq += mpalen;
+ return;
+}
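+
+/*
+ * Wire layout of the MPA start request built above (descriptive only):
+ *
+ *   struct mpa_message            - key, flags, revision, private_data_size
+ *   [struct mpa_v2_conn_params]   - only for mpa_rev_to_use == 2; carries the
+ *                                   initiator IRD/ORD and RTR bits, and is
+ *                                   counted in private_data_size
+ *   [private data]                - ep->plen bytes from the ULP
+ *
+ * process_mpa_reply()/process_mpa_request() below parse this same framing.
+ */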
+
+static int send_mpa_reject(struct c4iw_ep *ep, const void *pdata, u8 plen)
+{
+ int mpalen, wrlen;
+ struct fw_ofld_tx_data_wr *req;
+ struct mpa_message *mpa;
+ struct sk_buff *skb;
+ struct mpa_v2_conn_params mpa_v2_params;
+
+ PDBG("%s ep %p tid %u pd_len %d\n", __func__, ep, ep->hwtid, ep->plen);
+
+ mpalen = sizeof(*mpa) + plen;
+ if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn)
+ mpalen += sizeof(struct mpa_v2_conn_params);
+ wrlen = roundup(mpalen + sizeof *req, 16);
+
+ skb = get_skb(NULL, wrlen, GFP_KERNEL);
+ if (!skb) {
+ printk(KERN_ERR MOD "%s - cannot alloc skb!\n", __func__);
+ return -ENOMEM;
+ }
+ set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
+
+ req = (struct fw_ofld_tx_data_wr *)skb_put(skb, wrlen);
+ memset(req, 0, wrlen);
+ req->op_to_immdlen = cpu_to_be32(
+ FW_WR_OP_V(FW_OFLD_TX_DATA_WR) |
+ FW_WR_COMPL_F |
+ FW_WR_IMMDLEN_V(mpalen));
+ req->flowid_len16 = cpu_to_be32(
+ FW_WR_FLOWID_V(ep->hwtid) |
+ FW_WR_LEN16_V(wrlen >> 4));
+ req->plen = cpu_to_be32(mpalen);
+ req->tunnel_to_proxy = cpu_to_be32(
+ FW_OFLD_TX_DATA_WR_FLUSH_F |
+ FW_OFLD_TX_DATA_WR_SHOVE_F);
+
+ mpa = (struct mpa_message *)(req + 1);
+ memset(mpa, 0, sizeof(*mpa));
+ memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key));
+ mpa->flags = MPA_REJECT;
+ mpa->revision = ep->mpa_attr.version;
+ mpa->private_data_size = htons(plen);
+
+ if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) {
+ mpa->flags |= MPA_ENHANCED_RDMA_CONN;
+ mpa->private_data_size = htons(ntohs(mpa->private_data_size) +
+ sizeof (struct mpa_v2_conn_params));
+ mpa_v2_params.ird = htons(((u16)ep->ird) |
+ (peer2peer ? MPA_V2_PEER2PEER_MODEL :
+ 0));
+ mpa_v2_params.ord = htons(((u16)ep->ord) | (peer2peer ?
+ (p2p_type ==
+ FW_RI_INIT_P2PTYPE_RDMA_WRITE ?
+ MPA_V2_RDMA_WRITE_RTR : p2p_type ==
+ FW_RI_INIT_P2PTYPE_READ_REQ ?
+ MPA_V2_RDMA_READ_RTR : 0) : 0));
+ memcpy(mpa->private_data, &mpa_v2_params,
+ sizeof(struct mpa_v2_conn_params));
+
+ if (ep->plen)
+ memcpy(mpa->private_data +
+ sizeof(struct mpa_v2_conn_params), pdata, plen);
+ } else
+ if (plen)
+ memcpy(mpa->private_data, pdata, plen);
+
+ /*
+ * Reference the mpa skb again. This ensures the data area
+ * will remain in memory until the hw acks the tx.
+ * Function fw4_ack() will deref it.
+ */
+ skb_get(skb);
+ set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
+ t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
+ BUG_ON(ep->mpa_skb);
+ ep->mpa_skb = skb;
+ ep->snd_seq += mpalen;
+ return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+}
+
+static int send_mpa_reply(struct c4iw_ep *ep, const void *pdata, u8 plen)
+{
+ int mpalen, wrlen;
+ struct fw_ofld_tx_data_wr *req;
+ struct mpa_message *mpa;
+ struct sk_buff *skb;
+ struct mpa_v2_conn_params mpa_v2_params;
+
+ PDBG("%s ep %p tid %u pd_len %d\n", __func__, ep, ep->hwtid, ep->plen);
+
+ mpalen = sizeof(*mpa) + plen;
+ if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn)
+ mpalen += sizeof(struct mpa_v2_conn_params);
+ wrlen = roundup(mpalen + sizeof *req, 16);
+
+ skb = get_skb(NULL, wrlen, GFP_KERNEL);
+ if (!skb) {
+ printk(KERN_ERR MOD "%s - cannot alloc skb!\n", __func__);
+ return -ENOMEM;
+ }
+ set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
+
+ req = (struct fw_ofld_tx_data_wr *) skb_put(skb, wrlen);
+ memset(req, 0, wrlen);
+ req->op_to_immdlen = cpu_to_be32(
+ FW_WR_OP_V(FW_OFLD_TX_DATA_WR) |
+ FW_WR_COMPL_F |
+ FW_WR_IMMDLEN_V(mpalen));
+ req->flowid_len16 = cpu_to_be32(
+ FW_WR_FLOWID_V(ep->hwtid) |
+ FW_WR_LEN16_V(wrlen >> 4));
+ req->plen = cpu_to_be32(mpalen);
+ req->tunnel_to_proxy = cpu_to_be32(
+ FW_OFLD_TX_DATA_WR_FLUSH_F |
+ FW_OFLD_TX_DATA_WR_SHOVE_F);
+
+ mpa = (struct mpa_message *)(req + 1);
+ memset(mpa, 0, sizeof(*mpa));
+ memcpy(mpa->key, MPA_KEY_REP, sizeof(mpa->key));
+ mpa->flags = (ep->mpa_attr.crc_enabled ? MPA_CRC : 0) |
+ (markers_enabled ? MPA_MARKERS : 0);
+ mpa->revision = ep->mpa_attr.version;
+ mpa->private_data_size = htons(plen);
+
+ if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) {
+ mpa->flags |= MPA_ENHANCED_RDMA_CONN;
+ mpa->private_data_size = htons(ntohs(mpa->private_data_size) +
+ sizeof (struct mpa_v2_conn_params));
+ mpa_v2_params.ird = htons((u16)ep->ird);
+ mpa_v2_params.ord = htons((u16)ep->ord);
+ if (peer2peer && (ep->mpa_attr.p2p_type !=
+ FW_RI_INIT_P2PTYPE_DISABLED)) {
+ mpa_v2_params.ird |= htons(MPA_V2_PEER2PEER_MODEL);
+
+ if (p2p_type == FW_RI_INIT_P2PTYPE_RDMA_WRITE)
+ mpa_v2_params.ord |=
+ htons(MPA_V2_RDMA_WRITE_RTR);
+ else if (p2p_type == FW_RI_INIT_P2PTYPE_READ_REQ)
+ mpa_v2_params.ord |=
+ htons(MPA_V2_RDMA_READ_RTR);
+ }
+
+ memcpy(mpa->private_data, &mpa_v2_params,
+ sizeof(struct mpa_v2_conn_params));
+
+ if (ep->plen)
+ memcpy(mpa->private_data +
+ sizeof(struct mpa_v2_conn_params), pdata, plen);
+ } else
+ if (plen)
+ memcpy(mpa->private_data, pdata, plen);
+
+ /*
+ * Reference the mpa skb. This ensures the data area
+ * will remain in memory until the hw acks the tx.
+ * Function fw4_ack() will deref it.
+ */
+ skb_get(skb);
+ t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
+ ep->mpa_skb = skb;
+ __state_set(&ep->com, MPA_REP_SENT);
+ ep->snd_seq += mpalen;
+ return c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+}
+
+static int act_establish(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+ struct c4iw_ep *ep;
+ struct cpl_act_establish *req = cplhdr(skb);
+ unsigned int tid = GET_TID(req);
+ unsigned int atid = TID_TID_G(ntohl(req->tos_atid));
+ struct tid_info *t = dev->rdev.lldi.tids;
+
+ ep = lookup_atid(t, atid);
+
+ PDBG("%s ep %p tid %u snd_isn %u rcv_isn %u\n", __func__, ep, tid,
+ be32_to_cpu(req->snd_isn), be32_to_cpu(req->rcv_isn));
+
+ mutex_lock(&ep->com.mutex);
+ dst_confirm(ep->dst);
+
+ /* setup the hwtid for this connection */
+ ep->hwtid = tid;
+ cxgb4_insert_tid(t, ep, tid);
+ insert_handle(dev, &dev->hwtid_idr, ep, ep->hwtid);
+
+ ep->snd_seq = be32_to_cpu(req->snd_isn);
+ ep->rcv_seq = be32_to_cpu(req->rcv_isn);
+
+ set_emss(ep, ntohs(req->tcp_opt));
+
+ /* dealloc the atid */
+ remove_handle(ep->com.dev, &ep->com.dev->atid_idr, atid);
+ cxgb4_free_atid(t, atid);
+ set_bit(ACT_ESTAB, &ep->com.history);
+
+ /* start MPA negotiation */
+ send_flowc(ep, NULL);
+ if (ep->retry_with_mpa_v1)
+ send_mpa_req(ep, skb, 1);
+ else
+ send_mpa_req(ep, skb, mpa_rev);
+ mutex_unlock(&ep->com.mutex);
+ return 0;
+}
+
+static void close_complete_upcall(struct c4iw_ep *ep, int status)
+{
+ struct iw_cm_event event;
+
+ PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+ memset(&event, 0, sizeof(event));
+ event.event = IW_CM_EVENT_CLOSE;
+ event.status = status;
+ if (ep->com.cm_id) {
+ PDBG("close complete delivered ep %p cm_id %p tid %u\n",
+ ep, ep->com.cm_id, ep->hwtid);
+ ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+ ep->com.cm_id->rem_ref(ep->com.cm_id);
+ ep->com.cm_id = NULL;
+ set_bit(CLOSE_UPCALL, &ep->com.history);
+ }
+}
+
+static int abort_connection(struct c4iw_ep *ep, struct sk_buff *skb, gfp_t gfp)
+{
+ PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+ __state_set(&ep->com, ABORTING);
+ set_bit(ABORT_CONN, &ep->com.history);
+ return send_abort(ep, skb, gfp);
+}
+
+static void peer_close_upcall(struct c4iw_ep *ep)
+{
+ struct iw_cm_event event;
+
+ PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+ memset(&event, 0, sizeof(event));
+ event.event = IW_CM_EVENT_DISCONNECT;
+ if (ep->com.cm_id) {
+ PDBG("peer close delivered ep %p cm_id %p tid %u\n",
+ ep, ep->com.cm_id, ep->hwtid);
+ ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+ set_bit(DISCONN_UPCALL, &ep->com.history);
+ }
+}
+
+static void peer_abort_upcall(struct c4iw_ep *ep)
+{
+ struct iw_cm_event event;
+
+ PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+ memset(&event, 0, sizeof(event));
+ event.event = IW_CM_EVENT_CLOSE;
+ event.status = -ECONNRESET;
+ if (ep->com.cm_id) {
+ PDBG("abort delivered ep %p cm_id %p tid %u\n", ep,
+ ep->com.cm_id, ep->hwtid);
+ ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+ ep->com.cm_id->rem_ref(ep->com.cm_id);
+ ep->com.cm_id = NULL;
+ set_bit(ABORT_UPCALL, &ep->com.history);
+ }
+}
+
+static void connect_reply_upcall(struct c4iw_ep *ep, int status)
+{
+ struct iw_cm_event event;
+
+ PDBG("%s ep %p tid %u status %d\n", __func__, ep, ep->hwtid, status);
+ memset(&event, 0, sizeof(event));
+ event.event = IW_CM_EVENT_CONNECT_REPLY;
+ event.status = status;
+ memcpy(&event.local_addr, &ep->com.local_addr,
+ sizeof(ep->com.local_addr));
+ memcpy(&event.remote_addr, &ep->com.remote_addr,
+ sizeof(ep->com.remote_addr));
+
+ if ((status == 0) || (status == -ECONNREFUSED)) {
+ if (!ep->tried_with_mpa_v1) {
+ /* this means MPA_v2 is used */
+ event.private_data_len = ep->plen -
+ sizeof(struct mpa_v2_conn_params);
+ event.private_data = ep->mpa_pkt +
+ sizeof(struct mpa_message) +
+ sizeof(struct mpa_v2_conn_params);
+ } else {
+ /* this means MPA_v1 is used */
+ event.private_data_len = ep->plen;
+ event.private_data = ep->mpa_pkt +
+ sizeof(struct mpa_message);
+ }
+ }
+
+ PDBG("%s ep %p tid %u status %d\n", __func__, ep,
+ ep->hwtid, status);
+ set_bit(CONN_RPL_UPCALL, &ep->com.history);
+ ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+
+ if (status < 0) {
+ ep->com.cm_id->rem_ref(ep->com.cm_id);
+ ep->com.cm_id = NULL;
+ }
+}
+
+static int connect_request_upcall(struct c4iw_ep *ep)
+{
+ struct iw_cm_event event;
+ int ret;
+
+ PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+ memset(&event, 0, sizeof(event));
+ event.event = IW_CM_EVENT_CONNECT_REQUEST;
+ memcpy(&event.local_addr, &ep->com.local_addr,
+ sizeof(ep->com.local_addr));
+ memcpy(&event.remote_addr, &ep->com.remote_addr,
+ sizeof(ep->com.remote_addr));
+ event.provider_data = ep;
+ if (!ep->tried_with_mpa_v1) {
+ /* this means MPA_v2 is used */
+ event.ord = ep->ord;
+ event.ird = ep->ird;
+ event.private_data_len = ep->plen -
+ sizeof(struct mpa_v2_conn_params);
+ event.private_data = ep->mpa_pkt + sizeof(struct mpa_message) +
+ sizeof(struct mpa_v2_conn_params);
+ } else {
+ /* this means MPA_v1 is used. Send max supported */
+ event.ord = cur_max_read_depth(ep->com.dev);
+ event.ird = cur_max_read_depth(ep->com.dev);
+ event.private_data_len = ep->plen;
+ event.private_data = ep->mpa_pkt + sizeof(struct mpa_message);
+ }
+ c4iw_get_ep(&ep->com);
+ ret = ep->parent_ep->com.cm_id->event_handler(ep->parent_ep->com.cm_id,
+ &event);
+ if (ret)
+ c4iw_put_ep(&ep->com);
+ set_bit(CONNREQ_UPCALL, &ep->com.history);
+ c4iw_put_ep(&ep->parent_ep->com);
+ return ret;
+}
+
+static void established_upcall(struct c4iw_ep *ep)
+{
+ struct iw_cm_event event;
+
+ PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+ memset(&event, 0, sizeof(event));
+ event.event = IW_CM_EVENT_ESTABLISHED;
+ event.ird = ep->ird;
+ event.ord = ep->ord;
+ if (ep->com.cm_id) {
+ PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+ ep->com.cm_id->event_handler(ep->com.cm_id, &event);
+ set_bit(ESTAB_UPCALL, &ep->com.history);
+ }
+}
+
+static int update_rx_credits(struct c4iw_ep *ep, u32 credits)
+{
+ struct cpl_rx_data_ack *req;
+ struct sk_buff *skb;
+ int wrlen = roundup(sizeof *req, 16);
+
+ PDBG("%s ep %p tid %u credits %u\n", __func__, ep, ep->hwtid, credits);
+ skb = get_skb(NULL, wrlen, GFP_KERNEL);
+ if (!skb) {
+ printk(KERN_ERR MOD "update_rx_credits - cannot alloc skb!\n");
+ return 0;
+ }
+
+ /*
+ * If we couldn't specify the entire rcv window at connection setup
+ * due to the limit in the number of bits in the RCV_BUFSIZ field,
+ * then add the overage in to the credits returned.
+ */
+ if (ep->rcv_win > RCV_BUFSIZ_M * 1024)
+ credits += ep->rcv_win - RCV_BUFSIZ_M * 1024;
+
+ req = (struct cpl_rx_data_ack *) skb_put(skb, wrlen);
+ memset(req, 0, wrlen);
+ INIT_TP_WR(req, ep->hwtid);
+ OPCODE_TID(req) = cpu_to_be32(MK_OPCODE_TID(CPL_RX_DATA_ACK,
+ ep->hwtid));
+ req->credit_dack = cpu_to_be32(credits | RX_FORCE_ACK_F |
+ RX_DACK_CHANGE_F |
+ RX_DACK_MODE_V(dack_mode));
+ set_wr_txq(skb, CPL_PRIORITY_ACK, ep->ctrlq_idx);
+ c4iw_ofld_send(&ep->com.dev->rdev, skb);
+ return credits;
+}
+
+#define RELAXED_IRD_NEGOTIATION 1
+
+static int process_mpa_reply(struct c4iw_ep *ep, struct sk_buff *skb)
+{
+ struct mpa_message *mpa;
+ struct mpa_v2_conn_params *mpa_v2_params;
+ u16 plen;
+ u16 resp_ird, resp_ord;
+ u8 rtr_mismatch = 0, insuff_ird = 0;
+ struct c4iw_qp_attributes attrs;
+ enum c4iw_qp_attr_mask mask;
+ int err;
+ int disconnect = 0;
+
+ PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+
+ /*
+ * Stop mpa timer. If it expired, then
+ * we ignore the MPA reply. process_timeout()
+ * will abort the connection.
+ */
+ if (stop_ep_timer(ep))
+ return 0;
+
+ /*
+ * If we get more than the supported amount of private data
+ * then we must fail this connection.
+ */
+ if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) {
+ err = -EINVAL;
+ goto err;
+ }
+
+ /*
+ * copy the new data into our accumulation buffer.
+ */
+ skb_copy_from_linear_data(skb, &(ep->mpa_pkt[ep->mpa_pkt_len]),
+ skb->len);
+ ep->mpa_pkt_len += skb->len;
+
+ /*
+ * if we don't even have the mpa message, then bail.
+ */
+ if (ep->mpa_pkt_len < sizeof(*mpa))
+ return 0;
+ mpa = (struct mpa_message *) ep->mpa_pkt;
+
+ /* Validate MPA header. */
+ if (mpa->revision > mpa_rev) {
+ printk(KERN_ERR MOD "%s MPA version mismatch. Local = %d,"
+ " Received = %d\n", __func__, mpa_rev, mpa->revision);
+ err = -EPROTO;
+ goto err;
+ }
+ if (memcmp(mpa->key, MPA_KEY_REP, sizeof(mpa->key))) {
+ err = -EPROTO;
+ goto err;
+ }
+
+ plen = ntohs(mpa->private_data_size);
+
+ /*
+ * Fail if there's too much private data.
+ */
+ if (plen > MPA_MAX_PRIVATE_DATA) {
+ err = -EPROTO;
+ goto err;
+ }
+
+ /*
+	 * Fail if plen does not account for the packet size.
+ */
+ if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
+ err = -EPROTO;
+ goto err;
+ }
+
+ ep->plen = (u8) plen;
+
+ /*
+ * If we don't have all the pdata yet, then bail.
+	 * We'll continue processing when more data arrives.
+ */
+ if (ep->mpa_pkt_len < (sizeof(*mpa) + plen))
+ return 0;
+
+ if (mpa->flags & MPA_REJECT) {
+ err = -ECONNREFUSED;
+ goto err;
+ }
+
+ /*
+ * If we get here we have accumulated the entire mpa
+ * start reply message including private data. And
+ * the MPA header is valid.
+ */
+ __state_set(&ep->com, FPDU_MODE);
+ ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0;
+ ep->mpa_attr.recv_marker_enabled = markers_enabled;
+ ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0;
+ ep->mpa_attr.version = mpa->revision;
+ ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED;
+
+ if (mpa->revision == 2) {
+ ep->mpa_attr.enhanced_rdma_conn =
+ mpa->flags & MPA_ENHANCED_RDMA_CONN ? 1 : 0;
+ if (ep->mpa_attr.enhanced_rdma_conn) {
+ mpa_v2_params = (struct mpa_v2_conn_params *)
+ (ep->mpa_pkt + sizeof(*mpa));
+ resp_ird = ntohs(mpa_v2_params->ird) &
+ MPA_V2_IRD_ORD_MASK;
+ resp_ord = ntohs(mpa_v2_params->ord) &
+ MPA_V2_IRD_ORD_MASK;
+ PDBG("%s responder ird %u ord %u ep ird %u ord %u\n",
+ __func__, resp_ird, resp_ord, ep->ird, ep->ord);
+
+ /*
+			 * This is a double-check. Ideally these checks are
+			 * not required, since the ird/ord negotiation has
+			 * already been handled in c4iw_accept_cr().
+ */
+ if (ep->ird < resp_ord) {
+ if (RELAXED_IRD_NEGOTIATION && resp_ord <=
+ ep->com.dev->rdev.lldi.max_ordird_qp)
+ ep->ird = resp_ord;
+ else
+ insuff_ird = 1;
+ } else if (ep->ird > resp_ord) {
+ ep->ird = resp_ord;
+ }
+ if (ep->ord > resp_ird) {
+ if (RELAXED_IRD_NEGOTIATION)
+ ep->ord = resp_ird;
+ else
+ insuff_ird = 1;
+ }
+ if (insuff_ird) {
+ err = -ENOMEM;
+ ep->ird = resp_ord;
+ ep->ord = resp_ird;
+ }
+
+ if (ntohs(mpa_v2_params->ird) &
+ MPA_V2_PEER2PEER_MODEL) {
+ if (ntohs(mpa_v2_params->ord) &
+ MPA_V2_RDMA_WRITE_RTR)
+ ep->mpa_attr.p2p_type =
+ FW_RI_INIT_P2PTYPE_RDMA_WRITE;
+ else if (ntohs(mpa_v2_params->ord) &
+ MPA_V2_RDMA_READ_RTR)
+ ep->mpa_attr.p2p_type =
+ FW_RI_INIT_P2PTYPE_READ_REQ;
+ }
+ }
+ } else if (mpa->revision == 1)
+ if (peer2peer)
+ ep->mpa_attr.p2p_type = p2p_type;
+
+ PDBG("%s - crc_enabled=%d, recv_marker_enabled=%d, "
+ "xmit_marker_enabled=%d, version=%d p2p_type=%d local-p2p_type = "
+ "%d\n", __func__, ep->mpa_attr.crc_enabled,
+ ep->mpa_attr.recv_marker_enabled,
+ ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version,
+ ep->mpa_attr.p2p_type, p2p_type);
+
+ /*
+	 * If the responder's RTR does not match the initiator's, set
+	 * FW_RI_INIT_P2PTYPE_DISABLED in the mpa attributes so that no RTR is
+	 * generated when moving the QP to RTS state.
+	 * A TERM message will be sent after the QP has moved to RTS state.
+ */
+ if ((ep->mpa_attr.version == 2) && peer2peer &&
+ (ep->mpa_attr.p2p_type != p2p_type)) {
+ ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED;
+ rtr_mismatch = 1;
+ }
+
+ attrs.mpa_attr = ep->mpa_attr;
+ attrs.max_ird = ep->ird;
+ attrs.max_ord = ep->ord;
+ attrs.llp_stream_handle = ep;
+ attrs.next_state = C4IW_QP_STATE_RTS;
+
+ mask = C4IW_QP_ATTR_NEXT_STATE |
+ C4IW_QP_ATTR_LLP_STREAM_HANDLE | C4IW_QP_ATTR_MPA_ATTR |
+ C4IW_QP_ATTR_MAX_IRD | C4IW_QP_ATTR_MAX_ORD;
+
+ /* bind QP and TID with INIT_WR */
+ err = c4iw_modify_qp(ep->com.qp->rhp,
+ ep->com.qp, mask, &attrs, 1);
+ if (err)
+ goto err;
+
+ /*
+ * If responder's RTR requirement did not match with what initiator
+ * supports, generate TERM message
+ */
+ if (rtr_mismatch) {
+ printk(KERN_ERR "%s: RTR mismatch, sending TERM\n", __func__);
+ attrs.layer_etype = LAYER_MPA | DDP_LLP;
+ attrs.ecode = MPA_NOMATCH_RTR;
+ attrs.next_state = C4IW_QP_STATE_TERMINATE;
+ attrs.send_term = 1;
+ err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
+ C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
+ err = -ENOMEM;
+ disconnect = 1;
+ goto out;
+ }
+
+ /*
+	 * Generate TERM if the initiator IRD is not sufficient for the
+	 * responder-provided ORD. Currently we behave the same way when
+	 * the responder-provided IRD is not sufficient for the initiator
+	 * ORD.
+ */
+ if (insuff_ird) {
+ printk(KERN_ERR "%s: Insufficient IRD, sending TERM\n",
+ __func__);
+ attrs.layer_etype = LAYER_MPA | DDP_LLP;
+ attrs.ecode = MPA_INSUFF_IRD;
+ attrs.next_state = C4IW_QP_STATE_TERMINATE;
+ attrs.send_term = 1;
+ err = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
+ C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
+ err = -ENOMEM;
+ disconnect = 1;
+ goto out;
+ }
+ goto out;
+err:
+ __state_set(&ep->com, ABORTING);
+ send_abort(ep, skb, GFP_KERNEL);
+out:
+ connect_reply_upcall(ep, err);
+ return disconnect;
+}
+
+static void process_mpa_request(struct c4iw_ep *ep, struct sk_buff *skb)
+{
+ struct mpa_message *mpa;
+ struct mpa_v2_conn_params *mpa_v2_params;
+ u16 plen;
+
+ PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+
+ /*
+ * If we get more than the supported amount of private data
+ * then we must fail this connection.
+ */
+ if (ep->mpa_pkt_len + skb->len > sizeof(ep->mpa_pkt)) {
+ (void)stop_ep_timer(ep);
+ abort_connection(ep, skb, GFP_KERNEL);
+ return;
+ }
+
+ PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__);
+
+ /*
+ * Copy the new data into our accumulation buffer.
+ */
+ skb_copy_from_linear_data(skb, &(ep->mpa_pkt[ep->mpa_pkt_len]),
+ skb->len);
+ ep->mpa_pkt_len += skb->len;
+
+ /*
+ * If we don't even have the mpa message, then bail.
+ * We'll continue processing when more data arrives.
+ */
+ if (ep->mpa_pkt_len < sizeof(*mpa))
+ return;
+
+ PDBG("%s enter (%s line %u)\n", __func__, __FILE__, __LINE__);
+ mpa = (struct mpa_message *) ep->mpa_pkt;
+
+ /*
+ * Validate MPA Header.
+ */
+ if (mpa->revision > mpa_rev) {
+ printk(KERN_ERR MOD "%s MPA version mismatch. Local = %d,"
+ " Received = %d\n", __func__, mpa_rev, mpa->revision);
+ (void)stop_ep_timer(ep);
+ abort_connection(ep, skb, GFP_KERNEL);
+ return;
+ }
+
+ if (memcmp(mpa->key, MPA_KEY_REQ, sizeof(mpa->key))) {
+ (void)stop_ep_timer(ep);
+ abort_connection(ep, skb, GFP_KERNEL);
+ return;
+ }
+
+ plen = ntohs(mpa->private_data_size);
+
+ /*
+ * Fail if there's too much private data.
+ */
+ if (plen > MPA_MAX_PRIVATE_DATA) {
+ (void)stop_ep_timer(ep);
+ abort_connection(ep, skb, GFP_KERNEL);
+ return;
+ }
+
+ /*
+ * Fail if plen does not account for the accumulated packet size.
+ */
+ if (ep->mpa_pkt_len > (sizeof(*mpa) + plen)) {
+ (void)stop_ep_timer(ep);
+ abort_connection(ep, skb, GFP_KERNEL);
+ return;
+ }
+ ep->plen = (u8) plen;
+
+ /*
+ * If we don't have all the pdata yet, then bail.
+ */
+ if (ep->mpa_pkt_len < (sizeof(*mpa) + plen))
+ return;
+
+ /*
+ * If we get here we have accumulated the entire mpa
+ * start request message including private data.
+ */
+ ep->mpa_attr.initiator = 0;
+ ep->mpa_attr.crc_enabled = (mpa->flags & MPA_CRC) | crc_enabled ? 1 : 0;
+ ep->mpa_attr.recv_marker_enabled = markers_enabled;
+ ep->mpa_attr.xmit_marker_enabled = mpa->flags & MPA_MARKERS ? 1 : 0;
+ ep->mpa_attr.version = mpa->revision;
+ if (mpa->revision == 1)
+ ep->tried_with_mpa_v1 = 1;
+ ep->mpa_attr.p2p_type = FW_RI_INIT_P2PTYPE_DISABLED;
+
+ if (mpa->revision == 2) {
+ ep->mpa_attr.enhanced_rdma_conn =
+ mpa->flags & MPA_ENHANCED_RDMA_CONN ? 1 : 0;
+ if (ep->mpa_attr.enhanced_rdma_conn) {
+ mpa_v2_params = (struct mpa_v2_conn_params *)
+ (ep->mpa_pkt + sizeof(*mpa));
+ ep->ird = ntohs(mpa_v2_params->ird) &
+ MPA_V2_IRD_ORD_MASK;
+ ep->ord = ntohs(mpa_v2_params->ord) &
+ MPA_V2_IRD_ORD_MASK;
+ PDBG("%s initiator ird %u ord %u\n", __func__, ep->ird,
+ ep->ord);
+ if (ntohs(mpa_v2_params->ird) & MPA_V2_PEER2PEER_MODEL)
+ if (peer2peer) {
+ if (ntohs(mpa_v2_params->ord) &
+ MPA_V2_RDMA_WRITE_RTR)
+ ep->mpa_attr.p2p_type =
+ FW_RI_INIT_P2PTYPE_RDMA_WRITE;
+ else if (ntohs(mpa_v2_params->ord) &
+ MPA_V2_RDMA_READ_RTR)
+ ep->mpa_attr.p2p_type =
+ FW_RI_INIT_P2PTYPE_READ_REQ;
+ }
+ }
+ } else if (mpa->revision == 1)
+ if (peer2peer)
+ ep->mpa_attr.p2p_type = p2p_type;
+
+ PDBG("%s - crc_enabled=%d, recv_marker_enabled=%d, "
+ "xmit_marker_enabled=%d, version=%d p2p_type=%d\n", __func__,
+ ep->mpa_attr.crc_enabled, ep->mpa_attr.recv_marker_enabled,
+ ep->mpa_attr.xmit_marker_enabled, ep->mpa_attr.version,
+ ep->mpa_attr.p2p_type);
+
+ /*
+ * If the endpoint timer already expired, then we ignore
+ * the start request. process_timeout() will abort
+ * the connection.
+ */
+ if (!stop_ep_timer(ep)) {
+ __state_set(&ep->com, MPA_REQ_RCVD);
+
+ /* drive upcall */
+ mutex_lock_nested(&ep->parent_ep->com.mutex,
+ SINGLE_DEPTH_NESTING);
+ if (ep->parent_ep->com.state != DEAD) {
+ if (connect_request_upcall(ep))
+ abort_connection(ep, skb, GFP_KERNEL);
+ } else {
+ abort_connection(ep, skb, GFP_KERNEL);
+ }
+ mutex_unlock(&ep->parent_ep->com.mutex);
+ }
+ return;
+}
+
+static int rx_data(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+ struct c4iw_ep *ep;
+ struct cpl_rx_data *hdr = cplhdr(skb);
+ unsigned int dlen = ntohs(hdr->len);
+ unsigned int tid = GET_TID(hdr);
+ struct tid_info *t = dev->rdev.lldi.tids;
+ __u8 status = hdr->status;
+ int disconnect = 0;
+
+ ep = lookup_tid(t, tid);
+ if (!ep)
+ return 0;
+ PDBG("%s ep %p tid %u dlen %u\n", __func__, ep, ep->hwtid, dlen);
+ skb_pull(skb, sizeof(*hdr));
+ skb_trim(skb, dlen);
+ mutex_lock(&ep->com.mutex);
+
+ /* update RX credits */
+ update_rx_credits(ep, dlen);
+
+ switch (ep->com.state) {
+ case MPA_REQ_SENT:
+ ep->rcv_seq += dlen;
+ disconnect = process_mpa_reply(ep, skb);
+ break;
+ case MPA_REQ_WAIT:
+ ep->rcv_seq += dlen;
+ process_mpa_request(ep, skb);
+ break;
+ case FPDU_MODE: {
+ struct c4iw_qp_attributes attrs;
+ BUG_ON(!ep->com.qp);
+ if (status)
+ pr_err("%s Unexpected streaming data." \
+ " qpid %u ep %p state %d tid %u status %d\n",
+ __func__, ep->com.qp->wq.sq.qid, ep,
+ ep->com.state, ep->hwtid, status);
+ attrs.next_state = C4IW_QP_STATE_TERMINATE;
+ c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
+ C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
+ disconnect = 1;
+ break;
+ }
+ default:
+ break;
+ }
+ mutex_unlock(&ep->com.mutex);
+ if (disconnect)
+ c4iw_ep_disconnect(ep, 0, GFP_KERNEL);
+ return 0;
+}
+
+static int abort_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+ struct c4iw_ep *ep;
+ struct cpl_abort_rpl_rss *rpl = cplhdr(skb);
+ int release = 0;
+ unsigned int tid = GET_TID(rpl);
+ struct tid_info *t = dev->rdev.lldi.tids;
+
+ ep = lookup_tid(t, tid);
+ if (!ep) {
+ printk(KERN_WARNING MOD "Abort rpl to freed endpoint\n");
+ return 0;
+ }
+ PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+ mutex_lock(&ep->com.mutex);
+ switch (ep->com.state) {
+ case ABORTING:
+ c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET);
+ __state_set(&ep->com, DEAD);
+ release = 1;
+ break;
+ default:
+ printk(KERN_ERR "%s ep %p state %d\n",
+ __func__, ep, ep->com.state);
+ break;
+ }
+ mutex_unlock(&ep->com.mutex);
+
+ if (release)
+ release_ep_resources(ep);
+ return 0;
+}
+
+static void send_fw_act_open_req(struct c4iw_ep *ep, unsigned int atid)
+{
+ struct sk_buff *skb;
+ struct fw_ofld_connection_wr *req;
+ unsigned int mtu_idx;
+ int wscale;
+ struct sockaddr_in *sin;
+ int win;
+
+ skb = get_skb(NULL, sizeof(*req), GFP_KERNEL);
+ req = (struct fw_ofld_connection_wr *)__skb_put(skb, sizeof(*req));
+ memset(req, 0, sizeof(*req));
+ req->op_compl = htonl(WR_OP_V(FW_OFLD_CONNECTION_WR));
+ req->len16_pkd = htonl(FW_WR_LEN16_V(DIV_ROUND_UP(sizeof(*req), 16)));
+ req->le.filter = cpu_to_be32(cxgb4_select_ntuple(
+ ep->com.dev->rdev.lldi.ports[0],
+ ep->l2t));
+ sin = (struct sockaddr_in *)&ep->com.mapped_local_addr;
+ req->le.lport = sin->sin_port;
+ req->le.u.ipv4.lip = sin->sin_addr.s_addr;
+ sin = (struct sockaddr_in *)&ep->com.mapped_remote_addr;
+ req->le.pport = sin->sin_port;
+ req->le.u.ipv4.pip = sin->sin_addr.s_addr;
+ req->tcb.t_state_to_astid =
+ htonl(FW_OFLD_CONNECTION_WR_T_STATE_V(TCP_SYN_SENT) |
+ FW_OFLD_CONNECTION_WR_ASTID_V(atid));
+ req->tcb.cplrxdataack_cplpassacceptrpl =
+ htons(FW_OFLD_CONNECTION_WR_CPLRXDATAACK_F);
+ req->tcb.tx_max = (__force __be32) jiffies;
+ req->tcb.rcv_adv = htons(1);
+ best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx,
+ enable_tcp_timestamps,
+ (AF_INET == ep->com.remote_addr.ss_family) ? 0 : 1);
+ wscale = compute_wscale(rcv_win);
+
+ /*
+ * Specify the largest window that will fit in opt0. The
+ * remainder will be specified in the rx_data_ack.
+ */
+ win = ep->rcv_win >> 10;
+ if (win > RCV_BUFSIZ_M)
+ win = RCV_BUFSIZ_M;
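+ /*
+ * Rough arithmetic for the clamp above: a 256KB ep->rcv_win gives
+ * win = 262144 >> 10 = 256 (in 1KB units), which fits in the opt0
+ * RCV_BUFSIZ field; a larger window is capped at RCV_BUFSIZ_M and,
+ * as the comment above notes, the remainder is advertised later via
+ * the rx_data_ack.
+ */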
+
+ req->tcb.opt0 = (__force __be64) (TCAM_BYPASS_F |
+ (nocong ? NO_CONG_F : 0) |
+ KEEP_ALIVE_F |
+ DELACK_F |
+ WND_SCALE_V(wscale) |
+ MSS_IDX_V(mtu_idx) |
+ L2T_IDX_V(ep->l2t->idx) |
+ TX_CHAN_V(ep->tx_chan) |
+ SMAC_SEL_V(ep->smac_idx) |
+ DSCP_V(ep->tos) |
+ ULP_MODE_V(ULP_MODE_TCPDDP) |
+ RCV_BUFSIZ_V(win));
+ req->tcb.opt2 = (__force __be32) (PACE_V(1) |
+ TX_QUEUE_V(ep->com.dev->rdev.lldi.tx_modq[ep->tx_chan]) |
+ RX_CHANNEL_V(0) |
+ CCTRL_ECN_V(enable_ecn) |
+ RSS_QUEUE_VALID_F | RSS_QUEUE_V(ep->rss_qid));
+ if (enable_tcp_timestamps)
+ req->tcb.opt2 |= (__force __be32)TSTAMPS_EN_F;
+ if (enable_tcp_sack)
+ req->tcb.opt2 |= (__force __be32)SACK_EN_F;
+ if (wscale && enable_tcp_window_scaling)
+ req->tcb.opt2 |= (__force __be32)WND_SCALE_EN_F;
+ req->tcb.opt0 = cpu_to_be64((__force u64)req->tcb.opt0);
+ req->tcb.opt2 = cpu_to_be32((__force u32)req->tcb.opt2);
+ set_wr_txq(skb, CPL_PRIORITY_CONTROL, ep->ctrlq_idx);
+ set_bit(ACT_OFLD_CONN, &ep->com.history);
+ c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+}
+
+/*
+ * Return whether a failed active open has allocated a TID
+ */
+static inline int act_open_has_tid(int status)
+{
+ return status != CPL_ERR_TCAM_FULL && status != CPL_ERR_CONN_EXIST &&
+ status != CPL_ERR_ARP_MISS;
+}
+
+/*
+ * Return whether a CPL status conveys negative advice.
+ */
+static int is_neg_adv(unsigned int status)
+{
+ return status == CPL_ERR_RTX_NEG_ADVICE ||
+ status == CPL_ERR_PERSIST_NEG_ADVICE ||
+ status == CPL_ERR_KEEPALV_NEG_ADVICE;
+}
+
+static char *neg_adv_str(unsigned int status)
+{
+ switch (status) {
+ case CPL_ERR_RTX_NEG_ADVICE:
+ return "Retransmit timeout";
+ case CPL_ERR_PERSIST_NEG_ADVICE:
+ return "Persist timeout";
+ case CPL_ERR_KEEPALV_NEG_ADVICE:
+ return "Keepalive timeout";
+ default:
+ return "Unknown";
+ }
+}
+
+static void set_tcp_window(struct c4iw_ep *ep, struct port_info *pi)
+{
+ ep->snd_win = snd_win;
+ ep->rcv_win = rcv_win;
+ PDBG("%s snd_win %d rcv_win %d\n", __func__, ep->snd_win, ep->rcv_win);
+}
+
+#define ACT_OPEN_RETRY_COUNT 2
+
+static int import_ep(struct c4iw_ep *ep, int iptype, __u8 *peer_ip,
+ struct dst_entry *dst, struct c4iw_dev *cdev,
+ bool clear_mpa_v1)
+{
+ struct neighbour *n;
+ int err, step;
+ struct net_device *pdev;
+
+ n = dst_neigh_lookup(dst, peer_ip);
+ if (!n)
+ return -ENODEV;
+
+ rcu_read_lock();
+ err = -ENOMEM;
+ if (n->dev->flags & IFF_LOOPBACK) {
+ if (iptype == 4)
+ pdev = ip_dev_find(&init_net, *(__be32 *)peer_ip);
+ else if (IS_ENABLED(CONFIG_IPV6))
+ for_each_netdev(&init_net, pdev) {
+ if (ipv6_chk_addr(&init_net,
+ (struct in6_addr *)peer_ip,
+ pdev, 1))
+ break;
+ }
+ else
+ pdev = NULL;
+
+ if (!pdev) {
+ err = -ENODEV;
+ goto out;
+ }
+ ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t,
+ n, pdev, 0);
+ if (!ep->l2t)
+ goto out;
+ ep->mtu = pdev->mtu;
+ ep->tx_chan = cxgb4_port_chan(pdev);
+ ep->smac_idx = (cxgb4_port_viid(pdev) & 0x7F) << 1;
+ step = cdev->rdev.lldi.ntxq /
+ cdev->rdev.lldi.nchan;
+ ep->txq_idx = cxgb4_port_idx(pdev) * step;
+ step = cdev->rdev.lldi.nrxq /
+ cdev->rdev.lldi.nchan;
+ ep->ctrlq_idx = cxgb4_port_idx(pdev);
+ ep->rss_qid = cdev->rdev.lldi.rxq_ids[
+ cxgb4_port_idx(pdev) * step];
+ set_tcp_window(ep, (struct port_info *)netdev_priv(pdev));
+ dev_put(pdev);
+ } else {
+ pdev = get_real_dev(n->dev);
+ ep->l2t = cxgb4_l2t_get(cdev->rdev.lldi.l2t,
+ n, pdev, 0);
+ if (!ep->l2t)
+ goto out;
+ ep->mtu = dst_mtu(dst);
+ ep->tx_chan = cxgb4_port_chan(pdev);
+ ep->smac_idx = (cxgb4_port_viid(pdev) & 0x7F) << 1;
+ step = cdev->rdev.lldi.ntxq /
+ cdev->rdev.lldi.nchan;
+ ep->txq_idx = cxgb4_port_idx(pdev) * step;
+ ep->ctrlq_idx = cxgb4_port_idx(pdev);
+ step = cdev->rdev.lldi.nrxq /
+ cdev->rdev.lldi.nchan;
+ ep->rss_qid = cdev->rdev.lldi.rxq_ids[
+ cxgb4_port_idx(pdev) * step];
+ set_tcp_window(ep, (struct port_info *)netdev_priv(pdev));
+
+ if (clear_mpa_v1) {
+ ep->retry_with_mpa_v1 = 0;
+ ep->tried_with_mpa_v1 = 0;
+ }
+ }
+ err = 0;
+out:
+ rcu_read_unlock();
+
+ neigh_release(n);
+
+ return err;
+}
+
+static int c4iw_reconnect(struct c4iw_ep *ep)
+{
+ int err = 0;
+ struct sockaddr_in *laddr = (struct sockaddr_in *)
+ &ep->com.cm_id->local_addr;
+ struct sockaddr_in *raddr = (struct sockaddr_in *)
+ &ep->com.cm_id->remote_addr;
+ struct sockaddr_in6 *laddr6 = (struct sockaddr_in6 *)
+ &ep->com.cm_id->local_addr;
+ struct sockaddr_in6 *raddr6 = (struct sockaddr_in6 *)
+ &ep->com.cm_id->remote_addr;
+ int iptype;
+ __u8 *ra;
+
+ PDBG("%s qp %p cm_id %p\n", __func__, ep->com.qp, ep->com.cm_id);
+ init_timer(&ep->timer);
+
+ /*
+ * Allocate an active TID to initiate a TCP connection.
+ */
+ ep->atid = cxgb4_alloc_atid(ep->com.dev->rdev.lldi.tids, ep);
+ if (ep->atid == -1) {
+ pr_err("%s - cannot alloc atid.\n", __func__);
+ err = -ENOMEM;
+ goto fail2;
+ }
+ insert_handle(ep->com.dev, &ep->com.dev->atid_idr, ep, ep->atid);
+
+ /* find a route */
+ if (ep->com.cm_id->local_addr.ss_family == AF_INET) {
+ ep->dst = find_route(ep->com.dev, laddr->sin_addr.s_addr,
+ raddr->sin_addr.s_addr, laddr->sin_port,
+ raddr->sin_port, 0);
+ iptype = 4;
+ ra = (__u8 *)&raddr->sin_addr;
+ } else {
+ ep->dst = find_route6(ep->com.dev, laddr6->sin6_addr.s6_addr,
+ raddr6->sin6_addr.s6_addr,
+ laddr6->sin6_port, raddr6->sin6_port, 0,
+ raddr6->sin6_scope_id);
+ iptype = 6;
+ ra = (__u8 *)&raddr6->sin6_addr;
+ }
+ if (!ep->dst) {
+ pr_err("%s - cannot find route.\n", __func__);
+ err = -EHOSTUNREACH;
+ goto fail3;
+ }
+ err = import_ep(ep, iptype, ra, ep->dst, ep->com.dev, false);
+ if (err) {
+ pr_err("%s - cannot alloc l2e.\n", __func__);
+ goto fail4;
+ }
+
+ PDBG("%s txq_idx %u tx_chan %u smac_idx %u rss_qid %u l2t_idx %u\n",
+ __func__, ep->txq_idx, ep->tx_chan, ep->smac_idx, ep->rss_qid,
+ ep->l2t->idx);
+
+ state_set(&ep->com, CONNECTING);
+ ep->tos = 0;
+
+ /* send connect request to rnic */
+ err = send_connect(ep);
+ if (!err)
+ goto out;
+
+ cxgb4_l2t_release(ep->l2t);
+fail4:
+ dst_release(ep->dst);
+fail3:
+ remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid);
+ cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid);
+fail2:
+ /*
+ * Remember to send a notification to the upper layer.
+ * We are in here, so the upper layer is not aware that this is a
+ * reconnect attempt and is still waiting for the response to the
+ * first connect request.
+ */
+ connect_reply_upcall(ep, -ECONNRESET);
+ c4iw_put_ep(&ep->com);
+out:
+ return err;
+}
+
+static int act_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+ struct c4iw_ep *ep;
+ struct cpl_act_open_rpl *rpl = cplhdr(skb);
+ unsigned int atid = TID_TID_G(AOPEN_ATID_G(
+ ntohl(rpl->atid_status)));
+ struct tid_info *t = dev->rdev.lldi.tids;
+ int status = AOPEN_STATUS_G(ntohl(rpl->atid_status));
+ struct sockaddr_in *la;
+ struct sockaddr_in *ra;
+ struct sockaddr_in6 *la6;
+ struct sockaddr_in6 *ra6;
+
+ ep = lookup_atid(t, atid);
+ la = (struct sockaddr_in *)&ep->com.mapped_local_addr;
+ ra = (struct sockaddr_in *)&ep->com.mapped_remote_addr;
+ la6 = (struct sockaddr_in6 *)&ep->com.mapped_local_addr;
+ ra6 = (struct sockaddr_in6 *)&ep->com.mapped_remote_addr;
+
+ PDBG("%s ep %p atid %u status %u errno %d\n", __func__, ep, atid,
+ status, status2errno(status));
+
+ if (is_neg_adv(status)) {
+ PDBG("%s Connection problems for atid %u status %u (%s)\n",
+ __func__, atid, status, neg_adv_str(status));
+ ep->stats.connect_neg_adv++;
+ mutex_lock(&dev->rdev.stats.lock);
+ dev->rdev.stats.neg_adv++;
+ mutex_unlock(&dev->rdev.stats.lock);
+ return 0;
+ }
+
+ set_bit(ACT_OPEN_RPL, &ep->com.history);
+
+ /*
+ * Log interesting failures.
+ */
+ switch (status) {
+ case CPL_ERR_CONN_RESET:
+ case CPL_ERR_CONN_TIMEDOUT:
+ break;
+ case CPL_ERR_TCAM_FULL:
+ mutex_lock(&dev->rdev.stats.lock);
+ dev->rdev.stats.tcam_full++;
+ mutex_unlock(&dev->rdev.stats.lock);
+ if (ep->com.local_addr.ss_family == AF_INET &&
+ dev->rdev.lldi.enable_fw_ofld_conn) {
+ send_fw_act_open_req(ep,
+ TID_TID_G(AOPEN_ATID_G(
+ ntohl(rpl->atid_status))));
+ return 0;
+ }
+ break;
+ case CPL_ERR_CONN_EXIST:
+ if (ep->retry_count++ < ACT_OPEN_RETRY_COUNT) {
+ set_bit(ACT_RETRY_INUSE, &ep->com.history);
+ remove_handle(ep->com.dev, &ep->com.dev->atid_idr,
+ atid);
+ cxgb4_free_atid(t, atid);
+ dst_release(ep->dst);
+ cxgb4_l2t_release(ep->l2t);
+ c4iw_reconnect(ep);
+ return 0;
+ }
+ break;
+ default:
+ if (ep->com.local_addr.ss_family == AF_INET) {
+ pr_info("Active open failure - atid %u status %u errno %d %pI4:%u->%pI4:%u\n",
+ atid, status, status2errno(status),
+ &la->sin_addr.s_addr, ntohs(la->sin_port),
+ &ra->sin_addr.s_addr, ntohs(ra->sin_port));
+ } else {
+ pr_info("Active open failure - atid %u status %u errno %d %pI6:%u->%pI6:%u\n",
+ atid, status, status2errno(status),
+ la6->sin6_addr.s6_addr, ntohs(la6->sin6_port),
+ ra6->sin6_addr.s6_addr, ntohs(ra6->sin6_port));
+ }
+ break;
+ }
+
+ connect_reply_upcall(ep, status2errno(status));
+ state_set(&ep->com, DEAD);
+
+ if (status && act_open_has_tid(status))
+ cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, GET_TID(rpl));
+
+ remove_handle(ep->com.dev, &ep->com.dev->atid_idr, atid);
+ cxgb4_free_atid(t, atid);
+ dst_release(ep->dst);
+ cxgb4_l2t_release(ep->l2t);
+ c4iw_put_ep(&ep->com);
+
+ return 0;
+}
+
+static int pass_open_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+ struct cpl_pass_open_rpl *rpl = cplhdr(skb);
+ struct tid_info *t = dev->rdev.lldi.tids;
+ unsigned int stid = GET_TID(rpl);
+ struct c4iw_listen_ep *ep = lookup_stid(t, stid);
+
+ if (!ep) {
+ PDBG("%s stid %d lookup failure!\n", __func__, stid);
+ goto out;
+ }
+ PDBG("%s ep %p status %d error %d\n", __func__, ep,
+ rpl->status, status2errno(rpl->status));
+ c4iw_wake_up(&ep->com.wr_wait, status2errno(rpl->status));
+
+out:
+ return 0;
+}
+
+static int close_listsrv_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+ struct cpl_close_listsvr_rpl *rpl = cplhdr(skb);
+ struct tid_info *t = dev->rdev.lldi.tids;
+ unsigned int stid = GET_TID(rpl);
+ struct c4iw_listen_ep *ep = lookup_stid(t, stid);
+
+ PDBG("%s ep %p\n", __func__, ep);
+ c4iw_wake_up(&ep->com.wr_wait, status2errno(rpl->status));
+ return 0;
+}
+
+static void accept_cr(struct c4iw_ep *ep, struct sk_buff *skb,
+ struct cpl_pass_accept_req *req)
+{
+ struct cpl_pass_accept_rpl *rpl;
+ unsigned int mtu_idx;
+ u64 opt0;
+ u32 opt2;
+ int wscale;
+ struct cpl_t5_pass_accept_rpl *rpl5 = NULL;
+ int win;
+
+ PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+ BUG_ON(skb_cloned(skb));
+
+ skb_get(skb);
+ rpl = cplhdr(skb);
+ if (is_t5(ep->com.dev->rdev.lldi.adapter_type)) {
+ skb_trim(skb, roundup(sizeof(*rpl5), 16));
+ rpl5 = (void *)rpl;
+ INIT_TP_WR(rpl5, ep->hwtid);
+ } else {
+ skb_trim(skb, sizeof(*rpl));
+ INIT_TP_WR(rpl, ep->hwtid);
+ }
+ OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL,
+ ep->hwtid));
+
+ best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx,
+ enable_tcp_timestamps && req->tcpopt.tstamp,
+ (AF_INET == ep->com.remote_addr.ss_family) ? 0 : 1);
+ wscale = compute_wscale(rcv_win);
+
+ /*
+ * Specify the largest window that will fit in opt0. The
+ * remainder will be specified in the rx_data_ack.
+ */
+ win = ep->rcv_win >> 10;
+ if (win > RCV_BUFSIZ_M)
+ win = RCV_BUFSIZ_M;
+ opt0 = (nocong ? NO_CONG_F : 0) |
+ KEEP_ALIVE_F |
+ DELACK_F |
+ WND_SCALE_V(wscale) |
+ MSS_IDX_V(mtu_idx) |
+ L2T_IDX_V(ep->l2t->idx) |
+ TX_CHAN_V(ep->tx_chan) |
+ SMAC_SEL_V(ep->smac_idx) |
+ DSCP_V(ep->tos >> 2) |
+ ULP_MODE_V(ULP_MODE_TCPDDP) |
+ RCV_BUFSIZ_V(win);
+ opt2 = RX_CHANNEL_V(0) |
+ RSS_QUEUE_VALID_F | RSS_QUEUE_V(ep->rss_qid);
+
+ if (enable_tcp_timestamps && req->tcpopt.tstamp)
+ opt2 |= TSTAMPS_EN_F;
+ if (enable_tcp_sack && req->tcpopt.sack)
+ opt2 |= SACK_EN_F;
+ if (wscale && enable_tcp_window_scaling)
+ opt2 |= WND_SCALE_EN_F;
+ if (enable_ecn) {
+ const struct tcphdr *tcph;
+ u32 hlen = ntohl(req->hdr_len);
+
+ tcph = (const void *)(req + 1) + ETH_HDR_LEN_G(hlen) +
+ IP_HDR_LEN_G(hlen);
+ if (tcph->ece && tcph->cwr)
+ opt2 |= CCTRL_ECN_V(1);
+ }
+ if (is_t5(ep->com.dev->rdev.lldi.adapter_type)) {
+ u32 isn = (prandom_u32() & ~7UL) - 1;
+ opt2 |= T5_OPT_2_VALID_F;
+ opt2 |= CONG_CNTRL_V(CONG_ALG_TAHOE);
+ opt2 |= T5_ISS_F;
+ rpl5 = (void *)rpl;
+ memset(&rpl5->iss, 0, roundup(sizeof(*rpl5)-sizeof(*rpl), 16));
+ if (peer2peer)
+ isn += 4;
+ rpl5->iss = cpu_to_be32(isn);
+ PDBG("%s iss %u\n", __func__, be32_to_cpu(rpl5->iss));
+ }
+
+ rpl->opt0 = cpu_to_be64(opt0);
+ rpl->opt2 = cpu_to_be32(opt2);
+ set_wr_txq(skb, CPL_PRIORITY_SETUP, ep->ctrlq_idx);
+ t4_set_arp_err_handler(skb, NULL, arp_failure_discard);
+ c4iw_l2t_send(&ep->com.dev->rdev, skb, ep->l2t);
+
+ return;
+}
+
+static void reject_cr(struct c4iw_dev *dev, u32 hwtid, struct sk_buff *skb)
+{
+ PDBG("%s c4iw_dev %p tid %u\n", __func__, dev, hwtid);
+ BUG_ON(skb_cloned(skb));
+ skb_trim(skb, sizeof(struct cpl_tid_release));
+ release_tid(&dev->rdev, hwtid, skb);
+ return;
+}
+
+static void get_4tuple(struct cpl_pass_accept_req *req, int *iptype,
+ __u8 *local_ip, __u8 *peer_ip,
+ __be16 *local_port, __be16 *peer_port)
+{
+ int eth_len = ETH_HDR_LEN_G(be32_to_cpu(req->hdr_len));
+ int ip_len = IP_HDR_LEN_G(be32_to_cpu(req->hdr_len));
+ struct iphdr *ip = (struct iphdr *)((u8 *)(req + 1) + eth_len);
+ struct ipv6hdr *ip6 = (struct ipv6hdr *)((u8 *)(req + 1) + eth_len);
+ struct tcphdr *tcp = (struct tcphdr *)
+ ((u8 *)(req + 1) + eth_len + ip_len);
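+ /*
+ * For a typical untagged IPv4 SYN, for instance, eth_len would be
+ * ETH_HLEN (14) and ip_len 20 (an IPv4 header without options), so
+ * the TCP header sits 34 bytes past the end of the CPL.
+ */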
+
+ if (ip->version == 4) {
+ PDBG("%s saddr 0x%x daddr 0x%x sport %u dport %u\n", __func__,
+ ntohl(ip->saddr), ntohl(ip->daddr), ntohs(tcp->source),
+ ntohs(tcp->dest));
+ *iptype = 4;
+ memcpy(peer_ip, &ip->saddr, 4);
+ memcpy(local_ip, &ip->daddr, 4);
+ } else {
+ PDBG("%s saddr %pI6 daddr %pI6 sport %u dport %u\n", __func__,
+ ip6->saddr.s6_addr, ip6->daddr.s6_addr, ntohs(tcp->source),
+ ntohs(tcp->dest));
+ *iptype = 6;
+ memcpy(peer_ip, ip6->saddr.s6_addr, 16);
+ memcpy(local_ip, ip6->daddr.s6_addr, 16);
+ }
+ *peer_port = tcp->source;
+ *local_port = tcp->dest;
+
+ return;
+}
+
+static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+ struct c4iw_ep *child_ep = NULL, *parent_ep;
+ struct cpl_pass_accept_req *req = cplhdr(skb);
+ unsigned int stid = PASS_OPEN_TID_G(ntohl(req->tos_stid));
+ struct tid_info *t = dev->rdev.lldi.tids;
+ unsigned int hwtid = GET_TID(req);
+ struct dst_entry *dst;
+ __u8 local_ip[16], peer_ip[16];
+ __be16 local_port, peer_port;
+ int err;
+ u16 peer_mss = ntohs(req->tcpopt.mss);
+ int iptype;
+ unsigned short hdrs;
+
+ parent_ep = lookup_stid(t, stid);
+ if (!parent_ep) {
+ PDBG("%s connect request on invalid stid %d\n", __func__, stid);
+ goto reject;
+ }
+
+ if (state_read(&parent_ep->com) != LISTEN) {
+ printk(KERN_ERR "%s - listening ep not in LISTEN\n",
+ __func__);
+ goto reject;
+ }
+
+ get_4tuple(req, &iptype, local_ip, peer_ip, &local_port, &peer_port);
+
+ /* Find output route */
+ if (iptype == 4) {
+ PDBG("%s parent ep %p hwtid %u laddr %pI4 raddr %pI4 lport %d rport %d peer_mss %d\n"
+ , __func__, parent_ep, hwtid,
+ local_ip, peer_ip, ntohs(local_port),
+ ntohs(peer_port), peer_mss);
+ dst = find_route(dev, *(__be32 *)local_ip, *(__be32 *)peer_ip,
+ local_port, peer_port,
+ PASS_OPEN_TOS_G(ntohl(req->tos_stid)));
+ } else {
+ PDBG("%s parent ep %p hwtid %u laddr %pI6 raddr %pI6 lport %d rport %d peer_mss %d\n"
+ , __func__, parent_ep, hwtid,
+ local_ip, peer_ip, ntohs(local_port),
+ ntohs(peer_port), peer_mss);
+ dst = find_route6(dev, local_ip, peer_ip, local_port, peer_port,
+ PASS_OPEN_TOS_G(ntohl(req->tos_stid)),
+ ((struct sockaddr_in6 *)
+ &parent_ep->com.local_addr)->sin6_scope_id);
+ }
+ if (!dst) {
+ printk(KERN_ERR MOD "%s - failed to find dst entry!\n",
+ __func__);
+ goto reject;
+ }
+
+ child_ep = alloc_ep(sizeof(*child_ep), GFP_KERNEL);
+ if (!child_ep) {
+ printk(KERN_ERR MOD "%s - failed to allocate ep entry!\n",
+ __func__);
+ dst_release(dst);
+ goto reject;
+ }
+
+ err = import_ep(child_ep, iptype, peer_ip, dst, dev, false);
+ if (err) {
+ printk(KERN_ERR MOD "%s - failed to allocate l2t entry!\n",
+ __func__);
+ dst_release(dst);
+ kfree(child_ep);
+ goto reject;
+ }
+
+ hdrs = sizeof(struct iphdr) + sizeof(struct tcphdr) +
+ ((enable_tcp_timestamps && req->tcpopt.tstamp) ? 12 : 0);
+ if (peer_mss && child_ep->mtu > (peer_mss + hdrs))
+ child_ep->mtu = peer_mss + hdrs;
+
+ state_set(&child_ep->com, CONNECTING);
+ child_ep->com.dev = dev;
+ child_ep->com.cm_id = NULL;
+
+ /*
+ * The mapped_local and mapped_remote addresses are set up with
+ * the actual 4-tuple. The local address will be based on the
+ * actual local address of the connection, but on the port number
+ * of the parent listening endpoint. The remote address is set up
+ * based on a query to the IWPM since we don't know what it
+ * originally was before mapping. If no mapping was done, then
+ * mapped_remote == remote, and mapped_local == local.
+ */
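+ /*
+ * For instance (illustrative addresses): if the SYN arrived on
+ * 192.0.2.10:61000 for a listener bound to port 5000,
+ * mapped_local_addr becomes 192.0.2.10:61000 while local_addr keeps
+ * the parent's port, i.e. 192.0.2.10:5000.
+ */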
+ if (iptype == 4) {
+ struct sockaddr_in *sin = (struct sockaddr_in *)
+ &child_ep->com.mapped_local_addr;
+
+ sin->sin_family = PF_INET;
+ sin->sin_port = local_port;
+ sin->sin_addr.s_addr = *(__be32 *)local_ip;
+
+ sin = (struct sockaddr_in *)&child_ep->com.local_addr;
+ sin->sin_family = PF_INET;
+ sin->sin_port = ((struct sockaddr_in *)
+ &parent_ep->com.local_addr)->sin_port;
+ sin->sin_addr.s_addr = *(__be32 *)local_ip;
+
+ sin = (struct sockaddr_in *)&child_ep->com.mapped_remote_addr;
+ sin->sin_family = PF_INET;
+ sin->sin_port = peer_port;
+ sin->sin_addr.s_addr = *(__be32 *)peer_ip;
+ } else {
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)
+ &child_ep->com.mapped_local_addr;
+
+ sin6->sin6_family = PF_INET6;
+ sin6->sin6_port = local_port;
+ memcpy(sin6->sin6_addr.s6_addr, local_ip, 16);
+
+ sin6 = (struct sockaddr_in6 *)&child_ep->com.local_addr;
+ sin6->sin6_family = PF_INET6;
+ sin6->sin6_port = ((struct sockaddr_in6 *)
+ &parent_ep->com.local_addr)->sin6_port;
+ memcpy(sin6->sin6_addr.s6_addr, local_ip, 16);
+
+ sin6 = (struct sockaddr_in6 *)&child_ep->com.mapped_remote_addr;
+ sin6->sin6_family = PF_INET6;
+ sin6->sin6_port = peer_port;
+ memcpy(sin6->sin6_addr.s6_addr, peer_ip, 16);
+ }
+ memcpy(&child_ep->com.remote_addr, &child_ep->com.mapped_remote_addr,
+ sizeof(child_ep->com.remote_addr));
+ get_remote_addr(parent_ep, child_ep);
+
+ c4iw_get_ep(&parent_ep->com);
+ child_ep->parent_ep = parent_ep;
+ child_ep->tos = PASS_OPEN_TOS_G(ntohl(req->tos_stid));
+ child_ep->dst = dst;
+ child_ep->hwtid = hwtid;
+
+ PDBG("%s tx_chan %u smac_idx %u rss_qid %u\n", __func__,
+ child_ep->tx_chan, child_ep->smac_idx, child_ep->rss_qid);
+
+ init_timer(&child_ep->timer);
+ cxgb4_insert_tid(t, child_ep, hwtid);
+ insert_handle(dev, &dev->hwtid_idr, child_ep, child_ep->hwtid);
+ accept_cr(child_ep, skb, req);
+ set_bit(PASS_ACCEPT_REQ, &child_ep->com.history);
+ goto out;
+reject:
+ reject_cr(dev, hwtid, skb);
+out:
+ return 0;
+}
+
+static int pass_establish(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+ struct c4iw_ep *ep;
+ struct cpl_pass_establish *req = cplhdr(skb);
+ struct tid_info *t = dev->rdev.lldi.tids;
+ unsigned int tid = GET_TID(req);
+
+ ep = lookup_tid(t, tid);
+ PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+ ep->snd_seq = be32_to_cpu(req->snd_isn);
+ ep->rcv_seq = be32_to_cpu(req->rcv_isn);
+
+ PDBG("%s ep %p hwtid %u tcp_opt 0x%02x\n", __func__, ep, tid,
+ ntohs(req->tcp_opt));
+
+ set_emss(ep, ntohs(req->tcp_opt));
+
+ dst_confirm(ep->dst);
+ state_set(&ep->com, MPA_REQ_WAIT);
+ start_ep_timer(ep);
+ send_flowc(ep, skb);
+ set_bit(PASS_ESTAB, &ep->com.history);
+
+ return 0;
+}
+
+static int peer_close(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+ struct cpl_peer_close *hdr = cplhdr(skb);
+ struct c4iw_ep *ep;
+ struct c4iw_qp_attributes attrs;
+ int disconnect = 1;
+ int release = 0;
+ struct tid_info *t = dev->rdev.lldi.tids;
+ unsigned int tid = GET_TID(hdr);
+ int ret;
+
+ ep = lookup_tid(t, tid);
+ PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+ dst_confirm(ep->dst);
+
+ set_bit(PEER_CLOSE, &ep->com.history);
+ mutex_lock(&ep->com.mutex);
+ switch (ep->com.state) {
+ case MPA_REQ_WAIT:
+ __state_set(&ep->com, CLOSING);
+ break;
+ case MPA_REQ_SENT:
+ __state_set(&ep->com, CLOSING);
+ connect_reply_upcall(ep, -ECONNRESET);
+ break;
+ case MPA_REQ_RCVD:
+
+ /*
+ * We're going to mark this puppy DEAD, but keep
+ * the reference on it until the ULP accepts or
+ * rejects the CR. Also wake up anyone waiting
+ * in rdma connection migration (see c4iw_accept_cr()).
+ */
+ __state_set(&ep->com, CLOSING);
+ PDBG("waking up ep %p tid %u\n", ep, ep->hwtid);
+ c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET);
+ break;
+ case MPA_REP_SENT:
+ __state_set(&ep->com, CLOSING);
+ PDBG("waking up ep %p tid %u\n", ep, ep->hwtid);
+ c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET);
+ break;
+ case FPDU_MODE:
+ start_ep_timer(ep);
+ __state_set(&ep->com, CLOSING);
+ attrs.next_state = C4IW_QP_STATE_CLOSING;
+ ret = c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
+ C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
+ if (ret != -ECONNRESET) {
+ peer_close_upcall(ep);
+ disconnect = 1;
+ }
+ break;
+ case ABORTING:
+ disconnect = 0;
+ break;
+ case CLOSING:
+ __state_set(&ep->com, MORIBUND);
+ disconnect = 0;
+ break;
+ case MORIBUND:
+ (void)stop_ep_timer(ep);
+ if (ep->com.cm_id && ep->com.qp) {
+ attrs.next_state = C4IW_QP_STATE_IDLE;
+ c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
+ C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
+ }
+ close_complete_upcall(ep, 0);
+ __state_set(&ep->com, DEAD);
+ release = 1;
+ disconnect = 0;
+ break;
+ case DEAD:
+ disconnect = 0;
+ break;
+ default:
+ BUG_ON(1);
+ }
+ mutex_unlock(&ep->com.mutex);
+ if (disconnect)
+ c4iw_ep_disconnect(ep, 0, GFP_KERNEL);
+ if (release)
+ release_ep_resources(ep);
+ return 0;
+}
+
+static int peer_abort(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+ struct cpl_abort_req_rss *req = cplhdr(skb);
+ struct c4iw_ep *ep;
+ struct cpl_abort_rpl *rpl;
+ struct sk_buff *rpl_skb;
+ struct c4iw_qp_attributes attrs;
+ int ret;
+ int release = 0;
+ struct tid_info *t = dev->rdev.lldi.tids;
+ unsigned int tid = GET_TID(req);
+
+ ep = lookup_tid(t, tid);
+ if (is_neg_adv(req->status)) {
+ PDBG("%s Negative advice on abort- tid %u status %d (%s)\n",
+ __func__, ep->hwtid, req->status,
+ neg_adv_str(req->status));
+ ep->stats.abort_neg_adv++;
+ mutex_lock(&dev->rdev.stats.lock);
+ dev->rdev.stats.neg_adv++;
+ mutex_unlock(&dev->rdev.stats.lock);
+ return 0;
+ }
+ PDBG("%s ep %p tid %u state %u\n", __func__, ep, ep->hwtid,
+ ep->com.state);
+ set_bit(PEER_ABORT, &ep->com.history);
+
+ /*
+ * Wake up any threads in rdma_init() or rdma_fini().
+ * However, this is not needed if the com state is just
+ * MPA_REQ_SENT.
+ */
+ if (ep->com.state != MPA_REQ_SENT)
+ c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET);
+
+ mutex_lock(&ep->com.mutex);
+ switch (ep->com.state) {
+ case CONNECTING:
+ break;
+ case MPA_REQ_WAIT:
+ (void)stop_ep_timer(ep);
+ break;
+ case MPA_REQ_SENT:
+ (void)stop_ep_timer(ep);
+ if (mpa_rev == 1 || (mpa_rev == 2 && ep->tried_with_mpa_v1))
+ connect_reply_upcall(ep, -ECONNRESET);
+ else {
+ /*
+ * We don't send a notification upwards because we
+ * want to retry with mpa_v1 without the upper layers
+ * even knowing it.
+ *
+ * Do some housekeeping so as to re-initiate the
+ * connection.
+ */
+ PDBG("%s: mpa_rev=%d. Retrying with mpav1\n", __func__,
+ mpa_rev);
+ ep->retry_with_mpa_v1 = 1;
+ }
+ break;
+ case MPA_REP_SENT:
+ break;
+ case MPA_REQ_RCVD:
+ break;
+ case MORIBUND:
+ case CLOSING:
+ stop_ep_timer(ep);
+ /*FALLTHROUGH*/
+ case FPDU_MODE:
+ if (ep->com.cm_id && ep->com.qp) {
+ attrs.next_state = C4IW_QP_STATE_ERROR;
+ ret = c4iw_modify_qp(ep->com.qp->rhp,
+ ep->com.qp, C4IW_QP_ATTR_NEXT_STATE,
+ &attrs, 1);
+ if (ret)
+ printk(KERN_ERR MOD
+ "%s - qp <- error failed!\n",
+ __func__);
+ }
+ peer_abort_upcall(ep);
+ break;
+ case ABORTING:
+ break;
+ case DEAD:
+ PDBG("%s PEER_ABORT IN DEAD STATE!!!!\n", __func__);
+ mutex_unlock(&ep->com.mutex);
+ return 0;
+ default:
+ BUG_ON(1);
+ break;
+ }
+ dst_confirm(ep->dst);
+ if (ep->com.state != ABORTING) {
+ __state_set(&ep->com, DEAD);
+ /* we don't release if we want to retry with mpa_v1 */
+ if (!ep->retry_with_mpa_v1)
+ release = 1;
+ }
+ mutex_unlock(&ep->com.mutex);
+
+ rpl_skb = get_skb(skb, sizeof(*rpl), GFP_KERNEL);
+ if (!rpl_skb) {
+ printk(KERN_ERR MOD "%s - cannot allocate skb!\n",
+ __func__);
+ release = 1;
+ goto out;
+ }
+ set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
+ rpl = (struct cpl_abort_rpl *) skb_put(rpl_skb, sizeof(*rpl));
+ INIT_TP_WR(rpl, ep->hwtid);
+ OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_ABORT_RPL, ep->hwtid));
+ rpl->cmd = CPL_ABORT_NO_RST;
+ c4iw_ofld_send(&ep->com.dev->rdev, rpl_skb);
+out:
+ if (release)
+ release_ep_resources(ep);
+ else if (ep->retry_with_mpa_v1) {
+ remove_handle(ep->com.dev, &ep->com.dev->hwtid_idr, ep->hwtid);
+ cxgb4_remove_tid(ep->com.dev->rdev.lldi.tids, 0, ep->hwtid);
+ dst_release(ep->dst);
+ cxgb4_l2t_release(ep->l2t);
+ c4iw_reconnect(ep);
+ }
+
+ return 0;
+}
+
+static int close_con_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+ struct c4iw_ep *ep;
+ struct c4iw_qp_attributes attrs;
+ struct cpl_close_con_rpl *rpl = cplhdr(skb);
+ int release = 0;
+ struct tid_info *t = dev->rdev.lldi.tids;
+ unsigned int tid = GET_TID(rpl);
+
+ ep = lookup_tid(t, tid);
+
+ PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+ BUG_ON(!ep);
+
+ /* The cm_id may be null if we failed to connect */
+ mutex_lock(&ep->com.mutex);
+ switch (ep->com.state) {
+ case CLOSING:
+ __state_set(&ep->com, MORIBUND);
+ break;
+ case MORIBUND:
+ (void)stop_ep_timer(ep);
+ if ((ep->com.cm_id) && (ep->com.qp)) {
+ attrs.next_state = C4IW_QP_STATE_IDLE;
+ c4iw_modify_qp(ep->com.qp->rhp,
+ ep->com.qp,
+ C4IW_QP_ATTR_NEXT_STATE,
+ &attrs, 1);
+ }
+ close_complete_upcall(ep, 0);
+ __state_set(&ep->com, DEAD);
+ release = 1;
+ break;
+ case ABORTING:
+ case DEAD:
+ break;
+ default:
+ BUG_ON(1);
+ break;
+ }
+ mutex_unlock(&ep->com.mutex);
+ if (release)
+ release_ep_resources(ep);
+ return 0;
+}
+
+static int terminate(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+ struct cpl_rdma_terminate *rpl = cplhdr(skb);
+ struct tid_info *t = dev->rdev.lldi.tids;
+ unsigned int tid = GET_TID(rpl);
+ struct c4iw_ep *ep;
+ struct c4iw_qp_attributes attrs;
+
+ ep = lookup_tid(t, tid);
+ BUG_ON(!ep);
+
+ if (ep && ep->com.qp) {
+ printk(KERN_WARNING MOD "TERM received tid %u qpid %u\n", tid,
+ ep->com.qp->wq.sq.qid);
+ attrs.next_state = C4IW_QP_STATE_TERMINATE;
+ c4iw_modify_qp(ep->com.qp->rhp, ep->com.qp,
+ C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
+ } else
+ printk(KERN_WARNING MOD "TERM received tid %u no ep/qp\n", tid);
+
+ return 0;
+}
+
+/*
+ * Upcall from the adapter indicating data has been transmitted.
+ * For us it's just the single MPA request or reply. We can now free
+ * the skb holding the mpa message.
+ */
+static int fw4_ack(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+ struct c4iw_ep *ep;
+ struct cpl_fw4_ack *hdr = cplhdr(skb);
+ u8 credits = hdr->credits;
+ unsigned int tid = GET_TID(hdr);
+ struct tid_info *t = dev->rdev.lldi.tids;
+
+
+ ep = lookup_tid(t, tid);
+ PDBG("%s ep %p tid %u credits %u\n", __func__, ep, ep->hwtid, credits);
+ if (credits == 0) {
+ PDBG("%s 0 credit ack ep %p tid %u state %u\n",
+ __func__, ep, ep->hwtid, state_read(&ep->com));
+ return 0;
+ }
+
+ dst_confirm(ep->dst);
+ if (ep->mpa_skb) {
+ PDBG("%s last streaming msg ack ep %p tid %u state %u "
+ "initiator %u freeing skb\n", __func__, ep, ep->hwtid,
+ state_read(&ep->com), ep->mpa_attr.initiator ? 1 : 0);
+ kfree_skb(ep->mpa_skb);
+ ep->mpa_skb = NULL;
+ }
+ return 0;
+}
+
+int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
+{
+ int err = 0;
+ int disconnect = 0;
+ struct c4iw_ep *ep = to_ep(cm_id);
+ PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+
+ mutex_lock(&ep->com.mutex);
+ if (ep->com.state == DEAD) {
+ mutex_unlock(&ep->com.mutex);
+ c4iw_put_ep(&ep->com);
+ return -ECONNRESET;
+ }
+ set_bit(ULP_REJECT, &ep->com.history);
+ BUG_ON(ep->com.state != MPA_REQ_RCVD);
+ if (mpa_rev == 0)
+ abort_connection(ep, NULL, GFP_KERNEL);
+ else {
+ err = send_mpa_reject(ep, pdata, pdata_len);
+ disconnect = 1;
+ }
+ mutex_unlock(&ep->com.mutex);
+ if (disconnect)
+ err = c4iw_ep_disconnect(ep, 0, GFP_KERNEL);
+ c4iw_put_ep(&ep->com);
+ return 0;
+}
+
+int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
+{
+ int err;
+ struct c4iw_qp_attributes attrs;
+ enum c4iw_qp_attr_mask mask;
+ struct c4iw_ep *ep = to_ep(cm_id);
+ struct c4iw_dev *h = to_c4iw_dev(cm_id->device);
+ struct c4iw_qp *qp = get_qhp(h, conn_param->qpn);
+
+ PDBG("%s ep %p tid %u\n", __func__, ep, ep->hwtid);
+
+ mutex_lock(&ep->com.mutex);
+ if (ep->com.state == DEAD) {
+ err = -ECONNRESET;
+ goto err;
+ }
+
+ BUG_ON(ep->com.state != MPA_REQ_RCVD);
+ BUG_ON(!qp);
+
+ set_bit(ULP_ACCEPT, &ep->com.history);
+ if ((conn_param->ord > cur_max_read_depth(ep->com.dev)) ||
+ (conn_param->ird > cur_max_read_depth(ep->com.dev))) {
+ abort_connection(ep, NULL, GFP_KERNEL);
+ err = -EINVAL;
+ goto err;
+ }
+
+ if (ep->mpa_attr.version == 2 && ep->mpa_attr.enhanced_rdma_conn) {
+ if (conn_param->ord > ep->ird) {
+ if (RELAXED_IRD_NEGOTIATION) {
+ ep->ord = ep->ird;
+ } else {
+ ep->ird = conn_param->ird;
+ ep->ord = conn_param->ord;
+ send_mpa_reject(ep, conn_param->private_data,
+ conn_param->private_data_len);
+ abort_connection(ep, NULL, GFP_KERNEL);
+ err = -ENOMEM;
+ goto err;
+ }
+ }
+ if (conn_param->ird < ep->ord) {
+ if (RELAXED_IRD_NEGOTIATION &&
+ ep->ord <= h->rdev.lldi.max_ordird_qp) {
+ conn_param->ird = ep->ord;
+ } else {
+ abort_connection(ep, NULL, GFP_KERNEL);
+ err = -ENOMEM;
+ goto err;
+ }
+ }
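+ /*
+ * For example, with conn_param->ird = 4 and a negotiated ep->ord of
+ * 8, a relaxed build (max_ordird_qp permitting) bumps conn_param->ird
+ * up to 8; a strict build aborts the connection with -ENOMEM instead.
+ */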
+ }
+ ep->ird = conn_param->ird;
+ ep->ord = conn_param->ord;
+
+ if (ep->mpa_attr.version == 1) {
+ if (peer2peer && ep->ird == 0)
+ ep->ird = 1;
+ } else {
+ if (peer2peer &&
+ (ep->mpa_attr.p2p_type != FW_RI_INIT_P2PTYPE_DISABLED) &&
+ (p2p_type == FW_RI_INIT_P2PTYPE_READ_REQ) && ep->ord == 0)
+ ep->ird = 1;
+ }
+
+ PDBG("%s %d ird %d ord %d\n", __func__, __LINE__, ep->ird, ep->ord);
+
+ cm_id->add_ref(cm_id);
+ ep->com.cm_id = cm_id;
+ ep->com.qp = qp;
+ ref_qp(ep);
+
+ /* bind QP to EP and move to RTS */
+ attrs.mpa_attr = ep->mpa_attr;
+ attrs.max_ird = ep->ird;
+ attrs.max_ord = ep->ord;
+ attrs.llp_stream_handle = ep;
+ attrs.next_state = C4IW_QP_STATE_RTS;
+
+ /* bind QP and TID with INIT_WR */
+ mask = C4IW_QP_ATTR_NEXT_STATE |
+ C4IW_QP_ATTR_LLP_STREAM_HANDLE |
+ C4IW_QP_ATTR_MPA_ATTR |
+ C4IW_QP_ATTR_MAX_IRD |
+ C4IW_QP_ATTR_MAX_ORD;
+
+ err = c4iw_modify_qp(ep->com.qp->rhp,
+ ep->com.qp, mask, &attrs, 1);
+ if (err)
+ goto err1;
+ err = send_mpa_reply(ep, conn_param->private_data,
+ conn_param->private_data_len);
+ if (err)
+ goto err1;
+
+ __state_set(&ep->com, FPDU_MODE);
+ established_upcall(ep);
+ mutex_unlock(&ep->com.mutex);
+ c4iw_put_ep(&ep->com);
+ return 0;
+err1:
+ ep->com.cm_id = NULL;
+ abort_connection(ep, NULL, GFP_KERNEL);
+ cm_id->rem_ref(cm_id);
+err:
+ mutex_unlock(&ep->com.mutex);
+ c4iw_put_ep(&ep->com);
+ return err;
+}
+
+static int pick_local_ipaddrs(struct c4iw_dev *dev, struct iw_cm_id *cm_id)
+{
+ struct in_device *ind;
+ int found = 0;
+ struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr;
+ struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr;
+
+ ind = in_dev_get(dev->rdev.lldi.ports[0]);
+ if (!ind)
+ return -EADDRNOTAVAIL;
+ for_primary_ifa(ind) {
+ laddr->sin_addr.s_addr = ifa->ifa_address;
+ raddr->sin_addr.s_addr = ifa->ifa_address;
+ found = 1;
+ break;
+ }
+ endfor_ifa(ind);
+ in_dev_put(ind);
+ return found ? 0 : -EADDRNOTAVAIL;
+}
+
+static int get_lladdr(struct net_device *dev, struct in6_addr *addr,
+ unsigned char banned_flags)
+{
+ struct inet6_dev *idev;
+ int err = -EADDRNOTAVAIL;
+
+ rcu_read_lock();
+ idev = __in6_dev_get(dev);
+ if (idev != NULL) {
+ struct inet6_ifaddr *ifp;
+
+ read_lock_bh(&idev->lock);
+ list_for_each_entry(ifp, &idev->addr_list, if_list) {
+ if (ifp->scope == IFA_LINK &&
+ !(ifp->flags & banned_flags)) {
+ memcpy(addr, &ifp->addr, 16);
+ err = 0;
+ break;
+ }
+ }
+ read_unlock_bh(&idev->lock);
+ }
+ rcu_read_unlock();
+ return err;
+}
+
+static int pick_local_ip6addrs(struct c4iw_dev *dev, struct iw_cm_id *cm_id)
+{
+ struct in6_addr uninitialized_var(addr);
+ struct sockaddr_in6 *la6 = (struct sockaddr_in6 *)&cm_id->local_addr;
+ struct sockaddr_in6 *ra6 = (struct sockaddr_in6 *)&cm_id->remote_addr;
+
+ if (get_lladdr(dev->rdev.lldi.ports[0], &addr, IFA_F_TENTATIVE)) {
+ memcpy(la6->sin6_addr.s6_addr, &addr, 16);
+ memcpy(ra6->sin6_addr.s6_addr, &addr, 16);
+ return 0;
+ }
+ return -EADDRNOTAVAIL;
+}
+
+int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
+{
+ struct c4iw_dev *dev = to_c4iw_dev(cm_id->device);
+ struct c4iw_ep *ep;
+ int err = 0;
+ struct sockaddr_in *laddr;
+ struct sockaddr_in *raddr;
+ struct sockaddr_in6 *laddr6;
+ struct sockaddr_in6 *raddr6;
+ struct iwpm_dev_data pm_reg_msg;
+ struct iwpm_sa_data pm_msg;
+ __u8 *ra;
+ int iptype;
+ int iwpm_err = 0;
+
+ if ((conn_param->ord > cur_max_read_depth(dev)) ||
+ (conn_param->ird > cur_max_read_depth(dev))) {
+ err = -EINVAL;
+ goto out;
+ }
+ ep = alloc_ep(sizeof(*ep), GFP_KERNEL);
+ if (!ep) {
+ printk(KERN_ERR MOD "%s - cannot alloc ep.\n", __func__);
+ err = -ENOMEM;
+ goto out;
+ }
+ init_timer(&ep->timer);
+ ep->plen = conn_param->private_data_len;
+ if (ep->plen)
+ memcpy(ep->mpa_pkt + sizeof(struct mpa_message),
+ conn_param->private_data, ep->plen);
+ ep->ird = conn_param->ird;
+ ep->ord = conn_param->ord;
+
+ if (peer2peer && ep->ord == 0)
+ ep->ord = 1;
+
+ cm_id->add_ref(cm_id);
+ ep->com.dev = dev;
+ ep->com.cm_id = cm_id;
+ ep->com.qp = get_qhp(dev, conn_param->qpn);
+ if (!ep->com.qp) {
+ PDBG("%s qpn 0x%x not found!\n", __func__, conn_param->qpn);
+ err = -EINVAL;
+ goto fail1;
+ }
+ ref_qp(ep);
+ PDBG("%s qpn 0x%x qp %p cm_id %p\n", __func__, conn_param->qpn,
+ ep->com.qp, cm_id);
+
+ /*
+ * Allocate an active TID to initiate a TCP connection.
+ */
+ ep->atid = cxgb4_alloc_atid(dev->rdev.lldi.tids, ep);
+ if (ep->atid == -1) {
+ printk(KERN_ERR MOD "%s - cannot alloc atid.\n", __func__);
+ err = -ENOMEM;
+ goto fail1;
+ }
+ insert_handle(dev, &dev->atid_idr, ep, ep->atid);
+
+ memcpy(&ep->com.local_addr, &cm_id->local_addr,
+ sizeof(ep->com.local_addr));
+ memcpy(&ep->com.remote_addr, &cm_id->remote_addr,
+ sizeof(ep->com.remote_addr));
+
+ /* No port mapper available, go with the specified peer information */
+ memcpy(&ep->com.mapped_local_addr, &cm_id->local_addr,
+ sizeof(ep->com.mapped_local_addr));
+ memcpy(&ep->com.mapped_remote_addr, &cm_id->remote_addr,
+ sizeof(ep->com.mapped_remote_addr));
+
+ c4iw_form_reg_msg(dev, &pm_reg_msg);
+ iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_C4IW);
+ if (iwpm_err) {
+ PDBG("%s: Port Mapper reg pid fail (err = %d).\n",
+ __func__, iwpm_err);
+ }
+ if (iwpm_valid_pid() && !iwpm_err) {
+ c4iw_form_pm_msg(ep, &pm_msg);
+ iwpm_err = iwpm_add_and_query_mapping(&pm_msg, RDMA_NL_C4IW);
+ if (iwpm_err)
+ PDBG("%s: Port Mapper query fail (err = %d).\n",
+ __func__, iwpm_err);
+ else
+ c4iw_record_pm_msg(ep, &pm_msg);
+ }
+ if (iwpm_create_mapinfo(&ep->com.local_addr,
+ &ep->com.mapped_local_addr, RDMA_NL_C4IW)) {
+ iwpm_remove_mapping(&ep->com.local_addr, RDMA_NL_C4IW);
+ err = -ENOMEM;
+ goto fail1;
+ }
+ print_addr(&ep->com, __func__, "add_query/create_mapinfo");
+ set_bit(RELEASE_MAPINFO, &ep->com.flags);
+
+ laddr = (struct sockaddr_in *)&ep->com.mapped_local_addr;
+ raddr = (struct sockaddr_in *)&ep->com.mapped_remote_addr;
+ laddr6 = (struct sockaddr_in6 *)&ep->com.mapped_local_addr;
+ raddr6 = (struct sockaddr_in6 *) &ep->com.mapped_remote_addr;
+
+ if (cm_id->remote_addr.ss_family == AF_INET) {
+ iptype = 4;
+ ra = (__u8 *)&raddr->sin_addr;
+
+ /*
+ * Handle loopback requests to INADDR_ANY.
+ */
+ if ((__force int)raddr->sin_addr.s_addr == INADDR_ANY) {
+ err = pick_local_ipaddrs(dev, cm_id);
+ if (err)
+ goto fail1;
+ }
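+ /*
+ * pick_local_ipaddrs() above rewrites both cm_id addresses to the
+ * first port's primary IPv4 address; e.g. if that address were
+ * 192.0.2.1, laddr and raddr would both resolve to 192.0.2.1 while
+ * the port numbers stay untouched.
+ */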
+
+ /* find a route */
+ PDBG("%s saddr %pI4 sport 0x%x raddr %pI4 rport 0x%x\n",
+ __func__, &laddr->sin_addr, ntohs(laddr->sin_port),
+ ra, ntohs(raddr->sin_port));
+ ep->dst = find_route(dev, laddr->sin_addr.s_addr,
+ raddr->sin_addr.s_addr, laddr->sin_port,
+ raddr->sin_port, 0);
+ } else {
+ iptype = 6;
+ ra = (__u8 *)&raddr6->sin6_addr;
+
+ /*
+ * Handle loopback requests to INADDR_ANY.
+ */
+ if (ipv6_addr_type(&raddr6->sin6_addr) == IPV6_ADDR_ANY) {
+ err = pick_local_ip6addrs(dev, cm_id);
+ if (err)
+ goto fail1;
+ }
+
+ /* find a route */
+ PDBG("%s saddr %pI6 sport 0x%x raddr %pI6 rport 0x%x\n",
+ __func__, laddr6->sin6_addr.s6_addr,
+ ntohs(laddr6->sin6_port),
+ raddr6->sin6_addr.s6_addr, ntohs(raddr6->sin6_port));
+ ep->dst = find_route6(dev, laddr6->sin6_addr.s6_addr,
+ raddr6->sin6_addr.s6_addr,
+ laddr6->sin6_port, raddr6->sin6_port, 0,
+ raddr6->sin6_scope_id);
+ }
+ if (!ep->dst) {
+ printk(KERN_ERR MOD "%s - cannot find route.\n", __func__);
+ err = -EHOSTUNREACH;
+ goto fail2;
+ }
+
+ err = import_ep(ep, iptype, ra, ep->dst, ep->com.dev, true);
+ if (err) {
+ printk(KERN_ERR MOD "%s - cannot alloc l2e.\n", __func__);
+ goto fail3;
+ }
+
+ PDBG("%s txq_idx %u tx_chan %u smac_idx %u rss_qid %u l2t_idx %u\n",
+ __func__, ep->txq_idx, ep->tx_chan, ep->smac_idx, ep->rss_qid,
+ ep->l2t->idx);
+
+ state_set(&ep->com, CONNECTING);
+ ep->tos = 0;
+
+ /* send connect request to rnic */
+ err = send_connect(ep);
+ if (!err)
+ goto out;
+
+ cxgb4_l2t_release(ep->l2t);
+fail3:
+ dst_release(ep->dst);
+fail2:
+ remove_handle(ep->com.dev, &ep->com.dev->atid_idr, ep->atid);
+ cxgb4_free_atid(ep->com.dev->rdev.lldi.tids, ep->atid);
+fail1:
+ cm_id->rem_ref(cm_id);
+ c4iw_put_ep(&ep->com);
+out:
+ return err;
+}
+
+static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep)
+{
+ int err;
+ struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)
+ &ep->com.mapped_local_addr;
+
+ c4iw_init_wr_wait(&ep->com.wr_wait);
+ err = cxgb4_create_server6(ep->com.dev->rdev.lldi.ports[0],
+ ep->stid, &sin6->sin6_addr,
+ sin6->sin6_port,
+ ep->com.dev->rdev.lldi.rxq_ids[0]);
+ if (!err)
+ err = c4iw_wait_for_reply(&ep->com.dev->rdev,
+ &ep->com.wr_wait,
+ 0, 0, __func__);
+ else if (err > 0)
+ err = net_xmit_errno(err);
+ if (err)
+ pr_err("cxgb4_create_server6/filter failed err %d stid %d laddr %pI6 lport %d\n",
+ err, ep->stid,
+ sin6->sin6_addr.s6_addr, ntohs(sin6->sin6_port));
+ return err;
+}
+
+static int create_server4(struct c4iw_dev *dev, struct c4iw_listen_ep *ep)
+{
+ int err;
+ struct sockaddr_in *sin = (struct sockaddr_in *)
+ &ep->com.mapped_local_addr;
+
+ if (dev->rdev.lldi.enable_fw_ofld_conn) {
+ do {
+ err = cxgb4_create_server_filter(
+ ep->com.dev->rdev.lldi.ports[0], ep->stid,
+ sin->sin_addr.s_addr, sin->sin_port, 0,
+ ep->com.dev->rdev.lldi.rxq_ids[0], 0, 0);
+ if (err == -EBUSY) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(usecs_to_jiffies(100));
+ }
+ } while (err == -EBUSY);
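+ /*
+ * cxgb4_create_server_filter() can transiently return -EBUSY;
+ * the loop above sleeps briefly (usecs_to_jiffies(100), i.e. at
+ * least one jiffy) and retries until it either succeeds or fails
+ * with a different error.
+ */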
+ } else {
+ c4iw_init_wr_wait(&ep->com.wr_wait);
+ err = cxgb4_create_server(ep->com.dev->rdev.lldi.ports[0],
+ ep->stid, sin->sin_addr.s_addr, sin->sin_port,
+ 0, ep->com.dev->rdev.lldi.rxq_ids[0]);
+ if (!err)
+ err = c4iw_wait_for_reply(&ep->com.dev->rdev,
+ &ep->com.wr_wait,
+ 0, 0, __func__);
+ else if (err > 0)
+ err = net_xmit_errno(err);
+ }
+ if (err)
+ pr_err("cxgb4_create_server/filter failed err %d stid %d laddr %pI4 lport %d\n"
+ , err, ep->stid,
+ &sin->sin_addr, ntohs(sin->sin_port));
+ return err;
+}
+
+int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog)
+{
+ int err = 0;
+ struct c4iw_dev *dev = to_c4iw_dev(cm_id->device);
+ struct c4iw_listen_ep *ep;
+ struct iwpm_dev_data pm_reg_msg;
+ struct iwpm_sa_data pm_msg;
+ int iwpm_err = 0;
+
+ might_sleep();
+
+ ep = alloc_ep(sizeof(*ep), GFP_KERNEL);
+ if (!ep) {
+ printk(KERN_ERR MOD "%s - cannot alloc ep.\n", __func__);
+ err = -ENOMEM;
+ goto fail1;
+ }
+ PDBG("%s ep %p\n", __func__, ep);
+ cm_id->add_ref(cm_id);
+ ep->com.cm_id = cm_id;
+ ep->com.dev = dev;
+ ep->backlog = backlog;
+ memcpy(&ep->com.local_addr, &cm_id->local_addr,
+ sizeof(ep->com.local_addr));
+
+ /*
+ * Allocate a server TID.
+ */
+ if (dev->rdev.lldi.enable_fw_ofld_conn &&
+ ep->com.local_addr.ss_family == AF_INET)
+ ep->stid = cxgb4_alloc_sftid(dev->rdev.lldi.tids,
+ cm_id->local_addr.ss_family, ep);
+ else
+ ep->stid = cxgb4_alloc_stid(dev->rdev.lldi.tids,
+ cm_id->local_addr.ss_family, ep);
+
+ if (ep->stid == -1) {
+ printk(KERN_ERR MOD "%s - cannot alloc stid.\n", __func__);
+ err = -ENOMEM;
+ goto fail2;
+ }
+ insert_handle(dev, &dev->stid_idr, ep, ep->stid);
+
+ /* No port mapper available, go with the specified info */
+ memcpy(&ep->com.mapped_local_addr, &cm_id->local_addr,
+ sizeof(ep->com.mapped_local_addr));
+
+ c4iw_form_reg_msg(dev, &pm_reg_msg);
+ iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_C4IW);
+ if (iwpm_err) {
+ PDBG("%s: Port Mapper reg pid fail (err = %d).\n",
+ __func__, iwpm_err);
+ }
+ if (iwpm_valid_pid() && !iwpm_err) {
+ memcpy(&pm_msg.loc_addr, &ep->com.local_addr,
+ sizeof(ep->com.local_addr));
+ iwpm_err = iwpm_add_mapping(&pm_msg, RDMA_NL_C4IW);
+ if (iwpm_err)
+ PDBG("%s: Port Mapper query fail (err = %d).\n",
+ __func__, iwpm_err);
+ else
+ memcpy(&ep->com.mapped_local_addr,
+ &pm_msg.mapped_loc_addr,
+ sizeof(ep->com.mapped_local_addr));
+ }
+ if (iwpm_create_mapinfo(&ep->com.local_addr,
+ &ep->com.mapped_local_addr, RDMA_NL_C4IW)) {
+ err = -ENOMEM;
+ goto fail3;
+ }
+ print_addr(&ep->com, __func__, "add_mapping/create_mapinfo");
+
+ set_bit(RELEASE_MAPINFO, &ep->com.flags);
+ state_set(&ep->com, LISTEN);
+ if (ep->com.local_addr.ss_family == AF_INET)
+ err = create_server4(dev, ep);
+ else
+ err = create_server6(dev, ep);
+ if (!err) {
+ cm_id->provider_data = ep;
+ goto out;
+ }
+
+fail3:
+ cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid,
+ ep->com.local_addr.ss_family);
+fail2:
+ cm_id->rem_ref(cm_id);
+ c4iw_put_ep(&ep->com);
+fail1:
+out:
+ return err;
+}
+
+int c4iw_destroy_listen(struct iw_cm_id *cm_id)
+{
+ int err;
+ struct c4iw_listen_ep *ep = to_listen_ep(cm_id);
+
+ PDBG("%s ep %p\n", __func__, ep);
+
+ might_sleep();
+ state_set(&ep->com, DEAD);
+ if (ep->com.dev->rdev.lldi.enable_fw_ofld_conn &&
+ ep->com.local_addr.ss_family == AF_INET) {
+ err = cxgb4_remove_server_filter(
+ ep->com.dev->rdev.lldi.ports[0], ep->stid,
+ ep->com.dev->rdev.lldi.rxq_ids[0], 0);
+ } else {
+ c4iw_init_wr_wait(&ep->com.wr_wait);
+ err = cxgb4_remove_server(
+ ep->com.dev->rdev.lldi.ports[0], ep->stid,
+ ep->com.dev->rdev.lldi.rxq_ids[0], 0);
+ if (err)
+ goto done;
+ err = c4iw_wait_for_reply(&ep->com.dev->rdev, &ep->com.wr_wait,
+ 0, 0, __func__);
+ }
+ remove_handle(ep->com.dev, &ep->com.dev->stid_idr, ep->stid);
+ cxgb4_free_stid(ep->com.dev->rdev.lldi.tids, ep->stid,
+ ep->com.local_addr.ss_family);
+done:
+ cm_id->rem_ref(cm_id);
+ c4iw_put_ep(&ep->com);
+ return err;
+}
+
+int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp)
+{
+ int ret = 0;
+ int close = 0;
+ int fatal = 0;
+ struct c4iw_rdev *rdev;
+
+ mutex_lock(&ep->com.mutex);
+
+ PDBG("%s ep %p state %s, abrupt %d\n", __func__, ep,
+ states[ep->com.state], abrupt);
+
+ rdev = &ep->com.dev->rdev;
+ if (c4iw_fatal_error(rdev)) {
+ fatal = 1;
+ close_complete_upcall(ep, -EIO);
+ ep->com.state = DEAD;
+ }
+ switch (ep->com.state) {
+ case MPA_REQ_WAIT:
+ case MPA_REQ_SENT:
+ case MPA_REQ_RCVD:
+ case MPA_REP_SENT:
+ case FPDU_MODE:
+ close = 1;
+ if (abrupt)
+ ep->com.state = ABORTING;
+ else {
+ ep->com.state = CLOSING;
+ start_ep_timer(ep);
+ }
+ set_bit(CLOSE_SENT, &ep->com.flags);
+ break;
+ case CLOSING:
+ if (!test_and_set_bit(CLOSE_SENT, &ep->com.flags)) {
+ close = 1;
+ if (abrupt) {
+ (void)stop_ep_timer(ep);
+ ep->com.state = ABORTING;
+ } else
+ ep->com.state = MORIBUND;
+ }
+ break;
+ case MORIBUND:
+ case ABORTING:
+ case DEAD:
+ PDBG("%s ignoring disconnect ep %p state %u\n",
+ __func__, ep, ep->com.state);
+ break;
+ default:
+ BUG();
+ break;
+ }
+
+ if (close) {
+ if (abrupt) {
+ set_bit(EP_DISC_ABORT, &ep->com.history);
+ close_complete_upcall(ep, -ECONNRESET);
+ ret = send_abort(ep, NULL, gfp);
+ } else {
+ set_bit(EP_DISC_CLOSE, &ep->com.history);
+ ret = send_halfclose(ep, gfp);
+ }
+ if (ret)
+ fatal = 1;
+ }
+ mutex_unlock(&ep->com.mutex);
+ if (fatal)
+ release_ep_resources(ep);
+ return ret;
+}
+
+static void active_ofld_conn_reply(struct c4iw_dev *dev, struct sk_buff *skb,
+ struct cpl_fw6_msg_ofld_connection_wr_rpl *req)
+{
+ struct c4iw_ep *ep;
+ int atid = be32_to_cpu(req->tid);
+
+ ep = (struct c4iw_ep *)lookup_atid(dev->rdev.lldi.tids,
+ (__force u32) req->tid);
+ if (!ep)
+ return;
+
+ switch (req->retval) {
+ case FW_ENOMEM:
+ set_bit(ACT_RETRY_NOMEM, &ep->com.history);
+ if (ep->retry_count++ < ACT_OPEN_RETRY_COUNT) {
+ send_fw_act_open_req(ep, atid);
+ return;
+ }
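+ /* retries exhausted; fall through */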
+ case FW_EADDRINUSE:
+ set_bit(ACT_RETRY_INUSE, &ep->com.history);
+ if (ep->retry_count++ < ACT_OPEN_RETRY_COUNT) {
+ send_fw_act_open_req(ep, atid);
+ return;
+ }
+ break;
+ default:
+ pr_info("%s unexpected ofld conn wr retval %d\n",
+ __func__, req->retval);
+ break;
+ }
+ pr_err("active ofld_connect_wr failure %d atid %d\n",
+ req->retval, atid);
+ mutex_lock(&dev->rdev.stats.lock);
+ dev->rdev.stats.act_ofld_conn_fails++;
+ mutex_unlock(&dev->rdev.stats.lock);
+ connect_reply_upcall(ep, status2errno(req->retval));
+ state_set(&ep->com, DEAD);
+ remove_handle(dev, &dev->atid_idr, atid);
+ cxgb4_free_atid(dev->rdev.lldi.tids, atid);
+ dst_release(ep->dst);
+ cxgb4_l2t_release(ep->l2t);
+ c4iw_put_ep(&ep->com);
+}
+
+static void passive_ofld_conn_reply(struct c4iw_dev *dev, struct sk_buff *skb,
+ struct cpl_fw6_msg_ofld_connection_wr_rpl *req)
+{
+ struct sk_buff *rpl_skb;
+ struct cpl_pass_accept_req *cpl;
+ int ret;
+
+ rpl_skb = (struct sk_buff *)(unsigned long)req->cookie;
+ BUG_ON(!rpl_skb);
+ if (req->retval) {
+ PDBG("%s passive open failure %d\n", __func__, req->retval);
+ mutex_lock(&dev->rdev.stats.lock);
+ dev->rdev.stats.pas_ofld_conn_fails++;
+ mutex_unlock(&dev->rdev.stats.lock);
+ kfree_skb(rpl_skb);
+ } else {
+ cpl = (struct cpl_pass_accept_req *)cplhdr(rpl_skb);
+ OPCODE_TID(cpl) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_REQ,
+ (__force u32) htonl(
+ (__force u32) req->tid)));
+ ret = pass_accept_req(dev, rpl_skb);
+ if (!ret)
+ kfree_skb(rpl_skb);
+ }
+ return;
+}
+
+static int deferred_fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+ struct cpl_fw6_msg *rpl = cplhdr(skb);
+ struct cpl_fw6_msg_ofld_connection_wr_rpl *req;
+
+ switch (rpl->type) {
+ case FW6_TYPE_CQE:
+ c4iw_ev_dispatch(dev, (struct t4_cqe *)&rpl->data[0]);
+ break;
+ case FW6_TYPE_OFLD_CONNECTION_WR_RPL:
+ req = (struct cpl_fw6_msg_ofld_connection_wr_rpl *)rpl->data;
+ switch (req->t_state) {
+ case TCP_SYN_SENT:
+ active_ofld_conn_reply(dev, skb, req);
+ break;
+ case TCP_SYN_RECV:
+ passive_ofld_conn_reply(dev, skb, req);
+ break;
+ default:
+ pr_err("%s unexpected ofld conn wr state %d\n",
+ __func__, req->t_state);
+ break;
+ }
+ break;
+ }
+ return 0;
+}
+
+static void build_cpl_pass_accept_req(struct sk_buff *skb, int stid, u8 tos)
+{
+ u32 l2info;
+ u16 vlantag, len, hdr_len, eth_hdr_len;
+ u8 intf;
+ struct cpl_rx_pkt *cpl = cplhdr(skb);
+ struct cpl_pass_accept_req *req;
+ struct tcp_options_received tmp_opt;
+ struct c4iw_dev *dev;
+
+ dev = *((struct c4iw_dev **) (skb->cb + sizeof(void *)));
+ /* Store values from cpl_rx_pkt in temporary location. */
+ vlantag = (__force u16) cpl->vlan;
+ len = (__force u16) cpl->len;
+ l2info = (__force u32) cpl->l2info;
+ hdr_len = (__force u16) cpl->hdr_len;
+ intf = cpl->iff;
+
+ __skb_pull(skb, sizeof(*req) + sizeof(struct rss_header));
+
+ /*
+ * We need to parse the TCP options from the SYN packet
+ * to generate the cpl_pass_accept_req.
+ */
+ memset(&tmp_opt, 0, sizeof(tmp_opt));
+ tcp_clear_options(&tmp_opt);
+ tcp_parse_options(skb, &tmp_opt, 0, NULL);
+
+ req = (struct cpl_pass_accept_req *)__skb_push(skb, sizeof(*req));
+ memset(req, 0, sizeof(*req));
+ req->l2info = cpu_to_be16(SYN_INTF_V(intf) |
+ SYN_MAC_IDX_V(RX_MACIDX_G(
+ (__force int) htonl(l2info))) |
+ SYN_XACT_MATCH_F);
+ eth_hdr_len = is_t4(dev->rdev.lldi.adapter_type) ?
+ RX_ETHHDR_LEN_G((__force int)htonl(l2info)) :
+ RX_T5_ETHHDR_LEN_G((__force int)htonl(l2info));
+ req->hdr_len = cpu_to_be32(SYN_RX_CHAN_V(RX_CHAN_G(
+ (__force int) htonl(l2info))) |
+ TCP_HDR_LEN_V(RX_TCPHDR_LEN_G(
+ (__force int) htons(hdr_len))) |
+ IP_HDR_LEN_V(RX_IPHDR_LEN_G(
+ (__force int) htons(hdr_len))) |
+ ETH_HDR_LEN_V(RX_ETHHDR_LEN_G(eth_hdr_len)));
+ req->vlan = (__force __be16) vlantag;
+ req->len = (__force __be16) len;
+ req->tos_stid = cpu_to_be32(PASS_OPEN_TID_V(stid) |
+ PASS_OPEN_TOS_V(tos));
+ req->tcpopt.mss = htons(tmp_opt.mss_clamp);
+ if (tmp_opt.wscale_ok)
+ req->tcpopt.wsf = tmp_opt.snd_wscale;
+ req->tcpopt.tstamp = tmp_opt.saw_tstamp;
+ if (tmp_opt.sack_ok)
+ req->tcpopt.sack = 1;
+ OPCODE_TID(req) = htonl(MK_OPCODE_TID(CPL_PASS_ACCEPT_REQ, 0));
+ return;
+}
+
+static void send_fw_pass_open_req(struct c4iw_dev *dev, struct sk_buff *skb,
+ __be32 laddr, __be16 lport,
+ __be32 raddr, __be16 rport,
+ u32 rcv_isn, u32 filter, u16 window,
+ u32 rss_qid, u8 port_id)
+{
+ struct sk_buff *req_skb;
+ struct fw_ofld_connection_wr *req;
+ struct cpl_pass_accept_req *cpl = cplhdr(skb);
+	int ret;
+
+	req_skb = alloc_skb(sizeof(struct fw_ofld_connection_wr), GFP_KERNEL);
+	if (!req_skb) {
+		pr_err("%s - failed to alloc skb\n", __func__);
+		kfree_skb(skb);
+		return;
+	}
+	req = (struct fw_ofld_connection_wr *)__skb_put(req_skb, sizeof(*req));
+	memset(req, 0, sizeof(*req));
+ req->op_compl = htonl(WR_OP_V(FW_OFLD_CONNECTION_WR) | FW_WR_COMPL_F);
+ req->len16_pkd = htonl(FW_WR_LEN16_V(DIV_ROUND_UP(sizeof(*req), 16)));
+ req->le.version_cpl = htonl(FW_OFLD_CONNECTION_WR_CPL_F);
+ req->le.filter = (__force __be32) filter;
+ req->le.lport = lport;
+ req->le.pport = rport;
+ req->le.u.ipv4.lip = laddr;
+ req->le.u.ipv4.pip = raddr;
+ req->tcb.rcv_nxt = htonl(rcv_isn + 1);
+ req->tcb.rcv_adv = htons(window);
+ req->tcb.t_state_to_astid =
+ htonl(FW_OFLD_CONNECTION_WR_T_STATE_V(TCP_SYN_RECV) |
+ FW_OFLD_CONNECTION_WR_RCV_SCALE_V(cpl->tcpopt.wsf) |
+ FW_OFLD_CONNECTION_WR_ASTID_V(
+ PASS_OPEN_TID_G(ntohl(cpl->tos_stid))));
+
+ /*
+ * We store the qid in opt2 which will be used by the firmware
+ * to send us the wr response.
+ */
+ req->tcb.opt2 = htonl(RSS_QUEUE_V(rss_qid));
+
+	/*
+	 * We initialize the MSS index in the TCB to 0xF so that the TCB
+	 * picks up the correct value when the driver sends the
+	 * cpl_pass_accept_rpl. If this were 0, TP would ignore any
+	 * MSS index > 0.
+	 */
+ req->tcb.opt0 = cpu_to_be64(MSS_IDX_V(0xF));
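+
+	/*
+	 * Stash the skb holding the synthesized cpl_pass_accept_req in the
+	 * cookie so passive_ofld_conn_reply() can recover it from the reply.
+	 */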
+ req->cookie = (uintptr_t)skb;
+
+ set_wr_txq(req_skb, CPL_PRIORITY_CONTROL, port_id);
+ ret = cxgb4_ofld_send(dev->rdev.lldi.ports[0], req_skb);
+ if (ret < 0) {
+ pr_err("%s - cxgb4_ofld_send error %d - dropping\n", __func__,
+ ret);
+ kfree_skb(skb);
+ kfree_skb(req_skb);
+ }
+}
+
+/*
+ * Handler for CPL_RX_PKT messages. These arrive when a filter, rather
+ * than a server TID, is used to redirect a SYN packet. Packets that hit
+ * the filter are redirected to the offload queue, and the driver then
+ * tries to establish the connection with a firmware work request.
+ */
+static int rx_pkt(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+ int stid;
+ unsigned int filter;
+ struct ethhdr *eh = NULL;
+ struct vlan_ethhdr *vlan_eh = NULL;
+ struct iphdr *iph;
+ struct tcphdr *tcph;
+ struct rss_header *rss = (void *)skb->data;
+ struct cpl_rx_pkt *cpl = (void *)skb->data;
+ struct cpl_pass_accept_req *req = (void *)(rss + 1);
+ struct l2t_entry *e;
+ struct dst_entry *dst;
+ struct c4iw_ep *lep;
+ u16 window;
+ struct port_info *pi;
+ struct net_device *pdev;
+ u16 rss_qid, eth_hdr_len;
+ int step;
+ u32 tx_chan;
+ struct neighbour *neigh;
+
+ /* Drop all non-SYN packets */
+ if (!(cpl->l2info & cpu_to_be32(RXF_SYN_F)))
+ goto reject;
+
+ /*
+ * Drop all packets which did not hit the filter.
+ * Unlikely to happen.
+ */
+ if (!(rss->filter_hit && rss->filter_tid))
+ goto reject;
+
+	/*
+	 * Calculate the server TID from the filter hit index in the
+	 * cpl_rx_pkt.
+	 */
+ stid = (__force int) cpu_to_be32((__force u32) rss->hash_val);
+
+ lep = (struct c4iw_ep *)lookup_stid(dev->rdev.lldi.tids, stid);
+ if (!lep) {
+ PDBG("%s connect request on invalid stid %d\n", __func__, stid);
+ goto reject;
+ }
+
+ eth_hdr_len = is_t4(dev->rdev.lldi.adapter_type) ?
+ RX_ETHHDR_LEN_G(htonl(cpl->l2info)) :
+ RX_T5_ETHHDR_LEN_G(htonl(cpl->l2info));
+ if (eth_hdr_len == ETH_HLEN) {
+ eh = (struct ethhdr *)(req + 1);
+ iph = (struct iphdr *)(eh + 1);
+ } else {
+ vlan_eh = (struct vlan_ethhdr *)(req + 1);
+ iph = (struct iphdr *)(vlan_eh + 1);
+ skb->vlan_tci = ntohs(cpl->vlan);
+ }
+
+ if (iph->version != 0x4)
+ goto reject;
+
+ tcph = (struct tcphdr *)(iph + 1);
+ skb_set_network_header(skb, (void *)iph - (void *)rss);
+ skb_set_transport_header(skb, (void *)tcph - (void *)rss);
+ skb_get(skb);
+
+ PDBG("%s lip 0x%x lport %u pip 0x%x pport %u tos %d\n", __func__,
+ ntohl(iph->daddr), ntohs(tcph->dest), ntohl(iph->saddr),
+ ntohs(tcph->source), iph->tos);
+
+ dst = find_route(dev, iph->daddr, iph->saddr, tcph->dest, tcph->source,
+ iph->tos);
+ if (!dst) {
+ pr_err("%s - failed to find dst entry!\n",
+ __func__);
+ goto reject;
+ }
+ neigh = dst_neigh_lookup_skb(dst, skb);
+
+ if (!neigh) {
+ pr_err("%s - failed to allocate neigh!\n",
+ __func__);
+ goto free_dst;
+ }
+
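+	/*
+	 * For loopback connections the neighbour points at the loopback
+	 * device, so look up the real netdevice that owns the local address.
+	 */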
+ if (neigh->dev->flags & IFF_LOOPBACK) {
+ pdev = ip_dev_find(&init_net, iph->daddr);
+ e = cxgb4_l2t_get(dev->rdev.lldi.l2t, neigh,
+ pdev, 0);
+ pi = (struct port_info *)netdev_priv(pdev);
+ tx_chan = cxgb4_port_chan(pdev);
+ dev_put(pdev);
+ } else {
+ pdev = get_real_dev(neigh->dev);
+ e = cxgb4_l2t_get(dev->rdev.lldi.l2t, neigh,
+ pdev, 0);
+ pi = (struct port_info *)netdev_priv(pdev);
+ tx_chan = cxgb4_port_chan(pdev);
+ }
+ neigh_release(neigh);
+ if (!e) {
+ pr_err("%s - failed to allocate l2t entry!\n",
+ __func__);
+ goto free_dst;
+ }
+
+ step = dev->rdev.lldi.nrxq / dev->rdev.lldi.nchan;
+ rss_qid = dev->rdev.lldi.rxq_ids[pi->port_id * step];
+ window = (__force u16) htons((__force u16)tcph->window);
+
+	/* Calculate the filter portion for the LE region. */
+ filter = (__force unsigned int) cpu_to_be32(cxgb4_select_ntuple(
+ dev->rdev.lldi.ports[0],
+ e));
+
+ /*
+ * Synthesize the cpl_pass_accept_req. We have everything except the
+ * TID. Once firmware sends a reply with TID we update the TID field
+ * in cpl and pass it through the regular cpl_pass_accept_req path.
+ */
+ build_cpl_pass_accept_req(skb, stid, iph->tos);
+ send_fw_pass_open_req(dev, skb, iph->daddr, tcph->dest, iph->saddr,
+ tcph->source, ntohl(tcph->seq), filter, window,
+ rss_qid, pi->port_id);
+ cxgb4_l2t_release(e);
+free_dst:
+ dst_release(dst);
+reject:
+ return 0;
+}
+
+/*
+ * These are the real handlers that are called from a
+ * work queue.
+ */
+static c4iw_handler_func work_handlers[NUM_CPL_CMDS] = {
+ [CPL_ACT_ESTABLISH] = act_establish,
+ [CPL_ACT_OPEN_RPL] = act_open_rpl,
+ [CPL_RX_DATA] = rx_data,
+ [CPL_ABORT_RPL_RSS] = abort_rpl,
+ [CPL_ABORT_RPL] = abort_rpl,
+ [CPL_PASS_OPEN_RPL] = pass_open_rpl,
+ [CPL_CLOSE_LISTSRV_RPL] = close_listsrv_rpl,
+ [CPL_PASS_ACCEPT_REQ] = pass_accept_req,
+ [CPL_PASS_ESTABLISH] = pass_establish,
+ [CPL_PEER_CLOSE] = peer_close,
+ [CPL_ABORT_REQ_RSS] = peer_abort,
+ [CPL_CLOSE_CON_RPL] = close_con_rpl,
+ [CPL_RDMA_TERMINATE] = terminate,
+ [CPL_FW4_ACK] = fw4_ack,
+ [CPL_FW6_MSG] = deferred_fw6_msg,
+ [CPL_RX_PKT] = rx_pkt
+};
+
+static void process_timeout(struct c4iw_ep *ep)
+{
+ struct c4iw_qp_attributes attrs;
+ int abort = 1;
+
+ mutex_lock(&ep->com.mutex);
+ PDBG("%s ep %p tid %u state %d\n", __func__, ep, ep->hwtid,
+ ep->com.state);
+ set_bit(TIMEDOUT, &ep->com.history);
+ switch (ep->com.state) {
+ case MPA_REQ_SENT:
+ __state_set(&ep->com, ABORTING);
+ connect_reply_upcall(ep, -ETIMEDOUT);
+ break;
+ case MPA_REQ_WAIT:
+ __state_set(&ep->com, ABORTING);
+ break;
+ case CLOSING:
+ case MORIBUND:
+ if (ep->com.cm_id && ep->com.qp) {
+ attrs.next_state = C4IW_QP_STATE_ERROR;
+ c4iw_modify_qp(ep->com.qp->rhp,
+ ep->com.qp, C4IW_QP_ATTR_NEXT_STATE,
+ &attrs, 1);
+ }
+ __state_set(&ep->com, ABORTING);
+ close_complete_upcall(ep, -ETIMEDOUT);
+ break;
+ case ABORTING:
+ case DEAD:
+
+ /*
+ * These states are expected if the ep timed out at the same
+ * time as another thread was calling stop_ep_timer().
+ * So we silently do nothing for these states.
+ */
+ abort = 0;
+ break;
+ default:
+ WARN(1, "%s unexpected state ep %p tid %u state %u\n",
+ __func__, ep, ep->hwtid, ep->com.state);
+ abort = 0;
+ }
+ if (abort)
+ abort_connection(ep, NULL, GFP_KERNEL);
+ mutex_unlock(&ep->com.mutex);
+ c4iw_put_ep(&ep->com);
+}
+
+static void process_timedout_eps(void)
+{
+ struct c4iw_ep *ep;
+
+ spin_lock_irq(&timeout_lock);
+ while (!list_empty(&timeout_list)) {
+ struct list_head *tmp;
+
+ tmp = timeout_list.next;
+ list_del(tmp);
+ tmp->next = NULL;
+ tmp->prev = NULL;
+ spin_unlock_irq(&timeout_lock);
+ ep = list_entry(tmp, struct c4iw_ep, entry);
+ process_timeout(ep);
+ spin_lock_irq(&timeout_lock);
+ }
+ spin_unlock_irq(&timeout_lock);
+}
+
+static void process_work(struct work_struct *work)
+{
+ struct sk_buff *skb = NULL;
+ struct c4iw_dev *dev;
+ struct cpl_act_establish *rpl;
+ unsigned int opcode;
+ int ret;
+
+ process_timedout_eps();
+ while ((skb = skb_dequeue(&rxq))) {
+ rpl = cplhdr(skb);
+ dev = *((struct c4iw_dev **) (skb->cb + sizeof(void *)));
+ opcode = rpl->ot.opcode;
+
+ BUG_ON(!work_handlers[opcode]);
+ ret = work_handlers[opcode](dev, skb);
+ if (!ret)
+ kfree_skb(skb);
+ process_timedout_eps();
+ }
+}
+
+static DECLARE_WORK(skb_work, process_work);
+
+static void ep_timeout(unsigned long arg)
+{
+ struct c4iw_ep *ep = (struct c4iw_ep *)arg;
+ int kickit = 0;
+
+ spin_lock(&timeout_lock);
+ if (!test_and_set_bit(TIMEOUT, &ep->com.flags)) {
+ /*
+ * Only insert if it is not already on the list.
+ */
+ if (!ep->entry.next) {
+ list_add_tail(&ep->entry, &timeout_list);
+ kickit = 1;
+ }
+ }
+ spin_unlock(&timeout_lock);
+ if (kickit)
+ queue_work(workq, &skb_work);
+}
+
+/*
+ * All the CM events are handled on a work queue so that they run in a
+ * safe context.
+ */
+static int sched(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+
+ /*
+ * Save dev in the skb->cb area.
+ */
+ *((struct c4iw_dev **) (skb->cb + sizeof(void *))) = dev;
+
+ /*
+ * Queue the skb and schedule the worker thread.
+ */
+ skb_queue_tail(&rxq, skb);
+ queue_work(workq, &skb_work);
+ return 0;
+}
+
+static int set_tcb_rpl(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+ struct cpl_set_tcb_rpl *rpl = cplhdr(skb);
+
+ if (rpl->status != CPL_ERR_NONE) {
+ printk(KERN_ERR MOD "Unexpected SET_TCB_RPL status %u "
+ "for tid %u\n", rpl->status, GET_TID(rpl));
+ }
+ kfree_skb(skb);
+ return 0;
+}
+
+static int fw6_msg(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+ struct cpl_fw6_msg *rpl = cplhdr(skb);
+ struct c4iw_wr_wait *wr_waitp;
+ int ret;
+
+ PDBG("%s type %u\n", __func__, rpl->type);
+
+ switch (rpl->type) {
+ case FW6_TYPE_WR_RPL:
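+		/*
+		 * data[0] carries the firmware completion status and data[1]
+		 * the wr_wait cookie that was posted with the work request.
+		 */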
+ ret = (int)((be64_to_cpu(rpl->data[0]) >> 8) & 0xff);
+ wr_waitp = (struct c4iw_wr_wait *)(__force unsigned long) rpl->data[1];
+ PDBG("%s wr_waitp %p ret %u\n", __func__, wr_waitp, ret);
+ if (wr_waitp)
+ c4iw_wake_up(wr_waitp, ret ? -ret : 0);
+ kfree_skb(skb);
+ break;
+ case FW6_TYPE_CQE:
+ case FW6_TYPE_OFLD_CONNECTION_WR_RPL:
+ sched(dev, skb);
+ break;
+ default:
+ printk(KERN_ERR MOD "%s unexpected fw6 msg type %u\n", __func__,
+ rpl->type);
+ kfree_skb(skb);
+ break;
+ }
+ return 0;
+}
+
+static int peer_abort_intr(struct c4iw_dev *dev, struct sk_buff *skb)
+{
+ struct cpl_abort_req_rss *req = cplhdr(skb);
+ struct c4iw_ep *ep;
+ struct tid_info *t = dev->rdev.lldi.tids;
+ unsigned int tid = GET_TID(req);
+
+ ep = lookup_tid(t, tid);
+ if (!ep) {
+ printk(KERN_WARNING MOD
+ "Abort on non-existent endpoint, tid %d\n", tid);
+ kfree_skb(skb);
+ return 0;
+ }
+ if (is_neg_adv(req->status)) {
+ PDBG("%s Negative advice on abort- tid %u status %d (%s)\n",
+ __func__, ep->hwtid, req->status,
+ neg_adv_str(req->status));
+ ep->stats.abort_neg_adv++;
+ dev->rdev.stats.neg_adv++;
+ kfree_skb(skb);
+ return 0;
+ }
+ PDBG("%s ep %p tid %u state %u\n", __func__, ep, ep->hwtid,
+ ep->com.state);
+
+	/*
+	 * Wake up any threads in rdma_init() or rdma_fini().
+	 * However, if we are on MPAv2 and want to retry with MPAv1,
+	 * then don't wake up yet.
+	 */
+ if (mpa_rev == 2 && !ep->tried_with_mpa_v1) {
+ if (ep->com.state != MPA_REQ_SENT)
+ c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET);
+ } else
+ c4iw_wake_up(&ep->com.wr_wait, -ECONNRESET);
+ sched(dev, skb);
+ return 0;
+}
+
+/*
+ * Most upcalls from the T4 Core go to sched() to
+ * schedule the processing on a work queue.
+ */
+c4iw_handler_func c4iw_handlers[NUM_CPL_CMDS] = {
+ [CPL_ACT_ESTABLISH] = sched,
+ [CPL_ACT_OPEN_RPL] = sched,
+ [CPL_RX_DATA] = sched,
+ [CPL_ABORT_RPL_RSS] = sched,
+ [CPL_ABORT_RPL] = sched,
+ [CPL_PASS_OPEN_RPL] = sched,
+ [CPL_CLOSE_LISTSRV_RPL] = sched,
+ [CPL_PASS_ACCEPT_REQ] = sched,
+ [CPL_PASS_ESTABLISH] = sched,
+ [CPL_PEER_CLOSE] = sched,
+ [CPL_CLOSE_CON_RPL] = sched,
+ [CPL_ABORT_REQ_RSS] = peer_abort_intr,
+ [CPL_RDMA_TERMINATE] = sched,
+ [CPL_FW4_ACK] = sched,
+ [CPL_SET_TCB_RPL] = set_tcb_rpl,
+ [CPL_FW6_MSG] = fw6_msg,
+ [CPL_RX_PKT] = sched
+};
+
+int __init c4iw_cm_init(void)
+{
+ spin_lock_init(&timeout_lock);
+ skb_queue_head_init(&rxq);
+
+ workq = create_singlethread_workqueue("iw_cxgb4");
+ if (!workq)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void c4iw_cm_term(void)
+{
+ WARN_ON(!list_empty(&timeout_list));
+ flush_workqueue(workq);
+ destroy_workqueue(workq);
+}
diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c
new file mode 100644
index 000000000..68ddb3710
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/cq.c
@@ -0,0 +1,1015 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "iw_cxgb4.h"
+
+static int destroy_cq(struct c4iw_rdev *rdev, struct t4_cq *cq,
+ struct c4iw_dev_ucontext *uctx)
+{
+ struct fw_ri_res_wr *res_wr;
+ struct fw_ri_res *res;
+ int wr_len;
+ struct c4iw_wr_wait wr_wait;
+ struct sk_buff *skb;
+ int ret;
+
+ wr_len = sizeof *res_wr + sizeof *res;
+ skb = alloc_skb(wr_len, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+ set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
+
+ res_wr = (struct fw_ri_res_wr *)__skb_put(skb, wr_len);
+ memset(res_wr, 0, wr_len);
+ res_wr->op_nres = cpu_to_be32(
+ FW_WR_OP_V(FW_RI_RES_WR) |
+ FW_RI_RES_WR_NRES_V(1) |
+ FW_WR_COMPL_F);
+ res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16));
+ res_wr->cookie = (uintptr_t)&wr_wait;
+ res = res_wr->res;
+ res->u.cq.restype = FW_RI_RES_TYPE_CQ;
+ res->u.cq.op = FW_RI_RES_OP_RESET;
+ res->u.cq.iqid = cpu_to_be32(cq->cqid);
+
+ c4iw_init_wr_wait(&wr_wait);
+ ret = c4iw_ofld_send(rdev, skb);
+	if (!ret)
+		ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, 0, __func__);
+
+ kfree(cq->sw_queue);
+ dma_free_coherent(&(rdev->lldi.pdev->dev),
+ cq->memsize, cq->queue,
+ dma_unmap_addr(cq, mapping));
+ c4iw_put_cqid(rdev, cq->cqid, uctx);
+ return ret;
+}
+
+static int create_cq(struct c4iw_rdev *rdev, struct t4_cq *cq,
+ struct c4iw_dev_ucontext *uctx)
+{
+ struct fw_ri_res_wr *res_wr;
+ struct fw_ri_res *res;
+ int wr_len;
+ int user = (uctx != &rdev->uctx);
+ struct c4iw_wr_wait wr_wait;
+ int ret;
+ struct sk_buff *skb;
+
+ cq->cqid = c4iw_get_cqid(rdev, uctx);
+ if (!cq->cqid) {
+ ret = -ENOMEM;
+ goto err1;
+ }
+
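+	/*
+	 * Only kernel CQs need the software queue used for flush
+	 * processing; it is not allocated for user CQs.
+	 */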
+ if (!user) {
+ cq->sw_queue = kzalloc(cq->memsize, GFP_KERNEL);
+ if (!cq->sw_queue) {
+ ret = -ENOMEM;
+ goto err2;
+ }
+ }
+ cq->queue = dma_alloc_coherent(&rdev->lldi.pdev->dev, cq->memsize,
+ &cq->dma_addr, GFP_KERNEL);
+ if (!cq->queue) {
+ ret = -ENOMEM;
+ goto err3;
+ }
+ dma_unmap_addr_set(cq, mapping, cq->dma_addr);
+ memset(cq->queue, 0, cq->memsize);
+
+ /* build fw_ri_res_wr */
+ wr_len = sizeof *res_wr + sizeof *res;
+
+ skb = alloc_skb(wr_len, GFP_KERNEL);
+ if (!skb) {
+ ret = -ENOMEM;
+ goto err4;
+ }
+ set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
+
+ res_wr = (struct fw_ri_res_wr *)__skb_put(skb, wr_len);
+ memset(res_wr, 0, wr_len);
+ res_wr->op_nres = cpu_to_be32(
+ FW_WR_OP_V(FW_RI_RES_WR) |
+ FW_RI_RES_WR_NRES_V(1) |
+ FW_WR_COMPL_F);
+ res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16));
+ res_wr->cookie = (uintptr_t)&wr_wait;
+ res = res_wr->res;
+ res->u.cq.restype = FW_RI_RES_TYPE_CQ;
+ res->u.cq.op = FW_RI_RES_OP_WRITE;
+ res->u.cq.iqid = cpu_to_be32(cq->cqid);
+ res->u.cq.iqandst_to_iqandstindex = cpu_to_be32(
+ FW_RI_RES_WR_IQANUS_V(0) |
+ FW_RI_RES_WR_IQANUD_V(1) |
+ FW_RI_RES_WR_IQANDST_F |
+ FW_RI_RES_WR_IQANDSTINDEX_V(
+ rdev->lldi.ciq_ids[cq->vector]));
+ res->u.cq.iqdroprss_to_iqesize = cpu_to_be16(
+ FW_RI_RES_WR_IQDROPRSS_F |
+ FW_RI_RES_WR_IQPCIECH_V(2) |
+ FW_RI_RES_WR_IQINTCNTTHRESH_V(0) |
+ FW_RI_RES_WR_IQO_F |
+ FW_RI_RES_WR_IQESIZE_V(1));
+ res->u.cq.iqsize = cpu_to_be16(cq->size);
+ res->u.cq.iqaddr = cpu_to_be64(cq->dma_addr);
+
+ c4iw_init_wr_wait(&wr_wait);
+
+ ret = c4iw_ofld_send(rdev, skb);
+ if (ret)
+ goto err4;
+ PDBG("%s wait_event wr_wait %p\n", __func__, &wr_wait);
+ ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, 0, __func__);
+ if (ret)
+ goto err4;
+
+ cq->gen = 1;
+ cq->rdev = rdev;
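+	/*
+	 * Set up the CQ doorbell address: user CQs get a BAR2 address for
+	 * mmap, kernel CQs use the GTS register on T4 or the BAR2 kva
+	 * mapping otherwise.
+	 */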
+ if (user) {
+ u32 off = (cq->cqid << rdev->cqshift) & PAGE_MASK;
+
+ cq->ugts = (u64)rdev->bar2_pa + off;
+ } else if (is_t4(rdev->lldi.adapter_type)) {
+ cq->gts = rdev->lldi.gts_reg;
+ cq->qid_mask = -1U;
+ } else {
+ u32 off = ((cq->cqid << rdev->cqshift) & PAGE_MASK) + 12;
+
+ cq->gts = rdev->bar2_kva + off;
+ cq->qid_mask = rdev->qpmask;
+ }
+ return 0;
+err4:
+ dma_free_coherent(&rdev->lldi.pdev->dev, cq->memsize, cq->queue,
+ dma_unmap_addr(cq, mapping));
+err3:
+ kfree(cq->sw_queue);
+err2:
+ c4iw_put_cqid(rdev, cq->cqid, uctx);
+err1:
+ return ret;
+}
+
+static void insert_recv_cqe(struct t4_wq *wq, struct t4_cq *cq)
+{
+ struct t4_cqe cqe;
+
+ PDBG("%s wq %p cq %p sw_cidx %u sw_pidx %u\n", __func__,
+ wq, cq, cq->sw_cidx, cq->sw_pidx);
+ memset(&cqe, 0, sizeof(cqe));
+ cqe.header = cpu_to_be32(CQE_STATUS_V(T4_ERR_SWFLUSH) |
+ CQE_OPCODE_V(FW_RI_SEND) |
+ CQE_TYPE_V(0) |
+ CQE_SWCQE_V(1) |
+ CQE_QPID_V(wq->sq.qid));
+ cqe.bits_type_ts = cpu_to_be64(CQE_GENBIT_V((u64)cq->gen));
+ cq->sw_queue[cq->sw_pidx] = cqe;
+ t4_swcq_produce(cq);
+}
+
+int c4iw_flush_rq(struct t4_wq *wq, struct t4_cq *cq, int count)
+{
+ int flushed = 0;
+ int in_use = wq->rq.in_use - count;
+
+ BUG_ON(in_use < 0);
+ PDBG("%s wq %p cq %p rq.in_use %u skip count %u\n", __func__,
+ wq, cq, wq->rq.in_use, count);
+ while (in_use--) {
+ insert_recv_cqe(wq, cq);
+ flushed++;
+ }
+ return flushed;
+}
+
+static void insert_sq_cqe(struct t4_wq *wq, struct t4_cq *cq,
+ struct t4_swsqe *swcqe)
+{
+ struct t4_cqe cqe;
+
+ PDBG("%s wq %p cq %p sw_cidx %u sw_pidx %u\n", __func__,
+ wq, cq, cq->sw_cidx, cq->sw_pidx);
+ memset(&cqe, 0, sizeof(cqe));
+ cqe.header = cpu_to_be32(CQE_STATUS_V(T4_ERR_SWFLUSH) |
+ CQE_OPCODE_V(swcqe->opcode) |
+ CQE_TYPE_V(1) |
+ CQE_SWCQE_V(1) |
+ CQE_QPID_V(wq->sq.qid));
+ CQE_WRID_SQ_IDX(&cqe) = swcqe->idx;
+ cqe.bits_type_ts = cpu_to_be64(CQE_GENBIT_V((u64)cq->gen));
+ cq->sw_queue[cq->sw_pidx] = cqe;
+ t4_swcq_produce(cq);
+}
+
+static void advance_oldest_read(struct t4_wq *wq);
+
+int c4iw_flush_sq(struct c4iw_qp *qhp)
+{
+ int flushed = 0;
+ struct t4_wq *wq = &qhp->wq;
+ struct c4iw_cq *chp = to_c4iw_cq(qhp->ibqp.send_cq);
+ struct t4_cq *cq = &chp->cq;
+ int idx;
+ struct t4_swsqe *swsqe;
+
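+	/* flush_cidx == -1 means nothing has been flushed from this SQ yet. */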
+ if (wq->sq.flush_cidx == -1)
+ wq->sq.flush_cidx = wq->sq.cidx;
+ idx = wq->sq.flush_cidx;
+ BUG_ON(idx >= wq->sq.size);
+ while (idx != wq->sq.pidx) {
+ swsqe = &wq->sq.sw_sq[idx];
+ BUG_ON(swsqe->flushed);
+ swsqe->flushed = 1;
+ insert_sq_cqe(wq, cq, swsqe);
+ if (wq->sq.oldest_read == swsqe) {
+ BUG_ON(swsqe->opcode != FW_RI_READ_REQ);
+ advance_oldest_read(wq);
+ }
+ flushed++;
+ if (++idx == wq->sq.size)
+ idx = 0;
+ }
+ wq->sq.flush_cidx += flushed;
+ if (wq->sq.flush_cidx >= wq->sq.size)
+ wq->sq.flush_cidx -= wq->sq.size;
+ return flushed;
+}
+
+static void flush_completed_wrs(struct t4_wq *wq, struct t4_cq *cq)
+{
+ struct t4_swsqe *swsqe;
+ int cidx;
+
+ if (wq->sq.flush_cidx == -1)
+ wq->sq.flush_cidx = wq->sq.cidx;
+ cidx = wq->sq.flush_cidx;
+ BUG_ON(cidx > wq->sq.size);
+
+ while (cidx != wq->sq.pidx) {
+ swsqe = &wq->sq.sw_sq[cidx];
+ if (!swsqe->signaled) {
+ if (++cidx == wq->sq.size)
+ cidx = 0;
+ } else if (swsqe->complete) {
+
+ BUG_ON(swsqe->flushed);
+
+ /*
+ * Insert this completed cqe into the swcq.
+ */
+ PDBG("%s moving cqe into swcq sq idx %u cq idx %u\n",
+ __func__, cidx, cq->sw_pidx);
+ swsqe->cqe.header |= htonl(CQE_SWCQE_V(1));
+ cq->sw_queue[cq->sw_pidx] = swsqe->cqe;
+ t4_swcq_produce(cq);
+ swsqe->flushed = 1;
+ if (++cidx == wq->sq.size)
+ cidx = 0;
+ wq->sq.flush_cidx = cidx;
+ } else
+ break;
+ }
+}
+
+static void create_read_req_cqe(struct t4_wq *wq, struct t4_cqe *hw_cqe,
+ struct t4_cqe *read_cqe)
+{
+ read_cqe->u.scqe.cidx = wq->sq.oldest_read->idx;
+ read_cqe->len = htonl(wq->sq.oldest_read->read_len);
+ read_cqe->header = htonl(CQE_QPID_V(CQE_QPID(hw_cqe)) |
+ CQE_SWCQE_V(SW_CQE(hw_cqe)) |
+ CQE_OPCODE_V(FW_RI_READ_REQ) |
+ CQE_TYPE_V(1));
+ read_cqe->bits_type_ts = hw_cqe->bits_type_ts;
+}
+
+static void advance_oldest_read(struct t4_wq *wq)
+{
+
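+	/* Advance oldest_read to the next READ_REQ in the SW SQ, if any. */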
+ u32 rptr = wq->sq.oldest_read - wq->sq.sw_sq + 1;
+
+ if (rptr == wq->sq.size)
+ rptr = 0;
+ while (rptr != wq->sq.pidx) {
+ wq->sq.oldest_read = &wq->sq.sw_sq[rptr];
+
+ if (wq->sq.oldest_read->opcode == FW_RI_READ_REQ)
+ return;
+ if (++rptr == wq->sq.size)
+ rptr = 0;
+ }
+ wq->sq.oldest_read = NULL;
+}
+
+/*
+ * Move all CQEs from the HWCQ into the SWCQ.
+ * Deal with out-of-order completions and with completions that
+ * complete prior unsignaled WRs.
+ */
+void c4iw_flush_hw_cq(struct c4iw_cq *chp)
+{
+ struct t4_cqe *hw_cqe, *swcqe, read_cqe;
+ struct c4iw_qp *qhp;
+ struct t4_swsqe *swsqe;
+ int ret;
+
+ PDBG("%s cqid 0x%x\n", __func__, chp->cq.cqid);
+ ret = t4_next_hw_cqe(&chp->cq, &hw_cqe);
+
+ /*
+ * This logic is similar to poll_cq(), but not quite the same
+ * unfortunately. Need to move pertinent HW CQEs to the SW CQ but
+ * also do any translation magic that poll_cq() normally does.
+ */
+ while (!ret) {
+ qhp = get_qhp(chp->rhp, CQE_QPID(hw_cqe));
+
+ /*
+ * drop CQEs with no associated QP
+ */
+ if (qhp == NULL)
+ goto next_cqe;
+
+ if (CQE_OPCODE(hw_cqe) == FW_RI_TERMINATE)
+ goto next_cqe;
+
+ if (CQE_OPCODE(hw_cqe) == FW_RI_READ_RESP) {
+
+			/* If we have reached here because of an async
+			 * event or other error, and we have an egress
+			 * error, then drop it.
+			 */
+ if (CQE_TYPE(hw_cqe) == 1)
+ goto next_cqe;
+
+ /* drop peer2peer RTR reads.
+ */
+ if (CQE_WRID_STAG(hw_cqe) == 1)
+ goto next_cqe;
+
+ /*
+ * Eat completions for unsignaled read WRs.
+ */
+ if (!qhp->wq.sq.oldest_read->signaled) {
+ advance_oldest_read(&qhp->wq);
+ goto next_cqe;
+ }
+
+ /*
+ * Don't write to the HWCQ, create a new read req CQE
+ * in local memory and move it into the swcq.
+ */
+ create_read_req_cqe(&qhp->wq, hw_cqe, &read_cqe);
+ hw_cqe = &read_cqe;
+ advance_oldest_read(&qhp->wq);
+ }
+
+		/* If it's an SQ completion, then do the magic to move all the
+		 * unsignaled and now in-order completions into the swcq.
+		 */
+ if (SQ_TYPE(hw_cqe)) {
+ swsqe = &qhp->wq.sq.sw_sq[CQE_WRID_SQ_IDX(hw_cqe)];
+ swsqe->cqe = *hw_cqe;
+ swsqe->complete = 1;
+ flush_completed_wrs(&qhp->wq, &chp->cq);
+ } else {
+ swcqe = &chp->cq.sw_queue[chp->cq.sw_pidx];
+ *swcqe = *hw_cqe;
+ swcqe->header |= cpu_to_be32(CQE_SWCQE_V(1));
+ t4_swcq_produce(&chp->cq);
+ }
+next_cqe:
+ t4_hwcq_consume(&chp->cq);
+ ret = t4_next_hw_cqe(&chp->cq, &hw_cqe);
+ }
+}
+
+static int cqe_completes_wr(struct t4_cqe *cqe, struct t4_wq *wq)
+{
+ if (CQE_OPCODE(cqe) == FW_RI_TERMINATE)
+ return 0;
+
+ if ((CQE_OPCODE(cqe) == FW_RI_RDMA_WRITE) && RQ_TYPE(cqe))
+ return 0;
+
+ if ((CQE_OPCODE(cqe) == FW_RI_READ_RESP) && SQ_TYPE(cqe))
+ return 0;
+
+ if (CQE_SEND_OPCODE(cqe) && RQ_TYPE(cqe) && t4_rq_empty(wq))
+ return 0;
+ return 1;
+}
+
+void c4iw_count_rcqes(struct t4_cq *cq, struct t4_wq *wq, int *count)
+{
+ struct t4_cqe *cqe;
+ u32 ptr;
+
+ *count = 0;
+ PDBG("%s count zero %d\n", __func__, *count);
+ ptr = cq->sw_cidx;
+ while (ptr != cq->sw_pidx) {
+ cqe = &cq->sw_queue[ptr];
+ if (RQ_TYPE(cqe) && (CQE_OPCODE(cqe) != FW_RI_READ_RESP) &&
+ (CQE_QPID(cqe) == wq->sq.qid) && cqe_completes_wr(cqe, wq))
+ (*count)++;
+ if (++ptr == cq->size)
+ ptr = 0;
+ }
+ PDBG("%s cq %p count %d\n", __func__, cq, *count);
+}
+
+/*
+ * poll_cq
+ *
+ * Caller must:
+ * check the validity of the first CQE,
+ *     supply the wq associated with the qpid.
+ *
+ * credit: cq credit to return to sge.
+ * cqe_flushed: 1 iff the CQE is flushed.
+ * cqe: copy of the polled CQE.
+ *
+ * return value:
+ * 0 CQE returned ok.
+ * -EAGAIN CQE skipped, try again.
+ * -EOVERFLOW CQ overflow detected.
+ */
+static int poll_cq(struct t4_wq *wq, struct t4_cq *cq, struct t4_cqe *cqe,
+ u8 *cqe_flushed, u64 *cookie, u32 *credit)
+{
+ int ret = 0;
+ struct t4_cqe *hw_cqe, read_cqe;
+
+ *cqe_flushed = 0;
+ *credit = 0;
+ ret = t4_next_cqe(cq, &hw_cqe);
+ if (ret)
+ return ret;
+
+ PDBG("%s CQE OVF %u qpid 0x%0x genbit %u type %u status 0x%0x"
+ " opcode 0x%0x len 0x%0x wrid_hi_stag 0x%x wrid_low_msn 0x%x\n",
+ __func__, CQE_OVFBIT(hw_cqe), CQE_QPID(hw_cqe),
+ CQE_GENBIT(hw_cqe), CQE_TYPE(hw_cqe), CQE_STATUS(hw_cqe),
+ CQE_OPCODE(hw_cqe), CQE_LEN(hw_cqe), CQE_WRID_HI(hw_cqe),
+ CQE_WRID_LOW(hw_cqe));
+
+ /*
+ * skip cqe's not affiliated with a QP.
+ */
+ if (wq == NULL) {
+ ret = -EAGAIN;
+ goto skip_cqe;
+ }
+
+ /*
+ * skip hw cqe's if the wq is flushed.
+ */
+ if (wq->flushed && !SW_CQE(hw_cqe)) {
+ ret = -EAGAIN;
+ goto skip_cqe;
+ }
+
+ /*
+ * skip TERMINATE cqes...
+ */
+ if (CQE_OPCODE(hw_cqe) == FW_RI_TERMINATE) {
+ ret = -EAGAIN;
+ goto skip_cqe;
+ }
+
+ /*
+ * Gotta tweak READ completions:
+ * 1) the cqe doesn't contain the sq_wptr from the wr.
+ * 2) opcode not reflected from the wr.
+ * 3) read_len not reflected from the wr.
+ * 4) cq_type is RQ_TYPE not SQ_TYPE.
+ */
+ if (RQ_TYPE(hw_cqe) && (CQE_OPCODE(hw_cqe) == FW_RI_READ_RESP)) {
+
+		/* If we have reached here because of an async
+		 * event or other error, and we have an egress
+		 * error, then drop it.
+		 */
+ if (CQE_TYPE(hw_cqe) == 1) {
+ if (CQE_STATUS(hw_cqe))
+ t4_set_wq_in_error(wq);
+ ret = -EAGAIN;
+ goto skip_cqe;
+ }
+
+ /* If this is an unsolicited read response, then the read
+ * was generated by the kernel driver as part of peer-2-peer
+ * connection setup. So ignore the completion.
+ */
+ if (CQE_WRID_STAG(hw_cqe) == 1) {
+ if (CQE_STATUS(hw_cqe))
+ t4_set_wq_in_error(wq);
+ ret = -EAGAIN;
+ goto skip_cqe;
+ }
+
+ /*
+ * Eat completions for unsignaled read WRs.
+ */
+ if (!wq->sq.oldest_read->signaled) {
+ advance_oldest_read(wq);
+ ret = -EAGAIN;
+ goto skip_cqe;
+ }
+
+ /*
+ * Don't write to the HWCQ, so create a new read req CQE
+ * in local memory.
+ */
+ create_read_req_cqe(wq, hw_cqe, &read_cqe);
+ hw_cqe = &read_cqe;
+ advance_oldest_read(wq);
+ }
+
+ if (CQE_STATUS(hw_cqe) || t4_wq_in_error(wq)) {
+ *cqe_flushed = (CQE_STATUS(hw_cqe) == T4_ERR_SWFLUSH);
+ t4_set_wq_in_error(wq);
+ }
+
+ /*
+ * RECV completion.
+ */
+ if (RQ_TYPE(hw_cqe)) {
+
+ /*
+ * HW only validates 4 bits of MSN. So we must validate that
+		 * the MSN in the SEND is the next expected MSN. If it's not,
+ * then we complete this with T4_ERR_MSN and mark the wq in
+ * error.
+ */
+
+ if (t4_rq_empty(wq)) {
+ t4_set_wq_in_error(wq);
+ ret = -EAGAIN;
+ goto skip_cqe;
+ }
+ if (unlikely((CQE_WRID_MSN(hw_cqe) != (wq->rq.msn)))) {
+ t4_set_wq_in_error(wq);
+ hw_cqe->header |= htonl(CQE_STATUS_V(T4_ERR_MSN));
+ goto proc_cqe;
+ }
+ goto proc_cqe;
+ }
+
+ /*
+	 * If we get here it's a send completion.
+ *
+ * Handle out of order completion. These get stuffed
+ * in the SW SQ. Then the SW SQ is walked to move any
+ * now in-order completions into the SW CQ. This handles
+ * 2 cases:
+ * 1) reaping unsignaled WRs when the first subsequent
+ * signaled WR is completed.
+ * 2) out of order read completions.
+ */
+ if (!SW_CQE(hw_cqe) && (CQE_WRID_SQ_IDX(hw_cqe) != wq->sq.cidx)) {
+ struct t4_swsqe *swsqe;
+
+ PDBG("%s out of order completion going in sw_sq at idx %u\n",
+ __func__, CQE_WRID_SQ_IDX(hw_cqe));
+ swsqe = &wq->sq.sw_sq[CQE_WRID_SQ_IDX(hw_cqe)];
+ swsqe->cqe = *hw_cqe;
+ swsqe->complete = 1;
+ ret = -EAGAIN;
+ goto flush_wq;
+ }
+
+proc_cqe:
+ *cqe = *hw_cqe;
+
+ /*
+ * Reap the associated WR(s) that are freed up with this
+ * completion.
+ */
+ if (SQ_TYPE(hw_cqe)) {
+ int idx = CQE_WRID_SQ_IDX(hw_cqe);
+ BUG_ON(idx >= wq->sq.size);
+
+ /*
+ * Account for any unsignaled completions completed by
+ * this signaled completion. In this case, cidx points
+ * to the first unsignaled one, and idx points to the
+ * signaled one. So adjust in_use based on this delta.
+		 * If this is not completing any unsignaled WRs, then the
+ * delta will be 0. Handle wrapping also!
+ */
+ if (idx < wq->sq.cidx)
+ wq->sq.in_use -= wq->sq.size + idx - wq->sq.cidx;
+ else
+ wq->sq.in_use -= idx - wq->sq.cidx;
+ BUG_ON(wq->sq.in_use <= 0 && wq->sq.in_use >= wq->sq.size);
+
+ wq->sq.cidx = (uint16_t)idx;
+ PDBG("%s completing sq idx %u\n", __func__, wq->sq.cidx);
+ *cookie = wq->sq.sw_sq[wq->sq.cidx].wr_id;
+ if (c4iw_wr_log)
+ c4iw_log_wr_stats(wq, hw_cqe);
+ t4_sq_consume(wq);
+ } else {
+ PDBG("%s completing rq idx %u\n", __func__, wq->rq.cidx);
+ *cookie = wq->rq.sw_rq[wq->rq.cidx].wr_id;
+ BUG_ON(t4_rq_empty(wq));
+ if (c4iw_wr_log)
+ c4iw_log_wr_stats(wq, hw_cqe);
+ t4_rq_consume(wq);
+ goto skip_cqe;
+ }
+
+flush_wq:
+ /*
+ * Flush any completed cqes that are now in-order.
+ */
+ flush_completed_wrs(wq, cq);
+
+skip_cqe:
+ if (SW_CQE(hw_cqe)) {
+ PDBG("%s cq %p cqid 0x%x skip sw cqe cidx %u\n",
+ __func__, cq, cq->cqid, cq->sw_cidx);
+ t4_swcq_consume(cq);
+ } else {
+ PDBG("%s cq %p cqid 0x%x skip hw cqe cidx %u\n",
+ __func__, cq, cq->cqid, cq->cidx);
+ t4_hwcq_consume(cq);
+ }
+ return ret;
+}
+
+/*
+ * Get one cq entry from c4iw and map it to openib.
+ *
+ * Returns:
+ * 0 cqe returned
+ * -ENODATA EMPTY;
+ * -EAGAIN caller must try again
+ * any other -errno fatal error
+ */
+static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc)
+{
+ struct c4iw_qp *qhp = NULL;
+ struct t4_cqe uninitialized_var(cqe), *rd_cqe;
+ struct t4_wq *wq;
+ u32 credit = 0;
+ u8 cqe_flushed;
+ u64 cookie = 0;
+ int ret;
+
+ ret = t4_next_cqe(&chp->cq, &rd_cqe);
+
+ if (ret)
+ return ret;
+
+ qhp = get_qhp(chp->rhp, CQE_QPID(rd_cqe));
+ if (!qhp)
+ wq = NULL;
+ else {
+ spin_lock(&qhp->lock);
+ wq = &(qhp->wq);
+ }
+ ret = poll_cq(wq, &(chp->cq), &cqe, &cqe_flushed, &cookie, &credit);
+ if (ret)
+ goto out;
+
+ wc->wr_id = cookie;
+ wc->qp = &qhp->ibqp;
+ wc->vendor_err = CQE_STATUS(&cqe);
+ wc->wc_flags = 0;
+
+ PDBG("%s qpid 0x%x type %d opcode %d status 0x%x len %u wrid hi 0x%x "
+ "lo 0x%x cookie 0x%llx\n", __func__, CQE_QPID(&cqe),
+ CQE_TYPE(&cqe), CQE_OPCODE(&cqe), CQE_STATUS(&cqe), CQE_LEN(&cqe),
+ CQE_WRID_HI(&cqe), CQE_WRID_LOW(&cqe), (unsigned long long)cookie);
+
+ if (CQE_TYPE(&cqe) == 0) {
+ if (!CQE_STATUS(&cqe))
+ wc->byte_len = CQE_LEN(&cqe);
+ else
+ wc->byte_len = 0;
+ wc->opcode = IB_WC_RECV;
+ if (CQE_OPCODE(&cqe) == FW_RI_SEND_WITH_INV ||
+ CQE_OPCODE(&cqe) == FW_RI_SEND_WITH_SE_INV) {
+ wc->ex.invalidate_rkey = CQE_WRID_STAG(&cqe);
+ wc->wc_flags |= IB_WC_WITH_INVALIDATE;
+ }
+ } else {
+ switch (CQE_OPCODE(&cqe)) {
+ case FW_RI_RDMA_WRITE:
+ wc->opcode = IB_WC_RDMA_WRITE;
+ break;
+ case FW_RI_READ_REQ:
+ wc->opcode = IB_WC_RDMA_READ;
+ wc->byte_len = CQE_LEN(&cqe);
+ break;
+ case FW_RI_SEND_WITH_INV:
+ case FW_RI_SEND_WITH_SE_INV:
+ wc->opcode = IB_WC_SEND;
+ wc->wc_flags |= IB_WC_WITH_INVALIDATE;
+ break;
+ case FW_RI_SEND:
+ case FW_RI_SEND_WITH_SE:
+ wc->opcode = IB_WC_SEND;
+ break;
+ case FW_RI_BIND_MW:
+ wc->opcode = IB_WC_BIND_MW;
+ break;
+
+ case FW_RI_LOCAL_INV:
+ wc->opcode = IB_WC_LOCAL_INV;
+ break;
+ case FW_RI_FAST_REGISTER:
+ wc->opcode = IB_WC_FAST_REG_MR;
+ break;
+ default:
+ printk(KERN_ERR MOD "Unexpected opcode %d "
+ "in the CQE received for QPID=0x%0x\n",
+ CQE_OPCODE(&cqe), CQE_QPID(&cqe));
+ ret = -EINVAL;
+ goto out;
+ }
+ }
+
+ if (cqe_flushed)
+ wc->status = IB_WC_WR_FLUSH_ERR;
+ else {
+
+ switch (CQE_STATUS(&cqe)) {
+ case T4_ERR_SUCCESS:
+ wc->status = IB_WC_SUCCESS;
+ break;
+ case T4_ERR_STAG:
+ wc->status = IB_WC_LOC_ACCESS_ERR;
+ break;
+ case T4_ERR_PDID:
+ wc->status = IB_WC_LOC_PROT_ERR;
+ break;
+ case T4_ERR_QPID:
+ case T4_ERR_ACCESS:
+ wc->status = IB_WC_LOC_ACCESS_ERR;
+ break;
+ case T4_ERR_WRAP:
+ wc->status = IB_WC_GENERAL_ERR;
+ break;
+ case T4_ERR_BOUND:
+ wc->status = IB_WC_LOC_LEN_ERR;
+ break;
+ case T4_ERR_INVALIDATE_SHARED_MR:
+ case T4_ERR_INVALIDATE_MR_WITH_MW_BOUND:
+ wc->status = IB_WC_MW_BIND_ERR;
+ break;
+ case T4_ERR_CRC:
+ case T4_ERR_MARKER:
+ case T4_ERR_PDU_LEN_ERR:
+ case T4_ERR_OUT_OF_RQE:
+ case T4_ERR_DDP_VERSION:
+ case T4_ERR_RDMA_VERSION:
+ case T4_ERR_DDP_QUEUE_NUM:
+ case T4_ERR_MSN:
+ case T4_ERR_TBIT:
+ case T4_ERR_MO:
+ case T4_ERR_MSN_RANGE:
+ case T4_ERR_IRD_OVERFLOW:
+ case T4_ERR_OPCODE:
+ case T4_ERR_INTERNAL_ERR:
+ wc->status = IB_WC_FATAL_ERR;
+ break;
+ case T4_ERR_SWFLUSH:
+ wc->status = IB_WC_WR_FLUSH_ERR;
+ break;
+ default:
+ printk(KERN_ERR MOD
+ "Unexpected cqe_status 0x%x for QPID=0x%0x\n",
+ CQE_STATUS(&cqe), CQE_QPID(&cqe));
+ ret = -EINVAL;
+ }
+ }
+out:
+ if (wq)
+ spin_unlock(&qhp->lock);
+ return ret;
+}
+
+int c4iw_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
+{
+ struct c4iw_cq *chp;
+ unsigned long flags;
+ int npolled;
+ int err = 0;
+
+ chp = to_c4iw_cq(ibcq);
+
+ spin_lock_irqsave(&chp->lock, flags);
+ for (npolled = 0; npolled < num_entries; ++npolled) {
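+		/*
+		 * poll_cq() returns -EAGAIN for CQEs it skips, so keep
+		 * polling until a CQE is returned or the CQ is empty.
+		 */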
+ do {
+ err = c4iw_poll_cq_one(chp, wc + npolled);
+ } while (err == -EAGAIN);
+ if (err)
+ break;
+ }
+ spin_unlock_irqrestore(&chp->lock, flags);
+ return !err || err == -ENODATA ? npolled : err;
+}
+
+int c4iw_destroy_cq(struct ib_cq *ib_cq)
+{
+ struct c4iw_cq *chp;
+ struct c4iw_ucontext *ucontext;
+
+ PDBG("%s ib_cq %p\n", __func__, ib_cq);
+ chp = to_c4iw_cq(ib_cq);
+
+ remove_handle(chp->rhp, &chp->rhp->cqidr, chp->cq.cqid);
+ atomic_dec(&chp->refcnt);
+ wait_event(chp->wait, !atomic_read(&chp->refcnt));
+
+ ucontext = ib_cq->uobject ? to_c4iw_ucontext(ib_cq->uobject->context)
+ : NULL;
+ destroy_cq(&chp->rhp->rdev, &chp->cq,
+ ucontext ? &ucontext->uctx : &chp->cq.rdev->uctx);
+ kfree(chp);
+ return 0;
+}
+
+struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries,
+ int vector, struct ib_ucontext *ib_context,
+ struct ib_udata *udata)
+{
+ struct c4iw_dev *rhp;
+ struct c4iw_cq *chp;
+ struct c4iw_create_cq_resp uresp;
+ struct c4iw_ucontext *ucontext = NULL;
+ int ret;
+ size_t memsize, hwentries;
+ struct c4iw_mm_entry *mm, *mm2;
+
+ PDBG("%s ib_dev %p entries %d\n", __func__, ibdev, entries);
+
+ rhp = to_c4iw_dev(ibdev);
+
+ if (vector >= rhp->rdev.lldi.nciq)
+ return ERR_PTR(-EINVAL);
+
+ chp = kzalloc(sizeof(*chp), GFP_KERNEL);
+ if (!chp)
+ return ERR_PTR(-ENOMEM);
+
+ if (ib_context)
+ ucontext = to_c4iw_ucontext(ib_context);
+
+ /* account for the status page. */
+ entries++;
+
+ /* IQ needs one extra entry to differentiate full vs empty. */
+ entries++;
+
+ /*
+	 * entries must be a multiple of 16 for the HW.
+ */
+ entries = roundup(entries, 16);
+
+ /*
+	 * Make the actual HW queue 2x this size to avoid cidx_inc overflows.
+ */
+ hwentries = min(entries * 2, rhp->rdev.hw_queue.t4_max_iq_size);
+
+ /*
+ * Make HW queue at least 64 entries so GTS updates aren't too
+ * frequent.
+ */
+ if (hwentries < 64)
+ hwentries = 64;
+
+ memsize = hwentries * sizeof *chp->cq.queue;
+
+ /*
+	 * memsize must be a multiple of the page size if it's a user cq.
+ */
+ if (ucontext)
+ memsize = roundup(memsize, PAGE_SIZE);
+ chp->cq.size = hwentries;
+ chp->cq.memsize = memsize;
+ chp->cq.vector = vector;
+
+ ret = create_cq(&rhp->rdev, &chp->cq,
+ ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
+ if (ret)
+ goto err1;
+
+ chp->rhp = rhp;
+ chp->cq.size--; /* status page */
+ chp->ibcq.cqe = entries - 2;
+ spin_lock_init(&chp->lock);
+ spin_lock_init(&chp->comp_handler_lock);
+ atomic_set(&chp->refcnt, 1);
+ init_waitqueue_head(&chp->wait);
+ ret = insert_handle(rhp, &rhp->cqidr, chp, chp->cq.cqid);
+ if (ret)
+ goto err2;
+
+ if (ucontext) {
+		mm = kmalloc(sizeof *mm, GFP_KERNEL);
+		if (!mm) {
+			ret = -ENOMEM;
+			goto err3;
+		}
+		mm2 = kmalloc(sizeof *mm2, GFP_KERNEL);
+		if (!mm2) {
+			ret = -ENOMEM;
+			goto err4;
+		}
+
+ uresp.qid_mask = rhp->rdev.cqmask;
+ uresp.cqid = chp->cq.cqid;
+ uresp.size = chp->cq.size;
+ uresp.memsize = chp->cq.memsize;
+ spin_lock(&ucontext->mmap_lock);
+ uresp.key = ucontext->key;
+ ucontext->key += PAGE_SIZE;
+ uresp.gts_key = ucontext->key;
+ ucontext->key += PAGE_SIZE;
+ spin_unlock(&ucontext->mmap_lock);
+ ret = ib_copy_to_udata(udata, &uresp,
+ sizeof(uresp) - sizeof(uresp.reserved));
+ if (ret)
+ goto err5;
+
+ mm->key = uresp.key;
+ mm->addr = virt_to_phys(chp->cq.queue);
+ mm->len = chp->cq.memsize;
+ insert_mmap(ucontext, mm);
+
+ mm2->key = uresp.gts_key;
+ mm2->addr = chp->cq.ugts;
+ mm2->len = PAGE_SIZE;
+ insert_mmap(ucontext, mm2);
+ }
+ PDBG("%s cqid 0x%0x chp %p size %u memsize %zu, dma_addr 0x%0llx\n",
+ __func__, chp->cq.cqid, chp, chp->cq.size,
+ chp->cq.memsize, (unsigned long long) chp->cq.dma_addr);
+ return &chp->ibcq;
+err5:
+ kfree(mm2);
+err4:
+ kfree(mm);
+err3:
+ remove_handle(rhp, &rhp->cqidr, chp->cq.cqid);
+err2:
+ destroy_cq(&chp->rhp->rdev, &chp->cq,
+ ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
+err1:
+ kfree(chp);
+ return ERR_PTR(ret);
+}
+
+int c4iw_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata)
+{
+ return -ENOSYS;
+}
+
+int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
+{
+ struct c4iw_cq *chp;
+ int ret;
+ unsigned long flag;
+
+ chp = to_c4iw_cq(ibcq);
+ spin_lock_irqsave(&chp->lock, flag);
+ ret = t4_arm_cq(&chp->cq,
+ (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED);
+ spin_unlock_irqrestore(&chp->lock, flag);
+ if (ret && !(flags & IB_CQ_REPORT_MISSED_EVENTS))
+ ret = 0;
+ return ret;
+}
diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
new file mode 100644
index 000000000..7e895d714
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -0,0 +1,1564 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/debugfs.h>
+#include <linux/vmalloc.h>
+#include <linux/math64.h>
+
+#include <rdma/ib_verbs.h>
+
+#include "iw_cxgb4.h"
+
+#define DRV_VERSION "0.1"
+
+MODULE_AUTHOR("Steve Wise");
+MODULE_DESCRIPTION("Chelsio T4/T5 RDMA Driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+static int allow_db_fc_on_t5;
+module_param(allow_db_fc_on_t5, int, 0644);
+MODULE_PARM_DESC(allow_db_fc_on_t5,
+ "Allow DB Flow Control on T5 (default = 0)");
+
+static int allow_db_coalescing_on_t5;
+module_param(allow_db_coalescing_on_t5, int, 0644);
+MODULE_PARM_DESC(allow_db_coalescing_on_t5,
+ "Allow DB Coalescing on T5 (default = 0)");
+
+int c4iw_wr_log = 0;
+module_param(c4iw_wr_log, int, 0444);
+MODULE_PARM_DESC(c4iw_wr_log, "Enables logging of work request timing data.");
+
+static int c4iw_wr_log_size_order = 12;
+module_param(c4iw_wr_log_size_order, int, 0444);
+MODULE_PARM_DESC(c4iw_wr_log_size_order,
+ "Number of entries (log2) in the work request timing log.");
+
+struct uld_ctx {
+ struct list_head entry;
+ struct cxgb4_lld_info lldi;
+ struct c4iw_dev *dev;
+};
+
+static LIST_HEAD(uld_ctx_list);
+static DEFINE_MUTEX(dev_mutex);
+
+#define DB_FC_RESUME_SIZE 64
+#define DB_FC_RESUME_DELAY 1
+#define DB_FC_DRAIN_THRESH 0
+
+static struct dentry *c4iw_debugfs_root;
+
+struct c4iw_debugfs_data {
+ struct c4iw_dev *devp;
+ char *buf;
+ int bufsize;
+ int pos;
+};
+
+/* registered cxgb4 netlink callbacks */
+static struct ibnl_client_cbs c4iw_nl_cb_table[] = {
+ [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb},
+ [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb},
+ [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb},
+ [RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb},
+ [RDMA_NL_IWPM_REMOTE_INFO] = {.dump = iwpm_remote_info_cb},
+ [RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb},
+ [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb}
+};
+
+static int count_idrs(int id, void *p, void *data)
+{
+ int *countp = data;
+
+ *countp = *countp + 1;
+ return 0;
+}
+
+static ssize_t debugfs_read(struct file *file, char __user *buf, size_t count,
+ loff_t *ppos)
+{
+ struct c4iw_debugfs_data *d = file->private_data;
+
+ return simple_read_from_buffer(buf, count, ppos, d->buf, d->pos);
+}
+
+void c4iw_log_wr_stats(struct t4_wq *wq, struct t4_cqe *cqe)
+{
+ struct wr_log_entry le;
+ int idx;
+
+ if (!wq->rdev->wr_log)
+ return;
+
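+	/* wr_log_size is a power of two, so the mask yields a ring index. */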
+ idx = (atomic_inc_return(&wq->rdev->wr_log_idx) - 1) &
+ (wq->rdev->wr_log_size - 1);
+ le.poll_sge_ts = cxgb4_read_sge_timestamp(wq->rdev->lldi.ports[0]);
+ getnstimeofday(&le.poll_host_ts);
+ le.valid = 1;
+ le.cqe_sge_ts = CQE_TS(cqe);
+ if (SQ_TYPE(cqe)) {
+ le.qid = wq->sq.qid;
+ le.opcode = CQE_OPCODE(cqe);
+ le.post_host_ts = wq->sq.sw_sq[wq->sq.cidx].host_ts;
+ le.post_sge_ts = wq->sq.sw_sq[wq->sq.cidx].sge_ts;
+ le.wr_id = CQE_WRID_SQ_IDX(cqe);
+ } else {
+ le.qid = wq->rq.qid;
+ le.opcode = FW_RI_RECEIVE;
+ le.post_host_ts = wq->rq.sw_rq[wq->rq.cidx].host_ts;
+ le.post_sge_ts = wq->rq.sw_rq[wq->rq.cidx].sge_ts;
+ le.wr_id = CQE_WRID_MSN(cqe);
+ }
+ wq->rdev->wr_log[idx] = le;
+}
+
+static int wr_log_show(struct seq_file *seq, void *v)
+{
+ struct c4iw_dev *dev = seq->private;
+ struct timespec prev_ts = {0, 0};
+ struct wr_log_entry *lep;
+ int prev_ts_set = 0;
+ int idx, end;
+
+#define ts2ns(ts) div64_u64((ts) * dev->rdev.lldi.cclk_ps, 1000)
+
+ idx = atomic_read(&dev->rdev.wr_log_idx) &
+ (dev->rdev.wr_log_size - 1);
+ end = idx - 1;
+ if (end < 0)
+ end = dev->rdev.wr_log_size - 1;
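+	/* Walk the ring from the oldest entry forward to the most recent. */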
+ lep = &dev->rdev.wr_log[idx];
+ while (idx != end) {
+ if (lep->valid) {
+ if (!prev_ts_set) {
+ prev_ts_set = 1;
+ prev_ts = lep->poll_host_ts;
+ }
+ seq_printf(seq, "%04u: sec %lu nsec %lu qid %u opcode "
+ "%u %s 0x%x host_wr_delta sec %lu nsec %lu "
+ "post_sge_ts 0x%llx cqe_sge_ts 0x%llx "
+ "poll_sge_ts 0x%llx post_poll_delta_ns %llu "
+ "cqe_poll_delta_ns %llu\n",
+ idx,
+ timespec_sub(lep->poll_host_ts,
+ prev_ts).tv_sec,
+ timespec_sub(lep->poll_host_ts,
+ prev_ts).tv_nsec,
+ lep->qid, lep->opcode,
+ lep->opcode == FW_RI_RECEIVE ?
+ "msn" : "wrid",
+ lep->wr_id,
+ timespec_sub(lep->poll_host_ts,
+ lep->post_host_ts).tv_sec,
+ timespec_sub(lep->poll_host_ts,
+ lep->post_host_ts).tv_nsec,
+ lep->post_sge_ts, lep->cqe_sge_ts,
+ lep->poll_sge_ts,
+ ts2ns(lep->poll_sge_ts - lep->post_sge_ts),
+ ts2ns(lep->poll_sge_ts - lep->cqe_sge_ts));
+ prev_ts = lep->poll_host_ts;
+ }
+ idx++;
+ if (idx > (dev->rdev.wr_log_size - 1))
+ idx = 0;
+ lep = &dev->rdev.wr_log[idx];
+ }
+#undef ts2ns
+ return 0;
+}
+
+static int wr_log_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, wr_log_show, inode->i_private);
+}
+
+static ssize_t wr_log_clear(struct file *file, const char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct c4iw_dev *dev = ((struct seq_file *)file->private_data)->private;
+ int i;
+
+ if (dev->rdev.wr_log)
+ for (i = 0; i < dev->rdev.wr_log_size; i++)
+ dev->rdev.wr_log[i].valid = 0;
+ return count;
+}
+
+static const struct file_operations wr_log_debugfs_fops = {
+ .owner = THIS_MODULE,
+ .open = wr_log_open,
+ .release = single_release,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .write = wr_log_clear,
+};
+
+static int dump_qp(int id, void *p, void *data)
+{
+ struct c4iw_qp *qp = p;
+ struct c4iw_debugfs_data *qpd = data;
+ int space;
+ int cc;
+
+ if (id != qp->wq.sq.qid)
+ return 0;
+
+ space = qpd->bufsize - qpd->pos - 1;
+ if (space == 0)
+ return 1;
+
+ if (qp->ep) {
+ if (qp->ep->com.local_addr.ss_family == AF_INET) {
+ struct sockaddr_in *lsin = (struct sockaddr_in *)
+ &qp->ep->com.local_addr;
+ struct sockaddr_in *rsin = (struct sockaddr_in *)
+ &qp->ep->com.remote_addr;
+ struct sockaddr_in *mapped_lsin = (struct sockaddr_in *)
+ &qp->ep->com.mapped_local_addr;
+ struct sockaddr_in *mapped_rsin = (struct sockaddr_in *)
+ &qp->ep->com.mapped_remote_addr;
+
+ cc = snprintf(qpd->buf + qpd->pos, space,
+ "rc qp sq id %u rq id %u state %u "
+ "onchip %u ep tid %u state %u "
+ "%pI4:%u/%u->%pI4:%u/%u\n",
+ qp->wq.sq.qid, qp->wq.rq.qid,
+ (int)qp->attr.state,
+ qp->wq.sq.flags & T4_SQ_ONCHIP,
+ qp->ep->hwtid, (int)qp->ep->com.state,
+ &lsin->sin_addr, ntohs(lsin->sin_port),
+ ntohs(mapped_lsin->sin_port),
+ &rsin->sin_addr, ntohs(rsin->sin_port),
+ ntohs(mapped_rsin->sin_port));
+ } else {
+ struct sockaddr_in6 *lsin6 = (struct sockaddr_in6 *)
+ &qp->ep->com.local_addr;
+ struct sockaddr_in6 *rsin6 = (struct sockaddr_in6 *)
+ &qp->ep->com.remote_addr;
+ struct sockaddr_in6 *mapped_lsin6 =
+ (struct sockaddr_in6 *)
+ &qp->ep->com.mapped_local_addr;
+ struct sockaddr_in6 *mapped_rsin6 =
+ (struct sockaddr_in6 *)
+ &qp->ep->com.mapped_remote_addr;
+
+ cc = snprintf(qpd->buf + qpd->pos, space,
+ "rc qp sq id %u rq id %u state %u "
+ "onchip %u ep tid %u state %u "
+ "%pI6:%u/%u->%pI6:%u/%u\n",
+ qp->wq.sq.qid, qp->wq.rq.qid,
+ (int)qp->attr.state,
+ qp->wq.sq.flags & T4_SQ_ONCHIP,
+ qp->ep->hwtid, (int)qp->ep->com.state,
+ &lsin6->sin6_addr,
+ ntohs(lsin6->sin6_port),
+ ntohs(mapped_lsin6->sin6_port),
+ &rsin6->sin6_addr,
+ ntohs(rsin6->sin6_port),
+ ntohs(mapped_rsin6->sin6_port));
+ }
+ } else
+ cc = snprintf(qpd->buf + qpd->pos, space,
+ "qp sq id %u rq id %u state %u onchip %u\n",
+ qp->wq.sq.qid, qp->wq.rq.qid,
+ (int)qp->attr.state,
+ qp->wq.sq.flags & T4_SQ_ONCHIP);
+ if (cc < space)
+ qpd->pos += cc;
+ return 0;
+}
+
+static int qp_release(struct inode *inode, struct file *file)
+{
+ struct c4iw_debugfs_data *qpd = file->private_data;
+ if (!qpd) {
+ printk(KERN_INFO "%s null qpd?\n", __func__);
+ return 0;
+ }
+ vfree(qpd->buf);
+ kfree(qpd);
+ return 0;
+}
+
+static int qp_open(struct inode *inode, struct file *file)
+{
+ struct c4iw_debugfs_data *qpd;
+ int ret = 0;
+ int count = 1;
+
+ qpd = kmalloc(sizeof *qpd, GFP_KERNEL);
+ if (!qpd) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ qpd->devp = inode->i_private;
+ qpd->pos = 0;
+
+ spin_lock_irq(&qpd->devp->lock);
+ idr_for_each(&qpd->devp->qpidr, count_idrs, &count);
+ spin_unlock_irq(&qpd->devp->lock);
+
+ qpd->bufsize = count * 128;
+ qpd->buf = vmalloc(qpd->bufsize);
+ if (!qpd->buf) {
+ ret = -ENOMEM;
+ goto err1;
+ }
+
+ spin_lock_irq(&qpd->devp->lock);
+ idr_for_each(&qpd->devp->qpidr, dump_qp, qpd);
+ spin_unlock_irq(&qpd->devp->lock);
+
+ qpd->buf[qpd->pos++] = 0;
+ file->private_data = qpd;
+ goto out;
+err1:
+ kfree(qpd);
+out:
+ return ret;
+}
+
+static const struct file_operations qp_debugfs_fops = {
+ .owner = THIS_MODULE,
+ .open = qp_open,
+ .release = qp_release,
+ .read = debugfs_read,
+ .llseek = default_llseek,
+};
+
+static int dump_stag(int id, void *p, void *data)
+{
+ struct c4iw_debugfs_data *stagd = data;
+ int space;
+ int cc;
+ struct fw_ri_tpte tpte;
+ int ret;
+
+ space = stagd->bufsize - stagd->pos - 1;
+ if (space == 0)
+ return 1;
+
+ ret = cxgb4_read_tpte(stagd->devp->rdev.lldi.ports[0], (u32)id<<8,
+ (__be32 *)&tpte);
+ if (ret) {
+ dev_err(&stagd->devp->rdev.lldi.pdev->dev,
+ "%s cxgb4_read_tpte err %d\n", __func__, ret);
+ return ret;
+ }
+ cc = snprintf(stagd->buf + stagd->pos, space,
+ "stag: idx 0x%x valid %d key 0x%x state %d pdid %d "
+ "perm 0x%x ps %d len 0x%llx va 0x%llx\n",
+ (u32)id<<8,
+ FW_RI_TPTE_VALID_G(ntohl(tpte.valid_to_pdid)),
+ FW_RI_TPTE_STAGKEY_G(ntohl(tpte.valid_to_pdid)),
+ FW_RI_TPTE_STAGSTATE_G(ntohl(tpte.valid_to_pdid)),
+ FW_RI_TPTE_PDID_G(ntohl(tpte.valid_to_pdid)),
+ FW_RI_TPTE_PERM_G(ntohl(tpte.locread_to_qpid)),
+ FW_RI_TPTE_PS_G(ntohl(tpte.locread_to_qpid)),
+ ((u64)ntohl(tpte.len_hi) << 32) | ntohl(tpte.len_lo),
+ ((u64)ntohl(tpte.va_hi) << 32) | ntohl(tpte.va_lo_fbo));
+ if (cc < space)
+ stagd->pos += cc;
+ return 0;
+}
+
+static int stag_release(struct inode *inode, struct file *file)
+{
+ struct c4iw_debugfs_data *stagd = file->private_data;
+ if (!stagd) {
+ printk(KERN_INFO "%s null stagd?\n", __func__);
+ return 0;
+ }
+ vfree(stagd->buf);
+ kfree(stagd);
+ return 0;
+}
+
+static int stag_open(struct inode *inode, struct file *file)
+{
+ struct c4iw_debugfs_data *stagd;
+ int ret = 0;
+ int count = 1;
+
+ stagd = kmalloc(sizeof *stagd, GFP_KERNEL);
+ if (!stagd) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ stagd->devp = inode->i_private;
+ stagd->pos = 0;
+
+ spin_lock_irq(&stagd->devp->lock);
+ idr_for_each(&stagd->devp->mmidr, count_idrs, &count);
+ spin_unlock_irq(&stagd->devp->lock);
+
+ stagd->bufsize = count * 256;
+ stagd->buf = vmalloc(stagd->bufsize);
+ if (!stagd->buf) {
+ ret = -ENOMEM;
+ goto err1;
+ }
+
+ spin_lock_irq(&stagd->devp->lock);
+ idr_for_each(&stagd->devp->mmidr, dump_stag, stagd);
+ spin_unlock_irq(&stagd->devp->lock);
+
+ stagd->buf[stagd->pos++] = 0;
+ file->private_data = stagd;
+ goto out;
+err1:
+ kfree(stagd);
+out:
+ return ret;
+}
+
+static const struct file_operations stag_debugfs_fops = {
+ .owner = THIS_MODULE,
+ .open = stag_open,
+ .release = stag_release,
+ .read = debugfs_read,
+ .llseek = default_llseek,
+};
+
+static char *db_state_str[] = {"NORMAL", "FLOW_CONTROL", "RECOVERY", "STOPPED"};
+
+static int stats_show(struct seq_file *seq, void *v)
+{
+ struct c4iw_dev *dev = seq->private;
+
+ seq_printf(seq, " Object: %10s %10s %10s %10s\n", "Total", "Current",
+ "Max", "Fail");
+ seq_printf(seq, " PDID: %10llu %10llu %10llu %10llu\n",
+ dev->rdev.stats.pd.total, dev->rdev.stats.pd.cur,
+ dev->rdev.stats.pd.max, dev->rdev.stats.pd.fail);
+ seq_printf(seq, " QID: %10llu %10llu %10llu %10llu\n",
+ dev->rdev.stats.qid.total, dev->rdev.stats.qid.cur,
+ dev->rdev.stats.qid.max, dev->rdev.stats.qid.fail);
+ seq_printf(seq, " TPTMEM: %10llu %10llu %10llu %10llu\n",
+ dev->rdev.stats.stag.total, dev->rdev.stats.stag.cur,
+ dev->rdev.stats.stag.max, dev->rdev.stats.stag.fail);
+ seq_printf(seq, " PBLMEM: %10llu %10llu %10llu %10llu\n",
+ dev->rdev.stats.pbl.total, dev->rdev.stats.pbl.cur,
+ dev->rdev.stats.pbl.max, dev->rdev.stats.pbl.fail);
+ seq_printf(seq, " RQTMEM: %10llu %10llu %10llu %10llu\n",
+ dev->rdev.stats.rqt.total, dev->rdev.stats.rqt.cur,
+ dev->rdev.stats.rqt.max, dev->rdev.stats.rqt.fail);
+ seq_printf(seq, " OCQPMEM: %10llu %10llu %10llu %10llu\n",
+ dev->rdev.stats.ocqp.total, dev->rdev.stats.ocqp.cur,
+ dev->rdev.stats.ocqp.max, dev->rdev.stats.ocqp.fail);
+ seq_printf(seq, " DB FULL: %10llu\n", dev->rdev.stats.db_full);
+ seq_printf(seq, " DB EMPTY: %10llu\n", dev->rdev.stats.db_empty);
+ seq_printf(seq, " DB DROP: %10llu\n", dev->rdev.stats.db_drop);
+ seq_printf(seq, " DB State: %s Transitions %llu FC Interruptions %llu\n",
+ db_state_str[dev->db_state],
+ dev->rdev.stats.db_state_transitions,
+ dev->rdev.stats.db_fc_interruptions);
+ seq_printf(seq, "TCAM_FULL: %10llu\n", dev->rdev.stats.tcam_full);
+ seq_printf(seq, "ACT_OFLD_CONN_FAILS: %10llu\n",
+ dev->rdev.stats.act_ofld_conn_fails);
+ seq_printf(seq, "PAS_OFLD_CONN_FAILS: %10llu\n",
+ dev->rdev.stats.pas_ofld_conn_fails);
+ seq_printf(seq, "NEG_ADV_RCVD: %10llu\n", dev->rdev.stats.neg_adv);
+ seq_printf(seq, "AVAILABLE IRD: %10u\n", dev->avail_ird);
+ return 0;
+}
+
+static int stats_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, stats_show, inode->i_private);
+}
+
+static ssize_t stats_clear(struct file *file, const char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct c4iw_dev *dev = ((struct seq_file *)file->private_data)->private;
+
+ mutex_lock(&dev->rdev.stats.lock);
+ dev->rdev.stats.pd.max = 0;
+ dev->rdev.stats.pd.fail = 0;
+ dev->rdev.stats.qid.max = 0;
+ dev->rdev.stats.qid.fail = 0;
+ dev->rdev.stats.stag.max = 0;
+ dev->rdev.stats.stag.fail = 0;
+ dev->rdev.stats.pbl.max = 0;
+ dev->rdev.stats.pbl.fail = 0;
+ dev->rdev.stats.rqt.max = 0;
+ dev->rdev.stats.rqt.fail = 0;
+ dev->rdev.stats.ocqp.max = 0;
+ dev->rdev.stats.ocqp.fail = 0;
+ dev->rdev.stats.db_full = 0;
+ dev->rdev.stats.db_empty = 0;
+ dev->rdev.stats.db_drop = 0;
+ dev->rdev.stats.db_state_transitions = 0;
+ dev->rdev.stats.tcam_full = 0;
+ dev->rdev.stats.act_ofld_conn_fails = 0;
+ dev->rdev.stats.pas_ofld_conn_fails = 0;
+ mutex_unlock(&dev->rdev.stats.lock);
+ return count;
+}
+
+static const struct file_operations stats_debugfs_fops = {
+ .owner = THIS_MODULE,
+ .open = stats_open,
+ .release = single_release,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .write = stats_clear,
+};
+
+static int dump_ep(int id, void *p, void *data)
+{
+ struct c4iw_ep *ep = p;
+ struct c4iw_debugfs_data *epd = data;
+ int space;
+ int cc;
+
+ space = epd->bufsize - epd->pos - 1;
+ if (space == 0)
+ return 1;
+
+ if (ep->com.local_addr.ss_family == AF_INET) {
+ struct sockaddr_in *lsin = (struct sockaddr_in *)
+ &ep->com.local_addr;
+ struct sockaddr_in *rsin = (struct sockaddr_in *)
+ &ep->com.remote_addr;
+ struct sockaddr_in *mapped_lsin = (struct sockaddr_in *)
+ &ep->com.mapped_local_addr;
+ struct sockaddr_in *mapped_rsin = (struct sockaddr_in *)
+ &ep->com.mapped_remote_addr;
+
+ cc = snprintf(epd->buf + epd->pos, space,
+ "ep %p cm_id %p qp %p state %d flags 0x%lx "
+ "history 0x%lx hwtid %d atid %d "
+ "conn_na %u abort_na %u "
+ "%pI4:%d/%d <-> %pI4:%d/%d\n",
+ ep, ep->com.cm_id, ep->com.qp,
+ (int)ep->com.state, ep->com.flags,
+ ep->com.history, ep->hwtid, ep->atid,
+ ep->stats.connect_neg_adv,
+ ep->stats.abort_neg_adv,
+ &lsin->sin_addr, ntohs(lsin->sin_port),
+ ntohs(mapped_lsin->sin_port),
+ &rsin->sin_addr, ntohs(rsin->sin_port),
+ ntohs(mapped_rsin->sin_port));
+ } else {
+ struct sockaddr_in6 *lsin6 = (struct sockaddr_in6 *)
+ &ep->com.local_addr;
+ struct sockaddr_in6 *rsin6 = (struct sockaddr_in6 *)
+ &ep->com.remote_addr;
+ struct sockaddr_in6 *mapped_lsin6 = (struct sockaddr_in6 *)
+ &ep->com.mapped_local_addr;
+ struct sockaddr_in6 *mapped_rsin6 = (struct sockaddr_in6 *)
+ &ep->com.mapped_remote_addr;
+
+ cc = snprintf(epd->buf + epd->pos, space,
+ "ep %p cm_id %p qp %p state %d flags 0x%lx "
+ "history 0x%lx hwtid %d atid %d "
+ "conn_na %u abort_na %u "
+ "%pI6:%d/%d <-> %pI6:%d/%d\n",
+ ep, ep->com.cm_id, ep->com.qp,
+ (int)ep->com.state, ep->com.flags,
+ ep->com.history, ep->hwtid, ep->atid,
+ ep->stats.connect_neg_adv,
+ ep->stats.abort_neg_adv,
+ &lsin6->sin6_addr, ntohs(lsin6->sin6_port),
+ ntohs(mapped_lsin6->sin6_port),
+ &rsin6->sin6_addr, ntohs(rsin6->sin6_port),
+ ntohs(mapped_rsin6->sin6_port));
+ }
+ if (cc < space)
+ epd->pos += cc;
+ return 0;
+}
+
+static int dump_listen_ep(int id, void *p, void *data)
+{
+ struct c4iw_listen_ep *ep = p;
+ struct c4iw_debugfs_data *epd = data;
+ int space;
+ int cc;
+
+ space = epd->bufsize - epd->pos - 1;
+ if (space == 0)
+ return 1;
+
+ if (ep->com.local_addr.ss_family == AF_INET) {
+ struct sockaddr_in *lsin = (struct sockaddr_in *)
+ &ep->com.local_addr;
+ struct sockaddr_in *mapped_lsin = (struct sockaddr_in *)
+ &ep->com.mapped_local_addr;
+
+ cc = snprintf(epd->buf + epd->pos, space,
+ "ep %p cm_id %p state %d flags 0x%lx stid %d "
+ "backlog %d %pI4:%d/%d\n",
+ ep, ep->com.cm_id, (int)ep->com.state,
+ ep->com.flags, ep->stid, ep->backlog,
+ &lsin->sin_addr, ntohs(lsin->sin_port),
+ ntohs(mapped_lsin->sin_port));
+ } else {
+ struct sockaddr_in6 *lsin6 = (struct sockaddr_in6 *)
+ &ep->com.local_addr;
+ struct sockaddr_in6 *mapped_lsin6 = (struct sockaddr_in6 *)
+ &ep->com.mapped_local_addr;
+
+ cc = snprintf(epd->buf + epd->pos, space,
+ "ep %p cm_id %p state %d flags 0x%lx stid %d "
+ "backlog %d %pI6:%d/%d\n",
+ ep, ep->com.cm_id, (int)ep->com.state,
+ ep->com.flags, ep->stid, ep->backlog,
+ &lsin6->sin6_addr, ntohs(lsin6->sin6_port),
+ ntohs(mapped_lsin6->sin6_port));
+ }
+ if (cc < space)
+ epd->pos += cc;
+ return 0;
+}
+
+static int ep_release(struct inode *inode, struct file *file)
+{
+ struct c4iw_debugfs_data *epd = file->private_data;
+ if (!epd) {
+ pr_info("%s null qpd?\n", __func__);
+ return 0;
+ }
+ vfree(epd->buf);
+ kfree(epd);
+ return 0;
+}
+
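+/*
+ * Two passes under the device lock: count the endpoints to size the
+ * snapshot buffer (240 bytes per entry), then dump them into it.
+ */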
+static int ep_open(struct inode *inode, struct file *file)
+{
+ struct c4iw_debugfs_data *epd;
+ int ret = 0;
+ int count = 1;
+
+ epd = kmalloc(sizeof(*epd), GFP_KERNEL);
+ if (!epd) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ epd->devp = inode->i_private;
+ epd->pos = 0;
+
+ spin_lock_irq(&epd->devp->lock);
+ idr_for_each(&epd->devp->hwtid_idr, count_idrs, &count);
+ idr_for_each(&epd->devp->atid_idr, count_idrs, &count);
+ idr_for_each(&epd->devp->stid_idr, count_idrs, &count);
+ spin_unlock_irq(&epd->devp->lock);
+
+ epd->bufsize = count * 240;
+ epd->buf = vmalloc(epd->bufsize);
+ if (!epd->buf) {
+ ret = -ENOMEM;
+ goto err1;
+ }
+
+ spin_lock_irq(&epd->devp->lock);
+ idr_for_each(&epd->devp->hwtid_idr, dump_ep, epd);
+ idr_for_each(&epd->devp->atid_idr, dump_ep, epd);
+ idr_for_each(&epd->devp->stid_idr, dump_listen_ep, epd);
+ spin_unlock_irq(&epd->devp->lock);
+
+ file->private_data = epd;
+ goto out;
+err1:
+ kfree(epd);
+out:
+ return ret;
+}
+
+static const struct file_operations ep_debugfs_fops = {
+ .owner = THIS_MODULE,
+ .open = ep_open,
+ .release = ep_release,
+ .read = debugfs_read,
+};
+
+static int setup_debugfs(struct c4iw_dev *devp)
+{
+ if (!devp->debugfs_root)
+ return -1;
+
+ debugfs_create_file_size("qps", S_IWUSR, devp->debugfs_root,
+ (void *)devp, &qp_debugfs_fops, 4096);
+
+ debugfs_create_file_size("stags", S_IWUSR, devp->debugfs_root,
+ (void *)devp, &stag_debugfs_fops, 4096);
+
+ debugfs_create_file_size("stats", S_IWUSR, devp->debugfs_root,
+ (void *)devp, &stats_debugfs_fops, 4096);
+
+ debugfs_create_file_size("eps", S_IWUSR, devp->debugfs_root,
+ (void *)devp, &ep_debugfs_fops, 4096);
+
+ if (c4iw_wr_log)
+ debugfs_create_file_size("wr_log", S_IWUSR, devp->debugfs_root,
+ (void *)devp, &wr_log_debugfs_fops, 4096);
+ return 0;
+}
+
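+/*
+ * Return the qpids cached in this user context to the global qid table
+ * and free the cached qpid and cqid list entries.
+ */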
+void c4iw_release_dev_ucontext(struct c4iw_rdev *rdev,
+ struct c4iw_dev_ucontext *uctx)
+{
+ struct list_head *pos, *nxt;
+ struct c4iw_qid_list *entry;
+
+ mutex_lock(&uctx->lock);
+ list_for_each_safe(pos, nxt, &uctx->qpids) {
+ entry = list_entry(pos, struct c4iw_qid_list, entry);
+ list_del_init(&entry->entry);
+ if (!(entry->qid & rdev->qpmask)) {
+ c4iw_put_resource(&rdev->resource.qid_table,
+ entry->qid);
+ mutex_lock(&rdev->stats.lock);
+ rdev->stats.qid.cur -= rdev->qpmask + 1;
+ mutex_unlock(&rdev->stats.lock);
+ }
+ kfree(entry);
+ }
+
+ list_for_each_safe(pos, nxt, &uctx->cqids) {
+ entry = list_entry(pos, struct c4iw_qid_list, entry);
+ list_del_init(&entry->entry);
+ kfree(entry);
+ }
+ mutex_unlock(&uctx->lock);
+}
+
+void c4iw_init_dev_ucontext(struct c4iw_rdev *rdev,
+ struct c4iw_dev_ucontext *uctx)
+{
+ INIT_LIST_HEAD(&uctx->qpids);
+ INIT_LIST_HEAD(&uctx->cqids);
+ mutex_init(&uctx->lock);
+}
+
+/* Caller takes care of locking if needed */
+static int c4iw_rdev_open(struct c4iw_rdev *rdev)
+{
+ int err;
+
+ c4iw_init_dev_ucontext(rdev, &rdev->uctx);
+
+ /*
+ * This implementation assumes udb_density == ucq_density! Eventually
+ * we may need to support differing densities, but for now fail the
+ * open. The cqid and qpid ranges must also match for now.
+ */
+ if (rdev->lldi.udb_density != rdev->lldi.ucq_density) {
+ pr_err(MOD "%s: unsupported udb/ucq densities %u/%u\n",
+ pci_name(rdev->lldi.pdev), rdev->lldi.udb_density,
+ rdev->lldi.ucq_density);
+ err = -EINVAL;
+ goto err1;
+ }
+ if (rdev->lldi.vr->qp.start != rdev->lldi.vr->cq.start ||
+ rdev->lldi.vr->qp.size != rdev->lldi.vr->cq.size) {
+ pr_err(MOD "%s: unsupported qp and cq id ranges "
+ "qp start %u size %u cq start %u size %u\n",
+ pci_name(rdev->lldi.pdev), rdev->lldi.vr->qp.start,
+ rdev->lldi.vr->qp.size, rdev->lldi.vr->cq.start,
+ rdev->lldi.vr->cq.size);
+ err = -EINVAL;
+ goto err1;
+ }
+
+ /*
+ * qpshift is the number of bits to shift the qpid left in order
+ * to get the correct address of the doorbell for that qp.
+ */
+ rdev->qpshift = PAGE_SHIFT - ilog2(rdev->lldi.udb_density);
+ rdev->qpmask = rdev->lldi.udb_density - 1;
+ rdev->cqshift = PAGE_SHIFT - ilog2(rdev->lldi.ucq_density);
+ rdev->cqmask = rdev->lldi.ucq_density - 1;
+ PDBG("%s dev %s stag start 0x%0x size 0x%0x num stags %d "
+ "pbl start 0x%0x size 0x%0x rq start 0x%0x size 0x%0x "
+ "qp qid start %u size %u cq qid start %u size %u\n",
+ __func__, pci_name(rdev->lldi.pdev), rdev->lldi.vr->stag.start,
+ rdev->lldi.vr->stag.size, c4iw_num_stags(rdev),
+ rdev->lldi.vr->pbl.start,
+ rdev->lldi.vr->pbl.size, rdev->lldi.vr->rq.start,
+ rdev->lldi.vr->rq.size,
+ rdev->lldi.vr->qp.start,
+ rdev->lldi.vr->qp.size,
+ rdev->lldi.vr->cq.start,
+ rdev->lldi.vr->cq.size);
+ PDBG("udb len 0x%x udb base %p db_reg %p gts_reg %p qpshift %lu "
+ "qpmask 0x%x cqshift %lu cqmask 0x%x\n",
+ (unsigned)pci_resource_len(rdev->lldi.pdev, 2),
+ (void *)pci_resource_start(rdev->lldi.pdev, 2),
+ rdev->lldi.db_reg,
+ rdev->lldi.gts_reg,
+ rdev->qpshift, rdev->qpmask,
+ rdev->cqshift, rdev->cqmask);
+
+ if (c4iw_num_stags(rdev) == 0) {
+ err = -EINVAL;
+ goto err1;
+ }
+
+ rdev->stats.pd.total = T4_MAX_NUM_PD;
+ rdev->stats.stag.total = rdev->lldi.vr->stag.size;
+ rdev->stats.pbl.total = rdev->lldi.vr->pbl.size;
+ rdev->stats.rqt.total = rdev->lldi.vr->rq.size;
+ rdev->stats.ocqp.total = rdev->lldi.vr->ocq.size;
+ rdev->stats.qid.total = rdev->lldi.vr->qp.size;
+
+ err = c4iw_init_resource(rdev, c4iw_num_stags(rdev), T4_MAX_NUM_PD);
+ if (err) {
+ printk(KERN_ERR MOD "error %d initializing resources\n", err);
+ goto err1;
+ }
+ err = c4iw_pblpool_create(rdev);
+ if (err) {
+ printk(KERN_ERR MOD "error %d initializing pbl pool\n", err);
+ goto err2;
+ }
+ err = c4iw_rqtpool_create(rdev);
+ if (err) {
+ printk(KERN_ERR MOD "error %d initializing rqt pool\n", err);
+ goto err3;
+ }
+ err = c4iw_ocqp_pool_create(rdev);
+ if (err) {
+ printk(KERN_ERR MOD "error %d initializing ocqp pool\n", err);
+ goto err4;
+ }
+ rdev->status_page = (struct t4_dev_status_page *)
+ __get_free_page(GFP_KERNEL);
+ if (!rdev->status_page) {
+ pr_err(MOD "error allocating status page\n");
+ err = -ENOMEM;
+ goto err4;
+ }
+
+ if (c4iw_wr_log) {
+ rdev->wr_log = kzalloc((1 << c4iw_wr_log_size_order) *
+ sizeof(*rdev->wr_log), GFP_KERNEL);
+ if (rdev->wr_log) {
+ rdev->wr_log_size = 1 << c4iw_wr_log_size_order;
+ atomic_set(&rdev->wr_log_idx, 0);
+ } else {
+ pr_err(MOD "error allocating wr_log. Logging disabled\n");
+ }
+ }
+
+ rdev->status_page->db_off = 0;
+
+ return 0;
+err4:
+ c4iw_rqtpool_destroy(rdev);
+err3:
+ c4iw_pblpool_destroy(rdev);
+err2:
+ c4iw_destroy_resource(&rdev->resource);
+err1:
+ return err;
+}
+
+static void c4iw_rdev_close(struct c4iw_rdev *rdev)
+{
+ kfree(rdev->wr_log);
+ free_page((unsigned long)rdev->status_page);
+ c4iw_pblpool_destroy(rdev);
+ c4iw_rqtpool_destroy(rdev);
+ c4iw_destroy_resource(&rdev->resource);
+}
+
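+/*
+ * Undo c4iw_alloc(): close the rdev, destroy the idr tables, unmap
+ * BAR2/on-chip memory and free the ib_device.
+ */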
+static void c4iw_dealloc(struct uld_ctx *ctx)
+{
+ c4iw_rdev_close(&ctx->dev->rdev);
+ idr_destroy(&ctx->dev->cqidr);
+ idr_destroy(&ctx->dev->qpidr);
+ idr_destroy(&ctx->dev->mmidr);
+ idr_destroy(&ctx->dev->hwtid_idr);
+ idr_destroy(&ctx->dev->stid_idr);
+ idr_destroy(&ctx->dev->atid_idr);
+ if (ctx->dev->rdev.bar2_kva)
+ iounmap(ctx->dev->rdev.bar2_kva);
+ if (ctx->dev->rdev.oc_mw_kva)
+ iounmap(ctx->dev->rdev.oc_mw_kva);
+ ib_dealloc_device(&ctx->dev->ibdev);
+ ctx->dev = NULL;
+}
+
+static void c4iw_remove(struct uld_ctx *ctx)
+{
+ PDBG("%s c4iw_dev %p\n", __func__, ctx->dev);
+ c4iw_unregister_device(ctx->dev);
+ c4iw_dealloc(ctx);
+}
+
+static int rdma_supported(const struct cxgb4_lld_info *infop)
+{
+ return infop->vr->stag.size > 0 && infop->vr->pbl.size > 0 &&
+ infop->vr->rq.size > 0 && infop->vr->qp.size > 0 &&
+ infop->vr->cq.size > 0;
+}
+
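+/*
+ * Allocate the ib_device for one adapter, size the HW queue limits from
+ * the LLD info, map BAR2 (T5) or the on-chip queue window (T4), and
+ * open the rdev.
+ */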
+static struct c4iw_dev *c4iw_alloc(const struct cxgb4_lld_info *infop)
+{
+ struct c4iw_dev *devp;
+ int ret;
+
+ if (!rdma_supported(infop)) {
+ printk(KERN_INFO MOD "%s: RDMA not supported on this device.\n",
+ pci_name(infop->pdev));
+ return ERR_PTR(-ENOSYS);
+ }
+ if (!ocqp_supported(infop))
+ pr_info("%s: On-Chip Queues not supported on this device.\n",
+ pci_name(infop->pdev));
+
+ devp = (struct c4iw_dev *)ib_alloc_device(sizeof(*devp));
+ if (!devp) {
+ printk(KERN_ERR MOD "Cannot allocate ib device\n");
+ return ERR_PTR(-ENOMEM);
+ }
+ devp->rdev.lldi = *infop;
+
+ /* init various hw-queue params based on lld info */
+ PDBG("%s: Ing. padding boundary is %d, egrsstatuspagesize = %d\n",
+ __func__, devp->rdev.lldi.sge_ingpadboundary,
+ devp->rdev.lldi.sge_egrstatuspagesize);
+
+ devp->rdev.hw_queue.t4_eq_status_entries =
+ devp->rdev.lldi.sge_ingpadboundary > 64 ? 2 : 1;
+ devp->rdev.hw_queue.t4_max_eq_size = 65520;
+ devp->rdev.hw_queue.t4_max_iq_size = 65520;
+ devp->rdev.hw_queue.t4_max_rq_size = 8192 -
+ devp->rdev.hw_queue.t4_eq_status_entries - 1;
+ devp->rdev.hw_queue.t4_max_sq_size =
+ devp->rdev.hw_queue.t4_max_eq_size -
+ devp->rdev.hw_queue.t4_eq_status_entries - 1;
+ devp->rdev.hw_queue.t4_max_qp_depth =
+ devp->rdev.hw_queue.t4_max_rq_size;
+ devp->rdev.hw_queue.t4_max_cq_depth =
+ devp->rdev.hw_queue.t4_max_iq_size - 2;
+ devp->rdev.hw_queue.t4_stat_len =
+ devp->rdev.lldi.sge_egrstatuspagesize;
+
+ /*
+ * For T5 devices, we map all of BAR2 with WC.
+ * For T4 devices with onchip qp mem, we map only that part
+ * of BAR2 with WC.
+ */
+ devp->rdev.bar2_pa = pci_resource_start(devp->rdev.lldi.pdev, 2);
+ if (is_t5(devp->rdev.lldi.adapter_type)) {
+ devp->rdev.bar2_kva = ioremap_wc(devp->rdev.bar2_pa,
+ pci_resource_len(devp->rdev.lldi.pdev, 2));
+ if (!devp->rdev.bar2_kva) {
+ pr_err(MOD "Unable to ioremap BAR2\n");
+ ib_dealloc_device(&devp->ibdev);
+ return ERR_PTR(-EINVAL);
+ }
+ } else if (ocqp_supported(infop)) {
+ devp->rdev.oc_mw_pa =
+ pci_resource_start(devp->rdev.lldi.pdev, 2) +
+ pci_resource_len(devp->rdev.lldi.pdev, 2) -
+ roundup_pow_of_two(devp->rdev.lldi.vr->ocq.size);
+ devp->rdev.oc_mw_kva = ioremap_wc(devp->rdev.oc_mw_pa,
+ devp->rdev.lldi.vr->ocq.size);
+ if (!devp->rdev.oc_mw_kva) {
+ pr_err(MOD "Unable to ioremap onchip mem\n");
+ ib_dealloc_device(&devp->ibdev);
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ PDBG("ocq memory: "
+ "hw_start 0x%x size %u mw_pa 0x%lx mw_kva %p\n",
+ devp->rdev.lldi.vr->ocq.start, devp->rdev.lldi.vr->ocq.size,
+ devp->rdev.oc_mw_pa, devp->rdev.oc_mw_kva);
+
+ ret = c4iw_rdev_open(&devp->rdev);
+ if (ret) {
+ printk(KERN_ERR MOD "Unable to open CXIO rdev err %d\n", ret);
+ ib_dealloc_device(&devp->ibdev);
+ return ERR_PTR(ret);
+ }
+
+ idr_init(&devp->cqidr);
+ idr_init(&devp->qpidr);
+ idr_init(&devp->mmidr);
+ idr_init(&devp->hwtid_idr);
+ idr_init(&devp->stid_idr);
+ idr_init(&devp->atid_idr);
+ spin_lock_init(&devp->lock);
+ mutex_init(&devp->rdev.stats.lock);
+ mutex_init(&devp->db_mutex);
+ INIT_LIST_HEAD(&devp->db_fc_list);
+ devp->avail_ird = devp->rdev.lldi.max_ird_adapter;
+
+ if (c4iw_debugfs_root) {
+ devp->debugfs_root = debugfs_create_dir(
+ pci_name(devp->rdev.lldi.pdev),
+ c4iw_debugfs_root);
+ setup_debugfs(devp);
+ }
+
+
+ return devp;
+}
+
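+/*
+ * ULD probe callback: record the LLD info for this adapter. The actual
+ * RDMA device is created later, when the CXGB4_STATE_UP event arrives.
+ */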
+static void *c4iw_uld_add(const struct cxgb4_lld_info *infop)
+{
+ struct uld_ctx *ctx;
+ static int vers_printed;
+ int i;
+
+ if (!vers_printed++)
+ pr_info("Chelsio T4/T5 RDMA Driver - version %s\n",
+ DRV_VERSION);
+
+ ctx = kzalloc(sizeof *ctx, GFP_KERNEL);
+ if (!ctx) {
+ ctx = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+ ctx->lldi = *infop;
+
+ PDBG("%s found device %s nchan %u nrxq %u ntxq %u nports %u\n",
+ __func__, pci_name(ctx->lldi.pdev),
+ ctx->lldi.nchan, ctx->lldi.nrxq,
+ ctx->lldi.ntxq, ctx->lldi.nports);
+
+ mutex_lock(&dev_mutex);
+ list_add_tail(&ctx->entry, &uld_ctx_list);
+ mutex_unlock(&dev_mutex);
+
+ for (i = 0; i < ctx->lldi.nrxq; i++)
+ PDBG("rxqid[%u] %u\n", i, ctx->lldi.rxq_ids[i]);
+out:
+ return ctx;
+}
+
+static inline struct sk_buff *copy_gl_to_skb_pkt(const struct pkt_gl *gl,
+ const __be64 *rsp,
+ u32 pktshift)
+{
+ struct sk_buff *skb;
+
+ /*
+ * Allocate space for the cpl_pass_accept_req that will be synthesized
+ * by the driver. Once the driver synthesizes the request, the skb goes
+ * through the regular cpl_pass_accept_req processing.
+ * The math here assumes sizeof cpl_pass_accept_req >= sizeof
+ * cpl_rx_pkt.
+ */
+ skb = alloc_skb(gl->tot_len + sizeof(struct cpl_pass_accept_req) +
+ sizeof(struct rss_header) - pktshift, GFP_ATOMIC);
+ if (unlikely(!skb))
+ return NULL;
+
+ __skb_put(skb, gl->tot_len + sizeof(struct cpl_pass_accept_req) +
+ sizeof(struct rss_header) - pktshift);
+
+ /*
+ * This skb will contain:
+ * rss_header from the rspq descriptor (1 flit)
+ * cpl_rx_pkt struct from the rspq descriptor (2 flits)
+ * space for the difference between the size of an
+ * rx_pkt and pass_accept_req cpl (1 flit)
+ * the packet data from the gl
+ */
+ skb_copy_to_linear_data(skb, rsp, sizeof(struct cpl_pass_accept_req) +
+ sizeof(struct rss_header));
+ skb_copy_to_linear_data_offset(skb, sizeof(struct rss_header) +
+ sizeof(struct cpl_pass_accept_req),
+ gl->va + pktshift,
+ gl->tot_len - pktshift);
+ return skb;
+}
+
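+/*
+ * If the ingress message is a CPL_RX_PKT, copy it into an skb with room
+ * for a synthesized cpl_pass_accept_req and hand it to the registered
+ * handler. Returns 1 if the message was consumed here.
+ */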
+static inline int recv_rx_pkt(struct c4iw_dev *dev, const struct pkt_gl *gl,
+ const __be64 *rsp)
+{
+ unsigned int opcode = *(u8 *)rsp;
+ struct sk_buff *skb;
+
+ if (opcode != CPL_RX_PKT)
+ goto out;
+
+ skb = copy_gl_to_skb_pkt(gl, rsp, dev->rdev.lldi.sge_pktshift);
+ if (skb == NULL)
+ goto out;
+
+ if (c4iw_handlers[opcode] == NULL) {
+ pr_info("%s no handler opcode 0x%x...\n", __func__,
+ opcode);
+ kfree_skb(skb);
+ goto out;
+ }
+ c4iw_handlers[opcode](dev, skb);
+ return 1;
+out:
+ return 0;
+}
+
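+/*
+ * ULD rx callback: handle inline-only responses, async notifications
+ * (CXGB4_MSG_AN) and free-list packets, then dispatch the CPL to its
+ * registered handler.
+ */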
+static int c4iw_uld_rx_handler(void *handle, const __be64 *rsp,
+ const struct pkt_gl *gl)
+{
+ struct uld_ctx *ctx = handle;
+ struct c4iw_dev *dev = ctx->dev;
+ struct sk_buff *skb;
+ u8 opcode;
+
+ if (gl == NULL) {
+ /* omit RSS and rsp_ctrl at end of descriptor */
+ unsigned int len = 64 - sizeof(struct rsp_ctrl) - 8;
+
+ skb = alloc_skb(256, GFP_ATOMIC);
+ if (!skb)
+ goto nomem;
+ __skb_put(skb, len);
+ skb_copy_to_linear_data(skb, &rsp[1], len);
+ } else if (gl == CXGB4_MSG_AN) {
+ const struct rsp_ctrl *rc = (void *)rsp;
+
+ u32 qid = be32_to_cpu(rc->pldbuflen_qid);
+ c4iw_ev_handler(dev, qid);
+ return 0;
+ } else if (unlikely(*(u8 *)rsp != *(u8 *)gl->va)) {
+ if (recv_rx_pkt(dev, gl, rsp))
+ return 0;
+
+ pr_info("%s: unexpected FL contents at %p, " \
+ "RSS %#llx, FL %#llx, len %u\n",
+ pci_name(ctx->lldi.pdev), gl->va,
+ (unsigned long long)be64_to_cpu(*rsp),
+ (unsigned long long)be64_to_cpu(
+ *(__force __be64 *)gl->va),
+ gl->tot_len);
+
+ return 0;
+ } else {
+ skb = cxgb4_pktgl_to_skb(gl, 128, 128);
+ if (unlikely(!skb))
+ goto nomem;
+ }
+
+ opcode = *(u8 *)rsp;
+ if (c4iw_handlers[opcode]) {
+ c4iw_handlers[opcode](dev, skb);
+ } else {
+ pr_info("%s no handler opcode 0x%x...\n", __func__,
+ opcode);
+ kfree_skb(skb);
+ }
+
+ return 0;
+nomem:
+ return -1;
+}
+
+static int c4iw_uld_state_change(void *handle, enum cxgb4_state new_state)
+{
+ struct uld_ctx *ctx = handle;
+
+ PDBG("%s new_state %u\n", __func__, new_state);
+ switch (new_state) {
+ case CXGB4_STATE_UP:
+ printk(KERN_INFO MOD "%s: Up\n", pci_name(ctx->lldi.pdev));
+ if (!ctx->dev) {
+ int ret;
+
+ ctx->dev = c4iw_alloc(&ctx->lldi);
+ if (IS_ERR(ctx->dev)) {
+ printk(KERN_ERR MOD
+ "%s: initialization failed: %ld\n",
+ pci_name(ctx->lldi.pdev),
+ PTR_ERR(ctx->dev));
+ ctx->dev = NULL;
+ break;
+ }
+ ret = c4iw_register_device(ctx->dev);
+ if (ret) {
+ printk(KERN_ERR MOD
+ "%s: RDMA registration failed: %d\n",
+ pci_name(ctx->lldi.pdev), ret);
+ c4iw_dealloc(ctx);
+ }
+ }
+ break;
+ case CXGB4_STATE_DOWN:
+ printk(KERN_INFO MOD "%s: Down\n",
+ pci_name(ctx->lldi.pdev));
+ if (ctx->dev)
+ c4iw_remove(ctx);
+ break;
+ case CXGB4_STATE_START_RECOVERY:
+ printk(KERN_INFO MOD "%s: Fatal Error\n",
+ pci_name(ctx->lldi.pdev));
+ if (ctx->dev) {
+ struct ib_event event;
+
+ ctx->dev->rdev.flags |= T4_FATAL_ERROR;
+ memset(&event, 0, sizeof event);
+ event.event = IB_EVENT_DEVICE_FATAL;
+ event.device = &ctx->dev->ibdev;
+ ib_dispatch_event(&event);
+ c4iw_remove(ctx);
+ }
+ break;
+ case CXGB4_STATE_DETACH:
+ printk(KERN_INFO MOD "%s: Detach\n",
+ pci_name(ctx->lldi.pdev));
+ if (ctx->dev)
+ c4iw_remove(ctx);
+ break;
+ }
+ return 0;
+}
+
+static int disable_qp_db(int id, void *p, void *data)
+{
+ struct c4iw_qp *qp = p;
+
+ t4_disable_wq_db(&qp->wq);
+ return 0;
+}
+
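+/*
+ * The doorbell FIFO is full: mark the device STOPPED and disable user
+ * doorbells (or set the status-page db_off flag) until it drains.
+ */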
+static void stop_queues(struct uld_ctx *ctx)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ctx->dev->lock, flags);
+ ctx->dev->rdev.stats.db_state_transitions++;
+ ctx->dev->db_state = STOPPED;
+ if (ctx->dev->rdev.flags & T4_STATUS_PAGE_DISABLED)
+ idr_for_each(&ctx->dev->qpidr, disable_qp_db, NULL);
+ else
+ ctx->dev->rdev.status_page->db_off = 1;
+ spin_unlock_irqrestore(&ctx->dev->lock, flags);
+}
+
+static int enable_qp_db(int id, void *p, void *data)
+{
+ struct c4iw_qp *qp = p;
+
+ t4_enable_wq_db(&qp->wq);
+ return 0;
+}
+
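+/*
+ * Ring any SQ/RQ doorbells that were deferred while this QP sat on the
+ * flow-control list.
+ */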
+static void resume_rc_qp(struct c4iw_qp *qp)
+{
+ spin_lock(&qp->lock);
+ t4_ring_sq_db(&qp->wq, qp->wq.sq.wq_pidx_inc,
+ is_t5(qp->rhp->rdev.lldi.adapter_type), NULL);
+ qp->wq.sq.wq_pidx_inc = 0;
+ t4_ring_rq_db(&qp->wq, qp->wq.rq.wq_pidx_inc,
+ is_t5(qp->rhp->rdev.lldi.adapter_type), NULL);
+ qp->wq.rq.wq_pidx_inc = 0;
+ spin_unlock(&qp->lock);
+}
+
+static void resume_a_chunk(struct uld_ctx *ctx)
+{
+ int i;
+ struct c4iw_qp *qp;
+
+ for (i = 0; i < DB_FC_RESUME_SIZE; i++) {
+ qp = list_first_entry(&ctx->dev->db_fc_list, struct c4iw_qp,
+ db_fc_entry);
+ list_del_init(&qp->db_fc_entry);
+ resume_rc_qp(qp);
+ if (list_empty(&ctx->dev->db_fc_list))
+ break;
+ }
+}
+
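+/*
+ * Leave the STOPPED state: drain db_fc_list in chunks while the
+ * doorbell FIFO has room, then re-enable user doorbells (or clear
+ * db_off) and return to NORMAL.
+ */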
+static void resume_queues(struct uld_ctx *ctx)
+{
+ spin_lock_irq(&ctx->dev->lock);
+ if (ctx->dev->db_state != STOPPED)
+ goto out;
+ ctx->dev->db_state = FLOW_CONTROL;
+ while (1) {
+ if (list_empty(&ctx->dev->db_fc_list)) {
+ WARN_ON(ctx->dev->db_state != FLOW_CONTROL);
+ ctx->dev->db_state = NORMAL;
+ ctx->dev->rdev.stats.db_state_transitions++;
+ if (ctx->dev->rdev.flags & T4_STATUS_PAGE_DISABLED) {
+ idr_for_each(&ctx->dev->qpidr, enable_qp_db,
+ NULL);
+ } else {
+ ctx->dev->rdev.status_page->db_off = 0;
+ }
+ break;
+ } else {
+ if (cxgb4_dbfifo_count(ctx->dev->rdev.lldi.ports[0], 1)
+ < (ctx->dev->rdev.lldi.dbfifo_int_thresh <<
+ DB_FC_DRAIN_THRESH)) {
+ resume_a_chunk(ctx);
+ }
+ if (!list_empty(&ctx->dev->db_fc_list)) {
+ spin_unlock_irq(&ctx->dev->lock);
+ if (DB_FC_RESUME_DELAY) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(DB_FC_RESUME_DELAY);
+ }
+ spin_lock_irq(&ctx->dev->lock);
+ if (ctx->dev->db_state != FLOW_CONTROL)
+ break;
+ }
+ }
+ }
+out:
+ if (ctx->dev->db_state != NORMAL)
+ ctx->dev->rdev.stats.db_fc_interruptions++;
+ spin_unlock_irq(&ctx->dev->lock);
+}
+
+struct qp_list {
+ unsigned idx;
+ struct c4iw_qp **qps;
+};
+
+static int add_and_ref_qp(int id, void *p, void *data)
+{
+ struct qp_list *qp_listp = data;
+ struct c4iw_qp *qp = p;
+
+ c4iw_qp_add_ref(&qp->ibqp);
+ qp_listp->qps[qp_listp->idx++] = qp;
+ return 0;
+}
+
+static int count_qps(int id, void *p, void *data)
+{
+ unsigned *countp = data;
+ (*countp)++;
+ return 0;
+}
+
+static void deref_qps(struct qp_list *qp_list)
+{
+ int idx;
+
+ for (idx = 0; idx < qp_list->idx; idx++)
+ c4iw_qp_rem_ref(&qp_list->qps[idx]->ibqp);
+}
+
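+/*
+ * Resync each QP's SQ and RQ producer index with the hardware and wait
+ * for the doorbell FIFO to drain after every QP.
+ */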
+static void recover_lost_dbs(struct uld_ctx *ctx, struct qp_list *qp_list)
+{
+ int idx;
+ int ret;
+
+ for (idx = 0; idx < qp_list->idx; idx++) {
+ struct c4iw_qp *qp = qp_list->qps[idx];
+
+ spin_lock_irq(&qp->rhp->lock);
+ spin_lock(&qp->lock);
+ ret = cxgb4_sync_txq_pidx(qp->rhp->rdev.lldi.ports[0],
+ qp->wq.sq.qid,
+ t4_sq_host_wq_pidx(&qp->wq),
+ t4_sq_wq_size(&qp->wq));
+ if (ret) {
+ pr_err(MOD "%s: Fatal error - "
+ "DB overflow recovery failed - "
+ "error syncing SQ qid %u\n",
+ pci_name(ctx->lldi.pdev), qp->wq.sq.qid);
+ spin_unlock(&qp->lock);
+ spin_unlock_irq(&qp->rhp->lock);
+ return;
+ }
+ qp->wq.sq.wq_pidx_inc = 0;
+
+ ret = cxgb4_sync_txq_pidx(qp->rhp->rdev.lldi.ports[0],
+ qp->wq.rq.qid,
+ t4_rq_host_wq_pidx(&qp->wq),
+ t4_rq_wq_size(&qp->wq));
+
+ if (ret) {
+ pr_err(MOD "%s: Fatal error - "
+ "DB overflow recovery failed - "
+ "error syncing RQ qid %u\n",
+ pci_name(ctx->lldi.pdev), qp->wq.rq.qid);
+ spin_unlock(&qp->lock);
+ spin_unlock_irq(&qp->rhp->lock);
+ return;
+ }
+ qp->wq.rq.wq_pidx_inc = 0;
+ spin_unlock(&qp->lock);
+ spin_unlock_irq(&qp->rhp->lock);
+
+ /* Wait for the dbfifo to drain */
+ while (cxgb4_dbfifo_count(qp->rhp->rdev.lldi.ports[0], 1) > 0) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(usecs_to_jiffies(10));
+ }
+ }
+}
+
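+/*
+ * Doorbell-drop recovery: flush the SGE EQ cache, take a reference on
+ * every QP, resync their doorbell state outside the lock, then leave
+ * the device STOPPED so a later DB_EMPTY notification can resume it.
+ */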
+static void recover_queues(struct uld_ctx *ctx)
+{
+ int count = 0;
+ struct qp_list qp_list;
+ int ret;
+
+ /* slow everybody down */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(usecs_to_jiffies(1000));
+
+ /* flush the SGE contexts */
+ ret = cxgb4_flush_eq_cache(ctx->dev->rdev.lldi.ports[0]);
+ if (ret) {
+ printk(KERN_ERR MOD "%s: Fatal error - DB overflow recovery failed\n",
+ pci_name(ctx->lldi.pdev));
+ return;
+ }
+
+ /* Count active queues so we can build a list of queues to recover */
+ spin_lock_irq(&ctx->dev->lock);
+ WARN_ON(ctx->dev->db_state != STOPPED);
+ ctx->dev->db_state = RECOVERY;
+ idr_for_each(&ctx->dev->qpidr, count_qps, &count);
+
+ qp_list.qps = kzalloc(count * sizeof *qp_list.qps, GFP_ATOMIC);
+ if (!qp_list.qps) {
+ printk(KERN_ERR MOD "%s: Fatal error - DB overflow recovery failed\n",
+ pci_name(ctx->lldi.pdev));
+ spin_unlock_irq(&ctx->dev->lock);
+ return;
+ }
+ qp_list.idx = 0;
+
+ /* add and ref each qp so it doesn't get freed */
+ idr_for_each(&ctx->dev->qpidr, add_and_ref_qp, &qp_list);
+
+ spin_unlock_irq(&ctx->dev->lock);
+
+ /* now traverse the list in a safe context to recover the db state*/
+ recover_lost_dbs(ctx, &qp_list);
+
+ /* we're almost done! deref the qps and clean up */
+ deref_qps(&qp_list);
+ kfree(qp_list.qps);
+
+ spin_lock_irq(&ctx->dev->lock);
+ WARN_ON(ctx->dev->db_state != RECOVERY);
+ ctx->dev->db_state = STOPPED;
+ spin_unlock_irq(&ctx->dev->lock);
+}
+
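+/*
+ * Doorbell status notifications from cxgb4: stop, resume or recover the
+ * queues and bump the matching counter.
+ */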
+static int c4iw_uld_control(void *handle, enum cxgb4_control control, ...)
+{
+ struct uld_ctx *ctx = handle;
+
+ switch (control) {
+ case CXGB4_CONTROL_DB_FULL:
+ stop_queues(ctx);
+ ctx->dev->rdev.stats.db_full++;
+ break;
+ case CXGB4_CONTROL_DB_EMPTY:
+ resume_queues(ctx);
+ mutex_lock(&ctx->dev->rdev.stats.lock);
+ ctx->dev->rdev.stats.db_empty++;
+ mutex_unlock(&ctx->dev->rdev.stats.lock);
+ break;
+ case CXGB4_CONTROL_DB_DROP:
+ recover_queues(ctx);
+ mutex_lock(&ctx->dev->rdev.stats.lock);
+ ctx->dev->rdev.stats.db_drop++;
+ mutex_unlock(&ctx->dev->rdev.stats.lock);
+ break;
+ default:
+ printk(KERN_WARNING MOD "%s: unknown control cmd %u\n",
+ pci_name(ctx->lldi.pdev), control);
+ break;
+ }
+ return 0;
+}
+
+static struct cxgb4_uld_info c4iw_uld_info = {
+ .name = DRV_NAME,
+ .add = c4iw_uld_add,
+ .rx_handler = c4iw_uld_rx_handler,
+ .state_change = c4iw_uld_state_change,
+ .control = c4iw_uld_control,
+};
+
+static int __init c4iw_init_module(void)
+{
+ int err;
+
+ err = c4iw_cm_init();
+ if (err)
+ return err;
+
+ c4iw_debugfs_root = debugfs_create_dir(DRV_NAME, NULL);
+ if (!c4iw_debugfs_root)
+ printk(KERN_WARNING MOD
+ "could not create debugfs entry, continuing\n");
+
+ if (ibnl_add_client(RDMA_NL_C4IW, RDMA_NL_IWPM_NUM_OPS,
+ c4iw_nl_cb_table))
+ pr_err("%s[%u]: Failed to add netlink callback\n"
+ , __func__, __LINE__);
+
+ err = iwpm_init(RDMA_NL_C4IW);
+ if (err) {
+ pr_err("port mapper initialization failed with %d\n", err);
+ ibnl_remove_client(RDMA_NL_C4IW);
+ c4iw_cm_term();
+ debugfs_remove_recursive(c4iw_debugfs_root);
+ return err;
+ }
+
+ cxgb4_register_uld(CXGB4_ULD_RDMA, &c4iw_uld_info);
+
+ return 0;
+}
+
+static void __exit c4iw_exit_module(void)
+{
+ struct uld_ctx *ctx, *tmp;
+
+ mutex_lock(&dev_mutex);
+ list_for_each_entry_safe(ctx, tmp, &uld_ctx_list, entry) {
+ if (ctx->dev)
+ c4iw_remove(ctx);
+ kfree(ctx);
+ }
+ mutex_unlock(&dev_mutex);
+ cxgb4_unregister_uld(CXGB4_ULD_RDMA);
+ iwpm_exit(RDMA_NL_C4IW);
+ ibnl_remove_client(RDMA_NL_C4IW);
+ c4iw_cm_term();
+ debugfs_remove_recursive(c4iw_debugfs_root);
+}
+
+module_init(c4iw_init_module);
+module_exit(c4iw_exit_module);
diff --git a/drivers/infiniband/hw/cxgb4/ev.c b/drivers/infiniband/hw/cxgb4/ev.c
new file mode 100644
index 000000000..bdfac2ccb
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/ev.c
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/slab.h>
+#include <linux/mman.h>
+#include <net/sock.h>
+
+#include "iw_cxgb4.h"
+
+static void print_tpte(struct c4iw_dev *dev, u32 stag)
+{
+ int ret;
+ struct fw_ri_tpte tpte;
+
+ ret = cxgb4_read_tpte(dev->rdev.lldi.ports[0], stag,
+ (__be32 *)&tpte);
+ if (ret) {
+ dev_err(&dev->rdev.lldi.pdev->dev,
+ "%s cxgb4_read_tpte err %d\n", __func__, ret);
+ return;
+ }
+ PDBG("stag idx 0x%x valid %d key 0x%x state %d pdid %d "
+ "perm 0x%x ps %d len 0x%llx va 0x%llx\n",
+ stag & 0xffffff00,
+ FW_RI_TPTE_VALID_G(ntohl(tpte.valid_to_pdid)),
+ FW_RI_TPTE_STAGKEY_G(ntohl(tpte.valid_to_pdid)),
+ FW_RI_TPTE_STAGSTATE_G(ntohl(tpte.valid_to_pdid)),
+ FW_RI_TPTE_PDID_G(ntohl(tpte.valid_to_pdid)),
+ FW_RI_TPTE_PERM_G(ntohl(tpte.locread_to_qpid)),
+ FW_RI_TPTE_PS_G(ntohl(tpte.locread_to_qpid)),
+ ((u64)ntohl(tpte.len_hi) << 32) | ntohl(tpte.len_lo),
+ ((u64)ntohl(tpte.va_hi) << 32) | ntohl(tpte.va_lo_fbo));
+}
+
+static void dump_err_cqe(struct c4iw_dev *dev, struct t4_cqe *err_cqe)
+{
+ __be64 *p = (void *)err_cqe;
+
+ dev_err(&dev->rdev.lldi.pdev->dev,
+ "AE qpid %d opcode %d status 0x%x "
+ "type %d len 0x%x wrid.hi 0x%x wrid.lo 0x%x\n",
+ CQE_QPID(err_cqe), CQE_OPCODE(err_cqe),
+ CQE_STATUS(err_cqe), CQE_TYPE(err_cqe), ntohl(err_cqe->len),
+ CQE_WRID_HI(err_cqe), CQE_WRID_LOW(err_cqe));
+
+ PDBG("%016llx %016llx %016llx %016llx\n",
+ be64_to_cpu(p[0]), be64_to_cpu(p[1]), be64_to_cpu(p[2]),
+ be64_to_cpu(p[3]));
+
+ /*
+ * Ingress WRITE and READ_RESP errors provide
+ * the offending stag, so parse and log it.
+ */
+ if (RQ_TYPE(err_cqe) && (CQE_OPCODE(err_cqe) == FW_RI_RDMA_WRITE ||
+ CQE_OPCODE(err_cqe) == FW_RI_READ_RESP))
+ print_tpte(dev, CQE_WRID_STAG(err_cqe));
+}
+
+static void post_qp_event(struct c4iw_dev *dev, struct c4iw_cq *chp,
+ struct c4iw_qp *qhp,
+ struct t4_cqe *err_cqe,
+ enum ib_event_type ib_event)
+{
+ struct ib_event event;
+ struct c4iw_qp_attributes attrs;
+ unsigned long flag;
+
+ dump_err_cqe(dev, err_cqe);
+
+ if (qhp->attr.state == C4IW_QP_STATE_RTS) {
+ attrs.next_state = C4IW_QP_STATE_TERMINATE;
+ c4iw_modify_qp(qhp->rhp, qhp, C4IW_QP_ATTR_NEXT_STATE,
+ &attrs, 0);
+ }
+
+ event.event = ib_event;
+ event.device = chp->ibcq.device;
+ if (ib_event == IB_EVENT_CQ_ERR)
+ event.element.cq = &chp->ibcq;
+ else
+ event.element.qp = &qhp->ibqp;
+ if (qhp->ibqp.event_handler)
+ (*qhp->ibqp.event_handler)(&event, qhp->ibqp.qp_context);
+
+ spin_lock_irqsave(&chp->comp_handler_lock, flag);
+ (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context);
+ spin_unlock_irqrestore(&chp->comp_handler_lock, flag);
+}
+
+void c4iw_ev_dispatch(struct c4iw_dev *dev, struct t4_cqe *err_cqe)
+{
+ struct c4iw_cq *chp;
+ struct c4iw_qp *qhp;
+ u32 cqid;
+
+ spin_lock_irq(&dev->lock);
+ qhp = get_qhp(dev, CQE_QPID(err_cqe));
+ if (!qhp) {
+ printk(KERN_ERR MOD "BAD AE qpid 0x%x opcode %d "
+ "status 0x%x type %d wrid.hi 0x%x wrid.lo 0x%x\n",
+ CQE_QPID(err_cqe),
+ CQE_OPCODE(err_cqe), CQE_STATUS(err_cqe),
+ CQE_TYPE(err_cqe), CQE_WRID_HI(err_cqe),
+ CQE_WRID_LOW(err_cqe));
+ spin_unlock_irq(&dev->lock);
+ goto out;
+ }
+
+ if (SQ_TYPE(err_cqe))
+ cqid = qhp->attr.scq;
+ else
+ cqid = qhp->attr.rcq;
+ chp = get_chp(dev, cqid);
+ if (!chp) {
+ printk(KERN_ERR MOD "BAD AE cqid 0x%x qpid 0x%x opcode %d "
+ "status 0x%x type %d wrid.hi 0x%x wrid.lo 0x%x\n",
+ cqid, CQE_QPID(err_cqe),
+ CQE_OPCODE(err_cqe), CQE_STATUS(err_cqe),
+ CQE_TYPE(err_cqe), CQE_WRID_HI(err_cqe),
+ CQE_WRID_LOW(err_cqe));
+ spin_unlock_irq(&dev->lock);
+ goto out;
+ }
+
+ c4iw_qp_add_ref(&qhp->ibqp);
+ atomic_inc(&chp->refcnt);
+ spin_unlock_irq(&dev->lock);
+
+ /* Bad incoming write */
+ if (RQ_TYPE(err_cqe) &&
+ (CQE_OPCODE(err_cqe) == FW_RI_RDMA_WRITE)) {
+ post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_QP_REQ_ERR);
+ goto done;
+ }
+
+ switch (CQE_STATUS(err_cqe)) {
+
+ /* Completion Events */
+ case T4_ERR_SUCCESS:
+ printk(KERN_ERR MOD "AE with status 0!\n");
+ break;
+
+ case T4_ERR_STAG:
+ case T4_ERR_PDID:
+ case T4_ERR_QPID:
+ case T4_ERR_ACCESS:
+ case T4_ERR_WRAP:
+ case T4_ERR_BOUND:
+ case T4_ERR_INVALIDATE_SHARED_MR:
+ case T4_ERR_INVALIDATE_MR_WITH_MW_BOUND:
+ post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_QP_ACCESS_ERR);
+ break;
+
+ /* Device Fatal Errors */
+ case T4_ERR_ECC:
+ case T4_ERR_ECC_PSTAG:
+ case T4_ERR_INTERNAL_ERR:
+ post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_DEVICE_FATAL);
+ break;
+
+ /* QP Fatal Errors */
+ case T4_ERR_OUT_OF_RQE:
+ case T4_ERR_PBL_ADDR_BOUND:
+ case T4_ERR_CRC:
+ case T4_ERR_MARKER:
+ case T4_ERR_PDU_LEN_ERR:
+ case T4_ERR_DDP_VERSION:
+ case T4_ERR_RDMA_VERSION:
+ case T4_ERR_OPCODE:
+ case T4_ERR_DDP_QUEUE_NUM:
+ case T4_ERR_MSN:
+ case T4_ERR_TBIT:
+ case T4_ERR_MO:
+ case T4_ERR_MSN_GAP:
+ case T4_ERR_MSN_RANGE:
+ case T4_ERR_RQE_ADDR_BOUND:
+ case T4_ERR_IRD_OVERFLOW:
+ post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_QP_FATAL);
+ break;
+
+ default:
+ printk(KERN_ERR MOD "Unknown T4 status 0x%x QPID 0x%x\n",
+ CQE_STATUS(err_cqe), qhp->wq.sq.qid);
+ post_qp_event(dev, chp, qhp, err_cqe, IB_EVENT_QP_FATAL);
+ break;
+ }
+done:
+ if (atomic_dec_and_test(&chp->refcnt))
+ wake_up(&chp->wait);
+ c4iw_qp_rem_ref(&qhp->ibqp);
+out:
+ return;
+}
+
+int c4iw_ev_handler(struct c4iw_dev *dev, u32 qid)
+{
+ struct c4iw_cq *chp;
+ unsigned long flag;
+
+ spin_lock_irqsave(&dev->lock, flag);
+ chp = get_chp(dev, qid);
+ if (chp) {
+ atomic_inc(&chp->refcnt);
+ spin_unlock_irqrestore(&dev->lock, flag);
+ t4_clear_cq_armed(&chp->cq);
+ spin_lock_irqsave(&chp->comp_handler_lock, flag);
+ (*chp->ibcq.comp_handler)(&chp->ibcq, chp->ibcq.cq_context);
+ spin_unlock_irqrestore(&chp->comp_handler_lock, flag);
+ if (atomic_dec_and_test(&chp->refcnt))
+ wake_up(&chp->wait);
+ } else {
+ PDBG("%s unknown cqid 0x%x\n", __func__, qid);
+ spin_unlock_irqrestore(&dev->lock, flag);
+ }
+ return 0;
+}
diff --git a/drivers/infiniband/hw/cxgb4/id_table.c b/drivers/infiniband/hw/cxgb4/id_table.c
new file mode 100644
index 000000000..0161ae6ad
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/id_table.c
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2011 Chelsio Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/kernel.h>
+#include <linux/random.h>
+#include "iw_cxgb4.h"
+
+#define RANDOM_SKIP 16
+
+/*
+ * Trivial bitmap-based allocator. If the random flag is set, the
+ * allocator is designed to:
+ * - pseudo-randomize the ids returned so they are not trivially predictable.
+ * - avoid reuse of recently used ids (at the expense of predictability).
+ */
+u32 c4iw_id_alloc(struct c4iw_id_table *alloc)
+{
+ unsigned long flags;
+ u32 obj;
+
+ spin_lock_irqsave(&alloc->lock, flags);
+
+ obj = find_next_zero_bit(alloc->table, alloc->max, alloc->last);
+ if (obj >= alloc->max)
+ obj = find_first_zero_bit(alloc->table, alloc->max);
+
+ if (obj < alloc->max) {
+ if (alloc->flags & C4IW_ID_TABLE_F_RANDOM)
+ alloc->last += prandom_u32() % RANDOM_SKIP;
+ else
+ alloc->last = obj + 1;
+ if (alloc->last >= alloc->max)
+ alloc->last = 0;
+ set_bit(obj, alloc->table);
+ obj += alloc->start;
+ } else
+ obj = -1;
+
+ spin_unlock_irqrestore(&alloc->lock, flags);
+ return obj;
+}
+
+void c4iw_id_free(struct c4iw_id_table *alloc, u32 obj)
+{
+ unsigned long flags;
+
+ obj -= alloc->start;
+ BUG_ON((int)obj < 0);
+
+ spin_lock_irqsave(&alloc->lock, flags);
+ clear_bit(obj, alloc->table);
+ spin_unlock_irqrestore(&alloc->lock, flags);
+}
+
+int c4iw_id_table_alloc(struct c4iw_id_table *alloc, u32 start, u32 num,
+ u32 reserved, u32 flags)
+{
+ int i;
+
+ alloc->start = start;
+ alloc->flags = flags;
+ if (flags & C4IW_ID_TABLE_F_RANDOM)
+ alloc->last = prandom_u32() % RANDOM_SKIP;
+ else
+ alloc->last = 0;
+ alloc->max = num;
+ spin_lock_init(&alloc->lock);
+ alloc->table = kmalloc(BITS_TO_LONGS(num) * sizeof(long),
+ GFP_KERNEL);
+ if (!alloc->table)
+ return -ENOMEM;
+
+ bitmap_zero(alloc->table, num);
+ if (!(alloc->flags & C4IW_ID_TABLE_F_EMPTY))
+ for (i = 0; i < reserved; ++i)
+ set_bit(i, alloc->table);
+
+ return 0;
+}
+
+void c4iw_id_table_free(struct c4iw_id_table *alloc)
+{
+ kfree(alloc->table);
+}
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
new file mode 100644
index 000000000..97bb5550a
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -0,0 +1,1042 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __IW_CXGB4_H__
+#define __IW_CXGB4_H__
+
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+#include <linux/completion.h>
+#include <linux/netdevice.h>
+#include <linux/sched.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/inet.h>
+#include <linux/wait.h>
+#include <linux/kref.h>
+#include <linux/timer.h>
+#include <linux/io.h>
+
+#include <asm/byteorder.h>
+
+#include <net/net_namespace.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/iw_cm.h>
+#include <rdma/rdma_netlink.h>
+#include <rdma/iw_portmap.h>
+
+#include "cxgb4.h"
+#include "cxgb4_uld.h"
+#include "l2t.h"
+#include "user.h"
+
+#define DRV_NAME "iw_cxgb4"
+#define MOD DRV_NAME ":"
+
+extern int c4iw_debug;
+#define PDBG(fmt, args...) \
+do { \
+ if (c4iw_debug) \
+ printk(MOD fmt, ## args); \
+} while (0)
+
+#include "t4.h"
+
+#define PBL_OFF(rdev_p, a) ((a) - (rdev_p)->lldi.vr->pbl.start)
+#define RQT_OFF(rdev_p, a) ((a) - (rdev_p)->lldi.vr->rq.start)
+
+static inline void *cplhdr(struct sk_buff *skb)
+{
+ return skb->data;
+}
+
+#define C4IW_ID_TABLE_F_RANDOM 1 /* Pseudo-randomize the ids returned */
+#define C4IW_ID_TABLE_F_EMPTY 2 /* Table is initially empty */
+
+struct c4iw_id_table {
+ u32 flags;
+ u32 start; /* logical minimal id */
+ u32 last; /* hint for find */
+ u32 max;
+ spinlock_t lock;
+ unsigned long *table;
+};
+
+struct c4iw_resource {
+ struct c4iw_id_table tpt_table;
+ struct c4iw_id_table qid_table;
+ struct c4iw_id_table pdid_table;
+};
+
+struct c4iw_qid_list {
+ struct list_head entry;
+ u32 qid;
+};
+
+struct c4iw_dev_ucontext {
+ struct list_head qpids;
+ struct list_head cqids;
+ struct mutex lock;
+};
+
+enum c4iw_rdev_flags {
+ T4_FATAL_ERROR = (1<<0),
+ T4_STATUS_PAGE_DISABLED = (1<<1),
+};
+
+struct c4iw_stat {
+ u64 total;
+ u64 cur;
+ u64 max;
+ u64 fail;
+};
+
+struct c4iw_stats {
+ struct mutex lock;
+ struct c4iw_stat qid;
+ struct c4iw_stat pd;
+ struct c4iw_stat stag;
+ struct c4iw_stat pbl;
+ struct c4iw_stat rqt;
+ struct c4iw_stat ocqp;
+ u64 db_full;
+ u64 db_empty;
+ u64 db_drop;
+ u64 db_state_transitions;
+ u64 db_fc_interruptions;
+ u64 tcam_full;
+ u64 act_ofld_conn_fails;
+ u64 pas_ofld_conn_fails;
+ u64 neg_adv;
+};
+
+struct c4iw_hw_queue {
+ int t4_eq_status_entries;
+ int t4_max_eq_size;
+ int t4_max_iq_size;
+ int t4_max_rq_size;
+ int t4_max_sq_size;
+ int t4_max_qp_depth;
+ int t4_max_cq_depth;
+ int t4_stat_len;
+};
+
+struct wr_log_entry {
+ struct timespec post_host_ts;
+ struct timespec poll_host_ts;
+ u64 post_sge_ts;
+ u64 cqe_sge_ts;
+ u64 poll_sge_ts;
+ u16 qid;
+ u16 wr_id;
+ u8 opcode;
+ u8 valid;
+};
+
+struct c4iw_rdev {
+ struct c4iw_resource resource;
+ unsigned long qpshift;
+ u32 qpmask;
+ unsigned long cqshift;
+ u32 cqmask;
+ struct c4iw_dev_ucontext uctx;
+ struct gen_pool *pbl_pool;
+ struct gen_pool *rqt_pool;
+ struct gen_pool *ocqp_pool;
+ u32 flags;
+ struct cxgb4_lld_info lldi;
+ unsigned long bar2_pa;
+ void __iomem *bar2_kva;
+ unsigned long oc_mw_pa;
+ void __iomem *oc_mw_kva;
+ struct c4iw_stats stats;
+ struct c4iw_hw_queue hw_queue;
+ struct t4_dev_status_page *status_page;
+ atomic_t wr_log_idx;
+ struct wr_log_entry *wr_log;
+ int wr_log_size;
+};
+
+static inline int c4iw_fatal_error(struct c4iw_rdev *rdev)
+{
+ return rdev->flags & T4_FATAL_ERROR;
+}
+
+static inline int c4iw_num_stags(struct c4iw_rdev *rdev)
+{
+ return (int)(rdev->lldi.vr->stag.size >> 5);
+}
+
+#define C4IW_WR_TO (60*HZ)
+
+struct c4iw_wr_wait {
+ struct completion completion;
+ int ret;
+};
+
+static inline void c4iw_init_wr_wait(struct c4iw_wr_wait *wr_waitp)
+{
+ wr_waitp->ret = 0;
+ init_completion(&wr_waitp->completion);
+}
+
+static inline void c4iw_wake_up(struct c4iw_wr_wait *wr_waitp, int ret)
+{
+ wr_waitp->ret = ret;
+ complete(&wr_waitp->completion);
+}
+
+static inline int c4iw_wait_for_reply(struct c4iw_rdev *rdev,
+ struct c4iw_wr_wait *wr_waitp,
+ u32 hwtid, u32 qpid,
+ const char *func)
+{
+ int ret;
+
+ if (c4iw_fatal_error(rdev)) {
+ wr_waitp->ret = -EIO;
+ goto out;
+ }
+
+ ret = wait_for_completion_timeout(&wr_waitp->completion, C4IW_WR_TO);
+ if (!ret) {
+ PDBG("%s - Device %s not responding (disabling device) - tid %u qpid %u\n",
+ func, pci_name(rdev->lldi.pdev), hwtid, qpid);
+ rdev->flags |= T4_FATAL_ERROR;
+ wr_waitp->ret = -EIO;
+ }
+out:
+ if (wr_waitp->ret)
+ PDBG("%s: FW reply %d tid %u qpid %u\n",
+ pci_name(rdev->lldi.pdev), wr_waitp->ret, hwtid, qpid);
+ return wr_waitp->ret;
+}
+
+enum db_state {
+ NORMAL = 0,
+ FLOW_CONTROL = 1,
+ RECOVERY = 2,
+ STOPPED = 3
+};
+
+struct c4iw_dev {
+ struct ib_device ibdev;
+ struct c4iw_rdev rdev;
+ u32 device_cap_flags;
+ struct idr cqidr;
+ struct idr qpidr;
+ struct idr mmidr;
+ spinlock_t lock;
+ struct mutex db_mutex;
+ struct dentry *debugfs_root;
+ enum db_state db_state;
+ struct idr hwtid_idr;
+ struct idr atid_idr;
+ struct idr stid_idr;
+ struct list_head db_fc_list;
+ u32 avail_ird;
+};
+
+static inline struct c4iw_dev *to_c4iw_dev(struct ib_device *ibdev)
+{
+ return container_of(ibdev, struct c4iw_dev, ibdev);
+}
+
+static inline struct c4iw_dev *rdev_to_c4iw_dev(struct c4iw_rdev *rdev)
+{
+ return container_of(rdev, struct c4iw_dev, rdev);
+}
+
+static inline struct c4iw_cq *get_chp(struct c4iw_dev *rhp, u32 cqid)
+{
+ return idr_find(&rhp->cqidr, cqid);
+}
+
+static inline struct c4iw_qp *get_qhp(struct c4iw_dev *rhp, u32 qpid)
+{
+ return idr_find(&rhp->qpidr, qpid);
+}
+
+static inline struct c4iw_mr *get_mhp(struct c4iw_dev *rhp, u32 mmid)
+{
+ return idr_find(&rhp->mmidr, mmid);
+}
+
+static inline int _insert_handle(struct c4iw_dev *rhp, struct idr *idr,
+ void *handle, u32 id, int lock)
+{
+ int ret;
+
+ if (lock) {
+ idr_preload(GFP_KERNEL);
+ spin_lock_irq(&rhp->lock);
+ }
+
+ ret = idr_alloc(idr, handle, id, id + 1, GFP_ATOMIC);
+
+ if (lock) {
+ spin_unlock_irq(&rhp->lock);
+ idr_preload_end();
+ }
+
+ BUG_ON(ret == -ENOSPC);
+ return ret < 0 ? ret : 0;
+}
+
+static inline int insert_handle(struct c4iw_dev *rhp, struct idr *idr,
+ void *handle, u32 id)
+{
+ return _insert_handle(rhp, idr, handle, id, 1);
+}
+
+static inline int insert_handle_nolock(struct c4iw_dev *rhp, struct idr *idr,
+ void *handle, u32 id)
+{
+ return _insert_handle(rhp, idr, handle, id, 0);
+}
+
+static inline void _remove_handle(struct c4iw_dev *rhp, struct idr *idr,
+ u32 id, int lock)
+{
+ if (lock)
+ spin_lock_irq(&rhp->lock);
+ idr_remove(idr, id);
+ if (lock)
+ spin_unlock_irq(&rhp->lock);
+}
+
+static inline void remove_handle(struct c4iw_dev *rhp, struct idr *idr, u32 id)
+{
+ _remove_handle(rhp, idr, id, 1);
+}
+
+static inline void remove_handle_nolock(struct c4iw_dev *rhp,
+ struct idr *idr, u32 id)
+{
+ _remove_handle(rhp, idr, id, 0);
+}
+
+extern uint c4iw_max_read_depth;
+
+static inline int cur_max_read_depth(struct c4iw_dev *dev)
+{
+ return min(dev->rdev.lldi.max_ordird_qp, c4iw_max_read_depth);
+}
+
+struct c4iw_pd {
+ struct ib_pd ibpd;
+ u32 pdid;
+ struct c4iw_dev *rhp;
+};
+
+static inline struct c4iw_pd *to_c4iw_pd(struct ib_pd *ibpd)
+{
+ return container_of(ibpd, struct c4iw_pd, ibpd);
+}
+
+struct tpt_attributes {
+ u64 len;
+ u64 va_fbo;
+ enum fw_ri_mem_perms perms;
+ u32 stag;
+ u32 pdid;
+ u32 qpid;
+ u32 pbl_addr;
+ u32 pbl_size;
+ u32 state:1;
+ u32 type:2;
+ u32 rsvd:1;
+ u32 remote_invaliate_disable:1;
+ u32 zbva:1;
+ u32 mw_bind_enable:1;
+ u32 page_size:5;
+};
+
+struct c4iw_mr {
+ struct ib_mr ibmr;
+ struct ib_umem *umem;
+ struct c4iw_dev *rhp;
+ u64 kva;
+ struct tpt_attributes attr;
+};
+
+static inline struct c4iw_mr *to_c4iw_mr(struct ib_mr *ibmr)
+{
+ return container_of(ibmr, struct c4iw_mr, ibmr);
+}
+
+struct c4iw_mw {
+ struct ib_mw ibmw;
+ struct c4iw_dev *rhp;
+ u64 kva;
+ struct tpt_attributes attr;
+};
+
+static inline struct c4iw_mw *to_c4iw_mw(struct ib_mw *ibmw)
+{
+ return container_of(ibmw, struct c4iw_mw, ibmw);
+}
+
+struct c4iw_fr_page_list {
+ struct ib_fast_reg_page_list ibpl;
+ DEFINE_DMA_UNMAP_ADDR(mapping);
+ dma_addr_t dma_addr;
+ struct c4iw_dev *dev;
+ int pll_len;
+};
+
+static inline struct c4iw_fr_page_list *to_c4iw_fr_page_list(
+ struct ib_fast_reg_page_list *ibpl)
+{
+ return container_of(ibpl, struct c4iw_fr_page_list, ibpl);
+}
+
+struct c4iw_cq {
+ struct ib_cq ibcq;
+ struct c4iw_dev *rhp;
+ struct t4_cq cq;
+ spinlock_t lock;
+ spinlock_t comp_handler_lock;
+ atomic_t refcnt;
+ wait_queue_head_t wait;
+};
+
+static inline struct c4iw_cq *to_c4iw_cq(struct ib_cq *ibcq)
+{
+ return container_of(ibcq, struct c4iw_cq, ibcq);
+}
+
+struct c4iw_mpa_attributes {
+ u8 initiator;
+ u8 recv_marker_enabled;
+ u8 xmit_marker_enabled;
+ u8 crc_enabled;
+ u8 enhanced_rdma_conn;
+ u8 version;
+ u8 p2p_type;
+};
+
+struct c4iw_qp_attributes {
+ u32 scq;
+ u32 rcq;
+ u32 sq_num_entries;
+ u32 rq_num_entries;
+ u32 sq_max_sges;
+ u32 sq_max_sges_rdma_write;
+ u32 rq_max_sges;
+ u32 state;
+ u8 enable_rdma_read;
+ u8 enable_rdma_write;
+ u8 enable_bind;
+ u8 enable_mmid0_fastreg;
+ u32 max_ord;
+ u32 max_ird;
+ u32 pd;
+ u32 next_state;
+ char terminate_buffer[52];
+ u32 terminate_msg_len;
+ u8 is_terminate_local;
+ struct c4iw_mpa_attributes mpa_attr;
+ struct c4iw_ep *llp_stream_handle;
+ u8 layer_etype;
+ u8 ecode;
+ u16 sq_db_inc;
+ u16 rq_db_inc;
+ u8 send_term;
+};
+
+struct c4iw_qp {
+ struct ib_qp ibqp;
+ struct list_head db_fc_entry;
+ struct c4iw_dev *rhp;
+ struct c4iw_ep *ep;
+ struct c4iw_qp_attributes attr;
+ struct t4_wq wq;
+ spinlock_t lock;
+ struct mutex mutex;
+ atomic_t refcnt;
+ wait_queue_head_t wait;
+ struct timer_list timer;
+ int sq_sig_all;
+};
+
+static inline struct c4iw_qp *to_c4iw_qp(struct ib_qp *ibqp)
+{
+ return container_of(ibqp, struct c4iw_qp, ibqp);
+}
+
+struct c4iw_ucontext {
+ struct ib_ucontext ibucontext;
+ struct c4iw_dev_ucontext uctx;
+ u32 key;
+ spinlock_t mmap_lock;
+ struct list_head mmaps;
+};
+
+static inline struct c4iw_ucontext *to_c4iw_ucontext(struct ib_ucontext *c)
+{
+ return container_of(c, struct c4iw_ucontext, ibucontext);
+}
+
+struct c4iw_mm_entry {
+ struct list_head entry;
+ u64 addr;
+ u32 key;
+ unsigned len;
+};
+
+static inline struct c4iw_mm_entry *remove_mmap(struct c4iw_ucontext *ucontext,
+ u32 key, unsigned len)
+{
+ struct list_head *pos, *nxt;
+ struct c4iw_mm_entry *mm;
+
+ spin_lock(&ucontext->mmap_lock);
+ list_for_each_safe(pos, nxt, &ucontext->mmaps) {
+
+ mm = list_entry(pos, struct c4iw_mm_entry, entry);
+ if (mm->key == key && mm->len == len) {
+ list_del_init(&mm->entry);
+ spin_unlock(&ucontext->mmap_lock);
+ PDBG("%s key 0x%x addr 0x%llx len %d\n", __func__,
+ key, (unsigned long long) mm->addr, mm->len);
+ return mm;
+ }
+ }
+ spin_unlock(&ucontext->mmap_lock);
+ return NULL;
+}
+
+static inline void insert_mmap(struct c4iw_ucontext *ucontext,
+ struct c4iw_mm_entry *mm)
+{
+ spin_lock(&ucontext->mmap_lock);
+ PDBG("%s key 0x%x addr 0x%llx len %d\n", __func__,
+ mm->key, (unsigned long long) mm->addr, mm->len);
+ list_add_tail(&mm->entry, &ucontext->mmaps);
+ spin_unlock(&ucontext->mmap_lock);
+}
+
+enum c4iw_qp_attr_mask {
+ C4IW_QP_ATTR_NEXT_STATE = 1 << 0,
+ C4IW_QP_ATTR_SQ_DB = 1<<1,
+ C4IW_QP_ATTR_RQ_DB = 1<<2,
+ C4IW_QP_ATTR_ENABLE_RDMA_READ = 1 << 7,
+ C4IW_QP_ATTR_ENABLE_RDMA_WRITE = 1 << 8,
+ C4IW_QP_ATTR_ENABLE_RDMA_BIND = 1 << 9,
+ C4IW_QP_ATTR_MAX_ORD = 1 << 11,
+ C4IW_QP_ATTR_MAX_IRD = 1 << 12,
+ C4IW_QP_ATTR_LLP_STREAM_HANDLE = 1 << 22,
+ C4IW_QP_ATTR_STREAM_MSG_BUFFER = 1 << 23,
+ C4IW_QP_ATTR_MPA_ATTR = 1 << 24,
+ C4IW_QP_ATTR_QP_CONTEXT_ACTIVATE = 1 << 25,
+ C4IW_QP_ATTR_VALID_MODIFY = (C4IW_QP_ATTR_ENABLE_RDMA_READ |
+ C4IW_QP_ATTR_ENABLE_RDMA_WRITE |
+ C4IW_QP_ATTR_MAX_ORD |
+ C4IW_QP_ATTR_MAX_IRD |
+ C4IW_QP_ATTR_LLP_STREAM_HANDLE |
+ C4IW_QP_ATTR_STREAM_MSG_BUFFER |
+ C4IW_QP_ATTR_MPA_ATTR |
+ C4IW_QP_ATTR_QP_CONTEXT_ACTIVATE)
+};
+
+int c4iw_modify_qp(struct c4iw_dev *rhp,
+ struct c4iw_qp *qhp,
+ enum c4iw_qp_attr_mask mask,
+ struct c4iw_qp_attributes *attrs,
+ int internal);
+
+enum c4iw_qp_state {
+ C4IW_QP_STATE_IDLE,
+ C4IW_QP_STATE_RTS,
+ C4IW_QP_STATE_ERROR,
+ C4IW_QP_STATE_TERMINATE,
+ C4IW_QP_STATE_CLOSING,
+ C4IW_QP_STATE_TOT
+};
+
+static inline int c4iw_convert_state(enum ib_qp_state ib_state)
+{
+ switch (ib_state) {
+ case IB_QPS_RESET:
+ case IB_QPS_INIT:
+ return C4IW_QP_STATE_IDLE;
+ case IB_QPS_RTS:
+ return C4IW_QP_STATE_RTS;
+ case IB_QPS_SQD:
+ return C4IW_QP_STATE_CLOSING;
+ case IB_QPS_SQE:
+ return C4IW_QP_STATE_TERMINATE;
+ case IB_QPS_ERR:
+ return C4IW_QP_STATE_ERROR;
+ default:
+ return -1;
+ }
+}
+
+static inline int to_ib_qp_state(int c4iw_qp_state)
+{
+ switch (c4iw_qp_state) {
+ case C4IW_QP_STATE_IDLE:
+ return IB_QPS_INIT;
+ case C4IW_QP_STATE_RTS:
+ return IB_QPS_RTS;
+ case C4IW_QP_STATE_CLOSING:
+ return IB_QPS_SQD;
+ case C4IW_QP_STATE_TERMINATE:
+ return IB_QPS_SQE;
+ case C4IW_QP_STATE_ERROR:
+ return IB_QPS_ERR;
+ }
+ return IB_QPS_ERR;
+}
+
+static inline u32 c4iw_ib_to_tpt_access(int a)
+{
+ return (a & IB_ACCESS_REMOTE_WRITE ? FW_RI_MEM_ACCESS_REM_WRITE : 0) |
+ (a & IB_ACCESS_REMOTE_READ ? FW_RI_MEM_ACCESS_REM_READ : 0) |
+ (a & IB_ACCESS_LOCAL_WRITE ? FW_RI_MEM_ACCESS_LOCAL_WRITE : 0) |
+ FW_RI_MEM_ACCESS_LOCAL_READ;
+}
+
+static inline u32 c4iw_ib_to_tpt_bind_access(int acc)
+{
+ return (acc & IB_ACCESS_REMOTE_WRITE ? FW_RI_MEM_ACCESS_REM_WRITE : 0) |
+ (acc & IB_ACCESS_REMOTE_READ ? FW_RI_MEM_ACCESS_REM_READ : 0);
+}
+
+enum c4iw_mmid_state {
+ C4IW_STAG_STATE_VALID,
+ C4IW_STAG_STATE_INVALID
+};
+
+#define C4IW_NODE_DESC "cxgb4 Chelsio Communications"
+
+#define MPA_KEY_REQ "MPA ID Req Frame"
+#define MPA_KEY_REP "MPA ID Rep Frame"
+
+#define MPA_MAX_PRIVATE_DATA 256
+#define MPA_ENHANCED_RDMA_CONN 0x10
+#define MPA_REJECT 0x20
+#define MPA_CRC 0x40
+#define MPA_MARKERS 0x80
+#define MPA_FLAGS_MASK 0xE0
+
+#define MPA_V2_PEER2PEER_MODEL 0x8000
+#define MPA_V2_ZERO_LEN_FPDU_RTR 0x4000
+#define MPA_V2_RDMA_WRITE_RTR 0x8000
+#define MPA_V2_RDMA_READ_RTR 0x4000
+#define MPA_V2_IRD_ORD_MASK 0x3FFF
+
+#define c4iw_put_ep(ep) { \
+ PDBG("put_ep (via %s:%u) ep %p refcnt %d\n", __func__, __LINE__, \
+ ep, atomic_read(&((ep)->kref.refcount))); \
+ WARN_ON(atomic_read(&((ep)->kref.refcount)) < 1); \
+ kref_put(&((ep)->kref), _c4iw_free_ep); \
+}
+
+#define c4iw_get_ep(ep) { \
+ PDBG("get_ep (via %s:%u) ep %p, refcnt %d\n", __func__, __LINE__, \
+ ep, atomic_read(&((ep)->kref.refcount))); \
+ kref_get(&((ep)->kref)); \
+}
+void _c4iw_free_ep(struct kref *kref);
+
+struct mpa_message {
+ u8 key[16];
+ u8 flags;
+ u8 revision;
+ __be16 private_data_size;
+ u8 private_data[0];
+};
+
+struct mpa_v2_conn_params {
+ __be16 ird;
+ __be16 ord;
+};
+
+struct terminate_message {
+ u8 layer_etype;
+ u8 ecode;
+ __be16 hdrct_rsvd;
+ u8 len_hdrs[0];
+};
+
+#define TERM_MAX_LENGTH (sizeof(struct terminate_message) + 2 + 18 + 28)
+
+enum c4iw_layers_types {
+ LAYER_RDMAP = 0x00,
+ LAYER_DDP = 0x10,
+ LAYER_MPA = 0x20,
+ RDMAP_LOCAL_CATA = 0x00,
+ RDMAP_REMOTE_PROT = 0x01,
+ RDMAP_REMOTE_OP = 0x02,
+ DDP_LOCAL_CATA = 0x00,
+ DDP_TAGGED_ERR = 0x01,
+ DDP_UNTAGGED_ERR = 0x02,
+ DDP_LLP = 0x03
+};
+
+enum c4iw_rdma_ecodes {
+ RDMAP_INV_STAG = 0x00,
+ RDMAP_BASE_BOUNDS = 0x01,
+ RDMAP_ACC_VIOL = 0x02,
+ RDMAP_STAG_NOT_ASSOC = 0x03,
+ RDMAP_TO_WRAP = 0x04,
+ RDMAP_INV_VERS = 0x05,
+ RDMAP_INV_OPCODE = 0x06,
+ RDMAP_STREAM_CATA = 0x07,
+ RDMAP_GLOBAL_CATA = 0x08,
+ RDMAP_CANT_INV_STAG = 0x09,
+ RDMAP_UNSPECIFIED = 0xff
+};
+
+enum c4iw_ddp_ecodes {
+ DDPT_INV_STAG = 0x00,
+ DDPT_BASE_BOUNDS = 0x01,
+ DDPT_STAG_NOT_ASSOC = 0x02,
+ DDPT_TO_WRAP = 0x03,
+ DDPT_INV_VERS = 0x04,
+ DDPU_INV_QN = 0x01,
+ DDPU_INV_MSN_NOBUF = 0x02,
+ DDPU_INV_MSN_RANGE = 0x03,
+ DDPU_INV_MO = 0x04,
+ DDPU_MSG_TOOBIG = 0x05,
+ DDPU_INV_VERS = 0x06
+};
+
+enum c4iw_mpa_ecodes {
+ MPA_CRC_ERR = 0x02,
+ MPA_MARKER_ERR = 0x03,
+ MPA_LOCAL_CATA = 0x05,
+ MPA_INSUFF_IRD = 0x06,
+ MPA_NOMATCH_RTR = 0x07,
+};
+
+enum c4iw_ep_state {
+ IDLE = 0,
+ LISTEN,
+ CONNECTING,
+ MPA_REQ_WAIT,
+ MPA_REQ_SENT,
+ MPA_REQ_RCVD,
+ MPA_REP_SENT,
+ FPDU_MODE,
+ ABORTING,
+ CLOSING,
+ MORIBUND,
+ DEAD,
+};
+
+enum c4iw_ep_flags {
+ PEER_ABORT_IN_PROGRESS = 0,
+ ABORT_REQ_IN_PROGRESS = 1,
+ RELEASE_RESOURCES = 2,
+ CLOSE_SENT = 3,
+ TIMEOUT = 4,
+ QP_REFERENCED = 5,
+ RELEASE_MAPINFO = 6,
+};
+
+enum c4iw_ep_history {
+ ACT_OPEN_REQ = 0,
+ ACT_OFLD_CONN = 1,
+ ACT_OPEN_RPL = 2,
+ ACT_ESTAB = 3,
+ PASS_ACCEPT_REQ = 4,
+ PASS_ESTAB = 5,
+ ABORT_UPCALL = 6,
+ ESTAB_UPCALL = 7,
+ CLOSE_UPCALL = 8,
+ ULP_ACCEPT = 9,
+ ULP_REJECT = 10,
+ TIMEDOUT = 11,
+ PEER_ABORT = 12,
+ PEER_CLOSE = 13,
+ CONNREQ_UPCALL = 14,
+ ABORT_CONN = 15,
+ DISCONN_UPCALL = 16,
+ EP_DISC_CLOSE = 17,
+ EP_DISC_ABORT = 18,
+ CONN_RPL_UPCALL = 19,
+ ACT_RETRY_NOMEM = 20,
+ ACT_RETRY_INUSE = 21
+};
+
+struct c4iw_ep_common {
+ struct iw_cm_id *cm_id;
+ struct c4iw_qp *qp;
+ struct c4iw_dev *dev;
+ enum c4iw_ep_state state;
+ struct kref kref;
+ struct mutex mutex;
+ struct sockaddr_storage local_addr;
+ struct sockaddr_storage remote_addr;
+ struct sockaddr_storage mapped_local_addr;
+ struct sockaddr_storage mapped_remote_addr;
+ struct c4iw_wr_wait wr_wait;
+ unsigned long flags;
+ unsigned long history;
+};
+
+struct c4iw_listen_ep {
+ struct c4iw_ep_common com;
+ unsigned int stid;
+ int backlog;
+};
+
+struct c4iw_ep_stats {
+ unsigned connect_neg_adv;
+ unsigned abort_neg_adv;
+};
+
+struct c4iw_ep {
+ struct c4iw_ep_common com;
+ struct c4iw_ep *parent_ep;
+ struct timer_list timer;
+ struct list_head entry;
+ unsigned int atid;
+ u32 hwtid;
+ u32 snd_seq;
+ u32 rcv_seq;
+ struct l2t_entry *l2t;
+ struct dst_entry *dst;
+ struct sk_buff *mpa_skb;
+ struct c4iw_mpa_attributes mpa_attr;
+ u8 mpa_pkt[sizeof(struct mpa_message) + MPA_MAX_PRIVATE_DATA];
+ unsigned int mpa_pkt_len;
+ u32 ird;
+ u32 ord;
+ u32 smac_idx;
+ u32 tx_chan;
+ u32 mtu;
+ u16 mss;
+ u16 emss;
+ u16 plen;
+ u16 rss_qid;
+ u16 txq_idx;
+ u16 ctrlq_idx;
+ u8 tos;
+ u8 retry_with_mpa_v1;
+ u8 tried_with_mpa_v1;
+ unsigned int retry_count;
+ int snd_win;
+ int rcv_win;
+ struct c4iw_ep_stats stats;
+};
+
+static inline void print_addr(struct c4iw_ep_common *epc, const char *func,
+ const char *msg)
+{
+
+#define SINA(a) (&(((struct sockaddr_in *)(a))->sin_addr.s_addr))
+#define SINP(a) ntohs(((struct sockaddr_in *)(a))->sin_port)
+#define SIN6A(a) (&(((struct sockaddr_in6 *)(a))->sin6_addr))
+#define SIN6P(a) ntohs(((struct sockaddr_in6 *)(a))->sin6_port)
+
+ if (c4iw_debug) {
+ switch (epc->local_addr.ss_family) {
+ case AF_INET:
+ PDBG("%s %s %pI4:%u/%u <-> %pI4:%u/%u\n",
+ func, msg, SINA(&epc->local_addr),
+ SINP(&epc->local_addr),
+ SINP(&epc->mapped_local_addr),
+ SINA(&epc->remote_addr),
+ SINP(&epc->remote_addr),
+ SINP(&epc->mapped_remote_addr));
+ break;
+ case AF_INET6:
+ PDBG("%s %s %pI6:%u/%u <-> %pI6:%u/%u\n",
+ func, msg, SIN6A(&epc->local_addr),
+ SIN6P(&epc->local_addr),
+ SIN6P(&epc->mapped_local_addr),
+ SIN6A(&epc->remote_addr),
+ SIN6P(&epc->remote_addr),
+ SIN6P(&epc->mapped_remote_addr));
+ break;
+ default:
+ break;
+ }
+ }
+#undef SINA
+#undef SINP
+#undef SIN6A
+#undef SIN6P
+}
+
+static inline struct c4iw_ep *to_ep(struct iw_cm_id *cm_id)
+{
+ return cm_id->provider_data;
+}
+
+static inline struct c4iw_listen_ep *to_listen_ep(struct iw_cm_id *cm_id)
+{
+ return cm_id->provider_data;
+}
+
+static inline int compute_wscale(int win)
+{
+ int wscale = 0;
+
+ while (wscale < 14 && (65535<<wscale) < win)
+ wscale++;
+ return wscale;
+}
+
+static inline int ocqp_supported(const struct cxgb4_lld_info *infop)
+{
+#if defined(__i386__) || defined(__x86_64__) || defined(CONFIG_PPC64)
+ return infop->vr->ocq.size > 0;
+#else
+ return 0;
+#endif
+}
+
+u32 c4iw_id_alloc(struct c4iw_id_table *alloc);
+void c4iw_id_free(struct c4iw_id_table *alloc, u32 obj);
+int c4iw_id_table_alloc(struct c4iw_id_table *alloc, u32 start, u32 num,
+ u32 reserved, u32 flags);
+void c4iw_id_table_free(struct c4iw_id_table *alloc);
+
+typedef int (*c4iw_handler_func)(struct c4iw_dev *dev, struct sk_buff *skb);
+
+int c4iw_ep_redirect(void *ctx, struct dst_entry *old, struct dst_entry *new,
+ struct l2t_entry *l2t);
+void c4iw_put_qpid(struct c4iw_rdev *rdev, u32 qpid,
+ struct c4iw_dev_ucontext *uctx);
+u32 c4iw_get_resource(struct c4iw_id_table *id_table);
+void c4iw_put_resource(struct c4iw_id_table *id_table, u32 entry);
+int c4iw_init_resource(struct c4iw_rdev *rdev, u32 nr_tpt, u32 nr_pdid);
+int c4iw_init_ctrl_qp(struct c4iw_rdev *rdev);
+int c4iw_pblpool_create(struct c4iw_rdev *rdev);
+int c4iw_rqtpool_create(struct c4iw_rdev *rdev);
+int c4iw_ocqp_pool_create(struct c4iw_rdev *rdev);
+void c4iw_pblpool_destroy(struct c4iw_rdev *rdev);
+void c4iw_rqtpool_destroy(struct c4iw_rdev *rdev);
+void c4iw_ocqp_pool_destroy(struct c4iw_rdev *rdev);
+void c4iw_destroy_resource(struct c4iw_resource *rscp);
+int c4iw_destroy_ctrl_qp(struct c4iw_rdev *rdev);
+int c4iw_register_device(struct c4iw_dev *dev);
+void c4iw_unregister_device(struct c4iw_dev *dev);
+int __init c4iw_cm_init(void);
+void c4iw_cm_term(void);
+void c4iw_release_dev_ucontext(struct c4iw_rdev *rdev,
+ struct c4iw_dev_ucontext *uctx);
+void c4iw_init_dev_ucontext(struct c4iw_rdev *rdev,
+ struct c4iw_dev_ucontext *uctx);
+int c4iw_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
+int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr);
+int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr);
+int c4iw_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
+ struct ib_mw_bind *mw_bind);
+int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param);
+int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog);
+int c4iw_destroy_listen(struct iw_cm_id *cm_id);
+int c4iw_accept_cr(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param);
+int c4iw_reject_cr(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len);
+void c4iw_qp_add_ref(struct ib_qp *qp);
+void c4iw_qp_rem_ref(struct ib_qp *qp);
+void c4iw_free_fastreg_pbl(struct ib_fast_reg_page_list *page_list);
+struct ib_fast_reg_page_list *c4iw_alloc_fastreg_pbl(
+ struct ib_device *device,
+ int page_list_len);
+struct ib_mr *c4iw_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth);
+int c4iw_dealloc_mw(struct ib_mw *mw);
+struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);
+struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start,
+ u64 length, u64 virt, int acc,
+ struct ib_udata *udata);
+struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc);
+struct ib_mr *c4iw_register_phys_mem(struct ib_pd *pd,
+ struct ib_phys_buf *buffer_list,
+ int num_phys_buf,
+ int acc,
+ u64 *iova_start);
+int c4iw_reregister_phys_mem(struct ib_mr *mr,
+ int mr_rereg_mask,
+ struct ib_pd *pd,
+ struct ib_phys_buf *buffer_list,
+ int num_phys_buf,
+ int acc, u64 *iova_start);
+int c4iw_dereg_mr(struct ib_mr *ib_mr);
+int c4iw_destroy_cq(struct ib_cq *ib_cq);
+struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, int entries,
+ int vector,
+ struct ib_ucontext *ib_context,
+ struct ib_udata *udata);
+int c4iw_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata);
+int c4iw_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
+int c4iw_destroy_qp(struct ib_qp *ib_qp);
+struct ib_qp *c4iw_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *attrs,
+ struct ib_udata *udata);
+int c4iw_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata);
+int c4iw_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_qp_init_attr *init_attr);
+struct ib_qp *c4iw_get_qp(struct ib_device *dev, int qpn);
+u32 c4iw_rqtpool_alloc(struct c4iw_rdev *rdev, int size);
+void c4iw_rqtpool_free(struct c4iw_rdev *rdev, u32 addr, int size);
+u32 c4iw_pblpool_alloc(struct c4iw_rdev *rdev, int size);
+void c4iw_pblpool_free(struct c4iw_rdev *rdev, u32 addr, int size);
+u32 c4iw_ocqp_pool_alloc(struct c4iw_rdev *rdev, int size);
+void c4iw_ocqp_pool_free(struct c4iw_rdev *rdev, u32 addr, int size);
+int c4iw_ofld_send(struct c4iw_rdev *rdev, struct sk_buff *skb);
+void c4iw_flush_hw_cq(struct c4iw_cq *chp);
+void c4iw_count_rcqes(struct t4_cq *cq, struct t4_wq *wq, int *count);
+int c4iw_ep_disconnect(struct c4iw_ep *ep, int abrupt, gfp_t gfp);
+int c4iw_flush_rq(struct t4_wq *wq, struct t4_cq *cq, int count);
+int c4iw_flush_sq(struct c4iw_qp *qhp);
+int c4iw_ev_handler(struct c4iw_dev *rnicp, u32 qid);
+u16 c4iw_rqes_posted(struct c4iw_qp *qhp);
+int c4iw_post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe);
+u32 c4iw_get_cqid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx);
+void c4iw_put_cqid(struct c4iw_rdev *rdev, u32 qid,
+ struct c4iw_dev_ucontext *uctx);
+u32 c4iw_get_qpid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx);
+void c4iw_put_qpid(struct c4iw_rdev *rdev, u32 qid,
+ struct c4iw_dev_ucontext *uctx);
+void c4iw_ev_dispatch(struct c4iw_dev *dev, struct t4_cqe *err_cqe);
+
+extern struct cxgb4_client t4c_client;
+extern c4iw_handler_func c4iw_handlers[NUM_CPL_CMDS];
+extern void c4iw_log_wr_stats(struct t4_wq *wq, struct t4_cqe *cqe);
+extern int c4iw_wr_log;
+extern int db_fc_threshold;
+extern int db_coalescing_threshold;
+extern int use_dsgl;
+
+#endif
diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c
new file mode 100644
index 000000000..cff815b91
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/mem.c
@@ -0,0 +1,979 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <rdma/ib_umem.h>
+#include <linux/atomic.h>
+
+#include "iw_cxgb4.h"
+
+int use_dsgl = 0;
+module_param(use_dsgl, int, 0644);
+MODULE_PARM_DESC(use_dsgl, "Use DSGL for PBL/FastReg (default=0)");
+
+#define T4_ULPTX_MIN_IO 32
+#define C4IW_MAX_INLINE_SIZE 96
+#define T4_ULPTX_MAX_DMA 1024
+#define C4IW_INLINE_THRESHOLD 128
+
+static int inline_threshold = C4IW_INLINE_THRESHOLD;
+module_param(inline_threshold, int, 0644);
+MODULE_PARM_DESC(inline_threshold, "inline vs dsgl threshold (default=128)");
+
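+/* T4/T5 hardware cannot register a single memory region of 8GB or more. */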
+static int mr_exceeds_hw_limits(struct c4iw_dev *dev, u64 length)
+{
+ return (is_t4(dev->rdev.lldi.adapter_type) ||
+ is_t5(dev->rdev.lldi.adapter_type)) &&
+ length >= 8*1024*1024*1024ULL;
+}
+
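+/*
+ * Write one 32B-aligned chunk of adapter memory via a ULP_TX DSGL work
+ * request.  The data is referenced by DMA address; if @wait is set the WR
+ * asks for a completion and we block until the firmware replies.
+ */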
+static int _c4iw_write_mem_dma_aligned(struct c4iw_rdev *rdev, u32 addr,
+ u32 len, dma_addr_t data, int wait)
+{
+ struct sk_buff *skb;
+ struct ulp_mem_io *req;
+ struct ulptx_sgl *sgl;
+ u8 wr_len;
+ int ret = 0;
+ struct c4iw_wr_wait wr_wait;
+
+ addr &= 0x7FFFFFF;
+
+ if (wait)
+ c4iw_init_wr_wait(&wr_wait);
+ wr_len = roundup(sizeof(*req) + sizeof(*sgl), 16);
+
+ skb = alloc_skb(wr_len, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+ set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
+
+ req = (struct ulp_mem_io *)__skb_put(skb, wr_len);
+ memset(req, 0, wr_len);
+ INIT_ULPTX_WR(req, wr_len, 0, 0);
+ req->wr.wr_hi = cpu_to_be32(FW_WR_OP_V(FW_ULPTX_WR) |
+ (wait ? FW_WR_COMPL_F : 0));
+ req->wr.wr_lo = wait ? (__force __be64)(unsigned long) &wr_wait : 0L;
+ req->wr.wr_mid = cpu_to_be32(FW_WR_LEN16_V(DIV_ROUND_UP(wr_len, 16)));
+ req->cmd = cpu_to_be32(ULPTX_CMD_V(ULP_TX_MEM_WRITE));
+ req->cmd |= cpu_to_be32(T5_ULP_MEMIO_ORDER_V(1));
+ req->dlen = cpu_to_be32(ULP_MEMIO_DATA_LEN_V(len>>5));
+ req->len16 = cpu_to_be32(DIV_ROUND_UP(wr_len-sizeof(req->wr), 16));
+ req->lock_addr = cpu_to_be32(ULP_MEMIO_ADDR_V(addr));
+
+ sgl = (struct ulptx_sgl *)(req + 1);
+ sgl->cmd_nsge = cpu_to_be32(ULPTX_CMD_V(ULP_TX_SC_DSGL) |
+ ULPTX_NSGE_V(1));
+ sgl->len0 = cpu_to_be32(len);
+ sgl->addr0 = cpu_to_be64(data);
+
+ ret = c4iw_ofld_send(rdev, skb);
+ if (ret)
+ return ret;
+ if (wait)
+ ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, 0, __func__);
+ return ret;
+}
+
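+/*
+ * Write adapter memory with immediate (inline) data, splitting the buffer
+ * into work requests of at most C4IW_MAX_INLINE_SIZE bytes.  Only the last
+ * WR requests a completion; a NULL @data zeroes the target memory.
+ */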
+static int _c4iw_write_mem_inline(struct c4iw_rdev *rdev, u32 addr, u32 len,
+ void *data)
+{
+ struct sk_buff *skb;
+ struct ulp_mem_io *req;
+ struct ulptx_idata *sc;
+ u8 wr_len, *to_dp, *from_dp;
+ int copy_len, num_wqe, i, ret = 0;
+ struct c4iw_wr_wait wr_wait;
+ __be32 cmd = cpu_to_be32(ULPTX_CMD_V(ULP_TX_MEM_WRITE));
+
+ if (is_t4(rdev->lldi.adapter_type))
+ cmd |= cpu_to_be32(ULP_MEMIO_ORDER_F);
+ else
+ cmd |= cpu_to_be32(T5_ULP_MEMIO_IMM_F);
+
+ addr &= 0x7FFFFFF;
+ PDBG("%s addr 0x%x len %u\n", __func__, addr, len);
+ num_wqe = DIV_ROUND_UP(len, C4IW_MAX_INLINE_SIZE);
+ c4iw_init_wr_wait(&wr_wait);
+ for (i = 0; i < num_wqe; i++) {
+
+ copy_len = len > C4IW_MAX_INLINE_SIZE ? C4IW_MAX_INLINE_SIZE :
+ len;
+ wr_len = roundup(sizeof *req + sizeof *sc +
+ roundup(copy_len, T4_ULPTX_MIN_IO), 16);
+
+ skb = alloc_skb(wr_len, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+ set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
+
+ req = (struct ulp_mem_io *)__skb_put(skb, wr_len);
+ memset(req, 0, wr_len);
+ INIT_ULPTX_WR(req, wr_len, 0, 0);
+
+ if (i == (num_wqe-1)) {
+ req->wr.wr_hi = cpu_to_be32(FW_WR_OP_V(FW_ULPTX_WR) |
+ FW_WR_COMPL_F);
+ req->wr.wr_lo = (__force __be64)&wr_wait;
+ } else
+ req->wr.wr_hi = cpu_to_be32(FW_WR_OP_V(FW_ULPTX_WR));
+ req->wr.wr_mid = cpu_to_be32(
+ FW_WR_LEN16_V(DIV_ROUND_UP(wr_len, 16)));
+
+ req->cmd = cmd;
+ req->dlen = cpu_to_be32(ULP_MEMIO_DATA_LEN_V(
+ DIV_ROUND_UP(copy_len, T4_ULPTX_MIN_IO)));
+ req->len16 = cpu_to_be32(DIV_ROUND_UP(wr_len-sizeof(req->wr),
+ 16));
+ req->lock_addr = cpu_to_be32(ULP_MEMIO_ADDR_V(addr + i * 3));
+
+ sc = (struct ulptx_idata *)(req + 1);
+ sc->cmd_more = cpu_to_be32(ULPTX_CMD_V(ULP_TX_SC_IMM));
+ sc->len = cpu_to_be32(roundup(copy_len, T4_ULPTX_MIN_IO));
+
+ to_dp = (u8 *)(sc + 1);
+ from_dp = (u8 *)data + i * C4IW_MAX_INLINE_SIZE;
+ if (data)
+ memcpy(to_dp, from_dp, copy_len);
+ else
+ memset(to_dp, 0, copy_len);
+ if (copy_len % T4_ULPTX_MIN_IO)
+ memset(to_dp + copy_len, 0, T4_ULPTX_MIN_IO -
+ (copy_len % T4_ULPTX_MIN_IO));
+ ret = c4iw_ofld_send(rdev, skb);
+ if (ret)
+ return ret;
+ len -= C4IW_MAX_INLINE_SIZE;
+ }
+
+ ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, 0, __func__);
+ return ret;
+}
+
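+/*
+ * DMA-map @data and push it to the adapter in chunks of up to
+ * T4_ULPTX_MAX_DMA bytes; any tail at or below inline_threshold is written
+ * through the inline path instead.
+ */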
+static int _c4iw_write_mem_dma(struct c4iw_rdev *rdev, u32 addr, u32 len, void *data)
+{
+ u32 remain = len;
+ u32 dmalen;
+ int ret = 0;
+ dma_addr_t daddr;
+ dma_addr_t save;
+
+ daddr = dma_map_single(&rdev->lldi.pdev->dev, data, len, DMA_TO_DEVICE);
+ if (dma_mapping_error(&rdev->lldi.pdev->dev, daddr))
+ return -1;
+ save = daddr;
+
+ while (remain > inline_threshold) {
+ if (remain < T4_ULPTX_MAX_DMA) {
+ if (remain & ~T4_ULPTX_MIN_IO)
+ dmalen = remain & ~(T4_ULPTX_MIN_IO-1);
+ else
+ dmalen = remain;
+ } else
+ dmalen = T4_ULPTX_MAX_DMA;
+ remain -= dmalen;
+ ret = _c4iw_write_mem_dma_aligned(rdev, addr, dmalen, daddr,
+ !remain);
+ if (ret)
+ goto out;
+ addr += dmalen >> 5;
+ data += dmalen;
+ daddr += dmalen;
+ }
+ if (remain)
+ ret = _c4iw_write_mem_inline(rdev, addr, remain, data);
+out:
+ dma_unmap_single(&rdev->lldi.pdev->dev, save, len, DMA_TO_DEVICE);
+ return ret;
+}
+
+/*
+ * write len bytes of data into addr (32B aligned address)
+ * If data is NULL, clear len bytes of memory to zero.
+ */
+static int write_adapter_mem(struct c4iw_rdev *rdev, u32 addr, u32 len,
+ void *data)
+{
+ if (is_t5(rdev->lldi.adapter_type) && use_dsgl) {
+ if (len > inline_threshold) {
+ if (_c4iw_write_mem_dma(rdev, addr, len, data)) {
+ printk_ratelimited(KERN_WARNING
+ "%s: dma map"
+ " failure (non fatal)\n",
+ pci_name(rdev->lldi.pdev));
+ return _c4iw_write_mem_inline(rdev, addr, len,
+ data);
+ } else
+ return 0;
+ } else
+ return _c4iw_write_mem_inline(rdev, addr, len, data);
+ } else
+ return _c4iw_write_mem_inline(rdev, addr, len, data);
+}
+
+/*
+ * Build and write a TPT entry.
+ * IN: stag key, pdid, perm, bind_enabled, zbva, to, len, page_size,
+ * pbl_size and pbl_addr
+ * OUT: stag index
+ */
+static int write_tpt_entry(struct c4iw_rdev *rdev, u32 reset_tpt_entry,
+ u32 *stag, u8 stag_state, u32 pdid,
+ enum fw_ri_stag_type type, enum fw_ri_mem_perms perm,
+ int bind_enabled, u32 zbva, u64 to,
+ u64 len, u8 page_size, u32 pbl_size, u32 pbl_addr)
+{
+ int err;
+ struct fw_ri_tpte tpt;
+ u32 stag_idx;
+ static atomic_t key;
+
+ if (c4iw_fatal_error(rdev))
+ return -EIO;
+
+ stag_state = stag_state > 0;
+ stag_idx = (*stag) >> 8;
+
+ if ((!reset_tpt_entry) && (*stag == T4_STAG_UNSET)) {
+ stag_idx = c4iw_get_resource(&rdev->resource.tpt_table);
+ if (!stag_idx) {
+ mutex_lock(&rdev->stats.lock);
+ rdev->stats.stag.fail++;
+ mutex_unlock(&rdev->stats.lock);
+ return -ENOMEM;
+ }
+ mutex_lock(&rdev->stats.lock);
+ rdev->stats.stag.cur += 32;
+ if (rdev->stats.stag.cur > rdev->stats.stag.max)
+ rdev->stats.stag.max = rdev->stats.stag.cur;
+ mutex_unlock(&rdev->stats.lock);
+ *stag = (stag_idx << 8) | (atomic_inc_return(&key) & 0xff);
+ }
+ PDBG("%s stag_state 0x%0x type 0x%0x pdid 0x%0x, stag_idx 0x%x\n",
+ __func__, stag_state, type, pdid, stag_idx);
+
+ /* write TPT entry */
+ if (reset_tpt_entry)
+ memset(&tpt, 0, sizeof(tpt));
+ else {
+ tpt.valid_to_pdid = cpu_to_be32(FW_RI_TPTE_VALID_F |
+ FW_RI_TPTE_STAGKEY_V((*stag & FW_RI_TPTE_STAGKEY_M)) |
+ FW_RI_TPTE_STAGSTATE_V(stag_state) |
+ FW_RI_TPTE_STAGTYPE_V(type) | FW_RI_TPTE_PDID_V(pdid));
+ tpt.locread_to_qpid = cpu_to_be32(FW_RI_TPTE_PERM_V(perm) |
+ (bind_enabled ? FW_RI_TPTE_MWBINDEN_F : 0) |
+ FW_RI_TPTE_ADDRTYPE_V((zbva ? FW_RI_ZERO_BASED_TO :
+ FW_RI_VA_BASED_TO))|
+ FW_RI_TPTE_PS_V(page_size));
+ tpt.nosnoop_pbladdr = !pbl_size ? 0 : cpu_to_be32(
+ FW_RI_TPTE_PBLADDR_V(PBL_OFF(rdev, pbl_addr)>>3));
+ tpt.len_lo = cpu_to_be32((u32)(len & 0xffffffffUL));
+ tpt.va_hi = cpu_to_be32((u32)(to >> 32));
+ tpt.va_lo_fbo = cpu_to_be32((u32)(to & 0xffffffffUL));
+ tpt.dca_mwbcnt_pstag = cpu_to_be32(0);
+ tpt.len_hi = cpu_to_be32((u32)(len >> 32));
+ }
+ err = write_adapter_mem(rdev, stag_idx +
+ (rdev->lldi.vr->stag.start >> 5),
+ sizeof(tpt), &tpt);
+
+ if (reset_tpt_entry) {
+ c4iw_put_resource(&rdev->resource.tpt_table, stag_idx);
+ mutex_lock(&rdev->stats.lock);
+ rdev->stats.stag.cur -= 32;
+ mutex_unlock(&rdev->stats.lock);
+ }
+ return err;
+}
+
+static int write_pbl(struct c4iw_rdev *rdev, __be64 *pbl,
+ u32 pbl_addr, u32 pbl_size)
+{
+ int err;
+
+ PDBG("%s pbl_addr 0x%x, pbl_base 0x%x, pbl_size %d\n",
+ __func__, pbl_addr, rdev->lldi.vr->pbl.start,
+ pbl_size);
+
+ err = write_adapter_mem(rdev, pbl_addr >> 5, pbl_size << 3, pbl);
+ return err;
+}
+
+static int dereg_mem(struct c4iw_rdev *rdev, u32 stag, u32 pbl_size,
+ u32 pbl_addr)
+{
+ return write_tpt_entry(rdev, 1, &stag, 0, 0, 0, 0, 0, 0, 0UL, 0, 0,
+ pbl_size, pbl_addr);
+}
+
+static int allocate_window(struct c4iw_rdev *rdev, u32 *stag, u32 pdid)
+{
+ *stag = T4_STAG_UNSET;
+ return write_tpt_entry(rdev, 0, stag, 0, pdid, FW_RI_STAG_MW, 0, 0, 0,
+ 0UL, 0, 0, 0, 0);
+}
+
+static int deallocate_window(struct c4iw_rdev *rdev, u32 stag)
+{
+ return write_tpt_entry(rdev, 1, &stag, 0, 0, 0, 0, 0, 0, 0UL, 0, 0, 0,
+ 0);
+}
+
+static int allocate_stag(struct c4iw_rdev *rdev, u32 *stag, u32 pdid,
+ u32 pbl_size, u32 pbl_addr)
+{
+ *stag = T4_STAG_UNSET;
+ return write_tpt_entry(rdev, 0, stag, 0, pdid, FW_RI_STAG_NSMR, 0, 0, 0,
+ 0UL, 0, 0, pbl_size, pbl_addr);
+}
+
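+/* Mark the MR valid, record the new STAG and add it to the device's mmid table. */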
+static int finish_mem_reg(struct c4iw_mr *mhp, u32 stag)
+{
+ u32 mmid;
+
+ mhp->attr.state = 1;
+ mhp->attr.stag = stag;
+ mmid = stag >> 8;
+ mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
+ PDBG("%s mmid 0x%x mhp %p\n", __func__, mmid, mhp);
+ return insert_handle(mhp->rhp, &mhp->rhp->mmidr, mhp, mmid);
+}
+
+static int register_mem(struct c4iw_dev *rhp, struct c4iw_pd *php,
+ struct c4iw_mr *mhp, int shift)
+{
+ u32 stag = T4_STAG_UNSET;
+ int ret;
+
+ ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, mhp->attr.pdid,
+ FW_RI_STAG_NSMR, mhp->attr.len ?
+ mhp->attr.perms : 0,
+ mhp->attr.mw_bind_enable, mhp->attr.zbva,
+ mhp->attr.va_fbo, mhp->attr.len ?
+ mhp->attr.len : -1, shift - 12,
+ mhp->attr.pbl_size, mhp->attr.pbl_addr);
+ if (ret)
+ return ret;
+
+ ret = finish_mem_reg(mhp, stag);
+ if (ret)
+ dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
+ mhp->attr.pbl_addr);
+ return ret;
+}
+
+static int reregister_mem(struct c4iw_dev *rhp, struct c4iw_pd *php,
+ struct c4iw_mr *mhp, int shift, int npages)
+{
+ u32 stag;
+ int ret;
+
+ if (npages > mhp->attr.pbl_size)
+ return -ENOMEM;
+
+ stag = mhp->attr.stag;
+ ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, mhp->attr.pdid,
+ FW_RI_STAG_NSMR, mhp->attr.perms,
+ mhp->attr.mw_bind_enable, mhp->attr.zbva,
+ mhp->attr.va_fbo, mhp->attr.len, shift - 12,
+ mhp->attr.pbl_size, mhp->attr.pbl_addr);
+ if (ret)
+ return ret;
+
+ ret = finish_mem_reg(mhp, stag);
+ if (ret)
+ dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
+ mhp->attr.pbl_addr);
+
+ return ret;
+}
+
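+/* Reserve room for @npages 8-byte PBL entries in the adapter's PBL pool. */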
+static int alloc_pbl(struct c4iw_mr *mhp, int npages)
+{
+ mhp->attr.pbl_addr = c4iw_pblpool_alloc(&mhp->rhp->rdev,
+ npages << 3);
+
+ if (!mhp->attr.pbl_addr)
+ return -ENOMEM;
+
+ mhp->attr.pbl_size = npages;
+
+ return 0;
+}
+
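+/*
+ * Validate buffer alignment, derive the common page shift, and build the
+ * big-endian page list used to program the PBL.
+ */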
+static int build_phys_page_list(struct ib_phys_buf *buffer_list,
+ int num_phys_buf, u64 *iova_start,
+ u64 *total_size, int *npages,
+ int *shift, __be64 **page_list)
+{
+ u64 mask;
+ int i, j, n;
+
+ mask = 0;
+ *total_size = 0;
+ for (i = 0; i < num_phys_buf; ++i) {
+ if (i != 0 && buffer_list[i].addr & ~PAGE_MASK)
+ return -EINVAL;
+ if (i != 0 && i != num_phys_buf - 1 &&
+ (buffer_list[i].size & ~PAGE_MASK))
+ return -EINVAL;
+ *total_size += buffer_list[i].size;
+ if (i > 0)
+ mask |= buffer_list[i].addr;
+ else
+ mask |= buffer_list[i].addr & PAGE_MASK;
+ if (i != num_phys_buf - 1)
+ mask |= buffer_list[i].addr + buffer_list[i].size;
+ else
+ mask |= (buffer_list[i].addr + buffer_list[i].size +
+ PAGE_SIZE - 1) & PAGE_MASK;
+ }
+
+ if (*total_size > 0xFFFFFFFFULL)
+ return -ENOMEM;
+
+ /* Find largest page shift we can use to cover buffers */
+ for (*shift = PAGE_SHIFT; *shift < 27; ++(*shift))
+ if ((1ULL << *shift) & mask)
+ break;
+
+ buffer_list[0].size += buffer_list[0].addr & ((1ULL << *shift) - 1);
+ buffer_list[0].addr &= ~0ull << *shift;
+
+ *npages = 0;
+ for (i = 0; i < num_phys_buf; ++i)
+ *npages += (buffer_list[i].size +
+ (1ULL << *shift) - 1) >> *shift;
+
+ if (!*npages)
+ return -EINVAL;
+
+ *page_list = kmalloc(sizeof(u64) * *npages, GFP_KERNEL);
+ if (!*page_list)
+ return -ENOMEM;
+
+ n = 0;
+ for (i = 0; i < num_phys_buf; ++i)
+ for (j = 0;
+ j < (buffer_list[i].size + (1ULL << *shift) - 1) >> *shift;
+ ++j)
+ (*page_list)[n++] = cpu_to_be64(buffer_list[i].addr +
+ ((u64) j << *shift));
+
+ PDBG("%s va 0x%llx mask 0x%llx shift %d len %lld pbl_size %d\n",
+ __func__, (unsigned long long)*iova_start,
+ (unsigned long long)mask, *shift, (unsigned long long)*total_size,
+ *npages);
+
+ return 0;
+
+}
+
+int c4iw_reregister_phys_mem(struct ib_mr *mr, int mr_rereg_mask,
+ struct ib_pd *pd, struct ib_phys_buf *buffer_list,
+ int num_phys_buf, int acc, u64 *iova_start)
+{
+
+ struct c4iw_mr mh, *mhp;
+ struct c4iw_pd *php;
+ struct c4iw_dev *rhp;
+ __be64 *page_list = NULL;
+ int shift = 0;
+ u64 total_size;
+ int npages;
+ int ret;
+
+ PDBG("%s ib_mr %p ib_pd %p\n", __func__, mr, pd);
+
+ /* There can be no memory windows */
+ if (atomic_read(&mr->usecnt))
+ return -EINVAL;
+
+ mhp = to_c4iw_mr(mr);
+ rhp = mhp->rhp;
+ php = to_c4iw_pd(mr->pd);
+
+ /* make sure we are on the same adapter */
+ if (rhp != php->rhp)
+ return -EINVAL;
+
+ memcpy(&mh, mhp, sizeof *mhp);
+
+ if (mr_rereg_mask & IB_MR_REREG_PD)
+ php = to_c4iw_pd(pd);
+ if (mr_rereg_mask & IB_MR_REREG_ACCESS) {
+ mh.attr.perms = c4iw_ib_to_tpt_access(acc);
+ mh.attr.mw_bind_enable = (acc & IB_ACCESS_MW_BIND) ==
+ IB_ACCESS_MW_BIND;
+ }
+ if (mr_rereg_mask & IB_MR_REREG_TRANS) {
+ ret = build_phys_page_list(buffer_list, num_phys_buf,
+ iova_start,
+ &total_size, &npages,
+ &shift, &page_list);
+ if (ret)
+ return ret;
+ }
+
+ if (mr_exceeds_hw_limits(rhp, total_size)) {
+ kfree(page_list);
+ return -EINVAL;
+ }
+
+ ret = reregister_mem(rhp, php, &mh, shift, npages);
+ kfree(page_list);
+ if (ret)
+ return ret;
+ if (mr_rereg_mask & IB_MR_REREG_PD)
+ mhp->attr.pdid = php->pdid;
+ if (mr_rereg_mask & IB_MR_REREG_ACCESS)
+ mhp->attr.perms = c4iw_ib_to_tpt_access(acc);
+ if (mr_rereg_mask & IB_MR_REREG_TRANS) {
+ mhp->attr.zbva = 0;
+ mhp->attr.va_fbo = *iova_start;
+ mhp->attr.page_size = shift - 12;
+ mhp->attr.len = (u32) total_size;
+ mhp->attr.pbl_size = npages;
+ }
+
+ return 0;
+}
+
+struct ib_mr *c4iw_register_phys_mem(struct ib_pd *pd,
+ struct ib_phys_buf *buffer_list,
+ int num_phys_buf, int acc, u64 *iova_start)
+{
+ __be64 *page_list;
+ int shift;
+ u64 total_size;
+ int npages;
+ struct c4iw_dev *rhp;
+ struct c4iw_pd *php;
+ struct c4iw_mr *mhp;
+ int ret;
+
+ PDBG("%s ib_pd %p\n", __func__, pd);
+ php = to_c4iw_pd(pd);
+ rhp = php->rhp;
+
+ mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
+ if (!mhp)
+ return ERR_PTR(-ENOMEM);
+
+ mhp->rhp = rhp;
+
+ /* First check that we have enough alignment */
+ if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ if (num_phys_buf > 1 &&
+ ((buffer_list[0].addr + buffer_list[0].size) & ~PAGE_MASK)) {
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = build_phys_page_list(buffer_list, num_phys_buf, iova_start,
+ &total_size, &npages, &shift,
+ &page_list);
+ if (ret)
+ goto err;
+
+ if (mr_exceeds_hw_limits(rhp, total_size)) {
+ kfree(page_list);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ ret = alloc_pbl(mhp, npages);
+ if (ret) {
+ kfree(page_list);
+ goto err;
+ }
+
+ ret = write_pbl(&mhp->rhp->rdev, page_list, mhp->attr.pbl_addr,
+ npages);
+ kfree(page_list);
+ if (ret)
+ goto err_pbl;
+
+ mhp->attr.pdid = php->pdid;
+ mhp->attr.zbva = 0;
+
+ mhp->attr.perms = c4iw_ib_to_tpt_access(acc);
+ mhp->attr.va_fbo = *iova_start;
+ mhp->attr.page_size = shift - 12;
+
+ mhp->attr.len = (u32) total_size;
+ mhp->attr.pbl_size = npages;
+ ret = register_mem(rhp, php, mhp, shift);
+ if (ret)
+ goto err_pbl;
+
+ return &mhp->ibmr;
+
+err_pbl:
+ c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr,
+ mhp->attr.pbl_size << 3);
+
+err:
+ kfree(mhp);
+ return ERR_PTR(ret);
+
+}
+
+struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc)
+{
+ struct c4iw_dev *rhp;
+ struct c4iw_pd *php;
+ struct c4iw_mr *mhp;
+ int ret;
+ u32 stag = T4_STAG_UNSET;
+
+ PDBG("%s ib_pd %p\n", __func__, pd);
+ php = to_c4iw_pd(pd);
+ rhp = php->rhp;
+
+ mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
+ if (!mhp)
+ return ERR_PTR(-ENOMEM);
+
+ mhp->rhp = rhp;
+ mhp->attr.pdid = php->pdid;
+ mhp->attr.perms = c4iw_ib_to_tpt_access(acc);
+ mhp->attr.mw_bind_enable = (acc&IB_ACCESS_MW_BIND) == IB_ACCESS_MW_BIND;
+ mhp->attr.zbva = 0;
+ mhp->attr.va_fbo = 0;
+ mhp->attr.page_size = 0;
+ mhp->attr.len = ~0ULL;
+ mhp->attr.pbl_size = 0;
+
+ ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, php->pdid,
+ FW_RI_STAG_NSMR, mhp->attr.perms,
+ mhp->attr.mw_bind_enable, 0, 0, ~0ULL, 0, 0, 0);
+ if (ret)
+ goto err1;
+
+ ret = finish_mem_reg(mhp, stag);
+ if (ret)
+ goto err2;
+ return &mhp->ibmr;
+err2:
+ dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
+ mhp->attr.pbl_addr);
+err1:
+ kfree(mhp);
+ return ERR_PTR(ret);
+}
+
+struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt, int acc, struct ib_udata *udata)
+{
+ __be64 *pages;
+ int shift, n, len;
+ int i, k, entry;
+ int err = 0;
+ struct scatterlist *sg;
+ struct c4iw_dev *rhp;
+ struct c4iw_pd *php;
+ struct c4iw_mr *mhp;
+
+ PDBG("%s ib_pd %p\n", __func__, pd);
+
+ if (length == ~0ULL)
+ return ERR_PTR(-EINVAL);
+
+ if ((length + start) < start)
+ return ERR_PTR(-EINVAL);
+
+ php = to_c4iw_pd(pd);
+ rhp = php->rhp;
+
+ if (mr_exceeds_hw_limits(rhp, length))
+ return ERR_PTR(-EINVAL);
+
+ mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
+ if (!mhp)
+ return ERR_PTR(-ENOMEM);
+
+ mhp->rhp = rhp;
+
+ mhp->umem = ib_umem_get(pd->uobject->context, start, length, acc, 0);
+ if (IS_ERR(mhp->umem)) {
+ err = PTR_ERR(mhp->umem);
+ kfree(mhp);
+ return ERR_PTR(err);
+ }
+
+ shift = ffs(mhp->umem->page_size) - 1;
+
+ n = mhp->umem->nmap;
+ err = alloc_pbl(mhp, n);
+ if (err)
+ goto err;
+
+ pages = (__be64 *) __get_free_page(GFP_KERNEL);
+ if (!pages) {
+ err = -ENOMEM;
+ goto err_pbl;
+ }
+
+ i = n = 0;
+
+ for_each_sg(mhp->umem->sg_head.sgl, sg, mhp->umem->nmap, entry) {
+ len = sg_dma_len(sg) >> shift;
+ for (k = 0; k < len; ++k) {
+ pages[i++] = cpu_to_be64(sg_dma_address(sg) +
+ mhp->umem->page_size * k);
+ if (i == PAGE_SIZE / sizeof *pages) {
+ err = write_pbl(&mhp->rhp->rdev,
+ pages,
+ mhp->attr.pbl_addr + (n << 3), i);
+ if (err)
+ goto pbl_done;
+ n += i;
+ i = 0;
+ }
+ }
+ }
+
+ if (i)
+ err = write_pbl(&mhp->rhp->rdev, pages,
+ mhp->attr.pbl_addr + (n << 3), i);
+
+pbl_done:
+ free_page((unsigned long) pages);
+ if (err)
+ goto err_pbl;
+
+ mhp->attr.pdid = php->pdid;
+ mhp->attr.zbva = 0;
+ mhp->attr.perms = c4iw_ib_to_tpt_access(acc);
+ mhp->attr.va_fbo = virt;
+ mhp->attr.page_size = shift - 12;
+ mhp->attr.len = length;
+
+ err = register_mem(rhp, php, mhp, shift);
+ if (err)
+ goto err_pbl;
+
+ return &mhp->ibmr;
+
+err_pbl:
+ c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr,
+ mhp->attr.pbl_size << 3);
+
+err:
+ ib_umem_release(mhp->umem);
+ kfree(mhp);
+ return ERR_PTR(err);
+}
+
+struct ib_mw *c4iw_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
+{
+ struct c4iw_dev *rhp;
+ struct c4iw_pd *php;
+ struct c4iw_mw *mhp;
+ u32 mmid;
+ u32 stag = 0;
+ int ret;
+
+ if (type != IB_MW_TYPE_1)
+ return ERR_PTR(-EINVAL);
+
+ php = to_c4iw_pd(pd);
+ rhp = php->rhp;
+ mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
+ if (!mhp)
+ return ERR_PTR(-ENOMEM);
+ ret = allocate_window(&rhp->rdev, &stag, php->pdid);
+ if (ret) {
+ kfree(mhp);
+ return ERR_PTR(ret);
+ }
+ mhp->rhp = rhp;
+ mhp->attr.pdid = php->pdid;
+ mhp->attr.type = FW_RI_STAG_MW;
+ mhp->attr.stag = stag;
+ mmid = (stag) >> 8;
+ mhp->ibmw.rkey = stag;
+ if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) {
+ deallocate_window(&rhp->rdev, mhp->attr.stag);
+ kfree(mhp);
+ return ERR_PTR(-ENOMEM);
+ }
+ PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag);
+ return &(mhp->ibmw);
+}
+
+int c4iw_dealloc_mw(struct ib_mw *mw)
+{
+ struct c4iw_dev *rhp;
+ struct c4iw_mw *mhp;
+ u32 mmid;
+
+ mhp = to_c4iw_mw(mw);
+ rhp = mhp->rhp;
+ mmid = (mw->rkey) >> 8;
+ remove_handle(rhp, &rhp->mmidr, mmid);
+ deallocate_window(&rhp->rdev, mhp->attr.stag);
+ kfree(mhp);
+ PDBG("%s ib_mw %p mmid 0x%x ptr %p\n", __func__, mw, mmid, mhp);
+ return 0;
+}
+
+struct ib_mr *c4iw_alloc_fast_reg_mr(struct ib_pd *pd, int pbl_depth)
+{
+ struct c4iw_dev *rhp;
+ struct c4iw_pd *php;
+ struct c4iw_mr *mhp;
+ u32 mmid;
+ u32 stag = 0;
+ int ret = 0;
+
+ php = to_c4iw_pd(pd);
+ rhp = php->rhp;
+ mhp = kzalloc(sizeof(*mhp), GFP_KERNEL);
+ if (!mhp) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ mhp->rhp = rhp;
+ ret = alloc_pbl(mhp, pbl_depth);
+ if (ret)
+ goto err1;
+ mhp->attr.pbl_size = pbl_depth;
+ ret = allocate_stag(&rhp->rdev, &stag, php->pdid,
+ mhp->attr.pbl_size, mhp->attr.pbl_addr);
+ if (ret)
+ goto err2;
+ mhp->attr.pdid = php->pdid;
+ mhp->attr.type = FW_RI_STAG_NSMR;
+ mhp->attr.stag = stag;
+ mhp->attr.state = 1;
+ mmid = (stag) >> 8;
+ mhp->ibmr.rkey = mhp->ibmr.lkey = stag;
+ if (insert_handle(rhp, &rhp->mmidr, mhp, mmid)) {
+ ret = -ENOMEM;
+ goto err3;
+ }
+
+ PDBG("%s mmid 0x%x mhp %p stag 0x%x\n", __func__, mmid, mhp, stag);
+ return &(mhp->ibmr);
+err3:
+ dereg_mem(&rhp->rdev, stag, mhp->attr.pbl_size,
+ mhp->attr.pbl_addr);
+err2:
+ c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr,
+ mhp->attr.pbl_size << 3);
+err1:
+ kfree(mhp);
+err:
+ return ERR_PTR(ret);
+}
+
+struct ib_fast_reg_page_list *c4iw_alloc_fastreg_pbl(struct ib_device *device,
+ int page_list_len)
+{
+ struct c4iw_fr_page_list *c4pl;
+ struct c4iw_dev *dev = to_c4iw_dev(device);
+ dma_addr_t dma_addr;
+ int pll_len = roundup(page_list_len * sizeof(u64), 32);
+
+ c4pl = kmalloc(sizeof(*c4pl), GFP_KERNEL);
+ if (!c4pl)
+ return ERR_PTR(-ENOMEM);
+
+ c4pl->ibpl.page_list = dma_alloc_coherent(&dev->rdev.lldi.pdev->dev,
+ pll_len, &dma_addr,
+ GFP_KERNEL);
+ if (!c4pl->ibpl.page_list) {
+ kfree(c4pl);
+ return ERR_PTR(-ENOMEM);
+ }
+ dma_unmap_addr_set(c4pl, mapping, dma_addr);
+ c4pl->dma_addr = dma_addr;
+ c4pl->dev = dev;
+ c4pl->pll_len = pll_len;
+
+ PDBG("%s c4pl %p pll_len %u page_list %p dma_addr %pad\n",
+ __func__, c4pl, c4pl->pll_len, c4pl->ibpl.page_list,
+ &c4pl->dma_addr);
+
+ return &c4pl->ibpl;
+}
+
+void c4iw_free_fastreg_pbl(struct ib_fast_reg_page_list *ibpl)
+{
+ struct c4iw_fr_page_list *c4pl = to_c4iw_fr_page_list(ibpl);
+
+ PDBG("%s c4pl %p pll_len %u page_list %p dma_addr %pad\n",
+ __func__, c4pl, c4pl->pll_len, c4pl->ibpl.page_list,
+ &c4pl->dma_addr);
+
+ dma_free_coherent(&c4pl->dev->rdev.lldi.pdev->dev,
+ c4pl->pll_len,
+ c4pl->ibpl.page_list, dma_unmap_addr(c4pl, mapping));
+ kfree(c4pl);
+}
+
+int c4iw_dereg_mr(struct ib_mr *ib_mr)
+{
+ struct c4iw_dev *rhp;
+ struct c4iw_mr *mhp;
+ u32 mmid;
+
+ PDBG("%s ib_mr %p\n", __func__, ib_mr);
+ /* There can be no memory windows */
+ if (atomic_read(&ib_mr->usecnt))
+ return -EINVAL;
+
+ mhp = to_c4iw_mr(ib_mr);
+ rhp = mhp->rhp;
+ mmid = mhp->attr.stag >> 8;
+ remove_handle(rhp, &rhp->mmidr, mmid);
+ dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size,
+ mhp->attr.pbl_addr);
+ if (mhp->attr.pbl_size)
+ c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr,
+ mhp->attr.pbl_size << 3);
+ if (mhp->kva)
+ kfree((void *) (unsigned long) mhp->kva);
+ if (mhp->umem)
+ ib_umem_release(mhp->umem);
+ PDBG("%s mmid 0x%x ptr %p\n", __func__, mmid, mhp);
+ kfree(mhp);
+ return 0;
+}
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
new file mode 100644
index 000000000..66bd6a2ad
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -0,0 +1,588 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/device.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/delay.h>
+#include <linux/errno.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/ethtool.h>
+#include <linux/rtnetlink.h>
+#include <linux/inetdevice.h>
+#include <linux/io.h>
+
+#include <asm/irq.h>
+#include <asm/byteorder.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "iw_cxgb4.h"
+
+static int fastreg_support = 1;
+module_param(fastreg_support, int, 0644);
+MODULE_PARM_DESC(fastreg_support, "Advertise fastreg support (default=1)");
+
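+/*
+ * iWARP has no address handles, multicast or MAD support, so these verbs
+ * are plain -ENOSYS stubs.
+ */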
+static struct ib_ah *c4iw_ah_create(struct ib_pd *pd,
+ struct ib_ah_attr *ah_attr)
+{
+ return ERR_PTR(-ENOSYS);
+}
+
+static int c4iw_ah_destroy(struct ib_ah *ah)
+{
+ return -ENOSYS;
+}
+
+static int c4iw_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ return -ENOSYS;
+}
+
+static int c4iw_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ return -ENOSYS;
+}
+
+static int c4iw_process_mad(struct ib_device *ibdev, int mad_flags,
+ u8 port_num, struct ib_wc *in_wc,
+ struct ib_grh *in_grh, struct ib_mad *in_mad,
+ struct ib_mad *out_mad)
+{
+ return -ENOSYS;
+}
+
+static int c4iw_dealloc_ucontext(struct ib_ucontext *context)
+{
+ struct c4iw_dev *rhp = to_c4iw_dev(context->device);
+ struct c4iw_ucontext *ucontext = to_c4iw_ucontext(context);
+ struct c4iw_mm_entry *mm, *tmp;
+
+ PDBG("%s context %p\n", __func__, context);
+ list_for_each_entry_safe(mm, tmp, &ucontext->mmaps, entry)
+ kfree(mm);
+ c4iw_release_dev_ucontext(&rhp->rdev, &ucontext->uctx);
+ kfree(ucontext);
+ return 0;
+}
+
+static struct ib_ucontext *c4iw_alloc_ucontext(struct ib_device *ibdev,
+ struct ib_udata *udata)
+{
+ struct c4iw_ucontext *context;
+ struct c4iw_dev *rhp = to_c4iw_dev(ibdev);
+ static int warned;
+ struct c4iw_alloc_ucontext_resp uresp;
+ int ret = 0;
+ struct c4iw_mm_entry *mm = NULL;
+
+ PDBG("%s ibdev %p\n", __func__, ibdev);
+ context = kzalloc(sizeof(*context), GFP_KERNEL);
+ if (!context) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ c4iw_init_dev_ucontext(&rhp->rdev, &context->uctx);
+ INIT_LIST_HEAD(&context->mmaps);
+ spin_lock_init(&context->mmap_lock);
+
+ if (udata->outlen < sizeof(uresp) - sizeof(uresp.reserved)) {
+ if (!warned++)
+ pr_err(MOD "Warning - downlevel libcxgb4 (non-fatal), device status page disabled.");
+ rhp->rdev.flags |= T4_STATUS_PAGE_DISABLED;
+ } else {
+ mm = kmalloc(sizeof(*mm), GFP_KERNEL);
+ if (!mm) {
+ ret = -ENOMEM;
+ goto err_free;
+ }
+
+ uresp.status_page_size = PAGE_SIZE;
+
+ spin_lock(&context->mmap_lock);
+ uresp.status_page_key = context->key;
+ context->key += PAGE_SIZE;
+ spin_unlock(&context->mmap_lock);
+
+ ret = ib_copy_to_udata(udata, &uresp,
+ sizeof(uresp) - sizeof(uresp.reserved));
+ if (ret)
+ goto err_mm;
+
+ mm->key = uresp.status_page_key;
+ mm->addr = virt_to_phys(rhp->rdev.status_page);
+ mm->len = PAGE_SIZE;
+ insert_mmap(context, mm);
+ }
+ return &context->ibucontext;
+err_mm:
+ kfree(mm);
+err_free:
+ kfree(context);
+err:
+ return ERR_PTR(ret);
+}
+
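+/*
+ * mmap() handler: the page offset carries a key previously handed to user
+ * space.  Depending on the backing address the key maps the MA_SYNC
+ * register (BAR0), user doorbell/OCQP space (BAR2), or contiguous WQ/CQ
+ * DMA memory.
+ */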
+static int c4iw_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+ int len = vma->vm_end - vma->vm_start;
+ u32 key = vma->vm_pgoff << PAGE_SHIFT;
+ struct c4iw_rdev *rdev;
+ int ret = 0;
+ struct c4iw_mm_entry *mm;
+ struct c4iw_ucontext *ucontext;
+ u64 addr;
+
+ PDBG("%s pgoff 0x%lx key 0x%x len %d\n", __func__, vma->vm_pgoff,
+ key, len);
+
+ if (vma->vm_start & (PAGE_SIZE-1))
+ return -EINVAL;
+
+ rdev = &(to_c4iw_dev(context->device)->rdev);
+ ucontext = to_c4iw_ucontext(context);
+
+ mm = remove_mmap(ucontext, key, len);
+ if (!mm)
+ return -EINVAL;
+ addr = mm->addr;
+ kfree(mm);
+
+ if ((addr >= pci_resource_start(rdev->lldi.pdev, 0)) &&
+ (addr < (pci_resource_start(rdev->lldi.pdev, 0) +
+ pci_resource_len(rdev->lldi.pdev, 0)))) {
+
+ /*
+ * MA_SYNC register...
+ */
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+ ret = io_remap_pfn_range(vma, vma->vm_start,
+ addr >> PAGE_SHIFT,
+ len, vma->vm_page_prot);
+ } else if ((addr >= pci_resource_start(rdev->lldi.pdev, 2)) &&
+ (addr < (pci_resource_start(rdev->lldi.pdev, 2) +
+ pci_resource_len(rdev->lldi.pdev, 2)))) {
+
+ /*
+ * Map user DB or OCQP memory...
+ */
+ if (addr >= rdev->oc_mw_pa)
+ vma->vm_page_prot = t4_pgprot_wc(vma->vm_page_prot);
+ else {
+ if (is_t5(rdev->lldi.adapter_type))
+ vma->vm_page_prot =
+ t4_pgprot_wc(vma->vm_page_prot);
+ else
+ vma->vm_page_prot =
+ pgprot_noncached(vma->vm_page_prot);
+ }
+ ret = io_remap_pfn_range(vma, vma->vm_start,
+ addr >> PAGE_SHIFT,
+ len, vma->vm_page_prot);
+ } else {
+
+ /*
+ * Map WQ or CQ contig dma memory...
+ */
+ ret = remap_pfn_range(vma, vma->vm_start,
+ addr >> PAGE_SHIFT,
+ len, vma->vm_page_prot);
+ }
+
+ return ret;
+}
+
+static int c4iw_deallocate_pd(struct ib_pd *pd)
+{
+ struct c4iw_dev *rhp;
+ struct c4iw_pd *php;
+
+ php = to_c4iw_pd(pd);
+ rhp = php->rhp;
+ PDBG("%s ibpd %p pdid 0x%x\n", __func__, pd, php->pdid);
+ c4iw_put_resource(&rhp->rdev.resource.pdid_table, php->pdid);
+ mutex_lock(&rhp->rdev.stats.lock);
+ rhp->rdev.stats.pd.cur--;
+ mutex_unlock(&rhp->rdev.stats.lock);
+ kfree(php);
+ return 0;
+}
+
+static struct ib_pd *c4iw_allocate_pd(struct ib_device *ibdev,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct c4iw_pd *php;
+ u32 pdid;
+ struct c4iw_dev *rhp;
+
+ PDBG("%s ibdev %p\n", __func__, ibdev);
+ rhp = (struct c4iw_dev *) ibdev;
+ pdid = c4iw_get_resource(&rhp->rdev.resource.pdid_table);
+ if (!pdid)
+ return ERR_PTR(-EINVAL);
+ php = kzalloc(sizeof(*php), GFP_KERNEL);
+ if (!php) {
+ c4iw_put_resource(&rhp->rdev.resource.pdid_table, pdid);
+ return ERR_PTR(-ENOMEM);
+ }
+ php->pdid = pdid;
+ php->rhp = rhp;
+ if (context) {
+ if (ib_copy_to_udata(udata, &php->pdid, sizeof(u32))) {
+ c4iw_deallocate_pd(&php->ibpd);
+ return ERR_PTR(-EFAULT);
+ }
+ }
+ mutex_lock(&rhp->rdev.stats.lock);
+ rhp->rdev.stats.pd.cur++;
+ if (rhp->rdev.stats.pd.cur > rhp->rdev.stats.pd.max)
+ rhp->rdev.stats.pd.max = rhp->rdev.stats.pd.cur;
+ mutex_unlock(&rhp->rdev.stats.lock);
+ PDBG("%s pdid 0x%0x ptr 0x%p\n", __func__, pdid, php);
+ return &php->ibpd;
+}
+
+static int c4iw_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+ u16 *pkey)
+{
+ PDBG("%s ibdev %p\n", __func__, ibdev);
+ *pkey = 0;
+ return 0;
+}
+
+static int c4iw_query_gid(struct ib_device *ibdev, u8 port, int index,
+ union ib_gid *gid)
+{
+ struct c4iw_dev *dev;
+
+ PDBG("%s ibdev %p, port %d, index %d, gid %p\n",
+ __func__, ibdev, port, index, gid);
+ dev = to_c4iw_dev(ibdev);
+ BUG_ON(port == 0);
+ memset(&(gid->raw[0]), 0, sizeof(gid->raw));
+ memcpy(&(gid->raw[0]), dev->rdev.lldi.ports[port-1]->dev_addr, 6);
+ return 0;
+}
+
+static int c4iw_query_device(struct ib_device *ibdev,
+ struct ib_device_attr *props)
+{
+
+ struct c4iw_dev *dev;
+ PDBG("%s ibdev %p\n", __func__, ibdev);
+
+ dev = to_c4iw_dev(ibdev);
+ memset(props, 0, sizeof *props);
+ memcpy(&props->sys_image_guid, dev->rdev.lldi.ports[0]->dev_addr, 6);
+ props->hw_ver = CHELSIO_CHIP_RELEASE(dev->rdev.lldi.adapter_type);
+ props->fw_ver = dev->rdev.lldi.fw_vers;
+ props->device_cap_flags = dev->device_cap_flags;
+ props->page_size_cap = T4_PAGESIZE_MASK;
+ props->vendor_id = (u32)dev->rdev.lldi.pdev->vendor;
+ props->vendor_part_id = (u32)dev->rdev.lldi.pdev->device;
+ props->max_mr_size = T4_MAX_MR_SIZE;
+ props->max_qp = dev->rdev.lldi.vr->qp.size / 2;
+ props->max_qp_wr = dev->rdev.hw_queue.t4_max_qp_depth;
+ props->max_sge = T4_MAX_RECV_SGE;
+ props->max_sge_rd = 1;
+ props->max_res_rd_atom = dev->rdev.lldi.max_ird_adapter;
+ props->max_qp_rd_atom = min(dev->rdev.lldi.max_ordird_qp,
+ c4iw_max_read_depth);
+ props->max_qp_init_rd_atom = props->max_qp_rd_atom;
+ props->max_cq = dev->rdev.lldi.vr->qp.size;
+ props->max_cqe = dev->rdev.hw_queue.t4_max_cq_depth;
+ props->max_mr = c4iw_num_stags(&dev->rdev);
+ props->max_pd = T4_MAX_NUM_PD;
+ props->local_ca_ack_delay = 0;
+ props->max_fast_reg_page_list_len = t4_max_fr_depth(use_dsgl);
+
+ return 0;
+}
+
+static int c4iw_query_port(struct ib_device *ibdev, u8 port,
+ struct ib_port_attr *props)
+{
+ struct c4iw_dev *dev;
+ struct net_device *netdev;
+ struct in_device *inetdev;
+
+ PDBG("%s ibdev %p\n", __func__, ibdev);
+
+ dev = to_c4iw_dev(ibdev);
+ netdev = dev->rdev.lldi.ports[port-1];
+
+ memset(props, 0, sizeof(struct ib_port_attr));
+ props->max_mtu = IB_MTU_4096;
+ if (netdev->mtu >= 4096)
+ props->active_mtu = IB_MTU_4096;
+ else if (netdev->mtu >= 2048)
+ props->active_mtu = IB_MTU_2048;
+ else if (netdev->mtu >= 1024)
+ props->active_mtu = IB_MTU_1024;
+ else if (netdev->mtu >= 512)
+ props->active_mtu = IB_MTU_512;
+ else
+ props->active_mtu = IB_MTU_256;
+
+ if (!netif_carrier_ok(netdev))
+ props->state = IB_PORT_DOWN;
+ else {
+ inetdev = in_dev_get(netdev);
+ if (inetdev) {
+ if (inetdev->ifa_list)
+ props->state = IB_PORT_ACTIVE;
+ else
+ props->state = IB_PORT_INIT;
+ in_dev_put(inetdev);
+ } else
+ props->state = IB_PORT_INIT;
+ }
+
+ props->port_cap_flags =
+ IB_PORT_CM_SUP |
+ IB_PORT_SNMP_TUNNEL_SUP |
+ IB_PORT_REINIT_SUP |
+ IB_PORT_DEVICE_MGMT_SUP |
+ IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP;
+ props->gid_tbl_len = 1;
+ props->pkey_tbl_len = 1;
+ props->active_width = 2;
+ props->active_speed = IB_SPEED_DDR;
+ props->max_msg_sz = -1;
+
+ return 0;
+}
+
+static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
+ ibdev.dev);
+ PDBG("%s dev 0x%p\n", __func__, dev);
+ return sprintf(buf, "%d\n",
+ CHELSIO_CHIP_RELEASE(c4iw_dev->rdev.lldi.adapter_type));
+}
+
+static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
+ ibdev.dev);
+ PDBG("%s dev 0x%p\n", __func__, dev);
+
+ return sprintf(buf, "%u.%u.%u.%u\n",
+ FW_HDR_FW_VER_MAJOR_G(c4iw_dev->rdev.lldi.fw_vers),
+ FW_HDR_FW_VER_MINOR_G(c4iw_dev->rdev.lldi.fw_vers),
+ FW_HDR_FW_VER_MICRO_G(c4iw_dev->rdev.lldi.fw_vers),
+ FW_HDR_FW_VER_BUILD_G(c4iw_dev->rdev.lldi.fw_vers));
+}
+
+static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
+ ibdev.dev);
+ struct ethtool_drvinfo info;
+ struct net_device *lldev = c4iw_dev->rdev.lldi.ports[0];
+
+ PDBG("%s dev 0x%p\n", __func__, dev);
+ lldev->ethtool_ops->get_drvinfo(lldev, &info);
+ return sprintf(buf, "%s\n", info.driver);
+}
+
+static ssize_t show_board(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct c4iw_dev *c4iw_dev = container_of(dev, struct c4iw_dev,
+ ibdev.dev);
+ PDBG("%s dev 0x%p\n", __func__, dev);
+ return sprintf(buf, "%x.%x\n", c4iw_dev->rdev.lldi.pdev->vendor,
+ c4iw_dev->rdev.lldi.pdev->device);
+}
+
+static int c4iw_get_mib(struct ib_device *ibdev,
+ union rdma_protocol_stats *stats)
+{
+ struct tp_tcp_stats v4, v6;
+ struct c4iw_dev *c4iw_dev = to_c4iw_dev(ibdev);
+
+ cxgb4_get_tcp_stats(c4iw_dev->rdev.lldi.pdev, &v4, &v6);
+ memset(stats, 0, sizeof *stats);
+ stats->iw.tcpInSegs = v4.tcpInSegs + v6.tcpInSegs;
+ stats->iw.tcpOutSegs = v4.tcpOutSegs + v6.tcpOutSegs;
+ stats->iw.tcpRetransSegs = v4.tcpRetransSegs + v6.tcpRetransSegs;
+ stats->iw.tcpOutRsts = v4.tcpOutRsts + v6.tcpOutRsts;
+
+ return 0;
+}
+
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
+
+static struct device_attribute *c4iw_class_attributes[] = {
+ &dev_attr_hw_rev,
+ &dev_attr_fw_ver,
+ &dev_attr_hca_type,
+ &dev_attr_board_id,
+};
+
+int c4iw_register_device(struct c4iw_dev *dev)
+{
+ int ret;
+ int i;
+
+ PDBG("%s c4iw_dev %p\n", __func__, dev);
+ BUG_ON(!dev->rdev.lldi.ports[0]);
+ strlcpy(dev->ibdev.name, "cxgb4_%d", IB_DEVICE_NAME_MAX);
+ memset(&dev->ibdev.node_guid, 0, sizeof(dev->ibdev.node_guid));
+ memcpy(&dev->ibdev.node_guid, dev->rdev.lldi.ports[0]->dev_addr, 6);
+ dev->ibdev.owner = THIS_MODULE;
+ dev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY | IB_DEVICE_MEM_WINDOW;
+ if (fastreg_support)
+ dev->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
+ dev->ibdev.local_dma_lkey = 0;
+ dev->ibdev.uverbs_cmd_mask =
+ (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+ (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_REG_MR) |
+ (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_QP) |
+ (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+ (1ull << IB_USER_VERBS_CMD_POST_SEND) |
+ (1ull << IB_USER_VERBS_CMD_POST_RECV);
+ dev->ibdev.node_type = RDMA_NODE_RNIC;
+ memcpy(dev->ibdev.node_desc, C4IW_NODE_DESC, sizeof(C4IW_NODE_DESC));
+ dev->ibdev.phys_port_cnt = dev->rdev.lldi.nports;
+ dev->ibdev.num_comp_vectors = dev->rdev.lldi.nciq;
+ dev->ibdev.dma_device = &(dev->rdev.lldi.pdev->dev);
+ dev->ibdev.query_device = c4iw_query_device;
+ dev->ibdev.query_port = c4iw_query_port;
+ dev->ibdev.query_pkey = c4iw_query_pkey;
+ dev->ibdev.query_gid = c4iw_query_gid;
+ dev->ibdev.alloc_ucontext = c4iw_alloc_ucontext;
+ dev->ibdev.dealloc_ucontext = c4iw_dealloc_ucontext;
+ dev->ibdev.mmap = c4iw_mmap;
+ dev->ibdev.alloc_pd = c4iw_allocate_pd;
+ dev->ibdev.dealloc_pd = c4iw_deallocate_pd;
+ dev->ibdev.create_ah = c4iw_ah_create;
+ dev->ibdev.destroy_ah = c4iw_ah_destroy;
+ dev->ibdev.create_qp = c4iw_create_qp;
+ dev->ibdev.modify_qp = c4iw_ib_modify_qp;
+ dev->ibdev.query_qp = c4iw_ib_query_qp;
+ dev->ibdev.destroy_qp = c4iw_destroy_qp;
+ dev->ibdev.create_cq = c4iw_create_cq;
+ dev->ibdev.destroy_cq = c4iw_destroy_cq;
+ dev->ibdev.resize_cq = c4iw_resize_cq;
+ dev->ibdev.poll_cq = c4iw_poll_cq;
+ dev->ibdev.get_dma_mr = c4iw_get_dma_mr;
+ dev->ibdev.reg_phys_mr = c4iw_register_phys_mem;
+ dev->ibdev.rereg_phys_mr = c4iw_reregister_phys_mem;
+ dev->ibdev.reg_user_mr = c4iw_reg_user_mr;
+ dev->ibdev.dereg_mr = c4iw_dereg_mr;
+ dev->ibdev.alloc_mw = c4iw_alloc_mw;
+ dev->ibdev.bind_mw = c4iw_bind_mw;
+ dev->ibdev.dealloc_mw = c4iw_dealloc_mw;
+ dev->ibdev.alloc_fast_reg_mr = c4iw_alloc_fast_reg_mr;
+ dev->ibdev.alloc_fast_reg_page_list = c4iw_alloc_fastreg_pbl;
+ dev->ibdev.free_fast_reg_page_list = c4iw_free_fastreg_pbl;
+ dev->ibdev.attach_mcast = c4iw_multicast_attach;
+ dev->ibdev.detach_mcast = c4iw_multicast_detach;
+ dev->ibdev.process_mad = c4iw_process_mad;
+ dev->ibdev.req_notify_cq = c4iw_arm_cq;
+ dev->ibdev.post_send = c4iw_post_send;
+ dev->ibdev.post_recv = c4iw_post_receive;
+ dev->ibdev.get_protocol_stats = c4iw_get_mib;
+ dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION;
+
+ dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL);
+ if (!dev->ibdev.iwcm)
+ return -ENOMEM;
+
+ dev->ibdev.iwcm->connect = c4iw_connect;
+ dev->ibdev.iwcm->accept = c4iw_accept_cr;
+ dev->ibdev.iwcm->reject = c4iw_reject_cr;
+ dev->ibdev.iwcm->create_listen = c4iw_create_listen;
+ dev->ibdev.iwcm->destroy_listen = c4iw_destroy_listen;
+ dev->ibdev.iwcm->add_ref = c4iw_qp_add_ref;
+ dev->ibdev.iwcm->rem_ref = c4iw_qp_rem_ref;
+ dev->ibdev.iwcm->get_qp = c4iw_get_qp;
+
+ ret = ib_register_device(&dev->ibdev, NULL);
+ if (ret)
+ goto bail1;
+
+ for (i = 0; i < ARRAY_SIZE(c4iw_class_attributes); ++i) {
+ ret = device_create_file(&dev->ibdev.dev,
+ c4iw_class_attributes[i]);
+ if (ret)
+ goto bail2;
+ }
+ return 0;
+bail2:
+ ib_unregister_device(&dev->ibdev);
+bail1:
+ kfree(dev->ibdev.iwcm);
+ return ret;
+}
+
+void c4iw_unregister_device(struct c4iw_dev *dev)
+{
+ int i;
+
+ PDBG("%s c4iw_dev %p\n", __func__, dev);
+ for (i = 0; i < ARRAY_SIZE(c4iw_class_attributes); ++i)
+ device_remove_file(&dev->ibdev.dev,
+ c4iw_class_attributes[i]);
+ ib_unregister_device(&dev->ibdev);
+ kfree(dev->ibdev.iwcm);
+ return;
+}
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
new file mode 100644
index 000000000..389ced335
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -0,0 +1,1886 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+
+#include "iw_cxgb4.h"
+
+static int db_delay_usecs = 1;
+module_param(db_delay_usecs, int, 0644);
+MODULE_PARM_DESC(db_delay_usecs, "Usecs to delay awaiting db fifo to drain");
+
+static int ocqp_support = 1;
+module_param(ocqp_support, int, 0644);
+MODULE_PARM_DESC(ocqp_support, "Support on-chip SQs (default=1)");
+
+int db_fc_threshold = 1000;
+module_param(db_fc_threshold, int, 0644);
+MODULE_PARM_DESC(db_fc_threshold,
+ "QP count/threshold that triggers"
+ " automatic db flow control mode (default = 1000)");
+
+int db_coalescing_threshold;
+module_param(db_coalescing_threshold, int, 0644);
+MODULE_PARM_DESC(db_coalescing_threshold,
+ "QP count/threshold that triggers"
+ " disabling db coalescing (default = 0)");
+
+static int max_fr_immd = T4_MAX_FR_IMMD;
+module_param(max_fr_immd, int, 0644);
+MODULE_PARM_DESC(max_fr_immd, "fastreg threshold for using DSGL instead of immediate");
+
+static int alloc_ird(struct c4iw_dev *dev, u32 ird)
+{
+ int ret = 0;
+
+ spin_lock_irq(&dev->lock);
+ if (ird <= dev->avail_ird)
+ dev->avail_ird -= ird;
+ else
+ ret = -ENOMEM;
+ spin_unlock_irq(&dev->lock);
+
+ if (ret)
+ dev_warn(&dev->rdev.lldi.pdev->dev,
+ "device IRD resources exhausted\n");
+
+ return ret;
+}
+
+static void free_ird(struct c4iw_dev *dev, int ird)
+{
+ spin_lock_irq(&dev->lock);
+ dev->avail_ird += ird;
+ spin_unlock_irq(&dev->lock);
+}
+
+static void set_state(struct c4iw_qp *qhp, enum c4iw_qp_state state)
+{
+ unsigned long flag;
+ spin_lock_irqsave(&qhp->lock, flag);
+ qhp->attr.state = state;
+ spin_unlock_irqrestore(&qhp->lock, flag);
+}
+
+static void dealloc_oc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq)
+{
+ c4iw_ocqp_pool_free(rdev, sq->dma_addr, sq->memsize);
+}
+
+static void dealloc_host_sq(struct c4iw_rdev *rdev, struct t4_sq *sq)
+{
+ dma_free_coherent(&(rdev->lldi.pdev->dev), sq->memsize, sq->queue,
+ pci_unmap_addr(sq, mapping));
+}
+
+static void dealloc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq)
+{
+ if (t4_sq_onchip(sq))
+ dealloc_oc_sq(rdev, sq);
+ else
+ dealloc_host_sq(rdev, sq);
+}
+
+static int alloc_oc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq)
+{
+ if (!ocqp_support || !ocqp_supported(&rdev->lldi))
+ return -ENOSYS;
+ sq->dma_addr = c4iw_ocqp_pool_alloc(rdev, sq->memsize);
+ if (!sq->dma_addr)
+ return -ENOMEM;
+ sq->phys_addr = rdev->oc_mw_pa + sq->dma_addr -
+ rdev->lldi.vr->ocq.start;
+ sq->queue = (__force union t4_wr *)(rdev->oc_mw_kva + sq->dma_addr -
+ rdev->lldi.vr->ocq.start);
+ sq->flags |= T4_SQ_ONCHIP;
+ return 0;
+}
+
+static int alloc_host_sq(struct c4iw_rdev *rdev, struct t4_sq *sq)
+{
+ sq->queue = dma_alloc_coherent(&(rdev->lldi.pdev->dev), sq->memsize,
+ &(sq->dma_addr), GFP_KERNEL);
+ if (!sq->queue)
+ return -ENOMEM;
+ sq->phys_addr = virt_to_phys(sq->queue);
+ pci_unmap_addr_set(sq, mapping, sq->dma_addr);
+ return 0;
+}
+
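+/* Use an on-chip SQ for user queues when possible, else host memory. */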
+static int alloc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq, int user)
+{
+ int ret = -ENOSYS;
+ if (user)
+ ret = alloc_oc_sq(rdev, sq);
+ if (ret)
+ ret = alloc_host_sq(rdev, sq);
+ return ret;
+}
+
+static int destroy_qp(struct c4iw_rdev *rdev, struct t4_wq *wq,
+ struct c4iw_dev_ucontext *uctx)
+{
+ /*
+ * uP clears EQ contexts when the connection exits rdma mode,
+ * so no need to post a RESET WR for these EQs.
+ */
+ dma_free_coherent(&(rdev->lldi.pdev->dev),
+ wq->rq.memsize, wq->rq.queue,
+ dma_unmap_addr(&wq->rq, mapping));
+ dealloc_sq(rdev, &wq->sq);
+ c4iw_rqtpool_free(rdev, wq->rq.rqt_hwaddr, wq->rq.rqt_size);
+ kfree(wq->rq.sw_rq);
+ kfree(wq->sq.sw_sq);
+ c4iw_put_qpid(rdev, wq->rq.qid, uctx);
+ c4iw_put_qpid(rdev, wq->sq.qid, uctx);
+ return 0;
+}
+
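+/*
+ * Allocate SQ/RQ queue IDs, host shadow queues for kernel QPs, RQT space
+ * and queue memory, then post a single FW_RI_RES_WR that creates both
+ * hardware egress queues.
+ */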
+static int create_qp(struct c4iw_rdev *rdev, struct t4_wq *wq,
+ struct t4_cq *rcq, struct t4_cq *scq,
+ struct c4iw_dev_ucontext *uctx)
+{
+ int user = (uctx != &rdev->uctx);
+ struct fw_ri_res_wr *res_wr;
+ struct fw_ri_res *res;
+ int wr_len;
+ struct c4iw_wr_wait wr_wait;
+ struct sk_buff *skb;
+ int ret = 0;
+ int eqsize;
+
+ wq->sq.qid = c4iw_get_qpid(rdev, uctx);
+ if (!wq->sq.qid)
+ return -ENOMEM;
+
+ wq->rq.qid = c4iw_get_qpid(rdev, uctx);
+ if (!wq->rq.qid) {
+ ret = -ENOMEM;
+ goto free_sq_qid;
+ }
+
+ if (!user) {
+ wq->sq.sw_sq = kzalloc(wq->sq.size * sizeof *wq->sq.sw_sq,
+ GFP_KERNEL);
+ if (!wq->sq.sw_sq) {
+ ret = -ENOMEM;
+ goto free_rq_qid;
+ }
+
+ wq->rq.sw_rq = kzalloc(wq->rq.size * sizeof *wq->rq.sw_rq,
+ GFP_KERNEL);
+ if (!wq->rq.sw_rq) {
+ ret = -ENOMEM;
+ goto free_sw_sq;
+ }
+ }
+
+ /*
+ * RQT must be a power of 2 and at least 16 deep.
+ */
+ wq->rq.rqt_size = roundup_pow_of_two(max_t(u16, wq->rq.size, 16));
+ wq->rq.rqt_hwaddr = c4iw_rqtpool_alloc(rdev, wq->rq.rqt_size);
+ if (!wq->rq.rqt_hwaddr) {
+ ret = -ENOMEM;
+ goto free_sw_rq;
+ }
+
+ ret = alloc_sq(rdev, &wq->sq, user);
+ if (ret)
+ goto free_hwaddr;
+ memset(wq->sq.queue, 0, wq->sq.memsize);
+ dma_unmap_addr_set(&wq->sq, mapping, wq->sq.dma_addr);
+
+ wq->rq.queue = dma_alloc_coherent(&(rdev->lldi.pdev->dev),
+ wq->rq.memsize, &(wq->rq.dma_addr),
+ GFP_KERNEL);
+ if (!wq->rq.queue) {
+ ret = -ENOMEM;
+ goto free_sq;
+ }
+ PDBG("%s sq base va 0x%p pa 0x%llx rq base va 0x%p pa 0x%llx\n",
+ __func__, wq->sq.queue,
+ (unsigned long long)virt_to_phys(wq->sq.queue),
+ wq->rq.queue,
+ (unsigned long long)virt_to_phys(wq->rq.queue));
+ memset(wq->rq.queue, 0, wq->rq.memsize);
+ dma_unmap_addr_set(&wq->rq, mapping, wq->rq.dma_addr);
+
+ wq->db = rdev->lldi.db_reg;
+ wq->gts = rdev->lldi.gts_reg;
+ if (user || is_t5(rdev->lldi.adapter_type)) {
+ u32 off;
+
+ off = (wq->sq.qid << rdev->qpshift) & PAGE_MASK;
+ if (user) {
+ wq->sq.udb = (u64 __iomem *)(rdev->bar2_pa + off);
+ } else {
+ off += 128 * (wq->sq.qid & rdev->qpmask) + 8;
+ wq->sq.udb = (u64 __iomem *)(rdev->bar2_kva + off);
+ }
+ off = (wq->rq.qid << rdev->qpshift) & PAGE_MASK;
+ if (user) {
+ wq->rq.udb = (u64 __iomem *)(rdev->bar2_pa + off);
+ } else {
+ off += 128 * (wq->rq.qid & rdev->qpmask) + 8;
+ wq->rq.udb = (u64 __iomem *)(rdev->bar2_kva + off);
+ }
+ }
+ wq->rdev = rdev;
+ wq->rq.msn = 1;
+
+ /* build fw_ri_res_wr */
+ wr_len = sizeof *res_wr + 2 * sizeof *res;
+
+ skb = alloc_skb(wr_len, GFP_KERNEL);
+ if (!skb) {
+ ret = -ENOMEM;
+ goto free_dma;
+ }
+ set_wr_txq(skb, CPL_PRIORITY_CONTROL, 0);
+
+ res_wr = (struct fw_ri_res_wr *)__skb_put(skb, wr_len);
+ memset(res_wr, 0, wr_len);
+ res_wr->op_nres = cpu_to_be32(
+ FW_WR_OP_V(FW_RI_RES_WR) |
+ FW_RI_RES_WR_NRES_V(2) |
+ FW_WR_COMPL_F);
+ res_wr->len16_pkd = cpu_to_be32(DIV_ROUND_UP(wr_len, 16));
+ res_wr->cookie = (uintptr_t)&wr_wait;
+ res = res_wr->res;
+ res->u.sqrq.restype = FW_RI_RES_TYPE_SQ;
+ res->u.sqrq.op = FW_RI_RES_OP_WRITE;
+
+ /*
+ * eqsize is the number of 64B entries plus the status page size.
+ */
+ eqsize = wq->sq.size * T4_SQ_NUM_SLOTS +
+ rdev->hw_queue.t4_eq_status_entries;
+
+ res->u.sqrq.fetchszm_to_iqid = cpu_to_be32(
+ FW_RI_RES_WR_HOSTFCMODE_V(0) | /* no host cidx updates */
+ FW_RI_RES_WR_CPRIO_V(0) | /* don't keep in chip cache */
+ FW_RI_RES_WR_PCIECHN_V(0) | /* set by uP at ri_init time */
+ (t4_sq_onchip(&wq->sq) ? FW_RI_RES_WR_ONCHIP_F : 0) |
+ FW_RI_RES_WR_IQID_V(scq->cqid));
+ res->u.sqrq.dcaen_to_eqsize = cpu_to_be32(
+ FW_RI_RES_WR_DCAEN_V(0) |
+ FW_RI_RES_WR_DCACPU_V(0) |
+ FW_RI_RES_WR_FBMIN_V(2) |
+ FW_RI_RES_WR_FBMAX_V(2) |
+ FW_RI_RES_WR_CIDXFTHRESHO_V(0) |
+ FW_RI_RES_WR_CIDXFTHRESH_V(0) |
+ FW_RI_RES_WR_EQSIZE_V(eqsize));
+ res->u.sqrq.eqid = cpu_to_be32(wq->sq.qid);
+ res->u.sqrq.eqaddr = cpu_to_be64(wq->sq.dma_addr);
+ res++;
+ res->u.sqrq.restype = FW_RI_RES_TYPE_RQ;
+ res->u.sqrq.op = FW_RI_RES_OP_WRITE;
+
+ /*
+ * eqsize is the number of 64B entries plus the status page size.
+ */
+ eqsize = wq->rq.size * T4_RQ_NUM_SLOTS +
+ rdev->hw_queue.t4_eq_status_entries;
+ res->u.sqrq.fetchszm_to_iqid = cpu_to_be32(
+ FW_RI_RES_WR_HOSTFCMODE_V(0) | /* no host cidx updates */
+ FW_RI_RES_WR_CPRIO_V(0) | /* don't keep in chip cache */
+ FW_RI_RES_WR_PCIECHN_V(0) | /* set by uP at ri_init time */
+ FW_RI_RES_WR_IQID_V(rcq->cqid));
+ res->u.sqrq.dcaen_to_eqsize = cpu_to_be32(
+ FW_RI_RES_WR_DCAEN_V(0) |
+ FW_RI_RES_WR_DCACPU_V(0) |
+ FW_RI_RES_WR_FBMIN_V(2) |
+ FW_RI_RES_WR_FBMAX_V(2) |
+ FW_RI_RES_WR_CIDXFTHRESHO_V(0) |
+ FW_RI_RES_WR_CIDXFTHRESH_V(0) |
+ FW_RI_RES_WR_EQSIZE_V(eqsize));
+ res->u.sqrq.eqid = cpu_to_be32(wq->rq.qid);
+ res->u.sqrq.eqaddr = cpu_to_be64(wq->rq.dma_addr);
+
+ c4iw_init_wr_wait(&wr_wait);
+
+ ret = c4iw_ofld_send(rdev, skb);
+ if (ret)
+ goto free_dma;
+ ret = c4iw_wait_for_reply(rdev, &wr_wait, 0, wq->sq.qid, __func__);
+ if (ret)
+ goto free_dma;
+
+ PDBG("%s sqid 0x%x rqid 0x%x kdb 0x%p squdb 0x%lx rqudb 0x%lx\n",
+ __func__, wq->sq.qid, wq->rq.qid, wq->db,
+ (__force unsigned long) wq->sq.udb,
+ (__force unsigned long) wq->rq.udb);
+
+ return 0;
+free_dma:
+ dma_free_coherent(&(rdev->lldi.pdev->dev),
+ wq->rq.memsize, wq->rq.queue,
+ dma_unmap_addr(&wq->rq, mapping));
+free_sq:
+ dealloc_sq(rdev, &wq->sq);
+free_hwaddr:
+ c4iw_rqtpool_free(rdev, wq->rq.rqt_hwaddr, wq->rq.rqt_size);
+free_sw_rq:
+ kfree(wq->rq.sw_rq);
+free_sw_sq:
+ kfree(wq->sq.sw_sq);
+free_rq_qid:
+ c4iw_put_qpid(rdev, wq->rq.qid, uctx);
+free_sq_qid:
+ c4iw_put_qpid(rdev, wq->sq.qid, uctx);
+ return ret;
+}
+
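+/*
+ * Copy the data described by the WR's SGEs into the WQE as immediate
+ * data, wrapping around the end of the SQ as needed.  Returns -EMSGSIZE
+ * if the total length exceeds 'max'.
+ */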
+static int build_immd(struct t4_sq *sq, struct fw_ri_immd *immdp,
+ struct ib_send_wr *wr, int max, u32 *plenp)
+{
+ u8 *dstp, *srcp;
+ u32 plen = 0;
+ int i;
+ int rem, len;
+
+ dstp = (u8 *)immdp->data;
+ for (i = 0; i < wr->num_sge; i++) {
+ if ((plen + wr->sg_list[i].length) > max)
+ return -EMSGSIZE;
+ srcp = (u8 *)(unsigned long)wr->sg_list[i].addr;
+ plen += wr->sg_list[i].length;
+ rem = wr->sg_list[i].length;
+ while (rem) {
+ if (dstp == (u8 *)&sq->queue[sq->size])
+ dstp = (u8 *)sq->queue;
+ if (rem <= (u8 *)&sq->queue[sq->size] - dstp)
+ len = rem;
+ else
+ len = (u8 *)&sq->queue[sq->size] - dstp;
+ memcpy(dstp, srcp, len);
+ dstp += len;
+ srcp += len;
+ rem -= len;
+ }
+ }
+ len = roundup(plen + sizeof *immdp, 16) - (plen + sizeof *immdp);
+ if (len)
+ memset(dstp, 0, len);
+ immdp->op = FW_RI_DATA_IMMD;
+ immdp->r1 = 0;
+ immdp->r2 = 0;
+ immdp->immdlen = cpu_to_be32(plen);
+ *plenp = plen;
+ return 0;
+}
+
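+/*
+ * Build a fw_ri_isgl from the ib_sge array, wrapping the flit pointer at
+ * the end of the queue.  The total payload length is checked for u32
+ * overflow and returned through plenp when requested.
+ */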
+static int build_isgl(__be64 *queue_start, __be64 *queue_end,
+ struct fw_ri_isgl *isglp, struct ib_sge *sg_list,
+ int num_sge, u32 *plenp)
+
+{
+ int i;
+ u32 plen = 0;
+ __be64 *flitp = (__be64 *)isglp->sge;
+
+ for (i = 0; i < num_sge; i++) {
+ if ((plen + sg_list[i].length) < plen)
+ return -EMSGSIZE;
+ plen += sg_list[i].length;
+ *flitp = cpu_to_be64(((u64)sg_list[i].lkey << 32) |
+ sg_list[i].length);
+ if (++flitp == queue_end)
+ flitp = queue_start;
+ *flitp = cpu_to_be64(sg_list[i].addr);
+ if (++flitp == queue_end)
+ flitp = queue_start;
+ }
+ *flitp = (__force __be64)0;
+ isglp->op = FW_RI_DATA_ISGL;
+ isglp->r1 = 0;
+ isglp->nsge = cpu_to_be16(num_sge);
+ isglp->r2 = 0;
+ if (plenp)
+ *plenp = plen;
+ return 0;
+}
+
+static int build_rdma_send(struct t4_sq *sq, union t4_wr *wqe,
+ struct ib_send_wr *wr, u8 *len16)
+{
+ u32 plen;
+ int size;
+ int ret;
+
+ if (wr->num_sge > T4_MAX_SEND_SGE)
+ return -EINVAL;
+ switch (wr->opcode) {
+ case IB_WR_SEND:
+ if (wr->send_flags & IB_SEND_SOLICITED)
+ wqe->send.sendop_pkd = cpu_to_be32(
+ FW_RI_SEND_WR_SENDOP_V(FW_RI_SEND_WITH_SE));
+ else
+ wqe->send.sendop_pkd = cpu_to_be32(
+ FW_RI_SEND_WR_SENDOP_V(FW_RI_SEND));
+ wqe->send.stag_inv = 0;
+ break;
+ case IB_WR_SEND_WITH_INV:
+ if (wr->send_flags & IB_SEND_SOLICITED)
+ wqe->send.sendop_pkd = cpu_to_be32(
+ FW_RI_SEND_WR_SENDOP_V(FW_RI_SEND_WITH_SE_INV));
+ else
+ wqe->send.sendop_pkd = cpu_to_be32(
+ FW_RI_SEND_WR_SENDOP_V(FW_RI_SEND_WITH_INV));
+ wqe->send.stag_inv = cpu_to_be32(wr->ex.invalidate_rkey);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ wqe->send.r3 = 0;
+ wqe->send.r4 = 0;
+
+ plen = 0;
+ if (wr->num_sge) {
+ if (wr->send_flags & IB_SEND_INLINE) {
+ ret = build_immd(sq, wqe->send.u.immd_src, wr,
+ T4_MAX_SEND_INLINE, &plen);
+ if (ret)
+ return ret;
+ size = sizeof wqe->send + sizeof(struct fw_ri_immd) +
+ plen;
+ } else {
+ ret = build_isgl((__be64 *)sq->queue,
+ (__be64 *)&sq->queue[sq->size],
+ wqe->send.u.isgl_src,
+ wr->sg_list, wr->num_sge, &plen);
+ if (ret)
+ return ret;
+ size = sizeof wqe->send + sizeof(struct fw_ri_isgl) +
+ wr->num_sge * sizeof(struct fw_ri_sge);
+ }
+ } else {
+ wqe->send.u.immd_src[0].op = FW_RI_DATA_IMMD;
+ wqe->send.u.immd_src[0].r1 = 0;
+ wqe->send.u.immd_src[0].r2 = 0;
+ wqe->send.u.immd_src[0].immdlen = 0;
+ size = sizeof wqe->send + sizeof(struct fw_ri_immd);
+ plen = 0;
+ }
+ *len16 = DIV_ROUND_UP(size, 16);
+ wqe->send.plen = cpu_to_be32(plen);
+ return 0;
+}
+
+static int build_rdma_write(struct t4_sq *sq, union t4_wr *wqe,
+ struct ib_send_wr *wr, u8 *len16)
+{
+ u32 plen;
+ int size;
+ int ret;
+
+ if (wr->num_sge > T4_MAX_SEND_SGE)
+ return -EINVAL;
+ wqe->write.r2 = 0;
+ wqe->write.stag_sink = cpu_to_be32(wr->wr.rdma.rkey);
+ wqe->write.to_sink = cpu_to_be64(wr->wr.rdma.remote_addr);
+ if (wr->num_sge) {
+ if (wr->send_flags & IB_SEND_INLINE) {
+ ret = build_immd(sq, wqe->write.u.immd_src, wr,
+ T4_MAX_WRITE_INLINE, &plen);
+ if (ret)
+ return ret;
+ size = sizeof wqe->write + sizeof(struct fw_ri_immd) +
+ plen;
+ } else {
+ ret = build_isgl((__be64 *)sq->queue,
+ (__be64 *)&sq->queue[sq->size],
+ wqe->write.u.isgl_src,
+ wr->sg_list, wr->num_sge, &plen);
+ if (ret)
+ return ret;
+ size = sizeof wqe->write + sizeof(struct fw_ri_isgl) +
+ wr->num_sge * sizeof(struct fw_ri_sge);
+ }
+ } else {
+ wqe->write.u.immd_src[0].op = FW_RI_DATA_IMMD;
+ wqe->write.u.immd_src[0].r1 = 0;
+ wqe->write.u.immd_src[0].r2 = 0;
+ wqe->write.u.immd_src[0].immdlen = 0;
+ size = sizeof wqe->write + sizeof(struct fw_ri_immd);
+ plen = 0;
+ }
+ *len16 = DIV_ROUND_UP(size, 16);
+ wqe->write.plen = cpu_to_be32(plen);
+ return 0;
+}
+
+static int build_rdma_read(union t4_wr *wqe, struct ib_send_wr *wr, u8 *len16)
+{
+ if (wr->num_sge > 1)
+ return -EINVAL;
+ if (wr->num_sge) {
+ wqe->read.stag_src = cpu_to_be32(wr->wr.rdma.rkey);
+ wqe->read.to_src_hi = cpu_to_be32((u32)(wr->wr.rdma.remote_addr
+ >> 32));
+ wqe->read.to_src_lo = cpu_to_be32((u32)wr->wr.rdma.remote_addr);
+ wqe->read.stag_sink = cpu_to_be32(wr->sg_list[0].lkey);
+ wqe->read.plen = cpu_to_be32(wr->sg_list[0].length);
+ wqe->read.to_sink_hi = cpu_to_be32((u32)(wr->sg_list[0].addr
+ >> 32));
+ wqe->read.to_sink_lo = cpu_to_be32((u32)(wr->sg_list[0].addr));
+ } else {
+ wqe->read.stag_src = cpu_to_be32(2);
+ wqe->read.to_src_hi = 0;
+ wqe->read.to_src_lo = 0;
+ wqe->read.stag_sink = cpu_to_be32(2);
+ wqe->read.plen = 0;
+ wqe->read.to_sink_hi = 0;
+ wqe->read.to_sink_lo = 0;
+ }
+ wqe->read.r2 = 0;
+ wqe->read.r5 = 0;
+ *len16 = DIV_ROUND_UP(sizeof wqe->read, 16);
+ return 0;
+}
+
+static int build_rdma_recv(struct c4iw_qp *qhp, union t4_recv_wr *wqe,
+ struct ib_recv_wr *wr, u8 *len16)
+{
+ int ret;
+
+ ret = build_isgl((__be64 *)qhp->wq.rq.queue,
+ (__be64 *)&qhp->wq.rq.queue[qhp->wq.rq.size],
+ &wqe->recv.isgl, wr->sg_list, wr->num_sge, NULL);
+ if (ret)
+ return ret;
+ *len16 = DIV_ROUND_UP(sizeof wqe->recv +
+ wr->num_sge * sizeof(struct fw_ri_sge), 16);
+ return 0;
+}
+
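+/*
+ * Build a fast-register (FR_NSMR) WQE.  On T5 devices with DSGL enabled
+ * and a page list larger than max_fr_immd, the PBL is referenced by its
+ * DMA address via a DSGL; otherwise the page list is copied into the WQE
+ * as immediate data, wrapping at the end of the SQ.
+ */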
+static int build_fastreg(struct t4_sq *sq, union t4_wr *wqe,
+ struct ib_send_wr *wr, u8 *len16, u8 t5dev)
+{
+
+ struct fw_ri_immd *imdp;
+ __be64 *p;
+ int i;
+ int pbllen = roundup(wr->wr.fast_reg.page_list_len * sizeof(u64), 32);
+ int rem;
+
+ if (wr->wr.fast_reg.page_list_len >
+ t4_max_fr_depth(use_dsgl))
+ return -EINVAL;
+
+ wqe->fr.qpbinde_to_dcacpu = 0;
+ wqe->fr.pgsz_shift = wr->wr.fast_reg.page_shift - 12;
+ wqe->fr.addr_type = FW_RI_VA_BASED_TO;
+ wqe->fr.mem_perms = c4iw_ib_to_tpt_access(wr->wr.fast_reg.access_flags);
+ wqe->fr.len_hi = 0;
+ wqe->fr.len_lo = cpu_to_be32(wr->wr.fast_reg.length);
+ wqe->fr.stag = cpu_to_be32(wr->wr.fast_reg.rkey);
+ wqe->fr.va_hi = cpu_to_be32(wr->wr.fast_reg.iova_start >> 32);
+ wqe->fr.va_lo_fbo = cpu_to_be32(wr->wr.fast_reg.iova_start &
+ 0xffffffff);
+
+ if (t5dev && use_dsgl && (pbllen > max_fr_immd)) {
+ struct c4iw_fr_page_list *c4pl =
+ to_c4iw_fr_page_list(wr->wr.fast_reg.page_list);
+ struct fw_ri_dsgl *sglp;
+
+ for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) {
+ wr->wr.fast_reg.page_list->page_list[i] = (__force u64)
+ cpu_to_be64((u64)
+ wr->wr.fast_reg.page_list->page_list[i]);
+ }
+
+ sglp = (struct fw_ri_dsgl *)(&wqe->fr + 1);
+ sglp->op = FW_RI_DATA_DSGL;
+ sglp->r1 = 0;
+ sglp->nsge = cpu_to_be16(1);
+ sglp->addr0 = cpu_to_be64(c4pl->dma_addr);
+ sglp->len0 = cpu_to_be32(pbllen);
+
+ *len16 = DIV_ROUND_UP(sizeof(wqe->fr) + sizeof(*sglp), 16);
+ } else {
+ imdp = (struct fw_ri_immd *)(&wqe->fr + 1);
+ imdp->op = FW_RI_DATA_IMMD;
+ imdp->r1 = 0;
+ imdp->r2 = 0;
+ imdp->immdlen = cpu_to_be32(pbllen);
+ p = (__be64 *)(imdp + 1);
+ rem = pbllen;
+ for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) {
+ *p = cpu_to_be64(
+ (u64)wr->wr.fast_reg.page_list->page_list[i]);
+ rem -= sizeof(*p);
+ if (++p == (__be64 *)&sq->queue[sq->size])
+ p = (__be64 *)sq->queue;
+ }
+ BUG_ON(rem < 0);
+ while (rem) {
+ *p = 0;
+ rem -= sizeof(*p);
+ if (++p == (__be64 *)&sq->queue[sq->size])
+ p = (__be64 *)sq->queue;
+ }
+ *len16 = DIV_ROUND_UP(sizeof(wqe->fr) + sizeof(*imdp)
+ + pbllen, 16);
+ }
+ return 0;
+}
+
+static int build_inv_stag(union t4_wr *wqe, struct ib_send_wr *wr,
+ u8 *len16)
+{
+ wqe->inv.stag_inv = cpu_to_be32(wr->ex.invalidate_rkey);
+ wqe->inv.r2 = 0;
+ *len16 = DIV_ROUND_UP(sizeof wqe->inv, 16);
+ return 0;
+}
+
+void c4iw_qp_add_ref(struct ib_qp *qp)
+{
+ PDBG("%s ib_qp %p\n", __func__, qp);
+ atomic_inc(&(to_c4iw_qp(qp)->refcnt));
+}
+
+void c4iw_qp_rem_ref(struct ib_qp *qp)
+{
+ PDBG("%s ib_qp %p\n", __func__, qp);
+ if (atomic_dec_and_test(&(to_c4iw_qp(qp)->refcnt)))
+ wake_up(&(to_c4iw_qp(qp)->wait));
+}
+
+static void add_to_fc_list(struct list_head *head, struct list_head *entry)
+{
+ if (list_empty(entry))
+ list_add_tail(entry, head);
+}
+
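+/*
+ * Ring the kernel SQ doorbell (the RQ variant below mirrors this).  If
+ * the device is under doorbell flow control (db_state != NORMAL), queue
+ * the QP on the flow-control list instead and accumulate the pidx
+ * increment to be applied later.
+ */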
+static int ring_kernel_sq_db(struct c4iw_qp *qhp, u16 inc)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&qhp->rhp->lock, flags);
+ spin_lock(&qhp->lock);
+ if (qhp->rhp->db_state == NORMAL)
+ t4_ring_sq_db(&qhp->wq, inc,
+ is_t5(qhp->rhp->rdev.lldi.adapter_type), NULL);
+ else {
+ add_to_fc_list(&qhp->rhp->db_fc_list, &qhp->db_fc_entry);
+ qhp->wq.sq.wq_pidx_inc += inc;
+ }
+ spin_unlock(&qhp->lock);
+ spin_unlock_irqrestore(&qhp->rhp->lock, flags);
+ return 0;
+}
+
+static int ring_kernel_rq_db(struct c4iw_qp *qhp, u16 inc)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&qhp->rhp->lock, flags);
+ spin_lock(&qhp->lock);
+ if (qhp->rhp->db_state == NORMAL)
+ t4_ring_rq_db(&qhp->wq, inc,
+ is_t5(qhp->rhp->rdev.lldi.adapter_type), NULL);
+ else {
+ add_to_fc_list(&qhp->rhp->db_fc_list, &qhp->db_fc_entry);
+ qhp->wq.rq.wq_pidx_inc += inc;
+ }
+ spin_unlock(&qhp->lock);
+ spin_unlock_irqrestore(&qhp->rhp->lock, flags);
+ return 0;
+}
+
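+/*
+ * Post a chain of send work requests.  Each WR is translated into a T4
+ * WQE in the SQ and its software SQ entry filled in for completion
+ * processing; the doorbell is rung once for the whole chain, or deferred
+ * through ring_kernel_sq_db() when doorbells are off.
+ */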
+int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr)
+{
+ int err = 0;
+ u8 len16 = 0;
+ enum fw_wr_opcodes fw_opcode = 0;
+ enum fw_ri_wr_flags fw_flags;
+ struct c4iw_qp *qhp;
+ union t4_wr *wqe = NULL;
+ u32 num_wrs;
+ struct t4_swsqe *swsqe;
+ unsigned long flag;
+ u16 idx = 0;
+
+ qhp = to_c4iw_qp(ibqp);
+ spin_lock_irqsave(&qhp->lock, flag);
+ if (t4_wq_in_error(&qhp->wq)) {
+ spin_unlock_irqrestore(&qhp->lock, flag);
+ return -EINVAL;
+ }
+ num_wrs = t4_sq_avail(&qhp->wq);
+ if (num_wrs == 0) {
+ spin_unlock_irqrestore(&qhp->lock, flag);
+ return -ENOMEM;
+ }
+ while (wr) {
+ if (num_wrs == 0) {
+ err = -ENOMEM;
+ *bad_wr = wr;
+ break;
+ }
+ wqe = (union t4_wr *)((u8 *)qhp->wq.sq.queue +
+ qhp->wq.sq.wq_pidx * T4_EQ_ENTRY_SIZE);
+
+ fw_flags = 0;
+ if (wr->send_flags & IB_SEND_SOLICITED)
+ fw_flags |= FW_RI_SOLICITED_EVENT_FLAG;
+ if (wr->send_flags & IB_SEND_SIGNALED || qhp->sq_sig_all)
+ fw_flags |= FW_RI_COMPLETION_FLAG;
+ swsqe = &qhp->wq.sq.sw_sq[qhp->wq.sq.pidx];
+ switch (wr->opcode) {
+ case IB_WR_SEND_WITH_INV:
+ case IB_WR_SEND:
+ if (wr->send_flags & IB_SEND_FENCE)
+ fw_flags |= FW_RI_READ_FENCE_FLAG;
+ fw_opcode = FW_RI_SEND_WR;
+ if (wr->opcode == IB_WR_SEND)
+ swsqe->opcode = FW_RI_SEND;
+ else
+ swsqe->opcode = FW_RI_SEND_WITH_INV;
+ err = build_rdma_send(&qhp->wq.sq, wqe, wr, &len16);
+ break;
+ case IB_WR_RDMA_WRITE:
+ fw_opcode = FW_RI_RDMA_WRITE_WR;
+ swsqe->opcode = FW_RI_RDMA_WRITE;
+ err = build_rdma_write(&qhp->wq.sq, wqe, wr, &len16);
+ break;
+ case IB_WR_RDMA_READ:
+ case IB_WR_RDMA_READ_WITH_INV:
+ fw_opcode = FW_RI_RDMA_READ_WR;
+ swsqe->opcode = FW_RI_READ_REQ;
+ if (wr->opcode == IB_WR_RDMA_READ_WITH_INV)
+ fw_flags = FW_RI_RDMA_READ_INVALIDATE;
+ else
+ fw_flags = 0;
+ err = build_rdma_read(wqe, wr, &len16);
+ if (err)
+ break;
+ swsqe->read_len = wr->sg_list[0].length;
+ if (!qhp->wq.sq.oldest_read)
+ qhp->wq.sq.oldest_read = swsqe;
+ break;
+ case IB_WR_FAST_REG_MR:
+ fw_opcode = FW_RI_FR_NSMR_WR;
+ swsqe->opcode = FW_RI_FAST_REGISTER;
+ err = build_fastreg(&qhp->wq.sq, wqe, wr, &len16,
+ is_t5(
+ qhp->rhp->rdev.lldi.adapter_type) ?
+ 1 : 0);
+ break;
+ case IB_WR_LOCAL_INV:
+ if (wr->send_flags & IB_SEND_FENCE)
+ fw_flags |= FW_RI_LOCAL_FENCE_FLAG;
+ fw_opcode = FW_RI_INV_LSTAG_WR;
+ swsqe->opcode = FW_RI_LOCAL_INV;
+ err = build_inv_stag(wqe, wr, &len16);
+ break;
+ default:
+ PDBG("%s post of type=%d TBD!\n", __func__,
+ wr->opcode);
+ err = -EINVAL;
+ }
+ if (err) {
+ *bad_wr = wr;
+ break;
+ }
+ swsqe->idx = qhp->wq.sq.pidx;
+ swsqe->complete = 0;
+ swsqe->signaled = (wr->send_flags & IB_SEND_SIGNALED) ||
+ qhp->sq_sig_all;
+ swsqe->flushed = 0;
+ swsqe->wr_id = wr->wr_id;
+ if (c4iw_wr_log) {
+ swsqe->sge_ts = cxgb4_read_sge_timestamp(
+ qhp->rhp->rdev.lldi.ports[0]);
+ getnstimeofday(&swsqe->host_ts);
+ }
+
+ init_wr_hdr(wqe, qhp->wq.sq.pidx, fw_opcode, fw_flags, len16);
+
+ PDBG("%s cookie 0x%llx pidx 0x%x opcode 0x%x read_len %u\n",
+ __func__, (unsigned long long)wr->wr_id, qhp->wq.sq.pidx,
+ swsqe->opcode, swsqe->read_len);
+ wr = wr->next;
+ num_wrs--;
+ t4_sq_produce(&qhp->wq, len16);
+ idx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE);
+ }
+ if (!qhp->rhp->rdev.status_page->db_off) {
+ t4_ring_sq_db(&qhp->wq, idx,
+ is_t5(qhp->rhp->rdev.lldi.adapter_type), wqe);
+ spin_unlock_irqrestore(&qhp->lock, flag);
+ } else {
+ spin_unlock_irqrestore(&qhp->lock, flag);
+ ring_kernel_sq_db(qhp, idx);
+ }
+ return err;
+}
+
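+/*
+ * Post a chain of receive work requests, building one RECV WQE per WR
+ * and ringing (or deferring) the RQ doorbell once for the whole chain.
+ */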
+int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ int err = 0;
+ struct c4iw_qp *qhp;
+ union t4_recv_wr *wqe = NULL;
+ u32 num_wrs;
+ u8 len16 = 0;
+ unsigned long flag;
+ u16 idx = 0;
+
+ qhp = to_c4iw_qp(ibqp);
+ spin_lock_irqsave(&qhp->lock, flag);
+ if (t4_wq_in_error(&qhp->wq)) {
+ spin_unlock_irqrestore(&qhp->lock, flag);
+ return -EINVAL;
+ }
+ num_wrs = t4_rq_avail(&qhp->wq);
+ if (num_wrs == 0) {
+ spin_unlock_irqrestore(&qhp->lock, flag);
+ return -ENOMEM;
+ }
+ while (wr) {
+ if (wr->num_sge > T4_MAX_RECV_SGE) {
+ err = -EINVAL;
+ *bad_wr = wr;
+ break;
+ }
+ wqe = (union t4_recv_wr *)((u8 *)qhp->wq.rq.queue +
+ qhp->wq.rq.wq_pidx *
+ T4_EQ_ENTRY_SIZE);
+ if (num_wrs)
+ err = build_rdma_recv(qhp, wqe, wr, &len16);
+ else
+ err = -ENOMEM;
+ if (err) {
+ *bad_wr = wr;
+ break;
+ }
+
+ qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].wr_id = wr->wr_id;
+ if (c4iw_wr_log) {
+ qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].sge_ts =
+ cxgb4_read_sge_timestamp(
+ qhp->rhp->rdev.lldi.ports[0]);
+ getnstimeofday(
+ &qhp->wq.rq.sw_rq[qhp->wq.rq.pidx].host_ts);
+ }
+
+ wqe->recv.opcode = FW_RI_RECV_WR;
+ wqe->recv.r1 = 0;
+ wqe->recv.wrid = qhp->wq.rq.pidx;
+ wqe->recv.r2[0] = 0;
+ wqe->recv.r2[1] = 0;
+ wqe->recv.r2[2] = 0;
+ wqe->recv.len16 = len16;
+ PDBG("%s cookie 0x%llx pidx %u\n", __func__,
+ (unsigned long long) wr->wr_id, qhp->wq.rq.pidx);
+ t4_rq_produce(&qhp->wq, len16);
+ idx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE);
+ wr = wr->next;
+ num_wrs--;
+ }
+ if (!qhp->rhp->rdev.status_page->db_off) {
+ t4_ring_rq_db(&qhp->wq, idx,
+ is_t5(qhp->rhp->rdev.lldi.adapter_type), wqe);
+ spin_unlock_irqrestore(&qhp->lock, flag);
+ } else {
+ spin_unlock_irqrestore(&qhp->lock, flag);
+ ring_kernel_rq_db(qhp, idx);
+ }
+ return err;
+}
+
+int c4iw_bind_mw(struct ib_qp *qp, struct ib_mw *mw, struct ib_mw_bind *mw_bind)
+{
+ return -ENOSYS;
+}
+
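+/*
+ * Translate the status in a T4 error CQE into the iWARP TERMINATE
+ * layer/etype and error code to send to the peer.
+ */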
+static inline void build_term_codes(struct t4_cqe *err_cqe, u8 *layer_type,
+ u8 *ecode)
+{
+ int status;
+ int tagged;
+ int opcode;
+ int rqtype;
+ int send_inv;
+
+ if (!err_cqe) {
+ *layer_type = LAYER_RDMAP|DDP_LOCAL_CATA;
+ *ecode = 0;
+ return;
+ }
+
+ status = CQE_STATUS(err_cqe);
+ opcode = CQE_OPCODE(err_cqe);
+ rqtype = RQ_TYPE(err_cqe);
+ send_inv = (opcode == FW_RI_SEND_WITH_INV) ||
+ (opcode == FW_RI_SEND_WITH_SE_INV);
+ tagged = (opcode == FW_RI_RDMA_WRITE) ||
+ (rqtype && (opcode == FW_RI_READ_RESP));
+
+ switch (status) {
+ case T4_ERR_STAG:
+ if (send_inv) {
+ *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP;
+ *ecode = RDMAP_CANT_INV_STAG;
+ } else {
+ *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+ *ecode = RDMAP_INV_STAG;
+ }
+ break;
+ case T4_ERR_PDID:
+ *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+ if ((opcode == FW_RI_SEND_WITH_INV) ||
+ (opcode == FW_RI_SEND_WITH_SE_INV))
+ *ecode = RDMAP_CANT_INV_STAG;
+ else
+ *ecode = RDMAP_STAG_NOT_ASSOC;
+ break;
+ case T4_ERR_QPID:
+ *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+ *ecode = RDMAP_STAG_NOT_ASSOC;
+ break;
+ case T4_ERR_ACCESS:
+ *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+ *ecode = RDMAP_ACC_VIOL;
+ break;
+ case T4_ERR_WRAP:
+ *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+ *ecode = RDMAP_TO_WRAP;
+ break;
+ case T4_ERR_BOUND:
+ if (tagged) {
+ *layer_type = LAYER_DDP|DDP_TAGGED_ERR;
+ *ecode = DDPT_BASE_BOUNDS;
+ } else {
+ *layer_type = LAYER_RDMAP|RDMAP_REMOTE_PROT;
+ *ecode = RDMAP_BASE_BOUNDS;
+ }
+ break;
+ case T4_ERR_INVALIDATE_SHARED_MR:
+ case T4_ERR_INVALIDATE_MR_WITH_MW_BOUND:
+ *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP;
+ *ecode = RDMAP_CANT_INV_STAG;
+ break;
+ case T4_ERR_ECC:
+ case T4_ERR_ECC_PSTAG:
+ case T4_ERR_INTERNAL_ERR:
+ *layer_type = LAYER_RDMAP|RDMAP_LOCAL_CATA;
+ *ecode = 0;
+ break;
+ case T4_ERR_OUT_OF_RQE:
+ *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+ *ecode = DDPU_INV_MSN_NOBUF;
+ break;
+ case T4_ERR_PBL_ADDR_BOUND:
+ *layer_type = LAYER_DDP|DDP_TAGGED_ERR;
+ *ecode = DDPT_BASE_BOUNDS;
+ break;
+ case T4_ERR_CRC:
+ *layer_type = LAYER_MPA|DDP_LLP;
+ *ecode = MPA_CRC_ERR;
+ break;
+ case T4_ERR_MARKER:
+ *layer_type = LAYER_MPA|DDP_LLP;
+ *ecode = MPA_MARKER_ERR;
+ break;
+ case T4_ERR_PDU_LEN_ERR:
+ *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+ *ecode = DDPU_MSG_TOOBIG;
+ break;
+ case T4_ERR_DDP_VERSION:
+ if (tagged) {
+ *layer_type = LAYER_DDP|DDP_TAGGED_ERR;
+ *ecode = DDPT_INV_VERS;
+ } else {
+ *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+ *ecode = DDPU_INV_VERS;
+ }
+ break;
+ case T4_ERR_RDMA_VERSION:
+ *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP;
+ *ecode = RDMAP_INV_VERS;
+ break;
+ case T4_ERR_OPCODE:
+ *layer_type = LAYER_RDMAP|RDMAP_REMOTE_OP;
+ *ecode = RDMAP_INV_OPCODE;
+ break;
+ case T4_ERR_DDP_QUEUE_NUM:
+ *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+ *ecode = DDPU_INV_QN;
+ break;
+ case T4_ERR_MSN:
+ case T4_ERR_MSN_GAP:
+ case T4_ERR_MSN_RANGE:
+ case T4_ERR_IRD_OVERFLOW:
+ *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+ *ecode = DDPU_INV_MSN_RANGE;
+ break;
+ case T4_ERR_TBIT:
+ *layer_type = LAYER_DDP|DDP_LOCAL_CATA;
+ *ecode = 0;
+ break;
+ case T4_ERR_MO:
+ *layer_type = LAYER_DDP|DDP_UNTAGGED_ERR;
+ *ecode = DDPU_INV_MO;
+ break;
+ default:
+ *layer_type = LAYER_RDMAP|DDP_LOCAL_CATA;
+ *ecode = 0;
+ break;
+ }
+}
+
+static void post_terminate(struct c4iw_qp *qhp, struct t4_cqe *err_cqe,
+ gfp_t gfp)
+{
+ struct fw_ri_wr *wqe;
+ struct sk_buff *skb;
+ struct terminate_message *term;
+
+ PDBG("%s qhp %p qid 0x%x tid %u\n", __func__, qhp, qhp->wq.sq.qid,
+ qhp->ep->hwtid);
+
+ skb = alloc_skb(sizeof *wqe, gfp);
+ if (!skb)
+ return;
+ set_wr_txq(skb, CPL_PRIORITY_DATA, qhp->ep->txq_idx);
+
+ wqe = (struct fw_ri_wr *)__skb_put(skb, sizeof(*wqe));
+ memset(wqe, 0, sizeof *wqe);
+ wqe->op_compl = cpu_to_be32(FW_WR_OP_V(FW_RI_INIT_WR));
+ wqe->flowid_len16 = cpu_to_be32(
+ FW_WR_FLOWID_V(qhp->ep->hwtid) |
+ FW_WR_LEN16_V(DIV_ROUND_UP(sizeof(*wqe), 16)));
+
+ wqe->u.terminate.type = FW_RI_TYPE_TERMINATE;
+ wqe->u.terminate.immdlen = cpu_to_be32(sizeof *term);
+ term = (struct terminate_message *)wqe->u.terminate.termmsg;
+ if (qhp->attr.layer_etype == (LAYER_MPA|DDP_LLP)) {
+ term->layer_etype = qhp->attr.layer_etype;
+ term->ecode = qhp->attr.ecode;
+ } else
+ build_term_codes(err_cqe, &term->layer_etype, &term->ecode);
+ c4iw_ofld_send(&qhp->rhp->rdev, skb);
+}
+
+/*
+ * Assumes qhp lock is held.
+ */
+static void __flush_qp(struct c4iw_qp *qhp, struct c4iw_cq *rchp,
+ struct c4iw_cq *schp)
+{
+ int count;
+ int rq_flushed, sq_flushed;
+ unsigned long flag;
+
+ PDBG("%s qhp %p rchp %p schp %p\n", __func__, qhp, rchp, schp);
+
+ /* locking hierarchy: cq lock first, then qp lock. */
+ spin_lock_irqsave(&rchp->lock, flag);
+ spin_lock(&qhp->lock);
+
+ if (qhp->wq.flushed) {
+ spin_unlock(&qhp->lock);
+ spin_unlock_irqrestore(&rchp->lock, flag);
+ return;
+ }
+ qhp->wq.flushed = 1;
+
+ c4iw_flush_hw_cq(rchp);
+ c4iw_count_rcqes(&rchp->cq, &qhp->wq, &count);
+ rq_flushed = c4iw_flush_rq(&qhp->wq, &rchp->cq, count);
+ spin_unlock(&qhp->lock);
+ spin_unlock_irqrestore(&rchp->lock, flag);
+
+ /* locking hierarchy: cq lock first, then qp lock. */
+ spin_lock_irqsave(&schp->lock, flag);
+ spin_lock(&qhp->lock);
+ if (schp != rchp)
+ c4iw_flush_hw_cq(schp);
+ sq_flushed = c4iw_flush_sq(qhp);
+ spin_unlock(&qhp->lock);
+ spin_unlock_irqrestore(&schp->lock, flag);
+
+ if (schp == rchp) {
+ if (t4_clear_cq_armed(&rchp->cq) &&
+ (rq_flushed || sq_flushed)) {
+ spin_lock_irqsave(&rchp->comp_handler_lock, flag);
+ (*rchp->ibcq.comp_handler)(&rchp->ibcq,
+ rchp->ibcq.cq_context);
+ spin_unlock_irqrestore(&rchp->comp_handler_lock, flag);
+ }
+ } else {
+ if (t4_clear_cq_armed(&rchp->cq) && rq_flushed) {
+ spin_lock_irqsave(&rchp->comp_handler_lock, flag);
+ (*rchp->ibcq.comp_handler)(&rchp->ibcq,
+ rchp->ibcq.cq_context);
+ spin_unlock_irqrestore(&rchp->comp_handler_lock, flag);
+ }
+ if (t4_clear_cq_armed(&schp->cq) && sq_flushed) {
+ spin_lock_irqsave(&schp->comp_handler_lock, flag);
+ (*schp->ibcq.comp_handler)(&schp->ibcq,
+ schp->ibcq.cq_context);
+ spin_unlock_irqrestore(&schp->comp_handler_lock, flag);
+ }
+ }
+}
+
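+/*
+ * Move the QP into the flushed state.  For user QPs, mark the WQ and CQs
+ * in error and notify the completion handlers; for kernel QPs, flush the
+ * hardware CQs and complete any outstanding work requests as flushed.
+ */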
+static void flush_qp(struct c4iw_qp *qhp)
+{
+ struct c4iw_cq *rchp, *schp;
+ unsigned long flag;
+
+ rchp = to_c4iw_cq(qhp->ibqp.recv_cq);
+ schp = to_c4iw_cq(qhp->ibqp.send_cq);
+
+ t4_set_wq_in_error(&qhp->wq);
+ if (qhp->ibqp.uobject) {
+ t4_set_cq_in_error(&rchp->cq);
+ spin_lock_irqsave(&rchp->comp_handler_lock, flag);
+ (*rchp->ibcq.comp_handler)(&rchp->ibcq, rchp->ibcq.cq_context);
+ spin_unlock_irqrestore(&rchp->comp_handler_lock, flag);
+ if (schp != rchp) {
+ t4_set_cq_in_error(&schp->cq);
+ spin_lock_irqsave(&schp->comp_handler_lock, flag);
+ (*schp->ibcq.comp_handler)(&schp->ibcq,
+ schp->ibcq.cq_context);
+ spin_unlock_irqrestore(&schp->comp_handler_lock, flag);
+ }
+ return;
+ }
+ __flush_qp(qhp, rchp, schp);
+}
+
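+/*
+ * Post an FW_RI_INIT_WR of type FINI to take the connection out of RDMA
+ * mode, and wait for the firmware to complete it.
+ */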
+static int rdma_fini(struct c4iw_dev *rhp, struct c4iw_qp *qhp,
+ struct c4iw_ep *ep)
+{
+ struct fw_ri_wr *wqe;
+ int ret;
+ struct sk_buff *skb;
+
+ PDBG("%s qhp %p qid 0x%x tid %u\n", __func__, qhp, qhp->wq.sq.qid,
+ ep->hwtid);
+
+ skb = alloc_skb(sizeof *wqe, GFP_KERNEL);
+ if (!skb)
+ return -ENOMEM;
+ set_wr_txq(skb, CPL_PRIORITY_DATA, ep->txq_idx);
+
+ wqe = (struct fw_ri_wr *)__skb_put(skb, sizeof(*wqe));
+ memset(wqe, 0, sizeof *wqe);
+ wqe->op_compl = cpu_to_be32(
+ FW_WR_OP_V(FW_RI_INIT_WR) |
+ FW_WR_COMPL_F);
+ wqe->flowid_len16 = cpu_to_be32(
+ FW_WR_FLOWID_V(ep->hwtid) |
+ FW_WR_LEN16_V(DIV_ROUND_UP(sizeof(*wqe), 16)));
+ wqe->cookie = (uintptr_t)&ep->com.wr_wait;
+
+ wqe->u.fini.type = FW_RI_TYPE_FINI;
+ ret = c4iw_ofld_send(&rhp->rdev, skb);
+ if (ret)
+ goto out;
+
+ ret = c4iw_wait_for_reply(&rhp->rdev, &ep->com.wr_wait, qhp->ep->hwtid,
+ qhp->wq.sq.qid, __func__);
+out:
+ PDBG("%s ret %d\n", __func__, ret);
+ return ret;
+}
+
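+/*
+ * Build the dummy zero-length RDMA WRITE or RDMA READ work request used
+ * as the MPA peer-to-peer RTR message, depending on the negotiated p2p
+ * type.
+ */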
+static void build_rtr_msg(u8 p2p_type, struct fw_ri_init *init)
+{
+ PDBG("%s p2p_type = %d\n", __func__, p2p_type);
+ memset(&init->u, 0, sizeof init->u);
+ switch (p2p_type) {
+ case FW_RI_INIT_P2PTYPE_RDMA_WRITE:
+ init->u.write.opcode = FW_RI_RDMA_WRITE_WR;
+ init->u.write.stag_sink = cpu_to_be32(1);
+ init->u.write.to_sink = cpu_to_be64(1);
+ init->u.write.u.immd_src[0].op = FW_RI_DATA_IMMD;
+ init->u.write.len16 = DIV_ROUND_UP(sizeof init->u.write +
+ sizeof(struct fw_ri_immd),
+ 16);
+ break;
+ case FW_RI_INIT_P2PTYPE_READ_REQ:
+ init->u.write.opcode = FW_RI_RDMA_READ_WR;
+ init->u.read.stag_src = cpu_to_be32(1);
+ init->u.read.to_src_lo = cpu_to_be32(1);
+ init->u.read.stag_sink = cpu_to_be32(1);
+ init->u.read.to_sink_lo = cpu_to_be32(1);
+ init->u.read.len16 = DIV_ROUND_UP(sizeof init->u.read, 16);
+ break;
+ }
+}
+
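+/*
+ * Reserve IRD resources and post an FW_RI_INIT_WR of type INIT carrying
+ * the negotiated MPA/RDMA attributes to move the QP into RDMA mode, then
+ * wait for the firmware reply.
+ */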
+static int rdma_init(struct c4iw_dev *rhp, struct c4iw_qp *qhp)
+{
+ struct fw_ri_wr *wqe;
+ int ret;
+ struct sk_buff *skb;
+
+ PDBG("%s qhp %p qid 0x%x tid %u ird %u ord %u\n", __func__, qhp,
+ qhp->wq.sq.qid, qhp->ep->hwtid, qhp->ep->ird, qhp->ep->ord);
+
+ skb = alloc_skb(sizeof *wqe, GFP_KERNEL);
+ if (!skb) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ ret = alloc_ird(rhp, qhp->attr.max_ird);
+ if (ret) {
+ qhp->attr.max_ird = 0;
+ kfree_skb(skb);
+ goto out;
+ }
+ set_wr_txq(skb, CPL_PRIORITY_DATA, qhp->ep->txq_idx);
+
+ wqe = (struct fw_ri_wr *)__skb_put(skb, sizeof(*wqe));
+ memset(wqe, 0, sizeof *wqe);
+ wqe->op_compl = cpu_to_be32(
+ FW_WR_OP_V(FW_RI_INIT_WR) |
+ FW_WR_COMPL_F);
+ wqe->flowid_len16 = cpu_to_be32(
+ FW_WR_FLOWID_V(qhp->ep->hwtid) |
+ FW_WR_LEN16_V(DIV_ROUND_UP(sizeof(*wqe), 16)));
+
+ wqe->cookie = (uintptr_t)&qhp->ep->com.wr_wait;
+
+ wqe->u.init.type = FW_RI_TYPE_INIT;
+ wqe->u.init.mpareqbit_p2ptype =
+ FW_RI_WR_MPAREQBIT_V(qhp->attr.mpa_attr.initiator) |
+ FW_RI_WR_P2PTYPE_V(qhp->attr.mpa_attr.p2p_type);
+ wqe->u.init.mpa_attrs = FW_RI_MPA_IETF_ENABLE;
+ if (qhp->attr.mpa_attr.recv_marker_enabled)
+ wqe->u.init.mpa_attrs |= FW_RI_MPA_RX_MARKER_ENABLE;
+ if (qhp->attr.mpa_attr.xmit_marker_enabled)
+ wqe->u.init.mpa_attrs |= FW_RI_MPA_TX_MARKER_ENABLE;
+ if (qhp->attr.mpa_attr.crc_enabled)
+ wqe->u.init.mpa_attrs |= FW_RI_MPA_CRC_ENABLE;
+
+ wqe->u.init.qp_caps = FW_RI_QP_RDMA_READ_ENABLE |
+ FW_RI_QP_RDMA_WRITE_ENABLE |
+ FW_RI_QP_BIND_ENABLE;
+ if (!qhp->ibqp.uobject)
+ wqe->u.init.qp_caps |= FW_RI_QP_FAST_REGISTER_ENABLE |
+ FW_RI_QP_STAG0_ENABLE;
+ wqe->u.init.nrqe = cpu_to_be16(t4_rqes_posted(&qhp->wq));
+ wqe->u.init.pdid = cpu_to_be32(qhp->attr.pd);
+ wqe->u.init.qpid = cpu_to_be32(qhp->wq.sq.qid);
+ wqe->u.init.sq_eqid = cpu_to_be32(qhp->wq.sq.qid);
+ wqe->u.init.rq_eqid = cpu_to_be32(qhp->wq.rq.qid);
+ wqe->u.init.scqid = cpu_to_be32(qhp->attr.scq);
+ wqe->u.init.rcqid = cpu_to_be32(qhp->attr.rcq);
+ wqe->u.init.ord_max = cpu_to_be32(qhp->attr.max_ord);
+ wqe->u.init.ird_max = cpu_to_be32(qhp->attr.max_ird);
+ wqe->u.init.iss = cpu_to_be32(qhp->ep->snd_seq);
+ wqe->u.init.irs = cpu_to_be32(qhp->ep->rcv_seq);
+ wqe->u.init.hwrqsize = cpu_to_be32(qhp->wq.rq.rqt_size);
+ wqe->u.init.hwrqaddr = cpu_to_be32(qhp->wq.rq.rqt_hwaddr -
+ rhp->rdev.lldi.vr->rq.start);
+ if (qhp->attr.mpa_attr.initiator)
+ build_rtr_msg(qhp->attr.mpa_attr.p2p_type, &wqe->u.init);
+
+ ret = c4iw_ofld_send(&rhp->rdev, skb);
+ if (ret)
+ goto err1;
+
+ ret = c4iw_wait_for_reply(&rhp->rdev, &qhp->ep->com.wr_wait,
+ qhp->ep->hwtid, qhp->wq.sq.qid, __func__);
+ if (!ret)
+ goto out;
+err1:
+ free_ird(rhp, qhp->attr.max_ird);
+out:
+ PDBG("%s ret %d\n", __func__, ret);
+ return ret;
+}
+
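+/*
+ * Core QP state machine.  Applies attribute changes and drives the
+ * IDLE/RTS/CLOSING/TERMINATE/ERROR transitions, issuing RI INIT/FINI
+ * work requests, flushing the QP, and initiating connection teardown as
+ * required.
+ */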
+int c4iw_modify_qp(struct c4iw_dev *rhp, struct c4iw_qp *qhp,
+ enum c4iw_qp_attr_mask mask,
+ struct c4iw_qp_attributes *attrs,
+ int internal)
+{
+ int ret = 0;
+ struct c4iw_qp_attributes newattr = qhp->attr;
+ int disconnect = 0;
+ int terminate = 0;
+ int abort = 0;
+ int free = 0;
+ struct c4iw_ep *ep = NULL;
+
+ PDBG("%s qhp %p sqid 0x%x rqid 0x%x ep %p state %d -> %d\n", __func__,
+ qhp, qhp->wq.sq.qid, qhp->wq.rq.qid, qhp->ep, qhp->attr.state,
+ (mask & C4IW_QP_ATTR_NEXT_STATE) ? attrs->next_state : -1);
+
+ mutex_lock(&qhp->mutex);
+
+ /* Process attr changes if in IDLE */
+ if (mask & C4IW_QP_ATTR_VALID_MODIFY) {
+ if (qhp->attr.state != C4IW_QP_STATE_IDLE) {
+ ret = -EIO;
+ goto out;
+ }
+ if (mask & C4IW_QP_ATTR_ENABLE_RDMA_READ)
+ newattr.enable_rdma_read = attrs->enable_rdma_read;
+ if (mask & C4IW_QP_ATTR_ENABLE_RDMA_WRITE)
+ newattr.enable_rdma_write = attrs->enable_rdma_write;
+ if (mask & C4IW_QP_ATTR_ENABLE_RDMA_BIND)
+ newattr.enable_bind = attrs->enable_bind;
+ if (mask & C4IW_QP_ATTR_MAX_ORD) {
+ if (attrs->max_ord > c4iw_max_read_depth) {
+ ret = -EINVAL;
+ goto out;
+ }
+ newattr.max_ord = attrs->max_ord;
+ }
+ if (mask & C4IW_QP_ATTR_MAX_IRD) {
+ if (attrs->max_ird > cur_max_read_depth(rhp)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ newattr.max_ird = attrs->max_ird;
+ }
+ qhp->attr = newattr;
+ }
+
+ if (mask & C4IW_QP_ATTR_SQ_DB) {
+ ret = ring_kernel_sq_db(qhp, attrs->sq_db_inc);
+ goto out;
+ }
+ if (mask & C4IW_QP_ATTR_RQ_DB) {
+ ret = ring_kernel_rq_db(qhp, attrs->rq_db_inc);
+ goto out;
+ }
+
+ if (!(mask & C4IW_QP_ATTR_NEXT_STATE))
+ goto out;
+ if (qhp->attr.state == attrs->next_state)
+ goto out;
+
+ switch (qhp->attr.state) {
+ case C4IW_QP_STATE_IDLE:
+ switch (attrs->next_state) {
+ case C4IW_QP_STATE_RTS:
+ if (!(mask & C4IW_QP_ATTR_LLP_STREAM_HANDLE)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!(mask & C4IW_QP_ATTR_MPA_ATTR)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ qhp->attr.mpa_attr = attrs->mpa_attr;
+ qhp->attr.llp_stream_handle = attrs->llp_stream_handle;
+ qhp->ep = qhp->attr.llp_stream_handle;
+ set_state(qhp, C4IW_QP_STATE_RTS);
+
+ /*
+ * Ref the endpoint here and deref when we
+ * disassociate the endpoint from the QP. This
+ * happens in CLOSING->IDLE transition or *->ERROR
+ * transition.
+ */
+ c4iw_get_ep(&qhp->ep->com);
+ ret = rdma_init(rhp, qhp);
+ if (ret)
+ goto err;
+ break;
+ case C4IW_QP_STATE_ERROR:
+ set_state(qhp, C4IW_QP_STATE_ERROR);
+ flush_qp(qhp);
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+ break;
+ case C4IW_QP_STATE_RTS:
+ switch (attrs->next_state) {
+ case C4IW_QP_STATE_CLOSING:
+ BUG_ON(atomic_read(&qhp->ep->com.kref.refcount) < 2);
+ t4_set_wq_in_error(&qhp->wq);
+ set_state(qhp, C4IW_QP_STATE_CLOSING);
+ ep = qhp->ep;
+ if (!internal) {
+ abort = 0;
+ disconnect = 1;
+ c4iw_get_ep(&qhp->ep->com);
+ }
+ ret = rdma_fini(rhp, qhp, ep);
+ if (ret)
+ goto err;
+ break;
+ case C4IW_QP_STATE_TERMINATE:
+ t4_set_wq_in_error(&qhp->wq);
+ set_state(qhp, C4IW_QP_STATE_TERMINATE);
+ qhp->attr.layer_etype = attrs->layer_etype;
+ qhp->attr.ecode = attrs->ecode;
+ ep = qhp->ep;
+ if (!internal) {
+ c4iw_get_ep(&qhp->ep->com);
+ terminate = 1;
+ disconnect = 1;
+ } else {
+ terminate = qhp->attr.send_term;
+ ret = rdma_fini(rhp, qhp, ep);
+ if (ret)
+ goto err;
+ }
+ break;
+ case C4IW_QP_STATE_ERROR:
+ t4_set_wq_in_error(&qhp->wq);
+ set_state(qhp, C4IW_QP_STATE_ERROR);
+ if (!internal) {
+ abort = 1;
+ disconnect = 1;
+ ep = qhp->ep;
+ c4iw_get_ep(&qhp->ep->com);
+ }
+ goto err;
+ break;
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+ break;
+ case C4IW_QP_STATE_CLOSING:
+ if (!internal) {
+ ret = -EINVAL;
+ goto out;
+ }
+ switch (attrs->next_state) {
+ case C4IW_QP_STATE_IDLE:
+ flush_qp(qhp);
+ set_state(qhp, C4IW_QP_STATE_IDLE);
+ qhp->attr.llp_stream_handle = NULL;
+ c4iw_put_ep(&qhp->ep->com);
+ qhp->ep = NULL;
+ wake_up(&qhp->wait);
+ break;
+ case C4IW_QP_STATE_ERROR:
+ goto err;
+ default:
+ ret = -EINVAL;
+ goto err;
+ }
+ break;
+ case C4IW_QP_STATE_ERROR:
+ if (attrs->next_state != C4IW_QP_STATE_IDLE) {
+ ret = -EINVAL;
+ goto out;
+ }
+ if (!t4_sq_empty(&qhp->wq) || !t4_rq_empty(&qhp->wq)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ set_state(qhp, C4IW_QP_STATE_IDLE);
+ break;
+ case C4IW_QP_STATE_TERMINATE:
+ if (!internal) {
+ ret = -EINVAL;
+ goto out;
+ }
+ goto err;
+ break;
+ default:
+ printk(KERN_ERR "%s in a bad state %d\n",
+ __func__, qhp->attr.state);
+ ret = -EINVAL;
+ goto err;
+ break;
+ }
+ goto out;
+err:
+ PDBG("%s disassociating ep %p qpid 0x%x\n", __func__, qhp->ep,
+ qhp->wq.sq.qid);
+
+ /* disassociate the LLP connection */
+ qhp->attr.llp_stream_handle = NULL;
+ if (!ep)
+ ep = qhp->ep;
+ qhp->ep = NULL;
+ set_state(qhp, C4IW_QP_STATE_ERROR);
+ free = 1;
+ abort = 1;
+ BUG_ON(!ep);
+ flush_qp(qhp);
+ wake_up(&qhp->wait);
+out:
+ mutex_unlock(&qhp->mutex);
+
+ if (terminate)
+ post_terminate(qhp, NULL, internal ? GFP_ATOMIC : GFP_KERNEL);
+
+ /*
+ * If disconnect is 1, then we need to initiate a disconnect
+ * on the EP. This can be a normal close (RTS->CLOSING) or
+ * an abnormal close (RTS/CLOSING->ERROR).
+ */
+ if (disconnect) {
+ c4iw_ep_disconnect(ep, abort, internal ? GFP_ATOMIC :
+ GFP_KERNEL);
+ c4iw_put_ep(&ep->com);
+ }
+
+ /*
+ * If free is 1, then we've disassociated the EP from the QP
+ * and we need to dereference the EP.
+ */
+ if (free)
+ c4iw_put_ep(&ep->com);
+ PDBG("%s exit state %d\n", __func__, qhp->attr.state);
+ return ret;
+}
+
+int c4iw_destroy_qp(struct ib_qp *ib_qp)
+{
+ struct c4iw_dev *rhp;
+ struct c4iw_qp *qhp;
+ struct c4iw_qp_attributes attrs;
+ struct c4iw_ucontext *ucontext;
+
+ qhp = to_c4iw_qp(ib_qp);
+ rhp = qhp->rhp;
+
+ attrs.next_state = C4IW_QP_STATE_ERROR;
+ if (qhp->attr.state == C4IW_QP_STATE_TERMINATE)
+ c4iw_modify_qp(rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 1);
+ else
+ c4iw_modify_qp(rhp, qhp, C4IW_QP_ATTR_NEXT_STATE, &attrs, 0);
+ wait_event(qhp->wait, !qhp->ep);
+
+ remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid);
+ atomic_dec(&qhp->refcnt);
+ wait_event(qhp->wait, !atomic_read(&qhp->refcnt));
+
+ spin_lock_irq(&rhp->lock);
+ if (!list_empty(&qhp->db_fc_entry))
+ list_del_init(&qhp->db_fc_entry);
+ spin_unlock_irq(&rhp->lock);
+ free_ird(rhp, qhp->attr.max_ird);
+
+ ucontext = ib_qp->uobject ?
+ to_c4iw_ucontext(ib_qp->uobject->context) : NULL;
+ destroy_qp(&rhp->rdev, &qhp->wq,
+ ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
+
+ PDBG("%s ib_qp %p qpid 0x%0x\n", __func__, ib_qp, qhp->wq.sq.qid);
+ kfree(qhp);
+ return 0;
+}
+
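+/*
+ * Allocate and initialize an RC QP: size the SQ/RQ, create the hardware
+ * queues via create_qp(), and, for user QPs, export the queue memory,
+ * doorbell/GTS pages, and optional on-chip MA sync page to the process
+ * through mmap keys returned in the create response.
+ */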
+struct ib_qp *c4iw_create_qp(struct ib_pd *pd, struct ib_qp_init_attr *attrs,
+ struct ib_udata *udata)
+{
+ struct c4iw_dev *rhp;
+ struct c4iw_qp *qhp;
+ struct c4iw_pd *php;
+ struct c4iw_cq *schp;
+ struct c4iw_cq *rchp;
+ struct c4iw_create_qp_resp uresp;
+ unsigned int sqsize, rqsize;
+ struct c4iw_ucontext *ucontext;
+ int ret;
+ struct c4iw_mm_entry *mm1, *mm2, *mm3, *mm4, *mm5 = NULL;
+
+ PDBG("%s ib_pd %p\n", __func__, pd);
+
+ if (attrs->qp_type != IB_QPT_RC)
+ return ERR_PTR(-EINVAL);
+
+ php = to_c4iw_pd(pd);
+ rhp = php->rhp;
+ schp = get_chp(rhp, ((struct c4iw_cq *)attrs->send_cq)->cq.cqid);
+ rchp = get_chp(rhp, ((struct c4iw_cq *)attrs->recv_cq)->cq.cqid);
+ if (!schp || !rchp)
+ return ERR_PTR(-EINVAL);
+
+ if (attrs->cap.max_inline_data > T4_MAX_SEND_INLINE)
+ return ERR_PTR(-EINVAL);
+
+ if (attrs->cap.max_recv_wr > rhp->rdev.hw_queue.t4_max_rq_size)
+ return ERR_PTR(-E2BIG);
+ rqsize = attrs->cap.max_recv_wr + 1;
+ if (rqsize < 8)
+ rqsize = 8;
+
+ if (attrs->cap.max_send_wr > rhp->rdev.hw_queue.t4_max_sq_size)
+ return ERR_PTR(-E2BIG);
+ sqsize = attrs->cap.max_send_wr + 1;
+ if (sqsize < 8)
+ sqsize = 8;
+
+ ucontext = pd->uobject ? to_c4iw_ucontext(pd->uobject->context) : NULL;
+
+ qhp = kzalloc(sizeof(*qhp), GFP_KERNEL);
+ if (!qhp)
+ return ERR_PTR(-ENOMEM);
+ qhp->wq.sq.size = sqsize;
+ qhp->wq.sq.memsize =
+ (sqsize + rhp->rdev.hw_queue.t4_eq_status_entries) *
+ sizeof(*qhp->wq.sq.queue) + 16 * sizeof(__be64);
+ qhp->wq.sq.flush_cidx = -1;
+ qhp->wq.rq.size = rqsize;
+ qhp->wq.rq.memsize =
+ (rqsize + rhp->rdev.hw_queue.t4_eq_status_entries) *
+ sizeof(*qhp->wq.rq.queue);
+
+ if (ucontext) {
+ qhp->wq.sq.memsize = roundup(qhp->wq.sq.memsize, PAGE_SIZE);
+ qhp->wq.rq.memsize = roundup(qhp->wq.rq.memsize, PAGE_SIZE);
+ }
+
+ ret = create_qp(&rhp->rdev, &qhp->wq, &schp->cq, &rchp->cq,
+ ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
+ if (ret)
+ goto err1;
+
+ attrs->cap.max_recv_wr = rqsize - 1;
+ attrs->cap.max_send_wr = sqsize - 1;
+ attrs->cap.max_inline_data = T4_MAX_SEND_INLINE;
+
+ qhp->rhp = rhp;
+ qhp->attr.pd = php->pdid;
+ qhp->attr.scq = ((struct c4iw_cq *) attrs->send_cq)->cq.cqid;
+ qhp->attr.rcq = ((struct c4iw_cq *) attrs->recv_cq)->cq.cqid;
+ qhp->attr.sq_num_entries = attrs->cap.max_send_wr;
+ qhp->attr.rq_num_entries = attrs->cap.max_recv_wr;
+ qhp->attr.sq_max_sges = attrs->cap.max_send_sge;
+ qhp->attr.sq_max_sges_rdma_write = attrs->cap.max_send_sge;
+ qhp->attr.rq_max_sges = attrs->cap.max_recv_sge;
+ qhp->attr.state = C4IW_QP_STATE_IDLE;
+ qhp->attr.next_state = C4IW_QP_STATE_IDLE;
+ qhp->attr.enable_rdma_read = 1;
+ qhp->attr.enable_rdma_write = 1;
+ qhp->attr.enable_bind = 1;
+ qhp->attr.max_ord = 0;
+ qhp->attr.max_ird = 0;
+ qhp->sq_sig_all = attrs->sq_sig_type == IB_SIGNAL_ALL_WR;
+ spin_lock_init(&qhp->lock);
+ mutex_init(&qhp->mutex);
+ init_waitqueue_head(&qhp->wait);
+ atomic_set(&qhp->refcnt, 1);
+
+ ret = insert_handle(rhp, &rhp->qpidr, qhp, qhp->wq.sq.qid);
+ if (ret)
+ goto err2;
+
+ if (udata) {
+ mm1 = kmalloc(sizeof *mm1, GFP_KERNEL);
+ if (!mm1) {
+ ret = -ENOMEM;
+ goto err3;
+ }
+ mm2 = kmalloc(sizeof *mm2, GFP_KERNEL);
+ if (!mm2) {
+ ret = -ENOMEM;
+ goto err4;
+ }
+ mm3 = kmalloc(sizeof *mm3, GFP_KERNEL);
+ if (!mm3) {
+ ret = -ENOMEM;
+ goto err5;
+ }
+ mm4 = kmalloc(sizeof *mm4, GFP_KERNEL);
+ if (!mm4) {
+ ret = -ENOMEM;
+ goto err6;
+ }
+ if (t4_sq_onchip(&qhp->wq.sq)) {
+ mm5 = kmalloc(sizeof *mm5, GFP_KERNEL);
+ if (!mm5) {
+ ret = -ENOMEM;
+ goto err7;
+ }
+ uresp.flags = C4IW_QPF_ONCHIP;
+ } else
+ uresp.flags = 0;
+ uresp.qid_mask = rhp->rdev.qpmask;
+ uresp.sqid = qhp->wq.sq.qid;
+ uresp.sq_size = qhp->wq.sq.size;
+ uresp.sq_memsize = qhp->wq.sq.memsize;
+ uresp.rqid = qhp->wq.rq.qid;
+ uresp.rq_size = qhp->wq.rq.size;
+ uresp.rq_memsize = qhp->wq.rq.memsize;
+ spin_lock(&ucontext->mmap_lock);
+ if (mm5) {
+ uresp.ma_sync_key = ucontext->key;
+ ucontext->key += PAGE_SIZE;
+ } else {
+ uresp.ma_sync_key = 0;
+ }
+ uresp.sq_key = ucontext->key;
+ ucontext->key += PAGE_SIZE;
+ uresp.rq_key = ucontext->key;
+ ucontext->key += PAGE_SIZE;
+ uresp.sq_db_gts_key = ucontext->key;
+ ucontext->key += PAGE_SIZE;
+ uresp.rq_db_gts_key = ucontext->key;
+ ucontext->key += PAGE_SIZE;
+ spin_unlock(&ucontext->mmap_lock);
+ ret = ib_copy_to_udata(udata, &uresp, sizeof uresp);
+ if (ret)
+ goto err8;
+ mm1->key = uresp.sq_key;
+ mm1->addr = qhp->wq.sq.phys_addr;
+ mm1->len = PAGE_ALIGN(qhp->wq.sq.memsize);
+ insert_mmap(ucontext, mm1);
+ mm2->key = uresp.rq_key;
+ mm2->addr = virt_to_phys(qhp->wq.rq.queue);
+ mm2->len = PAGE_ALIGN(qhp->wq.rq.memsize);
+ insert_mmap(ucontext, mm2);
+ mm3->key = uresp.sq_db_gts_key;
+ mm3->addr = (__force unsigned long)qhp->wq.sq.udb;
+ mm3->len = PAGE_SIZE;
+ insert_mmap(ucontext, mm3);
+ mm4->key = uresp.rq_db_gts_key;
+ mm4->addr = (__force unsigned long)qhp->wq.rq.udb;
+ mm4->len = PAGE_SIZE;
+ insert_mmap(ucontext, mm4);
+ if (mm5) {
+ mm5->key = uresp.ma_sync_key;
+ mm5->addr = (pci_resource_start(rhp->rdev.lldi.pdev, 0)
+ + PCIE_MA_SYNC_A) & PAGE_MASK;
+ mm5->len = PAGE_SIZE;
+ insert_mmap(ucontext, mm5);
+ }
+ }
+ qhp->ibqp.qp_num = qhp->wq.sq.qid;
+ init_timer(&(qhp->timer));
+ INIT_LIST_HEAD(&qhp->db_fc_entry);
+ PDBG("%s sq id %u size %u memsize %zu num_entries %u "
+ "rq id %u size %u memsize %zu num_entries %u\n", __func__,
+ qhp->wq.sq.qid, qhp->wq.sq.size, qhp->wq.sq.memsize,
+ attrs->cap.max_send_wr, qhp->wq.rq.qid, qhp->wq.rq.size,
+ qhp->wq.rq.memsize, attrs->cap.max_recv_wr);
+ return &qhp->ibqp;
+err8:
+ kfree(mm5);
+err7:
+ kfree(mm4);
+err6:
+ kfree(mm3);
+err5:
+ kfree(mm2);
+err4:
+ kfree(mm1);
+err3:
+ remove_handle(rhp, &rhp->qpidr, qhp->wq.sq.qid);
+err2:
+ destroy_qp(&rhp->rdev, &qhp->wq,
+ ucontext ? &ucontext->uctx : &rhp->rdev.uctx);
+err1:
+ kfree(qhp);
+ return ERR_PTR(ret);
+}
+
+int c4iw_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ struct c4iw_dev *rhp;
+ struct c4iw_qp *qhp;
+ enum c4iw_qp_attr_mask mask = 0;
+ struct c4iw_qp_attributes attrs;
+
+ PDBG("%s ib_qp %p\n", __func__, ibqp);
+
+ /* iwarp does not support the RTR state */
+ if ((attr_mask & IB_QP_STATE) && (attr->qp_state == IB_QPS_RTR))
+ attr_mask &= ~IB_QP_STATE;
+
+ /* Make sure we still have something left to do */
+ if (!attr_mask)
+ return 0;
+
+ memset(&attrs, 0, sizeof attrs);
+ qhp = to_c4iw_qp(ibqp);
+ rhp = qhp->rhp;
+
+ attrs.next_state = c4iw_convert_state(attr->qp_state);
+ attrs.enable_rdma_read = (attr->qp_access_flags &
+ IB_ACCESS_REMOTE_READ) ? 1 : 0;
+ attrs.enable_rdma_write = (attr->qp_access_flags &
+ IB_ACCESS_REMOTE_WRITE) ? 1 : 0;
+ attrs.enable_bind = (attr->qp_access_flags & IB_ACCESS_MW_BIND) ? 1 : 0;
+
+
+ mask |= (attr_mask & IB_QP_STATE) ? C4IW_QP_ATTR_NEXT_STATE : 0;
+ mask |= (attr_mask & IB_QP_ACCESS_FLAGS) ?
+ (C4IW_QP_ATTR_ENABLE_RDMA_READ |
+ C4IW_QP_ATTR_ENABLE_RDMA_WRITE |
+ C4IW_QP_ATTR_ENABLE_RDMA_BIND) : 0;
+
+ /*
+ * Use SQ_PSN and RQ_PSN to pass in IDX_INC values for
+ * ringing the queue db when we're in DB_FULL mode.
+ * Only allow this on T4 devices.
+ */
+ attrs.sq_db_inc = attr->sq_psn;
+ attrs.rq_db_inc = attr->rq_psn;
+ mask |= (attr_mask & IB_QP_SQ_PSN) ? C4IW_QP_ATTR_SQ_DB : 0;
+ mask |= (attr_mask & IB_QP_RQ_PSN) ? C4IW_QP_ATTR_RQ_DB : 0;
+ if (is_t5(to_c4iw_qp(ibqp)->rhp->rdev.lldi.adapter_type) &&
+ (mask & (C4IW_QP_ATTR_SQ_DB|C4IW_QP_ATTR_RQ_DB)))
+ return -EINVAL;
+
+ return c4iw_modify_qp(rhp, qhp, mask, &attrs, 0);
+}
+
+struct ib_qp *c4iw_get_qp(struct ib_device *dev, int qpn)
+{
+ PDBG("%s ib_dev %p qpn 0x%x\n", __func__, dev, qpn);
+ return (struct ib_qp *)get_qhp(to_c4iw_dev(dev), qpn);
+}
+
+int c4iw_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_qp_init_attr *init_attr)
+{
+ struct c4iw_qp *qhp = to_c4iw_qp(ibqp);
+
+ memset(attr, 0, sizeof *attr);
+ memset(init_attr, 0, sizeof *init_attr);
+ attr->qp_state = to_ib_qp_state(qhp->attr.state);
+ init_attr->cap.max_send_wr = qhp->attr.sq_num_entries;
+ init_attr->cap.max_recv_wr = qhp->attr.rq_num_entries;
+ init_attr->cap.max_send_sge = qhp->attr.sq_max_sges;
+	init_attr->cap.max_recv_sge = qhp->attr.rq_max_sges;
+ init_attr->cap.max_inline_data = T4_MAX_SEND_INLINE;
+ init_attr->sq_sig_type = qhp->sq_sig_all ? IB_SIGNAL_ALL_WR : 0;
+ return 0;
+}
diff --git a/drivers/infiniband/hw/cxgb4/resource.c b/drivers/infiniband/hw/cxgb4/resource.c
new file mode 100644
index 000000000..67df71a70
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/resource.c
@@ -0,0 +1,453 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/* Crude resource management */
+#include <linux/spinlock.h>
+#include <linux/genalloc.h>
+#include <linux/ratelimit.h>
+#include "iw_cxgb4.h"
+
+static int c4iw_init_qid_table(struct c4iw_rdev *rdev)
+{
+ u32 i;
+
+ if (c4iw_id_table_alloc(&rdev->resource.qid_table,
+ rdev->lldi.vr->qp.start,
+ rdev->lldi.vr->qp.size,
+ rdev->lldi.vr->qp.size, 0))
+ return -ENOMEM;
+
+ for (i = rdev->lldi.vr->qp.start;
+ i < rdev->lldi.vr->qp.start + rdev->lldi.vr->qp.size; i++)
+ if (!(i & rdev->qpmask))
+ c4iw_id_free(&rdev->resource.qid_table, i);
+ return 0;
+}
+
+/* nr_* must be power of 2 */
+int c4iw_init_resource(struct c4iw_rdev *rdev, u32 nr_tpt, u32 nr_pdid)
+{
+ int err = 0;
+ err = c4iw_id_table_alloc(&rdev->resource.tpt_table, 0, nr_tpt, 1,
+ C4IW_ID_TABLE_F_RANDOM);
+ if (err)
+ goto tpt_err;
+ err = c4iw_init_qid_table(rdev);
+ if (err)
+ goto qid_err;
+ err = c4iw_id_table_alloc(&rdev->resource.pdid_table, 0,
+ nr_pdid, 1, 0);
+ if (err)
+ goto pdid_err;
+ return 0;
+ pdid_err:
+ c4iw_id_table_free(&rdev->resource.qid_table);
+ qid_err:
+ c4iw_id_table_free(&rdev->resource.tpt_table);
+ tpt_err:
+ return -ENOMEM;
+}
+
+/*
+ * returns 0 if no resource available
+ */
+u32 c4iw_get_resource(struct c4iw_id_table *id_table)
+{
+ u32 entry;
+ entry = c4iw_id_alloc(id_table);
+ if (entry == (u32)(-1))
+ return 0;
+ return entry;
+}
+
+void c4iw_put_resource(struct c4iw_id_table *id_table, u32 entry)
+{
+ PDBG("%s entry 0x%x\n", __func__, entry);
+ c4iw_id_free(id_table, entry);
+}
+
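+/*
+ * A hardware qid allocation covers qpmask+1 consecutive ids (one
+ * doorbell/GTS page worth).  The ids not returned to the caller are
+ * cached on the ucontext's cqid and qpid free lists for later reuse,
+ * since they all map to the same db/gts page.
+ */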
+u32 c4iw_get_cqid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx)
+{
+ struct c4iw_qid_list *entry;
+ u32 qid;
+ int i;
+
+ mutex_lock(&uctx->lock);
+ if (!list_empty(&uctx->cqids)) {
+ entry = list_entry(uctx->cqids.next, struct c4iw_qid_list,
+ entry);
+ list_del(&entry->entry);
+ qid = entry->qid;
+ kfree(entry);
+ } else {
+ qid = c4iw_get_resource(&rdev->resource.qid_table);
+ if (!qid)
+ goto out;
+ mutex_lock(&rdev->stats.lock);
+ rdev->stats.qid.cur += rdev->qpmask + 1;
+ mutex_unlock(&rdev->stats.lock);
+ for (i = qid+1; i & rdev->qpmask; i++) {
+ entry = kmalloc(sizeof *entry, GFP_KERNEL);
+ if (!entry)
+ goto out;
+ entry->qid = i;
+ list_add_tail(&entry->entry, &uctx->cqids);
+ }
+
+ /*
+ * now put the same ids on the qp list since they all
+ * map to the same db/gts page.
+ */
+ entry = kmalloc(sizeof *entry, GFP_KERNEL);
+ if (!entry)
+ goto out;
+ entry->qid = qid;
+ list_add_tail(&entry->entry, &uctx->qpids);
+ for (i = qid+1; i & rdev->qpmask; i++) {
+ entry = kmalloc(sizeof *entry, GFP_KERNEL);
+ if (!entry)
+ goto out;
+ entry->qid = i;
+ list_add_tail(&entry->entry, &uctx->qpids);
+ }
+ }
+out:
+ mutex_unlock(&uctx->lock);
+ PDBG("%s qid 0x%x\n", __func__, qid);
+ mutex_lock(&rdev->stats.lock);
+ if (rdev->stats.qid.cur > rdev->stats.qid.max)
+ rdev->stats.qid.max = rdev->stats.qid.cur;
+ mutex_unlock(&rdev->stats.lock);
+ return qid;
+}
+
+void c4iw_put_cqid(struct c4iw_rdev *rdev, u32 qid,
+ struct c4iw_dev_ucontext *uctx)
+{
+ struct c4iw_qid_list *entry;
+
+ entry = kmalloc(sizeof *entry, GFP_KERNEL);
+ if (!entry)
+ return;
+ PDBG("%s qid 0x%x\n", __func__, qid);
+ entry->qid = qid;
+ mutex_lock(&uctx->lock);
+ list_add_tail(&entry->entry, &uctx->cqids);
+ mutex_unlock(&uctx->lock);
+}
+
+u32 c4iw_get_qpid(struct c4iw_rdev *rdev, struct c4iw_dev_ucontext *uctx)
+{
+ struct c4iw_qid_list *entry;
+ u32 qid;
+ int i;
+
+ mutex_lock(&uctx->lock);
+ if (!list_empty(&uctx->qpids)) {
+ entry = list_entry(uctx->qpids.next, struct c4iw_qid_list,
+ entry);
+ list_del(&entry->entry);
+ qid = entry->qid;
+ kfree(entry);
+ } else {
+ qid = c4iw_get_resource(&rdev->resource.qid_table);
+ if (!qid) {
+ mutex_lock(&rdev->stats.lock);
+ rdev->stats.qid.fail++;
+ mutex_unlock(&rdev->stats.lock);
+ goto out;
+ }
+ mutex_lock(&rdev->stats.lock);
+ rdev->stats.qid.cur += rdev->qpmask + 1;
+ mutex_unlock(&rdev->stats.lock);
+ for (i = qid+1; i & rdev->qpmask; i++) {
+ entry = kmalloc(sizeof *entry, GFP_KERNEL);
+ if (!entry)
+ goto out;
+ entry->qid = i;
+ list_add_tail(&entry->entry, &uctx->qpids);
+ }
+
+ /*
+ * now put the same ids on the cq list since they all
+ * map to the same db/gts page.
+ */
+ entry = kmalloc(sizeof *entry, GFP_KERNEL);
+ if (!entry)
+ goto out;
+ entry->qid = qid;
+ list_add_tail(&entry->entry, &uctx->cqids);
+		for (i = qid+1; i & rdev->qpmask; i++) {
+ entry = kmalloc(sizeof *entry, GFP_KERNEL);
+ if (!entry)
+ goto out;
+ entry->qid = i;
+ list_add_tail(&entry->entry, &uctx->cqids);
+ }
+ }
+out:
+ mutex_unlock(&uctx->lock);
+ PDBG("%s qid 0x%x\n", __func__, qid);
+ mutex_lock(&rdev->stats.lock);
+ if (rdev->stats.qid.cur > rdev->stats.qid.max)
+ rdev->stats.qid.max = rdev->stats.qid.cur;
+ mutex_unlock(&rdev->stats.lock);
+ return qid;
+}
+
+void c4iw_put_qpid(struct c4iw_rdev *rdev, u32 qid,
+ struct c4iw_dev_ucontext *uctx)
+{
+ struct c4iw_qid_list *entry;
+
+ entry = kmalloc(sizeof *entry, GFP_KERNEL);
+ if (!entry)
+ return;
+ PDBG("%s qid 0x%x\n", __func__, qid);
+ entry->qid = qid;
+ mutex_lock(&uctx->lock);
+ list_add_tail(&entry->entry, &uctx->qpids);
+ mutex_unlock(&uctx->lock);
+}
+
+void c4iw_destroy_resource(struct c4iw_resource *rscp)
+{
+ c4iw_id_table_free(&rscp->tpt_table);
+ c4iw_id_table_free(&rscp->qid_table);
+ c4iw_id_table_free(&rscp->pdid_table);
+}
+
+/*
+ * PBL Memory Manager. Uses Linux generic allocator.
+ */
+
+#define MIN_PBL_SHIFT 8 /* 256B == min PBL size (32 entries) */
+
+u32 c4iw_pblpool_alloc(struct c4iw_rdev *rdev, int size)
+{
+ unsigned long addr = gen_pool_alloc(rdev->pbl_pool, size);
+ PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size);
+ mutex_lock(&rdev->stats.lock);
+ if (addr) {
+ rdev->stats.pbl.cur += roundup(size, 1 << MIN_PBL_SHIFT);
+ if (rdev->stats.pbl.cur > rdev->stats.pbl.max)
+ rdev->stats.pbl.max = rdev->stats.pbl.cur;
+ } else
+ rdev->stats.pbl.fail++;
+ mutex_unlock(&rdev->stats.lock);
+ return (u32)addr;
+}
+
+void c4iw_pblpool_free(struct c4iw_rdev *rdev, u32 addr, int size)
+{
+ PDBG("%s addr 0x%x size %d\n", __func__, addr, size);
+ mutex_lock(&rdev->stats.lock);
+ rdev->stats.pbl.cur -= roundup(size, 1 << MIN_PBL_SHIFT);
+ mutex_unlock(&rdev->stats.lock);
+ gen_pool_free(rdev->pbl_pool, (unsigned long)addr, size);
+}
+
+int c4iw_pblpool_create(struct c4iw_rdev *rdev)
+{
+ unsigned pbl_start, pbl_chunk, pbl_top;
+
+ rdev->pbl_pool = gen_pool_create(MIN_PBL_SHIFT, -1);
+ if (!rdev->pbl_pool)
+ return -ENOMEM;
+
+ pbl_start = rdev->lldi.vr->pbl.start;
+ pbl_chunk = rdev->lldi.vr->pbl.size;
+ pbl_top = pbl_start + pbl_chunk;
+
+ while (pbl_start < pbl_top) {
+ pbl_chunk = min(pbl_top - pbl_start + 1, pbl_chunk);
+ if (gen_pool_add(rdev->pbl_pool, pbl_start, pbl_chunk, -1)) {
+ PDBG("%s failed to add PBL chunk (%x/%x)\n",
+ __func__, pbl_start, pbl_chunk);
+ if (pbl_chunk <= 1024 << MIN_PBL_SHIFT) {
+ printk(KERN_WARNING MOD
+ "Failed to add all PBL chunks (%x/%x)\n",
+ pbl_start,
+ pbl_top - pbl_start);
+ return 0;
+ }
+ pbl_chunk >>= 1;
+ } else {
+ PDBG("%s added PBL chunk (%x/%x)\n",
+ __func__, pbl_start, pbl_chunk);
+ pbl_start += pbl_chunk;
+ }
+ }
+
+ return 0;
+}
+
+void c4iw_pblpool_destroy(struct c4iw_rdev *rdev)
+{
+ gen_pool_destroy(rdev->pbl_pool);
+}
+
+/*
+ * RQT Memory Manager. Uses Linux generic allocator.
+ */
+
+#define MIN_RQT_SHIFT 10 /* 1KB == min RQT size (16 entries) */
+
+u32 c4iw_rqtpool_alloc(struct c4iw_rdev *rdev, int size)
+{
+ unsigned long addr = gen_pool_alloc(rdev->rqt_pool, size << 6);
+ PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size << 6);
+ if (!addr)
+ pr_warn_ratelimited(MOD "%s: Out of RQT memory\n",
+ pci_name(rdev->lldi.pdev));
+ mutex_lock(&rdev->stats.lock);
+ if (addr) {
+ rdev->stats.rqt.cur += roundup(size << 6, 1 << MIN_RQT_SHIFT);
+ if (rdev->stats.rqt.cur > rdev->stats.rqt.max)
+ rdev->stats.rqt.max = rdev->stats.rqt.cur;
+ } else
+ rdev->stats.rqt.fail++;
+ mutex_unlock(&rdev->stats.lock);
+ return (u32)addr;
+}
+
+void c4iw_rqtpool_free(struct c4iw_rdev *rdev, u32 addr, int size)
+{
+ PDBG("%s addr 0x%x size %d\n", __func__, addr, size << 6);
+ mutex_lock(&rdev->stats.lock);
+ rdev->stats.rqt.cur -= roundup(size << 6, 1 << MIN_RQT_SHIFT);
+ mutex_unlock(&rdev->stats.lock);
+ gen_pool_free(rdev->rqt_pool, (unsigned long)addr, size << 6);
+}
+
+int c4iw_rqtpool_create(struct c4iw_rdev *rdev)
+{
+ unsigned rqt_start, rqt_chunk, rqt_top;
+
+ rdev->rqt_pool = gen_pool_create(MIN_RQT_SHIFT, -1);
+ if (!rdev->rqt_pool)
+ return -ENOMEM;
+
+ rqt_start = rdev->lldi.vr->rq.start;
+ rqt_chunk = rdev->lldi.vr->rq.size;
+ rqt_top = rqt_start + rqt_chunk;
+
+ while (rqt_start < rqt_top) {
+ rqt_chunk = min(rqt_top - rqt_start + 1, rqt_chunk);
+ if (gen_pool_add(rdev->rqt_pool, rqt_start, rqt_chunk, -1)) {
+ PDBG("%s failed to add RQT chunk (%x/%x)\n",
+ __func__, rqt_start, rqt_chunk);
+ if (rqt_chunk <= 1024 << MIN_RQT_SHIFT) {
+ printk(KERN_WARNING MOD
+ "Failed to add all RQT chunks (%x/%x)\n",
+ rqt_start, rqt_top - rqt_start);
+ return 0;
+ }
+ rqt_chunk >>= 1;
+ } else {
+ PDBG("%s added RQT chunk (%x/%x)\n",
+ __func__, rqt_start, rqt_chunk);
+ rqt_start += rqt_chunk;
+ }
+ }
+ return 0;
+}
+
+void c4iw_rqtpool_destroy(struct c4iw_rdev *rdev)
+{
+ gen_pool_destroy(rdev->rqt_pool);
+}
+
+/*
+ * On-Chip QP Memory.
+ */
+#define MIN_OCQP_SHIFT 12 /* 4KB == min ocqp size */
+
+u32 c4iw_ocqp_pool_alloc(struct c4iw_rdev *rdev, int size)
+{
+ unsigned long addr = gen_pool_alloc(rdev->ocqp_pool, size);
+ PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size);
+ if (addr) {
+ mutex_lock(&rdev->stats.lock);
+ rdev->stats.ocqp.cur += roundup(size, 1 << MIN_OCQP_SHIFT);
+ if (rdev->stats.ocqp.cur > rdev->stats.ocqp.max)
+ rdev->stats.ocqp.max = rdev->stats.ocqp.cur;
+ mutex_unlock(&rdev->stats.lock);
+ }
+ return (u32)addr;
+}
+
+void c4iw_ocqp_pool_free(struct c4iw_rdev *rdev, u32 addr, int size)
+{
+ PDBG("%s addr 0x%x size %d\n", __func__, addr, size);
+ mutex_lock(&rdev->stats.lock);
+ rdev->stats.ocqp.cur -= roundup(size, 1 << MIN_OCQP_SHIFT);
+ mutex_unlock(&rdev->stats.lock);
+ gen_pool_free(rdev->ocqp_pool, (unsigned long)addr, size);
+}
+
+int c4iw_ocqp_pool_create(struct c4iw_rdev *rdev)
+{
+ unsigned start, chunk, top;
+
+ rdev->ocqp_pool = gen_pool_create(MIN_OCQP_SHIFT, -1);
+ if (!rdev->ocqp_pool)
+ return -ENOMEM;
+
+ start = rdev->lldi.vr->ocq.start;
+ chunk = rdev->lldi.vr->ocq.size;
+ top = start + chunk;
+
+ while (start < top) {
+ chunk = min(top - start + 1, chunk);
+ if (gen_pool_add(rdev->ocqp_pool, start, chunk, -1)) {
+ PDBG("%s failed to add OCQP chunk (%x/%x)\n",
+ __func__, start, chunk);
+ if (chunk <= 1024 << MIN_OCQP_SHIFT) {
+ printk(KERN_WARNING MOD
+ "Failed to add all OCQP chunks (%x/%x)\n",
+ start, top - start);
+ return 0;
+ }
+ chunk >>= 1;
+ } else {
+ PDBG("%s added OCQP chunk (%x/%x)\n",
+ __func__, start, chunk);
+ start += chunk;
+ }
+ }
+ return 0;
+}
+
+void c4iw_ocqp_pool_destroy(struct c4iw_rdev *rdev)
+{
+ gen_pool_destroy(rdev->ocqp_pool);
+}
diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h
new file mode 100644
index 000000000..7f2a6c244
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/t4.h
@@ -0,0 +1,685 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __T4_H__
+#define __T4_H__
+
+#include "t4_hw.h"
+#include "t4_regs.h"
+#include "t4_msg.h"
+#include "t4fw_ri_api.h"
+
+#define T4_MAX_NUM_PD 65536
+#define T4_MAX_MR_SIZE (~0ULL)
+#define T4_PAGESIZE_MASK 0xffff000 /* 4KB-128MB */
+#define T4_STAG_UNSET 0xffffffff
+#define T4_FW_MAJ 0
+#define PCIE_MA_SYNC_A 0x30b4
+
+struct t4_status_page {
+ __be32 rsvd1; /* flit 0 - hw owns */
+ __be16 rsvd2;
+ __be16 qid;
+ __be16 cidx;
+ __be16 pidx;
+ u8 qp_err; /* flit 1 - sw owns */
+ u8 db_off;
+ u8 pad;
+ u16 host_wq_pidx;
+ u16 host_cidx;
+ u16 host_pidx;
+};
+
+#define T4_EQ_ENTRY_SIZE 64
+
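+/*
+ * Each SQ WR occupies T4_SQ_NUM_SLOTS 64-byte egress-queue entries;
+ * the T4_MAX_*_SGE/INLINE limits below are simply what remains of
+ * that byte budget after the fixed WR header and the SGL/immediate
+ * sub-header are accounted for.
+ */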
+#define T4_SQ_NUM_SLOTS 5
+#define T4_SQ_NUM_BYTES (T4_EQ_ENTRY_SIZE * T4_SQ_NUM_SLOTS)
+#define T4_MAX_SEND_SGE ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_send_wr) - \
+ sizeof(struct fw_ri_isgl)) / sizeof(struct fw_ri_sge))
+#define T4_MAX_SEND_INLINE ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_send_wr) - \
+ sizeof(struct fw_ri_immd)))
+#define T4_MAX_WRITE_INLINE ((T4_SQ_NUM_BYTES - \
+ sizeof(struct fw_ri_rdma_write_wr) - \
+ sizeof(struct fw_ri_immd)))
+#define T4_MAX_WRITE_SGE ((T4_SQ_NUM_BYTES - \
+ sizeof(struct fw_ri_rdma_write_wr) - \
+ sizeof(struct fw_ri_isgl)) / sizeof(struct fw_ri_sge))
+#define T4_MAX_FR_IMMD ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_fr_nsmr_wr) - \
+ sizeof(struct fw_ri_immd)) & ~31UL)
+#define T4_MAX_FR_IMMD_DEPTH (T4_MAX_FR_IMMD / sizeof(u64))
+#define T4_MAX_FR_DSGL 1024
+#define T4_MAX_FR_DSGL_DEPTH (T4_MAX_FR_DSGL / sizeof(u64))
+
+static inline int t4_max_fr_depth(int use_dsgl)
+{
+ return use_dsgl ? T4_MAX_FR_DSGL_DEPTH : T4_MAX_FR_IMMD_DEPTH;
+}
+
+#define T4_RQ_NUM_SLOTS 2
+#define T4_RQ_NUM_BYTES (T4_EQ_ENTRY_SIZE * T4_RQ_NUM_SLOTS)
+#define T4_MAX_RECV_SGE 4
+
+union t4_wr {
+ struct fw_ri_res_wr res;
+ struct fw_ri_wr ri;
+ struct fw_ri_rdma_write_wr write;
+ struct fw_ri_send_wr send;
+ struct fw_ri_rdma_read_wr read;
+ struct fw_ri_bind_mw_wr bind;
+ struct fw_ri_fr_nsmr_wr fr;
+ struct fw_ri_inv_lstag_wr inv;
+ struct t4_status_page status;
+ __be64 flits[T4_EQ_ENTRY_SIZE / sizeof(__be64) * T4_SQ_NUM_SLOTS];
+};
+
+union t4_recv_wr {
+ struct fw_ri_recv_wr recv;
+ struct t4_status_page status;
+ __be64 flits[T4_EQ_ENTRY_SIZE / sizeof(__be64) * T4_RQ_NUM_SLOTS];
+};
+
+static inline void init_wr_hdr(union t4_wr *wqe, u16 wrid,
+ enum fw_wr_opcodes opcode, u8 flags, u8 len16)
+{
+ wqe->send.opcode = (u8)opcode;
+ wqe->send.flags = flags;
+ wqe->send.wrid = wrid;
+ wqe->send.r1[0] = 0;
+ wqe->send.r1[1] = 0;
+ wqe->send.r1[2] = 0;
+ wqe->send.len16 = len16;
+}
+
+/* CQE/AE status codes */
+#define T4_ERR_SUCCESS 0x0
+#define T4_ERR_STAG 0x1 /* STAG invalid: either the */
+ /* STAG is off limit, being 0, */
+ /* or STAG_key mismatch */
+#define T4_ERR_PDID 0x2 /* PDID mismatch */
+#define T4_ERR_QPID 0x3 /* QPID mismatch */
+#define T4_ERR_ACCESS 0x4 /* Invalid access right */
+#define T4_ERR_WRAP 0x5 /* Wrap error */
+#define T4_ERR_BOUND 0x6 /* base and bounds violation */
+#define T4_ERR_INVALIDATE_SHARED_MR 0x7 /* attempt to invalidate a */
+ /* shared memory region */
+#define T4_ERR_INVALIDATE_MR_WITH_MW_BOUND 0x8 /* attempt to invalidate a */
+ /* MR with a bound MW */
+#define T4_ERR_ECC 0x9 /* ECC error detected */
+#define T4_ERR_ECC_PSTAG 0xA /* ECC error detected when */
+ /* reading PSTAG for a MW */
+ /* Invalidate */
+#define T4_ERR_PBL_ADDR_BOUND 0xB /* pbl addr out of bounds: */
+ /* software error */
+#define T4_ERR_SWFLUSH 0xC /* SW FLUSHED */
+#define T4_ERR_CRC 0x10 /* CRC error */
+#define T4_ERR_MARKER 0x11 /* Marker error */
+#define T4_ERR_PDU_LEN_ERR 0x12 /* invalid PDU length */
+#define T4_ERR_OUT_OF_RQE 0x13 /* out of RQE */
+#define T4_ERR_DDP_VERSION 0x14 /* wrong DDP version */
+#define T4_ERR_RDMA_VERSION 0x15 /* wrong RDMA version */
+#define T4_ERR_OPCODE 0x16 /* invalid rdma opcode */
+#define T4_ERR_DDP_QUEUE_NUM 0x17 /* invalid ddp queue number */
+#define T4_ERR_MSN 0x18 /* MSN error */
+#define T4_ERR_TBIT 0x19 /* tag bit not set correctly */
+#define T4_ERR_MO 0x1A /* MO not 0 for TERMINATE */
+ /* or READ_REQ */
+#define T4_ERR_MSN_GAP 0x1B
+#define T4_ERR_MSN_RANGE 0x1C
+#define T4_ERR_IRD_OVERFLOW 0x1D
+#define T4_ERR_RQE_ADDR_BOUND 0x1E /* RQE addr out of bounds: */
+ /* software error */
+#define T4_ERR_INTERNAL_ERR 0x1F /* internal error (opcode */
+ /* mismatch) */
+/*
+ * CQE defs
+ */
+struct t4_cqe {
+ __be32 header;
+ __be32 len;
+ union {
+ struct {
+ __be32 stag;
+ __be32 msn;
+ } rcqe;
+ struct {
+ u32 nada1;
+ u16 nada2;
+ u16 cidx;
+ } scqe;
+ struct {
+ __be32 wrid_hi;
+ __be32 wrid_low;
+ } gen;
+ } u;
+ __be64 reserved;
+ __be64 bits_type_ts;
+};
+
+/* macros for flit 0 of the cqe */
+
+#define CQE_QPID_S 12
+#define CQE_QPID_M 0xFFFFF
+#define CQE_QPID_G(x) ((((x) >> CQE_QPID_S)) & CQE_QPID_M)
+#define CQE_QPID_V(x) ((x)<<CQE_QPID_S)
+
+#define CQE_SWCQE_S 11
+#define CQE_SWCQE_M 0x1
+#define CQE_SWCQE_G(x) ((((x) >> CQE_SWCQE_S)) & CQE_SWCQE_M)
+#define CQE_SWCQE_V(x) ((x)<<CQE_SWCQE_S)
+
+#define CQE_STATUS_S 5
+#define CQE_STATUS_M 0x1F
+#define CQE_STATUS_G(x) ((((x) >> CQE_STATUS_S)) & CQE_STATUS_M)
+#define CQE_STATUS_V(x) ((x)<<CQE_STATUS_S)
+
+#define CQE_TYPE_S 4
+#define CQE_TYPE_M 0x1
+#define CQE_TYPE_G(x) ((((x) >> CQE_TYPE_S)) & CQE_TYPE_M)
+#define CQE_TYPE_V(x) ((x)<<CQE_TYPE_S)
+
+#define CQE_OPCODE_S 0
+#define CQE_OPCODE_M 0xF
+#define CQE_OPCODE_G(x) ((((x) >> CQE_OPCODE_S)) & CQE_OPCODE_M)
+#define CQE_OPCODE_V(x) ((x)<<CQE_OPCODE_S)
+
+#define SW_CQE(x) (CQE_SWCQE_G(be32_to_cpu((x)->header)))
+#define CQE_QPID(x) (CQE_QPID_G(be32_to_cpu((x)->header)))
+#define CQE_TYPE(x) (CQE_TYPE_G(be32_to_cpu((x)->header)))
+#define SQ_TYPE(x) (CQE_TYPE((x)))
+#define RQ_TYPE(x) (!CQE_TYPE((x)))
+#define CQE_STATUS(x) (CQE_STATUS_G(be32_to_cpu((x)->header)))
+#define CQE_OPCODE(x) (CQE_OPCODE_G(be32_to_cpu((x)->header)))
+
+#define CQE_SEND_OPCODE(x) ( \
+ (CQE_OPCODE_G(be32_to_cpu((x)->header)) == FW_RI_SEND) || \
+ (CQE_OPCODE_G(be32_to_cpu((x)->header)) == FW_RI_SEND_WITH_SE) || \
+ (CQE_OPCODE_G(be32_to_cpu((x)->header)) == FW_RI_SEND_WITH_INV) || \
+ (CQE_OPCODE_G(be32_to_cpu((x)->header)) == FW_RI_SEND_WITH_SE_INV))
+
+#define CQE_LEN(x) (be32_to_cpu((x)->len))
+
+/* used for RQ completion processing */
+#define CQE_WRID_STAG(x) (be32_to_cpu((x)->u.rcqe.stag))
+#define CQE_WRID_MSN(x) (be32_to_cpu((x)->u.rcqe.msn))
+
+/* used for SQ completion processing */
+#define CQE_WRID_SQ_IDX(x) ((x)->u.scqe.cidx)
+
+/* generic accessor macros */
+#define CQE_WRID_HI(x) (be32_to_cpu((x)->u.gen.wrid_hi))
+#define CQE_WRID_LOW(x) (be32_to_cpu((x)->u.gen.wrid_low))
+
+/* macros for flit 3 of the cqe */
+#define CQE_GENBIT_S 63
+#define CQE_GENBIT_M 0x1
+#define CQE_GENBIT_G(x) (((x) >> CQE_GENBIT_S) & CQE_GENBIT_M)
+#define CQE_GENBIT_V(x) ((x)<<CQE_GENBIT_S)
+
+#define CQE_OVFBIT_S 62
+#define CQE_OVFBIT_M 0x1
+#define CQE_OVFBIT_G(x) ((((x) >> CQE_OVFBIT_S)) & CQE_OVFBIT_M)
+
+#define CQE_IQTYPE_S 60
+#define CQE_IQTYPE_M 0x3
+#define CQE_IQTYPE_G(x) ((((x) >> CQE_IQTYPE_S)) & CQE_IQTYPE_M)
+
+#define CQE_TS_M 0x0fffffffffffffffULL
+#define CQE_TS_G(x) ((x) & CQE_TS_M)
+
+#define CQE_OVFBIT(x) ((unsigned)CQE_OVFBIT_G(be64_to_cpu((x)->bits_type_ts)))
+#define CQE_GENBIT(x) ((unsigned)CQE_GENBIT_G(be64_to_cpu((x)->bits_type_ts)))
+#define CQE_TS(x) (CQE_TS_G(be64_to_cpu((x)->bits_type_ts)))
+
+struct t4_swsqe {
+ u64 wr_id;
+ struct t4_cqe cqe;
+ int read_len;
+ int opcode;
+ int complete;
+ int signaled;
+ u16 idx;
+ int flushed;
+ struct timespec host_ts;
+ u64 sge_ts;
+};
+
+static inline pgprot_t t4_pgprot_wc(pgprot_t prot)
+{
+#if defined(__i386__) || defined(__x86_64__) || defined(CONFIG_PPC64)
+ return pgprot_writecombine(prot);
+#else
+ return pgprot_noncached(prot);
+#endif
+}
+
+enum {
+ T4_SQ_ONCHIP = (1<<0),
+};
+
+struct t4_sq {
+ union t4_wr *queue;
+ dma_addr_t dma_addr;
+ DEFINE_DMA_UNMAP_ADDR(mapping);
+ unsigned long phys_addr;
+ struct t4_swsqe *sw_sq;
+ struct t4_swsqe *oldest_read;
+ u64 __iomem *udb;
+ size_t memsize;
+ u32 qid;
+ u16 in_use;
+ u16 size;
+ u16 cidx;
+ u16 pidx;
+ u16 wq_pidx;
+ u16 wq_pidx_inc;
+ u16 flags;
+ short flush_cidx;
+};
+
+struct t4_swrqe {
+ u64 wr_id;
+ struct timespec host_ts;
+ u64 sge_ts;
+};
+
+struct t4_rq {
+ union t4_recv_wr *queue;
+ dma_addr_t dma_addr;
+ DEFINE_DMA_UNMAP_ADDR(mapping);
+ struct t4_swrqe *sw_rq;
+ u64 __iomem *udb;
+ size_t memsize;
+ u32 qid;
+ u32 msn;
+ u32 rqt_hwaddr;
+ u16 rqt_size;
+ u16 in_use;
+ u16 size;
+ u16 cidx;
+ u16 pidx;
+ u16 wq_pidx;
+ u16 wq_pidx_inc;
+};
+
+struct t4_wq {
+ struct t4_sq sq;
+ struct t4_rq rq;
+ void __iomem *db;
+ void __iomem *gts;
+ struct c4iw_rdev *rdev;
+ int flushed;
+};
+
+static inline int t4_rqes_posted(struct t4_wq *wq)
+{
+ return wq->rq.in_use;
+}
+
+static inline int t4_rq_empty(struct t4_wq *wq)
+{
+ return wq->rq.in_use == 0;
+}
+
+static inline int t4_rq_full(struct t4_wq *wq)
+{
+ return wq->rq.in_use == (wq->rq.size - 1);
+}
+
+static inline u32 t4_rq_avail(struct t4_wq *wq)
+{
+ return wq->rq.size - 1 - wq->rq.in_use;
+}
+
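+/*
+ * pidx/cidx count whole WRs; wq_pidx tracks the same producer
+ * position in 64-byte hardware slots (T4_EQ_ENTRY_SIZE units),
+ * matching the layout of the hardware egress queue.
+ */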
+static inline void t4_rq_produce(struct t4_wq *wq, u8 len16)
+{
+ wq->rq.in_use++;
+ if (++wq->rq.pidx == wq->rq.size)
+ wq->rq.pidx = 0;
+ wq->rq.wq_pidx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE);
+ if (wq->rq.wq_pidx >= wq->rq.size * T4_RQ_NUM_SLOTS)
+ wq->rq.wq_pidx %= wq->rq.size * T4_RQ_NUM_SLOTS;
+}
+
+static inline void t4_rq_consume(struct t4_wq *wq)
+{
+ wq->rq.in_use--;
+ wq->rq.msn++;
+ if (++wq->rq.cidx == wq->rq.size)
+ wq->rq.cidx = 0;
+}
+
+static inline u16 t4_rq_host_wq_pidx(struct t4_wq *wq)
+{
+ return wq->rq.queue[wq->rq.size].status.host_wq_pidx;
+}
+
+static inline u16 t4_rq_wq_size(struct t4_wq *wq)
+{
+ return wq->rq.size * T4_RQ_NUM_SLOTS;
+}
+
+static inline int t4_sq_onchip(struct t4_sq *sq)
+{
+ return sq->flags & T4_SQ_ONCHIP;
+}
+
+static inline int t4_sq_empty(struct t4_wq *wq)
+{
+ return wq->sq.in_use == 0;
+}
+
+static inline int t4_sq_full(struct t4_wq *wq)
+{
+ return wq->sq.in_use == (wq->sq.size - 1);
+}
+
+static inline u32 t4_sq_avail(struct t4_wq *wq)
+{
+ return wq->sq.size - 1 - wq->sq.in_use;
+}
+
+static inline void t4_sq_produce(struct t4_wq *wq, u8 len16)
+{
+ wq->sq.in_use++;
+ if (++wq->sq.pidx == wq->sq.size)
+ wq->sq.pidx = 0;
+ wq->sq.wq_pidx += DIV_ROUND_UP(len16*16, T4_EQ_ENTRY_SIZE);
+ if (wq->sq.wq_pidx >= wq->sq.size * T4_SQ_NUM_SLOTS)
+ wq->sq.wq_pidx %= wq->sq.size * T4_SQ_NUM_SLOTS;
+}
+
+static inline void t4_sq_consume(struct t4_wq *wq)
+{
+ BUG_ON(wq->sq.in_use < 1);
+ if (wq->sq.cidx == wq->sq.flush_cidx)
+ wq->sq.flush_cidx = -1;
+ wq->sq.in_use--;
+ if (++wq->sq.cidx == wq->sq.size)
+ wq->sq.cidx = 0;
+}
+
+static inline u16 t4_sq_host_wq_pidx(struct t4_wq *wq)
+{
+ return wq->sq.queue[wq->sq.size].status.host_wq_pidx;
+}
+
+static inline u16 t4_sq_wq_size(struct t4_wq *wq)
+{
+ return wq->sq.size * T4_SQ_NUM_SLOTS;
+}
+
+/* This function copies a 64-byte coalesced work request to the
+ * memory-mapped BAR2 space. For coalesced WRs, the SGE fetches the
+ * data from its FIFO instead of from host memory.
+ */
+static inline void pio_copy(u64 __iomem *dst, u64 *src)
+{
+ int count = 8;
+
+ while (count) {
+ writeq(*src, dst);
+ src++;
+ dst++;
+ count--;
+ }
+}
+
+static inline void t4_ring_sq_db(struct t4_wq *wq, u16 inc, u8 t5,
+ union t4_wr *wqe)
+{
+
+ /* Flush host queue memory writes. */
+ wmb();
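+ /*
+ * T5: when posting exactly one WR we can write the whole
+ * 64-byte WR into the write-combined user doorbell (BAR2)
+ * area so the SGE fetches it from its FIFO; otherwise only
+ * the PIDX increment is written to the user doorbell.
+ */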
+ if (t5) {
+ if (inc == 1 && wqe) {
+ PDBG("%s: WC wq->sq.pidx = %d\n",
+ __func__, wq->sq.pidx);
+ pio_copy(wq->sq.udb + 7, (void *)wqe);
+ } else {
+ PDBG("%s: DB wq->sq.pidx = %d\n",
+ __func__, wq->sq.pidx);
+ writel(PIDX_T5_V(inc), wq->sq.udb);
+ }
+
+ /* Flush user doorbell area writes. */
+ wmb();
+ return;
+ }
+ writel(QID_V(wq->sq.qid) | PIDX_V(inc), wq->db);
+}
+
+static inline void t4_ring_rq_db(struct t4_wq *wq, u16 inc, u8 t5,
+ union t4_recv_wr *wqe)
+{
+
+ /* Flush host queue memory writes. */
+ wmb();
+ if (t5) {
+ if (inc == 1 && wqe) {
+ PDBG("%s: WC wq->rq.pidx = %d\n",
+ __func__, wq->rq.pidx);
+ pio_copy(wq->rq.udb + 7, (void *)wqe);
+ } else {
+ PDBG("%s: DB wq->rq.pidx = %d\n",
+ __func__, wq->rq.pidx);
+ writel(PIDX_T5_V(inc), wq->rq.udb);
+ }
+
+ /* Flush user doorbell area writes. */
+ wmb();
+ return;
+ }
+ writel(QID_V(wq->rq.qid) | PIDX_V(inc), wq->db);
+}
+
+static inline int t4_wq_in_error(struct t4_wq *wq)
+{
+ return wq->rq.queue[wq->rq.size].status.qp_err;
+}
+
+static inline void t4_set_wq_in_error(struct t4_wq *wq)
+{
+ wq->rq.queue[wq->rq.size].status.qp_err = 1;
+}
+
+static inline void t4_disable_wq_db(struct t4_wq *wq)
+{
+ wq->rq.queue[wq->rq.size].status.db_off = 1;
+}
+
+static inline void t4_enable_wq_db(struct t4_wq *wq)
+{
+ wq->rq.queue[wq->rq.size].status.db_off = 0;
+}
+
+static inline int t4_wq_db_enabled(struct t4_wq *wq)
+{
+ return !wq->rq.queue[wq->rq.size].status.db_off;
+}
+
+enum t4_cq_flags {
+ CQ_ARMED = 1,
+};
+
+struct t4_cq {
+ struct t4_cqe *queue;
+ dma_addr_t dma_addr;
+ DEFINE_DMA_UNMAP_ADDR(mapping);
+ struct t4_cqe *sw_queue;
+ void __iomem *gts;
+ struct c4iw_rdev *rdev;
+ u64 ugts;
+ size_t memsize;
+ __be64 bits_type_ts;
+ u32 cqid;
+ u32 qid_mask;
+ int vector;
+ u16 size; /* including status page */
+ u16 cidx;
+ u16 sw_pidx;
+ u16 sw_cidx;
+ u16 sw_in_use;
+ u16 cidx_inc;
+ u8 gen;
+ u8 error;
+ unsigned long flags;
+};
+
+static inline int t4_clear_cq_armed(struct t4_cq *cq)
+{
+ return test_and_clear_bit(CQ_ARMED, &cq->flags);
+}
+
+static inline int t4_arm_cq(struct t4_cq *cq, int se)
+{
+ u32 val;
+
+ set_bit(CQ_ARMED, &cq->flags);
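+ /*
+ * CIDXINC is a bounded field in the GTS register, so hand back
+ * any accumulated index credits in CIDXINC_M-sized writes before
+ * the final write that re-arms the CQ.
+ */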
+ while (cq->cidx_inc > CIDXINC_M) {
+ val = SEINTARM_V(0) | CIDXINC_V(CIDXINC_M) | TIMERREG_V(7) |
+ INGRESSQID_V(cq->cqid & cq->qid_mask);
+ writel(val, cq->gts);
+ cq->cidx_inc -= CIDXINC_M;
+ }
+ val = SEINTARM_V(se) | CIDXINC_V(cq->cidx_inc) | TIMERREG_V(6) |
+ INGRESSQID_V(cq->cqid & cq->qid_mask);
+ writel(val, cq->gts);
+ cq->cidx_inc = 0;
+ return 0;
+}
+
+static inline void t4_swcq_produce(struct t4_cq *cq)
+{
+ cq->sw_in_use++;
+ if (cq->sw_in_use == cq->size) {
+ PDBG("%s cxgb4 sw cq overflow cqid %u\n", __func__, cq->cqid);
+ cq->error = 1;
+ BUG_ON(1);
+ }
+ if (++cq->sw_pidx == cq->size)
+ cq->sw_pidx = 0;
+}
+
+static inline void t4_swcq_consume(struct t4_cq *cq)
+{
+ BUG_ON(cq->sw_in_use < 1);
+ cq->sw_in_use--;
+ if (++cq->sw_cidx == cq->size)
+ cq->sw_cidx = 0;
+}
+
+static inline void t4_hwcq_consume(struct t4_cq *cq)
+{
+ cq->bits_type_ts = cq->queue[cq->cidx].bits_type_ts;
+ if (++cq->cidx_inc == (cq->size >> 4) || cq->cidx_inc == CIDXINC_M) {
+ u32 val;
+
+ val = SEINTARM_V(0) | CIDXINC_V(cq->cidx_inc) | TIMERREG_V(7) |
+ INGRESSQID_V(cq->cqid & cq->qid_mask);
+ writel(val, cq->gts);
+ cq->cidx_inc = 0;
+ }
+ if (++cq->cidx == cq->size) {
+ cq->cidx = 0;
+ cq->gen ^= 1;
+ }
+}
+
+static inline int t4_valid_cqe(struct t4_cq *cq, struct t4_cqe *cqe)
+{
+ return (CQE_GENBIT(cqe) == cq->gen);
+}
+
+static inline int t4_next_hw_cqe(struct t4_cq *cq, struct t4_cqe **cqe)
+{
+ int ret;
+ u16 prev_cidx;
+
+ if (cq->cidx == 0)
+ prev_cidx = cq->size - 1;
+ else
+ prev_cidx = cq->cidx - 1;
+
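+ /*
+ * cq->bits_type_ts holds the timestamp word of the last CQE the
+ * host consumed. If the slot just behind cidx no longer matches
+ * it, the hardware has wrapped over unconsumed entries: report
+ * an overflow.
+ */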
+ if (cq->queue[prev_cidx].bits_type_ts != cq->bits_type_ts) {
+ ret = -EOVERFLOW;
+ cq->error = 1;
+ printk(KERN_ERR MOD "cq overflow cqid %u\n", cq->cqid);
+ BUG_ON(1);
+ } else if (t4_valid_cqe(cq, &cq->queue[cq->cidx])) {
+
+ /* Ensure CQE is flushed to memory */
+ rmb();
+ *cqe = &cq->queue[cq->cidx];
+ ret = 0;
+ } else
+ ret = -ENODATA;
+ return ret;
+}
+
+static inline struct t4_cqe *t4_next_sw_cqe(struct t4_cq *cq)
+{
+ if (cq->sw_in_use == cq->size) {
+ PDBG("%s cxgb4 sw cq overflow cqid %u\n", __func__, cq->cqid);
+ cq->error = 1;
+ BUG_ON(1);
+ return NULL;
+ }
+ if (cq->sw_in_use)
+ return &cq->sw_queue[cq->sw_cidx];
+ return NULL;
+}
+
+static inline int t4_next_cqe(struct t4_cq *cq, struct t4_cqe **cqe)
+{
+ int ret = 0;
+
+ if (cq->error)
+ ret = -ENODATA;
+ else if (cq->sw_in_use)
+ *cqe = &cq->sw_queue[cq->sw_cidx];
+ else
+ ret = t4_next_hw_cqe(cq, cqe);
+ return ret;
+}
+
+static inline int t4_cq_in_error(struct t4_cq *cq)
+{
+ return ((struct t4_status_page *)&cq->queue[cq->size])->qp_err;
+}
+
+static inline void t4_set_cq_in_error(struct t4_cq *cq)
+{
+ ((struct t4_status_page *)&cq->queue[cq->size])->qp_err = 1;
+}
+
+struct t4_dev_status_page {
+ u8 db_off;
+};
+#endif
diff --git a/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h
new file mode 100644
index 000000000..343e8daf2
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/t4fw_ri_api.h
@@ -0,0 +1,855 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _T4FW_RI_API_H_
+#define _T4FW_RI_API_H_
+
+#include "t4fw_api.h"
+
+enum fw_ri_wr_opcode {
+ FW_RI_RDMA_WRITE = 0x0, /* IETF RDMAP v1.0 ... */
+ FW_RI_READ_REQ = 0x1,
+ FW_RI_READ_RESP = 0x2,
+ FW_RI_SEND = 0x3,
+ FW_RI_SEND_WITH_INV = 0x4,
+ FW_RI_SEND_WITH_SE = 0x5,
+ FW_RI_SEND_WITH_SE_INV = 0x6,
+ FW_RI_TERMINATE = 0x7,
+ FW_RI_RDMA_INIT = 0x8, /* CHELSIO RI specific ... */
+ FW_RI_BIND_MW = 0x9,
+ FW_RI_FAST_REGISTER = 0xa,
+ FW_RI_LOCAL_INV = 0xb,
+ FW_RI_QP_MODIFY = 0xc,
+ FW_RI_BYPASS = 0xd,
+ FW_RI_RECEIVE = 0xe,
+
+ FW_RI_SGE_EC_CR_RETURN = 0xf
+};
+
+enum fw_ri_wr_flags {
+ FW_RI_COMPLETION_FLAG = 0x01,
+ FW_RI_NOTIFICATION_FLAG = 0x02,
+ FW_RI_SOLICITED_EVENT_FLAG = 0x04,
+ FW_RI_READ_FENCE_FLAG = 0x08,
+ FW_RI_LOCAL_FENCE_FLAG = 0x10,
+ FW_RI_RDMA_READ_INVALIDATE = 0x20
+};
+
+enum fw_ri_mpa_attrs {
+ FW_RI_MPA_RX_MARKER_ENABLE = 0x01,
+ FW_RI_MPA_TX_MARKER_ENABLE = 0x02,
+ FW_RI_MPA_CRC_ENABLE = 0x04,
+ FW_RI_MPA_IETF_ENABLE = 0x08
+};
+
+enum fw_ri_qp_caps {
+ FW_RI_QP_RDMA_READ_ENABLE = 0x01,
+ FW_RI_QP_RDMA_WRITE_ENABLE = 0x02,
+ FW_RI_QP_BIND_ENABLE = 0x04,
+ FW_RI_QP_FAST_REGISTER_ENABLE = 0x08,
+ FW_RI_QP_STAG0_ENABLE = 0x10
+};
+
+enum fw_ri_addr_type {
+ FW_RI_ZERO_BASED_TO = 0x00,
+ FW_RI_VA_BASED_TO = 0x01
+};
+
+enum fw_ri_mem_perms {
+ FW_RI_MEM_ACCESS_REM_WRITE = 0x01,
+ FW_RI_MEM_ACCESS_REM_READ = 0x02,
+ FW_RI_MEM_ACCESS_REM = 0x03,
+ FW_RI_MEM_ACCESS_LOCAL_WRITE = 0x04,
+ FW_RI_MEM_ACCESS_LOCAL_READ = 0x08,
+ FW_RI_MEM_ACCESS_LOCAL = 0x0C
+};
+
+enum fw_ri_stag_type {
+ FW_RI_STAG_NSMR = 0x00,
+ FW_RI_STAG_SMR = 0x01,
+ FW_RI_STAG_MW = 0x02,
+ FW_RI_STAG_MW_RELAXED = 0x03
+};
+
+enum fw_ri_data_op {
+ FW_RI_DATA_IMMD = 0x81,
+ FW_RI_DATA_DSGL = 0x82,
+ FW_RI_DATA_ISGL = 0x83
+};
+
+enum fw_ri_sgl_depth {
+ FW_RI_SGL_DEPTH_MAX_SQ = 16,
+ FW_RI_SGL_DEPTH_MAX_RQ = 4
+};
+
+struct fw_ri_dsge_pair {
+ __be32 len[2];
+ __be64 addr[2];
+};
+
+struct fw_ri_dsgl {
+ __u8 op;
+ __u8 r1;
+ __be16 nsge;
+ __be32 len0;
+ __be64 addr0;
+#ifndef C99_NOT_SUPPORTED
+ struct fw_ri_dsge_pair sge[0];
+#endif
+};
+
+struct fw_ri_sge {
+ __be32 stag;
+ __be32 len;
+ __be64 to;
+};
+
+struct fw_ri_isgl {
+ __u8 op;
+ __u8 r1;
+ __be16 nsge;
+ __be32 r2;
+#ifndef C99_NOT_SUPPORTED
+ struct fw_ri_sge sge[0];
+#endif
+};
+
+struct fw_ri_immd {
+ __u8 op;
+ __u8 r1;
+ __be16 r2;
+ __be32 immdlen;
+#ifndef C99_NOT_SUPPORTED
+ __u8 data[0];
+#endif
+};
+
+struct fw_ri_tpte {
+ __be32 valid_to_pdid;
+ __be32 locread_to_qpid;
+ __be32 nosnoop_pbladdr;
+ __be32 len_lo;
+ __be32 va_hi;
+ __be32 va_lo_fbo;
+ __be32 dca_mwbcnt_pstag;
+ __be32 len_hi;
+};
+
+#define FW_RI_TPTE_VALID_S 31
+#define FW_RI_TPTE_VALID_M 0x1
+#define FW_RI_TPTE_VALID_V(x) ((x) << FW_RI_TPTE_VALID_S)
+#define FW_RI_TPTE_VALID_G(x) \
+ (((x) >> FW_RI_TPTE_VALID_S) & FW_RI_TPTE_VALID_M)
+#define FW_RI_TPTE_VALID_F FW_RI_TPTE_VALID_V(1U)
+
+#define FW_RI_TPTE_STAGKEY_S 23
+#define FW_RI_TPTE_STAGKEY_M 0xff
+#define FW_RI_TPTE_STAGKEY_V(x) ((x) << FW_RI_TPTE_STAGKEY_S)
+#define FW_RI_TPTE_STAGKEY_G(x) \
+ (((x) >> FW_RI_TPTE_STAGKEY_S) & FW_RI_TPTE_STAGKEY_M)
+
+#define FW_RI_TPTE_STAGSTATE_S 22
+#define FW_RI_TPTE_STAGSTATE_M 0x1
+#define FW_RI_TPTE_STAGSTATE_V(x) ((x) << FW_RI_TPTE_STAGSTATE_S)
+#define FW_RI_TPTE_STAGSTATE_G(x) \
+ (((x) >> FW_RI_TPTE_STAGSTATE_S) & FW_RI_TPTE_STAGSTATE_M)
+#define FW_RI_TPTE_STAGSTATE_F FW_RI_TPTE_STAGSTATE_V(1U)
+
+#define FW_RI_TPTE_STAGTYPE_S 20
+#define FW_RI_TPTE_STAGTYPE_M 0x3
+#define FW_RI_TPTE_STAGTYPE_V(x) ((x) << FW_RI_TPTE_STAGTYPE_S)
+#define FW_RI_TPTE_STAGTYPE_G(x) \
+ (((x) >> FW_RI_TPTE_STAGTYPE_S) & FW_RI_TPTE_STAGTYPE_M)
+
+#define FW_RI_TPTE_PDID_S 0
+#define FW_RI_TPTE_PDID_M 0xfffff
+#define FW_RI_TPTE_PDID_V(x) ((x) << FW_RI_TPTE_PDID_S)
+#define FW_RI_TPTE_PDID_G(x) \
+ (((x) >> FW_RI_TPTE_PDID_S) & FW_RI_TPTE_PDID_M)
+
+#define FW_RI_TPTE_PERM_S 28
+#define FW_RI_TPTE_PERM_M 0xf
+#define FW_RI_TPTE_PERM_V(x) ((x) << FW_RI_TPTE_PERM_S)
+#define FW_RI_TPTE_PERM_G(x) \
+ (((x) >> FW_RI_TPTE_PERM_S) & FW_RI_TPTE_PERM_M)
+
+#define FW_RI_TPTE_REMINVDIS_S 27
+#define FW_RI_TPTE_REMINVDIS_M 0x1
+#define FW_RI_TPTE_REMINVDIS_V(x) ((x) << FW_RI_TPTE_REMINVDIS_S)
+#define FW_RI_TPTE_REMINVDIS_G(x) \
+ (((x) >> FW_RI_TPTE_REMINVDIS_S) & FW_RI_TPTE_REMINVDIS_M)
+#define FW_RI_TPTE_REMINVDIS_F FW_RI_TPTE_REMINVDIS_V(1U)
+
+#define FW_RI_TPTE_ADDRTYPE_S 26
+#define FW_RI_TPTE_ADDRTYPE_M 1
+#define FW_RI_TPTE_ADDRTYPE_V(x) ((x) << FW_RI_TPTE_ADDRTYPE_S)
+#define FW_RI_TPTE_ADDRTYPE_G(x) \
+ (((x) >> FW_RI_TPTE_ADDRTYPE_S) & FW_RI_TPTE_ADDRTYPE_M)
+#define FW_RI_TPTE_ADDRTYPE_F FW_RI_TPTE_ADDRTYPE_V(1U)
+
+#define FW_RI_TPTE_MWBINDEN_S 25
+#define FW_RI_TPTE_MWBINDEN_M 0x1
+#define FW_RI_TPTE_MWBINDEN_V(x) ((x) << FW_RI_TPTE_MWBINDEN_S)
+#define FW_RI_TPTE_MWBINDEN_G(x) \
+ (((x) >> FW_RI_TPTE_MWBINDEN_S) & FW_RI_TPTE_MWBINDEN_M)
+#define FW_RI_TPTE_MWBINDEN_F FW_RI_TPTE_MWBINDEN_V(1U)
+
+#define FW_RI_TPTE_PS_S 20
+#define FW_RI_TPTE_PS_M 0x1f
+#define FW_RI_TPTE_PS_V(x) ((x) << FW_RI_TPTE_PS_S)
+#define FW_RI_TPTE_PS_G(x) \
+ (((x) >> FW_RI_TPTE_PS_S) & FW_RI_TPTE_PS_M)
+
+#define FW_RI_TPTE_QPID_S 0
+#define FW_RI_TPTE_QPID_M 0xfffff
+#define FW_RI_TPTE_QPID_V(x) ((x) << FW_RI_TPTE_QPID_S)
+#define FW_RI_TPTE_QPID_G(x) \
+ (((x) >> FW_RI_TPTE_QPID_S) & FW_RI_TPTE_QPID_M)
+
+#define FW_RI_TPTE_NOSNOOP_S 30
+#define FW_RI_TPTE_NOSNOOP_M 0x1
+#define FW_RI_TPTE_NOSNOOP_V(x) ((x) << FW_RI_TPTE_NOSNOOP_S)
+#define FW_RI_TPTE_NOSNOOP_G(x) \
+ (((x) >> FW_RI_TPTE_NOSNOOP_S) & FW_RI_TPTE_NOSNOOP_M)
+#define FW_RI_TPTE_NOSNOOP_F FW_RI_TPTE_NOSNOOP_V(1U)
+
+#define FW_RI_TPTE_PBLADDR_S 0
+#define FW_RI_TPTE_PBLADDR_M 0x1fffffff
+#define FW_RI_TPTE_PBLADDR_V(x) ((x) << FW_RI_TPTE_PBLADDR_S)
+#define FW_RI_TPTE_PBLADDR_G(x) \
+ (((x) >> FW_RI_TPTE_PBLADDR_S) & FW_RI_TPTE_PBLADDR_M)
+
+#define FW_RI_TPTE_DCA_S 24
+#define FW_RI_TPTE_DCA_M 0x1f
+#define FW_RI_TPTE_DCA_V(x) ((x) << FW_RI_TPTE_DCA_S)
+#define FW_RI_TPTE_DCA_G(x) \
+ (((x) >> FW_RI_TPTE_DCA_S) & FW_RI_TPTE_DCA_M)
+
+#define FW_RI_TPTE_MWBCNT_PSTAG_S 0
+#define FW_RI_TPTE_MWBCNT_PSTAG_M 0xffffff
+#define FW_RI_TPTE_MWBCNT_PSTAG_V(x) \
+ ((x) << FW_RI_TPTE_MWBCNT_PSTAG_S)
+#define FW_RI_TPTE_MWBCNT_PSTAG_G(x) \
+ (((x) >> FW_RI_TPTE_MWBCNT_PSTAG_S) & FW_RI_TPTE_MWBCNT_PSTAG_M)
+
+enum fw_ri_res_type {
+ FW_RI_RES_TYPE_SQ,
+ FW_RI_RES_TYPE_RQ,
+ FW_RI_RES_TYPE_CQ,
+};
+
+enum fw_ri_res_op {
+ FW_RI_RES_OP_WRITE,
+ FW_RI_RES_OP_RESET,
+};
+
+struct fw_ri_res {
+ union fw_ri_restype {
+ struct fw_ri_res_sqrq {
+ __u8 restype;
+ __u8 op;
+ __be16 r3;
+ __be32 eqid;
+ __be32 r4[2];
+ __be32 fetchszm_to_iqid;
+ __be32 dcaen_to_eqsize;
+ __be64 eqaddr;
+ } sqrq;
+ struct fw_ri_res_cq {
+ __u8 restype;
+ __u8 op;
+ __be16 r3;
+ __be32 iqid;
+ __be32 r4[2];
+ __be32 iqandst_to_iqandstindex;
+ __be16 iqdroprss_to_iqesize;
+ __be16 iqsize;
+ __be64 iqaddr;
+ __be32 iqns_iqro;
+ __be32 r6_lo;
+ __be64 r7;
+ } cq;
+ } u;
+};
+
+struct fw_ri_res_wr {
+ __be32 op_nres;
+ __be32 len16_pkd;
+ __u64 cookie;
+#ifndef C99_NOT_SUPPORTED
+ struct fw_ri_res res[0];
+#endif
+};
+
+#define FW_RI_RES_WR_NRES_S 0
+#define FW_RI_RES_WR_NRES_M 0xff
+#define FW_RI_RES_WR_NRES_V(x) ((x) << FW_RI_RES_WR_NRES_S)
+#define FW_RI_RES_WR_NRES_G(x) \
+ (((x) >> FW_RI_RES_WR_NRES_S) & FW_RI_RES_WR_NRES_M)
+
+#define FW_RI_RES_WR_FETCHSZM_S 26
+#define FW_RI_RES_WR_FETCHSZM_M 0x1
+#define FW_RI_RES_WR_FETCHSZM_V(x) ((x) << FW_RI_RES_WR_FETCHSZM_S)
+#define FW_RI_RES_WR_FETCHSZM_G(x) \
+ (((x) >> FW_RI_RES_WR_FETCHSZM_S) & FW_RI_RES_WR_FETCHSZM_M)
+#define FW_RI_RES_WR_FETCHSZM_F FW_RI_RES_WR_FETCHSZM_V(1U)
+
+#define FW_RI_RES_WR_STATUSPGNS_S 25
+#define FW_RI_RES_WR_STATUSPGNS_M 0x1
+#define FW_RI_RES_WR_STATUSPGNS_V(x) ((x) << FW_RI_RES_WR_STATUSPGNS_S)
+#define FW_RI_RES_WR_STATUSPGNS_G(x) \
+ (((x) >> FW_RI_RES_WR_STATUSPGNS_S) & FW_RI_RES_WR_STATUSPGNS_M)
+#define FW_RI_RES_WR_STATUSPGNS_F FW_RI_RES_WR_STATUSPGNS_V(1U)
+
+#define FW_RI_RES_WR_STATUSPGRO_S 24
+#define FW_RI_RES_WR_STATUSPGRO_M 0x1
+#define FW_RI_RES_WR_STATUSPGRO_V(x) ((x) << FW_RI_RES_WR_STATUSPGRO_S)
+#define FW_RI_RES_WR_STATUSPGRO_G(x) \
+ (((x) >> FW_RI_RES_WR_STATUSPGRO_S) & FW_RI_RES_WR_STATUSPGRO_M)
+#define FW_RI_RES_WR_STATUSPGRO_F FW_RI_RES_WR_STATUSPGRO_V(1U)
+
+#define FW_RI_RES_WR_FETCHNS_S 23
+#define FW_RI_RES_WR_FETCHNS_M 0x1
+#define FW_RI_RES_WR_FETCHNS_V(x) ((x) << FW_RI_RES_WR_FETCHNS_S)
+#define FW_RI_RES_WR_FETCHNS_G(x) \
+ (((x) >> FW_RI_RES_WR_FETCHNS_S) & FW_RI_RES_WR_FETCHNS_M)
+#define FW_RI_RES_WR_FETCHNS_F FW_RI_RES_WR_FETCHNS_V(1U)
+
+#define FW_RI_RES_WR_FETCHRO_S 22
+#define FW_RI_RES_WR_FETCHRO_M 0x1
+#define FW_RI_RES_WR_FETCHRO_V(x) ((x) << FW_RI_RES_WR_FETCHRO_S)
+#define FW_RI_RES_WR_FETCHRO_G(x) \
+ (((x) >> FW_RI_RES_WR_FETCHRO_S) & FW_RI_RES_WR_FETCHRO_M)
+#define FW_RI_RES_WR_FETCHRO_F FW_RI_RES_WR_FETCHRO_V(1U)
+
+#define FW_RI_RES_WR_HOSTFCMODE_S 20
+#define FW_RI_RES_WR_HOSTFCMODE_M 0x3
+#define FW_RI_RES_WR_HOSTFCMODE_V(x) ((x) << FW_RI_RES_WR_HOSTFCMODE_S)
+#define FW_RI_RES_WR_HOSTFCMODE_G(x) \
+ (((x) >> FW_RI_RES_WR_HOSTFCMODE_S) & FW_RI_RES_WR_HOSTFCMODE_M)
+
+#define FW_RI_RES_WR_CPRIO_S 19
+#define FW_RI_RES_WR_CPRIO_M 0x1
+#define FW_RI_RES_WR_CPRIO_V(x) ((x) << FW_RI_RES_WR_CPRIO_S)
+#define FW_RI_RES_WR_CPRIO_G(x) \
+ (((x) >> FW_RI_RES_WR_CPRIO_S) & FW_RI_RES_WR_CPRIO_M)
+#define FW_RI_RES_WR_CPRIO_F FW_RI_RES_WR_CPRIO_V(1U)
+
+#define FW_RI_RES_WR_ONCHIP_S 18
+#define FW_RI_RES_WR_ONCHIP_M 0x1
+#define FW_RI_RES_WR_ONCHIP_V(x) ((x) << FW_RI_RES_WR_ONCHIP_S)
+#define FW_RI_RES_WR_ONCHIP_G(x) \
+ (((x) >> FW_RI_RES_WR_ONCHIP_S) & FW_RI_RES_WR_ONCHIP_M)
+#define FW_RI_RES_WR_ONCHIP_F FW_RI_RES_WR_ONCHIP_V(1U)
+
+#define FW_RI_RES_WR_PCIECHN_S 16
+#define FW_RI_RES_WR_PCIECHN_M 0x3
+#define FW_RI_RES_WR_PCIECHN_V(x) ((x) << FW_RI_RES_WR_PCIECHN_S)
+#define FW_RI_RES_WR_PCIECHN_G(x) \
+ (((x) >> FW_RI_RES_WR_PCIECHN_S) & FW_RI_RES_WR_PCIECHN_M)
+
+#define FW_RI_RES_WR_IQID_S 0
+#define FW_RI_RES_WR_IQID_M 0xffff
+#define FW_RI_RES_WR_IQID_V(x) ((x) << FW_RI_RES_WR_IQID_S)
+#define FW_RI_RES_WR_IQID_G(x) \
+ (((x) >> FW_RI_RES_WR_IQID_S) & FW_RI_RES_WR_IQID_M)
+
+#define FW_RI_RES_WR_DCAEN_S 31
+#define FW_RI_RES_WR_DCAEN_M 0x1
+#define FW_RI_RES_WR_DCAEN_V(x) ((x) << FW_RI_RES_WR_DCAEN_S)
+#define FW_RI_RES_WR_DCAEN_G(x) \
+ (((x) >> FW_RI_RES_WR_DCAEN_S) & FW_RI_RES_WR_DCAEN_M)
+#define FW_RI_RES_WR_DCAEN_F FW_RI_RES_WR_DCAEN_V(1U)
+
+#define FW_RI_RES_WR_DCACPU_S 26
+#define FW_RI_RES_WR_DCACPU_M 0x1f
+#define FW_RI_RES_WR_DCACPU_V(x) ((x) << FW_RI_RES_WR_DCACPU_S)
+#define FW_RI_RES_WR_DCACPU_G(x) \
+ (((x) >> FW_RI_RES_WR_DCACPU_S) & FW_RI_RES_WR_DCACPU_M)
+
+#define FW_RI_RES_WR_FBMIN_S 23
+#define FW_RI_RES_WR_FBMIN_M 0x7
+#define FW_RI_RES_WR_FBMIN_V(x) ((x) << FW_RI_RES_WR_FBMIN_S)
+#define FW_RI_RES_WR_FBMIN_G(x) \
+ (((x) >> FW_RI_RES_WR_FBMIN_S) & FW_RI_RES_WR_FBMIN_M)
+
+#define FW_RI_RES_WR_FBMAX_S 20
+#define FW_RI_RES_WR_FBMAX_M 0x7
+#define FW_RI_RES_WR_FBMAX_V(x) ((x) << FW_RI_RES_WR_FBMAX_S)
+#define FW_RI_RES_WR_FBMAX_G(x) \
+ (((x) >> FW_RI_RES_WR_FBMAX_S) & FW_RI_RES_WR_FBMAX_M)
+
+#define FW_RI_RES_WR_CIDXFTHRESHO_S 19
+#define FW_RI_RES_WR_CIDXFTHRESHO_M 0x1
+#define FW_RI_RES_WR_CIDXFTHRESHO_V(x) ((x) << FW_RI_RES_WR_CIDXFTHRESHO_S)
+#define FW_RI_RES_WR_CIDXFTHRESHO_G(x) \
+ (((x) >> FW_RI_RES_WR_CIDXFTHRESHO_S) & FW_RI_RES_WR_CIDXFTHRESHO_M)
+#define FW_RI_RES_WR_CIDXFTHRESHO_F FW_RI_RES_WR_CIDXFTHRESHO_V(1U)
+
+#define FW_RI_RES_WR_CIDXFTHRESH_S 16
+#define FW_RI_RES_WR_CIDXFTHRESH_M 0x7
+#define FW_RI_RES_WR_CIDXFTHRESH_V(x) ((x) << FW_RI_RES_WR_CIDXFTHRESH_S)
+#define FW_RI_RES_WR_CIDXFTHRESH_G(x) \
+ (((x) >> FW_RI_RES_WR_CIDXFTHRESH_S) & FW_RI_RES_WR_CIDXFTHRESH_M)
+
+#define FW_RI_RES_WR_EQSIZE_S 0
+#define FW_RI_RES_WR_EQSIZE_M 0xffff
+#define FW_RI_RES_WR_EQSIZE_V(x) ((x) << FW_RI_RES_WR_EQSIZE_S)
+#define FW_RI_RES_WR_EQSIZE_G(x) \
+ (((x) >> FW_RI_RES_WR_EQSIZE_S) & FW_RI_RES_WR_EQSIZE_M)
+
+#define FW_RI_RES_WR_IQANDST_S 15
+#define FW_RI_RES_WR_IQANDST_M 0x1
+#define FW_RI_RES_WR_IQANDST_V(x) ((x) << FW_RI_RES_WR_IQANDST_S)
+#define FW_RI_RES_WR_IQANDST_G(x) \
+ (((x) >> FW_RI_RES_WR_IQANDST_S) & FW_RI_RES_WR_IQANDST_M)
+#define FW_RI_RES_WR_IQANDST_F FW_RI_RES_WR_IQANDST_V(1U)
+
+#define FW_RI_RES_WR_IQANUS_S 14
+#define FW_RI_RES_WR_IQANUS_M 0x1
+#define FW_RI_RES_WR_IQANUS_V(x) ((x) << FW_RI_RES_WR_IQANUS_S)
+#define FW_RI_RES_WR_IQANUS_G(x) \
+ (((x) >> FW_RI_RES_WR_IQANUS_S) & FW_RI_RES_WR_IQANUS_M)
+#define FW_RI_RES_WR_IQANUS_F FW_RI_RES_WR_IQANUS_V(1U)
+
+#define FW_RI_RES_WR_IQANUD_S 12
+#define FW_RI_RES_WR_IQANUD_M 0x3
+#define FW_RI_RES_WR_IQANUD_V(x) ((x) << FW_RI_RES_WR_IQANUD_S)
+#define FW_RI_RES_WR_IQANUD_G(x) \
+ (((x) >> FW_RI_RES_WR_IQANUD_S) & FW_RI_RES_WR_IQANUD_M)
+
+#define FW_RI_RES_WR_IQANDSTINDEX_S 0
+#define FW_RI_RES_WR_IQANDSTINDEX_M 0xfff
+#define FW_RI_RES_WR_IQANDSTINDEX_V(x) ((x) << FW_RI_RES_WR_IQANDSTINDEX_S)
+#define FW_RI_RES_WR_IQANDSTINDEX_G(x) \
+ (((x) >> FW_RI_RES_WR_IQANDSTINDEX_S) & FW_RI_RES_WR_IQANDSTINDEX_M)
+
+#define FW_RI_RES_WR_IQDROPRSS_S 15
+#define FW_RI_RES_WR_IQDROPRSS_M 0x1
+#define FW_RI_RES_WR_IQDROPRSS_V(x) ((x) << FW_RI_RES_WR_IQDROPRSS_S)
+#define FW_RI_RES_WR_IQDROPRSS_G(x) \
+ (((x) >> FW_RI_RES_WR_IQDROPRSS_S) & FW_RI_RES_WR_IQDROPRSS_M)
+#define FW_RI_RES_WR_IQDROPRSS_F FW_RI_RES_WR_IQDROPRSS_V(1U)
+
+#define FW_RI_RES_WR_IQGTSMODE_S 14
+#define FW_RI_RES_WR_IQGTSMODE_M 0x1
+#define FW_RI_RES_WR_IQGTSMODE_V(x) ((x) << FW_RI_RES_WR_IQGTSMODE_S)
+#define FW_RI_RES_WR_IQGTSMODE_G(x) \
+ (((x) >> FW_RI_RES_WR_IQGTSMODE_S) & FW_RI_RES_WR_IQGTSMODE_M)
+#define FW_RI_RES_WR_IQGTSMODE_F FW_RI_RES_WR_IQGTSMODE_V(1U)
+
+#define FW_RI_RES_WR_IQPCIECH_S 12
+#define FW_RI_RES_WR_IQPCIECH_M 0x3
+#define FW_RI_RES_WR_IQPCIECH_V(x) ((x) << FW_RI_RES_WR_IQPCIECH_S)
+#define FW_RI_RES_WR_IQPCIECH_G(x) \
+ (((x) >> FW_RI_RES_WR_IQPCIECH_S) & FW_RI_RES_WR_IQPCIECH_M)
+
+#define FW_RI_RES_WR_IQDCAEN_S 11
+#define FW_RI_RES_WR_IQDCAEN_M 0x1
+#define FW_RI_RES_WR_IQDCAEN_V(x) ((x) << FW_RI_RES_WR_IQDCAEN_S)
+#define FW_RI_RES_WR_IQDCAEN_G(x) \
+ (((x) >> FW_RI_RES_WR_IQDCAEN_S) & FW_RI_RES_WR_IQDCAEN_M)
+#define FW_RI_RES_WR_IQDCAEN_F FW_RI_RES_WR_IQDCAEN_V(1U)
+
+#define FW_RI_RES_WR_IQDCACPU_S 6
+#define FW_RI_RES_WR_IQDCACPU_M 0x1f
+#define FW_RI_RES_WR_IQDCACPU_V(x) ((x) << FW_RI_RES_WR_IQDCACPU_S)
+#define FW_RI_RES_WR_IQDCACPU_G(x) \
+ (((x) >> FW_RI_RES_WR_IQDCACPU_S) & FW_RI_RES_WR_IQDCACPU_M)
+
+#define FW_RI_RES_WR_IQINTCNTTHRESH_S 4
+#define FW_RI_RES_WR_IQINTCNTTHRESH_M 0x3
+#define FW_RI_RES_WR_IQINTCNTTHRESH_V(x) \
+ ((x) << FW_RI_RES_WR_IQINTCNTTHRESH_S)
+#define FW_RI_RES_WR_IQINTCNTTHRESH_G(x) \
+ (((x) >> FW_RI_RES_WR_IQINTCNTTHRESH_S) & FW_RI_RES_WR_IQINTCNTTHRESH_M)
+
+#define FW_RI_RES_WR_IQO_S 3
+#define FW_RI_RES_WR_IQO_M 0x1
+#define FW_RI_RES_WR_IQO_V(x) ((x) << FW_RI_RES_WR_IQO_S)
+#define FW_RI_RES_WR_IQO_G(x) \
+ (((x) >> FW_RI_RES_WR_IQO_S) & FW_RI_RES_WR_IQO_M)
+#define FW_RI_RES_WR_IQO_F FW_RI_RES_WR_IQO_V(1U)
+
+#define FW_RI_RES_WR_IQCPRIO_S 2
+#define FW_RI_RES_WR_IQCPRIO_M 0x1
+#define FW_RI_RES_WR_IQCPRIO_V(x) ((x) << FW_RI_RES_WR_IQCPRIO_S)
+#define FW_RI_RES_WR_IQCPRIO_G(x) \
+ (((x) >> FW_RI_RES_WR_IQCPRIO_S) & FW_RI_RES_WR_IQCPRIO_M)
+#define FW_RI_RES_WR_IQCPRIO_F FW_RI_RES_WR_IQCPRIO_V(1U)
+
+#define FW_RI_RES_WR_IQESIZE_S 0
+#define FW_RI_RES_WR_IQESIZE_M 0x3
+#define FW_RI_RES_WR_IQESIZE_V(x) ((x) << FW_RI_RES_WR_IQESIZE_S)
+#define FW_RI_RES_WR_IQESIZE_G(x) \
+ (((x) >> FW_RI_RES_WR_IQESIZE_S) & FW_RI_RES_WR_IQESIZE_M)
+
+#define FW_RI_RES_WR_IQNS_S 31
+#define FW_RI_RES_WR_IQNS_M 0x1
+#define FW_RI_RES_WR_IQNS_V(x) ((x) << FW_RI_RES_WR_IQNS_S)
+#define FW_RI_RES_WR_IQNS_G(x) \
+ (((x) >> FW_RI_RES_WR_IQNS_S) & FW_RI_RES_WR_IQNS_M)
+#define FW_RI_RES_WR_IQNS_F FW_RI_RES_WR_IQNS_V(1U)
+
+#define FW_RI_RES_WR_IQRO_S 30
+#define FW_RI_RES_WR_IQRO_M 0x1
+#define FW_RI_RES_WR_IQRO_V(x) ((x) << FW_RI_RES_WR_IQRO_S)
+#define FW_RI_RES_WR_IQRO_G(x) \
+ (((x) >> FW_RI_RES_WR_IQRO_S) & FW_RI_RES_WR_IQRO_M)
+#define FW_RI_RES_WR_IQRO_F FW_RI_RES_WR_IQRO_V(1U)
+
+struct fw_ri_rdma_write_wr {
+ __u8 opcode;
+ __u8 flags;
+ __u16 wrid;
+ __u8 r1[3];
+ __u8 len16;
+ __be64 r2;
+ __be32 plen;
+ __be32 stag_sink;
+ __be64 to_sink;
+#ifndef C99_NOT_SUPPORTED
+ union {
+ struct fw_ri_immd immd_src[0];
+ struct fw_ri_isgl isgl_src[0];
+ } u;
+#endif
+};
+
+struct fw_ri_send_wr {
+ __u8 opcode;
+ __u8 flags;
+ __u16 wrid;
+ __u8 r1[3];
+ __u8 len16;
+ __be32 sendop_pkd;
+ __be32 stag_inv;
+ __be32 plen;
+ __be32 r3;
+ __be64 r4;
+#ifndef C99_NOT_SUPPORTED
+ union {
+ struct fw_ri_immd immd_src[0];
+ struct fw_ri_isgl isgl_src[0];
+ } u;
+#endif
+};
+
+#define FW_RI_SEND_WR_SENDOP_S 0
+#define FW_RI_SEND_WR_SENDOP_M 0xf
+#define FW_RI_SEND_WR_SENDOP_V(x) ((x) << FW_RI_SEND_WR_SENDOP_S)
+#define FW_RI_SEND_WR_SENDOP_G(x) \
+ (((x) >> FW_RI_SEND_WR_SENDOP_S) & FW_RI_SEND_WR_SENDOP_M)
+
+struct fw_ri_rdma_read_wr {
+ __u8 opcode;
+ __u8 flags;
+ __u16 wrid;
+ __u8 r1[3];
+ __u8 len16;
+ __be64 r2;
+ __be32 stag_sink;
+ __be32 to_sink_hi;
+ __be32 to_sink_lo;
+ __be32 plen;
+ __be32 stag_src;
+ __be32 to_src_hi;
+ __be32 to_src_lo;
+ __be32 r5;
+};
+
+struct fw_ri_recv_wr {
+ __u8 opcode;
+ __u8 r1;
+ __u16 wrid;
+ __u8 r2[3];
+ __u8 len16;
+ struct fw_ri_isgl isgl;
+};
+
+struct fw_ri_bind_mw_wr {
+ __u8 opcode;
+ __u8 flags;
+ __u16 wrid;
+ __u8 r1[3];
+ __u8 len16;
+ __u8 qpbinde_to_dcacpu;
+ __u8 pgsz_shift;
+ __u8 addr_type;
+ __u8 mem_perms;
+ __be32 stag_mr;
+ __be32 stag_mw;
+ __be32 r3;
+ __be64 len_mw;
+ __be64 va_fbo;
+ __be64 r4;
+};
+
+#define FW_RI_BIND_MW_WR_QPBINDE_S 6
+#define FW_RI_BIND_MW_WR_QPBINDE_M 0x1
+#define FW_RI_BIND_MW_WR_QPBINDE_V(x) ((x) << FW_RI_BIND_MW_WR_QPBINDE_S)
+#define FW_RI_BIND_MW_WR_QPBINDE_G(x) \
+ (((x) >> FW_RI_BIND_MW_WR_QPBINDE_S) & FW_RI_BIND_MW_WR_QPBINDE_M)
+#define FW_RI_BIND_MW_WR_QPBINDE_F FW_RI_BIND_MW_WR_QPBINDE_V(1U)
+
+#define FW_RI_BIND_MW_WR_NS_S 5
+#define FW_RI_BIND_MW_WR_NS_M 0x1
+#define FW_RI_BIND_MW_WR_NS_V(x) ((x) << FW_RI_BIND_MW_WR_NS_S)
+#define FW_RI_BIND_MW_WR_NS_G(x) \
+ (((x) >> FW_RI_BIND_MW_WR_NS_S) & FW_RI_BIND_MW_WR_NS_M)
+#define FW_RI_BIND_MW_WR_NS_F FW_RI_BIND_MW_WR_NS_V(1U)
+
+#define FW_RI_BIND_MW_WR_DCACPU_S 0
+#define FW_RI_BIND_MW_WR_DCACPU_M 0x1f
+#define FW_RI_BIND_MW_WR_DCACPU_V(x) ((x) << FW_RI_BIND_MW_WR_DCACPU_S)
+#define FW_RI_BIND_MW_WR_DCACPU_G(x) \
+ (((x) >> FW_RI_BIND_MW_WR_DCACPU_S) & FW_RI_BIND_MW_WR_DCACPU_M)
+
+struct fw_ri_fr_nsmr_wr {
+ __u8 opcode;
+ __u8 flags;
+ __u16 wrid;
+ __u8 r1[3];
+ __u8 len16;
+ __u8 qpbinde_to_dcacpu;
+ __u8 pgsz_shift;
+ __u8 addr_type;
+ __u8 mem_perms;
+ __be32 stag;
+ __be32 len_hi;
+ __be32 len_lo;
+ __be32 va_hi;
+ __be32 va_lo_fbo;
+};
+
+#define FW_RI_FR_NSMR_WR_QPBINDE_S 6
+#define FW_RI_FR_NSMR_WR_QPBINDE_M 0x1
+#define FW_RI_FR_NSMR_WR_QPBINDE_V(x) ((x) << FW_RI_FR_NSMR_WR_QPBINDE_S)
+#define FW_RI_FR_NSMR_WR_QPBINDE_G(x) \
+ (((x) >> FW_RI_FR_NSMR_WR_QPBINDE_S) & FW_RI_FR_NSMR_WR_QPBINDE_M)
+#define FW_RI_FR_NSMR_WR_QPBINDE_F FW_RI_FR_NSMR_WR_QPBINDE_V(1U)
+
+#define FW_RI_FR_NSMR_WR_NS_S 5
+#define FW_RI_FR_NSMR_WR_NS_M 0x1
+#define FW_RI_FR_NSMR_WR_NS_V(x) ((x) << FW_RI_FR_NSMR_WR_NS_S)
+#define FW_RI_FR_NSMR_WR_NS_G(x) \
+ (((x) >> FW_RI_FR_NSMR_WR_NS_S) & FW_RI_FR_NSMR_WR_NS_M)
+#define FW_RI_FR_NSMR_WR_NS_F FW_RI_FR_NSMR_WR_NS_V(1U)
+
+#define FW_RI_FR_NSMR_WR_DCACPU_S 0
+#define FW_RI_FR_NSMR_WR_DCACPU_M 0x1f
+#define FW_RI_FR_NSMR_WR_DCACPU_V(x) ((x) << FW_RI_FR_NSMR_WR_DCACPU_S)
+#define FW_RI_FR_NSMR_WR_DCACPU_G(x) \
+ (((x) >> FW_RI_FR_NSMR_WR_DCACPU_S) & FW_RI_FR_NSMR_WR_DCACPU_M)
+
+struct fw_ri_inv_lstag_wr {
+ __u8 opcode;
+ __u8 flags;
+ __u16 wrid;
+ __u8 r1[3];
+ __u8 len16;
+ __be32 r2;
+ __be32 stag_inv;
+};
+
+enum fw_ri_type {
+ FW_RI_TYPE_INIT,
+ FW_RI_TYPE_FINI,
+ FW_RI_TYPE_TERMINATE
+};
+
+enum fw_ri_init_p2ptype {
+ FW_RI_INIT_P2PTYPE_RDMA_WRITE = FW_RI_RDMA_WRITE,
+ FW_RI_INIT_P2PTYPE_READ_REQ = FW_RI_READ_REQ,
+ FW_RI_INIT_P2PTYPE_SEND = FW_RI_SEND,
+ FW_RI_INIT_P2PTYPE_SEND_WITH_INV = FW_RI_SEND_WITH_INV,
+ FW_RI_INIT_P2PTYPE_SEND_WITH_SE = FW_RI_SEND_WITH_SE,
+ FW_RI_INIT_P2PTYPE_SEND_WITH_SE_INV = FW_RI_SEND_WITH_SE_INV,
+ FW_RI_INIT_P2PTYPE_DISABLED = 0xf,
+};
+
+struct fw_ri_wr {
+ __be32 op_compl;
+ __be32 flowid_len16;
+ __u64 cookie;
+ union fw_ri {
+ struct fw_ri_init {
+ __u8 type;
+ __u8 mpareqbit_p2ptype;
+ __u8 r4[2];
+ __u8 mpa_attrs;
+ __u8 qp_caps;
+ __be16 nrqe;
+ __be32 pdid;
+ __be32 qpid;
+ __be32 sq_eqid;
+ __be32 rq_eqid;
+ __be32 scqid;
+ __be32 rcqid;
+ __be32 ord_max;
+ __be32 ird_max;
+ __be32 iss;
+ __be32 irs;
+ __be32 hwrqsize;
+ __be32 hwrqaddr;
+ __be64 r5;
+ union fw_ri_init_p2p {
+ struct fw_ri_rdma_write_wr write;
+ struct fw_ri_rdma_read_wr read;
+ struct fw_ri_send_wr send;
+ } u;
+ } init;
+ struct fw_ri_fini {
+ __u8 type;
+ __u8 r3[7];
+ __be64 r4;
+ } fini;
+ struct fw_ri_terminate {
+ __u8 type;
+ __u8 r3[3];
+ __be32 immdlen;
+ __u8 termmsg[40];
+ } terminate;
+ } u;
+};
+
+#define FW_RI_WR_MPAREQBIT_S 7
+#define FW_RI_WR_MPAREQBIT_M 0x1
+#define FW_RI_WR_MPAREQBIT_V(x) ((x) << FW_RI_WR_MPAREQBIT_S)
+#define FW_RI_WR_MPAREQBIT_G(x) \
+ (((x) >> FW_RI_WR_MPAREQBIT_S) & FW_RI_WR_MPAREQBIT_M)
+#define FW_RI_WR_MPAREQBIT_F FW_RI_WR_MPAREQBIT_V(1U)
+
+#define FW_RI_WR_P2PTYPE_S 0
+#define FW_RI_WR_P2PTYPE_M 0xf
+#define FW_RI_WR_P2PTYPE_V(x) ((x) << FW_RI_WR_P2PTYPE_S)
+#define FW_RI_WR_P2PTYPE_G(x) \
+ (((x) >> FW_RI_WR_P2PTYPE_S) & FW_RI_WR_P2PTYPE_M)
+
+struct tcp_options {
+ __be16 mss;
+ __u8 wsf;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u8:4;
+ __u8 unknown:1;
+ __u8:1;
+ __u8 sack:1;
+ __u8 tstamp:1;
+#else
+ __u8 tstamp:1;
+ __u8 sack:1;
+ __u8:1;
+ __u8 unknown:1;
+ __u8:4;
+#endif
+};
+
+struct cpl_pass_accept_req {
+ union opcode_tid ot;
+ __be16 rsvd;
+ __be16 len;
+ __be32 hdr_len;
+ __be16 vlan;
+ __be16 l2info;
+ __be32 tos_stid;
+ struct tcp_options tcpopt;
+};
+
+/* cpl_pass_accept_req.hdr_len fields */
+#define SYN_RX_CHAN_S 0
+#define SYN_RX_CHAN_M 0xF
+#define SYN_RX_CHAN_V(x) ((x) << SYN_RX_CHAN_S)
+#define SYN_RX_CHAN_G(x) (((x) >> SYN_RX_CHAN_S) & SYN_RX_CHAN_M)
+
+#define TCP_HDR_LEN_S 10
+#define TCP_HDR_LEN_M 0x3F
+#define TCP_HDR_LEN_V(x) ((x) << TCP_HDR_LEN_S)
+#define TCP_HDR_LEN_G(x) (((x) >> TCP_HDR_LEN_S) & TCP_HDR_LEN_M)
+
+#define IP_HDR_LEN_S 16
+#define IP_HDR_LEN_M 0x3FF
+#define IP_HDR_LEN_V(x) ((x) << IP_HDR_LEN_S)
+#define IP_HDR_LEN_G(x) (((x) >> IP_HDR_LEN_S) & IP_HDR_LEN_M)
+
+#define ETH_HDR_LEN_S 26
+#define ETH_HDR_LEN_M 0x1F
+#define ETH_HDR_LEN_V(x) ((x) << ETH_HDR_LEN_S)
+#define ETH_HDR_LEN_G(x) (((x) >> ETH_HDR_LEN_S) & ETH_HDR_LEN_M)
+
+/* cpl_pass_accept_req.l2info fields */
+#define SYN_MAC_IDX_S 0
+#define SYN_MAC_IDX_M 0x1FF
+#define SYN_MAC_IDX_V(x) ((x) << SYN_MAC_IDX_S)
+#define SYN_MAC_IDX_G(x) (((x) >> SYN_MAC_IDX_S) & SYN_MAC_IDX_M)
+
+#define SYN_XACT_MATCH_S 9
+#define SYN_XACT_MATCH_V(x) ((x) << SYN_XACT_MATCH_S)
+#define SYN_XACT_MATCH_F SYN_XACT_MATCH_V(1U)
+
+#define SYN_INTF_S 12
+#define SYN_INTF_M 0xF
+#define SYN_INTF_V(x) ((x) << SYN_INTF_S)
+#define SYN_INTF_G(x) (((x) >> SYN_INTF_S) & SYN_INTF_M)
+
+struct ulptx_idata {
+ __be32 cmd_more;
+ __be32 len;
+};
+
+#define ULPTX_NSGE_S 0
+#define ULPTX_NSGE_M 0xFFFF
+#define ULPTX_NSGE_V(x) ((x) << ULPTX_NSGE_S)
+
+#define RX_DACK_MODE_S 29
+#define RX_DACK_MODE_M 0x3
+#define RX_DACK_MODE_V(x) ((x) << RX_DACK_MODE_S)
+#define RX_DACK_MODE_G(x) (((x) >> RX_DACK_MODE_S) & RX_DACK_MODE_M)
+
+#define RX_DACK_CHANGE_S 31
+#define RX_DACK_CHANGE_V(x) ((x) << RX_DACK_CHANGE_S)
+#define RX_DACK_CHANGE_F RX_DACK_CHANGE_V(1U)
+
+enum { /* TCP congestion control algorithms */
+ CONG_ALG_RENO,
+ CONG_ALG_TAHOE,
+ CONG_ALG_NEWRENO,
+ CONG_ALG_HIGHSPEED
+};
+
+#define CONG_CNTRL_S 14
+#define CONG_CNTRL_M 0x3
+#define CONG_CNTRL_V(x) ((x) << CONG_CNTRL_S)
+#define CONG_CNTRL_G(x) (((x) >> CONG_CNTRL_S) & CONG_CNTRL_M)
+
+#define T5_ISS_S 18
+#define T5_ISS_V(x) ((x) << T5_ISS_S)
+#define T5_ISS_F T5_ISS_V(1U)
+
+#endif /* _T4FW_RI_API_H_ */
diff --git a/drivers/infiniband/hw/cxgb4/user.h b/drivers/infiniband/hw/cxgb4/user.h
new file mode 100644
index 000000000..cbd0ce170
--- /dev/null
+++ b/drivers/infiniband/hw/cxgb4/user.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2009-2010 Chelsio, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __C4IW_USER_H__
+#define __C4IW_USER_H__
+
+#define C4IW_UVERBS_ABI_VERSION 2
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
+struct c4iw_create_cq_resp {
+ __u64 key;
+ __u64 gts_key;
+ __u64 memsize;
+ __u32 cqid;
+ __u32 size;
+ __u32 qid_mask;
+ __u32 reserved; /* explicit padding (optional for i386) */
+};
+
+
+enum {
+ C4IW_QPF_ONCHIP = (1<<0)
+};
+
+struct c4iw_create_qp_resp {
+ __u64 ma_sync_key;
+ __u64 sq_key;
+ __u64 rq_key;
+ __u64 sq_db_gts_key;
+ __u64 rq_db_gts_key;
+ __u64 sq_memsize;
+ __u64 rq_memsize;
+ __u32 sqid;
+ __u32 rqid;
+ __u32 sq_size;
+ __u32 rq_size;
+ __u32 qid_mask;
+ __u32 flags;
+};
+
+struct c4iw_alloc_ucontext_resp {
+ __u64 status_page_key;
+ __u32 status_page_size;
+ __u32 reserved; /* explicit padding (optional for i386) */
+};
+#endif
diff --git a/drivers/infiniband/hw/ehca/Kconfig b/drivers/infiniband/hw/ehca/Kconfig
new file mode 100644
index 000000000..59f807d8d
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/Kconfig
@@ -0,0 +1,9 @@
+config INFINIBAND_EHCA
+ tristate "eHCA support"
+ depends on IBMEBUS
+ ---help---
+ This driver supports the IBM pSeries eHCA InfiniBand adapter.
+
+ To compile the driver as a module, choose M here. The module
+ will be called ib_ehca.
+
diff --git a/drivers/infiniband/hw/ehca/Makefile b/drivers/infiniband/hw/ehca/Makefile
new file mode 100644
index 000000000..74d284e46
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/Makefile
@@ -0,0 +1,16 @@
+# Authors: Heiko J Schick <schickhj@de.ibm.com>
+# Christoph Raisch <raisch@de.ibm.com>
+# Joachim Fenkes <fenkes@de.ibm.com>
+#
+# Copyright (c) 2005 IBM Corporation
+#
+# All rights reserved.
+#
+# This source code is distributed under a dual license of GPL v2.0 and OpenIB BSD.
+
+obj-$(CONFIG_INFINIBAND_EHCA) += ib_ehca.o
+
+ib_ehca-objs = ehca_main.o ehca_hca.o ehca_mcast.o ehca_pd.o ehca_av.o ehca_eq.o \
+ ehca_cq.o ehca_qp.o ehca_sqp.o ehca_mrmw.o ehca_reqs.o ehca_irq.o \
+ ehca_uverbs.o ipz_pt_fn.o hcp_if.o hcp_phyp.o
+
diff --git a/drivers/infiniband/hw/ehca/ehca_av.c b/drivers/infiniband/hw/ehca/ehca_av.c
new file mode 100644
index 000000000..465926319
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_av.c
@@ -0,0 +1,277 @@
+/*
+ * IBM eServer eHCA InfiniBand device driver for Linux on POWER
+ *
+ * address vector functions
+ *
+ * Authors: Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ * Khadija Souissi <souissik@de.ibm.com>
+ * Reinhard Ernst <rernst@de.ibm.com>
+ * Christoph Raisch <raisch@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_tools.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+
+static struct kmem_cache *av_cache;
+
+int ehca_calc_ipd(struct ehca_shca *shca, int port,
+ enum ib_rate path_rate, u32 *ipd)
+{
+ int path = ib_rate_to_mult(path_rate);
+ int link, ret;
+ struct ib_port_attr pa;
+
+ if (path_rate == IB_RATE_PORT_CURRENT) {
+ *ipd = 0;
+ return 0;
+ }
+
+ if (unlikely(path < 0)) {
+ ehca_err(&shca->ib_device, "Invalid static rate! path_rate=%x",
+ path_rate);
+ return -EINVAL;
+ }
+
+ ret = ehca_query_port(&shca->ib_device, port, &pa);
+ if (unlikely(ret < 0)) {
+ ehca_err(&shca->ib_device, "Failed to query port ret=%i", ret);
+ return ret;
+ }
+
+ link = ib_width_enum_to_int(pa.active_width) * pa.active_speed;
+
+ if (path >= link)
+ /* no need to throttle if path faster than link */
+ *ipd = 0;
+ else
+ /* IPD = round((link / path) - 1) */
+ *ipd = ((link + (path >> 1)) / path) - 1;
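+ /*
+ * e.g. link = 8 (4X DDR: width 4 * speed 2) and path = 2 (1X DDR)
+ * gives ipd = ((8 + 1) / 2) - 1 = 3, i.e. one packet per four
+ * link slots, matching the 1:4 rate ratio.
+ */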
+
+ return 0;
+}
+
+struct ib_ah *ehca_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+ int ret;
+ struct ehca_av *av;
+ struct ehca_shca *shca = container_of(pd->device, struct ehca_shca,
+ ib_device);
+
+ av = kmem_cache_alloc(av_cache, GFP_KERNEL);
+ if (!av) {
+ ehca_err(pd->device, "Out of memory pd=%p ah_attr=%p",
+ pd, ah_attr);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ av->av.sl = ah_attr->sl;
+ av->av.dlid = ah_attr->dlid;
+ av->av.slid_path_bits = ah_attr->src_path_bits;
+
+ if (ehca_static_rate < 0) {
+ u32 ipd;
+ if (ehca_calc_ipd(shca, ah_attr->port_num,
+ ah_attr->static_rate, &ipd)) {
+ ret = -EINVAL;
+ goto create_ah_exit1;
+ }
+ av->av.ipd = ipd;
+ } else
+ av->av.ipd = ehca_static_rate;
+
+ av->av.lnh = ah_attr->ah_flags;
+ av->av.grh.word_0 = EHCA_BMASK_SET(GRH_IPVERSION_MASK, 6);
+ av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_TCLASS_MASK,
+ ah_attr->grh.traffic_class);
+ av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_FLOWLABEL_MASK,
+ ah_attr->grh.flow_label);
+ av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_HOPLIMIT_MASK,
+ ah_attr->grh.hop_limit);
+ av->av.grh.word_0 |= EHCA_BMASK_SET(GRH_NEXTHEADER_MASK, 0x1B);
+ /* set sgid in grh.word_1 */
+ if (ah_attr->ah_flags & IB_AH_GRH) {
+ int rc;
+ struct ib_port_attr port_attr;
+ union ib_gid gid;
+ memset(&port_attr, 0, sizeof(port_attr));
+ rc = ehca_query_port(pd->device, ah_attr->port_num,
+ &port_attr);
+ if (rc) { /* invalid port number */
+ ret = -EINVAL;
+ ehca_err(pd->device, "Invalid port number "
+ "ehca_query_port() returned %x "
+ "pd=%p ah_attr=%p", rc, pd, ah_attr);
+ goto create_ah_exit1;
+ }
+ memset(&gid, 0, sizeof(gid));
+ rc = ehca_query_gid(pd->device,
+ ah_attr->port_num,
+ ah_attr->grh.sgid_index, &gid);
+ if (rc) {
+ ret = -EINVAL;
+ ehca_err(pd->device, "Failed to retrieve sgid "
+ "ehca_query_gid() returned %x "
+ "pd=%p ah_attr=%p", rc, pd, ah_attr);
+ goto create_ah_exit1;
+ }
+ memcpy(&av->av.grh.word_1, &gid, sizeof(gid));
+ }
+ av->av.pmtu = shca->max_mtu;
+
+ /* dgid comes in grh.word_3 */
+ memcpy(&av->av.grh.word_3, &ah_attr->grh.dgid,
+ sizeof(ah_attr->grh.dgid));
+
+ return &av->ib_ah;
+
+create_ah_exit1:
+ kmem_cache_free(av_cache, av);
+
+ return ERR_PTR(ret);
+}
+
+int ehca_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+ struct ehca_av *av;
+ struct ehca_ud_av new_ehca_av;
+ struct ehca_shca *shca = container_of(ah->pd->device, struct ehca_shca,
+ ib_device);
+
+ memset(&new_ehca_av, 0, sizeof(new_ehca_av));
+ new_ehca_av.sl = ah_attr->sl;
+ new_ehca_av.dlid = ah_attr->dlid;
+ new_ehca_av.slid_path_bits = ah_attr->src_path_bits;
+ new_ehca_av.ipd = ah_attr->static_rate;
+ new_ehca_av.lnh = EHCA_BMASK_SET(GRH_FLAG_MASK,
+ (ah_attr->ah_flags & IB_AH_GRH) > 0);
+ new_ehca_av.grh.word_0 = EHCA_BMASK_SET(GRH_TCLASS_MASK,
+ ah_attr->grh.traffic_class);
+ new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_FLOWLABEL_MASK,
+ ah_attr->grh.flow_label);
+ new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_HOPLIMIT_MASK,
+ ah_attr->grh.hop_limit);
+ new_ehca_av.grh.word_0 |= EHCA_BMASK_SET(GRH_NEXTHEADER_MASK, 0x1b);
+
+ /* set sgid in grh.word_1 */
+ if (ah_attr->ah_flags & IB_AH_GRH) {
+ int rc;
+ struct ib_port_attr port_attr;
+ union ib_gid gid;
+ memset(&port_attr, 0, sizeof(port_attr));
+ rc = ehca_query_port(ah->device, ah_attr->port_num,
+ &port_attr);
+ if (rc) { /* invalid port number */
+ ehca_err(ah->device, "Invalid port number "
+ "ehca_query_port() returned %x "
+ "ah=%p ah_attr=%p port_num=%x",
+ rc, ah, ah_attr, ah_attr->port_num);
+ return -EINVAL;
+ }
+ memset(&gid, 0, sizeof(gid));
+ rc = ehca_query_gid(ah->device,
+ ah_attr->port_num,
+ ah_attr->grh.sgid_index, &gid);
+ if (rc) {
+ ehca_err(ah->device, "Failed to retrieve sgid "
+ "ehca_query_gid() returned %x "
+ "ah=%p ah_attr=%p port_num=%x "
+ "sgid_index=%x",
+ rc, ah, ah_attr, ah_attr->port_num,
+ ah_attr->grh.sgid_index);
+ return -EINVAL;
+ }
+ memcpy(&new_ehca_av.grh.word_1, &gid, sizeof(gid));
+ }
+
+ new_ehca_av.pmtu = shca->max_mtu;
+
+ memcpy(&new_ehca_av.grh.word_3, &ah_attr->grh.dgid,
+ sizeof(ah_attr->grh.dgid));
+
+ av = container_of(ah, struct ehca_av, ib_ah);
+ av->av = new_ehca_av;
+
+ return 0;
+}
+
+int ehca_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr)
+{
+ struct ehca_av *av = container_of(ah, struct ehca_av, ib_ah);
+
+ memcpy(&ah_attr->grh.dgid, &av->av.grh.word_3,
+ sizeof(ah_attr->grh.dgid));
+ ah_attr->sl = av->av.sl;
+
+ ah_attr->dlid = av->av.dlid;
+
+ ah_attr->src_path_bits = av->av.slid_path_bits;
+ ah_attr->static_rate = av->av.ipd;
+ ah_attr->ah_flags = EHCA_BMASK_GET(GRH_FLAG_MASK, av->av.lnh);
+ ah_attr->grh.traffic_class = EHCA_BMASK_GET(GRH_TCLASS_MASK,
+ av->av.grh.word_0);
+ ah_attr->grh.hop_limit = EHCA_BMASK_GET(GRH_HOPLIMIT_MASK,
+ av->av.grh.word_0);
+ ah_attr->grh.flow_label = EHCA_BMASK_GET(GRH_FLOWLABEL_MASK,
+ av->av.grh.word_0);
+
+ return 0;
+}
+
+int ehca_destroy_ah(struct ib_ah *ah)
+{
+ kmem_cache_free(av_cache, container_of(ah, struct ehca_av, ib_ah));
+
+ return 0;
+}
+
+int ehca_init_av_cache(void)
+{
+ av_cache = kmem_cache_create("ehca_cache_av",
+ sizeof(struct ehca_av), 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!av_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void ehca_cleanup_av_cache(void)
+{
+ if (av_cache)
+ kmem_cache_destroy(av_cache);
+}
diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h
new file mode 100644
index 000000000..bd45e0f39
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_classes.h
@@ -0,0 +1,482 @@
+/*
+ * IBM eServer eHCA InfiniBand device driver for Linux on POWER
+ *
+ * Struct definition for eHCA internal structures
+ *
+ * Authors: Heiko J Schick <schickhj@de.ibm.com>
+ * Christoph Raisch <raisch@de.ibm.com>
+ * Joachim Fenkes <fenkes@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __EHCA_CLASSES_H__
+#define __EHCA_CLASSES_H__
+
+struct ehca_module;
+struct ehca_qp;
+struct ehca_cq;
+struct ehca_eq;
+struct ehca_mr;
+struct ehca_mw;
+struct ehca_pd;
+struct ehca_av;
+
+#include <linux/wait.h>
+#include <linux/mutex.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+
+#ifdef CONFIG_PPC64
+#include "ehca_classes_pSeries.h"
+#endif
+#include "ipz_pt_fn.h"
+#include "ehca_qes.h"
+#include "ehca_irq.h"
+
+#define EHCA_EQE_CACHE_SIZE 20
+#define EHCA_MAX_NUM_QUEUES 0xffff
+
+struct ehca_eqe_cache_entry {
+ struct ehca_eqe *eqe;
+ struct ehca_cq *cq;
+};
+
+struct ehca_eq {
+ u32 length;
+ struct ipz_queue ipz_queue;
+ struct ipz_eq_handle ipz_eq_handle;
+ struct work_struct work;
+ struct h_galpas galpas;
+ int is_initialized;
+ struct ehca_pfeq pf;
+ spinlock_t spinlock;
+ struct tasklet_struct interrupt_task;
+ u32 ist;
+ spinlock_t irq_spinlock;
+ struct ehca_eqe_cache_entry eqe_cache[EHCA_EQE_CACHE_SIZE];
+};
+
+struct ehca_sma_attr {
+ u16 lid, lmc, sm_sl, sm_lid;
+ u16 pkey_tbl_len, pkeys[16];
+};
+
+struct ehca_sport {
+ struct ib_cq *ibcq_aqp1;
+ struct ib_qp *ibqp_sqp[2];
+ /* lock to serialize modify_qp() calls for sqp in normal
+ * and irq path (when event PORT_ACTIVE is received first time)
+ */
+ spinlock_t mod_sqp_lock;
+ enum ib_port_state port_state;
+ struct ehca_sma_attr saved_attr;
+ u32 pma_qp_nr;
+};
+
+#define HCA_CAP_MR_PGSIZE_4K 0x80000000
+#define HCA_CAP_MR_PGSIZE_64K 0x40000000
+#define HCA_CAP_MR_PGSIZE_1M 0x20000000
+#define HCA_CAP_MR_PGSIZE_16M 0x10000000
+
+struct ehca_shca {
+ struct ib_device ib_device;
+ struct platform_device *ofdev;
+ u8 num_ports;
+ int hw_level;
+ struct list_head shca_list;
+ struct ipz_adapter_handle ipz_hca_handle;
+ struct ehca_sport sport[2];
+ struct ehca_eq eq;
+ struct ehca_eq neq;
+ struct ehca_mr *maxmr;
+ struct ehca_pd *pd;
+ struct h_galpas galpas;
+ struct mutex modify_mutex;
+ u64 hca_cap;
+ /* MR pgsize: bits 0-3 mean 4K, 64K, 1M, 16M respectively */
+ u32 hca_cap_mr_pgsize;
+ int max_mtu;
+ int max_num_qps;
+ int max_num_cqs;
+ atomic_t num_cqs;
+ atomic_t num_qps;
+};
+
+struct ehca_pd {
+ struct ib_pd ib_pd;
+ struct ipz_pd fw_pd;
+ /* small queue mgmt */
+ struct mutex lock;
+ struct list_head free[2];
+ struct list_head full[2];
+};
+
+enum ehca_ext_qp_type {
+ EQPT_NORMAL = 0,
+ EQPT_LLQP = 1,
+ EQPT_SRQBASE = 2,
+ EQPT_SRQ = 3,
+};
+
+/* struct to cache modify_qp()'s parms for GSI/SMI qp */
+struct ehca_mod_qp_parm {
+ int mask;
+ struct ib_qp_attr attr;
+};
+
+#define EHCA_MOD_QP_PARM_MAX 4
+
+#define QMAP_IDX_MASK 0xFFFFULL
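+/* QMAP_IDX_MASK presumably selects the qmap index from the low 16 bits of a wqe's work request id */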
+
+/* struct for tracking if cqes have been reported to the application */
+struct ehca_qmap_entry {
+ u16 app_wr_id;
+ u8 reported;
+ u8 cqe_req;
+};
+
+struct ehca_queue_map {
+ struct ehca_qmap_entry *map;
+ unsigned int entries;
+ unsigned int tail;
+ unsigned int left_to_poll;
+ unsigned int next_wqe_idx; /* Idx to first wqe to be flushed */
+};
+
+/* function to calculate the next index for the qmap */
+static inline unsigned int next_index(unsigned int cur_index, unsigned int limit)
+{
+ unsigned int temp = cur_index + 1;
+ return (temp == limit) ? 0 : temp;
+}
+
+struct ehca_qp {
+ union {
+ struct ib_qp ib_qp;
+ struct ib_srq ib_srq;
+ };
+ u32 qp_type;
+ enum ehca_ext_qp_type ext_type;
+ enum ib_qp_state state;
+ struct ipz_queue ipz_squeue;
+ struct ehca_queue_map sq_map;
+ struct ipz_queue ipz_rqueue;
+ struct ehca_queue_map rq_map;
+ struct h_galpas galpas;
+ u32 qkey;
+ u32 real_qp_num;
+ u32 token;
+ spinlock_t spinlock_s;
+ spinlock_t spinlock_r;
+ u32 sq_max_inline_data_size;
+ struct ipz_qp_handle ipz_qp_handle;
+ struct ehca_pfqp pf;
+ struct ib_qp_init_attr init_attr;
+ struct ehca_cq *send_cq;
+ struct ehca_cq *recv_cq;
+ unsigned int sqerr_purgeflag;
+ struct hlist_node list_entries;
+ /* array to cache modify_qp()'s parms for GSI/SMI qp */
+ struct ehca_mod_qp_parm *mod_qp_parm;
+ int mod_qp_parm_idx;
+ /* mmap counter for resources mapped into user space */
+ u32 mm_count_squeue;
+ u32 mm_count_rqueue;
+ u32 mm_count_galpa;
+ /* unsolicited ack circumvention */
+ int unsol_ack_circ;
+ int mtu_shift;
+ u32 message_count;
+ u32 packet_count;
+ atomic_t nr_events; /* events seen */
+ wait_queue_head_t wait_completion;
+ int mig_armed;
+ struct list_head sq_err_node;
+ struct list_head rq_err_node;
+};
+
+#define IS_SRQ(qp) (qp->ext_type == EQPT_SRQ)
+#define HAS_SQ(qp) (qp->ext_type != EQPT_SRQ)
+#define HAS_RQ(qp) (qp->ext_type != EQPT_SRQBASE)
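+/* i.e. an SRQ itself has no send queue, and an SRQ-based QP has no receive queue of its own */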
+
+/* must be power of 2 */
+#define QP_HASHTAB_LEN 8
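+/* hash key is (real_qp_num & (QP_HASHTAB_LEN - 1)), see ehca_cq_assign_qp() */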
+
+struct ehca_cq {
+ struct ib_cq ib_cq;
+ struct ipz_queue ipz_queue;
+ struct h_galpas galpas;
+ spinlock_t spinlock;
+ u32 cq_number;
+ u32 token;
+ u32 nr_of_entries;
+ struct ipz_cq_handle ipz_cq_handle;
+ struct ehca_pfcq pf;
+ spinlock_t cb_lock;
+ struct hlist_head qp_hashtab[QP_HASHTAB_LEN];
+ struct list_head entry;
+ u32 nr_callbacks; /* #events assigned to cpu by scaling code */
+ atomic_t nr_events; /* #events seen */
+ wait_queue_head_t wait_completion;
+ spinlock_t task_lock;
+ /* mmap counter for resources mapped into user space */
+ u32 mm_count_queue;
+ u32 mm_count_galpa;
+ struct list_head sqp_err_list;
+ struct list_head rqp_err_list;
+};
+
+enum ehca_mr_flag {
+ EHCA_MR_FLAG_FMR = 0x80000000, /* FMR, created with ehca_alloc_fmr */
+ EHCA_MR_FLAG_MAXMR = 0x40000000, /* max-MR */
+};
+
+struct ehca_mr {
+ union {
+ struct ib_mr ib_mr; /* must always be first in ehca_mr */
+ struct ib_fmr ib_fmr; /* must always be first in ehca_mr */
+ } ib;
+ struct ib_umem *umem;
+ spinlock_t mrlock;
+
+ enum ehca_mr_flag flags;
+ u32 num_kpages; /* number of kernel pages */
+ u32 num_hwpages; /* number of hw pages to form MR */
+ u64 hwpage_size; /* hw page size used for this MR */
+ int acl; /* ACL (stored here for usage in reregister) */
+ u64 *start; /* virtual start address (stored here for */
+ /* usage in reregister) */
+ u64 size; /* size (stored here for usage in reregister) */
+ u32 fmr_page_size; /* page size for FMR */
+ u32 fmr_max_pages; /* max pages for FMR */
+ u32 fmr_max_maps; /* max outstanding maps for FMR */
+ u32 fmr_map_cnt; /* map counter for FMR */
+ /* fw specific data */
+ struct ipz_mrmw_handle ipz_mr_handle; /* MR handle for h-calls */
+ struct h_galpas galpas;
+};
+
+struct ehca_mw {
+ struct ib_mw ib_mw; /* gen2 mw, must always be first in ehca_mw */
+ spinlock_t mwlock;
+
+ u8 never_bound; /* indication MW was never bound */
+ struct ipz_mrmw_handle ipz_mw_handle; /* MW handle for h-calls */
+ struct h_galpas galpas;
+};
+
+enum ehca_mr_pgi_type {
+ EHCA_MR_PGI_PHYS = 1, /* type of ehca_reg_phys_mr,
+ * ehca_rereg_phys_mr,
+ * ehca_reg_internal_maxmr */
+ EHCA_MR_PGI_USER = 2, /* type of ehca_reg_user_mr */
+ EHCA_MR_PGI_FMR = 3 /* type of ehca_map_phys_fmr */
+};
+
+struct ehca_mr_pginfo {
+ enum ehca_mr_pgi_type type;
+ u64 num_kpages;
+ u64 kpage_cnt;
+ u64 hwpage_size; /* hw page size used for this MR */
+ u64 num_hwpages; /* number of hw pages */
+ u64 hwpage_cnt; /* counter for hw pages */
+ u64 next_hwpage; /* next hw page in buffer/chunk/listelem */
+
+ union {
+ struct { /* type EHCA_MR_PGI_PHYS section */
+ int num_phys_buf;
+ struct ib_phys_buf *phys_buf_array;
+ u64 next_buf;
+ } phy;
+ struct { /* type EHCA_MR_PGI_USER section */
+ struct ib_umem *region;
+ struct scatterlist *next_sg;
+ u64 next_nmap;
+ } usr;
+ struct { /* type EHCA_MR_PGI_FMR section */
+ u64 fmr_pgsize;
+ u64 *page_list;
+ u64 next_listelem;
+ } fmr;
+ } u;
+};
+
+/* output parameters for MR/FMR hipz calls */
+struct ehca_mr_hipzout_parms {
+ struct ipz_mrmw_handle handle;
+ u32 lkey;
+ u32 rkey;
+ u64 len;
+ u64 vaddr;
+ u32 acl;
+};
+
+/* output parameters for MW hipz calls */
+struct ehca_mw_hipzout_parms {
+ struct ipz_mrmw_handle handle;
+ u32 rkey;
+};
+
+struct ehca_av {
+ struct ib_ah ib_ah;
+ struct ehca_ud_av av;
+};
+
+struct ehca_ucontext {
+ struct ib_ucontext ib_ucontext;
+};
+
+int ehca_init_pd_cache(void);
+void ehca_cleanup_pd_cache(void);
+int ehca_init_cq_cache(void);
+void ehca_cleanup_cq_cache(void);
+int ehca_init_qp_cache(void);
+void ehca_cleanup_qp_cache(void);
+int ehca_init_av_cache(void);
+void ehca_cleanup_av_cache(void);
+int ehca_init_mrmw_cache(void);
+void ehca_cleanup_mrmw_cache(void);
+int ehca_init_small_qp_cache(void);
+void ehca_cleanup_small_qp_cache(void);
+
+extern rwlock_t ehca_qp_idr_lock;
+extern rwlock_t ehca_cq_idr_lock;
+extern struct idr ehca_qp_idr;
+extern struct idr ehca_cq_idr;
+extern spinlock_t shca_list_lock;
+
+extern int ehca_static_rate;
+extern int ehca_port_act_time;
+extern bool ehca_use_hp_mr;
+extern bool ehca_scaling_code;
+extern int ehca_lock_hcalls;
+extern int ehca_nr_ports;
+extern int ehca_max_cq;
+extern int ehca_max_qp;
+
+struct ipzu_queue_resp {
+ u32 qe_size; /* queue entry size */
+ u32 act_nr_of_sg;
+ u32 queue_length; /* queue length allocated in bytes */
+ u32 pagesize;
+ u32 toggle_state;
+ u32 offset; /* save offset within a page for small_qp */
+};
+
+struct ehca_create_cq_resp {
+ u32 cq_number;
+ u32 token;
+ struct ipzu_queue_resp ipz_queue;
+ u32 fw_handle_ofs;
+ u32 dummy;
+};
+
+struct ehca_create_qp_resp {
+ u32 qp_num;
+ u32 token;
+ u32 qp_type;
+ u32 ext_type;
+ u32 qkey;
+ /* qp_num assigned by ehca: sqp0/1 may have been given different numbers */
+ u32 real_qp_num;
+ u32 fw_handle_ofs;
+ u32 dummy;
+ struct ipzu_queue_resp ipz_squeue;
+ struct ipzu_queue_resp ipz_rqueue;
+};
+
+struct ehca_alloc_cq_parms {
+ u32 nr_cqe;
+ u32 act_nr_of_entries;
+ u32 act_pages;
+ struct ipz_eq_handle eq_handle;
+};
+
+enum ehca_service_type {
+ ST_RC = 0,
+ ST_UC = 1,
+ ST_RD = 2,
+ ST_UD = 3,
+};
+
+enum ehca_ll_comp_flags {
+ LLQP_SEND_COMP = 0x20,
+ LLQP_RECV_COMP = 0x40,
+ LLQP_COMP_MASK = 0x60,
+};
+
+struct ehca_alloc_queue_parms {
+ /* input parameters */
+ int max_wr;
+ int max_sge;
+ int page_size;
+ int is_small;
+
+ /* output parameters */
+ u16 act_nr_wqes;
+ u8 act_nr_sges;
+ u32 queue_size; /* bytes for small queues, pages otherwise */
+};
+
+struct ehca_alloc_qp_parms {
+ struct ehca_alloc_queue_parms squeue;
+ struct ehca_alloc_queue_parms rqueue;
+
+ /* input parameters */
+ enum ehca_service_type servicetype;
+ int qp_storage;
+ int sigtype;
+ enum ehca_ext_qp_type ext_type;
+ enum ehca_ll_comp_flags ll_comp_flags;
+ int ud_av_l_key_ctl;
+
+ u32 token;
+ struct ipz_eq_handle eq_handle;
+ struct ipz_pd pd;
+ struct ipz_cq_handle send_cq_handle, recv_cq_handle;
+
+ u32 srq_qpn, srq_token, srq_limit;
+
+ /* output parameters */
+ u32 real_qp_num;
+ struct ipz_qp_handle qp_handle;
+ struct h_galpas galpas;
+};
+
+int ehca_cq_assign_qp(struct ehca_cq *cq, struct ehca_qp *qp);
+int ehca_cq_unassign_qp(struct ehca_cq *cq, unsigned int qp_num);
+struct ehca_qp *ehca_cq_get_qp(struct ehca_cq *cq, int qp_num);
+
+#endif
diff --git a/drivers/infiniband/hw/ehca/ehca_classes_pSeries.h b/drivers/infiniband/hw/ehca/ehca_classes_pSeries.h
new file mode 100644
index 000000000..689c35786
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_classes_pSeries.h
@@ -0,0 +1,208 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * pSeries interface definitions
+ *
+ * Authors: Waleri Fomin <fomin@de.ibm.com>
+ * Christoph Raisch <raisch@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __EHCA_CLASSES_PSERIES_H__
+#define __EHCA_CLASSES_PSERIES_H__
+
+#include "hcp_phyp.h"
+#include "ipz_pt_fn.h"
+
+
+struct ehca_pfqp {
+ struct ipz_qpt sqpt;
+ struct ipz_qpt rqpt;
+};
+
+struct ehca_pfcq {
+ struct ipz_qpt qpt;
+ u32 cqnr;
+};
+
+struct ehca_pfeq {
+ struct ipz_qpt qpt;
+ struct h_galpa galpa;
+ u32 eqnr;
+};
+
+struct ipz_adapter_handle {
+ u64 handle;
+};
+
+struct ipz_cq_handle {
+ u64 handle;
+};
+
+struct ipz_eq_handle {
+ u64 handle;
+};
+
+struct ipz_qp_handle {
+ u64 handle;
+};
+struct ipz_mrmw_handle {
+ u64 handle;
+};
+
+struct ipz_pd {
+ u32 value;
+};
+
+struct hcp_modify_qp_control_block {
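+ /* the numbers in the field comments below are u32 word offsets within this block */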
+ u32 qkey; /* 00 */
+ u32 rdd; /* reliable datagram domain */
+ u32 send_psn; /* 02 */
+ u32 receive_psn; /* 03 */
+ u32 prim_phys_port; /* 04 */
+ u32 alt_phys_port; /* 05 */
+ u32 prim_p_key_idx; /* 06 */
+ u32 alt_p_key_idx; /* 07 */
+ u32 rdma_atomic_ctrl; /* 08 */
+ u32 qp_state; /* 09 */
+ u32 reserved_10; /* 10 */
+ u32 rdma_nr_atomic_resp_res; /* 11 */
+ u32 path_migration_state; /* 12 */
+ u32 rdma_atomic_outst_dest_qp; /* 13 */
+ u32 dest_qp_nr; /* 14 */
+ u32 min_rnr_nak_timer_field; /* 15 */
+ u32 service_level; /* 16 */
+ u32 send_grh_flag; /* 17 */
+ u32 retry_count; /* 18 */
+ u32 timeout; /* 19 */
+ u32 path_mtu; /* 20 */
+ u32 max_static_rate; /* 21 */
+ u32 dlid; /* 22 */
+ u32 rnr_retry_count; /* 23 */
+ u32 source_path_bits; /* 24 */
+ u32 traffic_class; /* 25 */
+ u32 hop_limit; /* 26 */
+ u32 source_gid_idx; /* 27 */
+ u32 flow_label; /* 28 */
+ u32 reserved_29; /* 29 */
+ union { /* 30 */
+ u64 dw[2];
+ u8 byte[16];
+ } dest_gid;
+ u32 service_level_al; /* 34 */
+ u32 send_grh_flag_al; /* 35 */
+ u32 retry_count_al; /* 36 */
+ u32 timeout_al; /* 37 */
+ u32 max_static_rate_al; /* 38 */
+ u32 dlid_al; /* 39 */
+ u32 rnr_retry_count_al; /* 40 */
+ u32 source_path_bits_al; /* 41 */
+ u32 traffic_class_al; /* 42 */
+ u32 hop_limit_al; /* 43 */
+ u32 source_gid_idx_al; /* 44 */
+ u32 flow_label_al; /* 45 */
+ u32 reserved_46; /* 46 */
+ u32 reserved_47; /* 47 */
+ union { /* 48 */
+ u64 dw[2];
+ u8 byte[16];
+ } dest_gid_al;
+ u32 max_nr_outst_send_wr; /* 52 */
+ u32 max_nr_outst_recv_wr; /* 53 */
+ u32 disable_ete_credit_check; /* 54 */
+ u32 qp_number; /* 55 */
+ u64 send_queue_handle; /* 56 */
+ u64 recv_queue_handle; /* 58 */
+ u32 actual_nr_sges_in_sq_wqe; /* 60 */
+ u32 actual_nr_sges_in_rq_wqe; /* 61 */
+ u32 qp_enable; /* 62 */
+ u32 curr_srq_limit; /* 63 */
+ u64 qp_aff_asyn_ev_log_reg; /* 64 */
+ u64 shared_rq_hndl; /* 66 */
+ u64 trigg_doorbell_qp_hndl; /* 68 */
+ u32 reserved_70_127[58]; /* 70 */
+};
+
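+/* presumably: each MQPCB_MASK_* is a single bit of the update mask passed to the
+ * modify_qp h-call, selecting the corresponding control block field
+ */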
+#define MQPCB_MASK_QKEY EHCA_BMASK_IBM( 0, 0)
+#define MQPCB_MASK_SEND_PSN EHCA_BMASK_IBM( 2, 2)
+#define MQPCB_MASK_RECEIVE_PSN EHCA_BMASK_IBM( 3, 3)
+#define MQPCB_MASK_PRIM_PHYS_PORT EHCA_BMASK_IBM( 4, 4)
+#define MQPCB_PRIM_PHYS_PORT EHCA_BMASK_IBM(24, 31)
+#define MQPCB_MASK_ALT_PHYS_PORT EHCA_BMASK_IBM( 5, 5)
+#define MQPCB_MASK_PRIM_P_KEY_IDX EHCA_BMASK_IBM( 6, 6)
+#define MQPCB_PRIM_P_KEY_IDX EHCA_BMASK_IBM(24, 31)
+#define MQPCB_MASK_ALT_P_KEY_IDX EHCA_BMASK_IBM( 7, 7)
+#define MQPCB_MASK_RDMA_ATOMIC_CTRL EHCA_BMASK_IBM( 8, 8)
+#define MQPCB_MASK_QP_STATE EHCA_BMASK_IBM( 9, 9)
+#define MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES EHCA_BMASK_IBM(11, 11)
+#define MQPCB_MASK_PATH_MIGRATION_STATE EHCA_BMASK_IBM(12, 12)
+#define MQPCB_MASK_RDMA_ATOMIC_OUTST_DEST_QP EHCA_BMASK_IBM(13, 13)
+#define MQPCB_MASK_DEST_QP_NR EHCA_BMASK_IBM(14, 14)
+#define MQPCB_MASK_MIN_RNR_NAK_TIMER_FIELD EHCA_BMASK_IBM(15, 15)
+#define MQPCB_MASK_SERVICE_LEVEL EHCA_BMASK_IBM(16, 16)
+#define MQPCB_MASK_SEND_GRH_FLAG EHCA_BMASK_IBM(17, 17)
+#define MQPCB_MASK_RETRY_COUNT EHCA_BMASK_IBM(18, 18)
+#define MQPCB_MASK_TIMEOUT EHCA_BMASK_IBM(19, 19)
+#define MQPCB_MASK_PATH_MTU EHCA_BMASK_IBM(20, 20)
+#define MQPCB_MASK_MAX_STATIC_RATE EHCA_BMASK_IBM(21, 21)
+#define MQPCB_MASK_DLID EHCA_BMASK_IBM(22, 22)
+#define MQPCB_MASK_RNR_RETRY_COUNT EHCA_BMASK_IBM(23, 23)
+#define MQPCB_MASK_SOURCE_PATH_BITS EHCA_BMASK_IBM(24, 24)
+#define MQPCB_MASK_TRAFFIC_CLASS EHCA_BMASK_IBM(25, 25)
+#define MQPCB_MASK_HOP_LIMIT EHCA_BMASK_IBM(26, 26)
+#define MQPCB_MASK_SOURCE_GID_IDX EHCA_BMASK_IBM(27, 27)
+#define MQPCB_MASK_FLOW_LABEL EHCA_BMASK_IBM(28, 28)
+#define MQPCB_MASK_DEST_GID EHCA_BMASK_IBM(30, 30)
+#define MQPCB_MASK_SERVICE_LEVEL_AL EHCA_BMASK_IBM(31, 31)
+#define MQPCB_MASK_SEND_GRH_FLAG_AL EHCA_BMASK_IBM(32, 32)
+#define MQPCB_MASK_RETRY_COUNT_AL EHCA_BMASK_IBM(33, 33)
+#define MQPCB_MASK_TIMEOUT_AL EHCA_BMASK_IBM(34, 34)
+#define MQPCB_MASK_MAX_STATIC_RATE_AL EHCA_BMASK_IBM(35, 35)
+#define MQPCB_MASK_DLID_AL EHCA_BMASK_IBM(36, 36)
+#define MQPCB_MASK_RNR_RETRY_COUNT_AL EHCA_BMASK_IBM(37, 37)
+#define MQPCB_MASK_SOURCE_PATH_BITS_AL EHCA_BMASK_IBM(38, 38)
+#define MQPCB_MASK_TRAFFIC_CLASS_AL EHCA_BMASK_IBM(39, 39)
+#define MQPCB_MASK_HOP_LIMIT_AL EHCA_BMASK_IBM(40, 40)
+#define MQPCB_MASK_SOURCE_GID_IDX_AL EHCA_BMASK_IBM(41, 41)
+#define MQPCB_MASK_FLOW_LABEL_AL EHCA_BMASK_IBM(42, 42)
+#define MQPCB_MASK_DEST_GID_AL EHCA_BMASK_IBM(44, 44)
+#define MQPCB_MASK_MAX_NR_OUTST_SEND_WR EHCA_BMASK_IBM(45, 45)
+#define MQPCB_MASK_MAX_NR_OUTST_RECV_WR EHCA_BMASK_IBM(46, 46)
+#define MQPCB_MASK_DISABLE_ETE_CREDIT_CHECK EHCA_BMASK_IBM(47, 47)
+#define MQPCB_MASK_QP_ENABLE EHCA_BMASK_IBM(48, 48)
+#define MQPCB_MASK_CURR_SRQ_LIMIT EHCA_BMASK_IBM(49, 49)
+#define MQPCB_MASK_QP_AFF_ASYN_EV_LOG_REG EHCA_BMASK_IBM(50, 50)
+#define MQPCB_MASK_SHARED_RQ_HNDL EHCA_BMASK_IBM(51, 51)
+
+#endif /* __EHCA_CLASSES_PSERIES_H__ */
diff --git a/drivers/infiniband/hw/ehca/ehca_cq.c b/drivers/infiniband/hw/ehca/ehca_cq.c
new file mode 100644
index 000000000..8cc837537
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_cq.c
@@ -0,0 +1,392 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * Completion queue handling
+ *
+ * Authors: Waleri Fomin <fomin@de.ibm.com>
+ * Khadija Souissi <souissi@de.ibm.com>
+ * Reinhard Ernst <rernst@de.ibm.com>
+ * Heiko J Schick <schickhj@de.ibm.com>
+ * Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_iverbs.h"
+#include "ehca_classes.h"
+#include "ehca_irq.h"
+#include "hcp_if.h"
+
+static struct kmem_cache *cq_cache;
+
+int ehca_cq_assign_qp(struct ehca_cq *cq, struct ehca_qp *qp)
+{
+ unsigned int qp_num = qp->real_qp_num;
+ unsigned int key = qp_num & (QP_HASHTAB_LEN-1);
+ unsigned long flags;
+
+ spin_lock_irqsave(&cq->spinlock, flags);
+ hlist_add_head(&qp->list_entries, &cq->qp_hashtab[key]);
+ spin_unlock_irqrestore(&cq->spinlock, flags);
+
+ ehca_dbg(cq->ib_cq.device, "cq_num=%x real_qp_num=%x",
+ cq->cq_number, qp_num);
+
+ return 0;
+}
+
+int ehca_cq_unassign_qp(struct ehca_cq *cq, unsigned int real_qp_num)
+{
+ int ret = -EINVAL;
+ unsigned int key = real_qp_num & (QP_HASHTAB_LEN-1);
+ struct hlist_node *iter;
+ struct ehca_qp *qp;
+ unsigned long flags;
+
+ spin_lock_irqsave(&cq->spinlock, flags);
+ hlist_for_each(iter, &cq->qp_hashtab[key]) {
+ qp = hlist_entry(iter, struct ehca_qp, list_entries);
+ if (qp->real_qp_num == real_qp_num) {
+ hlist_del(iter);
+ ehca_dbg(cq->ib_cq.device,
+ "removed qp from cq .cq_num=%x real_qp_num=%x",
+ cq->cq_number, real_qp_num);
+ ret = 0;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&cq->spinlock, flags);
+ if (ret)
+ ehca_err(cq->ib_cq.device,
+ "qp not found cq_num=%x real_qp_num=%x",
+ cq->cq_number, real_qp_num);
+
+ return ret;
+}
+
+struct ehca_qp *ehca_cq_get_qp(struct ehca_cq *cq, int real_qp_num)
+{
+ struct ehca_qp *ret = NULL;
+ unsigned int key = real_qp_num & (QP_HASHTAB_LEN-1);
+ struct hlist_node *iter;
+ struct ehca_qp *qp;
+ hlist_for_each(iter, &cq->qp_hashtab[key]) {
+ qp = hlist_entry(iter, struct ehca_qp, list_entries);
+ if (qp->real_qp_num == real_qp_num) {
+ ret = qp;
+ break;
+ }
+ }
+ return ret;
+}
+
+struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ static const u32 additional_cqe = 20;
+ struct ib_cq *cq;
+ struct ehca_cq *my_cq;
+ struct ehca_shca *shca =
+ container_of(device, struct ehca_shca, ib_device);
+ struct ipz_adapter_handle adapter_handle;
+ struct ehca_alloc_cq_parms param; /* h_call's out parameters */
+ struct h_galpa gal;
+ void *vpage;
+ u32 counter;
+ u64 rpage, cqx_fec, h_ret;
+ int ipz_rc, i;
+ unsigned long flags;
+
+ if (cqe >= 0xFFFFFFFF - 64 - additional_cqe)
+ return ERR_PTR(-EINVAL);
+
+ if (!atomic_add_unless(&shca->num_cqs, 1, shca->max_num_cqs)) {
+ ehca_err(device, "Unable to create CQ, max number of %i "
+ "CQs reached.", shca->max_num_cqs);
+ ehca_err(device, "To increase the maximum number of CQs "
+ "use the number_of_cqs module parameter.\n");
+ return ERR_PTR(-ENOSPC);
+ }
+
+ my_cq = kmem_cache_zalloc(cq_cache, GFP_KERNEL);
+ if (!my_cq) {
+ ehca_err(device, "Out of memory for ehca_cq struct device=%p",
+ device);
+ atomic_dec(&shca->num_cqs);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ memset(&param, 0, sizeof(struct ehca_alloc_cq_parms));
+
+ spin_lock_init(&my_cq->spinlock);
+ spin_lock_init(&my_cq->cb_lock);
+ spin_lock_init(&my_cq->task_lock);
+ atomic_set(&my_cq->nr_events, 0);
+ init_waitqueue_head(&my_cq->wait_completion);
+
+ cq = &my_cq->ib_cq;
+
+ adapter_handle = shca->ipz_hca_handle;
+ param.eq_handle = shca->eq.ipz_eq_handle;
+
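+ /* the idr token presumably comes back in EQEs, letting cq_event_callback() look this CQ up via ehca_cq_idr */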
+ idr_preload(GFP_KERNEL);
+ write_lock_irqsave(&ehca_cq_idr_lock, flags);
+ my_cq->token = idr_alloc(&ehca_cq_idr, my_cq, 0, 0x2000000, GFP_NOWAIT);
+ write_unlock_irqrestore(&ehca_cq_idr_lock, flags);
+ idr_preload_end();
+
+ if (my_cq->token < 0) {
+ cq = ERR_PTR(-ENOMEM);
+ ehca_err(device, "Can't allocate new idr entry. device=%p",
+ device);
+ goto create_cq_exit1;
+ }
+
+ /*
+ * CQs maximum depth is 4GB-64, but we need additional 20 as buffer
+ * for receiving errors CQEs.
+ */
+ param.nr_cqe = cqe + additional_cqe;
+ h_ret = hipz_h_alloc_resource_cq(adapter_handle, my_cq, &param);
+
+ if (h_ret != H_SUCCESS) {
+ ehca_err(device, "hipz_h_alloc_resource_cq() failed "
+ "h_ret=%lli device=%p", h_ret, device);
+ cq = ERR_PTR(ehca2ib_return_code(h_ret));
+ goto create_cq_exit2;
+ }
+
+ ipz_rc = ipz_queue_ctor(NULL, &my_cq->ipz_queue, param.act_pages,
+ EHCA_PAGESIZE, sizeof(struct ehca_cqe), 0, 0);
+ if (!ipz_rc) {
+ ehca_err(device, "ipz_queue_ctor() failed ipz_rc=%i device=%p",
+ ipz_rc, device);
+ cq = ERR_PTR(-EINVAL);
+ goto create_cq_exit3;
+ }
+
+ for (counter = 0; counter < param.act_pages; counter++) {
+ vpage = ipz_qpageit_get_inc(&my_cq->ipz_queue);
+ if (!vpage) {
+ ehca_err(device, "ipz_qpageit_get_inc() "
+ "returns NULL device=%p", device);
+ cq = ERR_PTR(-EAGAIN);
+ goto create_cq_exit4;
+ }
+ rpage = __pa(vpage);
+
+ h_ret = hipz_h_register_rpage_cq(adapter_handle,
+ my_cq->ipz_cq_handle,
+ &my_cq->pf,
+ 0,
+ 0,
+ rpage,
+ 1,
+ my_cq->galpas.
+ kernel);
+
+ if (h_ret < H_SUCCESS) {
+ ehca_err(device, "hipz_h_register_rpage_cq() failed "
+ "ehca_cq=%p cq_num=%x h_ret=%lli counter=%i "
+ "act_pages=%i", my_cq, my_cq->cq_number,
+ h_ret, counter, param.act_pages);
+ cq = ERR_PTR(-EINVAL);
+ goto create_cq_exit4;
+ }
+
+ if (counter == (param.act_pages - 1)) {
+ vpage = ipz_qpageit_get_inc(&my_cq->ipz_queue);
+ if ((h_ret != H_SUCCESS) || vpage) {
+ ehca_err(device, "Registration of pages not "
+ "complete ehca_cq=%p cq_num=%x "
+ "h_ret=%lli", my_cq, my_cq->cq_number,
+ h_ret);
+ cq = ERR_PTR(-EAGAIN);
+ goto create_cq_exit4;
+ }
+ } else {
+ if (h_ret != H_PAGE_REGISTERED) {
+ ehca_err(device, "Registration of page failed "
+ "ehca_cq=%p cq_num=%x h_ret=%lli "
+ "counter=%i act_pages=%i",
+ my_cq, my_cq->cq_number,
+ h_ret, counter, param.act_pages);
+ cq = ERR_PTR(-ENOMEM);
+ goto create_cq_exit4;
+ }
+ }
+ }
+
+ ipz_qeit_reset(&my_cq->ipz_queue);
+
+ gal = my_cq->galpas.kernel;
+ cqx_fec = hipz_galpa_load(gal, CQTEMM_OFFSET(cqx_fec));
+ ehca_dbg(device, "ehca_cq=%p cq_num=%x CQX_FEC=%llx",
+ my_cq, my_cq->cq_number, cqx_fec);
+
+ my_cq->ib_cq.cqe = my_cq->nr_of_entries =
+ param.act_nr_of_entries - additional_cqe;
+ my_cq->cq_number = (my_cq->ipz_cq_handle.handle) & 0xffff;
+
+ for (i = 0; i < QP_HASHTAB_LEN; i++)
+ INIT_HLIST_HEAD(&my_cq->qp_hashtab[i]);
+
+ INIT_LIST_HEAD(&my_cq->sqp_err_list);
+ INIT_LIST_HEAD(&my_cq->rqp_err_list);
+
+ if (context) {
+ struct ipz_queue *ipz_queue = &my_cq->ipz_queue;
+ struct ehca_create_cq_resp resp;
+ memset(&resp, 0, sizeof(resp));
+ resp.cq_number = my_cq->cq_number;
+ resp.token = my_cq->token;
+ resp.ipz_queue.qe_size = ipz_queue->qe_size;
+ resp.ipz_queue.act_nr_of_sg = ipz_queue->act_nr_of_sg;
+ resp.ipz_queue.queue_length = ipz_queue->queue_length;
+ resp.ipz_queue.pagesize = ipz_queue->pagesize;
+ resp.ipz_queue.toggle_state = ipz_queue->toggle_state;
+ resp.fw_handle_ofs = (u32)
+ (my_cq->galpas.user.fw_handle & (PAGE_SIZE - 1));
+ if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
+ ehca_err(device, "Copy to udata failed.");
+ cq = ERR_PTR(-EFAULT);
+ goto create_cq_exit4;
+ }
+ }
+
+ return cq;
+
+create_cq_exit4:
+ ipz_queue_dtor(NULL, &my_cq->ipz_queue);
+
+create_cq_exit3:
+ h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 1);
+ if (h_ret != H_SUCCESS)
+ ehca_err(device, "hipz_h_destroy_cq() failed ehca_cq=%p "
+ "cq_num=%x h_ret=%lli", my_cq, my_cq->cq_number, h_ret);
+
+create_cq_exit2:
+ write_lock_irqsave(&ehca_cq_idr_lock, flags);
+ idr_remove(&ehca_cq_idr, my_cq->token);
+ write_unlock_irqrestore(&ehca_cq_idr_lock, flags);
+
+create_cq_exit1:
+ kmem_cache_free(cq_cache, my_cq);
+
+ atomic_dec(&shca->num_cqs);
+ return cq;
+}
+
+int ehca_destroy_cq(struct ib_cq *cq)
+{
+ u64 h_ret;
+ struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
+ int cq_num = my_cq->cq_number;
+ struct ib_device *device = cq->device;
+ struct ehca_shca *shca = container_of(device, struct ehca_shca,
+ ib_device);
+ struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle;
+ unsigned long flags;
+
+ if (cq->uobject) {
+ if (my_cq->mm_count_galpa || my_cq->mm_count_queue) {
+ ehca_err(device, "Resources still referenced in "
+ "user space cq_num=%x", my_cq->cq_number);
+ return -EINVAL;
+ }
+ }
+
+ /*
+ * remove the CQ from the idr first to make sure
+ * no more interrupt tasklets will touch this CQ
+ */
+ write_lock_irqsave(&ehca_cq_idr_lock, flags);
+ idr_remove(&ehca_cq_idr, my_cq->token);
+ write_unlock_irqrestore(&ehca_cq_idr_lock, flags);
+
+ /* now wait until all pending events have completed */
+ wait_event(my_cq->wait_completion, !atomic_read(&my_cq->nr_events));
+
+ /* nobody's using our CQ any longer -- we can destroy it */
+ h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 0);
+ if (h_ret == H_R_STATE) {
+ /* cq in err: read err data and destroy it forcibly */
+ ehca_dbg(device, "ehca_cq=%p cq_num=%x resource=%llx in err "
+ "state. Try to delete it forcibly.",
+ my_cq, cq_num, my_cq->ipz_cq_handle.handle);
+ ehca_error_data(shca, my_cq, my_cq->ipz_cq_handle.handle);
+ h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 1);
+ if (h_ret == H_SUCCESS)
+ ehca_dbg(device, "cq_num=%x deleted successfully.",
+ cq_num);
+ }
+ if (h_ret != H_SUCCESS) {
+ ehca_err(device, "hipz_h_destroy_cq() failed h_ret=%lli "
+ "ehca_cq=%p cq_num=%x", h_ret, my_cq, cq_num);
+ return ehca2ib_return_code(h_ret);
+ }
+ ipz_queue_dtor(NULL, &my_cq->ipz_queue);
+ kmem_cache_free(cq_cache, my_cq);
+
+ atomic_dec(&shca->num_cqs);
+ return 0;
+}
+
+int ehca_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata)
+{
+ /* TODO: proper resize needs to be done */
+ ehca_err(cq->device, "not implemented yet");
+
+ return -EFAULT;
+}
+
+int ehca_init_cq_cache(void)
+{
+ cq_cache = kmem_cache_create("ehca_cache_cq",
+ sizeof(struct ehca_cq), 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!cq_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void ehca_cleanup_cq_cache(void)
+{
+ if (cq_cache)
+ kmem_cache_destroy(cq_cache);
+}
diff --git a/drivers/infiniband/hw/ehca/ehca_eq.c b/drivers/infiniband/hw/ehca/ehca_eq.c
new file mode 100644
index 000000000..90da6747d
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_eq.c
@@ -0,0 +1,189 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * Event queue handling
+ *
+ * Authors: Waleri Fomin <fomin@de.ibm.com>
+ * Khadija Souissi <souissi@de.ibm.com>
+ * Reinhard Ernst <rernst@de.ibm.com>
+ * Heiko J Schick <schickhj@de.ibm.com>
+ * Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "ehca_classes.h"
+#include "ehca_irq.h"
+#include "ehca_iverbs.h"
+#include "ehca_qes.h"
+#include "hcp_if.h"
+#include "ipz_pt_fn.h"
+
+int ehca_create_eq(struct ehca_shca *shca,
+ struct ehca_eq *eq,
+ const enum ehca_eq_type type, const u32 length)
+{
+ int ret;
+ u64 h_ret;
+ u32 nr_pages;
+ u32 i;
+ void *vpage;
+ struct ib_device *ib_dev = &shca->ib_device;
+
+ spin_lock_init(&eq->spinlock);
+ spin_lock_init(&eq->irq_spinlock);
+ eq->is_initialized = 0;
+
+ if (type != EHCA_EQ && type != EHCA_NEQ) {
+ ehca_err(ib_dev, "Invalid EQ type %x. eq=%p", type, eq);
+ return -EINVAL;
+ }
+ if (!length) {
+ ehca_err(ib_dev, "EQ length must not be zero. eq=%p", eq);
+ return -EINVAL;
+ }
+
+ h_ret = hipz_h_alloc_resource_eq(shca->ipz_hca_handle,
+ &eq->pf,
+ type,
+ length,
+ &eq->ipz_eq_handle,
+ &eq->length,
+ &nr_pages, &eq->ist);
+
+ if (h_ret != H_SUCCESS) {
+ ehca_err(ib_dev, "Can't allocate EQ/NEQ. eq=%p", eq);
+ return -EINVAL;
+ }
+
+ ret = ipz_queue_ctor(NULL, &eq->ipz_queue, nr_pages,
+ EHCA_PAGESIZE, sizeof(struct ehca_eqe), 0, 0);
+ if (!ret) {
+ ehca_err(ib_dev, "Can't allocate EQ pages eq=%p", eq);
+ goto create_eq_exit1;
+ }
+
+ for (i = 0; i < nr_pages; i++) {
+ u64 rpage;
+
+ vpage = ipz_qpageit_get_inc(&eq->ipz_queue);
+ if (!vpage)
+ goto create_eq_exit2;
+
+ rpage = __pa(vpage);
+ h_ret = hipz_h_register_rpage_eq(shca->ipz_hca_handle,
+ eq->ipz_eq_handle,
+ &eq->pf,
+ 0, 0, rpage, 1);
+
+ if (i == (nr_pages - 1)) {
+ /* last page */
+ vpage = ipz_qpageit_get_inc(&eq->ipz_queue);
+ if (h_ret != H_SUCCESS || vpage)
+ goto create_eq_exit2;
+ } else {
+ if (h_ret != H_PAGE_REGISTERED)
+ goto create_eq_exit2;
+ }
+ }
+
+ ipz_qeit_reset(&eq->ipz_queue);
+
+ /* register interrupt handlers and initialize work queues */
+ if (type == EHCA_EQ) {
+ tasklet_init(&eq->interrupt_task, ehca_tasklet_eq, (long)shca);
+
+ ret = ibmebus_request_irq(eq->ist, ehca_interrupt_eq,
+ 0, "ehca_eq",
+ (void *)shca);
+ if (ret < 0)
+ ehca_err(ib_dev, "Can't map interrupt handler.");
+ } else if (type == EHCA_NEQ) {
+ tasklet_init(&eq->interrupt_task, ehca_tasklet_neq, (long)shca);
+
+ ret = ibmebus_request_irq(eq->ist, ehca_interrupt_neq,
+ 0, "ehca_neq",
+ (void *)shca);
+ if (ret < 0)
+ ehca_err(ib_dev, "Can't map interrupt handler.");
+ }
+
+ eq->is_initialized = 1;
+
+ return 0;
+
+create_eq_exit2:
+ ipz_queue_dtor(NULL, &eq->ipz_queue);
+
+create_eq_exit1:
+ hipz_h_destroy_eq(shca->ipz_hca_handle, eq);
+
+ return -EINVAL;
+}
+
+void *ehca_poll_eq(struct ehca_shca *shca, struct ehca_eq *eq)
+{
+ unsigned long flags;
+ void *eqe;
+
+ spin_lock_irqsave(&eq->spinlock, flags);
+ eqe = ipz_eqit_eq_get_inc_valid(&eq->ipz_queue);
+ spin_unlock_irqrestore(&eq->spinlock, flags);
+
+ return eqe;
+}
+
+int ehca_destroy_eq(struct ehca_shca *shca, struct ehca_eq *eq)
+{
+ unsigned long flags;
+ u64 h_ret;
+
+ ibmebus_free_irq(eq->ist, (void *)shca);
+
+ spin_lock_irqsave(&shca_list_lock, flags);
+ eq->is_initialized = 0;
+ spin_unlock_irqrestore(&shca_list_lock, flags);
+
+ tasklet_kill(&eq->interrupt_task);
+
+ h_ret = hipz_h_destroy_eq(shca->ipz_hca_handle, eq);
+
+ if (h_ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "Can't free EQ resources.");
+ return -EINVAL;
+ }
+ ipz_queue_dtor(NULL, &eq->ipz_queue);
+
+ return 0;
+}
diff --git a/drivers/infiniband/hw/ehca/ehca_hca.c b/drivers/infiniband/hw/ehca/ehca_hca.c
new file mode 100644
index 000000000..9ed4d2588
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_hca.c
@@ -0,0 +1,410 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * HCA query functions
+ *
+ * Authors: Heiko J Schick <schickhj@de.ibm.com>
+ * Christoph Raisch <raisch@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/gfp.h>
+
+#include "ehca_tools.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+
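+/* clamp to INT_MAX, presumably because struct ib_device_attr stores these limits as signed ints */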
+static unsigned int limit_uint(unsigned int value)
+{
+ return min_t(unsigned int, value, INT_MAX);
+}
+
+int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props)
+{
+ int i, ret = 0;
+ struct ehca_shca *shca = container_of(ibdev, struct ehca_shca,
+ ib_device);
+ struct hipz_query_hca *rblock;
+
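+ /* flat list of pairs: (IB device capability flag, HCA capability indicator bit) */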
+ static const u32 cap_mapping[] = {
+ IB_DEVICE_RESIZE_MAX_WR, HCA_CAP_WQE_RESIZE,
+ IB_DEVICE_BAD_PKEY_CNTR, HCA_CAP_BAD_P_KEY_CTR,
+ IB_DEVICE_BAD_QKEY_CNTR, HCA_CAP_Q_KEY_VIOL_CTR,
+ IB_DEVICE_RAW_MULTI, HCA_CAP_RAW_PACKET_MCAST,
+ IB_DEVICE_AUTO_PATH_MIG, HCA_CAP_AUTO_PATH_MIG,
+ IB_DEVICE_CHANGE_PHY_PORT, HCA_CAP_SQD_RTS_PORT_CHANGE,
+ IB_DEVICE_UD_AV_PORT_ENFORCE, HCA_CAP_AH_PORT_NR_CHECK,
+ IB_DEVICE_CURR_QP_STATE_MOD, HCA_CAP_CUR_QP_STATE_MOD,
+ IB_DEVICE_SHUTDOWN_PORT, HCA_CAP_SHUTDOWN_PORT,
+ IB_DEVICE_INIT_TYPE, HCA_CAP_INIT_TYPE,
+ IB_DEVICE_PORT_ACTIVE_EVENT, HCA_CAP_PORT_ACTIVE_EVENT,
+ };
+
+ rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+ if (!rblock) {
+ ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+ return -ENOMEM;
+ }
+
+ if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "Can't query device properties");
+ ret = -EINVAL;
+ goto query_device1;
+ }
+
+ memset(props, 0, sizeof(struct ib_device_attr));
+ props->page_size_cap = shca->hca_cap_mr_pgsize;
+ props->fw_ver = rblock->hw_ver;
+ props->max_mr_size = rblock->max_mr_size;
+ props->vendor_id = rblock->vendor_id >> 8;
+ props->vendor_part_id = rblock->vendor_part_id >> 16;
+ props->hw_ver = rblock->hw_ver;
+ props->max_qp = limit_uint(rblock->max_qp);
+ props->max_qp_wr = limit_uint(rblock->max_wqes_wq);
+ props->max_sge = limit_uint(rblock->max_sge);
+ props->max_sge_rd = limit_uint(rblock->max_sge_rd);
+ props->max_cq = limit_uint(rblock->max_cq);
+ props->max_cqe = limit_uint(rblock->max_cqe);
+ props->max_mr = limit_uint(rblock->max_mr);
+ props->max_mw = limit_uint(rblock->max_mw);
+ props->max_pd = limit_uint(rblock->max_pd);
+ props->max_ah = limit_uint(rblock->max_ah);
+ props->max_ee = limit_uint(rblock->max_rd_ee_context);
+ props->max_rdd = limit_uint(rblock->max_rd_domain);
+ props->max_fmr = limit_uint(rblock->max_mr);
+ props->max_qp_rd_atom = limit_uint(rblock->max_rr_qp);
+ props->max_ee_rd_atom = limit_uint(rblock->max_rr_ee_context);
+ props->max_res_rd_atom = limit_uint(rblock->max_rr_hca);
+ props->max_qp_init_rd_atom = limit_uint(rblock->max_act_wqs_qp);
+ props->max_ee_init_rd_atom = limit_uint(rblock->max_act_wqs_ee_context);
+
+ if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) {
+ props->max_srq = limit_uint(props->max_qp);
+ props->max_srq_wr = limit_uint(props->max_qp_wr);
+ props->max_srq_sge = 3;
+ }
+
+ props->max_pkeys = 16;
+ /* Some FW versions say 0 here; insert sensible value in that case */
+ props->local_ca_ack_delay = rblock->local_ca_ack_delay ?
+ min_t(u8, rblock->local_ca_ack_delay, 255) : 12;
+ props->max_raw_ipv6_qp = limit_uint(rblock->max_raw_ipv6_qp);
+ props->max_raw_ethy_qp = limit_uint(rblock->max_raw_ethy_qp);
+ props->max_mcast_grp = limit_uint(rblock->max_mcast_grp);
+ props->max_mcast_qp_attach = limit_uint(rblock->max_mcast_qp_attach);
+ props->max_total_mcast_qp_attach
+ = limit_uint(rblock->max_total_mcast_qp_attach);
+
+ /* translate device capabilities */
+ props->device_cap_flags = IB_DEVICE_SYS_IMAGE_GUID |
+ IB_DEVICE_RC_RNR_NAK_GEN | IB_DEVICE_N_NOTIFY_CQ;
+ for (i = 0; i < ARRAY_SIZE(cap_mapping); i += 2)
+ if (rblock->hca_cap_indicators & cap_mapping[i + 1])
+ props->device_cap_flags |= cap_mapping[i];
+
+query_device1:
+ ehca_free_fw_ctrlblock(rblock);
+
+ return ret;
+}
+
+static enum ib_mtu map_mtu(struct ehca_shca *shca, u32 fw_mtu)
+{
+ switch (fw_mtu) {
+ case 0x1:
+ return IB_MTU_256;
+ case 0x2:
+ return IB_MTU_512;
+ case 0x3:
+ return IB_MTU_1024;
+ case 0x4:
+ return IB_MTU_2048;
+ case 0x5:
+ return IB_MTU_4096;
+ default:
+ ehca_err(&shca->ib_device, "Unknown MTU size: %x.",
+ fw_mtu);
+ return 0;
+ }
+}
+
+static u8 map_number_of_vls(struct ehca_shca *shca, u32 vl_cap)
+{
+ switch (vl_cap) {
+ case 0x1:
+ return 1;
+ case 0x2:
+ return 2;
+ case 0x3:
+ return 4;
+ case 0x4:
+ return 8;
+ case 0x5:
+ return 15;
+ default:
+ ehca_err(&shca->ib_device, "Invalid VL capability: %x.",
+ vl_cap);
+ return 0;
+ }
+}
+
+int ehca_query_port(struct ib_device *ibdev,
+ u8 port, struct ib_port_attr *props)
+{
+ int ret = 0;
+ u64 h_ret;
+ struct ehca_shca *shca = container_of(ibdev, struct ehca_shca,
+ ib_device);
+ struct hipz_query_port *rblock;
+
+ rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+ if (!rblock) {
+ ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+ return -ENOMEM;
+ }
+
+ h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "Can't query port properties");
+ ret = -EINVAL;
+ goto query_port1;
+ }
+
+ memset(props, 0, sizeof(struct ib_port_attr));
+
+ props->active_mtu = props->max_mtu = map_mtu(shca, rblock->max_mtu);
+ props->port_cap_flags = rblock->capability_mask;
+ props->gid_tbl_len = rblock->gid_tbl_len;
+ if (rblock->max_msg_sz)
+ props->max_msg_sz = rblock->max_msg_sz;
+ else
+ props->max_msg_sz = 0x1 << 31;
+ props->bad_pkey_cntr = rblock->bad_pkey_cntr;
+ props->qkey_viol_cntr = rblock->qkey_viol_cntr;
+ props->pkey_tbl_len = rblock->pkey_tbl_len;
+ props->lid = rblock->lid;
+ props->sm_lid = rblock->sm_lid;
+ props->lmc = rblock->lmc;
+ props->sm_sl = rblock->sm_sl;
+ props->subnet_timeout = rblock->subnet_timeout;
+ props->init_type_reply = rblock->init_type_reply;
+ props->max_vl_num = map_number_of_vls(shca, rblock->vl_cap);
+
+ if (rblock->state && rblock->phys_width) {
+ props->phys_state = rblock->phys_pstate;
+ props->state = rblock->phys_state;
+ props->active_width = rblock->phys_width;
+ props->active_speed = rblock->phys_speed;
+ } else {
+ /* old firmware releases don't report physical
+ * port info, so use default values
+ */
+ props->phys_state = 5;
+ props->state = rblock->state;
+ props->active_width = IB_WIDTH_12X;
+ props->active_speed = IB_SPEED_SDR;
+ }
+
+query_port1:
+ ehca_free_fw_ctrlblock(rblock);
+
+ return ret;
+}
+
+int ehca_query_sma_attr(struct ehca_shca *shca,
+ u8 port, struct ehca_sma_attr *attr)
+{
+ int ret = 0;
+ u64 h_ret;
+ struct hipz_query_port *rblock;
+
+ rblock = ehca_alloc_fw_ctrlblock(GFP_ATOMIC);
+ if (!rblock) {
+ ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+ return -ENOMEM;
+ }
+
+ h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "Can't query port properties");
+ ret = -EINVAL;
+ goto query_sma_attr1;
+ }
+
+ memset(attr, 0, sizeof(struct ehca_sma_attr));
+
+ attr->lid = rblock->lid;
+ attr->lmc = rblock->lmc;
+ attr->sm_sl = rblock->sm_sl;
+ attr->sm_lid = rblock->sm_lid;
+
+ attr->pkey_tbl_len = rblock->pkey_tbl_len;
+ memcpy(attr->pkeys, rblock->pkey_entries, sizeof(attr->pkeys));
+
+query_sma_attr1:
+ ehca_free_fw_ctrlblock(rblock);
+
+ return ret;
+}
+
+int ehca_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
+{
+ int ret = 0;
+ u64 h_ret;
+ struct ehca_shca *shca;
+ struct hipz_query_port *rblock;
+
+ shca = container_of(ibdev, struct ehca_shca, ib_device);
+ if (index > 16) {
+ ehca_err(&shca->ib_device, "Invalid index: %x.", index);
+ return -EINVAL;
+ }
+
+ rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+ if (!rblock) {
+ ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+ return -ENOMEM;
+ }
+
+ h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "Can't query port properties");
+ ret = -EINVAL;
+ goto query_pkey1;
+ }
+
+ memcpy(pkey, &rblock->pkey_entries + index, sizeof(u16));
+
+query_pkey1:
+ ehca_free_fw_ctrlblock(rblock);
+
+ return ret;
+}
+
+int ehca_query_gid(struct ib_device *ibdev, u8 port,
+ int index, union ib_gid *gid)
+{
+ int ret = 0;
+ u64 h_ret;
+ struct ehca_shca *shca = container_of(ibdev, struct ehca_shca,
+ ib_device);
+ struct hipz_query_port *rblock;
+
+ if (index < 0 || index > 255) {
+ ehca_err(&shca->ib_device, "Invalid index: %x.", index);
+ return -EINVAL;
+ }
+
+ rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+ if (!rblock) {
+ ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+ return -ENOMEM;
+ }
+
+ h_ret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "Can't query port properties");
+ ret = -EINVAL;
+ goto query_gid1;
+ }
+
+ memcpy(&gid->raw[0], &rblock->gid_prefix, sizeof(u64));
+ memcpy(&gid->raw[8], &rblock->guid_entries[index], sizeof(u64));
+
+query_gid1:
+ ehca_free_fw_ctrlblock(rblock);
+
+ return ret;
+}
+
+static const u32 allowed_port_caps = (
+ IB_PORT_SM | IB_PORT_LED_INFO_SUP | IB_PORT_CM_SUP |
+ IB_PORT_SNMP_TUNNEL_SUP | IB_PORT_DEVICE_MGMT_SUP |
+ IB_PORT_VENDOR_CLASS_SUP);
+
+int ehca_modify_port(struct ib_device *ibdev,
+ u8 port, int port_modify_mask,
+ struct ib_port_modify *props)
+{
+ int ret = 0;
+ struct ehca_shca *shca;
+ struct hipz_query_port *rblock;
+ u32 cap;
+ u64 hret;
+
+ shca = container_of(ibdev, struct ehca_shca, ib_device);
+ if ((props->set_port_cap_mask | props->clr_port_cap_mask)
+ & ~allowed_port_caps) {
+ ehca_err(&shca->ib_device, "Non-changeable bits set in masks "
+ "set=%x clr=%x allowed=%x", props->set_port_cap_mask,
+ props->clr_port_cap_mask, allowed_port_caps);
+ return -EINVAL;
+ }
+
+ if (mutex_lock_interruptible(&shca->modify_mutex))
+ return -ERESTARTSYS;
+
+ rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+ if (!rblock) {
+ ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+ ret = -ENOMEM;
+ goto modify_port1;
+ }
+
+ hret = hipz_h_query_port(shca->ipz_hca_handle, port, rblock);
+ if (hret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "Can't query port properties");
+ ret = -EINVAL;
+ goto modify_port2;
+ }
+
+ cap = (rblock->capability_mask | props->set_port_cap_mask)
+ & ~props->clr_port_cap_mask;
+
+ hret = hipz_h_modify_port(shca->ipz_hca_handle, port,
+ cap, props->init_type, port_modify_mask);
+ if (hret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "Modify port failed h_ret=%lli",
+ hret);
+ ret = -EINVAL;
+ }
+
+modify_port2:
+ ehca_free_fw_ctrlblock(rblock);
+
+modify_port1:
+ mutex_unlock(&shca->modify_mutex);
+
+ return ret;
+}
diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c
new file mode 100644
index 000000000..8615d7cf7
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_irq.c
@@ -0,0 +1,870 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * Functions for EQs, NEQs and interrupts
+ *
+ * Authors: Heiko J Schick <schickhj@de.ibm.com>
+ * Khadija Souissi <souissi@de.ibm.com>
+ * Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ * Joachim Fenkes <fenkes@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+#include <linux/smpboot.h>
+
+#include "ehca_classes.h"
+#include "ehca_irq.h"
+#include "ehca_iverbs.h"
+#include "ehca_tools.h"
+#include "hcp_if.h"
+#include "hipz_fns.h"
+#include "ipz_pt_fn.h"
+
+#define EQE_COMPLETION_EVENT EHCA_BMASK_IBM( 1, 1)
+#define EQE_CQ_QP_NUMBER EHCA_BMASK_IBM( 8, 31)
+#define EQE_EE_IDENTIFIER EHCA_BMASK_IBM( 2, 7)
+#define EQE_CQ_NUMBER EHCA_BMASK_IBM( 8, 31)
+#define EQE_QP_NUMBER EHCA_BMASK_IBM( 8, 31)
+#define EQE_QP_TOKEN EHCA_BMASK_IBM(32, 63)
+#define EQE_CQ_TOKEN EHCA_BMASK_IBM(32, 63)
+
+#define NEQE_COMPLETION_EVENT EHCA_BMASK_IBM( 1, 1)
+#define NEQE_EVENT_CODE EHCA_BMASK_IBM( 2, 7)
+#define NEQE_PORT_NUMBER EHCA_BMASK_IBM( 8, 15)
+#define NEQE_PORT_AVAILABILITY EHCA_BMASK_IBM(16, 16)
+#define NEQE_DISRUPTIVE EHCA_BMASK_IBM(16, 16)
+#define NEQE_SPECIFIC_EVENT EHCA_BMASK_IBM(16, 23)
+
+#define ERROR_DATA_LENGTH EHCA_BMASK_IBM(52, 63)
+#define ERROR_DATA_TYPE EHCA_BMASK_IBM( 0, 7)
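+/* length and type above are extracted from the rblock returned by hipz_h_error_data(),
+ * see ehca_error_data() and print_error_data() below
+ */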
+
+static void queue_comp_task(struct ehca_cq *__cq);
+
+static struct ehca_comp_pool *pool;
+
+static inline void comp_event_callback(struct ehca_cq *cq)
+{
+ if (!cq->ib_cq.comp_handler)
+ return;
+
+ spin_lock(&cq->cb_lock);
+ cq->ib_cq.comp_handler(&cq->ib_cq, cq->ib_cq.cq_context);
+ spin_unlock(&cq->cb_lock);
+
+ return;
+}
+
+static void print_error_data(struct ehca_shca *shca, void *data,
+ u64 *rblock, int length)
+{
+ u64 type = EHCA_BMASK_GET(ERROR_DATA_TYPE, rblock[2]);
+ u64 resource = rblock[1];
+
+ switch (type) {
+ case 0x1: /* Queue Pair */
+ {
+ struct ehca_qp *qp = (struct ehca_qp *)data;
+
+ /* only print error data if AER is set */
+ if (rblock[6] == 0)
+ return;
+
+ ehca_err(&shca->ib_device,
+ "QP 0x%x (resource=%llx) has errors.",
+ qp->ib_qp.qp_num, resource);
+ break;
+ }
+ case 0x4: /* Completion Queue */
+ {
+ struct ehca_cq *cq = (struct ehca_cq *)data;
+
+ ehca_err(&shca->ib_device,
+ "CQ 0x%x (resource=%llx) has errors.",
+ cq->cq_number, resource);
+ break;
+ }
+ default:
+ ehca_err(&shca->ib_device,
+ "Unknown error type: %llx on %s.",
+ type, shca->ib_device.name);
+ break;
+ }
+
+ ehca_err(&shca->ib_device, "Error data is available: %llx.", resource);
+ ehca_err(&shca->ib_device, "EHCA ----- error data begin "
+ "---------------------------------------------------");
+ ehca_dmp(rblock, length, "resource=%llx", resource);
+ ehca_err(&shca->ib_device, "EHCA ----- error data end "
+ "----------------------------------------------------");
+
+ return;
+}
+
+int ehca_error_data(struct ehca_shca *shca, void *data,
+ u64 resource)
+{
+
+ unsigned long ret;
+ u64 *rblock;
+ unsigned long block_count;
+
+ rblock = ehca_alloc_fw_ctrlblock(GFP_ATOMIC);
+ if (!rblock) {
+ ehca_err(&shca->ib_device, "Cannot allocate rblock memory.");
+ ret = -ENOMEM;
+ goto error_data1;
+ }
+
+ /* rblock must be 4K aligned and should be 4K in size */
+ ret = hipz_h_error_data(shca->ipz_hca_handle,
+ resource,
+ rblock,
+ &block_count);
+
+ if (ret == H_R_STATE)
+ ehca_err(&shca->ib_device,
+ "No error data is available: %llx.", resource);
+ else if (ret == H_SUCCESS) {
+ int length;
+
+ length = EHCA_BMASK_GET(ERROR_DATA_LENGTH, rblock[0]);
+
+ if (length > EHCA_PAGESIZE)
+ length = EHCA_PAGESIZE;
+
+ print_error_data(shca, data, rblock, length);
+ } else
+ ehca_err(&shca->ib_device,
+ "Error data could not be fetched: %llx", resource);
+
+ ehca_free_fw_ctrlblock(rblock);
+
+error_data1:
+ return ret;
+
+}
+
+static void dispatch_qp_event(struct ehca_shca *shca, struct ehca_qp *qp,
+ enum ib_event_type event_type)
+{
+ struct ib_event event;
+
+ /* PATH_MIG without the QP ever having been armed is a false alarm */
+ if (event_type == IB_EVENT_PATH_MIG && !qp->mig_armed)
+ return;
+
+ event.device = &shca->ib_device;
+ event.event = event_type;
+
+ if (qp->ext_type == EQPT_SRQ) {
+ if (!qp->ib_srq.event_handler)
+ return;
+
+ event.element.srq = &qp->ib_srq;
+ qp->ib_srq.event_handler(&event, qp->ib_srq.srq_context);
+ } else {
+ if (!qp->ib_qp.event_handler)
+ return;
+
+ event.element.qp = &qp->ib_qp;
+ qp->ib_qp.event_handler(&event, qp->ib_qp.qp_context);
+ }
+}
+
+static void qp_event_callback(struct ehca_shca *shca, u64 eqe,
+ enum ib_event_type event_type, int fatal)
+{
+ struct ehca_qp *qp;
+ u32 token = EHCA_BMASK_GET(EQE_QP_TOKEN, eqe);
+
+ read_lock(&ehca_qp_idr_lock);
+ qp = idr_find(&ehca_qp_idr, token);
+ if (qp)
+ atomic_inc(&qp->nr_events);
+ read_unlock(&ehca_qp_idr_lock);
+
+ if (!qp)
+ return;
+
+ if (fatal)
+ ehca_error_data(shca, qp, qp->ipz_qp_handle.handle);
+
+ dispatch_qp_event(shca, qp, fatal && qp->ext_type == EQPT_SRQ ?
+ IB_EVENT_SRQ_ERR : event_type);
+
+ /*
+ * eHCA only processes one WQE at a time for SRQ base QPs,
+ * so the last WQE has been processed as soon as the QP enters
+ * error state.
+ */
+ if (fatal && qp->ext_type == EQPT_SRQBASE)
+ dispatch_qp_event(shca, qp, IB_EVENT_QP_LAST_WQE_REACHED);
+
+ if (atomic_dec_and_test(&qp->nr_events))
+ wake_up(&qp->wait_completion);
+ return;
+}
+
+static void cq_event_callback(struct ehca_shca *shca,
+ u64 eqe)
+{
+ struct ehca_cq *cq;
+ u32 token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe);
+
+ read_lock(&ehca_cq_idr_lock);
+ cq = idr_find(&ehca_cq_idr, token);
+ if (cq)
+ atomic_inc(&cq->nr_events);
+ read_unlock(&ehca_cq_idr_lock);
+
+ if (!cq)
+ return;
+
+ ehca_error_data(shca, cq, cq->ipz_cq_handle.handle);
+
+ if (atomic_dec_and_test(&cq->nr_events))
+ wake_up(&cq->wait_completion);
+
+ return;
+}
+
+static void parse_identifier(struct ehca_shca *shca, u64 eqe)
+{
+ u8 identifier = EHCA_BMASK_GET(EQE_EE_IDENTIFIER, eqe);
+
+ switch (identifier) {
+ case 0x02: /* path migrated */
+ qp_event_callback(shca, eqe, IB_EVENT_PATH_MIG, 0);
+ break;
+ case 0x03: /* communication established */
+ qp_event_callback(shca, eqe, IB_EVENT_COMM_EST, 0);
+ break;
+ case 0x04: /* send queue drained */
+ qp_event_callback(shca, eqe, IB_EVENT_SQ_DRAINED, 0);
+ break;
+ case 0x05: /* QP error */
+ case 0x06: /* QP error */
+ qp_event_callback(shca, eqe, IB_EVENT_QP_FATAL, 1);
+ break;
+ case 0x07: /* CQ error */
+ case 0x08: /* CQ error */
+ cq_event_callback(shca, eqe);
+ break;
+ case 0x09: /* MRMWPTE error */
+ ehca_err(&shca->ib_device, "MRMWPTE error.");
+ break;
+ case 0x0A: /* port event */
+ ehca_err(&shca->ib_device, "Port event.");
+ break;
+ case 0x0B: /* MR access error */
+ ehca_err(&shca->ib_device, "MR access error.");
+ break;
+ case 0x0C: /* EQ error */
+ ehca_err(&shca->ib_device, "EQ error.");
+ break;
+ case 0x0D: /* P/Q_Key mismatch */
+ ehca_err(&shca->ib_device, "P/Q_Key mismatch.");
+ break;
+ case 0x10: /* sampling complete */
+ ehca_err(&shca->ib_device, "Sampling complete.");
+ break;
+ case 0x11: /* unaffiliated access error */
+ ehca_err(&shca->ib_device, "Unaffiliated access error.");
+ break;
+ case 0x12: /* path migrating */
+ ehca_err(&shca->ib_device, "Path migrating.");
+ break;
+ case 0x13: /* interface trace stopped */
+ ehca_err(&shca->ib_device, "Interface trace stopped.");
+ break;
+ case 0x14: /* first error capture info available */
+ ehca_info(&shca->ib_device, "First error capture available");
+ break;
+ case 0x15: /* SRQ limit reached */
+ qp_event_callback(shca, eqe, IB_EVENT_SRQ_LIMIT_REACHED, 0);
+ break;
+ default:
+ ehca_err(&shca->ib_device, "Unknown identifier: %x on %s.",
+ identifier, shca->ib_device.name);
+ break;
+ }
+
+ return;
+}
+
+static void dispatch_port_event(struct ehca_shca *shca, int port_num,
+ enum ib_event_type type, const char *msg)
+{
+ struct ib_event event;
+
+ ehca_info(&shca->ib_device, "port %d %s.", port_num, msg);
+ event.device = &shca->ib_device;
+ event.event = type;
+ event.element.port_num = port_num;
+ ib_dispatch_event(&event);
+}
+
+static void notify_port_conf_change(struct ehca_shca *shca, int port_num)
+{
+ struct ehca_sma_attr new_attr;
+ struct ehca_sma_attr *old_attr = &shca->sport[port_num - 1].saved_attr;
+
+ ehca_query_sma_attr(shca, port_num, &new_attr);
+
+ if (new_attr.sm_sl != old_attr->sm_sl ||
+ new_attr.sm_lid != old_attr->sm_lid)
+ dispatch_port_event(shca, port_num, IB_EVENT_SM_CHANGE,
+ "SM changed");
+
+ if (new_attr.lid != old_attr->lid ||
+ new_attr.lmc != old_attr->lmc)
+ dispatch_port_event(shca, port_num, IB_EVENT_LID_CHANGE,
+ "LID changed");
+
+ if (new_attr.pkey_tbl_len != old_attr->pkey_tbl_len ||
+ memcmp(new_attr.pkeys, old_attr->pkeys,
+ sizeof(u16) * new_attr.pkey_tbl_len))
+ dispatch_port_event(shca, port_num, IB_EVENT_PKEY_CHANGE,
+ "P_Key changed");
+
+ *old_attr = new_attr;
+}
+
+/* replay modify_qp for sqps -- return 0 if all is well, 1 if AQP1 destroyed */
+static int replay_modify_qp(struct ehca_sport *sport)
+{
+ int aqp1_destroyed;
+ unsigned long flags;
+
+ spin_lock_irqsave(&sport->mod_sqp_lock, flags);
+
+ aqp1_destroyed = !sport->ibqp_sqp[IB_QPT_GSI];
+
+ if (sport->ibqp_sqp[IB_QPT_SMI])
+ ehca_recover_sqp(sport->ibqp_sqp[IB_QPT_SMI]);
+ if (!aqp1_destroyed)
+ ehca_recover_sqp(sport->ibqp_sqp[IB_QPT_GSI]);
+
+ spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
+
+ return aqp1_destroyed;
+}
+
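+/* Decode a NEQ event: port availability/configuration changes etc. */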
+static void parse_ec(struct ehca_shca *shca, u64 eqe)
+{
+ u8 ec = EHCA_BMASK_GET(NEQE_EVENT_CODE, eqe);
+ u8 port = EHCA_BMASK_GET(NEQE_PORT_NUMBER, eqe);
+ u8 spec_event;
+ struct ehca_sport *sport = &shca->sport[port - 1];
+
+ switch (ec) {
+ case 0x30: /* port availability change */
+ if (EHCA_BMASK_GET(NEQE_PORT_AVAILABILITY, eqe)) {
+ /* only replay modify_qp calls in autodetect mode;
+ * if AQP1 was destroyed, the port is already down
+ * again and we can drop the event.
+ */
+ if (ehca_nr_ports < 0)
+ if (replay_modify_qp(sport))
+ break;
+
+ sport->port_state = IB_PORT_ACTIVE;
+ dispatch_port_event(shca, port, IB_EVENT_PORT_ACTIVE,
+ "is active");
+ ehca_query_sma_attr(shca, port, &sport->saved_attr);
+ } else {
+ sport->port_state = IB_PORT_DOWN;
+ dispatch_port_event(shca, port, IB_EVENT_PORT_ERR,
+ "is inactive");
+ }
+ break;
+ case 0x31:
+		/* port configuration change;
+		 * a disruptive change is caused by
+		 * a LID, P_Key or SM change
+		 */
+ if (EHCA_BMASK_GET(NEQE_DISRUPTIVE, eqe)) {
+ ehca_warn(&shca->ib_device, "disruptive port "
+ "%d configuration change", port);
+
+ sport->port_state = IB_PORT_DOWN;
+ dispatch_port_event(shca, port, IB_EVENT_PORT_ERR,
+ "is inactive");
+
+ sport->port_state = IB_PORT_ACTIVE;
+ dispatch_port_event(shca, port, IB_EVENT_PORT_ACTIVE,
+ "is active");
+ ehca_query_sma_attr(shca, port,
+ &sport->saved_attr);
+ } else
+ notify_port_conf_change(shca, port);
+ break;
+ case 0x32: /* adapter malfunction */
+ ehca_err(&shca->ib_device, "Adapter malfunction.");
+ break;
+ case 0x33: /* trace stopped */
+		ehca_err(&shca->ib_device, "Trace stopped.");
+ break;
+ case 0x34: /* util async event */
+ spec_event = EHCA_BMASK_GET(NEQE_SPECIFIC_EVENT, eqe);
+ if (spec_event == 0x80) /* client reregister required */
+ dispatch_port_event(shca, port,
+ IB_EVENT_CLIENT_REREGISTER,
+ "client reregister req.");
+ else
+ ehca_warn(&shca->ib_device, "Unknown util async "
+ "event %x on port %x", spec_event, port);
+ break;
+ default:
+ ehca_err(&shca->ib_device, "Unknown event code: %x on %s.",
+ ec, shca->ib_device.name);
+ break;
+ }
+
+ return;
+}
+
+static inline void reset_eq_pending(struct ehca_cq *cq)
+{
+ u64 CQx_EP;
+ struct h_galpa gal = cq->galpas.kernel;
+
+ hipz_galpa_store_cq(gal, cqx_ep, 0x0);
+ CQx_EP = hipz_galpa_load(gal, CQTEMM_OFFSET(cqx_ep));
+
+ return;
+}
+
+irqreturn_t ehca_interrupt_neq(int irq, void *dev_id)
+{
+ struct ehca_shca *shca = (struct ehca_shca*)dev_id;
+
+ tasklet_hi_schedule(&shca->neq.interrupt_task);
+
+ return IRQ_HANDLED;
+}
+
+void ehca_tasklet_neq(unsigned long data)
+{
+ struct ehca_shca *shca = (struct ehca_shca*)data;
+ struct ehca_eqe *eqe;
+ u64 ret;
+
+ eqe = ehca_poll_eq(shca, &shca->neq);
+
+ while (eqe) {
+ if (!EHCA_BMASK_GET(NEQE_COMPLETION_EVENT, eqe->entry))
+ parse_ec(shca, eqe->entry);
+
+ eqe = ehca_poll_eq(shca, &shca->neq);
+ }
+
+ ret = hipz_h_reset_event(shca->ipz_hca_handle,
+ shca->neq.ipz_eq_handle, 0xFFFFFFFFFFFFFFFFL);
+
+ if (ret != H_SUCCESS)
+ ehca_err(&shca->ib_device, "Can't clear notification events.");
+
+ return;
+}
+
+irqreturn_t ehca_interrupt_eq(int irq, void *dev_id)
+{
+ struct ehca_shca *shca = (struct ehca_shca*)dev_id;
+
+ tasklet_hi_schedule(&shca->eq.interrupt_task);
+
+ return IRQ_HANDLED;
+}
+
+
+static inline void process_eqe(struct ehca_shca *shca, struct ehca_eqe *eqe)
+{
+ u64 eqe_value;
+ u32 token;
+ struct ehca_cq *cq;
+
+ eqe_value = eqe->entry;
+ ehca_dbg(&shca->ib_device, "eqe_value=%llx", eqe_value);
+ if (EHCA_BMASK_GET(EQE_COMPLETION_EVENT, eqe_value)) {
+ ehca_dbg(&shca->ib_device, "Got completion event");
+ token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe_value);
+ read_lock(&ehca_cq_idr_lock);
+ cq = idr_find(&ehca_cq_idr, token);
+ if (cq)
+ atomic_inc(&cq->nr_events);
+ read_unlock(&ehca_cq_idr_lock);
+ if (cq == NULL) {
+ ehca_err(&shca->ib_device,
+ "Invalid eqe for non-existing cq token=%x",
+ token);
+ return;
+ }
+ reset_eq_pending(cq);
+ if (ehca_scaling_code)
+ queue_comp_task(cq);
+ else {
+ comp_event_callback(cq);
+ if (atomic_dec_and_test(&cq->nr_events))
+ wake_up(&cq->wait_completion);
+ }
+ } else {
+ ehca_dbg(&shca->ib_device, "Got non completion event");
+ parse_identifier(shca, eqe_value);
+ }
+}
+
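+/* Drain the EQ: cache EQEs, reset pending bits, run completion handlers. */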
+void ehca_process_eq(struct ehca_shca *shca, int is_irq)
+{
+ struct ehca_eq *eq = &shca->eq;
+ struct ehca_eqe_cache_entry *eqe_cache = eq->eqe_cache;
+ u64 eqe_value, ret;
+ int eqe_cnt, i;
+ int eq_empty = 0;
+
+ spin_lock(&eq->irq_spinlock);
+ if (is_irq) {
+ const int max_query_cnt = 100;
+ int query_cnt = 0;
+ int int_state = 1;
+ do {
+ int_state = hipz_h_query_int_state(
+ shca->ipz_hca_handle, eq->ist);
+ query_cnt++;
+ iosync();
+ } while (int_state && query_cnt < max_query_cnt);
+ if (unlikely((query_cnt == max_query_cnt)))
+ ehca_dbg(&shca->ib_device, "int_state=%x query_cnt=%x",
+ int_state, query_cnt);
+ }
+
+ /* read out all eqes */
+ eqe_cnt = 0;
+ do {
+ u32 token;
+ eqe_cache[eqe_cnt].eqe = ehca_poll_eq(shca, eq);
+ if (!eqe_cache[eqe_cnt].eqe)
+ break;
+ eqe_value = eqe_cache[eqe_cnt].eqe->entry;
+ if (EHCA_BMASK_GET(EQE_COMPLETION_EVENT, eqe_value)) {
+ token = EHCA_BMASK_GET(EQE_CQ_TOKEN, eqe_value);
+ read_lock(&ehca_cq_idr_lock);
+ eqe_cache[eqe_cnt].cq = idr_find(&ehca_cq_idr, token);
+ if (eqe_cache[eqe_cnt].cq)
+ atomic_inc(&eqe_cache[eqe_cnt].cq->nr_events);
+ read_unlock(&ehca_cq_idr_lock);
+ if (!eqe_cache[eqe_cnt].cq) {
+ ehca_err(&shca->ib_device,
+ "Invalid eqe for non-existing cq "
+ "token=%x", token);
+ continue;
+ }
+ } else
+ eqe_cache[eqe_cnt].cq = NULL;
+ eqe_cnt++;
+ } while (eqe_cnt < EHCA_EQE_CACHE_SIZE);
+ if (!eqe_cnt) {
+ if (is_irq)
+ ehca_dbg(&shca->ib_device,
+ "No eqe found for irq event");
+ goto unlock_irq_spinlock;
+ } else if (!is_irq) {
+ ret = hipz_h_eoi(eq->ist);
+ if (ret != H_SUCCESS)
+ ehca_err(&shca->ib_device,
+				 "bad return code EOI, rc=%lld\n", ret);
+ ehca_dbg(&shca->ib_device, "deadman found %x eqe", eqe_cnt);
+ }
+ if (unlikely(eqe_cnt == EHCA_EQE_CACHE_SIZE))
+ ehca_dbg(&shca->ib_device, "too many eqes for one irq event");
+ /* enable irq for new packets */
+ for (i = 0; i < eqe_cnt; i++) {
+ if (eq->eqe_cache[i].cq)
+ reset_eq_pending(eq->eqe_cache[i].cq);
+ }
+ /* check eq */
+ spin_lock(&eq->spinlock);
+ eq_empty = (!ipz_eqit_eq_peek_valid(&shca->eq.ipz_queue));
+ spin_unlock(&eq->spinlock);
+ /* call completion handler for cached eqes */
+ for (i = 0; i < eqe_cnt; i++)
+ if (eq->eqe_cache[i].cq) {
+ if (ehca_scaling_code)
+ queue_comp_task(eq->eqe_cache[i].cq);
+ else {
+ struct ehca_cq *cq = eq->eqe_cache[i].cq;
+ comp_event_callback(cq);
+ if (atomic_dec_and_test(&cq->nr_events))
+ wake_up(&cq->wait_completion);
+ }
+ } else {
+ ehca_dbg(&shca->ib_device, "Got non completion event");
+ parse_identifier(shca, eq->eqe_cache[i].eqe->entry);
+ }
+ /* poll eq if not empty */
+ if (eq_empty)
+ goto unlock_irq_spinlock;
+ do {
+ struct ehca_eqe *eqe;
+ eqe = ehca_poll_eq(shca, &shca->eq);
+ if (!eqe)
+ break;
+ process_eqe(shca, eqe);
+ } while (1);
+
+unlock_irq_spinlock:
+ spin_unlock(&eq->irq_spinlock);
+}
+
+void ehca_tasklet_eq(unsigned long data)
+{
+ ehca_process_eq((struct ehca_shca*)data, 1);
+}
+
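+/* Round-robin to the next online CPU with an active comp task. */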
+static int find_next_online_cpu(struct ehca_comp_pool *pool)
+{
+ int cpu;
+ unsigned long flags;
+
+ WARN_ON_ONCE(!in_interrupt());
+ if (ehca_debug_level >= 3)
+ ehca_dmp(cpu_online_mask, cpumask_size(), "");
+
+ spin_lock_irqsave(&pool->last_cpu_lock, flags);
+ do {
+ cpu = cpumask_next(pool->last_cpu, cpu_online_mask);
+ if (cpu >= nr_cpu_ids)
+ cpu = cpumask_first(cpu_online_mask);
+ pool->last_cpu = cpu;
+ } while (!per_cpu_ptr(pool->cpu_comp_tasks, cpu)->active);
+ spin_unlock_irqrestore(&pool->last_cpu_lock, flags);
+
+ return cpu;
+}
+
+static void __queue_comp_task(struct ehca_cq *__cq,
+ struct ehca_cpu_comp_task *cct,
+ struct task_struct *thread)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cct->task_lock, flags);
+ spin_lock(&__cq->task_lock);
+
+ if (__cq->nr_callbacks == 0) {
+ __cq->nr_callbacks++;
+ list_add_tail(&__cq->entry, &cct->cq_list);
+ cct->cq_jobs++;
+ wake_up_process(thread);
+ } else
+ __cq->nr_callbacks++;
+
+ spin_unlock(&__cq->task_lock);
+ spin_unlock_irqrestore(&cct->task_lock, flags);
+}
+
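+/* Pick a CPU (retrying once if busy) and queue the CQ on its comp task. */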
+static void queue_comp_task(struct ehca_cq *__cq)
+{
+ int cpu_id;
+ struct ehca_cpu_comp_task *cct;
+ struct task_struct *thread;
+ int cq_jobs;
+ unsigned long flags;
+
+ cpu_id = find_next_online_cpu(pool);
+ BUG_ON(!cpu_online(cpu_id));
+
+ cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
+ thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id);
+ BUG_ON(!cct || !thread);
+
+ spin_lock_irqsave(&cct->task_lock, flags);
+ cq_jobs = cct->cq_jobs;
+ spin_unlock_irqrestore(&cct->task_lock, flags);
+ if (cq_jobs > 0) {
+ cpu_id = find_next_online_cpu(pool);
+ cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu_id);
+ thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu_id);
+ BUG_ON(!cct || !thread);
+ }
+ __queue_comp_task(__cq, cct, thread);
+}
+
+static void run_comp_task(struct ehca_cpu_comp_task *cct)
+{
+ struct ehca_cq *cq;
+
+ while (!list_empty(&cct->cq_list)) {
+ cq = list_entry(cct->cq_list.next, struct ehca_cq, entry);
+ spin_unlock_irq(&cct->task_lock);
+
+ comp_event_callback(cq);
+ if (atomic_dec_and_test(&cq->nr_events))
+ wake_up(&cq->wait_completion);
+
+ spin_lock_irq(&cct->task_lock);
+ spin_lock(&cq->task_lock);
+ cq->nr_callbacks--;
+ if (!cq->nr_callbacks) {
+ list_del_init(cct->cq_list.next);
+ cct->cq_jobs--;
+ }
+ spin_unlock(&cq->task_lock);
+ }
+}
+
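+/* On CPU park, move the CPU's queued CQs to another online comp task. */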
+static void comp_task_park(unsigned int cpu)
+{
+ struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+ struct ehca_cpu_comp_task *target;
+ struct task_struct *thread;
+ struct ehca_cq *cq, *tmp;
+ LIST_HEAD(list);
+
+ spin_lock_irq(&cct->task_lock);
+ cct->cq_jobs = 0;
+ cct->active = 0;
+ list_splice_init(&cct->cq_list, &list);
+ spin_unlock_irq(&cct->task_lock);
+
+ cpu = find_next_online_cpu(pool);
+ target = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+ thread = *per_cpu_ptr(pool->cpu_comp_threads, cpu);
+ spin_lock_irq(&target->task_lock);
+ list_for_each_entry_safe(cq, tmp, &list, entry) {
+ list_del(&cq->entry);
+ __queue_comp_task(cq, target, thread);
+ }
+ spin_unlock_irq(&target->task_lock);
+}
+
+static void comp_task_stop(unsigned int cpu, bool online)
+{
+ struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+
+ spin_lock_irq(&cct->task_lock);
+ cct->cq_jobs = 0;
+ cct->active = 0;
+ WARN_ON(!list_empty(&cct->cq_list));
+ spin_unlock_irq(&cct->task_lock);
+}
+
+static int comp_task_should_run(unsigned int cpu)
+{
+ struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+
+ return cct->cq_jobs;
+}
+
+static void comp_task(unsigned int cpu)
+{
+ struct ehca_cpu_comp_task *cct = this_cpu_ptr(pool->cpu_comp_tasks);
+ int cql_empty;
+
+ spin_lock_irq(&cct->task_lock);
+ cql_empty = list_empty(&cct->cq_list);
+ if (!cql_empty) {
+ __set_current_state(TASK_RUNNING);
+ run_comp_task(cct);
+ }
+ spin_unlock_irq(&cct->task_lock);
+}
+
+static struct smp_hotplug_thread comp_pool_threads = {
+ .thread_should_run = comp_task_should_run,
+ .thread_fn = comp_task,
+ .thread_comm = "ehca_comp/%u",
+ .cleanup = comp_task_stop,
+ .park = comp_task_park,
+};
+
+int ehca_create_comp_pool(void)
+{
+ int cpu, ret = -ENOMEM;
+
+ if (!ehca_scaling_code)
+ return 0;
+
+ pool = kzalloc(sizeof(struct ehca_comp_pool), GFP_KERNEL);
+ if (pool == NULL)
+ return -ENOMEM;
+
+ spin_lock_init(&pool->last_cpu_lock);
+ pool->last_cpu = cpumask_any(cpu_online_mask);
+
+ pool->cpu_comp_tasks = alloc_percpu(struct ehca_cpu_comp_task);
+ if (!pool->cpu_comp_tasks)
+ goto out_pool;
+
+ pool->cpu_comp_threads = alloc_percpu(struct task_struct *);
+ if (!pool->cpu_comp_threads)
+ goto out_tasks;
+
+ for_each_present_cpu(cpu) {
+ struct ehca_cpu_comp_task *cct;
+
+ cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
+ spin_lock_init(&cct->task_lock);
+ INIT_LIST_HEAD(&cct->cq_list);
+ }
+
+ comp_pool_threads.store = pool->cpu_comp_threads;
+ ret = smpboot_register_percpu_thread(&comp_pool_threads);
+ if (ret)
+ goto out_threads;
+
+ pr_info("eHCA scaling code enabled\n");
+ return ret;
+
+out_threads:
+ free_percpu(pool->cpu_comp_threads);
+out_tasks:
+ free_percpu(pool->cpu_comp_tasks);
+out_pool:
+ kfree(pool);
+ return ret;
+}
+
+void ehca_destroy_comp_pool(void)
+{
+ if (!ehca_scaling_code)
+ return;
+
+ smpboot_unregister_percpu_thread(&comp_pool_threads);
+
+ free_percpu(pool->cpu_comp_threads);
+ free_percpu(pool->cpu_comp_tasks);
+ kfree(pool);
+}
diff --git a/drivers/infiniband/hw/ehca/ehca_irq.h b/drivers/infiniband/hw/ehca/ehca_irq.h
new file mode 100644
index 000000000..5370199f0
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_irq.h
@@ -0,0 +1,77 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * Function definitions and structs for EQs, NEQs and interrupts
+ *
+ * Authors: Heiko J Schick <schickhj@de.ibm.com>
+ * Khadija Souissi <souissi@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __EHCA_IRQ_H
+#define __EHCA_IRQ_H
+
+
+struct ehca_shca;
+
+#include <linux/interrupt.h>
+#include <linux/types.h>
+
+int ehca_error_data(struct ehca_shca *shca, void *data, u64 resource);
+
+irqreturn_t ehca_interrupt_neq(int irq, void *dev_id);
+void ehca_tasklet_neq(unsigned long data);
+
+irqreturn_t ehca_interrupt_eq(int irq, void *dev_id);
+void ehca_tasklet_eq(unsigned long data);
+void ehca_process_eq(struct ehca_shca *shca, int is_irq);
+
+struct ehca_cpu_comp_task {
+ struct list_head cq_list;
+ spinlock_t task_lock;
+ int cq_jobs;
+ int active;
+};
+
+struct ehca_comp_pool {
+ struct ehca_cpu_comp_task __percpu *cpu_comp_tasks;
+ struct task_struct * __percpu *cpu_comp_threads;
+ int last_cpu;
+ spinlock_t last_cpu_lock;
+};
+
+int ehca_create_comp_pool(void);
+void ehca_destroy_comp_pool(void);
+
+#endif
diff --git a/drivers/infiniband/hw/ehca/ehca_iverbs.h b/drivers/infiniband/hw/ehca/ehca_iverbs.h
new file mode 100644
index 000000000..22f79afa7
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_iverbs.h
@@ -0,0 +1,212 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * Function definitions for internal functions
+ *
+ * Authors: Heiko J Schick <schickhj@de.ibm.com>
+ * Dietmar Decker <ddecker@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __EHCA_IVERBS_H__
+#define __EHCA_IVERBS_H__
+
+#include "ehca_classes.h"
+
+int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props);
+
+int ehca_query_port(struct ib_device *ibdev, u8 port,
+ struct ib_port_attr *props);
+
+int ehca_query_sma_attr(struct ehca_shca *shca, u8 port,
+ struct ehca_sma_attr *attr);
+
+int ehca_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 * pkey);
+
+int ehca_query_gid(struct ib_device *ibdev, u8 port, int index,
+ union ib_gid *gid);
+
+int ehca_modify_port(struct ib_device *ibdev, u8 port, int port_modify_mask,
+ struct ib_port_modify *props);
+
+struct ib_pd *ehca_alloc_pd(struct ib_device *device,
+ struct ib_ucontext *context,
+ struct ib_udata *udata);
+
+int ehca_dealloc_pd(struct ib_pd *pd);
+
+struct ib_ah *ehca_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
+
+int ehca_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
+
+int ehca_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr);
+
+int ehca_destroy_ah(struct ib_ah *ah);
+
+struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags);
+
+struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ int mr_access_flags, u64 *iova_start);
+
+struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt, int mr_access_flags,
+ struct ib_udata *udata);
+
+int ehca_rereg_phys_mr(struct ib_mr *mr,
+ int mr_rereg_mask,
+ struct ib_pd *pd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf, int mr_access_flags, u64 *iova_start);
+
+int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr);
+
+int ehca_dereg_mr(struct ib_mr *mr);
+
+struct ib_mw *ehca_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);
+
+int ehca_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
+ struct ib_mw_bind *mw_bind);
+
+int ehca_dealloc_mw(struct ib_mw *mw);
+
+struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd,
+ int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr);
+
+int ehca_map_phys_fmr(struct ib_fmr *fmr,
+ u64 *page_list, int list_len, u64 iova);
+
+int ehca_unmap_fmr(struct list_head *fmr_list);
+
+int ehca_dealloc_fmr(struct ib_fmr *fmr);
+
+enum ehca_eq_type {
+ EHCA_EQ = 0, /* Event Queue */
+ EHCA_NEQ /* Notification Event Queue */
+};
+
+int ehca_create_eq(struct ehca_shca *shca, struct ehca_eq *eq,
+ enum ehca_eq_type type, const u32 length);
+
+int ehca_destroy_eq(struct ehca_shca *shca, struct ehca_eq *eq);
+
+void *ehca_poll_eq(struct ehca_shca *shca, struct ehca_eq *eq);
+
+
+struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector,
+ struct ib_ucontext *context,
+ struct ib_udata *udata);
+
+int ehca_destroy_cq(struct ib_cq *cq);
+
+int ehca_resize_cq(struct ib_cq *cq, int cqe, struct ib_udata *udata);
+
+int ehca_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc);
+
+int ehca_peek_cq(struct ib_cq *cq, int wc_cnt);
+
+int ehca_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags notify_flags);
+
+struct ib_qp *ehca_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata);
+
+int ehca_destroy_qp(struct ib_qp *qp);
+
+int ehca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
+ struct ib_udata *udata);
+
+int ehca_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
+ int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr);
+
+int ehca_post_send(struct ib_qp *qp, struct ib_send_wr *send_wr,
+ struct ib_send_wr **bad_send_wr);
+
+int ehca_post_recv(struct ib_qp *qp, struct ib_recv_wr *recv_wr,
+ struct ib_recv_wr **bad_recv_wr);
+
+int ehca_post_srq_recv(struct ib_srq *srq,
+ struct ib_recv_wr *recv_wr,
+ struct ib_recv_wr **bad_recv_wr);
+
+struct ib_srq *ehca_create_srq(struct ib_pd *pd,
+ struct ib_srq_init_attr *init_attr,
+ struct ib_udata *udata);
+
+int ehca_modify_srq(struct ib_srq *srq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
+
+int ehca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
+
+int ehca_destroy_srq(struct ib_srq *srq);
+
+u64 ehca_define_sqp(struct ehca_shca *shca, struct ehca_qp *ibqp,
+ struct ib_qp_init_attr *qp_init_attr);
+
+int ehca_attach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
+
+int ehca_detach_mcast(struct ib_qp *qp, union ib_gid *gid, u16 lid);
+
+struct ib_ucontext *ehca_alloc_ucontext(struct ib_device *device,
+ struct ib_udata *udata);
+
+int ehca_dealloc_ucontext(struct ib_ucontext *context);
+
+int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
+
+int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+ struct ib_wc *in_wc, struct ib_grh *in_grh,
+ struct ib_mad *in_mad,
+ struct ib_mad *out_mad);
+
+void ehca_poll_eqs(unsigned long data);
+
+int ehca_calc_ipd(struct ehca_shca *shca, int port,
+ enum ib_rate path_rate, u32 *ipd);
+
+void ehca_add_to_err_list(struct ehca_qp *qp, int on_sq);
+
+#ifdef CONFIG_PPC_64K_PAGES
+void *ehca_alloc_fw_ctrlblock(gfp_t flags);
+void ehca_free_fw_ctrlblock(void *ptr);
+#else
+#define ehca_alloc_fw_ctrlblock(flags) ((void *)get_zeroed_page(flags))
+#define ehca_free_fw_ctrlblock(ptr) free_page((unsigned long)(ptr))
+#endif
+
+void ehca_recover_sqp(struct ib_qp *sqp);
+
+#endif
diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c
new file mode 100644
index 000000000..cd8d290a0
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_main.c
@@ -0,0 +1,1100 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * module start stop, hca detection
+ *
+ * Authors: Heiko J Schick <schickhj@de.ibm.com>
+ * Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ * Joachim Fenkes <fenkes@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef CONFIG_PPC_64K_PAGES
+#include <linux/slab.h>
+#endif
+
+#include <linux/notifier.h>
+#include <linux/memory.h>
+#include "ehca_classes.h"
+#include "ehca_iverbs.h"
+#include "ehca_mrmw.h"
+#include "ehca_tools.h"
+#include "hcp_if.h"
+
+#define HCAD_VERSION "0029"
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Christoph Raisch <raisch@de.ibm.com>");
+MODULE_DESCRIPTION("IBM eServer HCA InfiniBand Device Driver");
+MODULE_VERSION(HCAD_VERSION);
+
+static bool ehca_open_aqp1 = 0;
+static int ehca_hw_level = 0;
+static bool ehca_poll_all_eqs = 1;
+
+int ehca_debug_level = 0;
+int ehca_nr_ports = -1;
+bool ehca_use_hp_mr = 0;
+int ehca_port_act_time = 30;
+int ehca_static_rate = -1;
+bool ehca_scaling_code = 0;
+int ehca_lock_hcalls = -1;
+int ehca_max_cq = -1;
+int ehca_max_qp = -1;
+
+module_param_named(open_aqp1, ehca_open_aqp1, bool, S_IRUGO);
+module_param_named(debug_level, ehca_debug_level, int, S_IRUGO);
+module_param_named(hw_level, ehca_hw_level, int, S_IRUGO);
+module_param_named(nr_ports, ehca_nr_ports, int, S_IRUGO);
+module_param_named(use_hp_mr, ehca_use_hp_mr, bool, S_IRUGO);
+module_param_named(port_act_time, ehca_port_act_time, int, S_IRUGO);
+module_param_named(poll_all_eqs, ehca_poll_all_eqs, bool, S_IRUGO);
+module_param_named(static_rate, ehca_static_rate, int, S_IRUGO);
+module_param_named(scaling_code, ehca_scaling_code, bool, S_IRUGO);
+module_param_named(lock_hcalls, ehca_lock_hcalls, bint, S_IRUGO);
+module_param_named(number_of_cqs, ehca_max_cq, int, S_IRUGO);
+module_param_named(number_of_qps, ehca_max_qp, int, S_IRUGO);
+
+MODULE_PARM_DESC(open_aqp1,
+ "Open AQP1 on startup (default: no)");
+MODULE_PARM_DESC(debug_level,
+ "Amount of debug output (0: none (default), 1: traces, "
+ "2: some dumps, 3: lots)");
+MODULE_PARM_DESC(hw_level,
+ "Hardware level (0: autosensing (default), "
+ "0x10..0x14: eHCA, 0x20..0x23: eHCA2)");
+MODULE_PARM_DESC(nr_ports,
+ "number of connected ports (-1: autodetect (default), "
+ "1: port one only, 2: two ports)");
+MODULE_PARM_DESC(use_hp_mr,
+ "Use high performance MRs (default: no)");
+MODULE_PARM_DESC(port_act_time,
+ "Time to wait for port activation (default: 30 sec)");
+MODULE_PARM_DESC(poll_all_eqs,
+ "Poll all event queues periodically (default: yes)");
+MODULE_PARM_DESC(static_rate,
+ "Set permanent static rate (default: no static rate)");
+MODULE_PARM_DESC(scaling_code,
+ "Enable scaling code (default: no)");
+MODULE_PARM_DESC(lock_hcalls,
+ "Serialize all hCalls made by the driver "
+ "(default: autodetect)");
+MODULE_PARM_DESC(number_of_cqs,
+ "Max number of CQs which can be allocated "
+ "(default: autodetect)");
+MODULE_PARM_DESC(number_of_qps,
+ "Max number of QPs which can be allocated "
+ "(default: autodetect)");
+
+DEFINE_RWLOCK(ehca_qp_idr_lock);
+DEFINE_RWLOCK(ehca_cq_idr_lock);
+DEFINE_IDR(ehca_qp_idr);
+DEFINE_IDR(ehca_cq_idr);
+
+static LIST_HEAD(shca_list); /* list of all registered ehcas */
+DEFINE_SPINLOCK(shca_list_lock);
+
+static struct timer_list poll_eqs_timer;
+
+#ifdef CONFIG_PPC_64K_PAGES
+static struct kmem_cache *ctblk_cache;
+
+void *ehca_alloc_fw_ctrlblock(gfp_t flags)
+{
+ void *ret = kmem_cache_zalloc(ctblk_cache, flags);
+ if (!ret)
+ ehca_gen_err("Out of memory for ctblk");
+ return ret;
+}
+
+void ehca_free_fw_ctrlblock(void *ptr)
+{
+ if (ptr)
+ kmem_cache_free(ctblk_cache, ptr);
+}
+#endif
+
+int ehca2ib_return_code(u64 ehca_rc)
+{
+ switch (ehca_rc) {
+ case H_SUCCESS:
+ return 0;
+ case H_RESOURCE: /* Resource in use */
+ case H_BUSY:
+ return -EBUSY;
+ case H_NOT_ENOUGH_RESOURCES: /* insufficient resources */
+ case H_CONSTRAINED: /* resource constraint */
+ case H_NO_MEM:
+ return -ENOMEM;
+ default:
+ return -EINVAL;
+ }
+}
+
+static int ehca_create_slab_caches(void)
+{
+ int ret;
+
+ ret = ehca_init_pd_cache();
+ if (ret) {
+ ehca_gen_err("Cannot create PD SLAB cache.");
+ return ret;
+ }
+
+ ret = ehca_init_cq_cache();
+ if (ret) {
+ ehca_gen_err("Cannot create CQ SLAB cache.");
+ goto create_slab_caches2;
+ }
+
+ ret = ehca_init_qp_cache();
+ if (ret) {
+ ehca_gen_err("Cannot create QP SLAB cache.");
+ goto create_slab_caches3;
+ }
+
+ ret = ehca_init_av_cache();
+ if (ret) {
+ ehca_gen_err("Cannot create AV SLAB cache.");
+ goto create_slab_caches4;
+ }
+
+ ret = ehca_init_mrmw_cache();
+ if (ret) {
+ ehca_gen_err("Cannot create MR&MW SLAB cache.");
+ goto create_slab_caches5;
+ }
+
+ ret = ehca_init_small_qp_cache();
+ if (ret) {
+ ehca_gen_err("Cannot create small queue SLAB cache.");
+ goto create_slab_caches6;
+ }
+
+#ifdef CONFIG_PPC_64K_PAGES
+ ctblk_cache = kmem_cache_create("ehca_cache_ctblk",
+ EHCA_PAGESIZE, H_CB_ALIGNMENT,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!ctblk_cache) {
+ ehca_gen_err("Cannot create ctblk SLAB cache.");
+ ehca_cleanup_small_qp_cache();
+ ret = -ENOMEM;
+ goto create_slab_caches6;
+ }
+#endif
+ return 0;
+
+create_slab_caches6:
+ ehca_cleanup_mrmw_cache();
+
+create_slab_caches5:
+ ehca_cleanup_av_cache();
+
+create_slab_caches4:
+ ehca_cleanup_qp_cache();
+
+create_slab_caches3:
+ ehca_cleanup_cq_cache();
+
+create_slab_caches2:
+ ehca_cleanup_pd_cache();
+
+ return ret;
+}
+
+static void ehca_destroy_slab_caches(void)
+{
+ ehca_cleanup_small_qp_cache();
+ ehca_cleanup_mrmw_cache();
+ ehca_cleanup_av_cache();
+ ehca_cleanup_qp_cache();
+ ehca_cleanup_cq_cache();
+ ehca_cleanup_pd_cache();
+#ifdef CONFIG_PPC_64K_PAGES
+ if (ctblk_cache)
+ kmem_cache_destroy(ctblk_cache);
+#endif
+}
+
+#define EHCA_HCAAVER EHCA_BMASK_IBM(32, 39)
+#define EHCA_REVID EHCA_BMASK_IBM(40, 63)
+
+static struct cap_descr {
+ u64 mask;
+ char *descr;
+} hca_cap_descr[] = {
+ { HCA_CAP_AH_PORT_NR_CHECK, "HCA_CAP_AH_PORT_NR_CHECK" },
+ { HCA_CAP_ATOMIC, "HCA_CAP_ATOMIC" },
+ { HCA_CAP_AUTO_PATH_MIG, "HCA_CAP_AUTO_PATH_MIG" },
+ { HCA_CAP_BAD_P_KEY_CTR, "HCA_CAP_BAD_P_KEY_CTR" },
+ { HCA_CAP_SQD_RTS_PORT_CHANGE, "HCA_CAP_SQD_RTS_PORT_CHANGE" },
+ { HCA_CAP_CUR_QP_STATE_MOD, "HCA_CAP_CUR_QP_STATE_MOD" },
+ { HCA_CAP_INIT_TYPE, "HCA_CAP_INIT_TYPE" },
+ { HCA_CAP_PORT_ACTIVE_EVENT, "HCA_CAP_PORT_ACTIVE_EVENT" },
+ { HCA_CAP_Q_KEY_VIOL_CTR, "HCA_CAP_Q_KEY_VIOL_CTR" },
+ { HCA_CAP_WQE_RESIZE, "HCA_CAP_WQE_RESIZE" },
+ { HCA_CAP_RAW_PACKET_MCAST, "HCA_CAP_RAW_PACKET_MCAST" },
+ { HCA_CAP_SHUTDOWN_PORT, "HCA_CAP_SHUTDOWN_PORT" },
+ { HCA_CAP_RC_LL_QP, "HCA_CAP_RC_LL_QP" },
+ { HCA_CAP_SRQ, "HCA_CAP_SRQ" },
+ { HCA_CAP_UD_LL_QP, "HCA_CAP_UD_LL_QP" },
+ { HCA_CAP_RESIZE_MR, "HCA_CAP_RESIZE_MR" },
+ { HCA_CAP_MINI_QP, "HCA_CAP_MINI_QP" },
+ { HCA_CAP_H_ALLOC_RES_SYNC, "HCA_CAP_H_ALLOC_RES_SYNC" },
+};
+
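+/* Query adapter properties from firmware: ports, hardware level,
+ * capabilities, MR page sizes, CQ/QP limits and max MTU.
+ */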
+static int ehca_sense_attributes(struct ehca_shca *shca)
+{
+ int i, ret = 0;
+ u64 h_ret;
+ struct hipz_query_hca *rblock;
+ struct hipz_query_port *port;
+ const char *loc_code;
+
+ static const u32 pgsize_map[] = {
+ HCA_CAP_MR_PGSIZE_4K, 0x1000,
+ HCA_CAP_MR_PGSIZE_64K, 0x10000,
+ HCA_CAP_MR_PGSIZE_1M, 0x100000,
+ HCA_CAP_MR_PGSIZE_16M, 0x1000000,
+ };
+
+ ehca_gen_dbg("Probing adapter %s...",
+ shca->ofdev->dev.of_node->full_name);
+ loc_code = of_get_property(shca->ofdev->dev.of_node, "ibm,loc-code",
+ NULL);
+ if (loc_code)
+		ehca_gen_dbg(" ... location code=%s", loc_code);
+
+ rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+ if (!rblock) {
+ ehca_gen_err("Cannot allocate rblock memory.");
+ return -ENOMEM;
+ }
+
+ h_ret = hipz_h_query_hca(shca->ipz_hca_handle, rblock);
+ if (h_ret != H_SUCCESS) {
+ ehca_gen_err("Cannot query device properties. h_ret=%lli",
+ h_ret);
+ ret = -EPERM;
+ goto sense_attributes1;
+ }
+
+ if (ehca_nr_ports == 1)
+ shca->num_ports = 1;
+ else
+ shca->num_ports = (u8)rblock->num_ports;
+
+ ehca_gen_dbg(" ... found %x ports", rblock->num_ports);
+
+ if (ehca_hw_level == 0) {
+ u32 hcaaver;
+ u32 revid;
+
+ hcaaver = EHCA_BMASK_GET(EHCA_HCAAVER, rblock->hw_ver);
+ revid = EHCA_BMASK_GET(EHCA_REVID, rblock->hw_ver);
+
+ ehca_gen_dbg(" ... hardware version=%x:%x", hcaaver, revid);
+
+ if (hcaaver == 1) {
+ if (revid <= 3)
+ shca->hw_level = 0x10 | (revid + 1);
+ else
+ shca->hw_level = 0x14;
+ } else if (hcaaver == 2) {
+ if (revid == 0)
+ shca->hw_level = 0x21;
+ else if (revid == 0x10)
+ shca->hw_level = 0x22;
+ else if (revid == 0x20 || revid == 0x21)
+ shca->hw_level = 0x23;
+ }
+
+ if (!shca->hw_level) {
+ ehca_gen_warn("unknown hardware version"
+ " - assuming default level");
+ shca->hw_level = 0x22;
+ }
+ } else
+ shca->hw_level = ehca_hw_level;
+ ehca_gen_dbg(" ... hardware level=%x", shca->hw_level);
+
+ shca->hca_cap = rblock->hca_cap_indicators;
+ ehca_gen_dbg(" ... HCA capabilities:");
+ for (i = 0; i < ARRAY_SIZE(hca_cap_descr); i++)
+ if (EHCA_BMASK_GET(hca_cap_descr[i].mask, shca->hca_cap))
+ ehca_gen_dbg(" %s", hca_cap_descr[i].descr);
+
+ /* Autodetect hCall locking -- the "H_ALLOC_RESOURCE synced" flag is
+ * a firmware property, so it's valid across all adapters
+ */
+ if (ehca_lock_hcalls == -1)
+ ehca_lock_hcalls = !EHCA_BMASK_GET(HCA_CAP_H_ALLOC_RES_SYNC,
+ shca->hca_cap);
+
+ /* translate supported MR page sizes; always support 4K */
+ shca->hca_cap_mr_pgsize = EHCA_PAGESIZE;
+ for (i = 0; i < ARRAY_SIZE(pgsize_map); i += 2)
+ if (rblock->memory_page_size_supported & pgsize_map[i])
+ shca->hca_cap_mr_pgsize |= pgsize_map[i + 1];
+
+ /* Set maximum number of CQs and QPs to calculate EQ size */
+ if (shca->max_num_qps == -1)
+ shca->max_num_qps = min_t(int, rblock->max_qp,
+ EHCA_MAX_NUM_QUEUES);
+ else if (shca->max_num_qps < 1 || shca->max_num_qps > rblock->max_qp) {
+ ehca_gen_warn("The requested number of QPs is out of range "
+ "(1 - %i) specified by HW. Value is set to %i",
+ rblock->max_qp, rblock->max_qp);
+ shca->max_num_qps = rblock->max_qp;
+ }
+
+ if (shca->max_num_cqs == -1)
+ shca->max_num_cqs = min_t(int, rblock->max_cq,
+ EHCA_MAX_NUM_QUEUES);
+	else if (shca->max_num_cqs < 1 || shca->max_num_cqs > rblock->max_cq) {
+		ehca_gen_warn("The requested number of CQs is out of range "
+			"(1 - %i) specified by HW. Value is set to %i",
+			rblock->max_cq, rblock->max_cq);
+		shca->max_num_cqs = rblock->max_cq;
+	}
+
+ /* query max MTU from first port -- it's the same for all ports */
+ port = (struct hipz_query_port *)rblock;
+ h_ret = hipz_h_query_port(shca->ipz_hca_handle, 1, port);
+ if (h_ret != H_SUCCESS) {
+ ehca_gen_err("Cannot query port properties. h_ret=%lli",
+ h_ret);
+ ret = -EPERM;
+ goto sense_attributes1;
+ }
+
+ shca->max_mtu = port->max_mtu;
+
+sense_attributes1:
+ ehca_free_fw_ctrlblock(rblock);
+ return ret;
+}
+
+static int init_node_guid(struct ehca_shca *shca)
+{
+ int ret = 0;
+ struct hipz_query_hca *rblock;
+
+ rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+ if (!rblock) {
+ ehca_err(&shca->ib_device, "Can't allocate rblock memory.");
+ return -ENOMEM;
+ }
+
+ if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "Can't query device properties");
+ ret = -EINVAL;
+ goto init_node_guid1;
+ }
+
+ memcpy(&shca->ib_device.node_guid, &rblock->node_guid, sizeof(u64));
+
+init_node_guid1:
+ ehca_free_fw_ctrlblock(rblock);
+ return ret;
+}
+
+static int ehca_init_device(struct ehca_shca *shca)
+{
+ int ret;
+
+ ret = init_node_guid(shca);
+ if (ret)
+ return ret;
+
+ strlcpy(shca->ib_device.name, "ehca%d", IB_DEVICE_NAME_MAX);
+ shca->ib_device.owner = THIS_MODULE;
+
+ shca->ib_device.uverbs_abi_ver = 8;
+ shca->ib_device.uverbs_cmd_mask =
+ (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+ (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_REG_MR) |
+ (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_QP) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+ (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) |
+ (1ull << IB_USER_VERBS_CMD_DETACH_MCAST);
+
+ shca->ib_device.node_type = RDMA_NODE_IB_CA;
+ shca->ib_device.phys_port_cnt = shca->num_ports;
+ shca->ib_device.num_comp_vectors = 1;
+ shca->ib_device.dma_device = &shca->ofdev->dev;
+ shca->ib_device.query_device = ehca_query_device;
+ shca->ib_device.query_port = ehca_query_port;
+ shca->ib_device.query_gid = ehca_query_gid;
+ shca->ib_device.query_pkey = ehca_query_pkey;
+ /* shca->in_device.modify_device = ehca_modify_device */
+ shca->ib_device.modify_port = ehca_modify_port;
+ shca->ib_device.alloc_ucontext = ehca_alloc_ucontext;
+ shca->ib_device.dealloc_ucontext = ehca_dealloc_ucontext;
+ shca->ib_device.alloc_pd = ehca_alloc_pd;
+ shca->ib_device.dealloc_pd = ehca_dealloc_pd;
+ shca->ib_device.create_ah = ehca_create_ah;
+ /* shca->ib_device.modify_ah = ehca_modify_ah; */
+ shca->ib_device.query_ah = ehca_query_ah;
+ shca->ib_device.destroy_ah = ehca_destroy_ah;
+ shca->ib_device.create_qp = ehca_create_qp;
+ shca->ib_device.modify_qp = ehca_modify_qp;
+ shca->ib_device.query_qp = ehca_query_qp;
+ shca->ib_device.destroy_qp = ehca_destroy_qp;
+ shca->ib_device.post_send = ehca_post_send;
+ shca->ib_device.post_recv = ehca_post_recv;
+ shca->ib_device.create_cq = ehca_create_cq;
+ shca->ib_device.destroy_cq = ehca_destroy_cq;
+ shca->ib_device.resize_cq = ehca_resize_cq;
+ shca->ib_device.poll_cq = ehca_poll_cq;
+ /* shca->ib_device.peek_cq = ehca_peek_cq; */
+ shca->ib_device.req_notify_cq = ehca_req_notify_cq;
+ /* shca->ib_device.req_ncomp_notif = ehca_req_ncomp_notif; */
+ shca->ib_device.get_dma_mr = ehca_get_dma_mr;
+ shca->ib_device.reg_phys_mr = ehca_reg_phys_mr;
+ shca->ib_device.reg_user_mr = ehca_reg_user_mr;
+ shca->ib_device.query_mr = ehca_query_mr;
+ shca->ib_device.dereg_mr = ehca_dereg_mr;
+ shca->ib_device.rereg_phys_mr = ehca_rereg_phys_mr;
+ shca->ib_device.alloc_mw = ehca_alloc_mw;
+ shca->ib_device.bind_mw = ehca_bind_mw;
+ shca->ib_device.dealloc_mw = ehca_dealloc_mw;
+ shca->ib_device.alloc_fmr = ehca_alloc_fmr;
+ shca->ib_device.map_phys_fmr = ehca_map_phys_fmr;
+ shca->ib_device.unmap_fmr = ehca_unmap_fmr;
+ shca->ib_device.dealloc_fmr = ehca_dealloc_fmr;
+ shca->ib_device.attach_mcast = ehca_attach_mcast;
+ shca->ib_device.detach_mcast = ehca_detach_mcast;
+ shca->ib_device.process_mad = ehca_process_mad;
+ shca->ib_device.mmap = ehca_mmap;
+ shca->ib_device.dma_ops = &ehca_dma_mapping_ops;
+
+ if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) {
+ shca->ib_device.uverbs_cmd_mask |=
+ (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ);
+
+ shca->ib_device.create_srq = ehca_create_srq;
+ shca->ib_device.modify_srq = ehca_modify_srq;
+ shca->ib_device.query_srq = ehca_query_srq;
+ shca->ib_device.destroy_srq = ehca_destroy_srq;
+ shca->ib_device.post_srq_recv = ehca_post_srq_recv;
+ }
+
+ return ret;
+}
+
+static int ehca_create_aqp1(struct ehca_shca *shca, u32 port)
+{
+ struct ehca_sport *sport = &shca->sport[port - 1];
+ struct ib_cq *ibcq;
+ struct ib_qp *ibqp;
+ struct ib_qp_init_attr qp_init_attr;
+ int ret;
+
+ if (sport->ibcq_aqp1) {
+ ehca_err(&shca->ib_device, "AQP1 CQ is already created.");
+ return -EPERM;
+ }
+
+ ibcq = ib_create_cq(&shca->ib_device, NULL, NULL, (void *)(-1), 10, 0);
+ if (IS_ERR(ibcq)) {
+ ehca_err(&shca->ib_device, "Cannot create AQP1 CQ.");
+ return PTR_ERR(ibcq);
+ }
+ sport->ibcq_aqp1 = ibcq;
+
+ if (sport->ibqp_sqp[IB_QPT_GSI]) {
+ ehca_err(&shca->ib_device, "AQP1 QP is already created.");
+ ret = -EPERM;
+ goto create_aqp1;
+ }
+
+ memset(&qp_init_attr, 0, sizeof(struct ib_qp_init_attr));
+ qp_init_attr.send_cq = ibcq;
+ qp_init_attr.recv_cq = ibcq;
+ qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+ qp_init_attr.cap.max_send_wr = 100;
+ qp_init_attr.cap.max_recv_wr = 100;
+ qp_init_attr.cap.max_send_sge = 2;
+ qp_init_attr.cap.max_recv_sge = 1;
+ qp_init_attr.qp_type = IB_QPT_GSI;
+ qp_init_attr.port_num = port;
+ qp_init_attr.qp_context = NULL;
+ qp_init_attr.event_handler = NULL;
+ qp_init_attr.srq = NULL;
+
+ ibqp = ib_create_qp(&shca->pd->ib_pd, &qp_init_attr);
+ if (IS_ERR(ibqp)) {
+ ehca_err(&shca->ib_device, "Cannot create AQP1 QP.");
+ ret = PTR_ERR(ibqp);
+ goto create_aqp1;
+ }
+ sport->ibqp_sqp[IB_QPT_GSI] = ibqp;
+
+ return 0;
+
+create_aqp1:
+ ib_destroy_cq(sport->ibcq_aqp1);
+ return ret;
+}
+
+static int ehca_destroy_aqp1(struct ehca_sport *sport)
+{
+ int ret;
+
+ ret = ib_destroy_qp(sport->ibqp_sqp[IB_QPT_GSI]);
+ if (ret) {
+ ehca_gen_err("Cannot destroy AQP1 QP. ret=%i", ret);
+ return ret;
+ }
+
+ ret = ib_destroy_cq(sport->ibcq_aqp1);
+ if (ret)
+ ehca_gen_err("Cannot destroy AQP1 CQ. ret=%i", ret);
+
+ return ret;
+}
+
+static ssize_t ehca_show_debug_level(struct device_driver *ddp, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%d\n", ehca_debug_level);
+}
+
+static ssize_t ehca_store_debug_level(struct device_driver *ddp,
+ const char *buf, size_t count)
+{
+ int value = (*buf) - '0';
+ if (value >= 0 && value <= 9)
+ ehca_debug_level = value;
+ return 1;
+}
+
+static DRIVER_ATTR(debug_level, S_IRUSR | S_IWUSR,
+ ehca_show_debug_level, ehca_store_debug_level);
+
+static struct attribute *ehca_drv_attrs[] = {
+ &driver_attr_debug_level.attr,
+ NULL
+};
+
+static struct attribute_group ehca_drv_attr_grp = {
+ .attrs = ehca_drv_attrs
+};
+
+static const struct attribute_group *ehca_drv_attr_groups[] = {
+ &ehca_drv_attr_grp,
+ NULL,
+};
+
+#define EHCA_RESOURCE_ATTR(name) \
+static ssize_t ehca_show_##name(struct device *dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+{ \
+ struct ehca_shca *shca; \
+ struct hipz_query_hca *rblock; \
+ int data; \
+ \
+ shca = dev_get_drvdata(dev); \
+ \
+ rblock = ehca_alloc_fw_ctrlblock(GFP_KERNEL); \
+ if (!rblock) { \
+ dev_err(dev, "Can't allocate rblock memory.\n"); \
+ return 0; \
+ } \
+ \
+ if (hipz_h_query_hca(shca->ipz_hca_handle, rblock) != H_SUCCESS) { \
+ dev_err(dev, "Can't query device properties\n"); \
+ ehca_free_fw_ctrlblock(rblock); \
+ return 0; \
+ } \
+ \
+ data = rblock->name; \
+ ehca_free_fw_ctrlblock(rblock); \
+ \
+ if ((strcmp(#name, "num_ports") == 0) && (ehca_nr_ports == 1)) \
+ return snprintf(buf, 256, "1\n"); \
+ else \
+ return snprintf(buf, 256, "%d\n", data); \
+ \
+} \
+static DEVICE_ATTR(name, S_IRUGO, ehca_show_##name, NULL);
+
+EHCA_RESOURCE_ATTR(num_ports);
+EHCA_RESOURCE_ATTR(hw_ver);
+EHCA_RESOURCE_ATTR(max_eq);
+EHCA_RESOURCE_ATTR(cur_eq);
+EHCA_RESOURCE_ATTR(max_cq);
+EHCA_RESOURCE_ATTR(cur_cq);
+EHCA_RESOURCE_ATTR(max_qp);
+EHCA_RESOURCE_ATTR(cur_qp);
+EHCA_RESOURCE_ATTR(max_mr);
+EHCA_RESOURCE_ATTR(cur_mr);
+EHCA_RESOURCE_ATTR(max_mw);
+EHCA_RESOURCE_ATTR(cur_mw);
+EHCA_RESOURCE_ATTR(max_pd);
+EHCA_RESOURCE_ATTR(max_ah);
+
+static ssize_t ehca_show_adapter_handle(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ehca_shca *shca = dev_get_drvdata(dev);
+
+ return sprintf(buf, "%llx\n", shca->ipz_hca_handle.handle);
+
+}
+static DEVICE_ATTR(adapter_handle, S_IRUGO, ehca_show_adapter_handle, NULL);
+
+static struct attribute *ehca_dev_attrs[] = {
+ &dev_attr_adapter_handle.attr,
+ &dev_attr_num_ports.attr,
+ &dev_attr_hw_ver.attr,
+ &dev_attr_max_eq.attr,
+ &dev_attr_cur_eq.attr,
+ &dev_attr_max_cq.attr,
+ &dev_attr_cur_cq.attr,
+ &dev_attr_max_qp.attr,
+ &dev_attr_cur_qp.attr,
+ &dev_attr_max_mr.attr,
+ &dev_attr_cur_mr.attr,
+ &dev_attr_max_mw.attr,
+ &dev_attr_cur_mw.attr,
+ &dev_attr_max_pd.attr,
+ &dev_attr_max_ah.attr,
+ NULL
+};
+
+static struct attribute_group ehca_dev_attr_grp = {
+ .attrs = ehca_dev_attrs
+};
+
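+/* Probe one eHCA adapter: sense attributes, create EQ/NEQ, internal PD
+ * and max MR, register the IB device and optionally open AQP1.
+ */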
+static int ehca_probe(struct platform_device *dev)
+{
+ struct ehca_shca *shca;
+ const u64 *handle;
+ struct ib_pd *ibpd;
+ int ret, i, eq_size;
+ unsigned long flags;
+
+ handle = of_get_property(dev->dev.of_node, "ibm,hca-handle", NULL);
+ if (!handle) {
+ ehca_gen_err("Cannot get eHCA handle for adapter: %s.",
+ dev->dev.of_node->full_name);
+ return -ENODEV;
+ }
+
+ if (!(*handle)) {
+ ehca_gen_err("Wrong eHCA handle for adapter: %s.",
+ dev->dev.of_node->full_name);
+ return -ENODEV;
+ }
+
+ shca = (struct ehca_shca *)ib_alloc_device(sizeof(*shca));
+ if (!shca) {
+ ehca_gen_err("Cannot allocate shca memory.");
+ return -ENOMEM;
+ }
+
+ mutex_init(&shca->modify_mutex);
+ atomic_set(&shca->num_cqs, 0);
+ atomic_set(&shca->num_qps, 0);
+ shca->max_num_qps = ehca_max_qp;
+ shca->max_num_cqs = ehca_max_cq;
+
+ for (i = 0; i < ARRAY_SIZE(shca->sport); i++)
+ spin_lock_init(&shca->sport[i].mod_sqp_lock);
+
+ shca->ofdev = dev;
+ shca->ipz_hca_handle.handle = *handle;
+ dev_set_drvdata(&dev->dev, shca);
+
+ ret = ehca_sense_attributes(shca);
+ if (ret < 0) {
+ ehca_gen_err("Cannot sense eHCA attributes.");
+ goto probe1;
+ }
+
+ ret = ehca_init_device(shca);
+ if (ret) {
+ ehca_gen_err("Cannot init ehca device struct");
+ goto probe1;
+ }
+
+ eq_size = 2 * shca->max_num_cqs + 4 * shca->max_num_qps;
+ /* create event queues */
+ ret = ehca_create_eq(shca, &shca->eq, EHCA_EQ, eq_size);
+ if (ret) {
+ ehca_err(&shca->ib_device, "Cannot create EQ.");
+ goto probe1;
+ }
+
+ ret = ehca_create_eq(shca, &shca->neq, EHCA_NEQ, 513);
+ if (ret) {
+ ehca_err(&shca->ib_device, "Cannot create NEQ.");
+ goto probe3;
+ }
+
+ /* create internal protection domain */
+ ibpd = ehca_alloc_pd(&shca->ib_device, (void *)(-1), NULL);
+ if (IS_ERR(ibpd)) {
+ ehca_err(&shca->ib_device, "Cannot create internal PD.");
+ ret = PTR_ERR(ibpd);
+ goto probe4;
+ }
+
+ shca->pd = container_of(ibpd, struct ehca_pd, ib_pd);
+ shca->pd->ib_pd.device = &shca->ib_device;
+
+ /* create internal max MR */
+ ret = ehca_reg_internal_maxmr(shca, shca->pd, &shca->maxmr);
+
+ if (ret) {
+ ehca_err(&shca->ib_device, "Cannot create internal MR ret=%i",
+ ret);
+ goto probe5;
+ }
+
+ ret = ib_register_device(&shca->ib_device, NULL);
+ if (ret) {
+ ehca_err(&shca->ib_device,
+ "ib_register_device() failed ret=%i", ret);
+ goto probe6;
+ }
+
+ /* create AQP1 for port 1 */
+ if (ehca_open_aqp1 == 1) {
+ shca->sport[0].port_state = IB_PORT_DOWN;
+ ret = ehca_create_aqp1(shca, 1);
+ if (ret) {
+ ehca_err(&shca->ib_device,
+ "Cannot create AQP1 for port 1.");
+ goto probe7;
+ }
+ }
+
+ /* create AQP1 for port 2 */
+ if ((ehca_open_aqp1 == 1) && (shca->num_ports == 2)) {
+ shca->sport[1].port_state = IB_PORT_DOWN;
+ ret = ehca_create_aqp1(shca, 2);
+ if (ret) {
+ ehca_err(&shca->ib_device,
+ "Cannot create AQP1 for port 2.");
+ goto probe8;
+ }
+ }
+
+ ret = sysfs_create_group(&dev->dev.kobj, &ehca_dev_attr_grp);
+ if (ret) /* only complain; we can live without attributes */
+ ehca_err(&shca->ib_device,
+ "Cannot create device attributes ret=%d", ret);
+
+ spin_lock_irqsave(&shca_list_lock, flags);
+ list_add(&shca->shca_list, &shca_list);
+ spin_unlock_irqrestore(&shca_list_lock, flags);
+
+ return 0;
+
+probe8:
+ ret = ehca_destroy_aqp1(&shca->sport[0]);
+ if (ret)
+ ehca_err(&shca->ib_device,
+ "Cannot destroy AQP1 for port 1. ret=%i", ret);
+
+probe7:
+ ib_unregister_device(&shca->ib_device);
+
+probe6:
+ ret = ehca_dereg_internal_maxmr(shca);
+ if (ret)
+ ehca_err(&shca->ib_device,
+ "Cannot destroy internal MR. ret=%x", ret);
+
+probe5:
+ ret = ehca_dealloc_pd(&shca->pd->ib_pd);
+ if (ret)
+ ehca_err(&shca->ib_device,
+ "Cannot destroy internal PD. ret=%x", ret);
+
+probe4:
+ ret = ehca_destroy_eq(shca, &shca->neq);
+ if (ret)
+ ehca_err(&shca->ib_device,
+ "Cannot destroy NEQ. ret=%x", ret);
+
+probe3:
+ ret = ehca_destroy_eq(shca, &shca->eq);
+ if (ret)
+ ehca_err(&shca->ib_device,
+ "Cannot destroy EQ. ret=%x", ret);
+
+probe1:
+ ib_dealloc_device(&shca->ib_device);
+
+ return -EINVAL;
+}
+
+static int ehca_remove(struct platform_device *dev)
+{
+ struct ehca_shca *shca = dev_get_drvdata(&dev->dev);
+ unsigned long flags;
+ int ret;
+
+ sysfs_remove_group(&dev->dev.kobj, &ehca_dev_attr_grp);
+
+ if (ehca_open_aqp1 == 1) {
+ int i;
+ for (i = 0; i < shca->num_ports; i++) {
+ ret = ehca_destroy_aqp1(&shca->sport[i]);
+ if (ret)
+ ehca_err(&shca->ib_device,
+ "Cannot destroy AQP1 for port %x "
+					 "ret=%i", i, ret);
+ }
+ }
+
+ ib_unregister_device(&shca->ib_device);
+
+ ret = ehca_dereg_internal_maxmr(shca);
+ if (ret)
+ ehca_err(&shca->ib_device,
+ "Cannot destroy internal MR. ret=%i", ret);
+
+ ret = ehca_dealloc_pd(&shca->pd->ib_pd);
+ if (ret)
+ ehca_err(&shca->ib_device,
+ "Cannot destroy internal PD. ret=%i", ret);
+
+ ret = ehca_destroy_eq(shca, &shca->eq);
+ if (ret)
+ ehca_err(&shca->ib_device, "Cannot destroy EQ. ret=%i", ret);
+
+ ret = ehca_destroy_eq(shca, &shca->neq);
+ if (ret)
+		ehca_err(&shca->ib_device, "Cannot destroy NEQ. ret=%i", ret);
+
+ ib_dealloc_device(&shca->ib_device);
+
+ spin_lock_irqsave(&shca_list_lock, flags);
+ list_del(&shca->shca_list);
+ spin_unlock_irqrestore(&shca_list_lock, flags);
+
+ return ret;
+}
+
+static struct of_device_id ehca_device_table[] =
+{
+ {
+ .name = "lhca",
+ .compatible = "IBM,lhca",
+ },
+ {},
+};
+MODULE_DEVICE_TABLE(of, ehca_device_table);
+
+static struct platform_driver ehca_driver = {
+ .probe = ehca_probe,
+ .remove = ehca_remove,
+ .driver = {
+ .name = "ehca",
+ .owner = THIS_MODULE,
+ .groups = ehca_drv_attr_groups,
+ .of_match_table = ehca_device_table,
+ },
+};
+
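+/* Deadman timer: if an EQ's queue offset has not advanced, process the EQ
+ * manually to compensate for a potentially lost interrupt.
+ */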
+void ehca_poll_eqs(unsigned long data)
+{
+ struct ehca_shca *shca;
+
+ spin_lock(&shca_list_lock);
+ list_for_each_entry(shca, &shca_list, shca_list) {
+ if (shca->eq.is_initialized) {
+ /* call deadman proc only if eq ptr does not change */
+ struct ehca_eq *eq = &shca->eq;
+ int max = 3;
+ volatile u64 q_ofs, q_ofs2;
+ unsigned long flags;
+ spin_lock_irqsave(&eq->spinlock, flags);
+ q_ofs = eq->ipz_queue.current_q_offset;
+ spin_unlock_irqrestore(&eq->spinlock, flags);
+ do {
+ spin_lock_irqsave(&eq->spinlock, flags);
+ q_ofs2 = eq->ipz_queue.current_q_offset;
+ spin_unlock_irqrestore(&eq->spinlock, flags);
+ max--;
+ } while (q_ofs == q_ofs2 && max > 0);
+ if (q_ofs == q_ofs2)
+ ehca_process_eq(shca, 0);
+ }
+ }
+ mod_timer(&poll_eqs_timer, round_jiffies(jiffies + HZ));
+ spin_unlock(&shca_list_lock);
+}
+
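+/* Veto memory hotplug while an eHCA adapter is attached to the LPAR. */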
+static int ehca_mem_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ static unsigned long ehca_dmem_warn_time;
+ unsigned long flags;
+
+ switch (action) {
+ case MEM_CANCEL_OFFLINE:
+ case MEM_CANCEL_ONLINE:
+ case MEM_ONLINE:
+ case MEM_OFFLINE:
+ return NOTIFY_OK;
+ case MEM_GOING_ONLINE:
+ case MEM_GOING_OFFLINE:
+ /* only ok if no hca is attached to the lpar */
+ spin_lock_irqsave(&shca_list_lock, flags);
+ if (list_empty(&shca_list)) {
+ spin_unlock_irqrestore(&shca_list_lock, flags);
+ return NOTIFY_OK;
+ } else {
+ spin_unlock_irqrestore(&shca_list_lock, flags);
+ if (printk_timed_ratelimit(&ehca_dmem_warn_time,
+ 30 * 1000))
+				ehca_gen_err("DMEM operations are not allowed "
+					     "in conjunction with eHCA");
+ return NOTIFY_BAD;
+ }
+ }
+ return NOTIFY_OK;
+}
+
+static struct notifier_block ehca_mem_nb = {
+ .notifier_call = ehca_mem_notifier,
+};
+
+static int __init ehca_module_init(void)
+{
+ int ret;
+
+ printk(KERN_INFO "eHCA Infiniband Device Driver "
+ "(Version " HCAD_VERSION ")\n");
+
+ ret = ehca_create_comp_pool();
+ if (ret) {
+ ehca_gen_err("Cannot create comp pool.");
+ return ret;
+ }
+
+ ret = ehca_create_slab_caches();
+ if (ret) {
+ ehca_gen_err("Cannot create SLAB caches");
+ ret = -ENOMEM;
+ goto module_init1;
+ }
+
+ ret = ehca_create_busmap();
+ if (ret) {
+ ehca_gen_err("Cannot create busmap.");
+ goto module_init2;
+ }
+
+ ret = ibmebus_register_driver(&ehca_driver);
+ if (ret) {
+ ehca_gen_err("Cannot register eHCA device driver");
+ ret = -EINVAL;
+ goto module_init3;
+ }
+
+ ret = register_memory_notifier(&ehca_mem_nb);
+ if (ret) {
+ ehca_gen_err("Failed registering memory add/remove notifier");
+ goto module_init4;
+ }
+
+ if (ehca_poll_all_eqs != 1) {
+ ehca_gen_err("WARNING!!!");
+ ehca_gen_err("It is possible to lose interrupts.");
+ } else {
+ init_timer(&poll_eqs_timer);
+ poll_eqs_timer.function = ehca_poll_eqs;
+ poll_eqs_timer.expires = jiffies + HZ;
+ add_timer(&poll_eqs_timer);
+ }
+
+ return 0;
+
+module_init4:
+ ibmebus_unregister_driver(&ehca_driver);
+
+module_init3:
+ ehca_destroy_busmap();
+
+module_init2:
+ ehca_destroy_slab_caches();
+
+module_init1:
+ ehca_destroy_comp_pool();
+ return ret;
+}
+
+static void __exit ehca_module_exit(void)
+{
+ if (ehca_poll_all_eqs == 1)
+ del_timer_sync(&poll_eqs_timer);
+
+ ibmebus_unregister_driver(&ehca_driver);
+
+ unregister_memory_notifier(&ehca_mem_nb);
+
+ ehca_destroy_busmap();
+
+ ehca_destroy_slab_caches();
+
+ ehca_destroy_comp_pool();
+
+ idr_destroy(&ehca_cq_idr);
+ idr_destroy(&ehca_qp_idr);
+}
+
+module_init(ehca_module_init);
+module_exit(ehca_module_exit);
diff --git a/drivers/infiniband/hw/ehca/ehca_mcast.c b/drivers/infiniband/hw/ehca/ehca_mcast.c
new file mode 100644
index 000000000..cec181532
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_mcast.c
@@ -0,0 +1,131 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * mcast functions
+ *
+ * Authors: Khadija Souissi <souissik@de.ibm.com>
+ * Waleri Fomin <fomin@de.ibm.com>
+ * Reinhard Ernst <rernst@de.ibm.com>
+ * Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ * Heiko J Schick <schickhj@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/module.h>
+#include <linux/err.h>
+#include "ehca_classes.h"
+#include "ehca_tools.h"
+#include "ehca_qes.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+
+#define MAX_MC_LID 0xFFFE
+#define MIN_MC_LID 0xC000 /* Multicast limits */
+#define EHCA_VALID_MULTICAST_GID(gid) ((gid)[0] == 0xFF)
+#define EHCA_VALID_MULTICAST_LID(lid) \
+ (((lid) >= MIN_MC_LID) && ((lid) <= MAX_MC_LID))
+
+int ehca_attach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
+ struct ehca_shca *shca = container_of(ibqp->device, struct ehca_shca,
+ ib_device);
+ union ib_gid my_gid;
+ u64 subnet_prefix, interface_id, h_ret;
+
+ if (ibqp->qp_type != IB_QPT_UD) {
+ ehca_err(ibqp->device, "invalid qp_type=%x", ibqp->qp_type);
+ return -EINVAL;
+ }
+
+ if (!(EHCA_VALID_MULTICAST_GID(gid->raw))) {
+ ehca_err(ibqp->device, "invalid mulitcast gid");
+ return -EINVAL;
+ } else if ((lid < MIN_MC_LID) || (lid > MAX_MC_LID)) {
+ ehca_err(ibqp->device, "invalid mulitcast lid=%x", lid);
+ return -EINVAL;
+ }
+
+ memcpy(&my_gid, gid->raw, sizeof(union ib_gid));
+
+ subnet_prefix = be64_to_cpu(my_gid.global.subnet_prefix);
+ interface_id = be64_to_cpu(my_gid.global.interface_id);
+ h_ret = hipz_h_attach_mcqp(shca->ipz_hca_handle,
+ my_qp->ipz_qp_handle,
+ my_qp->galpas.kernel,
+ lid, subnet_prefix, interface_id);
+ if (h_ret != H_SUCCESS)
+ ehca_err(ibqp->device,
+ "ehca_qp=%p qp_num=%x hipz_h_attach_mcqp() failed "
+ "h_ret=%lli", my_qp, ibqp->qp_num, h_ret);
+
+ return ehca2ib_return_code(h_ret);
+}
+
+int ehca_detach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
+ struct ehca_shca *shca = container_of(ibqp->pd->device,
+ struct ehca_shca, ib_device);
+ union ib_gid my_gid;
+ u64 subnet_prefix, interface_id, h_ret;
+
+ if (ibqp->qp_type != IB_QPT_UD) {
+ ehca_err(ibqp->device, "invalid qp_type %x", ibqp->qp_type);
+ return -EINVAL;
+ }
+
+ if (!(EHCA_VALID_MULTICAST_GID(gid->raw))) {
+ ehca_err(ibqp->device, "invalid mulitcast gid");
+ return -EINVAL;
+ } else if ((lid < MIN_MC_LID) || (lid > MAX_MC_LID)) {
+ ehca_err(ibqp->device, "invalid mulitcast lid=%x", lid);
+ return -EINVAL;
+ }
+
+ memcpy(&my_gid, gid->raw, sizeof(union ib_gid));
+
+ subnet_prefix = be64_to_cpu(my_gid.global.subnet_prefix);
+ interface_id = be64_to_cpu(my_gid.global.interface_id);
+ h_ret = hipz_h_detach_mcqp(shca->ipz_hca_handle,
+ my_qp->ipz_qp_handle,
+ my_qp->galpas.kernel,
+ lid, subnet_prefix, interface_id);
+ if (h_ret != H_SUCCESS)
+ ehca_err(ibqp->device,
+ "ehca_qp=%p qp_num=%x hipz_h_detach_mcqp() failed "
+ "h_ret=%lli", my_qp, ibqp->qp_num, h_ret);
+
+ return ehca2ib_return_code(h_ret);
+}
diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c
new file mode 100644
index 000000000..f914b3099
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c
@@ -0,0 +1,2593 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * MR/MW functions
+ *
+ * Authors: Dietmar Decker <ddecker@de.ibm.com>
+ * Christoph Raisch <raisch@de.ibm.com>
+ * Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+#include <rdma/ib_umem.h>
+
+#include "ehca_iverbs.h"
+#include "ehca_mrmw.h"
+#include "hcp_if.h"
+#include "hipz_hw.h"
+
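+/* number of chunk_size-sized chunks needed to cover length (rounds up) */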
+#define NUM_CHUNKS(length, chunk_size) \
+ (((length) + (chunk_size - 1)) / (chunk_size))
+
+/* max number of rpages (per hcall register_rpages) */
+#define MAX_RPAGES 512
+
+/* DMEM toleration management */
+#define EHCA_SECTSHIFT SECTION_SIZE_BITS
+#define EHCA_SECTSIZE (1UL << EHCA_SECTSHIFT)
+#define EHCA_HUGEPAGESHIFT 34
+#define EHCA_HUGEPAGE_SIZE (1UL << EHCA_HUGEPAGESHIFT)
+#define EHCA_HUGEPAGE_PFN_MASK ((EHCA_HUGEPAGE_SIZE - 1) >> PAGE_SHIFT)
+#define EHCA_INVAL_ADDR 0xFFFFFFFFFFFFFFFFULL
+#define EHCA_DIR_INDEX_SHIFT 13 /* 8k Entries in 64k block */
+#define EHCA_TOP_INDEX_SHIFT (EHCA_DIR_INDEX_SHIFT * 2)
+#define EHCA_MAP_ENTRIES (1 << EHCA_DIR_INDEX_SHIFT)
+#define EHCA_TOP_MAP_SIZE (0x10000) /* currently fixed map size */
+#define EHCA_DIR_MAP_SIZE (0x10000)
+#define EHCA_ENT_MAP_SIZE (0x10000)
+#define EHCA_INDEX_MASK (EHCA_MAP_ENTRIES - 1)
+
+static unsigned long ehca_mr_len;
+
+/*
+ * Memory map data structures
+ */
+struct ehca_dir_bmap {
+ u64 ent[EHCA_MAP_ENTRIES];
+};
+struct ehca_top_bmap {
+ struct ehca_dir_bmap *dir[EHCA_MAP_ENTRIES];
+};
+struct ehca_bmap {
+ struct ehca_top_bmap *top[EHCA_MAP_ENTRIES];
+};
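+
+/*
+ * Three-level lookup over memory sections: ehca_bmap->top[]->dir[]->ent[].
+ * A section number is split into idx/dir slices of EHCA_DIR_INDEX_SHIFT
+ * bits each plus the remaining top bits (see ehca_calc_sectbase()).
+ * Entries equal to EHCA_INVAL_ADDR are treated as unmapped, see
+ * ehca_bmap_valid().
+ */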
+
+static struct ehca_bmap *ehca_bmap;
+
+static struct kmem_cache *mr_cache;
+static struct kmem_cache *mw_cache;
+
+enum ehca_mr_pgsize {
+ EHCA_MR_PGSIZE4K = 0x1000L,
+ EHCA_MR_PGSIZE64K = 0x10000L,
+ EHCA_MR_PGSIZE1M = 0x100000L,
+ EHCA_MR_PGSIZE16M = 0x1000000L
+};
+
+#define EHCA_MR_PGSHIFT4K 12
+#define EHCA_MR_PGSHIFT64K 16
+#define EHCA_MR_PGSHIFT1M 20
+#define EHCA_MR_PGSHIFT16M 24
+
+static u64 ehca_map_vaddr(void *caddr);
+
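+/*
+ * Encoding of the hw page size as passed to the rpage hcalls:
+ * (log2(pgsize) - 12) / 4, i.e. 4K -> 0, 64K -> 1, 1M -> 2, 16M -> 3.
+ */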
+static u32 ehca_encode_hwpage_size(u32 pgsize)
+{
+ int log = ilog2(pgsize);
+ WARN_ON(log < 12 || log > 24 || log & 3);
+ return (log - 12) / 4;
+}
+
+static u64 ehca_get_max_hwpage_size(struct ehca_shca *shca)
+{
+ return rounddown_pow_of_two(shca->hca_cap_mr_pgsize);
+}
+
+static struct ehca_mr *ehca_mr_new(void)
+{
+ struct ehca_mr *me;
+
+ me = kmem_cache_zalloc(mr_cache, GFP_KERNEL);
+ if (me)
+ spin_lock_init(&me->mrlock);
+ else
+ ehca_gen_err("alloc failed");
+
+ return me;
+}
+
+static void ehca_mr_delete(struct ehca_mr *me)
+{
+ kmem_cache_free(mr_cache, me);
+}
+
+static struct ehca_mw *ehca_mw_new(void)
+{
+ struct ehca_mw *me;
+
+ me = kmem_cache_zalloc(mw_cache, GFP_KERNEL);
+ if (me)
+ spin_lock_init(&me->mwlock);
+ else
+ ehca_gen_err("alloc failed");
+
+ return me;
+}
+
+static void ehca_mw_delete(struct ehca_mw *me)
+{
+ kmem_cache_free(mw_cache, me);
+}
+
+/*----------------------------------------------------------------------*/
+
+struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
+{
+ struct ib_mr *ib_mr;
+ int ret;
+ struct ehca_mr *e_maxmr;
+ struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
+ struct ehca_shca *shca =
+ container_of(pd->device, struct ehca_shca, ib_device);
+
+ if (shca->maxmr) {
+ e_maxmr = ehca_mr_new();
+ if (!e_maxmr) {
+ ehca_err(&shca->ib_device, "out of memory");
+ ib_mr = ERR_PTR(-ENOMEM);
+ goto get_dma_mr_exit0;
+ }
+
+ ret = ehca_reg_maxmr(shca, e_maxmr,
+ (void *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)),
+ mr_access_flags, e_pd,
+ &e_maxmr->ib.ib_mr.lkey,
+ &e_maxmr->ib.ib_mr.rkey);
+ if (ret) {
+ ehca_mr_delete(e_maxmr);
+ ib_mr = ERR_PTR(ret);
+ goto get_dma_mr_exit0;
+ }
+ ib_mr = &e_maxmr->ib.ib_mr;
+ } else {
+ ehca_err(&shca->ib_device, "no internal max-MR exist!");
+ ib_mr = ERR_PTR(-EINVAL);
+ goto get_dma_mr_exit0;
+ }
+
+get_dma_mr_exit0:
+ if (IS_ERR(ib_mr))
+ ehca_err(&shca->ib_device, "h_ret=%li pd=%p mr_access_flags=%x",
+ PTR_ERR(ib_mr), pd, mr_access_flags);
+ return ib_mr;
+} /* end ehca_get_dma_mr() */
+
+/*----------------------------------------------------------------------*/
+
+struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ int mr_access_flags,
+ u64 *iova_start)
+{
+ struct ib_mr *ib_mr;
+ int ret;
+ struct ehca_mr *e_mr;
+ struct ehca_shca *shca =
+ container_of(pd->device, struct ehca_shca, ib_device);
+ struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
+
+ u64 size;
+
+ if ((num_phys_buf <= 0) || !phys_buf_array) {
+ ehca_err(pd->device, "bad input values: num_phys_buf=%x "
+ "phys_buf_array=%p", num_phys_buf, phys_buf_array);
+ ib_mr = ERR_PTR(-EINVAL);
+ goto reg_phys_mr_exit0;
+ }
+ if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
+ !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
+ ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+ !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) {
+ /*
+ * Remote Write Access requires Local Write Access
+ * Remote Atomic Access requires Local Write Access
+ */
+ ehca_err(pd->device, "bad input values: mr_access_flags=%x",
+ mr_access_flags);
+ ib_mr = ERR_PTR(-EINVAL);
+ goto reg_phys_mr_exit0;
+ }
+
+ /* check physical buffer list and calculate size */
+ ret = ehca_mr_chk_buf_and_calc_size(phys_buf_array, num_phys_buf,
+ iova_start, &size);
+ if (ret) {
+ ib_mr = ERR_PTR(ret);
+ goto reg_phys_mr_exit0;
+ }
+ if ((size == 0) ||
+ (((u64)iova_start + size) < (u64)iova_start)) {
+ ehca_err(pd->device, "bad input values: size=%llx iova_start=%p",
+ size, iova_start);
+ ib_mr = ERR_PTR(-EINVAL);
+ goto reg_phys_mr_exit0;
+ }
+
+ e_mr = ehca_mr_new();
+ if (!e_mr) {
+ ehca_err(pd->device, "out of memory");
+ ib_mr = ERR_PTR(-ENOMEM);
+ goto reg_phys_mr_exit0;
+ }
+
+ /* register MR on HCA */
+ if (ehca_mr_is_maxmr(size, iova_start)) {
+ e_mr->flags |= EHCA_MR_FLAG_MAXMR;
+ ret = ehca_reg_maxmr(shca, e_mr, iova_start, mr_access_flags,
+ e_pd, &e_mr->ib.ib_mr.lkey,
+ &e_mr->ib.ib_mr.rkey);
+ if (ret) {
+ ib_mr = ERR_PTR(ret);
+ goto reg_phys_mr_exit1;
+ }
+ } else {
+ struct ehca_mr_pginfo pginfo;
+ u32 num_kpages;
+ u32 num_hwpages;
+ u64 hw_pgsize;
+
+ num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size,
+ PAGE_SIZE);
+ /* for kernel space we try the largest possible pgsize */
+ hw_pgsize = ehca_get_max_hwpage_size(shca);
+ num_hwpages = NUM_CHUNKS(((u64)iova_start % hw_pgsize) + size,
+ hw_pgsize);
+ memset(&pginfo, 0, sizeof(pginfo));
+ pginfo.type = EHCA_MR_PGI_PHYS;
+ pginfo.num_kpages = num_kpages;
+ pginfo.hwpage_size = hw_pgsize;
+ pginfo.num_hwpages = num_hwpages;
+ pginfo.u.phy.num_phys_buf = num_phys_buf;
+ pginfo.u.phy.phys_buf_array = phys_buf_array;
+ pginfo.next_hwpage =
+ ((u64)iova_start & ~PAGE_MASK) / hw_pgsize;
+
+ ret = ehca_reg_mr(shca, e_mr, iova_start, size, mr_access_flags,
+ e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
+ &e_mr->ib.ib_mr.rkey, EHCA_REG_MR);
+ if (ret) {
+ ib_mr = ERR_PTR(ret);
+ goto reg_phys_mr_exit1;
+ }
+ }
+
+ /* successful registration of all pages */
+ return &e_mr->ib.ib_mr;
+
+reg_phys_mr_exit1:
+ ehca_mr_delete(e_mr);
+reg_phys_mr_exit0:
+ if (IS_ERR(ib_mr))
+ ehca_err(pd->device, "h_ret=%li pd=%p phys_buf_array=%p "
+ "num_phys_buf=%x mr_access_flags=%x iova_start=%p",
+ PTR_ERR(ib_mr), pd, phys_buf_array,
+ num_phys_buf, mr_access_flags, iova_start);
+ return ib_mr;
+} /* end ehca_reg_phys_mr() */
+
+/*----------------------------------------------------------------------*/
+
+struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt, int mr_access_flags,
+ struct ib_udata *udata)
+{
+ struct ib_mr *ib_mr;
+ struct ehca_mr *e_mr;
+ struct ehca_shca *shca =
+ container_of(pd->device, struct ehca_shca, ib_device);
+ struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
+ struct ehca_mr_pginfo pginfo;
+ int ret, page_shift;
+ u32 num_kpages;
+ u32 num_hwpages;
+ u64 hwpage_size;
+
+ if (!pd) {
+ ehca_gen_err("bad pd=%p", pd);
+ return ERR_PTR(-EFAULT);
+ }
+
+ if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
+ !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
+ ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+ !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) {
+ /*
+ * Remote Write Access requires Local Write Access
+ * Remote Atomic Access requires Local Write Access
+ */
+ ehca_err(pd->device, "bad input values: mr_access_flags=%x",
+ mr_access_flags);
+ ib_mr = ERR_PTR(-EINVAL);
+ goto reg_user_mr_exit0;
+ }
+
+ if (length == 0 || virt + length < virt) {
+ ehca_err(pd->device, "bad input values: length=%llx "
+ "virt_base=%llx", length, virt);
+ ib_mr = ERR_PTR(-EINVAL);
+ goto reg_user_mr_exit0;
+ }
+
+ e_mr = ehca_mr_new();
+ if (!e_mr) {
+ ehca_err(pd->device, "out of memory");
+ ib_mr = ERR_PTR(-ENOMEM);
+ goto reg_user_mr_exit0;
+ }
+
+ e_mr->umem = ib_umem_get(pd->uobject->context, start, length,
+ mr_access_flags, 0);
+ if (IS_ERR(e_mr->umem)) {
+ ib_mr = ERR_CAST(e_mr->umem);
+ goto reg_user_mr_exit1;
+ }
+
+ if (e_mr->umem->page_size != PAGE_SIZE) {
+ ehca_err(pd->device, "page size not supported, "
+ "e_mr->umem->page_size=%x", e_mr->umem->page_size);
+ ib_mr = ERR_PTR(-EINVAL);
+ goto reg_user_mr_exit2;
+ }
+
+ /* determine number of MR pages */
+ num_kpages = NUM_CHUNKS((virt % PAGE_SIZE) + length, PAGE_SIZE);
+ /* select proper hw_pgsize */
+ page_shift = PAGE_SHIFT;
+ if (e_mr->umem->hugetlb) {
+ /* round log2(length) up to a multiple of 4 (4K/64K/1M/16M steps)
+ * and clamp between 4K and 16M */
+ page_shift = (fls64(length - 1) + 3) & ~3;
+ page_shift = min(max(page_shift, EHCA_MR_PGSHIFT4K),
+ EHCA_MR_PGSHIFT16M);
+ }
+ hwpage_size = 1UL << page_shift;
+
+ /* now that we have the desired page size, step down (>>= 4, i.e.
+ * through 16M/1M/64K/4K) until it is supported, too. 4K is always
+ * supported, so this terminates.
+ */
+ while (!(hwpage_size & shca->hca_cap_mr_pgsize))
+ hwpage_size >>= 4;
+
+reg_user_mr_fallback:
+ num_hwpages = NUM_CHUNKS((virt % hwpage_size) + length, hwpage_size);
+ /* register MR on HCA */
+ memset(&pginfo, 0, sizeof(pginfo));
+ pginfo.type = EHCA_MR_PGI_USER;
+ pginfo.hwpage_size = hwpage_size;
+ pginfo.num_kpages = num_kpages;
+ pginfo.num_hwpages = num_hwpages;
+ pginfo.u.usr.region = e_mr->umem;
+ pginfo.next_hwpage = ib_umem_offset(e_mr->umem) / hwpage_size;
+ pginfo.u.usr.next_sg = pginfo.u.usr.region->sg_head.sgl;
+ ret = ehca_reg_mr(shca, e_mr, (u64 *)virt, length, mr_access_flags,
+ e_pd, &pginfo, &e_mr->ib.ib_mr.lkey,
+ &e_mr->ib.ib_mr.rkey, EHCA_REG_MR);
+ if (ret == -EINVAL && pginfo.hwpage_size > PAGE_SIZE) {
+ ehca_warn(pd->device, "failed to register mr "
+ "with hwpage_size=%llx", hwpage_size);
+ ehca_info(pd->device, "try to register mr with "
+ "kpage_size=%lx", PAGE_SIZE);
+ /*
+ * this means kpages are not contiguous for a hw page
+ * try kernel page size as fallback solution
+ */
+ hwpage_size = PAGE_SIZE;
+ goto reg_user_mr_fallback;
+ }
+ if (ret) {
+ ib_mr = ERR_PTR(ret);
+ goto reg_user_mr_exit2;
+ }
+
+ /* successful registration of all pages */
+ return &e_mr->ib.ib_mr;
+
+reg_user_mr_exit2:
+ ib_umem_release(e_mr->umem);
+reg_user_mr_exit1:
+ ehca_mr_delete(e_mr);
+reg_user_mr_exit0:
+ if (IS_ERR(ib_mr))
+ ehca_err(pd->device, "rc=%li pd=%p mr_access_flags=%x udata=%p",
+ PTR_ERR(ib_mr), pd, mr_access_flags, udata);
+ return ib_mr;
+} /* end ehca_reg_user_mr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_rereg_phys_mr(struct ib_mr *mr,
+ int mr_rereg_mask,
+ struct ib_pd *pd,
+ struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ int mr_access_flags,
+ u64 *iova_start)
+{
+ int ret;
+
+ struct ehca_shca *shca =
+ container_of(mr->device, struct ehca_shca, ib_device);
+ struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr);
+ u64 new_size;
+ u64 *new_start;
+ u32 new_acl;
+ struct ehca_pd *new_pd;
+ u32 tmp_lkey, tmp_rkey;
+ unsigned long sl_flags;
+ u32 num_kpages = 0;
+ u32 num_hwpages = 0;
+ struct ehca_mr_pginfo pginfo;
+
+ if (!(mr_rereg_mask & IB_MR_REREG_TRANS)) {
+ /* TODO not supported, because PHYP rereg hCall needs pages */
+ ehca_err(mr->device, "rereg without IB_MR_REREG_TRANS not "
+ "supported yet, mr_rereg_mask=%x", mr_rereg_mask);
+ ret = -EINVAL;
+ goto rereg_phys_mr_exit0;
+ }
+
+ if (mr_rereg_mask & IB_MR_REREG_PD) {
+ if (!pd) {
+ ehca_err(mr->device, "rereg with bad pd, pd=%p "
+ "mr_rereg_mask=%x", pd, mr_rereg_mask);
+ ret = -EINVAL;
+ goto rereg_phys_mr_exit0;
+ }
+ }
+
+ if ((mr_rereg_mask &
+ ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS)) ||
+ (mr_rereg_mask == 0)) {
+ ret = -EINVAL;
+ goto rereg_phys_mr_exit0;
+ }
+
+ /* check other parameters */
+ if (e_mr == shca->maxmr) {
+ /* should be impossible, however reject to be sure */
+ ehca_err(mr->device, "rereg internal max-MR impossible, mr=%p "
+ "shca->maxmr=%p mr->lkey=%x",
+ mr, shca->maxmr, mr->lkey);
+ ret = -EINVAL;
+ goto rereg_phys_mr_exit0;
+ }
+ if (mr_rereg_mask & IB_MR_REREG_TRANS) { /* transl., i.e. addr/size */
+ if (e_mr->flags & EHCA_MR_FLAG_FMR) {
+ ehca_err(mr->device, "not supported for FMR, mr=%p "
+ "flags=%x", mr, e_mr->flags);
+ ret = -EINVAL;
+ goto rereg_phys_mr_exit0;
+ }
+ if (!phys_buf_array || num_phys_buf <= 0) {
+ ehca_err(mr->device, "bad input values mr_rereg_mask=%x"
+ " phys_buf_array=%p num_phys_buf=%x",
+ mr_rereg_mask, phys_buf_array, num_phys_buf);
+ ret = -EINVAL;
+ goto rereg_phys_mr_exit0;
+ }
+ }
+ if ((mr_rereg_mask & IB_MR_REREG_ACCESS) && /* change ACL */
+ (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
+ !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
+ ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+ !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)))) {
+ /*
+ * Remote Write Access requires Local Write Access
+ * Remote Atomic Access requires Local Write Access
+ */
+ ehca_err(mr->device, "bad input values: mr_rereg_mask=%x "
+ "mr_access_flags=%x", mr_rereg_mask, mr_access_flags);
+ ret = -EINVAL;
+ goto rereg_phys_mr_exit0;
+ }
+
+ /* set requested values dependent on rereg request */
+ spin_lock_irqsave(&e_mr->mrlock, sl_flags);
+ new_start = e_mr->start;
+ new_size = e_mr->size;
+ new_acl = e_mr->acl;
+ new_pd = container_of(mr->pd, struct ehca_pd, ib_pd);
+
+ if (mr_rereg_mask & IB_MR_REREG_TRANS) {
+ u64 hw_pgsize = ehca_get_max_hwpage_size(shca);
+
+ new_start = iova_start; /* change address */
+ /* check physical buffer list and calculate size */
+ ret = ehca_mr_chk_buf_and_calc_size(phys_buf_array,
+ num_phys_buf, iova_start,
+ &new_size);
+ if (ret)
+ goto rereg_phys_mr_exit1;
+ if ((new_size == 0) ||
+ (((u64)iova_start + new_size) < (u64)iova_start)) {
+ ehca_err(mr->device, "bad input values: new_size=%llx "
+ "iova_start=%p", new_size, iova_start);
+ ret = -EINVAL;
+ goto rereg_phys_mr_exit1;
+ }
+ num_kpages = NUM_CHUNKS(((u64)new_start % PAGE_SIZE) +
+ new_size, PAGE_SIZE);
+ num_hwpages = NUM_CHUNKS(((u64)new_start % hw_pgsize) +
+ new_size, hw_pgsize);
+ memset(&pginfo, 0, sizeof(pginfo));
+ pginfo.type = EHCA_MR_PGI_PHYS;
+ pginfo.num_kpages = num_kpages;
+ pginfo.hwpage_size = hw_pgsize;
+ pginfo.num_hwpages = num_hwpages;
+ pginfo.u.phy.num_phys_buf = num_phys_buf;
+ pginfo.u.phy.phys_buf_array = phys_buf_array;
+ pginfo.next_hwpage =
+ ((u64)iova_start & ~PAGE_MASK) / hw_pgsize;
+ }
+ if (mr_rereg_mask & IB_MR_REREG_ACCESS)
+ new_acl = mr_access_flags;
+ if (mr_rereg_mask & IB_MR_REREG_PD)
+ new_pd = container_of(pd, struct ehca_pd, ib_pd);
+
+ ret = ehca_rereg_mr(shca, e_mr, new_start, new_size, new_acl,
+ new_pd, &pginfo, &tmp_lkey, &tmp_rkey);
+ if (ret)
+ goto rereg_phys_mr_exit1;
+
+ /* successful reregistration */
+ if (mr_rereg_mask & IB_MR_REREG_PD)
+ mr->pd = pd;
+ mr->lkey = tmp_lkey;
+ mr->rkey = tmp_rkey;
+
+rereg_phys_mr_exit1:
+ spin_unlock_irqrestore(&e_mr->mrlock, sl_flags);
+rereg_phys_mr_exit0:
+ if (ret)
+ ehca_err(mr->device, "ret=%i mr=%p mr_rereg_mask=%x pd=%p "
+ "phys_buf_array=%p num_phys_buf=%x mr_access_flags=%x "
+ "iova_start=%p",
+ ret, mr, mr_rereg_mask, pd, phys_buf_array,
+ num_phys_buf, mr_access_flags, iova_start);
+ return ret;
+} /* end ehca_rereg_phys_mr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
+{
+ int ret = 0;
+ u64 h_ret;
+ struct ehca_shca *shca =
+ container_of(mr->device, struct ehca_shca, ib_device);
+ struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr);
+ unsigned long sl_flags;
+ struct ehca_mr_hipzout_parms hipzout;
+
+ if ((e_mr->flags & EHCA_MR_FLAG_FMR)) {
+ ehca_err(mr->device, "not supported for FMR, mr=%p e_mr=%p "
+ "e_mr->flags=%x", mr, e_mr, e_mr->flags);
+ ret = -EINVAL;
+ goto query_mr_exit0;
+ }
+
+ memset(mr_attr, 0, sizeof(struct ib_mr_attr));
+ spin_lock_irqsave(&e_mr->mrlock, sl_flags);
+
+ h_ret = hipz_h_query_mr(shca->ipz_hca_handle, e_mr, &hipzout);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(mr->device, "hipz_mr_query failed, h_ret=%lli mr=%p "
+ "hca_hndl=%llx mr_hndl=%llx lkey=%x",
+ h_ret, mr, shca->ipz_hca_handle.handle,
+ e_mr->ipz_mr_handle.handle, mr->lkey);
+ ret = ehca2ib_return_code(h_ret);
+ goto query_mr_exit1;
+ }
+ mr_attr->pd = mr->pd;
+ mr_attr->device_virt_addr = hipzout.vaddr;
+ mr_attr->size = hipzout.len;
+ mr_attr->lkey = hipzout.lkey;
+ mr_attr->rkey = hipzout.rkey;
+ ehca_mrmw_reverse_map_acl(&hipzout.acl, &mr_attr->mr_access_flags);
+
+query_mr_exit1:
+ spin_unlock_irqrestore(&e_mr->mrlock, sl_flags);
+query_mr_exit0:
+ if (ret)
+ ehca_err(mr->device, "ret=%i mr=%p mr_attr=%p",
+ ret, mr, mr_attr);
+ return ret;
+} /* end ehca_query_mr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_dereg_mr(struct ib_mr *mr)
+{
+ int ret = 0;
+ u64 h_ret;
+ struct ehca_shca *shca =
+ container_of(mr->device, struct ehca_shca, ib_device);
+ struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr);
+
+ if ((e_mr->flags & EHCA_MR_FLAG_FMR)) {
+ ehca_err(mr->device, "not supported for FMR, mr=%p e_mr=%p "
+ "e_mr->flags=%x", mr, e_mr, e_mr->flags);
+ ret = -EINVAL;
+ goto dereg_mr_exit0;
+ } else if (e_mr == shca->maxmr) {
+ /* should be impossible, however reject to be sure */
+ ehca_err(mr->device, "dereg internal max-MR impossible, mr=%p "
+ "shca->maxmr=%p mr->lkey=%x",
+ mr, shca->maxmr, mr->lkey);
+ ret = -EINVAL;
+ goto dereg_mr_exit0;
+ }
+
+ /* TODO: BUSY: MR still has bound window(s) */
+ h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(mr->device, "hipz_free_mr failed, h_ret=%lli shca=%p "
+ "e_mr=%p hca_hndl=%llx mr_hndl=%llx mr->lkey=%x",
+ h_ret, shca, e_mr, shca->ipz_hca_handle.handle,
+ e_mr->ipz_mr_handle.handle, mr->lkey);
+ ret = ehca2ib_return_code(h_ret);
+ goto dereg_mr_exit0;
+ }
+
+ if (e_mr->umem)
+ ib_umem_release(e_mr->umem);
+
+ /* successful deregistration */
+ ehca_mr_delete(e_mr);
+
+dereg_mr_exit0:
+ if (ret)
+ ehca_err(mr->device, "ret=%i mr=%p", ret, mr);
+ return ret;
+} /* end ehca_dereg_mr() */
+
+/*----------------------------------------------------------------------*/
+
+struct ib_mw *ehca_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
+{
+ struct ib_mw *ib_mw;
+ u64 h_ret;
+ struct ehca_mw *e_mw;
+ struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
+ struct ehca_shca *shca =
+ container_of(pd->device, struct ehca_shca, ib_device);
+ struct ehca_mw_hipzout_parms hipzout;
+
+ if (type != IB_MW_TYPE_1)
+ return ERR_PTR(-EINVAL);
+
+ e_mw = ehca_mw_new();
+ if (!e_mw) {
+ ib_mw = ERR_PTR(-ENOMEM);
+ goto alloc_mw_exit0;
+ }
+
+ h_ret = hipz_h_alloc_resource_mw(shca->ipz_hca_handle, e_mw,
+ e_pd->fw_pd, &hipzout);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(pd->device, "hipz_mw_allocate failed, h_ret=%lli "
+ "shca=%p hca_hndl=%llx mw=%p",
+ h_ret, shca, shca->ipz_hca_handle.handle, e_mw);
+ ib_mw = ERR_PTR(ehca2ib_return_code(h_ret));
+ goto alloc_mw_exit1;
+ }
+ /* successful MW allocation */
+ e_mw->ipz_mw_handle = hipzout.handle;
+ e_mw->ib_mw.rkey = hipzout.rkey;
+ return &e_mw->ib_mw;
+
+alloc_mw_exit1:
+ ehca_mw_delete(e_mw);
+alloc_mw_exit0:
+ if (IS_ERR(ib_mw))
+ ehca_err(pd->device, "h_ret=%li pd=%p", PTR_ERR(ib_mw), pd);
+ return ib_mw;
+} /* end ehca_alloc_mw() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_bind_mw(struct ib_qp *qp,
+ struct ib_mw *mw,
+ struct ib_mw_bind *mw_bind)
+{
+ /* TODO: not supported up to now */
+ ehca_gen_err("bind MW currently not supported by HCAD");
+
+ return -EPERM;
+} /* end ehca_bind_mw() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_dealloc_mw(struct ib_mw *mw)
+{
+ u64 h_ret;
+ struct ehca_shca *shca =
+ container_of(mw->device, struct ehca_shca, ib_device);
+ struct ehca_mw *e_mw = container_of(mw, struct ehca_mw, ib_mw);
+
+ h_ret = hipz_h_free_resource_mw(shca->ipz_hca_handle, e_mw);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(mw->device, "hipz_free_mw failed, h_ret=%lli shca=%p "
+ "mw=%p rkey=%x hca_hndl=%llx mw_hndl=%llx",
+ h_ret, shca, mw, mw->rkey, shca->ipz_hca_handle.handle,
+ e_mw->ipz_mw_handle.handle);
+ return ehca2ib_return_code(h_ret);
+ }
+ /* successful deallocation */
+ ehca_mw_delete(e_mw);
+ return 0;
+} /* end ehca_dealloc_mw() */
+
+/*----------------------------------------------------------------------*/
+
+struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd,
+ int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr)
+{
+ struct ib_fmr *ib_fmr;
+ struct ehca_shca *shca =
+ container_of(pd->device, struct ehca_shca, ib_device);
+ struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd);
+ struct ehca_mr *e_fmr;
+ int ret;
+ u32 tmp_lkey, tmp_rkey;
+ struct ehca_mr_pginfo pginfo;
+ u64 hw_pgsize;
+
+ /* check other parameters */
+ if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) &&
+ !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) ||
+ ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
+ !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) {
+ /*
+ * Remote Write Access requires Local Write Access
+ * Remote Atomic Access requires Local Write Access
+ */
+ ehca_err(pd->device, "bad input values: mr_access_flags=%x",
+ mr_access_flags);
+ ib_fmr = ERR_PTR(-EINVAL);
+ goto alloc_fmr_exit0;
+ }
+ if (mr_access_flags & IB_ACCESS_MW_BIND) {
+ ehca_err(pd->device, "bad input values: mr_access_flags=%x",
+ mr_access_flags);
+ ib_fmr = ERR_PTR(-EINVAL);
+ goto alloc_fmr_exit0;
+ }
+ if ((fmr_attr->max_pages == 0) || (fmr_attr->max_maps == 0)) {
+ ehca_err(pd->device, "bad input values: fmr_attr->max_pages=%x "
+ "fmr_attr->max_maps=%x fmr_attr->page_shift=%x",
+ fmr_attr->max_pages, fmr_attr->max_maps,
+ fmr_attr->page_shift);
+ ib_fmr = ERR_PTR(-EINVAL);
+ goto alloc_fmr_exit0;
+ }
+
+ hw_pgsize = 1 << fmr_attr->page_shift;
+ if (!(hw_pgsize & shca->hca_cap_mr_pgsize)) {
+ ehca_err(pd->device, "unsupported fmr_attr->page_shift=%x",
+ fmr_attr->page_shift);
+ ib_fmr = ERR_PTR(-EINVAL);
+ goto alloc_fmr_exit0;
+ }
+
+ e_fmr = ehca_mr_new();
+ if (!e_fmr) {
+ ib_fmr = ERR_PTR(-ENOMEM);
+ goto alloc_fmr_exit0;
+ }
+ e_fmr->flags |= EHCA_MR_FLAG_FMR;
+
+ /* register MR on HCA */
+ memset(&pginfo, 0, sizeof(pginfo));
+ pginfo.hwpage_size = hw_pgsize;
+ /*
+ * pginfo.num_hwpages==0, i.e. register_rpages() will not be called
+ * here but deferred to map_phys_fmr()
+ */
+ ret = ehca_reg_mr(shca, e_fmr, NULL,
+ fmr_attr->max_pages * (1 << fmr_attr->page_shift),
+ mr_access_flags, e_pd, &pginfo,
+ &tmp_lkey, &tmp_rkey, EHCA_REG_MR);
+ if (ret) {
+ ib_fmr = ERR_PTR(ret);
+ goto alloc_fmr_exit1;
+ }
+
+ /* successful */
+ e_fmr->hwpage_size = hw_pgsize;
+ e_fmr->fmr_page_size = 1 << fmr_attr->page_shift;
+ e_fmr->fmr_max_pages = fmr_attr->max_pages;
+ e_fmr->fmr_max_maps = fmr_attr->max_maps;
+ e_fmr->fmr_map_cnt = 0;
+ return &e_fmr->ib.ib_fmr;
+
+alloc_fmr_exit1:
+ ehca_mr_delete(e_fmr);
+alloc_fmr_exit0:
+ return ib_fmr;
+} /* end ehca_alloc_fmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_map_phys_fmr(struct ib_fmr *fmr,
+ u64 *page_list,
+ int list_len,
+ u64 iova)
+{
+ int ret;
+ struct ehca_shca *shca =
+ container_of(fmr->device, struct ehca_shca, ib_device);
+ struct ehca_mr *e_fmr = container_of(fmr, struct ehca_mr, ib.ib_fmr);
+ struct ehca_pd *e_pd = container_of(fmr->pd, struct ehca_pd, ib_pd);
+ struct ehca_mr_pginfo pginfo;
+ u32 tmp_lkey, tmp_rkey;
+
+ if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) {
+ ehca_err(fmr->device, "not a FMR, e_fmr=%p e_fmr->flags=%x",
+ e_fmr, e_fmr->flags);
+ ret = -EINVAL;
+ goto map_phys_fmr_exit0;
+ }
+ ret = ehca_fmr_check_page_list(e_fmr, page_list, list_len);
+ if (ret)
+ goto map_phys_fmr_exit0;
+ if (iova % e_fmr->fmr_page_size) {
+ /* iova must be aligned to the fmr page size */
+ ehca_err(fmr->device, "bad iova, iova=%llx fmr_page_size=%x",
+ iova, e_fmr->fmr_page_size);
+ ret = -EINVAL;
+ goto map_phys_fmr_exit0;
+ }
+ if (e_fmr->fmr_map_cnt >= e_fmr->fmr_max_maps) {
+ /* HCAD does not limit the maps; trace this anyway */
+ ehca_info(fmr->device, "map limit exceeded, fmr=%p "
+ "e_fmr->fmr_map_cnt=%x e_fmr->fmr_max_maps=%x",
+ fmr, e_fmr->fmr_map_cnt, e_fmr->fmr_max_maps);
+ }
+
+ memset(&pginfo, 0, sizeof(pginfo));
+ pginfo.type = EHCA_MR_PGI_FMR;
+ pginfo.num_kpages = list_len;
+ pginfo.hwpage_size = e_fmr->hwpage_size;
+ pginfo.num_hwpages =
+ list_len * e_fmr->fmr_page_size / pginfo.hwpage_size;
+ pginfo.u.fmr.page_list = page_list;
+ pginfo.next_hwpage =
+ (iova & (e_fmr->fmr_page_size-1)) / pginfo.hwpage_size;
+ pginfo.u.fmr.fmr_pgsize = e_fmr->fmr_page_size;
+
+ ret = ehca_rereg_mr(shca, e_fmr, (u64 *)iova,
+ list_len * e_fmr->fmr_page_size,
+ e_fmr->acl, e_pd, &pginfo, &tmp_lkey, &tmp_rkey);
+ if (ret)
+ goto map_phys_fmr_exit0;
+
+ /* successful reregistration */
+ e_fmr->fmr_map_cnt++;
+ e_fmr->ib.ib_fmr.lkey = tmp_lkey;
+ e_fmr->ib.ib_fmr.rkey = tmp_rkey;
+ return 0;
+
+map_phys_fmr_exit0:
+ if (ret)
+ ehca_err(fmr->device, "ret=%i fmr=%p page_list=%p list_len=%x "
+ "iova=%llx", ret, fmr, page_list, list_len, iova);
+ return ret;
+} /* end ehca_map_phys_fmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_unmap_fmr(struct list_head *fmr_list)
+{
+ int ret = 0;
+ struct ib_fmr *ib_fmr;
+ struct ehca_shca *shca = NULL;
+ struct ehca_shca *prev_shca;
+ struct ehca_mr *e_fmr;
+ u32 num_fmr = 0;
+ u32 unmap_fmr_cnt = 0;
+
+ /* check all FMR belong to same SHCA, and check internal flag */
+ list_for_each_entry(ib_fmr, fmr_list, list) {
+ prev_shca = shca;
+ shca = container_of(ib_fmr->device, struct ehca_shca,
+ ib_device);
+ e_fmr = container_of(ib_fmr, struct ehca_mr, ib.ib_fmr);
+ if ((shca != prev_shca) && prev_shca) {
+ ehca_err(&shca->ib_device, "SHCA mismatch, shca=%p "
+ "prev_shca=%p e_fmr=%p",
+ shca, prev_shca, e_fmr);
+ ret = -EINVAL;
+ goto unmap_fmr_exit0;
+ }
+ if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) {
+ ehca_err(&shca->ib_device, "not a FMR, e_fmr=%p "
+ "e_fmr->flags=%x", e_fmr, e_fmr->flags);
+ ret = -EINVAL;
+ goto unmap_fmr_exit0;
+ }
+ num_fmr++;
+ }
+
+ /* loop over all FMRs to unmap */
+ list_for_each_entry(ib_fmr, fmr_list, list) {
+ unmap_fmr_cnt++;
+ e_fmr = container_of(ib_fmr, struct ehca_mr, ib.ib_fmr);
+ shca = container_of(ib_fmr->device, struct ehca_shca,
+ ib_device);
+ ret = ehca_unmap_one_fmr(shca, e_fmr);
+ if (ret) {
+ /* unmap failed, stop unmapping of rest of FMRs */
+ ehca_err(&shca->ib_device, "unmap of one FMR failed, "
+ "stop rest, e_fmr=%p num_fmr=%x "
+ "unmap_fmr_cnt=%x lkey=%x", e_fmr, num_fmr,
+ unmap_fmr_cnt, e_fmr->ib.ib_fmr.lkey);
+ goto unmap_fmr_exit0;
+ }
+ }
+
+unmap_fmr_exit0:
+ if (ret)
+ ehca_gen_err("ret=%i fmr_list=%p num_fmr=%x unmap_fmr_cnt=%x",
+ ret, fmr_list, num_fmr, unmap_fmr_cnt);
+ return ret;
+} /* end ehca_unmap_fmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_dealloc_fmr(struct ib_fmr *fmr)
+{
+ int ret;
+ u64 h_ret;
+ struct ehca_shca *shca =
+ container_of(fmr->device, struct ehca_shca, ib_device);
+ struct ehca_mr *e_fmr = container_of(fmr, struct ehca_mr, ib.ib_fmr);
+
+ if (!(e_fmr->flags & EHCA_MR_FLAG_FMR)) {
+ ehca_err(fmr->device, "not a FMR, e_fmr=%p e_fmr->flags=%x",
+ e_fmr, e_fmr->flags);
+ ret = -EINVAL;
+ goto free_fmr_exit0;
+ }
+
+ h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_fmr);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(fmr->device, "hipz_free_mr failed, h_ret=%lli e_fmr=%p "
+ "hca_hndl=%llx fmr_hndl=%llx fmr->lkey=%x",
+ h_ret, e_fmr, shca->ipz_hca_handle.handle,
+ e_fmr->ipz_mr_handle.handle, fmr->lkey);
+ ret = ehca2ib_return_code(h_ret);
+ goto free_fmr_exit0;
+ }
+ /* successful deregistration */
+ ehca_mr_delete(e_fmr);
+ return 0;
+
+free_fmr_exit0:
+ if (ret)
+ ehca_err(&shca->ib_device, "ret=%i fmr=%p", ret, fmr);
+ return ret;
+} /* end ehca_dealloc_fmr() */
+
+/*----------------------------------------------------------------------*/
+
+static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca,
+ struct ehca_mr *e_mr,
+ struct ehca_mr_pginfo *pginfo);
+
+int ehca_reg_mr(struct ehca_shca *shca,
+ struct ehca_mr *e_mr,
+ u64 *iova_start,
+ u64 size,
+ int acl,
+ struct ehca_pd *e_pd,
+ struct ehca_mr_pginfo *pginfo,
+ u32 *lkey, /*OUT*/
+ u32 *rkey, /*OUT*/
+ enum ehca_reg_type reg_type)
+{
+ int ret;
+ u64 h_ret;
+ u32 hipz_acl;
+ struct ehca_mr_hipzout_parms hipzout;
+
+ ehca_mrmw_map_acl(acl, &hipz_acl);
+ ehca_mrmw_set_pgsize_hipz_acl(pginfo->hwpage_size, &hipz_acl);
+ if (ehca_use_hp_mr == 1)
+ hipz_acl |= 0x00000001;
+
+ h_ret = hipz_h_alloc_resource_mr(shca->ipz_hca_handle, e_mr,
+ (u64)iova_start, size, hipz_acl,
+ e_pd->fw_pd, &hipzout);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "hipz_alloc_mr failed, h_ret=%lli "
+ "hca_hndl=%llx", h_ret, shca->ipz_hca_handle.handle);
+ ret = ehca2ib_return_code(h_ret);
+ goto ehca_reg_mr_exit0;
+ }
+
+ e_mr->ipz_mr_handle = hipzout.handle;
+
+ if (reg_type == EHCA_REG_BUSMAP_MR)
+ ret = ehca_reg_bmap_mr_rpages(shca, e_mr, pginfo);
+ else if (reg_type == EHCA_REG_MR)
+ ret = ehca_reg_mr_rpages(shca, e_mr, pginfo);
+ else
+ ret = -EINVAL;
+
+ if (ret)
+ goto ehca_reg_mr_exit1;
+
+ /* successful registration */
+ e_mr->num_kpages = pginfo->num_kpages;
+ e_mr->num_hwpages = pginfo->num_hwpages;
+ e_mr->hwpage_size = pginfo->hwpage_size;
+ e_mr->start = iova_start;
+ e_mr->size = size;
+ e_mr->acl = acl;
+ *lkey = hipzout.lkey;
+ *rkey = hipzout.rkey;
+ return 0;
+
+ehca_reg_mr_exit1:
+ h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "h_ret=%lli shca=%p e_mr=%p "
+ "iova_start=%p size=%llx acl=%x e_pd=%p lkey=%x "
+ "pginfo=%p num_kpages=%llx num_hwpages=%llx ret=%i",
+ h_ret, shca, e_mr, iova_start, size, acl, e_pd,
+ hipzout.lkey, pginfo, pginfo->num_kpages,
+ pginfo->num_hwpages, ret);
+ ehca_err(&shca->ib_device, "internal error in ehca_reg_mr, "
+ "not recoverable");
+ }
+ehca_reg_mr_exit0:
+ if (ret)
+ ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p "
+ "iova_start=%p size=%llx acl=%x e_pd=%p pginfo=%p "
+ "num_kpages=%llx num_hwpages=%llx",
+ ret, shca, e_mr, iova_start, size, acl, e_pd, pginfo,
+ pginfo->num_kpages, pginfo->num_hwpages);
+ return ret;
+} /* end ehca_reg_mr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_reg_mr_rpages(struct ehca_shca *shca,
+ struct ehca_mr *e_mr,
+ struct ehca_mr_pginfo *pginfo)
+{
+ int ret = 0;
+ u64 h_ret;
+ u32 rnum;
+ u64 rpage;
+ u32 i;
+ u64 *kpage;
+
+ if (!pginfo->num_hwpages) /* in case of fmr */
+ return 0;
+
+ kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+ if (!kpage) {
+ ehca_err(&shca->ib_device, "kpage alloc failed");
+ ret = -ENOMEM;
+ goto ehca_reg_mr_rpages_exit0;
+ }
+
+ /* max MAX_RPAGES ehca mr pages per register call */
+ for (i = 0; i < NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES); i++) {
+
+ if (i == NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES) - 1) {
+ rnum = pginfo->num_hwpages % MAX_RPAGES; /* last shot */
+ if (rnum == 0)
+ rnum = MAX_RPAGES; /* last shot is full */
+ } else
+ rnum = MAX_RPAGES;
+
+ ret = ehca_set_pagebuf(pginfo, rnum, kpage);
+ if (ret) {
+ ehca_err(&shca->ib_device, "ehca_set_pagebuf "
+ "bad rc, ret=%i rnum=%x kpage=%p",
+ ret, rnum, kpage);
+ goto ehca_reg_mr_rpages_exit1;
+ }
+
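+ /*
+ * With more than one rpage the hcall gets the physical address
+ * of the kpage list; with a single rpage it gets that page's
+ * address directly.
+ */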
+ if (rnum > 1) {
+ rpage = __pa(kpage);
+ if (!rpage) {
+ ehca_err(&shca->ib_device, "kpage=%p i=%x",
+ kpage, i);
+ ret = -EFAULT;
+ goto ehca_reg_mr_rpages_exit1;
+ }
+ } else
+ rpage = *kpage;
+
+ h_ret = hipz_h_register_rpage_mr(
+ shca->ipz_hca_handle, e_mr,
+ ehca_encode_hwpage_size(pginfo->hwpage_size),
+ 0, rpage, rnum);
+
+ if (i == NUM_CHUNKS(pginfo->num_hwpages, MAX_RPAGES) - 1) {
+ /*
+ * check for 'registration complete'==H_SUCCESS
+ * and for 'page registered'==H_PAGE_REGISTERED
+ */
+ if (h_ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "last "
+ "hipz_reg_rpage_mr failed, h_ret=%lli "
+ "e_mr=%p i=%x hca_hndl=%llx mr_hndl=%llx"
+ " lkey=%x", h_ret, e_mr, i,
+ shca->ipz_hca_handle.handle,
+ e_mr->ipz_mr_handle.handle,
+ e_mr->ib.ib_mr.lkey);
+ ret = ehca2ib_return_code(h_ret);
+ break;
+ } else
+ ret = 0;
+ } else if (h_ret != H_PAGE_REGISTERED) {
+ ehca_err(&shca->ib_device, "hipz_reg_rpage_mr failed, "
+ "h_ret=%lli e_mr=%p i=%x lkey=%x hca_hndl=%llx "
+ "mr_hndl=%llx", h_ret, e_mr, i,
+ e_mr->ib.ib_mr.lkey,
+ shca->ipz_hca_handle.handle,
+ e_mr->ipz_mr_handle.handle);
+ ret = ehca2ib_return_code(h_ret);
+ break;
+ } else
+ ret = 0;
+ } /* end for(i) */
+
+
+ehca_reg_mr_rpages_exit1:
+ ehca_free_fw_ctrlblock(kpage);
+ehca_reg_mr_rpages_exit0:
+ if (ret)
+ ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p pginfo=%p "
+ "num_kpages=%llx num_hwpages=%llx", ret, shca, e_mr,
+ pginfo, pginfo->num_kpages, pginfo->num_hwpages);
+ return ret;
+} /* end ehca_reg_mr_rpages() */
+
+/*----------------------------------------------------------------------*/
+
+inline int ehca_rereg_mr_rereg1(struct ehca_shca *shca,
+ struct ehca_mr *e_mr,
+ u64 *iova_start,
+ u64 size,
+ u32 acl,
+ struct ehca_pd *e_pd,
+ struct ehca_mr_pginfo *pginfo,
+ u32 *lkey, /*OUT*/
+ u32 *rkey) /*OUT*/
+{
+ int ret;
+ u64 h_ret;
+ u32 hipz_acl;
+ u64 *kpage;
+ u64 rpage;
+ struct ehca_mr_pginfo pginfo_save;
+ struct ehca_mr_hipzout_parms hipzout;
+
+ ehca_mrmw_map_acl(acl, &hipz_acl);
+ ehca_mrmw_set_pgsize_hipz_acl(pginfo->hwpage_size, &hipz_acl);
+
+ kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+ if (!kpage) {
+ ehca_err(&shca->ib_device, "kpage alloc failed");
+ ret = -ENOMEM;
+ goto ehca_rereg_mr_rereg1_exit0;
+ }
+
+ pginfo_save = *pginfo;
+ ret = ehca_set_pagebuf(pginfo, pginfo->num_hwpages, kpage);
+ if (ret) {
+ ehca_err(&shca->ib_device, "set pagebuf failed, e_mr=%p "
+ "pginfo=%p type=%x num_kpages=%llx num_hwpages=%llx "
+ "kpage=%p", e_mr, pginfo, pginfo->type,
+ pginfo->num_kpages, pginfo->num_hwpages, kpage);
+ goto ehca_rereg_mr_rereg1_exit1;
+ }
+ rpage = __pa(kpage);
+ if (!rpage) {
+ ehca_err(&shca->ib_device, "kpage=%p", kpage);
+ ret = -EFAULT;
+ goto ehca_rereg_mr_rereg1_exit1;
+ }
+ h_ret = hipz_h_reregister_pmr(shca->ipz_hca_handle, e_mr,
+ (u64)iova_start, size, hipz_acl,
+ e_pd->fw_pd, rpage, &hipzout);
+ if (h_ret != H_SUCCESS) {
+ /*
+ * reregistration unsuccessful, try it again with the 3 hCalls,
+ * e.g. this is required in case H_MR_CONDITION
+ * (MW bound or MR is shared)
+ */
+ ehca_warn(&shca->ib_device, "hipz_h_reregister_pmr failed "
+ "(Rereg1), h_ret=%lli e_mr=%p", h_ret, e_mr);
+ *pginfo = pginfo_save;
+ ret = -EAGAIN;
+ } else if ((u64 *)hipzout.vaddr != iova_start) {
+ ehca_err(&shca->ib_device, "PHYP changed iova_start in "
+ "rereg_pmr, iova_start=%p iova_start_out=%llx e_mr=%p "
+ "mr_handle=%llx lkey=%x lkey_out=%x", iova_start,
+ hipzout.vaddr, e_mr, e_mr->ipz_mr_handle.handle,
+ e_mr->ib.ib_mr.lkey, hipzout.lkey);
+ ret = -EFAULT;
+ } else {
+ /*
+ * successful reregistration
+ * note: start and start_out are identical for eServer HCAs
+ */
+ e_mr->num_kpages = pginfo->num_kpages;
+ e_mr->num_hwpages = pginfo->num_hwpages;
+ e_mr->hwpage_size = pginfo->hwpage_size;
+ e_mr->start = iova_start;
+ e_mr->size = size;
+ e_mr->acl = acl;
+ *lkey = hipzout.lkey;
+ *rkey = hipzout.rkey;
+ }
+
+ehca_rereg_mr_rereg1_exit1:
+ ehca_free_fw_ctrlblock(kpage);
+ehca_rereg_mr_rereg1_exit0:
+ if (ret && (ret != -EAGAIN))
+ ehca_err(&shca->ib_device, "ret=%i lkey=%x rkey=%x "
+ "pginfo=%p num_kpages=%llx num_hwpages=%llx",
+ ret, *lkey, *rkey, pginfo, pginfo->num_kpages,
+ pginfo->num_hwpages);
+ return ret;
+} /* end ehca_rereg_mr_rereg1() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_rereg_mr(struct ehca_shca *shca,
+ struct ehca_mr *e_mr,
+ u64 *iova_start,
+ u64 size,
+ int acl,
+ struct ehca_pd *e_pd,
+ struct ehca_mr_pginfo *pginfo,
+ u32 *lkey,
+ u32 *rkey)
+{
+ int ret = 0;
+ u64 h_ret;
+ int rereg_1_hcall = 1; /* 1: use hipz_h_reregister_pmr directly */
+ int rereg_3_hcall = 0; /* 1: use 3 hipz calls for reregistration */
+
+ /* first determine reregistration hCall(s) */
+ if ((pginfo->num_hwpages > MAX_RPAGES) ||
+ (e_mr->num_hwpages > MAX_RPAGES) ||
+ (pginfo->num_hwpages > e_mr->num_hwpages)) {
+ ehca_dbg(&shca->ib_device, "Rereg3 case, "
+ "pginfo->num_hwpages=%llx e_mr->num_hwpages=%x",
+ pginfo->num_hwpages, e_mr->num_hwpages);
+ rereg_1_hcall = 0;
+ rereg_3_hcall = 1;
+ }
+
+ if (e_mr->flags & EHCA_MR_FLAG_MAXMR) { /* check for max-MR */
+ rereg_1_hcall = 0;
+ rereg_3_hcall = 1;
+ e_mr->flags &= ~EHCA_MR_FLAG_MAXMR;
+ ehca_err(&shca->ib_device, "Rereg MR for max-MR! e_mr=%p",
+ e_mr);
+ }
+
+ if (rereg_1_hcall) {
+ ret = ehca_rereg_mr_rereg1(shca, e_mr, iova_start, size,
+ acl, e_pd, pginfo, lkey, rkey);
+ if (ret) {
+ if (ret == -EAGAIN)
+ rereg_3_hcall = 1;
+ else
+ goto ehca_rereg_mr_exit0;
+ }
+ }
+
+ if (rereg_3_hcall) {
+ struct ehca_mr save_mr;
+
+ /* first deregister old MR */
+ h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "hipz_free_mr failed, "
+ "h_ret=%lli e_mr=%p hca_hndl=%llx mr_hndl=%llx "
+ "mr->lkey=%x",
+ h_ret, e_mr, shca->ipz_hca_handle.handle,
+ e_mr->ipz_mr_handle.handle,
+ e_mr->ib.ib_mr.lkey);
+ ret = ehca2ib_return_code(h_ret);
+ goto ehca_rereg_mr_exit0;
+ }
+ /* clean ehca_mr_t, without changing struct ib_mr and lock */
+ save_mr = *e_mr;
+ ehca_mr_deletenew(e_mr);
+
+ /* set some MR values */
+ e_mr->flags = save_mr.flags;
+ e_mr->hwpage_size = save_mr.hwpage_size;
+ e_mr->fmr_page_size = save_mr.fmr_page_size;
+ e_mr->fmr_max_pages = save_mr.fmr_max_pages;
+ e_mr->fmr_max_maps = save_mr.fmr_max_maps;
+ e_mr->fmr_map_cnt = save_mr.fmr_map_cnt;
+
+ ret = ehca_reg_mr(shca, e_mr, iova_start, size, acl,
+ e_pd, pginfo, lkey, rkey, EHCA_REG_MR);
+ if (ret) {
+ u32 offset = (u64)(&e_mr->flags) - (u64)e_mr;
+ memcpy(&e_mr->flags, &(save_mr.flags),
+ sizeof(struct ehca_mr) - offset);
+ goto ehca_rereg_mr_exit0;
+ }
+ }
+
+ehca_rereg_mr_exit0:
+ if (ret)
+ ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p "
+ "iova_start=%p size=%llx acl=%x e_pd=%p pginfo=%p "
+ "num_kpages=%llx lkey=%x rkey=%x rereg_1_hcall=%x "
+ "rereg_3_hcall=%x", ret, shca, e_mr, iova_start, size,
+ acl, e_pd, pginfo, pginfo->num_kpages, *lkey, *rkey,
+ rereg_1_hcall, rereg_3_hcall);
+ return ret;
+} /* end ehca_rereg_mr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_unmap_one_fmr(struct ehca_shca *shca,
+ struct ehca_mr *e_fmr)
+{
+ int ret = 0;
+ u64 h_ret;
+ struct ehca_pd *e_pd =
+ container_of(e_fmr->ib.ib_fmr.pd, struct ehca_pd, ib_pd);
+ struct ehca_mr save_fmr;
+ u32 tmp_lkey, tmp_rkey;
+ struct ehca_mr_pginfo pginfo;
+ struct ehca_mr_hipzout_parms hipzout;
+ struct ehca_mr save_mr;
+
+ if (e_fmr->fmr_max_pages <= MAX_RPAGES) {
+ /*
+ * note: after using rereg hcall with len=0,
+ * rereg hcall must be used again for registering pages
+ */
+ h_ret = hipz_h_reregister_pmr(shca->ipz_hca_handle, e_fmr, 0,
+ 0, 0, e_pd->fw_pd, 0, &hipzout);
+ if (h_ret == H_SUCCESS) {
+ /* successful reregistration */
+ e_fmr->start = NULL;
+ e_fmr->size = 0;
+ tmp_lkey = hipzout.lkey;
+ tmp_rkey = hipzout.rkey;
+ return 0;
+ }
+ /*
+ * should not happen, because length checked above,
+ * FMRs are not shared and no MW bound to FMRs
+ */
+ ehca_err(&shca->ib_device, "hipz_reregister_pmr failed "
+ "(Rereg1), h_ret=%lli e_fmr=%p hca_hndl=%llx "
+ "mr_hndl=%llx lkey=%x lkey_out=%x",
+ h_ret, e_fmr, shca->ipz_hca_handle.handle,
+ e_fmr->ipz_mr_handle.handle,
+ e_fmr->ib.ib_fmr.lkey, hipzout.lkey);
+ /* try free and rereg */
+ }
+
+ /* first free old FMR */
+ h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_fmr);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "hipz_free_mr failed, "
+ "h_ret=%lli e_fmr=%p hca_hndl=%llx mr_hndl=%llx "
+ "lkey=%x",
+ h_ret, e_fmr, shca->ipz_hca_handle.handle,
+ e_fmr->ipz_mr_handle.handle,
+ e_fmr->ib.ib_fmr.lkey);
+ ret = ehca2ib_return_code(h_ret);
+ goto ehca_unmap_one_fmr_exit0;
+ }
+ /* clean ehca_mr_t, without changing lock */
+ save_fmr = *e_fmr;
+ ehca_mr_deletenew(e_fmr);
+
+ /* set some MR values */
+ e_fmr->flags = save_fmr.flags;
+ e_fmr->hwpage_size = save_fmr.hwpage_size;
+ e_fmr->fmr_page_size = save_fmr.fmr_page_size;
+ e_fmr->fmr_max_pages = save_fmr.fmr_max_pages;
+ e_fmr->fmr_max_maps = save_fmr.fmr_max_maps;
+ e_fmr->fmr_map_cnt = save_fmr.fmr_map_cnt;
+ e_fmr->acl = save_fmr.acl;
+
+ memset(&pginfo, 0, sizeof(pginfo));
+ pginfo.type = EHCA_MR_PGI_FMR;
+ ret = ehca_reg_mr(shca, e_fmr, NULL,
+ (e_fmr->fmr_max_pages * e_fmr->fmr_page_size),
+ e_fmr->acl, e_pd, &pginfo, &tmp_lkey,
+ &tmp_rkey, EHCA_REG_MR);
+ if (ret) {
+ u32 offset = (u64)(&e_fmr->flags) - (u64)e_fmr;
+ memcpy(&e_fmr->flags, &(save_mr.flags),
+ sizeof(struct ehca_mr) - offset);
+ }
+
+ehca_unmap_one_fmr_exit0:
+ if (ret)
+ ehca_err(&shca->ib_device, "ret=%i tmp_lkey=%x tmp_rkey=%x "
+ "fmr_max_pages=%x",
+ ret, tmp_lkey, tmp_rkey, e_fmr->fmr_max_pages);
+ return ret;
+} /* end ehca_unmap_one_fmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_reg_smr(struct ehca_shca *shca,
+ struct ehca_mr *e_origmr,
+ struct ehca_mr *e_newmr,
+ u64 *iova_start,
+ int acl,
+ struct ehca_pd *e_pd,
+ u32 *lkey, /*OUT*/
+ u32 *rkey) /*OUT*/
+{
+ int ret = 0;
+ u64 h_ret;
+ u32 hipz_acl;
+ struct ehca_mr_hipzout_parms hipzout;
+
+ ehca_mrmw_map_acl(acl, &hipz_acl);
+ ehca_mrmw_set_pgsize_hipz_acl(e_origmr->hwpage_size, &hipz_acl);
+
+ h_ret = hipz_h_register_smr(shca->ipz_hca_handle, e_newmr, e_origmr,
+ (u64)iova_start, hipz_acl, e_pd->fw_pd,
+ &hipzout);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%lli "
+ "shca=%p e_origmr=%p e_newmr=%p iova_start=%p acl=%x "
+ "e_pd=%p hca_hndl=%llx mr_hndl=%llx lkey=%x",
+ h_ret, shca, e_origmr, e_newmr, iova_start, acl, e_pd,
+ shca->ipz_hca_handle.handle,
+ e_origmr->ipz_mr_handle.handle,
+ e_origmr->ib.ib_mr.lkey);
+ ret = ehca2ib_return_code(h_ret);
+ goto ehca_reg_smr_exit0;
+ }
+ /* successful registration */
+ e_newmr->num_kpages = e_origmr->num_kpages;
+ e_newmr->num_hwpages = e_origmr->num_hwpages;
+ e_newmr->hwpage_size = e_origmr->hwpage_size;
+ e_newmr->start = iova_start;
+ e_newmr->size = e_origmr->size;
+ e_newmr->acl = acl;
+ e_newmr->ipz_mr_handle = hipzout.handle;
+ *lkey = hipzout.lkey;
+ *rkey = hipzout.rkey;
+ return 0;
+
+ehca_reg_smr_exit0:
+ if (ret)
+ ehca_err(&shca->ib_device, "ret=%i shca=%p e_origmr=%p "
+ "e_newmr=%p iova_start=%p acl=%x e_pd=%p",
+ ret, shca, e_origmr, e_newmr, iova_start, acl, e_pd);
+ return ret;
+} /* end ehca_reg_smr() */
+
+/*----------------------------------------------------------------------*/
+static inline void *ehca_calc_sectbase(int top, int dir, int idx)
+{
+ unsigned long ret = idx;
+ ret |= dir << EHCA_DIR_INDEX_SHIFT;
+ ret |= top << EHCA_TOP_INDEX_SHIFT;
+ return __va(ret << SECTION_SIZE_BITS);
+}
+
+#define ehca_bmap_valid(entry) \
+ ((u64)entry != (u64)EHCA_INVAL_ADDR)
+
+static u64 ehca_reg_mr_section(int top, int dir, int idx, u64 *kpage,
+ struct ehca_shca *shca, struct ehca_mr *mr,
+ struct ehca_mr_pginfo *pginfo)
+{
+ u64 h_ret = 0;
+ unsigned long page = 0;
+ u64 rpage = __pa(kpage);
+ int page_count;
+
+ void *sectbase = ehca_calc_sectbase(top, dir, idx);
+ if ((unsigned long)sectbase & (pginfo->hwpage_size - 1)) {
+ ehca_err(&shca->ib_device, "reg_mr_section will probably fail:"
+ "hwpage_size does not fit to "
+ "section start address");
+ }
+ page_count = EHCA_SECTSIZE / pginfo->hwpage_size;
+
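+ /*
+ * Register the section's hwpages in chunks of at most MAX_RPAGES
+ * per hipz_h_register_rpage_mr() call; kpage is reused as the
+ * rpage list buffer for every chunk.
+ */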
+ while (page < page_count) {
+ u64 rnum;
+ for (rnum = 0; (rnum < MAX_RPAGES) && (page < page_count);
+ rnum++) {
+ void *pg = sectbase + ((page++) * pginfo->hwpage_size);
+ kpage[rnum] = __pa(pg);
+ }
+
+ h_ret = hipz_h_register_rpage_mr(shca->ipz_hca_handle, mr,
+ ehca_encode_hwpage_size(pginfo->hwpage_size),
+ 0, rpage, rnum);
+
+ if ((h_ret != H_SUCCESS) && (h_ret != H_PAGE_REGISTERED)) {
+ ehca_err(&shca->ib_device, "register_rpage_mr failed");
+ return h_ret;
+ }
+ }
+ return h_ret;
+}
+
+static u64 ehca_reg_mr_sections(int top, int dir, u64 *kpage,
+ struct ehca_shca *shca, struct ehca_mr *mr,
+ struct ehca_mr_pginfo *pginfo)
+{
+ u64 hret = H_SUCCESS;
+ int idx;
+
+ for (idx = 0; idx < EHCA_MAP_ENTRIES; idx++) {
+ if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]->ent[idx]))
+ continue;
+
+ hret = ehca_reg_mr_section(top, dir, idx, kpage, shca, mr,
+ pginfo);
+ if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED))
+ return hret;
+ }
+ return hret;
+}
+
+static u64 ehca_reg_mr_dir_sections(int top, u64 *kpage, struct ehca_shca *shca,
+ struct ehca_mr *mr,
+ struct ehca_mr_pginfo *pginfo)
+{
+ u64 hret = H_SUCCESS;
+ int dir;
+
+ for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) {
+ if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
+ continue;
+
+ hret = ehca_reg_mr_sections(top, dir, kpage, shca, mr, pginfo);
+ if ((hret != H_SUCCESS) && (hret != H_PAGE_REGISTERED))
+ return hret;
+ }
+ return hret;
+}
+
+/* register internal max-MR to internal SHCA */
+int ehca_reg_internal_maxmr(
+ struct ehca_shca *shca,
+ struct ehca_pd *e_pd,
+ struct ehca_mr **e_maxmr) /*OUT*/
+{
+ int ret;
+ struct ehca_mr *e_mr;
+ u64 *iova_start;
+ u64 size_maxmr;
+ struct ehca_mr_pginfo pginfo;
+ struct ib_phys_buf ib_pbuf;
+ u32 num_kpages;
+ u32 num_hwpages;
+ u64 hw_pgsize;
+
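+ /*
+ * The internal max-MR is registered through the DMEM toleration
+ * busmap (EHCA_REG_BUSMAP_MR below), so the busmap must already
+ * exist at this point.
+ */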
+ if (!ehca_bmap) {
+ ret = -EFAULT;
+ goto ehca_reg_internal_maxmr_exit0;
+ }
+
+ e_mr = ehca_mr_new();
+ if (!e_mr) {
+ ehca_err(&shca->ib_device, "out of memory");
+ ret = -ENOMEM;
+ goto ehca_reg_internal_maxmr_exit0;
+ }
+ e_mr->flags |= EHCA_MR_FLAG_MAXMR;
+
+ /* register internal max-MR on HCA */
+ size_maxmr = ehca_mr_len;
+ iova_start = (u64 *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START));
+ ib_pbuf.addr = 0;
+ ib_pbuf.size = size_maxmr;
+ num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size_maxmr,
+ PAGE_SIZE);
+ hw_pgsize = ehca_get_max_hwpage_size(shca);
+ num_hwpages = NUM_CHUNKS(((u64)iova_start % hw_pgsize) + size_maxmr,
+ hw_pgsize);
+
+ memset(&pginfo, 0, sizeof(pginfo));
+ pginfo.type = EHCA_MR_PGI_PHYS;
+ pginfo.num_kpages = num_kpages;
+ pginfo.num_hwpages = num_hwpages;
+ pginfo.hwpage_size = hw_pgsize;
+ pginfo.u.phy.num_phys_buf = 1;
+ pginfo.u.phy.phys_buf_array = &ib_pbuf;
+
+ ret = ehca_reg_mr(shca, e_mr, iova_start, size_maxmr, 0, e_pd,
+ &pginfo, &e_mr->ib.ib_mr.lkey,
+ &e_mr->ib.ib_mr.rkey, EHCA_REG_BUSMAP_MR);
+ if (ret) {
+ ehca_err(&shca->ib_device, "reg of internal max MR failed, "
+ "e_mr=%p iova_start=%p size_maxmr=%llx num_kpages=%x "
+ "num_hwpages=%x", e_mr, iova_start, size_maxmr,
+ num_kpages, num_hwpages);
+ goto ehca_reg_internal_maxmr_exit1;
+ }
+
+ /* successful registration of all pages */
+ e_mr->ib.ib_mr.device = e_pd->ib_pd.device;
+ e_mr->ib.ib_mr.pd = &e_pd->ib_pd;
+ e_mr->ib.ib_mr.uobject = NULL;
+ atomic_inc(&(e_pd->ib_pd.usecnt));
+ atomic_set(&(e_mr->ib.ib_mr.usecnt), 0);
+ *e_maxmr = e_mr;
+ return 0;
+
+ehca_reg_internal_maxmr_exit1:
+ ehca_mr_delete(e_mr);
+ehca_reg_internal_maxmr_exit0:
+ if (ret)
+ ehca_err(&shca->ib_device, "ret=%i shca=%p e_pd=%p e_maxmr=%p",
+ ret, shca, e_pd, e_maxmr);
+ return ret;
+} /* end ehca_reg_internal_maxmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_reg_maxmr(struct ehca_shca *shca,
+ struct ehca_mr *e_newmr,
+ u64 *iova_start,
+ int acl,
+ struct ehca_pd *e_pd,
+ u32 *lkey,
+ u32 *rkey)
+{
+ u64 h_ret;
+ struct ehca_mr *e_origmr = shca->maxmr;
+ u32 hipz_acl;
+ struct ehca_mr_hipzout_parms hipzout;
+
+ ehca_mrmw_map_acl(acl, &hipz_acl);
+ ehca_mrmw_set_pgsize_hipz_acl(e_origmr->hwpage_size, &hipz_acl);
+
+ h_ret = hipz_h_register_smr(shca->ipz_hca_handle, e_newmr, e_origmr,
+ (u64)iova_start, hipz_acl, e_pd->fw_pd,
+ &hipzout);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%lli "
+ "e_origmr=%p hca_hndl=%llx mr_hndl=%llx lkey=%x",
+ h_ret, e_origmr, shca->ipz_hca_handle.handle,
+ e_origmr->ipz_mr_handle.handle,
+ e_origmr->ib.ib_mr.lkey);
+ return ehca2ib_return_code(h_ret);
+ }
+ /* successful registration */
+ e_newmr->num_kpages = e_origmr->num_kpages;
+ e_newmr->num_hwpages = e_origmr->num_hwpages;
+ e_newmr->hwpage_size = e_origmr->hwpage_size;
+ e_newmr->start = iova_start;
+ e_newmr->size = e_origmr->size;
+ e_newmr->acl = acl;
+ e_newmr->ipz_mr_handle = hipzout.handle;
+ *lkey = hipzout.lkey;
+ *rkey = hipzout.rkey;
+ return 0;
+} /* end ehca_reg_maxmr() */
+
+/*----------------------------------------------------------------------*/
+
+int ehca_dereg_internal_maxmr(struct ehca_shca *shca)
+{
+ int ret;
+ struct ehca_mr *e_maxmr;
+ struct ib_pd *ib_pd;
+
+ if (!shca->maxmr) {
+ ehca_err(&shca->ib_device, "bad call, shca=%p", shca);
+ ret = -EINVAL;
+ goto ehca_dereg_internal_maxmr_exit0;
+ }
+
+ e_maxmr = shca->maxmr;
+ ib_pd = e_maxmr->ib.ib_mr.pd;
+ shca->maxmr = NULL; /* remove internal max-MR indication from SHCA */
+
+ ret = ehca_dereg_mr(&e_maxmr->ib.ib_mr);
+ if (ret) {
+ ehca_err(&shca->ib_device, "dereg internal max-MR failed, "
+ "ret=%i e_maxmr=%p shca=%p lkey=%x",
+ ret, e_maxmr, shca, e_maxmr->ib.ib_mr.lkey);
+ shca->maxmr = e_maxmr;
+ goto ehca_dereg_internal_maxmr_exit0;
+ }
+
+ atomic_dec(&ib_pd->usecnt);
+
+ehca_dereg_internal_maxmr_exit0:
+ if (ret)
+ ehca_err(&shca->ib_device, "ret=%i shca=%p shca->maxmr=%p",
+ ret, shca, shca->maxmr);
+ return ret;
+} /* end ehca_dereg_internal_maxmr() */
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * check the physical buffer array of MR verbs for validity and
+ * calculate the MR size
+ */
+int ehca_mr_chk_buf_and_calc_size(struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ u64 *iova_start,
+ u64 *size)
+{
+ struct ib_phys_buf *pbuf = phys_buf_array;
+ u64 size_count = 0;
+ u32 i;
+
+ if (num_phys_buf == 0) {
+ ehca_gen_err("bad phys buf array len, num_phys_buf=0");
+ return -EINVAL;
+ }
+ /* check first buffer */
+ if (((u64)iova_start & ~PAGE_MASK) != (pbuf->addr & ~PAGE_MASK)) {
+ ehca_gen_err("iova_start/addr mismatch, iova_start=%p "
+ "pbuf->addr=%llx pbuf->size=%llx",
+ iova_start, pbuf->addr, pbuf->size);
+ return -EINVAL;
+ }
+ if (((pbuf->addr + pbuf->size) % PAGE_SIZE) &&
+ (num_phys_buf > 1)) {
+ ehca_gen_err("addr/size mismatch in 1st buf, pbuf->addr=%llx "
+ "pbuf->size=%llx", pbuf->addr, pbuf->size);
+ return -EINVAL;
+ }
+
+ for (i = 0; i < num_phys_buf; i++) {
+ if ((i > 0) && (pbuf->addr % PAGE_SIZE)) {
+ ehca_gen_err("bad address, i=%x pbuf->addr=%llx "
+ "pbuf->size=%llx",
+ i, pbuf->addr, pbuf->size);
+ return -EINVAL;
+ }
+ if (((i > 0) && /* not 1st */
+ (i < (num_phys_buf - 1)) && /* not last */
+ (pbuf->size % PAGE_SIZE)) || (pbuf->size == 0)) {
+ ehca_gen_err("bad size, i=%x pbuf->size=%llx",
+ i, pbuf->size);
+ return -EINVAL;
+ }
+ size_count += pbuf->size;
+ pbuf++;
+ }
+
+ *size = size_count;
+ return 0;
+} /* end ehca_mr_chk_buf_and_calc_size() */
+
+/*----------------------------------------------------------------------*/
+
+/* check the page list of the map FMR verb for validity */
+int ehca_fmr_check_page_list(struct ehca_mr *e_fmr,
+ u64 *page_list,
+ int list_len)
+{
+ u32 i;
+ u64 *page;
+
+ if ((list_len == 0) || (list_len > e_fmr->fmr_max_pages)) {
+ ehca_gen_err("bad list_len, list_len=%x "
+ "e_fmr->fmr_max_pages=%x fmr=%p",
+ list_len, e_fmr->fmr_max_pages, e_fmr);
+ return -EINVAL;
+ }
+
+ /* each page must be aligned */
+ page = page_list;
+ for (i = 0; i < list_len; i++) {
+ if (*page % e_fmr->fmr_page_size) {
+ ehca_gen_err("bad page, i=%x *page=%llx page=%p fmr=%p "
+ "fmr_page_size=%x", i, *page, page, e_fmr,
+ e_fmr->fmr_page_size);
+ return -EINVAL;
+ }
+ page++;
+ }
+
+ return 0;
+} /* end ehca_fmr_check_page_list() */
+
+/*----------------------------------------------------------------------*/
+
+/* PAGE_SIZE >= pginfo->hwpage_size */
+static int ehca_set_pagebuf_user1(struct ehca_mr_pginfo *pginfo,
+ u32 number,
+ u64 *kpage)
+{
+ int ret = 0;
+ u64 pgaddr;
+ u32 j = 0;
+ int hwpages_per_kpage = PAGE_SIZE / pginfo->hwpage_size;
+ struct scatterlist **sg = &pginfo->u.usr.next_sg;
+
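+ /* walk the user SG list; each kernel page yields hwpages_per_kpage hw page addresses */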
+ while (*sg != NULL) {
+ pgaddr = page_to_pfn(sg_page(*sg))
+ << PAGE_SHIFT;
+ *kpage = pgaddr + (pginfo->next_hwpage *
+ pginfo->hwpage_size);
+ if (!(*kpage)) {
+ ehca_gen_err("pgaddr=%llx "
+ "sg_dma_address=%llx "
+ "entry=%llx next_hwpage=%llx",
+ pgaddr, (u64)sg_dma_address(*sg),
+ pginfo->u.usr.next_nmap,
+ pginfo->next_hwpage);
+ return -EFAULT;
+ }
+ (pginfo->hwpage_cnt)++;
+ (pginfo->next_hwpage)++;
+ kpage++;
+ if (pginfo->next_hwpage % hwpages_per_kpage == 0) {
+ (pginfo->kpage_cnt)++;
+ (pginfo->u.usr.next_nmap)++;
+ pginfo->next_hwpage = 0;
+ *sg = sg_next(*sg);
+ }
+ j++;
+ if (j >= number)
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * check the given pages for a contiguous layout;
+ * the last page address is returned in prev_pgaddr for further checks
+ */
+static int ehca_check_kpages_per_ate(struct scatterlist **sg,
+ int num_pages,
+ u64 *prev_pgaddr)
+{
+ for (; *sg && num_pages > 0; *sg = sg_next(*sg), num_pages--) {
+ u64 pgaddr = page_to_pfn(sg_page(*sg)) << PAGE_SHIFT;
+ if (ehca_debug_level >= 3)
+ ehca_gen_dbg("chunk_page=%llx value=%016llx", pgaddr,
+ *(u64 *)__va(pgaddr));
+ if (pgaddr - PAGE_SIZE != *prev_pgaddr) {
+ ehca_gen_err("uncontiguous page found pgaddr=%llx "
+ "prev_pgaddr=%llx entries_left_in_hwpage=%x",
+ pgaddr, *prev_pgaddr, num_pages);
+ return -EINVAL;
+ }
+ *prev_pgaddr = pgaddr;
+ }
+ return 0;
+}
+
+/* PAGE_SIZE < pginfo->hwpage_size */
+static int ehca_set_pagebuf_user2(struct ehca_mr_pginfo *pginfo,
+ u32 number,
+ u64 *kpage)
+{
+ int ret = 0;
+ u64 pgaddr, prev_pgaddr;
+ u32 j = 0;
+ int kpages_per_hwpage = pginfo->hwpage_size / PAGE_SIZE;
+ int nr_kpages = kpages_per_hwpage;
+ struct scatterlist **sg = &pginfo->u.usr.next_sg;
+
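+ /* nr_kpages counts the kernel pages still needed to complete the current hw page */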
+ while (*sg != NULL) {
+
+ if (nr_kpages == kpages_per_hwpage) {
+ pgaddr = (page_to_pfn(sg_page(*sg))
+ << PAGE_SHIFT);
+ *kpage = pgaddr;
+ if (!(*kpage)) {
+ ehca_gen_err("pgaddr=%llx entry=%llx",
+ pgaddr, pginfo->u.usr.next_nmap);
+ ret = -EFAULT;
+ return ret;
+ }
+ /*
+ * The first page in a hwpage must be aligned;
+ * the first MR page is exempt from this rule.
+ */
+ if (pgaddr & (pginfo->hwpage_size - 1)) {
+ if (pginfo->hwpage_cnt) {
+ ehca_gen_err(
+ "invalid alignment "
+ "pgaddr=%llx entry=%llx "
+ "mr_pgsize=%llx",
+ pgaddr, pginfo->u.usr.next_nmap,
+ pginfo->hwpage_size);
+ ret = -EFAULT;
+ return ret;
+ }
+ /* first MR page */
+ pginfo->kpage_cnt =
+ (pgaddr &
+ (pginfo->hwpage_size - 1)) >>
+ PAGE_SHIFT;
+ nr_kpages -= pginfo->kpage_cnt;
+ *kpage = pgaddr &
+ ~(pginfo->hwpage_size - 1);
+ }
+ if (ehca_debug_level >= 3) {
+ u64 val = *(u64 *)__va(pgaddr);
+ ehca_gen_dbg("kpage=%llx page=%llx "
+ "value=%016llx",
+ *kpage, pgaddr, val);
+ }
+ prev_pgaddr = pgaddr;
+ *sg = sg_next(*sg);
+ pginfo->kpage_cnt++;
+ pginfo->u.usr.next_nmap++;
+ nr_kpages--;
+ if (!nr_kpages)
+ goto next_kpage;
+ continue;
+ }
+
+ ret = ehca_check_kpages_per_ate(sg, nr_kpages,
+ &prev_pgaddr);
+ if (ret)
+ return ret;
+ pginfo->kpage_cnt += nr_kpages;
+ pginfo->u.usr.next_nmap += nr_kpages;
+
+next_kpage:
+ nr_kpages = kpages_per_hwpage;
+ (pginfo->hwpage_cnt)++;
+ kpage++;
+ j++;
+ if (j >= number)
+ break;
+ }
+
+ return ret;
+}
+
+static int ehca_set_pagebuf_phys(struct ehca_mr_pginfo *pginfo,
+ u32 number, u64 *kpage)
+{
+ int ret = 0;
+ struct ib_phys_buf *pbuf;
+ u64 num_hw, offs_hw;
+ u32 i = 0;
+
+ /* loop over desired phys_buf_array entries */
+ while (i < number) {
+ pbuf = pginfo->u.phy.phys_buf_array + pginfo->u.phy.next_buf;
+ num_hw = NUM_CHUNKS((pbuf->addr % pginfo->hwpage_size) +
+ pbuf->size, pginfo->hwpage_size);
+ offs_hw = (pbuf->addr & ~(pginfo->hwpage_size - 1)) /
+ pginfo->hwpage_size;
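+ /* emit one hw page address per iteration until this phys buffer is fully covered */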
+ while (pginfo->next_hwpage < offs_hw + num_hw) {
+ /* sanity check */
+ if ((pginfo->kpage_cnt >= pginfo->num_kpages) ||
+ (pginfo->hwpage_cnt >= pginfo->num_hwpages)) {
+ ehca_gen_err("kpage_cnt >= num_kpages, "
+ "kpage_cnt=%llx num_kpages=%llx "
+ "hwpage_cnt=%llx "
+ "num_hwpages=%llx i=%x",
+ pginfo->kpage_cnt,
+ pginfo->num_kpages,
+ pginfo->hwpage_cnt,
+ pginfo->num_hwpages, i);
+ return -EFAULT;
+ }
+ *kpage = (pbuf->addr & ~(pginfo->hwpage_size - 1)) +
+ (pginfo->next_hwpage * pginfo->hwpage_size);
+ if (!(*kpage) && pbuf->addr) {
+ ehca_gen_err("pbuf->addr=%llx pbuf->size=%llx "
+ "next_hwpage=%llx", pbuf->addr,
+ pbuf->size, pginfo->next_hwpage);
+ return -EFAULT;
+ }
+ (pginfo->hwpage_cnt)++;
+ (pginfo->next_hwpage)++;
+ if (PAGE_SIZE >= pginfo->hwpage_size) {
+ if (pginfo->next_hwpage %
+ (PAGE_SIZE / pginfo->hwpage_size) == 0)
+ (pginfo->kpage_cnt)++;
+ } else
+ pginfo->kpage_cnt += pginfo->hwpage_size /
+ PAGE_SIZE;
+ kpage++;
+ i++;
+ if (i >= number)
+ break;
+ }
+ if (pginfo->next_hwpage >= offs_hw + num_hw) {
+ (pginfo->u.phy.next_buf)++;
+ pginfo->next_hwpage = 0;
+ }
+ }
+ return ret;
+}
+
+static int ehca_set_pagebuf_fmr(struct ehca_mr_pginfo *pginfo,
+ u32 number, u64 *kpage)
+{
+ int ret = 0;
+ u64 *fmrlist;
+ u32 i;
+
+ /* loop over desired page_list entries */
+ fmrlist = pginfo->u.fmr.page_list + pginfo->u.fmr.next_listelem;
+ for (i = 0; i < number; i++) {
+ *kpage = (*fmrlist & ~(pginfo->hwpage_size - 1)) +
+ pginfo->next_hwpage * pginfo->hwpage_size;
+ if (!(*kpage)) {
+ ehca_gen_err("*fmrlist=%llx fmrlist=%p "
+ "next_listelem=%llx next_hwpage=%llx",
+ *fmrlist, fmrlist,
+ pginfo->u.fmr.next_listelem,
+ pginfo->next_hwpage);
+ return -EFAULT;
+ }
+ (pginfo->hwpage_cnt)++;
+ if (pginfo->u.fmr.fmr_pgsize >= pginfo->hwpage_size) {
+ if (pginfo->next_hwpage %
+ (pginfo->u.fmr.fmr_pgsize /
+ pginfo->hwpage_size) == 0) {
+ (pginfo->kpage_cnt)++;
+ (pginfo->u.fmr.next_listelem)++;
+ fmrlist++;
+ pginfo->next_hwpage = 0;
+ } else
+ (pginfo->next_hwpage)++;
+ } else {
+ unsigned int cnt_per_hwpage = pginfo->hwpage_size /
+ pginfo->u.fmr.fmr_pgsize;
+ unsigned int j;
+ u64 prev = *kpage;
+ /* check if adrs are contiguous */
+ for (j = 1; j < cnt_per_hwpage; j++) {
+ u64 p = fmrlist[j] & ~(pginfo->hwpage_size - 1);
+ if (prev + pginfo->u.fmr.fmr_pgsize != p) {
+ ehca_gen_err("uncontiguous fmr pages "
+ "found prev=%llx p=%llx "
+ "idx=%x", prev, p, i + j);
+ return -EINVAL;
+ }
+ prev = p;
+ }
+ pginfo->kpage_cnt += cnt_per_hwpage;
+ pginfo->u.fmr.next_listelem += cnt_per_hwpage;
+ fmrlist += cnt_per_hwpage;
+ }
+ kpage++;
+ }
+ return ret;
+}
+
+/* setup page buffer from page info */
+int ehca_set_pagebuf(struct ehca_mr_pginfo *pginfo,
+ u32 number,
+ u64 *kpage)
+{
+ int ret;
+
+ switch (pginfo->type) {
+ case EHCA_MR_PGI_PHYS:
+ ret = ehca_set_pagebuf_phys(pginfo, number, kpage);
+ break;
+ case EHCA_MR_PGI_USER:
+ ret = PAGE_SIZE >= pginfo->hwpage_size ?
+ ehca_set_pagebuf_user1(pginfo, number, kpage) :
+ ehca_set_pagebuf_user2(pginfo, number, kpage);
+ break;
+ case EHCA_MR_PGI_FMR:
+ ret = ehca_set_pagebuf_fmr(pginfo, number, kpage);
+ break;
+ default:
+ ehca_gen_err("bad pginfo->type=%x", pginfo->type);
+ ret = -EFAULT;
+ break;
+ }
+ return ret;
+} /* end ehca_set_pagebuf() */
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * check whether the MR is a max-MR, i.e. whether it covers all of memory;
+ * returns 1 if it is a max-MR, 0 otherwise
+ */
+int ehca_mr_is_maxmr(u64 size,
+ u64 *iova_start)
+{
+ /* an MR is treated as a max-MR only if it matches the following: */
+ if ((size == ehca_mr_len) &&
+ (iova_start == (void *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)))) {
+ ehca_gen_dbg("this is a max-MR");
+ return 1;
+ } else
+ return 0;
+} /* end ehca_mr_is_maxmr() */
+
+/*----------------------------------------------------------------------*/
+
+/* map IB access control flags to hipz access control; used for both MR and MW */
+void ehca_mrmw_map_acl(int ib_acl,
+ u32 *hipz_acl)
+{
+ *hipz_acl = 0;
+ if (ib_acl & IB_ACCESS_REMOTE_READ)
+ *hipz_acl |= HIPZ_ACCESSCTRL_R_READ;
+ if (ib_acl & IB_ACCESS_REMOTE_WRITE)
+ *hipz_acl |= HIPZ_ACCESSCTRL_R_WRITE;
+ if (ib_acl & IB_ACCESS_REMOTE_ATOMIC)
+ *hipz_acl |= HIPZ_ACCESSCTRL_R_ATOMIC;
+ if (ib_acl & IB_ACCESS_LOCAL_WRITE)
+ *hipz_acl |= HIPZ_ACCESSCTRL_L_WRITE;
+ if (ib_acl & IB_ACCESS_MW_BIND)
+ *hipz_acl |= HIPZ_ACCESSCTRL_MW_BIND;
+} /* end ehca_mrmw_map_acl() */
+
+/*----------------------------------------------------------------------*/
+
+/* sets page size in hipz access control for MR/MW. */
+void ehca_mrmw_set_pgsize_hipz_acl(u32 pgsize, u32 *hipz_acl) /*INOUT*/
+{
+ *hipz_acl |= (ehca_encode_hwpage_size(pgsize) << 24);
+} /* end ehca_mrmw_set_pgsize_hipz_acl() */
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * reverse map hipz access control bits to IB access flags;
+ * used for both MR and MW
+ */
+void ehca_mrmw_reverse_map_acl(const u32 *hipz_acl,
+ int *ib_acl) /*OUT*/
+{
+ *ib_acl = 0;
+ if (*hipz_acl & HIPZ_ACCESSCTRL_R_READ)
+ *ib_acl |= IB_ACCESS_REMOTE_READ;
+ if (*hipz_acl & HIPZ_ACCESSCTRL_R_WRITE)
+ *ib_acl |= IB_ACCESS_REMOTE_WRITE;
+ if (*hipz_acl & HIPZ_ACCESSCTRL_R_ATOMIC)
+ *ib_acl |= IB_ACCESS_REMOTE_ATOMIC;
+ if (*hipz_acl & HIPZ_ACCESSCTRL_L_WRITE)
+ *ib_acl |= IB_ACCESS_LOCAL_WRITE;
+ if (*hipz_acl & HIPZ_ACCESSCTRL_MW_BIND)
+ *ib_acl |= IB_ACCESS_MW_BIND;
+} /* end ehca_mrmw_reverse_map_acl() */
+
+
+/*----------------------------------------------------------------------*/
+
+/*
+ * MR destructor and constructor;
+ * used in the reregister MR verb, resets all fields of struct ehca_mr to 0
+ * except struct ib_mr and the spinlock
+ */
+void ehca_mr_deletenew(struct ehca_mr *mr)
+{
+ mr->flags = 0;
+ mr->num_kpages = 0;
+ mr->num_hwpages = 0;
+ mr->acl = 0;
+ mr->start = NULL;
+ mr->fmr_page_size = 0;
+ mr->fmr_max_pages = 0;
+ mr->fmr_max_maps = 0;
+ mr->fmr_map_cnt = 0;
+ memset(&mr->ipz_mr_handle, 0, sizeof(mr->ipz_mr_handle));
+ memset(&mr->galpas, 0, sizeof(mr->galpas));
+} /* end ehca_mr_deletenew() */
+
+int ehca_init_mrmw_cache(void)
+{
+ mr_cache = kmem_cache_create("ehca_cache_mr",
+ sizeof(struct ehca_mr), 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!mr_cache)
+ return -ENOMEM;
+ mw_cache = kmem_cache_create("ehca_cache_mw",
+ sizeof(struct ehca_mw), 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!mw_cache) {
+ kmem_cache_destroy(mr_cache);
+ mr_cache = NULL;
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+void ehca_cleanup_mrmw_cache(void)
+{
+ if (mr_cache)
+ kmem_cache_destroy(mr_cache);
+ if (mw_cache)
+ kmem_cache_destroy(mw_cache);
+}
+
+static inline int ehca_init_top_bmap(struct ehca_top_bmap *ehca_top_bmap,
+ int dir)
+{
+ if (!ehca_bmap_valid(ehca_top_bmap->dir[dir])) {
+ ehca_top_bmap->dir[dir] =
+ kmalloc(sizeof(struct ehca_dir_bmap), GFP_KERNEL);
+ if (!ehca_top_bmap->dir[dir])
+ return -ENOMEM;
+ /* Set map block to 0xFF according to EHCA_INVAL_ADDR */
+ memset(ehca_top_bmap->dir[dir], 0xFF, EHCA_ENT_MAP_SIZE);
+ }
+ return 0;
+}
+
+static inline int ehca_init_bmap(struct ehca_bmap *ehca_bmap, int top, int dir)
+{
+ if (!ehca_bmap_valid(ehca_bmap->top[top])) {
+ ehca_bmap->top[top] =
+ kmalloc(sizeof(struct ehca_top_bmap), GFP_KERNEL);
+ if (!ehca_bmap->top[top])
+ return -ENOMEM;
+ /* Set map block to 0xFF according to EHCA_INVAL_ADDR */
+ memset(ehca_bmap->top[top], 0xFF, EHCA_DIR_MAP_SIZE);
+ }
+ return ehca_init_top_bmap(ehca_bmap->top[top], dir);
+}
+
+static inline int ehca_calc_index(unsigned long i, unsigned long s)
+{
+ return (i >> s) & EHCA_INDEX_MASK;
+}
+
+void ehca_destroy_busmap(void)
+{
+ int top, dir;
+
+ if (!ehca_bmap)
+ return;
+
+ for (top = 0; top < EHCA_MAP_ENTRIES; top++) {
+ if (!ehca_bmap_valid(ehca_bmap->top[top]))
+ continue;
+ for (dir = 0; dir < EHCA_MAP_ENTRIES; dir++) {
+ if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
+ continue;
+
+ kfree(ehca_bmap->top[top]->dir[dir]);
+ }
+
+ kfree(ehca_bmap->top[top]);
+ }
+
+ kfree(ehca_bmap);
+ ehca_bmap = NULL;
+}
+
+static int ehca_update_busmap(unsigned long pfn, unsigned long nr_pages)
+{
+ unsigned long i, start_section, end_section;
+ int top, dir, idx;
+
+ if (!nr_pages)
+ return 0;
+
+ if (!ehca_bmap) {
+ ehca_bmap = kmalloc(sizeof(struct ehca_bmap), GFP_KERNEL);
+ if (!ehca_bmap)
+ return -ENOMEM;
+ /* Set map block to 0xFF according to EHCA_INVAL_ADDR */
+ memset(ehca_bmap, 0xFF, EHCA_TOP_MAP_SIZE);
+ }
+
+ start_section = (pfn * PAGE_SIZE) / EHCA_SECTSIZE;
+ end_section = ((pfn + nr_pages) * PAGE_SIZE) / EHCA_SECTSIZE;
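+ /* assign each section in the range a bus address and thereby mark it valid */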
+ for (i = start_section; i < end_section; i++) {
+ int ret;
+ top = ehca_calc_index(i, EHCA_TOP_INDEX_SHIFT);
+ dir = ehca_calc_index(i, EHCA_DIR_INDEX_SHIFT);
+ idx = i & EHCA_INDEX_MASK;
+
+ ret = ehca_init_bmap(ehca_bmap, top, dir);
+ if (ret) {
+ ehca_destroy_busmap();
+ return ret;
+ }
+ ehca_bmap->top[top]->dir[dir]->ent[idx] = ehca_mr_len;
+ ehca_mr_len += EHCA_SECTSIZE;
+ }
+ return 0;
+}
+
+static int ehca_is_hugepage(unsigned long pfn)
+{
+ int page_order;
+
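+ /* the pfn must be hugepage-aligned and head a compound page of hugepage order */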
+ if (pfn & EHCA_HUGEPAGE_PFN_MASK)
+ return 0;
+
+ page_order = compound_order(pfn_to_page(pfn));
+ if (page_order + PAGE_SHIFT != EHCA_HUGEPAGESHIFT)
+ return 0;
+
+ return 1;
+}
+
+static int ehca_create_busmap_callback(unsigned long initial_pfn,
+ unsigned long total_nr_pages, void *arg)
+{
+ int ret;
+ unsigned long pfn, start_pfn, end_pfn, nr_pages;
+
+ if ((total_nr_pages * PAGE_SIZE) < EHCA_HUGEPAGE_SIZE)
+ return ehca_update_busmap(initial_pfn, total_nr_pages);
+
+ /* Given chunk is >= 16GB -> check for hugepages */
+ start_pfn = initial_pfn;
+ end_pfn = initial_pfn + total_nr_pages;
+ pfn = start_pfn;
+
+ while (pfn < end_pfn) {
+ if (ehca_is_hugepage(pfn)) {
+ /* Add mem found in front of the hugepage */
+ nr_pages = pfn - start_pfn;
+ ret = ehca_update_busmap(start_pfn, nr_pages);
+ if (ret)
+ return ret;
+ /* Skip the hugepage */
+ pfn += (EHCA_HUGEPAGE_SIZE / PAGE_SIZE);
+ start_pfn = pfn;
+ } else
+ pfn += (EHCA_SECTSIZE / PAGE_SIZE);
+ }
+
+ /* Add mem found behind the hugepage(s) */
+ nr_pages = pfn - start_pfn;
+ return ehca_update_busmap(start_pfn, nr_pages);
+}
+
+int ehca_create_busmap(void)
+{
+ int ret;
+
+ ehca_mr_len = 0;
+ ret = walk_system_ram_range(0, 1ULL << MAX_PHYSMEM_BITS, NULL,
+ ehca_create_busmap_callback);
+ return ret;
+}
+
+static int ehca_reg_bmap_mr_rpages(struct ehca_shca *shca,
+ struct ehca_mr *e_mr,
+ struct ehca_mr_pginfo *pginfo)
+{
+ int top;
+ u64 hret = H_SUCCESS, *kpage;
+
+ kpage = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+ if (!kpage) {
+ ehca_err(&shca->ib_device, "kpage alloc failed");
+ return -ENOMEM;
+ }
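+ /* register the dir sections of every valid top-level busmap entry */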
+ for (top = 0; top < EHCA_MAP_ENTRIES; top++) {
+ if (!ehca_bmap_valid(ehca_bmap->top[top]))
+ continue;
+ hret = ehca_reg_mr_dir_sections(top, kpage, shca, e_mr, pginfo);
+ if ((hret != H_PAGE_REGISTERED) && (hret != H_SUCCESS))
+ break;
+ }
+
+ ehca_free_fw_ctrlblock(kpage);
+
+ if (hret == H_SUCCESS)
+ return 0; /* Everything is fine */
+ else {
+ ehca_err(&shca->ib_device, "ehca_reg_bmap_mr_rpages failed, "
+ "h_ret=%lli e_mr=%p top=%x lkey=%x "
+ "hca_hndl=%llx mr_hndl=%llx", hret, e_mr, top,
+ e_mr->ib.ib_mr.lkey,
+ shca->ipz_hca_handle.handle,
+ e_mr->ipz_mr_handle.handle);
+ return ehca2ib_return_code(hret);
+ }
+}
+
+static u64 ehca_map_vaddr(void *caddr)
+{
+ int top, dir, idx;
+ unsigned long abs_addr, offset;
+ u64 entry;
+
+ if (!ehca_bmap)
+ return EHCA_INVAL_ADDR;
+
+ abs_addr = __pa(caddr);
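+ /* walk the three-level busmap (top -> dir -> section entry) */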
+ top = ehca_calc_index(abs_addr, EHCA_TOP_INDEX_SHIFT + EHCA_SECTSHIFT);
+ if (!ehca_bmap_valid(ehca_bmap->top[top]))
+ return EHCA_INVAL_ADDR;
+
+ dir = ehca_calc_index(abs_addr, EHCA_DIR_INDEX_SHIFT + EHCA_SECTSHIFT);
+ if (!ehca_bmap_valid(ehca_bmap->top[top]->dir[dir]))
+ return EHCA_INVAL_ADDR;
+
+ idx = ehca_calc_index(abs_addr, EHCA_SECTSHIFT);
+
+ entry = ehca_bmap->top[top]->dir[dir]->ent[idx];
+ if (ehca_bmap_valid(entry)) {
+ offset = (unsigned long)caddr & (EHCA_SECTSIZE - 1);
+ return entry | offset;
+ } else
+ return EHCA_INVAL_ADDR;
+}
+
+static int ehca_dma_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+ return dma_addr == EHCA_INVAL_ADDR;
+}
+
+static u64 ehca_dma_map_single(struct ib_device *dev, void *cpu_addr,
+ size_t size, enum dma_data_direction direction)
+{
+ if (cpu_addr)
+ return ehca_map_vaddr(cpu_addr);
+ else
+ return EHCA_INVAL_ADDR;
+}
+
+static void ehca_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size,
+ enum dma_data_direction direction)
+{
+ /* This is only a stub; nothing to be done here */
+}
+
+static u64 ehca_dma_map_page(struct ib_device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction direction)
+{
+ u64 addr;
+
+ if (offset + size > PAGE_SIZE)
+ return EHCA_INVAL_ADDR;
+
+ addr = ehca_map_vaddr(page_address(page));
+ if (!ehca_dma_mapping_error(dev, addr))
+ addr += offset;
+
+ return addr;
+}
+
+static void ehca_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size,
+ enum dma_data_direction direction)
+{
+ /* This is only a stub; nothing to be done here */
+}
+
+static int ehca_dma_map_sg(struct ib_device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction direction)
+{
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(sgl, sg, nents, i) {
+ u64 addr;
+ addr = ehca_map_vaddr(sg_virt(sg));
+ if (ehca_dma_mapping_error(dev, addr))
+ return 0;
+
+ sg->dma_address = addr;
+ sg->dma_length = sg->length;
+ }
+ return nents;
+}
+
+static void ehca_dma_unmap_sg(struct ib_device *dev, struct scatterlist *sg,
+ int nents, enum dma_data_direction direction)
+{
+ /* This is only a stub; nothing to be done here */
+}
+
+static void ehca_dma_sync_single_for_cpu(struct ib_device *dev, u64 addr,
+ size_t size,
+ enum dma_data_direction dir)
+{
+ dma_sync_single_for_cpu(dev->dma_device, addr, size, dir);
+}
+
+static void ehca_dma_sync_single_for_device(struct ib_device *dev, u64 addr,
+ size_t size,
+ enum dma_data_direction dir)
+{
+ dma_sync_single_for_device(dev->dma_device, addr, size, dir);
+}
+
+static void *ehca_dma_alloc_coherent(struct ib_device *dev, size_t size,
+ u64 *dma_handle, gfp_t flag)
+{
+ struct page *p;
+ void *addr = NULL;
+ u64 dma_addr;
+
+ p = alloc_pages(flag, get_order(size));
+ if (p) {
+ addr = page_address(p);
+ dma_addr = ehca_map_vaddr(addr);
+ if (ehca_dma_mapping_error(dev, dma_addr)) {
+ free_pages((unsigned long)addr, get_order(size));
+ return NULL;
+ }
+ if (dma_handle)
+ *dma_handle = dma_addr;
+ return addr;
+ }
+ return NULL;
+}
+
+static void ehca_dma_free_coherent(struct ib_device *dev, size_t size,
+ void *cpu_addr, u64 dma_handle)
+{
+ if (cpu_addr && size)
+ free_pages((unsigned long)cpu_addr, get_order(size));
+}
+
+
+struct ib_dma_mapping_ops ehca_dma_mapping_ops = {
+ .mapping_error = ehca_dma_mapping_error,
+ .map_single = ehca_dma_map_single,
+ .unmap_single = ehca_dma_unmap_single,
+ .map_page = ehca_dma_map_page,
+ .unmap_page = ehca_dma_unmap_page,
+ .map_sg = ehca_dma_map_sg,
+ .unmap_sg = ehca_dma_unmap_sg,
+ .sync_single_for_cpu = ehca_dma_sync_single_for_cpu,
+ .sync_single_for_device = ehca_dma_sync_single_for_device,
+ .alloc_coherent = ehca_dma_alloc_coherent,
+ .free_coherent = ehca_dma_free_coherent,
+};
diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.h b/drivers/infiniband/hw/ehca/ehca_mrmw.h
new file mode 100644
index 000000000..50d8b5130
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_mrmw.h
@@ -0,0 +1,132 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * MR/MW declarations and inline functions
+ *
+ * Authors: Dietmar Decker <ddecker@de.ibm.com>
+ * Christoph Raisch <raisch@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _EHCA_MRMW_H_
+#define _EHCA_MRMW_H_
+
+enum ehca_reg_type {
+ EHCA_REG_MR,
+ EHCA_REG_BUSMAP_MR
+};
+
+int ehca_reg_mr(struct ehca_shca *shca,
+ struct ehca_mr *e_mr,
+ u64 *iova_start,
+ u64 size,
+ int acl,
+ struct ehca_pd *e_pd,
+ struct ehca_mr_pginfo *pginfo,
+ u32 *lkey,
+ u32 *rkey,
+ enum ehca_reg_type reg_type);
+
+int ehca_reg_mr_rpages(struct ehca_shca *shca,
+ struct ehca_mr *e_mr,
+ struct ehca_mr_pginfo *pginfo);
+
+int ehca_rereg_mr(struct ehca_shca *shca,
+ struct ehca_mr *e_mr,
+ u64 *iova_start,
+ u64 size,
+ int mr_access_flags,
+ struct ehca_pd *e_pd,
+ struct ehca_mr_pginfo *pginfo,
+ u32 *lkey,
+ u32 *rkey);
+
+int ehca_unmap_one_fmr(struct ehca_shca *shca,
+ struct ehca_mr *e_fmr);
+
+int ehca_reg_smr(struct ehca_shca *shca,
+ struct ehca_mr *e_origmr,
+ struct ehca_mr *e_newmr,
+ u64 *iova_start,
+ int acl,
+ struct ehca_pd *e_pd,
+ u32 *lkey,
+ u32 *rkey);
+
+int ehca_reg_internal_maxmr(struct ehca_shca *shca,
+ struct ehca_pd *e_pd,
+ struct ehca_mr **maxmr);
+
+int ehca_reg_maxmr(struct ehca_shca *shca,
+ struct ehca_mr *e_newmr,
+ u64 *iova_start,
+ int acl,
+ struct ehca_pd *e_pd,
+ u32 *lkey,
+ u32 *rkey);
+
+int ehca_dereg_internal_maxmr(struct ehca_shca *shca);
+
+int ehca_mr_chk_buf_and_calc_size(struct ib_phys_buf *phys_buf_array,
+ int num_phys_buf,
+ u64 *iova_start,
+ u64 *size);
+
+int ehca_fmr_check_page_list(struct ehca_mr *e_fmr,
+ u64 *page_list,
+ int list_len);
+
+int ehca_set_pagebuf(struct ehca_mr_pginfo *pginfo,
+ u32 number,
+ u64 *kpage);
+
+int ehca_mr_is_maxmr(u64 size,
+ u64 *iova_start);
+
+void ehca_mrmw_map_acl(int ib_acl,
+ u32 *hipz_acl);
+
+void ehca_mrmw_set_pgsize_hipz_acl(u32 pgsize, u32 *hipz_acl);
+
+void ehca_mrmw_reverse_map_acl(const u32 *hipz_acl,
+ int *ib_acl);
+
+void ehca_mr_deletenew(struct ehca_mr *mr);
+
+int ehca_create_busmap(void);
+
+void ehca_destroy_busmap(void);
+
+extern struct ib_dma_mapping_ops ehca_dma_mapping_ops;
+#endif /*_EHCA_MRMW_H_*/
diff --git a/drivers/infiniband/hw/ehca/ehca_pd.c b/drivers/infiniband/hw/ehca/ehca_pd.c
new file mode 100644
index 000000000..351577a66
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_pd.c
@@ -0,0 +1,124 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * PD functions
+ *
+ * Authors: Christoph Raisch <raisch@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_tools.h"
+#include "ehca_iverbs.h"
+
+static struct kmem_cache *pd_cache;
+
+struct ib_pd *ehca_alloc_pd(struct ib_device *device,
+ struct ib_ucontext *context, struct ib_udata *udata)
+{
+ struct ehca_pd *pd;
+ int i;
+
+ pd = kmem_cache_zalloc(pd_cache, GFP_KERNEL);
+ if (!pd) {
+ ehca_err(device, "device=%p context=%p out of memory",
+ device, context);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ for (i = 0; i < 2; i++) {
+ INIT_LIST_HEAD(&pd->free[i]);
+ INIT_LIST_HEAD(&pd->full[i]);
+ }
+ mutex_init(&pd->lock);
+
+ /*
+ * Kernel PD: context == NULL
+ * User PD: context != NULL
+ */
+ if (!context) {
+ /*
+ * After init, kernel PDs always reuse the fw_pd
+ * of the PD created in ehca_shca_reopen()
+ */
+ struct ehca_shca *shca = container_of(device, struct ehca_shca,
+ ib_device);
+ pd->fw_pd.value = shca->pd->fw_pd.value;
+ } else
+ pd->fw_pd.value = (u64)pd;
+
+ return &pd->ib_pd;
+}
+
+int ehca_dealloc_pd(struct ib_pd *pd)
+{
+ struct ehca_pd *my_pd = container_of(pd, struct ehca_pd, ib_pd);
+ int i, leftovers = 0;
+ struct ipz_small_queue_page *page, *tmp;
+
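+ /* free any small-queue pages still owned by this PD */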
+ for (i = 0; i < 2; i++) {
+ list_splice(&my_pd->full[i], &my_pd->free[i]);
+ list_for_each_entry_safe(page, tmp, &my_pd->free[i], list) {
+ leftovers = 1;
+ free_page(page->page);
+ kmem_cache_free(small_qp_cache, page);
+ }
+ }
+
+ if (leftovers)
+ ehca_warn(pd->device,
+ "Some small queue pages were not freed");
+
+ kmem_cache_free(pd_cache, my_pd);
+
+ return 0;
+}
+
+int ehca_init_pd_cache(void)
+{
+ pd_cache = kmem_cache_create("ehca_cache_pd",
+ sizeof(struct ehca_pd), 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!pd_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void ehca_cleanup_pd_cache(void)
+{
+ if (pd_cache)
+ kmem_cache_destroy(pd_cache);
+}
diff --git a/drivers/infiniband/hw/ehca/ehca_qes.h b/drivers/infiniband/hw/ehca/ehca_qes.h
new file mode 100644
index 000000000..90c4efa67
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_qes.h
@@ -0,0 +1,260 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * Hardware request structures
+ *
+ * Authors: Waleri Fomin <fomin@de.ibm.com>
+ * Reinhard Ernst <rernst@de.ibm.com>
+ * Christoph Raisch <raisch@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _EHCA_QES_H_
+#define _EHCA_QES_H_
+
+#include "ehca_tools.h"
+
+/* virtual scatter gather entry to specify remote addresses with length */
+struct ehca_vsgentry {
+ u64 vaddr;
+ u32 lkey;
+ u32 length;
+};
+
+#define GRH_FLAG_MASK EHCA_BMASK_IBM( 7, 7)
+#define GRH_IPVERSION_MASK EHCA_BMASK_IBM( 0, 3)
+#define GRH_TCLASS_MASK EHCA_BMASK_IBM( 4, 12)
+#define GRH_FLOWLABEL_MASK EHCA_BMASK_IBM(13, 31)
+#define GRH_PAYLEN_MASK EHCA_BMASK_IBM(32, 47)
+#define GRH_NEXTHEADER_MASK EHCA_BMASK_IBM(48, 55)
+#define GRH_HOPLIMIT_MASK EHCA_BMASK_IBM(56, 63)
+
+/*
+ * Unreliable Datagram Address Vector Format
+ * see IBTA Vol1 chapter 8.3 Global Routing Header
+ */
+struct ehca_ud_av {
+ u8 sl;
+ u8 lnh;
+ u16 dlid;
+ u8 reserved1;
+ u8 reserved2;
+ u8 reserved3;
+ u8 slid_path_bits;
+ u8 reserved4;
+ u8 ipd;
+ u8 reserved5;
+ u8 pmtu;
+ u32 reserved6;
+ u64 reserved7;
+ union {
+ struct {
+ u64 word_0; /* always set to 6 */
+ /* should be 0x1B for IB transport */
+ u64 word_1;
+ u64 word_2;
+ u64 word_3;
+ u64 word_4;
+ } grh;
+ struct {
+ u32 wd_0;
+ u32 wd_1;
+ /* DWord_1 --> SGID */
+
+ u32 sgid_wd3;
+ u32 sgid_wd2;
+
+ u32 sgid_wd1;
+ u32 sgid_wd0;
+ /* DWord_3 --> DGID */
+
+ u32 dgid_wd3;
+ u32 dgid_wd2;
+
+ u32 dgid_wd1;
+ u32 dgid_wd0;
+ } grh_l;
+ };
+};
+
+/* maximum number of sg entries allowed in a WQE */
+#define MAX_WQE_SG_ENTRIES 252
+
+#define WQE_OPTYPE_SEND 0x80
+#define WQE_OPTYPE_RDMAREAD 0x40
+#define WQE_OPTYPE_RDMAWRITE 0x20
+#define WQE_OPTYPE_CMPSWAP 0x10
+#define WQE_OPTYPE_FETCHADD 0x08
+#define WQE_OPTYPE_BIND 0x04
+
+#define WQE_WRFLAG_REQ_SIGNAL_COM 0x80
+#define WQE_WRFLAG_FENCE 0x40
+#define WQE_WRFLAG_IMM_DATA_PRESENT 0x20
+#define WQE_WRFLAG_SOLIC_EVENT 0x10
+
+#define WQEF_CACHE_HINT 0x80
+#define WQEF_CACHE_HINT_RD_WR 0x40
+#define WQEF_TIMED_WQE 0x20
+#define WQEF_PURGE 0x08
+#define WQEF_HIGH_NIBBLE 0xF0
+
+#define MW_BIND_ACCESSCTRL_R_WRITE 0x40
+#define MW_BIND_ACCESSCTRL_R_READ 0x20
+#define MW_BIND_ACCESSCTRL_R_ATOMIC 0x10
+
+struct ehca_wqe {
+ u64 work_request_id;
+ u8 optype;
+ u8 wr_flag;
+ u16 pkeyi;
+ u8 wqef;
+ u8 nr_of_data_seg;
+ u16 wqe_provided_slid;
+ u32 destination_qp_number;
+ u32 resync_psn_sqp;
+ u32 local_ee_context_qkey;
+ u32 immediate_data;
+ union {
+ struct {
+ u64 remote_virtual_address;
+ u32 rkey;
+ u32 reserved;
+ u64 atomic_1st_op_dma_len;
+ u64 atomic_2nd_op;
+ struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES];
+
+ } nud;
+ struct {
+ u64 ehca_ud_av_ptr;
+ u64 reserved1;
+ u64 reserved2;
+ u64 reserved3;
+ struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES];
+ } ud_avp;
+ struct {
+ struct ehca_ud_av ud_av;
+ struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES -
+ 2];
+ } ud_av;
+ struct {
+ u64 reserved0;
+ u64 reserved1;
+ u64 reserved2;
+ u64 reserved3;
+ struct ehca_vsgentry sg_list[MAX_WQE_SG_ENTRIES];
+ } all_rcv;
+
+ struct {
+ u64 reserved;
+ u32 rkey;
+ u32 old_rkey;
+ u64 reserved1;
+ u64 reserved2;
+ u64 virtual_address;
+ u32 reserved3;
+ u32 length;
+ u32 reserved4;
+ u16 reserved5;
+ u8 reserved6;
+ u8 lr_ctl;
+ u32 lkey;
+ u32 reserved7;
+ u64 reserved8;
+ u64 reserved9;
+ u64 reserved10;
+ u64 reserved11;
+ } bind;
+ struct {
+ u64 reserved12;
+ u64 reserved13;
+ u32 size;
+ u32 start;
+ } inline_data;
+ } u;
+
+};
+
+#define WC_SEND_RECEIVE EHCA_BMASK_IBM(0, 0)
+#define WC_IMM_DATA EHCA_BMASK_IBM(1, 1)
+#define WC_GRH_PRESENT EHCA_BMASK_IBM(2, 2)
+#define WC_SE_BIT EHCA_BMASK_IBM(3, 3)
+#define WC_STATUS_ERROR_BIT 0x80000000
+#define WC_STATUS_REMOTE_ERROR_FLAGS 0x0000F800
+#define WC_STATUS_PURGE_BIT 0x10
+#define WC_SEND_RECEIVE_BIT 0x80
+
+struct ehca_cqe {
+ u64 work_request_id;
+ u8 optype;
+ u8 w_completion_flags;
+ u16 reserved1;
+ u32 nr_bytes_transferred;
+ u32 immediate_data;
+ u32 local_qp_number;
+ u8 freed_resource_count;
+ u8 service_level;
+ u16 wqe_count;
+ u32 qp_token;
+ u32 qkey_ee_token;
+ u32 remote_qp_number;
+ u16 dlid;
+ u16 rlid;
+ u16 reserved2;
+ u16 pkey_index;
+ u32 cqe_timestamp;
+ u32 wqe_timestamp;
+ u8 wqe_timestamp_valid;
+ u8 reserved3;
+ u8 reserved4;
+ u8 cqe_flags;
+ u32 status;
+};
+
+struct ehca_eqe {
+ u64 entry;
+};
+
+struct ehca_mrte {
+ u64 starting_va;
+ u64 length; /* length of memory region in bytes*/
+ u32 pd;
+ u8 key_instance;
+ u8 pagesize;
+ u8 mr_control;
+ u8 local_remote_access_ctrl;
+ u8 reserved[0x20 - 0x18];
+ u64 at_pointer[4];
+};
+#endif /*_EHCA_QES_H_*/
diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c
new file mode 100644
index 000000000..2e89356c4
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_qp.c
@@ -0,0 +1,2257 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * QP functions
+ *
+ * Authors: Joachim Fenkes <fenkes@de.ibm.com>
+ * Stefan Roscher <stefan.roscher@de.ibm.com>
+ * Waleri Fomin <fomin@de.ibm.com>
+ * Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ * Reinhard Ernst <rernst@de.ibm.com>
+ * Heiko J Schick <schickhj@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_classes.h"
+#include "ehca_tools.h"
+#include "ehca_qes.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+#include "hipz_fns.h"
+
+static struct kmem_cache *qp_cache;
+
+/*
+ * attributes not supported by query qp
+ */
+#define QP_ATTR_QUERY_NOT_SUPPORTED (IB_QP_ACCESS_FLAGS | \
+ IB_QP_EN_SQD_ASYNC_NOTIFY)
+
+/*
+ * ehca (internal) qp state values
+ */
+enum ehca_qp_state {
+ EHCA_QPS_RESET = 1,
+ EHCA_QPS_INIT = 2,
+ EHCA_QPS_RTR = 3,
+ EHCA_QPS_RTS = 5,
+ EHCA_QPS_SQD = 6,
+ EHCA_QPS_SQE = 8,
+ EHCA_QPS_ERR = 128
+};
+
+/*
+ * qp state transitions as defined by IB Arch Rel 1.1 page 431
+ */
+enum ib_qp_statetrans {
+ IB_QPST_ANY2RESET,
+ IB_QPST_ANY2ERR,
+ IB_QPST_RESET2INIT,
+ IB_QPST_INIT2RTR,
+ IB_QPST_INIT2INIT,
+ IB_QPST_RTR2RTS,
+ IB_QPST_RTS2SQD,
+ IB_QPST_RTS2RTS,
+ IB_QPST_SQD2RTS,
+ IB_QPST_SQE2RTS,
+ IB_QPST_SQD2SQD,
+ IB_QPST_MAX /* nr of transitions, this must be last!!! */
+};
+
+/*
+ * ib2ehca_qp_state maps IB to ehca qp_state
+ * returns ehca qp state corresponding to given ib qp state
+ */
+static inline enum ehca_qp_state ib2ehca_qp_state(enum ib_qp_state ib_qp_state)
+{
+ switch (ib_qp_state) {
+ case IB_QPS_RESET:
+ return EHCA_QPS_RESET;
+ case IB_QPS_INIT:
+ return EHCA_QPS_INIT;
+ case IB_QPS_RTR:
+ return EHCA_QPS_RTR;
+ case IB_QPS_RTS:
+ return EHCA_QPS_RTS;
+ case IB_QPS_SQD:
+ return EHCA_QPS_SQD;
+ case IB_QPS_SQE:
+ return EHCA_QPS_SQE;
+ case IB_QPS_ERR:
+ return EHCA_QPS_ERR;
+ default:
+ ehca_gen_err("invalid ib_qp_state=%x", ib_qp_state);
+ return -EINVAL;
+ }
+}
+
+/*
+ * ehca2ib_qp_state maps ehca to IB qp_state
+ * returns ib qp state corresponding to given ehca qp state
+ */
+static inline enum ib_qp_state ehca2ib_qp_state(enum ehca_qp_state
+ ehca_qp_state)
+{
+ switch (ehca_qp_state) {
+ case EHCA_QPS_RESET:
+ return IB_QPS_RESET;
+ case EHCA_QPS_INIT:
+ return IB_QPS_INIT;
+ case EHCA_QPS_RTR:
+ return IB_QPS_RTR;
+ case EHCA_QPS_RTS:
+ return IB_QPS_RTS;
+ case EHCA_QPS_SQD:
+ return IB_QPS_SQD;
+ case EHCA_QPS_SQE:
+ return IB_QPS_SQE;
+ case EHCA_QPS_ERR:
+ return IB_QPS_ERR;
+ default:
+ ehca_gen_err("invalid ehca_qp_state=%x", ehca_qp_state);
+ return -EINVAL;
+ }
+}
+
+/*
+ * ehca_qp_type used as index for req_attr and opt_attr of
+ * struct ehca_modqp_statetrans
+ */
+enum ehca_qp_type {
+ QPT_RC = 0,
+ QPT_UC = 1,
+ QPT_UD = 2,
+ QPT_SQP = 3,
+ QPT_MAX
+};
+
+/*
+ * ib2ehcaqptype maps Ib to ehca qp_type
+ * returns ehca qp type corresponding to ib qp type
+ */
+static inline enum ehca_qp_type ib2ehcaqptype(enum ib_qp_type ibqptype)
+{
+ switch (ibqptype) {
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ return QPT_SQP;
+ case IB_QPT_RC:
+ return QPT_RC;
+ case IB_QPT_UC:
+ return QPT_UC;
+ case IB_QPT_UD:
+ return QPT_UD;
+ default:
+ ehca_gen_err("Invalid ibqptype=%x", ibqptype);
+ return -EINVAL;
+ }
+}
+
+static inline enum ib_qp_statetrans get_modqp_statetrans(int ib_fromstate,
+ int ib_tostate)
+{
+ int index = -EINVAL;
+ switch (ib_tostate) {
+ case IB_QPS_RESET:
+ index = IB_QPST_ANY2RESET;
+ break;
+ case IB_QPS_INIT:
+ switch (ib_fromstate) {
+ case IB_QPS_RESET:
+ index = IB_QPST_RESET2INIT;
+ break;
+ case IB_QPS_INIT:
+ index = IB_QPST_INIT2INIT;
+ break;
+ }
+ break;
+ case IB_QPS_RTR:
+ if (ib_fromstate == IB_QPS_INIT)
+ index = IB_QPST_INIT2RTR;
+ break;
+ case IB_QPS_RTS:
+ switch (ib_fromstate) {
+ case IB_QPS_RTR:
+ index = IB_QPST_RTR2RTS;
+ break;
+ case IB_QPS_RTS:
+ index = IB_QPST_RTS2RTS;
+ break;
+ case IB_QPS_SQD:
+ index = IB_QPST_SQD2RTS;
+ break;
+ case IB_QPS_SQE:
+ index = IB_QPST_SQE2RTS;
+ break;
+ }
+ break;
+ case IB_QPS_SQD:
+ if (ib_fromstate == IB_QPS_RTS)
+ index = IB_QPST_RTS2SQD;
+ break;
+ case IB_QPS_SQE:
+ break;
+ case IB_QPS_ERR:
+ index = IB_QPST_ANY2ERR;
+ break;
+ default:
+ break;
+ }
+ return index;
+}
+
+/*
+ * ibqptype2servicetype returns hcp service type corresponding to given
+ * ib qp type used by create_qp()
+ */
+static inline int ibqptype2servicetype(enum ib_qp_type ibqptype)
+{
+ switch (ibqptype) {
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ return ST_UD;
+ case IB_QPT_RC:
+ return ST_RC;
+ case IB_QPT_UC:
+ return ST_UC;
+ case IB_QPT_UD:
+ return ST_UD;
+ case IB_QPT_RAW_IPV6:
+ return -EINVAL;
+ case IB_QPT_RAW_ETHERTYPE:
+ return -EINVAL;
+ default:
+ ehca_gen_err("Invalid ibqptype=%x", ibqptype);
+ return -EINVAL;
+ }
+}
+
+/*
+ * init userspace queue info from ipz_queue data
+ */
+static inline void queue2resp(struct ipzu_queue_resp *resp,
+ struct ipz_queue *queue)
+{
+ resp->qe_size = queue->qe_size;
+ resp->act_nr_of_sg = queue->act_nr_of_sg;
+ resp->queue_length = queue->queue_length;
+ resp->pagesize = queue->pagesize;
+ resp->toggle_state = queue->toggle_state;
+ resp->offset = queue->offset;
+}
+
+/*
+ * init_qp_queue initializes/constructs r/squeue and registers queue pages.
+ */
+static inline int init_qp_queue(struct ehca_shca *shca,
+ struct ehca_pd *pd,
+ struct ehca_qp *my_qp,
+ struct ipz_queue *queue,
+ int q_type,
+ u64 expected_hret,
+ struct ehca_alloc_queue_parms *parms,
+ int wqe_size)
+{
+ int ret, cnt, ipz_rc, nr_q_pages;
+ void *vpage;
+ u64 rpage, h_ret;
+ struct ib_device *ib_dev = &shca->ib_device;
+ struct ipz_adapter_handle ipz_hca_handle = shca->ipz_hca_handle;
+
+ if (!parms->queue_size)
+ return 0;
+
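+ /* small queues occupy only 128 << page_size bytes of a page; normal queues use full EHCA_PAGESIZE pages */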
+ if (parms->is_small) {
+ nr_q_pages = 1;
+ ipz_rc = ipz_queue_ctor(pd, queue, nr_q_pages,
+ 128 << parms->page_size,
+ wqe_size, parms->act_nr_sges, 1);
+ } else {
+ nr_q_pages = parms->queue_size;
+ ipz_rc = ipz_queue_ctor(pd, queue, nr_q_pages,
+ EHCA_PAGESIZE, wqe_size,
+ parms->act_nr_sges, 0);
+ }
+
+ if (!ipz_rc) {
+ ehca_err(ib_dev, "Cannot allocate page for queue. ipz_rc=%i",
+ ipz_rc);
+ return -EBUSY;
+ }
+
+ /* register queue pages */
+ for (cnt = 0; cnt < nr_q_pages; cnt++) {
+ vpage = ipz_qpageit_get_inc(queue);
+ if (!vpage) {
+ ehca_err(ib_dev, "ipz_qpageit_get_inc() "
+ "failed p_vpage= %p", vpage);
+ ret = -EINVAL;
+ goto init_qp_queue1;
+ }
+ rpage = __pa(vpage);
+
+ h_ret = hipz_h_register_rpage_qp(ipz_hca_handle,
+ my_qp->ipz_qp_handle,
+ NULL, 0, q_type,
+ rpage, parms->is_small ? 0 : 1,
+ my_qp->galpas.kernel);
+ if (cnt == (nr_q_pages - 1)) { /* last page! */
+ if (h_ret != expected_hret) {
+ ehca_err(ib_dev, "hipz_qp_register_rpage() "
+ "h_ret=%lli", h_ret);
+ ret = ehca2ib_return_code(h_ret);
+ goto init_qp_queue1;
+ }
+ vpage = ipz_qpageit_get_inc(&my_qp->ipz_rqueue);
+ if (vpage) {
+ ehca_err(ib_dev, "ipz_qpageit_get_inc() "
+ "should not succeed vpage=%p", vpage);
+ ret = -EINVAL;
+ goto init_qp_queue1;
+ }
+ } else {
+ if (h_ret != H_PAGE_REGISTERED) {
+ ehca_err(ib_dev, "hipz_qp_register_rpage() "
+ "h_ret=%lli", h_ret);
+ ret = ehca2ib_return_code(h_ret);
+ goto init_qp_queue1;
+ }
+ }
+ }
+
+ ipz_qeit_reset(queue);
+
+ return 0;
+
+init_qp_queue1:
+ ipz_queue_dtor(pd, queue);
+ return ret;
+}
+
+static inline int ehca_calc_wqe_size(int act_nr_sge, int is_llqp)
+{
+ if (is_llqp)
+ return 128 << act_nr_sge;
+ else
+ return offsetof(struct ehca_wqe,
+ u.nud.sg_list[act_nr_sge]);
+}
+
+static void ehca_determine_small_queue(struct ehca_alloc_queue_parms *queue,
+ int req_nr_sge, int is_llqp)
+{
+ u32 wqe_size, q_size;
+ int act_nr_sge = req_nr_sge;
+
+ if (!is_llqp)
+ /* round up #SGEs so WQE size is a power of 2 */
+ for (act_nr_sge = 4; act_nr_sge <= 252;
+ act_nr_sge = 4 + 2 * act_nr_sge)
+ if (act_nr_sge >= req_nr_sge)
+ break;
+
+ wqe_size = ehca_calc_wqe_size(act_nr_sge, is_llqp);
+ q_size = wqe_size * (queue->max_wr + 1);
+
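+ /* the queue occupies 128 << page_size bytes; page_size 0 means a normal full-page queue */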
+ if (q_size <= 512)
+ queue->page_size = 2;
+ else if (q_size <= 1024)
+ queue->page_size = 3;
+ else
+ queue->page_size = 0;
+
+ queue->is_small = (queue->page_size != 0);
+}
+
+/* needs to be called with cq->spinlock held */
+void ehca_add_to_err_list(struct ehca_qp *qp, int on_sq)
+{
+ struct list_head *list, *node;
+
+ /* TODO: support low latency QPs */
+ if (qp->ext_type == EQPT_LLQP)
+ return;
+
+ if (on_sq) {
+ list = &qp->send_cq->sqp_err_list;
+ node = &qp->sq_err_node;
+ } else {
+ list = &qp->recv_cq->rqp_err_list;
+ node = &qp->rq_err_node;
+ }
+
+ if (list_empty(node))
+ list_add_tail(node, list);
+
+ return;
+}
+
+static void del_from_err_list(struct ehca_cq *cq, struct list_head *node)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cq->spinlock, flags);
+
+ if (!list_empty(node))
+ list_del_init(node);
+
+ spin_unlock_irqrestore(&cq->spinlock, flags);
+}
+
+static void reset_queue_map(struct ehca_queue_map *qmap)
+{
+ int i;
+
+ qmap->tail = qmap->entries - 1;
+ qmap->left_to_poll = 0;
+ qmap->next_wqe_idx = 0;
+ for (i = 0; i < qmap->entries; i++) {
+ qmap->map[i].reported = 1;
+ qmap->map[i].cqe_req = 0;
+ }
+}
+
+/*
+ * Create an ib_qp struct that is either a QP or an SRQ, depending on
+ * the value of the is_srq parameter. If init_attr and srq_init_attr share
+ * fields, the value from init_attr is used.
+ */
+static struct ehca_qp *internal_create_qp(
+ struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_srq_init_attr *srq_init_attr,
+ struct ib_udata *udata, int is_srq)
+{
+ struct ehca_qp *my_qp, *my_srq = NULL;
+ struct ehca_pd *my_pd = container_of(pd, struct ehca_pd, ib_pd);
+ struct ehca_shca *shca = container_of(pd->device, struct ehca_shca,
+ ib_device);
+ struct ib_ucontext *context = NULL;
+ u64 h_ret;
+ int is_llqp = 0, has_srq = 0, is_user = 0;
+ int qp_type, max_send_sge, max_recv_sge, ret;
+
+ /* h_call's out parameters */
+ struct ehca_alloc_qp_parms parms;
+ u32 swqe_size = 0, rwqe_size = 0, ib_qp_num;
+ unsigned long flags;
+
+ if (!atomic_add_unless(&shca->num_qps, 1, shca->max_num_qps)) {
+ ehca_err(pd->device, "Unable to create QP, max number of %i "
+ "QPs reached.", shca->max_num_qps);
+ ehca_err(pd->device, "To increase the maximum number of QPs "
+ "use the number_of_qps module parameter.\n");
+ return ERR_PTR(-ENOSPC);
+ }
+
+ if (init_attr->create_flags) {
+ atomic_dec(&shca->num_qps);
+ return ERR_PTR(-EINVAL);
+ }
+
+ memset(&parms, 0, sizeof(parms));
+ qp_type = init_attr->qp_type;
+
+ if (init_attr->sq_sig_type != IB_SIGNAL_REQ_WR &&
+ init_attr->sq_sig_type != IB_SIGNAL_ALL_WR) {
+ ehca_err(pd->device, "init_attr->sg_sig_type=%x not allowed",
+ init_attr->sq_sig_type);
+ atomic_dec(&shca->num_qps);
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* save LLQP info */
+ if (qp_type & 0x80) {
+ is_llqp = 1;
+ parms.ext_type = EQPT_LLQP;
+ parms.ll_comp_flags = qp_type & LLQP_COMP_MASK;
+ }
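+ /* strip the LLQP flag bits, leaving the plain IB QP type */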
+ qp_type &= 0x1F;
+ init_attr->qp_type &= 0x1F;
+
+ /* handle SRQ base QPs */
+ if (init_attr->srq) {
+ my_srq = container_of(init_attr->srq, struct ehca_qp, ib_srq);
+
+ if (qp_type == IB_QPT_UC) {
+ ehca_err(pd->device, "UC with SRQ not supported");
+ atomic_dec(&shca->num_qps);
+ return ERR_PTR(-EINVAL);
+ }
+
+ has_srq = 1;
+ parms.ext_type = EQPT_SRQBASE;
+ parms.srq_qpn = my_srq->real_qp_num;
+ }
+
+ if (is_llqp && has_srq) {
+ ehca_err(pd->device, "LLQPs can't have an SRQ");
+ atomic_dec(&shca->num_qps);
+ return ERR_PTR(-EINVAL);
+ }
+
+ /* handle SRQs */
+ if (is_srq) {
+ parms.ext_type = EQPT_SRQ;
+ parms.srq_limit = srq_init_attr->attr.srq_limit;
+ if (init_attr->cap.max_recv_sge > 3) {
+ ehca_err(pd->device, "no more than three SGEs "
+ "supported for SRQ pd=%p max_sge=%x",
+ pd, init_attr->cap.max_recv_sge);
+ atomic_dec(&shca->num_qps);
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ /* check QP type */
+ if (qp_type != IB_QPT_UD &&
+ qp_type != IB_QPT_UC &&
+ qp_type != IB_QPT_RC &&
+ qp_type != IB_QPT_SMI &&
+ qp_type != IB_QPT_GSI) {
+ ehca_err(pd->device, "wrong QP Type=%x", qp_type);
+ atomic_dec(&shca->num_qps);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (is_llqp) {
+ switch (qp_type) {
+ case IB_QPT_RC:
+ if ((init_attr->cap.max_send_wr > 255) ||
+ (init_attr->cap.max_recv_wr > 255)) {
+ ehca_err(pd->device,
+ "Invalid Number of max_sq_wr=%x "
+ "or max_rq_wr=%x for RC LLQP",
+ init_attr->cap.max_send_wr,
+ init_attr->cap.max_recv_wr);
+ atomic_dec(&shca->num_qps);
+ return ERR_PTR(-EINVAL);
+ }
+ break;
+ case IB_QPT_UD:
+ if (!EHCA_BMASK_GET(HCA_CAP_UD_LL_QP, shca->hca_cap)) {
+ ehca_err(pd->device, "UD LLQP not supported "
+ "by this adapter");
+ atomic_dec(&shca->num_qps);
+ return ERR_PTR(-ENOSYS);
+ }
+ if (!(init_attr->cap.max_send_sge <= 5
+ && init_attr->cap.max_send_sge >= 1
+ && init_attr->cap.max_recv_sge <= 5
+ && init_attr->cap.max_recv_sge >= 1)) {
+ ehca_err(pd->device,
+ "Invalid Number of max_send_sge=%x "
+ "or max_recv_sge=%x for UD LLQP",
+ init_attr->cap.max_send_sge,
+ init_attr->cap.max_recv_sge);
+ atomic_dec(&shca->num_qps);
+ return ERR_PTR(-EINVAL);
+ } else if (init_attr->cap.max_send_wr > 255) {
+ ehca_err(pd->device,
+ "Invalid Number of "
+ "max_send_wr=%x for UD QP_TYPE=%x",
+ init_attr->cap.max_send_wr, qp_type);
+ atomic_dec(&shca->num_qps);
+ return ERR_PTR(-EINVAL);
+ }
+ break;
+ default:
+ ehca_err(pd->device, "unsupported LL QP Type=%x",
+ qp_type);
+ atomic_dec(&shca->num_qps);
+ return ERR_PTR(-EINVAL);
+ }
+ } else {
+ int max_sge = (qp_type == IB_QPT_UD || qp_type == IB_QPT_SMI
+ || qp_type == IB_QPT_GSI) ? 250 : 252;
+
+ if (init_attr->cap.max_send_sge > max_sge
+ || init_attr->cap.max_recv_sge > max_sge) {
+ ehca_err(pd->device, "Invalid number of SGEs requested "
+ "send_sge=%x recv_sge=%x max_sge=%x",
+ init_attr->cap.max_send_sge,
+ init_attr->cap.max_recv_sge, max_sge);
+ atomic_dec(&shca->num_qps);
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ my_qp = kmem_cache_zalloc(qp_cache, GFP_KERNEL);
+ if (!my_qp) {
+ ehca_err(pd->device, "pd=%p not enough memory to alloc qp", pd);
+ atomic_dec(&shca->num_qps);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ if (pd->uobject && udata) {
+ is_user = 1;
+ context = pd->uobject->context;
+ }
+
+ atomic_set(&my_qp->nr_events, 0);
+ init_waitqueue_head(&my_qp->wait_completion);
+ spin_lock_init(&my_qp->spinlock_s);
+ spin_lock_init(&my_qp->spinlock_r);
+ my_qp->qp_type = qp_type;
+ my_qp->ext_type = parms.ext_type;
+ my_qp->state = IB_QPS_RESET;
+
+ if (init_attr->recv_cq)
+ my_qp->recv_cq =
+ container_of(init_attr->recv_cq, struct ehca_cq, ib_cq);
+ if (init_attr->send_cq)
+ my_qp->send_cq =
+ container_of(init_attr->send_cq, struct ehca_cq, ib_cq);
+
+ idr_preload(GFP_KERNEL);
+ write_lock_irqsave(&ehca_qp_idr_lock, flags);
+
+ ret = idr_alloc(&ehca_qp_idr, my_qp, 0, 0x2000000, GFP_NOWAIT);
+ if (ret >= 0)
+ my_qp->token = ret;
+
+ write_unlock_irqrestore(&ehca_qp_idr_lock, flags);
+ idr_preload_end();
+ if (ret < 0) {
+ if (ret == -ENOSPC) {
+ ret = -EINVAL;
+ ehca_err(pd->device, "Invalid number of qp");
+ } else {
+ ret = -ENOMEM;
+ ehca_err(pd->device, "Can't allocate new idr entry.");
+ }
+ goto create_qp_exit0;
+ }
+
+ if (has_srq)
+ parms.srq_token = my_qp->token;
+
+ parms.servicetype = ibqptype2servicetype(qp_type);
+ if (parms.servicetype < 0) {
+ ret = -EINVAL;
+ ehca_err(pd->device, "Invalid qp_type=%x", qp_type);
+ goto create_qp_exit1;
+ }
+
+ /* Always signal by WQE so we can hide circ. WQEs */
+ parms.sigtype = HCALL_SIGT_BY_WQE;
+
+ /* UD_AV CIRCUMVENTION */
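+ /* the UD address vector occupies the space of two SG entries in each WQE */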
+ max_send_sge = init_attr->cap.max_send_sge;
+ max_recv_sge = init_attr->cap.max_recv_sge;
+ if (parms.servicetype == ST_UD && !is_llqp) {
+ max_send_sge += 2;
+ max_recv_sge += 2;
+ }
+
+ parms.token = my_qp->token;
+ parms.eq_handle = shca->eq.ipz_eq_handle;
+ parms.pd = my_pd->fw_pd;
+ if (my_qp->send_cq)
+ parms.send_cq_handle = my_qp->send_cq->ipz_cq_handle;
+ if (my_qp->recv_cq)
+ parms.recv_cq_handle = my_qp->recv_cq->ipz_cq_handle;
+
+ parms.squeue.max_wr = init_attr->cap.max_send_wr;
+ parms.rqueue.max_wr = init_attr->cap.max_recv_wr;
+ parms.squeue.max_sge = max_send_sge;
+ parms.rqueue.max_sge = max_recv_sge;
+
+ /* RC QPs need one more SWQE for unsolicited ack circumvention */
+ if (qp_type == IB_QPT_RC)
+ parms.squeue.max_wr++;
+
+ if (EHCA_BMASK_GET(HCA_CAP_MINI_QP, shca->hca_cap)) {
+ if (HAS_SQ(my_qp))
+ ehca_determine_small_queue(
+ &parms.squeue, max_send_sge, is_llqp);
+ if (HAS_RQ(my_qp))
+ ehca_determine_small_queue(
+ &parms.rqueue, max_recv_sge, is_llqp);
+ parms.qp_storage =
+ (parms.squeue.is_small || parms.rqueue.is_small);
+ }
+
+ h_ret = hipz_h_alloc_resource_qp(shca->ipz_hca_handle, &parms, is_user);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(pd->device, "h_alloc_resource_qp() failed h_ret=%lli",
+ h_ret);
+ ret = ehca2ib_return_code(h_ret);
+ goto create_qp_exit1;
+ }
+
+ ib_qp_num = my_qp->real_qp_num = parms.real_qp_num;
+ my_qp->ipz_qp_handle = parms.qp_handle;
+ my_qp->galpas = parms.galpas;
+
+ swqe_size = ehca_calc_wqe_size(parms.squeue.act_nr_sges, is_llqp);
+ rwqe_size = ehca_calc_wqe_size(parms.rqueue.act_nr_sges, is_llqp);
+
+ switch (qp_type) {
+ case IB_QPT_RC:
+ if (is_llqp) {
+ parms.squeue.act_nr_sges = 1;
+ parms.rqueue.act_nr_sges = 1;
+ }
+ /* hide the extra WQE */
+ parms.squeue.act_nr_wqes--;
+ break;
+ case IB_QPT_UD:
+ case IB_QPT_GSI:
+ case IB_QPT_SMI:
+ /* UD circumvention */
+ if (is_llqp) {
+ parms.squeue.act_nr_sges = 1;
+ parms.rqueue.act_nr_sges = 1;
+ } else {
+ parms.squeue.act_nr_sges -= 2;
+ parms.rqueue.act_nr_sges -= 2;
+ }
+
+ if (IB_QPT_GSI == qp_type || IB_QPT_SMI == qp_type) {
+ parms.squeue.act_nr_wqes = init_attr->cap.max_send_wr;
+ parms.rqueue.act_nr_wqes = init_attr->cap.max_recv_wr;
+ parms.squeue.act_nr_sges = init_attr->cap.max_send_sge;
+ parms.rqueue.act_nr_sges = init_attr->cap.max_recv_sge;
+ ib_qp_num = (qp_type == IB_QPT_SMI) ? 0 : 1;
+ }
+
+ break;
+
+ default:
+ break;
+ }
+
+ /* initialize r/squeue and register queue pages */
+ if (HAS_SQ(my_qp)) {
+ ret = init_qp_queue(
+ shca, my_pd, my_qp, &my_qp->ipz_squeue, 0,
+ HAS_RQ(my_qp) ? H_PAGE_REGISTERED : H_SUCCESS,
+ &parms.squeue, swqe_size);
+ if (ret) {
+ ehca_err(pd->device, "Couldn't initialize squeue "
+ "and pages ret=%i", ret);
+ goto create_qp_exit2;
+ }
+
+ if (!is_user) {
+ my_qp->sq_map.entries = my_qp->ipz_squeue.queue_length /
+ my_qp->ipz_squeue.qe_size;
+ my_qp->sq_map.map = vmalloc(my_qp->sq_map.entries *
+ sizeof(struct ehca_qmap_entry));
+ if (!my_qp->sq_map.map) {
+ ret = -ENOMEM;
+ ehca_err(pd->device, "Couldn't allocate squeue "
+ "map ret=%i", ret);
+ goto create_qp_exit3;
+ }
+ INIT_LIST_HEAD(&my_qp->sq_err_node);
+ /* to avoid the generation of bogus flush CQEs */
+ reset_queue_map(&my_qp->sq_map);
+ }
+ }
+
+ if (HAS_RQ(my_qp)) {
+ ret = init_qp_queue(
+ shca, my_pd, my_qp, &my_qp->ipz_rqueue, 1,
+ H_SUCCESS, &parms.rqueue, rwqe_size);
+ if (ret) {
+ ehca_err(pd->device, "Couldn't initialize rqueue "
+ "and pages ret=%i", ret);
+ goto create_qp_exit4;
+ }
+ if (!is_user) {
+ my_qp->rq_map.entries = my_qp->ipz_rqueue.queue_length /
+ my_qp->ipz_rqueue.qe_size;
+ my_qp->rq_map.map = vmalloc(my_qp->rq_map.entries *
+ sizeof(struct ehca_qmap_entry));
+ if (!my_qp->rq_map.map) {
+ ret = -ENOMEM;
+ ehca_err(pd->device, "Couldn't allocate rqueue "
+ "map ret=%i", ret);
+ goto create_qp_exit5;
+ }
+ INIT_LIST_HEAD(&my_qp->rq_err_node);
+ /* to avoid the generation of bogus flush CQEs */
+ reset_queue_map(&my_qp->rq_map);
+ }
+ } else if (init_attr->srq && !is_user) {
+ /* this is a base QP, use the queue map of the SRQ */
+ my_qp->rq_map = my_srq->rq_map;
+ INIT_LIST_HEAD(&my_qp->rq_err_node);
+
+ my_qp->ipz_rqueue = my_srq->ipz_rqueue;
+ }
+
+ if (is_srq) {
+ my_qp->ib_srq.pd = &my_pd->ib_pd;
+ my_qp->ib_srq.device = my_pd->ib_pd.device;
+
+ my_qp->ib_srq.srq_context = init_attr->qp_context;
+ my_qp->ib_srq.event_handler = init_attr->event_handler;
+ } else {
+ my_qp->ib_qp.qp_num = ib_qp_num;
+ my_qp->ib_qp.pd = &my_pd->ib_pd;
+ my_qp->ib_qp.device = my_pd->ib_pd.device;
+
+ my_qp->ib_qp.recv_cq = init_attr->recv_cq;
+ my_qp->ib_qp.send_cq = init_attr->send_cq;
+
+ my_qp->ib_qp.qp_type = qp_type;
+ my_qp->ib_qp.srq = init_attr->srq;
+
+ my_qp->ib_qp.qp_context = init_attr->qp_context;
+ my_qp->ib_qp.event_handler = init_attr->event_handler;
+ }
+
+ init_attr->cap.max_inline_data = 0; /* not supported yet */
+ init_attr->cap.max_recv_sge = parms.rqueue.act_nr_sges;
+ init_attr->cap.max_recv_wr = parms.rqueue.act_nr_wqes;
+ init_attr->cap.max_send_sge = parms.squeue.act_nr_sges;
+ init_attr->cap.max_send_wr = parms.squeue.act_nr_wqes;
+ my_qp->init_attr = *init_attr;
+
+ if (qp_type == IB_QPT_SMI || qp_type == IB_QPT_GSI) {
+ shca->sport[init_attr->port_num - 1].ibqp_sqp[qp_type] =
+ &my_qp->ib_qp;
+ if (ehca_nr_ports < 0) {
+ /* alloc array to cache subsequent modify qp parms
+ * for autodetect mode
+ */
+ my_qp->mod_qp_parm =
+ kzalloc(EHCA_MOD_QP_PARM_MAX *
+ sizeof(*my_qp->mod_qp_parm),
+ GFP_KERNEL);
+ if (!my_qp->mod_qp_parm) {
+ ret = -ENOMEM;
+ ehca_err(pd->device,
+ "Could not alloc mod_qp_parm");
+ goto create_qp_exit5;
+ }
+ }
+ }
+
+ /* NOTE: define_apq0() not supported yet */
+ if (qp_type == IB_QPT_GSI) {
+ h_ret = ehca_define_sqp(shca, my_qp, init_attr);
+ if (h_ret != H_SUCCESS) {
+ kfree(my_qp->mod_qp_parm);
+ my_qp->mod_qp_parm = NULL;
+ /* the QP pointer is no longer valid */
+ shca->sport[init_attr->port_num - 1].ibqp_sqp[qp_type] =
+ NULL;
+ ret = ehca2ib_return_code(h_ret);
+ goto create_qp_exit6;
+ }
+ }
+
+ if (my_qp->send_cq) {
+ ret = ehca_cq_assign_qp(my_qp->send_cq, my_qp);
+ if (ret) {
+ ehca_err(pd->device,
+ "Couldn't assign qp to send_cq ret=%i", ret);
+ goto create_qp_exit7;
+ }
+ }
+
+ /* copy queues, galpa data to user space */
+ if (context && udata) {
+ struct ehca_create_qp_resp resp;
+ memset(&resp, 0, sizeof(resp));
+
+ resp.qp_num = my_qp->real_qp_num;
+ resp.token = my_qp->token;
+ resp.qp_type = my_qp->qp_type;
+ resp.ext_type = my_qp->ext_type;
+ resp.qkey = my_qp->qkey;
+ resp.real_qp_num = my_qp->real_qp_num;
+
+ if (HAS_SQ(my_qp))
+ queue2resp(&resp.ipz_squeue, &my_qp->ipz_squeue);
+ if (HAS_RQ(my_qp))
+ queue2resp(&resp.ipz_rqueue, &my_qp->ipz_rqueue);
+ resp.fw_handle_ofs = (u32)
+ (my_qp->galpas.user.fw_handle & (PAGE_SIZE - 1));
+
+ if (ib_copy_to_udata(udata, &resp, sizeof resp)) {
+ ehca_err(pd->device, "Copy to udata failed");
+ ret = -EINVAL;
+ goto create_qp_exit8;
+ }
+ }
+
+ return my_qp;
+
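+/*
+ * Error exits: each label below undoes exactly the setup steps completed
+ * before the corresponding failure point, in reverse order of allocation
+ * (CQ assignment, mod_qp_parm, queue maps, ipz queues, firmware QP, idr
+ * token, cache object).
+ */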
+create_qp_exit8:
+ ehca_cq_unassign_qp(my_qp->send_cq, my_qp->real_qp_num);
+
+create_qp_exit7:
+ kfree(my_qp->mod_qp_parm);
+
+create_qp_exit6:
+ if (HAS_RQ(my_qp) && !is_user)
+ vfree(my_qp->rq_map.map);
+
+create_qp_exit5:
+ if (HAS_RQ(my_qp))
+ ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue);
+
+create_qp_exit4:
+ if (HAS_SQ(my_qp) && !is_user)
+ vfree(my_qp->sq_map.map);
+
+create_qp_exit3:
+ if (HAS_SQ(my_qp))
+ ipz_queue_dtor(my_pd, &my_qp->ipz_squeue);
+
+create_qp_exit2:
+ hipz_h_destroy_qp(shca->ipz_hca_handle, my_qp);
+
+create_qp_exit1:
+ write_lock_irqsave(&ehca_qp_idr_lock, flags);
+ idr_remove(&ehca_qp_idr, my_qp->token);
+ write_unlock_irqrestore(&ehca_qp_idr_lock, flags);
+
+create_qp_exit0:
+ kmem_cache_free(qp_cache, my_qp);
+ atomic_dec(&shca->num_qps);
+ return ERR_PTR(ret);
+}
+
+struct ib_qp *ehca_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *qp_init_attr,
+ struct ib_udata *udata)
+{
+ struct ehca_qp *ret;
+
+ ret = internal_create_qp(pd, qp_init_attr, NULL, udata, 0);
+ return IS_ERR(ret) ? (struct ib_qp *)ret : &ret->ib_qp;
+}
+
+static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp,
+ struct ib_uobject *uobject);
+
+struct ib_srq *ehca_create_srq(struct ib_pd *pd,
+ struct ib_srq_init_attr *srq_init_attr,
+ struct ib_udata *udata)
+{
+ struct ib_qp_init_attr qp_init_attr;
+ struct ehca_qp *my_qp;
+ struct ib_srq *ret;
+ struct ehca_shca *shca = container_of(pd->device, struct ehca_shca,
+ ib_device);
+ struct hcp_modify_qp_control_block *mqpcb;
+ u64 hret, update_mask;
+
+ if (srq_init_attr->srq_type != IB_SRQT_BASIC)
+ return ERR_PTR(-ENOSYS);
+
+ /* For common attributes, internal_create_qp() takes its info
+ * out of qp_init_attr, so copy all common attrs there.
+ */
+ memset(&qp_init_attr, 0, sizeof(qp_init_attr));
+ qp_init_attr.event_handler = srq_init_attr->event_handler;
+ qp_init_attr.qp_context = srq_init_attr->srq_context;
+ qp_init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+ qp_init_attr.qp_type = IB_QPT_RC;
+ qp_init_attr.cap.max_recv_wr = srq_init_attr->attr.max_wr;
+ qp_init_attr.cap.max_recv_sge = srq_init_attr->attr.max_sge;
+
+ my_qp = internal_create_qp(pd, &qp_init_attr, srq_init_attr, udata, 1);
+ if (IS_ERR(my_qp))
+ return (struct ib_srq *)my_qp;
+
+ /* copy back return values */
+ srq_init_attr->attr.max_wr = qp_init_attr.cap.max_recv_wr;
+ srq_init_attr->attr.max_sge = 3;
+
+ /* drive SRQ into RTR state */
+ mqpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+ if (!mqpcb) {
+ ehca_err(pd->device, "Could not get zeroed page for mqpcb "
+ "ehca_qp=%p qp_num=%x ", my_qp, my_qp->real_qp_num);
+ ret = ERR_PTR(-ENOMEM);
+ goto create_srq1;
+ }
+
+ mqpcb->qp_state = EHCA_QPS_INIT;
+ mqpcb->prim_phys_port = 1;
+ update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1);
+ hret = hipz_h_modify_qp(shca->ipz_hca_handle,
+ my_qp->ipz_qp_handle,
+ &my_qp->pf,
+ update_mask,
+ mqpcb, my_qp->galpas.kernel);
+ if (hret != H_SUCCESS) {
+ ehca_err(pd->device, "Could not modify SRQ to INIT "
+ "ehca_qp=%p qp_num=%x h_ret=%lli",
+ my_qp, my_qp->real_qp_num, hret);
+ goto create_srq2;
+ }
+
+ mqpcb->qp_enable = 1;
+ update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_ENABLE, 1);
+ hret = hipz_h_modify_qp(shca->ipz_hca_handle,
+ my_qp->ipz_qp_handle,
+ &my_qp->pf,
+ update_mask,
+ mqpcb, my_qp->galpas.kernel);
+ if (hret != H_SUCCESS) {
+ ehca_err(pd->device, "Could not enable SRQ "
+ "ehca_qp=%p qp_num=%x h_ret=%lli",
+ my_qp, my_qp->real_qp_num, hret);
+ goto create_srq2;
+ }
+
+ mqpcb->qp_state = EHCA_QPS_RTR;
+ update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1);
+ hret = hipz_h_modify_qp(shca->ipz_hca_handle,
+ my_qp->ipz_qp_handle,
+ &my_qp->pf,
+ update_mask,
+ mqpcb, my_qp->galpas.kernel);
+ if (hret != H_SUCCESS) {
+ ehca_err(pd->device, "Could not modify SRQ to RTR "
+ "ehca_qp=%p qp_num=%x h_ret=%lli",
+ my_qp, my_qp->real_qp_num, hret);
+ goto create_srq2;
+ }
+
+ ehca_free_fw_ctrlblock(mqpcb);
+
+ return &my_qp->ib_srq;
+
+create_srq2:
+ ret = ERR_PTR(ehca2ib_return_code(hret));
+ ehca_free_fw_ctrlblock(mqpcb);
+
+create_srq1:
+ internal_destroy_qp(pd->device, my_qp, my_qp->ib_srq.uobject);
+
+ return ret;
+}
+
+/*
+ * prepare_sqe_rts() is called by internal_modify_qp() at the SQE -> RTS
+ * transition. It sets the purge bit of the bad wqe and of all subsequent
+ * wqes to avoid re-entering SQE, and returns the total number of bad wqes
+ * in bad_wqe_cnt.
+ */
+static int prepare_sqe_rts(struct ehca_qp *my_qp, struct ehca_shca *shca,
+ int *bad_wqe_cnt)
+{
+ u64 h_ret;
+ struct ipz_queue *squeue;
+ void *bad_send_wqe_p, *bad_send_wqe_v;
+ u64 q_ofs;
+ struct ehca_wqe *wqe;
+ int qp_num = my_qp->ib_qp.qp_num;
+
+ /* get send wqe pointer */
+ h_ret = hipz_h_disable_and_get_wqe(shca->ipz_hca_handle,
+ my_qp->ipz_qp_handle, &my_qp->pf,
+ &bad_send_wqe_p, NULL, 2);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "hipz_h_disable_and_get_wqe() failed"
+ " ehca_qp=%p qp_num=%x h_ret=%lli",
+ my_qp, qp_num, h_ret);
+ return ehca2ib_return_code(h_ret);
+ }
+ bad_send_wqe_p = (void *)((u64)bad_send_wqe_p & (~(1L << 63)));
+ ehca_dbg(&shca->ib_device, "qp_num=%x bad_send_wqe_p=%p",
+ qp_num, bad_send_wqe_p);
+ /* convert wqe pointer to vadr */
+ bad_send_wqe_v = __va((u64)bad_send_wqe_p);
+ if (ehca_debug_level >= 2)
+ ehca_dmp(bad_send_wqe_v, 32, "qp_num=%x bad_wqe", qp_num);
+ squeue = &my_qp->ipz_squeue;
+ if (ipz_queue_abs_to_offset(squeue, (u64)bad_send_wqe_p, &q_ofs)) {
+ ehca_err(&shca->ib_device, "failed to get wqe offset qp_num=%x"
+ " bad_send_wqe_p=%p", qp_num, bad_send_wqe_p);
+ return -EFAULT;
+ }
+
+ /* loop sets wqe's purge bit */
+ wqe = (struct ehca_wqe *)ipz_qeit_calc(squeue, q_ofs);
+ *bad_wqe_cnt = 0;
+ while (wqe->optype != 0xff && wqe->wqef != 0xff) {
+ if (ehca_debug_level >= 2)
+ ehca_dmp(wqe, 32, "qp_num=%x wqe", qp_num);
+ wqe->nr_of_data_seg = 0; /* suppress data access */
+ wqe->wqef = WQEF_PURGE; /* WQE to be purged */
+ q_ofs = ipz_queue_advance_offset(squeue, q_ofs);
+ wqe = (struct ehca_wqe *)ipz_qeit_calc(squeue, q_ofs);
+ *bad_wqe_cnt = (*bad_wqe_cnt)+1;
+ }
+ /*
+ * the bad wqe will be reprocessed and ignored when poll_cq() is called,
+ * i.e. the number of wqes with flush error status is one less
+ */
+ ehca_dbg(&shca->ib_device, "qp_num=%x flusherr_wqe_cnt=%x",
+ qp_num, (*bad_wqe_cnt)-1);
+ wqe->wqef = 0;
+
+ return 0;
+}
+
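+/*
+ * calc_left_cqes() - walk the queue map from its tail up to the wqe the
+ * firmware reported as current, count the entries that requested a cqe
+ * (these completions are still left to poll) and remember in next_wqe_idx
+ * where flushing has to start.
+ */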
+static int calc_left_cqes(u64 wqe_p, struct ipz_queue *ipz_queue,
+ struct ehca_queue_map *qmap)
+{
+ void *wqe_v;
+ u64 q_ofs;
+ u32 wqe_idx;
+ unsigned int tail_idx;
+
+ /* convert real to abs address */
+ wqe_p = wqe_p & (~(1UL << 63));
+
+ wqe_v = __va(wqe_p);
+
+ if (ipz_queue_abs_to_offset(ipz_queue, wqe_p, &q_ofs)) {
+ ehca_gen_err("Invalid offset for calculating left cqes "
+ "wqe_p=%#llx wqe_v=%p\n", wqe_p, wqe_v);
+ return -EFAULT;
+ }
+
+ tail_idx = next_index(qmap->tail, qmap->entries);
+ wqe_idx = q_ofs / ipz_queue->qe_size;
+
+ /* check all processed wqes, whether a cqe is requested or not */
+ while (tail_idx != wqe_idx) {
+ if (qmap->map[tail_idx].cqe_req)
+ qmap->left_to_poll++;
+ tail_idx = next_index(tail_idx, qmap->entries);
+ }
+ /* save index in queue, where we have to start flushing */
+ qmap->next_wqe_idx = wqe_idx;
+ return 0;
+}
+
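+/*
+ * check_for_left_cqes() - called for kernel QPs entering the ERROR state:
+ * determine how many completions are still outstanding on the send and
+ * receive queues and, if none are left, put the QP on the CQs' error lists
+ * so that flush cqes can be generated.
+ */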
+static int check_for_left_cqes(struct ehca_qp *my_qp, struct ehca_shca *shca)
+{
+ u64 h_ret;
+ void *send_wqe_p, *recv_wqe_p;
+ int ret;
+ unsigned long flags;
+ int qp_num = my_qp->ib_qp.qp_num;
+
+ /* this hcall is not supported on base QPs */
+ if (my_qp->ext_type != EQPT_SRQBASE) {
+ /* get send and receive wqe pointer */
+ h_ret = hipz_h_disable_and_get_wqe(shca->ipz_hca_handle,
+ my_qp->ipz_qp_handle, &my_qp->pf,
+ &send_wqe_p, &recv_wqe_p, 4);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device, "disable_and_get_wqe() "
+ "failed ehca_qp=%p qp_num=%x h_ret=%lli",
+ my_qp, qp_num, h_ret);
+ return ehca2ib_return_code(h_ret);
+ }
+
+ /*
+ * acquire lock to ensure that nobody is polling the cq which
+ * could mean that the qmap->tail pointer is in an
+ * inconsistent state.
+ */
+ spin_lock_irqsave(&my_qp->send_cq->spinlock, flags);
+ ret = calc_left_cqes((u64)send_wqe_p, &my_qp->ipz_squeue,
+ &my_qp->sq_map);
+ spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags);
+ if (ret)
+ return ret;
+
+
+ spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags);
+ ret = calc_left_cqes((u64)recv_wqe_p, &my_qp->ipz_rqueue,
+ &my_qp->rq_map);
+ spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, flags);
+ if (ret)
+ return ret;
+ } else {
+ spin_lock_irqsave(&my_qp->send_cq->spinlock, flags);
+ my_qp->sq_map.left_to_poll = 0;
+ my_qp->sq_map.next_wqe_idx = next_index(my_qp->sq_map.tail,
+ my_qp->sq_map.entries);
+ spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags);
+
+ spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags);
+ my_qp->rq_map.left_to_poll = 0;
+ my_qp->rq_map.next_wqe_idx = next_index(my_qp->rq_map.tail,
+ my_qp->rq_map.entries);
+ spin_unlock_irqrestore(&my_qp->recv_cq->spinlock, flags);
+ }
+
+ /* this ensures flush cqes are generated only for pending wqes */
+ if ((my_qp->sq_map.left_to_poll == 0) &&
+ (my_qp->rq_map.left_to_poll == 0)) {
+ spin_lock_irqsave(&my_qp->send_cq->spinlock, flags);
+ ehca_add_to_err_list(my_qp, 1);
+ spin_unlock_irqrestore(&my_qp->send_cq->spinlock, flags);
+
+ if (HAS_RQ(my_qp)) {
+ spin_lock_irqsave(&my_qp->recv_cq->spinlock, flags);
+ ehca_add_to_err_list(my_qp, 0);
+ spin_unlock_irqrestore(&my_qp->recv_cq->spinlock,
+ flags);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * internal_modify_qp with circumvention to handle aqp0 properly
+ * smi_reset2init indicates whether this is an internal reset-to-init call
+ * for SMI. This flag must always be zero if called from ehca_modify_qp()!
+ * This internal function was introduced to avoid recursion of
+ * ehca_modify_qp().
+ */
+static int internal_modify_qp(struct ib_qp *ibqp,
+ struct ib_qp_attr *attr,
+ int attr_mask, int smi_reset2init)
+{
+ enum ib_qp_state qp_cur_state, qp_new_state;
+ int cnt, qp_attr_idx, ret = 0;
+ enum ib_qp_statetrans statetrans;
+ struct hcp_modify_qp_control_block *mqpcb;
+ struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
+ struct ehca_shca *shca =
+ container_of(ibqp->pd->device, struct ehca_shca, ib_device);
+ u64 update_mask;
+ u64 h_ret;
+ int bad_wqe_cnt = 0;
+ int is_user = 0;
+ int squeue_locked = 0;
+ unsigned long flags = 0;
+
+ /* do query_qp to obtain current attr values */
+ mqpcb = ehca_alloc_fw_ctrlblock(GFP_ATOMIC);
+ if (!mqpcb) {
+ ehca_err(ibqp->device, "Could not get zeroed page for mqpcb "
+ "ehca_qp=%p qp_num=%x ", my_qp, ibqp->qp_num);
+ return -ENOMEM;
+ }
+
+ h_ret = hipz_h_query_qp(shca->ipz_hca_handle,
+ my_qp->ipz_qp_handle,
+ &my_qp->pf,
+ mqpcb, my_qp->galpas.kernel);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(ibqp->device, "hipz_h_query_qp() failed "
+ "ehca_qp=%p qp_num=%x h_ret=%lli",
+ my_qp, ibqp->qp_num, h_ret);
+ ret = ehca2ib_return_code(h_ret);
+ goto modify_qp_exit1;
+ }
+ if (ibqp->uobject)
+ is_user = 1;
+
+ qp_cur_state = ehca2ib_qp_state(mqpcb->qp_state);
+
+ if (qp_cur_state == -EINVAL) { /* invalid qp state */
+ ret = -EINVAL;
+ ehca_err(ibqp->device, "Invalid current ehca_qp_state=%x "
+ "ehca_qp=%p qp_num=%x",
+ mqpcb->qp_state, my_qp, ibqp->qp_num);
+ goto modify_qp_exit1;
+ }
+ /*
+ * circumvention to set aqp0 initial state to init
+ * as expected by IB spec
+ */
+ if (smi_reset2init == 0 &&
+ ibqp->qp_type == IB_QPT_SMI &&
+ qp_cur_state == IB_QPS_RESET &&
+ (attr_mask & IB_QP_STATE) &&
+ attr->qp_state == IB_QPS_INIT) { /* RESET -> INIT */
+ struct ib_qp_attr smiqp_attr = {
+ .qp_state = IB_QPS_INIT,
+ .port_num = my_qp->init_attr.port_num,
+ .pkey_index = 0,
+ .qkey = 0
+ };
+ int smiqp_attr_mask = IB_QP_STATE | IB_QP_PORT |
+ IB_QP_PKEY_INDEX | IB_QP_QKEY;
+ int smirc = internal_modify_qp(
+ ibqp, &smiqp_attr, smiqp_attr_mask, 1);
+ if (smirc) {
+ ehca_err(ibqp->device, "SMI RESET -> INIT failed. "
+ "ehca_modify_qp() rc=%i", smirc);
+ ret = H_PARAMETER;
+ goto modify_qp_exit1;
+ }
+ qp_cur_state = IB_QPS_INIT;
+ ehca_dbg(ibqp->device, "SMI RESET -> INIT succeeded");
+ }
+ /* does the transmitted current state match the "real" current state? */
+ if ((attr_mask & IB_QP_CUR_STATE) &&
+ qp_cur_state != attr->cur_qp_state) {
+ ret = -EINVAL;
+ ehca_err(ibqp->device,
+ "Invalid IB_QP_CUR_STATE attr->curr_qp_state=%x <>"
+ " actual cur_qp_state=%x. ehca_qp=%p qp_num=%x",
+ attr->cur_qp_state, qp_cur_state, my_qp, ibqp->qp_num);
+ goto modify_qp_exit1;
+ }
+
+ ehca_dbg(ibqp->device, "ehca_qp=%p qp_num=%x current qp_state=%x "
+ "new qp_state=%x attribute_mask=%x",
+ my_qp, ibqp->qp_num, qp_cur_state, attr->qp_state, attr_mask);
+
+ qp_new_state = attr_mask & IB_QP_STATE ? attr->qp_state : qp_cur_state;
+ if (!smi_reset2init &&
+ !ib_modify_qp_is_ok(qp_cur_state, qp_new_state, ibqp->qp_type,
+ attr_mask, IB_LINK_LAYER_UNSPECIFIED)) {
+ ret = -EINVAL;
+ ehca_err(ibqp->device,
+ "Invalid qp transition new_state=%x cur_state=%x "
+ "ehca_qp=%p qp_num=%x attr_mask=%x", qp_new_state,
+ qp_cur_state, my_qp, ibqp->qp_num, attr_mask);
+ goto modify_qp_exit1;
+ }
+
+ mqpcb->qp_state = ib2ehca_qp_state(qp_new_state);
+ if (mqpcb->qp_state)
+ update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_STATE, 1);
+ else {
+ ret = -EINVAL;
+ ehca_err(ibqp->device, "Invalid new qp state=%x "
+ "ehca_qp=%p qp_num=%x",
+ qp_new_state, my_qp, ibqp->qp_num);
+ goto modify_qp_exit1;
+ }
+
+ /* retrieve state transition struct to get req and opt attrs */
+ statetrans = get_modqp_statetrans(qp_cur_state, qp_new_state);
+ if (statetrans < 0) {
+ ret = -EINVAL;
+ ehca_err(ibqp->device, "<INVALID STATE CHANGE> qp_cur_state=%x "
+ "new_qp_state=%x State_xsition=%x ehca_qp=%p "
+ "qp_num=%x", qp_cur_state, qp_new_state,
+ statetrans, my_qp, ibqp->qp_num);
+ goto modify_qp_exit1;
+ }
+
+ qp_attr_idx = ib2ehcaqptype(ibqp->qp_type);
+
+ if (qp_attr_idx < 0) {
+ ret = qp_attr_idx;
+ ehca_err(ibqp->device,
+ "Invalid QP type=%x ehca_qp=%p qp_num=%x",
+ ibqp->qp_type, my_qp, ibqp->qp_num);
+ goto modify_qp_exit1;
+ }
+
+ ehca_dbg(ibqp->device,
+ "ehca_qp=%p qp_num=%x <VALID STATE CHANGE> qp_state_xsit=%x",
+ my_qp, ibqp->qp_num, statetrans);
+
+ /* eHCA2 rev2 and higher require the SEND_GRH_FLAG to be set
+ * in non-LL UD QPs.
+ */
+ if ((my_qp->qp_type == IB_QPT_UD) &&
+ (my_qp->ext_type != EQPT_LLQP) &&
+ (statetrans == IB_QPST_INIT2RTR) &&
+ (shca->hw_level >= 0x22)) {
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG, 1);
+ mqpcb->send_grh_flag = 1;
+ }
+
+ /* sqe -> rts: set purge bit of bad wqe before actual trans */
+ if ((my_qp->qp_type == IB_QPT_UD ||
+ my_qp->qp_type == IB_QPT_GSI ||
+ my_qp->qp_type == IB_QPT_SMI) &&
+ statetrans == IB_QPST_SQE2RTS) {
+ /* mark next free wqe if kernel */
+ if (!ibqp->uobject) {
+ struct ehca_wqe *wqe;
+ /* lock send queue */
+ spin_lock_irqsave(&my_qp->spinlock_s, flags);
+ squeue_locked = 1;
+ /* mark next free wqe */
+ wqe = (struct ehca_wqe *)
+ ipz_qeit_get(&my_qp->ipz_squeue);
+ wqe->optype = wqe->wqef = 0xff;
+ ehca_dbg(ibqp->device, "qp_num=%x next_free_wqe=%p",
+ ibqp->qp_num, wqe);
+ }
+ ret = prepare_sqe_rts(my_qp, shca, &bad_wqe_cnt);
+ if (ret) {
+ ehca_err(ibqp->device, "prepare_sqe_rts() failed "
+ "ehca_qp=%p qp_num=%x ret=%i",
+ my_qp, ibqp->qp_num, ret);
+ goto modify_qp_exit2;
+ }
+ }
+
+ /*
+ * enable RDMA_Atomic_Control on reset->init for connected QPs (RC/UC);
+ * this is necessary since gen2 does not provide that flag,
+ * but pHyp requires it
+ */
+ if (statetrans == IB_QPST_RESET2INIT &&
+ (ibqp->qp_type == IB_QPT_RC || ibqp->qp_type == IB_QPT_UC)) {
+ mqpcb->rdma_atomic_ctrl = 3;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RDMA_ATOMIC_CTRL, 1);
+ }
+ /* circumvention: pHyp requires #RDMA/Atomic Resp Res for UC INIT -> RTR */
+ if (statetrans == IB_QPST_INIT2RTR &&
+ (ibqp->qp_type == IB_QPT_UC) &&
+ !(attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)) {
+ mqpcb->rdma_nr_atomic_resp_res = 1; /* default to 1 */
+ update_mask |=
+ EHCA_BMASK_SET(MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES, 1);
+ }
+
+ if (attr_mask & IB_QP_PKEY_INDEX) {
+ if (attr->pkey_index >= 16) {
+ ret = -EINVAL;
+ ehca_err(ibqp->device, "Invalid pkey_index=%x. "
+ "ehca_qp=%p qp_num=%x max_pkey_index=f",
+ attr->pkey_index, my_qp, ibqp->qp_num);
+ goto modify_qp_exit2;
+ }
+ mqpcb->prim_p_key_idx = attr->pkey_index;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PRIM_P_KEY_IDX, 1);
+ }
+ if (attr_mask & IB_QP_PORT) {
+ struct ehca_sport *sport;
+ struct ehca_qp *aqp1;
+ if (attr->port_num < 1 || attr->port_num > shca->num_ports) {
+ ret = -EINVAL;
+ ehca_err(ibqp->device, "Invalid port=%x. "
+ "ehca_qp=%p qp_num=%x num_ports=%x",
+ attr->port_num, my_qp, ibqp->qp_num,
+ shca->num_ports);
+ goto modify_qp_exit2;
+ }
+ sport = &shca->sport[attr->port_num - 1];
+ if (!sport->ibqp_sqp[IB_QPT_GSI]) {
+ /* should not occur */
+ ret = -EFAULT;
+ ehca_err(ibqp->device, "AQP1 was not created for "
+ "port=%x", attr->port_num);
+ goto modify_qp_exit2;
+ }
+ aqp1 = container_of(sport->ibqp_sqp[IB_QPT_GSI],
+ struct ehca_qp, ib_qp);
+ if (ibqp->qp_type != IB_QPT_GSI &&
+ ibqp->qp_type != IB_QPT_SMI &&
+ aqp1->mod_qp_parm) {
+ /*
+ * firmware will reject this modify_qp() because
+ * port is not activated/initialized fully
+ */
+ ret = -EFAULT;
+ ehca_warn(ibqp->device, "Couldn't modify qp port=%x: "
+ "either port is being activated (try again) "
+ "or cabling issue", attr->port_num);
+ goto modify_qp_exit2;
+ }
+ mqpcb->prim_phys_port = attr->port_num;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PRIM_PHYS_PORT, 1);
+ }
+ if (attr_mask & IB_QP_QKEY) {
+ mqpcb->qkey = attr->qkey;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_QKEY, 1);
+ }
+ if (attr_mask & IB_QP_AV) {
+ mqpcb->dlid = attr->ah_attr.dlid;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DLID, 1);
+ mqpcb->source_path_bits = attr->ah_attr.src_path_bits;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SOURCE_PATH_BITS, 1);
+ mqpcb->service_level = attr->ah_attr.sl;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SERVICE_LEVEL, 1);
+
+ if (ehca_calc_ipd(shca, mqpcb->prim_phys_port,
+ attr->ah_attr.static_rate,
+ &mqpcb->max_static_rate)) {
+ ret = -EINVAL;
+ goto modify_qp_exit2;
+ }
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE, 1);
+
+ /*
+ * Always supply the GRH flag, even if it's zero, to give the
+ * hypervisor a clear "yes" or "no" instead of a "perhaps"
+ */
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG, 1);
+
+ /*
+ * only if GRH is set may we supply SOURCE_GID_IDX
+ * and DEST_GID; otherwise pHyp will return H_ATTR_PARM
+ */
+ if (attr->ah_attr.ah_flags == IB_AH_GRH) {
+ mqpcb->send_grh_flag = 1;
+
+ mqpcb->source_gid_idx = attr->ah_attr.grh.sgid_index;
+ update_mask |=
+ EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX, 1);
+
+ for (cnt = 0; cnt < 16; cnt++)
+ mqpcb->dest_gid.byte[cnt] =
+ attr->ah_attr.grh.dgid.raw[cnt];
+
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DEST_GID, 1);
+ mqpcb->flow_label = attr->ah_attr.grh.flow_label;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_FLOW_LABEL, 1);
+ mqpcb->hop_limit = attr->ah_attr.grh.hop_limit;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_HOP_LIMIT, 1);
+ mqpcb->traffic_class = attr->ah_attr.grh.traffic_class;
+ update_mask |=
+ EHCA_BMASK_SET(MQPCB_MASK_TRAFFIC_CLASS, 1);
+ }
+ }
+
+ if (attr_mask & IB_QP_PATH_MTU) {
+ /* store log2(MTU) */
+ my_qp->mtu_shift = attr->path_mtu + 7;
+ mqpcb->path_mtu = attr->path_mtu;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PATH_MTU, 1);
+ }
+ if (attr_mask & IB_QP_TIMEOUT) {
+ mqpcb->timeout = attr->timeout;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_TIMEOUT, 1);
+ }
+ if (attr_mask & IB_QP_RETRY_CNT) {
+ mqpcb->retry_count = attr->retry_cnt;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RETRY_COUNT, 1);
+ }
+ if (attr_mask & IB_QP_RNR_RETRY) {
+ mqpcb->rnr_retry_count = attr->rnr_retry;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RNR_RETRY_COUNT, 1);
+ }
+ if (attr_mask & IB_QP_RQ_PSN) {
+ mqpcb->receive_psn = attr->rq_psn;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_RECEIVE_PSN, 1);
+ }
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+ mqpcb->rdma_nr_atomic_resp_res = attr->max_dest_rd_atomic < 3 ?
+ attr->max_dest_rd_atomic : 2;
+ update_mask |=
+ EHCA_BMASK_SET(MQPCB_MASK_RDMA_NR_ATOMIC_RESP_RES, 1);
+ }
+ if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
+ mqpcb->rdma_atomic_outst_dest_qp = attr->max_rd_atomic < 3 ?
+ attr->max_rd_atomic : 2;
+ update_mask |=
+ EHCA_BMASK_SET
+ (MQPCB_MASK_RDMA_ATOMIC_OUTST_DEST_QP, 1);
+ }
+ if (attr_mask & IB_QP_ALT_PATH) {
+ if (attr->alt_port_num < 1
+ || attr->alt_port_num > shca->num_ports) {
+ ret = -EINVAL;
+ ehca_err(ibqp->device, "Invalid alt_port=%x. "
+ "ehca_qp=%p qp_num=%x num_ports=%x",
+ attr->alt_port_num, my_qp, ibqp->qp_num,
+ shca->num_ports);
+ goto modify_qp_exit2;
+ }
+ mqpcb->alt_phys_port = attr->alt_port_num;
+
+ if (attr->alt_pkey_index >= 16) {
+ ret = -EINVAL;
+ ehca_err(ibqp->device, "Invalid alt_pkey_index=%x. "
+ "ehca_qp=%p qp_num=%x max_pkey_index=f",
+ attr->pkey_index, my_qp, ibqp->qp_num);
+ goto modify_qp_exit2;
+ }
+ mqpcb->alt_p_key_idx = attr->alt_pkey_index;
+
+ mqpcb->timeout_al = attr->alt_timeout;
+ mqpcb->dlid_al = attr->alt_ah_attr.dlid;
+ mqpcb->source_path_bits_al = attr->alt_ah_attr.src_path_bits;
+ mqpcb->service_level_al = attr->alt_ah_attr.sl;
+
+ if (ehca_calc_ipd(shca, mqpcb->alt_phys_port,
+ attr->alt_ah_attr.static_rate,
+ &mqpcb->max_static_rate_al)) {
+ ret = -EINVAL;
+ goto modify_qp_exit2;
+ }
+
+ /* OpenIB doesn't support alternate retry counts - copy them */
+ mqpcb->retry_count_al = mqpcb->retry_count;
+ mqpcb->rnr_retry_count_al = mqpcb->rnr_retry_count;
+
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_ALT_PHYS_PORT, 1)
+ | EHCA_BMASK_SET(MQPCB_MASK_ALT_P_KEY_IDX, 1)
+ | EHCA_BMASK_SET(MQPCB_MASK_TIMEOUT_AL, 1)
+ | EHCA_BMASK_SET(MQPCB_MASK_DLID_AL, 1)
+ | EHCA_BMASK_SET(MQPCB_MASK_SOURCE_PATH_BITS_AL, 1)
+ | EHCA_BMASK_SET(MQPCB_MASK_SERVICE_LEVEL_AL, 1)
+ | EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE_AL, 1)
+ | EHCA_BMASK_SET(MQPCB_MASK_RETRY_COUNT_AL, 1)
+ | EHCA_BMASK_SET(MQPCB_MASK_RNR_RETRY_COUNT_AL, 1);
+
+ /*
+ * Always supply the GRH flag, even if it's zero, to give the
+ * hypervisor a clear "yes" or "no" instead of a "perhaps"
+ */
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG_AL, 1);
+
+ /*
+ * only if GRH is set may we supply SOURCE_GID_IDX
+ * and DEST_GID; otherwise pHyp will return H_ATTR_PARM
+ */
+ if (attr->alt_ah_attr.ah_flags == IB_AH_GRH) {
+ mqpcb->send_grh_flag_al = 1;
+
+ for (cnt = 0; cnt < 16; cnt++)
+ mqpcb->dest_gid_al.byte[cnt] =
+ attr->alt_ah_attr.grh.dgid.raw[cnt];
+ mqpcb->source_gid_idx_al =
+ attr->alt_ah_attr.grh.sgid_index;
+ mqpcb->flow_label_al = attr->alt_ah_attr.grh.flow_label;
+ mqpcb->hop_limit_al = attr->alt_ah_attr.grh.hop_limit;
+ mqpcb->traffic_class_al =
+ attr->alt_ah_attr.grh.traffic_class;
+
+ update_mask |=
+ EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX_AL, 1)
+ | EHCA_BMASK_SET(MQPCB_MASK_DEST_GID_AL, 1)
+ | EHCA_BMASK_SET(MQPCB_MASK_FLOW_LABEL_AL, 1)
+ | EHCA_BMASK_SET(MQPCB_MASK_HOP_LIMIT_AL, 1) |
+ EHCA_BMASK_SET(MQPCB_MASK_TRAFFIC_CLASS_AL, 1);
+ }
+ }
+
+ if (attr_mask & IB_QP_MIN_RNR_TIMER) {
+ mqpcb->min_rnr_nak_timer_field = attr->min_rnr_timer;
+ update_mask |=
+ EHCA_BMASK_SET(MQPCB_MASK_MIN_RNR_NAK_TIMER_FIELD, 1);
+ }
+
+ if (attr_mask & IB_QP_SQ_PSN) {
+ mqpcb->send_psn = attr->sq_psn;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_PSN, 1);
+ }
+
+ if (attr_mask & IB_QP_DEST_QPN) {
+ mqpcb->dest_qp_nr = attr->dest_qp_num;
+ update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DEST_QP_NR, 1);
+ }
+
+ if (attr_mask & IB_QP_PATH_MIG_STATE) {
+ if (attr->path_mig_state != IB_MIG_REARM
+ && attr->path_mig_state != IB_MIG_MIGRATED) {
+ ret = -EINVAL;
+ ehca_err(ibqp->device, "Invalid mig_state=%x",
+ attr->path_mig_state);
+ goto modify_qp_exit2;
+ }
+ mqpcb->path_migration_state = attr->path_mig_state + 1;
+ if (attr->path_mig_state == IB_MIG_REARM)
+ my_qp->mig_armed = 1;
+ update_mask |=
+ EHCA_BMASK_SET(MQPCB_MASK_PATH_MIGRATION_STATE, 1);
+ }
+
+ if (attr_mask & IB_QP_CAP) {
+ mqpcb->max_nr_outst_send_wr = attr->cap.max_send_wr+1;
+ update_mask |=
+ EHCA_BMASK_SET(MQPCB_MASK_MAX_NR_OUTST_SEND_WR, 1);
+ mqpcb->max_nr_outst_recv_wr = attr->cap.max_recv_wr+1;
+ update_mask |=
+ EHCA_BMASK_SET(MQPCB_MASK_MAX_NR_OUTST_RECV_WR, 1);
+ /* no support for max_send/recv_sge yet */
+ }
+
+ if (ehca_debug_level >= 2)
+ ehca_dmp(mqpcb, 4*70, "qp_num=%x", ibqp->qp_num);
+
+ h_ret = hipz_h_modify_qp(shca->ipz_hca_handle,
+ my_qp->ipz_qp_handle,
+ &my_qp->pf,
+ update_mask,
+ mqpcb, my_qp->galpas.kernel);
+
+ if (h_ret != H_SUCCESS) {
+ ret = ehca2ib_return_code(h_ret);
+ ehca_err(ibqp->device, "hipz_h_modify_qp() failed h_ret=%lli "
+ "ehca_qp=%p qp_num=%x", h_ret, my_qp, ibqp->qp_num);
+ goto modify_qp_exit2;
+ }
+
+ if ((my_qp->qp_type == IB_QPT_UD ||
+ my_qp->qp_type == IB_QPT_GSI ||
+ my_qp->qp_type == IB_QPT_SMI) &&
+ statetrans == IB_QPST_SQE2RTS) {
+ /* doorbell to reprocessing wqes */
+ iosync(); /* serialize GAL register access */
+ hipz_update_sqa(my_qp, bad_wqe_cnt-1);
+ ehca_gen_dbg("doorbell for %x wqes", bad_wqe_cnt);
+ }
+
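+ /*
+ * For RESET -> INIT and INIT -> INIT the QP is enabled with a second,
+ * separate modify call that only sets QP_ENABLE; the error text below
+ * hints that this step fails while the port has not yet received a LID.
+ */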
+ if (statetrans == IB_QPST_RESET2INIT ||
+ statetrans == IB_QPST_INIT2INIT) {
+ mqpcb->qp_enable = 1;
+ mqpcb->qp_state = EHCA_QPS_INIT;
+ update_mask = 0;
+ update_mask = EHCA_BMASK_SET(MQPCB_MASK_QP_ENABLE, 1);
+
+ h_ret = hipz_h_modify_qp(shca->ipz_hca_handle,
+ my_qp->ipz_qp_handle,
+ &my_qp->pf,
+ update_mask,
+ mqpcb,
+ my_qp->galpas.kernel);
+
+ if (h_ret != H_SUCCESS) {
+ ret = ehca2ib_return_code(h_ret);
+ ehca_err(ibqp->device, "ENABLE in context of "
+ "RESET_2_INIT failed! Maybe you didn't get "
+ "a LID h_ret=%lli ehca_qp=%p qp_num=%x",
+ h_ret, my_qp, ibqp->qp_num);
+ goto modify_qp_exit2;
+ }
+ }
+ if ((qp_new_state == IB_QPS_ERR) && (qp_cur_state != IB_QPS_ERR)
+ && !is_user) {
+ ret = check_for_left_cqes(my_qp, shca);
+ if (ret)
+ goto modify_qp_exit2;
+ }
+
+ if (statetrans == IB_QPST_ANY2RESET) {
+ ipz_qeit_reset(&my_qp->ipz_rqueue);
+ ipz_qeit_reset(&my_qp->ipz_squeue);
+
+ if (qp_cur_state == IB_QPS_ERR && !is_user) {
+ del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node);
+
+ if (HAS_RQ(my_qp))
+ del_from_err_list(my_qp->recv_cq,
+ &my_qp->rq_err_node);
+ }
+ if (!is_user)
+ reset_queue_map(&my_qp->sq_map);
+
+ if (HAS_RQ(my_qp) && !is_user)
+ reset_queue_map(&my_qp->rq_map);
+ }
+
+ if (attr_mask & IB_QP_QKEY)
+ my_qp->qkey = attr->qkey;
+
+modify_qp_exit2:
+ if (squeue_locked) { /* this means: sqe -> rts */
+ spin_unlock_irqrestore(&my_qp->spinlock_s, flags);
+ my_qp->sqerr_purgeflag = 1;
+ }
+
+modify_qp_exit1:
+ ehca_free_fw_ctrlblock(mqpcb);
+
+ return ret;
+}
+
+int ehca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
+ struct ib_udata *udata)
+{
+ int ret = 0;
+
+ struct ehca_shca *shca = container_of(ibqp->device, struct ehca_shca,
+ ib_device);
+ struct ehca_qp *my_qp = container_of(ibqp, struct ehca_qp, ib_qp);
+
+ /* The if-block below caches the qp_attr to be modified for GSI and SMI
+ * qps during initialization by ib_mad. When the respective port is
+ * activated, i.e. we get a PORT_ACTIVE event, we replay the cached
+ * sequence of modify calls, see ehca_recover_sqp() below.
+ * Why this is required:
+ * 1) If only one port is connected, older code required that port to be
+ * port one and the module option nr_ports=1 to be given by the user,
+ * which is very inconvenient for the end user.
+ * 2) Firmware accepts modify_qp() only after the respective port has
+ * become active. Older code had a 30 sec wait loop in create_qp()/
+ * define_aqp1(), which is not appropriate in practice. This code
+ * removes that wait loop, see define_aqp1(), and always reports all
+ * ports to ib_mad and to the users. Only activated ports will then be
+ * usable by the users.
+ */
+ if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) {
+ int port = my_qp->init_attr.port_num;
+ struct ehca_sport *sport = &shca->sport[port - 1];
+ unsigned long flags;
+ spin_lock_irqsave(&sport->mod_sqp_lock, flags);
+ /* cache qp_attr only during init */
+ if (my_qp->mod_qp_parm) {
+ struct ehca_mod_qp_parm *p;
+ if (my_qp->mod_qp_parm_idx >= EHCA_MOD_QP_PARM_MAX) {
+ ehca_err(&shca->ib_device,
+ "mod_qp_parm overflow state=%x port=%x"
+ " type=%x", attr->qp_state,
+ my_qp->init_attr.port_num,
+ ibqp->qp_type);
+ spin_unlock_irqrestore(&sport->mod_sqp_lock,
+ flags);
+ return -EINVAL;
+ }
+ p = &my_qp->mod_qp_parm[my_qp->mod_qp_parm_idx];
+ p->mask = attr_mask;
+ p->attr = *attr;
+ my_qp->mod_qp_parm_idx++;
+ ehca_dbg(&shca->ib_device,
+ "Saved qp_attr for state=%x port=%x type=%x",
+ attr->qp_state, my_qp->init_attr.port_num,
+ ibqp->qp_type);
+ spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
+ goto out;
+ }
+ spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
+ }
+
+ ret = internal_modify_qp(ibqp, attr, attr_mask, 0);
+
+out:
+ if ((ret == 0) && (attr_mask & IB_QP_STATE))
+ my_qp->state = attr->qp_state;
+
+ return ret;
+}
+
+void ehca_recover_sqp(struct ib_qp *sqp)
+{
+ struct ehca_qp *my_sqp = container_of(sqp, struct ehca_qp, ib_qp);
+ int port = my_sqp->init_attr.port_num;
+ struct ib_qp_attr attr;
+ struct ehca_mod_qp_parm *qp_parm;
+ int i, qp_parm_idx, ret;
+ unsigned long flags, wr_cnt;
+
+ if (!my_sqp->mod_qp_parm)
+ return;
+ ehca_dbg(sqp->device, "SQP port=%x qp_num=%x", port, sqp->qp_num);
+
+ qp_parm = my_sqp->mod_qp_parm;
+ qp_parm_idx = my_sqp->mod_qp_parm_idx;
+ for (i = 0; i < qp_parm_idx; i++) {
+ attr = qp_parm[i].attr;
+ ret = internal_modify_qp(sqp, &attr, qp_parm[i].mask, 0);
+ if (ret) {
+ ehca_err(sqp->device, "Could not modify SQP port=%x "
+ "qp_num=%x ret=%x", port, sqp->qp_num, ret);
+ goto free_qp_parm;
+ }
+ ehca_dbg(sqp->device, "SQP port=%x qp_num=%x in state=%x",
+ port, sqp->qp_num, attr.qp_state);
+ }
+
+ /* re-trigger posted recv wrs */
+ wr_cnt = my_sqp->ipz_rqueue.current_q_offset /
+ my_sqp->ipz_rqueue.qe_size;
+ if (wr_cnt) {
+ spin_lock_irqsave(&my_sqp->spinlock_r, flags);
+ hipz_update_rqa(my_sqp, wr_cnt);
+ spin_unlock_irqrestore(&my_sqp->spinlock_r, flags);
+ ehca_dbg(sqp->device, "doorbell port=%x qp_num=%x wr_cnt=%lx",
+ port, sqp->qp_num, wr_cnt);
+ }
+
+free_qp_parm:
+ kfree(qp_parm);
+ /* this prevents subsequent calls to modify_qp() from caching qp_attr */
+ my_sqp->mod_qp_parm = NULL;
+}
+
+int ehca_query_qp(struct ib_qp *qp,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
+{
+ struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
+ struct ehca_shca *shca = container_of(qp->device, struct ehca_shca,
+ ib_device);
+ struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle;
+ struct hcp_modify_qp_control_block *qpcb;
+ int cnt, ret = 0;
+ u64 h_ret;
+
+ if (qp_attr_mask & QP_ATTR_QUERY_NOT_SUPPORTED) {
+ ehca_err(qp->device, "Invalid attribute mask "
+ "ehca_qp=%p qp_num=%x qp_attr_mask=%x ",
+ my_qp, qp->qp_num, qp_attr_mask);
+ return -EINVAL;
+ }
+
+ qpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+ if (!qpcb) {
+ ehca_err(qp->device, "Out of memory for qpcb "
+ "ehca_qp=%p qp_num=%x", my_qp, qp->qp_num);
+ return -ENOMEM;
+ }
+
+ h_ret = hipz_h_query_qp(adapter_handle,
+ my_qp->ipz_qp_handle,
+ &my_qp->pf,
+ qpcb, my_qp->galpas.kernel);
+
+ if (h_ret != H_SUCCESS) {
+ ret = ehca2ib_return_code(h_ret);
+ ehca_err(qp->device, "hipz_h_query_qp() failed "
+ "ehca_qp=%p qp_num=%x h_ret=%lli",
+ my_qp, qp->qp_num, h_ret);
+ goto query_qp_exit1;
+ }
+
+ qp_attr->cur_qp_state = ehca2ib_qp_state(qpcb->qp_state);
+ qp_attr->qp_state = qp_attr->cur_qp_state;
+
+ if (qp_attr->cur_qp_state == -EINVAL) {
+ ret = -EINVAL;
+ ehca_err(qp->device, "Got invalid ehca_qp_state=%x "
+ "ehca_qp=%p qp_num=%x",
+ qpcb->qp_state, my_qp, qp->qp_num);
+ goto query_qp_exit1;
+ }
+
+ if (qp_attr->qp_state == IB_QPS_SQD)
+ qp_attr->sq_draining = 1;
+
+ qp_attr->qkey = qpcb->qkey;
+ qp_attr->path_mtu = qpcb->path_mtu;
+ qp_attr->path_mig_state = qpcb->path_migration_state - 1;
+ qp_attr->rq_psn = qpcb->receive_psn;
+ qp_attr->sq_psn = qpcb->send_psn;
+ qp_attr->min_rnr_timer = qpcb->min_rnr_nak_timer_field;
+ qp_attr->cap.max_send_wr = qpcb->max_nr_outst_send_wr-1;
+ qp_attr->cap.max_recv_wr = qpcb->max_nr_outst_recv_wr-1;
+ /* UD_AV CIRCUMVENTION */
+ if (my_qp->qp_type == IB_QPT_UD) {
+ qp_attr->cap.max_send_sge =
+ qpcb->actual_nr_sges_in_sq_wqe - 2;
+ qp_attr->cap.max_recv_sge =
+ qpcb->actual_nr_sges_in_rq_wqe - 2;
+ } else {
+ qp_attr->cap.max_send_sge =
+ qpcb->actual_nr_sges_in_sq_wqe;
+ qp_attr->cap.max_recv_sge =
+ qpcb->actual_nr_sges_in_rq_wqe;
+ }
+
+ qp_attr->cap.max_inline_data = my_qp->sq_max_inline_data_size;
+ qp_attr->dest_qp_num = qpcb->dest_qp_nr;
+
+ qp_attr->pkey_index = qpcb->prim_p_key_idx;
+ qp_attr->port_num = qpcb->prim_phys_port;
+ qp_attr->timeout = qpcb->timeout;
+ qp_attr->retry_cnt = qpcb->retry_count;
+ qp_attr->rnr_retry = qpcb->rnr_retry_count;
+
+ qp_attr->alt_pkey_index = qpcb->alt_p_key_idx;
+ qp_attr->alt_port_num = qpcb->alt_phys_port;
+ qp_attr->alt_timeout = qpcb->timeout_al;
+
+ qp_attr->max_dest_rd_atomic = qpcb->rdma_nr_atomic_resp_res;
+ qp_attr->max_rd_atomic = qpcb->rdma_atomic_outst_dest_qp;
+
+ /* primary av */
+ qp_attr->ah_attr.sl = qpcb->service_level;
+
+ if (qpcb->send_grh_flag) {
+ qp_attr->ah_attr.ah_flags = IB_AH_GRH;
+ }
+
+ qp_attr->ah_attr.static_rate = qpcb->max_static_rate;
+ qp_attr->ah_attr.dlid = qpcb->dlid;
+ qp_attr->ah_attr.src_path_bits = qpcb->source_path_bits;
+ qp_attr->ah_attr.port_num = qp_attr->port_num;
+
+ /* primary GRH */
+ qp_attr->ah_attr.grh.traffic_class = qpcb->traffic_class;
+ qp_attr->ah_attr.grh.hop_limit = qpcb->hop_limit;
+ qp_attr->ah_attr.grh.sgid_index = qpcb->source_gid_idx;
+ qp_attr->ah_attr.grh.flow_label = qpcb->flow_label;
+
+ for (cnt = 0; cnt < 16; cnt++)
+ qp_attr->ah_attr.grh.dgid.raw[cnt] =
+ qpcb->dest_gid.byte[cnt];
+
+ /* alternate AV */
+ qp_attr->alt_ah_attr.sl = qpcb->service_level_al;
+ if (qpcb->send_grh_flag_al) {
+ qp_attr->alt_ah_attr.ah_flags = IB_AH_GRH;
+ }
+
+ qp_attr->alt_ah_attr.static_rate = qpcb->max_static_rate_al;
+ qp_attr->alt_ah_attr.dlid = qpcb->dlid_al;
+ qp_attr->alt_ah_attr.src_path_bits = qpcb->source_path_bits_al;
+
+ /* alternate GRH */
+ qp_attr->alt_ah_attr.grh.traffic_class = qpcb->traffic_class_al;
+ qp_attr->alt_ah_attr.grh.hop_limit = qpcb->hop_limit_al;
+ qp_attr->alt_ah_attr.grh.sgid_index = qpcb->source_gid_idx_al;
+ qp_attr->alt_ah_attr.grh.flow_label = qpcb->flow_label_al;
+
+ for (cnt = 0; cnt < 16; cnt++)
+ qp_attr->alt_ah_attr.grh.dgid.raw[cnt] =
+ qpcb->dest_gid_al.byte[cnt];
+
+ /* return init attributes given in ehca_create_qp */
+ if (qp_init_attr)
+ *qp_init_attr = my_qp->init_attr;
+
+ if (ehca_debug_level >= 2)
+ ehca_dmp(qpcb, 4*70, "qp_num=%x", qp->qp_num);
+
+query_qp_exit1:
+ ehca_free_fw_ctrlblock(qpcb);
+
+ return ret;
+}
+
+int ehca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
+{
+ struct ehca_qp *my_qp =
+ container_of(ibsrq, struct ehca_qp, ib_srq);
+ struct ehca_shca *shca =
+ container_of(ibsrq->pd->device, struct ehca_shca, ib_device);
+ struct hcp_modify_qp_control_block *mqpcb;
+ u64 update_mask;
+ u64 h_ret;
+ int ret = 0;
+
+ mqpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+ if (!mqpcb) {
+ ehca_err(ibsrq->device, "Could not get zeroed page for mqpcb "
+ "ehca_qp=%p qp_num=%x ", my_qp, my_qp->real_qp_num);
+ return -ENOMEM;
+ }
+
+ update_mask = 0;
+ if (attr_mask & IB_SRQ_LIMIT) {
+ attr_mask &= ~IB_SRQ_LIMIT;
+ update_mask |=
+ EHCA_BMASK_SET(MQPCB_MASK_CURR_SRQ_LIMIT, 1)
+ | EHCA_BMASK_SET(MQPCB_MASK_QP_AFF_ASYN_EV_LOG_REG, 1);
+ mqpcb->curr_srq_limit = attr->srq_limit;
+ mqpcb->qp_aff_asyn_ev_log_reg =
+ EHCA_BMASK_SET(QPX_AAELOG_RESET_SRQ_LIMIT, 1);
+ }
+
+ /* by now, all bits in attr_mask should have been cleared */
+ if (attr_mask) {
+ ehca_err(ibsrq->device, "invalid attribute mask bits set "
+ "attr_mask=%x", attr_mask);
+ ret = -EINVAL;
+ goto modify_srq_exit0;
+ }
+
+ if (ehca_debug_level >= 2)
+ ehca_dmp(mqpcb, 4*70, "qp_num=%x", my_qp->real_qp_num);
+
+ h_ret = hipz_h_modify_qp(shca->ipz_hca_handle, my_qp->ipz_qp_handle,
+ NULL, update_mask, mqpcb,
+ my_qp->galpas.kernel);
+
+ if (h_ret != H_SUCCESS) {
+ ret = ehca2ib_return_code(h_ret);
+ ehca_err(ibsrq->device, "hipz_h_modify_qp() failed h_ret=%lli "
+ "ehca_qp=%p qp_num=%x",
+ h_ret, my_qp, my_qp->real_qp_num);
+ }
+
+modify_srq_exit0:
+ ehca_free_fw_ctrlblock(mqpcb);
+
+ return ret;
+}
+
+int ehca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr)
+{
+ struct ehca_qp *my_qp = container_of(srq, struct ehca_qp, ib_srq);
+ struct ehca_shca *shca = container_of(srq->device, struct ehca_shca,
+ ib_device);
+ struct ipz_adapter_handle adapter_handle = shca->ipz_hca_handle;
+ struct hcp_modify_qp_control_block *qpcb;
+ int ret = 0;
+ u64 h_ret;
+
+ qpcb = ehca_alloc_fw_ctrlblock(GFP_KERNEL);
+ if (!qpcb) {
+ ehca_err(srq->device, "Out of memory for qpcb "
+ "ehca_qp=%p qp_num=%x", my_qp, my_qp->real_qp_num);
+ return -ENOMEM;
+ }
+
+ h_ret = hipz_h_query_qp(adapter_handle, my_qp->ipz_qp_handle,
+ NULL, qpcb, my_qp->galpas.kernel);
+
+ if (h_ret != H_SUCCESS) {
+ ret = ehca2ib_return_code(h_ret);
+ ehca_err(srq->device, "hipz_h_query_qp() failed "
+ "ehca_qp=%p qp_num=%x h_ret=%lli",
+ my_qp, my_qp->real_qp_num, h_ret);
+ goto query_srq_exit1;
+ }
+
+ srq_attr->max_wr = qpcb->max_nr_outst_recv_wr - 1;
+ srq_attr->max_sge = 3;
+ srq_attr->srq_limit = qpcb->curr_srq_limit;
+
+ if (ehca_debug_level >= 2)
+ ehca_dmp(qpcb, 4*70, "qp_num=%x", my_qp->real_qp_num);
+
+query_srq_exit1:
+ ehca_free_fw_ctrlblock(qpcb);
+
+ return ret;
+}
+
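+/*
+ * internal_destroy_qp() - common teardown for QPs and SRQs: unassign the QP
+ * from its send CQ, drop the idr token, remove it from any CQ error lists,
+ * wait for pending events, destroy the firmware QP and free the queues and
+ * queue maps.
+ */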
+static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp,
+ struct ib_uobject *uobject)
+{
+ struct ehca_shca *shca = container_of(dev, struct ehca_shca, ib_device);
+ struct ehca_pd *my_pd = container_of(my_qp->ib_qp.pd, struct ehca_pd,
+ ib_pd);
+ struct ehca_sport *sport = &shca->sport[my_qp->init_attr.port_num - 1];
+ u32 qp_num = my_qp->real_qp_num;
+ int ret;
+ u64 h_ret;
+ u8 port_num;
+ int is_user = 0;
+ enum ib_qp_type qp_type;
+ unsigned long flags;
+
+ if (uobject) {
+ is_user = 1;
+ if (my_qp->mm_count_galpa ||
+ my_qp->mm_count_rqueue || my_qp->mm_count_squeue) {
+ ehca_err(dev, "Resources still referenced in "
+ "user space qp_num=%x", qp_num);
+ return -EINVAL;
+ }
+ }
+
+ if (my_qp->send_cq) {
+ ret = ehca_cq_unassign_qp(my_qp->send_cq, qp_num);
+ if (ret) {
+ ehca_err(dev, "Couldn't unassign qp from "
+ "send_cq ret=%i qp_num=%x cq_num=%x", ret,
+ qp_num, my_qp->send_cq->cq_number);
+ return ret;
+ }
+ }
+
+ write_lock_irqsave(&ehca_qp_idr_lock, flags);
+ idr_remove(&ehca_qp_idr, my_qp->token);
+ write_unlock_irqrestore(&ehca_qp_idr_lock, flags);
+
+ /*
+ * SRQs will never get into an error list and do not have a recv_cq,
+ * so we need to skip them here.
+ */
+ if (HAS_RQ(my_qp) && !IS_SRQ(my_qp) && !is_user)
+ del_from_err_list(my_qp->recv_cq, &my_qp->rq_err_node);
+
+ if (HAS_SQ(my_qp) && !is_user)
+ del_from_err_list(my_qp->send_cq, &my_qp->sq_err_node);
+
+ /* now wait until all pending events have completed */
+ wait_event(my_qp->wait_completion, !atomic_read(&my_qp->nr_events));
+
+ h_ret = hipz_h_destroy_qp(shca->ipz_hca_handle, my_qp);
+ if (h_ret != H_SUCCESS) {
+ ehca_err(dev, "hipz_h_destroy_qp() failed h_ret=%lli "
+ "ehca_qp=%p qp_num=%x", h_ret, my_qp, qp_num);
+ return ehca2ib_return_code(h_ret);
+ }
+
+ port_num = my_qp->init_attr.port_num;
+ qp_type = my_qp->init_attr.qp_type;
+
+ if (qp_type == IB_QPT_SMI || qp_type == IB_QPT_GSI) {
+ spin_lock_irqsave(&sport->mod_sqp_lock, flags);
+ kfree(my_qp->mod_qp_parm);
+ my_qp->mod_qp_parm = NULL;
+ shca->sport[port_num - 1].ibqp_sqp[qp_type] = NULL;
+ spin_unlock_irqrestore(&sport->mod_sqp_lock, flags);
+ }
+
+ /* no support for IB_QPT_SMI yet */
+ if (qp_type == IB_QPT_GSI) {
+ struct ib_event event;
+ ehca_info(dev, "device %s: port %x is inactive.",
+ shca->ib_device.name, port_num);
+ event.device = &shca->ib_device;
+ event.event = IB_EVENT_PORT_ERR;
+ event.element.port_num = port_num;
+ shca->sport[port_num - 1].port_state = IB_PORT_DOWN;
+ ib_dispatch_event(&event);
+ }
+
+ if (HAS_RQ(my_qp)) {
+ ipz_queue_dtor(my_pd, &my_qp->ipz_rqueue);
+ if (!is_user)
+ vfree(my_qp->rq_map.map);
+ }
+ if (HAS_SQ(my_qp)) {
+ ipz_queue_dtor(my_pd, &my_qp->ipz_squeue);
+ if (!is_user)
+ vfree(my_qp->sq_map.map);
+ }
+ kmem_cache_free(qp_cache, my_qp);
+ atomic_dec(&shca->num_qps);
+ return 0;
+}
+
+int ehca_destroy_qp(struct ib_qp *qp)
+{
+ return internal_destroy_qp(qp->device,
+ container_of(qp, struct ehca_qp, ib_qp),
+ qp->uobject);
+}
+
+int ehca_destroy_srq(struct ib_srq *srq)
+{
+ return internal_destroy_qp(srq->device,
+ container_of(srq, struct ehca_qp, ib_srq),
+ srq->uobject);
+}
+
+int ehca_init_qp_cache(void)
+{
+ qp_cache = kmem_cache_create("ehca_cache_qp",
+ sizeof(struct ehca_qp), 0,
+ SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (!qp_cache)
+ return -ENOMEM;
+ return 0;
+}
+
+void ehca_cleanup_qp_cache(void)
+{
+ if (qp_cache)
+ kmem_cache_destroy(qp_cache);
+}
diff --git a/drivers/infiniband/hw/ehca/ehca_reqs.c b/drivers/infiniband/hw/ehca/ehca_reqs.c
new file mode 100644
index 000000000..47f949843
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_reqs.c
@@ -0,0 +1,953 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * post_send/recv, poll_cq, req_notify
+ *
+ * Authors: Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ * Waleri Fomin <fomin@de.ibm.com>
+ * Joachim Fenkes <fenkes@de.ibm.com>
+ * Reinhard Ernst <rernst@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#include "ehca_classes.h"
+#include "ehca_tools.h"
+#include "ehca_qes.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+#include "hipz_fns.h"
+
+/* in RC traffic, insert an empty RDMA READ after this many packets without one */
+#define ACK_CIRC_THRESHOLD 2000000
+
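+/*
+ * The low-order bits of a work request id (QMAP_IDX_MASK) are reused to
+ * carry the wqe's index in the queue map: replace_wr_id() overwrites them
+ * with the map index before the wqe is posted, while get_app_wr_id() saves
+ * the application's original low bits in the queue map entry so the full
+ * wr_id can be restored when the completion is reported.
+ */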
+static u64 replace_wr_id(u64 wr_id, u16 idx)
+{
+ u64 ret;
+
+ ret = wr_id & ~QMAP_IDX_MASK;
+ ret |= idx & QMAP_IDX_MASK;
+
+ return ret;
+}
+
+static u16 get_app_wr_id(u64 wr_id)
+{
+ return wr_id & QMAP_IDX_MASK;
+}
+
+static inline int ehca_write_rwqe(struct ipz_queue *ipz_rqueue,
+ struct ehca_wqe *wqe_p,
+ struct ib_recv_wr *recv_wr,
+ u32 rq_map_idx)
+{
+ u8 cnt_ds;
+ if (unlikely((recv_wr->num_sge < 0) ||
+ (recv_wr->num_sge > ipz_rqueue->act_nr_of_sg))) {
+ ehca_gen_err("Invalid number of WQE SGE. "
+ "num_sqe=%x max_nr_of_sg=%x",
+ recv_wr->num_sge, ipz_rqueue->act_nr_of_sg);
+ return -EINVAL; /* invalid SG list length */
+ }
+
+ /* clear wqe header until sglist */
+ memset(wqe_p, 0, offsetof(struct ehca_wqe, u.ud_av.sg_list));
+
+ wqe_p->work_request_id = replace_wr_id(recv_wr->wr_id, rq_map_idx);
+ wqe_p->nr_of_data_seg = recv_wr->num_sge;
+
+ for (cnt_ds = 0; cnt_ds < recv_wr->num_sge; cnt_ds++) {
+ wqe_p->u.all_rcv.sg_list[cnt_ds].vaddr =
+ recv_wr->sg_list[cnt_ds].addr;
+ wqe_p->u.all_rcv.sg_list[cnt_ds].lkey =
+ recv_wr->sg_list[cnt_ds].lkey;
+ wqe_p->u.all_rcv.sg_list[cnt_ds].length =
+ recv_wr->sg_list[cnt_ds].length;
+ }
+
+ if (ehca_debug_level >= 3) {
+ ehca_gen_dbg("RECEIVE WQE written into ipz_rqueue=%p",
+ ipz_rqueue);
+ ehca_dmp(wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "recv wqe");
+ }
+
+ return 0;
+}
+
+#if defined(DEBUG_GSI_SEND_WR)
+
+/* need ib_mad struct */
+#include <rdma/ib_mad.h>
+
+static void trace_send_wr_ud(const struct ib_send_wr *send_wr)
+{
+ int idx = 0;
+ int j;
+ while (send_wr) {
+ struct ib_mad_hdr *mad_hdr = send_wr->wr.ud.mad_hdr;
+ struct ib_sge *sge = send_wr->sg_list;
+ ehca_gen_dbg("send_wr#%x wr_id=%lx num_sge=%x "
+ "send_flags=%x opcode=%x", idx, send_wr->wr_id,
+ send_wr->num_sge, send_wr->send_flags,
+ send_wr->opcode);
+ if (mad_hdr) {
+ ehca_gen_dbg("send_wr#%x mad_hdr base_version=%x "
+ "mgmt_class=%x class_version=%x method=%x "
+ "status=%x class_specific=%x tid=%lx "
+ "attr_id=%x resv=%x attr_mod=%x",
+ idx, mad_hdr->base_version,
+ mad_hdr->mgmt_class,
+ mad_hdr->class_version, mad_hdr->method,
+ mad_hdr->status, mad_hdr->class_specific,
+ mad_hdr->tid, mad_hdr->attr_id,
+ mad_hdr->resv,
+ mad_hdr->attr_mod);
+ }
+ for (j = 0; j < send_wr->num_sge; j++) {
+ u8 *data = __va(sge->addr);
+ ehca_gen_dbg("send_wr#%x sge#%x addr=%p length=%x "
+ "lkey=%x",
+ idx, j, data, sge->length, sge->lkey);
+ /* assume length is n*16 */
+ ehca_dmp(data, sge->length, "send_wr#%x sge#%x",
+ idx, j);
+ sge++;
+ } /* eof for j */
+ idx++;
+ send_wr = send_wr->next;
+ } /* eof while send_wr */
+}
+
+#endif /* DEBUG_GSI_SEND_WR */
+
+static inline int ehca_write_swqe(struct ehca_qp *qp,
+ struct ehca_wqe *wqe_p,
+ const struct ib_send_wr *send_wr,
+ u32 sq_map_idx,
+ int hidden)
+{
+ u32 idx;
+ u64 dma_length;
+ struct ehca_av *my_av;
+ u32 remote_qkey = send_wr->wr.ud.remote_qkey;
+ struct ehca_qmap_entry *qmap_entry = &qp->sq_map.map[sq_map_idx];
+
+ if (unlikely((send_wr->num_sge < 0) ||
+ (send_wr->num_sge > qp->ipz_squeue.act_nr_of_sg))) {
+ ehca_gen_err("Invalid number of WQE SGE. "
+ "num_sqe=%x max_nr_of_sg=%x",
+ send_wr->num_sge, qp->ipz_squeue.act_nr_of_sg);
+ return -EINVAL; /* invalid SG list length */
+ }
+
+ /* clear wqe header until sglist */
+ memset(wqe_p, 0, offsetof(struct ehca_wqe, u.ud_av.sg_list));
+
+ wqe_p->work_request_id = replace_wr_id(send_wr->wr_id, sq_map_idx);
+
+ qmap_entry->app_wr_id = get_app_wr_id(send_wr->wr_id);
+ qmap_entry->reported = 0;
+ qmap_entry->cqe_req = 0;
+
+ switch (send_wr->opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_IMM:
+ wqe_p->optype = WQE_OPTYPE_SEND;
+ break;
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ wqe_p->optype = WQE_OPTYPE_RDMAWRITE;
+ break;
+ case IB_WR_RDMA_READ:
+ wqe_p->optype = WQE_OPTYPE_RDMAREAD;
+ break;
+ default:
+ ehca_gen_err("Invalid opcode=%x", send_wr->opcode);
+ return -EINVAL; /* invalid opcode */
+ }
+
+ wqe_p->wqef = (send_wr->opcode) & WQEF_HIGH_NIBBLE;
+
+ wqe_p->wr_flag = 0;
+
+ if ((send_wr->send_flags & IB_SEND_SIGNALED ||
+ qp->init_attr.sq_sig_type == IB_SIGNAL_ALL_WR)
+ && !hidden) {
+ wqe_p->wr_flag |= WQE_WRFLAG_REQ_SIGNAL_COM;
+ qmap_entry->cqe_req = 1;
+ }
+
+ if (send_wr->opcode == IB_WR_SEND_WITH_IMM ||
+ send_wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM) {
+ /* this might not work as long as HW does not support it */
+ wqe_p->immediate_data = be32_to_cpu(send_wr->ex.imm_data);
+ wqe_p->wr_flag |= WQE_WRFLAG_IMM_DATA_PRESENT;
+ }
+
+ wqe_p->nr_of_data_seg = send_wr->num_sge;
+
+ switch (qp->qp_type) {
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ /* no break is intentional here */
+ case IB_QPT_UD:
+ /* IB 1.2 spec C10-15 compliance */
+ if (send_wr->wr.ud.remote_qkey & 0x80000000)
+ remote_qkey = qp->qkey;
+
+ wqe_p->destination_qp_number = send_wr->wr.ud.remote_qpn << 8;
+ wqe_p->local_ee_context_qkey = remote_qkey;
+ if (unlikely(!send_wr->wr.ud.ah)) {
+ ehca_gen_err("wr.ud.ah is NULL. qp=%p", qp);
+ return -EINVAL;
+ }
+ if (unlikely(send_wr->wr.ud.remote_qpn == 0)) {
+ ehca_gen_err("dest QP# is 0. qp=%x", qp->real_qp_num);
+ return -EINVAL;
+ }
+ my_av = container_of(send_wr->wr.ud.ah, struct ehca_av, ib_ah);
+ wqe_p->u.ud_av.ud_av = my_av->av;
+
+ /*
+ * omitted check of IB_SEND_INLINE
+ * since HW does not support it
+ */
+ for (idx = 0; idx < send_wr->num_sge; idx++) {
+ wqe_p->u.ud_av.sg_list[idx].vaddr =
+ send_wr->sg_list[idx].addr;
+ wqe_p->u.ud_av.sg_list[idx].lkey =
+ send_wr->sg_list[idx].lkey;
+ wqe_p->u.ud_av.sg_list[idx].length =
+ send_wr->sg_list[idx].length;
+ } /* eof for idx */
+ if (qp->qp_type == IB_QPT_SMI ||
+ qp->qp_type == IB_QPT_GSI)
+ wqe_p->u.ud_av.ud_av.pmtu = 1;
+ if (qp->qp_type == IB_QPT_GSI) {
+ wqe_p->pkeyi = send_wr->wr.ud.pkey_index;
+#ifdef DEBUG_GSI_SEND_WR
+ trace_send_wr_ud(send_wr);
+#endif /* DEBUG_GSI_SEND_WR */
+ }
+ break;
+
+ case IB_QPT_UC:
+ if (send_wr->send_flags & IB_SEND_FENCE)
+ wqe_p->wr_flag |= WQE_WRFLAG_FENCE;
+ /* no break is intentional here */
+ case IB_QPT_RC:
+ /* TODO: atomic not implemented */
+ wqe_p->u.nud.remote_virtual_address =
+ send_wr->wr.rdma.remote_addr;
+ wqe_p->u.nud.rkey = send_wr->wr.rdma.rkey;
+
+ /*
+ * omitted checking of IB_SEND_INLINE
+ * since HW does not support it
+ */
+ dma_length = 0;
+ for (idx = 0; idx < send_wr->num_sge; idx++) {
+ wqe_p->u.nud.sg_list[idx].vaddr =
+ send_wr->sg_list[idx].addr;
+ wqe_p->u.nud.sg_list[idx].lkey =
+ send_wr->sg_list[idx].lkey;
+ wqe_p->u.nud.sg_list[idx].length =
+ send_wr->sg_list[idx].length;
+ dma_length += send_wr->sg_list[idx].length;
+ } /* eof idx */
+ wqe_p->u.nud.atomic_1st_op_dma_len = dma_length;
+
+ /* unsolicited ack circumvention */
+ if (send_wr->opcode == IB_WR_RDMA_READ) {
+ /* on RDMA read, switch on and reset counters */
+ qp->message_count = qp->packet_count = 0;
+ qp->unsol_ack_circ = 1;
+ } else
+ /* else estimate #packets */
+ qp->packet_count += (dma_length >> qp->mtu_shift) + 1;
+
+ break;
+
+ default:
+ ehca_gen_err("Invalid qptype=%x", qp->qp_type);
+ return -EINVAL;
+ }
+
+ if (ehca_debug_level >= 3) {
+ ehca_gen_dbg("SEND WQE written into queue qp=%p ", qp);
+ ehca_dmp( wqe_p, 16*(6 + wqe_p->nr_of_data_seg), "send wqe");
+ }
+ return 0;
+}
+
+/* map_ib_wc_status converts raw cqe_status to ib_wc_status */
+static inline void map_ib_wc_status(u32 cqe_status,
+ enum ib_wc_status *wc_status)
+{
+ if (unlikely(cqe_status & WC_STATUS_ERROR_BIT)) {
+ switch (cqe_status & 0x3F) {
+ case 0x01:
+ case 0x21:
+ *wc_status = IB_WC_LOC_LEN_ERR;
+ break;
+ case 0x02:
+ case 0x22:
+ *wc_status = IB_WC_LOC_QP_OP_ERR;
+ break;
+ case 0x03:
+ case 0x23:
+ *wc_status = IB_WC_LOC_EEC_OP_ERR;
+ break;
+ case 0x04:
+ case 0x24:
+ *wc_status = IB_WC_LOC_PROT_ERR;
+ break;
+ case 0x05:
+ case 0x25:
+ *wc_status = IB_WC_WR_FLUSH_ERR;
+ break;
+ case 0x06:
+ *wc_status = IB_WC_MW_BIND_ERR;
+ break;
+ case 0x07: /* remote error - look into bits 20:24 */
+ switch ((cqe_status
+ & WC_STATUS_REMOTE_ERROR_FLAGS) >> 11) {
+ case 0x0:
+ /*
+ * PSN Sequence Error!
+ * couldn't find a matching status!
+ */
+ *wc_status = IB_WC_GENERAL_ERR;
+ break;
+ case 0x1:
+ *wc_status = IB_WC_REM_INV_REQ_ERR;
+ break;
+ case 0x2:
+ *wc_status = IB_WC_REM_ACCESS_ERR;
+ break;
+ case 0x3:
+ *wc_status = IB_WC_REM_OP_ERR;
+ break;
+ case 0x4:
+ *wc_status = IB_WC_REM_INV_RD_REQ_ERR;
+ break;
+ }
+ break;
+ case 0x08:
+ *wc_status = IB_WC_RETRY_EXC_ERR;
+ break;
+ case 0x09:
+ *wc_status = IB_WC_RNR_RETRY_EXC_ERR;
+ break;
+ case 0x0A:
+ case 0x2D:
+ *wc_status = IB_WC_REM_ABORT_ERR;
+ break;
+ case 0x0B:
+ case 0x2E:
+ *wc_status = IB_WC_INV_EECN_ERR;
+ break;
+ case 0x0C:
+ case 0x2F:
+ *wc_status = IB_WC_INV_EEC_STATE_ERR;
+ break;
+ case 0x0D:
+ *wc_status = IB_WC_BAD_RESP_ERR;
+ break;
+ case 0x10:
+ /* WQE purged */
+ *wc_status = IB_WC_WR_FLUSH_ERR;
+ break;
+ default:
+ *wc_status = IB_WC_FATAL_ERR;
+
+ }
+ } else
+ *wc_status = IB_WC_SUCCESS;
+}
+
+static inline int post_one_send(struct ehca_qp *my_qp,
+ struct ib_send_wr *cur_send_wr,
+ int hidden)
+{
+ struct ehca_wqe *wqe_p;
+ int ret;
+ u32 sq_map_idx;
+ u64 start_offset = my_qp->ipz_squeue.current_q_offset;
+
+ /* get pointer next to free WQE */
+ wqe_p = ipz_qeit_get_inc(&my_qp->ipz_squeue);
+ if (unlikely(!wqe_p)) {
+ /* too many posted work requests: queue overflow */
+ ehca_err(my_qp->ib_qp.device, "Too many posted WQEs "
+ "qp_num=%x", my_qp->ib_qp.qp_num);
+ return -ENOMEM;
+ }
+
+ /*
+ * Get the index of the WQE in the send queue. The same index is used
+ * for writing into the sq_map.
+ */
+ sq_map_idx = start_offset / my_qp->ipz_squeue.qe_size;
+
+ /* write a SEND WQE into the QUEUE */
+ ret = ehca_write_swqe(my_qp, wqe_p, cur_send_wr, sq_map_idx, hidden);
+ /*
+ * if something failed,
+ * reset the free entry pointer to the start value
+ */
+ if (unlikely(ret)) {
+ my_qp->ipz_squeue.current_q_offset = start_offset;
+ ehca_err(my_qp->ib_qp.device, "Could not write WQE "
+ "qp_num=%x", my_qp->ib_qp.qp_num);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int ehca_post_send(struct ib_qp *qp,
+ struct ib_send_wr *send_wr,
+ struct ib_send_wr **bad_send_wr)
+{
+ struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
+ int wqe_cnt = 0;
+ int ret = 0;
+ unsigned long flags;
+
+ /* Reject WR if QP is in RESET, INIT or RTR state */
+ if (unlikely(my_qp->state < IB_QPS_RTS)) {
+ ehca_err(qp->device, "Invalid QP state qp_state=%d qpn=%x",
+ my_qp->state, qp->qp_num);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ /* LOCK the QUEUE */
+ spin_lock_irqsave(&my_qp->spinlock_s, flags);
+
+ /* Send an empty extra RDMA read if:
+ * 1) there has been an RDMA read on this connection before
+ * 2) no RDMA read occurred for ACK_CIRC_THRESHOLD link packets
+ * 3) we can be sure that any previous extra RDMA read has been
+ * processed so we don't overflow the SQ
+ */
+ if (unlikely(my_qp->unsol_ack_circ &&
+ my_qp->packet_count > ACK_CIRC_THRESHOLD &&
+ my_qp->message_count > my_qp->init_attr.cap.max_send_wr)) {
+ /* insert an empty RDMA READ to fix up the remote QP state */
+ struct ib_send_wr circ_wr;
+ memset(&circ_wr, 0, sizeof(circ_wr));
+ circ_wr.opcode = IB_WR_RDMA_READ;
+ post_one_send(my_qp, &circ_wr, 1); /* ignore retcode */
+ wqe_cnt++;
+ ehca_dbg(qp->device, "posted circ wr qp_num=%x", qp->qp_num);
+ my_qp->message_count = my_qp->packet_count = 0;
+ }
+
+ /* loop processes list of send reqs */
+ while (send_wr) {
+ ret = post_one_send(my_qp, send_wr, 0);
+ if (unlikely(ret)) {
+ goto post_send_exit0;
+ }
+ wqe_cnt++;
+ send_wr = send_wr->next;
+ }
+
+post_send_exit0:
+ iosync(); /* serialize GAL register access */
+ hipz_update_sqa(my_qp, wqe_cnt);
+ if (unlikely(ret || ehca_debug_level >= 2))
+ ehca_dbg(qp->device, "ehca_qp=%p qp_num=%x wqe_cnt=%d ret=%i",
+ my_qp, qp->qp_num, wqe_cnt, ret);
+ my_qp->message_count += wqe_cnt;
+ spin_unlock_irqrestore(&my_qp->spinlock_s, flags);
+
+out:
+ if (ret)
+ *bad_send_wr = send_wr;
+ return ret;
+}
+
+static int internal_post_recv(struct ehca_qp *my_qp,
+ struct ib_device *dev,
+ struct ib_recv_wr *recv_wr,
+ struct ib_recv_wr **bad_recv_wr)
+{
+ struct ehca_wqe *wqe_p;
+ int wqe_cnt = 0;
+ int ret = 0;
+ u32 rq_map_idx;
+ unsigned long flags;
+ struct ehca_qmap_entry *qmap_entry;
+
+ if (unlikely(!HAS_RQ(my_qp))) {
+ ehca_err(dev, "QP has no RQ ehca_qp=%p qp_num=%x ext_type=%d",
+ my_qp, my_qp->real_qp_num, my_qp->ext_type);
+ ret = -ENODEV;
+ goto out;
+ }
+
+ /* LOCK the QUEUE */
+ spin_lock_irqsave(&my_qp->spinlock_r, flags);
+
+ /* loop processes list of recv reqs */
+ while (recv_wr) {
+ u64 start_offset = my_qp->ipz_rqueue.current_q_offset;
+ /* get pointer next to free WQE */
+ wqe_p = ipz_qeit_get_inc(&my_qp->ipz_rqueue);
+ if (unlikely(!wqe_p)) {
+ /* too many posted work requests: queue overflow */
+ ret = -ENOMEM;
+ ehca_err(dev, "Too many posted WQEs "
+ "qp_num=%x", my_qp->real_qp_num);
+ goto post_recv_exit0;
+ }
+ /*
+ * Get the index of the WQE in the recv queue. The same index
+ * is used for writing into the rq_map.
+ */
+ rq_map_idx = start_offset / my_qp->ipz_rqueue.qe_size;
+
+ /* write a RECV WQE into the QUEUE */
+ ret = ehca_write_rwqe(&my_qp->ipz_rqueue, wqe_p, recv_wr,
+ rq_map_idx);
+ /*
+ * if something failed,
+ * reset the free entry pointer to the start value
+ */
+ if (unlikely(ret)) {
+ my_qp->ipz_rqueue.current_q_offset = start_offset;
+ ret = -EINVAL;
+ ehca_err(dev, "Could not write WQE "
+ "qp_num=%x", my_qp->real_qp_num);
+ goto post_recv_exit0;
+ }
+
+ qmap_entry = &my_qp->rq_map.map[rq_map_idx];
+ qmap_entry->app_wr_id = get_app_wr_id(recv_wr->wr_id);
+ qmap_entry->reported = 0;
+ qmap_entry->cqe_req = 1;
+
+ wqe_cnt++;
+ recv_wr = recv_wr->next;
+ } /* eof for recv_wr */
+
+post_recv_exit0:
+ iosync(); /* serialize GAL register access */
+ hipz_update_rqa(my_qp, wqe_cnt);
+ if (unlikely(ret || ehca_debug_level >= 2))
+ ehca_dbg(dev, "ehca_qp=%p qp_num=%x wqe_cnt=%d ret=%i",
+ my_qp, my_qp->real_qp_num, wqe_cnt, ret);
+ spin_unlock_irqrestore(&my_qp->spinlock_r, flags);
+
+out:
+ if (ret)
+ *bad_recv_wr = recv_wr;
+
+ return ret;
+}
+
+int ehca_post_recv(struct ib_qp *qp,
+ struct ib_recv_wr *recv_wr,
+ struct ib_recv_wr **bad_recv_wr)
+{
+ struct ehca_qp *my_qp = container_of(qp, struct ehca_qp, ib_qp);
+
+ /* Reject WR if QP is in RESET state */
+ if (unlikely(my_qp->state == IB_QPS_RESET)) {
+ ehca_err(qp->device, "Invalid QP state qp_state=%d qpn=%x",
+ my_qp->state, qp->qp_num);
+ *bad_recv_wr = recv_wr;
+ return -EINVAL;
+ }
+
+ return internal_post_recv(my_qp, qp->device, recv_wr, bad_recv_wr);
+}
+
+int ehca_post_srq_recv(struct ib_srq *srq,
+ struct ib_recv_wr *recv_wr,
+ struct ib_recv_wr **bad_recv_wr)
+{
+ return internal_post_recv(container_of(srq, struct ehca_qp, ib_srq),
+ srq->device, recv_wr, bad_recv_wr);
+}
+
+/*
+ * ib_wc_opcode table converts an ehca wc opcode into the corresponding ib
+ * opcode. Zero marks an invalid opcode, so every valid entry is stored
+ * incremented by one and must be decremented by one when read.
+ */
+static const u8 ib_wc_opcode[255] = {
+ [0x01] = IB_WC_RECV+1,
+ [0x02] = IB_WC_RECV_RDMA_WITH_IMM+1,
+ [0x04] = IB_WC_BIND_MW+1,
+ [0x08] = IB_WC_FETCH_ADD+1,
+ [0x10] = IB_WC_COMP_SWAP+1,
+ [0x20] = IB_WC_RDMA_WRITE+1,
+ [0x40] = IB_WC_RDMA_READ+1,
+ [0x80] = IB_WC_SEND+1
+};
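+
+/*
+ * Illustrative sketch, not part of the original driver: this mirrors the
+ * table lookup done in ehca_poll_cq_one() below. A zero entry marks an
+ * unknown optype; valid entries are stored incremented by one.
+ */
+#if 0
+static inline int example_map_optype(u8 optype, enum ib_wc_opcode *opcode)
+{
+	u8 entry = ib_wc_opcode[optype];
+
+	if (!entry)
+		return -EINVAL;	/* unknown optype; the caller skips this CQE */
+	*opcode = entry - 1;
+	return 0;
+}
+#endif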
+
+/* internal function to poll one entry of cq */
+static inline int ehca_poll_cq_one(struct ib_cq *cq, struct ib_wc *wc)
+{
+ int ret = 0, qmap_tail_idx;
+ struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
+ struct ehca_cqe *cqe;
+ struct ehca_qp *my_qp;
+ struct ehca_qmap_entry *qmap_entry;
+ struct ehca_queue_map *qmap;
+ int cqe_count = 0, is_error;
+
+repoll:
+ cqe = (struct ehca_cqe *)
+ ipz_qeit_get_inc_valid(&my_cq->ipz_queue);
+ if (!cqe) {
+ ret = -EAGAIN;
+ if (ehca_debug_level >= 3)
+ ehca_dbg(cq->device, "Completion queue is empty "
+ "my_cq=%p cq_num=%x", my_cq, my_cq->cq_number);
+ goto poll_cq_one_exit0;
+ }
+
+ /* prevents loads being reordered across this point */
+ rmb();
+
+ cqe_count++;
+ if (unlikely(cqe->status & WC_STATUS_PURGE_BIT)) {
+ struct ehca_qp *qp;
+ int purgeflag;
+ unsigned long flags;
+
+ qp = ehca_cq_get_qp(my_cq, cqe->local_qp_number);
+ if (!qp) {
+ ehca_err(cq->device, "cq_num=%x qp_num=%x "
+ "could not find qp -> ignore cqe",
+ my_cq->cq_number, cqe->local_qp_number);
+ ehca_dmp(cqe, 64, "cq_num=%x qp_num=%x",
+ my_cq->cq_number, cqe->local_qp_number);
+ /* ignore this purged cqe */
+ goto repoll;
+ }
+ spin_lock_irqsave(&qp->spinlock_s, flags);
+ purgeflag = qp->sqerr_purgeflag;
+ spin_unlock_irqrestore(&qp->spinlock_s, flags);
+
+ if (purgeflag) {
+ ehca_dbg(cq->device,
+ "Got CQE with purged bit qp_num=%x src_qp=%x",
+ cqe->local_qp_number, cqe->remote_qp_number);
+ if (ehca_debug_level >= 2)
+ ehca_dmp(cqe, 64, "qp_num=%x src_qp=%x",
+ cqe->local_qp_number,
+ cqe->remote_qp_number);
+ /*
+ * ignore this CQE to avoid reporting a duplicate completion
+ * for the bad WQE that caused the send queue error, and
+ * clear the purge flag
+ */
+ qp->sqerr_purgeflag = 0;
+ goto repoll;
+ }
+ }
+
+ is_error = cqe->status & WC_STATUS_ERROR_BIT;
+
+ /* trace error CQEs if debug_level >= 1, trace all CQEs if >= 3 */
+ if (unlikely(ehca_debug_level >= 3 || (ehca_debug_level && is_error))) {
+ ehca_dbg(cq->device,
+ "Received %sCOMPLETION ehca_cq=%p cq_num=%x -----",
+ is_error ? "ERROR " : "", my_cq, my_cq->cq_number);
+ ehca_dmp(cqe, 64, "ehca_cq=%p cq_num=%x",
+ my_cq, my_cq->cq_number);
+ ehca_dbg(cq->device,
+ "ehca_cq=%p cq_num=%x -------------------------",
+ my_cq, my_cq->cq_number);
+ }
+
+ read_lock(&ehca_qp_idr_lock);
+ my_qp = idr_find(&ehca_qp_idr, cqe->qp_token);
+ read_unlock(&ehca_qp_idr_lock);
+ if (!my_qp)
+ goto repoll;
+ wc->qp = &my_qp->ib_qp;
+
+ qmap_tail_idx = get_app_wr_id(cqe->work_request_id);
+ if (!(cqe->w_completion_flags & WC_SEND_RECEIVE_BIT))
+ /* We got a send completion. */
+ qmap = &my_qp->sq_map;
+ else
+ /* We got a receive completion. */
+ qmap = &my_qp->rq_map;
+
+ /* advance the tail pointer */
+ qmap->tail = qmap_tail_idx;
+
+ if (is_error) {
+ /*
+ * set left_to_poll to 0 because in error state, we will not
+ * get any additional CQEs
+ */
+ my_qp->sq_map.next_wqe_idx = next_index(my_qp->sq_map.tail,
+ my_qp->sq_map.entries);
+ my_qp->sq_map.left_to_poll = 0;
+ ehca_add_to_err_list(my_qp, 1);
+
+ my_qp->rq_map.next_wqe_idx = next_index(my_qp->rq_map.tail,
+ my_qp->rq_map.entries);
+ my_qp->rq_map.left_to_poll = 0;
+ if (HAS_RQ(my_qp))
+ ehca_add_to_err_list(my_qp, 0);
+ }
+
+ qmap_entry = &qmap->map[qmap_tail_idx];
+ if (qmap_entry->reported) {
+ ehca_warn(cq->device, "Double cqe on qp_num=%#x",
+ my_qp->real_qp_num);
+ /* found a double cqe, discard it and read next one */
+ goto repoll;
+ }
+
+ wc->wr_id = replace_wr_id(cqe->work_request_id, qmap_entry->app_wr_id);
+ qmap_entry->reported = 1;
+
+ /* if left_to_poll is decremented to 0, add the QP to the error list */
+ if (qmap->left_to_poll > 0) {
+ qmap->left_to_poll--;
+ if ((my_qp->sq_map.left_to_poll == 0) &&
+ (my_qp->rq_map.left_to_poll == 0)) {
+ ehca_add_to_err_list(my_qp, 1);
+ if (HAS_RQ(my_qp))
+ ehca_add_to_err_list(my_qp, 0);
+ }
+ }
+
+ /* eval ib_wc_opcode */
+ wc->opcode = ib_wc_opcode[cqe->optype]-1;
+ if (unlikely(wc->opcode == -1)) {
+ ehca_err(cq->device, "Invalid cqe->OPType=%x cqe->status=%x "
+ "ehca_cq=%p cq_num=%x",
+ cqe->optype, cqe->status, my_cq, my_cq->cq_number);
+ /* dump cqe for other infos */
+ ehca_dmp(cqe, 64, "ehca_cq=%p cq_num=%x",
+ my_cq, my_cq->cq_number);
+ /* the queue adder is still updated for this entry at exit, so it is simply discarded */
+ goto repoll;
+ }
+
+ /* eval ib_wc_status */
+ if (unlikely(is_error)) {
+ /* complete with errors */
+ map_ib_wc_status(cqe->status, &wc->status);
+ wc->vendor_err = wc->status;
+ } else
+ wc->status = IB_WC_SUCCESS;
+
+ wc->byte_len = cqe->nr_bytes_transferred;
+ wc->pkey_index = cqe->pkey_index;
+ wc->slid = cqe->rlid;
+ wc->dlid_path_bits = cqe->dlid;
+ wc->src_qp = cqe->remote_qp_number;
+ /*
+ * HW has "Immed data present" and "GRH present" in bits 6 and 5.
+ * SW defines those in bits 1 and 0, so we can just shift and mask.
+ */
+ wc->wc_flags = (cqe->w_completion_flags >> 5) & 3;
+ wc->ex.imm_data = cpu_to_be32(cqe->immediate_data);
+ wc->sl = cqe->service_level;
+
+poll_cq_one_exit0:
+ if (cqe_count > 0)
+ hipz_update_feca(my_cq, cqe_count);
+
+ return ret;
+}
+
+static int generate_flush_cqes(struct ehca_qp *my_qp, struct ib_cq *cq,
+ struct ib_wc *wc, int num_entries,
+ struct ipz_queue *ipz_queue, int on_sq)
+{
+ int nr = 0;
+ struct ehca_wqe *wqe;
+ u64 offset;
+ struct ehca_queue_map *qmap;
+ struct ehca_qmap_entry *qmap_entry;
+
+ if (on_sq)
+ qmap = &my_qp->sq_map;
+ else
+ qmap = &my_qp->rq_map;
+
+ qmap_entry = &qmap->map[qmap->next_wqe_idx];
+
+ while ((nr < num_entries) && (qmap_entry->reported == 0)) {
+ /* generate flush CQE */
+
+ memset(wc, 0, sizeof(*wc));
+
+ offset = qmap->next_wqe_idx * ipz_queue->qe_size;
+ wqe = (struct ehca_wqe *)ipz_qeit_calc(ipz_queue, offset);
+ if (!wqe) {
+ ehca_err(cq->device, "Invalid wqe offset=%#llx on "
+ "qp_num=%#x", offset, my_qp->real_qp_num);
+ return nr;
+ }
+
+ wc->wr_id = replace_wr_id(wqe->work_request_id,
+ qmap_entry->app_wr_id);
+
+ if (on_sq) {
+ switch (wqe->optype) {
+ case WQE_OPTYPE_SEND:
+ wc->opcode = IB_WC_SEND;
+ break;
+ case WQE_OPTYPE_RDMAWRITE:
+ wc->opcode = IB_WC_RDMA_WRITE;
+ break;
+ case WQE_OPTYPE_RDMAREAD:
+ wc->opcode = IB_WC_RDMA_READ;
+ break;
+ default:
+ ehca_err(cq->device, "Invalid optype=%x",
+ wqe->optype);
+ return nr;
+ }
+ } else
+ wc->opcode = IB_WC_RECV;
+
+ if (wqe->wr_flag & WQE_WRFLAG_IMM_DATA_PRESENT) {
+ wc->ex.imm_data = wqe->immediate_data;
+ wc->wc_flags |= IB_WC_WITH_IMM;
+ }
+
+ wc->status = IB_WC_WR_FLUSH_ERR;
+
+ wc->qp = &my_qp->ib_qp;
+
+ /* mark as reported and advance next_wqe pointer */
+ qmap_entry->reported = 1;
+ qmap->next_wqe_idx = next_index(qmap->next_wqe_idx,
+ qmap->entries);
+ qmap_entry = &qmap->map[qmap->next_wqe_idx];
+
+ wc++; nr++;
+ }
+
+ return nr;
+
+}
+
+int ehca_poll_cq(struct ib_cq *cq, int num_entries, struct ib_wc *wc)
+{
+ struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
+ int nr;
+ struct ehca_qp *err_qp;
+ struct ib_wc *current_wc = wc;
+ int ret = 0;
+ unsigned long flags;
+ int entries_left = num_entries;
+
+ if (num_entries < 1) {
+ ehca_err(cq->device, "Invalid num_entries=%d ehca_cq=%p "
+ "cq_num=%x", num_entries, my_cq, my_cq->cq_number);
+ ret = -EINVAL;
+ goto poll_cq_exit0;
+ }
+
+ spin_lock_irqsave(&my_cq->spinlock, flags);
+
+ /* generate flush cqes for send queues */
+ list_for_each_entry(err_qp, &my_cq->sqp_err_list, sq_err_node) {
+ nr = generate_flush_cqes(err_qp, cq, current_wc, entries_left,
+ &err_qp->ipz_squeue, 1);
+ entries_left -= nr;
+ current_wc += nr;
+
+ if (entries_left == 0)
+ break;
+ }
+
+ /* generate flush cqes for receive queues */
+ list_for_each_entry(err_qp, &my_cq->rqp_err_list, rq_err_node) {
+ nr = generate_flush_cqes(err_qp, cq, current_wc, entries_left,
+ &err_qp->ipz_rqueue, 0);
+ entries_left -= nr;
+ current_wc += nr;
+
+ if (entries_left == 0)
+ break;
+ }
+
+ for (nr = 0; nr < entries_left; nr++) {
+ ret = ehca_poll_cq_one(cq, current_wc);
+ if (ret)
+ break;
+ current_wc++;
+ } /* eof for nr */
+ entries_left -= nr;
+
+ spin_unlock_irqrestore(&my_cq->spinlock, flags);
+ if (ret == -EAGAIN || !ret)
+ ret = num_entries - entries_left;
+
+poll_cq_exit0:
+ return ret;
+}
+
+int ehca_req_notify_cq(struct ib_cq *cq, enum ib_cq_notify_flags notify_flags)
+{
+ struct ehca_cq *my_cq = container_of(cq, struct ehca_cq, ib_cq);
+ int ret = 0;
+
+ switch (notify_flags & IB_CQ_SOLICITED_MASK) {
+ case IB_CQ_SOLICITED:
+ hipz_set_cqx_n0(my_cq, 1);
+ break;
+ case IB_CQ_NEXT_COMP:
+ hipz_set_cqx_n1(my_cq, 1);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (notify_flags & IB_CQ_REPORT_MISSED_EVENTS) {
+ unsigned long spl_flags;
+ spin_lock_irqsave(&my_cq->spinlock, spl_flags);
+ ret = ipz_qeit_is_valid(&my_cq->ipz_queue);
+ spin_unlock_irqrestore(&my_cq->spinlock, spl_flags);
+ }
+
+ return ret;
+}
diff --git a/drivers/infiniband/hw/ehca/ehca_sqp.c b/drivers/infiniband/hw/ehca/ehca_sqp.c
new file mode 100644
index 000000000..dba8f9f8b
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_sqp.c
@@ -0,0 +1,237 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * SQP functions
+ *
+ * Authors: Khadija Souissi <souissi@de.ibm.com>
+ * Heiko J Schick <schickhj@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <rdma/ib_mad.h>
+
+#include "ehca_classes.h"
+#include "ehca_tools.h"
+#include "ehca_iverbs.h"
+#include "hcp_if.h"
+
+#define IB_MAD_STATUS_REDIRECT cpu_to_be16(0x0002)
+#define IB_MAD_STATUS_UNSUP_VERSION cpu_to_be16(0x0004)
+#define IB_MAD_STATUS_UNSUP_METHOD cpu_to_be16(0x0008)
+
+#define IB_PMA_CLASS_PORT_INFO cpu_to_be16(0x0001)
+
+/**
+ * ehca_define_sqp - Defines special queue pair 1 (GSI QP). Once the special
+ * queue pair has been created successfully, the corresponding port becomes
+ * active.
+ *
+ * Defining special queue pair 0 (SMI QP) is not supported yet.
+ *
+ * @qp_init_attr: Queue pair init attributes with port and queue pair type
+ */
+
+u64 ehca_define_sqp(struct ehca_shca *shca,
+ struct ehca_qp *ehca_qp,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ u32 pma_qp_nr, bma_qp_nr;
+ u64 ret;
+ u8 port = qp_init_attr->port_num;
+ int counter;
+
+ shca->sport[port - 1].port_state = IB_PORT_DOWN;
+
+ switch (qp_init_attr->qp_type) {
+ case IB_QPT_SMI:
+ /* function not supported yet */
+ break;
+ case IB_QPT_GSI:
+ ret = hipz_h_define_aqp1(shca->ipz_hca_handle,
+ ehca_qp->ipz_qp_handle,
+ ehca_qp->galpas.kernel,
+ (u32) qp_init_attr->port_num,
+ &pma_qp_nr, &bma_qp_nr);
+
+ if (ret != H_SUCCESS) {
+ ehca_err(&shca->ib_device,
+ "Can't define AQP1 for port %x. h_ret=%lli",
+ port, ret);
+ return ret;
+ }
+ shca->sport[port - 1].pma_qp_nr = pma_qp_nr;
+ ehca_dbg(&shca->ib_device, "port=%x pma_qp_nr=%x",
+ port, pma_qp_nr);
+ break;
+ default:
+ ehca_err(&shca->ib_device, "invalid qp_type=%x",
+ qp_init_attr->qp_type);
+ return H_PARAMETER;
+ }
+
+ if (ehca_nr_ports < 0) /* autodetect mode */
+ return H_SUCCESS;
+
+ for (counter = 0;
+ shca->sport[port - 1].port_state != IB_PORT_ACTIVE &&
+ counter < ehca_port_act_time;
+ counter++) {
+ ehca_dbg(&shca->ib_device, "... wait until port %x is active",
+ port);
+ msleep_interruptible(1000);
+ }
+
+ if (counter == ehca_port_act_time) {
+ ehca_err(&shca->ib_device, "Port %x is not active.", port);
+ return H_HARDWARE;
+ }
+
+ return H_SUCCESS;
+}
+
+struct ib_perf {
+ struct ib_mad_hdr mad_hdr;
+ u8 reserved[40];
+ u8 data[192];
+} __attribute__ ((packed));
+
+/* TC/SL/FL packed into 32 bits, as in ClassPortInfo */
+struct tcslfl {
+ u32 tc:8;
+ u32 sl:4;
+ u32 fl:20;
+} __attribute__ ((packed));
+
+/* IP Version/TC/FL packed into 32 bits, as in GRH */
+struct vertcfl {
+ u32 ver:4;
+ u32 tc:8;
+ u32 fl:20;
+} __attribute__ ((packed));
+
+static int ehca_process_perf(struct ib_device *ibdev, u8 port_num,
+ struct ib_wc *in_wc, struct ib_grh *in_grh,
+ struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+ struct ib_perf *in_perf = (struct ib_perf *)in_mad;
+ struct ib_perf *out_perf = (struct ib_perf *)out_mad;
+ struct ib_class_port_info *poi =
+ (struct ib_class_port_info *)out_perf->data;
+ struct tcslfl *tcslfl =
+ (struct tcslfl *)&poi->redirect_tcslfl;
+ struct ehca_shca *shca =
+ container_of(ibdev, struct ehca_shca, ib_device);
+ struct ehca_sport *sport = &shca->sport[port_num - 1];
+
+ ehca_dbg(ibdev, "method=%x", in_perf->mad_hdr.method);
+
+ *out_mad = *in_mad;
+
+ if (in_perf->mad_hdr.class_version != 1) {
+ ehca_warn(ibdev, "Unsupported class_version=%x",
+ in_perf->mad_hdr.class_version);
+ out_perf->mad_hdr.status = IB_MAD_STATUS_UNSUP_VERSION;
+ goto perf_reply;
+ }
+
+ switch (in_perf->mad_hdr.method) {
+ case IB_MGMT_METHOD_GET:
+ case IB_MGMT_METHOD_SET:
+ /* set class port info for redirection */
+ out_perf->mad_hdr.attr_id = IB_PMA_CLASS_PORT_INFO;
+ out_perf->mad_hdr.status = IB_MAD_STATUS_REDIRECT;
+ memset(poi, 0, sizeof(*poi));
+ poi->base_version = 1;
+ poi->class_version = 1;
+ poi->resp_time_value = 18;
+
+ /* copy local routing information from WC where applicable */
+ tcslfl->sl = in_wc->sl;
+ poi->redirect_lid =
+ sport->saved_attr.lid | in_wc->dlid_path_bits;
+ poi->redirect_qp = sport->pma_qp_nr;
+ poi->redirect_qkey = IB_QP1_QKEY;
+
+ ehca_query_pkey(ibdev, port_num, in_wc->pkey_index,
+ &poi->redirect_pkey);
+
+ /* if request was globally routed, copy route info */
+ if (in_grh) {
+ struct vertcfl *vertcfl =
+ (struct vertcfl *)&in_grh->version_tclass_flow;
+ memcpy(poi->redirect_gid, in_grh->dgid.raw,
+ sizeof(poi->redirect_gid));
+ tcslfl->tc = vertcfl->tc;
+ tcslfl->fl = vertcfl->fl;
+ } else
+ /* else only fill in default GID */
+ ehca_query_gid(ibdev, port_num, 0,
+ (union ib_gid *)&poi->redirect_gid);
+
+ ehca_dbg(ibdev, "ehca_pma_lid=%x ehca_pma_qp=%x",
+ sport->saved_attr.lid, sport->pma_qp_nr);
+ break;
+
+ case IB_MGMT_METHOD_GET_RESP:
+ return IB_MAD_RESULT_FAILURE;
+
+ default:
+ out_perf->mad_hdr.status = IB_MAD_STATUS_UNSUP_METHOD;
+ break;
+ }
+
+perf_reply:
+ out_perf->mad_hdr.method = IB_MGMT_METHOD_GET_RESP;
+
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+int ehca_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+ struct ib_wc *in_wc, struct ib_grh *in_grh,
+ struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+ int ret;
+
+ if (!port_num || port_num > ibdev->phys_port_cnt || !in_wc)
+ return IB_MAD_RESULT_FAILURE;
+
+ /* accept only pma request */
+ if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT)
+ return IB_MAD_RESULT_SUCCESS;
+
+ ehca_dbg(ibdev, "port_num=%x src_qp=%x", port_num, in_wc->src_qp);
+ ret = ehca_process_perf(ibdev, port_num, in_wc, in_grh,
+ in_mad, out_mad);
+
+ return ret;
+}
diff --git a/drivers/infiniband/hw/ehca/ehca_tools.h b/drivers/infiniband/hw/ehca/ehca_tools.h
new file mode 100644
index 000000000..d280b12aa
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_tools.h
@@ -0,0 +1,155 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * auxiliary functions
+ *
+ * Authors: Christoph Raisch <raisch@de.ibm.com>
+ * Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ * Khadija Souissi <souissik@de.ibm.com>
+ * Waleri Fomin <fomin@de.ibm.com>
+ * Heiko J Schick <schickhj@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef EHCA_TOOLS_H
+#define EHCA_TOOLS_H
+
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/idr.h>
+#include <linux/kthread.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/vmalloc.h>
+#include <linux/notifier.h>
+#include <linux/cpu.h>
+#include <linux/device.h>
+
+#include <linux/atomic.h>
+#include <asm/ibmebus.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/hvcall.h>
+
+extern int ehca_debug_level;
+
+#define ehca_dbg(ib_dev, format, arg...) \
+ do { \
+ if (unlikely(ehca_debug_level)) \
+ dev_printk(KERN_DEBUG, (ib_dev)->dma_device, \
+ "PU%04x EHCA_DBG:%s " format "\n", \
+ raw_smp_processor_id(), __func__, \
+ ## arg); \
+ } while (0)
+
+#define ehca_info(ib_dev, format, arg...) \
+ dev_info((ib_dev)->dma_device, "PU%04x EHCA_INFO:%s " format "\n", \
+ raw_smp_processor_id(), __func__, ## arg)
+
+#define ehca_warn(ib_dev, format, arg...) \
+ dev_warn((ib_dev)->dma_device, "PU%04x EHCA_WARN:%s " format "\n", \
+ raw_smp_processor_id(), __func__, ## arg)
+
+#define ehca_err(ib_dev, format, arg...) \
+ dev_err((ib_dev)->dma_device, "PU%04x EHCA_ERR:%s " format "\n", \
+ raw_smp_processor_id(), __func__, ## arg)
+
+/* use the ehca_gen_* variants below only if no ib_dev is available */
+#define ehca_gen_dbg(format, arg...) \
+ do { \
+ if (unlikely(ehca_debug_level)) \
+ printk(KERN_DEBUG "PU%04x EHCA_DBG:%s " format "\n", \
+ raw_smp_processor_id(), __func__, ## arg); \
+ } while (0)
+
+#define ehca_gen_warn(format, arg...) \
+ printk(KERN_INFO "PU%04x EHCA_WARN:%s " format "\n", \
+ raw_smp_processor_id(), __func__, ## arg)
+
+#define ehca_gen_err(format, arg...) \
+ printk(KERN_ERR "PU%04x EHCA_ERR:%s " format "\n", \
+ raw_smp_processor_id(), __func__, ## arg)
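+
+/*
+ * Usage sketch (illustrative only, not part of the original header): the
+ * ehca_* macros need an ib_device for dev_printk(); the ehca_gen_*
+ * variants work without one.
+ */
+#if 0
+static void example_logging(struct ib_device *ibdev)
+{
+	/* emitted only when ehca_debug_level is non-zero */
+	ehca_dbg(ibdev, "qp_num=%x", 0x42);
+	/* no ib_device needed */
+	ehca_gen_err("allocation failed rc=%i", -ENOMEM);
+}
+#endif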
+
+/**
+ * ehca_dmp - printk a memory block whose length is a multiple of 8 bytes,
+ * dumped 16 bytes per line. Each line has the following layout:
+ * <format string> adr=X ofs=Y <8 bytes hex> <8 bytes hex>
+ */
+#define ehca_dmp(adr, len, format, args...) \
+ do { \
+ unsigned int x; \
+ unsigned int l = (unsigned int)(len); \
+ unsigned char *deb = (unsigned char *)(adr); \
+ for (x = 0; x < l; x += 16) { \
+ printk(KERN_INFO "EHCA_DMP:%s " format \
+ " adr=%p ofs=%04x %016llx %016llx\n", \
+ __func__, ##args, deb, x, \
+ *((u64 *)&deb[0]), *((u64 *)&deb[8])); \
+ deb += 16; \
+ } \
+ } while (0)
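+
+/*
+ * Usage sketch (illustrative only): dump the first 64 bytes of a buffer,
+ * 16 bytes per line, the way ehca_poll_cq_one() dumps CQEs.
+ */
+#if 0
+static void example_dump(void *buf, u32 cq_number)
+{
+	ehca_dmp(buf, 64, "cq_num=%x", cq_number);
+}
+#endif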
+
+/* define a bitmask, little endian version */
+#define EHCA_BMASK(pos, length) (((pos) << 16) + (length))
+
+/* define a bitmask, the ibm way... */
+#define EHCA_BMASK_IBM(from, to) (((63 - to) << 16) + ((to) - (from) + 1))
+
+/* internal function, don't use */
+#define EHCA_BMASK_SHIFTPOS(mask) (((mask) >> 16) & 0xffff)
+
+/* internal function, don't use */
+#define EHCA_BMASK_MASK(mask) (~0ULL >> ((64 - (mask)) & 0xffff))
+
+/**
+ * EHCA_BMASK_SET - return value shifted and masked by mask
+ * variable |= EHCA_BMASK_SET(MY_MASK, 0x4711) ORs the bits into variable
+ * variable &= ~EHCA_BMASK_SET(MY_MASK, -1) clears the bits of the mask
+ * in variable
+ */
+#define EHCA_BMASK_SET(mask, value) \
+ ((EHCA_BMASK_MASK(mask) & ((u64)(value))) << EHCA_BMASK_SHIFTPOS(mask))
+
+/**
+ * EHCA_BMASK_GET - extract a parameter from value by mask
+ */
+#define EHCA_BMASK_GET(mask, value) \
+ (EHCA_BMASK_MASK(mask) & (((u64)(value)) >> EHCA_BMASK_SHIFTPOS(mask)))
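+
+/*
+ * Illustrative sketch, not part of the original header: pack and unpack a
+ * hypothetical 8-bit field occupying IBM bits 40..47 of a 64-bit hcall
+ * register, the way hcp_if.c uses its H_ALL_RES_QP_* masks.
+ */
+#if 0
+#define EXAMPLE_FIELD EHCA_BMASK_IBM(40, 47)
+
+static inline u64 example_pack(u8 val)
+{
+	return EHCA_BMASK_SET(EXAMPLE_FIELD, val);
+}
+
+static inline u8 example_unpack(u64 reg)
+{
+	return (u8)EHCA_BMASK_GET(EXAMPLE_FIELD, reg);
+}
+#endif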
+
+/* Converts ehca to ib return code */
+int ehca2ib_return_code(u64 ehca_rc);
+
+#endif /* EHCA_TOOLS_H */
diff --git a/drivers/infiniband/hw/ehca/ehca_uverbs.c b/drivers/infiniband/hw/ehca/ehca_uverbs.c
new file mode 100644
index 000000000..1a1d5d99f
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ehca_uverbs.c
@@ -0,0 +1,309 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * userspace support verbs
+ *
+ * Authors: Christoph Raisch <raisch@de.ibm.com>
+ * Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ * Heiko J Schick <schickhj@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_classes.h"
+#include "ehca_iverbs.h"
+#include "ehca_mrmw.h"
+#include "ehca_tools.h"
+#include "hcp_if.h"
+
+struct ib_ucontext *ehca_alloc_ucontext(struct ib_device *device,
+ struct ib_udata *udata)
+{
+ struct ehca_ucontext *my_context;
+
+ my_context = kzalloc(sizeof *my_context, GFP_KERNEL);
+ if (!my_context) {
+ ehca_err(device, "Out of memory device=%p", device);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return &my_context->ib_ucontext;
+}
+
+int ehca_dealloc_ucontext(struct ib_ucontext *context)
+{
+ kfree(container_of(context, struct ehca_ucontext, ib_ucontext));
+ return 0;
+}
+
+static void ehca_mm_open(struct vm_area_struct *vma)
+{
+ u32 *count = (u32 *)vma->vm_private_data;
+ if (!count) {
+ ehca_gen_err("Invalid vma struct vm_start=%lx vm_end=%lx",
+ vma->vm_start, vma->vm_end);
+ return;
+ }
+ (*count)++;
+ if (!(*count))
+ ehca_gen_err("Use count overflow vm_start=%lx vm_end=%lx",
+ vma->vm_start, vma->vm_end);
+ ehca_gen_dbg("vm_start=%lx vm_end=%lx count=%x",
+ vma->vm_start, vma->vm_end, *count);
+}
+
+static void ehca_mm_close(struct vm_area_struct *vma)
+{
+ u32 *count = (u32 *)vma->vm_private_data;
+ if (!count) {
+ ehca_gen_err("Invalid vma struct vm_start=%lx vm_end=%lx",
+ vma->vm_start, vma->vm_end);
+ return;
+ }
+ (*count)--;
+ ehca_gen_dbg("vm_start=%lx vm_end=%lx count=%x",
+ vma->vm_start, vma->vm_end, *count);
+}
+
+static const struct vm_operations_struct vm_ops = {
+ .open = ehca_mm_open,
+ .close = ehca_mm_close,
+};
+
+static int ehca_mmap_fw(struct vm_area_struct *vma, struct h_galpas *galpas,
+ u32 *mm_count)
+{
+ int ret;
+ u64 vsize, physical;
+
+ vsize = vma->vm_end - vma->vm_start;
+ if (vsize < EHCA_PAGESIZE) {
+ ehca_gen_err("invalid vsize=%lx", vma->vm_end - vma->vm_start);
+ return -EINVAL;
+ }
+
+ physical = galpas->user.fw_handle;
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+ ehca_gen_dbg("vsize=%llx physical=%llx", vsize, physical);
+ /* VM_IO | VM_DONTEXPAND | VM_DONTDUMP are set by remap_pfn_range() */
+ ret = remap_4k_pfn(vma, vma->vm_start, physical >> EHCA_PAGESHIFT,
+ vma->vm_page_prot);
+ if (unlikely(ret)) {
+ ehca_gen_err("remap_pfn_range() failed ret=%i", ret);
+ return -ENOMEM;
+ }
+
+ vma->vm_private_data = mm_count;
+ (*mm_count)++;
+ vma->vm_ops = &vm_ops;
+
+ return 0;
+}
+
+static int ehca_mmap_queue(struct vm_area_struct *vma, struct ipz_queue *queue,
+ u32 *mm_count)
+{
+ int ret;
+ u64 start, ofs;
+ struct page *page;
+
+ vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+ start = vma->vm_start;
+ for (ofs = 0; ofs < queue->queue_length; ofs += PAGE_SIZE) {
+ u64 virt_addr = (u64)ipz_qeit_calc(queue, ofs);
+ page = virt_to_page(virt_addr);
+ ret = vm_insert_page(vma, start, page);
+ if (unlikely(ret)) {
+ ehca_gen_err("vm_insert_page() failed rc=%i", ret);
+ return ret;
+ }
+ start += PAGE_SIZE;
+ }
+ vma->vm_private_data = mm_count;
+ (*mm_count)++;
+ vma->vm_ops = &vm_ops;
+
+ return 0;
+}
+
+static int ehca_mmap_cq(struct vm_area_struct *vma, struct ehca_cq *cq,
+ u32 rsrc_type)
+{
+ int ret;
+
+ switch (rsrc_type) {
+ case 0: /* galpa fw handle */
+ ehca_dbg(cq->ib_cq.device, "cq_num=%x fw", cq->cq_number);
+ ret = ehca_mmap_fw(vma, &cq->galpas, &cq->mm_count_galpa);
+ if (unlikely(ret)) {
+ ehca_err(cq->ib_cq.device,
+ "ehca_mmap_fw() failed rc=%i cq_num=%x",
+ ret, cq->cq_number);
+ return ret;
+ }
+ break;
+
+ case 1: /* cq queue_addr */
+ ehca_dbg(cq->ib_cq.device, "cq_num=%x queue", cq->cq_number);
+ ret = ehca_mmap_queue(vma, &cq->ipz_queue, &cq->mm_count_queue);
+ if (unlikely(ret)) {
+ ehca_err(cq->ib_cq.device,
+ "ehca_mmap_queue() failed rc=%i cq_num=%x",
+ ret, cq->cq_number);
+ return ret;
+ }
+ break;
+
+ default:
+ ehca_err(cq->ib_cq.device, "bad resource type=%x cq_num=%x",
+ rsrc_type, cq->cq_number);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int ehca_mmap_qp(struct vm_area_struct *vma, struct ehca_qp *qp,
+ u32 rsrc_type)
+{
+ int ret;
+
+ switch (rsrc_type) {
+ case 0: /* galpa fw handle */
+ ehca_dbg(qp->ib_qp.device, "qp_num=%x fw", qp->ib_qp.qp_num);
+ ret = ehca_mmap_fw(vma, &qp->galpas, &qp->mm_count_galpa);
+ if (unlikely(ret)) {
+ ehca_err(qp->ib_qp.device,
+ "remap_pfn_range() failed ret=%i qp_num=%x",
+ ret, qp->ib_qp.qp_num);
+ return -ENOMEM;
+ }
+ break;
+
+ case 1: /* qp rqueue_addr */
+ ehca_dbg(qp->ib_qp.device, "qp_num=%x rq", qp->ib_qp.qp_num);
+ ret = ehca_mmap_queue(vma, &qp->ipz_rqueue,
+ &qp->mm_count_rqueue);
+ if (unlikely(ret)) {
+ ehca_err(qp->ib_qp.device,
+ "ehca_mmap_queue(rq) failed rc=%i qp_num=%x",
+ ret, qp->ib_qp.qp_num);
+ return ret;
+ }
+ break;
+
+ case 2: /* qp squeue_addr */
+ ehca_dbg(qp->ib_qp.device, "qp_num=%x sq", qp->ib_qp.qp_num);
+ ret = ehca_mmap_queue(vma, &qp->ipz_squeue,
+ &qp->mm_count_squeue);
+ if (unlikely(ret)) {
+ ehca_err(qp->ib_qp.device,
+ "ehca_mmap_queue(sq) failed rc=%i qp_num=%x",
+ ret, qp->ib_qp.qp_num);
+ return ret;
+ }
+ break;
+
+ default:
+ ehca_err(qp->ib_qp.device, "bad resource type=%x qp=num=%x",
+ rsrc_type, qp->ib_qp.qp_num);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
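+/*
+ * Layout of the vm_pgoff value decoded below (derived from the masks and
+ * shifts in ehca_mmap(); the encoding itself happens elsewhere in the
+ * driver, this helper is illustrative only):
+ *
+ *   bits  0..24  idr handle of the CQ/QP
+ *   bits 25..26  resource type (0 fw handle, 1 rqueue/queue, 2 squeue)
+ *   bit  27      queue type (0 = CQ, 1 = QP)
+ */
+#if 0
+static inline u64 example_mmap_pgoff(u32 idr_handle, u32 rsrc_type,
+				     u32 q_type)
+{
+	return (idr_handle & 0x1FFFFFF) |
+	       ((u64)(rsrc_type & 0x3) << 25) |
+	       ((u64)(q_type & 0x1) << 27);
+}
+#endif
+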
+int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+ u64 fileoffset = vma->vm_pgoff;
+ u32 idr_handle = fileoffset & 0x1FFFFFF;
+ u32 q_type = (fileoffset >> 27) & 0x1; /* CQ, QP,... */
+ u32 rsrc_type = (fileoffset >> 25) & 0x3; /* sq,rq,cmnd_window */
+ u32 ret;
+ struct ehca_cq *cq;
+ struct ehca_qp *qp;
+ struct ib_uobject *uobject;
+
+ switch (q_type) {
+ case 0: /* CQ */
+ read_lock(&ehca_cq_idr_lock);
+ cq = idr_find(&ehca_cq_idr, idr_handle);
+ read_unlock(&ehca_cq_idr_lock);
+
+ /* make sure this mmap really belongs to the authorized user */
+ if (!cq)
+ return -EINVAL;
+
+ if (!cq->ib_cq.uobject || cq->ib_cq.uobject->context != context)
+ return -EINVAL;
+
+ ret = ehca_mmap_cq(vma, cq, rsrc_type);
+ if (unlikely(ret)) {
+ ehca_err(cq->ib_cq.device,
+ "ehca_mmap_cq() failed rc=%i cq_num=%x",
+ ret, cq->cq_number);
+ return ret;
+ }
+ break;
+
+ case 1: /* QP */
+ read_lock(&ehca_qp_idr_lock);
+ qp = idr_find(&ehca_qp_idr, idr_handle);
+ read_unlock(&ehca_qp_idr_lock);
+
+ /* make sure this mmap really belongs to the authorized user */
+ if (!qp)
+ return -EINVAL;
+
+ uobject = IS_SRQ(qp) ? qp->ib_srq.uobject : qp->ib_qp.uobject;
+ if (!uobject || uobject->context != context)
+ return -EINVAL;
+
+ ret = ehca_mmap_qp(vma, qp, rsrc_type);
+ if (unlikely(ret)) {
+ ehca_err(qp->ib_qp.device,
+ "ehca_mmap_qp() failed rc=%i qp_num=%x",
+ ret, qp->ib_qp.qp_num);
+ return ret;
+ }
+ break;
+
+ default:
+ ehca_gen_err("bad queue type %x", q_type);
+ return -EINVAL;
+ }
+
+ return 0;
+}
diff --git a/drivers/infiniband/hw/ehca/hcp_if.c b/drivers/infiniband/hw/ehca/hcp_if.c
new file mode 100644
index 000000000..89517ffb4
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/hcp_if.c
@@ -0,0 +1,949 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * Firmware Infiniband Interface code for POWER
+ *
+ * Authors: Christoph Raisch <raisch@de.ibm.com>
+ * Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ * Joachim Fenkes <fenkes@de.ibm.com>
+ * Gerd Bayer <gerd.bayer@de.ibm.com>
+ * Waleri Fomin <fomin@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <asm/hvcall.h>
+#include "ehca_tools.h"
+#include "hcp_if.h"
+#include "hcp_phyp.h"
+#include "hipz_fns.h"
+#include "ipz_pt_fn.h"
+
+#define H_ALL_RES_QP_ENHANCED_OPS EHCA_BMASK_IBM(9, 11)
+#define H_ALL_RES_QP_PTE_PIN EHCA_BMASK_IBM(12, 12)
+#define H_ALL_RES_QP_SERVICE_TYPE EHCA_BMASK_IBM(13, 15)
+#define H_ALL_RES_QP_STORAGE EHCA_BMASK_IBM(16, 17)
+#define H_ALL_RES_QP_LL_RQ_CQE_POSTING EHCA_BMASK_IBM(18, 18)
+#define H_ALL_RES_QP_LL_SQ_CQE_POSTING EHCA_BMASK_IBM(19, 21)
+#define H_ALL_RES_QP_SIGNALING_TYPE EHCA_BMASK_IBM(22, 23)
+#define H_ALL_RES_QP_UD_AV_LKEY_CTRL EHCA_BMASK_IBM(31, 31)
+#define H_ALL_RES_QP_SMALL_SQ_PAGE_SIZE EHCA_BMASK_IBM(32, 35)
+#define H_ALL_RES_QP_SMALL_RQ_PAGE_SIZE EHCA_BMASK_IBM(36, 39)
+#define H_ALL_RES_QP_RESOURCE_TYPE EHCA_BMASK_IBM(56, 63)
+
+#define H_ALL_RES_QP_MAX_OUTST_SEND_WR EHCA_BMASK_IBM(0, 15)
+#define H_ALL_RES_QP_MAX_OUTST_RECV_WR EHCA_BMASK_IBM(16, 31)
+#define H_ALL_RES_QP_MAX_SEND_SGE EHCA_BMASK_IBM(32, 39)
+#define H_ALL_RES_QP_MAX_RECV_SGE EHCA_BMASK_IBM(40, 47)
+
+#define H_ALL_RES_QP_UD_AV_LKEY EHCA_BMASK_IBM(32, 63)
+#define H_ALL_RES_QP_SRQ_QP_TOKEN EHCA_BMASK_IBM(0, 31)
+#define H_ALL_RES_QP_SRQ_QP_HANDLE EHCA_BMASK_IBM(0, 64)
+#define H_ALL_RES_QP_SRQ_LIMIT EHCA_BMASK_IBM(48, 63)
+#define H_ALL_RES_QP_SRQ_QPN EHCA_BMASK_IBM(40, 63)
+
+#define H_ALL_RES_QP_ACT_OUTST_SEND_WR EHCA_BMASK_IBM(16, 31)
+#define H_ALL_RES_QP_ACT_OUTST_RECV_WR EHCA_BMASK_IBM(48, 63)
+#define H_ALL_RES_QP_ACT_SEND_SGE EHCA_BMASK_IBM(8, 15)
+#define H_ALL_RES_QP_ACT_RECV_SGE EHCA_BMASK_IBM(24, 31)
+
+#define H_ALL_RES_QP_SQUEUE_SIZE_PAGES EHCA_BMASK_IBM(0, 31)
+#define H_ALL_RES_QP_RQUEUE_SIZE_PAGES EHCA_BMASK_IBM(32, 63)
+
+#define H_MP_INIT_TYPE EHCA_BMASK_IBM(44, 47)
+#define H_MP_SHUTDOWN EHCA_BMASK_IBM(48, 48)
+#define H_MP_RESET_QKEY_CTR EHCA_BMASK_IBM(49, 49)
+
+#define HCALL4_REGS_FORMAT "r4=%lx r5=%lx r6=%lx r7=%lx"
+#define HCALL7_REGS_FORMAT HCALL4_REGS_FORMAT " r8=%lx r9=%lx r10=%lx"
+#define HCALL9_REGS_FORMAT HCALL7_REGS_FORMAT " r11=%lx r12=%lx"
+
+static DEFINE_SPINLOCK(hcall_lock);
+
+static long ehca_plpar_hcall_norets(unsigned long opcode,
+ unsigned long arg1,
+ unsigned long arg2,
+ unsigned long arg3,
+ unsigned long arg4,
+ unsigned long arg5,
+ unsigned long arg6,
+ unsigned long arg7)
+{
+ long ret;
+ int i, sleep_msecs;
+ unsigned long flags = 0;
+
+ if (unlikely(ehca_debug_level >= 2))
+ ehca_gen_dbg("opcode=%lx " HCALL7_REGS_FORMAT,
+ opcode, arg1, arg2, arg3, arg4, arg5, arg6, arg7);
+
+ for (i = 0; i < 5; i++) {
+ /* serialize hCalls to work around firmware issue */
+ if (ehca_lock_hcalls)
+ spin_lock_irqsave(&hcall_lock, flags);
+
+ ret = plpar_hcall_norets(opcode, arg1, arg2, arg3, arg4,
+ arg5, arg6, arg7);
+
+ if (ehca_lock_hcalls)
+ spin_unlock_irqrestore(&hcall_lock, flags);
+
+ if (H_IS_LONG_BUSY(ret)) {
+ sleep_msecs = get_longbusy_msecs(ret);
+ msleep_interruptible(sleep_msecs);
+ continue;
+ }
+
+ if (ret < H_SUCCESS)
+ ehca_gen_err("opcode=%lx ret=%li " HCALL7_REGS_FORMAT,
+ opcode, ret, arg1, arg2, arg3,
+ arg4, arg5, arg6, arg7);
+ else
+ if (unlikely(ehca_debug_level >= 2))
+ ehca_gen_dbg("opcode=%lx ret=%li", opcode, ret);
+
+ return ret;
+ }
+
+ return H_BUSY;
+}
+
+static long ehca_plpar_hcall9(unsigned long opcode,
+ unsigned long *outs, /* array of 9 outputs */
+ unsigned long arg1,
+ unsigned long arg2,
+ unsigned long arg3,
+ unsigned long arg4,
+ unsigned long arg5,
+ unsigned long arg6,
+ unsigned long arg7,
+ unsigned long arg8,
+ unsigned long arg9)
+{
+ long ret;
+ int i, sleep_msecs;
+ unsigned long flags = 0;
+
+ if (unlikely(ehca_debug_level >= 2))
+ ehca_gen_dbg("INPUT -- opcode=%lx " HCALL9_REGS_FORMAT, opcode,
+ arg1, arg2, arg3, arg4, arg5,
+ arg6, arg7, arg8, arg9);
+
+ for (i = 0; i < 5; i++) {
+ /* serialize hCalls to work around firmware issue */
+ if (ehca_lock_hcalls)
+ spin_lock_irqsave(&hcall_lock, flags);
+
+ ret = plpar_hcall9(opcode, outs,
+ arg1, arg2, arg3, arg4, arg5,
+ arg6, arg7, arg8, arg9);
+
+ if (ehca_lock_hcalls)
+ spin_unlock_irqrestore(&hcall_lock, flags);
+
+ if (H_IS_LONG_BUSY(ret)) {
+ sleep_msecs = get_longbusy_msecs(ret);
+ msleep_interruptible(sleep_msecs);
+ continue;
+ }
+
+ if (ret < H_SUCCESS) {
+ ehca_gen_err("INPUT -- opcode=%lx " HCALL9_REGS_FORMAT,
+ opcode, arg1, arg2, arg3, arg4, arg5,
+ arg6, arg7, arg8, arg9);
+ ehca_gen_err("OUTPUT -- ret=%li " HCALL9_REGS_FORMAT,
+ ret, outs[0], outs[1], outs[2], outs[3],
+ outs[4], outs[5], outs[6], outs[7],
+ outs[8]);
+ } else if (unlikely(ehca_debug_level >= 2))
+ ehca_gen_dbg("OUTPUT -- ret=%li " HCALL9_REGS_FORMAT,
+ ret, outs[0], outs[1], outs[2], outs[3],
+ outs[4], outs[5], outs[6], outs[7],
+ outs[8]);
+ return ret;
+ }
+
+ return H_BUSY;
+}
+
+u64 hipz_h_alloc_resource_eq(const struct ipz_adapter_handle adapter_handle,
+ struct ehca_pfeq *pfeq,
+ const u32 neq_control,
+ const u32 number_of_entries,
+ struct ipz_eq_handle *eq_handle,
+ u32 *act_nr_of_entries,
+ u32 *act_pages,
+ u32 *eq_ist)
+{
+ u64 ret;
+ unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+ u64 allocate_controls;
+
+ /* resource type */
+ allocate_controls = 3ULL;
+
+ /* ISN is associated */
+ if (neq_control != 1)
+ allocate_controls = (1ULL << (63 - 7)) | allocate_controls;
+ else /* notification event queue */
+ allocate_controls = (1ULL << 63) | allocate_controls;
+
+ ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
+ adapter_handle.handle, /* r4 */
+ allocate_controls, /* r5 */
+ number_of_entries, /* r6 */
+ 0, 0, 0, 0, 0, 0);
+ eq_handle->handle = outs[0];
+ *act_nr_of_entries = (u32)outs[3];
+ *act_pages = (u32)outs[4];
+ *eq_ist = (u32)outs[5];
+
+ if (ret == H_NOT_ENOUGH_RESOURCES)
+ ehca_gen_err("Not enough resource - ret=%lli ", ret);
+
+ return ret;
+}
+
+u64 hipz_h_reset_event(const struct ipz_adapter_handle adapter_handle,
+ struct ipz_eq_handle eq_handle,
+ const u64 event_mask)
+{
+ return ehca_plpar_hcall_norets(H_RESET_EVENTS,
+ adapter_handle.handle, /* r4 */
+ eq_handle.handle, /* r5 */
+ event_mask, /* r6 */
+ 0, 0, 0, 0);
+}
+
+u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle,
+ struct ehca_cq *cq,
+ struct ehca_alloc_cq_parms *param)
+{
+ int rc;
+ u64 ret;
+ unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+ ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
+ adapter_handle.handle, /* r4 */
+ 2, /* r5 */
+ param->eq_handle.handle, /* r6 */
+ cq->token, /* r7 */
+ param->nr_cqe, /* r8 */
+ 0, 0, 0, 0);
+ cq->ipz_cq_handle.handle = outs[0];
+ param->act_nr_of_entries = (u32)outs[3];
+ param->act_pages = (u32)outs[4];
+
+ if (ret == H_SUCCESS) {
+ rc = hcp_galpas_ctor(&cq->galpas, 0, outs[5], outs[6]);
+ if (rc) {
+ ehca_gen_err("Could not establish HW access. rc=%d paddr=%#lx",
+ rc, outs[5]);
+
+ ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+ adapter_handle.handle, /* r4 */
+ cq->ipz_cq_handle.handle, /* r5 */
+ 0, 0, 0, 0, 0);
+ ret = H_NO_MEM;
+ }
+ }
+
+ if (ret == H_NOT_ENOUGH_RESOURCES)
+ ehca_gen_err("Not enough resources. ret=%lli", ret);
+
+ return ret;
+}
+
+u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle,
+ struct ehca_alloc_qp_parms *parms, int is_user)
+{
+ int rc;
+ u64 ret;
+ u64 allocate_controls, max_r10_reg, r11, r12;
+ unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+ allocate_controls =
+ EHCA_BMASK_SET(H_ALL_RES_QP_ENHANCED_OPS, parms->ext_type)
+ | EHCA_BMASK_SET(H_ALL_RES_QP_PTE_PIN, 0)
+ | EHCA_BMASK_SET(H_ALL_RES_QP_SERVICE_TYPE, parms->servicetype)
+ | EHCA_BMASK_SET(H_ALL_RES_QP_SIGNALING_TYPE, parms->sigtype)
+ | EHCA_BMASK_SET(H_ALL_RES_QP_STORAGE, parms->qp_storage)
+ | EHCA_BMASK_SET(H_ALL_RES_QP_SMALL_SQ_PAGE_SIZE,
+ parms->squeue.page_size)
+ | EHCA_BMASK_SET(H_ALL_RES_QP_SMALL_RQ_PAGE_SIZE,
+ parms->rqueue.page_size)
+ | EHCA_BMASK_SET(H_ALL_RES_QP_LL_RQ_CQE_POSTING,
+ !!(parms->ll_comp_flags & LLQP_RECV_COMP))
+ | EHCA_BMASK_SET(H_ALL_RES_QP_LL_SQ_CQE_POSTING,
+ !!(parms->ll_comp_flags & LLQP_SEND_COMP))
+ | EHCA_BMASK_SET(H_ALL_RES_QP_UD_AV_LKEY_CTRL,
+ parms->ud_av_l_key_ctl)
+ | EHCA_BMASK_SET(H_ALL_RES_QP_RESOURCE_TYPE, 1);
+
+ max_r10_reg =
+ EHCA_BMASK_SET(H_ALL_RES_QP_MAX_OUTST_SEND_WR,
+ parms->squeue.max_wr + 1)
+ | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_OUTST_RECV_WR,
+ parms->rqueue.max_wr + 1)
+ | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_SEND_SGE,
+ parms->squeue.max_sge)
+ | EHCA_BMASK_SET(H_ALL_RES_QP_MAX_RECV_SGE,
+ parms->rqueue.max_sge);
+
+ r11 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_QP_TOKEN, parms->srq_token);
+
+ if (parms->ext_type == EQPT_SRQ)
+ r12 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_LIMIT, parms->srq_limit);
+ else
+ r12 = EHCA_BMASK_SET(H_ALL_RES_QP_SRQ_QPN, parms->srq_qpn);
+
+ ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
+ adapter_handle.handle, /* r4 */
+ allocate_controls, /* r5 */
+ parms->send_cq_handle.handle,
+ parms->recv_cq_handle.handle,
+ parms->eq_handle.handle,
+ ((u64)parms->token << 32) | parms->pd.value,
+ max_r10_reg, r11, r12);
+
+ parms->qp_handle.handle = outs[0];
+ parms->real_qp_num = (u32)outs[1];
+ parms->squeue.act_nr_wqes =
+ (u16)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_OUTST_SEND_WR, outs[2]);
+ parms->rqueue.act_nr_wqes =
+ (u16)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_OUTST_RECV_WR, outs[2]);
+ parms->squeue.act_nr_sges =
+ (u8)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_SEND_SGE, outs[3]);
+ parms->rqueue.act_nr_sges =
+ (u8)EHCA_BMASK_GET(H_ALL_RES_QP_ACT_RECV_SGE, outs[3]);
+ parms->squeue.queue_size =
+ (u32)EHCA_BMASK_GET(H_ALL_RES_QP_SQUEUE_SIZE_PAGES, outs[4]);
+ parms->rqueue.queue_size =
+ (u32)EHCA_BMASK_GET(H_ALL_RES_QP_RQUEUE_SIZE_PAGES, outs[4]);
+
+ if (ret == H_SUCCESS) {
+ rc = hcp_galpas_ctor(&parms->galpas, is_user, outs[6], outs[6]);
+ if (rc) {
+ ehca_gen_err("Could not establish HW access. rc=%d paddr=%#lx",
+ rc, outs[6]);
+
+ ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+ adapter_handle.handle, /* r4 */
+ parms->qp_handle.handle, /* r5 */
+ 0, 0, 0, 0, 0);
+ ret = H_NO_MEM;
+ }
+ }
+
+ if (ret == H_NOT_ENOUGH_RESOURCES)
+ ehca_gen_err("Not enough resources. ret=%lli", ret);
+
+ return ret;
+}
+
+u64 hipz_h_query_port(const struct ipz_adapter_handle adapter_handle,
+ const u8 port_id,
+ struct hipz_query_port *query_port_response_block)
+{
+ u64 ret;
+ u64 r_cb = __pa(query_port_response_block);
+
+ if (r_cb & (EHCA_PAGESIZE-1)) {
+ ehca_gen_err("response block not page aligned");
+ return H_PARAMETER;
+ }
+
+ ret = ehca_plpar_hcall_norets(H_QUERY_PORT,
+ adapter_handle.handle, /* r4 */
+ port_id, /* r5 */
+ r_cb, /* r6 */
+ 0, 0, 0, 0);
+
+ if (ehca_debug_level >= 2)
+ ehca_dmp(query_port_response_block, 64, "response_block");
+
+ return ret;
+}
+
+u64 hipz_h_modify_port(const struct ipz_adapter_handle adapter_handle,
+ const u8 port_id, const u32 port_cap,
+ const u8 init_type, const int modify_mask)
+{
+ u64 port_attributes = port_cap;
+
+ if (modify_mask & IB_PORT_SHUTDOWN)
+ port_attributes |= EHCA_BMASK_SET(H_MP_SHUTDOWN, 1);
+ if (modify_mask & IB_PORT_INIT_TYPE)
+ port_attributes |= EHCA_BMASK_SET(H_MP_INIT_TYPE, init_type);
+ if (modify_mask & IB_PORT_RESET_QKEY_CNTR)
+ port_attributes |= EHCA_BMASK_SET(H_MP_RESET_QKEY_CTR, 1);
+
+ return ehca_plpar_hcall_norets(H_MODIFY_PORT,
+ adapter_handle.handle, /* r4 */
+ port_id, /* r5 */
+ port_attributes, /* r6 */
+ 0, 0, 0, 0);
+}
+
+u64 hipz_h_query_hca(const struct ipz_adapter_handle adapter_handle,
+ struct hipz_query_hca *query_hca_rblock)
+{
+ u64 r_cb = __pa(query_hca_rblock);
+
+ if (r_cb & (EHCA_PAGESIZE-1)) {
+ ehca_gen_err("response_block=%p not page aligned",
+ query_hca_rblock);
+ return H_PARAMETER;
+ }
+
+ return ehca_plpar_hcall_norets(H_QUERY_HCA,
+ adapter_handle.handle, /* r4 */
+ r_cb, /* r5 */
+ 0, 0, 0, 0, 0);
+}
+
+u64 hipz_h_register_rpage(const struct ipz_adapter_handle adapter_handle,
+ const u8 pagesize,
+ const u8 queue_type,
+ const u64 resource_handle,
+ const u64 logical_address_of_page,
+ u64 count)
+{
+ return ehca_plpar_hcall_norets(H_REGISTER_RPAGES,
+ adapter_handle.handle, /* r4 */
+ (u64)queue_type | ((u64)pagesize) << 8,
+ /* r5 */
+ resource_handle, /* r6 */
+ logical_address_of_page, /* r7 */
+ count, /* r8 */
+ 0, 0);
+}
+
+u64 hipz_h_register_rpage_eq(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_eq_handle eq_handle,
+ struct ehca_pfeq *pfeq,
+ const u8 pagesize,
+ const u8 queue_type,
+ const u64 logical_address_of_page,
+ const u64 count)
+{
+ if (count != 1) {
+ ehca_gen_err("Ppage counter=%llx", count);
+ return H_PARAMETER;
+ }
+ return hipz_h_register_rpage(adapter_handle,
+ pagesize,
+ queue_type,
+ eq_handle.handle,
+ logical_address_of_page, count);
+}
+
+u64 hipz_h_query_int_state(const struct ipz_adapter_handle adapter_handle,
+ u32 ist)
+{
+ u64 ret;
+ ret = ehca_plpar_hcall_norets(H_QUERY_INT_STATE,
+ adapter_handle.handle, /* r4 */
+ ist, /* r5 */
+ 0, 0, 0, 0, 0);
+
+ if (ret != H_SUCCESS && ret != H_BUSY)
+ ehca_gen_err("Could not query interrupt state.");
+
+ return ret;
+}
+
+u64 hipz_h_register_rpage_cq(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_cq_handle cq_handle,
+ struct ehca_pfcq *pfcq,
+ const u8 pagesize,
+ const u8 queue_type,
+ const u64 logical_address_of_page,
+ const u64 count,
+ const struct h_galpa gal)
+{
+ if (count != 1) {
+ ehca_gen_err("Page counter=%llx", count);
+ return H_PARAMETER;
+ }
+
+ return hipz_h_register_rpage(adapter_handle, pagesize, queue_type,
+ cq_handle.handle, logical_address_of_page,
+ count);
+}
+
+u64 hipz_h_register_rpage_qp(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct ehca_pfqp *pfqp,
+ const u8 pagesize,
+ const u8 queue_type,
+ const u64 logical_address_of_page,
+ const u64 count,
+ const struct h_galpa galpa)
+{
+ if (count > 1) {
+ ehca_gen_err("Page counter=%llx", count);
+ return H_PARAMETER;
+ }
+
+ return hipz_h_register_rpage(adapter_handle, pagesize, queue_type,
+ qp_handle.handle, logical_address_of_page,
+ count);
+}
+
+u64 hipz_h_disable_and_get_wqe(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct ehca_pfqp *pfqp,
+ void **log_addr_next_sq_wqe2processed,
+ void **log_addr_next_rq_wqe2processed,
+ int dis_and_get_function_code)
+{
+ u64 ret;
+ unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+ ret = ehca_plpar_hcall9(H_DISABLE_AND_GETC, outs,
+ adapter_handle.handle, /* r4 */
+ dis_and_get_function_code, /* r5 */
+ qp_handle.handle, /* r6 */
+ 0, 0, 0, 0, 0, 0);
+ if (log_addr_next_sq_wqe2processed)
+ *log_addr_next_sq_wqe2processed = (void *)outs[0];
+ if (log_addr_next_rq_wqe2processed)
+ *log_addr_next_rq_wqe2processed = (void *)outs[1];
+
+ return ret;
+}
+
+u64 hipz_h_modify_qp(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct ehca_pfqp *pfqp,
+ const u64 update_mask,
+ struct hcp_modify_qp_control_block *mqpcb,
+ struct h_galpa gal)
+{
+ u64 ret;
+ unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+ ret = ehca_plpar_hcall9(H_MODIFY_QP, outs,
+ adapter_handle.handle, /* r4 */
+ qp_handle.handle, /* r5 */
+ update_mask, /* r6 */
+ __pa(mqpcb), /* r7 */
+ 0, 0, 0, 0, 0);
+
+ if (ret == H_NOT_ENOUGH_RESOURCES)
+ ehca_gen_err("Insufficient resources ret=%lli", ret);
+
+ return ret;
+}
+
+u64 hipz_h_query_qp(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct ehca_pfqp *pfqp,
+ struct hcp_modify_qp_control_block *qqpcb,
+ struct h_galpa gal)
+{
+ return ehca_plpar_hcall_norets(H_QUERY_QP,
+ adapter_handle.handle, /* r4 */
+ qp_handle.handle, /* r5 */
+ __pa(qqpcb), /* r6 */
+ 0, 0, 0, 0);
+}
+
+u64 hipz_h_destroy_qp(const struct ipz_adapter_handle adapter_handle,
+ struct ehca_qp *qp)
+{
+ u64 ret;
+ unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+ ret = hcp_galpas_dtor(&qp->galpas);
+ if (ret) {
+ ehca_gen_err("Could not destruct qp->galpas");
+ return H_RESOURCE;
+ }
+ ret = ehca_plpar_hcall9(H_DISABLE_AND_GETC, outs,
+ adapter_handle.handle, /* r4 */
+ /* function code */
+ 1, /* r5 */
+ qp->ipz_qp_handle.handle, /* r6 */
+ 0, 0, 0, 0, 0, 0);
+ if (ret == H_HARDWARE)
+ ehca_gen_err("HCA not operational. ret=%lli", ret);
+
+ ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+ adapter_handle.handle, /* r4 */
+ qp->ipz_qp_handle.handle, /* r5 */
+ 0, 0, 0, 0, 0);
+
+ if (ret == H_RESOURCE)
+ ehca_gen_err("Resource still in use. ret=%lli", ret);
+
+ return ret;
+}
+
+u64 hipz_h_define_aqp0(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct h_galpa gal,
+ u32 port)
+{
+ return ehca_plpar_hcall_norets(H_DEFINE_AQP0,
+ adapter_handle.handle, /* r4 */
+ qp_handle.handle, /* r5 */
+ port, /* r6 */
+ 0, 0, 0, 0);
+}
+
+u64 hipz_h_define_aqp1(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct h_galpa gal,
+ u32 port, u32 * pma_qp_nr,
+ u32 * bma_qp_nr)
+{
+ u64 ret;
+ unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+ ret = ehca_plpar_hcall9(H_DEFINE_AQP1, outs,
+ adapter_handle.handle, /* r4 */
+ qp_handle.handle, /* r5 */
+ port, /* r6 */
+ 0, 0, 0, 0, 0, 0);
+ *pma_qp_nr = (u32)outs[0];
+ *bma_qp_nr = (u32)outs[1];
+
+ if (ret == H_ALIAS_EXIST)
+ ehca_gen_err("AQP1 already exists. ret=%lli", ret);
+
+ return ret;
+}
+
+u64 hipz_h_attach_mcqp(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct h_galpa gal,
+ u16 mcg_dlid,
+ u64 subnet_prefix, u64 interface_id)
+{
+ u64 ret;
+
+ ret = ehca_plpar_hcall_norets(H_ATTACH_MCQP,
+ adapter_handle.handle, /* r4 */
+ qp_handle.handle, /* r5 */
+ mcg_dlid, /* r6 */
+ interface_id, /* r7 */
+ subnet_prefix, /* r8 */
+ 0, 0);
+
+ if (ret == H_NOT_ENOUGH_RESOURCES)
+ ehca_gen_err("Not enough resources. ret=%lli", ret);
+
+ return ret;
+}
+
+u64 hipz_h_detach_mcqp(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct h_galpa gal,
+ u16 mcg_dlid,
+ u64 subnet_prefix, u64 interface_id)
+{
+ return ehca_plpar_hcall_norets(H_DETACH_MCQP,
+ adapter_handle.handle, /* r4 */
+ qp_handle.handle, /* r5 */
+ mcg_dlid, /* r6 */
+ interface_id, /* r7 */
+ subnet_prefix, /* r8 */
+ 0, 0);
+}
+
+u64 hipz_h_destroy_cq(const struct ipz_adapter_handle adapter_handle,
+ struct ehca_cq *cq,
+ u8 force_flag)
+{
+ u64 ret;
+
+ ret = hcp_galpas_dtor(&cq->galpas);
+ if (ret) {
+ ehca_gen_err("Could not destruct cp->galpas");
+ return H_RESOURCE;
+ }
+
+ ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+ adapter_handle.handle, /* r4 */
+ cq->ipz_cq_handle.handle, /* r5 */
+ force_flag != 0 ? 1L : 0L, /* r6 */
+ 0, 0, 0, 0);
+
+ if (ret == H_RESOURCE)
+ ehca_gen_err("H_FREE_RESOURCE failed ret=%lli ", ret);
+
+ return ret;
+}
+
+u64 hipz_h_destroy_eq(const struct ipz_adapter_handle adapter_handle,
+ struct ehca_eq *eq)
+{
+ u64 ret;
+
+ ret = hcp_galpas_dtor(&eq->galpas);
+ if (ret) {
+ ehca_gen_err("Could not destruct eq->galpas");
+ return H_RESOURCE;
+ }
+
+ ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+ adapter_handle.handle, /* r4 */
+ eq->ipz_eq_handle.handle, /* r5 */
+ 0, 0, 0, 0, 0);
+
+ if (ret == H_RESOURCE)
+ ehca_gen_err("Resource in use. ret=%lli ", ret);
+
+ return ret;
+}
+
+u64 hipz_h_alloc_resource_mr(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mr *mr,
+ const u64 vaddr,
+ const u64 length,
+ const u32 access_ctrl,
+ const struct ipz_pd pd,
+ struct ehca_mr_hipzout_parms *outparms)
+{
+ u64 ret;
+ unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+ ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
+ adapter_handle.handle, /* r4 */
+ 5, /* r5 */
+ vaddr, /* r6 */
+ length, /* r7 */
+ (((u64)access_ctrl) << 32ULL), /* r8 */
+ pd.value, /* r9 */
+ 0, 0, 0);
+ outparms->handle.handle = outs[0];
+ outparms->lkey = (u32)outs[2];
+ outparms->rkey = (u32)outs[3];
+
+ return ret;
+}
+
+u64 hipz_h_register_rpage_mr(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mr *mr,
+ const u8 pagesize,
+ const u8 queue_type,
+ const u64 logical_address_of_page,
+ const u64 count)
+{
+ u64 ret;
+
+ if (unlikely(ehca_debug_level >= 3)) {
+ if (count > 1) {
+ u64 *kpage;
+ int i;
+ kpage = __va(logical_address_of_page);
+ for (i = 0; i < count; i++)
+ ehca_gen_dbg("kpage[%d]=%p",
+ i, (void *)kpage[i]);
+ } else
+ ehca_gen_dbg("kpage=%p",
+ (void *)logical_address_of_page);
+ }
+
+ if ((count > 1) && (logical_address_of_page & (EHCA_PAGESIZE-1))) {
+ ehca_gen_err("logical_address_of_page not on a 4k boundary "
+ "adapter_handle=%llx mr=%p mr_handle=%llx "
+ "pagesize=%x queue_type=%x "
+ "logical_address_of_page=%llx count=%llx",
+ adapter_handle.handle, mr,
+ mr->ipz_mr_handle.handle, pagesize, queue_type,
+ logical_address_of_page, count);
+ ret = H_PARAMETER;
+ } else
+ ret = hipz_h_register_rpage(adapter_handle, pagesize,
+ queue_type,
+ mr->ipz_mr_handle.handle,
+ logical_address_of_page, count);
+ return ret;
+}
+
+u64 hipz_h_query_mr(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mr *mr,
+ struct ehca_mr_hipzout_parms *outparms)
+{
+ u64 ret;
+ unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+ ret = ehca_plpar_hcall9(H_QUERY_MR, outs,
+ adapter_handle.handle, /* r4 */
+ mr->ipz_mr_handle.handle, /* r5 */
+ 0, 0, 0, 0, 0, 0, 0);
+ outparms->len = outs[0];
+ outparms->vaddr = outs[1];
+ outparms->acl = outs[4] >> 32;
+ outparms->lkey = (u32)(outs[5] >> 32);
+ outparms->rkey = (u32)(outs[5] & (0xffffffff));
+
+ return ret;
+}
+
+u64 hipz_h_free_resource_mr(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mr *mr)
+{
+ return ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+ adapter_handle.handle, /* r4 */
+ mr->ipz_mr_handle.handle, /* r5 */
+ 0, 0, 0, 0, 0);
+}
+
+u64 hipz_h_reregister_pmr(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mr *mr,
+ const u64 vaddr_in,
+ const u64 length,
+ const u32 access_ctrl,
+ const struct ipz_pd pd,
+ const u64 mr_addr_cb,
+ struct ehca_mr_hipzout_parms *outparms)
+{
+ u64 ret;
+ unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+ ret = ehca_plpar_hcall9(H_REREGISTER_PMR, outs,
+ adapter_handle.handle, /* r4 */
+ mr->ipz_mr_handle.handle, /* r5 */
+ vaddr_in, /* r6 */
+ length, /* r7 */
+ /* r8 */
+ ((((u64)access_ctrl) << 32ULL) | pd.value),
+ mr_addr_cb, /* r9 */
+ 0, 0, 0);
+ outparms->vaddr = outs[1];
+ outparms->lkey = (u32)outs[2];
+ outparms->rkey = (u32)outs[3];
+
+ return ret;
+}
+
+u64 hipz_h_register_smr(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mr *mr,
+ const struct ehca_mr *orig_mr,
+ const u64 vaddr_in,
+ const u32 access_ctrl,
+ const struct ipz_pd pd,
+ struct ehca_mr_hipzout_parms *outparms)
+{
+ u64 ret;
+ unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+ ret = ehca_plpar_hcall9(H_REGISTER_SMR, outs,
+ adapter_handle.handle, /* r4 */
+ orig_mr->ipz_mr_handle.handle, /* r5 */
+ vaddr_in, /* r6 */
+ (((u64)access_ctrl) << 32ULL), /* r7 */
+ pd.value, /* r8 */
+ 0, 0, 0, 0);
+ outparms->handle.handle = outs[0];
+ outparms->lkey = (u32)outs[2];
+ outparms->rkey = (u32)outs[3];
+
+ return ret;
+}
+
+u64 hipz_h_alloc_resource_mw(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mw *mw,
+ const struct ipz_pd pd,
+ struct ehca_mw_hipzout_parms *outparms)
+{
+ u64 ret;
+ unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+ ret = ehca_plpar_hcall9(H_ALLOC_RESOURCE, outs,
+ adapter_handle.handle, /* r4 */
+ 6, /* r5 */
+ pd.value, /* r6 */
+ 0, 0, 0, 0, 0, 0);
+ outparms->handle.handle = outs[0];
+ outparms->rkey = (u32)outs[3];
+
+ return ret;
+}
+
+u64 hipz_h_query_mw(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mw *mw,
+ struct ehca_mw_hipzout_parms *outparms)
+{
+ u64 ret;
+ unsigned long outs[PLPAR_HCALL9_BUFSIZE];
+
+ ret = ehca_plpar_hcall9(H_QUERY_MW, outs,
+ adapter_handle.handle, /* r4 */
+ mw->ipz_mw_handle.handle, /* r5 */
+ 0, 0, 0, 0, 0, 0, 0);
+ outparms->rkey = (u32)outs[3];
+
+ return ret;
+}
+
+u64 hipz_h_free_resource_mw(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mw *mw)
+{
+ return ehca_plpar_hcall_norets(H_FREE_RESOURCE,
+ adapter_handle.handle, /* r4 */
+ mw->ipz_mw_handle.handle, /* r5 */
+ 0, 0, 0, 0, 0);
+}
+
+u64 hipz_h_error_data(const struct ipz_adapter_handle adapter_handle,
+ const u64 ressource_handle,
+ void *rblock,
+ unsigned long *byte_count)
+{
+ u64 r_cb = __pa(rblock);
+
+ if (r_cb & (EHCA_PAGESIZE-1)) {
+ ehca_gen_err("rblock not page aligned.");
+ return H_PARAMETER;
+ }
+
+ return ehca_plpar_hcall_norets(H_ERROR_DATA,
+ adapter_handle.handle,
+ ressource_handle,
+ r_cb,
+ 0, 0, 0, 0);
+}
+
+u64 hipz_h_eoi(int irq)
+{
+ unsigned long xirr;
+
+ iosync();
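+	/* XIRR layout (XICS): CPPR 0xff (least favored priority) in the
+	 * top byte, interrupt source number (XISR) in the low 24 bits */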
+ xirr = (0xffULL << 24) | irq;
+
+ return plpar_hcall_norets(H_EOI, xirr);
+}
diff --git a/drivers/infiniband/hw/ehca/hcp_if.h b/drivers/infiniband/hw/ehca/hcp_if.h
new file mode 100644
index 000000000..a46e514c3
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/hcp_if.h
@@ -0,0 +1,265 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * Firmware Infiniband Interface code for POWER
+ *
+ * Authors: Christoph Raisch <raisch@de.ibm.com>
+ * Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ * Gerd Bayer <gerd.bayer@de.ibm.com>
+ * Waleri Fomin <fomin@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __HCP_IF_H__
+#define __HCP_IF_H__
+
+#include "ehca_classes.h"
+#include "ehca_tools.h"
+#include "hipz_hw.h"
+
+/*
+ * hipz_h_alloc_resource_eq allocates EQ resources in HW and FW, initializes
+ * the resources and creates the empty EQPT (ring).
+ */
+u64 hipz_h_alloc_resource_eq(const struct ipz_adapter_handle adapter_handle,
+ struct ehca_pfeq *pfeq,
+ const u32 neq_control,
+ const u32 number_of_entries,
+ struct ipz_eq_handle *eq_handle,
+ u32 * act_nr_of_entries,
+ u32 * act_pages,
+ u32 * eq_ist);
+
+u64 hipz_h_reset_event(const struct ipz_adapter_handle adapter_handle,
+ struct ipz_eq_handle eq_handle,
+ const u64 event_mask);
+/*
+ * hipz_h_alloc_resource_cq allocates CQ resources in HW and FW, initializes
+ * the resources and creates the empty CQPT (ring).
+ */
+u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle,
+ struct ehca_cq *cq,
+ struct ehca_alloc_cq_parms *param);
+
+/*
+ * hipz_h_alloc_resource_qp allocates QP resources in HW and FW,
+ * initializes the resources and creates the empty QPPTs (2 rings).
+ */
+u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle,
+ struct ehca_alloc_qp_parms *parms, int is_user);
+
+u64 hipz_h_query_port(const struct ipz_adapter_handle adapter_handle,
+ const u8 port_id,
+ struct hipz_query_port *query_port_response_block);
+
+u64 hipz_h_modify_port(const struct ipz_adapter_handle adapter_handle,
+ const u8 port_id, const u32 port_cap,
+ const u8 init_type, const int modify_mask);
+
+u64 hipz_h_query_hca(const struct ipz_adapter_handle adapter_handle,
+ struct hipz_query_hca *query_hca_rblock);
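+
+/*
+ * Editor's sketch (an assumption, not part of the original interface): the
+ * query response block is typically a zeroed, page-aligned buffer, since
+ * firmware is handed its physical address, e.g.
+ *
+ *	struct hipz_query_hca *rblock = (void *)get_zeroed_page(GFP_KERNEL);
+ *	if (rblock && hipz_h_query_hca(adapter_handle, rblock) == H_SUCCESS)
+ *		... consume rblock->max_qp, rblock->hca_cap_indicators, ...;
+ *	free_page((unsigned long)rblock);
+ */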
+
+/*
+ * hipz_h_register_rpage is the internal helper in hcp_if used for all
+ * H_REGISTER_RPAGE hcalls.
+ */
+u64 hipz_h_register_rpage(const struct ipz_adapter_handle adapter_handle,
+ const u8 pagesize,
+ const u8 queue_type,
+ const u64 resource_handle,
+ const u64 logical_address_of_page,
+ u64 count);
+
+u64 hipz_h_register_rpage_eq(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_eq_handle eq_handle,
+ struct ehca_pfeq *pfeq,
+ const u8 pagesize,
+ const u8 queue_type,
+ const u64 logical_address_of_page,
+ const u64 count);
+
+u64 hipz_h_query_int_state(const struct ipz_adapter_handle
+ hcp_adapter_handle,
+ u32 ist);
+
+u64 hipz_h_register_rpage_cq(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_cq_handle cq_handle,
+ struct ehca_pfcq *pfcq,
+ const u8 pagesize,
+ const u8 queue_type,
+ const u64 logical_address_of_page,
+ const u64 count,
+ const struct h_galpa gal);
+
+u64 hipz_h_register_rpage_qp(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct ehca_pfqp *pfqp,
+ const u8 pagesize,
+ const u8 queue_type,
+ const u64 logical_address_of_page,
+ const u64 count,
+ const struct h_galpa galpa);
+
+u64 hipz_h_disable_and_get_wqe(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct ehca_pfqp *pfqp,
+ void **log_addr_next_sq_wqe_tb_processed,
+ void **log_addr_next_rq_wqe_tb_processed,
+ int dis_and_get_function_code);
+enum hcall_sigt {
+ HCALL_SIGT_NO_CQE = 0,
+ HCALL_SIGT_BY_WQE = 1,
+ HCALL_SIGT_EVERY = 2
+};
+
+u64 hipz_h_modify_qp(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct ehca_pfqp *pfqp,
+ const u64 update_mask,
+ struct hcp_modify_qp_control_block *mqpcb,
+ struct h_galpa gal);
+
+u64 hipz_h_query_qp(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct ehca_pfqp *pfqp,
+ struct hcp_modify_qp_control_block *qqpcb,
+ struct h_galpa gal);
+
+u64 hipz_h_destroy_qp(const struct ipz_adapter_handle adapter_handle,
+ struct ehca_qp *qp);
+
+u64 hipz_h_define_aqp0(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct h_galpa gal,
+ u32 port);
+
+u64 hipz_h_define_aqp1(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct h_galpa gal,
+ u32 port, u32 * pma_qp_nr,
+ u32 * bma_qp_nr);
+
+u64 hipz_h_attach_mcqp(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct h_galpa gal,
+ u16 mcg_dlid,
+ u64 subnet_prefix, u64 interface_id);
+
+u64 hipz_h_detach_mcqp(const struct ipz_adapter_handle adapter_handle,
+ const struct ipz_qp_handle qp_handle,
+ struct h_galpa gal,
+ u16 mcg_dlid,
+ u64 subnet_prefix, u64 interface_id);
+
+u64 hipz_h_destroy_cq(const struct ipz_adapter_handle adapter_handle,
+ struct ehca_cq *cq,
+ u8 force_flag);
+
+u64 hipz_h_destroy_eq(const struct ipz_adapter_handle adapter_handle,
+ struct ehca_eq *eq);
+
+/*
+ * hipz_h_alloc_resource_mr allocates MR resources in HW and FW and
+ * initializes them.
+ */
+u64 hipz_h_alloc_resource_mr(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mr *mr,
+ const u64 vaddr,
+ const u64 length,
+ const u32 access_ctrl,
+ const struct ipz_pd pd,
+ struct ehca_mr_hipzout_parms *outparms);
+
+/* hipz_h_register_rpage_mr registers MR resource pages in HW and FW */
+u64 hipz_h_register_rpage_mr(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mr *mr,
+ const u8 pagesize,
+ const u8 queue_type,
+ const u64 logical_address_of_page,
+ const u64 count);
+
+/* hipz_h_query_mr queries MR in HW and FW */
+u64 hipz_h_query_mr(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mr *mr,
+ struct ehca_mr_hipzout_parms *outparms);
+
+/* hipz_h_free_resource_mr frees MR resources in HW and FW */
+u64 hipz_h_free_resource_mr(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mr *mr);
+
+/* hipz_h_reregister_pmr reregisters MR in HW and FW */
+u64 hipz_h_reregister_pmr(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mr *mr,
+ const u64 vaddr_in,
+ const u64 length,
+ const u32 access_ctrl,
+ const struct ipz_pd pd,
+ const u64 mr_addr_cb,
+ struct ehca_mr_hipzout_parms *outparms);
+
+/* hipz_h_register_smr registers a shared MR in HW and FW */
+u64 hipz_h_register_smr(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mr *mr,
+ const struct ehca_mr *orig_mr,
+ const u64 vaddr_in,
+ const u32 access_ctrl,
+ const struct ipz_pd pd,
+ struct ehca_mr_hipzout_parms *outparms);
+
+/*
+ * hipz_h_alloc_resource_mw allocates MW resources in HW and FW and
+ * initializes them.
+ */
+u64 hipz_h_alloc_resource_mw(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mw *mw,
+ const struct ipz_pd pd,
+ struct ehca_mw_hipzout_parms *outparms);
+
+/* hipz_h_query_mw queries MW in HW and FW */
+u64 hipz_h_query_mw(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mw *mw,
+ struct ehca_mw_hipzout_parms *outparms);
+
+/* hipz_h_free_resource_mw frees MW resources in HW and FW */
+u64 hipz_h_free_resource_mw(const struct ipz_adapter_handle adapter_handle,
+ const struct ehca_mw *mw);
+
+u64 hipz_h_error_data(const struct ipz_adapter_handle adapter_handle,
+ const u64 ressource_handle,
+ void *rblock,
+ unsigned long *byte_count);
+u64 hipz_h_eoi(int irq);
+
+#endif /* __HCP_IF_H__ */
diff --git a/drivers/infiniband/hw/ehca/hcp_phyp.c b/drivers/infiniband/hw/ehca/hcp_phyp.c
new file mode 100644
index 000000000..077376ff3
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/hcp_phyp.c
@@ -0,0 +1,82 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * load store abstraction for ehca register access with tracing
+ *
+ * Authors: Christoph Raisch <raisch@de.ibm.com>
+ * Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "ehca_classes.h"
+#include "hipz_hw.h"
+
+u64 hcall_map_page(u64 physaddr)
+{
+ return (u64)ioremap(physaddr, EHCA_PAGESIZE);
+}
+
+int hcall_unmap_page(u64 mapaddr)
+{
+ iounmap((volatile void __iomem *) mapaddr);
+ return 0;
+}
+
+int hcp_galpas_ctor(struct h_galpas *galpas, int is_user,
+ u64 paddr_kernel, u64 paddr_user)
+{
+ if (!is_user) {
+ galpas->kernel.fw_handle = hcall_map_page(paddr_kernel);
+ if (!galpas->kernel.fw_handle)
+ return -ENOMEM;
+ } else
+ galpas->kernel.fw_handle = 0;
+
+ galpas->user.fw_handle = paddr_user;
+
+ return 0;
+}
+
+int hcp_galpas_dtor(struct h_galpas *galpas)
+{
+ if (galpas->kernel.fw_handle) {
+ int ret = hcall_unmap_page(galpas->kernel.fw_handle);
+ if (ret)
+ return ret;
+ }
+
+ galpas->user.fw_handle = galpas->kernel.fw_handle = 0;
+
+ return 0;
+}
diff --git a/drivers/infiniband/hw/ehca/hcp_phyp.h b/drivers/infiniband/hw/ehca/hcp_phyp.h
new file mode 100644
index 000000000..d1b029910
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/hcp_phyp.h
@@ -0,0 +1,90 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * Firmware calls
+ *
+ * Authors: Christoph Raisch <raisch@de.ibm.com>
+ * Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ * Waleri Fomin <fomin@de.ibm.com>
+ * Gerd Bayer <gerd.bayer@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __HCP_PHYP_H__
+#define __HCP_PHYP_H__
+
+
+/*
+ * eHCA page (mapped into memory)
+ * resource to access eHCA register pages in CPU address space
+ */
+struct h_galpa {
+ u64 fw_handle;
+ /* for pSeries this is a 64bit memory address where
+ I/O memory is mapped into CPU address space (kv) */
+};
+
+/*
+ * resource to access eHCA address space registers, all types
+ */
+struct h_galpas {
+	u32 pid;		/* PID of userspace galpa checking */
+ struct h_galpa user; /* user space accessible resource,
+ set to 0 if unused */
+ struct h_galpa kernel; /* kernel space accessible resource,
+ set to 0 if unused */
+};
+
+static inline u64 hipz_galpa_load(struct h_galpa galpa, u32 offset)
+{
+ u64 addr = galpa.fw_handle + offset;
+ return *(volatile u64 __force *)addr;
+}
+
+static inline void hipz_galpa_store(struct h_galpa galpa, u32 offset, u64 value)
+{
+ u64 addr = galpa.fw_handle + offset;
+ *(volatile u64 __force *)addr = value;
+}
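+
+/*
+ * Editor's note (illustrative sketch, not part of the original header):
+ * fw_handle holds the kernel-virtual address returned by
+ * hcall_map_page()/ioremap(), so a register access is a plain 64-bit
+ * load or store at fw_handle + register offset.  The offset normally
+ * comes from one of the *_OFFSET() macros in hipz_hw.h, e.g. (assuming
+ * the galpas member of struct ehca_qp from ehca_classes.h):
+ *
+ *	u64 ctrl = hipz_galpa_load(my_qp->galpas.kernel,
+ *				   QPTEMM_OFFSET(qpx_c));
+ */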
+
+int hcp_galpas_ctor(struct h_galpas *galpas, int is_user,
+ u64 paddr_kernel, u64 paddr_user);
+
+int hcp_galpas_dtor(struct h_galpas *galpas);
+
+u64 hcall_map_page(u64 physaddr);
+
+int hcall_unmap_page(u64 mapaddr);
+
+#endif
diff --git a/drivers/infiniband/hw/ehca/hipz_fns.h b/drivers/infiniband/hw/ehca/hipz_fns.h
new file mode 100644
index 000000000..9dac93d02
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/hipz_fns.h
@@ -0,0 +1,68 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * HW abstraction register functions
+ *
+ * Authors: Christoph Raisch <raisch@de.ibm.com>
+ * Reinhard Ernst <rernst@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __HIPZ_FNS_H__
+#define __HIPZ_FNS_H__
+
+#include "ehca_classes.h"
+#include "hipz_hw.h"
+
+#include "hipz_fns_core.h"
+
+#define hipz_galpa_store_eq(gal, offset, value) \
+ hipz_galpa_store(gal, EQTEMM_OFFSET(offset), value)
+
+#define hipz_galpa_load_eq(gal, offset) \
+ hipz_galpa_load(gal, EQTEMM_OFFSET(offset))
+
+#define hipz_galpa_store_qped(gal, offset, value) \
+ hipz_galpa_store(gal, QPEDMM_OFFSET(offset), value)
+
+#define hipz_galpa_load_qped(gal, offset) \
+ hipz_galpa_load(gal, QPEDMM_OFFSET(offset))
+
+#define hipz_galpa_store_mrmw(gal, offset, value) \
+ hipz_galpa_store(gal, MRMWMM_OFFSET(offset), value)
+
+#define hipz_galpa_load_mrmw(gal, offset) \
+ hipz_galpa_load(gal, MRMWMM_OFFSET(offset))
+
+#endif
diff --git a/drivers/infiniband/hw/ehca/hipz_fns_core.h b/drivers/infiniband/hw/ehca/hipz_fns_core.h
new file mode 100644
index 000000000..868735fd3
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/hipz_fns_core.h
@@ -0,0 +1,100 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * HW abstraction register functions
+ *
+ * Authors: Christoph Raisch <raisch@de.ibm.com>
+ * Heiko J Schick <schickhj@de.ibm.com>
+ * Hoang-Nam Nguyen <hnguyen@de.ibm.com>
+ * Reinhard Ernst <rernst@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __HIPZ_FNS_CORE_H__
+#define __HIPZ_FNS_CORE_H__
+
+#include "hcp_phyp.h"
+#include "hipz_hw.h"
+
+#define hipz_galpa_store_cq(gal, offset, value) \
+ hipz_galpa_store(gal, CQTEMM_OFFSET(offset), value)
+
+#define hipz_galpa_load_cq(gal, offset) \
+ hipz_galpa_load(gal, CQTEMM_OFFSET(offset))
+
+#define hipz_galpa_store_qp(gal, offset, value) \
+ hipz_galpa_store(gal, QPTEMM_OFFSET(offset), value)
+#define hipz_galpa_load_qp(gal, offset) \
+ hipz_galpa_load(gal, QPTEMM_OFFSET(offset))
+
+static inline void hipz_update_sqa(struct ehca_qp *qp, u16 nr_wqes)
+{
+ /* ringing doorbell :-) */
+ hipz_galpa_store_qp(qp->galpas.kernel, qpx_sqa,
+ EHCA_BMASK_SET(QPX_SQADDER, nr_wqes));
+}
+
+static inline void hipz_update_rqa(struct ehca_qp *qp, u16 nr_wqes)
+{
+ /* ringing doorbell :-) */
+ hipz_galpa_store_qp(qp->galpas.kernel, qpx_rqa,
+ EHCA_BMASK_SET(QPX_RQADDER, nr_wqes));
+}
+
+static inline void hipz_update_feca(struct ehca_cq *cq, u32 nr_cqes)
+{
+ hipz_galpa_store_cq(cq->galpas.kernel, cqx_feca,
+ EHCA_BMASK_SET(CQX_FECADDER, nr_cqes));
+}
+
+static inline void hipz_set_cqx_n0(struct ehca_cq *cq, u32 value)
+{
+ u64 cqx_n0_reg;
+
+ hipz_galpa_store_cq(cq->galpas.kernel, cqx_n0,
+ EHCA_BMASK_SET(CQX_N0_GENERATE_SOLICITED_COMP_EVENT,
+ value));
+ cqx_n0_reg = hipz_galpa_load_cq(cq->galpas.kernel, cqx_n0);
+}
+
+static inline void hipz_set_cqx_n1(struct ehca_cq *cq, u32 value)
+{
+ u64 cqx_n1_reg;
+
+ hipz_galpa_store_cq(cq->galpas.kernel, cqx_n1,
+ EHCA_BMASK_SET(CQX_N1_GENERATE_COMP_EVENT, value));
+ cqx_n1_reg = hipz_galpa_load_cq(cq->galpas.kernel, cqx_n1);
+}
+
+#endif /* __HIPZ_FNS_CORE_H__ */
diff --git a/drivers/infiniband/hw/ehca/hipz_hw.h b/drivers/infiniband/hw/ehca/hipz_hw.h
new file mode 100644
index 000000000..bf996c7ac
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/hipz_hw.h
@@ -0,0 +1,414 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * eHCA register definitions
+ *
+ * Authors: Waleri Fomin <fomin@de.ibm.com>
+ * Christoph Raisch <raisch@de.ibm.com>
+ * Reinhard Ernst <rernst@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __HIPZ_HW_H__
+#define __HIPZ_HW_H__
+
+#include "ehca_tools.h"
+
+#define EHCA_MAX_MTU 4
+
+/* QP Table Entry Memory Map */
+struct hipz_qptemm {
+ u64 qpx_hcr;
+ u64 qpx_c;
+ u64 qpx_herr;
+ u64 qpx_aer;
+/* 0x20*/
+ u64 qpx_sqa;
+ u64 qpx_sqc;
+ u64 qpx_rqa;
+ u64 qpx_rqc;
+/* 0x40*/
+ u64 qpx_st;
+ u64 qpx_pmstate;
+ u64 qpx_pmfa;
+ u64 qpx_pkey;
+/* 0x60*/
+ u64 qpx_pkeya;
+ u64 qpx_pkeyb;
+ u64 qpx_pkeyc;
+ u64 qpx_pkeyd;
+/* 0x80*/
+ u64 qpx_qkey;
+ u64 qpx_dqp;
+ u64 qpx_dlidp;
+ u64 qpx_portp;
+/* 0xa0*/
+ u64 qpx_slidp;
+ u64 qpx_slidpp;
+ u64 qpx_dlida;
+ u64 qpx_porta;
+/* 0xc0*/
+ u64 qpx_slida;
+ u64 qpx_slidpa;
+ u64 qpx_slvl;
+ u64 qpx_ipd;
+/* 0xe0*/
+ u64 qpx_mtu;
+ u64 qpx_lato;
+ u64 qpx_rlimit;
+ u64 qpx_rnrlimit;
+/* 0x100*/
+ u64 qpx_t;
+ u64 qpx_sqhp;
+ u64 qpx_sqptp;
+ u64 qpx_nspsn;
+/* 0x120*/
+ u64 qpx_nspsnhwm;
+ u64 reserved1;
+ u64 qpx_sdsi;
+ u64 qpx_sdsbc;
+/* 0x140*/
+ u64 qpx_sqwsize;
+ u64 qpx_sqwts;
+ u64 qpx_lsn;
+ u64 qpx_nssn;
+/* 0x160 */
+ u64 qpx_mor;
+ u64 qpx_cor;
+ u64 qpx_sqsize;
+ u64 qpx_erc;
+/* 0x180*/
+ u64 qpx_rnrrc;
+ u64 qpx_ernrwt;
+ u64 qpx_rnrresp;
+ u64 qpx_lmsna;
+/* 0x1a0 */
+ u64 qpx_sqhpc;
+ u64 qpx_sqcptp;
+ u64 qpx_sigt;
+ u64 qpx_wqecnt;
+/* 0x1c0*/
+ u64 qpx_rqhp;
+ u64 qpx_rqptp;
+ u64 qpx_rqsize;
+ u64 qpx_nrr;
+/* 0x1e0*/
+ u64 qpx_rdmac;
+ u64 qpx_nrpsn;
+ u64 qpx_lapsn;
+ u64 qpx_lcr;
+/* 0x200*/
+ u64 qpx_rwc;
+ u64 qpx_rwva;
+ u64 qpx_rdsi;
+ u64 qpx_rdsbc;
+/* 0x220*/
+ u64 qpx_rqwsize;
+ u64 qpx_crmsn;
+ u64 qpx_rdd;
+ u64 qpx_larpsn;
+/* 0x240*/
+ u64 qpx_pd;
+ u64 qpx_scqn;
+ u64 qpx_rcqn;
+ u64 qpx_aeqn;
+/* 0x260*/
+ u64 qpx_aaelog;
+ u64 qpx_ram;
+ u64 qpx_rdmaqe0;
+ u64 qpx_rdmaqe1;
+/* 0x280*/
+ u64 qpx_rdmaqe2;
+ u64 qpx_rdmaqe3;
+ u64 qpx_nrpsnhwm;
+/* 0x298*/
+ u64 reserved[(0x400 - 0x298) / 8];
+/* 0x400 extended data */
+ u64 reserved_ext[(0x500 - 0x400) / 8];
+/* 0x500 */
+ u64 reserved2[(0x1000 - 0x500) / 8];
+/* 0x1000 */
+};
+
+#define QPX_SQADDER EHCA_BMASK_IBM(48, 63)
+#define QPX_RQADDER EHCA_BMASK_IBM(48, 63)
+#define QPX_AAELOG_RESET_SRQ_LIMIT EHCA_BMASK_IBM(3, 3)
+
+#define QPTEMM_OFFSET(x) offsetof(struct hipz_qptemm, x)
+
+/* MRMWPT Entry Memory Map */
+struct hipz_mrmwmm {
+ /* 0x00 */
+ u64 mrx_hcr;
+
+ u64 mrx_c;
+ u64 mrx_herr;
+ u64 mrx_aer;
+ /* 0x20 */
+ u64 mrx_pp;
+ u64 reserved1;
+ u64 reserved2;
+ u64 reserved3;
+ /* 0x40 */
+ u64 reserved4[(0x200 - 0x40) / 8];
+ /* 0x200 */
+ u64 mrx_ctl[64];
+
+};
+
+#define MRMWMM_OFFSET(x) offsetof(struct hipz_mrmwmm, x)
+
+struct hipz_qpedmm {
+ /* 0x00 */
+ u64 reserved0[(0x400) / 8];
+ /* 0x400 */
+ u64 qpedx_phh;
+ u64 qpedx_ppsgp;
+ /* 0x410 */
+ u64 qpedx_ppsgu;
+ u64 qpedx_ppdgp;
+ /* 0x420 */
+ u64 qpedx_ppdgu;
+ u64 qpedx_aph;
+ /* 0x430 */
+ u64 qpedx_apsgp;
+ u64 qpedx_apsgu;
+ /* 0x440 */
+ u64 qpedx_apdgp;
+ u64 qpedx_apdgu;
+ /* 0x450 */
+ u64 qpedx_apav;
+ u64 qpedx_apsav;
+ /* 0x460 */
+ u64 qpedx_hcr;
+ u64 reserved1[4];
+ /* 0x488 */
+ u64 qpedx_rrl0;
+ /* 0x490 */
+ u64 qpedx_rrrkey0;
+ u64 qpedx_rrva0;
+ /* 0x4a0 */
+ u64 reserved2;
+ u64 qpedx_rrl1;
+ /* 0x4b0 */
+ u64 qpedx_rrrkey1;
+ u64 qpedx_rrva1;
+ /* 0x4c0 */
+ u64 reserved3;
+ u64 qpedx_rrl2;
+ /* 0x4d0 */
+ u64 qpedx_rrrkey2;
+ u64 qpedx_rrva2;
+ /* 0x4e0 */
+ u64 reserved4;
+ u64 qpedx_rrl3;
+ /* 0x4f0 */
+ u64 qpedx_rrrkey3;
+ u64 qpedx_rrva3;
+};
+
+#define QPEDMM_OFFSET(x) offsetof(struct hipz_qpedmm, x)
+
+/* CQ Table Entry Memory Map */
+struct hipz_cqtemm {
+ u64 cqx_hcr;
+ u64 cqx_c;
+ u64 cqx_herr;
+ u64 cqx_aer;
+/* 0x20 */
+ u64 cqx_ptp;
+ u64 cqx_tp;
+ u64 cqx_fec;
+ u64 cqx_feca;
+/* 0x40 */
+ u64 cqx_ep;
+ u64 cqx_eq;
+/* 0x50 */
+ u64 reserved1;
+ u64 cqx_n0;
+/* 0x60 */
+ u64 cqx_n1;
+ u64 reserved2[(0x1000 - 0x60) / 8];
+/* 0x1000 */
+};
+
+#define CQX_FEC_CQE_CNT EHCA_BMASK_IBM(32, 63)
+#define CQX_FECADDER EHCA_BMASK_IBM(32, 63)
+#define CQX_N0_GENERATE_SOLICITED_COMP_EVENT EHCA_BMASK_IBM(0, 0)
+#define CQX_N1_GENERATE_COMP_EVENT EHCA_BMASK_IBM(0, 0)
+
+#define CQTEMM_OFFSET(x) offsetof(struct hipz_cqtemm, x)
+
+/* EQ Table Entry Memory Map */
+struct hipz_eqtemm {
+ u64 eqx_hcr;
+ u64 eqx_c;
+
+ u64 eqx_herr;
+ u64 eqx_aer;
+/* 0x20 */
+ u64 eqx_ptp;
+ u64 eqx_tp;
+ u64 eqx_ssba;
+ u64 eqx_psba;
+
+/* 0x40 */
+ u64 eqx_cec;
+ u64 eqx_meql;
+ u64 eqx_xisbi;
+ u64 eqx_xisc;
+/* 0x60 */
+ u64 eqx_it;
+
+};
+
+#define EQTEMM_OFFSET(x) offsetof(struct hipz_eqtemm, x)
+
+/* access control defines for MR/MW */
+#define HIPZ_ACCESSCTRL_L_WRITE 0x00800000
+#define HIPZ_ACCESSCTRL_R_WRITE 0x00400000
+#define HIPZ_ACCESSCTRL_R_READ 0x00200000
+#define HIPZ_ACCESSCTRL_R_ATOMIC 0x00100000
+#define HIPZ_ACCESSCTRL_MW_BIND 0x00080000
+
+/* query hca response block */
+struct hipz_query_hca {
+ u32 cur_reliable_dg;
+ u32 cur_qp;
+ u32 cur_cq;
+ u32 cur_eq;
+ u32 cur_mr;
+ u32 cur_mw;
+ u32 cur_ee_context;
+ u32 cur_mcast_grp;
+ u32 cur_qp_attached_mcast_grp;
+ u32 reserved1;
+ u32 cur_ipv6_qp;
+ u32 cur_eth_qp;
+ u32 cur_hp_mr;
+ u32 reserved2[3];
+ u32 max_rd_domain;
+ u32 max_qp;
+ u32 max_cq;
+ u32 max_eq;
+ u32 max_mr;
+ u32 max_hp_mr;
+ u32 max_mw;
+ u32 max_mrwpte;
+ u32 max_special_mrwpte;
+ u32 max_rd_ee_context;
+ u32 max_mcast_grp;
+ u32 max_total_mcast_qp_attach;
+ u32 max_mcast_qp_attach;
+ u32 max_raw_ipv6_qp;
+ u32 max_raw_ethy_qp;
+ u32 internal_clock_frequency;
+ u32 max_pd;
+ u32 max_ah;
+ u32 max_cqe;
+ u32 max_wqes_wq;
+ u32 max_partitions;
+ u32 max_rr_ee_context;
+ u32 max_rr_qp;
+ u32 max_rr_hca;
+ u32 max_act_wqs_ee_context;
+ u32 max_act_wqs_qp;
+ u32 max_sge;
+ u32 max_sge_rd;
+ u32 memory_page_size_supported;
+ u64 max_mr_size;
+ u32 local_ca_ack_delay;
+ u32 num_ports;
+ u32 vendor_id;
+ u32 vendor_part_id;
+ u32 hw_ver;
+ u64 node_guid;
+ u64 hca_cap_indicators;
+ u32 data_counter_register_size;
+ u32 max_shared_rq;
+ u32 max_isns_eq;
+ u32 max_neq;
+} __attribute__ ((packed));
+
+#define HCA_CAP_AH_PORT_NR_CHECK EHCA_BMASK_IBM( 0, 0)
+#define HCA_CAP_ATOMIC EHCA_BMASK_IBM( 1, 1)
+#define HCA_CAP_AUTO_PATH_MIG EHCA_BMASK_IBM( 2, 2)
+#define HCA_CAP_BAD_P_KEY_CTR EHCA_BMASK_IBM( 3, 3)
+#define HCA_CAP_SQD_RTS_PORT_CHANGE EHCA_BMASK_IBM( 4, 4)
+#define HCA_CAP_CUR_QP_STATE_MOD EHCA_BMASK_IBM( 5, 5)
+#define HCA_CAP_INIT_TYPE EHCA_BMASK_IBM( 6, 6)
+#define HCA_CAP_PORT_ACTIVE_EVENT EHCA_BMASK_IBM( 7, 7)
+#define HCA_CAP_Q_KEY_VIOL_CTR EHCA_BMASK_IBM( 8, 8)
+#define HCA_CAP_WQE_RESIZE EHCA_BMASK_IBM( 9, 9)
+#define HCA_CAP_RAW_PACKET_MCAST EHCA_BMASK_IBM(10, 10)
+#define HCA_CAP_SHUTDOWN_PORT EHCA_BMASK_IBM(11, 11)
+#define HCA_CAP_RC_LL_QP EHCA_BMASK_IBM(12, 12)
+#define HCA_CAP_SRQ EHCA_BMASK_IBM(13, 13)
+#define HCA_CAP_UD_LL_QP EHCA_BMASK_IBM(16, 16)
+#define HCA_CAP_RESIZE_MR EHCA_BMASK_IBM(17, 17)
+#define HCA_CAP_MINI_QP EHCA_BMASK_IBM(18, 18)
+#define HCA_CAP_H_ALLOC_RES_SYNC EHCA_BMASK_IBM(19, 19)
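+
+/*
+ * Editor's illustrative helper (a sketch, not part of the original driver):
+ * the HCA_CAP_* masks above select single bits of hca_cap_indicators in
+ * struct hipz_query_hca and are meant to be tested with the EHCA_BMASK_GET()
+ * helper assumed to come from ehca_tools.h, for example:
+ */
+static inline int hipz_example_hca_cap(const struct hipz_query_hca *rblock,
+				       u64 cap_mask)
+{
+	/* nonzero if firmware reports the given HCA_CAP_* capability */
+	return EHCA_BMASK_GET(cap_mask, rblock->hca_cap_indicators) != 0;
+}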
+
+/* query port response block */
+struct hipz_query_port {
+ u32 state;
+ u32 bad_pkey_cntr;
+ u32 lmc;
+ u32 lid;
+ u32 subnet_timeout;
+ u32 qkey_viol_cntr;
+ u32 sm_sl;
+ u32 sm_lid;
+ u32 capability_mask;
+ u32 init_type_reply;
+ u32 pkey_tbl_len;
+ u32 gid_tbl_len;
+ u64 gid_prefix;
+ u32 port_nr;
+ u16 pkey_entries[16];
+ u8 reserved1[32];
+ u32 trent_size;
+ u32 trbuf_size;
+ u64 max_msg_sz;
+ u32 max_mtu;
+ u32 vl_cap;
+ u32 phys_pstate;
+ u32 phys_state;
+ u32 phys_speed;
+ u32 phys_width;
+ u8 reserved2[1884];
+ u64 guid_entries[255];
+} __attribute__ ((packed));
+
+#endif
diff --git a/drivers/infiniband/hw/ehca/ipz_pt_fn.c b/drivers/infiniband/hw/ehca/ipz_pt_fn.c
new file mode 100644
index 000000000..8d594517c
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ipz_pt_fn.c
@@ -0,0 +1,295 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * internal queue handling
+ *
+ * Authors: Waleri Fomin <fomin@de.ibm.com>
+ * Reinhard Ernst <rernst@de.ibm.com>
+ * Christoph Raisch <raisch@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/slab.h>
+
+#include "ehca_tools.h"
+#include "ipz_pt_fn.h"
+#include "ehca_classes.h"
+
+#define PAGES_PER_KPAGE (PAGE_SIZE >> EHCA_PAGESHIFT)
+
+struct kmem_cache *small_qp_cache;
+
+void *ipz_qpageit_get_inc(struct ipz_queue *queue)
+{
+ void *ret = ipz_qeit_get(queue);
+ queue->current_q_offset += queue->pagesize;
+ if (queue->current_q_offset > queue->queue_length) {
+ queue->current_q_offset -= queue->pagesize;
+ ret = NULL;
+ }
+ if (((u64)ret) % queue->pagesize) {
+ ehca_gen_err("ERROR!! not at PAGE-Boundary");
+ return NULL;
+ }
+ return ret;
+}
+
+void *ipz_qeit_eq_get_inc(struct ipz_queue *queue)
+{
+ void *ret = ipz_qeit_get(queue);
+ u64 last_entry_in_q = queue->queue_length - queue->qe_size;
+
+ queue->current_q_offset += queue->qe_size;
+ if (queue->current_q_offset > last_entry_in_q) {
+ queue->current_q_offset = 0;
+ queue->toggle_state = (~queue->toggle_state) & 1;
+ }
+
+ return ret;
+}
+
+int ipz_queue_abs_to_offset(struct ipz_queue *queue, u64 addr, u64 *q_offset)
+{
+ int i;
+ for (i = 0; i < queue->queue_length / queue->pagesize; i++) {
+ u64 page = __pa(queue->queue_pages[i]);
+ if (addr >= page && addr < page + queue->pagesize) {
+ *q_offset = addr - page + i * queue->pagesize;
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
+#if PAGE_SHIFT < EHCA_PAGESHIFT
+#error Kernel pages must be at least as large as eHCA pages (4K)!
+#endif
+
+/*
+ * allocate pages for queue:
+ * outer loop allocates whole kernel pages (page aligned) and
+ * inner loop divides a kernel page into smaller hca queue pages
+ */
+static int alloc_queue_pages(struct ipz_queue *queue, const u32 nr_of_pages)
+{
+ int k, f = 0;
+ u8 *kpage;
+
+ while (f < nr_of_pages) {
+ kpage = (u8 *)get_zeroed_page(GFP_KERNEL);
+ if (!kpage)
+ goto out;
+
+ for (k = 0; k < PAGES_PER_KPAGE && f < nr_of_pages; k++) {
+ queue->queue_pages[f] = (struct ipz_page *)kpage;
+ kpage += EHCA_PAGESIZE;
+ f++;
+ }
+ }
+ return 1;
+
+out:
+ for (f = 0; f < nr_of_pages && queue->queue_pages[f];
+ f += PAGES_PER_KPAGE)
+ free_page((unsigned long)(queue->queue_pages)[f]);
+ return 0;
+}
+
+static int alloc_small_queue_page(struct ipz_queue *queue, struct ehca_pd *pd)
+{
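+	/* order = log2(queue pagesize / 512); it selects the matching
+	 * free/full list in the PD and determines how many sub-pages of
+	 * this size fit into one kernel page */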
+ int order = ilog2(queue->pagesize) - 9;
+ struct ipz_small_queue_page *page;
+ unsigned long bit;
+
+ mutex_lock(&pd->lock);
+
+ if (!list_empty(&pd->free[order]))
+ page = list_entry(pd->free[order].next,
+ struct ipz_small_queue_page, list);
+ else {
+ page = kmem_cache_zalloc(small_qp_cache, GFP_KERNEL);
+ if (!page)
+ goto out;
+
+ page->page = get_zeroed_page(GFP_KERNEL);
+ if (!page->page) {
+ kmem_cache_free(small_qp_cache, page);
+ goto out;
+ }
+
+ list_add(&page->list, &pd->free[order]);
+ }
+
+ bit = find_first_zero_bit(page->bitmap, IPZ_SPAGE_PER_KPAGE >> order);
+ __set_bit(bit, page->bitmap);
+ page->fill++;
+
+ if (page->fill == IPZ_SPAGE_PER_KPAGE >> order)
+ list_move(&page->list, &pd->full[order]);
+
+ mutex_unlock(&pd->lock);
+
+ queue->queue_pages[0] = (void *)(page->page | (bit << (order + 9)));
+ queue->small_page = page;
+ queue->offset = bit << (order + 9);
+ return 1;
+
+out:
+ ehca_err(pd->ib_pd.device, "failed to allocate small queue page");
+ mutex_unlock(&pd->lock);
+ return 0;
+}
+
+static void free_small_queue_page(struct ipz_queue *queue, struct ehca_pd *pd)
+{
+ int order = ilog2(queue->pagesize) - 9;
+ struct ipz_small_queue_page *page = queue->small_page;
+ unsigned long bit;
+ int free_page = 0;
+
+ bit = ((unsigned long)queue->queue_pages[0] & ~PAGE_MASK)
+ >> (order + 9);
+
+ mutex_lock(&pd->lock);
+
+ __clear_bit(bit, page->bitmap);
+ page->fill--;
+
+ if (page->fill == 0) {
+ list_del(&page->list);
+ free_page = 1;
+ }
+
+ if (page->fill == (IPZ_SPAGE_PER_KPAGE >> order) - 1)
+ /* the page was full until we freed the chunk */
+ list_move_tail(&page->list, &pd->free[order]);
+
+ mutex_unlock(&pd->lock);
+
+ if (free_page) {
+ free_page(page->page);
+ kmem_cache_free(small_qp_cache, page);
+ }
+}
+
+int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue,
+ const u32 nr_of_pages, const u32 pagesize,
+ const u32 qe_size, const u32 nr_of_sg,
+ int is_small)
+{
+ if (pagesize > PAGE_SIZE) {
+ ehca_gen_err("FATAL ERROR: pagesize=%x "
+ "is greater than kernel page size", pagesize);
+ return 0;
+ }
+
+ /* init queue fields */
+ queue->queue_length = nr_of_pages * pagesize;
+ queue->pagesize = pagesize;
+ queue->qe_size = qe_size;
+ queue->act_nr_of_sg = nr_of_sg;
+ queue->current_q_offset = 0;
+ queue->toggle_state = 1;
+ queue->small_page = NULL;
+
+ /* allocate queue page pointers */
+ queue->queue_pages = kzalloc(nr_of_pages * sizeof(void *),
+ GFP_KERNEL | __GFP_NOWARN);
+ if (!queue->queue_pages) {
+ queue->queue_pages = vzalloc(nr_of_pages * sizeof(void *));
+ if (!queue->queue_pages) {
+ ehca_gen_err("Couldn't allocate queue page list");
+ return 0;
+ }
+ }
+
+ /* allocate actual queue pages */
+ if (is_small) {
+ if (!alloc_small_queue_page(queue, pd))
+ goto ipz_queue_ctor_exit0;
+ } else
+ if (!alloc_queue_pages(queue, nr_of_pages))
+ goto ipz_queue_ctor_exit0;
+
+ return 1;
+
+ipz_queue_ctor_exit0:
+ ehca_gen_err("Couldn't alloc pages queue=%p "
+ "nr_of_pages=%x", queue, nr_of_pages);
+ if (is_vmalloc_addr(queue->queue_pages))
+ vfree(queue->queue_pages);
+ else
+ kfree(queue->queue_pages);
+
+ return 0;
+}
+
+int ipz_queue_dtor(struct ehca_pd *pd, struct ipz_queue *queue)
+{
+ int i, nr_pages;
+
+ if (!queue || !queue->queue_pages) {
+ ehca_gen_dbg("queue or queue_pages is NULL");
+ return 0;
+ }
+
+ if (queue->small_page)
+ free_small_queue_page(queue, pd);
+ else {
+ nr_pages = queue->queue_length / queue->pagesize;
+ for (i = 0; i < nr_pages; i += PAGES_PER_KPAGE)
+ free_page((unsigned long)queue->queue_pages[i]);
+ }
+
+ if (is_vmalloc_addr(queue->queue_pages))
+ vfree(queue->queue_pages);
+ else
+ kfree(queue->queue_pages);
+
+ return 1;
+}
+
+int ehca_init_small_qp_cache(void)
+{
+ small_qp_cache = kmem_cache_create("ehca_cache_small_qp",
+ sizeof(struct ipz_small_queue_page),
+ 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (!small_qp_cache)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void ehca_cleanup_small_qp_cache(void)
+{
+ kmem_cache_destroy(small_qp_cache);
+}
diff --git a/drivers/infiniband/hw/ehca/ipz_pt_fn.h b/drivers/infiniband/hw/ehca/ipz_pt_fn.h
new file mode 100644
index 000000000..a801274ea
--- /dev/null
+++ b/drivers/infiniband/hw/ehca/ipz_pt_fn.h
@@ -0,0 +1,289 @@
+/*
+ * IBM eServer eHCA Infiniband device driver for Linux on POWER
+ *
+ * internal queue handling
+ *
+ * Authors: Waleri Fomin <fomin@de.ibm.com>
+ * Reinhard Ernst <rernst@de.ibm.com>
+ * Christoph Raisch <raisch@de.ibm.com>
+ *
+ * Copyright (c) 2005 IBM Corporation
+ *
+ * All rights reserved.
+ *
+ * This source code is distributed under a dual license of GPL v2.0 and OpenIB
+ * BSD.
+ *
+ * OpenIB BSD License
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER
+ * IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __IPZ_PT_FN_H__
+#define __IPZ_PT_FN_H__
+
+#define EHCA_PAGESHIFT 12
+#define EHCA_PAGESIZE 4096UL
+#define EHCA_PAGEMASK (~(EHCA_PAGESIZE-1))
+#define EHCA_PT_ENTRIES 512UL
+
+#include "ehca_tools.h"
+#include "ehca_qes.h"
+
+struct ehca_pd;
+struct ipz_small_queue_page;
+
+extern struct kmem_cache *small_qp_cache;
+
+/* struct generic ehca page */
+struct ipz_page {
+ u8 entries[EHCA_PAGESIZE];
+};
+
+#define IPZ_SPAGE_PER_KPAGE (PAGE_SIZE / 512)
+
+struct ipz_small_queue_page {
+ unsigned long page;
+ unsigned long bitmap[IPZ_SPAGE_PER_KPAGE / BITS_PER_LONG];
+ int fill;
+ void *mapped_addr;
+ u32 mmap_count;
+ struct list_head list;
+};
+
+/* struct generic queue in linux kernel virtual memory (kv) */
+struct ipz_queue {
+ u64 current_q_offset; /* current queue entry */
+
+ struct ipz_page **queue_pages; /* array of pages belonging to queue */
+ u32 qe_size; /* queue entry size */
+ u32 act_nr_of_sg;
+ u32 queue_length; /* queue length allocated in bytes */
+ u32 pagesize;
+ u32 toggle_state; /* toggle flag - per page */
+ u32 offset; /* save offset within page for small_qp */
+ struct ipz_small_queue_page *small_page;
+};
+
+/*
+ * return current Queue Entry for a certain q_offset
+ * returns address (kv) of Queue Entry
+ */
+static inline void *ipz_qeit_calc(struct ipz_queue *queue, u64 q_offset)
+{
+ struct ipz_page *current_page;
+ if (q_offset >= queue->queue_length)
+ return NULL;
+ current_page = (queue->queue_pages)[q_offset >> EHCA_PAGESHIFT];
+ return &current_page->entries[q_offset & (EHCA_PAGESIZE - 1)];
+}
+
+/*
+ * return current Queue Entry
+ * returns address (kv) of Queue Entry
+ */
+static inline void *ipz_qeit_get(struct ipz_queue *queue)
+{
+ return ipz_qeit_calc(queue, queue->current_q_offset);
+}
+
+/*
+ * return current Queue Page, increment Queue Page iterator from
+ * page to page in struct ipz_queue; the last increment returns NULL and
+ * does NOT wrap
+ * returns address (kv) of Queue Page
+ * warning: don't use in parallel with ipz_qeit_get_inc()
+ */
+void *ipz_qpageit_get_inc(struct ipz_queue *queue);
+
+/*
+ * return current Queue Entry, increment Queue Entry iterator by one
+ * step in struct ipz_queue, will wrap in ringbuffer
+ * returns address (kv) of Queue Entry BEFORE increment
+ * warning don't use in parallel with ipz_qpageit_get_inc()
+ */
+static inline void *ipz_qeit_get_inc(struct ipz_queue *queue)
+{
+ void *ret = ipz_qeit_get(queue);
+ queue->current_q_offset += queue->qe_size;
+ if (queue->current_q_offset >= queue->queue_length) {
+ queue->current_q_offset = 0;
+ /* toggle the valid flag */
+ queue->toggle_state = (~queue->toggle_state) & 1;
+ }
+
+ return ret;
+}
+
+/*
+ * return a bool indicating whether current Queue Entry is valid
+ */
+static inline int ipz_qeit_is_valid(struct ipz_queue *queue)
+{
+ struct ehca_cqe *cqe = ipz_qeit_get(queue);
+ return ((cqe->cqe_flags >> 7) == (queue->toggle_state & 1));
+}
+
+/*
+ * return current Queue Entry, increment Queue Entry iterator by one
+ * step in struct ipz_queue, will wrap in ringbuffer
+ * returns address (kv) of Queue Entry BEFORE increment
+ * returns 0 and does not increment, if wrong valid state
+ * warning don't use in parallel with ipz_qpageit_get_inc()
+ */
+static inline void *ipz_qeit_get_inc_valid(struct ipz_queue *queue)
+{
+ return ipz_qeit_is_valid(queue) ? ipz_qeit_get_inc(queue) : NULL;
+}
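+
+/*
+ * Editor's illustrative sketch (not part of the original header): a typical
+ * completion-queue consumer loops on ipz_qeit_get_inc_valid() until it
+ * returns NULL, i.e. until the toggle bit of the next entry no longer
+ * matches the queue's toggle_state.
+ */
+static inline int ipz_example_drain_queue(struct ipz_queue *queue)
+{
+	int nr = 0;
+
+	/* each returned pointer is really a struct ehca_cqe * (ehca_qes.h);
+	 * a real consumer would translate it into an ib_wc */
+	while (ipz_qeit_get_inc_valid(queue))
+		nr++;
+
+	return nr;	/* number of valid entries consumed */
+}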
+
+/*
+ * returns and resets Queue Entry iterator
+ * returns address (kv) of first Queue Entry
+ */
+static inline void *ipz_qeit_reset(struct ipz_queue *queue)
+{
+ queue->current_q_offset = 0;
+ return ipz_qeit_get(queue);
+}
+
+/*
+ * return the q_offset corresponding to an absolute address
+ */
+int ipz_queue_abs_to_offset(struct ipz_queue *queue, u64 addr, u64 *q_offset);
+
+/*
+ * return the next queue offset. don't modify the queue.
+ */
+static inline u64 ipz_queue_advance_offset(struct ipz_queue *queue, u64 offset)
+{
+ offset += queue->qe_size;
+	if (offset >= queue->queue_length)
+		offset = 0;
+ return offset;
+}
+
+/* struct generic page table */
+struct ipz_pt {
+ u64 entries[EHCA_PT_ENTRIES];
+};
+
+/* struct page table for a queue, only to be used in pf */
+struct ipz_qpt {
+ /* queue page tables (kv), use u64 because we know the element length */
+ u64 *qpts;
+ u32 n_qpts;
+ u32 n_ptes; /* number of page table entries */
+ u64 *current_pte_addr;
+};
+
+/*
+ * constructor for an ipz_queue_t, placement new for ipz_queue_t,
+ * new for all dependent data structures
+ * all QP Tables are the same
+ * flow:
+ * allocate+pin queue
+ * see ipz_qpt_ctor()
+ * returns true if ok, false if out of memory
+ */
+int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue,
+ const u32 nr_of_pages, const u32 pagesize,
+ const u32 qe_size, const u32 nr_of_sg,
+ int is_small);
+
+/*
+ * destructor for an ipz_queue_t
+ * -# free queue
+ * see ipz_queue_ctor()
+ * returns true if ok, false if queue was a NULL pointer or the free failed
+ */
+int ipz_queue_dtor(struct ehca_pd *pd, struct ipz_queue *queue);
+
+/*
+ * constructor for an ipz_qpt_t,
+ * placement new for struct ipz_queue, new for all dependent data structures
+ * all QP Tables are the same,
+ * flow:
+ * -# allocate+pin queue
+ * -# initialise ptcb
+ * -# allocate+pin PTs
+ * -# link PTs to a ring, according to HCA Arch, set bit62 if needed
+ * -# the ring must have room for exactly nr_of_PTEs
+ * see ipz_qpt_ctor()
+ */
+void ipz_qpt_ctor(struct ipz_qpt *qpt,
+ const u32 nr_of_qes,
+ const u32 pagesize,
+ const u32 qe_size,
+ const u8 lowbyte, const u8 toggle,
+ u32 * act_nr_of_QEs, u32 * act_nr_of_pages);
+
+/*
+ * return current Queue Entry, increment Queue Entry iterator by one
+ * step in struct ipz_queue, will wrap in ringbuffer
+ * returns address (kv) of Queue Entry BEFORE increment
+ * warning don't use in parallel with ipz_qpageit_get_inc()
+ * warning unpredictable results may occur if steps>act_nr_of_queue_entries
+ * fix EQ page problems
+ */
+void *ipz_qeit_eq_get_inc(struct ipz_queue *queue);
+
+/*
+ * return current Event Queue Entry, increment Queue Entry iterator
+ * by one step in struct ipz_queue if valid, will wrap in ringbuffer
+ * returns address (kv) of Queue Entry BEFORE increment
+ * returns 0 and does not increment, if wrong valid state
+ * warning: don't use in parallel with ipz_qpageit_get_inc()
+ * warning unpredictable results may occur if steps>act_nr_of_queue_entries
+ */
+static inline void *ipz_eqit_eq_get_inc_valid(struct ipz_queue *queue)
+{
+ void *ret = ipz_qeit_get(queue);
+ u32 qe = *(u8 *)ret;
+ if ((qe >> 7) != (queue->toggle_state & 1))
+ return NULL;
+ ipz_qeit_eq_get_inc(queue); /* this is a good one */
+ return ret;
+}
+
+static inline void *ipz_eqit_eq_peek_valid(struct ipz_queue *queue)
+{
+ void *ret = ipz_qeit_get(queue);
+ u32 qe = *(u8 *)ret;
+ if ((qe >> 7) != (queue->toggle_state & 1))
+ return NULL;
+ return ret;
+}
+
+/* returns address (GX) of first queue entry */
+static inline u64 ipz_qpt_get_firstpage(struct ipz_qpt *qpt)
+{
+ return be64_to_cpu(qpt->qpts[0]);
+}
+
+/* returns address (kv) of first page of queue page table */
+static inline void *ipz_qpt_get_qpt(struct ipz_qpt *qpt)
+{
+ return qpt->qpts;
+}
+
+#endif /* __IPZ_PT_FN_H__ */
diff --git a/drivers/infiniband/hw/ipath/Kconfig b/drivers/infiniband/hw/ipath/Kconfig
new file mode 100644
index 000000000..1d9bb115c
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/Kconfig
@@ -0,0 +1,11 @@
+config INFINIBAND_IPATH
+ tristate "QLogic HTX HCA support"
+ depends on 64BIT && NET && HT_IRQ
+ ---help---
+ This is a driver for the obsolete QLogic Hyper-Transport
+ IB host channel adapter (model QHT7140),
+ including InfiniBand verbs support. This driver allows these
+	  devices to be used with both kernel upper level protocols such
+	  as IP-over-InfiniBand and userspace applications
+ (in conjunction with InfiniBand userspace access).
+ For QLogic PCIe QLE based cards, use the QIB driver instead.
diff --git a/drivers/infiniband/hw/ipath/Makefile b/drivers/infiniband/hw/ipath/Makefile
new file mode 100644
index 000000000..4496f2820
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/Makefile
@@ -0,0 +1,37 @@
+ccflags-y := -DIPATH_IDSTR='"QLogic kernel.org driver"' \
+ -DIPATH_KERN_TYPE=0
+
+obj-$(CONFIG_INFINIBAND_IPATH) += ib_ipath.o
+
+ib_ipath-y := \
+ ipath_cq.o \
+ ipath_diag.o \
+ ipath_dma.o \
+ ipath_driver.o \
+ ipath_eeprom.o \
+ ipath_file_ops.o \
+ ipath_fs.o \
+ ipath_init_chip.o \
+ ipath_intr.o \
+ ipath_keys.o \
+ ipath_mad.o \
+ ipath_mmap.o \
+ ipath_mr.o \
+ ipath_qp.o \
+ ipath_rc.o \
+ ipath_ruc.o \
+ ipath_sdma.o \
+ ipath_srq.o \
+ ipath_stats.o \
+ ipath_sysfs.o \
+ ipath_uc.o \
+ ipath_ud.o \
+ ipath_user_pages.o \
+ ipath_user_sdma.o \
+ ipath_verbs_mcast.o \
+ ipath_verbs.o
+
+ib_ipath-$(CONFIG_HT_IRQ) += ipath_iba6110.o
+
+ib_ipath-$(CONFIG_X86_64) += ipath_wc_x86_64.o
+ib_ipath-$(CONFIG_PPC64) += ipath_wc_ppc64.o
diff --git a/drivers/infiniband/hw/ipath/ipath_common.h b/drivers/infiniband/hw/ipath/ipath_common.h
new file mode 100644
index 000000000..28cfe97cf
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_common.h
@@ -0,0 +1,851 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPATH_COMMON_H
+#define _IPATH_COMMON_H
+
+/*
+ * This file contains defines, structures, etc. that are used
+ * to communicate between kernel and user code.
+ */
+
+
+/* This is the IEEE-assigned OUI for QLogic Inc. InfiniPath */
+#define IPATH_SRC_OUI_1 0x00
+#define IPATH_SRC_OUI_2 0x11
+#define IPATH_SRC_OUI_3 0x75
+
+/* version of protocol header (known to chip also). In the long run,
+ * we should be able to generate and accept a range of version numbers;
+ * for now we only accept one, and it's compiled in.
+ */
+#define IPS_PROTO_VERSION 2
+
+/*
+ * These are compile time constants that you may want to enable or disable
+ * if you are trying to debug problems with code or performance.
+ * IPATH_VERBOSE_TRACING define as 1 if you want additional tracing in
+ * fastpath code
+ * IPATH_TRACE_REGWRITES define as 1 if you want register writes to be
+ * traced in fastpath code
+ * _IPATH_TRACING define as 0 if you want to remove all tracing in a
+ * compilation unit
+ * _IPATH_DEBUGGING define as 0 if you want to remove debug prints
+ */
+
+/*
+ * The value in the BTH QP field that InfiniPath uses to differentiate
+ * an infinipath protocol IB packet vs standard IB transport
+ */
+#define IPATH_KD_QP 0x656b79
+
+/*
+ * valid states passed to ipath_set_linkstate() user call
+ */
+#define IPATH_IB_LINKDOWN 0
+#define IPATH_IB_LINKARM 1
+#define IPATH_IB_LINKACTIVE 2
+#define IPATH_IB_LINKDOWN_ONLY 3
+#define IPATH_IB_LINKDOWN_SLEEP 4
+#define IPATH_IB_LINKDOWN_DISABLE 5
+#define IPATH_IB_LINK_LOOPBACK 6 /* enable local loopback */
+#define IPATH_IB_LINK_EXTERNAL 7 /* normal, disable local loopback */
+#define IPATH_IB_LINK_NO_HRTBT 8 /* disable Heartbeat, e.g. for loopback */
+#define IPATH_IB_LINK_HRTBT 9 /* enable heartbeat, normal, non-loopback */
+
+/*
+ * These 3 values (SDR and DDR may be ORed for auto-speed
+ * negotiation) are used for the 3rd argument to path_f_set_ib_cfg
+ * with cmd IPATH_IB_CFG_SPD_ENB, by direct calls or via sysfs. They
+ * are also the the possible values for ipath_link_speed_enabled and active
+ * The values were chosen to match values used within the IB spec.
+ */
+#define IPATH_IB_SDR 1
+#define IPATH_IB_DDR 2
+
+/*
+ * stats maintained by the driver. For now, at least, this is global
+ * to all minor devices.
+ */
+struct infinipath_stats {
+ /* number of interrupts taken */
+ __u64 sps_ints;
+ /* number of interrupts for errors */
+ __u64 sps_errints;
+ /* number of errors from chip (not incl. packet errors or CRC) */
+ __u64 sps_errs;
+ /* number of packet errors from chip other than CRC */
+ __u64 sps_pkterrs;
+ /* number of packets with CRC errors (ICRC and VCRC) */
+ __u64 sps_crcerrs;
+ /* number of hardware errors reported (parity, etc.) */
+ __u64 sps_hwerrs;
+ /* number of times IB link changed state unexpectedly */
+ __u64 sps_iblink;
+ __u64 sps_unused; /* was fastrcvint, no longer implemented */
+ /* number of kernel (port0) packets received */
+ __u64 sps_port0pkts;
+ /* number of "ethernet" packets sent by driver */
+ __u64 sps_ether_spkts;
+ /* number of "ethernet" packets received by driver */
+ __u64 sps_ether_rpkts;
+ /* number of SMA packets sent by driver. Obsolete. */
+ __u64 sps_sma_spkts;
+ /* number of SMA packets received by driver. Obsolete. */
+ __u64 sps_sma_rpkts;
+ /* number of times all ports rcvhdrq was full and packet dropped */
+ __u64 sps_hdrqfull;
+ /* number of times all ports egrtid was full and packet dropped */
+ __u64 sps_etidfull;
+ /*
+ * number of times we tried to send from driver, but no pio buffers
+ * avail
+ */
+ __u64 sps_nopiobufs;
+ /* number of ports currently open */
+ __u64 sps_ports;
+ /* list of pkeys (other than default) accepted (0 means not set) */
+ __u16 sps_pkeys[4];
+ __u16 sps_unused16[4]; /* available; maintaining compatible layout */
+ /* number of user ports per chip (not IB ports) */
+ __u32 sps_nports;
+ /* not our interrupt, or already handled */
+ __u32 sps_nullintr;
+ /* max number of packets handled per receive call */
+ __u32 sps_maxpkts_call;
+ /* avg number of packets handled per receive call */
+ __u32 sps_avgpkts_call;
+ /* total number of pages locked */
+ __u64 sps_pagelocks;
+ /* total number of pages unlocked */
+ __u64 sps_pageunlocks;
+ /*
+ * Number of packets dropped in kernel other than errors (ether
+ * packets if ipath not configured, etc.)
+ */
+ __u64 sps_krdrops;
+ __u64 sps_txeparity; /* PIO buffer parity error, recovered */
+ /* pad for future growth */
+ __u64 __sps_pad[45];
+};
+
+/*
+ * These are the status bits readable (in ascii form, 64bit value)
+ * from the "status" sysfs file.
+ */
+#define IPATH_STATUS_INITTED 0x1 /* basic initialization done */
+#define IPATH_STATUS_DISABLED 0x2 /* hardware disabled */
+/* Device has been disabled via admin request */
+#define IPATH_STATUS_ADMIN_DISABLED 0x4
+/* Chip has been found and initted */
+#define IPATH_STATUS_CHIP_PRESENT 0x20
+/* IB link is at ACTIVE, usable for data traffic */
+#define IPATH_STATUS_IB_READY 0x40
+/* link is configured, LID, MTU, etc. have been set */
+#define IPATH_STATUS_IB_CONF 0x80
+/* no link established, probably no cable */
+#define IPATH_STATUS_IB_NOCABLE 0x100
+/* A Fatal hardware error has occurred. */
+#define IPATH_STATUS_HWERROR 0x200
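+
+/*
+ * Example (illustration only): the "status" sysfs file exposes these bits
+ * as an ASCII number, so monitoring code might do roughly the following;
+ * how the file contents are read is omitted, and the parsing shown is an
+ * assumption:
+ *
+ *	unsigned long long s = strtoull(status_str, NULL, 0);
+ *	int usable = (s & IPATH_STATUS_INITTED) &&
+ *		     (s & IPATH_STATUS_IB_READY) &&
+ *		     !(s & IPATH_STATUS_HWERROR);
+ */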
+
+/*
+ * The list of usermode accessible registers. Also see Reg_* later in file.
+ */
+typedef enum _ipath_ureg {
+ /* (RO) DMA RcvHdr to be used next. */
+ ur_rcvhdrtail = 0,
+ /* (RW) RcvHdr entry to be processed next by host. */
+ ur_rcvhdrhead = 1,
+ /* (RO) Index of next Eager index to use. */
+ ur_rcvegrindextail = 2,
+ /* (RW) Eager TID to be processed next */
+ ur_rcvegrindexhead = 3,
+ /* For internal use only; max register number. */
+ _IPATH_UregMax
+} ipath_ureg;
+
+/* bit values for spi_runtime_flags */
+#define IPATH_RUNTIME_HT 0x1
+#define IPATH_RUNTIME_PCIE 0x2
+#define IPATH_RUNTIME_FORCE_WC_ORDER 0x4
+#define IPATH_RUNTIME_RCVHDR_COPY 0x8
+#define IPATH_RUNTIME_MASTER 0x10
+#define IPATH_RUNTIME_NODMA_RTAIL 0x80
+#define IPATH_RUNTIME_SDMA 0x200
+#define IPATH_RUNTIME_FORCE_PIOAVAIL 0x400
+#define IPATH_RUNTIME_PIO_REGSWAPPED 0x800
+
+/*
+ * This structure is returned by ipath_userinit() immediately after
+ * open to get implementation-specific info, and info specific to this
+ * instance.
+ *
+ * This struct must have explicit pad fields where type sizes
+ * may result in different alignments between 32 and 64 bit
+ * programs, since the 64 bit kernel requires the user code
+ * to have matching offsets
+ */
+struct ipath_base_info {
+ /* version of hardware, for feature checking. */
+ __u32 spi_hw_version;
+ /* version of software, for feature checking. */
+ __u32 spi_sw_version;
+ /* InfiniPath port assigned, goes into sent packets */
+ __u16 spi_port;
+ __u16 spi_subport;
+ /*
+ * IB MTU, packets' IB data must be less than this.
+ * The MTU is in bytes, and will be a multiple of 4 bytes.
+ */
+ __u32 spi_mtu;
+ /*
+ * Size of a PIO buffer. Any given packet's total size must be less
+ * than this (in words). Included is the starting control word, so
+ * if 513 is returned, then total pkt size is 512 words or less.
+ */
+ __u32 spi_piosize;
+ /* size of the TID cache in infinipath, in entries */
+ __u32 spi_tidcnt;
+ /* size of the TID Eager list in infinipath, in entries */
+ __u32 spi_tidegrcnt;
+ /* size of a single receive header queue entry in words. */
+ __u32 spi_rcvhdrent_size;
+ /*
+ * Count of receive header queue entries allocated.
+ * This may be less than the spu_rcvhdrcnt passed in!
+ */
+ __u32 spi_rcvhdr_cnt;
+
+ /* per-chip and other runtime features bitmap (IPATH_RUNTIME_*) */
+ __u32 spi_runtime_flags;
+
+ /* address where receive buffer queue is mapped into user program */
+ __u64 spi_rcvhdr_base;
+
+ /* base address of eager TID receive buffers. */
+ __u64 spi_rcv_egrbufs;
+
+ /* Allocated by initialization code, not by protocol. */
+
+ /*
+ * Size of each TID buffer in host memory, starting at
+ * spi_rcv_egrbufs. The buffers are virtually contiguous.
+ */
+ __u32 spi_rcv_egrbufsize;
+ /*
+ * The special QP (queue pair) value that identifies an infinipath
+ * protocol packet from standard IB packets. More, probably much
+ * more, to be added.
+ */
+ __u32 spi_qpair;
+
+ /*
+ * User register base for init code, not to be used directly by
+ * protocol or applications.
+ */
+ __u64 __spi_uregbase;
+ /*
+ * Maximum buffer size in bytes that can be used in a single TID
+ * entry (assuming the buffer is aligned to this boundary). This is
+ * the minimum of what the hardware and software support. Guaranteed
+ * to be a power of 2.
+ */
+ __u32 spi_tid_maxsize;
+ /*
+ * alignment of each pio send buffer (byte count
+ * to add to spi_piobufbase to get to second buffer)
+ */
+ __u32 spi_pioalign;
+ /*
+ * The index of the first pio buffer available to this process;
+ * needed to do lookup in spi_pioavailaddr; not added to
+ * spi_piobufbase.
+ */
+ __u32 spi_pioindex;
+ /* number of buffers mapped for this process */
+ __u32 spi_piocnt;
+
+ /*
+ * Base address of writeonly pio buffers for this process.
+ * Each buffer has spi_piosize words, and is aligned on spi_pioalign
+ * boundaries. spi_piocnt buffers are mapped from this address
+ */
+ __u64 spi_piobufbase;
+
+ /*
+ * Base address of readonly memory copy of the pioavail registers.
+ * There are 2 bits for each buffer.
+ */
+ __u64 spi_pioavailaddr;
+
+ /*
+ * Address where driver updates a copy of the interface and driver
+ * status (IPATH_STATUS_*) as a 64 bit value. It's followed by a
+ * string indicating hardware error, if there was one.
+ */
+ __u64 spi_status;
+
+ /* number of chip ports available to user processes */
+ __u32 spi_nports;
+ /* unit number of chip we are using */
+ __u32 spi_unit;
+ /* num bufs in each contiguous set */
+ __u32 spi_rcv_egrperchunk;
+ /* size in bytes of each contiguous set */
+ __u32 spi_rcv_egrchunksize;
+ /* total size of mmap to cover full rcvegrbuffers */
+ __u32 spi_rcv_egrbuftotlen;
+ __u32 spi_filler_for_align;
+ /* address of readonly memory copy of the rcvhdrq tail register. */
+ __u64 spi_rcvhdr_tailaddr;
+
+ /* shared memory pages for subports if port is shared */
+ __u64 spi_subport_uregbase;
+ __u64 spi_subport_rcvegrbuf;
+ __u64 spi_subport_rcvhdr_base;
+
+ /* shared memory page for hardware port if it is shared */
+ __u64 spi_port_uregbase;
+ __u64 spi_port_rcvegrbuf;
+ __u64 spi_port_rcvhdr_base;
+ __u64 spi_port_rcvhdr_tailaddr;
+
+} __attribute__ ((aligned(8)));
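+
+/*
+ * Illustrative sketch (not part of the driver): given the field
+ * descriptions above, user code that has mapped its PIO buffers can
+ * locate buffer "i" (0 <= i < spi_piocnt) at
+ *
+ *	pio_i = base.spi_piobufbase + (__u64)i * base.spi_pioalign;
+ *
+ * where "base" is the struct ipath_base_info filled in at initialization;
+ * the variable names are placeholders for the example only.
+ */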
+
+
+/*
+ * This version number is given to the driver by the user code during
+ * initialization in the spu_userversion field of ipath_user_info, so
+ * the driver can check for compatibility with user code.
+ *
+ * The major version changes when data structures
+ * change in an incompatible way. The driver must be the same or higher
+ * for initialization to succeed. In some cases, a higher version
+ * driver will not interoperate with older software, and initialization
+ * will return an error.
+ */
+#define IPATH_USER_SWMAJOR 1
+
+/*
+ * Minor version differences are always compatible
+ * within a major version; however, if the user software version is
+ * newer than the driver software version, some new features and/or
+ * structure fields may not be implemented; the user code must deal
+ * with this if it cares, or it must abort after initialization reports
+ * the difference.
+ */
+#define IPATH_USER_SWMINOR 6
+
+#define IPATH_USER_SWVERSION ((IPATH_USER_SWMAJOR<<16) | IPATH_USER_SWMINOR)
+
+#define IPATH_KERN_TYPE 0
+
+/*
+ * Similarly, this is the kernel version going back to the user. It's
+ * slightly different, in that we want to tell if the driver was built as
+ * part of a QLogic release, or came from openfabrics.org,
+ * kernel.org, or a standard distribution, for support reasons.
+ * The high bit is 0 for non-QLogic and 1 for QLogic-built/supplied.
+ *
+ * It's returned by the driver to the user code during initialization in the
+ * spi_sw_version field of ipath_base_info, so the user code can in turn
+ * check for compatibility with the kernel.
+ */
+#define IPATH_KERN_SWVERSION ((IPATH_KERN_TYPE<<31) | IPATH_USER_SWVERSION)
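+
+/*
+ * Example (illustration only) of how user code might interpret the
+ * spi_sw_version word returned by the driver; "base" is the returned
+ * struct ipath_base_info (a placeholder name), and the 0x7FFF major mask
+ * is an assumption that follows from the type bit occupying bit 31:
+ *
+ *	__u32 kver = base.spi_sw_version;
+ *	int qlogic_built = (kver >> 31) & 1;
+ *	__u32 major = (kver >> 16) & 0x7FFF;
+ *	__u32 minor = kver & 0xFFFF;
+ *	if (major != IPATH_USER_SWMAJOR)
+ *		return -1;	// incompatible, as described above
+ */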
+
+/*
+ * This structure is passed to ipath_userinit() to tell the driver where
+ * user code buffers are, sizes, etc. The offsets and sizes of the
+ * fields must remain unchanged, for binary compatibility. It can
+ * be extended if userversion is changed, so user code can tell, if needed.
+ */
+struct ipath_user_info {
+ /*
+ * version of user software, to detect compatibility issues.
+ * Should be set to IPATH_USER_SWVERSION.
+ */
+ __u32 spu_userversion;
+
+ /* desired number of receive header queue entries */
+ __u32 spu_rcvhdrcnt;
+
+ /* size of struct base_info to write to */
+ __u32 spu_base_info_size;
+
+ /*
+ * number of words in KD protocol header.
+ * This tells InfiniPath how many words to copy to rcvhdrq. If 0,
+ * kernel uses a default. Once set, attempts to set any other value
+ * are an error (EAGAIN) until driver is reloaded.
+ */
+ __u32 spu_rcvhdrsize;
+
+ /*
+ * If two or more processes wish to share a port, each process
+ * must set the spu_subport_cnt and spu_subport_id to the same
+ * values. The only restriction on the spu_subport_id is that
+ * it be unique for a given node.
+ */
+ __u16 spu_subport_cnt;
+ __u16 spu_subport_id;
+
+ __u32 spu_unused; /* kept for compatible layout */
+
+ /*
+ * address of struct base_info to write to
+ */
+ __u64 spu_base_info;
+
+} __attribute__ ((aligned(8)));
+
+/* User commands. */
+
+#define IPATH_CMD_MIN 16
+
+#define __IPATH_CMD_USER_INIT 16 /* old set up userspace (for old user code) */
+#define IPATH_CMD_PORT_INFO 17 /* find out what resources we got */
+#define IPATH_CMD_RECV_CTRL 18 /* control receipt of packets */
+#define IPATH_CMD_TID_UPDATE 19 /* update expected TID entries */
+#define IPATH_CMD_TID_FREE 20 /* free expected TID entries */
+#define IPATH_CMD_SET_PART_KEY 21 /* add partition key */
+#define __IPATH_CMD_SLAVE_INFO 22 /* return info on slave processes (for old user code) */
+#define IPATH_CMD_ASSIGN_PORT 23 /* allocate HCA and port */
+#define IPATH_CMD_USER_INIT 24 /* set up userspace */
+#define IPATH_CMD_UNUSED_1 25
+#define IPATH_CMD_UNUSED_2 26
+#define IPATH_CMD_PIOAVAILUPD 27 /* force an update of PIOAvail reg */
+#define IPATH_CMD_POLL_TYPE 28 /* set the kind of polling we want */
+#define IPATH_CMD_ARMLAUNCH_CTRL 29 /* armlaunch detection control */
+/* 30 is unused */
+#define IPATH_CMD_SDMA_INFLIGHT 31 /* sdma inflight counter request */
+#define IPATH_CMD_SDMA_COMPLETE 32 /* sdma completion counter request */
+
+/*
+ * Poll types
+ */
+#define IPATH_POLL_TYPE_URGENT 0x01
+#define IPATH_POLL_TYPE_OVERFLOW 0x02
+
+struct ipath_port_info {
+ __u32 num_active; /* number of active units */
+ __u32 unit; /* unit (chip) assigned to caller */
+ __u16 port; /* port on unit assigned to caller */
+ __u16 subport; /* subport on unit assigned to caller */
+ __u16 num_ports; /* number of ports available on unit */
+ __u16 num_subports; /* number of subports opened on port */
+};
+
+struct ipath_tid_info {
+ __u32 tidcnt;
+ /* make structure same size in 32 and 64 bit */
+ __u32 tid__unused;
+ /* virtual address of first page in transfer */
+ __u64 tidvaddr;
+ /* pointer (same size 32/64 bit) to __u16 tid array */
+ __u64 tidlist;
+
+ /*
+ * pointer (same size 32/64 bit) to bitmap of TIDs used
+ * for this call; checked for being large enough at open
+ */
+ __u64 tidmap;
+};
+
+struct ipath_cmd {
+ __u32 type; /* command type */
+ union {
+ struct ipath_tid_info tid_info;
+ struct ipath_user_info user_info;
+
+ /*
+ * address in userspace where we should put the sdma
+ * inflight counter
+ */
+ __u64 sdma_inflight;
+ /*
+ * address in userspace where we should put the sdma
+ * completion counter
+ */
+ __u64 sdma_complete;
+ /* address in userspace of struct ipath_port_info to
+ write result to */
+ __u64 port_info;
+ /* enable/disable receipt of packets */
+ __u32 recv_ctrl;
+ /* enable/disable armlaunch errors (non-zero to enable) */
+ __u32 armlaunch_ctrl;
+ /* partition key to set */
+ __u16 part_key;
+ /* user address of __u32 bitmask of active slaves */
+ __u64 slave_mask_addr;
+ /* type of polling we want */
+ __u16 poll_type;
+ } cmd;
+};
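+
+/*
+ * Illustrative sketch (not part of the driver): user code fills one of
+ * the structures above and hands it to the driver. The assumption that
+ * the struct is delivered by write()ing it to the opened /dev/ipath
+ * device file comes from the user-mode ABI, not from this header; "fd"
+ * and "base" are placeholders:
+ *
+ *	struct ipath_cmd c = { 0 };
+ *	c.type = IPATH_CMD_ASSIGN_PORT;
+ *	c.cmd.user_info.spu_userversion = IPATH_USER_SWVERSION;
+ *	c.cmd.user_info.spu_base_info_size = sizeof(struct ipath_base_info);
+ *	c.cmd.user_info.spu_base_info = (__u64)(unsigned long)&base;
+ *	if (write(fd, &c, sizeof(c)) != sizeof(c))
+ *		goto fail;	// check errno
+ */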
+
+struct ipath_iovec {
+ /* Pointer to data, but same size 32 and 64 bit */
+ __u64 iov_base;
+
+ /*
+ * Length of data; don't need 64 bits, but want
+ * ipath_sendpkt to remain same size as before 32 bit changes, so...
+ */
+ __u64 iov_len;
+};
+
+/*
+ * Describes a single packet for send. Each packet can have one or more
+ * buffers, but the total length (exclusive of IB headers) must be less
+ * than the MTU, and if using the PIO method, entire packet length,
+ * including IB headers, must be less than the ipath_piosize value (words).
+ * Use of this necessitates including sys/uio.h
+ */
+struct __ipath_sendpkt {
+ __u32 sps_flags; /* flags for packet (TBD) */
+ __u32 sps_cnt; /* number of entries to use in sps_iov */
+ /* array of iov's describing packet. TEMPORARY */
+ struct ipath_iovec sps_iov[4];
+};
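+
+/*
+ * Example (illustration only): describing a two-buffer packet with the
+ * structure above; how the filled struct is then submitted to the driver
+ * is outside the scope of this header and is omitted here. hdr, payload
+ * and their lengths are placeholders:
+ *
+ *	struct __ipath_sendpkt pkt = { 0 };
+ *	pkt.sps_cnt = 2;
+ *	pkt.sps_iov[0].iov_base = (__u64)(unsigned long)hdr;
+ *	pkt.sps_iov[0].iov_len  = hdr_len;
+ *	pkt.sps_iov[1].iov_base = (__u64)(unsigned long)payload;
+ *	pkt.sps_iov[1].iov_len  = payload_len;
+ */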
+
+/*
+ * diagnostics can send a packet by "writing" one of the following
+ * two structs to the diag data special file.
+ * The first is the legacy version, for backward compatibility.
+ */
+struct ipath_diag_pkt {
+ __u32 unit;
+ __u64 data;
+ __u32 len;
+};
+
+/* The second diag_pkt struct is the expanded version that allows
+ * more control over the packet, specifically, by allowing a custom
+ * pbc (+ static rate) qword, so that special modes and deliberate
+ * changes to CRCs can be used. The elements were also re-ordered
+ * for better alignment and to avoid padding issues.
+ */
+struct ipath_diag_xpkt {
+ __u64 data;
+ __u64 pbc_wd;
+ __u32 unit;
+ __u32 len;
+};
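+
+/*
+ * Example (illustration only): a diagnostic tool sends a raw packet by
+ * writing one of the two structs above to the diag data special file;
+ * the driver tells the two layouts apart by the write size (see
+ * ipath_diagpkt_write() in ipath_diag.c). The file descriptor and buffer
+ * names are placeholders:
+ *
+ *	struct ipath_diag_xpkt xp = { 0 };
+ *	xp.unit = 0;
+ *	xp.data = (__u64)(unsigned long)dwords;	// dword-aligned payload
+ *	xp.len  = nbytes;			// must be a multiple of 4
+ *	xp.pbc_wd = 0;				// 0: let the driver build the PBC
+ *	write(diagpkt_fd, &xp, sizeof(xp));
+ */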
+
+/*
+ * Data layout in I2C flash (for GUID, etc.)
+ * All fields are little-endian binary unless otherwise stated
+ */
+#define IPATH_FLASH_VERSION 2
+struct ipath_flash {
+ /* flash layout version (IPATH_FLASH_VERSION) */
+ __u8 if_fversion;
+ /* checksum protecting if_length bytes */
+ __u8 if_csum;
+ /*
+ * valid length (in use, protected by if_csum), including
+ * if_fversion and if_csum themselves
+ */
+ __u8 if_length;
+ /* the GUID, in network order */
+ __u8 if_guid[8];
+ /* number of GUIDs to use, starting from if_guid */
+ __u8 if_numguid;
+ /* the (last 10 characters of) board serial number, in ASCII */
+ char if_serial[12];
+ /* board mfg date (YYYYMMDD ASCII) */
+ char if_mfgdate[8];
+ /* last board rework/test date (YYYYMMDD ASCII) */
+ char if_testdate[8];
+ /* logging of error counts, TBD */
+ __u8 if_errcntp[4];
+ /* powered on hours, updated at driver unload */
+ __u8 if_powerhour[2];
+ /* ASCII free-form comment field */
+ char if_comment[32];
+ /* Backwards compatible prefix for longer QLogic Serial Numbers */
+ char if_sprefix[4];
+ /* 82 bytes used, min flash size is 128 bytes */
+ __u8 if_future[46];
+};
+
+/*
+ * These are the counters implemented in the chip, and are listed in order.
+ * The InterCaps naming is taken straight from the chip spec.
+ */
+struct infinipath_counters {
+ __u64 LBIntCnt;
+ __u64 LBFlowStallCnt;
+ __u64 TxSDmaDescCnt; /* was Reserved1 */
+ __u64 TxUnsupVLErrCnt;
+ __u64 TxDataPktCnt;
+ __u64 TxFlowPktCnt;
+ __u64 TxDwordCnt;
+ __u64 TxLenErrCnt;
+ __u64 TxMaxMinLenErrCnt;
+ __u64 TxUnderrunCnt;
+ __u64 TxFlowStallCnt;
+ __u64 TxDroppedPktCnt;
+ __u64 RxDroppedPktCnt;
+ __u64 RxDataPktCnt;
+ __u64 RxFlowPktCnt;
+ __u64 RxDwordCnt;
+ __u64 RxLenErrCnt;
+ __u64 RxMaxMinLenErrCnt;
+ __u64 RxICRCErrCnt;
+ __u64 RxVCRCErrCnt;
+ __u64 RxFlowCtrlErrCnt;
+ __u64 RxBadFormatCnt;
+ __u64 RxLinkProblemCnt;
+ __u64 RxEBPCnt;
+ __u64 RxLPCRCErrCnt;
+ __u64 RxBufOvflCnt;
+ __u64 RxTIDFullErrCnt;
+ __u64 RxTIDValidErrCnt;
+ __u64 RxPKeyMismatchCnt;
+ __u64 RxP0HdrEgrOvflCnt;
+ __u64 RxP1HdrEgrOvflCnt;
+ __u64 RxP2HdrEgrOvflCnt;
+ __u64 RxP3HdrEgrOvflCnt;
+ __u64 RxP4HdrEgrOvflCnt;
+ __u64 RxP5HdrEgrOvflCnt;
+ __u64 RxP6HdrEgrOvflCnt;
+ __u64 RxP7HdrEgrOvflCnt;
+ __u64 RxP8HdrEgrOvflCnt;
+ __u64 RxP9HdrEgrOvflCnt; /* was Reserved6 */
+ __u64 RxP10HdrEgrOvflCnt; /* was Reserved7 */
+ __u64 RxP11HdrEgrOvflCnt; /* new for IBA7220 */
+ __u64 RxP12HdrEgrOvflCnt; /* new for IBA7220 */
+ __u64 RxP13HdrEgrOvflCnt; /* new for IBA7220 */
+ __u64 RxP14HdrEgrOvflCnt; /* new for IBA7220 */
+ __u64 RxP15HdrEgrOvflCnt; /* new for IBA7220 */
+ __u64 RxP16HdrEgrOvflCnt; /* new for IBA7220 */
+ __u64 IBStatusChangeCnt;
+ __u64 IBLinkErrRecoveryCnt;
+ __u64 IBLinkDownedCnt;
+ __u64 IBSymbolErrCnt;
+ /* The following are new for IBA7220 */
+ __u64 RxVL15DroppedPktCnt;
+ __u64 RxOtherLocalPhyErrCnt;
+ __u64 PcieRetryBufDiagQwordCnt;
+ __u64 ExcessBufferOvflCnt;
+ __u64 LocalLinkIntegrityErrCnt;
+ __u64 RxVlErrCnt;
+ __u64 RxDlidFltrCnt;
+};
+
+/*
+ * The next set of defines are for packet headers, and chip register
+ * and memory bits that are visible to and/or used by user-mode software.
+ * The other bits that are used only by the driver or diags are in
+ * ipath_registers.h
+ */
+
+/* RcvHdrFlags bits */
+#define INFINIPATH_RHF_LENGTH_MASK 0x7FF
+#define INFINIPATH_RHF_LENGTH_SHIFT 0
+#define INFINIPATH_RHF_RCVTYPE_MASK 0x7
+#define INFINIPATH_RHF_RCVTYPE_SHIFT 11
+#define INFINIPATH_RHF_EGRINDEX_MASK 0xFFF
+#define INFINIPATH_RHF_EGRINDEX_SHIFT 16
+#define INFINIPATH_RHF_SEQ_MASK 0xF
+#define INFINIPATH_RHF_SEQ_SHIFT 0
+#define INFINIPATH_RHF_HDRQ_OFFSET_MASK 0x7FF
+#define INFINIPATH_RHF_HDRQ_OFFSET_SHIFT 4
+#define INFINIPATH_RHF_H_ICRCERR 0x80000000
+#define INFINIPATH_RHF_H_VCRCERR 0x40000000
+#define INFINIPATH_RHF_H_PARITYERR 0x20000000
+#define INFINIPATH_RHF_H_LENERR 0x10000000
+#define INFINIPATH_RHF_H_MTUERR 0x08000000
+#define INFINIPATH_RHF_H_IHDRERR 0x04000000
+#define INFINIPATH_RHF_H_TIDERR 0x02000000
+#define INFINIPATH_RHF_H_MKERR 0x01000000
+#define INFINIPATH_RHF_H_IBERR 0x00800000
+#define INFINIPATH_RHF_H_ERR_MASK 0xFF800000
+#define INFINIPATH_RHF_L_USE_EGR 0x80000000
+#define INFINIPATH_RHF_L_SWA 0x00008000
+#define INFINIPATH_RHF_L_SWB 0x00004000
+
+/* infinipath header fields */
+#define INFINIPATH_I_VERS_MASK 0xF
+#define INFINIPATH_I_VERS_SHIFT 28
+#define INFINIPATH_I_PORT_MASK 0xF
+#define INFINIPATH_I_PORT_SHIFT 24
+#define INFINIPATH_I_TID_MASK 0x7FF
+#define INFINIPATH_I_TID_SHIFT 13
+#define INFINIPATH_I_OFFSET_MASK 0x1FFF
+#define INFINIPATH_I_OFFSET_SHIFT 0
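+
+/*
+ * Example (illustration only): these masks and shifts decompose the
+ * little-endian ver_port_tid_offset word of struct ipath_header (defined
+ * later in this file), in the same way ipath_hdrget_ipath_ver() below
+ * extracts the version:
+ *
+ *	__u32 w = __le32_to_cpu(hdr->ver_port_tid_offset);
+ *	vers   = (w >> INFINIPATH_I_VERS_SHIFT)   & INFINIPATH_I_VERS_MASK;
+ *	port   = (w >> INFINIPATH_I_PORT_SHIFT)   & INFINIPATH_I_PORT_MASK;
+ *	tid    = (w >> INFINIPATH_I_TID_SHIFT)    & INFINIPATH_I_TID_MASK;
+ *	offset = (w >> INFINIPATH_I_OFFSET_SHIFT) & INFINIPATH_I_OFFSET_MASK;
+ */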
+
+/* K_PktFlags bits */
+#define INFINIPATH_KPF_INTR 0x1
+#define INFINIPATH_KPF_SUBPORT_MASK 0x3
+#define INFINIPATH_KPF_SUBPORT_SHIFT 1
+
+#define INFINIPATH_MAX_SUBPORT 4
+
+/* SendPIO per-buffer control */
+#define INFINIPATH_SP_TEST 0x40
+#define INFINIPATH_SP_TESTEBP 0x20
+#define INFINIPATH_SP_TRIGGER_SHIFT 15
+
+/* SendPIOAvail bits */
+#define INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT 1
+#define INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT 0
+
+/* infinipath header format */
+struct ipath_header {
+ /*
+ * Version - 4 bits, Port - 4 bits, TID - 10 bits and Offset -
+ * 14 bits before ECO change ~28 Dec 03. After that, Vers 4,
+ * Port 4, TID 11, offset 13.
+ */
+ __le32 ver_port_tid_offset;
+ __le16 chksum;
+ __le16 pkt_flags;
+};
+
+/* infinipath user message header format.
+ * This structure contains the first 4 fields common to all protocols
+ * that employ infinipath.
+ */
+struct ipath_message_header {
+ __be16 lrh[4];
+ __be32 bth[3];
+ /* fields below this point are in host byte order */
+ struct ipath_header iph;
+ __u8 sub_opcode;
+};
+
+/* infinipath ethernet header format */
+struct ether_header {
+ __be16 lrh[4];
+ __be32 bth[3];
+ struct ipath_header iph;
+ __u8 sub_opcode;
+ __u8 cmd;
+ __be16 lid;
+ __u16 mac[3];
+ __u8 frag_num;
+ __u8 seq_num;
+ __le32 len;
+ /* MUST be of word size due to PIO write requirements */
+ __le32 csum;
+ __le16 csum_offset;
+ __le16 flags;
+ __u16 first_2_bytes;
+ __u8 unused[2]; /* currently unused */
+};
+
+
+/* IB - LRH header consts */
+#define IPATH_LRH_GRH 0x0003 /* 1. word of IB LRH - next header: GRH */
+#define IPATH_LRH_BTH 0x0002 /* 1. word of IB LRH - next header: BTH */
+
+/* misc. */
+#define SIZE_OF_CRC 1
+
+#define IPATH_DEFAULT_P_KEY 0xFFFF
+#define IPATH_PERMISSIVE_LID 0xFFFF
+#define IPATH_AETH_CREDIT_SHIFT 24
+#define IPATH_AETH_CREDIT_MASK 0x1F
+#define IPATH_AETH_CREDIT_INVAL 0x1F
+#define IPATH_PSN_MASK 0xFFFFFF
+#define IPATH_MSN_MASK 0xFFFFFF
+#define IPATH_QPN_MASK 0xFFFFFF
+#define IPATH_MULTICAST_LID_BASE 0xC000
+#define IPATH_EAGER_TID_ID INFINIPATH_I_TID_MASK
+#define IPATH_MULTICAST_QPN 0xFFFFFF
+
+/* Receive Header Queue: receive type (from infinipath) */
+#define RCVHQ_RCV_TYPE_EXPECTED 0
+#define RCVHQ_RCV_TYPE_EAGER 1
+#define RCVHQ_RCV_TYPE_NON_KD 2
+#define RCVHQ_RCV_TYPE_ERROR 3
+
+
+/* sub OpCodes - ith4x */
+#define IPATH_ITH4X_OPCODE_ENCAP 0x81
+#define IPATH_ITH4X_OPCODE_LID_ARP 0x82
+
+#define IPATH_HEADER_QUEUE_WORDS 9
+
+/* functions for extracting fields from rcvhdrq entries for the driver.
+ */
+static inline __u32 ipath_hdrget_err_flags(const __le32 * rbuf)
+{
+ return __le32_to_cpu(rbuf[1]) & INFINIPATH_RHF_H_ERR_MASK;
+}
+
+static inline __u32 ipath_hdrget_rcv_type(const __le32 * rbuf)
+{
+ return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_RCVTYPE_SHIFT)
+ & INFINIPATH_RHF_RCVTYPE_MASK;
+}
+
+static inline __u32 ipath_hdrget_length_in_bytes(const __le32 * rbuf)
+{
+ return ((__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_LENGTH_SHIFT)
+ & INFINIPATH_RHF_LENGTH_MASK) << 2;
+}
+
+static inline __u32 ipath_hdrget_index(const __le32 * rbuf)
+{
+ return (__le32_to_cpu(rbuf[0]) >> INFINIPATH_RHF_EGRINDEX_SHIFT)
+ & INFINIPATH_RHF_EGRINDEX_MASK;
+}
+
+static inline __u32 ipath_hdrget_seq(const __le32 *rbuf)
+{
+ return (__le32_to_cpu(rbuf[1]) >> INFINIPATH_RHF_SEQ_SHIFT)
+ & INFINIPATH_RHF_SEQ_MASK;
+}
+
+static inline __u32 ipath_hdrget_offset(const __le32 *rbuf)
+{
+ return (__le32_to_cpu(rbuf[1]) >> INFINIPATH_RHF_HDRQ_OFFSET_SHIFT)
+ & INFINIPATH_RHF_HDRQ_OFFSET_MASK;
+}
+
+static inline __u32 ipath_hdrget_use_egr_buf(const __le32 *rbuf)
+{
+ return __le32_to_cpu(rbuf[0]) & INFINIPATH_RHF_L_USE_EGR;
+}
+
+static inline __u32 ipath_hdrget_ipath_ver(__le32 hdrword)
+{
+ return (__le32_to_cpu(hdrword) >> INFINIPATH_I_VERS_SHIFT)
+ & INFINIPATH_I_VERS_MASK;
+}
+
+#endif /* _IPATH_COMMON_H */
diff --git a/drivers/infiniband/hw/ipath/ipath_cq.c b/drivers/infiniband/hw/ipath/ipath_cq.c
new file mode 100644
index 000000000..0416c6c0e
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_cq.c
@@ -0,0 +1,478 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_verbs.h"
+
+/**
+ * ipath_cq_enter - add a new entry to the completion queue
+ * @cq: completion queue
+ * @entry: work completion entry to add
+ * @solicited: true if @entry is a solicited entry
+ *
+ * This may be called with qp->s_lock held.
+ */
+void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited)
+{
+ struct ipath_cq_wc *wc;
+ unsigned long flags;
+ u32 head;
+ u32 next;
+
+ spin_lock_irqsave(&cq->lock, flags);
+
+ /*
+ * Note that the head pointer might be writable by user processes.
+ * Take care to verify it is a sane value.
+ */
+ wc = cq->queue;
+ head = wc->head;
+ if (head >= (unsigned) cq->ibcq.cqe) {
+ head = cq->ibcq.cqe;
+ next = 0;
+ } else
+ next = head + 1;
+ if (unlikely(next == wc->tail)) {
+ spin_unlock_irqrestore(&cq->lock, flags);
+ if (cq->ibcq.event_handler) {
+ struct ib_event ev;
+
+ ev.device = cq->ibcq.device;
+ ev.element.cq = &cq->ibcq;
+ ev.event = IB_EVENT_CQ_ERR;
+ cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
+ }
+ return;
+ }
+ if (cq->ip) {
+ wc->uqueue[head].wr_id = entry->wr_id;
+ wc->uqueue[head].status = entry->status;
+ wc->uqueue[head].opcode = entry->opcode;
+ wc->uqueue[head].vendor_err = entry->vendor_err;
+ wc->uqueue[head].byte_len = entry->byte_len;
+ wc->uqueue[head].ex.imm_data = (__u32 __force) entry->ex.imm_data;
+ wc->uqueue[head].qp_num = entry->qp->qp_num;
+ wc->uqueue[head].src_qp = entry->src_qp;
+ wc->uqueue[head].wc_flags = entry->wc_flags;
+ wc->uqueue[head].pkey_index = entry->pkey_index;
+ wc->uqueue[head].slid = entry->slid;
+ wc->uqueue[head].sl = entry->sl;
+ wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits;
+ wc->uqueue[head].port_num = entry->port_num;
+ /* Make sure entry is written before the head index. */
+ smp_wmb();
+ } else
+ wc->kqueue[head] = *entry;
+ wc->head = next;
+
+ if (cq->notify == IB_CQ_NEXT_COMP ||
+ (cq->notify == IB_CQ_SOLICITED && solicited)) {
+ cq->notify = IB_CQ_NONE;
+ cq->triggered++;
+ /*
+ * This will cause send_complete() to be called in
+ * another thread.
+ */
+ tasklet_hi_schedule(&cq->comptask);
+ }
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+ if (entry->status != IB_WC_SUCCESS)
+ to_idev(cq->ibcq.device)->n_wqe_errs++;
+}
+
+/**
+ * ipath_poll_cq - poll for work completion entries
+ * @ibcq: the completion queue to poll
+ * @num_entries: the maximum number of entries to return
+ * @entry: pointer to array where work completions are placed
+ *
+ * Returns the number of completion entries polled.
+ *
+ * This may be called from interrupt context. Also called by ib_poll_cq()
+ * in the generic verbs code.
+ */
+int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
+{
+ struct ipath_cq *cq = to_icq(ibcq);
+ struct ipath_cq_wc *wc;
+ unsigned long flags;
+ int npolled;
+ u32 tail;
+
+ /* The kernel can only poll a kernel completion queue */
+ if (cq->ip) {
+ npolled = -EINVAL;
+ goto bail;
+ }
+
+ spin_lock_irqsave(&cq->lock, flags);
+
+ wc = cq->queue;
+ tail = wc->tail;
+ if (tail > (u32) cq->ibcq.cqe)
+ tail = (u32) cq->ibcq.cqe;
+ for (npolled = 0; npolled < num_entries; ++npolled, ++entry) {
+ if (tail == wc->head)
+ break;
+ /* The kernel doesn't need a RMB since it has the lock. */
+ *entry = wc->kqueue[tail];
+ if (tail >= cq->ibcq.cqe)
+ tail = 0;
+ else
+ tail++;
+ }
+ wc->tail = tail;
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+bail:
+ return npolled;
+}
+
+static void send_complete(unsigned long data)
+{
+ struct ipath_cq *cq = (struct ipath_cq *)data;
+
+ /*
+ * The completion handler will most likely rearm the notification
+ * and poll for all pending entries. If a new completion entry
+ * is added while we are in this routine, tasklet_hi_schedule()
+ * won't call us again until we return so we check triggered to
+ * see if we need to call the handler again.
+ */
+ for (;;) {
+ u8 triggered = cq->triggered;
+
+ cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+
+ if (cq->triggered == triggered)
+ return;
+ }
+}
+
+/**
+ * ipath_create_cq - create a completion queue
+ * @ibdev: the device this completion queue is attached to
+ * @entries: the minimum size of the completion queue
+ * @comp_vector: unused by the InfiniPath driver
+ * @context: unused by the InfiniPath driver
+ * @udata: unused by the InfiniPath driver
+ *
+ * Returns a pointer to the completion queue or negative errno values
+ * for failure.
+ *
+ * Called by ib_create_cq() in the generic verbs code.
+ */
+struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, int comp_vector,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ struct ipath_cq *cq;
+ struct ipath_cq_wc *wc;
+ struct ib_cq *ret;
+ u32 sz;
+
+ if (entries < 1 || entries > ib_ipath_max_cqes) {
+ ret = ERR_PTR(-EINVAL);
+ goto done;
+ }
+
+ /* Allocate the completion queue structure. */
+ cq = kmalloc(sizeof(*cq), GFP_KERNEL);
+ if (!cq) {
+ ret = ERR_PTR(-ENOMEM);
+ goto done;
+ }
+
+ /*
+ * Allocate the completion queue entries and head/tail pointers.
+ * This is allocated separately so that it can be resized and
+ * also mapped into user space.
+ * We need to use vmalloc() in order to support mmap and large
+ * numbers of entries.
+ */
+ sz = sizeof(*wc);
+ if (udata && udata->outlen >= sizeof(__u64))
+ sz += sizeof(struct ib_uverbs_wc) * (entries + 1);
+ else
+ sz += sizeof(struct ib_wc) * (entries + 1);
+ wc = vmalloc_user(sz);
+ if (!wc) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_cq;
+ }
+
+ /*
+ * Return the address of the WC as the offset to mmap.
+ * See ipath_mmap() for details.
+ */
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ int err;
+
+ cq->ip = ipath_create_mmap_info(dev, sz, context, wc);
+ if (!cq->ip) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_wc;
+ }
+
+ err = ib_copy_to_udata(udata, &cq->ip->offset,
+ sizeof(cq->ip->offset));
+ if (err) {
+ ret = ERR_PTR(err);
+ goto bail_ip;
+ }
+ } else
+ cq->ip = NULL;
+
+ spin_lock(&dev->n_cqs_lock);
+ if (dev->n_cqs_allocated == ib_ipath_max_cqs) {
+ spin_unlock(&dev->n_cqs_lock);
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_ip;
+ }
+
+ dev->n_cqs_allocated++;
+ spin_unlock(&dev->n_cqs_lock);
+
+ if (cq->ip) {
+ spin_lock_irq(&dev->pending_lock);
+ list_add(&cq->ip->pending_mmaps, &dev->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+ }
+
+ /*
+ * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe.
+ * The number of entries should be >= the number requested or return
+ * an error.
+ */
+ cq->ibcq.cqe = entries;
+ cq->notify = IB_CQ_NONE;
+ cq->triggered = 0;
+ spin_lock_init(&cq->lock);
+ tasklet_init(&cq->comptask, send_complete, (unsigned long)cq);
+ wc->head = 0;
+ wc->tail = 0;
+ cq->queue = wc;
+
+ ret = &cq->ibcq;
+
+ goto done;
+
+bail_ip:
+ kfree(cq->ip);
+bail_wc:
+ vfree(wc);
+bail_cq:
+ kfree(cq);
+done:
+ return ret;
+}
+
+/**
+ * ipath_destroy_cq - destroy a completion queue
+ * @ibcq: the completion queue to destroy.
+ *
+ * Returns 0 for success.
+ *
+ * Called by ib_destroy_cq() in the generic verbs code.
+ */
+int ipath_destroy_cq(struct ib_cq *ibcq)
+{
+ struct ipath_ibdev *dev = to_idev(ibcq->device);
+ struct ipath_cq *cq = to_icq(ibcq);
+
+ tasklet_kill(&cq->comptask);
+ spin_lock(&dev->n_cqs_lock);
+ dev->n_cqs_allocated--;
+ spin_unlock(&dev->n_cqs_lock);
+ if (cq->ip)
+ kref_put(&cq->ip->ref, ipath_release_mmap_info);
+ else
+ vfree(cq->queue);
+ kfree(cq);
+
+ return 0;
+}
+
+/**
+ * ipath_req_notify_cq - change the notification type for a completion queue
+ * @ibcq: the completion queue
+ * @notify_flags: the type of notification to request
+ *
+ * Returns 0 for success.
+ *
+ * This may be called from interrupt context. Also called by
+ * ib_req_notify_cq() in the generic verbs code.
+ */
+int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
+{
+ struct ipath_cq *cq = to_icq(ibcq);
+ unsigned long flags;
+ int ret = 0;
+
+ spin_lock_irqsave(&cq->lock, flags);
+ /*
+ * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow
+ * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2).
+ */
+ if (cq->notify != IB_CQ_NEXT_COMP)
+ cq->notify = notify_flags & IB_CQ_SOLICITED_MASK;
+
+ if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) &&
+ cq->queue->head != cq->queue->tail)
+ ret = 1;
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+ return ret;
+}
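+
+/*
+ * Example (illustration only): the IB_CQ_REPORT_MISSED_EVENTS return
+ * value above supports the usual verbs consumer pattern of draining the
+ * CQ, re-arming, and re-polling if completions raced with the re-arm;
+ * handle() and the label are placeholders:
+ *
+ *	repoll:
+ *	while (ib_poll_cq(cq, 1, &wc) > 0)
+ *		handle(&wc);
+ *	if (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP |
+ *			     IB_CQ_REPORT_MISSED_EVENTS) > 0)
+ *		goto repoll;
+ */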
+
+/**
+ * ipath_resize_cq - change the size of the CQ
+ * @ibcq: the completion queue
+ * @cqe: the new minimum number of entries
+ * @udata: used to return the mmap offset of the resized queue to user space
+ *
+ * Returns 0 for success.
+ */
+int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
+{
+ struct ipath_cq *cq = to_icq(ibcq);
+ struct ipath_cq_wc *old_wc;
+ struct ipath_cq_wc *wc;
+ u32 head, tail, n;
+ int ret;
+ u32 sz;
+
+ if (cqe < 1 || cqe > ib_ipath_max_cqes) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ /*
+ * Need to use vmalloc() if we want to support large #s of entries.
+ */
+ sz = sizeof(*wc);
+ if (udata && udata->outlen >= sizeof(__u64))
+ sz += sizeof(struct ib_uverbs_wc) * (cqe + 1);
+ else
+ sz += sizeof(struct ib_wc) * (cqe + 1);
+ wc = vmalloc_user(sz);
+ if (!wc) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ /* Check that we can write the offset to mmap. */
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ __u64 offset = 0;
+
+ ret = ib_copy_to_udata(udata, &offset, sizeof(offset));
+ if (ret)
+ goto bail_free;
+ }
+
+ spin_lock_irq(&cq->lock);
+ /*
+ * Make sure head and tail are sane since they
+ * might be user writable.
+ */
+ old_wc = cq->queue;
+ head = old_wc->head;
+ if (head > (u32) cq->ibcq.cqe)
+ head = (u32) cq->ibcq.cqe;
+ tail = old_wc->tail;
+ if (tail > (u32) cq->ibcq.cqe)
+ tail = (u32) cq->ibcq.cqe;
+ if (head < tail)
+ n = cq->ibcq.cqe + 1 + head - tail;
+ else
+ n = head - tail;
+ if (unlikely((u32)cqe < n)) {
+ ret = -EINVAL;
+ goto bail_unlock;
+ }
+ for (n = 0; tail != head; n++) {
+ if (cq->ip)
+ wc->uqueue[n] = old_wc->uqueue[tail];
+ else
+ wc->kqueue[n] = old_wc->kqueue[tail];
+ if (tail == (u32) cq->ibcq.cqe)
+ tail = 0;
+ else
+ tail++;
+ }
+ cq->ibcq.cqe = cqe;
+ wc->head = n;
+ wc->tail = 0;
+ cq->queue = wc;
+ spin_unlock_irq(&cq->lock);
+
+ vfree(old_wc);
+
+ if (cq->ip) {
+ struct ipath_ibdev *dev = to_idev(ibcq->device);
+ struct ipath_mmap_info *ip = cq->ip;
+
+ ipath_update_mmap_info(dev, ip, sz, wc);
+
+ /*
+ * Return the offset to mmap.
+ * See ipath_mmap() for details.
+ */
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ ret = ib_copy_to_udata(udata, &ip->offset,
+ sizeof(ip->offset));
+ if (ret)
+ goto bail;
+ }
+
+ spin_lock_irq(&dev->pending_lock);
+ if (list_empty(&ip->pending_mmaps))
+ list_add(&ip->pending_mmaps, &dev->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+ }
+
+ ret = 0;
+ goto bail;
+
+bail_unlock:
+ spin_unlock_irq(&cq->lock);
+bail_free:
+ vfree(wc);
+bail:
+ return ret;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_debug.h b/drivers/infiniband/hw/ipath/ipath_debug.h
new file mode 100644
index 000000000..65926cd35
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_debug.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPATH_DEBUG_H
+#define _IPATH_DEBUG_H
+
+#ifndef _IPATH_DEBUGGING /* debugging enabled or not */
+#define _IPATH_DEBUGGING 1
+#endif
+
+#if _IPATH_DEBUGGING
+
+/*
+ * Mask values for debugging. The scheme allows us to compile out any
+ * of the debug tracing stuff, and if compiled in, to enable or disable
+ * dynamically. This can be set at modprobe time also:
+ * modprobe infinipath.ko infinipath_debug=7
+ */
+
+#define __IPATH_INFO 0x1 /* generic low verbosity stuff */
+#define __IPATH_DBG 0x2 /* generic debug */
+#define __IPATH_TRSAMPLE 0x8 /* generate trace buffer sample entries */
+/* leave some low verbosity spots open */
+#define __IPATH_VERBDBG 0x40 /* very verbose debug */
+#define __IPATH_PKTDBG 0x80 /* print packet data */
+/* print process startup (init)/exit messages */
+#define __IPATH_PROCDBG 0x100
+/* print mmap/fault stuff, not using VDBG any more */
+#define __IPATH_MMDBG 0x200
+#define __IPATH_ERRPKTDBG 0x400
+#define __IPATH_USER_SEND 0x1000 /* use user mode send */
+#define __IPATH_KERNEL_SEND 0x2000 /* use kernel mode send */
+#define __IPATH_EPKTDBG 0x4000 /* print ethernet packet data */
+#define __IPATH_IPATHDBG 0x10000 /* Ethernet (IPATH) gen debug */
+#define __IPATH_IPATHWARN 0x20000 /* Ethernet (IPATH) warnings */
+#define __IPATH_IPATHERR 0x40000 /* Ethernet (IPATH) errors */
+#define __IPATH_IPATHPD 0x80000 /* Ethernet (IPATH) packet dump */
+#define __IPATH_IPATHTABLE 0x100000 /* Ethernet (IPATH) table dump */
+#define __IPATH_LINKVERBDBG 0x200000 /* very verbose linkchange debug */
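+
+/*
+ * Example (illustration only): the masks are ORed together, so loading
+ * the module with infinipath_debug=0x43 enables
+ * (__IPATH_INFO | __IPATH_DBG | __IPATH_VERBDBG).
+ */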
+
+#else /* _IPATH_DEBUGGING */
+
+/*
+ * define all of these even with debugging off, for the few places that do
+ * if (infinipath_debug & __IPATH_xyzzy), but in a way that will make the
+ * compiler eliminate the code
+ */
+
+#define __IPATH_INFO 0x0 /* generic low verbosity stuff */
+#define __IPATH_DBG 0x0 /* generic debug */
+#define __IPATH_TRSAMPLE 0x0 /* generate trace buffer sample entries */
+#define __IPATH_VERBDBG 0x0 /* very verbose debug */
+#define __IPATH_PKTDBG 0x0 /* print packet data */
+#define __IPATH_PROCDBG 0x0 /* process startup (init)/exit messages */
+/* print mmap/fault stuff, not using VDBG any more */
+#define __IPATH_MMDBG 0x0
+#define __IPATH_EPKTDBG 0x0 /* print ethernet packet data */
+#define __IPATH_IPATHDBG 0x0 /* Ethernet (IPATH) gen debug */
+#define __IPATH_IPATHWARN 0x0 /* Ethernet (IPATH) warnings */
+#define __IPATH_IPATHERR 0x0 /* Ethernet (IPATH) errors */
+#define __IPATH_IPATHPD 0x0 /* Ethernet (IPATH) packet dump */
+#define __IPATH_IPATHTABLE 0x0 /* Ethernet (IPATH) table dump */
+#define __IPATH_LINKVERBDBG 0x0 /* very verbose linkchange debug */
+
+#endif /* _IPATH_DEBUGGING */
+
+#define __IPATH_VERBOSEDBG __IPATH_VERBDBG
+
+#endif /* _IPATH_DEBUG_H */
diff --git a/drivers/infiniband/hw/ipath/ipath_diag.c b/drivers/infiniband/hw/ipath/ipath_diag.c
new file mode 100644
index 000000000..45802e973
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_diag.c
@@ -0,0 +1,551 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file contains support for diagnostic functions. It is accessed by
+ * opening the ipath_diag device, normally minor number 129. Diagnostic use
+ * of the InfiniPath chip may render the chip or board unusable until the
+ * driver is unloaded, or in some cases, until the system is rebooted.
+ *
+ * Accesses to the chip through this interface are not similar to going
+ * through the /sys/bus/pci resource mmap interface.
+ */
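+
+/*
+ * Illustrative sketch (not part of the driver): as the read/write
+ * handlers below enforce, a diagnostic tool must begin with a quadword
+ * read at offset 0 and then a quadword write at offset 0 before
+ * arbitrary aligned register accesses are accepted. The device path and
+ * reg_offset are placeholders:
+ *
+ *	int fd = open("/dev/ipath_diag0", O_RDWR);
+ *	__u64 qw;
+ *	pread(fd, &qw, 8, 0);		// enables further reads
+ *	pwrite(fd, &qw, 8, 0);		// enables unrestricted access
+ *	pread(fd, &qw, 8, reg_offset);	// qword-aligned register read
+ */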
+
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+#include <linux/fs.h>
+#include <linux/export.h>
+#include <asm/uaccess.h>
+
+#include "ipath_kernel.h"
+#include "ipath_common.h"
+
+int ipath_diag_inuse;
+static int diag_set_link;
+
+static int ipath_diag_open(struct inode *in, struct file *fp);
+static int ipath_diag_release(struct inode *in, struct file *fp);
+static ssize_t ipath_diag_read(struct file *fp, char __user *data,
+ size_t count, loff_t *off);
+static ssize_t ipath_diag_write(struct file *fp, const char __user *data,
+ size_t count, loff_t *off);
+
+static const struct file_operations diag_file_ops = {
+ .owner = THIS_MODULE,
+ .write = ipath_diag_write,
+ .read = ipath_diag_read,
+ .open = ipath_diag_open,
+ .release = ipath_diag_release,
+ .llseek = default_llseek,
+};
+
+static ssize_t ipath_diagpkt_write(struct file *fp,
+ const char __user *data,
+ size_t count, loff_t *off);
+
+static const struct file_operations diagpkt_file_ops = {
+ .owner = THIS_MODULE,
+ .write = ipath_diagpkt_write,
+ .llseek = noop_llseek,
+};
+
+static atomic_t diagpkt_count = ATOMIC_INIT(0);
+static struct cdev *diagpkt_cdev;
+static struct device *diagpkt_dev;
+
+int ipath_diag_add(struct ipath_devdata *dd)
+{
+ char name[16];
+ int ret = 0;
+
+ if (atomic_inc_return(&diagpkt_count) == 1) {
+ ret = ipath_cdev_init(IPATH_DIAGPKT_MINOR,
+ "ipath_diagpkt", &diagpkt_file_ops,
+ &diagpkt_cdev, &diagpkt_dev);
+
+ if (ret) {
+ ipath_dev_err(dd, "Couldn't create ipath_diagpkt "
+ "device: %d", ret);
+ goto done;
+ }
+ }
+
+ snprintf(name, sizeof(name), "ipath_diag%d", dd->ipath_unit);
+
+ ret = ipath_cdev_init(IPATH_DIAG_MINOR_BASE + dd->ipath_unit, name,
+ &diag_file_ops, &dd->diag_cdev,
+ &dd->diag_dev);
+ if (ret)
+ ipath_dev_err(dd, "Couldn't create %s device: %d",
+ name, ret);
+
+done:
+ return ret;
+}
+
+void ipath_diag_remove(struct ipath_devdata *dd)
+{
+ if (atomic_dec_and_test(&diagpkt_count))
+ ipath_cdev_cleanup(&diagpkt_cdev, &diagpkt_dev);
+
+ ipath_cdev_cleanup(&dd->diag_cdev, &dd->diag_dev);
+}
+
+/**
+ * ipath_read_umem64 - read a 64-bit quantity from the chip into user space
+ * @dd: the infinipath device
+ * @uaddr: the location to store the data in user memory
+ * @caddr: the source chip address (full pointer, not offset)
+ * @count: number of bytes to copy (multiple of 32 bits)
+ *
+ * This function also localizes all chip memory accesses.
+ * The copy should be written such that we read full cacheline packets
+ * from the chip. This is usually used for a single qword
+ *
+ * NOTE: This assumes the chip address is 64-bit aligned.
+ */
+static int ipath_read_umem64(struct ipath_devdata *dd, void __user *uaddr,
+ const void __iomem *caddr, size_t count)
+{
+ const u64 __iomem *reg_addr = caddr;
+ const u64 __iomem *reg_end = reg_addr + (count / sizeof(u64));
+ int ret;
+
+ /* not very efficient, but it works for now */
+ if (reg_addr < dd->ipath_kregbase || reg_end > dd->ipath_kregend) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ while (reg_addr < reg_end) {
+ u64 data = readq(reg_addr);
+ if (copy_to_user(uaddr, &data, sizeof(u64))) {
+ ret = -EFAULT;
+ goto bail;
+ }
+ reg_addr++;
+ uaddr += sizeof(u64);
+ }
+ ret = 0;
+bail:
+ return ret;
+}
+
+/**
+ * ipath_write_umem64 - write a 64-bit quantity to the chip from user space
+ * @dd: the infinipath device
+ * @caddr: the destination chip address (full pointer, not offset)
+ * @uaddr: the source of the data in user memory
+ * @count: the number of bytes to copy (multiple of 32 bits)
+ *
+ * This is usually used for a single qword
+ * NOTE: This assumes the chip address is 64-bit aligned.
+ */
+
+static int ipath_write_umem64(struct ipath_devdata *dd, void __iomem *caddr,
+ const void __user *uaddr, size_t count)
+{
+ u64 __iomem *reg_addr = caddr;
+ const u64 __iomem *reg_end = reg_addr + (count / sizeof(u64));
+ int ret;
+
+ /* not very efficient, but it works for now */
+ if (reg_addr < dd->ipath_kregbase || reg_end > dd->ipath_kregend) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ while (reg_addr < reg_end) {
+ u64 data;
+ if (copy_from_user(&data, uaddr, sizeof(data))) {
+ ret = -EFAULT;
+ goto bail;
+ }
+ writeq(data, reg_addr);
+
+ reg_addr++;
+ uaddr += sizeof(u64);
+ }
+ ret = 0;
+bail:
+ return ret;
+}
+
+/**
+ * ipath_read_umem32 - read a 32-bit quantity from the chip into user space
+ * @dd: the infinipath device
+ * @uaddr: the location to store the data in user memory
+ * @caddr: the source chip address (full pointer, not offset)
+ * @count: number of bytes to copy
+ *
+ * read 32 bit values, not 64 bit; for memories that only
+ * support 32 bit reads; usually a single dword.
+ */
+static int ipath_read_umem32(struct ipath_devdata *dd, void __user *uaddr,
+ const void __iomem *caddr, size_t count)
+{
+ const u32 __iomem *reg_addr = caddr;
+ const u32 __iomem *reg_end = reg_addr + (count / sizeof(u32));
+ int ret;
+
+ if (reg_addr < (u32 __iomem *) dd->ipath_kregbase ||
+ reg_end > (u32 __iomem *) dd->ipath_kregend) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ /* not very efficient, but it works for now */
+ while (reg_addr < reg_end) {
+ u32 data = readl(reg_addr);
+ if (copy_to_user(uaddr, &data, sizeof(data))) {
+ ret = -EFAULT;
+ goto bail;
+ }
+
+ reg_addr++;
+ uaddr += sizeof(u32);
+
+ }
+ ret = 0;
+bail:
+ return ret;
+}
+
+/**
+ * ipath_write_umem32 - write a 32-bit quantity to the chip from user space
+ * @dd: the infinipath device
+ * @caddr: the destination chip address (full pointer, not offset)
+ * @uaddr: the source of the data in user memory
+ * @count: number of bytes to copy
+ *
+ * write 32 bit values, not 64 bit; for memories that only
+ * support 32 bit write; usually a single dword.
+ */
+
+static int ipath_write_umem32(struct ipath_devdata *dd, void __iomem *caddr,
+ const void __user *uaddr, size_t count)
+{
+ u32 __iomem *reg_addr = caddr;
+ const u32 __iomem *reg_end = reg_addr + (count / sizeof(u32));
+ int ret;
+
+ if (reg_addr < (u32 __iomem *) dd->ipath_kregbase ||
+ reg_end > (u32 __iomem *) dd->ipath_kregend) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ while (reg_addr < reg_end) {
+ u32 data;
+ if (copy_from_user(&data, uaddr, sizeof(data))) {
+ ret = -EFAULT;
+ goto bail;
+ }
+ writel(data, reg_addr);
+
+ reg_addr++;
+ uaddr += sizeof(u32);
+ }
+ ret = 0;
+bail:
+ return ret;
+}
+
+static int ipath_diag_open(struct inode *in, struct file *fp)
+{
+ int unit = iminor(in) - IPATH_DIAG_MINOR_BASE;
+ struct ipath_devdata *dd;
+ int ret;
+
+ mutex_lock(&ipath_mutex);
+
+ if (ipath_diag_inuse) {
+ ret = -EBUSY;
+ goto bail;
+ }
+
+ dd = ipath_lookup(unit);
+
+ if (dd == NULL || !(dd->ipath_flags & IPATH_PRESENT) ||
+ !dd->ipath_kregbase) {
+ ret = -ENODEV;
+ goto bail;
+ }
+
+ fp->private_data = dd;
+ ipath_diag_inuse = -2;
+ diag_set_link = 0;
+ ret = 0;
+
+ /*
+ * Only expose a way to reset the device if we
+ * make it into diag mode.
+ */
+ ipath_expose_reset(&dd->pcidev->dev);
+
+bail:
+ mutex_unlock(&ipath_mutex);
+
+ return ret;
+}
+
+/**
+ * ipath_diagpkt_write - write an IB packet
+ * @fp: the diag data device file pointer
+ * @data: ipath_diag_pkt or ipath_diag_xpkt structure saying where to get the packet
+ * @count: size of data to write
+ * @off: unused by this code
+ */
+static ssize_t ipath_diagpkt_write(struct file *fp,
+ const char __user *data,
+ size_t count, loff_t *off)
+{
+ u32 __iomem *piobuf;
+ u32 plen, pbufn, maxlen_reserve;
+ struct ipath_diag_pkt odp;
+ struct ipath_diag_xpkt dp;
+ u32 *tmpbuf = NULL;
+ struct ipath_devdata *dd;
+ ssize_t ret = 0;
+ u64 val;
+ u32 l_state, lt_state; /* LinkState, LinkTrainingState */
+
+
+ if (count == sizeof(dp)) {
+ if (copy_from_user(&dp, data, sizeof(dp))) {
+ ret = -EFAULT;
+ goto bail;
+ }
+ } else if (count == sizeof(odp)) {
+ if (copy_from_user(&odp, data, sizeof(odp))) {
+ ret = -EFAULT;
+ goto bail;
+ }
+ dp.len = odp.len;
+ dp.unit = odp.unit;
+ dp.data = odp.data;
+ dp.pbc_wd = 0;
+ } else {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ /* send count must be an exact number of dwords */
+ if (dp.len & 3) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ plen = dp.len >> 2;
+
+ dd = ipath_lookup(dp.unit);
+ if (!dd || !(dd->ipath_flags & IPATH_PRESENT) ||
+ !dd->ipath_kregbase) {
+ ipath_cdbg(VERBOSE, "illegal unit %u for diag data send\n",
+ dp.unit);
+ ret = -ENODEV;
+ goto bail;
+ }
+
+ if (ipath_diag_inuse && !diag_set_link &&
+ !(dd->ipath_flags & IPATH_LINKACTIVE)) {
+ diag_set_link = 1;
+ ipath_cdbg(VERBOSE, "Trying to set to set link active for "
+ "diag pkt\n");
+ ipath_set_linkstate(dd, IPATH_IB_LINKARM);
+ ipath_set_linkstate(dd, IPATH_IB_LINKACTIVE);
+ }
+
+ if (!(dd->ipath_flags & IPATH_INITTED)) {
+ /* no hardware, freeze, etc. */
+ ipath_cdbg(VERBOSE, "unit %u not usable\n", dd->ipath_unit);
+ ret = -ENODEV;
+ goto bail;
+ }
+ /*
+ * Want to skip check for l_state if using custom PBC,
+ * because we might be trying to force an SM packet out.
+ * As a first cut, skip _all_ state checking in that case.
+ */
+ val = ipath_ib_state(dd, dd->ipath_lastibcstat);
+ lt_state = ipath_ib_linktrstate(dd, dd->ipath_lastibcstat);
+ l_state = ipath_ib_linkstate(dd, dd->ipath_lastibcstat);
+ if (!dp.pbc_wd && (lt_state != INFINIPATH_IBCS_LT_STATE_LINKUP ||
+ (val != dd->ib_init && val != dd->ib_arm &&
+ val != dd->ib_active))) {
+ ipath_cdbg(VERBOSE, "unit %u not ready (state %llx)\n",
+ dd->ipath_unit, (unsigned long long) val);
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ /*
+ * need total length before first word written, plus 2 Dwords. One Dword
+ * is for padding so we get the full user data when not aligned on
+ * a word boundary. The other Dword is to make sure we have room for the
+ * ICRC which gets tacked on later.
+ */
+ maxlen_reserve = 2 * sizeof(u32);
+ if (dp.len > dd->ipath_ibmaxlen - maxlen_reserve) {
+ ipath_dbg("Pkt len 0x%x > ibmaxlen %x\n",
+ dp.len, dd->ipath_ibmaxlen);
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ plen = sizeof(u32) + dp.len;
+
+ tmpbuf = vmalloc(plen);
+ if (!tmpbuf) {
+ dev_info(&dd->pcidev->dev, "Unable to allocate tmp buffer, "
+ "failing\n");
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ if (copy_from_user(tmpbuf,
+ (const void __user *) (unsigned long) dp.data,
+ dp.len)) {
+ ret = -EFAULT;
+ goto bail;
+ }
+
+ plen >>= 2; /* in dwords */
+
+ piobuf = ipath_getpiobuf(dd, plen, &pbufn);
+ if (!piobuf) {
+ ipath_cdbg(VERBOSE, "No PIO buffers avail unit for %u\n",
+ dd->ipath_unit);
+ ret = -EBUSY;
+ goto bail;
+ }
+ /* disarm it just to be extra sure */
+ ipath_disarm_piobufs(dd, pbufn, 1);
+
+ if (ipath_debug & __IPATH_PKTDBG)
+ ipath_cdbg(VERBOSE, "unit %u 0x%x+1w pio%d\n",
+ dd->ipath_unit, plen - 1, pbufn);
+
+ if (dp.pbc_wd == 0)
+ dp.pbc_wd = plen;
+ writeq(dp.pbc_wd, piobuf);
+ /*
+ * Copy all but the trigger word, then flush, so it's written
+ * to chip before trigger word, then write trigger word, then
+ * flush again, so packet is sent.
+ */
+ if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) {
+ ipath_flush_wc();
+ __iowrite32_copy(piobuf + 2, tmpbuf, plen - 1);
+ ipath_flush_wc();
+ __raw_writel(tmpbuf[plen - 1], piobuf + plen + 1);
+ } else
+ __iowrite32_copy(piobuf + 2, tmpbuf, plen);
+
+ ipath_flush_wc();
+
+ ret = sizeof(dp);
+
+bail:
+ vfree(tmpbuf);
+ return ret;
+}
+
+static int ipath_diag_release(struct inode *in, struct file *fp)
+{
+ mutex_lock(&ipath_mutex);
+ ipath_diag_inuse = 0;
+ fp->private_data = NULL;
+ mutex_unlock(&ipath_mutex);
+ return 0;
+}
+
+static ssize_t ipath_diag_read(struct file *fp, char __user *data,
+ size_t count, loff_t *off)
+{
+ struct ipath_devdata *dd = fp->private_data;
+ void __iomem *kreg_base;
+ ssize_t ret;
+
+ kreg_base = dd->ipath_kregbase;
+
+ if (count == 0)
+ ret = 0;
+ else if ((count % 4) || (*off % 4))
+ /* address or length is not 32-bit aligned, hence invalid */
+ ret = -EINVAL;
+ else if (ipath_diag_inuse < 1 && (*off || count != 8))
+ ret = -EINVAL; /* prevent cat /dev/ipath_diag* */
+ else if ((count % 8) || (*off % 8))
+ /* address or length not 64-bit aligned; do 32-bit reads */
+ ret = ipath_read_umem32(dd, data, kreg_base + *off, count);
+ else
+ ret = ipath_read_umem64(dd, data, kreg_base + *off, count);
+
+ if (ret >= 0) {
+ *off += count;
+ ret = count;
+ if (ipath_diag_inuse == -2)
+ ipath_diag_inuse++;
+ }
+
+ return ret;
+}
+
+static ssize_t ipath_diag_write(struct file *fp, const char __user *data,
+ size_t count, loff_t *off)
+{
+ struct ipath_devdata *dd = fp->private_data;
+ void __iomem *kreg_base;
+ ssize_t ret;
+
+ kreg_base = dd->ipath_kregbase;
+
+ if (count == 0)
+ ret = 0;
+ else if ((count % 4) || (*off % 4))
+ /* address or length is not 32-bit aligned, hence invalid */
+ ret = -EINVAL;
+ else if ((ipath_diag_inuse == -1 && (*off || count != 8)) ||
+ ipath_diag_inuse == -2) /* read qw off 0, write qw off 0 */
+ ret = -EINVAL; /* before any other write allowed */
+ else if ((count % 8) || (*off % 8))
+ /* address or length not 64-bit aligned; do 32-bit writes */
+ ret = ipath_write_umem32(dd, kreg_base + *off, data, count);
+ else
+ ret = ipath_write_umem64(dd, kreg_base + *off, data, count);
+
+ if (ret >= 0) {
+ *off += count;
+ ret = count;
+ if (ipath_diag_inuse == -1)
+ ipath_diag_inuse = 1; /* all read/write OK now */
+ }
+
+ return ret;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_dma.c b/drivers/infiniband/hw/ipath/ipath_dma.c
new file mode 100644
index 000000000..123a8c053
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_dma.c
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2006 QLogic, Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/scatterlist.h>
+#include <linux/gfp.h>
+#include <rdma/ib_verbs.h>
+
+#include "ipath_verbs.h"
+
+#define BAD_DMA_ADDRESS ((u64) 0)
+
+/*
+ * The following functions implement driver specific replacements
+ * for the ib_dma_*() functions.
+ *
+ * These functions return kernel virtual addresses instead of
+ * device bus addresses since the driver uses the CPU to copy
+ * data instead of using hardware DMA.
+ */
+
+static int ipath_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+ return dma_addr == BAD_DMA_ADDRESS;
+}
+
+static u64 ipath_dma_map_single(struct ib_device *dev,
+ void *cpu_addr, size_t size,
+ enum dma_data_direction direction)
+{
+ BUG_ON(!valid_dma_direction(direction));
+ return (u64) cpu_addr;
+}
+
+static void ipath_dma_unmap_single(struct ib_device *dev,
+ u64 addr, size_t size,
+ enum dma_data_direction direction)
+{
+ BUG_ON(!valid_dma_direction(direction));
+}
+
+static u64 ipath_dma_map_page(struct ib_device *dev,
+ struct page *page,
+ unsigned long offset,
+ size_t size,
+ enum dma_data_direction direction)
+{
+ u64 addr;
+
+ BUG_ON(!valid_dma_direction(direction));
+
+ if (offset + size > PAGE_SIZE) {
+ addr = BAD_DMA_ADDRESS;
+ goto done;
+ }
+
+ addr = (u64) page_address(page);
+ if (addr)
+ addr += offset;
+ /* TODO: handle highmem pages */
+
+done:
+ return addr;
+}
+
+static void ipath_dma_unmap_page(struct ib_device *dev,
+ u64 addr, size_t size,
+ enum dma_data_direction direction)
+{
+ BUG_ON(!valid_dma_direction(direction));
+}
+
+static int ipath_map_sg(struct ib_device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction direction)
+{
+ struct scatterlist *sg;
+ u64 addr;
+ int i;
+ int ret = nents;
+
+ BUG_ON(!valid_dma_direction(direction));
+
+ for_each_sg(sgl, sg, nents, i) {
+ addr = (u64) page_address(sg_page(sg));
+ /* TODO: handle highmem pages */
+ if (!addr) {
+ ret = 0;
+ break;
+ }
+ sg->dma_address = addr + sg->offset;
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
+ sg->dma_length = sg->length;
+#endif
+ }
+ return ret;
+}
+
+static void ipath_unmap_sg(struct ib_device *dev,
+ struct scatterlist *sg, int nents,
+ enum dma_data_direction direction)
+{
+ BUG_ON(!valid_dma_direction(direction));
+}
+
+static void ipath_sync_single_for_cpu(struct ib_device *dev,
+ u64 addr,
+ size_t size,
+ enum dma_data_direction dir)
+{
+}
+
+static void ipath_sync_single_for_device(struct ib_device *dev,
+ u64 addr,
+ size_t size,
+ enum dma_data_direction dir)
+{
+}
+
+static void *ipath_dma_alloc_coherent(struct ib_device *dev, size_t size,
+ u64 *dma_handle, gfp_t flag)
+{
+ struct page *p;
+ void *addr = NULL;
+
+ p = alloc_pages(flag, get_order(size));
+ if (p)
+ addr = page_address(p);
+ if (dma_handle)
+ *dma_handle = (u64) addr;
+ return addr;
+}
+
+static void ipath_dma_free_coherent(struct ib_device *dev, size_t size,
+ void *cpu_addr, u64 dma_handle)
+{
+ free_pages((unsigned long) cpu_addr, get_order(size));
+}
+
+struct ib_dma_mapping_ops ipath_dma_mapping_ops = {
+ .mapping_error = ipath_mapping_error,
+ .map_single = ipath_dma_map_single,
+ .unmap_single = ipath_dma_unmap_single,
+ .map_page = ipath_dma_map_page,
+ .unmap_page = ipath_dma_unmap_page,
+ .map_sg = ipath_map_sg,
+ .unmap_sg = ipath_unmap_sg,
+ .sync_single_for_cpu = ipath_sync_single_for_cpu,
+ .sync_single_for_device = ipath_sync_single_for_device,
+ .alloc_coherent = ipath_dma_alloc_coherent,
+ .free_coherent = ipath_dma_free_coherent
+};
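+
+/*
+ * Illustrative sketch, not part of the driver above: because these ops
+ * return kernel virtual addresses rather than bus addresses, mapping a
+ * buffer and checking the result reduces to a cast plus a compare
+ * against the BAD_DMA_ADDRESS sentinel.  The helper name below is
+ * hypothetical.
+ */
+static inline int example_map_and_check(struct ib_device *ibdev,
+					void *cpu_addr, size_t len)
+{
+	/* identity "mapping", same semantics as ipath_dma_map_single() */
+	u64 addr = ipath_dma_map_single(ibdev, cpu_addr, len, DMA_TO_DEVICE);
+
+	return ipath_mapping_error(ibdev, addr) ? -ENOMEM : 0;
+}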
diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c
new file mode 100644
index 000000000..bd0caedaf
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_driver.c
@@ -0,0 +1,2779 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <linux/idr.h>
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/bitmap.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+
+static void ipath_update_pio_bufs(struct ipath_devdata *);
+
+const char *ipath_get_unit_name(int unit)
+{
+ static char iname[16];
+ snprintf(iname, sizeof iname, "infinipath%u", unit);
+ return iname;
+}
+
+#define DRIVER_LOAD_MSG "QLogic " IPATH_DRV_NAME " loaded: "
+#define PFX IPATH_DRV_NAME ": "
+
+/*
+ * The size has to be longer than this string, so we can append
+ * board/chip information to it in the init code.
+ */
+const char ib_ipath_version[] = IPATH_IDSTR "\n";
+
+static struct idr unit_table;
+DEFINE_SPINLOCK(ipath_devs_lock);
+LIST_HEAD(ipath_dev_list);
+
+wait_queue_head_t ipath_state_wait;
+
+unsigned ipath_debug = __IPATH_INFO;
+
+module_param_named(debug, ipath_debug, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(debug, "mask for debug prints");
+EXPORT_SYMBOL_GPL(ipath_debug);
+
+unsigned ipath_mtu4096 = 1; /* max 4KB IB mtu by default, if supported */
+module_param_named(mtu4096, ipath_mtu4096, uint, S_IRUGO);
+MODULE_PARM_DESC(mtu4096, "enable MTU of 4096 bytes, if supported");
+
+static unsigned ipath_hol_timeout_ms = 13000;
+module_param_named(hol_timeout_ms, ipath_hol_timeout_ms, uint, S_IRUGO);
+MODULE_PARM_DESC(hol_timeout_ms,
+ "duration of user app suspension after link failure");
+
+unsigned ipath_linkrecovery = 1;
+module_param_named(linkrecovery, ipath_linkrecovery, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(linkrecovery, "enable workaround for link recovery issue");
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("QLogic <support@qlogic.com>");
+MODULE_DESCRIPTION("QLogic InfiniPath driver");
+
+/*
+ * Table to translate the LINKTRAININGSTATE portion of
+ * IBCStatus to a human-readable form.
+ */
+const char *ipath_ibcstatus_str[] = {
+ "Disabled",
+ "LinkUp",
+ "PollActive",
+ "PollQuiet",
+ "SleepDelay",
+ "SleepQuiet",
+ "LState6", /* unused */
+ "LState7", /* unused */
+ "CfgDebounce",
+ "CfgRcvfCfg",
+ "CfgWaitRmt",
+ "CfgIdle",
+ "RecovRetrain",
+ "CfgTxRevLane", /* unused before IBA7220 */
+ "RecovWaitRmt",
+ "RecovIdle",
+ /* below were added for IBA7220 */
+ "CfgEnhanced",
+ "CfgTest",
+ "CfgWaitRmtTest",
+ "CfgWaitCfgEnhanced",
+ "SendTS_T",
+ "SendTstIdles",
+ "RcvTS_T",
+ "SendTst_TS1s",
+ "LTState18", "LTState19", "LTState1A", "LTState1B",
+ "LTState1C", "LTState1D", "LTState1E", "LTState1F"
+};
+
+static void ipath_remove_one(struct pci_dev *);
+static int ipath_init_one(struct pci_dev *, const struct pci_device_id *);
+
+/* Only needed for registration, nothing else needs this info */
+#define PCI_VENDOR_ID_PATHSCALE 0x1fc1
+#define PCI_DEVICE_ID_INFINIPATH_HT 0xd
+
+/* Number of seconds before our card status check... */
+#define STATUS_TIMEOUT 60
+
+static const struct pci_device_id ipath_pci_tbl[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, PCI_DEVICE_ID_INFINIPATH_HT) },
+ { 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, ipath_pci_tbl);
+
+static struct pci_driver ipath_driver = {
+ .name = IPATH_DRV_NAME,
+ .probe = ipath_init_one,
+ .remove = ipath_remove_one,
+ .id_table = ipath_pci_tbl,
+ .driver = {
+ .groups = ipath_driver_attr_groups,
+ },
+};
+
+static inline void read_bars(struct ipath_devdata *dd, struct pci_dev *dev,
+ u32 *bar0, u32 *bar1)
+{
+ int ret;
+
+ ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_0, bar0);
+ if (ret)
+ ipath_dev_err(dd, "failed to read bar0 before enable: "
+ "error %d\n", -ret);
+
+ ret = pci_read_config_dword(dev, PCI_BASE_ADDRESS_1, bar1);
+ if (ret)
+ ipath_dev_err(dd, "failed to read bar1 before enable: "
+ "error %d\n", -ret);
+
+ ipath_dbg("Read bar0 %x bar1 %x\n", *bar0, *bar1);
+}
+
+static void ipath_free_devdata(struct pci_dev *pdev,
+ struct ipath_devdata *dd)
+{
+ unsigned long flags;
+
+ pci_set_drvdata(pdev, NULL);
+
+ if (dd->ipath_unit != -1) {
+ spin_lock_irqsave(&ipath_devs_lock, flags);
+ idr_remove(&unit_table, dd->ipath_unit);
+ list_del(&dd->ipath_list);
+ spin_unlock_irqrestore(&ipath_devs_lock, flags);
+ }
+ vfree(dd);
+}
+
+static struct ipath_devdata *ipath_alloc_devdata(struct pci_dev *pdev)
+{
+ unsigned long flags;
+ struct ipath_devdata *dd;
+ int ret;
+
+ dd = vzalloc(sizeof(*dd));
+ if (!dd) {
+ dd = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+ dd->ipath_unit = -1;
+
+ idr_preload(GFP_KERNEL);
+ spin_lock_irqsave(&ipath_devs_lock, flags);
+
+ ret = idr_alloc(&unit_table, dd, 0, 0, GFP_NOWAIT);
+ if (ret < 0) {
+ printk(KERN_ERR IPATH_DRV_NAME
+ ": Could not allocate unit ID: error %d\n", -ret);
+ ipath_free_devdata(pdev, dd);
+ dd = ERR_PTR(ret);
+ goto bail_unlock;
+ }
+ dd->ipath_unit = ret;
+
+ dd->pcidev = pdev;
+ pci_set_drvdata(pdev, dd);
+
+ list_add(&dd->ipath_list, &ipath_dev_list);
+
+bail_unlock:
+ spin_unlock_irqrestore(&ipath_devs_lock, flags);
+ idr_preload_end();
+bail:
+ return dd;
+}
+
+static inline struct ipath_devdata *__ipath_lookup(int unit)
+{
+ return idr_find(&unit_table, unit);
+}
+
+struct ipath_devdata *ipath_lookup(int unit)
+{
+ struct ipath_devdata *dd;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ipath_devs_lock, flags);
+ dd = __ipath_lookup(unit);
+ spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+ return dd;
+}
+
+int ipath_count_units(int *npresentp, int *nupp, int *maxportsp)
+{
+ int nunits, npresent, nup;
+ struct ipath_devdata *dd;
+ unsigned long flags;
+ int maxports;
+
+ nunits = npresent = nup = maxports = 0;
+
+ spin_lock_irqsave(&ipath_devs_lock, flags);
+
+ list_for_each_entry(dd, &ipath_dev_list, ipath_list) {
+ nunits++;
+ if ((dd->ipath_flags & IPATH_PRESENT) && dd->ipath_kregbase)
+ npresent++;
+ if (dd->ipath_lid &&
+ !(dd->ipath_flags & (IPATH_DISABLED | IPATH_LINKDOWN
+ | IPATH_LINKUNK)))
+ nup++;
+ if (dd->ipath_cfgports > maxports)
+ maxports = dd->ipath_cfgports;
+ }
+
+ spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+ if (npresentp)
+ *npresentp = npresent;
+ if (nupp)
+ *nupp = nup;
+ if (maxportsp)
+ *maxportsp = maxports;
+
+ return nunits;
+}
+
+/*
+ * These next two routines are placeholders in case we don't have per-arch
+ * code for controlling write combining. If explicit control of write
+ * combining is not available, performance will probably be awful.
+ */
+
+int __attribute__((weak)) ipath_enable_wc(struct ipath_devdata *dd)
+{
+ return -EOPNOTSUPP;
+}
+
+void __attribute__((weak)) ipath_disable_wc(struct ipath_devdata *dd)
+{
+}
+
+/*
+ * Perform a PIO buffer bandwidth write test, to verify proper system
+ * configuration. Even when all the setup calls work, occasionally
+ * BIOS or other issues can prevent write combining from working, or
+ * can cause other bandwidth problems to the chip.
+ *
+ * This test simply writes the same buffer over and over again, and
+ * measures close to the peak bandwidth to the chip (not testing
+ * data bandwidth to the wire). On chips that use an address-based
+ * trigger to send packets to the wire, this is easy. On chips that
+ * use a count to trigger, we want to make sure that the packet doesn't
+ * go out on the wire, or trigger flow control checks.
+ */
+static void ipath_verify_pioperf(struct ipath_devdata *dd)
+{
+ u32 pbnum, cnt, lcnt;
+ u32 __iomem *piobuf;
+ u32 *addr;
+ u64 msecs, emsecs;
+
+ piobuf = ipath_getpiobuf(dd, 0, &pbnum);
+ if (!piobuf) {
+ dev_info(&dd->pcidev->dev,
+ "No PIObufs for checking perf, skipping\n");
+ return;
+ }
+
+ /*
+ * Enough to give us a reasonable test, less than piobuf size, and
+	 * likely a multiple of the store buffer length.
+ */
+ cnt = 1024;
+
+ addr = vmalloc(cnt);
+ if (!addr) {
+ dev_info(&dd->pcidev->dev,
+ "Couldn't get memory for checking PIO perf,"
+ " skipping\n");
+ goto done;
+ }
+
+ preempt_disable(); /* we want reasonably accurate elapsed time */
+ msecs = 1 + jiffies_to_msecs(jiffies);
+ for (lcnt = 0; lcnt < 10000U; lcnt++) {
+ /* wait until we cross msec boundary */
+ if (jiffies_to_msecs(jiffies) >= msecs)
+ break;
+ udelay(1);
+ }
+
+ ipath_disable_armlaunch(dd);
+
+ /*
+ * length 0, no dwords actually sent, and mark as VL15
+ * on chips where that may matter (due to IB flowcontrol)
+ */
+ if ((dd->ipath_flags & IPATH_HAS_PBC_CNT))
+ writeq(1UL << 63, piobuf);
+ else
+ writeq(0, piobuf);
+ ipath_flush_wc();
+
+ /*
+ * this is only roughly accurate, since even with preempt we
+ * still take interrupts that could take a while. Running for
+ * >= 5 msec seems to get us "close enough" to accurate values
+ */
+ msecs = jiffies_to_msecs(jiffies);
+ for (emsecs = lcnt = 0; emsecs <= 5UL; lcnt++) {
+ __iowrite32_copy(piobuf + 64, addr, cnt >> 2);
+ emsecs = jiffies_to_msecs(jiffies) - msecs;
+ }
+
+ /* 1 GiB/sec, slightly over IB SDR line rate */
+ if (lcnt < (emsecs * 1024U))
+ ipath_dev_err(dd,
+ "Performance problem: bandwidth to PIO buffers is "
+ "only %u MiB/sec\n",
+ lcnt / (u32) emsecs);
+ else
+ ipath_dbg("PIO buffer bandwidth %u MiB/sec is OK\n",
+ lcnt / (u32) emsecs);
+
+ preempt_enable();
+
+ vfree(addr);
+
+done:
+ /* disarm piobuf, so it's available again */
+ ipath_disarm_piobufs(dd, pbnum, 1);
+ ipath_enable_armlaunch(dd);
+}
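+
+/*
+ * Note on the threshold arithmetic above: each loop iteration copies
+ * cnt = 1024 bytes, so lcnt iterations over emsecs milliseconds move
+ * roughly (lcnt * 1024 bytes) / (emsecs / 1000 s), i.e. about
+ * lcnt / emsecs MiB/sec.  The check "lcnt < emsecs * 1024" therefore
+ * flags anything below roughly 1024 MiB/sec (~1 GiB/sec), just over IB
+ * SDR line rate.
+ */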
+
+static void cleanup_device(struct ipath_devdata *dd);
+
+static int ipath_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+ int ret, len, j;
+ struct ipath_devdata *dd;
+ unsigned long long addr;
+ u32 bar0 = 0, bar1 = 0;
+
+ dd = ipath_alloc_devdata(pdev);
+ if (IS_ERR(dd)) {
+ ret = PTR_ERR(dd);
+ printk(KERN_ERR IPATH_DRV_NAME
+ ": Could not allocate devdata: error %d\n", -ret);
+ goto bail;
+ }
+
+ ipath_cdbg(VERBOSE, "initializing unit #%u\n", dd->ipath_unit);
+
+ ret = pci_enable_device(pdev);
+ if (ret) {
+ /* This can happen iff:
+ *
+ * We did a chip reset, and then failed to reprogram the
+ * BAR, or the chip reset due to an internal error. We then
+ * unloaded the driver and reloaded it.
+ *
+ * Both reset cases set the BAR back to initial state. For
+ * the latter case, the AER sticky error bit at offset 0x718
+ * should be set, but the Linux kernel doesn't yet know
+ * about that, it appears. If the original BAR was retained
+ * in the kernel data structures, this may be OK.
+ */
+ ipath_dev_err(dd, "enable unit %d failed: error %d\n",
+ dd->ipath_unit, -ret);
+ goto bail_devdata;
+ }
+ addr = pci_resource_start(pdev, 0);
+ len = pci_resource_len(pdev, 0);
+ ipath_cdbg(VERBOSE, "regbase (0) %llx len %d irq %d, vend %x/%x "
+ "driver_data %lx\n", addr, len, pdev->irq, ent->vendor,
+ ent->device, ent->driver_data);
+
+ read_bars(dd, pdev, &bar0, &bar1);
+
+ if (!bar1 && !(bar0 & ~0xf)) {
+ if (addr) {
+ dev_info(&pdev->dev, "BAR is 0 (probable RESET), "
+ "rewriting as %llx\n", addr);
+ ret = pci_write_config_dword(
+ pdev, PCI_BASE_ADDRESS_0, addr);
+ if (ret) {
+ ipath_dev_err(dd, "rewrite of BAR0 "
+ "failed: err %d\n", -ret);
+ goto bail_disable;
+ }
+ ret = pci_write_config_dword(
+ pdev, PCI_BASE_ADDRESS_1, addr >> 32);
+ if (ret) {
+ ipath_dev_err(dd, "rewrite of BAR1 "
+ "failed: err %d\n", -ret);
+ goto bail_disable;
+ }
+ } else {
+ ipath_dev_err(dd, "BAR is 0 (probable RESET), "
+ "not usable until reboot\n");
+ ret = -ENODEV;
+ goto bail_disable;
+ }
+ }
+
+ ret = pci_request_regions(pdev, IPATH_DRV_NAME);
+ if (ret) {
+ dev_info(&pdev->dev, "pci_request_regions unit %u fails: "
+ "err %d\n", dd->ipath_unit, -ret);
+ goto bail_disable;
+ }
+
+ ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+ if (ret) {
+ /*
+ * if the 64 bit setup fails, try 32 bit. Some systems
+		 * do not set up 64 bit maps on systems with 2GB or less
+ * memory installed.
+ */
+ ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+ if (ret) {
+ dev_info(&pdev->dev,
+ "Unable to set DMA mask for unit %u: %d\n",
+ dd->ipath_unit, ret);
+ goto bail_regions;
+ }
+ else {
+ ipath_dbg("No 64bit DMA mask, used 32 bit mask\n");
+ ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+ if (ret)
+ dev_info(&pdev->dev,
+ "Unable to set DMA consistent mask "
+ "for unit %u: %d\n",
+ dd->ipath_unit, ret);
+
+ }
+ }
+ else {
+ ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+ if (ret)
+ dev_info(&pdev->dev,
+ "Unable to set DMA consistent mask "
+ "for unit %u: %d\n",
+ dd->ipath_unit, ret);
+ }
+
+ pci_set_master(pdev);
+
+ /*
+ * Save BARs to rewrite after device reset. Save all 64 bits of
+ * BAR, just in case.
+ */
+ dd->ipath_pcibar0 = addr;
+ dd->ipath_pcibar1 = addr >> 32;
+ dd->ipath_deviceid = ent->device; /* save for later use */
+ dd->ipath_vendorid = ent->vendor;
+
+ /* setup the chip-specific functions, as early as possible. */
+ switch (ent->device) {
+ case PCI_DEVICE_ID_INFINIPATH_HT:
+ ipath_init_iba6110_funcs(dd);
+ break;
+
+ default:
+ ipath_dev_err(dd, "Found unknown QLogic deviceid 0x%x, "
+ "failing\n", ent->device);
+ return -ENODEV;
+ }
+
+ for (j = 0; j < 6; j++) {
+ if (!pdev->resource[j].start)
+ continue;
+ ipath_cdbg(VERBOSE, "BAR %d %pR, len %llx\n",
+ j, &pdev->resource[j],
+ (unsigned long long)pci_resource_len(pdev, j));
+ }
+
+ if (!addr) {
+ ipath_dev_err(dd, "No valid address in BAR 0!\n");
+ ret = -ENODEV;
+ goto bail_regions;
+ }
+
+ dd->ipath_pcirev = pdev->revision;
+
+#if defined(__powerpc__)
+ /* There isn't a generic way to specify writethrough mappings */
+ dd->ipath_kregbase = __ioremap(addr, len,
+ (_PAGE_NO_CACHE|_PAGE_WRITETHRU));
+#else
+ dd->ipath_kregbase = ioremap_nocache(addr, len);
+#endif
+
+ if (!dd->ipath_kregbase) {
+ ipath_dbg("Unable to map io addr %llx to kvirt, failing\n",
+ addr);
+ ret = -ENOMEM;
+ goto bail_iounmap;
+ }
+ dd->ipath_kregend = (u64 __iomem *)
+ ((void __iomem *)dd->ipath_kregbase + len);
+ dd->ipath_physaddr = addr; /* used for io_remap, etc. */
+ /* for user mmap */
+ ipath_cdbg(VERBOSE, "mapped io addr %llx to kregbase %p\n",
+ addr, dd->ipath_kregbase);
+
+ if (dd->ipath_f_bus(dd, pdev))
+ ipath_dev_err(dd, "Failed to setup config space; "
+ "continuing anyway\n");
+
+ /*
+ * set up our interrupt handler; IRQF_SHARED probably not needed,
+ * since MSI interrupts shouldn't be shared but won't hurt for now.
+	 * Check for a 0 irq after we return from chip-specific bus setup,
+	 * since that setup can affect it.
+ */
+ if (!dd->ipath_irq)
+ ipath_dev_err(dd, "irq is 0, BIOS error? Interrupts won't "
+ "work\n");
+ else {
+ ret = request_irq(dd->ipath_irq, ipath_intr, IRQF_SHARED,
+ IPATH_DRV_NAME, dd);
+ if (ret) {
+ ipath_dev_err(dd, "Couldn't setup irq handler, "
+ "irq=%d: %d\n", dd->ipath_irq, ret);
+ goto bail_iounmap;
+ }
+ }
+
+ ret = ipath_init_chip(dd, 0); /* do the chip-specific init */
+ if (ret)
+ goto bail_irqsetup;
+
+ ret = ipath_enable_wc(dd);
+
+ if (ret) {
+ ipath_dev_err(dd, "Write combining not enabled "
+ "(err %d): performance may be poor\n",
+ -ret);
+ ret = 0;
+ }
+
+ ipath_verify_pioperf(dd);
+
+ ipath_device_create_group(&pdev->dev, dd);
+ ipathfs_add_device(dd);
+ ipath_user_add(dd);
+ ipath_diag_add(dd);
+ ipath_register_ib_device(dd);
+
+ goto bail;
+
+bail_irqsetup:
+ cleanup_device(dd);
+
+ if (dd->ipath_irq)
+ dd->ipath_f_free_irq(dd);
+
+ if (dd->ipath_f_cleanup)
+ dd->ipath_f_cleanup(dd);
+
+bail_iounmap:
+ iounmap((volatile void __iomem *) dd->ipath_kregbase);
+
+bail_regions:
+ pci_release_regions(pdev);
+
+bail_disable:
+ pci_disable_device(pdev);
+
+bail_devdata:
+ ipath_free_devdata(pdev, dd);
+
+bail:
+ return ret;
+}
+
+static void cleanup_device(struct ipath_devdata *dd)
+{
+ int port;
+ struct ipath_portdata **tmp;
+ unsigned long flags;
+
+ if (*dd->ipath_statusp & IPATH_STATUS_CHIP_PRESENT) {
+ /* can't do anything more with chip; needs re-init */
+ *dd->ipath_statusp &= ~IPATH_STATUS_CHIP_PRESENT;
+ if (dd->ipath_kregbase) {
+ /*
+			 * if we haven't already cleaned up, clear these now
+			 * to ensure any register reads/writes "fail" until
+ * re-init
+ */
+ dd->ipath_kregbase = NULL;
+ dd->ipath_uregbase = 0;
+ dd->ipath_sregbase = 0;
+ dd->ipath_cregbase = 0;
+ dd->ipath_kregsize = 0;
+ }
+ ipath_disable_wc(dd);
+ }
+
+ if (dd->ipath_spectriggerhit)
+ dev_info(&dd->pcidev->dev, "%lu special trigger hits\n",
+ dd->ipath_spectriggerhit);
+
+ if (dd->ipath_pioavailregs_dma) {
+ dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+ (void *) dd->ipath_pioavailregs_dma,
+ dd->ipath_pioavailregs_phys);
+ dd->ipath_pioavailregs_dma = NULL;
+ }
+ if (dd->ipath_dummy_hdrq) {
+ dma_free_coherent(&dd->pcidev->dev,
+ dd->ipath_pd[0]->port_rcvhdrq_size,
+ dd->ipath_dummy_hdrq, dd->ipath_dummy_hdrq_phys);
+ dd->ipath_dummy_hdrq = NULL;
+ }
+
+ if (dd->ipath_pageshadow) {
+ struct page **tmpp = dd->ipath_pageshadow;
+ dma_addr_t *tmpd = dd->ipath_physshadow;
+ int i, cnt = 0;
+
+ ipath_cdbg(VERBOSE, "Unlocking any expTID pages still "
+ "locked\n");
+ for (port = 0; port < dd->ipath_cfgports; port++) {
+ int port_tidbase = port * dd->ipath_rcvtidcnt;
+ int maxtid = port_tidbase + dd->ipath_rcvtidcnt;
+ for (i = port_tidbase; i < maxtid; i++) {
+ if (!tmpp[i])
+ continue;
+ pci_unmap_page(dd->pcidev, tmpd[i],
+ PAGE_SIZE, PCI_DMA_FROMDEVICE);
+ ipath_release_user_pages(&tmpp[i], 1);
+ tmpp[i] = NULL;
+ cnt++;
+ }
+ }
+ if (cnt) {
+ ipath_stats.sps_pageunlocks += cnt;
+ ipath_cdbg(VERBOSE, "There were still %u expTID "
+ "entries locked\n", cnt);
+ }
+ if (ipath_stats.sps_pagelocks ||
+ ipath_stats.sps_pageunlocks)
+ ipath_cdbg(VERBOSE, "%llu pages locked, %llu "
+ "unlocked via ipath_m{un}lock\n",
+ (unsigned long long)
+ ipath_stats.sps_pagelocks,
+ (unsigned long long)
+ ipath_stats.sps_pageunlocks);
+
+ ipath_cdbg(VERBOSE, "Free shadow page tid array at %p\n",
+ dd->ipath_pageshadow);
+ tmpp = dd->ipath_pageshadow;
+ dd->ipath_pageshadow = NULL;
+ vfree(tmpp);
+
+ dd->ipath_egrtidbase = NULL;
+ }
+
+ /*
+ * free any resources still in use (usually just kernel ports)
+	 * at unload; we iterate over portcnt, because that's what we allocate.
+ * We acquire lock to be really paranoid that ipath_pd isn't being
+ * accessed from some interrupt-related code (that should not happen,
+ * but best to be sure).
+ */
+ spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
+ tmp = dd->ipath_pd;
+ dd->ipath_pd = NULL;
+ spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
+ for (port = 0; port < dd->ipath_portcnt; port++) {
+ struct ipath_portdata *pd = tmp[port];
+ tmp[port] = NULL; /* debugging paranoia */
+ ipath_free_pddata(dd, pd);
+ }
+ kfree(tmp);
+}
+
+static void ipath_remove_one(struct pci_dev *pdev)
+{
+ struct ipath_devdata *dd = pci_get_drvdata(pdev);
+
+ ipath_cdbg(VERBOSE, "removing, pdev=%p, dd=%p\n", pdev, dd);
+
+ /*
+ * disable the IB link early, to be sure no new packets arrive, which
+ * complicates the shutdown process
+ */
+ ipath_shutdown_device(dd);
+
+ flush_workqueue(ib_wq);
+
+ if (dd->verbs_dev)
+ ipath_unregister_ib_device(dd->verbs_dev);
+
+ ipath_diag_remove(dd);
+ ipath_user_remove(dd);
+ ipathfs_remove_device(dd);
+ ipath_device_remove_group(&pdev->dev, dd);
+
+ ipath_cdbg(VERBOSE, "Releasing pci memory regions, dd %p, "
+ "unit %u\n", dd, (u32) dd->ipath_unit);
+
+ cleanup_device(dd);
+
+ /*
+	 * turn off rcv, send, and interrupts for all ports; should all
+	 * drivers also hard reset the chip here?
+	 * Free up port 0 (kernel) rcvhdr, egr bufs, and eventually tid bufs
+	 * for all versions of the driver, if they were allocated.
+ */
+ if (dd->ipath_irq) {
+ ipath_cdbg(VERBOSE, "unit %u free irq %d\n",
+ dd->ipath_unit, dd->ipath_irq);
+ dd->ipath_f_free_irq(dd);
+ } else
+ ipath_dbg("irq is 0, not doing free_irq "
+ "for unit %u\n", dd->ipath_unit);
+ /*
+ * we check for NULL here, because it's outside
+ * the kregbase check, and we need to call it
+ * after the free_irq. Thus it's possible that
+ * the function pointers were never initialized.
+ */
+ if (dd->ipath_f_cleanup)
+ /* clean up chip-specific stuff */
+ dd->ipath_f_cleanup(dd);
+
+ ipath_cdbg(VERBOSE, "Unmapping kregbase %p\n", dd->ipath_kregbase);
+ iounmap((volatile void __iomem *) dd->ipath_kregbase);
+ pci_release_regions(pdev);
+ ipath_cdbg(VERBOSE, "calling pci_disable_device\n");
+ pci_disable_device(pdev);
+
+ ipath_free_devdata(pdev, dd);
+}
+
+/* general driver use */
+DEFINE_MUTEX(ipath_mutex);
+
+static DEFINE_SPINLOCK(ipath_pioavail_lock);
+
+/**
+ * ipath_disarm_piobufs - cancel a range of PIO buffers
+ * @dd: the infinipath device
+ * @first: the first PIO buffer to cancel
+ * @cnt: the number of PIO buffers to cancel
+ *
+ * cancel a range of PIO buffers, used when they might be armed, but
+ * not triggered. Used at init to ensure buffer state, and also user
+ * process close, in case it died while writing to a PIO buffer.
+ * Also used after errors.
+ */
+void ipath_disarm_piobufs(struct ipath_devdata *dd, unsigned first,
+ unsigned cnt)
+{
+ unsigned i, last = first + cnt;
+ unsigned long flags;
+
+ ipath_cdbg(PKT, "disarm %u PIObufs first=%u\n", cnt, first);
+ for (i = first; i < last; i++) {
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ /*
+ * The disarm-related bits are write-only, so it
+ * is ok to OR them in with our copy of sendctrl
+ * while we hold the lock.
+ */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+ dd->ipath_sendctrl | INFINIPATH_S_DISARM |
+ (i << INFINIPATH_S_DISARMPIOBUF_SHIFT));
+ /* can't disarm bufs back-to-back per iba7220 spec */
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+ }
+ /* on some older chips, update may not happen after cancel */
+ ipath_force_pio_avail_update(dd);
+}
+
+/**
+ * ipath_wait_linkstate - wait for an IB link state change to occur
+ * @dd: the infinipath device
+ * @state: the state to wait for
+ * @msecs: the number of milliseconds to wait
+ *
+ * wait up to msecs milliseconds for an IB link state change to occur.
+ * For now, take the easy polling route.  Currently used only by
+ * ipath_set_linkstate.  Returns 0 if the state is reached, otherwise
+ * -ETIMEDOUT.  Note that state can have multiple state bits set, for
+ * any of several transitions.
+ */
+int ipath_wait_linkstate(struct ipath_devdata *dd, u32 state, int msecs)
+{
+ dd->ipath_state_wanted = state;
+ wait_event_interruptible_timeout(ipath_state_wait,
+ (dd->ipath_flags & state),
+ msecs_to_jiffies(msecs));
+ dd->ipath_state_wanted = 0;
+
+ if (!(dd->ipath_flags & state)) {
+ u64 val;
+ ipath_cdbg(VERBOSE, "Didn't reach linkstate %s within %u"
+ " ms\n",
+ /* test INIT ahead of DOWN, both can be set */
+ (state & IPATH_LINKINIT) ? "INIT" :
+ ((state & IPATH_LINKDOWN) ? "DOWN" :
+ ((state & IPATH_LINKARMED) ? "ARM" : "ACTIVE")),
+ msecs);
+ val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
+ ipath_cdbg(VERBOSE, "ibcc=%llx ibcstatus=%llx (%s)\n",
+ (unsigned long long) ipath_read_kreg64(
+ dd, dd->ipath_kregs->kr_ibcctrl),
+ (unsigned long long) val,
+ ipath_ibcstatus_str[val & dd->ibcs_lts_mask]);
+ }
+ return (dd->ipath_flags & state) ? 0 : -ETIMEDOUT;
+}
+
+static void decode_sdma_errs(struct ipath_devdata *dd, ipath_err_t err,
+ char *buf, size_t blen)
+{
+ static const struct {
+ ipath_err_t err;
+ const char *msg;
+ } errs[] = {
+ { INFINIPATH_E_SDMAGENMISMATCH, "SDmaGenMismatch" },
+ { INFINIPATH_E_SDMAOUTOFBOUND, "SDmaOutOfBound" },
+ { INFINIPATH_E_SDMATAILOUTOFBOUND, "SDmaTailOutOfBound" },
+ { INFINIPATH_E_SDMABASE, "SDmaBase" },
+ { INFINIPATH_E_SDMA1STDESC, "SDma1stDesc" },
+ { INFINIPATH_E_SDMARPYTAG, "SDmaRpyTag" },
+ { INFINIPATH_E_SDMADWEN, "SDmaDwEn" },
+ { INFINIPATH_E_SDMAMISSINGDW, "SDmaMissingDw" },
+ { INFINIPATH_E_SDMAUNEXPDATA, "SDmaUnexpData" },
+ { INFINIPATH_E_SDMADESCADDRMISALIGN, "SDmaDescAddrMisalign" },
+ { INFINIPATH_E_SENDBUFMISUSE, "SendBufMisuse" },
+ { INFINIPATH_E_SDMADISABLED, "SDmaDisabled" },
+ };
+ int i;
+ int expected;
+ size_t bidx = 0;
+
+ for (i = 0; i < ARRAY_SIZE(errs); i++) {
+ expected = (errs[i].err != INFINIPATH_E_SDMADISABLED) ? 0 :
+ test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
+ if ((err & errs[i].err) && !expected)
+ bidx += snprintf(buf + bidx, blen - bidx,
+ "%s ", errs[i].msg);
+ }
+}
+
+/*
+ * Decode the error status into strings, deciding whether to always
+ * print it or not depending on "normal packet errors" vs everything
+ * else. Return 1 if "real" errors, otherwise 0 if only packet
+ * errors, so caller can decide what to print with the string.
+ */
+int ipath_decode_err(struct ipath_devdata *dd, char *buf, size_t blen,
+ ipath_err_t err)
+{
+ int iserr = 1;
+ *buf = '\0';
+ if (err & INFINIPATH_E_PKTERRS) {
+ if (!(err & ~INFINIPATH_E_PKTERRS))
+			iserr = 0; /* if only packet errors */
+ if (ipath_debug & __IPATH_ERRPKTDBG) {
+ if (err & INFINIPATH_E_REBP)
+ strlcat(buf, "EBP ", blen);
+ if (err & INFINIPATH_E_RVCRC)
+ strlcat(buf, "VCRC ", blen);
+ if (err & INFINIPATH_E_RICRC) {
+ strlcat(buf, "CRC ", blen);
+				/* clear for check below, so only once */
+ err &= INFINIPATH_E_RICRC;
+ }
+ if (err & INFINIPATH_E_RSHORTPKTLEN)
+ strlcat(buf, "rshortpktlen ", blen);
+ if (err & INFINIPATH_E_SDROPPEDDATAPKT)
+ strlcat(buf, "sdroppeddatapkt ", blen);
+ if (err & INFINIPATH_E_SPKTLEN)
+ strlcat(buf, "spktlen ", blen);
+ }
+ if ((err & INFINIPATH_E_RICRC) &&
+ !(err&(INFINIPATH_E_RVCRC|INFINIPATH_E_REBP)))
+ strlcat(buf, "CRC ", blen);
+ if (!iserr)
+ goto done;
+ }
+ if (err & INFINIPATH_E_RHDRLEN)
+ strlcat(buf, "rhdrlen ", blen);
+ if (err & INFINIPATH_E_RBADTID)
+ strlcat(buf, "rbadtid ", blen);
+ if (err & INFINIPATH_E_RBADVERSION)
+ strlcat(buf, "rbadversion ", blen);
+ if (err & INFINIPATH_E_RHDR)
+ strlcat(buf, "rhdr ", blen);
+ if (err & INFINIPATH_E_SENDSPECIALTRIGGER)
+ strlcat(buf, "sendspecialtrigger ", blen);
+ if (err & INFINIPATH_E_RLONGPKTLEN)
+ strlcat(buf, "rlongpktlen ", blen);
+ if (err & INFINIPATH_E_RMAXPKTLEN)
+ strlcat(buf, "rmaxpktlen ", blen);
+ if (err & INFINIPATH_E_RMINPKTLEN)
+ strlcat(buf, "rminpktlen ", blen);
+ if (err & INFINIPATH_E_SMINPKTLEN)
+ strlcat(buf, "sminpktlen ", blen);
+ if (err & INFINIPATH_E_RFORMATERR)
+ strlcat(buf, "rformaterr ", blen);
+ if (err & INFINIPATH_E_RUNSUPVL)
+ strlcat(buf, "runsupvl ", blen);
+ if (err & INFINIPATH_E_RUNEXPCHAR)
+ strlcat(buf, "runexpchar ", blen);
+ if (err & INFINIPATH_E_RIBFLOW)
+ strlcat(buf, "ribflow ", blen);
+ if (err & INFINIPATH_E_SUNDERRUN)
+ strlcat(buf, "sunderrun ", blen);
+ if (err & INFINIPATH_E_SPIOARMLAUNCH)
+ strlcat(buf, "spioarmlaunch ", blen);
+ if (err & INFINIPATH_E_SUNEXPERRPKTNUM)
+ strlcat(buf, "sunexperrpktnum ", blen);
+ if (err & INFINIPATH_E_SDROPPEDSMPPKT)
+ strlcat(buf, "sdroppedsmppkt ", blen);
+ if (err & INFINIPATH_E_SMAXPKTLEN)
+ strlcat(buf, "smaxpktlen ", blen);
+ if (err & INFINIPATH_E_SUNSUPVL)
+ strlcat(buf, "sunsupVL ", blen);
+ if (err & INFINIPATH_E_INVALIDADDR)
+ strlcat(buf, "invalidaddr ", blen);
+ if (err & INFINIPATH_E_RRCVEGRFULL)
+ strlcat(buf, "rcvegrfull ", blen);
+ if (err & INFINIPATH_E_RRCVHDRFULL)
+ strlcat(buf, "rcvhdrfull ", blen);
+ if (err & INFINIPATH_E_IBSTATUSCHANGED)
+ strlcat(buf, "ibcstatuschg ", blen);
+ if (err & INFINIPATH_E_RIBLOSTLINK)
+ strlcat(buf, "riblostlink ", blen);
+ if (err & INFINIPATH_E_HARDWARE)
+ strlcat(buf, "hardware ", blen);
+ if (err & INFINIPATH_E_RESET)
+ strlcat(buf, "reset ", blen);
+ if (err & INFINIPATH_E_SDMAERRS)
+ decode_sdma_errs(dd, err, buf, blen);
+ if (err & INFINIPATH_E_INVALIDEEPCMD)
+ strlcat(buf, "invalideepromcmd ", blen);
+done:
+ return iserr;
+}
+
+/**
+ * get_rhf_errstring - decode RHF errors
+ * @err: the err number
+ * @msg: the output buffer
+ * @len: the length of the output buffer
+ *
+ * only used in one place now; may want more later
+ */
+static void get_rhf_errstring(u32 err, char *msg, size_t len)
+{
+	/* start empty, so strlcat works and "no errors" gives an empty string */
+ *msg = '\0';
+
+ if (err & INFINIPATH_RHF_H_ICRCERR)
+ strlcat(msg, "icrcerr ", len);
+ if (err & INFINIPATH_RHF_H_VCRCERR)
+ strlcat(msg, "vcrcerr ", len);
+ if (err & INFINIPATH_RHF_H_PARITYERR)
+ strlcat(msg, "parityerr ", len);
+ if (err & INFINIPATH_RHF_H_LENERR)
+ strlcat(msg, "lenerr ", len);
+ if (err & INFINIPATH_RHF_H_MTUERR)
+ strlcat(msg, "mtuerr ", len);
+ if (err & INFINIPATH_RHF_H_IHDRERR)
+ /* infinipath hdr checksum error */
+ strlcat(msg, "ipathhdrerr ", len);
+ if (err & INFINIPATH_RHF_H_TIDERR)
+ strlcat(msg, "tiderr ", len);
+ if (err & INFINIPATH_RHF_H_MKERR)
+ /* bad port, offset, etc. */
+ strlcat(msg, "invalid ipathhdr ", len);
+ if (err & INFINIPATH_RHF_H_IBERR)
+ strlcat(msg, "iberr ", len);
+ if (err & INFINIPATH_RHF_L_SWA)
+ strlcat(msg, "swA ", len);
+ if (err & INFINIPATH_RHF_L_SWB)
+ strlcat(msg, "swB ", len);
+}
+
+/**
+ * ipath_get_egrbuf - get an eager buffer
+ * @dd: the infinipath device
+ * @bufnum: the eager buffer to get
+ *
+ * must only be called if ipath_pd[port] is known to be allocated
+ */
+static inline void *ipath_get_egrbuf(struct ipath_devdata *dd, u32 bufnum)
+{
+ return dd->ipath_port0_skbinfo ?
+ (void *) dd->ipath_port0_skbinfo[bufnum].skb->data : NULL;
+}
+
+/**
+ * ipath_alloc_skb - allocate an skb and buffer with possible constraints
+ * @dd: the infinipath device
+ * @gfp_mask: the GFP allocation mask for the sk_buff
+ */
+struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd,
+ gfp_t gfp_mask)
+{
+ struct sk_buff *skb;
+ u32 len;
+
+ /*
+	 * The only fully supported way to handle this is to allocate lots of
+	 * extra space, align as needed, and then do skb_reserve().  That wastes
+ * a lot of memory... I'll have to hack this into infinipath_copy
+ * also.
+ */
+
+ /*
+ * We need 2 extra bytes for ipath_ether data sent in the
+ * key header. In order to keep everything dword aligned,
+ * we'll reserve 4 bytes.
+ */
+ len = dd->ipath_ibmaxlen + 4;
+
+ if (dd->ipath_flags & IPATH_4BYTE_TID) {
+ /* We need a 2KB multiple alignment, and there is no way
+ * to do it except to allocate extra and then skb_reserve
+ * enough to bring it up to the right alignment.
+ */
+ len += 2047;
+ }
+
+ skb = __dev_alloc_skb(len, gfp_mask);
+ if (!skb) {
+ ipath_dev_err(dd, "Failed to allocate skbuff, length %u\n",
+ len);
+ goto bail;
+ }
+
+ skb_reserve(skb, 4);
+
+ if (dd->ipath_flags & IPATH_4BYTE_TID) {
+ u32 una = (unsigned long)skb->data & 2047;
+ if (una)
+ skb_reserve(skb, 2048 - una);
+ }
+
+bail:
+ return skb;
+}
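+
+/*
+ * Worked example for the alignment fixup above (the address is
+ * illustrative): if skb->data lands at an address with
+ * (addr & 2047) == 1500, then skb_reserve(skb, 2048 - 1500) advances
+ * data by 548 bytes to the next 2 KiB boundary, which is why 2047
+ * extra bytes are allocated up front for IPATH_4BYTE_TID chips.
+ */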
+
+static void ipath_rcv_hdrerr(struct ipath_devdata *dd,
+ u32 eflags,
+ u32 l,
+ u32 etail,
+ __le32 *rhf_addr,
+ struct ipath_message_header *hdr)
+{
+ char emsg[128];
+
+ get_rhf_errstring(eflags, emsg, sizeof emsg);
+ ipath_cdbg(PKT, "RHFerrs %x hdrqtail=%x typ=%u "
+ "tlen=%x opcode=%x egridx=%x: %s\n",
+ eflags, l,
+ ipath_hdrget_rcv_type(rhf_addr),
+ ipath_hdrget_length_in_bytes(rhf_addr),
+ be32_to_cpu(hdr->bth[0]) >> 24,
+ etail, emsg);
+
+ /* Count local link integrity errors. */
+ if (eflags & (INFINIPATH_RHF_H_ICRCERR | INFINIPATH_RHF_H_VCRCERR)) {
+ u8 n = (dd->ipath_ibcctrl >>
+ INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) &
+ INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK;
+
+ if (++dd->ipath_lli_counter > n) {
+ dd->ipath_lli_counter = 0;
+ dd->ipath_lli_errors++;
+ }
+ }
+}
+
+/*
+ * ipath_kreceive - receive a packet
+ * @pd: the infinipath port
+ *
+ * called from interrupt handler for errors or receive interrupt
+ */
+void ipath_kreceive(struct ipath_portdata *pd)
+{
+ struct ipath_devdata *dd = pd->port_dd;
+ __le32 *rhf_addr;
+ void *ebuf;
+ const u32 rsize = dd->ipath_rcvhdrentsize; /* words */
+ const u32 maxcnt = dd->ipath_rcvhdrcnt * rsize; /* words */
+ u32 etail = -1, l, hdrqtail;
+ struct ipath_message_header *hdr;
+ u32 eflags, i, etype, tlen, pkttot = 0, updegr = 0, reloop = 0;
+ static u64 totcalls; /* stats, may eventually remove */
+ int last;
+
+ l = pd->port_head;
+ rhf_addr = (__le32 *) pd->port_rcvhdrq + l + dd->ipath_rhf_offset;
+ if (dd->ipath_flags & IPATH_NODMA_RTAIL) {
+ u32 seq = ipath_hdrget_seq(rhf_addr);
+
+ if (seq != pd->port_seq_cnt)
+ goto bail;
+ hdrqtail = 0;
+ } else {
+ hdrqtail = ipath_get_rcvhdrtail(pd);
+ if (l == hdrqtail)
+ goto bail;
+ smp_rmb();
+ }
+
+reloop:
+ for (last = 0, i = 1; !last; i += !last) {
+ hdr = dd->ipath_f_get_msgheader(dd, rhf_addr);
+ eflags = ipath_hdrget_err_flags(rhf_addr);
+ etype = ipath_hdrget_rcv_type(rhf_addr);
+ /* total length */
+ tlen = ipath_hdrget_length_in_bytes(rhf_addr);
+ ebuf = NULL;
+ if ((dd->ipath_flags & IPATH_NODMA_RTAIL) ?
+ ipath_hdrget_use_egr_buf(rhf_addr) :
+ (etype != RCVHQ_RCV_TYPE_EXPECTED)) {
+ /*
+ * It turns out that the chip uses an eager buffer
+ * for all non-expected packets, whether it "needs"
+ * one or not. So always get the index, but don't
+ * set ebuf (so we try to copy data) unless the
+ * length requires it.
+ */
+ etail = ipath_hdrget_index(rhf_addr);
+ updegr = 1;
+ if (tlen > sizeof(*hdr) ||
+ etype == RCVHQ_RCV_TYPE_NON_KD)
+ ebuf = ipath_get_egrbuf(dd, etail);
+ }
+
+ /*
+ * both tiderr and ipathhdrerr are set for all plain IB
+ * packets; only ipathhdrerr should be set.
+ */
+
+ if (etype != RCVHQ_RCV_TYPE_NON_KD &&
+ etype != RCVHQ_RCV_TYPE_ERROR &&
+ ipath_hdrget_ipath_ver(hdr->iph.ver_port_tid_offset) !=
+ IPS_PROTO_VERSION)
+ ipath_cdbg(PKT, "Bad InfiniPath protocol version "
+ "%x\n", etype);
+
+ if (unlikely(eflags))
+ ipath_rcv_hdrerr(dd, eflags, l, etail, rhf_addr, hdr);
+ else if (etype == RCVHQ_RCV_TYPE_NON_KD) {
+ ipath_ib_rcv(dd->verbs_dev, (u32 *)hdr, ebuf, tlen);
+ if (dd->ipath_lli_counter)
+ dd->ipath_lli_counter--;
+ } else if (etype == RCVHQ_RCV_TYPE_EAGER) {
+ u8 opcode = be32_to_cpu(hdr->bth[0]) >> 24;
+ u32 qp = be32_to_cpu(hdr->bth[1]) & 0xffffff;
+ ipath_cdbg(PKT, "typ %x, opcode %x (eager, "
+ "qp=%x), len %x; ignored\n",
+ etype, opcode, qp, tlen);
+ }
+ else if (etype == RCVHQ_RCV_TYPE_EXPECTED)
+ ipath_dbg("Bug: Expected TID, opcode %x; ignored\n",
+ be32_to_cpu(hdr->bth[0]) >> 24);
+ else {
+ /*
+ * error packet, type of error unknown.
+ * Probably type 3, but we don't know, so don't
+ * even try to print the opcode, etc.
+			 * Usually caused by a "bad packet" that has no
+			 * BTH when the LRH says it should have one.
+ */
+ ipath_cdbg(ERRPKT, "Error Pkt, but no eflags! egrbuf"
+ " %x, len %x hdrq+%x rhf: %Lx\n",
+ etail, tlen, l, (unsigned long long)
+ le64_to_cpu(*(__le64 *) rhf_addr));
+ if (ipath_debug & __IPATH_ERRPKTDBG) {
+ u32 j, *d, dw = rsize-2;
+ if (rsize > (tlen>>2))
+ dw = tlen>>2;
+ d = (u32 *)hdr;
+ printk(KERN_DEBUG "EPkt rcvhdr(%x dw):\n",
+ dw);
+ for (j = 0; j < dw; j++)
+ printk(KERN_DEBUG "%8x%s", d[j],
+ (j%8) == 7 ? "\n" : " ");
+ printk(KERN_DEBUG ".\n");
+ }
+ }
+ l += rsize;
+ if (l >= maxcnt)
+ l = 0;
+ rhf_addr = (__le32 *) pd->port_rcvhdrq +
+ l + dd->ipath_rhf_offset;
+ if (dd->ipath_flags & IPATH_NODMA_RTAIL) {
+ u32 seq = ipath_hdrget_seq(rhf_addr);
+
+ if (++pd->port_seq_cnt > 13)
+ pd->port_seq_cnt = 1;
+ if (seq != pd->port_seq_cnt)
+ last = 1;
+ } else if (l == hdrqtail)
+ last = 1;
+ /*
+ * update head regs on last packet, and every 16 packets.
+ * Reduce bus traffic, while still trying to prevent
+ * rcvhdrq overflows, for when the queue is nearly full
+ */
+ if (last || !(i & 0xf)) {
+ u64 lval = l;
+
+ /* request IBA6120 and 7220 interrupt only on last */
+ if (last)
+ lval |= dd->ipath_rhdrhead_intr_off;
+ ipath_write_ureg(dd, ur_rcvhdrhead, lval,
+ pd->port_port);
+ if (updegr) {
+ ipath_write_ureg(dd, ur_rcvegrindexhead,
+ etail, pd->port_port);
+ updegr = 0;
+ }
+ }
+ }
+
+ if (!dd->ipath_rhdrhead_intr_off && !reloop &&
+ !(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
+ /* IBA6110 workaround; we can have a race clearing chip
+ * interrupt with another interrupt about to be delivered,
+ * and can clear it before it is delivered on the GPIO
+ * workaround. By doing the extra check here for the
+ * in-memory tail register updating while we were doing
+ * earlier packets, we "almost" guarantee we have covered
+ * that case.
+ */
+ u32 hqtail = ipath_get_rcvhdrtail(pd);
+ if (hqtail != hdrqtail) {
+ hdrqtail = hqtail;
+ reloop = 1; /* loop 1 extra time at most */
+ goto reloop;
+ }
+ }
+
+ pkttot += i;
+
+ pd->port_head = l;
+
+ if (pkttot > ipath_stats.sps_maxpkts_call)
+ ipath_stats.sps_maxpkts_call = pkttot;
+ ipath_stats.sps_port0pkts += pkttot;
+ ipath_stats.sps_avgpkts_call =
+ ipath_stats.sps_port0pkts / ++totcalls;
+
+bail:;
+}
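+
+/*
+ * Note on the two completion schemes in ipath_kreceive() above: with
+ * IPATH_NODMA_RTAIL the chip writes a sequence number (cycling 1..13)
+ * into each RHF, and a mismatch against port_seq_cnt marks the end of
+ * the valid entries; otherwise the DMA'd tail register (hdrqtail) is
+ * compared against the running head index l.
+ */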
+
+/**
+ * ipath_update_pio_bufs - update shadow copy of the PIO availability map
+ * @dd: the infinipath device
+ *
+ * called whenever our local copy indicates we have run out of send buffers.
+ * NOTE: This can be called from interrupt context by some code
+ * and from non-interrupt context by ipath_getpiobuf().
+ */
+
+static void ipath_update_pio_bufs(struct ipath_devdata *dd)
+{
+ unsigned long flags;
+ int i;
+ const unsigned piobregs = (unsigned)dd->ipath_pioavregs;
+
+ /* If the generation (check) bits have changed, then we update the
+ * busy bit for the corresponding PIO buffer. This algorithm will
+ * modify positions to the value they already have in some cases
+ * (i.e., no change), but it's faster than changing only the bits
+ * that have changed.
+ *
+	 * We would like to do this atomically, to avoid spinlocks in the
+	 * critical send path, but that's not really possible, given the
+	 * type of changes, and that this routine could be called on
+	 * multiple CPUs simultaneously, so we lock in this routine only,
+	 * to avoid conflicting updates; all we change is the shadow, and
+	 * it's a single 64 bit memory location, so by definition the update
+	 * is atomic in terms of what other CPUs can see in testing the
+	 * bits.  The spin_lock overhead isn't too bad, since it only
+	 * happens when all buffers are in use, so only CPU overhead, not
+ * latency or bandwidth is affected.
+ */
+ if (!dd->ipath_pioavailregs_dma) {
+ ipath_dbg("Update shadow pioavail, but regs_dma NULL!\n");
+ return;
+ }
+ if (ipath_debug & __IPATH_VERBDBG) {
+ /* only if packet debug and verbose */
+ volatile __le64 *dma = dd->ipath_pioavailregs_dma;
+ unsigned long *shadow = dd->ipath_pioavailshadow;
+
+ ipath_cdbg(PKT, "Refill avail, dma0=%llx shad0=%lx, "
+ "d1=%llx s1=%lx, d2=%llx s2=%lx, d3=%llx "
+ "s3=%lx\n",
+ (unsigned long long) le64_to_cpu(dma[0]),
+ shadow[0],
+ (unsigned long long) le64_to_cpu(dma[1]),
+ shadow[1],
+ (unsigned long long) le64_to_cpu(dma[2]),
+ shadow[2],
+ (unsigned long long) le64_to_cpu(dma[3]),
+ shadow[3]);
+ if (piobregs > 4)
+ ipath_cdbg(
+ PKT, "2nd group, dma4=%llx shad4=%lx, "
+ "d5=%llx s5=%lx, d6=%llx s6=%lx, "
+ "d7=%llx s7=%lx\n",
+ (unsigned long long) le64_to_cpu(dma[4]),
+ shadow[4],
+ (unsigned long long) le64_to_cpu(dma[5]),
+ shadow[5],
+ (unsigned long long) le64_to_cpu(dma[6]),
+ shadow[6],
+ (unsigned long long) le64_to_cpu(dma[7]),
+ shadow[7]);
+ }
+ spin_lock_irqsave(&ipath_pioavail_lock, flags);
+ for (i = 0; i < piobregs; i++) {
+ u64 pchbusy, pchg, piov, pnew;
+ /*
+ * Chip Errata: bug 6641; even and odd qwords>3 are swapped
+ */
+ if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS))
+ piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i ^ 1]);
+ else
+ piov = le64_to_cpu(dd->ipath_pioavailregs_dma[i]);
+ pchg = dd->ipath_pioavailkernel[i] &
+ ~(dd->ipath_pioavailshadow[i] ^ piov);
+ pchbusy = pchg << INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT;
+ if (pchg && (pchbusy & dd->ipath_pioavailshadow[i])) {
+ pnew = dd->ipath_pioavailshadow[i] & ~pchbusy;
+ pnew |= piov & pchbusy;
+ dd->ipath_pioavailshadow[i] = pnew;
+ }
+ }
+ spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+}
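+
+/*
+ * Illustrative sketch, not part of the driver: the shadow and kernel
+ * bitmaps above use two adjacent bits per send buffer -- bit 2*i is the
+ * generation ("check") bit and bit 2*i+1 is the busy bit, which is why
+ * ipath_getpiobuf_range() below tests bit 2*i+1 and flips bit 2*i.  The
+ * helper name here is hypothetical.
+ */
+static inline int example_piobuf_is_busy(const unsigned long *shadow,
+					 unsigned bufnum)
+{
+	return test_bit(2 * bufnum + 1, shadow);
+}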
+
+/*
+ * used to force update of pioavailshadow if we can't get a pio buffer.
+ * Needed primarily due to exiting freeze mode after recovering
+ * from errors. Done lazily, because it's safer (known to not
+ * be writing pio buffers).
+ */
+static void ipath_reset_availshadow(struct ipath_devdata *dd)
+{
+ int i, im;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ipath_pioavail_lock, flags);
+ for (i = 0; i < dd->ipath_pioavregs; i++) {
+ u64 val, oldval;
+ /* deal with 6110 chip bug on high register #s */
+ im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ?
+ i ^ 1 : i;
+ val = le64_to_cpu(dd->ipath_pioavailregs_dma[im]);
+ /*
+ * busy out the buffers not in the kernel avail list,
+ * without changing the generation bits.
+ */
+ oldval = dd->ipath_pioavailshadow[i];
+ dd->ipath_pioavailshadow[i] = val |
+ ((~dd->ipath_pioavailkernel[i] <<
+ INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT) &
+ 0xaaaaaaaaaaaaaaaaULL); /* All BUSY bits in qword */
+ if (oldval != dd->ipath_pioavailshadow[i])
+ ipath_dbg("shadow[%d] was %Lx, now %lx\n",
+ i, (unsigned long long) oldval,
+ dd->ipath_pioavailshadow[i]);
+ }
+ spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+}
+
+/**
+ * ipath_setrcvhdrsize - set the receive header size
+ * @dd: the infinipath device
+ * @rhdrsize: the receive header size
+ *
+ * called from user init code, and also layered driver init
+ */
+int ipath_setrcvhdrsize(struct ipath_devdata *dd, unsigned rhdrsize)
+{
+ int ret = 0;
+
+ if (dd->ipath_flags & IPATH_RCVHDRSZ_SET) {
+ if (dd->ipath_rcvhdrsize != rhdrsize) {
+ dev_info(&dd->pcidev->dev,
+ "Error: can't set protocol header "
+ "size %u, already %u\n",
+ rhdrsize, dd->ipath_rcvhdrsize);
+ ret = -EAGAIN;
+ } else
+ ipath_cdbg(VERBOSE, "Reuse same protocol header "
+ "size %u\n", dd->ipath_rcvhdrsize);
+ } else if (rhdrsize > (dd->ipath_rcvhdrentsize -
+ (sizeof(u64) / sizeof(u32)))) {
+ ipath_dbg("Error: can't set protocol header size %u "
+ "(> max %u)\n", rhdrsize,
+ dd->ipath_rcvhdrentsize -
+ (u32) (sizeof(u64) / sizeof(u32)));
+ ret = -EOVERFLOW;
+ } else {
+ dd->ipath_flags |= IPATH_RCVHDRSZ_SET;
+ dd->ipath_rcvhdrsize = rhdrsize;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize,
+ dd->ipath_rcvhdrsize);
+ ipath_cdbg(VERBOSE, "Set protocol header size to %u\n",
+ dd->ipath_rcvhdrsize);
+ }
+ return ret;
+}
+
+/*
+ * debugging code and stats updates if no pio buffers available.
+ */
+static noinline void no_pio_bufs(struct ipath_devdata *dd)
+{
+ unsigned long *shadow = dd->ipath_pioavailshadow;
+ __le64 *dma = (__le64 *)dd->ipath_pioavailregs_dma;
+
+ dd->ipath_upd_pio_shadow = 1;
+
+ /*
+	 * not atomic, but if we lose a stat count once in a while, that's OK
+ */
+ ipath_stats.sps_nopiobufs++;
+ if (!(++dd->ipath_consec_nopiobuf % 100000)) {
+ ipath_force_pio_avail_update(dd); /* at start */
+ ipath_dbg("%u tries no piobufavail ts%lx; dmacopy: "
+ "%llx %llx %llx %llx\n"
+ "ipath shadow: %lx %lx %lx %lx\n",
+ dd->ipath_consec_nopiobuf,
+ (unsigned long)get_cycles(),
+ (unsigned long long) le64_to_cpu(dma[0]),
+ (unsigned long long) le64_to_cpu(dma[1]),
+ (unsigned long long) le64_to_cpu(dma[2]),
+ (unsigned long long) le64_to_cpu(dma[3]),
+ shadow[0], shadow[1], shadow[2], shadow[3]);
+ /*
+ * 4 buffers per byte, 4 registers above, cover rest
+ * below
+ */
+ if ((dd->ipath_piobcnt2k + dd->ipath_piobcnt4k) >
+ (sizeof(shadow[0]) * 4 * 4))
+ ipath_dbg("2nd group: dmacopy: "
+ "%llx %llx %llx %llx\n"
+ "ipath shadow: %lx %lx %lx %lx\n",
+ (unsigned long long)le64_to_cpu(dma[4]),
+ (unsigned long long)le64_to_cpu(dma[5]),
+ (unsigned long long)le64_to_cpu(dma[6]),
+ (unsigned long long)le64_to_cpu(dma[7]),
+ shadow[4], shadow[5], shadow[6], shadow[7]);
+
+ /* at end, so update likely happened */
+ ipath_reset_availshadow(dd);
+ }
+}
+
+/*
+ * common code for normal driver pio buffer allocation, and reserved
+ * allocation.
+ *
+ * does the appropriate marking as busy, etc.
+ * Returns a pointer to the buffer on success, or NULL if none is
+ * available; the chosen buffer number is stored through *pbufnum.
+ */
+static u32 __iomem *ipath_getpiobuf_range(struct ipath_devdata *dd,
+ u32 *pbufnum, u32 first, u32 last, u32 firsti)
+{
+ int i, j, updated = 0;
+ unsigned piobcnt;
+ unsigned long flags;
+ unsigned long *shadow = dd->ipath_pioavailshadow;
+ u32 __iomem *buf;
+
+ piobcnt = last - first;
+ if (dd->ipath_upd_pio_shadow) {
+ /*
+ * Minor optimization. If we had no buffers on last call,
+		 * start out by doing the update; continue and do the scan
+		 * even if no buffers were updated, to be paranoid.
+ */
+ ipath_update_pio_bufs(dd);
+ updated++;
+ i = first;
+ } else
+ i = firsti;
+rescan:
+ /*
+ * while test_and_set_bit() is atomic, we do that and then the
+ * change_bit(), and the pair is not. See if this is the cause
+ * of the remaining armlaunch errors.
+ */
+ spin_lock_irqsave(&ipath_pioavail_lock, flags);
+ for (j = 0; j < piobcnt; j++, i++) {
+ if (i >= last)
+ i = first;
+ if (__test_and_set_bit((2 * i) + 1, shadow))
+ continue;
+ /* flip generation bit */
+ __change_bit(2 * i, shadow);
+ break;
+ }
+ spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+
+ if (j == piobcnt) {
+ if (!updated) {
+ /*
+ * first time through; shadow exhausted, but may be
+ * buffers available, try an update and then rescan.
+ */
+ ipath_update_pio_bufs(dd);
+ updated++;
+ i = first;
+ goto rescan;
+ } else if (updated == 1 && piobcnt <=
+ ((dd->ipath_sendctrl
+ >> INFINIPATH_S_UPDTHRESH_SHIFT) &
+ INFINIPATH_S_UPDTHRESH_MASK)) {
+ /*
+ * for chips supporting and using the update
+ * threshold we need to force an update of the
+ * in-memory copy if the count is less than the
+			 * threshold, then check one more time.
+ */
+ ipath_force_pio_avail_update(dd);
+ ipath_update_pio_bufs(dd);
+ updated++;
+ i = first;
+ goto rescan;
+ }
+
+ no_pio_bufs(dd);
+ buf = NULL;
+ } else {
+ if (i < dd->ipath_piobcnt2k)
+ buf = (u32 __iomem *) (dd->ipath_pio2kbase +
+ i * dd->ipath_palign);
+ else
+ buf = (u32 __iomem *)
+ (dd->ipath_pio4kbase +
+ (i - dd->ipath_piobcnt2k) * dd->ipath_4kalign);
+ if (pbufnum)
+ *pbufnum = i;
+ }
+
+ return buf;
+}
+
+/**
+ * ipath_getpiobuf - find an available pio buffer
+ * @dd: the infinipath device
+ * @plen: the size of the PIO buffer needed in 32-bit words
+ * @pbufnum: the buffer number is placed here
+ */
+u32 __iomem *ipath_getpiobuf(struct ipath_devdata *dd, u32 plen, u32 *pbufnum)
+{
+ u32 __iomem *buf;
+ u32 pnum, nbufs;
+ u32 first, lasti;
+
+ if (plen + 1 >= IPATH_SMALLBUF_DWORDS) {
+ first = dd->ipath_piobcnt2k;
+ lasti = dd->ipath_lastpioindexl;
+ } else {
+ first = 0;
+ lasti = dd->ipath_lastpioindex;
+ }
+ nbufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
+ buf = ipath_getpiobuf_range(dd, &pnum, first, nbufs, lasti);
+
+ if (buf) {
+ /*
+ * Set next starting place. It's just an optimization,
+ * it doesn't matter who wins on this, so no locking
+ */
+ if (plen + 1 >= IPATH_SMALLBUF_DWORDS)
+ dd->ipath_lastpioindexl = pnum + 1;
+ else
+ dd->ipath_lastpioindex = pnum + 1;
+ if (dd->ipath_upd_pio_shadow)
+ dd->ipath_upd_pio_shadow = 0;
+ if (dd->ipath_consec_nopiobuf)
+ dd->ipath_consec_nopiobuf = 0;
+ ipath_cdbg(VERBOSE, "Return piobuf%u %uk @ %p\n",
+ pnum, (pnum < dd->ipath_piobcnt2k) ? 2 : 4, buf);
+ if (pbufnum)
+ *pbufnum = pnum;
+
+ }
+ return buf;
+}
+
+/**
+ * ipath_chg_pioavailkernel - change which send buffers are available for kernel
+ * @dd: the infinipath device
+ * @start: the starting send buffer number
+ * @len: the number of send buffers
+ * @avail: true if the buffers are available for kernel use, false otherwise
+ */
+void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start,
+ unsigned len, int avail)
+{
+ unsigned long flags;
+ unsigned end, cnt = 0;
+
+ /* There are two bits per send buffer (busy and generation) */
+ start *= 2;
+ end = start + len * 2;
+
+ spin_lock_irqsave(&ipath_pioavail_lock, flags);
+ /* Set or clear the busy bit in the shadow. */
+ while (start < end) {
+ if (avail) {
+ unsigned long dma;
+ int i, im;
+ /*
+ * the BUSY bit will never be set, because we disarm
+ * the user buffers before we hand them back to the
+ * kernel. We do have to make sure the generation
+ * bit is set correctly in shadow, since it could
+ * have changed many times while allocated to user.
+ * We can't use the bitmap functions on the full
+ * dma array because it is always little-endian, so
+ * we have to flip to host-order first.
+ * BITS_PER_LONG is slightly wrong, since it's
+ * always 64 bits per register in chip...
+ * We only work on 64 bit kernels, so that's OK.
+ */
+ /* deal with 6110 chip bug on high register #s */
+ i = start / BITS_PER_LONG;
+ im = (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS)) ?
+ i ^ 1 : i;
+ __clear_bit(INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT
+ + start, dd->ipath_pioavailshadow);
+ dma = (unsigned long) le64_to_cpu(
+ dd->ipath_pioavailregs_dma[im]);
+ if (test_bit((INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
+ + start) % BITS_PER_LONG, &dma))
+ __set_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
+ + start, dd->ipath_pioavailshadow);
+ else
+ __clear_bit(INFINIPATH_SENDPIOAVAIL_CHECK_SHIFT
+ + start, dd->ipath_pioavailshadow);
+ __set_bit(start, dd->ipath_pioavailkernel);
+ } else {
+ __set_bit(start + INFINIPATH_SENDPIOAVAIL_BUSY_SHIFT,
+ dd->ipath_pioavailshadow);
+ __clear_bit(start, dd->ipath_pioavailkernel);
+ }
+ start += 2;
+ }
+
+ if (dd->ipath_pioupd_thresh) {
+ end = 2 * (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k);
+ cnt = bitmap_weight(dd->ipath_pioavailkernel, end);
+ }
+ spin_unlock_irqrestore(&ipath_pioavail_lock, flags);
+
+ /*
+ * When moving buffers from kernel to user, if number assigned to
+ * the user is less than the pio update threshold, and threshold
+ * is supported (cnt was computed > 0), drop the update threshold
+ * so we update at least once per allocated number of buffers.
+ * In any case, if the kernel buffers are less than the threshold,
+ * drop the threshold. We don't bother increasing it, having once
+ * decreased it, since it would typically just cycle back and forth.
+ * If we don't decrease below buffers in use, we can wait a long
+ * time for an update, until some other context uses PIO buffers.
+ */
+ if (!avail && len < cnt)
+ cnt = len;
+ if (cnt < dd->ipath_pioupd_thresh) {
+ dd->ipath_pioupd_thresh = cnt;
+ ipath_dbg("Decreased pio update threshold to %u\n",
+ dd->ipath_pioupd_thresh);
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ dd->ipath_sendctrl &= ~(INFINIPATH_S_UPDTHRESH_MASK
+ << INFINIPATH_S_UPDTHRESH_SHIFT);
+ dd->ipath_sendctrl |= dd->ipath_pioupd_thresh
+ << INFINIPATH_S_UPDTHRESH_SHIFT;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+ dd->ipath_sendctrl);
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+ }
+}
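+
+/*
+ * Worked example for the threshold adjustment above (numbers are
+ * illustrative): if 16 buffers are handed to a user process (avail == 0,
+ * len == 16) while the kernel still has 40 available and the current
+ * update threshold is 24, cnt is clamped to 16 and the threshold drops
+ * to 16, so, per the comment above, an update happens at least once per
+ * the allocated number of buffers.
+ */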
+
+/**
+ * ipath_create_rcvhdrq - create a receive header queue
+ * @dd: the infinipath device
+ * @pd: the port data
+ *
+ * this must be contiguous memory (from an i/o perspective), and must be
+ * DMA'able (which means for some systems, it will go through an IOMMU,
+ * or be forced into a low address range).
+ */
+int ipath_create_rcvhdrq(struct ipath_devdata *dd,
+ struct ipath_portdata *pd)
+{
+ int ret = 0;
+
+ if (!pd->port_rcvhdrq) {
+ dma_addr_t phys_hdrqtail;
+ gfp_t gfp_flags = GFP_USER | __GFP_COMP;
+ int amt = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize *
+ sizeof(u32), PAGE_SIZE);
+
+ pd->port_rcvhdrq = dma_alloc_coherent(
+ &dd->pcidev->dev, amt, &pd->port_rcvhdrq_phys,
+ gfp_flags);
+
+ if (!pd->port_rcvhdrq) {
+ ipath_dev_err(dd, "attempt to allocate %d bytes "
+ "for port %u rcvhdrq failed\n",
+ amt, pd->port_port);
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
+ pd->port_rcvhdrtail_kvaddr = dma_alloc_coherent(
+ &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
+ GFP_KERNEL);
+ if (!pd->port_rcvhdrtail_kvaddr) {
+ ipath_dev_err(dd, "attempt to allocate 1 page "
+ "for port %u rcvhdrqtailaddr "
+ "failed\n", pd->port_port);
+ ret = -ENOMEM;
+ dma_free_coherent(&dd->pcidev->dev, amt,
+ pd->port_rcvhdrq,
+ pd->port_rcvhdrq_phys);
+ pd->port_rcvhdrq = NULL;
+ goto bail;
+ }
+ pd->port_rcvhdrqtailaddr_phys = phys_hdrqtail;
+ ipath_cdbg(VERBOSE, "port %d hdrtailaddr, %llx "
+ "physical\n", pd->port_port,
+ (unsigned long long) phys_hdrqtail);
+ }
+
+ pd->port_rcvhdrq_size = amt;
+
+ ipath_cdbg(VERBOSE, "%d pages at %p (phys %lx) size=%lu "
+ "for port %u rcvhdr Q\n",
+ amt >> PAGE_SHIFT, pd->port_rcvhdrq,
+ (unsigned long) pd->port_rcvhdrq_phys,
+ (unsigned long) pd->port_rcvhdrq_size,
+ pd->port_port);
+ }
+ else
+ ipath_cdbg(VERBOSE, "reuse port %d rcvhdrq @%p %llx phys; "
+ "hdrtailaddr@%p %llx physical\n",
+ pd->port_port, pd->port_rcvhdrq,
+ (unsigned long long) pd->port_rcvhdrq_phys,
+ pd->port_rcvhdrtail_kvaddr, (unsigned long long)
+ pd->port_rcvhdrqtailaddr_phys);
+
+ /* clear for security and sanity on each use */
+ memset(pd->port_rcvhdrq, 0, pd->port_rcvhdrq_size);
+ if (pd->port_rcvhdrtail_kvaddr)
+ memset(pd->port_rcvhdrtail_kvaddr, 0, PAGE_SIZE);
+
+ /*
+ * tell chip each time we init it, even if we are re-using previous
+ * memory (we zero the register at process close)
+ */
+ ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdrtailaddr,
+ pd->port_port, pd->port_rcvhdrqtailaddr_phys);
+ ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr,
+ pd->port_port, pd->port_rcvhdrq_phys);
+
+bail:
+ return ret;
+}
+
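+/*
+ * Worked example for the allocation size above (values are illustrative,
+ * not from a specific chip): with ipath_rcvhdrcnt = 512 entries and
+ * ipath_rcvhdrentsize = 16 32-bit words, amt = ALIGN(512 * 16 * 4,
+ * PAGE_SIZE) = 32768 bytes, i.e. eight 4 KiB pages allocated as a single
+ * coherent DMA buffer.
+ */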
+
+/*
+ * Flush all sends that might be in the ready to send state, as well as any
+ * that are in the process of being sent. Used whenever we need to be
+ * sure the send side is idle. Cleans up all buffer state by canceling
+ * all pio buffers, and issuing an abort, which cleans up anything in the
+ * launch fifo. The cancel is superfluous on some chip versions, but
+ * it's safer to always do it.
+ * PIOAvail bits are updated by the chip as if normal send had happened.
+ */
+void ipath_cancel_sends(struct ipath_devdata *dd, int restore_sendctrl)
+{
+ unsigned long flags;
+
+ if (dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) {
+ ipath_cdbg(VERBOSE, "Ignore while in autonegotiation\n");
+ goto bail;
+ }
+ /*
+ * If we have SDMA, and it's not disabled, we have to kick off the
+ * abort state machine, provided we aren't already aborting.
+ * If we are in the process of aborting SDMA (!DISABLED, but ABORTING),
+ * we skip the rest of this routine. It is already "in progress"
+ */
+ if (dd->ipath_flags & IPATH_HAS_SEND_DMA) {
+ int skip_cancel;
+ unsigned long *statp = &dd->ipath_sdma_status;
+
+ spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+ skip_cancel =
+ test_and_set_bit(IPATH_SDMA_ABORTING, statp)
+ && !test_bit(IPATH_SDMA_DISABLED, statp);
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+ if (skip_cancel)
+ goto bail;
+ }
+
+ ipath_dbg("Cancelling all in-progress send buffers\n");
+
+ /* skip armlaunch errs for a while */
+ dd->ipath_lastcancel = jiffies + HZ / 2;
+
+ /*
+ * The abort bit is auto-clearing. We also don't want pioavail
+ * update happening during this, and we don't want any other
+ * sends going out, so turn those off for the duration. We read
+ * the scratch register to be sure that cancels and the abort
+ * have taken effect in the chip. Otherwise the two parts are the
+ * same as in ipath_force_pio_avail_update().
+ */
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ dd->ipath_sendctrl &= ~(INFINIPATH_S_PIOBUFAVAILUPD
+ | INFINIPATH_S_PIOENABLE);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+ dd->ipath_sendctrl | INFINIPATH_S_ABORT);
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+ /* disarm all send buffers */
+ ipath_disarm_piobufs(dd, 0,
+ dd->ipath_piobcnt2k + dd->ipath_piobcnt4k);
+
+ if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
+ set_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status);
+
+ if (restore_sendctrl) {
+ /* else done by caller later if needed */
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ dd->ipath_sendctrl |= INFINIPATH_S_PIOBUFAVAILUPD |
+ INFINIPATH_S_PIOENABLE;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+ dd->ipath_sendctrl);
+ /* and again, be sure all have hit the chip */
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+ }
+
+ if ((dd->ipath_flags & IPATH_HAS_SEND_DMA) &&
+ !test_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status) &&
+ test_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status)) {
+ spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+ /* only wait so long for intr */
+ dd->ipath_sdma_abort_intr_timeout = jiffies + HZ;
+ dd->ipath_sdma_reset_wait = 200;
+ if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+ tasklet_hi_schedule(&dd->ipath_sdma_abort_task);
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+ }
+bail:;
+}
+
+/*
+ * Force an update of the in-memory copy of the pioavail registers, when
+ * needed for any of a variety of reasons. We read the scratch register
+ * to make it highly likely that the update will have happened by the
+ * time we return. If already off (as in cancel_sends above), this
+ * routine is a nop, on the assumption that the caller will "do the
+ * right thing".
+ */
+void ipath_force_pio_avail_update(struct ipath_devdata *dd)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ if (dd->ipath_sendctrl & INFINIPATH_S_PIOBUFAVAILUPD) {
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+ dd->ipath_sendctrl & ~INFINIPATH_S_PIOBUFAVAILUPD);
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+ dd->ipath_sendctrl);
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ }
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+}
+
+static void ipath_set_ib_lstate(struct ipath_devdata *dd, int linkcmd,
+ int linitcmd)
+{
+ u64 mod_wd;
+ static const char *what[4] = {
+ [0] = "NOP",
+ [INFINIPATH_IBCC_LINKCMD_DOWN] = "DOWN",
+ [INFINIPATH_IBCC_LINKCMD_ARMED] = "ARMED",
+ [INFINIPATH_IBCC_LINKCMD_ACTIVE] = "ACTIVE"
+ };
+
+ if (linitcmd == INFINIPATH_IBCC_LINKINITCMD_DISABLE) {
+ /*
+ * If we are told to disable, note that so link-recovery
+ * code does not attempt to bring us back up.
+ */
+ preempt_disable();
+ dd->ipath_flags |= IPATH_IB_LINK_DISABLED;
+ preempt_enable();
+ } else if (linitcmd) {
+ /*
+ * Any other linkinitcmd will lead to LINKDOWN and then
+ * to INIT (if all is well), so clear flag to let
+ * link-recovery code attempt to bring us back up.
+ */
+ preempt_disable();
+ dd->ipath_flags &= ~IPATH_IB_LINK_DISABLED;
+ preempt_enable();
+ }
+
+ mod_wd = (linkcmd << dd->ibcc_lc_shift) |
+ (linitcmd << INFINIPATH_IBCC_LINKINITCMD_SHIFT);
+ ipath_cdbg(VERBOSE,
+ "Moving unit %u to %s (initcmd=0x%x), current ltstate is %s\n",
+ dd->ipath_unit, what[linkcmd], linitcmd,
+ ipath_ibcstatus_str[ipath_ib_linktrstate(dd,
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus))]);
+
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+ dd->ipath_ibcctrl | mod_wd);
+ /* read from chip so write is flushed */
+ (void) ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
+}
+
+int ipath_set_linkstate(struct ipath_devdata *dd, u8 newstate)
+{
+ u32 lstate;
+ int ret;
+
+ switch (newstate) {
+ case IPATH_IB_LINKDOWN_ONLY:
+ ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN, 0);
+ /* don't wait */
+ ret = 0;
+ goto bail;
+
+ case IPATH_IB_LINKDOWN:
+ ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
+ INFINIPATH_IBCC_LINKINITCMD_POLL);
+ /* don't wait */
+ ret = 0;
+ goto bail;
+
+ case IPATH_IB_LINKDOWN_SLEEP:
+ ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
+ INFINIPATH_IBCC_LINKINITCMD_SLEEP);
+ /* don't wait */
+ ret = 0;
+ goto bail;
+
+ case IPATH_IB_LINKDOWN_DISABLE:
+ ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_DOWN,
+ INFINIPATH_IBCC_LINKINITCMD_DISABLE);
+ /* don't wait */
+ ret = 0;
+ goto bail;
+
+ case IPATH_IB_LINKARM:
+ if (dd->ipath_flags & IPATH_LINKARMED) {
+ ret = 0;
+ goto bail;
+ }
+ if (!(dd->ipath_flags &
+ (IPATH_LINKINIT | IPATH_LINKACTIVE))) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ARMED, 0);
+
+ /*
+ * Since the port can transition to ACTIVE by receiving
+ * a non VL 15 packet, wait for either state.
+ */
+ lstate = IPATH_LINKARMED | IPATH_LINKACTIVE;
+ break;
+
+ case IPATH_IB_LINKACTIVE:
+ if (dd->ipath_flags & IPATH_LINKACTIVE) {
+ ret = 0;
+ goto bail;
+ }
+ if (!(dd->ipath_flags & IPATH_LINKARMED)) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ ipath_set_ib_lstate(dd, INFINIPATH_IBCC_LINKCMD_ACTIVE, 0);
+ lstate = IPATH_LINKACTIVE;
+ break;
+
+ case IPATH_IB_LINK_LOOPBACK:
+ dev_info(&dd->pcidev->dev, "Enabling IB local loopback\n");
+ dd->ipath_ibcctrl |= INFINIPATH_IBCC_LOOPBACK;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+ dd->ipath_ibcctrl);
+
+ /* turn heartbeat off, as it causes loopback to fail */
+ dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
+ IPATH_IB_HRTBT_OFF);
+ /* don't wait */
+ ret = 0;
+ goto bail;
+
+ case IPATH_IB_LINK_EXTERNAL:
+ dev_info(&dd->pcidev->dev,
+ "Disabling IB local loopback (normal)\n");
+ dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
+ IPATH_IB_HRTBT_ON);
+ dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LOOPBACK;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+ dd->ipath_ibcctrl);
+ /* don't wait */
+ ret = 0;
+ goto bail;
+
+ /*
+ * Heartbeat can be explicitly enabled by the user via
+ * "hrtbt_enable" "file", and if disabled, trying to enable here
+ * will have no effect. Implicit changes (heartbeat off when
+ * loopback on, and vice versa) are included to ease testing.
+ */
+ case IPATH_IB_LINK_HRTBT:
+ ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
+ IPATH_IB_HRTBT_ON);
+ goto bail;
+
+ case IPATH_IB_LINK_NO_HRTBT:
+ ret = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT,
+ IPATH_IB_HRTBT_OFF);
+ goto bail;
+
+ default:
+ ipath_dbg("Invalid linkstate 0x%x requested\n", newstate);
+ ret = -EINVAL;
+ goto bail;
+ }
+ ret = ipath_wait_linkstate(dd, lstate, 2000);
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_set_mtu - set the MTU
+ * @dd: the infinipath device
+ * @arg: the new MTU
+ *
+ * We can handle "any" incoming size; the issue here is whether we
+ * need to restrict our outgoing size. For now, we don't do any
+ * sanity checking on this, and we don't deal with what happens to
+ * programs that are already running when the size changes.
+ * NOTE: changing the MTU will usually cause the IBC to go back to
+ * link INIT state...
+ */
+int ipath_set_mtu(struct ipath_devdata *dd, u16 arg)
+{
+ u32 piosize;
+ int changed = 0;
+ int ret;
+
+ /*
+ * mtu is IB data payload max. It's the largest power of 2 less
+ * than piosize (or even larger, since it only really controls the
+ * largest we can receive; we can send the max of the mtu and
+ * piosize). We check that it's one of the valid IB sizes.
+ */
+ if (arg != 256 && arg != 512 && arg != 1024 && arg != 2048 &&
+ (arg != 4096 || !ipath_mtu4096)) {
+ ipath_dbg("Trying to set invalid mtu %u, failing\n", arg);
+ ret = -EINVAL;
+ goto bail;
+ }
+ if (dd->ipath_ibmtu == arg) {
+ ret = 0; /* same as current */
+ goto bail;
+ }
+
+ piosize = dd->ipath_ibmaxlen;
+ dd->ipath_ibmtu = arg;
+
+ if (arg >= (piosize - IPATH_PIO_MAXIBHDR)) {
+ /* Only if it's not the initial value (or reset to it) */
+ if (piosize != dd->ipath_init_ibmaxlen) {
+ if (arg > piosize && arg <= dd->ipath_init_ibmaxlen)
+ piosize = dd->ipath_init_ibmaxlen;
+ dd->ipath_ibmaxlen = piosize;
+ changed = 1;
+ }
+ } else if ((arg + IPATH_PIO_MAXIBHDR) != dd->ipath_ibmaxlen) {
+ piosize = arg + IPATH_PIO_MAXIBHDR;
+ ipath_cdbg(VERBOSE, "ibmaxlen was 0x%x, setting to 0x%x "
+ "(mtu 0x%x)\n", dd->ipath_ibmaxlen, piosize,
+ arg);
+ dd->ipath_ibmaxlen = piosize;
+ changed = 1;
+ }
+
+ if (changed) {
+ u64 ibc = dd->ipath_ibcctrl, ibdw;
+ /*
+ * update our housekeeping variables, and set IBC max
+ * size, same as init code; max IBC is max we allow in
+ * buffer, less the qword pbc, plus 1 for ICRC, in dwords
+ */
+ dd->ipath_ibmaxlen = piosize - 2 * sizeof(u32);
+ ibdw = (dd->ipath_ibmaxlen >> 2) + 1;
+ ibc &= ~(INFINIPATH_IBCC_MAXPKTLEN_MASK <<
+ dd->ibcc_mpl_shift);
+ ibc |= ibdw << dd->ibcc_mpl_shift;
+ dd->ipath_ibcctrl = ibc;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+ dd->ipath_ibcctrl);
+ dd->ipath_f_tidtemplate(dd);
+ }
+
+ ret = 0;
+
+bail:
+ return ret;
+}
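+
+ /*
+ * Illustrative sketch, not part of the original driver: how the
+ * MAXPKTLEN value programmed in ipath_set_mtu() above is derived from
+ * a PIO buffer size. The helper name and example size are hypothetical;
+ * the arithmetic mirrors the "changed" branch above. Guarded with
+ * #if 0 so it is never built.
+ */
+ #if 0
+ static u32 example_ibc_maxpktlen_dwords(u32 piosize_bytes)
+ {
+         /* drop the 2-dword (qword) PBC header from the buffer size... */
+         u32 ibmaxlen = piosize_bytes - 2 * sizeof(u32);
+
+         /* ...then express it in dwords and add 1 dword for the ICRC */
+         return (ibmaxlen >> 2) + 1;
+ }
+ /* e.g. a 2048-byte buffer gives (2040 >> 2) + 1 = 511 dwords */
+ #endif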
+
+int ipath_set_lid(struct ipath_devdata *dd, u32 lid, u8 lmc)
+{
+ dd->ipath_lid = lid;
+ dd->ipath_lmc = lmc;
+
+ dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LIDLMC, lid |
+ (~((1U << lmc) - 1)) << 16);
+
+ dev_info(&dd->pcidev->dev, "We got a lid: 0x%x\n", lid);
+
+ return 0;
+}
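+
+ /*
+ * Illustrative sketch, not part of the original driver: the value
+ * handed to IPATH_IB_CFG_LIDLMC above packs the LID into the low 16
+ * bits and a "significant LID bits" mask into the high 16 bits. The
+ * helper name is hypothetical; only the packing expression matches
+ * the call above. Guarded with #if 0 so it is never built.
+ */
+ #if 0
+ static u32 example_pack_lidlmc(u32 lid, u8 lmc)
+ {
+         /*
+         * For lmc == 2, ~((1U << 2) - 1) is ...fffc, so the high half
+         * becomes 0xFFFC: the low lmc bits of the LID are "don't care".
+         */
+         return lid | (~((1U << lmc) - 1)) << 16;
+ }
+ #endif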
+
+
+/**
+ * ipath_write_kreg_port - write a device's per-port 64-bit kernel register
+ * @dd: the infinipath device
+ * @regno: the register number to write
+ * @port: the port containing the register
+ * @value: the value to write
+ *
+ * Registers whose address varies with the port (per-port registers)
+ * use this routine.
+ */
+void ipath_write_kreg_port(const struct ipath_devdata *dd, ipath_kreg regno,
+ unsigned port, u64 value)
+{
+ u16 where;
+
+ if (port < dd->ipath_portcnt &&
+ (regno == dd->ipath_kregs->kr_rcvhdraddr ||
+ regno == dd->ipath_kregs->kr_rcvhdrtailaddr))
+ where = regno + port;
+ else
+ where = -1;
+
+ ipath_write_kreg(dd, where, value);
+}
+
+/*
+ * The following deals with the "obviously simple" task of overriding
+ * the state of the LEDs, which normally indicate link physical and
+ * logical status. The complications arise in dealing with different
+ * hardware mappings, with the board-dependent routine being called
+ * from interrupts, and then there's the requirement to _flash_ them.
+ */
+#define LED_OVER_FREQ_SHIFT 8
+#define LED_OVER_FREQ_MASK (0xFF<<LED_OVER_FREQ_SHIFT)
+/* Below is "non-zero" to force override, but both actual LEDs are off */
+#define LED_OVER_BOTH_OFF (8)
+
+static void ipath_run_led_override(unsigned long opaque)
+{
+ struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
+ int timeoff;
+ int pidx;
+ u64 lstate, ltstate, val;
+
+ if (!(dd->ipath_flags & IPATH_INITTED))
+ return;
+
+ pidx = dd->ipath_led_override_phase++ & 1;
+ dd->ipath_led_override = dd->ipath_led_override_vals[pidx];
+ timeoff = dd->ipath_led_override_timeoff;
+
+ /*
+ * The code below potentially restores the LED values per current
+ * status; it could also set up the traffic-blink register, but that
+ * is left to the per-chip functions.
+ */
+ val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
+ ltstate = ipath_ib_linktrstate(dd, val);
+ lstate = ipath_ib_linkstate(dd, val);
+
+ dd->ipath_f_setextled(dd, lstate, ltstate);
+ mod_timer(&dd->ipath_led_override_timer, jiffies + timeoff);
+}
+
+void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val)
+{
+ int timeoff, freq;
+
+ if (!(dd->ipath_flags & IPATH_INITTED))
+ return;
+
+ /* First check if we are blinking. If not, use 1 Hz polling */
+ timeoff = HZ;
+ freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT;
+
+ if (freq) {
+ /* For blink, set each phase from one nybble of val */
+ dd->ipath_led_override_vals[0] = val & 0xF;
+ dd->ipath_led_override_vals[1] = (val >> 4) & 0xF;
+ timeoff = (HZ << 4)/freq;
+ } else {
+ /* Non-blink: set both phases the same. */
+ dd->ipath_led_override_vals[0] = val & 0xF;
+ dd->ipath_led_override_vals[1] = val & 0xF;
+ }
+ dd->ipath_led_override_timeoff = timeoff;
+
+ /*
+ * If the timer has not already been started, do so. Use a "quick"
+ * timeout so the function will be called soon, to look at our request.
+ */
+ if (atomic_inc_return(&dd->ipath_led_override_timer_active) == 1) {
+ /* Need to start timer */
+ init_timer(&dd->ipath_led_override_timer);
+ dd->ipath_led_override_timer.function =
+ ipath_run_led_override;
+ dd->ipath_led_override_timer.data = (unsigned long) dd;
+ dd->ipath_led_override_timer.expires = jiffies + 1;
+ add_timer(&dd->ipath_led_override_timer);
+ } else
+ atomic_dec(&dd->ipath_led_override_timer_active);
+}
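+
+ /*
+ * Illustrative sketch, not part of the original driver: how a caller
+ * could build the 'val' argument decoded above. The low nybble is the
+ * phase-0 LED pattern, the next nybble phase 1, and bits 8-15 the
+ * blink frequency code. The constants are hypothetical examples.
+ * Guarded with #if 0 so it is never built.
+ */
+ #if 0
+ static unsigned int example_led_override_val(void)
+ {
+         unsigned int phase0 = 0x3;      /* LEDs on in the first phase */
+         unsigned int phase1 = 0x0;      /* LEDs off in the second phase */
+         unsigned int freq = 16;         /* phase lasts (HZ << 4) / 16 = HZ jiffies */
+
+         return (freq << LED_OVER_FREQ_SHIFT) | (phase1 << 4) | phase0;
+ }
+ #endif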
+
+/**
+ * ipath_shutdown_device - shut down a device
+ * @dd: the infinipath device
+ *
+ * This is called to make the device quiet when we are about to
+ * unload the driver, and also when the device is administratively
+ * disabled. It does not free any data structures.
+ * Everything it does has to be set up again by ipath_init_chip(dd, 1).
+ */
+void ipath_shutdown_device(struct ipath_devdata *dd)
+{
+ unsigned long flags;
+
+ ipath_dbg("Shutting down the device\n");
+
+ ipath_hol_up(dd); /* make sure user processes aren't suspended */
+
+ dd->ipath_flags |= IPATH_LINKUNK;
+ dd->ipath_flags &= ~(IPATH_INITTED | IPATH_LINKDOWN |
+ IPATH_LINKINIT | IPATH_LINKARMED |
+ IPATH_LINKACTIVE);
+ *dd->ipath_statusp &= ~(IPATH_STATUS_IB_CONF |
+ IPATH_STATUS_IB_READY);
+
+ /* mask interrupts, but not errors */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
+
+ dd->ipath_rcvctrl = 0;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+ dd->ipath_rcvctrl);
+
+ if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
+ teardown_sdma(dd);
+
+ /*
+ * gracefully stop all sends allowing any in progress to trickle out
+ * first.
+ */
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ dd->ipath_sendctrl = 0;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+ /* flush it */
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+ /*
+ * Wait long enough for anything that's going to trickle out to have
+ * actually done so.
+ */
+ udelay(5);
+
+ dd->ipath_f_setextled(dd, 0, 0); /* make sure LEDs are off */
+
+ ipath_set_ib_lstate(dd, 0, INFINIPATH_IBCC_LINKINITCMD_DISABLE);
+ ipath_cancel_sends(dd, 0);
+
+ /*
+ * We are shutting down, so tell components that care. We don't do
+ * this on just a link state change; much like ethernet, a cable
+ * unplug, etc. doesn't change driver state.
+ */
+ signal_ib_event(dd, IB_EVENT_PORT_ERR);
+
+ /* disable IBC */
+ dd->ipath_control &= ~INFINIPATH_C_LINKENABLE;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+ dd->ipath_control | INFINIPATH_C_FREEZEMODE);
+
+ /*
+ * Clear SerdesEnable and turn the LEDs off; do this here because
+ * we are unloading and can't count on interrupts to move things
+ * along. Turn the LEDs off explicitly for the same reason.
+ */
+ dd->ipath_f_quiet_serdes(dd);
+
+ /* stop all the timers that might still be running */
+ del_timer_sync(&dd->ipath_hol_timer);
+ if (dd->ipath_stats_timer_active) {
+ del_timer_sync(&dd->ipath_stats_timer);
+ dd->ipath_stats_timer_active = 0;
+ }
+ if (dd->ipath_intrchk_timer.data) {
+ del_timer_sync(&dd->ipath_intrchk_timer);
+ dd->ipath_intrchk_timer.data = 0;
+ }
+ if (atomic_read(&dd->ipath_led_override_timer_active)) {
+ del_timer_sync(&dd->ipath_led_override_timer);
+ atomic_set(&dd->ipath_led_override_timer_active, 0);
+ }
+
+ /*
+ * clear all interrupts and errors, so that the next time the driver
+ * is loaded or device is enabled, we know that whatever is set
+ * happened while we were unloaded
+ */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+ ~0ULL & ~INFINIPATH_HWE_MEMBISTFAILED);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL);
+
+ ipath_cdbg(VERBOSE, "Flush time and errors to EEPROM\n");
+ ipath_update_eeprom_log(dd);
+}
+
+/**
+ * ipath_free_pddata - free a port's allocated data
+ * @dd: the infinipath device
+ * @pd: the portdata structure
+ *
+ * Free up any allocated data for a port.
+ * This should not touch anything that would affect a simultaneous
+ * re-allocation of port data, because it is called after ipath_mutex
+ * is released (and can be called from reinit as well).
+ * It should never change any chip state, or global driver state.
+ * (The only exception to global state is freeing the port0 port0_skbs.)
+ */
+void ipath_free_pddata(struct ipath_devdata *dd, struct ipath_portdata *pd)
+{
+ if (!pd)
+ return;
+
+ if (pd->port_rcvhdrq) {
+ ipath_cdbg(VERBOSE, "free closed port %d rcvhdrq @ %p "
+ "(size=%lu)\n", pd->port_port, pd->port_rcvhdrq,
+ (unsigned long) pd->port_rcvhdrq_size);
+ dma_free_coherent(&dd->pcidev->dev, pd->port_rcvhdrq_size,
+ pd->port_rcvhdrq, pd->port_rcvhdrq_phys);
+ pd->port_rcvhdrq = NULL;
+ if (pd->port_rcvhdrtail_kvaddr) {
+ dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+ pd->port_rcvhdrtail_kvaddr,
+ pd->port_rcvhdrqtailaddr_phys);
+ pd->port_rcvhdrtail_kvaddr = NULL;
+ }
+ }
+ if (pd->port_port && pd->port_rcvegrbuf) {
+ unsigned e;
+
+ for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) {
+ void *base = pd->port_rcvegrbuf[e];
+ size_t size = pd->port_rcvegrbuf_size;
+
+ ipath_cdbg(VERBOSE, "egrbuf free(%p, %lu), "
+ "chunk %u/%u\n", base,
+ (unsigned long) size,
+ e, pd->port_rcvegrbuf_chunks);
+ dma_free_coherent(&dd->pcidev->dev, size,
+ base, pd->port_rcvegrbuf_phys[e]);
+ }
+ kfree(pd->port_rcvegrbuf);
+ pd->port_rcvegrbuf = NULL;
+ kfree(pd->port_rcvegrbuf_phys);
+ pd->port_rcvegrbuf_phys = NULL;
+ pd->port_rcvegrbuf_chunks = 0;
+ } else if (pd->port_port == 0 && dd->ipath_port0_skbinfo) {
+ unsigned e;
+ struct ipath_skbinfo *skbinfo = dd->ipath_port0_skbinfo;
+
+ dd->ipath_port0_skbinfo = NULL;
+ ipath_cdbg(VERBOSE, "free closed port %d "
+ "ipath_port0_skbinfo @ %p\n", pd->port_port,
+ skbinfo);
+ for (e = 0; e < dd->ipath_p0_rcvegrcnt; e++)
+ if (skbinfo[e].skb) {
+ pci_unmap_single(dd->pcidev, skbinfo[e].phys,
+ dd->ipath_ibmaxlen,
+ PCI_DMA_FROMDEVICE);
+ dev_kfree_skb(skbinfo[e].skb);
+ }
+ vfree(skbinfo);
+ }
+ kfree(pd->port_tid_pg_list);
+ vfree(pd->subport_uregbase);
+ vfree(pd->subport_rcvegrbuf);
+ vfree(pd->subport_rcvhdr_base);
+ kfree(pd);
+}
+
+static int __init infinipath_init(void)
+{
+ int ret;
+
+ if (ipath_debug & __IPATH_DBG)
+ printk(KERN_INFO DRIVER_LOAD_MSG "%s", ib_ipath_version);
+
+ /*
+ * These must be called before the driver is registered with
+ * the PCI subsystem.
+ */
+ idr_init(&unit_table);
+
+ ret = pci_register_driver(&ipath_driver);
+ if (ret < 0) {
+ printk(KERN_ERR IPATH_DRV_NAME
+ ": Unable to register driver: error %d\n", -ret);
+ goto bail_unit;
+ }
+
+ ret = ipath_init_ipathfs();
+ if (ret < 0) {
+ printk(KERN_ERR IPATH_DRV_NAME ": Unable to create "
+ "ipathfs: error %d\n", -ret);
+ goto bail_pci;
+ }
+
+ goto bail;
+
+bail_pci:
+ pci_unregister_driver(&ipath_driver);
+
+bail_unit:
+ idr_destroy(&unit_table);
+
+bail:
+ return ret;
+}
+
+static void __exit infinipath_cleanup(void)
+{
+ ipath_exit_ipathfs();
+
+ ipath_cdbg(VERBOSE, "Unregistering pci driver\n");
+ pci_unregister_driver(&ipath_driver);
+
+ idr_destroy(&unit_table);
+}
+
+/**
+ * ipath_reset_device - reset the chip if possible
+ * @unit: the device to reset
+ *
+ * Whether or not reset is successful, we attempt to re-initialize the chip
+ * (that is, much like a driver unload/reload). We clear the INITTED flag
+ * so that the various entry points will fail until we reinitialize. For
+ * now, we only allow this if no user ports are open that use chip resources
+ */
+int ipath_reset_device(int unit)
+{
+ int ret, i;
+ struct ipath_devdata *dd = ipath_lookup(unit);
+ unsigned long flags;
+
+ if (!dd) {
+ ret = -ENODEV;
+ goto bail;
+ }
+
+ if (atomic_read(&dd->ipath_led_override_timer_active)) {
+ /* Need to stop LED timer, _then_ shut off LEDs */
+ del_timer_sync(&dd->ipath_led_override_timer);
+ atomic_set(&dd->ipath_led_override_timer_active, 0);
+ }
+
+ /* Shut off LEDs after we are sure timer is not running */
+ dd->ipath_led_override = LED_OVER_BOTH_OFF;
+ dd->ipath_f_setextled(dd, 0, 0);
+
+ dev_info(&dd->pcidev->dev, "Reset on unit %u requested\n", unit);
+
+ if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT)) {
+ dev_info(&dd->pcidev->dev, "Invalid unit number %u or "
+ "not initialized or not present\n", unit);
+ ret = -ENXIO;
+ goto bail;
+ }
+
+ spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
+ if (dd->ipath_pd)
+ for (i = 1; i < dd->ipath_cfgports; i++) {
+ if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt)
+ continue;
+ spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
+ ipath_dbg("unit %u port %d is in use "
+ "(PID %u cmd %s), can't reset\n",
+ unit, i,
+ pid_nr(dd->ipath_pd[i]->port_pid),
+ dd->ipath_pd[i]->port_comm);
+ ret = -EBUSY;
+ goto bail;
+ }
+ spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
+
+ if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
+ teardown_sdma(dd);
+
+ dd->ipath_flags &= ~IPATH_INITTED;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
+ ret = dd->ipath_f_reset(dd);
+ if (ret == 1) {
+ ipath_dbg("Reinitializing unit %u after reset attempt\n",
+ unit);
+ ret = ipath_init_chip(dd, 1);
+ } else
+ ret = -EAGAIN;
+ if (ret)
+ ipath_dev_err(dd, "Reinitialize unit %u after "
+ "reset failed with %d\n", unit, ret);
+ else
+ dev_info(&dd->pcidev->dev, "Reinitialized unit %u after "
+ "resetting\n", unit);
+
+bail:
+ return ret;
+}
+
+/*
+ * Send a signal to all the processes that have the driver open
+ * through the normal interfaces (i.e., everything other than the
+ * diags interface). Returns the number of signalled processes.
+ */
+static int ipath_signal_procs(struct ipath_devdata *dd, int sig)
+{
+ int i, sub, any = 0;
+ struct pid *pid;
+ unsigned long flags;
+
+ if (!dd->ipath_pd)
+ return 0;
+
+ spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
+ for (i = 1; i < dd->ipath_cfgports; i++) {
+ if (!dd->ipath_pd[i] || !dd->ipath_pd[i]->port_cnt)
+ continue;
+ pid = dd->ipath_pd[i]->port_pid;
+ if (!pid)
+ continue;
+
+ dev_info(&dd->pcidev->dev, "context %d in use "
+ "(PID %u), sending signal %d\n",
+ i, pid_nr(pid), sig);
+ kill_pid(pid, sig, 1);
+ any++;
+ for (sub = 0; sub < INFINIPATH_MAX_SUBPORT; sub++) {
+ pid = dd->ipath_pd[i]->port_subpid[sub];
+ if (!pid)
+ continue;
+ dev_info(&dd->pcidev->dev, "sub-context "
+ "%d:%d in use (PID %u), sending "
+ "signal %d\n", i, sub, pid_nr(pid), sig);
+ kill_pid(pid, sig, 1);
+ any++;
+ }
+ }
+ spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
+ return any;
+}
+
+static void ipath_hol_signal_down(struct ipath_devdata *dd)
+{
+ if (ipath_signal_procs(dd, SIGSTOP))
+ ipath_dbg("Stopped some processes\n");
+ ipath_cancel_sends(dd, 1);
+}
+
+
+static void ipath_hol_signal_up(struct ipath_devdata *dd)
+{
+ if (ipath_signal_procs(dd, SIGCONT))
+ ipath_dbg("Continued some processes\n");
+}
+
+/*
+ * The link is down: stop any user processes and flush pending sends
+ * to prevent HoL blocking, then start the HoL timer, which
+ * periodically continues and then stops the processes, so they can
+ * detect link down if they want, and do something about it.
+ * The timer may already be running, so use mod_timer, not add_timer.
+ */
+void ipath_hol_down(struct ipath_devdata *dd)
+{
+ dd->ipath_hol_state = IPATH_HOL_DOWN;
+ ipath_hol_signal_down(dd);
+ dd->ipath_hol_next = IPATH_HOL_DOWNCONT;
+ dd->ipath_hol_timer.expires = jiffies +
+ msecs_to_jiffies(ipath_hol_timeout_ms);
+ mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires);
+}
+
+/*
+ * The link is up: continue any user processes, and ensure the timer
+ * is a nop if running. Let the timer keep running, if set; it will
+ * nop when it sees the link is up.
+ */
+void ipath_hol_up(struct ipath_devdata *dd)
+{
+ ipath_hol_signal_up(dd);
+ dd->ipath_hol_state = IPATH_HOL_UP;
+}
+
+/*
+ * Toggle the running/not running state of user processes
+ * to prevent HoL blocking on chip resources, but still allow
+ * user processes to do link down special case handling.
+ * Should only be called via the timer
+ */
+void ipath_hol_event(unsigned long opaque)
+{
+ struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
+
+ if (dd->ipath_hol_next == IPATH_HOL_DOWNSTOP
+ && dd->ipath_hol_state != IPATH_HOL_UP) {
+ dd->ipath_hol_next = IPATH_HOL_DOWNCONT;
+ ipath_dbg("Stopping processes\n");
+ ipath_hol_signal_down(dd);
+ } else { /* may do "extra" if also in ipath_hol_up() */
+ dd->ipath_hol_next = IPATH_HOL_DOWNSTOP;
+ ipath_dbg("Continuing processes\n");
+ ipath_hol_signal_up(dd);
+ }
+ if (dd->ipath_hol_state == IPATH_HOL_UP)
+ ipath_dbg("link's up, don't resched timer\n");
+ else {
+ dd->ipath_hol_timer.expires = jiffies +
+ msecs_to_jiffies(ipath_hol_timeout_ms);
+ mod_timer(&dd->ipath_hol_timer,
+ dd->ipath_hol_timer.expires);
+ }
+}
+
+int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv)
+{
+ u64 val;
+
+ if (new_pol_inv > INFINIPATH_XGXS_RX_POL_MASK)
+ return -1;
+ if (dd->ipath_rx_pol_inv != new_pol_inv) {
+ dd->ipath_rx_pol_inv = new_pol_inv;
+ val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
+ val &= ~(INFINIPATH_XGXS_RX_POL_MASK <<
+ INFINIPATH_XGXS_RX_POL_SHIFT);
+ val |= ((u64)dd->ipath_rx_pol_inv) <<
+ INFINIPATH_XGXS_RX_POL_SHIFT;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
+ }
+ return 0;
+}
+
+/*
+ * Disable and enable the armlaunch error. Used for PIO bandwidth testing on
+ * the 7220, which is count-based, rather than trigger-based. Safe for the
+ * driver check, since it's at init. Not completely safe when used for
+ * user-mode checking, since some error checking can be lost, but not
+ * particularly risky, and only has problematic side-effects in the face of
+ * very buggy user code. There is no reference counting, but that's also
+ * fine, given the intended use.
+ */
+void ipath_enable_armlaunch(struct ipath_devdata *dd)
+{
+ dd->ipath_lasterror &= ~INFINIPATH_E_SPIOARMLAUNCH;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
+ INFINIPATH_E_SPIOARMLAUNCH);
+ dd->ipath_errormask |= INFINIPATH_E_SPIOARMLAUNCH;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+ dd->ipath_errormask);
+}
+
+void ipath_disable_armlaunch(struct ipath_devdata *dd)
+{
+ /* so don't re-enable if already set */
+ dd->ipath_maskederrs &= ~INFINIPATH_E_SPIOARMLAUNCH;
+ dd->ipath_errormask &= ~INFINIPATH_E_SPIOARMLAUNCH;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+ dd->ipath_errormask);
+}
+
+module_init(infinipath_init);
+module_exit(infinipath_cleanup);
diff --git a/drivers/infiniband/hw/ipath/ipath_eeprom.c b/drivers/infiniband/hw/ipath/ipath_eeprom.c
new file mode 100644
index 000000000..fc7181985
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_eeprom.c
@@ -0,0 +1,1183 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_kernel.h"
+
+/*
+ * InfiniPath I2C driver for a serial eeprom. This is not a generic
+ * I2C interface. For a start, the device we're using (Atmel AT24C11)
+ * doesn't work like a regular I2C device. It looks like one
+ * electrically, but not logically. Normal I2C devices have a single
+ * 7-bit or 10-bit I2C address that they respond to. Valid 7-bit
+ * addresses range from 0x03 to 0x77. Addresses 0x00 to 0x02 and 0x78
+ * to 0x7F are special reserved addresses (e.g. 0x00 is the "general
+ * call" address.) The Atmel device, on the other hand, responds to ALL
+ * 7-bit addresses. It's designed to be the only device on a given I2C
+ * bus. A 7-bit address corresponds to the memory address within the
+ * Atmel device itself.
+ *
+ * Also, the timing requirements mean more than simple software
+ * bitbanging, with readbacks from chip to ensure timing (simple udelay
+ * is not enough).
+ *
+ * This all means that accessing the device is specialized enough
+ * that using the standard kernel I2C bitbanging interface would be
+ * impossible. For example, the core I2C eeprom driver expects to find
+ * a device at one or more of a limited set of addresses only. It doesn't
+ * allow writing to an eeprom. It also doesn't provide any means of
+ * accessing eeprom contents from within the kernel, only via sysfs.
+ */
+
+/* Added functionality for IBA7220-based cards */
+#define IPATH_EEPROM_DEV_V1 0xA0
+#define IPATH_EEPROM_DEV_V2 0xA2
+#define IPATH_TEMP_DEV 0x98
+#define IPATH_BAD_DEV (IPATH_EEPROM_DEV_V2+2)
+#define IPATH_NO_DEV (0xFF)
+
+/*
+ * The number of I2C chains is proliferating. Table below brings
+ * some order to the madness. The basic principle is that the
+ * table is scanned from the top, and a "probe" is made to the
+ * device probe_dev. If that succeeds, the chain is considered
+ * to be of that type, and dd->i2c_chain_type is set to the index+1
+ * of the entry.
+ * The +1 is so static initialization can mean "unknown, do probe."
+ */
+static struct i2c_chain_desc {
+ u8 probe_dev; /* If seen at probe, chain is this type */
+ u8 eeprom_dev; /* Dev addr (if any) for EEPROM */
+ u8 temp_dev; /* Dev Addr (if any) for Temp-sense */
+} i2c_chains[] = {
+ { IPATH_BAD_DEV, IPATH_NO_DEV, IPATH_NO_DEV }, /* pre-iba7220 bds */
+ { IPATH_EEPROM_DEV_V1, IPATH_EEPROM_DEV_V1, IPATH_TEMP_DEV}, /* V1 */
+ { IPATH_EEPROM_DEV_V2, IPATH_EEPROM_DEV_V2, IPATH_TEMP_DEV}, /* V2 */
+ { IPATH_NO_DEV }
+};
+
+enum i2c_type {
+ i2c_line_scl = 0,
+ i2c_line_sda
+};
+
+enum i2c_state {
+ i2c_line_low = 0,
+ i2c_line_high
+};
+
+#define READ_CMD 1
+#define WRITE_CMD 0
+
+/**
+ * i2c_gpio_set - set a GPIO line
+ * @dd: the infinipath device
+ * @line: the line to set
+ * @new_line_state: the state to set
+ *
+ * Returns 0 if the line was set to the new state successfully, non-zero
+ * on error.
+ */
+static int i2c_gpio_set(struct ipath_devdata *dd,
+ enum i2c_type line,
+ enum i2c_state new_line_state)
+{
+ u64 out_mask, dir_mask, *gpioval;
+ unsigned long flags = 0;
+
+ gpioval = &dd->ipath_gpio_out;
+
+ if (line == i2c_line_scl) {
+ dir_mask = dd->ipath_gpio_scl;
+ out_mask = (1UL << dd->ipath_gpio_scl_num);
+ } else {
+ dir_mask = dd->ipath_gpio_sda;
+ out_mask = (1UL << dd->ipath_gpio_sda_num);
+ }
+
+ spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
+ if (new_line_state == i2c_line_high) {
+ /* tri-state the output rather than force high */
+ dd->ipath_extctrl &= ~dir_mask;
+ } else {
+ /* config line to be an output */
+ dd->ipath_extctrl |= dir_mask;
+ }
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, dd->ipath_extctrl);
+
+ /* set output as well (no real verify) */
+ if (new_line_state == i2c_line_high)
+ *gpioval |= out_mask;
+ else
+ *gpioval &= ~out_mask;
+
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_out, *gpioval);
+ spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
+
+ return 0;
+}
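+
+ /*
+ * Illustrative sketch, not part of the original driver: the open-drain
+ * emulation used above. The line is never actively driven high;
+ * "high" means tri-stating the pin and letting the bus pull-up raise
+ * it, as I2C requires. The names below are hypothetical shadows, not
+ * real registers. Guarded with #if 0 so it is never built.
+ */
+ #if 0
+ static void example_open_drain_set(u64 *output_enable, u64 pin_mask, int high)
+ {
+         if (high)
+                 *output_enable &= ~pin_mask;    /* input: tri-state, pull-up wins */
+         else
+                 *output_enable |= pin_mask;     /* output: actively drive low */
+ }
+ #endif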
+
+/**
+ * i2c_gpio_get - get a GPIO line state
+ * @dd: the infinipath device
+ * @line: the line to get
+ * @curr_statep: where to put the line state
+ *
+ * Returns 0 if the line was set to the new state successfully, non-zero
+ * on error. curr_state is not set on error.
+ */
+static int i2c_gpio_get(struct ipath_devdata *dd,
+ enum i2c_type line,
+ enum i2c_state *curr_statep)
+{
+ u64 read_val, mask;
+ int ret;
+ unsigned long flags = 0;
+
+ /* check args */
+ if (curr_statep == NULL) {
+ ret = 1;
+ goto bail;
+ }
+
+ /* config line to be an input */
+ if (line == i2c_line_scl)
+ mask = dd->ipath_gpio_scl;
+ else
+ mask = dd->ipath_gpio_sda;
+
+ spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
+ dd->ipath_extctrl &= ~mask;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, dd->ipath_extctrl);
+ /*
+ * Below is very unlikely to reflect true input state if Output
+ * Enable actually changed.
+ */
+ read_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus);
+ spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
+
+ if (read_val & mask)
+ *curr_statep = i2c_line_high;
+ else
+ *curr_statep = i2c_line_low;
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * i2c_wait_for_writes - wait for a write
+ * @dd: the infinipath device
+ *
+ * We use this instead of udelay directly, so we can make sure
+ * that previous register writes have been flushed all the way
+ * to the chip. Since we are delaying anyway, the cost doesn't
+ * hurt, and makes the bit twiddling more regular
+ */
+static void i2c_wait_for_writes(struct ipath_devdata *dd)
+{
+ (void)ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch);
+ rmb();
+}
+
+static void scl_out(struct ipath_devdata *dd, u8 bit)
+{
+ udelay(1);
+ i2c_gpio_set(dd, i2c_line_scl, bit ? i2c_line_high : i2c_line_low);
+
+ i2c_wait_for_writes(dd);
+}
+
+static void sda_out(struct ipath_devdata *dd, u8 bit)
+{
+ i2c_gpio_set(dd, i2c_line_sda, bit ? i2c_line_high : i2c_line_low);
+
+ i2c_wait_for_writes(dd);
+}
+
+static u8 sda_in(struct ipath_devdata *dd, int wait)
+{
+ enum i2c_state bit;
+
+ if (i2c_gpio_get(dd, i2c_line_sda, &bit))
+ ipath_dbg("get bit failed!\n");
+
+ if (wait)
+ i2c_wait_for_writes(dd);
+
+ return bit == i2c_line_high ? 1U : 0;
+}
+
+/**
+ * i2c_ackrcv - see if ack following write is true
+ * @dd: the infinipath device
+ */
+static int i2c_ackrcv(struct ipath_devdata *dd)
+{
+ u8 ack_received;
+
+ /* AT ENTRY SCL = LOW */
+ /* change direction, ignore data */
+ ack_received = sda_in(dd, 1);
+ scl_out(dd, i2c_line_high);
+ ack_received = sda_in(dd, 1) == 0;
+ scl_out(dd, i2c_line_low);
+ return ack_received;
+}
+
+/**
+ * rd_byte - read a byte, leaving ACK, STOP, etc up to caller
+ * @dd: the infinipath device
+ *
+ * Returns byte shifted out of device
+ */
+static int rd_byte(struct ipath_devdata *dd)
+{
+ int bit_cntr, data;
+
+ data = 0;
+
+ for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) {
+ data <<= 1;
+ scl_out(dd, i2c_line_high);
+ data |= sda_in(dd, 0);
+ scl_out(dd, i2c_line_low);
+ }
+ return data;
+}
+
+/**
+ * wr_byte - write a byte, one bit at a time
+ * @dd: the infinipath device
+ * @data: the byte to write
+ *
+ * Returns 0 if we got the following ack, otherwise 1
+ */
+static int wr_byte(struct ipath_devdata *dd, u8 data)
+{
+ int bit_cntr;
+ u8 bit;
+
+ for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) {
+ bit = (data >> bit_cntr) & 1;
+ sda_out(dd, bit);
+ scl_out(dd, i2c_line_high);
+ scl_out(dd, i2c_line_low);
+ }
+ return (!i2c_ackrcv(dd)) ? 1 : 0;
+}
+
+static void send_ack(struct ipath_devdata *dd)
+{
+ sda_out(dd, i2c_line_low);
+ scl_out(dd, i2c_line_high);
+ scl_out(dd, i2c_line_low);
+ sda_out(dd, i2c_line_high);
+}
+
+/**
+ * i2c_startcmd - transmit the start condition, followed by address/cmd
+ * @dd: the infinipath device
+ * @offset_dir: direction byte
+ *
+ * (both clock/data high, clock high, data low while clock is high)
+ */
+static int i2c_startcmd(struct ipath_devdata *dd, u8 offset_dir)
+{
+ int res;
+
+ /* issue start sequence */
+ sda_out(dd, i2c_line_high);
+ scl_out(dd, i2c_line_high);
+ sda_out(dd, i2c_line_low);
+ scl_out(dd, i2c_line_low);
+
+ /* issue length and direction byte */
+ res = wr_byte(dd, offset_dir);
+
+ if (res)
+ ipath_cdbg(VERBOSE, "No ack to complete start\n");
+
+ return res;
+}
+
+/**
+ * stop_cmd - transmit the stop condition
+ * @dd: the infinipath device
+ *
+ * (both clock/data low, clock high, data high while clock is high)
+ */
+static void stop_cmd(struct ipath_devdata *dd)
+{
+ scl_out(dd, i2c_line_low);
+ sda_out(dd, i2c_line_low);
+ scl_out(dd, i2c_line_high);
+ sda_out(dd, i2c_line_high);
+ udelay(2);
+}
+
+/**
+ * eeprom_reset - reset I2C communication
+ * @dd: the infinipath device
+ */
+
+static int eeprom_reset(struct ipath_devdata *dd)
+{
+ int clock_cycles_left = 9;
+ u64 *gpioval = &dd->ipath_gpio_out;
+ int ret;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
+ /* Make sure shadows are consistent */
+ dd->ipath_extctrl = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extctrl);
+ *gpioval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_out);
+ spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
+
+ ipath_cdbg(VERBOSE, "Resetting i2c eeprom; initial gpioout reg "
+ "is %llx\n", (unsigned long long) *gpioval);
+
+ /*
+ * This is to get the i2c into a known state, by first going low,
+ * then tristate sda (and then tristate scl as first thing
+ * in loop)
+ */
+ scl_out(dd, i2c_line_low);
+ sda_out(dd, i2c_line_high);
+
+ /* Clock up to 9 cycles looking for SDA hi, then issue START and STOP */
+ while (clock_cycles_left--) {
+ scl_out(dd, i2c_line_high);
+
+ /* SDA seen high, issue START by dropping it while SCL high */
+ if (sda_in(dd, 0)) {
+ sda_out(dd, i2c_line_low);
+ scl_out(dd, i2c_line_low);
+ /* ATMEL spec says must be followed by STOP. */
+ scl_out(dd, i2c_line_high);
+ sda_out(dd, i2c_line_high);
+ ret = 0;
+ goto bail;
+ }
+
+ scl_out(dd, i2c_line_low);
+ }
+
+ ret = 1;
+
+bail:
+ return ret;
+}
+
+/*
+ * Probe for I2C device at specified address. Returns 0 for "success"
+ * to match rest of this file.
+ * Leave bus in "reasonable" state for further commands.
+ */
+static int i2c_probe(struct ipath_devdata *dd, int devaddr)
+{
+ int ret = 0;
+
+ ret = eeprom_reset(dd);
+ if (ret) {
+ ipath_dev_err(dd, "Failed reset probing device 0x%02X\n",
+ devaddr);
+ return ret;
+ }
+ /*
+ * Reset no longer leaves bus in start condition, so normal
+ * i2c_startcmd() will do.
+ */
+ ret = i2c_startcmd(dd, devaddr | READ_CMD);
+ if (ret)
+ ipath_cdbg(VERBOSE, "Failed startcmd for device 0x%02X\n",
+ devaddr);
+ else {
+ /*
+ * Device did respond. Complete a single-byte read, because some
+ * devices apparently cannot handle STOP immediately after they
+ * ACK the start-cmd.
+ */
+ int data;
+ data = rd_byte(dd);
+ stop_cmd(dd);
+ ipath_cdbg(VERBOSE, "Response from device 0x%02X\n", devaddr);
+ }
+ return ret;
+}
+
+/*
+ * Returns the "i2c type". This is a pointer to a struct that describes
+ * the I2C chain on this board. To minimize impact on struct ipath_devdata,
+ * the (small integer) index into the table is actually memoized, rather
+ * than the pointer.
+ * Memoization is because the type is determined on the first call per chip.
+ * An alternative would be to move type determination to early
+ * init code.
+ */
+static struct i2c_chain_desc *ipath_i2c_type(struct ipath_devdata *dd)
+{
+ int idx;
+
+ /* Get memoized index, from previous successful probes */
+ idx = dd->ipath_i2c_chain_type - 1;
+ if (idx >= 0 && idx < (ARRAY_SIZE(i2c_chains) - 1))
+ goto done;
+
+ idx = 0;
+ while (i2c_chains[idx].probe_dev != IPATH_NO_DEV) {
+ /* if probe succeeds, this is type */
+ if (!i2c_probe(dd, i2c_chains[idx].probe_dev))
+ break;
+ ++idx;
+ }
+
+ /*
+ * Old EEPROM (first entry) may require a reset after probe,
+ * rather than being able to "start" after "stop"
+ */
+ if (idx == 0)
+ eeprom_reset(dd);
+
+ if (i2c_chains[idx].probe_dev == IPATH_NO_DEV)
+ idx = -1;
+ else
+ dd->ipath_i2c_chain_type = idx + 1;
+done:
+ return (idx >= 0) ? i2c_chains + idx : NULL;
+}
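+
+ /*
+ * Illustrative sketch, not part of the original driver: the "+1"
+ * memoization used above, where a zero-initialized field means "not
+ * probed yet" and any non-zero value is the table index plus one.
+ * Names are hypothetical. Guarded with #if 0 so it is never built.
+ */
+ #if 0
+ static int example_memoized_index(int *stored_plus_one, int (*probe)(void))
+ {
+         if (!*stored_plus_one)                  /* 0 means "unknown" */
+                 *stored_plus_one = probe() + 1; /* remember for next time */
+         return *stored_plus_one - 1;            /* actual table index */
+ }
+ #endif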
+
+static int ipath_eeprom_internal_read(struct ipath_devdata *dd,
+ u8 eeprom_offset, void *buffer, int len)
+{
+ int ret;
+ struct i2c_chain_desc *icd;
+ u8 *bp = buffer;
+
+ ret = 1;
+ icd = ipath_i2c_type(dd);
+ if (!icd)
+ goto bail;
+
+ if (icd->eeprom_dev == IPATH_NO_DEV) {
+ /* legacy not-really-I2C */
+ ipath_cdbg(VERBOSE, "Start command only address\n");
+ eeprom_offset = (eeprom_offset << 1) | READ_CMD;
+ ret = i2c_startcmd(dd, eeprom_offset);
+ } else {
+ /* Actual I2C */
+ ipath_cdbg(VERBOSE, "Start command uses devaddr\n");
+ if (i2c_startcmd(dd, icd->eeprom_dev | WRITE_CMD)) {
+ ipath_dbg("Failed EEPROM startcmd\n");
+ stop_cmd(dd);
+ ret = 1;
+ goto bail;
+ }
+ ret = wr_byte(dd, eeprom_offset);
+ stop_cmd(dd);
+ if (ret) {
+ ipath_dev_err(dd, "Failed to write EEPROM address\n");
+ ret = 1;
+ goto bail;
+ }
+ ret = i2c_startcmd(dd, icd->eeprom_dev | READ_CMD);
+ }
+ if (ret) {
+ ipath_dbg("Failed startcmd for dev %02X\n", icd->eeprom_dev);
+ stop_cmd(dd);
+ ret = 1;
+ goto bail;
+ }
+
+ /*
+ * eeprom keeps clocking data out as long as we ack, automatically
+ * incrementing the address.
+ */
+ while (len-- > 0) {
+ /* get and store data */
+ *bp++ = rd_byte(dd);
+ /* send ack if not the last byte */
+ if (len)
+ send_ack(dd);
+ }
+
+ stop_cmd(dd);
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+static int ipath_eeprom_internal_write(struct ipath_devdata *dd, u8 eeprom_offset,
+ const void *buffer, int len)
+{
+ int sub_len;
+ const u8 *bp = buffer;
+ int max_wait_time, i;
+ int ret;
+ struct i2c_chain_desc *icd;
+
+ ret = 1;
+ icd = ipath_i2c_type(dd);
+ if (!icd)
+ goto bail;
+
+ while (len > 0) {
+ if (icd->eeprom_dev == IPATH_NO_DEV) {
+ if (i2c_startcmd(dd,
+ (eeprom_offset << 1) | WRITE_CMD)) {
+ ipath_dbg("Failed to start cmd offset %u\n",
+ eeprom_offset);
+ goto failed_write;
+ }
+ } else {
+ /* Real I2C */
+ if (i2c_startcmd(dd, icd->eeprom_dev | WRITE_CMD)) {
+ ipath_dbg("Failed EEPROM startcmd\n");
+ goto failed_write;
+ }
+ ret = wr_byte(dd, eeprom_offset);
+ if (ret) {
+ ipath_dev_err(dd, "Failed to write EEPROM "
+ "address\n");
+ goto failed_write;
+ }
+ }
+
+ sub_len = min(len, 4);
+ eeprom_offset += sub_len;
+ len -= sub_len;
+
+ for (i = 0; i < sub_len; i++) {
+ if (wr_byte(dd, *bp++)) {
+ ipath_dbg("no ack after byte %u/%u (%u "
+ "total remain)\n", i, sub_len,
+ len + sub_len - i);
+ goto failed_write;
+ }
+ }
+
+ stop_cmd(dd);
+
+ /*
+ * Wait for the write to complete by waiting for a successful
+ * read (the chip replies with a zero after the write cmd
+ * completes, and before it writes to the eeprom).
+ * The startcmd for the read will fail the ack until
+ * the writes have completed. We do this inline to avoid
+ * the debug prints that are in the real read routine
+ * if the startcmd fails.
+ * We also use the proper device address, so it doesn't matter
+ * whether we have a real eeprom_dev; legacy accepts any address.
+ */
+ max_wait_time = 100;
+ while (i2c_startcmd(dd, icd->eeprom_dev | READ_CMD)) {
+ stop_cmd(dd);
+ if (!--max_wait_time) {
+ ipath_dbg("Did not get successful read to "
+ "complete write\n");
+ goto failed_write;
+ }
+ }
+ /* now read (and ignore) the resulting byte */
+ rd_byte(dd);
+ stop_cmd(dd);
+ }
+
+ ret = 0;
+ goto bail;
+
+failed_write:
+ stop_cmd(dd);
+ ret = 1;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_eeprom_read - receives bytes from the eeprom via I2C
+ * @dd: the infinipath device
+ * @eeprom_offset: address to read from
+ * @buffer: where to store result
+ * @len: number of bytes to receive
+ */
+int ipath_eeprom_read(struct ipath_devdata *dd, u8 eeprom_offset,
+ void *buff, int len)
+{
+ int ret;
+
+ ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
+ if (!ret) {
+ ret = ipath_eeprom_internal_read(dd, eeprom_offset, buff, len);
+ mutex_unlock(&dd->ipath_eep_lock);
+ }
+
+ return ret;
+}
+
+/**
+ * ipath_eeprom_write - writes data to the eeprom via I2C
+ * @dd: the infinipath device
+ * @eeprom_offset: where to place data
+ * @buffer: data to write
+ * @len: number of bytes to write
+ */
+int ipath_eeprom_write(struct ipath_devdata *dd, u8 eeprom_offset,
+ const void *buff, int len)
+{
+ int ret;
+
+ ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
+ if (!ret) {
+ ret = ipath_eeprom_internal_write(dd, eeprom_offset, buff, len);
+ mutex_unlock(&dd->ipath_eep_lock);
+ }
+
+ return ret;
+}
+
+static u8 flash_csum(struct ipath_flash *ifp, int adjust)
+{
+ u8 *ip = (u8 *) ifp;
+ u8 csum = 0, len;
+
+ /*
+ * Limit length checksummed to max length of actual data.
+ * Checksum of erased eeprom will still be bad, but we avoid
+ * reading past the end of the buffer we were passed.
+ */
+ len = ifp->if_length;
+ if (len > sizeof(struct ipath_flash))
+ len = sizeof(struct ipath_flash);
+ while (len--)
+ csum += *ip++;
+ csum -= ifp->if_csum;
+ csum = ~csum;
+ if (adjust)
+ ifp->if_csum = csum;
+
+ return csum;
+}
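+
+ /*
+ * Illustrative sketch, not part of the original driver: the checksum
+ * rule implemented above. The stored if_csum is the bitwise complement
+ * of the sum of every other byte in the struct, so a recomputation
+ * that matches if_csum means the image is intact. Names are
+ * hypothetical. Guarded with #if 0 so it is never built.
+ */
+ #if 0
+ static u8 example_flash_csum(const u8 *buf, u8 len, u8 stored_csum)
+ {
+         u8 sum = 0;
+
+         while (len--)
+                 sum += *buf++;  /* the sum includes the csum byte itself */
+         sum -= stored_csum;     /* ...so remove it again */
+         return (u8)~sum;        /* complement; compare with stored_csum */
+ }
+ #endif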
+
+/**
+ * ipath_get_eeprom_info - get the GUID and other info from the i2c device
+ * @dd: the infinipath device
+ *
+ * We have the capability to use the ipath_nguid field, and get
+ * the guid from the first chip's flash, to use for all of them.
+ */
+void ipath_get_eeprom_info(struct ipath_devdata *dd)
+{
+ void *buf;
+ struct ipath_flash *ifp;
+ __be64 guid;
+ int len, eep_stat;
+ u8 csum, *bguid;
+ int t = dd->ipath_unit;
+ struct ipath_devdata *dd0 = ipath_lookup(0);
+
+ if (t && dd0->ipath_nguid > 1 && t <= dd0->ipath_nguid) {
+ u8 oguid;
+ dd->ipath_guid = dd0->ipath_guid;
+ bguid = (u8 *) & dd->ipath_guid;
+
+ oguid = bguid[7];
+ bguid[7] += t;
+ if (oguid > bguid[7]) {
+ if (bguid[6] == 0xff) {
+ if (bguid[5] == 0xff) {
+ ipath_dev_err(
+ dd,
+ "Can't set %s GUID from "
+ "base, wraps to OUI!\n",
+ ipath_get_unit_name(t));
+ dd->ipath_guid = 0;
+ goto bail;
+ }
+ bguid[5]++;
+ }
+ bguid[6]++;
+ }
+ dd->ipath_nguid = 1;
+
+ ipath_dbg("nguid %u, so adding %u to device 0 guid, "
+ "for %llx\n",
+ dd0->ipath_nguid, t,
+ (unsigned long long) be64_to_cpu(dd->ipath_guid));
+ goto bail;
+ }
+
+ /*
+ * Read the full flash, not just the currently used part, since it may
+ * have been written with a newer definition.
+ */
+ len = sizeof(struct ipath_flash);
+ buf = vmalloc(len);
+ if (!buf) {
+ ipath_dev_err(dd, "Couldn't allocate memory to read %u "
+ "bytes from eeprom for GUID\n", len);
+ goto bail;
+ }
+
+ mutex_lock(&dd->ipath_eep_lock);
+ eep_stat = ipath_eeprom_internal_read(dd, 0, buf, len);
+ mutex_unlock(&dd->ipath_eep_lock);
+
+ if (eep_stat) {
+ ipath_dev_err(dd, "Failed reading GUID from eeprom\n");
+ goto done;
+ }
+ ifp = (struct ipath_flash *)buf;
+
+ csum = flash_csum(ifp, 0);
+ if (csum != ifp->if_csum) {
+ dev_info(&dd->pcidev->dev, "Bad I2C flash checksum: "
+ "0x%x, not 0x%x\n", csum, ifp->if_csum);
+ goto done;
+ }
+ if (*(__be64 *) ifp->if_guid == cpu_to_be64(0) ||
+ *(__be64 *) ifp->if_guid == ~cpu_to_be64(0)) {
+ ipath_dev_err(dd, "Invalid GUID %llx from flash; "
+ "ignoring\n",
+ *(unsigned long long *) ifp->if_guid);
+ /* don't allow GUID if all 0 or all 1's */
+ goto done;
+ }
+
+ /* complain, but allow it */
+ if (*(u64 *) ifp->if_guid == 0x100007511000000ULL)
+ dev_info(&dd->pcidev->dev, "Warning, GUID %llx is "
+ "default, probably not correct!\n",
+ *(unsigned long long *) ifp->if_guid);
+
+ bguid = ifp->if_guid;
+ if (!bguid[0] && !bguid[1] && !bguid[2]) {
+ /* original incorrect GUID format in flash; fix in
+ * core copy, by shifting up 2 octets; don't need to
+ * change top octet, since both it and shifted are
+ * 0.. */
+ bguid[1] = bguid[3];
+ bguid[2] = bguid[4];
+ bguid[3] = bguid[4] = 0;
+ guid = *(__be64 *) ifp->if_guid;
+ ipath_cdbg(VERBOSE, "Old GUID format in flash, top 3 zero, "
+ "shifting 2 octets\n");
+ } else
+ guid = *(__be64 *) ifp->if_guid;
+ dd->ipath_guid = guid;
+ dd->ipath_nguid = ifp->if_numguid;
+ /*
+ * Things are slightly complicated by the desire to transparently
+ * support both the Pathscale 10-digit serial number and the QLogic
+ * 13-character version.
+ */
+ if ((ifp->if_fversion > 1) && ifp->if_sprefix[0]
+ && ((u8 *)ifp->if_sprefix)[0] != 0xFF) {
+ /* This board has a Serial-prefix, which is stored
+ * elsewhere for backward-compatibility.
+ */
+ char *snp = dd->ipath_serial;
+ memcpy(snp, ifp->if_sprefix, sizeof ifp->if_sprefix);
+ snp[sizeof ifp->if_sprefix] = '\0';
+ len = strlen(snp);
+ snp += len;
+ len = (sizeof dd->ipath_serial) - len;
+ if (len > sizeof ifp->if_serial) {
+ len = sizeof ifp->if_serial;
+ }
+ memcpy(snp, ifp->if_serial, len);
+ } else
+ memcpy(dd->ipath_serial, ifp->if_serial,
+ sizeof ifp->if_serial);
+ if (!strstr(ifp->if_comment, "Tested successfully"))
+ ipath_dev_err(dd, "Board SN %s did not pass functional "
+ "test: %s\n", dd->ipath_serial,
+ ifp->if_comment);
+
+ ipath_cdbg(VERBOSE, "Initted GUID to %llx from eeprom\n",
+ (unsigned long long) be64_to_cpu(dd->ipath_guid));
+
+ memcpy(&dd->ipath_eep_st_errs, &ifp->if_errcntp, IPATH_EEP_LOG_CNT);
+ /*
+ * Power-on (actually "active") hours are kept as little-endian value
+ * in EEPROM, but as seconds in a (possibly as small as 24-bit)
+ * atomic_t while running.
+ */
+ atomic_set(&dd->ipath_active_time, 0);
+ dd->ipath_eep_hrs = ifp->if_powerhour[0] | (ifp->if_powerhour[1] << 8);
+
+done:
+ vfree(buf);
+
+bail:;
+}
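+
+ /*
+ * Illustrative sketch, not part of the original driver: the per-unit
+ * GUID derivation used above for units other than 0. The unit number
+ * is added to the lowest octet of the base GUID, carrying into octets
+ * 6 and 5 but never into the 3-octet OUI. Names are hypothetical.
+ * Guarded with #if 0 so it is never built.
+ */
+ #if 0
+ static int example_offset_guid(u8 guid[8], u8 unit)
+ {
+         u8 old = guid[7];
+
+         guid[7] += unit;
+         if (old > guid[7]) {                    /* low octet wrapped */
+                 if (guid[6] == 0xff) {
+                         if (guid[5] == 0xff)
+                                 return -1;      /* would spill into the OUI */
+                         guid[5]++;
+                 }
+                 guid[6]++;
+         }
+         return 0;
+ }
+ #endif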
+
+/**
+ * ipath_update_eeprom_log - copy active-time and error counters to eeprom
+ * @dd: the infinipath device
+ *
+ * Although the time is kept as seconds in the ipath_devdata struct, it is
+ * rounded to hours for re-write, as we have only 16 bits in EEPROM.
+ * First-cut code reads whole (expected) struct ipath_flash, modifies,
+ * re-writes. Future direction: read/write only what we need, assuming
+ * that the EEPROM had to have been "good enough" for driver init, and
+ * if not, we aren't making it worse.
+ *
+ */
+
+int ipath_update_eeprom_log(struct ipath_devdata *dd)
+{
+ void *buf;
+ struct ipath_flash *ifp;
+ int len, hi_water;
+ uint32_t new_time, new_hrs;
+ u8 csum;
+ int ret, idx;
+ unsigned long flags;
+
+ /* first, check if we actually need to do anything. */
+ ret = 0;
+ for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) {
+ if (dd->ipath_eep_st_new_errs[idx]) {
+ ret = 1;
+ break;
+ }
+ }
+ new_time = atomic_read(&dd->ipath_active_time);
+
+ if (ret == 0 && new_time < 3600)
+ return 0;
+
+ /*
+ * The quick check above determined that there is something worth
+ * logging, so get the current contents and take a more detailed look.
+ * Read the full flash, not just the currently used part, since it may
+ * have been written with a newer definition.
+ */
+ len = sizeof(struct ipath_flash);
+ buf = vmalloc(len);
+ ret = 1;
+ if (!buf) {
+ ipath_dev_err(dd, "Couldn't allocate memory to read %u "
+ "bytes from eeprom for logging\n", len);
+ goto bail;
+ }
+
+ /* Grab the mutex and read the current EEPROM. If we get an
+ * error, release it; if not, hold it until we finish the write.
+ */
+ ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
+ if (ret) {
+ ipath_dev_err(dd, "Unable to acquire EEPROM for logging\n");
+ goto free_bail;
+ }
+ ret = ipath_eeprom_internal_read(dd, 0, buf, len);
+ if (ret) {
+ mutex_unlock(&dd->ipath_eep_lock);
+ ipath_dev_err(dd, "Unable read EEPROM for logging\n");
+ goto free_bail;
+ }
+ ifp = (struct ipath_flash *)buf;
+
+ csum = flash_csum(ifp, 0);
+ if (csum != ifp->if_csum) {
+ mutex_unlock(&dd->ipath_eep_lock);
+ ipath_dev_err(dd, "EEPROM cks err (0x%02X, S/B 0x%02X)\n",
+ csum, ifp->if_csum);
+ ret = 1;
+ goto free_bail;
+ }
+ hi_water = 0;
+ spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
+ for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) {
+ int new_val = dd->ipath_eep_st_new_errs[idx];
+ if (new_val) {
+ /*
+ * If we have seen any errors, add to the EEPROM values.
+ * We need to saturate at 0xFF (255), and we would also
+ * need to adjust the checksum if we were trying to
+ * minimize EEPROM traffic.
+ * Note that we add to the actual current count in the
+ * EEPROM, in case it was altered while we were running.
+ */
+ new_val += ifp->if_errcntp[idx];
+ if (new_val > 0xFF)
+ new_val = 0xFF;
+ if (ifp->if_errcntp[idx] != new_val) {
+ ifp->if_errcntp[idx] = new_val;
+ hi_water = offsetof(struct ipath_flash,
+ if_errcntp) + idx;
+ }
+ /*
+ * update our shadow (used to minimize EEPROM
+ * traffic), to match what we are about to write.
+ */
+ dd->ipath_eep_st_errs[idx] = new_val;
+ dd->ipath_eep_st_new_errs[idx] = 0;
+ }
+ }
+ /*
+ * Now update active-time. We would like to round to the nearest hour,
+ * but unless atomic_t is guaranteed to be a proper signed int we
+ * cannot, because we need to account for what we "transfer" to
+ * EEPROM: if we logged an hour at 31 minutes, we would need to set
+ * active_time to -29 minutes to accurately count the _next_ hour.
+ */
+ if (new_time >= 3600) {
+ new_hrs = new_time / 3600;
+ atomic_sub((new_hrs * 3600), &dd->ipath_active_time);
+ new_hrs += dd->ipath_eep_hrs;
+ if (new_hrs > 0xFFFF)
+ new_hrs = 0xFFFF;
+ dd->ipath_eep_hrs = new_hrs;
+ if ((new_hrs & 0xFF) != ifp->if_powerhour[0]) {
+ ifp->if_powerhour[0] = new_hrs & 0xFF;
+ hi_water = offsetof(struct ipath_flash, if_powerhour);
+ }
+ if ((new_hrs >> 8) != ifp->if_powerhour[1]) {
+ ifp->if_powerhour[1] = new_hrs >> 8;
+ hi_water = offsetof(struct ipath_flash, if_powerhour)
+ + 1;
+ }
+ }
+ /*
+ * There is a tiny possibility that we could somehow fail to write
+ * the EEPROM after updating our shadows, but problems from holding
+ * the spinlock too long are a much bigger issue.
+ */
+ spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
+ if (hi_water) {
+ /* we made some change to the data, update cksum and write */
+ csum = flash_csum(ifp, 1);
+ ret = ipath_eeprom_internal_write(dd, 0, buf, hi_water + 1);
+ }
+ mutex_unlock(&dd->ipath_eep_lock);
+ if (ret)
+ ipath_dev_err(dd, "Failed updating EEPROM\n");
+
+free_bail:
+ vfree(buf);
+bail:
+ return ret;
+
+}
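+
+ /*
+ * Illustrative sketch, not part of the original driver: the active-time
+ * bookkeeping above. Whole hours are moved from the running seconds
+ * counter into the 16-bit EEPROM hour count; the remainder stays in
+ * the seconds counter so no time is lost. Names and values are
+ * hypothetical. Guarded with #if 0 so it is never built.
+ */
+ #if 0
+ static void example_transfer_hours(u32 *seconds, u32 *eeprom_hours)
+ {
+         u32 hrs = *seconds / 3600;      /* e.g. 7325 s -> 2 h */
+
+         *seconds -= hrs * 3600;         /* 125 s remain for next time */
+         *eeprom_hours += hrs;
+         if (*eeprom_hours > 0xFFFF)     /* saturate the 16-bit field */
+                 *eeprom_hours = 0xFFFF;
+ }
+ #endif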
+
+/**
+ * ipath_inc_eeprom_err - increment one of the four error counters
+ * that are logged to EEPROM.
+ * @dd: the infinipath device
+ * @eidx: 0..3, the counter to increment
+ * @incr: how much to add
+ *
+ * Each counter is 8-bits, and saturates at 255 (0xFF). They
+ * are copied to the EEPROM (aka flash) whenever ipath_update_eeprom_log()
+ * is called, but it can only be called in a context that allows sleep.
+ * This function can be called even at interrupt level.
+ */
+
+void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr)
+{
+ uint new_val;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
+ new_val = dd->ipath_eep_st_new_errs[eidx] + incr;
+ if (new_val > 255)
+ new_val = 255;
+ dd->ipath_eep_st_new_errs[eidx] = new_val;
+ spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
+ return;
+}
+
+static int ipath_tempsense_internal_read(struct ipath_devdata *dd, u8 regnum)
+{
+ int ret;
+ struct i2c_chain_desc *icd;
+
+ ret = -ENOENT;
+
+ icd = ipath_i2c_type(dd);
+ if (!icd)
+ goto bail;
+
+ if (icd->temp_dev == IPATH_NO_DEV) {
+ /* tempsense only exists on new, real-I2C boards */
+ ret = -ENXIO;
+ goto bail;
+ }
+
+ if (i2c_startcmd(dd, icd->temp_dev | WRITE_CMD)) {
+ ipath_dbg("Failed tempsense startcmd\n");
+ stop_cmd(dd);
+ ret = -ENXIO;
+ goto bail;
+ }
+ ret = wr_byte(dd, regnum);
+ stop_cmd(dd);
+ if (ret) {
+ ipath_dev_err(dd, "Failed tempsense WR command %02X\n",
+ regnum);
+ ret = -ENXIO;
+ goto bail;
+ }
+ if (i2c_startcmd(dd, icd->temp_dev | READ_CMD)) {
+ ipath_dbg("Failed tempsense RD startcmd\n");
+ stop_cmd(dd);
+ ret = -ENXIO;
+ goto bail;
+ }
+ /*
+ * We can only clock out one byte per command, sensibly
+ */
+ ret = rd_byte(dd);
+ stop_cmd(dd);
+
+bail:
+ return ret;
+}
+
+#define VALID_TS_RD_REG_MASK 0xBF
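+/* 0xBF = 1011 1111b: bits 0-5 and 7 set, so register 6 is the one we lack */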
+
+/**
+ * ipath_tempsense_read - read register of temp sensor via I2C
+ * @dd: the infinipath device
+ * @regnum: register to read from
+ *
+ * returns reg contents (0..255) or < 0 for error
+ */
+int ipath_tempsense_read(struct ipath_devdata *dd, u8 regnum)
+{
+ int ret;
+
+ if (regnum > 7)
+ return -EINVAL;
+
+ /* return a bogus value for (the one) register we do not have */
+ if (!((1 << regnum) & VALID_TS_RD_REG_MASK))
+ return 0;
+
+ ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
+ if (!ret) {
+ ret = ipath_tempsense_internal_read(dd, regnum);
+ mutex_unlock(&dd->ipath_eep_lock);
+ }
+
+ /*
+ * There are three possibilities here:
+ * ret is actual value (0..255)
+ * ret is -ENXIO or -EINVAL from code in this file
+ * ret is -EINTR from mutex_lock_interruptible.
+ */
+ return ret;
+}
+
+static int ipath_tempsense_internal_write(struct ipath_devdata *dd,
+ u8 regnum, u8 data)
+{
+ int ret = -ENOENT;
+ struct i2c_chain_desc *icd;
+
+ icd = ipath_i2c_type(dd);
+ if (!icd)
+ goto bail;
+
+ if (icd->temp_dev == IPATH_NO_DEV) {
+ /* tempsense only exists on new, real-I2C boards */
+ ret = -ENXIO;
+ goto bail;
+ }
+ if (i2c_startcmd(dd, icd->temp_dev | WRITE_CMD)) {
+ ipath_dbg("Failed tempsense startcmd\n");
+ stop_cmd(dd);
+ ret = -ENXIO;
+ goto bail;
+ }
+ ret = wr_byte(dd, regnum);
+ if (ret) {
+ stop_cmd(dd);
+ ipath_dev_err(dd, "Failed to write tempsense command %02X\n",
+ regnum);
+ ret = -ENXIO;
+ goto bail;
+ }
+ ret = wr_byte(dd, data);
+ stop_cmd(dd);
+ ret = i2c_startcmd(dd, icd->temp_dev | READ_CMD);
+ if (ret) {
+ ipath_dev_err(dd, "Failed tempsense data wrt to %02X\n",
+ regnum);
+ ret = -ENXIO;
+ }
+
+bail:
+ return ret;
+}
+
+#define VALID_TS_WR_REG_MASK ((1 << 9) | (1 << 0xB) | (1 << 0xD))
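+/* i.e. only registers 0x9, 0xB, and 0xD are writable via this interface */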
+
+/**
+ * ipath_tempsense_write - write register of temp sensor via I2C
+ * @dd: the infinipath device
+ * @regnum: register to write
+ * @data: data to write
+ *
+ * returns 0 for success or < 0 for error
+ */
+int ipath_tempsense_write(struct ipath_devdata *dd, u8 regnum, u8 data)
+{
+ int ret;
+
+ if (regnum > 15 || !((1 << regnum) & VALID_TS_WR_REG_MASK))
+ return -EINVAL;
+
+ ret = mutex_lock_interruptible(&dd->ipath_eep_lock);
+ if (!ret) {
+ ret = ipath_tempsense_internal_write(dd, regnum, data);
+ mutex_unlock(&dd->ipath_eep_lock);
+ }
+
+ /*
+ * There are three possibilities here:
+ * ret is 0 for success
+ * ret is -ENXIO or -EINVAL from code in this file
+ * ret is -EINTR from mutex_lock_interruptible.
+ */
+ return ret;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c
new file mode 100644
index 000000000..450d15965
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c
@@ -0,0 +1,2620 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/poll.h>
+#include <linux/cdev.h>
+#include <linux/swap.h>
+#include <linux/export.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/jiffies.h>
+#include <linux/cpu.h>
+#include <linux/uio.h>
+#include <asm/pgtable.h>
+
+#include "ipath_kernel.h"
+#include "ipath_common.h"
+#include "ipath_user_sdma.h"
+
+static int ipath_open(struct inode *, struct file *);
+static int ipath_close(struct inode *, struct file *);
+static ssize_t ipath_write(struct file *, const char __user *, size_t,
+ loff_t *);
+static ssize_t ipath_write_iter(struct kiocb *, struct iov_iter *from);
+static unsigned int ipath_poll(struct file *, struct poll_table_struct *);
+static int ipath_mmap(struct file *, struct vm_area_struct *);
+
+/*
+ * This is really, really weird shit - write() and writev() here
+ * have completely unrelated semantics. Sucky userland ABI,
+ * film at 11.
+ */
+static const struct file_operations ipath_file_ops = {
+ .owner = THIS_MODULE,
+ .write = ipath_write,
+ .write_iter = ipath_write_iter,
+ .open = ipath_open,
+ .release = ipath_close,
+ .poll = ipath_poll,
+ .mmap = ipath_mmap,
+ .llseek = noop_llseek,
+};
+
+/*
+ * Convert kernel virtual addresses to physical addresses so they don't
+ * potentially conflict with the chip addresses used as mmap offsets.
+ * It doesn't really matter what mmap offset we use as long as we can
+ * interpret it correctly.
+ */
+static u64 cvt_kvaddr(void *p)
+{
+ struct page *page;
+ u64 paddr = 0;
+
+ page = vmalloc_to_page(p);
+ if (page)
+ paddr = page_to_pfn(page) << PAGE_SHIFT;
+
+ return paddr;
+}
+
+static int ipath_get_base_info(struct file *fp,
+ void __user *ubase, size_t ubase_size)
+{
+ struct ipath_portdata *pd = port_fp(fp);
+ int ret = 0;
+ struct ipath_base_info *kinfo = NULL;
+ struct ipath_devdata *dd = pd->port_dd;
+ unsigned subport_cnt;
+ int shared, master;
+ size_t sz;
+
+ subport_cnt = pd->port_subport_cnt;
+ if (!subport_cnt) {
+ shared = 0;
+ master = 0;
+ subport_cnt = 1;
+ } else {
+ shared = 1;
+ master = !subport_fp(fp);
+ }
+
+ sz = sizeof(*kinfo);
+ /* If port sharing is not requested, allow the old size structure */
+ if (!shared)
+ sz -= 7 * sizeof(u64);
+ if (ubase_size < sz) {
+ ipath_cdbg(PROC,
+ "Base size %zu, need %zu (version mismatch?)\n",
+ ubase_size, sz);
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ kinfo = kzalloc(sizeof(*kinfo), GFP_KERNEL);
+ if (kinfo == NULL) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ ret = dd->ipath_f_get_base_info(pd, kinfo);
+ if (ret < 0)
+ goto bail;
+
+ kinfo->spi_rcvhdr_cnt = dd->ipath_rcvhdrcnt;
+ kinfo->spi_rcvhdrent_size = dd->ipath_rcvhdrentsize;
+ kinfo->spi_tidegrcnt = dd->ipath_rcvegrcnt;
+ kinfo->spi_rcv_egrbufsize = dd->ipath_rcvegrbufsize;
+ /*
+ * have to mmap whole thing
+ */
+ kinfo->spi_rcv_egrbuftotlen =
+ pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size;
+ kinfo->spi_rcv_egrperchunk = pd->port_rcvegrbufs_perchunk;
+ kinfo->spi_rcv_egrchunksize = kinfo->spi_rcv_egrbuftotlen /
+ pd->port_rcvegrbuf_chunks;
+ kinfo->spi_tidcnt = dd->ipath_rcvtidcnt / subport_cnt;
+ if (master)
+ kinfo->spi_tidcnt += dd->ipath_rcvtidcnt % subport_cnt;
+	/*
+	 * for this use, may be ipath_cfgports summed over all chips that
+	 * are configured and present
+	 */
+ kinfo->spi_nports = dd->ipath_cfgports;
+ /* unit (chip/board) our port is on */
+ kinfo->spi_unit = dd->ipath_unit;
+ /* for now, only a single page */
+ kinfo->spi_tid_maxsize = PAGE_SIZE;
+
+ /*
+ * Doing this per port, and based on the skip value, etc. This has
+ * to be the actual buffer size, since the protocol code treats it
+ * as an array.
+ *
+ * These have to be set to user addresses in the user code via mmap.
+ * These values are used on return to user code for the mmap target
+ * addresses only. For 32 bit, same 44 bit address problem, so use
+ * the physical address, not virtual. Before 2.6.11, using the
+ * page_address() macro worked, but in 2.6.11, even that returns the
+ * full 64 bit address (upper bits all 1's). So far, using the
+ * physical addresses (or chip offsets, for chip mapping) works, but
+ * no doubt some future kernel release will change that, and we'll be
+ * on to yet another method of dealing with this.
+ */
+ kinfo->spi_rcvhdr_base = (u64) pd->port_rcvhdrq_phys;
+ kinfo->spi_rcvhdr_tailaddr = (u64) pd->port_rcvhdrqtailaddr_phys;
+ kinfo->spi_rcv_egrbufs = (u64) pd->port_rcvegr_phys;
+ kinfo->spi_pioavailaddr = (u64) dd->ipath_pioavailregs_phys;
+ kinfo->spi_status = (u64) kinfo->spi_pioavailaddr +
+ (void *) dd->ipath_statusp -
+ (void *) dd->ipath_pioavailregs_dma;
+ if (!shared) {
+ kinfo->spi_piocnt = pd->port_piocnt;
+ kinfo->spi_piobufbase = (u64) pd->port_piobufs;
+ kinfo->__spi_uregbase = (u64) dd->ipath_uregbase +
+ dd->ipath_ureg_align * pd->port_port;
+ } else if (master) {
+ kinfo->spi_piocnt = (pd->port_piocnt / subport_cnt) +
+ (pd->port_piocnt % subport_cnt);
+ /* Master's PIO buffers are after all the slave's */
+ kinfo->spi_piobufbase = (u64) pd->port_piobufs +
+ dd->ipath_palign *
+ (pd->port_piocnt - kinfo->spi_piocnt);
+ } else {
+ unsigned slave = subport_fp(fp) - 1;
+
+ kinfo->spi_piocnt = pd->port_piocnt / subport_cnt;
+ kinfo->spi_piobufbase = (u64) pd->port_piobufs +
+ dd->ipath_palign * kinfo->spi_piocnt * slave;
+ }
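+	/*
+	 * Hypothetical illustration of the split above: with
+	 * port_piocnt = 50 and 4 subports, each slave gets
+	 * spi_piocnt = 12 while the master gets 12 + 2 = 14, placed
+	 * after the slaves' buffers (36 * ipath_palign bytes past
+	 * port_piobufs).
+	 */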
+
+ if (shared) {
+ kinfo->spi_port_uregbase = (u64) dd->ipath_uregbase +
+ dd->ipath_ureg_align * pd->port_port;
+ kinfo->spi_port_rcvegrbuf = kinfo->spi_rcv_egrbufs;
+ kinfo->spi_port_rcvhdr_base = kinfo->spi_rcvhdr_base;
+ kinfo->spi_port_rcvhdr_tailaddr = kinfo->spi_rcvhdr_tailaddr;
+
+ kinfo->__spi_uregbase = cvt_kvaddr(pd->subport_uregbase +
+ PAGE_SIZE * subport_fp(fp));
+
+ kinfo->spi_rcvhdr_base = cvt_kvaddr(pd->subport_rcvhdr_base +
+ pd->port_rcvhdrq_size * subport_fp(fp));
+ kinfo->spi_rcvhdr_tailaddr = 0;
+ kinfo->spi_rcv_egrbufs = cvt_kvaddr(pd->subport_rcvegrbuf +
+ pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size *
+ subport_fp(fp));
+
+ kinfo->spi_subport_uregbase =
+ cvt_kvaddr(pd->subport_uregbase);
+ kinfo->spi_subport_rcvegrbuf =
+ cvt_kvaddr(pd->subport_rcvegrbuf);
+ kinfo->spi_subport_rcvhdr_base =
+ cvt_kvaddr(pd->subport_rcvhdr_base);
+ ipath_cdbg(PROC, "port %u flags %x %llx %llx %llx\n",
+ kinfo->spi_port, kinfo->spi_runtime_flags,
+ (unsigned long long) kinfo->spi_subport_uregbase,
+ (unsigned long long) kinfo->spi_subport_rcvegrbuf,
+ (unsigned long long) kinfo->spi_subport_rcvhdr_base);
+ }
+
+ /*
+ * All user buffers are 2KB buffers. If we ever support
+ * giving 4KB buffers to user processes, this will need some
+ * work.
+ */
+ kinfo->spi_pioindex = (kinfo->spi_piobufbase -
+ (dd->ipath_piobufbase & 0xffffffff)) / dd->ipath_palign;
+ kinfo->spi_pioalign = dd->ipath_palign;
+
+ kinfo->spi_qpair = IPATH_KD_QP;
+ /*
+ * user mode PIO buffers are always 2KB, even when 4KB can
+ * be received, and sent via the kernel; this is ibmaxlen
+ * for 2K MTU.
+ */
+ kinfo->spi_piosize = dd->ipath_piosize2k - 2 * sizeof(u32);
+ kinfo->spi_mtu = dd->ipath_ibmaxlen; /* maxlen, not ibmtu */
+ kinfo->spi_port = pd->port_port;
+ kinfo->spi_subport = subport_fp(fp);
+ kinfo->spi_sw_version = IPATH_KERN_SWVERSION;
+ kinfo->spi_hw_version = dd->ipath_revision;
+
+ if (master) {
+ kinfo->spi_runtime_flags |= IPATH_RUNTIME_MASTER;
+ }
+
+ sz = (ubase_size < sizeof(*kinfo)) ? ubase_size : sizeof(*kinfo);
+ if (copy_to_user(ubase, kinfo, sz))
+ ret = -EFAULT;
+
+bail:
+ kfree(kinfo);
+ return ret;
+}
+
+/**
+ * ipath_tid_update - update a port TID
+ * @pd: the port
+ * @fp: the ipath device file
+ * @ti: the TID information
+ *
+ * The new implementation as of Oct 2004 is that the driver assigns
+ * the tid and returns it to the caller. To make it easier to
+ * catch bugs, and to reduce search time, we keep a cursor for
+ * each port, walking the shadow tid array to find one that's not
+ * in use.
+ *
+ * For now, if we can't allocate the full list, we fail, although
+ * in the long run, we'll allocate as many as we can, and the
+ * caller will deal with that by trying the remaining pages later.
+ * That means that when we fail, we have to mark the tids as not in
+ * use again, in our shadow copy.
+ *
+ * It's up to the caller to free the tids when they are done.
+ * We'll unlock the pages as they free them.
+ *
+ * Also, right now we are locking one page at a time, but since
+ * the intended use of this routine is for a single group of
+ * virtually contiguous pages, that should change to improve
+ * performance.
+ */
+static int ipath_tid_update(struct ipath_portdata *pd, struct file *fp,
+ const struct ipath_tid_info *ti)
+{
+ int ret = 0, ntids;
+ u32 tid, porttid, cnt, i, tidcnt, tidoff;
+ u16 *tidlist;
+ struct ipath_devdata *dd = pd->port_dd;
+ u64 physaddr;
+ unsigned long vaddr;
+ u64 __iomem *tidbase;
+ unsigned long tidmap[8];
+ struct page **pagep = NULL;
+ unsigned subport = subport_fp(fp);
+
+ if (!dd->ipath_pageshadow) {
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ cnt = ti->tidcnt;
+ if (!cnt) {
+ ipath_dbg("After copyin, tidcnt 0, tidlist %llx\n",
+ (unsigned long long) ti->tidlist);
+ /*
+ * Should we treat as success? likely a bug
+ */
+ ret = -EFAULT;
+ goto done;
+ }
+ porttid = pd->port_port * dd->ipath_rcvtidcnt;
+ if (!pd->port_subport_cnt) {
+ tidcnt = dd->ipath_rcvtidcnt;
+ tid = pd->port_tidcursor;
+ tidoff = 0;
+ } else if (!subport) {
+ tidcnt = (dd->ipath_rcvtidcnt / pd->port_subport_cnt) +
+ (dd->ipath_rcvtidcnt % pd->port_subport_cnt);
+ tidoff = dd->ipath_rcvtidcnt - tidcnt;
+ porttid += tidoff;
+ tid = tidcursor_fp(fp);
+ } else {
+ tidcnt = dd->ipath_rcvtidcnt / pd->port_subport_cnt;
+ tidoff = tidcnt * (subport - 1);
+ porttid += tidoff;
+ tid = tidcursor_fp(fp);
+ }
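+	/*
+	 * Worked example of the split above (numbers are hypothetical):
+	 * with ipath_rcvtidcnt = 50 and port_subport_cnt = 4, each slave
+	 * gets tidcnt = 12 (slave k at tidoff = 12 * (k - 1), i.e. 0, 12,
+	 * 24), while the master gets tidcnt = 12 + 2 = 14 at tidoff = 36,
+	 * after all the slaves' TIDs.
+	 */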
+ if (cnt > tidcnt) {
+ /* make sure it all fits in port_tid_pg_list */
+ dev_info(&dd->pcidev->dev, "Process tried to allocate %u "
+ "TIDs, only trying max (%u)\n", cnt, tidcnt);
+ cnt = tidcnt;
+ }
+ pagep = &((struct page **) pd->port_tid_pg_list)[tidoff];
+ tidlist = &((u16 *) &pagep[dd->ipath_rcvtidcnt])[tidoff];
+
+ memset(tidmap, 0, sizeof(tidmap));
+ /* before decrement; chip actual # */
+ ntids = tidcnt;
+ tidbase = (u64 __iomem *) (((char __iomem *) dd->ipath_kregbase) +
+ dd->ipath_rcvtidbase +
+ porttid * sizeof(*tidbase));
+
+ ipath_cdbg(VERBOSE, "Port%u %u tids, cursor %u, tidbase %p\n",
+ pd->port_port, cnt, tid, tidbase);
+
+ /* virtual address of first page in transfer */
+ vaddr = ti->tidvaddr;
+ if (!access_ok(VERIFY_WRITE, (void __user *) vaddr,
+ cnt * PAGE_SIZE)) {
+ ipath_dbg("Fail vaddr %p, %u pages, !access_ok\n",
+ (void *)vaddr, cnt);
+ ret = -EFAULT;
+ goto done;
+ }
+ ret = ipath_get_user_pages(vaddr, cnt, pagep);
+ if (ret) {
+ if (ret == -EBUSY) {
+ ipath_dbg("Failed to lock addr %p, %u pages "
+ "(already locked)\n",
+ (void *) vaddr, cnt);
+			/*
+			 * For now, continue and see what happens; with the
+			 * new implementation this should never happen,
+			 * unless perhaps the user has mpin'ed the pages
+			 * themselves (something we need to test).
+			 */
+ ret = 0;
+ } else {
+ dev_info(&dd->pcidev->dev,
+ "Failed to lock addr %p, %u pages: "
+ "errno %d\n", (void *) vaddr, cnt, -ret);
+ goto done;
+ }
+ }
+ for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) {
+ for (; ntids--; tid++) {
+ if (tid == tidcnt)
+ tid = 0;
+ if (!dd->ipath_pageshadow[porttid + tid])
+ break;
+ }
+ if (ntids < 0) {
+ /*
+ * oops, wrapped all the way through their TIDs,
+ * and didn't have enough free; see comments at
+ * start of routine
+ */
+ ipath_dbg("Not enough free TIDs for %u pages "
+ "(index %d), failing\n", cnt, i);
+ i--; /* last tidlist[i] not filled in */
+ ret = -ENOMEM;
+ break;
+ }
+ tidlist[i] = tid + tidoff;
+ ipath_cdbg(VERBOSE, "Updating idx %u to TID %u, "
+ "vaddr %lx\n", i, tid + tidoff, vaddr);
+ /* we "know" system pages and TID pages are same size */
+ dd->ipath_pageshadow[porttid + tid] = pagep[i];
+ dd->ipath_physshadow[porttid + tid] = ipath_map_page(
+ dd->pcidev, pagep[i], 0, PAGE_SIZE,
+ PCI_DMA_FROMDEVICE);
+		/*
+		 * don't need an atomic op, or its overhead
+		 */
+ __set_bit(tid, tidmap);
+ physaddr = dd->ipath_physshadow[porttid + tid];
+ ipath_stats.sps_pagelocks++;
+ ipath_cdbg(VERBOSE,
+ "TID %u, vaddr %lx, physaddr %llx pgp %p\n",
+ tid, vaddr, (unsigned long long) physaddr,
+ pagep[i]);
+ dd->ipath_f_put_tid(dd, &tidbase[tid], RCVHQ_RCV_TYPE_EXPECTED,
+ physaddr);
+ /*
+ * don't check this tid in ipath_portshadow, since we
+ * just filled it in; start with the next one.
+ */
+ tid++;
+ }
+
+ if (ret) {
+ u32 limit;
+ cleanup:
+ /* jump here if copy out of updated info failed... */
+ ipath_dbg("After failure (ret=%d), undo %d of %d entries\n",
+ -ret, i, cnt);
+ /* same code that's in ipath_free_tid() */
+ limit = sizeof(tidmap) * BITS_PER_BYTE;
+ if (limit > tidcnt)
+ /* just in case size changes in future */
+ limit = tidcnt;
+ tid = find_first_bit((const unsigned long *)tidmap, limit);
+ for (; tid < limit; tid++) {
+ if (!test_bit(tid, tidmap))
+ continue;
+ if (dd->ipath_pageshadow[porttid + tid]) {
+ ipath_cdbg(VERBOSE, "Freeing TID %u\n",
+ tid);
+ dd->ipath_f_put_tid(dd, &tidbase[tid],
+ RCVHQ_RCV_TYPE_EXPECTED,
+ dd->ipath_tidinvalid);
+ pci_unmap_page(dd->pcidev,
+ dd->ipath_physshadow[porttid + tid],
+ PAGE_SIZE, PCI_DMA_FROMDEVICE);
+ dd->ipath_pageshadow[porttid + tid] = NULL;
+ ipath_stats.sps_pageunlocks++;
+ }
+ }
+ ipath_release_user_pages(pagep, cnt);
+ } else {
+		/*
+		 * Copy the updated array, with ipath_tid's filled in, back
+		 * to user. Since we did the copy in already, this "should
+		 * never fail". If it does, we have to clean up...
+		 */
+ if (copy_to_user((void __user *)
+ (unsigned long) ti->tidlist,
+ tidlist, cnt * sizeof(*tidlist))) {
+ ret = -EFAULT;
+ goto cleanup;
+ }
+ if (copy_to_user((void __user *) (unsigned long) ti->tidmap,
+ tidmap, sizeof tidmap)) {
+ ret = -EFAULT;
+ goto cleanup;
+ }
+ if (tid == tidcnt)
+ tid = 0;
+ if (!pd->port_subport_cnt)
+ pd->port_tidcursor = tid;
+ else
+ tidcursor_fp(fp) = tid;
+ }
+
+done:
+ if (ret)
+ ipath_dbg("Failed to map %u TID pages, failing with %d\n",
+ ti->tidcnt, -ret);
+ return ret;
+}
+
+/**
+ * ipath_tid_free - free a port TID
+ * @pd: the port
+ * @subport: the subport
+ * @ti: the TID info
+ *
+ * right now we are unlocking one page at a time, but since
+ * the intended use of this routine is for a single group of
+ * virtually contiguous pages, that should change to improve
+ * performance. We check that the TID is in range for this port
+ * but otherwise don't check validity; if the user has an error and
+ * frees the wrong tid, it is only their own data that can thereby
+ * be corrupted. We do check that the TID was in use, for sanity.
+ * We always use our idea of the saved address, not the address that
+ * they pass in to us.
+ */
+
+static int ipath_tid_free(struct ipath_portdata *pd, unsigned subport,
+ const struct ipath_tid_info *ti)
+{
+ int ret = 0;
+ u32 tid, porttid, cnt, limit, tidcnt;
+ struct ipath_devdata *dd = pd->port_dd;
+ u64 __iomem *tidbase;
+ unsigned long tidmap[8];
+
+ if (!dd->ipath_pageshadow) {
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ if (copy_from_user(tidmap, (void __user *)(unsigned long)ti->tidmap,
+ sizeof tidmap)) {
+ ret = -EFAULT;
+ goto done;
+ }
+
+ porttid = pd->port_port * dd->ipath_rcvtidcnt;
+ if (!pd->port_subport_cnt)
+ tidcnt = dd->ipath_rcvtidcnt;
+ else if (!subport) {
+ tidcnt = (dd->ipath_rcvtidcnt / pd->port_subport_cnt) +
+ (dd->ipath_rcvtidcnt % pd->port_subport_cnt);
+ porttid += dd->ipath_rcvtidcnt - tidcnt;
+ } else {
+ tidcnt = dd->ipath_rcvtidcnt / pd->port_subport_cnt;
+ porttid += tidcnt * (subport - 1);
+ }
+ tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
+ dd->ipath_rcvtidbase +
+ porttid * sizeof(*tidbase));
+
+ limit = sizeof(tidmap) * BITS_PER_BYTE;
+ if (limit > tidcnt)
+ /* just in case size changes in future */
+ limit = tidcnt;
+ tid = find_first_bit(tidmap, limit);
+ ipath_cdbg(VERBOSE, "Port%u free %u tids; first bit (max=%d) "
+ "set is %d, porttid %u\n", pd->port_port, ti->tidcnt,
+ limit, tid, porttid);
+ for (cnt = 0; tid < limit; tid++) {
+ /*
+ * small optimization; if we detect a run of 3 or so without
+ * any set, use find_first_bit again. That's mainly to
+ * accelerate the case where we wrapped, so we have some at
+ * the beginning, and some at the end, and a big gap
+ * in the middle.
+ */
+ if (!test_bit(tid, tidmap))
+ continue;
+ cnt++;
+ if (dd->ipath_pageshadow[porttid + tid]) {
+ struct page *p;
+ p = dd->ipath_pageshadow[porttid + tid];
+ dd->ipath_pageshadow[porttid + tid] = NULL;
+ ipath_cdbg(VERBOSE, "PID %u freeing TID %u\n",
+ pid_nr(pd->port_pid), tid);
+ dd->ipath_f_put_tid(dd, &tidbase[tid],
+ RCVHQ_RCV_TYPE_EXPECTED,
+ dd->ipath_tidinvalid);
+ pci_unmap_page(dd->pcidev,
+ dd->ipath_physshadow[porttid + tid],
+ PAGE_SIZE, PCI_DMA_FROMDEVICE);
+ ipath_release_user_pages(&p, 1);
+ ipath_stats.sps_pageunlocks++;
+ } else
+ ipath_dbg("Unused tid %u, ignoring\n", tid);
+ }
+ if (cnt != ti->tidcnt)
+ ipath_dbg("passed in tidcnt %d, only %d bits set in map\n",
+ ti->tidcnt, cnt);
+done:
+ if (ret)
+ ipath_dbg("Failed to unmap %u TID pages, failing with %d\n",
+ ti->tidcnt, -ret);
+ return ret;
+}
+
+/**
+ * ipath_set_part_key - set a partition key
+ * @pd: the port
+ * @key: the key
+ *
+ * We can have up to 4 active at a time (other than the default, which is
+ * always allowed). This is somewhat tricky, since multiple ports may set
+ * the same key, so we reference count them, and clean up at exit. All 4
+ * partition keys are packed into a single infinipath register. It's an
+ * error for a process to set the same pkey multiple times. We provide no
+ * mechanism to de-allocate a pkey at this time; we may eventually need to
+ * do that. We use atomic operations, no locking, and only a single pass
+ * through what's available. This should be more than adequate for some
+ * time. Spinlocks or the like can be considered if and when they become
+ * necessary.
+ */
+static int ipath_set_part_key(struct ipath_portdata *pd, u16 key)
+{
+ struct ipath_devdata *dd = pd->port_dd;
+ int i, any = 0, pidx = -1;
+ u16 lkey = key & 0x7FFF;
+ int ret;
+
+ if (lkey == (IPATH_DEFAULT_P_KEY & 0x7FFF)) {
+ /* nothing to do; this key always valid */
+ ret = 0;
+ goto bail;
+ }
+
+ ipath_cdbg(VERBOSE, "p%u try to set pkey %hx, current keys "
+ "%hx:%x %hx:%x %hx:%x %hx:%x\n",
+ pd->port_port, key, dd->ipath_pkeys[0],
+ atomic_read(&dd->ipath_pkeyrefs[0]), dd->ipath_pkeys[1],
+ atomic_read(&dd->ipath_pkeyrefs[1]), dd->ipath_pkeys[2],
+ atomic_read(&dd->ipath_pkeyrefs[2]), dd->ipath_pkeys[3],
+ atomic_read(&dd->ipath_pkeyrefs[3]));
+
+ if (!lkey) {
+ ipath_cdbg(PROC, "p%u tries to set key 0, not allowed\n",
+ pd->port_port);
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ /*
+ * Set the full membership bit, because it has to be
+ * set in the register or the packet, and it seems
+ * cleaner to set in the register than to force all
+ * callers to set it. (see bug 4331)
+ */
+ key |= 0x8000;
+
+ for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
+ if (!pd->port_pkeys[i] && pidx == -1)
+ pidx = i;
+ if (pd->port_pkeys[i] == key) {
+ ipath_cdbg(VERBOSE, "p%u tries to set same pkey "
+ "(%x) more than once\n",
+ pd->port_port, key);
+ ret = -EEXIST;
+ goto bail;
+ }
+ }
+ if (pidx == -1) {
+ ipath_dbg("All pkeys for port %u already in use, "
+ "can't set %x\n", pd->port_port, key);
+ ret = -EBUSY;
+ goto bail;
+ }
+ for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+ if (!dd->ipath_pkeys[i]) {
+ any++;
+ continue;
+ }
+ if (dd->ipath_pkeys[i] == key) {
+ atomic_t *pkrefs = &dd->ipath_pkeyrefs[i];
+
+ if (atomic_inc_return(pkrefs) > 1) {
+ pd->port_pkeys[pidx] = key;
+ ipath_cdbg(VERBOSE, "p%u set key %x "
+ "matches #%d, count now %d\n",
+ pd->port_port, key, i,
+ atomic_read(pkrefs));
+ ret = 0;
+ goto bail;
+ } else {
+ /*
+ * lost race, decrement count, catch below
+ */
+ atomic_dec(pkrefs);
+ ipath_cdbg(VERBOSE, "Lost race, count was "
+ "0, after dec, it's %d\n",
+ atomic_read(pkrefs));
+ any++;
+ }
+ }
+ if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) {
+ /*
+ * It makes no sense to have both the limited and
+ * full membership PKEY set at the same time since
+ * the unlimited one will disable the limited one.
+ */
+ ret = -EEXIST;
+ goto bail;
+ }
+ }
+ if (!any) {
+ ipath_dbg("port %u, all pkeys already in use, "
+ "can't set %x\n", pd->port_port, key);
+ ret = -EBUSY;
+ goto bail;
+ }
+ for (any = i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+ if (!dd->ipath_pkeys[i] &&
+ atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) {
+ u64 pkey;
+
+ /* for ipathstats, etc. */
+ ipath_stats.sps_pkeys[i] = lkey;
+ pd->port_pkeys[pidx] = dd->ipath_pkeys[i] = key;
+ pkey =
+ (u64) dd->ipath_pkeys[0] |
+ ((u64) dd->ipath_pkeys[1] << 16) |
+ ((u64) dd->ipath_pkeys[2] << 32) |
+ ((u64) dd->ipath_pkeys[3] << 48);
+ ipath_cdbg(PROC, "p%u set key %x in #%d, "
+ "portidx %d, new pkey reg %llx\n",
+ pd->port_port, key, i, pidx,
+ (unsigned long long) pkey);
+ ipath_write_kreg(
+ dd, dd->ipath_kregs->kr_partitionkey, pkey);
+
+ ret = 0;
+ goto bail;
+ }
+ }
+ ipath_dbg("port %u, all pkeys already in use 2nd pass, "
+ "can't set %x\n", pd->port_port, key);
+ ret = -EBUSY;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_manage_rcvq - manage a port's receive queue
+ * @pd: the port
+ * @subport: the subport
+ * @start_stop: action to carry out
+ *
+ * start_stop == 0 disables receive on the port, for use in queue
+ * overflow conditions. start_stop == 1 re-enables, to be used to
+ * re-init the software copy of the head register.
+ */
+static int ipath_manage_rcvq(struct ipath_portdata *pd, unsigned subport,
+ int start_stop)
+{
+ struct ipath_devdata *dd = pd->port_dd;
+
+ ipath_cdbg(PROC, "%sabling rcv for unit %u port %u:%u\n",
+ start_stop ? "en" : "dis", dd->ipath_unit,
+ pd->port_port, subport);
+ if (subport)
+ goto bail;
+ /* atomically clear receive enable port. */
+ if (start_stop) {
+ /*
+ * On enable, force in-memory copy of the tail register to
+ * 0, so that protocol code doesn't have to worry about
+ * whether or not the chip has yet updated the in-memory
+ * copy or not on return from the system call. The chip
+		 * always resets its tail register back to 0 on a
+ * transition from disabled to enabled. This could cause a
+ * problem if software was broken, and did the enable w/o
+ * the disable, but eventually the in-memory copy will be
+ * updated and correct itself, even in the face of software
+ * bugs.
+ */
+ if (pd->port_rcvhdrtail_kvaddr)
+ ipath_clear_rcvhdrtail(pd);
+ set_bit(dd->ipath_r_portenable_shift + pd->port_port,
+ &dd->ipath_rcvctrl);
+ } else
+ clear_bit(dd->ipath_r_portenable_shift + pd->port_port,
+ &dd->ipath_rcvctrl);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+ dd->ipath_rcvctrl);
+ /* now be sure chip saw it before we return */
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ if (start_stop) {
+ /*
+ * And try to be sure that tail reg update has happened too.
+ * This should in theory interlock with the RXE changes to
+ * the tail register. Don't assign it to the tail register
+ * in memory copy, since we could overwrite an update by the
+ * chip if we did.
+ */
+ ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port);
+ }
+ /* always; new head should be equal to new tail; see above */
+bail:
+ return 0;
+}
+
+static void ipath_clean_part_key(struct ipath_portdata *pd,
+ struct ipath_devdata *dd)
+{
+ int i, j, pchanged = 0;
+ u64 oldpkey;
+
+ /* for debugging only */
+ oldpkey = (u64) dd->ipath_pkeys[0] |
+ ((u64) dd->ipath_pkeys[1] << 16) |
+ ((u64) dd->ipath_pkeys[2] << 32) |
+ ((u64) dd->ipath_pkeys[3] << 48);
+
+ for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
+ if (!pd->port_pkeys[i])
+ continue;
+ ipath_cdbg(VERBOSE, "look for key[%d] %hx in pkeys\n", i,
+ pd->port_pkeys[i]);
+ for (j = 0; j < ARRAY_SIZE(dd->ipath_pkeys); j++) {
+ /* check for match independent of the global bit */
+ if ((dd->ipath_pkeys[j] & 0x7fff) !=
+ (pd->port_pkeys[i] & 0x7fff))
+ continue;
+ if (atomic_dec_and_test(&dd->ipath_pkeyrefs[j])) {
+ ipath_cdbg(VERBOSE, "p%u clear key "
+ "%x matches #%d\n",
+ pd->port_port,
+ pd->port_pkeys[i], j);
+ ipath_stats.sps_pkeys[j] =
+ dd->ipath_pkeys[j] = 0;
+ pchanged++;
+			} else
+				ipath_cdbg(VERBOSE,
+					"p%u key %x matches #%d, "
+					"but ref still %d\n", pd->port_port,
+					pd->port_pkeys[i], j,
+					atomic_read(&dd->ipath_pkeyrefs[j]));
+ break;
+ }
+ pd->port_pkeys[i] = 0;
+ }
+ if (pchanged) {
+ u64 pkey = (u64) dd->ipath_pkeys[0] |
+ ((u64) dd->ipath_pkeys[1] << 16) |
+ ((u64) dd->ipath_pkeys[2] << 32) |
+ ((u64) dd->ipath_pkeys[3] << 48);
+ ipath_cdbg(VERBOSE, "p%u old pkey reg %llx, "
+ "new pkey reg %llx\n", pd->port_port,
+ (unsigned long long) oldpkey,
+ (unsigned long long) pkey);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey,
+ pkey);
+ }
+}
+
+/*
+ * Initialize the port data with the receive buffer sizes
+ * so this can be done while the master port is locked.
+ * Otherwise, there is a race with a slave opening the port
+ * and seeing these fields uninitialized.
+ */
+static void init_user_egr_sizes(struct ipath_portdata *pd)
+{
+ struct ipath_devdata *dd = pd->port_dd;
+ unsigned egrperchunk, egrcnt, size;
+
+ /*
+ * to avoid wasting a lot of memory, we allocate 32KB chunks of
+ * physically contiguous memory, advance through it until used up
+ * and then allocate more. Of course, we need memory to store those
+ * extra pointers, now. Started out with 256KB, but under heavy
+ * memory pressure (creating large files and then copying them over
+ * NFS while doing lots of MPI jobs), we hit some allocation
+ * failures, even though we can sleep... (2.6.10) Still get
+ * failures at 64K. 32K is the lowest we can go without wasting
+ * additional memory.
+ */
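+	/*
+	 * Worked example (hypothetical sizes): with a 2 KB eager buffer,
+	 * each 32 KB (0x8000) chunk holds 16 buffers, so an egrcnt of 2048
+	 * would come out to 2048 / 16 = 128 chunks.
+	 */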
+ size = 0x8000;
+ egrperchunk = size / dd->ipath_rcvegrbufsize;
+ egrcnt = dd->ipath_rcvegrcnt;
+ pd->port_rcvegrbuf_chunks = (egrcnt + egrperchunk - 1) / egrperchunk;
+ pd->port_rcvegrbufs_perchunk = egrperchunk;
+ pd->port_rcvegrbuf_size = size;
+}
+
+/**
+ * ipath_create_user_egr - allocate eager TID buffers
+ * @pd: the port to allocate TID buffers for
+ *
+ * This routine is now quite different for user and kernel, because
+ * the kernel uses skb's, for the accelerated network performance.
+ * This is the user port version.
+ *
+ * Allocate the eager TID buffers and program them into infinipath.
+ * They are no longer completely contiguous; we do multiple allocation
+ * calls.
+ */
+static int ipath_create_user_egr(struct ipath_portdata *pd)
+{
+ struct ipath_devdata *dd = pd->port_dd;
+ unsigned e, egrcnt, egrperchunk, chunk, egrsize, egroff;
+ size_t size;
+ int ret;
+ gfp_t gfp_flags;
+
+ /*
+ * GFP_USER, but without GFP_FS, so buffer cache can be
+ * coalesced (we hope); otherwise, even at order 4,
+ * heavy filesystem activity makes these fail, and we can
+ * use compound pages.
+ */
+ gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP;
+
+ egrcnt = dd->ipath_rcvegrcnt;
+ /* TID number offset for this port */
+ egroff = (pd->port_port - 1) * egrcnt + dd->ipath_p0_rcvegrcnt;
+ egrsize = dd->ipath_rcvegrbufsize;
+ ipath_cdbg(VERBOSE, "Allocating %d egr buffers, at egrtid "
+ "offset %x, egrsize %u\n", egrcnt, egroff, egrsize);
+
+ chunk = pd->port_rcvegrbuf_chunks;
+ egrperchunk = pd->port_rcvegrbufs_perchunk;
+ size = pd->port_rcvegrbuf_size;
+ pd->port_rcvegrbuf = kmalloc(chunk * sizeof(pd->port_rcvegrbuf[0]),
+ GFP_KERNEL);
+ if (!pd->port_rcvegrbuf) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+ pd->port_rcvegrbuf_phys =
+ kmalloc(chunk * sizeof(pd->port_rcvegrbuf_phys[0]),
+ GFP_KERNEL);
+ if (!pd->port_rcvegrbuf_phys) {
+ ret = -ENOMEM;
+ goto bail_rcvegrbuf;
+ }
+ for (e = 0; e < pd->port_rcvegrbuf_chunks; e++) {
+
+ pd->port_rcvegrbuf[e] = dma_alloc_coherent(
+ &dd->pcidev->dev, size, &pd->port_rcvegrbuf_phys[e],
+ gfp_flags);
+
+ if (!pd->port_rcvegrbuf[e]) {
+ ret = -ENOMEM;
+ goto bail_rcvegrbuf_phys;
+ }
+ }
+
+ pd->port_rcvegr_phys = pd->port_rcvegrbuf_phys[0];
+
+ for (e = chunk = 0; chunk < pd->port_rcvegrbuf_chunks; chunk++) {
+ dma_addr_t pa = pd->port_rcvegrbuf_phys[chunk];
+ unsigned i;
+
+ for (i = 0; e < egrcnt && i < egrperchunk; e++, i++) {
+ dd->ipath_f_put_tid(dd, e + egroff +
+ (u64 __iomem *)
+ ((char __iomem *)
+ dd->ipath_kregbase +
+ dd->ipath_rcvegrbase),
+ RCVHQ_RCV_TYPE_EAGER, pa);
+ pa += egrsize;
+ }
+ cond_resched(); /* don't hog the cpu */
+ }
+
+ ret = 0;
+ goto bail;
+
+bail_rcvegrbuf_phys:
+ for (e = 0; e < pd->port_rcvegrbuf_chunks &&
+ pd->port_rcvegrbuf[e]; e++) {
+ dma_free_coherent(&dd->pcidev->dev, size,
+ pd->port_rcvegrbuf[e],
+ pd->port_rcvegrbuf_phys[e]);
+
+ }
+ kfree(pd->port_rcvegrbuf_phys);
+ pd->port_rcvegrbuf_phys = NULL;
+bail_rcvegrbuf:
+ kfree(pd->port_rcvegrbuf);
+ pd->port_rcvegrbuf = NULL;
+bail:
+ return ret;
+}
+
+
+/* common code for the mappings on dma_alloc_coherent mem */
+static int ipath_mmap_mem(struct vm_area_struct *vma,
+ struct ipath_portdata *pd, unsigned len, int write_ok,
+ void *kvaddr, char *what)
+{
+ struct ipath_devdata *dd = pd->port_dd;
+ unsigned long pfn;
+ int ret;
+
+ if ((vma->vm_end - vma->vm_start) > len) {
+ dev_info(&dd->pcidev->dev,
+ "FAIL on %s: len %lx > %x\n", what,
+ vma->vm_end - vma->vm_start, len);
+ ret = -EFAULT;
+ goto bail;
+ }
+
+ if (!write_ok) {
+ if (vma->vm_flags & VM_WRITE) {
+ dev_info(&dd->pcidev->dev,
+ "%s must be mapped readonly\n", what);
+ ret = -EPERM;
+ goto bail;
+ }
+
+ /* don't allow them to later change with mprotect */
+ vma->vm_flags &= ~VM_MAYWRITE;
+ }
+
+ pfn = virt_to_phys(kvaddr) >> PAGE_SHIFT;
+ ret = remap_pfn_range(vma, vma->vm_start, pfn,
+ len, vma->vm_page_prot);
+ if (ret)
+ dev_info(&dd->pcidev->dev, "%s port%u mmap of %lx, %x "
+ "bytes r%c failed: %d\n", what, pd->port_port,
+ pfn, len, write_ok?'w':'o', ret);
+ else
+ ipath_cdbg(VERBOSE, "%s port%u mmaped %lx, %x bytes "
+ "r%c\n", what, pd->port_port, pfn, len,
+ write_ok?'w':'o');
+bail:
+ return ret;
+}
+
+static int mmap_ureg(struct vm_area_struct *vma, struct ipath_devdata *dd,
+ u64 ureg)
+{
+ unsigned long phys;
+ int ret;
+
+ /*
+ * This is real hardware, so use io_remap. This is the mechanism
+ * for the user process to update the head registers for their port
+ * in the chip.
+ */
+ if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) {
+ dev_info(&dd->pcidev->dev, "FAIL mmap userreg: reqlen "
+ "%lx > PAGE\n", vma->vm_end - vma->vm_start);
+ ret = -EFAULT;
+ } else {
+ phys = dd->ipath_physaddr + ureg;
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+ vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
+ ret = io_remap_pfn_range(vma, vma->vm_start,
+ phys >> PAGE_SHIFT,
+ vma->vm_end - vma->vm_start,
+ vma->vm_page_prot);
+ }
+ return ret;
+}
+
+static int mmap_piobufs(struct vm_area_struct *vma,
+ struct ipath_devdata *dd,
+ struct ipath_portdata *pd,
+ unsigned piobufs, unsigned piocnt)
+{
+ unsigned long phys;
+ int ret;
+
+ /*
+ * When we map the PIO buffers in the chip, we want to map them as
+ * writeonly, no read possible. This prevents access to previous
+ * process data, and catches users who might try to read the i/o
+ * space due to a bug.
+ */
+ if ((vma->vm_end - vma->vm_start) > (piocnt * dd->ipath_palign)) {
+ dev_info(&dd->pcidev->dev, "FAIL mmap piobufs: "
+ "reqlen %lx > PAGE\n",
+ vma->vm_end - vma->vm_start);
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ phys = dd->ipath_physaddr + piobufs;
+
+#if defined(__powerpc__)
+ /* There isn't a generic way to specify writethrough mappings */
+ pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE;
+ pgprot_val(vma->vm_page_prot) |= _PAGE_WRITETHRU;
+ pgprot_val(vma->vm_page_prot) &= ~_PAGE_GUARDED;
+#endif
+
+ /*
+ * don't allow them to later change to readable with mprotect (for when
+ * not initially mapped readable, as is normally the case)
+ */
+ vma->vm_flags &= ~VM_MAYREAD;
+ vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
+
+ ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT,
+ vma->vm_end - vma->vm_start,
+ vma->vm_page_prot);
+bail:
+ return ret;
+}
+
+static int mmap_rcvegrbufs(struct vm_area_struct *vma,
+ struct ipath_portdata *pd)
+{
+ struct ipath_devdata *dd = pd->port_dd;
+ unsigned long start, size;
+ size_t total_size, i;
+ unsigned long pfn;
+ int ret;
+
+ size = pd->port_rcvegrbuf_size;
+ total_size = pd->port_rcvegrbuf_chunks * size;
+ if ((vma->vm_end - vma->vm_start) > total_size) {
+ dev_info(&dd->pcidev->dev, "FAIL on egr bufs: "
+ "reqlen %lx > actual %lx\n",
+ vma->vm_end - vma->vm_start,
+ (unsigned long) total_size);
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ if (vma->vm_flags & VM_WRITE) {
+ dev_info(&dd->pcidev->dev, "Can't map eager buffers as "
+ "writable (flags=%lx)\n", vma->vm_flags);
+ ret = -EPERM;
+ goto bail;
+ }
+ /* don't allow them to later change to writeable with mprotect */
+ vma->vm_flags &= ~VM_MAYWRITE;
+
+ start = vma->vm_start;
+
+ for (i = 0; i < pd->port_rcvegrbuf_chunks; i++, start += size) {
+ pfn = virt_to_phys(pd->port_rcvegrbuf[i]) >> PAGE_SHIFT;
+ ret = remap_pfn_range(vma, start, pfn, size,
+ vma->vm_page_prot);
+ if (ret < 0)
+ goto bail;
+ }
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/*
+ * ipath_file_vma_fault - handle a VMA page fault.
+ */
+static int ipath_file_vma_fault(struct vm_area_struct *vma,
+ struct vm_fault *vmf)
+{
+ struct page *page;
+
+ page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT));
+ if (!page)
+ return VM_FAULT_SIGBUS;
+ get_page(page);
+ vmf->page = page;
+
+ return 0;
+}
+
+static const struct vm_operations_struct ipath_file_vm_ops = {
+ .fault = ipath_file_vma_fault,
+};
+
+static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr,
+ struct ipath_portdata *pd, unsigned subport)
+{
+ unsigned long len;
+ struct ipath_devdata *dd;
+ void *addr;
+ size_t size;
+ int ret = 0;
+
+ /* If the port is not shared, all addresses should be physical */
+ if (!pd->port_subport_cnt)
+ goto bail;
+
+ dd = pd->port_dd;
+ size = pd->port_rcvegrbuf_chunks * pd->port_rcvegrbuf_size;
+
+ /*
+ * Each process has all the subport uregbase, rcvhdrq, and
+ * rcvegrbufs mmapped - as an array for all the processes,
+ * and also separately for this process.
+ */
+ if (pgaddr == cvt_kvaddr(pd->subport_uregbase)) {
+ addr = pd->subport_uregbase;
+ size = PAGE_SIZE * pd->port_subport_cnt;
+ } else if (pgaddr == cvt_kvaddr(pd->subport_rcvhdr_base)) {
+ addr = pd->subport_rcvhdr_base;
+ size = pd->port_rcvhdrq_size * pd->port_subport_cnt;
+ } else if (pgaddr == cvt_kvaddr(pd->subport_rcvegrbuf)) {
+ addr = pd->subport_rcvegrbuf;
+ size *= pd->port_subport_cnt;
+ } else if (pgaddr == cvt_kvaddr(pd->subport_uregbase +
+ PAGE_SIZE * subport)) {
+ addr = pd->subport_uregbase + PAGE_SIZE * subport;
+ size = PAGE_SIZE;
+ } else if (pgaddr == cvt_kvaddr(pd->subport_rcvhdr_base +
+ pd->port_rcvhdrq_size * subport)) {
+ addr = pd->subport_rcvhdr_base +
+ pd->port_rcvhdrq_size * subport;
+ size = pd->port_rcvhdrq_size;
+ } else if (pgaddr == cvt_kvaddr(pd->subport_rcvegrbuf +
+ size * subport)) {
+ addr = pd->subport_rcvegrbuf + size * subport;
+ /* rcvegrbufs are read-only on the slave */
+ if (vma->vm_flags & VM_WRITE) {
+ dev_info(&dd->pcidev->dev,
+ "Can't map eager buffers as "
+ "writable (flags=%lx)\n", vma->vm_flags);
+ ret = -EPERM;
+ goto bail;
+ }
+ /*
+ * Don't allow permission to later change to writeable
+ * with mprotect.
+ */
+ vma->vm_flags &= ~VM_MAYWRITE;
+ } else {
+ goto bail;
+ }
+ len = vma->vm_end - vma->vm_start;
+ if (len > size) {
+ ipath_cdbg(MM, "FAIL: reqlen %lx > %zx\n", len, size);
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT;
+ vma->vm_ops = &ipath_file_vm_ops;
+ vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+ ret = 1;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_mmap - mmap various structures into user space
+ * @fp: the file pointer
+ * @vma: the VM area
+ *
+ * We use this to have a shared buffer between the kernel and the user code
+ * for the rcvhdr queue, egr buffers, and the per-port user regs and pio
+ * buffers in the chip. We have the open and close entries so we can bump
+ * the ref count and keep the driver from being unloaded while still mapped.
+ */
+static int ipath_mmap(struct file *fp, struct vm_area_struct *vma)
+{
+ struct ipath_portdata *pd;
+ struct ipath_devdata *dd;
+ u64 pgaddr, ureg;
+ unsigned piobufs, piocnt;
+ int ret;
+
+ pd = port_fp(fp);
+ if (!pd) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ dd = pd->port_dd;
+
+ /*
+ * This is the ipath_do_user_init() code, mapping the shared buffers
+ * into the user process. The address referred to by vm_pgoff is the
+ * file offset passed via mmap(). For shared ports, this is the
+ * kernel vmalloc() address of the pages to share with the master.
+ * For non-shared or master ports, this is a physical address.
+ * We only do one mmap for each space mapped.
+ */
+ pgaddr = vma->vm_pgoff << PAGE_SHIFT;
+
+ /*
+ * Check for 0 in case one of the allocations failed, but user
+ * called mmap anyway.
+ */
+ if (!pgaddr) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ ipath_cdbg(MM, "pgaddr %llx vm_start=%lx len %lx port %u:%u:%u\n",
+ (unsigned long long) pgaddr, vma->vm_start,
+ vma->vm_end - vma->vm_start, dd->ipath_unit,
+ pd->port_port, subport_fp(fp));
+
+ /*
+ * Physical addresses must fit in 40 bits for our hardware.
+	 * Check for kernel virtual addresses first; anything else must
+ * match a HW or memory address.
+ */
+ ret = mmap_kvaddr(vma, pgaddr, pd, subport_fp(fp));
+ if (ret) {
+ if (ret > 0)
+ ret = 0;
+ goto bail;
+ }
+
+ ureg = dd->ipath_uregbase + dd->ipath_ureg_align * pd->port_port;
+ if (!pd->port_subport_cnt) {
+ /* port is not shared */
+ piocnt = pd->port_piocnt;
+ piobufs = pd->port_piobufs;
+ } else if (!subport_fp(fp)) {
+ /* caller is the master */
+ piocnt = (pd->port_piocnt / pd->port_subport_cnt) +
+ (pd->port_piocnt % pd->port_subport_cnt);
+ piobufs = pd->port_piobufs +
+ dd->ipath_palign * (pd->port_piocnt - piocnt);
+ } else {
+ unsigned slave = subport_fp(fp) - 1;
+
+ /* caller is a slave */
+ piocnt = pd->port_piocnt / pd->port_subport_cnt;
+ piobufs = pd->port_piobufs + dd->ipath_palign * piocnt * slave;
+ }
+
+ if (pgaddr == ureg)
+ ret = mmap_ureg(vma, dd, ureg);
+ else if (pgaddr == piobufs)
+ ret = mmap_piobufs(vma, dd, pd, piobufs, piocnt);
+ else if (pgaddr == dd->ipath_pioavailregs_phys)
+ /* in-memory copy of pioavail registers */
+ ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0,
+ (void *) dd->ipath_pioavailregs_dma,
+ "pioavail registers");
+ else if (pgaddr == pd->port_rcvegr_phys)
+ ret = mmap_rcvegrbufs(vma, pd);
+ else if (pgaddr == (u64) pd->port_rcvhdrq_phys)
+ /*
+ * The rcvhdrq itself; readonly except on HT (so have
+ * to allow writable mapping), multiple pages, contiguous
+ * from an i/o perspective.
+ */
+ ret = ipath_mmap_mem(vma, pd, pd->port_rcvhdrq_size, 1,
+ pd->port_rcvhdrq,
+ "rcvhdrq");
+ else if (pgaddr == (u64) pd->port_rcvhdrqtailaddr_phys)
+ /* in-memory copy of rcvhdrq tail register */
+ ret = ipath_mmap_mem(vma, pd, PAGE_SIZE, 0,
+ pd->port_rcvhdrtail_kvaddr,
+ "rcvhdrq tail");
+ else
+ ret = -EINVAL;
+
+ vma->vm_private_data = NULL;
+
+ if (ret < 0)
+ dev_info(&dd->pcidev->dev,
+ "Failure %d on off %llx len %lx\n",
+ -ret, (unsigned long long)pgaddr,
+ vma->vm_end - vma->vm_start);
+bail:
+ return ret;
+}
+
+static unsigned ipath_poll_hdrqfull(struct ipath_portdata *pd)
+{
+ unsigned pollflag = 0;
+
+ if ((pd->poll_type & IPATH_POLL_TYPE_OVERFLOW) &&
+ pd->port_hdrqfull != pd->port_hdrqfull_poll) {
+ pollflag |= POLLIN | POLLRDNORM;
+ pd->port_hdrqfull_poll = pd->port_hdrqfull;
+ }
+
+ return pollflag;
+}
+
+static unsigned int ipath_poll_urgent(struct ipath_portdata *pd,
+ struct file *fp,
+ struct poll_table_struct *pt)
+{
+ unsigned pollflag = 0;
+ struct ipath_devdata *dd;
+
+ dd = pd->port_dd;
+
+ /* variable access in ipath_poll_hdrqfull() needs this */
+ rmb();
+ pollflag = ipath_poll_hdrqfull(pd);
+
+ if (pd->port_urgent != pd->port_urgent_poll) {
+ pollflag |= POLLIN | POLLRDNORM;
+ pd->port_urgent_poll = pd->port_urgent;
+ }
+
+ if (!pollflag) {
+ /* this saves a spin_lock/unlock in interrupt handler... */
+ set_bit(IPATH_PORT_WAITING_URG, &pd->port_flag);
+ /* flush waiting flag so don't miss an event... */
+ wmb();
+ poll_wait(fp, &pd->port_wait, pt);
+ }
+
+ return pollflag;
+}
+
+static unsigned int ipath_poll_next(struct ipath_portdata *pd,
+ struct file *fp,
+ struct poll_table_struct *pt)
+{
+ u32 head;
+ u32 tail;
+ unsigned pollflag = 0;
+ struct ipath_devdata *dd;
+
+ dd = pd->port_dd;
+
+ /* variable access in ipath_poll_hdrqfull() needs this */
+ rmb();
+ pollflag = ipath_poll_hdrqfull(pd);
+
+ head = ipath_read_ureg32(dd, ur_rcvhdrhead, pd->port_port);
+ if (pd->port_rcvhdrtail_kvaddr)
+ tail = ipath_get_rcvhdrtail(pd);
+ else
+ tail = ipath_read_ureg32(dd, ur_rcvhdrtail, pd->port_port);
+
+ if (head != tail)
+ pollflag |= POLLIN | POLLRDNORM;
+ else {
+ /* this saves a spin_lock/unlock in interrupt handler */
+ set_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag);
+ /* flush waiting flag so we don't miss an event */
+ wmb();
+
+ set_bit(pd->port_port + dd->ipath_r_intravail_shift,
+ &dd->ipath_rcvctrl);
+
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+ dd->ipath_rcvctrl);
+
+ if (dd->ipath_rhdrhead_intr_off) /* arm rcv interrupt */
+ ipath_write_ureg(dd, ur_rcvhdrhead,
+ dd->ipath_rhdrhead_intr_off | head,
+ pd->port_port);
+
+ poll_wait(fp, &pd->port_wait, pt);
+ }
+
+ return pollflag;
+}
+
+static unsigned int ipath_poll(struct file *fp,
+ struct poll_table_struct *pt)
+{
+ struct ipath_portdata *pd;
+ unsigned pollflag;
+
+ pd = port_fp(fp);
+ if (!pd)
+ pollflag = 0;
+ else if (pd->poll_type & IPATH_POLL_TYPE_URGENT)
+ pollflag = ipath_poll_urgent(pd, fp, pt);
+ else
+ pollflag = ipath_poll_next(pd, fp, pt);
+
+ return pollflag;
+}
+
+static int ipath_supports_subports(int user_swmajor, int user_swminor)
+{
+ /* no subport implementation prior to software version 1.3 */
+ return (user_swmajor > 1) || (user_swminor >= 3);
+}
+
+static int ipath_compatible_subports(int user_swmajor, int user_swminor)
+{
+ /* this code is written long-hand for clarity */
+ if (IPATH_USER_SWMAJOR != user_swmajor) {
+ /* no promise of compatibility if major mismatch */
+ return 0;
+ }
+ if (IPATH_USER_SWMAJOR == 1) {
+ switch (IPATH_USER_SWMINOR) {
+ case 0:
+ case 1:
+ case 2:
+ /* no subport implementation so cannot be compatible */
+ return 0;
+ case 3:
+ /* 3 is only compatible with itself */
+ return user_swminor == 3;
+ default:
+ /* >= 4 are compatible (or are expected to be) */
+ return user_swminor >= 4;
+ }
+ }
+ /* make no promises yet for future major versions */
+ return 0;
+}
+
+static int init_subports(struct ipath_devdata *dd,
+ struct ipath_portdata *pd,
+ const struct ipath_user_info *uinfo)
+{
+ int ret = 0;
+ unsigned num_subports;
+ size_t size;
+
+ /*
+ * If the user is requesting zero subports,
+ * skip the subport allocation.
+ */
+ if (uinfo->spu_subport_cnt <= 0)
+ goto bail;
+
+ /* Self-consistency check for ipath_compatible_subports() */
+ if (ipath_supports_subports(IPATH_USER_SWMAJOR, IPATH_USER_SWMINOR) &&
+ !ipath_compatible_subports(IPATH_USER_SWMAJOR,
+ IPATH_USER_SWMINOR)) {
+ dev_info(&dd->pcidev->dev,
+ "Inconsistent ipath_compatible_subports()\n");
+ goto bail;
+ }
+
+ /* Check for subport compatibility */
+ if (!ipath_compatible_subports(uinfo->spu_userversion >> 16,
+ uinfo->spu_userversion & 0xffff)) {
+ dev_info(&dd->pcidev->dev,
+ "Mismatched user version (%d.%d) and driver "
+ "version (%d.%d) while port sharing. Ensure "
+ "that driver and library are from the same "
+ "release.\n",
+ (int) (uinfo->spu_userversion >> 16),
+ (int) (uinfo->spu_userversion & 0xffff),
+ IPATH_USER_SWMAJOR,
+ IPATH_USER_SWMINOR);
+ goto bail;
+ }
+ if (uinfo->spu_subport_cnt > INFINIPATH_MAX_SUBPORT) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ num_subports = uinfo->spu_subport_cnt;
+ pd->subport_uregbase = vzalloc(PAGE_SIZE * num_subports);
+ if (!pd->subport_uregbase) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+ /* Note: pd->port_rcvhdrq_size isn't initialized yet. */
+ size = ALIGN(dd->ipath_rcvhdrcnt * dd->ipath_rcvhdrentsize *
+ sizeof(u32), PAGE_SIZE) * num_subports;
+ pd->subport_rcvhdr_base = vzalloc(size);
+ if (!pd->subport_rcvhdr_base) {
+ ret = -ENOMEM;
+ goto bail_ureg;
+ }
+
+ pd->subport_rcvegrbuf = vzalloc(pd->port_rcvegrbuf_chunks *
+ pd->port_rcvegrbuf_size *
+ num_subports);
+ if (!pd->subport_rcvegrbuf) {
+ ret = -ENOMEM;
+ goto bail_rhdr;
+ }
+
+ pd->port_subport_cnt = uinfo->spu_subport_cnt;
+ pd->port_subport_id = uinfo->spu_subport_id;
+ pd->active_slaves = 1;
+ set_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag);
+ goto bail;
+
+bail_rhdr:
+ vfree(pd->subport_rcvhdr_base);
+bail_ureg:
+ vfree(pd->subport_uregbase);
+ pd->subport_uregbase = NULL;
+bail:
+ return ret;
+}
+
+static int try_alloc_port(struct ipath_devdata *dd, int port,
+ struct file *fp,
+ const struct ipath_user_info *uinfo)
+{
+ struct ipath_portdata *pd;
+ int ret;
+
+ if (!(pd = dd->ipath_pd[port])) {
+ void *ptmp;
+
+ pd = kzalloc(sizeof(struct ipath_portdata), GFP_KERNEL);
+
+ /*
+ * Allocate memory for use in ipath_tid_update() just once
+ * at open, not per call. Reduces cost of expected send
+ * setup.
+ */
+ ptmp = kmalloc(dd->ipath_rcvtidcnt * sizeof(u16) +
+ dd->ipath_rcvtidcnt * sizeof(struct page **),
+ GFP_KERNEL);
+ if (!pd || !ptmp) {
+ ipath_dev_err(dd, "Unable to allocate portdata "
+ "memory, failing open\n");
+ ret = -ENOMEM;
+ kfree(pd);
+ kfree(ptmp);
+ goto bail;
+ }
+ dd->ipath_pd[port] = pd;
+ dd->ipath_pd[port]->port_port = port;
+ dd->ipath_pd[port]->port_dd = dd;
+ dd->ipath_pd[port]->port_tid_pg_list = ptmp;
+ init_waitqueue_head(&dd->ipath_pd[port]->port_wait);
+ }
+ if (!pd->port_cnt) {
+ pd->userversion = uinfo->spu_userversion;
+ init_user_egr_sizes(pd);
+ if ((ret = init_subports(dd, pd, uinfo)) != 0)
+ goto bail;
+ ipath_cdbg(PROC, "%s[%u] opened unit:port %u:%u\n",
+ current->comm, current->pid, dd->ipath_unit,
+ port);
+ pd->port_cnt = 1;
+ port_fp(fp) = pd;
+ pd->port_pid = get_pid(task_pid(current));
+ strlcpy(pd->port_comm, current->comm, sizeof(pd->port_comm));
+ ipath_stats.sps_ports++;
+ ret = 0;
+ } else
+ ret = -EBUSY;
+
+bail:
+ return ret;
+}
+
+static inline int usable(struct ipath_devdata *dd)
+{
+ return dd &&
+ (dd->ipath_flags & IPATH_PRESENT) &&
+ dd->ipath_kregbase &&
+ dd->ipath_lid &&
+ !(dd->ipath_flags & (IPATH_LINKDOWN | IPATH_DISABLED
+ | IPATH_LINKUNK));
+}
+
+static int find_free_port(int unit, struct file *fp,
+ const struct ipath_user_info *uinfo)
+{
+ struct ipath_devdata *dd = ipath_lookup(unit);
+ int ret, i;
+
+ if (!dd) {
+ ret = -ENODEV;
+ goto bail;
+ }
+
+ if (!usable(dd)) {
+ ret = -ENETDOWN;
+ goto bail;
+ }
+
+ for (i = 1; i < dd->ipath_cfgports; i++) {
+ ret = try_alloc_port(dd, i, fp, uinfo);
+ if (ret != -EBUSY)
+ goto bail;
+ }
+ ret = -EBUSY;
+
+bail:
+ return ret;
+}
+
+static int find_best_unit(struct file *fp,
+ const struct ipath_user_info *uinfo)
+{
+ int ret = 0, i, prefunit = -1, devmax;
+ int maxofallports, npresent, nup;
+ int ndev;
+
+ devmax = ipath_count_units(&npresent, &nup, &maxofallports);
+
+ /*
+ * This code is present to allow a knowledgeable person to
+ * specify the layout of processes to processors before opening
+ * this driver, and then we'll assign the process to the "closest"
+ * InfiniPath chip to that processor (we assume reasonable connectivity,
+ * for now). This code assumes that if affinity has been set
+ * before this point, that at most one cpu is set; for now this
+ * is reasonable. I check for both cpumask_empty() and cpumask_full(),
+ * in case some kernel variant sets none of the bits when no
+ * affinity is set. 2.6.11 and 12 kernels have all present
+ * cpus set. Some day we'll have to fix it up further to handle
+ * a cpu subset. This algorithm fails for two HT chips connected
+ * in tunnel fashion. Eventually this needs real topology
+ * information. There may be some issues with dual core numbering
+ * as well. This needs more work prior to release.
+ */
+ if (!cpumask_empty(tsk_cpus_allowed(current)) &&
+ !cpumask_full(tsk_cpus_allowed(current))) {
+ int ncpus = num_online_cpus(), curcpu = -1, nset = 0;
+ get_online_cpus();
+ for_each_online_cpu(i)
+ if (cpumask_test_cpu(i, tsk_cpus_allowed(current))) {
+ ipath_cdbg(PROC, "%s[%u] affinity set for "
+ "cpu %d/%d\n", current->comm,
+ current->pid, i, ncpus);
+ curcpu = i;
+ nset++;
+ }
+ put_online_cpus();
+ if (curcpu != -1 && nset != ncpus) {
+ if (npresent) {
+ prefunit = curcpu / (ncpus / npresent);
+ ipath_cdbg(PROC,"%s[%u] %d chips, %d cpus, "
+ "%d cpus/chip, select unit %d\n",
+ current->comm, current->pid,
+ npresent, ncpus, ncpus / npresent,
+ prefunit);
+ }
+ }
+ }
+
+	/*
+	 * User ports start at 1, kernel port is 0.
+	 * For now, we do round-robin access across all chips.
+	 */
+
+ if (prefunit != -1)
+ devmax = prefunit + 1;
+recheck:
+ for (i = 1; i < maxofallports; i++) {
+ for (ndev = prefunit != -1 ? prefunit : 0; ndev < devmax;
+ ndev++) {
+ struct ipath_devdata *dd = ipath_lookup(ndev);
+
+ if (!usable(dd))
+ continue; /* can't use this unit */
+ if (i >= dd->ipath_cfgports)
+ /*
+ * Maxed out on users of this unit. Try
+ * next.
+ */
+ continue;
+ ret = try_alloc_port(dd, i, fp, uinfo);
+ if (!ret)
+ goto done;
+ }
+ }
+
+ if (npresent) {
+ if (nup == 0) {
+ ret = -ENETDOWN;
+ ipath_dbg("No ports available (none initialized "
+ "and ready)\n");
+ } else {
+ if (prefunit > 0) {
+ /* if started above 0, retry from 0 */
+ ipath_cdbg(PROC,
+ "%s[%u] no ports on prefunit "
+ "%d, clear and re-check\n",
+ current->comm, current->pid,
+ prefunit);
+ devmax = ipath_count_units(NULL, NULL,
+ NULL);
+ prefunit = -1;
+ goto recheck;
+ }
+ ret = -EBUSY;
+ ipath_dbg("No ports available\n");
+ }
+ } else {
+ ret = -ENXIO;
+ ipath_dbg("No boards found\n");
+ }
+
+done:
+ return ret;
+}
+
+static int find_shared_port(struct file *fp,
+ const struct ipath_user_info *uinfo)
+{
+ int devmax, ndev, i;
+ int ret = 0;
+
+ devmax = ipath_count_units(NULL, NULL, NULL);
+
+ for (ndev = 0; ndev < devmax; ndev++) {
+ struct ipath_devdata *dd = ipath_lookup(ndev);
+
+ if (!usable(dd))
+ continue;
+ for (i = 1; i < dd->ipath_cfgports; i++) {
+ struct ipath_portdata *pd = dd->ipath_pd[i];
+
+ /* Skip ports which are not yet open */
+ if (!pd || !pd->port_cnt)
+ continue;
+ /* Skip port if it doesn't match the requested one */
+ if (pd->port_subport_id != uinfo->spu_subport_id)
+ continue;
+ /* Verify the sharing process matches the master */
+ if (pd->port_subport_cnt != uinfo->spu_subport_cnt ||
+ pd->userversion != uinfo->spu_userversion ||
+ pd->port_cnt >= pd->port_subport_cnt) {
+ ret = -EINVAL;
+ goto done;
+ }
+ port_fp(fp) = pd;
+ subport_fp(fp) = pd->port_cnt++;
+ pd->port_subpid[subport_fp(fp)] =
+ get_pid(task_pid(current));
+ tidcursor_fp(fp) = 0;
+ pd->active_slaves |= 1 << subport_fp(fp);
+ ipath_cdbg(PROC,
+ "%s[%u] %u sharing %s[%u] unit:port %u:%u\n",
+ current->comm, current->pid,
+ subport_fp(fp),
+ pd->port_comm, pid_nr(pd->port_pid),
+ dd->ipath_unit, pd->port_port);
+ ret = 1;
+ goto done;
+ }
+ }
+
+done:
+ return ret;
+}
+
+static int ipath_open(struct inode *in, struct file *fp)
+{
+ /* The real work is performed later in ipath_assign_port() */
+ fp->private_data = kzalloc(sizeof(struct ipath_filedata), GFP_KERNEL);
+ return fp->private_data ? 0 : -ENOMEM;
+}
+
+/* Get port early, so we can set affinity prior to memory allocation */
+static int ipath_assign_port(struct file *fp,
+ const struct ipath_user_info *uinfo)
+{
+ int ret;
+ int i_minor;
+ unsigned swmajor, swminor;
+
+ /* Check to be sure we haven't already initialized this file */
+ if (port_fp(fp)) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ /* for now, if major version is different, bail */
+ swmajor = uinfo->spu_userversion >> 16;
+ if (swmajor != IPATH_USER_SWMAJOR) {
+ ipath_dbg("User major version %d not same as driver "
+ "major %d\n", uinfo->spu_userversion >> 16,
+ IPATH_USER_SWMAJOR);
+ ret = -ENODEV;
+ goto done;
+ }
+
+ swminor = uinfo->spu_userversion & 0xffff;
+ if (swminor != IPATH_USER_SWMINOR)
+ ipath_dbg("User minor version %d not same as driver "
+ "minor %d\n", swminor, IPATH_USER_SWMINOR);
+
+ mutex_lock(&ipath_mutex);
+
+ if (ipath_compatible_subports(swmajor, swminor) &&
+ uinfo->spu_subport_cnt &&
+ (ret = find_shared_port(fp, uinfo))) {
+ if (ret > 0)
+ ret = 0;
+ goto done_chk_sdma;
+ }
+
+ i_minor = iminor(file_inode(fp)) - IPATH_USER_MINOR_BASE;
+ ipath_cdbg(VERBOSE, "open on dev %lx (minor %d)\n",
+ (long)file_inode(fp)->i_rdev, i_minor);
+
+ if (i_minor)
+ ret = find_free_port(i_minor - 1, fp, uinfo);
+ else
+ ret = find_best_unit(fp, uinfo);
+
+done_chk_sdma:
+ if (!ret) {
+ struct ipath_filedata *fd = fp->private_data;
+ const struct ipath_portdata *pd = fd->pd;
+ const struct ipath_devdata *dd = pd->port_dd;
+
+ fd->pq = ipath_user_sdma_queue_create(&dd->pcidev->dev,
+ dd->ipath_unit,
+ pd->port_port,
+ fd->subport);
+
+ if (!fd->pq)
+ ret = -ENOMEM;
+ }
+
+ mutex_unlock(&ipath_mutex);
+
+done:
+ return ret;
+}
+
+
+static int ipath_do_user_init(struct file *fp,
+ const struct ipath_user_info *uinfo)
+{
+ int ret;
+ struct ipath_portdata *pd = port_fp(fp);
+ struct ipath_devdata *dd;
+ u32 head32;
+
+ /* Subports don't need to initialize anything since master did it. */
+ if (subport_fp(fp)) {
+ ret = wait_event_interruptible(pd->port_wait,
+ !test_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag));
+ goto done;
+ }
+
+ dd = pd->port_dd;
+
+ if (uinfo->spu_rcvhdrsize) {
+ ret = ipath_setrcvhdrsize(dd, uinfo->spu_rcvhdrsize);
+ if (ret)
+ goto done;
+ }
+
+ /* for now we do nothing with rcvhdrcnt: uinfo->spu_rcvhdrcnt */
+
+ /* some ports may get extra buffers, calculate that here */
+ if (pd->port_port <= dd->ipath_ports_extrabuf)
+ pd->port_piocnt = dd->ipath_pbufsport + 1;
+ else
+ pd->port_piocnt = dd->ipath_pbufsport;
+
+ /* for right now, kernel piobufs are at end, so port 1 is at 0 */
+ if (pd->port_port <= dd->ipath_ports_extrabuf)
+ pd->port_pio_base = (dd->ipath_pbufsport + 1)
+ * (pd->port_port - 1);
+ else
+ pd->port_pio_base = dd->ipath_ports_extrabuf +
+ dd->ipath_pbufsport * (pd->port_port - 1);
+ pd->port_piobufs = dd->ipath_piobufbase +
+ pd->port_pio_base * dd->ipath_palign;
+ ipath_cdbg(VERBOSE, "piobuf base for port %u is 0x%x, piocnt %u,"
+ " first pio %u\n", pd->port_port, pd->port_piobufs,
+ pd->port_piocnt, pd->port_pio_base);
+ ipath_chg_pioavailkernel(dd, pd->port_pio_base, pd->port_piocnt, 0);
+
+ /*
+ * Now allocate the rcvhdr Q and eager TIDs; skip the TID
+	 * array for the time being. If pd->port_port > chip-supported,
+	 * we will someday need extra logic here to handle the overflow
+	 * through port 0.
+ */
+ ret = ipath_create_rcvhdrq(dd, pd);
+ if (!ret)
+ ret = ipath_create_user_egr(pd);
+ if (ret)
+ goto done;
+
+ /*
+ * set the eager head register for this port to the current values
+ * of the tail pointers, since we don't know if they were
+ * updated on last use of the port.
+ */
+ head32 = ipath_read_ureg32(dd, ur_rcvegrindextail, pd->port_port);
+ ipath_write_ureg(dd, ur_rcvegrindexhead, head32, pd->port_port);
+ pd->port_lastrcvhdrqtail = -1;
+ ipath_cdbg(VERBOSE, "Wrote port%d egrhead %x from tail regs\n",
+ pd->port_port, head32);
+ pd->port_tidcursor = 0; /* start at beginning after open */
+
+ /* initialize poll variables... */
+ pd->port_urgent = 0;
+ pd->port_urgent_poll = 0;
+ pd->port_hdrqfull_poll = pd->port_hdrqfull;
+
+ /*
+ * Now enable the port for receive.
+	 * For chips that DMA the tail register to memory when it changes
+	 * (and when the update bit transitions from 0 to 1), we turn the
+	 * tail update off and then back on.
+ * This will (very briefly) affect any other open ports, but the
+ * duration is very short, and therefore isn't an issue. We
+ * explicitly set the in-memory tail copy to 0 beforehand, so we
+ * don't have to wait to be sure the DMA update has happened
+ * (chip resets head/tail to 0 on transition to enable).
+ */
+ set_bit(dd->ipath_r_portenable_shift + pd->port_port,
+ &dd->ipath_rcvctrl);
+ if (!(dd->ipath_flags & IPATH_NODMA_RTAIL)) {
+ if (pd->port_rcvhdrtail_kvaddr)
+ ipath_clear_rcvhdrtail(pd);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+ dd->ipath_rcvctrl &
+ ~(1ULL << dd->ipath_r_tailupd_shift));
+ }
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+ dd->ipath_rcvctrl);
+ /* Notify any waiting slaves */
+ if (pd->port_subport_cnt) {
+ clear_bit(IPATH_PORT_MASTER_UNINIT, &pd->port_flag);
+ wake_up(&pd->port_wait);
+ }
+done:
+ return ret;
+}
+
+/**
+ * unlock_expected_tids - unlock any expected TID entries the port still had in use
+ * @pd: port
+ *
+ * We don't actually update the chip here, because we do a bulk update
+ * below, using ipath_f_clear_tids.
+ */
+static void unlock_expected_tids(struct ipath_portdata *pd)
+{
+ struct ipath_devdata *dd = pd->port_dd;
+ int port_tidbase = pd->port_port * dd->ipath_rcvtidcnt;
+ int i, cnt = 0, maxtid = port_tidbase + dd->ipath_rcvtidcnt;
+
+ ipath_cdbg(VERBOSE, "Port %u unlocking any locked expTID pages\n",
+ pd->port_port);
+ for (i = port_tidbase; i < maxtid; i++) {
+ struct page *ps = dd->ipath_pageshadow[i];
+
+ if (!ps)
+ continue;
+
+ dd->ipath_pageshadow[i] = NULL;
+ pci_unmap_page(dd->pcidev, dd->ipath_physshadow[i],
+ PAGE_SIZE, PCI_DMA_FROMDEVICE);
+ ipath_release_user_pages_on_close(&ps, 1);
+ cnt++;
+ ipath_stats.sps_pageunlocks++;
+ }
+ if (cnt)
+		ipath_cdbg(VERBOSE, "Port %u unlocked %u expTID entries\n",
+ pd->port_port, cnt);
+
+ if (ipath_stats.sps_pagelocks || ipath_stats.sps_pageunlocks)
+ ipath_cdbg(VERBOSE, "%llu pages locked, %llu unlocked\n",
+ (unsigned long long) ipath_stats.sps_pagelocks,
+ (unsigned long long)
+ ipath_stats.sps_pageunlocks);
+}
+
+static int ipath_close(struct inode *in, struct file *fp)
+{
+ int ret = 0;
+ struct ipath_filedata *fd;
+ struct ipath_portdata *pd;
+ struct ipath_devdata *dd;
+ unsigned long flags;
+ unsigned port;
+ struct pid *pid;
+
+ ipath_cdbg(VERBOSE, "close on dev %lx, private data %p\n",
+ (long)in->i_rdev, fp->private_data);
+
+ mutex_lock(&ipath_mutex);
+
+ fd = fp->private_data;
+ fp->private_data = NULL;
+ pd = fd->pd;
+ if (!pd) {
+ mutex_unlock(&ipath_mutex);
+ goto bail;
+ }
+
+ dd = pd->port_dd;
+
+ /* drain user sdma queue */
+ ipath_user_sdma_queue_drain(dd, fd->pq);
+ ipath_user_sdma_queue_destroy(fd->pq);
+
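+	/*
+	 * If other subport users of this port remain, just detach this
+	 * file from the port and leave the hardware alone.
+	 */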
+ if (--pd->port_cnt) {
+ /*
+ * XXX If the master closes the port before the slave(s),
+ * revoke the mmap for the eager receive queue so
+ * the slave(s) don't wait for receive data forever.
+ */
+ pd->active_slaves &= ~(1 << fd->subport);
+ put_pid(pd->port_subpid[fd->subport]);
+ pd->port_subpid[fd->subport] = NULL;
+ mutex_unlock(&ipath_mutex);
+ goto bail;
+ }
+ /* early; no interrupt users after this */
+ spin_lock_irqsave(&dd->ipath_uctxt_lock, flags);
+ port = pd->port_port;
+ dd->ipath_pd[port] = NULL;
+ pid = pd->port_pid;
+ pd->port_pid = NULL;
+ spin_unlock_irqrestore(&dd->ipath_uctxt_lock, flags);
+
+ if (pd->port_rcvwait_to || pd->port_piowait_to
+ || pd->port_rcvnowait || pd->port_pionowait) {
+		ipath_cdbg(VERBOSE, "port%u, %u rcv, %u pio wait timeouts; "
+			   "%u rcv nowait, %u pio nowait\n",
+ pd->port_port, pd->port_rcvwait_to,
+ pd->port_piowait_to, pd->port_rcvnowait,
+ pd->port_pionowait);
+ pd->port_rcvwait_to = pd->port_piowait_to =
+ pd->port_rcvnowait = pd->port_pionowait = 0;
+ }
+ if (pd->port_flag) {
+ ipath_cdbg(PROC, "port %u port_flag set: 0x%lx\n",
+ pd->port_port, pd->port_flag);
+ pd->port_flag = 0;
+ }
+
+ if (dd->ipath_kregbase) {
+ /* atomically clear receive enable port and intr avail. */
+ clear_bit(dd->ipath_r_portenable_shift + port,
+ &dd->ipath_rcvctrl);
+ clear_bit(pd->port_port + dd->ipath_r_intravail_shift,
+ &dd->ipath_rcvctrl);
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+ dd->ipath_rcvctrl);
+ /* and read back from chip to be sure that nothing
+ * else is in flight when we do the rest */
+ (void)ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+
+ /* clean up the pkeys for this port user */
+ ipath_clean_part_key(pd, dd);
+ /*
+ * be paranoid, and never write 0's to these, just use an
+ * unused part of the port 0 tail page. Of course,
+ * rcvhdraddr points to a large chunk of memory, so this
+ * could still trash things, but at least it won't trash
+ * page 0, and by disabling the port, it should stop "soon",
+		 * even if a packet or two is already in flight after we
+ * disabled the port.
+ */
+ ipath_write_kreg_port(dd,
+ dd->ipath_kregs->kr_rcvhdrtailaddr, port,
+ dd->ipath_dummy_hdrq_phys);
+ ipath_write_kreg_port(dd, dd->ipath_kregs->kr_rcvhdraddr,
+ pd->port_port, dd->ipath_dummy_hdrq_phys);
+
+ ipath_disarm_piobufs(dd, pd->port_pio_base, pd->port_piocnt);
+ ipath_chg_pioavailkernel(dd, pd->port_pio_base,
+ pd->port_piocnt, 1);
+
+ dd->ipath_f_clear_tids(dd, pd->port_port);
+
+ if (dd->ipath_pageshadow)
+ unlock_expected_tids(pd);
+ ipath_stats.sps_ports--;
+ ipath_cdbg(PROC, "%s[%u] closed port %u:%u\n",
+ pd->port_comm, pid_nr(pid),
+ dd->ipath_unit, port);
+ }
+
+ put_pid(pid);
+ mutex_unlock(&ipath_mutex);
+ ipath_free_pddata(dd, pd); /* after releasing the mutex */
+
+bail:
+ kfree(fd);
+ return ret;
+}
+
+static int ipath_port_info(struct ipath_portdata *pd, u16 subport,
+ struct ipath_port_info __user *uinfo)
+{
+ struct ipath_port_info info;
+ int nup;
+ int ret;
+ size_t sz;
+
+ (void) ipath_count_units(NULL, &nup, NULL);
+ info.num_active = nup;
+ info.unit = pd->port_dd->ipath_unit;
+ info.port = pd->port_port;
+ info.subport = subport;
+ /* Don't return new fields if old library opened the port. */
+ if (ipath_supports_subports(pd->userversion >> 16,
+ pd->userversion & 0xffff)) {
+ /* Number of user ports available for this device. */
+ info.num_ports = pd->port_dd->ipath_cfgports - 1;
+ info.num_subports = pd->port_subport_cnt;
+ sz = sizeof(info);
+ } else
+ sz = sizeof(info) - 2 * sizeof(u16);
+
+ if (copy_to_user(uinfo, &info, sz)) {
+ ret = -EFAULT;
+ goto bail;
+ }
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+static int ipath_get_slave_info(struct ipath_portdata *pd,
+ void __user *slave_mask_addr)
+{
+ int ret = 0;
+
+ if (copy_to_user(slave_mask_addr, &pd->active_slaves, sizeof(u32)))
+ ret = -EFAULT;
+ return ret;
+}
+
+static int ipath_sdma_get_inflight(struct ipath_user_sdma_queue *pq,
+ u32 __user *inflightp)
+{
+ const u32 val = ipath_user_sdma_inflight_counter(pq);
+
+ if (put_user(val, inflightp))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int ipath_sdma_get_complete(struct ipath_devdata *dd,
+ struct ipath_user_sdma_queue *pq,
+ u32 __user *completep)
+{
+ u32 val;
+ int err;
+
+ err = ipath_user_sdma_make_progress(dd, pq);
+ if (err < 0)
+ return err;
+
+ val = ipath_user_sdma_complete_counter(pq);
+ if (put_user(val, completep))
+ return -EFAULT;
+
+ return 0;
+}
+
+static ssize_t ipath_write(struct file *fp, const char __user *data,
+ size_t count, loff_t *off)
+{
+ const struct ipath_cmd __user *ucmd;
+ struct ipath_portdata *pd;
+ const void __user *src;
+ size_t consumed, copy;
+ struct ipath_cmd cmd;
+ ssize_t ret = 0;
+ void *dest;
+
+ if (count < sizeof(cmd.type)) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ ucmd = (const struct ipath_cmd __user *) data;
+
+ if (copy_from_user(&cmd.type, &ucmd->type, sizeof(cmd.type))) {
+ ret = -EFAULT;
+ goto bail;
+ }
+
+ consumed = sizeof(cmd.type);
+
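+	/*
+	 * First pass: based on the command type, determine how much of
+	 * the type-specific payload to copy from user space and where it
+	 * lives within struct ipath_cmd.
+	 */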
+ switch (cmd.type) {
+ case IPATH_CMD_ASSIGN_PORT:
+ case __IPATH_CMD_USER_INIT:
+ case IPATH_CMD_USER_INIT:
+ copy = sizeof(cmd.cmd.user_info);
+ dest = &cmd.cmd.user_info;
+ src = &ucmd->cmd.user_info;
+ break;
+ case IPATH_CMD_RECV_CTRL:
+ copy = sizeof(cmd.cmd.recv_ctrl);
+ dest = &cmd.cmd.recv_ctrl;
+ src = &ucmd->cmd.recv_ctrl;
+ break;
+ case IPATH_CMD_PORT_INFO:
+ copy = sizeof(cmd.cmd.port_info);
+ dest = &cmd.cmd.port_info;
+ src = &ucmd->cmd.port_info;
+ break;
+ case IPATH_CMD_TID_UPDATE:
+ case IPATH_CMD_TID_FREE:
+ copy = sizeof(cmd.cmd.tid_info);
+ dest = &cmd.cmd.tid_info;
+ src = &ucmd->cmd.tid_info;
+ break;
+ case IPATH_CMD_SET_PART_KEY:
+ copy = sizeof(cmd.cmd.part_key);
+ dest = &cmd.cmd.part_key;
+ src = &ucmd->cmd.part_key;
+ break;
+ case __IPATH_CMD_SLAVE_INFO:
+ copy = sizeof(cmd.cmd.slave_mask_addr);
+ dest = &cmd.cmd.slave_mask_addr;
+ src = &ucmd->cmd.slave_mask_addr;
+ break;
+	case IPATH_CMD_PIOAVAILUPD: /* force an update of the PIOAvail reg */
+ copy = 0;
+ src = NULL;
+ dest = NULL;
+ break;
+ case IPATH_CMD_POLL_TYPE:
+ copy = sizeof(cmd.cmd.poll_type);
+ dest = &cmd.cmd.poll_type;
+ src = &ucmd->cmd.poll_type;
+ break;
+ case IPATH_CMD_ARMLAUNCH_CTRL:
+ copy = sizeof(cmd.cmd.armlaunch_ctrl);
+ dest = &cmd.cmd.armlaunch_ctrl;
+ src = &ucmd->cmd.armlaunch_ctrl;
+ break;
+ case IPATH_CMD_SDMA_INFLIGHT:
+ copy = sizeof(cmd.cmd.sdma_inflight);
+ dest = &cmd.cmd.sdma_inflight;
+ src = &ucmd->cmd.sdma_inflight;
+ break;
+ case IPATH_CMD_SDMA_COMPLETE:
+ copy = sizeof(cmd.cmd.sdma_complete);
+ dest = &cmd.cmd.sdma_complete;
+ src = &ucmd->cmd.sdma_complete;
+ break;
+ default:
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ if (copy) {
+ if ((count - consumed) < copy) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ if (copy_from_user(dest, src, copy)) {
+ ret = -EFAULT;
+ goto bail;
+ }
+
+ consumed += copy;
+ }
+
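+	/*
+	 * All commands except port assignment (and the legacy combined
+	 * init) require that a port already be bound to this fd.
+	 */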
+ pd = port_fp(fp);
+ if (!pd && cmd.type != __IPATH_CMD_USER_INIT &&
+ cmd.type != IPATH_CMD_ASSIGN_PORT) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ switch (cmd.type) {
+ case IPATH_CMD_ASSIGN_PORT:
+ ret = ipath_assign_port(fp, &cmd.cmd.user_info);
+ if (ret)
+ goto bail;
+ break;
+ case __IPATH_CMD_USER_INIT:
+ /* backwards compatibility, get port first */
+ ret = ipath_assign_port(fp, &cmd.cmd.user_info);
+ if (ret)
+ goto bail;
+ /* and fall through to current version. */
+ case IPATH_CMD_USER_INIT:
+ ret = ipath_do_user_init(fp, &cmd.cmd.user_info);
+ if (ret)
+ goto bail;
+ ret = ipath_get_base_info(
+ fp, (void __user *) (unsigned long)
+ cmd.cmd.user_info.spu_base_info,
+ cmd.cmd.user_info.spu_base_info_size);
+ break;
+ case IPATH_CMD_RECV_CTRL:
+ ret = ipath_manage_rcvq(pd, subport_fp(fp), cmd.cmd.recv_ctrl);
+ break;
+ case IPATH_CMD_PORT_INFO:
+ ret = ipath_port_info(pd, subport_fp(fp),
+ (struct ipath_port_info __user *)
+ (unsigned long) cmd.cmd.port_info);
+ break;
+ case IPATH_CMD_TID_UPDATE:
+ ret = ipath_tid_update(pd, fp, &cmd.cmd.tid_info);
+ break;
+ case IPATH_CMD_TID_FREE:
+ ret = ipath_tid_free(pd, subport_fp(fp), &cmd.cmd.tid_info);
+ break;
+ case IPATH_CMD_SET_PART_KEY:
+ ret = ipath_set_part_key(pd, cmd.cmd.part_key);
+ break;
+ case __IPATH_CMD_SLAVE_INFO:
+ ret = ipath_get_slave_info(pd,
+ (void __user *) (unsigned long)
+ cmd.cmd.slave_mask_addr);
+ break;
+ case IPATH_CMD_PIOAVAILUPD:
+ ipath_force_pio_avail_update(pd->port_dd);
+ break;
+ case IPATH_CMD_POLL_TYPE:
+ pd->poll_type = cmd.cmd.poll_type;
+ break;
+ case IPATH_CMD_ARMLAUNCH_CTRL:
+ if (cmd.cmd.armlaunch_ctrl)
+ ipath_enable_armlaunch(pd->port_dd);
+ else
+ ipath_disable_armlaunch(pd->port_dd);
+ break;
+ case IPATH_CMD_SDMA_INFLIGHT:
+ ret = ipath_sdma_get_inflight(user_sdma_queue_fp(fp),
+ (u32 __user *) (unsigned long)
+ cmd.cmd.sdma_inflight);
+ break;
+ case IPATH_CMD_SDMA_COMPLETE:
+ ret = ipath_sdma_get_complete(pd->port_dd,
+ user_sdma_queue_fp(fp),
+ (u32 __user *) (unsigned long)
+ cmd.cmd.sdma_complete);
+ break;
+ }
+
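+	/* on success, behave like write(2): return the bytes consumed */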
+ if (ret >= 0)
+ ret = consumed;
+
+bail:
+ return ret;
+}
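+/*
+ * Illustrative sketch of the expected userspace side (not part of this
+ * driver): a command is issued by filling in a struct ipath_cmd and
+ * write()ing it to the opened /dev/ipath node, e.g.
+ *
+ *	struct ipath_cmd c = { .type = IPATH_CMD_ASSIGN_PORT };
+ *
+ *	c.cmd.user_info.spu_userversion =
+ *		(IPATH_USER_SWMAJOR << 16) | IPATH_USER_SWMINOR;
+ *	if (write(fd, &c, sizeof(c)) < 0)
+ *		... report the error ...
+ *
+ * followed by an IPATH_CMD_USER_INIT write with the same user_info
+ * once the port has been assigned.
+ */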
+
+static ssize_t ipath_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *filp = iocb->ki_filp;
+ struct ipath_filedata *fp = filp->private_data;
+ struct ipath_portdata *pd = port_fp(filp);
+ struct ipath_user_sdma_queue *pq = fp->pq;
+
+ if (!iter_is_iovec(from) || !from->nr_segs)
+ return -EINVAL;
+
+ return ipath_user_sdma_writev(pd->port_dd, pq, from->iov, from->nr_segs);
+}
+
+static struct class *ipath_class;
+
+static int init_cdev(int minor, char *name, const struct file_operations *fops,
+ struct cdev **cdevp, struct device **devp)
+{
+ const dev_t dev = MKDEV(IPATH_MAJOR, minor);
+ struct cdev *cdev = NULL;
+ struct device *device = NULL;
+ int ret;
+
+ cdev = cdev_alloc();
+ if (!cdev) {
+ printk(KERN_ERR IPATH_DRV_NAME
+ ": Could not allocate cdev for minor %d, %s\n",
+ minor, name);
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ cdev->owner = THIS_MODULE;
+ cdev->ops = fops;
+ kobject_set_name(&cdev->kobj, name);
+
+ ret = cdev_add(cdev, dev, 1);
+ if (ret < 0) {
+ printk(KERN_ERR IPATH_DRV_NAME
+ ": Could not add cdev for minor %d, %s (err %d)\n",
+ minor, name, -ret);
+ goto err_cdev;
+ }
+
+ device = device_create(ipath_class, NULL, dev, NULL, name);
+
+ if (IS_ERR(device)) {
+ ret = PTR_ERR(device);
+ printk(KERN_ERR IPATH_DRV_NAME ": Could not create "
+ "device for minor %d, %s (err %d)\n",
+ minor, name, -ret);
+ goto err_cdev;
+ }
+
+ goto done;
+
+err_cdev:
+ cdev_del(cdev);
+ cdev = NULL;
+
+done:
+ if (ret >= 0) {
+ *cdevp = cdev;
+ *devp = device;
+ } else {
+ *cdevp = NULL;
+ *devp = NULL;
+ }
+
+ return ret;
+}
+
+int ipath_cdev_init(int minor, char *name, const struct file_operations *fops,
+ struct cdev **cdevp, struct device **devp)
+{
+ return init_cdev(minor, name, fops, cdevp, devp);
+}
+
+static void cleanup_cdev(struct cdev **cdevp,
+ struct device **devp)
+{
+ struct device *dev = *devp;
+
+ if (dev) {
+ device_unregister(dev);
+ *devp = NULL;
+ }
+
+ if (*cdevp) {
+ cdev_del(*cdevp);
+ *cdevp = NULL;
+ }
+}
+
+void ipath_cdev_cleanup(struct cdev **cdevp,
+ struct device **devp)
+{
+ cleanup_cdev(cdevp, devp);
+}
+
+static struct cdev *wildcard_cdev;
+static struct device *wildcard_dev;
+
+static const dev_t dev = MKDEV(IPATH_MAJOR, 0);
+
+static int user_init(void)
+{
+ int ret;
+
+ ret = register_chrdev_region(dev, IPATH_NMINORS, IPATH_DRV_NAME);
+ if (ret < 0) {
+ printk(KERN_ERR IPATH_DRV_NAME ": Could not register "
+ "chrdev region (err %d)\n", -ret);
+ goto done;
+ }
+
+ ipath_class = class_create(THIS_MODULE, IPATH_DRV_NAME);
+
+ if (IS_ERR(ipath_class)) {
+ ret = PTR_ERR(ipath_class);
+ printk(KERN_ERR IPATH_DRV_NAME ": Could not create "
+ "device class (err %d)\n", -ret);
+ goto bail;
+ }
+
+ goto done;
+bail:
+ unregister_chrdev_region(dev, IPATH_NMINORS);
+done:
+ return ret;
+}
+
+static void user_cleanup(void)
+{
+ if (ipath_class) {
+ class_destroy(ipath_class);
+ ipath_class = NULL;
+ }
+
+ unregister_chrdev_region(dev, IPATH_NMINORS);
+}
+
+static atomic_t user_count = ATOMIC_INIT(0);
+static atomic_t user_setup = ATOMIC_INIT(0);
+
+int ipath_user_add(struct ipath_devdata *dd)
+{
+ char name[10];
+ int ret;
+
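+	/*
+	 * The first unit to be added also sets up the chrdev region,
+	 * the device class, and the unit-independent wildcard node.
+	 */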
+ if (atomic_inc_return(&user_count) == 1) {
+ ret = user_init();
+ if (ret < 0) {
+ ipath_dev_err(dd, "Unable to set up user support: "
+ "error %d\n", -ret);
+ goto bail;
+ }
+ ret = init_cdev(0, "ipath", &ipath_file_ops, &wildcard_cdev,
+ &wildcard_dev);
+ if (ret < 0) {
+ ipath_dev_err(dd, "Could not create wildcard "
+ "minor: error %d\n", -ret);
+ goto bail_user;
+ }
+
+ atomic_set(&user_setup, 1);
+ }
+
+ snprintf(name, sizeof(name), "ipath%d", dd->ipath_unit);
+
+ ret = init_cdev(dd->ipath_unit + 1, name, &ipath_file_ops,
+ &dd->user_cdev, &dd->user_dev);
+ if (ret < 0)
+ ipath_dev_err(dd, "Could not create user minor %d, %s\n",
+ dd->ipath_unit + 1, name);
+
+ goto bail;
+
+bail_user:
+ user_cleanup();
+bail:
+ return ret;
+}
+
+void ipath_user_remove(struct ipath_devdata *dd)
+{
+ cleanup_cdev(&dd->user_cdev, &dd->user_dev);
+
+ if (atomic_dec_return(&user_count) == 0) {
+ if (atomic_read(&user_setup) == 0)
+ goto bail;
+
+ cleanup_cdev(&wildcard_cdev, &wildcard_dev);
+ user_cleanup();
+
+ atomic_set(&user_setup, 0);
+ }
+bail:
+ return;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c
new file mode 100644
index 000000000..1ca8e32a9
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_fs.c
@@ -0,0 +1,422 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+
+#include "ipath_kernel.h"
+
+#define IPATHFS_MAGIC 0x726a77
+
+static struct super_block *ipath_super;
+
+static int ipathfs_mknod(struct inode *dir, struct dentry *dentry,
+ umode_t mode, const struct file_operations *fops,
+ void *data)
+{
+ int error;
+ struct inode *inode = new_inode(dir->i_sb);
+
+ if (!inode) {
+ error = -EPERM;
+ goto bail;
+ }
+
+ inode->i_ino = get_next_ino();
+ inode->i_mode = mode;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ inode->i_private = data;
+ if (S_ISDIR(mode)) {
+ inode->i_op = &simple_dir_inode_operations;
+ inc_nlink(inode);
+ inc_nlink(dir);
+ }
+
+ inode->i_fop = fops;
+
+ d_instantiate(dentry, inode);
+ error = 0;
+
+bail:
+ return error;
+}
+
+static int create_file(const char *name, umode_t mode,
+ struct dentry *parent, struct dentry **dentry,
+ const struct file_operations *fops, void *data)
+{
+ int error;
+
+ mutex_lock(&d_inode(parent)->i_mutex);
+ *dentry = lookup_one_len(name, parent, strlen(name));
+ if (!IS_ERR(*dentry))
+ error = ipathfs_mknod(d_inode(parent), *dentry,
+ mode, fops, data);
+ else
+ error = PTR_ERR(*dentry);
+ mutex_unlock(&d_inode(parent)->i_mutex);
+
+ return error;
+}
+
+static ssize_t atomic_stats_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return simple_read_from_buffer(buf, count, ppos, &ipath_stats,
+ sizeof ipath_stats);
+}
+
+static const struct file_operations atomic_stats_ops = {
+ .read = atomic_stats_read,
+ .llseek = default_llseek,
+};
+
+static ssize_t atomic_counters_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct infinipath_counters counters;
+ struct ipath_devdata *dd;
+
+ dd = file_inode(file)->i_private;
+ dd->ipath_f_read_counters(dd, &counters);
+
+ return simple_read_from_buffer(buf, count, ppos, &counters,
+ sizeof counters);
+}
+
+static const struct file_operations atomic_counters_ops = {
+ .read = atomic_counters_read,
+ .llseek = default_llseek,
+};
+
+static ssize_t flash_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct ipath_devdata *dd;
+ ssize_t ret;
+ loff_t pos;
+ char *tmp;
+
+ pos = *ppos;
+
+	if (pos < 0) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ if (pos >= sizeof(struct ipath_flash)) {
+ ret = 0;
+ goto bail;
+ }
+
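+	/* never read past the end of the flash contents */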
+ if (count > sizeof(struct ipath_flash) - pos)
+ count = sizeof(struct ipath_flash) - pos;
+
+ tmp = kmalloc(count, GFP_KERNEL);
+ if (!tmp) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ dd = file_inode(file)->i_private;
+ if (ipath_eeprom_read(dd, pos, tmp, count)) {
+ ipath_dev_err(dd, "failed to read from flash\n");
+ ret = -ENXIO;
+ goto bail_tmp;
+ }
+
+ if (copy_to_user(buf, tmp, count)) {
+ ret = -EFAULT;
+ goto bail_tmp;
+ }
+
+ *ppos = pos + count;
+ ret = count;
+
+bail_tmp:
+ kfree(tmp);
+
+bail:
+ return ret;
+}
+
+static ssize_t flash_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct ipath_devdata *dd;
+ ssize_t ret;
+ loff_t pos;
+ char *tmp;
+
+ pos = *ppos;
+
+ if (pos != 0) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ if (count != sizeof(struct ipath_flash)) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ tmp = kmalloc(count, GFP_KERNEL);
+ if (!tmp) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ if (copy_from_user(tmp, buf, count)) {
+ ret = -EFAULT;
+ goto bail_tmp;
+ }
+
+ dd = file_inode(file)->i_private;
+ if (ipath_eeprom_write(dd, pos, tmp, count)) {
+ ret = -ENXIO;
+ ipath_dev_err(dd, "failed to write to flash\n");
+ goto bail_tmp;
+ }
+
+ *ppos = pos + count;
+ ret = count;
+
+bail_tmp:
+ kfree(tmp);
+
+bail:
+ return ret;
+}
+
+static const struct file_operations flash_ops = {
+ .read = flash_read,
+ .write = flash_write,
+ .llseek = default_llseek,
+};
+
+static int create_device_files(struct super_block *sb,
+ struct ipath_devdata *dd)
+{
+ struct dentry *dir, *tmp;
+ char unit[10];
+ int ret;
+
+ snprintf(unit, sizeof unit, "%02d", dd->ipath_unit);
+ ret = create_file(unit, S_IFDIR|S_IRUGO|S_IXUGO, sb->s_root, &dir,
+ &simple_dir_operations, dd);
+ if (ret) {
+ printk(KERN_ERR "create_file(%s) failed: %d\n", unit, ret);
+ goto bail;
+ }
+
+ ret = create_file("atomic_counters", S_IFREG|S_IRUGO, dir, &tmp,
+ &atomic_counters_ops, dd);
+ if (ret) {
+ printk(KERN_ERR "create_file(%s/atomic_counters) "
+ "failed: %d\n", unit, ret);
+ goto bail;
+ }
+
+ ret = create_file("flash", S_IFREG|S_IWUSR|S_IRUGO, dir, &tmp,
+ &flash_ops, dd);
+ if (ret) {
+ printk(KERN_ERR "create_file(%s/flash) "
+ "failed: %d\n", unit, ret);
+ goto bail;
+ }
+
+bail:
+ return ret;
+}
+
+static int remove_file(struct dentry *parent, char *name)
+{
+ struct dentry *tmp;
+ int ret;
+
+ tmp = lookup_one_len(name, parent, strlen(name));
+
+ if (IS_ERR(tmp)) {
+ ret = PTR_ERR(tmp);
+ goto bail;
+ }
+
+ spin_lock(&tmp->d_lock);
+ if (!d_unhashed(tmp) && d_really_is_positive(tmp)) {
+ dget_dlock(tmp);
+ __d_drop(tmp);
+ spin_unlock(&tmp->d_lock);
+ simple_unlink(d_inode(parent), tmp);
+ } else
+ spin_unlock(&tmp->d_lock);
+
+ ret = 0;
+bail:
+ /*
+ * We don't expect clients to care about the return value, but
+ * it's there if they need it.
+ */
+ return ret;
+}
+
+static int remove_device_files(struct super_block *sb,
+ struct ipath_devdata *dd)
+{
+ struct dentry *dir, *root;
+ char unit[10];
+ int ret;
+
+ root = dget(sb->s_root);
+ mutex_lock(&d_inode(root)->i_mutex);
+ snprintf(unit, sizeof unit, "%02d", dd->ipath_unit);
+ dir = lookup_one_len(unit, root, strlen(unit));
+
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ printk(KERN_ERR "Lookup of %s failed\n", unit);
+ goto bail;
+ }
+
+ remove_file(dir, "flash");
+ remove_file(dir, "atomic_counters");
+ d_delete(dir);
+ ret = simple_rmdir(d_inode(root), dir);
+
+bail:
+ mutex_unlock(&d_inode(root)->i_mutex);
+ dput(root);
+ return ret;
+}
+
+static int ipathfs_fill_super(struct super_block *sb, void *data,
+ int silent)
+{
+ struct ipath_devdata *dd, *tmp;
+ unsigned long flags;
+ int ret;
+
+ static struct tree_descr files[] = {
+ [2] = {"atomic_stats", &atomic_stats_ops, S_IRUGO},
+ {""},
+ };
+
+ ret = simple_fill_super(sb, IPATHFS_MAGIC, files);
+ if (ret) {
+ printk(KERN_ERR "simple_fill_super failed: %d\n", ret);
+ goto bail;
+ }
+
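+	/*
+	 * Walk the device list, dropping the lock around each
+	 * create_device_files() call since it can sleep.
+	 */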
+ spin_lock_irqsave(&ipath_devs_lock, flags);
+
+ list_for_each_entry_safe(dd, tmp, &ipath_dev_list, ipath_list) {
+ spin_unlock_irqrestore(&ipath_devs_lock, flags);
+ ret = create_device_files(sb, dd);
+ if (ret)
+ goto bail;
+ spin_lock_irqsave(&ipath_devs_lock, flags);
+ }
+
+ spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+bail:
+ return ret;
+}
+
+static struct dentry *ipathfs_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data)
+{
+ struct dentry *ret;
+ ret = mount_single(fs_type, flags, data, ipathfs_fill_super);
+ if (!IS_ERR(ret))
+ ipath_super = ret->d_sb;
+ return ret;
+}
+
+static void ipathfs_kill_super(struct super_block *s)
+{
+ kill_litter_super(s);
+ ipath_super = NULL;
+}
+
+int ipathfs_add_device(struct ipath_devdata *dd)
+{
+ int ret;
+
+ if (ipath_super == NULL) {
+ ret = 0;
+ goto bail;
+ }
+
+ ret = create_device_files(ipath_super, dd);
+
+bail:
+ return ret;
+}
+
+int ipathfs_remove_device(struct ipath_devdata *dd)
+{
+ int ret;
+
+ if (ipath_super == NULL) {
+ ret = 0;
+ goto bail;
+ }
+
+ ret = remove_device_files(ipath_super, dd);
+
+bail:
+ return ret;
+}
+
+static struct file_system_type ipathfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "ipathfs",
+ .mount = ipathfs_mount,
+ .kill_sb = ipathfs_kill_super,
+};
+MODULE_ALIAS_FS("ipathfs");
+
+int __init ipath_init_ipathfs(void)
+{
+ return register_filesystem(&ipathfs_fs_type);
+}
+
+void __exit ipath_exit_ipathfs(void)
+{
+ unregister_filesystem(&ipathfs_fs_type);
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_iba6110.c b/drivers/infiniband/hw/ipath/ipath_iba6110.c
new file mode 100644
index 000000000..7cc305488
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_iba6110.c
@@ -0,0 +1,1940 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file contains all of the code that is specific to the InfiniPath
+ * HT chip.
+ */
+
+#include <linux/vmalloc.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/htirq.h>
+#include <rdma/ib_verbs.h>
+
+#include "ipath_kernel.h"
+#include "ipath_registers.h"
+
+static void ipath_setup_ht_setextled(struct ipath_devdata *, u64, u64);
+
+
+/*
+ * This lists the InfiniPath registers, in the actual chip layout.
+ * This structure should never be directly accessed.
+ *
+ * The names are in InterCap form because they're taken straight from
+ * the chip specification. Since they're only used in this file, they
+ * don't pollute the rest of the source.
+ */
+
+struct _infinipath_do_not_use_kernel_regs {
+ unsigned long long Revision;
+ unsigned long long Control;
+ unsigned long long PageAlign;
+ unsigned long long PortCnt;
+ unsigned long long DebugPortSelect;
+ unsigned long long DebugPort;
+ unsigned long long SendRegBase;
+ unsigned long long UserRegBase;
+ unsigned long long CounterRegBase;
+ unsigned long long Scratch;
+ unsigned long long ReservedMisc1;
+ unsigned long long InterruptConfig;
+ unsigned long long IntBlocked;
+ unsigned long long IntMask;
+ unsigned long long IntStatus;
+ unsigned long long IntClear;
+ unsigned long long ErrorMask;
+ unsigned long long ErrorStatus;
+ unsigned long long ErrorClear;
+ unsigned long long HwErrMask;
+ unsigned long long HwErrStatus;
+ unsigned long long HwErrClear;
+ unsigned long long HwDiagCtrl;
+ unsigned long long MDIO;
+ unsigned long long IBCStatus;
+ unsigned long long IBCCtrl;
+ unsigned long long ExtStatus;
+ unsigned long long ExtCtrl;
+ unsigned long long GPIOOut;
+ unsigned long long GPIOMask;
+ unsigned long long GPIOStatus;
+ unsigned long long GPIOClear;
+ unsigned long long RcvCtrl;
+ unsigned long long RcvBTHQP;
+ unsigned long long RcvHdrSize;
+ unsigned long long RcvHdrCnt;
+ unsigned long long RcvHdrEntSize;
+ unsigned long long RcvTIDBase;
+ unsigned long long RcvTIDCnt;
+ unsigned long long RcvEgrBase;
+ unsigned long long RcvEgrCnt;
+ unsigned long long RcvBufBase;
+ unsigned long long RcvBufSize;
+ unsigned long long RxIntMemBase;
+ unsigned long long RxIntMemSize;
+ unsigned long long RcvPartitionKey;
+ unsigned long long ReservedRcv[10];
+ unsigned long long SendCtrl;
+ unsigned long long SendPIOBufBase;
+ unsigned long long SendPIOSize;
+ unsigned long long SendPIOBufCnt;
+ unsigned long long SendPIOAvailAddr;
+ unsigned long long TxIntMemBase;
+ unsigned long long TxIntMemSize;
+ unsigned long long ReservedSend[9];
+ unsigned long long SendBufferError;
+ unsigned long long SendBufferErrorCONT1;
+ unsigned long long SendBufferErrorCONT2;
+ unsigned long long SendBufferErrorCONT3;
+ unsigned long long ReservedSBE[4];
+ unsigned long long RcvHdrAddr0;
+ unsigned long long RcvHdrAddr1;
+ unsigned long long RcvHdrAddr2;
+ unsigned long long RcvHdrAddr3;
+ unsigned long long RcvHdrAddr4;
+ unsigned long long RcvHdrAddr5;
+ unsigned long long RcvHdrAddr6;
+ unsigned long long RcvHdrAddr7;
+ unsigned long long RcvHdrAddr8;
+ unsigned long long ReservedRHA[7];
+ unsigned long long RcvHdrTailAddr0;
+ unsigned long long RcvHdrTailAddr1;
+ unsigned long long RcvHdrTailAddr2;
+ unsigned long long RcvHdrTailAddr3;
+ unsigned long long RcvHdrTailAddr4;
+ unsigned long long RcvHdrTailAddr5;
+ unsigned long long RcvHdrTailAddr6;
+ unsigned long long RcvHdrTailAddr7;
+ unsigned long long RcvHdrTailAddr8;
+ unsigned long long ReservedRHTA[7];
+ unsigned long long Sync; /* Software only */
+ unsigned long long Dump; /* Software only */
+ unsigned long long SimVer; /* Software only */
+ unsigned long long ReservedSW[5];
+ unsigned long long SerdesConfig0;
+ unsigned long long SerdesConfig1;
+ unsigned long long SerdesStatus;
+ unsigned long long XGXSConfig;
+ unsigned long long ReservedSW2[4];
+};
+
+struct _infinipath_do_not_use_counters {
+ __u64 LBIntCnt;
+ __u64 LBFlowStallCnt;
+ __u64 Reserved1;
+ __u64 TxUnsupVLErrCnt;
+ __u64 TxDataPktCnt;
+ __u64 TxFlowPktCnt;
+ __u64 TxDwordCnt;
+ __u64 TxLenErrCnt;
+ __u64 TxMaxMinLenErrCnt;
+ __u64 TxUnderrunCnt;
+ __u64 TxFlowStallCnt;
+ __u64 TxDroppedPktCnt;
+ __u64 RxDroppedPktCnt;
+ __u64 RxDataPktCnt;
+ __u64 RxFlowPktCnt;
+ __u64 RxDwordCnt;
+ __u64 RxLenErrCnt;
+ __u64 RxMaxMinLenErrCnt;
+ __u64 RxICRCErrCnt;
+ __u64 RxVCRCErrCnt;
+ __u64 RxFlowCtrlErrCnt;
+ __u64 RxBadFormatCnt;
+ __u64 RxLinkProblemCnt;
+ __u64 RxEBPCnt;
+ __u64 RxLPCRCErrCnt;
+ __u64 RxBufOvflCnt;
+ __u64 RxTIDFullErrCnt;
+ __u64 RxTIDValidErrCnt;
+ __u64 RxPKeyMismatchCnt;
+ __u64 RxP0HdrEgrOvflCnt;
+ __u64 RxP1HdrEgrOvflCnt;
+ __u64 RxP2HdrEgrOvflCnt;
+ __u64 RxP3HdrEgrOvflCnt;
+ __u64 RxP4HdrEgrOvflCnt;
+ __u64 RxP5HdrEgrOvflCnt;
+ __u64 RxP6HdrEgrOvflCnt;
+ __u64 RxP7HdrEgrOvflCnt;
+ __u64 RxP8HdrEgrOvflCnt;
+ __u64 Reserved6;
+ __u64 Reserved7;
+ __u64 IBStatusChangeCnt;
+ __u64 IBLinkErrRecoveryCnt;
+ __u64 IBLinkDownedCnt;
+ __u64 IBSymbolErrCnt;
+};
+
+#define IPATH_KREG_OFFSET(field) (offsetof( \
+ struct _infinipath_do_not_use_kernel_regs, field) / sizeof(u64))
+#define IPATH_CREG_OFFSET(field) (offsetof( \
+ struct _infinipath_do_not_use_counters, field) / sizeof(u64))
+
+static const struct ipath_kregs ipath_ht_kregs = {
+ .kr_control = IPATH_KREG_OFFSET(Control),
+ .kr_counterregbase = IPATH_KREG_OFFSET(CounterRegBase),
+ .kr_debugport = IPATH_KREG_OFFSET(DebugPort),
+ .kr_debugportselect = IPATH_KREG_OFFSET(DebugPortSelect),
+ .kr_errorclear = IPATH_KREG_OFFSET(ErrorClear),
+ .kr_errormask = IPATH_KREG_OFFSET(ErrorMask),
+ .kr_errorstatus = IPATH_KREG_OFFSET(ErrorStatus),
+ .kr_extctrl = IPATH_KREG_OFFSET(ExtCtrl),
+ .kr_extstatus = IPATH_KREG_OFFSET(ExtStatus),
+ .kr_gpio_clear = IPATH_KREG_OFFSET(GPIOClear),
+ .kr_gpio_mask = IPATH_KREG_OFFSET(GPIOMask),
+ .kr_gpio_out = IPATH_KREG_OFFSET(GPIOOut),
+ .kr_gpio_status = IPATH_KREG_OFFSET(GPIOStatus),
+ .kr_hwdiagctrl = IPATH_KREG_OFFSET(HwDiagCtrl),
+ .kr_hwerrclear = IPATH_KREG_OFFSET(HwErrClear),
+ .kr_hwerrmask = IPATH_KREG_OFFSET(HwErrMask),
+ .kr_hwerrstatus = IPATH_KREG_OFFSET(HwErrStatus),
+ .kr_ibcctrl = IPATH_KREG_OFFSET(IBCCtrl),
+ .kr_ibcstatus = IPATH_KREG_OFFSET(IBCStatus),
+ .kr_intblocked = IPATH_KREG_OFFSET(IntBlocked),
+ .kr_intclear = IPATH_KREG_OFFSET(IntClear),
+ .kr_interruptconfig = IPATH_KREG_OFFSET(InterruptConfig),
+ .kr_intmask = IPATH_KREG_OFFSET(IntMask),
+ .kr_intstatus = IPATH_KREG_OFFSET(IntStatus),
+ .kr_mdio = IPATH_KREG_OFFSET(MDIO),
+ .kr_pagealign = IPATH_KREG_OFFSET(PageAlign),
+ .kr_partitionkey = IPATH_KREG_OFFSET(RcvPartitionKey),
+ .kr_portcnt = IPATH_KREG_OFFSET(PortCnt),
+ .kr_rcvbthqp = IPATH_KREG_OFFSET(RcvBTHQP),
+ .kr_rcvbufbase = IPATH_KREG_OFFSET(RcvBufBase),
+ .kr_rcvbufsize = IPATH_KREG_OFFSET(RcvBufSize),
+ .kr_rcvctrl = IPATH_KREG_OFFSET(RcvCtrl),
+ .kr_rcvegrbase = IPATH_KREG_OFFSET(RcvEgrBase),
+ .kr_rcvegrcnt = IPATH_KREG_OFFSET(RcvEgrCnt),
+ .kr_rcvhdrcnt = IPATH_KREG_OFFSET(RcvHdrCnt),
+ .kr_rcvhdrentsize = IPATH_KREG_OFFSET(RcvHdrEntSize),
+ .kr_rcvhdrsize = IPATH_KREG_OFFSET(RcvHdrSize),
+ .kr_rcvintmembase = IPATH_KREG_OFFSET(RxIntMemBase),
+ .kr_rcvintmemsize = IPATH_KREG_OFFSET(RxIntMemSize),
+ .kr_rcvtidbase = IPATH_KREG_OFFSET(RcvTIDBase),
+ .kr_rcvtidcnt = IPATH_KREG_OFFSET(RcvTIDCnt),
+ .kr_revision = IPATH_KREG_OFFSET(Revision),
+ .kr_scratch = IPATH_KREG_OFFSET(Scratch),
+ .kr_sendbuffererror = IPATH_KREG_OFFSET(SendBufferError),
+ .kr_sendctrl = IPATH_KREG_OFFSET(SendCtrl),
+ .kr_sendpioavailaddr = IPATH_KREG_OFFSET(SendPIOAvailAddr),
+ .kr_sendpiobufbase = IPATH_KREG_OFFSET(SendPIOBufBase),
+ .kr_sendpiobufcnt = IPATH_KREG_OFFSET(SendPIOBufCnt),
+ .kr_sendpiosize = IPATH_KREG_OFFSET(SendPIOSize),
+ .kr_sendregbase = IPATH_KREG_OFFSET(SendRegBase),
+ .kr_txintmembase = IPATH_KREG_OFFSET(TxIntMemBase),
+ .kr_txintmemsize = IPATH_KREG_OFFSET(TxIntMemSize),
+ .kr_userregbase = IPATH_KREG_OFFSET(UserRegBase),
+ .kr_serdesconfig0 = IPATH_KREG_OFFSET(SerdesConfig0),
+ .kr_serdesconfig1 = IPATH_KREG_OFFSET(SerdesConfig1),
+ .kr_serdesstatus = IPATH_KREG_OFFSET(SerdesStatus),
+ .kr_xgxsconfig = IPATH_KREG_OFFSET(XGXSConfig),
+ /*
+	 * These should not be used directly via ipath_write_kreg64();
+	 * use them with ipath_write_kreg64_port().
+ */
+ .kr_rcvhdraddr = IPATH_KREG_OFFSET(RcvHdrAddr0),
+ .kr_rcvhdrtailaddr = IPATH_KREG_OFFSET(RcvHdrTailAddr0)
+};
+
+static const struct ipath_cregs ipath_ht_cregs = {
+ .cr_badformatcnt = IPATH_CREG_OFFSET(RxBadFormatCnt),
+ .cr_erricrccnt = IPATH_CREG_OFFSET(RxICRCErrCnt),
+ .cr_errlinkcnt = IPATH_CREG_OFFSET(RxLinkProblemCnt),
+ .cr_errlpcrccnt = IPATH_CREG_OFFSET(RxLPCRCErrCnt),
+ .cr_errpkey = IPATH_CREG_OFFSET(RxPKeyMismatchCnt),
+ .cr_errrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowCtrlErrCnt),
+ .cr_err_rlencnt = IPATH_CREG_OFFSET(RxLenErrCnt),
+ .cr_errslencnt = IPATH_CREG_OFFSET(TxLenErrCnt),
+ .cr_errtidfull = IPATH_CREG_OFFSET(RxTIDFullErrCnt),
+ .cr_errtidvalid = IPATH_CREG_OFFSET(RxTIDValidErrCnt),
+ .cr_errvcrccnt = IPATH_CREG_OFFSET(RxVCRCErrCnt),
+ .cr_ibstatuschange = IPATH_CREG_OFFSET(IBStatusChangeCnt),
+ /* calc from Reg_CounterRegBase + offset */
+ .cr_intcnt = IPATH_CREG_OFFSET(LBIntCnt),
+ .cr_invalidrlencnt = IPATH_CREG_OFFSET(RxMaxMinLenErrCnt),
+ .cr_invalidslencnt = IPATH_CREG_OFFSET(TxMaxMinLenErrCnt),
+ .cr_lbflowstallcnt = IPATH_CREG_OFFSET(LBFlowStallCnt),
+ .cr_pktrcvcnt = IPATH_CREG_OFFSET(RxDataPktCnt),
+ .cr_pktrcvflowctrlcnt = IPATH_CREG_OFFSET(RxFlowPktCnt),
+ .cr_pktsendcnt = IPATH_CREG_OFFSET(TxDataPktCnt),
+ .cr_pktsendflowcnt = IPATH_CREG_OFFSET(TxFlowPktCnt),
+ .cr_portovflcnt = IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt),
+ .cr_rcvebpcnt = IPATH_CREG_OFFSET(RxEBPCnt),
+ .cr_rcvovflcnt = IPATH_CREG_OFFSET(RxBufOvflCnt),
+ .cr_senddropped = IPATH_CREG_OFFSET(TxDroppedPktCnt),
+ .cr_sendstallcnt = IPATH_CREG_OFFSET(TxFlowStallCnt),
+ .cr_sendunderruncnt = IPATH_CREG_OFFSET(TxUnderrunCnt),
+ .cr_wordrcvcnt = IPATH_CREG_OFFSET(RxDwordCnt),
+ .cr_wordsendcnt = IPATH_CREG_OFFSET(TxDwordCnt),
+ .cr_unsupvlcnt = IPATH_CREG_OFFSET(TxUnsupVLErrCnt),
+ .cr_rxdroppktcnt = IPATH_CREG_OFFSET(RxDroppedPktCnt),
+ .cr_iblinkerrrecovcnt = IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt),
+ .cr_iblinkdowncnt = IPATH_CREG_OFFSET(IBLinkDownedCnt),
+ .cr_ibsymbolerrcnt = IPATH_CREG_OFFSET(IBSymbolErrCnt)
+};
+
+/* kr_intstatus, kr_intclear, kr_intmask bits */
+#define INFINIPATH_I_RCVURG_MASK ((1U<<9)-1)
+#define INFINIPATH_I_RCVURG_SHIFT 0
+#define INFINIPATH_I_RCVAVAIL_MASK ((1U<<9)-1)
+#define INFINIPATH_I_RCVAVAIL_SHIFT 12
+
+/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */
+#define INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT 0
+#define INFINIPATH_HWE_HTCMEMPARITYERR_MASK 0x3FFFFFULL
+#define INFINIPATH_HWE_HTCLNKABYTE0CRCERR 0x0000000000800000ULL
+#define INFINIPATH_HWE_HTCLNKABYTE1CRCERR 0x0000000001000000ULL
+#define INFINIPATH_HWE_HTCLNKBBYTE0CRCERR 0x0000000002000000ULL
+#define INFINIPATH_HWE_HTCLNKBBYTE1CRCERR 0x0000000004000000ULL
+#define INFINIPATH_HWE_HTCMISCERR4 0x0000000008000000ULL
+#define INFINIPATH_HWE_HTCMISCERR5 0x0000000010000000ULL
+#define INFINIPATH_HWE_HTCMISCERR6 0x0000000020000000ULL
+#define INFINIPATH_HWE_HTCMISCERR7 0x0000000040000000ULL
+#define INFINIPATH_HWE_HTCBUSTREQPARITYERR 0x0000000080000000ULL
+#define INFINIPATH_HWE_HTCBUSTRESPPARITYERR 0x0000000100000000ULL
+#define INFINIPATH_HWE_HTCBUSIREQPARITYERR 0x0000000200000000ULL
+#define INFINIPATH_HWE_COREPLL_FBSLIP 0x0080000000000000ULL
+#define INFINIPATH_HWE_COREPLL_RFSLIP 0x0100000000000000ULL
+#define INFINIPATH_HWE_HTBPLL_FBSLIP 0x0200000000000000ULL
+#define INFINIPATH_HWE_HTBPLL_RFSLIP 0x0400000000000000ULL
+#define INFINIPATH_HWE_HTAPLL_FBSLIP 0x0800000000000000ULL
+#define INFINIPATH_HWE_HTAPLL_RFSLIP 0x1000000000000000ULL
+#define INFINIPATH_HWE_SERDESPLLFAILED 0x2000000000000000ULL
+
+#define IBA6110_IBCS_LINKTRAININGSTATE_MASK 0xf
+#define IBA6110_IBCS_LINKSTATE_SHIFT 4
+
+/* kr_extstatus bits */
+#define INFINIPATH_EXTS_FREQSEL 0x2
+#define INFINIPATH_EXTS_SERDESSEL 0x4
+#define INFINIPATH_EXTS_MEMBIST_ENDTEST 0x0000000000004000
+#define INFINIPATH_EXTS_MEMBIST_CORRECT 0x0000000000008000
+
+
+/* TID entries (memory), HT-only */
+#define INFINIPATH_RT_ADDR_MASK 0xFFFFFFFFFFULL /* 40 bits valid */
+#define INFINIPATH_RT_VALID 0x8000000000000000ULL
+#define INFINIPATH_RT_ADDR_SHIFT 0
+#define INFINIPATH_RT_BUFSIZE_MASK 0x3FFFULL
+#define INFINIPATH_RT_BUFSIZE_SHIFT 48
+
+#define INFINIPATH_R_INTRAVAIL_SHIFT 16
+#define INFINIPATH_R_TAILUPD_SHIFT 31
+
+/* kr_xgxsconfig bits */
+#define INFINIPATH_XGXS_RESET 0x7ULL
+
+/*
+ * masks and bits that are different in different chips, or present only
+ * in one
+ */
+static const ipath_err_t infinipath_hwe_htcmemparityerr_mask =
+ INFINIPATH_HWE_HTCMEMPARITYERR_MASK;
+static const ipath_err_t infinipath_hwe_htcmemparityerr_shift =
+ INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT;
+
+static const ipath_err_t infinipath_hwe_htclnkabyte0crcerr =
+ INFINIPATH_HWE_HTCLNKABYTE0CRCERR;
+static const ipath_err_t infinipath_hwe_htclnkabyte1crcerr =
+ INFINIPATH_HWE_HTCLNKABYTE1CRCERR;
+static const ipath_err_t infinipath_hwe_htclnkbbyte0crcerr =
+ INFINIPATH_HWE_HTCLNKBBYTE0CRCERR;
+static const ipath_err_t infinipath_hwe_htclnkbbyte1crcerr =
+ INFINIPATH_HWE_HTCLNKBBYTE1CRCERR;
+
+#define _IPATH_GPIO_SDA_NUM 1
+#define _IPATH_GPIO_SCL_NUM 0
+
+#define IPATH_GPIO_SDA \
+ (1ULL << (_IPATH_GPIO_SDA_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT))
+#define IPATH_GPIO_SCL \
+ (1ULL << (_IPATH_GPIO_SCL_NUM+INFINIPATH_EXTC_GPIOOE_SHIFT))
+
+/* keep the code below somewhat more readable; not used elsewhere */
+#define _IPATH_HTLINK0_CRCBITS (infinipath_hwe_htclnkabyte0crcerr | \
+ infinipath_hwe_htclnkabyte1crcerr)
+#define _IPATH_HTLINK1_CRCBITS (infinipath_hwe_htclnkbbyte0crcerr | \
+ infinipath_hwe_htclnkbbyte1crcerr)
+#define _IPATH_HTLANE0_CRCBITS (infinipath_hwe_htclnkabyte0crcerr | \
+ infinipath_hwe_htclnkbbyte0crcerr)
+#define _IPATH_HTLANE1_CRCBITS (infinipath_hwe_htclnkabyte1crcerr | \
+ infinipath_hwe_htclnkbbyte1crcerr)
+
+static void hwerr_crcbits(struct ipath_devdata *dd, ipath_err_t hwerrs,
+ char *msg, size_t msgl)
+{
+ char bitsmsg[64];
+ ipath_err_t crcbits = hwerrs &
+ (_IPATH_HTLINK0_CRCBITS | _IPATH_HTLINK1_CRCBITS);
+ /* don't check if 8bit HT */
+ if (dd->ipath_flags & IPATH_8BIT_IN_HT0)
+ crcbits &= ~infinipath_hwe_htclnkabyte1crcerr;
+ /* don't check if 8bit HT */
+ if (dd->ipath_flags & IPATH_8BIT_IN_HT1)
+ crcbits &= ~infinipath_hwe_htclnkbbyte1crcerr;
+ /*
+	 * we'll want to ignore link errors on a link that is
+	 * not in use, if any. For now, complain about both.
+ */
+ if (crcbits) {
+ u16 ctrl0, ctrl1;
+ snprintf(bitsmsg, sizeof bitsmsg,
+ "[HT%s lane %s CRC (%llx); powercycle to completely clear]",
+ !(crcbits & _IPATH_HTLINK1_CRCBITS) ?
+ "0 (A)" : (!(crcbits & _IPATH_HTLINK0_CRCBITS)
+ ? "1 (B)" : "0+1 (A+B)"),
+ !(crcbits & _IPATH_HTLANE1_CRCBITS) ? "0"
+ : (!(crcbits & _IPATH_HTLANE0_CRCBITS) ? "1" :
+ "0+1"), (unsigned long long) crcbits);
+ strlcat(msg, bitsmsg, msgl);
+
+ /*
+ * print extra info for debugging. slave/primary
+ * config word 4, 8 (link control 0, 1)
+ */
+
+ if (pci_read_config_word(dd->pcidev,
+ dd->ipath_ht_slave_off + 0x4,
+ &ctrl0))
+ dev_info(&dd->pcidev->dev, "Couldn't read "
+ "linkctrl0 of slave/primary "
+ "config block\n");
+ else if (!(ctrl0 & 1 << 6))
+ /* not if EOC bit set */
+ ipath_dbg("HT linkctrl0 0x%x%s%s\n", ctrl0,
+ ((ctrl0 >> 8) & 7) ? " CRC" : "",
+ ((ctrl0 >> 4) & 1) ? "linkfail" :
+ "");
+ if (pci_read_config_word(dd->pcidev,
+ dd->ipath_ht_slave_off + 0x8,
+ &ctrl1))
+ dev_info(&dd->pcidev->dev, "Couldn't read "
+ "linkctrl1 of slave/primary "
+ "config block\n");
+ else if (!(ctrl1 & 1 << 6))
+ /* not if EOC bit set */
+ ipath_dbg("HT linkctrl1 0x%x%s%s\n", ctrl1,
+ ((ctrl1 >> 8) & 7) ? " CRC" : "",
+ ((ctrl1 >> 4) & 1) ? "linkfail" :
+ "");
+
+ /* disable until driver reloaded */
+ dd->ipath_hwerrmask &= ~crcbits;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+ dd->ipath_hwerrmask);
+ ipath_dbg("HT crc errs: %s\n", msg);
+ } else
+ ipath_dbg("ignoring HT crc errors 0x%llx, "
+ "not in use\n", (unsigned long long)
+ (hwerrs & (_IPATH_HTLINK0_CRCBITS |
+ _IPATH_HTLINK1_CRCBITS)));
+}
+
+/* 6110 specific hardware errors... */
+static const struct ipath_hwerror_msgs ipath_6110_hwerror_msgs[] = {
+ INFINIPATH_HWE_MSG(HTCBUSIREQPARITYERR, "HTC Ireq Parity"),
+ INFINIPATH_HWE_MSG(HTCBUSTREQPARITYERR, "HTC Treq Parity"),
+ INFINIPATH_HWE_MSG(HTCBUSTRESPPARITYERR, "HTC Tresp Parity"),
+ INFINIPATH_HWE_MSG(HTCMISCERR5, "HT core Misc5"),
+ INFINIPATH_HWE_MSG(HTCMISCERR6, "HT core Misc6"),
+ INFINIPATH_HWE_MSG(HTCMISCERR7, "HT core Misc7"),
+ INFINIPATH_HWE_MSG(RXDSYNCMEMPARITYERR, "Rx Dsync"),
+ INFINIPATH_HWE_MSG(SERDESPLLFAILED, "SerDes PLL"),
+};
+
+#define TXE_PIO_PARITY ((INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF | \
+ INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC) \
+ << INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT)
+#define RXE_EAGER_PARITY (INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID \
+ << INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT)
+
+static void ipath_ht_txe_recover(struct ipath_devdata *dd)
+{
+ ++ipath_stats.sps_txeparity;
+ dev_info(&dd->pcidev->dev,
+ "Recovering from TXE PIO parity error\n");
+}
+
+
+/**
+ * ipath_ht_handle_hwerrors - display hardware errors.
+ * @dd: the infinipath device
+ * @msg: the output buffer
+ * @msgl: the size of the output buffer
+ *
+ * Most hardware errors are catastrophic, but for now we print them and
+ * continue. We reuse the same message buffer as ipath_handle_errors()
+ * to avoid excessive stack usage.
+ */
+static void ipath_ht_handle_hwerrors(struct ipath_devdata *dd, char *msg,
+ size_t msgl)
+{
+ ipath_err_t hwerrs;
+ u32 bits, ctrl;
+ int isfatal = 0;
+ char bitsmsg[64];
+ int log_idx;
+
+ hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus);
+
+ if (!hwerrs) {
+ ipath_cdbg(VERBOSE, "Called but no hardware errors set\n");
+ /*
+		 * Better than printing confusing messages; this seems to
+		 * be related to clearing the CRC or PLL error during init.
+ */
+ goto bail;
+ } else if (hwerrs == -1LL) {
+ ipath_dev_err(dd, "Read of hardware error status failed "
+ "(all bits set); ignoring\n");
+ goto bail;
+ }
+ ipath_stats.sps_hwerrs++;
+
+ /* Always clear the error status register, except MEMBISTFAIL,
+ * regardless of whether we continue or stop using the chip.
+ * We want that set so we know it failed, even across driver reload.
+ * We'll still ignore it in the hwerrmask. We do this partly for
+ * diagnostics, but also for support */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+ hwerrs&~INFINIPATH_HWE_MEMBISTFAILED);
+
+ hwerrs &= dd->ipath_hwerrmask;
+
+ /* We log some errors to EEPROM, check if we have any of those. */
+ for (log_idx = 0; log_idx < IPATH_EEP_LOG_CNT; ++log_idx)
+ if (hwerrs & dd->ipath_eep_st_masks[log_idx].hwerrs_to_log)
+ ipath_inc_eeprom_err(dd, log_idx, 1);
+
+ /*
+ * make sure we get this much out, unless told to be quiet,
+ * it's a parity error we may recover from,
+ * or it's occurred within the last 5 seconds
+ */
+ if ((hwerrs & ~(dd->ipath_lasthwerror | TXE_PIO_PARITY |
+ RXE_EAGER_PARITY)) ||
+ (ipath_debug & __IPATH_VERBDBG))
+ dev_info(&dd->pcidev->dev, "Hardware error: hwerr=0x%llx "
+ "(cleared)\n", (unsigned long long) hwerrs);
+ dd->ipath_lasthwerror |= hwerrs;
+
+ if (hwerrs & ~dd->ipath_hwe_bitsextant)
+ ipath_dev_err(dd, "hwerror interrupt with unknown errors "
+ "%llx set\n", (unsigned long long)
+ (hwerrs & ~dd->ipath_hwe_bitsextant));
+
+ ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
+ if ((ctrl & INFINIPATH_C_FREEZEMODE) && !ipath_diag_inuse) {
+ /*
+ * parity errors in send memory are recoverable,
+		 * just cancel the send (if indicated in sendbuffererror),
+ * count the occurrence, unfreeze (if no other handled
+ * hardware error bits are set), and continue. They can
+ * occur if a processor speculative read is done to the PIO
+ * buffer while we are sending a packet, for example.
+ */
+ if (hwerrs & TXE_PIO_PARITY) {
+ ipath_ht_txe_recover(dd);
+ hwerrs &= ~TXE_PIO_PARITY;
+ }
+
+ if (!hwerrs) {
+ ipath_dbg("Clearing freezemode on ignored or "
+ "recovered hardware error\n");
+ ipath_clear_freeze(dd);
+ }
+ }
+
+ *msg = '\0';
+
+ /*
+	 * may someday want to decode which bits belong to which
+	 * functional area for parity errors, etc.
+ */
+ if (hwerrs & (infinipath_hwe_htcmemparityerr_mask
+ << INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT)) {
+ bits = (u32) ((hwerrs >>
+ INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT) &
+ INFINIPATH_HWE_HTCMEMPARITYERR_MASK);
+ snprintf(bitsmsg, sizeof bitsmsg, "[HTC Parity Errs %x] ",
+ bits);
+ strlcat(msg, bitsmsg, msgl);
+ }
+
+ ipath_format_hwerrors(hwerrs,
+ ipath_6110_hwerror_msgs,
+ ARRAY_SIZE(ipath_6110_hwerror_msgs),
+ msg, msgl);
+
+ if (hwerrs & (_IPATH_HTLINK0_CRCBITS | _IPATH_HTLINK1_CRCBITS))
+ hwerr_crcbits(dd, hwerrs, msg, msgl);
+
+ if (hwerrs & INFINIPATH_HWE_MEMBISTFAILED) {
+ strlcat(msg, "[Memory BIST test failed, InfiniPath hardware unusable]",
+ msgl);
+ /* ignore from now on, so disable until driver reloaded */
+ dd->ipath_hwerrmask &= ~INFINIPATH_HWE_MEMBISTFAILED;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+ dd->ipath_hwerrmask);
+ }
+#define _IPATH_PLL_FAIL (INFINIPATH_HWE_COREPLL_FBSLIP | \
+ INFINIPATH_HWE_COREPLL_RFSLIP | \
+ INFINIPATH_HWE_HTBPLL_FBSLIP | \
+ INFINIPATH_HWE_HTBPLL_RFSLIP | \
+ INFINIPATH_HWE_HTAPLL_FBSLIP | \
+ INFINIPATH_HWE_HTAPLL_RFSLIP)
+
+ if (hwerrs & _IPATH_PLL_FAIL) {
+ snprintf(bitsmsg, sizeof bitsmsg,
+ "[PLL failed (%llx), InfiniPath hardware unusable]",
+ (unsigned long long) (hwerrs & _IPATH_PLL_FAIL));
+ strlcat(msg, bitsmsg, msgl);
+ /* ignore from now on, so disable until driver reloaded */
+ dd->ipath_hwerrmask &= ~(hwerrs & _IPATH_PLL_FAIL);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+ dd->ipath_hwerrmask);
+ }
+
+ if (hwerrs & INFINIPATH_HWE_SERDESPLLFAILED) {
+ /*
+		 * If it occurs, it is left masked since the external
+		 * interface is unused.
+ */
+ dd->ipath_hwerrmask &= ~INFINIPATH_HWE_SERDESPLLFAILED;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+ dd->ipath_hwerrmask);
+ }
+
+ if (hwerrs) {
+ /*
+		 * Some bits we aren't ignoring are set; only make the
+		 * complaint once, in case it's stuck or recurring and we
+		 * get here multiple times. Force the link down so the
+		 * switch knows, and the LEDs are turned off.
+ */
+ if (dd->ipath_flags & IPATH_INITTED) {
+ ipath_set_linkstate(dd, IPATH_IB_LINKDOWN);
+ ipath_setup_ht_setextled(dd,
+ INFINIPATH_IBCS_L_STATE_DOWN,
+ INFINIPATH_IBCS_LT_STATE_DISABLED);
+ ipath_dev_err(dd, "Fatal Hardware Error (freeze "
+ "mode), no longer usable, SN %.16s\n",
+ dd->ipath_serial);
+ isfatal = 1;
+ }
+ *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
+ /* mark as having had error */
+ *dd->ipath_statusp |= IPATH_STATUS_HWERROR;
+ /*
+ * mark as not usable, at a minimum until driver
+ * is reloaded, probably until reboot, since no
+ * other reset is possible.
+ */
+ dd->ipath_flags &= ~IPATH_INITTED;
+	} else
+ *msg = 0; /* recovered from all of them */
+ if (*msg)
+ ipath_dev_err(dd, "%s hardware error\n", msg);
+ if (isfatal && !ipath_diag_inuse && dd->ipath_freezemsg)
+ /*
+ * for status file; if no trailing brace is copied,
+ * we'll know it was truncated.
+ */
+ snprintf(dd->ipath_freezemsg,
+ dd->ipath_freezelen, "{%s}", msg);
+
+bail:;
+}
+
+/**
+ * ipath_ht_boardname - fill in the board name
+ * @dd: the infinipath device
+ * @name: the output buffer
+ * @namelen: the size of the output buffer
+ *
+ * fill in the board name, based on the board revision register
+ */
+static int ipath_ht_boardname(struct ipath_devdata *dd, char *name,
+ size_t namelen)
+{
+ char *n = NULL;
+ u8 boardrev = dd->ipath_boardrev;
+ int ret = 0;
+
+ switch (boardrev) {
+ case 5:
+ /*
+ * original production board; two production levels, with
+ * different serial number ranges. See ipath_ht_early_init() for
+ * case where we enable IPATH_GPIO_INTR for later serial # range.
+ * Original 112* serial number is no longer supported.
+ */
+ n = "InfiniPath_QHT7040";
+ break;
+ case 7:
+ /* small form factor production board */
+ n = "InfiniPath_QHT7140";
+ break;
+ default: /* don't know, just print the number */
+ ipath_dev_err(dd, "Don't yet know about board "
+ "with ID %u\n", boardrev);
+ snprintf(name, namelen, "Unknown_InfiniPath_QHT7xxx_%u",
+ boardrev);
+ break;
+ }
+ if (n)
+ snprintf(name, namelen, "%s", n);
+
+ if (ret) {
+ ipath_dev_err(dd, "Unsupported InfiniPath board %s!\n", name);
+ goto bail;
+ }
+ if (dd->ipath_majrev != 3 || (dd->ipath_minrev < 2 ||
+ dd->ipath_minrev > 4)) {
+ /*
+ * This version of the driver only supports Rev 3.2 - 3.4
+ */
+ ipath_dev_err(dd,
+ "Unsupported InfiniPath hardware revision %u.%u!\n",
+ dd->ipath_majrev, dd->ipath_minrev);
+ ret = 1;
+ goto bail;
+ }
+ /*
+ * pkt/word counters are 32 bit, and therefore wrap fast enough
+ * that we snapshot them from a timer, and maintain 64 bit shadow
+ * copies
+ */
+ dd->ipath_flags |= IPATH_32BITCOUNTERS;
+ dd->ipath_flags |= IPATH_GPIO_INTR;
+ if (dd->ipath_lbus_speed != 800)
+ ipath_dev_err(dd,
+ "Incorrectly configured for HT @ %uMHz\n",
+ dd->ipath_lbus_speed);
+
+ /*
+ * set here, not in ipath_init_*_funcs because we have to do
+ * it after we can read chip registers.
+ */
+ dd->ipath_ureg_align =
+ ipath_read_kreg32(dd, dd->ipath_kregs->kr_pagealign);
+
+bail:
+ return ret;
+}
+
+static void ipath_check_htlink(struct ipath_devdata *dd)
+{
+ u8 linkerr, link_off, i;
+
+ for (i = 0; i < 2; i++) {
+ link_off = dd->ipath_ht_slave_off + i * 4 + 0xd;
+ if (pci_read_config_byte(dd->pcidev, link_off, &linkerr))
+ dev_info(&dd->pcidev->dev, "Couldn't read "
+ "linkerror%d of HT slave/primary block\n",
+ i);
+ else if (linkerr & 0xf0) {
+			ipath_cdbg(VERBOSE, "HT linkerr%d bits 0x%x set, "
+				   "clearing\n", i, linkerr >> 4);
+ /*
+ * writing the linkerr bits that are set should
+ * clear them
+ */
+ if (pci_write_config_byte(dd->pcidev, link_off,
+ linkerr))
+ ipath_dbg("Failed write to clear HT "
+ "linkerror%d\n", i);
+ if (pci_read_config_byte(dd->pcidev, link_off,
+ &linkerr))
+ dev_info(&dd->pcidev->dev,
+ "Couldn't reread linkerror%d of "
+ "HT slave/primary block\n", i);
+ else if (linkerr & 0xf0)
+ dev_info(&dd->pcidev->dev,
+ "HT linkerror%d bits 0x%x "
+ "couldn't be cleared\n",
+ i, linkerr >> 4);
+ }
+ }
+}
+
+static int ipath_setup_ht_reset(struct ipath_devdata *dd)
+{
+ ipath_dbg("No reset possible for this InfiniPath hardware\n");
+ return 0;
+}
+
+#define HT_INTR_DISC_CONFIG 0x80 /* HT interrupt and discovery cap */
+#define HT_INTR_REG_INDEX 2 /* intconfig requires indirect accesses */
+
+/*
+ * Bits 13-15 of command==0 identify the slave/primary block. Clear any
+ * HT CRC errors. We only bother to do this at load time, because it's
+ * OK if it happened before we were loaded (first time after boot/reset),
+ * but any time after that, it's fatal anyway. We also must not check
+ * for upper byte errors if we are in 8 bit mode, so figure out
+ * our width. For now, at least, also complain if it's 8 bit.
+ */
+static void slave_or_pri_blk(struct ipath_devdata *dd, struct pci_dev *pdev,
+ int pos, u8 cap_type)
+{
+ u8 linkwidth = 0, linkerr, link_a_b_off, link_off;
+ u16 linkctrl = 0;
+ int i;
+
+ dd->ipath_ht_slave_off = pos;
+	/* command word, master_host bit: master host || slave */
+ if ((cap_type >> 2) & 1)
+ link_a_b_off = 4;
+ else
+ link_a_b_off = 0;
+ ipath_cdbg(VERBOSE, "HT%u (Link %c) connected to processor\n",
+ link_a_b_off ? 1 : 0,
+ link_a_b_off ? 'B' : 'A');
+
+ link_a_b_off += pos;
+
+ /*
+ * check both link control registers; clear both HT CRC sets if
+ * necessary.
+ */
+ for (i = 0; i < 2; i++) {
+ link_off = pos + i * 4 + 0x4;
+ if (pci_read_config_word(pdev, link_off, &linkctrl))
+ ipath_dev_err(dd, "Couldn't read HT link control%d "
+ "register\n", i);
+ else if (linkctrl & (0xf << 8)) {
+ ipath_cdbg(VERBOSE, "Clear linkctrl%d CRC Error "
+ "bits %x\n", i, linkctrl & (0xf << 8));
+ /*
+ * now write them back to clear the error.
+ */
+ pci_write_config_word(pdev, link_off,
+ linkctrl & (0xf << 8));
+ }
+ }
+
+ /*
+ * As with HT CRC bits, same for protocol errors that might occur
+ * during boot.
+ */
+ for (i = 0; i < 2; i++) {
+ link_off = pos + i * 4 + 0xd;
+ if (pci_read_config_byte(pdev, link_off, &linkerr))
+ dev_info(&pdev->dev, "Couldn't read linkerror%d "
+ "of HT slave/primary block\n", i);
+ else if (linkerr & 0xf0) {
+			ipath_cdbg(VERBOSE, "HT linkerr%d bits 0x%x set, "
+				   "clearing\n", i, linkerr >> 4);
+ /*
+ * writing the linkerr bits that are set will clear
+ * them
+ */
+			if (pci_write_config_byte(pdev, link_off,
+						  linkerr))
+ ipath_dbg("Failed write to clear HT "
+ "linkerror%d\n", i);
+ if (pci_read_config_byte(pdev, link_off, &linkerr))
+ dev_info(&pdev->dev, "Couldn't reread "
+ "linkerror%d of HT slave/primary "
+ "block\n", i);
+ else if (linkerr & 0xf0)
+ dev_info(&pdev->dev, "HT linkerror%d bits "
+ "0x%x couldn't be cleared\n",
+ i, linkerr >> 4);
+ }
+ }
+
+ /*
+ * this is just for our link to the host, not devices connected
+ * through tunnel.
+ */
+
+ if (pci_read_config_byte(pdev, link_a_b_off + 7, &linkwidth))
+ ipath_dev_err(dd, "Couldn't read HT link width "
+ "config register\n");
+ else {
+ u32 width;
+ switch (linkwidth & 7) {
+ case 5:
+ width = 4;
+ break;
+ case 4:
+ width = 2;
+ break;
+ case 3:
+ width = 32;
+ break;
+ case 1:
+ width = 16;
+ break;
+ case 0:
+ default: /* if wrong, assume 8 bit */
+ width = 8;
+ break;
+ }
+
+ dd->ipath_lbus_width = width;
+
+ if (linkwidth != 0x11) {
+ ipath_dev_err(dd, "Not configured for 16 bit HT "
+ "(%x)\n", linkwidth);
+ if (!(linkwidth & 0xf)) {
+ ipath_dbg("Will ignore HT lane1 errors\n");
+ dd->ipath_flags |= IPATH_8BIT_IN_HT0;
+ }
+ }
+ }
+
+ /*
+ * this is just for our link to the host, not devices connected
+ * through tunnel.
+ */
+ if (pci_read_config_byte(pdev, link_a_b_off + 0xd, &linkwidth))
+ ipath_dev_err(dd, "Couldn't read HT link frequency "
+ "config register\n");
+ else {
+ u32 speed;
+ switch (linkwidth & 0xf) {
+ case 6:
+ speed = 1000;
+ break;
+ case 5:
+ speed = 800;
+ break;
+ case 4:
+ speed = 600;
+ break;
+ case 3:
+ speed = 500;
+ break;
+ case 2:
+ speed = 400;
+ break;
+ case 1:
+ speed = 300;
+ break;
+ default:
+ /*
+ * assume reserved and vendor-specific are 200...
+ */
+ case 0:
+ speed = 200;
+ break;
+ }
+ dd->ipath_lbus_speed = speed;
+ }
+
+ snprintf(dd->ipath_lbus_info, sizeof(dd->ipath_lbus_info),
+ "HyperTransport,%uMHz,x%u\n",
+ dd->ipath_lbus_speed,
+ dd->ipath_lbus_width);
+}
+
+static int ipath_ht_intconfig(struct ipath_devdata *dd)
+{
+ int ret;
+
+ if (dd->ipath_intconfig) {
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_interruptconfig,
+ dd->ipath_intconfig); /* interrupt address */
+ ret = 0;
+ } else {
+ ipath_dev_err(dd, "No interrupts enabled, couldn't setup "
+ "interrupt address\n");
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static void ipath_ht_irq_update(struct pci_dev *dev, int irq,
+ struct ht_irq_msg *msg)
+{
+ struct ipath_devdata *dd = pci_get_drvdata(dev);
+ u64 prev_intconfig = dd->ipath_intconfig;
+
+ dd->ipath_intconfig = msg->address_lo;
+ dd->ipath_intconfig |= ((u64) msg->address_hi) << 32;
+
+ /*
+ * If the previous value of dd->ipath_intconfig is zero, we're
+ * getting configured for the first time, and must not program the
+ * intconfig register here (it will be programmed later, when the
+ * hardware is ready). Otherwise, we should.
+ */
+ if (prev_intconfig)
+ ipath_ht_intconfig(dd);
+}
+
+/**
+ * ipath_setup_ht_config - setup the interruptconfig register
+ * @dd: the infinipath device
+ * @pdev: the PCI device
+ *
+ * setup the interruptconfig register from the HT config info.
+ * Also clear CRC errors in HT linkcontrol, if necessary.
+ * This is done only for the real hardware. It is done before
+ * chip address space is initted, so can't touch infinipath registers
+ */
+static int ipath_setup_ht_config(struct ipath_devdata *dd,
+ struct pci_dev *pdev)
+{
+ int pos, ret;
+
+ ret = __ht_create_irq(pdev, 0, ipath_ht_irq_update);
+ if (ret < 0) {
+ ipath_dev_err(dd, "Couldn't create interrupt handler: "
+ "err %d\n", ret);
+ goto bail;
+ }
+ dd->ipath_irq = ret;
+ ret = 0;
+
+ /*
+ * Handle clearing CRC errors in linkctrl register if necessary. We
+ * do this early, before we ever enable errors or hardware errors,
+ * mostly to avoid causing the chip to enter freeze mode.
+ */
+ pos = pci_find_capability(pdev, PCI_CAP_ID_HT);
+ if (!pos) {
+ ipath_dev_err(dd, "Couldn't find HyperTransport "
+ "capability; no interrupts\n");
+ ret = -ENODEV;
+ goto bail;
+ }
+ do {
+ u8 cap_type;
+
+ /*
+ * The HT capability type byte is 3 bytes after the
+ * capability byte.
+ */
+ if (pci_read_config_byte(pdev, pos + 3, &cap_type)) {
+ dev_info(&pdev->dev, "Couldn't read config "
+ "command @ %d\n", pos);
+ continue;
+ }
+ if (!(cap_type & 0xE0))
+ slave_or_pri_blk(dd, pdev, pos, cap_type);
+ } while ((pos = pci_find_next_capability(pdev, pos,
+ PCI_CAP_ID_HT)));
+
+ dd->ipath_flags |= IPATH_SWAP_PIOBUFS;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_setup_ht_cleanup - clean up any chip-specific stuff
+ * @dd: the infinipath device
+ *
+ * Called during driver unload.
+ * This is currently a nop for the HT chip; other chips may need real cleanup.
+ */
+static void ipath_setup_ht_cleanup(struct ipath_devdata *dd)
+{
+}
+
+/**
+ * ipath_setup_ht_setextled - set the state of the two external LEDs
+ * @dd: the infinipath device
+ * @lst: the L state
+ * @ltst: the LT state
+ *
+ * Set the state of the two external LEDs, to indicate physical and
+ * logical state of IB link. For this chip (at least with recommended
+ * board pinouts), LED1 is Green (physical state), and LED2 is Yellow
+ * (logical state)
+ *
+ * Note: We try to match the Mellanox HCA LED behavior as best
+ * we can. Green indicates physical link state is OK (something is
+ * plugged in, and we can train).
+ * Amber indicates the link is logically up (ACTIVE).
+ * Mellanox further blinks the amber LED to indicate data packet
+ * activity, but we have no hardware support for that, so it would
+ * require waking up every 10-20 msecs and checking the counters
+ * on the chip, and then turning the LED off if appropriate. That's
+ * visible overhead, so not something we will do.
+ *
+ */
+static void ipath_setup_ht_setextled(struct ipath_devdata *dd,
+ u64 lst, u64 ltst)
+{
+ u64 extctl;
+ unsigned long flags = 0;
+
+ /* the diags use the LED to indicate diag info, so we leave
+ * the external LED alone when the diags are running */
+ if (ipath_diag_inuse)
+ return;
+
+	/* Allow override of LED display for, e.g., locating the system in a rack */
+ if (dd->ipath_led_override) {
+ ltst = (dd->ipath_led_override & IPATH_LED_PHYS)
+ ? INFINIPATH_IBCS_LT_STATE_LINKUP
+ : INFINIPATH_IBCS_LT_STATE_DISABLED;
+ lst = (dd->ipath_led_override & IPATH_LED_LOG)
+ ? INFINIPATH_IBCS_L_STATE_ACTIVE
+ : INFINIPATH_IBCS_L_STATE_DOWN;
+ }
+
+ spin_lock_irqsave(&dd->ipath_gpio_lock, flags);
+ /*
+ * start by setting both LED control bits to off, then turn
+ * on the appropriate bit(s).
+ */
+ if (dd->ipath_boardrev == 8) { /* LS/X-1 uses different pins */
+ /*
+ * major difference is that INFINIPATH_EXTC_LEDGBLERR_OFF
+ * is inverted, because it is normally used to indicate
+ * a hardware fault at reset, if there were errors
+ */
+ extctl = (dd->ipath_extctrl & ~INFINIPATH_EXTC_LEDGBLOK_ON)
+ | INFINIPATH_EXTC_LEDGBLERR_OFF;
+ if (ltst == INFINIPATH_IBCS_LT_STATE_LINKUP)
+ extctl &= ~INFINIPATH_EXTC_LEDGBLERR_OFF;
+ if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE)
+ extctl |= INFINIPATH_EXTC_LEDGBLOK_ON;
+	} else {
+ extctl = dd->ipath_extctrl &
+ ~(INFINIPATH_EXTC_LED1PRIPORT_ON |
+ INFINIPATH_EXTC_LED2PRIPORT_ON);
+ if (ltst == INFINIPATH_IBCS_LT_STATE_LINKUP)
+ extctl |= INFINIPATH_EXTC_LED1PRIPORT_ON;
+ if (lst == INFINIPATH_IBCS_L_STATE_ACTIVE)
+ extctl |= INFINIPATH_EXTC_LED2PRIPORT_ON;
+ }
+ dd->ipath_extctrl = extctl;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_extctrl, extctl);
+ spin_unlock_irqrestore(&dd->ipath_gpio_lock, flags);
+}
+
+static void ipath_init_ht_variables(struct ipath_devdata *dd)
+{
+ /*
+ * setup the register offsets, since they are different for each
+ * chip
+ */
+ dd->ipath_kregs = &ipath_ht_kregs;
+ dd->ipath_cregs = &ipath_ht_cregs;
+
+ dd->ipath_gpio_sda_num = _IPATH_GPIO_SDA_NUM;
+ dd->ipath_gpio_scl_num = _IPATH_GPIO_SCL_NUM;
+ dd->ipath_gpio_sda = IPATH_GPIO_SDA;
+ dd->ipath_gpio_scl = IPATH_GPIO_SCL;
+
+ /*
+ * Fill in data for field-values that change in newer chips.
+ * We dynamically specify only the mask for LINKTRAININGSTATE
+ * and only the shift for LINKSTATE, as they are the only ones
+ * that change. Also precalculate the 3 link states of interest
+ * and the combined mask.
+ */
+ dd->ibcs_ls_shift = IBA6110_IBCS_LINKSTATE_SHIFT;
+ dd->ibcs_lts_mask = IBA6110_IBCS_LINKTRAININGSTATE_MASK;
+ dd->ibcs_mask = (INFINIPATH_IBCS_LINKSTATE_MASK <<
+ dd->ibcs_ls_shift) | dd->ibcs_lts_mask;
+ dd->ib_init = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
+ INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
+ (INFINIPATH_IBCS_L_STATE_INIT << dd->ibcs_ls_shift);
+ dd->ib_arm = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
+ INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
+ (INFINIPATH_IBCS_L_STATE_ARM << dd->ibcs_ls_shift);
+ dd->ib_active = (INFINIPATH_IBCS_LT_STATE_LINKUP <<
+ INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) |
+ (INFINIPATH_IBCS_L_STATE_ACTIVE << dd->ibcs_ls_shift);
+
+ /*
+ * Fill in data for ibcc field-values that change in newer chips.
+ * We dynamically specify only the mask for LINKINITCMD
+ * and only the shift for LINKCMD and MAXPKTLEN, as they are
+ * the only ones that change.
+ */
+ dd->ibcc_lic_mask = INFINIPATH_IBCC_LINKINITCMD_MASK;
+ dd->ibcc_lc_shift = INFINIPATH_IBCC_LINKCMD_SHIFT;
+ dd->ibcc_mpl_shift = INFINIPATH_IBCC_MAXPKTLEN_SHIFT;
+
+ /* Fill in shifts for RcvCtrl. */
+ dd->ipath_r_portenable_shift = INFINIPATH_R_PORTENABLE_SHIFT;
+ dd->ipath_r_intravail_shift = INFINIPATH_R_INTRAVAIL_SHIFT;
+ dd->ipath_r_tailupd_shift = INFINIPATH_R_TAILUPD_SHIFT;
+ dd->ipath_r_portcfg_shift = 0; /* Not on IBA6110 */
+
+ dd->ipath_i_bitsextant =
+ (INFINIPATH_I_RCVURG_MASK << INFINIPATH_I_RCVURG_SHIFT) |
+ (INFINIPATH_I_RCVAVAIL_MASK <<
+ INFINIPATH_I_RCVAVAIL_SHIFT) |
+ INFINIPATH_I_ERROR | INFINIPATH_I_SPIOSENT |
+ INFINIPATH_I_SPIOBUFAVAIL | INFINIPATH_I_GPIO;
+
+ dd->ipath_e_bitsextant =
+ INFINIPATH_E_RFORMATERR | INFINIPATH_E_RVCRC |
+ INFINIPATH_E_RICRC | INFINIPATH_E_RMINPKTLEN |
+ INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RLONGPKTLEN |
+ INFINIPATH_E_RSHORTPKTLEN | INFINIPATH_E_RUNEXPCHAR |
+ INFINIPATH_E_RUNSUPVL | INFINIPATH_E_REBP |
+ INFINIPATH_E_RIBFLOW | INFINIPATH_E_RBADVERSION |
+ INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
+ INFINIPATH_E_RBADTID | INFINIPATH_E_RHDRLEN |
+ INFINIPATH_E_RHDR | INFINIPATH_E_RIBLOSTLINK |
+ INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SMAXPKTLEN |
+ INFINIPATH_E_SUNDERRUN | INFINIPATH_E_SPKTLEN |
+ INFINIPATH_E_SDROPPEDSMPPKT | INFINIPATH_E_SDROPPEDDATAPKT |
+ INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM |
+ INFINIPATH_E_SUNSUPVL | INFINIPATH_E_IBSTATUSCHANGED |
+ INFINIPATH_E_INVALIDADDR | INFINIPATH_E_RESET |
+ INFINIPATH_E_HARDWARE;
+
+ dd->ipath_hwe_bitsextant =
+ (INFINIPATH_HWE_HTCMEMPARITYERR_MASK <<
+ INFINIPATH_HWE_HTCMEMPARITYERR_SHIFT) |
+ (INFINIPATH_HWE_TXEMEMPARITYERR_MASK <<
+ INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) |
+ (INFINIPATH_HWE_RXEMEMPARITYERR_MASK <<
+ INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) |
+ INFINIPATH_HWE_HTCLNKABYTE0CRCERR |
+ INFINIPATH_HWE_HTCLNKABYTE1CRCERR |
+ INFINIPATH_HWE_HTCLNKBBYTE0CRCERR |
+ INFINIPATH_HWE_HTCLNKBBYTE1CRCERR |
+ INFINIPATH_HWE_HTCMISCERR4 |
+ INFINIPATH_HWE_HTCMISCERR5 | INFINIPATH_HWE_HTCMISCERR6 |
+ INFINIPATH_HWE_HTCMISCERR7 |
+ INFINIPATH_HWE_HTCBUSTREQPARITYERR |
+ INFINIPATH_HWE_HTCBUSTRESPPARITYERR |
+ INFINIPATH_HWE_HTCBUSIREQPARITYERR |
+ INFINIPATH_HWE_RXDSYNCMEMPARITYERR |
+ INFINIPATH_HWE_MEMBISTFAILED |
+ INFINIPATH_HWE_COREPLL_FBSLIP |
+ INFINIPATH_HWE_COREPLL_RFSLIP |
+ INFINIPATH_HWE_HTBPLL_FBSLIP |
+ INFINIPATH_HWE_HTBPLL_RFSLIP |
+ INFINIPATH_HWE_HTAPLL_FBSLIP |
+ INFINIPATH_HWE_HTAPLL_RFSLIP |
+ INFINIPATH_HWE_SERDESPLLFAILED |
+ INFINIPATH_HWE_IBCBUSTOSPCPARITYERR |
+ INFINIPATH_HWE_IBCBUSFRSPCPARITYERR;
+
+ dd->ipath_i_rcvavail_mask = INFINIPATH_I_RCVAVAIL_MASK;
+ dd->ipath_i_rcvurg_mask = INFINIPATH_I_RCVURG_MASK;
+ dd->ipath_i_rcvavail_shift = INFINIPATH_I_RCVAVAIL_SHIFT;
+ dd->ipath_i_rcvurg_shift = INFINIPATH_I_RCVURG_SHIFT;
+
+ /*
+ * EEPROM error log 0 is TXE Parity errors. 1 is RXE Parity.
+ * 2 is Some Misc, 3 is reserved for future.
+ */
+ dd->ipath_eep_st_masks[0].hwerrs_to_log =
+ INFINIPATH_HWE_TXEMEMPARITYERR_MASK <<
+ INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT;
+
+ dd->ipath_eep_st_masks[1].hwerrs_to_log =
+ INFINIPATH_HWE_RXEMEMPARITYERR_MASK <<
+ INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT;
+
+ dd->ipath_eep_st_masks[2].errs_to_log = INFINIPATH_E_RESET;
+
+ dd->delay_mult = 2; /* SDR, 4X, can't change */
+
+ dd->ipath_link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X;
+ dd->ipath_link_speed_supported = IPATH_IB_SDR;
+ dd->ipath_link_width_enabled = IB_WIDTH_4X;
+ dd->ipath_link_speed_enabled = dd->ipath_link_speed_supported;
+ /* these can't change for this chip, so set once */
+ dd->ipath_link_width_active = dd->ipath_link_width_enabled;
+ dd->ipath_link_speed_active = dd->ipath_link_speed_enabled;
+}
+
+/**
+ * ipath_ht_init_hwerrors - enable hardware errors
+ * @dd: the infinipath device
+ *
+ * now that we have finished initializing everything that might reasonably
+ * cause a hardware error, and cleared those errors bits as they occur,
+ * we can enable hardware errors in the mask (potentially enabling
+ * freeze mode), and enable hardware errors as errors (along with
+ * everything else) in errormask
+ */
+static void ipath_ht_init_hwerrors(struct ipath_devdata *dd)
+{
+ ipath_err_t val;
+ u64 extsval;
+
+ extsval = ipath_read_kreg64(dd, dd->ipath_kregs->kr_extstatus);
+
+ if (!(extsval & INFINIPATH_EXTS_MEMBIST_ENDTEST))
+ ipath_dev_err(dd, "MemBIST did not complete!\n");
+ if (extsval & INFINIPATH_EXTS_MEMBIST_CORRECT)
+ ipath_dbg("MemBIST corrected\n");
+
+ ipath_check_htlink(dd);
+
+	/* barring bugs, all hwerrors become interrupts, which can then be handled */
+ val = -1LL;
+	/* don't look at crc lane1 if 8 bit in HT0 (link A) */
+	if (dd->ipath_flags & IPATH_8BIT_IN_HT0)
+		val &= ~infinipath_hwe_htclnkabyte1crcerr;
+	/* don't look at crc lane1 if 8 bit in HT1 (link B) */
+	if (dd->ipath_flags & IPATH_8BIT_IN_HT1)
+		val &= ~infinipath_hwe_htclnkbbyte1crcerr;
+
+ /*
+ * disable RXDSYNCMEMPARITY because external serdes is unused,
+ * and therefore the logic will never be used or initialized,
+ * and uninitialized state will normally result in this error
+	 * being asserted. Similarly for the external serdes pll
+ * lock signal.
+ */
+ val &= ~(INFINIPATH_HWE_SERDESPLLFAILED |
+ INFINIPATH_HWE_RXDSYNCMEMPARITYERR);
+
+ /*
+ * Disable MISCERR4 because of an inversion in the HT core
+ * logic checking for errors that cause this bit to be set.
+ * The errata can also cause the protocol error bit to be set
+ * in the HT config space linkerror register(s).
+ */
+ val &= ~INFINIPATH_HWE_HTCMISCERR4;
+
+ /*
+ * PLL ignored because unused MDIO interface has a logic problem
+ */
+ if (dd->ipath_boardrev == 4 || dd->ipath_boardrev == 9)
+ val &= ~INFINIPATH_HWE_SERDESPLLFAILED;
+ dd->ipath_hwerrmask = val;
+}
+
+
+
+
+/**
+ * ipath_ht_bringup_serdes - bring up the serdes
+ * @dd: the infinipath device
+ */
+static int ipath_ht_bringup_serdes(struct ipath_devdata *dd)
+{
+ u64 val, config1;
+ int ret = 0, change = 0;
+
+ ipath_dbg("Trying to bringup serdes\n");
+
+	if (ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus) &
+	    INFINIPATH_HWE_SERDESPLLFAILED) {
+ ipath_dbg("At start, serdes PLL failed bit set in "
+ "hwerrstatus, clearing and continuing\n");
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+ INFINIPATH_HWE_SERDESPLLFAILED);
+ }
+
+ val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
+ config1 = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig1);
+
+ ipath_cdbg(VERBOSE, "Initial serdes status is config0=%llx "
+ "config1=%llx, sstatus=%llx xgxs %llx\n",
+ (unsigned long long) val, (unsigned long long) config1,
+ (unsigned long long)
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus),
+ (unsigned long long)
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig));
+
+ /* force reset on */
+ val |= INFINIPATH_SERDC0_RESET_PLL
+ /* | INFINIPATH_SERDC0_RESET_MASK */
+ ;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val);
+ udelay(15); /* need pll reset set at least for a bit */
+
+ if (val & INFINIPATH_SERDC0_RESET_PLL) {
+ u64 val2 = val &= ~INFINIPATH_SERDC0_RESET_PLL;
+ /* set lane resets, and tx idle, during pll reset */
+ val2 |= INFINIPATH_SERDC0_RESET_MASK |
+ INFINIPATH_SERDC0_TXIDLE;
+ ipath_cdbg(VERBOSE, "Clearing serdes PLL reset (writing "
+ "%llx)\n", (unsigned long long) val2);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0,
+ val2);
+ /*
+ * be sure chip saw it
+ */
+ val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ /*
+ * need pll reset clear at least 11 usec before lane
+ * resets cleared; give it a few more
+ */
+ udelay(15);
+ val = val2; /* for check below */
+ }
+
+ if (val & (INFINIPATH_SERDC0_RESET_PLL |
+ INFINIPATH_SERDC0_RESET_MASK |
+ INFINIPATH_SERDC0_TXIDLE)) {
+ val &= ~(INFINIPATH_SERDC0_RESET_PLL |
+ INFINIPATH_SERDC0_RESET_MASK |
+ INFINIPATH_SERDC0_TXIDLE);
+ /* clear them */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0,
+ val);
+ }
+
+ val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
+ if (val & INFINIPATH_XGXS_RESET) {
+ /* normally true after boot */
+ val &= ~INFINIPATH_XGXS_RESET;
+ change = 1;
+ }
+	if (((val >> INFINIPATH_XGXS_RX_POL_SHIFT) &
+	     INFINIPATH_XGXS_RX_POL_MASK) != dd->ipath_rx_pol_inv) {
+ /* need to compensate for Tx inversion in partner */
+ val &= ~(INFINIPATH_XGXS_RX_POL_MASK <<
+ INFINIPATH_XGXS_RX_POL_SHIFT);
+ val |= dd->ipath_rx_pol_inv <<
+ INFINIPATH_XGXS_RX_POL_SHIFT;
+ change = 1;
+ }
+ if (change)
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
+
+ val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
+
+ /* clear current and de-emphasis bits */
+ config1 &= ~0x0ffffffff00ULL;
+ /* set current to 20ma */
+ config1 |= 0x00000000000ULL;
+ /* set de-emphasis to -5.68dB */
+ config1 |= 0x0cccc000000ULL;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig1, config1);
+
+ ipath_cdbg(VERBOSE, "After setup: serdes status is config0=%llx "
+ "config1=%llx, sstatus=%llx xgxs %llx\n",
+ (unsigned long long) val, (unsigned long long) config1,
+ (unsigned long long)
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesstatus),
+ (unsigned long long)
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig));
+
+ return ret; /* for now, say we always succeeded */
+}
+
+/**
+ * ipath_ht_quiet_serdes - set serdes to txidle
+ * @dd: the infinipath device
+ *
+ * Called when the driver is being unloaded.
+ */
+static void ipath_ht_quiet_serdes(struct ipath_devdata *dd)
+{
+ u64 val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_serdesconfig0);
+
+ val |= INFINIPATH_SERDC0_TXIDLE;
+ ipath_dbg("Setting TxIdleEn on serdes (config0 = %llx)\n",
+ (unsigned long long) val);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_serdesconfig0, val);
+}
+
+/**
+ * ipath_ht_put_tid - write a TID in chip
+ * @dd: the infinipath device
+ * @tidptr: pointer to the expected TID (in chip) to update
+ * @type: RCVHQ_RCV_TYPE_EAGER (1) for eager, RCVHQ_RCV_TYPE_EXPECTED (0) for expected
+ * @pa: physical address of in memory buffer; ipath_tidinvalid if freeing
+ *
+ * This exists as a separate routine to allow for special locking etc.
+ * It's used for both the full cleanup on exit, as well as the normal
+ * setup and teardown.
+ */
+static void ipath_ht_put_tid(struct ipath_devdata *dd,
+ u64 __iomem *tidptr, u32 type,
+ unsigned long pa)
+{
+ if (!dd->ipath_kregbase)
+ return;
+
+ if (pa != dd->ipath_tidinvalid) {
+ if (unlikely((pa & ~INFINIPATH_RT_ADDR_MASK))) {
+ dev_info(&dd->pcidev->dev,
+ "physaddr %lx has more than "
+ "40 bits, using only 40!!!\n", pa);
+ pa &= INFINIPATH_RT_ADDR_MASK;
+ }
+ if (type == RCVHQ_RCV_TYPE_EAGER)
+ pa |= dd->ipath_tidtemplate;
+ else {
+ /* in words (fixed, full page). */
+ u64 lenvalid = PAGE_SIZE >> 2;
+ lenvalid <<= INFINIPATH_RT_BUFSIZE_SHIFT;
+ pa |= lenvalid | INFINIPATH_RT_VALID;
+ }
+ }
+
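+	/*
+	 * The TID entry is a single 64-bit word: the (at most 40-bit)
+	 * physical address, ORed with the buffer length at
+	 * INFINIPATH_RT_BUFSIZE_SHIFT and the valid bit (eager TIDs use
+	 * the precomputed ipath_tidtemplate instead), so the one write
+	 * below updates the whole entry in the chip.
+	 */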
+ writeq(pa, tidptr);
+}
+
+
+/**
+ * ipath_ht_clear_tids - clear all TID entries for a port, expected and eager
+ * @dd: the infinipath device
+ * @port: the port
+ *
+ * Used from ipath_close(), and at chip initialization.
+ */
+static void ipath_ht_clear_tids(struct ipath_devdata *dd, unsigned port)
+{
+ u64 __iomem *tidbase;
+ int i;
+
+ if (!dd->ipath_kregbase)
+ return;
+
+ ipath_cdbg(VERBOSE, "Invalidate TIDs for port %u\n", port);
+
+ /*
+ * need to invalidate all of the expected TID entries for this
+ * port, so we don't have valid entries that might somehow get
+ * used (early in next use of this port, or through some bug)
+ */
+ tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
+ dd->ipath_rcvtidbase +
+ port * dd->ipath_rcvtidcnt *
+ sizeof(*tidbase));
+ for (i = 0; i < dd->ipath_rcvtidcnt; i++)
+ ipath_ht_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EXPECTED,
+ dd->ipath_tidinvalid);
+
+ tidbase = (u64 __iomem *) ((char __iomem *)(dd->ipath_kregbase) +
+ dd->ipath_rcvegrbase +
+ port * dd->ipath_rcvegrcnt *
+ sizeof(*tidbase));
+
+ for (i = 0; i < dd->ipath_rcvegrcnt; i++)
+ ipath_ht_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EAGER,
+ dd->ipath_tidinvalid);
+}
+
+/**
+ * ipath_ht_tidtemplate - setup constants for TID updates
+ * @dd: the infinipath device
+ *
+ * We setup stuff that we use a lot, to avoid calculating each time
+ */
+static void ipath_ht_tidtemplate(struct ipath_devdata *dd)
+{
+ dd->ipath_tidtemplate = dd->ipath_ibmaxlen >> 2;
+ dd->ipath_tidtemplate <<= INFINIPATH_RT_BUFSIZE_SHIFT;
+ dd->ipath_tidtemplate |= INFINIPATH_RT_VALID;
+
+ /*
+ * work around chip errata bug 7358, by marking invalid tids
+ * as having max length
+ */
+ dd->ipath_tidinvalid = (-1LL & INFINIPATH_RT_BUFSIZE_MASK) <<
+ INFINIPATH_RT_BUFSIZE_SHIFT;
+}
+
+static int ipath_ht_early_init(struct ipath_devdata *dd)
+{
+ u32 __iomem *piobuf;
+ u32 pioincr, val32;
+ int i;
+
+ /*
+ * one cache line; long IB headers will spill over into received
+ * buffer
+ */
+ dd->ipath_rcvhdrentsize = 16;
+ dd->ipath_rcvhdrsize = IPATH_DFLT_RCVHDRSIZE;
+
+ /*
+ * For HT, we allocate a somewhat overly large eager buffer,
+ * such that we can guarantee that we can receive the largest
+ * packet that we can send out. To truly support a 4KB MTU,
+ * we need to bump this to a large value. To date, other than
+ * testing, we have never encountered an HCA that can really
+ * send 4KB MTU packets, so we do not handle that (we'll get
+ * error interrupts if we ever see one).
+ */
+ dd->ipath_rcvegrbufsize = dd->ipath_piosize2k;
+
+ /*
+ * the min() check here is currently a nop, but it may not
+ * always be, depending on just how we do ipath_rcvegrbufsize
+ */
+ dd->ipath_ibmaxlen = min(dd->ipath_piosize2k,
+ dd->ipath_rcvegrbufsize);
+ dd->ipath_init_ibmaxlen = dd->ipath_ibmaxlen;
+ ipath_ht_tidtemplate(dd);
+
+ /*
+ * zero all the TID entries at startup. We do this for sanity,
+ * in case of a previous driver crash of some kind, and also
+ * because the chip powers up with these memories in an unknown
+ * state. Use portcnt, not cfgports, since this is for the
+ * full chip, not for current (possibly different) configuration
+ * value.
+ * Chip Errata bug 6447
+ */
+ for (val32 = 0; val32 < dd->ipath_portcnt; val32++)
+ ipath_ht_clear_tids(dd, val32);
+
+ /*
+ * write the pbc of each buffer, to be sure it's initialized, then
+ * cancel all the buffers, and also abort any packets that might
+ * have been in flight for some reason (the latter is for driver
+ * unload/reload, but isn't a bad idea at first init). PIO send
+ * isn't enabled at this point, so there is no danger of sending
+ * these out on the wire.
+ * Chip Errata bug 6610
+ */
+ piobuf = (u32 __iomem *) (((char __iomem *)(dd->ipath_kregbase)) +
+ dd->ipath_piobufbase);
+ pioincr = dd->ipath_palign / sizeof(*piobuf);
+ for (i = 0; i < dd->ipath_piobcnt2k; i++) {
+ /*
+ * reasonable word count, just to init pbc
+ */
+ writel(16, piobuf);
+ piobuf += pioincr;
+ }
+
+ ipath_get_eeprom_info(dd);
+ if (dd->ipath_boardrev == 5) {
+ /*
+ * Later production QHT7040 has same changes as QHT7140, so
+ * can use GPIO interrupts. They have serial #'s starting
+ * with 128, rather than 112.
+ */
+ if (dd->ipath_serial[0] == '1' &&
+ dd->ipath_serial[1] == '2' &&
+ dd->ipath_serial[2] == '8')
+ dd->ipath_flags |= IPATH_GPIO_INTR;
+ else {
+ ipath_dev_err(dd, "Unsupported InfiniPath board "
+ "(serial number %.16s)!\n",
+ dd->ipath_serial);
+ return 1;
+ }
+ }
+
+ if (dd->ipath_minrev >= 4) {
+ /* Rev4+ reports extra errors via internal GPIO pins */
+ dd->ipath_flags |= IPATH_GPIO_ERRINTRS;
+ dd->ipath_gpio_mask |= IPATH_GPIO_ERRINTR_MASK;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
+ dd->ipath_gpio_mask);
+ }
+
+ return 0;
+}
+
+
+/**
+ * ipath_init_ht_get_base_info - set chip-specific flags for user code
+ * @dd: the infinipath device
+ * @kbase: ipath_base_info pointer
+ *
+ * We set the PCIE flag because the lower bandwidth on PCIe vs
+ * HyperTransport can affect some user packet algorithms.
+ */
+static int ipath_ht_get_base_info(struct ipath_portdata *pd, void *kbase)
+{
+ struct ipath_base_info *kinfo = kbase;
+
+ kinfo->spi_runtime_flags |= IPATH_RUNTIME_HT |
+ IPATH_RUNTIME_PIO_REGSWAPPED;
+
+ if (pd->port_dd->ipath_minrev < 4)
+ kinfo->spi_runtime_flags |= IPATH_RUNTIME_RCVHDR_COPY;
+
+ return 0;
+}
+
+static void ipath_ht_free_irq(struct ipath_devdata *dd)
+{
+ free_irq(dd->ipath_irq, dd);
+ ht_destroy_irq(dd->ipath_irq);
+ dd->ipath_irq = 0;
+ dd->ipath_intconfig = 0;
+}
+
+static struct ipath_message_header *
+ipath_ht_get_msgheader(struct ipath_devdata *dd, __le32 *rhf_addr)
+{
+ return (struct ipath_message_header *)
+ &rhf_addr[sizeof(u64) / sizeof(u32)];
+}
+
+static void ipath_ht_config_ports(struct ipath_devdata *dd, ushort cfgports)
+{
+ dd->ipath_portcnt =
+ ipath_read_kreg32(dd, dd->ipath_kregs->kr_portcnt);
+ dd->ipath_p0_rcvegrcnt =
+ ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt);
+}
+
+static void ipath_ht_read_counters(struct ipath_devdata *dd,
+ struct infinipath_counters *cntrs)
+{
+ cntrs->LBIntCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(LBIntCnt));
+ cntrs->LBFlowStallCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(LBFlowStallCnt));
+ cntrs->TxSDmaDescCnt = 0;
+ cntrs->TxUnsupVLErrCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxUnsupVLErrCnt));
+ cntrs->TxDataPktCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDataPktCnt));
+ cntrs->TxFlowPktCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxFlowPktCnt));
+ cntrs->TxDwordCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDwordCnt));
+ cntrs->TxLenErrCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxLenErrCnt));
+ cntrs->TxMaxMinLenErrCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxMaxMinLenErrCnt));
+ cntrs->TxUnderrunCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxUnderrunCnt));
+ cntrs->TxFlowStallCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxFlowStallCnt));
+ cntrs->TxDroppedPktCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(TxDroppedPktCnt));
+ cntrs->RxDroppedPktCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDroppedPktCnt));
+ cntrs->RxDataPktCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDataPktCnt));
+ cntrs->RxFlowPktCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxFlowPktCnt));
+ cntrs->RxDwordCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxDwordCnt));
+ cntrs->RxLenErrCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLenErrCnt));
+ cntrs->RxMaxMinLenErrCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxMaxMinLenErrCnt));
+ cntrs->RxICRCErrCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxICRCErrCnt));
+ cntrs->RxVCRCErrCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxVCRCErrCnt));
+ cntrs->RxFlowCtrlErrCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxFlowCtrlErrCnt));
+ cntrs->RxBadFormatCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxBadFormatCnt));
+ cntrs->RxLinkProblemCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLinkProblemCnt));
+ cntrs->RxEBPCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxEBPCnt));
+ cntrs->RxLPCRCErrCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxLPCRCErrCnt));
+ cntrs->RxBufOvflCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxBufOvflCnt));
+ cntrs->RxTIDFullErrCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxTIDFullErrCnt));
+ cntrs->RxTIDValidErrCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxTIDValidErrCnt));
+ cntrs->RxPKeyMismatchCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxPKeyMismatchCnt));
+ cntrs->RxP0HdrEgrOvflCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP0HdrEgrOvflCnt));
+ cntrs->RxP1HdrEgrOvflCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP1HdrEgrOvflCnt));
+ cntrs->RxP2HdrEgrOvflCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP2HdrEgrOvflCnt));
+ cntrs->RxP3HdrEgrOvflCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP3HdrEgrOvflCnt));
+ cntrs->RxP4HdrEgrOvflCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP4HdrEgrOvflCnt));
+ cntrs->RxP5HdrEgrOvflCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP5HdrEgrOvflCnt));
+ cntrs->RxP6HdrEgrOvflCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP6HdrEgrOvflCnt));
+ cntrs->RxP7HdrEgrOvflCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP7HdrEgrOvflCnt));
+ cntrs->RxP8HdrEgrOvflCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(RxP8HdrEgrOvflCnt));
+ cntrs->RxP9HdrEgrOvflCnt = 0;
+ cntrs->RxP10HdrEgrOvflCnt = 0;
+ cntrs->RxP11HdrEgrOvflCnt = 0;
+ cntrs->RxP12HdrEgrOvflCnt = 0;
+ cntrs->RxP13HdrEgrOvflCnt = 0;
+ cntrs->RxP14HdrEgrOvflCnt = 0;
+ cntrs->RxP15HdrEgrOvflCnt = 0;
+ cntrs->RxP16HdrEgrOvflCnt = 0;
+ cntrs->IBStatusChangeCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBStatusChangeCnt));
+ cntrs->IBLinkErrRecoveryCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBLinkErrRecoveryCnt));
+ cntrs->IBLinkDownedCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBLinkDownedCnt));
+ cntrs->IBSymbolErrCnt =
+ ipath_snap_cntr(dd, IPATH_CREG_OFFSET(IBSymbolErrCnt));
+ cntrs->RxVL15DroppedPktCnt = 0;
+ cntrs->RxOtherLocalPhyErrCnt = 0;
+ cntrs->PcieRetryBufDiagQwordCnt = 0;
+ cntrs->ExcessBufferOvflCnt = dd->ipath_overrun_thresh_errs;
+ cntrs->LocalLinkIntegrityErrCnt =
+ (dd->ipath_flags & IPATH_GPIO_ERRINTRS) ?
+ dd->ipath_lli_errs : dd->ipath_lli_errors;
+ cntrs->RxVlErrCnt = 0;
+ cntrs->RxDlidFltrCnt = 0;
+}
+
+
+/* no interrupt fallback for these chips */
+static int ipath_ht_nointr_fallback(struct ipath_devdata *dd)
+{
+ return 0;
+}
+
+
+/*
+ * reset the XGXS (between serdes and IBC). Slightly less intrusive
+ * than resetting the IBC or external link state, and useful in some
+ * cases to cause some retraining. To do this right, we reset IBC
+ * as well.
+ */
+static void ipath_ht_xgxs_reset(struct ipath_devdata *dd)
+{
+ u64 val, prev_val;
+
+ prev_val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_xgxsconfig);
+ val = prev_val | INFINIPATH_XGXS_RESET;
+ prev_val &= ~INFINIPATH_XGXS_RESET; /* be sure */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+ dd->ipath_control & ~INFINIPATH_C_LINKENABLE);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, val);
+ ipath_read_kreg32(dd, dd->ipath_kregs->kr_scratch);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_xgxsconfig, prev_val);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+ dd->ipath_control);
+}
+
+
+static int ipath_ht_get_ib_cfg(struct ipath_devdata *dd, int which)
+{
+ int ret;
+
+ switch (which) {
+ case IPATH_IB_CFG_LWID:
+ ret = dd->ipath_link_width_active;
+ break;
+ case IPATH_IB_CFG_SPD:
+ ret = dd->ipath_link_speed_active;
+ break;
+ case IPATH_IB_CFG_LWID_ENB:
+ ret = dd->ipath_link_width_enabled;
+ break;
+ case IPATH_IB_CFG_SPD_ENB:
+ ret = dd->ipath_link_speed_enabled;
+ break;
+ default:
+ ret = -ENOTSUPP;
+ break;
+ }
+ return ret;
+}
+
+
+/* we assume range checking is already done, if needed */
+static int ipath_ht_set_ib_cfg(struct ipath_devdata *dd, int which, u32 val)
+{
+ int ret = 0;
+
+ if (which == IPATH_IB_CFG_LWID_ENB)
+ dd->ipath_link_width_enabled = val;
+ else if (which == IPATH_IB_CFG_SPD_ENB)
+ dd->ipath_link_speed_enabled = val;
+ else
+ ret = -ENOTSUPP;
+ return ret;
+}
+
+
+static void ipath_ht_config_jint(struct ipath_devdata *dd, u16 a, u16 b)
+{
+}
+
+
+static int ipath_ht_ib_updown(struct ipath_devdata *dd, int ibup, u64 ibcs)
+{
+ ipath_setup_ht_setextled(dd, ipath_ib_linkstate(dd, ibcs),
+ ipath_ib_linktrstate(dd, ibcs));
+ return 0;
+}
+
+
+/**
+ * ipath_init_iba6110_funcs - set up the chip-specific function pointers
+ * @dd: the infinipath device
+ *
+ * This is global, and is called directly at init to set up the
+ * chip-specific function pointers for later use.
+ */
+void ipath_init_iba6110_funcs(struct ipath_devdata *dd)
+{
+ dd->ipath_f_intrsetup = ipath_ht_intconfig;
+ dd->ipath_f_bus = ipath_setup_ht_config;
+ dd->ipath_f_reset = ipath_setup_ht_reset;
+ dd->ipath_f_get_boardname = ipath_ht_boardname;
+ dd->ipath_f_init_hwerrors = ipath_ht_init_hwerrors;
+ dd->ipath_f_early_init = ipath_ht_early_init;
+ dd->ipath_f_handle_hwerrors = ipath_ht_handle_hwerrors;
+ dd->ipath_f_quiet_serdes = ipath_ht_quiet_serdes;
+ dd->ipath_f_bringup_serdes = ipath_ht_bringup_serdes;
+ dd->ipath_f_clear_tids = ipath_ht_clear_tids;
+ dd->ipath_f_put_tid = ipath_ht_put_tid;
+ dd->ipath_f_cleanup = ipath_setup_ht_cleanup;
+ dd->ipath_f_setextled = ipath_setup_ht_setextled;
+ dd->ipath_f_get_base_info = ipath_ht_get_base_info;
+ dd->ipath_f_free_irq = ipath_ht_free_irq;
+ dd->ipath_f_tidtemplate = ipath_ht_tidtemplate;
+ dd->ipath_f_intr_fallback = ipath_ht_nointr_fallback;
+ dd->ipath_f_get_msgheader = ipath_ht_get_msgheader;
+ dd->ipath_f_config_ports = ipath_ht_config_ports;
+ dd->ipath_f_read_counters = ipath_ht_read_counters;
+ dd->ipath_f_xgxs_reset = ipath_ht_xgxs_reset;
+ dd->ipath_f_get_ib_cfg = ipath_ht_get_ib_cfg;
+ dd->ipath_f_set_ib_cfg = ipath_ht_set_ib_cfg;
+ dd->ipath_f_config_jint = ipath_ht_config_jint;
+ dd->ipath_f_ib_updown = ipath_ht_ib_updown;
+
+ /*
+ * initialize chip-specific variables
+ */
+ ipath_init_ht_variables(dd);
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_init_chip.c b/drivers/infiniband/hw/ipath/ipath_init_chip.c
new file mode 100644
index 000000000..be2a60e14
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_init_chip.c
@@ -0,0 +1,1066 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/moduleparam.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_kernel.h"
+#include "ipath_common.h"
+
+/*
+ * min buffers we want to have per port, after driver
+ */
+#define IPATH_MIN_USER_PORT_BUFCNT 7
+
+/*
+ * Number of ports we are configured to use (to allow for more pio
+ * buffers per port, etc.) Zero means use chip value.
+ */
+static ushort ipath_cfgports;
+
+module_param_named(cfgports, ipath_cfgports, ushort, S_IRUGO);
+MODULE_PARM_DESC(cfgports, "Set max number of ports to use");
+
+/*
+ * Number of buffers reserved for driver (verbs and layered drivers.)
+ * Initialized based on number of PIO buffers if not set via module interface.
+ * The problem with this is that it's global, but we'll use different
+ * numbers for different chip types.
+ */
+static ushort ipath_kpiobufs;
+
+static int ipath_set_kpiobufs(const char *val, struct kernel_param *kp);
+
+module_param_call(kpiobufs, ipath_set_kpiobufs, param_get_ushort,
+ &ipath_kpiobufs, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(kpiobufs, "Set number of PIO buffers for driver");
+
+/**
+ * create_port0_egr - allocate the eager TID buffers
+ * @dd: the infinipath device
+ *
+ * This code is now quite different for user and kernel, because
+ * the kernel uses skb's, for the accelerated network performance.
+ * This is the kernel (port0) version.
+ *
+ * Allocate the eager TID buffers and program them into infinipath.
+ * We use the network layer alloc_skb() allocator to allocate the
+ * memory, and either use the buffers as is for things like verbs
+ * packets, or pass the buffers up to the ipath layered driver and
+ * thence the network layer, replacing them as we do so (see
+ * ipath_rcv_layer()).
+ */
+static int create_port0_egr(struct ipath_devdata *dd)
+{
+ unsigned e, egrcnt;
+ struct ipath_skbinfo *skbinfo;
+ int ret;
+
+ egrcnt = dd->ipath_p0_rcvegrcnt;
+
+ skbinfo = vmalloc(sizeof(*dd->ipath_port0_skbinfo) * egrcnt);
+ if (skbinfo == NULL) {
+ ipath_dev_err(dd, "allocation error for eager TID "
+ "skb array\n");
+ ret = -ENOMEM;
+ goto bail;
+ }
+ for (e = 0; e < egrcnt; e++) {
+ /*
+ * This is a bit tricky in that we allocate extra
+ * space for 2 bytes of the 14 byte ethernet header.
+ * These two bytes are passed in the ipath header so
+ * the rest of the data is word aligned. We allocate
+ * 4 bytes so that the data buffer stays word aligned.
+ * See ipath_kreceive() for more details.
+ */
+ skbinfo[e].skb = ipath_alloc_skb(dd, GFP_KERNEL);
+ if (!skbinfo[e].skb) {
+ ipath_dev_err(dd, "SKB allocation error for "
+ "eager TID %u\n", e);
+ while (e != 0)
+ dev_kfree_skb(skbinfo[--e].skb);
+ vfree(skbinfo);
+ ret = -ENOMEM;
+ goto bail;
+ }
+ }
+ /*
+ * After loop above, so we can test non-NULL to see if ready
+ * to use at receive, etc.
+ */
+ dd->ipath_port0_skbinfo = skbinfo;
+
+ for (e = 0; e < egrcnt; e++) {
+ dd->ipath_port0_skbinfo[e].phys =
+ ipath_map_single(dd->pcidev,
+ dd->ipath_port0_skbinfo[e].skb->data,
+ dd->ipath_ibmaxlen, PCI_DMA_FROMDEVICE);
+ dd->ipath_f_put_tid(dd, e + (u64 __iomem *)
+ ((char __iomem *) dd->ipath_kregbase +
+ dd->ipath_rcvegrbase),
+ RCVHQ_RCV_TYPE_EAGER,
+ dd->ipath_port0_skbinfo[e].phys);
+ }
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+static int bringup_link(struct ipath_devdata *dd)
+{
+ u64 val, ibc;
+ int ret = 0;
+
+ /* hold IBC in reset */
+ dd->ipath_control &= ~INFINIPATH_C_LINKENABLE;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+ dd->ipath_control);
+
+ /*
+ * set initial max size pkt IBC will send, including ICRC; it's the
+ * PIO buffer size in dwords, less 1; also see ipath_set_mtu()
+ */
+ val = (dd->ipath_ibmaxlen >> 2) + 1;
+ ibc = val << dd->ibcc_mpl_shift;
+
+ /* flowcontrolwatermark is in units of KBytes */
+ ibc |= 0x5ULL << INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT;
+ /*
+ * How often flowctrl sent. More or less in usecs; balance against
+ * watermark value, so that in theory senders always get a flow
+ * control update in time to not let the IB link go idle.
+ */
+ ibc |= 0x3ULL << INFINIPATH_IBCC_FLOWCTRLPERIOD_SHIFT;
+ /* max error tolerance */
+ ibc |= 0xfULL << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT;
+ /* use "real" buffer space for */
+ ibc |= 4ULL << INFINIPATH_IBCC_CREDITSCALE_SHIFT;
+ /* IB credit flow control. */
+ ibc |= 0xfULL << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT;
+ /* initially come up waiting for TS1, without sending anything. */
+ dd->ipath_ibcctrl = ibc;
+ /*
+ * Want to start out with both LINKCMD and LINKINITCMD in NOP
+ * (0 and 0). Don't put linkinitcmd in ipath_ibcctrl, want that
+ * to stay a NOP. Flag that we are disabled, for the (unlikely)
+ * case that some recovery path is trying to bring the link up
+ * before we are ready.
+ */
+ ibc |= INFINIPATH_IBCC_LINKINITCMD_DISABLE <<
+ INFINIPATH_IBCC_LINKINITCMD_SHIFT;
+ dd->ipath_flags |= IPATH_IB_LINK_DISABLED;
+ ipath_cdbg(VERBOSE, "Writing 0x%llx to ibcctrl\n",
+ (unsigned long long) ibc);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl, ibc);
+
+	/* be sure chip saw it */
+ val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+
+ ret = dd->ipath_f_bringup_serdes(dd);
+
+ if (ret)
+ dev_info(&dd->pcidev->dev, "Could not initialize SerDes, "
+ "not usable\n");
+ else {
+ /* enable IBC */
+ dd->ipath_control |= INFINIPATH_C_LINKENABLE;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+ dd->ipath_control);
+ }
+
+ return ret;
+}
+
+static struct ipath_portdata *create_portdata0(struct ipath_devdata *dd)
+{
+ struct ipath_portdata *pd = NULL;
+
+ pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+ if (pd) {
+ pd->port_dd = dd;
+ pd->port_cnt = 1;
+ /* The port 0 pkey table is used by the layer interface. */
+ pd->port_pkeys[0] = IPATH_DEFAULT_P_KEY;
+ pd->port_seq_cnt = 1;
+ }
+ return pd;
+}
+
+static int init_chip_first(struct ipath_devdata *dd)
+{
+ struct ipath_portdata *pd;
+ int ret = 0;
+ u64 val;
+
+ spin_lock_init(&dd->ipath_kernel_tid_lock);
+ spin_lock_init(&dd->ipath_user_tid_lock);
+ spin_lock_init(&dd->ipath_sendctrl_lock);
+ spin_lock_init(&dd->ipath_uctxt_lock);
+ spin_lock_init(&dd->ipath_sdma_lock);
+ spin_lock_init(&dd->ipath_gpio_lock);
+ spin_lock_init(&dd->ipath_eep_st_lock);
+ spin_lock_init(&dd->ipath_sdepb_lock);
+ mutex_init(&dd->ipath_eep_lock);
+
+ /*
+ * skip cfgports stuff because we are not allocating memory,
+ * and we don't want problems if the portcnt changed due to
+	 * cfgports. We do still check and report a difference if it is
+	 * not the same (which should be impossible).
+ */
+ dd->ipath_f_config_ports(dd, ipath_cfgports);
+ if (!ipath_cfgports)
+ dd->ipath_cfgports = dd->ipath_portcnt;
+ else if (ipath_cfgports <= dd->ipath_portcnt) {
+ dd->ipath_cfgports = ipath_cfgports;
+ ipath_dbg("Configured to use %u ports out of %u in chip\n",
+ dd->ipath_cfgports, ipath_read_kreg32(dd,
+ dd->ipath_kregs->kr_portcnt));
+ } else {
+ dd->ipath_cfgports = dd->ipath_portcnt;
+		ipath_dbg("Tried to configure %u ports; chip "
+ "only supports %u\n", ipath_cfgports,
+ ipath_read_kreg32(dd,
+ dd->ipath_kregs->kr_portcnt));
+ }
+ /*
+ * Allocate full portcnt array, rather than just cfgports, because
+ * cleanup iterates across all possible ports.
+ */
+ dd->ipath_pd = kzalloc(sizeof(*dd->ipath_pd) * dd->ipath_portcnt,
+ GFP_KERNEL);
+
+ if (!dd->ipath_pd) {
+ ipath_dev_err(dd, "Unable to allocate portdata array, "
+ "failing\n");
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ pd = create_portdata0(dd);
+ if (!pd) {
+ ipath_dev_err(dd, "Unable to allocate portdata for port "
+ "0, failing\n");
+ ret = -ENOMEM;
+ goto done;
+ }
+ dd->ipath_pd[0] = pd;
+
+ dd->ipath_rcvtidcnt =
+ ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt);
+ dd->ipath_rcvtidbase =
+ ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidbase);
+ dd->ipath_rcvegrcnt =
+ ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt);
+ dd->ipath_rcvegrbase =
+ ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrbase);
+ dd->ipath_palign =
+ ipath_read_kreg32(dd, dd->ipath_kregs->kr_pagealign);
+ dd->ipath_piobufbase =
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiobufbase);
+ val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiosize);
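+	/* low 32 bits are the 2k PIO buffer size, high 32 bits the 4k size */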
+ dd->ipath_piosize2k = val & ~0U;
+ dd->ipath_piosize4k = val >> 32;
+ if (dd->ipath_piosize4k == 0 && ipath_mtu4096)
+ ipath_mtu4096 = 0; /* 4KB not supported by this chip */
+ dd->ipath_ibmtu = ipath_mtu4096 ? 4096 : 2048;
+ val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpiobufcnt);
+ dd->ipath_piobcnt2k = val & ~0U;
+ dd->ipath_piobcnt4k = val >> 32;
+ dd->ipath_pio2kbase =
+ (u32 __iomem *) (((char __iomem *) dd->ipath_kregbase) +
+ (dd->ipath_piobufbase & 0xffffffff));
+ if (dd->ipath_piobcnt4k) {
+ dd->ipath_pio4kbase = (u32 __iomem *)
+ (((char __iomem *) dd->ipath_kregbase) +
+ (dd->ipath_piobufbase >> 32));
+ /*
+ * 4K buffers take 2 pages; we use roundup just to be
+ * paranoid; we calculate it once here, rather than on
+		 * every buffer allocation
+ */
+ dd->ipath_4kalign = ALIGN(dd->ipath_piosize4k,
+ dd->ipath_palign);
+ ipath_dbg("%u 2k(%x) piobufs @ %p, %u 4k(%x) @ %p "
+ "(%x aligned)\n",
+ dd->ipath_piobcnt2k, dd->ipath_piosize2k,
+ dd->ipath_pio2kbase, dd->ipath_piobcnt4k,
+ dd->ipath_piosize4k, dd->ipath_pio4kbase,
+ dd->ipath_4kalign);
+	} else
+		ipath_dbg("%u 2k piobufs @ %p\n",
+			  dd->ipath_piobcnt2k, dd->ipath_pio2kbase);
+
+done:
+ return ret;
+}
+
+/**
+ * init_chip_reset - re-initialize after a reset, or enable
+ * @dd: the infinipath device
+ *
+ * sanity check at least some of the values after reset, and
+ * ensure no receive or transmit (explicitly, in case reset
+ * failed)
+ */
+static int init_chip_reset(struct ipath_devdata *dd)
+{
+ u32 rtmp;
+ int i;
+ unsigned long flags;
+
+ /*
+ * ensure chip does no sends or receives, tail updates, or
+ * pioavail updates while we re-initialize
+ */
+ dd->ipath_rcvctrl &= ~(1ULL << dd->ipath_r_tailupd_shift);
+ for (i = 0; i < dd->ipath_portcnt; i++) {
+ clear_bit(dd->ipath_r_portenable_shift + i,
+ &dd->ipath_rcvctrl);
+ clear_bit(dd->ipath_r_intravail_shift + i,
+ &dd->ipath_rcvctrl);
+ }
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+ dd->ipath_rcvctrl);
+
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ dd->ipath_sendctrl = 0U; /* no sdma, etc */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL);
+
+ rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidcnt);
+ if (rtmp != dd->ipath_rcvtidcnt)
+ dev_info(&dd->pcidev->dev, "tidcnt was %u before "
+ "reset, now %u, using original\n",
+ dd->ipath_rcvtidcnt, rtmp);
+ rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvtidbase);
+ if (rtmp != dd->ipath_rcvtidbase)
+ dev_info(&dd->pcidev->dev, "tidbase was %u before "
+ "reset, now %u, using original\n",
+ dd->ipath_rcvtidbase, rtmp);
+ rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrcnt);
+ if (rtmp != dd->ipath_rcvegrcnt)
+ dev_info(&dd->pcidev->dev, "egrcnt was %u before "
+ "reset, now %u, using original\n",
+ dd->ipath_rcvegrcnt, rtmp);
+ rtmp = ipath_read_kreg32(dd, dd->ipath_kregs->kr_rcvegrbase);
+ if (rtmp != dd->ipath_rcvegrbase)
+ dev_info(&dd->pcidev->dev, "egrbase was %u before "
+ "reset, now %u, using original\n",
+ dd->ipath_rcvegrbase, rtmp);
+
+ return 0;
+}
+
+static int init_pioavailregs(struct ipath_devdata *dd)
+{
+ int ret;
+
+ dd->ipath_pioavailregs_dma = dma_alloc_coherent(
+ &dd->pcidev->dev, PAGE_SIZE, &dd->ipath_pioavailregs_phys,
+ GFP_KERNEL);
+ if (!dd->ipath_pioavailregs_dma) {
+ ipath_dev_err(dd, "failed to allocate PIOavail reg area "
+ "in memory\n");
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ /*
+ * we really want L2 cache aligned, but for current CPUs of
+ * interest, they are the same.
+ */
+ dd->ipath_statusp = (u64 *)
+ ((char *)dd->ipath_pioavailregs_dma +
+ ((2 * L1_CACHE_BYTES +
+ dd->ipath_pioavregs * sizeof(u64)) & ~L1_CACHE_BYTES));
+ /* copy the current value now that it's really allocated */
+ *dd->ipath_statusp = dd->_ipath_status;
+ /*
+ * setup buffer to hold freeze msg, accessible to apps,
+ * following statusp
+ */
+ dd->ipath_freezemsg = (char *)&dd->ipath_statusp[1];
+ /* and its length */
+ dd->ipath_freezelen = L1_CACHE_BYTES - sizeof(dd->ipath_statusp[0]);
+
+ ret = 0;
+
+done:
+ return ret;
+}
+
+/**
+ * init_shadow_tids - allocate the shadow TID array
+ * @dd: the infinipath device
+ *
+ * allocate the shadow TID array, so we can ipath_munlock previous
+ * entries. It may make more sense to move the pageshadow to the
+ * port data structure, so we only allocate memory for ports actually
+ * in use, since we are now at 8k per port.
+ */
+static void init_shadow_tids(struct ipath_devdata *dd)
+{
+ struct page **pages;
+ dma_addr_t *addrs;
+
+ pages = vzalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt *
+ sizeof(struct page *));
+ if (!pages) {
+ ipath_dev_err(dd, "failed to allocate shadow page * "
+ "array, no expected sends!\n");
+ dd->ipath_pageshadow = NULL;
+ return;
+ }
+
+ addrs = vmalloc(dd->ipath_cfgports * dd->ipath_rcvtidcnt *
+ sizeof(dma_addr_t));
+ if (!addrs) {
+ ipath_dev_err(dd, "failed to allocate shadow dma handle "
+ "array, no expected sends!\n");
+ vfree(pages);
+ dd->ipath_pageshadow = NULL;
+ return;
+ }
+
+ dd->ipath_pageshadow = pages;
+ dd->ipath_physshadow = addrs;
+}
+
+static void enable_chip(struct ipath_devdata *dd, int reinit)
+{
+ u32 val;
+ u64 rcvmask;
+ unsigned long flags;
+ int i;
+
+ if (!reinit)
+ init_waitqueue_head(&ipath_state_wait);
+
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+ dd->ipath_rcvctrl);
+
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ /* Enable PIO send, and update of PIOavail regs to memory. */
+ dd->ipath_sendctrl = INFINIPATH_S_PIOENABLE |
+ INFINIPATH_S_PIOBUFAVAILUPD;
+
+ /*
+ * Set the PIO avail update threshold to host memory
+ * on chips that support it.
+ */
+ if (dd->ipath_pioupd_thresh)
+ dd->ipath_sendctrl |= dd->ipath_pioupd_thresh
+ << INFINIPATH_S_UPDTHRESH_SHIFT;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+ /*
+ * Enable kernel ports' receive and receive interrupt.
+ * Other ports done as user opens and inits them.
+ */
+ rcvmask = 1ULL;
+ dd->ipath_rcvctrl |= (rcvmask << dd->ipath_r_portenable_shift) |
+ (rcvmask << dd->ipath_r_intravail_shift);
+ if (!(dd->ipath_flags & IPATH_NODMA_RTAIL))
+ dd->ipath_rcvctrl |= (1ULL << dd->ipath_r_tailupd_shift);
+
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+ dd->ipath_rcvctrl);
+
+ /*
+ * now ready for use. this should be cleared whenever we
+ * detect a reset, or initiate one.
+ */
+ dd->ipath_flags |= IPATH_INITTED;
+
+ /*
+ * Init our shadow copies of head from tail values,
+ * and write head values to match.
+ */
+ val = ipath_read_ureg32(dd, ur_rcvegrindextail, 0);
+ ipath_write_ureg(dd, ur_rcvegrindexhead, val, 0);
+
+ /* Initialize so we interrupt on next packet received */
+ ipath_write_ureg(dd, ur_rcvhdrhead,
+ dd->ipath_rhdrhead_intr_off |
+ dd->ipath_pd[0]->port_head, 0);
+
+ /*
+ * by now pioavail updates to memory should have occurred, so
+ * copy them into our working/shadow registers; this is in
+ * case something went wrong with abort, but mostly to get the
+ * initial values of the generation bit correct.
+ */
+ for (i = 0; i < dd->ipath_pioavregs; i++) {
+ __le64 pioavail;
+
+ /*
+ * Chip Errata bug 6641; even and odd qwords>3 are swapped.
+ */
+ if (i > 3 && (dd->ipath_flags & IPATH_SWAP_PIOBUFS))
+ pioavail = dd->ipath_pioavailregs_dma[i ^ 1];
+ else
+ pioavail = dd->ipath_pioavailregs_dma[i];
+ /*
+ * don't need to worry about ipath_pioavailkernel here
+ * because we will call ipath_chg_pioavailkernel() later
+ * in initialization, to busy out buffers as needed
+ */
+ dd->ipath_pioavailshadow[i] = le64_to_cpu(pioavail);
+ }
+ /* can get counters, stats, etc. */
+ dd->ipath_flags |= IPATH_PRESENT;
+}
+
+static int init_housekeeping(struct ipath_devdata *dd, int reinit)
+{
+ char boardn[40];
+ int ret = 0;
+
+ /*
+ * have to clear shadow copies of registers at init that are
+ * not otherwise set here, or all kinds of bizarre things
+ * happen with driver on chip reset
+ */
+ dd->ipath_rcvhdrsize = 0;
+
+ /*
+ * Don't clear ipath_flags as 8bit mode was set before
+ * entering this func. However, we do set the linkstate to
+ * unknown, so we can watch for a transition.
+ * PRESENT is set because we want register reads to work,
+ * and the kernel infrastructure saw it in config space;
+	 * we clear it if we have failures.
+ */
+ dd->ipath_flags |= IPATH_LINKUNK | IPATH_PRESENT;
+ dd->ipath_flags &= ~(IPATH_LINKACTIVE | IPATH_LINKARMED |
+ IPATH_LINKDOWN | IPATH_LINKINIT);
+
+ ipath_cdbg(VERBOSE, "Try to read spc chip revision\n");
+ dd->ipath_revision =
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_revision);
+
+ /*
+ * set up fundamental info we need to use the chip; we assume
+ * if the revision reg and these regs are OK, we don't need to
+ * special case the rest
+ */
+ dd->ipath_sregbase =
+ ipath_read_kreg32(dd, dd->ipath_kregs->kr_sendregbase);
+ dd->ipath_cregbase =
+ ipath_read_kreg32(dd, dd->ipath_kregs->kr_counterregbase);
+ dd->ipath_uregbase =
+ ipath_read_kreg32(dd, dd->ipath_kregs->kr_userregbase);
+ ipath_cdbg(VERBOSE, "ipath_kregbase %p, sendbase %x usrbase %x, "
+ "cntrbase %x\n", dd->ipath_kregbase, dd->ipath_sregbase,
+ dd->ipath_uregbase, dd->ipath_cregbase);
+ if ((dd->ipath_revision & 0xffffffff) == 0xffffffff
+ || (dd->ipath_sregbase & 0xffffffff) == 0xffffffff
+ || (dd->ipath_cregbase & 0xffffffff) == 0xffffffff
+ || (dd->ipath_uregbase & 0xffffffff) == 0xffffffff) {
+ ipath_dev_err(dd, "Register read failures from chip, "
+ "giving up initialization\n");
+ dd->ipath_flags &= ~IPATH_PRESENT;
+ ret = -ENODEV;
+ goto done;
+ }
+
+
+ /* clear diagctrl register, in case diags were running and crashed */
+	ipath_write_kreg(dd, dd->ipath_kregs->kr_hwdiagctrl, 0);
+
+ /* clear the initial reset flag, in case first driver load */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
+ INFINIPATH_E_RESET);
+
+ ipath_cdbg(VERBOSE, "Revision %llx (PCI %x)\n",
+ (unsigned long long) dd->ipath_revision,
+ dd->ipath_pcirev);
+
+ if (((dd->ipath_revision >> INFINIPATH_R_SOFTWARE_SHIFT) &
+ INFINIPATH_R_SOFTWARE_MASK) != IPATH_CHIP_SWVERSION) {
+ ipath_dev_err(dd, "Driver only handles version %d, "
+			      "chip swversion is %d (%llx), failing\n",
+ IPATH_CHIP_SWVERSION,
+ (int)(dd->ipath_revision >>
+ INFINIPATH_R_SOFTWARE_SHIFT) &
+ INFINIPATH_R_SOFTWARE_MASK,
+ (unsigned long long) dd->ipath_revision);
+ ret = -ENOSYS;
+ goto done;
+ }
+ dd->ipath_majrev = (u8) ((dd->ipath_revision >>
+ INFINIPATH_R_CHIPREVMAJOR_SHIFT) &
+ INFINIPATH_R_CHIPREVMAJOR_MASK);
+ dd->ipath_minrev = (u8) ((dd->ipath_revision >>
+ INFINIPATH_R_CHIPREVMINOR_SHIFT) &
+ INFINIPATH_R_CHIPREVMINOR_MASK);
+ dd->ipath_boardrev = (u8) ((dd->ipath_revision >>
+ INFINIPATH_R_BOARDID_SHIFT) &
+ INFINIPATH_R_BOARDID_MASK);
+
+ ret = dd->ipath_f_get_boardname(dd, boardn, sizeof boardn);
+
+ snprintf(dd->ipath_boardversion, sizeof(dd->ipath_boardversion),
+ "ChipABI %u.%u, %s, InfiniPath%u %u.%u, PCI %u, "
+ "SW Compat %u\n",
+ IPATH_CHIP_VERS_MAJ, IPATH_CHIP_VERS_MIN, boardn,
+ (unsigned)(dd->ipath_revision >> INFINIPATH_R_ARCH_SHIFT) &
+ INFINIPATH_R_ARCH_MASK,
+ dd->ipath_majrev, dd->ipath_minrev, dd->ipath_pcirev,
+ (unsigned)(dd->ipath_revision >>
+ INFINIPATH_R_SOFTWARE_SHIFT) &
+ INFINIPATH_R_SOFTWARE_MASK);
+
+ ipath_dbg("%s", dd->ipath_boardversion);
+
+ if (ret)
+ goto done;
+
+ if (reinit)
+ ret = init_chip_reset(dd);
+ else
+ ret = init_chip_first(dd);
+
+done:
+ return ret;
+}
+
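+/*
+ * Timer callback armed near the end of ipath_init_chip(): verify that at
+ * least one interrupt has arrived; if not, try the chip's fallback
+ * interrupt mechanism (if any) before reporting the device as unusable.
+ */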
+static void verify_interrupt(unsigned long opaque)
+{
+ struct ipath_devdata *dd = (struct ipath_devdata *) opaque;
+
+ if (!dd)
+ return; /* being torn down */
+
+ /*
+ * If we don't have any interrupts, let the user know and
+ * don't bother checking again.
+ */
+ if (dd->ipath_int_counter == 0) {
+ if (!dd->ipath_f_intr_fallback(dd))
+ dev_err(&dd->pcidev->dev, "No interrupts detected, "
+ "not usable.\n");
+ else /* re-arm the timer to see if fallback works */
+ mod_timer(&dd->ipath_intrchk_timer, jiffies + HZ/2);
+ } else
+ ipath_cdbg(VERBOSE, "%u interrupts at timer check\n",
+ dd->ipath_int_counter);
+}
+
+/**
+ * ipath_init_chip - do the actual initialization sequence on the chip
+ * @dd: the infinipath device
+ * @reinit: reinitializing, so don't allocate new memory
+ *
+ * Do the actual initialization sequence on the chip. This is done
+ * both from the init routine called from the PCI infrastructure, and
+ * when we reset the chip, or detect that it was reset internally,
+ * or it's administratively re-enabled.
+ *
+ * Memory allocation here and in called routines is only done in
+ * the first case (reinit == 0). We have to be careful, because even
+ * without memory allocation, we need to re-write all the chip registers
+ * TIDs, etc. after the reset or enable has completed.
+ */
+int ipath_init_chip(struct ipath_devdata *dd, int reinit)
+{
+ int ret = 0;
+ u32 kpiobufs, defkbufs;
+ u32 piobufs, uports;
+ u64 val;
+ struct ipath_portdata *pd;
+ gfp_t gfp_flags = GFP_USER | __GFP_COMP;
+
+ ret = init_housekeeping(dd, reinit);
+ if (ret)
+ goto done;
+
+ /*
+ * We could bump this to allow for full rcvegrcnt + rcvtidcnt,
+ * but then it no longer fits nicely in a power of two, and since
+ * we now use routines backed by __get_free_pages, the
+ * rest would be wasted.
+ */
+ dd->ipath_rcvhdrcnt = max(dd->ipath_p0_rcvegrcnt, dd->ipath_rcvegrcnt);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrcnt,
+ dd->ipath_rcvhdrcnt);
+
+ /*
+ * Set up the shadow copies of the piobufavail registers,
+ * which we compare against the chip registers for now, and
+ * the in memory DMA'ed copies of the registers. This has to
+ * be done early, before we calculate lastport, etc.
+ */
+ piobufs = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
+ /*
+ * calc number of pioavail registers, and save it; we have 2
+ * bits per buffer.
+ */
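+ /* i.e. each 64-bit register covers 32 buffers: round up to 32 and divide */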
+ dd->ipath_pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2)
+ / (sizeof(u64) * BITS_PER_BYTE / 2);
+ uports = dd->ipath_cfgports ? dd->ipath_cfgports - 1 : 0;
+ if (piobufs > 144)
+ defkbufs = 32 + dd->ipath_pioreserved;
+ else
+ defkbufs = 16 + dd->ipath_pioreserved;
+
+ if (ipath_kpiobufs && (ipath_kpiobufs +
+ (uports * IPATH_MIN_USER_PORT_BUFCNT)) > piobufs) {
+ int i = (int) piobufs -
+ (int) (uports * IPATH_MIN_USER_PORT_BUFCNT);
+ if (i < 1)
+ i = 1;
+ dev_info(&dd->pcidev->dev, "Allocating %d PIO bufs of "
+ "%d for kernel leaves too few for %d user ports "
+ "(%d each); using %u\n", ipath_kpiobufs,
+ piobufs, uports, IPATH_MIN_USER_PORT_BUFCNT, i);
+ /*
+ * we shouldn't change ipath_kpiobufs itself, because it could
+ * differ between devices...
+ */
+ kpiobufs = i;
+ } else if (ipath_kpiobufs)
+ kpiobufs = ipath_kpiobufs;
+ else
+ kpiobufs = defkbufs;
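+ /* whatever is not reserved for the kernel is divided among user ports */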
+ dd->ipath_lastport_piobuf = piobufs - kpiobufs;
+ dd->ipath_pbufsport =
+ uports ? dd->ipath_lastport_piobuf / uports : 0;
+ /* if not an even divisor, some user ports get extra buffers */
+ dd->ipath_ports_extrabuf = dd->ipath_lastport_piobuf -
+ (dd->ipath_pbufsport * uports);
+ if (dd->ipath_ports_extrabuf)
+ ipath_dbg("%u pbufs/port leaves some unused, add 1 buffer to "
+ "ports <= %u\n", dd->ipath_pbufsport,
+ dd->ipath_ports_extrabuf);
+ dd->ipath_lastpioindex = 0;
+ dd->ipath_lastpioindexl = dd->ipath_piobcnt2k;
+ /* ipath_pioavailshadow initialized earlier */
+ ipath_cdbg(VERBOSE, "%d PIO bufs for kernel out of %d total %u "
+ "each for %u user ports\n", kpiobufs,
+ piobufs, dd->ipath_pbufsport, uports);
+ ret = dd->ipath_f_early_init(dd);
+ if (ret) {
+ ipath_dev_err(dd, "Early initialization failure\n");
+ goto done;
+ }
+
+ /*
+ * Early_init sets rcvhdrentsize and rcvhdrsize, so this must be
+ * done after early_init.
+ */
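+ /*
+ * offset of the last rcvhdrq entry; handle_hdrq_full() uses this
+ * to detect a full (wrapped) header queue
+ */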
+ dd->ipath_hdrqlast =
+ dd->ipath_rcvhdrentsize * (dd->ipath_rcvhdrcnt - 1);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrentsize,
+ dd->ipath_rcvhdrentsize);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvhdrsize,
+ dd->ipath_rcvhdrsize);
+
+ if (!reinit) {
+ ret = init_pioavailregs(dd);
+ init_shadow_tids(dd);
+ if (ret)
+ goto done;
+ }
+
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendpioavailaddr,
+ dd->ipath_pioavailregs_phys);
+
+ /*
+ * this is to detect s/w errors, which the h/w works around by
+ * ignoring the low 6 bits of the address if it wasn't aligned.
+ */
+ val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendpioavailaddr);
+ if (val != dd->ipath_pioavailregs_phys) {
+ ipath_dev_err(dd, "Catastrophic software error, "
+ "SendPIOAvailAddr written as %lx, "
+ "read back as %llx\n",
+ (unsigned long) dd->ipath_pioavailregs_phys,
+ (unsigned long long) val);
+ ret = -EINVAL;
+ goto done;
+ }
+
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvbthqp, IPATH_KD_QP);
+
+ /*
+ * make sure we are not in freeze, and PIO send enabled, so
+ * writes to pbc happen
+ */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask, 0ULL);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+ ~0ULL & ~INFINIPATH_HWE_MEMBISTFAILED);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_control, 0ULL);
+
+ /*
+ * bring the link up before the error clears below, since we expect
+ * serdes pll errors during this, the first time after reset
+ */
+ if (bringup_link(dd)) {
+ dev_info(&dd->pcidev->dev, "Failed to bringup IB link\n");
+ ret = -ENETDOWN;
+ goto done;
+ }
+
+ /*
+ * clear any "expected" hwerrs from reset and/or initialization
+ * clear any that aren't enabled (at least this once), and then
+ * set the enable mask
+ */
+ dd->ipath_f_init_hwerrors(dd);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear,
+ ~0ULL & ~INFINIPATH_HWE_MEMBISTFAILED);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrmask,
+ dd->ipath_hwerrmask);
+
+ /* clear all */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, -1LL);
+ /* enable errors that are masked, at least this first time. */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+ ~dd->ipath_maskederrs);
+ dd->ipath_maskederrs = 0; /* don't re-enable ignored in timer */
+ dd->ipath_errormask =
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_errormask);
+ /* clear any interrupts up to this point (ints still not enabled) */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, -1LL);
+
+ dd->ipath_f_tidtemplate(dd);
+
+ /*
+ * Set up the port 0 (kernel) rcvhdr q and egr TIDs. If doing
+ * re-init, the simplest way to handle this is to free
+ * existing, and re-allocate.
+ * Need to re-create rest of port 0 portdata as well.
+ */
+ pd = dd->ipath_pd[0];
+ if (reinit) {
+ struct ipath_portdata *npd;
+
+ /*
+ * Alloc and init new ipath_portdata for port0,
+ * Then free old pd. Could lead to fragmentation, but also
+ * makes later support for hot-swap easier.
+ */
+ npd = create_portdata0(dd);
+ if (npd) {
+ ipath_free_pddata(dd, pd);
+ dd->ipath_pd[0] = npd;
+ pd = npd;
+ } else {
+ ipath_dev_err(dd, "Unable to allocate portdata"
+ " for port 0, failing\n");
+ ret = -ENOMEM;
+ goto done;
+ }
+ }
+ ret = ipath_create_rcvhdrq(dd, pd);
+ if (!ret)
+ ret = create_port0_egr(dd);
+ if (ret) {
+ ipath_dev_err(dd, "failed to allocate kernel port's "
+ "rcvhdrq and/or egr bufs\n");
+ goto done;
+ }
+ enable_chip(dd, reinit);
+
+ /* after enable_chip, so pioavailshadow setup */
+ ipath_chg_pioavailkernel(dd, 0, piobufs, 1);
+
+ /*
+ * Cancel any possible active sends from early driver load.
+ * Follows early_init because some chips have to initialize
+ * PIO buffers in early_init to avoid false parity errors.
+ * After enable and ipath_chg_pioavailkernel so we can safely
+ * enable pioavail updates and PIOENABLE; packets are now
+ * ready to go out.
+ */
+ ipath_cancel_sends(dd, 1);
+
+ if (!reinit) {
+ /*
+ * Used when we close a port, for DMA already in flight
+ * at close.
+ */
+ dd->ipath_dummy_hdrq = dma_alloc_coherent(
+ &dd->pcidev->dev, dd->ipath_pd[0]->port_rcvhdrq_size,
+ &dd->ipath_dummy_hdrq_phys,
+ gfp_flags);
+ if (!dd->ipath_dummy_hdrq) {
+ dev_info(&dd->pcidev->dev,
+ "Couldn't allocate 0x%lx bytes for dummy hdrq\n",
+ dd->ipath_pd[0]->port_rcvhdrq_size);
+ /* fallback to just 0'ing */
+ dd->ipath_dummy_hdrq_phys = 0UL;
+ }
+ }
+
+ /*
+ * cause retrigger of pending interrupts ignored during init,
+ * even if we had errors
+ */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
+
+ if (!dd->ipath_stats_timer_active) {
+ /*
+ * first init, or after an admin disable/enable
+ * set up stats retrieval timer, even if we had errors
+ * in last portion of setup
+ */
+ init_timer(&dd->ipath_stats_timer);
+ dd->ipath_stats_timer.function = ipath_get_faststats;
+ dd->ipath_stats_timer.data = (unsigned long) dd;
+ /* every 5 seconds; */
+ dd->ipath_stats_timer.expires = jiffies + 5 * HZ;
+ /* takes ~16 seconds to overflow at full IB 4x bandwidth */
+ add_timer(&dd->ipath_stats_timer);
+ dd->ipath_stats_timer_active = 1;
+ }
+
+ /* Set up SendDMA if chip supports it */
+ if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
+ ret = setup_sdma(dd);
+
+ /* Set up HoL state */
+ init_timer(&dd->ipath_hol_timer);
+ dd->ipath_hol_timer.function = ipath_hol_event;
+ dd->ipath_hol_timer.data = (unsigned long)dd;
+ dd->ipath_hol_state = IPATH_HOL_UP;
+
+done:
+ if (!ret) {
+ *dd->ipath_statusp |= IPATH_STATUS_CHIP_PRESENT;
+ if (!dd->ipath_f_intrsetup(dd)) {
+ /* now we can enable all interrupts from the chip */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask,
+ -1LL);
+ /* force re-interrupt of any pending interrupts. */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear,
+ 0ULL);
+ /* chip is usable; mark it as initialized */
+ *dd->ipath_statusp |= IPATH_STATUS_INITTED;
+
+ /*
+ * setup to verify we get an interrupt, and fallback
+ * to an alternate if necessary and possible
+ */
+ if (!reinit) {
+ init_timer(&dd->ipath_intrchk_timer);
+ dd->ipath_intrchk_timer.function =
+ verify_interrupt;
+ dd->ipath_intrchk_timer.data =
+ (unsigned long) dd;
+ }
+ dd->ipath_intrchk_timer.expires = jiffies + HZ/2;
+ add_timer(&dd->ipath_intrchk_timer);
+ } else
+ ipath_dev_err(dd, "No interrupts enabled, couldn't "
+ "setup interrupt address\n");
+
+ if (dd->ipath_cfgports > ipath_stats.sps_nports)
+ /*
+ * sps_nports is a global, so, we set it to
+ * the highest number of ports of any of the
+ * chips we find; we never decrement it, at
+ * least for now. Since this might have changed
+ * over disable/enable or prior to reset, always
+ * do the check and potentially adjust.
+ */
+ ipath_stats.sps_nports = dd->ipath_cfgports;
+ } else
+ ipath_dbg("Failed (%d) to initialize chip\n", ret);
+
+ /*
+ * if ret is non-zero, we probably should do some cleanup here...
+ */
+ return ret;
+}
+
+static int ipath_set_kpiobufs(const char *str, struct kernel_param *kp)
+{
+ struct ipath_devdata *dd;
+ unsigned long flags;
+ unsigned short val;
+ int ret;
+
+ ret = ipath_parse_ushort(str, &val);
+
+ spin_lock_irqsave(&ipath_devs_lock, flags);
+
+ if (ret < 0)
+ goto bail;
+
+ if (val == 0) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ list_for_each_entry(dd, &ipath_dev_list, ipath_list) {
+ if (dd->ipath_kregbase)
+ continue;
+ if (val > (dd->ipath_piobcnt2k + dd->ipath_piobcnt4k -
+ (dd->ipath_cfgports *
+ IPATH_MIN_USER_PORT_BUFCNT))) {
+ ipath_dev_err(dd,
+ "Allocating %d PIO bufs for kernel leaves "
+ "too few for %d user ports (%d each)\n",
+ val, dd->ipath_cfgports - 1,
+ IPATH_MIN_USER_PORT_BUFCNT);
+ ret = -EINVAL;
+ goto bail;
+ }
+ dd->ipath_lastport_piobuf =
+ dd->ipath_piobcnt2k + dd->ipath_piobcnt4k - val;
+ }
+
+ ipath_kpiobufs = val;
+ ret = 0;
+bail:
+ spin_unlock_irqrestore(&ipath_devs_lock, flags);
+
+ return ret;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c
new file mode 100644
index 000000000..01ba79279
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_intr.c
@@ -0,0 +1,1273 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/sched.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+#include "ipath_common.h"
+
+
+/*
+ * Called when we might have an error that is specific to a particular
+ * PIO buffer, and may need to cancel that buffer, so it can be re-used.
+ */
+void ipath_disarm_senderrbufs(struct ipath_devdata *dd)
+{
+ u32 piobcnt;
+ unsigned long sbuf[4];
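+ /*
+ * sbuf is a bitmap of send buffers flagged in error; each
+ * sendbuffererror register covers 64 buffers
+ */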
+ /*
+ * it's possible that sendbuffererror could have bits set; might
+ * have already done this as a result of hardware error handling
+ */
+ piobcnt = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
+ /* read these before writing errorclear */
+ sbuf[0] = ipath_read_kreg64(
+ dd, dd->ipath_kregs->kr_sendbuffererror);
+ sbuf[1] = ipath_read_kreg64(
+ dd, dd->ipath_kregs->kr_sendbuffererror + 1);
+ if (piobcnt > 128)
+ sbuf[2] = ipath_read_kreg64(
+ dd, dd->ipath_kregs->kr_sendbuffererror + 2);
+ if (piobcnt > 192)
+ sbuf[3] = ipath_read_kreg64(
+ dd, dd->ipath_kregs->kr_sendbuffererror + 3);
+ else
+ sbuf[3] = 0;
+
+ if (sbuf[0] || sbuf[1] || (piobcnt > 128 && (sbuf[2] || sbuf[3]))) {
+ int i;
+ if (ipath_debug & (__IPATH_PKTDBG|__IPATH_DBG) &&
+ time_after(dd->ipath_lastcancel, jiffies)) {
+ __IPATH_DBG_WHICH(__IPATH_PKTDBG|__IPATH_DBG,
+ "SendbufErrs %lx %lx", sbuf[0],
+ sbuf[1]);
+ if (ipath_debug & __IPATH_PKTDBG && piobcnt > 128)
+ printk(" %lx %lx ", sbuf[2], sbuf[3]);
+ printk("\n");
+ }
+
+ for (i = 0; i < piobcnt; i++)
+ if (test_bit(i, sbuf))
+ ipath_disarm_piobufs(dd, i, 1);
+ /* ignore armlaunch errs for a bit */
+ dd->ipath_lastcancel = jiffies+3;
+ }
+}
+
+
+/* These are all rcv-related errors which we want to count for stats */
+#define E_SUM_PKTERRS \
+ (INFINIPATH_E_RHDRLEN | INFINIPATH_E_RBADTID | \
+ INFINIPATH_E_RBADVERSION | INFINIPATH_E_RHDR | \
+ INFINIPATH_E_RLONGPKTLEN | INFINIPATH_E_RSHORTPKTLEN | \
+ INFINIPATH_E_RMAXPKTLEN | INFINIPATH_E_RMINPKTLEN | \
+ INFINIPATH_E_RFORMATERR | INFINIPATH_E_RUNSUPVL | \
+ INFINIPATH_E_RUNEXPCHAR | INFINIPATH_E_REBP)
+
+/* These are all send-related errors which we want to count for stats */
+#define E_SUM_ERRS \
+ (INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SUNEXPERRPKTNUM | \
+ INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \
+ INFINIPATH_E_SMAXPKTLEN | INFINIPATH_E_SUNSUPVL | \
+ INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SPKTLEN | \
+ INFINIPATH_E_INVALIDADDR)
+
+/*
+ * this is similar to E_SUM_ERRS, but we can't ignore armlaunch, and we
+ * don't ignore errors unrelated to freeze and cancelling buffers.
+ * We can't ignore armlaunch because more could arrive while still
+ * cleaning up, and those need to be cancelled as they happen.
+ */
+#define E_SPKT_ERRS_IGNORE \
+ (INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \
+ INFINIPATH_E_SMAXPKTLEN | INFINIPATH_E_SMINPKTLEN | \
+ INFINIPATH_E_SPKTLEN)
+
+/*
+ * these are errors that can occur when the link changes state while
+ * a packet is being sent or received. This doesn't cover things
+ * like EBP or VCRC that can result from the sender having the
+ * link change state, so we receive a "known bad" packet.
+ */
+#define E_SUM_LINK_PKTERRS \
+ (INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_SDROPPEDSMPPKT | \
+ INFINIPATH_E_SMINPKTLEN | INFINIPATH_E_SPKTLEN | \
+ INFINIPATH_E_RSHORTPKTLEN | INFINIPATH_E_RMINPKTLEN | \
+ INFINIPATH_E_RUNEXPCHAR)
+
+static u64 handle_e_sum_errs(struct ipath_devdata *dd, ipath_err_t errs)
+{
+ u64 ignore_this_time = 0;
+
+ ipath_disarm_senderrbufs(dd);
+ if ((errs & E_SUM_LINK_PKTERRS) &&
+ !(dd->ipath_flags & IPATH_LINKACTIVE)) {
+ /*
+ * This can happen when SMA is trying to bring the link
+ * up, but the IB link changes state at the "wrong" time.
+ * The IB logic then complains that the packet isn't
+ * valid. We don't want to confuse people, so we just
+ * don't print them, except at debug
+ */
+ ipath_dbg("Ignoring packet errors %llx, because link not "
+ "ACTIVE\n", (unsigned long long) errs);
+ ignore_this_time = errs & E_SUM_LINK_PKTERRS;
+ }
+
+ return ignore_this_time;
+}
+
+/* generic hw error messages... */
+#define INFINIPATH_HWE_TXEMEMPARITYERR_MSG(a) \
+ { \
+ .mask = ( INFINIPATH_HWE_TXEMEMPARITYERR_##a << \
+ INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT ), \
+ .msg = "TXE " #a " Memory Parity" \
+ }
+#define INFINIPATH_HWE_RXEMEMPARITYERR_MSG(a) \
+ { \
+ .mask = ( INFINIPATH_HWE_RXEMEMPARITYERR_##a << \
+ INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT ), \
+ .msg = "RXE " #a " Memory Parity" \
+ }
+
+static const struct ipath_hwerror_msgs ipath_generic_hwerror_msgs[] = {
+ INFINIPATH_HWE_MSG(IBCBUSFRSPCPARITYERR, "IPATH2IB Parity"),
+ INFINIPATH_HWE_MSG(IBCBUSTOSPCPARITYERR, "IB2IPATH Parity"),
+
+ INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOBUF),
+ INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOPBC),
+ INFINIPATH_HWE_TXEMEMPARITYERR_MSG(PIOLAUNCHFIFO),
+
+ INFINIPATH_HWE_RXEMEMPARITYERR_MSG(RCVBUF),
+ INFINIPATH_HWE_RXEMEMPARITYERR_MSG(LOOKUPQ),
+ INFINIPATH_HWE_RXEMEMPARITYERR_MSG(EAGERTID),
+ INFINIPATH_HWE_RXEMEMPARITYERR_MSG(EXPTID),
+ INFINIPATH_HWE_RXEMEMPARITYERR_MSG(FLAGBUF),
+ INFINIPATH_HWE_RXEMEMPARITYERR_MSG(DATAINFO),
+ INFINIPATH_HWE_RXEMEMPARITYERR_MSG(HDRINFO),
+};
+
+/**
+ * ipath_format_hwmsg - format a single hwerror message
+ * @msg: message buffer
+ * @msgl: length of message buffer
+ * @hwmsg: message to add to message buffer
+ */
+static void ipath_format_hwmsg(char *msg, size_t msgl, const char *hwmsg)
+{
+ strlcat(msg, "[", msgl);
+ strlcat(msg, hwmsg, msgl);
+ strlcat(msg, "]", msgl);
+}
+
+/**
+ * ipath_format_hwerrors - format hardware error messages for display
+ * @hwerrs: hardware errors bit vector
+ * @hwerrmsgs: hardware error descriptions
+ * @nhwerrmsgs: number of hwerrmsgs
+ * @msg: message buffer
+ * @msgl: message buffer length
+ */
+void ipath_format_hwerrors(u64 hwerrs,
+ const struct ipath_hwerror_msgs *hwerrmsgs,
+ size_t nhwerrmsgs,
+ char *msg, size_t msgl)
+{
+ int i;
+ const int glen =
+ ARRAY_SIZE(ipath_generic_hwerror_msgs);
+
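+ /*
+ * generic messages first, then chip-specific ones; each match
+ * appends a bracketed description to msg
+ */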
+ for (i = 0; i < glen; i++) {
+ if (hwerrs & ipath_generic_hwerror_msgs[i].mask) {
+ ipath_format_hwmsg(msg, msgl,
+ ipath_generic_hwerror_msgs[i].msg);
+ }
+ }
+
+ for (i = 0; i < nhwerrmsgs; i++) {
+ if (hwerrs & hwerrmsgs[i].mask) {
+ ipath_format_hwmsg(msg, msgl, hwerrmsgs[i].msg);
+ }
+ }
+}
+
+/* return the strings for the most common link states */
+static char *ib_linkstate(struct ipath_devdata *dd, u64 ibcs)
+{
+ char *ret;
+ u32 state;
+
+ state = ipath_ib_state(dd, ibcs);
+ if (state == dd->ib_init)
+ ret = "Init";
+ else if (state == dd->ib_arm)
+ ret = "Arm";
+ else if (state == dd->ib_active)
+ ret = "Active";
+ else
+ ret = "Down";
+ return ret;
+}
+
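+/*
+ * Dispatch an asynchronous IB event to the verbs layer; the event is
+ * always reported against port 1.
+ */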
+void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev)
+{
+ struct ib_event event;
+
+ event.device = &dd->verbs_dev->ibdev;
+ event.element.port_num = 1;
+ event.event = ev;
+ ib_dispatch_event(&event);
+}
+
+static void handle_e_ibstatuschanged(struct ipath_devdata *dd,
+ ipath_err_t errs)
+{
+ u32 ltstate, lstate, ibstate, lastlstate;
+ u32 init = dd->ib_init;
+ u32 arm = dd->ib_arm;
+ u32 active = dd->ib_active;
+ const u64 ibcs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_ibcstatus);
+
+ lstate = ipath_ib_linkstate(dd, ibcs); /* linkstate */
+ ibstate = ipath_ib_state(dd, ibcs);
+ /* linkstate at last interrupt */
+ lastlstate = ipath_ib_linkstate(dd, dd->ipath_lastibcstat);
+ ltstate = ipath_ib_linktrstate(dd, ibcs); /* linktrainingstate */
+
+ /*
+ * Since going into a recovery state causes the link state to go
+ * down and since recovery is transitory, it is better if we "miss"
+ * ever seeing the link training state go into recovery (i.e.,
+ * ignore this transition for link state special handling purposes)
+ * without even updating ipath_lastibcstat.
+ */
+ if ((ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN) ||
+ (ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT) ||
+ (ltstate == INFINIPATH_IBCS_LT_STATE_RECOVERIDLE))
+ goto done;
+
+ /*
+ * if linkstate transitions into INIT from any of the various down
+ * states, or if it transitions from any of the up (INIT or better)
+ * states into any of the down states (except link recovery), then
+ * call the chip-specific code to take appropriate actions.
+ */
+ if (lstate >= INFINIPATH_IBCS_L_STATE_INIT &&
+ lastlstate == INFINIPATH_IBCS_L_STATE_DOWN) {
+ /* transitioned to UP */
+ if (dd->ipath_f_ib_updown(dd, 1, ibcs)) {
+ /* link came up, so we must no longer be disabled */
+ dd->ipath_flags &= ~IPATH_IB_LINK_DISABLED;
+ ipath_cdbg(LINKVERB, "LinkUp handled, skipped\n");
+ goto skip_ibchange; /* chip-code handled */
+ }
+ } else if ((lastlstate >= INFINIPATH_IBCS_L_STATE_INIT ||
+ (dd->ipath_flags & IPATH_IB_FORCE_NOTIFY)) &&
+ ltstate <= INFINIPATH_IBCS_LT_STATE_CFGWAITRMT &&
+ ltstate != INFINIPATH_IBCS_LT_STATE_LINKUP) {
+ int handled;
+ handled = dd->ipath_f_ib_updown(dd, 0, ibcs);
+ dd->ipath_flags &= ~IPATH_IB_FORCE_NOTIFY;
+ if (handled) {
+ ipath_cdbg(LINKVERB, "LinkDown handled, skipped\n");
+ goto skip_ibchange; /* chip-code handled */
+ }
+ }
+
+ /*
+ * Significant enough to always print and get into logs, if it was
+ * unexpected. If it was a requested state change, we'll have
+ * already cleared the flags, so we won't print this warning
+ */
+ if ((ibstate != arm && ibstate != active) &&
+ (dd->ipath_flags & (IPATH_LINKARMED | IPATH_LINKACTIVE))) {
+ dev_info(&dd->pcidev->dev, "Link state changed from %s "
+ "to %s\n", (dd->ipath_flags & IPATH_LINKARMED) ?
+ "ARM" : "ACTIVE", ib_linkstate(dd, ibcs));
+ }
+
+ if (ltstate == INFINIPATH_IBCS_LT_STATE_POLLACTIVE ||
+ ltstate == INFINIPATH_IBCS_LT_STATE_POLLQUIET) {
+ u32 lastlts;
+ lastlts = ipath_ib_linktrstate(dd, dd->ipath_lastibcstat);
+ /*
+ * Ignore cycling back and forth from Polling.Active to
+ * Polling.Quiet while waiting for the other end of the link
+ * to come up, except to try and decide if we are connected
+ * to a live IB device or not. We will cycle back and
+ * forth between them if no cable is plugged in, the other
+ * device is powered off or disabled, etc.
+ */
+ if (lastlts == INFINIPATH_IBCS_LT_STATE_POLLACTIVE ||
+ lastlts == INFINIPATH_IBCS_LT_STATE_POLLQUIET) {
+ if (!(dd->ipath_flags & IPATH_IB_AUTONEG_INPROG) &&
+ (++dd->ipath_ibpollcnt == 40)) {
+ dd->ipath_flags |= IPATH_NOCABLE;
+ *dd->ipath_statusp |=
+ IPATH_STATUS_IB_NOCABLE;
+ ipath_cdbg(LINKVERB, "Set NOCABLE\n");
+ }
+ ipath_cdbg(LINKVERB, "POLL change to %s (%x)\n",
+ ipath_ibcstatus_str[ltstate], ibstate);
+ goto skip_ibchange;
+ }
+ }
+
+ dd->ipath_ibpollcnt = 0; /* not poll*, now */
+ ipath_stats.sps_iblink++;
+
+ if (ibstate != init && dd->ipath_lastlinkrecov && ipath_linkrecovery) {
+ u64 linkrecov;
+ linkrecov = ipath_snap_cntr(dd,
+ dd->ipath_cregs->cr_iblinkerrrecovcnt);
+ if (linkrecov != dd->ipath_lastlinkrecov) {
+ ipath_dbg("IB linkrecov up %Lx (%s %s) recov %Lu\n",
+ (unsigned long long) ibcs,
+ ib_linkstate(dd, ibcs),
+ ipath_ibcstatus_str[ltstate],
+ (unsigned long long) linkrecov);
+ /* and no more until active again */
+ dd->ipath_lastlinkrecov = 0;
+ ipath_set_linkstate(dd, IPATH_IB_LINKDOWN);
+ goto skip_ibchange;
+ }
+ }
+
+ if (ibstate == init || ibstate == arm || ibstate == active) {
+ *dd->ipath_statusp &= ~IPATH_STATUS_IB_NOCABLE;
+ if (ibstate == init || ibstate == arm) {
+ *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
+ if (dd->ipath_flags & IPATH_LINKACTIVE)
+ signal_ib_event(dd, IB_EVENT_PORT_ERR);
+ }
+ if (ibstate == arm) {
+ dd->ipath_flags |= IPATH_LINKARMED;
+ dd->ipath_flags &= ~(IPATH_LINKUNK |
+ IPATH_LINKINIT | IPATH_LINKDOWN |
+ IPATH_LINKACTIVE | IPATH_NOCABLE);
+ ipath_hol_down(dd);
+ } else if (ibstate == init) {
+ /*
+ * set INIT and DOWN. Down is checked by
+ * most of the other code, but INIT is
+ * useful to know in a few places.
+ */
+ dd->ipath_flags |= IPATH_LINKINIT |
+ IPATH_LINKDOWN;
+ dd->ipath_flags &= ~(IPATH_LINKUNK |
+ IPATH_LINKARMED | IPATH_LINKACTIVE |
+ IPATH_NOCABLE);
+ ipath_hol_down(dd);
+ } else { /* active */
+ dd->ipath_lastlinkrecov = ipath_snap_cntr(dd,
+ dd->ipath_cregs->cr_iblinkerrrecovcnt);
+ *dd->ipath_statusp |=
+ IPATH_STATUS_IB_READY | IPATH_STATUS_IB_CONF;
+ dd->ipath_flags |= IPATH_LINKACTIVE;
+ dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
+ | IPATH_LINKDOWN | IPATH_LINKARMED |
+ IPATH_NOCABLE);
+ if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
+ ipath_restart_sdma(dd);
+ signal_ib_event(dd, IB_EVENT_PORT_ACTIVE);
+ /* LED active not handled in chip _f_updown */
+ dd->ipath_f_setextled(dd, lstate, ltstate);
+ ipath_hol_up(dd);
+ }
+
+ /*
+ * print after we've already done the work, so as not to
+ * delay the state changes and notifications, for debugging
+ */
+ if (lstate == lastlstate)
+ ipath_cdbg(LINKVERB, "Unchanged from last: %s "
+ "(%x)\n", ib_linkstate(dd, ibcs), ibstate);
+ else
+ ipath_cdbg(VERBOSE, "Unit %u: link up to %s %s (%x)\n",
+ dd->ipath_unit, ib_linkstate(dd, ibcs),
+ ipath_ibcstatus_str[ltstate], ibstate);
+ } else { /* down */
+ if (dd->ipath_flags & IPATH_LINKACTIVE)
+ signal_ib_event(dd, IB_EVENT_PORT_ERR);
+ dd->ipath_flags |= IPATH_LINKDOWN;
+ dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
+ | IPATH_LINKACTIVE |
+ IPATH_LINKARMED);
+ *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
+ dd->ipath_lli_counter = 0;
+
+ if (lastlstate != INFINIPATH_IBCS_L_STATE_DOWN)
+ ipath_cdbg(VERBOSE, "Unit %u link state down "
+ "(state 0x%x), from %s\n",
+ dd->ipath_unit, lstate,
+ ib_linkstate(dd, dd->ipath_lastibcstat));
+ else
+ ipath_cdbg(LINKVERB, "Unit %u link state changed "
+ "to %s (0x%x) from down (%x)\n",
+ dd->ipath_unit,
+ ipath_ibcstatus_str[ltstate],
+ ibstate, lastlstate);
+ }
+
+skip_ibchange:
+ dd->ipath_lastibcstat = ibcs;
+done:
+ return;
+}
+
+static void handle_supp_msgs(struct ipath_devdata *dd,
+ unsigned supp_msgs, char *msg, u32 msgsz)
+{
+ /*
+ * Print the message unless it's ibc status change only, which
+ * happens so often we never want to count it.
+ */
+ if (dd->ipath_lasterror & ~INFINIPATH_E_IBSTATUSCHANGED) {
+ int iserr;
+ ipath_err_t mask;
+ iserr = ipath_decode_err(dd, msg, msgsz,
+ dd->ipath_lasterror &
+ ~INFINIPATH_E_IBSTATUSCHANGED);
+
+ mask = INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
+ INFINIPATH_E_PKTERRS | INFINIPATH_E_SDMADISABLED;
+
+ /* if we're in debug, then don't mask SDMADISABLED msgs */
+ if (ipath_debug & __IPATH_DBG)
+ mask &= ~INFINIPATH_E_SDMADISABLED;
+
+ if (dd->ipath_lasterror & ~mask)
+ ipath_dev_err(dd, "Suppressed %u messages for "
+ "fast-repeating errors (%s) (%llx)\n",
+ supp_msgs, msg,
+ (unsigned long long)
+ dd->ipath_lasterror);
+ else {
+ /*
+ * rcvegrfull and rcvhdrqfull are "normal", for some
+ * types of processes (mostly benchmarks) that send
+ * huge numbers of messages, while not processing
+ * them. So only complain about these at debug
+ * level.
+ */
+ if (iserr)
+ ipath_dbg("Suppressed %u messages for %s\n",
+ supp_msgs, msg);
+ else
+ ipath_cdbg(ERRPKT,
+ "Suppressed %u messages for %s\n",
+ supp_msgs, msg);
+ }
+ }
+}
+
+static unsigned handle_frequent_errors(struct ipath_devdata *dd,
+ ipath_err_t errs, char *msg,
+ u32 msgsz, int *noprint)
+{
+ unsigned long nc;
+ static unsigned long nextmsg_time;
+ static unsigned nmsgs, supp_msgs;
+
+ /*
+ * Throttle back "fast" messages to no more than 10 per 5 seconds.
+ * This isn't perfect, but it's a reasonable heuristic. If we get
+ * more than 10, give a 6x longer delay.
+ */
+ nc = jiffies;
+ if (nmsgs > 10) {
+ if (time_before(nc, nextmsg_time)) {
+ *noprint = 1;
+ if (!supp_msgs++)
+ nextmsg_time = nc + HZ * 3;
+ } else if (supp_msgs) {
+ handle_supp_msgs(dd, supp_msgs, msg, msgsz);
+ supp_msgs = 0;
+ nmsgs = 0;
+ }
+ } else if (!nmsgs++ || time_after(nc, nextmsg_time))
+ nextmsg_time = nc + HZ / 2;
+
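+ /*
+ * return the running count of suppressed messages; the caller
+ * (handle_errors()) uses it to decide when to temporarily mask
+ * the offending errors entirely
+ */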
+ return supp_msgs;
+}
+
+static void handle_sdma_errors(struct ipath_devdata *dd, ipath_err_t errs)
+{
+ unsigned long flags;
+ int expected;
+
+ if (ipath_debug & __IPATH_DBG) {
+ char msg[128];
+ ipath_decode_err(dd, msg, sizeof msg, errs &
+ INFINIPATH_E_SDMAERRS);
+ ipath_dbg("errors %lx (%s)\n", (unsigned long)errs, msg);
+ }
+ if (ipath_debug & __IPATH_VERBDBG) {
+ unsigned long tl, hd, status, lengen;
+ tl = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmatail);
+ hd = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmahead);
+ status = ipath_read_kreg64(dd,
+ dd->ipath_kregs->kr_senddmastatus);
+ lengen = ipath_read_kreg64(dd,
+ dd->ipath_kregs->kr_senddmalengen);
+ ipath_cdbg(VERBOSE, "sdma tl 0x%lx hd 0x%lx status 0x%lx "
+ "lengen 0x%lx\n", tl, hd, status, lengen);
+ }
+
+ spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+ __set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
+ expected = test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+ if (!expected)
+ ipath_cancel_sends(dd, 1);
+}
+
+static void handle_sdma_intr(struct ipath_devdata *dd, u64 istat)
+{
+ unsigned long flags;
+ int expected;
+
+ if ((istat & INFINIPATH_I_SDMAINT) &&
+ !test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+ ipath_sdma_intr(dd);
+
+ if (istat & INFINIPATH_I_SDMADISABLED) {
+ expected = test_bit(IPATH_SDMA_ABORTING,
+ &dd->ipath_sdma_status);
+ ipath_dbg("%s SDmaDisabled intr\n",
+ expected ? "expected" : "unexpected");
+ spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+ __set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+ if (!expected)
+ ipath_cancel_sends(dd, 1);
+ if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+ tasklet_hi_schedule(&dd->ipath_sdma_abort_task);
+ }
+}
+
+static int handle_hdrq_full(struct ipath_devdata *dd)
+{
+ int chkerrpkts = 0;
+ u32 hd, tl;
+ u32 i;
+
+ ipath_stats.sps_hdrqfull++;
+ for (i = 0; i < dd->ipath_cfgports; i++) {
+ struct ipath_portdata *pd = dd->ipath_pd[i];
+
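+ /*
+ * port 0 is the kernel port; its portdata is always allocated
+ * before interrupts are enabled, so it is handled before the
+ * open check below.
+ */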
+ if (i == 0) {
+ /*
+ * For kernel receive queues, we just want to know
+ * if there are packets in the queue that we can
+ * process.
+ */
+ if (pd->port_head != ipath_get_hdrqtail(pd))
+ chkerrpkts |= 1 << i;
+ continue;
+ }
+
+ /* Skip if user context is not open */
+ if (!pd || !pd->port_cnt)
+ continue;
+
+ /* Don't report the same point multiple times. */
+ if (dd->ipath_flags & IPATH_NODMA_RTAIL)
+ tl = ipath_read_ureg32(dd, ur_rcvhdrtail, i);
+ else
+ tl = ipath_get_rcvhdrtail(pd);
+ if (tl == pd->port_lastrcvhdrqtail)
+ continue;
+
+ hd = ipath_read_ureg32(dd, ur_rcvhdrhead, i);
+ if (hd == (tl + 1) || (!hd && tl == dd->ipath_hdrqlast)) {
+ pd->port_lastrcvhdrqtail = tl;
+ pd->port_hdrqfull++;
+ /* flush hdrqfull so that poll() sees it */
+ wmb();
+ wake_up_interruptible(&pd->port_wait);
+ }
+ }
+
+ return chkerrpkts;
+}
+
+static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs)
+{
+ char msg[128];
+ u64 ignore_this_time = 0;
+ u64 iserr = 0;
+ int chkerrpkts = 0, noprint = 0;
+ unsigned supp_msgs;
+ int log_idx;
+
+ /*
+ * don't report errors that are masked, either at init
+ * (not set in ipath_errormask), or temporarily (set in
+ * ipath_maskederrs)
+ */
+ errs &= dd->ipath_errormask & ~dd->ipath_maskederrs;
+
+ supp_msgs = handle_frequent_errors(dd, errs, msg, (u32)sizeof msg,
+ &noprint);
+
+ /* do these first, they are most important */
+ if (errs & INFINIPATH_E_HARDWARE) {
+ /* reuse same msg buf */
+ dd->ipath_f_handle_hwerrors(dd, msg, sizeof msg);
+ } else {
+ u64 mask;
+ for (log_idx = 0; log_idx < IPATH_EEP_LOG_CNT; ++log_idx) {
+ mask = dd->ipath_eep_st_masks[log_idx].errs_to_log;
+ if (errs & mask)
+ ipath_inc_eeprom_err(dd, log_idx, 1);
+ }
+ }
+
+ if (errs & INFINIPATH_E_SDMAERRS)
+ handle_sdma_errors(dd, errs);
+
+ if (!noprint && (errs & ~dd->ipath_e_bitsextant))
+ ipath_dev_err(dd, "error interrupt with unknown errors "
+ "%llx set\n", (unsigned long long)
+ (errs & ~dd->ipath_e_bitsextant));
+
+ if (errs & E_SUM_ERRS)
+ ignore_this_time = handle_e_sum_errs(dd, errs);
+ else if ((errs & E_SUM_LINK_PKTERRS) &&
+ !(dd->ipath_flags & IPATH_LINKACTIVE)) {
+ /*
+ * This can happen when SMA is trying to bring the link
+ * up, but the IB link changes state at the "wrong" time.
+ * The IB logic then complains that the packet isn't
+ * valid. We don't want to confuse people, so we just
+ * don't print them, except at debug
+ */
+ ipath_dbg("Ignoring packet errors %llx, because link not "
+ "ACTIVE\n", (unsigned long long) errs);
+ ignore_this_time = errs & E_SUM_LINK_PKTERRS;
+ }
+
+ if (supp_msgs == 250000) {
+ int s_iserr;
+ /*
+ * It's not entirely reasonable to assume that the errors set
+ * in the last clear period are all responsible for the
+ * problem, but the alternative (blaming only the ones seen on
+ * this particular interrupt) isn't great either
+ */
+ dd->ipath_maskederrs |= dd->ipath_lasterror | errs;
+
+ dd->ipath_errormask &= ~dd->ipath_maskederrs;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+ dd->ipath_errormask);
+ s_iserr = ipath_decode_err(dd, msg, sizeof msg,
+ dd->ipath_maskederrs);
+
+ if (dd->ipath_maskederrs &
+ ~(INFINIPATH_E_RRCVEGRFULL |
+ INFINIPATH_E_RRCVHDRFULL | INFINIPATH_E_PKTERRS))
+ ipath_dev_err(dd, "Temporarily disabling "
+ "error(s) %llx reporting; too frequent (%s)\n",
+ (unsigned long long) dd->ipath_maskederrs,
+ msg);
+ else {
+ /*
+ * rcvegrfull and rcvhdrqfull are "normal",
+ * for some types of processes (mostly benchmarks)
+ * that send huge numbers of messages, while not
+ * processing them. So only complain about
+ * these at debug level.
+ */
+ if (s_iserr)
+ ipath_dbg("Temporarily disabling reporting "
+ "too frequent queue full errors (%s)\n",
+ msg);
+ else
+ ipath_cdbg(ERRPKT,
+ "Temporarily disabling reporting too"
+ " frequent packet errors (%s)\n",
+ msg);
+ }
+
+ /*
+ * Re-enable the masked errors after around 3 minutes, in
+ * ipath_get_faststats(). If we have a series of fast-repeating
+ * but different errors, the interval will keep stretching out;
+ * that's OK, since such a situation is pretty catastrophic
+ * anyway.
+ */
+ dd->ipath_unmasktime = jiffies + HZ * 180;
+ }
+
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, errs);
+ if (ignore_this_time)
+ errs &= ~ignore_this_time;
+ if (errs & ~dd->ipath_lasterror) {
+ errs &= ~dd->ipath_lasterror;
+ /* never suppress duplicate hwerrors or ibstatuschange */
+ dd->ipath_lasterror |= errs &
+ ~(INFINIPATH_E_HARDWARE |
+ INFINIPATH_E_IBSTATUSCHANGED);
+ }
+
+ if (errs & INFINIPATH_E_SENDSPECIALTRIGGER) {
+ dd->ipath_spectriggerhit++;
+ ipath_dbg("%lu special trigger hits\n",
+ dd->ipath_spectriggerhit);
+ }
+
+ /* likely due to cancel; so suppress message unless verbose */
+ if ((errs & (INFINIPATH_E_SPKTLEN | INFINIPATH_E_SPIOARMLAUNCH)) &&
+ time_after(dd->ipath_lastcancel, jiffies)) {
+ /* armlaunch takes precedence; it often causes both. */
+ ipath_cdbg(VERBOSE,
+ "Suppressed %s error (%llx) after sendbuf cancel\n",
+ (errs & INFINIPATH_E_SPIOARMLAUNCH) ?
+ "armlaunch" : "sendpktlen", (unsigned long long)errs);
+ errs &= ~(INFINIPATH_E_SPIOARMLAUNCH | INFINIPATH_E_SPKTLEN);
+ }
+
+ if (!errs)
+ return 0;
+
+ if (!noprint) {
+ ipath_err_t mask;
+ /*
+ * The ones we mask off are handled specially below
+ * or above. Also mask SDMADISABLED by default as it
+ * is too chatty.
+ */
+ mask = INFINIPATH_E_IBSTATUSCHANGED |
+ INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
+ INFINIPATH_E_HARDWARE | INFINIPATH_E_SDMADISABLED;
+
+ /* if we're in debug, then don't mask SDMADISABLED msgs */
+ if (ipath_debug & __IPATH_DBG)
+ mask &= ~INFINIPATH_E_SDMADISABLED;
+
+ ipath_decode_err(dd, msg, sizeof msg, errs & ~mask);
+ } else
+ /* so we don't need if (!noprint) at strlcat's below */
+ *msg = 0;
+
+ if (errs & E_SUM_PKTERRS) {
+ ipath_stats.sps_pkterrs++;
+ chkerrpkts = 1;
+ }
+ if (errs & E_SUM_ERRS)
+ ipath_stats.sps_errs++;
+
+ if (errs & (INFINIPATH_E_RICRC | INFINIPATH_E_RVCRC)) {
+ ipath_stats.sps_crcerrs++;
+ chkerrpkts = 1;
+ }
+ iserr = errs & ~(E_SUM_PKTERRS | INFINIPATH_E_PKTERRS);
+
+
+ /*
+ * We don't want to print these two as they happen, or we can make
+ * the situation even worse, because it takes so long to print
+ * messages to serial consoles. Kernel ports get printed from
+ * fast_stats, no more than every 5 seconds, user ports get printed
+ * on close
+ */
+ if (errs & INFINIPATH_E_RRCVHDRFULL)
+ chkerrpkts |= handle_hdrq_full(dd);
+ if (errs & INFINIPATH_E_RRCVEGRFULL) {
+ struct ipath_portdata *pd = dd->ipath_pd[0];
+
+ /*
+ * since this is of less importance and not likely to
+ * happen without also getting hdrfull, only count
+ * occurrences; don't check each port (or even the kernel
+ * vs user)
+ */
+ ipath_stats.sps_etidfull++;
+ if (pd->port_head != ipath_get_hdrqtail(pd))
+ chkerrpkts |= 1;
+ }
+
+ /*
+ * do this before IBSTATUSCHANGED, in case both bits set in a single
+ * interrupt; we want the STATUSCHANGE to "win", so we do our
+ * internal copy of state machine correctly
+ */
+ if (errs & INFINIPATH_E_RIBLOSTLINK) {
+ /*
+ * force through block below
+ */
+ errs |= INFINIPATH_E_IBSTATUSCHANGED;
+ ipath_stats.sps_iblink++;
+ dd->ipath_flags |= IPATH_LINKDOWN;
+ dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
+ | IPATH_LINKARMED | IPATH_LINKACTIVE);
+ *dd->ipath_statusp &= ~IPATH_STATUS_IB_READY;
+
+ ipath_dbg("Lost link, link now down (%s)\n",
+ ipath_ibcstatus_str[ipath_read_kreg64(dd,
+ dd->ipath_kregs->kr_ibcstatus) & 0xf]);
+ }
+ if (errs & INFINIPATH_E_IBSTATUSCHANGED)
+ handle_e_ibstatuschanged(dd, errs);
+
+ if (errs & INFINIPATH_E_RESET) {
+ if (!noprint)
+ ipath_dev_err(dd, "Got reset, requires re-init "
+ "(unload and reload driver)\n");
+ dd->ipath_flags &= ~IPATH_INITTED; /* needs re-init */
+ /* mark as having had error */
+ *dd->ipath_statusp |= IPATH_STATUS_HWERROR;
+ *dd->ipath_statusp &= ~IPATH_STATUS_IB_CONF;
+ }
+
+ if (!noprint && *msg) {
+ if (iserr)
+ ipath_dev_err(dd, "%s error\n", msg);
+ }
+ if (dd->ipath_state_wanted & dd->ipath_flags) {
+ ipath_cdbg(VERBOSE, "driver wanted state %x, iflags now %x, "
+ "waking\n", dd->ipath_state_wanted,
+ dd->ipath_flags);
+ wake_up_interruptible(&ipath_state_wait);
+ }
+
+ return chkerrpkts;
+}
+
+/*
+ * try to cleanup as much as possible for anything that might have gone
+ * wrong while in freeze mode, such as pio buffers being written by user
+ * processes (causing armlaunch), send errors due to going into freeze mode,
+ * etc., and try to avoid causing extra interrupts while doing so.
+ * Forcibly update the in-memory pioavail register copies after cleanup
+ * because the chip won't do it while in freeze mode (the register values
+ * themselves are kept correct).
+ * Make sure that we don't lose any important interrupts by using the chip
+ * feature that says that writing 0 to a bit in *clear that is set in
+ * *status will cause an interrupt to be generated again (if allowed by
+ * the *mask value).
+ */
+void ipath_clear_freeze(struct ipath_devdata *dd)
+{
+ /* disable error interrupts, to avoid confusion */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask, 0ULL);
+
+ /* also disable interrupts; errormask is sometimes overwritten */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
+
+ ipath_cancel_sends(dd, 1);
+
+ /* clear the freeze, and be sure chip saw it */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
+ dd->ipath_control);
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+
+ /* force in-memory update now we are out of freeze */
+ ipath_force_pio_avail_update(dd);
+
+ /*
+ * force new interrupt if any hwerr, error or interrupt bits are
+ * still set, and clear "safe" send packet errors related to freeze
+ * and cancelling sends. Re-enable error interrupts before possible
+ * force of re-interrupt on pending interrupts.
+ */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, 0ULL);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear,
+ E_SPKT_ERRS_IGNORE);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+ dd->ipath_errormask);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, -1LL);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
+}
+
+
+/* this is separate to allow for better optimization of ipath_intr() */
+
+static noinline void ipath_bad_intr(struct ipath_devdata *dd, u32 *unexpectp)
+{
+ /*
+ * these sometimes happen during driver init and unload; we don't
+ * want to process any interrupts at that point
+ */
+
+ /*
+ * this is just a band-aid, not a fix, for the case where something
+ * goes badly wrong
+ */
+ if (++*unexpectp > 100) {
+ if (++*unexpectp > 105) {
+ /*
+ * ok, we must be taking somebody else's interrupts,
+ * due to a messed up mptable and/or PIRQ table, so
+ * unregister the interrupt. We've seen this during
+ * linuxbios development work, and it may happen in
+ * the future again.
+ */
+ if (dd->pcidev && dd->ipath_irq) {
+ ipath_dev_err(dd, "Now %u unexpected "
+ "interrupts, unregistering "
+ "interrupt handler\n",
+ *unexpectp);
+ ipath_dbg("free_irq of irq %d\n",
+ dd->ipath_irq);
+ dd->ipath_f_free_irq(dd);
+ }
+ }
+ if (ipath_read_ireg(dd, dd->ipath_kregs->kr_intmask)) {
+ ipath_dev_err(dd, "%u unexpected interrupts, "
+ "disabling interrupts completely\n",
+ *unexpectp);
+ /*
+ * disable all interrupts, something is very wrong
+ */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask,
+ 0ULL);
+ }
+ } else if (*unexpectp > 1)
+ ipath_dbg("Interrupt when not ready, should not happen, "
+ "ignoring\n");
+}
+
+static noinline void ipath_bad_regread(struct ipath_devdata *dd)
+{
+ static int allbits;
+
+ /* separate routine, for better optimization of ipath_intr() */
+
+ /*
+ * We print the message and disable interrupts, in hope of
+ * having a better chance of debugging the problem.
+ */
+ ipath_dev_err(dd,
+ "Read of interrupt status failed (all bits set)\n");
+ if (allbits++) {
+ /* disable all interrupts, something is very wrong */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intmask, 0ULL);
+ if (allbits == 2) {
+ ipath_dev_err(dd, "Still bad interrupt status, "
+ "unregistering interrupt\n");
+ dd->ipath_f_free_irq(dd);
+ } else if (allbits > 2) {
+ if ((allbits % 10000) == 0)
+ printk(".");
+ } else
+ ipath_dev_err(dd, "Disabling interrupts, "
+ "multiple errors\n");
+ }
+}
+
+static void handle_layer_pioavail(struct ipath_devdata *dd)
+{
+ unsigned long flags;
+ int ret;
+
+ ret = ipath_ib_piobufavail(dd->verbs_dev);
+ if (ret > 0)
+ goto set;
+
+ return;
+set:
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ dd->ipath_sendctrl |= INFINIPATH_S_PIOINTBUFAVAIL;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+ dd->ipath_sendctrl);
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+}
+
+/*
+ * Handle receive interrupts for user ports; this means a user
+ * process was waiting for a packet to arrive, and didn't want
+ * to poll
+ */
+static void handle_urcv(struct ipath_devdata *dd, u64 istat)
+{
+ u64 portr;
+ int i;
+ int rcvdint = 0;
+
+ /*
+ * test_and_clear_bit(IPATH_PORT_WAITING_RCV) and
+ * test_and_clear_bit(IPATH_PORT_WAITING_URG) below
+ * would both like timely updates of the bits so that
+ * we don't pass them by unnecessarily. The rmb()
+ * here ensures that we see them promptly -- the
+ * corresponding wmb()'s are in ipath_poll_urgent()
+ * and ipath_poll_next()...
+ */
+ rmb();
+ portr = ((istat >> dd->ipath_i_rcvavail_shift) &
+ dd->ipath_i_rcvavail_mask) |
+ ((istat >> dd->ipath_i_rcvurg_shift) &
+ dd->ipath_i_rcvurg_mask);
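+ /*
+ * portr now has one bit per port, set if that port has a
+ * receive-available or urgent-packet interrupt pending
+ */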
+ for (i = 1; i < dd->ipath_cfgports; i++) {
+ struct ipath_portdata *pd = dd->ipath_pd[i];
+
+ if (portr & (1 << i) && pd && pd->port_cnt) {
+ if (test_and_clear_bit(IPATH_PORT_WAITING_RCV,
+ &pd->port_flag)) {
+ clear_bit(i + dd->ipath_r_intravail_shift,
+ &dd->ipath_rcvctrl);
+ wake_up_interruptible(&pd->port_wait);
+ rcvdint = 1;
+ } else if (test_and_clear_bit(IPATH_PORT_WAITING_URG,
+ &pd->port_flag)) {
+ pd->port_urgent++;
+ wake_up_interruptible(&pd->port_wait);
+ }
+ }
+ }
+ if (rcvdint) {
+ /* only want to take one interrupt, so turn off the rcv
+ * interrupt for all the ports that had rcv-waiting set
+ * (but never for the kernel port)
+ */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_rcvctrl,
+ dd->ipath_rcvctrl);
+ }
+}
+
+irqreturn_t ipath_intr(int irq, void *data)
+{
+ struct ipath_devdata *dd = data;
+ u64 istat, chk0rcv = 0;
+ ipath_err_t estat = 0;
+ irqreturn_t ret;
+ static unsigned unexpected = 0;
+ u64 kportrbits;
+
+ ipath_stats.sps_ints++;
+
+ if (dd->ipath_int_counter != (u32) -1)
+ dd->ipath_int_counter++;
+
+ if (!(dd->ipath_flags & IPATH_PRESENT)) {
+ /*
+ * This return value is not great, but we do not want the
+ * interrupt core code to remove our interrupt handler
+ * because we don't appear to be handling an interrupt
+ * during a chip reset.
+ */
+ return IRQ_HANDLED;
+ }
+
+ /*
+ * this needs to be flags&initted, not statusp, so we keep
+ * taking interrupts even after link goes down, etc.
+ * Also, we *must* clear the interrupt at some point, or we won't
+ * take it again, which can be real bad for errors, etc...
+ */
+
+ if (!(dd->ipath_flags & IPATH_INITTED)) {
+ ipath_bad_intr(dd, &unexpected);
+ ret = IRQ_NONE;
+ goto bail;
+ }
+
+ istat = ipath_read_ireg(dd, dd->ipath_kregs->kr_intstatus);
+
+ if (unlikely(!istat)) {
+ ipath_stats.sps_nullintr++;
+ ret = IRQ_NONE; /* not our interrupt, or already handled */
+ goto bail;
+ }
+ if (unlikely(istat == -1)) {
+ ipath_bad_regread(dd);
+ /* don't know if it was our interrupt or not */
+ ret = IRQ_NONE;
+ goto bail;
+ }
+
+ if (unexpected)
+ unexpected = 0;
+
+ if (unlikely(istat & ~dd->ipath_i_bitsextant))
+ ipath_dev_err(dd,
+ "interrupt with unknown interrupts %Lx set\n",
+ (unsigned long long)
+ istat & ~dd->ipath_i_bitsextant);
+ else if (istat & ~INFINIPATH_I_ERROR) /* errors do own printing */
+ ipath_cdbg(VERBOSE, "intr stat=0x%Lx\n",
+ (unsigned long long) istat);
+
+ if (istat & INFINIPATH_I_ERROR) {
+ ipath_stats.sps_errints++;
+ estat = ipath_read_kreg64(dd,
+ dd->ipath_kregs->kr_errorstatus);
+ if (!estat)
+ dev_info(&dd->pcidev->dev, "error interrupt (%Lx), "
+ "but no error bits set!\n",
+ (unsigned long long) istat);
+ else if (estat == -1LL)
+ /*
+ * should we try clearing all, or hope next read
+ * works?
+ */
+ ipath_dev_err(dd, "Read of error status failed "
+ "(all bits set); ignoring\n");
+ else
+ chk0rcv |= handle_errors(dd, estat);
+ }
+
+ if (istat & INFINIPATH_I_GPIO) {
+ /*
+ * GPIO interrupts fall in two broad classes:
+ * GPIO_2 indicates (on some HT4xx boards) that a packet
+ * has arrived for Port 0. Checking for this
+ * is controlled by flag IPATH_GPIO_INTR.
+ * GPIO_3..5 on IBA6120 Rev2 and IBA6110 Rev4 chips indicate
+ * errors that we need to count. Checking for this
+ * is controlled by flag IPATH_GPIO_ERRINTRS.
+ */
+ u32 gpiostatus;
+ u32 to_clear = 0;
+
+ gpiostatus = ipath_read_kreg32(
+ dd, dd->ipath_kregs->kr_gpio_status);
+ /* First the error-counter case. */
+ if ((gpiostatus & IPATH_GPIO_ERRINTR_MASK) &&
+ (dd->ipath_flags & IPATH_GPIO_ERRINTRS)) {
+ /* want to clear the bits we see asserted. */
+ to_clear |= (gpiostatus & IPATH_GPIO_ERRINTR_MASK);
+
+ /*
+ * Count appropriately, clear bits out of our copy,
+ * as they have been "handled".
+ */
+ if (gpiostatus & (1 << IPATH_GPIO_RXUVL_BIT)) {
+ ipath_dbg("FlowCtl on UnsupVL\n");
+ dd->ipath_rxfc_unsupvl_errs++;
+ }
+ if (gpiostatus & (1 << IPATH_GPIO_OVRUN_BIT)) {
+ ipath_dbg("Overrun Threshold exceeded\n");
+ dd->ipath_overrun_thresh_errs++;
+ }
+ if (gpiostatus & (1 << IPATH_GPIO_LLI_BIT)) {
+ ipath_dbg("Local Link Integrity error\n");
+ dd->ipath_lli_errs++;
+ }
+ gpiostatus &= ~IPATH_GPIO_ERRINTR_MASK;
+ }
+ /* Now the Port0 Receive case */
+ if ((gpiostatus & (1 << IPATH_GPIO_PORT0_BIT)) &&
+ (dd->ipath_flags & IPATH_GPIO_INTR)) {
+ /*
+ * GPIO status bit 2 is set, and we expected it.
+ * Clear it and note it via chk0rcv.
+ * This probably only happens if a Port0 pkt
+ * arrives at _just_ the wrong time, and we
+ * handle that by setting chk0rcv.
+ */
+ to_clear |= (1 << IPATH_GPIO_PORT0_BIT);
+ gpiostatus &= ~(1 << IPATH_GPIO_PORT0_BIT);
+ chk0rcv = 1;
+ }
+ if (gpiostatus) {
+ /*
+ * Some unexpected bits remain. If they could have
+ * caused the interrupt, complain and clear.
+ * To avoid repetition of this condition, also clear
+ * the mask. It is almost certainly due to error.
+ */
+ const u32 mask = (u32) dd->ipath_gpio_mask;
+
+ if (mask & gpiostatus) {
+ ipath_dbg("Unexpected GPIO IRQ bits %x\n",
+ gpiostatus & mask);
+ to_clear |= (gpiostatus & mask);
+ dd->ipath_gpio_mask &= ~(gpiostatus & mask);
+ ipath_write_kreg(dd,
+ dd->ipath_kregs->kr_gpio_mask,
+ dd->ipath_gpio_mask);
+ }
+ }
+ if (to_clear) {
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_clear,
+ (u64) to_clear);
+ }
+ }
+
+ /*
+ * Clear the interrupt bits we found set, unless they are receive
+ * related, in which case we already cleared them above, and don't
+ * want to clear them again, because we might lose an interrupt.
+ * Clear it early, so we "know" the chip will have seen this by
+ * the time we process the queue, and will re-interrupt if necessary.
+ * The processor itself won't take the interrupt again until we return.
+ */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, istat);
+
+ /*
+ * Handle kernel receive queues before checking for pio buffers
+ * available since receives can overflow; piobuf waiters can afford
+ * a few extra cycles, since they were waiting anyway, and user's
+ * waiting for receive are at the bottom.
+ */
+ kportrbits = (1ULL << dd->ipath_i_rcvavail_shift) |
+ (1ULL << dd->ipath_i_rcvurg_shift);
+ if (chk0rcv || (istat & kportrbits)) {
+ istat &= ~kportrbits;
+ ipath_kreceive(dd->ipath_pd[0]);
+ }
+
+ if (istat & ((dd->ipath_i_rcvavail_mask << dd->ipath_i_rcvavail_shift) |
+ (dd->ipath_i_rcvurg_mask << dd->ipath_i_rcvurg_shift)))
+ handle_urcv(dd, istat);
+
+ if (istat & (INFINIPATH_I_SDMAINT | INFINIPATH_I_SDMADISABLED))
+ handle_sdma_intr(dd, istat);
+
+ if (istat & INFINIPATH_I_SPIOBUFAVAIL) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ dd->ipath_sendctrl &= ~INFINIPATH_S_PIOINTBUFAVAIL;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+ dd->ipath_sendctrl);
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+ /* always process; sdma verbs uses PIO for acks and VL15 */
+ handle_layer_pioavail(dd);
+ }
+
+ ret = IRQ_HANDLED;
+
+bail:
+ return ret;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h
new file mode 100644
index 000000000..e08db7020
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_kernel.h
@@ -0,0 +1,1375 @@
+#ifndef _IPATH_KERNEL_H
+#define _IPATH_KERNEL_H
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This header file is the base header file for infinipath kernel code
+ * ipath_user.h serves a similar purpose for user code.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/scatterlist.h>
+#include <asm/io.h>
+#include <rdma/ib_verbs.h>
+
+#include "ipath_common.h"
+#include "ipath_debug.h"
+#include "ipath_registers.h"
+
+/* only s/w major version of InfiniPath we can handle */
+#define IPATH_CHIP_VERS_MAJ 2U
+
+/* don't care about this except printing */
+#define IPATH_CHIP_VERS_MIN 0U
+
+/* temporary, maybe always */
+extern struct infinipath_stats ipath_stats;
+
+#define IPATH_CHIP_SWVERSION IPATH_CHIP_VERS_MAJ
+/*
+ * First-cut criterion for "device is active" is
+ * two thousand dwords combined Tx, Rx traffic per
+ * 5-second interval. SMA packets are 64 dwords,
+ * and occur "a few per second", presumably each way.
+ */
+#define IPATH_TRAFFIC_ACTIVE_THRESHOLD (2000)
+/*
+ * Struct used to indicate which errors are logged in each of the
+ * error-counters that are logged to EEPROM. A counter is incremented
+ * _once_ (saturating at 255) for each event with any bits set in
+ * the error or hwerror register masks below.
+ */
+#define IPATH_EEP_LOG_CNT (4)
+struct ipath_eep_log_mask {
+ u64 errs_to_log;
+ u64 hwerrs_to_log;
+};
+
+struct ipath_portdata {
+ void **port_rcvegrbuf;
+ dma_addr_t *port_rcvegrbuf_phys;
+ /* rcvhdrq base, needs mmap before useful */
+ void *port_rcvhdrq;
+ /* kernel virtual address where hdrqtail is updated */
+ void *port_rcvhdrtail_kvaddr;
+ /*
+ * temp buffer for expected send setup, allocated at open, instead
+ * of each setup call
+ */
+ void *port_tid_pg_list;
+ /* when waiting for rcv or pioavail */
+ wait_queue_head_t port_wait;
+ /*
+ * rcvegr bufs base, physical; must fit in 44 bits so that
+ * mmap64 from 32 bit programs works
+ */
+ dma_addr_t port_rcvegr_phys;
+ /* mmap of hdrq, must fit in 44 bits */
+ dma_addr_t port_rcvhdrq_phys;
+ dma_addr_t port_rcvhdrqtailaddr_phys;
+ /*
+ * number of opens (including slave subports) on this instance
+ * (ignoring forks, dup, etc. for now)
+ */
+ int port_cnt;
+ /*
+ * how much space to leave at start of eager TID entries for
+ * protocol use, on each TID
+ */
+ /* this port's number, saved instead of calculating it each time */
+ unsigned port_port;
+ /* non-zero if port is being shared. */
+ u16 port_subport_cnt;
+ /* non-zero if port is being shared. */
+ u16 port_subport_id;
+ /* number of pio bufs for this port (all procs, if shared) */
+ u32 port_piocnt;
+ /* first pio buffer for this port */
+ u32 port_pio_base;
+ /* chip offset of PIO buffers for this port */
+ u32 port_piobufs;
+ /* how many alloc_pages() chunks in port_rcvegrbuf_pages */
+ u32 port_rcvegrbuf_chunks;
+ /* how many egrbufs per chunk */
+ u32 port_rcvegrbufs_perchunk;
+ /* order for port_rcvegrbuf_pages */
+ size_t port_rcvegrbuf_size;
+ /* rcvhdrq size (for freeing) */
+ size_t port_rcvhdrq_size;
+ /* next expected TID to check when looking for free */
+ u32 port_tidcursor;
+ /* port status and wait flags (IPATH_PORT_* bits) */
+ unsigned long port_flag;
+ /* what happened */
+ unsigned long int_flag;
+ /* WAIT_RCV that timed out, no interrupt */
+ u32 port_rcvwait_to;
+ /* WAIT_PIO that timed out, no interrupt */
+ u32 port_piowait_to;
+ /* WAIT_RCV already happened, no wait */
+ u32 port_rcvnowait;
+ /* WAIT_PIO already happened, no wait */
+ u32 port_pionowait;
+ /* total number of rcvhdrqfull errors */
+ u32 port_hdrqfull;
+ /*
+ * Used to suppress multiple instances of same
+ * port staying stuck at same point.
+ */
+ u32 port_lastrcvhdrqtail;
+ /* saved total number of rcvhdrqfull errors for poll edge trigger */
+ u32 port_hdrqfull_poll;
+ /* total number of polled urgent packets */
+ u32 port_urgent;
+ /* saved total number of polled urgent packets for poll edge trigger */
+ u32 port_urgent_poll;
+ /* pid of process using this port */
+ struct pid *port_pid;
+ struct pid *port_subpid[INFINIPATH_MAX_SUBPORT];
+ /* same size as task_struct .comm[] */
+ char port_comm[16];
+ /* pkeys set by this use of this port */
+ u16 port_pkeys[4];
+ /* so file ops can get at unit */
+ struct ipath_devdata *port_dd;
+ /* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */
+ void *subport_uregbase;
+ /* An array of pages for the eager receive buffers * N */
+ void *subport_rcvegrbuf;
+ /* An array of pages for the eager header queue entries * N */
+ void *subport_rcvhdr_base;
+ /* The version of the library which opened this port */
+ u32 userversion;
+ /* Bitmask of active slaves */
+ u32 active_slaves;
+ /* Type of packets or conditions we want to poll for */
+ u16 poll_type;
+ /* port rcvhdrq head offset */
+ u32 port_head;
+ /* receive packet sequence counter */
+ u32 port_seq_cnt;
+};
+
+struct sk_buff;
+struct ipath_sge_state;
+struct ipath_verbs_txreq;
+
+/*
+ * control information for layered drivers
+ */
+struct _ipath_layer {
+ void *l_arg;
+};
+
+struct ipath_skbinfo {
+ struct sk_buff *skb;
+ dma_addr_t phys;
+};
+
+struct ipath_sdma_txreq {
+ int flags;
+ int sg_count;
+ union {
+ struct scatterlist *sg;
+ void *map_addr;
+ };
+ void (*callback)(void *, int);
+ void *callback_cookie;
+ int callback_status;
+ u16 start_idx; /* sdma private */
+ u16 next_descq_idx; /* sdma private */
+ struct list_head list; /* sdma private */
+};
+
+struct ipath_sdma_desc {
+ __le64 qw[2];
+};
+
+#define IPATH_SDMA_TXREQ_F_USELARGEBUF 0x1
+#define IPATH_SDMA_TXREQ_F_HEADTOHOST 0x2
+#define IPATH_SDMA_TXREQ_F_INTREQ 0x4
+#define IPATH_SDMA_TXREQ_F_FREEBUF 0x8
+#define IPATH_SDMA_TXREQ_F_FREEDESC 0x10
+#define IPATH_SDMA_TXREQ_F_VL15 0x20
+
+#define IPATH_SDMA_TXREQ_S_OK 0
+#define IPATH_SDMA_TXREQ_S_SENDERROR 1
+#define IPATH_SDMA_TXREQ_S_ABORTED 2
+#define IPATH_SDMA_TXREQ_S_SHUTDOWN 3
+
+#define IPATH_SDMA_STATUS_SCORE_BOARD_DRAIN_IN_PROG (1ull << 63)
+#define IPATH_SDMA_STATUS_ABORT_IN_PROG (1ull << 62)
+#define IPATH_SDMA_STATUS_INTERNAL_SDMA_ENABLE (1ull << 61)
+#define IPATH_SDMA_STATUS_SCB_EMPTY (1ull << 30)
+
+/* max dwords in small buffer packet */
+#define IPATH_SMALLBUF_DWORDS (dd->ipath_piosize2k >> 2)
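+/* note: the macro above expands to a use of a local 'dd' pointer in the caller */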
+
+/*
+ * Possible IB config parameters for ipath_f_get/set_ib_cfg()
+ */
+#define IPATH_IB_CFG_LIDLMC 0 /* Get/set LID (LS16b) and Mask (MS16b) */
+#define IPATH_IB_CFG_HRTBT 1 /* Get/set Heartbeat off/enable/auto */
+#define IPATH_IB_HRTBT_ON 3 /* Heartbeat enabled, sent every 100msec */
+#define IPATH_IB_HRTBT_OFF 0 /* Heartbeat off */
+#define IPATH_IB_CFG_LWID_ENB 2 /* Get/set allowed Link-width */
+#define IPATH_IB_CFG_LWID 3 /* Get currently active Link-width */
+#define IPATH_IB_CFG_SPD_ENB 4 /* Get/set allowed Link speeds */
+#define IPATH_IB_CFG_SPD 5 /* Get current Link spd */
+#define IPATH_IB_CFG_RXPOL_ENB 6 /* Get/set Auto-RX-polarity enable */
+#define IPATH_IB_CFG_LREV_ENB 7 /* Get/set Auto-Lane-reversal enable */
+#define IPATH_IB_CFG_LINKLATENCY 8 /* Get link round-trip latency */
+
+
+struct ipath_devdata {
+ struct list_head ipath_list;
+
+ struct ipath_kregs const *ipath_kregs;
+ struct ipath_cregs const *ipath_cregs;
+
+ /* mem-mapped pointer to base of chip regs */
+ u64 __iomem *ipath_kregbase;
+ /* end of mem-mapped chip space; range checking */
+ u64 __iomem *ipath_kregend;
+ /* physical address of chip for io_remap, etc. */
+ unsigned long ipath_physaddr;
+ /* base of memory alloced for ipath_kregbase, for free */
+ u64 *ipath_kregalloc;
+ /* ipath_cfgports pointers */
+ struct ipath_portdata **ipath_pd;
+ /* sk_buffs used by port 0 eager receive queue */
+ struct ipath_skbinfo *ipath_port0_skbinfo;
+ /* kvirt address of 1st 2k pio buffer */
+ void __iomem *ipath_pio2kbase;
+ /* kvirt address of 1st 4k pio buffer */
+ void __iomem *ipath_pio4kbase;
+ /*
+ * points to area where PIOavail registers will be DMA'ed.
+ * Has to be on a page of its own, because the page will be
+ * mapped into user program space. This copy is *ONLY* ever
+ * written by DMA, not by the driver! Need a copy per device
+ * when we get to multiple devices
+ */
+ volatile __le64 *ipath_pioavailregs_dma;
+ /* physical address where updates occur */
+ dma_addr_t ipath_pioavailregs_phys;
+ struct _ipath_layer ipath_layer;
+ /* setup intr */
+ int (*ipath_f_intrsetup)(struct ipath_devdata *);
+ /* fallback to alternate interrupt type if possible */
+ int (*ipath_f_intr_fallback)(struct ipath_devdata *);
+ /* setup on-chip bus config */
+ int (*ipath_f_bus)(struct ipath_devdata *, struct pci_dev *);
+ /* hard reset chip */
+ int (*ipath_f_reset)(struct ipath_devdata *);
+ int (*ipath_f_get_boardname)(struct ipath_devdata *, char *,
+ size_t);
+ void (*ipath_f_init_hwerrors)(struct ipath_devdata *);
+ void (*ipath_f_handle_hwerrors)(struct ipath_devdata *, char *,
+ size_t);
+ void (*ipath_f_quiet_serdes)(struct ipath_devdata *);
+ int (*ipath_f_bringup_serdes)(struct ipath_devdata *);
+ int (*ipath_f_early_init)(struct ipath_devdata *);
+ void (*ipath_f_clear_tids)(struct ipath_devdata *, unsigned);
+ void (*ipath_f_put_tid)(struct ipath_devdata *, u64 __iomem*,
+ u32, unsigned long);
+ void (*ipath_f_tidtemplate)(struct ipath_devdata *);
+ void (*ipath_f_cleanup)(struct ipath_devdata *);
+ void (*ipath_f_setextled)(struct ipath_devdata *, u64, u64);
+ /* fill out chip-specific fields */
+ int (*ipath_f_get_base_info)(struct ipath_portdata *, void *);
+ /* free irq */
+ void (*ipath_f_free_irq)(struct ipath_devdata *);
+ struct ipath_message_header *(*ipath_f_get_msgheader)
+ (struct ipath_devdata *, __le32 *);
+ void (*ipath_f_config_ports)(struct ipath_devdata *, ushort);
+ int (*ipath_f_get_ib_cfg)(struct ipath_devdata *, int);
+ int (*ipath_f_set_ib_cfg)(struct ipath_devdata *, int, u32);
+ void (*ipath_f_config_jint)(struct ipath_devdata *, u16 , u16);
+ void (*ipath_f_read_counters)(struct ipath_devdata *,
+ struct infinipath_counters *);
+ void (*ipath_f_xgxs_reset)(struct ipath_devdata *);
+ /* per chip actions needed for IB Link up/down changes */
+ int (*ipath_f_ib_updown)(struct ipath_devdata *, int, u64);
+
+ unsigned ipath_lastegr_idx;
+ struct ipath_ibdev *verbs_dev;
+ struct timer_list verbs_timer;
+ /* total dwords sent (summed from counter) */
+ u64 ipath_sword;
+ /* total dwords rcvd (summed from counter) */
+ u64 ipath_rword;
+ /* total packets sent (summed from counter) */
+ u64 ipath_spkts;
+ /* total packets rcvd (summed from counter) */
+ u64 ipath_rpkts;
+ /* ipath_statusp initially points to this. */
+ u64 _ipath_status;
+ /* GUID for this interface, in network order */
+ __be64 ipath_guid;
+ /*
+ * aggregate of error bits reported since last cleared, for
+ * limiting of error reporting
+ */
+ ipath_err_t ipath_lasterror;
+ /*
+ * aggregate of error bits reported since last cleared, for
+ * limiting of hwerror reporting
+ */
+ ipath_err_t ipath_lasthwerror;
+ /* errors masked because they occur too fast */
+ ipath_err_t ipath_maskederrs;
+ u64 ipath_lastlinkrecov; /* link recoveries at last ACTIVE */
+ /* these 5 fields are used to establish deltas for IB Symbol
+ * errors and linkrecovery errors. They can be reported on
+ * some chips during link negotiation prior to INIT, and with
+ * DDR when faking DDR negotiations with non-IBTA switches.
+ * The chip counters are adjusted at driver unload if there is
+ * a non-zero delta.
+ */
+ u64 ibdeltainprog;
+ u64 ibsymdelta;
+ u64 ibsymsnap;
+ u64 iblnkerrdelta;
+ u64 iblnkerrsnap;
+
+ /* time in jiffies at which to re-enable maskederrs */
+ unsigned long ipath_unmasktime;
+ /* count of egrfull errors, combined for all ports */
+ u64 ipath_last_tidfull;
+ /* for ipath_qcheck() */
+ u64 ipath_lastport0rcv_cnt;
+ /* template for writing TIDs */
+ u64 ipath_tidtemplate;
+ /* value to write to free TIDs */
+ u64 ipath_tidinvalid;
+ /* IBA6120 rcv interrupt setup */
+ u64 ipath_rhdrhead_intr_off;
+
+ /* size of memory at ipath_kregbase */
+ u32 ipath_kregsize;
+ /* number of registers used for pioavail */
+ u32 ipath_pioavregs;
+ /* IPATH_POLL, etc. */
+ u32 ipath_flags;
+ /* ipath_flags driver is waiting for */
+ u32 ipath_state_wanted;
+ /* first pio buf for kernel use is this index; user buffers are
+ * below this index. */
+ u32 ipath_lastport_piobuf;
+ /* is a stats timer active */
+ u32 ipath_stats_timer_active;
+ /* number of interrupts for this device -- saturates... */
+ u32 ipath_int_counter;
+ /* dwords sent read from counter */
+ u32 ipath_lastsword;
+ /* dwords received read from counter */
+ u32 ipath_lastrword;
+ /* sent packets read from counter */
+ u32 ipath_lastspkts;
+ /* received packets read from counter */
+ u32 ipath_lastrpkts;
+ /* pio bufs allocated per port */
+ u32 ipath_pbufsport;
+ /* if remainder on bufs/port, ports < extrabuf get 1 extra */
+ u32 ipath_ports_extrabuf;
+ u32 ipath_pioupd_thresh; /* update threshold, some chips */
+ /*
+ * max number of ports to configure; zero means use the number the
+ * chip supports, fewer gives more pio bufs per port, etc.
+ */
+ u32 ipath_cfgports;
+ /* count of port 0 hdrqfull errors */
+ u32 ipath_p0_hdrqfull;
+ /* port 0 number of receive eager buffers */
+ u32 ipath_p0_rcvegrcnt;
+
+ /*
+ * index of last piobuffer we used. Speeds up searching, by
+ * starting at this point. It doesn't matter if multiple CPUs use and
+ * update it; the last update is the only write that matters. Whenever it
+ * wraps, we update shadow copies. Need a copy per device when we
+ * get to multiple devices
+ */
+ u32 ipath_lastpioindex;
+ u32 ipath_lastpioindexl;
+ /* max length of freezemsg */
+ u32 ipath_freezelen;
+ /*
+ * consecutive times we wanted a PIO buffer but were unable to
+ * get one
+ */
+ u32 ipath_consec_nopiobuf;
+ /*
+ * hint that we should update ipath_pioavailshadow before
+ * looking for a PIO buffer
+ */
+ u32 ipath_upd_pio_shadow;
+ /* so we can rewrite it after a chip reset */
+ u32 ipath_pcibar0;
+ /* so we can rewrite it after a chip reset */
+ u32 ipath_pcibar1;
+ u32 ipath_x1_fix_tries;
+ u32 ipath_autoneg_tries;
+ u32 serdes_first_init_done;
+
+ struct ipath_relock {
+ atomic_t ipath_relock_timer_active;
+ struct timer_list ipath_relock_timer;
+ unsigned int ipath_relock_interval; /* in jiffies */
+ } ipath_relock_singleton;
+
+ /* interrupt number */
+ int ipath_irq;
+ /* HT/PCI Vendor ID (here for NodeInfo) */
+ u16 ipath_vendorid;
+ /* HT/PCI Device ID (here for NodeInfo) */
+ u16 ipath_deviceid;
+ /* offset in HT config space of slave/primary interface block */
+ u8 ipath_ht_slave_off;
+ /* for write combining settings */
+ unsigned long ipath_wc_cookie;
+ unsigned long ipath_wc_base;
+ unsigned long ipath_wc_len;
+ /* ref count for each pkey */
+ atomic_t ipath_pkeyrefs[4];
+ /* shadow copy of struct page *'s for exp tid pages */
+ struct page **ipath_pageshadow;
+ /* shadow copy of dma handles for exp tid pages */
+ dma_addr_t *ipath_physshadow;
+ u64 __iomem *ipath_egrtidbase;
+ /* lock to workaround chip bug 9437 and others */
+ spinlock_t ipath_kernel_tid_lock;
+ spinlock_t ipath_user_tid_lock;
+ spinlock_t ipath_sendctrl_lock;
+ /* around ipath_pd and (user ports) port_cnt use (intr vs free) */
+ spinlock_t ipath_uctxt_lock;
+
+ /*
+ * IPATH_STATUS_*,
+ * this address is mapped readonly into user processes so they can
+ * get status cheaply, whenever they want.
+ */
+ u64 *ipath_statusp;
+ /* freeze msg if hw error put chip in freeze */
+ char *ipath_freezemsg;
+ /* pci access data structure */
+ struct pci_dev *pcidev;
+ struct cdev *user_cdev;
+ struct cdev *diag_cdev;
+ struct device *user_dev;
+ struct device *diag_dev;
+ /* timer used to prevent stats overflow, error throttling, etc. */
+ struct timer_list ipath_stats_timer;
+ /* timer to verify interrupts work, and fallback if possible */
+ struct timer_list ipath_intrchk_timer;
+ void *ipath_dummy_hdrq; /* used after port close */
+ dma_addr_t ipath_dummy_hdrq_phys;
+
+ /* SendDMA related entries */
+ spinlock_t ipath_sdma_lock;
+ unsigned long ipath_sdma_status;
+ unsigned long ipath_sdma_abort_jiffies;
+ unsigned long ipath_sdma_abort_intr_timeout;
+ unsigned long ipath_sdma_buf_jiffies;
+ struct ipath_sdma_desc *ipath_sdma_descq;
+ u64 ipath_sdma_descq_added;
+ u64 ipath_sdma_descq_removed;
+ int ipath_sdma_desc_nreserved;
+ u16 ipath_sdma_descq_cnt;
+ u16 ipath_sdma_descq_tail;
+ u16 ipath_sdma_descq_head;
+ u16 ipath_sdma_next_intr;
+ u16 ipath_sdma_reset_wait;
+ u8 ipath_sdma_generation;
+ struct tasklet_struct ipath_sdma_abort_task;
+ struct tasklet_struct ipath_sdma_notify_task;
+ struct list_head ipath_sdma_activelist;
+ struct list_head ipath_sdma_notifylist;
+ atomic_t ipath_sdma_vl15_count;
+ struct timer_list ipath_sdma_vl15_timer;
+
+ dma_addr_t ipath_sdma_descq_phys;
+ volatile __le64 *ipath_sdma_head_dma;
+ dma_addr_t ipath_sdma_head_phys;
+
+ unsigned long ipath_ureg_align; /* user register alignment */
+
+ struct delayed_work ipath_autoneg_work;
+ wait_queue_head_t ipath_autoneg_wait;
+
+ /* HoL blocking / user app forward-progress state */
+ unsigned ipath_hol_state;
+ unsigned ipath_hol_next;
+ struct timer_list ipath_hol_timer;
+
+ /*
+ * Shadow copies of registers; size indicates read access size.
+ * Most of them are readonly, but some are write-only registers,
+ * where we manipulate the bits in the shadow copy, and then write
+ * the shadow copy to infinipath.
+ *
+ * We deliberately make most of these 32 bits, since they have
+ * restricted range. For any that we read, we want to generate 32
+ * bit accesses, since Opteron will generate 2 separate 32 bit HT
+ * transactions for a 64 bit read, and we want to avoid unnecessary
+ * HT transactions.
+ */
+
+ /* This is the 64 bit group */
+
+ /*
+ * shadow of pioavail, check to be sure it's large enough at
+ * init time.
+ */
+ unsigned long ipath_pioavailshadow[8];
+ /* bitmap of send buffers available for the kernel to use with PIO. */
+ unsigned long ipath_pioavailkernel[8];
+ /* shadow of kr_gpio_out, for rmw ops */
+ u64 ipath_gpio_out;
+ /* shadow the gpio mask register */
+ u64 ipath_gpio_mask;
+ /* shadow the gpio output enable, etc... */
+ u64 ipath_extctrl;
+ /* kr_revision shadow */
+ u64 ipath_revision;
+ /*
+ * shadow of ibcctrl, for interrupt handling of link changes,
+ * etc.
+ */
+ u64 ipath_ibcctrl;
+ /*
+ * last ibcstatus, to suppress "duplicate" status change messages,
+ * mostly from 2 to 3
+ */
+ u64 ipath_lastibcstat;
+ /* hwerrmask shadow */
+ ipath_err_t ipath_hwerrmask;
+ ipath_err_t ipath_errormask; /* errormask shadow */
+ /* interrupt config reg shadow */
+ u64 ipath_intconfig;
+ /* kr_sendpiobufbase value */
+ u64 ipath_piobufbase;
+ /* kr_ibcddrctrl shadow */
+ u64 ipath_ibcddrctrl;
+
+ /* these are the "32 bit" regs */
+
+ /*
+ * number of GUIDs in the flash for this interface; may need some
+ * rethinking for setting on other ifaces
+ */
+ u32 ipath_nguid;
+ /*
+ * the following two are 32-bit bitmasks, but {test,clear,set}_bit
+ * all expect bit fields to be "unsigned long"
+ */
+ /* shadow kr_rcvctrl */
+ unsigned long ipath_rcvctrl;
+ /* shadow kr_sendctrl */
+ unsigned long ipath_sendctrl;
+ /* to not count armlaunch after cancel */
+ unsigned long ipath_lastcancel;
+ /* count cases where special trigger was needed (double write) */
+ unsigned long ipath_spectriggerhit;
+
+ /* value we put in kr_rcvhdrcnt */
+ u32 ipath_rcvhdrcnt;
+ /* value we put in kr_rcvhdrsize */
+ u32 ipath_rcvhdrsize;
+ /* value we put in kr_rcvhdrentsize */
+ u32 ipath_rcvhdrentsize;
+ /* offset of last entry in rcvhdrq */
+ u32 ipath_hdrqlast;
+ /* kr_portcnt value */
+ u32 ipath_portcnt;
+ /* kr_pagealign value */
+ u32 ipath_palign;
+ /* number of "2KB" PIO buffers */
+ u32 ipath_piobcnt2k;
+ /* size in bytes of "2KB" PIO buffers */
+ u32 ipath_piosize2k;
+ /* number of "4KB" PIO buffers */
+ u32 ipath_piobcnt4k;
+ /* size in bytes of "4KB" PIO buffers */
+ u32 ipath_piosize4k;
+ u32 ipath_pioreserved; /* pio buffers reserved for special in-kernel use */
+ /* kr_rcvegrbase value */
+ u32 ipath_rcvegrbase;
+ /* kr_rcvegrcnt value */
+ u32 ipath_rcvegrcnt;
+ /* kr_rcvtidbase value */
+ u32 ipath_rcvtidbase;
+ /* kr_rcvtidcnt value */
+ u32 ipath_rcvtidcnt;
+ /* kr_sendregbase */
+ u32 ipath_sregbase;
+ /* kr_userregbase */
+ u32 ipath_uregbase;
+ /* kr_counterregbase */
+ u32 ipath_cregbase;
+ /* shadow the control register contents */
+ u32 ipath_control;
+ /* PCI revision register (HTC rev on FPGA) */
+ u32 ipath_pcirev;
+
+ /* chip address space used by 4k pio buffers */
+ u32 ipath_4kalign;
+ /* The MTU programmed for this unit */
+ u32 ipath_ibmtu;
+ /*
+ * The max size IB packet, including IB headers, that we can send.
+ * Starts same as ipath_piosize, but is affected when ibmtu is
+ * changed, or by size of eager buffers
+ */
+ u32 ipath_ibmaxlen;
+ /*
+ * ibmaxlen at init time, limited by chip and by receive buffer
+ * size. Not changed after init.
+ */
+ u32 ipath_init_ibmaxlen;
+ /* size of each rcvegrbuffer */
+ u32 ipath_rcvegrbufsize;
+ /* localbus width (1, 2,4,8,16,32) from config space */
+ u32 ipath_lbus_width;
+ /* localbus speed (HT: 200,400,800,1000; PCIe 2500) */
+ u32 ipath_lbus_speed;
+ /*
+ * number of sequential ibcstatus changes for polling active/quiet
+ * (i.e., link not coming up).
+ */
+ u32 ipath_ibpollcnt;
+ /* low and high portions of MSI capability/vector */
+ u32 ipath_msi_lo;
+ /* saved after PCIe init for restore after reset */
+ u32 ipath_msi_hi;
+ /* MSI data (vector) saved for restore */
+ u16 ipath_msi_data;
+ /* MLID programmed for this instance */
+ u16 ipath_mlid;
+ /* LID programmed for this instance */
+ u16 ipath_lid;
+ /* list of pkeys programmed; 0 if not set */
+ u16 ipath_pkeys[4];
+ /*
+ * ASCII serial number, from flash; large enough for the original
+ * all-digit strings and the longer QLogic serial number format
+ */
+ u8 ipath_serial[16];
+ /* human readable board version */
+ u8 ipath_boardversion[96];
+ u8 ipath_lbus_info[32]; /* human readable localbus info */
+ /* chip major rev, from ipath_revision */
+ u8 ipath_majrev;
+ /* chip minor rev, from ipath_revision */
+ u8 ipath_minrev;
+ /* board rev, from ipath_revision */
+ u8 ipath_boardrev;
+ /* saved for restore after reset */
+ u8 ipath_pci_cacheline;
+ /* LID mask control */
+ u8 ipath_lmc;
+ /* link width supported */
+ u8 ipath_link_width_supported;
+ /* link speed supported */
+ u8 ipath_link_speed_supported;
+ u8 ipath_link_width_enabled;
+ u8 ipath_link_speed_enabled;
+ u8 ipath_link_width_active;
+ u8 ipath_link_speed_active;
+ /* Rx Polarity inversion (compensate for ~tx on partner) */
+ u8 ipath_rx_pol_inv;
+
+ u8 ipath_r_portenable_shift;
+ u8 ipath_r_intravail_shift;
+ u8 ipath_r_tailupd_shift;
+ u8 ipath_r_portcfg_shift;
+
+ /* unit # of this chip, if present */
+ int ipath_unit;
+
+ /* local link integrity counter */
+ u32 ipath_lli_counter;
+ /* local link integrity errors */
+ u32 ipath_lli_errors;
+ /*
+ * Above counts only cases where _successive_ LocalLinkIntegrity
+ * errors were seen in the receive headers of kern-packets.
+ * Below are the three (monotonically increasing) counters
+ * maintained via GPIO interrupts on iba6120-rev2.
+ */
+ u32 ipath_rxfc_unsupvl_errs;
+ u32 ipath_overrun_thresh_errs;
+ u32 ipath_lli_errs;
+
+ /*
+ * Not all devices managed by a driver instance are the same
+ * type, so these fields must be per-device.
+ */
+ u64 ipath_i_bitsextant;
+ ipath_err_t ipath_e_bitsextant;
+ ipath_err_t ipath_hwe_bitsextant;
+
+ /*
+ * Below should be computable from number of ports,
+ * since they are never modified.
+ */
+ u64 ipath_i_rcvavail_mask;
+ u64 ipath_i_rcvurg_mask;
+ u16 ipath_i_rcvurg_shift;
+ u16 ipath_i_rcvavail_shift;
+
+ /*
+ * Register bits for selecting i2c direction and values, used for
+ * I2C serial flash.
+ */
+ u8 ipath_gpio_sda_num;
+ u8 ipath_gpio_scl_num;
+ u8 ipath_i2c_chain_type;
+ u64 ipath_gpio_sda;
+ u64 ipath_gpio_scl;
+
+ /* lock for doing RMW of shadows/regs for ExtCtrl and GPIO */
+ spinlock_t ipath_gpio_lock;
+
+ /*
+ * IB link and linktraining states and masks that vary per chip in
+ * some way. Set at init, to avoid recomputing them on each IB status change interrupt.
+ */
+ u8 ibcs_ls_shift;
+ u8 ibcs_lts_mask;
+ u32 ibcs_mask;
+ u32 ib_init;
+ u32 ib_arm;
+ u32 ib_active;
+
+ u16 ipath_rhf_offset; /* offset of RHF within receive header entry */
+
+ /*
+ * shift/mask for linkcmd, linkinitcmd, maxpktlen in ibccontrol
+ * reg. Changes for IBA7220
+ */
+ u8 ibcc_lic_mask; /* LinkInitCmd */
+ u8 ibcc_lc_shift; /* LinkCmd */
+ u8 ibcc_mpl_shift; /* Maxpktlen */
+
+ u8 delay_mult;
+
+ /* used to override LED behavior */
+ u8 ipath_led_override; /* Substituted for normal value, if non-zero */
+ u16 ipath_led_override_timeoff; /* delta to next timer event */
+ u8 ipath_led_override_vals[2]; /* Alternates per blink-frame */
+ u8 ipath_led_override_phase; /* Just counts, LSB picks from vals[] */
+ atomic_t ipath_led_override_timer_active;
+ /* Used to flash LEDs in override mode */
+ struct timer_list ipath_led_override_timer;
+
+ /* Support (including locks) for EEPROM logging of errors and time */
+ /* control access to actual counters, timer */
+ spinlock_t ipath_eep_st_lock;
+ /* control high-level access to EEPROM */
+ struct mutex ipath_eep_lock;
+ /* Below inc'd by ipath_snap_cntrs(), locked by ipath_eep_st_lock */
+ uint64_t ipath_traffic_wds;
+ /* active time is kept in seconds, but logged in hours */
+ atomic_t ipath_active_time;
+ /* Below are nominal shadow of EEPROM, new since last EEPROM update */
+ uint8_t ipath_eep_st_errs[IPATH_EEP_LOG_CNT];
+ uint8_t ipath_eep_st_new_errs[IPATH_EEP_LOG_CNT];
+ uint16_t ipath_eep_hrs;
+ /*
+ * masks for which bits of errs, hwerrs that cause
+ * each of the counters to increment.
+ */
+ struct ipath_eep_log_mask ipath_eep_st_masks[IPATH_EEP_LOG_CNT];
+
+ /* interrupt mitigation reload register info */
+ u16 ipath_jint_idle_ticks; /* idle clock ticks */
+ u16 ipath_jint_max_packets; /* max packets across all ports */
+
+ /*
+ * lock for access to SerDes, and flags to sequence preset
+ * versus steady-state. 7220-only at the moment.
+ */
+ spinlock_t ipath_sdepb_lock;
+ u8 ipath_presets_needed; /* Set if presets to be restored next DOWN */
+};
+
+/* ipath_hol_state values (stopping/starting user proc, send flushing) */
+#define IPATH_HOL_UP 0
+#define IPATH_HOL_DOWN 1
+/* ipath_hol_next toggle values, used when hol_state is IPATH_HOL_DOWN */
+#define IPATH_HOL_DOWNSTOP 0
+#define IPATH_HOL_DOWNCONT 1
+
+/* bit positions for sdma_status */
+#define IPATH_SDMA_ABORTING 0
+#define IPATH_SDMA_DISARMED 1
+#define IPATH_SDMA_DISABLED 2
+#define IPATH_SDMA_LAYERBUF 3
+#define IPATH_SDMA_RUNNING 30
+#define IPATH_SDMA_SHUTDOWN 31
+
+/* bit combinations that correspond to abort states */
+#define IPATH_SDMA_ABORT_NONE 0
+#define IPATH_SDMA_ABORT_ABORTING (1UL << IPATH_SDMA_ABORTING)
+#define IPATH_SDMA_ABORT_DISARMED ((1UL << IPATH_SDMA_ABORTING) | \
+ (1UL << IPATH_SDMA_DISARMED))
+#define IPATH_SDMA_ABORT_DISABLED ((1UL << IPATH_SDMA_ABORTING) | \
+ (1UL << IPATH_SDMA_DISABLED))
+#define IPATH_SDMA_ABORT_ABORTED ((1UL << IPATH_SDMA_ABORTING) | \
+ (1UL << IPATH_SDMA_DISARMED) | (1UL << IPATH_SDMA_DISABLED))
+#define IPATH_SDMA_ABORT_MASK ((1UL<<IPATH_SDMA_ABORTING) | \
+ (1UL << IPATH_SDMA_DISARMED) | (1UL << IPATH_SDMA_DISABLED))
+
+#define IPATH_SDMA_BUF_NONE 0
+#define IPATH_SDMA_BUF_MASK (1UL<<IPATH_SDMA_LAYERBUF)
+
+/* Private data for file operations */
+struct ipath_filedata {
+ struct ipath_portdata *pd;
+ unsigned subport;
+ unsigned tidcursor;
+ struct ipath_user_sdma_queue *pq;
+};
+extern struct list_head ipath_dev_list;
+extern spinlock_t ipath_devs_lock;
+extern struct ipath_devdata *ipath_lookup(int unit);
+
+int ipath_init_chip(struct ipath_devdata *, int);
+int ipath_enable_wc(struct ipath_devdata *dd);
+void ipath_disable_wc(struct ipath_devdata *dd);
+int ipath_count_units(int *npresentp, int *nupp, int *maxportsp);
+void ipath_shutdown_device(struct ipath_devdata *);
+void ipath_clear_freeze(struct ipath_devdata *);
+
+struct file_operations;
+int ipath_cdev_init(int minor, char *name, const struct file_operations *fops,
+ struct cdev **cdevp, struct device **devp);
+void ipath_cdev_cleanup(struct cdev **cdevp,
+ struct device **devp);
+
+int ipath_diag_add(struct ipath_devdata *);
+void ipath_diag_remove(struct ipath_devdata *);
+
+extern wait_queue_head_t ipath_state_wait;
+
+int ipath_user_add(struct ipath_devdata *dd);
+void ipath_user_remove(struct ipath_devdata *dd);
+
+struct sk_buff *ipath_alloc_skb(struct ipath_devdata *dd, gfp_t);
+
+extern int ipath_diag_inuse;
+
+irqreturn_t ipath_intr(int irq, void *devid);
+int ipath_decode_err(struct ipath_devdata *dd, char *buf, size_t blen,
+ ipath_err_t err);
+#if __IPATH_INFO || __IPATH_DBG
+extern const char *ipath_ibcstatus_str[];
+#endif
+
+/* clean up any per-chip chip-specific stuff */
+void ipath_chip_cleanup(struct ipath_devdata *);
+/* clean up any chip type-specific stuff */
+void ipath_chip_done(void);
+
+void ipath_disarm_piobufs(struct ipath_devdata *, unsigned first,
+ unsigned cnt);
+void ipath_cancel_sends(struct ipath_devdata *, int);
+
+int ipath_create_rcvhdrq(struct ipath_devdata *, struct ipath_portdata *);
+void ipath_free_pddata(struct ipath_devdata *, struct ipath_portdata *);
+
+int ipath_parse_ushort(const char *str, unsigned short *valp);
+
+void ipath_kreceive(struct ipath_portdata *);
+int ipath_setrcvhdrsize(struct ipath_devdata *, unsigned);
+int ipath_reset_device(int);
+void ipath_get_faststats(unsigned long);
+int ipath_wait_linkstate(struct ipath_devdata *, u32, int);
+int ipath_set_linkstate(struct ipath_devdata *, u8);
+int ipath_set_mtu(struct ipath_devdata *, u16);
+int ipath_set_lid(struct ipath_devdata *, u32, u8);
+int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv);
+void ipath_enable_armlaunch(struct ipath_devdata *);
+void ipath_disable_armlaunch(struct ipath_devdata *);
+void ipath_hol_down(struct ipath_devdata *);
+void ipath_hol_up(struct ipath_devdata *);
+void ipath_hol_event(unsigned long);
+void ipath_toggle_rclkrls(struct ipath_devdata *);
+void ipath_sd7220_clr_ibpar(struct ipath_devdata *);
+void ipath_set_relock_poll(struct ipath_devdata *, int);
+void ipath_shutdown_relock_poll(struct ipath_devdata *);
+
+/* for use in system calls, where we want to know device type, etc. */
+#define port_fp(fp) ((struct ipath_filedata *)(fp)->private_data)->pd
+#define subport_fp(fp) \
+ ((struct ipath_filedata *)(fp)->private_data)->subport
+#define tidcursor_fp(fp) \
+ ((struct ipath_filedata *)(fp)->private_data)->tidcursor
+#define user_sdma_queue_fp(fp) \
+ ((struct ipath_filedata *)(fp)->private_data)->pq
+
+/*
+ * values for ipath_flags
+ */
+ /* chip can report link latency (IB 1.2) */
+#define IPATH_HAS_LINK_LATENCY 0x1
+ /* The chip is up and initted */
+#define IPATH_INITTED 0x2
+ /* set if any user code has set kr_rcvhdrsize */
+#define IPATH_RCVHDRSZ_SET 0x4
+ /* The chip is present and valid for accesses */
+#define IPATH_PRESENT 0x8
+ /* HT link0 is only 8 bits wide, ignore upper byte crc
+ * errors, etc. */
+#define IPATH_8BIT_IN_HT0 0x10
+ /* HT link1 is only 8 bits wide, ignore upper byte crc
+ * errors, etc. */
+#define IPATH_8BIT_IN_HT1 0x20
+ /* The link is down */
+#define IPATH_LINKDOWN 0x40
+ /* The link level is up (0x11) */
+#define IPATH_LINKINIT 0x80
+ /* The link is in the armed (0x21) state */
+#define IPATH_LINKARMED 0x100
+ /* The link is in the active (0x31) state */
+#define IPATH_LINKACTIVE 0x200
+ /* link current state is unknown */
+#define IPATH_LINKUNK 0x400
+ /* Write combining flush needed for PIO */
+#define IPATH_PIO_FLUSH_WC 0x1000
+ /* no DMA'ed receive tail pointer; derive tail from header sequence */
+#define IPATH_NODMA_RTAIL 0x2000
+ /* no IB cable, or no device on IB cable */
+#define IPATH_NOCABLE 0x4000
+ /* Supports port zero per packet receive interrupts via
+ * GPIO */
+#define IPATH_GPIO_INTR 0x8000
+ /* uses the coded 4byte TID, not 8 byte */
+#define IPATH_4BYTE_TID 0x10000
+ /* packet/word counters are 32 bit, else those 4 counters
+ * are 64bit */
+#define IPATH_32BITCOUNTERS 0x20000
+ /* Interrupt register is 64 bits */
+#define IPATH_INTREG_64 0x40000
+ /* can miss port0 rx interrupts */
+#define IPATH_DISABLED 0x80000 /* administratively disabled */
+ /* Use GPIO interrupts for new counters */
+#define IPATH_GPIO_ERRINTRS 0x100000
+#define IPATH_SWAP_PIOBUFS 0x200000
+ /* Supports Send DMA */
+#define IPATH_HAS_SEND_DMA 0x400000
+ /* Supports Send Count (not just word count) in PBC */
+#define IPATH_HAS_PBC_CNT 0x800000
+ /* Suppress heartbeat, even if turning off loopback */
+#define IPATH_NO_HRTBT 0x1000000
+#define IPATH_HAS_THRESH_UPDATE 0x4000000
+#define IPATH_HAS_MULT_IB_SPEED 0x8000000
+#define IPATH_IB_AUTONEG_INPROG 0x10000000
+#define IPATH_IB_AUTONEG_FAILED 0x20000000
+ /* Linkdown-disable intentionally, Do not attempt to bring up */
+#define IPATH_IB_LINK_DISABLED 0x40000000
+#define IPATH_IB_FORCE_NOTIFY 0x80000000 /* force notify on next ib change */
+
+/* Bits in GPIO for the added interrupts */
+#define IPATH_GPIO_PORT0_BIT 2
+#define IPATH_GPIO_RXUVL_BIT 3
+#define IPATH_GPIO_OVRUN_BIT 4
+#define IPATH_GPIO_LLI_BIT 5
+#define IPATH_GPIO_ERRINTR_MASK 0x38
+
+/* portdata flag bit offsets */
+ /* waiting for a packet to arrive */
+#define IPATH_PORT_WAITING_RCV 2
+ /* master has not finished initializing */
+#define IPATH_PORT_MASTER_UNINIT 4
+ /* waiting for an urgent packet to arrive */
+#define IPATH_PORT_WAITING_URG 5
+
+/* free up any allocated data at close */
+void ipath_free_data(struct ipath_portdata *dd);
+u32 __iomem *ipath_getpiobuf(struct ipath_devdata *, u32, u32 *);
+void ipath_chg_pioavailkernel(struct ipath_devdata *dd, unsigned start,
+ unsigned len, int avail);
+void ipath_init_iba6110_funcs(struct ipath_devdata *);
+void ipath_get_eeprom_info(struct ipath_devdata *);
+int ipath_update_eeprom_log(struct ipath_devdata *dd);
+void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr);
+u64 ipath_snap_cntr(struct ipath_devdata *, ipath_creg);
+void ipath_disarm_senderrbufs(struct ipath_devdata *);
+void ipath_force_pio_avail_update(struct ipath_devdata *);
+void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev);
+
+/*
+ * Set LED override. Only the two LSBs have "public" meaning, but
+ * any non-zero value substitutes them for the Link and LinkTrain
+ * LED states.
+ */
+#define IPATH_LED_PHYS 1 /* Physical (linktraining) GREEN LED */
+#define IPATH_LED_LOG 2 /* Logical (link) YELLOW LED */
+void ipath_set_led_override(struct ipath_devdata *dd, unsigned int val);
+
+/* send dma routines */
+int setup_sdma(struct ipath_devdata *);
+void teardown_sdma(struct ipath_devdata *);
+void ipath_restart_sdma(struct ipath_devdata *);
+void ipath_sdma_intr(struct ipath_devdata *);
+int ipath_sdma_verbs_send(struct ipath_devdata *, struct ipath_sge_state *,
+ u32, struct ipath_verbs_txreq *);
+/* ipath_sdma_lock should be locked before calling this. */
+int ipath_sdma_make_progress(struct ipath_devdata *dd);
+
+/* must be called under ipath_sdma_lock */
+static inline u16 ipath_sdma_descq_freecnt(const struct ipath_devdata *dd)
+{
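+	/*
+	 * in-use = added - removed; free = total - in-use, minus one
+	 * descriptor always held back and minus any reserved descriptors
+	 */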
+ return dd->ipath_sdma_descq_cnt -
+ (dd->ipath_sdma_descq_added - dd->ipath_sdma_descq_removed) -
+ 1 - dd->ipath_sdma_desc_nreserved;
+}
+
+static inline void ipath_sdma_desc_reserve(struct ipath_devdata *dd, u16 cnt)
+{
+ dd->ipath_sdma_desc_nreserved += cnt;
+}
+
+static inline void ipath_sdma_desc_unreserve(struct ipath_devdata *dd, u16 cnt)
+{
+ dd->ipath_sdma_desc_nreserved -= cnt;
+}
+
+/*
+ * number of words used for protocol header if not set by ipath_userinit();
+ */
+#define IPATH_DFLT_RCVHDRSIZE 9
+
+int ipath_get_user_pages(unsigned long, size_t, struct page **);
+void ipath_release_user_pages(struct page **, size_t);
+void ipath_release_user_pages_on_close(struct page **, size_t);
+int ipath_eeprom_read(struct ipath_devdata *, u8, void *, int);
+int ipath_eeprom_write(struct ipath_devdata *, u8, const void *, int);
+int ipath_tempsense_read(struct ipath_devdata *, u8 regnum);
+int ipath_tempsense_write(struct ipath_devdata *, u8 regnum, u8 data);
+
+/* these are used for the registers that vary with port */
+void ipath_write_kreg_port(const struct ipath_devdata *, ipath_kreg,
+ unsigned, u64);
+
+/*
+ * We could have a single register get/put routine, that takes a group type,
+ * but this is somewhat clearer and cleaner. It also gives us some error
+ * checking. 64 bit register reads should always work, but are inefficient
+ * on opteron (the northbridge always generates 2 separate HT 32 bit reads),
+ * so we use kreg32 wherever possible. User register and counter register
+ * reads are always 32 bit reads, so only one form of those routines.
+ */
+
+/*
+ * At the moment, none of the s-registers are writable, so no
+ * ipath_write_sreg().
+ */
+
+/**
+ * ipath_read_ureg32 - read 32-bit virtualized per-port register
+ * @dd: device
+ * @regno: register number
+ * @port: port number
+ *
+ * Return the contents of a register that is virtualized to be per port.
+ * Returns 0 if the chip is not present or not mapped (not
+ * distinguishable from valid contents at runtime; we may add a
+ * separate error variable at some point).
+ */
+static inline u32 ipath_read_ureg32(const struct ipath_devdata *dd,
+ ipath_ureg regno, int port)
+{
+ if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
+ return 0;
+
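+	/*
+	 * user registers start at ipath_uregbase within the mapped chip
+	 * space, one ipath_ureg_align-sized block per port
+	 */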
+ return readl(regno + (u64 __iomem *)
+ (dd->ipath_uregbase +
+ (char __iomem *)dd->ipath_kregbase +
+ dd->ipath_ureg_align * port));
+}
+
+/**
+ * ipath_write_ureg - write 32-bit virtualized per-port register
+ * @dd: device
+ * @regno: register number
+ * @value: value
+ * @port: port
+ *
+ * Write the contents of a register that is virtualized to be per port.
+ */
+static inline void ipath_write_ureg(const struct ipath_devdata *dd,
+ ipath_ureg regno, u64 value, int port)
+{
+ u64 __iomem *ubase = (u64 __iomem *)
+ (dd->ipath_uregbase + (char __iomem *) dd->ipath_kregbase +
+ dd->ipath_ureg_align * port);
+ if (dd->ipath_kregbase)
+ writeq(value, &ubase[regno]);
+}
+
+static inline u32 ipath_read_kreg32(const struct ipath_devdata *dd,
+ ipath_kreg regno)
+{
+ if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
+ return -1;
+ return readl((u32 __iomem *) & dd->ipath_kregbase[regno]);
+}
+
+static inline u64 ipath_read_kreg64(const struct ipath_devdata *dd,
+ ipath_kreg regno)
+{
+ if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
+ return -1;
+
+ return readq(&dd->ipath_kregbase[regno]);
+}
+
+static inline void ipath_write_kreg(const struct ipath_devdata *dd,
+ ipath_kreg regno, u64 value)
+{
+ if (dd->ipath_kregbase)
+ writeq(value, &dd->ipath_kregbase[regno]);
+}
+
+static inline u64 ipath_read_creg(const struct ipath_devdata *dd,
+ ipath_sreg regno)
+{
+ if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
+ return 0;
+
+ return readq(regno + (u64 __iomem *)
+ (dd->ipath_cregbase +
+ (char __iomem *)dd->ipath_kregbase));
+}
+
+static inline u32 ipath_read_creg32(const struct ipath_devdata *dd,
+ ipath_sreg regno)
+{
+ if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_PRESENT))
+ return 0;
+ return readl(regno + (u64 __iomem *)
+ (dd->ipath_cregbase +
+ (char __iomem *)dd->ipath_kregbase));
+}
+
+static inline void ipath_write_creg(const struct ipath_devdata *dd,
+ ipath_creg regno, u64 value)
+{
+ if (dd->ipath_kregbase)
+ writeq(value, regno + (u64 __iomem *)
+ (dd->ipath_cregbase +
+ (char __iomem *)dd->ipath_kregbase));
+}
+
+static inline void ipath_clear_rcvhdrtail(const struct ipath_portdata *pd)
+{
+ *((u64 *) pd->port_rcvhdrtail_kvaddr) = 0ULL;
+}
+
+static inline u32 ipath_get_rcvhdrtail(const struct ipath_portdata *pd)
+{
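+	/* the tail is updated by the chip (DMA'ed) in little-endian format */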
+ return (u32) le64_to_cpu(*((volatile __le64 *)
+ pd->port_rcvhdrtail_kvaddr));
+}
+
+static inline u32 ipath_get_hdrqtail(const struct ipath_portdata *pd)
+{
+ const struct ipath_devdata *dd = pd->port_dd;
+ u32 hdrqtail;
+
+ if (dd->ipath_flags & IPATH_NODMA_RTAIL) {
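+		/*
+		 * No DMA'ed tail: look at the RHF sequence number at the
+		 * current head; if it matches the expected per-port sequence
+		 * count, a new packet is available past the current head.
+		 */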
+ __le32 *rhf_addr;
+ u32 seq;
+
+ rhf_addr = (__le32 *) pd->port_rcvhdrq +
+ pd->port_head + dd->ipath_rhf_offset;
+ seq = ipath_hdrget_seq(rhf_addr);
+ hdrqtail = pd->port_head;
+ if (seq == pd->port_seq_cnt)
+ hdrqtail++;
+ } else
+ hdrqtail = ipath_get_rcvhdrtail(pd);
+
+ return hdrqtail;
+}
+
+static inline u64 ipath_read_ireg(const struct ipath_devdata *dd, ipath_kreg r)
+{
+ return (dd->ipath_flags & IPATH_INTREG_64) ?
+ ipath_read_kreg64(dd, r) : ipath_read_kreg32(dd, r);
+}
+
+/*
+ * From the contents of IBCStatus (or a saved copy), return the linkstate.
+ * Report ACTIVE_DEFER as ACTIVE, because we treat them the same
+ * everywhere anyway (and they should be, for almost all purposes).
+ */
+static inline u32 ipath_ib_linkstate(struct ipath_devdata *dd, u64 ibcs)
+{
+ u32 state = (u32)(ibcs >> dd->ibcs_ls_shift) &
+ INFINIPATH_IBCS_LINKSTATE_MASK;
+ if (state == INFINIPATH_IBCS_L_STATE_ACT_DEFER)
+ state = INFINIPATH_IBCS_L_STATE_ACTIVE;
+ return state;
+}
+
+/* from contents of IBCStatus (or a saved copy), return linktrainingstate */
+static inline u32 ipath_ib_linktrstate(struct ipath_devdata *dd, u64 ibcs)
+{
+ return (u32)(ibcs >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) &
+ dd->ibcs_lts_mask;
+}
+
+/*
+ * From the contents of IBCStatus (or a saved copy), return the logical
+ * link state: a combination of link state and linktraining state (down,
+ * active, init, arm, etc.)
+ */
+static inline u32 ipath_ib_state(struct ipath_devdata *dd, u64 ibcs)
+{
+ u32 ibs;
+ ibs = (u32)(ibcs >> INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) &
+ dd->ibcs_lts_mask;
+ ibs |= (u32)(ibcs &
+ (INFINIPATH_IBCS_LINKSTATE_MASK << dd->ibcs_ls_shift));
+ return ibs;
+}
+
+/*
+ * sysfs interface.
+ */
+
+struct device_driver;
+
+extern const char ib_ipath_version[];
+
+extern const struct attribute_group *ipath_driver_attr_groups[];
+
+int ipath_device_create_group(struct device *, struct ipath_devdata *);
+void ipath_device_remove_group(struct device *, struct ipath_devdata *);
+int ipath_expose_reset(struct device *);
+
+int ipath_init_ipathfs(void);
+void ipath_exit_ipathfs(void);
+int ipathfs_add_device(struct ipath_devdata *);
+int ipathfs_remove_device(struct ipath_devdata *);
+
+/*
+ * dma_addr wrappers - all 0's invalid for hw
+ */
+dma_addr_t ipath_map_page(struct pci_dev *, struct page *, unsigned long,
+ size_t, int);
+dma_addr_t ipath_map_single(struct pci_dev *, void *, size_t, int);
+const char *ipath_get_unit_name(int unit);
+
+/*
+ * Flush write combining store buffers (if present) and perform a write
+ * barrier.
+ */
+#if defined(CONFIG_X86_64)
+#define ipath_flush_wc() asm volatile("sfence" ::: "memory")
+#else
+#define ipath_flush_wc() wmb()
+#endif
+
+extern unsigned ipath_debug; /* debugging bit mask */
+extern unsigned ipath_linkrecovery;
+extern unsigned ipath_mtu4096;
+extern struct mutex ipath_mutex;
+
+#define IPATH_DRV_NAME "ib_ipath"
+#define IPATH_MAJOR 233
+#define IPATH_USER_MINOR_BASE 0
+#define IPATH_DIAGPKT_MINOR 127
+#define IPATH_DIAG_MINOR_BASE 129
+#define IPATH_NMINORS 255
+
+#define ipath_dev_err(dd,fmt,...) \
+ do { \
+ const struct ipath_devdata *__dd = (dd); \
+ if (__dd->pcidev) \
+ dev_err(&__dd->pcidev->dev, "%s: " fmt, \
+ ipath_get_unit_name(__dd->ipath_unit), \
+ ##__VA_ARGS__); \
+ else \
+ printk(KERN_ERR IPATH_DRV_NAME ": %s: " fmt, \
+ ipath_get_unit_name(__dd->ipath_unit), \
+ ##__VA_ARGS__); \
+ } while (0)
+
+#if _IPATH_DEBUGGING
+
+# define __IPATH_DBG_WHICH(which,fmt,...) \
+ do { \
+ if (unlikely(ipath_debug & (which))) \
+ printk(KERN_DEBUG IPATH_DRV_NAME ": %s: " fmt, \
+ __func__,##__VA_ARGS__); \
+ } while(0)
+
+# define ipath_dbg(fmt,...) \
+ __IPATH_DBG_WHICH(__IPATH_DBG,fmt,##__VA_ARGS__)
+# define ipath_cdbg(which,fmt,...) \
+ __IPATH_DBG_WHICH(__IPATH_##which##DBG,fmt,##__VA_ARGS__)
+
+#else /* ! _IPATH_DEBUGGING */
+
+# define ipath_dbg(fmt,...)
+# define ipath_cdbg(which,fmt,...)
+
+#endif /* _IPATH_DEBUGGING */
+
+/*
+ * this is used for formatting hw error messages...
+ */
+struct ipath_hwerror_msgs {
+ u64 mask;
+ const char *msg;
+};
+
+#define INFINIPATH_HWE_MSG(a, b) { .mask = INFINIPATH_HWE_##a, .msg = b }
+
+/* in ipath_intr.c... */
+void ipath_format_hwerrors(u64 hwerrs,
+ const struct ipath_hwerror_msgs *hwerrmsgs,
+ size_t nhwerrmsgs,
+ char *msg, size_t lmsg);
+
+#endif /* _IPATH_KERNEL_H */
diff --git a/drivers/infiniband/hw/ipath/ipath_keys.c b/drivers/infiniband/hw/ipath/ipath_keys.c
new file mode 100644
index 000000000..c0e933fec
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_keys.c
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <asm/io.h>
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+/**
+ * ipath_alloc_lkey - allocate an lkey
+ * @rkt: lkey table in which to allocate the lkey
+ * @mr: memory region that this lkey protects
+ *
+ * Returns 1 if successful, otherwise returns 0.
+ */
+
+int ipath_alloc_lkey(struct ipath_lkey_table *rkt, struct ipath_mregion *mr)
+{
+ unsigned long flags;
+ u32 r;
+ u32 n;
+ int ret;
+
+ spin_lock_irqsave(&rkt->lock, flags);
+
+ /* Find the next available LKEY */
+ r = n = rkt->next;
+ for (;;) {
+ if (rkt->table[r] == NULL)
+ break;
+ r = (r + 1) & (rkt->max - 1);
+ if (r == n) {
+ spin_unlock_irqrestore(&rkt->lock, flags);
+ ipath_dbg("LKEY table full\n");
+ ret = 0;
+ goto bail;
+ }
+ }
+ rkt->next = (r + 1) & (rkt->max - 1);
+ /*
+ * Make sure lkey is never zero, which is reserved to indicate an
+ * unrestricted LKEY.
+ */
+ rkt->gen++;
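+	/*
+	 * lkey layout: table index in the top ib_ipath_lkey_table_size
+	 * bits, generation counter in the bits down to bit 8, and the
+	 * low 8 bits left zero
+	 */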
+ mr->lkey = (r << (32 - ib_ipath_lkey_table_size)) |
+ ((((1 << (24 - ib_ipath_lkey_table_size)) - 1) & rkt->gen)
+ << 8);
+ if (mr->lkey == 0) {
+ mr->lkey |= 1 << 8;
+ rkt->gen++;
+ }
+ rkt->table[r] = mr;
+ spin_unlock_irqrestore(&rkt->lock, flags);
+
+ ret = 1;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_free_lkey - free an lkey
+ * @rkt: table from which to free the lkey
+ * @lkey: lkey id to free
+ */
+void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey)
+{
+ unsigned long flags;
+ u32 r;
+
+ if (lkey == 0)
+ return;
+ r = lkey >> (32 - ib_ipath_lkey_table_size);
+ spin_lock_irqsave(&rkt->lock, flags);
+ rkt->table[r] = NULL;
+ spin_unlock_irqrestore(&rkt->lock, flags);
+}
+
+/**
+ * ipath_lkey_ok - check IB SGE for validity and initialize
+ * @rkt: table containing lkey to check SGE against
+ * @isge: outgoing internal SGE
+ * @sge: SGE to check
+ * @acc: access flags
+ *
+ * Return 1 if valid and successful, otherwise returns 0.
+ *
+ * Check the IB SGE for validity and initialize our internal version
+ * of it.
+ */
+int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge,
+ struct ib_sge *sge, int acc)
+{
+ struct ipath_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table;
+ struct ipath_mregion *mr;
+ unsigned n, m;
+ size_t off;
+ int ret;
+
+ /*
+ * We use LKEY == zero for kernel virtual addresses
+ * (see ipath_get_dma_mr and ipath_dma.c).
+ */
+ if (sge->lkey == 0) {
+ /* always a kernel port, no locking needed */
+ struct ipath_pd *pd = to_ipd(qp->ibqp.pd);
+
+ if (pd->user) {
+ ret = 0;
+ goto bail;
+ }
+ isge->mr = NULL;
+ isge->vaddr = (void *) sge->addr;
+ isge->length = sge->length;
+ isge->sge_length = sge->length;
+ ret = 1;
+ goto bail;
+ }
+ mr = rkt->table[(sge->lkey >> (32 - ib_ipath_lkey_table_size))];
+ if (unlikely(mr == NULL || mr->lkey != sge->lkey ||
+ qp->ibqp.pd != mr->pd)) {
+ ret = 0;
+ goto bail;
+ }
+
+ off = sge->addr - mr->user_base;
+ if (unlikely(sge->addr < mr->user_base ||
+ off + sge->length > mr->length ||
+ (mr->access_flags & acc) != acc)) {
+ ret = 0;
+ goto bail;
+ }
+
+ off += mr->offset;
+ m = 0;
+ n = 0;
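+	/*
+	 * walk the two-level map/segment arrays (IPATH_SEGSZ segments per
+	 * map) to find the segment containing the requested offset
+	 */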
+ while (off >= mr->map[m]->segs[n].length) {
+ off -= mr->map[m]->segs[n].length;
+ n++;
+ if (n >= IPATH_SEGSZ) {
+ m++;
+ n = 0;
+ }
+ }
+ isge->mr = mr;
+ isge->vaddr = mr->map[m]->segs[n].vaddr + off;
+ isge->length = mr->map[m]->segs[n].length - off;
+ isge->sge_length = sge->length;
+ isge->m = m;
+ isge->n = n;
+
+ ret = 1;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_rkey_ok - check the IB virtual address, length, and RKEY
+ * @dev: infiniband device
+ * @ss: SGE state
+ * @len: length of data
+ * @vaddr: virtual address to place data
+ * @rkey: rkey to check
+ * @acc: access flags
+ *
+ * Return 1 if successful, otherwise 0.
+ */
+int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss,
+ u32 len, u64 vaddr, u32 rkey, int acc)
+{
+ struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+ struct ipath_lkey_table *rkt = &dev->lk_table;
+ struct ipath_sge *sge = &ss->sge;
+ struct ipath_mregion *mr;
+ unsigned n, m;
+ size_t off;
+ int ret;
+
+ /*
+ * We use RKEY == zero for kernel virtual addresses
+ * (see ipath_get_dma_mr and ipath_dma.c).
+ */
+ if (rkey == 0) {
+ /* always a kernel port, no locking needed */
+ struct ipath_pd *pd = to_ipd(qp->ibqp.pd);
+
+ if (pd->user) {
+ ret = 0;
+ goto bail;
+ }
+ sge->mr = NULL;
+ sge->vaddr = (void *) vaddr;
+ sge->length = len;
+ sge->sge_length = len;
+ ss->sg_list = NULL;
+ ss->num_sge = 1;
+ ret = 1;
+ goto bail;
+ }
+
+ mr = rkt->table[(rkey >> (32 - ib_ipath_lkey_table_size))];
+ if (unlikely(mr == NULL || mr->lkey != rkey ||
+ qp->ibqp.pd != mr->pd)) {
+ ret = 0;
+ goto bail;
+ }
+
+ off = vaddr - mr->iova;
+ if (unlikely(vaddr < mr->iova || off + len > mr->length ||
+ (mr->access_flags & acc) == 0)) {
+ ret = 0;
+ goto bail;
+ }
+
+ off += mr->offset;
+ m = 0;
+ n = 0;
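+	/* same two-level map/segment walk as in ipath_lkey_ok() above */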
+ while (off >= mr->map[m]->segs[n].length) {
+ off -= mr->map[m]->segs[n].length;
+ n++;
+ if (n >= IPATH_SEGSZ) {
+ m++;
+ n = 0;
+ }
+ }
+ sge->mr = mr;
+ sge->vaddr = mr->map[m]->segs[n].vaddr + off;
+ sge->length = mr->map[m]->segs[n].length - off;
+ sge->sge_length = len;
+ sge->m = m;
+ sge->n = n;
+ ss->sg_list = NULL;
+ ss->num_sge = 1;
+
+ ret = 1;
+
+bail:
+ return ret;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_mad.c b/drivers/infiniband/hw/ipath/ipath_mad.c
new file mode 100644
index 000000000..e890e5ba0
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_mad.c
@@ -0,0 +1,1513 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_smi.h>
+#include <rdma/ib_pma.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+#include "ipath_common.h"
+
+#define IB_SMP_UNSUP_VERSION cpu_to_be16(0x0004)
+#define IB_SMP_UNSUP_METHOD cpu_to_be16(0x0008)
+#define IB_SMP_UNSUP_METH_ATTR cpu_to_be16(0x000C)
+#define IB_SMP_INVALID_FIELD cpu_to_be16(0x001C)
+
+static int reply(struct ib_smp *smp)
+{
+ /*
+ * The verbs framework will handle the directed/LID route
+ * packet changes.
+ */
+ smp->method = IB_MGMT_METHOD_GET_RESP;
+ if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ smp->status |= IB_SMP_DIRECTION;
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+static int recv_subn_get_nodedescription(struct ib_smp *smp,
+ struct ib_device *ibdev)
+{
+ if (smp->attr_mod)
+ smp->status |= IB_SMP_INVALID_FIELD;
+
+ memcpy(smp->data, ibdev->node_desc, sizeof(smp->data));
+
+ return reply(smp);
+}
+
+struct nodeinfo {
+ u8 base_version;
+ u8 class_version;
+ u8 node_type;
+ u8 num_ports;
+ __be64 sys_guid;
+ __be64 node_guid;
+ __be64 port_guid;
+ __be16 partition_cap;
+ __be16 device_id;
+ __be32 revision;
+ u8 local_port_num;
+ u8 vendor_id[3];
+} __attribute__ ((packed));
+
+static int recv_subn_get_nodeinfo(struct ib_smp *smp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct nodeinfo *nip = (struct nodeinfo *)&smp->data;
+ struct ipath_devdata *dd = to_idev(ibdev)->dd;
+ u32 vendor, majrev, minrev;
+
+ /* GUID 0 is illegal */
+ if (smp->attr_mod || (dd->ipath_guid == 0))
+ smp->status |= IB_SMP_INVALID_FIELD;
+
+ nip->base_version = 1;
+ nip->class_version = 1;
+ nip->node_type = 1; /* channel adapter */
+ /*
+ * XXX The num_ports value will need a layer function to get
+ * the value if we ever have more than one IB port on a chip.
+ * We will also need to get the GUID for the port.
+ */
+ nip->num_ports = ibdev->phys_port_cnt;
+ /* This is already in network order */
+ nip->sys_guid = to_idev(ibdev)->sys_image_guid;
+ nip->node_guid = dd->ipath_guid;
+ nip->port_guid = dd->ipath_guid;
+ nip->partition_cap = cpu_to_be16(ipath_get_npkeys(dd));
+ nip->device_id = cpu_to_be16(dd->ipath_deviceid);
+ majrev = dd->ipath_majrev;
+ minrev = dd->ipath_minrev;
+ nip->revision = cpu_to_be32((majrev << 16) | minrev);
+ nip->local_port_num = port;
+ vendor = dd->ipath_vendorid;
+ nip->vendor_id[0] = IPATH_SRC_OUI_1;
+ nip->vendor_id[1] = IPATH_SRC_OUI_2;
+ nip->vendor_id[2] = IPATH_SRC_OUI_3;
+
+ return reply(smp);
+}
+
+static int recv_subn_get_guidinfo(struct ib_smp *smp,
+ struct ib_device *ibdev)
+{
+ u32 startgx = 8 * be32_to_cpu(smp->attr_mod);
+ __be64 *p = (__be64 *) smp->data;
+
+ /* 32 blocks of 8 64-bit GUIDs per block */
+
+ memset(smp->data, 0, sizeof(smp->data));
+
+ /*
+ * We only support one GUID for now. If this changes, the
+ * portinfo.guid_cap field needs to be updated too.
+ */
+ if (startgx == 0) {
+ __be64 g = to_idev(ibdev)->dd->ipath_guid;
+ if (g == 0)
+ /* GUID 0 is illegal */
+ smp->status |= IB_SMP_INVALID_FIELD;
+ else
+ /* The first is a copy of the read-only HW GUID. */
+ *p = g;
+ } else
+ smp->status |= IB_SMP_INVALID_FIELD;
+
+ return reply(smp);
+}
+
+static void set_link_width_enabled(struct ipath_devdata *dd, u32 w)
+{
+ (void) dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB, w);
+}
+
+static void set_link_speed_enabled(struct ipath_devdata *dd, u32 s)
+{
+ (void) dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB, s);
+}
+
+static int get_overrunthreshold(struct ipath_devdata *dd)
+{
+ return (dd->ipath_ibcctrl >>
+ INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT) &
+ INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK;
+}
+
+/**
+ * set_overrunthreshold - set the overrun threshold
+ * @dd: the infinipath device
+ * @n: the new threshold
+ *
+ * Note that this will only take effect when the link state changes.
+ */
+static int set_overrunthreshold(struct ipath_devdata *dd, unsigned n)
+{
+ unsigned v;
+
+ v = (dd->ipath_ibcctrl >> INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT) &
+ INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK;
+ if (v != n) {
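+		/* update the ibcctrl shadow copy, then write it to the chip */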
+ dd->ipath_ibcctrl &=
+ ~(INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK <<
+ INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT);
+ dd->ipath_ibcctrl |=
+ (u64) n << INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+ dd->ipath_ibcctrl);
+ }
+ return 0;
+}
+
+static int get_phyerrthreshold(struct ipath_devdata *dd)
+{
+ return (dd->ipath_ibcctrl >>
+ INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) &
+ INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK;
+}
+
+/**
+ * set_phyerrthreshold - set the physical error threshold
+ * @dd: the infinipath device
+ * @n: the new threshold
+ *
+ * Note that this will only take effect when the link state changes.
+ */
+static int set_phyerrthreshold(struct ipath_devdata *dd, unsigned n)
+{
+ unsigned v;
+
+ v = (dd->ipath_ibcctrl >> INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT) &
+ INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK;
+ if (v != n) {
+ dd->ipath_ibcctrl &=
+ ~(INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK <<
+ INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT);
+ dd->ipath_ibcctrl |=
+ (u64) n << INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+ dd->ipath_ibcctrl);
+ }
+ return 0;
+}
+
+/**
+ * get_linkdowndefaultstate - get the default linkdown state
+ * @dd: the infinipath device
+ *
+ * Returns zero if the default is POLL, 1 if the default is SLEEP.
+ */
+static int get_linkdowndefaultstate(struct ipath_devdata *dd)
+{
+ return !!(dd->ipath_ibcctrl & INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE);
+}
+
+static int recv_subn_get_portinfo(struct ib_smp *smp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ipath_ibdev *dev;
+ struct ipath_devdata *dd;
+ struct ib_port_info *pip = (struct ib_port_info *)smp->data;
+ u16 lid;
+ u8 ibcstat;
+ u8 mtu;
+ int ret;
+
+ if (be32_to_cpu(smp->attr_mod) > ibdev->phys_port_cnt) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ ret = reply(smp);
+ goto bail;
+ }
+
+ dev = to_idev(ibdev);
+ dd = dev->dd;
+
+ /* Clear all fields. Only set the non-zero fields. */
+ memset(smp->data, 0, sizeof(smp->data));
+
+ /* Only return the mkey if the protection field allows it. */
+ if (smp->method == IB_MGMT_METHOD_SET || dev->mkey == smp->mkey ||
+ dev->mkeyprot == 0)
+ pip->mkey = dev->mkey;
+ pip->gid_prefix = dev->gid_prefix;
+ lid = dd->ipath_lid;
+ pip->lid = lid ? cpu_to_be16(lid) : IB_LID_PERMISSIVE;
+ pip->sm_lid = cpu_to_be16(dev->sm_lid);
+ pip->cap_mask = cpu_to_be32(dev->port_cap_flags);
+ /* pip->diag_code; */
+ pip->mkey_lease_period = cpu_to_be16(dev->mkey_lease_period);
+ pip->local_port_num = port;
+ pip->link_width_enabled = dd->ipath_link_width_enabled;
+ pip->link_width_supported = dd->ipath_link_width_supported;
+ pip->link_width_active = dd->ipath_link_width_active;
+ pip->linkspeed_portstate = dd->ipath_link_speed_supported << 4;
+ ibcstat = dd->ipath_lastibcstat;
+ /* map LinkState to IB portinfo values. */
+ pip->linkspeed_portstate |= ipath_ib_linkstate(dd, ibcstat) + 1;
+
+ pip->portphysstate_linkdown =
+ (ipath_cvt_physportstate[ibcstat & dd->ibcs_lts_mask] << 4) |
+ (get_linkdowndefaultstate(dd) ? 1 : 2);
+ pip->mkeyprot_resv_lmc = (dev->mkeyprot << 6) | dd->ipath_lmc;
+ pip->linkspeedactive_enabled = (dd->ipath_link_speed_active << 4) |
+ dd->ipath_link_speed_enabled;
+ switch (dd->ipath_ibmtu) {
+ case 4096:
+ mtu = IB_MTU_4096;
+ break;
+ case 2048:
+ mtu = IB_MTU_2048;
+ break;
+ case 1024:
+ mtu = IB_MTU_1024;
+ break;
+ case 512:
+ mtu = IB_MTU_512;
+ break;
+ case 256:
+ mtu = IB_MTU_256;
+ break;
+ default: /* oops, something is wrong */
+ mtu = IB_MTU_2048;
+ break;
+ }
+ pip->neighbormtu_mastersmsl = (mtu << 4) | dev->sm_sl;
+ pip->vlcap_inittype = 0x10; /* VLCap = VL0, InitType = 0 */
+ pip->vl_high_limit = dev->vl_high_limit;
+ /* pip->vl_arb_high_cap; // only one VL */
+ /* pip->vl_arb_low_cap; // only one VL */
+ /* InitTypeReply = 0 */
+ /* our mtu cap depends on whether 4K MTU enabled or not */
+ pip->inittypereply_mtucap = ipath_mtu4096 ? IB_MTU_4096 : IB_MTU_2048;
+ /* HCAs ignore VLStallCount and HOQLife */
+ /* pip->vlstallcnt_hoqlife; */
+ pip->operationalvl_pei_peo_fpi_fpo = 0x10; /* OVLs = 1 */
+ pip->mkey_violations = cpu_to_be16(dev->mkey_violations);
+ /* P_KeyViolations are counted by hardware. */
+ pip->pkey_violations =
+ cpu_to_be16((ipath_get_cr_errpkey(dd) -
+ dev->z_pkey_violations) & 0xFFFF);
+ pip->qkey_violations = cpu_to_be16(dev->qkey_violations);
+ /* Only the hardware GUID is supported for now */
+ pip->guid_cap = 1;
+ pip->clientrereg_resv_subnetto = dev->subnet_timeout;
+ /* 32.768 usec. response time (guessing) */
+ pip->resv_resptimevalue = 3;
+ pip->localphyerrors_overrunerrors =
+ (get_phyerrthreshold(dd) << 4) |
+ get_overrunthreshold(dd);
+ /* pip->max_credit_hint; */
+ if (dev->port_cap_flags & IB_PORT_LINK_LATENCY_SUP) {
+ u32 v;
+
+ v = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LINKLATENCY);
+ pip->link_roundtrip_latency[0] = v >> 16;
+ pip->link_roundtrip_latency[1] = v >> 8;
+ pip->link_roundtrip_latency[2] = v;
+ }
+
+ ret = reply(smp);
+
+bail:
+ return ret;
+}
+
+/**
+ * get_pkeys - return the PKEY table for port 0
+ * @dd: the infinipath device
+ * @pkeys: the pkey table is placed here
+ */
+static int get_pkeys(struct ipath_devdata *dd, u16 * pkeys)
+{
+ /* always a kernel port, no locking needed */
+ struct ipath_portdata *pd = dd->ipath_pd[0];
+
+ memcpy(pkeys, pd->port_pkeys, sizeof(pd->port_pkeys));
+
+ return 0;
+}
+
+static int recv_subn_get_pkeytable(struct ib_smp *smp,
+ struct ib_device *ibdev)
+{
+ u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff);
+ u16 *p = (u16 *) smp->data;
+ __be16 *q = (__be16 *) smp->data;
+
+ /* 64 blocks of 32 16-bit P_Key entries */
+
+ memset(smp->data, 0, sizeof(smp->data));
+ if (startpx == 0) {
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ unsigned i, n = ipath_get_npkeys(dev->dd);
+
+ get_pkeys(dev->dd, p);
+
+ for (i = 0; i < n; i++)
+ q[i] = cpu_to_be16(p[i]);
+ } else
+ smp->status |= IB_SMP_INVALID_FIELD;
+
+ return reply(smp);
+}
+
+static int recv_subn_set_guidinfo(struct ib_smp *smp,
+ struct ib_device *ibdev)
+{
+ /* The only GUID we support is the first read-only entry. */
+ return recv_subn_get_guidinfo(smp, ibdev);
+}
+
+/**
+ * set_linkdowndefaultstate - set the default linkdown state
+ * @dd: the infinipath device
+ * @sleep: non-zero to make SLEEP the default link-down state, zero for POLL
+ *
+ * Note that this will only take effect when the link state changes.
+ */
+static int set_linkdowndefaultstate(struct ipath_devdata *dd, int sleep)
+{
+ if (sleep)
+ dd->ipath_ibcctrl |= INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE;
+ else
+ dd->ipath_ibcctrl &= ~INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_ibcctrl,
+ dd->ipath_ibcctrl);
+ return 0;
+}
+
+/**
+ * recv_subn_set_portinfo - set port information
+ * @smp: the incoming SM packet
+ * @ibdev: the infiniband device
+ * @port: the port on the device
+ *
+ * Set Portinfo (see ch. 14.2.5.6).
+ */
+static int recv_subn_set_portinfo(struct ib_smp *smp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ib_port_info *pip = (struct ib_port_info *)smp->data;
+ struct ib_event event;
+ struct ipath_ibdev *dev;
+ struct ipath_devdata *dd;
+ char clientrereg = 0;
+ u16 lid, smlid;
+ u8 lwe;
+ u8 lse;
+ u8 state;
+ u16 lstate;
+ u32 mtu;
+ int ret, ore;
+
+ if (be32_to_cpu(smp->attr_mod) > ibdev->phys_port_cnt)
+ goto err;
+
+ dev = to_idev(ibdev);
+ dd = dev->dd;
+ event.device = ibdev;
+ event.element.port_num = port;
+
+ dev->mkey = pip->mkey;
+ dev->gid_prefix = pip->gid_prefix;
+ dev->mkey_lease_period = be16_to_cpu(pip->mkey_lease_period);
+
+ lid = be16_to_cpu(pip->lid);
+ if (dd->ipath_lid != lid ||
+ dd->ipath_lmc != (pip->mkeyprot_resv_lmc & 7)) {
+ /* Must be a valid unicast LID address. */
+ if (lid == 0 || lid >= IPATH_MULTICAST_LID_BASE)
+ goto err;
+ ipath_set_lid(dd, lid, pip->mkeyprot_resv_lmc & 7);
+ event.event = IB_EVENT_LID_CHANGE;
+ ib_dispatch_event(&event);
+ }
+
+ smlid = be16_to_cpu(pip->sm_lid);
+ if (smlid != dev->sm_lid) {
+ /* Must be a valid unicast LID address. */
+ if (smlid == 0 || smlid >= IPATH_MULTICAST_LID_BASE)
+ goto err;
+ dev->sm_lid = smlid;
+ event.event = IB_EVENT_SM_CHANGE;
+ ib_dispatch_event(&event);
+ }
+
+ /* Allow 1x or 4x to be set (see 14.2.6.6). */
+ lwe = pip->link_width_enabled;
+ if (lwe) {
+ if (lwe == 0xFF)
+ lwe = dd->ipath_link_width_supported;
+ else if (lwe >= 16 || (lwe & ~dd->ipath_link_width_supported))
+ goto err;
+ set_link_width_enabled(dd, lwe);
+ }
+
+ /* Allow 2.5 or 5.0 Gb/s. */
+ lse = pip->linkspeedactive_enabled & 0xF;
+ if (lse) {
+ if (lse == 15)
+ lse = dd->ipath_link_speed_supported;
+ else if (lse >= 8 || (lse & ~dd->ipath_link_speed_supported))
+ goto err;
+ set_link_speed_enabled(dd, lse);
+ }
+
+ /* Set link down default state. */
+ switch (pip->portphysstate_linkdown & 0xF) {
+ case 0: /* NOP */
+ break;
+ case 1: /* SLEEP */
+ if (set_linkdowndefaultstate(dd, 1))
+ goto err;
+ break;
+ case 2: /* POLL */
+ if (set_linkdowndefaultstate(dd, 0))
+ goto err;
+ break;
+ default:
+ goto err;
+ }
+
+ dev->mkeyprot = pip->mkeyprot_resv_lmc >> 6;
+ dev->vl_high_limit = pip->vl_high_limit;
+
+ switch ((pip->neighbormtu_mastersmsl >> 4) & 0xF) {
+ case IB_MTU_256:
+ mtu = 256;
+ break;
+ case IB_MTU_512:
+ mtu = 512;
+ break;
+ case IB_MTU_1024:
+ mtu = 1024;
+ break;
+ case IB_MTU_2048:
+ mtu = 2048;
+ break;
+ case IB_MTU_4096:
+ if (!ipath_mtu4096)
+ goto err;
+ mtu = 4096;
+ break;
+ default:
+ /* XXX We have already partially updated our state! */
+ goto err;
+ }
+ ipath_set_mtu(dd, mtu);
+
+ dev->sm_sl = pip->neighbormtu_mastersmsl & 0xF;
+
+ /* We only support VL0 */
+ if (((pip->operationalvl_pei_peo_fpi_fpo >> 4) & 0xF) > 1)
+ goto err;
+
+ if (pip->mkey_violations == 0)
+ dev->mkey_violations = 0;
+
+ /*
+ * Hardware counter can't be reset so snapshot and subtract
+ * later.
+ */
+ if (pip->pkey_violations == 0)
+ dev->z_pkey_violations = ipath_get_cr_errpkey(dd);
+
+ if (pip->qkey_violations == 0)
+ dev->qkey_violations = 0;
+
+ ore = pip->localphyerrors_overrunerrors;
+ if (set_phyerrthreshold(dd, (ore >> 4) & 0xF))
+ goto err;
+
+ if (set_overrunthreshold(dd, (ore & 0xF)))
+ goto err;
+
+ dev->subnet_timeout = pip->clientrereg_resv_subnetto & 0x1F;
+
+ if (pip->clientrereg_resv_subnetto & 0x80) {
+ clientrereg = 1;
+ event.event = IB_EVENT_CLIENT_REREGISTER;
+ ib_dispatch_event(&event);
+ }
+
+ /*
+ * Do the port state change now that the other link parameters
+ * have been set.
+ * Changing the port physical state only makes sense if the link
+ * is down or is being set to down.
+ */
+ state = pip->linkspeed_portstate & 0xF;
+ lstate = (pip->portphysstate_linkdown >> 4) & 0xF;
+ if (lstate && !(state == IB_PORT_DOWN || state == IB_PORT_NOP))
+ goto err;
+
+ /*
+ * Only state changes of DOWN, ARM, and ACTIVE are valid
+ * and must be in the correct state to take effect (see 7.2.6).
+ */
+ switch (state) {
+ case IB_PORT_NOP:
+ if (lstate == 0)
+ break;
+ /* FALLTHROUGH */
+ case IB_PORT_DOWN:
+ if (lstate == 0)
+ lstate = IPATH_IB_LINKDOWN_ONLY;
+ else if (lstate == 1)
+ lstate = IPATH_IB_LINKDOWN_SLEEP;
+ else if (lstate == 2)
+ lstate = IPATH_IB_LINKDOWN;
+ else if (lstate == 3)
+ lstate = IPATH_IB_LINKDOWN_DISABLE;
+ else
+ goto err;
+ ipath_set_linkstate(dd, lstate);
+ if (lstate == IPATH_IB_LINKDOWN_DISABLE) {
+ ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+ goto done;
+ }
+ ipath_wait_linkstate(dd, IPATH_LINKINIT | IPATH_LINKARMED |
+ IPATH_LINKACTIVE, 1000);
+ break;
+ case IB_PORT_ARMED:
+ ipath_set_linkstate(dd, IPATH_IB_LINKARM);
+ break;
+ case IB_PORT_ACTIVE:
+ ipath_set_linkstate(dd, IPATH_IB_LINKACTIVE);
+ break;
+ default:
+ /* XXX We have already partially updated our state! */
+ goto err;
+ }
+
+ ret = recv_subn_get_portinfo(smp, ibdev, port);
+
+ if (clientrereg)
+ pip->clientrereg_resv_subnetto |= 0x80;
+
+ goto done;
+
+err:
+ smp->status |= IB_SMP_INVALID_FIELD;
+ ret = recv_subn_get_portinfo(smp, ibdev, port);
+
+done:
+ return ret;
+}
+
+/**
+ * rm_pkey - decrement the reference count for the given PKEY
+ * @dd: the infinipath device
+ * @key: the PKEY whose reference is to be dropped
+ *
+ * Return true if this was the last reference and the hardware table entry
+ * needs to be changed.
+ */
+static int rm_pkey(struct ipath_devdata *dd, u16 key)
+{
+ int i;
+ int ret;
+
+ for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+ if (dd->ipath_pkeys[i] != key)
+ continue;
+ if (atomic_dec_and_test(&dd->ipath_pkeyrefs[i])) {
+ dd->ipath_pkeys[i] = 0;
+ ret = 1;
+ goto bail;
+ }
+ break;
+ }
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * add_pkey - add the given PKEY to the hardware table
+ * @dd: the infinipath device
+ * @key: the PKEY
+ *
+ * Return an error code if unable to add the entry, zero if no change,
+ * or 1 if the hardware PKEY register needs to be updated.
+ */
+static int add_pkey(struct ipath_devdata *dd, u16 key)
+{
+ int i;
+ u16 lkey = key & 0x7FFF;
+ int any = 0;
+ int ret;
+
+ if (lkey == 0x7FFF) {
+ ret = 0;
+ goto bail;
+ }
+
+ /* Look for an empty slot or a matching PKEY. */
+ for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+ if (!dd->ipath_pkeys[i]) {
+ any++;
+ continue;
+ }
+ /* If it matches exactly, try to increment the ref count */
+ if (dd->ipath_pkeys[i] == key) {
+ if (atomic_inc_return(&dd->ipath_pkeyrefs[i]) > 1) {
+ ret = 0;
+ goto bail;
+ }
+ /* Lost the race. Look for an empty slot below. */
+ atomic_dec(&dd->ipath_pkeyrefs[i]);
+ any++;
+ }
+ /*
+ * It makes no sense to have both the limited and unlimited
+ * PKEY set at the same time since the unlimited one will
+ * disable the limited one.
+ */
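+ /*
+ * Illustration (added note): the default P_Key 0xFFFF and its
+ * limited-member form 0x7FFF differ only in bit 15, which is why the
+ * comparison below masks with 0x7FFF.
+ */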
+ if ((dd->ipath_pkeys[i] & 0x7FFF) == lkey) {
+ ret = -EEXIST;
+ goto bail;
+ }
+ }
+ if (!any) {
+ ret = -EBUSY;
+ goto bail;
+ }
+ for (i = 0; i < ARRAY_SIZE(dd->ipath_pkeys); i++) {
+ if (!dd->ipath_pkeys[i] &&
+ atomic_inc_return(&dd->ipath_pkeyrefs[i]) == 1) {
+ /* for ipathstats, etc. */
+ ipath_stats.sps_pkeys[i] = lkey;
+ dd->ipath_pkeys[i] = key;
+ ret = 1;
+ goto bail;
+ }
+ }
+ ret = -EBUSY;
+
+bail:
+ return ret;
+}
+
+/**
+ * set_pkeys - set the PKEY table for port 0
+ * @dd: the infinipath device
+ * @pkeys: the PKEY table
+ * @port: the port number, used when dispatching the PKEY change event
+ */
+static int set_pkeys(struct ipath_devdata *dd, u16 *pkeys, u8 port)
+{
+ struct ipath_portdata *pd;
+ int i;
+ int changed = 0;
+
+ /* always a kernel port, no locking needed */
+ pd = dd->ipath_pd[0];
+
+ for (i = 0; i < ARRAY_SIZE(pd->port_pkeys); i++) {
+ u16 key = pkeys[i];
+ u16 okey = pd->port_pkeys[i];
+
+ if (key == okey)
+ continue;
+ /*
+ * The value of this PKEY table entry is changing.
+ * Remove the old entry in the hardware's array of PKEYs.
+ */
+ if (okey & 0x7FFF)
+ changed |= rm_pkey(dd, okey);
+ if (key & 0x7FFF) {
+ int ret = add_pkey(dd, key);
+
+ if (ret < 0)
+ key = 0;
+ else
+ changed |= ret;
+ }
+ pd->port_pkeys[i] = key;
+ }
+ if (changed) {
+ u64 pkey;
+ struct ib_event event;
+
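+ /*
+ * The first four table entries are packed into the chip's single
+ * 64-bit partition key register, entry 0 in the least significant
+ * 16 bits, as the shifts below show.
+ */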
+ pkey = (u64) dd->ipath_pkeys[0] |
+ ((u64) dd->ipath_pkeys[1] << 16) |
+ ((u64) dd->ipath_pkeys[2] << 32) |
+ ((u64) dd->ipath_pkeys[3] << 48);
+ ipath_cdbg(VERBOSE, "p0 new pkey reg %llx\n",
+ (unsigned long long) pkey);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_partitionkey,
+ pkey);
+
+ event.event = IB_EVENT_PKEY_CHANGE;
+ event.device = &dd->verbs_dev->ibdev;
+ event.element.port_num = port;
+ ib_dispatch_event(&event);
+ }
+ return 0;
+}
+
+static int recv_subn_set_pkeytable(struct ib_smp *smp,
+ struct ib_device *ibdev, u8 port)
+{
+ u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff);
+ __be16 *p = (__be16 *) smp->data;
+ u16 *q = (u16 *) smp->data;
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ unsigned i, n = ipath_get_npkeys(dev->dd);
+
+ for (i = 0; i < n; i++)
+ q[i] = be16_to_cpu(p[i]);
+
+ if (startpx != 0 || set_pkeys(dev->dd, q, port) != 0)
+ smp->status |= IB_SMP_INVALID_FIELD;
+
+ return recv_subn_get_pkeytable(smp, ibdev);
+}
+
+static int recv_pma_get_classportinfo(struct ib_pma_mad *pmp)
+{
+ struct ib_class_port_info *p =
+ (struct ib_class_port_info *)pmp->data;
+
+ memset(pmp->data, 0, sizeof(pmp->data));
+
+ if (pmp->mad_hdr.attr_mod != 0)
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+
+ /* Indicate AllPortSelect is valid (only one port anyway) */
+ p->capability_mask = cpu_to_be16(1 << 8);
+ p->base_version = 1;
+ p->class_version = 1;
+ /*
+ * Expected response time is 4.096 usec. * 2^18 == 1.073741824
+ * sec.
+ */
+ p->resp_time_value = 18;
+
+ return reply((struct ib_smp *) pmp);
+}
+
+/*
+ * The PortSamplesControl.CounterMasks field is an array of 3-bit fields
+ * which specify the N'th counter's capabilities. See ch. 16.1.3.2.
+ * We support 5 counters which only count the mandatory quantities.
+ */
+#define COUNTER_MASK(q, n) (q << ((9 - n) * 3))
+#define COUNTER_MASK0_9 cpu_to_be32(COUNTER_MASK(1, 0) | \
+ COUNTER_MASK(1, 1) | \
+ COUNTER_MASK(1, 2) | \
+ COUNTER_MASK(1, 3) | \
+ COUNTER_MASK(1, 4))
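+
+/*
+ * Worked example (added for clarity): COUNTER_MASK(1, 0) expands to
+ * 1 << 27, so COUNTER_MASK0_9 sets the low bit of the five most
+ * significant 3-bit fields (bits 27, 24, 21, 18 and 15), i.e.
+ * capability value 1 for counters 0 through 4 and 0 for the rest.
+ */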
+
+static int recv_pma_get_portsamplescontrol(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ib_pma_portsamplescontrol *p =
+ (struct ib_pma_portsamplescontrol *)pmp->data;
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ struct ipath_cregs const *crp = dev->dd->ipath_cregs;
+ unsigned long flags;
+ u8 port_select = p->port_select;
+
+ memset(pmp->data, 0, sizeof(pmp->data));
+
+ p->port_select = port_select;
+ if (pmp->mad_hdr.attr_mod != 0 ||
+ (port_select != port && port_select != 0xFF))
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ /*
+ * Ticks are 10x the link transfer period which for 2.5 Gb/s is 4
+ * nsec. 0 == 4 nsec., 1 == 8 nsec., ..., 255 == 1020 nsec. Sample
+ * intervals are counted in ticks. Since we use Linux timers, that
+ * count in jiffies, we can't sample for less than 1000 ticks if HZ
+ * == 1000 (4000 ticks if HZ is 250). link_speed_active returns 2 for
+ * DDR, 1 for SDR, set the tick to 1 for DDR, 0 for SDR on chips that
+ * have hardware support for delaying packets.
+ */
+ if (crp->cr_psstat)
+ p->tick = dev->dd->ipath_link_speed_active - 1;
+ else
+ p->tick = 250; /* 1 usec. */
+ p->counter_width = 4; /* 32 bit counters */
+ p->counter_mask0_9 = COUNTER_MASK0_9;
+ spin_lock_irqsave(&dev->pending_lock, flags);
+ if (crp->cr_psstat)
+ p->sample_status = ipath_read_creg32(dev->dd, crp->cr_psstat);
+ else
+ p->sample_status = dev->pma_sample_status;
+ p->sample_start = cpu_to_be32(dev->pma_sample_start);
+ p->sample_interval = cpu_to_be32(dev->pma_sample_interval);
+ p->tag = cpu_to_be16(dev->pma_tag);
+ p->counter_select[0] = dev->pma_counter_select[0];
+ p->counter_select[1] = dev->pma_counter_select[1];
+ p->counter_select[2] = dev->pma_counter_select[2];
+ p->counter_select[3] = dev->pma_counter_select[3];
+ p->counter_select[4] = dev->pma_counter_select[4];
+ spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+ return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_set_portsamplescontrol(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ib_pma_portsamplescontrol *p =
+ (struct ib_pma_portsamplescontrol *)pmp->data;
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ struct ipath_cregs const *crp = dev->dd->ipath_cregs;
+ unsigned long flags;
+ u8 status;
+ int ret;
+
+ if (pmp->mad_hdr.attr_mod != 0 ||
+ (p->port_select != port && p->port_select != 0xFF)) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ ret = reply((struct ib_smp *) pmp);
+ goto bail;
+ }
+
+ spin_lock_irqsave(&dev->pending_lock, flags);
+ if (crp->cr_psstat)
+ status = ipath_read_creg32(dev->dd, crp->cr_psstat);
+ else
+ status = dev->pma_sample_status;
+ if (status == IB_PMA_SAMPLE_STATUS_DONE) {
+ dev->pma_sample_start = be32_to_cpu(p->sample_start);
+ dev->pma_sample_interval = be32_to_cpu(p->sample_interval);
+ dev->pma_tag = be16_to_cpu(p->tag);
+ dev->pma_counter_select[0] = p->counter_select[0];
+ dev->pma_counter_select[1] = p->counter_select[1];
+ dev->pma_counter_select[2] = p->counter_select[2];
+ dev->pma_counter_select[3] = p->counter_select[3];
+ dev->pma_counter_select[4] = p->counter_select[4];
+ if (crp->cr_psstat) {
+ ipath_write_creg(dev->dd, crp->cr_psinterval,
+ dev->pma_sample_interval);
+ ipath_write_creg(dev->dd, crp->cr_psstart,
+ dev->pma_sample_start);
+ } else
+ dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_STARTED;
+ }
+ spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+ ret = recv_pma_get_portsamplescontrol(pmp, ibdev, port);
+
+bail:
+ return ret;
+}
+
+static u64 get_counter(struct ipath_ibdev *dev,
+ struct ipath_cregs const *crp,
+ __be16 sel)
+{
+ u64 ret;
+
+ switch (sel) {
+ case IB_PMA_PORT_XMIT_DATA:
+ ret = (crp->cr_psxmitdatacount) ?
+ ipath_read_creg32(dev->dd, crp->cr_psxmitdatacount) :
+ dev->ipath_sword;
+ break;
+ case IB_PMA_PORT_RCV_DATA:
+ ret = (crp->cr_psrcvdatacount) ?
+ ipath_read_creg32(dev->dd, crp->cr_psrcvdatacount) :
+ dev->ipath_rword;
+ break;
+ case IB_PMA_PORT_XMIT_PKTS:
+ ret = (crp->cr_psxmitpktscount) ?
+ ipath_read_creg32(dev->dd, crp->cr_psxmitpktscount) :
+ dev->ipath_spkts;
+ break;
+ case IB_PMA_PORT_RCV_PKTS:
+ ret = (crp->cr_psrcvpktscount) ?
+ ipath_read_creg32(dev->dd, crp->cr_psrcvpktscount) :
+ dev->ipath_rpkts;
+ break;
+ case IB_PMA_PORT_XMIT_WAIT:
+ ret = (crp->cr_psxmitwaitcount) ?
+ ipath_read_creg32(dev->dd, crp->cr_psxmitwaitcount) :
+ dev->ipath_xmit_wait;
+ break;
+ default:
+ ret = 0;
+ }
+
+ return ret;
+}
+
+static int recv_pma_get_portsamplesresult(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev)
+{
+ struct ib_pma_portsamplesresult *p =
+ (struct ib_pma_portsamplesresult *)pmp->data;
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ struct ipath_cregs const *crp = dev->dd->ipath_cregs;
+ u8 status;
+ int i;
+
+ memset(pmp->data, 0, sizeof(pmp->data));
+ p->tag = cpu_to_be16(dev->pma_tag);
+ if (crp->cr_psstat)
+ status = ipath_read_creg32(dev->dd, crp->cr_psstat);
+ else
+ status = dev->pma_sample_status;
+ p->sample_status = cpu_to_be16(status);
+ for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++)
+ p->counter[i] = (status != IB_PMA_SAMPLE_STATUS_DONE) ? 0 :
+ cpu_to_be32(
+ get_counter(dev, crp, dev->pma_counter_select[i]));
+
+ return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_get_portsamplesresult_ext(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev)
+{
+ struct ib_pma_portsamplesresult_ext *p =
+ (struct ib_pma_portsamplesresult_ext *)pmp->data;
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ struct ipath_cregs const *crp = dev->dd->ipath_cregs;
+ u8 status;
+ int i;
+
+ memset(pmp->data, 0, sizeof(pmp->data));
+ p->tag = cpu_to_be16(dev->pma_tag);
+ if (crp->cr_psstat)
+ status = ipath_read_creg32(dev->dd, crp->cr_psstat);
+ else
+ status = dev->pma_sample_status;
+ p->sample_status = cpu_to_be16(status);
+ /* 64 bits */
+ p->extended_width = cpu_to_be32(0x80000000);
+ for (i = 0; i < ARRAY_SIZE(dev->pma_counter_select); i++)
+ p->counter[i] = (status != IB_PMA_SAMPLE_STATUS_DONE) ? 0 :
+ cpu_to_be64(
+ get_counter(dev, crp, dev->pma_counter_select[i]));
+
+ return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_get_portcounters(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
+ pmp->data;
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ struct ipath_verbs_counters cntrs;
+ u8 port_select = p->port_select;
+
+ ipath_get_counters(dev->dd, &cntrs);
+
+ /* Adjust counters for any resets done. */
+ cntrs.symbol_error_counter -= dev->z_symbol_error_counter;
+ cntrs.link_error_recovery_counter -=
+ dev->z_link_error_recovery_counter;
+ cntrs.link_downed_counter -= dev->z_link_downed_counter;
+ cntrs.port_rcv_errors += dev->rcv_errors;
+ cntrs.port_rcv_errors -= dev->z_port_rcv_errors;
+ cntrs.port_rcv_remphys_errors -= dev->z_port_rcv_remphys_errors;
+ cntrs.port_xmit_discards -= dev->z_port_xmit_discards;
+ cntrs.port_xmit_data -= dev->z_port_xmit_data;
+ cntrs.port_rcv_data -= dev->z_port_rcv_data;
+ cntrs.port_xmit_packets -= dev->z_port_xmit_packets;
+ cntrs.port_rcv_packets -= dev->z_port_rcv_packets;
+ cntrs.local_link_integrity_errors -=
+ dev->z_local_link_integrity_errors;
+ cntrs.excessive_buffer_overrun_errors -=
+ dev->z_excessive_buffer_overrun_errors;
+ cntrs.vl15_dropped -= dev->z_vl15_dropped;
+ cntrs.vl15_dropped += dev->n_vl15_dropped;
+
+ memset(pmp->data, 0, sizeof(pmp->data));
+
+ p->port_select = port_select;
+ if (pmp->mad_hdr.attr_mod != 0 ||
+ (port_select != port && port_select != 0xFF))
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+
+ if (cntrs.symbol_error_counter > 0xFFFFUL)
+ p->symbol_error_counter = cpu_to_be16(0xFFFF);
+ else
+ p->symbol_error_counter =
+ cpu_to_be16((u16)cntrs.symbol_error_counter);
+ if (cntrs.link_error_recovery_counter > 0xFFUL)
+ p->link_error_recovery_counter = 0xFF;
+ else
+ p->link_error_recovery_counter =
+ (u8)cntrs.link_error_recovery_counter;
+ if (cntrs.link_downed_counter > 0xFFUL)
+ p->link_downed_counter = 0xFF;
+ else
+ p->link_downed_counter = (u8)cntrs.link_downed_counter;
+ if (cntrs.port_rcv_errors > 0xFFFFUL)
+ p->port_rcv_errors = cpu_to_be16(0xFFFF);
+ else
+ p->port_rcv_errors =
+ cpu_to_be16((u16) cntrs.port_rcv_errors);
+ if (cntrs.port_rcv_remphys_errors > 0xFFFFUL)
+ p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF);
+ else
+ p->port_rcv_remphys_errors =
+ cpu_to_be16((u16)cntrs.port_rcv_remphys_errors);
+ if (cntrs.port_xmit_discards > 0xFFFFUL)
+ p->port_xmit_discards = cpu_to_be16(0xFFFF);
+ else
+ p->port_xmit_discards =
+ cpu_to_be16((u16)cntrs.port_xmit_discards);
+ if (cntrs.local_link_integrity_errors > 0xFUL)
+ cntrs.local_link_integrity_errors = 0xFUL;
+ if (cntrs.excessive_buffer_overrun_errors > 0xFUL)
+ cntrs.excessive_buffer_overrun_errors = 0xFUL;
+ p->link_overrun_errors = (cntrs.local_link_integrity_errors << 4) |
+ cntrs.excessive_buffer_overrun_errors;
+ if (cntrs.vl15_dropped > 0xFFFFUL)
+ p->vl15_dropped = cpu_to_be16(0xFFFF);
+ else
+ p->vl15_dropped = cpu_to_be16((u16)cntrs.vl15_dropped);
+ if (cntrs.port_xmit_data > 0xFFFFFFFFUL)
+ p->port_xmit_data = cpu_to_be32(0xFFFFFFFF);
+ else
+ p->port_xmit_data = cpu_to_be32((u32)cntrs.port_xmit_data);
+ if (cntrs.port_rcv_data > 0xFFFFFFFFUL)
+ p->port_rcv_data = cpu_to_be32(0xFFFFFFFF);
+ else
+ p->port_rcv_data = cpu_to_be32((u32)cntrs.port_rcv_data);
+ if (cntrs.port_xmit_packets > 0xFFFFFFFFUL)
+ p->port_xmit_packets = cpu_to_be32(0xFFFFFFFF);
+ else
+ p->port_xmit_packets =
+ cpu_to_be32((u32)cntrs.port_xmit_packets);
+ if (cntrs.port_rcv_packets > 0xFFFFFFFFUL)
+ p->port_rcv_packets = cpu_to_be32(0xFFFFFFFF);
+ else
+ p->port_rcv_packets =
+ cpu_to_be32((u32) cntrs.port_rcv_packets);
+
+ return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_get_portcounters_ext(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ib_pma_portcounters_ext *p =
+ (struct ib_pma_portcounters_ext *)pmp->data;
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ u64 swords, rwords, spkts, rpkts, xwait;
+ u8 port_select = p->port_select;
+
+ ipath_snapshot_counters(dev->dd, &swords, &rwords, &spkts,
+ &rpkts, &xwait);
+
+ /* Adjust counters for any resets done. */
+ swords -= dev->z_port_xmit_data;
+ rwords -= dev->z_port_rcv_data;
+ spkts -= dev->z_port_xmit_packets;
+ rpkts -= dev->z_port_rcv_packets;
+
+ memset(pmp->data, 0, sizeof(pmp->data));
+
+ p->port_select = port_select;
+ if (pmp->mad_hdr.attr_mod != 0 ||
+ (port_select != port && port_select != 0xFF))
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+
+ p->port_xmit_data = cpu_to_be64(swords);
+ p->port_rcv_data = cpu_to_be64(rwords);
+ p->port_xmit_packets = cpu_to_be64(spkts);
+ p->port_rcv_packets = cpu_to_be64(rpkts);
+ p->port_unicast_xmit_packets = cpu_to_be64(dev->n_unicast_xmit);
+ p->port_unicast_rcv_packets = cpu_to_be64(dev->n_unicast_rcv);
+ p->port_multicast_xmit_packets = cpu_to_be64(dev->n_multicast_xmit);
+ p->port_multicast_rcv_packets = cpu_to_be64(dev->n_multicast_rcv);
+
+ return reply((struct ib_smp *) pmp);
+}
+
+static int recv_pma_set_portcounters(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
+ pmp->data;
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ struct ipath_verbs_counters cntrs;
+
+ /*
+ * Since the HW doesn't support clearing counters, we save the
+ * current count and subtract it from future responses.
+ */
+ ipath_get_counters(dev->dd, &cntrs);
+
+ if (p->counter_select & IB_PMA_SEL_SYMBOL_ERROR)
+ dev->z_symbol_error_counter = cntrs.symbol_error_counter;
+
+ if (p->counter_select & IB_PMA_SEL_LINK_ERROR_RECOVERY)
+ dev->z_link_error_recovery_counter =
+ cntrs.link_error_recovery_counter;
+
+ if (p->counter_select & IB_PMA_SEL_LINK_DOWNED)
+ dev->z_link_downed_counter = cntrs.link_downed_counter;
+
+ if (p->counter_select & IB_PMA_SEL_PORT_RCV_ERRORS)
+ dev->z_port_rcv_errors =
+ cntrs.port_rcv_errors + dev->rcv_errors;
+
+ if (p->counter_select & IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS)
+ dev->z_port_rcv_remphys_errors =
+ cntrs.port_rcv_remphys_errors;
+
+ if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DISCARDS)
+ dev->z_port_xmit_discards = cntrs.port_xmit_discards;
+
+ if (p->counter_select & IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS)
+ dev->z_local_link_integrity_errors =
+ cntrs.local_link_integrity_errors;
+
+ if (p->counter_select & IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS)
+ dev->z_excessive_buffer_overrun_errors =
+ cntrs.excessive_buffer_overrun_errors;
+
+ if (p->counter_select & IB_PMA_SEL_PORT_VL15_DROPPED) {
+ dev->n_vl15_dropped = 0;
+ dev->z_vl15_dropped = cntrs.vl15_dropped;
+ }
+
+ if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DATA)
+ dev->z_port_xmit_data = cntrs.port_xmit_data;
+
+ if (p->counter_select & IB_PMA_SEL_PORT_RCV_DATA)
+ dev->z_port_rcv_data = cntrs.port_rcv_data;
+
+ if (p->counter_select & IB_PMA_SEL_PORT_XMIT_PACKETS)
+ dev->z_port_xmit_packets = cntrs.port_xmit_packets;
+
+ if (p->counter_select & IB_PMA_SEL_PORT_RCV_PACKETS)
+ dev->z_port_rcv_packets = cntrs.port_rcv_packets;
+
+ return recv_pma_get_portcounters(pmp, ibdev, port);
+}
+
+static int recv_pma_set_portcounters_ext(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
+ pmp->data;
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ u64 swords, rwords, spkts, rpkts, xwait;
+
+ ipath_snapshot_counters(dev->dd, &swords, &rwords, &spkts,
+ &rpkts, &xwait);
+
+ if (p->counter_select & IB_PMA_SELX_PORT_XMIT_DATA)
+ dev->z_port_xmit_data = swords;
+
+ if (p->counter_select & IB_PMA_SELX_PORT_RCV_DATA)
+ dev->z_port_rcv_data = rwords;
+
+ if (p->counter_select & IB_PMA_SELX_PORT_XMIT_PACKETS)
+ dev->z_port_xmit_packets = spkts;
+
+ if (p->counter_select & IB_PMA_SELX_PORT_RCV_PACKETS)
+ dev->z_port_rcv_packets = rpkts;
+
+ if (p->counter_select & IB_PMA_SELX_PORT_UNI_XMIT_PACKETS)
+ dev->n_unicast_xmit = 0;
+
+ if (p->counter_select & IB_PMA_SELX_PORT_UNI_RCV_PACKETS)
+ dev->n_unicast_rcv = 0;
+
+ if (p->counter_select & IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS)
+ dev->n_multicast_xmit = 0;
+
+ if (p->counter_select & IB_PMA_SELX_PORT_MULTI_RCV_PACKETS)
+ dev->n_multicast_rcv = 0;
+
+ return recv_pma_get_portcounters_ext(pmp, ibdev, port);
+}
+
+static int process_subn(struct ib_device *ibdev, int mad_flags,
+ u8 port_num, struct ib_mad *in_mad,
+ struct ib_mad *out_mad)
+{
+ struct ib_smp *smp = (struct ib_smp *)out_mad;
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ int ret;
+
+ *out_mad = *in_mad;
+ if (smp->class_version != 1) {
+ smp->status |= IB_SMP_UNSUP_VERSION;
+ ret = reply(smp);
+ goto bail;
+ }
+
+ /* Is the mkey in the process of expiring? */
+ if (dev->mkey_lease_timeout &&
+ time_after_eq(jiffies, dev->mkey_lease_timeout)) {
+ /* Clear timeout and mkey protection field. */
+ dev->mkey_lease_timeout = 0;
+ dev->mkeyprot = 0;
+ }
+
+ /*
+ * M_Key checking depends on
+ * Portinfo:M_Key_protect_bits
+ */
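+ /*
+ * Rough summary of the protect levels (added as a reminder; see the
+ * IBA M_KeyProtectBits definition): at levels 0 and 1 only Set is
+ * checked, while levels 2 and 3 also check Get, which is why a Get is
+ * rejected here only when mkeyprot >= 2.
+ */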
+ if ((mad_flags & IB_MAD_IGNORE_MKEY) == 0 && dev->mkey != 0 &&
+ dev->mkey != smp->mkey &&
+ (smp->method == IB_MGMT_METHOD_SET ||
+ (smp->method == IB_MGMT_METHOD_GET &&
+ dev->mkeyprot >= 2))) {
+ if (dev->mkey_violations != 0xFFFF)
+ ++dev->mkey_violations;
+ if (dev->mkey_lease_timeout ||
+ dev->mkey_lease_period == 0) {
+ ret = IB_MAD_RESULT_SUCCESS |
+ IB_MAD_RESULT_CONSUMED;
+ goto bail;
+ }
+ dev->mkey_lease_timeout = jiffies +
+ dev->mkey_lease_period * HZ;
+ /* Future: Generate a trap notice. */
+ ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+ goto bail;
+ } else if (dev->mkey_lease_timeout)
+ dev->mkey_lease_timeout = 0;
+
+ switch (smp->method) {
+ case IB_MGMT_METHOD_GET:
+ switch (smp->attr_id) {
+ case IB_SMP_ATTR_NODE_DESC:
+ ret = recv_subn_get_nodedescription(smp, ibdev);
+ goto bail;
+ case IB_SMP_ATTR_NODE_INFO:
+ ret = recv_subn_get_nodeinfo(smp, ibdev, port_num);
+ goto bail;
+ case IB_SMP_ATTR_GUID_INFO:
+ ret = recv_subn_get_guidinfo(smp, ibdev);
+ goto bail;
+ case IB_SMP_ATTR_PORT_INFO:
+ ret = recv_subn_get_portinfo(smp, ibdev, port_num);
+ goto bail;
+ case IB_SMP_ATTR_PKEY_TABLE:
+ ret = recv_subn_get_pkeytable(smp, ibdev);
+ goto bail;
+ case IB_SMP_ATTR_SM_INFO:
+ if (dev->port_cap_flags & IB_PORT_SM_DISABLED) {
+ ret = IB_MAD_RESULT_SUCCESS |
+ IB_MAD_RESULT_CONSUMED;
+ goto bail;
+ }
+ if (dev->port_cap_flags & IB_PORT_SM) {
+ ret = IB_MAD_RESULT_SUCCESS;
+ goto bail;
+ }
+ /* FALLTHROUGH */
+ default:
+ smp->status |= IB_SMP_UNSUP_METH_ATTR;
+ ret = reply(smp);
+ goto bail;
+ }
+
+ case IB_MGMT_METHOD_SET:
+ switch (smp->attr_id) {
+ case IB_SMP_ATTR_GUID_INFO:
+ ret = recv_subn_set_guidinfo(smp, ibdev);
+ goto bail;
+ case IB_SMP_ATTR_PORT_INFO:
+ ret = recv_subn_set_portinfo(smp, ibdev, port_num);
+ goto bail;
+ case IB_SMP_ATTR_PKEY_TABLE:
+ ret = recv_subn_set_pkeytable(smp, ibdev, port_num);
+ goto bail;
+ case IB_SMP_ATTR_SM_INFO:
+ if (dev->port_cap_flags & IB_PORT_SM_DISABLED) {
+ ret = IB_MAD_RESULT_SUCCESS |
+ IB_MAD_RESULT_CONSUMED;
+ goto bail;
+ }
+ if (dev->port_cap_flags & IB_PORT_SM) {
+ ret = IB_MAD_RESULT_SUCCESS;
+ goto bail;
+ }
+ /* FALLTHROUGH */
+ default:
+ smp->status |= IB_SMP_UNSUP_METH_ATTR;
+ ret = reply(smp);
+ goto bail;
+ }
+
+ case IB_MGMT_METHOD_TRAP:
+ case IB_MGMT_METHOD_REPORT:
+ case IB_MGMT_METHOD_REPORT_RESP:
+ case IB_MGMT_METHOD_TRAP_REPRESS:
+ case IB_MGMT_METHOD_GET_RESP:
+ /*
+ * The ib_mad module will call us to process responses
+ * before checking for other consumers.
+ * Just tell the caller to process it normally.
+ */
+ ret = IB_MAD_RESULT_SUCCESS;
+ goto bail;
+ default:
+ smp->status |= IB_SMP_UNSUP_METHOD;
+ ret = reply(smp);
+ }
+
+bail:
+ return ret;
+}
+
+static int process_perf(struct ib_device *ibdev, u8 port_num,
+ struct ib_mad *in_mad,
+ struct ib_mad *out_mad)
+{
+ struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad;
+ int ret;
+
+ *out_mad = *in_mad;
+ if (pmp->mad_hdr.class_version != 1) {
+ pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION;
+ ret = reply((struct ib_smp *) pmp);
+ goto bail;
+ }
+
+ switch (pmp->mad_hdr.method) {
+ case IB_MGMT_METHOD_GET:
+ switch (pmp->mad_hdr.attr_id) {
+ case IB_PMA_CLASS_PORT_INFO:
+ ret = recv_pma_get_classportinfo(pmp);
+ goto bail;
+ case IB_PMA_PORT_SAMPLES_CONTROL:
+ ret = recv_pma_get_portsamplescontrol(pmp, ibdev,
+ port_num);
+ goto bail;
+ case IB_PMA_PORT_SAMPLES_RESULT:
+ ret = recv_pma_get_portsamplesresult(pmp, ibdev);
+ goto bail;
+ case IB_PMA_PORT_SAMPLES_RESULT_EXT:
+ ret = recv_pma_get_portsamplesresult_ext(pmp,
+ ibdev);
+ goto bail;
+ case IB_PMA_PORT_COUNTERS:
+ ret = recv_pma_get_portcounters(pmp, ibdev,
+ port_num);
+ goto bail;
+ case IB_PMA_PORT_COUNTERS_EXT:
+ ret = recv_pma_get_portcounters_ext(pmp, ibdev,
+ port_num);
+ goto bail;
+ default:
+ pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+ ret = reply((struct ib_smp *) pmp);
+ goto bail;
+ }
+
+ case IB_MGMT_METHOD_SET:
+ switch (pmp->mad_hdr.attr_id) {
+ case IB_PMA_PORT_SAMPLES_CONTROL:
+ ret = recv_pma_set_portsamplescontrol(pmp, ibdev,
+ port_num);
+ goto bail;
+ case IB_PMA_PORT_COUNTERS:
+ ret = recv_pma_set_portcounters(pmp, ibdev,
+ port_num);
+ goto bail;
+ case IB_PMA_PORT_COUNTERS_EXT:
+ ret = recv_pma_set_portcounters_ext(pmp, ibdev,
+ port_num);
+ goto bail;
+ default:
+ pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+ ret = reply((struct ib_smp *) pmp);
+ goto bail;
+ }
+
+ case IB_MGMT_METHOD_GET_RESP:
+ /*
+ * The ib_mad module will call us to process responses
+ * before checking for other consumers.
+ * Just tell the caller to process it normally.
+ */
+ ret = IB_MAD_RESULT_SUCCESS;
+ goto bail;
+ default:
+ pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD;
+ ret = reply((struct ib_smp *) pmp);
+ }
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_process_mad - process an incoming MAD packet
+ * @ibdev: the infiniband device this packet came in on
+ * @mad_flags: MAD flags
+ * @port_num: the port number this packet came in on
+ * @in_wc: the work completion entry for this packet
+ * @in_grh: the global route header for this packet
+ * @in_mad: the incoming MAD
+ * @out_mad: any outgoing MAD reply
+ *
+ * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not
+ * interested in processing.
+ *
+ * Note that the verbs framework has already done the MAD sanity checks,
+ * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+ * MADs.
+ *
+ * This is called by the ib_mad module.
+ */
+int ipath_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+ struct ib_wc *in_wc, struct ib_grh *in_grh,
+ struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+ int ret;
+
+ switch (in_mad->mad_hdr.mgmt_class) {
+ case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+ case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+ ret = process_subn(ibdev, mad_flags, port_num,
+ in_mad, out_mad);
+ goto bail;
+ case IB_MGMT_CLASS_PERF_MGMT:
+ ret = process_perf(ibdev, port_num, in_mad, out_mad);
+ goto bail;
+ default:
+ ret = IB_MAD_RESULT_SUCCESS;
+ }
+
+bail:
+ return ret;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_mmap.c b/drivers/infiniband/hw/ipath/ipath_mmap.c
new file mode 100644
index 000000000..e73274229
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_mmap.c
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/vmalloc.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <asm/pgtable.h>
+
+#include "ipath_verbs.h"
+
+/**
+ * ipath_release_mmap_info - free mmap info structure
+ * @ref: a pointer to the kref within struct ipath_mmap_info
+ */
+void ipath_release_mmap_info(struct kref *ref)
+{
+ struct ipath_mmap_info *ip =
+ container_of(ref, struct ipath_mmap_info, ref);
+ struct ipath_ibdev *dev = to_idev(ip->context->device);
+
+ spin_lock_irq(&dev->pending_lock);
+ list_del(&ip->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+
+ vfree(ip->obj);
+ kfree(ip);
+}
+
+/*
+ * ipath_vma_open() and ipath_vma_close() keep track of how many times
+ * the object (CQ, QP, or SRQ queue) is mapped, to avoid releasing it
+ * while it is still in use.
+ */
+static void ipath_vma_open(struct vm_area_struct *vma)
+{
+ struct ipath_mmap_info *ip = vma->vm_private_data;
+
+ kref_get(&ip->ref);
+}
+
+static void ipath_vma_close(struct vm_area_struct *vma)
+{
+ struct ipath_mmap_info *ip = vma->vm_private_data;
+
+ kref_put(&ip->ref, ipath_release_mmap_info);
+}
+
+static const struct vm_operations_struct ipath_vm_ops = {
+ .open = ipath_vma_open,
+ .close = ipath_vma_close,
+};
+
+/**
+ * ipath_mmap - create a new mmap region
+ * @context: the IB user context of the process making the mmap() call
+ * @vma: the VMA to be initialized
+ * Return zero if the mmap is OK. Otherwise, return an errno.
+ */
+int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+ struct ipath_ibdev *dev = to_idev(context->device);
+ unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+ unsigned long size = vma->vm_end - vma->vm_start;
+ struct ipath_mmap_info *ip, *pp;
+ int ret = -EINVAL;
+
+ /*
+ * Search the device's list of objects waiting for a mmap call.
+ * Normally, this list is very short since a call to create a
+ * CQ, QP, or SRQ is soon followed by a call to mmap().
+ */
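+ /*
+ * Explanatory note: ipath_create_mmap_info() below assigns each object
+ * an offset token (ip->offset), which is presumably returned to
+ * userspace by the corresponding create verb; userspace passes it back
+ * as the mmap() offset and we match it against this list.
+ */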
+ spin_lock_irq(&dev->pending_lock);
+ list_for_each_entry_safe(ip, pp, &dev->pending_mmaps,
+ pending_mmaps) {
+ /* Only the creator is allowed to mmap the object */
+ if (context != ip->context || (__u64) offset != ip->offset)
+ continue;
+ /* Don't allow a mmap larger than the object. */
+ if (size > ip->size)
+ break;
+
+ list_del_init(&ip->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+
+ ret = remap_vmalloc_range(vma, ip->obj, 0);
+ if (ret)
+ goto done;
+ vma->vm_ops = &ipath_vm_ops;
+ vma->vm_private_data = ip;
+ ipath_vma_open(vma);
+ goto done;
+ }
+ spin_unlock_irq(&dev->pending_lock);
+done:
+ return ret;
+}
+
+/*
+ * Allocate information for ipath_mmap
+ */
+struct ipath_mmap_info *ipath_create_mmap_info(struct ipath_ibdev *dev,
+ u32 size,
+ struct ib_ucontext *context,
+ void *obj)
+{
+ struct ipath_mmap_info *ip;
+
+ ip = kmalloc(sizeof *ip, GFP_KERNEL);
+ if (!ip)
+ goto bail;
+
+ size = PAGE_ALIGN(size);
+
+ spin_lock_irq(&dev->mmap_offset_lock);
+ if (dev->mmap_offset == 0)
+ dev->mmap_offset = PAGE_SIZE;
+ ip->offset = dev->mmap_offset;
+ dev->mmap_offset += size;
+ spin_unlock_irq(&dev->mmap_offset_lock);
+
+ INIT_LIST_HEAD(&ip->pending_mmaps);
+ ip->size = size;
+ ip->context = context;
+ ip->obj = obj;
+ kref_init(&ip->ref);
+
+bail:
+ return ip;
+}
+
+void ipath_update_mmap_info(struct ipath_ibdev *dev,
+ struct ipath_mmap_info *ip,
+ u32 size, void *obj)
+{
+ size = PAGE_ALIGN(size);
+
+ spin_lock_irq(&dev->mmap_offset_lock);
+ if (dev->mmap_offset == 0)
+ dev->mmap_offset = PAGE_SIZE;
+ ip->offset = dev->mmap_offset;
+ dev->mmap_offset += size;
+ spin_unlock_irq(&dev->mmap_offset_lock);
+
+ ip->size = size;
+ ip->obj = obj;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_mr.c b/drivers/infiniband/hw/ipath/ipath_mr.c
new file mode 100644
index 000000000..c7278f6a8
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_mr.c
@@ -0,0 +1,425 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+
+#include <rdma/ib_umem.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_smi.h>
+
+#include "ipath_verbs.h"
+
+/* Fast memory region */
+struct ipath_fmr {
+ struct ib_fmr ibfmr;
+ u8 page_shift;
+ struct ipath_mregion mr; /* must be last */
+};
+
+static inline struct ipath_fmr *to_ifmr(struct ib_fmr *ibfmr)
+{
+ return container_of(ibfmr, struct ipath_fmr, ibfmr);
+}
+
+/**
+ * ipath_get_dma_mr - get a DMA memory region
+ * @pd: protection domain for this memory region
+ * @acc: access flags
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ * Note that all DMA addresses should be created via the
+ * struct ib_dma_mapping_ops functions (see ipath_dma.c).
+ */
+struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc)
+{
+ struct ipath_mr *mr;
+ struct ib_mr *ret;
+
+ mr = kzalloc(sizeof *mr, GFP_KERNEL);
+ if (!mr) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+
+ mr->mr.access_flags = acc;
+ ret = &mr->ibmr;
+
+bail:
+ return ret;
+}
+
+static struct ipath_mr *alloc_mr(int count,
+ struct ipath_lkey_table *lk_table)
+{
+ struct ipath_mr *mr;
+ int m, i = 0;
+
+ /* Allocate struct plus pointers to first level page tables. */
+ m = (count + IPATH_SEGSZ - 1) / IPATH_SEGSZ;
+ mr = kmalloc(sizeof *mr + m * sizeof mr->mr.map[0], GFP_KERNEL);
+ if (!mr)
+ goto done;
+
+ /* Allocate first level page tables. */
+ for (; i < m; i++) {
+ mr->mr.map[i] = kmalloc(sizeof *mr->mr.map[0], GFP_KERNEL);
+ if (!mr->mr.map[i])
+ goto bail;
+ }
+ mr->mr.mapsz = m;
+
+ /*
+ * ib_reg_phys_mr() will initialize mr->ibmr except for
+ * lkey and rkey.
+ */
+ if (!ipath_alloc_lkey(lk_table, &mr->mr))
+ goto bail;
+ mr->ibmr.rkey = mr->ibmr.lkey = mr->mr.lkey;
+
+ goto done;
+
+bail:
+ while (i) {
+ i--;
+ kfree(mr->mr.map[i]);
+ }
+ kfree(mr);
+ mr = NULL;
+
+done:
+ return mr;
+}
+
+/**
+ * ipath_reg_phys_mr - register a physical memory region
+ * @pd: protection domain for this memory region
+ * @buffer_list: pointer to the list of physical buffers to register
+ * @num_phys_buf: the number of physical buffers to register
+ * @acc: access flags for this memory region
+ * @iova_start: the starting address passed over IB which maps to this MR
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd,
+ struct ib_phys_buf *buffer_list,
+ int num_phys_buf, int acc, u64 *iova_start)
+{
+ struct ipath_mr *mr;
+ int n, m, i;
+ struct ib_mr *ret;
+
+ mr = alloc_mr(num_phys_buf, &to_idev(pd->device)->lk_table);
+ if (mr == NULL) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+
+ mr->mr.pd = pd;
+ mr->mr.user_base = *iova_start;
+ mr->mr.iova = *iova_start;
+ mr->mr.length = 0;
+ mr->mr.offset = 0;
+ mr->mr.access_flags = acc;
+ mr->mr.max_segs = num_phys_buf;
+ mr->umem = NULL;
+
+ m = 0;
+ n = 0;
+ for (i = 0; i < num_phys_buf; i++) {
+ mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr;
+ mr->mr.map[m]->segs[n].length = buffer_list[i].size;
+ mr->mr.length += buffer_list[i].size;
+ n++;
+ if (n == IPATH_SEGSZ) {
+ m++;
+ n = 0;
+ }
+ }
+
+ ret = &mr->ibmr;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_reg_user_mr - register a userspace memory region
+ * @pd: protection domain for this memory region
+ * @start: starting userspace address
+ * @length: length of region to register
+ * @virt_addr: virtual address to use (from HCA's point of view)
+ * @mr_access_flags: access flags for this memory region
+ * @udata: unused by the InfiniPath driver
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt_addr, int mr_access_flags,
+ struct ib_udata *udata)
+{
+ struct ipath_mr *mr;
+ struct ib_umem *umem;
+ int n, m, entry;
+ struct scatterlist *sg;
+ struct ib_mr *ret;
+
+ if (length == 0) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+
+ umem = ib_umem_get(pd->uobject->context, start, length,
+ mr_access_flags, 0);
+ if (IS_ERR(umem))
+ return (void *) umem;
+
+ n = umem->nmap;
+ mr = alloc_mr(n, &to_idev(pd->device)->lk_table);
+ if (!mr) {
+ ret = ERR_PTR(-ENOMEM);
+ ib_umem_release(umem);
+ goto bail;
+ }
+
+ mr->mr.pd = pd;
+ mr->mr.user_base = start;
+ mr->mr.iova = virt_addr;
+ mr->mr.length = length;
+ mr->mr.offset = ib_umem_offset(umem);
+ mr->mr.access_flags = mr_access_flags;
+ mr->mr.max_segs = n;
+ mr->umem = umem;
+
+ m = 0;
+ n = 0;
+ for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+ void *vaddr;
+
+ vaddr = page_address(sg_page(sg));
+ if (!vaddr) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+ mr->mr.map[m]->segs[n].vaddr = vaddr;
+ mr->mr.map[m]->segs[n].length = umem->page_size;
+ n++;
+ if (n == IPATH_SEGSZ) {
+ m++;
+ n = 0;
+ }
+ }
+ ret = &mr->ibmr;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_dereg_mr - unregister and free a memory region
+ * @ibmr: the memory region to free
+ *
+ * Returns 0 on success.
+ *
+ * Note that this is called to free MRs created by ipath_get_dma_mr(),
+ * ipath_reg_phys_mr() or ipath_reg_user_mr().
+ */
+int ipath_dereg_mr(struct ib_mr *ibmr)
+{
+ struct ipath_mr *mr = to_imr(ibmr);
+ int i;
+
+ ipath_free_lkey(&to_idev(ibmr->device)->lk_table, ibmr->lkey);
+ i = mr->mr.mapsz;
+ while (i) {
+ i--;
+ kfree(mr->mr.map[i]);
+ }
+
+ if (mr->umem)
+ ib_umem_release(mr->umem);
+
+ kfree(mr);
+ return 0;
+}
+
+/**
+ * ipath_alloc_fmr - allocate a fast memory region
+ * @pd: the protection domain for this memory region
+ * @mr_access_flags: access flags for this memory region
+ * @fmr_attr: fast memory region attributes
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr)
+{
+ struct ipath_fmr *fmr;
+ int m, i = 0;
+ struct ib_fmr *ret;
+
+ /* Allocate struct plus pointers to first level page tables. */
+ m = (fmr_attr->max_pages + IPATH_SEGSZ - 1) / IPATH_SEGSZ;
+ fmr = kmalloc(sizeof *fmr + m * sizeof fmr->mr.map[0], GFP_KERNEL);
+ if (!fmr)
+ goto bail;
+
+ /* Allocate first level page tables. */
+ for (; i < m; i++) {
+ fmr->mr.map[i] = kmalloc(sizeof *fmr->mr.map[0],
+ GFP_KERNEL);
+ if (!fmr->mr.map[i])
+ goto bail;
+ }
+ fmr->mr.mapsz = m;
+
+ /*
+ * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey &
+ * rkey.
+ */
+ if (!ipath_alloc_lkey(&to_idev(pd->device)->lk_table, &fmr->mr))
+ goto bail;
+ fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mr.lkey;
+ /*
+ * Resources are allocated but no valid mapping (RKEY can't be
+ * used).
+ */
+ fmr->mr.pd = pd;
+ fmr->mr.user_base = 0;
+ fmr->mr.iova = 0;
+ fmr->mr.length = 0;
+ fmr->mr.offset = 0;
+ fmr->mr.access_flags = mr_access_flags;
+ fmr->mr.max_segs = fmr_attr->max_pages;
+ fmr->page_shift = fmr_attr->page_shift;
+
+ ret = &fmr->ibfmr;
+ goto done;
+
+bail:
+ while (i)
+ kfree(fmr->mr.map[--i]);
+ kfree(fmr);
+ ret = ERR_PTR(-ENOMEM);
+
+done:
+ return ret;
+}
+
+/**
+ * ipath_map_phys_fmr - set up a fast memory region
+ * @ibfmr: the fast memory region to set up
+ * @page_list: the list of pages to associate with the fast memory region
+ * @list_len: the number of pages to associate with the fast memory region
+ * @iova: the virtual address of the start of the fast memory region
+ *
+ * This may be called from interrupt context.
+ */
+int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 * page_list,
+ int list_len, u64 iova)
+{
+ struct ipath_fmr *fmr = to_ifmr(ibfmr);
+ struct ipath_lkey_table *rkt;
+ unsigned long flags;
+ int m, n, i;
+ u32 ps;
+ int ret;
+
+ if (list_len > fmr->mr.max_segs) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ rkt = &to_idev(ibfmr->device)->lk_table;
+ spin_lock_irqsave(&rkt->lock, flags);
+ fmr->mr.user_base = iova;
+ fmr->mr.iova = iova;
+ ps = 1 << fmr->page_shift;
+ fmr->mr.length = list_len * ps;
+ m = 0;
+ n = 0;
+ for (i = 0; i < list_len; i++) {
+ fmr->mr.map[m]->segs[n].vaddr = (void *) page_list[i];
+ fmr->mr.map[m]->segs[n].length = ps;
+ if (++n == IPATH_SEGSZ) {
+ m++;
+ n = 0;
+ }
+ }
+ spin_unlock_irqrestore(&rkt->lock, flags);
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_unmap_fmr - unmap fast memory regions
+ * @fmr_list: the list of fast memory regions to unmap
+ *
+ * Returns 0 on success.
+ */
+int ipath_unmap_fmr(struct list_head *fmr_list)
+{
+ struct ipath_fmr *fmr;
+ struct ipath_lkey_table *rkt;
+ unsigned long flags;
+
+ list_for_each_entry(fmr, fmr_list, ibfmr.list) {
+ rkt = &to_idev(fmr->ibfmr.device)->lk_table;
+ spin_lock_irqsave(&rkt->lock, flags);
+ fmr->mr.user_base = 0;
+ fmr->mr.iova = 0;
+ fmr->mr.length = 0;
+ spin_unlock_irqrestore(&rkt->lock, flags);
+ }
+ return 0;
+}
+
+/**
+ * ipath_dealloc_fmr - deallocate a fast memory region
+ * @ibfmr: the fast memory region to deallocate
+ *
+ * Returns 0 on success.
+ */
+int ipath_dealloc_fmr(struct ib_fmr *ibfmr)
+{
+ struct ipath_fmr *fmr = to_ifmr(ibfmr);
+ int i;
+
+ ipath_free_lkey(&to_idev(ibfmr->device)->lk_table, ibfmr->lkey);
+ i = fmr->mr.mapsz;
+ while (i)
+ kfree(fmr->mr.map[--i]);
+ kfree(fmr);
+ return 0;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_qp.c b/drivers/infiniband/hw/ipath/ipath_qp.c
new file mode 100644
index 000000000..face87602
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_qp.c
@@ -0,0 +1,1080 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+#define BITS_PER_PAGE (PAGE_SIZE*BITS_PER_BYTE)
+#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
+#define mk_qpn(qpt, map, off) (((map) - (qpt)->map) * BITS_PER_PAGE + \
+ (off))
+#define find_next_offset(map, off) find_next_zero_bit((map)->page, \
+ BITS_PER_PAGE, off)
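+
+/*
+ * Example (assuming 4 KiB pages): BITS_PER_PAGE is 32768, so QPN 40000
+ * lives in qpt->map[1] at bit offset 7232, and mk_qpn(qpt, &qpt->map[1],
+ * 7232) recovers 40000.
+ */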
+
+/*
+ * Convert the AETH credit code into the number of credits.
+ */
+static u32 credit_table[31] = {
+ 0, /* 0 */
+ 1, /* 1 */
+ 2, /* 2 */
+ 3, /* 3 */
+ 4, /* 4 */
+ 6, /* 5 */
+ 8, /* 6 */
+ 12, /* 7 */
+ 16, /* 8 */
+ 24, /* 9 */
+ 32, /* A */
+ 48, /* B */
+ 64, /* C */
+ 96, /* D */
+ 128, /* E */
+ 192, /* F */
+ 256, /* 10 */
+ 384, /* 11 */
+ 512, /* 12 */
+ 768, /* 13 */
+ 1024, /* 14 */
+ 1536, /* 15 */
+ 2048, /* 16 */
+ 3072, /* 17 */
+ 4096, /* 18 */
+ 6144, /* 19 */
+ 8192, /* 1A */
+ 12288, /* 1B */
+ 16384, /* 1C */
+ 24576, /* 1D */
+ 32768 /* 1E */
+};
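+
+/*
+ * Illustrative note: the 31 defined codes grow roughly exponentially,
+ * so a small credit code field can advertise anywhere from 0 to 32768
+ * credits.
+ */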
+
+
+static void get_map_page(struct ipath_qp_table *qpt, struct qpn_map *map)
+{
+ unsigned long page = get_zeroed_page(GFP_KERNEL);
+ unsigned long flags;
+
+ /*
+ * Free the page if someone raced with us installing it.
+ */
+
+ spin_lock_irqsave(&qpt->lock, flags);
+ if (map->page)
+ free_page(page);
+ else
+ map->page = (void *)page;
+ spin_unlock_irqrestore(&qpt->lock, flags);
+}
+
+
+static int alloc_qpn(struct ipath_qp_table *qpt, enum ib_qp_type type)
+{
+ u32 i, offset, max_scan, qpn;
+ struct qpn_map *map;
+ u32 ret = -1;
+
+ if (type == IB_QPT_SMI)
+ ret = 0;
+ else if (type == IB_QPT_GSI)
+ ret = 1;
+
+ if (ret != -1) {
+ map = &qpt->map[0];
+ if (unlikely(!map->page)) {
+ get_map_page(qpt, map);
+ if (unlikely(!map->page)) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+ }
+ if (!test_and_set_bit(ret, map->page))
+ atomic_dec(&map->n_free);
+ else
+ ret = -EBUSY;
+ goto bail;
+ }
+
+ qpn = qpt->last + 1;
+ if (qpn >= QPN_MAX)
+ qpn = 2;
+ offset = qpn & BITS_PER_PAGE_MASK;
+ map = &qpt->map[qpn / BITS_PER_PAGE];
+ max_scan = qpt->nmaps - !offset;
+ for (i = 0;;) {
+ if (unlikely(!map->page)) {
+ get_map_page(qpt, map);
+ if (unlikely(!map->page))
+ break;
+ }
+ if (likely(atomic_read(&map->n_free))) {
+ do {
+ if (!test_and_set_bit(offset, map->page)) {
+ atomic_dec(&map->n_free);
+ qpt->last = qpn;
+ ret = qpn;
+ goto bail;
+ }
+ offset = find_next_offset(map, offset);
+ qpn = mk_qpn(qpt, map, offset);
+ /*
+ * This test differs from alloc_pidmap().
+ * If find_next_offset() does find a zero
+ * bit, we don't need to check for QPN
+ * wrapping around past our starting QPN.
+ * We just need to be sure we don't loop
+ * forever.
+ */
+ } while (offset < BITS_PER_PAGE && qpn < QPN_MAX);
+ }
+ /*
+ * In order to keep the number of pages allocated to a
+ * minimum, we scan all the existing pages before increasing
+ * the size of the bitmap table.
+ */
+ if (++i > max_scan) {
+ if (qpt->nmaps == QPNMAP_ENTRIES)
+ break;
+ map = &qpt->map[qpt->nmaps++];
+ offset = 0;
+ } else if (map < &qpt->map[qpt->nmaps]) {
+ ++map;
+ offset = 0;
+ } else {
+ map = &qpt->map[0];
+ offset = 2;
+ }
+ qpn = mk_qpn(qpt, map, offset);
+ }
+
+ ret = -ENOMEM;
+
+bail:
+ return ret;
+}
+
+static void free_qpn(struct ipath_qp_table *qpt, u32 qpn)
+{
+ struct qpn_map *map;
+
+ map = qpt->map + qpn / BITS_PER_PAGE;
+ if (map->page)
+ clear_bit(qpn & BITS_PER_PAGE_MASK, map->page);
+ atomic_inc(&map->n_free);
+}
+
+/**
+ * ipath_alloc_qpn - allocate a QP number
+ * @qpt: the QP table
+ * @qp: the QP
+ * @type: the QP type (IB_QPT_SMI and IB_QPT_GSI are special)
+ *
+ * Allocate the next available QPN and put the QP into the hash table.
+ * The hash table holds a reference to the QP.
+ */
+static int ipath_alloc_qpn(struct ipath_qp_table *qpt, struct ipath_qp *qp,
+ enum ib_qp_type type)
+{
+ unsigned long flags;
+ int ret;
+
+ ret = alloc_qpn(qpt, type);
+ if (ret < 0)
+ goto bail;
+ qp->ibqp.qp_num = ret;
+
+ /* Add the QP to the hash table. */
+ spin_lock_irqsave(&qpt->lock, flags);
+
+ ret %= qpt->max;
+ qp->next = qpt->table[ret];
+ qpt->table[ret] = qp;
+ atomic_inc(&qp->refcount);
+
+ spin_unlock_irqrestore(&qpt->lock, flags);
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_free_qp - remove a QP from the QP table
+ * @qpt: the QP table
+ * @qp: the QP to remove
+ *
+ * Remove the QP from the table so it can't be found asynchronously by
+ * the receive interrupt routine.
+ */
+static void ipath_free_qp(struct ipath_qp_table *qpt, struct ipath_qp *qp)
+{
+ struct ipath_qp *q, **qpp;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qpt->lock, flags);
+
+ /* Remove QP from the hash table. */
+ qpp = &qpt->table[qp->ibqp.qp_num % qpt->max];
+ for (; (q = *qpp) != NULL; qpp = &q->next) {
+ if (q == qp) {
+ *qpp = qp->next;
+ qp->next = NULL;
+ atomic_dec(&qp->refcount);
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&qpt->lock, flags);
+}
+
+/**
+ * ipath_free_all_qps - check for QPs still in use
+ * @qpt: the QP table to empty
+ *
+ * There should not be any QPs still in use.
+ * Free the QPN bitmap pages.
+ */
+unsigned ipath_free_all_qps(struct ipath_qp_table *qpt)
+{
+ unsigned long flags;
+ struct ipath_qp *qp;
+ u32 n, qp_inuse = 0;
+
+ spin_lock_irqsave(&qpt->lock, flags);
+ for (n = 0; n < qpt->max; n++) {
+ qp = qpt->table[n];
+ qpt->table[n] = NULL;
+
+ for (; qp; qp = qp->next)
+ qp_inuse++;
+ }
+ spin_unlock_irqrestore(&qpt->lock, flags);
+
+ for (n = 0; n < ARRAY_SIZE(qpt->map); n++)
+ if (qpt->map[n].page)
+ free_page((unsigned long) qpt->map[n].page);
+ return qp_inuse;
+}
+
+/**
+ * ipath_lookup_qpn - return the QP with the given QPN
+ * @qpt: the QP table
+ * @qpn: the QP number to look up
+ *
+ * The caller is responsible for decrementing the QP reference count
+ * when done.
+ */
+struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn)
+{
+ unsigned long flags;
+ struct ipath_qp *qp;
+
+ spin_lock_irqsave(&qpt->lock, flags);
+
+ for (qp = qpt->table[qpn % qpt->max]; qp; qp = qp->next) {
+ if (qp->ibqp.qp_num == qpn) {
+ atomic_inc(&qp->refcount);
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&qpt->lock, flags);
+ return qp;
+}
+
+/**
+ * ipath_reset_qp - initialize the QP state to the reset state
+ * @qp: the QP to reset
+ * @type: the QP type
+ */
+static void ipath_reset_qp(struct ipath_qp *qp, enum ib_qp_type type)
+{
+ qp->remote_qpn = 0;
+ qp->qkey = 0;
+ qp->qp_access_flags = 0;
+ atomic_set(&qp->s_dma_busy, 0);
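+	/* Keep only the completion signaling choice across the reset. */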
+ qp->s_flags &= IPATH_S_SIGNAL_REQ_WR;
+ qp->s_hdrwords = 0;
+ qp->s_wqe = NULL;
+ qp->s_pkt_delay = 0;
+ qp->s_draining = 0;
+ qp->s_psn = 0;
+ qp->r_psn = 0;
+ qp->r_msn = 0;
+ if (type == IB_QPT_RC) {
+ qp->s_state = IB_OPCODE_RC_SEND_LAST;
+ qp->r_state = IB_OPCODE_RC_SEND_LAST;
+ } else {
+ qp->s_state = IB_OPCODE_UC_SEND_LAST;
+ qp->r_state = IB_OPCODE_UC_SEND_LAST;
+ }
+ qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
+ qp->r_nak_state = 0;
+ qp->r_aflags = 0;
+ qp->r_flags = 0;
+ qp->s_rnr_timeout = 0;
+ qp->s_head = 0;
+ qp->s_tail = 0;
+ qp->s_cur = 0;
+ qp->s_last = 0;
+ qp->s_ssn = 1;
+ qp->s_lsn = 0;
+ memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue));
+ qp->r_head_ack_queue = 0;
+ qp->s_tail_ack_queue = 0;
+ qp->s_num_rd_atomic = 0;
+ if (qp->r_rq.wq) {
+ qp->r_rq.wq->head = 0;
+ qp->r_rq.wq->tail = 0;
+ }
+}
+
+/**
+ * ipath_error_qp - put a QP into the error state
+ * @qp: the QP to put into the error state
+ * @err: the receive completion error to signal if an RWQE is active
+ *
+ * Flushes both send and receive work queues.
+ * Returns true if last WQE event should be generated.
+ * The QP s_lock should be held and interrupts disabled.
+ * If we are already in error state, just return.
+ */
+
+int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err)
+{
+ struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+ struct ib_wc wc;
+ int ret = 0;
+
+ if (qp->state == IB_QPS_ERR)
+ goto bail;
+
+ qp->state = IB_QPS_ERR;
+
+ spin_lock(&dev->pending_lock);
+ if (!list_empty(&qp->timerwait))
+ list_del_init(&qp->timerwait);
+ if (!list_empty(&qp->piowait))
+ list_del_init(&qp->piowait);
+ spin_unlock(&dev->pending_lock);
+
+ /* Schedule the sending tasklet to drain the send work queue. */
+ if (qp->s_last != qp->s_head)
+ ipath_schedule_send(qp);
+
+ memset(&wc, 0, sizeof(wc));
+ wc.qp = &qp->ibqp;
+ wc.opcode = IB_WC_RECV;
+
+ if (test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags)) {
+ wc.wr_id = qp->r_wr_id;
+ wc.status = err;
+ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+ }
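+	/* Flush any RWQEs still queued on the receive queue. */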
+ wc.status = IB_WC_WR_FLUSH_ERR;
+
+ if (qp->r_rq.wq) {
+ struct ipath_rwq *wq;
+ u32 head;
+ u32 tail;
+
+ spin_lock(&qp->r_rq.lock);
+
+ /* sanity check pointers before trusting them */
+ wq = qp->r_rq.wq;
+ head = wq->head;
+ if (head >= qp->r_rq.size)
+ head = 0;
+ tail = wq->tail;
+ if (tail >= qp->r_rq.size)
+ tail = 0;
+ while (tail != head) {
+ wc.wr_id = get_rwqe_ptr(&qp->r_rq, tail)->wr_id;
+ if (++tail >= qp->r_rq.size)
+ tail = 0;
+ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+ }
+ wq->tail = tail;
+
+ spin_unlock(&qp->r_rq.lock);
+ } else if (qp->ibqp.event_handler)
+ ret = 1;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_modify_qp - modify the attributes of a queue pair
+ * @ibqp: the queue pair whose attributes we're modifying
+ * @attr: the new attributes
+ * @attr_mask: the mask of attributes to modify
+ * @udata: user data for ipathverbs.so
+ *
+ * Returns 0 on success, otherwise returns an errno.
+ */
+int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ struct ipath_ibdev *dev = to_idev(ibqp->device);
+ struct ipath_qp *qp = to_iqp(ibqp);
+ enum ib_qp_state cur_state, new_state;
+ int lastwqe = 0;
+ int ret;
+
+ spin_lock_irq(&qp->s_lock);
+
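+	/*
+	 * If IB_QP_CUR_STATE is set, validate the transition against the
+	 * caller's view of the current state instead of qp->state.
+	 */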
+ cur_state = attr_mask & IB_QP_CUR_STATE ?
+ attr->cur_qp_state : qp->state;
+ new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+ if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
+ attr_mask, IB_LINK_LAYER_UNSPECIFIED))
+ goto inval;
+
+ if (attr_mask & IB_QP_AV) {
+ if (attr->ah_attr.dlid == 0 ||
+ attr->ah_attr.dlid >= IPATH_MULTICAST_LID_BASE)
+ goto inval;
+
+ if ((attr->ah_attr.ah_flags & IB_AH_GRH) &&
+ (attr->ah_attr.grh.sgid_index > 1))
+ goto inval;
+ }
+
+ if (attr_mask & IB_QP_PKEY_INDEX)
+ if (attr->pkey_index >= ipath_get_npkeys(dev->dd))
+ goto inval;
+
+ if (attr_mask & IB_QP_MIN_RNR_TIMER)
+ if (attr->min_rnr_timer > 31)
+ goto inval;
+
+ if (attr_mask & IB_QP_PORT)
+ if (attr->port_num == 0 ||
+ attr->port_num > ibqp->device->phys_port_cnt)
+ goto inval;
+
+ /*
+	 * Don't allow invalid Path MTU values, or values greater than
+	 * 2048 unless we are configured for a 4KB MTU.
+ */
+ if ((attr_mask & IB_QP_PATH_MTU) &&
+ (ib_mtu_enum_to_int(attr->path_mtu) == -1 ||
+ (attr->path_mtu > IB_MTU_2048 && !ipath_mtu4096)))
+ goto inval;
+
+ if (attr_mask & IB_QP_PATH_MIG_STATE)
+ if (attr->path_mig_state != IB_MIG_MIGRATED &&
+ attr->path_mig_state != IB_MIG_REARM)
+ goto inval;
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+ if (attr->max_dest_rd_atomic > IPATH_MAX_RDMA_ATOMIC)
+ goto inval;
+
+ switch (new_state) {
+ case IB_QPS_RESET:
+ if (qp->state != IB_QPS_RESET) {
+ qp->state = IB_QPS_RESET;
+ spin_lock(&dev->pending_lock);
+ if (!list_empty(&qp->timerwait))
+ list_del_init(&qp->timerwait);
+ if (!list_empty(&qp->piowait))
+ list_del_init(&qp->piowait);
+ spin_unlock(&dev->pending_lock);
+ qp->s_flags &= ~IPATH_S_ANY_WAIT;
+ spin_unlock_irq(&qp->s_lock);
+ /* Stop the sending tasklet */
+ tasklet_kill(&qp->s_task);
+ wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy));
+ spin_lock_irq(&qp->s_lock);
+ }
+ ipath_reset_qp(qp, ibqp->qp_type);
+ break;
+
+ case IB_QPS_SQD:
+ qp->s_draining = qp->s_last != qp->s_cur;
+ qp->state = new_state;
+ break;
+
+ case IB_QPS_SQE:
+ if (qp->ibqp.qp_type == IB_QPT_RC)
+ goto inval;
+ qp->state = new_state;
+ break;
+
+ case IB_QPS_ERR:
+ lastwqe = ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ break;
+
+ default:
+ qp->state = new_state;
+ break;
+ }
+
+ if (attr_mask & IB_QP_PKEY_INDEX)
+ qp->s_pkey_index = attr->pkey_index;
+
+ if (attr_mask & IB_QP_DEST_QPN)
+ qp->remote_qpn = attr->dest_qp_num;
+
+ if (attr_mask & IB_QP_SQ_PSN) {
+ qp->s_psn = qp->s_next_psn = attr->sq_psn;
+ qp->s_last_psn = qp->s_next_psn - 1;
+ }
+
+ if (attr_mask & IB_QP_RQ_PSN)
+ qp->r_psn = attr->rq_psn;
+
+ if (attr_mask & IB_QP_ACCESS_FLAGS)
+ qp->qp_access_flags = attr->qp_access_flags;
+
+ if (attr_mask & IB_QP_AV) {
+ qp->remote_ah_attr = attr->ah_attr;
+ qp->s_dmult = ipath_ib_rate_to_mult(attr->ah_attr.static_rate);
+ }
+
+ if (attr_mask & IB_QP_PATH_MTU)
+ qp->path_mtu = attr->path_mtu;
+
+ if (attr_mask & IB_QP_RETRY_CNT)
+ qp->s_retry = qp->s_retry_cnt = attr->retry_cnt;
+
+ if (attr_mask & IB_QP_RNR_RETRY) {
+ qp->s_rnr_retry = attr->rnr_retry;
+ if (qp->s_rnr_retry > 7)
+ qp->s_rnr_retry = 7;
+ qp->s_rnr_retry_cnt = qp->s_rnr_retry;
+ }
+
+ if (attr_mask & IB_QP_MIN_RNR_TIMER)
+ qp->r_min_rnr_timer = attr->min_rnr_timer;
+
+ if (attr_mask & IB_QP_TIMEOUT)
+ qp->timeout = attr->timeout;
+
+ if (attr_mask & IB_QP_QKEY)
+ qp->qkey = attr->qkey;
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+ qp->r_max_rd_atomic = attr->max_dest_rd_atomic;
+
+ if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC)
+ qp->s_max_rd_atomic = attr->max_rd_atomic;
+
+ spin_unlock_irq(&qp->s_lock);
+
+ if (lastwqe) {
+ struct ib_event ev;
+
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+ qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+ }
+ ret = 0;
+ goto bail;
+
+inval:
+ spin_unlock_irq(&qp->s_lock);
+ ret = -EINVAL;
+
+bail:
+ return ret;
+}
+
+int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_qp_init_attr *init_attr)
+{
+ struct ipath_qp *qp = to_iqp(ibqp);
+
+ attr->qp_state = qp->state;
+ attr->cur_qp_state = attr->qp_state;
+ attr->path_mtu = qp->path_mtu;
+ attr->path_mig_state = 0;
+ attr->qkey = qp->qkey;
+ attr->rq_psn = qp->r_psn;
+ attr->sq_psn = qp->s_next_psn;
+ attr->dest_qp_num = qp->remote_qpn;
+ attr->qp_access_flags = qp->qp_access_flags;
+ attr->cap.max_send_wr = qp->s_size - 1;
+ attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1;
+ attr->cap.max_send_sge = qp->s_max_sge;
+ attr->cap.max_recv_sge = qp->r_rq.max_sge;
+ attr->cap.max_inline_data = 0;
+ attr->ah_attr = qp->remote_ah_attr;
+ memset(&attr->alt_ah_attr, 0, sizeof(attr->alt_ah_attr));
+ attr->pkey_index = qp->s_pkey_index;
+ attr->alt_pkey_index = 0;
+ attr->en_sqd_async_notify = 0;
+ attr->sq_draining = qp->s_draining;
+ attr->max_rd_atomic = qp->s_max_rd_atomic;
+ attr->max_dest_rd_atomic = qp->r_max_rd_atomic;
+ attr->min_rnr_timer = qp->r_min_rnr_timer;
+ attr->port_num = 1;
+ attr->timeout = qp->timeout;
+ attr->retry_cnt = qp->s_retry_cnt;
+ attr->rnr_retry = qp->s_rnr_retry_cnt;
+ attr->alt_port_num = 0;
+ attr->alt_timeout = 0;
+
+ init_attr->event_handler = qp->ibqp.event_handler;
+ init_attr->qp_context = qp->ibqp.qp_context;
+ init_attr->send_cq = qp->ibqp.send_cq;
+ init_attr->recv_cq = qp->ibqp.recv_cq;
+ init_attr->srq = qp->ibqp.srq;
+ init_attr->cap = attr->cap;
+ if (qp->s_flags & IPATH_S_SIGNAL_REQ_WR)
+ init_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+ else
+ init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
+ init_attr->qp_type = qp->ibqp.qp_type;
+ init_attr->port_num = 1;
+ return 0;
+}
+
+/**
+ * ipath_compute_aeth - compute the AETH (syndrome + MSN)
+ * @qp: the queue pair to compute the AETH for
+ *
+ * Returns the AETH.
+ */
+__be32 ipath_compute_aeth(struct ipath_qp *qp)
+{
+ u32 aeth = qp->r_msn & IPATH_MSN_MASK;
+
+ if (qp->ibqp.srq) {
+ /*
+ * Shared receive queues don't generate credits.
+ * Set the credit field to the invalid value.
+ */
+ aeth |= IPATH_AETH_CREDIT_INVAL << IPATH_AETH_CREDIT_SHIFT;
+ } else {
+ u32 min, max, x;
+ u32 credits;
+ struct ipath_rwq *wq = qp->r_rq.wq;
+ u32 head;
+ u32 tail;
+
+ /* sanity check pointers before trusting them */
+ head = wq->head;
+ if (head >= qp->r_rq.size)
+ head = 0;
+ tail = wq->tail;
+ if (tail >= qp->r_rq.size)
+ tail = 0;
+ /*
+ * Compute the number of credits available (RWQEs).
+ * XXX Not holding the r_rq.lock here so there is a small
+ * chance that the pair of reads are not atomic.
+ */
+ credits = head - tail;
+ if ((int)credits < 0)
+ credits += qp->r_rq.size;
+ /*
+ * Binary search the credit table to find the code to
+ * use.
+ */
+ min = 0;
+ max = 31;
+ for (;;) {
+ x = (min + max) / 2;
+ if (credit_table[x] == credits)
+ break;
+ if (credit_table[x] > credits)
+ max = x;
+ else if (min == x)
+ break;
+ else
+ min = x;
+ }
+ aeth |= x << IPATH_AETH_CREDIT_SHIFT;
+ }
+ return cpu_to_be32(aeth);
+}
+
+/**
+ * ipath_create_qp - create a queue pair for a device
+ * @ibpd: the protection domain whose device we create the queue pair for
+ * @init_attr: the attributes of the queue pair
+ * @udata: unused by InfiniPath
+ *
+ * Returns the queue pair on success, otherwise returns an errno.
+ *
+ * Called by the ib_create_qp() core verbs function.
+ */
+struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct ipath_qp *qp;
+ int err;
+ struct ipath_swqe *swq = NULL;
+ struct ipath_ibdev *dev;
+ size_t sz;
+ size_t sg_list_sz;
+ struct ib_qp *ret;
+
+ if (init_attr->create_flags) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+
+ if (init_attr->cap.max_send_sge > ib_ipath_max_sges ||
+ init_attr->cap.max_send_wr > ib_ipath_max_qp_wrs) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+
+ /* Check receive queue parameters if no SRQ is specified. */
+ if (!init_attr->srq) {
+ if (init_attr->cap.max_recv_sge > ib_ipath_max_sges ||
+ init_attr->cap.max_recv_wr > ib_ipath_max_qp_wrs) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+ if (init_attr->cap.max_send_sge +
+ init_attr->cap.max_send_wr +
+ init_attr->cap.max_recv_sge +
+ init_attr->cap.max_recv_wr == 0) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+ }
+
+ switch (init_attr->qp_type) {
+ case IB_QPT_UC:
+ case IB_QPT_RC:
+ case IB_QPT_UD:
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ sz = sizeof(struct ipath_sge) *
+ init_attr->cap.max_send_sge +
+ sizeof(struct ipath_swqe);
+ swq = vmalloc((init_attr->cap.max_send_wr + 1) * sz);
+ if (swq == NULL) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+ sz = sizeof(*qp);
+ sg_list_sz = 0;
+ if (init_attr->srq) {
+ struct ipath_srq *srq = to_isrq(init_attr->srq);
+
+ if (srq->rq.max_sge > 1)
+ sg_list_sz = sizeof(*qp->r_sg_list) *
+ (srq->rq.max_sge - 1);
+ } else if (init_attr->cap.max_recv_sge > 1)
+ sg_list_sz = sizeof(*qp->r_sg_list) *
+ (init_attr->cap.max_recv_sge - 1);
+ qp = kmalloc(sz + sg_list_sz, GFP_KERNEL);
+ if (!qp) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_swq;
+ }
+ if (sg_list_sz && (init_attr->qp_type == IB_QPT_UD ||
+ init_attr->qp_type == IB_QPT_SMI ||
+ init_attr->qp_type == IB_QPT_GSI)) {
+ qp->r_ud_sg_list = kmalloc(sg_list_sz, GFP_KERNEL);
+ if (!qp->r_ud_sg_list) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_qp;
+ }
+ } else
+ qp->r_ud_sg_list = NULL;
+ if (init_attr->srq) {
+ sz = 0;
+ qp->r_rq.size = 0;
+ qp->r_rq.max_sge = 0;
+ qp->r_rq.wq = NULL;
+ init_attr->cap.max_recv_wr = 0;
+ init_attr->cap.max_recv_sge = 0;
+ } else {
+ qp->r_rq.size = init_attr->cap.max_recv_wr + 1;
+ qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
+ sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
+ sizeof(struct ipath_rwqe);
+ qp->r_rq.wq = vmalloc_user(sizeof(struct ipath_rwq) +
+ qp->r_rq.size * sz);
+ if (!qp->r_rq.wq) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_sg_list;
+ }
+ }
+
+ /*
+ * ib_create_qp() will initialize qp->ibqp
+ * except for qp->ibqp.qp_num.
+ */
+ spin_lock_init(&qp->s_lock);
+ spin_lock_init(&qp->r_rq.lock);
+ atomic_set(&qp->refcount, 0);
+ init_waitqueue_head(&qp->wait);
+ init_waitqueue_head(&qp->wait_dma);
+ tasklet_init(&qp->s_task, ipath_do_send, (unsigned long)qp);
+ INIT_LIST_HEAD(&qp->piowait);
+ INIT_LIST_HEAD(&qp->timerwait);
+ qp->state = IB_QPS_RESET;
+ qp->s_wq = swq;
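+		/* One extra slot so head == tail distinguishes an empty ring. */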
+ qp->s_size = init_attr->cap.max_send_wr + 1;
+ qp->s_max_sge = init_attr->cap.max_send_sge;
+ if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
+ qp->s_flags = IPATH_S_SIGNAL_REQ_WR;
+ else
+ qp->s_flags = 0;
+ dev = to_idev(ibpd->device);
+ err = ipath_alloc_qpn(&dev->qp_table, qp,
+ init_attr->qp_type);
+ if (err) {
+ ret = ERR_PTR(err);
+ vfree(qp->r_rq.wq);
+ goto bail_sg_list;
+ }
+ qp->ip = NULL;
+ qp->s_tx = NULL;
+ ipath_reset_qp(qp, init_attr->qp_type);
+ break;
+
+ default:
+ /* Don't support raw QPs */
+ ret = ERR_PTR(-ENOSYS);
+ goto bail;
+ }
+
+ init_attr->cap.max_inline_data = 0;
+
+ /*
+ * Return the address of the RWQ as the offset to mmap.
+ * See ipath_mmap() for details.
+ */
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ if (!qp->r_rq.wq) {
+ __u64 offset = 0;
+
+ err = ib_copy_to_udata(udata, &offset,
+ sizeof(offset));
+ if (err) {
+ ret = ERR_PTR(err);
+ goto bail_ip;
+ }
+ } else {
+ u32 s = sizeof(struct ipath_rwq) +
+ qp->r_rq.size * sz;
+
+ qp->ip =
+ ipath_create_mmap_info(dev, s,
+ ibpd->uobject->context,
+ qp->r_rq.wq);
+ if (!qp->ip) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_ip;
+ }
+
+ err = ib_copy_to_udata(udata, &(qp->ip->offset),
+ sizeof(qp->ip->offset));
+ if (err) {
+ ret = ERR_PTR(err);
+ goto bail_ip;
+ }
+ }
+ }
+
+ spin_lock(&dev->n_qps_lock);
+ if (dev->n_qps_allocated == ib_ipath_max_qps) {
+ spin_unlock(&dev->n_qps_lock);
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_ip;
+ }
+
+ dev->n_qps_allocated++;
+ spin_unlock(&dev->n_qps_lock);
+
+ if (qp->ip) {
+ spin_lock_irq(&dev->pending_lock);
+ list_add(&qp->ip->pending_mmaps, &dev->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+ }
+
+ ret = &qp->ibqp;
+ goto bail;
+
+bail_ip:
+ if (qp->ip)
+ kref_put(&qp->ip->ref, ipath_release_mmap_info);
+ else
+ vfree(qp->r_rq.wq);
+ ipath_free_qp(&dev->qp_table, qp);
+ free_qpn(&dev->qp_table, qp->ibqp.qp_num);
+bail_sg_list:
+ kfree(qp->r_ud_sg_list);
+bail_qp:
+ kfree(qp);
+bail_swq:
+ vfree(swq);
+bail:
+ return ret;
+}
+
+/**
+ * ipath_destroy_qp - destroy a queue pair
+ * @ibqp: the queue pair to destroy
+ *
+ * Returns 0 on success.
+ *
+ * Note that this can be called while the QP is actively sending or
+ * receiving!
+ */
+int ipath_destroy_qp(struct ib_qp *ibqp)
+{
+ struct ipath_qp *qp = to_iqp(ibqp);
+ struct ipath_ibdev *dev = to_idev(ibqp->device);
+
+ /* Make sure HW and driver activity is stopped. */
+ spin_lock_irq(&qp->s_lock);
+ if (qp->state != IB_QPS_RESET) {
+ qp->state = IB_QPS_RESET;
+ spin_lock(&dev->pending_lock);
+ if (!list_empty(&qp->timerwait))
+ list_del_init(&qp->timerwait);
+ if (!list_empty(&qp->piowait))
+ list_del_init(&qp->piowait);
+ spin_unlock(&dev->pending_lock);
+ qp->s_flags &= ~IPATH_S_ANY_WAIT;
+ spin_unlock_irq(&qp->s_lock);
+ /* Stop the sending tasklet */
+ tasklet_kill(&qp->s_task);
+ wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy));
+ } else
+ spin_unlock_irq(&qp->s_lock);
+
+ ipath_free_qp(&dev->qp_table, qp);
+
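+	/*
+	 * Return any send DMA descriptor still held by this QP to the
+	 * device's free list and drop the reference it held.
+	 */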
+ if (qp->s_tx) {
+ atomic_dec(&qp->refcount);
+ if (qp->s_tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF)
+ kfree(qp->s_tx->txreq.map_addr);
+ spin_lock_irq(&dev->pending_lock);
+ list_add(&qp->s_tx->txreq.list, &dev->txreq_free);
+ spin_unlock_irq(&dev->pending_lock);
+ qp->s_tx = NULL;
+ }
+
+ wait_event(qp->wait, !atomic_read(&qp->refcount));
+
+ /* all user's cleaned up, mark it available */
+ free_qpn(&dev->qp_table, qp->ibqp.qp_num);
+ spin_lock(&dev->n_qps_lock);
+ dev->n_qps_allocated--;
+ spin_unlock(&dev->n_qps_lock);
+
+ if (qp->ip)
+ kref_put(&qp->ip->ref, ipath_release_mmap_info);
+ else
+ vfree(qp->r_rq.wq);
+ kfree(qp->r_ud_sg_list);
+ vfree(qp->s_wq);
+ kfree(qp);
+ return 0;
+}
+
+/**
+ * ipath_init_qp_table - initialize the QP table for a device
+ * @idev: the device whose QP table we're initializing
+ * @size: the size of the QP table
+ *
+ * Returns 0 on success, otherwise returns an errno.
+ */
+int ipath_init_qp_table(struct ipath_ibdev *idev, int size)
+{
+ int i;
+ int ret;
+
+ idev->qp_table.last = 1; /* QPN 0 and 1 are special. */
+ idev->qp_table.max = size;
+ idev->qp_table.nmaps = 1;
+ idev->qp_table.table = kzalloc(size * sizeof(*idev->qp_table.table),
+ GFP_KERNEL);
+ if (idev->qp_table.table == NULL) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ for (i = 0; i < ARRAY_SIZE(idev->qp_table.map); i++) {
+ atomic_set(&idev->qp_table.map[i].n_free, BITS_PER_PAGE);
+ idev->qp_table.map[i].page = NULL;
+ }
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_get_credit - process the credit field of an incoming AETH
+ * @qp: the QP whose credit state is updated
+ * @aeth: the Acknowledge Extended Transport Header
+ *
+ * The QP s_lock should be held.
+ */
+void ipath_get_credit(struct ipath_qp *qp, u32 aeth)
+{
+ u32 credit = (aeth >> IPATH_AETH_CREDIT_SHIFT) & IPATH_AETH_CREDIT_MASK;
+
+ /*
+ * If the credit is invalid, we can send
+ * as many packets as we like. Otherwise, we have to
+ * honor the credit field.
+ */
+ if (credit == IPATH_AETH_CREDIT_INVAL)
+ qp->s_lsn = (u32) -1;
+ else if (qp->s_lsn != (u32) -1) {
+ /* Compute new LSN (i.e., MSN + credit) */
+ credit = (aeth + credit_table[credit]) & IPATH_MSN_MASK;
+ if (ipath_cmp24(credit, qp->s_lsn) > 0)
+ qp->s_lsn = credit;
+ }
+
+ /* Restart sending if it was blocked due to lack of credits. */
+ if ((qp->s_flags & IPATH_S_WAIT_SSN_CREDIT) &&
+ qp->s_cur != qp->s_head &&
+ (qp->s_lsn == (u32) -1 ||
+ ipath_cmp24(get_swqe_ptr(qp, qp->s_cur)->ssn,
+ qp->s_lsn + 1) <= 0))
+ ipath_schedule_send(qp);
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_rc.c b/drivers/infiniband/hw/ipath/ipath_rc.c
new file mode 100644
index 000000000..79b3dbc97
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_rc.c
@@ -0,0 +1,1969 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/io.h>
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_RC_##x
+
+static u32 restart_sge(struct ipath_sge_state *ss, struct ipath_swqe *wqe,
+ u32 psn, u32 pmtu)
+{
+ u32 len;
+
+ len = ((psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
+ ss->sge = wqe->sg_list[0];
+ ss->sg_list = wqe->sg_list + 1;
+ ss->num_sge = wqe->wr.num_sge;
+ ipath_skip_sge(ss, len);
+ return wqe->length - len;
+}
+
+/**
+ * ipath_init_restart - initialize the qp->s_sge after a restart
+ * @qp: the QP whose SGE we're restarting
+ * @wqe: the work queue to initialize the QP's SGE from
+ *
+ * The QP s_lock should be held and interrupts disabled.
+ */
+static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe)
+{
+ struct ipath_ibdev *dev;
+
+ qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn,
+ ib_mtu_enum_to_int(qp->path_mtu));
+ dev = to_idev(qp->ibqp.device);
+ spin_lock(&dev->pending_lock);
+ if (list_empty(&qp->timerwait))
+ list_add_tail(&qp->timerwait,
+ &dev->pending[dev->pending_index]);
+ spin_unlock(&dev->pending_lock);
+}
+
+/**
+ * ipath_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
+ * @dev: the device this QP belongs to
+ * @qp: a pointer to the QP
+ * @ohdr: a pointer to the IB header being constructed
+ * @pmtu: the path MTU
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ * Note that we are in the responder's side of the QP context.
+ * Note the QP s_lock must be held.
+ */
+static int ipath_make_rc_ack(struct ipath_ibdev *dev, struct ipath_qp *qp,
+ struct ipath_other_headers *ohdr, u32 pmtu)
+{
+ struct ipath_ack_entry *e;
+ u32 hwords;
+ u32 len;
+ u32 bth0;
+ u32 bth2;
+
+ /* Don't send an ACK if we aren't supposed to. */
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+ goto bail;
+
+ /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+ hwords = 5;
+
+ switch (qp->s_ack_state) {
+ case OP(RDMA_READ_RESPONSE_LAST):
+ case OP(RDMA_READ_RESPONSE_ONLY):
+ case OP(ATOMIC_ACKNOWLEDGE):
+ /*
+ * We can increment the tail pointer now that the last
+ * response has been sent instead of only being
+ * constructed.
+ */
+ if (++qp->s_tail_ack_queue > IPATH_MAX_RDMA_ATOMIC)
+ qp->s_tail_ack_queue = 0;
+ /* FALLTHROUGH */
+ case OP(SEND_ONLY):
+ case OP(ACKNOWLEDGE):
+ /* Check for no next entry in the queue. */
+ if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
+ if (qp->s_flags & IPATH_S_ACK_PENDING)
+ goto normal;
+ qp->s_ack_state = OP(ACKNOWLEDGE);
+ goto bail;
+ }
+
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ if (e->opcode == OP(RDMA_READ_REQUEST)) {
+ /* Copy SGE state in case we need to resend */
+ qp->s_ack_rdma_sge = e->rdma_sge;
+ qp->s_cur_sge = &qp->s_ack_rdma_sge;
+ len = e->rdma_sge.sge.sge_length;
+ if (len > pmtu) {
+ len = pmtu;
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
+ } else {
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
+ e->sent = 1;
+ }
+ ohdr->u.aeth = ipath_compute_aeth(qp);
+ hwords++;
+ qp->s_ack_rdma_psn = e->psn;
+ bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
+ } else {
+ /* COMPARE_SWAP or FETCH_ADD */
+ qp->s_cur_sge = NULL;
+ len = 0;
+ qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
+ ohdr->u.at.aeth = ipath_compute_aeth(qp);
+ ohdr->u.at.atomic_ack_eth[0] =
+ cpu_to_be32(e->atomic_data >> 32);
+ ohdr->u.at.atomic_ack_eth[1] =
+ cpu_to_be32(e->atomic_data);
+ hwords += sizeof(ohdr->u.at) / sizeof(u32);
+ bth2 = e->psn;
+ e->sent = 1;
+ }
+ bth0 = qp->s_ack_state << 24;
+ break;
+
+ case OP(RDMA_READ_RESPONSE_FIRST):
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
+ /* FALLTHROUGH */
+ case OP(RDMA_READ_RESPONSE_MIDDLE):
+ len = qp->s_ack_rdma_sge.sge.sge_length;
+ if (len > pmtu)
+ len = pmtu;
+ else {
+ ohdr->u.aeth = ipath_compute_aeth(qp);
+ hwords++;
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+ qp->s_ack_queue[qp->s_tail_ack_queue].sent = 1;
+ }
+ bth0 = qp->s_ack_state << 24;
+ bth2 = qp->s_ack_rdma_psn++ & IPATH_PSN_MASK;
+ break;
+
+ default:
+ normal:
+ /*
+ * Send a regular ACK.
+ * Set the s_ack_state so we wait until after sending
+ * the ACK before setting s_ack_state to ACKNOWLEDGE
+ * (see above).
+ */
+ qp->s_ack_state = OP(SEND_ONLY);
+ qp->s_flags &= ~IPATH_S_ACK_PENDING;
+ qp->s_cur_sge = NULL;
+ if (qp->s_nak_state)
+ ohdr->u.aeth =
+ cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
+ (qp->s_nak_state <<
+ IPATH_AETH_CREDIT_SHIFT));
+ else
+ ohdr->u.aeth = ipath_compute_aeth(qp);
+ hwords++;
+ len = 0;
+ bth0 = OP(ACKNOWLEDGE) << 24;
+ bth2 = qp->s_ack_psn & IPATH_PSN_MASK;
+ }
+ qp->s_hdrwords = hwords;
+ qp->s_cur_size = len;
+ ipath_make_ruc_header(dev, qp, ohdr, bth0, bth2);
+ return 1;
+
+bail:
+ return 0;
+}
+
+/**
+ * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
+ * @qp: a pointer to the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int ipath_make_rc_req(struct ipath_qp *qp)
+{
+ struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+ struct ipath_other_headers *ohdr;
+ struct ipath_sge_state *ss;
+ struct ipath_swqe *wqe;
+ u32 hwords;
+ u32 len;
+ u32 bth0;
+ u32 bth2;
+ u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+ char newreq;
+ unsigned long flags;
+ int ret = 0;
+
+ ohdr = &qp->s_hdr.u.oth;
+ if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+ ohdr = &qp->s_hdr.u.l.oth;
+
+ /*
+ * The lock is needed to synchronize between the sending tasklet,
+ * the receive interrupt handler, and timeout resends.
+ */
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+	/* Sending responses takes priority over sending requests. */
+ if ((qp->r_head_ack_queue != qp->s_tail_ack_queue ||
+ (qp->s_flags & IPATH_S_ACK_PENDING) ||
+ qp->s_ack_state != OP(ACKNOWLEDGE)) &&
+ ipath_make_rc_ack(dev, qp, ohdr, pmtu))
+ goto done;
+
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) {
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
+ goto bail;
+ /* We are in the error state, flush the work request. */
+ if (qp->s_last == qp->s_head)
+ goto bail;
+ /* If DMAs are in progress, we can't flush immediately. */
+ if (atomic_read(&qp->s_dma_busy)) {
+ qp->s_flags |= IPATH_S_WAIT_DMA;
+ goto bail;
+ }
+ wqe = get_swqe_ptr(qp, qp->s_last);
+ ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+ goto done;
+ }
+
+ /* Leave BUSY set until RNR timeout. */
+ if (qp->s_rnr_timeout) {
+ qp->s_flags |= IPATH_S_WAITING;
+ goto bail;
+ }
+
+ /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+ hwords = 5;
+ bth0 = 1 << 22; /* Set M bit */
+
+ /* Send a request. */
+ wqe = get_swqe_ptr(qp, qp->s_cur);
+ switch (qp->s_state) {
+ default:
+ if (!(ib_ipath_state_ops[qp->state] &
+ IPATH_PROCESS_NEXT_SEND_OK))
+ goto bail;
+ /*
+ * Resend an old request or start a new one.
+ *
+ * We keep track of the current SWQE so that
+ * we don't reset the "furthest progress" state
+ * if we need to back up.
+ */
+ newreq = 0;
+ if (qp->s_cur == qp->s_tail) {
+ /* Check if send work queue is empty. */
+ if (qp->s_tail == qp->s_head)
+ goto bail;
+ /*
+ * If a fence is requested, wait for previous
+ * RDMA read and atomic operations to finish.
+ */
+ if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
+ qp->s_num_rd_atomic) {
+ qp->s_flags |= IPATH_S_FENCE_PENDING;
+ goto bail;
+ }
+ wqe->psn = qp->s_next_psn;
+ newreq = 1;
+ }
+ /*
+ * Note that we have to be careful not to modify the
+ * original work request since we may need to resend
+ * it.
+ */
+ len = wqe->length;
+ ss = &qp->s_sge;
+ bth2 = 0;
+ switch (wqe->wr.opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_IMM:
+ /* If no credit, return. */
+ if (qp->s_lsn != (u32) -1 &&
+ ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
+ qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT;
+ goto bail;
+ }
+ wqe->lpsn = wqe->psn;
+ if (len > pmtu) {
+ wqe->lpsn += (len - 1) / pmtu;
+ qp->s_state = OP(SEND_FIRST);
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND)
+ qp->s_state = OP(SEND_ONLY);
+ else {
+ qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ }
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= 1 << 23;
+ bth2 = 1 << 31; /* Request ACK. */
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case IB_WR_RDMA_WRITE:
+ if (newreq && qp->s_lsn != (u32) -1)
+ qp->s_lsn++;
+ /* FALLTHROUGH */
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ /* If no credit, return. */
+ if (qp->s_lsn != (u32) -1 &&
+ ipath_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
+ qp->s_flags |= IPATH_S_WAIT_SSN_CREDIT;
+ goto bail;
+ }
+ ohdr->u.rc.reth.vaddr =
+ cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+ ohdr->u.rc.reth.rkey =
+ cpu_to_be32(wqe->wr.wr.rdma.rkey);
+ ohdr->u.rc.reth.length = cpu_to_be32(len);
+ hwords += sizeof(struct ib_reth) / sizeof(u32);
+ wqe->lpsn = wqe->psn;
+ if (len > pmtu) {
+ wqe->lpsn += (len - 1) / pmtu;
+ qp->s_state = OP(RDMA_WRITE_FIRST);
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+ qp->s_state = OP(RDMA_WRITE_ONLY);
+ else {
+ qp->s_state =
+ OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
+ /* Immediate data comes after RETH */
+ ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= 1 << 23;
+ }
+ bth2 = 1 << 31; /* Request ACK. */
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case IB_WR_RDMA_READ:
+ /*
+ * Don't allow more operations to be started
+ * than the QP limits allow.
+ */
+ if (newreq) {
+ if (qp->s_num_rd_atomic >=
+ qp->s_max_rd_atomic) {
+ qp->s_flags |= IPATH_S_RDMAR_PENDING;
+ goto bail;
+ }
+ qp->s_num_rd_atomic++;
+ if (qp->s_lsn != (u32) -1)
+ qp->s_lsn++;
+ /*
+ * Adjust s_next_psn to count the
+ * expected number of responses.
+ */
+ if (len > pmtu)
+ qp->s_next_psn += (len - 1) / pmtu;
+ wqe->lpsn = qp->s_next_psn++;
+ }
+ ohdr->u.rc.reth.vaddr =
+ cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+ ohdr->u.rc.reth.rkey =
+ cpu_to_be32(wqe->wr.wr.rdma.rkey);
+ ohdr->u.rc.reth.length = cpu_to_be32(len);
+ qp->s_state = OP(RDMA_READ_REQUEST);
+ hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
+ ss = NULL;
+ len = 0;
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ /*
+ * Don't allow more operations to be started
+ * than the QP limits allow.
+ */
+ if (newreq) {
+ if (qp->s_num_rd_atomic >=
+ qp->s_max_rd_atomic) {
+ qp->s_flags |= IPATH_S_RDMAR_PENDING;
+ goto bail;
+ }
+ qp->s_num_rd_atomic++;
+ if (qp->s_lsn != (u32) -1)
+ qp->s_lsn++;
+ wqe->lpsn = wqe->psn;
+ }
+ if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+ qp->s_state = OP(COMPARE_SWAP);
+ ohdr->u.atomic_eth.swap_data = cpu_to_be64(
+ wqe->wr.wr.atomic.swap);
+ ohdr->u.atomic_eth.compare_data = cpu_to_be64(
+ wqe->wr.wr.atomic.compare_add);
+ } else {
+ qp->s_state = OP(FETCH_ADD);
+ ohdr->u.atomic_eth.swap_data = cpu_to_be64(
+ wqe->wr.wr.atomic.compare_add);
+ ohdr->u.atomic_eth.compare_data = 0;
+ }
+ ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
+ wqe->wr.wr.atomic.remote_addr >> 32);
+ ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
+ wqe->wr.wr.atomic.remote_addr);
+ ohdr->u.atomic_eth.rkey = cpu_to_be32(
+ wqe->wr.wr.atomic.rkey);
+ hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
+ ss = NULL;
+ len = 0;
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ default:
+ goto bail;
+ }
+ qp->s_sge.sge = wqe->sg_list[0];
+ qp->s_sge.sg_list = wqe->sg_list + 1;
+ qp->s_sge.num_sge = wqe->wr.num_sge;
+ qp->s_len = wqe->length;
+ if (newreq) {
+ qp->s_tail++;
+ if (qp->s_tail >= qp->s_size)
+ qp->s_tail = 0;
+ }
+ bth2 |= qp->s_psn & IPATH_PSN_MASK;
+ if (wqe->wr.opcode == IB_WR_RDMA_READ)
+ qp->s_psn = wqe->lpsn + 1;
+ else {
+ qp->s_psn++;
+ if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
+ qp->s_next_psn = qp->s_psn;
+ }
+ /*
+ * Put the QP on the pending list so lost ACKs will cause
+ * a retry. More than one request can be pending so the
+ * QP may already be on the dev->pending list.
+ */
+ spin_lock(&dev->pending_lock);
+ if (list_empty(&qp->timerwait))
+ list_add_tail(&qp->timerwait,
+ &dev->pending[dev->pending_index]);
+ spin_unlock(&dev->pending_lock);
+ break;
+
+ case OP(RDMA_READ_RESPONSE_FIRST):
+ /*
+ * This case can only happen if a send is restarted.
+ * See ipath_restart_rc().
+ */
+ ipath_init_restart(qp, wqe);
+ /* FALLTHROUGH */
+ case OP(SEND_FIRST):
+ qp->s_state = OP(SEND_MIDDLE);
+ /* FALLTHROUGH */
+ case OP(SEND_MIDDLE):
+ bth2 = qp->s_psn++ & IPATH_PSN_MASK;
+ if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
+ qp->s_next_psn = qp->s_psn;
+ ss = &qp->s_sge;
+ len = qp->s_len;
+ if (len > pmtu) {
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND)
+ qp->s_state = OP(SEND_LAST);
+ else {
+ qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ }
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= 1 << 23;
+ bth2 |= 1 << 31; /* Request ACK. */
+ qp->s_cur++;
+ if (qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case OP(RDMA_READ_RESPONSE_LAST):
+ /*
+		 * This case can only happen if an RDMA write is restarted.
+ * See ipath_restart_rc().
+ */
+ ipath_init_restart(qp, wqe);
+ /* FALLTHROUGH */
+ case OP(RDMA_WRITE_FIRST):
+ qp->s_state = OP(RDMA_WRITE_MIDDLE);
+ /* FALLTHROUGH */
+ case OP(RDMA_WRITE_MIDDLE):
+ bth2 = qp->s_psn++ & IPATH_PSN_MASK;
+ if (ipath_cmp24(qp->s_psn, qp->s_next_psn) > 0)
+ qp->s_next_psn = qp->s_psn;
+ ss = &qp->s_sge;
+ len = qp->s_len;
+ if (len > pmtu) {
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+ qp->s_state = OP(RDMA_WRITE_LAST);
+ else {
+ qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= 1 << 23;
+ }
+ bth2 |= 1 << 31; /* Request ACK. */
+ qp->s_cur++;
+ if (qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case OP(RDMA_READ_RESPONSE_MIDDLE):
+ /*
+		 * This case can only happen if an RDMA read is restarted.
+ * See ipath_restart_rc().
+ */
+ ipath_init_restart(qp, wqe);
+ len = ((qp->s_psn - wqe->psn) & IPATH_PSN_MASK) * pmtu;
+ ohdr->u.rc.reth.vaddr =
+ cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
+ ohdr->u.rc.reth.rkey =
+ cpu_to_be32(wqe->wr.wr.rdma.rkey);
+ ohdr->u.rc.reth.length = cpu_to_be32(qp->s_len);
+ qp->s_state = OP(RDMA_READ_REQUEST);
+ hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
+ bth2 = qp->s_psn & IPATH_PSN_MASK;
+ qp->s_psn = wqe->lpsn + 1;
+ ss = NULL;
+ len = 0;
+ qp->s_cur++;
+ if (qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+ }
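+	/*
+	 * Request an ACK when the PSN being sent is IPATH_PSN_CREDIT - 1
+	 * or more past the last acknowledged PSN.
+	 */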
+ if (ipath_cmp24(qp->s_psn, qp->s_last_psn + IPATH_PSN_CREDIT - 1) >= 0)
+ bth2 |= 1 << 31; /* Request ACK. */
+ qp->s_len -= len;
+ qp->s_hdrwords = hwords;
+ qp->s_cur_sge = ss;
+ qp->s_cur_size = len;
+ ipath_make_ruc_header(dev, qp, ohdr, bth0 | (qp->s_state << 24), bth2);
+done:
+ ret = 1;
+ goto unlock;
+
+bail:
+ qp->s_flags &= ~IPATH_S_BUSY;
+unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return ret;
+}
+
+/**
+ * send_rc_ack - Construct an ACK packet and send it
+ * @qp: a pointer to the QP
+ *
+ * This is called from ipath_rc_rcv() and only uses the receive
+ * side QP state.
+ * Note that RDMA reads and atomics are handled in the
+ * send side QP state and tasklet.
+ */
+static void send_rc_ack(struct ipath_qp *qp)
+{
+ struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+ struct ipath_devdata *dd;
+ u16 lrh0;
+ u32 bth0;
+ u32 hwords;
+ u32 __iomem *piobuf;
+ struct ipath_ib_header hdr;
+ struct ipath_other_headers *ohdr;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+	/* Don't send ACK or NAK if an RDMA read or atomic is pending. */
+ if (qp->r_head_ack_queue != qp->s_tail_ack_queue ||
+ (qp->s_flags & IPATH_S_ACK_PENDING) ||
+ qp->s_ack_state != OP(ACKNOWLEDGE))
+ goto queue_ack;
+
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+ /* Don't try to send ACKs if the link isn't ACTIVE */
+ dd = dev->dd;
+ if (!(dd->ipath_flags & IPATH_LINKACTIVE))
+ goto done;
+
+ piobuf = ipath_getpiobuf(dd, 0, NULL);
+ if (!piobuf) {
+ /*
+ * We are out of PIO buffers at the moment.
+ * Pass responsibility for sending the ACK to the
+ * send tasklet so that when a PIO buffer becomes
+ * available, the ACK is sent ahead of other outgoing
+ * packets.
+ */
+ spin_lock_irqsave(&qp->s_lock, flags);
+ goto queue_ack;
+ }
+
+ /* Construct the header. */
+ ohdr = &hdr.u.oth;
+ lrh0 = IPATH_LRH_BTH;
+ /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
+ hwords = 6;
+ if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+ hwords += ipath_make_grh(dev, &hdr.u.l.grh,
+ &qp->remote_ah_attr.grh,
+ hwords, 0);
+ ohdr = &hdr.u.l.oth;
+ lrh0 = IPATH_LRH_GRH;
+ }
+	/* read pkey_index w/o lock (it's atomic) */
+ bth0 = ipath_get_pkey(dd, qp->s_pkey_index) |
+ (OP(ACKNOWLEDGE) << 24) | (1 << 22);
+ if (qp->r_nak_state)
+ ohdr->u.aeth = cpu_to_be32((qp->r_msn & IPATH_MSN_MASK) |
+ (qp->r_nak_state <<
+ IPATH_AETH_CREDIT_SHIFT));
+ else
+ ohdr->u.aeth = ipath_compute_aeth(qp);
+ lrh0 |= qp->remote_ah_attr.sl << 4;
+ hdr.lrh[0] = cpu_to_be16(lrh0);
+ hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+ hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
+ hdr.lrh[3] = cpu_to_be16(dd->ipath_lid |
+ qp->remote_ah_attr.src_path_bits);
+ ohdr->bth[0] = cpu_to_be32(bth0);
+ ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
+ ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & IPATH_PSN_MASK);
+
+ writeq(hwords + 1, piobuf);
+
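+	/*
+	 * With write-combining, copy all but the last header word, flush,
+	 * and only then write the final word so it is posted last.
+	 */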
+ if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) {
+ u32 *hdrp = (u32 *) &hdr;
+
+ ipath_flush_wc();
+ __iowrite32_copy(piobuf + 2, hdrp, hwords - 1);
+ ipath_flush_wc();
+ __raw_writel(hdrp[hwords - 1], piobuf + hwords + 1);
+ } else
+ __iowrite32_copy(piobuf + 2, (u32 *) &hdr, hwords);
+
+ ipath_flush_wc();
+
+ dev->n_unicast_xmit++;
+ goto done;
+
+queue_ack:
+ if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK) {
+ dev->n_rc_qacks++;
+ qp->s_flags |= IPATH_S_ACK_PENDING;
+ qp->s_nak_state = qp->r_nak_state;
+ qp->s_ack_psn = qp->r_ack_psn;
+
+ /* Schedule the send tasklet. */
+ ipath_schedule_send(qp);
+ }
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+ return;
+}
+
+/**
+ * reset_psn - reset the QP state to send starting from PSN
+ * @qp: the QP
+ * @psn: the packet sequence number to restart at
+ *
+ * This is called from ipath_rc_rcv() to process an incoming RC ACK
+ * for the given QP.
+ * Called at interrupt level with the QP s_lock held.
+ */
+static void reset_psn(struct ipath_qp *qp, u32 psn)
+{
+ u32 n = qp->s_last;
+ struct ipath_swqe *wqe = get_swqe_ptr(qp, n);
+ u32 opcode;
+
+ qp->s_cur = n;
+
+ /*
+ * If we are starting the request from the beginning,
+ * let the normal send code handle initialization.
+ */
+ if (ipath_cmp24(psn, wqe->psn) <= 0) {
+ qp->s_state = OP(SEND_LAST);
+ goto done;
+ }
+
+ /* Find the work request opcode corresponding to the given PSN. */
+ opcode = wqe->wr.opcode;
+ for (;;) {
+ int diff;
+
+ if (++n == qp->s_size)
+ n = 0;
+ if (n == qp->s_tail)
+ break;
+ wqe = get_swqe_ptr(qp, n);
+ diff = ipath_cmp24(psn, wqe->psn);
+ if (diff < 0)
+ break;
+ qp->s_cur = n;
+ /*
+ * If we are starting the request from the beginning,
+ * let the normal send code handle initialization.
+ */
+ if (diff == 0) {
+ qp->s_state = OP(SEND_LAST);
+ goto done;
+ }
+ opcode = wqe->wr.opcode;
+ }
+
+ /*
+ * Set the state to restart in the middle of a request.
+ * Don't change the s_sge, s_cur_sge, or s_cur_size.
+ * See ipath_make_rc_req().
+ */
+ switch (opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_IMM:
+ qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
+ break;
+
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
+ break;
+
+ case IB_WR_RDMA_READ:
+ qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
+ break;
+
+ default:
+ /*
+		 * This case shouldn't happen since there is
+		 * only one PSN per request.
+ */
+ qp->s_state = OP(SEND_LAST);
+ }
+done:
+ qp->s_psn = psn;
+}
+
+/**
+ * ipath_restart_rc - back up requester to resend the last un-ACKed request
+ * @qp: the QP to restart
+ * @psn: packet sequence number for the request
+ *
+ * The QP s_lock should be held and interrupts disabled.
+ */
+void ipath_restart_rc(struct ipath_qp *qp, u32 psn)
+{
+ struct ipath_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
+ struct ipath_ibdev *dev;
+
+ if (qp->s_retry == 0) {
+ ipath_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
+ ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ goto bail;
+ }
+ qp->s_retry--;
+
+ /*
+ * Remove the QP from the timeout queue.
+ * Note: it may already have been removed by ipath_ib_timer().
+ */
+ dev = to_idev(qp->ibqp.device);
+ spin_lock(&dev->pending_lock);
+ if (!list_empty(&qp->timerwait))
+ list_del_init(&qp->timerwait);
+ if (!list_empty(&qp->piowait))
+ list_del_init(&qp->piowait);
+ spin_unlock(&dev->pending_lock);
+
+ if (wqe->wr.opcode == IB_WR_RDMA_READ)
+ dev->n_rc_resends++;
+ else
+ dev->n_rc_resends += (qp->s_psn - psn) & IPATH_PSN_MASK;
+
+ reset_psn(qp, psn);
+ ipath_schedule_send(qp);
+
+bail:
+ return;
+}
+
+static inline void update_last_psn(struct ipath_qp *qp, u32 psn)
+{
+ qp->s_last_psn = psn;
+}
+
+/**
+ * do_rc_ack - process an incoming RC ACK
+ * @qp: the QP the ACK came in on
+ * @aeth: the AETH from the incoming packet
+ * @psn: the packet sequence number of the ACK
+ * @opcode: the opcode of the request that resulted in the ACK
+ * @val: the 64-bit value carried by an atomic acknowledge, if any
+ *
+ * This is called from ipath_rc_rcv_resp() to process an incoming RC ACK
+ * for the given QP.
+ * Called at interrupt level with the QP s_lock held and interrupts disabled.
+ * Returns 1 if OK, 0 if current operation should be aborted (NAK).
+ */
+static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
+ u64 val)
+{
+ struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+ struct ib_wc wc;
+ enum ib_wc_status status;
+ struct ipath_swqe *wqe;
+ int ret = 0;
+ u32 ack_psn;
+ int diff;
+
+ /*
+ * Remove the QP from the timeout queue (or RNR timeout queue).
+ * If ipath_ib_timer() has already removed it,
+ * it's OK since we hold the QP s_lock and ipath_restart_rc()
+ * just won't find anything to restart if we ACK everything.
+ */
+ spin_lock(&dev->pending_lock);
+ if (!list_empty(&qp->timerwait))
+ list_del_init(&qp->timerwait);
+ spin_unlock(&dev->pending_lock);
+
+ /*
+ * Note that NAKs implicitly ACK outstanding SEND and RDMA write
+ * requests and implicitly NAK RDMA read and atomic requests issued
+ * before the NAK'ed request. The MSN won't include the NAK'ed
+ * request but will include an ACK'ed request(s).
+	 * request but will include any ACK'ed requests.
+ ack_psn = psn;
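+	/* A nonzero AETH code (RNR or NAK) means psn itself was not ACKed. */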
+ if (aeth >> 29)
+ ack_psn--;
+ wqe = get_swqe_ptr(qp, qp->s_last);
+
+ /*
+ * The MSN might be for a later WQE than the PSN indicates so
+ * only complete WQEs that the PSN finishes.
+ */
+ while ((diff = ipath_cmp24(ack_psn, wqe->lpsn)) >= 0) {
+ /*
+ * RDMA_READ_RESPONSE_ONLY is a special case since
+ * we want to generate completion events for everything
+ * before the RDMA read, copy the data, then generate
+ * the completion for the read.
+ */
+ if (wqe->wr.opcode == IB_WR_RDMA_READ &&
+ opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
+ diff == 0) {
+ ret = 1;
+ goto bail;
+ }
+ /*
+		 * If this request is an RDMA read or atomic, and the ACK is
+		 * for a later operation, this ACK NAKs the RDMA read or
+		 * atomic. In other words, only an RDMA_READ_LAST or ONLY
+		 * can ACK an RDMA read and likewise for atomic ops. Note
+ * that the NAK case can only happen if relaxed ordering is
+ * used and requests are sent after an RDMA read or atomic
+ * is sent but before the response is received.
+ */
+ if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
+ (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
+ ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
+ (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
+ /*
+ * The last valid PSN seen is the previous
+ * request's.
+ */
+ update_last_psn(qp, wqe->psn - 1);
+ /* Retry this request. */
+ ipath_restart_rc(qp, wqe->psn);
+ /*
+ * No need to process the ACK/NAK since we are
+ * restarting an earlier request.
+ */
+ goto bail;
+ }
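+		/* Copy the 64-bit value returned in an atomic ACK. */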
+ if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
+ *(u64 *) wqe->sg_list[0].vaddr = val;
+ if (qp->s_num_rd_atomic &&
+ (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
+ qp->s_num_rd_atomic--;
+ /* Restart sending task if fence is complete */
+ if (((qp->s_flags & IPATH_S_FENCE_PENDING) &&
+ !qp->s_num_rd_atomic) ||
+ qp->s_flags & IPATH_S_RDMAR_PENDING)
+ ipath_schedule_send(qp);
+ }
+ /* Post a send completion queue entry if requested. */
+ if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
+ (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+ memset(&wc, 0, sizeof wc);
+ wc.wr_id = wqe->wr.wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
+ wc.byte_len = wqe->length;
+ wc.qp = &qp->ibqp;
+ wc.src_qp = qp->remote_qpn;
+ wc.slid = qp->remote_ah_attr.dlid;
+ wc.sl = qp->remote_ah_attr.sl;
+ ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
+ }
+ qp->s_retry = qp->s_retry_cnt;
+ /*
+ * If we are completing a request which is in the process of
+ * being resent, we can stop resending it since we know the
+ * responder has already seen it.
+ */
+ if (qp->s_last == qp->s_cur) {
+ if (++qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ qp->s_last = qp->s_cur;
+ if (qp->s_last == qp->s_tail)
+ break;
+ wqe = get_swqe_ptr(qp, qp->s_cur);
+ qp->s_state = OP(SEND_LAST);
+ qp->s_psn = wqe->psn;
+ } else {
+ if (++qp->s_last >= qp->s_size)
+ qp->s_last = 0;
+ if (qp->state == IB_QPS_SQD && qp->s_last == qp->s_cur)
+ qp->s_draining = 0;
+ if (qp->s_last == qp->s_tail)
+ break;
+ wqe = get_swqe_ptr(qp, qp->s_last);
+ }
+ }
+
+ switch (aeth >> 29) {
+ case 0: /* ACK */
+ dev->n_rc_acks++;
+ /* If this is a partial ACK, reset the retransmit timer. */
+ if (qp->s_last != qp->s_tail) {
+ spin_lock(&dev->pending_lock);
+ if (list_empty(&qp->timerwait))
+ list_add_tail(&qp->timerwait,
+ &dev->pending[dev->pending_index]);
+ spin_unlock(&dev->pending_lock);
+ /*
+ * If we get a partial ACK for a resent operation,
+ * we can stop resending the earlier packets and
+ * continue with the next packet the receiver wants.
+ */
+ if (ipath_cmp24(qp->s_psn, psn) <= 0) {
+ reset_psn(qp, psn + 1);
+ ipath_schedule_send(qp);
+ }
+ } else if (ipath_cmp24(qp->s_psn, psn) <= 0) {
+ qp->s_state = OP(SEND_LAST);
+ qp->s_psn = psn + 1;
+ }
+ ipath_get_credit(qp, aeth);
+ qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+ qp->s_retry = qp->s_retry_cnt;
+ update_last_psn(qp, psn);
+ ret = 1;
+ goto bail;
+
+ case 1: /* RNR NAK */
+ dev->n_rnr_naks++;
+ if (qp->s_last == qp->s_tail)
+ goto bail;
+ if (qp->s_rnr_retry == 0) {
+ status = IB_WC_RNR_RETRY_EXC_ERR;
+ goto class_b;
+ }
+ if (qp->s_rnr_retry_cnt < 7)
+ qp->s_rnr_retry--;
+
+ /* The last valid PSN is the previous PSN. */
+ update_last_psn(qp, psn - 1);
+
+ if (wqe->wr.opcode == IB_WR_RDMA_READ)
+ dev->n_rc_resends++;
+ else
+ dev->n_rc_resends +=
+ (qp->s_psn - psn) & IPATH_PSN_MASK;
+
+ reset_psn(qp, psn);
+
+ qp->s_rnr_timeout =
+ ib_ipath_rnr_table[(aeth >> IPATH_AETH_CREDIT_SHIFT) &
+ IPATH_AETH_CREDIT_MASK];
+ ipath_insert_rnr_queue(qp);
+ ipath_schedule_send(qp);
+ goto bail;
+
+ case 3: /* NAK */
+ if (qp->s_last == qp->s_tail)
+ goto bail;
+ /* The last valid PSN is the previous PSN. */
+ update_last_psn(qp, psn - 1);
+ switch ((aeth >> IPATH_AETH_CREDIT_SHIFT) &
+ IPATH_AETH_CREDIT_MASK) {
+ case 0: /* PSN sequence error */
+ dev->n_seq_naks++;
+ /*
+ * Back up to the responder's expected PSN.
+ * Note that we might get a NAK in the middle of an
+ * RDMA READ response which terminates the RDMA
+ * READ.
+ */
+ ipath_restart_rc(qp, psn);
+ break;
+
+ case 1: /* Invalid Request */
+ status = IB_WC_REM_INV_REQ_ERR;
+ dev->n_other_naks++;
+ goto class_b;
+
+ case 2: /* Remote Access Error */
+ status = IB_WC_REM_ACCESS_ERR;
+ dev->n_other_naks++;
+ goto class_b;
+
+ case 3: /* Remote Operation Error */
+ status = IB_WC_REM_OP_ERR;
+ dev->n_other_naks++;
+ class_b:
+ ipath_send_complete(qp, wqe, status);
+ ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ break;
+
+ default:
+ /* Ignore other reserved NAK error codes */
+ goto reserved;
+ }
+ qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+ goto bail;
+
+ default: /* 2: reserved */
+ reserved:
+ /* Ignore reserved NAK codes. */
+ goto bail;
+ }
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_rc_rcv_resp - process an incoming RC response packet
+ * @dev: the device this packet came in on
+ * @ohdr: the other headers for this packet
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP for this packet
+ * @opcode: the opcode for this packet
+ * @psn: the packet sequence number for this packet
+ * @hdrsize: the header length
+ * @pmtu: the path MTU
+ * @header_in_data: true if part of the header data is in the data buffer
+ *
+ * This is called from ipath_rc_rcv() to process an incoming RC response
+ * packet for the given QP.
+ * Called at interrupt level.
+ */
+static inline void ipath_rc_rcv_resp(struct ipath_ibdev *dev,
+ struct ipath_other_headers *ohdr,
+ void *data, u32 tlen,
+ struct ipath_qp *qp,
+ u32 opcode,
+ u32 psn, u32 hdrsize, u32 pmtu,
+ int header_in_data)
+{
+ struct ipath_swqe *wqe;
+ enum ib_wc_status status;
+ unsigned long flags;
+ int diff;
+ u32 pad;
+ u32 aeth;
+ u64 val;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ /* Double check we can process this now that we hold the s_lock. */
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+ goto ack_done;
+
+ /* Ignore invalid responses. */
+ if (ipath_cmp24(psn, qp->s_next_psn) >= 0)
+ goto ack_done;
+
+ /* Ignore duplicate responses. */
+ diff = ipath_cmp24(psn, qp->s_last_psn);
+ if (unlikely(diff <= 0)) {
+ /* Update credits for "ghost" ACKs */
+ if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
+ if (!header_in_data)
+ aeth = be32_to_cpu(ohdr->u.aeth);
+ else {
+ aeth = be32_to_cpu(((__be32 *) data)[0]);
+ data += sizeof(__be32);
+ }
+ if ((aeth >> 29) == 0)
+ ipath_get_credit(qp, aeth);
+ }
+ goto ack_done;
+ }
+
+ if (unlikely(qp->s_last == qp->s_tail))
+ goto ack_done;
+ wqe = get_swqe_ptr(qp, qp->s_last);
+ status = IB_WC_SUCCESS;
+
+ switch (opcode) {
+ case OP(ACKNOWLEDGE):
+ case OP(ATOMIC_ACKNOWLEDGE):
+ case OP(RDMA_READ_RESPONSE_FIRST):
+ if (!header_in_data)
+ aeth = be32_to_cpu(ohdr->u.aeth);
+ else {
+ aeth = be32_to_cpu(((__be32 *) data)[0]);
+ data += sizeof(__be32);
+ }
+ if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
+ if (!header_in_data) {
+ __be32 *p = ohdr->u.at.atomic_ack_eth;
+
+ val = ((u64) be32_to_cpu(p[0]) << 32) |
+ be32_to_cpu(p[1]);
+ } else
+ val = be64_to_cpu(((__be64 *) data)[0]);
+ } else
+ val = 0;
+ if (!do_rc_ack(qp, aeth, psn, opcode, val) ||
+ opcode != OP(RDMA_READ_RESPONSE_FIRST))
+ goto ack_done;
+ hdrsize += 4;
+ wqe = get_swqe_ptr(qp, qp->s_last);
+ if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+ goto ack_op_err;
+ qp->r_flags &= ~IPATH_R_RDMAR_SEQ;
+ /*
+ * If this is a response to a resent RDMA read, we
+ * have to be careful to copy the data to the right
+ * location.
+ */
+ qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
+ wqe, psn, pmtu);
+ goto read_middle;
+
+ case OP(RDMA_READ_RESPONSE_MIDDLE):
+ /* no AETH, no ACK */
+ if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
+ dev->n_rdma_seq++;
+ if (qp->r_flags & IPATH_R_RDMAR_SEQ)
+ goto ack_done;
+ qp->r_flags |= IPATH_R_RDMAR_SEQ;
+ ipath_restart_rc(qp, qp->s_last_psn + 1);
+ goto ack_done;
+ }
+ if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+ goto ack_op_err;
+ read_middle:
+ if (unlikely(tlen != (hdrsize + pmtu + 4)))
+ goto ack_len_err;
+ if (unlikely(pmtu >= qp->s_rdma_read_len))
+ goto ack_len_err;
+
+ /* We got a response so update the timeout. */
+ spin_lock(&dev->pending_lock);
+ if (qp->s_rnr_timeout == 0 && !list_empty(&qp->timerwait))
+ list_move_tail(&qp->timerwait,
+ &dev->pending[dev->pending_index]);
+ spin_unlock(&dev->pending_lock);
+
+ if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
+ qp->s_retry = qp->s_retry_cnt;
+
+ /*
+ * Update the RDMA receive state but do the copy w/o
+ * holding the locks and blocking interrupts.
+ */
+ qp->s_rdma_read_len -= pmtu;
+ update_last_psn(qp, psn);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ ipath_copy_sge(&qp->s_rdma_read_sge, data, pmtu);
+ goto bail;
+
+ case OP(RDMA_READ_RESPONSE_ONLY):
+ if (!header_in_data)
+ aeth = be32_to_cpu(ohdr->u.aeth);
+ else
+ aeth = be32_to_cpu(((__be32 *) data)[0]);
+ if (!do_rc_ack(qp, aeth, psn, opcode, 0))
+ goto ack_done;
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ /*
+ * Check that the data size is >= 0 && <= pmtu.
+ * Remember to account for the AETH header (4) and
+ * ICRC (4).
+ */
+ if (unlikely(tlen < (hdrsize + pad + 8)))
+ goto ack_len_err;
+ /*
+ * If this is a response to a resent RDMA read, we
+ * have to be careful to copy the data to the right
+ * location.
+ */
+ wqe = get_swqe_ptr(qp, qp->s_last);
+ qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
+ wqe, psn, pmtu);
+ goto read_last;
+
+ case OP(RDMA_READ_RESPONSE_LAST):
+ /* ACKs READ req. */
+ if (unlikely(ipath_cmp24(psn, qp->s_last_psn + 1))) {
+ dev->n_rdma_seq++;
+ if (qp->r_flags & IPATH_R_RDMAR_SEQ)
+ goto ack_done;
+ qp->r_flags |= IPATH_R_RDMAR_SEQ;
+ ipath_restart_rc(qp, qp->s_last_psn + 1);
+ goto ack_done;
+ }
+ if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+ goto ack_op_err;
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ /*
+ * Check that the data size is >= 1 && <= pmtu.
+ * Remember to account for the AETH header (4) and
+ * ICRC (4).
+ */
+ if (unlikely(tlen <= (hdrsize + pad + 8)))
+ goto ack_len_err;
+ read_last:
+ tlen -= hdrsize + pad + 8;
+ if (unlikely(tlen != qp->s_rdma_read_len))
+ goto ack_len_err;
+ if (!header_in_data)
+ aeth = be32_to_cpu(ohdr->u.aeth);
+ else {
+ aeth = be32_to_cpu(((__be32 *) data)[0]);
+ data += sizeof(__be32);
+ }
+ ipath_copy_sge(&qp->s_rdma_read_sge, data, tlen);
+ (void) do_rc_ack(qp, aeth, psn,
+ OP(RDMA_READ_RESPONSE_LAST), 0);
+ goto ack_done;
+ }
+
+ack_op_err:
+ status = IB_WC_LOC_QP_OP_ERR;
+ goto ack_err;
+
+ack_len_err:
+ status = IB_WC_LOC_LEN_ERR;
+ack_err:
+ ipath_send_complete(qp, wqe, status);
+ ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ack_done:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+bail:
+ return;
+}
+
+/**
+ * ipath_rc_rcv_error - process an incoming duplicate or error RC packet
+ * @dev: the device this packet came in on
+ * @ohdr: the other headers for this packet
+ * @data: the packet data
+ * @qp: the QP for this packet
+ * @opcode: the opcode for this packet
+ * @psn: the packet sequence number for this packet
+ * @diff: the difference between the PSN and the expected PSN
+ * @header_in_data: true if part of the header data is in the data buffer
+ *
+ * This is called from ipath_rc_rcv() to process an unexpected
+ * incoming RC packet for the given QP.
+ * Called at interrupt level.
+ * Return 1 if no more processing is needed; otherwise return 0 to
+ * schedule a response to be sent.
+ */
+static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
+ struct ipath_other_headers *ohdr,
+ void *data,
+ struct ipath_qp *qp,
+ u32 opcode,
+ u32 psn,
+ int diff,
+ int header_in_data)
+{
+ struct ipath_ack_entry *e;
+ u8 i, prev;
+ int old_req;
+ unsigned long flags;
+
+ if (diff > 0) {
+ /*
+ * Packet sequence error.
+ * A NAK will ACK earlier sends and RDMA writes.
+ * Don't queue the NAK if we already sent one.
+ */
+ if (!qp->r_nak_state) {
+ qp->r_nak_state = IB_NAK_PSN_ERROR;
+ /* Use the expected PSN. */
+ qp->r_ack_psn = qp->r_psn;
+ goto send_ack;
+ }
+ goto done;
+ }
+
+ /*
+ * Handle a duplicate request. Don't re-execute SEND, RDMA
+ * write or atomic op. Don't NAK errors, just silently drop
+ * the duplicate request. Note that r_sge, r_len, and
+ * r_rcv_len may be in use so don't modify them.
+ *
+ * We are supposed to ACK the earliest duplicate PSN but we
+ * can coalesce an outstanding duplicate ACK. We have to
+ * send the earliest so that RDMA reads can be restarted at
+ * the requester's expected PSN.
+ *
+ * First, find where this duplicate PSN falls within the
+ * ACKs previously sent.
+ */
+ psn &= IPATH_PSN_MASK;
+ e = NULL;
+ old_req = 1;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ /* Double check we can process this now that we hold the s_lock. */
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+ goto unlock_done;
+
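+ /*
+ * Walk the ack queue backwards from the newest entry toward the
+ * oldest (wrapping at IPATH_MAX_RDMA_ATOMIC) until we find the
+ * entry whose PSN range covers this duplicate PSN.
+ */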
+ for (i = qp->r_head_ack_queue; ; i = prev) {
+ if (i == qp->s_tail_ack_queue)
+ old_req = 0;
+ if (i)
+ prev = i - 1;
+ else
+ prev = IPATH_MAX_RDMA_ATOMIC;
+ if (prev == qp->r_head_ack_queue) {
+ e = NULL;
+ break;
+ }
+ e = &qp->s_ack_queue[prev];
+ if (!e->opcode) {
+ e = NULL;
+ break;
+ }
+ if (ipath_cmp24(psn, e->psn) >= 0) {
+ if (prev == qp->s_tail_ack_queue)
+ old_req = 0;
+ break;
+ }
+ }
+ switch (opcode) {
+ case OP(RDMA_READ_REQUEST): {
+ struct ib_reth *reth;
+ u32 offset;
+ u32 len;
+
+ /*
+ * If we didn't find the RDMA read request in the ack queue,
+ * or the send tasklet is already backed up to send an
+ * earlier entry, we can ignore this request.
+ */
+ if (!e || e->opcode != OP(RDMA_READ_REQUEST) || old_req)
+ goto unlock_done;
+ /* RETH comes after BTH */
+ if (!header_in_data)
+ reth = &ohdr->u.rc.reth;
+ else {
+ reth = (struct ib_reth *)data;
+ data += sizeof(*reth);
+ }
+ /*
+ * Address range must be a subset of the original
+ * request and start on pmtu boundaries.
+ * We reuse the old ack_queue slot since the requester
+ * should not back up and request an earlier PSN for the
+ * same request.
+ */
+ offset = ((psn - e->psn) & IPATH_PSN_MASK) *
+ ib_mtu_enum_to_int(qp->path_mtu);
+ len = be32_to_cpu(reth->length);
+ if (unlikely(offset + len > e->rdma_sge.sge.sge_length))
+ goto unlock_done;
+ if (len != 0) {
+ u32 rkey = be32_to_cpu(reth->rkey);
+ u64 vaddr = be64_to_cpu(reth->vaddr);
+ int ok;
+
+ ok = ipath_rkey_ok(qp, &e->rdma_sge,
+ len, vaddr, rkey,
+ IB_ACCESS_REMOTE_READ);
+ if (unlikely(!ok))
+ goto unlock_done;
+ } else {
+ e->rdma_sge.sg_list = NULL;
+ e->rdma_sge.num_sge = 0;
+ e->rdma_sge.sge.mr = NULL;
+ e->rdma_sge.sge.vaddr = NULL;
+ e->rdma_sge.sge.length = 0;
+ e->rdma_sge.sge.sge_length = 0;
+ }
+ e->psn = psn;
+ qp->s_ack_state = OP(ACKNOWLEDGE);
+ qp->s_tail_ack_queue = prev;
+ break;
+ }
+
+ case OP(COMPARE_SWAP):
+ case OP(FETCH_ADD): {
+ /*
+ * If we didn't find the atomic request in the ack queue
+ * or the send tasklet is already backed up to send an
+ * earlier entry, we can ignore this request.
+ */
+ if (!e || e->opcode != (u8) opcode || old_req)
+ goto unlock_done;
+ qp->s_ack_state = OP(ACKNOWLEDGE);
+ qp->s_tail_ack_queue = prev;
+ break;
+ }
+
+ default:
+ if (old_req)
+ goto unlock_done;
+ /*
+ * Resend the most recent ACK if this request is
+ * after all the previous RDMA reads and atomics.
+ */
+ if (i == qp->r_head_ack_queue) {
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ qp->r_nak_state = 0;
+ qp->r_ack_psn = qp->r_psn - 1;
+ goto send_ack;
+ }
+ /*
+ * Try to send a simple ACK to work around a Mellanox bug
+ * which doesn't accept a RDMA read response or atomic
+ * response as an ACK for earlier SENDs or RDMA writes.
+ */
+ if (qp->r_head_ack_queue == qp->s_tail_ack_queue &&
+ !(qp->s_flags & IPATH_S_ACK_PENDING) &&
+ qp->s_ack_state == OP(ACKNOWLEDGE)) {
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ qp->r_nak_state = 0;
+ qp->r_ack_psn = qp->s_ack_queue[i].psn - 1;
+ goto send_ack;
+ }
+ /*
+ * Resend the RDMA read or atomic op which
+ * ACKs this duplicate request.
+ */
+ qp->s_ack_state = OP(ACKNOWLEDGE);
+ qp->s_tail_ack_queue = i;
+ break;
+ }
+ qp->r_nak_state = 0;
+ ipath_schedule_send(qp);
+
+unlock_done:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+ return 1;
+
+send_ack:
+ return 0;
+}
+
+void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err)
+{
+ unsigned long flags;
+ int lastwqe;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ lastwqe = ipath_error_qp(qp, err);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+ if (lastwqe) {
+ struct ib_event ev;
+
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+ qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+ }
+}
+
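+/*
+ * Make room in the ack queue for a new RDMA READ or atomic request:
+ * if entry n is the oldest (tail) entry, advance the tail past it and
+ * reset the ack state machine.  Callers only do this for entries that
+ * have already been sent.
+ */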
+static inline void ipath_update_ack_queue(struct ipath_qp *qp, unsigned n)
+{
+ unsigned next;
+
+ next = n + 1;
+ if (next > IPATH_MAX_RDMA_ATOMIC)
+ next = 0;
+ if (n == qp->s_tail_ack_queue) {
+ qp->s_tail_ack_queue = next;
+ qp->s_ack_state = OP(ACKNOWLEDGE);
+ }
+}
+
+/**
+ * ipath_rc_rcv - process an incoming RC packet
+ * @dev: the device this packet came in on
+ * @hdr: the header of this packet
+ * @has_grh: true if the header has a GRH
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP for this packet
+ *
+ * This is called from ipath_qp_rcv() to process an incoming RC packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+ int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
+{
+ struct ipath_other_headers *ohdr;
+ u32 opcode;
+ u32 hdrsize;
+ u32 psn;
+ u32 pad;
+ struct ib_wc wc;
+ u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+ int diff;
+ struct ib_reth *reth;
+ int header_in_data;
+ unsigned long flags;
+
+ /* Validate the SLID. See Ch. 9.6.1.5 */
+ if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))
+ goto done;
+
+ /* Check for GRH */
+ if (!has_grh) {
+ ohdr = &hdr->u.oth;
+ hdrsize = 8 + 12; /* LRH + BTH */
+ psn = be32_to_cpu(ohdr->bth[2]);
+ header_in_data = 0;
+ } else {
+ ohdr = &hdr->u.l.oth;
+ hdrsize = 8 + 40 + 12; /* LRH + GRH + BTH */
+ /*
+ * The header with GRH is 60 bytes and the core driver sets
+ * the eager header buffer size to 56 bytes so the last 4
+ * bytes of the BTH header (PSN) are in the data buffer.
+ */
+ header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
+ if (header_in_data) {
+ psn = be32_to_cpu(((__be32 *) data)[0]);
+ data += sizeof(__be32);
+ } else
+ psn = be32_to_cpu(ohdr->bth[2]);
+ }
+
+ /*
+ * Process responses (ACKs) before anything else. Note that the
+ * packet sequence number will be for something in the send work
+ * queue rather than the expected receive packet sequence number.
+ * In other words, this QP is the requester.
+ */
+ opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+ if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+ opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+ ipath_rc_rcv_resp(dev, ohdr, data, tlen, qp, opcode, psn,
+ hdrsize, pmtu, header_in_data);
+ goto done;
+ }
+
+ /* Compute 24 bits worth of difference. */
+ diff = ipath_cmp24(psn, qp->r_psn);
+ if (unlikely(diff)) {
+ if (ipath_rc_rcv_error(dev, ohdr, data, qp, opcode,
+ psn, diff, header_in_data))
+ goto done;
+ goto send_ack;
+ }
+
+ /* Check for opcode sequence errors. */
+ switch (qp->r_state) {
+ case OP(SEND_FIRST):
+ case OP(SEND_MIDDLE):
+ if (opcode == OP(SEND_MIDDLE) ||
+ opcode == OP(SEND_LAST) ||
+ opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+ break;
+ goto nack_inv;
+
+ case OP(RDMA_WRITE_FIRST):
+ case OP(RDMA_WRITE_MIDDLE):
+ if (opcode == OP(RDMA_WRITE_MIDDLE) ||
+ opcode == OP(RDMA_WRITE_LAST) ||
+ opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+ break;
+ goto nack_inv;
+
+ default:
+ if (opcode == OP(SEND_MIDDLE) ||
+ opcode == OP(SEND_LAST) ||
+ opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
+ opcode == OP(RDMA_WRITE_MIDDLE) ||
+ opcode == OP(RDMA_WRITE_LAST) ||
+ opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+ goto nack_inv;
+ /*
+ * Note that it is up to the requester to not send a new
+ * RDMA read or atomic operation before receiving an ACK
+ * for the previous operation.
+ */
+ break;
+ }
+
+ memset(&wc, 0, sizeof wc);
+
+ /* OK, process the packet. */
+ switch (opcode) {
+ case OP(SEND_FIRST):
+ if (!ipath_get_rwqe(qp, 0))
+ goto rnr_nak;
+ qp->r_rcv_len = 0;
+ /* FALLTHROUGH */
+ case OP(SEND_MIDDLE):
+ case OP(RDMA_WRITE_MIDDLE):
+ send_middle:
+ /* Check that the payload is a full PMTU and does not exceed the expected length. */
+ if (unlikely(tlen != (hdrsize + pmtu + 4)))
+ goto nack_inv;
+ qp->r_rcv_len += pmtu;
+ if (unlikely(qp->r_rcv_len > qp->r_len))
+ goto nack_inv;
+ ipath_copy_sge(&qp->r_sge, data, pmtu);
+ break;
+
+ case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
+ /* consume RWQE */
+ if (!ipath_get_rwqe(qp, 1))
+ goto rnr_nak;
+ goto send_last_imm;
+
+ case OP(SEND_ONLY):
+ case OP(SEND_ONLY_WITH_IMMEDIATE):
+ if (!ipath_get_rwqe(qp, 0))
+ goto rnr_nak;
+ qp->r_rcv_len = 0;
+ if (opcode == OP(SEND_ONLY))
+ goto send_last;
+ /* FALLTHROUGH */
+ case OP(SEND_LAST_WITH_IMMEDIATE):
+ send_last_imm:
+ if (header_in_data) {
+ wc.ex.imm_data = *(__be32 *) data;
+ data += sizeof(__be32);
+ } else {
+ /* Immediate data comes after BTH */
+ wc.ex.imm_data = ohdr->u.imm_data;
+ }
+ hdrsize += 4;
+ wc.wc_flags = IB_WC_WITH_IMM;
+ /* FALLTHROUGH */
+ case OP(SEND_LAST):
+ case OP(RDMA_WRITE_LAST):
+ send_last:
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ /* Check for invalid length. */
+ /* XXX LAST len should be >= 1 */
+ if (unlikely(tlen < (hdrsize + pad + 4)))
+ goto nack_inv;
+ /* Don't count the CRC. */
+ tlen -= (hdrsize + pad + 4);
+ wc.byte_len = tlen + qp->r_rcv_len;
+ if (unlikely(wc.byte_len > qp->r_len))
+ goto nack_inv;
+ ipath_copy_sge(&qp->r_sge, data, tlen);
+ qp->r_msn++;
+ if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
+ break;
+ wc.wr_id = qp->r_wr_id;
+ wc.status = IB_WC_SUCCESS;
+ if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
+ opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+ wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ else
+ wc.opcode = IB_WC_RECV;
+ wc.qp = &qp->ibqp;
+ wc.src_qp = qp->remote_qpn;
+ wc.slid = qp->remote_ah_attr.dlid;
+ wc.sl = qp->remote_ah_attr.sl;
+ /* Signal completion event if the solicited bit is set. */
+ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+ (ohdr->bth[0] &
+ cpu_to_be32(1 << 23)) != 0);
+ break;
+
+ case OP(RDMA_WRITE_FIRST):
+ case OP(RDMA_WRITE_ONLY):
+ case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+ if (unlikely(!(qp->qp_access_flags &
+ IB_ACCESS_REMOTE_WRITE)))
+ goto nack_inv;
+ /* consume RWQE */
+ /* RETH comes after BTH */
+ if (!header_in_data)
+ reth = &ohdr->u.rc.reth;
+ else {
+ reth = (struct ib_reth *)data;
+ data += sizeof(*reth);
+ }
+ hdrsize += sizeof(*reth);
+ qp->r_len = be32_to_cpu(reth->length);
+ qp->r_rcv_len = 0;
+ if (qp->r_len != 0) {
+ u32 rkey = be32_to_cpu(reth->rkey);
+ u64 vaddr = be64_to_cpu(reth->vaddr);
+ int ok;
+
+ /* Check rkey & NAK */
+ ok = ipath_rkey_ok(qp, &qp->r_sge,
+ qp->r_len, vaddr, rkey,
+ IB_ACCESS_REMOTE_WRITE);
+ if (unlikely(!ok))
+ goto nack_acc;
+ } else {
+ qp->r_sge.sg_list = NULL;
+ qp->r_sge.sge.mr = NULL;
+ qp->r_sge.sge.vaddr = NULL;
+ qp->r_sge.sge.length = 0;
+ qp->r_sge.sge.sge_length = 0;
+ }
+ if (opcode == OP(RDMA_WRITE_FIRST))
+ goto send_middle;
+ else if (opcode == OP(RDMA_WRITE_ONLY))
+ goto send_last;
+ if (!ipath_get_rwqe(qp, 1))
+ goto rnr_nak;
+ goto send_last_imm;
+
+ case OP(RDMA_READ_REQUEST): {
+ struct ipath_ack_entry *e;
+ u32 len;
+ u8 next;
+
+ if (unlikely(!(qp->qp_access_flags &
+ IB_ACCESS_REMOTE_READ)))
+ goto nack_inv;
+ next = qp->r_head_ack_queue + 1;
+ if (next > IPATH_MAX_RDMA_ATOMIC)
+ next = 0;
+ spin_lock_irqsave(&qp->s_lock, flags);
+ /* Double check we can process this while holding the s_lock. */
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+ goto unlock;
+ if (unlikely(next == qp->s_tail_ack_queue)) {
+ if (!qp->s_ack_queue[next].sent)
+ goto nack_inv_unlck;
+ ipath_update_ack_queue(qp, next);
+ }
+ e = &qp->s_ack_queue[qp->r_head_ack_queue];
+ /* RETH comes after BTH */
+ if (!header_in_data)
+ reth = &ohdr->u.rc.reth;
+ else {
+ reth = (struct ib_reth *)data;
+ data += sizeof(*reth);
+ }
+ len = be32_to_cpu(reth->length);
+ if (len) {
+ u32 rkey = be32_to_cpu(reth->rkey);
+ u64 vaddr = be64_to_cpu(reth->vaddr);
+ int ok;
+
+ /* Check rkey & NAK */
+ ok = ipath_rkey_ok(qp, &e->rdma_sge, len, vaddr,
+ rkey, IB_ACCESS_REMOTE_READ);
+ if (unlikely(!ok))
+ goto nack_acc_unlck;
+ /*
+ * Update the next expected PSN. We add 1 later
+ * below, so only add the remainder here.
+ */
+ if (len > pmtu)
+ qp->r_psn += (len - 1) / pmtu;
+ } else {
+ e->rdma_sge.sg_list = NULL;
+ e->rdma_sge.num_sge = 0;
+ e->rdma_sge.sge.mr = NULL;
+ e->rdma_sge.sge.vaddr = NULL;
+ e->rdma_sge.sge.length = 0;
+ e->rdma_sge.sge.sge_length = 0;
+ }
+ e->opcode = opcode;
+ e->sent = 0;
+ e->psn = psn;
+ /*
+ * We need to increment the MSN here instead of when we
+ * finish sending the result since a duplicate request would
+ * increment it more than once.
+ */
+ qp->r_msn++;
+ qp->r_psn++;
+ qp->r_state = opcode;
+ qp->r_nak_state = 0;
+ qp->r_head_ack_queue = next;
+
+ /* Schedule the send tasklet. */
+ ipath_schedule_send(qp);
+
+ goto unlock;
+ }
+
+ case OP(COMPARE_SWAP):
+ case OP(FETCH_ADD): {
+ struct ib_atomic_eth *ateth;
+ struct ipath_ack_entry *e;
+ u64 vaddr;
+ atomic64_t *maddr;
+ u64 sdata;
+ u32 rkey;
+ u8 next;
+
+ if (unlikely(!(qp->qp_access_flags &
+ IB_ACCESS_REMOTE_ATOMIC)))
+ goto nack_inv;
+ next = qp->r_head_ack_queue + 1;
+ if (next > IPATH_MAX_RDMA_ATOMIC)
+ next = 0;
+ spin_lock_irqsave(&qp->s_lock, flags);
+ /* Double check we can process this while holding the s_lock. */
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK))
+ goto unlock;
+ if (unlikely(next == qp->s_tail_ack_queue)) {
+ if (!qp->s_ack_queue[next].sent)
+ goto nack_inv_unlck;
+ ipath_update_ack_queue(qp, next);
+ }
+ if (!header_in_data)
+ ateth = &ohdr->u.atomic_eth;
+ else
+ ateth = (struct ib_atomic_eth *)data;
+ vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) |
+ be32_to_cpu(ateth->vaddr[1]);
+ if (unlikely(vaddr & (sizeof(u64) - 1)))
+ goto nack_inv_unlck;
+ rkey = be32_to_cpu(ateth->rkey);
+ /* Check rkey & NAK */
+ if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge,
+ sizeof(u64), vaddr, rkey,
+ IB_ACCESS_REMOTE_ATOMIC)))
+ goto nack_acc_unlck;
+ /* Perform atomic OP and save result. */
+ maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
+ sdata = be64_to_cpu(ateth->swap_data);
+ e = &qp->s_ack_queue[qp->r_head_ack_queue];
+ e->atomic_data = (opcode == OP(FETCH_ADD)) ?
+ (u64) atomic64_add_return(sdata, maddr) - sdata :
+ (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
+ be64_to_cpu(ateth->compare_data),
+ sdata);
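+ /*
+ * atomic_data now holds the value that was at the target address
+ * before the operation; it is returned to the requester in the
+ * atomic acknowledge.
+ */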
+ e->opcode = opcode;
+ e->sent = 0;
+ e->psn = psn & IPATH_PSN_MASK;
+ qp->r_msn++;
+ qp->r_psn++;
+ qp->r_state = opcode;
+ qp->r_nak_state = 0;
+ qp->r_head_ack_queue = next;
+
+ /* Schedule the send tasklet. */
+ ipath_schedule_send(qp);
+
+ goto unlock;
+ }
+
+ default:
+ /* NAK unknown opcodes. */
+ goto nack_inv;
+ }
+ qp->r_psn++;
+ qp->r_state = opcode;
+ qp->r_ack_psn = psn;
+ qp->r_nak_state = 0;
+ /* Send an ACK if requested or required. */
+ if (psn & (1 << 31))
+ goto send_ack;
+ goto done;
+
+rnr_nak:
+ qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
+ qp->r_ack_psn = qp->r_psn;
+ goto send_ack;
+
+nack_inv_unlck:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_inv:
+ ipath_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ qp->r_nak_state = IB_NAK_INVALID_REQUEST;
+ qp->r_ack_psn = qp->r_psn;
+ goto send_ack;
+
+nack_acc_unlck:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_acc:
+ ipath_rc_error(qp, IB_WC_LOC_PROT_ERR);
+ qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
+ qp->r_ack_psn = qp->r_psn;
+send_ack:
+ send_rc_ack(qp);
+ goto done;
+
+unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+ return;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_registers.h b/drivers/infiniband/hw/ipath/ipath_registers.h
new file mode 100644
index 000000000..8f44d0cf3
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_registers.h
@@ -0,0 +1,512 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPATH_REGISTERS_H
+#define _IPATH_REGISTERS_H
+
+/*
+ * This file should only be included by kernel source, and by the diags. It
+ * defines the registers, and their contents, for InfiniPath chips.
+ */
+
+/*
+ * These are the InfiniPath register and buffer bit definitions
+ * that are visible to software and needed only by the kernel
+ * and diag code. A few that are visible to protocol and user
+ * code are in ipath_common.h. Some bits are specific
+ * to a given chip implementation and have been moved to the
+ * chip-specific source files.
+ */
+
+/* kr_revision bits */
+#define INFINIPATH_R_CHIPREVMINOR_MASK 0xFF
+#define INFINIPATH_R_CHIPREVMINOR_SHIFT 0
+#define INFINIPATH_R_CHIPREVMAJOR_MASK 0xFF
+#define INFINIPATH_R_CHIPREVMAJOR_SHIFT 8
+#define INFINIPATH_R_ARCH_MASK 0xFF
+#define INFINIPATH_R_ARCH_SHIFT 16
+#define INFINIPATH_R_SOFTWARE_MASK 0xFF
+#define INFINIPATH_R_SOFTWARE_SHIFT 24
+#define INFINIPATH_R_BOARDID_MASK 0xFF
+#define INFINIPATH_R_BOARDID_SHIFT 32
+
+/* kr_control bits */
+#define INFINIPATH_C_FREEZEMODE 0x00000002
+#define INFINIPATH_C_LINKENABLE 0x00000004
+
+/* kr_sendctrl bits */
+#define INFINIPATH_S_DISARMPIOBUF_SHIFT 16
+#define INFINIPATH_S_UPDTHRESH_SHIFT 24
+#define INFINIPATH_S_UPDTHRESH_MASK 0x1f
+
+#define IPATH_S_ABORT 0
+#define IPATH_S_PIOINTBUFAVAIL 1
+#define IPATH_S_PIOBUFAVAILUPD 2
+#define IPATH_S_PIOENABLE 3
+#define IPATH_S_SDMAINTENABLE 9
+#define IPATH_S_SDMASINGLEDESCRIPTOR 10
+#define IPATH_S_SDMAENABLE 11
+#define IPATH_S_SDMAHALT 12
+#define IPATH_S_DISARM 31
+
+#define INFINIPATH_S_ABORT (1U << IPATH_S_ABORT)
+#define INFINIPATH_S_PIOINTBUFAVAIL (1U << IPATH_S_PIOINTBUFAVAIL)
+#define INFINIPATH_S_PIOBUFAVAILUPD (1U << IPATH_S_PIOBUFAVAILUPD)
+#define INFINIPATH_S_PIOENABLE (1U << IPATH_S_PIOENABLE)
+#define INFINIPATH_S_SDMAINTENABLE (1U << IPATH_S_SDMAINTENABLE)
+#define INFINIPATH_S_SDMASINGLEDESCRIPTOR \
+ (1U << IPATH_S_SDMASINGLEDESCRIPTOR)
+#define INFINIPATH_S_SDMAENABLE (1U << IPATH_S_SDMAENABLE)
+#define INFINIPATH_S_SDMAHALT (1U << IPATH_S_SDMAHALT)
+#define INFINIPATH_S_DISARM (1U << IPATH_S_DISARM)
+
+/* kr_rcvctrl bits that are the same on multiple chips */
+#define INFINIPATH_R_PORTENABLE_SHIFT 0
+#define INFINIPATH_R_QPMAP_ENABLE (1ULL << 38)
+
+/* kr_intstatus, kr_intclear, kr_intmask bits */
+#define INFINIPATH_I_SDMAINT 0x8000000000000000ULL
+#define INFINIPATH_I_SDMADISABLED 0x4000000000000000ULL
+#define INFINIPATH_I_ERROR 0x0000000080000000ULL
+#define INFINIPATH_I_SPIOSENT 0x0000000040000000ULL
+#define INFINIPATH_I_SPIOBUFAVAIL 0x0000000020000000ULL
+#define INFINIPATH_I_GPIO 0x0000000010000000ULL
+#define INFINIPATH_I_JINT 0x0000000004000000ULL
+
+/* kr_errorstatus, kr_errorclear, kr_errormask bits */
+#define INFINIPATH_E_RFORMATERR 0x0000000000000001ULL
+#define INFINIPATH_E_RVCRC 0x0000000000000002ULL
+#define INFINIPATH_E_RICRC 0x0000000000000004ULL
+#define INFINIPATH_E_RMINPKTLEN 0x0000000000000008ULL
+#define INFINIPATH_E_RMAXPKTLEN 0x0000000000000010ULL
+#define INFINIPATH_E_RLONGPKTLEN 0x0000000000000020ULL
+#define INFINIPATH_E_RSHORTPKTLEN 0x0000000000000040ULL
+#define INFINIPATH_E_RUNEXPCHAR 0x0000000000000080ULL
+#define INFINIPATH_E_RUNSUPVL 0x0000000000000100ULL
+#define INFINIPATH_E_REBP 0x0000000000000200ULL
+#define INFINIPATH_E_RIBFLOW 0x0000000000000400ULL
+#define INFINIPATH_E_RBADVERSION 0x0000000000000800ULL
+#define INFINIPATH_E_RRCVEGRFULL 0x0000000000001000ULL
+#define INFINIPATH_E_RRCVHDRFULL 0x0000000000002000ULL
+#define INFINIPATH_E_RBADTID 0x0000000000004000ULL
+#define INFINIPATH_E_RHDRLEN 0x0000000000008000ULL
+#define INFINIPATH_E_RHDR 0x0000000000010000ULL
+#define INFINIPATH_E_RIBLOSTLINK 0x0000000000020000ULL
+#define INFINIPATH_E_SENDSPECIALTRIGGER 0x0000000008000000ULL
+#define INFINIPATH_E_SDMADISABLED 0x0000000010000000ULL
+#define INFINIPATH_E_SMINPKTLEN 0x0000000020000000ULL
+#define INFINIPATH_E_SMAXPKTLEN 0x0000000040000000ULL
+#define INFINIPATH_E_SUNDERRUN 0x0000000080000000ULL
+#define INFINIPATH_E_SPKTLEN 0x0000000100000000ULL
+#define INFINIPATH_E_SDROPPEDSMPPKT 0x0000000200000000ULL
+#define INFINIPATH_E_SDROPPEDDATAPKT 0x0000000400000000ULL
+#define INFINIPATH_E_SPIOARMLAUNCH 0x0000000800000000ULL
+#define INFINIPATH_E_SUNEXPERRPKTNUM 0x0000001000000000ULL
+#define INFINIPATH_E_SUNSUPVL 0x0000002000000000ULL
+#define INFINIPATH_E_SENDBUFMISUSE 0x0000004000000000ULL
+#define INFINIPATH_E_SDMAGENMISMATCH 0x0000008000000000ULL
+#define INFINIPATH_E_SDMAOUTOFBOUND 0x0000010000000000ULL
+#define INFINIPATH_E_SDMATAILOUTOFBOUND 0x0000020000000000ULL
+#define INFINIPATH_E_SDMABASE 0x0000040000000000ULL
+#define INFINIPATH_E_SDMA1STDESC 0x0000080000000000ULL
+#define INFINIPATH_E_SDMARPYTAG 0x0000100000000000ULL
+#define INFINIPATH_E_SDMADWEN 0x0000200000000000ULL
+#define INFINIPATH_E_SDMAMISSINGDW 0x0000400000000000ULL
+#define INFINIPATH_E_SDMAUNEXPDATA 0x0000800000000000ULL
+#define INFINIPATH_E_IBSTATUSCHANGED 0x0001000000000000ULL
+#define INFINIPATH_E_INVALIDADDR 0x0002000000000000ULL
+#define INFINIPATH_E_RESET 0x0004000000000000ULL
+#define INFINIPATH_E_HARDWARE 0x0008000000000000ULL
+#define INFINIPATH_E_SDMADESCADDRMISALIGN 0x0010000000000000ULL
+#define INFINIPATH_E_INVALIDEEPCMD 0x0020000000000000ULL
+
+/*
+ * this is used to print "common" packet errors only when the
+ * __IPATH_ERRPKTDBG bit is set in ipath_debug.
+ */
+#define INFINIPATH_E_PKTERRS ( INFINIPATH_E_SPKTLEN \
+ | INFINIPATH_E_SDROPPEDDATAPKT | INFINIPATH_E_RVCRC \
+ | INFINIPATH_E_RICRC | INFINIPATH_E_RSHORTPKTLEN \
+ | INFINIPATH_E_REBP )
+
+/* Convenience for decoding Send DMA errors */
+#define INFINIPATH_E_SDMAERRS ( \
+ INFINIPATH_E_SDMAGENMISMATCH | INFINIPATH_E_SDMAOUTOFBOUND | \
+ INFINIPATH_E_SDMATAILOUTOFBOUND | INFINIPATH_E_SDMABASE | \
+ INFINIPATH_E_SDMA1STDESC | INFINIPATH_E_SDMARPYTAG | \
+ INFINIPATH_E_SDMADWEN | INFINIPATH_E_SDMAMISSINGDW | \
+ INFINIPATH_E_SDMAUNEXPDATA | \
+ INFINIPATH_E_SDMADESCADDRMISALIGN | \
+ INFINIPATH_E_SDMADISABLED | \
+ INFINIPATH_E_SENDBUFMISUSE)
+
+/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */
+/* TXEMEMPARITYERR bit 0: PIObuf, 1: PIOpbc, 2: launchfifo
+ * RXEMEMPARITYERR bit 0: rcvbuf, 1: lookupq, 2: expTID, 3: eagerTID
+ * bit 4: flag buffer, 5: datainfo, 6: header info */
+#define INFINIPATH_HWE_TXEMEMPARITYERR_MASK 0xFULL
+#define INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT 40
+#define INFINIPATH_HWE_RXEMEMPARITYERR_MASK 0x7FULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT 44
+#define INFINIPATH_HWE_IBCBUSTOSPCPARITYERR 0x4000000000000000ULL
+#define INFINIPATH_HWE_IBCBUSFRSPCPARITYERR 0x8000000000000000ULL
+/* txe mem parity errors (shift by INFINIPATH_HWE_TXEMEMPARITYERR_SHIFT) */
+#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOBUF 0x1ULL
+#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOPBC 0x2ULL
+#define INFINIPATH_HWE_TXEMEMPARITYERR_PIOLAUNCHFIFO 0x4ULL
+/* rxe mem parity errors (shift by INFINIPATH_HWE_RXEMEMPARITYERR_SHIFT) */
+#define INFINIPATH_HWE_RXEMEMPARITYERR_RCVBUF 0x01ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_LOOKUPQ 0x02ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_EXPTID 0x04ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_EAGERTID 0x08ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_FLAGBUF 0x10ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_DATAINFO 0x20ULL
+#define INFINIPATH_HWE_RXEMEMPARITYERR_HDRINFO 0x40ULL
+/* waldo specific -- find the rest in ipath_6110.c */
+#define INFINIPATH_HWE_RXDSYNCMEMPARITYERR 0x0000000400000000ULL
+/* 6120/7220 specific -- find the rest in ipath_6120.c and ipath_7220.c */
+#define INFINIPATH_HWE_MEMBISTFAILED 0x0040000000000000ULL
+
+/* kr_hwdiagctrl bits */
+#define INFINIPATH_DC_FORCETXEMEMPARITYERR_MASK 0xFULL
+#define INFINIPATH_DC_FORCETXEMEMPARITYERR_SHIFT 40
+#define INFINIPATH_DC_FORCERXEMEMPARITYERR_MASK 0x7FULL
+#define INFINIPATH_DC_FORCERXEMEMPARITYERR_SHIFT 44
+#define INFINIPATH_DC_FORCERXDSYNCMEMPARITYERR 0x0000000400000000ULL
+#define INFINIPATH_DC_COUNTERDISABLE 0x1000000000000000ULL
+#define INFINIPATH_DC_COUNTERWREN 0x2000000000000000ULL
+#define INFINIPATH_DC_FORCEIBCBUSTOSPCPARITYERR 0x4000000000000000ULL
+#define INFINIPATH_DC_FORCEIBCBUSFRSPCPARITYERR 0x8000000000000000ULL
+
+/* kr_ibcctrl bits */
+#define INFINIPATH_IBCC_FLOWCTRLPERIOD_MASK 0xFFULL
+#define INFINIPATH_IBCC_FLOWCTRLPERIOD_SHIFT 0
+#define INFINIPATH_IBCC_FLOWCTRLWATERMARK_MASK 0xFFULL
+#define INFINIPATH_IBCC_FLOWCTRLWATERMARK_SHIFT 8
+#define INFINIPATH_IBCC_LINKINITCMD_MASK 0x3ULL
+#define INFINIPATH_IBCC_LINKINITCMD_DISABLE 1
+/* cycle through TS1/TS2 till OK */
+#define INFINIPATH_IBCC_LINKINITCMD_POLL 2
+/* wait for TS1, then go on */
+#define INFINIPATH_IBCC_LINKINITCMD_SLEEP 3
+#define INFINIPATH_IBCC_LINKINITCMD_SHIFT 16
+#define INFINIPATH_IBCC_LINKCMD_MASK 0x3ULL
+#define INFINIPATH_IBCC_LINKCMD_DOWN 1 /* move to 0x11 */
+#define INFINIPATH_IBCC_LINKCMD_ARMED 2 /* move to 0x21 */
+#define INFINIPATH_IBCC_LINKCMD_ACTIVE 3 /* move to 0x31 */
+#define INFINIPATH_IBCC_LINKCMD_SHIFT 18
+#define INFINIPATH_IBCC_MAXPKTLEN_MASK 0x7FFULL
+#define INFINIPATH_IBCC_MAXPKTLEN_SHIFT 20
+#define INFINIPATH_IBCC_PHYERRTHRESHOLD_MASK 0xFULL
+#define INFINIPATH_IBCC_PHYERRTHRESHOLD_SHIFT 32
+#define INFINIPATH_IBCC_OVERRUNTHRESHOLD_MASK 0xFULL
+#define INFINIPATH_IBCC_OVERRUNTHRESHOLD_SHIFT 36
+#define INFINIPATH_IBCC_CREDITSCALE_MASK 0x7ULL
+#define INFINIPATH_IBCC_CREDITSCALE_SHIFT 40
+#define INFINIPATH_IBCC_LOOPBACK 0x8000000000000000ULL
+#define INFINIPATH_IBCC_LINKDOWNDEFAULTSTATE 0x4000000000000000ULL
+
+/* kr_ibcstatus bits */
+#define INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT 0
+#define INFINIPATH_IBCS_LINKSTATE_MASK 0x7
+
+#define INFINIPATH_IBCS_TXREADY 0x40000000
+#define INFINIPATH_IBCS_TXCREDITOK 0x80000000
+/* link training states (shift by
+ INFINIPATH_IBCS_LINKTRAININGSTATE_SHIFT) */
+#define INFINIPATH_IBCS_LT_STATE_DISABLED 0x00
+#define INFINIPATH_IBCS_LT_STATE_LINKUP 0x01
+#define INFINIPATH_IBCS_LT_STATE_POLLACTIVE 0x02
+#define INFINIPATH_IBCS_LT_STATE_POLLQUIET 0x03
+#define INFINIPATH_IBCS_LT_STATE_SLEEPDELAY 0x04
+#define INFINIPATH_IBCS_LT_STATE_SLEEPQUIET 0x05
+#define INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE 0x08
+#define INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG 0x09
+#define INFINIPATH_IBCS_LT_STATE_CFGWAITRMT 0x0a
+#define INFINIPATH_IBCS_LT_STATE_CFGIDLE 0x0b
+#define INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN 0x0c
+#define INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT 0x0e
+#define INFINIPATH_IBCS_LT_STATE_RECOVERIDLE 0x0f
+/* link state machine states (shift by ibcs_ls_shift) */
+#define INFINIPATH_IBCS_L_STATE_DOWN 0x0
+#define INFINIPATH_IBCS_L_STATE_INIT 0x1
+#define INFINIPATH_IBCS_L_STATE_ARM 0x2
+#define INFINIPATH_IBCS_L_STATE_ACTIVE 0x3
+#define INFINIPATH_IBCS_L_STATE_ACT_DEFER 0x4
+
+
+/* kr_extstatus bits */
+#define INFINIPATH_EXTS_SERDESPLLLOCK 0x1
+#define INFINIPATH_EXTS_GPIOIN_MASK 0xFFFFULL
+#define INFINIPATH_EXTS_GPIOIN_SHIFT 48
+
+/* kr_extctrl bits */
+#define INFINIPATH_EXTC_GPIOINVERT_MASK 0xFFFFULL
+#define INFINIPATH_EXTC_GPIOINVERT_SHIFT 32
+#define INFINIPATH_EXTC_GPIOOE_MASK 0xFFFFULL
+#define INFINIPATH_EXTC_GPIOOE_SHIFT 48
+#define INFINIPATH_EXTC_SERDESENABLE 0x80000000ULL
+#define INFINIPATH_EXTC_SERDESCONNECT 0x40000000ULL
+#define INFINIPATH_EXTC_SERDESENTRUNKING 0x20000000ULL
+#define INFINIPATH_EXTC_SERDESDISRXFIFO 0x10000000ULL
+#define INFINIPATH_EXTC_SERDESENPLPBK1 0x08000000ULL
+#define INFINIPATH_EXTC_SERDESENPLPBK2 0x04000000ULL
+#define INFINIPATH_EXTC_SERDESENENCDEC 0x02000000ULL
+#define INFINIPATH_EXTC_LED1SECPORT_ON 0x00000020ULL
+#define INFINIPATH_EXTC_LED2SECPORT_ON 0x00000010ULL
+#define INFINIPATH_EXTC_LED1PRIPORT_ON 0x00000008ULL
+#define INFINIPATH_EXTC_LED2PRIPORT_ON 0x00000004ULL
+#define INFINIPATH_EXTC_LEDGBLOK_ON 0x00000002ULL
+#define INFINIPATH_EXTC_LEDGBLERR_OFF 0x00000001ULL
+
+/* kr_partitionkey bits */
+#define INFINIPATH_PKEY_SIZE 16
+#define INFINIPATH_PKEY_MASK 0xFFFF
+#define INFINIPATH_PKEY_DEFAULT_PKEY 0xFFFF
+
+/* kr_serdesconfig0 bits */
+#define INFINIPATH_SERDC0_RESET_MASK 0xfULL /* overall reset bits */
+#define INFINIPATH_SERDC0_RESET_PLL 0x10000000ULL /* pll reset */
+/* tx idle enables (per lane) */
+#define INFINIPATH_SERDC0_TXIDLE 0xF000ULL
+/* rx detect enables (per lane) */
+#define INFINIPATH_SERDC0_RXDETECT_EN 0xF0000ULL
+/* L1 power down; use with RXDETECT, otherwise not used on IB side */
+#define INFINIPATH_SERDC0_L1PWR_DN 0xF0ULL
+
+/* common kr_xgxsconfig bits (or safe in all, even if not implemented) */
+#define INFINIPATH_XGXS_RX_POL_SHIFT 19
+#define INFINIPATH_XGXS_RX_POL_MASK 0xfULL
+
+
+/*
+ * IPATH_PIO_MAXIBHDR is the max IB header size allowed for in our
+ * PIO send buffers. This is well beyond anything currently
+ * defined in the InfiniBand spec.
+ */
+#define IPATH_PIO_MAXIBHDR 128
+
+typedef u64 ipath_err_t;
+
+/* The following change with the type of device, so they
+ * need to be part of the ipath_devdata struct, or
+ * we could have problems plugging in devices of
+ * different types (e.g. one HT, one PCIE)
+ * in one system, to be managed by one driver.
+ * On the other hand, this file may also be included
+ * by other code, so leave the declarations here
+ * temporarily. Minor footprint issue if a common-model
+ * linker is used, none if a C89+ linker is used.
+ */
+
+/* mask of defined bits for various registers */
+extern u64 infinipath_i_bitsextant;
+extern ipath_err_t infinipath_e_bitsextant, infinipath_hwe_bitsextant;
+
+/* masks that are different in various chips, or only exist in some chips */
+extern u32 infinipath_i_rcvavail_mask, infinipath_i_rcvurg_mask;
+
+/*
+ * These are the infinipath general register numbers (not offsets).
+ * The kernel registers are used directly, those beyond the kernel
+ * registers are calculated from one of the base registers. The use of
+ * an integer type doesn't allow type-checking as thorough as, say,
+ * an enum but allows for better hiding of chip differences.
+ */
+typedef const u16 ipath_kreg, /* infinipath general registers */
+ ipath_creg, /* infinipath counter registers */
+ ipath_sreg; /* kernel-only, infinipath send registers */
+
+/*
+ * These are the chip registers common to all infinipath chips, and
+ * used both by the kernel and the diagnostics or other user code.
+ * They are all implemented such that 64 bit accesses work.
+ * Some implement no more than 32 bits. Because 64 bit reads
+ * require 2 HT cmds on opteron, we access those with 32 bit
+ * reads for efficiency (they are written as 64 bits, since
+ * the extra 32 bits are nearly free on writes, and it slightly reduces
+ * complexity). The rest are all accessed as 64 bits.
+ */
+struct ipath_kregs {
+ /* These are the 32 bit group */
+ ipath_kreg kr_control;
+ ipath_kreg kr_counterregbase;
+ ipath_kreg kr_intmask;
+ ipath_kreg kr_intstatus;
+ ipath_kreg kr_pagealign;
+ ipath_kreg kr_portcnt;
+ ipath_kreg kr_rcvtidbase;
+ ipath_kreg kr_rcvtidcnt;
+ ipath_kreg kr_rcvegrbase;
+ ipath_kreg kr_rcvegrcnt;
+ ipath_kreg kr_scratch;
+ ipath_kreg kr_sendctrl;
+ ipath_kreg kr_sendpiobufbase;
+ ipath_kreg kr_sendpiobufcnt;
+ ipath_kreg kr_sendpiosize;
+ ipath_kreg kr_sendregbase;
+ ipath_kreg kr_userregbase;
+ /* These are the 64 bit group */
+ ipath_kreg kr_debugport;
+ ipath_kreg kr_debugportselect;
+ ipath_kreg kr_errorclear;
+ ipath_kreg kr_errormask;
+ ipath_kreg kr_errorstatus;
+ ipath_kreg kr_extctrl;
+ ipath_kreg kr_extstatus;
+ ipath_kreg kr_gpio_clear;
+ ipath_kreg kr_gpio_mask;
+ ipath_kreg kr_gpio_out;
+ ipath_kreg kr_gpio_status;
+ ipath_kreg kr_hwdiagctrl;
+ ipath_kreg kr_hwerrclear;
+ ipath_kreg kr_hwerrmask;
+ ipath_kreg kr_hwerrstatus;
+ ipath_kreg kr_ibcctrl;
+ ipath_kreg kr_ibcstatus;
+ ipath_kreg kr_intblocked;
+ ipath_kreg kr_intclear;
+ ipath_kreg kr_interruptconfig;
+ ipath_kreg kr_mdio;
+ ipath_kreg kr_partitionkey;
+ ipath_kreg kr_rcvbthqp;
+ ipath_kreg kr_rcvbufbase;
+ ipath_kreg kr_rcvbufsize;
+ ipath_kreg kr_rcvctrl;
+ ipath_kreg kr_rcvhdrcnt;
+ ipath_kreg kr_rcvhdrentsize;
+ ipath_kreg kr_rcvhdrsize;
+ ipath_kreg kr_rcvintmembase;
+ ipath_kreg kr_rcvintmemsize;
+ ipath_kreg kr_revision;
+ ipath_kreg kr_sendbuffererror;
+ ipath_kreg kr_sendpioavailaddr;
+ ipath_kreg kr_serdesconfig0;
+ ipath_kreg kr_serdesconfig1;
+ ipath_kreg kr_serdesstatus;
+ ipath_kreg kr_txintmembase;
+ ipath_kreg kr_txintmemsize;
+ ipath_kreg kr_xgxsconfig;
+ ipath_kreg kr_ibpllcfg;
+ /* use these two (and the following N ports) only with
+ * ipath_k*_kreg64_port(); not *kreg64() */
+ ipath_kreg kr_rcvhdraddr;
+ ipath_kreg kr_rcvhdrtailaddr;
+
+ /* remaining registers are not present on all types of infinipath
+ chips */
+ ipath_kreg kr_rcvpktledcnt;
+ ipath_kreg kr_pcierbuftestreg0;
+ ipath_kreg kr_pcierbuftestreg1;
+ ipath_kreg kr_pcieq0serdesconfig0;
+ ipath_kreg kr_pcieq0serdesconfig1;
+ ipath_kreg kr_pcieq0serdesstatus;
+ ipath_kreg kr_pcieq1serdesconfig0;
+ ipath_kreg kr_pcieq1serdesconfig1;
+ ipath_kreg kr_pcieq1serdesstatus;
+ ipath_kreg kr_hrtbt_guid;
+ ipath_kreg kr_ibcddrctrl;
+ ipath_kreg kr_ibcddrstatus;
+ ipath_kreg kr_jintreload;
+
+ /* send dma related regs */
+ ipath_kreg kr_senddmabase;
+ ipath_kreg kr_senddmalengen;
+ ipath_kreg kr_senddmatail;
+ ipath_kreg kr_senddmahead;
+ ipath_kreg kr_senddmaheadaddr;
+ ipath_kreg kr_senddmabufmask0;
+ ipath_kreg kr_senddmabufmask1;
+ ipath_kreg kr_senddmabufmask2;
+ ipath_kreg kr_senddmastatus;
+
+ /* SerDes related regs (IBA7220-only) */
+ ipath_kreg kr_ibserdesctrl;
+ ipath_kreg kr_ib_epbacc;
+ ipath_kreg kr_ib_epbtrans;
+ ipath_kreg kr_pcie_epbacc;
+ ipath_kreg kr_pcie_epbtrans;
+ ipath_kreg kr_ib_ddsrxeq;
+};
+
+struct ipath_cregs {
+ ipath_creg cr_badformatcnt;
+ ipath_creg cr_erricrccnt;
+ ipath_creg cr_errlinkcnt;
+ ipath_creg cr_errlpcrccnt;
+ ipath_creg cr_errpkey;
+ ipath_creg cr_errrcvflowctrlcnt;
+ ipath_creg cr_err_rlencnt;
+ ipath_creg cr_errslencnt;
+ ipath_creg cr_errtidfull;
+ ipath_creg cr_errtidvalid;
+ ipath_creg cr_errvcrccnt;
+ ipath_creg cr_ibstatuschange;
+ ipath_creg cr_intcnt;
+ ipath_creg cr_invalidrlencnt;
+ ipath_creg cr_invalidslencnt;
+ ipath_creg cr_lbflowstallcnt;
+ ipath_creg cr_iblinkdowncnt;
+ ipath_creg cr_iblinkerrrecovcnt;
+ ipath_creg cr_ibsymbolerrcnt;
+ ipath_creg cr_pktrcvcnt;
+ ipath_creg cr_pktrcvflowctrlcnt;
+ ipath_creg cr_pktsendcnt;
+ ipath_creg cr_pktsendflowcnt;
+ ipath_creg cr_portovflcnt;
+ ipath_creg cr_rcvebpcnt;
+ ipath_creg cr_rcvovflcnt;
+ ipath_creg cr_rxdroppktcnt;
+ ipath_creg cr_senddropped;
+ ipath_creg cr_sendstallcnt;
+ ipath_creg cr_sendunderruncnt;
+ ipath_creg cr_unsupvlcnt;
+ ipath_creg cr_wordrcvcnt;
+ ipath_creg cr_wordsendcnt;
+ ipath_creg cr_vl15droppedpktcnt;
+ ipath_creg cr_rxotherlocalphyerrcnt;
+ ipath_creg cr_excessbufferovflcnt;
+ ipath_creg cr_locallinkintegrityerrcnt;
+ ipath_creg cr_rxvlerrcnt;
+ ipath_creg cr_rxdlidfltrcnt;
+ ipath_creg cr_psstat;
+ ipath_creg cr_psstart;
+ ipath_creg cr_psinterval;
+ ipath_creg cr_psrcvdatacount;
+ ipath_creg cr_psrcvpktscount;
+ ipath_creg cr_psxmitdatacount;
+ ipath_creg cr_psxmitpktscount;
+ ipath_creg cr_psxmitwaitcount;
+};
+
+#endif /* _IPATH_REGISTERS_H */
diff --git a/drivers/infiniband/hw/ipath/ipath_ruc.c b/drivers/infiniband/hw/ipath/ipath_ruc.c
new file mode 100644
index 000000000..1f95bbaf7
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_ruc.c
@@ -0,0 +1,734 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+/*
+ * Convert the AETH RNR timeout code into the number of milliseconds.
+ */
+const u32 ib_ipath_rnr_table[32] = {
+ 656, /* 0 */
+ 1, /* 1 */
+ 1, /* 2 */
+ 1, /* 3 */
+ 1, /* 4 */
+ 1, /* 5 */
+ 1, /* 6 */
+ 1, /* 7 */
+ 1, /* 8 */
+ 1, /* 9 */
+ 1, /* A */
+ 1, /* B */
+ 1, /* C */
+ 1, /* D */
+ 2, /* E */
+ 2, /* F */
+ 3, /* 10 */
+ 4, /* 11 */
+ 6, /* 12 */
+ 8, /* 13 */
+ 11, /* 14 */
+ 16, /* 15 */
+ 21, /* 16 */
+ 31, /* 17 */
+ 41, /* 18 */
+ 62, /* 19 */
+ 82, /* 1A */
+ 123, /* 1B */
+ 164, /* 1C */
+ 246, /* 1D */
+ 328, /* 1E */
+ 492 /* 1F */
+};
+
+/**
+ * ipath_insert_rnr_queue - put QP on the RNR timeout list for the device
+ * @qp: the QP
+ *
+ * Called with the QP s_lock held and interrupts disabled.
+ * XXX Use a simple list for now. We might need a priority
+ * queue if we have lots of QPs waiting for RNR timeouts
+ * but that should be rare.
+ */
+void ipath_insert_rnr_queue(struct ipath_qp *qp)
+{
+ struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+
+ /* We already did a spin_lock_irqsave(), so just use spin_lock */
+ spin_lock(&dev->pending_lock);
+ if (list_empty(&dev->rnrwait))
+ list_add(&qp->timerwait, &dev->rnrwait);
+ else {
+ struct list_head *l = &dev->rnrwait;
+ struct ipath_qp *nqp = list_entry(l->next, struct ipath_qp,
+ timerwait);
+
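+ /*
+ * The list is kept sorted with each entry's s_rnr_timeout stored
+ * relative to the entry before it (a delta queue), so subtract the
+ * timeouts we pass and adjust the entry we insert in front of.
+ */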
+ while (qp->s_rnr_timeout >= nqp->s_rnr_timeout) {
+ qp->s_rnr_timeout -= nqp->s_rnr_timeout;
+ l = l->next;
+ if (l->next == &dev->rnrwait) {
+ nqp = NULL;
+ break;
+ }
+ nqp = list_entry(l->next, struct ipath_qp,
+ timerwait);
+ }
+ if (nqp)
+ nqp->s_rnr_timeout -= qp->s_rnr_timeout;
+ list_add(&qp->timerwait, l);
+ }
+ spin_unlock(&dev->pending_lock);
+}
+
+/**
+ * ipath_init_sge - validate an RWQE and fill in the SGE state
+ * @qp: the QP
+ * @wqe: the receive work queue entry to validate
+ * @lengthp: set to the total length of the RWQE's buffers
+ * @ss: the SGE state to fill in
+ *
+ * Return 1 if OK, 0 if an LKEY check failed (a completion with
+ * IB_WC_LOC_PROT_ERR is queued in that case).
+ */
+int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe,
+ u32 *lengthp, struct ipath_sge_state *ss)
+{
+ int i, j, ret;
+ struct ib_wc wc;
+
+ *lengthp = 0;
+ for (i = j = 0; i < wqe->num_sge; i++) {
+ if (wqe->sg_list[i].length == 0)
+ continue;
+ /* Check LKEY */
+ if (!ipath_lkey_ok(qp, j ? &ss->sg_list[j - 1] : &ss->sge,
+ &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE))
+ goto bad_lkey;
+ *lengthp += wqe->sg_list[i].length;
+ j++;
+ }
+ ss->num_sge = j;
+ ret = 1;
+ goto bail;
+
+bad_lkey:
+ memset(&wc, 0, sizeof(wc));
+ wc.wr_id = wqe->wr_id;
+ wc.status = IB_WC_LOC_PROT_ERR;
+ wc.opcode = IB_WC_RECV;
+ wc.qp = &qp->ibqp;
+ /* Signal solicited completion event. */
+ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+ ret = 0;
+bail:
+ return ret;
+}
+
+/**
+ * ipath_get_rwqe - copy the next RWQE into the QP's RWQE
+ * @qp: the QP
+ * @wr_id_only: update qp->r_wr_id only, not qp->r_sge
+ *
+ * Return 0 if no RWQE is available, otherwise return 1.
+ *
+ * Can be called from interrupt level.
+ */
+int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
+{
+ unsigned long flags;
+ struct ipath_rq *rq;
+ struct ipath_rwq *wq;
+ struct ipath_srq *srq;
+ struct ipath_rwqe *wqe;
+ void (*handler)(struct ib_event *, void *);
+ u32 tail;
+ int ret;
+
+ if (qp->ibqp.srq) {
+ srq = to_isrq(qp->ibqp.srq);
+ handler = srq->ibsrq.event_handler;
+ rq = &srq->rq;
+ } else {
+ srq = NULL;
+ handler = NULL;
+ rq = &qp->r_rq;
+ }
+
+ spin_lock_irqsave(&rq->lock, flags);
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
+ ret = 0;
+ goto unlock;
+ }
+
+ wq = rq->wq;
+ tail = wq->tail;
+ /* Validate tail before using it since it is user writable. */
+ if (tail >= rq->size)
+ tail = 0;
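+ /*
+ * Consume RWQEs until we find one whose SGEs pass the LKEY check;
+ * ipath_init_sge() completes bad ones with an error CQE.
+ */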
+ do {
+ if (unlikely(tail == wq->head)) {
+ ret = 0;
+ goto unlock;
+ }
+ /* Make sure entry is read after head index is read. */
+ smp_rmb();
+ wqe = get_rwqe_ptr(rq, tail);
+ if (++tail >= rq->size)
+ tail = 0;
+ if (wr_id_only)
+ break;
+ qp->r_sge.sg_list = qp->r_sg_list;
+ } while (!ipath_init_sge(qp, wqe, &qp->r_len, &qp->r_sge));
+ qp->r_wr_id = wqe->wr_id;
+ wq->tail = tail;
+
+ ret = 1;
+ set_bit(IPATH_R_WRID_VALID, &qp->r_aflags);
+ if (handler) {
+ u32 n;
+
+ /*
+ * validate head pointer value and compute
+ * the number of remaining WQEs.
+ */
+ n = wq->head;
+ if (n >= rq->size)
+ n = 0;
+ if (n < tail)
+ n += rq->size - tail;
+ else
+ n -= tail;
+ if (n < srq->limit) {
+ struct ib_event ev;
+
+ srq->limit = 0;
+ spin_unlock_irqrestore(&rq->lock, flags);
+ ev.device = qp->ibqp.device;
+ ev.element.srq = qp->ibqp.srq;
+ ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+ handler(&ev, srq->ibsrq.srq_context);
+ goto bail;
+ }
+ }
+unlock:
+ spin_unlock_irqrestore(&rq->lock, flags);
+bail:
+ return ret;
+}
+
+/**
+ * ipath_ruc_loopback - handle UC and RC loopback requests
+ * @sqp: the sending QP
+ *
+ * This is called from ipath_do_send() to
+ * forward a WQE addressed to the same HCA.
+ * Note that although we are single threaded due to the tasklet, we still
+ * have to protect against post_send(). We don't have to worry about
+ * receive interrupts since this is a connected protocol and all packets
+ * will pass through here.
+ */
+static void ipath_ruc_loopback(struct ipath_qp *sqp)
+{
+ struct ipath_ibdev *dev = to_idev(sqp->ibqp.device);
+ struct ipath_qp *qp;
+ struct ipath_swqe *wqe;
+ struct ipath_sge *sge;
+ unsigned long flags;
+ struct ib_wc wc;
+ u64 sdata;
+ atomic64_t *maddr;
+ enum ib_wc_status send_status;
+
+ /*
+ * Note that we check the responder QP state after
+ * checking the requester's state.
+ */
+ qp = ipath_lookup_qpn(&dev->qp_table, sqp->remote_qpn);
+
+ spin_lock_irqsave(&sqp->s_lock, flags);
+
+ /* Return if we are already busy processing a work request. */
+ if ((sqp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) ||
+ !(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_OR_FLUSH_SEND))
+ goto unlock;
+
+ sqp->s_flags |= IPATH_S_BUSY;
+
+again:
+ if (sqp->s_last == sqp->s_head)
+ goto clr_busy;
+ wqe = get_swqe_ptr(sqp, sqp->s_last);
+
+ /* Return if it is not OK to start a new work request. */
+ if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_NEXT_SEND_OK)) {
+ if (!(ib_ipath_state_ops[sqp->state] & IPATH_FLUSH_SEND))
+ goto clr_busy;
+ /* We are in the error state, flush the work request. */
+ send_status = IB_WC_WR_FLUSH_ERR;
+ goto flush_send;
+ }
+
+ /*
+ * We can rely on the entry not changing without the s_lock
+ * being held until we update s_last.
+ * We increment s_cur to indicate s_last is in progress.
+ */
+ if (sqp->s_last == sqp->s_cur) {
+ if (++sqp->s_cur >= sqp->s_size)
+ sqp->s_cur = 0;
+ }
+ spin_unlock_irqrestore(&sqp->s_lock, flags);
+
+ if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
+ dev->n_pkt_drops++;
+ /*
+ * For RC, the requester would timeout and retry so
+ * shortcut the timeouts and just signal too many retries.
+ */
+ if (sqp->ibqp.qp_type == IB_QPT_RC)
+ send_status = IB_WC_RETRY_EXC_ERR;
+ else
+ send_status = IB_WC_SUCCESS;
+ goto serr;
+ }
+
+ memset(&wc, 0, sizeof wc);
+ send_status = IB_WC_SUCCESS;
+
+ sqp->s_sge.sge = wqe->sg_list[0];
+ sqp->s_sge.sg_list = wqe->sg_list + 1;
+ sqp->s_sge.num_sge = wqe->wr.num_sge;
+ sqp->s_len = wqe->length;
+ switch (wqe->wr.opcode) {
+ case IB_WR_SEND_WITH_IMM:
+ wc.wc_flags = IB_WC_WITH_IMM;
+ wc.ex.imm_data = wqe->wr.ex.imm_data;
+ /* FALLTHROUGH */
+ case IB_WR_SEND:
+ if (!ipath_get_rwqe(qp, 0))
+ goto rnr_nak;
+ break;
+
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+ goto inv_err;
+ wc.wc_flags = IB_WC_WITH_IMM;
+ wc.ex.imm_data = wqe->wr.ex.imm_data;
+ if (!ipath_get_rwqe(qp, 1))
+ goto rnr_nak;
+ /* FALLTHROUGH */
+ case IB_WR_RDMA_WRITE:
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+ goto inv_err;
+ if (wqe->length == 0)
+ break;
+ if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, wqe->length,
+ wqe->wr.wr.rdma.remote_addr,
+ wqe->wr.wr.rdma.rkey,
+ IB_ACCESS_REMOTE_WRITE)))
+ goto acc_err;
+ break;
+
+ case IB_WR_RDMA_READ:
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+ goto inv_err;
+ if (unlikely(!ipath_rkey_ok(qp, &sqp->s_sge, wqe->length,
+ wqe->wr.wr.rdma.remote_addr,
+ wqe->wr.wr.rdma.rkey,
+ IB_ACCESS_REMOTE_READ)))
+ goto acc_err;
+ qp->r_sge.sge = wqe->sg_list[0];
+ qp->r_sge.sg_list = wqe->sg_list + 1;
+ qp->r_sge.num_sge = wqe->wr.num_sge;
+ break;
+
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+ goto inv_err;
+ if (unlikely(!ipath_rkey_ok(qp, &qp->r_sge, sizeof(u64),
+ wqe->wr.wr.atomic.remote_addr,
+ wqe->wr.wr.atomic.rkey,
+ IB_ACCESS_REMOTE_ATOMIC)))
+ goto acc_err;
+ /* Perform atomic OP and save result. */
+ maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
+ sdata = wqe->wr.wr.atomic.compare_add;
+ *(u64 *) sqp->s_sge.sge.vaddr =
+ (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
+ (u64) atomic64_add_return(sdata, maddr) - sdata :
+ (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
+ sdata, wqe->wr.wr.atomic.swap);
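+ /*
+ * The value that was at the target address before the operation
+ * has been stored into the requester's local buffer (s_sge).
+ */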
+ goto send_comp;
+
+ default:
+ send_status = IB_WC_LOC_QP_OP_ERR;
+ goto serr;
+ }
+
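+ /*
+ * Copy the data for this request from the source SGE list (s_sge)
+ * to the destination (r_sge), advancing through SGEs and MR
+ * segments as they are consumed.
+ */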
+ sge = &sqp->s_sge.sge;
+ while (sqp->s_len) {
+ u32 len = sqp->s_len;
+
+ if (len > sge->length)
+ len = sge->length;
+ if (len > sge->sge_length)
+ len = sge->sge_length;
+ BUG_ON(len == 0);
+ ipath_copy_sge(&qp->r_sge, sge->vaddr, len);
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (--sqp->s_sge.num_sge)
+ *sge = *sqp->s_sge.sg_list++;
+ } else if (sge->length == 0 && sge->mr != NULL) {
+ if (++sge->n >= IPATH_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr =
+ sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length =
+ sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ sqp->s_len -= len;
+ }
+
+ if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
+ goto send_comp;
+
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+ wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ else
+ wc.opcode = IB_WC_RECV;
+ wc.wr_id = qp->r_wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.byte_len = wqe->length;
+ wc.qp = &qp->ibqp;
+ wc.src_qp = qp->remote_qpn;
+ wc.slid = qp->remote_ah_attr.dlid;
+ wc.sl = qp->remote_ah_attr.sl;
+ wc.port_num = 1;
+ /* Signal completion event if the solicited bit is set. */
+ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+ wqe->wr.send_flags & IB_SEND_SOLICITED);
+
+send_comp:
+ spin_lock_irqsave(&sqp->s_lock, flags);
+flush_send:
+ sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
+ ipath_send_complete(sqp, wqe, send_status);
+ goto again;
+
+rnr_nak:
+ /* Handle RNR NAK */
+ if (qp->ibqp.qp_type == IB_QPT_UC)
+ goto send_comp;
+ /*
+ * Note: we don't need the s_lock held since the BUSY flag
+ * makes this single threaded.
+ */
+ if (sqp->s_rnr_retry == 0) {
+ send_status = IB_WC_RNR_RETRY_EXC_ERR;
+ goto serr;
+ }
+ if (sqp->s_rnr_retry_cnt < 7)
+ sqp->s_rnr_retry--;
+ spin_lock_irqsave(&sqp->s_lock, flags);
+ if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_RECV_OK))
+ goto clr_busy;
+ sqp->s_flags |= IPATH_S_WAITING;
+ dev->n_rnr_naks++;
+ sqp->s_rnr_timeout = ib_ipath_rnr_table[qp->r_min_rnr_timer];
+ ipath_insert_rnr_queue(sqp);
+ goto clr_busy;
+
+inv_err:
+ send_status = IB_WC_REM_INV_REQ_ERR;
+ wc.status = IB_WC_LOC_QP_OP_ERR;
+ goto err;
+
+acc_err:
+ send_status = IB_WC_REM_ACCESS_ERR;
+ wc.status = IB_WC_LOC_PROT_ERR;
+err:
+ /* responder goes to error state */
+ ipath_rc_error(qp, wc.status);
+
+serr:
+ spin_lock_irqsave(&sqp->s_lock, flags);
+ ipath_send_complete(sqp, wqe, send_status);
+ if (sqp->ibqp.qp_type == IB_QPT_RC) {
+ int lastwqe = ipath_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
+
+ sqp->s_flags &= ~IPATH_S_BUSY;
+ spin_unlock_irqrestore(&sqp->s_lock, flags);
+ if (lastwqe) {
+ struct ib_event ev;
+
+ ev.device = sqp->ibqp.device;
+ ev.element.qp = &sqp->ibqp;
+ ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+ sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
+ }
+ goto done;
+ }
+clr_busy:
+ sqp->s_flags &= ~IPATH_S_BUSY;
+unlock:
+ spin_unlock_irqrestore(&sqp->s_lock, flags);
+done:
+ if (qp && atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+}
+
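+/*
+ * Ask the chip for a PIO buffer available interrupt.  This is only
+ * needed when sends go out via PIO, i.e. when the device has no send
+ * DMA or the QP is an SMI QP; the scratch register read flushes the
+ * sendctrl write.
+ */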
+static void want_buffer(struct ipath_devdata *dd, struct ipath_qp *qp)
+{
+ if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA) ||
+ qp->ibqp.qp_type == IB_QPT_SMI) {
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ dd->ipath_sendctrl |= INFINIPATH_S_PIOINTBUFAVAIL;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+ dd->ipath_sendctrl);
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+ }
+}
+
+/**
+ * ipath_no_bufs_available - tell the layer driver we need buffers
+ * @qp: the QP that caused the problem
+ * @dev: the device we ran out of buffers on
+ *
+ * Called when we run out of PIO buffers.
+ * If we are now in the error state, return zero to flush the
+ * send work request.
+ */
+static int ipath_no_bufs_available(struct ipath_qp *qp,
+ struct ipath_ibdev *dev)
+{
+ unsigned long flags;
+ int ret = 1;
+
+ /*
+ * Note that as soon as want_buffer() is called and
+ * possibly before it returns, ipath_ib_piobufavail()
+ * could be called. Therefore, put QP on the piowait list before
+ * enabling the PIO avail interrupt.
+ */
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) {
+ dev->n_piowait++;
+ qp->s_flags |= IPATH_S_WAITING;
+ qp->s_flags &= ~IPATH_S_BUSY;
+ spin_lock(&dev->pending_lock);
+ if (list_empty(&qp->piowait))
+ list_add_tail(&qp->piowait, &dev->piowait);
+ spin_unlock(&dev->pending_lock);
+ } else
+ ret = 0;
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ if (ret)
+ want_buffer(dev->dd, qp);
+ return ret;
+}
+
+/**
+ * ipath_make_grh - construct a GRH header
+ * @dev: a pointer to the ipath device
+ * @hdr: a pointer to the GRH header being constructed
+ * @grh: the global route address to send to
+ * @hwords: the number of 32 bit words of header being sent
+ * @nwords: the number of 32 bit words of data being sent
+ *
+ * Return the size of the header in 32 bit words.
+ */
+u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr,
+ struct ib_global_route *grh, u32 hwords, u32 nwords)
+{
+ hdr->version_tclass_flow =
+ cpu_to_be32((6 << 28) |
+ (grh->traffic_class << 20) |
+ grh->flow_label);
+ hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2);
+ /* next_hdr is defined by C8-7 in ch. 8.4.1 */
+ hdr->next_hdr = 0x1B;
+ hdr->hop_limit = grh->hop_limit;
+ /* The SGID is 32-bit aligned. */
+ hdr->sgid.global.subnet_prefix = dev->gid_prefix;
+ hdr->sgid.global.interface_id = dev->dd->ipath_guid;
+ hdr->dgid = grh->dgid;
+
+ /* GRH header size in 32-bit words. */
+ return sizeof(struct ib_grh) / sizeof(u32);
+}
+
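+/**
+ * ipath_make_ruc_header - build the LRH and BTH for an RC/UC packet
+ * @dev: the device the packet will be sent on
+ * @qp: the QP sending the packet
+ * @ohdr: a pointer to the "other headers" (BTH) area being filled in
+ * @bth0: BTH word 0 (opcode and flags; the pkey and pad count are added here)
+ * @bth2: BTH word 2 (the packet sequence number)
+ *
+ * The header is constructed in qp->s_hdr; a GRH is prepended when the
+ * remote address requires one.
+ */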
+void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp,
+ struct ipath_other_headers *ohdr,
+ u32 bth0, u32 bth2)
+{
+ u16 lrh0;
+ u32 nwords;
+ u32 extra_bytes;
+
+ /* Construct the header. */
+ extra_bytes = -qp->s_cur_size & 3;
+ nwords = (qp->s_cur_size + extra_bytes) >> 2;
+ lrh0 = IPATH_LRH_BTH;
+ if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+ qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh,
+ &qp->remote_ah_attr.grh,
+ qp->s_hdrwords, nwords);
+ lrh0 = IPATH_LRH_GRH;
+ }
+ lrh0 |= qp->remote_ah_attr.sl << 4;
+ qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
+ qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+ qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
+ qp->s_hdr.lrh[3] = cpu_to_be16(dev->dd->ipath_lid |
+ qp->remote_ah_attr.src_path_bits);
+ bth0 |= ipath_get_pkey(dev->dd, qp->s_pkey_index);
+ bth0 |= extra_bytes << 20;
+ ohdr->bth[0] = cpu_to_be32(bth0 | (1 << 22));
+ ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
+ ohdr->bth[2] = cpu_to_be32(bth2);
+}
+
+/**
+ * ipath_do_send - perform a send on a QP
+ * @data: contains a pointer to the QP
+ *
+ * Process entries in the send work queue until credit or queue is
+ * exhausted. Only allow one CPU to send a packet per QP (tasklet).
+ * Otherwise, two threads could send packets out of order.
+ */
+void ipath_do_send(unsigned long data)
+{
+ struct ipath_qp *qp = (struct ipath_qp *)data;
+ struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+ int (*make_req)(struct ipath_qp *qp);
+ unsigned long flags;
+
+ if ((qp->ibqp.qp_type == IB_QPT_RC ||
+ qp->ibqp.qp_type == IB_QPT_UC) &&
+ qp->remote_ah_attr.dlid == dev->dd->ipath_lid) {
+ ipath_ruc_loopback(qp);
+ goto bail;
+ }
+
+ if (qp->ibqp.qp_type == IB_QPT_RC)
+ make_req = ipath_make_rc_req;
+ else if (qp->ibqp.qp_type == IB_QPT_UC)
+ make_req = ipath_make_uc_req;
+ else
+ make_req = ipath_make_ud_req;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ /* Return if we are already busy processing a work request. */
+ if ((qp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) ||
+ !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND)) {
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ goto bail;
+ }
+
+ qp->s_flags |= IPATH_S_BUSY;
+
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+again:
+ /* Check for a constructed packet to be sent. */
+ if (qp->s_hdrwords != 0) {
+ /*
+ * If no PIO bufs are available, return. An interrupt will
+ * call ipath_ib_piobufavail() when one is available.
+ */
+ if (ipath_verbs_send(qp, &qp->s_hdr, qp->s_hdrwords,
+ qp->s_cur_sge, qp->s_cur_size)) {
+ if (ipath_no_bufs_available(qp, dev))
+ goto bail;
+ }
+ dev->n_unicast_xmit++;
+ /* Record that we sent the packet and s_hdr is empty. */
+ qp->s_hdrwords = 0;
+ }
+
+ if (make_req(qp))
+ goto again;
+
+bail:;
+}
+
+/*
+ * This should be called with s_lock held.
+ */
+void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe,
+ enum ib_wc_status status)
+{
+ u32 old_last, last;
+
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND))
+ return;
+
+ /*
+ * Generate a completion entry if all WRs are signaled, this WR
+ * requested a signaled completion, or it failed.
+ * See ch. 11.2.4.1 and 10.7.3.1.
+ */
+ if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
+ (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
+ status != IB_WC_SUCCESS) {
+ struct ib_wc wc;
+
+ memset(&wc, 0, sizeof wc);
+ wc.wr_id = wqe->wr.wr_id;
+ wc.status = status;
+ wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
+ wc.qp = &qp->ibqp;
+ if (status == IB_WC_SUCCESS)
+ wc.byte_len = wqe->length;
+ ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc,
+ status != IB_WC_SUCCESS);
+ }
+
+ old_last = last = qp->s_last;
+ if (++last >= qp->s_size)
+ last = 0;
+ qp->s_last = last;
+ if (qp->s_cur == old_last)
+ qp->s_cur = last;
+ if (qp->s_tail == old_last)
+ qp->s_tail = last;
+ if (qp->state == IB_QPS_SQD && last == qp->s_cur)
+ qp->s_draining = 0;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_sdma.c b/drivers/infiniband/hw/ipath/ipath_sdma.c
new file mode 100644
index 000000000..17a517766
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_sdma.c
@@ -0,0 +1,818 @@
+/*
+ * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/gfp.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+#include "ipath_common.h"
+
+#define SDMA_DESCQ_SZ PAGE_SIZE /* 256 entries per 4KB page */
+
+static void vl15_watchdog_enq(struct ipath_devdata *dd)
+{
+ /* ipath_sdma_lock must already be held */
+ if (atomic_inc_return(&dd->ipath_sdma_vl15_count) == 1) {
+ unsigned long interval = (HZ + 19) / 20;
+ dd->ipath_sdma_vl15_timer.expires = jiffies + interval;
+ add_timer(&dd->ipath_sdma_vl15_timer);
+ }
+}
+
+static void vl15_watchdog_deq(struct ipath_devdata *dd)
+{
+ /* ipath_sdma_lock must already be held */
+ if (atomic_dec_return(&dd->ipath_sdma_vl15_count) != 0) {
+ unsigned long interval = (HZ + 19) / 20;
+ mod_timer(&dd->ipath_sdma_vl15_timer, jiffies + interval);
+ } else {
+ del_timer(&dd->ipath_sdma_vl15_timer);
+ }
+}
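+
+/*
+ * Illustrative note, not part of the original source: the (HZ + 19) / 20
+ * expression used by both watchdog helpers above is just HZ / 20 rounded
+ * up, so the VL15 watchdog re-arms roughly every 50 ms regardless of the
+ * configured HZ.
+ */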
+
+static void vl15_watchdog_timeout(unsigned long opaque)
+{
+ struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
+
+ if (atomic_read(&dd->ipath_sdma_vl15_count) != 0) {
+ ipath_dbg("vl15 watchdog timeout - clearing\n");
+ ipath_cancel_sends(dd, 1);
+ ipath_hol_down(dd);
+ } else {
+ ipath_dbg("vl15 watchdog timeout - "
+ "condition already cleared\n");
+ }
+}
+
+static void unmap_desc(struct ipath_devdata *dd, unsigned head)
+{
+ __le64 *descqp = &dd->ipath_sdma_descq[head].qw[0];
+ u64 desc[2];
+ dma_addr_t addr;
+ size_t len;
+
+ desc[0] = le64_to_cpu(descqp[0]);
+ desc[1] = le64_to_cpu(descqp[1]);
+
+ addr = (desc[1] << 32) | (desc[0] >> 32);
+ len = (desc[0] >> 14) & (0x7ffULL << 2);
+ dma_unmap_single(&dd->pcidev->dev, addr, len, DMA_TO_DEVICE);
+}
+
+/*
+ * ipath_sdma_lock should be locked before calling this.
+ */
+int ipath_sdma_make_progress(struct ipath_devdata *dd)
+{
+ struct list_head *lp = NULL;
+ struct ipath_sdma_txreq *txp = NULL;
+ u16 dmahead;
+ u16 start_idx = 0;
+ int progress = 0;
+
+ if (!list_empty(&dd->ipath_sdma_activelist)) {
+ lp = dd->ipath_sdma_activelist.next;
+ txp = list_entry(lp, struct ipath_sdma_txreq, list);
+ start_idx = txp->start_idx;
+ }
+
+ /*
+ * Read the SDMA head register in order to know that the
+ * interrupt clear has been written to the chip.
+ * Otherwise, we may not get an interrupt for the last
+ * descriptor in the queue.
+ */
+ dmahead = (u16)ipath_read_kreg32(dd, dd->ipath_kregs->kr_senddmahead);
+ /* sanity check return value for error handling (chip reset, etc.) */
+ if (dmahead >= dd->ipath_sdma_descq_cnt)
+ goto done;
+
+ while (dd->ipath_sdma_descq_head != dmahead) {
+ if (txp && txp->flags & IPATH_SDMA_TXREQ_F_FREEDESC &&
+ dd->ipath_sdma_descq_head == start_idx) {
+ unmap_desc(dd, dd->ipath_sdma_descq_head);
+ start_idx++;
+ if (start_idx == dd->ipath_sdma_descq_cnt)
+ start_idx = 0;
+ }
+
+ /* increment free count and head */
+ dd->ipath_sdma_descq_removed++;
+ if (++dd->ipath_sdma_descq_head == dd->ipath_sdma_descq_cnt)
+ dd->ipath_sdma_descq_head = 0;
+
+ if (txp && txp->next_descq_idx == dd->ipath_sdma_descq_head) {
+ /* move to notify list */
+ if (txp->flags & IPATH_SDMA_TXREQ_F_VL15)
+ vl15_watchdog_deq(dd);
+ list_move_tail(lp, &dd->ipath_sdma_notifylist);
+ if (!list_empty(&dd->ipath_sdma_activelist)) {
+ lp = dd->ipath_sdma_activelist.next;
+ txp = list_entry(lp, struct ipath_sdma_txreq,
+ list);
+ start_idx = txp->start_idx;
+ } else {
+ lp = NULL;
+ txp = NULL;
+ }
+ }
+ progress = 1;
+ }
+
+ if (progress)
+ tasklet_hi_schedule(&dd->ipath_sdma_notify_task);
+
+done:
+ return progress;
+}
+
+static void ipath_sdma_notify(struct ipath_devdata *dd, struct list_head *list)
+{
+ struct ipath_sdma_txreq *txp, *txp_next;
+
+ list_for_each_entry_safe(txp, txp_next, list, list) {
+ list_del_init(&txp->list);
+
+ if (txp->callback)
+ (*txp->callback)(txp->callback_cookie,
+ txp->callback_status);
+ }
+}
+
+static void sdma_notify_taskbody(struct ipath_devdata *dd)
+{
+ unsigned long flags;
+ struct list_head list;
+
+ INIT_LIST_HEAD(&list);
+
+ spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+ list_splice_init(&dd->ipath_sdma_notifylist, &list);
+
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+ ipath_sdma_notify(dd, &list);
+
+ /*
+ * The IB verbs layer needs to see the callback before getting
+ * the call to ipath_ib_piobufavail() because the callback
+ * handles releasing resources the next send will need.
+ * Otherwise, we could do these calls in
+ * ipath_sdma_make_progress().
+ */
+ ipath_ib_piobufavail(dd->verbs_dev);
+}
+
+static void sdma_notify_task(unsigned long opaque)
+{
+ struct ipath_devdata *dd = (struct ipath_devdata *)opaque;
+
+ if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+ sdma_notify_taskbody(dd);
+}
+
+static void dump_sdma_state(struct ipath_devdata *dd)
+{
+ unsigned long reg;
+
+ reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmastatus);
+ ipath_cdbg(VERBOSE, "kr_senddmastatus: 0x%016lx\n", reg);
+
+ reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_sendctrl);
+ ipath_cdbg(VERBOSE, "kr_sendctrl: 0x%016lx\n", reg);
+
+ reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask0);
+ ipath_cdbg(VERBOSE, "kr_senddmabufmask0: 0x%016lx\n", reg);
+
+ reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask1);
+ ipath_cdbg(VERBOSE, "kr_senddmabufmask1: 0x%016lx\n", reg);
+
+ reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmabufmask2);
+ ipath_cdbg(VERBOSE, "kr_senddmabufmask2: 0x%016lx\n", reg);
+
+ reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmatail);
+ ipath_cdbg(VERBOSE, "kr_senddmatail: 0x%016lx\n", reg);
+
+ reg = ipath_read_kreg64(dd, dd->ipath_kregs->kr_senddmahead);
+ ipath_cdbg(VERBOSE, "kr_senddmahead: 0x%016lx\n", reg);
+}
+
+static void sdma_abort_task(unsigned long opaque)
+{
+ struct ipath_devdata *dd = (struct ipath_devdata *) opaque;
+ u64 status;
+ unsigned long flags;
+
+ if (test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+ return;
+
+ spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+ status = dd->ipath_sdma_status & IPATH_SDMA_ABORT_MASK;
+
+ /* nothing to do */
+ if (status == IPATH_SDMA_ABORT_NONE)
+ goto unlock;
+
+ /* ipath_sdma_abort() is done, waiting for interrupt */
+ if (status == IPATH_SDMA_ABORT_DISARMED) {
+ if (time_before(jiffies, dd->ipath_sdma_abort_intr_timeout))
+ goto resched_noprint;
+ /* give up, intr got lost somewhere */
+ ipath_dbg("give up waiting for SDMADISABLED intr\n");
+ __set_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
+ status = IPATH_SDMA_ABORT_ABORTED;
+ }
+
+ /* everything is stopped, time to clean up and restart */
+ if (status == IPATH_SDMA_ABORT_ABORTED) {
+ struct ipath_sdma_txreq *txp, *txpnext;
+ u64 hwstatus;
+ int notify = 0;
+
+ hwstatus = ipath_read_kreg64(dd,
+ dd->ipath_kregs->kr_senddmastatus);
+
+ if ((hwstatus & (IPATH_SDMA_STATUS_SCORE_BOARD_DRAIN_IN_PROG |
+ IPATH_SDMA_STATUS_ABORT_IN_PROG |
+ IPATH_SDMA_STATUS_INTERNAL_SDMA_ENABLE)) ||
+ !(hwstatus & IPATH_SDMA_STATUS_SCB_EMPTY)) {
+ if (dd->ipath_sdma_reset_wait > 0) {
+ /* not done shutting down sdma */
+ --dd->ipath_sdma_reset_wait;
+ goto resched;
+ }
+ ipath_cdbg(VERBOSE, "gave up waiting for quiescent "
+ "status after SDMA reset, continuing\n");
+ dump_sdma_state(dd);
+ }
+
+ /* dequeue all "sent" requests */
+ list_for_each_entry_safe(txp, txpnext,
+ &dd->ipath_sdma_activelist, list) {
+ txp->callback_status = IPATH_SDMA_TXREQ_S_ABORTED;
+ if (txp->flags & IPATH_SDMA_TXREQ_F_VL15)
+ vl15_watchdog_deq(dd);
+ list_move_tail(&txp->list, &dd->ipath_sdma_notifylist);
+ notify = 1;
+ }
+ if (notify)
+ tasklet_hi_schedule(&dd->ipath_sdma_notify_task);
+
+ /* reset our notion of head and tail */
+ dd->ipath_sdma_descq_tail = 0;
+ dd->ipath_sdma_descq_head = 0;
+ dd->ipath_sdma_head_dma[0] = 0;
+ dd->ipath_sdma_generation = 0;
+ dd->ipath_sdma_descq_removed = dd->ipath_sdma_descq_added;
+
+ /* Reset SendDmaLenGen */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen,
+ (u64) dd->ipath_sdma_descq_cnt | (1ULL << 18));
+
+ /* done with sdma state for a bit */
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+ /*
+ * Don't restart sdma here (with the exception
+ * below). Wait until link is up to ACTIVE. VL15 MADs
+ * used to bring the link up use PIO, and multiple link
+ * transitions otherwise cause the sdma engine to be
+ * stopped and started multiple times.
+ * The disable is done here, including the shadow,
+ * so the state is kept consistent.
+ * See ipath_restart_sdma() for the actual starting
+ * of sdma.
+ */
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+ dd->ipath_sendctrl);
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+ /* make sure I see next message */
+ dd->ipath_sdma_abort_jiffies = 0;
+
+ /*
+ * Not everything that takes SDMA offline is a link
+ * status change. If the link was up, restart SDMA.
+ */
+ if (dd->ipath_flags & IPATH_LINKACTIVE)
+ ipath_restart_sdma(dd);
+
+ goto done;
+ }
+
+resched:
+ /*
+ * For now, keep spinning.
+ * JAG - it is bad for the default case here to be a loop with no
+ * state change.
+ */
+ if (time_after(jiffies, dd->ipath_sdma_abort_jiffies)) {
+ ipath_dbg("looping with status 0x%08lx\n",
+ dd->ipath_sdma_status);
+ dd->ipath_sdma_abort_jiffies = jiffies + 5 * HZ;
+ }
+resched_noprint:
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+ if (!test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+ tasklet_hi_schedule(&dd->ipath_sdma_abort_task);
+ return;
+
+unlock:
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+done:
+ return;
+}
+
+/*
+ * This is called from interrupt context.
+ */
+void ipath_sdma_intr(struct ipath_devdata *dd)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+ (void) ipath_sdma_make_progress(dd);
+
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+}
+
+static int alloc_sdma(struct ipath_devdata *dd)
+{
+ int ret = 0;
+
+ /* Allocate memory for SendDMA descriptor FIFO */
+ dd->ipath_sdma_descq = dma_alloc_coherent(&dd->pcidev->dev,
+ SDMA_DESCQ_SZ, &dd->ipath_sdma_descq_phys, GFP_KERNEL);
+
+ if (!dd->ipath_sdma_descq) {
+ ipath_dev_err(dd, "failed to allocate SendDMA descriptor "
+ "FIFO memory\n");
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ dd->ipath_sdma_descq_cnt =
+ SDMA_DESCQ_SZ / sizeof(struct ipath_sdma_desc);
+
+ /* Allocate memory for DMA of head register to memory */
+ dd->ipath_sdma_head_dma = dma_alloc_coherent(&dd->pcidev->dev,
+ PAGE_SIZE, &dd->ipath_sdma_head_phys, GFP_KERNEL);
+ if (!dd->ipath_sdma_head_dma) {
+ ipath_dev_err(dd, "failed to allocate SendDMA head memory\n");
+ ret = -ENOMEM;
+ goto cleanup_descq;
+ }
+ dd->ipath_sdma_head_dma[0] = 0;
+
+ init_timer(&dd->ipath_sdma_vl15_timer);
+ dd->ipath_sdma_vl15_timer.function = vl15_watchdog_timeout;
+ dd->ipath_sdma_vl15_timer.data = (unsigned long)dd;
+ atomic_set(&dd->ipath_sdma_vl15_count, 0);
+
+ goto done;
+
+cleanup_descq:
+ dma_free_coherent(&dd->pcidev->dev, SDMA_DESCQ_SZ,
+ (void *)dd->ipath_sdma_descq, dd->ipath_sdma_descq_phys);
+ dd->ipath_sdma_descq = NULL;
+ dd->ipath_sdma_descq_phys = 0;
+done:
+ return ret;
+}
+
+int setup_sdma(struct ipath_devdata *dd)
+{
+ int ret = 0;
+ unsigned i, n;
+ u64 tmp64;
+ u64 senddmabufmask[3] = { 0 };
+ unsigned long flags;
+
+ ret = alloc_sdma(dd);
+ if (ret)
+ goto done;
+
+ if (!dd->ipath_sdma_descq) {
+ ipath_dev_err(dd, "SendDMA memory not allocated\n");
+ goto done;
+ }
+
+ /*
+ * Set initial status as if we had been up, then gone down.
+ * This lets initial start on transition to ACTIVE be the
+ * same as restart after link flap.
+ */
+ dd->ipath_sdma_status = IPATH_SDMA_ABORT_ABORTED;
+ dd->ipath_sdma_abort_jiffies = 0;
+ dd->ipath_sdma_generation = 0;
+ dd->ipath_sdma_descq_tail = 0;
+ dd->ipath_sdma_descq_head = 0;
+ dd->ipath_sdma_descq_removed = 0;
+ dd->ipath_sdma_descq_added = 0;
+
+ /* Set SendDmaBase */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabase,
+ dd->ipath_sdma_descq_phys);
+ /* Set SendDmaLenGen */
+ tmp64 = dd->ipath_sdma_descq_cnt;
+ tmp64 |= 1<<18; /* enable generation checking */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen, tmp64);
+ /* Set SendDmaTail */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail,
+ dd->ipath_sdma_descq_tail);
+ /* Set SendDmaHeadAddr */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmaheadaddr,
+ dd->ipath_sdma_head_phys);
+
+ /*
+ * Reserve all the former "kernel" piobufs, using the high number range
+ * so we get as many 4K buffers as possible.
+ */
+ n = dd->ipath_piobcnt2k + dd->ipath_piobcnt4k;
+ i = dd->ipath_lastport_piobuf + dd->ipath_pioreserved;
+ ipath_chg_pioavailkernel(dd, i, n - i , 0);
+ for (; i < n; ++i) {
+ unsigned word = i / 64;
+ unsigned bit = i & 63;
+ BUG_ON(word >= 3);
+ senddmabufmask[word] |= 1ULL << bit;
+ }
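+ /*
+ * Illustrative note, not from the original source: the loop above
+ * builds a 3 x 64-bit bitmap of the PIO buffers handed to SDMA.
+ * For example, buffer index 70 maps to word 70 / 64 = 1 and bit
+ * 70 & 63 = 6, i.e. bit 6 of kr_senddmabufmask1.
+ */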
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask0,
+ senddmabufmask[0]);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask1,
+ senddmabufmask[1]);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask2,
+ senddmabufmask[2]);
+
+ INIT_LIST_HEAD(&dd->ipath_sdma_activelist);
+ INIT_LIST_HEAD(&dd->ipath_sdma_notifylist);
+
+ tasklet_init(&dd->ipath_sdma_notify_task, sdma_notify_task,
+ (unsigned long) dd);
+ tasklet_init(&dd->ipath_sdma_abort_task, sdma_abort_task,
+ (unsigned long) dd);
+
+ /*
+ * No use turning on SDMA here, as the link is probably not ACTIVE.
+ * Just mark it RUNNING, enable the interrupt, and let
+ * ipath_restart_sdma() actually enable SDMA on the link transition
+ * to ACTIVE.
+ */
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ dd->ipath_sendctrl |= INFINIPATH_S_SDMAINTENABLE;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ __set_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status);
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+done:
+ return ret;
+}
+
+void teardown_sdma(struct ipath_devdata *dd)
+{
+ struct ipath_sdma_txreq *txp, *txpnext;
+ unsigned long flags;
+ dma_addr_t sdma_head_phys = 0;
+ dma_addr_t sdma_descq_phys = 0;
+ void *sdma_descq = NULL;
+ void *sdma_head_dma = NULL;
+
+ spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+ __clear_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status);
+ __set_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
+ __set_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status);
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+ tasklet_kill(&dd->ipath_sdma_abort_task);
+ tasklet_kill(&dd->ipath_sdma_notify_task);
+
+ /* turn off sdma */
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
+ dd->ipath_sendctrl);
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+ spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+ /* dequeue all "sent" requests */
+ list_for_each_entry_safe(txp, txpnext, &dd->ipath_sdma_activelist,
+ list) {
+ txp->callback_status = IPATH_SDMA_TXREQ_S_SHUTDOWN;
+ if (txp->flags & IPATH_SDMA_TXREQ_F_VL15)
+ vl15_watchdog_deq(dd);
+ list_move_tail(&txp->list, &dd->ipath_sdma_notifylist);
+ }
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+ sdma_notify_taskbody(dd);
+
+ del_timer_sync(&dd->ipath_sdma_vl15_timer);
+
+ spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+ dd->ipath_sdma_abort_jiffies = 0;
+
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabase, 0);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmalengen, 0);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, 0);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmaheadaddr, 0);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask0, 0);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask1, 0);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmabufmask2, 0);
+
+ if (dd->ipath_sdma_head_dma) {
+ sdma_head_dma = (void *) dd->ipath_sdma_head_dma;
+ sdma_head_phys = dd->ipath_sdma_head_phys;
+ dd->ipath_sdma_head_dma = NULL;
+ dd->ipath_sdma_head_phys = 0;
+ }
+
+ if (dd->ipath_sdma_descq) {
+ sdma_descq = dd->ipath_sdma_descq;
+ sdma_descq_phys = dd->ipath_sdma_descq_phys;
+ dd->ipath_sdma_descq = NULL;
+ dd->ipath_sdma_descq_phys = 0;
+ }
+
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+ if (sdma_head_dma)
+ dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+ sdma_head_dma, sdma_head_phys);
+
+ if (sdma_descq)
+ dma_free_coherent(&dd->pcidev->dev, SDMA_DESCQ_SZ,
+ sdma_descq, sdma_descq_phys);
+}
+
+/*
+ * [Re]start SDMA, if we use it, and it's not already OK.
+ * This is called on transition to link ACTIVE, either the first or
+ * subsequent times.
+ */
+void ipath_restart_sdma(struct ipath_devdata *dd)
+{
+ unsigned long flags;
+ int needed = 1;
+
+ if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA))
+ goto bail;
+
+ /*
+ * First, make sure we should restart at all: check that we are
+ * "RUNNING" (not in teardown) and not "SHUTDOWN".
+ */
+ spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+ if (!test_bit(IPATH_SDMA_RUNNING, &dd->ipath_sdma_status)
+ || test_bit(IPATH_SDMA_SHUTDOWN, &dd->ipath_sdma_status))
+ needed = 0;
+ else {
+ __clear_bit(IPATH_SDMA_DISABLED, &dd->ipath_sdma_status);
+ __clear_bit(IPATH_SDMA_DISARMED, &dd->ipath_sdma_status);
+ __clear_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status);
+ }
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+ if (!needed) {
+ ipath_dbg("invalid attempt to restart SDMA, status 0x%08lx\n",
+ dd->ipath_sdma_status);
+ goto bail;
+ }
+ spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
+ /*
+ * Clear first, just to be safe; the chip only acts on the
+ * enable bit's 0->1 transition.
+ */
+ dd->ipath_sendctrl &= ~INFINIPATH_S_SDMAENABLE;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ dd->ipath_sendctrl |= INFINIPATH_S_SDMAENABLE;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl, dd->ipath_sendctrl);
+ ipath_read_kreg64(dd, dd->ipath_kregs->kr_scratch);
+ spin_unlock_irqrestore(&dd->ipath_sendctrl_lock, flags);
+
+ /* notify upper layers */
+ ipath_ib_piobufavail(dd->verbs_dev);
+
+bail:
+ return;
+}
+
+static inline void make_sdma_desc(struct ipath_devdata *dd,
+ u64 *sdmadesc, u64 addr, u64 dwlen, u64 dwoffset)
+{
+ WARN_ON(addr & 3);
+ /* SDmaPhyAddr[47:32] */
+ sdmadesc[1] = addr >> 32;
+ /* SDmaPhyAddr[31:0] */
+ sdmadesc[0] = (addr & 0xfffffffcULL) << 32;
+ /* SDmaGeneration[1:0] */
+ sdmadesc[0] |= (dd->ipath_sdma_generation & 3ULL) << 30;
+ /* SDmaDwordCount[10:0] */
+ sdmadesc[0] |= (dwlen & 0x7ffULL) << 16;
+ /* SDmaBufOffset[12:2] */
+ sdmadesc[0] |= dwoffset & 0x7ffULL;
+}
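+
+/*
+ * Illustrative sketch, not part of the original driver: decode_sdma_desc()
+ * reverses the qword packing done by make_sdma_desc() above, using the
+ * same bit positions documented there. The helper name and its
+ * out-parameters are assumptions made for this example.
+ */
+static inline void decode_sdma_desc(const u64 *sdmadesc, u64 *addr,
+ unsigned *gen, unsigned *dwlen, unsigned *dwoffset)
+{
+ /* SDmaPhyAddr[31:0] from qword0[63:32], SDmaPhyAddr[47:32] from qword1 */
+ *addr = (sdmadesc[0] >> 32) | (sdmadesc[1] << 32);
+ /* SDmaGeneration[1:0] */
+ *gen = (sdmadesc[0] >> 30) & 3;
+ /* SDmaDwordCount[10:0] */
+ *dwlen = (sdmadesc[0] >> 16) & 0x7ff;
+ /* SDmaBufOffset[12:2] */
+ *dwoffset = sdmadesc[0] & 0x7ff;
+}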
+
+/*
+ * This function queues one IB packet onto the send DMA queue per call.
+ * The caller is responsible for checking:
+ * 1) The number of send DMA descriptor entries is less than the size of
+ * the descriptor queue.
+ * 2) The IB SGE addresses and lengths are 32-bit aligned
+ * (except possibly the last SGE's length)
+ * 3) The SGE addresses are suitable for passing to dma_map_single().
+ */
+int ipath_sdma_verbs_send(struct ipath_devdata *dd,
+ struct ipath_sge_state *ss, u32 dwords,
+ struct ipath_verbs_txreq *tx)
+{
+
+ unsigned long flags;
+ struct ipath_sge *sge;
+ int ret = 0;
+ u16 tail;
+ __le64 *descqp;
+ u64 sdmadesc[2];
+ u32 dwoffset;
+ dma_addr_t addr;
+
+ if ((tx->map_len + (dwords<<2)) > dd->ipath_ibmaxlen) {
+ ipath_dbg("packet size %X > ibmax %X, fail\n",
+ tx->map_len + (dwords<<2), dd->ipath_ibmaxlen);
+ ret = -EMSGSIZE;
+ goto fail;
+ }
+
+ spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+retry:
+ if (unlikely(test_bit(IPATH_SDMA_ABORTING, &dd->ipath_sdma_status))) {
+ ret = -EBUSY;
+ goto unlock;
+ }
+
+ if (tx->txreq.sg_count > ipath_sdma_descq_freecnt(dd)) {
+ if (ipath_sdma_make_progress(dd))
+ goto retry;
+ ret = -ENOBUFS;
+ goto unlock;
+ }
+
+ addr = dma_map_single(&dd->pcidev->dev, tx->txreq.map_addr,
+ tx->map_len, DMA_TO_DEVICE);
+ if (dma_mapping_error(&dd->pcidev->dev, addr))
+ goto ioerr;
+
+ dwoffset = tx->map_len >> 2;
+ make_sdma_desc(dd, sdmadesc, (u64) addr, dwoffset, 0);
+
+ /* SDmaFirstDesc */
+ sdmadesc[0] |= 1ULL << 12;
+ if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_USELARGEBUF)
+ sdmadesc[0] |= 1ULL << 14; /* SDmaUseLargeBuf */
+
+ /* write to the descq */
+ tail = dd->ipath_sdma_descq_tail;
+ descqp = &dd->ipath_sdma_descq[tail].qw[0];
+ *descqp++ = cpu_to_le64(sdmadesc[0]);
+ *descqp++ = cpu_to_le64(sdmadesc[1]);
+
+ if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEDESC)
+ tx->txreq.start_idx = tail;
+
+ /* increment the tail */
+ if (++tail == dd->ipath_sdma_descq_cnt) {
+ tail = 0;
+ descqp = &dd->ipath_sdma_descq[0].qw[0];
+ ++dd->ipath_sdma_generation;
+ }
+
+ sge = &ss->sge;
+ while (dwords) {
+ u32 dw;
+ u32 len;
+
+ len = dwords << 2;
+ if (len > sge->length)
+ len = sge->length;
+ if (len > sge->sge_length)
+ len = sge->sge_length;
+ BUG_ON(len == 0);
+ dw = (len + 3) >> 2;
+ addr = dma_map_single(&dd->pcidev->dev, sge->vaddr, dw << 2,
+ DMA_TO_DEVICE);
+ if (dma_mapping_error(&dd->pcidev->dev, addr))
+ goto unmap;
+ make_sdma_desc(dd, sdmadesc, (u64) addr, dw, dwoffset);
+ /* SDmaUseLargeBuf has to be set in every descriptor */
+ if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_USELARGEBUF)
+ sdmadesc[0] |= 1ULL << 14;
+ /* write to the descq */
+ *descqp++ = cpu_to_le64(sdmadesc[0]);
+ *descqp++ = cpu_to_le64(sdmadesc[1]);
+
+ /* increment the tail */
+ if (++tail == dd->ipath_sdma_descq_cnt) {
+ tail = 0;
+ descqp = &dd->ipath_sdma_descq[0].qw[0];
+ ++dd->ipath_sdma_generation;
+ }
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (--ss->num_sge)
+ *sge = *ss->sg_list++;
+ } else if (sge->length == 0 && sge->mr != NULL) {
+ if (++sge->n >= IPATH_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr =
+ sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length =
+ sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+
+ dwoffset += dw;
+ dwords -= dw;
+ }
+
+ if (!tail)
+ descqp = &dd->ipath_sdma_descq[dd->ipath_sdma_descq_cnt].qw[0];
+ descqp -= 2;
+ /* SDmaLastDesc */
+ descqp[0] |= cpu_to_le64(1ULL << 11);
+ if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_INTREQ) {
+ /* SDmaIntReq */
+ descqp[0] |= cpu_to_le64(1ULL << 15);
+ }
+
+ /* Commit writes to memory and advance the tail on the chip */
+ wmb();
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, tail);
+
+ tx->txreq.next_descq_idx = tail;
+ tx->txreq.callback_status = IPATH_SDMA_TXREQ_S_OK;
+ dd->ipath_sdma_descq_tail = tail;
+ dd->ipath_sdma_descq_added += tx->txreq.sg_count;
+ list_add_tail(&tx->txreq.list, &dd->ipath_sdma_activelist);
+ if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_VL15)
+ vl15_watchdog_enq(dd);
+ goto unlock;
+
+unmap:
+ while (tail != dd->ipath_sdma_descq_tail) {
+ if (!tail)
+ tail = dd->ipath_sdma_descq_cnt - 1;
+ else
+ tail--;
+ unmap_desc(dd, tail);
+ }
+ioerr:
+ ret = -EIO;
+unlock:
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+fail:
+ return ret;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_srq.c b/drivers/infiniband/hw/ipath/ipath_srq.c
new file mode 100644
index 000000000..26271984b
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_srq.c
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "ipath_verbs.h"
+
+/**
+ * ipath_post_srq_receive - post a receive on a shared receive queue
+ * @ibsrq: the SRQ to post the receive on
+ * @wr: the list of work requests to post
+ * @bad_wr: the first WR to cause a problem is put here
+ *
+ * This may be called from interrupt context.
+ */
+int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct ipath_srq *srq = to_isrq(ibsrq);
+ struct ipath_rwq *wq;
+ unsigned long flags;
+ int ret;
+
+ for (; wr; wr = wr->next) {
+ struct ipath_rwqe *wqe;
+ u32 next;
+ int i;
+
+ if ((unsigned) wr->num_sge > srq->rq.max_sge) {
+ *bad_wr = wr;
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ spin_lock_irqsave(&srq->rq.lock, flags);
+ wq = srq->rq.wq;
+ next = wq->head + 1;
+ if (next >= srq->rq.size)
+ next = 0;
+ if (next == wq->tail) {
+ spin_unlock_irqrestore(&srq->rq.lock, flags);
+ *bad_wr = wr;
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ wqe = get_rwqe_ptr(&srq->rq, wq->head);
+ wqe->wr_id = wr->wr_id;
+ wqe->num_sge = wr->num_sge;
+ for (i = 0; i < wr->num_sge; i++)
+ wqe->sg_list[i] = wr->sg_list[i];
+ /* Make sure queue entry is written before the head index. */
+ smp_wmb();
+ wq->head = next;
+ spin_unlock_irqrestore(&srq->rq.lock, flags);
+ }
+ ret = 0;
+
+bail:
+ return ret;
+}
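+
+/*
+ * Illustrative sketch, not part of the original driver: the ring above is
+ * sized max_wr + 1 (see ipath_create_srq() below) so one slot always stays
+ * empty; head == tail means "empty" and (head + 1) % size == tail means
+ * "full". The helper name rq_count() is an assumption made for this
+ * example.
+ */
+static inline u32 rq_count(u32 head, u32 tail, u32 size)
+{
+ /* the same occupancy computation that ipath_modify_srq() does inline */
+ return (head >= tail) ? head - tail : size - (tail - head);
+}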
+
+/**
+ * ipath_create_srq - create a shared receive queue
+ * @ibpd: the protection domain of the SRQ to create
+ * @srq_init_attr: the attributes of the SRQ
+ * @udata: data from libipathverbs when creating a user SRQ
+ */
+struct ib_srq *ipath_create_srq(struct ib_pd *ibpd,
+ struct ib_srq_init_attr *srq_init_attr,
+ struct ib_udata *udata)
+{
+ struct ipath_ibdev *dev = to_idev(ibpd->device);
+ struct ipath_srq *srq;
+ u32 sz;
+ struct ib_srq *ret;
+
+ if (srq_init_attr->srq_type != IB_SRQT_BASIC) {
+ ret = ERR_PTR(-ENOSYS);
+ goto done;
+ }
+
+ if (srq_init_attr->attr.max_wr == 0) {
+ ret = ERR_PTR(-EINVAL);
+ goto done;
+ }
+
+ if ((srq_init_attr->attr.max_sge > ib_ipath_max_srq_sges) ||
+ (srq_init_attr->attr.max_wr > ib_ipath_max_srq_wrs)) {
+ ret = ERR_PTR(-EINVAL);
+ goto done;
+ }
+
+ srq = kmalloc(sizeof(*srq), GFP_KERNEL);
+ if (!srq) {
+ ret = ERR_PTR(-ENOMEM);
+ goto done;
+ }
+
+ /*
+ * Need to use vmalloc() if we want to support large #s of entries.
+ */
+ srq->rq.size = srq_init_attr->attr.max_wr + 1;
+ srq->rq.max_sge = srq_init_attr->attr.max_sge;
+ sz = sizeof(struct ib_sge) * srq->rq.max_sge +
+ sizeof(struct ipath_rwqe);
+ srq->rq.wq = vmalloc_user(sizeof(struct ipath_rwq) + srq->rq.size * sz);
+ if (!srq->rq.wq) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_srq;
+ }
+
+ /*
+ * Return the address of the RWQ as the offset to mmap.
+ * See ipath_mmap() for details.
+ */
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ int err;
+ u32 s = sizeof(struct ipath_rwq) + srq->rq.size * sz;
+
+ srq->ip =
+ ipath_create_mmap_info(dev, s,
+ ibpd->uobject->context,
+ srq->rq.wq);
+ if (!srq->ip) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_wq;
+ }
+
+ err = ib_copy_to_udata(udata, &srq->ip->offset,
+ sizeof(srq->ip->offset));
+ if (err) {
+ ret = ERR_PTR(err);
+ goto bail_ip;
+ }
+ } else
+ srq->ip = NULL;
+
+ /*
+ * ib_create_srq() will initialize srq->ibsrq.
+ */
+ spin_lock_init(&srq->rq.lock);
+ srq->rq.wq->head = 0;
+ srq->rq.wq->tail = 0;
+ srq->limit = srq_init_attr->attr.srq_limit;
+
+ spin_lock(&dev->n_srqs_lock);
+ if (dev->n_srqs_allocated == ib_ipath_max_srqs) {
+ spin_unlock(&dev->n_srqs_lock);
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_ip;
+ }
+
+ dev->n_srqs_allocated++;
+ spin_unlock(&dev->n_srqs_lock);
+
+ if (srq->ip) {
+ spin_lock_irq(&dev->pending_lock);
+ list_add(&srq->ip->pending_mmaps, &dev->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+ }
+
+ ret = &srq->ibsrq;
+ goto done;
+
+bail_ip:
+ kfree(srq->ip);
+bail_wq:
+ vfree(srq->rq.wq);
+bail_srq:
+ kfree(srq);
+done:
+ return ret;
+}
+
+/**
+ * ipath_modify_srq - modify a shared receive queue
+ * @ibsrq: the SRQ to modify
+ * @attr: the new attributes of the SRQ
+ * @attr_mask: indicates which attributes to modify
+ * @udata: user data for ipathverbs.so
+ */
+int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask,
+ struct ib_udata *udata)
+{
+ struct ipath_srq *srq = to_isrq(ibsrq);
+ struct ipath_rwq *wq;
+ int ret = 0;
+
+ if (attr_mask & IB_SRQ_MAX_WR) {
+ struct ipath_rwq *owq;
+ struct ipath_rwqe *p;
+ u32 sz, size, n, head, tail;
+
+ /* Check that the requested sizes are below the limits. */
+ if ((attr->max_wr > ib_ipath_max_srq_wrs) ||
+ ((attr_mask & IB_SRQ_LIMIT) ?
+ attr->srq_limit : srq->limit) > attr->max_wr) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ sz = sizeof(struct ipath_rwqe) +
+ srq->rq.max_sge * sizeof(struct ib_sge);
+ size = attr->max_wr + 1;
+ wq = vmalloc_user(sizeof(struct ipath_rwq) + size * sz);
+ if (!wq) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ /* Check that we can write the offset to mmap. */
+ if (udata && udata->inlen >= sizeof(__u64)) {
+ __u64 offset_addr;
+ __u64 offset = 0;
+
+ ret = ib_copy_from_udata(&offset_addr, udata,
+ sizeof(offset_addr));
+ if (ret)
+ goto bail_free;
+ udata->outbuf =
+ (void __user *) (unsigned long) offset_addr;
+ ret = ib_copy_to_udata(udata, &offset,
+ sizeof(offset));
+ if (ret)
+ goto bail_free;
+ }
+
+ spin_lock_irq(&srq->rq.lock);
+ /*
+ * validate head pointer value and compute
+ * the number of remaining WQEs.
+ */
+ owq = srq->rq.wq;
+ head = owq->head;
+ if (head >= srq->rq.size)
+ head = 0;
+ tail = owq->tail;
+ if (tail >= srq->rq.size)
+ tail = 0;
+ n = head;
+ if (n < tail)
+ n += srq->rq.size - tail;
+ else
+ n -= tail;
+ if (size <= n) {
+ ret = -EINVAL;
+ goto bail_unlock;
+ }
+ n = 0;
+ p = wq->wq;
+ while (tail != head) {
+ struct ipath_rwqe *wqe;
+ int i;
+
+ wqe = get_rwqe_ptr(&srq->rq, tail);
+ p->wr_id = wqe->wr_id;
+ p->num_sge = wqe->num_sge;
+ for (i = 0; i < wqe->num_sge; i++)
+ p->sg_list[i] = wqe->sg_list[i];
+ n++;
+ p = (struct ipath_rwqe *)((char *) p + sz);
+ if (++tail >= srq->rq.size)
+ tail = 0;
+ }
+ srq->rq.wq = wq;
+ srq->rq.size = size;
+ wq->head = n;
+ wq->tail = 0;
+ if (attr_mask & IB_SRQ_LIMIT)
+ srq->limit = attr->srq_limit;
+ spin_unlock_irq(&srq->rq.lock);
+
+ vfree(owq);
+
+ if (srq->ip) {
+ struct ipath_mmap_info *ip = srq->ip;
+ struct ipath_ibdev *dev = to_idev(srq->ibsrq.device);
+ u32 s = sizeof(struct ipath_rwq) + size * sz;
+
+ ipath_update_mmap_info(dev, ip, s, wq);
+
+ /*
+ * Return the offset to mmap.
+ * See ipath_mmap() for details.
+ */
+ if (udata && udata->inlen >= sizeof(__u64)) {
+ ret = ib_copy_to_udata(udata, &ip->offset,
+ sizeof(ip->offset));
+ if (ret)
+ goto bail;
+ }
+
+ spin_lock_irq(&dev->pending_lock);
+ if (list_empty(&ip->pending_mmaps))
+ list_add(&ip->pending_mmaps,
+ &dev->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+ }
+ } else if (attr_mask & IB_SRQ_LIMIT) {
+ spin_lock_irq(&srq->rq.lock);
+ if (attr->srq_limit >= srq->rq.size)
+ ret = -EINVAL;
+ else
+ srq->limit = attr->srq_limit;
+ spin_unlock_irq(&srq->rq.lock);
+ }
+ goto bail;
+
+bail_unlock:
+ spin_unlock_irq(&srq->rq.lock);
+bail_free:
+ vfree(wq);
+bail:
+ return ret;
+}
+
+int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
+{
+ struct ipath_srq *srq = to_isrq(ibsrq);
+
+ attr->max_wr = srq->rq.size - 1;
+ attr->max_sge = srq->rq.max_sge;
+ attr->srq_limit = srq->limit;
+ return 0;
+}
+
+/**
+ * ipath_destroy_srq - destroy a shared receive queue
+ * @ibsrq: the SRQ to destroy
+ */
+int ipath_destroy_srq(struct ib_srq *ibsrq)
+{
+ struct ipath_srq *srq = to_isrq(ibsrq);
+ struct ipath_ibdev *dev = to_idev(ibsrq->device);
+
+ spin_lock(&dev->n_srqs_lock);
+ dev->n_srqs_allocated--;
+ spin_unlock(&dev->n_srqs_lock);
+ if (srq->ip)
+ kref_put(&srq->ip->ref, ipath_release_mmap_info);
+ else
+ vfree(srq->rq.wq);
+ kfree(srq);
+
+ return 0;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_stats.c b/drivers/infiniband/hw/ipath/ipath_stats.c
new file mode 100644
index 000000000..f63e143e3
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_stats.c
@@ -0,0 +1,347 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ipath_kernel.h"
+
+struct infinipath_stats ipath_stats;
+
+/**
+ * ipath_snap_cntr - snapshot a chip counter
+ * @dd: the infinipath device
+ * @creg: the counter to snapshot
+ *
+ * Called from add_timer and user counter read calls, to deal with
+ * counters that wrap in "human time". The words sent and received, and
+ * the packets sent and received are all that we worry about. For now,
+ * at least, we don't worry about error counters, because if they wrap
+ * that quickly, we probably don't care. We may eventually just make this
+ * handle all the counters. Word counters can wrap in about 20 seconds
+ * of full bandwidth traffic, packet counters in a few hours.
+ */
+
+u64 ipath_snap_cntr(struct ipath_devdata *dd, ipath_creg creg)
+{
+ u32 val, reg64 = 0;
+ u64 val64;
+ unsigned long t0, t1;
+ u64 ret;
+
+ t0 = jiffies;
+ /* If fast increment counters are only 32 bits, snapshot them,
+ * and maintain them as 64bit values in the driver */
+ if (!(dd->ipath_flags & IPATH_32BITCOUNTERS) &&
+ (creg == dd->ipath_cregs->cr_wordsendcnt ||
+ creg == dd->ipath_cregs->cr_wordrcvcnt ||
+ creg == dd->ipath_cregs->cr_pktsendcnt ||
+ creg == dd->ipath_cregs->cr_pktrcvcnt)) {
+ val64 = ipath_read_creg(dd, creg);
+ val = val64 == ~0ULL ? ~0U : 0;
+ reg64 = 1;
+ } else /* val64 just to keep gcc quiet... */
+ val64 = val = ipath_read_creg32(dd, creg);
+ /*
+ * See if a second has passed. This is just a way to detect things
+ * that are quite broken. Normally this should take just a few
+ * cycles (the check is for long enough that we don't care if we get
+ * pre-empted.) An Opteron HT I/O read timeout is 4 seconds with
+ * normal NB values.
+ */
+ t1 = jiffies;
+ if (time_before(t0 + HZ, t1) && val == -1) {
+ ipath_dev_err(dd, "Error! Read counter 0x%x timed out\n",
+ creg);
+ ret = 0ULL;
+ goto bail;
+ }
+ if (reg64) {
+ ret = val64;
+ goto bail;
+ }
+
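+ /*
+ * Illustrative note, not in the original source: the snapshot-and-
+ * accumulate pattern below relies on 32-bit unsigned wraparound, so
+ * val - lastsword is the correct delta even across a counter wrap,
+ * e.g. last = 0xfffffff0 and val = 0x00000010 give a delta of 0x20.
+ */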
+ if (creg == dd->ipath_cregs->cr_wordsendcnt) {
+ if (val != dd->ipath_lastsword) {
+ dd->ipath_sword += val - dd->ipath_lastsword;
+ dd->ipath_lastsword = val;
+ }
+ val64 = dd->ipath_sword;
+ } else if (creg == dd->ipath_cregs->cr_wordrcvcnt) {
+ if (val != dd->ipath_lastrword) {
+ dd->ipath_rword += val - dd->ipath_lastrword;
+ dd->ipath_lastrword = val;
+ }
+ val64 = dd->ipath_rword;
+ } else if (creg == dd->ipath_cregs->cr_pktsendcnt) {
+ if (val != dd->ipath_lastspkts) {
+ dd->ipath_spkts += val - dd->ipath_lastspkts;
+ dd->ipath_lastspkts = val;
+ }
+ val64 = dd->ipath_spkts;
+ } else if (creg == dd->ipath_cregs->cr_pktrcvcnt) {
+ if (val != dd->ipath_lastrpkts) {
+ dd->ipath_rpkts += val - dd->ipath_lastrpkts;
+ dd->ipath_lastrpkts = val;
+ }
+ val64 = dd->ipath_rpkts;
+ } else if (creg == dd->ipath_cregs->cr_ibsymbolerrcnt) {
+ if (dd->ibdeltainprog)
+ val64 -= val64 - dd->ibsymsnap;
+ val64 -= dd->ibsymdelta;
+ } else if (creg == dd->ipath_cregs->cr_iblinkerrrecovcnt) {
+ if (dd->ibdeltainprog)
+ val64 -= val64 - dd->iblnkerrsnap;
+ val64 -= dd->iblnkerrdelta;
+ } else
+ val64 = (u64) val;
+
+ ret = val64;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_qcheck - print delta of egrfull/hdrqfull errors for kernel ports
+ * @dd: the infinipath device
+ *
+ * Print the delta of egrfull/hdrqfull errors for kernel ports no more than
+ * every 5 seconds. User processes are printed at close, but the kernel
+ * doesn't close, so we print here instead. This is a separate routine so it
+ * may be called from other places someday, and so the function name is
+ * meaningful when printed by _IPATH_INFO.
+ */
+static void ipath_qcheck(struct ipath_devdata *dd)
+{
+ static u64 last_tot_hdrqfull;
+ struct ipath_portdata *pd = dd->ipath_pd[0];
+ size_t blen = 0;
+ char buf[128];
+ u32 hdrqtail;
+
+ *buf = 0;
+ if (pd->port_hdrqfull != dd->ipath_p0_hdrqfull) {
+ blen = snprintf(buf, sizeof buf, "port 0 hdrqfull %u",
+ pd->port_hdrqfull -
+ dd->ipath_p0_hdrqfull);
+ dd->ipath_p0_hdrqfull = pd->port_hdrqfull;
+ }
+ if (ipath_stats.sps_etidfull != dd->ipath_last_tidfull) {
+ blen += snprintf(buf + blen, sizeof buf - blen,
+ "%srcvegrfull %llu",
+ blen ? ", " : "",
+ (unsigned long long)
+ (ipath_stats.sps_etidfull -
+ dd->ipath_last_tidfull));
+ dd->ipath_last_tidfull = ipath_stats.sps_etidfull;
+ }
+
+ /*
+ * this is actually the number of hdrq full interrupts, not actual
+ * events, but at the moment that's mostly what I'm interested in.
+ * Actual count, etc. is in the counters, if needed. For production
+ * users this won't ordinarily be printed.
+ */
+
+ if ((ipath_debug & (__IPATH_PKTDBG | __IPATH_DBG)) &&
+ ipath_stats.sps_hdrqfull != last_tot_hdrqfull) {
+ blen += snprintf(buf + blen, sizeof buf - blen,
+ "%shdrqfull %llu (all ports)",
+ blen ? ", " : "",
+ (unsigned long long)
+ (ipath_stats.sps_hdrqfull -
+ last_tot_hdrqfull));
+ last_tot_hdrqfull = ipath_stats.sps_hdrqfull;
+ }
+ if (blen)
+ ipath_dbg("%s\n", buf);
+
+ hdrqtail = ipath_get_hdrqtail(pd);
+ if (pd->port_head != hdrqtail) {
+ if (dd->ipath_lastport0rcv_cnt ==
+ ipath_stats.sps_port0pkts) {
+ ipath_cdbg(PKT, "missing rcv interrupts? "
+ "port0 hd=%x tl=%x; port0pkts %llx; write"
+ " hd (w/intr)\n",
+ pd->port_head, hdrqtail,
+ (unsigned long long)
+ ipath_stats.sps_port0pkts);
+ ipath_write_ureg(dd, ur_rcvhdrhead, hdrqtail |
+ dd->ipath_rhdrhead_intr_off, pd->port_port);
+ }
+ dd->ipath_lastport0rcv_cnt = ipath_stats.sps_port0pkts;
+ }
+}
+
+static void ipath_chk_errormask(struct ipath_devdata *dd)
+{
+ static u32 fixed;
+ u32 ctrl;
+ unsigned long errormask;
+ unsigned long hwerrs;
+
+ if (!dd->ipath_errormask || !(dd->ipath_flags & IPATH_INITTED))
+ return;
+
+ errormask = ipath_read_kreg64(dd, dd->ipath_kregs->kr_errormask);
+
+ if (errormask == dd->ipath_errormask)
+ return;
+ fixed++;
+
+ hwerrs = ipath_read_kreg64(dd, dd->ipath_kregs->kr_hwerrstatus);
+ ctrl = ipath_read_kreg32(dd, dd->ipath_kregs->kr_control);
+
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+ dd->ipath_errormask);
+
+ if ((hwerrs & dd->ipath_hwerrmask) ||
+ (ctrl & INFINIPATH_C_FREEZEMODE)) {
+ /* force re-interrupt of pending events, just in case */
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_hwerrclear, 0ULL);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errorclear, 0ULL);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_intclear, 0ULL);
+ dev_info(&dd->pcidev->dev,
+ "errormask fixed(%u) %lx -> %lx, ctrl %x hwerr %lx\n",
+ fixed, errormask, (unsigned long)dd->ipath_errormask,
+ ctrl, hwerrs);
+ } else
+ ipath_dbg("errormask fixed(%u) %lx -> %lx, no freeze\n",
+ fixed, errormask,
+ (unsigned long)dd->ipath_errormask);
+}
+
+
+/**
+ * ipath_get_faststats - get word counters from chip before they overflow
+ * @opaque: contains a pointer to the infinipath device ipath_devdata
+ *
+ * Called from add_timer.
+ */
+void ipath_get_faststats(unsigned long opaque)
+{
+ struct ipath_devdata *dd = (struct ipath_devdata *) opaque;
+ int i;
+ static unsigned cnt;
+ unsigned long flags;
+ u64 traffic_wds;
+
+ /*
+ * Don't access the chip while running diags, or memory diags can
+ * fail.
+ */
+ if (!dd->ipath_kregbase || !(dd->ipath_flags & IPATH_INITTED) ||
+ ipath_diag_inuse)
+ /* but re-arm the timer for the diags case; it won't hurt otherwise */
+ goto done;
+
+ /*
+ * We now try to maintain a "active timer", based on traffic
+ * exceeding a threshold, so we need to check the word-counts
+ * even if they are 64-bit.
+ */
+ traffic_wds = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt) +
+ ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
+ spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
+ traffic_wds -= dd->ipath_traffic_wds;
+ dd->ipath_traffic_wds += traffic_wds;
+ if (traffic_wds >= IPATH_TRAFFIC_ACTIVE_THRESHOLD)
+ atomic_add(5, &dd->ipath_active_time); /* S/B #define */
+ spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
+
+ if (dd->ipath_flags & IPATH_32BITCOUNTERS) {
+ ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt);
+ ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt);
+ }
+
+ ipath_qcheck(dd);
+
+ /*
+ * Deal with repeat error suppression. It doesn't really matter whether
+ * the last error was almost a full interval ago or just a few usecs
+ * ago; we still won't get more than 2 per interval. We may want
+ * longer intervals for this eventually; it could be done with a modulus,
+ * a counter, or a separate timer. Also see the code in
+ * ipath_handle_errors() and ipath_handle_hwerrors().
+ */
+
+ if (dd->ipath_lasterror)
+ dd->ipath_lasterror = 0;
+ if (dd->ipath_lasthwerror)
+ dd->ipath_lasthwerror = 0;
+ if (dd->ipath_maskederrs
+ && time_after(jiffies, dd->ipath_unmasktime)) {
+ char ebuf[256];
+ int iserr;
+ iserr = ipath_decode_err(dd, ebuf, sizeof ebuf,
+ dd->ipath_maskederrs);
+ if (dd->ipath_maskederrs &
+ ~(INFINIPATH_E_RRCVEGRFULL | INFINIPATH_E_RRCVHDRFULL |
+ INFINIPATH_E_PKTERRS))
+ ipath_dev_err(dd, "Re-enabling masked errors "
+ "(%s)\n", ebuf);
+ else {
+ /*
+ * rcvegrfull and rcvhdrqfull are "normal" for some
+ * types of processes (mostly benchmarks) that send
+ * huge numbers of messages without processing
+ * them, so only complain about these at debug
+ * level.
+ */
+ if (iserr)
+ ipath_dbg(
+ "Re-enabling queue full errors (%s)\n",
+ ebuf);
+ else
+ ipath_cdbg(ERRPKT, "Re-enabling packet"
+ " problem interrupt (%s)\n", ebuf);
+ }
+
+ /* re-enable masked errors */
+ dd->ipath_errormask |= dd->ipath_maskederrs;
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_errormask,
+ dd->ipath_errormask);
+ dd->ipath_maskederrs = 0;
+ }
+
+ /* limit qfull messages to ~one per minute per port */
+ if ((++cnt & 0x10)) {
+ for (i = (int) dd->ipath_cfgports; --i >= 0; ) {
+ struct ipath_portdata *pd = dd->ipath_pd[i];
+
+ if (pd && pd->port_lastrcvhdrqtail != -1)
+ pd->port_lastrcvhdrqtail = -1;
+ }
+ }
+
+ ipath_chk_errormask(dd);
+done:
+ mod_timer(&dd->ipath_stats_timer, jiffies + HZ * 5);
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_sysfs.c b/drivers/infiniband/hw/ipath/ipath_sysfs.c
new file mode 100644
index 000000000..75558f33f
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_sysfs.c
@@ -0,0 +1,1238 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/ctype.h>
+#include <linux/stat.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+#include "ipath_common.h"
+
+/**
+ * ipath_parse_ushort - parse an unsigned short value in an arbitrary base
+ * @str: the string containing the number
+ * @valp: where to put the result
+ *
+ * Returns the number of bytes consumed, or a negative value on error.
+ */
+int ipath_parse_ushort(const char *str, unsigned short *valp)
+{
+ unsigned long val;
+ char *end;
+ int ret;
+
+ if (!isdigit(str[0])) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ val = simple_strtoul(str, &end, 0);
+
+ if (val > 0xffff) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ *valp = val;
+
+ ret = end + 1 - str;
+ if (ret == 0)
+ ret = -EINVAL;
+
+bail:
+ return ret;
+}
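+
+/*
+ * Illustrative example, not from the original source: for a sysfs write of
+ * "7\n", simple_strtoul() stops at the newline, *valp becomes 7, and the
+ * return value is end + 1 - str = 2, which the store_*() handlers below
+ * hand back as the number of bytes consumed.
+ */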
+
+static ssize_t show_version(struct device_driver *dev, char *buf)
+{
+ /* The string printed here is already newline-terminated. */
+ return scnprintf(buf, PAGE_SIZE, "%s", ib_ipath_version);
+}
+
+static ssize_t show_num_units(struct device_driver *dev, char *buf)
+{
+ return scnprintf(buf, PAGE_SIZE, "%d\n",
+ ipath_count_units(NULL, NULL, NULL));
+}
+
+static ssize_t show_status(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ ssize_t ret;
+
+ if (!dd->ipath_statusp) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ ret = scnprintf(buf, PAGE_SIZE, "0x%llx\n",
+ (unsigned long long) *(dd->ipath_statusp));
+
+bail:
+ return ret;
+}
+
+static const char *ipath_status_str[] = {
+ "Initted",
+ "Disabled",
+ "Admin_Disabled",
+ "", /* This used to be the old "OIB_SMA" status. */
+ "", /* This used to be the old "SMA" status. */
+ "Present",
+ "IB_link_up",
+ "IB_configured",
+ "NoIBcable",
+ "Fatal_Hardware_Error",
+ NULL,
+};
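+
+/*
+ * Illustrative example, not from the original source: each set bit in
+ * *dd->ipath_statusp selects the string with the same index above, so a
+ * status of 0x61 (bits 0, 5 and 6 set) is rendered by show_status_str()
+ * below as "Initted Present IB_link_up".
+ */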
+
+static ssize_t show_status_str(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int i, any;
+ u64 s;
+ ssize_t ret;
+
+ if (!dd->ipath_statusp) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ s = *(dd->ipath_statusp);
+ *buf = '\0';
+ for (any = i = 0; s && ipath_status_str[i]; i++) {
+ if (s & 1) {
+ if (any && strlcat(buf, " ", PAGE_SIZE) >=
+ PAGE_SIZE)
+ /* overflow */
+ break;
+ if (strlcat(buf, ipath_status_str[i],
+ PAGE_SIZE) >= PAGE_SIZE)
+ break;
+ any = 1;
+ }
+ s >>= 1;
+ }
+ if (any)
+ strlcat(buf, "\n", PAGE_SIZE);
+
+ ret = strlen(buf);
+
+bail:
+ return ret;
+}
+
+static ssize_t show_boardversion(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ /* The string printed here is already newline-terminated. */
+ return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_boardversion);
+}
+
+static ssize_t show_localbus_info(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ /* The string printed here is already newline-terminated. */
+ return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_lbus_info);
+}
+
+static ssize_t show_lmc(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_lmc);
+}
+
+static ssize_t store_lmc(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ u16 lmc = 0;
+ int ret;
+
+ ret = ipath_parse_ushort(buf, &lmc);
+ if (ret < 0)
+ goto invalid;
+
+ if (lmc > 7) {
+ ret = -EINVAL;
+ goto invalid;
+ }
+
+ ipath_set_lid(dd, dd->ipath_lid, lmc);
+
+ goto bail;
+invalid:
+ ipath_dev_err(dd, "attempt to set invalid LMC %u\n", lmc);
+bail:
+ return ret;
+}
+
+static ssize_t show_lid(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "0x%x\n", dd->ipath_lid);
+}
+
+static ssize_t store_lid(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ u16 lid = 0;
+ int ret;
+
+ ret = ipath_parse_ushort(buf, &lid);
+ if (ret < 0)
+ goto invalid;
+
+ if (lid == 0 || lid >= IPATH_MULTICAST_LID_BASE) {
+ ret = -EINVAL;
+ goto invalid;
+ }
+
+ ipath_set_lid(dd, lid, dd->ipath_lmc);
+
+ goto bail;
+invalid:
+ ipath_dev_err(dd, "attempt to set invalid LID 0x%x\n", lid);
+bail:
+ return ret;
+}
+
+static ssize_t show_mlid(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "0x%x\n", dd->ipath_mlid);
+}
+
+static ssize_t store_mlid(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ u16 mlid;
+ int ret;
+
+ ret = ipath_parse_ushort(buf, &mlid);
+ if (ret < 0 || mlid < IPATH_MULTICAST_LID_BASE)
+ goto invalid;
+
+ dd->ipath_mlid = mlid;
+
+ goto bail;
+invalid:
+ ipath_dev_err(dd, "attempt to set invalid MLID\n");
+bail:
+ return ret;
+}
+
+static ssize_t show_guid(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ u8 *guid;
+
+ guid = (u8 *) & (dd->ipath_guid);
+
+ return scnprintf(buf, PAGE_SIZE,
+ "%02x:%02x:%02x:%02x:%02x:%02x:%02x:%02x\n",
+ guid[0], guid[1], guid[2], guid[3],
+ guid[4], guid[5], guid[6], guid[7]);
+}
+
+static ssize_t store_guid(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ ssize_t ret;
+ unsigned short guid[8];
+ __be64 new_guid;
+ u8 *ng;
+ int i;
+
+ if (sscanf(buf, "%hx:%hx:%hx:%hx:%hx:%hx:%hx:%hx",
+ &guid[0], &guid[1], &guid[2], &guid[3],
+ &guid[4], &guid[5], &guid[6], &guid[7]) != 8)
+ goto invalid;
+
+ ng = (u8 *) &new_guid;
+
+ for (i = 0; i < 8; i++) {
+ if (guid[i] > 0xff)
+ goto invalid;
+ ng[i] = guid[i];
+ }
+
+ if (new_guid == 0)
+ goto invalid;
+
+ dd->ipath_guid = new_guid;
+ dd->ipath_nguid = 1;
+ if (dd->verbs_dev)
+ dd->verbs_dev->ibdev.node_guid = new_guid;
+
+ ret = strlen(buf);
+ goto bail;
+
+invalid:
+ ipath_dev_err(dd, "attempt to set invalid GUID\n");
+ ret = -EINVAL;
+
+bail:
+ return ret;
+}
+
+static ssize_t show_nguid(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_nguid);
+}
+
+static ssize_t show_nports(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+ /* Return the number of user ports available. */
+ return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_cfgports - 1);
+}
+
+static ssize_t show_serial(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+ buf[sizeof dd->ipath_serial] = '\0';
+ memcpy(buf, dd->ipath_serial, sizeof dd->ipath_serial);
+ strcat(buf, "\n");
+ return strlen(buf);
+}
+
+static ssize_t show_unit(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_unit);
+}
+
+static ssize_t show_jint_max_packets(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%hu\n", dd->ipath_jint_max_packets);
+}
+
+static ssize_t store_jint_max_packets(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ u16 v = 0;
+ int ret;
+
+ ret = ipath_parse_ushort(buf, &v);
+ if (ret < 0)
+ ipath_dev_err(dd, "invalid jint_max_packets.\n");
+ else
+ dd->ipath_f_config_jint(dd, dd->ipath_jint_idle_ticks, v);
+
+ return ret;
+}
+
+static ssize_t show_jint_idle_ticks(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%hu\n", dd->ipath_jint_idle_ticks);
+}
+
+static ssize_t store_jint_idle_ticks(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ u16 v = 0;
+ int ret;
+
+ ret = ipath_parse_ushort(buf, &v);
+ if (ret < 0)
+ ipath_dev_err(dd, "invalid jint_idle_ticks.\n");
+ else
+ dd->ipath_f_config_jint(dd, v, dd->ipath_jint_max_packets);
+
+ return ret;
+}
+
+#define DEVICE_COUNTER(name, attr) \
+ static ssize_t show_counter_##name(struct device *dev, \
+ struct device_attribute *attr, \
+ char *buf) \
+ { \
+ struct ipath_devdata *dd = dev_get_drvdata(dev); \
+ return scnprintf(\
+ buf, PAGE_SIZE, "%llu\n", (unsigned long long) \
+ ipath_snap_cntr( \
+ dd, offsetof(struct infinipath_counters, \
+ attr) / sizeof(u64))); \
+ } \
+ static DEVICE_ATTR(name, S_IRUGO, show_counter_##name, NULL);
+
+DEVICE_COUNTER(ib_link_downeds, IBLinkDownedCnt);
+DEVICE_COUNTER(ib_link_err_recoveries, IBLinkErrRecoveryCnt);
+DEVICE_COUNTER(ib_status_changes, IBStatusChangeCnt);
+DEVICE_COUNTER(ib_symbol_errs, IBSymbolErrCnt);
+DEVICE_COUNTER(lb_flow_stalls, LBFlowStallCnt);
+DEVICE_COUNTER(lb_ints, LBIntCnt);
+DEVICE_COUNTER(rx_bad_formats, RxBadFormatCnt);
+DEVICE_COUNTER(rx_buf_ovfls, RxBufOvflCnt);
+DEVICE_COUNTER(rx_data_pkts, RxDataPktCnt);
+DEVICE_COUNTER(rx_dropped_pkts, RxDroppedPktCnt);
+DEVICE_COUNTER(rx_dwords, RxDwordCnt);
+DEVICE_COUNTER(rx_ebps, RxEBPCnt);
+DEVICE_COUNTER(rx_flow_ctrl_errs, RxFlowCtrlErrCnt);
+DEVICE_COUNTER(rx_flow_pkts, RxFlowPktCnt);
+DEVICE_COUNTER(rx_icrc_errs, RxICRCErrCnt);
+DEVICE_COUNTER(rx_len_errs, RxLenErrCnt);
+DEVICE_COUNTER(rx_link_problems, RxLinkProblemCnt);
+DEVICE_COUNTER(rx_lpcrc_errs, RxLPCRCErrCnt);
+DEVICE_COUNTER(rx_max_min_len_errs, RxMaxMinLenErrCnt);
+DEVICE_COUNTER(rx_p0_hdr_egr_ovfls, RxP0HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p1_hdr_egr_ovfls, RxP1HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p2_hdr_egr_ovfls, RxP2HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p3_hdr_egr_ovfls, RxP3HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p4_hdr_egr_ovfls, RxP4HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p5_hdr_egr_ovfls, RxP5HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p6_hdr_egr_ovfls, RxP6HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p7_hdr_egr_ovfls, RxP7HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_p8_hdr_egr_ovfls, RxP8HdrEgrOvflCnt);
+DEVICE_COUNTER(rx_pkey_mismatches, RxPKeyMismatchCnt);
+DEVICE_COUNTER(rx_tid_full_errs, RxTIDFullErrCnt);
+DEVICE_COUNTER(rx_tid_valid_errs, RxTIDValidErrCnt);
+DEVICE_COUNTER(rx_vcrc_errs, RxVCRCErrCnt);
+DEVICE_COUNTER(tx_data_pkts, TxDataPktCnt);
+DEVICE_COUNTER(tx_dropped_pkts, TxDroppedPktCnt);
+DEVICE_COUNTER(tx_dwords, TxDwordCnt);
+DEVICE_COUNTER(tx_flow_pkts, TxFlowPktCnt);
+DEVICE_COUNTER(tx_flow_stalls, TxFlowStallCnt);
+DEVICE_COUNTER(tx_len_errs, TxLenErrCnt);
+DEVICE_COUNTER(tx_max_min_len_errs, TxMaxMinLenErrCnt);
+DEVICE_COUNTER(tx_underruns, TxUnderrunCnt);
+DEVICE_COUNTER(tx_unsup_vl_errs, TxUnsupVLErrCnt);
+
+static struct attribute *dev_counter_attributes[] = {
+ &dev_attr_ib_link_downeds.attr,
+ &dev_attr_ib_link_err_recoveries.attr,
+ &dev_attr_ib_status_changes.attr,
+ &dev_attr_ib_symbol_errs.attr,
+ &dev_attr_lb_flow_stalls.attr,
+ &dev_attr_lb_ints.attr,
+ &dev_attr_rx_bad_formats.attr,
+ &dev_attr_rx_buf_ovfls.attr,
+ &dev_attr_rx_data_pkts.attr,
+ &dev_attr_rx_dropped_pkts.attr,
+ &dev_attr_rx_dwords.attr,
+ &dev_attr_rx_ebps.attr,
+ &dev_attr_rx_flow_ctrl_errs.attr,
+ &dev_attr_rx_flow_pkts.attr,
+ &dev_attr_rx_icrc_errs.attr,
+ &dev_attr_rx_len_errs.attr,
+ &dev_attr_rx_link_problems.attr,
+ &dev_attr_rx_lpcrc_errs.attr,
+ &dev_attr_rx_max_min_len_errs.attr,
+ &dev_attr_rx_p0_hdr_egr_ovfls.attr,
+ &dev_attr_rx_p1_hdr_egr_ovfls.attr,
+ &dev_attr_rx_p2_hdr_egr_ovfls.attr,
+ &dev_attr_rx_p3_hdr_egr_ovfls.attr,
+ &dev_attr_rx_p4_hdr_egr_ovfls.attr,
+ &dev_attr_rx_p5_hdr_egr_ovfls.attr,
+ &dev_attr_rx_p6_hdr_egr_ovfls.attr,
+ &dev_attr_rx_p7_hdr_egr_ovfls.attr,
+ &dev_attr_rx_p8_hdr_egr_ovfls.attr,
+ &dev_attr_rx_pkey_mismatches.attr,
+ &dev_attr_rx_tid_full_errs.attr,
+ &dev_attr_rx_tid_valid_errs.attr,
+ &dev_attr_rx_vcrc_errs.attr,
+ &dev_attr_tx_data_pkts.attr,
+ &dev_attr_tx_dropped_pkts.attr,
+ &dev_attr_tx_dwords.attr,
+ &dev_attr_tx_flow_pkts.attr,
+ &dev_attr_tx_flow_stalls.attr,
+ &dev_attr_tx_len_errs.attr,
+ &dev_attr_tx_max_min_len_errs.attr,
+ &dev_attr_tx_underruns.attr,
+ &dev_attr_tx_unsup_vl_errs.attr,
+ NULL
+};
+
+static struct attribute_group dev_counter_attr_group = {
+ .name = "counters",
+ .attrs = dev_counter_attributes
+};
+
+static ssize_t store_reset(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret;
+
+ if (count < 5 || memcmp(buf, "reset", 5)) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ if (dd->ipath_flags & IPATH_DISABLED) {
+ /*
+		 * Post-reset init would re-enable interrupts, etc.,
+		 * so don't allow a reset on disabled devices. -EINVAL
+		 * isn't a perfect error code here, but it's about the
+		 * best choice.
+ */
+		dev_info(dev, "Unit %d is disabled, can't reset\n",
+ dd->ipath_unit);
+ ret = -EINVAL;
+ goto bail;
+ }
+ ret = ipath_reset_device(dd->ipath_unit);
+bail:
+	return ret < 0 ? ret : count;
+}
+
+static ssize_t store_link_state(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret, r;
+ u16 state;
+
+ ret = ipath_parse_ushort(buf, &state);
+ if (ret < 0)
+ goto invalid;
+
+ r = ipath_set_linkstate(dd, state);
+ if (r < 0) {
+ ret = r;
+ goto bail;
+ }
+
+ goto bail;
+invalid:
+ ipath_dev_err(dd, "attempt to set invalid link state\n");
+bail:
+ return ret;
+}
+
+static ssize_t show_mtu(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_ibmtu);
+}
+
+static ssize_t store_mtu(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ ssize_t ret;
+ u16 mtu = 0;
+ int r;
+
+ ret = ipath_parse_ushort(buf, &mtu);
+ if (ret < 0)
+ goto invalid;
+
+ r = ipath_set_mtu(dd, mtu);
+ if (r < 0)
+ ret = r;
+
+ goto bail;
+invalid:
+ ipath_dev_err(dd, "attempt to set invalid MTU\n");
+bail:
+ return ret;
+}
+
+static ssize_t show_enabled(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ return scnprintf(buf, PAGE_SIZE, "%u\n",
+ (dd->ipath_flags & IPATH_DISABLED) ? 0 : 1);
+}
+
+static ssize_t store_enabled(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ ssize_t ret;
+ u16 enable = 0;
+
+ ret = ipath_parse_ushort(buf, &enable);
+ if (ret < 0) {
+ ipath_dev_err(dd, "attempt to use non-numeric on enable\n");
+ goto bail;
+ }
+
+ if (enable) {
+ if (!(dd->ipath_flags & IPATH_DISABLED))
+ goto bail;
+
+ dev_info(dev, "Enabling unit %d\n", dd->ipath_unit);
+ /* same as post-reset */
+ ret = ipath_init_chip(dd, 1);
+ if (ret)
+ ipath_dev_err(dd, "Failed to enable unit %d\n",
+ dd->ipath_unit);
+ else {
+ dd->ipath_flags &= ~IPATH_DISABLED;
+ *dd->ipath_statusp &= ~IPATH_STATUS_ADMIN_DISABLED;
+ }
+	} else if (!(dd->ipath_flags & IPATH_DISABLED)) {
+ dev_info(dev, "Disabling unit %d\n", dd->ipath_unit);
+ ipath_shutdown_device(dd);
+ dd->ipath_flags |= IPATH_DISABLED;
+ *dd->ipath_statusp |= IPATH_STATUS_ADMIN_DISABLED;
+ }
+
+bail:
+ return ret;
+}
+
+static ssize_t store_rx_pol_inv(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret, r;
+ u16 val;
+
+ ret = ipath_parse_ushort(buf, &val);
+ if (ret < 0)
+ goto invalid;
+
+ r = ipath_set_rx_pol_inv(dd, val);
+ if (r < 0) {
+ ret = r;
+ goto bail;
+ }
+
+ goto bail;
+invalid:
+ ipath_dev_err(dd, "attempt to set invalid Rx Polarity invert\n");
+bail:
+ return ret;
+}
+
+static ssize_t store_led_override(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret;
+ u16 val;
+
+ ret = ipath_parse_ushort(buf, &val);
+ if (ret > 0)
+ ipath_set_led_override(dd, val);
+ else
+ ipath_dev_err(dd, "attempt to set invalid LED override\n");
+ return ret;
+}
+
+static ssize_t show_logged_errs(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int idx, count;
+
+ /* force consistency with actual EEPROM */
+ if (ipath_update_eeprom_log(dd) != 0)
+ return -ENXIO;
+
+ count = 0;
+ for (idx = 0; idx < IPATH_EEP_LOG_CNT; ++idx) {
+ count += scnprintf(buf + count, PAGE_SIZE - count, "%d%c",
+ dd->ipath_eep_st_errs[idx],
+ idx == (IPATH_EEP_LOG_CNT - 1) ? '\n' : ' ');
+ }
+
+ return count;
+}
+
+/*
+ * New sysfs entries to control various IB config. These all turn into
+ * accesses via ipath_f_get/set_ib_cfg.
+ *
+ * Get/Set heartbeat enable. OR of 1=enabled, 2=auto.
+ */
+static ssize_t show_hrtbt_enb(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret;
+
+ ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_HRTBT);
+ if (ret >= 0)
+ ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+ return ret;
+}
+
+static ssize_t store_hrtbt_enb(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret, r;
+ u16 val;
+
+ ret = ipath_parse_ushort(buf, &val);
+ if (ret >= 0 && val > 3)
+ ret = -EINVAL;
+ if (ret < 0) {
+ ipath_dev_err(dd, "attempt to set invalid Heartbeat enable\n");
+ goto bail;
+ }
+
+ /*
+	 * Set the "intentional" heartbeat enable for either of
+	 * "Enable" and "Auto", as these are normally set together.
+ * This bit is consulted when leaving loopback mode,
+ * because entering loopback mode overrides it and automatically
+ * disables heartbeat.
+ */
+ r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_HRTBT, val);
+ if (r < 0)
+ ret = r;
+ else if (val == IPATH_IB_HRTBT_OFF)
+ dd->ipath_flags |= IPATH_NO_HRTBT;
+ else
+ dd->ipath_flags &= ~IPATH_NO_HRTBT;
+
+bail:
+ return ret;
+}
+
+/*
+ * Get/Set Link-widths enabled. OR of 1=1x, 2=4x (this is human/IB centric,
+ * _not_ the particular encoding of any given chip)
+ */
+static ssize_t show_lwid_enb(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret;
+
+ ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB);
+ if (ret >= 0)
+ ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+ return ret;
+}
+
+static ssize_t store_lwid_enb(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret, r;
+ u16 val;
+
+ ret = ipath_parse_ushort(buf, &val);
+ if (ret >= 0 && (val == 0 || val > 3))
+ ret = -EINVAL;
+ if (ret < 0) {
+ ipath_dev_err(dd,
+ "attempt to set invalid Link Width (enable)\n");
+ goto bail;
+ }
+
+ r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LWID_ENB, val);
+ if (r < 0)
+ ret = r;
+
+bail:
+ return ret;
+}
+
+/* Get current link width */
+static ssize_t show_lwid(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret;
+
+ ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LWID);
+ if (ret >= 0)
+ ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+ return ret;
+}
+
+/*
+ * Get/Set Link-speeds enabled. OR of 1=SDR, 2=DDR.
+ */
+static ssize_t show_spd_enb(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret;
+
+ ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB);
+ if (ret >= 0)
+ ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+ return ret;
+}
+
+static ssize_t store_spd_enb(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret, r;
+ u16 val;
+
+ ret = ipath_parse_ushort(buf, &val);
+ if (ret >= 0 && (val == 0 || val > (IPATH_IB_SDR | IPATH_IB_DDR)))
+ ret = -EINVAL;
+ if (ret < 0) {
+ ipath_dev_err(dd,
+ "attempt to set invalid Link Speed (enable)\n");
+ goto bail;
+ }
+
+ r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_SPD_ENB, val);
+ if (r < 0)
+ ret = r;
+
+bail:
+ return ret;
+}
+
+/* Get current link speed */
+static ssize_t show_spd(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret;
+
+ ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_SPD);
+ if (ret >= 0)
+ ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+ return ret;
+}
+
+/*
+ * Get/Set RX polarity-invert enable. 0=no, 1=yes.
+ */
+static ssize_t show_rx_polinv_enb(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret;
+
+ ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_RXPOL_ENB);
+ if (ret >= 0)
+ ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+ return ret;
+}
+
+static ssize_t store_rx_polinv_enb(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret, r;
+ u16 val;
+
+ ret = ipath_parse_ushort(buf, &val);
+ if (ret >= 0 && val > 1) {
+ ipath_dev_err(dd,
+ "attempt to set invalid Rx Polarity (enable)\n");
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_RXPOL_ENB, val);
+ if (r < 0)
+ ret = r;
+
+bail:
+ return ret;
+}
+
+/*
+ * Get/Set RX lane-reversal enable. 0=no, 1=yes.
+ */
+static ssize_t show_lanerev_enb(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret;
+
+ ret = dd->ipath_f_get_ib_cfg(dd, IPATH_IB_CFG_LREV_ENB);
+ if (ret >= 0)
+ ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+ return ret;
+}
+
+static ssize_t store_lanerev_enb(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret, r;
+ u16 val;
+
+ ret = ipath_parse_ushort(buf, &val);
+ if (ret >= 0 && val > 1) {
+ ret = -EINVAL;
+ ipath_dev_err(dd,
+ "attempt to set invalid Lane reversal (enable)\n");
+ goto bail;
+ }
+
+ r = dd->ipath_f_set_ib_cfg(dd, IPATH_IB_CFG_LREV_ENB, val);
+ if (r < 0)
+ ret = r;
+
+bail:
+ return ret;
+}
+
+static DRIVER_ATTR(num_units, S_IRUGO, show_num_units, NULL);
+static DRIVER_ATTR(version, S_IRUGO, show_version, NULL);
+
+static struct attribute *driver_attributes[] = {
+ &driver_attr_num_units.attr,
+ &driver_attr_version.attr,
+ NULL
+};
+
+static struct attribute_group driver_attr_group = {
+ .attrs = driver_attributes
+};
+
+static ssize_t store_tempsense(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret, stat;
+ u16 val;
+
+ ret = ipath_parse_ushort(buf, &val);
+ if (ret <= 0) {
+ ipath_dev_err(dd, "attempt to set invalid tempsense config\n");
+ goto bail;
+ }
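+	/* The low byte of val sets the local Tcrit limit, the high byte the remote. */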
+ /* If anything but the highest limit, enable T_CRIT_A "interrupt" */
+ stat = ipath_tempsense_write(dd, 9, (val == 0x7f7f) ? 0x80 : 0);
+ if (stat) {
+ ipath_dev_err(dd, "Unable to set tempsense config\n");
+ ret = -1;
+ goto bail;
+ }
+ stat = ipath_tempsense_write(dd, 0xB, (u8) (val & 0xFF));
+ if (stat) {
+ ipath_dev_err(dd, "Unable to set local Tcrit\n");
+ ret = -1;
+ goto bail;
+ }
+ stat = ipath_tempsense_write(dd, 0xD, (u8) (val >> 8));
+ if (stat) {
+ ipath_dev_err(dd, "Unable to set remote Tcrit\n");
+ ret = -1;
+ goto bail;
+ }
+
+bail:
+ return ret;
+}
+
+/*
+ * Dump tempsense regs in decimal, to ease shell scripts.
+ */
+static ssize_t show_tempsense(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_devdata *dd = dev_get_drvdata(dev);
+ int ret;
+ int idx;
+ u8 regvals[8];
+
+ ret = -ENXIO;
+ for (idx = 0; idx < 8; ++idx) {
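+		/* Read registers 0-5 and 7; index 6 is skipped. */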
+ if (idx == 6)
+ continue;
+ ret = ipath_tempsense_read(dd, idx);
+ if (ret < 0)
+ break;
+ regvals[idx] = ret;
+ }
+ if (idx == 8)
+ ret = scnprintf(buf, PAGE_SIZE, "%d %d %02X %02X %d %d\n",
+ *(signed char *)(regvals),
+ *(signed char *)(regvals + 1),
+ regvals[2], regvals[3],
+ *(signed char *)(regvals + 5),
+ *(signed char *)(regvals + 7));
+ return ret;
+}
+
+const struct attribute_group *ipath_driver_attr_groups[] = {
+ &driver_attr_group,
+ NULL,
+};
+
+static DEVICE_ATTR(guid, S_IWUSR | S_IRUGO, show_guid, store_guid);
+static DEVICE_ATTR(lmc, S_IWUSR | S_IRUGO, show_lmc, store_lmc);
+static DEVICE_ATTR(lid, S_IWUSR | S_IRUGO, show_lid, store_lid);
+static DEVICE_ATTR(link_state, S_IWUSR, NULL, store_link_state);
+static DEVICE_ATTR(mlid, S_IWUSR | S_IRUGO, show_mlid, store_mlid);
+static DEVICE_ATTR(mtu, S_IWUSR | S_IRUGO, show_mtu, store_mtu);
+static DEVICE_ATTR(enabled, S_IWUSR | S_IRUGO, show_enabled, store_enabled);
+static DEVICE_ATTR(nguid, S_IRUGO, show_nguid, NULL);
+static DEVICE_ATTR(nports, S_IRUGO, show_nports, NULL);
+static DEVICE_ATTR(reset, S_IWUSR, NULL, store_reset);
+static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL);
+static DEVICE_ATTR(status, S_IRUGO, show_status, NULL);
+static DEVICE_ATTR(status_str, S_IRUGO, show_status_str, NULL);
+static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL);
+static DEVICE_ATTR(unit, S_IRUGO, show_unit, NULL);
+static DEVICE_ATTR(rx_pol_inv, S_IWUSR, NULL, store_rx_pol_inv);
+static DEVICE_ATTR(led_override, S_IWUSR, NULL, store_led_override);
+static DEVICE_ATTR(logged_errors, S_IRUGO, show_logged_errs, NULL);
+static DEVICE_ATTR(localbus_info, S_IRUGO, show_localbus_info, NULL);
+static DEVICE_ATTR(jint_max_packets, S_IWUSR | S_IRUGO,
+ show_jint_max_packets, store_jint_max_packets);
+static DEVICE_ATTR(jint_idle_ticks, S_IWUSR | S_IRUGO,
+ show_jint_idle_ticks, store_jint_idle_ticks);
+static DEVICE_ATTR(tempsense, S_IWUSR | S_IRUGO,
+ show_tempsense, store_tempsense);
+
+static struct attribute *dev_attributes[] = {
+ &dev_attr_guid.attr,
+ &dev_attr_lmc.attr,
+ &dev_attr_lid.attr,
+ &dev_attr_link_state.attr,
+ &dev_attr_mlid.attr,
+ &dev_attr_mtu.attr,
+ &dev_attr_nguid.attr,
+ &dev_attr_nports.attr,
+ &dev_attr_serial.attr,
+ &dev_attr_status.attr,
+ &dev_attr_status_str.attr,
+ &dev_attr_boardversion.attr,
+ &dev_attr_unit.attr,
+ &dev_attr_enabled.attr,
+ &dev_attr_rx_pol_inv.attr,
+ &dev_attr_led_override.attr,
+ &dev_attr_logged_errors.attr,
+ &dev_attr_tempsense.attr,
+ &dev_attr_localbus_info.attr,
+ NULL
+};
+
+static struct attribute_group dev_attr_group = {
+ .attrs = dev_attributes
+};
+
+static DEVICE_ATTR(hrtbt_enable, S_IWUSR | S_IRUGO, show_hrtbt_enb,
+ store_hrtbt_enb);
+static DEVICE_ATTR(link_width_enable, S_IWUSR | S_IRUGO, show_lwid_enb,
+ store_lwid_enb);
+static DEVICE_ATTR(link_width, S_IRUGO, show_lwid, NULL);
+static DEVICE_ATTR(link_speed_enable, S_IWUSR | S_IRUGO, show_spd_enb,
+ store_spd_enb);
+static DEVICE_ATTR(link_speed, S_IRUGO, show_spd, NULL);
+static DEVICE_ATTR(rx_pol_inv_enable, S_IWUSR | S_IRUGO, show_rx_polinv_enb,
+ store_rx_polinv_enb);
+static DEVICE_ATTR(rx_lane_rev_enable, S_IWUSR | S_IRUGO, show_lanerev_enb,
+ store_lanerev_enb);
+
+static struct attribute *dev_ibcfg_attributes[] = {
+ &dev_attr_hrtbt_enable.attr,
+ &dev_attr_link_width_enable.attr,
+ &dev_attr_link_width.attr,
+ &dev_attr_link_speed_enable.attr,
+ &dev_attr_link_speed.attr,
+ &dev_attr_rx_pol_inv_enable.attr,
+ &dev_attr_rx_lane_rev_enable.attr,
+ NULL
+};
+
+static struct attribute_group dev_ibcfg_attr_group = {
+ .attrs = dev_ibcfg_attributes
+};
+
+/**
+ * ipath_expose_reset - create a device reset file
+ * @dev: the device structure
+ *
+ * Only expose a file that lets us reset the device after someone
+ * enters diag mode. A device reset is quite likely to crash the
+ * machine entirely, so we don't normally want to make it
+ * available.
+ *
+ * Called with ipath_mutex held.
+ */
+int ipath_expose_reset(struct device *dev)
+{
+ static int exposed;
+ int ret;
+
+ if (!exposed) {
+ ret = device_create_file(dev, &dev_attr_reset);
+ exposed = 1;
+	} else
+ ret = 0;
+
+ return ret;
+}
+
+int ipath_device_create_group(struct device *dev, struct ipath_devdata *dd)
+{
+ int ret;
+
+ ret = sysfs_create_group(&dev->kobj, &dev_attr_group);
+ if (ret)
+ goto bail;
+
+ ret = sysfs_create_group(&dev->kobj, &dev_counter_attr_group);
+ if (ret)
+ goto bail_attrs;
+
+ if (dd->ipath_flags & IPATH_HAS_MULT_IB_SPEED) {
+ ret = device_create_file(dev, &dev_attr_jint_idle_ticks);
+ if (ret)
+ goto bail_counter;
+ ret = device_create_file(dev, &dev_attr_jint_max_packets);
+ if (ret)
+ goto bail_idle;
+
+ ret = sysfs_create_group(&dev->kobj, &dev_ibcfg_attr_group);
+ if (ret)
+ goto bail_max;
+ }
+
+ return 0;
+
+bail_max:
+ device_remove_file(dev, &dev_attr_jint_max_packets);
+bail_idle:
+ device_remove_file(dev, &dev_attr_jint_idle_ticks);
+bail_counter:
+ sysfs_remove_group(&dev->kobj, &dev_counter_attr_group);
+bail_attrs:
+ sysfs_remove_group(&dev->kobj, &dev_attr_group);
+bail:
+ return ret;
+}
+
+void ipath_device_remove_group(struct device *dev, struct ipath_devdata *dd)
+{
+ sysfs_remove_group(&dev->kobj, &dev_counter_attr_group);
+
+ if (dd->ipath_flags & IPATH_HAS_MULT_IB_SPEED) {
+ sysfs_remove_group(&dev->kobj, &dev_ibcfg_attr_group);
+ device_remove_file(dev, &dev_attr_jint_idle_ticks);
+ device_remove_file(dev, &dev_attr_jint_max_packets);
+ }
+
+ sysfs_remove_group(&dev->kobj, &dev_attr_group);
+
+ device_remove_file(dev, &dev_attr_reset);
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_uc.c b/drivers/infiniband/hw/ipath/ipath_uc.c
new file mode 100644
index 000000000..22e60998f
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_uc.c
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_UC_##x
+
+/**
+ * ipath_make_uc_req - construct a request packet (SEND, RDMA write)
+ * @qp: a pointer to the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int ipath_make_uc_req(struct ipath_qp *qp)
+{
+ struct ipath_other_headers *ohdr;
+ struct ipath_swqe *wqe;
+ unsigned long flags;
+ u32 hwords;
+ u32 bth0;
+ u32 len;
+ u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+ int ret = 0;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) {
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
+ goto bail;
+ /* We are in the error state, flush the work request. */
+ if (qp->s_last == qp->s_head)
+ goto bail;
+ /* If DMAs are in progress, we can't flush immediately. */
+ if (atomic_read(&qp->s_dma_busy)) {
+ qp->s_flags |= IPATH_S_WAIT_DMA;
+ goto bail;
+ }
+ wqe = get_swqe_ptr(qp, qp->s_last);
+ ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+ goto done;
+ }
+
+ ohdr = &qp->s_hdr.u.oth;
+ if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+ ohdr = &qp->s_hdr.u.l.oth;
+
+ /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+ hwords = 5;
+ bth0 = 1 << 22; /* Set M bit */
+
+ /* Get the next send request. */
+ wqe = get_swqe_ptr(qp, qp->s_cur);
+ qp->s_wqe = NULL;
+ switch (qp->s_state) {
+ default:
+ if (!(ib_ipath_state_ops[qp->state] &
+ IPATH_PROCESS_NEXT_SEND_OK))
+ goto bail;
+ /* Check if send work queue is empty. */
+ if (qp->s_cur == qp->s_head)
+ goto bail;
+ /*
+ * Start a new request.
+ */
+ qp->s_psn = wqe->psn = qp->s_next_psn;
+ qp->s_sge.sge = wqe->sg_list[0];
+ qp->s_sge.sg_list = wqe->sg_list + 1;
+ qp->s_sge.num_sge = wqe->wr.num_sge;
+ qp->s_len = len = wqe->length;
+ switch (wqe->wr.opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_IMM:
+ if (len > pmtu) {
+ qp->s_state = OP(SEND_FIRST);
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND)
+ qp->s_state = OP(SEND_ONLY);
+ else {
+ qp->s_state =
+ OP(SEND_ONLY_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ }
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= 1 << 23;
+ qp->s_wqe = wqe;
+ if (++qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ ohdr->u.rc.reth.vaddr =
+ cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+ ohdr->u.rc.reth.rkey =
+ cpu_to_be32(wqe->wr.wr.rdma.rkey);
+ ohdr->u.rc.reth.length = cpu_to_be32(len);
+ hwords += sizeof(struct ib_reth) / 4;
+ if (len > pmtu) {
+ qp->s_state = OP(RDMA_WRITE_FIRST);
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+ qp->s_state = OP(RDMA_WRITE_ONLY);
+ else {
+ qp->s_state =
+ OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
+ /* Immediate data comes after the RETH */
+ ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= 1 << 23;
+ }
+ qp->s_wqe = wqe;
+ if (++qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ default:
+ goto bail;
+ }
+ break;
+
+ case OP(SEND_FIRST):
+ qp->s_state = OP(SEND_MIDDLE);
+ /* FALLTHROUGH */
+ case OP(SEND_MIDDLE):
+ len = qp->s_len;
+ if (len > pmtu) {
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND)
+ qp->s_state = OP(SEND_LAST);
+ else {
+ qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ }
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= 1 << 23;
+ qp->s_wqe = wqe;
+ if (++qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case OP(RDMA_WRITE_FIRST):
+ qp->s_state = OP(RDMA_WRITE_MIDDLE);
+ /* FALLTHROUGH */
+ case OP(RDMA_WRITE_MIDDLE):
+ len = qp->s_len;
+ if (len > pmtu) {
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+ qp->s_state = OP(RDMA_WRITE_LAST);
+ else {
+ qp->s_state =
+ OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= 1 << 23;
+ }
+ qp->s_wqe = wqe;
+ if (++qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+ }
+ qp->s_len -= len;
+ qp->s_hdrwords = hwords;
+ qp->s_cur_sge = &qp->s_sge;
+ qp->s_cur_size = len;
+ ipath_make_ruc_header(to_idev(qp->ibqp.device),
+ qp, ohdr, bth0 | (qp->s_state << 24),
+ qp->s_next_psn++ & IPATH_PSN_MASK);
+done:
+ ret = 1;
+ goto unlock;
+
+bail:
+ qp->s_flags &= ~IPATH_S_BUSY;
+unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return ret;
+}
+
+/**
+ * ipath_uc_rcv - handle an incoming UC packet
+ * @dev: the device the packet came in on
+ * @hdr: the header of the packet
+ * @has_grh: true if the packet has a GRH
+ * @data: the packet data
+ * @tlen: the length of the packet
+ * @qp: the QP for this packet.
+ *
+ * This is called from ipath_qp_rcv() to process an incoming UC packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+ int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
+{
+ struct ipath_other_headers *ohdr;
+ int opcode;
+ u32 hdrsize;
+ u32 psn;
+ u32 pad;
+ struct ib_wc wc;
+ u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+ struct ib_reth *reth;
+ int header_in_data;
+
+ /* Validate the SLID. See Ch. 9.6.1.5 */
+ if (unlikely(be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid))
+ goto done;
+
+ /* Check for GRH */
+ if (!has_grh) {
+ ohdr = &hdr->u.oth;
+ hdrsize = 8 + 12; /* LRH + BTH */
+ psn = be32_to_cpu(ohdr->bth[2]);
+ header_in_data = 0;
+ } else {
+ ohdr = &hdr->u.l.oth;
+ hdrsize = 8 + 40 + 12; /* LRH + GRH + BTH */
+ /*
+ * The header with GRH is 60 bytes and the
+ * core driver sets the eager header buffer
+		 * size to 56 bytes, so the last 4 bytes of
+		 * the BTH header (the PSN) are in the data buffer.
+ */
+ header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
+ if (header_in_data) {
+ psn = be32_to_cpu(((__be32 *) data)[0]);
+ data += sizeof(__be32);
+ } else
+ psn = be32_to_cpu(ohdr->bth[2]);
+ }
+ /*
+	 * The opcode is in the low byte when it's in network order
+ * (top byte when in host order).
+ */
+ opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+
+ memset(&wc, 0, sizeof wc);
+
+	/* Compare the PSN against the expected PSN. */
+ if (unlikely(ipath_cmp24(psn, qp->r_psn) != 0)) {
+ /*
+ * Handle a sequence error.
+ * Silently drop any current message.
+ */
+ qp->r_psn = psn;
+ inv:
+ qp->r_state = OP(SEND_LAST);
+ switch (opcode) {
+ case OP(SEND_FIRST):
+ case OP(SEND_ONLY):
+ case OP(SEND_ONLY_WITH_IMMEDIATE):
+ goto send_first;
+
+ case OP(RDMA_WRITE_FIRST):
+ case OP(RDMA_WRITE_ONLY):
+ case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+ goto rdma_first;
+
+ default:
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ }
+
+ /* Check for opcode sequence errors. */
+ switch (qp->r_state) {
+ case OP(SEND_FIRST):
+ case OP(SEND_MIDDLE):
+ if (opcode == OP(SEND_MIDDLE) ||
+ opcode == OP(SEND_LAST) ||
+ opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+ break;
+ goto inv;
+
+ case OP(RDMA_WRITE_FIRST):
+ case OP(RDMA_WRITE_MIDDLE):
+ if (opcode == OP(RDMA_WRITE_MIDDLE) ||
+ opcode == OP(RDMA_WRITE_LAST) ||
+ opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+ break;
+ goto inv;
+
+ default:
+ if (opcode == OP(SEND_FIRST) ||
+ opcode == OP(SEND_ONLY) ||
+ opcode == OP(SEND_ONLY_WITH_IMMEDIATE) ||
+ opcode == OP(RDMA_WRITE_FIRST) ||
+ opcode == OP(RDMA_WRITE_ONLY) ||
+ opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+ break;
+ goto inv;
+ }
+
+ /* OK, process the packet. */
+ switch (opcode) {
+ case OP(SEND_FIRST):
+ case OP(SEND_ONLY):
+ case OP(SEND_ONLY_WITH_IMMEDIATE):
+ send_first:
+ if (qp->r_flags & IPATH_R_REUSE_SGE) {
+ qp->r_flags &= ~IPATH_R_REUSE_SGE;
+ qp->r_sge = qp->s_rdma_read_sge;
+ } else if (!ipath_get_rwqe(qp, 0)) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ /* Save the WQE so we can reuse it in case of an error. */
+ qp->s_rdma_read_sge = qp->r_sge;
+ qp->r_rcv_len = 0;
+ if (opcode == OP(SEND_ONLY))
+ goto send_last;
+ else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE))
+ goto send_last_imm;
+ /* FALLTHROUGH */
+ case OP(SEND_MIDDLE):
+ /* Check for invalid length PMTU or posted rwqe len. */
+ if (unlikely(tlen != (hdrsize + pmtu + 4))) {
+ qp->r_flags |= IPATH_R_REUSE_SGE;
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ qp->r_rcv_len += pmtu;
+ if (unlikely(qp->r_rcv_len > qp->r_len)) {
+ qp->r_flags |= IPATH_R_REUSE_SGE;
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ ipath_copy_sge(&qp->r_sge, data, pmtu);
+ break;
+
+ case OP(SEND_LAST_WITH_IMMEDIATE):
+ send_last_imm:
+ if (header_in_data) {
+ wc.ex.imm_data = *(__be32 *) data;
+ data += sizeof(__be32);
+ } else {
+ /* Immediate data comes after BTH */
+ wc.ex.imm_data = ohdr->u.imm_data;
+ }
+ hdrsize += 4;
+ wc.wc_flags = IB_WC_WITH_IMM;
+ /* FALLTHROUGH */
+ case OP(SEND_LAST):
+ send_last:
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ /* Check for invalid length. */
+ /* XXX LAST len should be >= 1 */
+ if (unlikely(tlen < (hdrsize + pad + 4))) {
+ qp->r_flags |= IPATH_R_REUSE_SGE;
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ /* Don't count the CRC. */
+ tlen -= (hdrsize + pad + 4);
+ wc.byte_len = tlen + qp->r_rcv_len;
+ if (unlikely(wc.byte_len > qp->r_len)) {
+ qp->r_flags |= IPATH_R_REUSE_SGE;
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ wc.opcode = IB_WC_RECV;
+ last_imm:
+ ipath_copy_sge(&qp->r_sge, data, tlen);
+ wc.wr_id = qp->r_wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.qp = &qp->ibqp;
+ wc.src_qp = qp->remote_qpn;
+ wc.slid = qp->remote_ah_attr.dlid;
+ wc.sl = qp->remote_ah_attr.sl;
+ /* Signal completion event if the solicited bit is set. */
+ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+ (ohdr->bth[0] &
+ cpu_to_be32(1 << 23)) != 0);
+ break;
+
+ case OP(RDMA_WRITE_FIRST):
+ case OP(RDMA_WRITE_ONLY):
+ case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */
+ rdma_first:
+ /* RETH comes after BTH */
+ if (!header_in_data)
+ reth = &ohdr->u.rc.reth;
+ else {
+ reth = (struct ib_reth *)data;
+ data += sizeof(*reth);
+ }
+ hdrsize += sizeof(*reth);
+ qp->r_len = be32_to_cpu(reth->length);
+ qp->r_rcv_len = 0;
+ if (qp->r_len != 0) {
+ u32 rkey = be32_to_cpu(reth->rkey);
+ u64 vaddr = be64_to_cpu(reth->vaddr);
+ int ok;
+
+ /* Check rkey */
+ ok = ipath_rkey_ok(qp, &qp->r_sge, qp->r_len,
+ vaddr, rkey,
+ IB_ACCESS_REMOTE_WRITE);
+ if (unlikely(!ok)) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ } else {
+ qp->r_sge.sg_list = NULL;
+ qp->r_sge.sge.mr = NULL;
+ qp->r_sge.sge.vaddr = NULL;
+ qp->r_sge.sge.length = 0;
+ qp->r_sge.sge.sge_length = 0;
+ }
+ if (unlikely(!(qp->qp_access_flags &
+ IB_ACCESS_REMOTE_WRITE))) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ if (opcode == OP(RDMA_WRITE_ONLY))
+ goto rdma_last;
+ else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+ goto rdma_last_imm;
+ /* FALLTHROUGH */
+ case OP(RDMA_WRITE_MIDDLE):
+ /* Check for invalid length PMTU or posted rwqe len. */
+ if (unlikely(tlen != (hdrsize + pmtu + 4))) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ qp->r_rcv_len += pmtu;
+ if (unlikely(qp->r_rcv_len > qp->r_len)) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ ipath_copy_sge(&qp->r_sge, data, pmtu);
+ break;
+
+ case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
+ rdma_last_imm:
+ if (header_in_data) {
+ wc.ex.imm_data = *(__be32 *) data;
+ data += sizeof(__be32);
+ } else {
+ /* Immediate data comes after BTH */
+ wc.ex.imm_data = ohdr->u.imm_data;
+ }
+ hdrsize += 4;
+ wc.wc_flags = IB_WC_WITH_IMM;
+
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ /* Check for invalid length. */
+ /* XXX LAST len should be >= 1 */
+ if (unlikely(tlen < (hdrsize + pad + 4))) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ /* Don't count the CRC. */
+ tlen -= (hdrsize + pad + 4);
+ if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ if (qp->r_flags & IPATH_R_REUSE_SGE)
+ qp->r_flags &= ~IPATH_R_REUSE_SGE;
+ else if (!ipath_get_rwqe(qp, 1)) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ wc.byte_len = qp->r_len;
+ wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ goto last_imm;
+
+ case OP(RDMA_WRITE_LAST):
+ rdma_last:
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ /* Check for invalid length. */
+ /* XXX LAST len should be >= 1 */
+ if (unlikely(tlen < (hdrsize + pad + 4))) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ /* Don't count the CRC. */
+ tlen -= (hdrsize + pad + 4);
+ if (unlikely(tlen + qp->r_rcv_len != qp->r_len)) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ ipath_copy_sge(&qp->r_sge, data, tlen);
+ break;
+
+ default:
+ /* Drop packet for unknown opcodes. */
+ dev->n_pkt_drops++;
+ goto done;
+ }
+ qp->r_psn++;
+ qp->r_state = opcode;
+done:
+ return;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_ud.c b/drivers/infiniband/hw/ipath/ipath_ud.c
new file mode 100644
index 000000000..e8a2a9152
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_ud.c
@@ -0,0 +1,580 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/sched.h>
+#include <rdma/ib_smi.h>
+
+#include "ipath_verbs.h"
+#include "ipath_kernel.h"
+
+/**
+ * ipath_ud_loopback - handle send on loopback QPs
+ * @sqp: the sending QP
+ * @swqe: the send work request
+ *
+ * This is called from ipath_make_ud_req() to forward a WQE addressed
+ * to the same HCA.
+ * Note that the receive interrupt handler may be calling ipath_ud_rcv()
+ * while this is being called.
+ */
+static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe)
+{
+ struct ipath_ibdev *dev = to_idev(sqp->ibqp.device);
+ struct ipath_qp *qp;
+ struct ib_ah_attr *ah_attr;
+ unsigned long flags;
+ struct ipath_rq *rq;
+ struct ipath_srq *srq;
+ struct ipath_sge_state rsge;
+ struct ipath_sge *sge;
+ struct ipath_rwq *wq;
+ struct ipath_rwqe *wqe;
+ void (*handler)(struct ib_event *, void *);
+ struct ib_wc wc;
+ u32 tail;
+ u32 rlen;
+ u32 length;
+
+ qp = ipath_lookup_qpn(&dev->qp_table, swqe->wr.wr.ud.remote_qpn);
+ if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
+ dev->n_pkt_drops++;
+ goto done;
+ }
+
+ /*
+ * Check that the qkey matches (except for QP0, see 9.6.1.4.1).
+ * Qkeys with the high order bit set mean use the
+ * qkey from the QP context instead of the WR (see 10.2.5).
+ */
+ if (unlikely(qp->ibqp.qp_num &&
+ ((int) swqe->wr.wr.ud.remote_qkey < 0 ?
+ sqp->qkey : swqe->wr.wr.ud.remote_qkey) != qp->qkey)) {
+ /* XXX OK to lose a count once in a while. */
+ dev->qkey_violations++;
+ dev->n_pkt_drops++;
+ goto drop;
+ }
+
+ /*
+ * A GRH is expected to precede the data even if not
+ * present on the wire.
+ */
+ length = swqe->length;
+ memset(&wc, 0, sizeof wc);
+ wc.byte_len = length + sizeof(struct ib_grh);
+
+ if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+ wc.wc_flags = IB_WC_WITH_IMM;
+ wc.ex.imm_data = swqe->wr.ex.imm_data;
+ }
+
+ /*
+	 * This would be a lot simpler if we could call ipath_get_rwqe(),
+	 * but that uses state the receive interrupt handler also uses,
+	 * so we would need to lock out receive interrupts while doing
+ * local loopback.
+ */
+ if (qp->ibqp.srq) {
+ srq = to_isrq(qp->ibqp.srq);
+ handler = srq->ibsrq.event_handler;
+ rq = &srq->rq;
+ } else {
+ srq = NULL;
+ handler = NULL;
+ rq = &qp->r_rq;
+ }
+
+ /*
+ * Get the next work request entry to find where to put the data.
+ * Note that it is safe to drop the lock after changing rq->tail
+ * since ipath_post_receive() won't fill the empty slot.
+ */
+ spin_lock_irqsave(&rq->lock, flags);
+ wq = rq->wq;
+ tail = wq->tail;
+ /* Validate tail before using it since it is user writable. */
+ if (tail >= rq->size)
+ tail = 0;
+ if (unlikely(tail == wq->head)) {
+ spin_unlock_irqrestore(&rq->lock, flags);
+ dev->n_pkt_drops++;
+ goto drop;
+ }
+ wqe = get_rwqe_ptr(rq, tail);
+ rsge.sg_list = qp->r_ud_sg_list;
+ if (!ipath_init_sge(qp, wqe, &rlen, &rsge)) {
+ spin_unlock_irqrestore(&rq->lock, flags);
+ dev->n_pkt_drops++;
+ goto drop;
+ }
+ /* Silently drop packets which are too big. */
+ if (wc.byte_len > rlen) {
+ spin_unlock_irqrestore(&rq->lock, flags);
+ dev->n_pkt_drops++;
+ goto drop;
+ }
+ if (++tail >= rq->size)
+ tail = 0;
+ wq->tail = tail;
+ wc.wr_id = wqe->wr_id;
+ if (handler) {
+ u32 n;
+
+ /*
+ * validate head pointer value and compute
+ * the number of remaining WQEs.
+ */
+ n = wq->head;
+ if (n >= rq->size)
+ n = 0;
+ if (n < tail)
+ n += rq->size - tail;
+ else
+ n -= tail;
+ if (n < srq->limit) {
+ struct ib_event ev;
+
+ srq->limit = 0;
+ spin_unlock_irqrestore(&rq->lock, flags);
+ ev.device = qp->ibqp.device;
+ ev.element.srq = qp->ibqp.srq;
+ ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+ handler(&ev, srq->ibsrq.srq_context);
+ } else
+ spin_unlock_irqrestore(&rq->lock, flags);
+ } else
+ spin_unlock_irqrestore(&rq->lock, flags);
+
+ ah_attr = &to_iah(swqe->wr.wr.ud.ah)->attr;
+ if (ah_attr->ah_flags & IB_AH_GRH) {
+ ipath_copy_sge(&rsge, &ah_attr->grh, sizeof(struct ib_grh));
+ wc.wc_flags |= IB_WC_GRH;
+ } else
+ ipath_skip_sge(&rsge, sizeof(struct ib_grh));
+ sge = swqe->sg_list;
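+	/* Copy the payload from the sender's SG list into the receiver's SGE state. */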
+ while (length) {
+ u32 len = sge->length;
+
+ if (len > length)
+ len = length;
+ if (len > sge->sge_length)
+ len = sge->sge_length;
+ BUG_ON(len == 0);
+ ipath_copy_sge(&rsge, sge->vaddr, len);
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (--swqe->wr.num_sge)
+ sge++;
+ } else if (sge->length == 0 && sge->mr != NULL) {
+ if (++sge->n >= IPATH_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr =
+ sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length =
+ sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ length -= len;
+ }
+ wc.status = IB_WC_SUCCESS;
+ wc.opcode = IB_WC_RECV;
+ wc.qp = &qp->ibqp;
+ wc.src_qp = sqp->ibqp.qp_num;
+ /* XXX do we know which pkey matched? Only needed for GSI. */
+ wc.pkey_index = 0;
+ wc.slid = dev->dd->ipath_lid |
+ (ah_attr->src_path_bits &
+ ((1 << dev->dd->ipath_lmc) - 1));
+ wc.sl = ah_attr->sl;
+ wc.dlid_path_bits =
+ ah_attr->dlid & ((1 << dev->dd->ipath_lmc) - 1);
+ wc.port_num = 1;
+ /* Signal completion event if the solicited bit is set. */
+ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+ swqe->wr.send_flags & IB_SEND_SOLICITED);
+drop:
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+done:;
+}
+
+/**
+ * ipath_make_ud_req - construct a UD request packet
+ * @qp: the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int ipath_make_ud_req(struct ipath_qp *qp)
+{
+ struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+ struct ipath_other_headers *ohdr;
+ struct ib_ah_attr *ah_attr;
+ struct ipath_swqe *wqe;
+ unsigned long flags;
+ u32 nwords;
+ u32 extra_bytes;
+ u32 bth0;
+ u16 lrh0;
+ u16 lid;
+ int ret = 0;
+ int next_cur;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_NEXT_SEND_OK)) {
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND))
+ goto bail;
+ /* We are in the error state, flush the work request. */
+ if (qp->s_last == qp->s_head)
+ goto bail;
+ /* If DMAs are in progress, we can't flush immediately. */
+ if (atomic_read(&qp->s_dma_busy)) {
+ qp->s_flags |= IPATH_S_WAIT_DMA;
+ goto bail;
+ }
+ wqe = get_swqe_ptr(qp, qp->s_last);
+ ipath_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+ goto done;
+ }
+
+ if (qp->s_cur == qp->s_head)
+ goto bail;
+
+ wqe = get_swqe_ptr(qp, qp->s_cur);
+ next_cur = qp->s_cur + 1;
+ if (next_cur >= qp->s_size)
+ next_cur = 0;
+
+ /* Construct the header. */
+ ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr;
+ if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE) {
+ if (ah_attr->dlid != IPATH_PERMISSIVE_LID)
+ dev->n_multicast_xmit++;
+ else
+ dev->n_unicast_xmit++;
+ } else {
+ dev->n_unicast_xmit++;
+ lid = ah_attr->dlid & ~((1 << dev->dd->ipath_lmc) - 1);
+ if (unlikely(lid == dev->dd->ipath_lid)) {
+ /*
+ * If DMAs are in progress, we can't generate
+ * a completion for the loopback packet since
+ * it would be out of order.
+ * XXX Instead of waiting, we could queue a
+ * zero length descriptor so we get a callback.
+ */
+ if (atomic_read(&qp->s_dma_busy)) {
+ qp->s_flags |= IPATH_S_WAIT_DMA;
+ goto bail;
+ }
+ qp->s_cur = next_cur;
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ ipath_ud_loopback(qp, wqe);
+ spin_lock_irqsave(&qp->s_lock, flags);
+ ipath_send_complete(qp, wqe, IB_WC_SUCCESS);
+ goto done;
+ }
+ }
+
+ qp->s_cur = next_cur;
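+	/* Pad the payload out to a whole number of 32-bit words. */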
+ extra_bytes = -wqe->length & 3;
+ nwords = (wqe->length + extra_bytes) >> 2;
+
+ /* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */
+ qp->s_hdrwords = 7;
+ qp->s_cur_size = wqe->length;
+ qp->s_cur_sge = &qp->s_sge;
+ qp->s_dmult = ah_attr->static_rate;
+ qp->s_wqe = wqe;
+ qp->s_sge.sge = wqe->sg_list[0];
+ qp->s_sge.sg_list = wqe->sg_list + 1;
+ qp->s_sge.num_sge = wqe->wr.num_sge;
+
+ if (ah_attr->ah_flags & IB_AH_GRH) {
+ /* Header size in 32-bit words. */
+ qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh,
+ &ah_attr->grh,
+ qp->s_hdrwords, nwords);
+ lrh0 = IPATH_LRH_GRH;
+ ohdr = &qp->s_hdr.u.l.oth;
+ /*
+ * Don't worry about sending to locally attached multicast
+		 * QPs; the spec leaves what happens unspecified.
+ */
+ } else {
+ /* Header size in 32-bit words. */
+ lrh0 = IPATH_LRH_BTH;
+ ohdr = &qp->s_hdr.u.oth;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+ qp->s_hdrwords++;
+ ohdr->u.ud.imm_data = wqe->wr.ex.imm_data;
+ bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24;
+ } else
+ bth0 = IB_OPCODE_UD_SEND_ONLY << 24;
+ lrh0 |= ah_attr->sl << 4;
+ if (qp->ibqp.qp_type == IB_QPT_SMI)
+ lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */
+ qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
+ qp->s_hdr.lrh[1] = cpu_to_be16(ah_attr->dlid); /* DEST LID */
+ qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords +
+ SIZE_OF_CRC);
+ lid = dev->dd->ipath_lid;
+ if (lid) {
+ lid |= ah_attr->src_path_bits &
+ ((1 << dev->dd->ipath_lmc) - 1);
+ qp->s_hdr.lrh[3] = cpu_to_be16(lid);
+ } else
+ qp->s_hdr.lrh[3] = IB_LID_PERMISSIVE;
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= 1 << 23;
+ bth0 |= extra_bytes << 20;
+ bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? IPATH_DEFAULT_P_KEY :
+ ipath_get_pkey(dev->dd, qp->s_pkey_index);
+ ohdr->bth[0] = cpu_to_be32(bth0);
+ /*
+ * Use the multicast QP if the destination LID is a multicast LID.
+ */
+ ohdr->bth[1] = ah_attr->dlid >= IPATH_MULTICAST_LID_BASE &&
+ ah_attr->dlid != IPATH_PERMISSIVE_LID ?
+ cpu_to_be32(IPATH_MULTICAST_QPN) :
+ cpu_to_be32(wqe->wr.wr.ud.remote_qpn);
+ ohdr->bth[2] = cpu_to_be32(qp->s_next_psn++ & IPATH_PSN_MASK);
+ /*
+ * Qkeys with the high order bit set mean use the
+ * qkey from the QP context instead of the WR (see 10.2.5).
+ */
+ ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->wr.wr.ud.remote_qkey < 0 ?
+ qp->qkey : wqe->wr.wr.ud.remote_qkey);
+ ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
+
+done:
+ ret = 1;
+ goto unlock;
+
+bail:
+ qp->s_flags &= ~IPATH_S_BUSY;
+unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return ret;
+}
+
+/**
+ * ipath_ud_rcv - receive an incoming UD packet
+ * @dev: the device the packet came in on
+ * @hdr: the packet header
+ * @has_grh: true if the packet has a GRH
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP the packet came on
+ *
+ * This is called from ipath_qp_rcv() to process an incoming UD packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+ int has_grh, void *data, u32 tlen, struct ipath_qp *qp)
+{
+ struct ipath_other_headers *ohdr;
+ int opcode;
+ u32 hdrsize;
+ u32 pad;
+ struct ib_wc wc;
+ u32 qkey;
+ u32 src_qp;
+ u16 dlid;
+ int header_in_data;
+
+ /* Check for GRH */
+ if (!has_grh) {
+ ohdr = &hdr->u.oth;
+ hdrsize = 8 + 12 + 8; /* LRH + BTH + DETH */
+ qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
+ src_qp = be32_to_cpu(ohdr->u.ud.deth[1]);
+ header_in_data = 0;
+ } else {
+ ohdr = &hdr->u.l.oth;
+ hdrsize = 8 + 40 + 12 + 8; /* LRH + GRH + BTH + DETH */
+ /*
+ * The header with GRH is 68 bytes and the core driver sets
+		 * the eager header buffer size to 56 bytes, so the last 12
+		 * bytes of the IB header are in the data buffer.
+ */
+ header_in_data = dev->dd->ipath_rcvhdrentsize == 16;
+ if (header_in_data) {
+ qkey = be32_to_cpu(((__be32 *) data)[1]);
+ src_qp = be32_to_cpu(((__be32 *) data)[2]);
+ data += 12;
+ } else {
+ qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
+ src_qp = be32_to_cpu(ohdr->u.ud.deth[1]);
+ }
+ }
+ src_qp &= IPATH_QPN_MASK;
+
+ /*
+ * Check that the permissive LID is only used on QP0
+ * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1).
+ */
+ if (qp->ibqp.qp_num) {
+ if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE ||
+ hdr->lrh[3] == IB_LID_PERMISSIVE)) {
+ dev->n_pkt_drops++;
+ goto bail;
+ }
+ if (unlikely(qkey != qp->qkey)) {
+ /* XXX OK to lose a count once in a while. */
+ dev->qkey_violations++;
+ dev->n_pkt_drops++;
+ goto bail;
+ }
+ } else if (hdr->lrh[1] == IB_LID_PERMISSIVE ||
+ hdr->lrh[3] == IB_LID_PERMISSIVE) {
+ struct ib_smp *smp = (struct ib_smp *) data;
+
+ if (smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ dev->n_pkt_drops++;
+ goto bail;
+ }
+ }
+
+ /*
+	 * The opcode is in the low byte when it's in network order
+ * (top byte when in host order).
+ */
+ opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+ if (qp->ibqp.qp_num > 1 &&
+ opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
+ if (header_in_data) {
+ wc.ex.imm_data = *(__be32 *) data;
+ data += sizeof(__be32);
+ } else
+ wc.ex.imm_data = ohdr->u.ud.imm_data;
+ wc.wc_flags = IB_WC_WITH_IMM;
+ hdrsize += sizeof(u32);
+ } else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
+ wc.ex.imm_data = 0;
+ wc.wc_flags = 0;
+ } else {
+ dev->n_pkt_drops++;
+ goto bail;
+ }
+
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ if (unlikely(tlen < (hdrsize + pad + 4))) {
+ /* Drop incomplete packets. */
+ dev->n_pkt_drops++;
+ goto bail;
+ }
+ tlen -= hdrsize + pad + 4;
+
+ /* Drop invalid MAD packets (see 13.5.3.1). */
+ if (unlikely((qp->ibqp.qp_num == 0 &&
+ (tlen != 256 ||
+ (be16_to_cpu(hdr->lrh[0]) >> 12) != 15)) ||
+ (qp->ibqp.qp_num == 1 &&
+ (tlen != 256 ||
+ (be16_to_cpu(hdr->lrh[0]) >> 12) == 15)))) {
+ dev->n_pkt_drops++;
+ goto bail;
+ }
+
+ /*
+ * A GRH is expected to precede the data even if not
+ * present on the wire.
+ */
+ wc.byte_len = tlen + sizeof(struct ib_grh);
+
+ /*
+ * Get the next work request entry to find where to put the data.
+ */
+ if (qp->r_flags & IPATH_R_REUSE_SGE)
+ qp->r_flags &= ~IPATH_R_REUSE_SGE;
+ else if (!ipath_get_rwqe(qp, 0)) {
+ /*
+ * Count VL15 packets dropped due to no receive buffer.
+		 * Otherwise, count them as buffer overruns, since the HW will
+		 * usually be able to receive packets even if there are
+ * no QPs with posted receive buffers.
+ */
+ if (qp->ibqp.qp_num == 0)
+ dev->n_vl15_dropped++;
+ else
+ dev->rcv_errors++;
+ goto bail;
+ }
+ /* Silently drop packets which are too big. */
+ if (wc.byte_len > qp->r_len) {
+ qp->r_flags |= IPATH_R_REUSE_SGE;
+ dev->n_pkt_drops++;
+ goto bail;
+ }
+ if (has_grh) {
+ ipath_copy_sge(&qp->r_sge, &hdr->u.l.grh,
+ sizeof(struct ib_grh));
+ wc.wc_flags |= IB_WC_GRH;
+ } else
+ ipath_skip_sge(&qp->r_sge, sizeof(struct ib_grh));
+ ipath_copy_sge(&qp->r_sge, data,
+ wc.byte_len - sizeof(struct ib_grh));
+ if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
+ goto bail;
+ wc.wr_id = qp->r_wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.opcode = IB_WC_RECV;
+ wc.vendor_err = 0;
+ wc.qp = &qp->ibqp;
+ wc.src_qp = src_qp;
+ /* XXX do we know which pkey matched? Only needed for GSI. */
+ wc.pkey_index = 0;
+ wc.slid = be16_to_cpu(hdr->lrh[3]);
+ wc.sl = (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF;
+ dlid = be16_to_cpu(hdr->lrh[1]);
+ /*
+ * Save the LMC lower bits if the destination LID is a unicast LID.
+ */
+ wc.dlid_path_bits = dlid >= IPATH_MULTICAST_LID_BASE ? 0 :
+ dlid & ((1 << dev->dd->ipath_lmc) - 1);
+ wc.port_num = 1;
+ /* Signal completion event if the solicited bit is set. */
+ ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+ (ohdr->bth[0] &
+ cpu_to_be32(1 << 23)) != 0);
+
+bail:;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_user_pages.c b/drivers/infiniband/hw/ipath/ipath_user_pages.c
new file mode 100644
index 000000000..1da1252dc
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_user_pages.c
@@ -0,0 +1,229 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mm.h>
+#include <linux/device.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+#include "ipath_kernel.h"
+
+static void __ipath_release_user_pages(struct page **p, size_t num_pages,
+ int dirty)
+{
+ size_t i;
+
+ for (i = 0; i < num_pages; i++) {
+ ipath_cdbg(MM, "%lu/%lu put_page %p\n", (unsigned long) i,
+ (unsigned long) num_pages, p[i]);
+ if (dirty)
+ set_page_dirty_lock(p[i]);
+ put_page(p[i]);
+ }
+}
+
+/* call with current->mm->mmap_sem held */
+static int __ipath_get_user_pages(unsigned long start_page, size_t num_pages,
+ struct page **p)
+{
+ unsigned long lock_limit;
+ size_t got;
+ int ret;
+
+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
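+	/* Refuse to pin more pages than RLIMIT_MEMLOCK allows. */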
+ if (num_pages > lock_limit) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ ipath_cdbg(VERBOSE, "pin %lx pages from vaddr %lx\n",
+ (unsigned long) num_pages, start_page);
+
+ for (got = 0; got < num_pages; got += ret) {
+ ret = get_user_pages(current, current->mm,
+ start_page + got * PAGE_SIZE,
+ num_pages - got, 1, 1,
+ p + got, NULL);
+ if (ret < 0)
+ goto bail_release;
+ }
+
+ current->mm->pinned_vm += num_pages;
+
+ ret = 0;
+ goto bail;
+
+bail_release:
+ __ipath_release_user_pages(p, got, 0);
+bail:
+ return ret;
+}
+
+/**
+ * ipath_map_page - a safety wrapper around pci_map_page()
+ *
+ * A dma_addr of all 0's is interpreted by the chip as "disabled".
+ * Unfortunately, it can also be a valid dma_addr returned on some
+ * architectures.
+ *
+ * The powerpc iommu assigns dma_addrs in ascending order, so we don't
+ * have to bother with retries or mapping a dummy page to ensure we
+ * don't just get the same mapping again.
+ *
+ * I'm sure we won't be so lucky with other IOMMUs, so FIXME.
+ */
+dma_addr_t ipath_map_page(struct pci_dev *hwdev, struct page *page,
+ unsigned long offset, size_t size, int direction)
+{
+ dma_addr_t phys;
+
+ phys = pci_map_page(hwdev, page, offset, size, direction);
+
+ if (phys == 0) {
+ pci_unmap_page(hwdev, phys, size, direction);
+ phys = pci_map_page(hwdev, page, offset, size, direction);
+ /*
+ * FIXME: If we get 0 again, we should keep this page,
+ * map another, then free the 0 page.
+ */
+ }
+
+ return phys;
+}
+
+/**
+ * ipath_map_single - a safety wrapper around pci_map_single()
+ *
+ * Same idea as ipath_map_page().
+ */
+dma_addr_t ipath_map_single(struct pci_dev *hwdev, void *ptr, size_t size,
+ int direction)
+{
+ dma_addr_t phys;
+
+ phys = pci_map_single(hwdev, ptr, size, direction);
+
+ if (phys == 0) {
+ pci_unmap_single(hwdev, phys, size, direction);
+ phys = pci_map_single(hwdev, ptr, size, direction);
+ /*
+ * FIXME: If we get 0 again, we should keep this page,
+ * map another, then free the 0 page.
+ */
+ }
+
+ return phys;
+}
+
+/**
+ * ipath_get_user_pages - lock user pages into memory
+ * @start_page: the start page
+ * @num_pages: the number of pages
+ * @p: the output page structures
+ *
+ * This function takes a given start page (page aligned user virtual
+ * address) and pins it and the following specified number of pages. For
+ * now, num_pages is always 1, but that will probably change at some point
+ * (because the caller is doing expected sends on a single virtually contiguous
+ * buffer, so we can do all pages at once).
+ */
+int ipath_get_user_pages(unsigned long start_page, size_t num_pages,
+ struct page **p)
+{
+ int ret;
+
+ down_write(&current->mm->mmap_sem);
+
+ ret = __ipath_get_user_pages(start_page, num_pages, p);
+
+ up_write(&current->mm->mmap_sem);
+
+ return ret;
+}
+
+void ipath_release_user_pages(struct page **p, size_t num_pages)
+{
+ down_write(&current->mm->mmap_sem);
+
+ __ipath_release_user_pages(p, num_pages, 1);
+
+ current->mm->pinned_vm -= num_pages;
+
+ up_write(&current->mm->mmap_sem);
+}
+
+struct ipath_user_pages_work {
+ struct work_struct work;
+ struct mm_struct *mm;
+ unsigned long num_pages;
+};
+
+static void user_pages_account(struct work_struct *_work)
+{
+ struct ipath_user_pages_work *work =
+ container_of(_work, struct ipath_user_pages_work, work);
+
+ down_write(&work->mm->mmap_sem);
+ work->mm->pinned_vm -= work->num_pages;
+ up_write(&work->mm->mmap_sem);
+ mmput(work->mm);
+ kfree(work);
+}
+
+void ipath_release_user_pages_on_close(struct page **p, size_t num_pages)
+{
+ struct ipath_user_pages_work *work;
+ struct mm_struct *mm;
+
+ __ipath_release_user_pages(p, num_pages, 1);
+
+ mm = get_task_mm(current);
+ if (!mm)
+ return;
+
+ work = kmalloc(sizeof(*work), GFP_KERNEL);
+ if (!work)
+ goto bail_mm;
+
+ INIT_WORK(&work->work, user_pages_account);
+ work->mm = mm;
+ work->num_pages = num_pages;
+
+ queue_work(ib_wq, &work->work);
+ return;
+
+bail_mm:
+ mmput(mm);
+ return;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_user_sdma.c b/drivers/infiniband/hw/ipath/ipath_user_sdma.c
new file mode 100644
index 000000000..cc04b7ba3
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_user_sdma.c
@@ -0,0 +1,875 @@
+/*
+ * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/dmapool.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/uio.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+
+#include "ipath_kernel.h"
+#include "ipath_user_sdma.h"
+
+/* minimum size of header */
+#define IPATH_USER_SDMA_MIN_HEADER_LENGTH 64
+/* expected size of headers (for dma_pool) */
+#define IPATH_USER_SDMA_EXP_HEADER_LENGTH 64
+/* length mask in PBC (lower 11 bits) */
+#define IPATH_PBC_LENGTH_MASK ((1 << 11) - 1)
+
+struct ipath_user_sdma_pkt {
+ u8 naddr; /* dimension of addr (1..3) ... */
+ u32 counter; /* sdma pkts queued counter for this entry */
+ u64 added; /* global descq number of entries */
+
+ struct {
+ u32 offset; /* offset for kvaddr, addr */
+ u32 length; /* length in page */
+ u8 put_page; /* should we put_page? */
+ u8 dma_mapped; /* is page dma_mapped? */
+ struct page *page; /* may be NULL (coherent mem) */
+ void *kvaddr; /* FIXME: only for pio hack */
+ dma_addr_t addr;
+ } addr[4]; /* max pages, any more and we coalesce */
+ struct list_head list; /* list element */
+};
+
+struct ipath_user_sdma_queue {
+ /*
+ * pkts sent to dma engine are queued on this
+	 * list head.  the elements of this list are
+	 * struct ipath_user_sdma_pkt...
+ */
+ struct list_head sent;
+
+ /* headers with expected length are allocated from here... */
+ char header_cache_name[64];
+ struct dma_pool *header_cache;
+
+ /* packets are allocated from the slab cache... */
+ char pkt_slab_name[64];
+ struct kmem_cache *pkt_slab;
+
+	/* as packets are queued, they are counted... */
+ u32 counter;
+ u32 sent_counter;
+
+ /* dma page table */
+ struct rb_root dma_pages_root;
+
+ /* protect everything above... */
+ struct mutex lock;
+};
+
+struct ipath_user_sdma_queue *
+ipath_user_sdma_queue_create(struct device *dev, int unit, int port, int sport)
+{
+ struct ipath_user_sdma_queue *pq =
+ kmalloc(sizeof(struct ipath_user_sdma_queue), GFP_KERNEL);
+
+ if (!pq)
+ goto done;
+
+ pq->counter = 0;
+ pq->sent_counter = 0;
+ INIT_LIST_HEAD(&pq->sent);
+
+ mutex_init(&pq->lock);
+
+ snprintf(pq->pkt_slab_name, sizeof(pq->pkt_slab_name),
+ "ipath-user-sdma-pkts-%u-%02u.%02u", unit, port, sport);
+ pq->pkt_slab = kmem_cache_create(pq->pkt_slab_name,
+ sizeof(struct ipath_user_sdma_pkt),
+ 0, 0, NULL);
+
+ if (!pq->pkt_slab)
+ goto err_kfree;
+
+ snprintf(pq->header_cache_name, sizeof(pq->header_cache_name),
+ "ipath-user-sdma-headers-%u-%02u.%02u", unit, port, sport);
+ pq->header_cache = dma_pool_create(pq->header_cache_name,
+ dev,
+ IPATH_USER_SDMA_EXP_HEADER_LENGTH,
+ 4, 0);
+ if (!pq->header_cache)
+ goto err_slab;
+
+ pq->dma_pages_root = RB_ROOT;
+
+ goto done;
+
+err_slab:
+ kmem_cache_destroy(pq->pkt_slab);
+err_kfree:
+ kfree(pq);
+ pq = NULL;
+
+done:
+ return pq;
+}
+
+static void ipath_user_sdma_init_frag(struct ipath_user_sdma_pkt *pkt,
+ int i, size_t offset, size_t len,
+ int put_page, int dma_mapped,
+ struct page *page,
+ void *kvaddr, dma_addr_t dma_addr)
+{
+ pkt->addr[i].offset = offset;
+ pkt->addr[i].length = len;
+ pkt->addr[i].put_page = put_page;
+ pkt->addr[i].dma_mapped = dma_mapped;
+ pkt->addr[i].page = page;
+ pkt->addr[i].kvaddr = kvaddr;
+ pkt->addr[i].addr = dma_addr;
+}
+
+static void ipath_user_sdma_init_header(struct ipath_user_sdma_pkt *pkt,
+ u32 counter, size_t offset,
+ size_t len, int dma_mapped,
+ struct page *page,
+ void *kvaddr, dma_addr_t dma_addr)
+{
+ pkt->naddr = 1;
+ pkt->counter = counter;
+ ipath_user_sdma_init_frag(pkt, 0, offset, len, 0, dma_mapped, page,
+ kvaddr, dma_addr);
+}
+
+/* we have too many pages in the iovec, coalesce to a single page */
+static int ipath_user_sdma_coalesce(const struct ipath_devdata *dd,
+ struct ipath_user_sdma_pkt *pkt,
+ const struct iovec *iov,
+				    unsigned long niov)
+{
+ int ret = 0;
+ struct page *page = alloc_page(GFP_KERNEL);
+ void *mpage_save;
+ char *mpage;
+ int i;
+ int len = 0;
+ dma_addr_t dma_addr;
+
+ if (!page) {
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ mpage = kmap(page);
+ mpage_save = mpage;
+ for (i = 0; i < niov; i++) {
+ int cfur;
+
+ cfur = copy_from_user(mpage,
+ iov[i].iov_base, iov[i].iov_len);
+ if (cfur) {
+ ret = -EFAULT;
+ goto free_unmap;
+ }
+
+ mpage += iov[i].iov_len;
+ len += iov[i].iov_len;
+ }
+
+ dma_addr = dma_map_page(&dd->pcidev->dev, page, 0, len,
+ DMA_TO_DEVICE);
+ if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
+ ret = -ENOMEM;
+ goto free_unmap;
+ }
+
+ ipath_user_sdma_init_frag(pkt, 1, 0, len, 0, 1, page, mpage_save,
+ dma_addr);
+ pkt->naddr = 2;
+
+ goto done;
+
+free_unmap:
+ kunmap(page);
+ __free_page(page);
+done:
+ return ret;
+}
+
+/* how many pages in this iovec element? */
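+/*
+ * For example, with 4 KiB pages an 8192-byte iovec that starts 100 bytes
+ * into a page ends in the third page it touches, so this returns 3.
+ */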
+static int ipath_user_sdma_num_pages(const struct iovec *iov)
+{
+ const unsigned long addr = (unsigned long) iov->iov_base;
+ const unsigned long len = iov->iov_len;
+ const unsigned long spage = addr & PAGE_MASK;
+ const unsigned long epage = (addr + len - 1) & PAGE_MASK;
+
+ return 1 + ((epage - spage) >> PAGE_SHIFT);
+}
+
+/* truncate length to page boundary */
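+/*
+ * For example, with 4 KiB pages a 1024-byte request starting at page
+ * offset 3840 is truncated to the 256 bytes left in that page.
+ */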
+static int ipath_user_sdma_page_length(unsigned long addr, unsigned long len)
+{
+ const unsigned long offset = addr & ~PAGE_MASK;
+
+ return ((offset + len) > PAGE_SIZE) ? (PAGE_SIZE - offset) : len;
+}
+
+static void ipath_user_sdma_free_pkt_frag(struct device *dev,
+ struct ipath_user_sdma_queue *pq,
+ struct ipath_user_sdma_pkt *pkt,
+ int frag)
+{
+ const int i = frag;
+
+ if (pkt->addr[i].page) {
+ if (pkt->addr[i].dma_mapped)
+ dma_unmap_page(dev,
+ pkt->addr[i].addr,
+ pkt->addr[i].length,
+ DMA_TO_DEVICE);
+
+ if (pkt->addr[i].kvaddr)
+ kunmap(pkt->addr[i].page);
+
+ if (pkt->addr[i].put_page)
+ put_page(pkt->addr[i].page);
+ else
+ __free_page(pkt->addr[i].page);
+ } else if (pkt->addr[i].kvaddr)
+ /* free coherent mem from cache... */
+ dma_pool_free(pq->header_cache,
+ pkt->addr[i].kvaddr, pkt->addr[i].addr);
+}
+
+/* return number of pages pinned... */
+static int ipath_user_sdma_pin_pages(const struct ipath_devdata *dd,
+ struct ipath_user_sdma_pkt *pkt,
+ unsigned long addr, int tlen, int npages)
+{
+ struct page *pages[2];
+ int j;
+ int ret;
+
+ ret = get_user_pages_fast(addr, npages, 0, pages);
+ if (ret != npages) {
+ int i;
+
+ for (i = 0; i < ret; i++)
+ put_page(pages[i]);
+
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ for (j = 0; j < npages; j++) {
+ /* map the pages... */
+ const int flen =
+ ipath_user_sdma_page_length(addr, tlen);
+ dma_addr_t dma_addr =
+ dma_map_page(&dd->pcidev->dev,
+ pages[j], 0, flen, DMA_TO_DEVICE);
+ unsigned long fofs = addr & ~PAGE_MASK;
+
+ if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ ipath_user_sdma_init_frag(pkt, pkt->naddr, fofs, flen, 1, 1,
+ pages[j], kmap(pages[j]),
+ dma_addr);
+
+ pkt->naddr++;
+ addr += flen;
+ tlen -= flen;
+ }
+
+done:
+ return ret;
+}
+
+static int ipath_user_sdma_pin_pkt(const struct ipath_devdata *dd,
+ struct ipath_user_sdma_queue *pq,
+ struct ipath_user_sdma_pkt *pkt,
+ const struct iovec *iov,
+ unsigned long niov)
+{
+ int ret = 0;
+ unsigned long idx;
+
+ for (idx = 0; idx < niov; idx++) {
+ const int npages = ipath_user_sdma_num_pages(iov + idx);
+ const unsigned long addr = (unsigned long) iov[idx].iov_base;
+
+ ret = ipath_user_sdma_pin_pages(dd, pkt,
+ addr, iov[idx].iov_len,
+ npages);
+ if (ret < 0)
+ goto free_pkt;
+ }
+
+ goto done;
+
+free_pkt:
+ for (idx = 0; idx < pkt->naddr; idx++)
+ ipath_user_sdma_free_pkt_frag(&dd->pcidev->dev, pq, pkt, idx);
+
+done:
+ return ret;
+}
+
+static int ipath_user_sdma_init_payload(const struct ipath_devdata *dd,
+ struct ipath_user_sdma_queue *pq,
+ struct ipath_user_sdma_pkt *pkt,
+ const struct iovec *iov,
+ unsigned long niov, int npages)
+{
+ int ret = 0;
+
+ if (npages >= ARRAY_SIZE(pkt->addr))
+ ret = ipath_user_sdma_coalesce(dd, pkt, iov, niov);
+ else
+ ret = ipath_user_sdma_pin_pkt(dd, pq, pkt, iov, niov);
+
+ return ret;
+}
+
+/* free a packet list */
+static void ipath_user_sdma_free_pkt_list(struct device *dev,
+ struct ipath_user_sdma_queue *pq,
+ struct list_head *list)
+{
+ struct ipath_user_sdma_pkt *pkt, *pkt_next;
+
+ list_for_each_entry_safe(pkt, pkt_next, list, list) {
+ int i;
+
+ for (i = 0; i < pkt->naddr; i++)
+ ipath_user_sdma_free_pkt_frag(dev, pq, pkt, i);
+
+ kmem_cache_free(pq->pkt_slab, pkt);
+ }
+}
+
+/*
+ * copy headers, coalesce etc -- pq->lock must be held
+ *
+ * we queue all the packets to list, returning the number of iovec
+ * entries consumed.  list must be empty initially, since we clean it
+ * up if an error occurs...
+ */
+static int ipath_user_sdma_queue_pkts(const struct ipath_devdata *dd,
+ struct ipath_user_sdma_queue *pq,
+ struct list_head *list,
+ const struct iovec *iov,
+ unsigned long niov,
+ int maxpkts)
+{
+ unsigned long idx = 0;
+ int ret = 0;
+ int npkts = 0;
+ struct page *page = NULL;
+ __le32 *pbc;
+ dma_addr_t dma_addr;
+ struct ipath_user_sdma_pkt *pkt = NULL;
+ size_t len;
+ size_t nw;
+ u32 counter = pq->counter;
+ int dma_mapped = 0;
+
+ while (idx < niov && npkts < maxpkts) {
+ const unsigned long addr = (unsigned long) iov[idx].iov_base;
+ const unsigned long idx_save = idx;
+ unsigned pktnw;
+ unsigned pktnwc;
+ int nfrags = 0;
+ int npages = 0;
+ int cfur;
+
+ dma_mapped = 0;
+ len = iov[idx].iov_len;
+ nw = len >> 2;
+ page = NULL;
+
+ pkt = kmem_cache_alloc(pq->pkt_slab, GFP_KERNEL);
+ if (!pkt) {
+ ret = -ENOMEM;
+ goto free_list;
+ }
+
+ if (len < IPATH_USER_SDMA_MIN_HEADER_LENGTH ||
+ len > PAGE_SIZE || len & 3 || addr & 3) {
+ ret = -EINVAL;
+ goto free_pkt;
+ }
+
+ if (len == IPATH_USER_SDMA_EXP_HEADER_LENGTH)
+ pbc = dma_pool_alloc(pq->header_cache, GFP_KERNEL,
+ &dma_addr);
+ else
+ pbc = NULL;
+
+ if (!pbc) {
+ page = alloc_page(GFP_KERNEL);
+ if (!page) {
+ ret = -ENOMEM;
+ goto free_pkt;
+ }
+ pbc = kmap(page);
+ }
+
+ cfur = copy_from_user(pbc, iov[idx].iov_base, len);
+ if (cfur) {
+ ret = -EFAULT;
+ goto free_pbc;
+ }
+
+ /*
+		 * this assignment is a bit strange.  it's because
+		 * the pbc counts the number of 32 bit words in the full
+ * packet _except_ the first word of the pbc itself...
+ */
+ pktnwc = nw - 1;
+
+ /*
+ * pktnw computation yields the number of 32 bit words
+ * that the caller has indicated in the PBC. note that
+ * this is one less than the total number of words that
+ * goes to the send DMA engine as the first 32 bit word
+ * of the PBC itself is not counted. Armed with this count,
+ * we can verify that the packet is consistent with the
+ * iovec lengths.
+ */
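+		/*
+		 * For example, a 64-byte header iovec gives nw = 16 and
+		 * pktnwc = 15; if the low 11 bits of the PBC read 527, the
+		 * payload iovecs that follow must contribute exactly 512
+		 * more dwords (2048 bytes) for the packet to be accepted.
+		 */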
+ pktnw = le32_to_cpu(*pbc) & IPATH_PBC_LENGTH_MASK;
+ if (pktnw < pktnwc || pktnw > pktnwc + (PAGE_SIZE >> 2)) {
+ ret = -EINVAL;
+ goto free_pbc;
+ }
+
+
+ idx++;
+ while (pktnwc < pktnw && idx < niov) {
+ const size_t slen = iov[idx].iov_len;
+ const unsigned long faddr =
+ (unsigned long) iov[idx].iov_base;
+
+ if (slen & 3 || faddr & 3 || !slen ||
+ slen > PAGE_SIZE) {
+ ret = -EINVAL;
+ goto free_pbc;
+ }
+
+ npages++;
+ if ((faddr & PAGE_MASK) !=
+ ((faddr + slen - 1) & PAGE_MASK))
+ npages++;
+
+ pktnwc += slen >> 2;
+ idx++;
+ nfrags++;
+ }
+
+ if (pktnwc != pktnw) {
+ ret = -EINVAL;
+ goto free_pbc;
+ }
+
+ if (page) {
+ dma_addr = dma_map_page(&dd->pcidev->dev,
+ page, 0, len, DMA_TO_DEVICE);
+ if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
+ ret = -ENOMEM;
+ goto free_pbc;
+ }
+
+ dma_mapped = 1;
+ }
+
+ ipath_user_sdma_init_header(pkt, counter, 0, len, dma_mapped,
+ page, pbc, dma_addr);
+
+ if (nfrags) {
+ ret = ipath_user_sdma_init_payload(dd, pq, pkt,
+ iov + idx_save + 1,
+ nfrags, npages);
+ if (ret < 0)
+ goto free_pbc_dma;
+ }
+
+ counter++;
+ npkts++;
+
+ list_add_tail(&pkt->list, list);
+ }
+
+ ret = idx;
+ goto done;
+
+free_pbc_dma:
+ if (dma_mapped)
+ dma_unmap_page(&dd->pcidev->dev, dma_addr, len, DMA_TO_DEVICE);
+free_pbc:
+ if (page) {
+ kunmap(page);
+ __free_page(page);
+ } else
+ dma_pool_free(pq->header_cache, pbc, dma_addr);
+free_pkt:
+ kmem_cache_free(pq->pkt_slab, pkt);
+free_list:
+ ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, list);
+done:
+ return ret;
+}
+
+static void ipath_user_sdma_set_complete_counter(struct ipath_user_sdma_queue *pq,
+ u32 c)
+{
+ pq->sent_counter = c;
+}
+
+/* try to clean out queue -- needs pq->lock */
+static int ipath_user_sdma_queue_clean(const struct ipath_devdata *dd,
+ struct ipath_user_sdma_queue *pq)
+{
+ struct list_head free_list;
+ struct ipath_user_sdma_pkt *pkt;
+ struct ipath_user_sdma_pkt *pkt_prev;
+ int ret = 0;
+
+ INIT_LIST_HEAD(&free_list);
+
+ list_for_each_entry_safe(pkt, pkt_prev, &pq->sent, list) {
+ s64 descd = dd->ipath_sdma_descq_removed - pkt->added;
+
+ if (descd < 0)
+ break;
+
+ list_move_tail(&pkt->list, &free_list);
+
+ /* one more packet cleaned */
+ ret++;
+ }
+
+ if (!list_empty(&free_list)) {
+ u32 counter;
+
+ pkt = list_entry(free_list.prev,
+ struct ipath_user_sdma_pkt, list);
+ counter = pkt->counter;
+
+ ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list);
+ ipath_user_sdma_set_complete_counter(pq, counter);
+ }
+
+ return ret;
+}
+
+void ipath_user_sdma_queue_destroy(struct ipath_user_sdma_queue *pq)
+{
+ if (!pq)
+ return;
+
+ kmem_cache_destroy(pq->pkt_slab);
+ dma_pool_destroy(pq->header_cache);
+ kfree(pq);
+}
+
+/* clean descriptor queue, returns > 0 if some elements cleaned */
+static int ipath_user_sdma_hwqueue_clean(struct ipath_devdata *dd)
+{
+ int ret;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+ ret = ipath_sdma_make_progress(dd);
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+ return ret;
+}
+
+/* we're in close, drain packets so that we can clean up successfully... */
+void ipath_user_sdma_queue_drain(struct ipath_devdata *dd,
+ struct ipath_user_sdma_queue *pq)
+{
+ int i;
+
+ if (!pq)
+ return;
+
+ for (i = 0; i < 100; i++) {
+ mutex_lock(&pq->lock);
+ if (list_empty(&pq->sent)) {
+ mutex_unlock(&pq->lock);
+ break;
+ }
+ ipath_user_sdma_hwqueue_clean(dd);
+ ipath_user_sdma_queue_clean(dd, pq);
+ mutex_unlock(&pq->lock);
+ msleep(10);
+ }
+
+ if (!list_empty(&pq->sent)) {
+ struct list_head free_list;
+
+ printk(KERN_INFO "drain: lists not empty: forcing!\n");
+ INIT_LIST_HEAD(&free_list);
+ mutex_lock(&pq->lock);
+ list_splice_init(&pq->sent, &free_list);
+ ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list);
+ mutex_unlock(&pq->lock);
+ }
+}
+
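+/*
+ * Descriptor qword 0 layout: physical address bits 31:0 in bits 63:32
+ * (dword aligned), generation in 31:30, dword count in 26:16 and buffer
+ * offset in 10:0.  For example, addr 0x12345678, generation 1, dwlen 16
+ * and dwoffset 0 give 0x1234567840100000.  The first/last/head bits
+ * (12, 11, 13) and the large-buffer bit (14) are ORed in separately.
+ */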
+static inline __le64 ipath_sdma_make_desc0(struct ipath_devdata *dd,
+ u64 addr, u64 dwlen, u64 dwoffset)
+{
+ return cpu_to_le64(/* SDmaPhyAddr[31:0] */
+ ((addr & 0xfffffffcULL) << 32) |
+ /* SDmaGeneration[1:0] */
+ ((dd->ipath_sdma_generation & 3ULL) << 30) |
+ /* SDmaDwordCount[10:0] */
+ ((dwlen & 0x7ffULL) << 16) |
+ /* SDmaBufOffset[12:2] */
+ (dwoffset & 0x7ffULL));
+}
+
+static inline __le64 ipath_sdma_make_first_desc0(__le64 descq)
+{
+ return descq | cpu_to_le64(1ULL << 12);
+}
+
+static inline __le64 ipath_sdma_make_last_desc0(__le64 descq)
+{
+ /* last */ /* dma head */
+ return descq | cpu_to_le64(1ULL << 11 | 1ULL << 13);
+}
+
+static inline __le64 ipath_sdma_make_desc1(u64 addr)
+{
+ /* SDmaPhyAddr[47:32] */
+ return cpu_to_le64(addr >> 32);
+}
+
+static void ipath_user_sdma_send_frag(struct ipath_devdata *dd,
+ struct ipath_user_sdma_pkt *pkt, int idx,
+ unsigned ofs, u16 tail)
+{
+ const u64 addr = (u64) pkt->addr[idx].addr +
+ (u64) pkt->addr[idx].offset;
+ const u64 dwlen = (u64) pkt->addr[idx].length / 4;
+ __le64 *descqp;
+ __le64 descq0;
+
+ descqp = &dd->ipath_sdma_descq[tail].qw[0];
+
+ descq0 = ipath_sdma_make_desc0(dd, addr, dwlen, ofs);
+ if (idx == 0)
+ descq0 = ipath_sdma_make_first_desc0(descq0);
+ if (idx == pkt->naddr - 1)
+ descq0 = ipath_sdma_make_last_desc0(descq0);
+
+ descqp[0] = descq0;
+ descqp[1] = ipath_sdma_make_desc1(addr);
+}
+
+/* pq->lock must be held, get packets on the wire... */
+static int ipath_user_sdma_push_pkts(struct ipath_devdata *dd,
+ struct ipath_user_sdma_queue *pq,
+ struct list_head *pktlist)
+{
+ int ret = 0;
+ unsigned long flags;
+ u16 tail;
+
+ if (list_empty(pktlist))
+ return 0;
+
+ if (unlikely(!(dd->ipath_flags & IPATH_LINKACTIVE)))
+ return -ECOMM;
+
+ spin_lock_irqsave(&dd->ipath_sdma_lock, flags);
+
+ if (unlikely(dd->ipath_sdma_status & IPATH_SDMA_ABORT_MASK)) {
+ ret = -ECOMM;
+ goto unlock;
+ }
+
+ tail = dd->ipath_sdma_descq_tail;
+ while (!list_empty(pktlist)) {
+ struct ipath_user_sdma_pkt *pkt =
+ list_entry(pktlist->next, struct ipath_user_sdma_pkt,
+ list);
+ int i;
+ unsigned ofs = 0;
+ u16 dtail = tail;
+
+ if (pkt->naddr > ipath_sdma_descq_freecnt(dd))
+ goto unlock_check_tail;
+
+ for (i = 0; i < pkt->naddr; i++) {
+ ipath_user_sdma_send_frag(dd, pkt, i, ofs, tail);
+ ofs += pkt->addr[i].length >> 2;
+
+ if (++tail == dd->ipath_sdma_descq_cnt) {
+ tail = 0;
+ ++dd->ipath_sdma_generation;
+ }
+ }
+
+		if ((ofs << 2) > dd->ipath_ibmaxlen) {
+			ipath_dbg("packet size %X > ibmax %X, fail\n",
+				ofs << 2, dd->ipath_ibmaxlen);
+ ret = -EMSGSIZE;
+ goto unlock;
+ }
+
+ /*
+ * if the packet is >= 2KB mtu equivalent, we have to use
+ * the large buffers, and have to mark each descriptor as
+ * part of a large buffer packet.
+ */
+ if (ofs >= IPATH_SMALLBUF_DWORDS) {
+ for (i = 0; i < pkt->naddr; i++) {
+ dd->ipath_sdma_descq[dtail].qw[0] |=
+ cpu_to_le64(1ULL << 14);
+ if (++dtail == dd->ipath_sdma_descq_cnt)
+ dtail = 0;
+ }
+ }
+
+ dd->ipath_sdma_descq_added += pkt->naddr;
+ pkt->added = dd->ipath_sdma_descq_added;
+ list_move_tail(&pkt->list, &pq->sent);
+ ret++;
+ }
+
+unlock_check_tail:
+ /* advance the tail on the chip if necessary */
+ if (dd->ipath_sdma_descq_tail != tail) {
+ wmb();
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_senddmatail, tail);
+ dd->ipath_sdma_descq_tail = tail;
+ }
+
+unlock:
+ spin_unlock_irqrestore(&dd->ipath_sdma_lock, flags);
+
+ return ret;
+}
+
+int ipath_user_sdma_writev(struct ipath_devdata *dd,
+ struct ipath_user_sdma_queue *pq,
+ const struct iovec *iov,
+ unsigned long dim)
+{
+ int ret = 0;
+ struct list_head list;
+ int npkts = 0;
+
+ INIT_LIST_HEAD(&list);
+
+ mutex_lock(&pq->lock);
+
+ if (dd->ipath_sdma_descq_added != dd->ipath_sdma_descq_removed) {
+ ipath_user_sdma_hwqueue_clean(dd);
+ ipath_user_sdma_queue_clean(dd, pq);
+ }
+
+ while (dim) {
+ const int mxp = 8;
+
+ ret = ipath_user_sdma_queue_pkts(dd, pq, &list, iov, dim, mxp);
+ if (ret <= 0)
+ goto done_unlock;
+ else {
+ dim -= ret;
+ iov += ret;
+ }
+
+ /* force packets onto the sdma hw queue... */
+ if (!list_empty(&list)) {
+ /*
+ * lazily clean hw queue. the 4 is a guess of about
+ * how many sdma descriptors a packet will take (it
+ * doesn't have to be perfect).
+ */
+ if (ipath_sdma_descq_freecnt(dd) < ret * 4) {
+ ipath_user_sdma_hwqueue_clean(dd);
+ ipath_user_sdma_queue_clean(dd, pq);
+ }
+
+ ret = ipath_user_sdma_push_pkts(dd, pq, &list);
+ if (ret < 0)
+ goto done_unlock;
+ else {
+ npkts += ret;
+ pq->counter += ret;
+
+ if (!list_empty(&list))
+ goto done_unlock;
+ }
+ }
+ }
+
+done_unlock:
+ if (!list_empty(&list))
+ ipath_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &list);
+ mutex_unlock(&pq->lock);
+
+ return (ret < 0) ? ret : npkts;
+}
+
+int ipath_user_sdma_make_progress(struct ipath_devdata *dd,
+ struct ipath_user_sdma_queue *pq)
+{
+ int ret = 0;
+
+ mutex_lock(&pq->lock);
+ ipath_user_sdma_hwqueue_clean(dd);
+ ret = ipath_user_sdma_queue_clean(dd, pq);
+ mutex_unlock(&pq->lock);
+
+ return ret;
+}
+
+u32 ipath_user_sdma_complete_counter(const struct ipath_user_sdma_queue *pq)
+{
+ return pq->sent_counter;
+}
+
+u32 ipath_user_sdma_inflight_counter(struct ipath_user_sdma_queue *pq)
+{
+ return pq->counter;
+}
+
diff --git a/drivers/infiniband/hw/ipath/ipath_user_sdma.h b/drivers/infiniband/hw/ipath/ipath_user_sdma.h
new file mode 100644
index 000000000..fc76316c4
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_user_sdma.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/device.h>
+
+struct ipath_user_sdma_queue;
+
+struct ipath_user_sdma_queue *
+ipath_user_sdma_queue_create(struct device *dev, int unit, int port, int sport);
+void ipath_user_sdma_queue_destroy(struct ipath_user_sdma_queue *pq);
+
+int ipath_user_sdma_writev(struct ipath_devdata *dd,
+ struct ipath_user_sdma_queue *pq,
+ const struct iovec *iov,
+ unsigned long dim);
+
+int ipath_user_sdma_make_progress(struct ipath_devdata *dd,
+ struct ipath_user_sdma_queue *pq);
+
+void ipath_user_sdma_queue_drain(struct ipath_devdata *dd,
+ struct ipath_user_sdma_queue *pq);
+
+u32 ipath_user_sdma_complete_counter(const struct ipath_user_sdma_queue *pq);
+u32 ipath_user_sdma_inflight_counter(struct ipath_user_sdma_queue *pq);
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c
new file mode 100644
index 000000000..44ea93904
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_verbs.c
@@ -0,0 +1,2342 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_user_verbs.h>
+#include <linux/io.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/utsname.h>
+#include <linux/rculist.h>
+
+#include "ipath_kernel.h"
+#include "ipath_verbs.h"
+#include "ipath_common.h"
+
+static unsigned int ib_ipath_qp_table_size = 251;
+module_param_named(qp_table_size, ib_ipath_qp_table_size, uint, S_IRUGO);
+MODULE_PARM_DESC(qp_table_size, "QP table size");
+
+unsigned int ib_ipath_lkey_table_size = 12;
+module_param_named(lkey_table_size, ib_ipath_lkey_table_size, uint,
+ S_IRUGO);
+MODULE_PARM_DESC(lkey_table_size,
+ "LKEY table size in bits (2^n, 1 <= n <= 23)");
+
+static unsigned int ib_ipath_max_pds = 0xFFFF;
+module_param_named(max_pds, ib_ipath_max_pds, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_pds,
+ "Maximum number of protection domains to support");
+
+static unsigned int ib_ipath_max_ahs = 0xFFFF;
+module_param_named(max_ahs, ib_ipath_max_ahs, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support");
+
+unsigned int ib_ipath_max_cqes = 0x2FFFF;
+module_param_named(max_cqes, ib_ipath_max_cqes, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_cqes,
+ "Maximum number of completion queue entries to support");
+
+unsigned int ib_ipath_max_cqs = 0x1FFFF;
+module_param_named(max_cqs, ib_ipath_max_cqs, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support");
+
+unsigned int ib_ipath_max_qp_wrs = 0x3FFF;
+module_param_named(max_qp_wrs, ib_ipath_max_qp_wrs, uint,
+ S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support");
+
+unsigned int ib_ipath_max_qps = 16384;
+module_param_named(max_qps, ib_ipath_max_qps, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support");
+
+unsigned int ib_ipath_max_sges = 0x60;
+module_param_named(max_sges, ib_ipath_max_sges, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support");
+
+unsigned int ib_ipath_max_mcast_grps = 16384;
+module_param_named(max_mcast_grps, ib_ipath_max_mcast_grps, uint,
+ S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_mcast_grps,
+ "Maximum number of multicast groups to support");
+
+unsigned int ib_ipath_max_mcast_qp_attached = 16;
+module_param_named(max_mcast_qp_attached, ib_ipath_max_mcast_qp_attached,
+ uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_mcast_qp_attached,
+ "Maximum number of attached QPs to support");
+
+unsigned int ib_ipath_max_srqs = 1024;
+module_param_named(max_srqs, ib_ipath_max_srqs, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support");
+
+unsigned int ib_ipath_max_srq_sges = 128;
+module_param_named(max_srq_sges, ib_ipath_max_srq_sges,
+ uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support");
+
+unsigned int ib_ipath_max_srq_wrs = 0x1FFFF;
+module_param_named(max_srq_wrs, ib_ipath_max_srq_wrs,
+ uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs to support");
+
+static unsigned int ib_ipath_disable_sma;
+module_param_named(disable_sma, ib_ipath_disable_sma, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(disable_sma, "Disable the SMA");
+
+/*
+ * Note that it is OK to post send work requests in the SQE and ERR
+ * states; ipath_do_send() will process them and generate error
+ * completions as per IB 1.2 C10-96.
+ */
+const int ib_ipath_state_ops[IB_QPS_ERR + 1] = {
+ [IB_QPS_RESET] = 0,
+ [IB_QPS_INIT] = IPATH_POST_RECV_OK,
+ [IB_QPS_RTR] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK,
+ [IB_QPS_RTS] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
+ IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK |
+ IPATH_PROCESS_NEXT_SEND_OK,
+ [IB_QPS_SQD] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
+ IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK,
+ [IB_QPS_SQE] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
+ IPATH_POST_SEND_OK | IPATH_FLUSH_SEND,
+ [IB_QPS_ERR] = IPATH_POST_RECV_OK | IPATH_FLUSH_RECV |
+ IPATH_POST_SEND_OK | IPATH_FLUSH_SEND,
+};
+
+struct ipath_ucontext {
+ struct ib_ucontext ibucontext;
+};
+
+static inline struct ipath_ucontext *to_iucontext(struct ib_ucontext
+ *ibucontext)
+{
+ return container_of(ibucontext, struct ipath_ucontext, ibucontext);
+}
+
+/*
+ * Translate ib_wr_opcode into ib_wc_opcode.
+ */
+const enum ib_wc_opcode ib_ipath_wc_opcode[] = {
+ [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
+ [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
+ [IB_WR_SEND] = IB_WC_SEND,
+ [IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
+ [IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
+ [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
+ [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
+};
+
+/*
+ * System image GUID.
+ */
+static __be64 sys_image_guid;
+
+/**
+ * ipath_copy_sge - copy data to SGE memory
+ * @ss: the SGE state
+ * @data: the data to copy
+ * @length: the length of the data
+ */
+void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length)
+{
+ struct ipath_sge *sge = &ss->sge;
+
+ while (length) {
+ u32 len = sge->length;
+
+ if (len > length)
+ len = length;
+ if (len > sge->sge_length)
+ len = sge->sge_length;
+ BUG_ON(len == 0);
+ memcpy(sge->vaddr, data, len);
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (--ss->num_sge)
+ *sge = *ss->sg_list++;
+ } else if (sge->length == 0 && sge->mr != NULL) {
+ if (++sge->n >= IPATH_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr =
+ sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length =
+ sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ data += len;
+ length -= len;
+ }
+}
+
+/**
+ * ipath_skip_sge - skip over SGE memory - XXX almost dup of prev func
+ * @ss: the SGE state
+ * @length: the number of bytes to skip
+ */
+void ipath_skip_sge(struct ipath_sge_state *ss, u32 length)
+{
+ struct ipath_sge *sge = &ss->sge;
+
+ while (length) {
+ u32 len = sge->length;
+
+ if (len > length)
+ len = length;
+ if (len > sge->sge_length)
+ len = sge->sge_length;
+ BUG_ON(len == 0);
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (--ss->num_sge)
+ *sge = *ss->sg_list++;
+ } else if (sge->length == 0 && sge->mr != NULL) {
+ if (++sge->n >= IPATH_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr =
+ sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length =
+ sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ length -= len;
+ }
+}
+
+/*
+ * Count the number of DMA descriptors needed to send length bytes of data.
+ * Don't modify the ipath_sge_state to get the count.
+ * Return zero if any of the segments is not aligned.
+ */
+static u32 ipath_count_sge(struct ipath_sge_state *ss, u32 length)
+{
+ struct ipath_sge *sg_list = ss->sg_list;
+ struct ipath_sge sge = ss->sge;
+ u8 num_sge = ss->num_sge;
+ u32 ndesc = 1; /* count the header */
+
+ while (length) {
+ u32 len = sge.length;
+
+ if (len > length)
+ len = length;
+ if (len > sge.sge_length)
+ len = sge.sge_length;
+ BUG_ON(len == 0);
+ if (((long) sge.vaddr & (sizeof(u32) - 1)) ||
+ (len != length && (len & (sizeof(u32) - 1)))) {
+ ndesc = 0;
+ break;
+ }
+ ndesc++;
+ sge.vaddr += len;
+ sge.length -= len;
+ sge.sge_length -= len;
+ if (sge.sge_length == 0) {
+ if (--num_sge)
+ sge = *sg_list++;
+ } else if (sge.length == 0 && sge.mr != NULL) {
+ if (++sge.n >= IPATH_SEGSZ) {
+ if (++sge.m >= sge.mr->mapsz)
+ break;
+ sge.n = 0;
+ }
+ sge.vaddr =
+ sge.mr->map[sge.m]->segs[sge.n].vaddr;
+ sge.length =
+ sge.mr->map[sge.m]->segs[sge.n].length;
+ }
+ length -= len;
+ }
+ return ndesc;
+}
+
+/*
+ * Copy from the SGEs to the data buffer.
+ */
+static void ipath_copy_from_sge(void *data, struct ipath_sge_state *ss,
+ u32 length)
+{
+ struct ipath_sge *sge = &ss->sge;
+
+ while (length) {
+ u32 len = sge->length;
+
+ if (len > length)
+ len = length;
+ if (len > sge->sge_length)
+ len = sge->sge_length;
+ BUG_ON(len == 0);
+ memcpy(data, sge->vaddr, len);
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (--ss->num_sge)
+ *sge = *ss->sg_list++;
+ } else if (sge->length == 0 && sge->mr != NULL) {
+ if (++sge->n >= IPATH_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr =
+ sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length =
+ sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ data += len;
+ length -= len;
+ }
+}
+
+/**
+ * ipath_post_one_send - post one RC, UC, or UD send work request
+ * @qp: the QP to post on
+ * @wr: the work request to send
+ */
+static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr)
+{
+ struct ipath_swqe *wqe;
+ u32 next;
+ int i;
+ int j;
+ int acc;
+ int ret;
+ unsigned long flags;
+ struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ if (qp->ibqp.qp_type != IB_QPT_SMI &&
+ !(dd->ipath_flags & IPATH_LINKACTIVE)) {
+ ret = -ENETDOWN;
+ goto bail;
+ }
+
+ /* Check that state is OK to post send. */
+ if (unlikely(!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK)))
+ goto bail_inval;
+
+ /* IB spec says that num_sge == 0 is OK. */
+ if (wr->num_sge > qp->s_max_sge)
+ goto bail_inval;
+
+ /*
+ * Don't allow RDMA reads or atomic operations on UC or
+ * undefined operations.
+ * Make sure buffer is large enough to hold the result for atomics.
+ */
+ if (qp->ibqp.qp_type == IB_QPT_UC) {
+ if ((unsigned) wr->opcode >= IB_WR_RDMA_READ)
+ goto bail_inval;
+ } else if (qp->ibqp.qp_type == IB_QPT_UD) {
+ /* Check UD opcode */
+ if (wr->opcode != IB_WR_SEND &&
+ wr->opcode != IB_WR_SEND_WITH_IMM)
+ goto bail_inval;
+ /* Check UD destination address PD */
+ if (qp->ibqp.pd != wr->wr.ud.ah->pd)
+ goto bail_inval;
+ } else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD)
+ goto bail_inval;
+ else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP &&
+ (wr->num_sge == 0 ||
+ wr->sg_list[0].length < sizeof(u64) ||
+ wr->sg_list[0].addr & (sizeof(u64) - 1)))
+ goto bail_inval;
+ else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic)
+ goto bail_inval;
+
+ next = qp->s_head + 1;
+ if (next >= qp->s_size)
+ next = 0;
+ if (next == qp->s_last) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ wqe = get_swqe_ptr(qp, qp->s_head);
+ wqe->wr = *wr;
+ wqe->length = 0;
+ if (wr->num_sge) {
+ acc = wr->opcode >= IB_WR_RDMA_READ ?
+ IB_ACCESS_LOCAL_WRITE : 0;
+ for (i = 0, j = 0; i < wr->num_sge; i++) {
+ u32 length = wr->sg_list[i].length;
+ int ok;
+
+ if (length == 0)
+ continue;
+ ok = ipath_lkey_ok(qp, &wqe->sg_list[j],
+ &wr->sg_list[i], acc);
+ if (!ok)
+ goto bail_inval;
+ wqe->length += length;
+ j++;
+ }
+ wqe->wr.num_sge = j;
+ }
+ if (qp->ibqp.qp_type == IB_QPT_UC ||
+ qp->ibqp.qp_type == IB_QPT_RC) {
+ if (wqe->length > 0x80000000U)
+ goto bail_inval;
+ } else if (wqe->length > to_idev(qp->ibqp.device)->dd->ipath_ibmtu)
+ goto bail_inval;
+ wqe->ssn = qp->s_ssn++;
+ qp->s_head = next;
+
+ ret = 0;
+ goto bail;
+
+bail_inval:
+ ret = -EINVAL;
+bail:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return ret;
+}
+
+/**
+ * ipath_post_send - post a send on a QP
+ * @ibqp: the QP to post the send on
+ * @wr: the list of work requests to post
+ * @bad_wr: the first bad WR is put here
+ *
+ * This may be called from interrupt context.
+ */
+static int ipath_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr)
+{
+ struct ipath_qp *qp = to_iqp(ibqp);
+ int err = 0;
+
+ for (; wr; wr = wr->next) {
+ err = ipath_post_one_send(qp, wr);
+ if (err) {
+ *bad_wr = wr;
+ goto bail;
+ }
+ }
+
+ /* Try to do the send work in the caller's context. */
+ ipath_do_send((unsigned long) qp);
+
+bail:
+ return err;
+}
+
+/**
+ * ipath_post_receive - post a receive on a QP
+ * @ibqp: the QP to post the receive on
+ * @wr: the WR to post
+ * @bad_wr: the first bad WR is put here
+ *
+ * This may be called from interrupt context.
+ */
+static int ipath_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct ipath_qp *qp = to_iqp(ibqp);
+ struct ipath_rwq *wq = qp->r_rq.wq;
+ unsigned long flags;
+ int ret;
+
+ /* Check that state is OK to post receive. */
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_RECV_OK) || !wq) {
+ *bad_wr = wr;
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ for (; wr; wr = wr->next) {
+ struct ipath_rwqe *wqe;
+ u32 next;
+ int i;
+
+ if ((unsigned) wr->num_sge > qp->r_rq.max_sge) {
+ *bad_wr = wr;
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ spin_lock_irqsave(&qp->r_rq.lock, flags);
+ next = wq->head + 1;
+ if (next >= qp->r_rq.size)
+ next = 0;
+ if (next == wq->tail) {
+ spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+ *bad_wr = wr;
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ wqe = get_rwqe_ptr(&qp->r_rq, wq->head);
+ wqe->wr_id = wr->wr_id;
+ wqe->num_sge = wr->num_sge;
+ for (i = 0; i < wr->num_sge; i++)
+ wqe->sg_list[i] = wr->sg_list[i];
+ /* Make sure queue entry is written before the head index. */
+ smp_wmb();
+ wq->head = next;
+ spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+ }
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_qp_rcv - process an incoming packet on a QP
+ * @dev: the device the packet came on
+ * @hdr: the packet header
+ * @has_grh: true if the packet has a GRH
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP the packet came on
+ *
+ * This is called from ipath_ib_rcv() to process an incoming packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+static void ipath_qp_rcv(struct ipath_ibdev *dev,
+ struct ipath_ib_header *hdr, int has_grh,
+ void *data, u32 tlen, struct ipath_qp *qp)
+{
+ /* Check for valid receive state. */
+ if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
+ dev->n_pkt_drops++;
+ return;
+ }
+
+ switch (qp->ibqp.qp_type) {
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ if (ib_ipath_disable_sma)
+ break;
+ /* FALLTHROUGH */
+ case IB_QPT_UD:
+ ipath_ud_rcv(dev, hdr, has_grh, data, tlen, qp);
+ break;
+
+ case IB_QPT_RC:
+ ipath_rc_rcv(dev, hdr, has_grh, data, tlen, qp);
+ break;
+
+ case IB_QPT_UC:
+ ipath_uc_rcv(dev, hdr, has_grh, data, tlen, qp);
+ break;
+
+ default:
+ break;
+ }
+}
+
+/**
+ * ipath_ib_rcv - process an incoming packet
+ * @dev: the device pointer
+ * @rhdr: the header of the packet
+ * @data: the packet data
+ * @tlen: the packet length
+ *
+ * This is called from ipath_kreceive() to process an incoming packet at
+ * interrupt level. Tlen is the length of the header + data + CRC in bytes.
+ */
+void ipath_ib_rcv(struct ipath_ibdev *dev, void *rhdr, void *data,
+ u32 tlen)
+{
+ struct ipath_ib_header *hdr = rhdr;
+ struct ipath_other_headers *ohdr;
+ struct ipath_qp *qp;
+ u32 qp_num;
+ int lnh;
+ u8 opcode;
+ u16 lid;
+
+ if (unlikely(dev == NULL))
+ goto bail;
+
+ if (unlikely(tlen < 24)) { /* LRH+BTH+CRC */
+ dev->rcv_errors++;
+ goto bail;
+ }
+
+ /* Check for a valid destination LID (see ch. 7.11.1). */
+ lid = be16_to_cpu(hdr->lrh[1]);
+ if (lid < IPATH_MULTICAST_LID_BASE) {
+ lid &= ~((1 << dev->dd->ipath_lmc) - 1);
+ if (unlikely(lid != dev->dd->ipath_lid)) {
+ dev->rcv_errors++;
+ goto bail;
+ }
+ }
+
+ /* Check for GRH */
+ lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+ if (lnh == IPATH_LRH_BTH)
+ ohdr = &hdr->u.oth;
+ else if (lnh == IPATH_LRH_GRH)
+ ohdr = &hdr->u.l.oth;
+ else {
+ dev->rcv_errors++;
+ goto bail;
+ }
+
+ opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0x7f;
+ dev->opstats[opcode].n_bytes += tlen;
+ dev->opstats[opcode].n_packets++;
+
+ /* Get the destination QP number. */
+ qp_num = be32_to_cpu(ohdr->bth[1]) & IPATH_QPN_MASK;
+ if (qp_num == IPATH_MULTICAST_QPN) {
+ struct ipath_mcast *mcast;
+ struct ipath_mcast_qp *p;
+
+ if (lnh != IPATH_LRH_GRH) {
+ dev->n_pkt_drops++;
+ goto bail;
+ }
+ mcast = ipath_mcast_find(&hdr->u.l.grh.dgid);
+ if (mcast == NULL) {
+ dev->n_pkt_drops++;
+ goto bail;
+ }
+ dev->n_multicast_rcv++;
+ list_for_each_entry_rcu(p, &mcast->qp_list, list)
+ ipath_qp_rcv(dev, hdr, 1, data, tlen, p->qp);
+ /*
+ * Notify ipath_multicast_detach() if it is waiting for us
+ * to finish.
+ */
+ if (atomic_dec_return(&mcast->refcount) <= 1)
+ wake_up(&mcast->wait);
+ } else {
+ qp = ipath_lookup_qpn(&dev->qp_table, qp_num);
+ if (qp) {
+ dev->n_unicast_rcv++;
+ ipath_qp_rcv(dev, hdr, lnh == IPATH_LRH_GRH, data,
+ tlen, qp);
+ /*
+ * Notify ipath_destroy_qp() if it is waiting
+ * for us to finish.
+ */
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ } else
+ dev->n_pkt_drops++;
+ }
+
+bail:;
+}
+
+/**
+ * ipath_ib_timer - verbs timer
+ * @dev: the device pointer
+ *
+ * This is called from ipath_do_rcv_timer() at interrupt level to check for
+ * QPs which need retransmits and to collect performance numbers.
+ */
+static void ipath_ib_timer(struct ipath_ibdev *dev)
+{
+ struct ipath_qp *resend = NULL;
+ struct ipath_qp *rnr = NULL;
+ struct list_head *last;
+ struct ipath_qp *qp;
+ unsigned long flags;
+
+ if (dev == NULL)
+ return;
+
+ spin_lock_irqsave(&dev->pending_lock, flags);
+ /* Start filling the next pending queue. */
+ if (++dev->pending_index >= ARRAY_SIZE(dev->pending))
+ dev->pending_index = 0;
+ /* Save any requests still in the new queue, they have timed out. */
+ last = &dev->pending[dev->pending_index];
+ while (!list_empty(last)) {
+ qp = list_entry(last->next, struct ipath_qp, timerwait);
+ list_del_init(&qp->timerwait);
+ qp->timer_next = resend;
+ resend = qp;
+ atomic_inc(&qp->refcount);
+ }
+ last = &dev->rnrwait;
+ if (!list_empty(last)) {
+ qp = list_entry(last->next, struct ipath_qp, timerwait);
+ if (--qp->s_rnr_timeout == 0) {
+ do {
+ list_del_init(&qp->timerwait);
+ qp->timer_next = rnr;
+ rnr = qp;
+ atomic_inc(&qp->refcount);
+ if (list_empty(last))
+ break;
+ qp = list_entry(last->next, struct ipath_qp,
+ timerwait);
+ } while (qp->s_rnr_timeout == 0);
+ }
+ }
+ /*
+ * We should only be in the started state if pma_sample_start != 0
+ */
+ if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_STARTED &&
+ --dev->pma_sample_start == 0) {
+ dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_RUNNING;
+ ipath_snapshot_counters(dev->dd, &dev->ipath_sword,
+ &dev->ipath_rword,
+ &dev->ipath_spkts,
+ &dev->ipath_rpkts,
+ &dev->ipath_xmit_wait);
+ }
+ if (dev->pma_sample_status == IB_PMA_SAMPLE_STATUS_RUNNING) {
+ if (dev->pma_sample_interval == 0) {
+ u64 ta, tb, tc, td, te;
+
+ dev->pma_sample_status = IB_PMA_SAMPLE_STATUS_DONE;
+ ipath_snapshot_counters(dev->dd, &ta, &tb,
+ &tc, &td, &te);
+
+ dev->ipath_sword = ta - dev->ipath_sword;
+ dev->ipath_rword = tb - dev->ipath_rword;
+ dev->ipath_spkts = tc - dev->ipath_spkts;
+ dev->ipath_rpkts = td - dev->ipath_rpkts;
+ dev->ipath_xmit_wait = te - dev->ipath_xmit_wait;
+		} else
+			dev->pma_sample_interval--;
+ }
+ spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+ /* XXX What if timer fires again while this is running? */
+ while (resend != NULL) {
+ qp = resend;
+ resend = qp->timer_next;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (qp->s_last != qp->s_tail &&
+ ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) {
+ dev->n_timeouts++;
+ ipath_restart_rc(qp, qp->s_last_psn + 1);
+ }
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+ /* Notify ipath_destroy_qp() if it is waiting. */
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ }
+ while (rnr != NULL) {
+ qp = rnr;
+ rnr = qp->timer_next;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)
+ ipath_schedule_send(qp);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+ /* Notify ipath_destroy_qp() if it is waiting. */
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ }
+}
+
+static void update_sge(struct ipath_sge_state *ss, u32 length)
+{
+ struct ipath_sge *sge = &ss->sge;
+
+ sge->vaddr += length;
+ sge->length -= length;
+ sge->sge_length -= length;
+ if (sge->sge_length == 0) {
+ if (--ss->num_sge)
+ *sge = *ss->sg_list++;
+ } else if (sge->length == 0 && sge->mr != NULL) {
+ if (++sge->n >= IPATH_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ return;
+ sge->n = 0;
+ }
+ sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+}
+
+#ifdef __LITTLE_ENDIAN
+static inline u32 get_upper_bits(u32 data, u32 shift)
+{
+ return data >> shift;
+}
+
+static inline u32 set_upper_bits(u32 data, u32 shift)
+{
+ return data << shift;
+}
+
+static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
+{
+ data <<= ((sizeof(u32) - n) * BITS_PER_BYTE);
+ data >>= ((sizeof(u32) - n - off) * BITS_PER_BYTE);
+ return data;
+}
+#else
+static inline u32 get_upper_bits(u32 data, u32 shift)
+{
+ return data << shift;
+}
+
+static inline u32 set_upper_bits(u32 data, u32 shift)
+{
+ return data >> shift;
+}
+
+static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
+{
+ data >>= ((sizeof(u32) - n) * BITS_PER_BYTE);
+ data <<= ((sizeof(u32) - n - off) * BITS_PER_BYTE);
+ return data;
+}
+#endif
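+
+/*
+ * For example, on a little-endian CPU clear_upper_bytes(0xAABBCCDD, 2, 1)
+ * yields 0x00CCDD00: the two low-order bytes are kept and repositioned so
+ * that they land at byte offset 1 of the 32-bit word being assembled for
+ * the PIO buffer.
+ */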
+
+static void copy_io(u32 __iomem *piobuf, struct ipath_sge_state *ss,
+ u32 length, unsigned flush_wc)
+{
+ u32 extra = 0;
+ u32 data = 0;
+ u32 last;
+
+ while (1) {
+ u32 len = ss->sge.length;
+ u32 off;
+
+ if (len > length)
+ len = length;
+ if (len > ss->sge.sge_length)
+ len = ss->sge.sge_length;
+ BUG_ON(len == 0);
+ /* If the source address is not aligned, try to align it. */
+ off = (unsigned long)ss->sge.vaddr & (sizeof(u32) - 1);
+ if (off) {
+ u32 *addr = (u32 *)((unsigned long)ss->sge.vaddr &
+ ~(sizeof(u32) - 1));
+ u32 v = get_upper_bits(*addr, off * BITS_PER_BYTE);
+ u32 y;
+
+ y = sizeof(u32) - off;
+ if (len > y)
+ len = y;
+ if (len + extra >= sizeof(u32)) {
+ data |= set_upper_bits(v, extra *
+ BITS_PER_BYTE);
+ len = sizeof(u32) - extra;
+ if (len == length) {
+ last = data;
+ break;
+ }
+ __raw_writel(data, piobuf);
+ piobuf++;
+ extra = 0;
+ data = 0;
+ } else {
+ /* Clear unused upper bytes */
+ data |= clear_upper_bytes(v, len, extra);
+ if (len == length) {
+ last = data;
+ break;
+ }
+ extra += len;
+ }
+ } else if (extra) {
+ /* Source address is aligned. */
+ u32 *addr = (u32 *) ss->sge.vaddr;
+ int shift = extra * BITS_PER_BYTE;
+ int ushift = 32 - shift;
+ u32 l = len;
+
+ while (l >= sizeof(u32)) {
+ u32 v = *addr;
+
+ data |= set_upper_bits(v, shift);
+ __raw_writel(data, piobuf);
+ data = get_upper_bits(v, ushift);
+ piobuf++;
+ addr++;
+ l -= sizeof(u32);
+ }
+			/*
+			 * Fewer than sizeof(u32) bytes (l) remain in
+			 * this chunk.
+			 */
+ if (l) {
+ u32 v = *addr;
+
+ if (l + extra >= sizeof(u32)) {
+ data |= set_upper_bits(v, shift);
+ len -= l + extra - sizeof(u32);
+ if (len == length) {
+ last = data;
+ break;
+ }
+ __raw_writel(data, piobuf);
+ piobuf++;
+ extra = 0;
+ data = 0;
+ } else {
+ /* Clear unused upper bytes */
+ data |= clear_upper_bytes(v, l,
+ extra);
+ if (len == length) {
+ last = data;
+ break;
+ }
+ extra += l;
+ }
+ } else if (len == length) {
+ last = data;
+ break;
+ }
+ } else if (len == length) {
+ u32 w;
+
+ /*
+ * Need to round up for the last dword in the
+ * packet.
+ */
+ w = (len + 3) >> 2;
+ __iowrite32_copy(piobuf, ss->sge.vaddr, w - 1);
+ piobuf += w - 1;
+ last = ((u32 *) ss->sge.vaddr)[w - 1];
+ break;
+ } else {
+ u32 w = len >> 2;
+
+ __iowrite32_copy(piobuf, ss->sge.vaddr, w);
+ piobuf += w;
+
+ extra = len & (sizeof(u32) - 1);
+ if (extra) {
+ u32 v = ((u32 *) ss->sge.vaddr)[w];
+
+ /* Clear unused upper bytes */
+ data = clear_upper_bytes(v, extra, 0);
+ }
+ }
+ update_sge(ss, len);
+ length -= len;
+ }
+ /* Update address before sending packet. */
+ update_sge(ss, length);
+ if (flush_wc) {
+ /* must flush early everything before trigger word */
+ ipath_flush_wc();
+ __raw_writel(last, piobuf);
+ /* be sure trigger word is written */
+ ipath_flush_wc();
+ } else
+ __raw_writel(last, piobuf);
+}
+
+/*
+ * Convert IB rate to delay multiplier.
+ */
+unsigned ipath_ib_rate_to_mult(enum ib_rate rate)
+{
+ switch (rate) {
+ case IB_RATE_2_5_GBPS: return 8;
+ case IB_RATE_5_GBPS: return 4;
+ case IB_RATE_10_GBPS: return 2;
+ case IB_RATE_20_GBPS: return 1;
+ default: return 0;
+ }
+}
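+
+/*
+ * The multiplier is relative to the fastest supported rate (20 Gbps):
+ * for example IB_RATE_5_GBPS maps to 4 because such a link takes four
+ * times as long to move the same packet.
+ */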
+
+/*
+ * Convert delay multiplier to IB rate
+ */
+static enum ib_rate ipath_mult_to_ib_rate(unsigned mult)
+{
+ switch (mult) {
+ case 8: return IB_RATE_2_5_GBPS;
+ case 4: return IB_RATE_5_GBPS;
+ case 2: return IB_RATE_10_GBPS;
+ case 1: return IB_RATE_20_GBPS;
+ default: return IB_RATE_PORT_CURRENT;
+ }
+}
+
+static inline struct ipath_verbs_txreq *get_txreq(struct ipath_ibdev *dev)
+{
+ struct ipath_verbs_txreq *tx = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->pending_lock, flags);
+ if (!list_empty(&dev->txreq_free)) {
+ struct list_head *l = dev->txreq_free.next;
+
+ list_del(l);
+ tx = list_entry(l, struct ipath_verbs_txreq, txreq.list);
+ }
+ spin_unlock_irqrestore(&dev->pending_lock, flags);
+ return tx;
+}
+
+static inline void put_txreq(struct ipath_ibdev *dev,
+ struct ipath_verbs_txreq *tx)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->pending_lock, flags);
+ list_add(&tx->txreq.list, &dev->txreq_free);
+ spin_unlock_irqrestore(&dev->pending_lock, flags);
+}
+
+static void sdma_complete(void *cookie, int status)
+{
+ struct ipath_verbs_txreq *tx = cookie;
+ struct ipath_qp *qp = tx->qp;
+ struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+ unsigned long flags;
+ enum ib_wc_status ibs = status == IPATH_SDMA_TXREQ_S_OK ?
+ IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR;
+
+ if (atomic_dec_and_test(&qp->s_dma_busy)) {
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (tx->wqe)
+ ipath_send_complete(qp, tx->wqe, ibs);
+ if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND &&
+ qp->s_last != qp->s_head) ||
+ (qp->s_flags & IPATH_S_WAIT_DMA))
+ ipath_schedule_send(qp);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ wake_up(&qp->wait_dma);
+ } else if (tx->wqe) {
+ spin_lock_irqsave(&qp->s_lock, flags);
+ ipath_send_complete(qp, tx->wqe, ibs);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ }
+
+ if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF)
+ kfree(tx->txreq.map_addr);
+ put_txreq(dev, tx);
+
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+}
+
+static void decrement_dma_busy(struct ipath_qp *qp)
+{
+ unsigned long flags;
+
+ if (atomic_dec_and_test(&qp->s_dma_busy)) {
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND &&
+ qp->s_last != qp->s_head) ||
+ (qp->s_flags & IPATH_S_WAIT_DMA))
+ ipath_schedule_send(qp);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ wake_up(&qp->wait_dma);
+ }
+}
+
+/*
+ * Compute the number of clock cycles of delay before sending the next packet.
+ * The multipliers reflect the number of clocks for the fastest rate so
+ * one tick at 4xDDR is 8 ticks at 1xSDR.
+ * If the destination port will take longer to receive a packet than
+ * the outgoing link can send it, we need to delay sending the next packet
+ * by the difference in time it takes the receiver to receive and the sender
+ * to send this packet.
+ * Note that this delay is always correct for UC and RC but not always
+ * optimal for UD. For UD, the destination HCA can be different for each
+ * packet, in which case, we could send packets to a different destination
+ * while "waiting" for the delay. The overhead for doing this without
+ * HW support is more than just paying the cost of delaying some packets
+ * unnecessarily.
+ */
+static inline unsigned ipath_pkt_delay(u32 plen, u8 snd_mult, u8 rcv_mult)
+{
+ return (rcv_mult > snd_mult) ?
+ (plen * (rcv_mult - snd_mult) + 1) >> 1 : 0;
+}
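+
+/*
+ * For example, sending from a 4xDDR port (snd_mult 1) to a 1xSDR
+ * destination (rcv_mult 8), a 100-dword packet is followed by a delay of
+ * (100 * 7 + 1) >> 1 = 350 ticks; when the receiver is at least as fast
+ * as the sender the delay is zero.
+ */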
+
+static int ipath_verbs_send_dma(struct ipath_qp *qp,
+ struct ipath_ib_header *hdr, u32 hdrwords,
+ struct ipath_sge_state *ss, u32 len,
+ u32 plen, u32 dwords)
+{
+ struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+ struct ipath_devdata *dd = dev->dd;
+ struct ipath_verbs_txreq *tx;
+ u32 *piobuf;
+ u32 control;
+ u32 ndesc;
+ int ret;
+
+ tx = qp->s_tx;
+ if (tx) {
+ qp->s_tx = NULL;
+ /* resend previously constructed packet */
+ atomic_inc(&qp->s_dma_busy);
+ ret = ipath_sdma_verbs_send(dd, tx->ss, tx->len, tx);
+ if (ret) {
+ qp->s_tx = tx;
+ decrement_dma_busy(qp);
+ }
+ goto bail;
+ }
+
+ tx = get_txreq(dev);
+ if (!tx) {
+ ret = -EBUSY;
+ goto bail;
+ }
+
+ /*
+ * Get the saved delay count we computed for the previous packet
+ * and save the delay count for this packet to be used next time
+ * we get here.
+ */
+ control = qp->s_pkt_delay;
+ qp->s_pkt_delay = ipath_pkt_delay(plen, dd->delay_mult, qp->s_dmult);
+
+ tx->qp = qp;
+ atomic_inc(&qp->refcount);
+ tx->wqe = qp->s_wqe;
+ tx->txreq.callback = sdma_complete;
+ tx->txreq.callback_cookie = tx;
+ tx->txreq.flags = IPATH_SDMA_TXREQ_F_HEADTOHOST |
+ IPATH_SDMA_TXREQ_F_INTREQ | IPATH_SDMA_TXREQ_F_FREEDESC;
+ if (plen + 1 >= IPATH_SMALLBUF_DWORDS)
+ tx->txreq.flags |= IPATH_SDMA_TXREQ_F_USELARGEBUF;
+
+ /* VL15 packets bypass credit check */
+ if ((be16_to_cpu(hdr->lrh[0]) >> 12) == 15) {
+ control |= 1ULL << 31;
+ tx->txreq.flags |= IPATH_SDMA_TXREQ_F_VL15;
+ }
+
+ if (len) {
+ /*
+ * Don't try to DMA if it takes more descriptors than
+ * the queue holds.
+ */
+ ndesc = ipath_count_sge(ss, len);
+ if (ndesc >= dd->ipath_sdma_descq_cnt)
+ ndesc = 0;
+ } else
+ ndesc = 1;
+ if (ndesc) {
+ tx->hdr.pbc[0] = cpu_to_le32(plen);
+ tx->hdr.pbc[1] = cpu_to_le32(control);
+ memcpy(&tx->hdr.hdr, hdr, hdrwords << 2);
+ tx->txreq.sg_count = ndesc;
+ tx->map_len = (hdrwords + 2) << 2;
+ tx->txreq.map_addr = &tx->hdr;
+ atomic_inc(&qp->s_dma_busy);
+ ret = ipath_sdma_verbs_send(dd, ss, dwords, tx);
+ if (ret) {
+ /* save ss and length in dwords */
+ tx->ss = ss;
+ tx->len = dwords;
+ qp->s_tx = tx;
+ decrement_dma_busy(qp);
+ }
+ goto bail;
+ }
+
+ /* Allocate a buffer and copy the header and payload to it. */
+ tx->map_len = (plen + 1) << 2;
+ piobuf = kmalloc(tx->map_len, GFP_ATOMIC);
+ if (unlikely(piobuf == NULL)) {
+ ret = -EBUSY;
+ goto err_tx;
+ }
+ tx->txreq.map_addr = piobuf;
+ tx->txreq.flags |= IPATH_SDMA_TXREQ_F_FREEBUF;
+ tx->txreq.sg_count = 1;
+
+ *piobuf++ = (__force u32) cpu_to_le32(plen);
+ *piobuf++ = (__force u32) cpu_to_le32(control);
+ memcpy(piobuf, hdr, hdrwords << 2);
+ ipath_copy_from_sge(piobuf + hdrwords, ss, len);
+
+ atomic_inc(&qp->s_dma_busy);
+ ret = ipath_sdma_verbs_send(dd, NULL, 0, tx);
+ /*
+ * If we couldn't queue the DMA request, save the info
+ * and try again later rather than destroying the
+ * buffer and undoing the side effects of the copy.
+ */
+ if (ret) {
+ tx->ss = NULL;
+ tx->len = 0;
+ qp->s_tx = tx;
+ decrement_dma_busy(qp);
+ }
+ dev->n_unaligned++;
+ goto bail;
+
+err_tx:
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ put_txreq(dev, tx);
+bail:
+ return ret;
+}
+
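+/*
+ * Send a packet by programmed I/O: write the PBC, header and payload into
+ * a PIO send buffer, flushing the write-combining buffers where the
+ * hardware requires it so that the trigger word is written last.
+ * Returns 0 on success or -EBUSY if no PIO buffer is available.
+ */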
+static int ipath_verbs_send_pio(struct ipath_qp *qp,
+ struct ipath_ib_header *ibhdr, u32 hdrwords,
+ struct ipath_sge_state *ss, u32 len,
+ u32 plen, u32 dwords)
+{
+ struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
+ u32 *hdr = (u32 *) ibhdr;
+ u32 __iomem *piobuf;
+ unsigned flush_wc;
+ u32 control;
+ int ret;
+ unsigned long flags;
+
+ piobuf = ipath_getpiobuf(dd, plen, NULL);
+ if (unlikely(piobuf == NULL)) {
+ ret = -EBUSY;
+ goto bail;
+ }
+
+ /*
+ * Get the saved delay count we computed for the previous packet
+ * and save the delay count for this packet to be used next time
+ * we get here.
+ */
+ control = qp->s_pkt_delay;
+ qp->s_pkt_delay = ipath_pkt_delay(plen, dd->delay_mult, qp->s_dmult);
+
+ /* VL15 packets bypass credit check */
+ if ((be16_to_cpu(ibhdr->lrh[0]) >> 12) == 15)
+ control |= 1ULL << 31;
+
+ /*
+ * Write the length to the control qword plus any needed flags.
+ * We have to flush after the PBC for correctness on some cpus
+ * or WC buffer can be written out of order.
+ */
+ writeq(((u64) control << 32) | plen, piobuf);
+ piobuf += 2;
+
+ flush_wc = dd->ipath_flags & IPATH_PIO_FLUSH_WC;
+ if (len == 0) {
+ /*
+ * If there is just the header portion, must flush before
+ * writing last word of header for correctness, and after
+ * the last header word (trigger word).
+ */
+ if (flush_wc) {
+ ipath_flush_wc();
+ __iowrite32_copy(piobuf, hdr, hdrwords - 1);
+ ipath_flush_wc();
+ __raw_writel(hdr[hdrwords - 1], piobuf + hdrwords - 1);
+ ipath_flush_wc();
+ } else
+ __iowrite32_copy(piobuf, hdr, hdrwords);
+ goto done;
+ }
+
+ if (flush_wc)
+ ipath_flush_wc();
+ __iowrite32_copy(piobuf, hdr, hdrwords);
+ piobuf += hdrwords;
+
+ /* The common case is aligned and contained in one segment. */
+ if (likely(ss->num_sge == 1 && len <= ss->sge.length &&
+ !((unsigned long)ss->sge.vaddr & (sizeof(u32) - 1)))) {
+ u32 *addr = (u32 *) ss->sge.vaddr;
+
+ /* Update address before sending packet. */
+ update_sge(ss, len);
+ if (flush_wc) {
+ __iowrite32_copy(piobuf, addr, dwords - 1);
+ /* must flush early everything before trigger word */
+ ipath_flush_wc();
+ __raw_writel(addr[dwords - 1], piobuf + dwords - 1);
+ /* be sure trigger word is written */
+ ipath_flush_wc();
+ } else
+ __iowrite32_copy(piobuf, addr, dwords);
+ goto done;
+ }
+ copy_io(piobuf, ss, len, flush_wc);
+done:
+ if (qp->s_wqe) {
+ spin_lock_irqsave(&qp->s_lock, flags);
+ ipath_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ }
+ ret = 0;
+bail:
+ return ret;
+}
+
+/**
+ * ipath_verbs_send - send a packet
+ * @qp: the QP to send on
+ * @hdr: the packet header
+ * @hdrwords: the number of 32-bit words in the header
+ * @ss: the SGE to send
+ * @len: the length of the packet in bytes
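+ *
+ * For example, a 56 byte header (hdrwords == 14) with a 256 byte payload
+ * is sent as 14 + 64 + 1 == 79 dwords, the extra dword being the PBC
+ * control word that follows the PBC length.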
+ */
+int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr,
+ u32 hdrwords, struct ipath_sge_state *ss, u32 len)
+{
+ struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
+ u32 plen;
+ int ret;
+ u32 dwords = (len + 3) >> 2;
+
+ /*
+ * Calculate the send buffer trigger address.
+ * The +1 counts for the pbc control dword following the pbc length.
+ */
+ plen = hdrwords + dwords + 1;
+
+ /*
+ * VL15 packets (IB_QPT_SMI) will always use PIO, so we
+ * can defer SDMA restart until link goes ACTIVE without
+ * worrying about just how we got there.
+ */
+ if (qp->ibqp.qp_type == IB_QPT_SMI ||
+ !(dd->ipath_flags & IPATH_HAS_SEND_DMA))
+ ret = ipath_verbs_send_pio(qp, hdr, hdrwords, ss, len,
+ plen, dwords);
+ else
+ ret = ipath_verbs_send_dma(qp, hdr, hdrwords, ss, len,
+ plen, dwords);
+
+ return ret;
+}
+
+int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords,
+ u64 *rwords, u64 *spkts, u64 *rpkts,
+ u64 *xmit_wait)
+{
+ int ret;
+
+ if (!(dd->ipath_flags & IPATH_INITTED)) {
+ /* no hardware, freeze, etc. */
+ ret = -EINVAL;
+ goto bail;
+ }
+ *swords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt);
+ *rwords = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
+ *spkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt);
+ *rpkts = ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt);
+ *xmit_wait = ipath_snap_cntr(dd, dd->ipath_cregs->cr_sendstallcnt);
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_get_counters - get various chip counters
+ * @dd: the infinipath device
+ * @cntrs: counters are placed here
+ *
+ * Return the counters needed by recv_pma_get_portcounters().
+ */
+int ipath_get_counters(struct ipath_devdata *dd,
+ struct ipath_verbs_counters *cntrs)
+{
+ struct ipath_cregs const *crp = dd->ipath_cregs;
+ int ret;
+
+ if (!(dd->ipath_flags & IPATH_INITTED)) {
+ /* no hardware, freeze, etc. */
+ ret = -EINVAL;
+ goto bail;
+ }
+ cntrs->symbol_error_counter =
+ ipath_snap_cntr(dd, crp->cr_ibsymbolerrcnt);
+ cntrs->link_error_recovery_counter =
+ ipath_snap_cntr(dd, crp->cr_iblinkerrrecovcnt);
+ /*
+ * The link downed counter counts when the other side downs the
+ * connection. We add in the number of times we downed the link
+ * due to local link integrity errors to compensate.
+ */
+ cntrs->link_downed_counter =
+ ipath_snap_cntr(dd, crp->cr_iblinkdowncnt);
+ cntrs->port_rcv_errors =
+ ipath_snap_cntr(dd, crp->cr_rxdroppktcnt) +
+ ipath_snap_cntr(dd, crp->cr_rcvovflcnt) +
+ ipath_snap_cntr(dd, crp->cr_portovflcnt) +
+ ipath_snap_cntr(dd, crp->cr_err_rlencnt) +
+ ipath_snap_cntr(dd, crp->cr_invalidrlencnt) +
+ ipath_snap_cntr(dd, crp->cr_errlinkcnt) +
+ ipath_snap_cntr(dd, crp->cr_erricrccnt) +
+ ipath_snap_cntr(dd, crp->cr_errvcrccnt) +
+ ipath_snap_cntr(dd, crp->cr_errlpcrccnt) +
+ ipath_snap_cntr(dd, crp->cr_badformatcnt) +
+ dd->ipath_rxfc_unsupvl_errs;
+ if (crp->cr_rxotherlocalphyerrcnt)
+ cntrs->port_rcv_errors +=
+ ipath_snap_cntr(dd, crp->cr_rxotherlocalphyerrcnt);
+ if (crp->cr_rxvlerrcnt)
+ cntrs->port_rcv_errors +=
+ ipath_snap_cntr(dd, crp->cr_rxvlerrcnt);
+ cntrs->port_rcv_remphys_errors =
+ ipath_snap_cntr(dd, crp->cr_rcvebpcnt);
+ cntrs->port_xmit_discards = ipath_snap_cntr(dd, crp->cr_unsupvlcnt);
+ cntrs->port_xmit_data = ipath_snap_cntr(dd, crp->cr_wordsendcnt);
+ cntrs->port_rcv_data = ipath_snap_cntr(dd, crp->cr_wordrcvcnt);
+ cntrs->port_xmit_packets = ipath_snap_cntr(dd, crp->cr_pktsendcnt);
+ cntrs->port_rcv_packets = ipath_snap_cntr(dd, crp->cr_pktrcvcnt);
+ cntrs->local_link_integrity_errors =
+ crp->cr_locallinkintegrityerrcnt ?
+ ipath_snap_cntr(dd, crp->cr_locallinkintegrityerrcnt) :
+ ((dd->ipath_flags & IPATH_GPIO_ERRINTRS) ?
+ dd->ipath_lli_errs : dd->ipath_lli_errors);
+ cntrs->excessive_buffer_overrun_errors =
+ crp->cr_excessbufferovflcnt ?
+ ipath_snap_cntr(dd, crp->cr_excessbufferovflcnt) :
+ dd->ipath_overrun_thresh_errs;
+ cntrs->vl15_dropped = crp->cr_vl15droppedpktcnt ?
+ ipath_snap_cntr(dd, crp->cr_vl15droppedpktcnt) : 0;
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_ib_piobufavail - callback when a PIO buffer is available
+ * @arg: the device pointer
+ *
+ * This is called from ipath_intr() at interrupt level when a PIO buffer is
+ * available after ipath_verbs_send() returned an error indicating no buffers
+ * were available. It is meant to return 1 if we consumed all the PIO buffers
+ * and still have QPs waiting for buffers; for now, it simply reschedules the
+ * send tasklet for each waiting QP and returns zero.
+ */
+int ipath_ib_piobufavail(struct ipath_ibdev *dev)
+{
+ struct list_head *list;
+ struct ipath_qp *qplist;
+ struct ipath_qp *qp;
+ unsigned long flags;
+
+ if (dev == NULL)
+ goto bail;
+
+ list = &dev->piowait;
+ qplist = NULL;
+
+ spin_lock_irqsave(&dev->pending_lock, flags);
+ while (!list_empty(list)) {
+ qp = list_entry(list->next, struct ipath_qp, piowait);
+ list_del_init(&qp->piowait);
+ qp->pio_next = qplist;
+ qplist = qp;
+ atomic_inc(&qp->refcount);
+ }
+ spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+ while (qplist != NULL) {
+ qp = qplist;
+ qplist = qp->pio_next;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)
+ ipath_schedule_send(qp);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+ /* Notify ipath_destroy_qp() if it is waiting. */
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ }
+
+bail:
+ return 0;
+}
+
+static int ipath_query_device(struct ib_device *ibdev,
+ struct ib_device_attr *props)
+{
+ struct ipath_ibdev *dev = to_idev(ibdev);
+
+ memset(props, 0, sizeof(*props));
+
+ props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
+ IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
+ IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
+ IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE;
+ props->page_size_cap = PAGE_SIZE;
+ props->vendor_id =
+ IPATH_SRC_OUI_1 << 16 | IPATH_SRC_OUI_2 << 8 | IPATH_SRC_OUI_3;
+ props->vendor_part_id = dev->dd->ipath_deviceid;
+ props->hw_ver = dev->dd->ipath_pcirev;
+
+ props->sys_image_guid = dev->sys_image_guid;
+
+ props->max_mr_size = ~0ull;
+ props->max_qp = ib_ipath_max_qps;
+ props->max_qp_wr = ib_ipath_max_qp_wrs;
+ props->max_sge = ib_ipath_max_sges;
+ props->max_cq = ib_ipath_max_cqs;
+ props->max_ah = ib_ipath_max_ahs;
+ props->max_cqe = ib_ipath_max_cqes;
+ props->max_mr = dev->lk_table.max;
+ props->max_fmr = dev->lk_table.max;
+ props->max_map_per_fmr = 32767;
+ props->max_pd = ib_ipath_max_pds;
+ props->max_qp_rd_atom = IPATH_MAX_RDMA_ATOMIC;
+ props->max_qp_init_rd_atom = 255;
+ /* props->max_res_rd_atom */
+ props->max_srq = ib_ipath_max_srqs;
+ props->max_srq_wr = ib_ipath_max_srq_wrs;
+ props->max_srq_sge = ib_ipath_max_srq_sges;
+ /* props->local_ca_ack_delay */
+ props->atomic_cap = IB_ATOMIC_GLOB;
+ props->max_pkeys = ipath_get_npkeys(dev->dd);
+ props->max_mcast_grp = ib_ipath_max_mcast_grps;
+ props->max_mcast_qp_attach = ib_ipath_max_mcast_qp_attached;
+ props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
+ props->max_mcast_grp;
+
+ return 0;
+}
+
+const u8 ipath_cvt_physportstate[32] = {
+ [INFINIPATH_IBCS_LT_STATE_DISABLED] = IB_PHYSPORTSTATE_DISABLED,
+ [INFINIPATH_IBCS_LT_STATE_LINKUP] = IB_PHYSPORTSTATE_LINKUP,
+ [INFINIPATH_IBCS_LT_STATE_POLLACTIVE] = IB_PHYSPORTSTATE_POLL,
+ [INFINIPATH_IBCS_LT_STATE_POLLQUIET] = IB_PHYSPORTSTATE_POLL,
+ [INFINIPATH_IBCS_LT_STATE_SLEEPDELAY] = IB_PHYSPORTSTATE_SLEEP,
+ [INFINIPATH_IBCS_LT_STATE_SLEEPQUIET] = IB_PHYSPORTSTATE_SLEEP,
+ [INFINIPATH_IBCS_LT_STATE_CFGDEBOUNCE] =
+ IB_PHYSPORTSTATE_CFG_TRAIN,
+ [INFINIPATH_IBCS_LT_STATE_CFGRCVFCFG] =
+ IB_PHYSPORTSTATE_CFG_TRAIN,
+ [INFINIPATH_IBCS_LT_STATE_CFGWAITRMT] =
+ IB_PHYSPORTSTATE_CFG_TRAIN,
+ [INFINIPATH_IBCS_LT_STATE_CFGIDLE] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [INFINIPATH_IBCS_LT_STATE_RECOVERRETRAIN] =
+ IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+ [INFINIPATH_IBCS_LT_STATE_RECOVERWAITRMT] =
+ IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+ [INFINIPATH_IBCS_LT_STATE_RECOVERIDLE] =
+ IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+ [0x10] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x11] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x12] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x13] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x14] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x15] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x16] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x17] = IB_PHYSPORTSTATE_CFG_TRAIN
+};
+
+u32 ipath_get_cr_errpkey(struct ipath_devdata *dd)
+{
+ return ipath_read_creg32(dd, dd->ipath_cregs->cr_errpkey);
+}
+
+static int ipath_query_port(struct ib_device *ibdev,
+ u8 port, struct ib_port_attr *props)
+{
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ struct ipath_devdata *dd = dev->dd;
+ enum ib_mtu mtu;
+ u16 lid = dd->ipath_lid;
+ u64 ibcstat;
+
+ memset(props, 0, sizeof(*props));
+ props->lid = lid ? lid : be16_to_cpu(IB_LID_PERMISSIVE);
+ props->lmc = dd->ipath_lmc;
+ props->sm_lid = dev->sm_lid;
+ props->sm_sl = dev->sm_sl;
+ ibcstat = dd->ipath_lastibcstat;
+ /* map LinkState to IB portinfo values. */
+ props->state = ipath_ib_linkstate(dd, ibcstat) + 1;
+
+ /* See phys_state_show() */
+ props->phys_state = /* MEA: assumes shift == 0 */
+ ipath_cvt_physportstate[dd->ipath_lastibcstat &
+ dd->ibcs_lts_mask];
+ props->port_cap_flags = dev->port_cap_flags;
+ props->gid_tbl_len = 1;
+ props->max_msg_sz = 0x80000000;
+ props->pkey_tbl_len = ipath_get_npkeys(dd);
+ props->bad_pkey_cntr = ipath_get_cr_errpkey(dd) -
+ dev->z_pkey_violations;
+ props->qkey_viol_cntr = dev->qkey_violations;
+ props->active_width = dd->ipath_link_width_active;
+ /* See rate_show() */
+ props->active_speed = dd->ipath_link_speed_active;
+ props->max_vl_num = 1; /* VLCap = VL0 */
+ props->init_type_reply = 0;
+
+ props->max_mtu = ipath_mtu4096 ? IB_MTU_4096 : IB_MTU_2048;
+ switch (dd->ipath_ibmtu) {
+ case 4096:
+ mtu = IB_MTU_4096;
+ break;
+ case 2048:
+ mtu = IB_MTU_2048;
+ break;
+ case 1024:
+ mtu = IB_MTU_1024;
+ break;
+ case 512:
+ mtu = IB_MTU_512;
+ break;
+ case 256:
+ mtu = IB_MTU_256;
+ break;
+ default:
+ mtu = IB_MTU_2048;
+ }
+ props->active_mtu = mtu;
+ props->subnet_timeout = dev->subnet_timeout;
+
+ return 0;
+}
+
+static int ipath_modify_device(struct ib_device *device,
+ int device_modify_mask,
+ struct ib_device_modify *device_modify)
+{
+ int ret;
+
+ if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
+ IB_DEVICE_MODIFY_NODE_DESC)) {
+ ret = -EOPNOTSUPP;
+ goto bail;
+ }
+
+ if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC)
+ memcpy(device->node_desc, device_modify->node_desc, 64);
+
+ if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID)
+ to_idev(device)->sys_image_guid =
+ cpu_to_be64(device_modify->sys_image_guid);
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+static int ipath_modify_port(struct ib_device *ibdev,
+ u8 port, int port_modify_mask,
+ struct ib_port_modify *props)
+{
+ struct ipath_ibdev *dev = to_idev(ibdev);
+
+ dev->port_cap_flags |= props->set_port_cap_mask;
+ dev->port_cap_flags &= ~props->clr_port_cap_mask;
+ if (port_modify_mask & IB_PORT_SHUTDOWN)
+ ipath_set_linkstate(dev->dd, IPATH_IB_LINKDOWN);
+ if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR)
+ dev->qkey_violations = 0;
+ return 0;
+}
+
+static int ipath_query_gid(struct ib_device *ibdev, u8 port,
+ int index, union ib_gid *gid)
+{
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ int ret;
+
+ if (index >= 1) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ gid->global.subnet_prefix = dev->gid_prefix;
+ gid->global.interface_id = dev->dd->ipath_guid;
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+static struct ib_pd *ipath_alloc_pd(struct ib_device *ibdev,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ struct ipath_pd *pd;
+ struct ib_pd *ret;
+
+ /*
+ * This is actually totally arbitrary. Some correctness tests
+ * assume there's a maximum number of PDs that can be allocated.
+ * We don't actually have this limit, but we fail the test if
+ * we allow allocations of more than we report for this value.
+ */
+
+ pd = kmalloc(sizeof *pd, GFP_KERNEL);
+ if (!pd) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+
+ spin_lock(&dev->n_pds_lock);
+ if (dev->n_pds_allocated == ib_ipath_max_pds) {
+ spin_unlock(&dev->n_pds_lock);
+ kfree(pd);
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+
+ dev->n_pds_allocated++;
+ spin_unlock(&dev->n_pds_lock);
+
+ /* ib_alloc_pd() will initialize pd->ibpd. */
+ pd->user = udata != NULL;
+
+ ret = &pd->ibpd;
+
+bail:
+ return ret;
+}
+
+static int ipath_dealloc_pd(struct ib_pd *ibpd)
+{
+ struct ipath_pd *pd = to_ipd(ibpd);
+ struct ipath_ibdev *dev = to_idev(ibpd->device);
+
+ spin_lock(&dev->n_pds_lock);
+ dev->n_pds_allocated--;
+ spin_unlock(&dev->n_pds_lock);
+
+ kfree(pd);
+
+ return 0;
+}
+
+/**
+ * ipath_create_ah - create an address handle
+ * @pd: the protection domain
+ * @ah_attr: the attributes of the AH
+ *
+ * This may be called from interrupt context.
+ */
+static struct ib_ah *ipath_create_ah(struct ib_pd *pd,
+ struct ib_ah_attr *ah_attr)
+{
+ struct ipath_ah *ah;
+ struct ib_ah *ret;
+ struct ipath_ibdev *dev = to_idev(pd->device);
+ unsigned long flags;
+
+ /* A multicast address requires a GRH (see ch. 8.4.1). */
+ if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE &&
+ ah_attr->dlid != IPATH_PERMISSIVE_LID &&
+ !(ah_attr->ah_flags & IB_AH_GRH)) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+
+ if (ah_attr->dlid == 0) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+
+ if (ah_attr->port_num < 1 ||
+ ah_attr->port_num > pd->device->phys_port_cnt) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+
+ ah = kmalloc(sizeof *ah, GFP_ATOMIC);
+ if (!ah) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+
+ spin_lock_irqsave(&dev->n_ahs_lock, flags);
+ if (dev->n_ahs_allocated == ib_ipath_max_ahs) {
+ spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+ kfree(ah);
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+
+ dev->n_ahs_allocated++;
+ spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+
+ /* ib_create_ah() will initialize ah->ibah. */
+ ah->attr = *ah_attr;
+ ah->attr.static_rate = ipath_ib_rate_to_mult(ah_attr->static_rate);
+
+ ret = &ah->ibah;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_destroy_ah - destroy an address handle
+ * @ibah: the AH to destroy
+ *
+ * This may be called from interrupt context.
+ */
+static int ipath_destroy_ah(struct ib_ah *ibah)
+{
+ struct ipath_ibdev *dev = to_idev(ibah->device);
+ struct ipath_ah *ah = to_iah(ibah);
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->n_ahs_lock, flags);
+ dev->n_ahs_allocated--;
+ spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+
+ kfree(ah);
+
+ return 0;
+}
+
+static int ipath_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
+{
+ struct ipath_ah *ah = to_iah(ibah);
+
+ *ah_attr = ah->attr;
+ ah_attr->static_rate = ipath_mult_to_ib_rate(ah->attr.static_rate);
+
+ return 0;
+}
+
+/**
+ * ipath_get_npkeys - return the size of the PKEY table for port 0
+ * @dd: the infinipath device
+ */
+unsigned ipath_get_npkeys(struct ipath_devdata *dd)
+{
+ return ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys);
+}
+
+/**
+ * ipath_get_pkey - return the indexed PKEY from the port PKEY table
+ * @dd: the infinipath device
+ * @index: the PKEY index
+ */
+unsigned ipath_get_pkey(struct ipath_devdata *dd, unsigned index)
+{
+ unsigned ret;
+
+ /* always a kernel port, no locking needed */
+ if (index >= ARRAY_SIZE(dd->ipath_pd[0]->port_pkeys))
+ ret = 0;
+ else
+ ret = dd->ipath_pd[0]->port_pkeys[index];
+
+ return ret;
+}
+
+static int ipath_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+ u16 *pkey)
+{
+ struct ipath_ibdev *dev = to_idev(ibdev);
+ int ret;
+
+ if (index >= ipath_get_npkeys(dev->dd)) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ *pkey = ipath_get_pkey(dev->dd, index);
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * ipath_alloc_ucontext - allocate a ucontext
+ * @ibdev: the infiniband device
+ * @udata: not used by the InfiniPath driver
+ */
+static struct ib_ucontext *ipath_alloc_ucontext(struct ib_device *ibdev,
+ struct ib_udata *udata)
+{
+ struct ipath_ucontext *context;
+ struct ib_ucontext *ret;
+
+ context = kmalloc(sizeof *context, GFP_KERNEL);
+ if (!context) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+
+ ret = &context->ibucontext;
+
+bail:
+ return ret;
+}
+
+static int ipath_dealloc_ucontext(struct ib_ucontext *context)
+{
+ kfree(to_iucontext(context));
+ return 0;
+}
+
+static int ipath_verbs_register_sysfs(struct ib_device *dev);
+
+static void __verbs_timer(unsigned long arg)
+{
+ struct ipath_devdata *dd = (struct ipath_devdata *) arg;
+
+ /* Handle verbs layer timeouts. */
+ ipath_ib_timer(dd->verbs_dev);
+
+ mod_timer(&dd->verbs_timer, jiffies + 1);
+}
+
+static int enable_timer(struct ipath_devdata *dd)
+{
+ /*
+ * Early chips had a design flaw where the chip's and the kernel's ideas
+ * of the tail register don't always agree, and therefore we won't
+ * get an interrupt on the next packet received.
+ * If the board supports per packet receive interrupts, use it.
+ * Otherwise, the timer function periodically checks for packets
+ * to cover this case.
+ * Either way, the timer is needed for verbs layer related
+ * processing.
+ */
+ if (dd->ipath_flags & IPATH_GPIO_INTR) {
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_debugportselect,
+ 0x2074076542310ULL);
+ /* Enable GPIO bit 2 interrupt */
+ dd->ipath_gpio_mask |= (u64) (1 << IPATH_GPIO_PORT0_BIT);
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
+ dd->ipath_gpio_mask);
+ }
+
+ init_timer(&dd->verbs_timer);
+ dd->verbs_timer.function = __verbs_timer;
+ dd->verbs_timer.data = (unsigned long)dd;
+ dd->verbs_timer.expires = jiffies + 1;
+ add_timer(&dd->verbs_timer);
+
+ return 0;
+}
+
+static int disable_timer(struct ipath_devdata *dd)
+{
+ /* Disable GPIO bit 2 interrupt */
+ if (dd->ipath_flags & IPATH_GPIO_INTR) {
+ dd->ipath_gpio_mask &= ~((u64) (1 << IPATH_GPIO_PORT0_BIT));
+ ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
+ dd->ipath_gpio_mask);
+ /*
+ * We might want to undo changes to debugportselect,
+ * but how?
+ */
+ }
+
+ del_timer_sync(&dd->verbs_timer);
+
+ return 0;
+}
+
+/**
+ * ipath_register_ib_device - register our device with the infiniband core
+ * @dd: the device data structure
+ * Return 0 on success or a negative errno; on success the allocated
+ * ipath_ibdev is stored in dd->verbs_dev.
+ */
+int ipath_register_ib_device(struct ipath_devdata *dd)
+{
+ struct ipath_verbs_counters cntrs;
+ struct ipath_ibdev *idev;
+ struct ib_device *dev;
+ struct ipath_verbs_txreq *tx;
+ unsigned i;
+ int ret;
+
+ idev = (struct ipath_ibdev *)ib_alloc_device(sizeof *idev);
+ if (idev == NULL) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ dev = &idev->ibdev;
+
+ if (dd->ipath_sdma_descq_cnt) {
+ tx = kmalloc(dd->ipath_sdma_descq_cnt * sizeof *tx,
+ GFP_KERNEL);
+ if (tx == NULL) {
+ ret = -ENOMEM;
+ goto err_tx;
+ }
+ } else
+ tx = NULL;
+ idev->txreq_bufs = tx;
+
+ /* Only need to initialize non-zero fields. */
+ spin_lock_init(&idev->n_pds_lock);
+ spin_lock_init(&idev->n_ahs_lock);
+ spin_lock_init(&idev->n_cqs_lock);
+ spin_lock_init(&idev->n_qps_lock);
+ spin_lock_init(&idev->n_srqs_lock);
+ spin_lock_init(&idev->n_mcast_grps_lock);
+
+ spin_lock_init(&idev->qp_table.lock);
+ spin_lock_init(&idev->lk_table.lock);
+ idev->sm_lid = __constant_be16_to_cpu(IB_LID_PERMISSIVE);
+ /* Set the prefix to the default value (see ch. 4.1.1) */
+ idev->gid_prefix = __constant_cpu_to_be64(0xfe80000000000000ULL);
+
+ ret = ipath_init_qp_table(idev, ib_ipath_qp_table_size);
+ if (ret)
+ goto err_qp;
+
+ /*
+ * The top ib_ipath_lkey_table_size bits are used to index the
+ * table. The lower 8 bits can be owned by the user (copied from
+ * the LKEY). The remaining bits act as a generation number or tag.
+ */
+ idev->lk_table.max = 1 << ib_ipath_lkey_table_size;
+ idev->lk_table.table = kzalloc(idev->lk_table.max *
+ sizeof(*idev->lk_table.table),
+ GFP_KERNEL);
+ if (idev->lk_table.table == NULL) {
+ ret = -ENOMEM;
+ goto err_lk;
+ }
+ INIT_LIST_HEAD(&idev->pending_mmaps);
+ spin_lock_init(&idev->pending_lock);
+ idev->mmap_offset = PAGE_SIZE;
+ spin_lock_init(&idev->mmap_offset_lock);
+ INIT_LIST_HEAD(&idev->pending[0]);
+ INIT_LIST_HEAD(&idev->pending[1]);
+ INIT_LIST_HEAD(&idev->pending[2]);
+ INIT_LIST_HEAD(&idev->piowait);
+ INIT_LIST_HEAD(&idev->rnrwait);
+ INIT_LIST_HEAD(&idev->txreq_free);
+ idev->pending_index = 0;
+ idev->port_cap_flags =
+ IB_PORT_SYS_IMAGE_GUID_SUP | IB_PORT_CLIENT_REG_SUP;
+ if (dd->ipath_flags & IPATH_HAS_LINK_LATENCY)
+ idev->port_cap_flags |= IB_PORT_LINK_LATENCY_SUP;
+ idev->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
+ idev->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
+ idev->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
+ idev->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
+ idev->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT;
+
+ /* Snapshot current HW counters to "clear" them. */
+ ipath_get_counters(dd, &cntrs);
+ idev->z_symbol_error_counter = cntrs.symbol_error_counter;
+ idev->z_link_error_recovery_counter =
+ cntrs.link_error_recovery_counter;
+ idev->z_link_downed_counter = cntrs.link_downed_counter;
+ idev->z_port_rcv_errors = cntrs.port_rcv_errors;
+ idev->z_port_rcv_remphys_errors =
+ cntrs.port_rcv_remphys_errors;
+ idev->z_port_xmit_discards = cntrs.port_xmit_discards;
+ idev->z_port_xmit_data = cntrs.port_xmit_data;
+ idev->z_port_rcv_data = cntrs.port_rcv_data;
+ idev->z_port_xmit_packets = cntrs.port_xmit_packets;
+ idev->z_port_rcv_packets = cntrs.port_rcv_packets;
+ idev->z_local_link_integrity_errors =
+ cntrs.local_link_integrity_errors;
+ idev->z_excessive_buffer_overrun_errors =
+ cntrs.excessive_buffer_overrun_errors;
+ idev->z_vl15_dropped = cntrs.vl15_dropped;
+
+ for (i = 0; i < dd->ipath_sdma_descq_cnt; i++, tx++)
+ list_add(&tx->txreq.list, &idev->txreq_free);
+
+ /*
+ * The system image GUID is supposed to be the same for all
+ * IB HCAs in a single system but since there can be other
+ * device types in the system, we can't be sure this is unique.
+ */
+ if (!sys_image_guid)
+ sys_image_guid = dd->ipath_guid;
+ idev->sys_image_guid = sys_image_guid;
+ idev->ib_unit = dd->ipath_unit;
+ idev->dd = dd;
+
+ strlcpy(dev->name, "ipath%d", IB_DEVICE_NAME_MAX);
+ dev->owner = THIS_MODULE;
+ dev->node_guid = dd->ipath_guid;
+ dev->uverbs_abi_ver = IPATH_UVERBS_ABI_VERSION;
+ dev->uverbs_cmd_mask =
+ (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+ (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_AH) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_AH) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_AH) |
+ (1ull << IB_USER_VERBS_CMD_REG_MR) |
+ (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
+ (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_QP) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+ (1ull << IB_USER_VERBS_CMD_POST_SEND) |
+ (1ull << IB_USER_VERBS_CMD_POST_RECV) |
+ (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) |
+ (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV);
+ dev->node_type = RDMA_NODE_IB_CA;
+ dev->phys_port_cnt = 1;
+ dev->num_comp_vectors = 1;
+ dev->dma_device = &dd->pcidev->dev;
+ dev->query_device = ipath_query_device;
+ dev->modify_device = ipath_modify_device;
+ dev->query_port = ipath_query_port;
+ dev->modify_port = ipath_modify_port;
+ dev->query_pkey = ipath_query_pkey;
+ dev->query_gid = ipath_query_gid;
+ dev->alloc_ucontext = ipath_alloc_ucontext;
+ dev->dealloc_ucontext = ipath_dealloc_ucontext;
+ dev->alloc_pd = ipath_alloc_pd;
+ dev->dealloc_pd = ipath_dealloc_pd;
+ dev->create_ah = ipath_create_ah;
+ dev->destroy_ah = ipath_destroy_ah;
+ dev->query_ah = ipath_query_ah;
+ dev->create_srq = ipath_create_srq;
+ dev->modify_srq = ipath_modify_srq;
+ dev->query_srq = ipath_query_srq;
+ dev->destroy_srq = ipath_destroy_srq;
+ dev->create_qp = ipath_create_qp;
+ dev->modify_qp = ipath_modify_qp;
+ dev->query_qp = ipath_query_qp;
+ dev->destroy_qp = ipath_destroy_qp;
+ dev->post_send = ipath_post_send;
+ dev->post_recv = ipath_post_receive;
+ dev->post_srq_recv = ipath_post_srq_receive;
+ dev->create_cq = ipath_create_cq;
+ dev->destroy_cq = ipath_destroy_cq;
+ dev->resize_cq = ipath_resize_cq;
+ dev->poll_cq = ipath_poll_cq;
+ dev->req_notify_cq = ipath_req_notify_cq;
+ dev->get_dma_mr = ipath_get_dma_mr;
+ dev->reg_phys_mr = ipath_reg_phys_mr;
+ dev->reg_user_mr = ipath_reg_user_mr;
+ dev->dereg_mr = ipath_dereg_mr;
+ dev->alloc_fmr = ipath_alloc_fmr;
+ dev->map_phys_fmr = ipath_map_phys_fmr;
+ dev->unmap_fmr = ipath_unmap_fmr;
+ dev->dealloc_fmr = ipath_dealloc_fmr;
+ dev->attach_mcast = ipath_multicast_attach;
+ dev->detach_mcast = ipath_multicast_detach;
+ dev->process_mad = ipath_process_mad;
+ dev->mmap = ipath_mmap;
+ dev->dma_ops = &ipath_dma_mapping_ops;
+
+ snprintf(dev->node_desc, sizeof(dev->node_desc),
+ IPATH_IDSTR " %s", init_utsname()->nodename);
+
+ ret = ib_register_device(dev, NULL);
+ if (ret)
+ goto err_reg;
+
+ ret = ipath_verbs_register_sysfs(dev);
+ if (ret)
+ goto err_class;
+
+ enable_timer(dd);
+
+ goto bail;
+
+err_class:
+ ib_unregister_device(dev);
+err_reg:
+ kfree(idev->lk_table.table);
+err_lk:
+ kfree(idev->qp_table.table);
+err_qp:
+ kfree(idev->txreq_bufs);
+err_tx:
+ ib_dealloc_device(dev);
+ ipath_dev_err(dd, "cannot register verbs: %d!\n", -ret);
+ idev = NULL;
+
+bail:
+ dd->verbs_dev = idev;
+ return ret;
+}
+
+void ipath_unregister_ib_device(struct ipath_ibdev *dev)
+{
+ struct ib_device *ibdev = &dev->ibdev;
+ u32 qps_inuse;
+
+ ib_unregister_device(ibdev);
+
+ disable_timer(dev->dd);
+
+ if (!list_empty(&dev->pending[0]) ||
+ !list_empty(&dev->pending[1]) ||
+ !list_empty(&dev->pending[2]))
+ ipath_dev_err(dev->dd, "pending list not empty!\n");
+ if (!list_empty(&dev->piowait))
+ ipath_dev_err(dev->dd, "piowait list not empty!\n");
+ if (!list_empty(&dev->rnrwait))
+ ipath_dev_err(dev->dd, "rnrwait list not empty!\n");
+ if (!ipath_mcast_tree_empty())
+ ipath_dev_err(dev->dd, "multicast table memory leak!\n");
+ /*
+ * Note that ipath_unregister_ib_device() can be called before all
+ * the QPs are destroyed!
+ */
+ qps_inuse = ipath_free_all_qps(&dev->qp_table);
+ if (qps_inuse)
+ ipath_dev_err(dev->dd, "QP memory leak! %u still in use\n",
+ qps_inuse);
+ kfree(dev->qp_table.table);
+ kfree(dev->lk_table.table);
+ kfree(dev->txreq_bufs);
+ ib_dealloc_device(ibdev);
+}
+
+static ssize_t show_rev(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_ibdev *dev =
+ container_of(device, struct ipath_ibdev, ibdev.dev);
+
+ return sprintf(buf, "%x\n", dev->dd->ipath_pcirev);
+}
+
+static ssize_t show_hca(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_ibdev *dev =
+ container_of(device, struct ipath_ibdev, ibdev.dev);
+ int ret;
+
+ ret = dev->dd->ipath_f_get_boardname(dev->dd, buf, 128);
+ if (ret < 0)
+ goto bail;
+ strcat(buf, "\n");
+ ret = strlen(buf);
+
+bail:
+ return ret;
+}
+
+static ssize_t show_stats(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct ipath_ibdev *dev =
+ container_of(device, struct ipath_ibdev, ibdev.dev);
+ int i;
+ int len;
+
+ len = sprintf(buf,
+ "RC resends %d\n"
+ "RC no QACK %d\n"
+ "RC ACKs %d\n"
+ "RC SEQ NAKs %d\n"
+ "RC RDMA seq %d\n"
+ "RC RNR NAKs %d\n"
+ "RC OTH NAKs %d\n"
+ "RC timeouts %d\n"
+ "RC RDMA dup %d\n"
+ "piobuf wait %d\n"
+ "unaligned %d\n"
+ "PKT drops %d\n"
+ "WQE errs %d\n",
+ dev->n_rc_resends, dev->n_rc_qacks, dev->n_rc_acks,
+ dev->n_seq_naks, dev->n_rdma_seq, dev->n_rnr_naks,
+ dev->n_other_naks, dev->n_timeouts,
+ dev->n_rdma_dup_busy, dev->n_piowait, dev->n_unaligned,
+ dev->n_pkt_drops, dev->n_wqe_errs);
+ for (i = 0; i < ARRAY_SIZE(dev->opstats); i++) {
+ const struct ipath_opcode_stats *si = &dev->opstats[i];
+
+ if (!si->n_packets && !si->n_bytes)
+ continue;
+ len += sprintf(buf + len, "%02x %llu/%llu\n", i,
+ (unsigned long long) si->n_packets,
+ (unsigned long long) si->n_bytes);
+ }
+ return len;
+}
+
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(stats, S_IRUGO, show_stats, NULL);
+
+static struct device_attribute *ipath_class_attributes[] = {
+ &dev_attr_hw_rev,
+ &dev_attr_hca_type,
+ &dev_attr_board_id,
+ &dev_attr_stats
+};
+
+static int ipath_verbs_register_sysfs(struct ib_device *dev)
+{
+ int i;
+ int ret;
+
+ for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i) {
+ ret = device_create_file(&dev->dev,
+ ipath_class_attributes[i]);
+ if (ret)
+ goto bail;
+ }
+ return 0;
+bail:
+ for (i = 0; i < ARRAY_SIZE(ipath_class_attributes); ++i)
+ device_remove_file(&dev->dev, ipath_class_attributes[i]);
+ return ret;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.h b/drivers/infiniband/hw/ipath/ipath_verbs.h
new file mode 100644
index 000000000..ae6cff4ab
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_verbs.h
@@ -0,0 +1,936 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IPATH_VERBS_H
+#define IPATH_VERBS_H
+
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/kref.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "ipath_kernel.h"
+
+#define IPATH_MAX_RDMA_ATOMIC 4
+
+#define QPN_MAX (1 << 24)
+#define QPNMAP_ENTRIES (QPN_MAX / PAGE_SIZE / BITS_PER_BYTE)
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define IPATH_UVERBS_ABI_VERSION 2
+
+/*
+ * Define an ib_cq_notify value that is not valid so we know when CQ
+ * notifications are armed.
+ */
+#define IB_CQ_NONE (IB_CQ_NEXT_COMP + 1)
+
+/* AETH NAK opcode values */
+#define IB_RNR_NAK 0x20
+#define IB_NAK_PSN_ERROR 0x60
+#define IB_NAK_INVALID_REQUEST 0x61
+#define IB_NAK_REMOTE_ACCESS_ERROR 0x62
+#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63
+#define IB_NAK_INVALID_RD_REQUEST 0x64
+
+/* Flags for checking QP state (see ib_ipath_state_ops[]) */
+#define IPATH_POST_SEND_OK 0x01
+#define IPATH_POST_RECV_OK 0x02
+#define IPATH_PROCESS_RECV_OK 0x04
+#define IPATH_PROCESS_SEND_OK 0x08
+#define IPATH_PROCESS_NEXT_SEND_OK 0x10
+#define IPATH_FLUSH_SEND 0x20
+#define IPATH_FLUSH_RECV 0x40
+#define IPATH_PROCESS_OR_FLUSH_SEND \
+ (IPATH_PROCESS_SEND_OK | IPATH_FLUSH_SEND)
+
+/* IB Performance Manager status values */
+#define IB_PMA_SAMPLE_STATUS_DONE 0x00
+#define IB_PMA_SAMPLE_STATUS_STARTED 0x01
+#define IB_PMA_SAMPLE_STATUS_RUNNING 0x02
+
+/* Mandatory IB performance counter select values. */
+#define IB_PMA_PORT_XMIT_DATA cpu_to_be16(0x0001)
+#define IB_PMA_PORT_RCV_DATA cpu_to_be16(0x0002)
+#define IB_PMA_PORT_XMIT_PKTS cpu_to_be16(0x0003)
+#define IB_PMA_PORT_RCV_PKTS cpu_to_be16(0x0004)
+#define IB_PMA_PORT_XMIT_WAIT cpu_to_be16(0x0005)
+
+struct ib_reth {
+ __be64 vaddr;
+ __be32 rkey;
+ __be32 length;
+} __attribute__ ((packed));
+
+struct ib_atomic_eth {
+ __be32 vaddr[2]; /* unaligned so access as 2 32-bit words */
+ __be32 rkey;
+ __be64 swap_data;
+ __be64 compare_data;
+} __attribute__ ((packed));
+
+struct ipath_other_headers {
+ __be32 bth[3];
+ union {
+ struct {
+ __be32 deth[2];
+ __be32 imm_data;
+ } ud;
+ struct {
+ struct ib_reth reth;
+ __be32 imm_data;
+ } rc;
+ struct {
+ __be32 aeth;
+ __be32 atomic_ack_eth[2];
+ } at;
+ __be32 imm_data;
+ __be32 aeth;
+ struct ib_atomic_eth atomic_eth;
+ } u;
+} __attribute__ ((packed));
+
+/*
+ * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes
+ * long (72 w/ imm_data). Only the first 56 bytes of the IB header
+ * will be in the eager header buffer. The remaining 12 or 16 bytes
+ * are in the data buffer.
+ */
+struct ipath_ib_header {
+ __be16 lrh[4];
+ union {
+ struct {
+ struct ib_grh grh;
+ struct ipath_other_headers oth;
+ } l;
+ struct ipath_other_headers oth;
+ } u;
+} __attribute__ ((packed));
+
+struct ipath_pio_header {
+ __le32 pbc[2];
+ struct ipath_ib_header hdr;
+} __attribute__ ((packed));
+
+/*
+ * There is one struct ipath_mcast for each multicast GID.
+ * All attached QPs are then stored as a list of
+ * struct ipath_mcast_qp.
+ */
+struct ipath_mcast_qp {
+ struct list_head list;
+ struct ipath_qp *qp;
+};
+
+struct ipath_mcast {
+ struct rb_node rb_node;
+ union ib_gid mgid;
+ struct list_head qp_list;
+ wait_queue_head_t wait;
+ atomic_t refcount;
+ int n_attached;
+};
+
+/* Protection domain */
+struct ipath_pd {
+ struct ib_pd ibpd;
+ int user; /* non-zero if created from user space */
+};
+
+/* Address Handle */
+struct ipath_ah {
+ struct ib_ah ibah;
+ struct ib_ah_attr attr;
+};
+
+/*
+ * This structure is used by ipath_mmap() to validate an offset
+ * when an mmap() request is made. The vm_area_struct then uses
+ * this as its vm_private_data.
+ */
+struct ipath_mmap_info {
+ struct list_head pending_mmaps;
+ struct ib_ucontext *context;
+ void *obj;
+ __u64 offset;
+ struct kref ref;
+ unsigned size;
+};
+
+/*
+ * This structure is used to contain the head pointer, tail pointer,
+ * and completion queue entries as a single memory allocation so
+ * it can be mmap'ed into user space.
+ */
+struct ipath_cq_wc {
+ u32 head; /* index of next entry to fill */
+ u32 tail; /* index of next ib_poll_cq() entry */
+ union {
+ /* these are actually size ibcq.cqe + 1 */
+ struct ib_uverbs_wc uqueue[0];
+ struct ib_wc kqueue[0];
+ };
+};
+
+/*
+ * The completion queue structure.
+ */
+struct ipath_cq {
+ struct ib_cq ibcq;
+ struct tasklet_struct comptask;
+ spinlock_t lock;
+ u8 notify;
+ u8 triggered;
+ struct ipath_cq_wc *queue;
+ struct ipath_mmap_info *ip;
+};
+
+/*
+ * A segment is a linear region of low physical memory.
+ * XXX Maybe we should use phys addr here and kmap()/kunmap().
+ * Used by the verbs layer.
+ */
+struct ipath_seg {
+ void *vaddr;
+ size_t length;
+};
+
+/* The number of ipath_segs that fit in a page. */
+#define IPATH_SEGSZ (PAGE_SIZE / sizeof (struct ipath_seg))
+
+struct ipath_segarray {
+ struct ipath_seg segs[IPATH_SEGSZ];
+};
+
+struct ipath_mregion {
+ struct ib_pd *pd; /* shares refcnt of ibmr.pd */
+ u64 user_base; /* User's address for this region */
+ u64 iova; /* IB start address of this region */
+ size_t length;
+ u32 lkey;
+ u32 offset; /* offset (bytes) to start of region */
+ int access_flags;
+ u32 max_segs; /* number of ipath_segs in all the arrays */
+ u32 mapsz; /* size of the map array */
+ struct ipath_segarray *map[0]; /* the segments */
+};
+
+/*
+ * These keep track of the copy progress within a memory region.
+ * Used by the verbs layer.
+ */
+struct ipath_sge {
+ struct ipath_mregion *mr;
+ void *vaddr; /* kernel virtual address of segment */
+ u32 sge_length; /* length of the SGE */
+ u32 length; /* remaining length of the segment */
+ u16 m; /* current index: mr->map[m] */
+ u16 n; /* current index: mr->map[m]->segs[n] */
+};
+
+/* Memory region */
+struct ipath_mr {
+ struct ib_mr ibmr;
+ struct ib_umem *umem;
+ struct ipath_mregion mr; /* must be last */
+};
+
+/*
+ * Send work request queue entry.
+ * The size of the sg_list is determined when the QP is created and stored
+ * in qp->s_max_sge.
+ */
+struct ipath_swqe {
+ struct ib_send_wr wr; /* don't use wr.sg_list */
+ u32 psn; /* first packet sequence number */
+ u32 lpsn; /* last packet sequence number */
+ u32 ssn; /* send sequence number */
+ u32 length; /* total length of data in sg_list */
+ struct ipath_sge sg_list[0];
+};
+
+/*
+ * Receive work request queue entry.
+ * The size of the sg_list is determined when the QP (or SRQ) is created
+ * and stored in qp->r_rq.max_sge (or srq->rq.max_sge).
+ */
+struct ipath_rwqe {
+ u64 wr_id;
+ u8 num_sge;
+ struct ib_sge sg_list[0];
+};
+
+/*
+ * This structure is used to contain the head pointer, tail pointer,
+ * and receive work queue entries as a single memory allocation so
+ * it can be mmap'ed into user space.
+ * Note that the wq array elements are variable size so you can't
+ * just index into the array to get the N'th element;
+ * use get_rwqe_ptr() instead.
+ */
+struct ipath_rwq {
+ u32 head; /* new work requests posted to the head */
+ u32 tail; /* receives pull requests from here. */
+ struct ipath_rwqe wq[0];
+};
+
+struct ipath_rq {
+ struct ipath_rwq *wq;
+ spinlock_t lock;
+ u32 size; /* size of RWQE array */
+ u8 max_sge;
+};
+
+struct ipath_srq {
+ struct ib_srq ibsrq;
+ struct ipath_rq rq;
+ struct ipath_mmap_info *ip;
+ /* send signal when number of RWQEs < limit */
+ u32 limit;
+};
+
+struct ipath_sge_state {
+ struct ipath_sge *sg_list; /* next SGE to be used if any */
+ struct ipath_sge sge; /* progress state for the current SGE */
+ u8 num_sge;
+ u8 static_rate;
+};
+
+/*
+ * This structure holds the information that the send tasklet needs
+ * to send a RDMA read response or atomic operation.
+ */
+struct ipath_ack_entry {
+ u8 opcode;
+ u8 sent;
+ u32 psn;
+ union {
+ struct ipath_sge_state rdma_sge;
+ u64 atomic_data;
+ };
+};
+
+/*
+ * Variables prefixed with s_ are for the requester (sender).
+ * Variables prefixed with r_ are for the responder (receiver).
+ * Variables prefixed with ack_ are for responder replies.
+ *
+ * Common variables are protected by both r_rq.lock and s_lock in that order
+ * which only happens in modify_qp() or changing the QP 'state'.
+ */
+struct ipath_qp {
+ struct ib_qp ibqp;
+ struct ipath_qp *next; /* link list for QPN hash table */
+ struct ipath_qp *timer_next; /* link list for ipath_ib_timer() */
+ struct ipath_qp *pio_next; /* link for ipath_ib_piobufavail() */
+ struct list_head piowait; /* link for wait PIO buf */
+ struct list_head timerwait; /* link for waiting for timeouts */
+ struct ib_ah_attr remote_ah_attr;
+ struct ipath_ib_header s_hdr; /* next packet header to send */
+ atomic_t refcount;
+ wait_queue_head_t wait;
+ wait_queue_head_t wait_dma;
+ struct tasklet_struct s_task;
+ struct ipath_mmap_info *ip;
+ struct ipath_sge_state *s_cur_sge;
+ struct ipath_verbs_txreq *s_tx;
+ struct ipath_sge_state s_sge; /* current send request data */
+ struct ipath_ack_entry s_ack_queue[IPATH_MAX_RDMA_ATOMIC + 1];
+ struct ipath_sge_state s_ack_rdma_sge;
+ struct ipath_sge_state s_rdma_read_sge;
+ struct ipath_sge_state r_sge; /* current receive data */
+ spinlock_t s_lock;
+ atomic_t s_dma_busy;
+ u16 s_pkt_delay;
+ u16 s_hdrwords; /* size of s_hdr in 32 bit words */
+ u32 s_cur_size; /* size of send packet in bytes */
+ u32 s_len; /* total length of s_sge */
+ u32 s_rdma_read_len; /* total length of s_rdma_read_sge */
+ u32 s_next_psn; /* PSN for next request */
+ u32 s_last_psn; /* last response PSN processed */
+ u32 s_psn; /* current packet sequence number */
+ u32 s_ack_rdma_psn; /* PSN for sending RDMA read responses */
+ u32 s_ack_psn; /* PSN for acking sends and RDMA writes */
+ u32 s_rnr_timeout; /* number of milliseconds for RNR timeout */
+ u32 r_ack_psn; /* PSN for next ACK or atomic ACK */
+ u64 r_wr_id; /* ID for current receive WQE */
+ unsigned long r_aflags;
+ u32 r_len; /* total length of r_sge */
+ u32 r_rcv_len; /* receive data len processed */
+ u32 r_psn; /* expected rcv packet sequence number */
+ u32 r_msn; /* message sequence number */
+ u8 state; /* QP state */
+ u8 s_state; /* opcode of last packet sent */
+ u8 s_ack_state; /* opcode of packet to ACK */
+ u8 s_nak_state; /* non-zero if NAK is pending */
+ u8 r_state; /* opcode of last packet received */
+ u8 r_nak_state; /* non-zero if NAK is pending */
+ u8 r_min_rnr_timer; /* retry timeout value for RNR NAKs */
+ u8 r_flags;
+ u8 r_max_rd_atomic; /* max number of RDMA read/atomic to receive */
+ u8 r_head_ack_queue; /* index into s_ack_queue[] */
+ u8 qp_access_flags;
+ u8 s_max_sge; /* size of s_wq->sg_list */
+ u8 s_retry_cnt; /* number of times to retry */
+ u8 s_rnr_retry_cnt;
+ u8 s_retry; /* requester retry counter */
+ u8 s_rnr_retry; /* requester RNR retry counter */
+ u8 s_pkey_index; /* PKEY index to use */
+ u8 s_max_rd_atomic; /* max number of RDMA read/atomic to send */
+ u8 s_num_rd_atomic; /* number of RDMA read/atomic pending */
+ u8 s_tail_ack_queue; /* index into s_ack_queue[] */
+ u8 s_flags;
+ u8 s_dmult;
+ u8 s_draining;
+ u8 timeout; /* Timeout for this QP */
+ enum ib_mtu path_mtu;
+ u32 remote_qpn;
+ u32 qkey; /* QKEY for this QP (for UD or RD) */
+ u32 s_size; /* send work queue size */
+ u32 s_head; /* new entries added here */
+ u32 s_tail; /* next entry to process */
+ u32 s_cur; /* current work queue entry */
+ u32 s_last; /* last un-ACK'ed entry */
+ u32 s_ssn; /* SSN of tail entry */
+ u32 s_lsn; /* limit sequence number (credit) */
+ struct ipath_swqe *s_wq; /* send work queue */
+ struct ipath_swqe *s_wqe;
+ struct ipath_sge *r_ud_sg_list;
+ struct ipath_rq r_rq; /* receive work queue */
+ struct ipath_sge r_sg_list[0]; /* verified SGEs */
+};
+
+/*
+ * Atomic bit definitions for r_aflags.
+ */
+#define IPATH_R_WRID_VALID 0
+
+/*
+ * Bit definitions for r_flags.
+ */
+#define IPATH_R_REUSE_SGE 0x01
+#define IPATH_R_RDMAR_SEQ 0x02
+
+/*
+ * Bit definitions for s_flags.
+ *
+ * IPATH_S_FENCE_PENDING - waiting for all prior RDMA read or atomic SWQEs
+ * before processing the next SWQE
+ * IPATH_S_RDMAR_PENDING - waiting for any RDMA read or atomic SWQEs
+ * before processing the next SWQE
+ * IPATH_S_WAITING - waiting for RNR timeout or send buffer available.
+ * IPATH_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE
+ * IPATH_S_WAIT_DMA - waiting for send DMA queue to drain before generating
+ * next send completion entry not via send DMA.
+ */
+#define IPATH_S_SIGNAL_REQ_WR 0x01
+#define IPATH_S_FENCE_PENDING 0x02
+#define IPATH_S_RDMAR_PENDING 0x04
+#define IPATH_S_ACK_PENDING 0x08
+#define IPATH_S_BUSY 0x10
+#define IPATH_S_WAITING 0x20
+#define IPATH_S_WAIT_SSN_CREDIT 0x40
+#define IPATH_S_WAIT_DMA 0x80
+
+#define IPATH_S_ANY_WAIT (IPATH_S_FENCE_PENDING | IPATH_S_RDMAR_PENDING | \
+ IPATH_S_WAITING | IPATH_S_WAIT_SSN_CREDIT | IPATH_S_WAIT_DMA)
+
+#define IPATH_PSN_CREDIT 512
+
+/*
+ * Since struct ipath_swqe is not a fixed size, we can't simply index into
+ * struct ipath_qp.s_wq. This function does the array index computation.
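+ * Each entry is sizeof(struct ipath_swqe) plus room for s_max_sge SGEs,
+ * so entry n starts n such slots past s_wq.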
+ */
+static inline struct ipath_swqe *get_swqe_ptr(struct ipath_qp *qp,
+ unsigned n)
+{
+ return (struct ipath_swqe *)((char *)qp->s_wq +
+ (sizeof(struct ipath_swqe) +
+ qp->s_max_sge *
+ sizeof(struct ipath_sge)) * n);
+}
+
+/*
+ * Since struct ipath_rwqe is not a fixed size, we can't simply index into
+ * struct ipath_rwq.wq. This function does the array index computation.
+ */
+static inline struct ipath_rwqe *get_rwqe_ptr(struct ipath_rq *rq,
+ unsigned n)
+{
+ return (struct ipath_rwqe *)
+ ((char *) rq->wq->wq +
+ (sizeof(struct ipath_rwqe) +
+ rq->max_sge * sizeof(struct ib_sge)) * n);
+}
+
+/*
+ * QPN-map pages start out as NULL, they get allocated upon
+ * first use and are never deallocated. This way,
+ * large bitmaps are not allocated unless large numbers of QPs are used.
+ */
+struct qpn_map {
+ atomic_t n_free;
+ void *page;
+};
+
+struct ipath_qp_table {
+ spinlock_t lock;
+ u32 last; /* last QP number allocated */
+ u32 max; /* size of the hash table */
+ u32 nmaps; /* size of the map table */
+ struct ipath_qp **table;
+ /* bit map of free numbers */
+ struct qpn_map map[QPNMAP_ENTRIES];
+};
+
+struct ipath_lkey_table {
+ spinlock_t lock;
+ u32 next; /* next unused index (speeds search) */
+ u32 gen; /* generation count */
+ u32 max; /* size of the table */
+ struct ipath_mregion **table;
+};
+
+struct ipath_opcode_stats {
+ u64 n_packets; /* number of packets */
+ u64 n_bytes; /* total number of bytes */
+};
+
+struct ipath_ibdev {
+ struct ib_device ibdev;
+ struct ipath_devdata *dd;
+ struct list_head pending_mmaps;
+ spinlock_t mmap_offset_lock;
+ u32 mmap_offset;
+ int ib_unit; /* This is the device number */
+ u16 sm_lid; /* in host order */
+ u8 sm_sl;
+ u8 mkeyprot;
+ /* non-zero when timer is set */
+ unsigned long mkey_lease_timeout;
+
+ /* The following fields are really per port. */
+ struct ipath_qp_table qp_table;
+ struct ipath_lkey_table lk_table;
+ struct list_head pending[3]; /* FIFO of QPs waiting for ACKs */
+ struct list_head piowait; /* list for wait PIO buf */
+ struct list_head txreq_free;
+ void *txreq_bufs;
+ /* list of QPs waiting for RNR timer */
+ struct list_head rnrwait;
+ spinlock_t pending_lock;
+ __be64 sys_image_guid; /* in network order */
+ __be64 gid_prefix; /* in network order */
+ __be64 mkey;
+
+ u32 n_pds_allocated; /* number of PDs allocated for device */
+ spinlock_t n_pds_lock;
+ u32 n_ahs_allocated; /* number of AHs allocated for device */
+ spinlock_t n_ahs_lock;
+ u32 n_cqs_allocated; /* number of CQs allocated for device */
+ spinlock_t n_cqs_lock;
+ u32 n_qps_allocated; /* number of QPs allocated for device */
+ spinlock_t n_qps_lock;
+ u32 n_srqs_allocated; /* number of SRQs allocated for device */
+ spinlock_t n_srqs_lock;
+ u32 n_mcast_grps_allocated; /* number of mcast groups allocated */
+ spinlock_t n_mcast_grps_lock;
+
+ u64 ipath_sword; /* total dwords sent (sample result) */
+ u64 ipath_rword; /* total dwords received (sample result) */
+ u64 ipath_spkts; /* total packets sent (sample result) */
+ u64 ipath_rpkts; /* total packets received (sample result) */
+ /* # of ticks no data sent (sample result) */
+ u64 ipath_xmit_wait;
+ u64 rcv_errors; /* # of packets with SW detected rcv errs */
+ u64 n_unicast_xmit; /* total unicast packets sent */
+ u64 n_unicast_rcv; /* total unicast packets received */
+ u64 n_multicast_xmit; /* total multicast packets sent */
+ u64 n_multicast_rcv; /* total multicast packets received */
+ u64 z_symbol_error_counter; /* starting count for PMA */
+ u64 z_link_error_recovery_counter; /* starting count for PMA */
+ u64 z_link_downed_counter; /* starting count for PMA */
+ u64 z_port_rcv_errors; /* starting count for PMA */
+ u64 z_port_rcv_remphys_errors; /* starting count for PMA */
+ u64 z_port_xmit_discards; /* starting count for PMA */
+ u64 z_port_xmit_data; /* starting count for PMA */
+ u64 z_port_rcv_data; /* starting count for PMA */
+ u64 z_port_xmit_packets; /* starting count for PMA */
+ u64 z_port_rcv_packets; /* starting count for PMA */
+ u32 z_pkey_violations; /* starting count for PMA */
+ u32 z_local_link_integrity_errors; /* starting count for PMA */
+ u32 z_excessive_buffer_overrun_errors; /* starting count for PMA */
+ u32 z_vl15_dropped; /* starting count for PMA */
+ u32 n_rc_resends;
+ u32 n_rc_acks;
+ u32 n_rc_qacks;
+ u32 n_seq_naks;
+ u32 n_rdma_seq;
+ u32 n_rnr_naks;
+ u32 n_other_naks;
+ u32 n_timeouts;
+ u32 n_pkt_drops;
+ u32 n_vl15_dropped;
+ u32 n_wqe_errs;
+ u32 n_rdma_dup_busy;
+ u32 n_piowait;
+ u32 n_unaligned;
+ u32 port_cap_flags;
+ u32 pma_sample_start;
+ u32 pma_sample_interval;
+ __be16 pma_counter_select[5];
+ u16 pma_tag;
+ u16 qkey_violations;
+ u16 mkey_violations;
+ u16 mkey_lease_period;
+ u16 pending_index; /* which pending queue is active */
+ u8 pma_sample_status;
+ u8 subnet_timeout;
+ u8 vl_high_limit;
+ struct ipath_opcode_stats opstats[128];
+};
+
+struct ipath_verbs_counters {
+ u64 symbol_error_counter;
+ u64 link_error_recovery_counter;
+ u64 link_downed_counter;
+ u64 port_rcv_errors;
+ u64 port_rcv_remphys_errors;
+ u64 port_xmit_discards;
+ u64 port_xmit_data;
+ u64 port_rcv_data;
+ u64 port_xmit_packets;
+ u64 port_rcv_packets;
+ u32 local_link_integrity_errors;
+ u32 excessive_buffer_overrun_errors;
+ u32 vl15_dropped;
+};
+
+struct ipath_verbs_txreq {
+ struct ipath_qp *qp;
+ struct ipath_swqe *wqe;
+ u32 map_len;
+ u32 len;
+ struct ipath_sge_state *ss;
+ struct ipath_pio_header hdr;
+ struct ipath_sdma_txreq txreq;
+};
+
+static inline struct ipath_mr *to_imr(struct ib_mr *ibmr)
+{
+ return container_of(ibmr, struct ipath_mr, ibmr);
+}
+
+static inline struct ipath_pd *to_ipd(struct ib_pd *ibpd)
+{
+ return container_of(ibpd, struct ipath_pd, ibpd);
+}
+
+static inline struct ipath_ah *to_iah(struct ib_ah *ibah)
+{
+ return container_of(ibah, struct ipath_ah, ibah);
+}
+
+static inline struct ipath_cq *to_icq(struct ib_cq *ibcq)
+{
+ return container_of(ibcq, struct ipath_cq, ibcq);
+}
+
+static inline struct ipath_srq *to_isrq(struct ib_srq *ibsrq)
+{
+ return container_of(ibsrq, struct ipath_srq, ibsrq);
+}
+
+static inline struct ipath_qp *to_iqp(struct ib_qp *ibqp)
+{
+ return container_of(ibqp, struct ipath_qp, ibqp);
+}
+
+static inline struct ipath_ibdev *to_idev(struct ib_device *ibdev)
+{
+ return container_of(ibdev, struct ipath_ibdev, ibdev);
+}
+
+/*
+ * This must be called with s_lock held.
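+ * Clearing any wait flags first makes a QP that was waiting (e.g. for a
+ * PIO buffer or an RNR timeout) runnable again before the send tasklet
+ * is scheduled.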
+ */
+static inline void ipath_schedule_send(struct ipath_qp *qp)
+{
+ if (qp->s_flags & IPATH_S_ANY_WAIT)
+ qp->s_flags &= ~IPATH_S_ANY_WAIT;
+ if (!(qp->s_flags & IPATH_S_BUSY))
+ tasklet_hi_schedule(&qp->s_task);
+}
+
+int ipath_process_mad(struct ib_device *ibdev,
+ int mad_flags,
+ u8 port_num,
+ struct ib_wc *in_wc,
+ struct ib_grh *in_grh,
+ struct ib_mad *in_mad, struct ib_mad *out_mad);
+
+/*
+ * Compare the lower 24 bits of the two values.
+ * Returns an integer less than, equal to, or greater than zero.
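+ * The subtraction is shifted left by 8 so only the low 24 bits of the
+ * difference affect the result and the comparison wraps correctly,
+ * e.g. ipath_cmp24(0x000001, 0xffffff) > 0.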
+ */
+static inline int ipath_cmp24(u32 a, u32 b)
+{
+ return (((int) a) - ((int) b)) << 8;
+}
+
+struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid);
+
+int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords,
+ u64 *rwords, u64 *spkts, u64 *rpkts,
+ u64 *xmit_wait);
+
+int ipath_get_counters(struct ipath_devdata *dd,
+ struct ipath_verbs_counters *cntrs);
+
+int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int ipath_mcast_tree_empty(void);
+
+__be32 ipath_compute_aeth(struct ipath_qp *qp);
+
+struct ipath_qp *ipath_lookup_qpn(struct ipath_qp_table *qpt, u32 qpn);
+
+struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata);
+
+int ipath_destroy_qp(struct ib_qp *ibqp);
+
+int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err);
+
+int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata);
+
+int ipath_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_qp_init_attr *init_attr);
+
+unsigned ipath_free_all_qps(struct ipath_qp_table *qpt);
+
+int ipath_init_qp_table(struct ipath_ibdev *idev, int size);
+
+void ipath_get_credit(struct ipath_qp *qp, u32 aeth);
+
+unsigned ipath_ib_rate_to_mult(enum ib_rate rate);
+
+int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr,
+ u32 hdrwords, struct ipath_sge_state *ss, u32 len);
+
+void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length);
+
+void ipath_skip_sge(struct ipath_sge_state *ss, u32 length);
+
+void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+ int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
+
+void ipath_rc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+ int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
+
+void ipath_restart_rc(struct ipath_qp *qp, u32 psn);
+
+void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err);
+
+int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr);
+
+void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
+ int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
+
+int ipath_alloc_lkey(struct ipath_lkey_table *rkt,
+ struct ipath_mregion *mr);
+
+void ipath_free_lkey(struct ipath_lkey_table *rkt, u32 lkey);
+
+int ipath_lkey_ok(struct ipath_qp *qp, struct ipath_sge *isge,
+ struct ib_sge *sge, int acc);
+
+int ipath_rkey_ok(struct ipath_qp *qp, struct ipath_sge_state *ss,
+ u32 len, u64 vaddr, u32 rkey, int acc);
+
+int ipath_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr);
+
+struct ib_srq *ipath_create_srq(struct ib_pd *ibpd,
+ struct ib_srq_init_attr *srq_init_attr,
+ struct ib_udata *udata);
+
+int ipath_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask,
+ struct ib_udata *udata);
+
+int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr);
+
+int ipath_destroy_srq(struct ib_srq *ibsrq);
+
+void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int sig);
+
+int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
+
+struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, int comp_vector,
+ struct ib_ucontext *context,
+ struct ib_udata *udata);
+
+int ipath_destroy_cq(struct ib_cq *ibcq);
+
+int ipath_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags);
+
+int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
+
+struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc);
+
+struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd,
+ struct ib_phys_buf *buffer_list,
+ int num_phys_buf, int acc, u64 *iova_start);
+
+struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt_addr, int mr_access_flags,
+ struct ib_udata *udata);
+
+int ipath_dereg_mr(struct ib_mr *ibmr);
+
+struct ib_fmr *ipath_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr);
+
+int ipath_map_phys_fmr(struct ib_fmr *ibfmr, u64 * page_list,
+ int list_len, u64 iova);
+
+int ipath_unmap_fmr(struct list_head *fmr_list);
+
+int ipath_dealloc_fmr(struct ib_fmr *ibfmr);
+
+void ipath_release_mmap_info(struct kref *ref);
+
+struct ipath_mmap_info *ipath_create_mmap_info(struct ipath_ibdev *dev,
+ u32 size,
+ struct ib_ucontext *context,
+ void *obj);
+
+void ipath_update_mmap_info(struct ipath_ibdev *dev,
+ struct ipath_mmap_info *ip,
+ u32 size, void *obj);
+
+int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
+
+void ipath_insert_rnr_queue(struct ipath_qp *qp);
+
+int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe,
+ u32 *lengthp, struct ipath_sge_state *ss);
+
+int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only);
+
+u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr,
+ struct ib_global_route *grh, u32 hwords, u32 nwords);
+
+void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp,
+ struct ipath_other_headers *ohdr,
+ u32 bth0, u32 bth2);
+
+void ipath_do_send(unsigned long data);
+
+void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe,
+ enum ib_wc_status status);
+
+int ipath_make_rc_req(struct ipath_qp *qp);
+
+int ipath_make_uc_req(struct ipath_qp *qp);
+
+int ipath_make_ud_req(struct ipath_qp *qp);
+
+int ipath_register_ib_device(struct ipath_devdata *);
+
+void ipath_unregister_ib_device(struct ipath_ibdev *);
+
+void ipath_ib_rcv(struct ipath_ibdev *, void *, void *, u32);
+
+int ipath_ib_piobufavail(struct ipath_ibdev *);
+
+unsigned ipath_get_npkeys(struct ipath_devdata *);
+
+u32 ipath_get_cr_errpkey(struct ipath_devdata *);
+
+unsigned ipath_get_pkey(struct ipath_devdata *, unsigned);
+
+extern const enum ib_wc_opcode ib_ipath_wc_opcode[];
+
+/*
+ * Below converts HCA-specific LinkTrainingState to IB PhysPortState
+ * values.
+ */
+extern const u8 ipath_cvt_physportstate[];
+#define IB_PHYSPORTSTATE_SLEEP 1
+#define IB_PHYSPORTSTATE_POLL 2
+#define IB_PHYSPORTSTATE_DISABLED 3
+#define IB_PHYSPORTSTATE_CFG_TRAIN 4
+#define IB_PHYSPORTSTATE_LINKUP 5
+#define IB_PHYSPORTSTATE_LINK_ERR_RECOVER 6
+
+extern const int ib_ipath_state_ops[];
+
+extern unsigned int ib_ipath_lkey_table_size;
+
+extern unsigned int ib_ipath_max_cqes;
+
+extern unsigned int ib_ipath_max_cqs;
+
+extern unsigned int ib_ipath_max_qp_wrs;
+
+extern unsigned int ib_ipath_max_qps;
+
+extern unsigned int ib_ipath_max_sges;
+
+extern unsigned int ib_ipath_max_mcast_grps;
+
+extern unsigned int ib_ipath_max_mcast_qp_attached;
+
+extern unsigned int ib_ipath_max_srqs;
+
+extern unsigned int ib_ipath_max_srq_sges;
+
+extern unsigned int ib_ipath_max_srq_wrs;
+
+extern const u32 ib_ipath_rnr_table[];
+
+extern struct ib_dma_mapping_ops ipath_dma_mapping_ops;
+
+#endif /* IPATH_VERBS_H */
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c
new file mode 100644
index 000000000..6216ea923
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/rculist.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include "ipath_verbs.h"
+
+/*
+ * Global table of GID to attached QPs.
+ * The table is global to all ipath devices since a send from one QP/device
+ * needs to be locally routed to any locally attached QPs on the same
+ * or different device.
+ */
+static struct rb_root mcast_tree;
+static DEFINE_SPINLOCK(mcast_lock);
+
+/**
+ * ipath_mcast_qp_alloc - alloc a struct to link a QP to mcast GID struct
+ * @qp: the QP to link
+ */
+static struct ipath_mcast_qp *ipath_mcast_qp_alloc(struct ipath_qp *qp)
+{
+ struct ipath_mcast_qp *mqp;
+
+ mqp = kmalloc(sizeof *mqp, GFP_KERNEL);
+ if (!mqp)
+ goto bail;
+
+ mqp->qp = qp;
+ atomic_inc(&qp->refcount);
+
+bail:
+ return mqp;
+}
+
+static void ipath_mcast_qp_free(struct ipath_mcast_qp *mqp)
+{
+ struct ipath_qp *qp = mqp->qp;
+
+ /* Notify ipath_destroy_qp() if it is waiting. */
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+
+ kfree(mqp);
+}
+
+/**
+ * ipath_mcast_alloc - allocate the multicast GID structure
+ * @mgid: the multicast GID
+ *
+ * A list of QPs will be attached to this structure.
+ */
+static struct ipath_mcast *ipath_mcast_alloc(union ib_gid *mgid)
+{
+ struct ipath_mcast *mcast;
+
+ mcast = kmalloc(sizeof *mcast, GFP_KERNEL);
+ if (!mcast)
+ goto bail;
+
+ mcast->mgid = *mgid;
+ INIT_LIST_HEAD(&mcast->qp_list);
+ init_waitqueue_head(&mcast->wait);
+ atomic_set(&mcast->refcount, 0);
+ mcast->n_attached = 0;
+
+bail:
+ return mcast;
+}
+
+static void ipath_mcast_free(struct ipath_mcast *mcast)
+{
+ struct ipath_mcast_qp *p, *tmp;
+
+ list_for_each_entry_safe(p, tmp, &mcast->qp_list, list)
+ ipath_mcast_qp_free(p);
+
+ kfree(mcast);
+}
+
+/**
+ * ipath_mcast_find - search the global table for the given multicast GID
+ * @mgid: the multicast GID to search for
+ *
+ * Returns NULL if not found.
+ *
+ * The caller is responsible for decrementing the reference count if found.
+ */
+struct ipath_mcast *ipath_mcast_find(union ib_gid *mgid)
+{
+ struct rb_node *n;
+ unsigned long flags;
+ struct ipath_mcast *mcast;
+
+ spin_lock_irqsave(&mcast_lock, flags);
+ n = mcast_tree.rb_node;
+ while (n) {
+ int ret;
+
+ mcast = rb_entry(n, struct ipath_mcast, rb_node);
+
+ ret = memcmp(mgid->raw, mcast->mgid.raw,
+ sizeof(union ib_gid));
+ if (ret < 0)
+ n = n->rb_left;
+ else if (ret > 0)
+ n = n->rb_right;
+ else {
+ atomic_inc(&mcast->refcount);
+ spin_unlock_irqrestore(&mcast_lock, flags);
+ goto bail;
+ }
+ }
+ spin_unlock_irqrestore(&mcast_lock, flags);
+
+ mcast = NULL;
+
+bail:
+ return mcast;
+}
+
+/**
+ * ipath_mcast_add - insert mcast GID into table and attach QP struct
+ * @dev: the ipath device
+ * @mcast: the mcast GID to insert
+ * @mqp: the QP to attach
+ *
+ * Return zero if both were added. Return EEXIST if the GID was already in
+ * the table but the QP was added. Return ESRCH if the QP was already
+ * attached and neither structure was added. Return ENOMEM if the per-group
+ * QP attach limit or the device's mcast group limit has been reached.
+ */
+static int ipath_mcast_add(struct ipath_ibdev *dev,
+ struct ipath_mcast *mcast,
+ struct ipath_mcast_qp *mqp)
+{
+ struct rb_node **n = &mcast_tree.rb_node;
+ struct rb_node *pn = NULL;
+ int ret;
+
+ spin_lock_irq(&mcast_lock);
+
+ while (*n) {
+ struct ipath_mcast *tmcast;
+ struct ipath_mcast_qp *p;
+
+ pn = *n;
+ tmcast = rb_entry(pn, struct ipath_mcast, rb_node);
+
+ ret = memcmp(mcast->mgid.raw, tmcast->mgid.raw,
+ sizeof(union ib_gid));
+ if (ret < 0) {
+ n = &pn->rb_left;
+ continue;
+ }
+ if (ret > 0) {
+ n = &pn->rb_right;
+ continue;
+ }
+
+ /* Search the QP list to see if this is already there. */
+ list_for_each_entry_rcu(p, &tmcast->qp_list, list) {
+ if (p->qp == mqp->qp) {
+ ret = ESRCH;
+ goto bail;
+ }
+ }
+ if (tmcast->n_attached == ib_ipath_max_mcast_qp_attached) {
+ ret = ENOMEM;
+ goto bail;
+ }
+
+ tmcast->n_attached++;
+
+ list_add_tail_rcu(&mqp->list, &tmcast->qp_list);
+ ret = EEXIST;
+ goto bail;
+ }
+
+ spin_lock(&dev->n_mcast_grps_lock);
+ if (dev->n_mcast_grps_allocated == ib_ipath_max_mcast_grps) {
+ spin_unlock(&dev->n_mcast_grps_lock);
+ ret = ENOMEM;
+ goto bail;
+ }
+
+ dev->n_mcast_grps_allocated++;
+ spin_unlock(&dev->n_mcast_grps_lock);
+
+ mcast->n_attached++;
+
+ list_add_tail_rcu(&mqp->list, &mcast->qp_list);
+
+ atomic_inc(&mcast->refcount);
+ rb_link_node(&mcast->rb_node, pn, n);
+ rb_insert_color(&mcast->rb_node, &mcast_tree);
+
+ ret = 0;
+
+bail:
+ spin_unlock_irq(&mcast_lock);
+
+ return ret;
+}
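+
+/*
+ * Illustrative note on how the caller treats these positive status codes
+ * (see the switch in ipath_multicast_attach() below): 0 and EEXIST both
+ * end up as success, EEXIST meaning only the unused mcast structure is
+ * freed, while ESRCH becomes -EINVAL and ENOMEM becomes -ENOMEM.
+ */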
+
+int ipath_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ struct ipath_qp *qp = to_iqp(ibqp);
+ struct ipath_ibdev *dev = to_idev(ibqp->device);
+ struct ipath_mcast *mcast;
+ struct ipath_mcast_qp *mqp;
+ int ret;
+
+ /*
+ * Allocate the data structures first since it's better to do this
+ * outside of spin locks and they will most likely be needed.
+ */
+ mcast = ipath_mcast_alloc(gid);
+ if (mcast == NULL) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+ mqp = ipath_mcast_qp_alloc(qp);
+ if (mqp == NULL) {
+ ipath_mcast_free(mcast);
+ ret = -ENOMEM;
+ goto bail;
+ }
+ switch (ipath_mcast_add(dev, mcast, mqp)) {
+ case ESRCH:
+ /* Neither was used: can't attach the same QP twice. */
+ ipath_mcast_qp_free(mqp);
+ ipath_mcast_free(mcast);
+ ret = -EINVAL;
+ goto bail;
+ case EEXIST: /* The mcast wasn't used */
+ ipath_mcast_free(mcast);
+ break;
+ case ENOMEM:
+ /* Exceeded the maximum number of mcast groups. */
+ ipath_mcast_qp_free(mqp);
+ ipath_mcast_free(mcast);
+ ret = -ENOMEM;
+ goto bail;
+ default:
+ break;
+ }
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+int ipath_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ struct ipath_qp *qp = to_iqp(ibqp);
+ struct ipath_ibdev *dev = to_idev(ibqp->device);
+ struct ipath_mcast *mcast = NULL;
+ struct ipath_mcast_qp *p, *tmp;
+ struct rb_node *n;
+ int last = 0;
+ int ret;
+
+ spin_lock_irq(&mcast_lock);
+
+ /* Find the GID in the mcast table. */
+ n = mcast_tree.rb_node;
+ while (1) {
+ if (n == NULL) {
+ spin_unlock_irq(&mcast_lock);
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ mcast = rb_entry(n, struct ipath_mcast, rb_node);
+ ret = memcmp(gid->raw, mcast->mgid.raw,
+ sizeof(union ib_gid));
+ if (ret < 0)
+ n = n->rb_left;
+ else if (ret > 0)
+ n = n->rb_right;
+ else
+ break;
+ }
+
+ /* Search the QP list. */
+ list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) {
+ if (p->qp != qp)
+ continue;
+ /*
+ * We found it, so remove it, but don't poison the forward
+ * link until we are sure there are no list walkers.
+ */
+ list_del_rcu(&p->list);
+ mcast->n_attached--;
+
+ /* If this was the last attached QP, remove the GID too. */
+ if (list_empty(&mcast->qp_list)) {
+ rb_erase(&mcast->rb_node, &mcast_tree);
+ last = 1;
+ }
+ break;
+ }
+
+ spin_unlock_irq(&mcast_lock);
+
+ if (p) {
+ /*
+ * Wait for any list walkers to finish before freeing the
+ * list element.
+ */
+ wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1);
+ ipath_mcast_qp_free(p);
+ }
+ if (last) {
+ atomic_dec(&mcast->refcount);
+ wait_event(mcast->wait, !atomic_read(&mcast->refcount));
+ ipath_mcast_free(mcast);
+ spin_lock_irq(&dev->n_mcast_grps_lock);
+ dev->n_mcast_grps_allocated--;
+ spin_unlock_irq(&dev->n_mcast_grps_lock);
+ }
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+int ipath_mcast_tree_empty(void)
+{
+ return mcast_tree.rb_node == NULL;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_wc_ppc64.c b/drivers/infiniband/hw/ipath/ipath_wc_ppc64.c
new file mode 100644
index 000000000..1a7e20a75
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_wc_ppc64.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file is conditionally built on PowerPC only. Otherwise weak symbol
+ * versions of the functions exported from here are used.
+ */
+
+#include "ipath_kernel.h"
+
+/**
+ * ipath_enable_wc - enable write combining for MMIO writes to the device
+ * @dd: infinipath device
+ *
+ * Nothing to do on PowerPC, so just return without error.
+ */
+int ipath_enable_wc(struct ipath_devdata *dd)
+{
+ return 0;
+}
diff --git a/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c b/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c
new file mode 100644
index 000000000..4ad0b932d
--- /dev/null
+++ b/drivers/infiniband/hw/ipath/ipath_wc_x86_64.c
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2006, 2007 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file is conditionally built on x86_64 only. Otherwise weak symbol
+ * versions of the functions exported from here are used.
+ */
+
+#include <linux/pci.h>
+#include <asm/mtrr.h>
+#include <asm/processor.h>
+
+#include "ipath_kernel.h"
+
+/**
+ * ipath_enable_wc - enable write combining for MMIO writes to the device
+ * @dd: infinipath device
+ *
+ * This routine is x86_64-specific; it twiddles the CPU's MTRRs to enable
+ * write combining.
+ */
+int ipath_enable_wc(struct ipath_devdata *dd)
+{
+ int ret = 0;
+ u64 pioaddr, piolen;
+ unsigned bits;
+ const unsigned long addr = pci_resource_start(dd->pcidev, 0);
+ const size_t len = pci_resource_len(dd->pcidev, 0);
+
+ /*
+ * Set the PIO buffers to be WCCOMB, so we get HT bursts to the
+ * chip. Linux (possibly the hardware) requires it to be on a power
+ * of 2 address matching the length (which has to be a power of 2).
+ * For rev1, that means the base address, for rev2, it will be just
+ * the PIO buffers themselves.
+ * For chips with two sets of buffers, the calculations are
+ * somewhat more complicated; we need to sum, and the piobufbase
+ * register has both offsets, 2K in low 32 bits, 4K in high 32 bits.
+ * The buffers are still packed, so a single range covers both.
+ */
+ if (dd->ipath_piobcnt2k && dd->ipath_piobcnt4k) { /* 2 sizes */
+ unsigned long pio2kbase, pio4kbase;
+ pio2kbase = dd->ipath_piobufbase & 0xffffffffUL;
+ pio4kbase = (dd->ipath_piobufbase >> 32) & 0xffffffffUL;
+ if (pio2kbase < pio4kbase) { /* all, for now */
+ pioaddr = addr + pio2kbase;
+ piolen = pio4kbase - pio2kbase +
+ dd->ipath_piobcnt4k * dd->ipath_4kalign;
+ } else {
+ pioaddr = addr + pio4kbase;
+ piolen = pio2kbase - pio4kbase +
+ dd->ipath_piobcnt2k * dd->ipath_palign;
+ }
+ } else { /* single buffer size (2K, currently) */
+ pioaddr = addr + dd->ipath_piobufbase;
+ piolen = dd->ipath_piobcnt2k * dd->ipath_palign +
+ dd->ipath_piobcnt4k * dd->ipath_4kalign;
+ }
+
+ for (bits = 0; !(piolen & (1ULL << bits)); bits++)
+ /* do nothing */ ;
+
+ if (piolen != (1ULL << bits)) {
+ piolen >>= bits;
+ while (piolen >>= 1)
+ bits++;
+ piolen = 1ULL << (bits + 1);
+ }
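+ /*
+ * Worked example of the rounding above (value chosen for illustration):
+ * piolen = 0x9000 has its lowest set bit at 12 and is not a power of 2,
+ * so it is shifted down to 0x9, bits ends up as 15, and piolen becomes
+ * 1ULL << 16 = 0x10000, the next power of 2 large enough to cover the
+ * original length.
+ */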
+ if (pioaddr & (piolen - 1)) {
+ u64 atmp;
+ ipath_dbg("pioaddr %llx not on right boundary for size "
+ "%llx, fixing\n",
+ (unsigned long long) pioaddr,
+ (unsigned long long) piolen);
+ atmp = pioaddr & ~(piolen - 1);
+ if (atmp < addr || (atmp + piolen) > (addr + len)) {
+ ipath_dev_err(dd, "No way to align address/size "
+ "(%llx/%llx), no WC mtrr\n",
+ (unsigned long long) atmp,
+ (unsigned long long) piolen << 1);
+ ret = -ENODEV;
+ } else {
+ ipath_dbg("changing WC base from %llx to %llx, "
+ "len from %llx to %llx\n",
+ (unsigned long long) pioaddr,
+ (unsigned long long) atmp,
+ (unsigned long long) piolen,
+ (unsigned long long) piolen << 1);
+ pioaddr = atmp;
+ piolen <<= 1;
+ }
+ }
+
+ if (!ret) {
+ int cookie;
+ ipath_cdbg(VERBOSE, "Setting mtrr for chip to WC "
+ "(addr %llx, len=0x%llx)\n",
+ (unsigned long long) pioaddr,
+ (unsigned long long) piolen);
+ cookie = mtrr_add(pioaddr, piolen, MTRR_TYPE_WRCOMB, 0);
+ if (cookie < 0) {
+ dev_info(&dd->pcidev->dev,
+ "mtrr_add() WC for PIO bufs failed (%d)\n",
+ cookie);
+ ret = -EINVAL;
+ } else {
+ ipath_cdbg(VERBOSE, "Set mtrr for chip to WC, "
+ "cookie is %d\n", cookie);
+ dd->ipath_wc_cookie = cookie;
+ dd->ipath_wc_base = (unsigned long) pioaddr;
+ dd->ipath_wc_len = (unsigned long) piolen;
+ }
+ }
+
+ return ret;
+}
+
+/**
+ * ipath_disable_wc - disable write combining for MMIO writes to the device
+ * @dd: infinipath device
+ */
+void ipath_disable_wc(struct ipath_devdata *dd)
+{
+ if (dd->ipath_wc_cookie) {
+ int r;
+ ipath_cdbg(VERBOSE, "undoing WCCOMB on pio buffers\n");
+ r = mtrr_del(dd->ipath_wc_cookie, dd->ipath_wc_base,
+ dd->ipath_wc_len);
+ if (r < 0)
+ dev_info(&dd->pcidev->dev,
+ "mtrr_del(%lx, %lx, %lx) failed: %d\n",
+ dd->ipath_wc_cookie, dd->ipath_wc_base,
+ dd->ipath_wc_len, r);
+ dd->ipath_wc_cookie = 0; /* even on failure */
+ }
+}
diff --git a/drivers/infiniband/hw/mlx4/Kconfig b/drivers/infiniband/hw/mlx4/Kconfig
new file mode 100644
index 000000000..fc01deac1
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/Kconfig
@@ -0,0 +1,10 @@
+config MLX4_INFINIBAND
+ tristate "Mellanox ConnectX HCA support"
+ depends on NETDEVICES && ETHERNET && PCI && INET
+ select NET_VENDOR_MELLANOX
+ select MLX4_CORE
+ ---help---
+ This driver provides low-level InfiniBand support for
+ Mellanox ConnectX PCI Express host channel adapters (HCAs).
+ This is required to use InfiniBand protocols such as
+ IP-over-IB or SRP with these devices.
diff --git a/drivers/infiniband/hw/mlx4/Makefile b/drivers/infiniband/hw/mlx4/Makefile
new file mode 100644
index 000000000..f4213b3a8
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_MLX4_INFINIBAND) += mlx4_ib.o
+
+mlx4_ib-y := ah.o cq.o doorbell.o mad.o main.o mr.o qp.o srq.o mcg.o cm.o alias_GUID.o sysfs.o
diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c
new file mode 100644
index 000000000..f50a54622
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/ah.c
@@ -0,0 +1,178 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_addr.h>
+#include <rdma/ib_cache.h>
+
+#include <linux/slab.h>
+#include <linux/inet.h>
+#include <linux/string.h>
+#include <linux/mlx4/driver.h>
+
+#include "mlx4_ib.h"
+
+static struct ib_ah *create_ib_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr,
+ struct mlx4_ib_ah *ah)
+{
+ struct mlx4_dev *dev = to_mdev(pd->device)->dev;
+
+ ah->av.ib.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24));
+ ah->av.ib.g_slid = ah_attr->src_path_bits;
+ if (ah_attr->ah_flags & IB_AH_GRH) {
+ ah->av.ib.g_slid |= 0x80;
+ ah->av.ib.gid_index = ah_attr->grh.sgid_index;
+ ah->av.ib.hop_limit = ah_attr->grh.hop_limit;
+ ah->av.ib.sl_tclass_flowlabel |=
+ cpu_to_be32((ah_attr->grh.traffic_class << 20) |
+ ah_attr->grh.flow_label);
+ memcpy(ah->av.ib.dgid, ah_attr->grh.dgid.raw, 16);
+ }
+
+ ah->av.ib.dlid = cpu_to_be16(ah_attr->dlid);
+ if (ah_attr->static_rate) {
+ ah->av.ib.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET;
+ while (ah->av.ib.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
+ !(1 << ah->av.ib.stat_rate & dev->caps.stat_rate_support))
+ --ah->av.ib.stat_rate;
+ }
+ ah->av.ib.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28);
+
+ return &ah->ibah;
+}
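+
+/*
+ * Layout note (illustrative values): port_pd packs the PD number into
+ * the low 24 bits and the port number into bits 31:24, e.g. pdn 0xabcd
+ * on port 2 gives cpu_to_be32(0x0200abcd).  sl_tclass_flowlabel carries
+ * the SL in bits 31:28, the traffic class in bits 27:20 and the flow
+ * label in bits 19:0.
+ */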
+
+static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr,
+ struct mlx4_ib_ah *ah)
+{
+ struct mlx4_ib_dev *ibdev = to_mdev(pd->device);
+ struct mlx4_dev *dev = ibdev->dev;
+ int is_mcast = 0;
+ struct in6_addr in6;
+ u16 vlan_tag;
+
+ memcpy(&in6, ah_attr->grh.dgid.raw, sizeof(in6));
+ if (rdma_is_multicast_addr(&in6)) {
+ is_mcast = 1;
+ rdma_get_mcast_mac(&in6, ah->av.eth.mac);
+ } else {
+ memcpy(ah->av.eth.mac, ah_attr->dmac, ETH_ALEN);
+ }
+ vlan_tag = ah_attr->vlan_id;
+ if (vlan_tag < 0x1000)
+ vlan_tag |= (ah_attr->sl & 7) << 13;
+ ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24));
+ ah->av.eth.gid_index = ah_attr->grh.sgid_index;
+ ah->av.eth.vlan = cpu_to_be16(vlan_tag);
+ if (ah_attr->static_rate) {
+ ah->av.eth.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET;
+ while (ah->av.eth.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
+ !(1 << ah->av.eth.stat_rate & dev->caps.stat_rate_support))
+ --ah->av.eth.stat_rate;
+ }
+
+ /*
+ * HW requires a multicast LID so we just choose one.
+ */
+ if (is_mcast)
+ ah->av.ib.dlid = cpu_to_be16(0xc000);
+
+ memcpy(ah->av.eth.dgid, ah_attr->grh.dgid.raw, 16);
+ ah->av.eth.sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 29);
+
+ return &ah->ibah;
+}
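+
+/*
+ * Illustrative example of the VLAN packing above: vlan_id = 100 and
+ * sl = 5 give vlan = 0x0064 | (5 << 13) = 0xa064, i.e. the 12-bit VLAN
+ * ID in the low bits and the 3-bit priority in bits 15:13, while the SL
+ * itself lands in bits 31:29 of the Ethernet AV's sl_tclass_flowlabel.
+ */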
+
+struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+ struct mlx4_ib_ah *ah;
+ struct ib_ah *ret;
+
+ ah = kzalloc(sizeof *ah, GFP_ATOMIC);
+ if (!ah)
+ return ERR_PTR(-ENOMEM);
+
+ if (rdma_port_get_link_layer(pd->device, ah_attr->port_num) == IB_LINK_LAYER_ETHERNET) {
+ if (!(ah_attr->ah_flags & IB_AH_GRH)) {
+ ret = ERR_PTR(-EINVAL);
+ } else {
+ /*
+ * TBD: need to handle the case when we get
+ * called in an atomic context and there we
+ * might sleep. We don't expect this
+ * currently since we're working with link
+ * local addresses which we can translate
+ * without going to sleep.
+ */
+ ret = create_iboe_ah(pd, ah_attr, ah);
+ }
+
+ if (IS_ERR(ret))
+ kfree(ah);
+
+ return ret;
+ } else
+ return create_ib_ah(pd, ah_attr, ah); /* never fails */
+}
+
+int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
+{
+ struct mlx4_ib_ah *ah = to_mah(ibah);
+ enum rdma_link_layer ll;
+
+ memset(ah_attr, 0, sizeof *ah_attr);
+ ah_attr->sl = be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
+ ah_attr->port_num = be32_to_cpu(ah->av.ib.port_pd) >> 24;
+ ll = rdma_port_get_link_layer(ibah->device, ah_attr->port_num);
+ ah_attr->dlid = ll == IB_LINK_LAYER_INFINIBAND ? be16_to_cpu(ah->av.ib.dlid) : 0;
+ if (ah->av.ib.stat_rate)
+ ah_attr->static_rate = ah->av.ib.stat_rate - MLX4_STAT_RATE_OFFSET;
+ ah_attr->src_path_bits = ah->av.ib.g_slid & 0x7F;
+
+ if (mlx4_ib_ah_grh_present(ah)) {
+ ah_attr->ah_flags = IB_AH_GRH;
+
+ ah_attr->grh.traffic_class =
+ be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20;
+ ah_attr->grh.flow_label =
+ be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) & 0xfffff;
+ ah_attr->grh.hop_limit = ah->av.ib.hop_limit;
+ ah_attr->grh.sgid_index = ah->av.ib.gid_index;
+ memcpy(ah_attr->grh.dgid.raw, ah->av.ib.dgid, 16);
+ }
+
+ return 0;
+}
+
+int mlx4_ib_destroy_ah(struct ib_ah *ah)
+{
+ kfree(to_mah(ah));
+ return 0;
+}
diff --git a/drivers/infiniband/hw/mlx4/alias_GUID.c b/drivers/infiniband/hw/mlx4/alias_GUID.c
new file mode 100644
index 000000000..0f00204d2
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/alias_GUID.c
@@ -0,0 +1,901 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/***********************************************************/
+/* This file supports the handling of the Alias GUID feature. */
+/***********************************************************/
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_sa.h>
+#include <rdma/ib_pack.h>
+#include <linux/mlx4/cmd.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <rdma/ib_user_verbs.h>
+#include <linux/delay.h>
+#include "mlx4_ib.h"
+
+/*
+The driver keeps the current state of all GUIDs as they are in the HW.
+Whenever an SMP MAD GUIDInfo record is received, the data is cached.
+*/
+
+struct mlx4_alias_guid_work_context {
+ u8 port;
+ struct mlx4_ib_dev *dev;
+ struct ib_sa_query *sa_query;
+ struct completion done;
+ int query_id;
+ struct list_head list;
+ int block_num;
+ ib_sa_comp_mask guid_indexes;
+ u8 method;
+};
+
+struct mlx4_next_alias_guid_work {
+ u8 port;
+ u8 block_num;
+ u8 method;
+ struct mlx4_sriov_alias_guid_info_rec_det rec_det;
+};
+
+static int get_low_record_time_index(struct mlx4_ib_dev *dev, u8 port,
+ int *resched_delay_sec);
+
+void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev, int block_num,
+ u8 port_num, u8 *p_data)
+{
+ int i;
+ u64 guid_indexes;
+ int slave_id;
+ int port_index = port_num - 1;
+
+ if (!mlx4_is_master(dev->dev))
+ return;
+
+ guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid.
+ ports_guid[port_num - 1].
+ all_rec_per_port[block_num].guid_indexes);
+ pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num, guid_indexes);
+
+ for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) {
+ /* The location of the specific index starts at bit number 4
+ * and runs through bit number 11 */
+ if (test_bit(i + 4, (unsigned long *)&guid_indexes)) {
+ slave_id = (block_num * NUM_ALIAS_GUID_IN_REC) + i;
+ if (slave_id >= dev->dev->num_slaves) {
+ pr_debug("The last slave: %d\n", slave_id);
+ return;
+ }
+
+ /* cache the guid: */
+ memcpy(&dev->sriov.demux[port_index].guid_cache[slave_id],
+ &p_data[i * GUID_REC_SIZE],
+ GUID_REC_SIZE);
+ } else
+ pr_debug("Guid number: %d in block: %d"
+ " was not updated\n", i, block_num);
+ }
+}
+
+static __be64 get_cached_alias_guid(struct mlx4_ib_dev *dev, int port, int index)
+{
+ if (index >= NUM_ALIAS_GUID_PER_PORT) {
+ pr_err("%s: ERROR: asked for index:%d\n", __func__, index);
+ return (__force __be64) -1;
+ }
+ return *(__be64 *)&dev->sriov.demux[port - 1].guid_cache[index];
+}
+
+
+ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index)
+{
+ return IB_SA_COMP_MASK(4 + index);
+}
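+
+/*
+ * Illustrative mapping for the helper above: GUID index 0 maps to
+ * IB_SA_COMP_MASK(4) and index 7 to IB_SA_COMP_MASK(11), so the eight
+ * GUIDs of a GUIDInfo record occupy component-mask bits 4..11, which is
+ * what the test_bit(i + 4, ...) checks in this file rely on.
+ */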
+
+void mlx4_ib_slave_alias_guid_event(struct mlx4_ib_dev *dev, int slave,
+ int port, int slave_init)
+{
+ __be64 curr_guid, required_guid;
+ int record_num = slave / 8;
+ int index = slave % 8;
+ int port_index = port - 1;
+ unsigned long flags;
+ int do_work = 0;
+
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags);
+ if (dev->sriov.alias_guid.ports_guid[port_index].state_flags &
+ GUID_STATE_NEED_PORT_INIT)
+ goto unlock;
+ if (!slave_init) {
+ curr_guid = *(__be64 *)&dev->sriov.
+ alias_guid.ports_guid[port_index].
+ all_rec_per_port[record_num].
+ all_recs[GUID_REC_SIZE * index];
+ if (curr_guid == cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL) ||
+ !curr_guid)
+ goto unlock;
+ required_guid = cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL);
+ } else {
+ required_guid = mlx4_get_admin_guid(dev->dev, slave, port);
+ if (required_guid == cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL))
+ goto unlock;
+ }
+ *(__be64 *)&dev->sriov.alias_guid.ports_guid[port_index].
+ all_rec_per_port[record_num].
+ all_recs[GUID_REC_SIZE * index] = required_guid;
+ dev->sriov.alias_guid.ports_guid[port_index].
+ all_rec_per_port[record_num].guid_indexes
+ |= mlx4_ib_get_aguid_comp_mask_from_ix(index);
+ dev->sriov.alias_guid.ports_guid[port_index].
+ all_rec_per_port[record_num].status
+ = MLX4_GUID_INFO_STATUS_IDLE;
+ /* set to run immediately */
+ dev->sriov.alias_guid.ports_guid[port_index].
+ all_rec_per_port[record_num].time_to_run = 0;
+ dev->sriov.alias_guid.ports_guid[port_index].
+ all_rec_per_port[record_num].
+ guids_retry_schedule[index] = 0;
+ do_work = 1;
+unlock:
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags);
+
+ if (do_work)
+ mlx4_ib_init_alias_guid_work(dev, port_index);
+}
+
+/*
+ * Whenever a new GUID is set/unset (GUID table change), create an event
+ * and notify the relevant slave (the master should also be notified).
+ * If the GUID value is not the same as the one in the cache, the slave
+ * will not be updated; in that case it waits for smp_snoop or the port
+ * management event to call this function and update the slave.
+ * block_number - the index of the block (16 blocks available)
+ * port_number - 1 or 2
+ */
+void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev,
+ int block_num, u8 port_num,
+ u8 *p_data)
+{
+ int i;
+ u64 guid_indexes;
+ int slave_id;
+ enum slave_port_state new_state;
+ enum slave_port_state prev_state;
+ __be64 tmp_cur_ag, form_cache_ag;
+ enum slave_port_gen_event gen_event;
+ struct mlx4_sriov_alias_guid_info_rec_det *rec;
+ unsigned long flags;
+ __be64 required_value;
+
+ if (!mlx4_is_master(dev->dev))
+ return;
+
+ rec = &dev->sriov.alias_guid.ports_guid[port_num - 1].
+ all_rec_per_port[block_num];
+ guid_indexes = be64_to_cpu((__force __be64) dev->sriov.alias_guid.
+ ports_guid[port_num - 1].
+ all_rec_per_port[block_num].guid_indexes);
+ pr_debug("port: %d, guid_indexes: 0x%llx\n", port_num, guid_indexes);
+
+ /*calculate the slaves and notify them*/
+ for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) {
+ /* the location of the specific index runs from bits 4..11 */
+ if (!(test_bit(i + 4, (unsigned long *)&guid_indexes)))
+ continue;
+
+ slave_id = (block_num * NUM_ALIAS_GUID_IN_REC) + i;
+ if (slave_id >= dev->dev->persist->num_vfs + 1)
+ return;
+ tmp_cur_ag = *(__be64 *)&p_data[i * GUID_REC_SIZE];
+ form_cache_ag = get_cached_alias_guid(dev, port_num,
+ (NUM_ALIAS_GUID_IN_REC * block_num) + i);
+ /*
+ * Check whether the GUID is the same as in the cache;
+ * if it is different, wait for smp_snoop or the port mgmt
+ * change event to update the slave on its port state change.
+ */
+ if (tmp_cur_ag != form_cache_ag)
+ continue;
+
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags);
+ required_value = *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE];
+
+ if (required_value == cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL))
+ required_value = 0;
+
+ if (tmp_cur_ag == required_value) {
+ rec->guid_indexes = rec->guid_indexes &
+ ~mlx4_ib_get_aguid_comp_mask_from_ix(i);
+ } else {
+ /* may notify port down if value is 0 */
+ if (tmp_cur_ag != MLX4_NOT_SET_GUID) {
+ spin_unlock_irqrestore(&dev->sriov.
+ alias_guid.ag_work_lock, flags);
+ continue;
+ }
+ }
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock,
+ flags);
+ mlx4_gen_guid_change_eqe(dev->dev, slave_id, port_num);
+ /*2 cases: Valid GUID, and Invalid Guid*/
+
+ if (tmp_cur_ag != MLX4_NOT_SET_GUID) { /*valid GUID*/
+ prev_state = mlx4_get_slave_port_state(dev->dev, slave_id, port_num);
+ new_state = set_and_calc_slave_port_state(dev->dev, slave_id, port_num,
+ MLX4_PORT_STATE_IB_PORT_STATE_EVENT_GID_VALID,
+ &gen_event);
+ pr_debug("slave: %d, port: %d prev_port_state: %d,"
+ " new_port_state: %d, gen_event: %d\n",
+ slave_id, port_num, prev_state, new_state, gen_event);
+ if (gen_event == SLAVE_PORT_GEN_EVENT_UP) {
+ pr_debug("sending PORT_UP event to slave: %d, port: %d\n",
+ slave_id, port_num);
+ mlx4_gen_port_state_change_eqe(dev->dev, slave_id,
+ port_num, MLX4_PORT_CHANGE_SUBTYPE_ACTIVE);
+ }
+ } else { /* request to invalidate GUID */
+ set_and_calc_slave_port_state(dev->dev, slave_id, port_num,
+ MLX4_PORT_STATE_IB_EVENT_GID_INVALID,
+ &gen_event);
+ if (gen_event == SLAVE_PORT_GEN_EVENT_DOWN) {
+ pr_debug("sending PORT DOWN event to slave: %d, port: %d\n",
+ slave_id, port_num);
+ mlx4_gen_port_state_change_eqe(dev->dev,
+ slave_id,
+ port_num,
+ MLX4_PORT_CHANGE_SUBTYPE_DOWN);
+ }
+ }
+ }
+}
+
+static void aliasguid_query_handler(int status,
+ struct ib_sa_guidinfo_rec *guid_rec,
+ void *context)
+{
+ struct mlx4_ib_dev *dev;
+ struct mlx4_alias_guid_work_context *cb_ctx = context;
+ u8 port_index;
+ int i;
+ struct mlx4_sriov_alias_guid_info_rec_det *rec;
+ unsigned long flags, flags1;
+ ib_sa_comp_mask declined_guid_indexes = 0;
+ ib_sa_comp_mask applied_guid_indexes = 0;
+ unsigned int resched_delay_sec = 0;
+
+ if (!context)
+ return;
+
+ dev = cb_ctx->dev;
+ port_index = cb_ctx->port - 1;
+ rec = &dev->sriov.alias_guid.ports_guid[port_index].
+ all_rec_per_port[cb_ctx->block_num];
+
+ if (status) {
+ pr_debug("(port: %d) failed: status = %d\n",
+ cb_ctx->port, status);
+ rec->time_to_run = ktime_get_real_ns() + 1 * NSEC_PER_SEC;
+ goto out;
+ }
+
+ if (guid_rec->block_num != cb_ctx->block_num) {
+ pr_err("block num mismatch: %d != %d\n",
+ cb_ctx->block_num, guid_rec->block_num);
+ goto out;
+ }
+
+ pr_debug("lid/port: %d/%d, block_num: %d\n",
+ be16_to_cpu(guid_rec->lid), cb_ctx->port,
+ guid_rec->block_num);
+
+ rec = &dev->sriov.alias_guid.ports_guid[port_index].
+ all_rec_per_port[guid_rec->block_num];
+
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags);
+ for (i = 0 ; i < NUM_ALIAS_GUID_IN_REC; i++) {
+ __be64 sm_response, required_val;
+
+ if (!(cb_ctx->guid_indexes &
+ mlx4_ib_get_aguid_comp_mask_from_ix(i)))
+ continue;
+ sm_response = *(__be64 *)&guid_rec->guid_info_list
+ [i * GUID_REC_SIZE];
+ required_val = *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE];
+ if (cb_ctx->method == MLX4_GUID_INFO_RECORD_DELETE) {
+ if (required_val ==
+ cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL))
+ goto next_entry;
+
+ /* A new value was set till we got the response */
+ pr_debug("need to set new value %llx, record num %d, block_num:%d\n",
+ be64_to_cpu(required_val),
+ i, guid_rec->block_num);
+ goto entry_declined;
+ }
+
+ /* check if the SM didn't assign one of the records;
+ * if it didn't, ask for it again.
+ */
+ if (sm_response == MLX4_NOT_SET_GUID) {
+ if (rec->guids_retry_schedule[i] == 0)
+ mlx4_ib_warn(&dev->ib_dev,
+ "%s:Record num %d in block_num: %d was declined by SM\n",
+ __func__, i,
+ guid_rec->block_num);
+ goto entry_declined;
+ } else {
+ /* properly assigned record. */
+ /* Save the GUID we just got from the SM as the admin
+ * GUID so that it is persistent; subsequent requests to
+ * the SM will ask for this same GUID. */
+ if (required_val &&
+ sm_response != required_val) {
+ /* Warn only on first retry */
+ if (rec->guids_retry_schedule[i] == 0)
+ mlx4_ib_warn(&dev->ib_dev, "%s: Failed to set"
+ " admin guid after SysAdmin "
+ "configuration. "
+ "Record num %d in block_num:%d "
+ "was declined by SM, "
+ "new val(0x%llx) was kept, SM returned (0x%llx)\n",
+ __func__, i,
+ guid_rec->block_num,
+ be64_to_cpu(required_val),
+ be64_to_cpu(sm_response));
+ goto entry_declined;
+ } else {
+ *(__be64 *)&rec->all_recs[i * GUID_REC_SIZE] =
+ sm_response;
+ if (required_val == 0)
+ mlx4_set_admin_guid(dev->dev,
+ sm_response,
+ (guid_rec->block_num
+ * NUM_ALIAS_GUID_IN_REC) + i,
+ cb_ctx->port);
+ goto next_entry;
+ }
+ }
+entry_declined:
+ declined_guid_indexes |= mlx4_ib_get_aguid_comp_mask_from_ix(i);
+ rec->guids_retry_schedule[i] =
+ (rec->guids_retry_schedule[i] == 0) ? 1 :
+ min((unsigned int)60,
+ rec->guids_retry_schedule[i] * 2);
+ /* using the minimum value among all entries in that record */
+ resched_delay_sec = (resched_delay_sec == 0) ?
+ rec->guids_retry_schedule[i] :
+ min(resched_delay_sec,
+ rec->guids_retry_schedule[i]);
+ continue;
+
+next_entry:
+ rec->guids_retry_schedule[i] = 0;
+ }
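+ /*
+ * Illustrative retry behaviour of the loop above: an entry that keeps
+ * being declined is rescheduled after 1, 2, 4, 8, 16, 32 and then a
+ * capped 60 seconds, and resched_delay_sec ends up as the smallest of
+ * those delays across the entries of this record.
+ */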
+
+ applied_guid_indexes = cb_ctx->guid_indexes & ~declined_guid_indexes;
+ if (declined_guid_indexes ||
+ rec->guid_indexes & ~(applied_guid_indexes)) {
+ pr_debug("record=%d wasn't fully set, guid_indexes=0x%llx applied_indexes=0x%llx, declined_indexes=0x%llx\n",
+ guid_rec->block_num,
+ be64_to_cpu((__force __be64)rec->guid_indexes),
+ be64_to_cpu((__force __be64)applied_guid_indexes),
+ be64_to_cpu((__force __be64)declined_guid_indexes));
+ rec->time_to_run = ktime_get_real_ns() +
+ resched_delay_sec * NSEC_PER_SEC;
+ } else {
+ rec->status = MLX4_GUID_INFO_STATUS_SET;
+ }
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags);
+ /*
+ The function is called here to cover the cases where the
+ SM doesn't send an SMP; in that case the driver notifies
+ the slave from the SA response.
+ */
+ mlx4_ib_notify_slaves_on_guid_change(dev, guid_rec->block_num,
+ cb_ctx->port,
+ guid_rec->guid_info_list);
+out:
+ spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ if (!dev->sriov.is_going_down) {
+ get_low_record_time_index(dev, port_index, &resched_delay_sec);
+ queue_delayed_work(dev->sriov.alias_guid.ports_guid[port_index].wq,
+ &dev->sriov.alias_guid.ports_guid[port_index].
+ alias_guid_work,
+ msecs_to_jiffies(resched_delay_sec * 1000));
+ }
+ if (cb_ctx->sa_query) {
+ list_del(&cb_ctx->list);
+ kfree(cb_ctx);
+ } else
+ complete(&cb_ctx->done);
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+}
+
+static void invalidate_guid_record(struct mlx4_ib_dev *dev, u8 port, int index)
+{
+ int i;
+ u64 cur_admin_val;
+ ib_sa_comp_mask comp_mask = 0;
+
+ dev->sriov.alias_guid.ports_guid[port - 1].all_rec_per_port[index].status
+ = MLX4_GUID_INFO_STATUS_SET;
+
+ /* calculate the comp_mask for that record.*/
+ for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) {
+ cur_admin_val =
+ *(u64 *)&dev->sriov.alias_guid.ports_guid[port - 1].
+ all_rec_per_port[index].all_recs[GUID_REC_SIZE * i];
+ /*
+ check the admin value: if it is marked for delete (~00LL), or
+ it is the first GUID of the first record (the HW GUID), or
+ the record is not owned by the sysadmin and the SM doesn't
+ need to assign GUIDs, then don't put it up for assignment.
+ */
+ if (MLX4_GUID_FOR_DELETE_VAL == cur_admin_val ||
+ (!index && !i))
+ continue;
+ comp_mask |= mlx4_ib_get_aguid_comp_mask_from_ix(i);
+ }
+ dev->sriov.alias_guid.ports_guid[port - 1].
+ all_rec_per_port[index].guid_indexes |= comp_mask;
+ if (dev->sriov.alias_guid.ports_guid[port - 1].
+ all_rec_per_port[index].guid_indexes)
+ dev->sriov.alias_guid.ports_guid[port - 1].
+ all_rec_per_port[index].status = MLX4_GUID_INFO_STATUS_IDLE;
+
+}
+
+static int set_guid_rec(struct ib_device *ibdev,
+ struct mlx4_next_alias_guid_work *rec)
+{
+ int err;
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct ib_sa_guidinfo_rec guid_info_rec;
+ ib_sa_comp_mask comp_mask;
+ struct ib_port_attr attr;
+ struct mlx4_alias_guid_work_context *callback_context;
+ unsigned long resched_delay, flags, flags1;
+ u8 port = rec->port + 1;
+ int index = rec->block_num;
+ struct mlx4_sriov_alias_guid_info_rec_det *rec_det = &rec->rec_det;
+ struct list_head *head =
+ &dev->sriov.alias_guid.ports_guid[port - 1].cb_list;
+
+ err = __mlx4_ib_query_port(ibdev, port, &attr, 1);
+ if (err) {
+ pr_debug("mlx4_ib_query_port failed (err: %d), port: %d\n",
+ err, port);
+ return err;
+ }
+ /* check that the port was configured by the SM, otherwise no need to send */
+ if (attr.state != IB_PORT_ACTIVE) {
+ pr_debug("port %d not active...rescheduling\n", port);
+ resched_delay = 5 * HZ;
+ err = -EAGAIN;
+ goto new_schedule;
+ }
+
+ callback_context = kmalloc(sizeof *callback_context, GFP_KERNEL);
+ if (!callback_context) {
+ err = -ENOMEM;
+ resched_delay = HZ * 5;
+ goto new_schedule;
+ }
+ callback_context->port = port;
+ callback_context->dev = dev;
+ callback_context->block_num = index;
+ callback_context->guid_indexes = rec_det->guid_indexes;
+ callback_context->method = rec->method;
+
+ memset(&guid_info_rec, 0, sizeof (struct ib_sa_guidinfo_rec));
+
+ guid_info_rec.lid = cpu_to_be16(attr.lid);
+ guid_info_rec.block_num = index;
+
+ memcpy(guid_info_rec.guid_info_list, rec_det->all_recs,
+ GUID_REC_SIZE * NUM_ALIAS_GUID_IN_REC);
+ comp_mask = IB_SA_GUIDINFO_REC_LID | IB_SA_GUIDINFO_REC_BLOCK_NUM |
+ rec_det->guid_indexes;
+
+ init_completion(&callback_context->done);
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ list_add_tail(&callback_context->list, head);
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+
+ callback_context->query_id =
+ ib_sa_guid_info_rec_query(dev->sriov.alias_guid.sa_client,
+ ibdev, port, &guid_info_rec,
+ comp_mask, rec->method, 1000,
+ GFP_KERNEL, aliasguid_query_handler,
+ callback_context,
+ &callback_context->sa_query);
+ if (callback_context->query_id < 0) {
+ pr_debug("ib_sa_guid_info_rec_query failed, query_id: "
+ "%d. will reschedule to the next 1 sec.\n",
+ callback_context->query_id);
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ list_del(&callback_context->list);
+ kfree(callback_context);
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ resched_delay = 1 * HZ;
+ err = -EAGAIN;
+ goto new_schedule;
+ }
+ err = 0;
+ goto out;
+
+new_schedule:
+ spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ invalidate_guid_record(dev, port, index);
+ if (!dev->sriov.is_going_down) {
+ queue_delayed_work(dev->sriov.alias_guid.ports_guid[port - 1].wq,
+ &dev->sriov.alias_guid.ports_guid[port - 1].alias_guid_work,
+ resched_delay);
+ }
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+
+out:
+ return err;
+}
+
+static void mlx4_ib_guid_port_init(struct mlx4_ib_dev *dev, int port)
+{
+ int j, k, entry;
+ __be64 guid;
+
+ /*Check if the SM doesn't need to assign the GUIDs*/
+ for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) {
+ for (k = 0; k < NUM_ALIAS_GUID_IN_REC; k++) {
+ entry = j * NUM_ALIAS_GUID_IN_REC + k;
+ /* no request for the 0 entry (hw guid) */
+ if (!entry || entry > dev->dev->persist->num_vfs ||
+ !mlx4_is_slave_active(dev->dev, entry))
+ continue;
+ guid = mlx4_get_admin_guid(dev->dev, entry, port);
+ *(__be64 *)&dev->sriov.alias_guid.ports_guid[port - 1].
+ all_rec_per_port[j].all_recs
+ [GUID_REC_SIZE * k] = guid;
+ pr_debug("guid was set, entry=%d, val=0x%llx, port=%d\n",
+ entry,
+ be64_to_cpu(guid),
+ port);
+ }
+ }
+}
+void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port)
+{
+ int i;
+ unsigned long flags, flags1;
+
+ pr_debug("port %d\n", port);
+
+ spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+
+ if (dev->sriov.alias_guid.ports_guid[port - 1].state_flags &
+ GUID_STATE_NEED_PORT_INIT) {
+ mlx4_ib_guid_port_init(dev, port);
+ dev->sriov.alias_guid.ports_guid[port - 1].state_flags &=
+ (~GUID_STATE_NEED_PORT_INIT);
+ }
+ for (i = 0; i < NUM_ALIAS_GUID_REC_IN_PORT; i++)
+ invalidate_guid_record(dev, port, i);
+
+ if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down) {
+ /*
+ Make sure no work waits in the queue.  If the work is already
+ queued (not on the timer) the cancel will fail; that is not a
+ problem because we just want the work started.
+ */
+ cancel_delayed_work(&dev->sriov.alias_guid.
+ ports_guid[port - 1].alias_guid_work);
+ queue_delayed_work(dev->sriov.alias_guid.ports_guid[port - 1].wq,
+ &dev->sriov.alias_guid.ports_guid[port - 1].alias_guid_work,
+ 0);
+ }
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+}
+
+static void set_required_record(struct mlx4_ib_dev *dev, u8 port,
+ struct mlx4_next_alias_guid_work *next_rec,
+ int record_index)
+{
+ int i;
+ int lowset_time_entry = -1;
+ int lowest_time = 0;
+ ib_sa_comp_mask delete_guid_indexes = 0;
+ ib_sa_comp_mask set_guid_indexes = 0;
+ struct mlx4_sriov_alias_guid_info_rec_det *rec =
+ &dev->sriov.alias_guid.ports_guid[port].
+ all_rec_per_port[record_index];
+
+ for (i = 0; i < NUM_ALIAS_GUID_IN_REC; i++) {
+ if (!(rec->guid_indexes &
+ mlx4_ib_get_aguid_comp_mask_from_ix(i)))
+ continue;
+
+ if (*(__be64 *)&rec->all_recs[i * GUID_REC_SIZE] ==
+ cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL))
+ delete_guid_indexes |=
+ mlx4_ib_get_aguid_comp_mask_from_ix(i);
+ else
+ set_guid_indexes |=
+ mlx4_ib_get_aguid_comp_mask_from_ix(i);
+
+ if (lowset_time_entry == -1 || rec->guids_retry_schedule[i] <=
+ lowest_time) {
+ lowset_time_entry = i;
+ lowest_time = rec->guids_retry_schedule[i];
+ }
+ }
+
+ memcpy(&next_rec->rec_det, rec, sizeof(*rec));
+ next_rec->port = port;
+ next_rec->block_num = record_index;
+
+ if (*(__be64 *)&rec->all_recs[lowset_time_entry * GUID_REC_SIZE] ==
+ cpu_to_be64(MLX4_GUID_FOR_DELETE_VAL)) {
+ next_rec->rec_det.guid_indexes = delete_guid_indexes;
+ next_rec->method = MLX4_GUID_INFO_RECORD_DELETE;
+ } else {
+ next_rec->rec_det.guid_indexes = set_guid_indexes;
+ next_rec->method = MLX4_GUID_INFO_RECORD_SET;
+ }
+}
+
+/* return index of record that should be updated based on lowest
+ * rescheduled time
+ */
+static int get_low_record_time_index(struct mlx4_ib_dev *dev, u8 port,
+ int *resched_delay_sec)
+{
+ int record_index = -1;
+ u64 low_record_time = 0;
+ struct mlx4_sriov_alias_guid_info_rec_det rec;
+ int j;
+
+ for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) {
+ rec = dev->sriov.alias_guid.ports_guid[port].
+ all_rec_per_port[j];
+ if (rec.status == MLX4_GUID_INFO_STATUS_IDLE &&
+ rec.guid_indexes) {
+ if (record_index == -1 ||
+ rec.time_to_run < low_record_time) {
+ record_index = j;
+ low_record_time = rec.time_to_run;
+ }
+ }
+ }
+ if (resched_delay_sec) {
+ u64 curr_time = ktime_get_real_ns();
+
+ *resched_delay_sec = (low_record_time < curr_time) ? 0 :
+ div_u64((low_record_time - curr_time), NSEC_PER_SEC);
+ }
+
+ return record_index;
+}
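+
+/*
+ * Illustrative example for the delay computation above: if the earliest
+ * time_to_run is 2.5 seconds in the future, resched_delay_sec becomes 2
+ * (truncated to whole seconds); if every eligible record is already
+ * overdue, it is 0.
+ */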
+
+/* The function returns the next record that was
+ * not configured (or failed to be configured) */
+static int get_next_record_to_update(struct mlx4_ib_dev *dev, u8 port,
+ struct mlx4_next_alias_guid_work *rec)
+{
+ unsigned long flags;
+ int record_index;
+ int ret = 0;
+
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags);
+ record_index = get_low_record_time_index(dev, port, NULL);
+
+ if (record_index < 0) {
+ ret = -ENOENT;
+ goto out;
+ }
+
+ set_required_record(dev, port, rec, record_index);
+out:
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags);
+ return ret;
+}
+
+static void alias_guid_work(struct work_struct *work)
+{
+ struct delayed_work *delay = to_delayed_work(work);
+ int ret = 0;
+ struct mlx4_next_alias_guid_work *rec;
+ struct mlx4_sriov_alias_guid_port_rec_det *sriov_alias_port =
+ container_of(delay, struct mlx4_sriov_alias_guid_port_rec_det,
+ alias_guid_work);
+ struct mlx4_sriov_alias_guid *sriov_alias_guid = sriov_alias_port->parent;
+ struct mlx4_ib_sriov *ib_sriov = container_of(sriov_alias_guid,
+ struct mlx4_ib_sriov,
+ alias_guid);
+ struct mlx4_ib_dev *dev = container_of(ib_sriov, struct mlx4_ib_dev, sriov);
+
+ rec = kzalloc(sizeof *rec, GFP_KERNEL);
+ if (!rec) {
+ pr_err("alias_guid_work: No Memory\n");
+ return;
+ }
+
+ pr_debug("starting [port: %d]...\n", sriov_alias_port->port + 1);
+ ret = get_next_record_to_update(dev, sriov_alias_port->port, rec);
+ if (ret) {
+ pr_debug("No more records to update.\n");
+ goto out;
+ }
+
+ set_guid_rec(&dev->ib_dev, rec);
+out:
+ kfree(rec);
+}
+
+
+void mlx4_ib_init_alias_guid_work(struct mlx4_ib_dev *dev, int port)
+{
+ unsigned long flags, flags1;
+
+ if (!mlx4_is_master(dev->dev))
+ return;
+ spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+ spin_lock_irqsave(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ if (!dev->sriov.is_going_down) {
+ /* If there is a pending work item, cancel it and then requeue;
+ * otherwise the new request won't run until the previous one has
+ * finished, since the same work struct is used.
+ */
+ cancel_delayed_work(&dev->sriov.alias_guid.ports_guid[port].
+ alias_guid_work);
+ queue_delayed_work(dev->sriov.alias_guid.ports_guid[port].wq,
+ &dev->sriov.alias_guid.ports_guid[port].alias_guid_work, 0);
+ }
+ spin_unlock_irqrestore(&dev->sriov.alias_guid.ag_work_lock, flags1);
+ spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+}
+
+void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev)
+{
+ int i;
+ struct mlx4_ib_sriov *sriov = &dev->sriov;
+ struct mlx4_alias_guid_work_context *cb_ctx;
+ struct mlx4_sriov_alias_guid_port_rec_det *det;
+ struct ib_sa_query *sa_query;
+ unsigned long flags;
+
+ for (i = 0 ; i < dev->num_ports; i++) {
+ cancel_delayed_work(&dev->sriov.alias_guid.ports_guid[i].alias_guid_work);
+ det = &sriov->alias_guid.ports_guid[i];
+ spin_lock_irqsave(&sriov->alias_guid.ag_work_lock, flags);
+ while (!list_empty(&det->cb_list)) {
+ cb_ctx = list_entry(det->cb_list.next,
+ struct mlx4_alias_guid_work_context,
+ list);
+ sa_query = cb_ctx->sa_query;
+ cb_ctx->sa_query = NULL;
+ list_del(&cb_ctx->list);
+ spin_unlock_irqrestore(&sriov->alias_guid.ag_work_lock, flags);
+ ib_sa_cancel_query(cb_ctx->query_id, sa_query);
+ wait_for_completion(&cb_ctx->done);
+ kfree(cb_ctx);
+ spin_lock_irqsave(&sriov->alias_guid.ag_work_lock, flags);
+ }
+ spin_unlock_irqrestore(&sriov->alias_guid.ag_work_lock, flags);
+ }
+ for (i = 0 ; i < dev->num_ports; i++) {
+ flush_workqueue(dev->sriov.alias_guid.ports_guid[i].wq);
+ destroy_workqueue(dev->sriov.alias_guid.ports_guid[i].wq);
+ }
+ ib_sa_unregister_client(dev->sriov.alias_guid.sa_client);
+ kfree(dev->sriov.alias_guid.sa_client);
+}
+
+int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev)
+{
+ char alias_wq_name[15];
+ int ret = 0;
+ int i, j;
+ union ib_gid gid;
+
+ if (!mlx4_is_master(dev->dev))
+ return 0;
+ dev->sriov.alias_guid.sa_client =
+ kzalloc(sizeof *dev->sriov.alias_guid.sa_client, GFP_KERNEL);
+ if (!dev->sriov.alias_guid.sa_client)
+ return -ENOMEM;
+
+ ib_sa_register_client(dev->sriov.alias_guid.sa_client);
+
+ spin_lock_init(&dev->sriov.alias_guid.ag_work_lock);
+
+ for (i = 1; i <= dev->num_ports; ++i) {
+ if (dev->ib_dev.query_gid(&dev->ib_dev , i, 0, &gid)) {
+ ret = -EFAULT;
+ goto err_unregister;
+ }
+ }
+
+ for (i = 0 ; i < dev->num_ports; i++) {
+ memset(&dev->sriov.alias_guid.ports_guid[i], 0,
+ sizeof (struct mlx4_sriov_alias_guid_port_rec_det));
+ dev->sriov.alias_guid.ports_guid[i].state_flags |=
+ GUID_STATE_NEED_PORT_INIT;
+ for (j = 0; j < NUM_ALIAS_GUID_REC_IN_PORT; j++) {
+ /* mark each val as if it was deleted */
+ memset(dev->sriov.alias_guid.ports_guid[i].
+ all_rec_per_port[j].all_recs, 0xFF,
+ sizeof(dev->sriov.alias_guid.ports_guid[i].
+ all_rec_per_port[j].all_recs));
+ }
+ INIT_LIST_HEAD(&dev->sriov.alias_guid.ports_guid[i].cb_list);
+ /*prepare the records, set them to be allocated by sm*/
+ if (mlx4_ib_sm_guid_assign)
+ for (j = 1; j < NUM_ALIAS_GUID_PER_PORT; j++)
+ mlx4_set_admin_guid(dev->dev, 0, j, i + 1);
+ for (j = 0 ; j < NUM_ALIAS_GUID_REC_IN_PORT; j++)
+ invalidate_guid_record(dev, i + 1, j);
+
+ dev->sriov.alias_guid.ports_guid[i].parent = &dev->sriov.alias_guid;
+ dev->sriov.alias_guid.ports_guid[i].port = i;
+
+ snprintf(alias_wq_name, sizeof alias_wq_name, "alias_guid%d", i);
+ dev->sriov.alias_guid.ports_guid[i].wq =
+ create_singlethread_workqueue(alias_wq_name);
+ if (!dev->sriov.alias_guid.ports_guid[i].wq) {
+ ret = -ENOMEM;
+ goto err_thread;
+ }
+ INIT_DELAYED_WORK(&dev->sriov.alias_guid.ports_guid[i].alias_guid_work,
+ alias_guid_work);
+ }
+ return 0;
+
+err_thread:
+ for (--i; i >= 0; i--) {
+ destroy_workqueue(dev->sriov.alias_guid.ports_guid[i].wq);
+ dev->sriov.alias_guid.ports_guid[i].wq = NULL;
+ }
+
+err_unregister:
+ ib_sa_unregister_client(dev->sriov.alias_guid.sa_client);
+ kfree(dev->sriov.alias_guid.sa_client);
+ dev->sriov.alias_guid.sa_client = NULL;
+ pr_err("init_alias_guid_service: Failed. (ret:%d)\n", ret);
+ return ret;
+}
diff --git a/drivers/infiniband/hw/mlx4/cm.c b/drivers/infiniband/hw/mlx4/cm.c
new file mode 100644
index 000000000..39a488889
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/cm.c
@@ -0,0 +1,478 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_mad.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/rbtree.h>
+#include <linux/idr.h>
+#include <rdma/ib_cm.h>
+
+#include "mlx4_ib.h"
+
+#define CM_CLEANUP_CACHE_TIMEOUT (5 * HZ)
+
+struct id_map_entry {
+ struct rb_node node;
+
+ u32 sl_cm_id;
+ u32 pv_cm_id;
+ int slave_id;
+ int scheduled_delete;
+ struct mlx4_ib_dev *dev;
+
+ struct list_head list;
+ struct delayed_work timeout;
+};
+
+struct cm_generic_msg {
+ struct ib_mad_hdr hdr;
+
+ __be32 local_comm_id;
+ __be32 remote_comm_id;
+};
+
+struct cm_sidr_generic_msg {
+ struct ib_mad_hdr hdr;
+ __be32 request_id;
+};
+
+struct cm_req_msg {
+ unsigned char unused[0x60];
+ union ib_gid primary_path_sgid;
+};
+
+
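+/*
+ * CM MADs do not all carry the communication ID in the same place: SIDR REQ
+ * carries only a request ID (used here as the local comm ID), SIDR REP
+ * carries only a request ID (used here as the remote comm ID), and all other
+ * CM messages carry explicit local/remote comm IDs.  The helpers below hide
+ * these differences.
+ */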
+static void set_local_comm_id(struct ib_mad *mad, u32 cm_id)
+{
+ if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
+ struct cm_sidr_generic_msg *msg =
+ (struct cm_sidr_generic_msg *)mad;
+ msg->request_id = cpu_to_be32(cm_id);
+ } else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) {
+ pr_err("trying to set local_comm_id in SIDR_REP\n");
+ return;
+ } else {
+ struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+ msg->local_comm_id = cpu_to_be32(cm_id);
+ }
+}
+
+static u32 get_local_comm_id(struct ib_mad *mad)
+{
+ if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
+ struct cm_sidr_generic_msg *msg =
+ (struct cm_sidr_generic_msg *)mad;
+ return be32_to_cpu(msg->request_id);
+ } else if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) {
+ pr_err("trying to set local_comm_id in SIDR_REP\n");
+ return -1;
+ } else {
+ struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+ return be32_to_cpu(msg->local_comm_id);
+ }
+}
+
+static void set_remote_comm_id(struct ib_mad *mad, u32 cm_id)
+{
+ if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) {
+ struct cm_sidr_generic_msg *msg =
+ (struct cm_sidr_generic_msg *)mad;
+ msg->request_id = cpu_to_be32(cm_id);
+ } else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
+ pr_err("trying to set remote_comm_id in SIDR_REQ\n");
+ return;
+ } else {
+ struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+ msg->remote_comm_id = cpu_to_be32(cm_id);
+ }
+}
+
+static u32 get_remote_comm_id(struct ib_mad *mad)
+{
+ if (mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) {
+ struct cm_sidr_generic_msg *msg =
+ (struct cm_sidr_generic_msg *)mad;
+ return be32_to_cpu(msg->request_id);
+ } else if (mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
+ pr_err("trying to set remote_comm_id in SIDR_REQ\n");
+ return -1;
+ } else {
+ struct cm_generic_msg *msg = (struct cm_generic_msg *)mad;
+ return be32_to_cpu(msg->remote_comm_id);
+ }
+}
+
+static union ib_gid gid_from_req_msg(struct ib_device *ibdev, struct ib_mad *mad)
+{
+ struct cm_req_msg *msg = (struct cm_req_msg *)mad;
+
+ return msg->primary_path_sgid;
+}
+
+/* The caller must hold the sriov id_map_lock */
+static struct id_map_entry *
+id_map_find_by_sl_id(struct ib_device *ibdev, u32 slave_id, u32 sl_cm_id)
+{
+ struct rb_root *sl_id_map = &to_mdev(ibdev)->sriov.sl_id_map;
+ struct rb_node *node = sl_id_map->rb_node;
+
+ while (node) {
+ struct id_map_entry *id_map_entry =
+ rb_entry(node, struct id_map_entry, node);
+
+ if (id_map_entry->sl_cm_id > sl_cm_id)
+ node = node->rb_left;
+ else if (id_map_entry->sl_cm_id < sl_cm_id)
+ node = node->rb_right;
+ else if (id_map_entry->slave_id > slave_id)
+ node = node->rb_left;
+ else if (id_map_entry->slave_id < slave_id)
+ node = node->rb_right;
+ else
+ return id_map_entry;
+ }
+ return NULL;
+}
+
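+/*
+ * Delayed-work handler: drop a cached id_map_entry once its cleanup timeout
+ * expires, removing it from both the rb-tree and the idr before freeing it.
+ */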
+static void id_map_ent_timeout(struct work_struct *work)
+{
+ struct delayed_work *delay = to_delayed_work(work);
+ struct id_map_entry *ent = container_of(delay, struct id_map_entry, timeout);
+ struct id_map_entry *db_ent, *found_ent;
+ struct mlx4_ib_dev *dev = ent->dev;
+ struct mlx4_ib_sriov *sriov = &dev->sriov;
+ struct rb_root *sl_id_map = &sriov->sl_id_map;
+ int pv_id = (int) ent->pv_cm_id;
+
+ spin_lock(&sriov->id_map_lock);
+ db_ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, pv_id);
+ if (!db_ent)
+ goto out;
+ found_ent = id_map_find_by_sl_id(&dev->ib_dev, ent->slave_id, ent->sl_cm_id);
+ if (found_ent && found_ent == ent)
+ rb_erase(&found_ent->node, sl_id_map);
+ idr_remove(&sriov->pv_id_table, pv_id);
+
+out:
+ list_del(&ent->list);
+ spin_unlock(&sriov->id_map_lock);
+ kfree(ent);
+}
+
+static void id_map_find_del(struct ib_device *ibdev, int pv_cm_id)
+{
+ struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov;
+ struct rb_root *sl_id_map = &sriov->sl_id_map;
+ struct id_map_entry *ent, *found_ent;
+
+ spin_lock(&sriov->id_map_lock);
+ ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, pv_cm_id);
+ if (!ent)
+ goto out;
+ found_ent = id_map_find_by_sl_id(ibdev, ent->slave_id, ent->sl_cm_id);
+ if (found_ent && found_ent == ent)
+ rb_erase(&found_ent->node, sl_id_map);
+ idr_remove(&sriov->pv_id_table, pv_cm_id);
+out:
+ spin_unlock(&sriov->id_map_lock);
+}
+
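+/*
+ * Insert an entry into the rb-tree, which is ordered by (sl_cm_id, slave_id).
+ * An existing entry with the same key is replaced in place.
+ */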
+static void sl_id_map_add(struct ib_device *ibdev, struct id_map_entry *new)
+{
+ struct rb_root *sl_id_map = &to_mdev(ibdev)->sriov.sl_id_map;
+ struct rb_node **link = &sl_id_map->rb_node, *parent = NULL;
+ struct id_map_entry *ent;
+ int slave_id = new->slave_id;
+ int sl_cm_id = new->sl_cm_id;
+
+ ent = id_map_find_by_sl_id(ibdev, slave_id, sl_cm_id);
+ if (ent) {
+ pr_debug("overriding existing sl_id_map entry (cm_id = %x)\n",
+ sl_cm_id);
+
+ rb_replace_node(&ent->node, &new->node, sl_id_map);
+ return;
+ }
+
+ /* Go to the bottom of the tree */
+ while (*link) {
+ parent = *link;
+ ent = rb_entry(parent, struct id_map_entry, node);
+
+ if (ent->sl_cm_id > sl_cm_id || (ent->sl_cm_id == sl_cm_id && ent->slave_id > slave_id))
+ link = &(*link)->rb_left;
+ else
+ link = &(*link)->rb_right;
+ }
+
+ rb_link_node(&new->node, parent, link);
+ rb_insert_color(&new->node, sl_id_map);
+}
+
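+/*
+ * Allocate a new mapping: the paravirtual comm ID (pv_cm_id) is taken from a
+ * cyclic idr, and the entry is linked into both the rb-tree (keyed by the
+ * slave-local comm ID) and the per-device cm_list.
+ */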
+static struct id_map_entry *
+id_map_alloc(struct ib_device *ibdev, int slave_id, u32 sl_cm_id)
+{
+ int ret;
+ struct id_map_entry *ent;
+ struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov;
+
+ ent = kmalloc(sizeof (struct id_map_entry), GFP_KERNEL);
+ if (!ent) {
+ mlx4_ib_warn(ibdev, "Couldn't allocate id cache entry - out of memory\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ ent->sl_cm_id = sl_cm_id;
+ ent->slave_id = slave_id;
+ ent->scheduled_delete = 0;
+ ent->dev = to_mdev(ibdev);
+ INIT_DELAYED_WORK(&ent->timeout, id_map_ent_timeout);
+
+ idr_preload(GFP_KERNEL);
+ spin_lock(&to_mdev(ibdev)->sriov.id_map_lock);
+
+ ret = idr_alloc_cyclic(&sriov->pv_id_table, ent, 0, 0, GFP_NOWAIT);
+ if (ret >= 0) {
+ ent->pv_cm_id = (u32)ret;
+ sl_id_map_add(ibdev, ent);
+ list_add_tail(&ent->list, &sriov->cm_list);
+ }
+
+ spin_unlock(&sriov->id_map_lock);
+ idr_preload_end();
+
+ if (ret >= 0)
+ return ent;
+
+ /* error flow */
+ kfree(ent);
+ mlx4_ib_warn(ibdev, "No more space in the idr (err:0x%x)\n", ret);
+ return ERR_PTR(-ENOMEM);
+}
+
+static struct id_map_entry *
+id_map_get(struct ib_device *ibdev, int *pv_cm_id, int sl_cm_id, int slave_id)
+{
+ struct id_map_entry *ent;
+ struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov;
+
+ spin_lock(&sriov->id_map_lock);
+ if (*pv_cm_id == -1) {
+ ent = id_map_find_by_sl_id(ibdev, sl_cm_id, slave_id);
+ if (ent)
+ *pv_cm_id = (int) ent->pv_cm_id;
+ } else
+ ent = (struct id_map_entry *)idr_find(&sriov->pv_id_table, *pv_cm_id);
+ spin_unlock(&sriov->id_map_lock);
+
+ return ent;
+}
+
+static void schedule_delayed(struct ib_device *ibdev, struct id_map_entry *id)
+{
+ struct mlx4_ib_sriov *sriov = &to_mdev(ibdev)->sriov;
+ unsigned long flags;
+
+ spin_lock(&sriov->id_map_lock);
+ spin_lock_irqsave(&sriov->going_down_lock, flags);
+ /* don't schedule new cleanup work once the device is going down */
+ if (!sriov->is_going_down) {
+ id->scheduled_delete = 1;
+ schedule_delayed_work(&id->timeout, CM_CLEANUP_CACHE_TIMEOUT);
+ }
+ spin_unlock_irqrestore(&sriov->going_down_lock, flags);
+ spin_unlock(&sriov->id_map_lock);
+}
+
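+/*
+ * Slave-to-wire direction: for new connections (REQ/REP/SIDR REQ) allocate a
+ * paravirtual comm ID, otherwise look up the existing mapping (REJ and
+ * SIDR REP are passed through unchanged), then rewrite the MAD's local comm
+ * ID before it goes out on the wire.
+ */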
+int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id,
+ struct ib_mad *mad)
+{
+ struct id_map_entry *id;
+ u32 sl_cm_id;
+ int pv_cm_id = -1;
+
+ if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID ||
+ mad->mad_hdr.attr_id == CM_REP_ATTR_ID ||
+ mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
+ sl_cm_id = get_local_comm_id(mad);
+ id = id_map_alloc(ibdev, slave_id, sl_cm_id);
+ if (IS_ERR(id)) {
+ mlx4_ib_warn(ibdev, "%s: id{slave: %d, sl_cm_id: 0x%x} Failed to id_map_alloc\n",
+ __func__, slave_id, sl_cm_id);
+ return PTR_ERR(id);
+ }
+ } else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID ||
+ mad->mad_hdr.attr_id == CM_SIDR_REP_ATTR_ID) {
+ return 0;
+ } else {
+ sl_cm_id = get_local_comm_id(mad);
+ id = id_map_get(ibdev, &pv_cm_id, slave_id, sl_cm_id);
+ }
+
+ if (!id) {
+ pr_debug("id{slave: %d, sl_cm_id: 0x%x} is NULL!\n",
+ slave_id, sl_cm_id);
+ return -EINVAL;
+ }
+
+ set_local_comm_id(mad, id->pv_cm_id);
+
+ if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID)
+ schedule_delayed(ibdev, id);
+ else if (mad->mad_hdr.attr_id == CM_DREP_ATTR_ID)
+ id_map_find_del(ibdev, pv_cm_id);
+
+ return 0;
+}
+
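+/*
+ * Wire-to-slave direction: resolve the destination slave (by GID for
+ * REQ/SIDR REQ, by the paravirtual comm ID otherwise) and restore the
+ * slave-local comm ID in the MAD before it is tunneled to that slave.
+ */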
+int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave,
+ struct ib_mad *mad)
+{
+ u32 pv_cm_id;
+ struct id_map_entry *id;
+
+ if (mad->mad_hdr.attr_id == CM_REQ_ATTR_ID ||
+ mad->mad_hdr.attr_id == CM_SIDR_REQ_ATTR_ID) {
+ union ib_gid gid;
+
+ if (!slave)
+ return 0;
+
+ gid = gid_from_req_msg(ibdev, mad);
+ *slave = mlx4_ib_find_real_gid(ibdev, port, gid.global.interface_id);
+ if (*slave < 0) {
+ mlx4_ib_warn(ibdev, "failed matching slave_id by gid (0x%llx)\n",
+ be64_to_cpu(gid.global.interface_id));
+ return -ENOENT;
+ }
+ return 0;
+ }
+
+ pv_cm_id = get_remote_comm_id(mad);
+ id = id_map_get(ibdev, (int *)&pv_cm_id, -1, -1);
+
+ if (!id) {
+ pr_debug("Couldn't find an entry for pv_cm_id 0x%x\n", pv_cm_id);
+ return -ENOENT;
+ }
+
+ if (slave)
+ *slave = id->slave_id;
+ set_remote_comm_id(mad, id->sl_cm_id);
+
+ if (mad->mad_hdr.attr_id == CM_DREQ_ATTR_ID)
+ schedule_delayed(ibdev, id);
+ else if (mad->mad_hdr.attr_id == CM_REJ_ATTR_ID ||
+ mad->mad_hdr.attr_id == CM_DREP_ATTR_ID) {
+ id_map_find_del(ibdev, (int) pv_cm_id);
+ }
+
+ return 0;
+}
+
+void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev)
+{
+ spin_lock_init(&dev->sriov.id_map_lock);
+ INIT_LIST_HEAD(&dev->sriov.cm_list);
+ dev->sriov.sl_id_map = RB_ROOT;
+ idr_init(&dev->sriov.pv_id_table);
+}
+
+/* slave = -1 ==> all slaves */
+/* TBD -- call paravirt clean for single slave. Need for slave RESET event */
+void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave)
+{
+ struct mlx4_ib_sriov *sriov = &dev->sriov;
+ struct rb_root *sl_id_map = &sriov->sl_id_map;
+ struct list_head lh;
+ struct rb_node *nd;
+ int need_flush = 1;
+ struct id_map_entry *map, *tmp_map;
+ /* cancel all delayed work queue entries */
+ INIT_LIST_HEAD(&lh);
+ spin_lock(&sriov->id_map_lock);
+ list_for_each_entry_safe(map, tmp_map, &dev->sriov.cm_list, list) {
+ if (slave < 0 || slave == map->slave_id) {
+ if (map->scheduled_delete)
+ need_flush &= !!cancel_delayed_work(&map->timeout);
+ }
+ }
+
+ spin_unlock(&sriov->id_map_lock);
+
+ if (!need_flush)
+ flush_scheduled_work(); /* make sure all timers were flushed */
+
+ /* now, remove all leftover entries from databases */
+ spin_lock(&sriov->id_map_lock);
+ if (slave < 0) {
+ while (rb_first(sl_id_map)) {
+ struct id_map_entry *ent =
+ rb_entry(rb_first(sl_id_map),
+ struct id_map_entry, node);
+
+ rb_erase(&ent->node, sl_id_map);
+ idr_remove(&sriov->pv_id_table, (int) ent->pv_cm_id);
+ }
+ list_splice_init(&dev->sriov.cm_list, &lh);
+ } else {
+ /* first, move nodes belonging to slave to db remove list */
+ nd = rb_first(sl_id_map);
+ while (nd) {
+ struct id_map_entry *ent =
+ rb_entry(nd, struct id_map_entry, node);
+ nd = rb_next(nd);
+ if (ent->slave_id == slave)
+ list_move_tail(&ent->list, &lh);
+ }
+ /* remove those nodes from databases */
+ list_for_each_entry_safe(map, tmp_map, &lh, list) {
+ rb_erase(&map->node, sl_id_map);
+ idr_remove(&sriov->pv_id_table, (int) map->pv_cm_id);
+ }
+
+ /* add remaining nodes from cm_list */
+ list_for_each_entry_safe(map, tmp_map, &dev->sriov.cm_list, list) {
+ if (slave == map->slave_id)
+ list_move_tail(&map->list, &lh);
+ }
+ }
+
+ spin_unlock(&sriov->id_map_lock);
+
+ /* free any map entries left behind due to cancel_delayed_work above */
+ list_for_each_entry_safe(map, tmp_map, &lh, list) {
+ list_del(&map->list);
+ kfree(map);
+ }
+}
diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
new file mode 100644
index 000000000..0176caa57
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -0,0 +1,983 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx4/cq.h>
+#include <linux/mlx4/qp.h>
+#include <linux/mlx4/srq.h>
+#include <linux/slab.h>
+
+#include "mlx4_ib.h"
+#include "user.h"
+
+static void mlx4_ib_cq_comp(struct mlx4_cq *cq)
+{
+ struct ib_cq *ibcq = &to_mibcq(cq)->ibcq;
+ ibcq->comp_handler(ibcq, ibcq->cq_context);
+}
+
+static void mlx4_ib_cq_event(struct mlx4_cq *cq, enum mlx4_event type)
+{
+ struct ib_event event;
+ struct ib_cq *ibcq;
+
+ if (type != MLX4_EVENT_TYPE_CQ_ERROR) {
+ pr_warn("Unexpected event type %d "
+ "on CQ %06x\n", type, cq->cqn);
+ return;
+ }
+
+ ibcq = &to_mibcq(cq)->ibcq;
+ if (ibcq->event_handler) {
+ event.device = ibcq->device;
+ event.event = IB_EVENT_CQ_ERR;
+ event.element.cq = ibcq;
+ ibcq->event_handler(&event, ibcq->cq_context);
+ }
+}
+
+static void *get_cqe_from_buf(struct mlx4_ib_cq_buf *buf, int n)
+{
+ return mlx4_buf_offset(&buf->buf, n * buf->entry_size);
+}
+
+static void *get_cqe(struct mlx4_ib_cq *cq, int n)
+{
+ return get_cqe_from_buf(&cq->buf, n);
+}
+
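+/*
+ * Return the CQE at index n if it is owned by software, NULL otherwise.  With
+ * 64-byte CQEs the ownership byte lives in the second half of the entry.
+ */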
+static void *get_sw_cqe(struct mlx4_ib_cq *cq, int n)
+{
+ struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibcq.cqe);
+ struct mlx4_cqe *tcqe = ((cq->buf.entry_size == 64) ? (cqe + 1) : cqe);
+
+ return (!!(tcqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
+ !!(n & (cq->ibcq.cqe + 1))) ? NULL : cqe;
+}
+
+static struct mlx4_cqe *next_cqe_sw(struct mlx4_ib_cq *cq)
+{
+ return get_sw_cqe(cq, cq->mcq.cons_index);
+}
+
+int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
+{
+ struct mlx4_ib_cq *mcq = to_mcq(cq);
+ struct mlx4_ib_dev *dev = to_mdev(cq->device);
+
+ return mlx4_cq_modify(dev->dev, &mcq->mcq, cq_count, cq_period);
+}
+
+static int mlx4_ib_alloc_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int nent)
+{
+ int err;
+
+ err = mlx4_buf_alloc(dev->dev, nent * dev->dev->caps.cqe_size,
+ PAGE_SIZE * 2, &buf->buf, GFP_KERNEL);
+
+ if (err)
+ goto out;
+
+ buf->entry_size = dev->dev->caps.cqe_size;
+ err = mlx4_mtt_init(dev->dev, buf->buf.npages, buf->buf.page_shift,
+ &buf->mtt);
+ if (err)
+ goto err_buf;
+
+ err = mlx4_buf_write_mtt(dev->dev, &buf->mtt, &buf->buf, GFP_KERNEL);
+ if (err)
+ goto err_mtt;
+
+ return 0;
+
+err_mtt:
+ mlx4_mtt_cleanup(dev->dev, &buf->mtt);
+
+err_buf:
+ mlx4_buf_free(dev->dev, nent * buf->entry_size, &buf->buf);
+
+out:
+ return err;
+}
+
+static void mlx4_ib_free_cq_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq_buf *buf, int cqe)
+{
+ mlx4_buf_free(dev->dev, (cqe + 1) * buf->entry_size, &buf->buf);
+}
+
+static int mlx4_ib_get_cq_umem(struct mlx4_ib_dev *dev, struct ib_ucontext *context,
+ struct mlx4_ib_cq_buf *buf, struct ib_umem **umem,
+ u64 buf_addr, int cqe)
+{
+ int err;
+ int cqe_size = dev->dev->caps.cqe_size;
+
+ *umem = ib_umem_get(context, buf_addr, cqe * cqe_size,
+ IB_ACCESS_LOCAL_WRITE, 1);
+ if (IS_ERR(*umem))
+ return PTR_ERR(*umem);
+
+ err = mlx4_mtt_init(dev->dev, ib_umem_page_count(*umem),
+ ilog2((*umem)->page_size), &buf->mtt);
+ if (err)
+ goto err_buf;
+
+ err = mlx4_ib_umem_write_mtt(dev, &buf->mtt, *umem);
+ if (err)
+ goto err_mtt;
+
+ return 0;
+
+err_mtt:
+ mlx4_mtt_cleanup(dev->dev, &buf->mtt);
+
+err_buf:
+ ib_umem_release(*umem);
+
+ return err;
+}
+
+struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct mlx4_ib_cq *cq;
+ struct mlx4_uar *uar;
+ int err;
+
+ if (entries < 1 || entries > dev->dev->caps.max_cqes)
+ return ERR_PTR(-EINVAL);
+
+ cq = kmalloc(sizeof *cq, GFP_KERNEL);
+ if (!cq)
+ return ERR_PTR(-ENOMEM);
+
+ entries = roundup_pow_of_two(entries + 1);
+ cq->ibcq.cqe = entries - 1;
+ mutex_init(&cq->resize_mutex);
+ spin_lock_init(&cq->lock);
+ cq->resize_buf = NULL;
+ cq->resize_umem = NULL;
+ INIT_LIST_HEAD(&cq->send_qp_list);
+ INIT_LIST_HEAD(&cq->recv_qp_list);
+
+ if (context) {
+ struct mlx4_ib_create_cq ucmd;
+
+ if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+ err = -EFAULT;
+ goto err_cq;
+ }
+
+ err = mlx4_ib_get_cq_umem(dev, context, &cq->buf, &cq->umem,
+ ucmd.buf_addr, entries);
+ if (err)
+ goto err_cq;
+
+ err = mlx4_ib_db_map_user(to_mucontext(context), ucmd.db_addr,
+ &cq->db);
+ if (err)
+ goto err_mtt;
+
+ uar = &to_mucontext(context)->uar;
+ } else {
+ err = mlx4_db_alloc(dev->dev, &cq->db, 1, GFP_KERNEL);
+ if (err)
+ goto err_cq;
+
+ cq->mcq.set_ci_db = cq->db.db;
+ cq->mcq.arm_db = cq->db.db + 1;
+ *cq->mcq.set_ci_db = 0;
+ *cq->mcq.arm_db = 0;
+
+ err = mlx4_ib_alloc_cq_buf(dev, &cq->buf, entries);
+ if (err)
+ goto err_db;
+
+ uar = &dev->priv_uar;
+ }
+
+ if (dev->eq_table)
+ vector = dev->eq_table[vector % ibdev->num_comp_vectors];
+
+ err = mlx4_cq_alloc(dev->dev, entries, &cq->buf.mtt, uar,
+ cq->db.dma, &cq->mcq, vector, 0, 0);
+ if (err)
+ goto err_dbmap;
+
+ if (context)
+ cq->mcq.tasklet_ctx.comp = mlx4_ib_cq_comp;
+ else
+ cq->mcq.comp = mlx4_ib_cq_comp;
+ cq->mcq.event = mlx4_ib_cq_event;
+
+ if (context)
+ if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof (__u32))) {
+ err = -EFAULT;
+ goto err_dbmap;
+ }
+
+ return &cq->ibcq;
+
+err_dbmap:
+ if (context)
+ mlx4_ib_db_unmap_user(to_mucontext(context), &cq->db);
+
+err_mtt:
+ mlx4_mtt_cleanup(dev->dev, &cq->buf.mtt);
+
+ if (context)
+ ib_umem_release(cq->umem);
+ else
+ mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);
+
+err_db:
+ if (!context)
+ mlx4_db_free(dev->dev, &cq->db);
+
+err_cq:
+ kfree(cq);
+
+ return ERR_PTR(err);
+}
+
+static int mlx4_alloc_resize_buf(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq,
+ int entries)
+{
+ int err;
+
+ if (cq->resize_buf)
+ return -EBUSY;
+
+ cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC);
+ if (!cq->resize_buf)
+ return -ENOMEM;
+
+ err = mlx4_ib_alloc_cq_buf(dev, &cq->resize_buf->buf, entries);
+ if (err) {
+ kfree(cq->resize_buf);
+ cq->resize_buf = NULL;
+ return err;
+ }
+
+ cq->resize_buf->cqe = entries - 1;
+
+ return 0;
+}
+
+static int mlx4_alloc_resize_umem(struct mlx4_ib_dev *dev, struct mlx4_ib_cq *cq,
+ int entries, struct ib_udata *udata)
+{
+ struct mlx4_ib_resize_cq ucmd;
+ int err;
+
+ if (cq->resize_umem)
+ return -EBUSY;
+
+ if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd))
+ return -EFAULT;
+
+ cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC);
+ if (!cq->resize_buf)
+ return -ENOMEM;
+
+ err = mlx4_ib_get_cq_umem(dev, cq->umem->context, &cq->resize_buf->buf,
+ &cq->resize_umem, ucmd.buf_addr, entries);
+ if (err) {
+ kfree(cq->resize_buf);
+ cq->resize_buf = NULL;
+ return err;
+ }
+
+ cq->resize_buf->cqe = entries - 1;
+
+ return 0;
+}
+
+static int mlx4_ib_get_outstanding_cqes(struct mlx4_ib_cq *cq)
+{
+ u32 i;
+
+ i = cq->mcq.cons_index;
+ while (get_sw_cqe(cq, i))
+ ++i;
+
+ return i - cq->mcq.cons_index;
+}
+
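+/*
+ * Copy the not-yet-polled CQEs from the old buffer into the resize buffer,
+ * fixing up the ownership bit for the new ring size, until the special
+ * RESIZE CQE is reached.
+ */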
+static void mlx4_ib_cq_resize_copy_cqes(struct mlx4_ib_cq *cq)
+{
+ struct mlx4_cqe *cqe, *new_cqe;
+ int i;
+ int cqe_size = cq->buf.entry_size;
+ int cqe_inc = cqe_size == 64 ? 1 : 0;
+
+ i = cq->mcq.cons_index;
+ cqe = get_cqe(cq, i & cq->ibcq.cqe);
+ cqe += cqe_inc;
+
+ while ((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) != MLX4_CQE_OPCODE_RESIZE) {
+ new_cqe = get_cqe_from_buf(&cq->resize_buf->buf,
+ (i + 1) & cq->resize_buf->cqe);
+ memcpy(new_cqe, get_cqe(cq, i & cq->ibcq.cqe), cqe_size);
+ new_cqe += cqe_inc;
+
+ new_cqe->owner_sr_opcode = (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK) |
+ (((i + 1) & (cq->resize_buf->cqe + 1)) ? MLX4_CQE_OWNER_MASK : 0);
+ cqe = get_cqe(cq, ++i & cq->ibcq.cqe);
+ cqe += cqe_inc;
+ }
+ ++cq->mcq.cons_index;
+}
+
+int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibcq->device);
+ struct mlx4_ib_cq *cq = to_mcq(ibcq);
+ struct mlx4_mtt mtt;
+ int outst_cqe;
+ int err;
+
+ mutex_lock(&cq->resize_mutex);
+ if (entries < 1 || entries > dev->dev->caps.max_cqes) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ entries = roundup_pow_of_two(entries + 1);
+ if (entries == ibcq->cqe + 1) {
+ err = 0;
+ goto out;
+ }
+
+ if (entries > dev->dev->caps.max_cqes + 1) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (ibcq->uobject) {
+ err = mlx4_alloc_resize_umem(dev, cq, entries, udata);
+ if (err)
+ goto out;
+ } else {
+ /* Can't be smaller than the number of outstanding CQEs */
+ outst_cqe = mlx4_ib_get_outstanding_cqes(cq);
+ if (entries < outst_cqe + 1) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = mlx4_alloc_resize_buf(dev, cq, entries);
+ if (err)
+ goto out;
+ }
+
+ mtt = cq->buf.mtt;
+
+ err = mlx4_cq_resize(dev->dev, &cq->mcq, entries, &cq->resize_buf->buf.mtt);
+ if (err)
+ goto err_buf;
+
+ mlx4_mtt_cleanup(dev->dev, &mtt);
+ if (ibcq->uobject) {
+ cq->buf = cq->resize_buf->buf;
+ cq->ibcq.cqe = cq->resize_buf->cqe;
+ ib_umem_release(cq->umem);
+ cq->umem = cq->resize_umem;
+
+ kfree(cq->resize_buf);
+ cq->resize_buf = NULL;
+ cq->resize_umem = NULL;
+ } else {
+ struct mlx4_ib_cq_buf tmp_buf;
+ int tmp_cqe = 0;
+
+ spin_lock_irq(&cq->lock);
+ if (cq->resize_buf) {
+ mlx4_ib_cq_resize_copy_cqes(cq);
+ tmp_buf = cq->buf;
+ tmp_cqe = cq->ibcq.cqe;
+ cq->buf = cq->resize_buf->buf;
+ cq->ibcq.cqe = cq->resize_buf->cqe;
+
+ kfree(cq->resize_buf);
+ cq->resize_buf = NULL;
+ }
+ spin_unlock_irq(&cq->lock);
+
+ if (tmp_cqe)
+ mlx4_ib_free_cq_buf(dev, &tmp_buf, tmp_cqe);
+ }
+
+ goto out;
+
+err_buf:
+ mlx4_mtt_cleanup(dev->dev, &cq->resize_buf->buf.mtt);
+ if (!ibcq->uobject)
+ mlx4_ib_free_cq_buf(dev, &cq->resize_buf->buf,
+ cq->resize_buf->cqe);
+
+ kfree(cq->resize_buf);
+ cq->resize_buf = NULL;
+
+ if (cq->resize_umem) {
+ ib_umem_release(cq->resize_umem);
+ cq->resize_umem = NULL;
+ }
+
+out:
+ mutex_unlock(&cq->resize_mutex);
+
+ return err;
+}
+
+int mlx4_ib_destroy_cq(struct ib_cq *cq)
+{
+ struct mlx4_ib_dev *dev = to_mdev(cq->device);
+ struct mlx4_ib_cq *mcq = to_mcq(cq);
+
+ mlx4_cq_free(dev->dev, &mcq->mcq);
+ mlx4_mtt_cleanup(dev->dev, &mcq->buf.mtt);
+
+ if (cq->uobject) {
+ mlx4_ib_db_unmap_user(to_mucontext(cq->uobject->context), &mcq->db);
+ ib_umem_release(mcq->umem);
+ } else {
+ mlx4_ib_free_cq_buf(dev, &mcq->buf, cq->cqe);
+ mlx4_db_free(dev->dev, &mcq->db);
+ }
+
+ kfree(mcq);
+
+ return 0;
+}
+
+static void dump_cqe(void *cqe)
+{
+ __be32 *buf = cqe;
+
+ pr_debug("CQE contents %08x %08x %08x %08x %08x %08x %08x %08x\n",
+ be32_to_cpu(buf[0]), be32_to_cpu(buf[1]), be32_to_cpu(buf[2]),
+ be32_to_cpu(buf[3]), be32_to_cpu(buf[4]), be32_to_cpu(buf[5]),
+ be32_to_cpu(buf[6]), be32_to_cpu(buf[7]));
+}
+
+static void mlx4_ib_handle_error_cqe(struct mlx4_err_cqe *cqe,
+ struct ib_wc *wc)
+{
+ if (cqe->syndrome == MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR) {
+ pr_debug("local QP operation err "
+ "(QPN %06x, WQE index %x, vendor syndrome %02x, "
+ "opcode = %02x)\n",
+ be32_to_cpu(cqe->my_qpn), be16_to_cpu(cqe->wqe_index),
+ cqe->vendor_err_syndrome,
+ cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK);
+ dump_cqe(cqe);
+ }
+
+ switch (cqe->syndrome) {
+ case MLX4_CQE_SYNDROME_LOCAL_LENGTH_ERR:
+ wc->status = IB_WC_LOC_LEN_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_LOCAL_QP_OP_ERR:
+ wc->status = IB_WC_LOC_QP_OP_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_LOCAL_PROT_ERR:
+ wc->status = IB_WC_LOC_PROT_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_WR_FLUSH_ERR:
+ wc->status = IB_WC_WR_FLUSH_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_MW_BIND_ERR:
+ wc->status = IB_WC_MW_BIND_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_BAD_RESP_ERR:
+ wc->status = IB_WC_BAD_RESP_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_LOCAL_ACCESS_ERR:
+ wc->status = IB_WC_LOC_ACCESS_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR:
+ wc->status = IB_WC_REM_INV_REQ_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_REMOTE_ACCESS_ERR:
+ wc->status = IB_WC_REM_ACCESS_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_REMOTE_OP_ERR:
+ wc->status = IB_WC_REM_OP_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR:
+ wc->status = IB_WC_RETRY_EXC_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_RNR_RETRY_EXC_ERR:
+ wc->status = IB_WC_RNR_RETRY_EXC_ERR;
+ break;
+ case MLX4_CQE_SYNDROME_REMOTE_ABORTED_ERR:
+ wc->status = IB_WC_REM_ABORT_ERR;
+ break;
+ default:
+ wc->status = IB_WC_GENERAL_ERR;
+ break;
+ }
+
+ wc->vendor_err = cqe->vendor_err_syndrome;
+}
+
+static int mlx4_ib_ipoib_csum_ok(__be16 status, __be16 checksum)
+{
+ return ((status & cpu_to_be16(MLX4_CQE_STATUS_IPV4 |
+ MLX4_CQE_STATUS_IPV4F |
+ MLX4_CQE_STATUS_IPV4OPT |
+ MLX4_CQE_STATUS_IPV6 |
+ MLX4_CQE_STATUS_IPOK)) ==
+ cpu_to_be16(MLX4_CQE_STATUS_IPV4 |
+ MLX4_CQE_STATUS_IPOK)) &&
+ (status & cpu_to_be16(MLX4_CQE_STATUS_UDP |
+ MLX4_CQE_STATUS_TCP)) &&
+ checksum == cpu_to_be16(0xffff);
+}
+
+static int use_tunnel_data(struct mlx4_ib_qp *qp, struct mlx4_ib_cq *cq, struct ib_wc *wc,
+ unsigned tail, struct mlx4_cqe *cqe, int is_eth)
+{
+ struct mlx4_ib_proxy_sqp_hdr *hdr;
+
+ ib_dma_sync_single_for_cpu(qp->ibqp.device,
+ qp->sqp_proxy_rcv[tail].map,
+ sizeof (struct mlx4_ib_proxy_sqp_hdr),
+ DMA_FROM_DEVICE);
+ hdr = (struct mlx4_ib_proxy_sqp_hdr *) (qp->sqp_proxy_rcv[tail].addr);
+ wc->pkey_index = be16_to_cpu(hdr->tun.pkey_index);
+ wc->src_qp = be32_to_cpu(hdr->tun.flags_src_qp) & 0xFFFFFF;
+ wc->wc_flags |= (hdr->tun.g_ml_path & 0x80) ? (IB_WC_GRH) : 0;
+ wc->dlid_path_bits = 0;
+
+ if (is_eth) {
+ wc->vlan_id = be16_to_cpu(hdr->tun.sl_vid);
+ memcpy(&(wc->smac[0]), (char *)&hdr->tun.mac_31_0, 4);
+ memcpy(&(wc->smac[4]), (char *)&hdr->tun.slid_mac_47_32, 2);
+ wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC);
+ } else {
+ wc->slid = be16_to_cpu(hdr->tun.slid_mac_47_32);
+ wc->sl = (u8) (be16_to_cpu(hdr->tun.sl_vid) >> 12);
+ }
+
+ return 0;
+}
+
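+/*
+ * Generate software FLUSH_ERR completions for the outstanding WQEs of one
+ * work queue; used when the device is in internal error and real CQEs will
+ * never arrive.
+ */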
+static void mlx4_ib_qp_sw_comp(struct mlx4_ib_qp *qp, int num_entries,
+ struct ib_wc *wc, int *npolled, int is_send)
+{
+ struct mlx4_ib_wq *wq;
+ unsigned cur;
+ int i;
+
+ wq = is_send ? &qp->sq : &qp->rq;
+ cur = wq->head - wq->tail;
+
+ if (cur == 0)
+ return;
+
+ for (i = 0; i < cur && *npolled < num_entries; i++) {
+ wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+ wc->status = IB_WC_WR_FLUSH_ERR;
+ wc->vendor_err = MLX4_CQE_SYNDROME_WR_FLUSH_ERR;
+ wq->tail++;
+ (*npolled)++;
+ wc->qp = &qp->ibqp;
+ wc++;
+ }
+}
+
+static void mlx4_ib_poll_sw_comp(struct mlx4_ib_cq *cq, int num_entries,
+ struct ib_wc *wc, int *npolled)
+{
+ struct mlx4_ib_qp *qp;
+
+ *npolled = 0;
+ /* Find uncompleted WQEs belonging to that cq and return
+ * simulated FLUSH_ERR completions
+ */
+ list_for_each_entry(qp, &cq->send_qp_list, cq_send_list) {
+ mlx4_ib_qp_sw_comp(qp, num_entries, wc, npolled, 1);
+ if (*npolled >= num_entries)
+ goto out;
+ }
+
+ list_for_each_entry(qp, &cq->recv_qp_list, cq_recv_list) {
+ mlx4_ib_qp_sw_comp(qp, num_entries, wc + *npolled, npolled, 0);
+ if (*npolled >= num_entries)
+ goto out;
+ }
+
+out:
+ return;
+}
+
+static int mlx4_ib_poll_one(struct mlx4_ib_cq *cq,
+ struct mlx4_ib_qp **cur_qp,
+ struct ib_wc *wc)
+{
+ struct mlx4_cqe *cqe;
+ struct mlx4_qp *mqp;
+ struct mlx4_ib_wq *wq;
+ struct mlx4_ib_srq *srq;
+ struct mlx4_srq *msrq = NULL;
+ int is_send;
+ int is_error;
+ int is_eth;
+ u32 g_mlpath_rqpn;
+ u16 wqe_ctr;
+ unsigned tail = 0;
+
+repoll:
+ cqe = next_cqe_sw(cq);
+ if (!cqe)
+ return -EAGAIN;
+
+ if (cq->buf.entry_size == 64)
+ cqe++;
+
+ ++cq->mcq.cons_index;
+
+ /*
+ * Make sure we read CQ entry contents after we've checked the
+ * ownership bit.
+ */
+ rmb();
+
+ is_send = cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK;
+ is_error = (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) ==
+ MLX4_CQE_OPCODE_ERROR;
+
+ if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_OPCODE_NOP &&
+ is_send)) {
+ pr_warn("Completion for NOP opcode detected!\n");
+ return -EINVAL;
+ }
+
+ /* Resize CQ in progress */
+ if (unlikely((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) == MLX4_CQE_OPCODE_RESIZE)) {
+ if (cq->resize_buf) {
+ struct mlx4_ib_dev *dev = to_mdev(cq->ibcq.device);
+
+ mlx4_ib_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);
+ cq->buf = cq->resize_buf->buf;
+ cq->ibcq.cqe = cq->resize_buf->cqe;
+
+ kfree(cq->resize_buf);
+ cq->resize_buf = NULL;
+ }
+
+ goto repoll;
+ }
+
+ if (!*cur_qp ||
+ (be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) != (*cur_qp)->mqp.qpn) {
+ /*
+ * We do not have to take the QP table lock here,
+ * because CQs will be locked while QPs are removed
+ * from the table.
+ */
+ mqp = __mlx4_qp_lookup(to_mdev(cq->ibcq.device)->dev,
+ be32_to_cpu(cqe->vlan_my_qpn));
+ if (unlikely(!mqp)) {
+ pr_warn("CQ %06x with entry for unknown QPN %06x\n",
+ cq->mcq.cqn, be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK);
+ return -EINVAL;
+ }
+
+ *cur_qp = to_mibqp(mqp);
+ }
+
+ wc->qp = &(*cur_qp)->ibqp;
+
+ if (wc->qp->qp_type == IB_QPT_XRC_TGT) {
+ u32 srq_num;
+ g_mlpath_rqpn = be32_to_cpu(cqe->g_mlpath_rqpn);
+ srq_num = g_mlpath_rqpn & 0xffffff;
+ /* SRQ is also in the radix tree */
+ msrq = mlx4_srq_lookup(to_mdev(cq->ibcq.device)->dev,
+ srq_num);
+ if (unlikely(!msrq)) {
+ pr_warn("CQ %06x with entry for unknown SRQN %06x\n",
+ cq->mcq.cqn, srq_num);
+ return -EINVAL;
+ }
+ }
+
+ if (is_send) {
+ wq = &(*cur_qp)->sq;
+ if (!(*cur_qp)->sq_signal_bits) {
+ wqe_ctr = be16_to_cpu(cqe->wqe_index);
+ wq->tail += (u16) (wqe_ctr - (u16) wq->tail);
+ }
+ wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+ ++wq->tail;
+ } else if ((*cur_qp)->ibqp.srq) {
+ srq = to_msrq((*cur_qp)->ibqp.srq);
+ wqe_ctr = be16_to_cpu(cqe->wqe_index);
+ wc->wr_id = srq->wrid[wqe_ctr];
+ mlx4_ib_free_srq_wqe(srq, wqe_ctr);
+ } else if (msrq) {
+ srq = to_mibsrq(msrq);
+ wqe_ctr = be16_to_cpu(cqe->wqe_index);
+ wc->wr_id = srq->wrid[wqe_ctr];
+ mlx4_ib_free_srq_wqe(srq, wqe_ctr);
+ } else {
+ wq = &(*cur_qp)->rq;
+ tail = wq->tail & (wq->wqe_cnt - 1);
+ wc->wr_id = wq->wrid[tail];
+ ++wq->tail;
+ }
+
+ if (unlikely(is_error)) {
+ mlx4_ib_handle_error_cqe((struct mlx4_err_cqe *) cqe, wc);
+ return 0;
+ }
+
+ wc->status = IB_WC_SUCCESS;
+
+ if (is_send) {
+ wc->wc_flags = 0;
+ switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+ case MLX4_OPCODE_RDMA_WRITE_IMM:
+ wc->wc_flags |= IB_WC_WITH_IMM;
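+ /* fall through */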
+ case MLX4_OPCODE_RDMA_WRITE:
+ wc->opcode = IB_WC_RDMA_WRITE;
+ break;
+ case MLX4_OPCODE_SEND_IMM:
+ wc->wc_flags |= IB_WC_WITH_IMM;
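+ /* fall through */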
+ case MLX4_OPCODE_SEND:
+ case MLX4_OPCODE_SEND_INVAL:
+ wc->opcode = IB_WC_SEND;
+ break;
+ case MLX4_OPCODE_RDMA_READ:
+ wc->opcode = IB_WC_RDMA_READ;
+ wc->byte_len = be32_to_cpu(cqe->byte_cnt);
+ break;
+ case MLX4_OPCODE_ATOMIC_CS:
+ wc->opcode = IB_WC_COMP_SWAP;
+ wc->byte_len = 8;
+ break;
+ case MLX4_OPCODE_ATOMIC_FA:
+ wc->opcode = IB_WC_FETCH_ADD;
+ wc->byte_len = 8;
+ break;
+ case MLX4_OPCODE_MASKED_ATOMIC_CS:
+ wc->opcode = IB_WC_MASKED_COMP_SWAP;
+ wc->byte_len = 8;
+ break;
+ case MLX4_OPCODE_MASKED_ATOMIC_FA:
+ wc->opcode = IB_WC_MASKED_FETCH_ADD;
+ wc->byte_len = 8;
+ break;
+ case MLX4_OPCODE_BIND_MW:
+ wc->opcode = IB_WC_BIND_MW;
+ break;
+ case MLX4_OPCODE_LSO:
+ wc->opcode = IB_WC_LSO;
+ break;
+ case MLX4_OPCODE_FMR:
+ wc->opcode = IB_WC_FAST_REG_MR;
+ break;
+ case MLX4_OPCODE_LOCAL_INVAL:
+ wc->opcode = IB_WC_LOCAL_INV;
+ break;
+ }
+ } else {
+ wc->byte_len = be32_to_cpu(cqe->byte_cnt);
+
+ switch (cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) {
+ case MLX4_RECV_OPCODE_RDMA_WRITE_IMM:
+ wc->opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ wc->wc_flags = IB_WC_WITH_IMM;
+ wc->ex.imm_data = cqe->immed_rss_invalid;
+ break;
+ case MLX4_RECV_OPCODE_SEND_INVAL:
+ wc->opcode = IB_WC_RECV;
+ wc->wc_flags = IB_WC_WITH_INVALIDATE;
+ wc->ex.invalidate_rkey = be32_to_cpu(cqe->immed_rss_invalid);
+ break;
+ case MLX4_RECV_OPCODE_SEND:
+ wc->opcode = IB_WC_RECV;
+ wc->wc_flags = 0;
+ break;
+ case MLX4_RECV_OPCODE_SEND_IMM:
+ wc->opcode = IB_WC_RECV;
+ wc->wc_flags = IB_WC_WITH_IMM;
+ wc->ex.imm_data = cqe->immed_rss_invalid;
+ break;
+ }
+
+ is_eth = (rdma_port_get_link_layer(wc->qp->device,
+ (*cur_qp)->port) ==
+ IB_LINK_LAYER_ETHERNET);
+ if (mlx4_is_mfunc(to_mdev(cq->ibcq.device)->dev)) {
+ if ((*cur_qp)->mlx4_ib_qp_type &
+ (MLX4_IB_QPT_PROXY_SMI_OWNER |
+ MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI))
+ return use_tunnel_data(*cur_qp, cq, wc, tail,
+ cqe, is_eth);
+ }
+
+ wc->slid = be16_to_cpu(cqe->rlid);
+ g_mlpath_rqpn = be32_to_cpu(cqe->g_mlpath_rqpn);
+ wc->src_qp = g_mlpath_rqpn & 0xffffff;
+ wc->dlid_path_bits = (g_mlpath_rqpn >> 24) & 0x7f;
+ wc->wc_flags |= g_mlpath_rqpn & 0x80000000 ? IB_WC_GRH : 0;
+ wc->pkey_index = be32_to_cpu(cqe->immed_rss_invalid) & 0x7f;
+ wc->wc_flags |= mlx4_ib_ipoib_csum_ok(cqe->status,
+ cqe->checksum) ? IB_WC_IP_CSUM_OK : 0;
+ if (is_eth) {
+ wc->sl = be16_to_cpu(cqe->sl_vid) >> 13;
+ if (be32_to_cpu(cqe->vlan_my_qpn) &
+ MLX4_CQE_VLAN_PRESENT_MASK) {
+ wc->vlan_id = be16_to_cpu(cqe->sl_vid) &
+ MLX4_CQE_VID_MASK;
+ } else {
+ wc->vlan_id = 0xffff;
+ }
+ memcpy(wc->smac, cqe->smac, ETH_ALEN);
+ wc->wc_flags |= (IB_WC_WITH_VLAN | IB_WC_WITH_SMAC);
+ } else {
+ wc->sl = be16_to_cpu(cqe->sl_vid) >> 12;
+ wc->vlan_id = 0xffff;
+ }
+ }
+
+ return 0;
+}
+
+int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
+{
+ struct mlx4_ib_cq *cq = to_mcq(ibcq);
+ struct mlx4_ib_qp *cur_qp = NULL;
+ unsigned long flags;
+ int npolled;
+ int err = 0;
+ struct mlx4_ib_dev *mdev = to_mdev(cq->ibcq.device);
+
+ spin_lock_irqsave(&cq->lock, flags);
+ if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
+ mlx4_ib_poll_sw_comp(cq, num_entries, wc, &npolled);
+ goto out;
+ }
+
+ for (npolled = 0; npolled < num_entries; ++npolled) {
+ err = mlx4_ib_poll_one(cq, &cur_qp, wc + npolled);
+ if (err)
+ break;
+ }
+
+ mlx4_cq_set_ci(&cq->mcq);
+
+out:
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+ if (err == 0 || err == -EAGAIN)
+ return npolled;
+ else
+ return err;
+}
+
+int mlx4_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
+{
+ mlx4_cq_arm(&to_mcq(ibcq)->mcq,
+ (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ?
+ MLX4_CQ_DB_REQ_NOT_SOL : MLX4_CQ_DB_REQ_NOT,
+ to_mdev(ibcq->device)->uar_map,
+ MLX4_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->uar_lock));
+
+ return 0;
+}
+
+void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq)
+{
+ u32 prod_index;
+ int nfreed = 0;
+ struct mlx4_cqe *cqe, *dest;
+ u8 owner_bit;
+ int cqe_inc = cq->buf.entry_size == 64 ? 1 : 0;
+
+ /*
+ * First we need to find the current producer index, so we
+ * know where to start cleaning from. It doesn't matter if HW
+ * adds new entries after this loop -- the QP we're worried
+ * about is already in RESET, so the new entries won't come
+ * from our QP and therefore don't need to be checked.
+ */
+ for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); ++prod_index)
+ if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe)
+ break;
+
+ /*
+ * Now sweep backwards through the CQ, removing CQ entries
+ * that match our QP by copying older entries on top of them.
+ */
+ while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) {
+ cqe = get_cqe(cq, prod_index & cq->ibcq.cqe);
+ cqe += cqe_inc;
+
+ if ((be32_to_cpu(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK) == qpn) {
+ if (srq && !(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK))
+ mlx4_ib_free_srq_wqe(srq, be16_to_cpu(cqe->wqe_index));
+ ++nfreed;
+ } else if (nfreed) {
+ dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe);
+ dest += cqe_inc;
+
+ owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK;
+ memcpy(dest, cqe, sizeof *cqe);
+ dest->owner_sr_opcode = owner_bit |
+ (dest->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK);
+ }
+ }
+
+ if (nfreed) {
+ cq->mcq.cons_index += nfreed;
+ /*
+ * Make sure update of buffer contents is done before
+ * updating consumer index.
+ */
+ wmb();
+ mlx4_cq_set_ci(&cq->mcq);
+ }
+}
+
+void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq)
+{
+ spin_lock_irq(&cq->lock);
+ __mlx4_ib_cq_clean(cq, qpn, srq);
+ spin_unlock_irq(&cq->lock);
+}
diff --git a/drivers/infiniband/hw/mlx4/doorbell.c b/drivers/infiniband/hw/mlx4/doorbell.c
new file mode 100644
index 000000000..c51740986
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/doorbell.c
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+
+#include "mlx4_ib.h"
+
+struct mlx4_ib_user_db_page {
+ struct list_head list;
+ struct ib_umem *umem;
+ unsigned long user_virt;
+ int refcnt;
+};
+
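+/*
+ * Map a userspace doorbell record: doorbell pages are shared and
+ * reference-counted per user context, so an already-pinned page is reused
+ * and only new pages are pinned with ib_umem_get().
+ */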
+int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt,
+ struct mlx4_db *db)
+{
+ struct mlx4_ib_user_db_page *page;
+ int err = 0;
+
+ mutex_lock(&context->db_page_mutex);
+
+ list_for_each_entry(page, &context->db_page_list, list)
+ if (page->user_virt == (virt & PAGE_MASK))
+ goto found;
+
+ page = kmalloc(sizeof *page, GFP_KERNEL);
+ if (!page) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ page->user_virt = (virt & PAGE_MASK);
+ page->refcnt = 0;
+ page->umem = ib_umem_get(&context->ibucontext, virt & PAGE_MASK,
+ PAGE_SIZE, 0, 0);
+ if (IS_ERR(page->umem)) {
+ err = PTR_ERR(page->umem);
+ kfree(page);
+ goto out;
+ }
+
+ list_add(&page->list, &context->db_page_list);
+
+found:
+ db->dma = sg_dma_address(page->umem->sg_head.sgl) + (virt & ~PAGE_MASK);
+ db->u.user_page = page;
+ ++page->refcnt;
+
+out:
+ mutex_unlock(&context->db_page_mutex);
+
+ return err;
+}
+
+void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db)
+{
+ mutex_lock(&context->db_page_mutex);
+
+ if (!--db->u.user_page->refcnt) {
+ list_del(&db->u.user_page->list);
+ ib_umem_release(db->u.user_page->umem);
+ kfree(db->u.user_page);
+ }
+
+ mutex_unlock(&context->db_page_mutex);
+}
diff --git a/drivers/infiniband/hw/mlx4/mad.c b/drivers/infiniband/hw/mlx4/mad.c
new file mode 100644
index 000000000..9cd2b002d
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/mad.c
@@ -0,0 +1,2185 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_sa.h>
+#include <rdma/ib_cache.h>
+
+#include <linux/random.h>
+#include <linux/mlx4/cmd.h>
+#include <linux/gfp.h>
+#include <rdma/ib_pma.h>
+
+#include "mlx4_ib.h"
+
+enum {
+ MLX4_IB_VENDOR_CLASS1 = 0x9,
+ MLX4_IB_VENDOR_CLASS2 = 0xa
+};
+
+#define MLX4_TUN_SEND_WRID_SHIFT 34
+#define MLX4_TUN_QPN_SHIFT 32
+#define MLX4_TUN_WRID_RECV (((u64) 1) << MLX4_TUN_SEND_WRID_SHIFT)
+#define MLX4_TUN_SET_WRID_QPN(a) (((u64) ((a) & 0x3)) << MLX4_TUN_QPN_SHIFT)
+
+#define MLX4_TUN_IS_RECV(a) (((a) >> MLX4_TUN_SEND_WRID_SHIFT) & 0x1)
+#define MLX4_TUN_WRID_QPN(a) (((a) >> MLX4_TUN_QPN_SHIFT) & 0x3)
+
+ /* Port mgmt change event handling */
+
+#define GET_BLK_PTR_FROM_EQE(eqe) be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.block_ptr)
+#define GET_MASK_FROM_EQE(eqe) be32_to_cpu(eqe->event.port_mgmt_change.params.tbl_change_info.tbl_entries_mask)
+#define NUM_IDX_IN_PKEY_TBL_BLK 32
+#define GUID_TBL_ENTRY_SIZE 8 /* size in bytes */
+#define GUID_TBL_BLK_NUM_ENTRIES 8
+#define GUID_TBL_BLK_SIZE (GUID_TBL_ENTRY_SIZE * GUID_TBL_BLK_NUM_ENTRIES)
+
+/* Counters should saturate once they reach their maximum value */
+#define ASSIGN_32BIT_COUNTER(counter, value) do {\
+ if ((value) > U32_MAX) \
+ counter = cpu_to_be32(U32_MAX); \
+ else \
+ counter = cpu_to_be32(value); \
+} while (0)
+
+struct mlx4_mad_rcv_buf {
+ struct ib_grh grh;
+ u8 payload[256];
+} __packed;
+
+struct mlx4_mad_snd_buf {
+ u8 payload[256];
+} __packed;
+
+struct mlx4_tunnel_mad {
+ struct ib_grh grh;
+ struct mlx4_ib_tunnel_header hdr;
+ struct ib_mad mad;
+} __packed;
+
+struct mlx4_rcv_tunnel_mad {
+ struct mlx4_rcv_tunnel_hdr hdr;
+ struct ib_grh grh;
+ struct ib_mad mad;
+} __packed;
+
+static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num);
+static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num);
+static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num,
+ int block, u32 change_bitmap);
+
+__be64 mlx4_ib_gen_node_guid(void)
+{
+#define NODE_GUID_HI ((u64) (((u64)IB_OPENIB_OUI) << 40))
+ return cpu_to_be64(NODE_GUID_HI | prandom_u32());
+}
+
+__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx)
+{
+ return cpu_to_be64(atomic_inc_return(&ctx->tid)) |
+ cpu_to_be64(0xff00000000000000LL);
+}
+
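+/*
+ * Issue the MAD_IFC firmware command: the MAD is copied into a command
+ * mailbox, optional WC/GRH info is appended so the firmware has what it
+ * needs to generate key-violation traps, and the response MAD is copied
+ * back out on success.
+ */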
+int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags,
+ int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
+ void *in_mad, void *response_mad)
+{
+ struct mlx4_cmd_mailbox *inmailbox, *outmailbox;
+ void *inbox;
+ int err;
+ u32 in_modifier = port;
+ u8 op_modifier = 0;
+
+ inmailbox = mlx4_alloc_cmd_mailbox(dev->dev);
+ if (IS_ERR(inmailbox))
+ return PTR_ERR(inmailbox);
+ inbox = inmailbox->buf;
+
+ outmailbox = mlx4_alloc_cmd_mailbox(dev->dev);
+ if (IS_ERR(outmailbox)) {
+ mlx4_free_cmd_mailbox(dev->dev, inmailbox);
+ return PTR_ERR(outmailbox);
+ }
+
+ memcpy(inbox, in_mad, 256);
+
+ /*
+ * Key check traps can't be generated unless we have in_wc to
+ * tell us where to send the trap.
+ */
+ if ((mad_ifc_flags & MLX4_MAD_IFC_IGNORE_MKEY) || !in_wc)
+ op_modifier |= 0x1;
+ if ((mad_ifc_flags & MLX4_MAD_IFC_IGNORE_BKEY) || !in_wc)
+ op_modifier |= 0x2;
+ if (mlx4_is_mfunc(dev->dev) &&
+ (mad_ifc_flags & MLX4_MAD_IFC_NET_VIEW || in_wc))
+ op_modifier |= 0x8;
+
+ if (in_wc) {
+ struct {
+ __be32 my_qpn;
+ u32 reserved1;
+ __be32 rqpn;
+ u8 sl;
+ u8 g_path;
+ u16 reserved2[2];
+ __be16 pkey;
+ u32 reserved3[11];
+ u8 grh[40];
+ } *ext_info;
+
+ memset(inbox + 256, 0, 256);
+ ext_info = inbox + 256;
+
+ ext_info->my_qpn = cpu_to_be32(in_wc->qp->qp_num);
+ ext_info->rqpn = cpu_to_be32(in_wc->src_qp);
+ ext_info->sl = in_wc->sl << 4;
+ ext_info->g_path = in_wc->dlid_path_bits |
+ (in_wc->wc_flags & IB_WC_GRH ? 0x80 : 0);
+ ext_info->pkey = cpu_to_be16(in_wc->pkey_index);
+
+ if (in_grh)
+ memcpy(ext_info->grh, in_grh, 40);
+
+ op_modifier |= 0x4;
+
+ in_modifier |= in_wc->slid << 16;
+ }
+
+ err = mlx4_cmd_box(dev->dev, inmailbox->dma, outmailbox->dma, in_modifier,
+ mlx4_is_master(dev->dev) ? (op_modifier & ~0x8) : op_modifier,
+ MLX4_CMD_MAD_IFC, MLX4_CMD_TIME_CLASS_C,
+ (op_modifier & 0x8) ? MLX4_CMD_NATIVE : MLX4_CMD_WRAPPED);
+
+ if (!err)
+ memcpy(response_mad, outmailbox->buf, 256);
+
+ mlx4_free_cmd_mailbox(dev->dev, inmailbox);
+ mlx4_free_cmd_mailbox(dev->dev, outmailbox);
+
+ return err;
+}
+
+static void update_sm_ah(struct mlx4_ib_dev *dev, u8 port_num, u16 lid, u8 sl)
+{
+ struct ib_ah *new_ah;
+ struct ib_ah_attr ah_attr;
+ unsigned long flags;
+
+ if (!dev->send_agent[port_num - 1][0])
+ return;
+
+ memset(&ah_attr, 0, sizeof ah_attr);
+ ah_attr.dlid = lid;
+ ah_attr.sl = sl;
+ ah_attr.port_num = port_num;
+
+ new_ah = ib_create_ah(dev->send_agent[port_num - 1][0]->qp->pd,
+ &ah_attr);
+ if (IS_ERR(new_ah))
+ return;
+
+ spin_lock_irqsave(&dev->sm_lock, flags);
+ if (dev->sm_ah[port_num - 1])
+ ib_destroy_ah(dev->sm_ah[port_num - 1]);
+ dev->sm_ah[port_num - 1] = new_ah;
+ spin_unlock_irqrestore(&dev->sm_lock, flags);
+}
+
+/*
+ * Snoop SM MADs for port info, GUID info, and P_Key table sets, so we can
+ * synthesize LID change, Client-Rereg, GID change, and P_Key change events.
+ */
+static void smp_snoop(struct ib_device *ibdev, u8 port_num, struct ib_mad *mad,
+ u16 prev_lid)
+{
+ struct ib_port_info *pinfo;
+ u16 lid;
+ __be16 *base;
+ u32 bn, pkey_change_bitmap;
+ int i;
+
+
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+ mad->mad_hdr.method == IB_MGMT_METHOD_SET)
+ switch (mad->mad_hdr.attr_id) {
+ case IB_SMP_ATTR_PORT_INFO:
+ pinfo = (struct ib_port_info *) ((struct ib_smp *) mad)->data;
+ lid = be16_to_cpu(pinfo->lid);
+
+ update_sm_ah(dev, port_num,
+ be16_to_cpu(pinfo->sm_lid),
+ pinfo->neighbormtu_mastersmsl & 0xf);
+
+ if (pinfo->clientrereg_resv_subnetto & 0x80)
+ handle_client_rereg_event(dev, port_num);
+
+ if (prev_lid != lid)
+ handle_lid_change_event(dev, port_num);
+ break;
+
+ case IB_SMP_ATTR_PKEY_TABLE:
+ if (!mlx4_is_mfunc(dev->dev)) {
+ mlx4_ib_dispatch_event(dev, port_num,
+ IB_EVENT_PKEY_CHANGE);
+ break;
+ }
+
+ /* at this point, we are running in the master.
+ * Slaves do not receive SMPs.
+ */
+ bn = be32_to_cpu(((struct ib_smp *)mad)->attr_mod) & 0xFFFF;
+ base = (__be16 *) &(((struct ib_smp *)mad)->data[0]);
+ pkey_change_bitmap = 0;
+ for (i = 0; i < 32; i++) {
+ pr_debug("PKEY[%d] = x%x\n",
+ i + bn*32, be16_to_cpu(base[i]));
+ if (be16_to_cpu(base[i]) !=
+ dev->pkeys.phys_pkey_cache[port_num - 1][i + bn*32]) {
+ pkey_change_bitmap |= (1 << i);
+ dev->pkeys.phys_pkey_cache[port_num - 1][i + bn*32] =
+ be16_to_cpu(base[i]);
+ }
+ }
+ pr_debug("PKEY Change event: port=%d, "
+ "block=0x%x, change_bitmap=0x%x\n",
+ port_num, bn, pkey_change_bitmap);
+
+ if (pkey_change_bitmap) {
+ mlx4_ib_dispatch_event(dev, port_num,
+ IB_EVENT_PKEY_CHANGE);
+ if (!dev->sriov.is_going_down)
+ __propagate_pkey_ev(dev, port_num, bn,
+ pkey_change_bitmap);
+ }
+ break;
+
+ case IB_SMP_ATTR_GUID_INFO:
+ /* paravirtualized master's guid is guid 0 -- does not change */
+ if (!mlx4_is_master(dev->dev))
+ mlx4_ib_dispatch_event(dev, port_num,
+ IB_EVENT_GID_CHANGE);
+ /* if master, notify relevant slaves */
+ if (mlx4_is_master(dev->dev) &&
+ !dev->sriov.is_going_down) {
+ bn = be32_to_cpu(((struct ib_smp *)mad)->attr_mod);
+ mlx4_ib_update_cache_on_guid_change(dev, bn, port_num,
+ (u8 *)(&((struct ib_smp *)mad)->data));
+ mlx4_ib_notify_slaves_on_guid_change(dev, bn, port_num,
+ (u8 *)(&((struct ib_smp *)mad)->data));
+ }
+ break;
+
+ default:
+ break;
+ }
+}
+
+static void __propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num,
+ int block, u32 change_bitmap)
+{
+ int i, ix, slave, err;
+ int have_event = 0;
+
+ for (slave = 0; slave < dev->dev->caps.sqp_demux; slave++) {
+ if (slave == mlx4_master_func_num(dev->dev))
+ continue;
+ if (!mlx4_is_slave_active(dev->dev, slave))
+ continue;
+
+ have_event = 0;
+ for (i = 0; i < 32; i++) {
+ if (!(change_bitmap & (1 << i)))
+ continue;
+ for (ix = 0;
+ ix < dev->dev->caps.pkey_table_len[port_num]; ix++) {
+ if (dev->pkeys.virt2phys_pkey[slave][port_num - 1]
+ [ix] == i + 32 * block) {
+ err = mlx4_gen_pkey_eqe(dev->dev, slave, port_num);
+ pr_debug("propagate_pkey_ev: slave %d,"
+ " port %d, ix %d (%d)\n",
+ slave, port_num, ix, err);
+ have_event = 1;
+ break;
+ }
+ }
+ if (have_event)
+ break;
+ }
+ }
+}
+
+static void node_desc_override(struct ib_device *dev,
+ struct ib_mad *mad)
+{
+ unsigned long flags;
+
+ if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+ mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP &&
+ mad->mad_hdr.attr_id == IB_SMP_ATTR_NODE_DESC) {
+ spin_lock_irqsave(&to_mdev(dev)->sm_lock, flags);
+ memcpy(((struct ib_smp *) mad)->data, dev->node_desc, 64);
+ spin_unlock_irqrestore(&to_mdev(dev)->sm_lock, flags);
+ }
+}
+
+static void forward_trap(struct mlx4_ib_dev *dev, u8 port_num, struct ib_mad *mad)
+{
+ int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED;
+ struct ib_mad_send_buf *send_buf;
+ struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn];
+ int ret;
+ unsigned long flags;
+
+ if (agent) {
+ send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR,
+ IB_MGMT_MAD_DATA, GFP_ATOMIC);
+ if (IS_ERR(send_buf))
+ return;
+ /*
+ * We rely here on the fact that MLX QPs don't use the
+ * address handle after the send is posted (strictly
+ * speaking this violates the IB spec, but we know
+ * it's OK for our devices).
+ */
+ spin_lock_irqsave(&dev->sm_lock, flags);
+ memcpy(send_buf->mad, mad, sizeof *mad);
+ if ((send_buf->ah = dev->sm_ah[port_num - 1]))
+ ret = ib_post_send_mad(send_buf, NULL);
+ else
+ ret = -EINVAL;
+ spin_unlock_irqrestore(&dev->sm_lock, flags);
+
+ if (ret)
+ ib_free_send_mad(send_buf);
+ }
+}
+
+static int mlx4_ib_demux_sa_handler(struct ib_device *ibdev, int port, int slave,
+ struct ib_sa_mad *sa_mad)
+{
+ int ret = 0;
+
+ /* dispatch to different sa handlers */
+ switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) {
+ case IB_SA_ATTR_MC_MEMBER_REC:
+ ret = mlx4_ib_mcg_demux_handler(ibdev, port, slave, sa_mad);
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+
+int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ int i;
+
+ for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
+ if (dev->sriov.demux[port - 1].guid_cache[i] == guid)
+ return i;
+ }
+ return -1;
+}
+
+
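+/*
+ * Find a physical P_Key index that is mapped into the given slave's virtual
+ * P_Key table and matches the given P_Key value, preferring a
+ * full-membership entry over a partial-membership one.
+ */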
+static int find_slave_port_pkey_ix(struct mlx4_ib_dev *dev, int slave,
+ u8 port, u16 pkey, u16 *ix)
+{
+ int i, ret;
+ u8 unassigned_pkey_ix, pkey_ix, partial_ix = 0xFF;
+ u16 slot_pkey;
+
+ if (slave == mlx4_master_func_num(dev->dev))
+ return ib_find_cached_pkey(&dev->ib_dev, port, pkey, ix);
+
+ unassigned_pkey_ix = dev->dev->phys_caps.pkey_phys_table_len[port] - 1;
+
+ for (i = 0; i < dev->dev->caps.pkey_table_len[port]; i++) {
+ if (dev->pkeys.virt2phys_pkey[slave][port - 1][i] == unassigned_pkey_ix)
+ continue;
+
+ pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][i];
+
+ ret = ib_get_cached_pkey(&dev->ib_dev, port, pkey_ix, &slot_pkey);
+ if (ret)
+ continue;
+ if ((slot_pkey & 0x7FFF) == (pkey & 0x7FFF)) {
+ if (slot_pkey & 0x8000) {
+ *ix = (u16) pkey_ix;
+ return 0;
+ } else {
+ /* take first partial pkey index found */
+ if (partial_ix == 0xFF)
+ partial_ix = pkey_ix;
+ }
+ }
+ }
+
+ if (partial_ix < 0xFF) {
+ *ix = (u16) partial_ix;
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
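+/*
+ * Tunnel a MAD received from the wire to a slave: the MAD (plus GRH and wire
+ * metadata) is wrapped in a tunnel header and posted on the tunnel QP
+ * towards the slave's proxy QP.
+ */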
+int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,
+ enum ib_qp_type dest_qpt, struct ib_wc *wc,
+ struct ib_grh *grh, struct ib_mad *mad)
+{
+ struct ib_sge list;
+ struct ib_send_wr wr, *bad_wr;
+ struct mlx4_ib_demux_pv_ctx *tun_ctx;
+ struct mlx4_ib_demux_pv_qp *tun_qp;
+ struct mlx4_rcv_tunnel_mad *tun_mad;
+ struct ib_ah_attr attr;
+ struct ib_ah *ah;
+ struct ib_qp *src_qp = NULL;
+ unsigned tun_tx_ix = 0;
+ int dqpn;
+ int ret = 0;
+ u16 tun_pkey_ix;
+ u16 cached_pkey;
+ u8 is_eth = dev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH;
+
+ if (dest_qpt > IB_QPT_GSI)
+ return -EINVAL;
+
+ tun_ctx = dev->sriov.demux[port-1].tun[slave];
+
+ /* check if the proxy qp has been created */
+ if (!tun_ctx || tun_ctx->state != DEMUX_PV_STATE_ACTIVE)
+ return -EAGAIN;
+
+ if (!dest_qpt)
+ tun_qp = &tun_ctx->qp[0];
+ else
+ tun_qp = &tun_ctx->qp[1];
+
+ /* compute P_Key index to put in tunnel header for slave */
+ if (dest_qpt) {
+ u16 pkey_ix;
+ ret = ib_get_cached_pkey(&dev->ib_dev, port, wc->pkey_index, &cached_pkey);
+ if (ret)
+ return -EINVAL;
+
+ ret = find_slave_port_pkey_ix(dev, slave, port, cached_pkey, &pkey_ix);
+ if (ret)
+ return -EINVAL;
+ tun_pkey_ix = pkey_ix;
+ } else
+ tun_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0];
+
+ dqpn = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave + port + (dest_qpt * 2) - 1;
+
+ /* get tunnel tx data buf for slave */
+ src_qp = tun_qp->qp;
+
+ /* create ah. Just need an empty one with the port num for the post send.
+ * The driver will set the force loopback bit in post_send */
+ memset(&attr, 0, sizeof attr);
+ attr.port_num = port;
+ if (is_eth) {
+ memcpy(&attr.grh.dgid.raw[0], &grh->dgid.raw[0], 16);
+ attr.ah_flags = IB_AH_GRH;
+ }
+ ah = ib_create_ah(tun_ctx->pd, &attr);
+ if (IS_ERR(ah))
+ return -ENOMEM;
+
+ /* allocate a tunnel tx buffer slot now that the early failure checks have passed */
+ spin_lock(&tun_qp->tx_lock);
+ if (tun_qp->tx_ix_head - tun_qp->tx_ix_tail >=
+ (MLX4_NUM_TUNNEL_BUFS - 1))
+ ret = -EAGAIN;
+ else
+ tun_tx_ix = (++tun_qp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1);
+ spin_unlock(&tun_qp->tx_lock);
+ if (ret)
+ goto out;
+
+ tun_mad = (struct mlx4_rcv_tunnel_mad *) (tun_qp->tx_ring[tun_tx_ix].buf.addr);
+ if (tun_qp->tx_ring[tun_tx_ix].ah)
+ ib_destroy_ah(tun_qp->tx_ring[tun_tx_ix].ah);
+ tun_qp->tx_ring[tun_tx_ix].ah = ah;
+ ib_dma_sync_single_for_cpu(&dev->ib_dev,
+ tun_qp->tx_ring[tun_tx_ix].buf.map,
+ sizeof (struct mlx4_rcv_tunnel_mad),
+ DMA_TO_DEVICE);
+
+ /* copy over to tunnel buffer */
+ if (grh)
+ memcpy(&tun_mad->grh, grh, sizeof *grh);
+ memcpy(&tun_mad->mad, mad, sizeof *mad);
+
+ /* adjust tunnel data */
+ tun_mad->hdr.pkey_index = cpu_to_be16(tun_pkey_ix);
+ tun_mad->hdr.flags_src_qp = cpu_to_be32(wc->src_qp & 0xFFFFFF);
+ tun_mad->hdr.g_ml_path = (grh && (wc->wc_flags & IB_WC_GRH)) ? 0x80 : 0;
+
+ if (is_eth) {
+ u16 vlan = 0;
+ if (mlx4_get_slave_default_vlan(dev->dev, port, slave, &vlan,
+ NULL)) {
+ /* VST mode */
+ if (vlan != wc->vlan_id)
+ /* Packet vlan is not the VST-assigned vlan.
+ * Drop the packet.
+ */
+ goto out;
+ else
+ /* Remove the vlan tag before forwarding
+ * the packet to the VF.
+ */
+ vlan = 0xffff;
+ } else {
+ vlan = wc->vlan_id;
+ }
+
+ tun_mad->hdr.sl_vid = cpu_to_be16(vlan);
+ memcpy((char *)&tun_mad->hdr.mac_31_0, &(wc->smac[0]), 4);
+ memcpy((char *)&tun_mad->hdr.slid_mac_47_32, &(wc->smac[4]), 2);
+ } else {
+ tun_mad->hdr.sl_vid = cpu_to_be16(((u16)(wc->sl)) << 12);
+ tun_mad->hdr.slid_mac_47_32 = cpu_to_be16(wc->slid);
+ }
+
+ ib_dma_sync_single_for_device(&dev->ib_dev,
+ tun_qp->tx_ring[tun_tx_ix].buf.map,
+ sizeof (struct mlx4_rcv_tunnel_mad),
+ DMA_TO_DEVICE);
+
+ list.addr = tun_qp->tx_ring[tun_tx_ix].buf.map;
+ list.length = sizeof (struct mlx4_rcv_tunnel_mad);
+ list.lkey = tun_ctx->mr->lkey;
+
+ wr.wr.ud.ah = ah;
+ wr.wr.ud.port_num = port;
+ wr.wr.ud.remote_qkey = IB_QP_SET_QKEY;
+ wr.wr.ud.remote_qpn = dqpn;
+ wr.next = NULL;
+ wr.wr_id = ((u64) tun_tx_ix) | MLX4_TUN_SET_WRID_QPN(dest_qpt);
+ wr.sg_list = &list;
+ wr.num_sge = 1;
+ wr.opcode = IB_WR_SEND;
+ wr.send_flags = IB_SEND_SIGNALED;
+
+ ret = ib_post_send(src_qp, &wr, &bad_wr);
+out:
+ if (ret)
+ ib_destroy_ah(ah);
+ return ret;
+}
+
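+/*
+ * Decide which slave an incoming wire MAD belongs to (via the GRH, the
+ * slave id encoded in the TID, or the CM/SA demux helpers) and tunnel it
+ * to that slave.
+ */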
+static int mlx4_ib_demux_mad(struct ib_device *ibdev, u8 port,
+ struct ib_wc *wc, struct ib_grh *grh,
+ struct ib_mad *mad)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ int err;
+ int slave;
+ u8 *slave_id;
+ int is_eth = 0;
+
+ if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND)
+ is_eth = 0;
+ else
+ is_eth = 1;
+
+ if (is_eth) {
+ if (!(wc->wc_flags & IB_WC_GRH)) {
+ mlx4_ib_warn(ibdev, "RoCE grh not present.\n");
+ return -EINVAL;
+ }
+ if (mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_CM) {
+ mlx4_ib_warn(ibdev, "RoCE mgmt class is not CM\n");
+ return -EINVAL;
+ }
+ if (mlx4_get_slave_from_roce_gid(dev->dev, port, grh->dgid.raw, &slave)) {
+ mlx4_ib_warn(ibdev, "failed matching grh\n");
+ return -ENOENT;
+ }
+ if (slave >= dev->dev->caps.sqp_demux) {
+ mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n",
+ slave, dev->dev->caps.sqp_demux);
+ return -ENOENT;
+ }
+
+ if (mlx4_ib_demux_cm_handler(ibdev, port, NULL, mad))
+ return 0;
+
+ err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad);
+ if (err)
+ pr_debug("failed sending to slave %d via tunnel qp (%d)\n",
+ slave, err);
+ return 0;
+ }
+
+ /* Initially assume that this mad is for us */
+ slave = mlx4_master_func_num(dev->dev);
+
+ /* See if the slave id is encoded in a response mad */
+ if (mad->mad_hdr.method & 0x80) {
+ slave_id = (u8 *) &mad->mad_hdr.tid;
+ slave = *slave_id;
+ if (slave != 255) /* 255 indicates the dom0 */
+ *slave_id = 0; /* remap tid */
+ }
+
+ /* If a grh is present, we demux according to it */
+ if (wc->wc_flags & IB_WC_GRH) {
+ slave = mlx4_ib_find_real_gid(ibdev, port, grh->dgid.global.interface_id);
+ if (slave < 0) {
+ mlx4_ib_warn(ibdev, "failed matching grh\n");
+ return -ENOENT;
+ }
+ }
+ /* Class-specific handling */
+ switch (mad->mad_hdr.mgmt_class) {
+ case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+ case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+ /* 255 indicates the dom0 */
+ if (slave != 255 && slave != mlx4_master_func_num(dev->dev)) {
+ if (!mlx4_vf_smi_enabled(dev->dev, slave, port))
+ return -EPERM;
+ /* For a VF, drop unsolicited MADs */
+ if (!(mad->mad_hdr.method & IB_MGMT_METHOD_RESP)) {
+ mlx4_ib_warn(ibdev, "demux QP0. rejecting unsolicited mad for slave %d class 0x%x, method 0x%x\n",
+ slave, mad->mad_hdr.mgmt_class,
+ mad->mad_hdr.method);
+ return -EINVAL;
+ }
+ }
+ break;
+ case IB_MGMT_CLASS_SUBN_ADM:
+ if (mlx4_ib_demux_sa_handler(ibdev, port, slave,
+ (struct ib_sa_mad *) mad))
+ return 0;
+ break;
+ case IB_MGMT_CLASS_CM:
+ if (mlx4_ib_demux_cm_handler(ibdev, port, &slave, mad))
+ return 0;
+ break;
+ case IB_MGMT_CLASS_DEVICE_MGMT:
+ if (mad->mad_hdr.method != IB_MGMT_METHOD_GET_RESP)
+ return 0;
+ break;
+ default:
+ /* Drop unsupported classes for slaves in tunnel mode */
+ if (slave != mlx4_master_func_num(dev->dev)) {
+ pr_debug("dropping unsupported ingress mad from class:%d "
+ "for slave:%d\n", mad->mad_hdr.mgmt_class, slave);
+ return 0;
+ }
+ }
+ /* Reject out-of-range slave ids; this also catches slave == 255 (dom0) if it was not handled above. */
+ if (slave >= dev->dev->caps.sqp_demux) {
+ mlx4_ib_warn(ibdev, "slave id: %d is bigger than allowed:%d\n",
+ slave, dev->dev->caps.sqp_demux);
+ return -ENOENT;
+ }
+
+ err = mlx4_ib_send_to_slave(dev, slave, port, wc->qp->qp_type, wc, grh, mad);
+ if (err)
+ pr_debug("failed sending to slave %d via tunnel qp (%d)\n",
+ slave, err);
+ return 0;
+}
+
+static int ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+ struct ib_wc *in_wc, struct ib_grh *in_grh,
+ struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+ u16 slid, prev_lid = 0;
+ int err;
+ struct ib_port_attr pattr;
+
+ if (in_wc && in_wc->qp->qp_num) {
+ pr_debug("received MAD: slid:%d sqpn:%d "
+ "dlid_bits:%d dqpn:%d wc_flags:0x%x, cls %x, mtd %x, atr %x\n",
+ in_wc->slid, in_wc->src_qp,
+ in_wc->dlid_path_bits,
+ in_wc->qp->qp_num,
+ in_wc->wc_flags,
+ in_mad->mad_hdr.mgmt_class, in_mad->mad_hdr.method,
+ be16_to_cpu(in_mad->mad_hdr.attr_id));
+ if (in_wc->wc_flags & IB_WC_GRH) {
+ pr_debug("sgid_hi:0x%016llx sgid_lo:0x%016llx\n",
+ be64_to_cpu(in_grh->sgid.global.subnet_prefix),
+ be64_to_cpu(in_grh->sgid.global.interface_id));
+ pr_debug("dgid_hi:0x%016llx dgid_lo:0x%016llx\n",
+ be64_to_cpu(in_grh->dgid.global.subnet_prefix),
+ be64_to_cpu(in_grh->dgid.global.interface_id));
+ }
+ }
+
+ slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE);
+
+ if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0) {
+ forward_trap(to_mdev(ibdev), port_num, in_mad);
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+ }
+
+ if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET &&
+ in_mad->mad_hdr.method != IB_MGMT_METHOD_SET &&
+ in_mad->mad_hdr.method != IB_MGMT_METHOD_TRAP_REPRESS)
+ return IB_MAD_RESULT_SUCCESS;
+
+ /*
+ * Don't process SMInfo queries -- the SMA can't handle them.
+ */
+ if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO)
+ return IB_MAD_RESULT_SUCCESS;
+ } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT ||
+ in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS1 ||
+ in_mad->mad_hdr.mgmt_class == MLX4_IB_VENDOR_CLASS2 ||
+ in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_CONG_MGMT) {
+ if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET &&
+ in_mad->mad_hdr.method != IB_MGMT_METHOD_SET)
+ return IB_MAD_RESULT_SUCCESS;
+ } else
+ return IB_MAD_RESULT_SUCCESS;
+
+ if ((in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+ in_mad->mad_hdr.method == IB_MGMT_METHOD_SET &&
+ in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO &&
+ !ib_query_port(ibdev, port_num, &pattr))
+ prev_lid = pattr.lid;
+
+ err = mlx4_MAD_IFC(to_mdev(ibdev),
+ (mad_flags & IB_MAD_IGNORE_MKEY ? MLX4_MAD_IFC_IGNORE_MKEY : 0) |
+ (mad_flags & IB_MAD_IGNORE_BKEY ? MLX4_MAD_IFC_IGNORE_BKEY : 0) |
+ MLX4_MAD_IFC_NET_VIEW,
+ port_num, in_wc, in_grh, in_mad, out_mad);
+ if (err)
+ return IB_MAD_RESULT_FAILURE;
+
+ if (!out_mad->mad_hdr.status) {
+ if (!(to_mdev(ibdev)->dev->caps.flags & MLX4_DEV_CAP_FLAG_PORT_MNG_CHG_EV))
+ smp_snoop(ibdev, port_num, in_mad, prev_lid);
+ /* slaves get node desc from FW */
+ if (!mlx4_is_slave(to_mdev(ibdev)->dev))
+ node_desc_override(ibdev, out_mad);
+ }
+
+ /* set return bit in status of directed route responses */
+ if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ out_mad->mad_hdr.status |= cpu_to_be16(1 << 15);
+
+ if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS)
+ /* no response for trap repress */
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+static void edit_counter(struct mlx4_counter *cnt,
+ struct ib_pma_portcounters *pma_cnt)
+{
+ ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_data,
+ (be64_to_cpu(cnt->tx_bytes) >> 2));
+ ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_data,
+ (be64_to_cpu(cnt->rx_bytes) >> 2));
+ ASSIGN_32BIT_COUNTER(pma_cnt->port_xmit_packets,
+ be64_to_cpu(cnt->tx_frames));
+ ASSIGN_32BIT_COUNTER(pma_cnt->port_rcv_packets,
+ be64_to_cpu(cnt->rx_frames));
+}
+
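+/*
+ * Handle a PMA query on a RoCE port by reading the port's HW counters
+ * via QUERY_IF_STAT and translating them into PMA portcounters format.
+ */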
+static int iboe_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+ struct ib_wc *in_wc, struct ib_grh *in_grh,
+ struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ int err;
+ u32 inmod = dev->counters[port_num - 1] & 0xffff;
+ u8 mode;
+
+ if (in_mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_PERF_MGMT)
+ return -EINVAL;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev->dev);
+ if (IS_ERR(mailbox))
+ return IB_MAD_RESULT_FAILURE;
+
+ err = mlx4_cmd_box(dev->dev, 0, mailbox->dma, inmod, 0,
+ MLX4_CMD_QUERY_IF_STAT, MLX4_CMD_TIME_CLASS_C,
+ MLX4_CMD_WRAPPED);
+ if (err)
+ err = IB_MAD_RESULT_FAILURE;
+ else {
+ memset(out_mad->data, 0, sizeof out_mad->data);
+ mode = ((struct mlx4_counter *)mailbox->buf)->counter_mode;
+ switch (mode & 0xf) {
+ case 0:
+ edit_counter(mailbox->buf,
+ (void *)(out_mad->data + 40));
+ err = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+ break;
+ default:
+ err = IB_MAD_RESULT_FAILURE;
+ }
+ }
+
+ mlx4_free_cmd_mailbox(dev->dev, mailbox);
+
+ return err;
+}
+
+int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+ struct ib_wc *in_wc, struct ib_grh *in_grh,
+ struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+ switch (rdma_port_get_link_layer(ibdev, port_num)) {
+ case IB_LINK_LAYER_INFINIBAND:
+ return ib_process_mad(ibdev, mad_flags, port_num, in_wc,
+ in_grh, in_mad, out_mad);
+ case IB_LINK_LAYER_ETHERNET:
+ return iboe_process_mad(ibdev, mad_flags, port_num, in_wc,
+ in_grh, in_mad, out_mad);
+ default:
+ return -EINVAL;
+ }
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ if (mad_send_wc->send_buf->context[0])
+ ib_destroy_ah(mad_send_wc->send_buf->context[0]);
+ ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+int mlx4_ib_mad_init(struct mlx4_ib_dev *dev)
+{
+ struct ib_mad_agent *agent;
+ int p, q;
+ int ret;
+ enum rdma_link_layer ll;
+
+ for (p = 0; p < dev->num_ports; ++p) {
+ ll = rdma_port_get_link_layer(&dev->ib_dev, p + 1);
+ for (q = 0; q <= 1; ++q) {
+ if (ll == IB_LINK_LAYER_INFINIBAND) {
+ agent = ib_register_mad_agent(&dev->ib_dev, p + 1,
+ q ? IB_QPT_GSI : IB_QPT_SMI,
+ NULL, 0, send_handler,
+ NULL, NULL, 0);
+ if (IS_ERR(agent)) {
+ ret = PTR_ERR(agent);
+ goto err;
+ }
+ dev->send_agent[p][q] = agent;
+ } else
+ dev->send_agent[p][q] = NULL;
+ }
+ }
+
+ return 0;
+
+err:
+ for (p = 0; p < dev->num_ports; ++p)
+ for (q = 0; q <= 1; ++q)
+ if (dev->send_agent[p][q])
+ ib_unregister_mad_agent(dev->send_agent[p][q]);
+
+ return ret;
+}
+
+void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev)
+{
+ struct ib_mad_agent *agent;
+ int p, q;
+
+ for (p = 0; p < dev->num_ports; ++p) {
+ for (q = 0; q <= 1; ++q) {
+ agent = dev->send_agent[p][q];
+ if (agent) {
+ dev->send_agent[p][q] = NULL;
+ ib_unregister_mad_agent(agent);
+ }
+ }
+
+ if (dev->sm_ah[p])
+ ib_destroy_ah(dev->sm_ah[p]);
+ }
+}
+
+static void handle_lid_change_event(struct mlx4_ib_dev *dev, u8 port_num)
+{
+ mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_LID_CHANGE);
+
+ if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down)
+ mlx4_gen_slaves_port_mgt_ev(dev->dev, port_num,
+ MLX4_EQ_PORT_INFO_LID_CHANGE_MASK);
+}
+
+static void handle_client_rereg_event(struct mlx4_ib_dev *dev, u8 port_num)
+{
+ /* re-configure the alias-guid and mcg's */
+ if (mlx4_is_master(dev->dev)) {
+ mlx4_ib_invalidate_all_guid_record(dev, port_num);
+
+ if (!dev->sriov.is_going_down) {
+ mlx4_ib_mcg_port_cleanup(&dev->sriov.demux[port_num - 1], 0);
+ mlx4_gen_slaves_port_mgt_ev(dev->dev, port_num,
+ MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK);
+ }
+ }
+ mlx4_ib_dispatch_event(dev, port_num, IB_EVENT_CLIENT_REREGISTER);
+}
+
+static void propagate_pkey_ev(struct mlx4_ib_dev *dev, int port_num,
+ struct mlx4_eqe *eqe)
+{
+ __propagate_pkey_ev(dev, port_num, GET_BLK_PTR_FROM_EQE(eqe),
+ GET_MASK_FROM_EQE(eqe));
+}
+
+static void handle_slaves_guid_change(struct mlx4_ib_dev *dev, u8 port_num,
+ u32 guid_tbl_blk_num, u32 change_bitmap)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ u16 i;
+
+ if (!mlx4_is_mfunc(dev->dev) || !mlx4_is_master(dev->dev))
+ return;
+
+ in_mad = kmalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad) {
+ mlx4_ib_warn(&dev->ib_dev, "failed to allocate memory for guid info mads\n");
+ goto out;
+ }
+
+ guid_tbl_blk_num *= 4;
+
+ for (i = 0; i < 4; i++) {
+ if (change_bitmap && (!((change_bitmap >> (8 * i)) & 0xff)))
+ continue;
+ memset(in_mad, 0, sizeof *in_mad);
+ memset(out_mad, 0, sizeof *out_mad);
+
+ in_mad->base_version = 1;
+ in_mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+ in_mad->class_version = 1;
+ in_mad->method = IB_MGMT_METHOD_GET;
+ in_mad->attr_id = IB_SMP_ATTR_GUID_INFO;
+ in_mad->attr_mod = cpu_to_be32(guid_tbl_blk_num + i);
+
+ if (mlx4_MAD_IFC(dev,
+ MLX4_MAD_IFC_IGNORE_KEYS | MLX4_MAD_IFC_NET_VIEW,
+ port_num, NULL, NULL, in_mad, out_mad)) {
+ mlx4_ib_warn(&dev->ib_dev, "Failed to get GUID INFO via MAD_IFC\n");
+ goto out;
+ }
+
+ mlx4_ib_update_cache_on_guid_change(dev, guid_tbl_blk_num + i,
+ port_num,
+ (u8 *)(&((struct ib_smp *)out_mad)->data));
+ mlx4_ib_notify_slaves_on_guid_change(dev, guid_tbl_blk_num + i,
+ port_num,
+ (u8 *)(&((struct ib_smp *)out_mad)->data));
+ }
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return;
+}
+
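+/*
+ * Work handler for PORT_MGMT_CHANGE EQEs: update the SM address handle,
+ * dispatch LID/GID/client-rereg/P_Key/GUID change events and, on the
+ * master, propagate them to the relevant slaves.
+ */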
+void handle_port_mgmt_change_event(struct work_struct *work)
+{
+ struct ib_event_work *ew = container_of(work, struct ib_event_work, work);
+ struct mlx4_ib_dev *dev = ew->ib_dev;
+ struct mlx4_eqe *eqe = &(ew->ib_eqe);
+ u8 port = eqe->event.port_mgmt_change.port;
+ u32 changed_attr;
+ u32 tbl_block;
+ u32 change_bitmap;
+
+ switch (eqe->subtype) {
+ case MLX4_DEV_PMC_SUBTYPE_PORT_INFO:
+ changed_attr = be32_to_cpu(eqe->event.port_mgmt_change.params.port_info.changed_attr);
+
+ /* Update the SM ah - This should be done before handling
+ * the other changed attributes so that MADs can be sent to the SM
+ */
+ if (changed_attr & MSTR_SM_CHANGE_MASK) {
+ u16 lid = be16_to_cpu(eqe->event.port_mgmt_change.params.port_info.mstr_sm_lid);
+ u8 sl = eqe->event.port_mgmt_change.params.port_info.mstr_sm_sl & 0xf;
+ update_sm_ah(dev, port, lid, sl);
+ }
+
+ /* Check if it is a lid change event */
+ if (changed_attr & MLX4_EQ_PORT_INFO_LID_CHANGE_MASK)
+ handle_lid_change_event(dev, port);
+
+ /* Generate GUID changed event */
+ if (changed_attr & MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK) {
+ mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE);
+ /* if master, notify all slaves */
+ if (mlx4_is_master(dev->dev))
+ mlx4_gen_slaves_port_mgt_ev(dev->dev, port,
+ MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK);
+ }
+
+ if (changed_attr & MLX4_EQ_PORT_INFO_CLIENT_REREG_MASK)
+ handle_client_rereg_event(dev, port);
+ break;
+
+ case MLX4_DEV_PMC_SUBTYPE_PKEY_TABLE:
+ mlx4_ib_dispatch_event(dev, port, IB_EVENT_PKEY_CHANGE);
+ if (mlx4_is_master(dev->dev) && !dev->sriov.is_going_down)
+ propagate_pkey_ev(dev, port, eqe);
+ break;
+ case MLX4_DEV_PMC_SUBTYPE_GUID_INFO:
+ /* paravirtualized master's guid is guid 0 -- does not change */
+ if (!mlx4_is_master(dev->dev))
+ mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE);
+ /* if master, notify relevant slaves */
+ else if (!dev->sriov.is_going_down) {
+ tbl_block = GET_BLK_PTR_FROM_EQE(eqe);
+ change_bitmap = GET_MASK_FROM_EQE(eqe);
+ handle_slaves_guid_change(dev, port, tbl_block, change_bitmap);
+ }
+ break;
+ default:
+ pr_warn("Unsupported subtype 0x%x for "
+ "Port Management Change event\n", eqe->subtype);
+ }
+
+ kfree(ew);
+}
+
+void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 port_num,
+ enum ib_event_type type)
+{
+ struct ib_event event;
+
+ event.device = &dev->ib_dev;
+ event.element.port_num = port_num;
+ event.event = type;
+
+ ib_dispatch_event(&event);
+}
+
+static void mlx4_ib_tunnel_comp_handler(struct ib_cq *cq, void *arg)
+{
+ unsigned long flags;
+ struct mlx4_ib_demux_pv_ctx *ctx = cq->cq_context;
+ struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev);
+ spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+ if (!dev->sriov.is_going_down && ctx->state == DEMUX_PV_STATE_ACTIVE)
+ queue_work(ctx->wq, &ctx->work);
+ spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+}
+
+static int mlx4_ib_post_pv_qp_buf(struct mlx4_ib_demux_pv_ctx *ctx,
+ struct mlx4_ib_demux_pv_qp *tun_qp,
+ int index)
+{
+ struct ib_sge sg_list;
+ struct ib_recv_wr recv_wr, *bad_recv_wr;
+ int size;
+
+ size = (tun_qp->qp->qp_type == IB_QPT_UD) ?
+ sizeof (struct mlx4_tunnel_mad) : sizeof (struct mlx4_mad_rcv_buf);
+
+ sg_list.addr = tun_qp->ring[index].map;
+ sg_list.length = size;
+ sg_list.lkey = ctx->mr->lkey;
+
+ recv_wr.next = NULL;
+ recv_wr.sg_list = &sg_list;
+ recv_wr.num_sge = 1;
+ recv_wr.wr_id = (u64) index | MLX4_TUN_WRID_RECV |
+ MLX4_TUN_SET_WRID_QPN(tun_qp->proxy_qpt);
+ ib_dma_sync_single_for_device(ctx->ib_dev, tun_qp->ring[index].map,
+ size, DMA_FROM_DEVICE);
+ return ib_post_recv(tun_qp->qp, &recv_wr, &bad_recv_wr);
+}
+
+static int mlx4_ib_multiplex_sa_handler(struct ib_device *ibdev, int port,
+ int slave, struct ib_sa_mad *sa_mad)
+{
+ int ret = 0;
+
+ /* dispatch to different sa handlers */
+ switch (be16_to_cpu(sa_mad->mad_hdr.attr_id)) {
+ case IB_SA_ATTR_MC_MEMBER_REC:
+ ret = mlx4_ib_mcg_multiplex_handler(ibdev, port, slave, sa_mad);
+ break;
+ default:
+ break;
+ }
+ return ret;
+}
+
+static int is_proxy_qp0(struct mlx4_ib_dev *dev, int qpn, int slave)
+{
+ int proxy_start = dev->dev->phys_caps.base_proxy_sqpn + 8 * slave;
+
+ return (qpn >= proxy_start && qpn <= proxy_start + 1);
+}
+
+
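+/*
+ * Send a MAD originated by a slave out on the wire using the real
+ * special QP (QP0 or QP1) that the master owns for this port.
+ */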
+int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
+ enum ib_qp_type dest_qpt, u16 pkey_index,
+ u32 remote_qpn, u32 qkey, struct ib_ah_attr *attr,
+ u8 *s_mac, struct ib_mad *mad)
+{
+ struct ib_sge list;
+ struct ib_send_wr wr, *bad_wr;
+ struct mlx4_ib_demux_pv_ctx *sqp_ctx;
+ struct mlx4_ib_demux_pv_qp *sqp;
+ struct mlx4_mad_snd_buf *sqp_mad;
+ struct ib_ah *ah;
+ struct ib_qp *send_qp = NULL;
+ unsigned wire_tx_ix = 0;
+ int ret = 0;
+ u16 wire_pkey_ix;
+ int src_qpnum;
+ u8 sgid_index;
+
+
+ sqp_ctx = dev->sriov.sqps[port-1];
+
+ /* check if proxy qp created */
+ if (!sqp_ctx || sqp_ctx->state != DEMUX_PV_STATE_ACTIVE)
+ return -EAGAIN;
+
+ if (dest_qpt == IB_QPT_SMI) {
+ src_qpnum = 0;
+ sqp = &sqp_ctx->qp[0];
+ wire_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][0];
+ } else {
+ src_qpnum = 1;
+ sqp = &sqp_ctx->qp[1];
+ wire_pkey_ix = dev->pkeys.virt2phys_pkey[slave][port - 1][pkey_index];
+ }
+
+ send_qp = sqp->qp;
+
+ /* create ah */
+ sgid_index = attr->grh.sgid_index;
+ attr->grh.sgid_index = 0;
+ ah = ib_create_ah(sqp_ctx->pd, attr);
+ if (IS_ERR(ah))
+ return -ENOMEM;
+ attr->grh.sgid_index = sgid_index;
+ to_mah(ah)->av.ib.gid_index = sgid_index;
+ /* get rid of force-loopback bit */
+ to_mah(ah)->av.ib.port_pd &= cpu_to_be32(0x7FFFFFFF);
+ spin_lock(&sqp->tx_lock);
+ if (sqp->tx_ix_head - sqp->tx_ix_tail >=
+ (MLX4_NUM_TUNNEL_BUFS - 1))
+ ret = -EAGAIN;
+ else
+ wire_tx_ix = (++sqp->tx_ix_head) & (MLX4_NUM_TUNNEL_BUFS - 1);
+ spin_unlock(&sqp->tx_lock);
+ if (ret)
+ goto out;
+
+ sqp_mad = (struct mlx4_mad_snd_buf *) (sqp->tx_ring[wire_tx_ix].buf.addr);
+ if (sqp->tx_ring[wire_tx_ix].ah)
+ ib_destroy_ah(sqp->tx_ring[wire_tx_ix].ah);
+ sqp->tx_ring[wire_tx_ix].ah = ah;
+ ib_dma_sync_single_for_cpu(&dev->ib_dev,
+ sqp->tx_ring[wire_tx_ix].buf.map,
+ sizeof (struct mlx4_mad_snd_buf),
+ DMA_TO_DEVICE);
+
+ memcpy(&sqp_mad->payload, mad, sizeof *mad);
+
+ ib_dma_sync_single_for_device(&dev->ib_dev,
+ sqp->tx_ring[wire_tx_ix].buf.map,
+ sizeof (struct mlx4_mad_snd_buf),
+ DMA_TO_DEVICE);
+
+ list.addr = sqp->tx_ring[wire_tx_ix].buf.map;
+ list.length = sizeof (struct mlx4_mad_snd_buf);
+ list.lkey = sqp_ctx->mr->lkey;
+
+ wr.wr.ud.ah = ah;
+ wr.wr.ud.port_num = port;
+ wr.wr.ud.pkey_index = wire_pkey_ix;
+ wr.wr.ud.remote_qkey = qkey;
+ wr.wr.ud.remote_qpn = remote_qpn;
+ wr.next = NULL;
+ wr.wr_id = ((u64) wire_tx_ix) | MLX4_TUN_SET_WRID_QPN(src_qpnum);
+ wr.sg_list = &list;
+ wr.num_sge = 1;
+ wr.opcode = IB_WR_SEND;
+ wr.send_flags = IB_SEND_SIGNALED;
+ if (s_mac)
+ memcpy(to_mah(ah)->av.eth.s_mac, s_mac, 6);
+
+
+ ret = ib_post_send(send_qp, &wr, &bad_wr);
+out:
+ if (ret)
+ ib_destroy_ah(ah);
+ return ret;
+}
+
+static int get_slave_base_gid_ix(struct mlx4_ib_dev *dev, int slave, int port)
+{
+ if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND)
+ return slave;
+ return mlx4_get_base_gid_ix(dev->dev, slave, port);
+}
+
+static void fill_in_real_sgid_index(struct mlx4_ib_dev *dev, int slave, int port,
+ struct ib_ah_attr *ah_attr)
+{
+ if (rdma_port_get_link_layer(&dev->ib_dev, port) == IB_LINK_LAYER_INFINIBAND)
+ ah_attr->grh.sgid_index = slave;
+ else
+ ah_attr->grh.sgid_index += get_slave_base_gid_ix(dev, slave, port);
+}
+
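+/*
+ * Handle a tunnel receive completion on the master: validate that the
+ * source proxy QP really belongs to the slave, stamp the slave id into
+ * the TID, apply class-based filtering, rebuild an address handle from
+ * the tunnel header and forward the MAD to the wire.
+ */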
+static void mlx4_ib_multiplex_mad(struct mlx4_ib_demux_pv_ctx *ctx, struct ib_wc *wc)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev);
+ struct mlx4_ib_demux_pv_qp *tun_qp = &ctx->qp[MLX4_TUN_WRID_QPN(wc->wr_id)];
+ int wr_ix = wc->wr_id & (MLX4_NUM_TUNNEL_BUFS - 1);
+ struct mlx4_tunnel_mad *tunnel = tun_qp->ring[wr_ix].addr;
+ struct mlx4_ib_ah ah;
+ struct ib_ah_attr ah_attr;
+ u8 *slave_id;
+ int slave;
+ int port;
+
+ /* Get slave that sent this packet */
+ if (wc->src_qp < dev->dev->phys_caps.base_proxy_sqpn ||
+ wc->src_qp >= dev->dev->phys_caps.base_proxy_sqpn + 8 * MLX4_MFUNC_MAX ||
+ (wc->src_qp & 0x1) != ctx->port - 1 ||
+ wc->src_qp & 0x4) {
+ mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d\n", wc->src_qp);
+ return;
+ }
+ slave = ((wc->src_qp & ~0x7) - dev->dev->phys_caps.base_proxy_sqpn) / 8;
+ if (slave != ctx->slave) {
+ mlx4_ib_warn(ctx->ib_dev, "can't multiplex bad sqp:%d: "
+ "belongs to another slave\n", wc->src_qp);
+ return;
+ }
+
+ /* Map transaction ID */
+ ib_dma_sync_single_for_cpu(ctx->ib_dev, tun_qp->ring[wr_ix].map,
+ sizeof (struct mlx4_tunnel_mad),
+ DMA_FROM_DEVICE);
+ switch (tunnel->mad.mad_hdr.method) {
+ case IB_MGMT_METHOD_SET:
+ case IB_MGMT_METHOD_GET:
+ case IB_MGMT_METHOD_REPORT:
+ case IB_SA_METHOD_GET_TABLE:
+ case IB_SA_METHOD_DELETE:
+ case IB_SA_METHOD_GET_MULTI:
+ case IB_SA_METHOD_GET_TRACE_TBL:
+ slave_id = (u8 *) &tunnel->mad.mad_hdr.tid;
+ if (*slave_id) {
+ mlx4_ib_warn(ctx->ib_dev, "egress mad has non-null tid msb:%d "
+ "class:%d slave:%d\n", *slave_id,
+ tunnel->mad.mad_hdr.mgmt_class, slave);
+ return;
+ } else
+ *slave_id = slave;
+ default:
+ /* nothing */;
+ }
+
+ /* Class-specific handling */
+ switch (tunnel->mad.mad_hdr.mgmt_class) {
+ case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+ case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+ if (slave != mlx4_master_func_num(dev->dev) &&
+ !mlx4_vf_smi_enabled(dev->dev, slave, ctx->port))
+ return;
+ break;
+ case IB_MGMT_CLASS_SUBN_ADM:
+ if (mlx4_ib_multiplex_sa_handler(ctx->ib_dev, ctx->port, slave,
+ (struct ib_sa_mad *) &tunnel->mad))
+ return;
+ break;
+ case IB_MGMT_CLASS_CM:
+ if (mlx4_ib_multiplex_cm_handler(ctx->ib_dev, ctx->port, slave,
+ (struct ib_mad *) &tunnel->mad))
+ return;
+ break;
+ case IB_MGMT_CLASS_DEVICE_MGMT:
+ if (tunnel->mad.mad_hdr.method != IB_MGMT_METHOD_GET &&
+ tunnel->mad.mad_hdr.method != IB_MGMT_METHOD_SET)
+ return;
+ break;
+ default:
+ /* Drop unsupported classes for slaves in tunnel mode */
+ if (slave != mlx4_master_func_num(dev->dev)) {
+ mlx4_ib_warn(ctx->ib_dev, "dropping unsupported egress mad from class:%d "
+ "for slave:%d\n", tunnel->mad.mad_hdr.mgmt_class, slave);
+ return;
+ }
+ }
+
+ /* We are using standard ib_core services to send the mad, so generate a
+ * standard address handle by decoding the tunnelled mlx4_ah fields */
+ memcpy(&ah.av, &tunnel->hdr.av, sizeof (struct mlx4_av));
+ ah.ibah.device = ctx->ib_dev;
+ mlx4_ib_query_ah(&ah.ibah, &ah_attr);
+ if (ah_attr.ah_flags & IB_AH_GRH)
+ fill_in_real_sgid_index(dev, slave, ctx->port, &ah_attr);
+
+ port = mlx4_slave_convert_port(dev->dev, slave, ah_attr.port_num);
+ if (port < 0)
+ return;
+ ah_attr.port_num = port;
+ memcpy(ah_attr.dmac, tunnel->hdr.mac, 6);
+ ah_attr.vlan_id = be16_to_cpu(tunnel->hdr.vlan);
+ /* if the slave has a default vlan, use it */
+ mlx4_get_slave_default_vlan(dev->dev, ctx->port, slave,
+ &ah_attr.vlan_id, &ah_attr.sl);
+
+ mlx4_ib_send_to_wire(dev, slave, ctx->port,
+ is_proxy_qp0(dev, wc->src_qp, slave) ?
+ IB_QPT_SMI : IB_QPT_GSI,
+ be16_to_cpu(tunnel->hdr.pkey_index),
+ be32_to_cpu(tunnel->hdr.remote_qpn),
+ be32_to_cpu(tunnel->hdr.qkey),
+ &ah_attr, wc->smac, &tunnel->mad);
+}
+
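+/*
+ * Allocate and DMA-map the receive and transmit rings for one proxy QP;
+ * buffer sizes differ between tunnel QPs and real special QPs.
+ */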
+static int mlx4_ib_alloc_pv_bufs(struct mlx4_ib_demux_pv_ctx *ctx,
+ enum ib_qp_type qp_type, int is_tun)
+{
+ int i;
+ struct mlx4_ib_demux_pv_qp *tun_qp;
+ int rx_buf_size, tx_buf_size;
+
+ if (qp_type > IB_QPT_GSI)
+ return -EINVAL;
+
+ tun_qp = &ctx->qp[qp_type];
+
+ tun_qp->ring = kzalloc(sizeof (struct mlx4_ib_buf) * MLX4_NUM_TUNNEL_BUFS,
+ GFP_KERNEL);
+ if (!tun_qp->ring)
+ return -ENOMEM;
+
+ tun_qp->tx_ring = kcalloc(MLX4_NUM_TUNNEL_BUFS,
+ sizeof (struct mlx4_ib_tun_tx_buf),
+ GFP_KERNEL);
+ if (!tun_qp->tx_ring) {
+ kfree(tun_qp->ring);
+ tun_qp->ring = NULL;
+ return -ENOMEM;
+ }
+
+ if (is_tun) {
+ rx_buf_size = sizeof (struct mlx4_tunnel_mad);
+ tx_buf_size = sizeof (struct mlx4_rcv_tunnel_mad);
+ } else {
+ rx_buf_size = sizeof (struct mlx4_mad_rcv_buf);
+ tx_buf_size = sizeof (struct mlx4_mad_snd_buf);
+ }
+
+ for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+ tun_qp->ring[i].addr = kmalloc(rx_buf_size, GFP_KERNEL);
+ if (!tun_qp->ring[i].addr)
+ goto err;
+ tun_qp->ring[i].map = ib_dma_map_single(ctx->ib_dev,
+ tun_qp->ring[i].addr,
+ rx_buf_size,
+ DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(ctx->ib_dev, tun_qp->ring[i].map)) {
+ kfree(tun_qp->ring[i].addr);
+ goto err;
+ }
+ }
+
+ for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+ tun_qp->tx_ring[i].buf.addr =
+ kmalloc(tx_buf_size, GFP_KERNEL);
+ if (!tun_qp->tx_ring[i].buf.addr)
+ goto tx_err;
+ tun_qp->tx_ring[i].buf.map =
+ ib_dma_map_single(ctx->ib_dev,
+ tun_qp->tx_ring[i].buf.addr,
+ tx_buf_size,
+ DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(ctx->ib_dev,
+ tun_qp->tx_ring[i].buf.map)) {
+ kfree(tun_qp->tx_ring[i].buf.addr);
+ goto tx_err;
+ }
+ tun_qp->tx_ring[i].ah = NULL;
+ }
+ spin_lock_init(&tun_qp->tx_lock);
+ tun_qp->tx_ix_head = 0;
+ tun_qp->tx_ix_tail = 0;
+ tun_qp->proxy_qpt = qp_type;
+
+ return 0;
+
+tx_err:
+ while (i > 0) {
+ --i;
+ ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map,
+ tx_buf_size, DMA_TO_DEVICE);
+ kfree(tun_qp->tx_ring[i].buf.addr);
+ }
+ kfree(tun_qp->tx_ring);
+ tun_qp->tx_ring = NULL;
+ i = MLX4_NUM_TUNNEL_BUFS;
+err:
+ while (i > 0) {
+ --i;
+ ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map,
+ rx_buf_size, DMA_FROM_DEVICE);
+ kfree(tun_qp->ring[i].addr);
+ }
+ kfree(tun_qp->ring);
+ tun_qp->ring = NULL;
+ return -ENOMEM;
+}
+
+static void mlx4_ib_free_pv_qp_bufs(struct mlx4_ib_demux_pv_ctx *ctx,
+ enum ib_qp_type qp_type, int is_tun)
+{
+ int i;
+ struct mlx4_ib_demux_pv_qp *tun_qp;
+ int rx_buf_size, tx_buf_size;
+
+ if (qp_type > IB_QPT_GSI)
+ return;
+
+ tun_qp = &ctx->qp[qp_type];
+ if (is_tun) {
+ rx_buf_size = sizeof (struct mlx4_tunnel_mad);
+ tx_buf_size = sizeof (struct mlx4_rcv_tunnel_mad);
+ } else {
+ rx_buf_size = sizeof (struct mlx4_mad_rcv_buf);
+ tx_buf_size = sizeof (struct mlx4_mad_snd_buf);
+ }
+
+
+ for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+ ib_dma_unmap_single(ctx->ib_dev, tun_qp->ring[i].map,
+ rx_buf_size, DMA_FROM_DEVICE);
+ kfree(tun_qp->ring[i].addr);
+ }
+
+ for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+ ib_dma_unmap_single(ctx->ib_dev, tun_qp->tx_ring[i].buf.map,
+ tx_buf_size, DMA_TO_DEVICE);
+ kfree(tun_qp->tx_ring[i].buf.addr);
+ if (tun_qp->tx_ring[i].ah)
+ ib_destroy_ah(tun_qp->tx_ring[i].ah);
+ }
+ kfree(tun_qp->tx_ring);
+ kfree(tun_qp->ring);
+}
+
+static void mlx4_ib_tunnel_comp_worker(struct work_struct *work)
+{
+ struct mlx4_ib_demux_pv_ctx *ctx;
+ struct mlx4_ib_demux_pv_qp *tun_qp;
+ struct ib_wc wc;
+ int ret;
+ ctx = container_of(work, struct mlx4_ib_demux_pv_ctx, work);
+ ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP);
+
+ while (ib_poll_cq(ctx->cq, 1, &wc) == 1) {
+ tun_qp = &ctx->qp[MLX4_TUN_WRID_QPN(wc.wr_id)];
+ if (wc.status == IB_WC_SUCCESS) {
+ switch (wc.opcode) {
+ case IB_WC_RECV:
+ mlx4_ib_multiplex_mad(ctx, &wc);
+ ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp,
+ wc.wr_id &
+ (MLX4_NUM_TUNNEL_BUFS - 1));
+ if (ret)
+ pr_err("Failed reposting tunnel "
+ "buf:%lld\n", wc.wr_id);
+ break;
+ case IB_WC_SEND:
+ pr_debug("received tunnel send completion:"
+ "wrid=0x%llx, status=0x%x\n",
+ wc.wr_id, wc.status);
+ ib_destroy_ah(tun_qp->tx_ring[wc.wr_id &
+ (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
+ tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
+ = NULL;
+ spin_lock(&tun_qp->tx_lock);
+ tun_qp->tx_ix_tail++;
+ spin_unlock(&tun_qp->tx_lock);
+
+ break;
+ default:
+ break;
+ }
+ } else {
+ pr_debug("mlx4_ib: completion error in tunnel: %d."
+ " status = %d, wrid = 0x%llx\n",
+ ctx->slave, wc.status, wc.wr_id);
+ if (!MLX4_TUN_IS_RECV(wc.wr_id)) {
+ ib_destroy_ah(tun_qp->tx_ring[wc.wr_id &
+ (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
+ tun_qp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
+ = NULL;
+ spin_lock(&tun_qp->tx_lock);
+ tun_qp->tx_ix_tail++;
+ spin_unlock(&tun_qp->tx_lock);
+ }
+ }
+ }
+}
+
+static void pv_qp_event_handler(struct ib_event *event, void *qp_context)
+{
+ struct mlx4_ib_demux_pv_ctx *sqp = qp_context;
+
+ /* It's worse than that! He's dead, Jim! */
+ pr_err("Fatal error (%d) on a MAD QP on port %d\n",
+ event->event, sqp->port);
+}
+
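+/*
+ * Create a tunnel or special proxy QP, move it through INIT/RTR/RTS and
+ * pre-post its receive buffers.
+ */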
+static int create_pv_sqp(struct mlx4_ib_demux_pv_ctx *ctx,
+ enum ib_qp_type qp_type, int create_tun)
+{
+ int i, ret;
+ struct mlx4_ib_demux_pv_qp *tun_qp;
+ struct mlx4_ib_qp_tunnel_init_attr qp_init_attr;
+ struct ib_qp_attr attr;
+ int qp_attr_mask_INIT;
+
+ if (qp_type > IB_QPT_GSI)
+ return -EINVAL;
+
+ tun_qp = &ctx->qp[qp_type];
+
+ memset(&qp_init_attr, 0, sizeof qp_init_attr);
+ qp_init_attr.init_attr.send_cq = ctx->cq;
+ qp_init_attr.init_attr.recv_cq = ctx->cq;
+ qp_init_attr.init_attr.sq_sig_type = IB_SIGNAL_ALL_WR;
+ qp_init_attr.init_attr.cap.max_send_wr = MLX4_NUM_TUNNEL_BUFS;
+ qp_init_attr.init_attr.cap.max_recv_wr = MLX4_NUM_TUNNEL_BUFS;
+ qp_init_attr.init_attr.cap.max_send_sge = 1;
+ qp_init_attr.init_attr.cap.max_recv_sge = 1;
+ if (create_tun) {
+ qp_init_attr.init_attr.qp_type = IB_QPT_UD;
+ qp_init_attr.init_attr.create_flags = MLX4_IB_SRIOV_TUNNEL_QP;
+ qp_init_attr.port = ctx->port;
+ qp_init_attr.slave = ctx->slave;
+ qp_init_attr.proxy_qp_type = qp_type;
+ qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX |
+ IB_QP_QKEY | IB_QP_PORT;
+ } else {
+ qp_init_attr.init_attr.qp_type = qp_type;
+ qp_init_attr.init_attr.create_flags = MLX4_IB_SRIOV_SQP;
+ qp_attr_mask_INIT = IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_QKEY;
+ }
+ qp_init_attr.init_attr.port_num = ctx->port;
+ qp_init_attr.init_attr.qp_context = ctx;
+ qp_init_attr.init_attr.event_handler = pv_qp_event_handler;
+ tun_qp->qp = ib_create_qp(ctx->pd, &qp_init_attr.init_attr);
+ if (IS_ERR(tun_qp->qp)) {
+ ret = PTR_ERR(tun_qp->qp);
+ tun_qp->qp = NULL;
+ pr_err("Couldn't create %s QP (%d)\n",
+ create_tun ? "tunnel" : "special", ret);
+ return ret;
+ }
+
+ memset(&attr, 0, sizeof attr);
+ attr.qp_state = IB_QPS_INIT;
+ ret = 0;
+ if (create_tun)
+ ret = find_slave_port_pkey_ix(to_mdev(ctx->ib_dev), ctx->slave,
+ ctx->port, IB_DEFAULT_PKEY_FULL,
+ &attr.pkey_index);
+ if (ret || !create_tun)
+ attr.pkey_index =
+ to_mdev(ctx->ib_dev)->pkeys.virt2phys_pkey[ctx->slave][ctx->port - 1][0];
+ attr.qkey = IB_QP1_QKEY;
+ attr.port_num = ctx->port;
+ ret = ib_modify_qp(tun_qp->qp, &attr, qp_attr_mask_INIT);
+ if (ret) {
+ pr_err("Couldn't change %s qp state to INIT (%d)\n",
+ create_tun ? "tunnel" : "special", ret);
+ goto err_qp;
+ }
+ attr.qp_state = IB_QPS_RTR;
+ ret = ib_modify_qp(tun_qp->qp, &attr, IB_QP_STATE);
+ if (ret) {
+ pr_err("Couldn't change %s qp state to RTR (%d)\n",
+ create_tun ? "tunnel" : "special", ret);
+ goto err_qp;
+ }
+ attr.qp_state = IB_QPS_RTS;
+ attr.sq_psn = 0;
+ ret = ib_modify_qp(tun_qp->qp, &attr, IB_QP_STATE | IB_QP_SQ_PSN);
+ if (ret) {
+ pr_err("Couldn't change %s qp state to RTS (%d)\n",
+ create_tun ? "tunnel" : "special", ret);
+ goto err_qp;
+ }
+
+ for (i = 0; i < MLX4_NUM_TUNNEL_BUFS; i++) {
+ ret = mlx4_ib_post_pv_qp_buf(ctx, tun_qp, i);
+ if (ret) {
+ pr_err(" mlx4_ib_post_pv_buf error"
+ " (err = %d, i = %d)\n", ret, i);
+ goto err_qp;
+ }
+ }
+ return 0;
+
+err_qp:
+ ib_destroy_qp(tun_qp->qp);
+ tun_qp->qp = NULL;
+ return ret;
+}
+
+/*
+ * IB MAD completion callback for real SQPs
+ */
+static void mlx4_ib_sqp_comp_worker(struct work_struct *work)
+{
+ struct mlx4_ib_demux_pv_ctx *ctx;
+ struct mlx4_ib_demux_pv_qp *sqp;
+ struct ib_wc wc;
+ struct ib_grh *grh;
+ struct ib_mad *mad;
+
+ ctx = container_of(work, struct mlx4_ib_demux_pv_ctx, work);
+ ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP);
+
+ while (mlx4_ib_poll_cq(ctx->cq, 1, &wc) == 1) {
+ sqp = &ctx->qp[MLX4_TUN_WRID_QPN(wc.wr_id)];
+ if (wc.status == IB_WC_SUCCESS) {
+ switch (wc.opcode) {
+ case IB_WC_SEND:
+ ib_destroy_ah(sqp->tx_ring[wc.wr_id &
+ (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
+ sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
+ = NULL;
+ spin_lock(&sqp->tx_lock);
+ sqp->tx_ix_tail++;
+ spin_unlock(&sqp->tx_lock);
+ break;
+ case IB_WC_RECV:
+ mad = (struct ib_mad *) &(((struct mlx4_mad_rcv_buf *)
+ (sqp->ring[wc.wr_id &
+ (MLX4_NUM_TUNNEL_BUFS - 1)].addr))->payload);
+ grh = &(((struct mlx4_mad_rcv_buf *)
+ (sqp->ring[wc.wr_id &
+ (MLX4_NUM_TUNNEL_BUFS - 1)].addr))->grh);
+ mlx4_ib_demux_mad(ctx->ib_dev, ctx->port, &wc, grh, mad);
+ if (mlx4_ib_post_pv_qp_buf(ctx, sqp, wc.wr_id &
+ (MLX4_NUM_TUNNEL_BUFS - 1)))
+ pr_err("Failed reposting SQP "
+ "buf:%lld\n", wc.wr_id);
+ break;
+ default:
+ BUG_ON(1);
+ break;
+ }
+ } else {
+ pr_debug("mlx4_ib: completion error in tunnel: %d."
+ " status = %d, wrid = 0x%llx\n",
+ ctx->slave, wc.status, wc.wr_id);
+ if (!MLX4_TUN_IS_RECV(wc.wr_id)) {
+ ib_destroy_ah(sqp->tx_ring[wc.wr_id &
+ (MLX4_NUM_TUNNEL_BUFS - 1)].ah);
+ sqp->tx_ring[wc.wr_id & (MLX4_NUM_TUNNEL_BUFS - 1)].ah
+ = NULL;
+ spin_lock(&sqp->tx_lock);
+ sqp->tx_ix_tail++;
+ spin_unlock(&sqp->tx_lock);
+ }
+ }
+ }
+}
+
+static int alloc_pv_object(struct mlx4_ib_dev *dev, int slave, int port,
+ struct mlx4_ib_demux_pv_ctx **ret_ctx)
+{
+ struct mlx4_ib_demux_pv_ctx *ctx;
+
+ *ret_ctx = NULL;
+ ctx = kzalloc(sizeof (struct mlx4_ib_demux_pv_ctx), GFP_KERNEL);
+ if (!ctx) {
+ pr_err("failed allocating pv resource context "
+ "for port %d, slave %d\n", port, slave);
+ return -ENOMEM;
+ }
+
+ ctx->ib_dev = &dev->ib_dev;
+ ctx->port = port;
+ ctx->slave = slave;
+ *ret_ctx = ctx;
+ return 0;
+}
+
+static void free_pv_object(struct mlx4_ib_dev *dev, int slave, int port)
+{
+ if (dev->sriov.demux[port - 1].tun[slave]) {
+ kfree(dev->sriov.demux[port - 1].tun[slave]);
+ dev->sriov.demux[port - 1].tun[slave] = NULL;
+ }
+}
+
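+/*
+ * Allocate the paravirtualization resources for one slave/port pair:
+ * rx/tx buffers, CQ, PD, DMA MR and the proxy QP1 (plus QP0 when the
+ * link layer is IB), then arm the CQ.
+ */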
+static int create_pv_resources(struct ib_device *ibdev, int slave, int port,
+ int create_tun, struct mlx4_ib_demux_pv_ctx *ctx)
+{
+ int ret, cq_size;
+
+ if (ctx->state != DEMUX_PV_STATE_DOWN)
+ return -EEXIST;
+
+ ctx->state = DEMUX_PV_STATE_STARTING;
+ /* have QP0 only if link layer is IB */
+ if (rdma_port_get_link_layer(ibdev, ctx->port) ==
+ IB_LINK_LAYER_INFINIBAND)
+ ctx->has_smi = 1;
+
+ if (ctx->has_smi) {
+ ret = mlx4_ib_alloc_pv_bufs(ctx, IB_QPT_SMI, create_tun);
+ if (ret) {
+ pr_err("Failed allocating qp0 tunnel bufs (%d)\n", ret);
+ goto err_out;
+ }
+ }
+
+ ret = mlx4_ib_alloc_pv_bufs(ctx, IB_QPT_GSI, create_tun);
+ if (ret) {
+ pr_err("Failed allocating qp1 tunnel bufs (%d)\n", ret);
+ goto err_out_qp0;
+ }
+
+ cq_size = 2 * MLX4_NUM_TUNNEL_BUFS;
+ if (ctx->has_smi)
+ cq_size *= 2;
+
+ ctx->cq = ib_create_cq(ctx->ib_dev, mlx4_ib_tunnel_comp_handler,
+ NULL, ctx, cq_size, 0);
+ if (IS_ERR(ctx->cq)) {
+ ret = PTR_ERR(ctx->cq);
+ pr_err("Couldn't create tunnel CQ (%d)\n", ret);
+ goto err_buf;
+ }
+
+ ctx->pd = ib_alloc_pd(ctx->ib_dev);
+ if (IS_ERR(ctx->pd)) {
+ ret = PTR_ERR(ctx->pd);
+ pr_err("Couldn't create tunnel PD (%d)\n", ret);
+ goto err_cq;
+ }
+
+ ctx->mr = ib_get_dma_mr(ctx->pd, IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(ctx->mr)) {
+ ret = PTR_ERR(ctx->mr);
+ pr_err("Couldn't get tunnel DMA MR (%d)\n", ret);
+ goto err_pd;
+ }
+
+ if (ctx->has_smi) {
+ ret = create_pv_sqp(ctx, IB_QPT_SMI, create_tun);
+ if (ret) {
+ pr_err("Couldn't create %s QP0 (%d)\n",
+ create_tun ? "tunnel for" : "", ret);
+ goto err_mr;
+ }
+ }
+
+ ret = create_pv_sqp(ctx, IB_QPT_GSI, create_tun);
+ if (ret) {
+ pr_err("Couldn't create %s QP1 (%d)\n",
+ create_tun ? "tunnel for" : "", ret);
+ goto err_qp0;
+ }
+
+ if (create_tun)
+ INIT_WORK(&ctx->work, mlx4_ib_tunnel_comp_worker);
+ else
+ INIT_WORK(&ctx->work, mlx4_ib_sqp_comp_worker);
+
+ ctx->wq = to_mdev(ibdev)->sriov.demux[port - 1].wq;
+
+ ret = ib_req_notify_cq(ctx->cq, IB_CQ_NEXT_COMP);
+ if (ret) {
+ pr_err("Couldn't arm tunnel cq (%d)\n", ret);
+ goto err_wq;
+ }
+ ctx->state = DEMUX_PV_STATE_ACTIVE;
+ return 0;
+
+err_wq:
+ ctx->wq = NULL;
+ ib_destroy_qp(ctx->qp[1].qp);
+ ctx->qp[1].qp = NULL;
+
+
+err_qp0:
+ if (ctx->has_smi)
+ ib_destroy_qp(ctx->qp[0].qp);
+ ctx->qp[0].qp = NULL;
+
+err_mr:
+ ib_dereg_mr(ctx->mr);
+ ctx->mr = NULL;
+
+err_pd:
+ ib_dealloc_pd(ctx->pd);
+ ctx->pd = NULL;
+
+err_cq:
+ ib_destroy_cq(ctx->cq);
+ ctx->cq = NULL;
+
+err_buf:
+ mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, create_tun);
+
+err_out_qp0:
+ if (ctx->has_smi)
+ mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_SMI, create_tun);
+err_out:
+ ctx->state = DEMUX_PV_STATE_DOWN;
+ return ret;
+}
+
+static void destroy_pv_resources(struct mlx4_ib_dev *dev, int slave, int port,
+ struct mlx4_ib_demux_pv_ctx *ctx, int flush)
+{
+ if (!ctx)
+ return;
+ if (ctx->state > DEMUX_PV_STATE_DOWN) {
+ ctx->state = DEMUX_PV_STATE_DOWNING;
+ if (flush)
+ flush_workqueue(ctx->wq);
+ if (ctx->has_smi) {
+ ib_destroy_qp(ctx->qp[0].qp);
+ ctx->qp[0].qp = NULL;
+ mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_SMI, 1);
+ }
+ ib_destroy_qp(ctx->qp[1].qp);
+ ctx->qp[1].qp = NULL;
+ mlx4_ib_free_pv_qp_bufs(ctx, IB_QPT_GSI, 1);
+ ib_dereg_mr(ctx->mr);
+ ctx->mr = NULL;
+ ib_dealloc_pd(ctx->pd);
+ ctx->pd = NULL;
+ ib_destroy_cq(ctx->cq);
+ ctx->cq = NULL;
+ ctx->state = DEMUX_PV_STATE_DOWN;
+ }
+}
+
+static int mlx4_ib_tunnels_update(struct mlx4_ib_dev *dev, int slave,
+ int port, int do_init)
+{
+ int ret = 0;
+
+ if (!do_init) {
+ clean_vf_mcast(&dev->sriov.demux[port - 1], slave);
+ /* for master, destroy real sqp resources */
+ if (slave == mlx4_master_func_num(dev->dev))
+ destroy_pv_resources(dev, slave, port,
+ dev->sriov.sqps[port - 1], 1);
+ /* destroy the tunnel qp resources */
+ destroy_pv_resources(dev, slave, port,
+ dev->sriov.demux[port - 1].tun[slave], 1);
+ return 0;
+ }
+
+ /* create the tunnel qp resources */
+ ret = create_pv_resources(&dev->ib_dev, slave, port, 1,
+ dev->sriov.demux[port - 1].tun[slave]);
+
+ /* for master, create the real sqp resources */
+ if (!ret && slave == mlx4_master_func_num(dev->dev))
+ ret = create_pv_resources(&dev->ib_dev, slave, port, 0,
+ dev->sriov.sqps[port - 1]);
+ return ret;
+}
+
+void mlx4_ib_tunnels_update_work(struct work_struct *work)
+{
+ struct mlx4_ib_demux_work *dmxw;
+
+ dmxw = container_of(work, struct mlx4_ib_demux_work, work);
+ mlx4_ib_tunnels_update(dmxw->dev, dmxw->slave, (int) dmxw->port,
+ dmxw->do_init);
+ kfree(dmxw);
+ return;
+}
+
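+/*
+ * Set up the per-port demux context: per-slave tunnel contexts, MCG
+ * paravirtualization state and the tunnel and up/down workqueues.
+ */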
+static int mlx4_ib_alloc_demux_ctx(struct mlx4_ib_dev *dev,
+ struct mlx4_ib_demux_ctx *ctx,
+ int port)
+{
+ char name[12];
+ int ret = 0;
+ int i;
+
+ ctx->tun = kcalloc(dev->dev->caps.sqp_demux,
+ sizeof (struct mlx4_ib_demux_pv_ctx *), GFP_KERNEL);
+ if (!ctx->tun)
+ return -ENOMEM;
+
+ ctx->dev = dev;
+ ctx->port = port;
+ ctx->ib_dev = &dev->ib_dev;
+
+ for (i = 0;
+ i < min(dev->dev->caps.sqp_demux,
+ (u16)(dev->dev->persist->num_vfs + 1));
+ i++) {
+ struct mlx4_active_ports actv_ports =
+ mlx4_get_active_ports(dev->dev, i);
+
+ if (!test_bit(port - 1, actv_ports.ports))
+ continue;
+
+ ret = alloc_pv_object(dev, i, port, &ctx->tun[i]);
+ if (ret) {
+ ret = -ENOMEM;
+ goto err_mcg;
+ }
+ }
+
+ ret = mlx4_ib_mcg_port_init(ctx);
+ if (ret) {
+ pr_err("Failed initializing mcg para-virt (%d)\n", ret);
+ goto err_mcg;
+ }
+
+ snprintf(name, sizeof name, "mlx4_ibt%d", port);
+ ctx->wq = create_singlethread_workqueue(name);
+ if (!ctx->wq) {
+ pr_err("Failed to create tunnelling WQ for port %d\n", port);
+ ret = -ENOMEM;
+ goto err_wq;
+ }
+
+ snprintf(name, sizeof name, "mlx4_ibud%d", port);
+ ctx->ud_wq = create_singlethread_workqueue(name);
+ if (!ctx->ud_wq) {
+ pr_err("Failed to create up/down WQ for port %d\n", port);
+ ret = -ENOMEM;
+ goto err_udwq;
+ }
+
+ return 0;
+
+err_udwq:
+ destroy_workqueue(ctx->wq);
+ ctx->wq = NULL;
+
+err_wq:
+ mlx4_ib_mcg_port_cleanup(ctx, 1);
+err_mcg:
+ for (i = 0; i < dev->dev->caps.sqp_demux; i++)
+ free_pv_object(dev, i, port);
+ kfree(ctx->tun);
+ ctx->tun = NULL;
+ return ret;
+}
+
+static void mlx4_ib_free_sqp_ctx(struct mlx4_ib_demux_pv_ctx *sqp_ctx)
+{
+ if (sqp_ctx->state > DEMUX_PV_STATE_DOWN) {
+ sqp_ctx->state = DEMUX_PV_STATE_DOWNING;
+ flush_workqueue(sqp_ctx->wq);
+ if (sqp_ctx->has_smi) {
+ ib_destroy_qp(sqp_ctx->qp[0].qp);
+ sqp_ctx->qp[0].qp = NULL;
+ mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_SMI, 0);
+ }
+ ib_destroy_qp(sqp_ctx->qp[1].qp);
+ sqp_ctx->qp[1].qp = NULL;
+ mlx4_ib_free_pv_qp_bufs(sqp_ctx, IB_QPT_GSI, 0);
+ ib_dereg_mr(sqp_ctx->mr);
+ sqp_ctx->mr = NULL;
+ ib_dealloc_pd(sqp_ctx->pd);
+ sqp_ctx->pd = NULL;
+ ib_destroy_cq(sqp_ctx->cq);
+ sqp_ctx->cq = NULL;
+ sqp_ctx->state = DEMUX_PV_STATE_DOWN;
+ }
+}
+
+static void mlx4_ib_free_demux_ctx(struct mlx4_ib_demux_ctx *ctx)
+{
+ int i;
+ if (ctx) {
+ struct mlx4_ib_dev *dev = to_mdev(ctx->ib_dev);
+ mlx4_ib_mcg_port_cleanup(ctx, 1);
+ for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
+ if (!ctx->tun[i])
+ continue;
+ if (ctx->tun[i]->state > DEMUX_PV_STATE_DOWN)
+ ctx->tun[i]->state = DEMUX_PV_STATE_DOWNING;
+ }
+ flush_workqueue(ctx->wq);
+ for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
+ destroy_pv_resources(dev, i, ctx->port, ctx->tun[i], 0);
+ free_pv_object(dev, i, ctx->port);
+ }
+ kfree(ctx->tun);
+ destroy_workqueue(ctx->ud_wq);
+ destroy_workqueue(ctx->wq);
+ }
+}
+
+static void mlx4_ib_master_tunnels(struct mlx4_ib_dev *dev, int do_init)
+{
+ int i;
+
+ if (!mlx4_is_master(dev->dev))
+ return;
+ /* initialize or tear down tunnel QPs for the master */
+ for (i = 0; i < dev->dev->caps.num_ports; i++)
+ mlx4_ib_tunnels_update(dev, mlx4_master_func_num(dev->dev), i + 1, do_init);
+ return;
+}
+
+int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev)
+{
+ int i = 0;
+ int err;
+
+ if (!mlx4_is_mfunc(dev->dev))
+ return 0;
+
+ dev->sriov.is_going_down = 0;
+ spin_lock_init(&dev->sriov.going_down_lock);
+ mlx4_ib_cm_paravirt_init(dev);
+
+ mlx4_ib_warn(&dev->ib_dev, "multi-function enabled\n");
+
+ if (mlx4_is_slave(dev->dev)) {
+ mlx4_ib_warn(&dev->ib_dev, "operating in qp1 tunnel mode\n");
+ return 0;
+ }
+
+ for (i = 0; i < dev->dev->caps.sqp_demux; i++) {
+ if (i == mlx4_master_func_num(dev->dev))
+ mlx4_put_slave_node_guid(dev->dev, i, dev->ib_dev.node_guid);
+ else
+ mlx4_put_slave_node_guid(dev->dev, i, mlx4_ib_gen_node_guid());
+ }
+
+ err = mlx4_ib_init_alias_guid_service(dev);
+ if (err) {
+ mlx4_ib_warn(&dev->ib_dev, "Failed init alias guid process.\n");
+ goto paravirt_err;
+ }
+ err = mlx4_ib_device_register_sysfs(dev);
+ if (err) {
+ mlx4_ib_warn(&dev->ib_dev, "Failed to register sysfs\n");
+ goto sysfs_err;
+ }
+
+ mlx4_ib_warn(&dev->ib_dev, "initializing demux service for %d qp1 clients\n",
+ dev->dev->caps.sqp_demux);
+ for (i = 0; i < dev->num_ports; i++) {
+ union ib_gid gid;
+ err = __mlx4_ib_query_gid(&dev->ib_dev, i + 1, 0, &gid, 1);
+ if (err)
+ goto demux_err;
+ dev->sriov.demux[i].guid_cache[0] = gid.global.interface_id;
+ err = alloc_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1,
+ &dev->sriov.sqps[i]);
+ if (err)
+ goto demux_err;
+ err = mlx4_ib_alloc_demux_ctx(dev, &dev->sriov.demux[i], i + 1);
+ if (err)
+ goto free_pv;
+ }
+ mlx4_ib_master_tunnels(dev, 1);
+ return 0;
+
+free_pv:
+ free_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1);
+demux_err:
+ while (--i >= 0) {
+ free_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1);
+ mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]);
+ }
+ mlx4_ib_device_unregister_sysfs(dev);
+
+sysfs_err:
+ mlx4_ib_destroy_alias_guid_service(dev);
+
+paravirt_err:
+ mlx4_ib_cm_paravirt_clean(dev, -1);
+
+ return err;
+}
+
+void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev)
+{
+ int i;
+ unsigned long flags;
+
+ if (!mlx4_is_mfunc(dev->dev))
+ return;
+
+ spin_lock_irqsave(&dev->sriov.going_down_lock, flags);
+ dev->sriov.is_going_down = 1;
+ spin_unlock_irqrestore(&dev->sriov.going_down_lock, flags);
+ if (mlx4_is_master(dev->dev)) {
+ for (i = 0; i < dev->num_ports; i++) {
+ flush_workqueue(dev->sriov.demux[i].ud_wq);
+ mlx4_ib_free_sqp_ctx(dev->sriov.sqps[i]);
+ kfree(dev->sriov.sqps[i]);
+ dev->sriov.sqps[i] = NULL;
+ mlx4_ib_free_demux_ctx(&dev->sriov.demux[i]);
+ }
+
+ mlx4_ib_cm_paravirt_clean(dev, -1);
+ mlx4_ib_destroy_alias_guid_service(dev);
+ mlx4_ib_device_unregister_sysfs(dev);
+ }
+}
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
new file mode 100644
index 000000000..cc64400d4
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -0,0 +1,2874 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_vlan.h>
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+
+#include <rdma/ib_smi.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_addr.h>
+
+#include <linux/mlx4/driver.h>
+#include <linux/mlx4/cmd.h>
+#include <linux/mlx4/qp.h>
+
+#include "mlx4_ib.h"
+#include "user.h"
+
+#define DRV_NAME MLX4_IB_DRV_NAME
+#define DRV_VERSION "2.2-1"
+#define DRV_RELDATE "Feb 2014"
+
+#define MLX4_IB_FLOW_MAX_PRIO 0xFFF
+#define MLX4_IB_FLOW_QPN_MASK 0xFFFFFF
+#define MLX4_IB_CARD_REV_A0 0xA0
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("Mellanox ConnectX HCA InfiniBand driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+int mlx4_ib_sm_guid_assign = 0;
+module_param_named(sm_guid_assign, mlx4_ib_sm_guid_assign, int, 0444);
+MODULE_PARM_DESC(sm_guid_assign, "Enable SM alias_GUID assignment if sm_guid_assign > 0 (Default: 0)");
+
+static const char mlx4_ib_version[] =
+ DRV_NAME ": Mellanox ConnectX InfiniBand driver v"
+ DRV_VERSION " (" DRV_RELDATE ")\n";
+
+struct update_gid_work {
+ struct work_struct work;
+ union ib_gid gids[128];
+ struct mlx4_ib_dev *dev;
+ int port;
+};
+
+static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init);
+
+static struct workqueue_struct *wq;
+
+static void init_query_mad(struct ib_smp *mad)
+{
+ mad->base_version = 1;
+ mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+ mad->class_version = 1;
+ mad->method = IB_MGMT_METHOD_GET;
+}
+
+static union ib_gid zgid;
+
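+/*
+ * Device-managed flow steering is reported only if every port type in
+ * use supports it, and never for IB ports in a multi-function setup.
+ */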
+static int check_flow_steering_support(struct mlx4_dev *dev)
+{
+ int eth_num_ports = 0;
+ int ib_num_ports = 0;
+
+ int dmfs = dev->caps.steering_mode == MLX4_STEERING_MODE_DEVICE_MANAGED;
+
+ if (dmfs) {
+ int i;
+ mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_ETH)
+ eth_num_ports++;
+ mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
+ ib_num_ports++;
+ dmfs &= (!ib_num_ports ||
+ (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_DMFS_IPOIB)) &&
+ (!eth_num_ports ||
+ (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_FS_EN));
+ if (ib_num_ports && mlx4_is_mfunc(dev)) {
+ pr_warn("Device managed flow steering is unavailable for IB port in multifunction env.\n");
+ dmfs = 0;
+ }
+ }
+ return dmfs;
+}
+
+static int num_ib_ports(struct mlx4_dev *dev)
+{
+ int ib_ports = 0;
+ int i;
+
+ mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
+ ib_ports++;
+
+ return ib_ports;
+}
+
+static int mlx4_ib_query_device(struct ib_device *ibdev,
+ struct ib_device_attr *props)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+ int have_ib_ports;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
+
+ err = mlx4_MAD_IFC(to_mdev(ibdev), MLX4_MAD_IFC_IGNORE_KEYS,
+ 1, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ memset(props, 0, sizeof *props);
+
+ have_ib_ports = num_ib_ports(dev->dev);
+
+ props->fw_ver = dev->dev->caps.fw_ver;
+ props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT |
+ IB_DEVICE_PORT_ACTIVE_EVENT |
+ IB_DEVICE_SYS_IMAGE_GUID |
+ IB_DEVICE_RC_RNR_NAK_GEN |
+ IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_PKEY_CNTR)
+ props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BAD_QKEY_CNTR)
+ props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_APM && have_ib_ports)
+ props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_UD_AV_PORT)
+ props->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE;
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IPOIB_CSUM)
+ props->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
+ if (dev->dev->caps.max_gso_sz &&
+ (dev->dev->rev_id != MLX4_IB_CARD_REV_A0) &&
+ (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_BLH))
+ props->device_cap_flags |= IB_DEVICE_UD_TSO;
+ if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_RESERVED_LKEY)
+ props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY;
+ if ((dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_LOCAL_INV) &&
+ (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_REMOTE_INV) &&
+ (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_FAST_REG_WR))
+ props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC)
+ props->device_cap_flags |= IB_DEVICE_XRC;
+ if (dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW)
+ props->device_cap_flags |= IB_DEVICE_MEM_WINDOW;
+ if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) {
+ if (dev->dev->caps.bmme_flags & MLX4_BMME_FLAG_WIN_TYPE_2B)
+ props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2B;
+ else
+ props->device_cap_flags |= IB_DEVICE_MEM_WINDOW_TYPE_2A;
+ if (dev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED)
+ props->device_cap_flags |= IB_DEVICE_MANAGED_FLOW_STEERING;
+ }
+
+ props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) &
+ 0xffffff;
+ props->vendor_part_id = dev->dev->persist->pdev->device;
+ props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32));
+ memcpy(&props->sys_image_guid, out_mad->data + 4, 8);
+
+ props->max_mr_size = ~0ull;
+ props->page_size_cap = dev->dev->caps.page_size_cap;
+ props->max_qp = dev->dev->quotas.qp;
+ props->max_qp_wr = dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE;
+ props->max_sge = min(dev->dev->caps.max_sq_sg,
+ dev->dev->caps.max_rq_sg);
+ props->max_cq = dev->dev->quotas.cq;
+ props->max_cqe = dev->dev->caps.max_cqes;
+ props->max_mr = dev->dev->quotas.mpt;
+ props->max_pd = dev->dev->caps.num_pds - dev->dev->caps.reserved_pds;
+ props->max_qp_rd_atom = dev->dev->caps.max_qp_dest_rdma;
+ props->max_qp_init_rd_atom = dev->dev->caps.max_qp_init_rdma;
+ props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp;
+ props->max_srq = dev->dev->quotas.srq;
+ props->max_srq_wr = dev->dev->caps.max_srq_wqes - 1;
+ props->max_srq_sge = dev->dev->caps.max_srq_sge;
+ props->max_fast_reg_page_list_len = MLX4_MAX_FAST_REG_PAGES;
+ props->local_ca_ack_delay = dev->dev->caps.local_ca_ack_delay;
+ props->atomic_cap = dev->dev->caps.flags & MLX4_DEV_CAP_FLAG_ATOMIC ?
+ IB_ATOMIC_HCA : IB_ATOMIC_NONE;
+ props->masked_atomic_cap = props->atomic_cap;
+ props->max_pkeys = dev->dev->caps.pkey_table_len[1];
+ props->max_mcast_grp = dev->dev->caps.num_mgms + dev->dev->caps.num_amgms;
+ props->max_mcast_qp_attach = dev->dev->caps.num_qp_per_mgm;
+ props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
+ props->max_mcast_grp;
+ props->max_map_per_fmr = dev->dev->caps.max_fmr_maps;
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+
+ return err;
+}
+
+static enum rdma_link_layer
+mlx4_ib_port_link_layer(struct ib_device *device, u8 port_num)
+{
+ struct mlx4_dev *dev = to_mdev(device)->dev;
+
+ return dev->caps.port_mask[port_num] == MLX4_PORT_TYPE_IB ?
+ IB_LINK_LAYER_INFINIBAND : IB_LINK_LAYER_ETHERNET;
+}
+
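+/*
+ * Query IB port attributes by issuing a PortInfo MAD through MAD_IFC and
+ * decoding the returned attribute fields.
+ */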
+static int ib_link_query_port(struct ib_device *ibdev, u8 port,
+ struct ib_port_attr *props, int netw_view)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int ext_active_speed;
+ int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS;
+ int err = -ENOMEM;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_PORT_INFO;
+ in_mad->attr_mod = cpu_to_be32(port);
+
+ if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view)
+ mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW;
+
+ err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL,
+ in_mad, out_mad);
+ if (err)
+ goto out;
+
+ props->lid = be16_to_cpup((__be16 *) (out_mad->data + 16));
+ props->lmc = out_mad->data[34] & 0x7;
+ props->sm_lid = be16_to_cpup((__be16 *) (out_mad->data + 18));
+ props->sm_sl = out_mad->data[36] & 0xf;
+ props->state = out_mad->data[32] & 0xf;
+ props->phys_state = out_mad->data[33] >> 4;
+ props->port_cap_flags = be32_to_cpup((__be32 *) (out_mad->data + 20));
+ if (netw_view)
+ props->gid_tbl_len = out_mad->data[50];
+ else
+ props->gid_tbl_len = to_mdev(ibdev)->dev->caps.gid_table_len[port];
+ props->max_msg_sz = to_mdev(ibdev)->dev->caps.max_msg_sz;
+ props->pkey_tbl_len = to_mdev(ibdev)->dev->caps.pkey_table_len[port];
+ props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46));
+ props->qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad->data + 48));
+ props->active_width = out_mad->data[31] & 0xf;
+ props->active_speed = out_mad->data[35] >> 4;
+ props->max_mtu = out_mad->data[41] & 0xf;
+ props->active_mtu = out_mad->data[36] >> 4;
+ props->subnet_timeout = out_mad->data[51] & 0x1f;
+ props->max_vl_num = out_mad->data[37] >> 4;
+ props->init_type_reply = out_mad->data[41] >> 4;
+
+ /* Check if extended speeds (EDR/FDR/...) are supported */
+ if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) {
+ ext_active_speed = out_mad->data[62] >> 4;
+
+ switch (ext_active_speed) {
+ case 1:
+ props->active_speed = IB_SPEED_FDR;
+ break;
+ case 2:
+ props->active_speed = IB_SPEED_EDR;
+ break;
+ }
+ }
+
+ /* If reported active speed is QDR, check if is FDR-10 */
+ if (props->active_speed == IB_SPEED_QDR) {
+ init_query_mad(in_mad);
+ in_mad->attr_id = MLX4_ATTR_EXTENDED_PORT_INFO;
+ in_mad->attr_mod = cpu_to_be32(port);
+
+ err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port,
+ NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ /* Checking LinkSpeedActive for FDR-10 */
+ if (out_mad->data[15] & 0x1)
+ props->active_speed = IB_SPEED_FDR10;
+ }
+
+ /* Avoid wrong speed value returned by FW if the IB link is down. */
+ if (props->state == IB_PORT_DOWN)
+ props->active_speed = IB_SPEED_SDR;
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
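+/* IB PortPhysicalState: 5 = LinkUp, 3 = Disabled */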
+static u8 state_to_phys_state(enum ib_port_state state)
+{
+ return state == IB_PORT_ACTIVE ? 5 : 3;
+}
+
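+/*
+ * Query the attributes of an Ethernet (RoCE) port: width comes from the
+ * QUERY_PORT command, while state and MTU are taken from the associated
+ * net device (or its bond master when the device is bonded).
+ */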
+static int eth_link_query_port(struct ib_device *ibdev, u8 port,
+ struct ib_port_attr *props, int netw_view)
+{
+ struct mlx4_ib_dev *mdev = to_mdev(ibdev);
+ struct mlx4_ib_iboe *iboe = &mdev->iboe;
+ struct net_device *ndev;
+ enum ib_mtu tmp;
+ struct mlx4_cmd_mailbox *mailbox;
+ int err = 0;
+ int is_bonded = mlx4_is_bonded(mdev->dev);
+
+ mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+
+ err = mlx4_cmd_box(mdev->dev, 0, mailbox->dma, port, 0,
+ MLX4_CMD_QUERY_PORT, MLX4_CMD_TIME_CLASS_B,
+ MLX4_CMD_WRAPPED);
+ if (err)
+ goto out;
+
+ props->active_width = (((u8 *)mailbox->buf)[5] == 0x40) ?
+ IB_WIDTH_4X : IB_WIDTH_1X;
+ props->active_speed = IB_SPEED_QDR;
+ props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_IP_BASED_GIDS;
+ props->gid_tbl_len = mdev->dev->caps.gid_table_len[port];
+ props->max_msg_sz = mdev->dev->caps.max_msg_sz;
+ props->pkey_tbl_len = 1;
+ props->max_mtu = IB_MTU_4096;
+ props->max_vl_num = 2;
+ props->state = IB_PORT_DOWN;
+ props->phys_state = state_to_phys_state(props->state);
+ props->active_mtu = IB_MTU_256;
+ if (is_bonded)
+ rtnl_lock(); /* required to get upper dev */
+ spin_lock_bh(&iboe->lock);
+ ndev = iboe->netdevs[port - 1];
+ if (ndev && is_bonded)
+ ndev = netdev_master_upper_dev_get(ndev);
+ if (!ndev)
+ goto out_unlock;
+
+ tmp = iboe_get_mtu(ndev->mtu);
+ props->active_mtu = tmp ? min(props->max_mtu, tmp) : IB_MTU_256;
+
+ props->state = (netif_running(ndev) && netif_carrier_ok(ndev)) ?
+ IB_PORT_ACTIVE : IB_PORT_DOWN;
+ props->phys_state = state_to_phys_state(props->state);
+out_unlock:
+ spin_unlock_bh(&iboe->lock);
+ if (is_bonded)
+ rtnl_unlock();
+out:
+ mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+ return err;
+}
+
+int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
+ struct ib_port_attr *props, int netw_view)
+{
+ int err;
+
+ memset(props, 0, sizeof *props);
+
+ err = mlx4_ib_port_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND ?
+ ib_link_query_port(ibdev, port, props, netw_view) :
+ eth_link_query_port(ibdev, port, props, netw_view);
+
+ return err;
+}
+
+static int mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
+ struct ib_port_attr *props)
+{
+ /* returns host view */
+ return __mlx4_ib_query_port(ibdev, port, props, 0);
+}
+
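+/*
+ * Query a GID of an IB port: the GID prefix comes from PORT_INFO and the
+ * GUID from GUID_INFO.  In multi-function host view only index 0 is
+ * valid; any other index returns a zeroed GUID.
+ */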
+int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+ union ib_gid *gid, int netw_view)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ int clear = 0;
+ int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_PORT_INFO;
+ in_mad->attr_mod = cpu_to_be32(port);
+
+ if (mlx4_is_mfunc(dev->dev) && netw_view)
+ mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW;
+
+ err = mlx4_MAD_IFC(dev, mad_ifc_flags, port, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ memcpy(gid->raw, out_mad->data + 8, 8);
+
+ if (mlx4_is_mfunc(dev->dev) && !netw_view) {
+ if (index) {
+ /* For any index > 0, return the null guid */
+ err = 0;
+ clear = 1;
+ goto out;
+ }
+ }
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_GUID_INFO;
+ in_mad->attr_mod = cpu_to_be32(index / 8);
+
+ err = mlx4_MAD_IFC(dev, mad_ifc_flags, port,
+ NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8);
+
+out:
+ if (clear)
+ memset(gid->raw + 8, 0, 8);
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static int iboe_query_gid(struct ib_device *ibdev, u8 port, int index,
+ union ib_gid *gid)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+
+ *gid = dev->iboe.gid_table[port - 1][index];
+
+ return 0;
+}
+
+static int mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+ union ib_gid *gid)
+{
+ if (rdma_port_get_link_layer(ibdev, port) == IB_LINK_LAYER_INFINIBAND)
+ return __mlx4_ib_query_gid(ibdev, port, index, gid, 0);
+ else
+ return iboe_query_gid(ibdev, port, index, gid);
+}
+
+int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+ u16 *pkey, int netw_view)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS;
+ int err = -ENOMEM;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE;
+ in_mad->attr_mod = cpu_to_be32(index / 32);
+
+ if (mlx4_is_mfunc(to_mdev(ibdev)->dev) && netw_view)
+ mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW;
+
+ err = mlx4_MAD_IFC(to_mdev(ibdev), mad_ifc_flags, port, NULL, NULL,
+ in_mad, out_mad);
+ if (err)
+ goto out;
+
+ *pkey = be16_to_cpu(((__be16 *) out_mad->data)[index % 32]);
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static int mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
+{
+ return __mlx4_ib_query_pkey(ibdev, port, index, pkey, 0);
+}
+
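+/*
+ * Only node description changes are supported.  The new description is
+ * cached under sm_lock and, when possible, passed to FW with SET_NODE.
+ */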
+static int mlx4_ib_modify_device(struct ib_device *ibdev, int mask,
+ struct ib_device_modify *props)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ unsigned long flags;
+
+ if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
+ return -EOPNOTSUPP;
+
+ if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
+ return 0;
+
+ if (mlx4_is_slave(to_mdev(ibdev)->dev))
+ return -EOPNOTSUPP;
+
+ spin_lock_irqsave(&to_mdev(ibdev)->sm_lock, flags);
+ memcpy(ibdev->node_desc, props->node_desc, 64);
+ spin_unlock_irqrestore(&to_mdev(ibdev)->sm_lock, flags);
+
+ /*
+ * If possible, pass the node desc to FW so it can generate
+ * a trap 144 (node description changed). If the command
+ * fails, just ignore it.
+ */
+ mailbox = mlx4_alloc_cmd_mailbox(to_mdev(ibdev)->dev);
+ if (IS_ERR(mailbox))
+ return 0;
+
+ memcpy(mailbox->buf, props->node_desc, 64);
+ mlx4_cmd(to_mdev(ibdev)->dev, mailbox->dma, 1, 0,
+ MLX4_CMD_SET_NODE, MLX4_CMD_TIME_CLASS_A, MLX4_CMD_NATIVE);
+
+ mlx4_free_cmd_mailbox(to_mdev(ibdev)->dev, mailbox);
+
+ return 0;
+}
+
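+/*
+ * Issue SET_PORT for an IB port to update the capability mask and
+ * optionally reset the QKey violation counter; the mailbox layout
+ * depends on whether the device uses the old port command format.
+ */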
+static int mlx4_ib_SET_PORT(struct mlx4_ib_dev *dev, u8 port, int reset_qkey_viols,
+ u32 cap_mask)
+{
+ struct mlx4_cmd_mailbox *mailbox;
+ int err;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev->dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+
+ if (dev->dev->flags & MLX4_FLAG_OLD_PORT_CMDS) {
+ *(u8 *) mailbox->buf = !!reset_qkey_viols << 6;
+ ((__be32 *) mailbox->buf)[2] = cpu_to_be32(cap_mask);
+ } else {
+ ((u8 *) mailbox->buf)[3] = !!reset_qkey_viols;
+ ((__be32 *) mailbox->buf)[1] = cpu_to_be32(cap_mask);
+ }
+
+ err = mlx4_cmd(dev->dev, mailbox->dma, port, MLX4_SET_PORT_IB_OPCODE,
+ MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B,
+ MLX4_CMD_WRAPPED);
+
+ mlx4_free_cmd_mailbox(dev->dev, mailbox);
+ return err;
+}
+
+static int mlx4_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
+ struct ib_port_modify *props)
+{
+ struct mlx4_ib_dev *mdev = to_mdev(ibdev);
+ u8 is_eth = mdev->dev->caps.port_type[port] == MLX4_PORT_TYPE_ETH;
+ struct ib_port_attr attr;
+ u32 cap_mask;
+ int err;
+
+ /* return OK if this is RoCE. CM calls ib_modify_port() regardless
+ * of whether port link layer is ETH or IB. For ETH ports, qkey
+ * violations and port capabilities are not meaningful.
+ */
+ if (is_eth)
+ return 0;
+
+ mutex_lock(&mdev->cap_mask_mutex);
+
+ err = mlx4_ib_query_port(ibdev, port, &attr);
+ if (err)
+ goto out;
+
+ cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) &
+ ~props->clr_port_cap_mask;
+
+ err = mlx4_ib_SET_PORT(mdev, port,
+ !!(mask & IB_PORT_RESET_QKEY_CNTR),
+ cap_mask);
+
+out:
+ mutex_unlock(&mdev->cap_mask_mutex);
+ return err;
+}
+
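+/*
+ * Allocate a user context: reserve a UAR for the process and return the
+ * device limits to userspace in the layout matching the negotiated ABI
+ * version.
+ */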
+static struct ib_ucontext *mlx4_ib_alloc_ucontext(struct ib_device *ibdev,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct mlx4_ib_ucontext *context;
+ struct mlx4_ib_alloc_ucontext_resp_v3 resp_v3;
+ struct mlx4_ib_alloc_ucontext_resp resp;
+ int err;
+
+ if (!dev->ib_active)
+ return ERR_PTR(-EAGAIN);
+
+ if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION) {
+ resp_v3.qp_tab_size = dev->dev->caps.num_qps;
+ resp_v3.bf_reg_size = dev->dev->caps.bf_reg_size;
+ resp_v3.bf_regs_per_page = dev->dev->caps.bf_regs_per_page;
+ } else {
+ resp.dev_caps = dev->dev->caps.userspace_caps;
+ resp.qp_tab_size = dev->dev->caps.num_qps;
+ resp.bf_reg_size = dev->dev->caps.bf_reg_size;
+ resp.bf_regs_per_page = dev->dev->caps.bf_regs_per_page;
+ resp.cqe_size = dev->dev->caps.cqe_size;
+ }
+
+ context = kmalloc(sizeof *context, GFP_KERNEL);
+ if (!context)
+ return ERR_PTR(-ENOMEM);
+
+ err = mlx4_uar_alloc(to_mdev(ibdev)->dev, &context->uar);
+ if (err) {
+ kfree(context);
+ return ERR_PTR(err);
+ }
+
+ INIT_LIST_HEAD(&context->db_page_list);
+ mutex_init(&context->db_page_mutex);
+
+ if (ibdev->uverbs_abi_ver == MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION)
+ err = ib_copy_to_udata(udata, &resp_v3, sizeof(resp_v3));
+ else
+ err = ib_copy_to_udata(udata, &resp, sizeof(resp));
+
+ if (err) {
+ mlx4_uar_free(to_mdev(ibdev)->dev, &context->uar);
+ kfree(context);
+ return ERR_PTR(-EFAULT);
+ }
+
+ return &context->ibucontext;
+}
+
+static int mlx4_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
+{
+ struct mlx4_ib_ucontext *context = to_mucontext(ibcontext);
+
+ mlx4_uar_free(to_mdev(ibcontext->device)->dev, &context->uar);
+ kfree(context);
+
+ return 0;
+}
+
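+/*
+ * Map doorbell pages to userspace: offset 0 is the UAR page (mapped
+ * non-cached), offset 1 is the blue-flame page (mapped write-combined)
+ * when blue flame is supported.
+ */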
+static int mlx4_ib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+ struct mlx4_ib_dev *dev = to_mdev(context->device);
+
+ if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+ return -EINVAL;
+
+ if (vma->vm_pgoff == 0) {
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+ if (io_remap_pfn_range(vma, vma->vm_start,
+ to_mucontext(context)->uar.pfn,
+ PAGE_SIZE, vma->vm_page_prot))
+ return -EAGAIN;
+ } else if (vma->vm_pgoff == 1 && dev->dev->caps.bf_reg_size != 0) {
+ vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+
+ if (io_remap_pfn_range(vma, vma->vm_start,
+ to_mucontext(context)->uar.pfn +
+ dev->dev->caps.num_uars,
+ PAGE_SIZE, vma->vm_page_prot))
+ return -EAGAIN;
+ } else
+ return -EINVAL;
+
+ return 0;
+}
+
+static struct ib_pd *mlx4_ib_alloc_pd(struct ib_device *ibdev,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_pd *pd;
+ int err;
+
+ pd = kmalloc(sizeof *pd, GFP_KERNEL);
+ if (!pd)
+ return ERR_PTR(-ENOMEM);
+
+ err = mlx4_pd_alloc(to_mdev(ibdev)->dev, &pd->pdn);
+ if (err) {
+ kfree(pd);
+ return ERR_PTR(err);
+ }
+
+ if (context)
+ if (ib_copy_to_udata(udata, &pd->pdn, sizeof (__u32))) {
+ mlx4_pd_free(to_mdev(ibdev)->dev, pd->pdn);
+ kfree(pd);
+ return ERR_PTR(-EFAULT);
+ }
+
+ return &pd->ibpd;
+}
+
+static int mlx4_ib_dealloc_pd(struct ib_pd *pd)
+{
+ mlx4_pd_free(to_mdev(pd->device)->dev, to_mpd(pd)->pdn);
+ kfree(pd);
+
+ return 0;
+}
+
+static struct ib_xrcd *mlx4_ib_alloc_xrcd(struct ib_device *ibdev,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_xrcd *xrcd;
+ int err;
+
+ if (!(to_mdev(ibdev)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
+ return ERR_PTR(-ENOSYS);
+
+ xrcd = kmalloc(sizeof *xrcd, GFP_KERNEL);
+ if (!xrcd)
+ return ERR_PTR(-ENOMEM);
+
+ err = mlx4_xrcd_alloc(to_mdev(ibdev)->dev, &xrcd->xrcdn);
+ if (err)
+ goto err1;
+
+ xrcd->pd = ib_alloc_pd(ibdev);
+ if (IS_ERR(xrcd->pd)) {
+ err = PTR_ERR(xrcd->pd);
+ goto err2;
+ }
+
+ xrcd->cq = ib_create_cq(ibdev, NULL, NULL, xrcd, 1, 0);
+ if (IS_ERR(xrcd->cq)) {
+ err = PTR_ERR(xrcd->cq);
+ goto err3;
+ }
+
+ return &xrcd->ibxrcd;
+
+err3:
+ ib_dealloc_pd(xrcd->pd);
+err2:
+ mlx4_xrcd_free(to_mdev(ibdev)->dev, xrcd->xrcdn);
+err1:
+ kfree(xrcd);
+ return ERR_PTR(err);
+}
+
+static int mlx4_ib_dealloc_xrcd(struct ib_xrcd *xrcd)
+{
+ ib_destroy_cq(to_mxrcd(xrcd)->cq);
+ ib_dealloc_pd(to_mxrcd(xrcd)->pd);
+ mlx4_xrcd_free(to_mdev(xrcd->device)->dev, to_mxrcd(xrcd)->xrcdn);
+ kfree(xrcd);
+
+ return 0;
+}
+
+static int add_gid_entry(struct ib_qp *ibqp, union ib_gid *gid)
+{
+ struct mlx4_ib_qp *mqp = to_mqp(ibqp);
+ struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+ struct mlx4_ib_gid_entry *ge;
+
+ ge = kzalloc(sizeof *ge, GFP_KERNEL);
+ if (!ge)
+ return -ENOMEM;
+
+ ge->gid = *gid;
+ if (mlx4_ib_add_mc(mdev, mqp, gid)) {
+ ge->port = mqp->port;
+ ge->added = 1;
+ }
+
+ mutex_lock(&mqp->mutex);
+ list_add_tail(&ge->list, &mqp->gid_list);
+ mutex_unlock(&mqp->mutex);
+
+ return 0;
+}
+
+int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
+ union ib_gid *gid)
+{
+ struct net_device *ndev;
+ int ret = 0;
+
+ if (!mqp->port)
+ return 0;
+
+ spin_lock_bh(&mdev->iboe.lock);
+ ndev = mdev->iboe.netdevs[mqp->port - 1];
+ if (ndev)
+ dev_hold(ndev);
+ spin_unlock_bh(&mdev->iboe.lock);
+
+ if (ndev) {
+ ret = 1;
+ dev_put(ndev);
+ }
+
+ return ret;
+}
+
+struct mlx4_ib_steering {
+ struct list_head list;
+ struct mlx4_flow_reg_id reg_id;
+ union ib_gid gid;
+};
+
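+/*
+ * Translate a single ib_flow_spec into the hardware steering rule
+ * format.  Returns the number of bytes written to the rule buffer, or
+ * -EINVAL for unsupported spec types.
+ */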
+static int parse_flow_attr(struct mlx4_dev *dev,
+ u32 qp_num,
+ union ib_flow_spec *ib_spec,
+ struct _rule_hw *mlx4_spec)
+{
+ enum mlx4_net_trans_rule_id type;
+
+ switch (ib_spec->type) {
+ case IB_FLOW_SPEC_ETH:
+ type = MLX4_NET_TRANS_RULE_ID_ETH;
+ memcpy(mlx4_spec->eth.dst_mac, ib_spec->eth.val.dst_mac,
+ ETH_ALEN);
+ memcpy(mlx4_spec->eth.dst_mac_msk, ib_spec->eth.mask.dst_mac,
+ ETH_ALEN);
+ mlx4_spec->eth.vlan_tag = ib_spec->eth.val.vlan_tag;
+ mlx4_spec->eth.vlan_tag_msk = ib_spec->eth.mask.vlan_tag;
+ break;
+ case IB_FLOW_SPEC_IB:
+ type = MLX4_NET_TRANS_RULE_ID_IB;
+ mlx4_spec->ib.l3_qpn =
+ cpu_to_be32(qp_num);
+ mlx4_spec->ib.qpn_mask =
+ cpu_to_be32(MLX4_IB_FLOW_QPN_MASK);
+ break;
+
+ case IB_FLOW_SPEC_IPV4:
+ type = MLX4_NET_TRANS_RULE_ID_IPV4;
+ mlx4_spec->ipv4.src_ip = ib_spec->ipv4.val.src_ip;
+ mlx4_spec->ipv4.src_ip_msk = ib_spec->ipv4.mask.src_ip;
+ mlx4_spec->ipv4.dst_ip = ib_spec->ipv4.val.dst_ip;
+ mlx4_spec->ipv4.dst_ip_msk = ib_spec->ipv4.mask.dst_ip;
+ break;
+
+ case IB_FLOW_SPEC_TCP:
+ case IB_FLOW_SPEC_UDP:
+ type = ib_spec->type == IB_FLOW_SPEC_TCP ?
+ MLX4_NET_TRANS_RULE_ID_TCP :
+ MLX4_NET_TRANS_RULE_ID_UDP;
+ mlx4_spec->tcp_udp.dst_port = ib_spec->tcp_udp.val.dst_port;
+ mlx4_spec->tcp_udp.dst_port_msk = ib_spec->tcp_udp.mask.dst_port;
+ mlx4_spec->tcp_udp.src_port = ib_spec->tcp_udp.val.src_port;
+ mlx4_spec->tcp_udp.src_port_msk = ib_spec->tcp_udp.mask.src_port;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ if (mlx4_map_sw_to_hw_steering_id(dev, type) < 0 ||
+ mlx4_hw_rule_sz(dev, type) < 0)
+ return -EINVAL;
+ mlx4_spec->id = cpu_to_be16(mlx4_map_sw_to_hw_steering_id(dev, type));
+ mlx4_spec->size = mlx4_hw_rule_sz(dev, type) >> 2;
+ return mlx4_hw_rule_sz(dev, type);
+}
+
+struct default_rules {
+ __u32 mandatory_fields[IB_FLOW_SPEC_SUPPORT_LAYERS];
+ __u32 mandatory_not_fields[IB_FLOW_SPEC_SUPPORT_LAYERS];
+ __u32 rules_create_list[IB_FLOW_SPEC_SUPPORT_LAYERS];
+ __u8 link_layer;
+};
+static const struct default_rules default_table[] = {
+ {
+ .mandatory_fields = {IB_FLOW_SPEC_IPV4},
+ .mandatory_not_fields = {IB_FLOW_SPEC_ETH},
+ .rules_create_list = {IB_FLOW_SPEC_IB},
+ .link_layer = IB_LINK_LAYER_INFINIBAND
+ }
+};
+
+static int __mlx4_ib_default_rules_match(struct ib_qp *qp,
+ struct ib_flow_attr *flow_attr)
+{
+ int i, j, k;
+ void *ib_flow;
+ const struct default_rules *pdefault_rules = default_table;
+ u8 link_layer = rdma_port_get_link_layer(qp->device, flow_attr->port);
+
+ for (i = 0; i < ARRAY_SIZE(default_table); i++, pdefault_rules++) {
+ __u32 field_types[IB_FLOW_SPEC_SUPPORT_LAYERS];
+ memset(&field_types, 0, sizeof(field_types));
+
+ if (link_layer != pdefault_rules->link_layer)
+ continue;
+
+ ib_flow = flow_attr + 1;
+ /* we assume the specs are sorted */
+ for (j = 0, k = 0; k < IB_FLOW_SPEC_SUPPORT_LAYERS &&
+ j < flow_attr->num_of_specs; k++) {
+ union ib_flow_spec *current_flow =
+ (union ib_flow_spec *)ib_flow;
+
+ /* same layer but different type */
+ if (((current_flow->type & IB_FLOW_SPEC_LAYER_MASK) ==
+ (pdefault_rules->mandatory_fields[k] &
+ IB_FLOW_SPEC_LAYER_MASK)) &&
+ (current_flow->type !=
+ pdefault_rules->mandatory_fields[k]))
+ goto out;
+
+ /* same layer, try match next one */
+ if (current_flow->type ==
+ pdefault_rules->mandatory_fields[k]) {
+ j++;
+ ib_flow +=
+ ((union ib_flow_spec *)ib_flow)->size;
+ }
+ }
+
+ ib_flow = flow_attr + 1;
+ for (j = 0; j < flow_attr->num_of_specs;
+ j++, ib_flow += ((union ib_flow_spec *)ib_flow)->size)
+ for (k = 0; k < IB_FLOW_SPEC_SUPPORT_LAYERS; k++)
+ /* same layer and same type */
+ if (((union ib_flow_spec *)ib_flow)->type ==
+ pdefault_rules->mandatory_not_fields[k])
+ goto out;
+
+ return i;
+ }
+out:
+ return -1;
+}
+
+static int __mlx4_ib_create_default_rules(
+ struct mlx4_ib_dev *mdev,
+ struct ib_qp *qp,
+ const struct default_rules *pdefault_rules,
+ struct _rule_hw *mlx4_spec) {
+ int size = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(pdefault_rules->rules_create_list); i++) {
+ int ret;
+ union ib_flow_spec ib_spec;
+ switch (pdefault_rules->rules_create_list[i]) {
+ case 0:
+ /* no rule */
+ continue;
+ case IB_FLOW_SPEC_IB:
+ ib_spec.type = IB_FLOW_SPEC_IB;
+ ib_spec.size = sizeof(struct ib_flow_spec_ib);
+
+ break;
+ default:
+ /* invalid rule */
+ return -EINVAL;
+ }
+ /* We must add an empty rule here; the QPN is ignored */
+ ret = parse_flow_attr(mdev->dev, 0, &ib_spec,
+ mlx4_spec);
+ if (ret < 0) {
+ pr_info("invalid parsing\n");
+ return -EINVAL;
+ }
+
+ mlx4_spec = (void *)mlx4_spec + ret;
+ size += ret;
+ }
+ return size;
+}
+
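+/*
+ * Build a hardware steering rule in a command mailbox (control segment,
+ * optional default rules, then the user-supplied specs) and attach it
+ * with MLX4_QP_FLOW_STEERING_ATTACH.  The returned registration id is
+ * used later to detach the rule.
+ */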
+static int __mlx4_ib_create_flow(struct ib_qp *qp, struct ib_flow_attr *flow_attr,
+ int domain,
+ enum mlx4_net_trans_promisc_mode flow_type,
+ u64 *reg_id)
+{
+ int ret, i;
+ int size = 0;
+ void *ib_flow;
+ struct mlx4_ib_dev *mdev = to_mdev(qp->device);
+ struct mlx4_cmd_mailbox *mailbox;
+ struct mlx4_net_trans_rule_hw_ctrl *ctrl;
+ int default_flow;
+
+ static const u16 __mlx4_domain[] = {
+ [IB_FLOW_DOMAIN_USER] = MLX4_DOMAIN_UVERBS,
+ [IB_FLOW_DOMAIN_ETHTOOL] = MLX4_DOMAIN_ETHTOOL,
+ [IB_FLOW_DOMAIN_RFS] = MLX4_DOMAIN_RFS,
+ [IB_FLOW_DOMAIN_NIC] = MLX4_DOMAIN_NIC,
+ };
+
+ if (flow_attr->priority > MLX4_IB_FLOW_MAX_PRIO) {
+ pr_err("Invalid priority value %d\n", flow_attr->priority);
+ return -EINVAL;
+ }
+
+ if (domain >= IB_FLOW_DOMAIN_NUM) {
+ pr_err("Invalid domain value %d\n", domain);
+ return -EINVAL;
+ }
+
+ if (mlx4_map_sw_to_hw_steering_mode(mdev->dev, flow_type) < 0)
+ return -EINVAL;
+
+ mailbox = mlx4_alloc_cmd_mailbox(mdev->dev);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ ctrl = mailbox->buf;
+
+ ctrl->prio = cpu_to_be16(__mlx4_domain[domain] |
+ flow_attr->priority);
+ ctrl->type = mlx4_map_sw_to_hw_steering_mode(mdev->dev, flow_type);
+ ctrl->port = flow_attr->port;
+ ctrl->qpn = cpu_to_be32(qp->qp_num);
+
+ ib_flow = flow_attr + 1;
+ size += sizeof(struct mlx4_net_trans_rule_hw_ctrl);
+ /* Add default flows */
+ default_flow = __mlx4_ib_default_rules_match(qp, flow_attr);
+ if (default_flow >= 0) {
+ ret = __mlx4_ib_create_default_rules(
+ mdev, qp, default_table + default_flow,
+ mailbox->buf + size);
+ if (ret < 0) {
+ mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+ return -EINVAL;
+ }
+ size += ret;
+ }
+ for (i = 0; i < flow_attr->num_of_specs; i++) {
+ ret = parse_flow_attr(mdev->dev, qp->qp_num, ib_flow,
+ mailbox->buf + size);
+ if (ret < 0) {
+ mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+ return -EINVAL;
+ }
+ ib_flow += ((union ib_flow_spec *) ib_flow)->size;
+ size += ret;
+ }
+
+ ret = mlx4_cmd_imm(mdev->dev, mailbox->dma, reg_id, size >> 2, 0,
+ MLX4_QP_FLOW_STEERING_ATTACH, MLX4_CMD_TIME_CLASS_A,
+ MLX4_CMD_NATIVE);
+ if (ret == -ENOMEM)
+ pr_err("mcg table is full. Failed to register network rule.\n");
+ else if (ret == -ENXIO)
+ pr_err("Device managed flow steering is disabled. Failed to register network rule.\n");
+ else if (ret)
+ pr_err("Invalid argument. Failed to register network rule.\n");
+
+ mlx4_free_cmd_mailbox(mdev->dev, mailbox);
+ return ret;
+}
+
+static int __mlx4_ib_destroy_flow(struct mlx4_dev *dev, u64 reg_id)
+{
+ int err;
+ err = mlx4_cmd(dev, reg_id, 0, 0,
+ MLX4_QP_FLOW_STEERING_DETACH, MLX4_CMD_TIME_CLASS_A,
+ MLX4_CMD_NATIVE);
+ if (err)
+ pr_err("Fail to detach network rule. registration id = 0x%llx\n",
+ reg_id);
+ return err;
+}
+
+static int mlx4_ib_tunnel_steer_add(struct ib_qp *qp, struct ib_flow_attr *flow_attr,
+ u64 *reg_id)
+{
+ void *ib_flow;
+ union ib_flow_spec *ib_spec;
+ struct mlx4_dev *dev = to_mdev(qp->device)->dev;
+ int err = 0;
+
+ if (dev->caps.tunnel_offload_mode != MLX4_TUNNEL_OFFLOAD_MODE_VXLAN ||
+ dev->caps.dmfs_high_steer_mode == MLX4_STEERING_DMFS_A0_STATIC)
+ return 0; /* do nothing */
+
+ ib_flow = flow_attr + 1;
+ ib_spec = (union ib_flow_spec *)ib_flow;
+
+ if (ib_spec->type != IB_FLOW_SPEC_ETH || flow_attr->num_of_specs != 1)
+ return 0; /* do nothing */
+
+ err = mlx4_tunnel_steer_add(to_mdev(qp->device)->dev, ib_spec->eth.val.dst_mac,
+ flow_attr->port, qp->qp_num,
+ MLX4_DOMAIN_UVERBS | (flow_attr->priority & 0xff),
+ reg_id);
+ return err;
+}
+
+static struct ib_flow *mlx4_ib_create_flow(struct ib_qp *qp,
+ struct ib_flow_attr *flow_attr,
+ int domain)
+{
+ int err = 0, i = 0, j = 0;
+ struct mlx4_ib_flow *mflow;
+ enum mlx4_net_trans_promisc_mode type[2];
+ struct mlx4_dev *dev = (to_mdev(qp->device))->dev;
+ int is_bonded = mlx4_is_bonded(dev);
+
+ memset(type, 0, sizeof(type));
+
+ mflow = kzalloc(sizeof(*mflow), GFP_KERNEL);
+ if (!mflow) {
+ err = -ENOMEM;
+ goto err_free;
+ }
+
+ switch (flow_attr->type) {
+ case IB_FLOW_ATTR_NORMAL:
+ type[0] = MLX4_FS_REGULAR;
+ break;
+
+ case IB_FLOW_ATTR_ALL_DEFAULT:
+ type[0] = MLX4_FS_ALL_DEFAULT;
+ break;
+
+ case IB_FLOW_ATTR_MC_DEFAULT:
+ type[0] = MLX4_FS_MC_DEFAULT;
+ break;
+
+ case IB_FLOW_ATTR_SNIFFER:
+ type[0] = MLX4_FS_UC_SNIFFER;
+ type[1] = MLX4_FS_MC_SNIFFER;
+ break;
+
+ default:
+ err = -EINVAL;
+ goto err_free;
+ }
+
+ while (i < ARRAY_SIZE(type) && type[i]) {
+ err = __mlx4_ib_create_flow(qp, flow_attr, domain, type[i],
+ &mflow->reg_id[i].id);
+ if (err)
+ goto err_create_flow;
+ i++;
+ if (is_bonded) {
+ /* Application always sees one port so the mirror rule
+ * must be on port #2
+ */
+ flow_attr->port = 2;
+ err = __mlx4_ib_create_flow(qp, flow_attr,
+ domain, type[j],
+ &mflow->reg_id[j].mirror);
+ flow_attr->port = 1;
+ if (err)
+ goto err_create_flow;
+ j++;
+ }
+
+ }
+
+ if (i < ARRAY_SIZE(type) && flow_attr->type == IB_FLOW_ATTR_NORMAL) {
+ err = mlx4_ib_tunnel_steer_add(qp, flow_attr,
+ &mflow->reg_id[i].id);
+ if (err)
+ goto err_create_flow;
+ i++;
+ if (is_bonded) {
+ flow_attr->port = 2;
+ err = mlx4_ib_tunnel_steer_add(qp, flow_attr,
+ &mflow->reg_id[j].mirror);
+ flow_attr->port = 1;
+ if (err)
+ goto err_create_flow;
+ j++;
+ }
+ /* the mirror rule for bonded devices was created above */
+ }
+
+ return &mflow->ibflow;
+
+err_create_flow:
+ while (i) {
+ (void)__mlx4_ib_destroy_flow(to_mdev(qp->device)->dev,
+ mflow->reg_id[i].id);
+ i--;
+ }
+
+ while (j) {
+ (void)__mlx4_ib_destroy_flow(to_mdev(qp->device)->dev,
+ mflow->reg_id[j].mirror);
+ j--;
+ }
+err_free:
+ kfree(mflow);
+ return ERR_PTR(err);
+}
+
+static int mlx4_ib_destroy_flow(struct ib_flow *flow_id)
+{
+ int err, ret = 0;
+ int i = 0;
+ struct mlx4_ib_dev *mdev = to_mdev(flow_id->qp->device);
+ struct mlx4_ib_flow *mflow = to_mflow(flow_id);
+
+ while (i < ARRAY_SIZE(mflow->reg_id) && mflow->reg_id[i].id) {
+ err = __mlx4_ib_destroy_flow(mdev->dev, mflow->reg_id[i].id);
+ if (err)
+ ret = err;
+ if (mflow->reg_id[i].mirror) {
+ err = __mlx4_ib_destroy_flow(mdev->dev,
+ mflow->reg_id[i].mirror);
+ if (err)
+ ret = err;
+ }
+ i++;
+ }
+
+ kfree(mflow);
+ return ret;
+}
+
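+/*
+ * Attach a QP to a multicast group.  With device-managed steering the
+ * registration id is kept on the QP's steering_rules list for later
+ * detach; on bonded devices a mirror attachment is made on the other
+ * port.
+ */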
+static int mlx4_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ int err;
+ struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+ struct mlx4_dev *dev = mdev->dev;
+ struct mlx4_ib_qp *mqp = to_mqp(ibqp);
+ struct mlx4_ib_steering *ib_steering = NULL;
+ enum mlx4_protocol prot = MLX4_PROT_IB_IPV6;
+ struct mlx4_flow_reg_id reg_id;
+
+ if (mdev->dev->caps.steering_mode ==
+ MLX4_STEERING_MODE_DEVICE_MANAGED) {
+ ib_steering = kmalloc(sizeof(*ib_steering), GFP_KERNEL);
+ if (!ib_steering)
+ return -ENOMEM;
+ }
+
+ err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw, mqp->port,
+ !!(mqp->flags &
+ MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK),
+ prot, &reg_id.id);
+ if (err) {
+ pr_err("multicast attach op failed, err %d\n", err);
+ goto err_malloc;
+ }
+
+ reg_id.mirror = 0;
+ if (mlx4_is_bonded(dev)) {
+ err = mlx4_multicast_attach(mdev->dev, &mqp->mqp, gid->raw,
+ (mqp->port == 1) ? 2 : 1,
+ !!(mqp->flags &
+ MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK),
+ prot, &reg_id.mirror);
+ if (err)
+ goto err_add;
+ }
+
+ err = add_gid_entry(ibqp, gid);
+ if (err)
+ goto err_add;
+
+ if (ib_steering) {
+ memcpy(ib_steering->gid.raw, gid->raw, 16);
+ ib_steering->reg_id = reg_id;
+ mutex_lock(&mqp->mutex);
+ list_add(&ib_steering->list, &mqp->steering_rules);
+ mutex_unlock(&mqp->mutex);
+ }
+ return 0;
+
+err_add:
+ mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
+ prot, reg_id.id);
+ if (reg_id.mirror)
+ mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
+ prot, reg_id.mirror);
+err_malloc:
+ kfree(ib_steering);
+
+ return err;
+}
+
+static struct mlx4_ib_gid_entry *find_gid_entry(struct mlx4_ib_qp *qp, u8 *raw)
+{
+ struct mlx4_ib_gid_entry *ge;
+ struct mlx4_ib_gid_entry *tmp;
+ struct mlx4_ib_gid_entry *ret = NULL;
+
+ list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) {
+ if (!memcmp(raw, ge->gid.raw, 16)) {
+ ret = ge;
+ break;
+ }
+ }
+
+ return ret;
+}
+
+static int mlx4_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ int err;
+ struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+ struct mlx4_dev *dev = mdev->dev;
+ struct mlx4_ib_qp *mqp = to_mqp(ibqp);
+ struct net_device *ndev;
+ struct mlx4_ib_gid_entry *ge;
+ struct mlx4_flow_reg_id reg_id = {0, 0};
+ enum mlx4_protocol prot = MLX4_PROT_IB_IPV6;
+
+ if (mdev->dev->caps.steering_mode ==
+ MLX4_STEERING_MODE_DEVICE_MANAGED) {
+ struct mlx4_ib_steering *ib_steering;
+
+ mutex_lock(&mqp->mutex);
+ list_for_each_entry(ib_steering, &mqp->steering_rules, list) {
+ if (!memcmp(ib_steering->gid.raw, gid->raw, 16)) {
+ list_del(&ib_steering->list);
+ break;
+ }
+ }
+ mutex_unlock(&mqp->mutex);
+ if (&ib_steering->list == &mqp->steering_rules) {
+ pr_err("Couldn't find reg_id for mgid. Steering rule is left attached\n");
+ return -EINVAL;
+ }
+ reg_id = ib_steering->reg_id;
+ kfree(ib_steering);
+ }
+
+ err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
+ prot, reg_id.id);
+ if (err)
+ return err;
+
+ if (mlx4_is_bonded(dev)) {
+ err = mlx4_multicast_detach(mdev->dev, &mqp->mqp, gid->raw,
+ prot, reg_id.mirror);
+ if (err)
+ return err;
+ }
+
+ mutex_lock(&mqp->mutex);
+ ge = find_gid_entry(mqp, gid->raw);
+ if (ge) {
+ spin_lock_bh(&mdev->iboe.lock);
+ ndev = ge->added ? mdev->iboe.netdevs[ge->port - 1] : NULL;
+ if (ndev)
+ dev_hold(ndev);
+ spin_unlock_bh(&mdev->iboe.lock);
+ if (ndev)
+ dev_put(ndev);
+ list_del(&ge->list);
+ kfree(ge);
+ } else
+ pr_warn("could not find mgid entry\n");
+
+ mutex_unlock(&mqp->mutex);
+
+ return 0;
+}
+
+static int init_node_data(struct mlx4_ib_dev *dev)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int mad_ifc_flags = MLX4_MAD_IFC_IGNORE_KEYS;
+ int err = -ENOMEM;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_NODE_DESC;
+ if (mlx4_is_master(dev->dev))
+ mad_ifc_flags |= MLX4_MAD_IFC_NET_VIEW;
+
+ err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ memcpy(dev->ib_dev.node_desc, out_mad->data, 64);
+
+ in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
+
+ err = mlx4_MAD_IFC(dev, mad_ifc_flags, 1, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ dev->dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32));
+ memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8);
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static ssize_t show_hca(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct mlx4_ib_dev *dev =
+ container_of(device, struct mlx4_ib_dev, ib_dev.dev);
+ return sprintf(buf, "MT%d\n", dev->dev->persist->pdev->device);
+}
+
+static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct mlx4_ib_dev *dev =
+ container_of(device, struct mlx4_ib_dev, ib_dev.dev);
+ return sprintf(buf, "%d.%d.%d\n", (int) (dev->dev->caps.fw_ver >> 32),
+ (int) (dev->dev->caps.fw_ver >> 16) & 0xffff,
+ (int) dev->dev->caps.fw_ver & 0xffff);
+}
+
+static ssize_t show_rev(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct mlx4_ib_dev *dev =
+ container_of(device, struct mlx4_ib_dev, ib_dev.dev);
+ return sprintf(buf, "%x\n", dev->dev->rev_id);
+}
+
+static ssize_t show_board(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct mlx4_ib_dev *dev =
+ container_of(device, struct mlx4_ib_dev, ib_dev.dev);
+ return sprintf(buf, "%.*s\n", MLX4_BOARD_ID_LEN,
+ dev->dev->board_id);
+}
+
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
+
+static struct device_attribute *mlx4_class_attributes[] = {
+ &dev_attr_hw_rev,
+ &dev_attr_fw_ver,
+ &dev_attr_hca_type,
+ &dev_attr_board_id
+};
+
+static void mlx4_addrconf_ifid_eui48(u8 *eui, u16 vlan_id,
+ struct net_device *dev)
+{
+ memcpy(eui, dev->dev_addr, 3);
+ memcpy(eui + 5, dev->dev_addr + 3, 3);
+ if (vlan_id < 0x1000) {
+ eui[3] = vlan_id >> 8;
+ eui[4] = vlan_id & 0xff;
+ } else {
+ eui[3] = 0xff;
+ eui[4] = 0xfe;
+ }
+ eui[0] ^= 2;
+}
+
+static void update_gids_task(struct work_struct *work)
+{
+ struct update_gid_work *gw = container_of(work, struct update_gid_work, work);
+ struct mlx4_cmd_mailbox *mailbox;
+ union ib_gid *gids;
+ int err;
+ struct mlx4_dev *dev = gw->dev->dev;
+ int is_bonded = mlx4_is_bonded(dev);
+
+ if (!gw->dev->ib_active)
+ return;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox)) {
+ pr_warn("update gid table failed %ld\n", PTR_ERR(mailbox));
+ return;
+ }
+
+ gids = mailbox->buf;
+ memcpy(gids, gw->gids, sizeof gw->gids);
+
+ err = mlx4_cmd(dev, mailbox->dma, MLX4_SET_PORT_GID_TABLE << 8 | gw->port,
+ MLX4_SET_PORT_ETH_OPCODE, MLX4_CMD_SET_PORT,
+ MLX4_CMD_TIME_CLASS_B, MLX4_CMD_WRAPPED);
+ if (err)
+ pr_warn("set port command failed\n");
+ else if ((gw->port == 1) || !is_bonded)
+ mlx4_ib_dispatch_event(gw->dev,
+ is_bonded ? 1 : gw->port,
+ IB_EVENT_GID_CHANGE);
+
+ mlx4_free_cmd_mailbox(dev, mailbox);
+ kfree(gw);
+}
+
+static void reset_gids_task(struct work_struct *work)
+{
+ struct update_gid_work *gw =
+ container_of(work, struct update_gid_work, work);
+ struct mlx4_cmd_mailbox *mailbox;
+ union ib_gid *gids;
+ int err;
+ struct mlx4_dev *dev = gw->dev->dev;
+
+ if (!gw->dev->ib_active)
+ return;
+
+ mailbox = mlx4_alloc_cmd_mailbox(dev);
+ if (IS_ERR(mailbox)) {
+ pr_warn("reset gid table failed\n");
+ goto free;
+ }
+
+ gids = mailbox->buf;
+ memcpy(gids, gw->gids, sizeof(gw->gids));
+
+ if (mlx4_ib_port_link_layer(&gw->dev->ib_dev, gw->port) ==
+ IB_LINK_LAYER_ETHERNET) {
+ err = mlx4_cmd(dev, mailbox->dma,
+ MLX4_SET_PORT_GID_TABLE << 8 | gw->port,
+ MLX4_SET_PORT_ETH_OPCODE, MLX4_CMD_SET_PORT,
+ MLX4_CMD_TIME_CLASS_B,
+ MLX4_CMD_WRAPPED);
+ if (err)
+ pr_warn("set port %d command failed\n", gw->port);
+ }
+
+ mlx4_free_cmd_mailbox(dev, mailbox);
+free:
+ kfree(gw);
+}
+
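+/*
+ * Add or clear a GID in the per-port software table (index 0 is reserved
+ * for the default GID).  When the table changes, work is queued to push
+ * it to FW with SET_PORT and dispatch a GID change event.
+ */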
+static int update_gid_table(struct mlx4_ib_dev *dev, int port,
+ union ib_gid *gid, int clear,
+ int default_gid)
+{
+ struct update_gid_work *work;
+ int i;
+ int need_update = 0;
+ int free = -1;
+ int found = -1;
+ int max_gids;
+
+ if (default_gid) {
+ free = 0;
+ } else {
+ max_gids = dev->dev->caps.gid_table_len[port];
+ for (i = 1; i < max_gids; ++i) {
+ if (!memcmp(&dev->iboe.gid_table[port - 1][i], gid,
+ sizeof(*gid)))
+ found = i;
+
+ if (clear) {
+ if (found >= 0) {
+ need_update = 1;
+ dev->iboe.gid_table[port - 1][found] =
+ zgid;
+ break;
+ }
+ } else {
+ if (found >= 0)
+ break;
+
+ if (free < 0 &&
+ !memcmp(&dev->iboe.gid_table[port - 1][i],
+ &zgid, sizeof(*gid)))
+ free = i;
+ }
+ }
+ }
+
+ if (found == -1 && !clear && free >= 0) {
+ dev->iboe.gid_table[port - 1][free] = *gid;
+ need_update = 1;
+ }
+
+ if (!need_update)
+ return 0;
+
+ work = kzalloc(sizeof(*work), GFP_ATOMIC);
+ if (!work)
+ return -ENOMEM;
+
+ memcpy(work->gids, dev->iboe.gid_table[port - 1], sizeof(work->gids));
+ INIT_WORK(&work->work, update_gids_task);
+ work->port = port;
+ work->dev = dev;
+ queue_work(wq, &work->work);
+
+ return 0;
+}
+
+static void mlx4_make_default_gid(struct net_device *dev, union ib_gid *gid)
+{
+ gid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
+ mlx4_addrconf_ifid_eui48(&gid->raw[8], 0xffff, dev);
+}
+
+static int reset_gid_table(struct mlx4_ib_dev *dev, u8 port)
+{
+ struct update_gid_work *work;
+
+ work = kzalloc(sizeof(*work), GFP_ATOMIC);
+ if (!work)
+ return -ENOMEM;
+
+ memset(dev->iboe.gid_table[port - 1], 0, sizeof(work->gids));
+ memset(work->gids, 0, sizeof(work->gids));
+ INIT_WORK(&work->work, reset_gids_task);
+ work->dev = dev;
+ work->port = port;
+ queue_work(wq, &work->work);
+ return 0;
+}
+
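+/*
+ * Common handler for inet/inet6 address notifications: add the matching
+ * GID to the port's table on NETDEV_UP and clear it on NETDEV_DOWN,
+ * skipping the default (link-local) GID.
+ */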
+static int mlx4_ib_addr_event(int event, struct net_device *event_netdev,
+ struct mlx4_ib_dev *ibdev, union ib_gid *gid)
+{
+ struct mlx4_ib_iboe *iboe;
+ int port = 0;
+ struct net_device *real_dev = rdma_vlan_dev_real_dev(event_netdev) ?
+ rdma_vlan_dev_real_dev(event_netdev) :
+ event_netdev;
+ union ib_gid default_gid;
+
+ mlx4_make_default_gid(real_dev, &default_gid);
+
+ if (!memcmp(gid, &default_gid, sizeof(*gid)))
+ return 0;
+
+ if (event != NETDEV_DOWN && event != NETDEV_UP)
+ return 0;
+
+ if ((real_dev != event_netdev) &&
+ (event == NETDEV_DOWN) &&
+ rdma_link_local_addr((struct in6_addr *)gid))
+ return 0;
+
+ iboe = &ibdev->iboe;
+ spin_lock_bh(&iboe->lock);
+
+ for (port = 1; port <= ibdev->dev->caps.num_ports; ++port)
+ if ((netif_is_bond_master(real_dev) &&
+ (real_dev == iboe->masters[port - 1])) ||
+ (!netif_is_bond_master(real_dev) &&
+ (real_dev == iboe->netdevs[port - 1])))
+ update_gid_table(ibdev, port, gid,
+ event == NETDEV_DOWN, 0);
+
+ spin_unlock_bh(&iboe->lock);
+ return 0;
+
+}
+
+static u8 mlx4_ib_get_dev_port(struct net_device *dev,
+ struct mlx4_ib_dev *ibdev)
+{
+ u8 port = 0;
+ struct mlx4_ib_iboe *iboe;
+ struct net_device *real_dev = rdma_vlan_dev_real_dev(dev) ?
+ rdma_vlan_dev_real_dev(dev) : dev;
+
+ iboe = &ibdev->iboe;
+
+ for (port = 1; port <= ibdev->dev->caps.num_ports; ++port)
+ if ((netif_is_bond_master(real_dev) &&
+ (real_dev == iboe->masters[port - 1])) ||
+ (!netif_is_bond_master(real_dev) &&
+ (real_dev == iboe->netdevs[port - 1])))
+ break;
+
+ if ((port == 0) || (port > ibdev->dev->caps.num_ports))
+ return 0;
+ else
+ return port;
+}
+
+static int mlx4_ib_inet_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct mlx4_ib_dev *ibdev;
+ struct in_ifaddr *ifa = ptr;
+ union ib_gid gid;
+ struct net_device *event_netdev = ifa->ifa_dev->dev;
+
+ ipv6_addr_set_v4mapped(ifa->ifa_address, (struct in6_addr *)&gid);
+
+ ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet);
+
+ mlx4_ib_addr_event(event, event_netdev, ibdev, &gid);
+ return NOTIFY_DONE;
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int mlx4_ib_inet6_event(struct notifier_block *this, unsigned long event,
+ void *ptr)
+{
+ struct mlx4_ib_dev *ibdev;
+ struct inet6_ifaddr *ifa = ptr;
+ union ib_gid *gid = (union ib_gid *)&ifa->addr;
+ struct net_device *event_netdev = ifa->idev->dev;
+
+ ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb_inet6);
+
+ mlx4_ib_addr_event(event, event_netdev, ibdev, gid);
+ return NOTIFY_DONE;
+}
+#endif
+
+#define MLX4_IB_INVALID_MAC ((u64)-1)
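+/*
+ * Track a MAC address change on a port's net device.  Under SR-IOV,
+ * register the new source MAC for the proxy QP1, update the QP to use
+ * the new index, and release the previously registered MAC.
+ */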
+static void mlx4_ib_update_qps(struct mlx4_ib_dev *ibdev,
+ struct net_device *dev,
+ int port)
+{
+ u64 new_smac = 0;
+ u64 release_mac = MLX4_IB_INVALID_MAC;
+ struct mlx4_ib_qp *qp;
+
+ read_lock(&dev_base_lock);
+ new_smac = mlx4_mac_to_u64(dev->dev_addr);
+ read_unlock(&dev_base_lock);
+
+ atomic64_set(&ibdev->iboe.mac[port - 1], new_smac);
+
+ /* no need to update QP1 or register the MAC in non-SRIOV mode */
+ if (!mlx4_is_mfunc(ibdev->dev))
+ return;
+
+ mutex_lock(&ibdev->qp1_proxy_lock[port - 1]);
+ qp = ibdev->qp1_proxy[port - 1];
+ if (qp) {
+ int new_smac_index;
+ u64 old_smac;
+ struct mlx4_update_qp_params update_params;
+
+ mutex_lock(&qp->mutex);
+ old_smac = qp->pri.smac;
+ if (new_smac == old_smac)
+ goto unlock;
+
+ new_smac_index = mlx4_register_mac(ibdev->dev, port, new_smac);
+
+ if (new_smac_index < 0)
+ goto unlock;
+
+ update_params.smac_index = new_smac_index;
+ if (mlx4_update_qp(ibdev->dev, qp->mqp.qpn, MLX4_UPDATE_QP_SMAC,
+ &update_params)) {
+ release_mac = new_smac;
+ goto unlock;
+ }
+ /* if old port was zero, no mac was yet registered for this QP */
+ if (qp->pri.smac_port)
+ release_mac = old_smac;
+ qp->pri.smac = new_smac;
+ qp->pri.smac_port = port;
+ qp->pri.smac_index = new_smac_index;
+ }
+
+unlock:
+ if (release_mac != MLX4_IB_INVALID_MAC)
+ mlx4_unregister_mac(ibdev->dev, port, release_mac);
+ if (qp)
+ mutex_unlock(&qp->mutex);
+ mutex_unlock(&ibdev->qp1_proxy_lock[port - 1]);
+}
+
+static void mlx4_ib_get_dev_addr(struct net_device *dev,
+ struct mlx4_ib_dev *ibdev, u8 port)
+{
+ struct in_device *in_dev;
+#if IS_ENABLED(CONFIG_IPV6)
+ struct inet6_dev *in6_dev;
+ union ib_gid *pgid;
+ struct inet6_ifaddr *ifp;
+ union ib_gid default_gid;
+#endif
+ union ib_gid gid;
+
+ if ((port == 0) || (port > ibdev->dev->caps.num_ports))
+ return;
+
+ /* IPv4 gids */
+ in_dev = in_dev_get(dev);
+ if (in_dev) {
+ for_ifa(in_dev) {
+ ipv6_addr_set_v4mapped(ifa->ifa_address,
+ (struct in6_addr *)&gid);
+ update_gid_table(ibdev, port, &gid, 0, 0);
+ }
+ endfor_ifa(in_dev);
+ in_dev_put(in_dev);
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ mlx4_make_default_gid(dev, &default_gid);
+ /* IPv6 gids */
+ in6_dev = in6_dev_get(dev);
+ if (in6_dev) {
+ read_lock_bh(&in6_dev->lock);
+ list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
+ pgid = (union ib_gid *)&ifp->addr;
+ if (!memcmp(pgid, &default_gid, sizeof(*pgid)))
+ continue;
+ update_gid_table(ibdev, port, pgid, 0, 0);
+ }
+ read_unlock_bh(&in6_dev->lock);
+ in6_dev_put(in6_dev);
+ }
+#endif
+}
+
+static void mlx4_ib_set_default_gid(struct mlx4_ib_dev *ibdev,
+ struct net_device *dev, u8 port)
+{
+ union ib_gid gid;
+ mlx4_make_default_gid(dev, &gid);
+ update_gid_table(ibdev, port, &gid, 0, 1);
+}
+
+static int mlx4_ib_init_gid_table(struct mlx4_ib_dev *ibdev)
+{
+ struct net_device *dev;
+ struct mlx4_ib_iboe *iboe = &ibdev->iboe;
+ int i;
+ int err = 0;
+
+ for (i = 1; i <= ibdev->num_ports; ++i) {
+ if (rdma_port_get_link_layer(&ibdev->ib_dev, i) ==
+ IB_LINK_LAYER_ETHERNET) {
+ err = reset_gid_table(ibdev, i);
+ if (err)
+ goto out;
+ }
+ }
+
+ read_lock(&dev_base_lock);
+ spin_lock_bh(&iboe->lock);
+
+ for_each_netdev(&init_net, dev) {
+ u8 port = mlx4_ib_get_dev_port(dev, ibdev);
+ /* port will be non-zero only for ETH ports */
+ if (port) {
+ mlx4_ib_set_default_gid(ibdev, dev, port);
+ mlx4_ib_get_dev_addr(dev, ibdev, port);
+ }
+ }
+
+ spin_unlock_bh(&iboe->lock);
+ read_unlock(&dev_base_lock);
+out:
+ return err;
+}
+
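+/*
+ * On a netdev event, re-scan the net devices (and bond masters) backing
+ * each RoCE port, rebuild the port GID tables as needed, and note which
+ * port requires a QP1 source MAC update.
+ */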
+static void mlx4_ib_scan_netdevs(struct mlx4_ib_dev *ibdev,
+ struct net_device *dev,
+ unsigned long event)
+{
+ struct mlx4_ib_iboe *iboe;
+ int update_qps_port = -1;
+ int port;
+
+ iboe = &ibdev->iboe;
+
+ spin_lock_bh(&iboe->lock);
+ mlx4_foreach_ib_transport_port(port, ibdev->dev) {
+ enum ib_port_state port_state = IB_PORT_NOP;
+ struct net_device *old_master = iboe->masters[port - 1];
+ struct net_device *curr_netdev;
+ struct net_device *curr_master;
+
+ iboe->netdevs[port - 1] =
+ mlx4_get_protocol_dev(ibdev->dev, MLX4_PROT_ETH, port);
+ if (iboe->netdevs[port - 1])
+ mlx4_ib_set_default_gid(ibdev,
+ iboe->netdevs[port - 1], port);
+ curr_netdev = iboe->netdevs[port - 1];
+
+ if (iboe->netdevs[port - 1] &&
+ netif_is_bond_slave(iboe->netdevs[port - 1])) {
+ iboe->masters[port - 1] = netdev_master_upper_dev_get(
+ iboe->netdevs[port - 1]);
+ } else {
+ iboe->masters[port - 1] = NULL;
+ }
+ curr_master = iboe->masters[port - 1];
+
+ if (dev == iboe->netdevs[port - 1] &&
+ (event == NETDEV_CHANGEADDR || event == NETDEV_REGISTER ||
+ event == NETDEV_UP || event == NETDEV_CHANGE))
+ update_qps_port = port;
+
+ if (curr_netdev) {
+ port_state = (netif_running(curr_netdev) && netif_carrier_ok(curr_netdev)) ?
+ IB_PORT_ACTIVE : IB_PORT_DOWN;
+ mlx4_ib_set_default_gid(ibdev, curr_netdev, port);
+ if (curr_master) {
+ /* if using bonding/team and a slave port is down, we
+ * don't want the bond IP based gids in the table since
+ * flows that select port by gid may get the down port.
+ */
+ if (port_state == IB_PORT_DOWN &&
+ !mlx4_is_bonded(ibdev->dev)) {
+ reset_gid_table(ibdev, port);
+ mlx4_ib_set_default_gid(ibdev,
+ curr_netdev,
+ port);
+ } else {
+ /* gids from the upper dev (bond/team)
+ * should appear in port's gid table
+ */
+ mlx4_ib_get_dev_addr(curr_master,
+ ibdev, port);
+ }
+ }
+ /* when bonding is used, the netdev may be added to
+ * masters[] only after an IP address has been assigned
+ * to the bonding interface.
+ */
+ if (curr_master && (old_master != curr_master)) {
+ reset_gid_table(ibdev, port);
+ mlx4_ib_set_default_gid(ibdev,
+ curr_netdev, port);
+ mlx4_ib_get_dev_addr(curr_master, ibdev, port);
+ }
+
+ if (!curr_master && (old_master != curr_master)) {
+ reset_gid_table(ibdev, port);
+ mlx4_ib_set_default_gid(ibdev,
+ curr_netdev, port);
+ mlx4_ib_get_dev_addr(curr_netdev, ibdev, port);
+ }
+ } else {
+ reset_gid_table(ibdev, port);
+ }
+ }
+
+ spin_unlock_bh(&iboe->lock);
+
+ if (update_qps_port > 0)
+ mlx4_ib_update_qps(ibdev, dev, update_qps_port);
+}
+
+static int mlx4_ib_netdev_event(struct notifier_block *this,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct mlx4_ib_dev *ibdev;
+
+ if (!net_eq(dev_net(dev), &init_net))
+ return NOTIFY_DONE;
+
+ ibdev = container_of(this, struct mlx4_ib_dev, iboe.nb);
+ mlx4_ib_scan_netdevs(ibdev, dev, event);
+
+ return NOTIFY_DONE;
+}
+
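+/*
+ * For the PF in multi-function mode, build the virtual-to-physical pkey
+ * index mapping of every function and initialize the physical pkey
+ * cache.
+ */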
+static void init_pkeys(struct mlx4_ib_dev *ibdev)
+{
+ int port;
+ int slave;
+ int i;
+
+ if (mlx4_is_master(ibdev->dev)) {
+ for (slave = 0; slave <= ibdev->dev->persist->num_vfs;
+ ++slave) {
+ for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) {
+ for (i = 0;
+ i < ibdev->dev->phys_caps.pkey_phys_table_len[port];
+ ++i) {
+ ibdev->pkeys.virt2phys_pkey[slave][port - 1][i] =
+ /* master has the identity virt2phys pkey mapping */
+ (slave == mlx4_master_func_num(ibdev->dev) || !i) ? i :
+ ibdev->dev->phys_caps.pkey_phys_table_len[port] - 1;
+ mlx4_sync_pkey_table(ibdev->dev, slave, port, i,
+ ibdev->pkeys.virt2phys_pkey[slave][port - 1][i]);
+ }
+ }
+ }
+ /* initialize pkey cache */
+ for (port = 1; port <= ibdev->dev->caps.num_ports; ++port) {
+ for (i = 0;
+ i < ibdev->dev->phys_caps.pkey_phys_table_len[port];
+ ++i)
+ ibdev->pkeys.phys_pkey_cache[port-1][i] =
+ (i) ? 0 : 0xFFFF;
+ }
+ }
+}
+
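+/*
+ * Try to assign dedicated completion EQs to each IB port from the
+ * device's EQ pool; entries that cannot be allocated fall back to the
+ * legacy shared vectors.
+ */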
+static void mlx4_ib_alloc_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev)
+{
+ char name[80];
+ int eq_per_port = 0;
+ int added_eqs = 0;
+ int total_eqs = 0;
+ int i, j, eq;
+
+ /* Legacy mode or comp_pool is not large enough */
+ if (dev->caps.comp_pool == 0 ||
+ dev->caps.num_ports > dev->caps.comp_pool)
+ return;
+
+ eq_per_port = dev->caps.comp_pool / dev->caps.num_ports;
+
+ /* Init eq table */
+ added_eqs = 0;
+ mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
+ added_eqs += eq_per_port;
+
+ total_eqs = dev->caps.num_comp_vectors + added_eqs;
+
+ ibdev->eq_table = kzalloc(total_eqs * sizeof(int), GFP_KERNEL);
+ if (!ibdev->eq_table)
+ return;
+
+ ibdev->eq_added = added_eqs;
+
+ eq = 0;
+ mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB) {
+ for (j = 0; j < eq_per_port; j++) {
+ snprintf(name, sizeof(name), "mlx4-ib-%d-%d@%s",
+ i, j, dev->persist->pdev->bus->name);
+ /* Set IRQ for specific name (per ring) */
+ if (mlx4_assign_eq(dev, name, NULL,
+ &ibdev->eq_table[eq])) {
+ /* Use legacy (same as mlx4_en driver) */
+ pr_warn("Can't allocate EQ %d; reverting to legacy\n", eq);
+ ibdev->eq_table[eq] =
+ (eq % dev->caps.num_comp_vectors);
+ }
+ eq++;
+ }
+ }
+
+ /* Fill the rest of the vector with the legacy EQs */
+ for (i = 0, eq = added_eqs; i < dev->caps.num_comp_vectors; i++)
+ ibdev->eq_table[eq++] = i;
+
+ /* Advertise the new number of EQs to clients */
+ ibdev->ib_dev.num_comp_vectors = total_eqs;
+}
+
+static void mlx4_ib_free_eqs(struct mlx4_dev *dev, struct mlx4_ib_dev *ibdev)
+{
+ int i;
+
+ /* no additional eqs were added */
+ if (!ibdev->eq_table)
+ return;
+
+ /* Reset the advertised EQ number */
+ ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors;
+
+ /* Free only the added eqs */
+ for (i = 0; i < ibdev->eq_added; i++) {
+ /* Don't free legacy eqs if used */
+ if (ibdev->eq_table[i] <= dev->caps.num_comp_vectors)
+ continue;
+ mlx4_release_eq(dev, ibdev->eq_table[i]);
+ }
+
+ kfree(ibdev->eq_table);
+}
+
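+/*
+ * Device-add callback for the mlx4 core: allocate the IB device, wire up
+ * the verbs entry points, set up counters, steering resources and RoCE
+ * GID tables, register with the IB core and initialize SR-IOV support
+ * and netdev/inet notifiers.
+ */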
+static void *mlx4_ib_add(struct mlx4_dev *dev)
+{
+ struct mlx4_ib_dev *ibdev;
+ int num_ports = 0;
+ int i, j;
+ int err;
+ struct mlx4_ib_iboe *iboe;
+ int ib_num_ports = 0;
+ int num_req_counters;
+
+ pr_info_once("%s", mlx4_ib_version);
+
+ num_ports = 0;
+ mlx4_foreach_ib_transport_port(i, dev)
+ num_ports++;
+
+ /* No point in registering a device with no ports... */
+ if (num_ports == 0)
+ return NULL;
+
+ ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev);
+ if (!ibdev) {
+ dev_err(&dev->persist->pdev->dev,
+ "Device struct alloc failed\n");
+ return NULL;
+ }
+
+ iboe = &ibdev->iboe;
+
+ if (mlx4_pd_alloc(dev, &ibdev->priv_pdn))
+ goto err_dealloc;
+
+ if (mlx4_uar_alloc(dev, &ibdev->priv_uar))
+ goto err_pd;
+
+ ibdev->uar_map = ioremap((phys_addr_t) ibdev->priv_uar.pfn << PAGE_SHIFT,
+ PAGE_SIZE);
+ if (!ibdev->uar_map)
+ goto err_uar;
+ MLX4_INIT_DOORBELL_LOCK(&ibdev->uar_lock);
+
+ ibdev->dev = dev;
+ ibdev->bond_next_port = 0;
+
+ strlcpy(ibdev->ib_dev.name, "mlx4_%d", IB_DEVICE_NAME_MAX);
+ ibdev->ib_dev.owner = THIS_MODULE;
+ ibdev->ib_dev.node_type = RDMA_NODE_IB_CA;
+ ibdev->ib_dev.local_dma_lkey = dev->caps.reserved_lkey;
+ ibdev->num_ports = num_ports;
+ ibdev->ib_dev.phys_port_cnt = mlx4_is_bonded(dev) ?
+ 1 : ibdev->num_ports;
+ ibdev->ib_dev.num_comp_vectors = dev->caps.num_comp_vectors;
+ ibdev->ib_dev.dma_device = &dev->persist->pdev->dev;
+
+ if (dev->caps.userspace_caps)
+ ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_ABI_VERSION;
+ else
+ ibdev->ib_dev.uverbs_abi_ver = MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION;
+
+ ibdev->ib_dev.uverbs_cmd_mask =
+ (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+ (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_REG_MR) |
+ (1ull << IB_USER_VERBS_CMD_REREG_MR) |
+ (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_QP) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+ (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) |
+ (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) |
+ (1ull << IB_USER_VERBS_CMD_OPEN_QP);
+
+ ibdev->ib_dev.query_device = mlx4_ib_query_device;
+ ibdev->ib_dev.query_port = mlx4_ib_query_port;
+ ibdev->ib_dev.get_link_layer = mlx4_ib_port_link_layer;
+ ibdev->ib_dev.query_gid = mlx4_ib_query_gid;
+ ibdev->ib_dev.query_pkey = mlx4_ib_query_pkey;
+ ibdev->ib_dev.modify_device = mlx4_ib_modify_device;
+ ibdev->ib_dev.modify_port = mlx4_ib_modify_port;
+ ibdev->ib_dev.alloc_ucontext = mlx4_ib_alloc_ucontext;
+ ibdev->ib_dev.dealloc_ucontext = mlx4_ib_dealloc_ucontext;
+ ibdev->ib_dev.mmap = mlx4_ib_mmap;
+ ibdev->ib_dev.alloc_pd = mlx4_ib_alloc_pd;
+ ibdev->ib_dev.dealloc_pd = mlx4_ib_dealloc_pd;
+ ibdev->ib_dev.create_ah = mlx4_ib_create_ah;
+ ibdev->ib_dev.query_ah = mlx4_ib_query_ah;
+ ibdev->ib_dev.destroy_ah = mlx4_ib_destroy_ah;
+ ibdev->ib_dev.create_srq = mlx4_ib_create_srq;
+ ibdev->ib_dev.modify_srq = mlx4_ib_modify_srq;
+ ibdev->ib_dev.query_srq = mlx4_ib_query_srq;
+ ibdev->ib_dev.destroy_srq = mlx4_ib_destroy_srq;
+ ibdev->ib_dev.post_srq_recv = mlx4_ib_post_srq_recv;
+ ibdev->ib_dev.create_qp = mlx4_ib_create_qp;
+ ibdev->ib_dev.modify_qp = mlx4_ib_modify_qp;
+ ibdev->ib_dev.query_qp = mlx4_ib_query_qp;
+ ibdev->ib_dev.destroy_qp = mlx4_ib_destroy_qp;
+ ibdev->ib_dev.post_send = mlx4_ib_post_send;
+ ibdev->ib_dev.post_recv = mlx4_ib_post_recv;
+ ibdev->ib_dev.create_cq = mlx4_ib_create_cq;
+ ibdev->ib_dev.modify_cq = mlx4_ib_modify_cq;
+ ibdev->ib_dev.resize_cq = mlx4_ib_resize_cq;
+ ibdev->ib_dev.destroy_cq = mlx4_ib_destroy_cq;
+ ibdev->ib_dev.poll_cq = mlx4_ib_poll_cq;
+ ibdev->ib_dev.req_notify_cq = mlx4_ib_arm_cq;
+ ibdev->ib_dev.get_dma_mr = mlx4_ib_get_dma_mr;
+ ibdev->ib_dev.reg_user_mr = mlx4_ib_reg_user_mr;
+ ibdev->ib_dev.rereg_user_mr = mlx4_ib_rereg_user_mr;
+ ibdev->ib_dev.dereg_mr = mlx4_ib_dereg_mr;
+ ibdev->ib_dev.alloc_fast_reg_mr = mlx4_ib_alloc_fast_reg_mr;
+ ibdev->ib_dev.alloc_fast_reg_page_list = mlx4_ib_alloc_fast_reg_page_list;
+ ibdev->ib_dev.free_fast_reg_page_list = mlx4_ib_free_fast_reg_page_list;
+ ibdev->ib_dev.attach_mcast = mlx4_ib_mcg_attach;
+ ibdev->ib_dev.detach_mcast = mlx4_ib_mcg_detach;
+ ibdev->ib_dev.process_mad = mlx4_ib_process_mad;
+
+ if (!mlx4_is_slave(ibdev->dev)) {
+ ibdev->ib_dev.alloc_fmr = mlx4_ib_fmr_alloc;
+ ibdev->ib_dev.map_phys_fmr = mlx4_ib_map_phys_fmr;
+ ibdev->ib_dev.unmap_fmr = mlx4_ib_unmap_fmr;
+ ibdev->ib_dev.dealloc_fmr = mlx4_ib_fmr_dealloc;
+ }
+
+ if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW ||
+ dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) {
+ ibdev->ib_dev.alloc_mw = mlx4_ib_alloc_mw;
+ ibdev->ib_dev.bind_mw = mlx4_ib_bind_mw;
+ ibdev->ib_dev.dealloc_mw = mlx4_ib_dealloc_mw;
+
+ ibdev->ib_dev.uverbs_cmd_mask |=
+ (1ull << IB_USER_VERBS_CMD_ALLOC_MW) |
+ (1ull << IB_USER_VERBS_CMD_DEALLOC_MW);
+ }
+
+ if (dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC) {
+ ibdev->ib_dev.alloc_xrcd = mlx4_ib_alloc_xrcd;
+ ibdev->ib_dev.dealloc_xrcd = mlx4_ib_dealloc_xrcd;
+ ibdev->ib_dev.uverbs_cmd_mask |=
+ (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) |
+ (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
+ }
+
+ if (check_flow_steering_support(dev)) {
+ ibdev->steering_support = MLX4_STEERING_MODE_DEVICE_MANAGED;
+ ibdev->ib_dev.create_flow = mlx4_ib_create_flow;
+ ibdev->ib_dev.destroy_flow = mlx4_ib_destroy_flow;
+
+ ibdev->ib_dev.uverbs_ex_cmd_mask |=
+ (1ull << IB_USER_VERBS_EX_CMD_CREATE_FLOW) |
+ (1ull << IB_USER_VERBS_EX_CMD_DESTROY_FLOW);
+ }
+
+ mlx4_ib_alloc_eqs(dev, ibdev);
+
+ spin_lock_init(&iboe->lock);
+
+ if (init_node_data(ibdev))
+ goto err_map;
+
+ num_req_counters = mlx4_is_bonded(dev) ? 1 : ibdev->num_ports;
+ for (i = 0; i < num_req_counters; ++i) {
+ mutex_init(&ibdev->qp1_proxy_lock[i]);
+ if (mlx4_ib_port_link_layer(&ibdev->ib_dev, i + 1) ==
+ IB_LINK_LAYER_ETHERNET) {
+ err = mlx4_counter_alloc(ibdev->dev, &ibdev->counters[i]);
+ if (err)
+ ibdev->counters[i] = -1;
+ } else {
+ ibdev->counters[i] = -1;
+ }
+ }
+ if (mlx4_is_bonded(dev))
+ for (i = 1; i < ibdev->num_ports ; ++i)
+ ibdev->counters[i] = ibdev->counters[0];
+
+ mlx4_foreach_port(i, dev, MLX4_PORT_TYPE_IB)
+ ib_num_ports++;
+
+ spin_lock_init(&ibdev->sm_lock);
+ mutex_init(&ibdev->cap_mask_mutex);
+ INIT_LIST_HEAD(&ibdev->qp_list);
+ spin_lock_init(&ibdev->reset_flow_resource_lock);
+
+ if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED &&
+ ib_num_ports) {
+ ibdev->steer_qpn_count = MLX4_IB_UC_MAX_NUM_QPS;
+ err = mlx4_qp_reserve_range(dev, ibdev->steer_qpn_count,
+ MLX4_IB_UC_STEER_QPN_ALIGN,
+ &ibdev->steer_qpn_base, 0);
+ if (err)
+ goto err_counter;
+
+ ibdev->ib_uc_qpns_bitmap =
+ kmalloc(BITS_TO_LONGS(ibdev->steer_qpn_count) *
+ sizeof(long),
+ GFP_KERNEL);
+ if (!ibdev->ib_uc_qpns_bitmap) {
+ dev_err(&dev->persist->pdev->dev,
+ "bit map alloc failed\n");
+ goto err_steer_qp_release;
+ }
+
+ bitmap_zero(ibdev->ib_uc_qpns_bitmap, ibdev->steer_qpn_count);
+
+ err = mlx4_FLOW_STEERING_IB_UC_QP_RANGE(
+ dev, ibdev->steer_qpn_base,
+ ibdev->steer_qpn_base +
+ ibdev->steer_qpn_count - 1);
+ if (err)
+ goto err_steer_free_bitmap;
+ }
+
+ for (j = 1; j <= ibdev->dev->caps.num_ports; j++)
+ atomic64_set(&iboe->mac[j - 1], ibdev->dev->caps.def_mac[j]);
+
+ if (ib_register_device(&ibdev->ib_dev, NULL))
+ goto err_steer_free_bitmap;
+
+ if (mlx4_ib_mad_init(ibdev))
+ goto err_reg;
+
+ if (mlx4_ib_init_sriov(ibdev))
+ goto err_mad;
+
+ if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) {
+ if (!iboe->nb.notifier_call) {
+ iboe->nb.notifier_call = mlx4_ib_netdev_event;
+ err = register_netdevice_notifier(&iboe->nb);
+ if (err) {
+ iboe->nb.notifier_call = NULL;
+ goto err_notif;
+ }
+ }
+ if (!iboe->nb_inet.notifier_call) {
+ iboe->nb_inet.notifier_call = mlx4_ib_inet_event;
+ err = register_inetaddr_notifier(&iboe->nb_inet);
+ if (err) {
+ iboe->nb_inet.notifier_call = NULL;
+ goto err_notif;
+ }
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ if (!iboe->nb_inet6.notifier_call) {
+ iboe->nb_inet6.notifier_call = mlx4_ib_inet6_event;
+ err = register_inet6addr_notifier(&iboe->nb_inet6);
+ if (err) {
+ iboe->nb_inet6.notifier_call = NULL;
+ goto err_notif;
+ }
+ }
+#endif
+ if (mlx4_ib_init_gid_table(ibdev))
+ goto err_notif;
+ }
+
+ for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) {
+ if (device_create_file(&ibdev->ib_dev.dev,
+ mlx4_class_attributes[j]))
+ goto err_notif;
+ }
+
+ ibdev->ib_active = true;
+
+ if (mlx4_is_mfunc(ibdev->dev))
+ init_pkeys(ibdev);
+
+ /* create paravirt contexts for any VFs which are active */
+ if (mlx4_is_master(ibdev->dev)) {
+ for (j = 0; j < MLX4_MFUNC_MAX; j++) {
+ if (j == mlx4_master_func_num(ibdev->dev))
+ continue;
+ if (mlx4_is_slave_active(ibdev->dev, j))
+ do_slave_init(ibdev, j, 1);
+ }
+ }
+ return ibdev;
+
+err_notif:
+ if (ibdev->iboe.nb.notifier_call) {
+ if (unregister_netdevice_notifier(&ibdev->iboe.nb))
+ pr_warn("failure unregistering notifier\n");
+ ibdev->iboe.nb.notifier_call = NULL;
+ }
+ if (ibdev->iboe.nb_inet.notifier_call) {
+ if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet))
+ pr_warn("failure unregistering notifier\n");
+ ibdev->iboe.nb_inet.notifier_call = NULL;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ if (ibdev->iboe.nb_inet6.notifier_call) {
+ if (unregister_inet6addr_notifier(&ibdev->iboe.nb_inet6))
+ pr_warn("failure unregistering notifier\n");
+ ibdev->iboe.nb_inet6.notifier_call = NULL;
+ }
+#endif
+ flush_workqueue(wq);
+
+ mlx4_ib_close_sriov(ibdev);
+
+err_mad:
+ mlx4_ib_mad_cleanup(ibdev);
+
+err_reg:
+ ib_unregister_device(&ibdev->ib_dev);
+
+err_steer_free_bitmap:
+ kfree(ibdev->ib_uc_qpns_bitmap);
+
+err_steer_qp_release:
+ if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED)
+ mlx4_qp_release_range(dev, ibdev->steer_qpn_base,
+ ibdev->steer_qpn_count);
+err_counter:
+ for (; i; --i)
+ if (ibdev->counters[i - 1] != -1)
+ mlx4_counter_free(ibdev->dev, ibdev->counters[i - 1]);
+
+err_map:
+ iounmap(ibdev->uar_map);
+
+err_uar:
+ mlx4_uar_free(dev, &ibdev->priv_uar);
+
+err_pd:
+ mlx4_pd_free(dev, ibdev->priv_pdn);
+
+err_dealloc:
+ ib_dealloc_device(&ibdev->ib_dev);
+
+ return NULL;
+}
+
+int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn)
+{
+ int offset;
+
+ WARN_ON(!dev->ib_uc_qpns_bitmap);
+
+ offset = bitmap_find_free_region(dev->ib_uc_qpns_bitmap,
+ dev->steer_qpn_count,
+ get_count_order(count));
+ if (offset < 0)
+ return offset;
+
+ *qpn = dev->steer_qpn_base + offset;
+ return 0;
+}
+
+void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count)
+{
+ if (!qpn ||
+ dev->steering_support != MLX4_STEERING_MODE_DEVICE_MANAGED)
+ return;
+
+ BUG_ON(qpn < dev->steer_qpn_base);
+
+ bitmap_release_region(dev->ib_uc_qpns_bitmap,
+ qpn - dev->steer_qpn_base,
+ get_count_order(count));
+}
+
+int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
+ int is_attach)
+{
+ int err;
+ size_t flow_size;
+ struct ib_flow_attr *flow = NULL;
+ struct ib_flow_spec_ib *ib_spec;
+
+ if (is_attach) {
+ flow_size = sizeof(struct ib_flow_attr) +
+ sizeof(struct ib_flow_spec_ib);
+ flow = kzalloc(flow_size, GFP_KERNEL);
+ if (!flow)
+ return -ENOMEM;
+ flow->port = mqp->port;
+ flow->num_of_specs = 1;
+ flow->size = flow_size;
+ ib_spec = (struct ib_flow_spec_ib *)(flow + 1);
+ ib_spec->type = IB_FLOW_SPEC_IB;
+ ib_spec->size = sizeof(struct ib_flow_spec_ib);
+ /* Add an empty rule for IB L2 */
+ memset(&ib_spec->mask, 0, sizeof(ib_spec->mask));
+
+ err = __mlx4_ib_create_flow(&mqp->ibqp, flow,
+ IB_FLOW_DOMAIN_NIC,
+ MLX4_FS_REGULAR,
+ &mqp->reg_id);
+ } else {
+ err = __mlx4_ib_destroy_flow(mdev->dev, mqp->reg_id);
+ }
+ kfree(flow);
+ return err;
+}
+
+static void mlx4_ib_remove(struct mlx4_dev *dev, void *ibdev_ptr)
+{
+ struct mlx4_ib_dev *ibdev = ibdev_ptr;
+ int p;
+
+ ibdev->ib_active = false;
+ flush_workqueue(wq);
+
+ mlx4_ib_close_sriov(ibdev);
+ mlx4_ib_mad_cleanup(ibdev);
+ ib_unregister_device(&ibdev->ib_dev);
+ if (ibdev->iboe.nb.notifier_call) {
+ if (unregister_netdevice_notifier(&ibdev->iboe.nb))
+ pr_warn("failure unregistering notifier\n");
+ ibdev->iboe.nb.notifier_call = NULL;
+ }
+
+ if (ibdev->steering_support == MLX4_STEERING_MODE_DEVICE_MANAGED) {
+ mlx4_qp_release_range(dev, ibdev->steer_qpn_base,
+ ibdev->steer_qpn_count);
+ kfree(ibdev->ib_uc_qpns_bitmap);
+ }
+
+ if (ibdev->iboe.nb_inet.notifier_call) {
+ if (unregister_inetaddr_notifier(&ibdev->iboe.nb_inet))
+ pr_warn("failure unregistering notifier\n");
+ ibdev->iboe.nb_inet.notifier_call = NULL;
+ }
+#if IS_ENABLED(CONFIG_IPV6)
+ if (ibdev->iboe.nb_inet6.notifier_call) {
+ if (unregister_inet6addr_notifier(&ibdev->iboe.nb_inet6))
+ pr_warn("failure unregistering notifier\n");
+ ibdev->iboe.nb_inet6.notifier_call = NULL;
+ }
+#endif
+
+ iounmap(ibdev->uar_map);
+ for (p = 0; p < ibdev->num_ports; ++p)
+ if (ibdev->counters[p] != -1)
+ mlx4_counter_free(ibdev->dev, ibdev->counters[p]);
+ mlx4_foreach_port(p, dev, MLX4_PORT_TYPE_IB)
+ mlx4_CLOSE_PORT(dev, p);
+
+ mlx4_ib_free_eqs(dev, ibdev);
+
+ mlx4_uar_free(dev, &ibdev->priv_uar);
+ mlx4_pd_free(dev, ibdev->priv_pdn);
+ ib_dealloc_device(&ibdev->ib_dev);
+}
+
+static void do_slave_init(struct mlx4_ib_dev *ibdev, int slave, int do_init)
+{
+ struct mlx4_ib_demux_work **dm = NULL;
+ struct mlx4_dev *dev = ibdev->dev;
+ int i;
+ unsigned long flags;
+ struct mlx4_active_ports actv_ports;
+ unsigned int ports;
+ unsigned int first_port;
+
+ if (!mlx4_is_master(dev))
+ return;
+
+ actv_ports = mlx4_get_active_ports(dev, slave);
+ ports = bitmap_weight(actv_ports.ports, dev->caps.num_ports);
+ first_port = find_first_bit(actv_ports.ports, dev->caps.num_ports);
+
+ dm = kcalloc(ports, sizeof(*dm), GFP_ATOMIC);
+ if (!dm) {
+ pr_err("failed to allocate memory for tunneling qp update\n");
+ goto out;
+ }
+
+ for (i = 0; i < ports; i++) {
+ dm[i] = kmalloc(sizeof(struct mlx4_ib_demux_work), GFP_ATOMIC);
+ if (!dm[i]) {
+ pr_err("failed to allocate memory for tunneling qp update work struct\n");
+ while (--i >= 0)
+ kfree(dm[i]);
+ goto out;
+ }
+ }
+ /* initialize or tear down tunnel QPs for the slave */
+ for (i = 0; i < ports; i++) {
+ INIT_WORK(&dm[i]->work, mlx4_ib_tunnels_update_work);
+ dm[i]->port = first_port + i + 1;
+ dm[i]->slave = slave;
+ dm[i]->do_init = do_init;
+ dm[i]->dev = ibdev;
+ spin_lock_irqsave(&ibdev->sriov.going_down_lock, flags);
+ if (!ibdev->sriov.is_going_down)
+ queue_work(ibdev->sriov.demux[i].ud_wq, &dm[i]->work);
+ spin_unlock_irqrestore(&ibdev->sriov.going_down_lock, flags);
+ }
+out:
+ kfree(dm);
+ return;
+}
+
+static void mlx4_ib_handle_catas_error(struct mlx4_ib_dev *ibdev)
+{
+ struct mlx4_ib_qp *mqp;
+ unsigned long flags_qp;
+ unsigned long flags_cq;
+ struct mlx4_ib_cq *send_mcq, *recv_mcq;
+ struct list_head cq_notify_list;
+ struct mlx4_cq *mcq;
+ unsigned long flags;
+
+ pr_warn("mlx4_ib_handle_catas_error was started\n");
+ INIT_LIST_HEAD(&cq_notify_list);
+
+ /* Go over the qp list residing on that ibdev, synced with create/destroy qp. */
+ spin_lock_irqsave(&ibdev->reset_flow_resource_lock, flags);
+
+ list_for_each_entry(mqp, &ibdev->qp_list, qps_list) {
+ spin_lock_irqsave(&mqp->sq.lock, flags_qp);
+ if (mqp->sq.tail != mqp->sq.head) {
+ send_mcq = to_mcq(mqp->ibqp.send_cq);
+ spin_lock_irqsave(&send_mcq->lock, flags_cq);
+ if (send_mcq->mcq.comp &&
+ mqp->ibqp.send_cq->comp_handler) {
+ if (!send_mcq->mcq.reset_notify_added) {
+ send_mcq->mcq.reset_notify_added = 1;
+ list_add_tail(&send_mcq->mcq.reset_notify,
+ &cq_notify_list);
+ }
+ }
+ spin_unlock_irqrestore(&send_mcq->lock, flags_cq);
+ }
+ spin_unlock_irqrestore(&mqp->sq.lock, flags_qp);
+ /* Now, handle the QP's receive queue */
+ spin_lock_irqsave(&mqp->rq.lock, flags_qp);
+ /* no handling is needed for SRQ */
+ if (!mqp->ibqp.srq) {
+ if (mqp->rq.tail != mqp->rq.head) {
+ recv_mcq = to_mcq(mqp->ibqp.recv_cq);
+ spin_lock_irqsave(&recv_mcq->lock, flags_cq);
+ if (recv_mcq->mcq.comp &&
+ mqp->ibqp.recv_cq->comp_handler) {
+ if (!recv_mcq->mcq.reset_notify_added) {
+ recv_mcq->mcq.reset_notify_added = 1;
+ list_add_tail(&recv_mcq->mcq.reset_notify,
+ &cq_notify_list);
+ }
+ }
+ spin_unlock_irqrestore(&recv_mcq->lock,
+ flags_cq);
+ }
+ }
+ spin_unlock_irqrestore(&mqp->rq.lock, flags_qp);
+ }
+
+ list_for_each_entry(mcq, &cq_notify_list, reset_notify) {
+ mcq->comp(mcq);
+ }
+ spin_unlock_irqrestore(&ibdev->reset_flow_resource_lock, flags);
+ pr_warn("mlx4_ib_handle_catas_error ended\n");
+}
+
+static void handle_bonded_port_state_event(struct work_struct *work)
+{
+ struct ib_event_work *ew =
+ container_of(work, struct ib_event_work, work);
+ struct mlx4_ib_dev *ibdev = ew->ib_dev;
+ enum ib_port_state bonded_port_state = IB_PORT_NOP;
+ int i;
+ struct ib_event ibev;
+
+ kfree(ew);
+ spin_lock_bh(&ibdev->iboe.lock);
+ for (i = 0; i < MLX4_MAX_PORTS; ++i) {
+ struct net_device *curr_netdev = ibdev->iboe.netdevs[i];
+ enum ib_port_state curr_port_state;
+
+ if (!curr_netdev)
+ continue;
+
+ curr_port_state =
+ (netif_running(curr_netdev) &&
+ netif_carrier_ok(curr_netdev)) ?
+ IB_PORT_ACTIVE : IB_PORT_DOWN;
+
+ bonded_port_state = (bonded_port_state != IB_PORT_ACTIVE) ?
+ curr_port_state : IB_PORT_ACTIVE;
+ }
+ spin_unlock_bh(&ibdev->iboe.lock);
+
+ ibev.device = &ibdev->ib_dev;
+ ibev.element.port_num = 1;
+ ibev.event = (bonded_port_state == IB_PORT_ACTIVE) ?
+ IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
+
+ ib_dispatch_event(&ibev);
+}
+
+static void mlx4_ib_event(struct mlx4_dev *dev, void *ibdev_ptr,
+ enum mlx4_dev_event event, unsigned long param)
+{
+ struct ib_event ibev;
+ struct mlx4_ib_dev *ibdev = to_mdev((struct ib_device *) ibdev_ptr);
+ struct mlx4_eqe *eqe = NULL;
+ struct ib_event_work *ew;
+ int p = 0;
+
+ if (mlx4_is_bonded(dev) &&
+ ((event == MLX4_DEV_EVENT_PORT_UP) ||
+ (event == MLX4_DEV_EVENT_PORT_DOWN))) {
+ ew = kmalloc(sizeof(*ew), GFP_ATOMIC);
+ if (!ew)
+ return;
+ INIT_WORK(&ew->work, handle_bonded_port_state_event);
+ ew->ib_dev = ibdev;
+ queue_work(wq, &ew->work);
+ return;
+ }
+
+ if (event == MLX4_DEV_EVENT_PORT_MGMT_CHANGE)
+ eqe = (struct mlx4_eqe *)param;
+ else
+ p = (int) param;
+
+ switch (event) {
+ case MLX4_DEV_EVENT_PORT_UP:
+ if (p > ibdev->num_ports)
+ return;
+ if (mlx4_is_master(dev) &&
+ rdma_port_get_link_layer(&ibdev->ib_dev, p) ==
+ IB_LINK_LAYER_INFINIBAND) {
+ mlx4_ib_invalidate_all_guid_record(ibdev, p);
+ }
+ ibev.event = IB_EVENT_PORT_ACTIVE;
+ break;
+
+ case MLX4_DEV_EVENT_PORT_DOWN:
+ if (p > ibdev->num_ports)
+ return;
+ ibev.event = IB_EVENT_PORT_ERR;
+ break;
+
+ case MLX4_DEV_EVENT_CATASTROPHIC_ERROR:
+ ibdev->ib_active = false;
+ ibev.event = IB_EVENT_DEVICE_FATAL;
+ mlx4_ib_handle_catas_error(ibdev);
+ break;
+
+ case MLX4_DEV_EVENT_PORT_MGMT_CHANGE:
+ ew = kmalloc(sizeof *ew, GFP_ATOMIC);
+ if (!ew) {
+ pr_err("failed to allocate memory for events work\n");
+ break;
+ }
+
+ INIT_WORK(&ew->work, handle_port_mgmt_change_event);
+ memcpy(&ew->ib_eqe, eqe, sizeof *eqe);
+ ew->ib_dev = ibdev;
+ /* need to queue only for port owner, which uses GEN_EQE */
+ if (mlx4_is_master(dev))
+ queue_work(wq, &ew->work);
+ else
+ handle_port_mgmt_change_event(&ew->work);
+ return;
+
+ case MLX4_DEV_EVENT_SLAVE_INIT:
+ /* here, p is the slave id */
+ do_slave_init(ibdev, p, 1);
+ if (mlx4_is_master(dev)) {
+ int i;
+
+ for (i = 1; i <= ibdev->num_ports; i++) {
+ if (rdma_port_get_link_layer(&ibdev->ib_dev, i)
+ == IB_LINK_LAYER_INFINIBAND)
+ mlx4_ib_slave_alias_guid_event(ibdev,
+ p, i,
+ 1);
+ }
+ }
+ return;
+
+ case MLX4_DEV_EVENT_SLAVE_SHUTDOWN:
+ if (mlx4_is_master(dev)) {
+ int i;
+
+ for (i = 1; i <= ibdev->num_ports; i++) {
+ if (rdma_port_get_link_layer(&ibdev->ib_dev, i)
+ == IB_LINK_LAYER_INFINIBAND)
+ mlx4_ib_slave_alias_guid_event(ibdev,
+ p, i,
+ 0);
+ }
+ }
+ /* here, p is the slave id */
+ do_slave_init(ibdev, p, 0);
+ return;
+
+ default:
+ return;
+ }
+
+ ibev.device = ibdev_ptr;
+ ibev.element.port_num = mlx4_is_bonded(ibdev->dev) ? 1 : (u8)p;
+
+ ib_dispatch_event(&ibev);
+}
+
+static struct mlx4_interface mlx4_ib_interface = {
+ .add = mlx4_ib_add,
+ .remove = mlx4_ib_remove,
+ .event = mlx4_ib_event,
+ .protocol = MLX4_PROT_IB_IPV6,
+ .flags = MLX4_INTFF_BONDING
+};
+
+static int __init mlx4_ib_init(void)
+{
+ int err;
+
+ wq = create_singlethread_workqueue("mlx4_ib");
+ if (!wq)
+ return -ENOMEM;
+
+ err = mlx4_ib_mcg_init();
+ if (err)
+ goto clean_wq;
+
+ err = mlx4_register_interface(&mlx4_ib_interface);
+ if (err)
+ goto clean_mcg;
+
+ return 0;
+
+clean_mcg:
+ mlx4_ib_mcg_destroy();
+
+clean_wq:
+ destroy_workqueue(wq);
+ return err;
+}
+
+static void __exit mlx4_ib_cleanup(void)
+{
+ mlx4_unregister_interface(&mlx4_ib_interface);
+ mlx4_ib_mcg_destroy();
+ destroy_workqueue(wq);
+}
+
+module_init(mlx4_ib_init);
+module_exit(mlx4_ib_cleanup);
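
The steering-QPN helpers above (mlx4_ib_steer_qp_alloc()/mlx4_ib_steer_qp_free())
hand out power-of-two regions of the reserved QPN range using the kernel's region
bitmap helpers. A minimal, self-contained sketch of that allocation pattern is shown
below; the qpn_pool type and helper names are illustrative assumptions, not driver API:

/*
 * Sketch of the order-based bitmap allocator pattern used by
 * mlx4_ib_steer_qp_alloc()/mlx4_ib_steer_qp_free().  Illustrative only.
 */
#include <linux/bitmap.h>
#include <linux/log2.h>
#include <linux/types.h>

struct qpn_pool {
	unsigned long *map;	/* one bit per QPN in the reserved range */
	u32 base;		/* first QPN of the range */
	u32 count;		/* number of QPNs in the range */
};

static int qpn_pool_alloc(struct qpn_pool *pool, int count, u32 *qpn)
{
	/* regions are rounded up to a power of two, hence get_count_order() */
	int offset = bitmap_find_free_region(pool->map, pool->count,
					     get_count_order(count));
	if (offset < 0)
		return offset;	/* -ENOMEM when no free region of that order */
	*qpn = pool->base + offset;
	return 0;
}

static void qpn_pool_free(struct qpn_pool *pool, u32 qpn, int count)
{
	/* release the same power-of-two region that was allocated */
	bitmap_release_region(pool->map, qpn - pool->base,
			      get_count_order(count));
}

Because bitmap_find_free_region() rounds every request up to a power-of-two
region, the matching free must pass the same count, which is how the driver
pairs mlx4_ib_steer_qp_alloc() with mlx4_ib_steer_qp_free() above.
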
diff --git a/drivers/infiniband/hw/mlx4/mcg.c b/drivers/infiniband/hw/mlx4/mcg.c
new file mode 100644
index 000000000..ed327e6c8
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/mcg.c
@@ -0,0 +1,1257 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_sa.h>
+
+#include <linux/mlx4/cmd.h>
+#include <linux/rbtree.h>
+#include <linux/delay.h>
+
+#include "mlx4_ib.h"
+
+#define MAX_VFS 80
+#define MAX_PEND_REQS_PER_FUNC 4
+#define MAD_TIMEOUT_MS 2000
+
+#define mcg_warn(fmt, arg...) pr_warn("MCG WARNING: " fmt, ##arg)
+#define mcg_error(fmt, arg...) pr_err(fmt, ##arg)
+#define mcg_warn_group(group, format, arg...) \
+ pr_warn("%s-%d: %16s (port %d): WARNING: " format, __func__, __LINE__,\
+ (group)->name, group->demux->port, ## arg)
+
+#define mcg_error_group(group, format, arg...) \
+ pr_err(" %16s: " format, (group)->name, ## arg)
+
+
+static union ib_gid mgid0;
+
+static struct workqueue_struct *clean_wq;
+
+enum mcast_state {
+ MCAST_NOT_MEMBER = 0,
+ MCAST_MEMBER,
+};
+
+enum mcast_group_state {
+ MCAST_IDLE,
+ MCAST_JOIN_SENT,
+ MCAST_LEAVE_SENT,
+ MCAST_RESP_READY
+};
+
+struct mcast_member {
+ enum mcast_state state;
+ uint8_t join_state;
+ int num_pend_reqs;
+ struct list_head pending;
+};
+
+struct ib_sa_mcmember_data {
+ union ib_gid mgid;
+ union ib_gid port_gid;
+ __be32 qkey;
+ __be16 mlid;
+ u8 mtusel_mtu;
+ u8 tclass;
+ __be16 pkey;
+ u8 ratesel_rate;
+ u8 lifetmsel_lifetm;
+ __be32 sl_flowlabel_hoplimit;
+ u8 scope_join_state;
+ u8 proxy_join;
+ u8 reserved[2];
+};
+
+struct mcast_group {
+ struct ib_sa_mcmember_data rec;
+ struct rb_node node;
+ struct list_head mgid0_list;
+ struct mlx4_ib_demux_ctx *demux;
+ struct mcast_member func[MAX_VFS];
+ struct mutex lock;
+ struct work_struct work;
+ struct list_head pending_list;
+ int members[3];
+ enum mcast_group_state state;
+ enum mcast_group_state prev_state;
+ struct ib_sa_mad response_sa_mad;
+ __be64 last_req_tid;
+
+ char name[33]; /* MGID string */
+ struct device_attribute dentry;
+
+ /* refcount is the reference count for the following:
+ 1. Each queued request
+ 2. Each invocation of the worker thread
+ 3. Membership of the port at the SA
+ */
+ atomic_t refcount;
+
+ /* delayed work to clean pending SM request */
+ struct delayed_work timeout_work;
+ struct list_head cleanup_list;
+};
+
+struct mcast_req {
+ int func;
+ struct ib_sa_mad sa_mad;
+ struct list_head group_list;
+ struct list_head func_list;
+ struct mcast_group *group;
+ int clean;
+};
+
+
+#define safe_atomic_dec(ref) \
+ do {\
+ if (atomic_dec_and_test(ref)) \
+ mcg_warn_group(group, "did not expect to reach zero\n"); \
+ } while (0)
+
+static const char *get_state_string(enum mcast_group_state state)
+{
+ switch (state) {
+ case MCAST_IDLE:
+ return "MCAST_IDLE";
+ case MCAST_JOIN_SENT:
+ return "MCAST_JOIN_SENT";
+ case MCAST_LEAVE_SENT:
+ return "MCAST_LEAVE_SENT";
+ case MCAST_RESP_READY:
+ return "MCAST_RESP_READY";
+ }
+ return "Invalid State";
+}
+
+static struct mcast_group *mcast_find(struct mlx4_ib_demux_ctx *ctx,
+ union ib_gid *mgid)
+{
+ struct rb_node *node = ctx->mcg_table.rb_node;
+ struct mcast_group *group;
+ int ret;
+
+ while (node) {
+ group = rb_entry(node, struct mcast_group, node);
+ ret = memcmp(mgid->raw, group->rec.mgid.raw, sizeof *mgid);
+ if (!ret)
+ return group;
+
+ if (ret < 0)
+ node = node->rb_left;
+ else
+ node = node->rb_right;
+ }
+ return NULL;
+}
+
+static struct mcast_group *mcast_insert(struct mlx4_ib_demux_ctx *ctx,
+ struct mcast_group *group)
+{
+ struct rb_node **link = &ctx->mcg_table.rb_node;
+ struct rb_node *parent = NULL;
+ struct mcast_group *cur_group;
+ int ret;
+
+ while (*link) {
+ parent = *link;
+ cur_group = rb_entry(parent, struct mcast_group, node);
+
+ ret = memcmp(group->rec.mgid.raw, cur_group->rec.mgid.raw,
+ sizeof group->rec.mgid);
+ if (ret < 0)
+ link = &(*link)->rb_left;
+ else if (ret > 0)
+ link = &(*link)->rb_right;
+ else
+ return cur_group;
+ }
+ rb_link_node(&group->node, parent, link);
+ rb_insert_color(&group->node, &ctx->mcg_table);
+ return NULL;
+}
+
+static int send_mad_to_wire(struct mlx4_ib_demux_ctx *ctx, struct ib_mad *mad)
+{
+ struct mlx4_ib_dev *dev = ctx->dev;
+ struct ib_ah_attr ah_attr;
+
+ spin_lock(&dev->sm_lock);
+ if (!dev->sm_ah[ctx->port - 1]) {
+ /* port is not yet Active, sm_ah not ready */
+ spin_unlock(&dev->sm_lock);
+ return -EAGAIN;
+ }
+ mlx4_ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr);
+ spin_unlock(&dev->sm_lock);
+ return mlx4_ib_send_to_wire(dev, mlx4_master_func_num(dev->dev),
+ ctx->port, IB_QPT_GSI, 0, 1, IB_QP1_QKEY,
+ &ah_attr, NULL, mad);
+}
+
+static int send_mad_to_slave(int slave, struct mlx4_ib_demux_ctx *ctx,
+ struct ib_mad *mad)
+{
+ struct mlx4_ib_dev *dev = ctx->dev;
+ struct ib_mad_agent *agent = dev->send_agent[ctx->port - 1][1];
+ struct ib_wc wc;
+ struct ib_ah_attr ah_attr;
+
+ /* Our agent might not yet be registered when MADs start to arrive */
+ if (!agent)
+ return -EAGAIN;
+
+ ib_query_ah(dev->sm_ah[ctx->port - 1], &ah_attr);
+
+ if (ib_find_cached_pkey(&dev->ib_dev, ctx->port, IB_DEFAULT_PKEY_FULL, &wc.pkey_index))
+ return -EINVAL;
+ wc.sl = 0;
+ wc.dlid_path_bits = 0;
+ wc.port_num = ctx->port;
+ wc.slid = ah_attr.dlid; /* opensm lid */
+ wc.src_qp = 1;
+ return mlx4_ib_send_to_slave(dev, slave, ctx->port, IB_QPT_GSI, &wc, NULL, mad);
+}
+
+static int send_join_to_wire(struct mcast_group *group, struct ib_sa_mad *sa_mad)
+{
+ struct ib_sa_mad mad;
+ struct ib_sa_mcmember_data *sa_mad_data = (struct ib_sa_mcmember_data *)&mad.data;
+ int ret;
+
+ /* start from the MAD request as it arrived from the VF */
+ memcpy(&mad, sa_mad, sizeof mad);
+
+ /* fix port GID to be the real one (slave 0) */
+ sa_mad_data->port_gid.global.interface_id = group->demux->guid_cache[0];
+
+ /* assign our own TID */
+ mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux);
+ group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */
+
+ ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad);
+ /* set timeout handler */
+ if (!ret) {
+ /* calls mlx4_ib_mcg_timeout_handler */
+ queue_delayed_work(group->demux->mcg_wq, &group->timeout_work,
+ msecs_to_jiffies(MAD_TIMEOUT_MS));
+ }
+
+ return ret;
+}
+
+static int send_leave_to_wire(struct mcast_group *group, u8 join_state)
+{
+ struct ib_sa_mad mad;
+ struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data;
+ int ret;
+
+ memset(&mad, 0, sizeof mad);
+ mad.mad_hdr.base_version = 1;
+ mad.mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
+ mad.mad_hdr.class_version = 2;
+ mad.mad_hdr.method = IB_SA_METHOD_DELETE;
+ mad.mad_hdr.status = cpu_to_be16(0);
+ mad.mad_hdr.class_specific = cpu_to_be16(0);
+ mad.mad_hdr.tid = mlx4_ib_get_new_demux_tid(group->demux);
+ group->last_req_tid = mad.mad_hdr.tid; /* keep it for later validation */
+ mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC);
+ mad.mad_hdr.attr_mod = cpu_to_be32(0);
+ mad.sa_hdr.sm_key = 0x0;
+ mad.sa_hdr.attr_offset = cpu_to_be16(7);
+ mad.sa_hdr.comp_mask = IB_SA_MCMEMBER_REC_MGID |
+ IB_SA_MCMEMBER_REC_PORT_GID | IB_SA_MCMEMBER_REC_JOIN_STATE;
+
+ *sa_data = group->rec;
+ sa_data->scope_join_state = join_state;
+
+ ret = send_mad_to_wire(group->demux, (struct ib_mad *)&mad);
+ if (ret)
+ group->state = MCAST_IDLE;
+
+ /* set timeout handler */
+ if (!ret) {
+ /* calls mlx4_ib_mcg_timeout_handler */
+ queue_delayed_work(group->demux->mcg_wq, &group->timeout_work,
+ msecs_to_jiffies(MAD_TIMEOUT_MS));
+ }
+
+ return ret;
+}
+
+static int send_reply_to_slave(int slave, struct mcast_group *group,
+ struct ib_sa_mad *req_sa_mad, u16 status)
+{
+ struct ib_sa_mad mad;
+ struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)&mad.data;
+ struct ib_sa_mcmember_data *req_sa_data = (struct ib_sa_mcmember_data *)&req_sa_mad->data;
+ int ret;
+
+ memset(&mad, 0, sizeof mad);
+ mad.mad_hdr.base_version = 1;
+ mad.mad_hdr.mgmt_class = IB_MGMT_CLASS_SUBN_ADM;
+ mad.mad_hdr.class_version = 2;
+ mad.mad_hdr.method = IB_MGMT_METHOD_GET_RESP;
+ mad.mad_hdr.status = cpu_to_be16(status);
+ mad.mad_hdr.class_specific = cpu_to_be16(0);
+ mad.mad_hdr.tid = req_sa_mad->mad_hdr.tid;
+ *(u8 *)&mad.mad_hdr.tid = 0; /* resetting tid to 0 */
+ mad.mad_hdr.attr_id = cpu_to_be16(IB_SA_ATTR_MC_MEMBER_REC);
+ mad.mad_hdr.attr_mod = cpu_to_be32(0);
+ mad.sa_hdr.sm_key = req_sa_mad->sa_hdr.sm_key;
+ mad.sa_hdr.attr_offset = cpu_to_be16(7);
+ mad.sa_hdr.comp_mask = 0; /* ignored on responses, see IBTA spec */
+
+ *sa_data = group->rec;
+
+ /* reconstruct VF's requested join_state and port_gid */
+ sa_data->scope_join_state &= 0xf0;
+ sa_data->scope_join_state |= (group->func[slave].join_state & 0x0f);
+ memcpy(&sa_data->port_gid, &req_sa_data->port_gid, sizeof req_sa_data->port_gid);
+
+ ret = send_mad_to_slave(slave, group->demux, (struct ib_mad *)&mad);
+ return ret;
+}
+
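+/* Return nonzero when src_value fails the selector (GT/LT/EQ) encoded in the
+ * top two bits of dst_value; return zero when the field is not selected for
+ * comparison or the check passes. */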
+static int check_selector(ib_sa_comp_mask comp_mask,
+ ib_sa_comp_mask selector_mask,
+ ib_sa_comp_mask value_mask,
+ u8 src_value, u8 dst_value)
+{
+ int err;
+ u8 selector = dst_value >> 6;
+ dst_value &= 0x3f;
+ src_value &= 0x3f;
+
+ if (!(comp_mask & selector_mask) || !(comp_mask & value_mask))
+ return 0;
+
+ switch (selector) {
+ case IB_SA_GT:
+ err = (src_value <= dst_value);
+ break;
+ case IB_SA_LT:
+ err = (src_value >= dst_value);
+ break;
+ case IB_SA_EQ:
+ err = (src_value != dst_value);
+ break;
+ default:
+ err = 0;
+ break;
+ }
+
+ return err;
+}
+
+static u16 cmp_rec(struct ib_sa_mcmember_data *src,
+ struct ib_sa_mcmember_data *dst, ib_sa_comp_mask comp_mask)
+{
+ /* src is group record, dst is request record */
+ /* MGID must already match */
+ /* Port_GID we always replace to our Port_GID, so it is a match */
+
+#define MAD_STATUS_REQ_INVALID 0x0200
+ if (comp_mask & IB_SA_MCMEMBER_REC_QKEY && src->qkey != dst->qkey)
+ return MAD_STATUS_REQ_INVALID;
+ if (comp_mask & IB_SA_MCMEMBER_REC_MLID && src->mlid != dst->mlid)
+ return MAD_STATUS_REQ_INVALID;
+ if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_MTU_SELECTOR,
+ IB_SA_MCMEMBER_REC_MTU,
+ src->mtusel_mtu, dst->mtusel_mtu))
+ return MAD_STATUS_REQ_INVALID;
+ if (comp_mask & IB_SA_MCMEMBER_REC_TRAFFIC_CLASS &&
+ src->tclass != dst->tclass)
+ return MAD_STATUS_REQ_INVALID;
+ if (comp_mask & IB_SA_MCMEMBER_REC_PKEY && src->pkey != dst->pkey)
+ return MAD_STATUS_REQ_INVALID;
+ if (check_selector(comp_mask, IB_SA_MCMEMBER_REC_RATE_SELECTOR,
+ IB_SA_MCMEMBER_REC_RATE,
+ src->ratesel_rate, dst->ratesel_rate))
+ return MAD_STATUS_REQ_INVALID;
+ if (check_selector(comp_mask,
+ IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME_SELECTOR,
+ IB_SA_MCMEMBER_REC_PACKET_LIFE_TIME,
+ src->lifetmsel_lifetm, dst->lifetmsel_lifetm))
+ return MAD_STATUS_REQ_INVALID;
+ if (comp_mask & IB_SA_MCMEMBER_REC_SL &&
+ (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0xf0000000) !=
+ (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0xf0000000))
+ return MAD_STATUS_REQ_INVALID;
+ if (comp_mask & IB_SA_MCMEMBER_REC_FLOW_LABEL &&
+ (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x0fffff00) !=
+ (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x0fffff00))
+ return MAD_STATUS_REQ_INVALID;
+ if (comp_mask & IB_SA_MCMEMBER_REC_HOP_LIMIT &&
+ (be32_to_cpu(src->sl_flowlabel_hoplimit) & 0x000000ff) !=
+ (be32_to_cpu(dst->sl_flowlabel_hoplimit) & 0x000000ff))
+ return MAD_STATUS_REQ_INVALID;
+ if (comp_mask & IB_SA_MCMEMBER_REC_SCOPE &&
+ (src->scope_join_state & 0xf0) !=
+ (dst->scope_join_state & 0xf0))
+ return MAD_STATUS_REQ_INVALID;
+
+ /* join_state checked separately, proxy_join ignored */
+
+ return 0;
+}
+
+/* Release the group; return 1 if this was the last release and the group was
+ * destroyed. Timeout work is canceled synchronously. */
+static int release_group(struct mcast_group *group, int from_timeout_handler)
+{
+ struct mlx4_ib_demux_ctx *ctx = group->demux;
+ int nzgroup;
+
+ mutex_lock(&ctx->mcg_table_lock);
+ mutex_lock(&group->lock);
+ if (atomic_dec_and_test(&group->refcount)) {
+ if (!from_timeout_handler) {
+ if (group->state != MCAST_IDLE &&
+ !cancel_delayed_work(&group->timeout_work)) {
+ atomic_inc(&group->refcount);
+ mutex_unlock(&group->lock);
+ mutex_unlock(&ctx->mcg_table_lock);
+ return 0;
+ }
+ }
+
+ nzgroup = memcmp(&group->rec.mgid, &mgid0, sizeof mgid0);
+ if (nzgroup)
+ del_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr);
+ if (!list_empty(&group->pending_list))
+ mcg_warn_group(group, "releasing a group with non-empty pending list\n");
+ if (nzgroup)
+ rb_erase(&group->node, &ctx->mcg_table);
+ list_del_init(&group->mgid0_list);
+ mutex_unlock(&group->lock);
+ mutex_unlock(&ctx->mcg_table_lock);
+ kfree(group);
+ return 1;
+ } else {
+ mutex_unlock(&group->lock);
+ mutex_unlock(&ctx->mcg_table_lock);
+ }
+ return 0;
+}
+
+static void adjust_membership(struct mcast_group *group, u8 join_state, int inc)
+{
+ int i;
+
+ for (i = 0; i < 3; i++, join_state >>= 1)
+ if (join_state & 0x1)
+ group->members[i] += inc;
+}
+
+static u8 get_leave_state(struct mcast_group *group)
+{
+ u8 leave_state = 0;
+ int i;
+
+ for (i = 0; i < 3; i++)
+ if (!group->members[i])
+ leave_state |= (1 << i);
+
+ return leave_state & (group->rec.scope_join_state & 7);
+}
+
+static int join_group(struct mcast_group *group, int slave, u8 join_mask)
+{
+ int ret = 0;
+ u8 join_state;
+
+ /* remove bits the slave is already a member of, and adjust */
+ join_state = join_mask & (~group->func[slave].join_state);
+ adjust_membership(group, join_state, 1);
+ group->func[slave].join_state |= join_state;
+ if (group->func[slave].state != MCAST_MEMBER && join_state) {
+ group->func[slave].state = MCAST_MEMBER;
+ ret = 1;
+ }
+ return ret;
+}
+
+static int leave_group(struct mcast_group *group, int slave, u8 leave_state)
+{
+ int ret = 0;
+
+ adjust_membership(group, leave_state, -1);
+ group->func[slave].join_state &= ~leave_state;
+ if (!group->func[slave].join_state) {
+ group->func[slave].state = MCAST_NOT_MEMBER;
+ ret = 1;
+ }
+ return ret;
+}
+
+static int check_leave(struct mcast_group *group, int slave, u8 leave_mask)
+{
+ if (group->func[slave].state != MCAST_MEMBER)
+ return MAD_STATUS_REQ_INVALID;
+
+ /* make sure we're not deleting unset bits */
+ if (~group->func[slave].join_state & leave_mask)
+ return MAD_STATUS_REQ_INVALID;
+
+ if (!leave_mask)
+ return MAD_STATUS_REQ_INVALID;
+
+ return 0;
+}
+
+static void mlx4_ib_mcg_timeout_handler(struct work_struct *work)
+{
+ struct delayed_work *delay = to_delayed_work(work);
+ struct mcast_group *group;
+ struct mcast_req *req = NULL;
+
+ group = container_of(delay, typeof(*group), timeout_work);
+
+ mutex_lock(&group->lock);
+ if (group->state == MCAST_JOIN_SENT) {
+ if (!list_empty(&group->pending_list)) {
+ req = list_first_entry(&group->pending_list, struct mcast_req, group_list);
+ list_del(&req->group_list);
+ list_del(&req->func_list);
+ --group->func[req->func].num_pend_reqs;
+ mutex_unlock(&group->lock);
+ kfree(req);
+ if (memcmp(&group->rec.mgid, &mgid0, sizeof mgid0)) {
+ if (release_group(group, 1))
+ return;
+ } else {
+ kfree(group);
+ return;
+ }
+ mutex_lock(&group->lock);
+ } else
+ mcg_warn_group(group, "DRIVER BUG\n");
+ } else if (group->state == MCAST_LEAVE_SENT) {
+ if (group->rec.scope_join_state & 7)
+ group->rec.scope_join_state &= 0xf8;
+ group->state = MCAST_IDLE;
+ mutex_unlock(&group->lock);
+ if (release_group(group, 1))
+ return;
+ mutex_lock(&group->lock);
+ } else
+ mcg_warn_group(group, "invalid state %s\n", get_state_string(group->state));
+ group->state = MCAST_IDLE;
+ atomic_inc(&group->refcount);
+ if (!queue_work(group->demux->mcg_wq, &group->work))
+ safe_atomic_dec(&group->refcount);
+
+ mutex_unlock(&group->lock);
+}
+
+static int handle_leave_req(struct mcast_group *group, u8 leave_mask,
+ struct mcast_req *req)
+{
+ u16 status;
+
+ if (req->clean)
+ leave_mask = group->func[req->func].join_state;
+
+ status = check_leave(group, req->func, leave_mask);
+ if (!status)
+ leave_group(group, req->func, leave_mask);
+
+ if (!req->clean)
+ send_reply_to_slave(req->func, group, &req->sa_mad, status);
+ --group->func[req->func].num_pend_reqs;
+ list_del(&req->group_list);
+ list_del(&req->func_list);
+ kfree(req);
+ return 1;
+}
+
+static int handle_join_req(struct mcast_group *group, u8 join_mask,
+ struct mcast_req *req)
+{
+ u8 group_join_state = group->rec.scope_join_state & 7;
+ int ref = 0;
+ u16 status;
+ struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data;
+
+ if (join_mask == (group_join_state & join_mask)) {
+ /* port's membership need not change */
+ status = cmp_rec(&group->rec, sa_data, req->sa_mad.sa_hdr.comp_mask);
+ if (!status)
+ join_group(group, req->func, join_mask);
+
+ --group->func[req->func].num_pend_reqs;
+ send_reply_to_slave(req->func, group, &req->sa_mad, status);
+ list_del(&req->group_list);
+ list_del(&req->func_list);
+ kfree(req);
+ ++ref;
+ } else {
+ /* port's membership needs to be updated */
+ group->prev_state = group->state;
+ if (send_join_to_wire(group, &req->sa_mad)) {
+ --group->func[req->func].num_pend_reqs;
+ list_del(&req->group_list);
+ list_del(&req->func_list);
+ kfree(req);
+ ref = 1;
+ group->state = group->prev_state;
+ } else
+ group->state = MCAST_JOIN_SENT;
+ }
+
+ return ref;
+}
+
+static void mlx4_ib_mcg_work_handler(struct work_struct *work)
+{
+ struct mcast_group *group;
+ struct mcast_req *req = NULL;
+ struct ib_sa_mcmember_data *sa_data;
+ u8 req_join_state;
+ int rc = 1; /* release_count - this is for the scheduled work */
+ u16 status;
+ u8 method;
+
+ group = container_of(work, typeof(*group), work);
+
+ mutex_lock(&group->lock);
+
+ /* First, let's see if a response from SM is waiting regarding this group.
+ * If so, we need to update the group's REC. If this is a bad response, we
+ * may need to send a bad response to a VF waiting for it. If a VF is waiting
+ * and this is a good response, the VF will be answered later in this function. */
+ if (group->state == MCAST_RESP_READY) {
+ /* cancels mlx4_ib_mcg_timeout_handler */
+ cancel_delayed_work(&group->timeout_work);
+ status = be16_to_cpu(group->response_sa_mad.mad_hdr.status);
+ method = group->response_sa_mad.mad_hdr.method;
+ if (group->last_req_tid != group->response_sa_mad.mad_hdr.tid) {
+ mcg_warn_group(group, "Got MAD response to existing MGID but wrong TID, dropping. Resp TID=%llx, group TID=%llx\n",
+ be64_to_cpu(group->response_sa_mad.mad_hdr.tid),
+ be64_to_cpu(group->last_req_tid));
+ group->state = group->prev_state;
+ goto process_requests;
+ }
+ if (status) {
+ if (!list_empty(&group->pending_list))
+ req = list_first_entry(&group->pending_list,
+ struct mcast_req, group_list);
+ if (method == IB_MGMT_METHOD_GET_RESP) {
+ if (req) {
+ send_reply_to_slave(req->func, group, &req->sa_mad, status);
+ --group->func[req->func].num_pend_reqs;
+ list_del(&req->group_list);
+ list_del(&req->func_list);
+ kfree(req);
+ ++rc;
+ } else
+ mcg_warn_group(group, "no request for failed join\n");
+ } else if (method == IB_SA_METHOD_DELETE_RESP && group->demux->flushing)
+ ++rc;
+ } else {
+ u8 resp_join_state;
+ u8 cur_join_state;
+
+ resp_join_state = ((struct ib_sa_mcmember_data *)
+ group->response_sa_mad.data)->scope_join_state & 7;
+ cur_join_state = group->rec.scope_join_state & 7;
+
+ if (method == IB_MGMT_METHOD_GET_RESP) {
+ /* successful join */
+ if (!cur_join_state && resp_join_state)
+ --rc;
+ } else if (!resp_join_state)
+ ++rc;
+ memcpy(&group->rec, group->response_sa_mad.data, sizeof group->rec);
+ }
+ group->state = MCAST_IDLE;
+ }
+
+process_requests:
+ /* We should now go over pending join/leave requests, as long as we are idle. */
+ while (!list_empty(&group->pending_list) && group->state == MCAST_IDLE) {
+ req = list_first_entry(&group->pending_list, struct mcast_req,
+ group_list);
+ sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data;
+ req_join_state = sa_data->scope_join_state & 0x7;
+
+ /* For a leave request, we will immediately answer the VF, and
+ * update our internal counters. The actual leave will be sent
+ * to SM later, if at all needed. We dequeue the request now. */
+ if (req->sa_mad.mad_hdr.method == IB_SA_METHOD_DELETE)
+ rc += handle_leave_req(group, req_join_state, req);
+ else
+ rc += handle_join_req(group, req_join_state, req);
+ }
+
+ /* Handle leaves */
+ if (group->state == MCAST_IDLE) {
+ req_join_state = get_leave_state(group);
+ if (req_join_state) {
+ group->rec.scope_join_state &= ~req_join_state;
+ group->prev_state = group->state;
+ if (send_leave_to_wire(group, req_join_state)) {
+ group->state = group->prev_state;
+ ++rc;
+ } else
+ group->state = MCAST_LEAVE_SENT;
+ }
+ }
+
+ if (!list_empty(&group->pending_list) && group->state == MCAST_IDLE)
+ goto process_requests;
+ mutex_unlock(&group->lock);
+
+ while (rc--)
+ release_group(group, 0);
+}
+
+static struct mcast_group *search_relocate_mgid0_group(struct mlx4_ib_demux_ctx *ctx,
+ __be64 tid,
+ union ib_gid *new_mgid)
+{
+ struct mcast_group *group = NULL, *cur_group;
+ struct mcast_req *req;
+ struct list_head *pos;
+ struct list_head *n;
+
+ mutex_lock(&ctx->mcg_table_lock);
+ list_for_each_safe(pos, n, &ctx->mcg_mgid0_list) {
+ group = list_entry(pos, struct mcast_group, mgid0_list);
+ mutex_lock(&group->lock);
+ if (group->last_req_tid == tid) {
+ if (memcmp(new_mgid, &mgid0, sizeof mgid0)) {
+ group->rec.mgid = *new_mgid;
+ sprintf(group->name, "%016llx%016llx",
+ be64_to_cpu(group->rec.mgid.global.subnet_prefix),
+ be64_to_cpu(group->rec.mgid.global.interface_id));
+ list_del_init(&group->mgid0_list);
+ cur_group = mcast_insert(ctx, group);
+ if (cur_group) {
+ /* A race between our code and SM. Silently cleaning the new one */
+ req = list_first_entry(&group->pending_list,
+ struct mcast_req, group_list);
+ --group->func[req->func].num_pend_reqs;
+ list_del(&req->group_list);
+ list_del(&req->func_list);
+ kfree(req);
+ mutex_unlock(&group->lock);
+ mutex_unlock(&ctx->mcg_table_lock);
+ release_group(group, 0);
+ return NULL;
+ }
+
+ atomic_inc(&group->refcount);
+ add_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr);
+ mutex_unlock(&group->lock);
+ mutex_unlock(&ctx->mcg_table_lock);
+ return group;
+ } else {
+ struct mcast_req *tmp1, *tmp2;
+
+ list_del(&group->mgid0_list);
+ if (!list_empty(&group->pending_list) && group->state != MCAST_IDLE)
+ cancel_delayed_work_sync(&group->timeout_work);
+
+ list_for_each_entry_safe(tmp1, tmp2, &group->pending_list, group_list) {
+ list_del(&tmp1->group_list);
+ kfree(tmp1);
+ }
+ mutex_unlock(&group->lock);
+ mutex_unlock(&ctx->mcg_table_lock);
+ kfree(group);
+ return NULL;
+ }
+ }
+ mutex_unlock(&group->lock);
+ }
+ mutex_unlock(&ctx->mcg_table_lock);
+
+ return NULL;
+}
+
+static ssize_t sysfs_show_group(struct device *dev,
+ struct device_attribute *attr, char *buf);
+
+static struct mcast_group *acquire_group(struct mlx4_ib_demux_ctx *ctx,
+ union ib_gid *mgid, int create,
+ gfp_t gfp_mask)
+{
+ struct mcast_group *group, *cur_group;
+ int is_mgid0;
+ int i;
+
+ is_mgid0 = !memcmp(&mgid0, mgid, sizeof mgid0);
+ if (!is_mgid0) {
+ group = mcast_find(ctx, mgid);
+ if (group)
+ goto found;
+ }
+
+ if (!create)
+ return ERR_PTR(-ENOENT);
+
+ group = kzalloc(sizeof *group, gfp_mask);
+ if (!group)
+ return ERR_PTR(-ENOMEM);
+
+ group->demux = ctx;
+ group->rec.mgid = *mgid;
+ INIT_LIST_HEAD(&group->pending_list);
+ INIT_LIST_HEAD(&group->mgid0_list);
+ for (i = 0; i < MAX_VFS; ++i)
+ INIT_LIST_HEAD(&group->func[i].pending);
+ INIT_WORK(&group->work, mlx4_ib_mcg_work_handler);
+ INIT_DELAYED_WORK(&group->timeout_work, mlx4_ib_mcg_timeout_handler);
+ mutex_init(&group->lock);
+ sprintf(group->name, "%016llx%016llx",
+ be64_to_cpu(group->rec.mgid.global.subnet_prefix),
+ be64_to_cpu(group->rec.mgid.global.interface_id));
+ sysfs_attr_init(&group->dentry.attr);
+ group->dentry.show = sysfs_show_group;
+ group->dentry.store = NULL;
+ group->dentry.attr.name = group->name;
+ group->dentry.attr.mode = 0400;
+ group->state = MCAST_IDLE;
+
+ if (is_mgid0) {
+ list_add(&group->mgid0_list, &ctx->mcg_mgid0_list);
+ goto found;
+ }
+
+ cur_group = mcast_insert(ctx, group);
+ if (cur_group) {
+ mcg_warn("group just showed up %s - confused\n", cur_group->name);
+ kfree(group);
+ return ERR_PTR(-EINVAL);
+ }
+
+ add_sysfs_port_mcg_attr(ctx->dev, ctx->port, &group->dentry.attr);
+
+found:
+ atomic_inc(&group->refcount);
+ return group;
+}
+
+static void queue_req(struct mcast_req *req)
+{
+ struct mcast_group *group = req->group;
+
+ atomic_inc(&group->refcount); /* for the request */
+ atomic_inc(&group->refcount); /* for scheduling the work */
+ list_add_tail(&req->group_list, &group->pending_list);
+ list_add_tail(&req->func_list, &group->func[req->func].pending);
+ /* calls mlx4_ib_mcg_work_handler */
+ if (!queue_work(group->demux->mcg_wq, &group->work))
+ safe_atomic_dec(&group->refcount);
+}
+
+int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave,
+ struct ib_sa_mad *mad)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)mad->data;
+ struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1];
+ struct mcast_group *group;
+
+ switch (mad->mad_hdr.method) {
+ case IB_MGMT_METHOD_GET_RESP:
+ case IB_SA_METHOD_DELETE_RESP:
+ mutex_lock(&ctx->mcg_table_lock);
+ group = acquire_group(ctx, &rec->mgid, 0, GFP_KERNEL);
+ mutex_unlock(&ctx->mcg_table_lock);
+ if (IS_ERR(group)) {
+ if (mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP) {
+ __be64 tid = mad->mad_hdr.tid;
+ *(u8 *)(&tid) = (u8)slave; /* in group we kept the modified TID */
+ group = search_relocate_mgid0_group(ctx, tid, &rec->mgid);
+ } else
+ group = NULL;
+ }
+
+ if (!group)
+ return 1;
+
+ mutex_lock(&group->lock);
+ group->response_sa_mad = *mad;
+ group->prev_state = group->state;
+ group->state = MCAST_RESP_READY;
+ /* calls mlx4_ib_mcg_work_handler */
+ atomic_inc(&group->refcount);
+ if (!queue_work(ctx->mcg_wq, &group->work))
+ safe_atomic_dec(&group->refcount);
+ mutex_unlock(&group->lock);
+ release_group(group, 0);
+ return 1; /* consumed */
+ case IB_MGMT_METHOD_SET:
+ case IB_SA_METHOD_GET_TABLE:
+ case IB_SA_METHOD_GET_TABLE_RESP:
+ case IB_SA_METHOD_DELETE:
+ return 0; /* not consumed, pass-through to guest over tunnel */
+ default:
+ mcg_warn("In demux, port %d: unexpected MCMember method: 0x%x, dropping\n",
+ port, mad->mad_hdr.method);
+ return 1; /* consumed */
+ }
+}
+
+int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port,
+ int slave, struct ib_sa_mad *sa_mad)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct ib_sa_mcmember_data *rec = (struct ib_sa_mcmember_data *)sa_mad->data;
+ struct mlx4_ib_demux_ctx *ctx = &dev->sriov.demux[port - 1];
+ struct mcast_group *group;
+ struct mcast_req *req;
+ int may_create = 0;
+
+ if (ctx->flushing)
+ return -EAGAIN;
+
+ switch (sa_mad->mad_hdr.method) {
+ case IB_MGMT_METHOD_SET:
+ may_create = 1;
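+ /* fall through */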
+ case IB_SA_METHOD_DELETE:
+ req = kzalloc(sizeof *req, GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ req->func = slave;
+ req->sa_mad = *sa_mad;
+
+ mutex_lock(&ctx->mcg_table_lock);
+ group = acquire_group(ctx, &rec->mgid, may_create, GFP_KERNEL);
+ mutex_unlock(&ctx->mcg_table_lock);
+ if (IS_ERR(group)) {
+ kfree(req);
+ return PTR_ERR(group);
+ }
+ mutex_lock(&group->lock);
+ if (group->func[slave].num_pend_reqs > MAX_PEND_REQS_PER_FUNC) {
+ mutex_unlock(&group->lock);
+ mcg_warn_group(group, "Port %d, Func %d has too many pending requests (%d), dropping\n",
+ port, slave, MAX_PEND_REQS_PER_FUNC);
+ release_group(group, 0);
+ kfree(req);
+ return -ENOMEM;
+ }
+ ++group->func[slave].num_pend_reqs;
+ req->group = group;
+ queue_req(req);
+ mutex_unlock(&group->lock);
+ release_group(group, 0);
+ return 1; /* consumed */
+ case IB_SA_METHOD_GET_TABLE:
+ case IB_MGMT_METHOD_GET_RESP:
+ case IB_SA_METHOD_GET_TABLE_RESP:
+ case IB_SA_METHOD_DELETE_RESP:
+ return 0; /* not consumed, pass-through */
+ default:
+ mcg_warn("In multiplex, port %d, func %d: unexpected MCMember method: 0x%x, dropping\n",
+ port, slave, sa_mad->mad_hdr.method);
+ return 1; /* consumed */
+ }
+}
+
+static ssize_t sysfs_show_group(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct mcast_group *group =
+ container_of(attr, struct mcast_group, dentry);
+ struct mcast_req *req = NULL;
+ char pending_str[40];
+ char state_str[40];
+ ssize_t len = 0;
+ int f;
+
+ if (group->state == MCAST_IDLE)
+ sprintf(state_str, "%s", get_state_string(group->state));
+ else
+ sprintf(state_str, "%s(TID=0x%llx)",
+ get_state_string(group->state),
+ be64_to_cpu(group->last_req_tid));
+ if (list_empty(&group->pending_list)) {
+ sprintf(pending_str, "No");
+ } else {
+ req = list_first_entry(&group->pending_list, struct mcast_req, group_list);
+ sprintf(pending_str, "Yes(TID=0x%llx)",
+ be64_to_cpu(req->sa_mad.mad_hdr.tid));
+ }
+ len += sprintf(buf + len, "%1d [%02d,%02d,%02d] %4d %4s %5s ",
+ group->rec.scope_join_state & 0xf,
+ group->members[2], group->members[1], group->members[0],
+ atomic_read(&group->refcount),
+ pending_str,
+ state_str);
+ for (f = 0; f < MAX_VFS; ++f)
+ if (group->func[f].state == MCAST_MEMBER)
+ len += sprintf(buf + len, "%d[%1x] ",
+ f, group->func[f].join_state);
+
+ len += sprintf(buf + len, "\t\t(%4hx %4x %2x %2x %2x %2x %2x "
+ "%4x %4x %2x %2x)\n",
+ be16_to_cpu(group->rec.pkey),
+ be32_to_cpu(group->rec.qkey),
+ (group->rec.mtusel_mtu & 0xc0) >> 6,
+ group->rec.mtusel_mtu & 0x3f,
+ group->rec.tclass,
+ (group->rec.ratesel_rate & 0xc0) >> 6,
+ group->rec.ratesel_rate & 0x3f,
+ (be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0xf0000000) >> 28,
+ (be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0x0fffff00) >> 8,
+ be32_to_cpu(group->rec.sl_flowlabel_hoplimit) & 0x000000ff,
+ group->rec.proxy_join);
+
+ return len;
+}
+
+int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx)
+{
+ char name[20];
+
+ atomic_set(&ctx->tid, 0);
+ sprintf(name, "mlx4_ib_mcg%d", ctx->port);
+ ctx->mcg_wq = create_singlethread_workqueue(name);
+ if (!ctx->mcg_wq)
+ return -ENOMEM;
+
+ mutex_init(&ctx->mcg_table_lock);
+ ctx->mcg_table = RB_ROOT;
+ INIT_LIST_HEAD(&ctx->mcg_mgid0_list);
+ ctx->flushing = 0;
+
+ return 0;
+}
+
+static void force_clean_group(struct mcast_group *group)
+{
+ struct mcast_req *req, *tmp;
+
+ list_for_each_entry_safe(req, tmp, &group->pending_list, group_list) {
+ list_del(&req->group_list);
+ kfree(req);
+ }
+ del_sysfs_port_mcg_attr(group->demux->dev, group->demux->port, &group->dentry.attr);
+ rb_erase(&group->node, &group->demux->mcg_table);
+ kfree(group);
+}
+
+static void _mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq)
+{
+ int i;
+ struct rb_node *p;
+ struct mcast_group *group;
+ unsigned long end;
+ int count;
+
+ for (i = 0; i < MAX_VFS; ++i)
+ clean_vf_mcast(ctx, i);
+
+ end = jiffies + msecs_to_jiffies(MAD_TIMEOUT_MS + 3000);
+ do {
+ count = 0;
+ mutex_lock(&ctx->mcg_table_lock);
+ for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p))
+ ++count;
+ mutex_unlock(&ctx->mcg_table_lock);
+ if (!count)
+ break;
+
+ msleep(1);
+ } while (time_after(end, jiffies));
+
+ flush_workqueue(ctx->mcg_wq);
+ if (destroy_wq)
+ destroy_workqueue(ctx->mcg_wq);
+
+ mutex_lock(&ctx->mcg_table_lock);
+ while ((p = rb_first(&ctx->mcg_table)) != NULL) {
+ group = rb_entry(p, struct mcast_group, node);
+ if (atomic_read(&group->refcount))
+ mcg_warn_group(group, "group refcount %d!!! (pointer %p)\n", atomic_read(&group->refcount), group);
+
+ force_clean_group(group);
+ }
+ mutex_unlock(&ctx->mcg_table_lock);
+}
+
+struct clean_work {
+ struct work_struct work;
+ struct mlx4_ib_demux_ctx *ctx;
+ int destroy_wq;
+};
+
+static void mcg_clean_task(struct work_struct *work)
+{
+ struct clean_work *cw = container_of(work, struct clean_work, work);
+
+ _mlx4_ib_mcg_port_cleanup(cw->ctx, cw->destroy_wq);
+ cw->ctx->flushing = 0;
+ kfree(cw);
+}
+
+void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq)
+{
+ struct clean_work *work;
+
+ if (ctx->flushing)
+ return;
+
+ ctx->flushing = 1;
+
+ if (destroy_wq) {
+ _mlx4_ib_mcg_port_cleanup(ctx, destroy_wq);
+ ctx->flushing = 0;
+ return;
+ }
+
+ work = kmalloc(sizeof *work, GFP_KERNEL);
+ if (!work) {
+ ctx->flushing = 0;
+ mcg_warn("failed allocating work for cleanup\n");
+ return;
+ }
+
+ work->ctx = ctx;
+ work->destroy_wq = destroy_wq;
+ INIT_WORK(&work->work, mcg_clean_task);
+ queue_work(clean_wq, &work->work);
+}
+
+static void build_leave_mad(struct mcast_req *req)
+{
+ struct ib_sa_mad *mad = &req->sa_mad;
+
+ mad->mad_hdr.method = IB_SA_METHOD_DELETE;
+}
+
+
+static void clear_pending_reqs(struct mcast_group *group, int vf)
+{
+ struct mcast_req *req, *tmp, *group_first = NULL;
+ int clear;
+ int pend = 0;
+
+ if (!list_empty(&group->pending_list))
+ group_first = list_first_entry(&group->pending_list, struct mcast_req, group_list);
+
+ list_for_each_entry_safe(req, tmp, &group->func[vf].pending, func_list) {
+ clear = 1;
+ if (group_first == req &&
+ (group->state == MCAST_JOIN_SENT ||
+ group->state == MCAST_LEAVE_SENT)) {
+ clear = cancel_delayed_work(&group->timeout_work);
+ pend = !clear;
+ group->state = MCAST_IDLE;
+ }
+ if (clear) {
+ --group->func[vf].num_pend_reqs;
+ list_del(&req->group_list);
+ list_del(&req->func_list);
+ kfree(req);
+ atomic_dec(&group->refcount);
+ }
+ }
+
+ if (!pend && (!list_empty(&group->func[vf].pending) || group->func[vf].num_pend_reqs)) {
+ mcg_warn_group(group, "DRIVER BUG: list_empty %d, num_pend_reqs %d\n",
+ list_empty(&group->func[vf].pending), group->func[vf].num_pend_reqs);
+ }
+}
+
+static int push_deleteing_req(struct mcast_group *group, int slave)
+{
+ struct mcast_req *req;
+ struct mcast_req *pend_req;
+
+ if (!group->func[slave].join_state)
+ return 0;
+
+ req = kzalloc(sizeof *req, GFP_KERNEL);
+ if (!req) {
+ mcg_warn_group(group, "failed allocation - may leave stale groups\n");
+ return -ENOMEM;
+ }
+
+ if (!list_empty(&group->func[slave].pending)) {
+ pend_req = list_entry(group->func[slave].pending.prev, struct mcast_req, group_list);
+ if (pend_req->clean) {
+ kfree(req);
+ return 0;
+ }
+ }
+
+ req->clean = 1;
+ req->func = slave;
+ req->group = group;
+ ++group->func[slave].num_pend_reqs;
+ build_leave_mad(req);
+ queue_req(req);
+ return 0;
+}
+
+void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave)
+{
+ struct mcast_group *group;
+ struct rb_node *p;
+
+ mutex_lock(&ctx->mcg_table_lock);
+ for (p = rb_first(&ctx->mcg_table); p; p = rb_next(p)) {
+ group = rb_entry(p, struct mcast_group, node);
+ mutex_lock(&group->lock);
+ if (atomic_read(&group->refcount)) {
+ /* clear pending requests of this VF */
+ clear_pending_reqs(group, slave);
+ push_deleteing_req(group, slave);
+ }
+ mutex_unlock(&group->lock);
+ }
+ mutex_unlock(&ctx->mcg_table_lock);
+}
+
+
+int mlx4_ib_mcg_init(void)
+{
+ clean_wq = create_singlethread_workqueue("mlx4_ib_mcg");
+ if (!clean_wq)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void mlx4_ib_mcg_destroy(void)
+{
+ destroy_workqueue(clean_wq);
+}
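
mcast_find() and mcast_insert() above keep the per-port multicast groups in a
red-black tree keyed by the raw MGID bytes, using the standard kernel rb-tree
idiom (open-coded lookup/insert with the caller holding mcg_table_lock). A
self-contained sketch of that idiom follows; the gid_node type and function
names are illustrative assumptions, not part of the driver:

#include <linux/rbtree.h>
#include <linux/string.h>
#include <linux/types.h>

struct gid_node {
	struct rb_node node;
	u8 key[16];		/* e.g. an MGID in raw form */
};

static struct gid_node *gid_tree_find(struct rb_root *root, const u8 *key)
{
	struct rb_node *n = root->rb_node;

	while (n) {
		struct gid_node *e = rb_entry(n, struct gid_node, node);
		int cmp = memcmp(key, e->key, sizeof(e->key));

		if (!cmp)
			return e;
		n = cmp < 0 ? n->rb_left : n->rb_right;
	}
	return NULL;
}

/* Returns the clashing entry if the key already exists, NULL on success. */
static struct gid_node *gid_tree_insert(struct rb_root *root,
					struct gid_node *new)
{
	struct rb_node **link = &root->rb_node, *parent = NULL;

	while (*link) {
		struct gid_node *e;
		int cmp;

		parent = *link;
		e = rb_entry(parent, struct gid_node, node);
		cmp = memcmp(new->key, e->key, sizeof(e->key));
		if (!cmp)
			return e;
		link = cmp < 0 ? &(*link)->rb_left : &(*link)->rb_right;
	}
	rb_link_node(&new->node, parent, link);
	rb_insert_color(&new->node, root);
	return NULL;
}

Like mcast_insert(), the insert helper returns the clashing entry instead of
failing silently, so the caller can decide whether a duplicate means a race
with the SM (as in search_relocate_mgid0_group()) or a driver bug (as in
acquire_group()).
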
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
new file mode 100644
index 000000000..fce393437
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -0,0 +1,819 @@
+/*
+ * Copyright (c) 2006, 2007 Cisco Systems. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_IB_H
+#define MLX4_IB_H
+
+#include <linux/compiler.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/idr.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_mad.h>
+#include <rdma/ib_sa.h>
+
+#include <linux/mlx4/device.h>
+#include <linux/mlx4/doorbell.h>
+
+#define MLX4_IB_DRV_NAME "mlx4_ib"
+
+#ifdef pr_fmt
+#undef pr_fmt
+#endif
+#define pr_fmt(fmt) "<" MLX4_IB_DRV_NAME "> %s: " fmt, __func__
+
+#define mlx4_ib_warn(ibdev, format, arg...) \
+ dev_warn((ibdev)->dma_device, MLX4_IB_DRV_NAME ": " format, ## arg)
+
+enum {
+ MLX4_IB_SQ_MIN_WQE_SHIFT = 6,
+ MLX4_IB_MAX_HEADROOM = 2048
+};
+
+#define MLX4_IB_SQ_HEADROOM(shift) ((MLX4_IB_MAX_HEADROOM >> (shift)) + 1)
+#define MLX4_IB_SQ_MAX_SPARE (MLX4_IB_SQ_HEADROOM(MLX4_IB_SQ_MIN_WQE_SHIFT))
+
+/* module param to indicate if the SM assigns the alias_GUID */
+extern int mlx4_ib_sm_guid_assign;
+
+#define MLX4_IB_UC_STEER_QPN_ALIGN 1
+#define MLX4_IB_UC_MAX_NUM_QPS 256
+struct mlx4_ib_ucontext {
+ struct ib_ucontext ibucontext;
+ struct mlx4_uar uar;
+ struct list_head db_page_list;
+ struct mutex db_page_mutex;
+};
+
+struct mlx4_ib_pd {
+ struct ib_pd ibpd;
+ u32 pdn;
+};
+
+struct mlx4_ib_xrcd {
+ struct ib_xrcd ibxrcd;
+ u32 xrcdn;
+ struct ib_pd *pd;
+ struct ib_cq *cq;
+};
+
+struct mlx4_ib_cq_buf {
+ struct mlx4_buf buf;
+ struct mlx4_mtt mtt;
+ int entry_size;
+};
+
+struct mlx4_ib_cq_resize {
+ struct mlx4_ib_cq_buf buf;
+ int cqe;
+};
+
+struct mlx4_ib_cq {
+ struct ib_cq ibcq;
+ struct mlx4_cq mcq;
+ struct mlx4_ib_cq_buf buf;
+ struct mlx4_ib_cq_resize *resize_buf;
+ struct mlx4_db db;
+ spinlock_t lock;
+ struct mutex resize_mutex;
+ struct ib_umem *umem;
+ struct ib_umem *resize_umem;
+ /* List of QPs that this CQ serves. */
+ struct list_head send_qp_list;
+ struct list_head recv_qp_list;
+};
+
+struct mlx4_ib_mr {
+ struct ib_mr ibmr;
+ struct mlx4_mr mmr;
+ struct ib_umem *umem;
+};
+
+struct mlx4_ib_mw {
+ struct ib_mw ibmw;
+ struct mlx4_mw mmw;
+};
+
+struct mlx4_ib_fast_reg_page_list {
+ struct ib_fast_reg_page_list ibfrpl;
+ __be64 *mapped_page_list;
+ dma_addr_t map;
+};
+
+struct mlx4_ib_fmr {
+ struct ib_fmr ibfmr;
+ struct mlx4_fmr mfmr;
+};
+
+#define MAX_REGS_PER_FLOW 2
+
+struct mlx4_flow_reg_id {
+ u64 id;
+ u64 mirror;
+};
+
+struct mlx4_ib_flow {
+ struct ib_flow ibflow;
+ /* translating DMFS verbs sniffer rule to FW API requires two reg IDs */
+ struct mlx4_flow_reg_id reg_id[MAX_REGS_PER_FLOW];
+};
+
+struct mlx4_ib_wq {
+ u64 *wrid;
+ spinlock_t lock;
+ int wqe_cnt;
+ int max_post;
+ int max_gs;
+ int offset;
+ int wqe_shift;
+ unsigned head;
+ unsigned tail;
+};
+
+enum mlx4_ib_qp_flags {
+ MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO,
+ MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK,
+ MLX4_IB_QP_NETIF = IB_QP_CREATE_NETIF_QP,
+ MLX4_IB_QP_CREATE_USE_GFP_NOIO = IB_QP_CREATE_USE_GFP_NOIO,
+ MLX4_IB_SRIOV_TUNNEL_QP = 1 << 30,
+ MLX4_IB_SRIOV_SQP = 1 << 31,
+};
+
+struct mlx4_ib_gid_entry {
+ struct list_head list;
+ union ib_gid gid;
+ int added;
+ u8 port;
+};
+
+enum mlx4_ib_qp_type {
+ /*
+ * IB_QPT_SMI and IB_QPT_GSI have to be the first two entries
+ * here (and in that order) since the MAD layer uses them as
+ * indices into a 2-entry table.
+ */
+ MLX4_IB_QPT_SMI = IB_QPT_SMI,
+ MLX4_IB_QPT_GSI = IB_QPT_GSI,
+
+ MLX4_IB_QPT_RC = IB_QPT_RC,
+ MLX4_IB_QPT_UC = IB_QPT_UC,
+ MLX4_IB_QPT_UD = IB_QPT_UD,
+ MLX4_IB_QPT_RAW_IPV6 = IB_QPT_RAW_IPV6,
+ MLX4_IB_QPT_RAW_ETHERTYPE = IB_QPT_RAW_ETHERTYPE,
+ MLX4_IB_QPT_RAW_PACKET = IB_QPT_RAW_PACKET,
+ MLX4_IB_QPT_XRC_INI = IB_QPT_XRC_INI,
+ MLX4_IB_QPT_XRC_TGT = IB_QPT_XRC_TGT,
+
+ MLX4_IB_QPT_PROXY_SMI_OWNER = 1 << 16,
+ MLX4_IB_QPT_PROXY_SMI = 1 << 17,
+ MLX4_IB_QPT_PROXY_GSI = 1 << 18,
+ MLX4_IB_QPT_TUN_SMI_OWNER = 1 << 19,
+ MLX4_IB_QPT_TUN_SMI = 1 << 20,
+ MLX4_IB_QPT_TUN_GSI = 1 << 21,
+};
+
+#define MLX4_IB_QPT_ANY_SRIOV (MLX4_IB_QPT_PROXY_SMI_OWNER | \
+ MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER | \
+ MLX4_IB_QPT_TUN_SMI | MLX4_IB_QPT_TUN_GSI)
+
+enum mlx4_ib_mad_ifc_flags {
+ MLX4_MAD_IFC_IGNORE_MKEY = 1,
+ MLX4_MAD_IFC_IGNORE_BKEY = 2,
+ MLX4_MAD_IFC_IGNORE_KEYS = (MLX4_MAD_IFC_IGNORE_MKEY |
+ MLX4_MAD_IFC_IGNORE_BKEY),
+ MLX4_MAD_IFC_NET_VIEW = 4,
+};
+
+enum {
+ MLX4_NUM_TUNNEL_BUFS = 256,
+};
+
+struct mlx4_ib_tunnel_header {
+ struct mlx4_av av;
+ __be32 remote_qpn;
+ __be32 qkey;
+ __be16 vlan;
+ u8 mac[6];
+ __be16 pkey_index;
+ u8 reserved[6];
+};
+
+struct mlx4_ib_buf {
+ void *addr;
+ dma_addr_t map;
+};
+
+struct mlx4_rcv_tunnel_hdr {
+ __be32 flags_src_qp; /* flags[6:5] is defined for VLANs:
+ * 0x0 - no vlan was in the packet
+ * 0x01 - C-VLAN was in the packet */
+ u8 g_ml_path; /* gid bit stands for ipv6/4 header in RoCE */
+ u8 reserved;
+ __be16 pkey_index;
+ __be16 sl_vid;
+ __be16 slid_mac_47_32;
+ __be32 mac_31_0;
+};
+
+struct mlx4_ib_proxy_sqp_hdr {
+ struct ib_grh grh;
+ struct mlx4_rcv_tunnel_hdr tun;
+} __packed;
+
+struct mlx4_roce_smac_vlan_info {
+ u64 smac;
+ int smac_index;
+ int smac_port;
+ u64 candidate_smac;
+ int candidate_smac_index;
+ int candidate_smac_port;
+ u16 vid;
+ int vlan_index;
+ int vlan_port;
+ u16 candidate_vid;
+ int candidate_vlan_index;
+ int candidate_vlan_port;
+ int update_vid;
+};
+
+struct mlx4_ib_qp {
+ struct ib_qp ibqp;
+ struct mlx4_qp mqp;
+ struct mlx4_buf buf;
+
+ struct mlx4_db db;
+ struct mlx4_ib_wq rq;
+
+ u32 doorbell_qpn;
+ __be32 sq_signal_bits;
+ unsigned sq_next_wqe;
+ int sq_max_wqes_per_wr;
+ int sq_spare_wqes;
+ struct mlx4_ib_wq sq;
+
+ enum mlx4_ib_qp_type mlx4_ib_qp_type;
+ struct ib_umem *umem;
+ struct mlx4_mtt mtt;
+ int buf_size;
+ struct mutex mutex;
+ u16 xrcdn;
+ u32 flags;
+ u8 port;
+ u8 alt_port;
+ u8 atomic_rd_en;
+ u8 resp_depth;
+ u8 sq_no_prefetch;
+ u8 state;
+ int mlx_type;
+ struct list_head gid_list;
+ struct list_head steering_rules;
+ struct mlx4_ib_buf *sqp_proxy_rcv;
+ struct mlx4_roce_smac_vlan_info pri;
+ struct mlx4_roce_smac_vlan_info alt;
+ u64 reg_id;
+ struct list_head qps_list;
+ struct list_head cq_recv_list;
+ struct list_head cq_send_list;
+};
+
+struct mlx4_ib_srq {
+ struct ib_srq ibsrq;
+ struct mlx4_srq msrq;
+ struct mlx4_buf buf;
+ struct mlx4_db db;
+ u64 *wrid;
+ spinlock_t lock;
+ int head;
+ int tail;
+ u16 wqe_ctr;
+ struct ib_umem *umem;
+ struct mlx4_mtt mtt;
+ struct mutex mutex;
+};
+
+struct mlx4_ib_ah {
+ struct ib_ah ibah;
+ union mlx4_ext_av av;
+};
+
+/****************************************/
+/* alias guid support */
+/****************************************/
+#define NUM_PORT_ALIAS_GUID 2
+#define NUM_ALIAS_GUID_IN_REC 8
+#define NUM_ALIAS_GUID_REC_IN_PORT 16
+#define GUID_REC_SIZE 8
+#define NUM_ALIAS_GUID_PER_PORT 128
+#define MLX4_NOT_SET_GUID (0x00LL)
+#define MLX4_GUID_FOR_DELETE_VAL (~(0x00LL))
+
+enum mlx4_guid_alias_rec_status {
+ MLX4_GUID_INFO_STATUS_IDLE,
+ MLX4_GUID_INFO_STATUS_SET,
+};
+
+#define GUID_STATE_NEED_PORT_INIT 0x01
+
+enum mlx4_guid_alias_rec_method {
+ MLX4_GUID_INFO_RECORD_SET = IB_MGMT_METHOD_SET,
+ MLX4_GUID_INFO_RECORD_DELETE = IB_SA_METHOD_DELETE,
+};
+
+struct mlx4_sriov_alias_guid_info_rec_det {
+ u8 all_recs[GUID_REC_SIZE * NUM_ALIAS_GUID_IN_REC];
+ ib_sa_comp_mask guid_indexes; /* indicates which of the 8 records are valid */
+ enum mlx4_guid_alias_rec_status status; /* indicates the administrative status of the record */
+ unsigned int guids_retry_schedule[NUM_ALIAS_GUID_IN_REC];
+ u64 time_to_run;
+};
+
+struct mlx4_sriov_alias_guid_port_rec_det {
+ struct mlx4_sriov_alias_guid_info_rec_det all_rec_per_port[NUM_ALIAS_GUID_REC_IN_PORT];
+ struct workqueue_struct *wq;
+ struct delayed_work alias_guid_work;
+ u8 port;
+ u32 state_flags;
+ struct mlx4_sriov_alias_guid *parent;
+ struct list_head cb_list;
+};
+
+struct mlx4_sriov_alias_guid {
+ struct mlx4_sriov_alias_guid_port_rec_det ports_guid[MLX4_MAX_PORTS];
+ spinlock_t ag_work_lock;
+ struct ib_sa_client *sa_client;
+};
+
+struct mlx4_ib_demux_work {
+ struct work_struct work;
+ struct mlx4_ib_dev *dev;
+ int slave;
+ int do_init;
+ u8 port;
+};
+
+struct mlx4_ib_tun_tx_buf {
+ struct mlx4_ib_buf buf;
+ struct ib_ah *ah;
+};
+
+struct mlx4_ib_demux_pv_qp {
+ struct ib_qp *qp;
+ enum ib_qp_type proxy_qpt;
+ struct mlx4_ib_buf *ring;
+ struct mlx4_ib_tun_tx_buf *tx_ring;
+ spinlock_t tx_lock;
+ unsigned tx_ix_head;
+ unsigned tx_ix_tail;
+};
+
+enum mlx4_ib_demux_pv_state {
+ DEMUX_PV_STATE_DOWN,
+ DEMUX_PV_STATE_STARTING,
+ DEMUX_PV_STATE_ACTIVE,
+ DEMUX_PV_STATE_DOWNING,
+};
+
+struct mlx4_ib_demux_pv_ctx {
+ int port;
+ int slave;
+ enum mlx4_ib_demux_pv_state state;
+ int has_smi;
+ struct ib_device *ib_dev;
+ struct ib_cq *cq;
+ struct ib_pd *pd;
+ struct ib_mr *mr;
+ struct work_struct work;
+ struct workqueue_struct *wq;
+ struct mlx4_ib_demux_pv_qp qp[2];
+};
+
+struct mlx4_ib_demux_ctx {
+ struct ib_device *ib_dev;
+ int port;
+ struct workqueue_struct *wq;
+ struct workqueue_struct *ud_wq;
+ spinlock_t ud_lock;
+ __be64 subnet_prefix;
+ __be64 guid_cache[128];
+ struct mlx4_ib_dev *dev;
+ /* the following lock protects both mcg_table and mcg_mgid0_list */
+ struct mutex mcg_table_lock;
+ struct rb_root mcg_table;
+ struct list_head mcg_mgid0_list;
+ struct workqueue_struct *mcg_wq;
+ struct mlx4_ib_demux_pv_ctx **tun;
+ atomic_t tid;
+ int flushing; /* flushing the work queue */
+};
+
+struct mlx4_ib_sriov {
+ struct mlx4_ib_demux_ctx demux[MLX4_MAX_PORTS];
+ struct mlx4_ib_demux_pv_ctx *sqps[MLX4_MAX_PORTS];
+ /* when using this spinlock you should use the "irq" variants,
+ * because it may be taken from interrupt context. */
+ spinlock_t going_down_lock;
+ int is_going_down;
+
+ struct mlx4_sriov_alias_guid alias_guid;
+
+ /* CM paravirtualization fields */
+ struct list_head cm_list;
+ spinlock_t id_map_lock;
+ struct rb_root sl_id_map;
+ struct idr pv_id_table;
+};
+
+struct mlx4_ib_iboe {
+ spinlock_t lock;
+ struct net_device *netdevs[MLX4_MAX_PORTS];
+ struct net_device *masters[MLX4_MAX_PORTS];
+ atomic64_t mac[MLX4_MAX_PORTS];
+ struct notifier_block nb;
+ struct notifier_block nb_inet;
+ struct notifier_block nb_inet6;
+ union ib_gid gid_table[MLX4_MAX_PORTS][128];
+};
+
+struct pkey_mgt {
+ u8 virt2phys_pkey[MLX4_MFUNC_MAX][MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS];
+ u16 phys_pkey_cache[MLX4_MAX_PORTS][MLX4_MAX_PORT_PKEYS];
+ struct list_head pkey_port_list[MLX4_MFUNC_MAX];
+ struct kobject *device_parent[MLX4_MFUNC_MAX];
+};
+
+struct mlx4_ib_iov_sysfs_attr {
+ void *ctx;
+ struct kobject *kobj;
+ unsigned long data;
+ u32 entry_num;
+ char name[15];
+ struct device_attribute dentry;
+ struct device *dev;
+};
+
+struct mlx4_ib_iov_sysfs_attr_ar {
+ struct mlx4_ib_iov_sysfs_attr dentries[3 * NUM_ALIAS_GUID_PER_PORT + 1];
+};
+
+struct mlx4_ib_iov_port {
+ char name[100];
+ u8 num;
+ struct mlx4_ib_dev *dev;
+ struct list_head list;
+ struct mlx4_ib_iov_sysfs_attr_ar *dentr_ar;
+ struct ib_port_attr attr;
+ struct kobject *cur_port;
+ struct kobject *admin_alias_parent;
+ struct kobject *gids_parent;
+ struct kobject *pkeys_parent;
+ struct kobject *mcgs_parent;
+ struct mlx4_ib_iov_sysfs_attr mcg_dentry;
+};
+
+struct mlx4_ib_dev {
+ struct ib_device ib_dev;
+ struct mlx4_dev *dev;
+ int num_ports;
+ void __iomem *uar_map;
+
+ struct mlx4_uar priv_uar;
+ u32 priv_pdn;
+ MLX4_DECLARE_DOORBELL_LOCK(uar_lock);
+
+ struct ib_mad_agent *send_agent[MLX4_MAX_PORTS][2];
+ struct ib_ah *sm_ah[MLX4_MAX_PORTS];
+ spinlock_t sm_lock;
+ struct mlx4_ib_sriov sriov;
+
+ struct mutex cap_mask_mutex;
+ bool ib_active;
+ struct mlx4_ib_iboe iboe;
+ int counters[MLX4_MAX_PORTS];
+ int *eq_table;
+ int eq_added;
+ struct kobject *iov_parent;
+ struct kobject *ports_parent;
+ struct kobject *dev_ports_parent[MLX4_MFUNC_MAX];
+ struct mlx4_ib_iov_port iov_ports[MLX4_MAX_PORTS];
+ struct pkey_mgt pkeys;
+ unsigned long *ib_uc_qpns_bitmap;
+ int steer_qpn_count;
+ int steer_qpn_base;
+ int steering_support;
+ struct mlx4_ib_qp *qp1_proxy[MLX4_MAX_PORTS];
+ /* lock when destroying qp1_proxy and getting netdev events */
+ struct mutex qp1_proxy_lock[MLX4_MAX_PORTS];
+ u8 bond_next_port;
+ /* protect resources needed as part of reset flow */
+ spinlock_t reset_flow_resource_lock;
+ struct list_head qp_list;
+};
+
+struct ib_event_work {
+ struct work_struct work;
+ struct mlx4_ib_dev *ib_dev;
+ struct mlx4_eqe ib_eqe;
+};
+
+struct mlx4_ib_qp_tunnel_init_attr {
+ struct ib_qp_init_attr init_attr;
+ int slave;
+ enum ib_qp_type proxy_qp_type;
+ u8 port;
+};
+
+static inline struct mlx4_ib_dev *to_mdev(struct ib_device *ibdev)
+{
+ return container_of(ibdev, struct mlx4_ib_dev, ib_dev);
+}
+
+static inline struct mlx4_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
+{
+ return container_of(ibucontext, struct mlx4_ib_ucontext, ibucontext);
+}
+
+static inline struct mlx4_ib_pd *to_mpd(struct ib_pd *ibpd)
+{
+ return container_of(ibpd, struct mlx4_ib_pd, ibpd);
+}
+
+static inline struct mlx4_ib_xrcd *to_mxrcd(struct ib_xrcd *ibxrcd)
+{
+ return container_of(ibxrcd, struct mlx4_ib_xrcd, ibxrcd);
+}
+
+static inline struct mlx4_ib_cq *to_mcq(struct ib_cq *ibcq)
+{
+ return container_of(ibcq, struct mlx4_ib_cq, ibcq);
+}
+
+static inline struct mlx4_ib_cq *to_mibcq(struct mlx4_cq *mcq)
+{
+ return container_of(mcq, struct mlx4_ib_cq, mcq);
+}
+
+static inline struct mlx4_ib_mr *to_mmr(struct ib_mr *ibmr)
+{
+ return container_of(ibmr, struct mlx4_ib_mr, ibmr);
+}
+
+static inline struct mlx4_ib_mw *to_mmw(struct ib_mw *ibmw)
+{
+ return container_of(ibmw, struct mlx4_ib_mw, ibmw);
+}
+
+static inline struct mlx4_ib_fast_reg_page_list *to_mfrpl(struct ib_fast_reg_page_list *ibfrpl)
+{
+ return container_of(ibfrpl, struct mlx4_ib_fast_reg_page_list, ibfrpl);
+}
+
+static inline struct mlx4_ib_fmr *to_mfmr(struct ib_fmr *ibfmr)
+{
+ return container_of(ibfmr, struct mlx4_ib_fmr, ibfmr);
+}
+
+static inline struct mlx4_ib_flow *to_mflow(struct ib_flow *ibflow)
+{
+ return container_of(ibflow, struct mlx4_ib_flow, ibflow);
+}
+
+static inline struct mlx4_ib_qp *to_mqp(struct ib_qp *ibqp)
+{
+ return container_of(ibqp, struct mlx4_ib_qp, ibqp);
+}
+
+static inline struct mlx4_ib_qp *to_mibqp(struct mlx4_qp *mqp)
+{
+ return container_of(mqp, struct mlx4_ib_qp, mqp);
+}
+
+static inline struct mlx4_ib_srq *to_msrq(struct ib_srq *ibsrq)
+{
+ return container_of(ibsrq, struct mlx4_ib_srq, ibsrq);
+}
+
+static inline struct mlx4_ib_srq *to_mibsrq(struct mlx4_srq *msrq)
+{
+ return container_of(msrq, struct mlx4_ib_srq, msrq);
+}
+
+static inline struct mlx4_ib_ah *to_mah(struct ib_ah *ibah)
+{
+ return container_of(ibah, struct mlx4_ib_ah, ibah);
+}
+
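+/* Round-robin over the device's ports; returns a 1-based port number. */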
+static inline u8 mlx4_ib_bond_next_port(struct mlx4_ib_dev *dev)
+{
+ dev->bond_next_port = (dev->bond_next_port + 1) % dev->num_ports;
+
+ return dev->bond_next_port + 1;
+}
+
+int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev);
+void mlx4_ib_close_sriov(struct mlx4_ib_dev *dev);
+
+int mlx4_ib_db_map_user(struct mlx4_ib_ucontext *context, unsigned long virt,
+ struct mlx4_db *db);
+void mlx4_ib_db_unmap_user(struct mlx4_ib_ucontext *context, struct mlx4_db *db);
+
+struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc);
+int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
+ struct ib_umem *umem);
+struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt_addr, int access_flags,
+ struct ib_udata *udata);
+int mlx4_ib_dereg_mr(struct ib_mr *mr);
+struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type);
+int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
+ struct ib_mw_bind *mw_bind);
+int mlx4_ib_dealloc_mw(struct ib_mw *mw);
+struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd,
+ int max_page_list_len);
+struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
+ int page_list_len);
+void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list);
+
+int mlx4_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
+int mlx4_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata);
+struct ib_cq *mlx4_ib_create_cq(struct ib_device *ibdev, int entries, int vector,
+ struct ib_ucontext *context,
+ struct ib_udata *udata);
+int mlx4_ib_destroy_cq(struct ib_cq *cq);
+int mlx4_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
+int mlx4_ib_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
+void __mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq);
+void mlx4_ib_cq_clean(struct mlx4_ib_cq *cq, u32 qpn, struct mlx4_ib_srq *srq);
+
+struct ib_ah *mlx4_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
+int mlx4_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr);
+int mlx4_ib_destroy_ah(struct ib_ah *ah);
+
+struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
+ struct ib_srq_init_attr *init_attr,
+ struct ib_udata *udata);
+int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
+int mlx4_ib_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
+int mlx4_ib_destroy_srq(struct ib_srq *srq);
+void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index);
+int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr);
+
+struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata);
+int mlx4_ib_destroy_qp(struct ib_qp *qp);
+int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata);
+int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr);
+int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr);
+int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr);
+
+int mlx4_MAD_IFC(struct mlx4_ib_dev *dev, int mad_ifc_flags,
+ int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
+ void *in_mad, void *response_mad);
+int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+ struct ib_wc *in_wc, struct ib_grh *in_grh,
+ struct ib_mad *in_mad, struct ib_mad *out_mad);
+int mlx4_ib_mad_init(struct mlx4_ib_dev *dev);
+void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev);
+
+struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr);
+int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, int npages,
+ u64 iova);
+int mlx4_ib_unmap_fmr(struct list_head *fmr_list);
+int mlx4_ib_fmr_dealloc(struct ib_fmr *fmr);
+int __mlx4_ib_query_port(struct ib_device *ibdev, u8 port,
+ struct ib_port_attr *props, int netw_view);
+int __mlx4_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+ u16 *pkey, int netw_view);
+
+int __mlx4_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+ union ib_gid *gid, int netw_view);
+
+static inline bool mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah)
+{
+ u8 port = be32_to_cpu(ah->av.ib.port_pd) >> 24 & 3;
+
+ if (rdma_port_get_link_layer(ah->ibah.device, port) == IB_LINK_LAYER_ETHERNET)
+ return true;
+
+ return !!(ah->av.ib.g_slid & 0x80);
+}
+
+int mlx4_ib_mcg_port_init(struct mlx4_ib_demux_ctx *ctx);
+void mlx4_ib_mcg_port_cleanup(struct mlx4_ib_demux_ctx *ctx, int destroy_wq);
+void clean_vf_mcast(struct mlx4_ib_demux_ctx *ctx, int slave);
+int mlx4_ib_mcg_init(void);
+void mlx4_ib_mcg_destroy(void);
+
+int mlx4_ib_find_real_gid(struct ib_device *ibdev, u8 port, __be64 guid);
+
+int mlx4_ib_mcg_multiplex_handler(struct ib_device *ibdev, int port, int slave,
+ struct ib_sa_mad *sa_mad);
+int mlx4_ib_mcg_demux_handler(struct ib_device *ibdev, int port, int slave,
+ struct ib_sa_mad *mad);
+
+int mlx4_ib_add_mc(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
+ union ib_gid *gid);
+
+void mlx4_ib_dispatch_event(struct mlx4_ib_dev *dev, u8 port_num,
+ enum ib_event_type type);
+
+void mlx4_ib_tunnels_update_work(struct work_struct *work);
+
+int mlx4_ib_send_to_slave(struct mlx4_ib_dev *dev, int slave, u8 port,
+ enum ib_qp_type qpt, struct ib_wc *wc,
+ struct ib_grh *grh, struct ib_mad *mad);
+
+int mlx4_ib_send_to_wire(struct mlx4_ib_dev *dev, int slave, u8 port,
+ enum ib_qp_type dest_qpt, u16 pkey_index, u32 remote_qpn,
+ u32 qkey, struct ib_ah_attr *attr, u8 *s_mac,
+ struct ib_mad *mad);
+
+__be64 mlx4_ib_get_new_demux_tid(struct mlx4_ib_demux_ctx *ctx);
+
+int mlx4_ib_demux_cm_handler(struct ib_device *ibdev, int port, int *slave,
+ struct ib_mad *mad);
+
+int mlx4_ib_multiplex_cm_handler(struct ib_device *ibdev, int port, int slave_id,
+ struct ib_mad *mad);
+
+void mlx4_ib_cm_paravirt_init(struct mlx4_ib_dev *dev);
+void mlx4_ib_cm_paravirt_clean(struct mlx4_ib_dev *dev, int slave_id);
+
+/* alias guid support */
+void mlx4_ib_init_alias_guid_work(struct mlx4_ib_dev *dev, int port);
+int mlx4_ib_init_alias_guid_service(struct mlx4_ib_dev *dev);
+void mlx4_ib_destroy_alias_guid_service(struct mlx4_ib_dev *dev);
+void mlx4_ib_invalidate_all_guid_record(struct mlx4_ib_dev *dev, int port);
+
+void mlx4_ib_notify_slaves_on_guid_change(struct mlx4_ib_dev *dev,
+ int block_num,
+ u8 port_num, u8 *p_data);
+
+void mlx4_ib_update_cache_on_guid_change(struct mlx4_ib_dev *dev,
+ int block_num, u8 port_num,
+ u8 *p_data);
+
+int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num,
+ struct attribute *attr);
+void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num,
+ struct attribute *attr);
+ib_sa_comp_mask mlx4_ib_get_aguid_comp_mask_from_ix(int index);
+void mlx4_ib_slave_alias_guid_event(struct mlx4_ib_dev *dev, int slave,
+ int port, int slave_init);
+
+int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *device);
+
+void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device);
+
+__be64 mlx4_ib_gen_node_guid(void);
+
+int mlx4_ib_steer_qp_alloc(struct mlx4_ib_dev *dev, int count, int *qpn);
+void mlx4_ib_steer_qp_free(struct mlx4_ib_dev *dev, u32 qpn, int count);
+int mlx4_ib_steer_qp_reg(struct mlx4_ib_dev *mdev, struct mlx4_ib_qp *mqp,
+ int is_attach);
+int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
+ u64 start, u64 length, u64 virt_addr,
+ int mr_access_flags, struct ib_pd *pd,
+ struct ib_udata *udata);
+
+#endif /* MLX4_IB_H */
diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
new file mode 100644
index 000000000..e0d271782
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -0,0 +1,525 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+
+#include "mlx4_ib.h"
+
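+/* Translate IB access flags into mlx4 MPT permission bits; local read access
+ * is always granted, e.g. IB_ACCESS_REMOTE_READ alone becomes
+ * MLX4_PERM_REMOTE_READ | MLX4_PERM_LOCAL_READ. */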
+static u32 convert_access(int acc)
+{
+ return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX4_PERM_ATOMIC : 0) |
+ (acc & IB_ACCESS_REMOTE_WRITE ? MLX4_PERM_REMOTE_WRITE : 0) |
+ (acc & IB_ACCESS_REMOTE_READ ? MLX4_PERM_REMOTE_READ : 0) |
+ (acc & IB_ACCESS_LOCAL_WRITE ? MLX4_PERM_LOCAL_WRITE : 0) |
+ (acc & IB_ACCESS_MW_BIND ? MLX4_PERM_BIND_MW : 0) |
+ MLX4_PERM_LOCAL_READ;
+}
+
+static enum mlx4_mw_type to_mlx4_type(enum ib_mw_type type)
+{
+ switch (type) {
+ case IB_MW_TYPE_1: return MLX4_MW_TYPE_1;
+ case IB_MW_TYPE_2: return MLX4_MW_TYPE_2;
+ default: return -1;
+ }
+}
+
+struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc)
+{
+ struct mlx4_ib_mr *mr;
+ int err;
+
+ mr = kmalloc(sizeof *mr, GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ err = mlx4_mr_alloc(to_mdev(pd->device)->dev, to_mpd(pd)->pdn, 0,
+ ~0ull, convert_access(acc), 0, 0, &mr->mmr);
+ if (err)
+ goto err_free;
+
+ err = mlx4_mr_enable(to_mdev(pd->device)->dev, &mr->mmr);
+ if (err)
+ goto err_mr;
+
+ mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
+ mr->umem = NULL;
+
+ return &mr->ibmr;
+
+err_mr:
+ (void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);
+
+err_free:
+ kfree(mr);
+
+ return ERR_PTR(err);
+}
+
+int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
+ struct ib_umem *umem)
+{
+ u64 *pages;
+ int i, k, entry;
+ int n;
+ int len;
+ int err = 0;
+ struct scatterlist *sg;
+
+ pages = (u64 *) __get_free_page(GFP_KERNEL);
+ if (!pages)
+ return -ENOMEM;
+
+ i = n = 0;
+
+ for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+ len = sg_dma_len(sg) >> mtt->page_shift;
+ for (k = 0; k < len; ++k) {
+ pages[i++] = sg_dma_address(sg) +
+ umem->page_size * k;
+ /*
+ * Be friendly to mlx4_write_mtt() and
+ * pass it chunks of appropriate size.
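+ * (PAGE_SIZE / sizeof(u64) entries at a time, i.e. 512 entries
+ * with 4 KB pages).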
+ */
+ if (i == PAGE_SIZE / sizeof (u64)) {
+ err = mlx4_write_mtt(dev->dev, mtt, n,
+ i, pages);
+ if (err)
+ goto out;
+ n += i;
+ i = 0;
+ }
+ }
+ }
+
+ if (i)
+ err = mlx4_write_mtt(dev->dev, mtt, n, i, pages);
+
+out:
+ free_page((unsigned long) pages);
+ return err;
+}
+
+struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt_addr, int access_flags,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(pd->device);
+ struct mlx4_ib_mr *mr;
+ int shift;
+ int err;
+ int n;
+
+ mr = kmalloc(sizeof *mr, GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ /* Force registering the memory as writable. */
+ /* Used for memory re-registration; the HCA protects the access. */
+ mr->umem = ib_umem_get(pd->uobject->context, start, length,
+ access_flags | IB_ACCESS_LOCAL_WRITE, 0);
+ if (IS_ERR(mr->umem)) {
+ err = PTR_ERR(mr->umem);
+ goto err_free;
+ }
+
+ n = ib_umem_page_count(mr->umem);
+ shift = ilog2(mr->umem->page_size);
+
+ err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length,
+ convert_access(access_flags), n, shift, &mr->mmr);
+ if (err)
+ goto err_umem;
+
+ err = mlx4_ib_umem_write_mtt(dev, &mr->mmr.mtt, mr->umem);
+ if (err)
+ goto err_mr;
+
+ err = mlx4_mr_enable(dev->dev, &mr->mmr);
+ if (err)
+ goto err_mr;
+
+ mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
+
+ return &mr->ibmr;
+
+err_mr:
+ (void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);
+
+err_umem:
+ ib_umem_release(mr->umem);
+
+err_free:
+ kfree(mr);
+
+ return ERR_PTR(err);
+}
+
+int mlx4_ib_rereg_user_mr(struct ib_mr *mr, int flags,
+ u64 start, u64 length, u64 virt_addr,
+ int mr_access_flags, struct ib_pd *pd,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(mr->device);
+ struct mlx4_ib_mr *mmr = to_mmr(mr);
+ struct mlx4_mpt_entry *mpt_entry;
+ struct mlx4_mpt_entry **pmpt_entry = &mpt_entry;
+ int err;
+
+ /* Since we synchronize this call and mlx4_ib_dereg_mr via uverbs,
+ * we assume that the calls can't run concurrently. Otherwise, a
+ * race exists.
+ */
+ err = mlx4_mr_hw_get_mpt(dev->dev, &mmr->mmr, &pmpt_entry);
+
+ if (err)
+ return err;
+
+ if (flags & IB_MR_REREG_PD) {
+ err = mlx4_mr_hw_change_pd(dev->dev, *pmpt_entry,
+ to_mpd(pd)->pdn);
+
+ if (err)
+ goto release_mpt_entry;
+ }
+
+ if (flags & IB_MR_REREG_ACCESS) {
+ err = mlx4_mr_hw_change_access(dev->dev, *pmpt_entry,
+ convert_access(mr_access_flags));
+
+ if (err)
+ goto release_mpt_entry;
+ }
+
+ if (flags & IB_MR_REREG_TRANS) {
+ int shift;
+ int n;
+
+ mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
+ ib_umem_release(mmr->umem);
+ mmr->umem = ib_umem_get(mr->uobject->context, start, length,
+ mr_access_flags |
+ IB_ACCESS_LOCAL_WRITE,
+ 0);
+ if (IS_ERR(mmr->umem)) {
+ err = PTR_ERR(mmr->umem);
+ /* Prevent mlx4_ib_dereg_mr from freeing an invalid pointer */
+ mmr->umem = NULL;
+ goto release_mpt_entry;
+ }
+ n = ib_umem_page_count(mmr->umem);
+ shift = ilog2(mmr->umem->page_size);
+
+ err = mlx4_mr_rereg_mem_write(dev->dev, &mmr->mmr,
+ virt_addr, length, n, shift,
+ *pmpt_entry);
+ if (err) {
+ ib_umem_release(mmr->umem);
+ goto release_mpt_entry;
+ }
+ mmr->mmr.iova = virt_addr;
+ mmr->mmr.size = length;
+
+ err = mlx4_ib_umem_write_mtt(dev, &mmr->mmr.mtt, mmr->umem);
+ if (err) {
+ mlx4_mr_rereg_mem_cleanup(dev->dev, &mmr->mmr);
+ ib_umem_release(mmr->umem);
+ goto release_mpt_entry;
+ }
+ }
+
+ /* If we couldn't transfer the MR to the HCA, just remember to
+ * return a failure. But dereg_mr will free the resources.
+ */
+ err = mlx4_mr_hw_write_mpt(dev->dev, &mmr->mmr, pmpt_entry);
+ if (!err && flags & IB_MR_REREG_ACCESS)
+ mmr->mmr.access = mr_access_flags;
+
+release_mpt_entry:
+ mlx4_mr_hw_put_mpt(dev->dev, pmpt_entry);
+
+ return err;
+}
+
+int mlx4_ib_dereg_mr(struct ib_mr *ibmr)
+{
+ struct mlx4_ib_mr *mr = to_mmr(ibmr);
+ int ret;
+
+ ret = mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr);
+ if (ret)
+ return ret;
+ if (mr->umem)
+ ib_umem_release(mr->umem);
+ kfree(mr);
+
+ return 0;
+}
+
+struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
+{
+ struct mlx4_ib_dev *dev = to_mdev(pd->device);
+ struct mlx4_ib_mw *mw;
+ int err;
+
+ mw = kmalloc(sizeof(*mw), GFP_KERNEL);
+ if (!mw)
+ return ERR_PTR(-ENOMEM);
+
+ err = mlx4_mw_alloc(dev->dev, to_mpd(pd)->pdn,
+ to_mlx4_type(type), &mw->mmw);
+ if (err)
+ goto err_free;
+
+ err = mlx4_mw_enable(dev->dev, &mw->mmw);
+ if (err)
+ goto err_mw;
+
+ mw->ibmw.rkey = mw->mmw.key;
+
+ return &mw->ibmw;
+
+err_mw:
+ mlx4_mw_free(dev->dev, &mw->mmw);
+
+err_free:
+ kfree(mw);
+
+ return ERR_PTR(err);
+}
+
+int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
+ struct ib_mw_bind *mw_bind)
+{
+ struct ib_send_wr wr;
+ struct ib_send_wr *bad_wr;
+ int ret;
+
+ memset(&wr, 0, sizeof(wr));
+ wr.opcode = IB_WR_BIND_MW;
+ wr.wr_id = mw_bind->wr_id;
+ wr.send_flags = mw_bind->send_flags;
+ wr.wr.bind_mw.mw = mw;
+ wr.wr.bind_mw.bind_info = mw_bind->bind_info;
+ wr.wr.bind_mw.rkey = ib_inc_rkey(mw->rkey);
+
+ ret = mlx4_ib_post_send(qp, &wr, &bad_wr);
+ if (!ret)
+ mw->rkey = wr.wr.bind_mw.rkey;
+
+ return ret;
+}
+
+int mlx4_ib_dealloc_mw(struct ib_mw *ibmw)
+{
+ struct mlx4_ib_mw *mw = to_mmw(ibmw);
+
+ mlx4_mw_free(to_mdev(ibmw->device)->dev, &mw->mmw);
+ kfree(mw);
+
+ return 0;
+}
+
+struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd,
+ int max_page_list_len)
+{
+ struct mlx4_ib_dev *dev = to_mdev(pd->device);
+ struct mlx4_ib_mr *mr;
+ int err;
+
+ mr = kmalloc(sizeof *mr, GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, 0, 0, 0,
+ max_page_list_len, 0, &mr->mmr);
+ if (err)
+ goto err_free;
+
+ err = mlx4_mr_enable(dev->dev, &mr->mmr);
+ if (err)
+ goto err_mr;
+
+ mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
+ mr->umem = NULL;
+
+ return &mr->ibmr;
+
+err_mr:
+ (void) mlx4_mr_free(dev->dev, &mr->mmr);
+
+err_free:
+ kfree(mr);
+ return ERR_PTR(err);
+}
+
+struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
+ int page_list_len)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibdev);
+ struct mlx4_ib_fast_reg_page_list *mfrpl;
+ int size = page_list_len * sizeof (u64);
+
+ if (page_list_len > MLX4_MAX_FAST_REG_PAGES)
+ return ERR_PTR(-EINVAL);
+
+ mfrpl = kmalloc(sizeof *mfrpl, GFP_KERNEL);
+ if (!mfrpl)
+ return ERR_PTR(-ENOMEM);
+
+ mfrpl->ibfrpl.page_list = kmalloc(size, GFP_KERNEL);
+ if (!mfrpl->ibfrpl.page_list)
+ goto err_free;
+
+ mfrpl->mapped_page_list = dma_alloc_coherent(&dev->dev->persist->
+ pdev->dev,
+ size, &mfrpl->map,
+ GFP_KERNEL);
+ if (!mfrpl->mapped_page_list)
+ goto err_free;
+
+ WARN_ON(mfrpl->map & 0x3f);
+
+ return &mfrpl->ibfrpl;
+
+err_free:
+ kfree(mfrpl->ibfrpl.page_list);
+ kfree(mfrpl);
+ return ERR_PTR(-ENOMEM);
+}
+
+void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
+{
+ struct mlx4_ib_dev *dev = to_mdev(page_list->device);
+ struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list);
+ int size = page_list->max_page_list_len * sizeof (u64);
+
+ dma_free_coherent(&dev->dev->persist->pdev->dev, size,
+ mfrpl->mapped_page_list,
+ mfrpl->map);
+ kfree(mfrpl->ibfrpl.page_list);
+ kfree(mfrpl);
+}
+
+struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int acc,
+ struct ib_fmr_attr *fmr_attr)
+{
+ struct mlx4_ib_dev *dev = to_mdev(pd->device);
+ struct mlx4_ib_fmr *fmr;
+ int err = -ENOMEM;
+
+ fmr = kmalloc(sizeof *fmr, GFP_KERNEL);
+ if (!fmr)
+ return ERR_PTR(-ENOMEM);
+
+ err = mlx4_fmr_alloc(dev->dev, to_mpd(pd)->pdn, convert_access(acc),
+ fmr_attr->max_pages, fmr_attr->max_maps,
+ fmr_attr->page_shift, &fmr->mfmr);
+ if (err)
+ goto err_free;
+
+ err = mlx4_fmr_enable(to_mdev(pd->device)->dev, &fmr->mfmr);
+ if (err)
+ goto err_mr;
+
+ fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mfmr.mr.key;
+
+ return &fmr->ibfmr;
+
+err_mr:
+ (void) mlx4_mr_free(to_mdev(pd->device)->dev, &fmr->mfmr.mr);
+
+err_free:
+ kfree(fmr);
+
+ return ERR_PTR(err);
+}
+
+int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+ int npages, u64 iova)
+{
+ struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);
+ struct mlx4_ib_dev *dev = to_mdev(ifmr->ibfmr.device);
+
+ return mlx4_map_phys_fmr(dev->dev, &ifmr->mfmr, page_list, npages, iova,
+ &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey);
+}
+
+int mlx4_ib_unmap_fmr(struct list_head *fmr_list)
+{
+ struct ib_fmr *ibfmr;
+ int err;
+ struct mlx4_dev *mdev = NULL;
+
+ list_for_each_entry(ibfmr, fmr_list, list) {
+ if (mdev && to_mdev(ibfmr->device)->dev != mdev)
+ return -EINVAL;
+ mdev = to_mdev(ibfmr->device)->dev;
+ }
+
+ if (!mdev)
+ return 0;
+
+ list_for_each_entry(ibfmr, fmr_list, list) {
+ struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);
+
+ mlx4_fmr_unmap(mdev, &ifmr->mfmr, &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey);
+ }
+
+ /*
+ * Make sure all MPT status updates are visible before issuing
+ * SYNC_TPT firmware command.
+ */
+ wmb();
+
+ err = mlx4_SYNC_TPT(mdev);
+ if (err)
+ pr_warn("SYNC_TPT error %d when "
+ "unmapping FMRs\n", err);
+
+ return 0;
+}
+
+int mlx4_ib_fmr_dealloc(struct ib_fmr *ibfmr)
+{
+ struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);
+ struct mlx4_ib_dev *dev = to_mdev(ibfmr->device);
+ int err;
+
+ err = mlx4_fmr_free(dev->dev, &ifmr->mfmr);
+
+ if (!err)
+ kfree(ifmr);
+
+ return err;
+}
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
new file mode 100644
index 000000000..02fc91c68
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -0,0 +1,3217 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/log2.h>
+#include <linux/slab.h>
+#include <linux/netdevice.h>
+
+#include <rdma/ib_cache.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_addr.h>
+#include <rdma/ib_mad.h>
+
+#include <linux/mlx4/driver.h>
+#include <linux/mlx4/qp.h>
+
+#include "mlx4_ib.h"
+#include "user.h"
+
+static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq,
+ struct mlx4_ib_cq *recv_cq);
+static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq,
+ struct mlx4_ib_cq *recv_cq);
+
+enum {
+ MLX4_IB_ACK_REQ_FREQ = 8,
+};
+
+enum {
+ MLX4_IB_DEFAULT_SCHED_QUEUE = 0x83,
+ MLX4_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f,
+ MLX4_IB_LINK_TYPE_IB = 0,
+ MLX4_IB_LINK_TYPE_ETH = 1
+};
+
+enum {
+ /*
+ * Largest possible UD header: send with GRH and immediate
+ * data plus 18 bytes for an Ethernet header with VLAN/802.1Q
+ * tag. (LRH would only use 8 bytes, so Ethernet is the
+ * biggest case)
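+ * With RoCE that is 18 (Ethernet + 802.1Q) + 40 (GRH) + 12 (BTH) +
+ * 8 (DETH) + 4 (immediate data) = 82 bytes.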
+ */
+ MLX4_IB_UD_HEADER_SIZE = 82,
+ MLX4_IB_LSO_HEADER_SPARE = 128,
+};
+
+enum {
+ MLX4_IB_IBOE_ETHERTYPE = 0x8915
+};
+
+struct mlx4_ib_sqp {
+ struct mlx4_ib_qp qp;
+ int pkey_index;
+ u32 qkey;
+ u32 send_psn;
+ struct ib_ud_header ud_header;
+ u8 header_buf[MLX4_IB_UD_HEADER_SIZE];
+};
+
+enum {
+ MLX4_IB_MIN_SQ_STRIDE = 6,
+ MLX4_IB_CACHE_LINE_SIZE = 64,
+};
+
+enum {
+ MLX4_RAW_QP_MTU = 7,
+ MLX4_RAW_QP_MSGMAX = 31,
+};
+
+#ifndef ETH_ALEN
+#define ETH_ALEN 6
+#endif
+
+static const __be32 mlx4_ib_opcode[] = {
+ [IB_WR_SEND] = cpu_to_be32(MLX4_OPCODE_SEND),
+ [IB_WR_LSO] = cpu_to_be32(MLX4_OPCODE_LSO),
+ [IB_WR_SEND_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_SEND_IMM),
+ [IB_WR_RDMA_WRITE] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE),
+ [IB_WR_RDMA_WRITE_WITH_IMM] = cpu_to_be32(MLX4_OPCODE_RDMA_WRITE_IMM),
+ [IB_WR_RDMA_READ] = cpu_to_be32(MLX4_OPCODE_RDMA_READ),
+ [IB_WR_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_ATOMIC_CS),
+ [IB_WR_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_ATOMIC_FA),
+ [IB_WR_SEND_WITH_INV] = cpu_to_be32(MLX4_OPCODE_SEND_INVAL),
+ [IB_WR_LOCAL_INV] = cpu_to_be32(MLX4_OPCODE_LOCAL_INVAL),
+ [IB_WR_FAST_REG_MR] = cpu_to_be32(MLX4_OPCODE_FMR),
+ [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS),
+ [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA),
+ [IB_WR_BIND_MW] = cpu_to_be32(MLX4_OPCODE_BIND_MW),
+};
+
+static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp)
+{
+ return container_of(mqp, struct mlx4_ib_sqp, qp);
+}
+
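+/* Tunnel QPs exist only on the master; each slave is assigned a block of
+ * eight tunnel QPs starting at base_tunnel_sqpn (see create_qp_common). */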
+static int is_tunnel_qp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+ if (!mlx4_is_master(dev->dev))
+ return 0;
+
+ return qp->mqp.qpn >= dev->dev->phys_caps.base_tunnel_sqpn &&
+ qp->mqp.qpn < dev->dev->phys_caps.base_tunnel_sqpn +
+ 8 * MLX4_MFUNC_MAX;
+}
+
+static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+ int proxy_sqp = 0;
+ int real_sqp = 0;
+ int i;
+ /* PPF or Native -- real SQP */
+ real_sqp = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) &&
+ qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn &&
+ qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 3);
+ if (real_sqp)
+ return 1;
+ /* VF or PF -- proxy SQP */
+ if (mlx4_is_mfunc(dev->dev)) {
+ for (i = 0; i < dev->dev->caps.num_ports; i++) {
+ if (qp->mqp.qpn == dev->dev->caps.qp0_proxy[i] ||
+ qp->mqp.qpn == dev->dev->caps.qp1_proxy[i]) {
+ proxy_sqp = 1;
+ break;
+ }
+ }
+ }
+ return proxy_sqp;
+}
+
+/* used for INIT/CLOSE port logic */
+static int is_qp0(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+ int proxy_qp0 = 0;
+ int real_qp0 = 0;
+ int i;
+ /* PPF or Native -- real QP0 */
+ real_qp0 = ((mlx4_is_master(dev->dev) || !mlx4_is_mfunc(dev->dev)) &&
+ qp->mqp.qpn >= dev->dev->phys_caps.base_sqpn &&
+ qp->mqp.qpn <= dev->dev->phys_caps.base_sqpn + 1);
+ if (real_qp0)
+ return 1;
+ /* VF or PF -- proxy QP0 */
+ if (mlx4_is_mfunc(dev->dev)) {
+ for (i = 0; i < dev->dev->caps.num_ports; i++) {
+ if (qp->mqp.qpn == dev->dev->caps.qp0_proxy[i]) {
+ proxy_qp0 = 1;
+ break;
+ }
+ }
+ }
+ return proxy_qp0;
+}
+
+static void *get_wqe(struct mlx4_ib_qp *qp, int offset)
+{
+ return mlx4_buf_offset(&qp->buf, offset);
+}
+
+static void *get_recv_wqe(struct mlx4_ib_qp *qp, int n)
+{
+ return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift));
+}
+
+static void *get_send_wqe(struct mlx4_ib_qp *qp, int n)
+{
+ return get_wqe(qp, qp->sq.offset + (n << qp->sq.wqe_shift));
+}
+
+/*
+ * Stamp a SQ WQE so that it is invalid if prefetched by marking the
+ * first four bytes of every 64 byte chunk with
+ * 0x7FFFFFFF | (invalid_ownership_value << 31).
+ *
+ * When the max work request size is less than or equal to the WQE
+ * basic block size, as an optimization, we can stamp all WQEs with
+ * 0xffffffff, and skip the very first chunk of each WQE.
+ */
+static void stamp_send_wqe(struct mlx4_ib_qp *qp, int n, int size)
+{
+ __be32 *wqe;
+ int i;
+ int s;
+ int ind;
+ void *buf;
+ __be32 stamp;
+ struct mlx4_wqe_ctrl_seg *ctrl;
+
+ if (qp->sq_max_wqes_per_wr > 1) {
+ s = roundup(size, 1U << qp->sq.wqe_shift);
+ for (i = 0; i < s; i += 64) {
+ ind = (i >> qp->sq.wqe_shift) + n;
+ stamp = ind & qp->sq.wqe_cnt ? cpu_to_be32(0x7fffffff) :
+ cpu_to_be32(0xffffffff);
+ buf = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+ wqe = buf + (i & ((1 << qp->sq.wqe_shift) - 1));
+ *wqe = stamp;
+ }
+ } else {
+ ctrl = buf = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+ s = (ctrl->fence_size & 0x3f) << 4;
+ for (i = 64; i < s; i += 64) {
+ wqe = buf + i;
+ *wqe = cpu_to_be32(0xffffffff);
+ }
+ }
+}
+
+static void post_nop_wqe(struct mlx4_ib_qp *qp, int n, int size)
+{
+ struct mlx4_wqe_ctrl_seg *ctrl;
+ struct mlx4_wqe_inline_seg *inl;
+ void *wqe;
+ int s;
+
+ ctrl = wqe = get_send_wqe(qp, n & (qp->sq.wqe_cnt - 1));
+ s = sizeof(struct mlx4_wqe_ctrl_seg);
+
+ if (qp->ibqp.qp_type == IB_QPT_UD) {
+ struct mlx4_wqe_datagram_seg *dgram = wqe + sizeof *ctrl;
+ struct mlx4_av *av = (struct mlx4_av *)dgram->av;
+ memset(dgram, 0, sizeof *dgram);
+ av->port_pd = cpu_to_be32((qp->port << 24) | to_mpd(qp->ibqp.pd)->pdn);
+ s += sizeof(struct mlx4_wqe_datagram_seg);
+ }
+
+ /* Pad the remainder of the WQE with an inline data segment. */
+ if (size > s) {
+ inl = wqe + s;
+ inl->byte_count = cpu_to_be32(1 << 31 | (size - s - sizeof *inl));
+ }
+ ctrl->srcrb_flags = 0;
+ ctrl->fence_size = size / 16;
+ /*
+ * Make sure descriptor is fully written before setting ownership bit
+ * (because HW can start executing as soon as we do).
+ */
+ wmb();
+
+ ctrl->owner_opcode = cpu_to_be32(MLX4_OPCODE_NOP | MLX4_WQE_CTRL_NEC) |
+ (n & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0);
+
+ stamp_send_wqe(qp, n + qp->sq_spare_wqes, size);
+}
+
+/* Post a NOP WQE to prevent wrap-around in the middle of a WR */
+static inline unsigned pad_wraparound(struct mlx4_ib_qp *qp, int ind)
+{
+ unsigned s = qp->sq.wqe_cnt - (ind & (qp->sq.wqe_cnt - 1));
+ if (unlikely(s < qp->sq_max_wqes_per_wr)) {
+ post_nop_wqe(qp, ind, s << qp->sq.wqe_shift);
+ ind += s;
+ }
+ return ind;
+}
+
+static void mlx4_ib_qp_event(struct mlx4_qp *qp, enum mlx4_event type)
+{
+ struct ib_event event;
+ struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
+
+ if (type == MLX4_EVENT_TYPE_PATH_MIG)
+ to_mibqp(qp)->port = to_mibqp(qp)->alt_port;
+
+ if (ibqp->event_handler) {
+ event.device = ibqp->device;
+ event.element.qp = ibqp;
+ switch (type) {
+ case MLX4_EVENT_TYPE_PATH_MIG:
+ event.event = IB_EVENT_PATH_MIG;
+ break;
+ case MLX4_EVENT_TYPE_COMM_EST:
+ event.event = IB_EVENT_COMM_EST;
+ break;
+ case MLX4_EVENT_TYPE_SQ_DRAINED:
+ event.event = IB_EVENT_SQ_DRAINED;
+ break;
+ case MLX4_EVENT_TYPE_SRQ_QP_LAST_WQE:
+ event.event = IB_EVENT_QP_LAST_WQE_REACHED;
+ break;
+ case MLX4_EVENT_TYPE_WQ_CATAS_ERROR:
+ event.event = IB_EVENT_QP_FATAL;
+ break;
+ case MLX4_EVENT_TYPE_PATH_MIG_FAILED:
+ event.event = IB_EVENT_PATH_MIG_ERR;
+ break;
+ case MLX4_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+ event.event = IB_EVENT_QP_REQ_ERR;
+ break;
+ case MLX4_EVENT_TYPE_WQ_ACCESS_ERROR:
+ event.event = IB_EVENT_QP_ACCESS_ERR;
+ break;
+ default:
+ pr_warn("Unexpected event type %d "
+ "on QP %06x\n", type, qp->qpn);
+ return;
+ }
+
+ ibqp->event_handler(&event, ibqp->qp_context);
+ }
+}
+
+static int send_wqe_overhead(enum mlx4_ib_qp_type type, u32 flags)
+{
+ /*
+ * UD WQEs must have a datagram segment.
+ * RC and UC WQEs might have a remote address segment.
+ * MLX WQEs need two extra inline data segments (for the UD
+ * header and space for the ICRC).
+ */
+ switch (type) {
+ case MLX4_IB_QPT_UD:
+ return sizeof (struct mlx4_wqe_ctrl_seg) +
+ sizeof (struct mlx4_wqe_datagram_seg) +
+ ((flags & MLX4_IB_QP_LSO) ? MLX4_IB_LSO_HEADER_SPARE : 0);
+ case MLX4_IB_QPT_PROXY_SMI_OWNER:
+ case MLX4_IB_QPT_PROXY_SMI:
+ case MLX4_IB_QPT_PROXY_GSI:
+ return sizeof (struct mlx4_wqe_ctrl_seg) +
+ sizeof (struct mlx4_wqe_datagram_seg) + 64;
+ case MLX4_IB_QPT_TUN_SMI_OWNER:
+ case MLX4_IB_QPT_TUN_GSI:
+ return sizeof (struct mlx4_wqe_ctrl_seg) +
+ sizeof (struct mlx4_wqe_datagram_seg);
+
+ case MLX4_IB_QPT_UC:
+ return sizeof (struct mlx4_wqe_ctrl_seg) +
+ sizeof (struct mlx4_wqe_raddr_seg);
+ case MLX4_IB_QPT_RC:
+ return sizeof (struct mlx4_wqe_ctrl_seg) +
+ sizeof (struct mlx4_wqe_atomic_seg) +
+ sizeof (struct mlx4_wqe_raddr_seg);
+ case MLX4_IB_QPT_SMI:
+ case MLX4_IB_QPT_GSI:
+ return sizeof (struct mlx4_wqe_ctrl_seg) +
+ ALIGN(MLX4_IB_UD_HEADER_SIZE +
+ DIV_ROUND_UP(MLX4_IB_UD_HEADER_SIZE,
+ MLX4_INLINE_ALIGN) *
+ sizeof (struct mlx4_wqe_inline_seg),
+ sizeof (struct mlx4_wqe_data_seg)) +
+ ALIGN(4 +
+ sizeof (struct mlx4_wqe_inline_seg),
+ sizeof (struct mlx4_wqe_data_seg));
+ default:
+ return sizeof (struct mlx4_wqe_ctrl_seg);
+ }
+}
+
+static int set_rq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
+ int is_user, int has_rq, struct mlx4_ib_qp *qp)
+{
+ /* Sanity check RQ size before proceeding */
+ if (cap->max_recv_wr > dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE ||
+ cap->max_recv_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg))
+ return -EINVAL;
+
+ if (!has_rq) {
+ if (cap->max_recv_wr)
+ return -EINVAL;
+
+ qp->rq.wqe_cnt = qp->rq.max_gs = 0;
+ } else {
+ /* HW requires >= 1 RQ entry with >= 1 gather entry */
+ if (is_user && (!cap->max_recv_wr || !cap->max_recv_sge))
+ return -EINVAL;
+
+ qp->rq.wqe_cnt = roundup_pow_of_two(max(1U, cap->max_recv_wr));
+ qp->rq.max_gs = roundup_pow_of_two(max(1U, cap->max_recv_sge));
+ qp->rq.wqe_shift = ilog2(qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg));
+ }
+
+ /* leave userspace return values as they were, so as not to break ABI */
+ if (is_user) {
+ cap->max_recv_wr = qp->rq.max_post = qp->rq.wqe_cnt;
+ cap->max_recv_sge = qp->rq.max_gs;
+ } else {
+ cap->max_recv_wr = qp->rq.max_post =
+ min(dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE, qp->rq.wqe_cnt);
+ cap->max_recv_sge = min(qp->rq.max_gs,
+ min(dev->dev->caps.max_sq_sg,
+ dev->dev->caps.max_rq_sg));
+ }
+
+ return 0;
+}
+
+static int set_kernel_sq_size(struct mlx4_ib_dev *dev, struct ib_qp_cap *cap,
+ enum mlx4_ib_qp_type type, struct mlx4_ib_qp *qp)
+{
+ int s;
+
+ /* Sanity check SQ size before proceeding */
+ if (cap->max_send_wr > (dev->dev->caps.max_wqes - MLX4_IB_SQ_MAX_SPARE) ||
+ cap->max_send_sge > min(dev->dev->caps.max_sq_sg, dev->dev->caps.max_rq_sg) ||
+ cap->max_inline_data + send_wqe_overhead(type, qp->flags) +
+ sizeof (struct mlx4_wqe_inline_seg) > dev->dev->caps.max_sq_desc_sz)
+ return -EINVAL;
+
+ /*
+ * For MLX transport we need 2 extra S/G entries:
+ * one for the header and one for the checksum at the end
+ */
+ if ((type == MLX4_IB_QPT_SMI || type == MLX4_IB_QPT_GSI ||
+ type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) &&
+ cap->max_send_sge + 2 > dev->dev->caps.max_sq_sg)
+ return -EINVAL;
+
+ s = max(cap->max_send_sge * sizeof (struct mlx4_wqe_data_seg),
+ cap->max_inline_data + sizeof (struct mlx4_wqe_inline_seg)) +
+ send_wqe_overhead(type, qp->flags);
+
+ if (s > dev->dev->caps.max_sq_desc_sz)
+ return -EINVAL;
+
+ /*
+ * Hermon supports shrinking WQEs, such that a single work
+ * request can include multiple units of 1 << wqe_shift. This
+ * way, work requests can differ in size, and do not have to
+ * be a power of 2 in size, saving memory and speeding up send
+ * WR posting. Unfortunately, if we do this then the
+ * wqe_index field in CQEs can't be used to look up the WR ID
+ * anymore, so we do this only if selective signaling is off.
+ *
+ * Further, on 32-bit platforms, we can't use vmap() to make
+ * the QP buffer virtually contiguous. Thus we have to use
+ * constant-sized WRs to make sure a WR is always fully within
+ * a single page-sized chunk.
+ *
+ * Finally, we use NOP work requests to pad the end of the
+ * work queue, to avoid wrap-around in the middle of a WR. We
+ * set NEC bit to avoid getting completions with error for
+ * these NOP WRs, but since NEC is only supported starting
+ * with firmware 2.2.232, we use constant-sized WRs for older
+ * firmware.
+ *
+ * And, since MLX QPs only support SEND, we use constant-sized
+ * WRs in this case.
+ *
+ * We look for the smallest value of wqe_shift such that the
+ * resulting number of wqes does not exceed device
+ * capabilities.
+ *
+ * We set WQE size to at least 64 bytes, this way stamping
+ * invalidates each WQE.
+ */
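+ /*
+ * For example (illustrative numbers only): with shrinking enabled and
+ * a largest WR of s = 200 bytes, wqe_shift starts at 6 (64-byte basic
+ * blocks), so sq_max_wqes_per_wr = DIV_ROUND_UP(200, 64) = 4 and
+ * sq_spare_wqes = 2048/64 + 4 = 36. A request for 100 send WRs then
+ * yields wqe_cnt = roundup_pow_of_two(100 * 4 + 36) = 512 and
+ * max_post = (512 - 36) / 4 = 119.
+ */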
+ if (dev->dev->caps.fw_ver >= MLX4_FW_VER_WQE_CTRL_NEC &&
+ qp->sq_signal_bits && BITS_PER_LONG == 64 &&
+ type != MLX4_IB_QPT_SMI && type != MLX4_IB_QPT_GSI &&
+ !(type & (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_PROXY_SMI |
+ MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER)))
+ qp->sq.wqe_shift = ilog2(64);
+ else
+ qp->sq.wqe_shift = ilog2(roundup_pow_of_two(s));
+
+ for (;;) {
+ qp->sq_max_wqes_per_wr = DIV_ROUND_UP(s, 1U << qp->sq.wqe_shift);
+
+ /*
+ * We need to leave 2 KB + 1 WR of headroom in the SQ to
+ * allow HW to prefetch.
+ */
+ qp->sq_spare_wqes = (2048 >> qp->sq.wqe_shift) + qp->sq_max_wqes_per_wr;
+ qp->sq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr *
+ qp->sq_max_wqes_per_wr +
+ qp->sq_spare_wqes);
+
+ if (qp->sq.wqe_cnt <= dev->dev->caps.max_wqes)
+ break;
+
+ if (qp->sq_max_wqes_per_wr <= 1)
+ return -EINVAL;
+
+ ++qp->sq.wqe_shift;
+ }
+
+ qp->sq.max_gs = (min(dev->dev->caps.max_sq_desc_sz,
+ (qp->sq_max_wqes_per_wr << qp->sq.wqe_shift)) -
+ send_wqe_overhead(type, qp->flags)) /
+ sizeof (struct mlx4_wqe_data_seg);
+
+ qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
+ (qp->sq.wqe_cnt << qp->sq.wqe_shift);
+ if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
+ qp->rq.offset = 0;
+ qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
+ } else {
+ qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
+ qp->sq.offset = 0;
+ }
+
+ cap->max_send_wr = qp->sq.max_post =
+ (qp->sq.wqe_cnt - qp->sq_spare_wqes) / qp->sq_max_wqes_per_wr;
+ cap->max_send_sge = min(qp->sq.max_gs,
+ min(dev->dev->caps.max_sq_sg,
+ dev->dev->caps.max_rq_sg));
+ /* We don't support inline sends for kernel QPs (yet) */
+ cap->max_inline_data = 0;
+
+ return 0;
+}
+
+static int set_user_sq_size(struct mlx4_ib_dev *dev,
+ struct mlx4_ib_qp *qp,
+ struct mlx4_ib_create_qp *ucmd)
+{
+ /* Sanity check SQ size before proceeding */
+ if ((1 << ucmd->log_sq_bb_count) > dev->dev->caps.max_wqes ||
+ ucmd->log_sq_stride >
+ ilog2(roundup_pow_of_two(dev->dev->caps.max_sq_desc_sz)) ||
+ ucmd->log_sq_stride < MLX4_IB_MIN_SQ_STRIDE)
+ return -EINVAL;
+
+ qp->sq.wqe_cnt = 1 << ucmd->log_sq_bb_count;
+ qp->sq.wqe_shift = ucmd->log_sq_stride;
+
+ qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
+ (qp->sq.wqe_cnt << qp->sq.wqe_shift);
+
+ return 0;
+}
+
+static int alloc_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp)
+{
+ int i;
+
+ qp->sqp_proxy_rcv =
+ kmalloc(sizeof (struct mlx4_ib_buf) * qp->rq.wqe_cnt,
+ GFP_KERNEL);
+ if (!qp->sqp_proxy_rcv)
+ return -ENOMEM;
+ for (i = 0; i < qp->rq.wqe_cnt; i++) {
+ qp->sqp_proxy_rcv[i].addr =
+ kmalloc(sizeof (struct mlx4_ib_proxy_sqp_hdr),
+ GFP_KERNEL);
+ if (!qp->sqp_proxy_rcv[i].addr)
+ goto err;
+ qp->sqp_proxy_rcv[i].map =
+ ib_dma_map_single(dev, qp->sqp_proxy_rcv[i].addr,
+ sizeof (struct mlx4_ib_proxy_sqp_hdr),
+ DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(dev, qp->sqp_proxy_rcv[i].map)) {
+ kfree(qp->sqp_proxy_rcv[i].addr);
+ goto err;
+ }
+ }
+ return 0;
+
+err:
+ while (i > 0) {
+ --i;
+ ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map,
+ sizeof (struct mlx4_ib_proxy_sqp_hdr),
+ DMA_FROM_DEVICE);
+ kfree(qp->sqp_proxy_rcv[i].addr);
+ }
+ kfree(qp->sqp_proxy_rcv);
+ qp->sqp_proxy_rcv = NULL;
+ return -ENOMEM;
+}
+
+static void free_proxy_bufs(struct ib_device *dev, struct mlx4_ib_qp *qp)
+{
+ int i;
+
+ for (i = 0; i < qp->rq.wqe_cnt; i++) {
+ ib_dma_unmap_single(dev, qp->sqp_proxy_rcv[i].map,
+ sizeof (struct mlx4_ib_proxy_sqp_hdr),
+ DMA_FROM_DEVICE);
+ kfree(qp->sqp_proxy_rcv[i].addr);
+ }
+ kfree(qp->sqp_proxy_rcv);
+}
+
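+/* XRC QPs never own a receive queue, and neither do QPs attached to an SRQ. */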
+static int qp_has_rq(struct ib_qp_init_attr *attr)
+{
+ if (attr->qp_type == IB_QPT_XRC_INI || attr->qp_type == IB_QPT_XRC_TGT)
+ return 0;
+
+ return !attr->srq;
+}
+
+static int qp0_enabled_vf(struct mlx4_dev *dev, int qpn)
+{
+ int i;
+ for (i = 0; i < dev->caps.num_ports; i++) {
+ if (qpn == dev->caps.qp0_proxy[i])
+ return !!dev->caps.qp0_qkey[i];
+ }
+ return 0;
+}
+
+static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata, int sqpn, struct mlx4_ib_qp **caller_qp,
+ gfp_t gfp)
+{
+ int qpn;
+ int err;
+ struct mlx4_ib_sqp *sqp;
+ struct mlx4_ib_qp *qp;
+ enum mlx4_ib_qp_type qp_type = (enum mlx4_ib_qp_type) init_attr->qp_type;
+ struct mlx4_ib_cq *mcq;
+ unsigned long flags;
+
+ /* When tunneling special qps, we use a plain UD qp */
+ if (sqpn) {
+ if (mlx4_is_mfunc(dev->dev) &&
+ (!mlx4_is_master(dev->dev) ||
+ !(init_attr->create_flags & MLX4_IB_SRIOV_SQP))) {
+ if (init_attr->qp_type == IB_QPT_GSI)
+ qp_type = MLX4_IB_QPT_PROXY_GSI;
+ else {
+ if (mlx4_is_master(dev->dev) ||
+ qp0_enabled_vf(dev->dev, sqpn))
+ qp_type = MLX4_IB_QPT_PROXY_SMI_OWNER;
+ else
+ qp_type = MLX4_IB_QPT_PROXY_SMI;
+ }
+ }
+ qpn = sqpn;
+ /* add extra sg entry for tunneling */
+ init_attr->cap.max_recv_sge++;
+ } else if (init_attr->create_flags & MLX4_IB_SRIOV_TUNNEL_QP) {
+ struct mlx4_ib_qp_tunnel_init_attr *tnl_init =
+ container_of(init_attr,
+ struct mlx4_ib_qp_tunnel_init_attr, init_attr);
+ if ((tnl_init->proxy_qp_type != IB_QPT_SMI &&
+ tnl_init->proxy_qp_type != IB_QPT_GSI) ||
+ !mlx4_is_master(dev->dev))
+ return -EINVAL;
+ if (tnl_init->proxy_qp_type == IB_QPT_GSI)
+ qp_type = MLX4_IB_QPT_TUN_GSI;
+ else if (tnl_init->slave == mlx4_master_func_num(dev->dev) ||
+ mlx4_vf_smi_enabled(dev->dev, tnl_init->slave,
+ tnl_init->port))
+ qp_type = MLX4_IB_QPT_TUN_SMI_OWNER;
+ else
+ qp_type = MLX4_IB_QPT_TUN_SMI;
+ /* we are definitely in the PPF here, since we are creating
+ * tunnel QPs. base_tunnel_sqpn is therefore valid. */
+ qpn = dev->dev->phys_caps.base_tunnel_sqpn + 8 * tnl_init->slave
+ + tnl_init->proxy_qp_type * 2 + tnl_init->port - 1;
+ sqpn = qpn;
+ }
+
+ if (!*caller_qp) {
+ if (qp_type == MLX4_IB_QPT_SMI || qp_type == MLX4_IB_QPT_GSI ||
+ (qp_type & (MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_SMI_OWNER |
+ MLX4_IB_QPT_PROXY_GSI | MLX4_IB_QPT_TUN_SMI_OWNER))) {
+ sqp = kzalloc(sizeof (struct mlx4_ib_sqp), gfp);
+ if (!sqp)
+ return -ENOMEM;
+ qp = &sqp->qp;
+ qp->pri.vid = 0xFFFF;
+ qp->alt.vid = 0xFFFF;
+ } else {
+ qp = kzalloc(sizeof (struct mlx4_ib_qp), gfp);
+ if (!qp)
+ return -ENOMEM;
+ qp->pri.vid = 0xFFFF;
+ qp->alt.vid = 0xFFFF;
+ }
+ } else
+ qp = *caller_qp;
+
+ qp->mlx4_ib_qp_type = qp_type;
+
+ mutex_init(&qp->mutex);
+ spin_lock_init(&qp->sq.lock);
+ spin_lock_init(&qp->rq.lock);
+ INIT_LIST_HEAD(&qp->gid_list);
+ INIT_LIST_HEAD(&qp->steering_rules);
+
+ qp->state = IB_QPS_RESET;
+ if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
+ qp->sq_signal_bits = cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+
+ err = set_rq_size(dev, &init_attr->cap, !!pd->uobject, qp_has_rq(init_attr), qp);
+ if (err)
+ goto err;
+
+ if (pd->uobject) {
+ struct mlx4_ib_create_qp ucmd;
+
+ if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+ err = -EFAULT;
+ goto err;
+ }
+
+ qp->sq_no_prefetch = ucmd.sq_no_prefetch;
+
+ err = set_user_sq_size(dev, qp, &ucmd);
+ if (err)
+ goto err;
+
+ qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
+ qp->buf_size, 0, 0);
+ if (IS_ERR(qp->umem)) {
+ err = PTR_ERR(qp->umem);
+ goto err;
+ }
+
+ err = mlx4_mtt_init(dev->dev, ib_umem_page_count(qp->umem),
+ ilog2(qp->umem->page_size), &qp->mtt);
+ if (err)
+ goto err_buf;
+
+ err = mlx4_ib_umem_write_mtt(dev, &qp->mtt, qp->umem);
+ if (err)
+ goto err_mtt;
+
+ if (qp_has_rq(init_attr)) {
+ err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
+ ucmd.db_addr, &qp->db);
+ if (err)
+ goto err_mtt;
+ }
+ } else {
+ qp->sq_no_prefetch = 0;
+
+ if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK)
+ qp->flags |= MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK;
+
+ if (init_attr->create_flags & IB_QP_CREATE_IPOIB_UD_LSO)
+ qp->flags |= MLX4_IB_QP_LSO;
+
+ if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) {
+ if (dev->steering_support ==
+ MLX4_STEERING_MODE_DEVICE_MANAGED)
+ qp->flags |= MLX4_IB_QP_NETIF;
+ else
+ goto err;
+ }
+
+ err = set_kernel_sq_size(dev, &init_attr->cap, qp_type, qp);
+ if (err)
+ goto err;
+
+ if (qp_has_rq(init_attr)) {
+ err = mlx4_db_alloc(dev->dev, &qp->db, 0, gfp);
+ if (err)
+ goto err;
+
+ *qp->db.db = 0;
+ }
+
+ if (mlx4_buf_alloc(dev->dev, qp->buf_size, PAGE_SIZE * 2, &qp->buf, gfp)) {
+ err = -ENOMEM;
+ goto err_db;
+ }
+
+ err = mlx4_mtt_init(dev->dev, qp->buf.npages, qp->buf.page_shift,
+ &qp->mtt);
+ if (err)
+ goto err_buf;
+
+ err = mlx4_buf_write_mtt(dev->dev, &qp->mtt, &qp->buf, gfp);
+ if (err)
+ goto err_mtt;
+
+ qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof (u64), gfp);
+ qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof (u64), gfp);
+ if (!qp->sq.wrid || !qp->rq.wrid) {
+ err = -ENOMEM;
+ goto err_wrid;
+ }
+ }
+
+ if (sqpn) {
+ if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER |
+ MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) {
+ if (alloc_proxy_bufs(pd->device, qp)) {
+ err = -ENOMEM;
+ goto err_wrid;
+ }
+ }
+ } else {
+ /* Raw packet QPNs may not have bits 6,7 set in their qp_num;
+ * otherwise, the WQE BlueFlame setup flow wrongly causes
+ * VLAN insertion. */
+ if (init_attr->qp_type == IB_QPT_RAW_PACKET)
+ err = mlx4_qp_reserve_range(dev->dev, 1, 1, &qpn,
+ (init_attr->cap.max_send_wr ?
+ MLX4_RESERVE_ETH_BF_QP : 0) |
+ (init_attr->cap.max_recv_wr ?
+ MLX4_RESERVE_A0_QP : 0));
+ else
+ if (qp->flags & MLX4_IB_QP_NETIF)
+ err = mlx4_ib_steer_qp_alloc(dev, 1, &qpn);
+ else
+ err = mlx4_qp_reserve_range(dev->dev, 1, 1,
+ &qpn, 0);
+ if (err)
+ goto err_proxy;
+ }
+
+ err = mlx4_qp_alloc(dev->dev, qpn, &qp->mqp, gfp);
+ if (err)
+ goto err_qpn;
+
+ if (init_attr->qp_type == IB_QPT_XRC_TGT)
+ qp->mqp.qpn |= (1 << 23);
+
+ /*
+ * Hardware wants QPN written in big-endian order (after
+ * shifting) for send doorbell. Precompute this value to save
+ * a little bit when posting sends.
+ */
+ qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
+
+ qp->mqp.event = mlx4_ib_qp_event;
+ if (!*caller_qp)
+ *caller_qp = qp;
+
+ spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
+ mlx4_ib_lock_cqs(to_mcq(init_attr->send_cq),
+ to_mcq(init_attr->recv_cq));
+ /* Maintain device to QPs access, needed for further handling
+ * via reset flow
+ */
+ list_add_tail(&qp->qps_list, &dev->qp_list);
+ /* Maintain CQ to QPs access, needed for further handling
+ * via reset flow
+ */
+ mcq = to_mcq(init_attr->send_cq);
+ list_add_tail(&qp->cq_send_list, &mcq->send_qp_list);
+ mcq = to_mcq(init_attr->recv_cq);
+ list_add_tail(&qp->cq_recv_list, &mcq->recv_qp_list);
+ mlx4_ib_unlock_cqs(to_mcq(init_attr->send_cq),
+ to_mcq(init_attr->recv_cq));
+ spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
+ return 0;
+
+err_qpn:
+ if (!sqpn) {
+ if (qp->flags & MLX4_IB_QP_NETIF)
+ mlx4_ib_steer_qp_free(dev, qpn, 1);
+ else
+ mlx4_qp_release_range(dev->dev, qpn, 1);
+ }
+err_proxy:
+ if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI)
+ free_proxy_bufs(pd->device, qp);
+err_wrid:
+ if (pd->uobject) {
+ if (qp_has_rq(init_attr))
+ mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &qp->db);
+ } else {
+ kfree(qp->sq.wrid);
+ kfree(qp->rq.wrid);
+ }
+
+err_mtt:
+ mlx4_mtt_cleanup(dev->dev, &qp->mtt);
+
+err_buf:
+ if (pd->uobject)
+ ib_umem_release(qp->umem);
+ else
+ mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
+
+err_db:
+ if (!pd->uobject && qp_has_rq(init_attr))
+ mlx4_db_free(dev->dev, &qp->db);
+
+err:
+ if (!*caller_qp)
+ kfree(qp);
+ return err;
+}
+
+static enum mlx4_qp_state to_mlx4_state(enum ib_qp_state state)
+{
+ switch (state) {
+ case IB_QPS_RESET: return MLX4_QP_STATE_RST;
+ case IB_QPS_INIT: return MLX4_QP_STATE_INIT;
+ case IB_QPS_RTR: return MLX4_QP_STATE_RTR;
+ case IB_QPS_RTS: return MLX4_QP_STATE_RTS;
+ case IB_QPS_SQD: return MLX4_QP_STATE_SQD;
+ case IB_QPS_SQE: return MLX4_QP_STATE_SQER;
+ case IB_QPS_ERR: return MLX4_QP_STATE_ERR;
+ default: return -1;
+ }
+}
+
+static void mlx4_ib_lock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
+ __acquires(&send_cq->lock) __acquires(&recv_cq->lock)
+{
+ if (send_cq == recv_cq) {
+ spin_lock(&send_cq->lock);
+ __acquire(&recv_cq->lock);
+ } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
+ spin_lock(&send_cq->lock);
+ spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
+ } else {
+ spin_lock(&recv_cq->lock);
+ spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING);
+ }
+}
+
+static void mlx4_ib_unlock_cqs(struct mlx4_ib_cq *send_cq, struct mlx4_ib_cq *recv_cq)
+ __releases(&send_cq->lock) __releases(&recv_cq->lock)
+{
+ if (send_cq == recv_cq) {
+ __release(&recv_cq->lock);
+ spin_unlock(&send_cq->lock);
+ } else if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
+ spin_unlock(&recv_cq->lock);
+ spin_unlock(&send_cq->lock);
+ } else {
+ spin_unlock(&send_cq->lock);
+ spin_unlock(&recv_cq->lock);
+ }
+}
+
+static void del_gid_entries(struct mlx4_ib_qp *qp)
+{
+ struct mlx4_ib_gid_entry *ge, *tmp;
+
+ list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) {
+ list_del(&ge->list);
+ kfree(ge);
+ }
+}
+
+static struct mlx4_ib_pd *get_pd(struct mlx4_ib_qp *qp)
+{
+ if (qp->ibqp.qp_type == IB_QPT_XRC_TGT)
+ return to_mpd(to_mxrcd(qp->ibqp.xrcd)->pd);
+ else
+ return to_mpd(qp->ibqp.pd);
+}
+
+static void get_cqs(struct mlx4_ib_qp *qp,
+ struct mlx4_ib_cq **send_cq, struct mlx4_ib_cq **recv_cq)
+{
+ switch (qp->ibqp.qp_type) {
+ case IB_QPT_XRC_TGT:
+ *send_cq = to_mcq(to_mxrcd(qp->ibqp.xrcd)->cq);
+ *recv_cq = *send_cq;
+ break;
+ case IB_QPT_XRC_INI:
+ *send_cq = to_mcq(qp->ibqp.send_cq);
+ *recv_cq = *send_cq;
+ break;
+ default:
+ *send_cq = to_mcq(qp->ibqp.send_cq);
+ *recv_cq = to_mcq(qp->ibqp.recv_cq);
+ break;
+ }
+}
+
+static void destroy_qp_common(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp,
+ int is_user)
+{
+ struct mlx4_ib_cq *send_cq, *recv_cq;
+ unsigned long flags;
+
+ if (qp->state != IB_QPS_RESET) {
+ if (mlx4_qp_modify(dev->dev, NULL, to_mlx4_state(qp->state),
+ MLX4_QP_STATE_RST, NULL, 0, 0, &qp->mqp))
+ pr_warn("modify QP %06x to RESET failed.\n",
+ qp->mqp.qpn);
+ if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) {
+ mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac);
+ qp->pri.smac = 0;
+ qp->pri.smac_port = 0;
+ }
+ if (qp->alt.smac) {
+ mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac);
+ qp->alt.smac = 0;
+ }
+ if (qp->pri.vid < 0x1000) {
+ mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid);
+ qp->pri.vid = 0xFFFF;
+ qp->pri.candidate_vid = 0xFFFF;
+ qp->pri.update_vid = 0;
+ }
+ if (qp->alt.vid < 0x1000) {
+ mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid);
+ qp->alt.vid = 0xFFFF;
+ qp->alt.candidate_vid = 0xFFFF;
+ qp->alt.update_vid = 0;
+ }
+ }
+
+ get_cqs(qp, &send_cq, &recv_cq);
+
+ spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
+ mlx4_ib_lock_cqs(send_cq, recv_cq);
+
+ /* del from lists under both locks above to protect reset flow paths */
+ list_del(&qp->qps_list);
+ list_del(&qp->cq_send_list);
+ list_del(&qp->cq_recv_list);
+ if (!is_user) {
+ __mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
+ qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL);
+ if (send_cq != recv_cq)
+ __mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
+ }
+
+ mlx4_qp_remove(dev->dev, &qp->mqp);
+
+ mlx4_ib_unlock_cqs(send_cq, recv_cq);
+ spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
+
+ mlx4_qp_free(dev->dev, &qp->mqp);
+
+ if (!is_sqp(dev, qp) && !is_tunnel_qp(dev, qp)) {
+ if (qp->flags & MLX4_IB_QP_NETIF)
+ mlx4_ib_steer_qp_free(dev, qp->mqp.qpn, 1);
+ else
+ mlx4_qp_release_range(dev->dev, qp->mqp.qpn, 1);
+ }
+
+ mlx4_mtt_cleanup(dev->dev, &qp->mtt);
+
+ if (is_user) {
+ if (qp->rq.wqe_cnt)
+ mlx4_ib_db_unmap_user(to_mucontext(qp->ibqp.uobject->context),
+ &qp->db);
+ ib_umem_release(qp->umem);
+ } else {
+ kfree(qp->sq.wrid);
+ kfree(qp->rq.wrid);
+ if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER |
+ MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI))
+ free_proxy_bufs(&dev->ib_dev, qp);
+ mlx4_buf_free(dev->dev, qp->buf_size, &qp->buf);
+ if (qp->rq.wqe_cnt)
+ mlx4_db_free(dev->dev, &qp->db);
+ }
+
+ del_gid_entries(qp);
+}
+
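+/*
+ * Return the QP number to use for a special (SMI/GSI) QP on the given
+ * port: the real QP0/QP1 range for native devices (or for the master
+ * creating SR-IOV special QPs), or the per-port proxy QP otherwise.
+ */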
+static u32 get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr)
+{
+ /* Native or PPF */
+ if (!mlx4_is_mfunc(dev->dev) ||
+ (mlx4_is_master(dev->dev) &&
+ attr->create_flags & MLX4_IB_SRIOV_SQP)) {
+ return dev->dev->phys_caps.base_sqpn +
+ (attr->qp_type == IB_QPT_SMI ? 0 : 2) +
+ attr->port_num - 1;
+ }
+ /* PF or VF -- creating proxies */
+ if (attr->qp_type == IB_QPT_SMI)
+ return dev->dev->caps.qp0_proxy[attr->port_num - 1];
+ else
+ return dev->dev->caps.qp1_proxy[attr->port_num - 1];
+}
+
+struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_qp *qp = NULL;
+ int err;
+ u16 xrcdn = 0;
+ gfp_t gfp;
+
+ gfp = (init_attr->create_flags & MLX4_IB_QP_CREATE_USE_GFP_NOIO) ?
+ GFP_NOIO : GFP_KERNEL;
+ /*
+ * We only support LSO, vendor flag1, and multicast loopback blocking,
+ * and only for kernel UD QPs.
+ */
+ if (init_attr->create_flags & ~(MLX4_IB_QP_LSO |
+ MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK |
+ MLX4_IB_SRIOV_TUNNEL_QP |
+ MLX4_IB_SRIOV_SQP |
+ MLX4_IB_QP_NETIF |
+ MLX4_IB_QP_CREATE_USE_GFP_NOIO))
+ return ERR_PTR(-EINVAL);
+
+ if (init_attr->create_flags & IB_QP_CREATE_NETIF_QP) {
+ if (init_attr->qp_type != IB_QPT_UD)
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (init_attr->create_flags &&
+ (udata ||
+ ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP | MLX4_IB_QP_CREATE_USE_GFP_NOIO)) &&
+ init_attr->qp_type != IB_QPT_UD) ||
+ ((init_attr->create_flags & MLX4_IB_SRIOV_SQP) &&
+ init_attr->qp_type > IB_QPT_GSI)))
+ return ERR_PTR(-EINVAL);
+
+ switch (init_attr->qp_type) {
+ case IB_QPT_XRC_TGT:
+ pd = to_mxrcd(init_attr->xrcd)->pd;
+ xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn;
+ init_attr->send_cq = to_mxrcd(init_attr->xrcd)->cq;
+ /* fall through */
+ case IB_QPT_XRC_INI:
+ if (!(to_mdev(pd->device)->dev->caps.flags & MLX4_DEV_CAP_FLAG_XRC))
+ return ERR_PTR(-ENOSYS);
+ init_attr->recv_cq = init_attr->send_cq;
+ /* fall through */
+ case IB_QPT_RC:
+ case IB_QPT_UC:
+ case IB_QPT_RAW_PACKET:
+ qp = kzalloc(sizeof *qp, gfp);
+ if (!qp)
+ return ERR_PTR(-ENOMEM);
+ qp->pri.vid = 0xFFFF;
+ qp->alt.vid = 0xFFFF;
+ /* fall through */
+ case IB_QPT_UD:
+ {
+ err = create_qp_common(to_mdev(pd->device), pd, init_attr,
+ udata, 0, &qp, gfp);
+ if (err)
+ return ERR_PTR(err);
+
+ qp->ibqp.qp_num = qp->mqp.qpn;
+ qp->xrcdn = xrcdn;
+
+ break;
+ }
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ {
+ /* Userspace is not allowed to create special QPs: */
+ if (udata)
+ return ERR_PTR(-EINVAL);
+
+ err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata,
+ get_sqp_num(to_mdev(pd->device), init_attr),
+ &qp, gfp);
+ if (err)
+ return ERR_PTR(err);
+
+ qp->port = init_attr->port_num;
+ qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1;
+
+ break;
+ }
+ default:
+ /* Don't support raw QPs */
+ return ERR_PTR(-EINVAL);
+ }
+
+ return &qp->ibqp;
+}
+
+int mlx4_ib_destroy_qp(struct ib_qp *qp)
+{
+ struct mlx4_ib_dev *dev = to_mdev(qp->device);
+ struct mlx4_ib_qp *mqp = to_mqp(qp);
+ struct mlx4_ib_pd *pd;
+
+ if (is_qp0(dev, mqp))
+ mlx4_CLOSE_PORT(dev->dev, mqp->port);
+
+ if (dev->qp1_proxy[mqp->port - 1] == mqp) {
+ mutex_lock(&dev->qp1_proxy_lock[mqp->port - 1]);
+ dev->qp1_proxy[mqp->port - 1] = NULL;
+ mutex_unlock(&dev->qp1_proxy_lock[mqp->port - 1]);
+ }
+
+ pd = get_pd(mqp);
+ destroy_qp_common(dev, mqp, !!pd->ibpd.uobject);
+
+ if (is_sqp(dev, mqp))
+ kfree(to_msqp(mqp));
+ else
+ kfree(mqp);
+
+ return 0;
+}
+
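+/*
+ * Map a mlx4_ib QP type to the firmware service type.  Proxy and tunnel
+ * types are only valid in multi-function (SR-IOV) mode; -1 is returned
+ * for unsupported combinations.
+ */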
+static int to_mlx4_st(struct mlx4_ib_dev *dev, enum mlx4_ib_qp_type type)
+{
+ switch (type) {
+ case MLX4_IB_QPT_RC: return MLX4_QP_ST_RC;
+ case MLX4_IB_QPT_UC: return MLX4_QP_ST_UC;
+ case MLX4_IB_QPT_UD: return MLX4_QP_ST_UD;
+ case MLX4_IB_QPT_XRC_INI:
+ case MLX4_IB_QPT_XRC_TGT: return MLX4_QP_ST_XRC;
+ case MLX4_IB_QPT_SMI:
+ case MLX4_IB_QPT_GSI:
+ case MLX4_IB_QPT_RAW_PACKET: return MLX4_QP_ST_MLX;
+
+ case MLX4_IB_QPT_PROXY_SMI_OWNER:
+ case MLX4_IB_QPT_TUN_SMI_OWNER: return (mlx4_is_mfunc(dev->dev) ?
+ MLX4_QP_ST_MLX : -1);
+ case MLX4_IB_QPT_PROXY_SMI:
+ case MLX4_IB_QPT_TUN_SMI:
+ case MLX4_IB_QPT_PROXY_GSI:
+ case MLX4_IB_QPT_TUN_GSI: return (mlx4_is_mfunc(dev->dev) ?
+ MLX4_QP_ST_UD : -1);
+ default: return -1;
+ }
+}
+
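+/*
+ * Build the RWE/RRE/RAE bits of the QP context from the requested access
+ * flags, falling back to the QP's current values for attributes not being
+ * modified.  With a zero responder depth only remote writes are kept.
+ */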
+static __be32 to_mlx4_access_flags(struct mlx4_ib_qp *qp, const struct ib_qp_attr *attr,
+ int attr_mask)
+{
+ u8 dest_rd_atomic;
+ u32 access_flags;
+ u32 hw_access_flags = 0;
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+ dest_rd_atomic = attr->max_dest_rd_atomic;
+ else
+ dest_rd_atomic = qp->resp_depth;
+
+ if (attr_mask & IB_QP_ACCESS_FLAGS)
+ access_flags = attr->qp_access_flags;
+ else
+ access_flags = qp->atomic_rd_en;
+
+ if (!dest_rd_atomic)
+ access_flags &= IB_ACCESS_REMOTE_WRITE;
+
+ if (access_flags & IB_ACCESS_REMOTE_READ)
+ hw_access_flags |= MLX4_QP_BIT_RRE;
+ if (access_flags & IB_ACCESS_REMOTE_ATOMIC)
+ hw_access_flags |= MLX4_QP_BIT_RAE;
+ if (access_flags & IB_ACCESS_REMOTE_WRITE)
+ hw_access_flags |= MLX4_QP_BIT_RWE;
+
+ return cpu_to_be32(hw_access_flags);
+}
+
+static void store_sqp_attrs(struct mlx4_ib_sqp *sqp, const struct ib_qp_attr *attr,
+ int attr_mask)
+{
+ if (attr_mask & IB_QP_PKEY_INDEX)
+ sqp->pkey_index = attr->pkey_index;
+ if (attr_mask & IB_QP_QKEY)
+ sqp->qkey = attr->qkey;
+ if (attr_mask & IB_QP_SQ_PSN)
+ sqp->send_psn = attr->sq_psn;
+}
+
+static void mlx4_set_sched(struct mlx4_qp_path *path, u8 port)
+{
+ path->sched_queue = (path->sched_queue & 0xbf) | ((port - 1) << 6);
+}
+
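+/*
+ * Fill a hardware address path from an address handle.  On Ethernet
+ * (RoCE) ports a GRH is mandatory, and any VLAN or source MAC that is
+ * needed is registered here as a candidate which the modify-QP path
+ * later commits or rolls back.
+ */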
+static int _mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_ah_attr *ah,
+ u64 smac, u16 vlan_tag, struct mlx4_qp_path *path,
+ struct mlx4_roce_smac_vlan_info *smac_info, u8 port)
+{
+ int is_eth = rdma_port_get_link_layer(&dev->ib_dev, port) ==
+ IB_LINK_LAYER_ETHERNET;
+ int vidx;
+ int smac_index;
+ int err;
+
+ path->grh_mylmc = ah->src_path_bits & 0x7f;
+ path->rlid = cpu_to_be16(ah->dlid);
+ if (ah->static_rate) {
+ path->static_rate = ah->static_rate + MLX4_STAT_RATE_OFFSET;
+ while (path->static_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET &&
+ !(1 << path->static_rate & dev->dev->caps.stat_rate_support))
+ --path->static_rate;
+ } else
+ path->static_rate = 0;
+
+ if (ah->ah_flags & IB_AH_GRH) {
+ if (ah->grh.sgid_index >= dev->dev->caps.gid_table_len[port]) {
+ pr_err("sgid_index (%u) too large. max is %d\n",
+ ah->grh.sgid_index, dev->dev->caps.gid_table_len[port] - 1);
+ return -1;
+ }
+
+ path->grh_mylmc |= 1 << 7;
+ path->mgid_index = ah->grh.sgid_index;
+ path->hop_limit = ah->grh.hop_limit;
+ path->tclass_flowlabel =
+ cpu_to_be32((ah->grh.traffic_class << 20) |
+ (ah->grh.flow_label));
+ memcpy(path->rgid, ah->grh.dgid.raw, 16);
+ }
+
+ if (is_eth) {
+ if (!(ah->ah_flags & IB_AH_GRH))
+ return -1;
+
+ path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
+ ((port - 1) << 6) | ((ah->sl & 7) << 3);
+
+ path->feup |= MLX4_FEUP_FORCE_ETH_UP;
+ if (vlan_tag < 0x1000) {
+ if (smac_info->vid < 0x1000) {
+ /* both valid vlan ids */
+ if (smac_info->vid != vlan_tag) {
+ /* different VIDs. unreg old and reg new */
+ err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx);
+ if (err)
+ return err;
+ smac_info->candidate_vid = vlan_tag;
+ smac_info->candidate_vlan_index = vidx;
+ smac_info->candidate_vlan_port = port;
+ smac_info->update_vid = 1;
+ path->vlan_index = vidx;
+ } else {
+ path->vlan_index = smac_info->vlan_index;
+ }
+ } else {
+ /* no current vlan tag in qp */
+ err = mlx4_register_vlan(dev->dev, port, vlan_tag, &vidx);
+ if (err)
+ return err;
+ smac_info->candidate_vid = vlan_tag;
+ smac_info->candidate_vlan_index = vidx;
+ smac_info->candidate_vlan_port = port;
+ smac_info->update_vid = 1;
+ path->vlan_index = vidx;
+ }
+ path->feup |= MLX4_FVL_FORCE_ETH_VLAN;
+ path->fl = 1 << 6;
+ } else {
+ /* have current vlan tag. unregister it at modify-qp success */
+ if (smac_info->vid < 0x1000) {
+ smac_info->candidate_vid = 0xFFFF;
+ smac_info->update_vid = 1;
+ }
+ }
+
+ /* get smac_index for RoCE use.
+ * If no smac was yet assigned, register one.
+ * If one was already assigned, but the new mac differs,
+ * unregister the old one and register the new one.
+ */
+ if ((!smac_info->smac && !smac_info->smac_port) ||
+ smac_info->smac != smac) {
+ /* register candidate now, unreg if needed, after success */
+ smac_index = mlx4_register_mac(dev->dev, port, smac);
+ if (smac_index >= 0) {
+ smac_info->candidate_smac_index = smac_index;
+ smac_info->candidate_smac = smac;
+ smac_info->candidate_smac_port = port;
+ } else {
+ return -EINVAL;
+ }
+ } else {
+ smac_index = smac_info->smac_index;
+ }
+
+ memcpy(path->dmac, ah->dmac, 6);
+ path->ackto = MLX4_IB_LINK_TYPE_ETH;
+ /* put MAC table smac index for IBoE */
+ path->grh_mylmc = (u8) (smac_index) | 0x80;
+ } else {
+ path->sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE |
+ ((port - 1) << 6) | ((ah->sl & 0xf) << 2);
+ }
+
+ return 0;
+}
+
+static int mlx4_set_path(struct mlx4_ib_dev *dev, const struct ib_qp_attr *qp,
+ enum ib_qp_attr_mask qp_attr_mask,
+ struct mlx4_ib_qp *mqp,
+ struct mlx4_qp_path *path, u8 port)
+{
+ return _mlx4_set_path(dev, &qp->ah_attr,
+ mlx4_mac_to_u64((u8 *)qp->smac),
+ (qp_attr_mask & IB_QP_VID) ? qp->vlan_id : 0xffff,
+ path, &mqp->pri, port);
+}
+
+static int mlx4_set_alt_path(struct mlx4_ib_dev *dev,
+ const struct ib_qp_attr *qp,
+ enum ib_qp_attr_mask qp_attr_mask,
+ struct mlx4_ib_qp *mqp,
+ struct mlx4_qp_path *path, u8 port)
+{
+ return _mlx4_set_path(dev, &qp->alt_ah_attr,
+ mlx4_mac_to_u64((u8 *)qp->alt_smac),
+ (qp_attr_mask & IB_QP_ALT_VID) ?
+ qp->alt_vlan_id : 0xffff,
+ path, &mqp->alt, port);
+}
+
+static void update_mcg_macs(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp)
+{
+ struct mlx4_ib_gid_entry *ge, *tmp;
+
+ list_for_each_entry_safe(ge, tmp, &qp->gid_list, list) {
+ if (!ge->added && mlx4_ib_add_mc(dev, qp, &ge->gid)) {
+ ge->added = 1;
+ ge->port = qp->port;
+ }
+ }
+}
+
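+/*
+ * For RoCE UD QPs: if the QP has no source MAC registered yet, register
+ * the port's current MAC and record the resulting index as a candidate
+ * in the QP context.
+ */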
+static int handle_eth_ud_smac_index(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp, u8 *smac,
+ struct mlx4_qp_context *context)
+{
+ u64 u64_mac;
+ int smac_index;
+
+ u64_mac = atomic64_read(&dev->iboe.mac[qp->port - 1]);
+
+ context->pri_path.sched_queue = MLX4_IB_DEFAULT_SCHED_QUEUE | ((qp->port - 1) << 6);
+ if (!qp->pri.smac && !qp->pri.smac_port) {
+ smac_index = mlx4_register_mac(dev->dev, qp->port, u64_mac);
+ if (smac_index >= 0) {
+ qp->pri.candidate_smac_index = smac_index;
+ qp->pri.candidate_smac = u64_mac;
+ qp->pri.candidate_smac_port = qp->port;
+ context->pri_path.grh_mylmc = 0x80 | (u8) smac_index;
+ } else {
+ return -ENOENT;
+ }
+ }
+ return 0;
+}
+
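+/*
+ * Translate the verbs attributes into a mlx4 QP context and issue the
+ * firmware modify command for the requested state transition.  Once the
+ * command completes, candidate MAC/VLAN registrations made while building
+ * the address paths are either committed or unwound, depending on the
+ * result.
+ */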
+static int __mlx4_ib_modify_qp(struct ib_qp *ibqp,
+ const struct ib_qp_attr *attr, int attr_mask,
+ enum ib_qp_state cur_state, enum ib_qp_state new_state)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
+ struct mlx4_ib_qp *qp = to_mqp(ibqp);
+ struct mlx4_ib_pd *pd;
+ struct mlx4_ib_cq *send_cq, *recv_cq;
+ struct mlx4_qp_context *context;
+ enum mlx4_qp_optpar optpar = 0;
+ int sqd_event;
+ int steer_qp = 0;
+ int err = -EINVAL;
+
+ /* APM is not supported under RoCE */
+ if (attr_mask & IB_QP_ALT_PATH &&
+ rdma_port_get_link_layer(&dev->ib_dev, qp->port) ==
+ IB_LINK_LAYER_ETHERNET)
+ return -ENOTSUPP;
+
+ context = kzalloc(sizeof *context, GFP_KERNEL);
+ if (!context)
+ return -ENOMEM;
+
+ context->flags = cpu_to_be32((to_mlx4_state(new_state) << 28) |
+ (to_mlx4_st(dev, qp->mlx4_ib_qp_type) << 16));
+
+ if (!(attr_mask & IB_QP_PATH_MIG_STATE))
+ context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
+ else {
+ optpar |= MLX4_QP_OPTPAR_PM_STATE;
+ switch (attr->path_mig_state) {
+ case IB_MIG_MIGRATED:
+ context->flags |= cpu_to_be32(MLX4_QP_PM_MIGRATED << 11);
+ break;
+ case IB_MIG_REARM:
+ context->flags |= cpu_to_be32(MLX4_QP_PM_REARM << 11);
+ break;
+ case IB_MIG_ARMED:
+ context->flags |= cpu_to_be32(MLX4_QP_PM_ARMED << 11);
+ break;
+ }
+ }
+
+ if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI)
+ context->mtu_msgmax = (IB_MTU_4096 << 5) | 11;
+ else if (ibqp->qp_type == IB_QPT_RAW_PACKET)
+ context->mtu_msgmax = (MLX4_RAW_QP_MTU << 5) | MLX4_RAW_QP_MSGMAX;
+ else if (ibqp->qp_type == IB_QPT_UD) {
+ if (qp->flags & MLX4_IB_QP_LSO)
+ context->mtu_msgmax = (IB_MTU_4096 << 5) |
+ ilog2(dev->dev->caps.max_gso_sz);
+ else
+ context->mtu_msgmax = (IB_MTU_4096 << 5) | 12;
+ } else if (attr_mask & IB_QP_PATH_MTU) {
+ if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_4096) {
+ pr_err("path MTU (%u) is invalid\n",
+ attr->path_mtu);
+ goto out;
+ }
+ context->mtu_msgmax = (attr->path_mtu << 5) |
+ ilog2(dev->dev->caps.max_msg_sz);
+ }
+
+ if (qp->rq.wqe_cnt)
+ context->rq_size_stride = ilog2(qp->rq.wqe_cnt) << 3;
+ context->rq_size_stride |= qp->rq.wqe_shift - 4;
+
+ if (qp->sq.wqe_cnt)
+ context->sq_size_stride = ilog2(qp->sq.wqe_cnt) << 3;
+ context->sq_size_stride |= qp->sq.wqe_shift - 4;
+
+ if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
+ context->sq_size_stride |= !!qp->sq_no_prefetch << 7;
+ context->xrcd = cpu_to_be32((u32) qp->xrcdn);
+ if (ibqp->qp_type == IB_QPT_RAW_PACKET)
+ context->param3 |= cpu_to_be32(1 << 30);
+ }
+
+ if (qp->ibqp.uobject)
+ context->usr_page = cpu_to_be32(to_mucontext(ibqp->uobject->context)->uar.index);
+ else
+ context->usr_page = cpu_to_be32(dev->priv_uar.index);
+
+ if (attr_mask & IB_QP_DEST_QPN)
+ context->remote_qpn = cpu_to_be32(attr->dest_qp_num);
+
+ if (attr_mask & IB_QP_PORT) {
+ if (cur_state == IB_QPS_SQD && new_state == IB_QPS_SQD &&
+ !(attr_mask & IB_QP_AV)) {
+ mlx4_set_sched(&context->pri_path, attr->port_num);
+ optpar |= MLX4_QP_OPTPAR_SCHED_QUEUE;
+ }
+ }
+
+ if (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
+ if (dev->counters[qp->port - 1] != -1) {
+ context->pri_path.counter_index =
+ dev->counters[qp->port - 1];
+ optpar |= MLX4_QP_OPTPAR_COUNTER_INDEX;
+ } else
+ context->pri_path.counter_index = 0xff;
+
+ if (qp->flags & MLX4_IB_QP_NETIF) {
+ mlx4_ib_steer_qp_reg(dev, qp, 1);
+ steer_qp = 1;
+ }
+ }
+
+ if (attr_mask & IB_QP_PKEY_INDEX) {
+ if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV)
+ context->pri_path.disable_pkey_check = 0x40;
+ context->pri_path.pkey_index = attr->pkey_index;
+ optpar |= MLX4_QP_OPTPAR_PKEY_INDEX;
+ }
+
+ if (attr_mask & IB_QP_AV) {
+ if (mlx4_set_path(dev, attr, attr_mask, qp, &context->pri_path,
+ attr_mask & IB_QP_PORT ?
+ attr->port_num : qp->port))
+ goto out;
+
+ optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH |
+ MLX4_QP_OPTPAR_SCHED_QUEUE);
+ }
+
+ if (attr_mask & IB_QP_TIMEOUT) {
+ context->pri_path.ackto |= attr->timeout << 3;
+ optpar |= MLX4_QP_OPTPAR_ACK_TIMEOUT;
+ }
+
+ if (attr_mask & IB_QP_ALT_PATH) {
+ if (attr->alt_port_num == 0 ||
+ attr->alt_port_num > dev->dev->caps.num_ports)
+ goto out;
+
+ if (attr->alt_pkey_index >=
+ dev->dev->caps.pkey_table_len[attr->alt_port_num])
+ goto out;
+
+ if (mlx4_set_alt_path(dev, attr, attr_mask, qp,
+ &context->alt_path,
+ attr->alt_port_num))
+ goto out;
+
+ context->alt_path.pkey_index = attr->alt_pkey_index;
+ context->alt_path.ackto = attr->alt_timeout << 3;
+ optpar |= MLX4_QP_OPTPAR_ALT_ADDR_PATH;
+ }
+
+ pd = get_pd(qp);
+ get_cqs(qp, &send_cq, &recv_cq);
+ context->pd = cpu_to_be32(pd->pdn);
+ context->cqn_send = cpu_to_be32(send_cq->mcq.cqn);
+ context->cqn_recv = cpu_to_be32(recv_cq->mcq.cqn);
+ context->params1 = cpu_to_be32(MLX4_IB_ACK_REQ_FREQ << 28);
+
+ /* Set "fast registration enabled" for all kernel QPs */
+ if (!qp->ibqp.uobject)
+ context->params1 |= cpu_to_be32(1 << 11);
+
+ if (attr_mask & IB_QP_RNR_RETRY) {
+ context->params1 |= cpu_to_be32(attr->rnr_retry << 13);
+ optpar |= MLX4_QP_OPTPAR_RNR_RETRY;
+ }
+
+ if (attr_mask & IB_QP_RETRY_CNT) {
+ context->params1 |= cpu_to_be32(attr->retry_cnt << 16);
+ optpar |= MLX4_QP_OPTPAR_RETRY_COUNT;
+ }
+
+ if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
+ if (attr->max_rd_atomic)
+ context->params1 |=
+ cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21);
+ optpar |= MLX4_QP_OPTPAR_SRA_MAX;
+ }
+
+ if (attr_mask & IB_QP_SQ_PSN)
+ context->next_send_psn = cpu_to_be32(attr->sq_psn);
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+ if (attr->max_dest_rd_atomic)
+ context->params2 |=
+ cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21);
+ optpar |= MLX4_QP_OPTPAR_RRA_MAX;
+ }
+
+ if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) {
+ context->params2 |= to_mlx4_access_flags(qp, attr, attr_mask);
+ optpar |= MLX4_QP_OPTPAR_RWE | MLX4_QP_OPTPAR_RRE | MLX4_QP_OPTPAR_RAE;
+ }
+
+ if (ibqp->srq)
+ context->params2 |= cpu_to_be32(MLX4_QP_BIT_RIC);
+
+ if (attr_mask & IB_QP_MIN_RNR_TIMER) {
+ context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
+ optpar |= MLX4_QP_OPTPAR_RNR_TIMEOUT;
+ }
+ if (attr_mask & IB_QP_RQ_PSN)
+ context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);
+
+ /* proxy and tunnel qp qkeys will be changed in modify-qp wrappers */
+ if (attr_mask & IB_QP_QKEY) {
+ if (qp->mlx4_ib_qp_type &
+ (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER))
+ context->qkey = cpu_to_be32(IB_QP_SET_QKEY);
+ else {
+ if (mlx4_is_mfunc(dev->dev) &&
+ !(qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV) &&
+ (attr->qkey & MLX4_RESERVED_QKEY_MASK) ==
+ MLX4_RESERVED_QKEY_BASE) {
+ pr_err("Cannot use reserved QKEY"
+ " 0x%x (range 0xffff0000..0xffffffff"
+ " is reserved)\n", attr->qkey);
+ err = -EINVAL;
+ goto out;
+ }
+ context->qkey = cpu_to_be32(attr->qkey);
+ }
+ optpar |= MLX4_QP_OPTPAR_Q_KEY;
+ }
+
+ if (ibqp->srq)
+ context->srqn = cpu_to_be32(1 << 24 | to_msrq(ibqp->srq)->msrq.srqn);
+
+ if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+ context->db_rec_addr = cpu_to_be64(qp->db.dma);
+
+ if (cur_state == IB_QPS_INIT &&
+ new_state == IB_QPS_RTR &&
+ (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI ||
+ ibqp->qp_type == IB_QPT_UD ||
+ ibqp->qp_type == IB_QPT_RAW_PACKET)) {
+ context->pri_path.sched_queue = (qp->port - 1) << 6;
+ if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI ||
+ qp->mlx4_ib_qp_type &
+ (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER)) {
+ context->pri_path.sched_queue |= MLX4_IB_DEFAULT_QP0_SCHED_QUEUE;
+ if (qp->mlx4_ib_qp_type != MLX4_IB_QPT_SMI)
+ context->pri_path.fl = 0x80;
+ } else {
+ if (qp->mlx4_ib_qp_type & MLX4_IB_QPT_ANY_SRIOV)
+ context->pri_path.fl = 0x80;
+ context->pri_path.sched_queue |= MLX4_IB_DEFAULT_SCHED_QUEUE;
+ }
+ if (rdma_port_get_link_layer(&dev->ib_dev, qp->port) ==
+ IB_LINK_LAYER_ETHERNET) {
+ if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI ||
+ qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI)
+ context->pri_path.feup = 1 << 7; /* don't fsm */
+ /* handle smac_index */
+ if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_UD ||
+ qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI ||
+ qp->mlx4_ib_qp_type == MLX4_IB_QPT_TUN_GSI) {
+ err = handle_eth_ud_smac_index(dev, qp, (u8 *)attr->smac, context);
+ if (err) {
+ err = -EINVAL;
+ goto out;
+ }
+ if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_GSI)
+ dev->qp1_proxy[qp->port - 1] = qp;
+ }
+ }
+ }
+
+ if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) {
+ context->pri_path.ackto = (context->pri_path.ackto & 0xf8) |
+ MLX4_IB_LINK_TYPE_ETH;
+ if (dev->dev->caps.tunnel_offload_mode == MLX4_TUNNEL_OFFLOAD_MODE_VXLAN) {
+ /* set QP to receive both tunneled & non-tunneled packets */
+ if (!(context->flags & cpu_to_be32(1 << MLX4_RSS_QPC_FLAG_OFFSET)))
+ context->srqn = cpu_to_be32(7 << 28);
+ }
+ }
+
+ if (ibqp->qp_type == IB_QPT_UD && (new_state == IB_QPS_RTR)) {
+ int is_eth = rdma_port_get_link_layer(
+ &dev->ib_dev, qp->port) ==
+ IB_LINK_LAYER_ETHERNET;
+ if (is_eth) {
+ context->pri_path.ackto = MLX4_IB_LINK_TYPE_ETH;
+ optpar |= MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH;
+ }
+ }
+
+ if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD &&
+ attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify)
+ sqd_event = 1;
+ else
+ sqd_event = 0;
+
+ if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+ context->rlkey |= (1 << 4);
+
+ /*
+ * Before passing a kernel QP to the HW, make sure that the
+ * ownership bits of the send queue are set and the SQ
+ * headroom is stamped so that the hardware doesn't start
+ * processing stale work requests.
+ */
+ if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
+ struct mlx4_wqe_ctrl_seg *ctrl;
+ int i;
+
+ for (i = 0; i < qp->sq.wqe_cnt; ++i) {
+ ctrl = get_send_wqe(qp, i);
+ ctrl->owner_opcode = cpu_to_be32(1 << 31);
+ if (qp->sq_max_wqes_per_wr == 1)
+ ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);
+
+ stamp_send_wqe(qp, i, 1 << qp->sq.wqe_shift);
+ }
+ }
+
+ err = mlx4_qp_modify(dev->dev, &qp->mtt, to_mlx4_state(cur_state),
+ to_mlx4_state(new_state), context, optpar,
+ sqd_event, &qp->mqp);
+ if (err)
+ goto out;
+
+ qp->state = new_state;
+
+ if (attr_mask & IB_QP_ACCESS_FLAGS)
+ qp->atomic_rd_en = attr->qp_access_flags;
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+ qp->resp_depth = attr->max_dest_rd_atomic;
+ if (attr_mask & IB_QP_PORT) {
+ qp->port = attr->port_num;
+ update_mcg_macs(dev, qp);
+ }
+ if (attr_mask & IB_QP_ALT_PATH)
+ qp->alt_port = attr->alt_port_num;
+
+ if (is_sqp(dev, qp))
+ store_sqp_attrs(to_msqp(qp), attr, attr_mask);
+
+ /*
+ * If we moved QP0 to RTR, bring the IB link up; if we moved
+ * QP0 to RESET or ERROR, bring the link back down.
+ */
+ if (is_qp0(dev, qp)) {
+ if (cur_state != IB_QPS_RTR && new_state == IB_QPS_RTR)
+ if (mlx4_INIT_PORT(dev->dev, qp->port))
+ pr_warn("INIT_PORT failed for port %d\n",
+ qp->port);
+
+ if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR &&
+ (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR))
+ mlx4_CLOSE_PORT(dev->dev, qp->port);
+ }
+
+ /*
+ * If we moved a kernel QP to RESET, clean up all old CQ
+ * entries and reinitialize the QP.
+ */
+ if (new_state == IB_QPS_RESET) {
+ if (!ibqp->uobject) {
+ mlx4_ib_cq_clean(recv_cq, qp->mqp.qpn,
+ ibqp->srq ? to_msrq(ibqp->srq) : NULL);
+ if (send_cq != recv_cq)
+ mlx4_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
+
+ qp->rq.head = 0;
+ qp->rq.tail = 0;
+ qp->sq.head = 0;
+ qp->sq.tail = 0;
+ qp->sq_next_wqe = 0;
+ if (qp->rq.wqe_cnt)
+ *qp->db.db = 0;
+
+ if (qp->flags & MLX4_IB_QP_NETIF)
+ mlx4_ib_steer_qp_reg(dev, qp, 0);
+ }
+ if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port)) {
+ mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac);
+ qp->pri.smac = 0;
+ qp->pri.smac_port = 0;
+ }
+ if (qp->alt.smac) {
+ mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac);
+ qp->alt.smac = 0;
+ }
+ if (qp->pri.vid < 0x1000) {
+ mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port, qp->pri.vid);
+ qp->pri.vid = 0xFFFF;
+ qp->pri.candidate_vid = 0xFFFF;
+ qp->pri.update_vid = 0;
+ }
+
+ if (qp->alt.vid < 0x1000) {
+ mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port, qp->alt.vid);
+ qp->alt.vid = 0xFFFF;
+ qp->alt.candidate_vid = 0xFFFF;
+ qp->alt.update_vid = 0;
+ }
+ }
+out:
+ if (err && steer_qp)
+ mlx4_ib_steer_qp_reg(dev, qp, 0);
+ kfree(context);
+ if (qp->pri.candidate_smac ||
+ (!qp->pri.candidate_smac && qp->pri.candidate_smac_port)) {
+ if (err) {
+ mlx4_unregister_mac(dev->dev, qp->pri.candidate_smac_port, qp->pri.candidate_smac);
+ } else {
+ if (qp->pri.smac || (!qp->pri.smac && qp->pri.smac_port))
+ mlx4_unregister_mac(dev->dev, qp->pri.smac_port, qp->pri.smac);
+ qp->pri.smac = qp->pri.candidate_smac;
+ qp->pri.smac_index = qp->pri.candidate_smac_index;
+ qp->pri.smac_port = qp->pri.candidate_smac_port;
+ }
+ qp->pri.candidate_smac = 0;
+ qp->pri.candidate_smac_index = 0;
+ qp->pri.candidate_smac_port = 0;
+ }
+ if (qp->alt.candidate_smac) {
+ if (err) {
+ mlx4_unregister_mac(dev->dev, qp->alt.candidate_smac_port, qp->alt.candidate_smac);
+ } else {
+ if (qp->alt.smac)
+ mlx4_unregister_mac(dev->dev, qp->alt.smac_port, qp->alt.smac);
+ qp->alt.smac = qp->alt.candidate_smac;
+ qp->alt.smac_index = qp->alt.candidate_smac_index;
+ qp->alt.smac_port = qp->alt.candidate_smac_port;
+ }
+ qp->alt.candidate_smac = 0;
+ qp->alt.candidate_smac_index = 0;
+ qp->alt.candidate_smac_port = 0;
+ }
+
+ if (qp->pri.update_vid) {
+ if (err) {
+ if (qp->pri.candidate_vid < 0x1000)
+ mlx4_unregister_vlan(dev->dev, qp->pri.candidate_vlan_port,
+ qp->pri.candidate_vid);
+ } else {
+ if (qp->pri.vid < 0x1000)
+ mlx4_unregister_vlan(dev->dev, qp->pri.vlan_port,
+ qp->pri.vid);
+ qp->pri.vid = qp->pri.candidate_vid;
+ qp->pri.vlan_port = qp->pri.candidate_vlan_port;
+ qp->pri.vlan_index = qp->pri.candidate_vlan_index;
+ }
+ qp->pri.candidate_vid = 0xFFFF;
+ qp->pri.update_vid = 0;
+ }
+
+ if (qp->alt.update_vid) {
+ if (err) {
+ if (qp->alt.candidate_vid < 0x1000)
+ mlx4_unregister_vlan(dev->dev, qp->alt.candidate_vlan_port,
+ qp->alt.candidate_vid);
+ } else {
+ if (qp->alt.vid < 0x1000)
+ mlx4_unregister_vlan(dev->dev, qp->alt.vlan_port,
+ qp->alt.vid);
+ qp->alt.vid = qp->alt.candidate_vid;
+ qp->alt.vlan_port = qp->alt.candidate_vlan_port;
+ qp->alt.vlan_index = qp->alt.candidate_vlan_index;
+ }
+ qp->alt.candidate_vid = 0xFFFF;
+ qp->alt.update_vid = 0;
+ }
+
+ return err;
+}
+
+int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
+ struct mlx4_ib_qp *qp = to_mqp(ibqp);
+ enum ib_qp_state cur_state, new_state;
+ int err = -EINVAL;
+ int ll;
+
+ mutex_lock(&qp->mutex);
+
+ cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
+ new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+ if (cur_state == new_state && cur_state == IB_QPS_RESET) {
+ ll = IB_LINK_LAYER_UNSPECIFIED;
+ } else {
+ int port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
+ ll = rdma_port_get_link_layer(&dev->ib_dev, port);
+ }
+
+ if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
+ attr_mask, ll)) {
+ pr_debug("qpn 0x%x: invalid attribute mask specified "
+ "for transition %d to %d. qp_type %d,"
+ " attr_mask 0x%x\n",
+ ibqp->qp_num, cur_state, new_state,
+ ibqp->qp_type, attr_mask);
+ goto out;
+ }
+
+ if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT)) {
+ if ((cur_state == IB_QPS_RESET) && (new_state == IB_QPS_INIT)) {
+ if ((ibqp->qp_type == IB_QPT_RC) ||
+ (ibqp->qp_type == IB_QPT_UD) ||
+ (ibqp->qp_type == IB_QPT_UC) ||
+ (ibqp->qp_type == IB_QPT_RAW_PACKET) ||
+ (ibqp->qp_type == IB_QPT_XRC_INI)) {
+ attr->port_num = mlx4_ib_bond_next_port(dev);
+ }
+ } else {
+ /* no sense in changing port_num
+ * when ports are bonded */
+ attr_mask &= ~IB_QP_PORT;
+ }
+ }
+
+ if ((attr_mask & IB_QP_PORT) &&
+ (attr->port_num == 0 || attr->port_num > dev->num_ports)) {
+ pr_debug("qpn 0x%x: invalid port number (%d) specified "
+ "for transition %d to %d. qp_type %d\n",
+ ibqp->qp_num, attr->port_num, cur_state,
+ new_state, ibqp->qp_type);
+ goto out;
+ }
+
+ if ((attr_mask & IB_QP_PORT) && (ibqp->qp_type == IB_QPT_RAW_PACKET) &&
+ (rdma_port_get_link_layer(&dev->ib_dev, attr->port_num) !=
+ IB_LINK_LAYER_ETHERNET))
+ goto out;
+
+ if (attr_mask & IB_QP_PKEY_INDEX) {
+ int p = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
+ if (attr->pkey_index >= dev->dev->caps.pkey_table_len[p]) {
+ pr_debug("qpn 0x%x: invalid pkey index (%d) specified "
+ "for transition %d to %d. qp_type %d\n",
+ ibqp->qp_num, attr->pkey_index, cur_state,
+ new_state, ibqp->qp_type);
+ goto out;
+ }
+ }
+
+ if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
+ attr->max_rd_atomic > dev->dev->caps.max_qp_init_rdma) {
+ pr_debug("qpn 0x%x: max_rd_atomic (%d) too large. "
+ "Transition %d to %d. qp_type %d\n",
+ ibqp->qp_num, attr->max_rd_atomic, cur_state,
+ new_state, ibqp->qp_type);
+ goto out;
+ }
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
+ attr->max_dest_rd_atomic > dev->dev->caps.max_qp_dest_rdma) {
+ pr_debug("qpn 0x%x: max_dest_rd_atomic (%d) too large. "
+ "Transition %d to %d. qp_type %d\n",
+ ibqp->qp_num, attr->max_dest_rd_atomic, cur_state,
+ new_state, ibqp->qp_type);
+ goto out;
+ }
+
+ if (cur_state == new_state && cur_state == IB_QPS_RESET) {
+ err = 0;
+ goto out;
+ }
+
+ err = __mlx4_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
+
+ if (mlx4_is_bonded(dev->dev) && (attr_mask & IB_QP_PORT))
+ attr->port_num = 1;
+
+out:
+ mutex_unlock(&qp->mutex);
+ return err;
+}
+
+static int vf_get_qp0_qkey(struct mlx4_dev *dev, int qpn, u32 *qkey)
+{
+ int i;
+ for (i = 0; i < dev->caps.num_ports; i++) {
+ if (qpn == dev->caps.qp0_proxy[i] ||
+ qpn == dev->caps.qp0_tunnel[i]) {
+ *qkey = dev->caps.qp0_qkey[i];
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
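+/*
+ * Build the UD header for QP0 packets sent through the SR-IOV
+ * proxy/tunnel special QPs: a plain LRH/BTH/DETH send, forced into
+ * loopback and addressed to the peer tunnel QP0 (or the WR's remote QP
+ * for the tunnel owner), packed into the WQE as inline data.
+ */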
+static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp,
+ struct ib_send_wr *wr,
+ void *wqe, unsigned *mlx_seg_len)
+{
+ struct mlx4_ib_dev *mdev = to_mdev(sqp->qp.ibqp.device);
+ struct ib_device *ib_dev = &mdev->ib_dev;
+ struct mlx4_wqe_mlx_seg *mlx = wqe;
+ struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
+ struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
+ u16 pkey;
+ u32 qkey;
+ int send_size;
+ int header_size;
+ int spc;
+ int i;
+
+ if (wr->opcode != IB_WR_SEND)
+ return -EINVAL;
+
+ send_size = 0;
+
+ for (i = 0; i < wr->num_sge; ++i)
+ send_size += wr->sg_list[i].length;
+
+ /* for proxy-qp0 sends, need to add in size of tunnel header */
+ /* for tunnel-qp0 sends, tunnel header is already in s/g list */
+ if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER)
+ send_size += sizeof (struct mlx4_ib_tunnel_header);
+
+ ib_ud_header_init(send_size, 1, 0, 0, 0, 0, &sqp->ud_header);
+
+ if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) {
+ sqp->ud_header.lrh.service_level =
+ be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
+ sqp->ud_header.lrh.destination_lid =
+ cpu_to_be16(ah->av.ib.g_slid & 0x7f);
+ sqp->ud_header.lrh.source_lid =
+ cpu_to_be16(ah->av.ib.g_slid & 0x7f);
+ }
+
+ mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+
+ /* force loopback */
+ mlx->flags |= cpu_to_be32(MLX4_WQE_MLX_VL15 | 0x1 | MLX4_WQE_MLX_SLR);
+ mlx->rlid = sqp->ud_header.lrh.destination_lid;
+
+ sqp->ud_header.lrh.virtual_lane = 0;
+ sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
+ ib_get_cached_pkey(ib_dev, sqp->qp.port, 0, &pkey);
+ sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
+ if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_TUN_SMI_OWNER)
+ sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+ else
+ sqp->ud_header.bth.destination_qpn =
+ cpu_to_be32(mdev->dev->caps.qp0_tunnel[sqp->qp.port - 1]);
+
+ sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
+ if (mlx4_is_master(mdev->dev)) {
+ if (mlx4_get_parav_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey))
+ return -EINVAL;
+ } else {
+ if (vf_get_qp0_qkey(mdev->dev, sqp->qp.mqp.qpn, &qkey))
+ return -EINVAL;
+ }
+ sqp->ud_header.deth.qkey = cpu_to_be32(qkey);
+ sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.mqp.qpn);
+
+ sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY;
+ sqp->ud_header.immediate_present = 0;
+
+ header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf);
+
+ /*
+ * Inline data segments may not cross a 64 byte boundary. If
+ * our UD header is bigger than the space available up to the
+ * next 64 byte boundary in the WQE, use two inline data
+ * segments to hold the UD header.
+ */
+ spc = MLX4_INLINE_ALIGN -
+ ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
+ if (header_size <= spc) {
+ inl->byte_count = cpu_to_be32(1 << 31 | header_size);
+ memcpy(inl + 1, sqp->header_buf, header_size);
+ i = 1;
+ } else {
+ inl->byte_count = cpu_to_be32(1 << 31 | spc);
+ memcpy(inl + 1, sqp->header_buf, spc);
+
+ inl = (void *) (inl + 1) + spc;
+ memcpy(inl + 1, sqp->header_buf + spc, header_size - spc);
+ /*
+ * Need a barrier here to make sure all the data is
+ * visible before the byte_count field is set.
+ * Otherwise the HCA prefetcher could grab the 64-byte
+ * chunk with this inline segment and get a valid (!=
+ * 0xffffffff) byte count but stale data, and end up
+ * generating a packet with bad headers.
+ *
+ * The first inline segment's byte_count field doesn't
+ * need a barrier, because it comes after a
+ * control/MLX segment and therefore is at an offset
+ * of 16 mod 64.
+ */
+ wmb();
+ inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc));
+ i = 2;
+ }
+
+ *mlx_seg_len =
+ ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16);
+ return 0;
+}
+
+static void mlx4_u64_to_smac(u8 *dst_mac, u64 src_mac)
+{
+ int i;
+
+ for (i = ETH_ALEN; i; i--) {
+ dst_mac[i - 1] = src_mac & 0xff;
+ src_mac >>= 8;
+ }
+}
+
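+/*
+ * Build the full UD header for an SMI/GSI send: IB LRH or Ethernet L2
+ * header (with optional VLAN), optional GRH, BTH and DETH, then pack it
+ * into the WQE as inline data, split across two inline segments when it
+ * would cross a 64-byte boundary.
+ */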
+static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_send_wr *wr,
+ void *wqe, unsigned *mlx_seg_len)
+{
+ struct ib_device *ib_dev = sqp->qp.ibqp.device;
+ struct mlx4_wqe_mlx_seg *mlx = wqe;
+ struct mlx4_wqe_ctrl_seg *ctrl = wqe;
+ struct mlx4_wqe_inline_seg *inl = wqe + sizeof *mlx;
+ struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
+ union ib_gid sgid;
+ u16 pkey;
+ int send_size;
+ int header_size;
+ int spc;
+ int i;
+ int err = 0;
+ u16 vlan = 0xffff;
+ bool is_eth;
+ bool is_vlan = false;
+ bool is_grh;
+
+ send_size = 0;
+ for (i = 0; i < wr->num_sge; ++i)
+ send_size += wr->sg_list[i].length;
+
+ is_eth = rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET;
+ is_grh = mlx4_ib_ah_grh_present(ah);
+ if (is_eth) {
+ if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
+ /* When multi-function is enabled, the ib_core gid
+ * indexes don't necessarily match the hw ones, so
+ * we must use our own cache */
+ err = mlx4_get_roce_gid_from_slave(to_mdev(ib_dev)->dev,
+ be32_to_cpu(ah->av.ib.port_pd) >> 24,
+ ah->av.ib.gid_index, &sgid.raw[0]);
+ if (err)
+ return err;
+ } else {
+ err = ib_get_cached_gid(ib_dev,
+ be32_to_cpu(ah->av.ib.port_pd) >> 24,
+ ah->av.ib.gid_index, &sgid);
+ if (err)
+ return err;
+ }
+
+ if (ah->av.eth.vlan != cpu_to_be16(0xffff)) {
+ vlan = be16_to_cpu(ah->av.eth.vlan) & 0x0fff;
+ is_vlan = 1;
+ }
+ }
+ ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header);
+
+ if (!is_eth) {
+ sqp->ud_header.lrh.service_level =
+ be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 28;
+ sqp->ud_header.lrh.destination_lid = ah->av.ib.dlid;
+ sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f);
+ }
+
+ if (is_grh) {
+ sqp->ud_header.grh.traffic_class =
+ (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff;
+ sqp->ud_header.grh.flow_label =
+ ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
+ sqp->ud_header.grh.hop_limit = ah->av.ib.hop_limit;
+ if (is_eth)
+ memcpy(sqp->ud_header.grh.source_gid.raw, sgid.raw, 16);
+ else {
+ if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
+ /* When multi-function is enabled, the ib_core gid
+ * indexes don't necessarily match the hw ones, so
+ * we must use our own cache */
+ sqp->ud_header.grh.source_gid.global.subnet_prefix =
+ to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1].
+ subnet_prefix;
+ sqp->ud_header.grh.source_gid.global.interface_id =
+ to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1].
+ guid_cache[ah->av.ib.gid_index];
+ } else
+ ib_get_cached_gid(ib_dev,
+ be32_to_cpu(ah->av.ib.port_pd) >> 24,
+ ah->av.ib.gid_index,
+ &sqp->ud_header.grh.source_gid);
+ }
+ memcpy(sqp->ud_header.grh.destination_gid.raw,
+ ah->av.ib.dgid, 16);
+ }
+
+ mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE);
+
+ if (!is_eth) {
+ mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MLX4_WQE_MLX_VL15 : 0) |
+ (sqp->ud_header.lrh.destination_lid ==
+ IB_LID_PERMISSIVE ? MLX4_WQE_MLX_SLR : 0) |
+ (sqp->ud_header.lrh.service_level << 8));
+ if (ah->av.ib.port_pd & cpu_to_be32(0x80000000))
+ mlx->flags |= cpu_to_be32(0x1); /* force loopback */
+ mlx->rlid = sqp->ud_header.lrh.destination_lid;
+ }
+
+ switch (wr->opcode) {
+ case IB_WR_SEND:
+ sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY;
+ sqp->ud_header.immediate_present = 0;
+ break;
+ case IB_WR_SEND_WITH_IMM:
+ sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+ sqp->ud_header.immediate_present = 1;
+ sqp->ud_header.immediate_data = wr->ex.imm_data;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (is_eth) {
+ struct in6_addr in6;
+
+ u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13;
+
+ mlx->sched_prio = cpu_to_be16(pcp);
+
+ memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6);
+ /* FIXME: cache smac value? */
+ memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2);
+ memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4);
+ memcpy(&in6, sgid.raw, sizeof(in6));
+
+ if (!mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
+ u64 mac = atomic64_read(&to_mdev(ib_dev)->iboe.mac[sqp->qp.port - 1]);
+ u8 smac[ETH_ALEN];
+
+ mlx4_u64_to_smac(smac, mac);
+ memcpy(sqp->ud_header.eth.smac_h, smac, ETH_ALEN);
+ } else {
+ /* use the src mac of the tunnel */
+ memcpy(sqp->ud_header.eth.smac_h, ah->av.eth.s_mac, ETH_ALEN);
+ }
+
+ if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6))
+ mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK);
+ if (!is_vlan) {
+ sqp->ud_header.eth.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE);
+ } else {
+ sqp->ud_header.vlan.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE);
+ sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp);
+ }
+ } else {
+ sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0;
+ if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE)
+ sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE;
+ }
+ sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
+ if (!sqp->qp.ibqp.qp_num)
+ ib_get_cached_pkey(ib_dev, sqp->qp.port, sqp->pkey_index, &pkey);
+ else
+ ib_get_cached_pkey(ib_dev, sqp->qp.port, wr->wr.ud.pkey_index, &pkey);
+ sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
+ sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+ sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
+ sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ?
+ sqp->qkey : wr->wr.ud.remote_qkey);
+ sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
+
+ header_size = ib_ud_header_pack(&sqp->ud_header, sqp->header_buf);
+
+ if (0) {
+ pr_err("built UD header of size %d:\n", header_size);
+ for (i = 0; i < header_size / 4; ++i) {
+ if (i % 8 == 0)
+ pr_err(" [%02x] ", i * 4);
+ pr_cont(" %08x",
+ be32_to_cpu(((__be32 *) sqp->header_buf)[i]));
+ if ((i + 1) % 8 == 0)
+ pr_cont("\n");
+ }
+ pr_err("\n");
+ }
+
+ /*
+ * Inline data segments may not cross a 64 byte boundary. If
+ * our UD header is bigger than the space available up to the
+ * next 64 byte boundary in the WQE, use two inline data
+ * segments to hold the UD header.
+ */
+ spc = MLX4_INLINE_ALIGN -
+ ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
+ if (header_size <= spc) {
+ inl->byte_count = cpu_to_be32(1 << 31 | header_size);
+ memcpy(inl + 1, sqp->header_buf, header_size);
+ i = 1;
+ } else {
+ inl->byte_count = cpu_to_be32(1 << 31 | spc);
+ memcpy(inl + 1, sqp->header_buf, spc);
+
+ inl = (void *) (inl + 1) + spc;
+ memcpy(inl + 1, sqp->header_buf + spc, header_size - spc);
+ /*
+ * Need a barrier here to make sure all the data is
+ * visible before the byte_count field is set.
+ * Otherwise the HCA prefetcher could grab the 64-byte
+ * chunk with this inline segment and get a valid (!=
+ * 0xffffffff) byte count but stale data, and end up
+ * generating a packet with bad headers.
+ *
+ * The first inline segment's byte_count field doesn't
+ * need a barrier, because it comes after a
+ * control/MLX segment and therefore is at an offset
+ * of 16 mod 64.
+ */
+ wmb();
+ inl->byte_count = cpu_to_be32(1 << 31 | (header_size - spc));
+ i = 2;
+ }
+
+ *mlx_seg_len =
+ ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + header_size, 16);
+ return 0;
+}
+
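+/*
+ * Return nonzero if posting nreq more work requests would overflow the
+ * work queue.  The fast path uses a possibly stale head/tail snapshot;
+ * the CQ lock is only taken to recheck when the queue looks full.
+ */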
+static int mlx4_wq_overflow(struct mlx4_ib_wq *wq, int nreq, struct ib_cq *ib_cq)
+{
+ unsigned cur;
+ struct mlx4_ib_cq *cq;
+
+ cur = wq->head - wq->tail;
+ if (likely(cur + nreq < wq->max_post))
+ return 0;
+
+ cq = to_mcq(ib_cq);
+ spin_lock(&cq->lock);
+ cur = wq->head - wq->tail;
+ spin_unlock(&cq->lock);
+
+ return cur + nreq >= wq->max_post;
+}
+
+static __be32 convert_access(int acc)
+{
+ return (acc & IB_ACCESS_REMOTE_ATOMIC ?
+ cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC) : 0) |
+ (acc & IB_ACCESS_REMOTE_WRITE ?
+ cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE) : 0) |
+ (acc & IB_ACCESS_REMOTE_READ ?
+ cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ) : 0) |
+ (acc & IB_ACCESS_LOCAL_WRITE ? cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_WRITE) : 0) |
+ cpu_to_be32(MLX4_WQE_FMR_PERM_LOCAL_READ);
+}
+
+static void set_fmr_seg(struct mlx4_wqe_fmr_seg *fseg, struct ib_send_wr *wr)
+{
+ struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(wr->wr.fast_reg.page_list);
+ int i;
+
+ for (i = 0; i < wr->wr.fast_reg.page_list_len; ++i)
+ mfrpl->mapped_page_list[i] =
+ cpu_to_be64(wr->wr.fast_reg.page_list->page_list[i] |
+ MLX4_MTT_FLAG_PRESENT);
+
+ fseg->flags = convert_access(wr->wr.fast_reg.access_flags);
+ fseg->mem_key = cpu_to_be32(wr->wr.fast_reg.rkey);
+ fseg->buf_list = cpu_to_be64(mfrpl->map);
+ fseg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start);
+ fseg->reg_len = cpu_to_be64(wr->wr.fast_reg.length);
+ fseg->offset = 0; /* XXX -- is this just for ZBVA? */
+ fseg->page_size = cpu_to_be32(wr->wr.fast_reg.page_shift);
+ fseg->reserved[0] = 0;
+ fseg->reserved[1] = 0;
+}
+
+static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg, struct ib_send_wr *wr)
+{
+ bseg->flags1 =
+ convert_access(wr->wr.bind_mw.bind_info.mw_access_flags) &
+ cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ |
+ MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE |
+ MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC);
+ bseg->flags2 = 0;
+ if (wr->wr.bind_mw.mw->type == IB_MW_TYPE_2)
+ bseg->flags2 |= cpu_to_be32(MLX4_WQE_BIND_TYPE_2);
+ if (wr->wr.bind_mw.bind_info.mw_access_flags & IB_ZERO_BASED)
+ bseg->flags2 |= cpu_to_be32(MLX4_WQE_BIND_ZERO_BASED);
+ bseg->new_rkey = cpu_to_be32(wr->wr.bind_mw.rkey);
+ bseg->lkey = cpu_to_be32(wr->wr.bind_mw.bind_info.mr->lkey);
+ bseg->addr = cpu_to_be64(wr->wr.bind_mw.bind_info.addr);
+ bseg->length = cpu_to_be64(wr->wr.bind_mw.bind_info.length);
+}
+
+static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey)
+{
+ memset(iseg, 0, sizeof(*iseg));
+ iseg->mem_key = cpu_to_be32(rkey);
+}
+
+static __always_inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
+ u64 remote_addr, u32 rkey)
+{
+ rseg->raddr = cpu_to_be64(remote_addr);
+ rseg->rkey = cpu_to_be32(rkey);
+ rseg->reserved = 0;
+}
+
+static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ib_send_wr *wr)
+{
+ if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+ aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap);
+ aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add);
+ } else if (wr->opcode == IB_WR_MASKED_ATOMIC_FETCH_AND_ADD) {
+ aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add);
+ aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add_mask);
+ } else {
+ aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add);
+ aseg->compare = 0;
+ }
+}
+
+static void set_masked_atomic_seg(struct mlx4_wqe_masked_atomic_seg *aseg,
+ struct ib_send_wr *wr)
+{
+ aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap);
+ aseg->swap_add_mask = cpu_to_be64(wr->wr.atomic.swap_mask);
+ aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add);
+ aseg->compare_mask = cpu_to_be64(wr->wr.atomic.compare_add_mask);
+}
+
+static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
+ struct ib_send_wr *wr)
+{
+ memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
+ dseg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+ dseg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
+ dseg->vlan = to_mah(wr->wr.ud.ah)->av.eth.vlan;
+ memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->av.eth.mac, 6);
+}
+
+static void set_tunnel_datagram_seg(struct mlx4_ib_dev *dev,
+ struct mlx4_wqe_datagram_seg *dseg,
+ struct ib_send_wr *wr,
+ enum mlx4_ib_qp_type qpt)
+{
+ union mlx4_ext_av *av = &to_mah(wr->wr.ud.ah)->av;
+ struct mlx4_av sqp_av = {0};
+ int port = *((u8 *) &av->ib.port_pd) & 0x3;
+
+ /* force loopback */
+ sqp_av.port_pd = av->ib.port_pd | cpu_to_be32(0x80000000);
+ sqp_av.g_slid = av->ib.g_slid & 0x7f; /* no GRH */
+ sqp_av.sl_tclass_flowlabel = av->ib.sl_tclass_flowlabel &
+ cpu_to_be32(0xf0000000);
+
+ memcpy(dseg->av, &sqp_av, sizeof (struct mlx4_av));
+ if (qpt == MLX4_IB_QPT_PROXY_GSI)
+ dseg->dqpn = cpu_to_be32(dev->dev->caps.qp1_tunnel[port - 1]);
+ else
+ dseg->dqpn = cpu_to_be32(dev->dev->caps.qp0_tunnel[port - 1]);
+ /* Use QKEY from the QP context, which is set by master */
+ dseg->qkey = cpu_to_be32(IB_QP_SET_QKEY);
+}
+
+static void build_tunnel_header(struct ib_send_wr *wr, void *wqe, unsigned *mlx_seg_len)
+{
+ struct mlx4_wqe_inline_seg *inl = wqe;
+ struct mlx4_ib_tunnel_header hdr;
+ struct mlx4_ib_ah *ah = to_mah(wr->wr.ud.ah);
+ int spc;
+ int i;
+
+ memcpy(&hdr.av, &ah->av, sizeof hdr.av);
+ hdr.remote_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+ hdr.pkey_index = cpu_to_be16(wr->wr.ud.pkey_index);
+ hdr.qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
+ memcpy(hdr.mac, ah->av.eth.mac, 6);
+ hdr.vlan = ah->av.eth.vlan;
+
+ spc = MLX4_INLINE_ALIGN -
+ ((unsigned long) (inl + 1) & (MLX4_INLINE_ALIGN - 1));
+ if (sizeof (hdr) <= spc) {
+ memcpy(inl + 1, &hdr, sizeof (hdr));
+ wmb();
+ inl->byte_count = cpu_to_be32(1 << 31 | sizeof (hdr));
+ i = 1;
+ } else {
+ memcpy(inl + 1, &hdr, spc);
+ wmb();
+ inl->byte_count = cpu_to_be32(1 << 31 | spc);
+
+ inl = (void *) (inl + 1) + spc;
+ memcpy(inl + 1, (void *) &hdr + spc, sizeof (hdr) - spc);
+ wmb();
+ inl->byte_count = cpu_to_be32(1 << 31 | (sizeof (hdr) - spc));
+ i = 2;
+ }
+
+ *mlx_seg_len =
+ ALIGN(i * sizeof (struct mlx4_wqe_inline_seg) + sizeof (hdr), 16);
+}
+
+static void set_mlx_icrc_seg(void *dseg)
+{
+ u32 *t = dseg;
+ struct mlx4_wqe_inline_seg *iseg = dseg;
+
+ t[1] = 0;
+
+ /*
+ * Need a barrier here before writing the byte_count field to
+ * make sure that all the data is visible before the
+ * byte_count field is set. Otherwise, if the segment begins
+ * a new cacheline, the HCA prefetcher could grab the 64-byte
+ * chunk and get a valid (!= 0xffffffff) byte count but
+ * stale data, and end up sending the wrong data.
+ */
+ wmb();
+
+ iseg->byte_count = cpu_to_be32((1 << 31) | 4);
+}
+
+static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
+{
+ dseg->lkey = cpu_to_be32(sg->lkey);
+ dseg->addr = cpu_to_be64(sg->addr);
+
+ /*
+ * Need a barrier here before writing the byte_count field to
+ * make sure that all the data is visible before the
+ * byte_count field is set. Otherwise, if the segment begins
+ * a new cacheline, the HCA prefetcher could grab the 64-byte
+ * chunk and get a valid (!= 0xffffffff) byte count but
+ * stale data, and end up sending the wrong data.
+ */
+ wmb();
+
+ dseg->byte_count = cpu_to_be32(sg->length);
+}
+
+static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
+{
+ dseg->byte_count = cpu_to_be32(sg->length);
+ dseg->lkey = cpu_to_be32(sg->lkey);
+ dseg->addr = cpu_to_be64(sg->addr);
+}
+
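+/*
+ * Copy the LSO headers from the work request into the WQE and report the
+ * 16-byte-aligned segment length; the "bigger headers" bit is set when
+ * the headers spill past one cacheline.
+ */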
+static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr,
+ struct mlx4_ib_qp *qp, unsigned *lso_seg_len,
+ __be32 *lso_hdr_sz, __be32 *blh)
+{
+ unsigned halign = ALIGN(sizeof *wqe + wr->wr.ud.hlen, 16);
+
+ if (unlikely(halign > MLX4_IB_CACHE_LINE_SIZE))
+ *blh = cpu_to_be32(1 << 6);
+
+ if (unlikely(!(qp->flags & MLX4_IB_QP_LSO) &&
+ wr->num_sge > qp->sq.max_gs - (halign >> 4)))
+ return -EINVAL;
+
+ memcpy(wqe->header, wr->wr.ud.header, wr->wr.ud.hlen);
+
+ *lso_hdr_sz = cpu_to_be32(wr->wr.ud.mss << 16 | wr->wr.ud.hlen);
+ *lso_seg_len = halign;
+ return 0;
+}
+
+static __be32 send_ieth(struct ib_send_wr *wr)
+{
+ switch (wr->opcode) {
+ case IB_WR_SEND_WITH_IMM:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ return wr->ex.imm_data;
+
+ case IB_WR_SEND_WITH_INV:
+ return cpu_to_be32(wr->ex.invalidate_rkey);
+
+ default:
+ return 0;
+ }
+}
+
+static void add_zero_len_inline(void *wqe)
+{
+ struct mlx4_wqe_inline_seg *inl = wqe;
+ memset(wqe, 0, 16);
+ inl->byte_count = cpu_to_be32(1 << 31);
+}
+
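+/*
+ * Post a chain of send work requests.  Each WQE is built segment by
+ * segment (address, atomic, datagram, LSO or MLX headers as the QP type
+ * requires), data segments are written in reverse order, and the
+ * ownership bit is set only after a write barrier so the HCA never sees
+ * a half-written descriptor.  The doorbell is rung once per chain.
+ */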
+int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr)
+{
+ struct mlx4_ib_qp *qp = to_mqp(ibqp);
+ void *wqe;
+ struct mlx4_wqe_ctrl_seg *ctrl;
+ struct mlx4_wqe_data_seg *dseg;
+ unsigned long flags;
+ int nreq;
+ int err = 0;
+ unsigned ind;
+ int uninitialized_var(stamp);
+ int uninitialized_var(size);
+ unsigned uninitialized_var(seglen);
+ __be32 dummy;
+ __be32 *lso_wqe;
+ __be32 uninitialized_var(lso_hdr_sz);
+ __be32 blh;
+ int i;
+ struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+
+ spin_lock_irqsave(&qp->sq.lock, flags);
+ if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
+ err = -EIO;
+ *bad_wr = wr;
+ nreq = 0;
+ goto out;
+ }
+
+ ind = qp->sq_next_wqe;
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ lso_wqe = &dummy;
+ blh = 0;
+
+ if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (unlikely(wr->num_sge > qp->sq.max_gs)) {
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
+ qp->sq.wrid[(qp->sq.head + nreq) & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
+
+ ctrl->srcrb_flags =
+ (wr->send_flags & IB_SEND_SIGNALED ?
+ cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
+ (wr->send_flags & IB_SEND_SOLICITED ?
+ cpu_to_be32(MLX4_WQE_CTRL_SOLICITED) : 0) |
+ ((wr->send_flags & IB_SEND_IP_CSUM) ?
+ cpu_to_be32(MLX4_WQE_CTRL_IP_CSUM |
+ MLX4_WQE_CTRL_TCP_UDP_CSUM) : 0) |
+ qp->sq_signal_bits;
+
+ ctrl->imm = send_ieth(wr);
+
+ wqe += sizeof *ctrl;
+ size = sizeof *ctrl / 16;
+
+ switch (qp->mlx4_ib_qp_type) {
+ case MLX4_IB_QPT_RC:
+ case MLX4_IB_QPT_UC:
+ switch (wr->opcode) {
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ case IB_WR_MASKED_ATOMIC_FETCH_AND_ADD:
+ set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
+ wr->wr.atomic.rkey);
+ wqe += sizeof (struct mlx4_wqe_raddr_seg);
+
+ set_atomic_seg(wqe, wr);
+ wqe += sizeof (struct mlx4_wqe_atomic_seg);
+
+ size += (sizeof (struct mlx4_wqe_raddr_seg) +
+ sizeof (struct mlx4_wqe_atomic_seg)) / 16;
+
+ break;
+
+ case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
+ set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
+ wr->wr.atomic.rkey);
+ wqe += sizeof (struct mlx4_wqe_raddr_seg);
+
+ set_masked_atomic_seg(wqe, wr);
+ wqe += sizeof (struct mlx4_wqe_masked_atomic_seg);
+
+ size += (sizeof (struct mlx4_wqe_raddr_seg) +
+ sizeof (struct mlx4_wqe_masked_atomic_seg)) / 16;
+
+ break;
+
+ case IB_WR_RDMA_READ:
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+ wr->wr.rdma.rkey);
+ wqe += sizeof (struct mlx4_wqe_raddr_seg);
+ size += sizeof (struct mlx4_wqe_raddr_seg) / 16;
+ break;
+
+ case IB_WR_LOCAL_INV:
+ ctrl->srcrb_flags |=
+ cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
+ set_local_inv_seg(wqe, wr->ex.invalidate_rkey);
+ wqe += sizeof (struct mlx4_wqe_local_inval_seg);
+ size += sizeof (struct mlx4_wqe_local_inval_seg) / 16;
+ break;
+
+ case IB_WR_FAST_REG_MR:
+ ctrl->srcrb_flags |=
+ cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
+ set_fmr_seg(wqe, wr);
+ wqe += sizeof (struct mlx4_wqe_fmr_seg);
+ size += sizeof (struct mlx4_wqe_fmr_seg) / 16;
+ break;
+
+ case IB_WR_BIND_MW:
+ ctrl->srcrb_flags |=
+ cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER);
+ set_bind_seg(wqe, wr);
+ wqe += sizeof(struct mlx4_wqe_bind_seg);
+ size += sizeof(struct mlx4_wqe_bind_seg) / 16;
+ break;
+ default:
+ /* No extra segments required for sends */
+ break;
+ }
+ break;
+
+ case MLX4_IB_QPT_TUN_SMI_OWNER:
+ err = build_sriov_qp0_header(to_msqp(qp), wr, ctrl, &seglen);
+ if (unlikely(err)) {
+ *bad_wr = wr;
+ goto out;
+ }
+ wqe += seglen;
+ size += seglen / 16;
+ break;
+ case MLX4_IB_QPT_TUN_SMI:
+ case MLX4_IB_QPT_TUN_GSI:
+ /* this is a UD qp used in MAD responses to slaves. */
+ set_datagram_seg(wqe, wr);
+ /* set the forced-loopback bit in the data seg av */
+ *(__be32 *) wqe |= cpu_to_be32(0x80000000);
+ wqe += sizeof (struct mlx4_wqe_datagram_seg);
+ size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+ break;
+ case MLX4_IB_QPT_UD:
+ set_datagram_seg(wqe, wr);
+ wqe += sizeof (struct mlx4_wqe_datagram_seg);
+ size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+
+ if (wr->opcode == IB_WR_LSO) {
+ err = build_lso_seg(wqe, wr, qp, &seglen, &lso_hdr_sz, &blh);
+ if (unlikely(err)) {
+ *bad_wr = wr;
+ goto out;
+ }
+ lso_wqe = (__be32 *) wqe;
+ wqe += seglen;
+ size += seglen / 16;
+ }
+ break;
+
+ case MLX4_IB_QPT_PROXY_SMI_OWNER:
+ err = build_sriov_qp0_header(to_msqp(qp), wr, ctrl, &seglen);
+ if (unlikely(err)) {
+ *bad_wr = wr;
+ goto out;
+ }
+ wqe += seglen;
+ size += seglen / 16;
+ /* to start tunnel header on a cache-line boundary */
+ add_zero_len_inline(wqe);
+ wqe += 16;
+ size++;
+ build_tunnel_header(wr, wqe, &seglen);
+ wqe += seglen;
+ size += seglen / 16;
+ break;
+ case MLX4_IB_QPT_PROXY_SMI:
+ case MLX4_IB_QPT_PROXY_GSI:
+ /* If we are tunneling special qps, this is a UD qp.
+ * In this case we first add a UD segment targeting
+ * the tunnel qp, and then add a header with address
+ * information */
+ set_tunnel_datagram_seg(to_mdev(ibqp->device), wqe, wr,
+ qp->mlx4_ib_qp_type);
+ wqe += sizeof (struct mlx4_wqe_datagram_seg);
+ size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
+ build_tunnel_header(wr, wqe, &seglen);
+ wqe += seglen;
+ size += seglen / 16;
+ break;
+
+ case MLX4_IB_QPT_SMI:
+ case MLX4_IB_QPT_GSI:
+ err = build_mlx_header(to_msqp(qp), wr, ctrl, &seglen);
+ if (unlikely(err)) {
+ *bad_wr = wr;
+ goto out;
+ }
+ wqe += seglen;
+ size += seglen / 16;
+ break;
+
+ default:
+ break;
+ }
+
+ /*
+ * Write data segments in reverse order, so as to
+ * overwrite cacheline stamp last within each
+ * cacheline. This avoids issues with WQE
+ * prefetching.
+ */
+
+ dseg = wqe;
+ dseg += wr->num_sge - 1;
+ size += wr->num_sge * (sizeof (struct mlx4_wqe_data_seg) / 16);
+
+ /* Add one more inline data segment for ICRC for MLX sends */
+ if (unlikely(qp->mlx4_ib_qp_type == MLX4_IB_QPT_SMI ||
+ qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI ||
+ qp->mlx4_ib_qp_type &
+ (MLX4_IB_QPT_PROXY_SMI_OWNER | MLX4_IB_QPT_TUN_SMI_OWNER))) {
+ set_mlx_icrc_seg(dseg + 1);
+ size += sizeof (struct mlx4_wqe_data_seg) / 16;
+ }
+
+ for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
+ set_data_seg(dseg, wr->sg_list + i);
+
+ /*
+ * Possibly overwrite stamping in cacheline with LSO
+ * segment only after making sure all data segments
+ * are written.
+ */
+ wmb();
+ *lso_wqe = lso_hdr_sz;
+
+ ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ?
+ MLX4_WQE_CTRL_FENCE : 0) | size;
+
+ /*
+ * Make sure descriptor is fully written before
+ * setting ownership bit (because HW can start
+ * executing as soon as we do).
+ */
+ wmb();
+
+ if (wr->opcode < 0 || wr->opcode >= ARRAY_SIZE(mlx4_ib_opcode)) {
+ *bad_wr = wr;
+ err = -EINVAL;
+ goto out;
+ }
+
+ ctrl->owner_opcode = mlx4_ib_opcode[wr->opcode] |
+ (ind & qp->sq.wqe_cnt ? cpu_to_be32(1 << 31) : 0) | blh;
+
+ stamp = ind + qp->sq_spare_wqes;
+ ind += DIV_ROUND_UP(size * 16, 1U << qp->sq.wqe_shift);
+
+ /*
+ * We can improve latency by not stamping the last
+ * send queue WQE until after ringing the doorbell, so
+ * only stamp here if there are still more WQEs to post.
+ *
+		 * The same optimization applies to padding with a NOP WQE
+		 * in the case of WQE shrinking (used to prevent wrap-around
+		 * in the middle of a WR).
+ */
+ if (wr->next) {
+ stamp_send_wqe(qp, stamp, size * 16);
+ ind = pad_wraparound(qp, ind);
+ }
+ }
+
+out:
+ if (likely(nreq)) {
+ qp->sq.head += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+
+ writel(qp->doorbell_qpn,
+ to_mdev(ibqp->device)->uar_map + MLX4_SEND_DOORBELL);
+
+ /*
+ * Make sure doorbells don't leak out of SQ spinlock
+ * and reach the HCA out of order.
+ */
+ mmiowb();
+
+ stamp_send_wqe(qp, stamp, size * 16);
+
+ ind = pad_wraparound(qp, ind);
+ qp->sq_next_wqe = ind;
+ }
+
+ spin_unlock_irqrestore(&qp->sq.lock, flags);
+
+ return err;
+}
+
+int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct mlx4_ib_qp *qp = to_mqp(ibqp);
+ struct mlx4_wqe_data_seg *scat;
+ unsigned long flags;
+ int err = 0;
+ int nreq;
+ int ind;
+ int max_gs;
+ int i;
+ struct mlx4_ib_dev *mdev = to_mdev(ibqp->device);
+
+ max_gs = qp->rq.max_gs;
+ spin_lock_irqsave(&qp->rq.lock, flags);
+
+ if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
+ err = -EIO;
+ *bad_wr = wr;
+ nreq = 0;
+ goto out;
+ }
+
+ ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (mlx4_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ scat = get_recv_wqe(qp, ind);
+
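+		/* For proxy QPs, the first scatter entry is reserved for the
+		 * tunnel header and points at a driver-owned receive buffer.
+		 */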
+ if (qp->mlx4_ib_qp_type & (MLX4_IB_QPT_PROXY_SMI_OWNER |
+ MLX4_IB_QPT_PROXY_SMI | MLX4_IB_QPT_PROXY_GSI)) {
+ ib_dma_sync_single_for_device(ibqp->device,
+ qp->sqp_proxy_rcv[ind].map,
+ sizeof (struct mlx4_ib_proxy_sqp_hdr),
+ DMA_FROM_DEVICE);
+ scat->byte_count =
+ cpu_to_be32(sizeof (struct mlx4_ib_proxy_sqp_hdr));
+ /* use dma lkey from upper layer entry */
+ scat->lkey = cpu_to_be32(wr->sg_list->lkey);
+ scat->addr = cpu_to_be64(qp->sqp_proxy_rcv[ind].map);
+ scat++;
+ max_gs--;
+ }
+
+ for (i = 0; i < wr->num_sge; ++i)
+ __set_data_seg(scat + i, wr->sg_list + i);
+
+ if (i < max_gs) {
+ scat[i].byte_count = 0;
+ scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY);
+ scat[i].addr = 0;
+ }
+
+ qp->rq.wrid[ind] = wr->wr_id;
+
+ ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
+ }
+
+out:
+ if (likely(nreq)) {
+ qp->rq.head += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+
+ *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff);
+ }
+
+ spin_unlock_irqrestore(&qp->rq.lock, flags);
+
+ return err;
+}
+
+static inline enum ib_qp_state to_ib_qp_state(enum mlx4_qp_state mlx4_state)
+{
+ switch (mlx4_state) {
+ case MLX4_QP_STATE_RST: return IB_QPS_RESET;
+ case MLX4_QP_STATE_INIT: return IB_QPS_INIT;
+ case MLX4_QP_STATE_RTR: return IB_QPS_RTR;
+ case MLX4_QP_STATE_RTS: return IB_QPS_RTS;
+ case MLX4_QP_STATE_SQ_DRAINING:
+ case MLX4_QP_STATE_SQD: return IB_QPS_SQD;
+ case MLX4_QP_STATE_SQER: return IB_QPS_SQE;
+ case MLX4_QP_STATE_ERR: return IB_QPS_ERR;
+ default: return -1;
+ }
+}
+
+static inline enum ib_mig_state to_ib_mig_state(int mlx4_mig_state)
+{
+ switch (mlx4_mig_state) {
+ case MLX4_QP_PM_ARMED: return IB_MIG_ARMED;
+ case MLX4_QP_PM_REARM: return IB_MIG_REARM;
+ case MLX4_QP_PM_MIGRATED: return IB_MIG_MIGRATED;
+ default: return -1;
+ }
+}
+
+static int to_ib_qp_access_flags(int mlx4_flags)
+{
+ int ib_flags = 0;
+
+ if (mlx4_flags & MLX4_QP_BIT_RRE)
+ ib_flags |= IB_ACCESS_REMOTE_READ;
+ if (mlx4_flags & MLX4_QP_BIT_RWE)
+ ib_flags |= IB_ACCESS_REMOTE_WRITE;
+ if (mlx4_flags & MLX4_QP_BIT_RAE)
+ ib_flags |= IB_ACCESS_REMOTE_ATOMIC;
+
+ return ib_flags;
+}
+
+static void to_ib_ah_attr(struct mlx4_ib_dev *ibdev, struct ib_ah_attr *ib_ah_attr,
+ struct mlx4_qp_path *path)
+{
+ struct mlx4_dev *dev = ibdev->dev;
+ int is_eth;
+
+ memset(ib_ah_attr, 0, sizeof *ib_ah_attr);
+ ib_ah_attr->port_num = path->sched_queue & 0x40 ? 2 : 1;
+
+ if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->caps.num_ports)
+ return;
+
+ is_eth = rdma_port_get_link_layer(&ibdev->ib_dev, ib_ah_attr->port_num) ==
+ IB_LINK_LAYER_ETHERNET;
+ if (is_eth)
+ ib_ah_attr->sl = ((path->sched_queue >> 3) & 0x7) |
+ ((path->sched_queue & 4) << 1);
+ else
+ ib_ah_attr->sl = (path->sched_queue >> 2) & 0xf;
+
+ ib_ah_attr->dlid = be16_to_cpu(path->rlid);
+ ib_ah_attr->src_path_bits = path->grh_mylmc & 0x7f;
+ ib_ah_attr->static_rate = path->static_rate ? path->static_rate - 5 : 0;
+ ib_ah_attr->ah_flags = (path->grh_mylmc & (1 << 7)) ? IB_AH_GRH : 0;
+ if (ib_ah_attr->ah_flags) {
+ ib_ah_attr->grh.sgid_index = path->mgid_index;
+ ib_ah_attr->grh.hop_limit = path->hop_limit;
+ ib_ah_attr->grh.traffic_class =
+ (be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff;
+ ib_ah_attr->grh.flow_label =
+ be32_to_cpu(path->tclass_flowlabel) & 0xfffff;
+ memcpy(ib_ah_attr->grh.dgid.raw,
+ path->rgid, sizeof ib_ah_attr->grh.dgid.raw);
+ }
+}
+
+int mlx4_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibqp->device);
+ struct mlx4_ib_qp *qp = to_mqp(ibqp);
+ struct mlx4_qp_context context;
+ int mlx4_state;
+ int err = 0;
+
+ mutex_lock(&qp->mutex);
+
+ if (qp->state == IB_QPS_RESET) {
+ qp_attr->qp_state = IB_QPS_RESET;
+ goto done;
+ }
+
+ err = mlx4_qp_query(dev->dev, &qp->mqp, &context);
+ if (err) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ mlx4_state = be32_to_cpu(context.flags) >> 28;
+
+ qp->state = to_ib_qp_state(mlx4_state);
+ qp_attr->qp_state = qp->state;
+ qp_attr->path_mtu = context.mtu_msgmax >> 5;
+ qp_attr->path_mig_state =
+ to_ib_mig_state((be32_to_cpu(context.flags) >> 11) & 0x3);
+ qp_attr->qkey = be32_to_cpu(context.qkey);
+ qp_attr->rq_psn = be32_to_cpu(context.rnr_nextrecvpsn) & 0xffffff;
+ qp_attr->sq_psn = be32_to_cpu(context.next_send_psn) & 0xffffff;
+ qp_attr->dest_qp_num = be32_to_cpu(context.remote_qpn) & 0xffffff;
+ qp_attr->qp_access_flags =
+ to_ib_qp_access_flags(be32_to_cpu(context.params2));
+
+ if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) {
+ to_ib_ah_attr(dev, &qp_attr->ah_attr, &context.pri_path);
+ to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context.alt_path);
+ qp_attr->alt_pkey_index = context.alt_path.pkey_index & 0x7f;
+ qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num;
+ }
+
+ qp_attr->pkey_index = context.pri_path.pkey_index & 0x7f;
+ if (qp_attr->qp_state == IB_QPS_INIT)
+ qp_attr->port_num = qp->port;
+ else
+ qp_attr->port_num = context.pri_path.sched_queue & 0x40 ? 2 : 1;
+
+ /* qp_attr->en_sqd_async_notify is only applicable in modify qp */
+ qp_attr->sq_draining = mlx4_state == MLX4_QP_STATE_SQ_DRAINING;
+
+ qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context.params1) >> 21) & 0x7);
+
+ qp_attr->max_dest_rd_atomic =
+ 1 << ((be32_to_cpu(context.params2) >> 21) & 0x7);
+ qp_attr->min_rnr_timer =
+ (be32_to_cpu(context.rnr_nextrecvpsn) >> 24) & 0x1f;
+ qp_attr->timeout = context.pri_path.ackto >> 3;
+ qp_attr->retry_cnt = (be32_to_cpu(context.params1) >> 16) & 0x7;
+ qp_attr->rnr_retry = (be32_to_cpu(context.params1) >> 13) & 0x7;
+ qp_attr->alt_timeout = context.alt_path.ackto >> 3;
+
+done:
+ qp_attr->cur_qp_state = qp_attr->qp_state;
+ qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt;
+ qp_attr->cap.max_recv_sge = qp->rq.max_gs;
+
+ if (!ibqp->uobject) {
+ qp_attr->cap.max_send_wr = qp->sq.wqe_cnt;
+ qp_attr->cap.max_send_sge = qp->sq.max_gs;
+ } else {
+ qp_attr->cap.max_send_wr = 0;
+ qp_attr->cap.max_send_sge = 0;
+ }
+
+ /*
+ * We don't support inline sends for kernel QPs (yet), and we
+ * don't know what userspace's value should be.
+ */
+ qp_attr->cap.max_inline_data = 0;
+
+ qp_init_attr->cap = qp_attr->cap;
+
+ qp_init_attr->create_flags = 0;
+ if (qp->flags & MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK)
+ qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
+
+ if (qp->flags & MLX4_IB_QP_LSO)
+ qp_init_attr->create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;
+
+ if (qp->flags & MLX4_IB_QP_NETIF)
+ qp_init_attr->create_flags |= IB_QP_CREATE_NETIF_QP;
+
+ qp_init_attr->sq_sig_type =
+ qp->sq_signal_bits == cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE) ?
+ IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
+
+out:
+ mutex_unlock(&qp->mutex);
+ return err;
+}
+
diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c
new file mode 100644
index 000000000..dce5dfe3a
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/srq.c
@@ -0,0 +1,377 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx4/qp.h>
+#include <linux/mlx4/srq.h>
+#include <linux/slab.h>
+
+#include "mlx4_ib.h"
+#include "user.h"
+
+static void *get_wqe(struct mlx4_ib_srq *srq, int n)
+{
+ return mlx4_buf_offset(&srq->buf, n << srq->msrq.wqe_shift);
+}
+
+static void mlx4_ib_srq_event(struct mlx4_srq *srq, enum mlx4_event type)
+{
+ struct ib_event event;
+ struct ib_srq *ibsrq = &to_mibsrq(srq)->ibsrq;
+
+ if (ibsrq->event_handler) {
+ event.device = ibsrq->device;
+ event.element.srq = ibsrq;
+ switch (type) {
+ case MLX4_EVENT_TYPE_SRQ_LIMIT:
+ event.event = IB_EVENT_SRQ_LIMIT_REACHED;
+ break;
+ case MLX4_EVENT_TYPE_SRQ_CATAS_ERROR:
+ event.event = IB_EVENT_SRQ_ERR;
+ break;
+ default:
+			pr_warn("Unexpected event type %d on SRQ %06x\n",
+				type, srq->srqn);
+ return;
+ }
+
+ ibsrq->event_handler(&event, ibsrq->srq_context);
+ }
+}
+
+struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd,
+ struct ib_srq_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(pd->device);
+ struct mlx4_ib_srq *srq;
+ struct mlx4_wqe_srq_next_seg *next;
+ struct mlx4_wqe_data_seg *scatter;
+ u32 cqn;
+ u16 xrcdn;
+ int desc_size;
+ int buf_size;
+ int err;
+ int i;
+
+ /* Sanity check SRQ size before proceeding */
+ if (init_attr->attr.max_wr >= dev->dev->caps.max_srq_wqes ||
+ init_attr->attr.max_sge > dev->dev->caps.max_srq_sge)
+ return ERR_PTR(-EINVAL);
+
+ srq = kmalloc(sizeof *srq, GFP_KERNEL);
+ if (!srq)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_init(&srq->mutex);
+ spin_lock_init(&srq->lock);
+ srq->msrq.max = roundup_pow_of_two(init_attr->attr.max_wr + 1);
+ srq->msrq.max_gs = init_attr->attr.max_sge;
+
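+	/*
+	 * Each SRQ WQE consists of a next-WQE segment followed by max_gs
+	 * scatter entries; the descriptor size is rounded up to a power of
+	 * two (and at least 32 bytes) so that it can be expressed as wqe_shift.
+	 */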
+ desc_size = max(32UL,
+ roundup_pow_of_two(sizeof (struct mlx4_wqe_srq_next_seg) +
+ srq->msrq.max_gs *
+ sizeof (struct mlx4_wqe_data_seg)));
+ srq->msrq.wqe_shift = ilog2(desc_size);
+
+ buf_size = srq->msrq.max * desc_size;
+
+ if (pd->uobject) {
+ struct mlx4_ib_create_srq ucmd;
+
+ if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+ err = -EFAULT;
+ goto err_srq;
+ }
+
+ srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
+ buf_size, 0, 0);
+ if (IS_ERR(srq->umem)) {
+ err = PTR_ERR(srq->umem);
+ goto err_srq;
+ }
+
+ err = mlx4_mtt_init(dev->dev, ib_umem_page_count(srq->umem),
+ ilog2(srq->umem->page_size), &srq->mtt);
+ if (err)
+ goto err_buf;
+
+ err = mlx4_ib_umem_write_mtt(dev, &srq->mtt, srq->umem);
+ if (err)
+ goto err_mtt;
+
+ err = mlx4_ib_db_map_user(to_mucontext(pd->uobject->context),
+ ucmd.db_addr, &srq->db);
+ if (err)
+ goto err_mtt;
+ } else {
+ err = mlx4_db_alloc(dev->dev, &srq->db, 0, GFP_KERNEL);
+ if (err)
+ goto err_srq;
+
+ *srq->db.db = 0;
+
+ if (mlx4_buf_alloc(dev->dev, buf_size, PAGE_SIZE * 2, &srq->buf,
+ GFP_KERNEL)) {
+ err = -ENOMEM;
+ goto err_db;
+ }
+
+ srq->head = 0;
+ srq->tail = srq->msrq.max - 1;
+ srq->wqe_ctr = 0;
+
+ for (i = 0; i < srq->msrq.max; ++i) {
+ next = get_wqe(srq, i);
+ next->next_wqe_index =
+ cpu_to_be16((i + 1) & (srq->msrq.max - 1));
+
+ for (scatter = (void *) (next + 1);
+ (void *) scatter < (void *) next + desc_size;
+ ++scatter)
+ scatter->lkey = cpu_to_be32(MLX4_INVALID_LKEY);
+ }
+
+ err = mlx4_mtt_init(dev->dev, srq->buf.npages, srq->buf.page_shift,
+ &srq->mtt);
+ if (err)
+ goto err_buf;
+
+ err = mlx4_buf_write_mtt(dev->dev, &srq->mtt, &srq->buf, GFP_KERNEL);
+ if (err)
+ goto err_mtt;
+
+ srq->wrid = kmalloc(srq->msrq.max * sizeof (u64), GFP_KERNEL);
+ if (!srq->wrid) {
+ err = -ENOMEM;
+ goto err_mtt;
+ }
+ }
+
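+	/* XRC SRQs are attached to a CQ and an XRC domain; ordinary SRQs
+	 * use CQN 0 and the device's reserved XRC domain instead.
+	 */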
+ cqn = (init_attr->srq_type == IB_SRQT_XRC) ?
+ to_mcq(init_attr->ext.xrc.cq)->mcq.cqn : 0;
+ xrcdn = (init_attr->srq_type == IB_SRQT_XRC) ?
+ to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn :
+ (u16) dev->dev->caps.reserved_xrcds;
+ err = mlx4_srq_alloc(dev->dev, to_mpd(pd)->pdn, cqn, xrcdn, &srq->mtt,
+ srq->db.dma, &srq->msrq);
+ if (err)
+ goto err_wrid;
+
+ srq->msrq.event = mlx4_ib_srq_event;
+ srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn;
+
+ if (pd->uobject)
+ if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof (__u32))) {
+ err = -EFAULT;
+ goto err_wrid;
+ }
+
+ init_attr->attr.max_wr = srq->msrq.max - 1;
+
+ return &srq->ibsrq;
+
+err_wrid:
+ if (pd->uobject)
+ mlx4_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db);
+ else
+ kfree(srq->wrid);
+
+err_mtt:
+ mlx4_mtt_cleanup(dev->dev, &srq->mtt);
+
+err_buf:
+ if (pd->uobject)
+ ib_umem_release(srq->umem);
+ else
+ mlx4_buf_free(dev->dev, buf_size, &srq->buf);
+
+err_db:
+ if (!pd->uobject)
+ mlx4_db_free(dev->dev, &srq->db);
+
+err_srq:
+ kfree(srq);
+
+ return ERR_PTR(err);
+}
+
+int mlx4_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibsrq->device);
+ struct mlx4_ib_srq *srq = to_msrq(ibsrq);
+ int ret;
+
+ /* We don't support resizing SRQs (yet?) */
+ if (attr_mask & IB_SRQ_MAX_WR)
+ return -EINVAL;
+
+ if (attr_mask & IB_SRQ_LIMIT) {
+ if (attr->srq_limit >= srq->msrq.max)
+ return -EINVAL;
+
+ mutex_lock(&srq->mutex);
+ ret = mlx4_srq_arm(dev->dev, &srq->msrq, attr->srq_limit);
+ mutex_unlock(&srq->mutex);
+
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+int mlx4_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
+{
+ struct mlx4_ib_dev *dev = to_mdev(ibsrq->device);
+ struct mlx4_ib_srq *srq = to_msrq(ibsrq);
+ int ret;
+ int limit_watermark;
+
+ ret = mlx4_srq_query(dev->dev, &srq->msrq, &limit_watermark);
+ if (ret)
+ return ret;
+
+ srq_attr->srq_limit = limit_watermark;
+ srq_attr->max_wr = srq->msrq.max - 1;
+ srq_attr->max_sge = srq->msrq.max_gs;
+
+ return 0;
+}
+
+int mlx4_ib_destroy_srq(struct ib_srq *srq)
+{
+ struct mlx4_ib_dev *dev = to_mdev(srq->device);
+ struct mlx4_ib_srq *msrq = to_msrq(srq);
+
+ mlx4_srq_free(dev->dev, &msrq->msrq);
+ mlx4_mtt_cleanup(dev->dev, &msrq->mtt);
+
+ if (srq->uobject) {
+ mlx4_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db);
+ ib_umem_release(msrq->umem);
+ } else {
+ kfree(msrq->wrid);
+ mlx4_buf_free(dev->dev, msrq->msrq.max << msrq->msrq.wqe_shift,
+ &msrq->buf);
+ mlx4_db_free(dev->dev, &msrq->db);
+ }
+
+ kfree(msrq);
+
+ return 0;
+}
+
+void mlx4_ib_free_srq_wqe(struct mlx4_ib_srq *srq, int wqe_index)
+{
+ struct mlx4_wqe_srq_next_seg *next;
+
+ /* always called with interrupts disabled. */
+ spin_lock(&srq->lock);
+
+ next = get_wqe(srq, srq->tail);
+ next->next_wqe_index = cpu_to_be16(wqe_index);
+ srq->tail = wqe_index;
+
+ spin_unlock(&srq->lock);
+}
+
+int mlx4_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct mlx4_ib_srq *srq = to_msrq(ibsrq);
+ struct mlx4_wqe_srq_next_seg *next;
+ struct mlx4_wqe_data_seg *scat;
+ unsigned long flags;
+ int err = 0;
+ int nreq;
+ int i;
+ struct mlx4_ib_dev *mdev = to_mdev(ibsrq->device);
+
+ spin_lock_irqsave(&srq->lock, flags);
+ if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) {
+ err = -EIO;
+ *bad_wr = wr;
+ nreq = 0;
+ goto out;
+ }
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (unlikely(wr->num_sge > srq->msrq.max_gs)) {
+ err = -EINVAL;
+ *bad_wr = wr;
+ break;
+ }
+
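+		/* SRQ is full */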
+ if (unlikely(srq->head == srq->tail)) {
+ err = -ENOMEM;
+ *bad_wr = wr;
+ break;
+ }
+
+ srq->wrid[srq->head] = wr->wr_id;
+
+ next = get_wqe(srq, srq->head);
+ srq->head = be16_to_cpu(next->next_wqe_index);
+ scat = (struct mlx4_wqe_data_seg *) (next + 1);
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length);
+ scat[i].lkey = cpu_to_be32(wr->sg_list[i].lkey);
+ scat[i].addr = cpu_to_be64(wr->sg_list[i].addr);
+ }
+
+ if (i < srq->msrq.max_gs) {
+ scat[i].byte_count = 0;
+ scat[i].lkey = cpu_to_be32(MLX4_INVALID_LKEY);
+ scat[i].addr = 0;
+ }
+ }
+
+ if (likely(nreq)) {
+ srq->wqe_ctr += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+
+ *srq->db.db = cpu_to_be32(srq->wqe_ctr);
+ }
+
+out:
+ spin_unlock_irqrestore(&srq->lock, flags);
+
+ return err;
+}
diff --git a/drivers/infiniband/hw/mlx4/sysfs.c b/drivers/infiniband/hw/mlx4/sysfs.c
new file mode 100644
index 000000000..6797108ce
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/sysfs.c
@@ -0,0 +1,886 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*#include "core_priv.h"*/
+#include "mlx4_ib.h"
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/stat.h>
+
+#include <rdma/ib_mad.h>
+/* show_admin_alias_guid returns the administratively assigned value of that GUID.
+ * Values returned in the buf parameter string:
+ * 0 - requests opensm to assign a value.
+ * ffffffffffffffff - delete this entry.
+ * other - value assigned by administrator.
+ */
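+/* Example usage from userspace (the path is illustrative, following the
+ * iov/ports/<port>/admin_guids/<index> layout created by add_port_entries()
+ * below):
+ *	cat /sys/class/infiniband/mlx4_0/iov/ports/1/admin_guids/2
+ *	echo 0x0002c9030012abcd > /sys/class/infiniband/mlx4_0/iov/ports/1/admin_guids/2
+ */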
+static ssize_t show_admin_alias_guid(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry =
+ container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry);
+ struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx;
+ struct mlx4_ib_dev *mdev = port->dev;
+ __be64 sysadmin_ag_val;
+
+ sysadmin_ag_val = mlx4_get_admin_guid(mdev->dev,
+ mlx4_ib_iov_dentry->entry_num,
+ port->num);
+
+ return sprintf(buf, "%llx\n", be64_to_cpu(sysadmin_ag_val));
+}
+
+/* store_admin_alias_guid stores the (new) administratively assigned value of that GUID.
+ * Values in buf parameter string:
+ * 0 - requests opensm to assign a value.
+ * 0xffffffffffffffff - delete this entry.
+ * other - guid value assigned by the administrator.
+ */
+static ssize_t store_admin_alias_guid(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ int record_num;/*0-15*/
+ int guid_index_in_rec; /*0 - 7*/
+ struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry =
+ container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry);
+ struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx;
+ struct mlx4_ib_dev *mdev = port->dev;
+ u64 sysadmin_ag_val;
+ unsigned long flags;
+
+ record_num = mlx4_ib_iov_dentry->entry_num / 8;
+ guid_index_in_rec = mlx4_ib_iov_dentry->entry_num % 8;
+ if (0 == record_num && 0 == guid_index_in_rec) {
+ pr_err("GUID 0 block 0 is RO\n");
+ return count;
+ }
+ spin_lock_irqsave(&mdev->sriov.alias_guid.ag_work_lock, flags);
+ sscanf(buf, "%llx", &sysadmin_ag_val);
+ *(__be64 *)&mdev->sriov.alias_guid.ports_guid[port->num - 1].
+ all_rec_per_port[record_num].
+ all_recs[GUID_REC_SIZE * guid_index_in_rec] =
+ cpu_to_be64(sysadmin_ag_val);
+
+ /* Change the state to be pending for update */
+ mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].status
+		= MLX4_GUID_INFO_STATUS_IDLE;
+ mlx4_set_admin_guid(mdev->dev, cpu_to_be64(sysadmin_ag_val),
+ mlx4_ib_iov_dentry->entry_num,
+ port->num);
+
+ /* set the record index */
+ mdev->sriov.alias_guid.ports_guid[port->num - 1].all_rec_per_port[record_num].guid_indexes
+ |= mlx4_ib_get_aguid_comp_mask_from_ix(guid_index_in_rec);
+
+ spin_unlock_irqrestore(&mdev->sriov.alias_guid.ag_work_lock, flags);
+ mlx4_ib_init_alias_guid_work(mdev, port->num - 1);
+
+ return count;
+}
+
+static ssize_t show_port_gid(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry =
+ container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry);
+ struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx;
+ struct mlx4_ib_dev *mdev = port->dev;
+ union ib_gid gid;
+ ssize_t ret;
+
+ ret = __mlx4_ib_query_gid(&mdev->ib_dev, port->num,
+ mlx4_ib_iov_dentry->entry_num, &gid, 1);
+ if (ret)
+ return ret;
+ ret = sprintf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+ be16_to_cpu(((__be16 *) gid.raw)[0]),
+ be16_to_cpu(((__be16 *) gid.raw)[1]),
+ be16_to_cpu(((__be16 *) gid.raw)[2]),
+ be16_to_cpu(((__be16 *) gid.raw)[3]),
+ be16_to_cpu(((__be16 *) gid.raw)[4]),
+ be16_to_cpu(((__be16 *) gid.raw)[5]),
+ be16_to_cpu(((__be16 *) gid.raw)[6]),
+ be16_to_cpu(((__be16 *) gid.raw)[7]));
+ return ret;
+}
+
+static ssize_t show_phys_port_pkey(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct mlx4_ib_iov_sysfs_attr *mlx4_ib_iov_dentry =
+ container_of(attr, struct mlx4_ib_iov_sysfs_attr, dentry);
+ struct mlx4_ib_iov_port *port = mlx4_ib_iov_dentry->ctx;
+ struct mlx4_ib_dev *mdev = port->dev;
+ u16 pkey;
+ ssize_t ret;
+
+ ret = __mlx4_ib_query_pkey(&mdev->ib_dev, port->num,
+ mlx4_ib_iov_dentry->entry_num, &pkey, 1);
+ if (ret)
+ return ret;
+
+ return sprintf(buf, "0x%04x\n", pkey);
+}
+
+#define DENTRY_REMOVE(_dentry) \
+do { \
+ sysfs_remove_file((_dentry)->kobj, &(_dentry)->dentry.attr); \
+} while (0)
+
+static int create_sysfs_entry(void *_ctx, struct mlx4_ib_iov_sysfs_attr *_dentry,
+ char *_name, struct kobject *_kobj,
+ ssize_t (*show)(struct device *dev,
+ struct device_attribute *attr,
+ char *buf),
+ ssize_t (*store)(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+ )
+{
+ int ret = 0;
+ struct mlx4_ib_iov_sysfs_attr *vdentry = _dentry;
+
+ vdentry->ctx = _ctx;
+ vdentry->dentry.show = show;
+ vdentry->dentry.store = store;
+ sysfs_attr_init(&vdentry->dentry.attr);
+ vdentry->dentry.attr.name = vdentry->name;
+ vdentry->dentry.attr.mode = 0;
+ vdentry->kobj = _kobj;
+ snprintf(vdentry->name, 15, "%s", _name);
+
+ if (vdentry->dentry.store)
+ vdentry->dentry.attr.mode |= S_IWUSR;
+
+ if (vdentry->dentry.show)
+ vdentry->dentry.attr.mode |= S_IRUGO;
+
+ ret = sysfs_create_file(vdentry->kobj, &vdentry->dentry.attr);
+ if (ret) {
+ pr_err("failed to create %s\n", vdentry->dentry.attr.name);
+ vdentry->ctx = NULL;
+ return ret;
+ }
+
+ return ret;
+}
+
+int add_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num,
+ struct attribute *attr)
+{
+ struct mlx4_ib_iov_port *port = &device->iov_ports[port_num - 1];
+ int ret;
+
+ ret = sysfs_create_file(port->mcgs_parent, attr);
+ if (ret)
+ pr_err("failed to create %s\n", attr->name);
+
+ return ret;
+}
+
+void del_sysfs_port_mcg_attr(struct mlx4_ib_dev *device, int port_num,
+ struct attribute *attr)
+{
+ struct mlx4_ib_iov_port *port = &device->iov_ports[port_num - 1];
+
+ sysfs_remove_file(port->mcgs_parent, attr);
+}
+
+static int add_port_entries(struct mlx4_ib_dev *device, int port_num)
+{
+ int i;
+ char buff[10];
+ struct mlx4_ib_iov_port *port = NULL;
+	int ret = 0;
+ struct ib_port_attr attr;
+
+ /* get the physical gid and pkey table sizes.*/
+ ret = __mlx4_ib_query_port(&device->ib_dev, port_num, &attr, 1);
+ if (ret)
+ goto err;
+
+ port = &device->iov_ports[port_num - 1];
+ port->dev = device;
+ port->num = port_num;
+	/* Directory structure:
+	 * iov -
+	 *   port num -
+	 *     admin_guids
+	 *     gids (operational)
+	 *     pkeys
+	 *     mcgs
+	 */
+ port->dentr_ar = kzalloc(sizeof (struct mlx4_ib_iov_sysfs_attr_ar),
+ GFP_KERNEL);
+ if (!port->dentr_ar) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ sprintf(buff, "%d", port_num);
+ port->cur_port = kobject_create_and_add(buff,
+ kobject_get(device->ports_parent));
+ if (!port->cur_port) {
+ ret = -ENOMEM;
+ goto kobj_create_err;
+ }
+ /* admin GUIDs */
+ port->admin_alias_parent = kobject_create_and_add("admin_guids",
+ kobject_get(port->cur_port));
+ if (!port->admin_alias_parent) {
+ ret = -ENOMEM;
+ goto err_admin_guids;
+ }
+ for (i = 0 ; i < attr.gid_tbl_len; i++) {
+ sprintf(buff, "%d", i);
+ port->dentr_ar->dentries[i].entry_num = i;
+ ret = create_sysfs_entry(port, &port->dentr_ar->dentries[i],
+ buff, port->admin_alias_parent,
+ show_admin_alias_guid, store_admin_alias_guid);
+ if (ret)
+ goto err_admin_alias_parent;
+ }
+
+ /* gids subdirectory (operational gids) */
+ port->gids_parent = kobject_create_and_add("gids",
+ kobject_get(port->cur_port));
+ if (!port->gids_parent) {
+ ret = -ENOMEM;
+ goto err_gids;
+ }
+
+ for (i = 0 ; i < attr.gid_tbl_len; i++) {
+ sprintf(buff, "%d", i);
+ port->dentr_ar->dentries[attr.gid_tbl_len + i].entry_num = i;
+ ret = create_sysfs_entry(port,
+ &port->dentr_ar->dentries[attr.gid_tbl_len + i],
+ buff,
+ port->gids_parent, show_port_gid, NULL);
+ if (ret)
+ goto err_gids_parent;
+ }
+
+ /* physical port pkey table */
+ port->pkeys_parent =
+ kobject_create_and_add("pkeys", kobject_get(port->cur_port));
+ if (!port->pkeys_parent) {
+ ret = -ENOMEM;
+ goto err_pkeys;
+ }
+
+ for (i = 0 ; i < attr.pkey_tbl_len; i++) {
+ sprintf(buff, "%d", i);
+ port->dentr_ar->dentries[2 * attr.gid_tbl_len + i].entry_num = i;
+ ret = create_sysfs_entry(port,
+ &port->dentr_ar->dentries[2 * attr.gid_tbl_len + i],
+ buff, port->pkeys_parent,
+ show_phys_port_pkey, NULL);
+ if (ret)
+ goto err_pkeys_parent;
+ }
+
+ /* MCGs table */
+ port->mcgs_parent =
+ kobject_create_and_add("mcgs", kobject_get(port->cur_port));
+ if (!port->mcgs_parent) {
+ ret = -ENOMEM;
+ goto err_mcgs;
+ }
+ return 0;
+
+err_mcgs:
+ kobject_put(port->cur_port);
+
+err_pkeys_parent:
+ kobject_put(port->pkeys_parent);
+
+err_pkeys:
+ kobject_put(port->cur_port);
+
+err_gids_parent:
+ kobject_put(port->gids_parent);
+
+err_gids:
+ kobject_put(port->cur_port);
+
+err_admin_alias_parent:
+ kobject_put(port->admin_alias_parent);
+
+err_admin_guids:
+ kobject_put(port->cur_port);
+ kobject_put(port->cur_port); /* once more for create_and_add buff */
+
+kobj_create_err:
+ kobject_put(device->ports_parent);
+ kfree(port->dentr_ar);
+
+err:
+	pr_err("add_port_entries failed for port %d, error %d\n",
+	       port_num, ret);
+ return ret;
+}
+
+static void get_name(struct mlx4_ib_dev *dev, char *name, int i, int max)
+{
+ char base_name[9];
+
+ /* pci_name format is: bus:dev:func -> xxxx:yy:zz.n */
+ strlcpy(name, pci_name(dev->dev->persist->pdev), max);
+	strncpy(base_name, name, 8); /* up to "xxxx:yy:" */
+	base_name[8] = '\0';
+	/* Without ARI only the last 3 bits encode the function number, so
+	 * functions of 8 and above spill over into the device number; the
+	 * last component is therefore taken modulo 8.
+	 */
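+	/* For example, i = 10 on "0000:05:00.0" yields "0000:05:01.2". */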
+ sprintf(name, "%s%.2d.%d", base_name, (i/8), (i%8));
+}
+
+struct mlx4_port {
+ struct kobject kobj;
+ struct mlx4_ib_dev *dev;
+ struct attribute_group pkey_group;
+ struct attribute_group gid_group;
+ struct device_attribute enable_smi_admin;
+ struct device_attribute smi_enabled;
+ int slave;
+ u8 port_num;
+};
+
+static void mlx4_port_release(struct kobject *kobj)
+{
+ struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj);
+ struct attribute *a;
+ int i;
+
+ for (i = 0; (a = p->pkey_group.attrs[i]); ++i)
+ kfree(a);
+ kfree(p->pkey_group.attrs);
+ for (i = 0; (a = p->gid_group.attrs[i]); ++i)
+ kfree(a);
+ kfree(p->gid_group.attrs);
+ kfree(p);
+}
+
+struct port_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct mlx4_port *, struct port_attribute *, char *buf);
+ ssize_t (*store)(struct mlx4_port *, struct port_attribute *,
+ const char *buf, size_t count);
+};
+
+static ssize_t port_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct port_attribute *port_attr =
+ container_of(attr, struct port_attribute, attr);
+ struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj);
+
+ if (!port_attr->show)
+ return -EIO;
+ return port_attr->show(p, port_attr, buf);
+}
+
+static ssize_t port_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t size)
+{
+ struct port_attribute *port_attr =
+ container_of(attr, struct port_attribute, attr);
+ struct mlx4_port *p = container_of(kobj, struct mlx4_port, kobj);
+
+ if (!port_attr->store)
+ return -EIO;
+ return port_attr->store(p, port_attr, buf, size);
+}
+
+static const struct sysfs_ops port_sysfs_ops = {
+ .show = port_attr_show,
+ .store = port_attr_store,
+};
+
+static struct kobj_type port_type = {
+ .release = mlx4_port_release,
+ .sysfs_ops = &port_sysfs_ops,
+};
+
+struct port_table_attribute {
+ struct port_attribute attr;
+ char name[8];
+ int index;
+};
+
+static ssize_t show_port_pkey(struct mlx4_port *p, struct port_attribute *attr,
+ char *buf)
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ ssize_t ret = -ENODEV;
+
+ if (p->dev->pkeys.virt2phys_pkey[p->slave][p->port_num - 1][tab_attr->index] >=
+ (p->dev->dev->caps.pkey_table_len[p->port_num]))
+ ret = sprintf(buf, "none\n");
+ else
+ ret = sprintf(buf, "%d\n",
+ p->dev->pkeys.virt2phys_pkey[p->slave]
+ [p->port_num - 1][tab_attr->index]);
+ return ret;
+}
+
+static ssize_t store_port_pkey(struct mlx4_port *p, struct port_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct port_table_attribute *tab_attr =
+ container_of(attr, struct port_table_attribute, attr);
+ int idx;
+ int err;
+
+ /* do not allow remapping Dom0 virtual pkey table */
+ if (p->slave == mlx4_master_func_num(p->dev->dev))
+ return -EINVAL;
+
+ if (!strncasecmp(buf, "no", 2))
+ idx = p->dev->dev->phys_caps.pkey_phys_table_len[p->port_num] - 1;
+ else if (sscanf(buf, "%i", &idx) != 1 ||
+ idx >= p->dev->dev->caps.pkey_table_len[p->port_num] ||
+ idx < 0)
+ return -EINVAL;
+
+ p->dev->pkeys.virt2phys_pkey[p->slave][p->port_num - 1]
+ [tab_attr->index] = idx;
+ mlx4_sync_pkey_table(p->dev->dev, p->slave, p->port_num,
+ tab_attr->index, idx);
+ err = mlx4_gen_pkey_eqe(p->dev->dev, p->slave, p->port_num);
+ if (err) {
+ pr_err("mlx4_gen_pkey_eqe failed for slave %d,"
+ " port %d, index %d\n", p->slave, p->port_num, idx);
+ return err;
+ }
+ return count;
+}
+
+static ssize_t show_port_gid_idx(struct mlx4_port *p,
+ struct port_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", p->slave);
+}
+
+static struct attribute **
+alloc_group_attrs(ssize_t (*show)(struct mlx4_port *,
+ struct port_attribute *, char *buf),
+ ssize_t (*store)(struct mlx4_port *, struct port_attribute *,
+ const char *buf, size_t count),
+ int len)
+{
+ struct attribute **tab_attr;
+ struct port_table_attribute *element;
+ int i;
+
+ tab_attr = kcalloc(1 + len, sizeof (struct attribute *), GFP_KERNEL);
+ if (!tab_attr)
+ return NULL;
+
+ for (i = 0; i < len; i++) {
+ element = kzalloc(sizeof (struct port_table_attribute),
+ GFP_KERNEL);
+ if (!element)
+ goto err;
+ if (snprintf(element->name, sizeof (element->name),
+ "%d", i) >= sizeof (element->name)) {
+ kfree(element);
+ goto err;
+ }
+ sysfs_attr_init(&element->attr.attr);
+ element->attr.attr.name = element->name;
+ if (store) {
+ element->attr.attr.mode = S_IWUSR | S_IRUGO;
+ element->attr.store = store;
+ } else
+ element->attr.attr.mode = S_IRUGO;
+
+ element->attr.show = show;
+ element->index = i;
+ tab_attr[i] = &element->attr.attr;
+ }
+ return tab_attr;
+
+err:
+ while (--i >= 0)
+ kfree(tab_attr[i]);
+ kfree(tab_attr);
+ return NULL;
+}
+
+static ssize_t sysfs_show_smi_enabled(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct mlx4_port *p =
+ container_of(attr, struct mlx4_port, smi_enabled);
+ ssize_t len = 0;
+
+ if (mlx4_vf_smi_enabled(p->dev->dev, p->slave, p->port_num))
+ len = sprintf(buf, "%d\n", 1);
+ else
+ len = sprintf(buf, "%d\n", 0);
+
+ return len;
+}
+
+static ssize_t sysfs_show_enable_smi_admin(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct mlx4_port *p =
+ container_of(attr, struct mlx4_port, enable_smi_admin);
+ ssize_t len = 0;
+
+ if (mlx4_vf_get_enable_smi_admin(p->dev->dev, p->slave, p->port_num))
+ len = sprintf(buf, "%d\n", 1);
+ else
+ len = sprintf(buf, "%d\n", 0);
+
+ return len;
+}
+
+static ssize_t sysfs_store_enable_smi_admin(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct mlx4_port *p =
+ container_of(attr, struct mlx4_port, enable_smi_admin);
+ int enable;
+
+ if (sscanf(buf, "%i", &enable) != 1 ||
+ enable < 0 || enable > 1)
+ return -EINVAL;
+
+ if (mlx4_vf_set_enable_smi_admin(p->dev->dev, p->slave, p->port_num, enable))
+ return -EINVAL;
+ return count;
+}
+
+static int add_vf_smi_entries(struct mlx4_port *p)
+{
+ int is_eth = rdma_port_get_link_layer(&p->dev->ib_dev, p->port_num) ==
+ IB_LINK_LAYER_ETHERNET;
+ int ret;
+
+ /* do not display entries if eth transport, or if master */
+ if (is_eth || p->slave == mlx4_master_func_num(p->dev->dev))
+ return 0;
+
+ sysfs_attr_init(&p->smi_enabled.attr);
+ p->smi_enabled.show = sysfs_show_smi_enabled;
+ p->smi_enabled.store = NULL;
+ p->smi_enabled.attr.name = "smi_enabled";
+ p->smi_enabled.attr.mode = 0444;
+ ret = sysfs_create_file(&p->kobj, &p->smi_enabled.attr);
+ if (ret) {
+ pr_err("failed to create smi_enabled\n");
+ return ret;
+ }
+
+ sysfs_attr_init(&p->enable_smi_admin.attr);
+ p->enable_smi_admin.show = sysfs_show_enable_smi_admin;
+ p->enable_smi_admin.store = sysfs_store_enable_smi_admin;
+ p->enable_smi_admin.attr.name = "enable_smi_admin";
+ p->enable_smi_admin.attr.mode = 0644;
+ ret = sysfs_create_file(&p->kobj, &p->enable_smi_admin.attr);
+ if (ret) {
+ pr_err("failed to create enable_smi_admin\n");
+ sysfs_remove_file(&p->kobj, &p->smi_enabled.attr);
+ return ret;
+ }
+ return 0;
+}
+
+static void remove_vf_smi_entries(struct mlx4_port *p)
+{
+ int is_eth = rdma_port_get_link_layer(&p->dev->ib_dev, p->port_num) ==
+ IB_LINK_LAYER_ETHERNET;
+
+ if (is_eth || p->slave == mlx4_master_func_num(p->dev->dev))
+ return;
+
+ sysfs_remove_file(&p->kobj, &p->smi_enabled.attr);
+ sysfs_remove_file(&p->kobj, &p->enable_smi_admin.attr);
+}
+
+static int add_port(struct mlx4_ib_dev *dev, int port_num, int slave)
+{
+ struct mlx4_port *p;
+ int i;
+ int ret;
+
+ p = kzalloc(sizeof *p, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ p->dev = dev;
+ p->port_num = port_num;
+ p->slave = slave;
+
+ ret = kobject_init_and_add(&p->kobj, &port_type,
+ kobject_get(dev->dev_ports_parent[slave]),
+ "%d", port_num);
+ if (ret)
+ goto err_alloc;
+
+ p->pkey_group.name = "pkey_idx";
+ p->pkey_group.attrs =
+ alloc_group_attrs(show_port_pkey, store_port_pkey,
+ dev->dev->caps.pkey_table_len[port_num]);
+ if (!p->pkey_group.attrs) {
+ ret = -ENOMEM;
+ goto err_alloc;
+ }
+
+ ret = sysfs_create_group(&p->kobj, &p->pkey_group);
+ if (ret)
+ goto err_free_pkey;
+
+ p->gid_group.name = "gid_idx";
+ p->gid_group.attrs = alloc_group_attrs(show_port_gid_idx, NULL, 1);
+ if (!p->gid_group.attrs) {
+ ret = -ENOMEM;
+ goto err_free_pkey;
+ }
+
+ ret = sysfs_create_group(&p->kobj, &p->gid_group);
+ if (ret)
+ goto err_free_gid;
+
+ ret = add_vf_smi_entries(p);
+ if (ret)
+ goto err_free_gid;
+
+ list_add_tail(&p->kobj.entry, &dev->pkeys.pkey_port_list[slave]);
+ return 0;
+
+err_free_gid:
+ kfree(p->gid_group.attrs[0]);
+ kfree(p->gid_group.attrs);
+
+err_free_pkey:
+ for (i = 0; i < dev->dev->caps.pkey_table_len[port_num]; ++i)
+ kfree(p->pkey_group.attrs[i]);
+ kfree(p->pkey_group.attrs);
+
+err_alloc:
+ kobject_put(dev->dev_ports_parent[slave]);
+ kfree(p);
+ return ret;
+}
+
+static int register_one_pkey_tree(struct mlx4_ib_dev *dev, int slave)
+{
+ char name[32];
+ int err;
+ int port;
+ struct kobject *p, *t;
+ struct mlx4_port *mport;
+ struct mlx4_active_ports actv_ports;
+
+ get_name(dev, name, slave, sizeof name);
+
+ dev->pkeys.device_parent[slave] =
+ kobject_create_and_add(name, kobject_get(dev->iov_parent));
+
+ if (!dev->pkeys.device_parent[slave]) {
+ err = -ENOMEM;
+ goto fail_dev;
+ }
+
+ INIT_LIST_HEAD(&dev->pkeys.pkey_port_list[slave]);
+
+ dev->dev_ports_parent[slave] =
+ kobject_create_and_add("ports",
+ kobject_get(dev->pkeys.device_parent[slave]));
+
+ if (!dev->dev_ports_parent[slave]) {
+ err = -ENOMEM;
+ goto err_ports;
+ }
+
+ actv_ports = mlx4_get_active_ports(dev->dev, slave);
+
+ for (port = 1; port <= dev->dev->caps.num_ports; ++port) {
+ if (!test_bit(port - 1, actv_ports.ports))
+ continue;
+ err = add_port(dev, port, slave);
+ if (err)
+ goto err_add;
+ }
+ return 0;
+
+err_add:
+ list_for_each_entry_safe(p, t,
+ &dev->pkeys.pkey_port_list[slave],
+ entry) {
+ list_del(&p->entry);
+ mport = container_of(p, struct mlx4_port, kobj);
+ sysfs_remove_group(p, &mport->pkey_group);
+ sysfs_remove_group(p, &mport->gid_group);
+ remove_vf_smi_entries(mport);
+ kobject_put(p);
+ }
+ kobject_put(dev->dev_ports_parent[slave]);
+
+err_ports:
+ kobject_put(dev->pkeys.device_parent[slave]);
+ /* extra put for the device_parent create_and_add */
+ kobject_put(dev->pkeys.device_parent[slave]);
+
+fail_dev:
+ kobject_put(dev->iov_parent);
+ return err;
+}
+
+static int register_pkey_tree(struct mlx4_ib_dev *device)
+{
+ int i;
+
+ if (!mlx4_is_master(device->dev))
+ return 0;
+
+ for (i = 0; i <= device->dev->persist->num_vfs; ++i)
+ register_one_pkey_tree(device, i);
+
+ return 0;
+}
+
+static void unregister_pkey_tree(struct mlx4_ib_dev *device)
+{
+ int slave;
+ struct kobject *p, *t;
+ struct mlx4_port *port;
+
+ if (!mlx4_is_master(device->dev))
+ return;
+
+ for (slave = device->dev->persist->num_vfs; slave >= 0; --slave) {
+ list_for_each_entry_safe(p, t,
+ &device->pkeys.pkey_port_list[slave],
+ entry) {
+ list_del(&p->entry);
+ port = container_of(p, struct mlx4_port, kobj);
+ sysfs_remove_group(p, &port->pkey_group);
+ sysfs_remove_group(p, &port->gid_group);
+ remove_vf_smi_entries(port);
+ kobject_put(p);
+ kobject_put(device->dev_ports_parent[slave]);
+ }
+ kobject_put(device->dev_ports_parent[slave]);
+ kobject_put(device->pkeys.device_parent[slave]);
+ kobject_put(device->pkeys.device_parent[slave]);
+ kobject_put(device->iov_parent);
+ }
+}
+
+int mlx4_ib_device_register_sysfs(struct mlx4_ib_dev *dev)
+{
+ int i;
+ int ret = 0;
+
+ if (!mlx4_is_master(dev->dev))
+ return 0;
+
+ dev->iov_parent =
+ kobject_create_and_add("iov",
+ kobject_get(dev->ib_dev.ports_parent->parent));
+ if (!dev->iov_parent) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ dev->ports_parent =
+ kobject_create_and_add("ports",
+ kobject_get(dev->iov_parent));
+ if (!dev->ports_parent) {
+ ret = -ENOMEM;
+ goto err_ports;
+ }
+
+ for (i = 1; i <= dev->ib_dev.phys_port_cnt; ++i) {
+ ret = add_port_entries(dev, i);
+ if (ret)
+ goto err_add_entries;
+ }
+
+ ret = register_pkey_tree(dev);
+ if (ret)
+ goto err_add_entries;
+ return 0;
+
+err_add_entries:
+ kobject_put(dev->ports_parent);
+
+err_ports:
+ kobject_put(dev->iov_parent);
+err:
+ kobject_put(dev->ib_dev.ports_parent->parent);
+ pr_err("mlx4_ib_device_register_sysfs error (%d)\n", ret);
+ return ret;
+}
+
+static void unregister_alias_guid_tree(struct mlx4_ib_dev *device)
+{
+ struct mlx4_ib_iov_port *p;
+ int i;
+
+ if (!mlx4_is_master(device->dev))
+ return;
+
+ for (i = 0; i < device->dev->caps.num_ports; i++) {
+ p = &device->iov_ports[i];
+ kobject_put(p->admin_alias_parent);
+ kobject_put(p->gids_parent);
+ kobject_put(p->pkeys_parent);
+ kobject_put(p->mcgs_parent);
+ kobject_put(p->cur_port);
+ kobject_put(p->cur_port);
+ kobject_put(p->cur_port);
+ kobject_put(p->cur_port);
+ kobject_put(p->cur_port);
+ kobject_put(p->dev->ports_parent);
+ kfree(p->dentr_ar);
+ }
+}
+
+void mlx4_ib_device_unregister_sysfs(struct mlx4_ib_dev *device)
+{
+ unregister_alias_guid_tree(device);
+ unregister_pkey_tree(device);
+ kobject_put(device->ports_parent);
+ kobject_put(device->iov_parent);
+ kobject_put(device->iov_parent);
+ kobject_put(device->ib_dev.ports_parent->parent);
+}
diff --git a/drivers/infiniband/hw/mlx4/user.h b/drivers/infiniband/hw/mlx4/user.h
new file mode 100644
index 000000000..07e6769ef
--- /dev/null
+++ b/drivers/infiniband/hw/mlx4/user.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX4_IB_USER_H
+#define MLX4_IB_USER_H
+
+#include <linux/types.h>
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+
+#define MLX4_IB_UVERBS_NO_DEV_CAPS_ABI_VERSION 3
+#define MLX4_IB_UVERBS_ABI_VERSION 4
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
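+/*
+ * For example, userspace would pass a buffer pointer as
+ *	cmd.buf_addr = (__u64) (uintptr_t) buf;
+ * (illustrative names) rather than embedding the pointer itself.
+ */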
+
+struct mlx4_ib_alloc_ucontext_resp_v3 {
+ __u32 qp_tab_size;
+ __u16 bf_reg_size;
+ __u16 bf_regs_per_page;
+};
+
+struct mlx4_ib_alloc_ucontext_resp {
+ __u32 dev_caps;
+ __u32 qp_tab_size;
+ __u16 bf_reg_size;
+ __u16 bf_regs_per_page;
+ __u32 cqe_size;
+};
+
+struct mlx4_ib_alloc_pd_resp {
+ __u32 pdn;
+ __u32 reserved;
+};
+
+struct mlx4_ib_create_cq {
+ __u64 buf_addr;
+ __u64 db_addr;
+};
+
+struct mlx4_ib_create_cq_resp {
+ __u32 cqn;
+ __u32 reserved;
+};
+
+struct mlx4_ib_resize_cq {
+ __u64 buf_addr;
+};
+
+struct mlx4_ib_create_srq {
+ __u64 buf_addr;
+ __u64 db_addr;
+};
+
+struct mlx4_ib_create_srq_resp {
+ __u32 srqn;
+ __u32 reserved;
+};
+
+struct mlx4_ib_create_qp {
+ __u64 buf_addr;
+ __u64 db_addr;
+ __u8 log_sq_bb_count;
+ __u8 log_sq_stride;
+ __u8 sq_no_prefetch;
+ __u8 reserved[5];
+};
+
+#endif /* MLX4_IB_USER_H */
diff --git a/drivers/infiniband/hw/mlx5/Kconfig b/drivers/infiniband/hw/mlx5/Kconfig
new file mode 100644
index 000000000..10df386c6
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/Kconfig
@@ -0,0 +1,10 @@
+config MLX5_INFINIBAND
+ tristate "Mellanox Connect-IB HCA support"
+ depends on NETDEVICES && ETHERNET && PCI
+ select NET_VENDOR_MELLANOX
+ select MLX5_CORE
+ ---help---
+ This driver provides low-level InfiniBand support for
+ Mellanox Connect-IB PCI Express host channel adapters (HCAs).
+ This is required to use InfiniBand protocols such as
+ IP-over-IB or SRP with these devices.
diff --git a/drivers/infiniband/hw/mlx5/Makefile b/drivers/infiniband/hw/mlx5/Makefile
new file mode 100644
index 000000000..27a70159e
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_MLX5_INFINIBAND) += mlx5_ib.o
+
+mlx5_ib-y := main.o cq.o doorbell.o qp.o mem.o srq.o mr.o ah.o mad.o
+mlx5_ib-$(CONFIG_INFINIBAND_ON_DEMAND_PAGING) += odp.o
diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c
new file mode 100644
index 000000000..66080580e
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/ah.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "mlx5_ib.h"
+
+struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr,
+ struct mlx5_ib_ah *ah)
+{
+ if (ah_attr->ah_flags & IB_AH_GRH) {
+ memcpy(ah->av.rgid, &ah_attr->grh.dgid, 16);
+ ah->av.grh_gid_fl = cpu_to_be32(ah_attr->grh.flow_label |
+ (1 << 30) |
+ ah_attr->grh.sgid_index << 20);
+ ah->av.hop_limit = ah_attr->grh.hop_limit;
+ ah->av.tclass = ah_attr->grh.traffic_class;
+ }
+
+ ah->av.rlid = cpu_to_be16(ah_attr->dlid);
+ ah->av.fl_mlid = ah_attr->src_path_bits & 0x7f;
+ ah->av.stat_rate_sl = (ah_attr->static_rate << 4) | (ah_attr->sl & 0xf);
+
+ return &ah->ibah;
+}
+
+struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+ struct mlx5_ib_ah *ah;
+
+ ah = kzalloc(sizeof(*ah), GFP_ATOMIC);
+ if (!ah)
+ return ERR_PTR(-ENOMEM);
+
+ return create_ib_ah(ah_attr, ah); /* never fails */
+}
+
+int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
+{
+ struct mlx5_ib_ah *ah = to_mah(ibah);
+ u32 tmp;
+
+ memset(ah_attr, 0, sizeof(*ah_attr));
+
+ tmp = be32_to_cpu(ah->av.grh_gid_fl);
+ if (tmp & (1 << 30)) {
+ ah_attr->ah_flags = IB_AH_GRH;
+ ah_attr->grh.sgid_index = (tmp >> 20) & 0xff;
+ ah_attr->grh.flow_label = tmp & 0xfffff;
+ memcpy(&ah_attr->grh.dgid, ah->av.rgid, 16);
+ ah_attr->grh.hop_limit = ah->av.hop_limit;
+ ah_attr->grh.traffic_class = ah->av.tclass;
+ }
+ ah_attr->dlid = be16_to_cpu(ah->av.rlid);
+ ah_attr->static_rate = ah->av.stat_rate_sl >> 4;
+ ah_attr->sl = ah->av.stat_rate_sl & 0xf;
+
+ return 0;
+}
+
+int mlx5_ib_destroy_ah(struct ib_ah *ah)
+{
+ kfree(to_mah(ah));
+ return 0;
+}
diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c
new file mode 100644
index 000000000..2ee6b1051
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/cq.c
@@ -0,0 +1,1189 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kref.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+#include "mlx5_ib.h"
+#include "user.h"
+
+static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq)
+{
+ struct ib_cq *ibcq = &to_mibcq(cq)->ibcq;
+
+ ibcq->comp_handler(ibcq, ibcq->cq_context);
+}
+
+static void mlx5_ib_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type)
+{
+ struct mlx5_ib_cq *cq = container_of(mcq, struct mlx5_ib_cq, mcq);
+ struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device);
+ struct ib_cq *ibcq = &cq->ibcq;
+ struct ib_event event;
+
+ if (type != MLX5_EVENT_TYPE_CQ_ERROR) {
+ mlx5_ib_warn(dev, "Unexpected event type %d on CQ %06x\n",
+ type, mcq->cqn);
+ return;
+ }
+
+ if (ibcq->event_handler) {
+ event.device = &dev->ib_dev;
+ event.event = IB_EVENT_CQ_ERR;
+ event.element.cq = ibcq;
+ ibcq->event_handler(&event, ibcq->cq_context);
+ }
+}
+
+static void *get_cqe_from_buf(struct mlx5_ib_cq_buf *buf, int n, int size)
+{
+ return mlx5_buf_offset(&buf->buf, n * size);
+}
+
+static void *get_cqe(struct mlx5_ib_cq *cq, int n)
+{
+ return get_cqe_from_buf(&cq->buf, n, cq->mcq.cqe_sz);
+}
+
+static u8 sw_ownership_bit(int n, int nent)
+{
+ return (n & nent) ? 1 : 0;
+}
+
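+/*
+ * Return the CQE at index n if it has been written by hardware and its
+ * ownership bit matches the software expectation for the current pass
+ * over the CQ ring; otherwise return NULL (no new completion).
+ */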
+static void *get_sw_cqe(struct mlx5_ib_cq *cq, int n)
+{
+ void *cqe = get_cqe(cq, n & cq->ibcq.cqe);
+ struct mlx5_cqe64 *cqe64;
+
+ cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
+
+ if (likely((cqe64->op_own) >> 4 != MLX5_CQE_INVALID) &&
+ !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ibcq.cqe + 1)))) {
+ return cqe;
+ } else {
+ return NULL;
+ }
+}
+
+static void *next_cqe_sw(struct mlx5_ib_cq *cq)
+{
+ return get_sw_cqe(cq, cq->mcq.cons_index);
+}
+
+static enum ib_wc_opcode get_umr_comp(struct mlx5_ib_wq *wq, int idx)
+{
+ switch (wq->wr_data[idx]) {
+ case MLX5_IB_WR_UMR:
+ return 0;
+
+ case IB_WR_LOCAL_INV:
+ return IB_WC_LOCAL_INV;
+
+ case IB_WR_FAST_REG_MR:
+ return IB_WC_FAST_REG_MR;
+
+ default:
+ pr_warn("unknown completion status\n");
+ return 0;
+ }
+}
+
+static void handle_good_req(struct ib_wc *wc, struct mlx5_cqe64 *cqe,
+ struct mlx5_ib_wq *wq, int idx)
+{
+ wc->wc_flags = 0;
+ switch (be32_to_cpu(cqe->sop_drop_qpn) >> 24) {
+ case MLX5_OPCODE_RDMA_WRITE_IMM:
+ wc->wc_flags |= IB_WC_WITH_IMM;
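+		/* fall through */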
+ case MLX5_OPCODE_RDMA_WRITE:
+ wc->opcode = IB_WC_RDMA_WRITE;
+ break;
+ case MLX5_OPCODE_SEND_IMM:
+ wc->wc_flags |= IB_WC_WITH_IMM;
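+		/* fall through */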
+ case MLX5_OPCODE_SEND:
+ case MLX5_OPCODE_SEND_INVAL:
+ wc->opcode = IB_WC_SEND;
+ break;
+ case MLX5_OPCODE_RDMA_READ:
+ wc->opcode = IB_WC_RDMA_READ;
+ wc->byte_len = be32_to_cpu(cqe->byte_cnt);
+ break;
+ case MLX5_OPCODE_ATOMIC_CS:
+ wc->opcode = IB_WC_COMP_SWAP;
+ wc->byte_len = 8;
+ break;
+ case MLX5_OPCODE_ATOMIC_FA:
+ wc->opcode = IB_WC_FETCH_ADD;
+ wc->byte_len = 8;
+ break;
+ case MLX5_OPCODE_ATOMIC_MASKED_CS:
+ wc->opcode = IB_WC_MASKED_COMP_SWAP;
+ wc->byte_len = 8;
+ break;
+ case MLX5_OPCODE_ATOMIC_MASKED_FA:
+ wc->opcode = IB_WC_MASKED_FETCH_ADD;
+ wc->byte_len = 8;
+ break;
+ case MLX5_OPCODE_BIND_MW:
+ wc->opcode = IB_WC_BIND_MW;
+ break;
+ case MLX5_OPCODE_UMR:
+ wc->opcode = get_umr_comp(wq, idx);
+ break;
+ }
+}
+
+enum {
+ MLX5_GRH_IN_BUFFER = 1,
+ MLX5_GRH_IN_CQE = 2,
+};
+
+static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe,
+ struct mlx5_ib_qp *qp)
+{
+ struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device);
+ struct mlx5_ib_srq *srq;
+ struct mlx5_ib_wq *wq;
+ u16 wqe_ctr;
+ u8 g;
+
+ if (qp->ibqp.srq || qp->ibqp.xrcd) {
+ struct mlx5_core_srq *msrq = NULL;
+
+ if (qp->ibqp.xrcd) {
+ msrq = mlx5_core_get_srq(dev->mdev,
+ be32_to_cpu(cqe->srqn));
+ srq = to_mibsrq(msrq);
+ } else {
+ srq = to_msrq(qp->ibqp.srq);
+ }
+ if (srq) {
+ wqe_ctr = be16_to_cpu(cqe->wqe_counter);
+ wc->wr_id = srq->wrid[wqe_ctr];
+ mlx5_ib_free_srq_wqe(srq, wqe_ctr);
+ if (msrq && atomic_dec_and_test(&msrq->refcount))
+ complete(&msrq->free);
+ }
+ } else {
+ wq = &qp->rq;
+ wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+ ++wq->tail;
+ }
+ wc->byte_len = be32_to_cpu(cqe->byte_cnt);
+
+ switch (cqe->op_own >> 4) {
+ case MLX5_CQE_RESP_WR_IMM:
+ wc->opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ wc->wc_flags = IB_WC_WITH_IMM;
+ wc->ex.imm_data = cqe->imm_inval_pkey;
+ break;
+ case MLX5_CQE_RESP_SEND:
+ wc->opcode = IB_WC_RECV;
+ wc->wc_flags = 0;
+ break;
+ case MLX5_CQE_RESP_SEND_IMM:
+ wc->opcode = IB_WC_RECV;
+ wc->wc_flags = IB_WC_WITH_IMM;
+ wc->ex.imm_data = cqe->imm_inval_pkey;
+ break;
+ case MLX5_CQE_RESP_SEND_INV:
+ wc->opcode = IB_WC_RECV;
+ wc->wc_flags = IB_WC_WITH_INVALIDATE;
+ wc->ex.invalidate_rkey = be32_to_cpu(cqe->imm_inval_pkey);
+ break;
+ }
+ wc->slid = be16_to_cpu(cqe->slid);
+ wc->sl = (be32_to_cpu(cqe->flags_rqpn) >> 24) & 0xf;
+ wc->src_qp = be32_to_cpu(cqe->flags_rqpn) & 0xffffff;
+ wc->dlid_path_bits = cqe->ml_path;
+ g = (be32_to_cpu(cqe->flags_rqpn) >> 28) & 3;
+ wc->wc_flags |= g ? IB_WC_GRH : 0;
+ wc->pkey_index = be32_to_cpu(cqe->imm_inval_pkey) & 0xffff;
+}
+
+static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe)
+{
+ __be32 *p = (__be32 *)cqe;
+ int i;
+
+ mlx5_ib_warn(dev, "dump error cqe\n");
+ for (i = 0; i < sizeof(*cqe) / 16; i++, p += 4)
+ pr_info("%08x %08x %08x %08x\n", be32_to_cpu(p[0]),
+ be32_to_cpu(p[1]), be32_to_cpu(p[2]),
+ be32_to_cpu(p[3]));
+}
+
+static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev,
+ struct mlx5_err_cqe *cqe,
+ struct ib_wc *wc)
+{
+ int dump = 1;
+
+ switch (cqe->syndrome) {
+ case MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR:
+ wc->status = IB_WC_LOC_LEN_ERR;
+ break;
+ case MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR:
+ wc->status = IB_WC_LOC_QP_OP_ERR;
+ break;
+ case MLX5_CQE_SYNDROME_LOCAL_PROT_ERR:
+ wc->status = IB_WC_LOC_PROT_ERR;
+ break;
+ case MLX5_CQE_SYNDROME_WR_FLUSH_ERR:
+ dump = 0;
+ wc->status = IB_WC_WR_FLUSH_ERR;
+ break;
+ case MLX5_CQE_SYNDROME_MW_BIND_ERR:
+ wc->status = IB_WC_MW_BIND_ERR;
+ break;
+ case MLX5_CQE_SYNDROME_BAD_RESP_ERR:
+ wc->status = IB_WC_BAD_RESP_ERR;
+ break;
+ case MLX5_CQE_SYNDROME_LOCAL_ACCESS_ERR:
+ wc->status = IB_WC_LOC_ACCESS_ERR;
+ break;
+ case MLX5_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR:
+ wc->status = IB_WC_REM_INV_REQ_ERR;
+ break;
+ case MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR:
+ wc->status = IB_WC_REM_ACCESS_ERR;
+ break;
+ case MLX5_CQE_SYNDROME_REMOTE_OP_ERR:
+ wc->status = IB_WC_REM_OP_ERR;
+ break;
+ case MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR:
+ wc->status = IB_WC_RETRY_EXC_ERR;
+ dump = 0;
+ break;
+ case MLX5_CQE_SYNDROME_RNR_RETRY_EXC_ERR:
+ wc->status = IB_WC_RNR_RETRY_EXC_ERR;
+ dump = 0;
+ break;
+ case MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR:
+ wc->status = IB_WC_REM_ABORT_ERR;
+ break;
+ default:
+ wc->status = IB_WC_GENERAL_ERR;
+ break;
+ }
+
+ wc->vendor_err = cqe->vendor_err_synd;
+ if (dump)
+ dump_cqe(dev, cqe);
+}
+
+static int is_atomic_response(struct mlx5_ib_qp *qp, uint16_t idx)
+{
+ /* TBD: waiting decision
+ */
+ return 0;
+}
+
+static void *mlx5_get_atomic_laddr(struct mlx5_ib_qp *qp, uint16_t idx)
+{
+ struct mlx5_wqe_data_seg *dpseg;
+ void *addr;
+
+ dpseg = mlx5_get_send_wqe(qp, idx) + sizeof(struct mlx5_wqe_ctrl_seg) +
+ sizeof(struct mlx5_wqe_raddr_seg) +
+ sizeof(struct mlx5_wqe_atomic_seg);
+ addr = (void *)(unsigned long)be64_to_cpu(dpseg->addr);
+ return addr;
+}
+
+static void handle_atomic(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64,
+ uint16_t idx)
+{
+ void *addr;
+ int byte_count;
+ int i;
+
+ if (!is_atomic_response(qp, idx))
+ return;
+
+ byte_count = be32_to_cpu(cqe64->byte_cnt);
+ addr = mlx5_get_atomic_laddr(qp, idx);
+
+ if (byte_count == 4) {
+ *(uint32_t *)addr = be32_to_cpu(*((__be32 *)addr));
+ } else {
+ for (i = 0; i < byte_count; i += 8) {
+ *(uint64_t *)addr = be64_to_cpu(*((__be64 *)addr));
+ addr += 8;
+ }
+ }
+
+ return;
+}
+
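+/* Walk the send work-request list from the last polled WQE up to the one
+ * reported by this CQE, fixing up atomic response data where needed, and
+ * advance last_poll past the completed WQE.
+ */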
+static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64,
+ u16 tail, u16 head)
+{
+ u16 idx;
+
+ do {
+ idx = tail & (qp->sq.wqe_cnt - 1);
+ handle_atomic(qp, cqe64, idx);
+ if (idx == head)
+ break;
+
+ tail = qp->sq.w_list[idx].next;
+ } while (1);
+ tail = qp->sq.w_list[idx].next;
+ qp->sq.last_poll = tail;
+}
+
+static void free_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf)
+{
+ mlx5_buf_free(dev->mdev, &buf->buf);
+}
+
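+/* Decode a signature error CQE: classify the failure as a guard, reftag or
+ * apptag mismatch and record the expected/actual values, the error offset
+ * and the memory key it was reported on.
+ */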
+static void get_sig_err_item(struct mlx5_sig_err_cqe *cqe,
+ struct ib_sig_err *item)
+{
+ u16 syndrome = be16_to_cpu(cqe->syndrome);
+
+#define GUARD_ERR (1 << 13)
+#define APPTAG_ERR (1 << 12)
+#define REFTAG_ERR (1 << 11)
+
+ if (syndrome & GUARD_ERR) {
+ item->err_type = IB_SIG_BAD_GUARD;
+ item->expected = be32_to_cpu(cqe->expected_trans_sig) >> 16;
+ item->actual = be32_to_cpu(cqe->actual_trans_sig) >> 16;
+	} else if (syndrome & REFTAG_ERR) {
+ item->err_type = IB_SIG_BAD_REFTAG;
+ item->expected = be32_to_cpu(cqe->expected_reftag);
+ item->actual = be32_to_cpu(cqe->actual_reftag);
+	} else if (syndrome & APPTAG_ERR) {
+ item->err_type = IB_SIG_BAD_APPTAG;
+ item->expected = be32_to_cpu(cqe->expected_trans_sig) & 0xffff;
+ item->actual = be32_to_cpu(cqe->actual_trans_sig) & 0xffff;
+ } else {
+ pr_err("Got signature completion error with bad syndrome %04x\n",
+ syndrome);
+ }
+
+ item->sig_err_offset = be64_to_cpu(cqe->err_offset);
+ item->key = be32_to_cpu(cqe->mkey);
+}
+
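+/* Poll a single CQE and translate it into @wc. Returns -EAGAIN when no
+ * software-owned CQE is available, a negative error on failure, 0 otherwise.
+ */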
+static int mlx5_poll_one(struct mlx5_ib_cq *cq,
+ struct mlx5_ib_qp **cur_qp,
+ struct ib_wc *wc)
+{
+ struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device);
+ struct mlx5_err_cqe *err_cqe;
+ struct mlx5_cqe64 *cqe64;
+ struct mlx5_core_qp *mqp;
+ struct mlx5_ib_wq *wq;
+ struct mlx5_sig_err_cqe *sig_err_cqe;
+ struct mlx5_core_mr *mmr;
+ struct mlx5_ib_mr *mr;
+ uint8_t opcode;
+ uint32_t qpn;
+ u16 wqe_ctr;
+ void *cqe;
+ int idx;
+
+repoll:
+ cqe = next_cqe_sw(cq);
+ if (!cqe)
+ return -EAGAIN;
+
+ cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
+
+ ++cq->mcq.cons_index;
+
+ /* Make sure we read CQ entry contents after we've checked the
+ * ownership bit.
+ */
+ rmb();
+
+ opcode = cqe64->op_own >> 4;
+ if (unlikely(opcode == MLX5_CQE_RESIZE_CQ)) {
+ if (likely(cq->resize_buf)) {
+ free_cq_buf(dev, &cq->buf);
+ cq->buf = *cq->resize_buf;
+ kfree(cq->resize_buf);
+ cq->resize_buf = NULL;
+ goto repoll;
+ } else {
+ mlx5_ib_warn(dev, "unexpected resize cqe\n");
+ }
+ }
+
+ qpn = ntohl(cqe64->sop_drop_qpn) & 0xffffff;
+ if (!*cur_qp || (qpn != (*cur_qp)->ibqp.qp_num)) {
+ /* We do not have to take the QP table lock here,
+ * because CQs will be locked while QPs are removed
+ * from the table.
+ */
+ mqp = __mlx5_qp_lookup(dev->mdev, qpn);
+ if (unlikely(!mqp)) {
+ mlx5_ib_warn(dev, "CQE@CQ %06x for unknown QPN %6x\n",
+ cq->mcq.cqn, qpn);
+ return -EINVAL;
+ }
+
+ *cur_qp = to_mibqp(mqp);
+ }
+
+ wc->qp = &(*cur_qp)->ibqp;
+ switch (opcode) {
+ case MLX5_CQE_REQ:
+ wq = &(*cur_qp)->sq;
+ wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
+ idx = wqe_ctr & (wq->wqe_cnt - 1);
+ handle_good_req(wc, cqe64, wq, idx);
+ handle_atomics(*cur_qp, cqe64, wq->last_poll, idx);
+ wc->wr_id = wq->wrid[idx];
+ wq->tail = wq->wqe_head[idx] + 1;
+ wc->status = IB_WC_SUCCESS;
+ break;
+ case MLX5_CQE_RESP_WR_IMM:
+ case MLX5_CQE_RESP_SEND:
+ case MLX5_CQE_RESP_SEND_IMM:
+ case MLX5_CQE_RESP_SEND_INV:
+ handle_responder(wc, cqe64, *cur_qp);
+ wc->status = IB_WC_SUCCESS;
+ break;
+ case MLX5_CQE_RESIZE_CQ:
+ break;
+ case MLX5_CQE_REQ_ERR:
+ case MLX5_CQE_RESP_ERR:
+ err_cqe = (struct mlx5_err_cqe *)cqe64;
+ mlx5_handle_error_cqe(dev, err_cqe, wc);
+ mlx5_ib_dbg(dev, "%s error cqe on cqn 0x%x:\n",
+ opcode == MLX5_CQE_REQ_ERR ?
+ "Requestor" : "Responder", cq->mcq.cqn);
+ mlx5_ib_dbg(dev, "syndrome 0x%x, vendor syndrome 0x%x\n",
+ err_cqe->syndrome, err_cqe->vendor_err_synd);
+ if (opcode == MLX5_CQE_REQ_ERR) {
+ wq = &(*cur_qp)->sq;
+ wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
+ idx = wqe_ctr & (wq->wqe_cnt - 1);
+ wc->wr_id = wq->wrid[idx];
+ wq->tail = wq->wqe_head[idx] + 1;
+ } else {
+ struct mlx5_ib_srq *srq;
+
+ if ((*cur_qp)->ibqp.srq) {
+ srq = to_msrq((*cur_qp)->ibqp.srq);
+ wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
+ wc->wr_id = srq->wrid[wqe_ctr];
+ mlx5_ib_free_srq_wqe(srq, wqe_ctr);
+ } else {
+ wq = &(*cur_qp)->rq;
+ wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
+ ++wq->tail;
+ }
+ }
+ break;
+ case MLX5_CQE_SIG_ERR:
+ sig_err_cqe = (struct mlx5_sig_err_cqe *)cqe64;
+
+ read_lock(&dev->mdev->priv.mr_table.lock);
+ mmr = __mlx5_mr_lookup(dev->mdev,
+ mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey)));
+ if (unlikely(!mmr)) {
+ read_unlock(&dev->mdev->priv.mr_table.lock);
+ mlx5_ib_warn(dev, "CQE@CQ %06x for unknown MR %6x\n",
+ cq->mcq.cqn, be32_to_cpu(sig_err_cqe->mkey));
+ return -EINVAL;
+ }
+
+ mr = to_mibmr(mmr);
+ get_sig_err_item(sig_err_cqe, &mr->sig->err_item);
+ mr->sig->sig_err_exists = true;
+ mr->sig->sigerr_count++;
+
+ mlx5_ib_warn(dev, "CQN: 0x%x Got SIGERR on key: 0x%x err_type %x err_offset %llx expected %x actual %x\n",
+ cq->mcq.cqn, mr->sig->err_item.key,
+ mr->sig->err_item.err_type,
+ mr->sig->err_item.sig_err_offset,
+ mr->sig->err_item.expected,
+ mr->sig->err_item.actual);
+
+ read_unlock(&dev->mdev->priv.mr_table.lock);
+ goto repoll;
+ }
+
+ return 0;
+}
+
+int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
+{
+ struct mlx5_ib_cq *cq = to_mcq(ibcq);
+ struct mlx5_ib_qp *cur_qp = NULL;
+ unsigned long flags;
+ int npolled;
+ int err = 0;
+
+ spin_lock_irqsave(&cq->lock, flags);
+
+ for (npolled = 0; npolled < num_entries; npolled++) {
+ err = mlx5_poll_one(cq, &cur_qp, wc + npolled);
+ if (err)
+ break;
+ }
+
+ if (npolled)
+ mlx5_cq_set_ci(&cq->mcq);
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+ if (err == 0 || err == -EAGAIN)
+ return npolled;
+ else
+ return err;
+}
+
+int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
+{
+ struct mlx5_core_dev *mdev = to_mdev(ibcq->device)->mdev;
+ void __iomem *uar_page = mdev->priv.uuari.uars[0].map;
+
+ mlx5_cq_arm(&to_mcq(ibcq)->mcq,
+ (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ?
+ MLX5_CQ_DB_REQ_NOT_SOL : MLX5_CQ_DB_REQ_NOT,
+ uar_page,
+ MLX5_GET_DOORBELL_LOCK(&mdev->priv.cq_uar_lock),
+ to_mcq(ibcq)->mcq.cons_index);
+
+ return 0;
+}
+
+static int alloc_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf,
+ int nent, int cqe_size)
+{
+ int err;
+
+ err = mlx5_buf_alloc(dev->mdev, nent * cqe_size,
+ PAGE_SIZE * 2, &buf->buf);
+ if (err)
+ return err;
+
+ buf->cqe_size = cqe_size;
+ buf->nent = nent;
+
+ return 0;
+}
+
+static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
+ struct ib_ucontext *context, struct mlx5_ib_cq *cq,
+ int entries, struct mlx5_create_cq_mbox_in **cqb,
+ int *cqe_size, int *index, int *inlen)
+{
+ struct mlx5_ib_create_cq ucmd;
+ size_t ucmdlen;
+ int page_shift;
+ int npages;
+ int ncont;
+ int err;
+
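+	/* Accept the older, shorter command layout from user libraries that
+	 * do not know about the reserved field.
+	 */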
+ ucmdlen =
+ (udata->inlen - sizeof(struct ib_uverbs_cmd_hdr) <
+ sizeof(ucmd)) ? (sizeof(ucmd) -
+ sizeof(ucmd.reserved)) : sizeof(ucmd);
+
+ if (ib_copy_from_udata(&ucmd, udata, ucmdlen))
+ return -EFAULT;
+
+ if (ucmdlen == sizeof(ucmd) &&
+ ucmd.reserved != 0)
+ return -EINVAL;
+
+ if (ucmd.cqe_size != 64 && ucmd.cqe_size != 128)
+ return -EINVAL;
+
+ *cqe_size = ucmd.cqe_size;
+
+ cq->buf.umem = ib_umem_get(context, ucmd.buf_addr,
+ entries * ucmd.cqe_size,
+ IB_ACCESS_LOCAL_WRITE, 1);
+ if (IS_ERR(cq->buf.umem)) {
+ err = PTR_ERR(cq->buf.umem);
+ return err;
+ }
+
+ err = mlx5_ib_db_map_user(to_mucontext(context), ucmd.db_addr,
+ &cq->db);
+ if (err)
+ goto err_umem;
+
+ mlx5_ib_cont_pages(cq->buf.umem, ucmd.buf_addr, &npages, &page_shift,
+ &ncont, NULL);
+ mlx5_ib_dbg(dev, "addr 0x%llx, size %u, npages %d, page_shift %d, ncont %d\n",
+ ucmd.buf_addr, entries * ucmd.cqe_size, npages, page_shift, ncont);
+
+ *inlen = sizeof(**cqb) + sizeof(*(*cqb)->pas) * ncont;
+ *cqb = mlx5_vzalloc(*inlen);
+ if (!*cqb) {
+ err = -ENOMEM;
+ goto err_db;
+ }
+ mlx5_ib_populate_pas(dev, cq->buf.umem, page_shift, (*cqb)->pas, 0);
+ (*cqb)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
+
+ *index = to_mucontext(context)->uuari.uars[0].index;
+
+ return 0;
+
+err_db:
+ mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db);
+
+err_umem:
+ ib_umem_release(cq->buf.umem);
+ return err;
+}
+
+static void destroy_cq_user(struct mlx5_ib_cq *cq, struct ib_ucontext *context)
+{
+ mlx5_ib_db_unmap_user(to_mucontext(context), &cq->db);
+ ib_umem_release(cq->buf.umem);
+}
+
+static void init_cq_buf(struct mlx5_ib_cq *cq, struct mlx5_ib_cq_buf *buf)
+{
+ int i;
+ void *cqe;
+ struct mlx5_cqe64 *cqe64;
+
+ for (i = 0; i < buf->nent; i++) {
+ cqe = get_cqe_from_buf(buf, i, buf->cqe_size);
+ cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64;
+ cqe64->op_own = MLX5_CQE_INVALID << 4;
+ }
+}
+
+static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
+ int entries, int cqe_size,
+ struct mlx5_create_cq_mbox_in **cqb,
+ int *index, int *inlen)
+{
+ int err;
+
+ err = mlx5_db_alloc(dev->mdev, &cq->db);
+ if (err)
+ return err;
+
+ cq->mcq.set_ci_db = cq->db.db;
+ cq->mcq.arm_db = cq->db.db + 1;
+ cq->mcq.cqe_sz = cqe_size;
+
+ err = alloc_cq_buf(dev, &cq->buf, entries, cqe_size);
+ if (err)
+ goto err_db;
+
+ init_cq_buf(cq, &cq->buf);
+
+ *inlen = sizeof(**cqb) + sizeof(*(*cqb)->pas) * cq->buf.buf.npages;
+ *cqb = mlx5_vzalloc(*inlen);
+ if (!*cqb) {
+ err = -ENOMEM;
+ goto err_buf;
+ }
+ mlx5_fill_page_array(&cq->buf.buf, (*cqb)->pas);
+
+ (*cqb)->ctx.log_pg_sz = cq->buf.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT;
+ *index = dev->mdev->priv.uuari.uars[0].index;
+
+ return 0;
+
+err_buf:
+ free_cq_buf(dev, &cq->buf);
+
+err_db:
+ mlx5_db_free(dev->mdev, &cq->db);
+ return err;
+}
+
+static void destroy_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq)
+{
+ free_cq_buf(dev, &cq->buf);
+ mlx5_db_free(dev->mdev, &cq->db);
+}
+
+struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries,
+ int vector, struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct mlx5_create_cq_mbox_in *cqb = NULL;
+ struct mlx5_ib_dev *dev = to_mdev(ibdev);
+ struct mlx5_ib_cq *cq;
+ int uninitialized_var(index);
+ int uninitialized_var(inlen);
+ int cqe_size;
+ int irqn;
+ int eqn;
+ int err;
+
+ if (entries < 0)
+ return ERR_PTR(-EINVAL);
+
+ entries = roundup_pow_of_two(entries + 1);
+ if (entries > dev->mdev->caps.gen.max_cqes)
+ return ERR_PTR(-EINVAL);
+
+ cq = kzalloc(sizeof(*cq), GFP_KERNEL);
+ if (!cq)
+ return ERR_PTR(-ENOMEM);
+
+ cq->ibcq.cqe = entries - 1;
+ mutex_init(&cq->resize_mutex);
+ spin_lock_init(&cq->lock);
+ cq->resize_buf = NULL;
+ cq->resize_umem = NULL;
+
+ if (context) {
+ err = create_cq_user(dev, udata, context, cq, entries,
+ &cqb, &cqe_size, &index, &inlen);
+ if (err)
+ goto err_create;
+ } else {
+ /* for now choose 64 bytes till we have a proper interface */
+ cqe_size = 64;
+ err = create_cq_kernel(dev, cq, entries, cqe_size, &cqb,
+ &index, &inlen);
+ if (err)
+ goto err_create;
+ }
+
+ cq->cqe_size = cqe_size;
+ cqb->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5;
+ cqb->ctx.log_sz_usr_page = cpu_to_be32((ilog2(entries) << 24) | index);
+ err = mlx5_vector2eqn(dev->mdev, vector, &eqn, &irqn);
+ if (err)
+ goto err_cqb;
+
+ cqb->ctx.c_eqn = cpu_to_be16(eqn);
+ cqb->ctx.db_record_addr = cpu_to_be64(cq->db.dma);
+
+ err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen);
+ if (err)
+ goto err_cqb;
+
+ mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn);
+ cq->mcq.irqn = irqn;
+ cq->mcq.comp = mlx5_ib_cq_comp;
+ cq->mcq.event = mlx5_ib_cq_event;
+
+ if (context)
+ if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof(__u32))) {
+ err = -EFAULT;
+ goto err_cmd;
+ }
+
+
+ kvfree(cqb);
+ return &cq->ibcq;
+
+err_cmd:
+ mlx5_core_destroy_cq(dev->mdev, &cq->mcq);
+
+err_cqb:
+ kvfree(cqb);
+ if (context)
+ destroy_cq_user(cq, context);
+ else
+ destroy_cq_kernel(dev, cq);
+
+err_create:
+ kfree(cq);
+
+ return ERR_PTR(err);
+}
+
+
+int mlx5_ib_destroy_cq(struct ib_cq *cq)
+{
+ struct mlx5_ib_dev *dev = to_mdev(cq->device);
+ struct mlx5_ib_cq *mcq = to_mcq(cq);
+ struct ib_ucontext *context = NULL;
+
+ if (cq->uobject)
+ context = cq->uobject->context;
+
+ mlx5_core_destroy_cq(dev->mdev, &mcq->mcq);
+ if (context)
+ destroy_cq_user(mcq, context);
+ else
+ destroy_cq_kernel(dev, mcq);
+
+ kfree(mcq);
+
+ return 0;
+}
+
+static int is_equal_rsn(struct mlx5_cqe64 *cqe64, u32 rsn)
+{
+ return rsn == (ntohl(cqe64->sop_drop_qpn) & 0xffffff);
+}
+
+void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 rsn, struct mlx5_ib_srq *srq)
+{
+ struct mlx5_cqe64 *cqe64, *dest64;
+ void *cqe, *dest;
+ u32 prod_index;
+ int nfreed = 0;
+ u8 owner_bit;
+
+ if (!cq)
+ return;
+
+ /* First we need to find the current producer index, so we
+ * know where to start cleaning from. It doesn't matter if HW
+ * adds new entries after this loop -- the QP we're worried
+ * about is already in RESET, so the new entries won't come
+ * from our QP and therefore don't need to be checked.
+ */
+ for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); prod_index++)
+ if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe)
+ break;
+
+ /* Now sweep backwards through the CQ, removing CQ entries
+ * that match our QP by copying older entries on top of them.
+ */
+ while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) {
+ cqe = get_cqe(cq, prod_index & cq->ibcq.cqe);
+ cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
+ if (is_equal_rsn(cqe64, rsn)) {
+ if (srq && (ntohl(cqe64->srqn) & 0xffffff))
+ mlx5_ib_free_srq_wqe(srq, be16_to_cpu(cqe64->wqe_counter));
+ ++nfreed;
+ } else if (nfreed) {
+ dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe);
+ dest64 = (cq->mcq.cqe_sz == 64) ? dest : dest + 64;
+ owner_bit = dest64->op_own & MLX5_CQE_OWNER_MASK;
+ memcpy(dest, cqe, cq->mcq.cqe_sz);
+ dest64->op_own = owner_bit |
+ (dest64->op_own & ~MLX5_CQE_OWNER_MASK);
+ }
+ }
+
+ if (nfreed) {
+ cq->mcq.cons_index += nfreed;
+ /* Make sure update of buffer contents is done before
+ * updating consumer index.
+ */
+ wmb();
+ mlx5_cq_set_ci(&cq->mcq);
+ }
+}
+
+void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq)
+{
+ if (!cq)
+ return;
+
+ spin_lock_irq(&cq->lock);
+ __mlx5_ib_cq_clean(cq, qpn, srq);
+ spin_unlock_irq(&cq->lock);
+}
+
+int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
+{
+ struct mlx5_modify_cq_mbox_in *in;
+ struct mlx5_ib_dev *dev = to_mdev(cq->device);
+ struct mlx5_ib_cq *mcq = to_mcq(cq);
+ int err;
+ u32 fsel;
+
+ if (!(dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_CQ_MODER))
+ return -ENOSYS;
+
+ in = kzalloc(sizeof(*in), GFP_KERNEL);
+ if (!in)
+ return -ENOMEM;
+
+ in->cqn = cpu_to_be32(mcq->mcq.cqn);
+ fsel = (MLX5_CQ_MODIFY_PERIOD | MLX5_CQ_MODIFY_COUNT);
+ in->ctx.cq_period = cpu_to_be16(cq_period);
+ in->ctx.cq_max_count = cpu_to_be16(cq_count);
+ in->field_select = cpu_to_be32(fsel);
+ err = mlx5_core_modify_cq(dev->mdev, &mcq->mcq, in, sizeof(*in));
+ kfree(in);
+
+ if (err)
+ mlx5_ib_warn(dev, "modify cq 0x%x failed\n", mcq->mcq.cqn);
+
+ return err;
+}
+
+static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
+ int entries, struct ib_udata *udata, int *npas,
+ int *page_shift, int *cqe_size)
+{
+ struct mlx5_ib_resize_cq ucmd;
+ struct ib_umem *umem;
+ int err;
+ int npages;
+ struct ib_ucontext *context = cq->buf.umem->context;
+
+ err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd));
+ if (err)
+ return err;
+
+ if (ucmd.reserved0 || ucmd.reserved1)
+ return -EINVAL;
+
+ umem = ib_umem_get(context, ucmd.buf_addr, entries * ucmd.cqe_size,
+ IB_ACCESS_LOCAL_WRITE, 1);
+ if (IS_ERR(umem)) {
+ err = PTR_ERR(umem);
+ return err;
+ }
+
+ mlx5_ib_cont_pages(umem, ucmd.buf_addr, &npages, page_shift,
+ npas, NULL);
+
+ cq->resize_umem = umem;
+ *cqe_size = ucmd.cqe_size;
+
+ return 0;
+}
+
+static void un_resize_user(struct mlx5_ib_cq *cq)
+{
+ ib_umem_release(cq->resize_umem);
+}
+
+static int resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
+ int entries, int cqe_size)
+{
+ int err;
+
+ cq->resize_buf = kzalloc(sizeof(*cq->resize_buf), GFP_KERNEL);
+ if (!cq->resize_buf)
+ return -ENOMEM;
+
+ err = alloc_cq_buf(dev, cq->resize_buf, entries, cqe_size);
+ if (err)
+ goto ex;
+
+ init_cq_buf(cq, cq->resize_buf);
+
+ return 0;
+
+ex:
+ kfree(cq->resize_buf);
+ return err;
+}
+
+static void un_resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq)
+{
+ free_cq_buf(dev, cq->resize_buf);
+ cq->resize_buf = NULL;
+}
+
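+/* Copy the software-owned CQEs from the current CQ buffer into the resize
+ * buffer, stopping at the MLX5_CQE_RESIZE_CQ entry, while fixing up the
+ * ownership bit of each copied CQE.
+ */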
+static int copy_resize_cqes(struct mlx5_ib_cq *cq)
+{
+ struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device);
+ struct mlx5_cqe64 *scqe64;
+ struct mlx5_cqe64 *dcqe64;
+ void *start_cqe;
+ void *scqe;
+ void *dcqe;
+ int ssize;
+ int dsize;
+ int i;
+ u8 sw_own;
+
+ ssize = cq->buf.cqe_size;
+ dsize = cq->resize_buf->cqe_size;
+ if (ssize != dsize) {
+ mlx5_ib_warn(dev, "resize from different cqe size is not supported\n");
+ return -EINVAL;
+ }
+
+ i = cq->mcq.cons_index;
+ scqe = get_sw_cqe(cq, i);
+ scqe64 = ssize == 64 ? scqe : scqe + 64;
+ start_cqe = scqe;
+ if (!scqe) {
+ mlx5_ib_warn(dev, "expected cqe in sw ownership\n");
+ return -EINVAL;
+ }
+
+ while ((scqe64->op_own >> 4) != MLX5_CQE_RESIZE_CQ) {
+ dcqe = get_cqe_from_buf(cq->resize_buf,
+ (i + 1) & (cq->resize_buf->nent),
+ dsize);
+ dcqe64 = dsize == 64 ? dcqe : dcqe + 64;
+ sw_own = sw_ownership_bit(i + 1, cq->resize_buf->nent);
+ memcpy(dcqe, scqe, dsize);
+ dcqe64->op_own = (dcqe64->op_own & ~MLX5_CQE_OWNER_MASK) | sw_own;
+
+ ++i;
+ scqe = get_sw_cqe(cq, i);
+ scqe64 = ssize == 64 ? scqe : scqe + 64;
+ if (!scqe) {
+ mlx5_ib_warn(dev, "expected cqe in sw ownership\n");
+ return -EINVAL;
+ }
+
+ if (scqe == start_cqe) {
+ pr_warn("resize CQ failed to get resize CQE, CQN 0x%x\n",
+ cq->mcq.cqn);
+ return -ENOMEM;
+ }
+ }
+ ++cq->mcq.cons_index;
+ return 0;
+}
+
+int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
+{
+ struct mlx5_ib_dev *dev = to_mdev(ibcq->device);
+ struct mlx5_ib_cq *cq = to_mcq(ibcq);
+ struct mlx5_modify_cq_mbox_in *in;
+ int err;
+ int npas;
+ int page_shift;
+ int inlen;
+ int uninitialized_var(cqe_size);
+ unsigned long flags;
+
+ if (!(dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_RESIZE_CQ)) {
+ pr_info("Firmware does not support resize CQ\n");
+ return -ENOSYS;
+ }
+
+ if (entries < 1)
+ return -EINVAL;
+
+ entries = roundup_pow_of_two(entries + 1);
+ if (entries > dev->mdev->caps.gen.max_cqes + 1)
+ return -EINVAL;
+
+ if (entries == ibcq->cqe + 1)
+ return 0;
+
+ mutex_lock(&cq->resize_mutex);
+ if (udata) {
+ err = resize_user(dev, cq, entries, udata, &npas, &page_shift,
+ &cqe_size);
+ } else {
+ cqe_size = 64;
+ err = resize_kernel(dev, cq, entries, cqe_size);
+ if (!err) {
+ npas = cq->resize_buf->buf.npages;
+ page_shift = cq->resize_buf->buf.page_shift;
+ }
+ }
+
+ if (err)
+ goto ex;
+
+ inlen = sizeof(*in) + npas * sizeof(in->pas[0]);
+ in = mlx5_vzalloc(inlen);
+ if (!in) {
+ err = -ENOMEM;
+ goto ex_resize;
+ }
+
+ if (udata)
+ mlx5_ib_populate_pas(dev, cq->resize_umem, page_shift,
+ in->pas, 0);
+ else
+ mlx5_fill_page_array(&cq->resize_buf->buf, in->pas);
+
+ in->field_select = cpu_to_be32(MLX5_MODIFY_CQ_MASK_LOG_SIZE |
+ MLX5_MODIFY_CQ_MASK_PG_OFFSET |
+ MLX5_MODIFY_CQ_MASK_PG_SIZE);
+ in->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
+ in->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5;
+ in->ctx.page_offset = 0;
+ in->ctx.log_sz_usr_page = cpu_to_be32(ilog2(entries) << 24);
+ in->hdr.opmod = cpu_to_be16(MLX5_CQ_OPMOD_RESIZE);
+ in->cqn = cpu_to_be32(cq->mcq.cqn);
+
+ err = mlx5_core_modify_cq(dev->mdev, &cq->mcq, in, inlen);
+ if (err)
+ goto ex_alloc;
+
+ if (udata) {
+ cq->ibcq.cqe = entries - 1;
+ ib_umem_release(cq->buf.umem);
+ cq->buf.umem = cq->resize_umem;
+ cq->resize_umem = NULL;
+ } else {
+ struct mlx5_ib_cq_buf tbuf;
+ int resized = 0;
+
+ spin_lock_irqsave(&cq->lock, flags);
+ if (cq->resize_buf) {
+ err = copy_resize_cqes(cq);
+ if (!err) {
+ tbuf = cq->buf;
+ cq->buf = *cq->resize_buf;
+ kfree(cq->resize_buf);
+ cq->resize_buf = NULL;
+ resized = 1;
+ }
+ }
+ cq->ibcq.cqe = entries - 1;
+ spin_unlock_irqrestore(&cq->lock, flags);
+ if (resized)
+ free_cq_buf(dev, &tbuf);
+ }
+ mutex_unlock(&cq->resize_mutex);
+
+ kvfree(in);
+ return 0;
+
+ex_alloc:
+ kvfree(in);
+
+ex_resize:
+ if (udata)
+ un_resize_user(cq);
+ else
+ un_resize_kernel(dev, cq);
+ex:
+ mutex_unlock(&cq->resize_mutex);
+ return err;
+}
+
+int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq)
+{
+ struct mlx5_ib_cq *cq;
+
+ if (!ibcq)
+ return 128;
+
+ cq = to_mcq(ibcq);
+ return cq->cqe_size;
+}
diff --git a/drivers/infiniband/hw/mlx5/doorbell.c b/drivers/infiniband/hw/mlx5/doorbell.c
new file mode 100644
index 000000000..a0e4e6ddb
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/doorbell.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kref.h>
+#include <linux/slab.h>
+#include <rdma/ib_umem.h>
+
+#include "mlx5_ib.h"
+
+struct mlx5_ib_user_db_page {
+ struct list_head list;
+ struct ib_umem *umem;
+ unsigned long user_virt;
+ int refcnt;
+};
+
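+/* Doorbell records that fall in the same user page share one pinned umem:
+ * the page is looked up by its page-aligned virtual address and refcounted.
+ */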
+int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt,
+ struct mlx5_db *db)
+{
+ struct mlx5_ib_user_db_page *page;
+ int err = 0;
+
+ mutex_lock(&context->db_page_mutex);
+
+ list_for_each_entry(page, &context->db_page_list, list)
+ if (page->user_virt == (virt & PAGE_MASK))
+ goto found;
+
+ page = kmalloc(sizeof(*page), GFP_KERNEL);
+ if (!page) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ page->user_virt = (virt & PAGE_MASK);
+ page->refcnt = 0;
+ page->umem = ib_umem_get(&context->ibucontext, virt & PAGE_MASK,
+ PAGE_SIZE, 0, 0);
+ if (IS_ERR(page->umem)) {
+ err = PTR_ERR(page->umem);
+ kfree(page);
+ goto out;
+ }
+
+ list_add(&page->list, &context->db_page_list);
+
+found:
+ db->dma = sg_dma_address(page->umem->sg_head.sgl) + (virt & ~PAGE_MASK);
+ db->u.user_page = page;
+ ++page->refcnt;
+
+out:
+ mutex_unlock(&context->db_page_mutex);
+
+ return err;
+}
+
+void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db)
+{
+ mutex_lock(&context->db_page_mutex);
+
+ if (!--db->u.user_page->refcnt) {
+ list_del(&db->u.user_page->list);
+ ib_umem_release(db->u.user_page->umem);
+ kfree(db->u.user_page);
+ }
+
+ mutex_unlock(&context->db_page_mutex);
+}
diff --git a/drivers/infiniband/hw/mlx5/mad.c b/drivers/infiniband/hw/mlx5/mad.c
new file mode 100644
index 000000000..9cf9a37bb
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/mad.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mlx5/cmd.h>
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+#include "mlx5_ib.h"
+
+enum {
+ MLX5_IB_VENDOR_CLASS1 = 0x9,
+ MLX5_IB_VENDOR_CLASS2 = 0xa
+};
+
+int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey,
+ u8 port, struct ib_wc *in_wc, struct ib_grh *in_grh,
+ void *in_mad, void *response_mad)
+{
+ u8 op_modifier = 0;
+
+ /* Key check traps can't be generated unless we have in_wc to
+ * tell us where to send the trap.
+ */
+ if (ignore_mkey || !in_wc)
+ op_modifier |= 0x1;
+ if (ignore_bkey || !in_wc)
+ op_modifier |= 0x2;
+
+ return mlx5_core_mad_ifc(dev->mdev, in_mad, response_mad, op_modifier, port);
+}
+
+int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+ struct ib_wc *in_wc, struct ib_grh *in_grh,
+ struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+ u16 slid;
+ int err;
+
+ slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE);
+
+ if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP && slid == 0)
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+
+ if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET &&
+ in_mad->mad_hdr.method != IB_MGMT_METHOD_SET &&
+ in_mad->mad_hdr.method != IB_MGMT_METHOD_TRAP_REPRESS)
+ return IB_MAD_RESULT_SUCCESS;
+
+ /* Don't process SMInfo queries -- the SMA can't handle them.
+ */
+ if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO)
+ return IB_MAD_RESULT_SUCCESS;
+ } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT ||
+ in_mad->mad_hdr.mgmt_class == MLX5_IB_VENDOR_CLASS1 ||
+ in_mad->mad_hdr.mgmt_class == MLX5_IB_VENDOR_CLASS2 ||
+ in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_CONG_MGMT) {
+ if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET &&
+ in_mad->mad_hdr.method != IB_MGMT_METHOD_SET)
+ return IB_MAD_RESULT_SUCCESS;
+ } else {
+ return IB_MAD_RESULT_SUCCESS;
+ }
+
+ err = mlx5_MAD_IFC(to_mdev(ibdev),
+ mad_flags & IB_MAD_IGNORE_MKEY,
+ mad_flags & IB_MAD_IGNORE_BKEY,
+ port_num, in_wc, in_grh, in_mad, out_mad);
+ if (err)
+ return IB_MAD_RESULT_FAILURE;
+
+ /* set return bit in status of directed route responses */
+ if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ out_mad->mad_hdr.status |= cpu_to_be16(1 << 15);
+
+ if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS)
+ /* no response for trap repress */
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+ u16 packet_error;
+
+ in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL);
+ out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = MLX5_ATTR_EXTENDED_PORT_INFO;
+ in_mad->attr_mod = cpu_to_be32(port);
+
+ err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad);
+
+ packet_error = be16_to_cpu(out_mad->status);
+
+ dev->mdev->caps.gen.ext_port_cap[port - 1] = (!err && !packet_error) ?
+ MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO : 0;
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c
new file mode 100644
index 000000000..57c9809e8
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/main.c
@@ -0,0 +1,1397 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <asm-generic/kmap_types.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+#include <linux/io-mapping.h>
+#include <linux/sched.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_umem.h>
+#include "user.h"
+#include "mlx5_ib.h"
+
+#define DRIVER_NAME "mlx5_ib"
+#define DRIVER_VERSION "2.2-1"
+#define DRIVER_RELDATE "Feb 2014"
+
+MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
+MODULE_DESCRIPTION("Mellanox Connect-IB HCA IB driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRIVER_VERSION);
+
+static int deprecated_prof_sel = 2;
+module_param_named(prof_sel, deprecated_prof_sel, int, 0444);
+MODULE_PARM_DESC(prof_sel, "profile selector. Deprecated here. Moved to module mlx5_core");
+
+static char mlx5_version[] =
+ DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v"
+ DRIVER_VERSION " (" DRIVER_RELDATE ")\n";
+
+static int mlx5_ib_query_device(struct ib_device *ibdev,
+ struct ib_device_attr *props)
+{
+ struct mlx5_ib_dev *dev = to_mdev(ibdev);
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ struct mlx5_general_caps *gen;
+ int err = -ENOMEM;
+ int max_rq_sg;
+ int max_sq_sg;
+ u64 flags;
+
+ gen = &dev->mdev->caps.gen;
+ in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL);
+ out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
+
+ err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, 1, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ memset(props, 0, sizeof(*props));
+
+ props->fw_ver = ((u64)fw_rev_maj(dev->mdev) << 32) |
+ (fw_rev_min(dev->mdev) << 16) |
+ fw_rev_sub(dev->mdev);
+ props->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT |
+ IB_DEVICE_PORT_ACTIVE_EVENT |
+ IB_DEVICE_SYS_IMAGE_GUID |
+ IB_DEVICE_RC_RNR_NAK_GEN;
+ flags = gen->flags;
+ if (flags & MLX5_DEV_CAP_FLAG_BAD_PKEY_CNTR)
+ props->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
+ if (flags & MLX5_DEV_CAP_FLAG_BAD_QKEY_CNTR)
+ props->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
+ if (flags & MLX5_DEV_CAP_FLAG_APM)
+ props->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
+ props->device_cap_flags |= IB_DEVICE_LOCAL_DMA_LKEY;
+ if (flags & MLX5_DEV_CAP_FLAG_XRC)
+ props->device_cap_flags |= IB_DEVICE_XRC;
+ props->device_cap_flags |= IB_DEVICE_MEM_MGT_EXTENSIONS;
+ if (flags & MLX5_DEV_CAP_FLAG_SIG_HAND_OVER) {
+ props->device_cap_flags |= IB_DEVICE_SIGNATURE_HANDOVER;
+ /* At this stage no support for signature handover */
+ props->sig_prot_cap = IB_PROT_T10DIF_TYPE_1 |
+ IB_PROT_T10DIF_TYPE_2 |
+ IB_PROT_T10DIF_TYPE_3;
+ props->sig_guard_cap = IB_GUARD_T10DIF_CRC |
+ IB_GUARD_T10DIF_CSUM;
+ }
+ if (flags & MLX5_DEV_CAP_FLAG_BLOCK_MCAST)
+ props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
+
+ props->vendor_id = be32_to_cpup((__be32 *)(out_mad->data + 36)) &
+ 0xffffff;
+ props->vendor_part_id = be16_to_cpup((__be16 *)(out_mad->data + 30));
+ props->hw_ver = be32_to_cpup((__be32 *)(out_mad->data + 32));
+ memcpy(&props->sys_image_guid, out_mad->data + 4, 8);
+
+ props->max_mr_size = ~0ull;
+ props->page_size_cap = gen->min_page_sz;
+ props->max_qp = 1 << gen->log_max_qp;
+ props->max_qp_wr = gen->max_wqes;
+ max_rq_sg = gen->max_rq_desc_sz / sizeof(struct mlx5_wqe_data_seg);
+ max_sq_sg = (gen->max_sq_desc_sz - sizeof(struct mlx5_wqe_ctrl_seg)) /
+ sizeof(struct mlx5_wqe_data_seg);
+ props->max_sge = min(max_rq_sg, max_sq_sg);
+ props->max_cq = 1 << gen->log_max_cq;
+ props->max_cqe = gen->max_cqes - 1;
+ props->max_mr = 1 << gen->log_max_mkey;
+ props->max_pd = 1 << gen->log_max_pd;
+ props->max_qp_rd_atom = 1 << gen->log_max_ra_req_qp;
+ props->max_qp_init_rd_atom = 1 << gen->log_max_ra_res_qp;
+ props->max_srq = 1 << gen->log_max_srq;
+ props->max_srq_wr = gen->max_srq_wqes - 1;
+ props->local_ca_ack_delay = gen->local_ca_ack_delay;
+ props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp;
+ props->max_srq_sge = max_rq_sg - 1;
+ props->max_fast_reg_page_list_len = (unsigned int)-1;
+ props->local_ca_ack_delay = gen->local_ca_ack_delay;
+ props->atomic_cap = IB_ATOMIC_NONE;
+ props->masked_atomic_cap = IB_ATOMIC_NONE;
+ props->max_pkeys = be16_to_cpup((__be16 *)(out_mad->data + 28));
+ props->max_mcast_grp = 1 << gen->log_max_mcg;
+ props->max_mcast_qp_attach = gen->max_qp_mcg;
+ props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
+ props->max_mcast_grp;
+ props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+ if (dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG)
+ props->device_cap_flags |= IB_DEVICE_ON_DEMAND_PAGING;
+ props->odp_caps = dev->odp_caps;
+#endif
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+
+ return err;
+}
+
+int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
+ struct ib_port_attr *props)
+{
+ struct mlx5_ib_dev *dev = to_mdev(ibdev);
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ struct mlx5_general_caps *gen;
+ int ext_active_speed;
+ int err = -ENOMEM;
+
+ gen = &dev->mdev->caps.gen;
+ if (port < 1 || port > gen->num_ports) {
+ mlx5_ib_warn(dev, "invalid port number %d\n", port);
+ return -EINVAL;
+ }
+
+ in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL);
+ out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ memset(props, 0, sizeof(*props));
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_PORT_INFO;
+ in_mad->attr_mod = cpu_to_be32(port);
+
+ err = mlx5_MAD_IFC(dev, 1, 1, port, NULL, NULL, in_mad, out_mad);
+ if (err) {
+ mlx5_ib_warn(dev, "err %d\n", err);
+ goto out;
+ }
+
+
+ props->lid = be16_to_cpup((__be16 *)(out_mad->data + 16));
+ props->lmc = out_mad->data[34] & 0x7;
+ props->sm_lid = be16_to_cpup((__be16 *)(out_mad->data + 18));
+ props->sm_sl = out_mad->data[36] & 0xf;
+ props->state = out_mad->data[32] & 0xf;
+ props->phys_state = out_mad->data[33] >> 4;
+ props->port_cap_flags = be32_to_cpup((__be32 *)(out_mad->data + 20));
+ props->gid_tbl_len = out_mad->data[50];
+ props->max_msg_sz = 1 << gen->log_max_msg;
+ props->pkey_tbl_len = gen->port[port - 1].pkey_table_len;
+ props->bad_pkey_cntr = be16_to_cpup((__be16 *)(out_mad->data + 46));
+ props->qkey_viol_cntr = be16_to_cpup((__be16 *)(out_mad->data + 48));
+ props->active_width = out_mad->data[31] & 0xf;
+ props->active_speed = out_mad->data[35] >> 4;
+ props->max_mtu = out_mad->data[41] & 0xf;
+ props->active_mtu = out_mad->data[36] >> 4;
+ props->subnet_timeout = out_mad->data[51] & 0x1f;
+ props->max_vl_num = out_mad->data[37] >> 4;
+ props->init_type_reply = out_mad->data[41] >> 4;
+
+ /* Check if extended speeds (EDR/FDR/...) are supported */
+ if (props->port_cap_flags & IB_PORT_EXTENDED_SPEEDS_SUP) {
+ ext_active_speed = out_mad->data[62] >> 4;
+
+ switch (ext_active_speed) {
+ case 1:
+ props->active_speed = 16; /* FDR */
+ break;
+ case 2:
+ props->active_speed = 32; /* EDR */
+ break;
+ }
+ }
+
+ /* If reported active speed is QDR, check if is FDR-10 */
+ if (props->active_speed == 4) {
+ if (gen->ext_port_cap[port - 1] &
+ MLX_EXT_PORT_CAP_FLAG_EXTENDED_PORT_INFO) {
+ init_query_mad(in_mad);
+ in_mad->attr_id = MLX5_ATTR_EXTENDED_PORT_INFO;
+ in_mad->attr_mod = cpu_to_be32(port);
+
+ err = mlx5_MAD_IFC(dev, 1, 1, port,
+ NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ /* Checking LinkSpeedActive for FDR-10 */
+ if (out_mad->data[15] & 0x1)
+ props->active_speed = 8;
+ }
+ }
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+
+ return err;
+}
+
+static int mlx5_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+ union ib_gid *gid)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+
+ in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL);
+ out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_PORT_INFO;
+ in_mad->attr_mod = cpu_to_be32(port);
+
+ err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ memcpy(gid->raw, out_mad->data + 8, 8);
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_GUID_INFO;
+ in_mad->attr_mod = cpu_to_be32(index / 8);
+
+ err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8);
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static int mlx5_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+ u16 *pkey)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+
+ in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL);
+ out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE;
+ in_mad->attr_mod = cpu_to_be32(index / 32);
+
+ err = mlx5_MAD_IFC(to_mdev(ibdev), 1, 1, port, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ *pkey = be16_to_cpu(((__be16 *)out_mad->data)[index % 32]);
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+struct mlx5_reg_node_desc {
+ u8 desc[64];
+};
+
+static int mlx5_ib_modify_device(struct ib_device *ibdev, int mask,
+ struct ib_device_modify *props)
+{
+ struct mlx5_ib_dev *dev = to_mdev(ibdev);
+ struct mlx5_reg_node_desc in;
+ struct mlx5_reg_node_desc out;
+ int err;
+
+ if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
+ return -EOPNOTSUPP;
+
+ if (!(mask & IB_DEVICE_MODIFY_NODE_DESC))
+ return 0;
+
+ /*
+	 * If possible, pass the node description to firmware so it can
+	 * generate an SM trap 144. If the command fails, just ignore the error.
+ */
+ memcpy(&in, props->node_desc, 64);
+ err = mlx5_core_access_reg(dev->mdev, &in, sizeof(in), &out,
+ sizeof(out), MLX5_REG_NODE_DESC, 0, 1);
+ if (err)
+ return err;
+
+ memcpy(ibdev->node_desc, props->node_desc, 64);
+
+ return err;
+}
+
+static int mlx5_ib_modify_port(struct ib_device *ibdev, u8 port, int mask,
+ struct ib_port_modify *props)
+{
+ struct mlx5_ib_dev *dev = to_mdev(ibdev);
+ struct ib_port_attr attr;
+ u32 tmp;
+ int err;
+
+ mutex_lock(&dev->cap_mask_mutex);
+
+ err = mlx5_ib_query_port(ibdev, port, &attr);
+ if (err)
+ goto out;
+
+ tmp = (attr.port_cap_flags | props->set_port_cap_mask) &
+ ~props->clr_port_cap_mask;
+
+ err = mlx5_set_port_caps(dev->mdev, port, tmp);
+
+out:
+ mutex_unlock(&dev->cap_mask_mutex);
+ return err;
+}
+
+static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev,
+ struct ib_udata *udata)
+{
+ struct mlx5_ib_dev *dev = to_mdev(ibdev);
+ struct mlx5_ib_alloc_ucontext_req_v2 req;
+ struct mlx5_ib_alloc_ucontext_resp resp;
+ struct mlx5_ib_ucontext *context;
+ struct mlx5_general_caps *gen;
+ struct mlx5_uuar_info *uuari;
+ struct mlx5_uar *uars;
+ int gross_uuars;
+ int num_uars;
+ int ver;
+ int uuarn;
+ int err;
+ int i;
+ size_t reqlen;
+
+ gen = &dev->mdev->caps.gen;
+ if (!dev->ib_active)
+ return ERR_PTR(-EAGAIN);
+
+ memset(&req, 0, sizeof(req));
+ reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr);
+ if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req))
+ ver = 0;
+ else if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req_v2))
+ ver = 2;
+ else
+ return ERR_PTR(-EINVAL);
+
+ err = ib_copy_from_udata(&req, udata, reqlen);
+ if (err)
+ return ERR_PTR(err);
+
+ if (req.flags || req.reserved)
+ return ERR_PTR(-EINVAL);
+
+ if (req.total_num_uuars > MLX5_MAX_UUARS)
+ return ERR_PTR(-ENOMEM);
+
+ if (req.total_num_uuars == 0)
+ return ERR_PTR(-EINVAL);
+
+ req.total_num_uuars = ALIGN(req.total_num_uuars,
+ MLX5_NON_FP_BF_REGS_PER_PAGE);
+ if (req.num_low_latency_uuars > req.total_num_uuars - 1)
+ return ERR_PTR(-EINVAL);
+
+ num_uars = req.total_num_uuars / MLX5_NON_FP_BF_REGS_PER_PAGE;
+ gross_uuars = num_uars * MLX5_BF_REGS_PER_PAGE;
+ resp.qp_tab_size = 1 << gen->log_max_qp;
+ resp.bf_reg_size = gen->bf_reg_size;
+ resp.cache_line_size = L1_CACHE_BYTES;
+ resp.max_sq_desc_sz = gen->max_sq_desc_sz;
+ resp.max_rq_desc_sz = gen->max_rq_desc_sz;
+ resp.max_send_wqebb = gen->max_wqes;
+ resp.max_recv_wr = gen->max_wqes;
+ resp.max_srq_recv_wr = gen->max_srq_wqes;
+
+ context = kzalloc(sizeof(*context), GFP_KERNEL);
+ if (!context)
+ return ERR_PTR(-ENOMEM);
+
+ uuari = &context->uuari;
+ mutex_init(&uuari->lock);
+ uars = kcalloc(num_uars, sizeof(*uars), GFP_KERNEL);
+ if (!uars) {
+ err = -ENOMEM;
+ goto out_ctx;
+ }
+
+ uuari->bitmap = kcalloc(BITS_TO_LONGS(gross_uuars),
+ sizeof(*uuari->bitmap),
+ GFP_KERNEL);
+ if (!uuari->bitmap) {
+ err = -ENOMEM;
+ goto out_uar_ctx;
+ }
+	/*
+	 * Mark the fast path uuars as used so the regular uuar allocator
+	 * never hands them out.
+	 */
+ for (i = 0; i < gross_uuars; i++) {
+ uuarn = i & 3;
+ if (uuarn == 2 || uuarn == 3)
+ set_bit(i, uuari->bitmap);
+ }
+
+ uuari->count = kcalloc(gross_uuars, sizeof(*uuari->count), GFP_KERNEL);
+ if (!uuari->count) {
+ err = -ENOMEM;
+ goto out_bitmap;
+ }
+
+ for (i = 0; i < num_uars; i++) {
+ err = mlx5_cmd_alloc_uar(dev->mdev, &uars[i].index);
+ if (err)
+ goto out_count;
+ }
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+ context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range;
+#endif
+
+ INIT_LIST_HEAD(&context->db_page_list);
+ mutex_init(&context->db_page_mutex);
+
+ resp.tot_uuars = req.total_num_uuars;
+ resp.num_ports = gen->num_ports;
+ err = ib_copy_to_udata(udata, &resp,
+ sizeof(resp) - sizeof(resp.reserved));
+ if (err)
+ goto out_uars;
+
+ uuari->ver = ver;
+ uuari->num_low_latency_uuars = req.num_low_latency_uuars;
+ uuari->uars = uars;
+ uuari->num_uars = num_uars;
+ return &context->ibucontext;
+
+out_uars:
+ for (i--; i >= 0; i--)
+ mlx5_cmd_free_uar(dev->mdev, uars[i].index);
+out_count:
+ kfree(uuari->count);
+
+out_bitmap:
+ kfree(uuari->bitmap);
+
+out_uar_ctx:
+ kfree(uars);
+
+out_ctx:
+ kfree(context);
+ return ERR_PTR(err);
+}
+
+static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
+{
+ struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
+ struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
+ struct mlx5_uuar_info *uuari = &context->uuari;
+ int i;
+
+ for (i = 0; i < uuari->num_uars; i++) {
+ if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index))
+ mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index);
+ }
+
+ kfree(uuari->count);
+ kfree(uuari->bitmap);
+ kfree(uuari->uars);
+ kfree(context);
+
+ return 0;
+}
+
+static phys_addr_t uar_index2pfn(struct mlx5_ib_dev *dev, int index)
+{
+ return (pci_resource_start(dev->mdev->pdev, 0) >> PAGE_SHIFT) + index;
+}
+
+static int get_command(unsigned long offset)
+{
+ return (offset >> MLX5_IB_MMAP_CMD_SHIFT) & MLX5_IB_MMAP_CMD_MASK;
+}
+
+static int get_arg(unsigned long offset)
+{
+ return offset & ((1 << MLX5_IB_MMAP_CMD_SHIFT) - 1);
+}
+
+static int get_index(unsigned long offset)
+{
+ return get_arg(offset);
+}
+
+static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vma)
+{
+ struct mlx5_ib_ucontext *context = to_mucontext(ibcontext);
+ struct mlx5_ib_dev *dev = to_mdev(ibcontext->device);
+ struct mlx5_uuar_info *uuari = &context->uuari;
+ unsigned long command;
+ unsigned long idx;
+ phys_addr_t pfn;
+
+ command = get_command(vma->vm_pgoff);
+ switch (command) {
+ case MLX5_IB_MMAP_REGULAR_PAGE:
+ if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+ return -EINVAL;
+
+ idx = get_index(vma->vm_pgoff);
+ if (idx >= uuari->num_uars)
+ return -EINVAL;
+
+ pfn = uar_index2pfn(dev, uuari->uars[idx].index);
+ mlx5_ib_dbg(dev, "uar idx 0x%lx, pfn 0x%llx\n", idx,
+ (unsigned long long)pfn);
+
+ vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+ if (io_remap_pfn_range(vma, vma->vm_start, pfn,
+ PAGE_SIZE, vma->vm_page_prot))
+ return -EAGAIN;
+
+ mlx5_ib_dbg(dev, "mapped WC at 0x%lx, PA 0x%llx\n",
+ vma->vm_start,
+ (unsigned long long)pfn << PAGE_SHIFT);
+ break;
+
+ case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES:
+ return -ENOSYS;
+
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
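+/* Create a physical-address (PA) mkey with local read rights for a kernel
+ * PD; the caller stores the resulting key as the PD's pa_lkey.
+ */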
+static int alloc_pa_mkey(struct mlx5_ib_dev *dev, u32 *key, u32 pdn)
+{
+ struct mlx5_create_mkey_mbox_in *in;
+ struct mlx5_mkey_seg *seg;
+ struct mlx5_core_mr mr;
+ int err;
+
+ in = kzalloc(sizeof(*in), GFP_KERNEL);
+ if (!in)
+ return -ENOMEM;
+
+ seg = &in->seg;
+ seg->flags = MLX5_PERM_LOCAL_READ | MLX5_ACCESS_MODE_PA;
+ seg->flags_pd = cpu_to_be32(pdn | MLX5_MKEY_LEN64);
+ seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
+ seg->start_addr = 0;
+
+ err = mlx5_core_create_mkey(dev->mdev, &mr, in, sizeof(*in),
+ NULL, NULL, NULL);
+ if (err) {
+ mlx5_ib_warn(dev, "failed to create mkey, %d\n", err);
+ goto err_in;
+ }
+
+ kfree(in);
+ *key = mr.key;
+
+ return 0;
+
+err_in:
+ kfree(in);
+
+ return err;
+}
+
+static void free_pa_mkey(struct mlx5_ib_dev *dev, u32 key)
+{
+ struct mlx5_core_mr mr;
+ int err;
+
+ memset(&mr, 0, sizeof(mr));
+ mr.key = key;
+ err = mlx5_core_destroy_mkey(dev->mdev, &mr);
+ if (err)
+ mlx5_ib_warn(dev, "failed to destroy mkey 0x%x\n", key);
+}
+
+static struct ib_pd *mlx5_ib_alloc_pd(struct ib_device *ibdev,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct mlx5_ib_alloc_pd_resp resp;
+ struct mlx5_ib_pd *pd;
+ int err;
+
+ pd = kmalloc(sizeof(*pd), GFP_KERNEL);
+ if (!pd)
+ return ERR_PTR(-ENOMEM);
+
+ err = mlx5_core_alloc_pd(to_mdev(ibdev)->mdev, &pd->pdn);
+ if (err) {
+ kfree(pd);
+ return ERR_PTR(err);
+ }
+
+ if (context) {
+ resp.pdn = pd->pdn;
+ if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
+ mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn);
+ kfree(pd);
+ return ERR_PTR(-EFAULT);
+ }
+ } else {
+ err = alloc_pa_mkey(to_mdev(ibdev), &pd->pa_lkey, pd->pdn);
+ if (err) {
+ mlx5_core_dealloc_pd(to_mdev(ibdev)->mdev, pd->pdn);
+ kfree(pd);
+ return ERR_PTR(err);
+ }
+ }
+
+ return &pd->ibpd;
+}
+
+static int mlx5_ib_dealloc_pd(struct ib_pd *pd)
+{
+ struct mlx5_ib_dev *mdev = to_mdev(pd->device);
+ struct mlx5_ib_pd *mpd = to_mpd(pd);
+
+ if (!pd->uobject)
+ free_pa_mkey(mdev, mpd->pa_lkey);
+
+ mlx5_core_dealloc_pd(mdev->mdev, mpd->pdn);
+ kfree(mpd);
+
+ return 0;
+}
+
+static int mlx5_ib_mcg_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+ int err;
+
+ err = mlx5_core_attach_mcg(dev->mdev, gid, ibqp->qp_num);
+ if (err)
+ mlx5_ib_warn(dev, "failed attaching QPN 0x%x, MGID %pI6\n",
+ ibqp->qp_num, gid->raw);
+
+ return err;
+}
+
+static int mlx5_ib_mcg_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+ int err;
+
+ err = mlx5_core_detach_mcg(dev->mdev, gid, ibqp->qp_num);
+ if (err)
+ mlx5_ib_warn(dev, "failed detaching QPN 0x%x, MGID %pI6\n",
+ ibqp->qp_num, gid->raw);
+
+ return err;
+}
+
+static int init_node_data(struct mlx5_ib_dev *dev)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+
+ in_mad = kzalloc(sizeof(*in_mad), GFP_KERNEL);
+ out_mad = kmalloc(sizeof(*out_mad), GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_NODE_DESC;
+
+ err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ memcpy(dev->ib_dev.node_desc, out_mad->data, 64);
+
+ in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
+
+ err = mlx5_MAD_IFC(dev, 1, 1, 1, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ dev->mdev->rev_id = be32_to_cpup((__be32 *)(out_mad->data + 32));
+ memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8);
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static ssize_t show_fw_pages(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct mlx5_ib_dev *dev =
+ container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+
+ return sprintf(buf, "%d\n", dev->mdev->priv.fw_pages);
+}
+
+static ssize_t show_reg_pages(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct mlx5_ib_dev *dev =
+ container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+
+ return sprintf(buf, "%d\n", atomic_read(&dev->mdev->priv.reg_pages));
+}
+
+static ssize_t show_hca(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct mlx5_ib_dev *dev =
+ container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+ return sprintf(buf, "MT%d\n", dev->mdev->pdev->device);
+}
+
+static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct mlx5_ib_dev *dev =
+ container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+ return sprintf(buf, "%d.%d.%d\n", fw_rev_maj(dev->mdev),
+ fw_rev_min(dev->mdev), fw_rev_sub(dev->mdev));
+}
+
+static ssize_t show_rev(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct mlx5_ib_dev *dev =
+ container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+ return sprintf(buf, "%x\n", dev->mdev->rev_id);
+}
+
+static ssize_t show_board(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct mlx5_ib_dev *dev =
+ container_of(device, struct mlx5_ib_dev, ib_dev.dev);
+ return sprintf(buf, "%.*s\n", MLX5_BOARD_ID_LEN,
+ dev->mdev->board_id);
+}
+
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
+static DEVICE_ATTR(fw_pages, S_IRUGO, show_fw_pages, NULL);
+static DEVICE_ATTR(reg_pages, S_IRUGO, show_reg_pages, NULL);
+
+static struct device_attribute *mlx5_class_attributes[] = {
+ &dev_attr_hw_rev,
+ &dev_attr_fw_ver,
+ &dev_attr_hca_type,
+ &dev_attr_board_id,
+ &dev_attr_fw_pages,
+ &dev_attr_reg_pages,
+};
+
+static void mlx5_ib_event(struct mlx5_core_dev *dev, void *context,
+ enum mlx5_dev_event event, unsigned long param)
+{
+ struct mlx5_ib_dev *ibdev = (struct mlx5_ib_dev *)context;
+ struct ib_event ibev;
+
+ u8 port = 0;
+
+ switch (event) {
+ case MLX5_DEV_EVENT_SYS_ERROR:
+ ibdev->ib_active = false;
+ ibev.event = IB_EVENT_DEVICE_FATAL;
+ break;
+
+ case MLX5_DEV_EVENT_PORT_UP:
+ ibev.event = IB_EVENT_PORT_ACTIVE;
+ port = (u8)param;
+ break;
+
+ case MLX5_DEV_EVENT_PORT_DOWN:
+ ibev.event = IB_EVENT_PORT_ERR;
+ port = (u8)param;
+ break;
+
+ case MLX5_DEV_EVENT_PORT_INITIALIZED:
+ /* not used by ULPs */
+ return;
+
+ case MLX5_DEV_EVENT_LID_CHANGE:
+ ibev.event = IB_EVENT_LID_CHANGE;
+ port = (u8)param;
+ break;
+
+ case MLX5_DEV_EVENT_PKEY_CHANGE:
+ ibev.event = IB_EVENT_PKEY_CHANGE;
+ port = (u8)param;
+ break;
+
+ case MLX5_DEV_EVENT_GUID_CHANGE:
+ ibev.event = IB_EVENT_GID_CHANGE;
+ port = (u8)param;
+ break;
+
+ case MLX5_DEV_EVENT_CLIENT_REREG:
+ ibev.event = IB_EVENT_CLIENT_REREGISTER;
+ port = (u8)param;
+ break;
+ }
+
+ ibev.device = &ibdev->ib_dev;
+ ibev.element.port_num = port;
+
+ if (port < 1 || port > ibdev->num_ports) {
+ mlx5_ib_warn(ibdev, "warning: event on port %d\n", port);
+ return;
+ }
+
+ if (ibdev->ib_active)
+ ib_dispatch_event(&ibev);
+}
+
+static void get_ext_port_caps(struct mlx5_ib_dev *dev)
+{
+ struct mlx5_general_caps *gen;
+ int port;
+
+ gen = &dev->mdev->caps.gen;
+ for (port = 1; port <= gen->num_ports; port++)
+ mlx5_query_ext_port_caps(dev, port);
+}
+
+static int get_port_caps(struct mlx5_ib_dev *dev)
+{
+ struct ib_device_attr *dprops = NULL;
+ struct ib_port_attr *pprops = NULL;
+ struct mlx5_general_caps *gen;
+ int err = -ENOMEM;
+ int port;
+
+ gen = &dev->mdev->caps.gen;
+ pprops = kmalloc(sizeof(*pprops), GFP_KERNEL);
+ if (!pprops)
+ goto out;
+
+ dprops = kmalloc(sizeof(*dprops), GFP_KERNEL);
+ if (!dprops)
+ goto out;
+
+ err = mlx5_ib_query_device(&dev->ib_dev, dprops);
+ if (err) {
+ mlx5_ib_warn(dev, "query_device failed %d\n", err);
+ goto out;
+ }
+
+ for (port = 1; port <= gen->num_ports; port++) {
+ err = mlx5_ib_query_port(&dev->ib_dev, port, pprops);
+ if (err) {
+ mlx5_ib_warn(dev, "query_port %d failed %d\n", port, err);
+ break;
+ }
+ gen->port[port - 1].pkey_table_len = dprops->max_pkeys;
+ gen->port[port - 1].gid_table_len = pprops->gid_tbl_len;
+ mlx5_ib_dbg(dev, "pkey_table_len %d, gid_table_len %d\n",
+ dprops->max_pkeys, pprops->gid_tbl_len);
+ }
+
+out:
+ kfree(pprops);
+ kfree(dprops);
+
+ return err;
+}
+
+static void destroy_umrc_res(struct mlx5_ib_dev *dev)
+{
+ int err;
+
+ err = mlx5_mr_cache_cleanup(dev);
+ if (err)
+ mlx5_ib_warn(dev, "mr cache cleanup failed\n");
+
+ mlx5_ib_destroy_qp(dev->umrc.qp);
+ ib_destroy_cq(dev->umrc.cq);
+ ib_dereg_mr(dev->umrc.mr);
+ ib_dealloc_pd(dev->umrc.pd);
+}
+
+enum {
+ MAX_UMR_WR = 128,
+};
+
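+/* Set up the resources used to post UMR work requests: a PD, a DMA MR, a CQ
+ * and a dedicated REG_UMR QP brought up to RTS, plus the MR cache.
+ */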
+static int create_umr_res(struct mlx5_ib_dev *dev)
+{
+ struct ib_qp_init_attr *init_attr = NULL;
+ struct ib_qp_attr *attr = NULL;
+ struct ib_pd *pd;
+ struct ib_cq *cq;
+ struct ib_qp *qp;
+ struct ib_mr *mr;
+ int ret;
+
+ attr = kzalloc(sizeof(*attr), GFP_KERNEL);
+ init_attr = kzalloc(sizeof(*init_attr), GFP_KERNEL);
+ if (!attr || !init_attr) {
+ ret = -ENOMEM;
+ goto error_0;
+ }
+
+ pd = ib_alloc_pd(&dev->ib_dev);
+ if (IS_ERR(pd)) {
+ mlx5_ib_dbg(dev, "Couldn't create PD for sync UMR QP\n");
+ ret = PTR_ERR(pd);
+ goto error_0;
+ }
+
+ mr = ib_get_dma_mr(pd, IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(mr)) {
+ mlx5_ib_dbg(dev, "Couldn't create DMA MR for sync UMR QP\n");
+ ret = PTR_ERR(mr);
+ goto error_1;
+ }
+
+ cq = ib_create_cq(&dev->ib_dev, mlx5_umr_cq_handler, NULL, NULL, 128,
+ 0);
+ if (IS_ERR(cq)) {
+ mlx5_ib_dbg(dev, "Couldn't create CQ for sync UMR QP\n");
+ ret = PTR_ERR(cq);
+ goto error_2;
+ }
+ ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+
+ init_attr->send_cq = cq;
+ init_attr->recv_cq = cq;
+ init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
+ init_attr->cap.max_send_wr = MAX_UMR_WR;
+ init_attr->cap.max_send_sge = 1;
+ init_attr->qp_type = MLX5_IB_QPT_REG_UMR;
+ init_attr->port_num = 1;
+ qp = mlx5_ib_create_qp(pd, init_attr, NULL);
+ if (IS_ERR(qp)) {
+ mlx5_ib_dbg(dev, "Couldn't create sync UMR QP\n");
+ ret = PTR_ERR(qp);
+ goto error_3;
+ }
+ qp->device = &dev->ib_dev;
+ qp->real_qp = qp;
+ qp->uobject = NULL;
+ qp->qp_type = MLX5_IB_QPT_REG_UMR;
+
+ attr->qp_state = IB_QPS_INIT;
+ attr->port_num = 1;
+ ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE | IB_QP_PKEY_INDEX |
+ IB_QP_PORT, NULL);
+ if (ret) {
+ mlx5_ib_dbg(dev, "Couldn't modify UMR QP\n");
+ goto error_4;
+ }
+
+ memset(attr, 0, sizeof(*attr));
+ attr->qp_state = IB_QPS_RTR;
+ attr->path_mtu = IB_MTU_256;
+
+ ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
+ if (ret) {
+ mlx5_ib_dbg(dev, "Couldn't modify umr QP to rtr\n");
+ goto error_4;
+ }
+
+ memset(attr, 0, sizeof(*attr));
+ attr->qp_state = IB_QPS_RTS;
+ ret = mlx5_ib_modify_qp(qp, attr, IB_QP_STATE, NULL);
+ if (ret) {
+ mlx5_ib_dbg(dev, "Couldn't modify umr QP to rts\n");
+ goto error_4;
+ }
+
+ dev->umrc.qp = qp;
+ dev->umrc.cq = cq;
+ dev->umrc.mr = mr;
+ dev->umrc.pd = pd;
+
+ sema_init(&dev->umrc.sem, MAX_UMR_WR);
+ ret = mlx5_mr_cache_init(dev);
+ if (ret) {
+ mlx5_ib_warn(dev, "mr cache init failed %d\n", ret);
+ goto error_4;
+ }
+
+ kfree(attr);
+ kfree(init_attr);
+
+ return 0;
+
+error_4:
+ mlx5_ib_destroy_qp(qp);
+
+error_3:
+ ib_destroy_cq(cq);
+
+error_2:
+ ib_dereg_mr(mr);
+
+error_1:
+ ib_dealloc_pd(pd);
+
+error_0:
+ kfree(attr);
+ kfree(init_attr);
+ return ret;
+}
+
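+/* Allocate the device-global objects kept in mlx5_ib_resources: a PD, a
+ * CQ, two XRC domains and an XRC SRQ built on top of them.  The objects
+ * are created through the driver's own verbs entry points, so the fields
+ * normally filled in by the IB core are initialized here by hand.
+ */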
+static int create_dev_resources(struct mlx5_ib_resources *devr)
+{
+ struct ib_srq_init_attr attr;
+ struct mlx5_ib_dev *dev;
+ int ret = 0;
+
+ dev = container_of(devr, struct mlx5_ib_dev, devr);
+
+ devr->p0 = mlx5_ib_alloc_pd(&dev->ib_dev, NULL, NULL);
+ if (IS_ERR(devr->p0)) {
+ ret = PTR_ERR(devr->p0);
+ goto error0;
+ }
+ devr->p0->device = &dev->ib_dev;
+ devr->p0->uobject = NULL;
+ atomic_set(&devr->p0->usecnt, 0);
+
+ devr->c0 = mlx5_ib_create_cq(&dev->ib_dev, 1, 0, NULL, NULL);
+ if (IS_ERR(devr->c0)) {
+ ret = PTR_ERR(devr->c0);
+ goto error1;
+ }
+ devr->c0->device = &dev->ib_dev;
+ devr->c0->uobject = NULL;
+ devr->c0->comp_handler = NULL;
+ devr->c0->event_handler = NULL;
+ devr->c0->cq_context = NULL;
+ atomic_set(&devr->c0->usecnt, 0);
+
+ devr->x0 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
+ if (IS_ERR(devr->x0)) {
+ ret = PTR_ERR(devr->x0);
+ goto error2;
+ }
+ devr->x0->device = &dev->ib_dev;
+ devr->x0->inode = NULL;
+ atomic_set(&devr->x0->usecnt, 0);
+ mutex_init(&devr->x0->tgt_qp_mutex);
+ INIT_LIST_HEAD(&devr->x0->tgt_qp_list);
+
+ devr->x1 = mlx5_ib_alloc_xrcd(&dev->ib_dev, NULL, NULL);
+ if (IS_ERR(devr->x1)) {
+ ret = PTR_ERR(devr->x1);
+ goto error3;
+ }
+ devr->x1->device = &dev->ib_dev;
+ devr->x1->inode = NULL;
+ atomic_set(&devr->x1->usecnt, 0);
+ mutex_init(&devr->x1->tgt_qp_mutex);
+ INIT_LIST_HEAD(&devr->x1->tgt_qp_list);
+
+ memset(&attr, 0, sizeof(attr));
+ attr.attr.max_sge = 1;
+ attr.attr.max_wr = 1;
+ attr.srq_type = IB_SRQT_XRC;
+ attr.ext.xrc.cq = devr->c0;
+ attr.ext.xrc.xrcd = devr->x0;
+
+ devr->s0 = mlx5_ib_create_srq(devr->p0, &attr, NULL);
+ if (IS_ERR(devr->s0)) {
+ ret = PTR_ERR(devr->s0);
+ goto error4;
+ }
+ devr->s0->device = &dev->ib_dev;
+ devr->s0->pd = devr->p0;
+ devr->s0->uobject = NULL;
+ devr->s0->event_handler = NULL;
+ devr->s0->srq_context = NULL;
+ devr->s0->srq_type = IB_SRQT_XRC;
+ devr->s0->ext.xrc.xrcd = devr->x0;
+ devr->s0->ext.xrc.cq = devr->c0;
+ atomic_inc(&devr->s0->ext.xrc.xrcd->usecnt);
+ atomic_inc(&devr->s0->ext.xrc.cq->usecnt);
+ atomic_inc(&devr->p0->usecnt);
+ atomic_set(&devr->s0->usecnt, 0);
+
+ return 0;
+
+error4:
+ mlx5_ib_dealloc_xrcd(devr->x1);
+error3:
+ mlx5_ib_dealloc_xrcd(devr->x0);
+error2:
+ mlx5_ib_destroy_cq(devr->c0);
+error1:
+ mlx5_ib_dealloc_pd(devr->p0);
+error0:
+ return ret;
+}
+
+static void destroy_dev_resources(struct mlx5_ib_resources *devr)
+{
+ mlx5_ib_destroy_srq(devr->s0);
+ mlx5_ib_dealloc_xrcd(devr->x0);
+ mlx5_ib_dealloc_xrcd(devr->x1);
+ mlx5_ib_destroy_cq(devr->c0);
+ mlx5_ib_dealloc_pd(devr->p0);
+}
+
+static void *mlx5_ib_add(struct mlx5_core_dev *mdev)
+{
+ struct mlx5_ib_dev *dev;
+ int err;
+ int i;
+
+ printk_once(KERN_INFO "%s", mlx5_version);
+
+ dev = (struct mlx5_ib_dev *)ib_alloc_device(sizeof(*dev));
+ if (!dev)
+ return NULL;
+
+ dev->mdev = mdev;
+
+ err = get_port_caps(dev);
+ if (err)
+ goto err_dealloc;
+
+ get_ext_port_caps(dev);
+
+ MLX5_INIT_DOORBELL_LOCK(&dev->uar_lock);
+
+ strlcpy(dev->ib_dev.name, "mlx5_%d", IB_DEVICE_NAME_MAX);
+ dev->ib_dev.owner = THIS_MODULE;
+ dev->ib_dev.node_type = RDMA_NODE_IB_CA;
+ dev->ib_dev.local_dma_lkey = mdev->caps.gen.reserved_lkey;
+ dev->num_ports = mdev->caps.gen.num_ports;
+ dev->ib_dev.phys_port_cnt = dev->num_ports;
+ dev->ib_dev.num_comp_vectors =
+ dev->mdev->priv.eq_table.num_comp_vectors;
+ dev->ib_dev.dma_device = &mdev->pdev->dev;
+
+ dev->ib_dev.uverbs_abi_ver = MLX5_IB_UVERBS_ABI_VERSION;
+ dev->ib_dev.uverbs_cmd_mask =
+ (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+ (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_REG_MR) |
+ (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_QP) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+ (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) |
+ (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_XSRQ) |
+ (1ull << IB_USER_VERBS_CMD_OPEN_QP);
+ dev->ib_dev.uverbs_ex_cmd_mask =
+ (1ull << IB_USER_VERBS_EX_CMD_QUERY_DEVICE);
+
+ dev->ib_dev.query_device = mlx5_ib_query_device;
+ dev->ib_dev.query_port = mlx5_ib_query_port;
+ dev->ib_dev.query_gid = mlx5_ib_query_gid;
+ dev->ib_dev.query_pkey = mlx5_ib_query_pkey;
+ dev->ib_dev.modify_device = mlx5_ib_modify_device;
+ dev->ib_dev.modify_port = mlx5_ib_modify_port;
+ dev->ib_dev.alloc_ucontext = mlx5_ib_alloc_ucontext;
+ dev->ib_dev.dealloc_ucontext = mlx5_ib_dealloc_ucontext;
+ dev->ib_dev.mmap = mlx5_ib_mmap;
+ dev->ib_dev.alloc_pd = mlx5_ib_alloc_pd;
+ dev->ib_dev.dealloc_pd = mlx5_ib_dealloc_pd;
+ dev->ib_dev.create_ah = mlx5_ib_create_ah;
+ dev->ib_dev.query_ah = mlx5_ib_query_ah;
+ dev->ib_dev.destroy_ah = mlx5_ib_destroy_ah;
+ dev->ib_dev.create_srq = mlx5_ib_create_srq;
+ dev->ib_dev.modify_srq = mlx5_ib_modify_srq;
+ dev->ib_dev.query_srq = mlx5_ib_query_srq;
+ dev->ib_dev.destroy_srq = mlx5_ib_destroy_srq;
+ dev->ib_dev.post_srq_recv = mlx5_ib_post_srq_recv;
+ dev->ib_dev.create_qp = mlx5_ib_create_qp;
+ dev->ib_dev.modify_qp = mlx5_ib_modify_qp;
+ dev->ib_dev.query_qp = mlx5_ib_query_qp;
+ dev->ib_dev.destroy_qp = mlx5_ib_destroy_qp;
+ dev->ib_dev.post_send = mlx5_ib_post_send;
+ dev->ib_dev.post_recv = mlx5_ib_post_recv;
+ dev->ib_dev.create_cq = mlx5_ib_create_cq;
+ dev->ib_dev.modify_cq = mlx5_ib_modify_cq;
+ dev->ib_dev.resize_cq = mlx5_ib_resize_cq;
+ dev->ib_dev.destroy_cq = mlx5_ib_destroy_cq;
+ dev->ib_dev.poll_cq = mlx5_ib_poll_cq;
+ dev->ib_dev.req_notify_cq = mlx5_ib_arm_cq;
+ dev->ib_dev.get_dma_mr = mlx5_ib_get_dma_mr;
+ dev->ib_dev.reg_user_mr = mlx5_ib_reg_user_mr;
+ dev->ib_dev.dereg_mr = mlx5_ib_dereg_mr;
+ dev->ib_dev.destroy_mr = mlx5_ib_destroy_mr;
+ dev->ib_dev.attach_mcast = mlx5_ib_mcg_attach;
+ dev->ib_dev.detach_mcast = mlx5_ib_mcg_detach;
+ dev->ib_dev.process_mad = mlx5_ib_process_mad;
+ dev->ib_dev.create_mr = mlx5_ib_create_mr;
+ dev->ib_dev.alloc_fast_reg_mr = mlx5_ib_alloc_fast_reg_mr;
+ dev->ib_dev.alloc_fast_reg_page_list = mlx5_ib_alloc_fast_reg_page_list;
+ dev->ib_dev.free_fast_reg_page_list = mlx5_ib_free_fast_reg_page_list;
+ dev->ib_dev.check_mr_status = mlx5_ib_check_mr_status;
+
+ mlx5_ib_internal_query_odp_caps(dev);
+
+ if (mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_XRC) {
+ dev->ib_dev.alloc_xrcd = mlx5_ib_alloc_xrcd;
+ dev->ib_dev.dealloc_xrcd = mlx5_ib_dealloc_xrcd;
+ dev->ib_dev.uverbs_cmd_mask |=
+ (1ull << IB_USER_VERBS_CMD_OPEN_XRCD) |
+ (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD);
+ }
+
+ err = init_node_data(dev);
+ if (err)
+ goto err_dealloc;
+
+ mutex_init(&dev->cap_mask_mutex);
+
+ err = create_dev_resources(&dev->devr);
+ if (err)
+ goto err_dealloc;
+
+ err = mlx5_ib_odp_init_one(dev);
+ if (err)
+ goto err_rsrc;
+
+ err = ib_register_device(&dev->ib_dev, NULL);
+ if (err)
+ goto err_odp;
+
+ err = create_umr_res(dev);
+ if (err)
+ goto err_dev;
+
+ for (i = 0; i < ARRAY_SIZE(mlx5_class_attributes); i++) {
+ err = device_create_file(&dev->ib_dev.dev,
+ mlx5_class_attributes[i]);
+ if (err)
+ goto err_umrc;
+ }
+
+ dev->ib_active = true;
+
+ return dev;
+
+err_umrc:
+ destroy_umrc_res(dev);
+
+err_dev:
+ ib_unregister_device(&dev->ib_dev);
+
+err_odp:
+ mlx5_ib_odp_remove_one(dev);
+
+err_rsrc:
+ destroy_dev_resources(&dev->devr);
+
+err_dealloc:
+ ib_dealloc_device((struct ib_device *)dev);
+
+ return NULL;
+}
+
+static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context)
+{
+ struct mlx5_ib_dev *dev = context;
+
+ ib_unregister_device(&dev->ib_dev);
+ destroy_umrc_res(dev);
+ mlx5_ib_odp_remove_one(dev);
+ destroy_dev_resources(&dev->devr);
+ ib_dealloc_device(&dev->ib_dev);
+}
+
+static struct mlx5_interface mlx5_ib_interface = {
+ .add = mlx5_ib_add,
+ .remove = mlx5_ib_remove,
+ .event = mlx5_ib_event,
+ .protocol = MLX5_INTERFACE_PROTOCOL_IB,
+};
+
+static int __init mlx5_ib_init(void)
+{
+ int err;
+
+ if (deprecated_prof_sel != 2)
+ pr_warn("prof_sel is deprecated for mlx5_ib, set it for mlx5_core\n");
+
+ err = mlx5_ib_odp_init();
+ if (err)
+ return err;
+
+ err = mlx5_register_interface(&mlx5_ib_interface);
+ if (err)
+ goto clean_odp;
+
+ return err;
+
+clean_odp:
+ mlx5_ib_odp_cleanup();
+ return err;
+}
+
+static void __exit mlx5_ib_cleanup(void)
+{
+ mlx5_unregister_interface(&mlx5_ib_interface);
+ mlx5_ib_odp_cleanup();
+}
+
+module_init(mlx5_ib_init);
+module_exit(mlx5_ib_cleanup);
diff --git a/drivers/infiniband/hw/mlx5/mem.c b/drivers/infiniband/hw/mlx5/mem.c
new file mode 100644
index 000000000..40df2cca0
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/mem.c
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_umem_odp.h>
+#include "mlx5_ib.h"
+
+/* @umem: umem object to scan
+ * @addr: ib virtual address requested by the user
+ * @count: number of PAGE_SIZE pages covered by umem
+ * @shift: page shift for the compound pages found in the region
+ * @ncont: number of compound pages
+ * @order: log2 of the number of compound pages
+ */
+void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift,
+ int *ncont, int *order)
+{
+ unsigned long tmp;
+ unsigned long m;
+ int i, k;
+ u64 base = 0;
+ int p = 0;
+ int skip;
+ int mask;
+ u64 len;
+ u64 pfn;
+ struct scatterlist *sg;
+ int entry;
+ unsigned long page_shift = ilog2(umem->page_size);
+
+ /* With ODP we must always match OS page size. */
+ if (umem->odp_data) {
+ *count = ib_umem_page_count(umem);
+ *shift = PAGE_SHIFT;
+ *ncont = *count;
+ if (order)
+ *order = ilog2(roundup_pow_of_two(*count));
+
+ return;
+ }
+
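+ /* Find the largest page size (page_shift + m) compatible with both
+ * the requested virtual address and the alignment/contiguity of the
+ * DMA-mapped pages, and report how many compound pages of that size
+ * cover the region.
+ */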
+ addr = addr >> page_shift;
+ tmp = (unsigned long)addr;
+ m = find_first_bit(&tmp, sizeof(tmp));
+ skip = 1 << m;
+ mask = skip - 1;
+ i = 0;
+ for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+ len = sg_dma_len(sg) >> page_shift;
+ pfn = sg_dma_address(sg) >> page_shift;
+ for (k = 0; k < len; k++) {
+ if (!(i & mask)) {
+ tmp = (unsigned long)pfn;
+ m = min_t(unsigned long, m, find_first_bit(&tmp, sizeof(tmp)));
+ skip = 1 << m;
+ mask = skip - 1;
+ base = pfn;
+ p = 0;
+ } else {
+ if (base + p != pfn) {
+ tmp = (unsigned long)p;
+ m = find_first_bit(&tmp, sizeof(tmp));
+ skip = 1 << m;
+ mask = skip - 1;
+ base = pfn;
+ p = 0;
+ }
+ }
+ p++;
+ i++;
+ }
+ }
+
+ if (i) {
+ m = min_t(unsigned long, ilog2(roundup_pow_of_two(i)), m);
+
+ if (order)
+ *order = ilog2(roundup_pow_of_two(i) >> m);
+
+ *ncont = DIV_ROUND_UP(i, (1 << m));
+ } else {
+ m = 0;
+
+ if (order)
+ *order = 0;
+
+ *ncont = 0;
+ }
+ *shift = page_shift + m;
+ *count = i;
+}
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+static u64 umem_dma_to_mtt(dma_addr_t umem_dma)
+{
+ u64 mtt_entry = umem_dma & ODP_DMA_ADDR_MASK;
+
+ if (umem_dma & ODP_READ_ALLOWED_BIT)
+ mtt_entry |= MLX5_IB_MTT_READ;
+ if (umem_dma & ODP_WRITE_ALLOWED_BIT)
+ mtt_entry |= MLX5_IB_MTT_WRITE;
+
+ return mtt_entry;
+}
+#endif
+
+/*
+ * Populate the given array with bus addresses from the umem.
+ *
+ * dev - mlx5_ib device
+ * umem - umem to use to fill the pages
+ * page_shift - determines the page size used in the resulting array
+ * offset - offset into the umem to start from,
+ * only implemented for ODP umems
+ * num_pages - total number of pages to fill
+ * pas - bus addresses array to fill
+ * access_flags - access flags to set on all present pages.
+ *                use enum mlx5_ib_mtt_access_flags for this.
+ */
+void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
+ int page_shift, size_t offset, size_t num_pages,
+ __be64 *pas, int access_flags)
+{
+ unsigned long umem_page_shift = ilog2(umem->page_size);
+ int shift = page_shift - umem_page_shift;
+ int mask = (1 << shift) - 1;
+ int i, k;
+ u64 cur = 0;
+ u64 base;
+ int len;
+ struct scatterlist *sg;
+ int entry;
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+ const bool odp = umem->odp_data != NULL;
+
+ if (odp) {
+ WARN_ON(shift != 0);
+ WARN_ON(access_flags != (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE));
+
+ for (i = 0; i < num_pages; ++i) {
+ dma_addr_t pa = umem->odp_data->dma_list[offset + i];
+
+ pas[i] = cpu_to_be64(umem_dma_to_mtt(pa));
+ }
+ return;
+ }
+#endif
+
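+ /* Walk the SG list and emit one address per compound page, i.e. one
+ * entry for every 2^shift umem pages, OR-ing in the access flags.
+ */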
+ i = 0;
+ for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+ len = sg_dma_len(sg) >> umem_page_shift;
+ base = sg_dma_address(sg);
+ for (k = 0; k < len; k++) {
+ if (!(i & mask)) {
+ cur = base + (k << umem_page_shift);
+ cur |= access_flags;
+
+ pas[i >> shift] = cpu_to_be64(cur);
+ mlx5_ib_dbg(dev, "pas[%d] 0x%llx\n",
+ i >> shift, be64_to_cpu(pas[i >> shift]));
+ } else
+ mlx5_ib_dbg(dev, "=====> 0x%llx\n",
+ base + (k << umem_page_shift));
+ i++;
+ }
+ }
+}
+
+void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
+ int page_shift, __be64 *pas, int access_flags)
+{
+ return __mlx5_ib_populate_pas(dev, umem, page_shift, 0,
+ ib_umem_num_pages(umem), pas,
+ access_flags);
+}
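+
+/* The buffer offset is reported in units of 1/64 of a page
+ * (page_size >> 6); fail if the offset of 'addr' within its page is not
+ * a multiple of that quantum.
+ */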
+int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset)
+{
+ u64 page_size;
+ u64 page_mask;
+ u64 off_size;
+ u64 off_mask;
+ u64 buf_off;
+
+ page_size = (u64)1 << page_shift;
+ page_mask = page_size - 1;
+ buf_off = addr & page_mask;
+ off_size = page_size >> 6;
+ off_mask = off_size - 1;
+
+ if (buf_off & off_mask)
+ return -EINVAL;
+
+ *offset = buf_off >> ilog2(off_size);
+ return 0;
+}
diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h
new file mode 100644
index 000000000..dff1cfcdf
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h
@@ -0,0 +1,669 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX5_IB_H
+#define MLX5_IB_H
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_smi.h>
+#include <linux/mlx5/driver.h>
+#include <linux/mlx5/cq.h>
+#include <linux/mlx5/qp.h>
+#include <linux/mlx5/srq.h>
+#include <linux/types.h>
+
+#define mlx5_ib_dbg(dev, format, arg...) \
+pr_debug("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \
+ __LINE__, current->pid, ##arg)
+
+#define mlx5_ib_err(dev, format, arg...) \
+pr_err("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \
+ __LINE__, current->pid, ##arg)
+
+#define mlx5_ib_warn(dev, format, arg...) \
+pr_warn("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \
+ __LINE__, current->pid, ##arg)
+
+enum {
+ MLX5_IB_MMAP_CMD_SHIFT = 8,
+ MLX5_IB_MMAP_CMD_MASK = 0xff,
+};
+
+enum mlx5_ib_mmap_cmd {
+ MLX5_IB_MMAP_REGULAR_PAGE = 0,
+ MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES = 1, /* always last */
+};
+
+enum {
+ MLX5_RES_SCAT_DATA32_CQE = 0x1,
+ MLX5_RES_SCAT_DATA64_CQE = 0x2,
+ MLX5_REQ_SCAT_DATA32_CQE = 0x11,
+ MLX5_REQ_SCAT_DATA64_CQE = 0x22,
+};
+
+enum mlx5_ib_latency_class {
+ MLX5_IB_LATENCY_CLASS_LOW,
+ MLX5_IB_LATENCY_CLASS_MEDIUM,
+ MLX5_IB_LATENCY_CLASS_HIGH,
+ MLX5_IB_LATENCY_CLASS_FAST_PATH
+};
+
+enum mlx5_ib_mad_ifc_flags {
+ MLX5_MAD_IFC_IGNORE_MKEY = 1,
+ MLX5_MAD_IFC_IGNORE_BKEY = 2,
+ MLX5_MAD_IFC_NET_VIEW = 4,
+};
+
+struct mlx5_ib_ucontext {
+ struct ib_ucontext ibucontext;
+ struct list_head db_page_list;
+
+ /* protect doorbell record alloc/free
+ */
+ struct mutex db_page_mutex;
+ struct mlx5_uuar_info uuari;
+};
+
+static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
+{
+ return container_of(ibucontext, struct mlx5_ib_ucontext, ibucontext);
+}
+
+struct mlx5_ib_pd {
+ struct ib_pd ibpd;
+ u32 pdn;
+ u32 pa_lkey;
+};
+
+/* Use macros here so that we don't have to duplicate
+ * enum ib_send_flags and enum ib_qp_type for low-level driver
+ */
+
+#define MLX5_IB_SEND_UMR_UNREG IB_SEND_RESERVED_START
+#define MLX5_IB_SEND_UMR_FAIL_IF_FREE (IB_SEND_RESERVED_START << 1)
+#define MLX5_IB_SEND_UMR_UPDATE_MTT (IB_SEND_RESERVED_START << 2)
+#define MLX5_IB_QPT_REG_UMR IB_QPT_RESERVED1
+#define MLX5_IB_WR_UMR IB_WR_RESERVED1
+
+struct wr_list {
+ u16 opcode;
+ u16 next;
+};
+
+struct mlx5_ib_wq {
+ u64 *wrid;
+ u32 *wr_data;
+ struct wr_list *w_list;
+ unsigned *wqe_head;
+ u16 unsig_count;
+
+ /* serialize post to the work queue
+ */
+ spinlock_t lock;
+ int wqe_cnt;
+ int max_post;
+ int max_gs;
+ int offset;
+ int wqe_shift;
+ unsigned head;
+ unsigned tail;
+ u16 cur_post;
+ u16 last_poll;
+ void *qend;
+};
+
+enum {
+ MLX5_QP_USER,
+ MLX5_QP_KERNEL,
+ MLX5_QP_EMPTY
+};
+
+/*
+ * Connect-IB can trigger up to four concurrent pagefaults
+ * per-QP.
+ */
+enum mlx5_ib_pagefault_context {
+ MLX5_IB_PAGEFAULT_RESPONDER_READ,
+ MLX5_IB_PAGEFAULT_REQUESTOR_READ,
+ MLX5_IB_PAGEFAULT_RESPONDER_WRITE,
+ MLX5_IB_PAGEFAULT_REQUESTOR_WRITE,
+ MLX5_IB_PAGEFAULT_CONTEXTS
+};
+
+static inline enum mlx5_ib_pagefault_context
+ mlx5_ib_get_pagefault_context(struct mlx5_pagefault *pagefault)
+{
+ return pagefault->flags & (MLX5_PFAULT_REQUESTOR | MLX5_PFAULT_WRITE);
+}
+
+struct mlx5_ib_pfault {
+ struct work_struct work;
+ struct mlx5_pagefault mpfault;
+};
+
+struct mlx5_ib_qp {
+ struct ib_qp ibqp;
+ struct mlx5_core_qp mqp;
+ struct mlx5_buf buf;
+
+ struct mlx5_db db;
+ struct mlx5_ib_wq rq;
+
+ u32 doorbell_qpn;
+ u8 sq_signal_bits;
+ u8 fm_cache;
+ int sq_max_wqes_per_wr;
+ int sq_spare_wqes;
+ struct mlx5_ib_wq sq;
+
+ struct ib_umem *umem;
+ int buf_size;
+
+ /* serialize qp state modifications
+ */
+ struct mutex mutex;
+ u16 xrcdn;
+ u32 flags;
+ u8 port;
+ u8 alt_port;
+ u8 atomic_rd_en;
+ u8 resp_depth;
+ u8 state;
+ int mlx_type;
+ int wq_sig;
+ int scat_cqe;
+ int max_inline_data;
+ struct mlx5_bf *bf;
+ int has_rq;
+
+ /* only for user space QPs. For kernel
+ * we have it from the bf object
+ */
+ int uuarn;
+
+ int create_type;
+ u32 pa_lkey;
+
+ /* Store signature errors */
+ bool signature_en;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+ /*
+ * A flag that is true for QPs that are in a state that doesn't
+ * allow page faults, and shouldn't schedule any more faults.
+ */
+ int disable_page_faults;
+ /*
+ * The disable_page_faults_lock protects a QP's disable_page_faults
+ * field, allowing for a thread to atomically check whether the QP
+ * allows page faults, and if so schedule a page fault.
+ */
+ spinlock_t disable_page_faults_lock;
+ struct mlx5_ib_pfault pagefaults[MLX5_IB_PAGEFAULT_CONTEXTS];
+#endif
+};
+
+struct mlx5_ib_cq_buf {
+ struct mlx5_buf buf;
+ struct ib_umem *umem;
+ int cqe_size;
+ int nent;
+};
+
+enum mlx5_ib_qp_flags {
+ MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK = 1 << 0,
+ MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 1,
+};
+
+struct mlx5_umr_wr {
+ union {
+ u64 virt_addr;
+ u64 offset;
+ } target;
+ struct ib_pd *pd;
+ unsigned int page_shift;
+ unsigned int npages;
+ u32 length;
+ int access_flags;
+ u32 mkey;
+};
+
+struct mlx5_shared_mr_info {
+ int mr_id;
+ struct ib_umem *umem;
+};
+
+struct mlx5_ib_cq {
+ struct ib_cq ibcq;
+ struct mlx5_core_cq mcq;
+ struct mlx5_ib_cq_buf buf;
+ struct mlx5_db db;
+
+ /* serialize access to the CQ
+ */
+ spinlock_t lock;
+
+ /* protect resize cq
+ */
+ struct mutex resize_mutex;
+ struct mlx5_ib_cq_buf *resize_buf;
+ struct ib_umem *resize_umem;
+ int cqe_size;
+};
+
+struct mlx5_ib_srq {
+ struct ib_srq ibsrq;
+ struct mlx5_core_srq msrq;
+ struct mlx5_buf buf;
+ struct mlx5_db db;
+ u64 *wrid;
+ /* protect SRQ handling
+ */
+ spinlock_t lock;
+ int head;
+ int tail;
+ u16 wqe_ctr;
+ struct ib_umem *umem;
+ /* serialize arming a SRQ
+ */
+ struct mutex mutex;
+ int wq_sig;
+};
+
+struct mlx5_ib_xrcd {
+ struct ib_xrcd ibxrcd;
+ u32 xrcdn;
+};
+
+enum mlx5_ib_mtt_access_flags {
+ MLX5_IB_MTT_READ = (1 << 0),
+ MLX5_IB_MTT_WRITE = (1 << 1),
+};
+
+#define MLX5_IB_MTT_PRESENT (MLX5_IB_MTT_READ | MLX5_IB_MTT_WRITE)
+
+struct mlx5_ib_mr {
+ struct ib_mr ibmr;
+ struct mlx5_core_mr mmr;
+ struct ib_umem *umem;
+ struct mlx5_shared_mr_info *smr_info;
+ struct list_head list;
+ int order;
+ int umred;
+ int npages;
+ struct mlx5_ib_dev *dev;
+ struct mlx5_create_mkey_mbox_out out;
+ struct mlx5_core_sig_ctx *sig;
+ int live;
+};
+
+struct mlx5_ib_fast_reg_page_list {
+ struct ib_fast_reg_page_list ibfrpl;
+ __be64 *mapped_page_list;
+ dma_addr_t map;
+};
+
+struct mlx5_ib_umr_context {
+ enum ib_wc_status status;
+ struct completion done;
+};
+
+static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context)
+{
+ context->status = -1;
+ init_completion(&context->done);
+}
+
+struct umr_common {
+ struct ib_pd *pd;
+ struct ib_cq *cq;
+ struct ib_qp *qp;
+ struct ib_mr *mr;
+ /* control access to UMR QP
+ */
+ struct semaphore sem;
+};
+
+enum {
+ MLX5_FMR_INVALID,
+ MLX5_FMR_VALID,
+ MLX5_FMR_BUSY,
+};
+
+struct mlx5_ib_fmr {
+ struct ib_fmr ibfmr;
+ struct mlx5_core_mr mr;
+ int access_flags;
+ int state;
+ /* protect fmr state
+ */
+ spinlock_t lock;
+ u64 wrid;
+ struct ib_send_wr wr[2];
+ u8 page_shift;
+ struct ib_fast_reg_page_list page_list;
+};
+
+struct mlx5_cache_ent {
+ struct list_head head;
+ /* sync access to the cache entry
+ */
+ spinlock_t lock;
+
+ struct dentry *dir;
+ char name[4];
+ u32 order;
+ u32 size;
+ u32 cur;
+ u32 miss;
+ u32 limit;
+
+ struct dentry *fsize;
+ struct dentry *fcur;
+ struct dentry *fmiss;
+ struct dentry *flimit;
+
+ struct mlx5_ib_dev *dev;
+ struct work_struct work;
+ struct delayed_work dwork;
+ int pending;
+};
+
+struct mlx5_mr_cache {
+ struct workqueue_struct *wq;
+ struct mlx5_cache_ent ent[MAX_MR_CACHE_ENTRIES];
+ int stopped;
+ struct dentry *root;
+ unsigned long last_add;
+};
+
+struct mlx5_ib_resources {
+ struct ib_cq *c0;
+ struct ib_xrcd *x0;
+ struct ib_xrcd *x1;
+ struct ib_pd *p0;
+ struct ib_srq *s0;
+};
+
+struct mlx5_ib_dev {
+ struct ib_device ib_dev;
+ struct mlx5_core_dev *mdev;
+ MLX5_DECLARE_DOORBELL_LOCK(uar_lock);
+ int num_ports;
+ /* serialize update of capability mask
+ */
+ struct mutex cap_mask_mutex;
+ bool ib_active;
+ struct umr_common umrc;
+ /* sync used page count stats
+ */
+ struct mlx5_ib_resources devr;
+ struct mlx5_mr_cache cache;
+ struct timer_list delay_timer;
+ int fill_delay;
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+ struct ib_odp_caps odp_caps;
+ /*
+ * Sleepable RCU that prevents destruction of MRs while they are still
+ * being used by a page fault handler.
+ */
+ struct srcu_struct mr_srcu;
+#endif
+};
+
+static inline struct mlx5_ib_cq *to_mibcq(struct mlx5_core_cq *mcq)
+{
+ return container_of(mcq, struct mlx5_ib_cq, mcq);
+}
+
+static inline struct mlx5_ib_xrcd *to_mxrcd(struct ib_xrcd *ibxrcd)
+{
+ return container_of(ibxrcd, struct mlx5_ib_xrcd, ibxrcd);
+}
+
+static inline struct mlx5_ib_dev *to_mdev(struct ib_device *ibdev)
+{
+ return container_of(ibdev, struct mlx5_ib_dev, ib_dev);
+}
+
+static inline struct mlx5_ib_fmr *to_mfmr(struct ib_fmr *ibfmr)
+{
+ return container_of(ibfmr, struct mlx5_ib_fmr, ibfmr);
+}
+
+static inline struct mlx5_ib_cq *to_mcq(struct ib_cq *ibcq)
+{
+ return container_of(ibcq, struct mlx5_ib_cq, ibcq);
+}
+
+static inline struct mlx5_ib_qp *to_mibqp(struct mlx5_core_qp *mqp)
+{
+ return container_of(mqp, struct mlx5_ib_qp, mqp);
+}
+
+static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mr *mmr)
+{
+ return container_of(mmr, struct mlx5_ib_mr, mmr);
+}
+
+static inline struct mlx5_ib_pd *to_mpd(struct ib_pd *ibpd)
+{
+ return container_of(ibpd, struct mlx5_ib_pd, ibpd);
+}
+
+static inline struct mlx5_ib_srq *to_msrq(struct ib_srq *ibsrq)
+{
+ return container_of(ibsrq, struct mlx5_ib_srq, ibsrq);
+}
+
+static inline struct mlx5_ib_qp *to_mqp(struct ib_qp *ibqp)
+{
+ return container_of(ibqp, struct mlx5_ib_qp, ibqp);
+}
+
+static inline struct mlx5_ib_srq *to_mibsrq(struct mlx5_core_srq *msrq)
+{
+ return container_of(msrq, struct mlx5_ib_srq, msrq);
+}
+
+static inline struct mlx5_ib_mr *to_mmr(struct ib_mr *ibmr)
+{
+ return container_of(ibmr, struct mlx5_ib_mr, ibmr);
+}
+
+static inline struct mlx5_ib_fast_reg_page_list *to_mfrpl(struct ib_fast_reg_page_list *ibfrpl)
+{
+ return container_of(ibfrpl, struct mlx5_ib_fast_reg_page_list, ibfrpl);
+}
+
+struct mlx5_ib_ah {
+ struct ib_ah ibah;
+ struct mlx5_av av;
+};
+
+static inline struct mlx5_ib_ah *to_mah(struct ib_ah *ibah)
+{
+ return container_of(ibah, struct mlx5_ib_ah, ibah);
+}
+
+int mlx5_ib_db_map_user(struct mlx5_ib_ucontext *context, unsigned long virt,
+ struct mlx5_db *db);
+void mlx5_ib_db_unmap_user(struct mlx5_ib_ucontext *context, struct mlx5_db *db);
+void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq);
+void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq);
+void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index);
+int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey,
+ u8 port, struct ib_wc *in_wc, struct ib_grh *in_grh,
+ void *in_mad, void *response_mad);
+struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr,
+ struct mlx5_ib_ah *ah);
+struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr);
+int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr);
+int mlx5_ib_destroy_ah(struct ib_ah *ah);
+struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
+ struct ib_srq_init_attr *init_attr,
+ struct ib_udata *udata);
+int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
+int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr);
+int mlx5_ib_destroy_srq(struct ib_srq *srq);
+int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr);
+struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata);
+int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata);
+int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr);
+int mlx5_ib_destroy_qp(struct ib_qp *qp);
+int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr);
+int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr);
+void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n);
+int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index,
+ void *buffer, u32 length);
+struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int entries,
+ int vector, struct ib_ucontext *context,
+ struct ib_udata *udata);
+int mlx5_ib_destroy_cq(struct ib_cq *cq);
+int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc);
+int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags);
+int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period);
+int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata);
+struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc);
+struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt_addr, int access_flags,
+ struct ib_udata *udata);
+int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index,
+ int npages, int zap);
+int mlx5_ib_dereg_mr(struct ib_mr *ibmr);
+int mlx5_ib_destroy_mr(struct ib_mr *ibmr);
+struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd,
+ struct ib_mr_init_attr *mr_init_attr);
+struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd,
+ int max_page_list_len);
+struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
+ int page_list_len);
+void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list);
+struct ib_fmr *mlx5_ib_fmr_alloc(struct ib_pd *pd, int acc,
+ struct ib_fmr_attr *fmr_attr);
+int mlx5_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+ int npages, u64 iova);
+int mlx5_ib_unmap_fmr(struct list_head *fmr_list);
+int mlx5_ib_fmr_dealloc(struct ib_fmr *ibfmr);
+int mlx5_ib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+ struct ib_wc *in_wc, struct ib_grh *in_grh,
+ struct ib_mad *in_mad, struct ib_mad *out_mad);
+struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev,
+ struct ib_ucontext *context,
+ struct ib_udata *udata);
+int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd);
+int mlx5_ib_get_buf_offset(u64 addr, int page_shift, u32 *offset);
+int mlx5_query_ext_port_caps(struct mlx5_ib_dev *dev, u8 port);
+int mlx5_ib_query_port(struct ib_device *ibdev, u8 port,
+ struct ib_port_attr *props);
+int mlx5_ib_init_fmr(struct mlx5_ib_dev *dev);
+void mlx5_ib_cleanup_fmr(struct mlx5_ib_dev *dev);
+void mlx5_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift,
+ int *ncont, int *order);
+void __mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
+ int page_shift, size_t offset, size_t num_pages,
+ __be64 *pas, int access_flags);
+void mlx5_ib_populate_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
+ int page_shift, __be64 *pas, int access_flags);
+void mlx5_ib_copy_pas(u64 *old, u64 *new, int step, int num);
+int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq);
+int mlx5_mr_cache_init(struct mlx5_ib_dev *dev);
+int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev);
+int mlx5_mr_ib_cont_pages(struct ib_umem *umem, u64 addr, int *count, int *shift);
+void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context);
+int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
+ struct ib_mr_status *mr_status);
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+extern struct workqueue_struct *mlx5_ib_page_fault_wq;
+
+int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev);
+void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp,
+ struct mlx5_ib_pfault *pfault);
+void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp);
+int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev);
+void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev);
+int __init mlx5_ib_odp_init(void);
+void mlx5_ib_odp_cleanup(void);
+void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp);
+void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp);
+void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
+ unsigned long end);
+
+#else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
+static inline int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev)
+{
+ return 0;
+}
+
+static inline void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) {}
+static inline int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev) { return 0; }
+static inline void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev) {}
+static inline int mlx5_ib_odp_init(void) { return 0; }
+static inline void mlx5_ib_odp_cleanup(void) {}
+static inline void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp) {}
+static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) {}
+
+#endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */
+
+static inline void init_query_mad(struct ib_smp *mad)
+{
+ mad->base_version = 1;
+ mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+ mad->class_version = 1;
+ mad->method = IB_MGMT_METHOD_GET;
+}
+
+static inline u8 convert_access(int acc)
+{
+ return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC : 0) |
+ (acc & IB_ACCESS_REMOTE_WRITE ? MLX5_PERM_REMOTE_WRITE : 0) |
+ (acc & IB_ACCESS_REMOTE_READ ? MLX5_PERM_REMOTE_READ : 0) |
+ (acc & IB_ACCESS_LOCAL_WRITE ? MLX5_PERM_LOCAL_WRITE : 0) |
+ MLX5_PERM_LOCAL_READ;
+}
+
+#define MLX5_MAX_UMR_SHIFT 16
+#define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT)
+
+#endif /* MLX5_IB_H */
diff --git a/drivers/infiniband/hw/mlx5/mr.c b/drivers/infiniband/hw/mlx5/mr.c
new file mode 100644
index 000000000..71c593583
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/mr.c
@@ -0,0 +1,1479 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include <linux/kref.h>
+#include <linux/random.h>
+#include <linux/debugfs.h>
+#include <linux/export.h>
+#include <linux/delay.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_umem_odp.h>
+#include <rdma/ib_verbs.h>
+#include "mlx5_ib.h"
+
+enum {
+ MAX_PENDING_REG_MR = 8,
+};
+
+#define MLX5_UMR_ALIGN 2048
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+static __be64 mlx5_ib_update_mtt_emergency_buffer[
+ MLX5_UMR_MTT_MIN_CHUNK_SIZE/sizeof(__be64)]
+ __aligned(MLX5_UMR_ALIGN);
+static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex);
+#endif
+
+static int clean_mr(struct mlx5_ib_mr *mr);
+
+static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
+{
+ int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmr);
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+ /* Wait until all page fault handlers using the mr complete. */
+ synchronize_srcu(&dev->mr_srcu);
+#endif
+
+ return err;
+}
+
+static int order2idx(struct mlx5_ib_dev *dev, int order)
+{
+ struct mlx5_mr_cache *cache = &dev->cache;
+
+ if (order < cache->ent[0].order)
+ return 0;
+ else
+ return order - cache->ent[0].order;
+}
+
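+/* Completion handler for the asynchronous mkey creation issued by
+ * add_keys().  On failure the MR is freed and cache filling is delayed
+ * for a second; on success the returned mkey index is combined with a
+ * fresh key byte, the MR is added to its cache bucket and the mkey is
+ * inserted into the core radix tree.
+ */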
+static void reg_mr_callback(int status, void *context)
+{
+ struct mlx5_ib_mr *mr = context;
+ struct mlx5_ib_dev *dev = mr->dev;
+ struct mlx5_mr_cache *cache = &dev->cache;
+ int c = order2idx(dev, mr->order);
+ struct mlx5_cache_ent *ent = &cache->ent[c];
+ u8 key;
+ unsigned long flags;
+ struct mlx5_mr_table *table = &dev->mdev->priv.mr_table;
+ int err;
+
+ spin_lock_irqsave(&ent->lock, flags);
+ ent->pending--;
+ spin_unlock_irqrestore(&ent->lock, flags);
+ if (status) {
+ mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
+ kfree(mr);
+ dev->fill_delay = 1;
+ mod_timer(&dev->delay_timer, jiffies + HZ);
+ return;
+ }
+
+ if (mr->out.hdr.status) {
+ mlx5_ib_warn(dev, "failed - status %d, syndorme 0x%x\n",
+ mr->out.hdr.status,
+ be32_to_cpu(mr->out.hdr.syndrome));
+ kfree(mr);
+ dev->fill_delay = 1;
+ mod_timer(&dev->delay_timer, jiffies + HZ);
+ return;
+ }
+
+ spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags);
+ key = dev->mdev->priv.mkey_key++;
+ spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags);
+ mr->mmr.key = mlx5_idx_to_mkey(be32_to_cpu(mr->out.mkey) & 0xffffff) | key;
+
+ cache->last_add = jiffies;
+
+ spin_lock_irqsave(&ent->lock, flags);
+ list_add_tail(&mr->list, &ent->head);
+ ent->cur++;
+ ent->size++;
+ spin_unlock_irqrestore(&ent->lock, flags);
+
+ write_lock_irqsave(&table->lock, flags);
+ err = radix_tree_insert(&table->tree, mlx5_base_mkey(mr->mmr.key),
+ &mr->mmr);
+ if (err)
+ pr_err("Error inserting to mr tree. 0x%x\n", -err);
+ write_unlock_irqrestore(&table->lock, flags);
+}
+
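+/* Asynchronously create up to 'num' UMR-capable mkeys for cache bucket
+ * 'c'.  At most MAX_PENDING_REG_MR creations may be outstanding per
+ * bucket; beyond that -EAGAIN is returned so the caller can retry.
+ */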
+static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
+{
+ struct mlx5_mr_cache *cache = &dev->cache;
+ struct mlx5_cache_ent *ent = &cache->ent[c];
+ struct mlx5_create_mkey_mbox_in *in;
+ struct mlx5_ib_mr *mr;
+ int npages = 1 << ent->order;
+ int err = 0;
+ int i;
+
+ in = kzalloc(sizeof(*in), GFP_KERNEL);
+ if (!in)
+ return -ENOMEM;
+
+ for (i = 0; i < num; i++) {
+ if (ent->pending >= MAX_PENDING_REG_MR) {
+ err = -EAGAIN;
+ break;
+ }
+
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr) {
+ err = -ENOMEM;
+ break;
+ }
+ mr->order = ent->order;
+ mr->umred = 1;
+ mr->dev = dev;
+ in->seg.status = MLX5_MKEY_STATUS_FREE;
+ in->seg.xlt_oct_size = cpu_to_be32((npages + 1) / 2);
+ in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
+ in->seg.flags = MLX5_ACCESS_MODE_MTT | MLX5_PERM_UMR_EN;
+ in->seg.log2_page_size = 12;
+
+ spin_lock_irq(&ent->lock);
+ ent->pending++;
+ spin_unlock_irq(&ent->lock);
+ err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in,
+ sizeof(*in), reg_mr_callback,
+ mr, &mr->out);
+ if (err) {
+ spin_lock_irq(&ent->lock);
+ ent->pending--;
+ spin_unlock_irq(&ent->lock);
+ mlx5_ib_warn(dev, "create mkey failed %d\n", err);
+ kfree(mr);
+ break;
+ }
+ }
+
+ kfree(in);
+ return err;
+}
+
+static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
+{
+ struct mlx5_mr_cache *cache = &dev->cache;
+ struct mlx5_cache_ent *ent = &cache->ent[c];
+ struct mlx5_ib_mr *mr;
+ int err;
+ int i;
+
+ for (i = 0; i < num; i++) {
+ spin_lock_irq(&ent->lock);
+ if (list_empty(&ent->head)) {
+ spin_unlock_irq(&ent->lock);
+ return;
+ }
+ mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
+ list_del(&mr->list);
+ ent->cur--;
+ ent->size--;
+ spin_unlock_irq(&ent->lock);
+ err = destroy_mkey(dev, mr);
+ if (err)
+ mlx5_ib_warn(dev, "failed destroy mkey\n");
+ else
+ kfree(mr);
+ }
+}
+
+static ssize_t size_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct mlx5_cache_ent *ent = filp->private_data;
+ struct mlx5_ib_dev *dev = ent->dev;
+ char lbuf[20];
+ u32 var;
+ int err;
+ int c;
+
+ if (copy_from_user(lbuf, buf, sizeof(lbuf)))
+ return -EFAULT;
+
+ c = order2idx(dev, ent->order);
+ lbuf[sizeof(lbuf) - 1] = 0;
+
+ if (sscanf(lbuf, "%u", &var) != 1)
+ return -EINVAL;
+
+ if (var < ent->limit)
+ return -EINVAL;
+
+ if (var > ent->size) {
+ do {
+ err = add_keys(dev, c, var - ent->size);
+ if (err && err != -EAGAIN)
+ return err;
+
+ usleep_range(3000, 5000);
+ } while (err);
+ } else if (var < ent->size) {
+ remove_keys(dev, c, ent->size - var);
+ }
+
+ return count;
+}
+
+static ssize_t size_read(struct file *filp, char __user *buf, size_t count,
+ loff_t *pos)
+{
+ struct mlx5_cache_ent *ent = filp->private_data;
+ char lbuf[20];
+ int err;
+
+ if (*pos)
+ return 0;
+
+ err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->size);
+ if (err < 0)
+ return err;
+
+ if (copy_to_user(buf, lbuf, err))
+ return -EFAULT;
+
+ *pos += err;
+
+ return err;
+}
+
+static const struct file_operations size_fops = {
+ .owner = THIS_MODULE,
+ .open = simple_open,
+ .write = size_write,
+ .read = size_read,
+};
+
+static ssize_t limit_write(struct file *filp, const char __user *buf,
+ size_t count, loff_t *pos)
+{
+ struct mlx5_cache_ent *ent = filp->private_data;
+ struct mlx5_ib_dev *dev = ent->dev;
+ char lbuf[20];
+ u32 var;
+ int err;
+ int c;
+
+ if (copy_from_user(lbuf, buf, sizeof(lbuf)))
+ return -EFAULT;
+
+ c = order2idx(dev, ent->order);
+ lbuf[sizeof(lbuf) - 1] = 0;
+
+ if (sscanf(lbuf, "%u", &var) != 1)
+ return -EINVAL;
+
+ if (var > ent->size)
+ return -EINVAL;
+
+ ent->limit = var;
+
+ if (ent->cur < ent->limit) {
+ err = add_keys(dev, c, 2 * ent->limit - ent->cur);
+ if (err)
+ return err;
+ }
+
+ return count;
+}
+
+static ssize_t limit_read(struct file *filp, char __user *buf, size_t count,
+ loff_t *pos)
+{
+ struct mlx5_cache_ent *ent = filp->private_data;
+ char lbuf[20];
+ int err;
+
+ if (*pos)
+ return 0;
+
+ err = snprintf(lbuf, sizeof(lbuf), "%d\n", ent->limit);
+ if (err < 0)
+ return err;
+
+ if (copy_to_user(buf, lbuf, err))
+ return -EFAULT;
+
+ *pos += err;
+
+ return err;
+}
+
+static const struct file_operations limit_fops = {
+ .owner = THIS_MODULE,
+ .open = simple_open,
+ .write = limit_write,
+ .read = limit_read,
+};
+
+static int someone_adding(struct mlx5_mr_cache *cache)
+{
+ int i;
+
+ for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
+ if (cache->ent[i].cur < cache->ent[i].limit)
+ return 1;
+ }
+
+ return 0;
+}
+
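+/* Background worker that keeps a cache bucket between its limit and
+ * twice its limit: entries are added one at a time (re-queueing with a
+ * delay on -EAGAIN or other errors) and are trimmed only after the
+ * bucket has been over-full for 300 seconds with nobody else refilling
+ * the cache.
+ */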
+static void __cache_work_func(struct mlx5_cache_ent *ent)
+{
+ struct mlx5_ib_dev *dev = ent->dev;
+ struct mlx5_mr_cache *cache = &dev->cache;
+ int i = order2idx(dev, ent->order);
+ int err;
+
+ if (cache->stopped)
+ return;
+
+ ent = &dev->cache.ent[i];
+ if (ent->cur < 2 * ent->limit && !dev->fill_delay) {
+ err = add_keys(dev, i, 1);
+ if (ent->cur < 2 * ent->limit) {
+ if (err == -EAGAIN) {
+ mlx5_ib_dbg(dev, "returned eagain, order %d\n",
+ i + 2);
+ queue_delayed_work(cache->wq, &ent->dwork,
+ msecs_to_jiffies(3));
+ } else if (err) {
+ mlx5_ib_warn(dev, "command failed order %d, err %d\n",
+ i + 2, err);
+ queue_delayed_work(cache->wq, &ent->dwork,
+ msecs_to_jiffies(1000));
+ } else {
+ queue_work(cache->wq, &ent->work);
+ }
+ }
+ } else if (ent->cur > 2 * ent->limit) {
+ if (!someone_adding(cache) &&
+ time_after(jiffies, cache->last_add + 300 * HZ)) {
+ remove_keys(dev, i, 1);
+ if (ent->cur > ent->limit)
+ queue_work(cache->wq, &ent->work);
+ } else {
+ queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
+ }
+ }
+}
+
+static void delayed_cache_work_func(struct work_struct *work)
+{
+ struct mlx5_cache_ent *ent;
+
+ ent = container_of(work, struct mlx5_cache_ent, dwork.work);
+ __cache_work_func(ent);
+}
+
+static void cache_work_func(struct work_struct *work)
+{
+ struct mlx5_cache_ent *ent;
+
+ ent = container_of(work, struct mlx5_cache_ent, work);
+ __cache_work_func(ent);
+}
+
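+/* Take an MR from the first cache bucket that is large enough for
+ * 'order'.  Empty buckets that are visited are kicked for refill, as is
+ * the bucket we allocate from if it drops below its limit; a miss is
+ * counted against the requested bucket when nothing is found.
+ */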
+static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)
+{
+ struct mlx5_mr_cache *cache = &dev->cache;
+ struct mlx5_ib_mr *mr = NULL;
+ struct mlx5_cache_ent *ent;
+ int c;
+ int i;
+
+ c = order2idx(dev, order);
+ if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) {
+ mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c);
+ return NULL;
+ }
+
+ for (i = c; i < MAX_MR_CACHE_ENTRIES; i++) {
+ ent = &cache->ent[i];
+
+ mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i);
+
+ spin_lock_irq(&ent->lock);
+ if (!list_empty(&ent->head)) {
+ mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
+ list);
+ list_del(&mr->list);
+ ent->cur--;
+ spin_unlock_irq(&ent->lock);
+ if (ent->cur < ent->limit)
+ queue_work(cache->wq, &ent->work);
+ break;
+ }
+ spin_unlock_irq(&ent->lock);
+
+ queue_work(cache->wq, &ent->work);
+
+ if (mr)
+ break;
+ }
+
+ if (!mr)
+ cache->ent[c].miss++;
+
+ return mr;
+}
+
+static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
+{
+ struct mlx5_mr_cache *cache = &dev->cache;
+ struct mlx5_cache_ent *ent;
+ int shrink = 0;
+ int c;
+
+ c = order2idx(dev, mr->order);
+ if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) {
+ mlx5_ib_warn(dev, "order %d, cache index %d\n", mr->order, c);
+ return;
+ }
+ ent = &cache->ent[c];
+ spin_lock_irq(&ent->lock);
+ list_add_tail(&mr->list, &ent->head);
+ ent->cur++;
+ if (ent->cur > 2 * ent->limit)
+ shrink = 1;
+ spin_unlock_irq(&ent->lock);
+
+ if (shrink)
+ queue_work(cache->wq, &ent->work);
+}
+
+static void clean_keys(struct mlx5_ib_dev *dev, int c)
+{
+ struct mlx5_mr_cache *cache = &dev->cache;
+ struct mlx5_cache_ent *ent = &cache->ent[c];
+ struct mlx5_ib_mr *mr;
+ int err;
+
+ cancel_delayed_work(&ent->dwork);
+ while (1) {
+ spin_lock_irq(&ent->lock);
+ if (list_empty(&ent->head)) {
+ spin_unlock_irq(&ent->lock);
+ return;
+ }
+ mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
+ list_del(&mr->list);
+ ent->cur--;
+ ent->size--;
+ spin_unlock_irq(&ent->lock);
+ err = destroy_mkey(dev, mr);
+ if (err)
+ mlx5_ib_warn(dev, "failed destroy mkey\n");
+ else
+ kfree(mr);
+ }
+}
+
+static int mlx5_mr_cache_debugfs_init(struct mlx5_ib_dev *dev)
+{
+ struct mlx5_mr_cache *cache = &dev->cache;
+ struct mlx5_cache_ent *ent;
+ int i;
+
+ if (!mlx5_debugfs_root)
+ return 0;
+
+ cache->root = debugfs_create_dir("mr_cache", dev->mdev->priv.dbg_root);
+ if (!cache->root)
+ return -ENOMEM;
+
+ for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
+ ent = &cache->ent[i];
+ sprintf(ent->name, "%d", ent->order);
+ ent->dir = debugfs_create_dir(ent->name, cache->root);
+ if (!ent->dir)
+ return -ENOMEM;
+
+ ent->fsize = debugfs_create_file("size", 0600, ent->dir, ent,
+ &size_fops);
+ if (!ent->fsize)
+ return -ENOMEM;
+
+ ent->flimit = debugfs_create_file("limit", 0600, ent->dir, ent,
+ &limit_fops);
+ if (!ent->flimit)
+ return -ENOMEM;
+
+ ent->fcur = debugfs_create_u32("cur", 0400, ent->dir,
+ &ent->cur);
+ if (!ent->fcur)
+ return -ENOMEM;
+
+ ent->fmiss = debugfs_create_u32("miss", 0600, ent->dir,
+ &ent->miss);
+ if (!ent->fmiss)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void mlx5_mr_cache_debugfs_cleanup(struct mlx5_ib_dev *dev)
+{
+ if (!mlx5_debugfs_root)
+ return;
+
+ debugfs_remove_recursive(dev->cache.root);
+}
+
+static void delay_time_func(unsigned long ctx)
+{
+ struct mlx5_ib_dev *dev = (struct mlx5_ib_dev *)ctx;
+
+ dev->fill_delay = 0;
+}
+
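+/* Create the mkey cache: a single-threaded workqueue plus one bucket per
+ * order starting at order 2, each with its own work items and a pre-fill
+ * limit taken from the core profile when MLX5_PROF_MASK_MR_CACHE is set.
+ * Debugfs knobs are created when the mlx5 debugfs root exists; failure
+ * to create them is only warned about.
+ */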
+int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
+{
+ struct mlx5_mr_cache *cache = &dev->cache;
+ struct mlx5_cache_ent *ent;
+ int limit;
+ int err;
+ int i;
+
+ cache->wq = create_singlethread_workqueue("mkey_cache");
+ if (!cache->wq) {
+ mlx5_ib_warn(dev, "failed to create work queue\n");
+ return -ENOMEM;
+ }
+
+ setup_timer(&dev->delay_timer, delay_time_func, (unsigned long)dev);
+ for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
+ INIT_LIST_HEAD(&cache->ent[i].head);
+ spin_lock_init(&cache->ent[i].lock);
+
+ ent = &cache->ent[i];
+ INIT_LIST_HEAD(&ent->head);
+ spin_lock_init(&ent->lock);
+ ent->order = i + 2;
+ ent->dev = dev;
+
+ if (dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE)
+ limit = dev->mdev->profile->mr_cache[i].limit;
+ else
+ limit = 0;
+
+ INIT_WORK(&ent->work, cache_work_func);
+ INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
+ ent->limit = limit;
+ queue_work(cache->wq, &ent->work);
+ }
+
+ err = mlx5_mr_cache_debugfs_init(dev);
+ if (err)
+ mlx5_ib_warn(dev, "cache debugfs failure\n");
+
+ return 0;
+}
+
+int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
+{
+ int i;
+
+ dev->cache.stopped = 1;
+ flush_workqueue(dev->cache.wq);
+
+ mlx5_mr_cache_debugfs_cleanup(dev);
+
+ for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
+ clean_keys(dev, i);
+
+ destroy_workqueue(dev->cache.wq);
+ del_timer_sync(&dev->delay_timer);
+
+ return 0;
+}
+
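+/* Allocate a physical-address (PA) mode mkey with start address 0 and
+ * the MLX5_MKEY_LEN64 flag, carrying the converted access rights and no
+ * backing umem.
+ */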
+struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
+{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ struct mlx5_core_dev *mdev = dev->mdev;
+ struct mlx5_create_mkey_mbox_in *in;
+ struct mlx5_mkey_seg *seg;
+ struct mlx5_ib_mr *mr;
+ int err;
+
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ in = kzalloc(sizeof(*in), GFP_KERNEL);
+ if (!in) {
+ err = -ENOMEM;
+ goto err_free;
+ }
+
+ seg = &in->seg;
+ seg->flags = convert_access(acc) | MLX5_ACCESS_MODE_PA;
+ seg->flags_pd = cpu_to_be32(to_mpd(pd)->pdn | MLX5_MKEY_LEN64);
+ seg->qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
+ seg->start_addr = 0;
+
+ err = mlx5_core_create_mkey(mdev, &mr->mmr, in, sizeof(*in), NULL, NULL,
+ NULL);
+ if (err)
+ goto err_in;
+
+ kfree(in);
+ mr->ibmr.lkey = mr->mmr.key;
+ mr->ibmr.rkey = mr->mmr.key;
+ mr->umem = NULL;
+
+ return &mr->ibmr;
+
+err_in:
+ kfree(in);
+
+err_free:
+ kfree(mr);
+
+ return ERR_PTR(err);
+}
+
+static int get_octo_len(u64 addr, u64 len, int page_size)
+{
+ u64 offset;
+ int npages;
+
+ offset = addr & (page_size - 1);
+ npages = ALIGN(len + offset, page_size) >> ilog2(page_size);
+ return (npages + 1) / 2;
+}
+
+static int use_umr(int order)
+{
+ return order <= MLX5_MAX_UMR_SHIFT;
+}
+
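+/* Build a UMR registration work request: the single SGE points at the
+ * DMA-mapped page array, and the mlx5_umr_wr overlay on wr.wr.fast_reg
+ * carries the mkey, page shift, length and virtual address to program.
+ */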
+static void prep_umr_reg_wqe(struct ib_pd *pd, struct ib_send_wr *wr,
+ struct ib_sge *sg, u64 dma, int n, u32 key,
+ int page_shift, u64 virt_addr, u64 len,
+ int access_flags)
+{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ struct ib_mr *mr = dev->umrc.mr;
+ struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg;
+
+ sg->addr = dma;
+ sg->length = ALIGN(sizeof(u64) * n, 64);
+ sg->lkey = mr->lkey;
+
+ wr->next = NULL;
+ wr->send_flags = 0;
+ wr->sg_list = sg;
+ if (n)
+ wr->num_sge = 1;
+ else
+ wr->num_sge = 0;
+
+ wr->opcode = MLX5_IB_WR_UMR;
+
+ umrwr->npages = n;
+ umrwr->page_shift = page_shift;
+ umrwr->mkey = key;
+ umrwr->target.virt_addr = virt_addr;
+ umrwr->length = len;
+ umrwr->access_flags = access_flags;
+ umrwr->pd = pd;
+}
+
+static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev,
+ struct ib_send_wr *wr, u32 key)
+{
+ struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg;
+
+ wr->send_flags = MLX5_IB_SEND_UMR_UNREG | MLX5_IB_SEND_UMR_FAIL_IF_FREE;
+ wr->opcode = MLX5_IB_WR_UMR;
+ umrwr->mkey = key;
+}
+
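+/* Completion handler for the UMR CQ: drain all completions, propagate
+ * each status to the mlx5_ib_umr_context stored in wr_id, wake the
+ * waiter and re-arm the CQ.
+ */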
+void mlx5_umr_cq_handler(struct ib_cq *cq, void *cq_context)
+{
+ struct mlx5_ib_umr_context *context;
+ struct ib_wc wc;
+ int err;
+
+ while (1) {
+ err = ib_poll_cq(cq, 1, &wc);
+ if (err < 0) {
+ pr_warn("poll cq error %d\n", err);
+ return;
+ }
+ if (err == 0)
+ break;
+
+ context = (struct mlx5_ib_umr_context *) (unsigned long) wc.wr_id;
+ context->status = wc.status;
+ complete(&context->done);
+ }
+ ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+}
+
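+/* Register a user MR through the UMR QP: take an mkey from the cache
+ * (kicking off key creation and returning -EAGAIN on a miss), build a
+ * 2k-aligned page array padded to MLX5_UMR_MTT_ALIGNMENT, DMA-map it,
+ * post a MLX5_IB_WR_UMR work request and wait for mlx5_umr_cq_handler()
+ * to signal completion.
+ */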
+static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,
+ u64 virt_addr, u64 len, int npages,
+ int page_shift, int order, int access_flags)
+{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ struct device *ddev = dev->ib_dev.dma_device;
+ struct umr_common *umrc = &dev->umrc;
+ struct mlx5_ib_umr_context umr_context;
+ struct ib_send_wr wr, *bad;
+ struct mlx5_ib_mr *mr;
+ struct ib_sge sg;
+ int size;
+ __be64 *mr_pas;
+ __be64 *pas;
+ dma_addr_t dma;
+ int err = 0;
+ int i;
+
+ for (i = 0; i < 1; i++) {
+ mr = alloc_cached_mr(dev, order);
+ if (mr)
+ break;
+
+ err = add_keys(dev, order2idx(dev, order), 1);
+ if (err && err != -EAGAIN) {
+ mlx5_ib_warn(dev, "add_keys failed, err %d\n", err);
+ break;
+ }
+ }
+
+ if (!mr)
+ return ERR_PTR(-EAGAIN);
+
+ /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes.
+ * To avoid copying garbage after the pas array, we allocate
+ * a little more. */
+ size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT);
+ mr_pas = kmalloc(size + MLX5_UMR_ALIGN - 1, GFP_KERNEL);
+ if (!mr_pas) {
+ err = -ENOMEM;
+ goto free_mr;
+ }
+
+ pas = PTR_ALIGN(mr_pas, MLX5_UMR_ALIGN);
+ mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT);
+ /* Clear padding after the actual pages. */
+ memset(pas + npages, 0, size - npages * sizeof(u64));
+
+ dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE);
+ if (dma_mapping_error(ddev, dma)) {
+ err = -ENOMEM;
+ goto free_pas;
+ }
+
+ memset(&wr, 0, sizeof(wr));
+ wr.wr_id = (u64)(unsigned long)&umr_context;
+ prep_umr_reg_wqe(pd, &wr, &sg, dma, npages, mr->mmr.key, page_shift,
+ virt_addr, len, access_flags);
+
+ mlx5_ib_init_umr_context(&umr_context);
+ down(&umrc->sem);
+ err = ib_post_send(umrc->qp, &wr, &bad);
+ if (err) {
+ mlx5_ib_warn(dev, "post send failed, err %d\n", err);
+ goto unmap_dma;
+ } else {
+ wait_for_completion(&umr_context.done);
+ if (umr_context.status != IB_WC_SUCCESS) {
+ mlx5_ib_warn(dev, "reg umr failed\n");
+ err = -EFAULT;
+ }
+ }
+
+ mr->mmr.iova = virt_addr;
+ mr->mmr.size = len;
+ mr->mmr.pd = to_mpd(pd)->pdn;
+
+ mr->live = 1;
+
+unmap_dma:
+ up(&umrc->sem);
+ dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
+
+free_pas:
+ kfree(mr_pas);
+
+free_mr:
+ if (err) {
+ free_cached_mr(dev, mr);
+ return ERR_PTR(err);
+ }
+
+ return mr;
+}
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
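+/* Rewrite a window of an ODP MR's MTT entries through the UMR QP.  The
+ * window is widened to MLX5_UMR_MTT_ALIGNMENT granularity; when 'zap'
+ * is set the entries are written as zero (non-present), otherwise they
+ * are repopulated from the umem.  A static emergency buffer is used
+ * when the atomic page allocation fails.
+ */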
+int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages,
+ int zap)
+{
+ struct mlx5_ib_dev *dev = mr->dev;
+ struct device *ddev = dev->ib_dev.dma_device;
+ struct umr_common *umrc = &dev->umrc;
+ struct mlx5_ib_umr_context umr_context;
+ struct ib_umem *umem = mr->umem;
+ int size;
+ __be64 *pas;
+ dma_addr_t dma;
+ struct ib_send_wr wr, *bad;
+ struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr.wr.fast_reg;
+ struct ib_sge sg;
+ int err = 0;
+ const int page_index_alignment = MLX5_UMR_MTT_ALIGNMENT / sizeof(u64);
+ const int page_index_mask = page_index_alignment - 1;
+ size_t pages_mapped = 0;
+ size_t pages_to_map = 0;
+ size_t pages_iter = 0;
+ int use_emergency_buf = 0;
+
+ /* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes,
+ * so we need to align the offset and length accordingly */
+ if (start_page_index & page_index_mask) {
+ npages += start_page_index & page_index_mask;
+ start_page_index &= ~page_index_mask;
+ }
+
+ pages_to_map = ALIGN(npages, page_index_alignment);
+
+ if (start_page_index + pages_to_map > MLX5_MAX_UMR_PAGES)
+ return -EINVAL;
+
+ size = sizeof(u64) * pages_to_map;
+ size = min_t(int, PAGE_SIZE, size);
+ /* We allocate with GFP_ATOMIC to avoid recursion into page-reclaim
+ * code, when we are called from an invalidation. The pas buffer must
+ * be 2k-aligned for Connect-IB. */
+ pas = (__be64 *)get_zeroed_page(GFP_ATOMIC);
+ if (!pas) {
+ mlx5_ib_warn(dev, "unable to allocate memory during MTT update, falling back to slower chunked mechanism.\n");
+ pas = mlx5_ib_update_mtt_emergency_buffer;
+ size = MLX5_UMR_MTT_MIN_CHUNK_SIZE;
+ use_emergency_buf = 1;
+ mutex_lock(&mlx5_ib_update_mtt_emergency_buffer_mutex);
+ memset(pas, 0, size);
+ }
+ pages_iter = size / sizeof(u64);
+ dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE);
+ if (dma_mapping_error(ddev, dma)) {
+ mlx5_ib_err(dev, "unable to map DMA during MTT update.\n");
+ err = -ENOMEM;
+ goto free_pas;
+ }
+
+ for (pages_mapped = 0;
+ pages_mapped < pages_to_map && !err;
+ pages_mapped += pages_iter, start_page_index += pages_iter) {
+ dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE);
+
+ npages = min_t(size_t,
+ pages_iter,
+ ib_umem_num_pages(umem) - start_page_index);
+
+ if (!zap) {
+ __mlx5_ib_populate_pas(dev, umem, PAGE_SHIFT,
+ start_page_index, npages, pas,
+ MLX5_IB_MTT_PRESENT);
+ /* Clear padding after the pages brought from the
+ * umem. */
+ memset(pas + npages, 0, size - npages * sizeof(u64));
+ }
+
+ dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE);
+
+ memset(&wr, 0, sizeof(wr));
+ wr.wr_id = (u64)(unsigned long)&umr_context;
+
+ sg.addr = dma;
+ sg.length = ALIGN(npages * sizeof(u64),
+ MLX5_UMR_MTT_ALIGNMENT);
+ sg.lkey = dev->umrc.mr->lkey;
+
+ wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE |
+ MLX5_IB_SEND_UMR_UPDATE_MTT;
+ wr.sg_list = &sg;
+ wr.num_sge = 1;
+ wr.opcode = MLX5_IB_WR_UMR;
+ umrwr->npages = sg.length / sizeof(u64);
+ umrwr->page_shift = PAGE_SHIFT;
+ umrwr->mkey = mr->mmr.key;
+ umrwr->target.offset = start_page_index;
+
+ mlx5_ib_init_umr_context(&umr_context);
+ down(&umrc->sem);
+ err = ib_post_send(umrc->qp, &wr, &bad);
+ if (err) {
+ mlx5_ib_err(dev, "UMR post send failed, err %d\n", err);
+ } else {
+ wait_for_completion(&umr_context.done);
+ if (umr_context.status != IB_WC_SUCCESS) {
+ mlx5_ib_err(dev, "UMR completion failed, code %d\n",
+ umr_context.status);
+ err = -EFAULT;
+ }
+ }
+ up(&umrc->sem);
+ }
+ dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
+
+free_pas:
+ if (!use_emergency_buf)
+ free_page((unsigned long)pas);
+ else
+ mutex_unlock(&mlx5_ib_update_mtt_emergency_buffer_mutex);
+
+ return err;
+}
+#endif
+
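+/*
+ * Create an MR directly with a create_mkey command instead of going through
+ * the UMR QP. This is the fallback used when the UMR path is not applicable
+ * for this order or when the MR cache is momentarily empty.
+ */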
+static struct mlx5_ib_mr *reg_create(struct ib_pd *pd, u64 virt_addr,
+ u64 length, struct ib_umem *umem,
+ int npages, int page_shift,
+ int access_flags)
+{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ struct mlx5_create_mkey_mbox_in *in;
+ struct mlx5_ib_mr *mr;
+ int inlen;
+ int err;
+ bool pg_cap = !!(dev->mdev->caps.gen.flags &
+ MLX5_DEV_CAP_FLAG_ON_DMND_PG);
+
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ inlen = sizeof(*in) + sizeof(*in->pas) * ((npages + 1) / 2) * 2;
+ in = mlx5_vzalloc(inlen);
+ if (!in) {
+ err = -ENOMEM;
+ goto err_1;
+ }
+ mlx5_ib_populate_pas(dev, umem, page_shift, in->pas,
+ pg_cap ? MLX5_IB_MTT_PRESENT : 0);
+
+ /* The MLX5_MKEY_INBOX_PG_ACCESS bit allows setting the access flags
+ * in the page list submitted with the command. */
+ in->flags = pg_cap ? cpu_to_be32(MLX5_MKEY_INBOX_PG_ACCESS) : 0;
+ in->seg.flags = convert_access(access_flags) |
+ MLX5_ACCESS_MODE_MTT;
+ in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
+ in->seg.start_addr = cpu_to_be64(virt_addr);
+ in->seg.len = cpu_to_be64(length);
+ in->seg.bsfs_octo_size = 0;
+ in->seg.xlt_oct_size = cpu_to_be32(get_octo_len(virt_addr, length, 1 << page_shift));
+ in->seg.log2_page_size = page_shift;
+ in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
+ in->xlat_oct_act_size = cpu_to_be32(get_octo_len(virt_addr, length,
+ 1 << page_shift));
+ err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, inlen, NULL,
+ NULL, NULL);
+ if (err) {
+ mlx5_ib_warn(dev, "create mkey failed\n");
+ goto err_2;
+ }
+ mr->umem = umem;
+ mr->dev = dev;
+ mr->live = 1;
+ kvfree(in);
+
+ mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmr.key);
+
+ return mr;
+
+err_2:
+ kvfree(in);
+
+err_1:
+ kfree(mr);
+
+ return ERR_PTR(err);
+}
+
+struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt_addr, int access_flags,
+ struct ib_udata *udata)
+{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ struct mlx5_ib_mr *mr = NULL;
+ struct ib_umem *umem;
+ int page_shift;
+ int npages;
+ int ncont;
+ int order;
+ int err;
+
+ mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
+ start, virt_addr, length, access_flags);
+ umem = ib_umem_get(pd->uobject->context, start, length, access_flags,
+ 0);
+ if (IS_ERR(umem)) {
+ mlx5_ib_dbg(dev, "umem get failed (%ld)\n", PTR_ERR(umem));
+ return (void *)umem;
+ }
+
+ mlx5_ib_cont_pages(umem, start, &npages, &page_shift, &ncont, &order);
+ if (!npages) {
+ mlx5_ib_warn(dev, "avoid zero region\n");
+ err = -EINVAL;
+ goto error;
+ }
+
+ mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n",
+ npages, ncont, order, page_shift);
+
+ if (use_umr(order)) {
+ mr = reg_umr(pd, umem, virt_addr, length, ncont, page_shift,
+ order, access_flags);
+ if (PTR_ERR(mr) == -EAGAIN) {
+			mlx5_ib_dbg(dev, "cache empty for order %d\n", order);
+ mr = NULL;
+ }
+ } else if (access_flags & IB_ACCESS_ON_DEMAND) {
+ err = -EINVAL;
+		pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB\n");
+ goto error;
+ }
+
+ if (!mr)
+ mr = reg_create(pd, virt_addr, length, umem, ncont, page_shift,
+ access_flags);
+
+ if (IS_ERR(mr)) {
+ err = PTR_ERR(mr);
+ goto error;
+ }
+
+ mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmr.key);
+
+ mr->umem = umem;
+ mr->npages = npages;
+ atomic_add(npages, &dev->mdev->priv.reg_pages);
+ mr->ibmr.lkey = mr->mmr.key;
+ mr->ibmr.rkey = mr->mmr.key;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+ if (umem->odp_data) {
+ /*
+		 * This barrier prevents the compiler from moving the
+		 * assignment of umem->odp_data->private (pointing it at
+		 * our MR) to before reg_umr has finished, so that the MR
+		 * initialization has completed before we start handling
+		 * invalidations.
+ */
+ smp_wmb();
+ mr->umem->odp_data->private = mr;
+ /*
+ * Make sure we will see the new
+ * umem->odp_data->private value in the invalidation
+ * routines, before we can get page faults on the
+ * MR. Page faults can happen once we put the MR in
+		 * the tree, below this line. Without the barrier, a page
+		 * fault could be handled and an invalidation could run
+		 * before umem->odp_data->private == mr becomes visible to
+		 * the invalidation handler.
+ */
+ smp_wmb();
+ }
+#endif
+
+ return &mr->ibmr;
+
+error:
+ /*
+ * Destroy the umem *before* destroying the MR, to ensure we
+ * will not have any in-flight notifiers when destroying the
+ * MR.
+ *
+ * As the MR is completely invalid to begin with, and this
+ * error path is only taken if we can't push the mr entry into
+ * the pagefault tree, this is safe.
+ */
+
+ ib_umem_release(umem);
+ /* Kill the MR, and return an error code. */
+ clean_mr(mr);
+ return ERR_PTR(err);
+}
+
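+/*
+ * Invalidate a cached MR's mkey with a UMR "unregister" work request so that
+ * the entry can be returned to the MR cache. As in reg_umr(), the completion
+ * is awaited under the UMR semaphore.
+ */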
+static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
+{
+ struct umr_common *umrc = &dev->umrc;
+ struct mlx5_ib_umr_context umr_context;
+ struct ib_send_wr wr, *bad;
+ int err;
+
+ memset(&wr, 0, sizeof(wr));
+ wr.wr_id = (u64)(unsigned long)&umr_context;
+ prep_umr_unreg_wqe(dev, &wr, mr->mmr.key);
+
+ mlx5_ib_init_umr_context(&umr_context);
+ down(&umrc->sem);
+ err = ib_post_send(umrc->qp, &wr, &bad);
+ if (err) {
+ up(&umrc->sem);
+ mlx5_ib_dbg(dev, "err %d\n", err);
+ goto error;
+ } else {
+ wait_for_completion(&umr_context.done);
+ up(&umrc->sem);
+ }
+ if (umr_context.status != IB_WC_SUCCESS) {
+ mlx5_ib_warn(dev, "unreg umr failed\n");
+ err = -EFAULT;
+ goto error;
+ }
+ return 0;
+
+error:
+ return err;
+}
+
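+/*
+ * Release the HW resources behind an MR: mkeys taken from the MR cache
+ * (mr->umred) are unregistered via UMR and handed back to the cache, while
+ * directly created mkeys are destroyed through the firmware and the
+ * mlx5_ib_mr structure itself is freed.
+ */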
+static int clean_mr(struct mlx5_ib_mr *mr)
+{
+ struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
+ int umred = mr->umred;
+ int err;
+
+ if (!umred) {
+ err = destroy_mkey(dev, mr);
+ if (err) {
+ mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
+ mr->mmr.key, err);
+ return err;
+ }
+ } else {
+ err = unreg_umr(dev, mr);
+ if (err) {
+ mlx5_ib_warn(dev, "failed unregister\n");
+ return err;
+ }
+ free_cached_mr(dev, mr);
+ }
+
+ if (!umred)
+ kfree(mr);
+
+ return 0;
+}
+
+int mlx5_ib_dereg_mr(struct ib_mr *ibmr)
+{
+ struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
+ struct mlx5_ib_mr *mr = to_mmr(ibmr);
+ int npages = mr->npages;
+ struct ib_umem *umem = mr->umem;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+ if (umem && umem->odp_data) {
+ /* Prevent new page faults from succeeding */
+ mr->live = 0;
+ /* Wait for all running page-fault handlers to finish. */
+ synchronize_srcu(&dev->mr_srcu);
+ /* Destroy all page mappings */
+ mlx5_ib_invalidate_range(umem, ib_umem_start(umem),
+ ib_umem_end(umem));
+ /*
+ * We kill the umem before the MR for ODP,
+ * so that there will not be any invalidations in
+ * flight, looking at the *mr struct.
+ */
+ ib_umem_release(umem);
+ atomic_sub(npages, &dev->mdev->priv.reg_pages);
+
+ /* Avoid double-freeing the umem. */
+ umem = NULL;
+ }
+#endif
+
+ clean_mr(mr);
+
+ if (umem) {
+ ib_umem_release(umem);
+ atomic_sub(npages, &dev->mdev->priv.reg_pages);
+ }
+
+ return 0;
+}
+
+struct ib_mr *mlx5_ib_create_mr(struct ib_pd *pd,
+ struct ib_mr_init_attr *mr_init_attr)
+{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ struct mlx5_create_mkey_mbox_in *in;
+ struct mlx5_ib_mr *mr;
+ int access_mode, err;
+ int ndescs = roundup(mr_init_attr->max_reg_descriptors, 4);
+
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ in = kzalloc(sizeof(*in), GFP_KERNEL);
+ if (!in) {
+ err = -ENOMEM;
+ goto err_free;
+ }
+
+ in->seg.status = MLX5_MKEY_STATUS_FREE;
+ in->seg.xlt_oct_size = cpu_to_be32(ndescs);
+ in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
+ in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
+ access_mode = MLX5_ACCESS_MODE_MTT;
+
+ if (mr_init_attr->flags & IB_MR_SIGNATURE_EN) {
+ u32 psv_index[2];
+
+ in->seg.flags_pd = cpu_to_be32(be32_to_cpu(in->seg.flags_pd) |
+ MLX5_MKEY_BSF_EN);
+ in->seg.bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE);
+ mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
+ if (!mr->sig) {
+ err = -ENOMEM;
+ goto err_free_in;
+ }
+
+ /* create mem & wire PSVs */
+ err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn,
+ 2, psv_index);
+ if (err)
+ goto err_free_sig;
+
+ access_mode = MLX5_ACCESS_MODE_KLM;
+ mr->sig->psv_memory.psv_idx = psv_index[0];
+ mr->sig->psv_wire.psv_idx = psv_index[1];
+
+ mr->sig->sig_status_checked = true;
+ mr->sig->sig_err_exists = false;
+ /* Next UMR, Arm SIGERR */
+ ++mr->sig->sigerr_count;
+ }
+
+ in->seg.flags = MLX5_PERM_UMR_EN | access_mode;
+ err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, sizeof(*in),
+ NULL, NULL, NULL);
+ if (err)
+ goto err_destroy_psv;
+
+ mr->ibmr.lkey = mr->mmr.key;
+ mr->ibmr.rkey = mr->mmr.key;
+ mr->umem = NULL;
+ kfree(in);
+
+ return &mr->ibmr;
+
+err_destroy_psv:
+ if (mr->sig) {
+ if (mlx5_core_destroy_psv(dev->mdev,
+ mr->sig->psv_memory.psv_idx))
+ mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
+ mr->sig->psv_memory.psv_idx);
+ if (mlx5_core_destroy_psv(dev->mdev,
+ mr->sig->psv_wire.psv_idx))
+ mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
+ mr->sig->psv_wire.psv_idx);
+ }
+err_free_sig:
+ kfree(mr->sig);
+err_free_in:
+ kfree(in);
+err_free:
+ kfree(mr);
+ return ERR_PTR(err);
+}
+
+int mlx5_ib_destroy_mr(struct ib_mr *ibmr)
+{
+ struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
+ struct mlx5_ib_mr *mr = to_mmr(ibmr);
+ int err;
+
+ if (mr->sig) {
+ if (mlx5_core_destroy_psv(dev->mdev,
+ mr->sig->psv_memory.psv_idx))
+ mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
+ mr->sig->psv_memory.psv_idx);
+ if (mlx5_core_destroy_psv(dev->mdev,
+ mr->sig->psv_wire.psv_idx))
+ mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
+ mr->sig->psv_wire.psv_idx);
+ kfree(mr->sig);
+ }
+
+ err = destroy_mkey(dev, mr);
+ if (err) {
+ mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
+ mr->mmr.key, err);
+ return err;
+ }
+
+ kfree(mr);
+
+ return err;
+}
+
+struct ib_mr *mlx5_ib_alloc_fast_reg_mr(struct ib_pd *pd,
+ int max_page_list_len)
+{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ struct mlx5_create_mkey_mbox_in *in;
+ struct mlx5_ib_mr *mr;
+ int err;
+
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ in = kzalloc(sizeof(*in), GFP_KERNEL);
+ if (!in) {
+ err = -ENOMEM;
+ goto err_free;
+ }
+
+ in->seg.status = MLX5_MKEY_STATUS_FREE;
+ in->seg.xlt_oct_size = cpu_to_be32((max_page_list_len + 1) / 2);
+ in->seg.qpn_mkey7_0 = cpu_to_be32(0xffffff << 8);
+ in->seg.flags = MLX5_PERM_UMR_EN | MLX5_ACCESS_MODE_MTT;
+ in->seg.flags_pd = cpu_to_be32(to_mpd(pd)->pdn);
+	/* TBD not needed - issue 197292 */
+ in->seg.log2_page_size = PAGE_SHIFT;
+
+ err = mlx5_core_create_mkey(dev->mdev, &mr->mmr, in, sizeof(*in), NULL,
+ NULL, NULL);
+ kfree(in);
+ if (err)
+ goto err_free;
+
+ mr->ibmr.lkey = mr->mmr.key;
+ mr->ibmr.rkey = mr->mmr.key;
+ mr->umem = NULL;
+
+ return &mr->ibmr;
+
+err_free:
+ kfree(mr);
+ return ERR_PTR(err);
+}
+
+struct ib_fast_reg_page_list *mlx5_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
+ int page_list_len)
+{
+ struct mlx5_ib_fast_reg_page_list *mfrpl;
+ int size = page_list_len * sizeof(u64);
+
+ mfrpl = kmalloc(sizeof(*mfrpl), GFP_KERNEL);
+ if (!mfrpl)
+ return ERR_PTR(-ENOMEM);
+
+ mfrpl->ibfrpl.page_list = kmalloc(size, GFP_KERNEL);
+ if (!mfrpl->ibfrpl.page_list)
+ goto err_free;
+
+ mfrpl->mapped_page_list = dma_alloc_coherent(ibdev->dma_device,
+ size, &mfrpl->map,
+ GFP_KERNEL);
+ if (!mfrpl->mapped_page_list)
+ goto err_free;
+
+ WARN_ON(mfrpl->map & 0x3f);
+
+ return &mfrpl->ibfrpl;
+
+err_free:
+ kfree(mfrpl->ibfrpl.page_list);
+ kfree(mfrpl);
+ return ERR_PTR(-ENOMEM);
+}
+
+void mlx5_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
+{
+ struct mlx5_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list);
+ struct mlx5_ib_dev *dev = to_mdev(page_list->device);
+ int size = page_list->max_page_list_len * sizeof(u64);
+
+ dma_free_coherent(&dev->mdev->pdev->dev, size, mfrpl->mapped_page_list,
+ mfrpl->map);
+ kfree(mfrpl->ibfrpl.page_list);
+ kfree(mfrpl);
+}
+
+int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
+ struct ib_mr_status *mr_status)
+{
+ struct mlx5_ib_mr *mmr = to_mmr(ibmr);
+ int ret = 0;
+
+ if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
+ pr_err("Invalid status check mask\n");
+ ret = -EINVAL;
+ goto done;
+ }
+
+ mr_status->fail_status = 0;
+ if (check_mask & IB_MR_CHECK_SIG_STATUS) {
+ if (!mmr->sig) {
+ ret = -EINVAL;
+ pr_err("signature status check requested on a non-signature enabled MR\n");
+ goto done;
+ }
+
+ mmr->sig->sig_status_checked = true;
+ if (!mmr->sig->sig_err_exists)
+ goto done;
+
+ if (ibmr->lkey == mmr->sig->err_item.key)
+ memcpy(&mr_status->sig_err, &mmr->sig->err_item,
+ sizeof(mr_status->sig_err));
+ else {
+ mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
+ mr_status->sig_err.sig_err_offset = 0;
+ mr_status->sig_err.key = mmr->sig->err_item.key;
+ }
+
+ mmr->sig->sig_err_exists = false;
+ mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
+ }
+
+done:
+ return ret;
+}
diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c
new file mode 100644
index 000000000..5099db08a
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/odp.c
@@ -0,0 +1,798 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_umem.h>
+#include <rdma/ib_umem_odp.h>
+
+#include "mlx5_ib.h"
+
+#define MAX_PREFETCH_LEN (4*1024*1024U)
+
+/* Timeout in ms to wait for an active mmu notifier to complete when handling
+ * a pagefault. */
+#define MMU_NOTIFIER_TIMEOUT 1000
+
+struct workqueue_struct *mlx5_ib_page_fault_wq;
+
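+/*
+ * Invalidate a range of an ODP umem: zap the MTT entries that are currently
+ * mapped, batching them into UMR-aligned blocks to keep the number of UMR
+ * operations down, and then unmap and release the DMA-mapped pages for the
+ * range.
+ */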
+void mlx5_ib_invalidate_range(struct ib_umem *umem, unsigned long start,
+ unsigned long end)
+{
+ struct mlx5_ib_mr *mr;
+ const u64 umr_block_mask = (MLX5_UMR_MTT_ALIGNMENT / sizeof(u64)) - 1;
+ u64 idx = 0, blk_start_idx = 0;
+ int in_block = 0;
+ u64 addr;
+
+ if (!umem || !umem->odp_data) {
+ pr_err("invalidation called on NULL umem or non-ODP umem\n");
+ return;
+ }
+
+ mr = umem->odp_data->private;
+
+ if (!mr || !mr->ibmr.pd)
+ return;
+
+ start = max_t(u64, ib_umem_start(umem), start);
+ end = min_t(u64, ib_umem_end(umem), end);
+
+ /*
+ * Iteration one - zap the HW's MTTs. The notifiers_count ensures that
+ * while we are doing the invalidation, no page fault will attempt to
+	 * overwrite the same MTTs. Concurrent invalidations might race us,
+ * but they will write 0s as well, so no difference in the end result.
+ */
+
+ for (addr = start; addr < end; addr += (u64)umem->page_size) {
+ idx = (addr - ib_umem_start(umem)) / PAGE_SIZE;
+ /*
+ * Strive to write the MTTs in chunks, but avoid overwriting
+		 * non-existing MTTs. The heuristic here can be improved to
+		 * estimate the cost of another UMR vs. the cost of a bigger
+		 * UMR.
+ */
+ if (umem->odp_data->dma_list[idx] &
+ (ODP_READ_ALLOWED_BIT | ODP_WRITE_ALLOWED_BIT)) {
+ if (!in_block) {
+ blk_start_idx = idx;
+ in_block = 1;
+ }
+ } else {
+ u64 umr_offset = idx & umr_block_mask;
+
+ if (in_block && umr_offset == 0) {
+ mlx5_ib_update_mtt(mr, blk_start_idx,
+ idx - blk_start_idx, 1);
+ in_block = 0;
+ }
+ }
+ }
+ if (in_block)
+ mlx5_ib_update_mtt(mr, blk_start_idx, idx - blk_start_idx + 1,
+ 1);
+
+ /*
+ * We are now sure that the device will not access the
+ * memory. We can safely unmap it, and mark it as dirty if
+ * needed.
+ */
+
+ ib_umem_odp_unmap_dma_pages(umem, start, end);
+}
+
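+/*
+ * Copy a single ODP capability bit from the (big-endian) firmware capability
+ * structure into the corresponding bit of the IB core ib_odp_caps field.
+ */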
+#define COPY_ODP_BIT_MLX_TO_IB(reg, ib_caps, field_name, bit_name) do { \
+ if (be32_to_cpu(reg.field_name) & MLX5_ODP_SUPPORT_##bit_name) \
+ ib_caps->field_name |= IB_ODP_SUPPORT_##bit_name; \
+} while (0)
+
+int mlx5_ib_internal_query_odp_caps(struct mlx5_ib_dev *dev)
+{
+ int err;
+ struct mlx5_odp_caps hw_caps;
+ struct ib_odp_caps *caps = &dev->odp_caps;
+
+ memset(caps, 0, sizeof(*caps));
+
+ if (!(dev->mdev->caps.gen.flags & MLX5_DEV_CAP_FLAG_ON_DMND_PG))
+ return 0;
+
+ err = mlx5_query_odp_caps(dev->mdev, &hw_caps);
+ if (err)
+ goto out;
+
+ caps->general_caps = IB_ODP_SUPPORT;
+ COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.ud_odp_caps,
+ SEND);
+ COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+ SEND);
+ COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+ RECV);
+ COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+ WRITE);
+ COPY_ODP_BIT_MLX_TO_IB(hw_caps, caps, per_transport_caps.rc_odp_caps,
+ READ);
+
+out:
+ return err;
+}
+
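+/*
+ * Look up the MR that owns an lkey reported in a page fault. The lookup is
+ * done on the base mkey, and both the full key and the MR's live flag are
+ * checked so that faults racing with deregistration are rejected. The caller
+ * must hold the mr_srcu read lock.
+ */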
+static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev,
+ u32 key)
+{
+ u32 base_key = mlx5_base_mkey(key);
+ struct mlx5_core_mr *mmr = __mlx5_mr_lookup(dev->mdev, base_key);
+ struct mlx5_ib_mr *mr = container_of(mmr, struct mlx5_ib_mr, mmr);
+
+ if (!mmr || mmr->key != key || !mr->live)
+ return NULL;
+
+	return mr;
+}
+
+static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp,
+ struct mlx5_ib_pfault *pfault,
+				      int error)
+{
+ struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
+ int ret = mlx5_core_page_fault_resume(dev->mdev, qp->mqp.qpn,
+ pfault->mpfault.flags,
+ error);
+ if (ret)
+ pr_err("Failed to resolve the page fault on QP 0x%x\n",
+ qp->mqp.qpn);
+}
+
+/*
+ * Handle a single data segment in a page-fault WQE.
+ *
+ * Returns number of pages retrieved on success. The caller will continue to
+ * the next data segment.
+ * Can return the following error codes:
+ * -EAGAIN to designate a temporary error. The caller will abort handling the
+ * page fault and resolve it.
+ * -EFAULT when there's an error mapping the requested pages. The caller will
+ * abort the page fault handling and possibly move the QP to an error state.
+ * On other errors the QP should also be closed with an error.
+ */
+static int pagefault_single_data_segment(struct mlx5_ib_qp *qp,
+ struct mlx5_ib_pfault *pfault,
+ u32 key, u64 io_virt, size_t bcnt,
+ u32 *bytes_mapped)
+{
+ struct mlx5_ib_dev *mib_dev = to_mdev(qp->ibqp.pd->device);
+ int srcu_key;
+ unsigned int current_seq;
+ u64 start_idx;
+ int npages = 0, ret = 0;
+ struct mlx5_ib_mr *mr;
+ u64 access_mask = ODP_READ_ALLOWED_BIT;
+
+ srcu_key = srcu_read_lock(&mib_dev->mr_srcu);
+ mr = mlx5_ib_odp_find_mr_lkey(mib_dev, key);
+ /*
+ * If we didn't find the MR, it means the MR was closed while we were
+ * handling the ODP event. In this case we return -EFAULT so that the
+ * QP will be closed.
+ */
+ if (!mr || !mr->ibmr.pd) {
+ pr_err("Failed to find relevant mr for lkey=0x%06x, probably the MR was destroyed\n",
+ key);
+ ret = -EFAULT;
+ goto srcu_unlock;
+ }
+ if (!mr->umem->odp_data) {
+ pr_debug("skipping non ODP MR (lkey=0x%06x) in page fault handler.\n",
+ key);
+ if (bytes_mapped)
+ *bytes_mapped +=
+ (bcnt - pfault->mpfault.bytes_committed);
+ goto srcu_unlock;
+ }
+ if (mr->ibmr.pd != qp->ibqp.pd) {
+ pr_err("Page-fault with different PDs for QP and MR.\n");
+ ret = -EFAULT;
+ goto srcu_unlock;
+ }
+
+ current_seq = ACCESS_ONCE(mr->umem->odp_data->notifiers_seq);
+ /*
+ * Ensure the sequence number is valid for some time before we call
+ * gup.
+ */
+ smp_rmb();
+
+ /*
+ * Avoid branches - this code will perform correctly
+ * in all iterations (in iteration 2 and above,
+ * bytes_committed == 0).
+ */
+ io_virt += pfault->mpfault.bytes_committed;
+ bcnt -= pfault->mpfault.bytes_committed;
+
+ start_idx = (io_virt - (mr->mmr.iova & PAGE_MASK)) >> PAGE_SHIFT;
+
+ if (mr->umem->writable)
+ access_mask |= ODP_WRITE_ALLOWED_BIT;
+ npages = ib_umem_odp_map_dma_pages(mr->umem, io_virt, bcnt,
+ access_mask, current_seq);
+ if (npages < 0) {
+ ret = npages;
+ goto srcu_unlock;
+ }
+
+ if (npages > 0) {
+ mutex_lock(&mr->umem->odp_data->umem_mutex);
+ if (!ib_umem_mmu_notifier_retry(mr->umem, current_seq)) {
+ /*
+ * No need to check whether the MTTs really belong to
+ * this MR, since ib_umem_odp_map_dma_pages already
+ * checks this.
+ */
+ ret = mlx5_ib_update_mtt(mr, start_idx, npages, 0);
+ } else {
+ ret = -EAGAIN;
+ }
+ mutex_unlock(&mr->umem->odp_data->umem_mutex);
+ if (ret < 0) {
+ if (ret != -EAGAIN)
+ pr_err("Failed to update mkey page tables\n");
+ goto srcu_unlock;
+ }
+
+ if (bytes_mapped) {
+ u32 new_mappings = npages * PAGE_SIZE -
+ (io_virt - round_down(io_virt, PAGE_SIZE));
+ *bytes_mapped += min_t(u32, new_mappings, bcnt);
+ }
+ }
+
+srcu_unlock:
+ if (ret == -EAGAIN) {
+ if (!mr->umem->odp_data->dying) {
+ struct ib_umem_odp *odp_data = mr->umem->odp_data;
+ unsigned long timeout =
+ msecs_to_jiffies(MMU_NOTIFIER_TIMEOUT);
+
+ if (!wait_for_completion_timeout(
+ &odp_data->notifier_completion,
+ timeout)) {
+ pr_warn("timeout waiting for mmu notifier completion\n");
+ }
+ } else {
+ /* The MR is being killed, kill the QP as well. */
+ ret = -EFAULT;
+ }
+ }
+ srcu_read_unlock(&mib_dev->mr_srcu, srcu_key);
+ pfault->mpfault.bytes_committed = 0;
+ return ret ? ret : npages;
+}
+
+/**
+ * pagefault_data_segments() - Parse a series of data segments for page fault handling.
+ * @qp: the QP on which the fault occurred.
+ * @pfault: contains page fault information.
+ * @wqe: points at the first data segment in the WQE.
+ * @wqe_end: points after the end of the WQE.
+ * @bytes_mapped: receives the number of bytes that the function was able to
+ *		map. This allows the caller to decide intelligently whether
+ *		enough memory was mapped to resolve the page fault
+ *		successfully (e.g. enough for the next MTU, or the entire
+ *		WQE).
+ * @total_wqe_bytes: receives the total data size of this WQE in bytes (minus
+ *		the committed bytes).
+ * @receive_queue: non-zero when the WQE being parsed belongs to a receive
+ *		queue rather than a send queue.
+ *
+ * Returns the number of pages loaded if positive, zero for an empty WQE, or a
+ * negative error code.
+ */
+static int pagefault_data_segments(struct mlx5_ib_qp *qp,
+ struct mlx5_ib_pfault *pfault, void *wqe,
+ void *wqe_end, u32 *bytes_mapped,
+ u32 *total_wqe_bytes, int receive_queue)
+{
+ int ret = 0, npages = 0;
+ u64 io_virt;
+ u32 key;
+ u32 byte_count;
+ size_t bcnt;
+ int inline_segment;
+
+ /* Skip SRQ next-WQE segment. */
+ if (receive_queue && qp->ibqp.srq)
+ wqe += sizeof(struct mlx5_wqe_srq_next_seg);
+
+ if (bytes_mapped)
+ *bytes_mapped = 0;
+ if (total_wqe_bytes)
+ *total_wqe_bytes = 0;
+
+ while (wqe < wqe_end) {
+ struct mlx5_wqe_data_seg *dseg = wqe;
+
+ io_virt = be64_to_cpu(dseg->addr);
+ key = be32_to_cpu(dseg->lkey);
+ byte_count = be32_to_cpu(dseg->byte_count);
+ inline_segment = !!(byte_count & MLX5_INLINE_SEG);
+ bcnt = byte_count & ~MLX5_INLINE_SEG;
+
+ if (inline_segment) {
+ bcnt = bcnt & MLX5_WQE_INLINE_SEG_BYTE_COUNT_MASK;
+ wqe += ALIGN(sizeof(struct mlx5_wqe_inline_seg) + bcnt,
+ 16);
+ } else {
+ wqe += sizeof(*dseg);
+ }
+
+ /* receive WQE end of sg list. */
+ if (receive_queue && bcnt == 0 && key == MLX5_INVALID_LKEY &&
+ io_virt == 0)
+ break;
+
+ if (!inline_segment && total_wqe_bytes) {
+ *total_wqe_bytes += bcnt - min_t(size_t, bcnt,
+ pfault->mpfault.bytes_committed);
+ }
+
+ /* A zero length data segment designates a length of 2GB. */
+ if (bcnt == 0)
+ bcnt = 1U << 31;
+
+ if (inline_segment || bcnt <= pfault->mpfault.bytes_committed) {
+ pfault->mpfault.bytes_committed -=
+ min_t(size_t, bcnt,
+ pfault->mpfault.bytes_committed);
+ continue;
+ }
+
+ ret = pagefault_single_data_segment(qp, pfault, key, io_virt,
+ bcnt, bytes_mapped);
+ if (ret < 0)
+ break;
+ npages += ret;
+ }
+
+ return ret < 0 ? ret : npages;
+}
+
+/*
+ * Parse initiator WQE. Advances the wqe pointer to point at the
+ * scatter-gather list, and set wqe_end to the end of the WQE.
+ */
+static int mlx5_ib_mr_initiator_pfault_handler(
+ struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault,
+ void **wqe, void **wqe_end, int wqe_length)
+{
+ struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
+ struct mlx5_wqe_ctrl_seg *ctrl = *wqe;
+ u16 wqe_index = pfault->mpfault.wqe.wqe_index;
+ unsigned ds, opcode;
+#if defined(DEBUG)
+ u32 ctrl_wqe_index, ctrl_qpn;
+#endif
+
+ ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
+ if (ds * MLX5_WQE_DS_UNITS > wqe_length) {
+ mlx5_ib_err(dev, "Unable to read the complete WQE. ds = 0x%x, ret = 0x%x\n",
+ ds, wqe_length);
+ return -EFAULT;
+ }
+
+ if (ds == 0) {
+ mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n",
+ wqe_index, qp->mqp.qpn);
+ return -EFAULT;
+ }
+
+#if defined(DEBUG)
+ ctrl_wqe_index = (be32_to_cpu(ctrl->opmod_idx_opcode) &
+ MLX5_WQE_CTRL_WQE_INDEX_MASK) >>
+ MLX5_WQE_CTRL_WQE_INDEX_SHIFT;
+ if (wqe_index != ctrl_wqe_index) {
+ mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n",
+ wqe_index, qp->mqp.qpn,
+ ctrl_wqe_index);
+ return -EFAULT;
+ }
+
+ ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >>
+ MLX5_WQE_CTRL_QPN_SHIFT;
+ if (qp->mqp.qpn != ctrl_qpn) {
+ mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n",
+ wqe_index, qp->mqp.qpn,
+ ctrl_qpn);
+ return -EFAULT;
+ }
+#endif /* DEBUG */
+
+ *wqe_end = *wqe + ds * MLX5_WQE_DS_UNITS;
+ *wqe += sizeof(*ctrl);
+
+ opcode = be32_to_cpu(ctrl->opmod_idx_opcode) &
+ MLX5_WQE_CTRL_OPCODE_MASK;
+ switch (qp->ibqp.qp_type) {
+ case IB_QPT_RC:
+ switch (opcode) {
+ case MLX5_OPCODE_SEND:
+ case MLX5_OPCODE_SEND_IMM:
+ case MLX5_OPCODE_SEND_INVAL:
+ if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
+ IB_ODP_SUPPORT_SEND))
+ goto invalid_transport_or_opcode;
+ break;
+ case MLX5_OPCODE_RDMA_WRITE:
+ case MLX5_OPCODE_RDMA_WRITE_IMM:
+ if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
+ IB_ODP_SUPPORT_WRITE))
+ goto invalid_transport_or_opcode;
+ *wqe += sizeof(struct mlx5_wqe_raddr_seg);
+ break;
+ case MLX5_OPCODE_RDMA_READ:
+ if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
+ IB_ODP_SUPPORT_READ))
+ goto invalid_transport_or_opcode;
+ *wqe += sizeof(struct mlx5_wqe_raddr_seg);
+ break;
+ default:
+ goto invalid_transport_or_opcode;
+ }
+ break;
+ case IB_QPT_UD:
+ switch (opcode) {
+ case MLX5_OPCODE_SEND:
+ case MLX5_OPCODE_SEND_IMM:
+ if (!(dev->odp_caps.per_transport_caps.ud_odp_caps &
+ IB_ODP_SUPPORT_SEND))
+ goto invalid_transport_or_opcode;
+ *wqe += sizeof(struct mlx5_wqe_datagram_seg);
+ break;
+ default:
+ goto invalid_transport_or_opcode;
+ }
+ break;
+ default:
+invalid_transport_or_opcode:
+ mlx5_ib_err(dev, "ODP fault on QP of an unsupported opcode or transport. transport: 0x%x opcode: 0x%x.\n",
+ qp->ibqp.qp_type, opcode);
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/*
+ * Parse responder WQE. Advances the wqe pointer to point at the
+ * scatter-gather list, and set wqe_end to the end of the WQE.
+ */
+static int mlx5_ib_mr_responder_pfault_handler(
+ struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault,
+ void **wqe, void **wqe_end, int wqe_length)
+{
+ struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
+ struct mlx5_ib_wq *wq = &qp->rq;
+ int wqe_size = 1 << wq->wqe_shift;
+
+ if (qp->ibqp.srq) {
+ mlx5_ib_err(dev, "ODP fault on SRQ is not supported\n");
+ return -EFAULT;
+ }
+
+ if (qp->wq_sig) {
+ mlx5_ib_err(dev, "ODP fault with WQE signatures is not supported\n");
+ return -EFAULT;
+ }
+
+ if (wqe_size > wqe_length) {
+ mlx5_ib_err(dev, "Couldn't read all of the receive WQE's content\n");
+ return -EFAULT;
+ }
+
+ switch (qp->ibqp.qp_type) {
+ case IB_QPT_RC:
+ if (!(dev->odp_caps.per_transport_caps.rc_odp_caps &
+ IB_ODP_SUPPORT_RECV))
+ goto invalid_transport_or_opcode;
+ break;
+ default:
+invalid_transport_or_opcode:
+ mlx5_ib_err(dev, "ODP fault on QP of an unsupported transport. transport: 0x%x\n",
+ qp->ibqp.qp_type);
+ return -EFAULT;
+ }
+
+ *wqe_end = *wqe + wqe_size;
+
+ return 0;
+}
+
+static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp,
+ struct mlx5_ib_pfault *pfault)
+{
+ struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device);
+ int ret;
+ void *wqe, *wqe_end;
+ u32 bytes_mapped, total_wqe_bytes;
+ char *buffer = NULL;
+ int resume_with_error = 0;
+ u16 wqe_index = pfault->mpfault.wqe.wqe_index;
+ int requestor = pfault->mpfault.flags & MLX5_PFAULT_REQUESTOR;
+
+ buffer = (char *)__get_free_page(GFP_KERNEL);
+ if (!buffer) {
+ mlx5_ib_err(dev, "Error allocating memory for IO page fault handling.\n");
+ resume_with_error = 1;
+ goto resolve_page_fault;
+ }
+
+ ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer,
+ PAGE_SIZE);
+ if (ret < 0) {
+ mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%x, wqe_index=%x, qpn=%x\n",
+ -ret, wqe_index, qp->mqp.qpn);
+ resume_with_error = 1;
+ goto resolve_page_fault;
+ }
+
+ wqe = buffer;
+ if (requestor)
+ ret = mlx5_ib_mr_initiator_pfault_handler(qp, pfault, &wqe,
+ &wqe_end, ret);
+ else
+ ret = mlx5_ib_mr_responder_pfault_handler(qp, pfault, &wqe,
+ &wqe_end, ret);
+ if (ret < 0) {
+ resume_with_error = 1;
+ goto resolve_page_fault;
+ }
+
+ if (wqe >= wqe_end) {
+ mlx5_ib_err(dev, "ODP fault on invalid WQE.\n");
+ resume_with_error = 1;
+ goto resolve_page_fault;
+ }
+
+ ret = pagefault_data_segments(qp, pfault, wqe, wqe_end, &bytes_mapped,
+ &total_wqe_bytes, !requestor);
+ if (ret == -EAGAIN) {
+ goto resolve_page_fault;
+ } else if (ret < 0 || total_wqe_bytes > bytes_mapped) {
+ mlx5_ib_err(dev, "Error getting user pages for page fault. Error: 0x%x\n",
+ -ret);
+ resume_with_error = 1;
+ goto resolve_page_fault;
+ }
+
+resolve_page_fault:
+ mlx5_ib_page_fault_resume(qp, pfault, resume_with_error);
+ mlx5_ib_dbg(dev, "PAGE FAULT completed. QP 0x%x resume_with_error=%d, flags: 0x%x\n",
+ qp->mqp.qpn, resume_with_error, pfault->mpfault.flags);
+
+ free_page((unsigned long)buffer);
+}
+
+static int pages_in_range(u64 address, u32 length)
+{
+ return (ALIGN(address + length, PAGE_SIZE) -
+ (address & PAGE_MASK)) >> PAGE_SHIFT;
+}
+
+static void mlx5_ib_mr_rdma_pfault_handler(struct mlx5_ib_qp *qp,
+ struct mlx5_ib_pfault *pfault)
+{
+ struct mlx5_pagefault *mpfault = &pfault->mpfault;
+ u64 address;
+ u32 length;
+ u32 prefetch_len = mpfault->bytes_committed;
+ int prefetch_activated = 0;
+ u32 rkey = mpfault->rdma.r_key;
+ int ret;
+
+ /* The RDMA responder handler handles the page fault in two parts.
+ * First it brings the necessary pages for the current packet
+ * (and uses the pfault context), and then (after resuming the QP)
+ * prefetches more pages. The second operation cannot use the pfault
+ * context and therefore uses the dummy_pfault context allocated on
+ * the stack */
+ struct mlx5_ib_pfault dummy_pfault = {};
+
+ dummy_pfault.mpfault.bytes_committed = 0;
+
+ mpfault->rdma.rdma_va += mpfault->bytes_committed;
+ mpfault->rdma.rdma_op_len -= min(mpfault->bytes_committed,
+ mpfault->rdma.rdma_op_len);
+ mpfault->bytes_committed = 0;
+
+ address = mpfault->rdma.rdma_va;
+ length = mpfault->rdma.rdma_op_len;
+
+ /* For some operations, the hardware cannot tell the exact message
+ * length, and in those cases it reports zero. Use prefetch
+ * logic. */
+ if (length == 0) {
+ prefetch_activated = 1;
+ length = mpfault->rdma.packet_size;
+ prefetch_len = min(MAX_PREFETCH_LEN, prefetch_len);
+ }
+
+ ret = pagefault_single_data_segment(qp, pfault, rkey, address, length,
+ NULL);
+ if (ret == -EAGAIN) {
+ /* We're racing with an invalidation, don't prefetch */
+ prefetch_activated = 0;
+ } else if (ret < 0 || pages_in_range(address, length) > ret) {
+ mlx5_ib_page_fault_resume(qp, pfault, 1);
+ return;
+ }
+
+ mlx5_ib_page_fault_resume(qp, pfault, 0);
+
+ /* At this point, there might be a new pagefault already arriving in
+ * the eq, switch to the dummy pagefault for the rest of the
+ * processing. We're still OK with the objects being alive as the
+ * work-queue is being fenced. */
+
+ if (prefetch_activated) {
+ ret = pagefault_single_data_segment(qp, &dummy_pfault, rkey,
+ address,
+ prefetch_len,
+ NULL);
+ if (ret < 0) {
+ pr_warn("Prefetch failed (ret = %d, prefetch_activated = %d) for QPN %d, address: 0x%.16llx, length = 0x%.16x\n",
+ ret, prefetch_activated,
+ qp->ibqp.qp_num, address, prefetch_len);
+ }
+ }
+}
+
+void mlx5_ib_mr_pfault_handler(struct mlx5_ib_qp *qp,
+ struct mlx5_ib_pfault *pfault)
+{
+ u8 event_subtype = pfault->mpfault.event_subtype;
+
+ switch (event_subtype) {
+ case MLX5_PFAULT_SUBTYPE_WQE:
+ mlx5_ib_mr_wqe_pfault_handler(qp, pfault);
+ break;
+ case MLX5_PFAULT_SUBTYPE_RDMA:
+ mlx5_ib_mr_rdma_pfault_handler(qp, pfault);
+ break;
+ default:
+ pr_warn("Invalid page fault event subtype: 0x%x\n",
+ event_subtype);
+ mlx5_ib_page_fault_resume(qp, pfault, 1);
+ break;
+ }
+}
+
+static void mlx5_ib_qp_pfault_action(struct work_struct *work)
+{
+ struct mlx5_ib_pfault *pfault = container_of(work,
+ struct mlx5_ib_pfault,
+ work);
+ enum mlx5_ib_pagefault_context context =
+ mlx5_ib_get_pagefault_context(&pfault->mpfault);
+ struct mlx5_ib_qp *qp = container_of(pfault, struct mlx5_ib_qp,
+ pagefaults[context]);
+ mlx5_ib_mr_pfault_handler(qp, pfault);
+}
+
+void mlx5_ib_qp_disable_pagefaults(struct mlx5_ib_qp *qp)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->disable_page_faults_lock, flags);
+ qp->disable_page_faults = 1;
+ spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags);
+
+ /*
+	 * Note that at this point, we are guaranteed that no more
+ * work queue elements will be posted to the work queue with
+ * the QP we are closing.
+ */
+ flush_workqueue(mlx5_ib_page_fault_wq);
+}
+
+void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->disable_page_faults_lock, flags);
+ qp->disable_page_faults = 0;
+ spin_unlock_irqrestore(&qp->disable_page_faults_lock, flags);
+}
+
+static void mlx5_ib_pfault_handler(struct mlx5_core_qp *qp,
+ struct mlx5_pagefault *pfault)
+{
+ /*
+ * Note that we will only get one fault event per QP per context
+ * (responder/initiator, read/write), until we resolve the page fault
+ * with the mlx5_ib_page_fault_resume command. Since this function is
+ * called from within the work element, there is no risk of missing
+ * events.
+ */
+ struct mlx5_ib_qp *mibqp = to_mibqp(qp);
+ enum mlx5_ib_pagefault_context context =
+ mlx5_ib_get_pagefault_context(pfault);
+ struct mlx5_ib_pfault *qp_pfault = &mibqp->pagefaults[context];
+
+ qp_pfault->mpfault = *pfault;
+
+ /* No need to stop interrupts here since we are in an interrupt */
+ spin_lock(&mibqp->disable_page_faults_lock);
+ if (!mibqp->disable_page_faults)
+ queue_work(mlx5_ib_page_fault_wq, &qp_pfault->work);
+ spin_unlock(&mibqp->disable_page_faults_lock);
+}
+
+void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp)
+{
+ int i;
+
+ qp->disable_page_faults = 1;
+ spin_lock_init(&qp->disable_page_faults_lock);
+
+ qp->mqp.pfault_handler = mlx5_ib_pfault_handler;
+
+ for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i)
+ INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action);
+}
+
+int mlx5_ib_odp_init_one(struct mlx5_ib_dev *ibdev)
+{
+ int ret;
+
+ ret = init_srcu_struct(&ibdev->mr_srcu);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+void mlx5_ib_odp_remove_one(struct mlx5_ib_dev *ibdev)
+{
+ cleanup_srcu_struct(&ibdev->mr_srcu);
+}
+
+int __init mlx5_ib_odp_init(void)
+{
+ mlx5_ib_page_fault_wq =
+ create_singlethread_workqueue("mlx5_ib_page_faults");
+ if (!mlx5_ib_page_fault_wq)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void mlx5_ib_odp_cleanup(void)
+{
+ destroy_workqueue(mlx5_ib_page_fault_wq);
+}
diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c
new file mode 100644
index 000000000..d35f62d4f
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/qp.c
@@ -0,0 +1,3174 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <rdma/ib_umem.h>
+#include "mlx5_ib.h"
+#include "user.h"
+
+/* not supported currently */
+static int wq_signature;
+
+enum {
+ MLX5_IB_ACK_REQ_FREQ = 8,
+};
+
+enum {
+ MLX5_IB_DEFAULT_SCHED_QUEUE = 0x83,
+ MLX5_IB_DEFAULT_QP0_SCHED_QUEUE = 0x3f,
+ MLX5_IB_LINK_TYPE_IB = 0,
+ MLX5_IB_LINK_TYPE_ETH = 1
+};
+
+enum {
+ MLX5_IB_SQ_STRIDE = 6,
+ MLX5_IB_CACHE_LINE_SIZE = 64,
+};
+
+static const u32 mlx5_ib_opcode[] = {
+ [IB_WR_SEND] = MLX5_OPCODE_SEND,
+ [IB_WR_SEND_WITH_IMM] = MLX5_OPCODE_SEND_IMM,
+ [IB_WR_RDMA_WRITE] = MLX5_OPCODE_RDMA_WRITE,
+ [IB_WR_RDMA_WRITE_WITH_IMM] = MLX5_OPCODE_RDMA_WRITE_IMM,
+ [IB_WR_RDMA_READ] = MLX5_OPCODE_RDMA_READ,
+ [IB_WR_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_CS,
+ [IB_WR_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_FA,
+ [IB_WR_SEND_WITH_INV] = MLX5_OPCODE_SEND_INVAL,
+ [IB_WR_LOCAL_INV] = MLX5_OPCODE_UMR,
+ [IB_WR_FAST_REG_MR] = MLX5_OPCODE_UMR,
+ [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = MLX5_OPCODE_ATOMIC_MASKED_CS,
+ [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = MLX5_OPCODE_ATOMIC_MASKED_FA,
+ [MLX5_IB_WR_UMR] = MLX5_OPCODE_UMR,
+};
+
+
+static int is_qp0(enum ib_qp_type qp_type)
+{
+ return qp_type == IB_QPT_SMI;
+}
+
+static int is_qp1(enum ib_qp_type qp_type)
+{
+ return qp_type == IB_QPT_GSI;
+}
+
+static int is_sqp(enum ib_qp_type qp_type)
+{
+ return is_qp0(qp_type) || is_qp1(qp_type);
+}
+
+static void *get_wqe(struct mlx5_ib_qp *qp, int offset)
+{
+ return mlx5_buf_offset(&qp->buf, offset);
+}
+
+static void *get_recv_wqe(struct mlx5_ib_qp *qp, int n)
+{
+ return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift));
+}
+
+void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n)
+{
+ return get_wqe(qp, qp->sq.offset + (n << MLX5_IB_SQ_STRIDE));
+}
+
+/**
+ * mlx5_ib_read_user_wqe() - Copy a user-space WQE to kernel space.
+ *
+ * @qp: QP to copy from.
+ * @send: copy from the send queue when non-zero, use the receive queue
+ * otherwise.
+ * @wqe_index: index to start copying from. For send work queues, the
+ * wqe_index is in units of MLX5_SEND_WQE_BB.
+ *		For receive work queues, it is the index of the work queue
+ *		element in the queue.
+ * @buffer: destination buffer.
+ * @length: maximum number of bytes to copy.
+ *
+ * Copies at least a single WQE, but may copy more data.
+ *
+ * Return: the number of bytes copied, or an error code.
+ */
+int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index,
+ void *buffer, u32 length)
+{
+ struct ib_device *ibdev = qp->ibqp.device;
+ struct mlx5_ib_dev *dev = to_mdev(ibdev);
+ struct mlx5_ib_wq *wq = send ? &qp->sq : &qp->rq;
+ size_t offset;
+ size_t wq_end;
+ struct ib_umem *umem = qp->umem;
+ u32 first_copy_length;
+ int wqe_length;
+ int ret;
+
+ if (wq->wqe_cnt == 0) {
+ mlx5_ib_dbg(dev, "mlx5_ib_read_user_wqe for a QP with wqe_cnt == 0. qp_type: 0x%x\n",
+ qp->ibqp.qp_type);
+ return -EINVAL;
+ }
+
+ offset = wq->offset + ((wqe_index % wq->wqe_cnt) << wq->wqe_shift);
+ wq_end = wq->offset + (wq->wqe_cnt << wq->wqe_shift);
+
+ if (send && length < sizeof(struct mlx5_wqe_ctrl_seg))
+ return -EINVAL;
+
+ if (offset > umem->length ||
+ (send && offset + sizeof(struct mlx5_wqe_ctrl_seg) > umem->length))
+ return -EINVAL;
+
+ first_copy_length = min_t(u32, offset + length, wq_end) - offset;
+ ret = ib_umem_copy_from(buffer, umem, offset, first_copy_length);
+ if (ret)
+ return ret;
+
+ if (send) {
+ struct mlx5_wqe_ctrl_seg *ctrl = buffer;
+ int ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK;
+
+ wqe_length = ds * MLX5_WQE_DS_UNITS;
+ } else {
+ wqe_length = 1 << wq->wqe_shift;
+ }
+
+ if (wqe_length <= first_copy_length)
+ return first_copy_length;
+
+ ret = ib_umem_copy_from(buffer + first_copy_length, umem, wq->offset,
+ wqe_length - first_copy_length);
+ if (ret)
+ return ret;
+
+ return wqe_length;
+}
+
+static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type)
+{
+ struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
+ struct ib_event event;
+
+ if (type == MLX5_EVENT_TYPE_PATH_MIG)
+ to_mibqp(qp)->port = to_mibqp(qp)->alt_port;
+
+ if (ibqp->event_handler) {
+ event.device = ibqp->device;
+ event.element.qp = ibqp;
+ switch (type) {
+ case MLX5_EVENT_TYPE_PATH_MIG:
+ event.event = IB_EVENT_PATH_MIG;
+ break;
+ case MLX5_EVENT_TYPE_COMM_EST:
+ event.event = IB_EVENT_COMM_EST;
+ break;
+ case MLX5_EVENT_TYPE_SQ_DRAINED:
+ event.event = IB_EVENT_SQ_DRAINED;
+ break;
+ case MLX5_EVENT_TYPE_SRQ_LAST_WQE:
+ event.event = IB_EVENT_QP_LAST_WQE_REACHED;
+ break;
+ case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
+ event.event = IB_EVENT_QP_FATAL;
+ break;
+ case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
+ event.event = IB_EVENT_PATH_MIG_ERR;
+ break;
+ case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+ event.event = IB_EVENT_QP_REQ_ERR;
+ break;
+ case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
+ event.event = IB_EVENT_QP_ACCESS_ERR;
+ break;
+ default:
+ pr_warn("mlx5_ib: Unexpected event type %d on QP %06x\n", type, qp->qpn);
+ return;
+ }
+
+ ibqp->event_handler(&event, ibqp->qp_context);
+ }
+}
+
+static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap,
+ int has_rq, struct mlx5_ib_qp *qp, struct mlx5_ib_create_qp *ucmd)
+{
+ struct mlx5_general_caps *gen;
+ int wqe_size;
+ int wq_size;
+
+ gen = &dev->mdev->caps.gen;
+ /* Sanity check RQ size before proceeding */
+ if (cap->max_recv_wr > gen->max_wqes)
+ return -EINVAL;
+
+ if (!has_rq) {
+ qp->rq.max_gs = 0;
+ qp->rq.wqe_cnt = 0;
+ qp->rq.wqe_shift = 0;
+ } else {
+ if (ucmd) {
+ qp->rq.wqe_cnt = ucmd->rq_wqe_count;
+ qp->rq.wqe_shift = ucmd->rq_wqe_shift;
+ qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig;
+ qp->rq.max_post = qp->rq.wqe_cnt;
+ } else {
+ wqe_size = qp->wq_sig ? sizeof(struct mlx5_wqe_signature_seg) : 0;
+ wqe_size += cap->max_recv_sge * sizeof(struct mlx5_wqe_data_seg);
+ wqe_size = roundup_pow_of_two(wqe_size);
+ wq_size = roundup_pow_of_two(cap->max_recv_wr) * wqe_size;
+ wq_size = max_t(int, wq_size, MLX5_SEND_WQE_BB);
+ qp->rq.wqe_cnt = wq_size / wqe_size;
+ if (wqe_size > gen->max_rq_desc_sz) {
+ mlx5_ib_dbg(dev, "wqe_size %d, max %d\n",
+ wqe_size,
+ gen->max_rq_desc_sz);
+ return -EINVAL;
+ }
+ qp->rq.wqe_shift = ilog2(wqe_size);
+ qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig;
+ qp->rq.max_post = qp->rq.wqe_cnt;
+ }
+ }
+
+ return 0;
+}
+
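+/*
+ * Fixed per-WQE overhead of the send queue for a given transport: the control
+ * segment plus whatever transport-specific segments (remote address, atomic,
+ * datagram, XRC or UMR segments) precede the data segments.
+ */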
+static int sq_overhead(enum ib_qp_type qp_type)
+{
+ int size = 0;
+
+ switch (qp_type) {
+ case IB_QPT_XRC_INI:
+ size += sizeof(struct mlx5_wqe_xrc_seg);
+ /* fall through */
+ case IB_QPT_RC:
+ size += sizeof(struct mlx5_wqe_ctrl_seg) +
+ sizeof(struct mlx5_wqe_atomic_seg) +
+ sizeof(struct mlx5_wqe_raddr_seg);
+ break;
+
+ case IB_QPT_XRC_TGT:
+ return 0;
+
+ case IB_QPT_UC:
+ size += sizeof(struct mlx5_wqe_ctrl_seg) +
+ sizeof(struct mlx5_wqe_raddr_seg) +
+ sizeof(struct mlx5_wqe_umr_ctrl_seg) +
+ sizeof(struct mlx5_mkey_seg);
+ break;
+
+ case IB_QPT_UD:
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ size += sizeof(struct mlx5_wqe_ctrl_seg) +
+ sizeof(struct mlx5_wqe_datagram_seg);
+ break;
+
+ case MLX5_IB_QPT_REG_UMR:
+ size += sizeof(struct mlx5_wqe_ctrl_seg) +
+ sizeof(struct mlx5_wqe_umr_ctrl_seg) +
+ sizeof(struct mlx5_mkey_seg);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ return size;
+}
+
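+/*
+ * Compute the size of a single send WQE for the requested capabilities: the
+ * transport overhead plus the larger of the scatter/gather entries and the
+ * inline data region, rounded up to the basic block size (MLX5_SEND_WQE_BB).
+ * Signature-enabled QPs are padded up to MLX5_SIG_WQE_SIZE.
+ */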
+static int calc_send_wqe(struct ib_qp_init_attr *attr)
+{
+ int inl_size = 0;
+ int size;
+
+ size = sq_overhead(attr->qp_type);
+ if (size < 0)
+ return size;
+
+ if (attr->cap.max_inline_data) {
+ inl_size = size + sizeof(struct mlx5_wqe_inline_seg) +
+ attr->cap.max_inline_data;
+ }
+
+ size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg);
+ if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN &&
+ ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB) < MLX5_SIG_WQE_SIZE)
+ return MLX5_SIG_WQE_SIZE;
+ else
+ return ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB);
+}
+
+static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr,
+ struct mlx5_ib_qp *qp)
+{
+ struct mlx5_general_caps *gen;
+ int wqe_size;
+ int wq_size;
+
+ gen = &dev->mdev->caps.gen;
+ if (!attr->cap.max_send_wr)
+ return 0;
+
+ wqe_size = calc_send_wqe(attr);
+ mlx5_ib_dbg(dev, "wqe_size %d\n", wqe_size);
+ if (wqe_size < 0)
+ return wqe_size;
+
+ if (wqe_size > gen->max_sq_desc_sz) {
+ mlx5_ib_dbg(dev, "wqe_size(%d) > max_sq_desc_sz(%d)\n",
+ wqe_size, gen->max_sq_desc_sz);
+ return -EINVAL;
+ }
+
+ qp->max_inline_data = wqe_size - sq_overhead(attr->qp_type) -
+ sizeof(struct mlx5_wqe_inline_seg);
+ attr->cap.max_inline_data = qp->max_inline_data;
+
+ if (attr->create_flags & IB_QP_CREATE_SIGNATURE_EN)
+ qp->signature_en = true;
+
+ wq_size = roundup_pow_of_two(attr->cap.max_send_wr * wqe_size);
+ qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB;
+ if (qp->sq.wqe_cnt > gen->max_wqes) {
+ mlx5_ib_dbg(dev, "wqe count(%d) exceeds limits(%d)\n",
+ qp->sq.wqe_cnt, gen->max_wqes);
+ return -ENOMEM;
+ }
+ qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB);
+ qp->sq.max_gs = attr->cap.max_send_sge;
+ qp->sq.max_post = wq_size / wqe_size;
+ attr->cap.max_send_wr = qp->sq.max_post;
+
+ return wq_size;
+}
+
+static int set_user_buf_size(struct mlx5_ib_dev *dev,
+ struct mlx5_ib_qp *qp,
+ struct mlx5_ib_create_qp *ucmd)
+{
+ struct mlx5_general_caps *gen;
+ int desc_sz = 1 << qp->sq.wqe_shift;
+
+ gen = &dev->mdev->caps.gen;
+ if (desc_sz > gen->max_sq_desc_sz) {
+ mlx5_ib_warn(dev, "desc_sz %d, max_sq_desc_sz %d\n",
+ desc_sz, gen->max_sq_desc_sz);
+ return -EINVAL;
+ }
+
+ if (ucmd->sq_wqe_count && ((1 << ilog2(ucmd->sq_wqe_count)) != ucmd->sq_wqe_count)) {
+		mlx5_ib_warn(dev, "sq_wqe_count %d is not a power of two\n",
+			     ucmd->sq_wqe_count);
+ return -EINVAL;
+ }
+
+ qp->sq.wqe_cnt = ucmd->sq_wqe_count;
+
+ if (qp->sq.wqe_cnt > gen->max_wqes) {
+ mlx5_ib_warn(dev, "wqe_cnt %d, max_wqes %d\n",
+ qp->sq.wqe_cnt, gen->max_wqes);
+ return -EINVAL;
+ }
+
+ qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
+ (qp->sq.wqe_cnt << 6);
+
+ return 0;
+}
+
+static int qp_has_rq(struct ib_qp_init_attr *attr)
+{
+ if (attr->qp_type == IB_QPT_XRC_INI ||
+ attr->qp_type == IB_QPT_XRC_TGT || attr->srq ||
+ attr->qp_type == MLX5_IB_QPT_REG_UMR ||
+ !attr->cap.max_recv_wr)
+ return 0;
+
+ return 1;
+}
+
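+/*
+ * Helpers for carving up the UUAR space between QPs, as used by alloc_uuar()
+ * below: the LOW class always gets the shared index 0, the FAST_PATH class
+ * (used for the UMR QP) gets index 2, the MEDIUM class range starts at index
+ * 1 and the HIGH class range follows it. next_uuar() walks the usable
+ * indices, skipping those whose position within each group of four is 2 or 3.
+ */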
+static int first_med_uuar(void)
+{
+ return 1;
+}
+
+static int next_uuar(int n)
+{
+ n++;
+
+ while (((n % 4) & 2))
+ n++;
+
+ return n;
+}
+
+static int num_med_uuar(struct mlx5_uuar_info *uuari)
+{
+ int n;
+
+ n = uuari->num_uars * MLX5_NON_FP_BF_REGS_PER_PAGE -
+ uuari->num_low_latency_uuars - 1;
+
+ return n >= 0 ? n : 0;
+}
+
+static int max_uuari(struct mlx5_uuar_info *uuari)
+{
+ return uuari->num_uars * 4;
+}
+
+static int first_hi_uuar(struct mlx5_uuar_info *uuari)
+{
+ int med;
+ int i;
+ int t;
+
+ med = num_med_uuar(uuari);
+ for (t = 0, i = first_med_uuar();; i = next_uuar(i)) {
+ t++;
+ if (t == med)
+ return next_uuar(i);
+ }
+
+ return 0;
+}
+
+static int alloc_high_class_uuar(struct mlx5_uuar_info *uuari)
+{
+ int i;
+
+ for (i = first_hi_uuar(uuari); i < max_uuari(uuari); i = next_uuar(i)) {
+ if (!test_bit(i, uuari->bitmap)) {
+ set_bit(i, uuari->bitmap);
+ uuari->count[i]++;
+ return i;
+ }
+ }
+
+ return -ENOMEM;
+}
+
+static int alloc_med_class_uuar(struct mlx5_uuar_info *uuari)
+{
+ int minidx = first_med_uuar();
+ int i;
+
+ for (i = first_med_uuar(); i < first_hi_uuar(uuari); i = next_uuar(i)) {
+ if (uuari->count[i] < uuari->count[minidx])
+ minidx = i;
+ }
+
+ uuari->count[minidx]++;
+ return minidx;
+}
+
+static int alloc_uuar(struct mlx5_uuar_info *uuari,
+ enum mlx5_ib_latency_class lat)
+{
+ int uuarn = -EINVAL;
+
+ mutex_lock(&uuari->lock);
+ switch (lat) {
+ case MLX5_IB_LATENCY_CLASS_LOW:
+ uuarn = 0;
+ uuari->count[uuarn]++;
+ break;
+
+ case MLX5_IB_LATENCY_CLASS_MEDIUM:
+ if (uuari->ver < 2)
+ uuarn = -ENOMEM;
+ else
+ uuarn = alloc_med_class_uuar(uuari);
+ break;
+
+ case MLX5_IB_LATENCY_CLASS_HIGH:
+ if (uuari->ver < 2)
+ uuarn = -ENOMEM;
+ else
+ uuarn = alloc_high_class_uuar(uuari);
+ break;
+
+ case MLX5_IB_LATENCY_CLASS_FAST_PATH:
+ uuarn = 2;
+ break;
+ }
+ mutex_unlock(&uuari->lock);
+
+ return uuarn;
+}
+
+static void free_med_class_uuar(struct mlx5_uuar_info *uuari, int uuarn)
+{
+ clear_bit(uuarn, uuari->bitmap);
+ --uuari->count[uuarn];
+}
+
+static void free_high_class_uuar(struct mlx5_uuar_info *uuari, int uuarn)
+{
+ clear_bit(uuarn, uuari->bitmap);
+ --uuari->count[uuarn];
+}
+
+static void free_uuar(struct mlx5_uuar_info *uuari, int uuarn)
+{
+ int nuuars = uuari->num_uars * MLX5_BF_REGS_PER_PAGE;
+ int high_uuar = nuuars - uuari->num_low_latency_uuars;
+
+ mutex_lock(&uuari->lock);
+ if (uuarn == 0) {
+ --uuari->count[uuarn];
+ goto out;
+ }
+
+ if (uuarn < high_uuar) {
+ free_med_class_uuar(uuari, uuarn);
+ goto out;
+ }
+
+ free_high_class_uuar(uuari, uuarn);
+
+out:
+ mutex_unlock(&uuari->lock);
+}
+
+static enum mlx5_qp_state to_mlx5_state(enum ib_qp_state state)
+{
+ switch (state) {
+ case IB_QPS_RESET: return MLX5_QP_STATE_RST;
+ case IB_QPS_INIT: return MLX5_QP_STATE_INIT;
+ case IB_QPS_RTR: return MLX5_QP_STATE_RTR;
+ case IB_QPS_RTS: return MLX5_QP_STATE_RTS;
+ case IB_QPS_SQD: return MLX5_QP_STATE_SQD;
+ case IB_QPS_SQE: return MLX5_QP_STATE_SQER;
+ case IB_QPS_ERR: return MLX5_QP_STATE_ERR;
+ default: return -1;
+ }
+}
+
+static int to_mlx5_st(enum ib_qp_type type)
+{
+ switch (type) {
+ case IB_QPT_RC: return MLX5_QP_ST_RC;
+ case IB_QPT_UC: return MLX5_QP_ST_UC;
+ case IB_QPT_UD: return MLX5_QP_ST_UD;
+ case MLX5_IB_QPT_REG_UMR: return MLX5_QP_ST_REG_UMR;
+ case IB_QPT_XRC_INI:
+ case IB_QPT_XRC_TGT: return MLX5_QP_ST_XRC;
+ case IB_QPT_SMI: return MLX5_QP_ST_QP0;
+ case IB_QPT_GSI: return MLX5_QP_ST_QP1;
+ case IB_QPT_RAW_IPV6: return MLX5_QP_ST_RAW_IPV6;
+ case IB_QPT_RAW_ETHERTYPE: return MLX5_QP_ST_RAW_ETHERTYPE;
+ case IB_QPT_RAW_PACKET:
+ case IB_QPT_MAX:
+ default: return -EINVAL;
+ }
+}
+
+static int uuarn_to_uar_index(struct mlx5_uuar_info *uuari, int uuarn)
+{
+ return uuari->uars[uuarn / MLX5_BF_REGS_PER_PAGE].index;
+}
+
+static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
+ struct mlx5_ib_qp *qp, struct ib_udata *udata,
+ struct mlx5_create_qp_mbox_in **in,
+ struct mlx5_ib_create_qp_resp *resp, int *inlen)
+{
+ struct mlx5_ib_ucontext *context;
+ struct mlx5_ib_create_qp ucmd;
+ int page_shift = 0;
+ int uar_index;
+ int npages;
+ u32 offset = 0;
+ int uuarn;
+ int ncont = 0;
+ int err;
+
+ err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd));
+ if (err) {
+ mlx5_ib_dbg(dev, "copy failed\n");
+ return err;
+ }
+
+ context = to_mucontext(pd->uobject->context);
+ /*
+ * TBD: should come from the verbs when we have the API
+ */
+ uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_HIGH);
+ if (uuarn < 0) {
+ mlx5_ib_dbg(dev, "failed to allocate low latency UUAR\n");
+ mlx5_ib_dbg(dev, "reverting to medium latency\n");
+ uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_MEDIUM);
+ if (uuarn < 0) {
+ mlx5_ib_dbg(dev, "failed to allocate medium latency UUAR\n");
+ mlx5_ib_dbg(dev, "reverting to high latency\n");
+ uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW);
+ if (uuarn < 0) {
+ mlx5_ib_warn(dev, "uuar allocation failed\n");
+ return uuarn;
+ }
+ }
+ }
+
+ uar_index = uuarn_to_uar_index(&context->uuari, uuarn);
+ mlx5_ib_dbg(dev, "uuarn 0x%x, uar_index 0x%x\n", uuarn, uar_index);
+
+ qp->rq.offset = 0;
+ qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB);
+ qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
+
+ err = set_user_buf_size(dev, qp, &ucmd);
+ if (err)
+ goto err_uuar;
+
+ if (ucmd.buf_addr && qp->buf_size) {
+ qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr,
+ qp->buf_size, 0, 0);
+ if (IS_ERR(qp->umem)) {
+ mlx5_ib_dbg(dev, "umem_get failed\n");
+ err = PTR_ERR(qp->umem);
+ goto err_uuar;
+ }
+ } else {
+ qp->umem = NULL;
+ }
+
+ if (qp->umem) {
+ mlx5_ib_cont_pages(qp->umem, ucmd.buf_addr, &npages, &page_shift,
+ &ncont, NULL);
+ err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift, &offset);
+ if (err) {
+ mlx5_ib_warn(dev, "bad offset\n");
+ goto err_umem;
+ }
+ mlx5_ib_dbg(dev, "addr 0x%llx, size %d, npages %d, page_shift %d, ncont %d, offset %d\n",
+ ucmd.buf_addr, qp->buf_size, npages, page_shift, ncont, offset);
+ }
+
+ *inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont;
+ *in = mlx5_vzalloc(*inlen);
+ if (!*in) {
+ err = -ENOMEM;
+ goto err_umem;
+ }
+ if (qp->umem)
+ mlx5_ib_populate_pas(dev, qp->umem, page_shift, (*in)->pas, 0);
+ (*in)->ctx.log_pg_sz_remote_qpn =
+ cpu_to_be32((page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24);
+ (*in)->ctx.params2 = cpu_to_be32(offset << 6);
+
+ (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index);
+ resp->uuar_index = uuarn;
+ qp->uuarn = uuarn;
+
+ err = mlx5_ib_db_map_user(context, ucmd.db_addr, &qp->db);
+ if (err) {
+ mlx5_ib_dbg(dev, "map failed\n");
+ goto err_free;
+ }
+
+ err = ib_copy_to_udata(udata, resp, sizeof(*resp));
+ if (err) {
+ mlx5_ib_dbg(dev, "copy failed\n");
+ goto err_unmap;
+ }
+ qp->create_type = MLX5_QP_USER;
+
+ return 0;
+
+err_unmap:
+ mlx5_ib_db_unmap_user(context, &qp->db);
+
+err_free:
+ kvfree(*in);
+
+err_umem:
+ if (qp->umem)
+ ib_umem_release(qp->umem);
+
+err_uuar:
+ free_uuar(&context->uuari, uuarn);
+ return err;
+}
+
+static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp)
+{
+ struct mlx5_ib_ucontext *context;
+
+ context = to_mucontext(pd->uobject->context);
+ mlx5_ib_db_unmap_user(context, &qp->db);
+ if (qp->umem)
+ ib_umem_release(qp->umem);
+ free_uuar(&context->uuari, qp->uuarn);
+}
+
+static int create_kernel_qp(struct mlx5_ib_dev *dev,
+ struct ib_qp_init_attr *init_attr,
+ struct mlx5_ib_qp *qp,
+ struct mlx5_create_qp_mbox_in **in, int *inlen)
+{
+ enum mlx5_ib_latency_class lc = MLX5_IB_LATENCY_CLASS_LOW;
+ struct mlx5_uuar_info *uuari;
+ int uar_index;
+ int uuarn;
+ int err;
+
+ uuari = &dev->mdev->priv.uuari;
+ if (init_attr->create_flags & ~(IB_QP_CREATE_SIGNATURE_EN | IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK))
+ return -EINVAL;
+
+ if (init_attr->qp_type == MLX5_IB_QPT_REG_UMR)
+ lc = MLX5_IB_LATENCY_CLASS_FAST_PATH;
+
+ uuarn = alloc_uuar(uuari, lc);
+ if (uuarn < 0) {
+ mlx5_ib_dbg(dev, "\n");
+ return -ENOMEM;
+ }
+
+ qp->bf = &uuari->bfs[uuarn];
+ uar_index = qp->bf->uar->index;
+
+ err = calc_sq_size(dev, init_attr, qp);
+ if (err < 0) {
+ mlx5_ib_dbg(dev, "err %d\n", err);
+ goto err_uuar;
+ }
+
+ qp->rq.offset = 0;
+ qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
+ qp->buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift);
+
+ err = mlx5_buf_alloc(dev->mdev, qp->buf_size, PAGE_SIZE * 2, &qp->buf);
+ if (err) {
+ mlx5_ib_dbg(dev, "err %d\n", err);
+ goto err_uuar;
+ }
+
+ qp->sq.qend = mlx5_get_send_wqe(qp, qp->sq.wqe_cnt);
+ *inlen = sizeof(**in) + sizeof(*(*in)->pas) * qp->buf.npages;
+ *in = mlx5_vzalloc(*inlen);
+ if (!*in) {
+ err = -ENOMEM;
+ goto err_buf;
+ }
+ (*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index);
+ (*in)->ctx.log_pg_sz_remote_qpn =
+ cpu_to_be32((qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24);
+ /* Set "fast registration enabled" for all kernel QPs */
+ (*in)->ctx.params1 |= cpu_to_be32(1 << 11);
+ (*in)->ctx.sq_crq_size |= cpu_to_be16(1 << 4);
+
+ mlx5_fill_page_array(&qp->buf, (*in)->pas);
+
+ err = mlx5_db_alloc(dev->mdev, &qp->db);
+ if (err) {
+ mlx5_ib_dbg(dev, "err %d\n", err);
+ goto err_free;
+ }
+
+ qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wrid), GFP_KERNEL);
+ qp->sq.wr_data = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wr_data), GFP_KERNEL);
+ qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof(*qp->rq.wrid), GFP_KERNEL);
+ qp->sq.w_list = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.w_list), GFP_KERNEL);
+ qp->sq.wqe_head = kmalloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wqe_head), GFP_KERNEL);
+
+ if (!qp->sq.wrid || !qp->sq.wr_data || !qp->rq.wrid ||
+ !qp->sq.w_list || !qp->sq.wqe_head) {
+ err = -ENOMEM;
+ goto err_wrid;
+ }
+ qp->create_type = MLX5_QP_KERNEL;
+
+ return 0;
+
+err_wrid:
+ mlx5_db_free(dev->mdev, &qp->db);
+ kfree(qp->sq.wqe_head);
+ kfree(qp->sq.w_list);
+ kfree(qp->sq.wrid);
+ kfree(qp->sq.wr_data);
+ kfree(qp->rq.wrid);
+
+err_free:
+ kvfree(*in);
+
+err_buf:
+ mlx5_buf_free(dev->mdev, &qp->buf);
+
+err_uuar:
+ free_uuar(&dev->mdev->priv.uuari, uuarn);
+ return err;
+}
+
+static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
+{
+ mlx5_db_free(dev->mdev, &qp->db);
+ kfree(qp->sq.wqe_head);
+ kfree(qp->sq.w_list);
+ kfree(qp->sq.wrid);
+ kfree(qp->sq.wr_data);
+ kfree(qp->rq.wrid);
+ mlx5_buf_free(dev->mdev, &qp->buf);
+ free_uuar(&dev->mdev->priv.uuari, qp->bf->uuarn);
+}
+
+static __be32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr)
+{
+ if (attr->srq || (attr->qp_type == IB_QPT_XRC_TGT) ||
+ (attr->qp_type == IB_QPT_XRC_INI))
+ return cpu_to_be32(MLX5_SRQ_RQ);
+ else if (!qp->has_rq)
+ return cpu_to_be32(MLX5_ZERO_LEN_RQ);
+ else
+ return cpu_to_be32(MLX5_NON_ZERO_RQ);
+}
+
+static int is_connected(enum ib_qp_type qp_type)
+{
+ if (qp_type == IB_QPT_RC || qp_type == IB_QPT_UC)
+ return 1;
+
+ return 0;
+}
+
+static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata, struct mlx5_ib_qp *qp)
+{
+ struct mlx5_ib_resources *devr = &dev->devr;
+ struct mlx5_ib_create_qp_resp resp;
+ struct mlx5_create_qp_mbox_in *in;
+ struct mlx5_general_caps *gen;
+ struct mlx5_ib_create_qp ucmd;
+ int inlen = sizeof(*in);
+ int err;
+
+ mlx5_ib_odp_create_qp(qp);
+
+ gen = &dev->mdev->caps.gen;
+ mutex_init(&qp->mutex);
+ spin_lock_init(&qp->sq.lock);
+ spin_lock_init(&qp->rq.lock);
+
+ if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) {
+ if (!(gen->flags & MLX5_DEV_CAP_FLAG_BLOCK_MCAST)) {
+ mlx5_ib_dbg(dev, "block multicast loopback isn't supported\n");
+ return -EINVAL;
+ } else {
+ qp->flags |= MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK;
+ }
+ }
+
+ if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
+ qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE;
+
+ if (pd && pd->uobject) {
+ if (ib_copy_from_udata(&ucmd, udata, sizeof(ucmd))) {
+ mlx5_ib_dbg(dev, "copy failed\n");
+ return -EFAULT;
+ }
+
+ qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE);
+ qp->scat_cqe = !!(ucmd.flags & MLX5_QP_FLAG_SCATTER_CQE);
+ } else {
+ qp->wq_sig = !!wq_signature;
+ }
+
+ qp->has_rq = qp_has_rq(init_attr);
+ err = set_rq_size(dev, &init_attr->cap, qp->has_rq,
+ qp, (pd && pd->uobject) ? &ucmd : NULL);
+ if (err) {
+ mlx5_ib_dbg(dev, "err %d\n", err);
+ return err;
+ }
+
+ if (pd) {
+ if (pd->uobject) {
+ mlx5_ib_dbg(dev, "requested sq_wqe_count (%d)\n", ucmd.sq_wqe_count);
+ if (ucmd.rq_wqe_shift != qp->rq.wqe_shift ||
+ ucmd.rq_wqe_count != qp->rq.wqe_cnt) {
+ mlx5_ib_dbg(dev, "invalid rq params\n");
+ return -EINVAL;
+ }
+ if (ucmd.sq_wqe_count > gen->max_wqes) {
+ mlx5_ib_dbg(dev, "requested sq_wqe_count (%d) > max allowed (%d)\n",
+ ucmd.sq_wqe_count, gen->max_wqes);
+ return -EINVAL;
+ }
+ err = create_user_qp(dev, pd, qp, udata, &in, &resp, &inlen);
+ if (err)
+ mlx5_ib_dbg(dev, "err %d\n", err);
+ } else {
+ err = create_kernel_qp(dev, init_attr, qp, &in, &inlen);
+ if (err)
+ mlx5_ib_dbg(dev, "err %d\n", err);
+ else
+ qp->pa_lkey = to_mpd(pd)->pa_lkey;
+ }
+
+ if (err)
+ return err;
+ } else {
+ in = mlx5_vzalloc(sizeof(*in));
+ if (!in)
+ return -ENOMEM;
+
+ qp->create_type = MLX5_QP_EMPTY;
+ }
+
+ if (is_sqp(init_attr->qp_type))
+ qp->port = init_attr->port_num;
+
+ in->ctx.flags = cpu_to_be32(to_mlx5_st(init_attr->qp_type) << 16 |
+ MLX5_QP_PM_MIGRATED << 11);
+
+ if (init_attr->qp_type != MLX5_IB_QPT_REG_UMR)
+ in->ctx.flags_pd = cpu_to_be32(to_mpd(pd ? pd : devr->p0)->pdn);
+ else
+ in->ctx.flags_pd = cpu_to_be32(MLX5_QP_LAT_SENSITIVE);
+
+ if (qp->wq_sig)
+ in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_ENABLE_SIG);
+
+ if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK)
+ in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_BLOCK_MCAST);
+
+ if (qp->scat_cqe && is_connected(init_attr->qp_type)) {
+ int rcqe_sz;
+ int scqe_sz;
+
+ rcqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->recv_cq);
+ scqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->send_cq);
+
+ if (rcqe_sz == 128)
+ in->ctx.cs_res = MLX5_RES_SCAT_DATA64_CQE;
+ else
+ in->ctx.cs_res = MLX5_RES_SCAT_DATA32_CQE;
+
+ if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) {
+ if (scqe_sz == 128)
+ in->ctx.cs_req = MLX5_REQ_SCAT_DATA64_CQE;
+ else
+ in->ctx.cs_req = MLX5_REQ_SCAT_DATA32_CQE;
+ }
+ }
+
+ if (qp->rq.wqe_cnt) {
+ in->ctx.rq_size_stride = (qp->rq.wqe_shift - 4);
+ in->ctx.rq_size_stride |= ilog2(qp->rq.wqe_cnt) << 3;
+ }
+
+ in->ctx.rq_type_srqn = get_rx_type(qp, init_attr);
+
+ if (qp->sq.wqe_cnt)
+ in->ctx.sq_crq_size |= cpu_to_be16(ilog2(qp->sq.wqe_cnt) << 11);
+ else
+ in->ctx.sq_crq_size |= cpu_to_be16(0x8000);
+
+ /* Set default resources */
+ switch (init_attr->qp_type) {
+ case IB_QPT_XRC_TGT:
+ in->ctx.cqn_recv = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn);
+ in->ctx.cqn_send = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn);
+ in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn);
+ in->ctx.xrcd = cpu_to_be32(to_mxrcd(init_attr->xrcd)->xrcdn);
+ break;
+ case IB_QPT_XRC_INI:
+ in->ctx.cqn_recv = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn);
+ in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x1)->xrcdn);
+ in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn);
+ break;
+ default:
+ if (init_attr->srq) {
+ in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x0)->xrcdn);
+ in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(init_attr->srq)->msrq.srqn);
+ } else {
+ in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x1)->xrcdn);
+ in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn);
+ }
+ }
+
+ if (init_attr->send_cq)
+ in->ctx.cqn_send = cpu_to_be32(to_mcq(init_attr->send_cq)->mcq.cqn);
+
+ if (init_attr->recv_cq)
+ in->ctx.cqn_recv = cpu_to_be32(to_mcq(init_attr->recv_cq)->mcq.cqn);
+
+ in->ctx.db_rec_addr = cpu_to_be64(qp->db.dma);
+
+ err = mlx5_core_create_qp(dev->mdev, &qp->mqp, in, inlen);
+ if (err) {
+ mlx5_ib_dbg(dev, "create qp failed\n");
+ goto err_create;
+ }
+
+ kvfree(in);
+ /* Hardware wants QPN written in big-endian order (after
+ * shifting) for send doorbell. Precompute this value to save
+ * a little bit when posting sends.
+ */
+ qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
+
+ qp->mqp.event = mlx5_ib_qp_event;
+
+ return 0;
+
+err_create:
+ if (qp->create_type == MLX5_QP_USER)
+ destroy_qp_user(pd, qp);
+ else if (qp->create_type == MLX5_QP_KERNEL)
+ destroy_qp_kernel(dev, qp);
+
+ kvfree(in);
+ return err;
+}
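
The comment above notes that the QPN is byte-swapped once at creation time so the send path can write it directly. A minimal standalone sketch, assuming a little-endian host and a made-up QP number, of why swab32(qpn << 8) produces the big-endian doorbell layout the comment describes (on such a host it equals cpu_to_be32(qpn << 8)); illustration only, not part of the patch:

#include <stdint.h>
#include <stdio.h>

static uint32_t swab32_example(uint32_t x)
{
        return ((x & 0x000000ffu) << 24) | ((x & 0x0000ff00u) << 8) |
               ((x & 0x00ff0000u) >> 8)  | ((x & 0xff000000u) >> 24);
}

int main(void)
{
        uint32_t qpn = 0x000123ab;               /* hypothetical 24-bit QP number */
        uint32_t db  = swab32_example(qpn << 8); /* computed once at create time  */

        /* Stored by a little-endian CPU, 0x00ab2301 lands in memory as the
         * bytes 01 23 ab 00, i.e. the big-endian encoding of qpn << 8. */
        printf("doorbell word: 0x%08x\n", (unsigned int)db);
        return 0;
}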
+
+static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq)
+ __acquires(&send_cq->lock) __acquires(&recv_cq->lock)
+{
+ if (send_cq) {
+ if (recv_cq) {
+ if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
+ spin_lock_irq(&send_cq->lock);
+ spin_lock_nested(&recv_cq->lock,
+ SINGLE_DEPTH_NESTING);
+ } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) {
+ spin_lock_irq(&send_cq->lock);
+ __acquire(&recv_cq->lock);
+ } else {
+ spin_lock_irq(&recv_cq->lock);
+ spin_lock_nested(&send_cq->lock,
+ SINGLE_DEPTH_NESTING);
+ }
+ } else {
+ spin_lock_irq(&send_cq->lock);
+ __acquire(&recv_cq->lock);
+ }
+ } else if (recv_cq) {
+ spin_lock_irq(&recv_cq->lock);
+ __acquire(&send_cq->lock);
+ } else {
+ __acquire(&send_cq->lock);
+ __acquire(&recv_cq->lock);
+ }
+}
+
+static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq)
+ __releases(&send_cq->lock) __releases(&recv_cq->lock)
+{
+ if (send_cq) {
+ if (recv_cq) {
+ if (send_cq->mcq.cqn < recv_cq->mcq.cqn) {
+ spin_unlock(&recv_cq->lock);
+ spin_unlock_irq(&send_cq->lock);
+ } else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) {
+ __release(&recv_cq->lock);
+ spin_unlock_irq(&send_cq->lock);
+ } else {
+ spin_unlock(&send_cq->lock);
+ spin_unlock_irq(&recv_cq->lock);
+ }
+ } else {
+ __release(&recv_cq->lock);
+ spin_unlock_irq(&send_cq->lock);
+ }
+ } else if (recv_cq) {
+ __release(&send_cq->lock);
+ spin_unlock_irq(&recv_cq->lock);
+ } else {
+ __release(&recv_cq->lock);
+ __release(&send_cq->lock);
+ }
+}
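
mlx5_ib_lock_cqs()/mlx5_ib_unlock_cqs() avoid an ABBA deadlock by always taking the CQ with the lower cqn first whenever the send and receive CQs differ. A small userspace sketch of the same ordering rule using pthread mutexes; demo_cq, lock_pair and unlock_pair are invented names, not driver types:

#include <pthread.h>
#include <stdio.h>

struct demo_cq {
        unsigned int cqn;               /* ordering key, like mcq.cqn above */
        pthread_mutex_t lock;
};

static void lock_pair(struct demo_cq *a, struct demo_cq *b)
{
        if (a == b || a->cqn == b->cqn) {
                pthread_mutex_lock(&a->lock);   /* same CQ: take it once  */
        } else if (a->cqn < b->cqn) {
                pthread_mutex_lock(&a->lock);   /* lower cqn always first */
                pthread_mutex_lock(&b->lock);
        } else {
                pthread_mutex_lock(&b->lock);
                pthread_mutex_lock(&a->lock);
        }
}

static void unlock_pair(struct demo_cq *a, struct demo_cq *b)
{
        if (a == b || a->cqn == b->cqn) {
                pthread_mutex_unlock(&a->lock);
        } else {                                /* release in reverse order */
                pthread_mutex_unlock(a->cqn < b->cqn ? &b->lock : &a->lock);
                pthread_mutex_unlock(a->cqn < b->cqn ? &a->lock : &b->lock);
        }
}

int main(void)
{
        static struct demo_cq send_cq = { .cqn = 7, .lock = PTHREAD_MUTEX_INITIALIZER };
        static struct demo_cq recv_cq = { .cqn = 3, .lock = PTHREAD_MUTEX_INITIALIZER };

        lock_pair(&send_cq, &recv_cq);  /* always takes cqn 3 before cqn 7 */
        puts("both CQ locks held");
        unlock_pair(&send_cq, &recv_cq);
        return 0;
}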
+
+static struct mlx5_ib_pd *get_pd(struct mlx5_ib_qp *qp)
+{
+ return to_mpd(qp->ibqp.pd);
+}
+
+static void get_cqs(struct mlx5_ib_qp *qp,
+ struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq)
+{
+ switch (qp->ibqp.qp_type) {
+ case IB_QPT_XRC_TGT:
+ *send_cq = NULL;
+ *recv_cq = NULL;
+ break;
+ case MLX5_IB_QPT_REG_UMR:
+ case IB_QPT_XRC_INI:
+ *send_cq = to_mcq(qp->ibqp.send_cq);
+ *recv_cq = NULL;
+ break;
+
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ case IB_QPT_RC:
+ case IB_QPT_UC:
+ case IB_QPT_UD:
+ case IB_QPT_RAW_IPV6:
+ case IB_QPT_RAW_ETHERTYPE:
+ *send_cq = to_mcq(qp->ibqp.send_cq);
+ *recv_cq = to_mcq(qp->ibqp.recv_cq);
+ break;
+
+ case IB_QPT_RAW_PACKET:
+ case IB_QPT_MAX:
+ default:
+ *send_cq = NULL;
+ *recv_cq = NULL;
+ break;
+ }
+}
+
+static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
+{
+ struct mlx5_ib_cq *send_cq, *recv_cq;
+ struct mlx5_modify_qp_mbox_in *in;
+ int err;
+
+ in = kzalloc(sizeof(*in), GFP_KERNEL);
+ if (!in)
+ return;
+
+ if (qp->state != IB_QPS_RESET) {
+ mlx5_ib_qp_disable_pagefaults(qp);
+ if (mlx5_core_qp_modify(dev->mdev, to_mlx5_state(qp->state),
+ MLX5_QP_STATE_RST, in, 0, &qp->mqp))
+ mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n",
+ qp->mqp.qpn);
+ }
+
+ get_cqs(qp, &send_cq, &recv_cq);
+
+ if (qp->create_type == MLX5_QP_KERNEL) {
+ mlx5_ib_lock_cqs(send_cq, recv_cq);
+ __mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn,
+ qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL);
+ if (send_cq != recv_cq)
+ __mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
+ mlx5_ib_unlock_cqs(send_cq, recv_cq);
+ }
+
+ err = mlx5_core_destroy_qp(dev->mdev, &qp->mqp);
+ if (err)
+ mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n", qp->mqp.qpn);
+ kfree(in);
+
+
+ if (qp->create_type == MLX5_QP_KERNEL)
+ destroy_qp_kernel(dev, qp);
+ else if (qp->create_type == MLX5_QP_USER)
+ destroy_qp_user(&get_pd(qp)->ibpd, qp);
+}
+
+static const char *ib_qp_type_str(enum ib_qp_type type)
+{
+ switch (type) {
+ case IB_QPT_SMI:
+ return "IB_QPT_SMI";
+ case IB_QPT_GSI:
+ return "IB_QPT_GSI";
+ case IB_QPT_RC:
+ return "IB_QPT_RC";
+ case IB_QPT_UC:
+ return "IB_QPT_UC";
+ case IB_QPT_UD:
+ return "IB_QPT_UD";
+ case IB_QPT_RAW_IPV6:
+ return "IB_QPT_RAW_IPV6";
+ case IB_QPT_RAW_ETHERTYPE:
+ return "IB_QPT_RAW_ETHERTYPE";
+ case IB_QPT_XRC_INI:
+ return "IB_QPT_XRC_INI";
+ case IB_QPT_XRC_TGT:
+ return "IB_QPT_XRC_TGT";
+ case IB_QPT_RAW_PACKET:
+ return "IB_QPT_RAW_PACKET";
+ case MLX5_IB_QPT_REG_UMR:
+ return "MLX5_IB_QPT_REG_UMR";
+ case IB_QPT_MAX:
+ default:
+ return "Invalid QP type";
+ }
+}
+
+struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct mlx5_general_caps *gen;
+ struct mlx5_ib_dev *dev;
+ struct mlx5_ib_qp *qp;
+ u16 xrcdn = 0;
+ int err;
+
+ if (pd) {
+ dev = to_mdev(pd->device);
+ } else {
+ /* no PD supplied: only XRC target and UMR QPs may be created without one */
+ if (init_attr->qp_type != IB_QPT_XRC_TGT &&
+ init_attr->qp_type != MLX5_IB_QPT_REG_UMR) {
+ pr_warn("%s: no PD for transport %s\n", __func__,
+ ib_qp_type_str(init_attr->qp_type));
+ return ERR_PTR(-EINVAL);
+ }
+ dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device);
+ }
+ gen = &dev->mdev->caps.gen;
+
+ switch (init_attr->qp_type) {
+ case IB_QPT_XRC_TGT:
+ case IB_QPT_XRC_INI:
+ if (!(gen->flags & MLX5_DEV_CAP_FLAG_XRC)) {
+ mlx5_ib_dbg(dev, "XRC not supported\n");
+ return ERR_PTR(-ENOSYS);
+ }
+ init_attr->recv_cq = NULL;
+ if (init_attr->qp_type == IB_QPT_XRC_TGT) {
+ xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn;
+ init_attr->send_cq = NULL;
+ }
+
+ /* fall through */
+ case IB_QPT_RC:
+ case IB_QPT_UC:
+ case IB_QPT_UD:
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ case MLX5_IB_QPT_REG_UMR:
+ qp = kzalloc(sizeof(*qp), GFP_KERNEL);
+ if (!qp)
+ return ERR_PTR(-ENOMEM);
+
+ err = create_qp_common(dev, pd, init_attr, udata, qp);
+ if (err) {
+ mlx5_ib_dbg(dev, "create_qp_common failed\n");
+ kfree(qp);
+ return ERR_PTR(err);
+ }
+
+ if (is_qp0(init_attr->qp_type))
+ qp->ibqp.qp_num = 0;
+ else if (is_qp1(init_attr->qp_type))
+ qp->ibqp.qp_num = 1;
+ else
+ qp->ibqp.qp_num = qp->mqp.qpn;
+
+ mlx5_ib_dbg(dev, "ib qpnum 0x%x, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x\n",
+ qp->ibqp.qp_num, qp->mqp.qpn, to_mcq(init_attr->recv_cq)->mcq.cqn,
+ to_mcq(init_attr->send_cq)->mcq.cqn);
+
+ qp->xrcdn = xrcdn;
+
+ break;
+
+ case IB_QPT_RAW_IPV6:
+ case IB_QPT_RAW_ETHERTYPE:
+ case IB_QPT_RAW_PACKET:
+ case IB_QPT_MAX:
+ default:
+ mlx5_ib_dbg(dev, "unsupported qp type %d\n",
+ init_attr->qp_type);
+ /* Don't support raw QPs */
+ return ERR_PTR(-EINVAL);
+ }
+
+ return &qp->ibqp;
+}
+
+int mlx5_ib_destroy_qp(struct ib_qp *qp)
+{
+ struct mlx5_ib_dev *dev = to_mdev(qp->device);
+ struct mlx5_ib_qp *mqp = to_mqp(qp);
+
+ destroy_qp_common(dev, mqp);
+
+ kfree(mqp);
+
+ return 0;
+}
+
+static __be32 to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_attr *attr,
+ int attr_mask)
+{
+ u32 hw_access_flags = 0;
+ u8 dest_rd_atomic;
+ u32 access_flags;
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+ dest_rd_atomic = attr->max_dest_rd_atomic;
+ else
+ dest_rd_atomic = qp->resp_depth;
+
+ if (attr_mask & IB_QP_ACCESS_FLAGS)
+ access_flags = attr->qp_access_flags;
+ else
+ access_flags = qp->atomic_rd_en;
+
+ if (!dest_rd_atomic)
+ access_flags &= IB_ACCESS_REMOTE_WRITE;
+
+ if (access_flags & IB_ACCESS_REMOTE_READ)
+ hw_access_flags |= MLX5_QP_BIT_RRE;
+ if (access_flags & IB_ACCESS_REMOTE_ATOMIC)
+ hw_access_flags |= (MLX5_QP_BIT_RAE | MLX5_ATOMIC_MODE_CX);
+ if (access_flags & IB_ACCESS_REMOTE_WRITE)
+ hw_access_flags |= MLX5_QP_BIT_RWE;
+
+ return cpu_to_be32(hw_access_flags);
+}
+
+enum {
+ MLX5_PATH_FLAG_FL = 1 << 0,
+ MLX5_PATH_FLAG_FREE_AR = 1 << 1,
+ MLX5_PATH_FLAG_COUNTER = 1 << 2,
+};
+
+static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate)
+{
+ struct mlx5_general_caps *gen;
+
+ gen = &dev->mdev->caps.gen;
+ if (rate == IB_RATE_PORT_CURRENT) {
+ return 0;
+ } else if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_300_GBPS) {
+ return -EINVAL;
+ } else {
+ while (rate != IB_RATE_2_5_GBPS &&
+ !(1 << (rate + MLX5_STAT_RATE_OFFSET) &
+ gen->stat_rate_support))
+ --rate;
+ }
+
+ return rate + MLX5_STAT_RATE_OFFSET;
+}
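
ib_rate_to_mlx5() walks the requested static rate downwards until it reaches a rate whose bit is set in the device's stat_rate_support mask. A standalone sketch of that fallback loop; the bounds, offset and support mask are invented for the example and do not reproduce the real IB_RATE_* or MLX5_STAT_RATE_OFFSET values:

#include <stdio.h>

enum { RATE_MIN = 2, RATE_MAX = 10 };   /* stand-ins for the IB_RATE_* bounds */

static int pick_rate(int requested, unsigned int support_mask, int offset)
{
        if (requested < RATE_MIN || requested > RATE_MAX)
                return -1;
        while (requested != RATE_MIN &&
               !((1u << (requested + offset)) & support_mask))
                --requested;                    /* fall back to a slower rate */
        return requested + offset;
}

int main(void)
{
        /* only rates 2, 4 and 6 (offset 5 -> bits 7, 9, 11) are "supported" */
        unsigned int mask = (1u << 7) | (1u << 9) | (1u << 11);

        printf("requested 6 -> encoded %d\n", pick_rate(6, mask, 5)); /* 11 */
        printf("requested 5 -> encoded %d\n", pick_rate(5, mask, 5)); /*  9 */
        return 0;
}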
+
+static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah,
+ struct mlx5_qp_path *path, u8 port, int attr_mask,
+ u32 path_flags, const struct ib_qp_attr *attr)
+{
+ struct mlx5_general_caps *gen;
+ int err;
+
+ gen = &dev->mdev->caps.gen;
+ path->fl = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0;
+ path->free_ar = (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x80 : 0;
+
+ if (attr_mask & IB_QP_PKEY_INDEX)
+ path->pkey_index = attr->pkey_index;
+
+ path->grh_mlid = ah->src_path_bits & 0x7f;
+ path->rlid = cpu_to_be16(ah->dlid);
+
+ if (ah->ah_flags & IB_AH_GRH) {
+ if (ah->grh.sgid_index >= gen->port[port - 1].gid_table_len) {
+ pr_err("sgid_index (%u) too large. max is %d\n",
+ ah->grh.sgid_index, gen->port[port - 1].gid_table_len);
+ return -EINVAL;
+ }
+ path->grh_mlid |= 1 << 7;
+ path->mgid_index = ah->grh.sgid_index;
+ path->hop_limit = ah->grh.hop_limit;
+ path->tclass_flowlabel =
+ cpu_to_be32((ah->grh.traffic_class << 20) |
+ (ah->grh.flow_label));
+ memcpy(path->rgid, ah->grh.dgid.raw, 16);
+ }
+
+ err = ib_rate_to_mlx5(dev, ah->static_rate);
+ if (err < 0)
+ return err;
+ path->static_rate = err;
+ path->port = port;
+
+ if (attr_mask & IB_QP_TIMEOUT)
+ path->ackto_lt = attr->timeout << 3;
+
+ path->sl = ah->sl & 0xf;
+
+ return 0;
+}
+
+static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_QP_ST_MAX] = {
+ [MLX5_QP_STATE_INIT] = {
+ [MLX5_QP_STATE_INIT] = {
+ [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE |
+ MLX5_QP_OPTPAR_RAE |
+ MLX5_QP_OPTPAR_RWE |
+ MLX5_QP_OPTPAR_PKEY_INDEX |
+ MLX5_QP_OPTPAR_PRI_PORT,
+ [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE |
+ MLX5_QP_OPTPAR_PKEY_INDEX |
+ MLX5_QP_OPTPAR_PRI_PORT,
+ [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX |
+ MLX5_QP_OPTPAR_Q_KEY |
+ MLX5_QP_OPTPAR_PRI_PORT,
+ },
+ [MLX5_QP_STATE_RTR] = {
+ [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH |
+ MLX5_QP_OPTPAR_RRE |
+ MLX5_QP_OPTPAR_RAE |
+ MLX5_QP_OPTPAR_RWE |
+ MLX5_QP_OPTPAR_PKEY_INDEX,
+ [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH |
+ MLX5_QP_OPTPAR_RWE |
+ MLX5_QP_OPTPAR_PKEY_INDEX,
+ [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX |
+ MLX5_QP_OPTPAR_Q_KEY,
+ [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_PKEY_INDEX |
+ MLX5_QP_OPTPAR_Q_KEY,
+ [MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH |
+ MLX5_QP_OPTPAR_RRE |
+ MLX5_QP_OPTPAR_RAE |
+ MLX5_QP_OPTPAR_RWE |
+ MLX5_QP_OPTPAR_PKEY_INDEX,
+ },
+ },
+ [MLX5_QP_STATE_RTR] = {
+ [MLX5_QP_STATE_RTS] = {
+ [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH |
+ MLX5_QP_OPTPAR_RRE |
+ MLX5_QP_OPTPAR_RAE |
+ MLX5_QP_OPTPAR_RWE |
+ MLX5_QP_OPTPAR_PM_STATE |
+ MLX5_QP_OPTPAR_RNR_TIMEOUT,
+ [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH |
+ MLX5_QP_OPTPAR_RWE |
+ MLX5_QP_OPTPAR_PM_STATE,
+ [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY,
+ },
+ },
+ [MLX5_QP_STATE_RTS] = {
+ [MLX5_QP_STATE_RTS] = {
+ [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE |
+ MLX5_QP_OPTPAR_RAE |
+ MLX5_QP_OPTPAR_RWE |
+ MLX5_QP_OPTPAR_RNR_TIMEOUT |
+ MLX5_QP_OPTPAR_PM_STATE |
+ MLX5_QP_OPTPAR_ALT_ADDR_PATH,
+ [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE |
+ MLX5_QP_OPTPAR_PM_STATE |
+ MLX5_QP_OPTPAR_ALT_ADDR_PATH,
+ [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY |
+ MLX5_QP_OPTPAR_SRQN |
+ MLX5_QP_OPTPAR_CQN_RCV,
+ },
+ },
+ [MLX5_QP_STATE_SQER] = {
+ [MLX5_QP_STATE_RTS] = {
+ [MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY,
+ [MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_Q_KEY,
+ [MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE,
+ [MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RNR_TIMEOUT |
+ MLX5_QP_OPTPAR_RWE |
+ MLX5_QP_OPTPAR_RAE |
+ MLX5_QP_OPTPAR_RRE,
+ },
+ },
+};
+
+static int ib_nr_to_mlx5_nr(int ib_mask)
+{
+ switch (ib_mask) {
+ case IB_QP_STATE:
+ return 0;
+ case IB_QP_CUR_STATE:
+ return 0;
+ case IB_QP_EN_SQD_ASYNC_NOTIFY:
+ return 0;
+ case IB_QP_ACCESS_FLAGS:
+ return MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_RRE |
+ MLX5_QP_OPTPAR_RAE;
+ case IB_QP_PKEY_INDEX:
+ return MLX5_QP_OPTPAR_PKEY_INDEX;
+ case IB_QP_PORT:
+ return MLX5_QP_OPTPAR_PRI_PORT;
+ case IB_QP_QKEY:
+ return MLX5_QP_OPTPAR_Q_KEY;
+ case IB_QP_AV:
+ return MLX5_QP_OPTPAR_PRIMARY_ADDR_PATH |
+ MLX5_QP_OPTPAR_PRI_PORT;
+ case IB_QP_PATH_MTU:
+ return 0;
+ case IB_QP_TIMEOUT:
+ return MLX5_QP_OPTPAR_ACK_TIMEOUT;
+ case IB_QP_RETRY_CNT:
+ return MLX5_QP_OPTPAR_RETRY_COUNT;
+ case IB_QP_RNR_RETRY:
+ return MLX5_QP_OPTPAR_RNR_RETRY;
+ case IB_QP_RQ_PSN:
+ return 0;
+ case IB_QP_MAX_QP_RD_ATOMIC:
+ return MLX5_QP_OPTPAR_SRA_MAX;
+ case IB_QP_ALT_PATH:
+ return MLX5_QP_OPTPAR_ALT_ADDR_PATH;
+ case IB_QP_MIN_RNR_TIMER:
+ return MLX5_QP_OPTPAR_RNR_TIMEOUT;
+ case IB_QP_SQ_PSN:
+ return 0;
+ case IB_QP_MAX_DEST_RD_ATOMIC:
+ return MLX5_QP_OPTPAR_RRA_MAX | MLX5_QP_OPTPAR_RWE |
+ MLX5_QP_OPTPAR_RRE | MLX5_QP_OPTPAR_RAE;
+ case IB_QP_PATH_MIG_STATE:
+ return MLX5_QP_OPTPAR_PM_STATE;
+ case IB_QP_CAP:
+ return 0;
+ case IB_QP_DEST_QPN:
+ return 0;
+ }
+ return 0;
+}
+
+static int ib_mask_to_mlx5_opt(int ib_mask)
+{
+ int result = 0;
+ int i;
+
+ for (i = 0; i < 8 * sizeof(int); i++) {
+ if ((1 << i) & ib_mask)
+ result |= ib_nr_to_mlx5_nr(1 << i);
+ }
+
+ return result;
+}
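
ib_mask_to_mlx5_opt() visits every bit of the IB attribute mask and ORs together the per-bit translations returned by ib_nr_to_mlx5_nr(). A standalone sketch of the same per-bit translation pattern; the SRC_*/DST_* bits are invented and carry no relation to the real flags:

#include <stdio.h>

#define SRC_A 0x1u
#define SRC_B 0x2u
#define SRC_C 0x4u

#define DST_X 0x1u
#define DST_Y 0x2u
#define DST_Z 0x4u

static unsigned int translate_one(unsigned int src_bit)
{
        switch (src_bit) {
        case SRC_A: return DST_X;
        case SRC_B: return DST_Y | DST_Z;  /* one source bit may map to many */
        case SRC_C: return 0;              /* or to nothing at all           */
        default:    return 0;
        }
}

static unsigned int translate_mask(unsigned int src_mask)
{
        unsigned int result = 0;
        unsigned int i;

        for (i = 0; i < 8 * sizeof(src_mask); i++)
                if (src_mask & (1u << i))
                        result |= translate_one(1u << i);
        return result;
}

int main(void)
{
        printf("0x%x\n", translate_mask(SRC_A | SRC_B | SRC_C)); /* prints 0x7 */
        return 0;
}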
+
+static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
+ const struct ib_qp_attr *attr, int attr_mask,
+ enum ib_qp_state cur_state, enum ib_qp_state new_state)
+{
+ struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+ struct mlx5_ib_qp *qp = to_mqp(ibqp);
+ struct mlx5_ib_cq *send_cq, *recv_cq;
+ struct mlx5_qp_context *context;
+ struct mlx5_general_caps *gen;
+ struct mlx5_modify_qp_mbox_in *in;
+ struct mlx5_ib_pd *pd;
+ enum mlx5_qp_state mlx5_cur, mlx5_new;
+ enum mlx5_qp_optpar optpar;
+ int sqd_event;
+ int mlx5_st;
+ int err;
+
+ gen = &dev->mdev->caps.gen;
+ in = kzalloc(sizeof(*in), GFP_KERNEL);
+ if (!in)
+ return -ENOMEM;
+
+ context = &in->ctx;
+ err = to_mlx5_st(ibqp->qp_type);
+ if (err < 0)
+ goto out;
+
+ context->flags = cpu_to_be32(err << 16);
+
+ if (!(attr_mask & IB_QP_PATH_MIG_STATE)) {
+ context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11);
+ } else {
+ switch (attr->path_mig_state) {
+ case IB_MIG_MIGRATED:
+ context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11);
+ break;
+ case IB_MIG_REARM:
+ context->flags |= cpu_to_be32(MLX5_QP_PM_REARM << 11);
+ break;
+ case IB_MIG_ARMED:
+ context->flags |= cpu_to_be32(MLX5_QP_PM_ARMED << 11);
+ break;
+ }
+ }
+
+ if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) {
+ context->mtu_msgmax = (IB_MTU_256 << 5) | 8;
+ } else if (ibqp->qp_type == IB_QPT_UD ||
+ ibqp->qp_type == MLX5_IB_QPT_REG_UMR) {
+ context->mtu_msgmax = (IB_MTU_4096 << 5) | 12;
+ } else if (attr_mask & IB_QP_PATH_MTU) {
+ if (attr->path_mtu < IB_MTU_256 ||
+ attr->path_mtu > IB_MTU_4096) {
+ mlx5_ib_warn(dev, "invalid mtu %d\n", attr->path_mtu);
+ err = -EINVAL;
+ goto out;
+ }
+ context->mtu_msgmax = (attr->path_mtu << 5) | gen->log_max_msg;
+ }
+
+ if (attr_mask & IB_QP_DEST_QPN)
+ context->log_pg_sz_remote_qpn = cpu_to_be32(attr->dest_qp_num);
+
+ if (attr_mask & IB_QP_PKEY_INDEX)
+ context->pri_path.pkey_index = attr->pkey_index;
+
+ /* todo implement counter_index functionality */
+
+ if (is_sqp(ibqp->qp_type))
+ context->pri_path.port = qp->port;
+
+ if (attr_mask & IB_QP_PORT)
+ context->pri_path.port = attr->port_num;
+
+ if (attr_mask & IB_QP_AV) {
+ err = mlx5_set_path(dev, &attr->ah_attr, &context->pri_path,
+ attr_mask & IB_QP_PORT ? attr->port_num : qp->port,
+ attr_mask, 0, attr);
+ if (err)
+ goto out;
+ }
+
+ if (attr_mask & IB_QP_TIMEOUT)
+ context->pri_path.ackto_lt |= attr->timeout << 3;
+
+ if (attr_mask & IB_QP_ALT_PATH) {
+ err = mlx5_set_path(dev, &attr->alt_ah_attr, &context->alt_path,
+ attr->alt_port_num, attr_mask, 0, attr);
+ if (err)
+ goto out;
+ }
+
+ pd = get_pd(qp);
+ get_cqs(qp, &send_cq, &recv_cq);
+
+ context->flags_pd = cpu_to_be32(pd ? pd->pdn : to_mpd(dev->devr.p0)->pdn);
+ context->cqn_send = send_cq ? cpu_to_be32(send_cq->mcq.cqn) : 0;
+ context->cqn_recv = recv_cq ? cpu_to_be32(recv_cq->mcq.cqn) : 0;
+ context->params1 = cpu_to_be32(MLX5_IB_ACK_REQ_FREQ << 28);
+
+ if (attr_mask & IB_QP_RNR_RETRY)
+ context->params1 |= cpu_to_be32(attr->rnr_retry << 13);
+
+ if (attr_mask & IB_QP_RETRY_CNT)
+ context->params1 |= cpu_to_be32(attr->retry_cnt << 16);
+
+ if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
+ if (attr->max_rd_atomic)
+ context->params1 |=
+ cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21);
+ }
+
+ if (attr_mask & IB_QP_SQ_PSN)
+ context->next_send_psn = cpu_to_be32(attr->sq_psn);
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+ if (attr->max_dest_rd_atomic)
+ context->params2 |=
+ cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21);
+ }
+
+ if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC))
+ context->params2 |= to_mlx5_access_flags(qp, attr, attr_mask);
+
+ if (attr_mask & IB_QP_MIN_RNR_TIMER)
+ context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
+
+ if (attr_mask & IB_QP_RQ_PSN)
+ context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);
+
+ if (attr_mask & IB_QP_QKEY)
+ context->qkey = cpu_to_be32(attr->qkey);
+
+ if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+ context->db_rec_addr = cpu_to_be64(qp->db.dma);
+
+ if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD &&
+ attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify)
+ sqd_event = 1;
+ else
+ sqd_event = 0;
+
+ if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+ context->sq_crq_size |= cpu_to_be16(1 << 4);
+
+
+ mlx5_cur = to_mlx5_state(cur_state);
+ mlx5_new = to_mlx5_state(new_state);
+ mlx5_st = to_mlx5_st(ibqp->qp_type);
+ if (mlx5_st < 0) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* If moving to a reset or error state, we must disable page faults on
+ * this QP and flush all current page faults. Otherwise a stale page
+ * fault may attempt to work on this QP after it is reset and moved
+ * again to RTS, and may cause the driver and the device to get out of
+ * sync. */
+ if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR &&
+ (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR))
+ mlx5_ib_qp_disable_pagefaults(qp);
+
+ optpar = ib_mask_to_mlx5_opt(attr_mask);
+ optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st];
+ in->optparam = cpu_to_be32(optpar);
+ err = mlx5_core_qp_modify(dev->mdev, to_mlx5_state(cur_state),
+ to_mlx5_state(new_state), in, sqd_event,
+ &qp->mqp);
+ if (err)
+ goto out;
+
+ if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+ mlx5_ib_qp_enable_pagefaults(qp);
+
+ qp->state = new_state;
+
+ if (attr_mask & IB_QP_ACCESS_FLAGS)
+ qp->atomic_rd_en = attr->qp_access_flags;
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+ qp->resp_depth = attr->max_dest_rd_atomic;
+ if (attr_mask & IB_QP_PORT)
+ qp->port = attr->port_num;
+ if (attr_mask & IB_QP_ALT_PATH)
+ qp->alt_port = attr->alt_port_num;
+
+ /*
+ * If we moved a kernel QP to RESET, clean up all old CQ
+ * entries and reinitialize the QP.
+ */
+ if (new_state == IB_QPS_RESET && !ibqp->uobject) {
+ mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn,
+ ibqp->srq ? to_msrq(ibqp->srq) : NULL);
+ if (send_cq != recv_cq)
+ mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
+
+ qp->rq.head = 0;
+ qp->rq.tail = 0;
+ qp->sq.head = 0;
+ qp->sq.tail = 0;
+ qp->sq.cur_post = 0;
+ qp->sq.last_poll = 0;
+ qp->db.db[MLX5_RCV_DBR] = 0;
+ qp->db.db[MLX5_SND_DBR] = 0;
+ }
+
+out:
+ kfree(in);
+ return err;
+}
+
+int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+ struct mlx5_ib_qp *qp = to_mqp(ibqp);
+ enum ib_qp_state cur_state, new_state;
+ struct mlx5_general_caps *gen;
+ int err = -EINVAL;
+ int port;
+
+ gen = &dev->mdev->caps.gen;
+ mutex_lock(&qp->mutex);
+
+ cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
+ new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+ if (ibqp->qp_type != MLX5_IB_QPT_REG_UMR &&
+ !ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask,
+ IB_LINK_LAYER_UNSPECIFIED))
+ goto out;
+
+ if ((attr_mask & IB_QP_PORT) &&
+ (attr->port_num == 0 || attr->port_num > gen->num_ports))
+ goto out;
+
+ if (attr_mask & IB_QP_PKEY_INDEX) {
+ port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
+ if (attr->pkey_index >= gen->port[port - 1].pkey_table_len)
+ goto out;
+ }
+
+ if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
+ attr->max_rd_atomic > (1 << gen->log_max_ra_res_qp))
+ goto out;
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
+ attr->max_dest_rd_atomic > (1 << gen->log_max_ra_req_qp))
+ goto out;
+
+ if (cur_state == new_state && cur_state == IB_QPS_RESET) {
+ err = 0;
+ goto out;
+ }
+
+ err = __mlx5_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
+
+out:
+ mutex_unlock(&qp->mutex);
+ return err;
+}
+
+static int mlx5_wq_overflow(struct mlx5_ib_wq *wq, int nreq, struct ib_cq *ib_cq)
+{
+ struct mlx5_ib_cq *cq;
+ unsigned cur;
+
+ cur = wq->head - wq->tail;
+ if (likely(cur + nreq < wq->max_post))
+ return 0;
+
+ cq = to_mcq(ib_cq);
+ spin_lock(&cq->lock);
+ cur = wq->head - wq->tail;
+ spin_unlock(&cq->lock);
+
+ return cur + nreq >= wq->max_post;
+}
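
mlx5_wq_overflow() depends on head and tail being free-running unsigned counters, so head - tail equals the number of outstanding entries even after the counters wrap around zero. A standalone demonstration with arbitrary values:

#include <stdio.h>

int main(void)
{
        unsigned int head = 0xfffffffeu;  /* producer has almost wrapped      */
        unsigned int tail = 0xfffffff0u;  /* consumer lags by 14 entries      */
        unsigned int max_post = 16;
        unsigned int nreq = 3;

        unsigned int in_use = head - tail;            /* 14, despite the wrap */
        int overflow = (in_use + nreq >= max_post);   /* 14 + 3 >= 16 -> 1    */

        printf("in use %u, overflow %d\n", in_use, overflow);

        head += 5;                        /* post 5 more; head wraps past 0   */
        printf("after wrap: in use %u\n", head - tail);   /* still correct: 19 */
        return 0;
}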
+
+static __always_inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg,
+ u64 remote_addr, u32 rkey)
+{
+ rseg->raddr = cpu_to_be64(remote_addr);
+ rseg->rkey = cpu_to_be32(rkey);
+ rseg->reserved = 0;
+}
+
+static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg,
+ struct ib_send_wr *wr)
+{
+ memcpy(&dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof(struct mlx5_av));
+ dseg->av.dqp_dct = cpu_to_be32(wr->wr.ud.remote_qpn | MLX5_EXTENDED_UD_AV);
+ dseg->av.key.qkey.qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
+}
+
+static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ib_sge *sg)
+{
+ dseg->byte_count = cpu_to_be32(sg->length);
+ dseg->lkey = cpu_to_be32(sg->lkey);
+ dseg->addr = cpu_to_be64(sg->addr);
+}
+
+static __be16 get_klm_octo(int npages)
+{
+ return cpu_to_be16(ALIGN(npages, 8) / 2);
+}
+
+static __be64 frwr_mkey_mask(void)
+{
+ u64 result;
+
+ result = MLX5_MKEY_MASK_LEN |
+ MLX5_MKEY_MASK_PAGE_SIZE |
+ MLX5_MKEY_MASK_START_ADDR |
+ MLX5_MKEY_MASK_EN_RINVAL |
+ MLX5_MKEY_MASK_KEY |
+ MLX5_MKEY_MASK_LR |
+ MLX5_MKEY_MASK_LW |
+ MLX5_MKEY_MASK_RR |
+ MLX5_MKEY_MASK_RW |
+ MLX5_MKEY_MASK_A |
+ MLX5_MKEY_MASK_SMALL_FENCE |
+ MLX5_MKEY_MASK_FREE;
+
+ return cpu_to_be64(result);
+}
+
+static __be64 sig_mkey_mask(void)
+{
+ u64 result;
+
+ result = MLX5_MKEY_MASK_LEN |
+ MLX5_MKEY_MASK_PAGE_SIZE |
+ MLX5_MKEY_MASK_START_ADDR |
+ MLX5_MKEY_MASK_EN_SIGERR |
+ MLX5_MKEY_MASK_EN_RINVAL |
+ MLX5_MKEY_MASK_KEY |
+ MLX5_MKEY_MASK_LR |
+ MLX5_MKEY_MASK_LW |
+ MLX5_MKEY_MASK_RR |
+ MLX5_MKEY_MASK_RW |
+ MLX5_MKEY_MASK_SMALL_FENCE |
+ MLX5_MKEY_MASK_FREE |
+ MLX5_MKEY_MASK_BSF_EN;
+
+ return cpu_to_be64(result);
+}
+
+static void set_frwr_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
+ struct ib_send_wr *wr, int li)
+{
+ memset(umr, 0, sizeof(*umr));
+
+ if (li) {
+ umr->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE);
+ umr->flags = 1 << 7;
+ return;
+ }
+
+ umr->flags = (1 << 5); /* fail if not free */
+ umr->klm_octowords = get_klm_octo(wr->wr.fast_reg.page_list_len);
+ umr->mkey_mask = frwr_mkey_mask();
+}
+
+static __be64 get_umr_reg_mr_mask(void)
+{
+ u64 result;
+
+ result = MLX5_MKEY_MASK_LEN |
+ MLX5_MKEY_MASK_PAGE_SIZE |
+ MLX5_MKEY_MASK_START_ADDR |
+ MLX5_MKEY_MASK_PD |
+ MLX5_MKEY_MASK_LR |
+ MLX5_MKEY_MASK_LW |
+ MLX5_MKEY_MASK_KEY |
+ MLX5_MKEY_MASK_RR |
+ MLX5_MKEY_MASK_RW |
+ MLX5_MKEY_MASK_A |
+ MLX5_MKEY_MASK_FREE;
+
+ return cpu_to_be64(result);
+}
+
+static __be64 get_umr_unreg_mr_mask(void)
+{
+ u64 result;
+
+ result = MLX5_MKEY_MASK_FREE;
+
+ return cpu_to_be64(result);
+}
+
+static __be64 get_umr_update_mtt_mask(void)
+{
+ u64 result;
+
+ result = MLX5_MKEY_MASK_FREE;
+
+ return cpu_to_be64(result);
+}
+
+static void set_reg_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
+ struct ib_send_wr *wr)
+{
+ struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg;
+
+ memset(umr, 0, sizeof(*umr));
+
+ if (wr->send_flags & MLX5_IB_SEND_UMR_FAIL_IF_FREE)
+ umr->flags = MLX5_UMR_CHECK_FREE; /* fail if free */
+ else
+ umr->flags = MLX5_UMR_CHECK_NOT_FREE; /* fail if not free */
+
+ if (!(wr->send_flags & MLX5_IB_SEND_UMR_UNREG)) {
+ umr->klm_octowords = get_klm_octo(umrwr->npages);
+ if (wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT) {
+ umr->mkey_mask = get_umr_update_mtt_mask();
+ umr->bsf_octowords = get_klm_octo(umrwr->target.offset);
+ umr->flags |= MLX5_UMR_TRANSLATION_OFFSET_EN;
+ } else {
+ umr->mkey_mask = get_umr_reg_mr_mask();
+ }
+ } else {
+ umr->mkey_mask = get_umr_unreg_mr_mask();
+ }
+
+ if (!wr->num_sge)
+ umr->flags |= MLX5_UMR_INLINE;
+}
+
+static u8 get_umr_flags(int acc)
+{
+ return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC : 0) |
+ (acc & IB_ACCESS_REMOTE_WRITE ? MLX5_PERM_REMOTE_WRITE : 0) |
+ (acc & IB_ACCESS_REMOTE_READ ? MLX5_PERM_REMOTE_READ : 0) |
+ (acc & IB_ACCESS_LOCAL_WRITE ? MLX5_PERM_LOCAL_WRITE : 0) |
+ MLX5_PERM_LOCAL_READ | MLX5_PERM_UMR_EN;
+}
+
+static void set_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr,
+ int li, int *writ)
+{
+ memset(seg, 0, sizeof(*seg));
+ if (li) {
+ seg->status = MLX5_MKEY_STATUS_FREE;
+ return;
+ }
+
+ seg->flags = get_umr_flags(wr->wr.fast_reg.access_flags) |
+ MLX5_ACCESS_MODE_MTT;
+ *writ = seg->flags & (MLX5_PERM_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE);
+ seg->qpn_mkey7_0 = cpu_to_be32((wr->wr.fast_reg.rkey & 0xff) | 0xffffff00);
+ seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL);
+ seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start);
+ seg->len = cpu_to_be64(wr->wr.fast_reg.length);
+ seg->xlt_oct_size = cpu_to_be32((wr->wr.fast_reg.page_list_len + 1) / 2);
+ seg->log2_page_size = wr->wr.fast_reg.page_shift;
+}
+
+static void set_reg_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr)
+{
+ struct mlx5_umr_wr *umrwr = (struct mlx5_umr_wr *)&wr->wr.fast_reg;
+
+ memset(seg, 0, sizeof(*seg));
+ if (wr->send_flags & MLX5_IB_SEND_UMR_UNREG) {
+ seg->status = MLX5_MKEY_STATUS_FREE;
+ return;
+ }
+
+ seg->flags = convert_access(umrwr->access_flags);
+ if (!(wr->send_flags & MLX5_IB_SEND_UMR_UPDATE_MTT)) {
+ seg->flags_pd = cpu_to_be32(to_mpd(umrwr->pd)->pdn);
+ seg->start_addr = cpu_to_be64(umrwr->target.virt_addr);
+ }
+ seg->len = cpu_to_be64(umrwr->length);
+ seg->log2_page_size = umrwr->page_shift;
+ seg->qpn_mkey7_0 = cpu_to_be32(0xffffff00 |
+ mlx5_mkey_variant(umrwr->mkey));
+}
+
+static void set_frwr_pages(struct mlx5_wqe_data_seg *dseg,
+ struct ib_send_wr *wr,
+ struct mlx5_core_dev *mdev,
+ struct mlx5_ib_pd *pd,
+ int writ)
+{
+ struct mlx5_ib_fast_reg_page_list *mfrpl = to_mfrpl(wr->wr.fast_reg.page_list);
+ u64 *page_list = wr->wr.fast_reg.page_list->page_list;
+ u64 perm = MLX5_EN_RD | (writ ? MLX5_EN_WR : 0);
+ int i;
+
+ for (i = 0; i < wr->wr.fast_reg.page_list_len; i++)
+ mfrpl->mapped_page_list[i] = cpu_to_be64(page_list[i] | perm);
+ dseg->addr = cpu_to_be64(mfrpl->map);
+ dseg->byte_count = cpu_to_be32(ALIGN(sizeof(u64) * wr->wr.fast_reg.page_list_len, 64));
+ dseg->lkey = cpu_to_be32(pd->pa_lkey);
+}
+
+static __be32 send_ieth(struct ib_send_wr *wr)
+{
+ switch (wr->opcode) {
+ case IB_WR_SEND_WITH_IMM:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ return wr->ex.imm_data;
+
+ case IB_WR_SEND_WITH_INV:
+ return cpu_to_be32(wr->ex.invalidate_rkey);
+
+ default:
+ return 0;
+ }
+}
+
+static u8 calc_sig(void *wqe, int size)
+{
+ u8 *p = wqe;
+ u8 res = 0;
+ int i;
+
+ for (i = 0; i < size; i++)
+ res ^= p[i];
+
+ return ~res;
+}
+
+static u8 wq_sig(void *wqe)
+{
+ return calc_sig(wqe, (*((u8 *)wqe + 8) & 0x3f) << 4);
+}
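
calc_sig() is an inverted XOR over a span of bytes; wq_sig()/set_sig_seg() store it in the WQE signature byte when wq_sig is enabled. A standalone sketch of the same checksum over an arbitrary buffer; the buffer contents are made up:

#include <stdint.h>
#include <stdio.h>

static uint8_t xor_sig(const void *buf, int size)
{
        const uint8_t *p = buf;
        uint8_t res = 0;
        int i;

        for (i = 0; i < size; i++)
                res ^= p[i];
        return (uint8_t)~res;   /* invert so an all-zero buffer is not 0x00 */
}

int main(void)
{
        uint8_t wqe[8] = { 0x10, 0x20, 0x30, 0x40, 0x00, 0x00, 0x00, 0x01 };

        /* XOR-ing the payload bytes together with the stored signature
         * yields 0xff for an intact buffer; anything else means corruption. */
        printf("sig = 0x%02x\n", (unsigned int)xor_sig(wqe, sizeof(wqe)));
        return 0;
}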
+
+static int set_data_inl_seg(struct mlx5_ib_qp *qp, struct ib_send_wr *wr,
+ void *wqe, int *sz)
+{
+ struct mlx5_wqe_inline_seg *seg;
+ void *qend = qp->sq.qend;
+ void *addr;
+ int inl = 0;
+ int copy;
+ int len;
+ int i;
+
+ seg = wqe;
+ wqe += sizeof(*seg);
+ for (i = 0; i < wr->num_sge; i++) {
+ addr = (void *)(unsigned long)(wr->sg_list[i].addr);
+ len = wr->sg_list[i].length;
+ inl += len;
+
+ if (unlikely(inl > qp->max_inline_data))
+ return -ENOMEM;
+
+ if (unlikely(wqe + len > qend)) {
+ copy = qend - wqe;
+ memcpy(wqe, addr, copy);
+ addr += copy;
+ len -= copy;
+ wqe = mlx5_get_send_wqe(qp, 0);
+ }
+ memcpy(wqe, addr, len);
+ wqe += len;
+ }
+
+ seg->byte_count = cpu_to_be32(inl | MLX5_INLINE_SEG);
+
+ *sz = ALIGN(inl + sizeof(seg->byte_count), 16) / 16;
+
+ return 0;
+}
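
set_data_inl_seg() copies scatter entries inline into the send queue and, when a copy would run past sq.qend, splits it and continues from the start of the queue buffer. A standalone sketch of that split-and-wrap copy against a plain byte ring; RING_SIZE and ring_copy are invented for the example:

#include <stdio.h>
#include <string.h>

#define RING_SIZE 16    /* hypothetical queue size in bytes */

static void ring_copy(char *ring, size_t *pos, const char *src, size_t len)
{
        size_t room = RING_SIZE - *pos;

        if (len > room) {                     /* segment crosses the end ...   */
                memcpy(ring + *pos, src, room);
                src += room;
                len -= room;
                *pos = 0;                     /* ... so continue from the base */
        }
        memcpy(ring + *pos, src, len);
        *pos += len;
}

int main(void)
{
        char ring[RING_SIZE] = { 0 };
        size_t pos = 12;                      /* 4 bytes left before the wrap  */

        ring_copy(ring, &pos, "abcdefgh", 8); /* "efgh" lands at the start     */
        printf("%.4s ... %.4s, new pos %zu\n", ring, ring + 12, pos);
        return 0;
}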
+
+static u16 prot_field_size(enum ib_signature_type type)
+{
+ switch (type) {
+ case IB_SIG_TYPE_T10_DIF:
+ return MLX5_DIF_SIZE;
+ default:
+ return 0;
+ }
+}
+
+static u8 bs_selector(int block_size)
+{
+ switch (block_size) {
+ case 512: return 0x1;
+ case 520: return 0x2;
+ case 4096: return 0x3;
+ case 4160: return 0x4;
+ case 1073741824: return 0x5;
+ default: return 0;
+ }
+}
+
+static void mlx5_fill_inl_bsf(struct ib_sig_domain *domain,
+ struct mlx5_bsf_inl *inl)
+{
+ /* Valid inline section and allow BSF refresh */
+ inl->vld_refresh = cpu_to_be16(MLX5_BSF_INL_VALID |
+ MLX5_BSF_REFRESH_DIF);
+ inl->dif_apptag = cpu_to_be16(domain->sig.dif.app_tag);
+ inl->dif_reftag = cpu_to_be32(domain->sig.dif.ref_tag);
+ /* repeating block */
+ inl->rp_inv_seed = MLX5_BSF_REPEAT_BLOCK;
+ inl->sig_type = domain->sig.dif.bg_type == IB_T10DIF_CRC ?
+ MLX5_DIF_CRC : MLX5_DIF_IPCS;
+
+ if (domain->sig.dif.ref_remap)
+ inl->dif_inc_ref_guard_check |= MLX5_BSF_INC_REFTAG;
+
+ if (domain->sig.dif.app_escape) {
+ if (domain->sig.dif.ref_escape)
+ inl->dif_inc_ref_guard_check |= MLX5_BSF_APPREF_ESCAPE;
+ else
+ inl->dif_inc_ref_guard_check |= MLX5_BSF_APPTAG_ESCAPE;
+ }
+
+ inl->dif_app_bitmask_check =
+ cpu_to_be16(domain->sig.dif.apptag_check_mask);
+}
+
+static int mlx5_set_bsf(struct ib_mr *sig_mr,
+ struct ib_sig_attrs *sig_attrs,
+ struct mlx5_bsf *bsf, u32 data_size)
+{
+ struct mlx5_core_sig_ctx *msig = to_mmr(sig_mr)->sig;
+ struct mlx5_bsf_basic *basic = &bsf->basic;
+ struct ib_sig_domain *mem = &sig_attrs->mem;
+ struct ib_sig_domain *wire = &sig_attrs->wire;
+
+ memset(bsf, 0, sizeof(*bsf));
+
+ /* Basic + Extended + Inline */
+ basic->bsf_size_sbs = 1 << 7;
+ /* Input domain check byte mask */
+ basic->check_byte_mask = sig_attrs->check_mask;
+ basic->raw_data_size = cpu_to_be32(data_size);
+
+ /* Memory domain */
+ switch (sig_attrs->mem.sig_type) {
+ case IB_SIG_TYPE_NONE:
+ break;
+ case IB_SIG_TYPE_T10_DIF:
+ basic->mem.bs_selector = bs_selector(mem->sig.dif.pi_interval);
+ basic->m_bfs_psv = cpu_to_be32(msig->psv_memory.psv_idx);
+ mlx5_fill_inl_bsf(mem, &bsf->m_inl);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* Wire domain */
+ switch (sig_attrs->wire.sig_type) {
+ case IB_SIG_TYPE_NONE:
+ break;
+ case IB_SIG_TYPE_T10_DIF:
+ if (mem->sig.dif.pi_interval == wire->sig.dif.pi_interval &&
+ mem->sig_type == wire->sig_type) {
+ /* Same block structure */
+ basic->bsf_size_sbs |= 1 << 4;
+ if (mem->sig.dif.bg_type == wire->sig.dif.bg_type)
+ basic->wire.copy_byte_mask |= MLX5_CPY_GRD_MASK;
+ if (mem->sig.dif.app_tag == wire->sig.dif.app_tag)
+ basic->wire.copy_byte_mask |= MLX5_CPY_APP_MASK;
+ if (mem->sig.dif.ref_tag == wire->sig.dif.ref_tag)
+ basic->wire.copy_byte_mask |= MLX5_CPY_REF_MASK;
+ } else
+ basic->wire.bs_selector = bs_selector(wire->sig.dif.pi_interval);
+
+ basic->w_bfs_psv = cpu_to_be32(msig->psv_wire.psv_idx);
+ mlx5_fill_inl_bsf(wire, &bsf->w_inl);
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int set_sig_data_segment(struct ib_send_wr *wr, struct mlx5_ib_qp *qp,
+ void **seg, int *size)
+{
+ struct ib_sig_attrs *sig_attrs = wr->wr.sig_handover.sig_attrs;
+ struct ib_mr *sig_mr = wr->wr.sig_handover.sig_mr;
+ struct mlx5_bsf *bsf;
+ u32 data_len = wr->sg_list->length;
+ u32 data_key = wr->sg_list->lkey;
+ u64 data_va = wr->sg_list->addr;
+ int ret;
+ int wqe_size;
+
+ if (!wr->wr.sig_handover.prot ||
+ (data_key == wr->wr.sig_handover.prot->lkey &&
+ data_va == wr->wr.sig_handover.prot->addr &&
+ data_len == wr->wr.sig_handover.prot->length)) {
+ /**
+ * The source domain carries no signature information, or
+ * data and protection are interleaved in memory, so we
+ * need to construct:
+ * ------------------
+ * | data_klm |
+ * ------------------
+ * | BSF |
+ * ------------------
+ **/
+ struct mlx5_klm *data_klm = *seg;
+
+ data_klm->bcount = cpu_to_be32(data_len);
+ data_klm->key = cpu_to_be32(data_key);
+ data_klm->va = cpu_to_be64(data_va);
+ wqe_size = ALIGN(sizeof(*data_klm), 64);
+ } else {
+ /**
+ * The source domain contains signature information, so we
+ * need to construct a strided block format:
+ * ---------------------------
+ * | stride_block_ctrl |
+ * ---------------------------
+ * | data_klm |
+ * ---------------------------
+ * | prot_klm |
+ * ---------------------------
+ * | BSF |
+ * ---------------------------
+ **/
+ struct mlx5_stride_block_ctrl_seg *sblock_ctrl;
+ struct mlx5_stride_block_entry *data_sentry;
+ struct mlx5_stride_block_entry *prot_sentry;
+ u32 prot_key = wr->wr.sig_handover.prot->lkey;
+ u64 prot_va = wr->wr.sig_handover.prot->addr;
+ u16 block_size = sig_attrs->mem.sig.dif.pi_interval;
+ int prot_size;
+
+ sblock_ctrl = *seg;
+ data_sentry = (void *)sblock_ctrl + sizeof(*sblock_ctrl);
+ prot_sentry = (void *)data_sentry + sizeof(*data_sentry);
+
+ prot_size = prot_field_size(sig_attrs->mem.sig_type);
+ if (!prot_size) {
+ pr_err("Bad block size given: %u\n", block_size);
+ return -EINVAL;
+ }
+ sblock_ctrl->bcount_per_cycle = cpu_to_be32(block_size +
+ prot_size);
+ sblock_ctrl->op = cpu_to_be32(MLX5_STRIDE_BLOCK_OP);
+ sblock_ctrl->repeat_count = cpu_to_be32(data_len / block_size);
+ sblock_ctrl->num_entries = cpu_to_be16(2);
+
+ data_sentry->bcount = cpu_to_be16(block_size);
+ data_sentry->key = cpu_to_be32(data_key);
+ data_sentry->va = cpu_to_be64(data_va);
+ data_sentry->stride = cpu_to_be16(block_size);
+
+ prot_sentry->bcount = cpu_to_be16(prot_size);
+ prot_sentry->key = cpu_to_be32(prot_key);
+ prot_sentry->va = cpu_to_be64(prot_va);
+ prot_sentry->stride = cpu_to_be16(prot_size);
+
+ wqe_size = ALIGN(sizeof(*sblock_ctrl) + sizeof(*data_sentry) +
+ sizeof(*prot_sentry), 64);
+ }
+
+ *seg += wqe_size;
+ *size += wqe_size / 16;
+ if (unlikely((*seg == qp->sq.qend)))
+ *seg = mlx5_get_send_wqe(qp, 0);
+
+ bsf = *seg;
+ ret = mlx5_set_bsf(sig_mr, sig_attrs, bsf, data_len);
+ if (ret)
+ return -EINVAL;
+
+ *seg += sizeof(*bsf);
+ *size += sizeof(*bsf) / 16;
+ if (unlikely((*seg == qp->sq.qend)))
+ *seg = mlx5_get_send_wqe(qp, 0);
+
+ return 0;
+}
+
+static void set_sig_mkey_segment(struct mlx5_mkey_seg *seg,
+ struct ib_send_wr *wr, u32 nelements,
+ u32 length, u32 pdn)
+{
+ struct ib_mr *sig_mr = wr->wr.sig_handover.sig_mr;
+ u32 sig_key = sig_mr->rkey;
+ u8 sigerr = to_mmr(sig_mr)->sig->sigerr_count & 1;
+
+ memset(seg, 0, sizeof(*seg));
+
+ seg->flags = get_umr_flags(wr->wr.sig_handover.access_flags) |
+ MLX5_ACCESS_MODE_KLM;
+ seg->qpn_mkey7_0 = cpu_to_be32((sig_key & 0xff) | 0xffffff00);
+ seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL | sigerr << 26 |
+ MLX5_MKEY_BSF_EN | pdn);
+ seg->len = cpu_to_be64(length);
+ seg->xlt_oct_size = cpu_to_be32(be16_to_cpu(get_klm_octo(nelements)));
+ seg->bsfs_octo_size = cpu_to_be32(MLX5_MKEY_BSF_OCTO_SIZE);
+}
+
+static void set_sig_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
+ struct ib_send_wr *wr, u32 nelements)
+{
+ memset(umr, 0, sizeof(*umr));
+
+ umr->flags = MLX5_FLAGS_INLINE | MLX5_FLAGS_CHECK_FREE;
+ umr->klm_octowords = get_klm_octo(nelements);
+ umr->bsf_octowords = cpu_to_be16(MLX5_MKEY_BSF_OCTO_SIZE);
+ umr->mkey_mask = sig_mkey_mask();
+}
+
+
+static int set_sig_umr_wr(struct ib_send_wr *wr, struct mlx5_ib_qp *qp,
+ void **seg, int *size)
+{
+ struct mlx5_ib_mr *sig_mr = to_mmr(wr->wr.sig_handover.sig_mr);
+ u32 pdn = get_pd(qp)->pdn;
+ u32 klm_oct_size;
+ int region_len, ret;
+
+ if (unlikely(wr->num_sge != 1) ||
+ unlikely(wr->wr.sig_handover.access_flags &
+ IB_ACCESS_REMOTE_ATOMIC) ||
+ unlikely(!sig_mr->sig) || unlikely(!qp->signature_en) ||
+ unlikely(!sig_mr->sig->sig_status_checked))
+ return -EINVAL;
+
+ /* length of the protected region, data + protection */
+ region_len = wr->sg_list->length;
+ if (wr->wr.sig_handover.prot &&
+ (wr->wr.sig_handover.prot->lkey != wr->sg_list->lkey ||
+ wr->wr.sig_handover.prot->addr != wr->sg_list->addr ||
+ wr->wr.sig_handover.prot->length != wr->sg_list->length))
+ region_len += wr->wr.sig_handover.prot->length;
+
+ /**
+ * KLM octoword size - if protection was provided
+ * then we use strided block format (3 octowords),
+ * else we use single KLM (1 octoword)
+ **/
+ klm_oct_size = wr->wr.sig_handover.prot ? 3 : 1;
+
+ set_sig_umr_segment(*seg, wr, klm_oct_size);
+ *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
+ *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
+ if (unlikely((*seg == qp->sq.qend)))
+ *seg = mlx5_get_send_wqe(qp, 0);
+
+ set_sig_mkey_segment(*seg, wr, klm_oct_size, region_len, pdn);
+ *seg += sizeof(struct mlx5_mkey_seg);
+ *size += sizeof(struct mlx5_mkey_seg) / 16;
+ if (unlikely((*seg == qp->sq.qend)))
+ *seg = mlx5_get_send_wqe(qp, 0);
+
+ ret = set_sig_data_segment(wr, qp, seg, size);
+ if (ret)
+ return ret;
+
+ sig_mr->sig->sig_status_checked = false;
+ return 0;
+}
+
+static int set_psv_wr(struct ib_sig_domain *domain,
+ u32 psv_idx, void **seg, int *size)
+{
+ struct mlx5_seg_set_psv *psv_seg = *seg;
+
+ memset(psv_seg, 0, sizeof(*psv_seg));
+ psv_seg->psv_num = cpu_to_be32(psv_idx);
+ switch (domain->sig_type) {
+ case IB_SIG_TYPE_NONE:
+ break;
+ case IB_SIG_TYPE_T10_DIF:
+ psv_seg->transient_sig = cpu_to_be32(domain->sig.dif.bg << 16 |
+ domain->sig.dif.app_tag);
+ psv_seg->ref_tag = cpu_to_be32(domain->sig.dif.ref_tag);
+ break;
+ default:
+ pr_err("Bad signature type given.\n");
+ return 1;
+ }
+
+ *seg += sizeof(*psv_seg);
+ *size += sizeof(*psv_seg) / 16;
+
+ return 0;
+}
+
+static int set_frwr_li_wr(void **seg, struct ib_send_wr *wr, int *size,
+ struct mlx5_core_dev *mdev, struct mlx5_ib_pd *pd, struct mlx5_ib_qp *qp)
+{
+ int writ = 0;
+ int li;
+
+ li = wr->opcode == IB_WR_LOCAL_INV ? 1 : 0;
+ if (unlikely(wr->send_flags & IB_SEND_INLINE))
+ return -EINVAL;
+
+ set_frwr_umr_segment(*seg, wr, li);
+ *seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
+ *size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
+ if (unlikely((*seg == qp->sq.qend)))
+ *seg = mlx5_get_send_wqe(qp, 0);
+ set_mkey_segment(*seg, wr, li, &writ);
+ *seg += sizeof(struct mlx5_mkey_seg);
+ *size += sizeof(struct mlx5_mkey_seg) / 16;
+ if (unlikely((*seg == qp->sq.qend)))
+ *seg = mlx5_get_send_wqe(qp, 0);
+ if (!li) {
+ if (unlikely(wr->wr.fast_reg.page_list_len >
+ wr->wr.fast_reg.page_list->max_page_list_len))
+ return -ENOMEM;
+
+ set_frwr_pages(*seg, wr, mdev, pd, writ);
+ *seg += sizeof(struct mlx5_wqe_data_seg);
+ *size += (sizeof(struct mlx5_wqe_data_seg) / 16);
+ }
+ return 0;
+}
+
+static void dump_wqe(struct mlx5_ib_qp *qp, int idx, int size_16)
+{
+ __be32 *p = NULL;
+ int tidx = idx;
+ int i, j;
+
+ pr_debug("dump wqe at %p\n", mlx5_get_send_wqe(qp, tidx));
+ for (i = 0, j = 0; i < size_16 * 4; i += 4, j += 4) {
+ if ((i & 0xf) == 0) {
+ void *buf = mlx5_get_send_wqe(qp, tidx);
+ tidx = (tidx + 1) & (qp->sq.wqe_cnt - 1);
+ p = buf;
+ j = 0;
+ }
+ pr_debug("%08x %08x %08x %08x\n", be32_to_cpu(p[j]),
+ be32_to_cpu(p[j + 1]), be32_to_cpu(p[j + 2]),
+ be32_to_cpu(p[j + 3]));
+ }
+}
+
+static void mlx5_bf_copy(u64 __iomem *dst, u64 *src,
+ unsigned bytecnt, struct mlx5_ib_qp *qp)
+{
+ while (bytecnt > 0) {
+ __iowrite64_copy(dst++, src++, 8);
+ __iowrite64_copy(dst++, src++, 8);
+ __iowrite64_copy(dst++, src++, 8);
+ __iowrite64_copy(dst++, src++, 8);
+ __iowrite64_copy(dst++, src++, 8);
+ __iowrite64_copy(dst++, src++, 8);
+ __iowrite64_copy(dst++, src++, 8);
+ __iowrite64_copy(dst++, src++, 8);
+ bytecnt -= 64;
+ if (unlikely(src == qp->sq.qend))
+ src = mlx5_get_send_wqe(qp, 0);
+ }
+}
+
+static u8 get_fence(u8 fence, struct ib_send_wr *wr)
+{
+ if (unlikely(wr->opcode == IB_WR_LOCAL_INV &&
+ wr->send_flags & IB_SEND_FENCE))
+ return MLX5_FENCE_MODE_STRONG_ORDERING;
+
+ if (unlikely(fence)) {
+ if (wr->send_flags & IB_SEND_FENCE)
+ return MLX5_FENCE_MODE_SMALL_AND_FENCE;
+ else
+ return fence;
+
+ } else {
+ return 0;
+ }
+}
+
+static int begin_wqe(struct mlx5_ib_qp *qp, void **seg,
+ struct mlx5_wqe_ctrl_seg **ctrl,
+ struct ib_send_wr *wr, unsigned *idx,
+ int *size, int nreq)
+{
+ int err = 0;
+
+ if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq))) {
+ err = -ENOMEM;
+ return err;
+ }
+
+ *idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1);
+ *seg = mlx5_get_send_wqe(qp, *idx);
+ *ctrl = *seg;
+ *(uint32_t *)(*seg + 8) = 0;
+ (*ctrl)->imm = send_ieth(wr);
+ (*ctrl)->fm_ce_se = qp->sq_signal_bits |
+ (wr->send_flags & IB_SEND_SIGNALED ?
+ MLX5_WQE_CTRL_CQ_UPDATE : 0) |
+ (wr->send_flags & IB_SEND_SOLICITED ?
+ MLX5_WQE_CTRL_SOLICITED : 0);
+
+ *seg += sizeof(**ctrl);
+ *size = sizeof(**ctrl) / 16;
+
+ return err;
+}
+
+static void finish_wqe(struct mlx5_ib_qp *qp,
+ struct mlx5_wqe_ctrl_seg *ctrl,
+ u8 size, unsigned idx, u64 wr_id,
+ int nreq, u8 fence, u8 next_fence,
+ u32 mlx5_opcode)
+{
+ u8 opmod = 0;
+
+ ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8) |
+ mlx5_opcode | ((u32)opmod << 24));
+ ctrl->qpn_ds = cpu_to_be32(size | (qp->mqp.qpn << 8));
+ ctrl->fm_ce_se |= fence;
+ qp->fm_cache = next_fence;
+ if (unlikely(qp->wq_sig))
+ ctrl->signature = wq_sig(ctrl);
+
+ qp->sq.wrid[idx] = wr_id;
+ qp->sq.w_list[idx].opcode = mlx5_opcode;
+ qp->sq.wqe_head[idx] = qp->sq.head + nreq;
+ qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB);
+ qp->sq.w_list[idx].next = qp->sq.cur_post;
+}
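
finish_wqe() advances sq.cur_post by the number of 64-byte basic blocks the WQE occupies, derived from its size in 16-byte units with DIV_ROUND_UP. A standalone sketch of that rounding with a made-up WQE size; WQE_BB stands in for MLX5_SEND_WQE_BB:

#include <stdio.h>

#define WQE_BB 64   /* basic block size in bytes */

static unsigned int div_round_up(unsigned int n, unsigned int d)
{
        return (n + d - 1) / d;
}

int main(void)
{
        unsigned int cur_post = 10;
        unsigned int size16 = 5;   /* a hypothetical 80-byte WQE (5 * 16)      */

        cur_post += div_round_up(size16 * 16, WQE_BB);  /* 80 bytes -> 2 blocks */
        printf("next WQE starts at basic block %u\n", cur_post);  /* 12 */
        return 0;
}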
+
+
+int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr)
+{
+ struct mlx5_wqe_ctrl_seg *ctrl = NULL; /* compiler warning */
+ struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+ struct mlx5_core_dev *mdev = dev->mdev;
+ struct mlx5_ib_qp *qp = to_mqp(ibqp);
+ struct mlx5_ib_mr *mr;
+ struct mlx5_wqe_data_seg *dpseg;
+ struct mlx5_wqe_xrc_seg *xrc;
+ struct mlx5_bf *bf = qp->bf;
+ int uninitialized_var(size);
+ void *qend = qp->sq.qend;
+ unsigned long flags;
+ unsigned idx;
+ int err = 0;
+ int inl = 0;
+ int num_sge;
+ void *seg;
+ int nreq;
+ int i;
+ u8 next_fence = 0;
+ u8 fence;
+
+ spin_lock_irqsave(&qp->sq.lock, flags);
+
+ for (nreq = 0; wr; nreq++, wr = wr->next) {
+ if (unlikely(wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) {
+ mlx5_ib_warn(dev, "\n");
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ fence = qp->fm_cache;
+ num_sge = wr->num_sge;
+ if (unlikely(num_sge > qp->sq.max_gs)) {
+ mlx5_ib_warn(dev, "\n");
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ err = begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, nreq);
+ if (err) {
+ mlx5_ib_warn(dev, "\n");
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ switch (ibqp->qp_type) {
+ case IB_QPT_XRC_INI:
+ xrc = seg;
+ xrc->xrc_srqn = htonl(wr->xrc_remote_srq_num);
+ seg += sizeof(*xrc);
+ size += sizeof(*xrc) / 16;
+ /* fall through */
+ case IB_QPT_RC:
+ switch (wr->opcode) {
+ case IB_WR_RDMA_READ:
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ set_raddr_seg(seg, wr->wr.rdma.remote_addr,
+ wr->wr.rdma.rkey);
+ seg += sizeof(struct mlx5_wqe_raddr_seg);
+ size += sizeof(struct mlx5_wqe_raddr_seg) / 16;
+ break;
+
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
+ mlx5_ib_warn(dev, "Atomic operations are not supported yet\n");
+ err = -ENOSYS;
+ *bad_wr = wr;
+ goto out;
+
+ case IB_WR_LOCAL_INV:
+ next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
+ qp->sq.wr_data[idx] = IB_WR_LOCAL_INV;
+ ctrl->imm = cpu_to_be32(wr->ex.invalidate_rkey);
+ err = set_frwr_li_wr(&seg, wr, &size, mdev, to_mpd(ibqp->pd), qp);
+ if (err) {
+ mlx5_ib_warn(dev, "\n");
+ *bad_wr = wr;
+ goto out;
+ }
+ num_sge = 0;
+ break;
+
+ case IB_WR_FAST_REG_MR:
+ next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
+ qp->sq.wr_data[idx] = IB_WR_FAST_REG_MR;
+ ctrl->imm = cpu_to_be32(wr->wr.fast_reg.rkey);
+ err = set_frwr_li_wr(&seg, wr, &size, mdev, to_mpd(ibqp->pd), qp);
+ if (err) {
+ mlx5_ib_warn(dev, "\n");
+ *bad_wr = wr;
+ goto out;
+ }
+ num_sge = 0;
+ break;
+
+ case IB_WR_REG_SIG_MR:
+ qp->sq.wr_data[idx] = IB_WR_REG_SIG_MR;
+ mr = to_mmr(wr->wr.sig_handover.sig_mr);
+
+ ctrl->imm = cpu_to_be32(mr->ibmr.rkey);
+ err = set_sig_umr_wr(wr, qp, &seg, &size);
+ if (err) {
+ mlx5_ib_warn(dev, "\n");
+ *bad_wr = wr;
+ goto out;
+ }
+
+ finish_wqe(qp, ctrl, size, idx, wr->wr_id,
+ nreq, get_fence(fence, wr),
+ next_fence, MLX5_OPCODE_UMR);
+ /*
+ * SET_PSV WQEs are not signaled; they are only
+ * solicited on error
+ */
+ wr->send_flags &= ~IB_SEND_SIGNALED;
+ wr->send_flags |= IB_SEND_SOLICITED;
+ err = begin_wqe(qp, &seg, &ctrl, wr,
+ &idx, &size, nreq);
+ if (err) {
+ mlx5_ib_warn(dev, "\n");
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ err = set_psv_wr(&wr->wr.sig_handover.sig_attrs->mem,
+ mr->sig->psv_memory.psv_idx, &seg,
+ &size);
+ if (err) {
+ mlx5_ib_warn(dev, "\n");
+ *bad_wr = wr;
+ goto out;
+ }
+
+ finish_wqe(qp, ctrl, size, idx, wr->wr_id,
+ nreq, get_fence(fence, wr),
+ next_fence, MLX5_OPCODE_SET_PSV);
+ err = begin_wqe(qp, &seg, &ctrl, wr,
+ &idx, &size, nreq);
+ if (err) {
+ mlx5_ib_warn(dev, "\n");
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
+ err = set_psv_wr(&wr->wr.sig_handover.sig_attrs->wire,
+ mr->sig->psv_wire.psv_idx, &seg,
+ &size);
+ if (err) {
+ mlx5_ib_warn(dev, "\n");
+ *bad_wr = wr;
+ goto out;
+ }
+
+ finish_wqe(qp, ctrl, size, idx, wr->wr_id,
+ nreq, get_fence(fence, wr),
+ next_fence, MLX5_OPCODE_SET_PSV);
+ num_sge = 0;
+ goto skip_psv;
+
+ default:
+ break;
+ }
+ break;
+
+ case IB_QPT_UC:
+ switch (wr->opcode) {
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ set_raddr_seg(seg, wr->wr.rdma.remote_addr,
+ wr->wr.rdma.rkey);
+ seg += sizeof(struct mlx5_wqe_raddr_seg);
+ size += sizeof(struct mlx5_wqe_raddr_seg) / 16;
+ break;
+
+ default:
+ break;
+ }
+ break;
+
+ case IB_QPT_UD:
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ set_datagram_seg(seg, wr);
+ seg += sizeof(struct mlx5_wqe_datagram_seg);
+ size += sizeof(struct mlx5_wqe_datagram_seg) / 16;
+ if (unlikely((seg == qend)))
+ seg = mlx5_get_send_wqe(qp, 0);
+ break;
+
+ case MLX5_IB_QPT_REG_UMR:
+ if (wr->opcode != MLX5_IB_WR_UMR) {
+ err = -EINVAL;
+ mlx5_ib_warn(dev, "bad opcode\n");
+ *bad_wr = wr;
+ goto out;
+ }
+ qp->sq.wr_data[idx] = MLX5_IB_WR_UMR;
+ ctrl->imm = cpu_to_be32(wr->wr.fast_reg.rkey);
+ set_reg_umr_segment(seg, wr);
+ seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
+ size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
+ if (unlikely((seg == qend)))
+ seg = mlx5_get_send_wqe(qp, 0);
+ set_reg_mkey_segment(seg, wr);
+ seg += sizeof(struct mlx5_mkey_seg);
+ size += sizeof(struct mlx5_mkey_seg) / 16;
+ if (unlikely((seg == qend)))
+ seg = mlx5_get_send_wqe(qp, 0);
+ break;
+
+ default:
+ break;
+ }
+
+ if (wr->send_flags & IB_SEND_INLINE && num_sge) {
+ int uninitialized_var(sz);
+
+ err = set_data_inl_seg(qp, wr, seg, &sz);
+ if (unlikely(err)) {
+ mlx5_ib_warn(dev, "\n");
+ *bad_wr = wr;
+ goto out;
+ }
+ inl = 1;
+ size += sz;
+ } else {
+ dpseg = seg;
+ for (i = 0; i < num_sge; i++) {
+ if (unlikely(dpseg == qend)) {
+ seg = mlx5_get_send_wqe(qp, 0);
+ dpseg = seg;
+ }
+ if (likely(wr->sg_list[i].length)) {
+ set_data_ptr_seg(dpseg, wr->sg_list + i);
+ size += sizeof(struct mlx5_wqe_data_seg) / 16;
+ dpseg++;
+ }
+ }
+ }
+
+ finish_wqe(qp, ctrl, size, idx, wr->wr_id, nreq,
+ get_fence(fence, wr), next_fence,
+ mlx5_ib_opcode[wr->opcode]);
+skip_psv:
+ if (0)
+ dump_wqe(qp, idx, size);
+ }
+
+out:
+ if (likely(nreq)) {
+ qp->sq.head += nreq;
+
+ /* Make sure that descriptors are written before
+ * updating doorbell record and ringing the doorbell
+ */
+ wmb();
+
+ qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post);
+
+ /* Make sure doorbell record is visible to the HCA before
+ * we hit doorbell */
+ wmb();
+
+ if (bf->need_lock)
+ spin_lock(&bf->lock);
+ else
+ __acquire(&bf->lock);
+
+ /* TBD enable WC */
+ if (0 && nreq == 1 && bf->uuarn && inl && size > 1 && size <= bf->buf_size / 16) {
+ mlx5_bf_copy(bf->reg + bf->offset, (u64 *)ctrl, ALIGN(size * 16, 64), qp);
+ /* wc_wmb(); */
+ } else {
+ mlx5_write64((__be32 *)ctrl, bf->regreg + bf->offset,
+ MLX5_GET_DOORBELL_LOCK(&bf->lock32));
+ /* Make sure doorbells don't leak out of SQ spinlock
+ * and reach the HCA out of order.
+ */
+ mmiowb();
+ }
+ bf->offset ^= bf->buf_size;
+ if (bf->need_lock)
+ spin_unlock(&bf->lock);
+ else
+ __release(&bf->lock);
+ }
+
+ spin_unlock_irqrestore(&qp->sq.lock, flags);
+
+ return err;
+}
+
+static void set_sig_seg(struct mlx5_rwqe_sig *sig, int size)
+{
+ sig->signature = calc_sig(sig, size);
+}
+
+int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct mlx5_ib_qp *qp = to_mqp(ibqp);
+ struct mlx5_wqe_data_seg *scat;
+ struct mlx5_rwqe_sig *sig;
+ unsigned long flags;
+ int err = 0;
+ int nreq;
+ int ind;
+ int i;
+
+ spin_lock_irqsave(&qp->rq.lock, flags);
+
+ ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
+
+ for (nreq = 0; wr; nreq++, wr = wr->next) {
+ if (mlx5_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ scat = get_recv_wqe(qp, ind);
+ if (qp->wq_sig)
+ scat++;
+
+ for (i = 0; i < wr->num_sge; i++)
+ set_data_ptr_seg(scat + i, wr->sg_list + i);
+
+ if (i < qp->rq.max_gs) {
+ scat[i].byte_count = 0;
+ scat[i].lkey = cpu_to_be32(MLX5_INVALID_LKEY);
+ scat[i].addr = 0;
+ }
+
+ if (qp->wq_sig) {
+ sig = (struct mlx5_rwqe_sig *)scat;
+ set_sig_seg(sig, (qp->rq.max_gs + 1) << 2);
+ }
+
+ qp->rq.wrid[ind] = wr->wr_id;
+
+ ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
+ }
+
+out:
+ if (likely(nreq)) {
+ qp->rq.head += nreq;
+
+ /* Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+
+ *qp->db.db = cpu_to_be32(qp->rq.head & 0xffff);
+ }
+
+ spin_unlock_irqrestore(&qp->rq.lock, flags);
+
+ return err;
+}
+
+static inline enum ib_qp_state to_ib_qp_state(enum mlx5_qp_state mlx5_state)
+{
+ switch (mlx5_state) {
+ case MLX5_QP_STATE_RST: return IB_QPS_RESET;
+ case MLX5_QP_STATE_INIT: return IB_QPS_INIT;
+ case MLX5_QP_STATE_RTR: return IB_QPS_RTR;
+ case MLX5_QP_STATE_RTS: return IB_QPS_RTS;
+ case MLX5_QP_STATE_SQ_DRAINING:
+ case MLX5_QP_STATE_SQD: return IB_QPS_SQD;
+ case MLX5_QP_STATE_SQER: return IB_QPS_SQE;
+ case MLX5_QP_STATE_ERR: return IB_QPS_ERR;
+ default: return -1;
+ }
+}
+
+static inline enum ib_mig_state to_ib_mig_state(int mlx5_mig_state)
+{
+ switch (mlx5_mig_state) {
+ case MLX5_QP_PM_ARMED: return IB_MIG_ARMED;
+ case MLX5_QP_PM_REARM: return IB_MIG_REARM;
+ case MLX5_QP_PM_MIGRATED: return IB_MIG_MIGRATED;
+ default: return -1;
+ }
+}
+
+static int to_ib_qp_access_flags(int mlx5_flags)
+{
+ int ib_flags = 0;
+
+ if (mlx5_flags & MLX5_QP_BIT_RRE)
+ ib_flags |= IB_ACCESS_REMOTE_READ;
+ if (mlx5_flags & MLX5_QP_BIT_RWE)
+ ib_flags |= IB_ACCESS_REMOTE_WRITE;
+ if (mlx5_flags & MLX5_QP_BIT_RAE)
+ ib_flags |= IB_ACCESS_REMOTE_ATOMIC;
+
+ return ib_flags;
+}
+
+static void to_ib_ah_attr(struct mlx5_ib_dev *ibdev, struct ib_ah_attr *ib_ah_attr,
+ struct mlx5_qp_path *path)
+{
+ struct mlx5_core_dev *dev = ibdev->mdev;
+
+ memset(ib_ah_attr, 0, sizeof(*ib_ah_attr));
+ ib_ah_attr->port_num = path->port;
+
+ if (ib_ah_attr->port_num == 0 ||
+ ib_ah_attr->port_num > dev->caps.gen.num_ports)
+ return;
+
+ ib_ah_attr->sl = path->sl & 0xf;
+
+ ib_ah_attr->dlid = be16_to_cpu(path->rlid);
+ ib_ah_attr->src_path_bits = path->grh_mlid & 0x7f;
+ ib_ah_attr->static_rate = path->static_rate ? path->static_rate - 5 : 0;
+ ib_ah_attr->ah_flags = (path->grh_mlid & (1 << 7)) ? IB_AH_GRH : 0;
+ if (ib_ah_attr->ah_flags) {
+ ib_ah_attr->grh.sgid_index = path->mgid_index;
+ ib_ah_attr->grh.hop_limit = path->hop_limit;
+ ib_ah_attr->grh.traffic_class =
+ (be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff;
+ ib_ah_attr->grh.flow_label =
+ be32_to_cpu(path->tclass_flowlabel) & 0xfffff;
+ memcpy(ib_ah_attr->grh.dgid.raw,
+ path->rgid, sizeof(ib_ah_attr->grh.dgid.raw));
+ }
+}
+
+int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
+ struct mlx5_ib_qp *qp = to_mqp(ibqp);
+ struct mlx5_query_qp_mbox_out *outb;
+ struct mlx5_qp_context *context;
+ int mlx5_state;
+ int err = 0;
+
+#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
+ /*
+ * Wait for any outstanding page faults, in case the user frees memory
+ * based upon this query's result.
+ */
+ flush_workqueue(mlx5_ib_page_fault_wq);
+#endif
+
+ mutex_lock(&qp->mutex);
+ outb = kzalloc(sizeof(*outb), GFP_KERNEL);
+ if (!outb) {
+ err = -ENOMEM;
+ goto out;
+ }
+ context = &outb->ctx;
+ err = mlx5_core_qp_query(dev->mdev, &qp->mqp, outb, sizeof(*outb));
+ if (err)
+ goto out_free;
+
+ mlx5_state = be32_to_cpu(context->flags) >> 28;
+
+ qp->state = to_ib_qp_state(mlx5_state);
+ qp_attr->qp_state = qp->state;
+ qp_attr->path_mtu = context->mtu_msgmax >> 5;
+ qp_attr->path_mig_state =
+ to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3);
+ qp_attr->qkey = be32_to_cpu(context->qkey);
+ qp_attr->rq_psn = be32_to_cpu(context->rnr_nextrecvpsn) & 0xffffff;
+ qp_attr->sq_psn = be32_to_cpu(context->next_send_psn) & 0xffffff;
+ qp_attr->dest_qp_num = be32_to_cpu(context->log_pg_sz_remote_qpn) & 0xffffff;
+ qp_attr->qp_access_flags =
+ to_ib_qp_access_flags(be32_to_cpu(context->params2));
+
+ if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) {
+ to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path);
+ to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path);
+ qp_attr->alt_pkey_index = context->alt_path.pkey_index & 0x7f;
+ qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num;
+ }
+
+ qp_attr->pkey_index = context->pri_path.pkey_index & 0x7f;
+ qp_attr->port_num = context->pri_path.port;
+
+ /* qp_attr->en_sqd_async_notify is only applicable in modify qp */
+ qp_attr->sq_draining = mlx5_state == MLX5_QP_STATE_SQ_DRAINING;
+
+ qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context->params1) >> 21) & 0x7);
+
+ qp_attr->max_dest_rd_atomic =
+ 1 << ((be32_to_cpu(context->params2) >> 21) & 0x7);
+ qp_attr->min_rnr_timer =
+ (be32_to_cpu(context->rnr_nextrecvpsn) >> 24) & 0x1f;
+ qp_attr->timeout = context->pri_path.ackto_lt >> 3;
+ qp_attr->retry_cnt = (be32_to_cpu(context->params1) >> 16) & 0x7;
+ qp_attr->rnr_retry = (be32_to_cpu(context->params1) >> 13) & 0x7;
+ qp_attr->alt_timeout = context->alt_path.ackto_lt >> 3;
+ qp_attr->cur_qp_state = qp_attr->qp_state;
+ qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt;
+ qp_attr->cap.max_recv_sge = qp->rq.max_gs;
+
+ if (!ibqp->uobject) {
+ qp_attr->cap.max_send_wr = qp->sq.wqe_cnt;
+ qp_attr->cap.max_send_sge = qp->sq.max_gs;
+ } else {
+ qp_attr->cap.max_send_wr = 0;
+ qp_attr->cap.max_send_sge = 0;
+ }
+
+ /* We don't support inline sends for kernel QPs (yet), and we
+ * don't know what userspace's value should be.
+ */
+ qp_attr->cap.max_inline_data = 0;
+
+ qp_init_attr->cap = qp_attr->cap;
+
+ qp_init_attr->create_flags = 0;
+ if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK)
+ qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
+
+ qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ?
+ IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
+
+out_free:
+ kfree(outb);
+
+out:
+ mutex_unlock(&qp->mutex);
+ return err;
+}
+
+struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct mlx5_ib_dev *dev = to_mdev(ibdev);
+ struct mlx5_general_caps *gen;
+ struct mlx5_ib_xrcd *xrcd;
+ int err;
+
+ gen = &dev->mdev->caps.gen;
+ if (!(gen->flags & MLX5_DEV_CAP_FLAG_XRC))
+ return ERR_PTR(-ENOSYS);
+
+ xrcd = kmalloc(sizeof(*xrcd), GFP_KERNEL);
+ if (!xrcd)
+ return ERR_PTR(-ENOMEM);
+
+ err = mlx5_core_xrcd_alloc(dev->mdev, &xrcd->xrcdn);
+ if (err) {
+ kfree(xrcd);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return &xrcd->ibxrcd;
+}
+
+int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd)
+{
+ struct mlx5_ib_dev *dev = to_mdev(xrcd->device);
+ u32 xrcdn = to_mxrcd(xrcd)->xrcdn;
+ int err;
+
+ err = mlx5_core_xrcd_dealloc(dev->mdev, xrcdn);
+ if (err) {
+ mlx5_ib_warn(dev, "failed to dealloc xrcdn 0x%x\n", xrcdn);
+ return err;
+ }
+
+ kfree(xrcd);
+
+ return 0;
+}
diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c
new file mode 100644
index 000000000..02d77a297
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/srq.c
@@ -0,0 +1,485 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/mlx5/qp.h>
+#include <linux/mlx5/srq.h>
+#include <linux/slab.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "mlx5_ib.h"
+#include "user.h"
+
+/* not supported currently */
+static int srq_signature;
+
+static void *get_wqe(struct mlx5_ib_srq *srq, int n)
+{
+ return mlx5_buf_offset(&srq->buf, n << srq->msrq.wqe_shift);
+}
+
+static void mlx5_ib_srq_event(struct mlx5_core_srq *srq, enum mlx5_event type)
+{
+ struct ib_event event;
+ struct ib_srq *ibsrq = &to_mibsrq(srq)->ibsrq;
+
+ if (ibsrq->event_handler) {
+ event.device = ibsrq->device;
+ event.element.srq = ibsrq;
+ switch (type) {
+ case MLX5_EVENT_TYPE_SRQ_RQ_LIMIT:
+ event.event = IB_EVENT_SRQ_LIMIT_REACHED;
+ break;
+ case MLX5_EVENT_TYPE_SRQ_CATAS_ERROR:
+ event.event = IB_EVENT_SRQ_ERR;
+ break;
+ default:
+ pr_warn("mlx5_ib: Unexpected event type %d on SRQ %06x\n",
+ type, srq->srqn);
+ return;
+ }
+
+ ibsrq->event_handler(&event, ibsrq->srq_context);
+ }
+}
+
+static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq,
+ struct mlx5_create_srq_mbox_in **in,
+ struct ib_udata *udata, int buf_size, int *inlen)
+{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ struct mlx5_ib_create_srq ucmd;
+ size_t ucmdlen;
+ int err;
+ int npages;
+ int page_shift;
+ int ncont;
+ u32 offset;
+
+ ucmdlen =
+ (udata->inlen - sizeof(struct ib_uverbs_cmd_hdr) <
+ sizeof(ucmd)) ? (sizeof(ucmd) -
+ sizeof(ucmd.reserved)) : sizeof(ucmd);
+
+ if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) {
+ mlx5_ib_dbg(dev, "failed copy udata\n");
+ return -EFAULT;
+ }
+
+ if (ucmdlen == sizeof(ucmd) &&
+ ucmd.reserved != 0)
+ return -EINVAL;
+
+ srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE);
+
+ srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, buf_size,
+ 0, 0);
+ if (IS_ERR(srq->umem)) {
+ mlx5_ib_dbg(dev, "failed umem get, size %d\n", buf_size);
+ err = PTR_ERR(srq->umem);
+ return err;
+ }
+
+ mlx5_ib_cont_pages(srq->umem, ucmd.buf_addr, &npages,
+ &page_shift, &ncont, NULL);
+ err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift,
+ &offset);
+ if (err) {
+ mlx5_ib_warn(dev, "bad offset\n");
+ goto err_umem;
+ }
+
+ *inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont;
+ *in = mlx5_vzalloc(*inlen);
+ if (!(*in)) {
+ err = -ENOMEM;
+ goto err_umem;
+ }
+
+ mlx5_ib_populate_pas(dev, srq->umem, page_shift, (*in)->pas, 0);
+
+ err = mlx5_ib_db_map_user(to_mucontext(pd->uobject->context),
+ ucmd.db_addr, &srq->db);
+ if (err) {
+ mlx5_ib_dbg(dev, "map doorbell failed\n");
+ goto err_in;
+ }
+
+ (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
+ (*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26);
+
+ return 0;
+
+err_in:
+ kvfree(*in);
+
+err_umem:
+ ib_umem_release(srq->umem);
+
+ return err;
+}
+
+static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq,
+ struct mlx5_create_srq_mbox_in **in, int buf_size,
+ int *inlen)
+{
+ int err;
+ int i;
+ struct mlx5_wqe_srq_next_seg *next;
+ int page_shift;
+ int npages;
+
+ err = mlx5_db_alloc(dev->mdev, &srq->db);
+ if (err) {
+ mlx5_ib_warn(dev, "alloc dbell rec failed\n");
+ return err;
+ }
+
+ if (mlx5_buf_alloc(dev->mdev, buf_size, PAGE_SIZE * 2, &srq->buf)) {
+ mlx5_ib_dbg(dev, "buf alloc failed\n");
+ err = -ENOMEM;
+ goto err_db;
+ }
+ page_shift = srq->buf.page_shift;
+
+ srq->head = 0;
+ srq->tail = srq->msrq.max - 1;
+ srq->wqe_ctr = 0;
+
+ for (i = 0; i < srq->msrq.max; i++) {
+ next = get_wqe(srq, i);
+ next->next_wqe_index =
+ cpu_to_be16((i + 1) & (srq->msrq.max - 1));
+ }
+
+ npages = DIV_ROUND_UP(srq->buf.npages, 1 << (page_shift - PAGE_SHIFT));
+ mlx5_ib_dbg(dev, "buf_size %d, page_shift %d, npages %d, calc npages %d\n",
+ buf_size, page_shift, srq->buf.npages, npages);
+ *inlen = sizeof(**in) + sizeof(*(*in)->pas) * npages;
+ *in = mlx5_vzalloc(*inlen);
+ if (!*in) {
+ err = -ENOMEM;
+ goto err_buf;
+ }
+ mlx5_fill_page_array(&srq->buf, (*in)->pas);
+
+ srq->wrid = kmalloc(srq->msrq.max * sizeof(u64), GFP_KERNEL);
+ if (!srq->wrid) {
+ mlx5_ib_dbg(dev, "kmalloc failed %lu\n",
+ (unsigned long)(srq->msrq.max * sizeof(u64)));
+ err = -ENOMEM;
+ goto err_in;
+ }
+ srq->wq_sig = !!srq_signature;
+
+ (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT;
+
+ return 0;
+
+err_in:
+ kvfree(*in);
+
+err_buf:
+ mlx5_buf_free(dev->mdev, &srq->buf);
+
+err_db:
+ mlx5_db_free(dev->mdev, &srq->db);
+ return err;
+}
+
+static void destroy_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq)
+{
+ mlx5_ib_db_unmap_user(to_mucontext(pd->uobject->context), &srq->db);
+ ib_umem_release(srq->umem);
+}
+
+
+static void destroy_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq)
+{
+ kfree(srq->wrid);
+ mlx5_buf_free(dev->mdev, &srq->buf);
+ mlx5_db_free(dev->mdev, &srq->db);
+}
+
+struct ib_srq *mlx5_ib_create_srq(struct ib_pd *pd,
+ struct ib_srq_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct mlx5_ib_dev *dev = to_mdev(pd->device);
+ struct mlx5_general_caps *gen;
+ struct mlx5_ib_srq *srq;
+ int desc_size;
+ int buf_size;
+ int err;
+ struct mlx5_create_srq_mbox_in *uninitialized_var(in);
+ int uninitialized_var(inlen);
+ int is_xrc;
+ u32 flgs, xrcdn;
+
+ gen = &dev->mdev->caps.gen;
+ /* Sanity check SRQ size before proceeding */
+ if (init_attr->attr.max_wr >= gen->max_srq_wqes) {
+ mlx5_ib_dbg(dev, "max_wr %d, cap %d\n",
+ init_attr->attr.max_wr,
+ gen->max_srq_wqes);
+ return ERR_PTR(-EINVAL);
+ }
+
+ srq = kmalloc(sizeof(*srq), GFP_KERNEL);
+ if (!srq)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_init(&srq->mutex);
+ spin_lock_init(&srq->lock);
+ srq->msrq.max = roundup_pow_of_two(init_attr->attr.max_wr + 1);
+ srq->msrq.max_gs = init_attr->attr.max_sge;
+
+ desc_size = sizeof(struct mlx5_wqe_srq_next_seg) +
+ srq->msrq.max_gs * sizeof(struct mlx5_wqe_data_seg);
+ desc_size = roundup_pow_of_two(desc_size);
+ desc_size = max_t(int, 32, desc_size);
+ srq->msrq.max_avail_gather = (desc_size - sizeof(struct mlx5_wqe_srq_next_seg)) /
+ sizeof(struct mlx5_wqe_data_seg);
+ srq->msrq.wqe_shift = ilog2(desc_size);
+ buf_size = srq->msrq.max * desc_size;
+ mlx5_ib_dbg(dev, "desc_size 0x%x, req wr 0x%x, srq size 0x%x, max_gs 0x%x, max_avail_gather 0x%x\n",
+ desc_size, init_attr->attr.max_wr, srq->msrq.max, srq->msrq.max_gs,
+ srq->msrq.max_avail_gather);
+
+ if (pd->uobject)
+ err = create_srq_user(pd, srq, &in, udata, buf_size, &inlen);
+ else
+ err = create_srq_kernel(dev, srq, &in, buf_size, &inlen);
+
+ if (err) {
+ mlx5_ib_warn(dev, "create srq %s failed, err %d\n",
+ pd->uobject ? "user" : "kernel", err);
+ goto err_srq;
+ }
+
+ is_xrc = (init_attr->srq_type == IB_SRQT_XRC);
+ in->ctx.state_log_sz = ilog2(srq->msrq.max);
+ flgs = ((srq->msrq.wqe_shift - 4) | (is_xrc << 5) | (srq->wq_sig << 7)) << 24;
+ xrcdn = 0;
+ if (is_xrc) {
+ xrcdn = to_mxrcd(init_attr->ext.xrc.xrcd)->xrcdn;
+ in->ctx.pgoff_cqn |= cpu_to_be32(to_mcq(init_attr->ext.xrc.cq)->mcq.cqn);
+ } else if (init_attr->srq_type == IB_SRQT_BASIC) {
+ xrcdn = to_mxrcd(dev->devr.x0)->xrcdn;
+ in->ctx.pgoff_cqn |= cpu_to_be32(to_mcq(dev->devr.c0)->mcq.cqn);
+ }
+
+ in->ctx.flags_xrcd = cpu_to_be32((flgs & 0xFF000000) | (xrcdn & 0xFFFFFF));
+
+ in->ctx.pd = cpu_to_be32(to_mpd(pd)->pdn);
+ in->ctx.db_record = cpu_to_be64(srq->db.dma);
+ err = mlx5_core_create_srq(dev->mdev, &srq->msrq, in, inlen);
+ kvfree(in);
+ if (err) {
+ mlx5_ib_dbg(dev, "create SRQ failed, err %d\n", err);
+ goto err_usr_kern_srq;
+ }
+
+ mlx5_ib_dbg(dev, "create SRQ with srqn 0x%x\n", srq->msrq.srqn);
+
+ srq->msrq.event = mlx5_ib_srq_event;
+ srq->ibsrq.ext.xrc.srq_num = srq->msrq.srqn;
+
+ if (pd->uobject)
+ if (ib_copy_to_udata(udata, &srq->msrq.srqn, sizeof(__u32))) {
+ mlx5_ib_dbg(dev, "copy to user failed\n");
+ err = -EFAULT;
+ goto err_core;
+ }
+
+ init_attr->attr.max_wr = srq->msrq.max - 1;
+
+ return &srq->ibsrq;
+
+err_core:
+ mlx5_core_destroy_srq(dev->mdev, &srq->msrq);
+
+err_usr_kern_srq:
+ if (pd->uobject)
+ destroy_srq_user(pd, srq);
+ else
+ destroy_srq_kernel(dev, srq);
+
+err_srq:
+ kfree(srq);
+
+ return ERR_PTR(err);
+}
+
+int mlx5_ib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
+{
+ struct mlx5_ib_dev *dev = to_mdev(ibsrq->device);
+ struct mlx5_ib_srq *srq = to_msrq(ibsrq);
+ int ret;
+
+ /* We don't support resizing SRQs yet */
+ if (attr_mask & IB_SRQ_MAX_WR)
+ return -EINVAL;
+
+ if (attr_mask & IB_SRQ_LIMIT) {
+ if (attr->srq_limit >= srq->msrq.max)
+ return -EINVAL;
+
+ mutex_lock(&srq->mutex);
+ ret = mlx5_core_arm_srq(dev->mdev, &srq->msrq, attr->srq_limit, 1);
+ mutex_unlock(&srq->mutex);
+
+ if (ret)
+ return ret;
+ }
+
+ return 0;
+}
+
+int mlx5_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
+{
+ struct mlx5_ib_dev *dev = to_mdev(ibsrq->device);
+ struct mlx5_ib_srq *srq = to_msrq(ibsrq);
+ int ret;
+ struct mlx5_query_srq_mbox_out *out;
+
+ out = kzalloc(sizeof(*out), GFP_KERNEL);
+ if (!out)
+ return -ENOMEM;
+
+ ret = mlx5_core_query_srq(dev->mdev, &srq->msrq, out);
+ if (ret)
+ goto out_box;
+
+ srq_attr->srq_limit = be16_to_cpu(out->ctx.lwm);
+ srq_attr->max_wr = srq->msrq.max - 1;
+ srq_attr->max_sge = srq->msrq.max_gs;
+
+out_box:
+ kfree(out);
+ return ret;
+}
+
+int mlx5_ib_destroy_srq(struct ib_srq *srq)
+{
+ struct mlx5_ib_dev *dev = to_mdev(srq->device);
+ struct mlx5_ib_srq *msrq = to_msrq(srq);
+
+ mlx5_core_destroy_srq(dev->mdev, &msrq->msrq);
+
+ if (srq->uobject) {
+ mlx5_ib_db_unmap_user(to_mucontext(srq->uobject->context), &msrq->db);
+ ib_umem_release(msrq->umem);
+ } else {
+ destroy_srq_kernel(dev, msrq);
+ }
+
+ kfree(srq);
+ return 0;
+}
+
+void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index)
+{
+ struct mlx5_wqe_srq_next_seg *next;
+
+ /* always called with interrupts disabled. */
+ spin_lock(&srq->lock);
+
+ next = get_wqe(srq, srq->tail);
+ next->next_wqe_index = cpu_to_be16(wqe_index);
+ srq->tail = wqe_index;
+
+ spin_unlock(&srq->lock);
+}
+
+int mlx5_ib_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct mlx5_ib_srq *srq = to_msrq(ibsrq);
+ struct mlx5_wqe_srq_next_seg *next;
+ struct mlx5_wqe_data_seg *scat;
+ unsigned long flags;
+ int err = 0;
+ int nreq;
+ int i;
+
+ spin_lock_irqsave(&srq->lock, flags);
+
+ for (nreq = 0; wr; nreq++, wr = wr->next) {
+ if (unlikely(wr->num_sge > srq->msrq.max_gs)) {
+ err = -EINVAL;
+ *bad_wr = wr;
+ break;
+ }
+
+ if (unlikely(srq->head == srq->tail)) {
+ err = -ENOMEM;
+ *bad_wr = wr;
+ break;
+ }
+
+ srq->wrid[srq->head] = wr->wr_id;
+
+ next = get_wqe(srq, srq->head);
+ srq->head = be16_to_cpu(next->next_wqe_index);
+ scat = (struct mlx5_wqe_data_seg *)(next + 1);
+
+ for (i = 0; i < wr->num_sge; i++) {
+ scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length);
+ scat[i].lkey = cpu_to_be32(wr->sg_list[i].lkey);
+ scat[i].addr = cpu_to_be64(wr->sg_list[i].addr);
+ }
+
+ if (i < srq->msrq.max_avail_gather) {
+ scat[i].byte_count = 0;
+ scat[i].lkey = cpu_to_be32(MLX5_INVALID_LKEY);
+ scat[i].addr = 0;
+ }
+ }
+
+ if (likely(nreq)) {
+ srq->wqe_ctr += nreq;
+
+ /* Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+
+ *srq->db.db = cpu_to_be32(srq->wqe_ctr);
+ }
+
+ spin_unlock_irqrestore(&srq->lock, flags);
+
+ return err;
+}
diff --git a/drivers/infiniband/hw/mlx5/user.h b/drivers/infiniband/hw/mlx5/user.h
new file mode 100644
index 000000000..76fb7b927
--- /dev/null
+++ b/drivers/infiniband/hw/mlx5/user.h
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2013-2015, Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MLX5_IB_USER_H
+#define MLX5_IB_USER_H
+
+#include <linux/types.h>
+
+enum {
+ MLX5_QP_FLAG_SIGNATURE = 1 << 0,
+ MLX5_QP_FLAG_SCATTER_CQE = 1 << 1,
+};
+
+enum {
+ MLX5_SRQ_FLAG_SIGNATURE = 1 << 0,
+};
+
+
+/* Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define MLX5_IB_UVERBS_ABI_VERSION 1
+
+/* Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
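+/*
+ * For example, a userspace provider hands a buffer pointer to the kernel
+ * through one of the __u64 address fields below roughly as
+ *
+ *	cmd.buf_addr = (__u64)(uintptr_t)buf;
+ *
+ * ("cmd" and "buf" being placeholder names), which keeps the struct
+ * layout identical on 32-bit and 64-bit builds.
+ */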
+
+struct mlx5_ib_alloc_ucontext_req {
+ __u32 total_num_uuars;
+ __u32 num_low_latency_uuars;
+};
+
+struct mlx5_ib_alloc_ucontext_req_v2 {
+ __u32 total_num_uuars;
+ __u32 num_low_latency_uuars;
+ __u32 flags;
+ __u32 reserved;
+};
+
+struct mlx5_ib_alloc_ucontext_resp {
+ __u32 qp_tab_size;
+ __u32 bf_reg_size;
+ __u32 tot_uuars;
+ __u32 cache_line_size;
+ __u16 max_sq_desc_sz;
+ __u16 max_rq_desc_sz;
+ __u32 max_send_wqebb;
+ __u32 max_recv_wr;
+ __u32 max_srq_recv_wr;
+ __u16 num_ports;
+ __u16 reserved;
+};
+
+struct mlx5_ib_alloc_pd_resp {
+ __u32 pdn;
+};
+
+struct mlx5_ib_create_cq {
+ __u64 buf_addr;
+ __u64 db_addr;
+ __u32 cqe_size;
+ __u32 reserved; /* explicit padding (optional on i386) */
+};
+
+struct mlx5_ib_create_cq_resp {
+ __u32 cqn;
+ __u32 reserved;
+};
+
+struct mlx5_ib_resize_cq {
+ __u64 buf_addr;
+ __u16 cqe_size;
+ __u16 reserved0;
+ __u32 reserved1;
+};
+
+struct mlx5_ib_create_srq {
+ __u64 buf_addr;
+ __u64 db_addr;
+ __u32 flags;
+ __u32 reserved; /* explicit padding (optional on i386) */
+};
+
+struct mlx5_ib_create_srq_resp {
+ __u32 srqn;
+ __u32 reserved;
+};
+
+struct mlx5_ib_create_qp {
+ __u64 buf_addr;
+ __u64 db_addr;
+ __u32 sq_wqe_count;
+ __u32 rq_wqe_count;
+ __u32 rq_wqe_shift;
+ __u32 flags;
+};
+
+struct mlx5_ib_create_qp_resp {
+ __u32 uuar_index;
+};
+#endif /* MLX5_IB_USER_H */
diff --git a/drivers/infiniband/hw/mthca/Kconfig b/drivers/infiniband/hw/mthca/Kconfig
new file mode 100644
index 000000000..da314c3fe
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/Kconfig
@@ -0,0 +1,17 @@
+config INFINIBAND_MTHCA
+ tristate "Mellanox HCA support"
+ depends on PCI
+ ---help---
+ This is a low-level driver for Mellanox InfiniHost host
+ channel adapters (HCAs), including the MT23108 PCI-X HCA
+ ("Tavor") and the MT25208 PCI Express HCA ("Arbel").
+
+config INFINIBAND_MTHCA_DEBUG
+ bool "Verbose debugging output" if EXPERT
+ depends on INFINIBAND_MTHCA
+ default y
+ ---help---
+ This option causes debugging code to be compiled into the
+ mthca driver. The output can be turned on via the
+ debug_level module parameter (which can also be set after
+ the driver is loaded through sysfs).
diff --git a/drivers/infiniband/hw/mthca/Makefile b/drivers/infiniband/hw/mthca/Makefile
new file mode 100644
index 000000000..e388d95d0
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/Makefile
@@ -0,0 +1,7 @@
+obj-$(CONFIG_INFINIBAND_MTHCA) += ib_mthca.o
+
+ib_mthca-y := mthca_main.o mthca_cmd.o mthca_profile.o mthca_reset.o \
+ mthca_allocator.o mthca_eq.o mthca_pd.o mthca_cq.o \
+ mthca_mr.o mthca_qp.o mthca_av.o mthca_mcg.o mthca_mad.o \
+ mthca_provider.o mthca_memfree.o mthca_uar.o mthca_srq.o \
+ mthca_catas.o
diff --git a/drivers/infiniband/hw/mthca/mthca_allocator.c b/drivers/infiniband/hw/mthca/mthca_allocator.c
new file mode 100644
index 000000000..b4e0cf4e9
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_allocator.c
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/bitmap.h>
+
+#include "mthca_dev.h"
+
+/* Trivial bitmap-based allocator */
+u32 mthca_alloc(struct mthca_alloc *alloc)
+{
+ unsigned long flags;
+ u32 obj;
+
+ spin_lock_irqsave(&alloc->lock, flags);
+
+ obj = find_next_zero_bit(alloc->table, alloc->max, alloc->last);
+ if (obj >= alloc->max) {
+ alloc->top = (alloc->top + alloc->max) & alloc->mask;
+ obj = find_first_zero_bit(alloc->table, alloc->max);
+ }
+
+ if (obj < alloc->max) {
+ set_bit(obj, alloc->table);
+ obj |= alloc->top;
+ } else
+ obj = -1;
+
+ spin_unlock_irqrestore(&alloc->lock, flags);
+
+ return obj;
+}
+
+void mthca_free(struct mthca_alloc *alloc, u32 obj)
+{
+ unsigned long flags;
+
+ obj &= alloc->max - 1;
+
+ spin_lock_irqsave(&alloc->lock, flags);
+
+ clear_bit(obj, alloc->table);
+ alloc->last = min(alloc->last, obj);
+ alloc->top = (alloc->top + alloc->max) & alloc->mask;
+
+ spin_unlock_irqrestore(&alloc->lock, flags);
+}
+
+int mthca_alloc_init(struct mthca_alloc *alloc, u32 num, u32 mask,
+ u32 reserved)
+{
+ int i;
+
+ /* num must be a power of 2 */
+ if (num != 1 << (ffs(num) - 1))
+ return -EINVAL;
+
+ alloc->last = 0;
+ alloc->top = 0;
+ alloc->max = num;
+ alloc->mask = mask;
+ spin_lock_init(&alloc->lock);
+ alloc->table = kmalloc(BITS_TO_LONGS(num) * sizeof (long),
+ GFP_KERNEL);
+ if (!alloc->table)
+ return -ENOMEM;
+
+ bitmap_zero(alloc->table, num);
+ for (i = 0; i < reserved; ++i)
+ set_bit(i, alloc->table);
+
+ return 0;
+}
+
+void mthca_alloc_cleanup(struct mthca_alloc *alloc)
+{
+ kfree(alloc->table);
+}
+
+/*
+ * Array of pointers with lazy allocation of leaf pages. Callers of
+ * _get, _set and _clear methods must use a lock or otherwise
+ * serialize access to the array.
+ */
+
+#define MTHCA_ARRAY_MASK (PAGE_SIZE / sizeof (void *) - 1)
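+
+/*
+ * For example, with 4 KiB pages and 8-byte pointers MTHCA_ARRAY_MASK is
+ * 511, so index 1000 lands in leaf page 1 at slot 1000 & 511 = 488; the
+ * leaf page itself is only allocated on the first _set in that range.
+ */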
+
+void *mthca_array_get(struct mthca_array *array, int index)
+{
+ int p = (index * sizeof (void *)) >> PAGE_SHIFT;
+
+ if (array->page_list[p].page)
+ return array->page_list[p].page[index & MTHCA_ARRAY_MASK];
+ else
+ return NULL;
+}
+
+int mthca_array_set(struct mthca_array *array, int index, void *value)
+{
+ int p = (index * sizeof (void *)) >> PAGE_SHIFT;
+
+ /* Allocate with GFP_ATOMIC because we'll be called with locks held. */
+ if (!array->page_list[p].page)
+ array->page_list[p].page = (void **) get_zeroed_page(GFP_ATOMIC);
+
+ if (!array->page_list[p].page)
+ return -ENOMEM;
+
+ array->page_list[p].page[index & MTHCA_ARRAY_MASK] = value;
+ ++array->page_list[p].used;
+
+ return 0;
+}
+
+void mthca_array_clear(struct mthca_array *array, int index)
+{
+ int p = (index * sizeof (void *)) >> PAGE_SHIFT;
+
+ if (--array->page_list[p].used == 0) {
+ free_page((unsigned long) array->page_list[p].page);
+ array->page_list[p].page = NULL;
+ } else
+ array->page_list[p].page[index & MTHCA_ARRAY_MASK] = NULL;
+
+ if (array->page_list[p].used < 0)
+ pr_debug("Array %p index %d page %d with ref count %d < 0\n",
+ array, index, p, array->page_list[p].used);
+}
+
+int mthca_array_init(struct mthca_array *array, int nent)
+{
+ int npage = (nent * sizeof (void *) + PAGE_SIZE - 1) / PAGE_SIZE;
+ int i;
+
+ array->page_list = kmalloc(npage * sizeof *array->page_list, GFP_KERNEL);
+ if (!array->page_list)
+ return -ENOMEM;
+
+ for (i = 0; i < npage; ++i) {
+ array->page_list[i].page = NULL;
+ array->page_list[i].used = 0;
+ }
+
+ return 0;
+}
+
+void mthca_array_cleanup(struct mthca_array *array, int nent)
+{
+ int i;
+
+ for (i = 0; i < (nent * sizeof (void *) + PAGE_SIZE - 1) / PAGE_SIZE; ++i)
+ free_page((unsigned long) array->page_list[i].page);
+
+ kfree(array->page_list);
+}
+
+/*
+ * Handling for queue buffers -- we allocate a bunch of memory and
+ * register it in a memory region at HCA virtual address 0. If the
+ * requested size is > max_direct, we split the allocation into
+ * multiple pages, so we don't require too much contiguous memory.
+ */
+
+int mthca_buf_alloc(struct mthca_dev *dev, int size, int max_direct,
+ union mthca_buf *buf, int *is_direct, struct mthca_pd *pd,
+ int hca_write, struct mthca_mr *mr)
+{
+ int err = -ENOMEM;
+ int npages, shift;
+ u64 *dma_list = NULL;
+ dma_addr_t t;
+ int i;
+
+ if (size <= max_direct) {
+ *is_direct = 1;
+ npages = 1;
+ shift = get_order(size) + PAGE_SHIFT;
+
+ buf->direct.buf = dma_alloc_coherent(&dev->pdev->dev,
+ size, &t, GFP_KERNEL);
+ if (!buf->direct.buf)
+ return -ENOMEM;
+
+ dma_unmap_addr_set(&buf->direct, mapping, t);
+
+ memset(buf->direct.buf, 0, size);
+
+ while (t & ((1 << shift) - 1)) {
+ --shift;
+ npages *= 2;
+ }
+
+ dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+ if (!dma_list)
+ goto err_free;
+
+ for (i = 0; i < npages; ++i)
+ dma_list[i] = t + i * (1 << shift);
+ } else {
+ *is_direct = 0;
+ npages = (size + PAGE_SIZE - 1) / PAGE_SIZE;
+ shift = PAGE_SHIFT;
+
+ dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+ if (!dma_list)
+ return -ENOMEM;
+
+ buf->page_list = kmalloc(npages * sizeof *buf->page_list,
+ GFP_KERNEL);
+ if (!buf->page_list)
+ goto err_out;
+
+ for (i = 0; i < npages; ++i)
+ buf->page_list[i].buf = NULL;
+
+ for (i = 0; i < npages; ++i) {
+ buf->page_list[i].buf =
+ dma_alloc_coherent(&dev->pdev->dev, PAGE_SIZE,
+ &t, GFP_KERNEL);
+ if (!buf->page_list[i].buf)
+ goto err_free;
+
+ dma_list[i] = t;
+ dma_unmap_addr_set(&buf->page_list[i], mapping, t);
+
+ clear_page(buf->page_list[i].buf);
+ }
+ }
+
+ err = mthca_mr_alloc_phys(dev, pd->pd_num,
+ dma_list, shift, npages,
+ 0, size,
+ MTHCA_MPT_FLAG_LOCAL_READ |
+ (hca_write ? MTHCA_MPT_FLAG_LOCAL_WRITE : 0),
+ mr);
+ if (err)
+ goto err_free;
+
+ kfree(dma_list);
+
+ return 0;
+
+err_free:
+ mthca_buf_free(dev, size, buf, *is_direct, NULL);
+
+err_out:
+ kfree(dma_list);
+
+ return err;
+}
+
+void mthca_buf_free(struct mthca_dev *dev, int size, union mthca_buf *buf,
+ int is_direct, struct mthca_mr *mr)
+{
+ int i;
+
+ if (mr)
+ mthca_free_mr(dev, mr);
+
+ if (is_direct)
+ dma_free_coherent(&dev->pdev->dev, size, buf->direct.buf,
+ dma_unmap_addr(&buf->direct, mapping));
+ else {
+ for (i = 0; i < (size + PAGE_SIZE - 1) / PAGE_SIZE; ++i)
+ dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
+ buf->page_list[i].buf,
+ dma_unmap_addr(&buf->page_list[i],
+ mapping));
+ kfree(buf->page_list);
+ }
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_av.c b/drivers/infiniband/hw/mthca/mthca_av.c
new file mode 100644
index 000000000..32f6c6315
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_av.c
@@ -0,0 +1,374 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/string.h>
+#include <linux/slab.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
+
+#include "mthca_dev.h"
+
+enum {
+ MTHCA_RATE_TAVOR_FULL = 0,
+ MTHCA_RATE_TAVOR_1X = 1,
+ MTHCA_RATE_TAVOR_4X = 2,
+ MTHCA_RATE_TAVOR_1X_DDR = 3
+};
+
+enum {
+ MTHCA_RATE_MEMFREE_FULL = 0,
+ MTHCA_RATE_MEMFREE_QUARTER = 1,
+ MTHCA_RATE_MEMFREE_EIGHTH = 2,
+ MTHCA_RATE_MEMFREE_HALF = 3
+};
+
+struct mthca_av {
+ __be32 port_pd;
+ u8 reserved1;
+ u8 g_slid;
+ __be16 dlid;
+ u8 reserved2;
+ u8 gid_index;
+ u8 msg_sr;
+ u8 hop_limit;
+ __be32 sl_tclass_flowlabel;
+ __be32 dgid[4];
+};
+
+static enum ib_rate memfree_rate_to_ib(u8 mthca_rate, u8 port_rate)
+{
+ switch (mthca_rate) {
+ case MTHCA_RATE_MEMFREE_EIGHTH:
+ return mult_to_ib_rate(port_rate >> 3);
+ case MTHCA_RATE_MEMFREE_QUARTER:
+ return mult_to_ib_rate(port_rate >> 2);
+ case MTHCA_RATE_MEMFREE_HALF:
+ return mult_to_ib_rate(port_rate >> 1);
+ case MTHCA_RATE_MEMFREE_FULL:
+ default:
+ return mult_to_ib_rate(port_rate);
+ }
+}
+
+static enum ib_rate tavor_rate_to_ib(u8 mthca_rate, u8 port_rate)
+{
+ switch (mthca_rate) {
+ case MTHCA_RATE_TAVOR_1X: return IB_RATE_2_5_GBPS;
+ case MTHCA_RATE_TAVOR_1X_DDR: return IB_RATE_5_GBPS;
+ case MTHCA_RATE_TAVOR_4X: return IB_RATE_10_GBPS;
+ default: return mult_to_ib_rate(port_rate);
+ }
+}
+
+enum ib_rate mthca_rate_to_ib(struct mthca_dev *dev, u8 mthca_rate, u8 port)
+{
+ if (mthca_is_memfree(dev)) {
+ /* Handle old Arbel FW */
+ if (dev->limits.stat_rate_support == 0x3 && mthca_rate)
+ return IB_RATE_2_5_GBPS;
+
+ return memfree_rate_to_ib(mthca_rate, dev->rate[port - 1]);
+ } else
+ return tavor_rate_to_ib(mthca_rate, dev->rate[port - 1]);
+}
+
+static u8 ib_rate_to_memfree(u8 req_rate, u8 cur_rate)
+{
+ if (cur_rate <= req_rate)
+ return 0;
+
+ /*
+ * Inter-packet delay (IPD) to get from rate X down to a rate
+ * no more than Y is (X - 1) / Y.
+ */
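+	/*
+	 * For example, on a 4X port (cur_rate 4) a requested 1X rate
+	 * (req_rate 1) gives (4 - 1) / 1 = 3, i.e. MTHCA_RATE_MEMFREE_QUARTER,
+	 * while a requested 2X rate gives (4 - 1) / 2 = 1, i.e.
+	 * MTHCA_RATE_MEMFREE_HALF.
+	 */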
+ switch ((cur_rate - 1) / req_rate) {
+ case 0: return MTHCA_RATE_MEMFREE_FULL;
+ case 1: return MTHCA_RATE_MEMFREE_HALF;
+ case 2: /* fall through */
+ case 3: return MTHCA_RATE_MEMFREE_QUARTER;
+ default: return MTHCA_RATE_MEMFREE_EIGHTH;
+ }
+}
+
+static u8 ib_rate_to_tavor(u8 static_rate)
+{
+ switch (static_rate) {
+ case IB_RATE_2_5_GBPS: return MTHCA_RATE_TAVOR_1X;
+ case IB_RATE_5_GBPS: return MTHCA_RATE_TAVOR_1X_DDR;
+ case IB_RATE_10_GBPS: return MTHCA_RATE_TAVOR_4X;
+ default: return MTHCA_RATE_TAVOR_FULL;
+ }
+}
+
+u8 mthca_get_rate(struct mthca_dev *dev, int static_rate, u8 port)
+{
+ u8 rate;
+
+ if (!static_rate || ib_rate_to_mult(static_rate) >= dev->rate[port - 1])
+ return 0;
+
+ if (mthca_is_memfree(dev))
+ rate = ib_rate_to_memfree(ib_rate_to_mult(static_rate),
+ dev->rate[port - 1]);
+ else
+ rate = ib_rate_to_tavor(static_rate);
+
+ if (!(dev->limits.stat_rate_support & (1 << rate)))
+ rate = 1;
+
+ return rate;
+}
+
+int mthca_create_ah(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct ib_ah_attr *ah_attr,
+ struct mthca_ah *ah)
+{
+ u32 index = -1;
+ struct mthca_av *av = NULL;
+
+ ah->type = MTHCA_AH_PCI_POOL;
+
+ if (mthca_is_memfree(dev)) {
+ ah->av = kmalloc(sizeof *ah->av, GFP_ATOMIC);
+ if (!ah->av)
+ return -ENOMEM;
+
+ ah->type = MTHCA_AH_KMALLOC;
+ av = ah->av;
+ } else if (!atomic_read(&pd->sqp_count) &&
+ !(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
+ index = mthca_alloc(&dev->av_table.alloc);
+
+ /* fall back to allocate in host memory */
+ if (index == -1)
+ goto on_hca_fail;
+
+ av = kmalloc(sizeof *av, GFP_ATOMIC);
+ if (!av)
+ goto on_hca_fail;
+
+ ah->type = MTHCA_AH_ON_HCA;
+ ah->avdma = dev->av_table.ddr_av_base +
+ index * MTHCA_AV_SIZE;
+ }
+
+on_hca_fail:
+ if (ah->type == MTHCA_AH_PCI_POOL) {
+ ah->av = pci_pool_alloc(dev->av_table.pool,
+ GFP_ATOMIC, &ah->avdma);
+ if (!ah->av)
+ return -ENOMEM;
+
+ av = ah->av;
+ }
+
+ ah->key = pd->ntmr.ibmr.lkey;
+
+ memset(av, 0, MTHCA_AV_SIZE);
+
+ av->port_pd = cpu_to_be32(pd->pd_num | (ah_attr->port_num << 24));
+ av->g_slid = ah_attr->src_path_bits;
+ av->dlid = cpu_to_be16(ah_attr->dlid);
+ av->msg_sr = (3 << 4) | /* 2K message */
+ mthca_get_rate(dev, ah_attr->static_rate, ah_attr->port_num);
+ av->sl_tclass_flowlabel = cpu_to_be32(ah_attr->sl << 28);
+ if (ah_attr->ah_flags & IB_AH_GRH) {
+ av->g_slid |= 0x80;
+ av->gid_index = (ah_attr->port_num - 1) * dev->limits.gid_table_len +
+ ah_attr->grh.sgid_index;
+ av->hop_limit = ah_attr->grh.hop_limit;
+ av->sl_tclass_flowlabel |=
+ cpu_to_be32((ah_attr->grh.traffic_class << 20) |
+ ah_attr->grh.flow_label);
+ memcpy(av->dgid, ah_attr->grh.dgid.raw, 16);
+ } else {
+ /* Arbel workaround -- low byte of GID must be 2 */
+ av->dgid[3] = cpu_to_be32(2);
+ }
+
+ if (0) {
+ int j;
+
+ mthca_dbg(dev, "Created UDAV at %p/%08lx:\n",
+ av, (unsigned long) ah->avdma);
+ for (j = 0; j < 8; ++j)
+ printk(KERN_DEBUG " [%2x] %08x\n",
+ j * 4, be32_to_cpu(((__be32 *) av)[j]));
+ }
+
+ if (ah->type == MTHCA_AH_ON_HCA) {
+ memcpy_toio(dev->av_table.av_map + index * MTHCA_AV_SIZE,
+ av, MTHCA_AV_SIZE);
+ kfree(av);
+ }
+
+ return 0;
+}
+
+int mthca_destroy_ah(struct mthca_dev *dev, struct mthca_ah *ah)
+{
+ switch (ah->type) {
+ case MTHCA_AH_ON_HCA:
+ mthca_free(&dev->av_table.alloc,
+ (ah->avdma - dev->av_table.ddr_av_base) /
+ MTHCA_AV_SIZE);
+ break;
+
+ case MTHCA_AH_PCI_POOL:
+ pci_pool_free(dev->av_table.pool, ah->av, ah->avdma);
+ break;
+
+ case MTHCA_AH_KMALLOC:
+ kfree(ah->av);
+ break;
+ }
+
+ return 0;
+}
+
+int mthca_ah_grh_present(struct mthca_ah *ah)
+{
+ return !!(ah->av->g_slid & 0x80);
+}
+
+int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah,
+ struct ib_ud_header *header)
+{
+ if (ah->type == MTHCA_AH_ON_HCA)
+ return -EINVAL;
+
+ header->lrh.service_level = be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 28;
+ header->lrh.destination_lid = ah->av->dlid;
+ header->lrh.source_lid = cpu_to_be16(ah->av->g_slid & 0x7f);
+ if (mthca_ah_grh_present(ah)) {
+ header->grh.traffic_class =
+ (be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 20) & 0xff;
+ header->grh.flow_label =
+ ah->av->sl_tclass_flowlabel & cpu_to_be32(0xfffff);
+ header->grh.hop_limit = ah->av->hop_limit;
+ ib_get_cached_gid(&dev->ib_dev,
+ be32_to_cpu(ah->av->port_pd) >> 24,
+ ah->av->gid_index % dev->limits.gid_table_len,
+ &header->grh.source_gid);
+ memcpy(header->grh.destination_gid.raw,
+ ah->av->dgid, 16);
+ }
+
+ return 0;
+}
+
+int mthca_ah_query(struct ib_ah *ibah, struct ib_ah_attr *attr)
+{
+ struct mthca_ah *ah = to_mah(ibah);
+ struct mthca_dev *dev = to_mdev(ibah->device);
+
+ /* Only implement for MAD and memfree ah for now. */
+ if (ah->type == MTHCA_AH_ON_HCA)
+ return -ENOSYS;
+
+ memset(attr, 0, sizeof *attr);
+ attr->dlid = be16_to_cpu(ah->av->dlid);
+ attr->sl = be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 28;
+ attr->port_num = be32_to_cpu(ah->av->port_pd) >> 24;
+ attr->static_rate = mthca_rate_to_ib(dev, ah->av->msg_sr & 0x7,
+ attr->port_num);
+ attr->src_path_bits = ah->av->g_slid & 0x7F;
+ attr->ah_flags = mthca_ah_grh_present(ah) ? IB_AH_GRH : 0;
+
+ if (attr->ah_flags) {
+ attr->grh.traffic_class =
+ be32_to_cpu(ah->av->sl_tclass_flowlabel) >> 20;
+ attr->grh.flow_label =
+ be32_to_cpu(ah->av->sl_tclass_flowlabel) & 0xfffff;
+ attr->grh.hop_limit = ah->av->hop_limit;
+ attr->grh.sgid_index = ah->av->gid_index &
+ (dev->limits.gid_table_len - 1);
+ memcpy(attr->grh.dgid.raw, ah->av->dgid, 16);
+ }
+
+ return 0;
+}
+
+int mthca_init_av_table(struct mthca_dev *dev)
+{
+ int err;
+
+ if (mthca_is_memfree(dev))
+ return 0;
+
+ err = mthca_alloc_init(&dev->av_table.alloc,
+ dev->av_table.num_ddr_avs,
+ dev->av_table.num_ddr_avs - 1,
+ 0);
+ if (err)
+ return err;
+
+ dev->av_table.pool = pci_pool_create("mthca_av", dev->pdev,
+ MTHCA_AV_SIZE,
+ MTHCA_AV_SIZE, 0);
+ if (!dev->av_table.pool)
+ goto out_free_alloc;
+
+ if (!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
+ dev->av_table.av_map = ioremap(pci_resource_start(dev->pdev, 4) +
+ dev->av_table.ddr_av_base -
+ dev->ddr_start,
+ dev->av_table.num_ddr_avs *
+ MTHCA_AV_SIZE);
+ if (!dev->av_table.av_map)
+ goto out_free_pool;
+ } else
+ dev->av_table.av_map = NULL;
+
+ return 0;
+
+ out_free_pool:
+ pci_pool_destroy(dev->av_table.pool);
+
+ out_free_alloc:
+ mthca_alloc_cleanup(&dev->av_table.alloc);
+ return -ENOMEM;
+}
+
+void mthca_cleanup_av_table(struct mthca_dev *dev)
+{
+ if (mthca_is_memfree(dev))
+ return;
+
+ if (dev->av_table.av_map)
+ iounmap(dev->av_table.av_map);
+ pci_pool_destroy(dev->av_table.pool);
+ mthca_alloc_cleanup(&dev->av_table.alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_catas.c b/drivers/infiniband/hw/mthca/mthca_catas.c
new file mode 100644
index 000000000..712d2a30f
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_catas.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
+
+#include "mthca_dev.h"
+
+enum {
+ MTHCA_CATAS_POLL_INTERVAL = 5 * HZ,
+
+ MTHCA_CATAS_TYPE_INTERNAL = 0,
+ MTHCA_CATAS_TYPE_UPLINK = 3,
+ MTHCA_CATAS_TYPE_DDR = 4,
+ MTHCA_CATAS_TYPE_PARITY = 5,
+};
+
+static DEFINE_SPINLOCK(catas_lock);
+
+static LIST_HEAD(catas_list);
+static struct workqueue_struct *catas_wq;
+static struct work_struct catas_work;
+
+static int catas_reset_disable;
+module_param_named(catas_reset_disable, catas_reset_disable, int, 0644);
+MODULE_PARM_DESC(catas_reset_disable, "disable reset on catastrophic event if nonzero");
+
+static void catas_reset(struct work_struct *work)
+{
+ struct mthca_dev *dev, *tmpdev;
+ LIST_HEAD(tlist);
+ int ret;
+
+ mutex_lock(&mthca_device_mutex);
+
+ spin_lock_irq(&catas_lock);
+ list_splice_init(&catas_list, &tlist);
+ spin_unlock_irq(&catas_lock);
+
+ list_for_each_entry_safe(dev, tmpdev, &tlist, catas_err.list) {
+ struct pci_dev *pdev = dev->pdev;
+ ret = __mthca_restart_one(dev->pdev);
+ /* 'dev' now is not valid */
+ if (ret)
+ printk(KERN_ERR "mthca %s: Reset failed (%d)\n",
+ pci_name(pdev), ret);
+ else {
+ struct mthca_dev *d = pci_get_drvdata(pdev);
+ mthca_dbg(d, "Reset succeeded\n");
+ }
+ }
+
+ mutex_unlock(&mthca_device_mutex);
+}
+
+static void handle_catas(struct mthca_dev *dev)
+{
+ struct ib_event event;
+ unsigned long flags;
+ const char *type;
+ int i;
+
+ event.device = &dev->ib_dev;
+ event.event = IB_EVENT_DEVICE_FATAL;
+ event.element.port_num = 0;
+ dev->active = false;
+
+ ib_dispatch_event(&event);
+
+ switch (swab32(readl(dev->catas_err.map)) >> 24) {
+ case MTHCA_CATAS_TYPE_INTERNAL:
+ type = "internal error";
+ break;
+ case MTHCA_CATAS_TYPE_UPLINK:
+ type = "uplink bus error";
+ break;
+ case MTHCA_CATAS_TYPE_DDR:
+ type = "DDR data error";
+ break;
+ case MTHCA_CATAS_TYPE_PARITY:
+ type = "internal parity error";
+ break;
+ default:
+ type = "unknown error";
+ break;
+ }
+
+ mthca_err(dev, "Catastrophic error detected: %s\n", type);
+ for (i = 0; i < dev->catas_err.size; ++i)
+ mthca_err(dev, " buf[%02x]: %08x\n",
+ i, swab32(readl(dev->catas_err.map + i)));
+
+ if (catas_reset_disable)
+ return;
+
+ spin_lock_irqsave(&catas_lock, flags);
+ list_add(&dev->catas_err.list, &catas_list);
+ queue_work(catas_wq, &catas_work);
+ spin_unlock_irqrestore(&catas_lock, flags);
+}
+
+static void poll_catas(unsigned long dev_ptr)
+{
+ struct mthca_dev *dev = (struct mthca_dev *) dev_ptr;
+ int i;
+
+ for (i = 0; i < dev->catas_err.size; ++i)
+ if (readl(dev->catas_err.map + i)) {
+ handle_catas(dev);
+ return;
+ }
+
+ mod_timer(&dev->catas_err.timer,
+ round_jiffies(jiffies + MTHCA_CATAS_POLL_INTERVAL));
+}
+
+void mthca_start_catas_poll(struct mthca_dev *dev)
+{
+ phys_addr_t addr;
+
+ init_timer(&dev->catas_err.timer);
+ dev->catas_err.map = NULL;
+
+ addr = pci_resource_start(dev->pdev, 0) +
+ ((pci_resource_len(dev->pdev, 0) - 1) &
+ dev->catas_err.addr);
+
+ dev->catas_err.map = ioremap(addr, dev->catas_err.size * 4);
+ if (!dev->catas_err.map) {
+ mthca_warn(dev, "couldn't map catastrophic error region "
+ "at 0x%llx/0x%x\n", (unsigned long long) addr,
+ dev->catas_err.size * 4);
+ return;
+ }
+
+ dev->catas_err.timer.data = (unsigned long) dev;
+ dev->catas_err.timer.function = poll_catas;
+ dev->catas_err.timer.expires = jiffies + MTHCA_CATAS_POLL_INTERVAL;
+ INIT_LIST_HEAD(&dev->catas_err.list);
+ add_timer(&dev->catas_err.timer);
+}
+
+void mthca_stop_catas_poll(struct mthca_dev *dev)
+{
+ del_timer_sync(&dev->catas_err.timer);
+
+ if (dev->catas_err.map)
+ iounmap(dev->catas_err.map);
+
+ spin_lock_irq(&catas_lock);
+ list_del(&dev->catas_err.list);
+ spin_unlock_irq(&catas_lock);
+}
+
+int __init mthca_catas_init(void)
+{
+ INIT_WORK(&catas_work, catas_reset);
+
+ catas_wq = create_singlethread_workqueue("mthca_catas");
+ if (!catas_wq)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void mthca_catas_cleanup(void)
+{
+ destroy_workqueue(catas_wq);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c
new file mode 100644
index 000000000..9d3e5c1ac
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_cmd.c
@@ -0,0 +1,1969 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/completion.h>
+#include <linux/pci.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <asm/io.h>
+#include <rdma/ib_mad.h>
+
+#include "mthca_dev.h"
+#include "mthca_config_reg.h"
+#include "mthca_cmd.h"
+#include "mthca_memfree.h"
+
+#define CMD_POLL_TOKEN 0xffff
+
+enum {
+ HCR_IN_PARAM_OFFSET = 0x00,
+ HCR_IN_MODIFIER_OFFSET = 0x08,
+ HCR_OUT_PARAM_OFFSET = 0x0c,
+ HCR_TOKEN_OFFSET = 0x14,
+ HCR_STATUS_OFFSET = 0x18,
+
+ HCR_OPMOD_SHIFT = 12,
+ HCA_E_BIT = 22,
+ HCR_GO_BIT = 23
+};
+
+enum {
+ /* initialization and general commands */
+ CMD_SYS_EN = 0x1,
+ CMD_SYS_DIS = 0x2,
+ CMD_MAP_FA = 0xfff,
+ CMD_UNMAP_FA = 0xffe,
+ CMD_RUN_FW = 0xff6,
+ CMD_MOD_STAT_CFG = 0x34,
+ CMD_QUERY_DEV_LIM = 0x3,
+ CMD_QUERY_FW = 0x4,
+ CMD_ENABLE_LAM = 0xff8,
+ CMD_DISABLE_LAM = 0xff7,
+ CMD_QUERY_DDR = 0x5,
+ CMD_QUERY_ADAPTER = 0x6,
+ CMD_INIT_HCA = 0x7,
+ CMD_CLOSE_HCA = 0x8,
+ CMD_INIT_IB = 0x9,
+ CMD_CLOSE_IB = 0xa,
+ CMD_QUERY_HCA = 0xb,
+ CMD_SET_IB = 0xc,
+ CMD_ACCESS_DDR = 0x2e,
+ CMD_MAP_ICM = 0xffa,
+ CMD_UNMAP_ICM = 0xff9,
+ CMD_MAP_ICM_AUX = 0xffc,
+ CMD_UNMAP_ICM_AUX = 0xffb,
+ CMD_SET_ICM_SIZE = 0xffd,
+
+ /* TPT commands */
+ CMD_SW2HW_MPT = 0xd,
+ CMD_QUERY_MPT = 0xe,
+ CMD_HW2SW_MPT = 0xf,
+ CMD_READ_MTT = 0x10,
+ CMD_WRITE_MTT = 0x11,
+ CMD_SYNC_TPT = 0x2f,
+
+ /* EQ commands */
+ CMD_MAP_EQ = 0x12,
+ CMD_SW2HW_EQ = 0x13,
+ CMD_HW2SW_EQ = 0x14,
+ CMD_QUERY_EQ = 0x15,
+
+ /* CQ commands */
+ CMD_SW2HW_CQ = 0x16,
+ CMD_HW2SW_CQ = 0x17,
+ CMD_QUERY_CQ = 0x18,
+ CMD_RESIZE_CQ = 0x2c,
+
+ /* SRQ commands */
+ CMD_SW2HW_SRQ = 0x35,
+ CMD_HW2SW_SRQ = 0x36,
+ CMD_QUERY_SRQ = 0x37,
+ CMD_ARM_SRQ = 0x40,
+
+ /* QP/EE commands */
+ CMD_RST2INIT_QPEE = 0x19,
+ CMD_INIT2RTR_QPEE = 0x1a,
+ CMD_RTR2RTS_QPEE = 0x1b,
+ CMD_RTS2RTS_QPEE = 0x1c,
+ CMD_SQERR2RTS_QPEE = 0x1d,
+ CMD_2ERR_QPEE = 0x1e,
+ CMD_RTS2SQD_QPEE = 0x1f,
+ CMD_SQD2SQD_QPEE = 0x38,
+ CMD_SQD2RTS_QPEE = 0x20,
+ CMD_ERR2RST_QPEE = 0x21,
+ CMD_QUERY_QPEE = 0x22,
+ CMD_INIT2INIT_QPEE = 0x2d,
+ CMD_SUSPEND_QPEE = 0x32,
+ CMD_UNSUSPEND_QPEE = 0x33,
+ /* special QPs and management commands */
+ CMD_CONF_SPECIAL_QP = 0x23,
+ CMD_MAD_IFC = 0x24,
+
+ /* multicast commands */
+ CMD_READ_MGM = 0x25,
+ CMD_WRITE_MGM = 0x26,
+ CMD_MGID_HASH = 0x27,
+
+ /* miscellaneous commands */
+ CMD_DIAG_RPRT = 0x30,
+ CMD_NOP = 0x31,
+
+ /* debug commands */
+ CMD_QUERY_DEBUG_MSG = 0x2a,
+ CMD_SET_DEBUG_MSG = 0x2b,
+};
+
+/*
+ * According to Mellanox code, FW may be starved and never complete
+ * commands. So we can't use strict timeouts described in PRM -- we
+ * just arbitrarily select 60 seconds for now.
+ */
+#if 0
+/*
+ * Round up and add 1 to make sure we get the full wait time (since we
+ * will be starting in the middle of a jiffy)
+ */
+enum {
+ CMD_TIME_CLASS_A = (HZ + 999) / 1000 + 1,
+ CMD_TIME_CLASS_B = (HZ + 99) / 100 + 1,
+ CMD_TIME_CLASS_C = (HZ + 9) / 10 + 1,
+ CMD_TIME_CLASS_D = 60 * HZ
+};
+#else
+enum {
+ CMD_TIME_CLASS_A = 60 * HZ,
+ CMD_TIME_CLASS_B = 60 * HZ,
+ CMD_TIME_CLASS_C = 60 * HZ,
+ CMD_TIME_CLASS_D = 60 * HZ
+};
+#endif
+
+enum {
+ GO_BIT_TIMEOUT = HZ * 10
+};
+
+struct mthca_cmd_context {
+ struct completion done;
+ int result;
+ int next;
+ u64 out_param;
+ u16 token;
+ u8 status;
+};
+
+static int fw_cmd_doorbell = 0;
+module_param(fw_cmd_doorbell, int, 0644);
+MODULE_PARM_DESC(fw_cmd_doorbell, "post FW commands through doorbell page if nonzero "
+ "(and supported by FW)");
+
+static inline int go_bit(struct mthca_dev *dev)
+{
+ return readl(dev->hcr + HCR_STATUS_OFFSET) &
+ swab32(1 << HCR_GO_BIT);
+}
+
+static void mthca_cmd_post_dbell(struct mthca_dev *dev,
+ u64 in_param,
+ u64 out_param,
+ u32 in_modifier,
+ u8 op_modifier,
+ u16 op,
+ u16 token)
+{
+ void __iomem *ptr = dev->cmd.dbell_map;
+ u16 *offs = dev->cmd.dbell_offsets;
+
+ __raw_writel((__force u32) cpu_to_be32(in_param >> 32), ptr + offs[0]);
+ wmb();
+ __raw_writel((__force u32) cpu_to_be32(in_param & 0xfffffffful), ptr + offs[1]);
+ wmb();
+ __raw_writel((__force u32) cpu_to_be32(in_modifier), ptr + offs[2]);
+ wmb();
+ __raw_writel((__force u32) cpu_to_be32(out_param >> 32), ptr + offs[3]);
+ wmb();
+ __raw_writel((__force u32) cpu_to_be32(out_param & 0xfffffffful), ptr + offs[4]);
+ wmb();
+ __raw_writel((__force u32) cpu_to_be32(token << 16), ptr + offs[5]);
+ wmb();
+ __raw_writel((__force u32) cpu_to_be32((1 << HCR_GO_BIT) |
+ (1 << HCA_E_BIT) |
+ (op_modifier << HCR_OPMOD_SHIFT) |
+ op), ptr + offs[6]);
+ wmb();
+ __raw_writel((__force u32) 0, ptr + offs[7]);
+ wmb();
+}
+
+static int mthca_cmd_post_hcr(struct mthca_dev *dev,
+ u64 in_param,
+ u64 out_param,
+ u32 in_modifier,
+ u8 op_modifier,
+ u16 op,
+ u16 token,
+ int event)
+{
+ if (event) {
+ unsigned long end = jiffies + GO_BIT_TIMEOUT;
+
+ while (go_bit(dev) && time_before(jiffies, end)) {
+ set_current_state(TASK_RUNNING);
+ schedule();
+ }
+ }
+
+ if (go_bit(dev))
+ return -EAGAIN;
+
+ /*
+ * We use writel (instead of something like memcpy_toio)
+ * because writes of less than 32 bits to the HCR don't work
+ * (and some architectures such as ia64 implement memcpy_toio
+ * in terms of writeb).
+ */
+ __raw_writel((__force u32) cpu_to_be32(in_param >> 32), dev->hcr + 0 * 4);
+ __raw_writel((__force u32) cpu_to_be32(in_param & 0xfffffffful), dev->hcr + 1 * 4);
+ __raw_writel((__force u32) cpu_to_be32(in_modifier), dev->hcr + 2 * 4);
+ __raw_writel((__force u32) cpu_to_be32(out_param >> 32), dev->hcr + 3 * 4);
+ __raw_writel((__force u32) cpu_to_be32(out_param & 0xfffffffful), dev->hcr + 4 * 4);
+ __raw_writel((__force u32) cpu_to_be32(token << 16), dev->hcr + 5 * 4);
+
+ /* __raw_writel may not order writes. */
+ wmb();
+
+ __raw_writel((__force u32) cpu_to_be32((1 << HCR_GO_BIT) |
+ (event ? (1 << HCA_E_BIT) : 0) |
+ (op_modifier << HCR_OPMOD_SHIFT) |
+ op), dev->hcr + 6 * 4);
+
+ return 0;
+}
+
+static int mthca_cmd_post(struct mthca_dev *dev,
+ u64 in_param,
+ u64 out_param,
+ u32 in_modifier,
+ u8 op_modifier,
+ u16 op,
+ u16 token,
+ int event)
+{
+ int err = 0;
+
+ mutex_lock(&dev->cmd.hcr_mutex);
+
+ if (event && dev->cmd.flags & MTHCA_CMD_POST_DOORBELLS && fw_cmd_doorbell)
+ mthca_cmd_post_dbell(dev, in_param, out_param, in_modifier,
+ op_modifier, op, token);
+ else
+ err = mthca_cmd_post_hcr(dev, in_param, out_param, in_modifier,
+ op_modifier, op, token, event);
+
+ /*
+ * Make sure that our HCR writes don't get mixed in with
+ * writes from another CPU starting a FW command.
+ */
+ mmiowb();
+
+ mutex_unlock(&dev->cmd.hcr_mutex);
+ return err;
+}
+
+
+static int mthca_status_to_errno(u8 status)
+{
+ static const int trans_table[] = {
+ [MTHCA_CMD_STAT_INTERNAL_ERR] = -EIO,
+ [MTHCA_CMD_STAT_BAD_OP] = -EPERM,
+ [MTHCA_CMD_STAT_BAD_PARAM] = -EINVAL,
+ [MTHCA_CMD_STAT_BAD_SYS_STATE] = -ENXIO,
+ [MTHCA_CMD_STAT_BAD_RESOURCE] = -EBADF,
+ [MTHCA_CMD_STAT_RESOURCE_BUSY] = -EBUSY,
+ [MTHCA_CMD_STAT_DDR_MEM_ERR] = -ENOMEM,
+ [MTHCA_CMD_STAT_EXCEED_LIM] = -ENOMEM,
+ [MTHCA_CMD_STAT_BAD_RES_STATE] = -EBADF,
+ [MTHCA_CMD_STAT_BAD_INDEX] = -EBADF,
+ [MTHCA_CMD_STAT_BAD_NVMEM] = -EFAULT,
+ [MTHCA_CMD_STAT_BAD_QPEE_STATE] = -EINVAL,
+ [MTHCA_CMD_STAT_BAD_SEG_PARAM] = -EFAULT,
+ [MTHCA_CMD_STAT_REG_BOUND] = -EBUSY,
+ [MTHCA_CMD_STAT_LAM_NOT_PRE] = -EAGAIN,
+ [MTHCA_CMD_STAT_BAD_PKT] = -EBADMSG,
+ [MTHCA_CMD_STAT_BAD_SIZE] = -ENOMEM,
+ };
+
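+	/* Unknown or unmapped status codes fall back to -EINVAL. */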
+ if (status >= ARRAY_SIZE(trans_table) ||
+ (status != MTHCA_CMD_STAT_OK
+ && trans_table[status] == 0))
+ return -EINVAL;
+
+ return trans_table[status];
+}
+
+
+static int mthca_cmd_poll(struct mthca_dev *dev,
+ u64 in_param,
+ u64 *out_param,
+ int out_is_imm,
+ u32 in_modifier,
+ u8 op_modifier,
+ u16 op,
+ unsigned long timeout)
+{
+ int err = 0;
+ unsigned long end;
+ u8 status;
+
+ down(&dev->cmd.poll_sem);
+
+ err = mthca_cmd_post(dev, in_param,
+ out_param ? *out_param : 0,
+ in_modifier, op_modifier,
+ op, CMD_POLL_TOKEN, 0);
+ if (err)
+ goto out;
+
+ end = timeout + jiffies;
+ while (go_bit(dev) && time_before(jiffies, end)) {
+ set_current_state(TASK_RUNNING);
+ schedule();
+ }
+
+ if (go_bit(dev)) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ if (out_is_imm)
+ *out_param =
+ (u64) be32_to_cpu((__force __be32)
+ __raw_readl(dev->hcr + HCR_OUT_PARAM_OFFSET)) << 32 |
+ (u64) be32_to_cpu((__force __be32)
+ __raw_readl(dev->hcr + HCR_OUT_PARAM_OFFSET + 4));
+
+ status = be32_to_cpu((__force __be32) __raw_readl(dev->hcr + HCR_STATUS_OFFSET)) >> 24;
+ if (status) {
+ mthca_dbg(dev, "Command %02x completed with status %02x\n",
+ op, status);
+ err = mthca_status_to_errno(status);
+ }
+
+out:
+ up(&dev->cmd.poll_sem);
+ return err;
+}
+
+void mthca_cmd_event(struct mthca_dev *dev,
+ u16 token,
+ u8 status,
+ u64 out_param)
+{
+ struct mthca_cmd_context *context =
+ &dev->cmd.context[token & dev->cmd.token_mask];
+
+ /* previously timed out command completing at long last */
+ if (token != context->token)
+ return;
+
+ context->result = 0;
+ context->status = status;
+ context->out_param = out_param;
+
+ complete(&context->done);
+}
+
+static int mthca_cmd_wait(struct mthca_dev *dev,
+ u64 in_param,
+ u64 *out_param,
+ int out_is_imm,
+ u32 in_modifier,
+ u8 op_modifier,
+ u16 op,
+ unsigned long timeout)
+{
+ int err = 0;
+ struct mthca_cmd_context *context;
+
+ down(&dev->cmd.event_sem);
+
+ spin_lock(&dev->cmd.context_lock);
+ BUG_ON(dev->cmd.free_head < 0);
+ context = &dev->cmd.context[dev->cmd.free_head];
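+	/*
+	 * Advance the token past its previous value so that a late
+	 * completion of an earlier, timed-out command on this context
+	 * is ignored by mthca_cmd_event().
+	 */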
+ context->token += dev->cmd.token_mask + 1;
+ dev->cmd.free_head = context->next;
+ spin_unlock(&dev->cmd.context_lock);
+
+ init_completion(&context->done);
+
+ err = mthca_cmd_post(dev, in_param,
+ out_param ? *out_param : 0,
+ in_modifier, op_modifier,
+ op, context->token, 1);
+ if (err)
+ goto out;
+
+ if (!wait_for_completion_timeout(&context->done, timeout)) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ err = context->result;
+ if (err)
+ goto out;
+
+ if (context->status) {
+ mthca_dbg(dev, "Command %02x completed with status %02x\n",
+ op, context->status);
+ err = mthca_status_to_errno(context->status);
+ }
+
+ if (out_is_imm)
+ *out_param = context->out_param;
+
+out:
+ spin_lock(&dev->cmd.context_lock);
+ context->next = dev->cmd.free_head;
+ dev->cmd.free_head = context - dev->cmd.context;
+ spin_unlock(&dev->cmd.context_lock);
+
+ up(&dev->cmd.event_sem);
+ return err;
+}
+
+/* Invoke a command with an output mailbox */
+static int mthca_cmd_box(struct mthca_dev *dev,
+ u64 in_param,
+ u64 out_param,
+ u32 in_modifier,
+ u8 op_modifier,
+ u16 op,
+ unsigned long timeout)
+{
+ if (dev->cmd.flags & MTHCA_CMD_USE_EVENTS)
+ return mthca_cmd_wait(dev, in_param, &out_param, 0,
+ in_modifier, op_modifier, op,
+ timeout);
+ else
+ return mthca_cmd_poll(dev, in_param, &out_param, 0,
+ in_modifier, op_modifier, op,
+ timeout);
+}
+
+/* Invoke a command with no output parameter */
+static int mthca_cmd(struct mthca_dev *dev,
+ u64 in_param,
+ u32 in_modifier,
+ u8 op_modifier,
+ u16 op,
+ unsigned long timeout)
+{
+ return mthca_cmd_box(dev, in_param, 0, in_modifier,
+ op_modifier, op, timeout);
+}
+
+/*
+ * Invoke a command with an immediate output parameter (and copy the
+ * output into the caller's out_param pointer after the command
+ * executes).
+ */
+static int mthca_cmd_imm(struct mthca_dev *dev,
+ u64 in_param,
+ u64 *out_param,
+ u32 in_modifier,
+ u8 op_modifier,
+ u16 op,
+ unsigned long timeout)
+{
+ if (dev->cmd.flags & MTHCA_CMD_USE_EVENTS)
+ return mthca_cmd_wait(dev, in_param, out_param, 1,
+ in_modifier, op_modifier, op,
+ timeout);
+ else
+ return mthca_cmd_poll(dev, in_param, out_param, 1,
+ in_modifier, op_modifier, op,
+ timeout);
+}
+
+int mthca_cmd_init(struct mthca_dev *dev)
+{
+ mutex_init(&dev->cmd.hcr_mutex);
+ sema_init(&dev->cmd.poll_sem, 1);
+ dev->cmd.flags = 0;
+
+ dev->hcr = ioremap(pci_resource_start(dev->pdev, 0) + MTHCA_HCR_BASE,
+ MTHCA_HCR_SIZE);
+ if (!dev->hcr) {
+		mthca_err(dev, "Couldn't map command register.\n");
+ return -ENOMEM;
+ }
+
+ dev->cmd.pool = pci_pool_create("mthca_cmd", dev->pdev,
+ MTHCA_MAILBOX_SIZE,
+ MTHCA_MAILBOX_SIZE, 0);
+ if (!dev->cmd.pool) {
+ iounmap(dev->hcr);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+void mthca_cmd_cleanup(struct mthca_dev *dev)
+{
+ pci_pool_destroy(dev->cmd.pool);
+ iounmap(dev->hcr);
+ if (dev->cmd.flags & MTHCA_CMD_POST_DOORBELLS)
+ iounmap(dev->cmd.dbell_map);
+}
+
+/*
+ * Switch to using events to issue FW commands (should be called after
+ * event queue to command events has been initialized).
+ */
+int mthca_cmd_use_events(struct mthca_dev *dev)
+{
+ int i;
+
+ dev->cmd.context = kmalloc(dev->cmd.max_cmds *
+ sizeof (struct mthca_cmd_context),
+ GFP_KERNEL);
+ if (!dev->cmd.context)
+ return -ENOMEM;
+
+ for (i = 0; i < dev->cmd.max_cmds; ++i) {
+ dev->cmd.context[i].token = i;
+ dev->cmd.context[i].next = i + 1;
+ }
+
+ dev->cmd.context[dev->cmd.max_cmds - 1].next = -1;
+ dev->cmd.free_head = 0;
+
+ sema_init(&dev->cmd.event_sem, dev->cmd.max_cmds);
+ spin_lock_init(&dev->cmd.context_lock);
+
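+	/*
+	 * token_mask must cover every context index: round max_cmds up
+	 * to a power of two, then subtract one to form the mask.
+	 */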
+ for (dev->cmd.token_mask = 1;
+ dev->cmd.token_mask < dev->cmd.max_cmds;
+ dev->cmd.token_mask <<= 1)
+ ; /* nothing */
+ --dev->cmd.token_mask;
+
+ dev->cmd.flags |= MTHCA_CMD_USE_EVENTS;
+
+ down(&dev->cmd.poll_sem);
+
+ return 0;
+}
+
+/*
+ * Switch back to polling (used when shutting down the device)
+ */
+void mthca_cmd_use_polling(struct mthca_dev *dev)
+{
+ int i;
+
+ dev->cmd.flags &= ~MTHCA_CMD_USE_EVENTS;
+
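+	/*
+	 * Acquire every event_sem slot so that no event-driven command
+	 * is still in flight before the context array is freed.
+	 */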
+ for (i = 0; i < dev->cmd.max_cmds; ++i)
+ down(&dev->cmd.event_sem);
+
+ kfree(dev->cmd.context);
+
+ up(&dev->cmd.poll_sem);
+}
+
+struct mthca_mailbox *mthca_alloc_mailbox(struct mthca_dev *dev,
+ gfp_t gfp_mask)
+{
+ struct mthca_mailbox *mailbox;
+
+ mailbox = kmalloc(sizeof *mailbox, gfp_mask);
+ if (!mailbox)
+ return ERR_PTR(-ENOMEM);
+
+ mailbox->buf = pci_pool_alloc(dev->cmd.pool, gfp_mask, &mailbox->dma);
+ if (!mailbox->buf) {
+ kfree(mailbox);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return mailbox;
+}
+
+void mthca_free_mailbox(struct mthca_dev *dev, struct mthca_mailbox *mailbox)
+{
+ if (!mailbox)
+ return;
+
+ pci_pool_free(dev->cmd.pool, mailbox->buf, mailbox->dma);
+ kfree(mailbox);
+}
+
+int mthca_SYS_EN(struct mthca_dev *dev)
+{
+ u64 out;
+ int ret;
+
+ ret = mthca_cmd_imm(dev, 0, &out, 0, 0, CMD_SYS_EN, CMD_TIME_CLASS_D);
+
+ if (ret == -ENOMEM)
+ mthca_warn(dev, "SYS_EN DDR error: syn=%x, sock=%d, "
+ "sladdr=%d, SPD source=%s\n",
+ (int) (out >> 6) & 0xf, (int) (out >> 4) & 3,
+ (int) (out >> 1) & 7, (int) out & 1 ? "NVMEM" : "DIMM");
+
+ return ret;
+}
+
+int mthca_SYS_DIS(struct mthca_dev *dev)
+{
+ return mthca_cmd(dev, 0, 0, 0, CMD_SYS_DIS, CMD_TIME_CLASS_C);
+}
+
+static int mthca_map_cmd(struct mthca_dev *dev, u16 op, struct mthca_icm *icm,
+ u64 virt)
+{
+ struct mthca_mailbox *mailbox;
+ struct mthca_icm_iter iter;
+ __be64 *pages;
+ int lg;
+ int nent = 0;
+ int i;
+ int err = 0;
+ int ts = 0, tc = 0;
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ memset(mailbox->buf, 0, MTHCA_MAILBOX_SIZE);
+ pages = mailbox->buf;
+
+ for (mthca_icm_first(icm, &iter);
+ !mthca_icm_last(&iter);
+ mthca_icm_next(&iter)) {
+ /*
+ * We have to pass pages that are aligned to their
+ * size, so find the least significant 1 in the
+ * address or size and use that as our log2 size.
+ */
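+		/*
+		 * e.g. a 256 KB chunk whose address is aligned to 4 KB
+		 * but not 8 KB gives lg = 12, so it is passed to the FW
+		 * as 64 separate 4 KB pages.
+		 */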
+ lg = ffs(mthca_icm_addr(&iter) | mthca_icm_size(&iter)) - 1;
+ if (lg < MTHCA_ICM_PAGE_SHIFT) {
+ mthca_warn(dev, "Got FW area not aligned to %d (%llx/%lx).\n",
+ MTHCA_ICM_PAGE_SIZE,
+ (unsigned long long) mthca_icm_addr(&iter),
+ mthca_icm_size(&iter));
+ err = -EINVAL;
+ goto out;
+ }
+ for (i = 0; i < mthca_icm_size(&iter) >> lg; ++i) {
+ if (virt != -1) {
+ pages[nent * 2] = cpu_to_be64(virt);
+ virt += 1 << lg;
+ }
+
+ pages[nent * 2 + 1] =
+ cpu_to_be64((mthca_icm_addr(&iter) + (i << lg)) |
+ (lg - MTHCA_ICM_PAGE_SHIFT));
+ ts += 1 << (lg - 10);
+ ++tc;
+
+ if (++nent == MTHCA_MAILBOX_SIZE / 16) {
+ err = mthca_cmd(dev, mailbox->dma, nent, 0, op,
+ CMD_TIME_CLASS_B);
+ if (err)
+ goto out;
+ nent = 0;
+ }
+ }
+ }
+
+ if (nent)
+ err = mthca_cmd(dev, mailbox->dma, nent, 0, op,
+ CMD_TIME_CLASS_B);
+
+ switch (op) {
+ case CMD_MAP_FA:
+ mthca_dbg(dev, "Mapped %d chunks/%d KB for FW.\n", tc, ts);
+ break;
+ case CMD_MAP_ICM_AUX:
+ mthca_dbg(dev, "Mapped %d chunks/%d KB for ICM aux.\n", tc, ts);
+ break;
+ case CMD_MAP_ICM:
+ mthca_dbg(dev, "Mapped %d chunks/%d KB at %llx for ICM.\n",
+ tc, ts, (unsigned long long) virt - (ts << 10));
+ break;
+ }
+
+out:
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+}
+
+int mthca_MAP_FA(struct mthca_dev *dev, struct mthca_icm *icm)
+{
+ return mthca_map_cmd(dev, CMD_MAP_FA, icm, -1);
+}
+
+int mthca_UNMAP_FA(struct mthca_dev *dev)
+{
+ return mthca_cmd(dev, 0, 0, 0, CMD_UNMAP_FA, CMD_TIME_CLASS_B);
+}
+
+int mthca_RUN_FW(struct mthca_dev *dev)
+{
+ return mthca_cmd(dev, 0, 0, 0, CMD_RUN_FW, CMD_TIME_CLASS_A);
+}
+
+static void mthca_setup_cmd_doorbells(struct mthca_dev *dev, u64 base)
+{
+ phys_addr_t addr;
+ u16 max_off = 0;
+ int i;
+
+ for (i = 0; i < 8; ++i)
+ max_off = max(max_off, dev->cmd.dbell_offsets[i]);
+
+ if ((base & PAGE_MASK) != ((base + max_off) & PAGE_MASK)) {
+ mthca_warn(dev, "Firmware doorbell region at 0x%016llx, "
+ "length 0x%x crosses a page boundary\n",
+ (unsigned long long) base, max_off);
+ return;
+ }
+
+ addr = pci_resource_start(dev->pdev, 2) +
+ ((pci_resource_len(dev->pdev, 2) - 1) & base);
+ dev->cmd.dbell_map = ioremap(addr, max_off + sizeof(u32));
+ if (!dev->cmd.dbell_map)
+ return;
+
+ dev->cmd.flags |= MTHCA_CMD_POST_DOORBELLS;
+ mthca_dbg(dev, "Mapped doorbell page for posting FW commands\n");
+}
+
+int mthca_QUERY_FW(struct mthca_dev *dev)
+{
+ struct mthca_mailbox *mailbox;
+ u32 *outbox;
+ u64 base;
+ u32 tmp;
+ int err = 0;
+ u8 lg;
+ int i;
+
+#define QUERY_FW_OUT_SIZE 0x100
+#define QUERY_FW_VER_OFFSET 0x00
+#define QUERY_FW_MAX_CMD_OFFSET 0x0f
+#define QUERY_FW_ERR_START_OFFSET 0x30
+#define QUERY_FW_ERR_SIZE_OFFSET 0x38
+
+#define QUERY_FW_CMD_DB_EN_OFFSET 0x10
+#define QUERY_FW_CMD_DB_OFFSET 0x50
+#define QUERY_FW_CMD_DB_BASE 0x60
+
+#define QUERY_FW_START_OFFSET 0x20
+#define QUERY_FW_END_OFFSET 0x28
+
+#define QUERY_FW_SIZE_OFFSET 0x00
+#define QUERY_FW_CLR_INT_BASE_OFFSET 0x20
+#define QUERY_FW_EQ_ARM_BASE_OFFSET 0x40
+#define QUERY_FW_EQ_SET_CI_BASE_OFFSET 0x48
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ outbox = mailbox->buf;
+
+ err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_FW,
+ CMD_TIME_CLASS_A);
+
+ if (err)
+ goto out;
+
+ MTHCA_GET(dev->fw_ver, outbox, QUERY_FW_VER_OFFSET);
+ /*
+ * FW subminor version is at more significant bits than minor
+ * version, so swap here.
+ */
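+	/*
+	 * e.g. a raw value of 0x000400070002 (major 4, subminor 7,
+	 * minor 2) becomes 0x000400020007 and is printed as 4.2.7.
+	 */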
+ dev->fw_ver = (dev->fw_ver & 0xffff00000000ull) |
+ ((dev->fw_ver & 0xffff0000ull) >> 16) |
+ ((dev->fw_ver & 0x0000ffffull) << 16);
+
+ MTHCA_GET(lg, outbox, QUERY_FW_MAX_CMD_OFFSET);
+ dev->cmd.max_cmds = 1 << lg;
+
+ mthca_dbg(dev, "FW version %012llx, max commands %d\n",
+ (unsigned long long) dev->fw_ver, dev->cmd.max_cmds);
+
+ MTHCA_GET(dev->catas_err.addr, outbox, QUERY_FW_ERR_START_OFFSET);
+ MTHCA_GET(dev->catas_err.size, outbox, QUERY_FW_ERR_SIZE_OFFSET);
+
+ mthca_dbg(dev, "Catastrophic error buffer at 0x%llx, size 0x%x\n",
+ (unsigned long long) dev->catas_err.addr, dev->catas_err.size);
+
+ MTHCA_GET(tmp, outbox, QUERY_FW_CMD_DB_EN_OFFSET);
+ if (tmp & 0x1) {
+ mthca_dbg(dev, "FW supports commands through doorbells\n");
+
+ MTHCA_GET(base, outbox, QUERY_FW_CMD_DB_BASE);
+ for (i = 0; i < MTHCA_CMD_NUM_DBELL_DWORDS; ++i)
+ MTHCA_GET(dev->cmd.dbell_offsets[i], outbox,
+ QUERY_FW_CMD_DB_OFFSET + (i << 1));
+
+ mthca_setup_cmd_doorbells(dev, base);
+ }
+
+ if (mthca_is_memfree(dev)) {
+ MTHCA_GET(dev->fw.arbel.fw_pages, outbox, QUERY_FW_SIZE_OFFSET);
+ MTHCA_GET(dev->fw.arbel.clr_int_base, outbox, QUERY_FW_CLR_INT_BASE_OFFSET);
+ MTHCA_GET(dev->fw.arbel.eq_arm_base, outbox, QUERY_FW_EQ_ARM_BASE_OFFSET);
+ MTHCA_GET(dev->fw.arbel.eq_set_ci_base, outbox, QUERY_FW_EQ_SET_CI_BASE_OFFSET);
+ mthca_dbg(dev, "FW size %d KB\n", dev->fw.arbel.fw_pages << 2);
+
+ /*
+ * Round up number of system pages needed in case
+ * MTHCA_ICM_PAGE_SIZE < PAGE_SIZE.
+ */
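+		/*
+		 * e.g. with 64 KB system pages, 100 ICM pages of 4 KB
+		 * round up to 112 and become 7 system pages.
+		 */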
+ dev->fw.arbel.fw_pages =
+ ALIGN(dev->fw.arbel.fw_pages, PAGE_SIZE / MTHCA_ICM_PAGE_SIZE) >>
+ (PAGE_SHIFT - MTHCA_ICM_PAGE_SHIFT);
+
+ mthca_dbg(dev, "Clear int @ %llx, EQ arm @ %llx, EQ set CI @ %llx\n",
+ (unsigned long long) dev->fw.arbel.clr_int_base,
+ (unsigned long long) dev->fw.arbel.eq_arm_base,
+ (unsigned long long) dev->fw.arbel.eq_set_ci_base);
+ } else {
+ MTHCA_GET(dev->fw.tavor.fw_start, outbox, QUERY_FW_START_OFFSET);
+ MTHCA_GET(dev->fw.tavor.fw_end, outbox, QUERY_FW_END_OFFSET);
+
+ mthca_dbg(dev, "FW size %d KB (start %llx, end %llx)\n",
+ (int) ((dev->fw.tavor.fw_end - dev->fw.tavor.fw_start) >> 10),
+ (unsigned long long) dev->fw.tavor.fw_start,
+ (unsigned long long) dev->fw.tavor.fw_end);
+ }
+
+out:
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+}
+
+int mthca_ENABLE_LAM(struct mthca_dev *dev)
+{
+ struct mthca_mailbox *mailbox;
+ u8 info;
+ u32 *outbox;
+ int err = 0;
+
+#define ENABLE_LAM_OUT_SIZE 0x100
+#define ENABLE_LAM_START_OFFSET 0x00
+#define ENABLE_LAM_END_OFFSET 0x08
+#define ENABLE_LAM_INFO_OFFSET 0x13
+
+#define ENABLE_LAM_INFO_HIDDEN_FLAG (1 << 4)
+#define ENABLE_LAM_INFO_ECC_MASK 0x3
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ outbox = mailbox->buf;
+
+ err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_ENABLE_LAM,
+ CMD_TIME_CLASS_C);
+
+ if (err)
+ goto out;
+
+ MTHCA_GET(dev->ddr_start, outbox, ENABLE_LAM_START_OFFSET);
+ MTHCA_GET(dev->ddr_end, outbox, ENABLE_LAM_END_OFFSET);
+ MTHCA_GET(info, outbox, ENABLE_LAM_INFO_OFFSET);
+
+ if (!!(info & ENABLE_LAM_INFO_HIDDEN_FLAG) !=
+ !!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
+ mthca_info(dev, "FW reports that HCA-attached memory "
+ "is %s hidden; does not match PCI config\n",
+ (info & ENABLE_LAM_INFO_HIDDEN_FLAG) ?
+ "" : "not");
+ }
+ if (info & ENABLE_LAM_INFO_HIDDEN_FLAG)
+ mthca_dbg(dev, "HCA-attached memory is hidden.\n");
+
+ mthca_dbg(dev, "HCA memory size %d KB (start %llx, end %llx)\n",
+ (int) ((dev->ddr_end - dev->ddr_start) >> 10),
+ (unsigned long long) dev->ddr_start,
+ (unsigned long long) dev->ddr_end);
+
+out:
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+}
+
+int mthca_DISABLE_LAM(struct mthca_dev *dev)
+{
+	return mthca_cmd(dev, 0, 0, 0, CMD_DISABLE_LAM, CMD_TIME_CLASS_C);
+}
+
+int mthca_QUERY_DDR(struct mthca_dev *dev)
+{
+ struct mthca_mailbox *mailbox;
+ u8 info;
+ u32 *outbox;
+ int err = 0;
+
+#define QUERY_DDR_OUT_SIZE 0x100
+#define QUERY_DDR_START_OFFSET 0x00
+#define QUERY_DDR_END_OFFSET 0x08
+#define QUERY_DDR_INFO_OFFSET 0x13
+
+#define QUERY_DDR_INFO_HIDDEN_FLAG (1 << 4)
+#define QUERY_DDR_INFO_ECC_MASK 0x3
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ outbox = mailbox->buf;
+
+ err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_DDR,
+ CMD_TIME_CLASS_A);
+
+ if (err)
+ goto out;
+
+ MTHCA_GET(dev->ddr_start, outbox, QUERY_DDR_START_OFFSET);
+ MTHCA_GET(dev->ddr_end, outbox, QUERY_DDR_END_OFFSET);
+ MTHCA_GET(info, outbox, QUERY_DDR_INFO_OFFSET);
+
+ if (!!(info & QUERY_DDR_INFO_HIDDEN_FLAG) !=
+ !!(dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN)) {
+ mthca_info(dev, "FW reports that HCA-attached memory "
+ "is %s hidden; does not match PCI config\n",
+ (info & QUERY_DDR_INFO_HIDDEN_FLAG) ?
+ "" : "not");
+ }
+ if (info & QUERY_DDR_INFO_HIDDEN_FLAG)
+ mthca_dbg(dev, "HCA-attached memory is hidden.\n");
+
+ mthca_dbg(dev, "HCA memory size %d KB (start %llx, end %llx)\n",
+ (int) ((dev->ddr_end - dev->ddr_start) >> 10),
+ (unsigned long long) dev->ddr_start,
+ (unsigned long long) dev->ddr_end);
+
+out:
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+}
+
+int mthca_QUERY_DEV_LIM(struct mthca_dev *dev,
+ struct mthca_dev_lim *dev_lim)
+{
+ struct mthca_mailbox *mailbox;
+ u32 *outbox;
+ u8 field;
+ u16 size;
+ u16 stat_rate;
+ int err;
+
+#define QUERY_DEV_LIM_OUT_SIZE 0x100
+#define QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET 0x10
+#define QUERY_DEV_LIM_MAX_QP_SZ_OFFSET 0x11
+#define QUERY_DEV_LIM_RSVD_QP_OFFSET 0x12
+#define QUERY_DEV_LIM_MAX_QP_OFFSET 0x13
+#define QUERY_DEV_LIM_RSVD_SRQ_OFFSET 0x14
+#define QUERY_DEV_LIM_MAX_SRQ_OFFSET 0x15
+#define QUERY_DEV_LIM_RSVD_EEC_OFFSET 0x16
+#define QUERY_DEV_LIM_MAX_EEC_OFFSET 0x17
+#define QUERY_DEV_LIM_MAX_CQ_SZ_OFFSET 0x19
+#define QUERY_DEV_LIM_RSVD_CQ_OFFSET 0x1a
+#define QUERY_DEV_LIM_MAX_CQ_OFFSET 0x1b
+#define QUERY_DEV_LIM_MAX_MPT_OFFSET 0x1d
+#define QUERY_DEV_LIM_RSVD_EQ_OFFSET 0x1e
+#define QUERY_DEV_LIM_MAX_EQ_OFFSET 0x1f
+#define QUERY_DEV_LIM_RSVD_MTT_OFFSET 0x20
+#define QUERY_DEV_LIM_MAX_MRW_SZ_OFFSET 0x21
+#define QUERY_DEV_LIM_RSVD_MRW_OFFSET 0x22
+#define QUERY_DEV_LIM_MAX_MTT_SEG_OFFSET 0x23
+#define QUERY_DEV_LIM_MAX_AV_OFFSET 0x27
+#define QUERY_DEV_LIM_MAX_REQ_QP_OFFSET 0x29
+#define QUERY_DEV_LIM_MAX_RES_QP_OFFSET 0x2b
+#define QUERY_DEV_LIM_MAX_RDMA_OFFSET 0x2f
+#define QUERY_DEV_LIM_RSZ_SRQ_OFFSET 0x33
+#define QUERY_DEV_LIM_ACK_DELAY_OFFSET 0x35
+#define QUERY_DEV_LIM_MTU_WIDTH_OFFSET 0x36
+#define QUERY_DEV_LIM_VL_PORT_OFFSET 0x37
+#define QUERY_DEV_LIM_MAX_GID_OFFSET 0x3b
+#define QUERY_DEV_LIM_RATE_SUPPORT_OFFSET 0x3c
+#define QUERY_DEV_LIM_MAX_PKEY_OFFSET 0x3f
+#define QUERY_DEV_LIM_FLAGS_OFFSET 0x44
+#define QUERY_DEV_LIM_RSVD_UAR_OFFSET 0x48
+#define QUERY_DEV_LIM_UAR_SZ_OFFSET 0x49
+#define QUERY_DEV_LIM_PAGE_SZ_OFFSET 0x4b
+#define QUERY_DEV_LIM_MAX_SG_OFFSET 0x51
+#define QUERY_DEV_LIM_MAX_DESC_SZ_OFFSET 0x52
+#define QUERY_DEV_LIM_MAX_SG_RQ_OFFSET 0x55
+#define QUERY_DEV_LIM_MAX_DESC_SZ_RQ_OFFSET 0x56
+#define QUERY_DEV_LIM_MAX_QP_MCG_OFFSET 0x61
+#define QUERY_DEV_LIM_RSVD_MCG_OFFSET 0x62
+#define QUERY_DEV_LIM_MAX_MCG_OFFSET 0x63
+#define QUERY_DEV_LIM_RSVD_PD_OFFSET 0x64
+#define QUERY_DEV_LIM_MAX_PD_OFFSET 0x65
+#define QUERY_DEV_LIM_RSVD_RDD_OFFSET 0x66
+#define QUERY_DEV_LIM_MAX_RDD_OFFSET 0x67
+#define QUERY_DEV_LIM_EEC_ENTRY_SZ_OFFSET 0x80
+#define QUERY_DEV_LIM_QPC_ENTRY_SZ_OFFSET 0x82
+#define QUERY_DEV_LIM_EEEC_ENTRY_SZ_OFFSET 0x84
+#define QUERY_DEV_LIM_EQPC_ENTRY_SZ_OFFSET 0x86
+#define QUERY_DEV_LIM_EQC_ENTRY_SZ_OFFSET 0x88
+#define QUERY_DEV_LIM_CQC_ENTRY_SZ_OFFSET 0x8a
+#define QUERY_DEV_LIM_SRQ_ENTRY_SZ_OFFSET 0x8c
+#define QUERY_DEV_LIM_UAR_ENTRY_SZ_OFFSET 0x8e
+#define QUERY_DEV_LIM_MTT_ENTRY_SZ_OFFSET 0x90
+#define QUERY_DEV_LIM_MPT_ENTRY_SZ_OFFSET 0x92
+#define QUERY_DEV_LIM_PBL_SZ_OFFSET 0x96
+#define QUERY_DEV_LIM_BMME_FLAGS_OFFSET 0x97
+#define QUERY_DEV_LIM_RSVD_LKEY_OFFSET 0x98
+#define QUERY_DEV_LIM_LAMR_OFFSET 0x9f
+#define QUERY_DEV_LIM_MAX_ICM_SZ_OFFSET 0xa0
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ outbox = mailbox->buf;
+
+ err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_DEV_LIM,
+ CMD_TIME_CLASS_A);
+
+ if (err)
+ goto out;
+
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_QP_OFFSET);
+ dev_lim->reserved_qps = 1 << (field & 0xf);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_OFFSET);
+ dev_lim->max_qps = 1 << (field & 0x1f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_SRQ_OFFSET);
+ dev_lim->reserved_srqs = 1 << (field >> 4);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_OFFSET);
+ dev_lim->max_srqs = 1 << (field & 0x1f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_EEC_OFFSET);
+ dev_lim->reserved_eecs = 1 << (field & 0xf);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_EEC_OFFSET);
+ dev_lim->max_eecs = 1 << (field & 0x1f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_CQ_SZ_OFFSET);
+ dev_lim->max_cq_sz = 1 << field;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_CQ_OFFSET);
+ dev_lim->reserved_cqs = 1 << (field & 0xf);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_CQ_OFFSET);
+ dev_lim->max_cqs = 1 << (field & 0x1f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MPT_OFFSET);
+ dev_lim->max_mpts = 1 << (field & 0x3f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_EQ_OFFSET);
+ dev_lim->reserved_eqs = 1 << (field & 0xf);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_EQ_OFFSET);
+ dev_lim->max_eqs = 1 << (field & 0x7);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MTT_OFFSET);
+ if (mthca_is_memfree(dev))
+ dev_lim->reserved_mtts = ALIGN((1 << (field >> 4)) * sizeof(u64),
+ dev->limits.mtt_seg_size) / dev->limits.mtt_seg_size;
+ else
+ dev_lim->reserved_mtts = 1 << (field >> 4);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MRW_SZ_OFFSET);
+ dev_lim->max_mrw_sz = 1 << field;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MRW_OFFSET);
+ dev_lim->reserved_mrws = 1 << (field & 0xf);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MTT_SEG_OFFSET);
+ dev_lim->max_mtt_seg = 1 << (field & 0x3f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_REQ_QP_OFFSET);
+ dev_lim->max_requester_per_qp = 1 << (field & 0x3f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RES_QP_OFFSET);
+ dev_lim->max_responder_per_qp = 1 << (field & 0x3f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RDMA_OFFSET);
+ dev_lim->max_rdma_global = 1 << (field & 0x3f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_ACK_DELAY_OFFSET);
+ dev_lim->local_ca_ack_delay = field & 0x1f;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MTU_WIDTH_OFFSET);
+ dev_lim->max_mtu = field >> 4;
+ dev_lim->max_port_width = field & 0xf;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_VL_PORT_OFFSET);
+ dev_lim->max_vl = field >> 4;
+ dev_lim->num_ports = field & 0xf;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_GID_OFFSET);
+ dev_lim->max_gids = 1 << (field & 0xf);
+ MTHCA_GET(stat_rate, outbox, QUERY_DEV_LIM_RATE_SUPPORT_OFFSET);
+ dev_lim->stat_rate_support = stat_rate;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_PKEY_OFFSET);
+ dev_lim->max_pkeys = 1 << (field & 0xf);
+ MTHCA_GET(dev_lim->flags, outbox, QUERY_DEV_LIM_FLAGS_OFFSET);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_UAR_OFFSET);
+ dev_lim->reserved_uars = field >> 4;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_UAR_SZ_OFFSET);
+ dev_lim->uar_size = 1 << ((field & 0x3f) + 20);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_PAGE_SZ_OFFSET);
+ dev_lim->min_page_sz = 1 << field;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SG_OFFSET);
+ dev_lim->max_sg = field;
+
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_MAX_DESC_SZ_OFFSET);
+ dev_lim->max_desc_sz = size;
+
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_MCG_OFFSET);
+ dev_lim->max_qp_per_mcg = 1 << field;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_MCG_OFFSET);
+ dev_lim->reserved_mgms = field & 0xf;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_MCG_OFFSET);
+ dev_lim->max_mcgs = 1 << field;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_PD_OFFSET);
+ dev_lim->reserved_pds = field >> 4;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_PD_OFFSET);
+ dev_lim->max_pds = 1 << (field & 0x3f);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSVD_RDD_OFFSET);
+ dev_lim->reserved_rdds = field >> 4;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_RDD_OFFSET);
+ dev_lim->max_rdds = 1 << (field & 0x3f);
+
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_EEC_ENTRY_SZ_OFFSET);
+ dev_lim->eec_entry_sz = size;
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_QPC_ENTRY_SZ_OFFSET);
+ dev_lim->qpc_entry_sz = size;
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_EEEC_ENTRY_SZ_OFFSET);
+ dev_lim->eeec_entry_sz = size;
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_EQPC_ENTRY_SZ_OFFSET);
+ dev_lim->eqpc_entry_sz = size;
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_EQC_ENTRY_SZ_OFFSET);
+ dev_lim->eqc_entry_sz = size;
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_CQC_ENTRY_SZ_OFFSET);
+ dev_lim->cqc_entry_sz = size;
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_SRQ_ENTRY_SZ_OFFSET);
+ dev_lim->srq_entry_sz = size;
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_UAR_ENTRY_SZ_OFFSET);
+ dev_lim->uar_scratch_entry_sz = size;
+
+ if (mthca_is_memfree(dev)) {
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET);
+ dev_lim->max_srq_sz = 1 << field;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_SZ_OFFSET);
+ dev_lim->max_qp_sz = 1 << field;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_RSZ_SRQ_OFFSET);
+ dev_lim->hca.arbel.resize_srq = field & 1;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SG_RQ_OFFSET);
+ dev_lim->max_sg = min_t(int, field, dev_lim->max_sg);
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_MAX_DESC_SZ_RQ_OFFSET);
+ dev_lim->max_desc_sz = min_t(int, size, dev_lim->max_desc_sz);
+ MTHCA_GET(size, outbox, QUERY_DEV_LIM_MPT_ENTRY_SZ_OFFSET);
+ dev_lim->mpt_entry_sz = size;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_PBL_SZ_OFFSET);
+ dev_lim->hca.arbel.max_pbl_sz = 1 << (field & 0x3f);
+ MTHCA_GET(dev_lim->hca.arbel.bmme_flags, outbox,
+ QUERY_DEV_LIM_BMME_FLAGS_OFFSET);
+ MTHCA_GET(dev_lim->hca.arbel.reserved_lkey, outbox,
+ QUERY_DEV_LIM_RSVD_LKEY_OFFSET);
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_LAMR_OFFSET);
+ dev_lim->hca.arbel.lam_required = field & 1;
+ MTHCA_GET(dev_lim->hca.arbel.max_icm_sz, outbox,
+ QUERY_DEV_LIM_MAX_ICM_SZ_OFFSET);
+
+ if (dev_lim->hca.arbel.bmme_flags & 1)
+ mthca_dbg(dev, "Base MM extensions: yes "
+ "(flags %d, max PBL %d, rsvd L_Key %08x)\n",
+ dev_lim->hca.arbel.bmme_flags,
+ dev_lim->hca.arbel.max_pbl_sz,
+ dev_lim->hca.arbel.reserved_lkey);
+ else
+ mthca_dbg(dev, "Base MM extensions: no\n");
+
+ mthca_dbg(dev, "Max ICM size %lld MB\n",
+ (unsigned long long) dev_lim->hca.arbel.max_icm_sz >> 20);
+ } else {
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_SRQ_SZ_OFFSET);
+ dev_lim->max_srq_sz = (1 << field) - 1;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_QP_SZ_OFFSET);
+ dev_lim->max_qp_sz = (1 << field) - 1;
+ MTHCA_GET(field, outbox, QUERY_DEV_LIM_MAX_AV_OFFSET);
+ dev_lim->hca.tavor.max_avs = 1 << (field & 0x3f);
+ dev_lim->mpt_entry_sz = MTHCA_MPT_ENTRY_SIZE;
+ }
+
+ mthca_dbg(dev, "Max QPs: %d, reserved QPs: %d, entry size: %d\n",
+ dev_lim->max_qps, dev_lim->reserved_qps, dev_lim->qpc_entry_sz);
+ mthca_dbg(dev, "Max SRQs: %d, reserved SRQs: %d, entry size: %d\n",
+ dev_lim->max_srqs, dev_lim->reserved_srqs, dev_lim->srq_entry_sz);
+ mthca_dbg(dev, "Max CQs: %d, reserved CQs: %d, entry size: %d\n",
+ dev_lim->max_cqs, dev_lim->reserved_cqs, dev_lim->cqc_entry_sz);
+ mthca_dbg(dev, "Max EQs: %d, reserved EQs: %d, entry size: %d\n",
+ dev_lim->max_eqs, dev_lim->reserved_eqs, dev_lim->eqc_entry_sz);
+ mthca_dbg(dev, "reserved MPTs: %d, reserved MTTs: %d\n",
+ dev_lim->reserved_mrws, dev_lim->reserved_mtts);
+ mthca_dbg(dev, "Max PDs: %d, reserved PDs: %d, reserved UARs: %d\n",
+ dev_lim->max_pds, dev_lim->reserved_pds, dev_lim->reserved_uars);
+ mthca_dbg(dev, "Max QP/MCG: %d, reserved MGMs: %d\n",
+		  dev_lim->max_qp_per_mcg, dev_lim->reserved_mgms);
+ mthca_dbg(dev, "Max CQEs: %d, max WQEs: %d, max SRQ WQEs: %d\n",
+ dev_lim->max_cq_sz, dev_lim->max_qp_sz, dev_lim->max_srq_sz);
+
+ mthca_dbg(dev, "Flags: %08x\n", dev_lim->flags);
+
+out:
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+}
+
+static void get_board_id(void *vsd, char *board_id)
+{
+ int i;
+
+#define VSD_OFFSET_SIG1 0x00
+#define VSD_OFFSET_SIG2 0xde
+#define VSD_OFFSET_MLX_BOARD_ID 0xd0
+#define VSD_OFFSET_TS_BOARD_ID 0x20
+
+#define VSD_SIGNATURE_TOPSPIN 0x5ad
+
+ memset(board_id, 0, MTHCA_BOARD_ID_LEN);
+
+ if (be16_to_cpup(vsd + VSD_OFFSET_SIG1) == VSD_SIGNATURE_TOPSPIN &&
+ be16_to_cpup(vsd + VSD_OFFSET_SIG2) == VSD_SIGNATURE_TOPSPIN) {
+ strlcpy(board_id, vsd + VSD_OFFSET_TS_BOARD_ID, MTHCA_BOARD_ID_LEN);
+ } else {
+ /*
+ * The board ID is a string but the firmware byte
+ * swaps each 4-byte word before passing it back to
+ * us. Therefore we need to swab it before printing.
+ */
+ for (i = 0; i < 4; ++i)
+ ((u32 *) board_id)[i] =
+ swab32(*(u32 *) (vsd + VSD_OFFSET_MLX_BOARD_ID + i * 4));
+ }
+}
+
+int mthca_QUERY_ADAPTER(struct mthca_dev *dev,
+ struct mthca_adapter *adapter)
+{
+ struct mthca_mailbox *mailbox;
+ u32 *outbox;
+ int err;
+
+#define QUERY_ADAPTER_OUT_SIZE 0x100
+#define QUERY_ADAPTER_VENDOR_ID_OFFSET 0x00
+#define QUERY_ADAPTER_DEVICE_ID_OFFSET 0x04
+#define QUERY_ADAPTER_REVISION_ID_OFFSET 0x08
+#define QUERY_ADAPTER_INTA_PIN_OFFSET 0x10
+#define QUERY_ADAPTER_VSD_OFFSET 0x20
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ outbox = mailbox->buf;
+
+ err = mthca_cmd_box(dev, 0, mailbox->dma, 0, 0, CMD_QUERY_ADAPTER,
+ CMD_TIME_CLASS_A);
+
+ if (err)
+ goto out;
+
+ if (!mthca_is_memfree(dev)) {
+ MTHCA_GET(adapter->vendor_id, outbox,
+ QUERY_ADAPTER_VENDOR_ID_OFFSET);
+ MTHCA_GET(adapter->device_id, outbox,
+ QUERY_ADAPTER_DEVICE_ID_OFFSET);
+ MTHCA_GET(adapter->revision_id, outbox,
+ QUERY_ADAPTER_REVISION_ID_OFFSET);
+ }
+ MTHCA_GET(adapter->inta_pin, outbox, QUERY_ADAPTER_INTA_PIN_OFFSET);
+
+ get_board_id(outbox + QUERY_ADAPTER_VSD_OFFSET / 4,
+ adapter->board_id);
+
+out:
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+}
+
+int mthca_INIT_HCA(struct mthca_dev *dev,
+ struct mthca_init_hca_param *param)
+{
+ struct mthca_mailbox *mailbox;
+ __be32 *inbox;
+ int err;
+
+#define INIT_HCA_IN_SIZE 0x200
+#define INIT_HCA_FLAGS1_OFFSET 0x00c
+#define INIT_HCA_FLAGS2_OFFSET 0x014
+#define INIT_HCA_QPC_OFFSET 0x020
+#define INIT_HCA_QPC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x10)
+#define INIT_HCA_LOG_QP_OFFSET (INIT_HCA_QPC_OFFSET + 0x17)
+#define INIT_HCA_EEC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x20)
+#define INIT_HCA_LOG_EEC_OFFSET (INIT_HCA_QPC_OFFSET + 0x27)
+#define INIT_HCA_SRQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x28)
+#define INIT_HCA_LOG_SRQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x2f)
+#define INIT_HCA_CQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x30)
+#define INIT_HCA_LOG_CQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x37)
+#define INIT_HCA_EQPC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x40)
+#define INIT_HCA_EEEC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x50)
+#define INIT_HCA_EQC_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x60)
+#define INIT_HCA_LOG_EQ_OFFSET (INIT_HCA_QPC_OFFSET + 0x67)
+#define INIT_HCA_RDB_BASE_OFFSET (INIT_HCA_QPC_OFFSET + 0x70)
+#define INIT_HCA_UDAV_OFFSET 0x0b0
+#define INIT_HCA_UDAV_LKEY_OFFSET (INIT_HCA_UDAV_OFFSET + 0x0)
+#define INIT_HCA_UDAV_PD_OFFSET (INIT_HCA_UDAV_OFFSET + 0x4)
+#define INIT_HCA_MCAST_OFFSET 0x0c0
+#define INIT_HCA_MC_BASE_OFFSET (INIT_HCA_MCAST_OFFSET + 0x00)
+#define INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x12)
+#define INIT_HCA_MC_HASH_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x16)
+#define INIT_HCA_LOG_MC_TABLE_SZ_OFFSET (INIT_HCA_MCAST_OFFSET + 0x1b)
+#define INIT_HCA_TPT_OFFSET 0x0f0
+#define INIT_HCA_MPT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x00)
+#define INIT_HCA_MTT_SEG_SZ_OFFSET (INIT_HCA_TPT_OFFSET + 0x09)
+#define INIT_HCA_LOG_MPT_SZ_OFFSET (INIT_HCA_TPT_OFFSET + 0x0b)
+#define INIT_HCA_MTT_BASE_OFFSET (INIT_HCA_TPT_OFFSET + 0x10)
+#define INIT_HCA_UAR_OFFSET 0x120
+#define INIT_HCA_UAR_BASE_OFFSET (INIT_HCA_UAR_OFFSET + 0x00)
+#define INIT_HCA_UARC_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x09)
+#define INIT_HCA_LOG_UAR_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x0a)
+#define INIT_HCA_UAR_PAGE_SZ_OFFSET (INIT_HCA_UAR_OFFSET + 0x0b)
+#define INIT_HCA_UAR_SCATCH_BASE_OFFSET (INIT_HCA_UAR_OFFSET + 0x10)
+#define INIT_HCA_UAR_CTX_BASE_OFFSET (INIT_HCA_UAR_OFFSET + 0x18)
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ inbox = mailbox->buf;
+
+ memset(inbox, 0, INIT_HCA_IN_SIZE);
+
+ if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT)
+ MTHCA_PUT(inbox, 0x1, INIT_HCA_FLAGS1_OFFSET);
+
+#if defined(__LITTLE_ENDIAN)
+ *(inbox + INIT_HCA_FLAGS2_OFFSET / 4) &= ~cpu_to_be32(1 << 1);
+#elif defined(__BIG_ENDIAN)
+ *(inbox + INIT_HCA_FLAGS2_OFFSET / 4) |= cpu_to_be32(1 << 1);
+#else
+#error Host endianness not defined
+#endif
+ /* Check port for UD address vector: */
+ *(inbox + INIT_HCA_FLAGS2_OFFSET / 4) |= cpu_to_be32(1);
+
+ /* Enable IPoIB checksumming if we can: */
+ if (dev->device_cap_flags & IB_DEVICE_UD_IP_CSUM)
+ *(inbox + INIT_HCA_FLAGS2_OFFSET / 4) |= cpu_to_be32(7 << 3);
+
+ /* We leave wqe_quota, responder_exu, etc as 0 (default) */
+
+ /* QPC/EEC/CQC/EQC/RDB attributes */
+
+ MTHCA_PUT(inbox, param->qpc_base, INIT_HCA_QPC_BASE_OFFSET);
+ MTHCA_PUT(inbox, param->log_num_qps, INIT_HCA_LOG_QP_OFFSET);
+ MTHCA_PUT(inbox, param->eec_base, INIT_HCA_EEC_BASE_OFFSET);
+ MTHCA_PUT(inbox, param->log_num_eecs, INIT_HCA_LOG_EEC_OFFSET);
+ MTHCA_PUT(inbox, param->srqc_base, INIT_HCA_SRQC_BASE_OFFSET);
+ MTHCA_PUT(inbox, param->log_num_srqs, INIT_HCA_LOG_SRQ_OFFSET);
+ MTHCA_PUT(inbox, param->cqc_base, INIT_HCA_CQC_BASE_OFFSET);
+ MTHCA_PUT(inbox, param->log_num_cqs, INIT_HCA_LOG_CQ_OFFSET);
+ MTHCA_PUT(inbox, param->eqpc_base, INIT_HCA_EQPC_BASE_OFFSET);
+ MTHCA_PUT(inbox, param->eeec_base, INIT_HCA_EEEC_BASE_OFFSET);
+ MTHCA_PUT(inbox, param->eqc_base, INIT_HCA_EQC_BASE_OFFSET);
+ MTHCA_PUT(inbox, param->log_num_eqs, INIT_HCA_LOG_EQ_OFFSET);
+ MTHCA_PUT(inbox, param->rdb_base, INIT_HCA_RDB_BASE_OFFSET);
+
+ /* UD AV attributes */
+
+ /* multicast attributes */
+
+ MTHCA_PUT(inbox, param->mc_base, INIT_HCA_MC_BASE_OFFSET);
+ MTHCA_PUT(inbox, param->log_mc_entry_sz, INIT_HCA_LOG_MC_ENTRY_SZ_OFFSET);
+ MTHCA_PUT(inbox, param->mc_hash_sz, INIT_HCA_MC_HASH_SZ_OFFSET);
+ MTHCA_PUT(inbox, param->log_mc_table_sz, INIT_HCA_LOG_MC_TABLE_SZ_OFFSET);
+
+ /* TPT attributes */
+
+ MTHCA_PUT(inbox, param->mpt_base, INIT_HCA_MPT_BASE_OFFSET);
+ if (!mthca_is_memfree(dev))
+ MTHCA_PUT(inbox, param->mtt_seg_sz, INIT_HCA_MTT_SEG_SZ_OFFSET);
+ MTHCA_PUT(inbox, param->log_mpt_sz, INIT_HCA_LOG_MPT_SZ_OFFSET);
+ MTHCA_PUT(inbox, param->mtt_base, INIT_HCA_MTT_BASE_OFFSET);
+
+ /* UAR attributes */
+ {
+ u8 uar_page_sz = PAGE_SHIFT - 12;
+ MTHCA_PUT(inbox, uar_page_sz, INIT_HCA_UAR_PAGE_SZ_OFFSET);
+ }
+
+ MTHCA_PUT(inbox, param->uar_scratch_base, INIT_HCA_UAR_SCATCH_BASE_OFFSET);
+
+ if (mthca_is_memfree(dev)) {
+ MTHCA_PUT(inbox, param->log_uarc_sz, INIT_HCA_UARC_SZ_OFFSET);
+ MTHCA_PUT(inbox, param->log_uar_sz, INIT_HCA_LOG_UAR_SZ_OFFSET);
+ MTHCA_PUT(inbox, param->uarc_base, INIT_HCA_UAR_CTX_BASE_OFFSET);
+ }
+
+ err = mthca_cmd(dev, mailbox->dma, 0, 0,
+ CMD_INIT_HCA, CMD_TIME_CLASS_D);
+
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+}
+
+int mthca_INIT_IB(struct mthca_dev *dev,
+ struct mthca_init_ib_param *param,
+ int port)
+{
+ struct mthca_mailbox *mailbox;
+ u32 *inbox;
+ int err;
+ u32 flags;
+
+#define INIT_IB_IN_SIZE 56
+#define INIT_IB_FLAGS_OFFSET 0x00
+#define INIT_IB_FLAG_SIG (1 << 18)
+#define INIT_IB_FLAG_NG (1 << 17)
+#define INIT_IB_FLAG_G0 (1 << 16)
+#define INIT_IB_VL_SHIFT 4
+#define INIT_IB_PORT_WIDTH_SHIFT 8
+#define INIT_IB_MTU_SHIFT 12
+#define INIT_IB_MAX_GID_OFFSET 0x06
+#define INIT_IB_MAX_PKEY_OFFSET 0x0a
+#define INIT_IB_GUID0_OFFSET 0x10
+#define INIT_IB_NODE_GUID_OFFSET 0x18
+#define INIT_IB_SI_GUID_OFFSET 0x20
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ inbox = mailbox->buf;
+
+ memset(inbox, 0, INIT_IB_IN_SIZE);
+
+ flags = 0;
+ flags |= param->set_guid0 ? INIT_IB_FLAG_G0 : 0;
+ flags |= param->set_node_guid ? INIT_IB_FLAG_NG : 0;
+ flags |= param->set_si_guid ? INIT_IB_FLAG_SIG : 0;
+ flags |= param->vl_cap << INIT_IB_VL_SHIFT;
+ flags |= param->port_width << INIT_IB_PORT_WIDTH_SHIFT;
+ flags |= param->mtu_cap << INIT_IB_MTU_SHIFT;
+ MTHCA_PUT(inbox, flags, INIT_IB_FLAGS_OFFSET);
+
+ MTHCA_PUT(inbox, param->gid_cap, INIT_IB_MAX_GID_OFFSET);
+ MTHCA_PUT(inbox, param->pkey_cap, INIT_IB_MAX_PKEY_OFFSET);
+ MTHCA_PUT(inbox, param->guid0, INIT_IB_GUID0_OFFSET);
+ MTHCA_PUT(inbox, param->node_guid, INIT_IB_NODE_GUID_OFFSET);
+ MTHCA_PUT(inbox, param->si_guid, INIT_IB_SI_GUID_OFFSET);
+
+ err = mthca_cmd(dev, mailbox->dma, port, 0, CMD_INIT_IB,
+ CMD_TIME_CLASS_A);
+
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+}
+
+int mthca_CLOSE_IB(struct mthca_dev *dev, int port)
+{
+ return mthca_cmd(dev, 0, port, 0, CMD_CLOSE_IB, CMD_TIME_CLASS_A);
+}
+
+int mthca_CLOSE_HCA(struct mthca_dev *dev, int panic)
+{
+ return mthca_cmd(dev, 0, 0, panic, CMD_CLOSE_HCA, CMD_TIME_CLASS_C);
+}
+
+int mthca_SET_IB(struct mthca_dev *dev, struct mthca_set_ib_param *param,
+ int port)
+{
+ struct mthca_mailbox *mailbox;
+ u32 *inbox;
+ int err;
+ u32 flags = 0;
+
+#define SET_IB_IN_SIZE 0x40
+#define SET_IB_FLAGS_OFFSET 0x00
+#define SET_IB_FLAG_SIG (1 << 18)
+#define SET_IB_FLAG_RQK (1 << 0)
+#define SET_IB_CAP_MASK_OFFSET 0x04
+#define SET_IB_SI_GUID_OFFSET 0x08
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ inbox = mailbox->buf;
+
+ memset(inbox, 0, SET_IB_IN_SIZE);
+
+ flags |= param->set_si_guid ? SET_IB_FLAG_SIG : 0;
+ flags |= param->reset_qkey_viol ? SET_IB_FLAG_RQK : 0;
+ MTHCA_PUT(inbox, flags, SET_IB_FLAGS_OFFSET);
+
+ MTHCA_PUT(inbox, param->cap_mask, SET_IB_CAP_MASK_OFFSET);
+ MTHCA_PUT(inbox, param->si_guid, SET_IB_SI_GUID_OFFSET);
+
+ err = mthca_cmd(dev, mailbox->dma, port, 0, CMD_SET_IB,
+ CMD_TIME_CLASS_B);
+
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+}
+
+int mthca_MAP_ICM(struct mthca_dev *dev, struct mthca_icm *icm, u64 virt)
+{
+ return mthca_map_cmd(dev, CMD_MAP_ICM, icm, virt);
+}
+
+int mthca_MAP_ICM_page(struct mthca_dev *dev, u64 dma_addr, u64 virt)
+{
+ struct mthca_mailbox *mailbox;
+ __be64 *inbox;
+ int err;
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ inbox = mailbox->buf;
+
+ inbox[0] = cpu_to_be64(virt);
+ inbox[1] = cpu_to_be64(dma_addr);
+
+ err = mthca_cmd(dev, mailbox->dma, 1, 0, CMD_MAP_ICM,
+ CMD_TIME_CLASS_B);
+
+ mthca_free_mailbox(dev, mailbox);
+
+ if (!err)
+ mthca_dbg(dev, "Mapped page at %llx to %llx for ICM.\n",
+ (unsigned long long) dma_addr, (unsigned long long) virt);
+
+ return err;
+}
+
+int mthca_UNMAP_ICM(struct mthca_dev *dev, u64 virt, u32 page_count)
+{
+ mthca_dbg(dev, "Unmapping %d pages at %llx from ICM.\n",
+ page_count, (unsigned long long) virt);
+
+ return mthca_cmd(dev, virt, page_count, 0,
+ CMD_UNMAP_ICM, CMD_TIME_CLASS_B);
+}
+
+int mthca_MAP_ICM_AUX(struct mthca_dev *dev, struct mthca_icm *icm)
+{
+ return mthca_map_cmd(dev, CMD_MAP_ICM_AUX, icm, -1);
+}
+
+int mthca_UNMAP_ICM_AUX(struct mthca_dev *dev)
+{
+ return mthca_cmd(dev, 0, 0, 0, CMD_UNMAP_ICM_AUX, CMD_TIME_CLASS_B);
+}
+
+int mthca_SET_ICM_SIZE(struct mthca_dev *dev, u64 icm_size, u64 *aux_pages)
+{
+ int ret = mthca_cmd_imm(dev, icm_size, aux_pages, 0,
+ 0, CMD_SET_ICM_SIZE, CMD_TIME_CLASS_A);
+
+ if (ret)
+ return ret;
+
+ /*
+ * Round up number of system pages needed in case
+ * MTHCA_ICM_PAGE_SIZE < PAGE_SIZE.
+ */
+ *aux_pages = ALIGN(*aux_pages, PAGE_SIZE / MTHCA_ICM_PAGE_SIZE) >>
+ (PAGE_SHIFT - MTHCA_ICM_PAGE_SHIFT);
+
+ return 0;
+}
+
+int mthca_SW2HW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int mpt_index)
+{
+ return mthca_cmd(dev, mailbox->dma, mpt_index, 0, CMD_SW2HW_MPT,
+ CMD_TIME_CLASS_B);
+}
+
+int mthca_HW2SW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int mpt_index)
+{
+ return mthca_cmd_box(dev, 0, mailbox ? mailbox->dma : 0, mpt_index,
+ !mailbox, CMD_HW2SW_MPT,
+ CMD_TIME_CLASS_B);
+}
+
+int mthca_WRITE_MTT(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int num_mtt)
+{
+ return mthca_cmd(dev, mailbox->dma, num_mtt, 0, CMD_WRITE_MTT,
+ CMD_TIME_CLASS_B);
+}
+
+int mthca_SYNC_TPT(struct mthca_dev *dev)
+{
+ return mthca_cmd(dev, 0, 0, 0, CMD_SYNC_TPT, CMD_TIME_CLASS_B);
+}
+
+int mthca_MAP_EQ(struct mthca_dev *dev, u64 event_mask, int unmap,
+ int eq_num)
+{
+ mthca_dbg(dev, "%s mask %016llx for eqn %d\n",
+ unmap ? "Clearing" : "Setting",
+ (unsigned long long) event_mask, eq_num);
+ return mthca_cmd(dev, event_mask, (unmap << 31) | eq_num,
+ 0, CMD_MAP_EQ, CMD_TIME_CLASS_B);
+}
+
+int mthca_SW2HW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int eq_num)
+{
+ return mthca_cmd(dev, mailbox->dma, eq_num, 0, CMD_SW2HW_EQ,
+ CMD_TIME_CLASS_A);
+}
+
+int mthca_HW2SW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int eq_num)
+{
+ return mthca_cmd_box(dev, 0, mailbox->dma, eq_num, 0,
+ CMD_HW2SW_EQ,
+ CMD_TIME_CLASS_A);
+}
+
+int mthca_SW2HW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int cq_num)
+{
+ return mthca_cmd(dev, mailbox->dma, cq_num, 0, CMD_SW2HW_CQ,
+ CMD_TIME_CLASS_A);
+}
+
+int mthca_HW2SW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int cq_num)
+{
+ return mthca_cmd_box(dev, 0, mailbox->dma, cq_num, 0,
+ CMD_HW2SW_CQ,
+ CMD_TIME_CLASS_A);
+}
+
+int mthca_RESIZE_CQ(struct mthca_dev *dev, int cq_num, u32 lkey, u8 log_size)
+{
+ struct mthca_mailbox *mailbox;
+ __be32 *inbox;
+ int err;
+
+#define RESIZE_CQ_IN_SIZE 0x40
+#define RESIZE_CQ_LOG_SIZE_OFFSET 0x0c
+#define RESIZE_CQ_LKEY_OFFSET 0x1c
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ inbox = mailbox->buf;
+
+ memset(inbox, 0, RESIZE_CQ_IN_SIZE);
+ /*
+ * Leave start address fields zeroed out -- mthca assumes that
+ * MRs for CQs always start at virtual address 0.
+ */
+ MTHCA_PUT(inbox, log_size, RESIZE_CQ_LOG_SIZE_OFFSET);
+ MTHCA_PUT(inbox, lkey, RESIZE_CQ_LKEY_OFFSET);
+
+ err = mthca_cmd(dev, mailbox->dma, cq_num, 1, CMD_RESIZE_CQ,
+ CMD_TIME_CLASS_B);
+
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+}
+
+int mthca_SW2HW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int srq_num)
+{
+ return mthca_cmd(dev, mailbox->dma, srq_num, 0, CMD_SW2HW_SRQ,
+ CMD_TIME_CLASS_A);
+}
+
+int mthca_HW2SW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int srq_num)
+{
+ return mthca_cmd_box(dev, 0, mailbox->dma, srq_num, 0,
+ CMD_HW2SW_SRQ,
+ CMD_TIME_CLASS_A);
+}
+
+int mthca_QUERY_SRQ(struct mthca_dev *dev, u32 num,
+ struct mthca_mailbox *mailbox)
+{
+ return mthca_cmd_box(dev, 0, mailbox->dma, num, 0,
+ CMD_QUERY_SRQ, CMD_TIME_CLASS_A);
+}
+
+int mthca_ARM_SRQ(struct mthca_dev *dev, int srq_num, int limit)
+{
+ return mthca_cmd(dev, limit, srq_num, 0, CMD_ARM_SRQ,
+ CMD_TIME_CLASS_B);
+}
+
+int mthca_MODIFY_QP(struct mthca_dev *dev, enum ib_qp_state cur,
+ enum ib_qp_state next, u32 num, int is_ee,
+ struct mthca_mailbox *mailbox, u32 optmask)
+{
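+	/*
+	 * Map each (current state, next state) pair to the firmware
+	 * command that performs the transition.
+	 */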
+ static const u16 op[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
+ [IB_QPS_RESET] = {
+ [IB_QPS_RESET] = CMD_ERR2RST_QPEE,
+ [IB_QPS_ERR] = CMD_2ERR_QPEE,
+ [IB_QPS_INIT] = CMD_RST2INIT_QPEE,
+ },
+ [IB_QPS_INIT] = {
+ [IB_QPS_RESET] = CMD_ERR2RST_QPEE,
+ [IB_QPS_ERR] = CMD_2ERR_QPEE,
+ [IB_QPS_INIT] = CMD_INIT2INIT_QPEE,
+ [IB_QPS_RTR] = CMD_INIT2RTR_QPEE,
+ },
+ [IB_QPS_RTR] = {
+ [IB_QPS_RESET] = CMD_ERR2RST_QPEE,
+ [IB_QPS_ERR] = CMD_2ERR_QPEE,
+ [IB_QPS_RTS] = CMD_RTR2RTS_QPEE,
+ },
+ [IB_QPS_RTS] = {
+ [IB_QPS_RESET] = CMD_ERR2RST_QPEE,
+ [IB_QPS_ERR] = CMD_2ERR_QPEE,
+ [IB_QPS_RTS] = CMD_RTS2RTS_QPEE,
+ [IB_QPS_SQD] = CMD_RTS2SQD_QPEE,
+ },
+ [IB_QPS_SQD] = {
+ [IB_QPS_RESET] = CMD_ERR2RST_QPEE,
+ [IB_QPS_ERR] = CMD_2ERR_QPEE,
+ [IB_QPS_RTS] = CMD_SQD2RTS_QPEE,
+ [IB_QPS_SQD] = CMD_SQD2SQD_QPEE,
+ },
+ [IB_QPS_SQE] = {
+ [IB_QPS_RESET] = CMD_ERR2RST_QPEE,
+ [IB_QPS_ERR] = CMD_2ERR_QPEE,
+ [IB_QPS_RTS] = CMD_SQERR2RTS_QPEE,
+ },
+ [IB_QPS_ERR] = {
+ [IB_QPS_RESET] = CMD_ERR2RST_QPEE,
+ [IB_QPS_ERR] = CMD_2ERR_QPEE,
+ }
+ };
+
+ u8 op_mod = 0;
+ int my_mailbox = 0;
+ int err;
+
+ if (op[cur][next] == CMD_ERR2RST_QPEE) {
+ op_mod = 3; /* don't write outbox, any->reset */
+
+ /* For debugging */
+ if (!mailbox) {
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (!IS_ERR(mailbox)) {
+ my_mailbox = 1;
+ op_mod = 2; /* write outbox, any->reset */
+ } else
+ mailbox = NULL;
+ }
+
+ err = mthca_cmd_box(dev, 0, mailbox ? mailbox->dma : 0,
+ (!!is_ee << 24) | num, op_mod,
+ op[cur][next], CMD_TIME_CLASS_C);
+
+ if (0 && mailbox) {
+ int i;
+ mthca_dbg(dev, "Dumping QP context:\n");
+ printk(" %08x\n", be32_to_cpup(mailbox->buf));
+ for (i = 0; i < 0x100 / 4; ++i) {
+ if (i % 8 == 0)
+ printk("[%02x] ", i * 4);
+ printk(" %08x",
+ be32_to_cpu(((__be32 *) mailbox->buf)[i + 2]));
+ if ((i + 1) % 8 == 0)
+ printk("\n");
+ }
+ }
+
+ if (my_mailbox)
+ mthca_free_mailbox(dev, mailbox);
+ } else {
+ if (0) {
+ int i;
+ mthca_dbg(dev, "Dumping QP context:\n");
+ printk(" opt param mask: %08x\n", be32_to_cpup(mailbox->buf));
+ for (i = 0; i < 0x100 / 4; ++i) {
+ if (i % 8 == 0)
+ printk(" [%02x] ", i * 4);
+ printk(" %08x",
+ be32_to_cpu(((__be32 *) mailbox->buf)[i + 2]));
+ if ((i + 1) % 8 == 0)
+ printk("\n");
+ }
+ }
+
+ err = mthca_cmd(dev, mailbox->dma, optmask | (!!is_ee << 24) | num,
+ op_mod, op[cur][next], CMD_TIME_CLASS_C);
+ }
+
+ return err;
+}
+
+int mthca_QUERY_QP(struct mthca_dev *dev, u32 num, int is_ee,
+ struct mthca_mailbox *mailbox)
+{
+ return mthca_cmd_box(dev, 0, mailbox->dma, (!!is_ee << 24) | num, 0,
+ CMD_QUERY_QPEE, CMD_TIME_CLASS_A);
+}
+
+int mthca_CONF_SPECIAL_QP(struct mthca_dev *dev, int type, u32 qpn)
+{
+ u8 op_mod;
+
+ switch (type) {
+ case IB_QPT_SMI:
+ op_mod = 0;
+ break;
+ case IB_QPT_GSI:
+ op_mod = 1;
+ break;
+ case IB_QPT_RAW_IPV6:
+ op_mod = 2;
+ break;
+ case IB_QPT_RAW_ETHERTYPE:
+ op_mod = 3;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return mthca_cmd(dev, 0, qpn, op_mod, CMD_CONF_SPECIAL_QP,
+ CMD_TIME_CLASS_B);
+}
+
+int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey,
+ int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
+ void *in_mad, void *response_mad)
+{
+ struct mthca_mailbox *inmailbox, *outmailbox;
+ void *inbox;
+ int err;
+ u32 in_modifier = port;
+ u8 op_modifier = 0;
+
+#define MAD_IFC_BOX_SIZE 0x400
+#define MAD_IFC_MY_QPN_OFFSET 0x100
+#define MAD_IFC_RQPN_OFFSET 0x108
+#define MAD_IFC_SL_OFFSET 0x10c
+#define MAD_IFC_G_PATH_OFFSET 0x10d
+#define MAD_IFC_RLID_OFFSET 0x10e
+#define MAD_IFC_PKEY_OFFSET 0x112
+#define MAD_IFC_GRH_OFFSET 0x140
+
+ inmailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(inmailbox))
+ return PTR_ERR(inmailbox);
+ inbox = inmailbox->buf;
+
+ outmailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(outmailbox)) {
+ mthca_free_mailbox(dev, inmailbox);
+ return PTR_ERR(outmailbox);
+ }
+
+ memcpy(inbox, in_mad, 256);
+
+ /*
+ * Key check traps can't be generated unless we have in_wc to
+ * tell us where to send the trap.
+ */
+ if (ignore_mkey || !in_wc)
+ op_modifier |= 0x1;
+ if (ignore_bkey || !in_wc)
+ op_modifier |= 0x2;
+
+ if (in_wc) {
+ u8 val;
+
+ memset(inbox + 256, 0, 256);
+
+ MTHCA_PUT(inbox, in_wc->qp->qp_num, MAD_IFC_MY_QPN_OFFSET);
+ MTHCA_PUT(inbox, in_wc->src_qp, MAD_IFC_RQPN_OFFSET);
+
+ val = in_wc->sl << 4;
+ MTHCA_PUT(inbox, val, MAD_IFC_SL_OFFSET);
+
+ val = in_wc->dlid_path_bits |
+ (in_wc->wc_flags & IB_WC_GRH ? 0x80 : 0);
+ MTHCA_PUT(inbox, val, MAD_IFC_G_PATH_OFFSET);
+
+ MTHCA_PUT(inbox, in_wc->slid, MAD_IFC_RLID_OFFSET);
+ MTHCA_PUT(inbox, in_wc->pkey_index, MAD_IFC_PKEY_OFFSET);
+
+ if (in_grh)
+ memcpy(inbox + MAD_IFC_GRH_OFFSET, in_grh, 40);
+
+ op_modifier |= 0x4;
+
+ in_modifier |= in_wc->slid << 16;
+ }
+
+ err = mthca_cmd_box(dev, inmailbox->dma, outmailbox->dma,
+ in_modifier, op_modifier,
+ CMD_MAD_IFC, CMD_TIME_CLASS_C);
+
+ if (!err)
+ memcpy(response_mad, outmailbox->buf, 256);
+
+ mthca_free_mailbox(dev, inmailbox);
+ mthca_free_mailbox(dev, outmailbox);
+ return err;
+}
+
+int mthca_READ_MGM(struct mthca_dev *dev, int index,
+ struct mthca_mailbox *mailbox)
+{
+ return mthca_cmd_box(dev, 0, mailbox->dma, index, 0,
+ CMD_READ_MGM, CMD_TIME_CLASS_A);
+}
+
+int mthca_WRITE_MGM(struct mthca_dev *dev, int index,
+ struct mthca_mailbox *mailbox)
+{
+ return mthca_cmd(dev, mailbox->dma, index, 0, CMD_WRITE_MGM,
+ CMD_TIME_CLASS_A);
+}
+
+int mthca_MGID_HASH(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ u16 *hash)
+{
+ u64 imm;
+ int err;
+
+ err = mthca_cmd_imm(dev, mailbox->dma, &imm, 0, 0, CMD_MGID_HASH,
+ CMD_TIME_CLASS_A);
+
+ *hash = imm;
+ return err;
+}
+
+int mthca_NOP(struct mthca_dev *dev)
+{
+ return mthca_cmd(dev, 0, 0x1f, 0, CMD_NOP, msecs_to_jiffies(100));
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.h b/drivers/infiniband/hw/mthca/mthca_cmd.h
new file mode 100644
index 000000000..f952244c5
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_cmd.h
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2006 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_CMD_H
+#define MTHCA_CMD_H
+
+#include <rdma/ib_verbs.h>
+
+#define MTHCA_MAILBOX_SIZE 4096
+
+enum {
+ /* command completed successfully: */
+ MTHCA_CMD_STAT_OK = 0x00,
+ /* Internal error (such as a bus error) occurred while processing command: */
+ MTHCA_CMD_STAT_INTERNAL_ERR = 0x01,
+ /* Operation/command not supported or opcode modifier not supported: */
+ MTHCA_CMD_STAT_BAD_OP = 0x02,
+ /* Parameter not supported or parameter out of range: */
+ MTHCA_CMD_STAT_BAD_PARAM = 0x03,
+ /* System not enabled or bad system state: */
+ MTHCA_CMD_STAT_BAD_SYS_STATE = 0x04,
+	/* Attempt to access reserved or unallocated resource: */
+ MTHCA_CMD_STAT_BAD_RESOURCE = 0x05,
+ /* Requested resource is currently executing a command, or is otherwise busy: */
+ MTHCA_CMD_STAT_RESOURCE_BUSY = 0x06,
+ /* memory error: */
+ MTHCA_CMD_STAT_DDR_MEM_ERR = 0x07,
+ /* Required capability exceeds device limits: */
+ MTHCA_CMD_STAT_EXCEED_LIM = 0x08,
+ /* Resource is not in the appropriate state or ownership: */
+ MTHCA_CMD_STAT_BAD_RES_STATE = 0x09,
+ /* Index out of range: */
+ MTHCA_CMD_STAT_BAD_INDEX = 0x0a,
+ /* FW image corrupted: */
+ MTHCA_CMD_STAT_BAD_NVMEM = 0x0b,
+ /* Attempt to modify a QP/EE which is not in the presumed state: */
+ MTHCA_CMD_STAT_BAD_QPEE_STATE = 0x10,
+ /* Bad segment parameters (Address/Size): */
+ MTHCA_CMD_STAT_BAD_SEG_PARAM = 0x20,
+	/* Memory Region has Memory Windows bound to it: */
+ MTHCA_CMD_STAT_REG_BOUND = 0x21,
+ /* HCA local attached memory not present: */
+ MTHCA_CMD_STAT_LAM_NOT_PRE = 0x22,
+ /* Bad management packet (silently discarded): */
+ MTHCA_CMD_STAT_BAD_PKT = 0x30,
+ /* More outstanding CQEs in CQ than new CQ size: */
+ MTHCA_CMD_STAT_BAD_SIZE = 0x40
+};
+
+enum {
+ MTHCA_TRANS_INVALID = 0,
+ MTHCA_TRANS_RST2INIT,
+ MTHCA_TRANS_INIT2INIT,
+ MTHCA_TRANS_INIT2RTR,
+ MTHCA_TRANS_RTR2RTS,
+ MTHCA_TRANS_RTS2RTS,
+ MTHCA_TRANS_SQERR2RTS,
+ MTHCA_TRANS_ANY2ERR,
+ MTHCA_TRANS_RTS2SQD,
+ MTHCA_TRANS_SQD2SQD,
+ MTHCA_TRANS_SQD2RTS,
+ MTHCA_TRANS_ANY2RST,
+};
+
+enum {
+ DEV_LIM_FLAG_RC = 1 << 0,
+ DEV_LIM_FLAG_UC = 1 << 1,
+ DEV_LIM_FLAG_UD = 1 << 2,
+ DEV_LIM_FLAG_RD = 1 << 3,
+ DEV_LIM_FLAG_RAW_IPV6 = 1 << 4,
+ DEV_LIM_FLAG_RAW_ETHER = 1 << 5,
+ DEV_LIM_FLAG_SRQ = 1 << 6,
+ DEV_LIM_FLAG_IPOIB_CSUM = 1 << 7,
+ DEV_LIM_FLAG_BAD_PKEY_CNTR = 1 << 8,
+ DEV_LIM_FLAG_BAD_QKEY_CNTR = 1 << 9,
+ DEV_LIM_FLAG_MW = 1 << 16,
+ DEV_LIM_FLAG_AUTO_PATH_MIG = 1 << 17,
+ DEV_LIM_FLAG_ATOMIC = 1 << 18,
+ DEV_LIM_FLAG_RAW_MULTI = 1 << 19,
+ DEV_LIM_FLAG_UD_AV_PORT_ENFORCE = 1 << 20,
+ DEV_LIM_FLAG_UD_MULTI = 1 << 21,
+};
+
+struct mthca_mailbox {
+ dma_addr_t dma;
+ void *buf;
+};
+
+struct mthca_dev_lim {
+ int max_srq_sz;
+ int max_qp_sz;
+ int reserved_qps;
+ int max_qps;
+ int reserved_srqs;
+ int max_srqs;
+ int reserved_eecs;
+ int max_eecs;
+ int max_cq_sz;
+ int reserved_cqs;
+ int max_cqs;
+ int max_mpts;
+ int reserved_eqs;
+ int max_eqs;
+ int reserved_mtts;
+ int max_mrw_sz;
+ int reserved_mrws;
+ int max_mtt_seg;
+ int max_requester_per_qp;
+ int max_responder_per_qp;
+ int max_rdma_global;
+ int local_ca_ack_delay;
+ int max_mtu;
+ int max_port_width;
+ int max_vl;
+ int num_ports;
+ int max_gids;
+ u16 stat_rate_support;
+ int max_pkeys;
+ u32 flags;
+ int reserved_uars;
+ int uar_size;
+ int min_page_sz;
+ int max_sg;
+ int max_desc_sz;
+ int max_qp_per_mcg;
+ int reserved_mgms;
+ int max_mcgs;
+ int reserved_pds;
+ int max_pds;
+ int reserved_rdds;
+ int max_rdds;
+ int eec_entry_sz;
+ int qpc_entry_sz;
+ int eeec_entry_sz;
+ int eqpc_entry_sz;
+ int eqc_entry_sz;
+ int cqc_entry_sz;
+ int srq_entry_sz;
+ int uar_scratch_entry_sz;
+ int mpt_entry_sz;
+ union {
+ struct {
+ int max_avs;
+ } tavor;
+ struct {
+ int resize_srq;
+ int max_pbl_sz;
+ u8 bmme_flags;
+ u32 reserved_lkey;
+ int lam_required;
+ u64 max_icm_sz;
+ } arbel;
+ } hca;
+};
+
+struct mthca_adapter {
+ u32 vendor_id;
+ u32 device_id;
+ u32 revision_id;
+ char board_id[MTHCA_BOARD_ID_LEN];
+ u8 inta_pin;
+};
+
+struct mthca_init_hca_param {
+ u64 qpc_base;
+ u64 eec_base;
+ u64 srqc_base;
+ u64 cqc_base;
+ u64 eqpc_base;
+ u64 eeec_base;
+ u64 eqc_base;
+ u64 rdb_base;
+ u64 mc_base;
+ u64 mpt_base;
+ u64 mtt_base;
+ u64 uar_scratch_base;
+ u64 uarc_base;
+ u16 log_mc_entry_sz;
+ u16 mc_hash_sz;
+ u8 log_num_qps;
+ u8 log_num_eecs;
+ u8 log_num_srqs;
+ u8 log_num_cqs;
+ u8 log_num_eqs;
+ u8 log_mc_table_sz;
+ u8 mtt_seg_sz;
+ u8 log_mpt_sz;
+ u8 log_uar_sz;
+ u8 log_uarc_sz;
+};
+
+struct mthca_init_ib_param {
+ int port_width;
+ int vl_cap;
+ int mtu_cap;
+ u16 gid_cap;
+ u16 pkey_cap;
+ int set_guid0;
+ u64 guid0;
+ int set_node_guid;
+ u64 node_guid;
+ int set_si_guid;
+ u64 si_guid;
+};
+
+struct mthca_set_ib_param {
+ int set_si_guid;
+ int reset_qkey_viol;
+ u64 si_guid;
+ u32 cap_mask;
+};
+
+int mthca_cmd_init(struct mthca_dev *dev);
+void mthca_cmd_cleanup(struct mthca_dev *dev);
+int mthca_cmd_use_events(struct mthca_dev *dev);
+void mthca_cmd_use_polling(struct mthca_dev *dev);
+void mthca_cmd_event(struct mthca_dev *dev, u16 token,
+ u8 status, u64 out_param);
+
+struct mthca_mailbox *mthca_alloc_mailbox(struct mthca_dev *dev,
+ gfp_t gfp_mask);
+void mthca_free_mailbox(struct mthca_dev *dev, struct mthca_mailbox *mailbox);
+
+int mthca_SYS_EN(struct mthca_dev *dev);
+int mthca_SYS_DIS(struct mthca_dev *dev);
+int mthca_MAP_FA(struct mthca_dev *dev, struct mthca_icm *icm);
+int mthca_UNMAP_FA(struct mthca_dev *dev);
+int mthca_RUN_FW(struct mthca_dev *dev);
+int mthca_QUERY_FW(struct mthca_dev *dev);
+int mthca_ENABLE_LAM(struct mthca_dev *dev);
+int mthca_DISABLE_LAM(struct mthca_dev *dev);
+int mthca_QUERY_DDR(struct mthca_dev *dev);
+int mthca_QUERY_DEV_LIM(struct mthca_dev *dev,
+ struct mthca_dev_lim *dev_lim);
+int mthca_QUERY_ADAPTER(struct mthca_dev *dev,
+ struct mthca_adapter *adapter);
+int mthca_INIT_HCA(struct mthca_dev *dev,
+ struct mthca_init_hca_param *param);
+int mthca_INIT_IB(struct mthca_dev *dev,
+ struct mthca_init_ib_param *param,
+ int port);
+int mthca_CLOSE_IB(struct mthca_dev *dev, int port);
+int mthca_CLOSE_HCA(struct mthca_dev *dev, int panic);
+int mthca_SET_IB(struct mthca_dev *dev, struct mthca_set_ib_param *param,
+ int port);
+int mthca_MAP_ICM(struct mthca_dev *dev, struct mthca_icm *icm, u64 virt);
+int mthca_MAP_ICM_page(struct mthca_dev *dev, u64 dma_addr, u64 virt);
+int mthca_UNMAP_ICM(struct mthca_dev *dev, u64 virt, u32 page_count);
+int mthca_MAP_ICM_AUX(struct mthca_dev *dev, struct mthca_icm *icm);
+int mthca_UNMAP_ICM_AUX(struct mthca_dev *dev);
+int mthca_SET_ICM_SIZE(struct mthca_dev *dev, u64 icm_size, u64 *aux_pages);
+int mthca_SW2HW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int mpt_index);
+int mthca_HW2SW_MPT(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int mpt_index);
+int mthca_WRITE_MTT(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int num_mtt);
+int mthca_SYNC_TPT(struct mthca_dev *dev);
+int mthca_MAP_EQ(struct mthca_dev *dev, u64 event_mask, int unmap,
+ int eq_num);
+int mthca_SW2HW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int eq_num);
+int mthca_HW2SW_EQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int eq_num);
+int mthca_SW2HW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int cq_num);
+int mthca_HW2SW_CQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int cq_num);
+int mthca_RESIZE_CQ(struct mthca_dev *dev, int cq_num, u32 lkey, u8 log_size);
+int mthca_SW2HW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int srq_num);
+int mthca_HW2SW_SRQ(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ int srq_num);
+int mthca_QUERY_SRQ(struct mthca_dev *dev, u32 num,
+ struct mthca_mailbox *mailbox);
+int mthca_ARM_SRQ(struct mthca_dev *dev, int srq_num, int limit);
+int mthca_MODIFY_QP(struct mthca_dev *dev, enum ib_qp_state cur,
+ enum ib_qp_state next, u32 num, int is_ee,
+ struct mthca_mailbox *mailbox, u32 optmask);
+int mthca_QUERY_QP(struct mthca_dev *dev, u32 num, int is_ee,
+ struct mthca_mailbox *mailbox);
+int mthca_CONF_SPECIAL_QP(struct mthca_dev *dev, int type, u32 qpn);
+int mthca_MAD_IFC(struct mthca_dev *dev, int ignore_mkey, int ignore_bkey,
+ int port, struct ib_wc *in_wc, struct ib_grh *in_grh,
+ void *in_mad, void *response_mad);
+int mthca_READ_MGM(struct mthca_dev *dev, int index,
+ struct mthca_mailbox *mailbox);
+int mthca_WRITE_MGM(struct mthca_dev *dev, int index,
+ struct mthca_mailbox *mailbox);
+int mthca_MGID_HASH(struct mthca_dev *dev, struct mthca_mailbox *mailbox,
+ u16 *hash);
+int mthca_NOP(struct mthca_dev *dev);
+
+#endif /* MTHCA_CMD_H */
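
The MTHCA_CMD_STAT_* values above are the raw status codes the firmware returns in the HCR status byte; callers generally want ordinary errno values instead. A minimal sketch of such a translation follows -- the function name mthca_status_to_errno and the specific errno choices are assumptions for illustration, not the driver's actual table, and the sketch assumes <linux/errno.h> plus the enum above.

/*
 * Illustrative sketch only: fold a few MTHCA_CMD_STAT_* codes into
 * conventional errno values.  The exact mapping here is an assumption.
 */
static int mthca_status_to_errno(u8 status)
{
	switch (status) {
	case MTHCA_CMD_STAT_OK:            return 0;
	case MTHCA_CMD_STAT_BAD_OP:        return -EPERM;   /* opcode not supported */
	case MTHCA_CMD_STAT_BAD_PARAM:     return -EINVAL;  /* parameter out of range */
	case MTHCA_CMD_STAT_RESOURCE_BUSY: return -EBUSY;   /* resource busy with another command */
	case MTHCA_CMD_STAT_EXCEED_LIM:    return -ENOMEM;  /* capability exceeds device limits */
	case MTHCA_CMD_STAT_BAD_INDEX:     return -ERANGE;  /* index out of range */
	default:                           return -EIO;     /* treat everything else as an I/O error */
	}
}
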
diff --git a/drivers/infiniband/hw/mthca/mthca_config_reg.h b/drivers/infiniband/hw/mthca/mthca_config_reg.h
new file mode 100644
index 000000000..155bc6639
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_config_reg.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_CONFIG_REG_H
+#define MTHCA_CONFIG_REG_H
+
+#define MTHCA_HCR_BASE 0x80680
+#define MTHCA_HCR_SIZE 0x0001c
+#define MTHCA_ECR_BASE 0x80700
+#define MTHCA_ECR_SIZE 0x00008
+#define MTHCA_ECR_CLR_BASE 0x80708
+#define MTHCA_ECR_CLR_SIZE 0x00008
+#define MTHCA_MAP_ECR_SIZE (MTHCA_ECR_SIZE + MTHCA_ECR_CLR_SIZE)
+#define MTHCA_CLR_INT_BASE 0xf00d8
+#define MTHCA_CLR_INT_SIZE 0x00008
+#define MTHCA_EQ_SET_CI_SIZE (8 * 32)
+
+#endif /* MTHCA_CONFIG_REG_H */
diff --git a/drivers/infiniband/hw/mthca/mthca_cq.c b/drivers/infiniband/hw/mthca/mthca_cq.c
new file mode 100644
index 000000000..40ba83338
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_cq.c
@@ -0,0 +1,984 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/gfp.h>
+#include <linux/hardirq.h>
+#include <linux/sched.h>
+
+#include <asm/io.h>
+
+#include <rdma/ib_pack.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_memfree.h"
+
+enum {
+ MTHCA_MAX_DIRECT_CQ_SIZE = 4 * PAGE_SIZE
+};
+
+enum {
+ MTHCA_CQ_ENTRY_SIZE = 0x20
+};
+
+enum {
+ MTHCA_ATOMIC_BYTE_LEN = 8
+};
+
+/*
+ * Must be packed because start is 64 bits but only aligned to 32 bits.
+ */
+struct mthca_cq_context {
+ __be32 flags;
+ __be64 start;
+ __be32 logsize_usrpage;
+ __be32 error_eqn; /* Tavor only */
+ __be32 comp_eqn;
+ __be32 pd;
+ __be32 lkey;
+ __be32 last_notified_index;
+ __be32 solicit_producer_index;
+ __be32 consumer_index;
+ __be32 producer_index;
+ __be32 cqn;
+ __be32 ci_db; /* Arbel only */
+ __be32 state_db; /* Arbel only */
+ u32 reserved;
+} __attribute__((packed));
+
+#define MTHCA_CQ_STATUS_OK ( 0 << 28)
+#define MTHCA_CQ_STATUS_OVERFLOW ( 9 << 28)
+#define MTHCA_CQ_STATUS_WRITE_FAIL (10 << 28)
+#define MTHCA_CQ_FLAG_TR ( 1 << 18)
+#define MTHCA_CQ_FLAG_OI ( 1 << 17)
+#define MTHCA_CQ_STATE_DISARMED ( 0 << 8)
+#define MTHCA_CQ_STATE_ARMED ( 1 << 8)
+#define MTHCA_CQ_STATE_ARMED_SOL ( 4 << 8)
+#define MTHCA_EQ_STATE_FIRED (10 << 8)
+
+enum {
+ MTHCA_ERROR_CQE_OPCODE_MASK = 0xfe
+};
+
+enum {
+ SYNDROME_LOCAL_LENGTH_ERR = 0x01,
+ SYNDROME_LOCAL_QP_OP_ERR = 0x02,
+ SYNDROME_LOCAL_EEC_OP_ERR = 0x03,
+ SYNDROME_LOCAL_PROT_ERR = 0x04,
+ SYNDROME_WR_FLUSH_ERR = 0x05,
+ SYNDROME_MW_BIND_ERR = 0x06,
+ SYNDROME_BAD_RESP_ERR = 0x10,
+ SYNDROME_LOCAL_ACCESS_ERR = 0x11,
+ SYNDROME_REMOTE_INVAL_REQ_ERR = 0x12,
+ SYNDROME_REMOTE_ACCESS_ERR = 0x13,
+ SYNDROME_REMOTE_OP_ERR = 0x14,
+ SYNDROME_RETRY_EXC_ERR = 0x15,
+ SYNDROME_RNR_RETRY_EXC_ERR = 0x16,
+ SYNDROME_LOCAL_RDD_VIOL_ERR = 0x20,
+ SYNDROME_REMOTE_INVAL_RD_REQ_ERR = 0x21,
+ SYNDROME_REMOTE_ABORTED_ERR = 0x22,
+ SYNDROME_INVAL_EECN_ERR = 0x23,
+ SYNDROME_INVAL_EEC_STATE_ERR = 0x24
+};
+
+struct mthca_cqe {
+ __be32 my_qpn;
+ __be32 my_ee;
+ __be32 rqpn;
+ u8 sl_ipok;
+ u8 g_mlpath;
+ __be16 rlid;
+ __be32 imm_etype_pkey_eec;
+ __be32 byte_cnt;
+ __be32 wqe;
+ u8 opcode;
+ u8 is_send;
+ u8 reserved;
+ u8 owner;
+};
+
+struct mthca_err_cqe {
+ __be32 my_qpn;
+ u32 reserved1[3];
+ u8 syndrome;
+ u8 vendor_err;
+ __be16 db_cnt;
+ u32 reserved2;
+ __be32 wqe;
+ u8 opcode;
+ u8 reserved3[2];
+ u8 owner;
+};
+
+#define MTHCA_CQ_ENTRY_OWNER_SW (0 << 7)
+#define MTHCA_CQ_ENTRY_OWNER_HW (1 << 7)
+
+#define MTHCA_TAVOR_CQ_DB_INC_CI (1 << 24)
+#define MTHCA_TAVOR_CQ_DB_REQ_NOT (2 << 24)
+#define MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL (3 << 24)
+#define MTHCA_TAVOR_CQ_DB_SET_CI (4 << 24)
+#define MTHCA_TAVOR_CQ_DB_REQ_NOT_MULT (5 << 24)
+
+#define MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL (1 << 24)
+#define MTHCA_ARBEL_CQ_DB_REQ_NOT (2 << 24)
+#define MTHCA_ARBEL_CQ_DB_REQ_NOT_MULT (3 << 24)
+
+static inline struct mthca_cqe *get_cqe_from_buf(struct mthca_cq_buf *buf,
+ int entry)
+{
+ if (buf->is_direct)
+ return buf->queue.direct.buf + (entry * MTHCA_CQ_ENTRY_SIZE);
+ else
+ return buf->queue.page_list[entry * MTHCA_CQ_ENTRY_SIZE / PAGE_SIZE].buf
+ + (entry * MTHCA_CQ_ENTRY_SIZE) % PAGE_SIZE;
+}
+
+static inline struct mthca_cqe *get_cqe(struct mthca_cq *cq, int entry)
+{
+ return get_cqe_from_buf(&cq->buf, entry);
+}
+
+static inline struct mthca_cqe *cqe_sw(struct mthca_cqe *cqe)
+{
+ return MTHCA_CQ_ENTRY_OWNER_HW & cqe->owner ? NULL : cqe;
+}
+
+static inline struct mthca_cqe *next_cqe_sw(struct mthca_cq *cq)
+{
+ return cqe_sw(get_cqe(cq, cq->cons_index & cq->ibcq.cqe));
+}
+
+static inline void set_cqe_hw(struct mthca_cqe *cqe)
+{
+ cqe->owner = MTHCA_CQ_ENTRY_OWNER_HW;
+}
+
+static void dump_cqe(struct mthca_dev *dev, void *cqe_ptr)
+{
+ __be32 *cqe = cqe_ptr;
+
+ (void) cqe; /* avoid warning if mthca_dbg compiled away... */
+ mthca_dbg(dev, "CQE contents %08x %08x %08x %08x %08x %08x %08x %08x\n",
+ be32_to_cpu(cqe[0]), be32_to_cpu(cqe[1]), be32_to_cpu(cqe[2]),
+ be32_to_cpu(cqe[3]), be32_to_cpu(cqe[4]), be32_to_cpu(cqe[5]),
+ be32_to_cpu(cqe[6]), be32_to_cpu(cqe[7]));
+}
+
+/*
+ * incr is ignored in native Arbel (mem-free) mode, so cq->cons_index
+ * should be correct before calling update_cons_index().
+ */
+static inline void update_cons_index(struct mthca_dev *dev, struct mthca_cq *cq,
+ int incr)
+{
+ if (mthca_is_memfree(dev)) {
+ *cq->set_ci_db = cpu_to_be32(cq->cons_index);
+ wmb();
+ } else {
+ mthca_write64(MTHCA_TAVOR_CQ_DB_INC_CI | cq->cqn, incr - 1,
+ dev->kar + MTHCA_CQ_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+ /*
+ * Make sure doorbells don't leak out of CQ spinlock
+ * and reach the HCA out of order:
+ */
+ mmiowb();
+ }
+}
+
+void mthca_cq_completion(struct mthca_dev *dev, u32 cqn)
+{
+ struct mthca_cq *cq;
+
+ cq = mthca_array_get(&dev->cq_table.cq, cqn & (dev->limits.num_cqs - 1));
+
+ if (!cq) {
+ mthca_warn(dev, "Completion event for bogus CQ %08x\n", cqn);
+ return;
+ }
+
+ ++cq->arm_sn;
+
+ cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+}
+
+void mthca_cq_event(struct mthca_dev *dev, u32 cqn,
+ enum ib_event_type event_type)
+{
+ struct mthca_cq *cq;
+ struct ib_event event;
+
+ spin_lock(&dev->cq_table.lock);
+
+ cq = mthca_array_get(&dev->cq_table.cq, cqn & (dev->limits.num_cqs - 1));
+ if (cq)
+ ++cq->refcount;
+
+ spin_unlock(&dev->cq_table.lock);
+
+ if (!cq) {
+ mthca_warn(dev, "Async event for bogus CQ %08x\n", cqn);
+ return;
+ }
+
+ event.device = &dev->ib_dev;
+ event.event = event_type;
+ event.element.cq = &cq->ibcq;
+ if (cq->ibcq.event_handler)
+ cq->ibcq.event_handler(&event, cq->ibcq.cq_context);
+
+ spin_lock(&dev->cq_table.lock);
+ if (!--cq->refcount)
+ wake_up(&cq->wait);
+ spin_unlock(&dev->cq_table.lock);
+}
+
+static inline int is_recv_cqe(struct mthca_cqe *cqe)
+{
+ if ((cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) ==
+ MTHCA_ERROR_CQE_OPCODE_MASK)
+ return !(cqe->opcode & 0x01);
+ else
+ return !(cqe->is_send & 0x80);
+}
+
+void mthca_cq_clean(struct mthca_dev *dev, struct mthca_cq *cq, u32 qpn,
+ struct mthca_srq *srq)
+{
+ struct mthca_cqe *cqe;
+ u32 prod_index;
+ int i, nfreed = 0;
+
+ spin_lock_irq(&cq->lock);
+
+ /*
+ * First we need to find the current producer index, so we
+ * know where to start cleaning from. It doesn't matter if HW
+ * adds new entries after this loop -- the QP we're worried
+ * about is already in RESET, so the new entries won't come
+ * from our QP and therefore don't need to be checked.
+ */
+ for (prod_index = cq->cons_index;
+ cqe_sw(get_cqe(cq, prod_index & cq->ibcq.cqe));
+ ++prod_index)
+ if (prod_index == cq->cons_index + cq->ibcq.cqe)
+ break;
+
+ if (0)
+ mthca_dbg(dev, "Cleaning QPN %06x from CQN %06x; ci %d, pi %d\n",
+ qpn, cq->cqn, cq->cons_index, prod_index);
+
+ /*
+ * Now sweep backwards through the CQ, removing CQ entries
+ * that match our QP by copying older entries on top of them.
+ */
+ while ((int) --prod_index - (int) cq->cons_index >= 0) {
+ cqe = get_cqe(cq, prod_index & cq->ibcq.cqe);
+ if (cqe->my_qpn == cpu_to_be32(qpn)) {
+ if (srq && is_recv_cqe(cqe))
+ mthca_free_srq_wqe(srq, be32_to_cpu(cqe->wqe));
+ ++nfreed;
+ } else if (nfreed)
+ memcpy(get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe),
+ cqe, MTHCA_CQ_ENTRY_SIZE);
+ }
+
+ if (nfreed) {
+ for (i = 0; i < nfreed; ++i)
+ set_cqe_hw(get_cqe(cq, (cq->cons_index + i) & cq->ibcq.cqe));
+ wmb();
+ cq->cons_index += nfreed;
+ update_cons_index(dev, cq, nfreed);
+ }
+
+ spin_unlock_irq(&cq->lock);
+}
+
+void mthca_cq_resize_copy_cqes(struct mthca_cq *cq)
+{
+ int i;
+
+ /*
+ * In Tavor mode, the hardware keeps the consumer and producer
+ * indices mod the CQ size. Since we might be making the CQ
+ * bigger, we need to deal with the case where the producer
+ * index wrapped around before the CQ was resized.
+ */
+ if (!mthca_is_memfree(to_mdev(cq->ibcq.device)) &&
+ cq->ibcq.cqe < cq->resize_buf->cqe) {
+ cq->cons_index &= cq->ibcq.cqe;
+ if (cqe_sw(get_cqe(cq, cq->ibcq.cqe)))
+ cq->cons_index -= cq->ibcq.cqe + 1;
+ }
+
+ for (i = cq->cons_index; cqe_sw(get_cqe(cq, i & cq->ibcq.cqe)); ++i)
+ memcpy(get_cqe_from_buf(&cq->resize_buf->buf,
+ i & cq->resize_buf->cqe),
+ get_cqe(cq, i & cq->ibcq.cqe), MTHCA_CQ_ENTRY_SIZE);
+}
+
+int mthca_alloc_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int nent)
+{
+ int ret;
+ int i;
+
+ ret = mthca_buf_alloc(dev, nent * MTHCA_CQ_ENTRY_SIZE,
+ MTHCA_MAX_DIRECT_CQ_SIZE,
+ &buf->queue, &buf->is_direct,
+ &dev->driver_pd, 1, &buf->mr);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < nent; ++i)
+ set_cqe_hw(get_cqe_from_buf(buf, i));
+
+ return 0;
+}
+
+void mthca_free_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int cqe)
+{
+ mthca_buf_free(dev, (cqe + 1) * MTHCA_CQ_ENTRY_SIZE, &buf->queue,
+ buf->is_direct, &buf->mr);
+}
+
+static void handle_error_cqe(struct mthca_dev *dev, struct mthca_cq *cq,
+ struct mthca_qp *qp, int wqe_index, int is_send,
+ struct mthca_err_cqe *cqe,
+ struct ib_wc *entry, int *free_cqe)
+{
+ int dbd;
+ __be32 new_wqe;
+
+ if (cqe->syndrome == SYNDROME_LOCAL_QP_OP_ERR) {
+ mthca_dbg(dev, "local QP operation err "
+ "(QPN %06x, WQE @ %08x, CQN %06x, index %d)\n",
+ be32_to_cpu(cqe->my_qpn), be32_to_cpu(cqe->wqe),
+ cq->cqn, cq->cons_index);
+ dump_cqe(dev, cqe);
+ }
+
+ /*
+ * For completions in error, only work request ID, status, vendor error
+ * (and freed resource count for RD) have to be set.
+ */
+ switch (cqe->syndrome) {
+ case SYNDROME_LOCAL_LENGTH_ERR:
+ entry->status = IB_WC_LOC_LEN_ERR;
+ break;
+ case SYNDROME_LOCAL_QP_OP_ERR:
+ entry->status = IB_WC_LOC_QP_OP_ERR;
+ break;
+ case SYNDROME_LOCAL_EEC_OP_ERR:
+ entry->status = IB_WC_LOC_EEC_OP_ERR;
+ break;
+ case SYNDROME_LOCAL_PROT_ERR:
+ entry->status = IB_WC_LOC_PROT_ERR;
+ break;
+ case SYNDROME_WR_FLUSH_ERR:
+ entry->status = IB_WC_WR_FLUSH_ERR;
+ break;
+ case SYNDROME_MW_BIND_ERR:
+ entry->status = IB_WC_MW_BIND_ERR;
+ break;
+ case SYNDROME_BAD_RESP_ERR:
+ entry->status = IB_WC_BAD_RESP_ERR;
+ break;
+ case SYNDROME_LOCAL_ACCESS_ERR:
+ entry->status = IB_WC_LOC_ACCESS_ERR;
+ break;
+ case SYNDROME_REMOTE_INVAL_REQ_ERR:
+ entry->status = IB_WC_REM_INV_REQ_ERR;
+ break;
+ case SYNDROME_REMOTE_ACCESS_ERR:
+ entry->status = IB_WC_REM_ACCESS_ERR;
+ break;
+ case SYNDROME_REMOTE_OP_ERR:
+ entry->status = IB_WC_REM_OP_ERR;
+ break;
+ case SYNDROME_RETRY_EXC_ERR:
+ entry->status = IB_WC_RETRY_EXC_ERR;
+ break;
+ case SYNDROME_RNR_RETRY_EXC_ERR:
+ entry->status = IB_WC_RNR_RETRY_EXC_ERR;
+ break;
+ case SYNDROME_LOCAL_RDD_VIOL_ERR:
+ entry->status = IB_WC_LOC_RDD_VIOL_ERR;
+ break;
+ case SYNDROME_REMOTE_INVAL_RD_REQ_ERR:
+ entry->status = IB_WC_REM_INV_RD_REQ_ERR;
+ break;
+ case SYNDROME_REMOTE_ABORTED_ERR:
+ entry->status = IB_WC_REM_ABORT_ERR;
+ break;
+ case SYNDROME_INVAL_EECN_ERR:
+ entry->status = IB_WC_INV_EECN_ERR;
+ break;
+ case SYNDROME_INVAL_EEC_STATE_ERR:
+ entry->status = IB_WC_INV_EEC_STATE_ERR;
+ break;
+ default:
+ entry->status = IB_WC_GENERAL_ERR;
+ break;
+ }
+
+ entry->vendor_err = cqe->vendor_err;
+
+ /*
+ * Mem-free HCAs always generate one CQE per WQE, even in the
+ * error case, so we don't have to check the doorbell count, etc.
+ */
+ if (mthca_is_memfree(dev))
+ return;
+
+ mthca_free_err_wqe(dev, qp, is_send, wqe_index, &dbd, &new_wqe);
+
+ /*
+ * If we're at the end of the WQE chain, or we've used up our
+ * doorbell count, free the CQE. Otherwise just update it for
+ * the next poll operation.
+ */
+ if (!(new_wqe & cpu_to_be32(0x3f)) || (!cqe->db_cnt && dbd))
+ return;
+
+ be16_add_cpu(&cqe->db_cnt, -dbd);
+ cqe->wqe = new_wqe;
+ cqe->syndrome = SYNDROME_WR_FLUSH_ERR;
+
+ *free_cqe = 0;
+}
+
+static inline int mthca_poll_one(struct mthca_dev *dev,
+ struct mthca_cq *cq,
+ struct mthca_qp **cur_qp,
+ int *freed,
+ struct ib_wc *entry)
+{
+ struct mthca_wq *wq;
+ struct mthca_cqe *cqe;
+ int wqe_index;
+ int is_error;
+ int is_send;
+ int free_cqe = 1;
+ int err = 0;
+ u16 checksum;
+
+ cqe = next_cqe_sw(cq);
+ if (!cqe)
+ return -EAGAIN;
+
+ /*
+ * Make sure we read CQ entry contents after we've checked the
+ * ownership bit.
+ */
+ rmb();
+
+ if (0) {
+ mthca_dbg(dev, "%x/%d: CQE -> QPN %06x, WQE @ %08x\n",
+ cq->cqn, cq->cons_index, be32_to_cpu(cqe->my_qpn),
+ be32_to_cpu(cqe->wqe));
+ dump_cqe(dev, cqe);
+ }
+
+ is_error = (cqe->opcode & MTHCA_ERROR_CQE_OPCODE_MASK) ==
+ MTHCA_ERROR_CQE_OPCODE_MASK;
+ is_send = is_error ? cqe->opcode & 0x01 : cqe->is_send & 0x80;
+
+ if (!*cur_qp || be32_to_cpu(cqe->my_qpn) != (*cur_qp)->qpn) {
+ /*
+ * We do not have to take the QP table lock here,
+ * because CQs will be locked while QPs are removed
+ * from the table.
+ */
+ *cur_qp = mthca_array_get(&dev->qp_table.qp,
+ be32_to_cpu(cqe->my_qpn) &
+ (dev->limits.num_qps - 1));
+ if (!*cur_qp) {
+ mthca_warn(dev, "CQ entry for unknown QP %06x\n",
+ be32_to_cpu(cqe->my_qpn) & 0xffffff);
+ err = -EINVAL;
+ goto out;
+ }
+ }
+
+ entry->qp = &(*cur_qp)->ibqp;
+
+ if (is_send) {
+ wq = &(*cur_qp)->sq;
+ wqe_index = ((be32_to_cpu(cqe->wqe) - (*cur_qp)->send_wqe_offset)
+ >> wq->wqe_shift);
+ entry->wr_id = (*cur_qp)->wrid[wqe_index +
+ (*cur_qp)->rq.max];
+ } else if ((*cur_qp)->ibqp.srq) {
+ struct mthca_srq *srq = to_msrq((*cur_qp)->ibqp.srq);
+ u32 wqe = be32_to_cpu(cqe->wqe);
+ wq = NULL;
+ wqe_index = wqe >> srq->wqe_shift;
+ entry->wr_id = srq->wrid[wqe_index];
+ mthca_free_srq_wqe(srq, wqe);
+ } else {
+ s32 wqe;
+ wq = &(*cur_qp)->rq;
+ wqe = be32_to_cpu(cqe->wqe);
+ wqe_index = wqe >> wq->wqe_shift;
+ /*
+ * WQE addr == base - 1 might be reported in receive completion
+ * with error instead of (rq size - 1) by Sinai FW 1.0.800 and
+ * Arbel FW 5.1.400. This bug should be fixed in later FW revs.
+ */
+ if (unlikely(wqe_index < 0))
+ wqe_index = wq->max - 1;
+ entry->wr_id = (*cur_qp)->wrid[wqe_index];
+ }
+
+ if (wq) {
+ if (wq->last_comp < wqe_index)
+ wq->tail += wqe_index - wq->last_comp;
+ else
+ wq->tail += wqe_index + wq->max - wq->last_comp;
+
+ wq->last_comp = wqe_index;
+ }
+
+ if (is_error) {
+ handle_error_cqe(dev, cq, *cur_qp, wqe_index, is_send,
+ (struct mthca_err_cqe *) cqe,
+ entry, &free_cqe);
+ goto out;
+ }
+
+ if (is_send) {
+ entry->wc_flags = 0;
+ switch (cqe->opcode) {
+ case MTHCA_OPCODE_RDMA_WRITE:
+ entry->opcode = IB_WC_RDMA_WRITE;
+ break;
+ case MTHCA_OPCODE_RDMA_WRITE_IMM:
+ entry->opcode = IB_WC_RDMA_WRITE;
+ entry->wc_flags |= IB_WC_WITH_IMM;
+ break;
+ case MTHCA_OPCODE_SEND:
+ entry->opcode = IB_WC_SEND;
+ break;
+ case MTHCA_OPCODE_SEND_IMM:
+ entry->opcode = IB_WC_SEND;
+ entry->wc_flags |= IB_WC_WITH_IMM;
+ break;
+ case MTHCA_OPCODE_RDMA_READ:
+ entry->opcode = IB_WC_RDMA_READ;
+ entry->byte_len = be32_to_cpu(cqe->byte_cnt);
+ break;
+ case MTHCA_OPCODE_ATOMIC_CS:
+ entry->opcode = IB_WC_COMP_SWAP;
+ entry->byte_len = MTHCA_ATOMIC_BYTE_LEN;
+ break;
+ case MTHCA_OPCODE_ATOMIC_FA:
+ entry->opcode = IB_WC_FETCH_ADD;
+ entry->byte_len = MTHCA_ATOMIC_BYTE_LEN;
+ break;
+ case MTHCA_OPCODE_BIND_MW:
+ entry->opcode = IB_WC_BIND_MW;
+ break;
+ default:
+ entry->opcode = MTHCA_OPCODE_INVALID;
+ break;
+ }
+ } else {
+ entry->byte_len = be32_to_cpu(cqe->byte_cnt);
+ switch (cqe->opcode & 0x1f) {
+ case IB_OPCODE_SEND_LAST_WITH_IMMEDIATE:
+ case IB_OPCODE_SEND_ONLY_WITH_IMMEDIATE:
+ entry->wc_flags = IB_WC_WITH_IMM;
+ entry->ex.imm_data = cqe->imm_etype_pkey_eec;
+ entry->opcode = IB_WC_RECV;
+ break;
+ case IB_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE:
+ case IB_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE:
+ entry->wc_flags = IB_WC_WITH_IMM;
+ entry->ex.imm_data = cqe->imm_etype_pkey_eec;
+ entry->opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ break;
+ default:
+ entry->wc_flags = 0;
+ entry->opcode = IB_WC_RECV;
+ break;
+ }
+ entry->slid = be16_to_cpu(cqe->rlid);
+ entry->sl = cqe->sl_ipok >> 4;
+ entry->src_qp = be32_to_cpu(cqe->rqpn) & 0xffffff;
+ entry->dlid_path_bits = cqe->g_mlpath & 0x7f;
+ entry->pkey_index = be32_to_cpu(cqe->imm_etype_pkey_eec) >> 16;
+ entry->wc_flags |= cqe->g_mlpath & 0x80 ? IB_WC_GRH : 0;
+ checksum = (be32_to_cpu(cqe->rqpn) >> 24) |
+ ((be32_to_cpu(cqe->my_ee) >> 16) & 0xff00);
+ entry->wc_flags |= (cqe->sl_ipok & 1 && checksum == 0xffff) ?
+ IB_WC_IP_CSUM_OK : 0;
+ }
+
+ entry->status = IB_WC_SUCCESS;
+
+ out:
+ if (likely(free_cqe)) {
+ set_cqe_hw(cqe);
+ ++(*freed);
+ ++cq->cons_index;
+ }
+
+ return err;
+}
+
+int mthca_poll_cq(struct ib_cq *ibcq, int num_entries,
+ struct ib_wc *entry)
+{
+ struct mthca_dev *dev = to_mdev(ibcq->device);
+ struct mthca_cq *cq = to_mcq(ibcq);
+ struct mthca_qp *qp = NULL;
+ unsigned long flags;
+ int err = 0;
+ int freed = 0;
+ int npolled;
+
+ spin_lock_irqsave(&cq->lock, flags);
+
+ npolled = 0;
+repoll:
+ while (npolled < num_entries) {
+ err = mthca_poll_one(dev, cq, &qp,
+ &freed, entry + npolled);
+ if (err)
+ break;
+ ++npolled;
+ }
+
+ if (freed) {
+ wmb();
+ update_cons_index(dev, cq, freed);
+ }
+
+ /*
+ * If a CQ resize is in progress and we discovered that the
+ * old buffer is empty, then peek in the new buffer, and if
+ * it's not empty, switch to the new buffer and continue
+ * polling there.
+ */
+ if (unlikely(err == -EAGAIN && cq->resize_buf &&
+ cq->resize_buf->state == CQ_RESIZE_READY)) {
+ /*
+ * In Tavor mode, the hardware keeps the producer
+ * index modulo the CQ size. Since we might be making
+ * the CQ bigger, we need to mask our consumer index
+ * using the size of the old CQ buffer before looking
+ * in the new CQ buffer.
+ */
+ if (!mthca_is_memfree(dev))
+ cq->cons_index &= cq->ibcq.cqe;
+
+ if (cqe_sw(get_cqe_from_buf(&cq->resize_buf->buf,
+ cq->cons_index & cq->resize_buf->cqe))) {
+ struct mthca_cq_buf tbuf;
+ int tcqe;
+
+ tbuf = cq->buf;
+ tcqe = cq->ibcq.cqe;
+ cq->buf = cq->resize_buf->buf;
+ cq->ibcq.cqe = cq->resize_buf->cqe;
+
+ cq->resize_buf->buf = tbuf;
+ cq->resize_buf->cqe = tcqe;
+ cq->resize_buf->state = CQ_RESIZE_SWAPPED;
+
+ goto repoll;
+ }
+ }
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+ return err == 0 || err == -EAGAIN ? npolled : err;
+}
+
+int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags)
+{
+ u32 dbhi = ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ?
+ MTHCA_TAVOR_CQ_DB_REQ_NOT_SOL :
+ MTHCA_TAVOR_CQ_DB_REQ_NOT) |
+ to_mcq(cq)->cqn;
+
+ mthca_write64(dbhi, 0xffffffff, to_mdev(cq->device)->kar + MTHCA_CQ_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&to_mdev(cq->device)->doorbell_lock));
+
+ return 0;
+}
+
+int mthca_arbel_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
+{
+ struct mthca_cq *cq = to_mcq(ibcq);
+ __be32 db_rec[2];
+ u32 dbhi;
+ u32 sn = cq->arm_sn & 3;
+
+ db_rec[0] = cpu_to_be32(cq->cons_index);
+ db_rec[1] = cpu_to_be32((cq->cqn << 8) | (2 << 5) | (sn << 3) |
+ ((flags & IB_CQ_SOLICITED_MASK) ==
+ IB_CQ_SOLICITED ? 1 : 2));
+
+ mthca_write_db_rec(db_rec, cq->arm_db);
+
+ /*
+ * Make sure that the doorbell record in host memory is
+ * written before ringing the doorbell via PCI MMIO.
+ */
+ wmb();
+
+ dbhi = (sn << 28) |
+ ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ?
+ MTHCA_ARBEL_CQ_DB_REQ_NOT_SOL :
+ MTHCA_ARBEL_CQ_DB_REQ_NOT) | cq->cqn;
+
+ mthca_write64(dbhi, cq->cons_index,
+ to_mdev(ibcq->device)->kar + MTHCA_CQ_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&to_mdev(ibcq->device)->doorbell_lock));
+
+ return 0;
+}
+
+int mthca_init_cq(struct mthca_dev *dev, int nent,
+ struct mthca_ucontext *ctx, u32 pdn,
+ struct mthca_cq *cq)
+{
+ struct mthca_mailbox *mailbox;
+ struct mthca_cq_context *cq_context;
+ int err = -ENOMEM;
+
+ cq->ibcq.cqe = nent - 1;
+ cq->is_kernel = !ctx;
+
+ cq->cqn = mthca_alloc(&dev->cq_table.alloc);
+ if (cq->cqn == -1)
+ return -ENOMEM;
+
+ if (mthca_is_memfree(dev)) {
+ err = mthca_table_get(dev, dev->cq_table.table, cq->cqn);
+ if (err)
+ goto err_out;
+
+ if (cq->is_kernel) {
+ cq->arm_sn = 1;
+
+ err = -ENOMEM;
+
+ cq->set_ci_db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_CQ_SET_CI,
+ cq->cqn, &cq->set_ci_db);
+ if (cq->set_ci_db_index < 0)
+ goto err_out_icm;
+
+ cq->arm_db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_CQ_ARM,
+ cq->cqn, &cq->arm_db);
+ if (cq->arm_db_index < 0)
+ goto err_out_ci;
+ }
+ }
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ goto err_out_arm;
+
+ cq_context = mailbox->buf;
+
+ if (cq->is_kernel) {
+ err = mthca_alloc_cq_buf(dev, &cq->buf, nent);
+ if (err)
+ goto err_out_mailbox;
+ }
+
+ spin_lock_init(&cq->lock);
+ cq->refcount = 1;
+ init_waitqueue_head(&cq->wait);
+ mutex_init(&cq->mutex);
+
+ memset(cq_context, 0, sizeof *cq_context);
+ cq_context->flags = cpu_to_be32(MTHCA_CQ_STATUS_OK |
+ MTHCA_CQ_STATE_DISARMED |
+ MTHCA_CQ_FLAG_TR);
+ cq_context->logsize_usrpage = cpu_to_be32((ffs(nent) - 1) << 24);
+ if (ctx)
+ cq_context->logsize_usrpage |= cpu_to_be32(ctx->uar.index);
+ else
+ cq_context->logsize_usrpage |= cpu_to_be32(dev->driver_uar.index);
+ cq_context->error_eqn = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn);
+ cq_context->comp_eqn = cpu_to_be32(dev->eq_table.eq[MTHCA_EQ_COMP].eqn);
+ cq_context->pd = cpu_to_be32(pdn);
+ cq_context->lkey = cpu_to_be32(cq->buf.mr.ibmr.lkey);
+ cq_context->cqn = cpu_to_be32(cq->cqn);
+
+ if (mthca_is_memfree(dev)) {
+ cq_context->ci_db = cpu_to_be32(cq->set_ci_db_index);
+ cq_context->state_db = cpu_to_be32(cq->arm_db_index);
+ }
+
+ err = mthca_SW2HW_CQ(dev, mailbox, cq->cqn);
+ if (err) {
+ mthca_warn(dev, "SW2HW_CQ failed (%d)\n", err);
+ goto err_out_free_mr;
+ }
+
+ spin_lock_irq(&dev->cq_table.lock);
+ if (mthca_array_set(&dev->cq_table.cq,
+ cq->cqn & (dev->limits.num_cqs - 1),
+ cq)) {
+ spin_unlock_irq(&dev->cq_table.lock);
+ goto err_out_free_mr;
+ }
+ spin_unlock_irq(&dev->cq_table.lock);
+
+ cq->cons_index = 0;
+
+ mthca_free_mailbox(dev, mailbox);
+
+ return 0;
+
+err_out_free_mr:
+ if (cq->is_kernel)
+ mthca_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);
+
+err_out_mailbox:
+ mthca_free_mailbox(dev, mailbox);
+
+err_out_arm:
+ if (cq->is_kernel && mthca_is_memfree(dev))
+ mthca_free_db(dev, MTHCA_DB_TYPE_CQ_ARM, cq->arm_db_index);
+
+err_out_ci:
+ if (cq->is_kernel && mthca_is_memfree(dev))
+ mthca_free_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, cq->set_ci_db_index);
+
+err_out_icm:
+ mthca_table_put(dev, dev->cq_table.table, cq->cqn);
+
+err_out:
+ mthca_free(&dev->cq_table.alloc, cq->cqn);
+
+ return err;
+}
+
+static inline int get_cq_refcount(struct mthca_dev *dev, struct mthca_cq *cq)
+{
+ int c;
+
+ spin_lock_irq(&dev->cq_table.lock);
+ c = cq->refcount;
+ spin_unlock_irq(&dev->cq_table.lock);
+
+ return c;
+}
+
+void mthca_free_cq(struct mthca_dev *dev,
+ struct mthca_cq *cq)
+{
+ struct mthca_mailbox *mailbox;
+ int err;
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox)) {
+ mthca_warn(dev, "No memory for mailbox to free CQ.\n");
+ return;
+ }
+
+ err = mthca_HW2SW_CQ(dev, mailbox, cq->cqn);
+ if (err)
+ mthca_warn(dev, "HW2SW_CQ failed (%d)\n", err);
+
+ if (0) {
+ __be32 *ctx = mailbox->buf;
+ int j;
+
+ printk(KERN_ERR "context for CQN %x (cons index %x, next sw %d)\n",
+ cq->cqn, cq->cons_index,
+ cq->is_kernel ? !!next_cqe_sw(cq) : 0);
+ for (j = 0; j < 16; ++j)
+ printk(KERN_ERR "[%2x] %08x\n", j * 4, be32_to_cpu(ctx[j]));
+ }
+
+ spin_lock_irq(&dev->cq_table.lock);
+ mthca_array_clear(&dev->cq_table.cq,
+ cq->cqn & (dev->limits.num_cqs - 1));
+ --cq->refcount;
+ spin_unlock_irq(&dev->cq_table.lock);
+
+ if (dev->mthca_flags & MTHCA_FLAG_MSI_X)
+ synchronize_irq(dev->eq_table.eq[MTHCA_EQ_COMP].msi_x_vector);
+ else
+ synchronize_irq(dev->pdev->irq);
+
+ wait_event(cq->wait, !get_cq_refcount(dev, cq));
+
+ if (cq->is_kernel) {
+ mthca_free_cq_buf(dev, &cq->buf, cq->ibcq.cqe);
+ if (mthca_is_memfree(dev)) {
+ mthca_free_db(dev, MTHCA_DB_TYPE_CQ_ARM, cq->arm_db_index);
+ mthca_free_db(dev, MTHCA_DB_TYPE_CQ_SET_CI, cq->set_ci_db_index);
+ }
+ }
+
+ mthca_table_put(dev, dev->cq_table.table, cq->cqn);
+ mthca_free(&dev->cq_table.alloc, cq->cqn);
+ mthca_free_mailbox(dev, mailbox);
+}
+
+int mthca_init_cq_table(struct mthca_dev *dev)
+{
+ int err;
+
+ spin_lock_init(&dev->cq_table.lock);
+
+ err = mthca_alloc_init(&dev->cq_table.alloc,
+ dev->limits.num_cqs,
+ (1 << 24) - 1,
+ dev->limits.reserved_cqs);
+ if (err)
+ return err;
+
+ err = mthca_array_init(&dev->cq_table.cq,
+ dev->limits.num_cqs);
+ if (err)
+ mthca_alloc_cleanup(&dev->cq_table.alloc);
+
+ return err;
+}
+
+void mthca_cleanup_cq_table(struct mthca_dev *dev)
+{
+ mthca_array_cleanup(&dev->cq_table.cq, dev->limits.num_cqs);
+ mthca_alloc_cleanup(&dev->cq_table.alloc);
+}
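
The CQ polling above hinges on the owner bit in each CQE: the HCA hands an entry to software by clearing the bit, cqe_sw() treats such an entry as ready, and set_cqe_hw() returns the slot to the hardware once it has been consumed while cons_index advances. The stand-alone toy below (plain user-space C, not driver code; the names and the payload field are invented) demonstrates just that ring-with-owner-bit pattern.

#include <stdint.h>
#include <stdio.h>

#define TOY_OWNER_HW	0x80	/* same idea as MTHCA_CQ_ENTRY_OWNER_HW */
#define TOY_RING_SIZE	8	/* power of two, like a real CQ */

struct toy_cqe {
	uint8_t  owner;		/* bit 7 set => owned by "hardware" */
	uint32_t payload;	/* stand-in for the real CQE contents */
};

/*
 * Consume one entry if software owns it; mirrors the shape of
 * mthca_poll_one(): check ownership, read the contents, give the
 * slot back, advance the consumer index.
 */
int toy_poll_one(struct toy_cqe *ring, uint32_t *cons_index)
{
	struct toy_cqe *cqe = &ring[*cons_index & (TOY_RING_SIZE - 1)];

	if (cqe->owner & TOY_OWNER_HW)
		return -1;			/* nothing new to report */

	printf("completion %u: payload %u\n", *cons_index, cqe->payload);
	cqe->owner = TOY_OWNER_HW;		/* like set_cqe_hw() */
	++*cons_index;				/* like ++cq->cons_index */
	return 0;
}
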
diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h
new file mode 100644
index 000000000..7e6a6d64a
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_dev.h
@@ -0,0 +1,596 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_DEV_H
+#define MTHCA_DEV_H
+
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/timer.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/semaphore.h>
+
+#include "mthca_provider.h"
+#include "mthca_doorbell.h"
+
+#define DRV_NAME "ib_mthca"
+#define PFX DRV_NAME ": "
+#define DRV_VERSION "1.0"
+#define DRV_RELDATE "April 4, 2008"
+
+enum {
+ MTHCA_FLAG_DDR_HIDDEN = 1 << 1,
+ MTHCA_FLAG_SRQ = 1 << 2,
+ MTHCA_FLAG_MSI_X = 1 << 3,
+ MTHCA_FLAG_NO_LAM = 1 << 4,
+ MTHCA_FLAG_FMR = 1 << 5,
+ MTHCA_FLAG_MEMFREE = 1 << 6,
+ MTHCA_FLAG_PCIE = 1 << 7,
+ MTHCA_FLAG_SINAI_OPT = 1 << 8
+};
+
+enum {
+ MTHCA_MAX_PORTS = 2
+};
+
+enum {
+ MTHCA_BOARD_ID_LEN = 64
+};
+
+enum {
+ MTHCA_EQ_CONTEXT_SIZE = 0x40,
+ MTHCA_CQ_CONTEXT_SIZE = 0x40,
+ MTHCA_QP_CONTEXT_SIZE = 0x200,
+ MTHCA_RDB_ENTRY_SIZE = 0x20,
+ MTHCA_AV_SIZE = 0x20,
+ MTHCA_MGM_ENTRY_SIZE = 0x100,
+
+ /* Arbel FW gives us these, but we need them for Tavor */
+ MTHCA_MPT_ENTRY_SIZE = 0x40,
+ MTHCA_MTT_SEG_SIZE = 0x40,
+
+ MTHCA_QP_PER_MGM = 4 * (MTHCA_MGM_ENTRY_SIZE / 16 - 2)
+};
+
+enum {
+ MTHCA_EQ_CMD,
+ MTHCA_EQ_ASYNC,
+ MTHCA_EQ_COMP,
+ MTHCA_NUM_EQ
+};
+
+enum {
+ MTHCA_OPCODE_NOP = 0x00,
+ MTHCA_OPCODE_RDMA_WRITE = 0x08,
+ MTHCA_OPCODE_RDMA_WRITE_IMM = 0x09,
+ MTHCA_OPCODE_SEND = 0x0a,
+ MTHCA_OPCODE_SEND_IMM = 0x0b,
+ MTHCA_OPCODE_RDMA_READ = 0x10,
+ MTHCA_OPCODE_ATOMIC_CS = 0x11,
+ MTHCA_OPCODE_ATOMIC_FA = 0x12,
+ MTHCA_OPCODE_BIND_MW = 0x18,
+ MTHCA_OPCODE_INVALID = 0xff
+};
+
+enum {
+ MTHCA_CMD_USE_EVENTS = 1 << 0,
+ MTHCA_CMD_POST_DOORBELLS = 1 << 1
+};
+
+enum {
+ MTHCA_CMD_NUM_DBELL_DWORDS = 8
+};
+
+struct mthca_cmd {
+ struct pci_pool *pool;
+ struct mutex hcr_mutex;
+ struct semaphore poll_sem;
+ struct semaphore event_sem;
+ int max_cmds;
+ spinlock_t context_lock;
+ int free_head;
+ struct mthca_cmd_context *context;
+ u16 token_mask;
+ u32 flags;
+ void __iomem *dbell_map;
+ u16 dbell_offsets[MTHCA_CMD_NUM_DBELL_DWORDS];
+};
+
+struct mthca_limits {
+ int num_ports;
+ int vl_cap;
+ int mtu_cap;
+ int gid_table_len;
+ int pkey_table_len;
+ int local_ca_ack_delay;
+ int num_uars;
+ int max_sg;
+ int num_qps;
+ int max_wqes;
+ int max_desc_sz;
+ int max_qp_init_rdma;
+ int reserved_qps;
+ int num_srqs;
+ int max_srq_wqes;
+ int max_srq_sge;
+ int reserved_srqs;
+ int num_eecs;
+ int reserved_eecs;
+ int num_cqs;
+ int max_cqes;
+ int reserved_cqs;
+ int num_eqs;
+ int reserved_eqs;
+ int num_mpts;
+ int num_mtt_segs;
+ int mtt_seg_size;
+ int fmr_reserved_mtts;
+ int reserved_mtts;
+ int reserved_mrws;
+ int reserved_uars;
+ int num_mgms;
+ int num_amgms;
+ int reserved_mcgs;
+ int num_pds;
+ int reserved_pds;
+ u32 page_size_cap;
+ u32 flags;
+ u16 stat_rate_support;
+ u8 port_width_cap;
+};
+
+struct mthca_alloc {
+ u32 last;
+ u32 top;
+ u32 max;
+ u32 mask;
+ spinlock_t lock;
+ unsigned long *table;
+};
+
+struct mthca_array {
+ struct {
+ void **page;
+ int used;
+ } *page_list;
+};
+
+struct mthca_uar_table {
+ struct mthca_alloc alloc;
+ u64 uarc_base;
+ int uarc_size;
+};
+
+struct mthca_pd_table {
+ struct mthca_alloc alloc;
+};
+
+struct mthca_buddy {
+ unsigned long **bits;
+ int *num_free;
+ int max_order;
+ spinlock_t lock;
+};
+
+struct mthca_mr_table {
+ struct mthca_alloc mpt_alloc;
+ struct mthca_buddy mtt_buddy;
+ struct mthca_buddy *fmr_mtt_buddy;
+ u64 mtt_base;
+ u64 mpt_base;
+ struct mthca_icm_table *mtt_table;
+ struct mthca_icm_table *mpt_table;
+ struct {
+ void __iomem *mpt_base;
+ void __iomem *mtt_base;
+ struct mthca_buddy mtt_buddy;
+ } tavor_fmr;
+};
+
+struct mthca_eq_table {
+ struct mthca_alloc alloc;
+ void __iomem *clr_int;
+ u32 clr_mask;
+ u32 arm_mask;
+ struct mthca_eq eq[MTHCA_NUM_EQ];
+ u64 icm_virt;
+ struct page *icm_page;
+ dma_addr_t icm_dma;
+ int have_irq;
+ u8 inta_pin;
+};
+
+struct mthca_cq_table {
+ struct mthca_alloc alloc;
+ spinlock_t lock;
+ struct mthca_array cq;
+ struct mthca_icm_table *table;
+};
+
+struct mthca_srq_table {
+ struct mthca_alloc alloc;
+ spinlock_t lock;
+ struct mthca_array srq;
+ struct mthca_icm_table *table;
+};
+
+struct mthca_qp_table {
+ struct mthca_alloc alloc;
+ u32 rdb_base;
+ int rdb_shift;
+ int sqp_start;
+ spinlock_t lock;
+ struct mthca_array qp;
+ struct mthca_icm_table *qp_table;
+ struct mthca_icm_table *eqp_table;
+ struct mthca_icm_table *rdb_table;
+};
+
+struct mthca_av_table {
+ struct pci_pool *pool;
+ int num_ddr_avs;
+ u64 ddr_av_base;
+ void __iomem *av_map;
+ struct mthca_alloc alloc;
+};
+
+struct mthca_mcg_table {
+ struct mutex mutex;
+ struct mthca_alloc alloc;
+ struct mthca_icm_table *table;
+};
+
+struct mthca_catas_err {
+ u64 addr;
+ u32 __iomem *map;
+ u32 size;
+ struct timer_list timer;
+ struct list_head list;
+};
+
+extern struct mutex mthca_device_mutex;
+
+struct mthca_dev {
+ struct ib_device ib_dev;
+ struct pci_dev *pdev;
+
+ int hca_type;
+ unsigned long mthca_flags;
+ unsigned long device_cap_flags;
+
+ u32 rev_id;
+ char board_id[MTHCA_BOARD_ID_LEN];
+
+ /* firmware info */
+ u64 fw_ver;
+ union {
+ struct {
+ u64 fw_start;
+ u64 fw_end;
+ } tavor;
+ struct {
+ u64 clr_int_base;
+ u64 eq_arm_base;
+ u64 eq_set_ci_base;
+ struct mthca_icm *fw_icm;
+ struct mthca_icm *aux_icm;
+ u16 fw_pages;
+ } arbel;
+ } fw;
+
+ u64 ddr_start;
+ u64 ddr_end;
+
+ MTHCA_DECLARE_DOORBELL_LOCK(doorbell_lock)
+ struct mutex cap_mask_mutex;
+
+ void __iomem *hcr;
+ void __iomem *kar;
+ void __iomem *clr_base;
+ union {
+ struct {
+ void __iomem *ecr_base;
+ } tavor;
+ struct {
+ void __iomem *eq_arm;
+ void __iomem *eq_set_ci_base;
+ } arbel;
+ } eq_regs;
+
+ struct mthca_cmd cmd;
+ struct mthca_limits limits;
+
+ struct mthca_uar_table uar_table;
+ struct mthca_pd_table pd_table;
+ struct mthca_mr_table mr_table;
+ struct mthca_eq_table eq_table;
+ struct mthca_cq_table cq_table;
+ struct mthca_srq_table srq_table;
+ struct mthca_qp_table qp_table;
+ struct mthca_av_table av_table;
+ struct mthca_mcg_table mcg_table;
+
+ struct mthca_catas_err catas_err;
+
+ struct mthca_uar driver_uar;
+ struct mthca_db_table *db_tab;
+ struct mthca_pd driver_pd;
+ struct mthca_mr driver_mr;
+
+ struct ib_mad_agent *send_agent[MTHCA_MAX_PORTS][2];
+ struct ib_ah *sm_ah[MTHCA_MAX_PORTS];
+ spinlock_t sm_lock;
+ u8 rate[MTHCA_MAX_PORTS];
+ bool active;
+};
+
+#ifdef CONFIG_INFINIBAND_MTHCA_DEBUG
+extern int mthca_debug_level;
+
+#define mthca_dbg(mdev, format, arg...) \
+ do { \
+ if (mthca_debug_level) \
+ dev_printk(KERN_DEBUG, &mdev->pdev->dev, format, ## arg); \
+ } while (0)
+
+#else /* CONFIG_INFINIBAND_MTHCA_DEBUG */
+
+#define mthca_dbg(mdev, format, arg...) do { (void) mdev; } while (0)
+
+#endif /* CONFIG_INFINIBAND_MTHCA_DEBUG */
+
+#define mthca_err(mdev, format, arg...) \
+ dev_err(&mdev->pdev->dev, format, ## arg)
+#define mthca_info(mdev, format, arg...) \
+ dev_info(&mdev->pdev->dev, format, ## arg)
+#define mthca_warn(mdev, format, arg...) \
+ dev_warn(&mdev->pdev->dev, format, ## arg)
+
+extern void __buggy_use_of_MTHCA_GET(void);
+extern void __buggy_use_of_MTHCA_PUT(void);
+
+#define MTHCA_GET(dest, source, offset) \
+ do { \
+ void *__p = (char *) (source) + (offset); \
+ switch (sizeof (dest)) { \
+ case 1: (dest) = *(u8 *) __p; break; \
+ case 2: (dest) = be16_to_cpup(__p); break; \
+ case 4: (dest) = be32_to_cpup(__p); break; \
+ case 8: (dest) = be64_to_cpup(__p); break; \
+ default: __buggy_use_of_MTHCA_GET(); \
+ } \
+ } while (0)
+
+#define MTHCA_PUT(dest, source, offset) \
+ do { \
+ void *__d = ((char *) (dest) + (offset)); \
+ switch (sizeof(source)) { \
+ case 1: *(u8 *) __d = (source); break; \
+ case 2: *(__be16 *) __d = cpu_to_be16(source); break; \
+ case 4: *(__be32 *) __d = cpu_to_be32(source); break; \
+ case 8: *(__be64 *) __d = cpu_to_be64(source); break; \
+ default: __buggy_use_of_MTHCA_PUT(); \
+ } \
+ } while (0)
+
+int mthca_reset(struct mthca_dev *mdev);
+
+u32 mthca_alloc(struct mthca_alloc *alloc);
+void mthca_free(struct mthca_alloc *alloc, u32 obj);
+int mthca_alloc_init(struct mthca_alloc *alloc, u32 num, u32 mask,
+ u32 reserved);
+void mthca_alloc_cleanup(struct mthca_alloc *alloc);
+void *mthca_array_get(struct mthca_array *array, int index);
+int mthca_array_set(struct mthca_array *array, int index, void *value);
+void mthca_array_clear(struct mthca_array *array, int index);
+int mthca_array_init(struct mthca_array *array, int nent);
+void mthca_array_cleanup(struct mthca_array *array, int nent);
+int mthca_buf_alloc(struct mthca_dev *dev, int size, int max_direct,
+ union mthca_buf *buf, int *is_direct, struct mthca_pd *pd,
+ int hca_write, struct mthca_mr *mr);
+void mthca_buf_free(struct mthca_dev *dev, int size, union mthca_buf *buf,
+ int is_direct, struct mthca_mr *mr);
+
+int mthca_init_uar_table(struct mthca_dev *dev);
+int mthca_init_pd_table(struct mthca_dev *dev);
+int mthca_init_mr_table(struct mthca_dev *dev);
+int mthca_init_eq_table(struct mthca_dev *dev);
+int mthca_init_cq_table(struct mthca_dev *dev);
+int mthca_init_srq_table(struct mthca_dev *dev);
+int mthca_init_qp_table(struct mthca_dev *dev);
+int mthca_init_av_table(struct mthca_dev *dev);
+int mthca_init_mcg_table(struct mthca_dev *dev);
+
+void mthca_cleanup_uar_table(struct mthca_dev *dev);
+void mthca_cleanup_pd_table(struct mthca_dev *dev);
+void mthca_cleanup_mr_table(struct mthca_dev *dev);
+void mthca_cleanup_eq_table(struct mthca_dev *dev);
+void mthca_cleanup_cq_table(struct mthca_dev *dev);
+void mthca_cleanup_srq_table(struct mthca_dev *dev);
+void mthca_cleanup_qp_table(struct mthca_dev *dev);
+void mthca_cleanup_av_table(struct mthca_dev *dev);
+void mthca_cleanup_mcg_table(struct mthca_dev *dev);
+
+int mthca_register_device(struct mthca_dev *dev);
+void mthca_unregister_device(struct mthca_dev *dev);
+
+void mthca_start_catas_poll(struct mthca_dev *dev);
+void mthca_stop_catas_poll(struct mthca_dev *dev);
+int __mthca_restart_one(struct pci_dev *pdev);
+int mthca_catas_init(void);
+void mthca_catas_cleanup(void);
+
+int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar);
+void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar);
+
+int mthca_pd_alloc(struct mthca_dev *dev, int privileged, struct mthca_pd *pd);
+void mthca_pd_free(struct mthca_dev *dev, struct mthca_pd *pd);
+
+int mthca_write_mtt_size(struct mthca_dev *dev);
+
+struct mthca_mtt *mthca_alloc_mtt(struct mthca_dev *dev, int size);
+void mthca_free_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt);
+int mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt,
+ int start_index, u64 *buffer_list, int list_len);
+int mthca_mr_alloc(struct mthca_dev *dev, u32 pd, int buffer_size_shift,
+ u64 iova, u64 total_size, u32 access, struct mthca_mr *mr);
+int mthca_mr_alloc_notrans(struct mthca_dev *dev, u32 pd,
+ u32 access, struct mthca_mr *mr);
+int mthca_mr_alloc_phys(struct mthca_dev *dev, u32 pd,
+ u64 *buffer_list, int buffer_size_shift,
+ int list_len, u64 iova, u64 total_size,
+ u32 access, struct mthca_mr *mr);
+void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr);
+
+int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd,
+ u32 access, struct mthca_fmr *fmr);
+int mthca_tavor_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+ int list_len, u64 iova);
+void mthca_tavor_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr);
+int mthca_arbel_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+ int list_len, u64 iova);
+void mthca_arbel_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr);
+int mthca_free_fmr(struct mthca_dev *dev, struct mthca_fmr *fmr);
+
+int mthca_map_eq_icm(struct mthca_dev *dev, u64 icm_virt);
+void mthca_unmap_eq_icm(struct mthca_dev *dev);
+
+int mthca_poll_cq(struct ib_cq *ibcq, int num_entries,
+ struct ib_wc *entry);
+int mthca_tavor_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
+int mthca_arbel_arm_cq(struct ib_cq *cq, enum ib_cq_notify_flags flags);
+int mthca_init_cq(struct mthca_dev *dev, int nent,
+ struct mthca_ucontext *ctx, u32 pdn,
+ struct mthca_cq *cq);
+void mthca_free_cq(struct mthca_dev *dev,
+ struct mthca_cq *cq);
+void mthca_cq_completion(struct mthca_dev *dev, u32 cqn);
+void mthca_cq_event(struct mthca_dev *dev, u32 cqn,
+ enum ib_event_type event_type);
+void mthca_cq_clean(struct mthca_dev *dev, struct mthca_cq *cq, u32 qpn,
+ struct mthca_srq *srq);
+void mthca_cq_resize_copy_cqes(struct mthca_cq *cq);
+int mthca_alloc_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int nent);
+void mthca_free_cq_buf(struct mthca_dev *dev, struct mthca_cq_buf *buf, int cqe);
+
+int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd,
+ struct ib_srq_attr *attr, struct mthca_srq *srq);
+void mthca_free_srq(struct mthca_dev *dev, struct mthca_srq *srq);
+int mthca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask, struct ib_udata *udata);
+int mthca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr);
+int mthca_max_srq_sge(struct mthca_dev *dev);
+void mthca_srq_event(struct mthca_dev *dev, u32 srqn,
+ enum ib_event_type event_type);
+void mthca_free_srq_wqe(struct mthca_srq *srq, u32 wqe_addr);
+int mthca_tavor_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr);
+int mthca_arbel_post_srq_recv(struct ib_srq *srq, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr);
+
+void mthca_qp_event(struct mthca_dev *dev, u32 qpn,
+ enum ib_event_type event_type);
+int mthca_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr);
+int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
+ struct ib_udata *udata);
+int mthca_tavor_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr);
+int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr);
+int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr);
+int mthca_arbel_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr);
+void mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send,
+ int index, int *dbd, __be32 *new_wqe);
+int mthca_alloc_qp(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct mthca_cq *send_cq,
+ struct mthca_cq *recv_cq,
+ enum ib_qp_type type,
+ enum ib_sig_type send_policy,
+ struct ib_qp_cap *cap,
+ struct mthca_qp *qp);
+int mthca_alloc_sqp(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct mthca_cq *send_cq,
+ struct mthca_cq *recv_cq,
+ enum ib_sig_type send_policy,
+ struct ib_qp_cap *cap,
+ int qpn,
+ int port,
+ struct mthca_sqp *sqp);
+void mthca_free_qp(struct mthca_dev *dev, struct mthca_qp *qp);
+int mthca_create_ah(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct ib_ah_attr *ah_attr,
+ struct mthca_ah *ah);
+int mthca_destroy_ah(struct mthca_dev *dev, struct mthca_ah *ah);
+int mthca_read_ah(struct mthca_dev *dev, struct mthca_ah *ah,
+ struct ib_ud_header *header);
+int mthca_ah_query(struct ib_ah *ibah, struct ib_ah_attr *attr);
+int mthca_ah_grh_present(struct mthca_ah *ah);
+u8 mthca_get_rate(struct mthca_dev *dev, int static_rate, u8 port);
+enum ib_rate mthca_rate_to_ib(struct mthca_dev *dev, u8 mthca_rate, u8 port);
+
+int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int mthca_process_mad(struct ib_device *ibdev,
+ int mad_flags,
+ u8 port_num,
+ struct ib_wc *in_wc,
+ struct ib_grh *in_grh,
+ struct ib_mad *in_mad,
+ struct ib_mad *out_mad);
+int mthca_create_agents(struct mthca_dev *dev);
+void mthca_free_agents(struct mthca_dev *dev);
+
+static inline struct mthca_dev *to_mdev(struct ib_device *ibdev)
+{
+ return container_of(ibdev, struct mthca_dev, ib_dev);
+}
+
+static inline int mthca_is_memfree(struct mthca_dev *dev)
+{
+ return dev->mthca_flags & MTHCA_FLAG_MEMFREE;
+}
+
+#endif /* MTHCA_DEV_H */
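
The MTHCA_GET() macro above dispatches on sizeof(dest), so a single macro pulls 8-, 16-, 32- or 64-bit big-endian fields out of a firmware mailbox. A hypothetical use is sketched below; the offsets, field names and helper are invented for illustration (they are not the driver's real mailbox layout), and the sketch assumes mthca_cmd.h is included for struct mthca_mailbox.

/* Hypothetical example: offsets and names invented for illustration only. */
#define EXAMPLE_VENDOR_ID_OFFSET	0x00	/* 32-bit big-endian field */
#define EXAMPLE_BOARD_REV_OFFSET	0x04	/* 16-bit big-endian field */

static void example_parse_mailbox(struct mthca_dev *dev,
				  struct mthca_mailbox *mailbox)
{
	void *outbox = mailbox->buf;	/* response data written by the HCA */
	u32 vendor_id;
	u16 board_rev;

	MTHCA_GET(vendor_id, outbox, EXAMPLE_VENDOR_ID_OFFSET);	/* expands to be32_to_cpup() */
	MTHCA_GET(board_rev, outbox, EXAMPLE_BOARD_REV_OFFSET);	/* expands to be16_to_cpup() */

	mthca_info(dev, "vendor id %06x, board rev %04x\n",
		   vendor_id, board_rev);
}
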
diff --git a/drivers/infiniband/hw/mthca/mthca_doorbell.h b/drivers/infiniband/hw/mthca/mthca_doorbell.h
new file mode 100644
index 000000000..14f51ef97
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_doorbell.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/types.h>
+
+#define MTHCA_RD_DOORBELL 0x00
+#define MTHCA_SEND_DOORBELL 0x10
+#define MTHCA_RECEIVE_DOORBELL 0x18
+#define MTHCA_CQ_DOORBELL 0x20
+#define MTHCA_EQ_DOORBELL 0x28
+
+#if BITS_PER_LONG == 64
+/*
+ * Assume that we can just write a 64-bit doorbell atomically. s390
+ * actually doesn't have writeq() but S/390 systems don't even have
+ * PCI so we won't worry about it.
+ */
+
+#define MTHCA_DECLARE_DOORBELL_LOCK(name)
+#define MTHCA_INIT_DOORBELL_LOCK(ptr) do { } while (0)
+#define MTHCA_GET_DOORBELL_LOCK(ptr) (NULL)
+
+static inline void mthca_write64_raw(__be64 val, void __iomem *dest)
+{
+ __raw_writeq((__force u64) val, dest);
+}
+
+static inline void mthca_write64(u32 hi, u32 lo, void __iomem *dest,
+ spinlock_t *doorbell_lock)
+{
+ __raw_writeq((__force u64) cpu_to_be64((u64) hi << 32 | lo), dest);
+}
+
+static inline void mthca_write_db_rec(__be32 val[2], __be32 *db)
+{
+ *(u64 *) db = *(u64 *) val;
+}
+
+#else
+
+/*
+ * Just fall back to a spinlock to protect the doorbell if
+ * BITS_PER_LONG is 32 -- there's no portable way to do atomic 64-bit
+ * MMIO writes.
+ */
+
+#define MTHCA_DECLARE_DOORBELL_LOCK(name) spinlock_t name;
+#define MTHCA_INIT_DOORBELL_LOCK(ptr) spin_lock_init(ptr)
+#define MTHCA_GET_DOORBELL_LOCK(ptr) (ptr)
+
+static inline void mthca_write64_raw(__be64 val, void __iomem *dest)
+{
+ __raw_writel(((__force u32 *) &val)[0], dest);
+ __raw_writel(((__force u32 *) &val)[1], dest + 4);
+}
+
+static inline void mthca_write64(u32 hi, u32 lo, void __iomem *dest,
+ spinlock_t *doorbell_lock)
+{
+ unsigned long flags;
+
+ hi = (__force u32) cpu_to_be32(hi);
+ lo = (__force u32) cpu_to_be32(lo);
+
+ spin_lock_irqsave(doorbell_lock, flags);
+ __raw_writel(hi, dest);
+ __raw_writel(lo, dest + 4);
+ spin_unlock_irqrestore(doorbell_lock, flags);
+}
+
+static inline void mthca_write_db_rec(__be32 val[2], __be32 *db)
+{
+ db[0] = val[0];
+ wmb();
+ db[1] = val[1];
+}
+
+#endif
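
Both mthca_write64() variants above must land the same eight bytes on the device: the 64-bit path writes cpu_to_be64((u64) hi << 32 | lo) in a single access, while the 32-bit path writes be32(hi) at dest and be32(lo) at dest + 4 under a spinlock, because two separate MMIO writes cannot be made atomic. The small stand-alone program below (ordinary user-space C with htonl() standing in for cpu_to_be32(); not driver code) checks that the two byte layouts agree.

#include <arpa/inet.h>	/* htonl(), standing in for cpu_to_be32() */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	uint32_t hi = 0x12345678, lo = 0x9abcdef0;
	uint64_t v = (uint64_t) hi << 32 | lo;
	uint32_t behi = htonl(hi), belo = htonl(lo);
	uint8_t path64[8], path32[8];
	int i;

	for (i = 0; i < 8; ++i)			/* big-endian byte order of the 64-bit value */
		path64[i] = v >> (56 - 8 * i);

	memcpy(path32, &behi, 4);		/* first 32-bit write: dest */
	memcpy(path32 + 4, &belo, 4);		/* second 32-bit write: dest + 4 */

	printf("doorbell byte layouts %s\n",
	       memcmp(path64, path32, 8) ? "differ" : "match");
	return 0;
}
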
diff --git a/drivers/infiniband/hw/mthca/mthca_eq.c b/drivers/infiniband/hw/mthca/mthca_eq.c
new file mode 100644
index 000000000..690201738
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_eq.c
@@ -0,0 +1,905 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/slab.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_config_reg.h"
+
+enum {
+ MTHCA_NUM_ASYNC_EQE = 0x80,
+ MTHCA_NUM_CMD_EQE = 0x80,
+ MTHCA_NUM_SPARE_EQE = 0x80,
+ MTHCA_EQ_ENTRY_SIZE = 0x20
+};
+
+/*
+ * Must be packed because start is 64 bits but only aligned to 32 bits.
+ */
+struct mthca_eq_context {
+ __be32 flags;
+ __be64 start;
+ __be32 logsize_usrpage;
+ __be32 tavor_pd; /* reserved for Arbel */
+ u8 reserved1[3];
+ u8 intr;
+ __be32 arbel_pd; /* lost_count for Tavor */
+ __be32 lkey;
+ u32 reserved2[2];
+ __be32 consumer_index;
+ __be32 producer_index;
+ u32 reserved3[4];
+} __attribute__((packed));
+
+#define MTHCA_EQ_STATUS_OK ( 0 << 28)
+#define MTHCA_EQ_STATUS_OVERFLOW ( 9 << 28)
+#define MTHCA_EQ_STATUS_WRITE_FAIL (10 << 28)
+#define MTHCA_EQ_OWNER_SW ( 0 << 24)
+#define MTHCA_EQ_OWNER_HW ( 1 << 24)
+#define MTHCA_EQ_FLAG_TR ( 1 << 18)
+#define MTHCA_EQ_FLAG_OI ( 1 << 17)
+#define MTHCA_EQ_STATE_ARMED ( 1 << 8)
+#define MTHCA_EQ_STATE_FIRED ( 2 << 8)
+#define MTHCA_EQ_STATE_ALWAYS_ARMED ( 3 << 8)
+#define MTHCA_EQ_STATE_ARBEL ( 8 << 8)
+
+enum {
+ MTHCA_EVENT_TYPE_COMP = 0x00,
+ MTHCA_EVENT_TYPE_PATH_MIG = 0x01,
+ MTHCA_EVENT_TYPE_COMM_EST = 0x02,
+ MTHCA_EVENT_TYPE_SQ_DRAINED = 0x03,
+ MTHCA_EVENT_TYPE_SRQ_QP_LAST_WQE = 0x13,
+ MTHCA_EVENT_TYPE_SRQ_LIMIT = 0x14,
+ MTHCA_EVENT_TYPE_CQ_ERROR = 0x04,
+ MTHCA_EVENT_TYPE_WQ_CATAS_ERROR = 0x05,
+ MTHCA_EVENT_TYPE_EEC_CATAS_ERROR = 0x06,
+ MTHCA_EVENT_TYPE_PATH_MIG_FAILED = 0x07,
+ MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR = 0x10,
+ MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR = 0x11,
+ MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR = 0x12,
+ MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR = 0x08,
+ MTHCA_EVENT_TYPE_PORT_CHANGE = 0x09,
+ MTHCA_EVENT_TYPE_EQ_OVERFLOW = 0x0f,
+ MTHCA_EVENT_TYPE_ECC_DETECT = 0x0e,
+ MTHCA_EVENT_TYPE_CMD = 0x0a
+};
+
+#define MTHCA_ASYNC_EVENT_MASK ((1ULL << MTHCA_EVENT_TYPE_PATH_MIG) | \
+ (1ULL << MTHCA_EVENT_TYPE_COMM_EST) | \
+ (1ULL << MTHCA_EVENT_TYPE_SQ_DRAINED) | \
+ (1ULL << MTHCA_EVENT_TYPE_CQ_ERROR) | \
+ (1ULL << MTHCA_EVENT_TYPE_WQ_CATAS_ERROR) | \
+ (1ULL << MTHCA_EVENT_TYPE_EEC_CATAS_ERROR) | \
+ (1ULL << MTHCA_EVENT_TYPE_PATH_MIG_FAILED) | \
+ (1ULL << MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | \
+ (1ULL << MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR) | \
+ (1ULL << MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR) | \
+ (1ULL << MTHCA_EVENT_TYPE_PORT_CHANGE) | \
+ (1ULL << MTHCA_EVENT_TYPE_ECC_DETECT))
+#define MTHCA_SRQ_EVENT_MASK ((1ULL << MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR) | \
+ (1ULL << MTHCA_EVENT_TYPE_SRQ_QP_LAST_WQE) | \
+ (1ULL << MTHCA_EVENT_TYPE_SRQ_LIMIT))
+#define MTHCA_CMD_EVENT_MASK (1ULL << MTHCA_EVENT_TYPE_CMD)
+
+#define MTHCA_EQ_DB_INC_CI (1 << 24)
+#define MTHCA_EQ_DB_REQ_NOT (2 << 24)
+#define MTHCA_EQ_DB_DISARM_CQ (3 << 24)
+#define MTHCA_EQ_DB_SET_CI (4 << 24)
+#define MTHCA_EQ_DB_ALWAYS_ARM (5 << 24)
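+
+/*
+ * EQ doorbells are written as two 32-bit words: the high word carries one
+ * of the commands above in bits 31:24 and the EQ number in the low 24 bits,
+ * while the low word carries the command's parameter (consumer index, CQ
+ * number or zero), as the mthca_write64() calls below show.
+ */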
+
+struct mthca_eqe {
+ u8 reserved1;
+ u8 type;
+ u8 reserved2;
+ u8 subtype;
+ union {
+ u32 raw[6];
+ struct {
+ __be32 cqn;
+ } __attribute__((packed)) comp;
+ struct {
+ u16 reserved1;
+ __be16 token;
+ u32 reserved2;
+ u8 reserved3[3];
+ u8 status;
+ __be64 out_param;
+ } __attribute__((packed)) cmd;
+ struct {
+ __be32 qpn;
+ } __attribute__((packed)) qp;
+ struct {
+ __be32 srqn;
+ } __attribute__((packed)) srq;
+ struct {
+ __be32 cqn;
+ u32 reserved1;
+ u8 reserved2[3];
+ u8 syndrome;
+ } __attribute__((packed)) cq_err;
+ struct {
+ u32 reserved1[2];
+ __be32 port;
+ } __attribute__((packed)) port_change;
+ } event;
+ u8 reserved3[3];
+ u8 owner;
+} __attribute__((packed));
+
+#define MTHCA_EQ_ENTRY_OWNER_SW (0 << 7)
+#define MTHCA_EQ_ENTRY_OWNER_HW (1 << 7)
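+
+/*
+ * The driver hands an entry back to the HCA by setting the owner bit in
+ * set_eqe_hw(); the HCA clears the bit again when it writes a new event,
+ * so next_eqe_sw() reports no entry while the bit is still set.
+ */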
+
+static inline u64 async_mask(struct mthca_dev *dev)
+{
+ return dev->mthca_flags & MTHCA_FLAG_SRQ ?
+ MTHCA_ASYNC_EVENT_MASK | MTHCA_SRQ_EVENT_MASK :
+ MTHCA_ASYNC_EVENT_MASK;
+}
+
+static inline void tavor_set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci)
+{
+ /*
+ * This barrier makes sure that all updates to ownership bits
+ * done by set_eqe_hw() hit memory before the consumer index
+ * is updated. set_eq_ci() allows the HCA to possibly write
+ * more EQ entries, and we want to avoid the exceedingly
+ * unlikely possibility of the HCA writing an entry and then
+ * having set_eqe_hw() overwrite the owner field.
+ */
+ wmb();
+ mthca_write64(MTHCA_EQ_DB_SET_CI | eq->eqn, ci & (eq->nent - 1),
+ dev->kar + MTHCA_EQ_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+}
+
+static inline void arbel_set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci)
+{
+ /* See comment in tavor_set_eq_ci() above. */
+ wmb();
+ __raw_writel((__force u32) cpu_to_be32(ci),
+ dev->eq_regs.arbel.eq_set_ci_base + eq->eqn * 8);
+ /* We still want ordering, just not swabbing, so add a barrier */
+ mb();
+}
+
+static inline void set_eq_ci(struct mthca_dev *dev, struct mthca_eq *eq, u32 ci)
+{
+ if (mthca_is_memfree(dev))
+ arbel_set_eq_ci(dev, eq, ci);
+ else
+ tavor_set_eq_ci(dev, eq, ci);
+}
+
+static inline void tavor_eq_req_not(struct mthca_dev *dev, int eqn)
+{
+ mthca_write64(MTHCA_EQ_DB_REQ_NOT | eqn, 0,
+ dev->kar + MTHCA_EQ_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+}
+
+static inline void arbel_eq_req_not(struct mthca_dev *dev, u32 eqn_mask)
+{
+ writel(eqn_mask, dev->eq_regs.arbel.eq_arm);
+}
+
+static inline void disarm_cq(struct mthca_dev *dev, int eqn, int cqn)
+{
+ if (!mthca_is_memfree(dev)) {
+ mthca_write64(MTHCA_EQ_DB_DISARM_CQ | eqn, cqn,
+ dev->kar + MTHCA_EQ_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+ }
+}
+
+static inline struct mthca_eqe *get_eqe(struct mthca_eq *eq, u32 entry)
+{
+ unsigned long off = (entry & (eq->nent - 1)) * MTHCA_EQ_ENTRY_SIZE;
+ return eq->page_list[off / PAGE_SIZE].buf + off % PAGE_SIZE;
+}
+
+static inline struct mthca_eqe *next_eqe_sw(struct mthca_eq *eq)
+{
+ struct mthca_eqe *eqe;
+ eqe = get_eqe(eq, eq->cons_index);
+ return (MTHCA_EQ_ENTRY_OWNER_HW & eqe->owner) ? NULL : eqe;
+}
+
+static inline void set_eqe_hw(struct mthca_eqe *eqe)
+{
+ eqe->owner = MTHCA_EQ_ENTRY_OWNER_HW;
+}
+
+static void port_change(struct mthca_dev *dev, int port, int active)
+{
+ struct ib_event record;
+
+ mthca_dbg(dev, "Port change to %s for port %d\n",
+ active ? "active" : "down", port);
+
+ record.device = &dev->ib_dev;
+ record.event = active ? IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
+ record.element.port_num = port;
+
+ ib_dispatch_event(&record);
+}
+
+static int mthca_eq_int(struct mthca_dev *dev, struct mthca_eq *eq)
+{
+ struct mthca_eqe *eqe;
+ int disarm_cqn;
+ int eqes_found = 0;
+ int set_ci = 0;
+
+ while ((eqe = next_eqe_sw(eq))) {
+ /*
+ * Make sure we read EQ entry contents after we've
+ * checked the ownership bit.
+ */
+ rmb();
+
+ switch (eqe->type) {
+ case MTHCA_EVENT_TYPE_COMP:
+ disarm_cqn = be32_to_cpu(eqe->event.comp.cqn) & 0xffffff;
+ disarm_cq(dev, eq->eqn, disarm_cqn);
+ mthca_cq_completion(dev, disarm_cqn);
+ break;
+
+ case MTHCA_EVENT_TYPE_PATH_MIG:
+ mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+ IB_EVENT_PATH_MIG);
+ break;
+
+ case MTHCA_EVENT_TYPE_COMM_EST:
+ mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+ IB_EVENT_COMM_EST);
+ break;
+
+ case MTHCA_EVENT_TYPE_SQ_DRAINED:
+ mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+ IB_EVENT_SQ_DRAINED);
+ break;
+
+ case MTHCA_EVENT_TYPE_SRQ_QP_LAST_WQE:
+ mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+ IB_EVENT_QP_LAST_WQE_REACHED);
+ break;
+
+ case MTHCA_EVENT_TYPE_SRQ_LIMIT:
+ mthca_srq_event(dev, be32_to_cpu(eqe->event.srq.srqn) & 0xffffff,
+ IB_EVENT_SRQ_LIMIT_REACHED);
+ break;
+
+ case MTHCA_EVENT_TYPE_WQ_CATAS_ERROR:
+ mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+ IB_EVENT_QP_FATAL);
+ break;
+
+ case MTHCA_EVENT_TYPE_PATH_MIG_FAILED:
+ mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+ IB_EVENT_PATH_MIG_ERR);
+ break;
+
+ case MTHCA_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
+ mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+ IB_EVENT_QP_REQ_ERR);
+ break;
+
+ case MTHCA_EVENT_TYPE_WQ_ACCESS_ERROR:
+ mthca_qp_event(dev, be32_to_cpu(eqe->event.qp.qpn) & 0xffffff,
+ IB_EVENT_QP_ACCESS_ERR);
+ break;
+
+ case MTHCA_EVENT_TYPE_CMD:
+ mthca_cmd_event(dev,
+ be16_to_cpu(eqe->event.cmd.token),
+ eqe->event.cmd.status,
+ be64_to_cpu(eqe->event.cmd.out_param));
+ break;
+
+ case MTHCA_EVENT_TYPE_PORT_CHANGE:
+ port_change(dev,
+ (be32_to_cpu(eqe->event.port_change.port) >> 28) & 3,
+ eqe->subtype == 0x4);
+ break;
+
+ case MTHCA_EVENT_TYPE_CQ_ERROR:
+ mthca_warn(dev, "CQ %s on CQN %06x\n",
+ eqe->event.cq_err.syndrome == 1 ?
+ "overrun" : "access violation",
+ be32_to_cpu(eqe->event.cq_err.cqn) & 0xffffff);
+ mthca_cq_event(dev, be32_to_cpu(eqe->event.cq_err.cqn),
+ IB_EVENT_CQ_ERR);
+ break;
+
+ case MTHCA_EVENT_TYPE_EQ_OVERFLOW:
+ mthca_warn(dev, "EQ overrun on EQN %d\n", eq->eqn);
+ break;
+
+ case MTHCA_EVENT_TYPE_EEC_CATAS_ERROR:
+ case MTHCA_EVENT_TYPE_SRQ_CATAS_ERROR:
+ case MTHCA_EVENT_TYPE_LOCAL_CATAS_ERROR:
+ case MTHCA_EVENT_TYPE_ECC_DETECT:
+ default:
+ mthca_warn(dev, "Unhandled event %02x(%02x) on EQ %d\n",
+ eqe->type, eqe->subtype, eq->eqn);
+ break;
+ }
+
+ set_eqe_hw(eqe);
+ ++eq->cons_index;
+ eqes_found = 1;
+ ++set_ci;
+
+ /*
+ * The HCA will think the queue has overflowed if we
+ * don't tell it we've been processing events. We
+ * create our EQs with MTHCA_NUM_SPARE_EQE extra
+ * entries, so we must update our consumer index at
+ * least that often.
+ */
+ if (unlikely(set_ci >= MTHCA_NUM_SPARE_EQE)) {
+ /*
+ * Conditional on hca_type is OK here because
+ * this is a rare case, not the fast path.
+ */
+ set_eq_ci(dev, eq, eq->cons_index);
+ set_ci = 0;
+ }
+ }
+
+ /*
+ * Rely on caller to set consumer index so that we don't have
+ * to test hca_type in our interrupt handling fast path.
+ */
+ return eqes_found;
+}
+
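+/*
+ * INTx handler for Tavor: read the ECR to find which EQs have pending
+ * events, acknowledge those bits in the ECR clear register, then service
+ * each EQ, updating its consumer index and rearming it.
+ */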
+static irqreturn_t mthca_tavor_interrupt(int irq, void *dev_ptr)
+{
+ struct mthca_dev *dev = dev_ptr;
+ u32 ecr;
+ int i;
+
+ if (dev->eq_table.clr_mask)
+ writel(dev->eq_table.clr_mask, dev->eq_table.clr_int);
+
+ ecr = readl(dev->eq_regs.tavor.ecr_base + 4);
+ if (!ecr)
+ return IRQ_NONE;
+
+ writel(ecr, dev->eq_regs.tavor.ecr_base +
+ MTHCA_ECR_CLR_BASE - MTHCA_ECR_BASE + 4);
+
+ for (i = 0; i < MTHCA_NUM_EQ; ++i)
+ if (ecr & dev->eq_table.eq[i].eqn_mask) {
+ if (mthca_eq_int(dev, &dev->eq_table.eq[i]))
+ tavor_set_eq_ci(dev, &dev->eq_table.eq[i],
+ dev->eq_table.eq[i].cons_index);
+ tavor_eq_req_not(dev, dev->eq_table.eq[i].eqn);
+ }
+
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t mthca_tavor_msi_x_interrupt(int irq, void *eq_ptr)
+{
+ struct mthca_eq *eq = eq_ptr;
+ struct mthca_dev *dev = eq->dev;
+
+ mthca_eq_int(dev, eq);
+ tavor_set_eq_ci(dev, eq, eq->cons_index);
+ tavor_eq_req_not(dev, eq->eqn);
+
+ /* MSI-X vectors always belong to us */
+ return IRQ_HANDLED;
+}
+
+static irqreturn_t mthca_arbel_interrupt(int irq, void *dev_ptr)
+{
+ struct mthca_dev *dev = dev_ptr;
+ int work = 0;
+ int i;
+
+ if (dev->eq_table.clr_mask)
+ writel(dev->eq_table.clr_mask, dev->eq_table.clr_int);
+
+ for (i = 0; i < MTHCA_NUM_EQ; ++i)
+ if (mthca_eq_int(dev, &dev->eq_table.eq[i])) {
+ work = 1;
+ arbel_set_eq_ci(dev, &dev->eq_table.eq[i],
+ dev->eq_table.eq[i].cons_index);
+ }
+
+ arbel_eq_req_not(dev, dev->eq_table.arm_mask);
+
+ return IRQ_RETVAL(work);
+}
+
+static irqreturn_t mthca_arbel_msi_x_interrupt(int irq, void *eq_ptr)
+{
+ struct mthca_eq *eq = eq_ptr;
+ struct mthca_dev *dev = eq->dev;
+
+ mthca_eq_int(dev, eq);
+ arbel_set_eq_ci(dev, eq, eq->cons_index);
+ arbel_eq_req_not(dev, eq->eqn_mask);
+
+ /* MSI-X vectors always belong to us */
+ return IRQ_HANDLED;
+}
+
+static int mthca_create_eq(struct mthca_dev *dev,
+ int nent,
+ u8 intr,
+ struct mthca_eq *eq)
+{
+ int npages;
+ u64 *dma_list = NULL;
+ dma_addr_t t;
+ struct mthca_mailbox *mailbox;
+ struct mthca_eq_context *eq_context;
+ int err = -ENOMEM;
+ int i;
+
+ eq->dev = dev;
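+ /*
+ * Round the number of entries up to a power of two so that the
+ * consumer index can simply be masked with (nent - 1) in get_eqe().
+ */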
+ eq->nent = roundup_pow_of_two(max(nent, 2));
+ npages = ALIGN(eq->nent * MTHCA_EQ_ENTRY_SIZE, PAGE_SIZE) / PAGE_SIZE;
+
+ eq->page_list = kmalloc(npages * sizeof *eq->page_list,
+ GFP_KERNEL);
+ if (!eq->page_list)
+ goto err_out;
+
+ for (i = 0; i < npages; ++i)
+ eq->page_list[i].buf = NULL;
+
+ dma_list = kmalloc(npages * sizeof *dma_list, GFP_KERNEL);
+ if (!dma_list)
+ goto err_out_free;
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ goto err_out_free;
+ eq_context = mailbox->buf;
+
+ for (i = 0; i < npages; ++i) {
+ eq->page_list[i].buf = dma_alloc_coherent(&dev->pdev->dev,
+ PAGE_SIZE, &t, GFP_KERNEL);
+ if (!eq->page_list[i].buf)
+ goto err_out_free_pages;
+
+ dma_list[i] = t;
+ dma_unmap_addr_set(&eq->page_list[i], mapping, t);
+
+ clear_page(eq->page_list[i].buf);
+ }
+
+ for (i = 0; i < eq->nent; ++i)
+ set_eqe_hw(get_eqe(eq, i));
+
+ eq->eqn = mthca_alloc(&dev->eq_table.alloc);
+ if (eq->eqn == -1)
+ goto err_out_free_pages;
+
+ err = mthca_mr_alloc_phys(dev, dev->driver_pd.pd_num,
+ dma_list, PAGE_SHIFT, npages,
+ 0, npages * PAGE_SIZE,
+ MTHCA_MPT_FLAG_LOCAL_WRITE |
+ MTHCA_MPT_FLAG_LOCAL_READ,
+ &eq->mr);
+ if (err)
+ goto err_out_free_eq;
+
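+ /*
+ * The EQ context stores log2 of the queue size in the top byte of
+ * logsize_usrpage; Tavor also keeps the driver UAR index in the low
+ * bits of that word, while memfree (Arbel) HCAs take the PD in a
+ * separate field instead.
+ */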
+ memset(eq_context, 0, sizeof *eq_context);
+ eq_context->flags = cpu_to_be32(MTHCA_EQ_STATUS_OK |
+ MTHCA_EQ_OWNER_HW |
+ MTHCA_EQ_STATE_ARMED |
+ MTHCA_EQ_FLAG_TR);
+ if (mthca_is_memfree(dev))
+ eq_context->flags |= cpu_to_be32(MTHCA_EQ_STATE_ARBEL);
+
+ eq_context->logsize_usrpage = cpu_to_be32((ffs(eq->nent) - 1) << 24);
+ if (mthca_is_memfree(dev)) {
+ eq_context->arbel_pd = cpu_to_be32(dev->driver_pd.pd_num);
+ } else {
+ eq_context->logsize_usrpage |= cpu_to_be32(dev->driver_uar.index);
+ eq_context->tavor_pd = cpu_to_be32(dev->driver_pd.pd_num);
+ }
+ eq_context->intr = intr;
+ eq_context->lkey = cpu_to_be32(eq->mr.ibmr.lkey);
+
+ err = mthca_SW2HW_EQ(dev, mailbox, eq->eqn);
+ if (err) {
+ mthca_warn(dev, "SW2HW_EQ returned %d\n", err);
+ goto err_out_free_mr;
+ }
+
+ kfree(dma_list);
+ mthca_free_mailbox(dev, mailbox);
+
+ eq->eqn_mask = swab32(1 << eq->eqn);
+ eq->cons_index = 0;
+
+ dev->eq_table.arm_mask |= eq->eqn_mask;
+
+ mthca_dbg(dev, "Allocated EQ %d with %d entries\n",
+ eq->eqn, eq->nent);
+
+ return err;
+
+ err_out_free_mr:
+ mthca_free_mr(dev, &eq->mr);
+
+ err_out_free_eq:
+ mthca_free(&dev->eq_table.alloc, eq->eqn);
+
+ err_out_free_pages:
+ for (i = 0; i < npages; ++i)
+ if (eq->page_list[i].buf)
+ dma_free_coherent(&dev->pdev->dev, PAGE_SIZE,
+ eq->page_list[i].buf,
+ dma_unmap_addr(&eq->page_list[i],
+ mapping));
+
+ mthca_free_mailbox(dev, mailbox);
+
+ err_out_free:
+ kfree(eq->page_list);
+ kfree(dma_list);
+
+ err_out:
+ return err;
+}
+
+static void mthca_free_eq(struct mthca_dev *dev,
+ struct mthca_eq *eq)
+{
+ struct mthca_mailbox *mailbox;
+ int err;
+ int npages = (eq->nent * MTHCA_EQ_ENTRY_SIZE + PAGE_SIZE - 1) /
+ PAGE_SIZE;
+ int i;
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return;
+
+ err = mthca_HW2SW_EQ(dev, mailbox, eq->eqn);
+ if (err)
+ mthca_warn(dev, "HW2SW_EQ returned %d\n", err);
+
+ dev->eq_table.arm_mask &= ~eq->eqn_mask;
+
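+ /*
+ * Debug dump of the EQ context returned by HW2SW_EQ, left compiled
+ * out with "if (0)".
+ */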
+ if (0) {
+ mthca_dbg(dev, "Dumping EQ context %02x:\n", eq->eqn);
+ for (i = 0; i < sizeof (struct mthca_eq_context) / 4; ++i) {
+ if (i % 4 == 0)
+ printk("[%02x] ", i * 4);
+ printk(" %08x", be32_to_cpup(mailbox->buf + i * 4));
+ if ((i + 1) % 4 == 0)
+ printk("\n");
+ }
+ }
+
+ mthca_free_mr(dev, &eq->mr);
+ for (i = 0; i < npages; ++i)
+ pci_free_consistent(dev->pdev, PAGE_SIZE,
+ eq->page_list[i].buf,
+ dma_unmap_addr(&eq->page_list[i], mapping));
+
+ kfree(eq->page_list);
+ mthca_free_mailbox(dev, mailbox);
+}
+
+static void mthca_free_irqs(struct mthca_dev *dev)
+{
+ int i;
+
+ if (dev->eq_table.have_irq)
+ free_irq(dev->pdev->irq, dev);
+ for (i = 0; i < MTHCA_NUM_EQ; ++i)
+ if (dev->eq_table.eq[i].have_irq) {
+ free_irq(dev->eq_table.eq[i].msi_x_vector,
+ dev->eq_table.eq + i);
+ dev->eq_table.eq[i].have_irq = 0;
+ }
+}
+
+static int mthca_map_reg(struct mthca_dev *dev,
+ unsigned long offset, unsigned long size,
+ void __iomem **map)
+{
+ phys_addr_t base = pci_resource_start(dev->pdev, 0);
+
+ *map = ioremap(base + offset, size);
+ if (!*map)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static int mthca_map_eq_regs(struct mthca_dev *dev)
+{
+ if (mthca_is_memfree(dev)) {
+ /*
+ * We assume that the EQ arm and EQ set CI registers
+ * fall within the first BAR. We can't trust the
+ * values firmware gives us, since those addresses are
+ * valid on the HCA's side of the PCI bus but not
+ * necessarily the host side.
+ */
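+ /*
+ * Masking a firmware-supplied address with (BAR 0 length - 1) keeps
+ * only its offset within the BAR, which mthca_map_reg() then remaps
+ * relative to the BAR's host-side base.
+ */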
+ if (mthca_map_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) &
+ dev->fw.arbel.clr_int_base, MTHCA_CLR_INT_SIZE,
+ &dev->clr_base)) {
+ mthca_err(dev, "Couldn't map interrupt clear register, "
+ "aborting.\n");
+ return -ENOMEM;
+ }
+
+ /*
+ * Add 4 because we limit ourselves to EQs 0 ... 31,
+ * so we only need the low word of the register.
+ */
+ if (mthca_map_reg(dev, ((pci_resource_len(dev->pdev, 0) - 1) &
+ dev->fw.arbel.eq_arm_base) + 4, 4,
+ &dev->eq_regs.arbel.eq_arm)) {
+ mthca_err(dev, "Couldn't map EQ arm register, aborting.\n");
+ iounmap(dev->clr_base);
+ return -ENOMEM;
+ }
+
+ if (mthca_map_reg(dev, (pci_resource_len(dev->pdev, 0) - 1) &
+ dev->fw.arbel.eq_set_ci_base,
+ MTHCA_EQ_SET_CI_SIZE,
+ &dev->eq_regs.arbel.eq_set_ci_base)) {
+ mthca_err(dev, "Couldn't map EQ CI register, aborting.\n");
+ iounmap(dev->eq_regs.arbel.eq_arm);
+ iounmap(dev->clr_base);
+ return -ENOMEM;
+ }
+ } else {
+ if (mthca_map_reg(dev, MTHCA_CLR_INT_BASE, MTHCA_CLR_INT_SIZE,
+ &dev->clr_base)) {
+ mthca_err(dev, "Couldn't map interrupt clear register, "
+ "aborting.\n");
+ return -ENOMEM;
+ }
+
+ if (mthca_map_reg(dev, MTHCA_ECR_BASE,
+ MTHCA_ECR_SIZE + MTHCA_ECR_CLR_SIZE,
+ &dev->eq_regs.tavor.ecr_base)) {
+ mthca_err(dev, "Couldn't map ecr register, "
+ "aborting.\n");
+ iounmap(dev->clr_base);
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+
+}
+
+static void mthca_unmap_eq_regs(struct mthca_dev *dev)
+{
+ if (mthca_is_memfree(dev)) {
+ iounmap(dev->eq_regs.arbel.eq_set_ci_base);
+ iounmap(dev->eq_regs.arbel.eq_arm);
+ iounmap(dev->clr_base);
+ } else {
+ iounmap(dev->eq_regs.tavor.ecr_base);
+ iounmap(dev->clr_base);
+ }
+}
+
+int mthca_map_eq_icm(struct mthca_dev *dev, u64 icm_virt)
+{
+ int ret;
+
+ /*
+ * We assume that mapping one page is enough for the whole EQ
+ * context table. This is fine with all current HCAs, because
+ * we only use 32 EQs and each EQ uses 32 bytes of context
+ * memory, or 1 KB total.
+ */
+ dev->eq_table.icm_virt = icm_virt;
+ dev->eq_table.icm_page = alloc_page(GFP_HIGHUSER);
+ if (!dev->eq_table.icm_page)
+ return -ENOMEM;
+ dev->eq_table.icm_dma = pci_map_page(dev->pdev, dev->eq_table.icm_page, 0,
+ PAGE_SIZE, PCI_DMA_BIDIRECTIONAL);
+ if (pci_dma_mapping_error(dev->pdev, dev->eq_table.icm_dma)) {
+ __free_page(dev->eq_table.icm_page);
+ return -ENOMEM;
+ }
+
+ ret = mthca_MAP_ICM_page(dev, dev->eq_table.icm_dma, icm_virt);
+ if (ret) {
+ pci_unmap_page(dev->pdev, dev->eq_table.icm_dma, PAGE_SIZE,
+ PCI_DMA_BIDIRECTIONAL);
+ __free_page(dev->eq_table.icm_page);
+ }
+
+ return ret;
+}
+
+void mthca_unmap_eq_icm(struct mthca_dev *dev)
+{
+ mthca_UNMAP_ICM(dev, dev->eq_table.icm_virt, 1);
+ pci_unmap_page(dev->pdev, dev->eq_table.icm_dma, PAGE_SIZE,
+ PCI_DMA_BIDIRECTIONAL);
+ __free_page(dev->eq_table.icm_page);
+}
+
+int mthca_init_eq_table(struct mthca_dev *dev)
+{
+ int err;
+ u8 intr;
+ int i;
+
+ err = mthca_alloc_init(&dev->eq_table.alloc,
+ dev->limits.num_eqs,
+ dev->limits.num_eqs - 1,
+ dev->limits.reserved_eqs);
+ if (err)
+ return err;
+
+ err = mthca_map_eq_regs(dev);
+ if (err)
+ goto err_out_free;
+
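+ /*
+ * With INTx, the interrupt handlers acknowledge the interrupt by
+ * writing clr_mask (a byte-swapped one-bit mask for the INTA pin)
+ * to clr_int, the 32-bit word of the clear register that holds that
+ * bit; with MSI-X, clr_mask stays zero and no acknowledge is needed.
+ */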
+ if (dev->mthca_flags & MTHCA_FLAG_MSI_X) {
+ dev->eq_table.clr_mask = 0;
+ } else {
+ dev->eq_table.clr_mask =
+ swab32(1 << (dev->eq_table.inta_pin & 31));
+ dev->eq_table.clr_int = dev->clr_base +
+ (dev->eq_table.inta_pin < 32 ? 4 : 0);
+ }
+
+ dev->eq_table.arm_mask = 0;
+
+ intr = dev->eq_table.inta_pin;
+
+ err = mthca_create_eq(dev, dev->limits.num_cqs + MTHCA_NUM_SPARE_EQE,
+ (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 128 : intr,
+ &dev->eq_table.eq[MTHCA_EQ_COMP]);
+ if (err)
+ goto err_out_unmap;
+
+ err = mthca_create_eq(dev, MTHCA_NUM_ASYNC_EQE + MTHCA_NUM_SPARE_EQE,
+ (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 129 : intr,
+ &dev->eq_table.eq[MTHCA_EQ_ASYNC]);
+ if (err)
+ goto err_out_comp;
+
+ err = mthca_create_eq(dev, MTHCA_NUM_CMD_EQE + MTHCA_NUM_SPARE_EQE,
+ (dev->mthca_flags & MTHCA_FLAG_MSI_X) ? 130 : intr,
+ &dev->eq_table.eq[MTHCA_EQ_CMD]);
+ if (err)
+ goto err_out_async;
+
+ if (dev->mthca_flags & MTHCA_FLAG_MSI_X) {
+ static const char *eq_name[] = {
+ [MTHCA_EQ_COMP] = DRV_NAME "-comp",
+ [MTHCA_EQ_ASYNC] = DRV_NAME "-async",
+ [MTHCA_EQ_CMD] = DRV_NAME "-cmd"
+ };
+
+ for (i = 0; i < MTHCA_NUM_EQ; ++i) {
+ snprintf(dev->eq_table.eq[i].irq_name,
+ IB_DEVICE_NAME_MAX,
+ "%s@pci:%s", eq_name[i],
+ pci_name(dev->pdev));
+ err = request_irq(dev->eq_table.eq[i].msi_x_vector,
+ mthca_is_memfree(dev) ?
+ mthca_arbel_msi_x_interrupt :
+ mthca_tavor_msi_x_interrupt,
+ 0, dev->eq_table.eq[i].irq_name,
+ dev->eq_table.eq + i);
+ if (err)
+ goto err_out_cmd;
+ dev->eq_table.eq[i].have_irq = 1;
+ }
+ } else {
+ snprintf(dev->eq_table.eq[0].irq_name, IB_DEVICE_NAME_MAX,
+ DRV_NAME "@pci:%s", pci_name(dev->pdev));
+ err = request_irq(dev->pdev->irq,
+ mthca_is_memfree(dev) ?
+ mthca_arbel_interrupt :
+ mthca_tavor_interrupt,
+ IRQF_SHARED, dev->eq_table.eq[0].irq_name, dev);
+ if (err)
+ goto err_out_cmd;
+ dev->eq_table.have_irq = 1;
+ }
+
+ err = mthca_MAP_EQ(dev, async_mask(dev),
+ 0, dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn);
+ if (err)
+ mthca_warn(dev, "MAP_EQ for async EQ %d failed (%d)\n",
+ dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn, err);
+
+ err = mthca_MAP_EQ(dev, MTHCA_CMD_EVENT_MASK,
+ 0, dev->eq_table.eq[MTHCA_EQ_CMD].eqn);
+ if (err)
+ mthca_warn(dev, "MAP_EQ for cmd EQ %d failed (%d)\n",
+ dev->eq_table.eq[MTHCA_EQ_CMD].eqn, err);
+
+ for (i = 0; i < MTHCA_NUM_EQ; ++i)
+ if (mthca_is_memfree(dev))
+ arbel_eq_req_not(dev, dev->eq_table.eq[i].eqn_mask);
+ else
+ tavor_eq_req_not(dev, dev->eq_table.eq[i].eqn);
+
+ return 0;
+
+err_out_cmd:
+ mthca_free_irqs(dev);
+ mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_CMD]);
+
+err_out_async:
+ mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_ASYNC]);
+
+err_out_comp:
+ mthca_free_eq(dev, &dev->eq_table.eq[MTHCA_EQ_COMP]);
+
+err_out_unmap:
+ mthca_unmap_eq_regs(dev);
+
+err_out_free:
+ mthca_alloc_cleanup(&dev->eq_table.alloc);
+ return err;
+}
+
+void mthca_cleanup_eq_table(struct mthca_dev *dev)
+{
+ int i;
+
+ mthca_free_irqs(dev);
+
+ mthca_MAP_EQ(dev, async_mask(dev),
+ 1, dev->eq_table.eq[MTHCA_EQ_ASYNC].eqn);
+ mthca_MAP_EQ(dev, MTHCA_CMD_EVENT_MASK,
+ 1, dev->eq_table.eq[MTHCA_EQ_CMD].eqn);
+
+ for (i = 0; i < MTHCA_NUM_EQ; ++i)
+ mthca_free_eq(dev, &dev->eq_table.eq[i]);
+
+ mthca_unmap_eq_regs(dev);
+
+ mthca_alloc_cleanup(&dev->eq_table.alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_mad.c b/drivers/infiniband/hw/mthca/mthca_mad.c
new file mode 100644
index 000000000..8881fa376
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_mad.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/string.h>
+#include <linux/slab.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_mad.h>
+#include <rdma/ib_smi.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+enum {
+ MTHCA_VENDOR_CLASS1 = 0x9,
+ MTHCA_VENDOR_CLASS2 = 0xa
+};
+
+static int mthca_update_rate(struct mthca_dev *dev, u8 port_num)
+{
+ struct ib_port_attr *tprops = NULL;
+ int ret;
+
+ tprops = kmalloc(sizeof *tprops, GFP_KERNEL);
+ if (!tprops)
+ return -ENOMEM;
+
+ ret = ib_query_port(&dev->ib_dev, port_num, tprops);
+ if (ret) {
+ printk(KERN_WARNING "ib_query_port failed (%d) for %s port %d\n",
+ ret, dev->ib_dev.name, port_num);
+ goto out;
+ }
+
+ dev->rate[port_num - 1] = tprops->active_speed *
+ ib_width_enum_to_int(tprops->active_width);
+
+out:
+ kfree(tprops);
+ return ret;
+}
+
+static void update_sm_ah(struct mthca_dev *dev,
+ u8 port_num, u16 lid, u8 sl)
+{
+ struct ib_ah *new_ah;
+ struct ib_ah_attr ah_attr;
+ unsigned long flags;
+
+ if (!dev->send_agent[port_num - 1][0])
+ return;
+
+ memset(&ah_attr, 0, sizeof ah_attr);
+ ah_attr.dlid = lid;
+ ah_attr.sl = sl;
+ ah_attr.port_num = port_num;
+
+ new_ah = ib_create_ah(dev->send_agent[port_num - 1][0]->qp->pd,
+ &ah_attr);
+ if (IS_ERR(new_ah))
+ return;
+
+ spin_lock_irqsave(&dev->sm_lock, flags);
+ if (dev->sm_ah[port_num - 1])
+ ib_destroy_ah(dev->sm_ah[port_num - 1]);
+ dev->sm_ah[port_num - 1] = new_ah;
+ spin_unlock_irqrestore(&dev->sm_lock, flags);
+}
+
+/*
+ * Snoop SM MADs for port info and P_Key table sets, so we can
+ * synthesize LID change and P_Key change events.
+ */
+static void smp_snoop(struct ib_device *ibdev,
+ u8 port_num,
+ struct ib_mad *mad,
+ u16 prev_lid)
+{
+ struct ib_event event;
+
+ if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+ mad->mad_hdr.method == IB_MGMT_METHOD_SET) {
+ if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO) {
+ struct ib_port_info *pinfo =
+ (struct ib_port_info *) ((struct ib_smp *) mad)->data;
+ u16 lid = be16_to_cpu(pinfo->lid);
+
+ mthca_update_rate(to_mdev(ibdev), port_num);
+ update_sm_ah(to_mdev(ibdev), port_num,
+ be16_to_cpu(pinfo->sm_lid),
+ pinfo->neighbormtu_mastersmsl & 0xf);
+
+ event.device = ibdev;
+ event.element.port_num = port_num;
+
+ if (pinfo->clientrereg_resv_subnetto & 0x80) {
+ event.event = IB_EVENT_CLIENT_REREGISTER;
+ ib_dispatch_event(&event);
+ }
+
+ if (prev_lid != lid) {
+ event.event = IB_EVENT_LID_CHANGE;
+ ib_dispatch_event(&event);
+ }
+ }
+
+ if (mad->mad_hdr.attr_id == IB_SMP_ATTR_PKEY_TABLE) {
+ event.device = ibdev;
+ event.event = IB_EVENT_PKEY_CHANGE;
+ event.element.port_num = port_num;
+ ib_dispatch_event(&event);
+ }
+ }
+}
+
+static void node_desc_override(struct ib_device *dev,
+ struct ib_mad *mad)
+{
+ if ((mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+ mad->mad_hdr.method == IB_MGMT_METHOD_GET_RESP &&
+ mad->mad_hdr.attr_id == IB_SMP_ATTR_NODE_DESC) {
+ mutex_lock(&to_mdev(dev)->cap_mask_mutex);
+ memcpy(((struct ib_smp *) mad)->data, dev->node_desc, 64);
+ mutex_unlock(&to_mdev(dev)->cap_mask_mutex);
+ }
+}
+
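+/*
+ * Resend a locally generated trap to the SM: LID-routed SMPs go out on
+ * QP0 (qpn == 0), everything else on the GSI QP, using the SM address
+ * handle cached by update_sm_ah() under sm_lock.
+ */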
+static void forward_trap(struct mthca_dev *dev,
+ u8 port_num,
+ struct ib_mad *mad)
+{
+ int qpn = mad->mad_hdr.mgmt_class != IB_MGMT_CLASS_SUBN_LID_ROUTED;
+ struct ib_mad_send_buf *send_buf;
+ struct ib_mad_agent *agent = dev->send_agent[port_num - 1][qpn];
+ int ret;
+ unsigned long flags;
+
+ if (agent) {
+ send_buf = ib_create_send_mad(agent, qpn, 0, 0, IB_MGMT_MAD_HDR,
+ IB_MGMT_MAD_DATA, GFP_ATOMIC);
+ if (IS_ERR(send_buf))
+ return;
+ /*
+ * We rely here on the fact that MLX QPs don't use the
+ * address handle after the send is posted (strictly
+ * speaking this violates the IB spec, but we know it's
+ * OK for our devices).
+ */
+ spin_lock_irqsave(&dev->sm_lock, flags);
+ memcpy(send_buf->mad, mad, sizeof *mad);
+ if ((send_buf->ah = dev->sm_ah[port_num - 1]))
+ ret = ib_post_send_mad(send_buf, NULL);
+ else
+ ret = -EINVAL;
+ spin_unlock_irqrestore(&dev->sm_lock, flags);
+
+ if (ret)
+ ib_free_send_mad(send_buf);
+ }
+}
+
+int mthca_process_mad(struct ib_device *ibdev,
+ int mad_flags,
+ u8 port_num,
+ struct ib_wc *in_wc,
+ struct ib_grh *in_grh,
+ struct ib_mad *in_mad,
+ struct ib_mad *out_mad)
+{
+ int err;
+ u16 slid = in_wc ? in_wc->slid : be16_to_cpu(IB_LID_PERMISSIVE);
+ u16 prev_lid = 0;
+ struct ib_port_attr pattr;
+
+ /* Forward locally generated traps to the SM */
+ if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP &&
+ slid == 0) {
+ forward_trap(to_mdev(ibdev), port_num, in_mad);
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+ }
+
+ /*
+ * For the SM classes, only handle gets, sets and trap represses.
+ *
+ * For the PMA and Mellanox vendor-specific classes, only handle
+ * gets and sets.
+ */
+ if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET &&
+ in_mad->mad_hdr.method != IB_MGMT_METHOD_SET &&
+ in_mad->mad_hdr.method != IB_MGMT_METHOD_TRAP_REPRESS)
+ return IB_MAD_RESULT_SUCCESS;
+
+ /*
+ * Don't process SMInfo queries or vendor-specific
+ * MADs -- the SMA can't handle them.
+ */
+ if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_SM_INFO ||
+ ((in_mad->mad_hdr.attr_id & IB_SMP_ATTR_VENDOR_MASK) ==
+ IB_SMP_ATTR_VENDOR_MASK))
+ return IB_MAD_RESULT_SUCCESS;
+ } else if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_PERF_MGMT ||
+ in_mad->mad_hdr.mgmt_class == MTHCA_VENDOR_CLASS1 ||
+ in_mad->mad_hdr.mgmt_class == MTHCA_VENDOR_CLASS2) {
+ if (in_mad->mad_hdr.method != IB_MGMT_METHOD_GET &&
+ in_mad->mad_hdr.method != IB_MGMT_METHOD_SET)
+ return IB_MAD_RESULT_SUCCESS;
+ } else
+ return IB_MAD_RESULT_SUCCESS;
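+
+ /*
+ * For a PortInfo Set, snapshot the current LID before handing the
+ * MAD to firmware so that smp_snoop() can detect a LID change
+ * afterwards.
+ */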
+ if ((in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_LID_ROUTED ||
+ in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) &&
+ in_mad->mad_hdr.method == IB_MGMT_METHOD_SET &&
+ in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO &&
+ !ib_query_port(ibdev, port_num, &pattr))
+ prev_lid = pattr.lid;
+
+ err = mthca_MAD_IFC(to_mdev(ibdev),
+ mad_flags & IB_MAD_IGNORE_MKEY,
+ mad_flags & IB_MAD_IGNORE_BKEY,
+ port_num, in_wc, in_grh, in_mad, out_mad);
+ if (err == -EBADMSG)
+ return IB_MAD_RESULT_SUCCESS;
+ else if (err) {
+ mthca_err(to_mdev(ibdev), "MAD_IFC returned %d\n", err);
+ return IB_MAD_RESULT_FAILURE;
+ }
+
+ if (!out_mad->mad_hdr.status) {
+ smp_snoop(ibdev, port_num, in_mad, prev_lid);
+ node_desc_override(ibdev, out_mad);
+ }
+
+ /* set return bit in status of directed route responses */
+ if (in_mad->mad_hdr.mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ out_mad->mad_hdr.status |= cpu_to_be16(1 << 15);
+
+ if (in_mad->mad_hdr.method == IB_MGMT_METHOD_TRAP_REPRESS)
+ /* no response for trap repress */
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+int mthca_create_agents(struct mthca_dev *dev)
+{
+ struct ib_mad_agent *agent;
+ int p, q;
+ int ret;
+
+ spin_lock_init(&dev->sm_lock);
+
+ for (p = 0; p < dev->limits.num_ports; ++p)
+ for (q = 0; q <= 1; ++q) {
+ agent = ib_register_mad_agent(&dev->ib_dev, p + 1,
+ q ? IB_QPT_GSI : IB_QPT_SMI,
+ NULL, 0, send_handler,
+ NULL, NULL, 0);
+ if (IS_ERR(agent)) {
+ ret = PTR_ERR(agent);
+ goto err;
+ }
+ dev->send_agent[p][q] = agent;
+ }
+
+ for (p = 1; p <= dev->limits.num_ports; ++p) {
+ ret = mthca_update_rate(dev, p);
+ if (ret) {
+ mthca_err(dev, "Failed to obtain port %d rate."
+ " aborting.\n", p);
+ goto err;
+ }
+ }
+
+ return 0;
+
+err:
+ for (p = 0; p < dev->limits.num_ports; ++p)
+ for (q = 0; q <= 1; ++q)
+ if (dev->send_agent[p][q])
+ ib_unregister_mad_agent(dev->send_agent[p][q]);
+
+ return ret;
+}
+
+void mthca_free_agents(struct mthca_dev *dev)
+{
+ struct ib_mad_agent *agent;
+ int p, q;
+
+ for (p = 0; p < dev->limits.num_ports; ++p) {
+ for (q = 0; q <= 1; ++q) {
+ agent = dev->send_agent[p][q];
+ dev->send_agent[p][q] = NULL;
+ ib_unregister_mad_agent(agent);
+ }
+
+ if (dev->sm_ah[p])
+ ib_destroy_ah(dev->sm_ah[p]);
+ }
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c
new file mode 100644
index 000000000..ded76c101
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_main.c
@@ -0,0 +1,1275 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/gfp.h>
+
+#include "mthca_dev.h"
+#include "mthca_config_reg.h"
+#include "mthca_cmd.h"
+#include "mthca_profile.h"
+#include "mthca_memfree.h"
+#include "mthca_wqe.h"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("Mellanox InfiniBand HCA low-level driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+#ifdef CONFIG_INFINIBAND_MTHCA_DEBUG
+
+int mthca_debug_level = 0;
+module_param_named(debug_level, mthca_debug_level, int, 0644);
+MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
+
+#endif /* CONFIG_INFINIBAND_MTHCA_DEBUG */
+
+#ifdef CONFIG_PCI_MSI
+
+static int msi_x = 1;
+module_param(msi_x, int, 0444);
+MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero");
+
+#else /* CONFIG_PCI_MSI */
+
+#define msi_x (0)
+
+#endif /* CONFIG_PCI_MSI */
+
+static int tune_pci = 0;
+module_param(tune_pci, int, 0444);
+MODULE_PARM_DESC(tune_pci, "increase PCI burst from the default set by BIOS if nonzero");
+
+DEFINE_MUTEX(mthca_device_mutex);
+
+#define MTHCA_DEFAULT_NUM_QP (1 << 16)
+#define MTHCA_DEFAULT_RDB_PER_QP (1 << 2)
+#define MTHCA_DEFAULT_NUM_CQ (1 << 16)
+#define MTHCA_DEFAULT_NUM_MCG (1 << 13)
+#define MTHCA_DEFAULT_NUM_MPT (1 << 17)
+#define MTHCA_DEFAULT_NUM_MTT (1 << 20)
+#define MTHCA_DEFAULT_NUM_UDAV (1 << 15)
+#define MTHCA_DEFAULT_NUM_RESERVED_MTTS (1 << 18)
+#define MTHCA_DEFAULT_NUM_UARC_SIZE (1 << 18)
+
+static struct mthca_profile hca_profile = {
+ .num_qp = MTHCA_DEFAULT_NUM_QP,
+ .rdb_per_qp = MTHCA_DEFAULT_RDB_PER_QP,
+ .num_cq = MTHCA_DEFAULT_NUM_CQ,
+ .num_mcg = MTHCA_DEFAULT_NUM_MCG,
+ .num_mpt = MTHCA_DEFAULT_NUM_MPT,
+ .num_mtt = MTHCA_DEFAULT_NUM_MTT,
+ .num_udav = MTHCA_DEFAULT_NUM_UDAV, /* Tavor only */
+ .fmr_reserved_mtts = MTHCA_DEFAULT_NUM_RESERVED_MTTS, /* Tavor only */
+ .uarc_size = MTHCA_DEFAULT_NUM_UARC_SIZE, /* Arbel only */
+};
+
+module_param_named(num_qp, hca_profile.num_qp, int, 0444);
+MODULE_PARM_DESC(num_qp, "maximum number of QPs per HCA");
+
+module_param_named(rdb_per_qp, hca_profile.rdb_per_qp, int, 0444);
+MODULE_PARM_DESC(rdb_per_qp, "number of RDB buffers per QP");
+
+module_param_named(num_cq, hca_profile.num_cq, int, 0444);
+MODULE_PARM_DESC(num_cq, "maximum number of CQs per HCA");
+
+module_param_named(num_mcg, hca_profile.num_mcg, int, 0444);
+MODULE_PARM_DESC(num_mcg, "maximum number of multicast groups per HCA");
+
+module_param_named(num_mpt, hca_profile.num_mpt, int, 0444);
+MODULE_PARM_DESC(num_mpt,
+ "maximum number of memory protection table entries per HCA");
+
+module_param_named(num_mtt, hca_profile.num_mtt, int, 0444);
+MODULE_PARM_DESC(num_mtt,
+ "maximum number of memory translation table segments per HCA");
+
+module_param_named(num_udav, hca_profile.num_udav, int, 0444);
+MODULE_PARM_DESC(num_udav, "maximum number of UD address vectors per HCA");
+
+module_param_named(fmr_reserved_mtts, hca_profile.fmr_reserved_mtts, int, 0444);
+MODULE_PARM_DESC(fmr_reserved_mtts,
+ "number of memory translation table segments reserved for FMR");
+
+static int log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8);
+module_param_named(log_mtts_per_seg, log_mtts_per_seg, int, 0444);
+MODULE_PARM_DESC(log_mtts_per_seg, "Log2 number of MTT entries per segment (1-5)");
+
+static char mthca_version[] =
+ DRV_NAME ": Mellanox InfiniBand HCA driver v"
+ DRV_VERSION " (" DRV_RELDATE ")\n";
+
+static int mthca_tune_pci(struct mthca_dev *mdev)
+{
+ if (!tune_pci)
+ return 0;
+
+ /* First try to max out Read Byte Count */
+ if (pci_find_capability(mdev->pdev, PCI_CAP_ID_PCIX)) {
+ if (pcix_set_mmrbc(mdev->pdev, pcix_get_max_mmrbc(mdev->pdev))) {
+ mthca_err(mdev, "Couldn't set PCI-X max read count, "
+ "aborting.\n");
+ return -ENODEV;
+ }
+ } else if (!(mdev->mthca_flags & MTHCA_FLAG_PCIE))
+ mthca_info(mdev, "No PCI-X capability, not setting RBC.\n");
+
+ if (pci_is_pcie(mdev->pdev)) {
+ if (pcie_set_readrq(mdev->pdev, 4096)) {
+ mthca_err(mdev, "Couldn't write PCI Express read request, "
+ "aborting.\n");
+ return -ENODEV;
+ }
+ } else if (mdev->mthca_flags & MTHCA_FLAG_PCIE)
+ mthca_info(mdev, "No PCI Express capability, "
+ "not setting Max Read Request Size.\n");
+
+ return 0;
+}
+
+static int mthca_dev_lim(struct mthca_dev *mdev, struct mthca_dev_lim *dev_lim)
+{
+ int err;
+
+ mdev->limits.mtt_seg_size = (1 << log_mtts_per_seg) * 8;
+ err = mthca_QUERY_DEV_LIM(mdev, dev_lim);
+ if (err) {
+ mthca_err(mdev, "QUERY_DEV_LIM command returned %d"
+ ", aborting.\n", err);
+ return err;
+ }
+ if (dev_lim->min_page_sz > PAGE_SIZE) {
+ mthca_err(mdev, "HCA minimum page size of %d bigger than "
+ "kernel PAGE_SIZE of %ld, aborting.\n",
+ dev_lim->min_page_sz, PAGE_SIZE);
+ return -ENODEV;
+ }
+ if (dev_lim->num_ports > MTHCA_MAX_PORTS) {
+ mthca_err(mdev, "HCA has %d ports, but we only support %d, "
+ "aborting.\n",
+ dev_lim->num_ports, MTHCA_MAX_PORTS);
+ return -ENODEV;
+ }
+
+ if (dev_lim->uar_size > pci_resource_len(mdev->pdev, 2)) {
+ mthca_err(mdev, "HCA reported UAR size of 0x%x bigger than "
+ "PCI resource 2 size of 0x%llx, aborting.\n",
+ dev_lim->uar_size,
+ (unsigned long long)pci_resource_len(mdev->pdev, 2));
+ return -ENODEV;
+ }
+
+ mdev->limits.num_ports = dev_lim->num_ports;
+ mdev->limits.vl_cap = dev_lim->max_vl;
+ mdev->limits.mtu_cap = dev_lim->max_mtu;
+ mdev->limits.gid_table_len = dev_lim->max_gids;
+ mdev->limits.pkey_table_len = dev_lim->max_pkeys;
+ mdev->limits.local_ca_ack_delay = dev_lim->local_ca_ack_delay;
+ /*
+ * Need to allow for worst case send WQE overhead and check
+ * whether max_desc_sz imposes a lower limit than max_sg; UD
+ * send has the biggest overhead.
+ */
+ mdev->limits.max_sg = min_t(int, dev_lim->max_sg,
+ (dev_lim->max_desc_sz -
+ sizeof (struct mthca_next_seg) -
+ (mthca_is_memfree(mdev) ?
+ sizeof (struct mthca_arbel_ud_seg) :
+ sizeof (struct mthca_tavor_ud_seg))) /
+ sizeof (struct mthca_data_seg));
+ mdev->limits.max_wqes = dev_lim->max_qp_sz;
+ mdev->limits.max_qp_init_rdma = dev_lim->max_requester_per_qp;
+ mdev->limits.reserved_qps = dev_lim->reserved_qps;
+ mdev->limits.max_srq_wqes = dev_lim->max_srq_sz;
+ mdev->limits.reserved_srqs = dev_lim->reserved_srqs;
+ mdev->limits.reserved_eecs = dev_lim->reserved_eecs;
+ mdev->limits.max_desc_sz = dev_lim->max_desc_sz;
+ mdev->limits.max_srq_sge = mthca_max_srq_sge(mdev);
+ /*
+ * Subtract 1 from the limit because we need to allocate a
+ * spare CQE so the HCA HW can tell the difference between an
+ * empty CQ and a full CQ.
+ */
+ mdev->limits.max_cqes = dev_lim->max_cq_sz - 1;
+ mdev->limits.reserved_cqs = dev_lim->reserved_cqs;
+ mdev->limits.reserved_eqs = dev_lim->reserved_eqs;
+ mdev->limits.reserved_mtts = dev_lim->reserved_mtts;
+ mdev->limits.reserved_mrws = dev_lim->reserved_mrws;
+ mdev->limits.reserved_uars = dev_lim->reserved_uars;
+ mdev->limits.reserved_pds = dev_lim->reserved_pds;
+ mdev->limits.port_width_cap = dev_lim->max_port_width;
+ mdev->limits.page_size_cap = ~(u32) (dev_lim->min_page_sz - 1);
+ mdev->limits.flags = dev_lim->flags;
+ /*
+ * For old FW that doesn't return static rate support, use a
+ * value of 0x3 (only static rate values of 0 or 1 are handled),
+ * except on Sinai, where even old FW can handle static rate
+ * values of 2 and 3.
+ */
+ if (dev_lim->stat_rate_support)
+ mdev->limits.stat_rate_support = dev_lim->stat_rate_support;
+ else if (mdev->mthca_flags & MTHCA_FLAG_SINAI_OPT)
+ mdev->limits.stat_rate_support = 0xf;
+ else
+ mdev->limits.stat_rate_support = 0x3;
+
+ /*
+ * IB_DEVICE_RESIZE_MAX_WR is not supported by the driver.
+ * It may be doable, since the hardware supports it for SRQs.
+ *
+ * IB_DEVICE_N_NOTIFY_CQ is supported by the hardware but not by
+ * the driver.
+ *
+ * IB_DEVICE_SRQ_RESIZE is supported by the hardware, but SRQ resize
+ * is not supported by the driver.
+ */
+ mdev->device_cap_flags = IB_DEVICE_CHANGE_PHY_PORT |
+ IB_DEVICE_PORT_ACTIVE_EVENT |
+ IB_DEVICE_SYS_IMAGE_GUID |
+ IB_DEVICE_RC_RNR_NAK_GEN;
+
+ if (dev_lim->flags & DEV_LIM_FLAG_BAD_PKEY_CNTR)
+ mdev->device_cap_flags |= IB_DEVICE_BAD_PKEY_CNTR;
+
+ if (dev_lim->flags & DEV_LIM_FLAG_BAD_QKEY_CNTR)
+ mdev->device_cap_flags |= IB_DEVICE_BAD_QKEY_CNTR;
+
+ if (dev_lim->flags & DEV_LIM_FLAG_RAW_MULTI)
+ mdev->device_cap_flags |= IB_DEVICE_RAW_MULTI;
+
+ if (dev_lim->flags & DEV_LIM_FLAG_AUTO_PATH_MIG)
+ mdev->device_cap_flags |= IB_DEVICE_AUTO_PATH_MIG;
+
+ if (dev_lim->flags & DEV_LIM_FLAG_UD_AV_PORT_ENFORCE)
+ mdev->device_cap_flags |= IB_DEVICE_UD_AV_PORT_ENFORCE;
+
+ if (dev_lim->flags & DEV_LIM_FLAG_SRQ)
+ mdev->mthca_flags |= MTHCA_FLAG_SRQ;
+
+ if (mthca_is_memfree(mdev))
+ if (dev_lim->flags & DEV_LIM_FLAG_IPOIB_CSUM)
+ mdev->device_cap_flags |= IB_DEVICE_UD_IP_CSUM;
+
+ return 0;
+}
+
+static int mthca_init_tavor(struct mthca_dev *mdev)
+{
+ s64 size;
+ int err;
+ struct mthca_dev_lim dev_lim;
+ struct mthca_profile profile;
+ struct mthca_init_hca_param init_hca;
+
+ err = mthca_SYS_EN(mdev);
+ if (err) {
+ mthca_err(mdev, "SYS_EN command returned %d, aborting.\n", err);
+ return err;
+ }
+
+ err = mthca_QUERY_FW(mdev);
+ if (err) {
+ mthca_err(mdev, "QUERY_FW command returned %d,"
+ " aborting.\n", err);
+ goto err_disable;
+ }
+ err = mthca_QUERY_DDR(mdev);
+ if (err) {
+ mthca_err(mdev, "QUERY_DDR command returned %d, aborting.\n", err);
+ goto err_disable;
+ }
+
+ err = mthca_dev_lim(mdev, &dev_lim);
+ if (err) {
+ mthca_err(mdev, "QUERY_DEV_LIM command returned %d, aborting.\n", err);
+ goto err_disable;
+ }
+
+ profile = hca_profile;
+ profile.num_uar = dev_lim.uar_size / PAGE_SIZE;
+ profile.uarc_size = 0;
+ if (mdev->mthca_flags & MTHCA_FLAG_SRQ)
+ profile.num_srq = dev_lim.max_srqs;
+
+ size = mthca_make_profile(mdev, &profile, &dev_lim, &init_hca);
+ if (size < 0) {
+ err = size;
+ goto err_disable;
+ }
+
+ err = mthca_INIT_HCA(mdev, &init_hca);
+ if (err) {
+ mthca_err(mdev, "INIT_HCA command returned %d, aborting.\n", err);
+ goto err_disable;
+ }
+
+ return 0;
+
+err_disable:
+ mthca_SYS_DIS(mdev);
+
+ return err;
+}
+
+static int mthca_load_fw(struct mthca_dev *mdev)
+{
+ int err;
+
+ /* FIXME: use HCA-attached memory for FW if present */
+
+ mdev->fw.arbel.fw_icm =
+ mthca_alloc_icm(mdev, mdev->fw.arbel.fw_pages,
+ GFP_HIGHUSER | __GFP_NOWARN, 0);
+ if (!mdev->fw.arbel.fw_icm) {
+ mthca_err(mdev, "Couldn't allocate FW area, aborting.\n");
+ return -ENOMEM;
+ }
+
+ err = mthca_MAP_FA(mdev, mdev->fw.arbel.fw_icm);
+ if (err) {
+ mthca_err(mdev, "MAP_FA command returned %d, aborting.\n", err);
+ goto err_free;
+ }
+ err = mthca_RUN_FW(mdev);
+ if (err) {
+ mthca_err(mdev, "RUN_FW command returned %d, aborting.\n", err);
+ goto err_unmap_fa;
+ }
+
+ return 0;
+
+err_unmap_fa:
+ mthca_UNMAP_FA(mdev);
+
+err_free:
+ mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0);
+ return err;
+}
+
+static int mthca_init_icm(struct mthca_dev *mdev,
+ struct mthca_dev_lim *dev_lim,
+ struct mthca_init_hca_param *init_hca,
+ u64 icm_size)
+{
+ u64 aux_pages;
+ int err;
+
+ err = mthca_SET_ICM_SIZE(mdev, icm_size, &aux_pages);
+ if (err) {
+ mthca_err(mdev, "SET_ICM_SIZE command returned %d, aborting.\n", err);
+ return err;
+ }
+
+ mthca_dbg(mdev, "%lld KB of HCA context requires %lld KB aux memory.\n",
+ (unsigned long long) icm_size >> 10,
+ (unsigned long long) aux_pages << 2);
+
+ mdev->fw.arbel.aux_icm = mthca_alloc_icm(mdev, aux_pages,
+ GFP_HIGHUSER | __GFP_NOWARN, 0);
+ if (!mdev->fw.arbel.aux_icm) {
+ mthca_err(mdev, "Couldn't allocate aux memory, aborting.\n");
+ return -ENOMEM;
+ }
+
+ err = mthca_MAP_ICM_AUX(mdev, mdev->fw.arbel.aux_icm);
+ if (err) {
+ mthca_err(mdev, "MAP_ICM_AUX returned %d, aborting.\n", err);
+ goto err_free_aux;
+ }
+
+ err = mthca_map_eq_icm(mdev, init_hca->eqc_base);
+ if (err) {
+ mthca_err(mdev, "Failed to map EQ context memory, aborting.\n");
+ goto err_unmap_aux;
+ }
+
+ /* CPU writes to non-reserved MTTs, while HCA might DMA to reserved mtts */
+ mdev->limits.reserved_mtts = ALIGN(mdev->limits.reserved_mtts * mdev->limits.mtt_seg_size,
+ dma_get_cache_alignment()) / mdev->limits.mtt_seg_size;
+
+ mdev->mr_table.mtt_table = mthca_alloc_icm_table(mdev, init_hca->mtt_base,
+ mdev->limits.mtt_seg_size,
+ mdev->limits.num_mtt_segs,
+ mdev->limits.reserved_mtts,
+ 1, 0);
+ if (!mdev->mr_table.mtt_table) {
+ mthca_err(mdev, "Failed to map MTT context memory, aborting.\n");
+ err = -ENOMEM;
+ goto err_unmap_eq;
+ }
+
+ mdev->mr_table.mpt_table = mthca_alloc_icm_table(mdev, init_hca->mpt_base,
+ dev_lim->mpt_entry_sz,
+ mdev->limits.num_mpts,
+ mdev->limits.reserved_mrws,
+ 1, 1);
+ if (!mdev->mr_table.mpt_table) {
+ mthca_err(mdev, "Failed to map MPT context memory, aborting.\n");
+ err = -ENOMEM;
+ goto err_unmap_mtt;
+ }
+
+ mdev->qp_table.qp_table = mthca_alloc_icm_table(mdev, init_hca->qpc_base,
+ dev_lim->qpc_entry_sz,
+ mdev->limits.num_qps,
+ mdev->limits.reserved_qps,
+ 0, 0);
+ if (!mdev->qp_table.qp_table) {
+ mthca_err(mdev, "Failed to map QP context memory, aborting.\n");
+ err = -ENOMEM;
+ goto err_unmap_mpt;
+ }
+
+ mdev->qp_table.eqp_table = mthca_alloc_icm_table(mdev, init_hca->eqpc_base,
+ dev_lim->eqpc_entry_sz,
+ mdev->limits.num_qps,
+ mdev->limits.reserved_qps,
+ 0, 0);
+ if (!mdev->qp_table.eqp_table) {
+ mthca_err(mdev, "Failed to map EQP context memory, aborting.\n");
+ err = -ENOMEM;
+ goto err_unmap_qp;
+ }
+
+ mdev->qp_table.rdb_table = mthca_alloc_icm_table(mdev, init_hca->rdb_base,
+ MTHCA_RDB_ENTRY_SIZE,
+ mdev->limits.num_qps <<
+ mdev->qp_table.rdb_shift, 0,
+ 0, 0);
+ if (!mdev->qp_table.rdb_table) {
+ mthca_err(mdev, "Failed to map RDB context memory, aborting\n");
+ err = -ENOMEM;
+ goto err_unmap_eqp;
+ }
+
+ mdev->cq_table.table = mthca_alloc_icm_table(mdev, init_hca->cqc_base,
+ dev_lim->cqc_entry_sz,
+ mdev->limits.num_cqs,
+ mdev->limits.reserved_cqs,
+ 0, 0);
+ if (!mdev->cq_table.table) {
+ mthca_err(mdev, "Failed to map CQ context memory, aborting.\n");
+ err = -ENOMEM;
+ goto err_unmap_rdb;
+ }
+
+ if (mdev->mthca_flags & MTHCA_FLAG_SRQ) {
+ mdev->srq_table.table =
+ mthca_alloc_icm_table(mdev, init_hca->srqc_base,
+ dev_lim->srq_entry_sz,
+ mdev->limits.num_srqs,
+ mdev->limits.reserved_srqs,
+ 0, 0);
+ if (!mdev->srq_table.table) {
+ mthca_err(mdev, "Failed to map SRQ context memory, "
+ "aborting.\n");
+ err = -ENOMEM;
+ goto err_unmap_cq;
+ }
+ }
+
+ /*
+ * It's not strictly required, but for simplicity just map the
+ * whole multicast group table now. The table isn't very big
+ * and it's a lot easier than trying to track ref counts.
+ */
+ mdev->mcg_table.table = mthca_alloc_icm_table(mdev, init_hca->mc_base,
+ MTHCA_MGM_ENTRY_SIZE,
+ mdev->limits.num_mgms +
+ mdev->limits.num_amgms,
+ mdev->limits.num_mgms +
+ mdev->limits.num_amgms,
+ 0, 0);
+ if (!mdev->mcg_table.table) {
+ mthca_err(mdev, "Failed to map MCG context memory, aborting.\n");
+ err = -ENOMEM;
+ goto err_unmap_srq;
+ }
+
+ return 0;
+
+err_unmap_srq:
+ if (mdev->mthca_flags & MTHCA_FLAG_SRQ)
+ mthca_free_icm_table(mdev, mdev->srq_table.table);
+
+err_unmap_cq:
+ mthca_free_icm_table(mdev, mdev->cq_table.table);
+
+err_unmap_rdb:
+ mthca_free_icm_table(mdev, mdev->qp_table.rdb_table);
+
+err_unmap_eqp:
+ mthca_free_icm_table(mdev, mdev->qp_table.eqp_table);
+
+err_unmap_qp:
+ mthca_free_icm_table(mdev, mdev->qp_table.qp_table);
+
+err_unmap_mpt:
+ mthca_free_icm_table(mdev, mdev->mr_table.mpt_table);
+
+err_unmap_mtt:
+ mthca_free_icm_table(mdev, mdev->mr_table.mtt_table);
+
+err_unmap_eq:
+ mthca_unmap_eq_icm(mdev);
+
+err_unmap_aux:
+ mthca_UNMAP_ICM_AUX(mdev);
+
+err_free_aux:
+ mthca_free_icm(mdev, mdev->fw.arbel.aux_icm, 0);
+
+ return err;
+}
+
+static void mthca_free_icms(struct mthca_dev *mdev)
+{
+
+ mthca_free_icm_table(mdev, mdev->mcg_table.table);
+ if (mdev->mthca_flags & MTHCA_FLAG_SRQ)
+ mthca_free_icm_table(mdev, mdev->srq_table.table);
+ mthca_free_icm_table(mdev, mdev->cq_table.table);
+ mthca_free_icm_table(mdev, mdev->qp_table.rdb_table);
+ mthca_free_icm_table(mdev, mdev->qp_table.eqp_table);
+ mthca_free_icm_table(mdev, mdev->qp_table.qp_table);
+ mthca_free_icm_table(mdev, mdev->mr_table.mpt_table);
+ mthca_free_icm_table(mdev, mdev->mr_table.mtt_table);
+ mthca_unmap_eq_icm(mdev);
+
+ mthca_UNMAP_ICM_AUX(mdev);
+ mthca_free_icm(mdev, mdev->fw.arbel.aux_icm, 0);
+}
+
+static int mthca_init_arbel(struct mthca_dev *mdev)
+{
+ struct mthca_dev_lim dev_lim;
+ struct mthca_profile profile;
+ struct mthca_init_hca_param init_hca;
+ s64 icm_size;
+ int err;
+
+ err = mthca_QUERY_FW(mdev);
+ if (err) {
+ mthca_err(mdev, "QUERY_FW command failed %d, aborting.\n", err);
+ return err;
+ }
+
+ err = mthca_ENABLE_LAM(mdev);
+ if (err == -EAGAIN) {
+ mthca_dbg(mdev, "No HCA-attached memory (running in MemFree mode)\n");
+ mdev->mthca_flags |= MTHCA_FLAG_NO_LAM;
+ } else if (err) {
+ mthca_err(mdev, "ENABLE_LAM returned %d, aborting.\n", err);
+ return err;
+ }
+
+ err = mthca_load_fw(mdev);
+ if (err) {
+ mthca_err(mdev, "Loading FW returned %d, aborting.\n", err);
+ goto err_disable;
+ }
+
+ err = mthca_dev_lim(mdev, &dev_lim);
+ if (err) {
+ mthca_err(mdev, "QUERY_DEV_LIM returned %d, aborting.\n", err);
+ goto err_stop_fw;
+ }
+
+ profile = hca_profile;
+ profile.num_uar = dev_lim.uar_size / PAGE_SIZE;
+ profile.num_udav = 0;
+ if (mdev->mthca_flags & MTHCA_FLAG_SRQ)
+ profile.num_srq = dev_lim.max_srqs;
+
+ icm_size = mthca_make_profile(mdev, &profile, &dev_lim, &init_hca);
+ if (icm_size < 0) {
+ err = icm_size;
+ goto err_stop_fw;
+ }
+
+ err = mthca_init_icm(mdev, &dev_lim, &init_hca, icm_size);
+ if (err)
+ goto err_stop_fw;
+
+ err = mthca_INIT_HCA(mdev, &init_hca);
+ if (err) {
+ mthca_err(mdev, "INIT_HCA command returned %d, aborting.\n", err);
+ goto err_free_icm;
+ }
+
+ return 0;
+
+err_free_icm:
+ mthca_free_icms(mdev);
+
+err_stop_fw:
+ mthca_UNMAP_FA(mdev);
+ mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0);
+
+err_disable:
+ if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM))
+ mthca_DISABLE_LAM(mdev);
+
+ return err;
+}
+
+static void mthca_close_hca(struct mthca_dev *mdev)
+{
+ mthca_CLOSE_HCA(mdev, 0);
+
+ if (mthca_is_memfree(mdev)) {
+ mthca_free_icms(mdev);
+
+ mthca_UNMAP_FA(mdev);
+ mthca_free_icm(mdev, mdev->fw.arbel.fw_icm, 0);
+
+ if (!(mdev->mthca_flags & MTHCA_FLAG_NO_LAM))
+ mthca_DISABLE_LAM(mdev);
+ } else
+ mthca_SYS_DIS(mdev);
+}
+
+static int mthca_init_hca(struct mthca_dev *mdev)
+{
+ int err;
+ struct mthca_adapter adapter;
+
+ if (mthca_is_memfree(mdev))
+ err = mthca_init_arbel(mdev);
+ else
+ err = mthca_init_tavor(mdev);
+
+ if (err)
+ return err;
+
+ err = mthca_QUERY_ADAPTER(mdev, &adapter);
+ if (err) {
+ mthca_err(mdev, "QUERY_ADAPTER command returned %d, aborting.\n", err);
+ goto err_close;
+ }
+
+ mdev->eq_table.inta_pin = adapter.inta_pin;
+ if (!mthca_is_memfree(mdev))
+ mdev->rev_id = adapter.revision_id;
+ memcpy(mdev->board_id, adapter.board_id, sizeof mdev->board_id);
+
+ return 0;
+
+err_close:
+ mthca_close_hca(mdev);
+ return err;
+}
+
+static int mthca_setup_hca(struct mthca_dev *dev)
+{
+ int err;
+
+ MTHCA_INIT_DOORBELL_LOCK(&dev->doorbell_lock);
+
+ err = mthca_init_uar_table(dev);
+ if (err) {
+ mthca_err(dev, "Failed to initialize "
+ "user access region table, aborting.\n");
+ return err;
+ }
+
+ err = mthca_uar_alloc(dev, &dev->driver_uar);
+ if (err) {
+ mthca_err(dev, "Failed to allocate driver access region, "
+ "aborting.\n");
+ goto err_uar_table_free;
+ }
+
+ dev->kar = ioremap((phys_addr_t) dev->driver_uar.pfn << PAGE_SHIFT, PAGE_SIZE);
+ if (!dev->kar) {
+ mthca_err(dev, "Couldn't map kernel access region, "
+ "aborting.\n");
+ err = -ENOMEM;
+ goto err_uar_free;
+ }
+
+ err = mthca_init_pd_table(dev);
+ if (err) {
+ mthca_err(dev, "Failed to initialize "
+ "protection domain table, aborting.\n");
+ goto err_kar_unmap;
+ }
+
+ err = mthca_init_mr_table(dev);
+ if (err) {
+ mthca_err(dev, "Failed to initialize "
+ "memory region table, aborting.\n");
+ goto err_pd_table_free;
+ }
+
+ err = mthca_pd_alloc(dev, 1, &dev->driver_pd);
+ if (err) {
+ mthca_err(dev, "Failed to create driver PD, "
+ "aborting.\n");
+ goto err_mr_table_free;
+ }
+
+ err = mthca_init_eq_table(dev);
+ if (err) {
+ mthca_err(dev, "Failed to initialize "
+ "event queue table, aborting.\n");
+ goto err_pd_free;
+ }
+
+ err = mthca_cmd_use_events(dev);
+ if (err) {
+ mthca_err(dev, "Failed to switch to event-driven "
+ "firmware commands, aborting.\n");
+ goto err_eq_table_free;
+ }
+
+ err = mthca_NOP(dev);
+ if (err) {
+ if (dev->mthca_flags & MTHCA_FLAG_MSI_X) {
+ mthca_warn(dev, "NOP command failed to generate interrupt "
+ "(IRQ %d).\n",
+ dev->eq_table.eq[MTHCA_EQ_CMD].msi_x_vector);
+ mthca_warn(dev, "Trying again with MSI-X disabled.\n");
+ } else {
+ mthca_err(dev, "NOP command failed to generate interrupt "
+ "(IRQ %d), aborting.\n",
+ dev->pdev->irq);
+ mthca_err(dev, "BIOS or ACPI interrupt routing problem?\n");
+ }
+
+ goto err_cmd_poll;
+ }
+
+ mthca_dbg(dev, "NOP command IRQ test passed\n");
+
+ err = mthca_init_cq_table(dev);
+ if (err) {
+ mthca_err(dev, "Failed to initialize "
+ "completion queue table, aborting.\n");
+ goto err_cmd_poll;
+ }
+
+ err = mthca_init_srq_table(dev);
+ if (err) {
+ mthca_err(dev, "Failed to initialize "
+ "shared receive queue table, aborting.\n");
+ goto err_cq_table_free;
+ }
+
+ err = mthca_init_qp_table(dev);
+ if (err) {
+ mthca_err(dev, "Failed to initialize "
+ "queue pair table, aborting.\n");
+ goto err_srq_table_free;
+ }
+
+ err = mthca_init_av_table(dev);
+ if (err) {
+ mthca_err(dev, "Failed to initialize "
+ "address vector table, aborting.\n");
+ goto err_qp_table_free;
+ }
+
+ err = mthca_init_mcg_table(dev);
+ if (err) {
+ mthca_err(dev, "Failed to initialize "
+ "multicast group table, aborting.\n");
+ goto err_av_table_free;
+ }
+
+ return 0;
+
+err_av_table_free:
+ mthca_cleanup_av_table(dev);
+
+err_qp_table_free:
+ mthca_cleanup_qp_table(dev);
+
+err_srq_table_free:
+ mthca_cleanup_srq_table(dev);
+
+err_cq_table_free:
+ mthca_cleanup_cq_table(dev);
+
+err_cmd_poll:
+ mthca_cmd_use_polling(dev);
+
+err_eq_table_free:
+ mthca_cleanup_eq_table(dev);
+
+err_pd_free:
+ mthca_pd_free(dev, &dev->driver_pd);
+
+err_mr_table_free:
+ mthca_cleanup_mr_table(dev);
+
+err_pd_table_free:
+ mthca_cleanup_pd_table(dev);
+
+err_kar_unmap:
+ iounmap(dev->kar);
+
+err_uar_free:
+ mthca_uar_free(dev, &dev->driver_uar);
+
+err_uar_table_free:
+ mthca_cleanup_uar_table(dev);
+ return err;
+}
+
+static int mthca_enable_msi_x(struct mthca_dev *mdev)
+{
+ struct msix_entry entries[3];
+ int err;
+
+ entries[0].entry = 0;
+ entries[1].entry = 1;
+ entries[2].entry = 2;
+
+ err = pci_enable_msix_exact(mdev->pdev, entries, ARRAY_SIZE(entries));
+ if (err)
+ return err;
+
+ mdev->eq_table.eq[MTHCA_EQ_COMP ].msi_x_vector = entries[0].vector;
+ mdev->eq_table.eq[MTHCA_EQ_ASYNC].msi_x_vector = entries[1].vector;
+ mdev->eq_table.eq[MTHCA_EQ_CMD ].msi_x_vector = entries[2].vector;
+
+ return 0;
+}
+
+/* Types of supported HCA */
+enum {
+ TAVOR, /* MT23108 */
+ ARBEL_COMPAT, /* MT25208 in Tavor compat mode */
+ ARBEL_NATIVE, /* MT25208 with extended features */
+ SINAI /* MT25204 */
+};
+
+#define MTHCA_FW_VER(major, minor, subminor) \
+ (((u64) (major) << 32) | ((u64) (minor) << 16) | (u64) (subminor))
+
+static struct {
+ u64 latest_fw;
+ u32 flags;
+} mthca_hca_table[] = {
+ [TAVOR] = { .latest_fw = MTHCA_FW_VER(3, 5, 0),
+ .flags = 0 },
+ [ARBEL_COMPAT] = { .latest_fw = MTHCA_FW_VER(4, 8, 200),
+ .flags = MTHCA_FLAG_PCIE },
+ [ARBEL_NATIVE] = { .latest_fw = MTHCA_FW_VER(5, 3, 0),
+ .flags = MTHCA_FLAG_MEMFREE |
+ MTHCA_FLAG_PCIE },
+ [SINAI] = { .latest_fw = MTHCA_FW_VER(1, 2, 0),
+ .flags = MTHCA_FLAG_MEMFREE |
+ MTHCA_FLAG_PCIE |
+ MTHCA_FLAG_SINAI_OPT }
+};
+
+static int __mthca_init_one(struct pci_dev *pdev, int hca_type)
+{
+ int ddr_hidden = 0;
+ int err;
+ struct mthca_dev *mdev;
+
+ printk(KERN_INFO PFX "Initializing %s\n",
+ pci_name(pdev));
+
+ err = pci_enable_device(pdev);
+ if (err) {
+ dev_err(&pdev->dev, "Cannot enable PCI device, "
+ "aborting.\n");
+ return err;
+ }
+
+ /*
+ * Check for BARs. We expect 0: 1MB, 2: 8MB, 4: DDR (may not
+ * be present)
+ */
+ if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM) ||
+ pci_resource_len(pdev, 0) != 1 << 20) {
+ dev_err(&pdev->dev, "Missing DCS, aborting.\n");
+ err = -ENODEV;
+ goto err_disable_pdev;
+ }
+ if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) {
+ dev_err(&pdev->dev, "Missing UAR, aborting.\n");
+ err = -ENODEV;
+ goto err_disable_pdev;
+ }
+ if (!(pci_resource_flags(pdev, 4) & IORESOURCE_MEM))
+ ddr_hidden = 1;
+
+ err = pci_request_regions(pdev, DRV_NAME);
+ if (err) {
+ dev_err(&pdev->dev, "Cannot obtain PCI resources, "
+ "aborting.\n");
+ goto err_disable_pdev;
+ }
+
+ pci_set_master(pdev);
+
+ err = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+ if (err) {
+ dev_warn(&pdev->dev, "Warning: couldn't set 64-bit PCI DMA mask.\n");
+ err = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+ if (err) {
+ dev_err(&pdev->dev, "Can't set PCI DMA mask, aborting.\n");
+ goto err_free_res;
+ }
+ }
+ err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+ if (err) {
+ dev_warn(&pdev->dev, "Warning: couldn't set 64-bit "
+ "consistent PCI DMA mask.\n");
+ err = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+ if (err) {
+ dev_err(&pdev->dev, "Can't set consistent PCI DMA mask, "
+ "aborting.\n");
+ goto err_free_res;
+ }
+ }
+
+ /* We can handle large RDMA requests, so allow larger segments. */
+ dma_set_max_seg_size(&pdev->dev, 1024 * 1024 * 1024);
+
+ mdev = (struct mthca_dev *) ib_alloc_device(sizeof *mdev);
+ if (!mdev) {
+ dev_err(&pdev->dev, "Device struct alloc failed, "
+ "aborting.\n");
+ err = -ENOMEM;
+ goto err_free_res;
+ }
+
+ mdev->pdev = pdev;
+
+ mdev->mthca_flags = mthca_hca_table[hca_type].flags;
+ if (ddr_hidden)
+ mdev->mthca_flags |= MTHCA_FLAG_DDR_HIDDEN;
+
+ /*
+ * Now reset the HCA before we touch the PCI capabilities or
+ * attempt a firmware command, since a boot ROM may have left
+ * the HCA in an undefined state.
+ */
+ err = mthca_reset(mdev);
+ if (err) {
+ mthca_err(mdev, "Failed to reset HCA, aborting.\n");
+ goto err_free_dev;
+ }
+
+	err = mthca_cmd_init(mdev);
+	if (err) {
+		mthca_err(mdev, "Failed to init command interface, aborting.\n");
+		goto err_free_dev;
+	}
+
+ err = mthca_tune_pci(mdev);
+ if (err)
+ goto err_cmd;
+
+ err = mthca_init_hca(mdev);
+ if (err)
+ goto err_cmd;
+
+ if (mdev->fw_ver < mthca_hca_table[hca_type].latest_fw) {
+ mthca_warn(mdev, "HCA FW version %d.%d.%03d is old (%d.%d.%03d is current).\n",
+ (int) (mdev->fw_ver >> 32), (int) (mdev->fw_ver >> 16) & 0xffff,
+ (int) (mdev->fw_ver & 0xffff),
+ (int) (mthca_hca_table[hca_type].latest_fw >> 32),
+ (int) (mthca_hca_table[hca_type].latest_fw >> 16) & 0xffff,
+ (int) (mthca_hca_table[hca_type].latest_fw & 0xffff));
+ mthca_warn(mdev, "If you have problems, try updating your HCA FW.\n");
+ }
+
+ if (msi_x && !mthca_enable_msi_x(mdev))
+ mdev->mthca_flags |= MTHCA_FLAG_MSI_X;
+
+ err = mthca_setup_hca(mdev);
+ if (err == -EBUSY && (mdev->mthca_flags & MTHCA_FLAG_MSI_X)) {
+ if (mdev->mthca_flags & MTHCA_FLAG_MSI_X)
+ pci_disable_msix(pdev);
+ mdev->mthca_flags &= ~MTHCA_FLAG_MSI_X;
+
+ err = mthca_setup_hca(mdev);
+ }
+
+ if (err)
+ goto err_close;
+
+ err = mthca_register_device(mdev);
+ if (err)
+ goto err_cleanup;
+
+ err = mthca_create_agents(mdev);
+ if (err)
+ goto err_unregister;
+
+ pci_set_drvdata(pdev, mdev);
+ mdev->hca_type = hca_type;
+
+ mdev->active = true;
+
+ return 0;
+
+err_unregister:
+ mthca_unregister_device(mdev);
+
+err_cleanup:
+ mthca_cleanup_mcg_table(mdev);
+ mthca_cleanup_av_table(mdev);
+ mthca_cleanup_qp_table(mdev);
+ mthca_cleanup_srq_table(mdev);
+ mthca_cleanup_cq_table(mdev);
+ mthca_cmd_use_polling(mdev);
+ mthca_cleanup_eq_table(mdev);
+
+ mthca_pd_free(mdev, &mdev->driver_pd);
+
+ mthca_cleanup_mr_table(mdev);
+ mthca_cleanup_pd_table(mdev);
+ mthca_cleanup_uar_table(mdev);
+
+err_close:
+ if (mdev->mthca_flags & MTHCA_FLAG_MSI_X)
+ pci_disable_msix(pdev);
+
+ mthca_close_hca(mdev);
+
+err_cmd:
+ mthca_cmd_cleanup(mdev);
+
+err_free_dev:
+ ib_dealloc_device(&mdev->ib_dev);
+
+err_free_res:
+ pci_release_regions(pdev);
+
+err_disable_pdev:
+ pci_disable_device(pdev);
+ pci_set_drvdata(pdev, NULL);
+ return err;
+}
+
+static void __mthca_remove_one(struct pci_dev *pdev)
+{
+ struct mthca_dev *mdev = pci_get_drvdata(pdev);
+ int p;
+
+ if (mdev) {
+ mthca_free_agents(mdev);
+ mthca_unregister_device(mdev);
+
+ for (p = 1; p <= mdev->limits.num_ports; ++p)
+ mthca_CLOSE_IB(mdev, p);
+
+ mthca_cleanup_mcg_table(mdev);
+ mthca_cleanup_av_table(mdev);
+ mthca_cleanup_qp_table(mdev);
+ mthca_cleanup_srq_table(mdev);
+ mthca_cleanup_cq_table(mdev);
+ mthca_cmd_use_polling(mdev);
+ mthca_cleanup_eq_table(mdev);
+
+ mthca_pd_free(mdev, &mdev->driver_pd);
+
+ mthca_cleanup_mr_table(mdev);
+ mthca_cleanup_pd_table(mdev);
+
+ iounmap(mdev->kar);
+ mthca_uar_free(mdev, &mdev->driver_uar);
+ mthca_cleanup_uar_table(mdev);
+ mthca_close_hca(mdev);
+ mthca_cmd_cleanup(mdev);
+
+ if (mdev->mthca_flags & MTHCA_FLAG_MSI_X)
+ pci_disable_msix(pdev);
+
+ ib_dealloc_device(&mdev->ib_dev);
+ pci_release_regions(pdev);
+ pci_disable_device(pdev);
+ pci_set_drvdata(pdev, NULL);
+ }
+}
+
+int __mthca_restart_one(struct pci_dev *pdev)
+{
+ struct mthca_dev *mdev;
+ int hca_type;
+
+ mdev = pci_get_drvdata(pdev);
+ if (!mdev)
+ return -ENODEV;
+ hca_type = mdev->hca_type;
+ __mthca_remove_one(pdev);
+ return __mthca_init_one(pdev, hca_type);
+}
+
+static int mthca_init_one(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+ int ret;
+
+ mutex_lock(&mthca_device_mutex);
+
+ printk_once(KERN_INFO "%s", mthca_version);
+
+ if (id->driver_data >= ARRAY_SIZE(mthca_hca_table)) {
+ printk(KERN_ERR PFX "%s has invalid driver data %lx\n",
+ pci_name(pdev), id->driver_data);
+ mutex_unlock(&mthca_device_mutex);
+ return -ENODEV;
+ }
+
+ ret = __mthca_init_one(pdev, id->driver_data);
+
+ mutex_unlock(&mthca_device_mutex);
+
+ return ret;
+}
+
+static void mthca_remove_one(struct pci_dev *pdev)
+{
+ mutex_lock(&mthca_device_mutex);
+ __mthca_remove_one(pdev);
+ mutex_unlock(&mthca_device_mutex);
+}
+
+static struct pci_device_id mthca_pci_table[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_TAVOR),
+ .driver_data = TAVOR },
+ { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_TAVOR),
+ .driver_data = TAVOR },
+ { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT),
+ .driver_data = ARBEL_COMPAT },
+ { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT),
+ .driver_data = ARBEL_COMPAT },
+ { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_ARBEL),
+ .driver_data = ARBEL_NATIVE },
+ { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_ARBEL),
+ .driver_data = ARBEL_NATIVE },
+ { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_SINAI),
+ .driver_data = SINAI },
+ { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_SINAI),
+ .driver_data = SINAI },
+ { PCI_DEVICE(PCI_VENDOR_ID_MELLANOX, PCI_DEVICE_ID_MELLANOX_SINAI_OLD),
+ .driver_data = SINAI },
+ { PCI_DEVICE(PCI_VENDOR_ID_TOPSPIN, PCI_DEVICE_ID_MELLANOX_SINAI_OLD),
+ .driver_data = SINAI },
+ { 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, mthca_pci_table);
+
+static struct pci_driver mthca_driver = {
+ .name = DRV_NAME,
+ .id_table = mthca_pci_table,
+ .probe = mthca_init_one,
+ .remove = mthca_remove_one,
+};
+
+static void __init __mthca_check_profile_val(const char *name, int *pval,
+ int pval_default)
+{
+ /* value must be positive and power of 2 */
+ int old_pval = *pval;
+
+ if (old_pval <= 0)
+ *pval = pval_default;
+ else
+ *pval = roundup_pow_of_two(old_pval);
+
+ if (old_pval != *pval) {
+ printk(KERN_WARNING PFX "Invalid value %d for %s in module parameter.\n",
+ old_pval, name);
+ printk(KERN_WARNING PFX "Corrected %s to %d.\n", name, *pval);
+ }
+}
+
+#define mthca_check_profile_val(name, default) \
+ __mthca_check_profile_val(#name, &hca_profile.name, default)
+
+static void __init mthca_validate_profile(void)
+{
+ mthca_check_profile_val(num_qp, MTHCA_DEFAULT_NUM_QP);
+ mthca_check_profile_val(rdb_per_qp, MTHCA_DEFAULT_RDB_PER_QP);
+ mthca_check_profile_val(num_cq, MTHCA_DEFAULT_NUM_CQ);
+ mthca_check_profile_val(num_mcg, MTHCA_DEFAULT_NUM_MCG);
+ mthca_check_profile_val(num_mpt, MTHCA_DEFAULT_NUM_MPT);
+ mthca_check_profile_val(num_mtt, MTHCA_DEFAULT_NUM_MTT);
+ mthca_check_profile_val(num_udav, MTHCA_DEFAULT_NUM_UDAV);
+ mthca_check_profile_val(fmr_reserved_mtts, MTHCA_DEFAULT_NUM_RESERVED_MTTS);
+
+ if (hca_profile.fmr_reserved_mtts >= hca_profile.num_mtt) {
+ printk(KERN_WARNING PFX "Invalid fmr_reserved_mtts module parameter %d.\n",
+ hca_profile.fmr_reserved_mtts);
+ printk(KERN_WARNING PFX "(Must be smaller than num_mtt %d)\n",
+ hca_profile.num_mtt);
+ hca_profile.fmr_reserved_mtts = hca_profile.num_mtt / 2;
+ printk(KERN_WARNING PFX "Corrected fmr_reserved_mtts to %d.\n",
+ hca_profile.fmr_reserved_mtts);
+ }
+
+ if ((log_mtts_per_seg < 1) || (log_mtts_per_seg > 5)) {
+ printk(KERN_WARNING PFX "bad log_mtts_per_seg (%d). Using default - %d\n",
+ log_mtts_per_seg, ilog2(MTHCA_MTT_SEG_SIZE / 8));
+ log_mtts_per_seg = ilog2(MTHCA_MTT_SEG_SIZE / 8);
+ }
+}
+
+static int __init mthca_init(void)
+{
+ int ret;
+
+ mthca_validate_profile();
+
+ ret = mthca_catas_init();
+ if (ret)
+ return ret;
+
+ ret = pci_register_driver(&mthca_driver);
+ if (ret < 0) {
+ mthca_catas_cleanup();
+ return ret;
+ }
+
+ return 0;
+}
+
+static void __exit mthca_cleanup(void)
+{
+ pci_unregister_driver(&mthca_driver);
+ mthca_catas_cleanup();
+}
+
+module_init(mthca_init);
+module_exit(mthca_cleanup);
diff --git a/drivers/infiniband/hw/mthca/mthca_mcg.c b/drivers/infiniband/hw/mthca/mthca_mcg.c
new file mode 100644
index 000000000..6304ae8f4
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_mcg.c
@@ -0,0 +1,335 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/string.h>
+#include <linux/gfp.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+struct mthca_mgm {
+ __be32 next_gid_index;
+ u32 reserved[3];
+ u8 gid[16];
+ __be32 qp[MTHCA_QP_PER_MGM];
+};
+
+static const u8 zero_gid[16]; /* automatically initialized to 0 */
+
+/*
+ * Caller must hold MCG table semaphore. gid and mgm parameters must
+ * be properly aligned for command interface.
+ *
+ * Returns 0 unless a firmware command error occurs.
+ *
+ * If GID is found in MGM or MGM is empty, *index = *hash, *prev = -1
+ * and *mgm holds MGM entry.
+ *
+ * If GID is found in AMGM, *index = index in AMGM, *prev = index of
+ * previous entry in hash chain and *mgm holds AMGM entry.
+ *
+ * If no AMGM exists for given gid, *index = -1, *prev = index of last
+ * entry in hash chain and *mgm holds end of hash chain.
+ */
+static int find_mgm(struct mthca_dev *dev,
+ u8 *gid, struct mthca_mailbox *mgm_mailbox,
+ u16 *hash, int *prev, int *index)
+{
+ struct mthca_mailbox *mailbox;
+ struct mthca_mgm *mgm = mgm_mailbox->buf;
+ u8 *mgid;
+ int err;
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return -ENOMEM;
+ mgid = mailbox->buf;
+
+ memcpy(mgid, gid, 16);
+
+ err = mthca_MGID_HASH(dev, mailbox, hash);
+ if (err) {
+ mthca_err(dev, "MGID_HASH failed (%d)\n", err);
+ goto out;
+ }
+
+ if (0)
+ mthca_dbg(dev, "Hash for %pI6 is %04x\n", gid, *hash);
+
+ *index = *hash;
+ *prev = -1;
+
+ do {
+ err = mthca_READ_MGM(dev, *index, mgm_mailbox);
+ if (err) {
+ mthca_err(dev, "READ_MGM failed (%d)\n", err);
+ goto out;
+ }
+
+ if (!memcmp(mgm->gid, zero_gid, 16)) {
+ if (*index != *hash) {
+ mthca_err(dev, "Found zero MGID in AMGM.\n");
+ err = -EINVAL;
+ }
+ goto out;
+ }
+
+ if (!memcmp(mgm->gid, gid, 16))
+ goto out;
+
+ *prev = *index;
+ *index = be32_to_cpu(mgm->next_gid_index) >> 6;
+ } while (*index);
+
+ *index = -1;
+
+ out:
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+}
+
+int mthca_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ struct mthca_mailbox *mailbox;
+ struct mthca_mgm *mgm;
+ u16 hash;
+ int index, prev;
+ int link = 0;
+ int i;
+ int err;
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ mgm = mailbox->buf;
+
+ mutex_lock(&dev->mcg_table.mutex);
+
+ err = find_mgm(dev, gid->raw, mailbox, &hash, &prev, &index);
+ if (err)
+ goto out;
+
+ if (index != -1) {
+ if (!memcmp(mgm->gid, zero_gid, 16))
+ memcpy(mgm->gid, gid->raw, 16);
+ } else {
+ link = 1;
+
+ index = mthca_alloc(&dev->mcg_table.alloc);
+ if (index == -1) {
+ mthca_err(dev, "No AMGM entries left\n");
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = mthca_READ_MGM(dev, index, mailbox);
+ if (err) {
+ mthca_err(dev, "READ_MGM failed (%d)\n", err);
+ goto out;
+ }
+ memset(mgm, 0, sizeof *mgm);
+ memcpy(mgm->gid, gid->raw, 16);
+ }
+
+ for (i = 0; i < MTHCA_QP_PER_MGM; ++i)
+ if (mgm->qp[i] == cpu_to_be32(ibqp->qp_num | (1 << 31))) {
+ mthca_dbg(dev, "QP %06x already a member of MGM\n",
+ ibqp->qp_num);
+ err = 0;
+ goto out;
+ } else if (!(mgm->qp[i] & cpu_to_be32(1 << 31))) {
+ mgm->qp[i] = cpu_to_be32(ibqp->qp_num | (1 << 31));
+ break;
+ }
+
+ if (i == MTHCA_QP_PER_MGM) {
+ mthca_err(dev, "MGM at index %x is full.\n", index);
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = mthca_WRITE_MGM(dev, index, mailbox);
+ if (err) {
+ mthca_err(dev, "WRITE_MGM failed %d\n", err);
+ err = -EINVAL;
+ goto out;
+ }
+
+ if (!link)
+ goto out;
+
+ err = mthca_READ_MGM(dev, prev, mailbox);
+ if (err) {
+ mthca_err(dev, "READ_MGM failed %d\n", err);
+ goto out;
+ }
+
+ mgm->next_gid_index = cpu_to_be32(index << 6);
+
+ err = mthca_WRITE_MGM(dev, prev, mailbox);
+ if (err)
+ mthca_err(dev, "WRITE_MGM returned %d\n", err);
+
+ out:
+ if (err && link && index != -1) {
+ BUG_ON(index < dev->limits.num_mgms);
+ mthca_free(&dev->mcg_table.alloc, index);
+ }
+ mutex_unlock(&dev->mcg_table.mutex);
+
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+}
+
+int mthca_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ struct mthca_mailbox *mailbox;
+ struct mthca_mgm *mgm;
+ u16 hash;
+ int prev, index;
+ int i, loc;
+ int err;
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ mgm = mailbox->buf;
+
+ mutex_lock(&dev->mcg_table.mutex);
+
+ err = find_mgm(dev, gid->raw, mailbox, &hash, &prev, &index);
+ if (err)
+ goto out;
+
+ if (index == -1) {
+ mthca_err(dev, "MGID %pI6 not found\n", gid->raw);
+ err = -EINVAL;
+ goto out;
+ }
+
+ for (loc = -1, i = 0; i < MTHCA_QP_PER_MGM; ++i) {
+ if (mgm->qp[i] == cpu_to_be32(ibqp->qp_num | (1 << 31)))
+ loc = i;
+ if (!(mgm->qp[i] & cpu_to_be32(1 << 31)))
+ break;
+ }
+
+ if (loc == -1) {
+ mthca_err(dev, "QP %06x not found in MGM\n", ibqp->qp_num);
+ err = -EINVAL;
+ goto out;
+ }
+
+ mgm->qp[loc] = mgm->qp[i - 1];
+ mgm->qp[i - 1] = 0;
+
+ err = mthca_WRITE_MGM(dev, index, mailbox);
+ if (err) {
+ mthca_err(dev, "WRITE_MGM returned %d\n", err);
+ goto out;
+ }
+
+ if (i != 1)
+ goto out;
+
+ if (prev == -1) {
+ /* Remove entry from MGM */
+ int amgm_index_to_free = be32_to_cpu(mgm->next_gid_index) >> 6;
+ if (amgm_index_to_free) {
+ err = mthca_READ_MGM(dev, amgm_index_to_free,
+ mailbox);
+ if (err) {
+ mthca_err(dev, "READ_MGM returned %d\n", err);
+ goto out;
+ }
+ } else
+ memset(mgm->gid, 0, 16);
+
+ err = mthca_WRITE_MGM(dev, index, mailbox);
+ if (err) {
+ mthca_err(dev, "WRITE_MGM returned %d\n", err);
+ goto out;
+ }
+ if (amgm_index_to_free) {
+ BUG_ON(amgm_index_to_free < dev->limits.num_mgms);
+ mthca_free(&dev->mcg_table.alloc, amgm_index_to_free);
+ }
+ } else {
+ /* Remove entry from AMGM */
+ int curr_next_index = be32_to_cpu(mgm->next_gid_index) >> 6;
+ err = mthca_READ_MGM(dev, prev, mailbox);
+ if (err) {
+ mthca_err(dev, "READ_MGM returned %d\n", err);
+ goto out;
+ }
+
+ mgm->next_gid_index = cpu_to_be32(curr_next_index << 6);
+
+ err = mthca_WRITE_MGM(dev, prev, mailbox);
+ if (err) {
+ mthca_err(dev, "WRITE_MGM returned %d\n", err);
+ goto out;
+ }
+ BUG_ON(index < dev->limits.num_mgms);
+ mthca_free(&dev->mcg_table.alloc, index);
+ }
+
+ out:
+ mutex_unlock(&dev->mcg_table.mutex);
+
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+}
+
+int mthca_init_mcg_table(struct mthca_dev *dev)
+{
+ int err;
+ int table_size = dev->limits.num_mgms + dev->limits.num_amgms;
+
+ err = mthca_alloc_init(&dev->mcg_table.alloc,
+ table_size,
+ table_size - 1,
+ dev->limits.num_mgms);
+ if (err)
+ return err;
+
+ mutex_init(&dev->mcg_table.mutex);
+
+ return 0;
+}
+
+void mthca_cleanup_mcg_table(struct mthca_dev *dev)
+{
+ mthca_alloc_cleanup(&dev->mcg_table.alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.c b/drivers/infiniband/hw/mthca/mthca_memfree.c
new file mode 100644
index 000000000..7d2e42dd6
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.c
@@ -0,0 +1,760 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mm.h>
+#include <linux/scatterlist.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include <asm/page.h>
+
+#include "mthca_memfree.h"
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+/*
+ * We allocate in as big chunks as we can, up to a maximum of 256 KB
+ * per chunk.
+ */
+enum {
+ MTHCA_ICM_ALLOC_SIZE = 1 << 18,
+ MTHCA_TABLE_CHUNK_SIZE = 1 << 18
+};
+
+struct mthca_user_db_table {
+ struct mutex mutex;
+ struct {
+ u64 uvirt;
+ struct scatterlist mem;
+ int refcount;
+ } page[0];
+};
+
+static void mthca_free_icm_pages(struct mthca_dev *dev, struct mthca_icm_chunk *chunk)
+{
+ int i;
+
+ if (chunk->nsg > 0)
+ pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages,
+ PCI_DMA_BIDIRECTIONAL);
+
+ for (i = 0; i < chunk->npages; ++i)
+ __free_pages(sg_page(&chunk->mem[i]),
+ get_order(chunk->mem[i].length));
+}
+
+static void mthca_free_icm_coherent(struct mthca_dev *dev, struct mthca_icm_chunk *chunk)
+{
+ int i;
+
+ for (i = 0; i < chunk->npages; ++i) {
+ dma_free_coherent(&dev->pdev->dev, chunk->mem[i].length,
+ lowmem_page_address(sg_page(&chunk->mem[i])),
+ sg_dma_address(&chunk->mem[i]));
+ }
+}
+
+void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm, int coherent)
+{
+ struct mthca_icm_chunk *chunk, *tmp;
+
+ if (!icm)
+ return;
+
+ list_for_each_entry_safe(chunk, tmp, &icm->chunk_list, list) {
+ if (coherent)
+ mthca_free_icm_coherent(dev, chunk);
+ else
+ mthca_free_icm_pages(dev, chunk);
+
+ kfree(chunk);
+ }
+
+ kfree(icm);
+}
+
+static int mthca_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_mask)
+{
+ struct page *page;
+
+ /*
+ * Use __GFP_ZERO because buggy firmware assumes ICM pages are
+ * cleared, and subtle failures are seen if they aren't.
+ */
+ page = alloc_pages(gfp_mask | __GFP_ZERO, order);
+ if (!page)
+ return -ENOMEM;
+
+ sg_set_page(mem, page, PAGE_SIZE << order, 0);
+ return 0;
+}
+
+static int mthca_alloc_icm_coherent(struct device *dev, struct scatterlist *mem,
+ int order, gfp_t gfp_mask)
+{
+ void *buf = dma_alloc_coherent(dev, PAGE_SIZE << order, &sg_dma_address(mem),
+ gfp_mask);
+ if (!buf)
+ return -ENOMEM;
+
+ sg_set_buf(mem, buf, PAGE_SIZE << order);
+ BUG_ON(mem->offset);
+ sg_dma_len(mem) = PAGE_SIZE << order;
+ return 0;
+}
+
+struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages,
+ gfp_t gfp_mask, int coherent)
+{
+ struct mthca_icm *icm;
+ struct mthca_icm_chunk *chunk = NULL;
+ int cur_order;
+ int ret;
+
+ /* We use sg_set_buf for coherent allocs, which assumes low memory */
+ BUG_ON(coherent && (gfp_mask & __GFP_HIGHMEM));
+
+ icm = kmalloc(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
+ if (!icm)
+ return icm;
+
+ icm->refcount = 0;
+ INIT_LIST_HEAD(&icm->chunk_list);
+
+ cur_order = get_order(MTHCA_ICM_ALLOC_SIZE);
+
+ while (npages > 0) {
+ if (!chunk) {
+ chunk = kmalloc(sizeof *chunk,
+ gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
+ if (!chunk)
+ goto fail;
+
+ sg_init_table(chunk->mem, MTHCA_ICM_CHUNK_LEN);
+ chunk->npages = 0;
+ chunk->nsg = 0;
+ list_add_tail(&chunk->list, &icm->chunk_list);
+ }
+
+ while (1 << cur_order > npages)
+ --cur_order;
+
+ if (coherent)
+ ret = mthca_alloc_icm_coherent(&dev->pdev->dev,
+ &chunk->mem[chunk->npages],
+ cur_order, gfp_mask);
+ else
+ ret = mthca_alloc_icm_pages(&chunk->mem[chunk->npages],
+ cur_order, gfp_mask);
+
+ if (!ret) {
+ ++chunk->npages;
+
+ if (coherent)
+ ++chunk->nsg;
+ else if (chunk->npages == MTHCA_ICM_CHUNK_LEN) {
+ chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
+ chunk->npages,
+ PCI_DMA_BIDIRECTIONAL);
+
+ if (chunk->nsg <= 0)
+ goto fail;
+ }
+
+ if (chunk->npages == MTHCA_ICM_CHUNK_LEN)
+ chunk = NULL;
+
+ npages -= 1 << cur_order;
+ } else {
+ --cur_order;
+ if (cur_order < 0)
+ goto fail;
+ }
+ }
+
+ if (!coherent && chunk) {
+ chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
+ chunk->npages,
+ PCI_DMA_BIDIRECTIONAL);
+
+ if (chunk->nsg <= 0)
+ goto fail;
+ }
+
+ return icm;
+
+fail:
+ mthca_free_icm(dev, icm, coherent);
+ return NULL;
+}
+
+int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int obj)
+{
+ int i = (obj & (table->num_obj - 1)) * table->obj_size / MTHCA_TABLE_CHUNK_SIZE;
+ int ret = 0;
+
+ mutex_lock(&table->mutex);
+
+ if (table->icm[i]) {
+ ++table->icm[i]->refcount;
+ goto out;
+ }
+
+ table->icm[i] = mthca_alloc_icm(dev, MTHCA_TABLE_CHUNK_SIZE >> PAGE_SHIFT,
+ (table->lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
+ __GFP_NOWARN, table->coherent);
+ if (!table->icm[i]) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (mthca_MAP_ICM(dev, table->icm[i],
+ table->virt + i * MTHCA_TABLE_CHUNK_SIZE)) {
+ mthca_free_icm(dev, table->icm[i], table->coherent);
+ table->icm[i] = NULL;
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ++table->icm[i]->refcount;
+
+out:
+ mutex_unlock(&table->mutex);
+ return ret;
+}
+
+void mthca_table_put(struct mthca_dev *dev, struct mthca_icm_table *table, int obj)
+{
+ int i;
+
+ if (!mthca_is_memfree(dev))
+ return;
+
+ i = (obj & (table->num_obj - 1)) * table->obj_size / MTHCA_TABLE_CHUNK_SIZE;
+
+ mutex_lock(&table->mutex);
+
+ if (--table->icm[i]->refcount == 0) {
+ mthca_UNMAP_ICM(dev, table->virt + i * MTHCA_TABLE_CHUNK_SIZE,
+ MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE);
+ mthca_free_icm(dev, table->icm[i], table->coherent);
+ table->icm[i] = NULL;
+ }
+
+ mutex_unlock(&table->mutex);
+}
+
+void *mthca_table_find(struct mthca_icm_table *table, int obj, dma_addr_t *dma_handle)
+{
+ int idx, offset, dma_offset, i;
+ struct mthca_icm_chunk *chunk;
+ struct mthca_icm *icm;
+ struct page *page = NULL;
+
+ if (!table->lowmem)
+ return NULL;
+
+ mutex_lock(&table->mutex);
+
+ idx = (obj & (table->num_obj - 1)) * table->obj_size;
+ icm = table->icm[idx / MTHCA_TABLE_CHUNK_SIZE];
+ dma_offset = offset = idx % MTHCA_TABLE_CHUNK_SIZE;
+
+ if (!icm)
+ goto out;
+
+ list_for_each_entry(chunk, &icm->chunk_list, list) {
+ for (i = 0; i < chunk->npages; ++i) {
+ if (dma_handle && dma_offset >= 0) {
+ if (sg_dma_len(&chunk->mem[i]) > dma_offset)
+ *dma_handle = sg_dma_address(&chunk->mem[i]) +
+ dma_offset;
+ dma_offset -= sg_dma_len(&chunk->mem[i]);
+ }
+ /* DMA mapping can merge pages but not split them,
+ * so if we found the page, dma_handle has already
+ * been assigned to. */
+ if (chunk->mem[i].length > offset) {
+ page = sg_page(&chunk->mem[i]);
+ goto out;
+ }
+ offset -= chunk->mem[i].length;
+ }
+ }
+
+out:
+ mutex_unlock(&table->mutex);
+ return page ? lowmem_page_address(page) + offset : NULL;
+}
+
+int mthca_table_get_range(struct mthca_dev *dev, struct mthca_icm_table *table,
+ int start, int end)
+{
+ int inc = MTHCA_TABLE_CHUNK_SIZE / table->obj_size;
+ int i, err;
+
+ for (i = start; i <= end; i += inc) {
+ err = mthca_table_get(dev, table, i);
+ if (err)
+ goto fail;
+ }
+
+ return 0;
+
+fail:
+ while (i > start) {
+ i -= inc;
+ mthca_table_put(dev, table, i);
+ }
+
+ return err;
+}
+
+void mthca_table_put_range(struct mthca_dev *dev, struct mthca_icm_table *table,
+ int start, int end)
+{
+ int i;
+
+ if (!mthca_is_memfree(dev))
+ return;
+
+ for (i = start; i <= end; i += MTHCA_TABLE_CHUNK_SIZE / table->obj_size)
+ mthca_table_put(dev, table, i);
+}
+
+struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev,
+ u64 virt, int obj_size,
+ int nobj, int reserved,
+ int use_lowmem, int use_coherent)
+{
+ struct mthca_icm_table *table;
+ int obj_per_chunk;
+ int num_icm;
+ unsigned chunk_size;
+ int i;
+
+ obj_per_chunk = MTHCA_TABLE_CHUNK_SIZE / obj_size;
+ num_icm = DIV_ROUND_UP(nobj, obj_per_chunk);
+
+ table = kmalloc(sizeof *table + num_icm * sizeof *table->icm, GFP_KERNEL);
+ if (!table)
+ return NULL;
+
+ table->virt = virt;
+ table->num_icm = num_icm;
+ table->num_obj = nobj;
+ table->obj_size = obj_size;
+ table->lowmem = use_lowmem;
+ table->coherent = use_coherent;
+ mutex_init(&table->mutex);
+
+ for (i = 0; i < num_icm; ++i)
+ table->icm[i] = NULL;
+
+ for (i = 0; i * MTHCA_TABLE_CHUNK_SIZE < reserved * obj_size; ++i) {
+ chunk_size = MTHCA_TABLE_CHUNK_SIZE;
+ if ((i + 1) * MTHCA_TABLE_CHUNK_SIZE > nobj * obj_size)
+ chunk_size = nobj * obj_size - i * MTHCA_TABLE_CHUNK_SIZE;
+
+ table->icm[i] = mthca_alloc_icm(dev, chunk_size >> PAGE_SHIFT,
+ (use_lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
+ __GFP_NOWARN, use_coherent);
+ if (!table->icm[i])
+ goto err;
+ if (mthca_MAP_ICM(dev, table->icm[i],
+ virt + i * MTHCA_TABLE_CHUNK_SIZE)) {
+ mthca_free_icm(dev, table->icm[i], table->coherent);
+ table->icm[i] = NULL;
+ goto err;
+ }
+
+ /*
+ * Add a reference to this ICM chunk so that it never
+ * gets freed (since it contains reserved firmware objects).
+ */
+ ++table->icm[i]->refcount;
+ }
+
+ return table;
+
+err:
+ for (i = 0; i < num_icm; ++i)
+ if (table->icm[i]) {
+ mthca_UNMAP_ICM(dev, virt + i * MTHCA_TABLE_CHUNK_SIZE,
+ MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE);
+ mthca_free_icm(dev, table->icm[i], table->coherent);
+ }
+
+ kfree(table);
+
+ return NULL;
+}
+
+void mthca_free_icm_table(struct mthca_dev *dev, struct mthca_icm_table *table)
+{
+ int i;
+
+ for (i = 0; i < table->num_icm; ++i)
+ if (table->icm[i]) {
+ mthca_UNMAP_ICM(dev,
+ table->virt + i * MTHCA_TABLE_CHUNK_SIZE,
+ MTHCA_TABLE_CHUNK_SIZE / MTHCA_ICM_PAGE_SIZE);
+ mthca_free_icm(dev, table->icm[i], table->coherent);
+ }
+
+ kfree(table);
+}
+
+static u64 mthca_uarc_virt(struct mthca_dev *dev, struct mthca_uar *uar, int page)
+{
+ return dev->uar_table.uarc_base +
+ uar->index * dev->uar_table.uarc_size +
+ page * MTHCA_ICM_PAGE_SIZE;
+}
+
+int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
+ struct mthca_user_db_table *db_tab, int index, u64 uaddr)
+{
+ struct page *pages[1];
+ int ret = 0;
+ int i;
+
+ if (!mthca_is_memfree(dev))
+ return 0;
+
+ if (index < 0 || index > dev->uar_table.uarc_size / 8)
+ return -EINVAL;
+
+ mutex_lock(&db_tab->mutex);
+
+ i = index / MTHCA_DB_REC_PER_PAGE;
+
+ if ((db_tab->page[i].refcount >= MTHCA_DB_REC_PER_PAGE) ||
+ (db_tab->page[i].uvirt && db_tab->page[i].uvirt != uaddr) ||
+ (uaddr & 4095)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (db_tab->page[i].refcount) {
+ ++db_tab->page[i].refcount;
+ goto out;
+ }
+
+ ret = get_user_pages(current, current->mm, uaddr & PAGE_MASK, 1, 1, 0,
+ pages, NULL);
+ if (ret < 0)
+ goto out;
+
+ sg_set_page(&db_tab->page[i].mem, pages[0], MTHCA_ICM_PAGE_SIZE,
+ uaddr & ~PAGE_MASK);
+
+ ret = pci_map_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
+ if (ret < 0) {
+ put_page(pages[0]);
+ goto out;
+ }
+
+ ret = mthca_MAP_ICM_page(dev, sg_dma_address(&db_tab->page[i].mem),
+ mthca_uarc_virt(dev, uar, i));
+ if (ret) {
+ pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
+ put_page(sg_page(&db_tab->page[i].mem));
+ goto out;
+ }
+
+ db_tab->page[i].uvirt = uaddr;
+ db_tab->page[i].refcount = 1;
+
+out:
+ mutex_unlock(&db_tab->mutex);
+ return ret;
+}
+
+void mthca_unmap_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
+ struct mthca_user_db_table *db_tab, int index)
+{
+ if (!mthca_is_memfree(dev))
+ return;
+
+ /*
+ * To make our bookkeeping simpler, we don't unmap DB
+ * pages until we clean up the whole db table.
+ */
+
+ mutex_lock(&db_tab->mutex);
+
+ --db_tab->page[index / MTHCA_DB_REC_PER_PAGE].refcount;
+
+ mutex_unlock(&db_tab->mutex);
+}
+
+struct mthca_user_db_table *mthca_init_user_db_tab(struct mthca_dev *dev)
+{
+ struct mthca_user_db_table *db_tab;
+ int npages;
+ int i;
+
+ if (!mthca_is_memfree(dev))
+ return NULL;
+
+ npages = dev->uar_table.uarc_size / MTHCA_ICM_PAGE_SIZE;
+ db_tab = kmalloc(sizeof *db_tab + npages * sizeof *db_tab->page, GFP_KERNEL);
+ if (!db_tab)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_init(&db_tab->mutex);
+ for (i = 0; i < npages; ++i) {
+ db_tab->page[i].refcount = 0;
+ db_tab->page[i].uvirt = 0;
+ sg_init_table(&db_tab->page[i].mem, 1);
+ }
+
+ return db_tab;
+}
+
+void mthca_cleanup_user_db_tab(struct mthca_dev *dev, struct mthca_uar *uar,
+ struct mthca_user_db_table *db_tab)
+{
+ int i;
+
+ if (!mthca_is_memfree(dev))
+ return;
+
+ for (i = 0; i < dev->uar_table.uarc_size / MTHCA_ICM_PAGE_SIZE; ++i) {
+ if (db_tab->page[i].uvirt) {
+ mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, uar, i), 1);
+ pci_unmap_sg(dev->pdev, &db_tab->page[i].mem, 1, PCI_DMA_TODEVICE);
+ put_page(sg_page(&db_tab->page[i].mem));
+ }
+ }
+
+ kfree(db_tab);
+}
+
+int mthca_alloc_db(struct mthca_dev *dev, enum mthca_db_type type,
+ u32 qn, __be32 **db)
+{
+ int group;
+ int start, end, dir;
+ int i, j;
+ struct mthca_db_page *page;
+ int ret = 0;
+
+ mutex_lock(&dev->db_tab->mutex);
+
+ switch (type) {
+ case MTHCA_DB_TYPE_CQ_ARM:
+ case MTHCA_DB_TYPE_SQ:
+ group = 0;
+ start = 0;
+ end = dev->db_tab->max_group1;
+ dir = 1;
+ break;
+
+ case MTHCA_DB_TYPE_CQ_SET_CI:
+ case MTHCA_DB_TYPE_RQ:
+ case MTHCA_DB_TYPE_SRQ:
+ group = 1;
+ start = dev->db_tab->npages - 1;
+ end = dev->db_tab->min_group2;
+ dir = -1;
+ break;
+
+ default:
+ ret = -EINVAL;
+ goto out;
+ }
+
+ for (i = start; i != end; i += dir)
+ if (dev->db_tab->page[i].db_rec &&
+ !bitmap_full(dev->db_tab->page[i].used,
+ MTHCA_DB_REC_PER_PAGE)) {
+ page = dev->db_tab->page + i;
+ goto found;
+ }
+
+ for (i = start; i != end; i += dir)
+ if (!dev->db_tab->page[i].db_rec) {
+ page = dev->db_tab->page + i;
+ goto alloc;
+ }
+
+ if (dev->db_tab->max_group1 >= dev->db_tab->min_group2 - 1) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (group == 0)
+ ++dev->db_tab->max_group1;
+ else
+ --dev->db_tab->min_group2;
+
+ page = dev->db_tab->page + end;
+
+alloc:
+ page->db_rec = dma_alloc_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE,
+ &page->mapping, GFP_KERNEL);
+ if (!page->db_rec) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ memset(page->db_rec, 0, MTHCA_ICM_PAGE_SIZE);
+
+ ret = mthca_MAP_ICM_page(dev, page->mapping,
+ mthca_uarc_virt(dev, &dev->driver_uar, i));
+ if (ret) {
+ dma_free_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE,
+ page->db_rec, page->mapping);
+ goto out;
+ }
+
+ bitmap_zero(page->used, MTHCA_DB_REC_PER_PAGE);
+
+found:
+ j = find_first_zero_bit(page->used, MTHCA_DB_REC_PER_PAGE);
+ set_bit(j, page->used);
+
+ if (group == 1)
+ j = MTHCA_DB_REC_PER_PAGE - 1 - j;
+
+ ret = i * MTHCA_DB_REC_PER_PAGE + j;
+
+ page->db_rec[j] = cpu_to_be64((qn << 8) | (type << 5));
+
+ *db = (__be32 *) &page->db_rec[j];
+
+out:
+ mutex_unlock(&dev->db_tab->mutex);
+
+ return ret;
+}
+
+void mthca_free_db(struct mthca_dev *dev, int type, int db_index)
+{
+ int i, j;
+ struct mthca_db_page *page;
+
+ i = db_index / MTHCA_DB_REC_PER_PAGE;
+ j = db_index % MTHCA_DB_REC_PER_PAGE;
+
+ page = dev->db_tab->page + i;
+
+ mutex_lock(&dev->db_tab->mutex);
+
+ page->db_rec[j] = 0;
+ if (i >= dev->db_tab->min_group2)
+ j = MTHCA_DB_REC_PER_PAGE - 1 - j;
+ clear_bit(j, page->used);
+
+ if (bitmap_empty(page->used, MTHCA_DB_REC_PER_PAGE) &&
+ i >= dev->db_tab->max_group1 - 1) {
+ mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, &dev->driver_uar, i), 1);
+
+ dma_free_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE,
+ page->db_rec, page->mapping);
+ page->db_rec = NULL;
+
+ if (i == dev->db_tab->max_group1) {
+ --dev->db_tab->max_group1;
+ /* XXX may be able to unmap more pages now */
+ }
+ if (i == dev->db_tab->min_group2)
+ ++dev->db_tab->min_group2;
+ }
+
+ mutex_unlock(&dev->db_tab->mutex);
+}
+
+int mthca_init_db_tab(struct mthca_dev *dev)
+{
+ int i;
+
+ if (!mthca_is_memfree(dev))
+ return 0;
+
+ dev->db_tab = kmalloc(sizeof *dev->db_tab, GFP_KERNEL);
+ if (!dev->db_tab)
+ return -ENOMEM;
+
+ mutex_init(&dev->db_tab->mutex);
+
+ dev->db_tab->npages = dev->uar_table.uarc_size / MTHCA_ICM_PAGE_SIZE;
+ dev->db_tab->max_group1 = 0;
+ dev->db_tab->min_group2 = dev->db_tab->npages - 1;
+
+ dev->db_tab->page = kmalloc(dev->db_tab->npages *
+ sizeof *dev->db_tab->page,
+ GFP_KERNEL);
+ if (!dev->db_tab->page) {
+ kfree(dev->db_tab);
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < dev->db_tab->npages; ++i)
+ dev->db_tab->page[i].db_rec = NULL;
+
+ return 0;
+}
+
+void mthca_cleanup_db_tab(struct mthca_dev *dev)
+{
+ int i;
+
+ if (!mthca_is_memfree(dev))
+ return;
+
+ /*
+ * Because we don't always free our UARC pages when they
+ * become empty to make mthca_free_db() simpler we need to
+ * make a sweep through the doorbell pages and free any
+ * leftover pages now.
+ */
+ for (i = 0; i < dev->db_tab->npages; ++i) {
+ if (!dev->db_tab->page[i].db_rec)
+ continue;
+
+ if (!bitmap_empty(dev->db_tab->page[i].used, MTHCA_DB_REC_PER_PAGE))
+ mthca_warn(dev, "Kernel UARC page %d not empty\n", i);
+
+ mthca_UNMAP_ICM(dev, mthca_uarc_virt(dev, &dev->driver_uar, i), 1);
+
+ dma_free_coherent(&dev->pdev->dev, MTHCA_ICM_PAGE_SIZE,
+ dev->db_tab->page[i].db_rec,
+ dev->db_tab->page[i].mapping);
+ }
+
+ kfree(dev->db_tab->page);
+ kfree(dev->db_tab);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_memfree.h b/drivers/infiniband/hw/mthca/mthca_memfree.h
new file mode 100644
index 000000000..da9b8f9b8
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_memfree.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_MEMFREE_H
+#define MTHCA_MEMFREE_H
+
+#include <linux/list.h>
+#include <linux/mutex.h>
+
+#define MTHCA_ICM_CHUNK_LEN \
+ ((256 - sizeof (struct list_head) - 2 * sizeof (int)) / \
+ (sizeof (struct scatterlist)))
+
+enum {
+ MTHCA_ICM_PAGE_SHIFT = 12,
+ MTHCA_ICM_PAGE_SIZE = 1 << MTHCA_ICM_PAGE_SHIFT,
+ MTHCA_DB_REC_PER_PAGE = MTHCA_ICM_PAGE_SIZE / 8
+};
+
+struct mthca_icm_chunk {
+ struct list_head list;
+ int npages;
+ int nsg;
+ struct scatterlist mem[MTHCA_ICM_CHUNK_LEN];
+};
+
+struct mthca_icm {
+ struct list_head chunk_list;
+ int refcount;
+};
+
+struct mthca_icm_table {
+ u64 virt;
+ int num_icm;
+ int num_obj;
+ int obj_size;
+ int lowmem;
+ int coherent;
+ struct mutex mutex;
+ struct mthca_icm *icm[0];
+};
+
+struct mthca_icm_iter {
+ struct mthca_icm *icm;
+ struct mthca_icm_chunk *chunk;
+ int page_idx;
+};
+
+struct mthca_dev;
+
+struct mthca_icm *mthca_alloc_icm(struct mthca_dev *dev, int npages,
+ gfp_t gfp_mask, int coherent);
+void mthca_free_icm(struct mthca_dev *dev, struct mthca_icm *icm, int coherent);
+
+struct mthca_icm_table *mthca_alloc_icm_table(struct mthca_dev *dev,
+ u64 virt, int obj_size,
+ int nobj, int reserved,
+ int use_lowmem, int use_coherent);
+void mthca_free_icm_table(struct mthca_dev *dev, struct mthca_icm_table *table);
+int mthca_table_get(struct mthca_dev *dev, struct mthca_icm_table *table, int obj);
+void mthca_table_put(struct mthca_dev *dev, struct mthca_icm_table *table, int obj);
+void *mthca_table_find(struct mthca_icm_table *table, int obj, dma_addr_t *dma_handle);
+int mthca_table_get_range(struct mthca_dev *dev, struct mthca_icm_table *table,
+ int start, int end);
+void mthca_table_put_range(struct mthca_dev *dev, struct mthca_icm_table *table,
+ int start, int end);
+
+static inline void mthca_icm_first(struct mthca_icm *icm,
+ struct mthca_icm_iter *iter)
+{
+ iter->icm = icm;
+ iter->chunk = list_empty(&icm->chunk_list) ?
+ NULL : list_entry(icm->chunk_list.next,
+ struct mthca_icm_chunk, list);
+ iter->page_idx = 0;
+}
+
+static inline int mthca_icm_last(struct mthca_icm_iter *iter)
+{
+ return !iter->chunk;
+}
+
+static inline void mthca_icm_next(struct mthca_icm_iter *iter)
+{
+ if (++iter->page_idx >= iter->chunk->nsg) {
+ if (iter->chunk->list.next == &iter->icm->chunk_list) {
+ iter->chunk = NULL;
+ return;
+ }
+
+ iter->chunk = list_entry(iter->chunk->list.next,
+ struct mthca_icm_chunk, list);
+ iter->page_idx = 0;
+ }
+}
+
+static inline dma_addr_t mthca_icm_addr(struct mthca_icm_iter *iter)
+{
+ return sg_dma_address(&iter->chunk->mem[iter->page_idx]);
+}
+
+static inline unsigned long mthca_icm_size(struct mthca_icm_iter *iter)
+{
+ return sg_dma_len(&iter->chunk->mem[iter->page_idx]);
+}
+
+struct mthca_db_page {
+ DECLARE_BITMAP(used, MTHCA_DB_REC_PER_PAGE);
+ __be64 *db_rec;
+ dma_addr_t mapping;
+};
+
+struct mthca_db_table {
+ int npages;
+ int max_group1;
+ int min_group2;
+ struct mthca_db_page *page;
+ struct mutex mutex;
+};
+
+enum mthca_db_type {
+ MTHCA_DB_TYPE_INVALID = 0x0,
+ MTHCA_DB_TYPE_CQ_SET_CI = 0x1,
+ MTHCA_DB_TYPE_CQ_ARM = 0x2,
+ MTHCA_DB_TYPE_SQ = 0x3,
+ MTHCA_DB_TYPE_RQ = 0x4,
+ MTHCA_DB_TYPE_SRQ = 0x5,
+ MTHCA_DB_TYPE_GROUP_SEP = 0x7
+};
+
+struct mthca_user_db_table;
+struct mthca_uar;
+
+int mthca_map_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
+ struct mthca_user_db_table *db_tab, int index, u64 uaddr);
+void mthca_unmap_user_db(struct mthca_dev *dev, struct mthca_uar *uar,
+ struct mthca_user_db_table *db_tab, int index);
+struct mthca_user_db_table *mthca_init_user_db_tab(struct mthca_dev *dev);
+void mthca_cleanup_user_db_tab(struct mthca_dev *dev, struct mthca_uar *uar,
+ struct mthca_user_db_table *db_tab);
+
+int mthca_init_db_tab(struct mthca_dev *dev);
+void mthca_cleanup_db_tab(struct mthca_dev *dev);
+int mthca_alloc_db(struct mthca_dev *dev, enum mthca_db_type type,
+ u32 qn, __be32 **db);
+void mthca_free_db(struct mthca_dev *dev, int type, int db_index);
+
+#endif /* MTHCA_MEMFREE_H */
diff --git a/drivers/infiniband/hw/mthca/mthca_mr.c b/drivers/infiniband/hw/mthca/mthca_mr.c
new file mode 100644
index 000000000..ed9a989e5
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_mr.c
@@ -0,0 +1,965 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+#include <linux/errno.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_memfree.h"
+
+struct mthca_mtt {
+ struct mthca_buddy *buddy;
+ int order;
+ u32 first_seg;
+};
+
+/*
+ * Must be packed because mtt_seg is 64 bits but only aligned to 32 bits.
+ */
+struct mthca_mpt_entry {
+ __be32 flags;
+ __be32 page_size;
+ __be32 key;
+ __be32 pd;
+ __be64 start;
+ __be64 length;
+ __be32 lkey;
+ __be32 window_count;
+ __be32 window_count_limit;
+ __be64 mtt_seg;
+ __be32 mtt_sz; /* Arbel only */
+ u32 reserved[2];
+} __attribute__((packed));
+
+#define MTHCA_MPT_FLAG_SW_OWNS (0xfUL << 28)
+#define MTHCA_MPT_FLAG_MIO (1 << 17)
+#define MTHCA_MPT_FLAG_BIND_ENABLE (1 << 15)
+#define MTHCA_MPT_FLAG_PHYSICAL (1 << 9)
+#define MTHCA_MPT_FLAG_REGION (1 << 8)
+
+#define MTHCA_MTT_FLAG_PRESENT 1
+
+#define MTHCA_MPT_STATUS_SW 0xF0
+#define MTHCA_MPT_STATUS_HW 0x00
+
+#define SINAI_FMR_KEY_INC 0x1000000
+
+/*
+ * Buddy allocator for MTT segments (currently not very efficient
+ * since it doesn't keep a free list and just searches linearly
+ * through the bitmaps)
+ */
+
+static u32 mthca_buddy_alloc(struct mthca_buddy *buddy, int order)
+{
+ int o;
+ int m;
+ u32 seg;
+
+ spin_lock(&buddy->lock);
+
+ for (o = order; o <= buddy->max_order; ++o)
+ if (buddy->num_free[o]) {
+ m = 1 << (buddy->max_order - o);
+ seg = find_first_bit(buddy->bits[o], m);
+ if (seg < m)
+ goto found;
+ }
+
+ spin_unlock(&buddy->lock);
+ return -1;
+
+ found:
+ clear_bit(seg, buddy->bits[o]);
+ --buddy->num_free[o];
+
+ while (o > order) {
+ --o;
+ seg <<= 1;
+ set_bit(seg ^ 1, buddy->bits[o]);
+ ++buddy->num_free[o];
+ }
+
+ spin_unlock(&buddy->lock);
+
+ seg <<= order;
+
+ return seg;
+}
+
+static void mthca_buddy_free(struct mthca_buddy *buddy, u32 seg, int order)
+{
+ seg >>= order;
+
+ spin_lock(&buddy->lock);
+
+ while (test_bit(seg ^ 1, buddy->bits[order])) {
+ clear_bit(seg ^ 1, buddy->bits[order]);
+ --buddy->num_free[order];
+ seg >>= 1;
+ ++order;
+ }
+
+ set_bit(seg, buddy->bits[order]);
+ ++buddy->num_free[order];
+
+ spin_unlock(&buddy->lock);
+}
+
+static int mthca_buddy_init(struct mthca_buddy *buddy, int max_order)
+{
+ int i, s;
+
+ buddy->max_order = max_order;
+ spin_lock_init(&buddy->lock);
+
+ buddy->bits = kzalloc((buddy->max_order + 1) * sizeof (long *),
+ GFP_KERNEL);
+ buddy->num_free = kcalloc((buddy->max_order + 1), sizeof *buddy->num_free,
+ GFP_KERNEL);
+ if (!buddy->bits || !buddy->num_free)
+ goto err_out;
+
+ for (i = 0; i <= buddy->max_order; ++i) {
+ s = BITS_TO_LONGS(1 << (buddy->max_order - i));
+ buddy->bits[i] = kmalloc(s * sizeof (long), GFP_KERNEL);
+ if (!buddy->bits[i])
+ goto err_out_free;
+ bitmap_zero(buddy->bits[i],
+ 1 << (buddy->max_order - i));
+ }
+
+ set_bit(0, buddy->bits[buddy->max_order]);
+ buddy->num_free[buddy->max_order] = 1;
+
+ return 0;
+
+err_out_free:
+ for (i = 0; i <= buddy->max_order; ++i)
+ kfree(buddy->bits[i]);
+
+err_out:
+ kfree(buddy->bits);
+ kfree(buddy->num_free);
+
+ return -ENOMEM;
+}
+
+static void mthca_buddy_cleanup(struct mthca_buddy *buddy)
+{
+ int i;
+
+ for (i = 0; i <= buddy->max_order; ++i)
+ kfree(buddy->bits[i]);
+
+ kfree(buddy->bits);
+ kfree(buddy->num_free);
+}
+
+static u32 mthca_alloc_mtt_range(struct mthca_dev *dev, int order,
+ struct mthca_buddy *buddy)
+{
+ u32 seg = mthca_buddy_alloc(buddy, order);
+
+ if (seg == -1)
+ return -1;
+
+ if (mthca_is_memfree(dev))
+ if (mthca_table_get_range(dev, dev->mr_table.mtt_table, seg,
+ seg + (1 << order) - 1)) {
+ mthca_buddy_free(buddy, seg, order);
+ seg = -1;
+ }
+
+ return seg;
+}
+
+static struct mthca_mtt *__mthca_alloc_mtt(struct mthca_dev *dev, int size,
+ struct mthca_buddy *buddy)
+{
+ struct mthca_mtt *mtt;
+ int i;
+
+ if (size <= 0)
+ return ERR_PTR(-EINVAL);
+
+ mtt = kmalloc(sizeof *mtt, GFP_KERNEL);
+ if (!mtt)
+ return ERR_PTR(-ENOMEM);
+
+ mtt->buddy = buddy;
+ mtt->order = 0;
+ for (i = dev->limits.mtt_seg_size / 8; i < size; i <<= 1)
+ ++mtt->order;
+
+ mtt->first_seg = mthca_alloc_mtt_range(dev, mtt->order, buddy);
+ if (mtt->first_seg == -1) {
+ kfree(mtt);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return mtt;
+}
+
+struct mthca_mtt *mthca_alloc_mtt(struct mthca_dev *dev, int size)
+{
+ return __mthca_alloc_mtt(dev, size, &dev->mr_table.mtt_buddy);
+}
+
+void mthca_free_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt)
+{
+ if (!mtt)
+ return;
+
+ mthca_buddy_free(mtt->buddy, mtt->first_seg, mtt->order);
+
+ mthca_table_put_range(dev, dev->mr_table.mtt_table,
+ mtt->first_seg,
+ mtt->first_seg + (1 << mtt->order) - 1);
+
+ kfree(mtt);
+}
+
+static int __mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt,
+ int start_index, u64 *buffer_list, int list_len)
+{
+ struct mthca_mailbox *mailbox;
+ __be64 *mtt_entry;
+ int err = 0;
+ int i;
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+ mtt_entry = mailbox->buf;
+
+ while (list_len > 0) {
+ mtt_entry[0] = cpu_to_be64(dev->mr_table.mtt_base +
+ mtt->first_seg * dev->limits.mtt_seg_size +
+ start_index * 8);
+ mtt_entry[1] = 0;
+ for (i = 0; i < list_len && i < MTHCA_MAILBOX_SIZE / 8 - 2; ++i)
+ mtt_entry[i + 2] = cpu_to_be64(buffer_list[i] |
+ MTHCA_MTT_FLAG_PRESENT);
+
+ /*
+ * If we have an odd number of entries to write, add
+ * one more dummy entry for firmware efficiency.
+ */
+ if (i & 1)
+ mtt_entry[i + 2] = 0;
+
+ err = mthca_WRITE_MTT(dev, mailbox, (i + 1) & ~1);
+ if (err) {
+ mthca_warn(dev, "WRITE_MTT failed (%d)\n", err);
+ goto out;
+ }
+
+ list_len -= i;
+ start_index += i;
+ buffer_list += i;
+ }
+
+out:
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+}
+
+int mthca_write_mtt_size(struct mthca_dev *dev)
+{
+ if (dev->mr_table.fmr_mtt_buddy != &dev->mr_table.mtt_buddy ||
+ !(dev->mthca_flags & MTHCA_FLAG_FMR))
+ /*
+ * Be friendly to WRITE_MTT command
+ * and leave two empty slots for the
+ * index and reserved fields of the
+ * mailbox.
+ */
+ return PAGE_SIZE / sizeof (u64) - 2;
+
+ /* For Arbel, all MTTs must fit in the same page. */
+ return mthca_is_memfree(dev) ? (PAGE_SIZE / sizeof (u64)) : 0x7ffffff;
+}
+
+static void mthca_tavor_write_mtt_seg(struct mthca_dev *dev,
+ struct mthca_mtt *mtt, int start_index,
+ u64 *buffer_list, int list_len)
+{
+ u64 __iomem *mtts;
+ int i;
+
+ mtts = dev->mr_table.tavor_fmr.mtt_base + mtt->first_seg * dev->limits.mtt_seg_size +
+ start_index * sizeof (u64);
+ for (i = 0; i < list_len; ++i)
+ mthca_write64_raw(cpu_to_be64(buffer_list[i] | MTHCA_MTT_FLAG_PRESENT),
+ mtts + i);
+}
+
+static void mthca_arbel_write_mtt_seg(struct mthca_dev *dev,
+ struct mthca_mtt *mtt, int start_index,
+ u64 *buffer_list, int list_len)
+{
+ __be64 *mtts;
+ dma_addr_t dma_handle;
+ int i;
+ int s = start_index * sizeof (u64);
+
+ /* For Arbel, all MTTs must fit in the same page. */
+ BUG_ON(s / PAGE_SIZE != (s + list_len * sizeof(u64) - 1) / PAGE_SIZE);
+ /* Require full segments */
+ BUG_ON(s % dev->limits.mtt_seg_size);
+
+ mtts = mthca_table_find(dev->mr_table.mtt_table, mtt->first_seg +
+ s / dev->limits.mtt_seg_size, &dma_handle);
+
+ BUG_ON(!mtts);
+
+ dma_sync_single_for_cpu(&dev->pdev->dev, dma_handle,
+ list_len * sizeof (u64), DMA_TO_DEVICE);
+
+ for (i = 0; i < list_len; ++i)
+ mtts[i] = cpu_to_be64(buffer_list[i] | MTHCA_MTT_FLAG_PRESENT);
+
+ dma_sync_single_for_device(&dev->pdev->dev, dma_handle,
+ list_len * sizeof (u64), DMA_TO_DEVICE);
+}
+
+int mthca_write_mtt(struct mthca_dev *dev, struct mthca_mtt *mtt,
+ int start_index, u64 *buffer_list, int list_len)
+{
+ int size = mthca_write_mtt_size(dev);
+ int chunk;
+
+ if (dev->mr_table.fmr_mtt_buddy != &dev->mr_table.mtt_buddy ||
+ !(dev->mthca_flags & MTHCA_FLAG_FMR))
+ return __mthca_write_mtt(dev, mtt, start_index, buffer_list, list_len);
+
+ while (list_len > 0) {
+ chunk = min(size, list_len);
+ if (mthca_is_memfree(dev))
+ mthca_arbel_write_mtt_seg(dev, mtt, start_index,
+ buffer_list, chunk);
+ else
+ mthca_tavor_write_mtt_seg(dev, mtt, start_index,
+ buffer_list, chunk);
+
+ list_len -= chunk;
+ start_index += chunk;
+ buffer_list += chunk;
+ }
+
+ return 0;
+}
+
+static inline u32 tavor_hw_index_to_key(u32 ind)
+{
+ return ind;
+}
+
+static inline u32 tavor_key_to_hw_index(u32 key)
+{
+ return key;
+}
+
+static inline u32 arbel_hw_index_to_key(u32 ind)
+{
+ return (ind >> 24) | (ind << 8);
+}
+
+static inline u32 arbel_key_to_hw_index(u32 key)
+{
+ return (key << 24) | (key >> 8);
+}
+
+static inline u32 hw_index_to_key(struct mthca_dev *dev, u32 ind)
+{
+ if (mthca_is_memfree(dev))
+ return arbel_hw_index_to_key(ind);
+ else
+ return tavor_hw_index_to_key(ind);
+}
+
+static inline u32 key_to_hw_index(struct mthca_dev *dev, u32 key)
+{
+ if (mthca_is_memfree(dev))
+ return arbel_key_to_hw_index(key);
+ else
+ return tavor_key_to_hw_index(key);
+}
+
+static inline u32 adjust_key(struct mthca_dev *dev, u32 key)
+{
+ if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT)
+ return ((key << 20) & 0x800000) | (key & 0x7fffff);
+ else
+ return key;
+}
+
+int mthca_mr_alloc(struct mthca_dev *dev, u32 pd, int buffer_size_shift,
+ u64 iova, u64 total_size, u32 access, struct mthca_mr *mr)
+{
+ struct mthca_mailbox *mailbox;
+ struct mthca_mpt_entry *mpt_entry;
+ u32 key;
+ int i;
+ int err;
+
+ WARN_ON(buffer_size_shift >= 32);
+
+ key = mthca_alloc(&dev->mr_table.mpt_alloc);
+ if (key == -1)
+ return -ENOMEM;
+ key = adjust_key(dev, key);
+ mr->ibmr.rkey = mr->ibmr.lkey = hw_index_to_key(dev, key);
+
+ if (mthca_is_memfree(dev)) {
+ err = mthca_table_get(dev, dev->mr_table.mpt_table, key);
+ if (err)
+ goto err_out_mpt_free;
+ }
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox)) {
+ err = PTR_ERR(mailbox);
+ goto err_out_table;
+ }
+ mpt_entry = mailbox->buf;
+
+ mpt_entry->flags = cpu_to_be32(MTHCA_MPT_FLAG_SW_OWNS |
+ MTHCA_MPT_FLAG_MIO |
+ MTHCA_MPT_FLAG_REGION |
+ access);
+ if (!mr->mtt)
+ mpt_entry->flags |= cpu_to_be32(MTHCA_MPT_FLAG_PHYSICAL);
+
+ mpt_entry->page_size = cpu_to_be32(buffer_size_shift - 12);
+ mpt_entry->key = cpu_to_be32(key);
+ mpt_entry->pd = cpu_to_be32(pd);
+ mpt_entry->start = cpu_to_be64(iova);
+ mpt_entry->length = cpu_to_be64(total_size);
+
+ memset(&mpt_entry->lkey, 0,
+ sizeof *mpt_entry - offsetof(struct mthca_mpt_entry, lkey));
+
+ if (mr->mtt)
+ mpt_entry->mtt_seg =
+ cpu_to_be64(dev->mr_table.mtt_base +
+ mr->mtt->first_seg * dev->limits.mtt_seg_size);
+
+ if (0) {
+ mthca_dbg(dev, "Dumping MPT entry %08x:\n", mr->ibmr.lkey);
+ for (i = 0; i < sizeof (struct mthca_mpt_entry) / 4; ++i) {
+ if (i % 4 == 0)
+ printk("[%02x] ", i * 4);
+ printk(" %08x", be32_to_cpu(((__be32 *) mpt_entry)[i]));
+ if ((i + 1) % 4 == 0)
+ printk("\n");
+ }
+ }
+
+ err = mthca_SW2HW_MPT(dev, mailbox,
+ key & (dev->limits.num_mpts - 1));
+ if (err) {
+ mthca_warn(dev, "SW2HW_MPT failed (%d)\n", err);
+ goto err_out_mailbox;
+ }
+
+ mthca_free_mailbox(dev, mailbox);
+ return err;
+
+err_out_mailbox:
+ mthca_free_mailbox(dev, mailbox);
+
+err_out_table:
+ mthca_table_put(dev, dev->mr_table.mpt_table, key);
+
+err_out_mpt_free:
+ mthca_free(&dev->mr_table.mpt_alloc, key);
+ return err;
+}
+
+int mthca_mr_alloc_notrans(struct mthca_dev *dev, u32 pd,
+ u32 access, struct mthca_mr *mr)
+{
+ mr->mtt = NULL;
+ return mthca_mr_alloc(dev, pd, 12, 0, ~0ULL, access, mr);
+}
+
+int mthca_mr_alloc_phys(struct mthca_dev *dev, u32 pd,
+ u64 *buffer_list, int buffer_size_shift,
+ int list_len, u64 iova, u64 total_size,
+ u32 access, struct mthca_mr *mr)
+{
+ int err;
+
+ mr->mtt = mthca_alloc_mtt(dev, list_len);
+ if (IS_ERR(mr->mtt))
+ return PTR_ERR(mr->mtt);
+
+ err = mthca_write_mtt(dev, mr->mtt, 0, buffer_list, list_len);
+ if (err) {
+ mthca_free_mtt(dev, mr->mtt);
+ return err;
+ }
+
+ err = mthca_mr_alloc(dev, pd, buffer_size_shift, iova,
+ total_size, access, mr);
+ if (err)
+ mthca_free_mtt(dev, mr->mtt);
+
+ return err;
+}
+
+/* Free mr or fmr */
+static void mthca_free_region(struct mthca_dev *dev, u32 lkey)
+{
+ mthca_table_put(dev, dev->mr_table.mpt_table,
+ key_to_hw_index(dev, lkey));
+
+ mthca_free(&dev->mr_table.mpt_alloc, key_to_hw_index(dev, lkey));
+}
+
+void mthca_free_mr(struct mthca_dev *dev, struct mthca_mr *mr)
+{
+ int err;
+
+ err = mthca_HW2SW_MPT(dev, NULL,
+ key_to_hw_index(dev, mr->ibmr.lkey) &
+ (dev->limits.num_mpts - 1));
+ if (err)
+ mthca_warn(dev, "HW2SW_MPT failed (%d)\n", err);
+
+ mthca_free_region(dev, mr->ibmr.lkey);
+ mthca_free_mtt(dev, mr->mtt);
+}
+
+int mthca_fmr_alloc(struct mthca_dev *dev, u32 pd,
+ u32 access, struct mthca_fmr *mr)
+{
+ struct mthca_mpt_entry *mpt_entry;
+ struct mthca_mailbox *mailbox;
+ u64 mtt_seg;
+ u32 key, idx;
+ int list_len = mr->attr.max_pages;
+ int err = -ENOMEM;
+ int i;
+
+ if (mr->attr.page_shift < 12 || mr->attr.page_shift >= 32)
+ return -EINVAL;
+
+ /* For Arbel, all MTTs must fit in the same page. */
+ if (mthca_is_memfree(dev) &&
+ mr->attr.max_pages * sizeof *mr->mem.arbel.mtts > PAGE_SIZE)
+ return -EINVAL;
+
+ mr->maps = 0;
+
+ key = mthca_alloc(&dev->mr_table.mpt_alloc);
+ if (key == -1)
+ return -ENOMEM;
+ key = adjust_key(dev, key);
+
+ idx = key & (dev->limits.num_mpts - 1);
+ mr->ibmr.rkey = mr->ibmr.lkey = hw_index_to_key(dev, key);
+
+ if (mthca_is_memfree(dev)) {
+ err = mthca_table_get(dev, dev->mr_table.mpt_table, key);
+ if (err)
+ goto err_out_mpt_free;
+
+ mr->mem.arbel.mpt = mthca_table_find(dev->mr_table.mpt_table, key, NULL);
+ BUG_ON(!mr->mem.arbel.mpt);
+ } else
+ mr->mem.tavor.mpt = dev->mr_table.tavor_fmr.mpt_base +
+ sizeof *(mr->mem.tavor.mpt) * idx;
+
+ mr->mtt = __mthca_alloc_mtt(dev, list_len, dev->mr_table.fmr_mtt_buddy);
+ if (IS_ERR(mr->mtt)) {
+ err = PTR_ERR(mr->mtt);
+ goto err_out_table;
+ }
+
+ mtt_seg = mr->mtt->first_seg * dev->limits.mtt_seg_size;
+
+ if (mthca_is_memfree(dev)) {
+ mr->mem.arbel.mtts = mthca_table_find(dev->mr_table.mtt_table,
+ mr->mtt->first_seg,
+ &mr->mem.arbel.dma_handle);
+ BUG_ON(!mr->mem.arbel.mtts);
+ } else
+ mr->mem.tavor.mtts = dev->mr_table.tavor_fmr.mtt_base + mtt_seg;
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox)) {
+ err = PTR_ERR(mailbox);
+ goto err_out_free_mtt;
+ }
+
+ mpt_entry = mailbox->buf;
+
+ mpt_entry->flags = cpu_to_be32(MTHCA_MPT_FLAG_SW_OWNS |
+ MTHCA_MPT_FLAG_MIO |
+ MTHCA_MPT_FLAG_REGION |
+ access);
+
+ mpt_entry->page_size = cpu_to_be32(mr->attr.page_shift - 12);
+ mpt_entry->key = cpu_to_be32(key);
+ mpt_entry->pd = cpu_to_be32(pd);
+ memset(&mpt_entry->start, 0,
+ sizeof *mpt_entry - offsetof(struct mthca_mpt_entry, start));
+ mpt_entry->mtt_seg = cpu_to_be64(dev->mr_table.mtt_base + mtt_seg);
+
+ if (0) {
+ mthca_dbg(dev, "Dumping MPT entry %08x:\n", mr->ibmr.lkey);
+ for (i = 0; i < sizeof (struct mthca_mpt_entry) / 4; ++i) {
+ if (i % 4 == 0)
+ printk("[%02x] ", i * 4);
+ printk(" %08x", be32_to_cpu(((__be32 *) mpt_entry)[i]));
+ if ((i + 1) % 4 == 0)
+ printk("\n");
+ }
+ }
+
+ err = mthca_SW2HW_MPT(dev, mailbox,
+ key & (dev->limits.num_mpts - 1));
+ if (err) {
+ mthca_warn(dev, "SW2HW_MPT failed (%d)\n", err);
+ goto err_out_mailbox_free;
+ }
+
+ mthca_free_mailbox(dev, mailbox);
+ return 0;
+
+err_out_mailbox_free:
+ mthca_free_mailbox(dev, mailbox);
+
+err_out_free_mtt:
+ mthca_free_mtt(dev, mr->mtt);
+
+err_out_table:
+ mthca_table_put(dev, dev->mr_table.mpt_table, key);
+
+err_out_mpt_free:
+ mthca_free(&dev->mr_table.mpt_alloc, key);
+ return err;
+}
+
+int mthca_free_fmr(struct mthca_dev *dev, struct mthca_fmr *fmr)
+{
+ if (fmr->maps)
+ return -EBUSY;
+
+ mthca_free_region(dev, fmr->ibmr.lkey);
+ mthca_free_mtt(dev, fmr->mtt);
+
+ return 0;
+}
+
+static inline int mthca_check_fmr(struct mthca_fmr *fmr, u64 *page_list,
+ int list_len, u64 iova)
+{
+ int i, page_mask;
+
+ if (list_len > fmr->attr.max_pages)
+ return -EINVAL;
+
+ page_mask = (1 << fmr->attr.page_shift) - 1;
+
+ /* We are getting page lists, so va must be page aligned. */
+ if (iova & page_mask)
+ return -EINVAL;
+
+ /* Trust the user not to pass misaligned data in page_list */
+ if (0)
+ for (i = 0; i < list_len; ++i) {
+ if (page_list[i] & ~page_mask)
+ return -EINVAL;
+ }
+
+ if (fmr->maps >= fmr->attr.max_maps)
+ return -EINVAL;
+
+ return 0;
+}
+
+int mthca_tavor_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+ int list_len, u64 iova)
+{
+ struct mthca_fmr *fmr = to_mfmr(ibfmr);
+ struct mthca_dev *dev = to_mdev(ibfmr->device);
+ struct mthca_mpt_entry mpt_entry;
+ u32 key;
+ int i, err;
+
+ err = mthca_check_fmr(fmr, page_list, list_len, iova);
+ if (err)
+ return err;
+
+ ++fmr->maps;
+
+ key = tavor_key_to_hw_index(fmr->ibmr.lkey);
+ key += dev->limits.num_mpts;
+ fmr->ibmr.lkey = fmr->ibmr.rkey = tavor_hw_index_to_key(key);
+
+ writeb(MTHCA_MPT_STATUS_SW, fmr->mem.tavor.mpt);
+
+ for (i = 0; i < list_len; ++i) {
+ __be64 mtt_entry = cpu_to_be64(page_list[i] |
+ MTHCA_MTT_FLAG_PRESENT);
+ mthca_write64_raw(mtt_entry, fmr->mem.tavor.mtts + i);
+ }
+
+ mpt_entry.lkey = cpu_to_be32(key);
+ mpt_entry.length = cpu_to_be64(list_len * (1ull << fmr->attr.page_shift));
+ mpt_entry.start = cpu_to_be64(iova);
+
+ __raw_writel((__force u32) mpt_entry.lkey, &fmr->mem.tavor.mpt->key);
+ memcpy_toio(&fmr->mem.tavor.mpt->start, &mpt_entry.start,
+ offsetof(struct mthca_mpt_entry, window_count) -
+ offsetof(struct mthca_mpt_entry, start));
+
+ writeb(MTHCA_MPT_STATUS_HW, fmr->mem.tavor.mpt);
+
+ return 0;
+}
+
+int mthca_arbel_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+ int list_len, u64 iova)
+{
+ struct mthca_fmr *fmr = to_mfmr(ibfmr);
+ struct mthca_dev *dev = to_mdev(ibfmr->device);
+ u32 key;
+ int i, err;
+
+ err = mthca_check_fmr(fmr, page_list, list_len, iova);
+ if (err)
+ return err;
+
+ ++fmr->maps;
+
+ key = arbel_key_to_hw_index(fmr->ibmr.lkey);
+ if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT)
+ key += SINAI_FMR_KEY_INC;
+ else
+ key += dev->limits.num_mpts;
+ fmr->ibmr.lkey = fmr->ibmr.rkey = arbel_hw_index_to_key(key);
+
+ *(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_SW;
+
+ wmb();
+
+ dma_sync_single_for_cpu(&dev->pdev->dev, fmr->mem.arbel.dma_handle,
+ list_len * sizeof(u64), DMA_TO_DEVICE);
+
+ for (i = 0; i < list_len; ++i)
+ fmr->mem.arbel.mtts[i] = cpu_to_be64(page_list[i] |
+ MTHCA_MTT_FLAG_PRESENT);
+
+ dma_sync_single_for_device(&dev->pdev->dev, fmr->mem.arbel.dma_handle,
+ list_len * sizeof(u64), DMA_TO_DEVICE);
+
+ fmr->mem.arbel.mpt->key = cpu_to_be32(key);
+ fmr->mem.arbel.mpt->lkey = cpu_to_be32(key);
+ fmr->mem.arbel.mpt->length = cpu_to_be64(list_len * (1ull << fmr->attr.page_shift));
+ fmr->mem.arbel.mpt->start = cpu_to_be64(iova);
+
+ wmb();
+
+ *(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_HW;
+
+ wmb();
+
+ return 0;
+}
+
+void mthca_tavor_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr)
+{
+ if (!fmr->maps)
+ return;
+
+ fmr->maps = 0;
+
+ writeb(MTHCA_MPT_STATUS_SW, fmr->mem.tavor.mpt);
+}
+
+void mthca_arbel_fmr_unmap(struct mthca_dev *dev, struct mthca_fmr *fmr)
+{
+ if (!fmr->maps)
+ return;
+
+ fmr->maps = 0;
+
+ *(u8 *) fmr->mem.arbel.mpt = MTHCA_MPT_STATUS_SW;
+}
+
+int mthca_init_mr_table(struct mthca_dev *dev)
+{
+ phys_addr_t addr;
+ int mpts, mtts, err, i;
+
+ err = mthca_alloc_init(&dev->mr_table.mpt_alloc,
+ dev->limits.num_mpts,
+ ~0, dev->limits.reserved_mrws);
+ if (err)
+ return err;
+
+ if (!mthca_is_memfree(dev) &&
+ (dev->mthca_flags & MTHCA_FLAG_DDR_HIDDEN))
+ dev->limits.fmr_reserved_mtts = 0;
+ else
+ dev->mthca_flags |= MTHCA_FLAG_FMR;
+
+ if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT)
+ mthca_dbg(dev, "Memory key throughput optimization activated.\n");
+
+ err = mthca_buddy_init(&dev->mr_table.mtt_buddy,
+ fls(dev->limits.num_mtt_segs - 1));
+
+ if (err)
+ goto err_mtt_buddy;
+
+ dev->mr_table.tavor_fmr.mpt_base = NULL;
+ dev->mr_table.tavor_fmr.mtt_base = NULL;
+
+ if (dev->limits.fmr_reserved_mtts) {
+ i = fls(dev->limits.fmr_reserved_mtts - 1);
+
+ if (i >= 31) {
+ mthca_warn(dev, "Unable to reserve 2^31 FMR MTTs.\n");
+ err = -EINVAL;
+ goto err_fmr_mpt;
+ }
+ mpts = mtts = 1 << i;
+ } else {
+ mtts = dev->limits.num_mtt_segs;
+ mpts = dev->limits.num_mpts;
+ }
+
+ if (!mthca_is_memfree(dev) &&
+ (dev->mthca_flags & MTHCA_FLAG_FMR)) {
+
+ addr = pci_resource_start(dev->pdev, 4) +
+ ((pci_resource_len(dev->pdev, 4) - 1) &
+ dev->mr_table.mpt_base);
+
+ dev->mr_table.tavor_fmr.mpt_base =
+ ioremap(addr, mpts * sizeof(struct mthca_mpt_entry));
+
+ if (!dev->mr_table.tavor_fmr.mpt_base) {
+ mthca_warn(dev, "MPT ioremap for FMR failed.\n");
+ err = -ENOMEM;
+ goto err_fmr_mpt;
+ }
+
+ addr = pci_resource_start(dev->pdev, 4) +
+ ((pci_resource_len(dev->pdev, 4) - 1) &
+ dev->mr_table.mtt_base);
+
+ dev->mr_table.tavor_fmr.mtt_base =
+ ioremap(addr, mtts * dev->limits.mtt_seg_size);
+ if (!dev->mr_table.tavor_fmr.mtt_base) {
+ mthca_warn(dev, "MTT ioremap for FMR failed.\n");
+ err = -ENOMEM;
+ goto err_fmr_mtt;
+ }
+ }
+
+ if (dev->limits.fmr_reserved_mtts) {
+ err = mthca_buddy_init(&dev->mr_table.tavor_fmr.mtt_buddy, fls(mtts - 1));
+ if (err)
+ goto err_fmr_mtt_buddy;
+
+ /* Prevent regular MRs from using FMR keys */
+ err = mthca_buddy_alloc(&dev->mr_table.mtt_buddy, fls(mtts - 1));
+ if (err)
+ goto err_reserve_fmr;
+
+ dev->mr_table.fmr_mtt_buddy =
+ &dev->mr_table.tavor_fmr.mtt_buddy;
+ } else
+ dev->mr_table.fmr_mtt_buddy = &dev->mr_table.mtt_buddy;
+
+ /* FMR table is always the first, take reserved MTTs out of there */
+ if (dev->limits.reserved_mtts) {
+ i = fls(dev->limits.reserved_mtts - 1);
+
+ if (mthca_alloc_mtt_range(dev, i,
+ dev->mr_table.fmr_mtt_buddy) == -1) {
+ mthca_warn(dev, "MTT table of order %d is too small.\n",
+ dev->mr_table.fmr_mtt_buddy->max_order);
+ err = -ENOMEM;
+ goto err_reserve_mtts;
+ }
+ }
+
+ return 0;
+
+err_reserve_mtts:
+err_reserve_fmr:
+ if (dev->limits.fmr_reserved_mtts)
+ mthca_buddy_cleanup(&dev->mr_table.tavor_fmr.mtt_buddy);
+
+err_fmr_mtt_buddy:
+ if (dev->mr_table.tavor_fmr.mtt_base)
+ iounmap(dev->mr_table.tavor_fmr.mtt_base);
+
+err_fmr_mtt:
+ if (dev->mr_table.tavor_fmr.mpt_base)
+ iounmap(dev->mr_table.tavor_fmr.mpt_base);
+
+err_fmr_mpt:
+ mthca_buddy_cleanup(&dev->mr_table.mtt_buddy);
+
+err_mtt_buddy:
+ mthca_alloc_cleanup(&dev->mr_table.mpt_alloc);
+
+ return err;
+}
+
+void mthca_cleanup_mr_table(struct mthca_dev *dev)
+{
+ /* XXX check if any MRs are still allocated? */
+ if (dev->limits.fmr_reserved_mtts)
+ mthca_buddy_cleanup(&dev->mr_table.tavor_fmr.mtt_buddy);
+
+ mthca_buddy_cleanup(&dev->mr_table.mtt_buddy);
+
+ if (dev->mr_table.tavor_fmr.mtt_base)
+ iounmap(dev->mr_table.tavor_fmr.mtt_base);
+ if (dev->mr_table.tavor_fmr.mpt_base)
+ iounmap(dev->mr_table.tavor_fmr.mpt_base);
+
+ mthca_alloc_cleanup(&dev->mr_table.mpt_alloc);
+}
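
The Arbel key helpers above are exact inverse 32-bit byte rotations, and the FMR remap path (mthca_arbel_map_phys_fmr) advances the key in hardware-index space so a remapped FMR presents a fresh lkey/rkey while the low index bits, which select the MPT slot, stay the same. Below is a standalone user-space sketch of that round trip; num_mpts and the starting index are hypothetical values chosen for illustration, not driver defaults.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Same conversions as in mthca_mr.c: a byte rotation and its inverse. */
static uint32_t arbel_hw_index_to_key(uint32_t ind) { return (ind >> 24) | (ind << 8); }
static uint32_t arbel_key_to_hw_index(uint32_t key) { return (key << 24) | (key >> 8); }

int main(void)
{
	const uint32_t num_mpts = 1u << 17;	/* hypothetical MPT table size (power of two) */
	uint32_t ind = 0x1234;			/* hypothetical index from the MPT allocator */
	uint32_t key = arbel_hw_index_to_key(ind);
	uint32_t new_key;

	/* The two helpers undo each other exactly. */
	assert(arbel_key_to_hw_index(key) == ind);

	/* FMR remap: bump the key in hardware-index space, as the map path does... */
	new_key = arbel_hw_index_to_key(arbel_key_to_hw_index(key) + num_mpts);

	/* ...so the visible key changes while the MPT slot (index mod table size) does not. */
	assert((arbel_key_to_hw_index(new_key) & (num_mpts - 1)) == (ind & (num_mpts - 1)));

	printf("old key %#x, new key %#x, same MPT slot %#x\n",
	       (unsigned) key, (unsigned) new_key, (unsigned) (ind & (num_mpts - 1)));
	return 0;
}
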
diff --git a/drivers/infiniband/hw/mthca/mthca_pd.c b/drivers/infiniband/hw/mthca/mthca_pd.c
new file mode 100644
index 000000000..266f14e47
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_pd.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+
+#include "mthca_dev.h"
+
+int mthca_pd_alloc(struct mthca_dev *dev, int privileged, struct mthca_pd *pd)
+{
+ int err = 0;
+
+ pd->privileged = privileged;
+
+ atomic_set(&pd->sqp_count, 0);
+ pd->pd_num = mthca_alloc(&dev->pd_table.alloc);
+ if (pd->pd_num == -1)
+ return -ENOMEM;
+
+ if (privileged) {
+ err = mthca_mr_alloc_notrans(dev, pd->pd_num,
+ MTHCA_MPT_FLAG_LOCAL_READ |
+ MTHCA_MPT_FLAG_LOCAL_WRITE,
+ &pd->ntmr);
+ if (err)
+ mthca_free(&dev->pd_table.alloc, pd->pd_num);
+ }
+
+ return err;
+}
+
+void mthca_pd_free(struct mthca_dev *dev, struct mthca_pd *pd)
+{
+ if (pd->privileged)
+ mthca_free_mr(dev, &pd->ntmr);
+ mthca_free(&dev->pd_table.alloc, pd->pd_num);
+}
+
+int mthca_init_pd_table(struct mthca_dev *dev)
+{
+ return mthca_alloc_init(&dev->pd_table.alloc,
+ dev->limits.num_pds,
+ (1 << 24) - 1,
+ dev->limits.reserved_pds);
+}
+
+void mthca_cleanup_pd_table(struct mthca_dev *dev)
+{
+ /* XXX check if any PDs are still allocated? */
+ mthca_alloc_cleanup(&dev->pd_table.alloc);
+}
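
The privileged flag is the only real branch in this file: a privileged PD also gets pd->ntmr, a "no translation" MR created over the full address range (mthca_mr_alloc_notrans above passes iova 0 and length ~0ULL), while an unprivileged PD, as handed to userspace, gets only a PD number. A hedged in-driver sketch of the two flavors side by side (example_alloc_pds is a hypothetical helper, not a function in the driver):

#include "mthca_dev.h"

/* Sketch only: allocate one driver-internal (privileged) and one
 * userspace-style (unprivileged) PD, unwinding on failure. */
static int example_alloc_pds(struct mthca_dev *dev,
			     struct mthca_pd *kernel_pd,
			     struct mthca_pd *user_pd)
{
	int err;

	err = mthca_pd_alloc(dev, 1, kernel_pd);	/* privileged: also builds kernel_pd->ntmr */
	if (err)
		return err;

	err = mthca_pd_alloc(dev, 0, user_pd);		/* unprivileged: PD number only */
	if (err) {
		mthca_pd_free(dev, kernel_pd);
		return err;
	}

	/* kernel_pd->ntmr.ibmr.lkey now refers to the full-range, untranslated MR;
	 * user_pd has no MR of its own. */
	return 0;
}
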
diff --git a/drivers/infiniband/hw/mthca/mthca_profile.c b/drivers/infiniband/hw/mthca/mthca_profile.c
new file mode 100644
index 000000000..8edb28a9a
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_profile.c
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/string.h>
+#include <linux/slab.h>
+
+#include "mthca_profile.h"
+
+enum {
+ MTHCA_RES_QP,
+ MTHCA_RES_EEC,
+ MTHCA_RES_SRQ,
+ MTHCA_RES_CQ,
+ MTHCA_RES_EQP,
+ MTHCA_RES_EEEC,
+ MTHCA_RES_EQ,
+ MTHCA_RES_RDB,
+ MTHCA_RES_MCG,
+ MTHCA_RES_MPT,
+ MTHCA_RES_MTT,
+ MTHCA_RES_UAR,
+ MTHCA_RES_UDAV,
+ MTHCA_RES_UARC,
+ MTHCA_RES_NUM
+};
+
+enum {
+ MTHCA_NUM_EQS = 32,
+ MTHCA_NUM_PDS = 1 << 15
+};
+
+s64 mthca_make_profile(struct mthca_dev *dev,
+ struct mthca_profile *request,
+ struct mthca_dev_lim *dev_lim,
+ struct mthca_init_hca_param *init_hca)
+{
+ struct mthca_resource {
+ u64 size;
+ u64 start;
+ int type;
+ int num;
+ int log_num;
+ };
+
+ u64 mem_base, mem_avail;
+ s64 total_size = 0;
+ struct mthca_resource *profile;
+ struct mthca_resource tmp;
+ int i, j;
+
+ profile = kzalloc(MTHCA_RES_NUM * sizeof *profile, GFP_KERNEL);
+ if (!profile)
+ return -ENOMEM;
+
+ profile[MTHCA_RES_QP].size = dev_lim->qpc_entry_sz;
+ profile[MTHCA_RES_EEC].size = dev_lim->eec_entry_sz;
+ profile[MTHCA_RES_SRQ].size = dev_lim->srq_entry_sz;
+ profile[MTHCA_RES_CQ].size = dev_lim->cqc_entry_sz;
+ profile[MTHCA_RES_EQP].size = dev_lim->eqpc_entry_sz;
+ profile[MTHCA_RES_EEEC].size = dev_lim->eeec_entry_sz;
+ profile[MTHCA_RES_EQ].size = dev_lim->eqc_entry_sz;
+ profile[MTHCA_RES_RDB].size = MTHCA_RDB_ENTRY_SIZE;
+ profile[MTHCA_RES_MCG].size = MTHCA_MGM_ENTRY_SIZE;
+ profile[MTHCA_RES_MPT].size = dev_lim->mpt_entry_sz;
+ profile[MTHCA_RES_MTT].size = dev->limits.mtt_seg_size;
+ profile[MTHCA_RES_UAR].size = dev_lim->uar_scratch_entry_sz;
+ profile[MTHCA_RES_UDAV].size = MTHCA_AV_SIZE;
+ profile[MTHCA_RES_UARC].size = request->uarc_size;
+
+ profile[MTHCA_RES_QP].num = request->num_qp;
+ profile[MTHCA_RES_SRQ].num = request->num_srq;
+ profile[MTHCA_RES_EQP].num = request->num_qp;
+ profile[MTHCA_RES_RDB].num = request->num_qp * request->rdb_per_qp;
+ profile[MTHCA_RES_CQ].num = request->num_cq;
+ profile[MTHCA_RES_EQ].num = MTHCA_NUM_EQS;
+ profile[MTHCA_RES_MCG].num = request->num_mcg;
+ profile[MTHCA_RES_MPT].num = request->num_mpt;
+ profile[MTHCA_RES_MTT].num = request->num_mtt;
+ profile[MTHCA_RES_UAR].num = request->num_uar;
+ profile[MTHCA_RES_UARC].num = request->num_uar;
+ profile[MTHCA_RES_UDAV].num = request->num_udav;
+
+ for (i = 0; i < MTHCA_RES_NUM; ++i) {
+ profile[i].type = i;
+ profile[i].log_num = max(ffs(profile[i].num) - 1, 0);
+ profile[i].size *= profile[i].num;
+ if (mthca_is_memfree(dev))
+ profile[i].size = max(profile[i].size, (u64) PAGE_SIZE);
+ }
+
+ if (mthca_is_memfree(dev)) {
+ mem_base = 0;
+ mem_avail = dev_lim->hca.arbel.max_icm_sz;
+ } else {
+ mem_base = dev->ddr_start;
+ mem_avail = dev->fw.tavor.fw_start - dev->ddr_start;
+ }
+
+ /*
+ * Sort the resources in decreasing order of size. Since they
+ * all have sizes that are powers of 2, we'll be able to keep
+ * resources aligned to their size and pack them without gaps
+ * using the sorted order.
+ */
+ for (i = MTHCA_RES_NUM; i > 0; --i)
+ for (j = 1; j < i; ++j) {
+ if (profile[j].size > profile[j - 1].size) {
+ tmp = profile[j];
+ profile[j] = profile[j - 1];
+ profile[j - 1] = tmp;
+ }
+ }
+
+ for (i = 0; i < MTHCA_RES_NUM; ++i) {
+ if (profile[i].size) {
+ profile[i].start = mem_base + total_size;
+ total_size += profile[i].size;
+ }
+ if (total_size > mem_avail) {
+ mthca_err(dev, "Profile requires 0x%llx bytes; "
+ "won't fit in 0x%llx bytes of context memory.\n",
+ (unsigned long long) total_size,
+ (unsigned long long) mem_avail);
+ kfree(profile);
+ return -ENOMEM;
+ }
+
+ if (profile[i].size)
+ mthca_dbg(dev, "profile[%2d]--%2d/%2d @ 0x%16llx "
+ "(size 0x%8llx)\n",
+ i, profile[i].type, profile[i].log_num,
+ (unsigned long long) profile[i].start,
+ (unsigned long long) profile[i].size);
+ }
+
+ if (mthca_is_memfree(dev))
+ mthca_dbg(dev, "HCA context memory: reserving %d KB\n",
+ (int) (total_size >> 10));
+ else
+ mthca_dbg(dev, "HCA memory: allocated %d KB/%d KB (%d KB free)\n",
+ (int) (total_size >> 10), (int) (mem_avail >> 10),
+ (int) ((mem_avail - total_size) >> 10));
+
+ for (i = 0; i < MTHCA_RES_NUM; ++i) {
+ switch (profile[i].type) {
+ case MTHCA_RES_QP:
+ dev->limits.num_qps = profile[i].num;
+ init_hca->qpc_base = profile[i].start;
+ init_hca->log_num_qps = profile[i].log_num;
+ break;
+ case MTHCA_RES_EEC:
+ dev->limits.num_eecs = profile[i].num;
+ init_hca->eec_base = profile[i].start;
+ init_hca->log_num_eecs = profile[i].log_num;
+ break;
+ case MTHCA_RES_SRQ:
+ dev->limits.num_srqs = profile[i].num;
+ init_hca->srqc_base = profile[i].start;
+ init_hca->log_num_srqs = profile[i].log_num;
+ break;
+ case MTHCA_RES_CQ:
+ dev->limits.num_cqs = profile[i].num;
+ init_hca->cqc_base = profile[i].start;
+ init_hca->log_num_cqs = profile[i].log_num;
+ break;
+ case MTHCA_RES_EQP:
+ init_hca->eqpc_base = profile[i].start;
+ break;
+ case MTHCA_RES_EEEC:
+ init_hca->eeec_base = profile[i].start;
+ break;
+ case MTHCA_RES_EQ:
+ dev->limits.num_eqs = profile[i].num;
+ init_hca->eqc_base = profile[i].start;
+ init_hca->log_num_eqs = profile[i].log_num;
+ break;
+ case MTHCA_RES_RDB:
+ for (dev->qp_table.rdb_shift = 0;
+ request->num_qp << dev->qp_table.rdb_shift < profile[i].num;
+ ++dev->qp_table.rdb_shift)
+ ; /* nothing */
+ dev->qp_table.rdb_base = (u32) profile[i].start;
+ init_hca->rdb_base = profile[i].start;
+ break;
+ case MTHCA_RES_MCG:
+ dev->limits.num_mgms = profile[i].num >> 1;
+ dev->limits.num_amgms = profile[i].num >> 1;
+ init_hca->mc_base = profile[i].start;
+ init_hca->log_mc_entry_sz = ffs(MTHCA_MGM_ENTRY_SIZE) - 1;
+ init_hca->log_mc_table_sz = profile[i].log_num;
+ init_hca->mc_hash_sz = 1 << (profile[i].log_num - 1);
+ break;
+ case MTHCA_RES_MPT:
+ dev->limits.num_mpts = profile[i].num;
+ dev->mr_table.mpt_base = profile[i].start;
+ init_hca->mpt_base = profile[i].start;
+ init_hca->log_mpt_sz = profile[i].log_num;
+ break;
+ case MTHCA_RES_MTT:
+ dev->limits.num_mtt_segs = profile[i].num;
+ dev->mr_table.mtt_base = profile[i].start;
+ init_hca->mtt_base = profile[i].start;
+ init_hca->mtt_seg_sz = ffs(dev->limits.mtt_seg_size) - 7;
+ break;
+ case MTHCA_RES_UAR:
+ dev->limits.num_uars = profile[i].num;
+ init_hca->uar_scratch_base = profile[i].start;
+ break;
+ case MTHCA_RES_UDAV:
+ dev->av_table.ddr_av_base = profile[i].start;
+ dev->av_table.num_ddr_avs = profile[i].num;
+ break;
+ case MTHCA_RES_UARC:
+ dev->uar_table.uarc_size = request->uarc_size;
+ dev->uar_table.uarc_base = profile[i].start;
+ init_hca->uarc_base = profile[i].start;
+ init_hca->log_uarc_sz = ffs(request->uarc_size) - 13;
+ init_hca->log_uar_sz = ffs(request->num_uar) - 1;
+ break;
+ default:
+ break;
+ }
+ }
+
+ /*
+ * PDs don't take any HCA memory, but we assign them as part
+ * of the HCA profile anyway.
+ */
+ dev->limits.num_pds = MTHCA_NUM_PDS;
+
+ if (dev->mthca_flags & MTHCA_FLAG_SINAI_OPT &&
+ init_hca->log_mpt_sz > 23) {
+ mthca_warn(dev, "MPT table too large (requested size 2^%d >= 2^24)\n",
+ init_hca->log_mpt_sz);
+ mthca_warn(dev, "Disabling memory key throughput optimization.\n");
+ dev->mthca_flags &= ~MTHCA_FLAG_SINAI_OPT;
+ }
+
+ /*
+ * For Tavor, FMRs use ioremapped PCI memory. For 32 bit
+ * systems it may use too much vmalloc space to map all MTT
+ * memory, so we reserve some MTTs for FMR access, taking them
+ * out of the MR pool. They don't use additional memory, but
+ * we assign them as part of the HCA profile anyway.
+ */
+ if (mthca_is_memfree(dev) || BITS_PER_LONG == 64)
+ dev->limits.fmr_reserved_mtts = 0;
+ else
+ dev->limits.fmr_reserved_mtts = request->fmr_reserved_mtts;
+
+ kfree(profile);
+ return total_size;
+}
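
The layout loop above leans on a simple property of power-of-two sizes: sort the regions largest-first and pack them back to back, and every region automatically stays aligned to its own size. The standalone sketch below reproduces just that packing step; the entry sizes and counts are hypothetical, not the driver's defaults.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical resource table: per-entry size in bytes and requested count. */
struct res { const char *name; uint64_t entry_sz; uint64_t num; uint64_t size, start; };

static int by_size_desc(const void *a, const void *b)
{
	const struct res *x = a, *y = b;
	return (x->size < y->size) - (x->size > y->size);
}

int main(void)
{
	struct res r[] = {
		{ "QP",  256, 1u << 16 },
		{ "CQ",  128, 1u << 16 },
		{ "MPT",  64, 1u << 17 },
		{ "MTT",  64, 1u << 20 },
	};
	const int n = sizeof(r) / sizeof(r[0]);
	uint64_t base = 0, total = 0;
	int i;

	for (i = 0; i < n; ++i)
		r[i].size = r[i].entry_sz * r[i].num;	/* both factors are powers of two */

	/* Largest first: back-to-back packing then keeps each region aligned
	 * to its own size, so no padding gaps are needed. */
	qsort(r, n, sizeof(r[0]), by_size_desc);

	for (i = 0; i < n; ++i) {
		r[i].start = base + total;
		total += r[i].size;
		printf("%-4s at 0x%010llx size 0x%08llx (aligned: %s)\n", r[i].name,
		       (unsigned long long) r[i].start, (unsigned long long) r[i].size,
		       r[i].start % r[i].size == 0 ? "yes" : "no");
	}
	printf("total context memory: %llu KB\n", (unsigned long long) (total >> 10));
	return 0;
}
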
diff --git a/drivers/infiniband/hw/mthca/mthca_profile.h b/drivers/infiniband/hw/mthca/mthca_profile.h
new file mode 100644
index 000000000..62b009cc8
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_profile.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_PROFILE_H
+#define MTHCA_PROFILE_H
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+struct mthca_profile {
+ int num_qp;
+ int rdb_per_qp;
+ int num_srq;
+ int num_cq;
+ int num_mcg;
+ int num_mpt;
+ int num_mtt;
+ int num_udav;
+ int num_uar;
+ int uarc_size;
+ int fmr_reserved_mtts;
+};
+
+s64 mthca_make_profile(struct mthca_dev *mdev,
+ struct mthca_profile *request,
+ struct mthca_dev_lim *dev_lim,
+ struct mthca_init_hca_param *init_hca);
+
+#endif /* MTHCA_PROFILE_H */
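
For context, a hedged sketch of how a caller consumes this interface: fill a struct mthca_profile request, let mthca_make_profile size and place each table, and treat the return value as the total context memory to provide (or a negative errno). example_build_profile is a hypothetical wrapper and the counts are illustrative, not the driver's module-parameter defaults.

#include "mthca_profile.h"

static s64 example_build_profile(struct mthca_dev *mdev,
				 struct mthca_dev_lim *dev_lim,
				 struct mthca_init_hca_param *init_hca)
{
	struct mthca_profile request = {
		.num_qp            = 1 << 16,
		.rdb_per_qp        = 4,
		.num_srq           = 1 << 10,
		.num_cq            = 1 << 16,
		.num_mcg           = 1 << 13,
		.num_mpt           = 1 << 17,
		.num_mtt           = 1 << 20,
		.num_udav          = 1 << 15,	/* Tavor only */
		.num_uar           = 1 << 10,
		.uarc_size         = 1 << 18,	/* memfree HCAs only */
		.fmr_reserved_mtts = 1 << 18,	/* Tavor FMR only */
	};
	s64 icm_size;

	icm_size = mthca_make_profile(mdev, &request, dev_lim, init_hca);
	if (icm_size < 0)
		return icm_size;	/* tables do not fit in available context memory */

	/* icm_size bytes of HCA context memory now need to be provided. */
	return icm_size;
}
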
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c
new file mode 100644
index 000000000..415f8e1a5
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_provider.c
@@ -0,0 +1,1375 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_smi.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_user_verbs.h>
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/mm.h>
+#include <linux/export.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_user.h"
+#include "mthca_memfree.h"
+
+static void init_query_mad(struct ib_smp *mad)
+{
+ mad->base_version = 1;
+ mad->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+ mad->class_version = 1;
+ mad->method = IB_MGMT_METHOD_GET;
+}
+
+static int mthca_query_device(struct ib_device *ibdev,
+ struct ib_device_attr *props)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+ struct mthca_dev *mdev = to_mdev(ibdev);
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ memset(props, 0, sizeof *props);
+
+ props->fw_ver = mdev->fw_ver;
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
+
+ err = mthca_MAD_IFC(mdev, 1, 1,
+ 1, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ props->device_cap_flags = mdev->device_cap_flags;
+ props->vendor_id = be32_to_cpup((__be32 *) (out_mad->data + 36)) &
+ 0xffffff;
+ props->vendor_part_id = be16_to_cpup((__be16 *) (out_mad->data + 30));
+ props->hw_ver = be32_to_cpup((__be32 *) (out_mad->data + 32));
+ memcpy(&props->sys_image_guid, out_mad->data + 4, 8);
+
+ props->max_mr_size = ~0ull;
+ props->page_size_cap = mdev->limits.page_size_cap;
+ props->max_qp = mdev->limits.num_qps - mdev->limits.reserved_qps;
+ props->max_qp_wr = mdev->limits.max_wqes;
+ props->max_sge = mdev->limits.max_sg;
+ props->max_cq = mdev->limits.num_cqs - mdev->limits.reserved_cqs;
+ props->max_cqe = mdev->limits.max_cqes;
+ props->max_mr = mdev->limits.num_mpts - mdev->limits.reserved_mrws;
+ props->max_pd = mdev->limits.num_pds - mdev->limits.reserved_pds;
+ props->max_qp_rd_atom = 1 << mdev->qp_table.rdb_shift;
+ props->max_qp_init_rd_atom = mdev->limits.max_qp_init_rdma;
+ props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp;
+ props->max_srq = mdev->limits.num_srqs - mdev->limits.reserved_srqs;
+ props->max_srq_wr = mdev->limits.max_srq_wqes;
+ props->max_srq_sge = mdev->limits.max_srq_sge;
+ props->local_ca_ack_delay = mdev->limits.local_ca_ack_delay;
+ props->atomic_cap = mdev->limits.flags & DEV_LIM_FLAG_ATOMIC ?
+ IB_ATOMIC_HCA : IB_ATOMIC_NONE;
+ props->max_pkeys = mdev->limits.pkey_table_len;
+ props->max_mcast_grp = mdev->limits.num_mgms + mdev->limits.num_amgms;
+ props->max_mcast_qp_attach = MTHCA_QP_PER_MGM;
+ props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
+ props->max_mcast_grp;
+ /*
+ * If Sinai memory key optimization is being used, then only
+ * the 8-bit key portion will change. For other HCAs, the
+ * unused index bits will also be used for FMR remapping.
+ */
+ if (mdev->mthca_flags & MTHCA_FLAG_SINAI_OPT)
+ props->max_map_per_fmr = 255;
+ else
+ props->max_map_per_fmr =
+ (1 << (32 - ilog2(mdev->limits.num_mpts))) - 1;
+
+ err = 0;
+ out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static int mthca_query_port(struct ib_device *ibdev,
+ u8 port, struct ib_port_attr *props)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ memset(props, 0, sizeof *props);
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_PORT_INFO;
+ in_mad->attr_mod = cpu_to_be32(port);
+
+ err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1,
+ port, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ props->lid = be16_to_cpup((__be16 *) (out_mad->data + 16));
+ props->lmc = out_mad->data[34] & 0x7;
+ props->sm_lid = be16_to_cpup((__be16 *) (out_mad->data + 18));
+ props->sm_sl = out_mad->data[36] & 0xf;
+ props->state = out_mad->data[32] & 0xf;
+ props->phys_state = out_mad->data[33] >> 4;
+ props->port_cap_flags = be32_to_cpup((__be32 *) (out_mad->data + 20));
+ props->gid_tbl_len = to_mdev(ibdev)->limits.gid_table_len;
+ props->max_msg_sz = 0x80000000;
+ props->pkey_tbl_len = to_mdev(ibdev)->limits.pkey_table_len;
+ props->bad_pkey_cntr = be16_to_cpup((__be16 *) (out_mad->data + 46));
+ props->qkey_viol_cntr = be16_to_cpup((__be16 *) (out_mad->data + 48));
+ props->active_width = out_mad->data[31] & 0xf;
+ props->active_speed = out_mad->data[35] >> 4;
+ props->max_mtu = out_mad->data[41] & 0xf;
+ props->active_mtu = out_mad->data[36] >> 4;
+ props->subnet_timeout = out_mad->data[51] & 0x1f;
+ props->max_vl_num = out_mad->data[37] >> 4;
+ props->init_type_reply = out_mad->data[41] >> 4;
+
+ out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static int mthca_modify_device(struct ib_device *ibdev,
+ int mask,
+ struct ib_device_modify *props)
+{
+ if (mask & ~IB_DEVICE_MODIFY_NODE_DESC)
+ return -EOPNOTSUPP;
+
+ if (mask & IB_DEVICE_MODIFY_NODE_DESC) {
+ if (mutex_lock_interruptible(&to_mdev(ibdev)->cap_mask_mutex))
+ return -ERESTARTSYS;
+ memcpy(ibdev->node_desc, props->node_desc, 64);
+ mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex);
+ }
+
+ return 0;
+}
+
+static int mthca_modify_port(struct ib_device *ibdev,
+ u8 port, int port_modify_mask,
+ struct ib_port_modify *props)
+{
+ struct mthca_set_ib_param set_ib;
+ struct ib_port_attr attr;
+ int err;
+
+ if (mutex_lock_interruptible(&to_mdev(ibdev)->cap_mask_mutex))
+ return -ERESTARTSYS;
+
+ err = mthca_query_port(ibdev, port, &attr);
+ if (err)
+ goto out;
+
+ set_ib.set_si_guid = 0;
+ set_ib.reset_qkey_viol = !!(port_modify_mask & IB_PORT_RESET_QKEY_CNTR);
+
+ set_ib.cap_mask = (attr.port_cap_flags | props->set_port_cap_mask) &
+ ~props->clr_port_cap_mask;
+
+ err = mthca_SET_IB(to_mdev(ibdev), &set_ib, port);
+ if (err)
+ goto out;
+out:
+ mutex_unlock(&to_mdev(ibdev)->cap_mask_mutex);
+ return err;
+}
+
+static int mthca_query_pkey(struct ib_device *ibdev,
+ u8 port, u16 index, u16 *pkey)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_PKEY_TABLE;
+ in_mad->attr_mod = cpu_to_be32(index / 32);
+
+ err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1,
+ port, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ *pkey = be16_to_cpu(((__be16 *) out_mad->data)[index % 32]);
+
+ out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static int mthca_query_gid(struct ib_device *ibdev, u8 port,
+ int index, union ib_gid *gid)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_PORT_INFO;
+ in_mad->attr_mod = cpu_to_be32(port);
+
+ err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1,
+ port, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ memcpy(gid->raw, out_mad->data + 8, 8);
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_GUID_INFO;
+ in_mad->attr_mod = cpu_to_be32(index / 8);
+
+ err = mthca_MAD_IFC(to_mdev(ibdev), 1, 1,
+ port, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ memcpy(gid->raw + 8, out_mad->data + (index % 8) * 8, 8);
+
+ out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+static struct ib_ucontext *mthca_alloc_ucontext(struct ib_device *ibdev,
+ struct ib_udata *udata)
+{
+ struct mthca_alloc_ucontext_resp uresp;
+ struct mthca_ucontext *context;
+ int err;
+
+ if (!(to_mdev(ibdev)->active))
+ return ERR_PTR(-EAGAIN);
+
+ memset(&uresp, 0, sizeof uresp);
+
+ uresp.qp_tab_size = to_mdev(ibdev)->limits.num_qps;
+ if (mthca_is_memfree(to_mdev(ibdev)))
+ uresp.uarc_size = to_mdev(ibdev)->uar_table.uarc_size;
+ else
+ uresp.uarc_size = 0;
+
+ context = kmalloc(sizeof *context, GFP_KERNEL);
+ if (!context)
+ return ERR_PTR(-ENOMEM);
+
+ err = mthca_uar_alloc(to_mdev(ibdev), &context->uar);
+ if (err) {
+ kfree(context);
+ return ERR_PTR(err);
+ }
+
+ context->db_tab = mthca_init_user_db_tab(to_mdev(ibdev));
+ if (IS_ERR(context->db_tab)) {
+ err = PTR_ERR(context->db_tab);
+ mthca_uar_free(to_mdev(ibdev), &context->uar);
+ kfree(context);
+ return ERR_PTR(err);
+ }
+
+ if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) {
+ mthca_cleanup_user_db_tab(to_mdev(ibdev), &context->uar, context->db_tab);
+ mthca_uar_free(to_mdev(ibdev), &context->uar);
+ kfree(context);
+ return ERR_PTR(-EFAULT);
+ }
+
+ context->reg_mr_warned = 0;
+
+ return &context->ibucontext;
+}
+
+static int mthca_dealloc_ucontext(struct ib_ucontext *context)
+{
+ mthca_cleanup_user_db_tab(to_mdev(context->device), &to_mucontext(context)->uar,
+ to_mucontext(context)->db_tab);
+ mthca_uar_free(to_mdev(context->device), &to_mucontext(context)->uar);
+ kfree(to_mucontext(context));
+
+ return 0;
+}
+
+static int mthca_mmap_uar(struct ib_ucontext *context,
+ struct vm_area_struct *vma)
+{
+ if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+ return -EINVAL;
+
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+ if (io_remap_pfn_range(vma, vma->vm_start,
+ to_mucontext(context)->uar.pfn,
+ PAGE_SIZE, vma->vm_page_prot))
+ return -EAGAIN;
+
+ return 0;
+}
+
+static struct ib_pd *mthca_alloc_pd(struct ib_device *ibdev,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct mthca_pd *pd;
+ int err;
+
+ pd = kmalloc(sizeof *pd, GFP_KERNEL);
+ if (!pd)
+ return ERR_PTR(-ENOMEM);
+
+ err = mthca_pd_alloc(to_mdev(ibdev), !context, pd);
+ if (err) {
+ kfree(pd);
+ return ERR_PTR(err);
+ }
+
+ if (context) {
+ if (ib_copy_to_udata(udata, &pd->pd_num, sizeof (__u32))) {
+ mthca_pd_free(to_mdev(ibdev), pd);
+ kfree(pd);
+ return ERR_PTR(-EFAULT);
+ }
+ }
+
+ return &pd->ibpd;
+}
+
+static int mthca_dealloc_pd(struct ib_pd *pd)
+{
+ mthca_pd_free(to_mdev(pd->device), to_mpd(pd));
+ kfree(pd);
+
+ return 0;
+}
+
+static struct ib_ah *mthca_ah_create(struct ib_pd *pd,
+ struct ib_ah_attr *ah_attr)
+{
+ int err;
+ struct mthca_ah *ah;
+
+ ah = kmalloc(sizeof *ah, GFP_ATOMIC);
+ if (!ah)
+ return ERR_PTR(-ENOMEM);
+
+ err = mthca_create_ah(to_mdev(pd->device), to_mpd(pd), ah_attr, ah);
+ if (err) {
+ kfree(ah);
+ return ERR_PTR(err);
+ }
+
+ return &ah->ibah;
+}
+
+static int mthca_ah_destroy(struct ib_ah *ah)
+{
+ mthca_destroy_ah(to_mdev(ah->device), to_mah(ah));
+ kfree(ah);
+
+ return 0;
+}
+
+static struct ib_srq *mthca_create_srq(struct ib_pd *pd,
+ struct ib_srq_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct mthca_create_srq ucmd;
+ struct mthca_ucontext *context = NULL;
+ struct mthca_srq *srq;
+ int err;
+
+ if (init_attr->srq_type != IB_SRQT_BASIC)
+ return ERR_PTR(-ENOSYS);
+
+ srq = kmalloc(sizeof *srq, GFP_KERNEL);
+ if (!srq)
+ return ERR_PTR(-ENOMEM);
+
+ if (pd->uobject) {
+ context = to_mucontext(pd->uobject->context);
+
+ if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+ err = -EFAULT;
+ goto err_free;
+ }
+
+ err = mthca_map_user_db(to_mdev(pd->device), &context->uar,
+ context->db_tab, ucmd.db_index,
+ ucmd.db_page);
+
+ if (err)
+ goto err_free;
+
+ srq->mr.ibmr.lkey = ucmd.lkey;
+ srq->db_index = ucmd.db_index;
+ }
+
+ err = mthca_alloc_srq(to_mdev(pd->device), to_mpd(pd),
+ &init_attr->attr, srq);
+
+ if (err && pd->uobject)
+ mthca_unmap_user_db(to_mdev(pd->device), &context->uar,
+ context->db_tab, ucmd.db_index);
+
+ if (err)
+ goto err_free;
+
+ if (context && ib_copy_to_udata(udata, &srq->srqn, sizeof (__u32))) {
+ mthca_free_srq(to_mdev(pd->device), srq);
+ err = -EFAULT;
+ goto err_free;
+ }
+
+ return &srq->ibsrq;
+
+err_free:
+ kfree(srq);
+
+ return ERR_PTR(err);
+}
+
+static int mthca_destroy_srq(struct ib_srq *srq)
+{
+ struct mthca_ucontext *context;
+
+ if (srq->uobject) {
+ context = to_mucontext(srq->uobject->context);
+
+ mthca_unmap_user_db(to_mdev(srq->device), &context->uar,
+ context->db_tab, to_msrq(srq)->db_index);
+ }
+
+ mthca_free_srq(to_mdev(srq->device), to_msrq(srq));
+ kfree(srq);
+
+ return 0;
+}
+
+static struct ib_qp *mthca_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct mthca_create_qp ucmd;
+ struct mthca_qp *qp;
+ int err;
+
+ if (init_attr->create_flags)
+ return ERR_PTR(-EINVAL);
+
+ switch (init_attr->qp_type) {
+ case IB_QPT_RC:
+ case IB_QPT_UC:
+ case IB_QPT_UD:
+ {
+ struct mthca_ucontext *context;
+
+ qp = kmalloc(sizeof *qp, GFP_KERNEL);
+ if (!qp)
+ return ERR_PTR(-ENOMEM);
+
+ if (pd->uobject) {
+ context = to_mucontext(pd->uobject->context);
+
+ if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+ kfree(qp);
+ return ERR_PTR(-EFAULT);
+ }
+
+ err = mthca_map_user_db(to_mdev(pd->device), &context->uar,
+ context->db_tab,
+ ucmd.sq_db_index, ucmd.sq_db_page);
+ if (err) {
+ kfree(qp);
+ return ERR_PTR(err);
+ }
+
+ err = mthca_map_user_db(to_mdev(pd->device), &context->uar,
+ context->db_tab,
+ ucmd.rq_db_index, ucmd.rq_db_page);
+ if (err) {
+ mthca_unmap_user_db(to_mdev(pd->device),
+ &context->uar,
+ context->db_tab,
+ ucmd.sq_db_index);
+ kfree(qp);
+ return ERR_PTR(err);
+ }
+
+ qp->mr.ibmr.lkey = ucmd.lkey;
+ qp->sq.db_index = ucmd.sq_db_index;
+ qp->rq.db_index = ucmd.rq_db_index;
+ }
+
+ err = mthca_alloc_qp(to_mdev(pd->device), to_mpd(pd),
+ to_mcq(init_attr->send_cq),
+ to_mcq(init_attr->recv_cq),
+ init_attr->qp_type, init_attr->sq_sig_type,
+ &init_attr->cap, qp);
+
+ if (err && pd->uobject) {
+ context = to_mucontext(pd->uobject->context);
+
+ mthca_unmap_user_db(to_mdev(pd->device),
+ &context->uar,
+ context->db_tab,
+ ucmd.sq_db_index);
+ mthca_unmap_user_db(to_mdev(pd->device),
+ &context->uar,
+ context->db_tab,
+ ucmd.rq_db_index);
+ }
+
+ qp->ibqp.qp_num = qp->qpn;
+ break;
+ }
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ {
+ /* Don't allow userspace to create special QPs */
+ if (pd->uobject)
+ return ERR_PTR(-EINVAL);
+
+ qp = kmalloc(sizeof (struct mthca_sqp), GFP_KERNEL);
+ if (!qp)
+ return ERR_PTR(-ENOMEM);
+
+ qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1;
+
+ err = mthca_alloc_sqp(to_mdev(pd->device), to_mpd(pd),
+ to_mcq(init_attr->send_cq),
+ to_mcq(init_attr->recv_cq),
+ init_attr->sq_sig_type, &init_attr->cap,
+ qp->ibqp.qp_num, init_attr->port_num,
+ to_msqp(qp));
+ break;
+ }
+ default:
+ /* Don't support raw QPs */
+ return ERR_PTR(-ENOSYS);
+ }
+
+ if (err) {
+ kfree(qp);
+ return ERR_PTR(err);
+ }
+
+ init_attr->cap.max_send_wr = qp->sq.max;
+ init_attr->cap.max_recv_wr = qp->rq.max;
+ init_attr->cap.max_send_sge = qp->sq.max_gs;
+ init_attr->cap.max_recv_sge = qp->rq.max_gs;
+ init_attr->cap.max_inline_data = qp->max_inline_data;
+
+ return &qp->ibqp;
+}
+
+static int mthca_destroy_qp(struct ib_qp *qp)
+{
+ if (qp->uobject) {
+ mthca_unmap_user_db(to_mdev(qp->device),
+ &to_mucontext(qp->uobject->context)->uar,
+ to_mucontext(qp->uobject->context)->db_tab,
+ to_mqp(qp)->sq.db_index);
+ mthca_unmap_user_db(to_mdev(qp->device),
+ &to_mucontext(qp->uobject->context)->uar,
+ to_mucontext(qp->uobject->context)->db_tab,
+ to_mqp(qp)->rq.db_index);
+ }
+ mthca_free_qp(to_mdev(qp->device), to_mqp(qp));
+ kfree(qp);
+ return 0;
+}
+
+static struct ib_cq *mthca_create_cq(struct ib_device *ibdev, int entries,
+ int comp_vector,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct mthca_create_cq ucmd;
+ struct mthca_cq *cq;
+ int nent;
+ int err;
+
+ if (entries < 1 || entries > to_mdev(ibdev)->limits.max_cqes)
+ return ERR_PTR(-EINVAL);
+
+ if (context) {
+ if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd))
+ return ERR_PTR(-EFAULT);
+
+ err = mthca_map_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
+ to_mucontext(context)->db_tab,
+ ucmd.set_db_index, ucmd.set_db_page);
+ if (err)
+ return ERR_PTR(err);
+
+ err = mthca_map_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
+ to_mucontext(context)->db_tab,
+ ucmd.arm_db_index, ucmd.arm_db_page);
+ if (err)
+ goto err_unmap_set;
+ }
+
+ cq = kmalloc(sizeof *cq, GFP_KERNEL);
+ if (!cq) {
+ err = -ENOMEM;
+ goto err_unmap_arm;
+ }
+
+ if (context) {
+ cq->buf.mr.ibmr.lkey = ucmd.lkey;
+ cq->set_ci_db_index = ucmd.set_db_index;
+ cq->arm_db_index = ucmd.arm_db_index;
+ }
+
+ for (nent = 1; nent <= entries; nent <<= 1)
+ ; /* nothing */
+
+ err = mthca_init_cq(to_mdev(ibdev), nent,
+ context ? to_mucontext(context) : NULL,
+ context ? ucmd.pdn : to_mdev(ibdev)->driver_pd.pd_num,
+ cq);
+ if (err)
+ goto err_free;
+
+ if (context && ib_copy_to_udata(udata, &cq->cqn, sizeof (__u32))) {
+ mthca_free_cq(to_mdev(ibdev), cq);
+ err = -EFAULT;
+ goto err_free;
+ }
+
+ cq->resize_buf = NULL;
+
+ return &cq->ibcq;
+
+err_free:
+ kfree(cq);
+
+err_unmap_arm:
+ if (context)
+ mthca_unmap_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
+ to_mucontext(context)->db_tab, ucmd.arm_db_index);
+
+err_unmap_set:
+ if (context)
+ mthca_unmap_user_db(to_mdev(ibdev), &to_mucontext(context)->uar,
+ to_mucontext(context)->db_tab, ucmd.set_db_index);
+
+ return ERR_PTR(err);
+}
+
+static int mthca_alloc_resize_buf(struct mthca_dev *dev, struct mthca_cq *cq,
+ int entries)
+{
+ int ret;
+
+ spin_lock_irq(&cq->lock);
+ if (cq->resize_buf) {
+ ret = -EBUSY;
+ goto unlock;
+ }
+
+ cq->resize_buf = kmalloc(sizeof *cq->resize_buf, GFP_ATOMIC);
+ if (!cq->resize_buf) {
+ ret = -ENOMEM;
+ goto unlock;
+ }
+
+ cq->resize_buf->state = CQ_RESIZE_ALLOC;
+
+ ret = 0;
+
+unlock:
+ spin_unlock_irq(&cq->lock);
+
+ if (ret)
+ return ret;
+
+ ret = mthca_alloc_cq_buf(dev, &cq->resize_buf->buf, entries);
+ if (ret) {
+ spin_lock_irq(&cq->lock);
+ kfree(cq->resize_buf);
+ cq->resize_buf = NULL;
+ spin_unlock_irq(&cq->lock);
+ return ret;
+ }
+
+ cq->resize_buf->cqe = entries - 1;
+
+ spin_lock_irq(&cq->lock);
+ cq->resize_buf->state = CQ_RESIZE_READY;
+ spin_unlock_irq(&cq->lock);
+
+ return 0;
+}
+
+static int mthca_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
+{
+ struct mthca_dev *dev = to_mdev(ibcq->device);
+ struct mthca_cq *cq = to_mcq(ibcq);
+ struct mthca_resize_cq ucmd;
+ u32 lkey;
+ int ret;
+
+ if (entries < 1 || entries > dev->limits.max_cqes)
+ return -EINVAL;
+
+ mutex_lock(&cq->mutex);
+
+ entries = roundup_pow_of_two(entries + 1);
+ if (entries == ibcq->cqe + 1) {
+ ret = 0;
+ goto out;
+ }
+
+ if (cq->is_kernel) {
+ ret = mthca_alloc_resize_buf(dev, cq, entries);
+ if (ret)
+ goto out;
+ lkey = cq->resize_buf->buf.mr.ibmr.lkey;
+ } else {
+ if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd)) {
+ ret = -EFAULT;
+ goto out;
+ }
+ lkey = ucmd.lkey;
+ }
+
+ ret = mthca_RESIZE_CQ(dev, cq->cqn, lkey, ilog2(entries));
+
+ if (ret) {
+ if (cq->resize_buf) {
+ mthca_free_cq_buf(dev, &cq->resize_buf->buf,
+ cq->resize_buf->cqe);
+ kfree(cq->resize_buf);
+ spin_lock_irq(&cq->lock);
+ cq->resize_buf = NULL;
+ spin_unlock_irq(&cq->lock);
+ }
+ goto out;
+ }
+
+ if (cq->is_kernel) {
+ struct mthca_cq_buf tbuf;
+ int tcqe;
+
+ spin_lock_irq(&cq->lock);
+ if (cq->resize_buf->state == CQ_RESIZE_READY) {
+ mthca_cq_resize_copy_cqes(cq);
+ tbuf = cq->buf;
+ tcqe = cq->ibcq.cqe;
+ cq->buf = cq->resize_buf->buf;
+ cq->ibcq.cqe = cq->resize_buf->cqe;
+ } else {
+ tbuf = cq->resize_buf->buf;
+ tcqe = cq->resize_buf->cqe;
+ }
+
+ kfree(cq->resize_buf);
+ cq->resize_buf = NULL;
+ spin_unlock_irq(&cq->lock);
+
+ mthca_free_cq_buf(dev, &tbuf, tcqe);
+ } else
+ ibcq->cqe = entries - 1;
+
+out:
+ mutex_unlock(&cq->mutex);
+
+ return ret;
+}
+
+static int mthca_destroy_cq(struct ib_cq *cq)
+{
+ if (cq->uobject) {
+ mthca_unmap_user_db(to_mdev(cq->device),
+ &to_mucontext(cq->uobject->context)->uar,
+ to_mucontext(cq->uobject->context)->db_tab,
+ to_mcq(cq)->arm_db_index);
+ mthca_unmap_user_db(to_mdev(cq->device),
+ &to_mucontext(cq->uobject->context)->uar,
+ to_mucontext(cq->uobject->context)->db_tab,
+ to_mcq(cq)->set_ci_db_index);
+ }
+ mthca_free_cq(to_mdev(cq->device), to_mcq(cq));
+ kfree(cq);
+
+ return 0;
+}
+
+static inline u32 convert_access(int acc)
+{
+ return (acc & IB_ACCESS_REMOTE_ATOMIC ? MTHCA_MPT_FLAG_ATOMIC : 0) |
+ (acc & IB_ACCESS_REMOTE_WRITE ? MTHCA_MPT_FLAG_REMOTE_WRITE : 0) |
+ (acc & IB_ACCESS_REMOTE_READ ? MTHCA_MPT_FLAG_REMOTE_READ : 0) |
+ (acc & IB_ACCESS_LOCAL_WRITE ? MTHCA_MPT_FLAG_LOCAL_WRITE : 0) |
+ MTHCA_MPT_FLAG_LOCAL_READ;
+}
+
+static struct ib_mr *mthca_get_dma_mr(struct ib_pd *pd, int acc)
+{
+ struct mthca_mr *mr;
+ int err;
+
+ mr = kmalloc(sizeof *mr, GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ err = mthca_mr_alloc_notrans(to_mdev(pd->device),
+ to_mpd(pd)->pd_num,
+ convert_access(acc), mr);
+
+ if (err) {
+ kfree(mr);
+ return ERR_PTR(err);
+ }
+
+ mr->umem = NULL;
+
+ return &mr->ibmr;
+}
+
+static struct ib_mr *mthca_reg_phys_mr(struct ib_pd *pd,
+ struct ib_phys_buf *buffer_list,
+ int num_phys_buf,
+ int acc,
+ u64 *iova_start)
+{
+ struct mthca_mr *mr;
+ u64 *page_list;
+ u64 total_size;
+ unsigned long mask;
+ int shift;
+ int npages;
+ int err;
+ int i, j, n;
+
+ mask = buffer_list[0].addr ^ *iova_start;
+ total_size = 0;
+ for (i = 0; i < num_phys_buf; ++i) {
+ if (i != 0)
+ mask |= buffer_list[i].addr;
+ if (i != num_phys_buf - 1)
+ mask |= buffer_list[i].addr + buffer_list[i].size;
+
+ total_size += buffer_list[i].size;
+ }
+
+ if (mask & ~PAGE_MASK)
+ return ERR_PTR(-EINVAL);
+
+ shift = __ffs(mask | 1 << 31);
+
+ buffer_list[0].size += buffer_list[0].addr & ((1ULL << shift) - 1);
+ buffer_list[0].addr &= ~0ull << shift;
+
+ mr = kmalloc(sizeof *mr, GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ npages = 0;
+ for (i = 0; i < num_phys_buf; ++i)
+ npages += (buffer_list[i].size + (1ULL << shift) - 1) >> shift;
+
+ if (!npages)
+ return &mr->ibmr;
+
+ page_list = kmalloc(npages * sizeof *page_list, GFP_KERNEL);
+ if (!page_list) {
+ kfree(mr);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ n = 0;
+ for (i = 0; i < num_phys_buf; ++i)
+ for (j = 0;
+ j < (buffer_list[i].size + (1ULL << shift) - 1) >> shift;
+ ++j)
+ page_list[n++] = buffer_list[i].addr + ((u64) j << shift);
+
+ mthca_dbg(to_mdev(pd->device), "Registering memory at %llx (iova %llx) "
+ "in PD %x; shift %d, npages %d.\n",
+ (unsigned long long) buffer_list[0].addr,
+ (unsigned long long) *iova_start,
+ to_mpd(pd)->pd_num,
+ shift, npages);
+
+ err = mthca_mr_alloc_phys(to_mdev(pd->device),
+ to_mpd(pd)->pd_num,
+ page_list, shift, npages,
+ *iova_start, total_size,
+ convert_access(acc), mr);
+
+ if (err) {
+ kfree(page_list);
+ kfree(mr);
+ return ERR_PTR(err);
+ }
+
+ kfree(page_list);
+ mr->umem = NULL;
+
+ return &mr->ibmr;
+}
+
+static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt, int acc, struct ib_udata *udata)
+{
+ struct mthca_dev *dev = to_mdev(pd->device);
+ struct scatterlist *sg;
+ struct mthca_mr *mr;
+ struct mthca_reg_mr ucmd;
+ u64 *pages;
+ int shift, n, len;
+ int i, k, entry;
+ int err = 0;
+ int write_mtt_size;
+
+ if (udata->inlen - sizeof (struct ib_uverbs_cmd_hdr) < sizeof ucmd) {
+ if (!to_mucontext(pd->uobject->context)->reg_mr_warned) {
+ mthca_warn(dev, "Process '%s' did not pass in MR attrs.\n",
+ current->comm);
+ mthca_warn(dev, " Update libmthca to fix this.\n");
+ }
+ ++to_mucontext(pd->uobject->context)->reg_mr_warned;
+ ucmd.mr_attrs = 0;
+ } else if (ib_copy_from_udata(&ucmd, udata, sizeof ucmd))
+ return ERR_PTR(-EFAULT);
+
+ mr = kmalloc(sizeof *mr, GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ mr->umem = ib_umem_get(pd->uobject->context, start, length, acc,
+ ucmd.mr_attrs & MTHCA_MR_DMASYNC);
+
+ if (IS_ERR(mr->umem)) {
+ err = PTR_ERR(mr->umem);
+ goto err;
+ }
+
+ shift = ffs(mr->umem->page_size) - 1;
+ n = mr->umem->nmap;
+
+ mr->mtt = mthca_alloc_mtt(dev, n);
+ if (IS_ERR(mr->mtt)) {
+ err = PTR_ERR(mr->mtt);
+ goto err_umem;
+ }
+
+ pages = (u64 *) __get_free_page(GFP_KERNEL);
+ if (!pages) {
+ err = -ENOMEM;
+ goto err_mtt;
+ }
+
+ i = n = 0;
+
+ write_mtt_size = min(mthca_write_mtt_size(dev), (int) (PAGE_SIZE / sizeof *pages));
+
+ for_each_sg(mr->umem->sg_head.sgl, sg, mr->umem->nmap, entry) {
+ len = sg_dma_len(sg) >> shift;
+ for (k = 0; k < len; ++k) {
+ pages[i++] = sg_dma_address(sg) +
+ mr->umem->page_size * k;
+ /*
+ * Be friendly to write_mtt and pass it chunks
+ * of appropriate size.
+ */
+ if (i == write_mtt_size) {
+ err = mthca_write_mtt(dev, mr->mtt, n, pages, i);
+ if (err)
+ goto mtt_done;
+ n += i;
+ i = 0;
+ }
+ }
+ }
+
+ if (i)
+ err = mthca_write_mtt(dev, mr->mtt, n, pages, i);
+mtt_done:
+ free_page((unsigned long) pages);
+ if (err)
+ goto err_mtt;
+
+ err = mthca_mr_alloc(dev, to_mpd(pd)->pd_num, shift, virt, length,
+ convert_access(acc), mr);
+
+ if (err)
+ goto err_mtt;
+
+ return &mr->ibmr;
+
+err_mtt:
+ mthca_free_mtt(dev, mr->mtt);
+
+err_umem:
+ ib_umem_release(mr->umem);
+
+err:
+ kfree(mr);
+ return ERR_PTR(err);
+}
+
+static int mthca_dereg_mr(struct ib_mr *mr)
+{
+ struct mthca_mr *mmr = to_mmr(mr);
+
+ mthca_free_mr(to_mdev(mr->device), mmr);
+ if (mmr->umem)
+ ib_umem_release(mmr->umem);
+ kfree(mmr);
+
+ return 0;
+}
+
+static struct ib_fmr *mthca_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr)
+{
+ struct mthca_fmr *fmr;
+ int err;
+
+ fmr = kmalloc(sizeof *fmr, GFP_KERNEL);
+ if (!fmr)
+ return ERR_PTR(-ENOMEM);
+
+ memcpy(&fmr->attr, fmr_attr, sizeof *fmr_attr);
+ err = mthca_fmr_alloc(to_mdev(pd->device), to_mpd(pd)->pd_num,
+ convert_access(mr_access_flags), fmr);
+
+ if (err) {
+ kfree(fmr);
+ return ERR_PTR(err);
+ }
+
+ return &fmr->ibmr;
+}
+
+static int mthca_dealloc_fmr(struct ib_fmr *fmr)
+{
+ struct mthca_fmr *mfmr = to_mfmr(fmr);
+ int err;
+
+ err = mthca_free_fmr(to_mdev(fmr->device), mfmr);
+ if (err)
+ return err;
+
+ kfree(mfmr);
+ return 0;
+}
+
+static int mthca_unmap_fmr(struct list_head *fmr_list)
+{
+ struct ib_fmr *fmr;
+ int err;
+ struct mthca_dev *mdev = NULL;
+
+ list_for_each_entry(fmr, fmr_list, list) {
+ if (mdev && to_mdev(fmr->device) != mdev)
+ return -EINVAL;
+ mdev = to_mdev(fmr->device);
+ }
+
+ if (!mdev)
+ return 0;
+
+ if (mthca_is_memfree(mdev)) {
+ list_for_each_entry(fmr, fmr_list, list)
+ mthca_arbel_fmr_unmap(mdev, to_mfmr(fmr));
+
+ wmb();
+ } else
+ list_for_each_entry(fmr, fmr_list, list)
+ mthca_tavor_fmr_unmap(mdev, to_mfmr(fmr));
+
+ err = mthca_SYNC_TPT(mdev);
+ return err;
+}
+
+static ssize_t show_rev(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct mthca_dev *dev =
+ container_of(device, struct mthca_dev, ib_dev.dev);
+ return sprintf(buf, "%x\n", dev->rev_id);
+}
+
+static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct mthca_dev *dev =
+ container_of(device, struct mthca_dev, ib_dev.dev);
+ return sprintf(buf, "%d.%d.%d\n", (int) (dev->fw_ver >> 32),
+ (int) (dev->fw_ver >> 16) & 0xffff,
+ (int) dev->fw_ver & 0xffff);
+}
+
+static ssize_t show_hca(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct mthca_dev *dev =
+ container_of(device, struct mthca_dev, ib_dev.dev);
+ switch (dev->pdev->device) {
+ case PCI_DEVICE_ID_MELLANOX_TAVOR:
+ return sprintf(buf, "MT23108\n");
+ case PCI_DEVICE_ID_MELLANOX_ARBEL_COMPAT:
+ return sprintf(buf, "MT25208 (MT23108 compat mode)\n");
+ case PCI_DEVICE_ID_MELLANOX_ARBEL:
+ return sprintf(buf, "MT25208\n");
+ case PCI_DEVICE_ID_MELLANOX_SINAI:
+ case PCI_DEVICE_ID_MELLANOX_SINAI_OLD:
+ return sprintf(buf, "MT25204\n");
+ default:
+ return sprintf(buf, "unknown\n");
+ }
+}
+
+static ssize_t show_board(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct mthca_dev *dev =
+ container_of(device, struct mthca_dev, ib_dev.dev);
+ return sprintf(buf, "%.*s\n", MTHCA_BOARD_ID_LEN, dev->board_id);
+}
+
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
+
+static struct device_attribute *mthca_dev_attributes[] = {
+ &dev_attr_hw_rev,
+ &dev_attr_fw_ver,
+ &dev_attr_hca_type,
+ &dev_attr_board_id
+};
+
+static int mthca_init_node_data(struct mthca_dev *dev)
+{
+ struct ib_smp *in_mad = NULL;
+ struct ib_smp *out_mad = NULL;
+ int err = -ENOMEM;
+
+ in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL);
+ out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL);
+ if (!in_mad || !out_mad)
+ goto out;
+
+ init_query_mad(in_mad);
+ in_mad->attr_id = IB_SMP_ATTR_NODE_DESC;
+
+ err = mthca_MAD_IFC(dev, 1, 1,
+ 1, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ memcpy(dev->ib_dev.node_desc, out_mad->data, 64);
+
+ in_mad->attr_id = IB_SMP_ATTR_NODE_INFO;
+
+ err = mthca_MAD_IFC(dev, 1, 1,
+ 1, NULL, NULL, in_mad, out_mad);
+ if (err)
+ goto out;
+
+ if (mthca_is_memfree(dev))
+ dev->rev_id = be32_to_cpup((__be32 *) (out_mad->data + 32));
+ memcpy(&dev->ib_dev.node_guid, out_mad->data + 12, 8);
+
+out:
+ kfree(in_mad);
+ kfree(out_mad);
+ return err;
+}
+
+int mthca_register_device(struct mthca_dev *dev)
+{
+ int ret;
+ int i;
+
+ ret = mthca_init_node_data(dev);
+ if (ret)
+ return ret;
+
+ strlcpy(dev->ib_dev.name, "mthca%d", IB_DEVICE_NAME_MAX);
+ dev->ib_dev.owner = THIS_MODULE;
+
+ dev->ib_dev.uverbs_abi_ver = MTHCA_UVERBS_ABI_VERSION;
+ dev->ib_dev.uverbs_cmd_mask =
+ (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+ (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_REG_MR) |
+ (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_QP) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+ (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) |
+ (1ull << IB_USER_VERBS_CMD_DETACH_MCAST);
+ dev->ib_dev.node_type = RDMA_NODE_IB_CA;
+ dev->ib_dev.phys_port_cnt = dev->limits.num_ports;
+ dev->ib_dev.num_comp_vectors = 1;
+ dev->ib_dev.dma_device = &dev->pdev->dev;
+ dev->ib_dev.query_device = mthca_query_device;
+ dev->ib_dev.query_port = mthca_query_port;
+ dev->ib_dev.modify_device = mthca_modify_device;
+ dev->ib_dev.modify_port = mthca_modify_port;
+ dev->ib_dev.query_pkey = mthca_query_pkey;
+ dev->ib_dev.query_gid = mthca_query_gid;
+ dev->ib_dev.alloc_ucontext = mthca_alloc_ucontext;
+ dev->ib_dev.dealloc_ucontext = mthca_dealloc_ucontext;
+ dev->ib_dev.mmap = mthca_mmap_uar;
+ dev->ib_dev.alloc_pd = mthca_alloc_pd;
+ dev->ib_dev.dealloc_pd = mthca_dealloc_pd;
+ dev->ib_dev.create_ah = mthca_ah_create;
+ dev->ib_dev.query_ah = mthca_ah_query;
+ dev->ib_dev.destroy_ah = mthca_ah_destroy;
+
+ if (dev->mthca_flags & MTHCA_FLAG_SRQ) {
+ dev->ib_dev.create_srq = mthca_create_srq;
+ dev->ib_dev.modify_srq = mthca_modify_srq;
+ dev->ib_dev.query_srq = mthca_query_srq;
+ dev->ib_dev.destroy_srq = mthca_destroy_srq;
+ dev->ib_dev.uverbs_cmd_mask |=
+ (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ);
+
+ if (mthca_is_memfree(dev))
+ dev->ib_dev.post_srq_recv = mthca_arbel_post_srq_recv;
+ else
+ dev->ib_dev.post_srq_recv = mthca_tavor_post_srq_recv;
+ }
+
+ dev->ib_dev.create_qp = mthca_create_qp;
+ dev->ib_dev.modify_qp = mthca_modify_qp;
+ dev->ib_dev.query_qp = mthca_query_qp;
+ dev->ib_dev.destroy_qp = mthca_destroy_qp;
+ dev->ib_dev.create_cq = mthca_create_cq;
+ dev->ib_dev.resize_cq = mthca_resize_cq;
+ dev->ib_dev.destroy_cq = mthca_destroy_cq;
+ dev->ib_dev.poll_cq = mthca_poll_cq;
+ dev->ib_dev.get_dma_mr = mthca_get_dma_mr;
+ dev->ib_dev.reg_phys_mr = mthca_reg_phys_mr;
+ dev->ib_dev.reg_user_mr = mthca_reg_user_mr;
+ dev->ib_dev.dereg_mr = mthca_dereg_mr;
+
+ if (dev->mthca_flags & MTHCA_FLAG_FMR) {
+ dev->ib_dev.alloc_fmr = mthca_alloc_fmr;
+ dev->ib_dev.unmap_fmr = mthca_unmap_fmr;
+ dev->ib_dev.dealloc_fmr = mthca_dealloc_fmr;
+ if (mthca_is_memfree(dev))
+ dev->ib_dev.map_phys_fmr = mthca_arbel_map_phys_fmr;
+ else
+ dev->ib_dev.map_phys_fmr = mthca_tavor_map_phys_fmr;
+ }
+
+ dev->ib_dev.attach_mcast = mthca_multicast_attach;
+ dev->ib_dev.detach_mcast = mthca_multicast_detach;
+ dev->ib_dev.process_mad = mthca_process_mad;
+
+ if (mthca_is_memfree(dev)) {
+ dev->ib_dev.req_notify_cq = mthca_arbel_arm_cq;
+ dev->ib_dev.post_send = mthca_arbel_post_send;
+ dev->ib_dev.post_recv = mthca_arbel_post_receive;
+ } else {
+ dev->ib_dev.req_notify_cq = mthca_tavor_arm_cq;
+ dev->ib_dev.post_send = mthca_tavor_post_send;
+ dev->ib_dev.post_recv = mthca_tavor_post_receive;
+ }
+
+ mutex_init(&dev->cap_mask_mutex);
+
+ ret = ib_register_device(&dev->ib_dev, NULL);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < ARRAY_SIZE(mthca_dev_attributes); ++i) {
+ ret = device_create_file(&dev->ib_dev.dev,
+ mthca_dev_attributes[i]);
+ if (ret) {
+ ib_unregister_device(&dev->ib_dev);
+ return ret;
+ }
+ }
+
+ mthca_start_catas_poll(dev);
+
+ return 0;
+}
+
+void mthca_unregister_device(struct mthca_dev *dev)
+{
+ mthca_stop_catas_poll(dev);
+ ib_unregister_device(&dev->ib_dev);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_provider.h b/drivers/infiniband/hw/mthca/mthca_provider.h
new file mode 100644
index 000000000..596acc455
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_provider.h
@@ -0,0 +1,344 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_PROVIDER_H
+#define MTHCA_PROVIDER_H
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_pack.h>
+
+#define MTHCA_MPT_FLAG_ATOMIC (1 << 14)
+#define MTHCA_MPT_FLAG_REMOTE_WRITE (1 << 13)
+#define MTHCA_MPT_FLAG_REMOTE_READ (1 << 12)
+#define MTHCA_MPT_FLAG_LOCAL_WRITE (1 << 11)
+#define MTHCA_MPT_FLAG_LOCAL_READ (1 << 10)
+
+struct mthca_buf_list {
+ void *buf;
+ DEFINE_DMA_UNMAP_ADDR(mapping);
+};
+
+union mthca_buf {
+ struct mthca_buf_list direct;
+ struct mthca_buf_list *page_list;
+};
+
+struct mthca_uar {
+ unsigned long pfn;
+ int index;
+};
+
+struct mthca_user_db_table;
+
+struct mthca_ucontext {
+ struct ib_ucontext ibucontext;
+ struct mthca_uar uar;
+ struct mthca_user_db_table *db_tab;
+ int reg_mr_warned;
+};
+
+struct mthca_mtt;
+
+struct mthca_mr {
+ struct ib_mr ibmr;
+ struct ib_umem *umem;
+ struct mthca_mtt *mtt;
+};
+
+struct mthca_fmr {
+ struct ib_fmr ibmr;
+ struct ib_fmr_attr attr;
+ struct mthca_mtt *mtt;
+ int maps;
+ union {
+ struct {
+ struct mthca_mpt_entry __iomem *mpt;
+ u64 __iomem *mtts;
+ } tavor;
+ struct {
+ struct mthca_mpt_entry *mpt;
+ __be64 *mtts;
+ dma_addr_t dma_handle;
+ } arbel;
+ } mem;
+};
+
+struct mthca_pd {
+ struct ib_pd ibpd;
+ u32 pd_num;
+ atomic_t sqp_count;
+ struct mthca_mr ntmr;
+ int privileged;
+};
+
+struct mthca_eq {
+ struct mthca_dev *dev;
+ int eqn;
+ u32 eqn_mask;
+ u32 cons_index;
+ u16 msi_x_vector;
+ u16 msi_x_entry;
+ int have_irq;
+ int nent;
+ struct mthca_buf_list *page_list;
+ struct mthca_mr mr;
+ char irq_name[IB_DEVICE_NAME_MAX];
+};
+
+struct mthca_av;
+
+enum mthca_ah_type {
+ MTHCA_AH_ON_HCA,
+ MTHCA_AH_PCI_POOL,
+ MTHCA_AH_KMALLOC
+};
+
+struct mthca_ah {
+ struct ib_ah ibah;
+ enum mthca_ah_type type;
+ u32 key;
+ struct mthca_av *av;
+ dma_addr_t avdma;
+};
+
+/*
+ * Quick description of our CQ/QP locking scheme:
+ *
+ * We have one global lock that protects dev->cq/qp_table. Each
+ * struct mthca_cq/qp also has its own lock. An individual qp lock
+ * may be taken inside of an individual cq lock. Both cqs attached to
+ * a qp may be locked, with the cq with the lower cqn locked first.
+ * No other nesting should be done.
+ *
+ * Each struct mthca_cq/qp also has a ref count, protected by the
+ * corresponding table lock. The pointer from the cq/qp_table to the
+ * struct counts as one reference. This reference is also good for
+ * access through the consumer API, so modifying the CQ/QP etc. doesn't
+ * need to take another reference. Access to a QP because of a
+ * completion being polled does not need a reference either.
+ *
+ * Finally, each struct mthca_cq/qp has a wait_queue_head_t for the
+ * destroy function to sleep on.
+ *
+ * This means that access from the consumer API requires nothing but
+ * taking the struct's lock.
+ *
+ * Access because of a completion event should go as follows:
+ * - lock cq/qp_table and look up struct
+ * - increment ref count in struct
+ * - drop cq/qp_table lock
+ * - lock struct, do your thing, and unlock struct
+ * - decrement ref count; if zero, wake up waiters
+ *
+ * To destroy a CQ/QP, we can do the following:
+ * - lock cq/qp_table
+ * - remove pointer and decrement ref count
+ * - unlock cq/qp_table lock
+ * - wait_event until ref count is zero
+ *
+ * It is the consumer's responsibility to make sure that no QP
+ * operations (WQE posting or state modification) are pending when a
+ * QP is destroyed. Also, the consumer must make sure that calls to
+ * qp_modify are serialized. Similarly, the consumer is responsible
+ * for ensuring that no CQ resize operations are pending when a CQ
+ * is destroyed.
+ *
+ * Possible optimizations (wait for profile data to see if/where we
+ * have locks bouncing between CPUs):
+ * - split cq/qp table lock into n separate (cache-aligned) locks,
+ * indexed (say) by the page in the table
+ * - split QP struct lock into three (one for common info, one for the
+ * send queue and one for the receive queue)
+ */
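+
+/*
+ * For concrete instances of these rules, see mthca_qp_event() in
+ * mthca_qp.c (event access: look the QP up under the table lock, take
+ * a reference, drop the lock, deliver the event, then drop the
+ * reference and wake any waiter) and mthca_free_qp() (destroy: clear
+ * the table pointer, drop that reference and wait for the count to
+ * reach zero).
+ */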
+
+struct mthca_cq_buf {
+ union mthca_buf queue;
+ struct mthca_mr mr;
+ int is_direct;
+};
+
+struct mthca_cq_resize {
+ struct mthca_cq_buf buf;
+ int cqe;
+ enum {
+ CQ_RESIZE_ALLOC,
+ CQ_RESIZE_READY,
+ CQ_RESIZE_SWAPPED
+ } state;
+};
+
+struct mthca_cq {
+ struct ib_cq ibcq;
+ spinlock_t lock;
+ int refcount;
+ int cqn;
+ u32 cons_index;
+ struct mthca_cq_buf buf;
+ struct mthca_cq_resize *resize_buf;
+ int is_kernel;
+
+ /* Next fields are Arbel only */
+ int set_ci_db_index;
+ __be32 *set_ci_db;
+ int arm_db_index;
+ __be32 *arm_db;
+ int arm_sn;
+
+ wait_queue_head_t wait;
+ struct mutex mutex;
+};
+
+struct mthca_srq {
+ struct ib_srq ibsrq;
+ spinlock_t lock;
+ int refcount;
+ int srqn;
+ int max;
+ int max_gs;
+ int wqe_shift;
+ int first_free;
+ int last_free;
+ u16 counter; /* Arbel only */
+ int db_index; /* Arbel only */
+ __be32 *db; /* Arbel only */
+ void *last;
+
+ int is_direct;
+ u64 *wrid;
+ union mthca_buf queue;
+ struct mthca_mr mr;
+
+ wait_queue_head_t wait;
+ struct mutex mutex;
+};
+
+struct mthca_wq {
+ spinlock_t lock;
+ int max;
+ unsigned next_ind;
+ unsigned last_comp;
+ unsigned head;
+ unsigned tail;
+ void *last;
+ int max_gs;
+ int wqe_shift;
+
+ int db_index; /* Arbel only */
+ __be32 *db;
+};
+
+struct mthca_qp {
+ struct ib_qp ibqp;
+ int refcount;
+ u32 qpn;
+ int is_direct;
+ u8 port; /* for SQP and memfree use only */
+ u8 alt_port; /* for memfree use only */
+ u8 transport;
+ u8 state;
+ u8 atomic_rd_en;
+ u8 resp_depth;
+
+ struct mthca_mr mr;
+
+ struct mthca_wq rq;
+ struct mthca_wq sq;
+ enum ib_sig_type sq_policy;
+ int send_wqe_offset;
+ int max_inline_data;
+
+ u64 *wrid;
+ union mthca_buf queue;
+
+ wait_queue_head_t wait;
+ struct mutex mutex;
+};
+
+struct mthca_sqp {
+ struct mthca_qp qp;
+ int pkey_index;
+ u32 qkey;
+ u32 send_psn;
+ struct ib_ud_header ud_header;
+ int header_buf_size;
+ void *header_buf;
+ dma_addr_t header_dma;
+};
+
+static inline struct mthca_ucontext *to_mucontext(struct ib_ucontext *ibucontext)
+{
+ return container_of(ibucontext, struct mthca_ucontext, ibucontext);
+}
+
+static inline struct mthca_fmr *to_mfmr(struct ib_fmr *ibmr)
+{
+ return container_of(ibmr, struct mthca_fmr, ibmr);
+}
+
+static inline struct mthca_mr *to_mmr(struct ib_mr *ibmr)
+{
+ return container_of(ibmr, struct mthca_mr, ibmr);
+}
+
+static inline struct mthca_pd *to_mpd(struct ib_pd *ibpd)
+{
+ return container_of(ibpd, struct mthca_pd, ibpd);
+}
+
+static inline struct mthca_ah *to_mah(struct ib_ah *ibah)
+{
+ return container_of(ibah, struct mthca_ah, ibah);
+}
+
+static inline struct mthca_cq *to_mcq(struct ib_cq *ibcq)
+{
+ return container_of(ibcq, struct mthca_cq, ibcq);
+}
+
+static inline struct mthca_srq *to_msrq(struct ib_srq *ibsrq)
+{
+ return container_of(ibsrq, struct mthca_srq, ibsrq);
+}
+
+static inline struct mthca_qp *to_mqp(struct ib_qp *ibqp)
+{
+ return container_of(ibqp, struct mthca_qp, ibqp);
+}
+
+static inline struct mthca_sqp *to_msqp(struct mthca_qp *qp)
+{
+ return container_of(qp, struct mthca_sqp, qp);
+}
+
+#endif /* MTHCA_PROVIDER_H */
diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c
new file mode 100644
index 000000000..e354b2f04
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_qp.c
@@ -0,0 +1,2311 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/string.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+
+#include <asm/io.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_cache.h>
+#include <rdma/ib_pack.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_memfree.h"
+#include "mthca_wqe.h"
+
+enum {
+ MTHCA_MAX_DIRECT_QP_SIZE = 4 * PAGE_SIZE,
+ MTHCA_ACK_REQ_FREQ = 10,
+ MTHCA_FLIGHT_LIMIT = 9,
+ MTHCA_UD_HEADER_SIZE = 72, /* largest UD header possible */
+ MTHCA_INLINE_HEADER_SIZE = 4, /* data segment overhead for inline */
+ MTHCA_INLINE_CHUNK_SIZE = 16 /* inline data segment chunk */
+};
+
+enum {
+ MTHCA_QP_STATE_RST = 0,
+ MTHCA_QP_STATE_INIT = 1,
+ MTHCA_QP_STATE_RTR = 2,
+ MTHCA_QP_STATE_RTS = 3,
+ MTHCA_QP_STATE_SQE = 4,
+ MTHCA_QP_STATE_SQD = 5,
+ MTHCA_QP_STATE_ERR = 6,
+ MTHCA_QP_STATE_DRAINING = 7
+};
+
+enum {
+ MTHCA_QP_ST_RC = 0x0,
+ MTHCA_QP_ST_UC = 0x1,
+ MTHCA_QP_ST_RD = 0x2,
+ MTHCA_QP_ST_UD = 0x3,
+ MTHCA_QP_ST_MLX = 0x7
+};
+
+enum {
+ MTHCA_QP_PM_MIGRATED = 0x3,
+ MTHCA_QP_PM_ARMED = 0x0,
+ MTHCA_QP_PM_REARM = 0x1
+};
+
+enum {
+ /* qp_context flags */
+ MTHCA_QP_BIT_DE = 1 << 8,
+ /* params1 */
+ MTHCA_QP_BIT_SRE = 1 << 15,
+ MTHCA_QP_BIT_SWE = 1 << 14,
+ MTHCA_QP_BIT_SAE = 1 << 13,
+ MTHCA_QP_BIT_SIC = 1 << 4,
+ MTHCA_QP_BIT_SSC = 1 << 3,
+ /* params2 */
+ MTHCA_QP_BIT_RRE = 1 << 15,
+ MTHCA_QP_BIT_RWE = 1 << 14,
+ MTHCA_QP_BIT_RAE = 1 << 13,
+ MTHCA_QP_BIT_RIC = 1 << 4,
+ MTHCA_QP_BIT_RSC = 1 << 3
+};
+
+enum {
+ MTHCA_SEND_DOORBELL_FENCE = 1 << 5
+};
+
+struct mthca_qp_path {
+ __be32 port_pkey;
+ u8 rnr_retry;
+ u8 g_mylmc;
+ __be16 rlid;
+ u8 ackto;
+ u8 mgid_index;
+ u8 static_rate;
+ u8 hop_limit;
+ __be32 sl_tclass_flowlabel;
+ u8 rgid[16];
+} __attribute__((packed));
+
+struct mthca_qp_context {
+ __be32 flags;
+ __be32 tavor_sched_queue; /* Reserved on Arbel */
+ u8 mtu_msgmax;
+ u8 rq_size_stride; /* Reserved on Tavor */
+ u8 sq_size_stride; /* Reserved on Tavor */
+ u8 rlkey_arbel_sched_queue; /* Reserved on Tavor */
+ __be32 usr_page;
+ __be32 local_qpn;
+ __be32 remote_qpn;
+ u32 reserved1[2];
+ struct mthca_qp_path pri_path;
+ struct mthca_qp_path alt_path;
+ __be32 rdd;
+ __be32 pd;
+ __be32 wqe_base;
+ __be32 wqe_lkey;
+ __be32 params1;
+ __be32 reserved2;
+ __be32 next_send_psn;
+ __be32 cqn_snd;
+ __be32 snd_wqe_base_l; /* Next send WQE on Tavor */
+ __be32 snd_db_index; /* (debugging only entries) */
+ __be32 last_acked_psn;
+ __be32 ssn;
+ __be32 params2;
+ __be32 rnr_nextrecvpsn;
+ __be32 ra_buff_indx;
+ __be32 cqn_rcv;
+ __be32 rcv_wqe_base_l; /* Next recv WQE on Tavor */
+ __be32 rcv_db_index; /* (debugging only entries) */
+ __be32 qkey;
+ __be32 srqn;
+ __be32 rmsn;
+ __be16 rq_wqe_counter; /* reserved on Tavor */
+ __be16 sq_wqe_counter; /* reserved on Tavor */
+ u32 reserved3[18];
+} __attribute__((packed));
+
+struct mthca_qp_param {
+ __be32 opt_param_mask;
+ u32 reserved1;
+ struct mthca_qp_context context;
+ u32 reserved2[62];
+} __attribute__((packed));
+
+enum {
+ MTHCA_QP_OPTPAR_ALT_ADDR_PATH = 1 << 0,
+ MTHCA_QP_OPTPAR_RRE = 1 << 1,
+ MTHCA_QP_OPTPAR_RAE = 1 << 2,
+ MTHCA_QP_OPTPAR_RWE = 1 << 3,
+ MTHCA_QP_OPTPAR_PKEY_INDEX = 1 << 4,
+ MTHCA_QP_OPTPAR_Q_KEY = 1 << 5,
+ MTHCA_QP_OPTPAR_RNR_TIMEOUT = 1 << 6,
+ MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH = 1 << 7,
+ MTHCA_QP_OPTPAR_SRA_MAX = 1 << 8,
+ MTHCA_QP_OPTPAR_RRA_MAX = 1 << 9,
+ MTHCA_QP_OPTPAR_PM_STATE = 1 << 10,
+ MTHCA_QP_OPTPAR_PORT_NUM = 1 << 11,
+ MTHCA_QP_OPTPAR_RETRY_COUNT = 1 << 12,
+ MTHCA_QP_OPTPAR_ALT_RNR_RETRY = 1 << 13,
+ MTHCA_QP_OPTPAR_ACK_TIMEOUT = 1 << 14,
+ MTHCA_QP_OPTPAR_RNR_RETRY = 1 << 15,
+ MTHCA_QP_OPTPAR_SCHED_QUEUE = 1 << 16
+};
+
+static const u8 mthca_opcode[] = {
+ [IB_WR_SEND] = MTHCA_OPCODE_SEND,
+ [IB_WR_SEND_WITH_IMM] = MTHCA_OPCODE_SEND_IMM,
+ [IB_WR_RDMA_WRITE] = MTHCA_OPCODE_RDMA_WRITE,
+ [IB_WR_RDMA_WRITE_WITH_IMM] = MTHCA_OPCODE_RDMA_WRITE_IMM,
+ [IB_WR_RDMA_READ] = MTHCA_OPCODE_RDMA_READ,
+ [IB_WR_ATOMIC_CMP_AND_SWP] = MTHCA_OPCODE_ATOMIC_CS,
+ [IB_WR_ATOMIC_FETCH_AND_ADD] = MTHCA_OPCODE_ATOMIC_FA,
+};
+
+static int is_sqp(struct mthca_dev *dev, struct mthca_qp *qp)
+{
+ return qp->qpn >= dev->qp_table.sqp_start &&
+ qp->qpn <= dev->qp_table.sqp_start + 3;
+}
+
+static int is_qp0(struct mthca_dev *dev, struct mthca_qp *qp)
+{
+ return qp->qpn >= dev->qp_table.sqp_start &&
+ qp->qpn <= dev->qp_table.sqp_start + 1;
+}
+
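+/*
+ * A QP's work queue buffer is either one contiguous ("direct")
+ * allocation or a list of pages; these helpers translate a WQE index
+ * into a kernel virtual address for either layout.
+ */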
+static void *get_recv_wqe(struct mthca_qp *qp, int n)
+{
+ if (qp->is_direct)
+ return qp->queue.direct.buf + (n << qp->rq.wqe_shift);
+ else
+ return qp->queue.page_list[(n << qp->rq.wqe_shift) >> PAGE_SHIFT].buf +
+ ((n << qp->rq.wqe_shift) & (PAGE_SIZE - 1));
+}
+
+static void *get_send_wqe(struct mthca_qp *qp, int n)
+{
+ if (qp->is_direct)
+ return qp->queue.direct.buf + qp->send_wqe_offset +
+ (n << qp->sq.wqe_shift);
+ else
+ return qp->queue.page_list[(qp->send_wqe_offset +
+ (n << qp->sq.wqe_shift)) >>
+ PAGE_SHIFT].buf +
+ ((qp->send_wqe_offset + (n << qp->sq.wqe_shift)) &
+ (PAGE_SIZE - 1));
+}
+
+static void mthca_wq_reset(struct mthca_wq *wq)
+{
+ wq->next_ind = 0;
+ wq->last_comp = wq->max - 1;
+ wq->head = 0;
+ wq->tail = 0;
+}
+
+void mthca_qp_event(struct mthca_dev *dev, u32 qpn,
+ enum ib_event_type event_type)
+{
+ struct mthca_qp *qp;
+ struct ib_event event;
+
+ spin_lock(&dev->qp_table.lock);
+ qp = mthca_array_get(&dev->qp_table.qp, qpn & (dev->limits.num_qps - 1));
+ if (qp)
+ ++qp->refcount;
+ spin_unlock(&dev->qp_table.lock);
+
+ if (!qp) {
+ mthca_warn(dev, "Async event %d for bogus QP %08x\n",
+ event_type, qpn);
+ return;
+ }
+
+ if (event_type == IB_EVENT_PATH_MIG)
+ qp->port = qp->alt_port;
+
+ event.device = &dev->ib_dev;
+ event.event = event_type;
+ event.element.qp = &qp->ibqp;
+ if (qp->ibqp.event_handler)
+ qp->ibqp.event_handler(&event, qp->ibqp.qp_context);
+
+ spin_lock(&dev->qp_table.lock);
+ if (!--qp->refcount)
+ wake_up(&qp->wait);
+ spin_unlock(&dev->qp_table.lock);
+}
+
+static int to_mthca_state(enum ib_qp_state ib_state)
+{
+ switch (ib_state) {
+ case IB_QPS_RESET: return MTHCA_QP_STATE_RST;
+ case IB_QPS_INIT: return MTHCA_QP_STATE_INIT;
+ case IB_QPS_RTR: return MTHCA_QP_STATE_RTR;
+ case IB_QPS_RTS: return MTHCA_QP_STATE_RTS;
+ case IB_QPS_SQD: return MTHCA_QP_STATE_SQD;
+ case IB_QPS_SQE: return MTHCA_QP_STATE_SQE;
+ case IB_QPS_ERR: return MTHCA_QP_STATE_ERR;
+ default: return -1;
+ }
+}
+
+enum { RC, UC, UD, RD, RDEE, MLX, NUM_TRANS };
+
+static int to_mthca_st(int transport)
+{
+ switch (transport) {
+ case RC: return MTHCA_QP_ST_RC;
+ case UC: return MTHCA_QP_ST_UC;
+ case UD: return MTHCA_QP_ST_UD;
+ case RD: return MTHCA_QP_ST_RD;
+ case MLX: return MTHCA_QP_ST_MLX;
+ default: return -1;
+ }
+}
+
+static void store_attrs(struct mthca_sqp *sqp, const struct ib_qp_attr *attr,
+ int attr_mask)
+{
+ if (attr_mask & IB_QP_PKEY_INDEX)
+ sqp->pkey_index = attr->pkey_index;
+ if (attr_mask & IB_QP_QKEY)
+ sqp->qkey = attr->qkey;
+ if (attr_mask & IB_QP_SQ_PSN)
+ sqp->send_psn = attr->sq_psn;
+}
+
+static void init_port(struct mthca_dev *dev, int port)
+{
+ int err;
+ struct mthca_init_ib_param param;
+
+ memset(&param, 0, sizeof param);
+
+ param.port_width = dev->limits.port_width_cap;
+ param.vl_cap = dev->limits.vl_cap;
+ param.mtu_cap = dev->limits.mtu_cap;
+ param.gid_cap = dev->limits.gid_table_len;
+ param.pkey_cap = dev->limits.pkey_table_len;
+
+ err = mthca_INIT_IB(dev, &param, port);
+ if (err)
+ mthca_warn(dev, "INIT_IB failed, return code %d.\n", err);
+}
+
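+/*
+ * Build the RWE/RRE/RAE bits of the QP context from the new access
+ * flags and responder resources, falling back to the values cached in
+ * the QP for attributes that are not being modified.  With no
+ * responder resources, only remote writes may stay enabled.
+ */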
+static __be32 get_hw_access_flags(struct mthca_qp *qp, const struct ib_qp_attr *attr,
+ int attr_mask)
+{
+ u8 dest_rd_atomic;
+ u32 access_flags;
+ u32 hw_access_flags = 0;
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+ dest_rd_atomic = attr->max_dest_rd_atomic;
+ else
+ dest_rd_atomic = qp->resp_depth;
+
+ if (attr_mask & IB_QP_ACCESS_FLAGS)
+ access_flags = attr->qp_access_flags;
+ else
+ access_flags = qp->atomic_rd_en;
+
+ if (!dest_rd_atomic)
+ access_flags &= IB_ACCESS_REMOTE_WRITE;
+
+ if (access_flags & IB_ACCESS_REMOTE_READ)
+ hw_access_flags |= MTHCA_QP_BIT_RRE;
+ if (access_flags & IB_ACCESS_REMOTE_ATOMIC)
+ hw_access_flags |= MTHCA_QP_BIT_RAE;
+ if (access_flags & IB_ACCESS_REMOTE_WRITE)
+ hw_access_flags |= MTHCA_QP_BIT_RWE;
+
+ return cpu_to_be32(hw_access_flags);
+}
+
+static inline enum ib_qp_state to_ib_qp_state(int mthca_state)
+{
+ switch (mthca_state) {
+ case MTHCA_QP_STATE_RST: return IB_QPS_RESET;
+ case MTHCA_QP_STATE_INIT: return IB_QPS_INIT;
+ case MTHCA_QP_STATE_RTR: return IB_QPS_RTR;
+ case MTHCA_QP_STATE_RTS: return IB_QPS_RTS;
+ case MTHCA_QP_STATE_DRAINING:
+ case MTHCA_QP_STATE_SQD: return IB_QPS_SQD;
+ case MTHCA_QP_STATE_SQE: return IB_QPS_SQE;
+ case MTHCA_QP_STATE_ERR: return IB_QPS_ERR;
+ default: return -1;
+ }
+}
+
+static inline enum ib_mig_state to_ib_mig_state(int mthca_mig_state)
+{
+ switch (mthca_mig_state) {
+ case 0: return IB_MIG_ARMED;
+ case 1: return IB_MIG_REARM;
+ case 3: return IB_MIG_MIGRATED;
+ default: return -1;
+ }
+}
+
+static int to_ib_qp_access_flags(int mthca_flags)
+{
+ int ib_flags = 0;
+
+ if (mthca_flags & MTHCA_QP_BIT_RRE)
+ ib_flags |= IB_ACCESS_REMOTE_READ;
+ if (mthca_flags & MTHCA_QP_BIT_RWE)
+ ib_flags |= IB_ACCESS_REMOTE_WRITE;
+ if (mthca_flags & MTHCA_QP_BIT_RAE)
+ ib_flags |= IB_ACCESS_REMOTE_ATOMIC;
+
+ return ib_flags;
+}
+
+static void to_ib_ah_attr(struct mthca_dev *dev, struct ib_ah_attr *ib_ah_attr,
+ struct mthca_qp_path *path)
+{
+ memset(ib_ah_attr, 0, sizeof *ib_ah_attr);
+ ib_ah_attr->port_num = (be32_to_cpu(path->port_pkey) >> 24) & 0x3;
+
+ if (ib_ah_attr->port_num == 0 || ib_ah_attr->port_num > dev->limits.num_ports)
+ return;
+
+ ib_ah_attr->dlid = be16_to_cpu(path->rlid);
+ ib_ah_attr->sl = be32_to_cpu(path->sl_tclass_flowlabel) >> 28;
+ ib_ah_attr->src_path_bits = path->g_mylmc & 0x7f;
+ ib_ah_attr->static_rate = mthca_rate_to_ib(dev,
+ path->static_rate & 0xf,
+ ib_ah_attr->port_num);
+ ib_ah_attr->ah_flags = (path->g_mylmc & (1 << 7)) ? IB_AH_GRH : 0;
+ if (ib_ah_attr->ah_flags) {
+ ib_ah_attr->grh.sgid_index = path->mgid_index & (dev->limits.gid_table_len - 1);
+ ib_ah_attr->grh.hop_limit = path->hop_limit;
+ ib_ah_attr->grh.traffic_class =
+ (be32_to_cpu(path->sl_tclass_flowlabel) >> 20) & 0xff;
+ ib_ah_attr->grh.flow_label =
+ be32_to_cpu(path->sl_tclass_flowlabel) & 0xfffff;
+ memcpy(ib_ah_attr->grh.dgid.raw,
+ path->rgid, sizeof ib_ah_attr->grh.dgid.raw);
+ }
+}
+
+int mthca_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ struct mthca_qp *qp = to_mqp(ibqp);
+ int err = 0;
+ struct mthca_mailbox *mailbox = NULL;
+ struct mthca_qp_param *qp_param;
+ struct mthca_qp_context *context;
+ int mthca_state;
+
+ mutex_lock(&qp->mutex);
+
+ if (qp->state == IB_QPS_RESET) {
+ qp_attr->qp_state = IB_QPS_RESET;
+ goto done;
+ }
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox)) {
+ err = PTR_ERR(mailbox);
+ goto out;
+ }
+
+ err = mthca_QUERY_QP(dev, qp->qpn, 0, mailbox);
+ if (err) {
+ mthca_warn(dev, "QUERY_QP failed (%d)\n", err);
+ goto out_mailbox;
+ }
+
+ qp_param = mailbox->buf;
+ context = &qp_param->context;
+ mthca_state = be32_to_cpu(context->flags) >> 28;
+
+ qp->state = to_ib_qp_state(mthca_state);
+ qp_attr->qp_state = qp->state;
+ qp_attr->path_mtu = context->mtu_msgmax >> 5;
+ qp_attr->path_mig_state =
+ to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3);
+ qp_attr->qkey = be32_to_cpu(context->qkey);
+ qp_attr->rq_psn = be32_to_cpu(context->rnr_nextrecvpsn) & 0xffffff;
+ qp_attr->sq_psn = be32_to_cpu(context->next_send_psn) & 0xffffff;
+ qp_attr->dest_qp_num = be32_to_cpu(context->remote_qpn) & 0xffffff;
+ qp_attr->qp_access_flags =
+ to_ib_qp_access_flags(be32_to_cpu(context->params2));
+
+ if (qp->transport == RC || qp->transport == UC) {
+ to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path);
+ to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path);
+ qp_attr->alt_pkey_index =
+ be32_to_cpu(context->alt_path.port_pkey) & 0x7f;
+ qp_attr->alt_port_num = qp_attr->alt_ah_attr.port_num;
+ }
+
+ qp_attr->pkey_index = be32_to_cpu(context->pri_path.port_pkey) & 0x7f;
+ qp_attr->port_num =
+ (be32_to_cpu(context->pri_path.port_pkey) >> 24) & 0x3;
+
+ /* qp_attr->en_sqd_async_notify is only applicable in modify qp */
+ qp_attr->sq_draining = mthca_state == MTHCA_QP_STATE_DRAINING;
+
+ qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context->params1) >> 21) & 0x7);
+
+ qp_attr->max_dest_rd_atomic =
+ 1 << ((be32_to_cpu(context->params2) >> 21) & 0x7);
+ qp_attr->min_rnr_timer =
+ (be32_to_cpu(context->rnr_nextrecvpsn) >> 24) & 0x1f;
+ qp_attr->timeout = context->pri_path.ackto >> 3;
+ qp_attr->retry_cnt = (be32_to_cpu(context->params1) >> 16) & 0x7;
+ qp_attr->rnr_retry = context->pri_path.rnr_retry >> 5;
+ qp_attr->alt_timeout = context->alt_path.ackto >> 3;
+
+done:
+ qp_attr->cur_qp_state = qp_attr->qp_state;
+ qp_attr->cap.max_send_wr = qp->sq.max;
+ qp_attr->cap.max_recv_wr = qp->rq.max;
+ qp_attr->cap.max_send_sge = qp->sq.max_gs;
+ qp_attr->cap.max_recv_sge = qp->rq.max_gs;
+ qp_attr->cap.max_inline_data = qp->max_inline_data;
+
+ qp_init_attr->cap = qp_attr->cap;
+ qp_init_attr->sq_sig_type = qp->sq_policy;
+
+out_mailbox:
+ mthca_free_mailbox(dev, mailbox);
+
+out:
+ mutex_unlock(&qp->mutex);
+ return err;
+}
+
+static int mthca_path_set(struct mthca_dev *dev, const struct ib_ah_attr *ah,
+ struct mthca_qp_path *path, u8 port)
+{
+ path->g_mylmc = ah->src_path_bits & 0x7f;
+ path->rlid = cpu_to_be16(ah->dlid);
+ path->static_rate = mthca_get_rate(dev, ah->static_rate, port);
+
+ if (ah->ah_flags & IB_AH_GRH) {
+ if (ah->grh.sgid_index >= dev->limits.gid_table_len) {
+ mthca_dbg(dev, "sgid_index (%u) too large. max is %d\n",
+ ah->grh.sgid_index, dev->limits.gid_table_len-1);
+ return -1;
+ }
+
+ path->g_mylmc |= 1 << 7;
+ path->mgid_index = ah->grh.sgid_index;
+ path->hop_limit = ah->grh.hop_limit;
+ path->sl_tclass_flowlabel =
+ cpu_to_be32((ah->sl << 28) |
+ (ah->grh.traffic_class << 20) |
+ (ah->grh.flow_label));
+ memcpy(path->rgid, ah->grh.dgid.raw, 16);
+ } else
+ path->sl_tclass_flowlabel = cpu_to_be32(ah->sl << 28);
+
+ return 0;
+}
+
+static int __mthca_modify_qp(struct ib_qp *ibqp,
+ const struct ib_qp_attr *attr, int attr_mask,
+ enum ib_qp_state cur_state, enum ib_qp_state new_state)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ struct mthca_qp *qp = to_mqp(ibqp);
+ struct mthca_mailbox *mailbox;
+ struct mthca_qp_param *qp_param;
+ struct mthca_qp_context *qp_context;
+ u32 sqd_event = 0;
+ int err = -EINVAL;
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox)) {
+ err = PTR_ERR(mailbox);
+ goto out;
+ }
+ qp_param = mailbox->buf;
+ qp_context = &qp_param->context;
+ memset(qp_param, 0, sizeof *qp_param);
+
+ qp_context->flags = cpu_to_be32((to_mthca_state(new_state) << 28) |
+ (to_mthca_st(qp->transport) << 16));
+ qp_context->flags |= cpu_to_be32(MTHCA_QP_BIT_DE);
+ if (!(attr_mask & IB_QP_PATH_MIG_STATE))
+ qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_MIGRATED << 11);
+ else {
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PM_STATE);
+ switch (attr->path_mig_state) {
+ case IB_MIG_MIGRATED:
+ qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_MIGRATED << 11);
+ break;
+ case IB_MIG_REARM:
+ qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_REARM << 11);
+ break;
+ case IB_MIG_ARMED:
+ qp_context->flags |= cpu_to_be32(MTHCA_QP_PM_ARMED << 11);
+ break;
+ }
+ }
+
+ /* leave tavor_sched_queue as 0 */
+
+ if (qp->transport == MLX || qp->transport == UD)
+ qp_context->mtu_msgmax = (IB_MTU_2048 << 5) | 11;
+ else if (attr_mask & IB_QP_PATH_MTU) {
+ if (attr->path_mtu < IB_MTU_256 || attr->path_mtu > IB_MTU_2048) {
+ mthca_dbg(dev, "path MTU (%u) is invalid\n",
+ attr->path_mtu);
+ goto out_mailbox;
+ }
+ qp_context->mtu_msgmax = (attr->path_mtu << 5) | 31;
+ }
+
+ if (mthca_is_memfree(dev)) {
+ if (qp->rq.max)
+ qp_context->rq_size_stride = ilog2(qp->rq.max) << 3;
+ qp_context->rq_size_stride |= qp->rq.wqe_shift - 4;
+
+ if (qp->sq.max)
+ qp_context->sq_size_stride = ilog2(qp->sq.max) << 3;
+ qp_context->sq_size_stride |= qp->sq.wqe_shift - 4;
+ }
+
+ /* leave arbel_sched_queue as 0 */
+
+ if (qp->ibqp.uobject)
+ qp_context->usr_page =
+ cpu_to_be32(to_mucontext(qp->ibqp.uobject->context)->uar.index);
+ else
+ qp_context->usr_page = cpu_to_be32(dev->driver_uar.index);
+ qp_context->local_qpn = cpu_to_be32(qp->qpn);
+ if (attr_mask & IB_QP_DEST_QPN) {
+ qp_context->remote_qpn = cpu_to_be32(attr->dest_qp_num);
+ }
+
+ if (qp->transport == MLX)
+ qp_context->pri_path.port_pkey |=
+ cpu_to_be32(qp->port << 24);
+ else {
+ if (attr_mask & IB_QP_PORT) {
+ qp_context->pri_path.port_pkey |=
+ cpu_to_be32(attr->port_num << 24);
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PORT_NUM);
+ }
+ }
+
+ if (attr_mask & IB_QP_PKEY_INDEX) {
+ qp_context->pri_path.port_pkey |=
+ cpu_to_be32(attr->pkey_index);
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PKEY_INDEX);
+ }
+
+ if (attr_mask & IB_QP_RNR_RETRY) {
+ qp_context->alt_path.rnr_retry = qp_context->pri_path.rnr_retry =
+ attr->rnr_retry << 5;
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RNR_RETRY |
+ MTHCA_QP_OPTPAR_ALT_RNR_RETRY);
+ }
+
+ if (attr_mask & IB_QP_AV) {
+ if (mthca_path_set(dev, &attr->ah_attr, &qp_context->pri_path,
+ attr_mask & IB_QP_PORT ? attr->port_num : qp->port))
+ goto out_mailbox;
+
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_PRIMARY_ADDR_PATH);
+ }
+
+ if (ibqp->qp_type == IB_QPT_RC &&
+ cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR) {
+ u8 sched_queue = ibqp->uobject ? 0x2 : 0x1;
+
+ if (mthca_is_memfree(dev))
+ qp_context->rlkey_arbel_sched_queue |= sched_queue;
+ else
+ qp_context->tavor_sched_queue |= cpu_to_be32(sched_queue);
+
+ qp_param->opt_param_mask |=
+ cpu_to_be32(MTHCA_QP_OPTPAR_SCHED_QUEUE);
+ }
+
+ if (attr_mask & IB_QP_TIMEOUT) {
+ qp_context->pri_path.ackto = attr->timeout << 3;
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_ACK_TIMEOUT);
+ }
+
+ if (attr_mask & IB_QP_ALT_PATH) {
+ if (attr->alt_pkey_index >= dev->limits.pkey_table_len) {
+ mthca_dbg(dev, "Alternate P_Key index (%u) too large. max is %d\n",
+ attr->alt_pkey_index, dev->limits.pkey_table_len-1);
+ goto out_mailbox;
+ }
+
+ if (attr->alt_port_num == 0 || attr->alt_port_num > dev->limits.num_ports) {
+ mthca_dbg(dev, "Alternate port number (%u) is invalid\n",
+ attr->alt_port_num);
+ goto out_mailbox;
+ }
+
+ if (mthca_path_set(dev, &attr->alt_ah_attr, &qp_context->alt_path,
+ attr->alt_ah_attr.port_num))
+ goto out_mailbox;
+
+ qp_context->alt_path.port_pkey |= cpu_to_be32(attr->alt_pkey_index |
+ attr->alt_port_num << 24);
+ qp_context->alt_path.ackto = attr->alt_timeout << 3;
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_ALT_ADDR_PATH);
+ }
+
+ /* leave rdd as 0 */
+ qp_context->pd = cpu_to_be32(to_mpd(ibqp->pd)->pd_num);
+ /* leave wqe_base as 0 (we always create an MR based at 0 for WQs) */
+ qp_context->wqe_lkey = cpu_to_be32(qp->mr.ibmr.lkey);
+ qp_context->params1 = cpu_to_be32((MTHCA_ACK_REQ_FREQ << 28) |
+ (MTHCA_FLIGHT_LIMIT << 24) |
+ MTHCA_QP_BIT_SWE);
+ if (qp->sq_policy == IB_SIGNAL_ALL_WR)
+ qp_context->params1 |= cpu_to_be32(MTHCA_QP_BIT_SSC);
+ if (attr_mask & IB_QP_RETRY_CNT) {
+ qp_context->params1 |= cpu_to_be32(attr->retry_cnt << 16);
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RETRY_COUNT);
+ }
+
+ if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
+ if (attr->max_rd_atomic) {
+ qp_context->params1 |=
+ cpu_to_be32(MTHCA_QP_BIT_SRE |
+ MTHCA_QP_BIT_SAE);
+ qp_context->params1 |=
+ cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21);
+ }
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_SRA_MAX);
+ }
+
+ if (attr_mask & IB_QP_SQ_PSN)
+ qp_context->next_send_psn = cpu_to_be32(attr->sq_psn);
+ qp_context->cqn_snd = cpu_to_be32(to_mcq(ibqp->send_cq)->cqn);
+
+ if (mthca_is_memfree(dev)) {
+ qp_context->snd_wqe_base_l = cpu_to_be32(qp->send_wqe_offset);
+ qp_context->snd_db_index = cpu_to_be32(qp->sq.db_index);
+ }
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+ if (attr->max_dest_rd_atomic)
+ qp_context->params2 |=
+ cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21);
+
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RRA_MAX);
+ }
+
+ if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC)) {
+ qp_context->params2 |= get_hw_access_flags(qp, attr, attr_mask);
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RWE |
+ MTHCA_QP_OPTPAR_RRE |
+ MTHCA_QP_OPTPAR_RAE);
+ }
+
+ qp_context->params2 |= cpu_to_be32(MTHCA_QP_BIT_RSC);
+
+ if (ibqp->srq)
+ qp_context->params2 |= cpu_to_be32(MTHCA_QP_BIT_RIC);
+
+ if (attr_mask & IB_QP_MIN_RNR_TIMER) {
+ qp_context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_RNR_TIMEOUT);
+ }
+ if (attr_mask & IB_QP_RQ_PSN)
+ qp_context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn);
+
+ qp_context->ra_buff_indx =
+ cpu_to_be32(dev->qp_table.rdb_base +
+ ((qp->qpn & (dev->limits.num_qps - 1)) * MTHCA_RDB_ENTRY_SIZE <<
+ dev->qp_table.rdb_shift));
+
+ qp_context->cqn_rcv = cpu_to_be32(to_mcq(ibqp->recv_cq)->cqn);
+
+ if (mthca_is_memfree(dev))
+ qp_context->rcv_db_index = cpu_to_be32(qp->rq.db_index);
+
+ if (attr_mask & IB_QP_QKEY) {
+ qp_context->qkey = cpu_to_be32(attr->qkey);
+ qp_param->opt_param_mask |= cpu_to_be32(MTHCA_QP_OPTPAR_Q_KEY);
+ }
+
+ if (ibqp->srq)
+ qp_context->srqn = cpu_to_be32(1 << 24 |
+ to_msrq(ibqp->srq)->srqn);
+
+ if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD &&
+ attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY &&
+ attr->en_sqd_async_notify)
+ sqd_event = 1 << 31;
+
+ err = mthca_MODIFY_QP(dev, cur_state, new_state, qp->qpn, 0,
+ mailbox, sqd_event);
+ if (err) {
+ mthca_warn(dev, "modify QP %d->%d returned %d.\n",
+ cur_state, new_state, err);
+ goto out_mailbox;
+ }
+
+ qp->state = new_state;
+ if (attr_mask & IB_QP_ACCESS_FLAGS)
+ qp->atomic_rd_en = attr->qp_access_flags;
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+ qp->resp_depth = attr->max_dest_rd_atomic;
+ if (attr_mask & IB_QP_PORT)
+ qp->port = attr->port_num;
+ if (attr_mask & IB_QP_ALT_PATH)
+ qp->alt_port = attr->alt_port_num;
+
+ if (is_sqp(dev, qp))
+ store_attrs(to_msqp(qp), attr, attr_mask);
+
+ /*
+ * If we moved QP0 to RTR, bring the IB link up; if we moved
+ * QP0 to RESET or ERROR, bring the link back down.
+ */
+ if (is_qp0(dev, qp)) {
+ if (cur_state != IB_QPS_RTR &&
+ new_state == IB_QPS_RTR)
+ init_port(dev, qp->port);
+
+ if (cur_state != IB_QPS_RESET &&
+ cur_state != IB_QPS_ERR &&
+ (new_state == IB_QPS_RESET ||
+ new_state == IB_QPS_ERR))
+ mthca_CLOSE_IB(dev, qp->port);
+ }
+
+ /*
+ * If we moved a kernel QP to RESET, clean up all old CQ
+ * entries and reinitialize the QP.
+ */
+ if (new_state == IB_QPS_RESET && !qp->ibqp.uobject) {
+ mthca_cq_clean(dev, to_mcq(qp->ibqp.recv_cq), qp->qpn,
+ qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL);
+ if (qp->ibqp.send_cq != qp->ibqp.recv_cq)
+ mthca_cq_clean(dev, to_mcq(qp->ibqp.send_cq), qp->qpn, NULL);
+
+ mthca_wq_reset(&qp->sq);
+ qp->sq.last = get_send_wqe(qp, qp->sq.max - 1);
+
+ mthca_wq_reset(&qp->rq);
+ qp->rq.last = get_recv_wqe(qp, qp->rq.max - 1);
+
+ if (mthca_is_memfree(dev)) {
+ *qp->sq.db = 0;
+ *qp->rq.db = 0;
+ }
+ }
+
+out_mailbox:
+ mthca_free_mailbox(dev, mailbox);
+out:
+ return err;
+}
+
+int mthca_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, int attr_mask,
+ struct ib_udata *udata)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ struct mthca_qp *qp = to_mqp(ibqp);
+ enum ib_qp_state cur_state, new_state;
+ int err = -EINVAL;
+
+ mutex_lock(&qp->mutex);
+ if (attr_mask & IB_QP_CUR_STATE) {
+ cur_state = attr->cur_qp_state;
+ } else {
+ spin_lock_irq(&qp->sq.lock);
+ spin_lock(&qp->rq.lock);
+ cur_state = qp->state;
+ spin_unlock(&qp->rq.lock);
+ spin_unlock_irq(&qp->sq.lock);
+ }
+
+ new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+ if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask,
+ IB_LINK_LAYER_UNSPECIFIED)) {
+ mthca_dbg(dev, "Bad QP transition (transport %d) "
+ "%d->%d with attr 0x%08x\n",
+ qp->transport, cur_state, new_state,
+ attr_mask);
+ goto out;
+ }
+
+ if ((attr_mask & IB_QP_PKEY_INDEX) &&
+ attr->pkey_index >= dev->limits.pkey_table_len) {
+ mthca_dbg(dev, "P_Key index (%u) too large. max is %d\n",
+ attr->pkey_index, dev->limits.pkey_table_len-1);
+ goto out;
+ }
+
+ if ((attr_mask & IB_QP_PORT) &&
+ (attr->port_num == 0 || attr->port_num > dev->limits.num_ports)) {
+ mthca_dbg(dev, "Port number (%u) is invalid\n", attr->port_num);
+ goto out;
+ }
+
+ if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
+ attr->max_rd_atomic > dev->limits.max_qp_init_rdma) {
+ mthca_dbg(dev, "Max rdma_atomic as initiator %u too large (max is %d)\n",
+ attr->max_rd_atomic, dev->limits.max_qp_init_rdma);
+ goto out;
+ }
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
+ attr->max_dest_rd_atomic > 1 << dev->qp_table.rdb_shift) {
+ mthca_dbg(dev, "Max rdma_atomic as responder %u too large (max %d)\n",
+ attr->max_dest_rd_atomic, 1 << dev->qp_table.rdb_shift);
+ goto out;
+ }
+
+ if (cur_state == new_state && cur_state == IB_QPS_RESET) {
+ err = 0;
+ goto out;
+ }
+
+ err = __mthca_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
+
+out:
+ mutex_unlock(&qp->mutex);
+ return err;
+}
+
+static int mthca_max_data_size(struct mthca_dev *dev, struct mthca_qp *qp, int desc_sz)
+{
+ /*
+ * Calculate the maximum size of WQE s/g segments, excluding
+ * the next segment and other non-data segments.
+ */
+ int max_data_size = desc_sz - sizeof (struct mthca_next_seg);
+
+ switch (qp->transport) {
+ case MLX:
+ max_data_size -= 2 * sizeof (struct mthca_data_seg);
+ break;
+
+ case UD:
+ if (mthca_is_memfree(dev))
+ max_data_size -= sizeof (struct mthca_arbel_ud_seg);
+ else
+ max_data_size -= sizeof (struct mthca_tavor_ud_seg);
+ break;
+
+ default:
+ max_data_size -= sizeof (struct mthca_raddr_seg);
+ break;
+ }
+
+ return max_data_size;
+}
+
+static inline int mthca_max_inline_data(struct mthca_pd *pd, int max_data_size)
+{
+ /* We don't support inline data for kernel QPs (yet). */
+ return pd->ibpd.uobject ? max_data_size - MTHCA_INLINE_HEADER_SIZE : 0;
+}
+
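+/*
+ * Trim the QP's capabilities to what actually fits in the WQE sizes
+ * chosen by mthca_alloc_wqe_buf(), and compute how much inline data a
+ * userspace QP may carry.
+ */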
+static void mthca_adjust_qp_caps(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct mthca_qp *qp)
+{
+ int max_data_size = mthca_max_data_size(dev, qp,
+ min(dev->limits.max_desc_sz,
+ 1 << qp->sq.wqe_shift));
+
+ qp->max_inline_data = mthca_max_inline_data(pd, max_data_size);
+
+ qp->sq.max_gs = min_t(int, dev->limits.max_sg,
+ max_data_size / sizeof (struct mthca_data_seg));
+ qp->rq.max_gs = min_t(int, dev->limits.max_sg,
+ (min(dev->limits.max_desc_sz, 1 << qp->rq.wqe_shift) -
+ sizeof (struct mthca_next_seg)) /
+ sizeof (struct mthca_data_seg));
+}
+
+/*
+ * Allocate and register buffer for WQEs. qp->rq.max, sq.max,
+ * rq.max_gs and sq.max_gs must all be assigned.
+ * mthca_alloc_wqe_buf will calculate rq.wqe_shift and
+ * sq.wqe_shift (as well as send_wqe_offset, is_direct, and
+ * queue)
+ */
+static int mthca_alloc_wqe_buf(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct mthca_qp *qp)
+{
+ int size;
+ int err = -ENOMEM;
+
+ size = sizeof (struct mthca_next_seg) +
+ qp->rq.max_gs * sizeof (struct mthca_data_seg);
+
+ if (size > dev->limits.max_desc_sz)
+ return -EINVAL;
+
+ for (qp->rq.wqe_shift = 6; 1 << qp->rq.wqe_shift < size;
+ qp->rq.wqe_shift++)
+ ; /* nothing */
+
+ size = qp->sq.max_gs * sizeof (struct mthca_data_seg);
+ switch (qp->transport) {
+ case MLX:
+ size += 2 * sizeof (struct mthca_data_seg);
+ break;
+
+ case UD:
+ size += mthca_is_memfree(dev) ?
+ sizeof (struct mthca_arbel_ud_seg) :
+ sizeof (struct mthca_tavor_ud_seg);
+ break;
+
+ case UC:
+ size += sizeof (struct mthca_raddr_seg);
+ break;
+
+ case RC:
+ size += sizeof (struct mthca_raddr_seg);
+ /*
+ * An atomic op will require an atomic segment, a
+ * remote address segment and one scatter entry.
+ */
+ size = max_t(int, size,
+ sizeof (struct mthca_atomic_seg) +
+ sizeof (struct mthca_raddr_seg) +
+ sizeof (struct mthca_data_seg));
+ break;
+
+ default:
+ break;
+ }
+
+ /* Make sure that we have enough space for a bind request */
+ size = max_t(int, size, sizeof (struct mthca_bind_seg));
+
+ size += sizeof (struct mthca_next_seg);
+
+ if (size > dev->limits.max_desc_sz)
+ return -EINVAL;
+
+ for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
+ qp->sq.wqe_shift++)
+ ; /* nothing */
+
+ qp->send_wqe_offset = ALIGN(qp->rq.max << qp->rq.wqe_shift,
+ 1 << qp->sq.wqe_shift);
+
+ /*
+ * If this is a userspace QP, we don't actually have to
+ * allocate anything. All we need is to calculate the WQE
+ * sizes and the send_wqe_offset, so we're done now.
+ */
+ if (pd->ibpd.uobject)
+ return 0;
+
+ size = PAGE_ALIGN(qp->send_wqe_offset +
+ (qp->sq.max << qp->sq.wqe_shift));
+
+ qp->wrid = kmalloc((qp->rq.max + qp->sq.max) * sizeof (u64),
+ GFP_KERNEL);
+ if (!qp->wrid)
+ goto err_out;
+
+ err = mthca_buf_alloc(dev, size, MTHCA_MAX_DIRECT_QP_SIZE,
+ &qp->queue, &qp->is_direct, pd, 0, &qp->mr);
+ if (err)
+ goto err_out;
+
+ return 0;
+
+err_out:
+ kfree(qp->wrid);
+ return err;
+}
+
+static void mthca_free_wqe_buf(struct mthca_dev *dev,
+ struct mthca_qp *qp)
+{
+ mthca_buf_free(dev, PAGE_ALIGN(qp->send_wqe_offset +
+ (qp->sq.max << qp->sq.wqe_shift)),
+ &qp->queue, qp->is_direct, &qp->mr);
+ kfree(qp->wrid);
+}
+
+static int mthca_map_memfree(struct mthca_dev *dev,
+ struct mthca_qp *qp)
+{
+ int ret;
+
+ if (mthca_is_memfree(dev)) {
+ ret = mthca_table_get(dev, dev->qp_table.qp_table, qp->qpn);
+ if (ret)
+ return ret;
+
+ ret = mthca_table_get(dev, dev->qp_table.eqp_table, qp->qpn);
+ if (ret)
+ goto err_qpc;
+
+ ret = mthca_table_get(dev, dev->qp_table.rdb_table,
+ qp->qpn << dev->qp_table.rdb_shift);
+ if (ret)
+ goto err_eqpc;
+
+ }
+
+ return 0;
+
+err_eqpc:
+ mthca_table_put(dev, dev->qp_table.eqp_table, qp->qpn);
+
+err_qpc:
+ mthca_table_put(dev, dev->qp_table.qp_table, qp->qpn);
+
+ return ret;
+}
+
+static void mthca_unmap_memfree(struct mthca_dev *dev,
+ struct mthca_qp *qp)
+{
+ mthca_table_put(dev, dev->qp_table.rdb_table,
+ qp->qpn << dev->qp_table.rdb_shift);
+ mthca_table_put(dev, dev->qp_table.eqp_table, qp->qpn);
+ mthca_table_put(dev, dev->qp_table.qp_table, qp->qpn);
+}
+
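+/*
+ * Mem-free (Arbel) HCAs keep doorbell records in host memory, so a
+ * kernel QP needs one doorbell record for each of its work queues;
+ * Tavor HCAs need nothing here.
+ */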
+static int mthca_alloc_memfree(struct mthca_dev *dev,
+ struct mthca_qp *qp)
+{
+ if (mthca_is_memfree(dev)) {
+ qp->rq.db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_RQ,
+ qp->qpn, &qp->rq.db);
+ if (qp->rq.db_index < 0)
+ return -ENOMEM;
+
+ qp->sq.db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_SQ,
+ qp->qpn, &qp->sq.db);
+ if (qp->sq.db_index < 0) {
+ mthca_free_db(dev, MTHCA_DB_TYPE_RQ, qp->rq.db_index);
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
+static void mthca_free_memfree(struct mthca_dev *dev,
+ struct mthca_qp *qp)
+{
+ if (mthca_is_memfree(dev)) {
+ mthca_free_db(dev, MTHCA_DB_TYPE_SQ, qp->sq.db_index);
+ mthca_free_db(dev, MTHCA_DB_TYPE_RQ, qp->rq.db_index);
+ }
+}
+
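+/*
+ * Initialisation shared by regular and special QPs: reference count,
+ * locks and WQE geometry are always set up here; WQE buffers,
+ * doorbells and the initial WQE chaining are only needed for kernel
+ * QPs, since userspace allocates and initialises its own queues.
+ */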
+static int mthca_alloc_qp_common(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct mthca_cq *send_cq,
+ struct mthca_cq *recv_cq,
+ enum ib_sig_type send_policy,
+ struct mthca_qp *qp)
+{
+ int ret;
+ int i;
+ struct mthca_next_seg *next;
+
+ qp->refcount = 1;
+ init_waitqueue_head(&qp->wait);
+ mutex_init(&qp->mutex);
+ qp->state = IB_QPS_RESET;
+ qp->atomic_rd_en = 0;
+ qp->resp_depth = 0;
+ qp->sq_policy = send_policy;
+ mthca_wq_reset(&qp->sq);
+ mthca_wq_reset(&qp->rq);
+
+ spin_lock_init(&qp->sq.lock);
+ spin_lock_init(&qp->rq.lock);
+
+ ret = mthca_map_memfree(dev, qp);
+ if (ret)
+ return ret;
+
+ ret = mthca_alloc_wqe_buf(dev, pd, qp);
+ if (ret) {
+ mthca_unmap_memfree(dev, qp);
+ return ret;
+ }
+
+ mthca_adjust_qp_caps(dev, pd, qp);
+
+ /*
+ * If this is a userspace QP, we're done now. The doorbells
+ * will be allocated and buffers will be initialized in
+ * userspace.
+ */
+ if (pd->ibpd.uobject)
+ return 0;
+
+ ret = mthca_alloc_memfree(dev, qp);
+ if (ret) {
+ mthca_free_wqe_buf(dev, qp);
+ mthca_unmap_memfree(dev, qp);
+ return ret;
+ }
+
+ if (mthca_is_memfree(dev)) {
+ struct mthca_data_seg *scatter;
+ int size = (sizeof (struct mthca_next_seg) +
+ qp->rq.max_gs * sizeof (struct mthca_data_seg)) / 16;
+
+ for (i = 0; i < qp->rq.max; ++i) {
+ next = get_recv_wqe(qp, i);
+ next->nda_op = cpu_to_be32(((i + 1) & (qp->rq.max - 1)) <<
+ qp->rq.wqe_shift);
+ next->ee_nds = cpu_to_be32(size);
+
+ for (scatter = (void *) (next + 1);
+ (void *) scatter < (void *) next + (1 << qp->rq.wqe_shift);
+ ++scatter)
+ scatter->lkey = cpu_to_be32(MTHCA_INVAL_LKEY);
+ }
+
+ for (i = 0; i < qp->sq.max; ++i) {
+ next = get_send_wqe(qp, i);
+ next->nda_op = cpu_to_be32((((i + 1) & (qp->sq.max - 1)) <<
+ qp->sq.wqe_shift) +
+ qp->send_wqe_offset);
+ }
+ } else {
+ for (i = 0; i < qp->rq.max; ++i) {
+ next = get_recv_wqe(qp, i);
+ next->nda_op = cpu_to_be32((((i + 1) % qp->rq.max) <<
+ qp->rq.wqe_shift) | 1);
+ }
+
+ }
+
+ qp->sq.last = get_send_wqe(qp, qp->sq.max - 1);
+ qp->rq.last = get_recv_wqe(qp, qp->rq.max - 1);
+
+ return 0;
+}
+
+static int mthca_set_qp_size(struct mthca_dev *dev, struct ib_qp_cap *cap,
+ struct mthca_pd *pd, struct mthca_qp *qp)
+{
+ int max_data_size = mthca_max_data_size(dev, qp, dev->limits.max_desc_sz);
+
+ /* Sanity check QP size before proceeding */
+ if (cap->max_send_wr > dev->limits.max_wqes ||
+ cap->max_recv_wr > dev->limits.max_wqes ||
+ cap->max_send_sge > dev->limits.max_sg ||
+ cap->max_recv_sge > dev->limits.max_sg ||
+ cap->max_inline_data > mthca_max_inline_data(pd, max_data_size))
+ return -EINVAL;
+
+ /*
+ * For MLX transport we need 2 extra send gather entries:
+ * one for the header and one for the checksum at the end
+ */
+ if (qp->transport == MLX && cap->max_send_sge + 2 > dev->limits.max_sg)
+ return -EINVAL;
+
+ if (mthca_is_memfree(dev)) {
+ qp->rq.max = cap->max_recv_wr ?
+ roundup_pow_of_two(cap->max_recv_wr) : 0;
+ qp->sq.max = cap->max_send_wr ?
+ roundup_pow_of_two(cap->max_send_wr) : 0;
+ } else {
+ qp->rq.max = cap->max_recv_wr;
+ qp->sq.max = cap->max_send_wr;
+ }
+
+ qp->rq.max_gs = cap->max_recv_sge;
+ qp->sq.max_gs = max_t(int, cap->max_send_sge,
+ ALIGN(cap->max_inline_data + MTHCA_INLINE_HEADER_SIZE,
+ MTHCA_INLINE_CHUNK_SIZE) /
+ sizeof (struct mthca_data_seg));
+
+ return 0;
+}
+
+int mthca_alloc_qp(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct mthca_cq *send_cq,
+ struct mthca_cq *recv_cq,
+ enum ib_qp_type type,
+ enum ib_sig_type send_policy,
+ struct ib_qp_cap *cap,
+ struct mthca_qp *qp)
+{
+ int err;
+
+ switch (type) {
+ case IB_QPT_RC: qp->transport = RC; break;
+ case IB_QPT_UC: qp->transport = UC; break;
+ case IB_QPT_UD: qp->transport = UD; break;
+ default: return -EINVAL;
+ }
+
+ err = mthca_set_qp_size(dev, cap, pd, qp);
+ if (err)
+ return err;
+
+ qp->qpn = mthca_alloc(&dev->qp_table.alloc);
+ if (qp->qpn == -1)
+ return -ENOMEM;
+
+ /* initialize port to zero for error-catching. */
+ qp->port = 0;
+
+ err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq,
+ send_policy, qp);
+ if (err) {
+ mthca_free(&dev->qp_table.alloc, qp->qpn);
+ return err;
+ }
+
+ spin_lock_irq(&dev->qp_table.lock);
+ mthca_array_set(&dev->qp_table.qp,
+ qp->qpn & (dev->limits.num_qps - 1), qp);
+ spin_unlock_irq(&dev->qp_table.lock);
+
+ return 0;
+}
+
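+/*
+ * Lock both CQs attached to a QP, following the ordering documented
+ * in mthca_provider.h: a shared CQ is locked once, otherwise the CQ
+ * with the lower CQN is locked first.
+ */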
+static void mthca_lock_cqs(struct mthca_cq *send_cq, struct mthca_cq *recv_cq)
+ __acquires(&send_cq->lock) __acquires(&recv_cq->lock)
+{
+ if (send_cq == recv_cq) {
+ spin_lock_irq(&send_cq->lock);
+ __acquire(&recv_cq->lock);
+ } else if (send_cq->cqn < recv_cq->cqn) {
+ spin_lock_irq(&send_cq->lock);
+ spin_lock_nested(&recv_cq->lock, SINGLE_DEPTH_NESTING);
+ } else {
+ spin_lock_irq(&recv_cq->lock);
+ spin_lock_nested(&send_cq->lock, SINGLE_DEPTH_NESTING);
+ }
+}
+
+static void mthca_unlock_cqs(struct mthca_cq *send_cq, struct mthca_cq *recv_cq)
+ __releases(&send_cq->lock) __releases(&recv_cq->lock)
+{
+ if (send_cq == recv_cq) {
+ __release(&recv_cq->lock);
+ spin_unlock_irq(&send_cq->lock);
+ } else if (send_cq->cqn < recv_cq->cqn) {
+ spin_unlock(&recv_cq->lock);
+ spin_unlock_irq(&send_cq->lock);
+ } else {
+ spin_unlock(&send_cq->lock);
+ spin_unlock_irq(&recv_cq->lock);
+ }
+}
+
+int mthca_alloc_sqp(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct mthca_cq *send_cq,
+ struct mthca_cq *recv_cq,
+ enum ib_sig_type send_policy,
+ struct ib_qp_cap *cap,
+ int qpn,
+ int port,
+ struct mthca_sqp *sqp)
+{
+ u32 mqpn = qpn * 2 + dev->qp_table.sqp_start + port - 1;
+ int err;
+
+ sqp->qp.transport = MLX;
+ err = mthca_set_qp_size(dev, cap, pd, &sqp->qp);
+ if (err)
+ return err;
+
+ sqp->header_buf_size = sqp->qp.sq.max * MTHCA_UD_HEADER_SIZE;
+ sqp->header_buf = dma_alloc_coherent(&dev->pdev->dev, sqp->header_buf_size,
+ &sqp->header_dma, GFP_KERNEL);
+ if (!sqp->header_buf)
+ return -ENOMEM;
+
+ spin_lock_irq(&dev->qp_table.lock);
+ if (mthca_array_get(&dev->qp_table.qp, mqpn))
+ err = -EBUSY;
+ else
+ mthca_array_set(&dev->qp_table.qp, mqpn, sqp);
+ spin_unlock_irq(&dev->qp_table.lock);
+
+ if (err)
+ goto err_out;
+
+ sqp->qp.port = port;
+ sqp->qp.qpn = mqpn;
+ sqp->qp.transport = MLX;
+
+ err = mthca_alloc_qp_common(dev, pd, send_cq, recv_cq,
+ send_policy, &sqp->qp);
+ if (err)
+ goto err_out_free;
+
+ atomic_inc(&pd->sqp_count);
+
+ return 0;
+
+ err_out_free:
+ /*
+ * Lock CQs here, so that CQ polling code can do QP lookup
+ * without taking a lock.
+ */
+ mthca_lock_cqs(send_cq, recv_cq);
+
+ spin_lock(&dev->qp_table.lock);
+ mthca_array_clear(&dev->qp_table.qp, mqpn);
+ spin_unlock(&dev->qp_table.lock);
+
+ mthca_unlock_cqs(send_cq, recv_cq);
+
+ err_out:
+ dma_free_coherent(&dev->pdev->dev, sqp->header_buf_size,
+ sqp->header_buf, sqp->header_dma);
+
+ return err;
+}
+
+static inline int get_qp_refcount(struct mthca_dev *dev, struct mthca_qp *qp)
+{
+ int c;
+
+ spin_lock_irq(&dev->qp_table.lock);
+ c = qp->refcount;
+ spin_unlock_irq(&dev->qp_table.lock);
+
+ return c;
+}
+
+void mthca_free_qp(struct mthca_dev *dev,
+ struct mthca_qp *qp)
+{
+ struct mthca_cq *send_cq;
+ struct mthca_cq *recv_cq;
+
+ send_cq = to_mcq(qp->ibqp.send_cq);
+ recv_cq = to_mcq(qp->ibqp.recv_cq);
+
+ /*
+ * Lock CQs here, so that CQ polling code can do QP lookup
+ * without taking a lock.
+ */
+ mthca_lock_cqs(send_cq, recv_cq);
+
+ spin_lock(&dev->qp_table.lock);
+ mthca_array_clear(&dev->qp_table.qp,
+ qp->qpn & (dev->limits.num_qps - 1));
+ --qp->refcount;
+ spin_unlock(&dev->qp_table.lock);
+
+ mthca_unlock_cqs(send_cq, recv_cq);
+
+ wait_event(qp->wait, !get_qp_refcount(dev, qp));
+
+ if (qp->state != IB_QPS_RESET)
+ mthca_MODIFY_QP(dev, qp->state, IB_QPS_RESET, qp->qpn, 0,
+ NULL, 0);
+
+ /*
+ * If this is a userspace QP, the buffers, MR, CQs and so on
+ * will be cleaned up in userspace, so all we have to do is
+ * unref the mem-free tables and free the QPN in our table.
+ */
+ if (!qp->ibqp.uobject) {
+ mthca_cq_clean(dev, recv_cq, qp->qpn,
+ qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL);
+ if (send_cq != recv_cq)
+ mthca_cq_clean(dev, send_cq, qp->qpn, NULL);
+
+ mthca_free_memfree(dev, qp);
+ mthca_free_wqe_buf(dev, qp);
+ }
+
+ mthca_unmap_memfree(dev, qp);
+
+ if (is_sqp(dev, qp)) {
+ atomic_dec(&(to_mpd(qp->ibqp.pd)->sqp_count));
+ dma_free_coherent(&dev->pdev->dev,
+ to_msqp(qp)->header_buf_size,
+ to_msqp(qp)->header_buf,
+ to_msqp(qp)->header_dma);
+ } else
+ mthca_free(&dev->qp_table.alloc, qp->qpn);
+}
+
+/* Create UD header for an MLX send and build a data segment for it */
+static int build_mlx_header(struct mthca_dev *dev, struct mthca_sqp *sqp,
+ int ind, struct ib_send_wr *wr,
+ struct mthca_mlx_seg *mlx,
+ struct mthca_data_seg *data)
+{
+ int header_size;
+ int err;
+ u16 pkey;
+
+ ib_ud_header_init(256, /* assume a MAD */ 1, 0, 0,
+ mthca_ah_grh_present(to_mah(wr->wr.ud.ah)), 0,
+ &sqp->ud_header);
+
+ err = mthca_read_ah(dev, to_mah(wr->wr.ud.ah), &sqp->ud_header);
+ if (err)
+ return err;
+ mlx->flags &= ~cpu_to_be32(MTHCA_NEXT_SOLICIT | 1);
+ mlx->flags |= cpu_to_be32((!sqp->qp.ibqp.qp_num ? MTHCA_MLX_VL15 : 0) |
+ (sqp->ud_header.lrh.destination_lid ==
+ IB_LID_PERMISSIVE ? MTHCA_MLX_SLR : 0) |
+ (sqp->ud_header.lrh.service_level << 8));
+ mlx->rlid = sqp->ud_header.lrh.destination_lid;
+ mlx->vcrc = 0;
+
+ switch (wr->opcode) {
+ case IB_WR_SEND:
+ sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY;
+ sqp->ud_header.immediate_present = 0;
+ break;
+ case IB_WR_SEND_WITH_IMM:
+ sqp->ud_header.bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE;
+ sqp->ud_header.immediate_present = 1;
+ sqp->ud_header.immediate_data = wr->ex.imm_data;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ sqp->ud_header.lrh.virtual_lane = !sqp->qp.ibqp.qp_num ? 15 : 0;
+ if (sqp->ud_header.lrh.destination_lid == IB_LID_PERMISSIVE)
+ sqp->ud_header.lrh.source_lid = IB_LID_PERMISSIVE;
+ sqp->ud_header.bth.solicited_event = !!(wr->send_flags & IB_SEND_SOLICITED);
+ if (!sqp->qp.ibqp.qp_num)
+ ib_get_cached_pkey(&dev->ib_dev, sqp->qp.port,
+ sqp->pkey_index, &pkey);
+ else
+ ib_get_cached_pkey(&dev->ib_dev, sqp->qp.port,
+ wr->wr.ud.pkey_index, &pkey);
+ sqp->ud_header.bth.pkey = cpu_to_be16(pkey);
+ sqp->ud_header.bth.destination_qpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+ sqp->ud_header.bth.psn = cpu_to_be32((sqp->send_psn++) & ((1 << 24) - 1));
+ sqp->ud_header.deth.qkey = cpu_to_be32(wr->wr.ud.remote_qkey & 0x80000000 ?
+ sqp->qkey : wr->wr.ud.remote_qkey);
+ sqp->ud_header.deth.source_qpn = cpu_to_be32(sqp->qp.ibqp.qp_num);
+
+ header_size = ib_ud_header_pack(&sqp->ud_header,
+ sqp->header_buf +
+ ind * MTHCA_UD_HEADER_SIZE);
+
+ data->byte_count = cpu_to_be32(header_size);
+ data->lkey = cpu_to_be32(to_mpd(sqp->qp.ibqp.pd)->ntmr.ibmr.lkey);
+ data->addr = cpu_to_be64(sqp->header_dma +
+ ind * MTHCA_UD_HEADER_SIZE);
+
+ return 0;
+}
+
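+/*
+ * Check whether posting nreq more WQEs would overflow the work queue.
+ * If the queue looks full, re-read head/tail under the CQ lock so
+ * that completions being processed concurrently are accounted for
+ * before reporting overflow.
+ */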
+static inline int mthca_wq_overflow(struct mthca_wq *wq, int nreq,
+ struct ib_cq *ib_cq)
+{
+ unsigned cur;
+ struct mthca_cq *cq;
+
+ cur = wq->head - wq->tail;
+ if (likely(cur + nreq < wq->max))
+ return 0;
+
+ cq = to_mcq(ib_cq);
+ spin_lock(&cq->lock);
+ cur = wq->head - wq->tail;
+ spin_unlock(&cq->lock);
+
+ return cur + nreq >= wq->max;
+}
+
+static __always_inline void set_raddr_seg(struct mthca_raddr_seg *rseg,
+ u64 remote_addr, u32 rkey)
+{
+ rseg->raddr = cpu_to_be64(remote_addr);
+ rseg->rkey = cpu_to_be32(rkey);
+ rseg->reserved = 0;
+}
+
+static __always_inline void set_atomic_seg(struct mthca_atomic_seg *aseg,
+ struct ib_send_wr *wr)
+{
+ if (wr->opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+ aseg->swap_add = cpu_to_be64(wr->wr.atomic.swap);
+ aseg->compare = cpu_to_be64(wr->wr.atomic.compare_add);
+ } else {
+ aseg->swap_add = cpu_to_be64(wr->wr.atomic.compare_add);
+ aseg->compare = 0;
+ }
+}
+
+static void set_tavor_ud_seg(struct mthca_tavor_ud_seg *useg,
+ struct ib_send_wr *wr)
+{
+ useg->lkey = cpu_to_be32(to_mah(wr->wr.ud.ah)->key);
+ useg->av_addr = cpu_to_be64(to_mah(wr->wr.ud.ah)->avdma);
+ useg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+ useg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
+}
+
+static void set_arbel_ud_seg(struct mthca_arbel_ud_seg *useg,
+ struct ib_send_wr *wr)
+{
+ memcpy(useg->av, to_mah(wr->wr.ud.ah)->av, MTHCA_AV_SIZE);
+ useg->dqpn = cpu_to_be32(wr->wr.ud.remote_qpn);
+ useg->qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
+}
+
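+/*
+ * Post a chain of send work requests on a Tavor QP.  Each WQE starts
+ * with a mthca_next_seg; "size" is accumulated in 16-byte units and,
+ * together with the next WQE's address and opcode, is patched into the
+ * previous WQE so the hardware can follow the chain.  A single send
+ * doorbell covering the whole batch is rung after the loop.
+ */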
+int mthca_tavor_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ struct mthca_qp *qp = to_mqp(ibqp);
+ void *wqe;
+ void *prev_wqe;
+ unsigned long flags;
+ int err = 0;
+ int nreq;
+ int i;
+ int size;
+ /*
+ * f0 and size0 are only used if nreq != 0, and they will
+ * always be initialized the first time through the main loop
+ * before nreq is incremented. So nreq cannot become non-zero
+ * without initializing f0 and size0, and they are in fact
+ * never used uninitialized.
+ */
+ int uninitialized_var(size0);
+ u32 uninitialized_var(f0);
+ int ind;
+ u8 op0 = 0;
+
+ spin_lock_irqsave(&qp->sq.lock, flags);
+
+ /* XXX check that state is OK to post send */
+
+ ind = qp->sq.next_ind;
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (mthca_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
+ mthca_err(dev, "SQ %06x full (%u head, %u tail,"
+ " %d max, %d nreq)\n", qp->qpn,
+ qp->sq.head, qp->sq.tail,
+ qp->sq.max, nreq);
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ wqe = get_send_wqe(qp, ind);
+ prev_wqe = qp->sq.last;
+ qp->sq.last = wqe;
+
+ ((struct mthca_next_seg *) wqe)->nda_op = 0;
+ ((struct mthca_next_seg *) wqe)->ee_nds = 0;
+ ((struct mthca_next_seg *) wqe)->flags =
+ ((wr->send_flags & IB_SEND_SIGNALED) ?
+ cpu_to_be32(MTHCA_NEXT_CQ_UPDATE) : 0) |
+ ((wr->send_flags & IB_SEND_SOLICITED) ?
+ cpu_to_be32(MTHCA_NEXT_SOLICIT) : 0) |
+ cpu_to_be32(1);
+ if (wr->opcode == IB_WR_SEND_WITH_IMM ||
+ wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+ ((struct mthca_next_seg *) wqe)->imm = wr->ex.imm_data;
+
+ wqe += sizeof (struct mthca_next_seg);
+ size = sizeof (struct mthca_next_seg) / 16;
+
+ switch (qp->transport) {
+ case RC:
+ switch (wr->opcode) {
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
+ wr->wr.atomic.rkey);
+ wqe += sizeof (struct mthca_raddr_seg);
+
+ set_atomic_seg(wqe, wr);
+ wqe += sizeof (struct mthca_atomic_seg);
+ size += (sizeof (struct mthca_raddr_seg) +
+ sizeof (struct mthca_atomic_seg)) / 16;
+ break;
+
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ case IB_WR_RDMA_READ:
+ set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+ wr->wr.rdma.rkey);
+ wqe += sizeof (struct mthca_raddr_seg);
+ size += sizeof (struct mthca_raddr_seg) / 16;
+ break;
+
+ default:
+ /* No extra segments required for sends */
+ break;
+ }
+
+ break;
+
+ case UC:
+ switch (wr->opcode) {
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+ wr->wr.rdma.rkey);
+ wqe += sizeof (struct mthca_raddr_seg);
+ size += sizeof (struct mthca_raddr_seg) / 16;
+ break;
+
+ default:
+ /* No extra segments required for sends */
+ break;
+ }
+
+ break;
+
+ case UD:
+ set_tavor_ud_seg(wqe, wr);
+ wqe += sizeof (struct mthca_tavor_ud_seg);
+ size += sizeof (struct mthca_tavor_ud_seg) / 16;
+ break;
+
+ case MLX:
+ err = build_mlx_header(dev, to_msqp(qp), ind, wr,
+ wqe - sizeof (struct mthca_next_seg),
+ wqe);
+ if (err) {
+ *bad_wr = wr;
+ goto out;
+ }
+ wqe += sizeof (struct mthca_data_seg);
+ size += sizeof (struct mthca_data_seg) / 16;
+ break;
+ }
+
+ if (wr->num_sge > qp->sq.max_gs) {
+ mthca_err(dev, "too many gathers\n");
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ mthca_set_data_seg(wqe, wr->sg_list + i);
+ wqe += sizeof (struct mthca_data_seg);
+ size += sizeof (struct mthca_data_seg) / 16;
+ }
+
+ /* Add one more inline data segment for ICRC */
+ if (qp->transport == MLX) {
+ ((struct mthca_data_seg *) wqe)->byte_count =
+ cpu_to_be32((1 << 31) | 4);
+ ((u32 *) wqe)[1] = 0;
+ wqe += sizeof (struct mthca_data_seg);
+ size += sizeof (struct mthca_data_seg) / 16;
+ }
+
+ qp->wrid[ind + qp->rq.max] = wr->wr_id;
+
+ if (wr->opcode >= ARRAY_SIZE(mthca_opcode)) {
+ mthca_err(dev, "opcode invalid\n");
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ ((struct mthca_next_seg *) prev_wqe)->nda_op =
+ cpu_to_be32(((ind << qp->sq.wqe_shift) +
+ qp->send_wqe_offset) |
+ mthca_opcode[wr->opcode]);
+ wmb();
+ ((struct mthca_next_seg *) prev_wqe)->ee_nds =
+ cpu_to_be32((nreq ? 0 : MTHCA_NEXT_DBD) | size |
+ ((wr->send_flags & IB_SEND_FENCE) ?
+ MTHCA_NEXT_FENCE : 0));
+
+ if (!nreq) {
+ size0 = size;
+ op0 = mthca_opcode[wr->opcode];
+ f0 = wr->send_flags & IB_SEND_FENCE ?
+ MTHCA_SEND_DOORBELL_FENCE : 0;
+ }
+
+ ++ind;
+ if (unlikely(ind >= qp->sq.max))
+ ind -= qp->sq.max;
+ }
+
+out:
+ if (likely(nreq)) {
+ wmb();
+
+ mthca_write64(((qp->sq.next_ind << qp->sq.wqe_shift) +
+ qp->send_wqe_offset) | f0 | op0,
+ (qp->qpn << 8) | size0,
+ dev->kar + MTHCA_SEND_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+ /*
+ * Make sure doorbells don't leak out of SQ spinlock
+ * and reach the HCA out of order:
+ */
+ mmiowb();
+ }
+
+ qp->sq.next_ind = ind;
+ qp->sq.head += nreq;
+
+ spin_unlock_irqrestore(&qp->sq.lock, flags);
+ return err;
+}
+
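+/*
+ * Post receive work requests on a Tavor QP.  Tavor can only absorb a
+ * limited number of WQEs per receive doorbell ring, so the doorbell is
+ * rung every MTHCA_TAVOR_MAX_WQES_PER_RECV_DB requests inside the loop
+ * and once more for any remainder when the loop finishes.
+ */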
+int mthca_tavor_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ struct mthca_qp *qp = to_mqp(ibqp);
+ unsigned long flags;
+ int err = 0;
+ int nreq;
+ int i;
+ int size;
+ /*
+ * size0 is only used if nreq != 0, and it will always be
+ * initialized the first time through the main loop before
+ * nreq is incremented. So nreq cannot become non-zero
+ * without initializing size0, and it is in fact never used
+ * uninitialized.
+ */
+ int uninitialized_var(size0);
+ int ind;
+ void *wqe;
+ void *prev_wqe;
+
+ spin_lock_irqsave(&qp->rq.lock, flags);
+
+ /* XXX check that state is OK to post receive */
+
+ ind = qp->rq.next_ind;
+
+ for (nreq = 0; wr; wr = wr->next) {
+ if (mthca_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
+ mthca_err(dev, "RQ %06x full (%u head, %u tail,"
+ " %d max, %d nreq)\n", qp->qpn,
+ qp->rq.head, qp->rq.tail,
+ qp->rq.max, nreq);
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ wqe = get_recv_wqe(qp, ind);
+ prev_wqe = qp->rq.last;
+ qp->rq.last = wqe;
+
+ ((struct mthca_next_seg *) wqe)->ee_nds =
+ cpu_to_be32(MTHCA_NEXT_DBD);
+ ((struct mthca_next_seg *) wqe)->flags = 0;
+
+ wqe += sizeof (struct mthca_next_seg);
+ size = sizeof (struct mthca_next_seg) / 16;
+
+ if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ mthca_set_data_seg(wqe, wr->sg_list + i);
+ wqe += sizeof (struct mthca_data_seg);
+ size += sizeof (struct mthca_data_seg) / 16;
+ }
+
+ qp->wrid[ind] = wr->wr_id;
+
+ ((struct mthca_next_seg *) prev_wqe)->ee_nds =
+ cpu_to_be32(MTHCA_NEXT_DBD | size);
+
+ if (!nreq)
+ size0 = size;
+
+ ++ind;
+ if (unlikely(ind >= qp->rq.max))
+ ind -= qp->rq.max;
+
+ ++nreq;
+ if (unlikely(nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB)) {
+ nreq = 0;
+
+ wmb();
+
+ mthca_write64((qp->rq.next_ind << qp->rq.wqe_shift) | size0,
+ qp->qpn << 8, dev->kar + MTHCA_RECEIVE_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+
+ qp->rq.next_ind = ind;
+ qp->rq.head += MTHCA_TAVOR_MAX_WQES_PER_RECV_DB;
+ }
+ }
+
+out:
+ if (likely(nreq)) {
+ wmb();
+
+ mthca_write64((qp->rq.next_ind << qp->rq.wqe_shift) | size0,
+ qp->qpn << 8 | nreq, dev->kar + MTHCA_RECEIVE_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+ }
+
+ qp->rq.next_ind = ind;
+ qp->rq.head += nreq;
+
+ /*
+ * Make sure doorbells don't leak out of RQ spinlock and reach
+ * the HCA out of order:
+ */
+ mmiowb();
+
+ spin_unlock_irqrestore(&qp->rq.lock, flags);
+ return err;
+}
+
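+/*
+ * Post send work requests on an Arbel (mem-free) QP.  Unlike Tavor, the
+ * doorbell record in memory (*qp->sq.db) is updated before the MMIO
+ * send doorbell is written, and at most
+ * MTHCA_ARBEL_MAX_WQES_PER_SEND_DB requests are covered by a single
+ * doorbell ring.
+ */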
+int mthca_arbel_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ struct mthca_qp *qp = to_mqp(ibqp);
+ u32 dbhi;
+ void *wqe;
+ void *prev_wqe;
+ unsigned long flags;
+ int err = 0;
+ int nreq;
+ int i;
+ int size;
+ /*
+ * f0 and size0 are only used if nreq != 0, and they will
+ * always be initialized the first time through the main loop
+ * before nreq is incremented. So nreq cannot become non-zero
+ * without initializing f0 and size0, and they are in fact
+ * never used uninitialized.
+ */
+ int uninitialized_var(size0);
+ u32 uninitialized_var(f0);
+ int ind;
+ u8 op0 = 0;
+
+ spin_lock_irqsave(&qp->sq.lock, flags);
+
+ /* XXX check that state is OK to post send */
+
+ ind = qp->sq.head & (qp->sq.max - 1);
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (unlikely(nreq == MTHCA_ARBEL_MAX_WQES_PER_SEND_DB)) {
+ nreq = 0;
+
+ dbhi = (MTHCA_ARBEL_MAX_WQES_PER_SEND_DB << 24) |
+ ((qp->sq.head & 0xffff) << 8) | f0 | op0;
+
+ qp->sq.head += MTHCA_ARBEL_MAX_WQES_PER_SEND_DB;
+
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+ *qp->sq.db = cpu_to_be32(qp->sq.head & 0xffff);
+
+ /*
+ * Make sure doorbell record is written before we
+ * write MMIO send doorbell.
+ */
+ wmb();
+
+ mthca_write64(dbhi, (qp->qpn << 8) | size0,
+ dev->kar + MTHCA_SEND_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+ }
+
+ if (mthca_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
+ mthca_err(dev, "SQ %06x full (%u head, %u tail,"
+ " %d max, %d nreq)\n", qp->qpn,
+ qp->sq.head, qp->sq.tail,
+ qp->sq.max, nreq);
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ wqe = get_send_wqe(qp, ind);
+ prev_wqe = qp->sq.last;
+ qp->sq.last = wqe;
+
+ ((struct mthca_next_seg *) wqe)->flags =
+ ((wr->send_flags & IB_SEND_SIGNALED) ?
+ cpu_to_be32(MTHCA_NEXT_CQ_UPDATE) : 0) |
+ ((wr->send_flags & IB_SEND_SOLICITED) ?
+ cpu_to_be32(MTHCA_NEXT_SOLICIT) : 0) |
+ ((wr->send_flags & IB_SEND_IP_CSUM) ?
+ cpu_to_be32(MTHCA_NEXT_IP_CSUM | MTHCA_NEXT_TCP_UDP_CSUM) : 0) |
+ cpu_to_be32(1);
+ if (wr->opcode == IB_WR_SEND_WITH_IMM ||
+ wr->opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+ ((struct mthca_next_seg *) wqe)->imm = wr->ex.imm_data;
+
+ wqe += sizeof (struct mthca_next_seg);
+ size = sizeof (struct mthca_next_seg) / 16;
+
+ switch (qp->transport) {
+ case RC:
+ switch (wr->opcode) {
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
+ wr->wr.atomic.rkey);
+ wqe += sizeof (struct mthca_raddr_seg);
+
+ set_atomic_seg(wqe, wr);
+ wqe += sizeof (struct mthca_atomic_seg);
+ size += (sizeof (struct mthca_raddr_seg) +
+ sizeof (struct mthca_atomic_seg)) / 16;
+ break;
+
+ case IB_WR_RDMA_READ:
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+ wr->wr.rdma.rkey);
+ wqe += sizeof (struct mthca_raddr_seg);
+ size += sizeof (struct mthca_raddr_seg) / 16;
+ break;
+
+ default:
+ /* No extra segments required for sends */
+ break;
+ }
+
+ break;
+
+ case UC:
+ switch (wr->opcode) {
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
+ wr->wr.rdma.rkey);
+ wqe += sizeof (struct mthca_raddr_seg);
+ size += sizeof (struct mthca_raddr_seg) / 16;
+ break;
+
+ default:
+ /* No extra segments required for sends */
+ break;
+ }
+
+ break;
+
+ case UD:
+ set_arbel_ud_seg(wqe, wr);
+ wqe += sizeof (struct mthca_arbel_ud_seg);
+ size += sizeof (struct mthca_arbel_ud_seg) / 16;
+ break;
+
+ case MLX:
+ err = build_mlx_header(dev, to_msqp(qp), ind, wr,
+ wqe - sizeof (struct mthca_next_seg),
+ wqe);
+ if (err) {
+ *bad_wr = wr;
+ goto out;
+ }
+ wqe += sizeof (struct mthca_data_seg);
+ size += sizeof (struct mthca_data_seg) / 16;
+ break;
+ }
+
+ if (wr->num_sge > qp->sq.max_gs) {
+ mthca_err(dev, "too many gathers\n");
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ mthca_set_data_seg(wqe, wr->sg_list + i);
+ wqe += sizeof (struct mthca_data_seg);
+ size += sizeof (struct mthca_data_seg) / 16;
+ }
+
+ /* Add one more inline data segment for ICRC */
+ if (qp->transport == MLX) {
+ ((struct mthca_data_seg *) wqe)->byte_count =
+ cpu_to_be32((1 << 31) | 4);
+ ((u32 *) wqe)[1] = 0;
+ wqe += sizeof (struct mthca_data_seg);
+ size += sizeof (struct mthca_data_seg) / 16;
+ }
+
+ qp->wrid[ind + qp->rq.max] = wr->wr_id;
+
+ if (wr->opcode >= ARRAY_SIZE(mthca_opcode)) {
+ mthca_err(dev, "opcode invalid\n");
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ ((struct mthca_next_seg *) prev_wqe)->nda_op =
+ cpu_to_be32(((ind << qp->sq.wqe_shift) +
+ qp->send_wqe_offset) |
+ mthca_opcode[wr->opcode]);
+ wmb();
+ ((struct mthca_next_seg *) prev_wqe)->ee_nds =
+ cpu_to_be32(MTHCA_NEXT_DBD | size |
+ ((wr->send_flags & IB_SEND_FENCE) ?
+ MTHCA_NEXT_FENCE : 0));
+
+ if (!nreq) {
+ size0 = size;
+ op0 = mthca_opcode[wr->opcode];
+ f0 = wr->send_flags & IB_SEND_FENCE ?
+ MTHCA_SEND_DOORBELL_FENCE : 0;
+ }
+
+ ++ind;
+ if (unlikely(ind >= qp->sq.max))
+ ind -= qp->sq.max;
+ }
+
+out:
+ if (likely(nreq)) {
+ dbhi = (nreq << 24) | ((qp->sq.head & 0xffff) << 8) | f0 | op0;
+
+ qp->sq.head += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+ *qp->sq.db = cpu_to_be32(qp->sq.head & 0xffff);
+
+ /*
+ * Make sure doorbell record is written before we
+ * write MMIO send doorbell.
+ */
+ wmb();
+
+ mthca_write64(dbhi, (qp->qpn << 8) | size0, dev->kar + MTHCA_SEND_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+ }
+
+ /*
+ * Make sure doorbells don't leak out of SQ spinlock and reach
+ * the HCA out of order:
+ */
+ mmiowb();
+
+ spin_unlock_irqrestore(&qp->sq.lock, flags);
+ return err;
+}
+
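+/*
+ * Post receive work requests on an Arbel (mem-free) QP.  No MMIO
+ * doorbell is needed here; advancing the doorbell record (*qp->rq.db)
+ * is enough to tell the HCA about the new WQEs.
+ */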
+int mthca_arbel_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct mthca_dev *dev = to_mdev(ibqp->device);
+ struct mthca_qp *qp = to_mqp(ibqp);
+ unsigned long flags;
+ int err = 0;
+ int nreq;
+ int ind;
+ int i;
+ void *wqe;
+
+ spin_lock_irqsave(&qp->rq.lock, flags);
+
+ /* XXX check that state is OK to post receive */
+
+ ind = qp->rq.head & (qp->rq.max - 1);
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ if (mthca_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
+ mthca_err(dev, "RQ %06x full (%u head, %u tail,"
+ " %d max, %d nreq)\n", qp->qpn,
+ qp->rq.head, qp->rq.tail,
+ qp->rq.max, nreq);
+ err = -ENOMEM;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ wqe = get_recv_wqe(qp, ind);
+
+ ((struct mthca_next_seg *) wqe)->flags = 0;
+
+ wqe += sizeof (struct mthca_next_seg);
+
+ if (unlikely(wr->num_sge > qp->rq.max_gs)) {
+ err = -EINVAL;
+ *bad_wr = wr;
+ goto out;
+ }
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ mthca_set_data_seg(wqe, wr->sg_list + i);
+ wqe += sizeof (struct mthca_data_seg);
+ }
+
+ if (i < qp->rq.max_gs)
+ mthca_set_data_seg_inval(wqe);
+
+ qp->wrid[ind] = wr->wr_id;
+
+ ++ind;
+ if (unlikely(ind >= qp->rq.max))
+ ind -= qp->rq.max;
+ }
+out:
+ if (likely(nreq)) {
+ qp->rq.head += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * doorbell record.
+ */
+ wmb();
+ *qp->rq.db = cpu_to_be32(qp->rq.head & 0xffff);
+ }
+
+ spin_unlock_irqrestore(&qp->rq.lock, flags);
+ return err;
+}
+
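+/*
+ * Given a WQE that completed in error, report whether it had the
+ * doorbell (DBD) bit set and compute a replacement next-WQE descriptor
+ * (or 0 if there is nothing to chain to) so the caller can repair the
+ * WQE chain.
+ */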
+void mthca_free_err_wqe(struct mthca_dev *dev, struct mthca_qp *qp, int is_send,
+ int index, int *dbd, __be32 *new_wqe)
+{
+ struct mthca_next_seg *next;
+
+ /*
+ * For SRQs, all receive WQEs generate a CQE, so we're always
+ * at the end of the doorbell chain.
+ */
+ if (qp->ibqp.srq && !is_send) {
+ *new_wqe = 0;
+ return;
+ }
+
+ if (is_send)
+ next = get_send_wqe(qp, index);
+ else
+ next = get_recv_wqe(qp, index);
+
+ *dbd = !!(next->ee_nds & cpu_to_be32(MTHCA_NEXT_DBD));
+ if (next->ee_nds & cpu_to_be32(0x3f))
+ *new_wqe = (next->nda_op & cpu_to_be32(~0x3f)) |
+ (next->ee_nds & cpu_to_be32(0x3f));
+ else
+ *new_wqe = 0;
+}
+
+int mthca_init_qp_table(struct mthca_dev *dev)
+{
+ int err;
+ int i;
+
+ spin_lock_init(&dev->qp_table.lock);
+
+ /*
+ * We reserve 2 extra QPs per port for the special QPs. The
+ * special QP for port 1 has to be even, so round up.
+ */
+ dev->qp_table.sqp_start = (dev->limits.reserved_qps + 1) & ~1UL;
+ err = mthca_alloc_init(&dev->qp_table.alloc,
+ dev->limits.num_qps,
+ (1 << 24) - 1,
+ dev->qp_table.sqp_start +
+ MTHCA_MAX_PORTS * 2);
+ if (err)
+ return err;
+
+ err = mthca_array_init(&dev->qp_table.qp,
+ dev->limits.num_qps);
+ if (err) {
+ mthca_alloc_cleanup(&dev->qp_table.alloc);
+ return err;
+ }
+
+ for (i = 0; i < 2; ++i) {
+ err = mthca_CONF_SPECIAL_QP(dev, i ? IB_QPT_GSI : IB_QPT_SMI,
+ dev->qp_table.sqp_start + i * 2);
+ if (err) {
+ mthca_warn(dev, "CONF_SPECIAL_QP returned "
+ "%d, aborting.\n", err);
+ goto err_out;
+ }
+ }
+ return 0;
+
+ err_out:
+ for (i = 0; i < 2; ++i)
+ mthca_CONF_SPECIAL_QP(dev, i, 0);
+
+ mthca_array_cleanup(&dev->qp_table.qp, dev->limits.num_qps);
+ mthca_alloc_cleanup(&dev->qp_table.alloc);
+
+ return err;
+}
+
+void mthca_cleanup_qp_table(struct mthca_dev *dev)
+{
+ int i;
+
+ for (i = 0; i < 2; ++i)
+ mthca_CONF_SPECIAL_QP(dev, i, 0);
+
+ mthca_array_cleanup(&dev->qp_table.qp, dev->limits.num_qps);
+ mthca_alloc_cleanup(&dev->qp_table.alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_reset.c b/drivers/infiniband/hw/mthca/mthca_reset.c
new file mode 100644
index 000000000..74c6a9426
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_reset.c
@@ -0,0 +1,288 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+
+int mthca_reset(struct mthca_dev *mdev)
+{
+ int i;
+ int err = 0;
+ u32 *hca_header = NULL;
+ u32 *bridge_header = NULL;
+ struct pci_dev *bridge = NULL;
+ int bridge_pcix_cap = 0;
+ int hca_pcie_cap = 0;
+ int hca_pcix_cap = 0;
+
+ u16 devctl;
+ u16 linkctl;
+
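+/*
+ * writel() performs a little-endian store, so writing swab32(1) presents
+ * the value 1 to the device in big-endian byte order, which appears to
+ * be what the reset register expects.
+ */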
+#define MTHCA_RESET_OFFSET 0xf0010
+#define MTHCA_RESET_VALUE swab32(1)
+
+ /*
+ * Reset the chip. This is somewhat ugly because we have to
+ * save off the PCI header before reset and then restore it
+ * after the chip reboots. We skip config space offsets 22
+ * and 23 since those have a special meaning.
+ *
+ * To make matters worse, for Tavor (PCI-X HCA) we have to
+ * find the associated bridge device and save off its PCI
+ * header as well.
+ */
+
+ if (!(mdev->mthca_flags & MTHCA_FLAG_PCIE)) {
+		/* Look for the bridge -- its device ID will be 2 more
+		   than the HCA's device ID. */
+ while ((bridge = pci_get_device(mdev->pdev->vendor,
+ mdev->pdev->device + 2,
+ bridge)) != NULL) {
+ if (bridge->hdr_type == PCI_HEADER_TYPE_BRIDGE &&
+ bridge->subordinate == mdev->pdev->bus) {
+ mthca_dbg(mdev, "Found bridge: %s\n",
+ pci_name(bridge));
+ break;
+ }
+ }
+
+ if (!bridge) {
+ /*
+ * Didn't find a bridge for a Tavor device --
+ * assume we're in no-bridge mode and hope for
+ * the best.
+ */
+ mthca_warn(mdev, "No bridge found for %s\n",
+ pci_name(mdev->pdev));
+ }
+
+ }
+
+ /* For Arbel do we need to save off the full 4K PCI Express header?? */
+ hca_header = kmalloc(256, GFP_KERNEL);
+ if (!hca_header) {
+ err = -ENOMEM;
+ mthca_err(mdev, "Couldn't allocate memory to save HCA "
+ "PCI header, aborting.\n");
+ goto out;
+ }
+
+ for (i = 0; i < 64; ++i) {
+ if (i == 22 || i == 23)
+ continue;
+ if (pci_read_config_dword(mdev->pdev, i * 4, hca_header + i)) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't save HCA "
+ "PCI header, aborting.\n");
+ goto out;
+ }
+ }
+
+ hca_pcix_cap = pci_find_capability(mdev->pdev, PCI_CAP_ID_PCIX);
+ hca_pcie_cap = pci_pcie_cap(mdev->pdev);
+
+ if (bridge) {
+ bridge_header = kmalloc(256, GFP_KERNEL);
+ if (!bridge_header) {
+ err = -ENOMEM;
+ mthca_err(mdev, "Couldn't allocate memory to save HCA "
+ "bridge PCI header, aborting.\n");
+ goto out;
+ }
+
+ for (i = 0; i < 64; ++i) {
+ if (i == 22 || i == 23)
+ continue;
+ if (pci_read_config_dword(bridge, i * 4, bridge_header + i)) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't save HCA bridge "
+ "PCI header, aborting.\n");
+ goto out;
+ }
+ }
+ bridge_pcix_cap = pci_find_capability(bridge, PCI_CAP_ID_PCIX);
+ if (!bridge_pcix_cap) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't locate HCA bridge "
+ "PCI-X capability, aborting.\n");
+ goto out;
+ }
+ }
+
+ /* actually hit reset */
+ {
+ void __iomem *reset = ioremap(pci_resource_start(mdev->pdev, 0) +
+ MTHCA_RESET_OFFSET, 4);
+
+ if (!reset) {
+ err = -ENOMEM;
+ mthca_err(mdev, "Couldn't map HCA reset register, "
+ "aborting.\n");
+ goto out;
+ }
+
+ writel(MTHCA_RESET_VALUE, reset);
+ iounmap(reset);
+ }
+
+ /* Docs say to wait one second before accessing device */
+ msleep(1000);
+
+ /* Now wait for PCI device to start responding again */
+ {
+ u32 v;
+ int c = 0;
+
+ for (c = 0; c < 100; ++c) {
+ if (pci_read_config_dword(bridge ? bridge : mdev->pdev, 0, &v)) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't access HCA after reset, "
+ "aborting.\n");
+ goto out;
+ }
+
+ if (v != 0xffffffff)
+ goto good;
+
+ msleep(100);
+ }
+
+ err = -ENODEV;
+ mthca_err(mdev, "PCI device did not come back after reset, "
+ "aborting.\n");
+ goto out;
+ }
+
+good:
+ /* Now restore the PCI headers */
+ if (bridge) {
+ if (pci_write_config_dword(bridge, bridge_pcix_cap + 0x8,
+ bridge_header[(bridge_pcix_cap + 0x8) / 4])) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't restore HCA bridge Upstream "
+ "split transaction control, aborting.\n");
+ goto out;
+ }
+ if (pci_write_config_dword(bridge, bridge_pcix_cap + 0xc,
+ bridge_header[(bridge_pcix_cap + 0xc) / 4])) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't restore HCA bridge Downstream "
+ "split transaction control, aborting.\n");
+ goto out;
+ }
+ /*
+ * Bridge control register is at 0x3e, so we'll
+ * naturally restore it last in this loop.
+ */
+ for (i = 0; i < 16; ++i) {
+ if (i * 4 == PCI_COMMAND)
+ continue;
+
+ if (pci_write_config_dword(bridge, i * 4, bridge_header[i])) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't restore HCA bridge reg %x, "
+ "aborting.\n", i);
+ goto out;
+ }
+ }
+
+ if (pci_write_config_dword(bridge, PCI_COMMAND,
+ bridge_header[PCI_COMMAND / 4])) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't restore HCA bridge COMMAND, "
+ "aborting.\n");
+ goto out;
+ }
+ }
+
+ if (hca_pcix_cap) {
+ if (pci_write_config_dword(mdev->pdev, hca_pcix_cap,
+ hca_header[hca_pcix_cap / 4])) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't restore HCA PCI-X "
+ "command register, aborting.\n");
+ goto out;
+ }
+ }
+
+ if (hca_pcie_cap) {
+ devctl = hca_header[(hca_pcie_cap + PCI_EXP_DEVCTL) / 4];
+ if (pcie_capability_write_word(mdev->pdev, PCI_EXP_DEVCTL,
+ devctl)) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't restore HCA PCI Express "
+ "Device Control register, aborting.\n");
+ goto out;
+ }
+ linkctl = hca_header[(hca_pcie_cap + PCI_EXP_LNKCTL) / 4];
+ if (pcie_capability_write_word(mdev->pdev, PCI_EXP_LNKCTL,
+ linkctl)) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't restore HCA PCI Express "
+ "Link control register, aborting.\n");
+ goto out;
+ }
+ }
+
+ for (i = 0; i < 16; ++i) {
+ if (i * 4 == PCI_COMMAND)
+ continue;
+
+ if (pci_write_config_dword(mdev->pdev, i * 4, hca_header[i])) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't restore HCA reg %x, "
+ "aborting.\n", i);
+ goto out;
+ }
+ }
+
+ if (pci_write_config_dword(mdev->pdev, PCI_COMMAND,
+ hca_header[PCI_COMMAND / 4])) {
+ err = -ENODEV;
+ mthca_err(mdev, "Couldn't restore HCA COMMAND, "
+ "aborting.\n");
+ goto out;
+ }
+
+out:
+ if (bridge)
+ pci_dev_put(bridge);
+ kfree(bridge_header);
+ kfree(hca_header);
+
+ return err;
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c
new file mode 100644
index 000000000..d22f97048
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_srq.c
@@ -0,0 +1,696 @@
+/*
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/sched.h>
+
+#include <asm/io.h>
+
+#include "mthca_dev.h"
+#include "mthca_cmd.h"
+#include "mthca_memfree.h"
+#include "mthca_wqe.h"
+
+enum {
+ MTHCA_MAX_DIRECT_SRQ_SIZE = 4 * PAGE_SIZE
+};
+
+struct mthca_tavor_srq_context {
+ __be64 wqe_base_ds; /* low 6 bits is descriptor size */
+ __be32 state_pd;
+ __be32 lkey;
+ __be32 uar;
+ __be16 limit_watermark;
+ __be16 wqe_cnt;
+ u32 reserved[2];
+};
+
+struct mthca_arbel_srq_context {
+ __be32 state_logsize_srqn;
+ __be32 lkey;
+ __be32 db_index;
+ __be32 logstride_usrpage;
+ __be64 wqe_base;
+ __be32 eq_pd;
+ __be16 limit_watermark;
+ __be16 wqe_cnt;
+ u16 reserved1;
+ __be16 wqe_counter;
+ u32 reserved2[3];
+};
+
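+/*
+ * An SRQ buffer is either one direct allocation or a list of pages;
+ * translate WQE index n into a kernel virtual address in either case.
+ */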
+static void *get_wqe(struct mthca_srq *srq, int n)
+{
+ if (srq->is_direct)
+ return srq->queue.direct.buf + (n << srq->wqe_shift);
+ else
+ return srq->queue.page_list[(n << srq->wqe_shift) >> PAGE_SHIFT].buf +
+ ((n << srq->wqe_shift) & (PAGE_SIZE - 1));
+}
+
+/*
+ * Return a pointer to the location within a WQE that we're using as a
+ * link when the WQE is in the free list. We use the imm field
+ * because in the Tavor case, posting a WQE may overwrite the next
+ * segment of the previous WQE, but a receive WQE will never touch the
+ * imm field. This avoids corrupting our free list if the previous
+ * WQE has already completed and been put on the free list when we
+ * post the next WQE.
+ */
+static inline int *wqe_to_link(void *wqe)
+{
+ return (int *) (wqe + offsetof(struct mthca_next_seg, imm));
+}
+
+static void mthca_tavor_init_srq_context(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct mthca_srq *srq,
+ struct mthca_tavor_srq_context *context)
+{
+ memset(context, 0, sizeof *context);
+
+ context->wqe_base_ds = cpu_to_be64(1 << (srq->wqe_shift - 4));
+ context->state_pd = cpu_to_be32(pd->pd_num);
+ context->lkey = cpu_to_be32(srq->mr.ibmr.lkey);
+
+ if (pd->ibpd.uobject)
+ context->uar =
+ cpu_to_be32(to_mucontext(pd->ibpd.uobject->context)->uar.index);
+ else
+ context->uar = cpu_to_be32(dev->driver_uar.index);
+}
+
+static void mthca_arbel_init_srq_context(struct mthca_dev *dev,
+ struct mthca_pd *pd,
+ struct mthca_srq *srq,
+ struct mthca_arbel_srq_context *context)
+{
+ int logsize, max;
+
+ memset(context, 0, sizeof *context);
+
+ /*
+ * Put max in a temporary variable to work around gcc bug
+ * triggered by ilog2() on sparc64.
+ */
+ max = srq->max;
+ logsize = ilog2(max);
+ context->state_logsize_srqn = cpu_to_be32(logsize << 24 | srq->srqn);
+ context->lkey = cpu_to_be32(srq->mr.ibmr.lkey);
+ context->db_index = cpu_to_be32(srq->db_index);
+ context->logstride_usrpage = cpu_to_be32((srq->wqe_shift - 4) << 29);
+ if (pd->ibpd.uobject)
+ context->logstride_usrpage |=
+ cpu_to_be32(to_mucontext(pd->ibpd.uobject->context)->uar.index);
+ else
+ context->logstride_usrpage |= cpu_to_be32(dev->driver_uar.index);
+ context->eq_pd = cpu_to_be32(MTHCA_EQ_ASYNC << 24 | pd->pd_num);
+}
+
+static void mthca_free_srq_buf(struct mthca_dev *dev, struct mthca_srq *srq)
+{
+ mthca_buf_free(dev, srq->max << srq->wqe_shift, &srq->queue,
+ srq->is_direct, &srq->mr);
+ kfree(srq->wrid);
+}
+
+static int mthca_alloc_srq_buf(struct mthca_dev *dev, struct mthca_pd *pd,
+ struct mthca_srq *srq)
+{
+ struct mthca_data_seg *scatter;
+ void *wqe;
+ int err;
+ int i;
+
+ if (pd->ibpd.uobject)
+ return 0;
+
+ srq->wrid = kmalloc(srq->max * sizeof (u64), GFP_KERNEL);
+ if (!srq->wrid)
+ return -ENOMEM;
+
+ err = mthca_buf_alloc(dev, srq->max << srq->wqe_shift,
+ MTHCA_MAX_DIRECT_SRQ_SIZE,
+ &srq->queue, &srq->is_direct, pd, 1, &srq->mr);
+ if (err) {
+ kfree(srq->wrid);
+ return err;
+ }
+
+ /*
+ * Now initialize the SRQ buffer so that all of the WQEs are
+ * linked into the list of free WQEs. In addition, set the
+	 * scatter list L_Keys to the sentinel value of 0x100.
+ */
+ for (i = 0; i < srq->max; ++i) {
+ struct mthca_next_seg *next;
+
+ next = wqe = get_wqe(srq, i);
+
+ if (i < srq->max - 1) {
+ *wqe_to_link(wqe) = i + 1;
+ next->nda_op = htonl(((i + 1) << srq->wqe_shift) | 1);
+ } else {
+ *wqe_to_link(wqe) = -1;
+ next->nda_op = 0;
+ }
+
+ for (scatter = wqe + sizeof (struct mthca_next_seg);
+ (void *) scatter < wqe + (1 << srq->wqe_shift);
+ ++scatter)
+ scatter->lkey = cpu_to_be32(MTHCA_INVAL_LKEY);
+ }
+
+ srq->last = get_wqe(srq, srq->max - 1);
+
+ return 0;
+}
+
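+/*
+ * Allocate and initialize an SRQ.  The WQE count is bumped by one and,
+ * for mem-free HCAs, rounded up to a power of two; the per-WQE stride
+ * is the smallest power of two (at least 64 bytes) that fits a next
+ * segment plus max_gs scatter entries, and wqe_shift is its log2.
+ */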
+int mthca_alloc_srq(struct mthca_dev *dev, struct mthca_pd *pd,
+ struct ib_srq_attr *attr, struct mthca_srq *srq)
+{
+ struct mthca_mailbox *mailbox;
+ int ds;
+ int err;
+
+ /* Sanity check SRQ size before proceeding */
+ if (attr->max_wr > dev->limits.max_srq_wqes ||
+ attr->max_sge > dev->limits.max_srq_sge)
+ return -EINVAL;
+
+ srq->max = attr->max_wr;
+ srq->max_gs = attr->max_sge;
+ srq->counter = 0;
+
+ if (mthca_is_memfree(dev))
+ srq->max = roundup_pow_of_two(srq->max + 1);
+ else
+ srq->max = srq->max + 1;
+
+ ds = max(64UL,
+ roundup_pow_of_two(sizeof (struct mthca_next_seg) +
+ srq->max_gs * sizeof (struct mthca_data_seg)));
+
+ if (!mthca_is_memfree(dev) && (ds > dev->limits.max_desc_sz))
+ return -EINVAL;
+
+ srq->wqe_shift = ilog2(ds);
+
+ srq->srqn = mthca_alloc(&dev->srq_table.alloc);
+ if (srq->srqn == -1)
+ return -ENOMEM;
+
+ if (mthca_is_memfree(dev)) {
+ err = mthca_table_get(dev, dev->srq_table.table, srq->srqn);
+ if (err)
+ goto err_out;
+
+ if (!pd->ibpd.uobject) {
+ srq->db_index = mthca_alloc_db(dev, MTHCA_DB_TYPE_SRQ,
+ srq->srqn, &srq->db);
+ if (srq->db_index < 0) {
+ err = -ENOMEM;
+ goto err_out_icm;
+ }
+ }
+ }
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox)) {
+ err = PTR_ERR(mailbox);
+ goto err_out_db;
+ }
+
+ err = mthca_alloc_srq_buf(dev, pd, srq);
+ if (err)
+ goto err_out_mailbox;
+
+ spin_lock_init(&srq->lock);
+ srq->refcount = 1;
+ init_waitqueue_head(&srq->wait);
+ mutex_init(&srq->mutex);
+
+ if (mthca_is_memfree(dev))
+ mthca_arbel_init_srq_context(dev, pd, srq, mailbox->buf);
+ else
+ mthca_tavor_init_srq_context(dev, pd, srq, mailbox->buf);
+
+ err = mthca_SW2HW_SRQ(dev, mailbox, srq->srqn);
+
+ if (err) {
+ mthca_warn(dev, "SW2HW_SRQ failed (%d)\n", err);
+ goto err_out_free_buf;
+ }
+
+ spin_lock_irq(&dev->srq_table.lock);
+ if (mthca_array_set(&dev->srq_table.srq,
+ srq->srqn & (dev->limits.num_srqs - 1),
+ srq)) {
+ spin_unlock_irq(&dev->srq_table.lock);
+ goto err_out_free_srq;
+ }
+ spin_unlock_irq(&dev->srq_table.lock);
+
+ mthca_free_mailbox(dev, mailbox);
+
+ srq->first_free = 0;
+ srq->last_free = srq->max - 1;
+
+ attr->max_wr = srq->max - 1;
+ attr->max_sge = srq->max_gs;
+
+ return 0;
+
+err_out_free_srq:
+ err = mthca_HW2SW_SRQ(dev, mailbox, srq->srqn);
+ if (err)
+ mthca_warn(dev, "HW2SW_SRQ failed (%d)\n", err);
+
+err_out_free_buf:
+ if (!pd->ibpd.uobject)
+ mthca_free_srq_buf(dev, srq);
+
+err_out_mailbox:
+ mthca_free_mailbox(dev, mailbox);
+
+err_out_db:
+ if (!pd->ibpd.uobject && mthca_is_memfree(dev))
+ mthca_free_db(dev, MTHCA_DB_TYPE_SRQ, srq->db_index);
+
+err_out_icm:
+ mthca_table_put(dev, dev->srq_table.table, srq->srqn);
+
+err_out:
+ mthca_free(&dev->srq_table.alloc, srq->srqn);
+
+ return err;
+}
+
+static inline int get_srq_refcount(struct mthca_dev *dev, struct mthca_srq *srq)
+{
+ int c;
+
+ spin_lock_irq(&dev->srq_table.lock);
+ c = srq->refcount;
+ spin_unlock_irq(&dev->srq_table.lock);
+
+ return c;
+}
+
+void mthca_free_srq(struct mthca_dev *dev, struct mthca_srq *srq)
+{
+ struct mthca_mailbox *mailbox;
+ int err;
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox)) {
+ mthca_warn(dev, "No memory for mailbox to free SRQ.\n");
+ return;
+ }
+
+ err = mthca_HW2SW_SRQ(dev, mailbox, srq->srqn);
+ if (err)
+ mthca_warn(dev, "HW2SW_SRQ failed (%d)\n", err);
+
+ spin_lock_irq(&dev->srq_table.lock);
+ mthca_array_clear(&dev->srq_table.srq,
+ srq->srqn & (dev->limits.num_srqs - 1));
+ --srq->refcount;
+ spin_unlock_irq(&dev->srq_table.lock);
+
+ wait_event(srq->wait, !get_srq_refcount(dev, srq));
+
+ if (!srq->ibsrq.uobject) {
+ mthca_free_srq_buf(dev, srq);
+ if (mthca_is_memfree(dev))
+ mthca_free_db(dev, MTHCA_DB_TYPE_SRQ, srq->db_index);
+ }
+
+ mthca_table_put(dev, dev->srq_table.table, srq->srqn);
+ mthca_free(&dev->srq_table.alloc, srq->srqn);
+ mthca_free_mailbox(dev, mailbox);
+}
+
+int mthca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
+{
+ struct mthca_dev *dev = to_mdev(ibsrq->device);
+ struct mthca_srq *srq = to_msrq(ibsrq);
+ int ret = 0;
+
+ /* We don't support resizing SRQs (yet?) */
+ if (attr_mask & IB_SRQ_MAX_WR)
+ return -EINVAL;
+
+ if (attr_mask & IB_SRQ_LIMIT) {
+ u32 max_wr = mthca_is_memfree(dev) ? srq->max - 1 : srq->max;
+ if (attr->srq_limit > max_wr)
+ return -EINVAL;
+
+ mutex_lock(&srq->mutex);
+ ret = mthca_ARM_SRQ(dev, srq->srqn, attr->srq_limit);
+ mutex_unlock(&srq->mutex);
+ }
+
+ return ret;
+}
+
+int mthca_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
+{
+ struct mthca_dev *dev = to_mdev(ibsrq->device);
+ struct mthca_srq *srq = to_msrq(ibsrq);
+ struct mthca_mailbox *mailbox;
+ struct mthca_arbel_srq_context *arbel_ctx;
+ struct mthca_tavor_srq_context *tavor_ctx;
+ int err;
+
+ mailbox = mthca_alloc_mailbox(dev, GFP_KERNEL);
+ if (IS_ERR(mailbox))
+ return PTR_ERR(mailbox);
+
+ err = mthca_QUERY_SRQ(dev, srq->srqn, mailbox);
+ if (err)
+ goto out;
+
+ if (mthca_is_memfree(dev)) {
+ arbel_ctx = mailbox->buf;
+ srq_attr->srq_limit = be16_to_cpu(arbel_ctx->limit_watermark);
+ } else {
+ tavor_ctx = mailbox->buf;
+ srq_attr->srq_limit = be16_to_cpu(tavor_ctx->limit_watermark);
+ }
+
+ srq_attr->max_wr = srq->max - 1;
+ srq_attr->max_sge = srq->max_gs;
+
+out:
+ mthca_free_mailbox(dev, mailbox);
+
+ return err;
+}
+
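+/*
+ * Dispatch an async event for an SRQ.  The reference taken under the
+ * SRQ table lock keeps the SRQ from being freed while its event handler
+ * runs; mthca_free_srq() waits for the refcount to drop to zero.
+ */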
+void mthca_srq_event(struct mthca_dev *dev, u32 srqn,
+ enum ib_event_type event_type)
+{
+ struct mthca_srq *srq;
+ struct ib_event event;
+
+ spin_lock(&dev->srq_table.lock);
+ srq = mthca_array_get(&dev->srq_table.srq, srqn & (dev->limits.num_srqs - 1));
+ if (srq)
+ ++srq->refcount;
+ spin_unlock(&dev->srq_table.lock);
+
+ if (!srq) {
+ mthca_warn(dev, "Async event for bogus SRQ %08x\n", srqn);
+ return;
+ }
+
+ if (!srq->ibsrq.event_handler)
+ goto out;
+
+ event.device = &dev->ib_dev;
+ event.event = event_type;
+ event.element.srq = &srq->ibsrq;
+ srq->ibsrq.event_handler(&event, srq->ibsrq.srq_context);
+
+out:
+ spin_lock(&dev->srq_table.lock);
+ if (!--srq->refcount)
+ wake_up(&srq->wait);
+ spin_unlock(&dev->srq_table.lock);
+}
+
+/*
+ * This function must be called with IRQs disabled.
+ */
+void mthca_free_srq_wqe(struct mthca_srq *srq, u32 wqe_addr)
+{
+ int ind;
+ struct mthca_next_seg *last_free;
+
+ ind = wqe_addr >> srq->wqe_shift;
+
+ spin_lock(&srq->lock);
+
+ last_free = get_wqe(srq, srq->last_free);
+ *wqe_to_link(last_free) = ind;
+ last_free->nda_op = htonl((ind << srq->wqe_shift) | 1);
+ *wqe_to_link(get_wqe(srq, ind)) = -1;
+ srq->last_free = ind;
+
+ spin_unlock(&srq->lock);
+}
+
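+/*
+ * Post receive work requests to a Tavor SRQ.  WQEs are taken off the
+ * free list (linked through the imm field, see wqe_to_link()), the
+ * previous WQE's DBD bit is set to extend the chain, and the receive
+ * doorbell is rung in batches of MTHCA_TAVOR_MAX_WQES_PER_RECV_DB.
+ */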
+int mthca_tavor_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct mthca_dev *dev = to_mdev(ibsrq->device);
+ struct mthca_srq *srq = to_msrq(ibsrq);
+ unsigned long flags;
+ int err = 0;
+ int first_ind;
+ int ind;
+ int next_ind;
+ int nreq;
+ int i;
+ void *wqe;
+ void *prev_wqe;
+
+ spin_lock_irqsave(&srq->lock, flags);
+
+ first_ind = srq->first_free;
+
+ for (nreq = 0; wr; wr = wr->next) {
+ ind = srq->first_free;
+ wqe = get_wqe(srq, ind);
+ next_ind = *wqe_to_link(wqe);
+
+ if (unlikely(next_ind < 0)) {
+ mthca_err(dev, "SRQ %06x full\n", srq->srqn);
+ err = -ENOMEM;
+ *bad_wr = wr;
+ break;
+ }
+
+ prev_wqe = srq->last;
+ srq->last = wqe;
+
+ ((struct mthca_next_seg *) wqe)->ee_nds = 0;
+ /* flags field will always remain 0 */
+
+ wqe += sizeof (struct mthca_next_seg);
+
+ if (unlikely(wr->num_sge > srq->max_gs)) {
+ err = -EINVAL;
+ *bad_wr = wr;
+ srq->last = prev_wqe;
+ break;
+ }
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ mthca_set_data_seg(wqe, wr->sg_list + i);
+ wqe += sizeof (struct mthca_data_seg);
+ }
+
+ if (i < srq->max_gs)
+ mthca_set_data_seg_inval(wqe);
+
+ ((struct mthca_next_seg *) prev_wqe)->ee_nds =
+ cpu_to_be32(MTHCA_NEXT_DBD);
+
+ srq->wrid[ind] = wr->wr_id;
+ srq->first_free = next_ind;
+
+ ++nreq;
+ if (unlikely(nreq == MTHCA_TAVOR_MAX_WQES_PER_RECV_DB)) {
+ nreq = 0;
+
+ /*
+ * Make sure that descriptors are written
+ * before doorbell is rung.
+ */
+ wmb();
+
+ mthca_write64(first_ind << srq->wqe_shift, srq->srqn << 8,
+ dev->kar + MTHCA_RECEIVE_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+
+ first_ind = srq->first_free;
+ }
+ }
+
+ if (likely(nreq)) {
+ /*
+ * Make sure that descriptors are written before
+ * doorbell is rung.
+ */
+ wmb();
+
+ mthca_write64(first_ind << srq->wqe_shift, (srq->srqn << 8) | nreq,
+ dev->kar + MTHCA_RECEIVE_DOORBELL,
+ MTHCA_GET_DOORBELL_LOCK(&dev->doorbell_lock));
+ }
+
+ /*
+ * Make sure doorbells don't leak out of SRQ spinlock and
+ * reach the HCA out of order:
+ */
+ mmiowb();
+
+ spin_unlock_irqrestore(&srq->lock, flags);
+ return err;
+}
+
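+/*
+ * Post receive work requests to an Arbel (mem-free) SRQ.  As with QP
+ * receives on Arbel, no MMIO doorbell is involved; the SRQ counter is
+ * advanced and written to the doorbell record instead.
+ */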
+int mthca_arbel_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct mthca_dev *dev = to_mdev(ibsrq->device);
+ struct mthca_srq *srq = to_msrq(ibsrq);
+ unsigned long flags;
+ int err = 0;
+ int ind;
+ int next_ind;
+ int nreq;
+ int i;
+ void *wqe;
+
+ spin_lock_irqsave(&srq->lock, flags);
+
+ for (nreq = 0; wr; ++nreq, wr = wr->next) {
+ ind = srq->first_free;
+ wqe = get_wqe(srq, ind);
+ next_ind = *wqe_to_link(wqe);
+
+ if (unlikely(next_ind < 0)) {
+ mthca_err(dev, "SRQ %06x full\n", srq->srqn);
+ err = -ENOMEM;
+ *bad_wr = wr;
+ break;
+ }
+
+ ((struct mthca_next_seg *) wqe)->ee_nds = 0;
+ /* flags field will always remain 0 */
+
+ wqe += sizeof (struct mthca_next_seg);
+
+ if (unlikely(wr->num_sge > srq->max_gs)) {
+ err = -EINVAL;
+ *bad_wr = wr;
+ break;
+ }
+
+ for (i = 0; i < wr->num_sge; ++i) {
+ mthca_set_data_seg(wqe, wr->sg_list + i);
+ wqe += sizeof (struct mthca_data_seg);
+ }
+
+ if (i < srq->max_gs)
+ mthca_set_data_seg_inval(wqe);
+
+ srq->wrid[ind] = wr->wr_id;
+ srq->first_free = next_ind;
+ }
+
+ if (likely(nreq)) {
+ srq->counter += nreq;
+
+ /*
+ * Make sure that descriptors are written before
+ * we write doorbell record.
+ */
+ wmb();
+ *srq->db = cpu_to_be32(srq->counter);
+ }
+
+ spin_unlock_irqrestore(&srq->lock, flags);
+ return err;
+}
+
+int mthca_max_srq_sge(struct mthca_dev *dev)
+{
+ if (mthca_is_memfree(dev))
+ return dev->limits.max_sg;
+
+ /*
+	 * SRQ allocations are based on powers of 2 for Tavor
+	 * (although they only need to be multiples of 16 bytes).
+ *
+ * Therefore, we need to base the max number of sg entries on
+ * the largest power of 2 descriptor size that is <= to the
+ * actual max WQE descriptor size, rather than return the
+ * max_sg value given by the firmware (which is based on WQE
+ * sizes as multiples of 16, not powers of 2).
+ *
+ * If SRQ implementation is changed for Tavor to be based on
+ * multiples of 16, the calculation below can be deleted and
+ * the FW max_sg value returned.
+ */
+ return min_t(int, dev->limits.max_sg,
+ ((1 << (fls(dev->limits.max_desc_sz) - 1)) -
+ sizeof (struct mthca_next_seg)) /
+ sizeof (struct mthca_data_seg));
+}
+
+int mthca_init_srq_table(struct mthca_dev *dev)
+{
+ int err;
+
+ if (!(dev->mthca_flags & MTHCA_FLAG_SRQ))
+ return 0;
+
+ spin_lock_init(&dev->srq_table.lock);
+
+ err = mthca_alloc_init(&dev->srq_table.alloc,
+ dev->limits.num_srqs,
+ dev->limits.num_srqs - 1,
+ dev->limits.reserved_srqs);
+ if (err)
+ return err;
+
+ err = mthca_array_init(&dev->srq_table.srq,
+ dev->limits.num_srqs);
+ if (err)
+ mthca_alloc_cleanup(&dev->srq_table.alloc);
+
+ return err;
+}
+
+void mthca_cleanup_srq_table(struct mthca_dev *dev)
+{
+ if (!(dev->mthca_flags & MTHCA_FLAG_SRQ))
+ return;
+
+ mthca_array_cleanup(&dev->srq_table.srq, dev->limits.num_srqs);
+ mthca_alloc_cleanup(&dev->srq_table.alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_uar.c b/drivers/infiniband/hw/mthca/mthca_uar.c
new file mode 100644
index 000000000..ca5900c96
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_uar.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <asm/page.h> /* PAGE_SHIFT */
+
+#include "mthca_dev.h"
+#include "mthca_memfree.h"
+
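+/*
+ * Allocate a User Access Region (UAR) index and compute the page frame
+ * number of the matching UAR page in BAR 2 of the device.
+ */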
+int mthca_uar_alloc(struct mthca_dev *dev, struct mthca_uar *uar)
+{
+ uar->index = mthca_alloc(&dev->uar_table.alloc);
+ if (uar->index == -1)
+ return -ENOMEM;
+
+ uar->pfn = (pci_resource_start(dev->pdev, 2) >> PAGE_SHIFT) + uar->index;
+
+ return 0;
+}
+
+void mthca_uar_free(struct mthca_dev *dev, struct mthca_uar *uar)
+{
+ mthca_free(&dev->uar_table.alloc, uar->index);
+}
+
+int mthca_init_uar_table(struct mthca_dev *dev)
+{
+ int ret;
+
+ ret = mthca_alloc_init(&dev->uar_table.alloc,
+ dev->limits.num_uars,
+ dev->limits.num_uars - 1,
+ dev->limits.reserved_uars + 1);
+ if (ret)
+ return ret;
+
+ ret = mthca_init_db_tab(dev);
+ if (ret)
+ mthca_alloc_cleanup(&dev->uar_table.alloc);
+
+ return ret;
+}
+
+void mthca_cleanup_uar_table(struct mthca_dev *dev)
+{
+ mthca_cleanup_db_tab(dev);
+
+ /* XXX check if any UARs are still allocated? */
+ mthca_alloc_cleanup(&dev->uar_table.alloc);
+}
diff --git a/drivers/infiniband/hw/mthca/mthca_user.h b/drivers/infiniband/hw/mthca/mthca_user.h
new file mode 100644
index 000000000..5fe56e810
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_user.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_USER_H
+#define MTHCA_USER_H
+
+#include <linux/types.h>
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define MTHCA_UVERBS_ABI_VERSION 1
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
+
+struct mthca_alloc_ucontext_resp {
+ __u32 qp_tab_size;
+ __u32 uarc_size;
+};
+
+struct mthca_alloc_pd_resp {
+ __u32 pdn;
+ __u32 reserved;
+};
+
+struct mthca_reg_mr {
+/*
+ * Mark the memory region with a DMA attribute that causes
+ * in-flight DMA to be flushed when the region is written to:
+ */
+#define MTHCA_MR_DMASYNC 0x1
+ __u32 mr_attrs;
+ __u32 reserved;
+};
+
+struct mthca_create_cq {
+ __u32 lkey;
+ __u32 pdn;
+ __u64 arm_db_page;
+ __u64 set_db_page;
+ __u32 arm_db_index;
+ __u32 set_db_index;
+};
+
+struct mthca_create_cq_resp {
+ __u32 cqn;
+ __u32 reserved;
+};
+
+struct mthca_resize_cq {
+ __u32 lkey;
+ __u32 reserved;
+};
+
+struct mthca_create_srq {
+ __u32 lkey;
+ __u32 db_index;
+ __u64 db_page;
+};
+
+struct mthca_create_srq_resp {
+ __u32 srqn;
+ __u32 reserved;
+};
+
+struct mthca_create_qp {
+ __u32 lkey;
+ __u32 reserved;
+ __u64 sq_db_page;
+ __u64 rq_db_page;
+ __u32 sq_db_index;
+ __u32 rq_db_index;
+};
+
+#endif /* MTHCA_USER_H */
diff --git a/drivers/infiniband/hw/mthca/mthca_wqe.h b/drivers/infiniband/hw/mthca/mthca_wqe.h
new file mode 100644
index 000000000..341a5ae88
--- /dev/null
+++ b/drivers/infiniband/hw/mthca/mthca_wqe.h
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef MTHCA_WQE_H
+#define MTHCA_WQE_H
+
+#include <linux/types.h>
+
+enum {
+ MTHCA_NEXT_DBD = 1 << 7,
+ MTHCA_NEXT_FENCE = 1 << 6,
+ MTHCA_NEXT_CQ_UPDATE = 1 << 3,
+ MTHCA_NEXT_EVENT_GEN = 1 << 2,
+ MTHCA_NEXT_SOLICIT = 1 << 1,
+ MTHCA_NEXT_IP_CSUM = 1 << 4,
+ MTHCA_NEXT_TCP_UDP_CSUM = 1 << 5,
+
+ MTHCA_MLX_VL15 = 1 << 17,
+ MTHCA_MLX_SLR = 1 << 16
+};
+
+enum {
+ MTHCA_INVAL_LKEY = 0x100,
+ MTHCA_TAVOR_MAX_WQES_PER_RECV_DB = 256,
+ MTHCA_ARBEL_MAX_WQES_PER_SEND_DB = 255
+};
+
+struct mthca_next_seg {
+ __be32 nda_op; /* [31:6] next WQE [4:0] next opcode */
+ __be32 ee_nds; /* [31:8] next EE [7] DBD [6] F [5:0] next WQE size */
+ __be32 flags; /* [3] CQ [2] Event [1] Solicit */
+ __be32 imm; /* immediate data */
+};
+
+struct mthca_tavor_ud_seg {
+ u32 reserved1;
+ __be32 lkey;
+ __be64 av_addr;
+ u32 reserved2[4];
+ __be32 dqpn;
+ __be32 qkey;
+ u32 reserved3[2];
+};
+
+struct mthca_arbel_ud_seg {
+ __be32 av[8];
+ __be32 dqpn;
+ __be32 qkey;
+ u32 reserved[2];
+};
+
+struct mthca_bind_seg {
+ __be32 flags; /* [31] Atomic [30] rem write [29] rem read */
+ u32 reserved;
+ __be32 new_rkey;
+ __be32 lkey;
+ __be64 addr;
+ __be64 length;
+};
+
+struct mthca_raddr_seg {
+ __be64 raddr;
+ __be32 rkey;
+ u32 reserved;
+};
+
+struct mthca_atomic_seg {
+ __be64 swap_add;
+ __be64 compare;
+};
+
+struct mthca_data_seg {
+ __be32 byte_count;
+ __be32 lkey;
+ __be64 addr;
+};
+
+struct mthca_mlx_seg {
+ __be32 nda_op;
+ __be32 nds;
+ __be32 flags; /* [17] VL15 [16] SLR [14:12] static rate
+ [11:8] SL [3] C [2] E */
+ __be16 rlid;
+ __be16 vcrc;
+};
+
+static __always_inline void mthca_set_data_seg(struct mthca_data_seg *dseg,
+ struct ib_sge *sg)
+{
+ dseg->byte_count = cpu_to_be32(sg->length);
+ dseg->lkey = cpu_to_be32(sg->lkey);
+ dseg->addr = cpu_to_be64(sg->addr);
+}
+
+static __always_inline void mthca_set_data_seg_inval(struct mthca_data_seg *dseg)
+{
+ dseg->byte_count = 0;
+ dseg->lkey = cpu_to_be32(MTHCA_INVAL_LKEY);
+ dseg->addr = 0;
+}
+
+#endif /* MTHCA_WQE_H */
diff --git a/drivers/infiniband/hw/nes/Kconfig b/drivers/infiniband/hw/nes/Kconfig
new file mode 100644
index 000000000..846dc97cf
--- /dev/null
+++ b/drivers/infiniband/hw/nes/Kconfig
@@ -0,0 +1,16 @@
+config INFINIBAND_NES
+ tristate "NetEffect RNIC Driver"
+ depends on PCI && INET && INFINIBAND
+ select LIBCRC32C
+ select INET_LRO
+ ---help---
+ This is the RDMA Network Interface Card (RNIC) driver for
+ NetEffect Ethernet Cluster Server Adapters.
+
+config INFINIBAND_NES_DEBUG
+ bool "Verbose debugging output"
+ depends on INFINIBAND_NES
+ default n
+ ---help---
+ This option enables debug messages from the NetEffect RNIC
+ driver. Select this if you are diagnosing a problem.
diff --git a/drivers/infiniband/hw/nes/Makefile b/drivers/infiniband/hw/nes/Makefile
new file mode 100644
index 000000000..97820c23e
--- /dev/null
+++ b/drivers/infiniband/hw/nes/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_INFINIBAND_NES) += iw_nes.o
+
+iw_nes-objs := nes.o nes_hw.o nes_nic.o nes_utils.o nes_verbs.o nes_cm.o nes_mgt.o
diff --git a/drivers/infiniband/hw/nes/nes.c b/drivers/infiniband/hw/nes/nes.c
new file mode 100644
index 000000000..9f9d5c563
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes.c
@@ -0,0 +1,1270 @@
+/*
+ * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/mii.h>
+#include <linux/if_vlan.h>
+#include <linux/crc32.h>
+#include <linux/in.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/if_arp.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/byteorder.h>
+#include <rdma/ib_smi.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_pack.h>
+#include <rdma/iw_cm.h>
+
+#include "nes.h"
+
+#include <net/netevent.h>
+#include <net/neighbour.h>
+#include <linux/route.h>
+#include <net/ip_fib.h>
+
+MODULE_AUTHOR("NetEffect");
+MODULE_DESCRIPTION("NetEffect RNIC Low-level iWARP Driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+int max_mtu = 9000;
+int interrupt_mod_interval = 0;
+
+/* Interoperability */
+int mpa_version = 1;
+module_param(mpa_version, int, 0644);
+MODULE_PARM_DESC(mpa_version, "MPA version to be used in MPA Req/Resp (0 or 1)");
+
+/* Interoperability */
+int disable_mpa_crc = 0;
+module_param(disable_mpa_crc, int, 0644);
+MODULE_PARM_DESC(disable_mpa_crc, "Disable checking of MPA CRC");
+
+unsigned int nes_drv_opt = NES_DRV_OPT_DISABLE_INT_MOD | NES_DRV_OPT_ENABLE_PAU;
+module_param(nes_drv_opt, int, 0644);
+MODULE_PARM_DESC(nes_drv_opt, "Driver option parameters");
+
+unsigned int nes_debug_level = 0;
+module_param_named(debug_level, nes_debug_level, uint, 0644);
+MODULE_PARM_DESC(debug_level, "Enable debug output level");
+
+unsigned int wqm_quanta = 0x10000;
+module_param(wqm_quanta, int, 0644);
+MODULE_PARM_DESC(wqm_quanta, "WQM quanta");
+
+static bool limit_maxrdreqsz;
+module_param(limit_maxrdreqsz, bool, 0644);
+MODULE_PARM_DESC(limit_maxrdreqsz, "Limit max read request size to 256 Bytes");
+
+LIST_HEAD(nes_adapter_list);
+static LIST_HEAD(nes_dev_list);
+
+atomic_t qps_destroyed;
+
+static unsigned int ee_flsh_adapter;
+static unsigned int sysfs_nonidx_addr;
+static unsigned int sysfs_idx_addr;
+
+static struct pci_device_id nes_pci_table[] = {
+ { PCI_VDEVICE(NETEFFECT, PCI_DEVICE_ID_NETEFFECT_NE020), },
+ { PCI_VDEVICE(NETEFFECT, PCI_DEVICE_ID_NETEFFECT_NE020_KR), },
+ {0}
+};
+
+MODULE_DEVICE_TABLE(pci, nes_pci_table);
+
+/* registered nes netlink callbacks */
+static struct ibnl_client_cbs nes_nl_cb_table[] = {
+ [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb},
+ [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb},
+ [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = iwpm_add_and_query_mapping_cb},
+ [RDMA_NL_IWPM_REMOTE_INFO] = {.dump = iwpm_remote_info_cb},
+ [RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb},
+ [RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb},
+ [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = iwpm_ack_mapping_info_cb}
+};
+
+static int nes_inetaddr_event(struct notifier_block *, unsigned long, void *);
+static int nes_net_event(struct notifier_block *, unsigned long, void *);
+static int nes_notifiers_registered;
+
+
+static struct notifier_block nes_inetaddr_notifier = {
+ .notifier_call = nes_inetaddr_event
+};
+
+static struct notifier_block nes_net_notifier = {
+ .notifier_call = nes_net_event
+};
+
+/**
+ * nes_inetaddr_event
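+ *
+ * inetaddr notifier callback.  On NETDEV_UP and NETDEV_CHANGEADDR the new
+ * local IPv4 address is written to the per-function destination IP register
+ * and added to the hardware ARP cache; on NETDEV_DOWN both are cleared.
+ * Events on a bonding master are also applied to nes devices enslaved to it.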
+ */
+static int nes_inetaddr_event(struct notifier_block *notifier,
+ unsigned long event, void *ptr)
+{
+ struct in_ifaddr *ifa = ptr;
+ struct net_device *event_netdev = ifa->ifa_dev->dev;
+ struct nes_device *nesdev;
+ struct net_device *netdev;
+ struct net_device *upper_dev;
+ struct nes_vnic *nesvnic;
+ unsigned int is_bonded;
+
+ nes_debug(NES_DBG_NETDEV, "nes_inetaddr_event: ip address %pI4, netmask %pI4.\n",
+ &ifa->ifa_address, &ifa->ifa_mask);
+ list_for_each_entry(nesdev, &nes_dev_list, list) {
+ nes_debug(NES_DBG_NETDEV, "Nesdev list entry = 0x%p. (%s)\n",
+ nesdev, nesdev->netdev[0]->name);
+ netdev = nesdev->netdev[0];
+ nesvnic = netdev_priv(netdev);
+ upper_dev = netdev_master_upper_dev_get(netdev);
+ is_bonded = netif_is_bond_slave(netdev) &&
+ (upper_dev == event_netdev);
+ if ((netdev == event_netdev) || is_bonded) {
+ if (nesvnic->rdma_enabled == 0) {
+ nes_debug(NES_DBG_NETDEV, "Returning without processing event for %s since"
+ " RDMA is not enabled.\n",
+ netdev->name);
+ return NOTIFY_OK;
+ }
+ /* we have ifa->ifa_address/mask here if we need it */
+ switch (event) {
+ case NETDEV_DOWN:
+ nes_debug(NES_DBG_NETDEV, "event:DOWN\n");
+ nes_write_indexed(nesdev,
+ NES_IDX_DST_IP_ADDR+(0x10*PCI_FUNC(nesdev->pcidev->devfn)), 0);
+
+ nes_manage_arp_cache(netdev, netdev->dev_addr,
+ ntohl(nesvnic->local_ipaddr), NES_ARP_DELETE);
+ nesvnic->local_ipaddr = 0;
+ if (is_bonded)
+ continue;
+ else
+ return NOTIFY_OK;
+ break;
+ case NETDEV_UP:
+ nes_debug(NES_DBG_NETDEV, "event:UP\n");
+
+ if (nesvnic->local_ipaddr != 0) {
+ nes_debug(NES_DBG_NETDEV, "Interface already has local_ipaddr\n");
+ return NOTIFY_OK;
+ }
+ /* fall through */
+ case NETDEV_CHANGEADDR:
+ /* Add the address to the IP table */
+ if (upper_dev)
+ nesvnic->local_ipaddr =
+ ((struct in_device *)upper_dev->ip_ptr)->ifa_list->ifa_address;
+ else
+ nesvnic->local_ipaddr = ifa->ifa_address;
+
+ nes_write_indexed(nesdev,
+ NES_IDX_DST_IP_ADDR+(0x10*PCI_FUNC(nesdev->pcidev->devfn)),
+ ntohl(nesvnic->local_ipaddr));
+ nes_manage_arp_cache(netdev, netdev->dev_addr,
+ ntohl(nesvnic->local_ipaddr), NES_ARP_ADD);
+ if (is_bonded)
+ continue;
+ else
+ return NOTIFY_OK;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+
+ return NOTIFY_DONE;
+}
+
+
+/**
+ * nes_net_event
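+ *
+ * netevent notifier callback.  On NETEVENT_NEIGH_UPDATE the matching
+ * hardware ARP cache entry is added or removed depending on whether the
+ * neighbour state is still valid.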
+ */
+static int nes_net_event(struct notifier_block *notifier,
+ unsigned long event, void *ptr)
+{
+ struct neighbour *neigh = ptr;
+ struct nes_device *nesdev;
+ struct net_device *netdev;
+ struct nes_vnic *nesvnic;
+
+ switch (event) {
+ case NETEVENT_NEIGH_UPDATE:
+ list_for_each_entry(nesdev, &nes_dev_list, list) {
+ /* nes_debug(NES_DBG_NETDEV, "Nesdev list entry = 0x%p.\n", nesdev); */
+ netdev = nesdev->netdev[0];
+ nesvnic = netdev_priv(netdev);
+ if (netdev == neigh->dev) {
+ if (nesvnic->rdma_enabled == 0) {
+ nes_debug(NES_DBG_NETDEV, "Skipping device %s since no RDMA\n",
+ netdev->name);
+ } else {
+ if (neigh->nud_state & NUD_VALID) {
+ nes_manage_arp_cache(neigh->dev, neigh->ha,
+ ntohl(*(__be32 *)neigh->primary_key), NES_ARP_ADD);
+ } else {
+ nes_manage_arp_cache(neigh->dev, neigh->ha,
+ ntohl(*(__be32 *)neigh->primary_key), NES_ARP_DELETE);
+ }
+ }
+ return NOTIFY_OK;
+ }
+ }
+ break;
+ default:
+ nes_debug(NES_DBG_NETDEV, "NETEVENT_ %lu undefined\n", event);
+ break;
+ }
+
+ return NOTIFY_DONE;
+}
+
+
+/**
+ * nes_add_ref
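+ *
+ * Take an additional reference on the nes_qp behind the ib_qp.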
+ */
+void nes_add_ref(struct ib_qp *ibqp)
+{
+ struct nes_qp *nesqp;
+
+ nesqp = to_nesqp(ibqp);
+ nes_debug(NES_DBG_QP, "Bumping refcount for QP%u. Pre-inc value = %u\n",
+ ibqp->qp_num, atomic_read(&nesqp->refcount));
+ atomic_inc(&nesqp->refcount);
+}
+
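+/*
+ * Completion callback for the destroy-QP CQP request posted from
+ * nes_rem_ref(): free the Q2/PBL or SQ DMA buffers, release the QP number
+ * and clear the adapter QP table entry.
+ */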
+static void nes_cqp_rem_ref_callback(struct nes_device *nesdev, struct nes_cqp_request *cqp_request)
+{
+ unsigned long flags;
+ struct nes_qp *nesqp = cqp_request->cqp_callback_pointer;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+
+ atomic_inc(&qps_destroyed);
+
+ /* Free the control structures */
+
+ if (nesqp->pbl_vbase) {
+ pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size,
+ nesqp->hwqp.q2_vbase, nesqp->hwqp.q2_pbase);
+ spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+ nesadapter->free_256pbl++;
+ spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+ pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase);
+ nesqp->pbl_vbase = NULL;
+
+ } else {
+ pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size,
+ nesqp->hwqp.sq_vbase, nesqp->hwqp.sq_pbase);
+ }
+ nes_free_resource(nesadapter, nesadapter->allocated_qps, nesqp->hwqp.qp_id);
+
+ nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = NULL;
+ kfree(nesqp->allocated_buffer);
+
+}
+
+/**
+ * nes_rem_ref
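+ *
+ * Drop a reference on the QP.  When the count reaches zero a destroy-QP
+ * CQP request is posted; nes_cqp_rem_ref_callback() releases the QP
+ * resources once that request completes.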
+ */
+void nes_rem_ref(struct ib_qp *ibqp)
+{
+ u64 u64temp;
+ struct nes_qp *nesqp;
+ struct nes_vnic *nesvnic = to_nesvnic(ibqp->device);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_hw_cqp_wqe *cqp_wqe;
+ struct nes_cqp_request *cqp_request;
+ u32 opcode;
+
+ nesqp = to_nesqp(ibqp);
+
+ if (atomic_read(&nesqp->refcount) == 0) {
+ printk(KERN_INFO PFX "%s: Reference count already 0 for QP%d, last aeq = 0x%04X.\n",
+ __func__, ibqp->qp_num, nesqp->last_aeq);
+ BUG();
+ }
+
+ if (atomic_dec_and_test(&nesqp->refcount)) {
+ if (nesqp->pau_mode)
+ nes_destroy_pau_qp(nesdev, nesqp);
+
+ /* Destroy the QP */
+ cqp_request = nes_get_cqp_request(nesdev);
+ if (cqp_request == NULL) {
+ nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n");
+ return;
+ }
+ cqp_request->waiting = 0;
+ cqp_request->callback = 1;
+ cqp_request->cqp_callback = nes_cqp_rem_ref_callback;
+ cqp_request->cqp_callback_pointer = nesqp;
+ cqp_wqe = &cqp_request->cqp_wqe;
+
+ nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+ opcode = NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_IWARP;
+
+ if (nesqp->hte_added) {
+ opcode |= NES_CQP_QP_DEL_HTE;
+ nesqp->hte_added = 0;
+ }
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id);
+ u64temp = (u64)nesqp->nesqp_context_pbase;
+ set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
+ nes_post_cqp_request(nesdev, cqp_request);
+ }
+}
+
+
+/**
+ * nes_get_qp
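+ *
+ * Look up the ib_qp for the given QP number in the adapter QP table
+ * (NULL if the number is out of range).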
+ */
+struct ib_qp *nes_get_qp(struct ib_device *device, int qpn)
+{
+ struct nes_vnic *nesvnic = to_nesvnic(device);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+
+ if ((qpn < NES_FIRST_QPN) || (qpn >= (NES_FIRST_QPN + nesadapter->max_qp)))
+ return NULL;
+
+ return &nesadapter->qp_table[qpn - NES_FIRST_QPN]->ibqp;
+}
+
+
+/**
+ * nes_print_macaddr
+ */
+static void nes_print_macaddr(struct net_device *netdev)
+{
+ nes_debug(NES_DBG_INIT, "%s: %pM, IRQ %u\n",
+ netdev->name, netdev->dev_addr, netdev->irq);
+}
+
+/**
+ * nes_interrupt - handle interrupts
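+ *
+ * With MSI the interrupt is always ours.  For legacy shared interrupts on
+ * the original NE020 revision the CEQ/AEQ, timer and interface status
+ * registers are checked before claiming the interrupt; once claimed, the
+ * status is saved and the bottom half is kicked via nes_napi_isr() or the
+ * DPC tasklet.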
+ */
+static irqreturn_t nes_interrupt(int irq, void *dev_id)
+{
+ struct nes_device *nesdev = (struct nes_device *)dev_id;
+ int handled = 0;
+ u32 int_mask;
+ u32 int_req;
+ u32 int_stat;
+ u32 intf_int_stat;
+ u32 timer_stat;
+
+ if (nesdev->msi_enabled) {
+ /* No need to read the interrupt pending register if msi is enabled */
+ handled = 1;
+ } else {
+ if (unlikely(nesdev->nesadapter->hw_rev == NE020_REV)) {
+ /* Master interrupt enable provides synchronization for kicking off bottom half
+ when interrupt sharing is going on */
+ int_mask = nes_read32(nesdev->regs + NES_INT_MASK);
+ if (int_mask & 0x80000000) {
+ /* Check interrupt status to see if this might be ours */
+ int_stat = nes_read32(nesdev->regs + NES_INT_STAT);
+ int_req = nesdev->int_req;
+ if (int_stat&int_req) {
+ /* if interesting CEQ or AEQ is pending, claim the interrupt */
+ if ((int_stat&int_req) & (~(NES_INT_TIMER|NES_INT_INTF))) {
+ handled = 1;
+ } else {
+ if (((int_stat & int_req) & NES_INT_TIMER) == NES_INT_TIMER) {
+ /* Timer might be running but might be for another function */
+ timer_stat = nes_read32(nesdev->regs + NES_TIMER_STAT);
+ if ((timer_stat & nesdev->timer_int_req) != 0) {
+ handled = 1;
+ }
+ }
+ if ((((int_stat & int_req) & NES_INT_INTF) == NES_INT_INTF) &&
+ (handled == 0)) {
+ intf_int_stat = nes_read32(nesdev->regs+NES_INTF_INT_STAT);
+ if ((intf_int_stat & nesdev->intf_int_req) != 0) {
+ handled = 1;
+ }
+ }
+ }
+ if (handled) {
+ nes_write32(nesdev->regs+NES_INT_MASK, int_mask & (~0x80000000));
+ int_mask = nes_read32(nesdev->regs+NES_INT_MASK);
+ /* Save off the status to save an additional read */
+ nesdev->int_stat = int_stat;
+ nesdev->napi_isr_ran = 1;
+ }
+ }
+ }
+ } else {
+ handled = nes_read32(nesdev->regs+NES_INT_PENDING);
+ }
+ }
+
+ if (handled) {
+
+ if (nes_napi_isr(nesdev) == 0) {
+ tasklet_schedule(&nesdev->dpc_tasklet);
+
+ }
+ return IRQ_HANDLED;
+ } else {
+ return IRQ_NONE;
+ }
+}
+
+
+/**
+ * nes_probe - Device initialization
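+ *
+ * PCI probe: enable the device, map BAR0, set the DMA masks, initialize
+ * the adapter and control QP, hook up the interrupt, notifier chains and
+ * iWARP port mapper, then create and register the net_device.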
+ */
+static int nes_probe(struct pci_dev *pcidev, const struct pci_device_id *ent)
+{
+ struct net_device *netdev = NULL;
+ struct nes_device *nesdev = NULL;
+ int ret = 0;
+ void __iomem *mmio_regs = NULL;
+ u8 hw_rev;
+
+ assert(pcidev != NULL);
+ assert(ent != NULL);
+
+ printk(KERN_INFO PFX "NetEffect RNIC driver v%s loading. (%s)\n",
+ DRV_VERSION, pci_name(pcidev));
+
+ ret = pci_enable_device(pcidev);
+ if (ret) {
+ printk(KERN_ERR PFX "Unable to enable PCI device. (%s)\n", pci_name(pcidev));
+ goto bail0;
+ }
+
+ nes_debug(NES_DBG_INIT, "BAR0 (@0x%08lX) size = 0x%lX bytes\n",
+ (long unsigned int)pci_resource_start(pcidev, BAR_0),
+ (long unsigned int)pci_resource_len(pcidev, BAR_0));
+ nes_debug(NES_DBG_INIT, "BAR1 (@0x%08lX) size = 0x%lX bytes\n",
+ (long unsigned int)pci_resource_start(pcidev, BAR_1),
+ (long unsigned int)pci_resource_len(pcidev, BAR_1));
+
+ /* Make sure PCI base addr are MMIO */
+ if (!(pci_resource_flags(pcidev, BAR_0) & IORESOURCE_MEM) ||
+ !(pci_resource_flags(pcidev, BAR_1) & IORESOURCE_MEM)) {
+ printk(KERN_ERR PFX "PCI regions not an MMIO resource\n");
+ ret = -ENODEV;
+ goto bail1;
+ }
+
+ /* Reserve PCI I/O and memory resources */
+ ret = pci_request_regions(pcidev, DRV_NAME);
+ if (ret) {
+ printk(KERN_ERR PFX "Unable to request regions. (%s)\n", pci_name(pcidev));
+ goto bail1;
+ }
+
+ if ((sizeof(dma_addr_t) > 4)) {
+ ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(64));
+ if (ret < 0) {
+ printk(KERN_ERR PFX "64b DMA mask configuration failed\n");
+ goto bail2;
+ }
+ ret = pci_set_consistent_dma_mask(pcidev, DMA_BIT_MASK(64));
+ if (ret) {
+ printk(KERN_ERR PFX "64b DMA consistent mask configuration failed\n");
+ goto bail2;
+ }
+ } else {
+ ret = pci_set_dma_mask(pcidev, DMA_BIT_MASK(32));
+ if (ret < 0) {
+ printk(KERN_ERR PFX "32b DMA mask configuration failed\n");
+ goto bail2;
+ }
+ ret = pci_set_consistent_dma_mask(pcidev, DMA_BIT_MASK(32));
+ if (ret) {
+ printk(KERN_ERR PFX "32b DMA consistent mask configuration failed\n");
+ goto bail2;
+ }
+ }
+
+ pci_set_master(pcidev);
+
+ /* Allocate hardware structure */
+ nesdev = kzalloc(sizeof(struct nes_device), GFP_KERNEL);
+ if (!nesdev) {
+ printk(KERN_ERR PFX "%s: Unable to alloc hardware struct\n", pci_name(pcidev));
+ ret = -ENOMEM;
+ goto bail2;
+ }
+
+ nes_debug(NES_DBG_INIT, "Allocated nes device at %p\n", nesdev);
+ nesdev->pcidev = pcidev;
+ pci_set_drvdata(pcidev, nesdev);
+
+ pci_read_config_byte(pcidev, 0x0008, &hw_rev);
+ nes_debug(NES_DBG_INIT, "hw_rev=%u\n", hw_rev);
+
+ spin_lock_init(&nesdev->indexed_regs_lock);
+
+ /* Remap the PCI registers in adapter BAR0 to kernel VA space */
+ mmio_regs = ioremap_nocache(pci_resource_start(pcidev, BAR_0),
+ pci_resource_len(pcidev, BAR_0));
+ if (mmio_regs == NULL) {
+ printk(KERN_ERR PFX "Unable to remap BAR0\n");
+ ret = -EIO;
+ goto bail3;
+ }
+ nesdev->regs = mmio_regs;
+ nesdev->index_reg = 0x50 + (PCI_FUNC(pcidev->devfn)*8) + mmio_regs;
+
+ /* Ensure interrupts are disabled */
+ nes_write32(nesdev->regs+NES_INT_MASK, 0x7fffffff);
+
+ if (nes_drv_opt & NES_DRV_OPT_ENABLE_MSI) {
+ if (!pci_enable_msi(nesdev->pcidev)) {
+ nesdev->msi_enabled = 1;
+ nes_debug(NES_DBG_INIT, "MSI is enabled for device %s\n",
+ pci_name(pcidev));
+ } else {
+ nes_debug(NES_DBG_INIT, "MSI is disabled by linux for device %s\n",
+ pci_name(pcidev));
+ }
+ } else {
+ nes_debug(NES_DBG_INIT, "MSI not requested due to driver options for device %s\n",
+ pci_name(pcidev));
+ }
+
+ nesdev->csr_start = pci_resource_start(nesdev->pcidev, BAR_0);
+ nesdev->doorbell_region = pci_resource_start(nesdev->pcidev, BAR_1);
+
+ /* Init the adapter */
+ nesdev->nesadapter = nes_init_adapter(nesdev, hw_rev);
+ if (!nesdev->nesadapter) {
+ printk(KERN_ERR PFX "Unable to initialize adapter.\n");
+ ret = -ENOMEM;
+ goto bail5;
+ }
+ nesdev->nesadapter->et_rx_coalesce_usecs_irq = interrupt_mod_interval;
+ nesdev->nesadapter->wqm_quanta = wqm_quanta;
+
+ /* nesdev->base_doorbell_index =
+ nesdev->nesadapter->pd_config_base[PCI_FUNC(nesdev->pcidev->devfn)]; */
+ nesdev->base_doorbell_index = 1;
+ nesdev->doorbell_start = nesdev->nesadapter->doorbell_start;
+ if (nesdev->nesadapter->phy_type[0] == NES_PHY_TYPE_PUMA_1G) {
+ switch (PCI_FUNC(nesdev->pcidev->devfn) %
+ nesdev->nesadapter->port_count) {
+ case 1:
+ nesdev->mac_index = 2;
+ break;
+ case 2:
+ nesdev->mac_index = 1;
+ break;
+ case 3:
+ nesdev->mac_index = 3;
+ break;
+ case 0:
+ default:
+ nesdev->mac_index = 0;
+ }
+ } else {
+ nesdev->mac_index = PCI_FUNC(nesdev->pcidev->devfn) %
+ nesdev->nesadapter->port_count;
+ }
+
+ if ((limit_maxrdreqsz ||
+ ((nesdev->nesadapter->phy_type[0] == NES_PHY_TYPE_GLADIUS) &&
+ (hw_rev == NE020_REV1))) &&
+ (pcie_get_readrq(pcidev) > 256)) {
+ if (pcie_set_readrq(pcidev, 256))
+ printk(KERN_ERR PFX "Unable to set max read request"
+ " to 256 bytes\n");
+ else
+ nes_debug(NES_DBG_INIT, "Max read request size set"
+ " to 256 bytes\n");
+ }
+
+ tasklet_init(&nesdev->dpc_tasklet, nes_dpc, (unsigned long)nesdev);
+
+ /* bring up the Control QP */
+ if (nes_init_cqp(nesdev)) {
+ ret = -ENODEV;
+ goto bail6;
+ }
+
+ /* Arm the CCQ */
+ nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
+ PCI_FUNC(nesdev->pcidev->devfn));
+ nes_read32(nesdev->regs+NES_CQE_ALLOC);
+
+ /* Enable the interrupts */
+ nesdev->int_req = (0x101 << PCI_FUNC(nesdev->pcidev->devfn)) |
+ (1 << (PCI_FUNC(nesdev->pcidev->devfn)+16));
+ if (PCI_FUNC(nesdev->pcidev->devfn) < 4) {
+ nesdev->int_req |= (1 << (PCI_FUNC(nesdev->mac_index)+24));
+ }
+
+ /* TODO: This really should be the first driver to load, not function 0 */
+ if (PCI_FUNC(nesdev->pcidev->devfn) == 0) {
+ /* pick up PCI and critical errors if the first driver to load */
+ nesdev->intf_int_req = NES_INTF_INT_PCIERR | NES_INTF_INT_CRITERR;
+ nesdev->int_req |= NES_INT_INTF;
+ } else {
+ nesdev->intf_int_req = 0;
+ }
+ nesdev->intf_int_req |= (1 << (PCI_FUNC(nesdev->pcidev->devfn)+16));
+ nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS0, 0);
+ nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS1, 0);
+ nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS2, 0x00001265);
+ nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS4, 0x18021804);
+
+ nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS3, 0x17801790);
+
+ /* deal with both periodic and one_shot */
+ nesdev->timer_int_req = 0x101 << PCI_FUNC(nesdev->pcidev->devfn);
+ nesdev->nesadapter->timer_int_req |= nesdev->timer_int_req;
+ nes_debug(NES_DBG_INIT, "setting int_req for function %u, nesdev = 0x%04X, adapter = 0x%04X\n",
+ PCI_FUNC(nesdev->pcidev->devfn),
+ nesdev->timer_int_req, nesdev->nesadapter->timer_int_req);
+
+ nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req));
+
+ list_add_tail(&nesdev->list, &nes_dev_list);
+
+ /* Request an interrupt line for the driver */
+ ret = request_irq(pcidev->irq, nes_interrupt, IRQF_SHARED, DRV_NAME, nesdev);
+ if (ret) {
+ printk(KERN_ERR PFX "%s: requested IRQ %u is busy\n",
+ pci_name(pcidev), pcidev->irq);
+ goto bail65;
+ }
+
+ nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req);
+
+ if (nes_notifiers_registered == 0) {
+ register_inetaddr_notifier(&nes_inetaddr_notifier);
+ register_netevent_notifier(&nes_net_notifier);
+ }
+ nes_notifiers_registered++;
+
+ if (ibnl_add_client(RDMA_NL_NES, RDMA_NL_IWPM_NUM_OPS, nes_nl_cb_table))
+ printk(KERN_ERR PFX "%s[%u]: Failed to add netlink callback\n",
+ __func__, __LINE__);
+
+ ret = iwpm_init(RDMA_NL_NES);
+ if (ret) {
+ printk(KERN_ERR PFX "%s: port mapper initialization failed\n",
+ pci_name(pcidev));
+ goto bail7;
+ }
+
+ INIT_DELAYED_WORK(&nesdev->work, nes_recheck_link_status);
+
+ /* Initialize network devices */
+ netdev = nes_netdev_init(nesdev, mmio_regs);
+ if (netdev == NULL) {
+ ret = -ENOMEM;
+ goto bail7;
+ }
+
+ /* Register network device */
+ ret = register_netdev(netdev);
+ if (ret) {
+ printk(KERN_ERR PFX "Unable to register netdev, ret = %d\n", ret);
+ nes_netdev_destroy(netdev);
+ goto bail7;
+ }
+
+ nes_print_macaddr(netdev);
+
+ nesdev->netdev_count++;
+ nesdev->nesadapter->netdev_count++;
+
+ printk(KERN_INFO PFX "%s: NetEffect RNIC driver successfully loaded.\n",
+ pci_name(pcidev));
+ return 0;
+
+ bail7:
+ printk(KERN_ERR PFX "bail7\n");
+ while (nesdev->netdev_count > 0) {
+ nesdev->netdev_count--;
+ nesdev->nesadapter->netdev_count--;
+
+ unregister_netdev(nesdev->netdev[nesdev->netdev_count]);
+ nes_netdev_destroy(nesdev->netdev[nesdev->netdev_count]);
+ }
+
+ nes_debug(NES_DBG_INIT, "netdev_count=%d, nesadapter->netdev_count=%d\n",
+ nesdev->netdev_count, nesdev->nesadapter->netdev_count);
+ ibnl_remove_client(RDMA_NL_NES);
+
+ nes_notifiers_registered--;
+ if (nes_notifiers_registered == 0) {
+ unregister_netevent_notifier(&nes_net_notifier);
+ unregister_inetaddr_notifier(&nes_inetaddr_notifier);
+ }
+
+ list_del(&nesdev->list);
+ nes_destroy_cqp(nesdev);
+
+ bail65:
+ printk(KERN_ERR PFX "bail65\n");
+ free_irq(pcidev->irq, nesdev);
+ if (nesdev->msi_enabled) {
+ pci_disable_msi(pcidev);
+ }
+ bail6:
+ printk(KERN_ERR PFX "bail6\n");
+ tasklet_kill(&nesdev->dpc_tasklet);
+ /* Deallocate the Adapter Structure */
+ nes_destroy_adapter(nesdev->nesadapter);
+
+ bail5:
+ printk(KERN_ERR PFX "bail5\n");
+ iounmap(nesdev->regs);
+
+ bail3:
+ printk(KERN_ERR PFX "bail3\n");
+ kfree(nesdev);
+
+ bail2:
+ pci_release_regions(pcidev);
+
+ bail1:
+ pci_disable_device(pcidev);
+
+ bail0:
+ return ret;
+}
+
+
+/**
+ * nes_remove - unload from kernel
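+ *
+ * Tear down in roughly the reverse order of nes_probe(): unregister the
+ * netdev, netlink client, port mapper and notifiers, destroy the CQP and
+ * adapter, then release the IRQ and PCI resources.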
+ */
+static void nes_remove(struct pci_dev *pcidev)
+{
+ struct nes_device *nesdev = pci_get_drvdata(pcidev);
+ struct net_device *netdev;
+ int netdev_index = 0;
+ unsigned long flags;
+
+ if (nesdev->netdev_count) {
+ netdev = nesdev->netdev[netdev_index];
+ if (netdev) {
+ netif_stop_queue(netdev);
+ unregister_netdev(netdev);
+ nes_netdev_destroy(netdev);
+
+ nesdev->netdev[netdev_index] = NULL;
+ nesdev->netdev_count--;
+ nesdev->nesadapter->netdev_count--;
+ }
+ }
+ ibnl_remove_client(RDMA_NL_NES);
+ iwpm_exit(RDMA_NL_NES);
+
+ nes_notifiers_registered--;
+ if (nes_notifiers_registered == 0) {
+ unregister_netevent_notifier(&nes_net_notifier);
+ unregister_inetaddr_notifier(&nes_inetaddr_notifier);
+ }
+
+ list_del(&nesdev->list);
+ nes_destroy_cqp(nesdev);
+
+ free_irq(pcidev->irq, nesdev);
+ tasklet_kill(&nesdev->dpc_tasklet);
+
+ spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags);
+ if (nesdev->link_recheck) {
+ spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags);
+ cancel_delayed_work_sync(&nesdev->work);
+ } else {
+ spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags);
+ }
+
+ /* Deallocate the Adapter Structure */
+ nes_destroy_adapter(nesdev->nesadapter);
+
+ if (nesdev->msi_enabled) {
+ pci_disable_msi(pcidev);
+ }
+
+ iounmap(nesdev->regs);
+ kfree(nesdev);
+
+ /* nes_debug(NES_DBG_SHUTDOWN, "calling pci_release_regions.\n"); */
+ pci_release_regions(pcidev);
+ pci_disable_device(pcidev);
+ pci_set_drvdata(pcidev, NULL);
+}
+
+
+static struct pci_driver nes_pci_driver = {
+ .name = DRV_NAME,
+ .id_table = nes_pci_table,
+ .probe = nes_probe,
+ .remove = nes_remove,
+};
+
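+/*
+ * Driver-level sysfs attributes (created in nes_create_driver_sysfs()):
+ * "adapter" selects which device in nes_dev_list the other attributes act
+ * on, while the eeprom/flash/nonidx/idx command and data pairs provide raw
+ * register access to that device for diagnostics.
+ */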
+static ssize_t nes_show_adapter(struct device_driver *ddp, char *buf)
+{
+ unsigned int devfn = 0xffffffff;
+ unsigned char bus_number = 0xff;
+ unsigned int i = 0;
+ struct nes_device *nesdev;
+
+ list_for_each_entry(nesdev, &nes_dev_list, list) {
+ if (i == ee_flsh_adapter) {
+ devfn = nesdev->pcidev->devfn;
+ bus_number = nesdev->pcidev->bus->number;
+ break;
+ }
+ i++;
+ }
+
+ return snprintf(buf, PAGE_SIZE, "%x:%x\n", bus_number, devfn);
+}
+
+static ssize_t nes_store_adapter(struct device_driver *ddp,
+ const char *buf, size_t count)
+{
+ char *p = (char *)buf;
+
+ ee_flsh_adapter = simple_strtoul(p, &p, 10);
+ return strnlen(buf, count);
+}
+
+static ssize_t nes_show_ee_cmd(struct device_driver *ddp, char *buf)
+{
+ u32 eeprom_cmd = 0xdead;
+ u32 i = 0;
+ struct nes_device *nesdev;
+
+ list_for_each_entry(nesdev, &nes_dev_list, list) {
+ if (i == ee_flsh_adapter) {
+ eeprom_cmd = nes_read32(nesdev->regs + NES_EEPROM_COMMAND);
+ break;
+ }
+ i++;
+ }
+ return snprintf(buf, PAGE_SIZE, "0x%x\n", eeprom_cmd);
+}
+
+static ssize_t nes_store_ee_cmd(struct device_driver *ddp,
+ const char *buf, size_t count)
+{
+ char *p = (char *)buf;
+ u32 val;
+ u32 i = 0;
+ struct nes_device *nesdev;
+
+ if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
+ val = simple_strtoul(p, &p, 16);
+ list_for_each_entry(nesdev, &nes_dev_list, list) {
+ if (i == ee_flsh_adapter) {
+ nes_write32(nesdev->regs + NES_EEPROM_COMMAND, val);
+ break;
+ }
+ i++;
+ }
+ }
+ return strnlen(buf, count);
+}
+
+static ssize_t nes_show_ee_data(struct device_driver *ddp, char *buf)
+{
+ u32 eeprom_data = 0xdead;
+ u32 i = 0;
+ struct nes_device *nesdev;
+
+ list_for_each_entry(nesdev, &nes_dev_list, list) {
+ if (i == ee_flsh_adapter) {
+ eeprom_data = nes_read32(nesdev->regs + NES_EEPROM_DATA);
+ break;
+ }
+ i++;
+ }
+
+ return snprintf(buf, PAGE_SIZE, "0x%x\n", eeprom_data);
+}
+
+static ssize_t nes_store_ee_data(struct device_driver *ddp,
+ const char *buf, size_t count)
+{
+ char *p = (char *)buf;
+ u32 val;
+ u32 i = 0;
+ struct nes_device *nesdev;
+
+ if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
+ val = simple_strtoul(p, &p, 16);
+ list_for_each_entry(nesdev, &nes_dev_list, list) {
+ if (i == ee_flsh_adapter) {
+ nes_write32(nesdev->regs + NES_EEPROM_DATA, val);
+ break;
+ }
+ i++;
+ }
+ }
+ return strnlen(buf, count);
+}
+
+static ssize_t nes_show_flash_cmd(struct device_driver *ddp, char *buf)
+{
+ u32 flash_cmd = 0xdead;
+ u32 i = 0;
+ struct nes_device *nesdev;
+
+ list_for_each_entry(nesdev, &nes_dev_list, list) {
+ if (i == ee_flsh_adapter) {
+ flash_cmd = nes_read32(nesdev->regs + NES_FLASH_COMMAND);
+ break;
+ }
+ i++;
+ }
+
+ return snprintf(buf, PAGE_SIZE, "0x%x\n", flash_cmd);
+}
+
+static ssize_t nes_store_flash_cmd(struct device_driver *ddp,
+ const char *buf, size_t count)
+{
+ char *p = (char *)buf;
+ u32 val;
+ u32 i = 0;
+ struct nes_device *nesdev;
+
+ if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
+ val = simple_strtoul(p, &p, 16);
+ list_for_each_entry(nesdev, &nes_dev_list, list) {
+ if (i == ee_flsh_adapter) {
+ nes_write32(nesdev->regs + NES_FLASH_COMMAND, val);
+ break;
+ }
+ i++;
+ }
+ }
+ return strnlen(buf, count);
+}
+
+static ssize_t nes_show_flash_data(struct device_driver *ddp, char *buf)
+{
+ u32 flash_data = 0xdead;
+ u32 i = 0;
+ struct nes_device *nesdev;
+
+ list_for_each_entry(nesdev, &nes_dev_list, list) {
+ if (i == ee_flsh_adapter) {
+ flash_data = nes_read32(nesdev->regs + NES_FLASH_DATA);
+ break;
+ }
+ i++;
+ }
+
+ return snprintf(buf, PAGE_SIZE, "0x%x\n", flash_data);
+}
+
+static ssize_t nes_store_flash_data(struct device_driver *ddp,
+ const char *buf, size_t count)
+{
+ char *p = (char *)buf;
+ u32 val;
+ u32 i = 0;
+ struct nes_device *nesdev;
+
+ if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
+ val = simple_strtoul(p, &p, 16);
+ list_for_each_entry(nesdev, &nes_dev_list, list) {
+ if (i == ee_flsh_adapter) {
+ nes_write32(nesdev->regs + NES_FLASH_DATA, val);
+ break;
+ }
+ i++;
+ }
+ }
+ return strnlen(buf, count);
+}
+
+static ssize_t nes_show_nonidx_addr(struct device_driver *ddp, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "0x%x\n", sysfs_nonidx_addr);
+}
+
+static ssize_t nes_store_nonidx_addr(struct device_driver *ddp,
+ const char *buf, size_t count)
+{
+ char *p = (char *)buf;
+
+ if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X')
+ sysfs_nonidx_addr = simple_strtoul(p, &p, 16);
+
+ return strnlen(buf, count);
+}
+
+static ssize_t nes_show_nonidx_data(struct device_driver *ddp, char *buf)
+{
+ u32 nonidx_data = 0xdead;
+ u32 i = 0;
+ struct nes_device *nesdev;
+
+ list_for_each_entry(nesdev, &nes_dev_list, list) {
+ if (i == ee_flsh_adapter) {
+ nonidx_data = nes_read32(nesdev->regs + sysfs_nonidx_addr);
+ break;
+ }
+ i++;
+ }
+
+ return snprintf(buf, PAGE_SIZE, "0x%x\n", nonidx_data);
+}
+
+static ssize_t nes_store_nonidx_data(struct device_driver *ddp,
+ const char *buf, size_t count)
+{
+ char *p = (char *)buf;
+ u32 val;
+ u32 i = 0;
+ struct nes_device *nesdev;
+
+ if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
+ val = simple_strtoul(p, &p, 16);
+ list_for_each_entry(nesdev, &nes_dev_list, list) {
+ if (i == ee_flsh_adapter) {
+ nes_write32(nesdev->regs + sysfs_nonidx_addr, val);
+ break;
+ }
+ i++;
+ }
+ }
+ return strnlen(buf, count);
+}
+
+static ssize_t nes_show_idx_addr(struct device_driver *ddp, char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "0x%x\n", sysfs_idx_addr);
+}
+
+static ssize_t nes_store_idx_addr(struct device_driver *ddp,
+ const char *buf, size_t count)
+{
+ char *p = (char *)buf;
+
+ if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X')
+ sysfs_idx_addr = simple_strtoul(p, &p, 16);
+
+ return strnlen(buf, count);
+}
+
+static ssize_t nes_show_idx_data(struct device_driver *ddp, char *buf)
+{
+ u32 idx_data = 0xdead;
+ u32 i = 0;
+ struct nes_device *nesdev;
+
+ list_for_each_entry(nesdev, &nes_dev_list, list) {
+ if (i == ee_flsh_adapter) {
+ idx_data = nes_read_indexed(nesdev, sysfs_idx_addr);
+ break;
+ }
+ i++;
+ }
+
+ return snprintf(buf, PAGE_SIZE, "0x%x\n", idx_data);
+}
+
+static ssize_t nes_store_idx_data(struct device_driver *ddp,
+ const char *buf, size_t count)
+{
+ char *p = (char *)buf;
+ u32 val;
+ u32 i = 0;
+ struct nes_device *nesdev;
+
+ if (p[1] == 'x' || p[1] == 'X' || p[0] == 'x' || p[0] == 'X') {
+ val = simple_strtoul(p, &p, 16);
+ list_for_each_entry(nesdev, &nes_dev_list, list) {
+ if (i == ee_flsh_adapter) {
+ nes_write_indexed(nesdev, sysfs_idx_addr, val);
+ break;
+ }
+ i++;
+ }
+ }
+ return strnlen(buf, count);
+}
+
+
+/**
+ * nes_show_wqm_quanta
+ */
+static ssize_t nes_show_wqm_quanta(struct device_driver *ddp, char *buf)
+{
+ u32 wqm_quanta_value = 0xdead;
+ u32 i = 0;
+ struct nes_device *nesdev;
+
+ list_for_each_entry(nesdev, &nes_dev_list, list) {
+ if (i == ee_flsh_adapter) {
+ wqm_quanta_value = nesdev->nesadapter->wqm_quanta;
+ break;
+ }
+ i++;
+ }
+
+ return snprintf(buf, PAGE_SIZE, "0x%X\n", wqm_quanta_value);
+}
+
+
+/**
+ * nes_store_wqm_quanta
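+ *
+ * Update the cached wqm_quanta for the selected adapter and write it to
+ * NES_IDX_WQM_CONFIG1, preserving the low bit of the previous value.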
+ */
+static ssize_t nes_store_wqm_quanta(struct device_driver *ddp,
+ const char *buf, size_t count)
+{
+ unsigned long wqm_quanta_value;
+ u32 wqm_config1;
+ u32 i = 0;
+ struct nes_device *nesdev;
+
+ if (kstrtoul(buf, 0, &wqm_quanta_value) < 0)
+ return -EINVAL;
+
+ list_for_each_entry(nesdev, &nes_dev_list, list) {
+ if (i == ee_flsh_adapter) {
+ nesdev->nesadapter->wqm_quanta = wqm_quanta_value;
+ wqm_config1 = nes_read_indexed(nesdev,
+ NES_IDX_WQM_CONFIG1);
+ nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG1,
+ ((wqm_quanta_value << 1) |
+ (wqm_config1 & 0x00000001)));
+ break;
+ }
+ i++;
+ }
+ return strnlen(buf, count);
+}
+
+static DRIVER_ATTR(adapter, S_IRUSR | S_IWUSR,
+ nes_show_adapter, nes_store_adapter);
+static DRIVER_ATTR(eeprom_cmd, S_IRUSR | S_IWUSR,
+ nes_show_ee_cmd, nes_store_ee_cmd);
+static DRIVER_ATTR(eeprom_data, S_IRUSR | S_IWUSR,
+ nes_show_ee_data, nes_store_ee_data);
+static DRIVER_ATTR(flash_cmd, S_IRUSR | S_IWUSR,
+ nes_show_flash_cmd, nes_store_flash_cmd);
+static DRIVER_ATTR(flash_data, S_IRUSR | S_IWUSR,
+ nes_show_flash_data, nes_store_flash_data);
+static DRIVER_ATTR(nonidx_addr, S_IRUSR | S_IWUSR,
+ nes_show_nonidx_addr, nes_store_nonidx_addr);
+static DRIVER_ATTR(nonidx_data, S_IRUSR | S_IWUSR,
+ nes_show_nonidx_data, nes_store_nonidx_data);
+static DRIVER_ATTR(idx_addr, S_IRUSR | S_IWUSR,
+ nes_show_idx_addr, nes_store_idx_addr);
+static DRIVER_ATTR(idx_data, S_IRUSR | S_IWUSR,
+ nes_show_idx_data, nes_store_idx_data);
+static DRIVER_ATTR(wqm_quanta, S_IRUSR | S_IWUSR,
+ nes_show_wqm_quanta, nes_store_wqm_quanta);
+
+static int nes_create_driver_sysfs(struct pci_driver *drv)
+{
+ int error;
+ error = driver_create_file(&drv->driver, &driver_attr_adapter);
+ error |= driver_create_file(&drv->driver, &driver_attr_eeprom_cmd);
+ error |= driver_create_file(&drv->driver, &driver_attr_eeprom_data);
+ error |= driver_create_file(&drv->driver, &driver_attr_flash_cmd);
+ error |= driver_create_file(&drv->driver, &driver_attr_flash_data);
+ error |= driver_create_file(&drv->driver, &driver_attr_nonidx_addr);
+ error |= driver_create_file(&drv->driver, &driver_attr_nonidx_data);
+ error |= driver_create_file(&drv->driver, &driver_attr_idx_addr);
+ error |= driver_create_file(&drv->driver, &driver_attr_idx_data);
+ error |= driver_create_file(&drv->driver, &driver_attr_wqm_quanta);
+ return error;
+}
+
+static void nes_remove_driver_sysfs(struct pci_driver *drv)
+{
+ driver_remove_file(&drv->driver, &driver_attr_adapter);
+ driver_remove_file(&drv->driver, &driver_attr_eeprom_cmd);
+ driver_remove_file(&drv->driver, &driver_attr_eeprom_data);
+ driver_remove_file(&drv->driver, &driver_attr_flash_cmd);
+ driver_remove_file(&drv->driver, &driver_attr_flash_data);
+ driver_remove_file(&drv->driver, &driver_attr_nonidx_addr);
+ driver_remove_file(&drv->driver, &driver_attr_nonidx_data);
+ driver_remove_file(&drv->driver, &driver_attr_idx_addr);
+ driver_remove_file(&drv->driver, &driver_attr_idx_data);
+ driver_remove_file(&drv->driver, &driver_attr_wqm_quanta);
+}
+
+/**
+ * nes_init_module - module initialization entry point
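+ *
+ * Start the iWARP CM core before registering the PCI driver; failure to
+ * create the driver sysfs attributes is reported but not treated as fatal.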
+ */
+static int __init nes_init_module(void)
+{
+ int retval;
+ int retval1;
+
+ retval = nes_cm_start();
+ if (retval) {
+ printk(KERN_ERR PFX "Unable to start NetEffect iWARP CM.\n");
+ return retval;
+ }
+ retval = pci_register_driver(&nes_pci_driver);
+ if (retval >= 0) {
+ retval1 = nes_create_driver_sysfs(&nes_pci_driver);
+ if (retval1 < 0)
+ printk(KERN_ERR PFX "Unable to create NetEffect sys files.\n");
+ }
+ return retval;
+}
+
+
+/**
+ * nes_exit_module - module unload entry point
+ */
+static void __exit nes_exit_module(void)
+{
+ nes_cm_stop();
+ nes_remove_driver_sysfs(&nes_pci_driver);
+
+ pci_unregister_driver(&nes_pci_driver);
+}
+
+
+module_init(nes_init_module);
+module_exit(nes_exit_module);
diff --git a/drivers/infiniband/hw/nes/nes.h b/drivers/infiniband/hw/nes/nes.h
new file mode 100644
index 000000000..bd9d132f1
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes.h
@@ -0,0 +1,582 @@
+/*
+ * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __NES_H
+#define __NES_H
+
+#include <linux/netdevice.h>
+#include <linux/inetdevice.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/workqueue.h>
+#include <linux/slab.h>
+#include <asm/io.h>
+#include <linux/crc32c.h>
+
+#include <rdma/ib_smi.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_pack.h>
+#include <rdma/rdma_cm.h>
+#include <rdma/iw_cm.h>
+#include <rdma/rdma_netlink.h>
+#include <rdma/iw_portmap.h>
+
+#define NES_SEND_FIRST_WRITE
+
+#define QUEUE_DISCONNECTS
+
+#define DRV_NAME "iw_nes"
+#define DRV_VERSION "1.5.0.1"
+#define PFX DRV_NAME ": "
+
+/*
+ * NetEffect PCI vendor id and NE010 PCI device id.
+ */
+#ifndef PCI_VENDOR_ID_NETEFFECT /* not in pci.ids yet */
+#define PCI_VENDOR_ID_NETEFFECT 0x1678
+#define PCI_DEVICE_ID_NETEFFECT_NE020 0x0100
+#define PCI_DEVICE_ID_NETEFFECT_NE020_KR 0x0110
+#endif
+
+#define NE020_REV 4
+#define NE020_REV1 5
+
+#define BAR_0 0
+#define BAR_1 2
+
+#define RX_BUF_SIZE (1536 + 8)
+#define NES_REG0_SIZE (4 * 1024)
+#define NES_TX_TIMEOUT (6*HZ)
+#define NES_FIRST_QPN 64
+#define NES_SW_CONTEXT_ALIGN 1024
+
+#define NES_NIC_MAX_NICS 16
+#define NES_MAX_ARP_TABLE_SIZE 4096
+
+#define NES_NIC_CEQ_SIZE 8
+/* NICs will be on a separate CQ */
+#define NES_CCEQ_SIZE ((nesadapter->max_cq / nesadapter->port_count) - 32)
+
+#define NES_MAX_PORT_COUNT 4
+
+#define MAX_DPC_ITERATIONS 128
+
+#define NES_DRV_OPT_ENABLE_MPA_VER_0 0x00000001
+#define NES_DRV_OPT_DISABLE_MPA_CRC 0x00000002
+#define NES_DRV_OPT_DISABLE_FIRST_WRITE 0x00000004
+#define NES_DRV_OPT_DISABLE_INTF 0x00000008
+#define NES_DRV_OPT_ENABLE_MSI 0x00000010
+#define NES_DRV_OPT_DUAL_LOGICAL_PORT 0x00000020
+#define NES_DRV_OPT_SUPRESS_OPTION_BC 0x00000040
+#define NES_DRV_OPT_NO_INLINE_DATA 0x00000080
+#define NES_DRV_OPT_DISABLE_INT_MOD 0x00000100
+#define NES_DRV_OPT_DISABLE_VIRT_WQ 0x00000200
+#define NES_DRV_OPT_ENABLE_PAU 0x00000400
+
+#define NES_AEQ_EVENT_TIMEOUT 2500
+#define NES_DISCONNECT_EVENT_TIMEOUT 2000
+
+/* debug levels */
+/* must match userspace */
+#define NES_DBG_HW 0x00000001
+#define NES_DBG_INIT 0x00000002
+#define NES_DBG_ISR 0x00000004
+#define NES_DBG_PHY 0x00000008
+#define NES_DBG_NETDEV 0x00000010
+#define NES_DBG_CM 0x00000020
+#define NES_DBG_CM1 0x00000040
+#define NES_DBG_NIC_RX 0x00000080
+#define NES_DBG_NIC_TX 0x00000100
+#define NES_DBG_CQP 0x00000200
+#define NES_DBG_MMAP 0x00000400
+#define NES_DBG_MR 0x00000800
+#define NES_DBG_PD 0x00001000
+#define NES_DBG_CQ 0x00002000
+#define NES_DBG_QP 0x00004000
+#define NES_DBG_MOD_QP 0x00008000
+#define NES_DBG_AEQ 0x00010000
+#define NES_DBG_IW_RX 0x00020000
+#define NES_DBG_IW_TX 0x00040000
+#define NES_DBG_SHUTDOWN 0x00080000
+#define NES_DBG_PAU 0x00100000
+#define NES_DBG_NLMSG 0x00200000
+#define NES_DBG_RSVD1 0x10000000
+#define NES_DBG_RSVD2 0x20000000
+#define NES_DBG_RSVD3 0x40000000
+#define NES_DBG_RSVD4 0x80000000
+#define NES_DBG_ALL 0xffffffff
+
+#ifdef CONFIG_INFINIBAND_NES_DEBUG
+#define nes_debug(level, fmt, args...) \
+do { \
+ if (level & nes_debug_level) \
+ printk(KERN_ERR PFX "%s[%u]: " fmt, __func__, __LINE__, ##args); \
+} while (0)
+
+#define assert(expr) \
+do { \
+ if (!(expr)) { \
+ printk(KERN_ERR PFX "Assertion failed! %s, %s, %s, line %d\n", \
+ #expr, __FILE__, __func__, __LINE__); \
+ } \
+} while (0)
+
+#define NES_EVENT_TIMEOUT 1200000
+#else
+#define nes_debug(level, fmt, args...)
+#define assert(expr) do {} while (0)
+
+#define NES_EVENT_TIMEOUT 100000
+#endif
+
+#include "nes_hw.h"
+#include "nes_verbs.h"
+#include "nes_context.h"
+#include "nes_user.h"
+#include "nes_cm.h"
+#include "nes_mgt.h"
+
+extern int max_mtu;
+#define max_frame_len (max_mtu+ETH_HLEN)
+extern int interrupt_mod_interval;
+extern int nes_if_count;
+extern int mpa_version;
+extern int disable_mpa_crc;
+extern unsigned int nes_drv_opt;
+extern unsigned int nes_debug_level;
+extern unsigned int wqm_quanta;
+extern struct list_head nes_adapter_list;
+
+extern atomic_t cm_connects;
+extern atomic_t cm_accepts;
+extern atomic_t cm_disconnects;
+extern atomic_t cm_closes;
+extern atomic_t cm_connecteds;
+extern atomic_t cm_connect_reqs;
+extern atomic_t cm_rejects;
+extern atomic_t mod_qp_timouts;
+extern atomic_t qps_created;
+extern atomic_t qps_destroyed;
+extern atomic_t sw_qps_destroyed;
+extern u32 mh_detected;
+extern u32 mh_pauses_sent;
+extern u32 cm_packets_sent;
+extern u32 cm_packets_bounced;
+extern u32 cm_packets_created;
+extern u32 cm_packets_received;
+extern u32 cm_packets_dropped;
+extern u32 cm_packets_retrans;
+extern atomic_t cm_listens_created;
+extern atomic_t cm_listens_destroyed;
+extern u32 cm_backlog_drops;
+extern atomic_t cm_loopbacks;
+extern atomic_t cm_nodes_created;
+extern atomic_t cm_nodes_destroyed;
+extern atomic_t cm_accel_dropped_pkts;
+extern atomic_t cm_resets_recvd;
+extern atomic_t pau_qps_created;
+extern atomic_t pau_qps_destroyed;
+
+extern u32 int_mod_timer_init;
+extern u32 int_mod_cq_depth_256;
+extern u32 int_mod_cq_depth_128;
+extern u32 int_mod_cq_depth_32;
+extern u32 int_mod_cq_depth_24;
+extern u32 int_mod_cq_depth_16;
+extern u32 int_mod_cq_depth_4;
+extern u32 int_mod_cq_depth_1;
+
+struct nes_device {
+ struct nes_adapter *nesadapter;
+ void __iomem *regs;
+ void __iomem *index_reg;
+ struct pci_dev *pcidev;
+ struct net_device *netdev[NES_NIC_MAX_NICS];
+ u64 link_status_interrupts;
+ struct tasklet_struct dpc_tasklet;
+ spinlock_t indexed_regs_lock;
+ unsigned long csr_start;
+ unsigned long doorbell_region;
+ unsigned long doorbell_start;
+ unsigned long mac_tx_errors;
+ unsigned long mac_pause_frames_sent;
+ unsigned long mac_pause_frames_received;
+ unsigned long mac_rx_errors;
+ unsigned long mac_rx_crc_errors;
+ unsigned long mac_rx_symbol_err_frames;
+ unsigned long mac_rx_jabber_frames;
+ unsigned long mac_rx_oversized_frames;
+ unsigned long mac_rx_short_frames;
+ unsigned long port_rx_discards;
+ unsigned long port_tx_discards;
+ unsigned int mac_index;
+ unsigned int nes_stack_start;
+
+ /* Control Structures */
+ void *cqp_vbase;
+ dma_addr_t cqp_pbase;
+ u32 cqp_mem_size;
+ u8 ceq_index;
+ u8 nic_ceq_index;
+ struct nes_hw_cqp cqp;
+ struct nes_hw_cq ccq;
+ struct list_head cqp_avail_reqs;
+ struct list_head cqp_pending_reqs;
+ struct nes_cqp_request *nes_cqp_requests;
+
+ u32 int_req;
+ u32 int_stat;
+ u32 timer_int_req;
+ u32 timer_only_int_count;
+ u32 intf_int_req;
+ u32 last_mac_tx_pauses;
+ u32 last_used_chunks_tx;
+ struct list_head list;
+
+ u16 base_doorbell_index;
+ u16 currcq_count;
+ u16 deepcq_count;
+ u8 iw_status;
+ u8 msi_enabled;
+ u8 netdev_count;
+ u8 napi_isr_ran;
+ u8 disable_rx_flow_control;
+ u8 disable_tx_flow_control;
+
+ struct delayed_work work;
+ u8 link_recheck;
+};
+
+/* Receive skb private area - must fit in skb->cb area */
+struct nes_rskb_cb {
+ u64 busaddr;
+ u32 maplen;
+ u32 seqnum;
+ u8 *data_start;
+ struct nes_qp *nesqp;
+};
+
+static inline __le32 get_crc_value(struct nes_v4_quad *nes_quad)
+{
+ u32 crc_value;
+ crc_value = crc32c(~0, (void *)nes_quad, sizeof (struct nes_v4_quad));
+
+ /*
+ * With commit ef19454b ("[LIB] crc32c: Keep intermediate crc
+ * state in cpu order"), behavior of crc32c changes on
+ * big-endian platforms. Our algorithm expects the previous
+ * behavior; otherwise we have RDMA connection establishment
+ * issue on big-endian.
+ */
+ return cpu_to_le32(crc_value);
+}
+
+static inline void
+set_wqe_64bit_value(__le32 *wqe_words, u32 index, u64 value)
+{
+ wqe_words[index] = cpu_to_le32((u32) value);
+ wqe_words[index + 1] = cpu_to_le32(upper_32_bits(value));
+}
+
+static inline void
+set_wqe_32bit_value(__le32 *wqe_words, u32 index, u32 value)
+{
+ wqe_words[index] = cpu_to_le32(value);
+}
+
+static inline void
+nes_fill_init_cqp_wqe(struct nes_hw_cqp_wqe *cqp_wqe, struct nes_device *nesdev)
+{
+ cqp_wqe->wqe_words[NES_CQP_WQE_COMP_CTX_LOW_IDX] = 0;
+ cqp_wqe->wqe_words[NES_CQP_WQE_COMP_CTX_HIGH_IDX] = 0;
+ cqp_wqe->wqe_words[NES_CQP_WQE_COMP_SCRATCH_LOW_IDX] = 0;
+ cqp_wqe->wqe_words[NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX] = 0;
+ cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX] = 0;
+ cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PBL_LEN_IDX] = 0;
+ cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_LOW_IDX] = 0;
+ cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PA_LOW_IDX] = 0;
+ cqp_wqe->wqe_words[NES_CQP_STAG_WQE_PA_HIGH_IDX] = 0;
+}
+
+static inline void
+nes_fill_init_qp_wqe(struct nes_hw_qp_wqe *wqe, struct nes_qp *nesqp, u32 head)
+{
+ u32 value;
+ value = ((u32)((unsigned long) nesqp)) | head;
+ set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_HIGH_IDX,
+ (u32)(upper_32_bits((unsigned long)(nesqp))));
+ set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, value);
+}
+
+/* Read from memory-mapped device */
+static inline u32 nes_read_indexed(struct nes_device *nesdev, u32 reg_index)
+{
+ unsigned long flags;
+ void __iomem *addr = nesdev->index_reg;
+ u32 value;
+
+ spin_lock_irqsave(&nesdev->indexed_regs_lock, flags);
+
+ writel(reg_index, addr);
+ value = readl((void __iomem *)addr + 4);
+
+ spin_unlock_irqrestore(&nesdev->indexed_regs_lock, flags);
+ return value;
+}
+
+static inline u32 nes_read32(const void __iomem *addr)
+{
+ return readl(addr);
+}
+
+static inline u16 nes_read16(const void __iomem *addr)
+{
+ return readw(addr);
+}
+
+static inline u8 nes_read8(const void __iomem *addr)
+{
+ return readb(addr);
+}
+
+/* Write to memory-mapped device */
+static inline void nes_write_indexed(struct nes_device *nesdev, u32 reg_index, u32 val)
+{
+ unsigned long flags;
+ void __iomem *addr = nesdev->index_reg;
+
+ spin_lock_irqsave(&nesdev->indexed_regs_lock, flags);
+
+ writel(reg_index, addr);
+ writel(val, (void __iomem *)addr + 4);
+
+ spin_unlock_irqrestore(&nesdev->indexed_regs_lock, flags);
+}
+
+static inline void nes_write32(void __iomem *addr, u32 val)
+{
+ writel(val, addr);
+}
+
+static inline void nes_write16(void __iomem *addr, u16 val)
+{
+ writew(val, addr);
+}
+
+static inline void nes_write8(void __iomem *addr, u8 val)
+{
+ writeb(val, addr);
+}
+
+enum nes_resource {
+ NES_RESOURCE_MW = 1,
+ NES_RESOURCE_FAST_MR,
+ NES_RESOURCE_PHYS_MR,
+ NES_RESOURCE_USER_MR,
+ NES_RESOURCE_PD,
+ NES_RESOURCE_QP,
+ NES_RESOURCE_CQ,
+ NES_RESOURCE_ARP
+};
+
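+/*
+ * Allocate one slot from a resource bitmap under the adapter resource lock.
+ * The search starts at the *next round-robin hint, wraps to the start of
+ * the bitmap once, and fails with -EMFILE when no slot is free.
+ */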
+static inline int nes_alloc_resource(struct nes_adapter *nesadapter,
+ unsigned long *resource_array, u32 max_resources,
+ u32 *req_resource_num, u32 *next, enum nes_resource resource_type)
+{
+ unsigned long flags;
+ u32 resource_num;
+
+ spin_lock_irqsave(&nesadapter->resource_lock, flags);
+
+ resource_num = find_next_zero_bit(resource_array, max_resources, *next);
+ if (resource_num >= max_resources) {
+ resource_num = find_first_zero_bit(resource_array, max_resources);
+ if (resource_num >= max_resources) {
+ printk(KERN_ERR PFX "%s: No available resources [type=%u].\n", __func__, resource_type);
+ spin_unlock_irqrestore(&nesadapter->resource_lock, flags);
+ return -EMFILE;
+ }
+ }
+ set_bit(resource_num, resource_array);
+ *next = resource_num+1;
+ if (*next == max_resources) {
+ *next = 0;
+ }
+ spin_unlock_irqrestore(&nesadapter->resource_lock, flags);
+ *req_resource_num = resource_num;
+
+ return 0;
+}
+
+static inline int nes_is_resource_allocated(struct nes_adapter *nesadapter,
+ unsigned long *resource_array, u32 resource_num)
+{
+ unsigned long flags;
+ int bit_is_set;
+
+ spin_lock_irqsave(&nesadapter->resource_lock, flags);
+
+ bit_is_set = test_bit(resource_num, resource_array);
+ nes_debug(NES_DBG_HW, "resource_num %u is%s allocated.\n",
+ resource_num, (bit_is_set ? "": " not"));
+ spin_unlock_irqrestore(&nesadapter->resource_lock, flags);
+
+ return bit_is_set;
+}
+
+static inline void nes_free_resource(struct nes_adapter *nesadapter,
+ unsigned long *resource_array, u32 resource_num)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&nesadapter->resource_lock, flags);
+ clear_bit(resource_num, resource_array);
+ spin_unlock_irqrestore(&nesadapter->resource_lock, flags);
+}
+
+static inline struct nes_vnic *to_nesvnic(struct ib_device *ibdev)
+{
+ return container_of(ibdev, struct nes_ib_device, ibdev)->nesvnic;
+}
+
+static inline struct nes_pd *to_nespd(struct ib_pd *ibpd)
+{
+ return container_of(ibpd, struct nes_pd, ibpd);
+}
+
+static inline struct nes_ucontext *to_nesucontext(struct ib_ucontext *ibucontext)
+{
+ return container_of(ibucontext, struct nes_ucontext, ibucontext);
+}
+
+static inline struct nes_mr *to_nesmr(struct ib_mr *ibmr)
+{
+ return container_of(ibmr, struct nes_mr, ibmr);
+}
+
+static inline struct nes_mr *to_nesmr_from_ibfmr(struct ib_fmr *ibfmr)
+{
+ return container_of(ibfmr, struct nes_mr, ibfmr);
+}
+
+static inline struct nes_mr *to_nesmw(struct ib_mw *ibmw)
+{
+ return container_of(ibmw, struct nes_mr, ibmw);
+}
+
+static inline struct nes_fmr *to_nesfmr(struct nes_mr *nesmr)
+{
+ return container_of(nesmr, struct nes_fmr, nesmr);
+}
+
+static inline struct nes_cq *to_nescq(struct ib_cq *ibcq)
+{
+ return container_of(ibcq, struct nes_cq, ibcq);
+}
+
+static inline struct nes_qp *to_nesqp(struct ib_qp *ibqp)
+{
+ return container_of(ibqp, struct nes_qp, ibqp);
+}
+
+
+
+/* nes.c */
+void nes_add_ref(struct ib_qp *);
+void nes_rem_ref(struct ib_qp *);
+struct ib_qp *nes_get_qp(struct ib_device *, int);
+
+
+/* nes_hw.c */
+struct nes_adapter *nes_init_adapter(struct nes_device *, u8);
+void nes_nic_init_timer_defaults(struct nes_device *, u8);
+void nes_destroy_adapter(struct nes_adapter *);
+int nes_init_cqp(struct nes_device *);
+int nes_init_phy(struct nes_device *);
+int nes_init_nic_qp(struct nes_device *, struct net_device *);
+void nes_destroy_nic_qp(struct nes_vnic *);
+int nes_napi_isr(struct nes_device *);
+void nes_dpc(unsigned long);
+void nes_nic_ce_handler(struct nes_device *, struct nes_hw_nic_cq *);
+void nes_iwarp_ce_handler(struct nes_device *, struct nes_hw_cq *);
+int nes_destroy_cqp(struct nes_device *);
+int nes_nic_cm_xmit(struct sk_buff *, struct net_device *);
+void nes_recheck_link_status(struct work_struct *work);
+void nes_terminate_timeout(unsigned long context);
+
+/* nes_nic.c */
+struct net_device *nes_netdev_init(struct nes_device *, void __iomem *);
+void nes_netdev_destroy(struct net_device *);
+int nes_nic_cm_xmit(struct sk_buff *, struct net_device *);
+
+/* nes_cm.c */
+void *nes_cm_create(struct net_device *);
+int nes_cm_recv(struct sk_buff *, struct net_device *);
+void nes_update_arp(unsigned char *, u32, u32, u16, u16);
+void nes_manage_arp_cache(struct net_device *, unsigned char *, u32, u32);
+void nes_sock_release(struct nes_qp *, unsigned long *);
+void flush_wqes(struct nes_device *nesdev, struct nes_qp *, u32, u32);
+int nes_manage_apbvt(struct nes_vnic *, u32, u32, u32);
+int nes_cm_disconn(struct nes_qp *);
+void nes_cm_disconn_worker(void *);
+
+/* nes_verbs.c */
+int nes_hw_modify_qp(struct nes_device *, struct nes_qp *, u32, u32, u32);
+int nes_modify_qp(struct ib_qp *, struct ib_qp_attr *, int, struct ib_udata *);
+struct nes_ib_device *nes_init_ofa_device(struct net_device *);
+void nes_port_ibevent(struct nes_vnic *nesvnic);
+void nes_destroy_ofa_device(struct nes_ib_device *);
+int nes_register_ofa_device(struct nes_ib_device *);
+
+/* nes_util.c */
+int nes_read_eeprom_values(struct nes_device *, struct nes_adapter *);
+void nes_write_1G_phy_reg(struct nes_device *, u8, u8, u16);
+void nes_read_1G_phy_reg(struct nes_device *, u8, u8, u16 *);
+void nes_write_10G_phy_reg(struct nes_device *, u16, u8, u16, u16);
+void nes_read_10G_phy_reg(struct nes_device *, u8, u8, u16);
+struct nes_cqp_request *nes_get_cqp_request(struct nes_device *);
+void nes_free_cqp_request(struct nes_device *nesdev,
+ struct nes_cqp_request *cqp_request);
+void nes_put_cqp_request(struct nes_device *nesdev,
+ struct nes_cqp_request *cqp_request);
+void nes_post_cqp_request(struct nes_device *, struct nes_cqp_request *);
+int nes_arp_table(struct nes_device *, u32, u8 *, u32);
+void nes_mh_fix(unsigned long);
+void nes_clc(unsigned long);
+void nes_dump_mem(unsigned int, void *, int);
+u32 nes_crc32(u32, u32, u32, u32, u8 *, u32, u32, u32);
+
+#endif /* __NES_H */
diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c
new file mode 100644
index 000000000..72b43417c
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_cm.c
@@ -0,0 +1,4184 @@
+/*
+ * Copyright (c) 2006 - 2014 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+
+#define TCPOPT_TIMESTAMP 8
+
+#include <linux/atomic.h>
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/init.h>
+#include <linux/if_arp.h>
+#include <linux/if_vlan.h>
+#include <linux/notifier.h>
+#include <linux/net.h>
+#include <linux/types.h>
+#include <linux/timer.h>
+#include <linux/time.h>
+#include <linux/delay.h>
+#include <linux/etherdevice.h>
+#include <linux/netdevice.h>
+#include <linux/random.h>
+#include <linux/list.h>
+#include <linux/threads.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <net/arp.h>
+#include <net/neighbour.h>
+#include <net/route.h>
+#include <net/ip_fib.h>
+#include <net/tcp.h>
+#include <linux/fcntl.h>
+
+#include "nes.h"
+
+u32 cm_packets_sent;
+u32 cm_packets_bounced;
+u32 cm_packets_dropped;
+u32 cm_packets_retrans;
+u32 cm_packets_created;
+u32 cm_packets_received;
+atomic_t cm_listens_created;
+atomic_t cm_listens_destroyed;
+u32 cm_backlog_drops;
+atomic_t cm_loopbacks;
+atomic_t cm_nodes_created;
+atomic_t cm_nodes_destroyed;
+atomic_t cm_accel_dropped_pkts;
+atomic_t cm_resets_recvd;
+
+static inline int mini_cm_accelerated(struct nes_cm_core *, struct nes_cm_node *);
+static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *, struct nes_vnic *, struct nes_cm_info *);
+static int mini_cm_del_listen(struct nes_cm_core *, struct nes_cm_listener *);
+static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *, struct nes_vnic *, u16, void *, struct nes_cm_info *);
+static int mini_cm_close(struct nes_cm_core *, struct nes_cm_node *);
+static int mini_cm_accept(struct nes_cm_core *, struct nes_cm_node *);
+static int mini_cm_reject(struct nes_cm_core *, struct nes_cm_node *);
+static int mini_cm_recv_pkt(struct nes_cm_core *, struct nes_vnic *, struct sk_buff *);
+static int mini_cm_dealloc_core(struct nes_cm_core *);
+static int mini_cm_get(struct nes_cm_core *);
+static int mini_cm_set(struct nes_cm_core *, u32, u32);
+
+static void form_cm_frame(struct sk_buff *, struct nes_cm_node *, void *, u32, void *, u32, u8);
+static int add_ref_cm_node(struct nes_cm_node *);
+static int rem_ref_cm_node(struct nes_cm_core *, struct nes_cm_node *);
+
+static int nes_cm_disconn_true(struct nes_qp *);
+static int nes_cm_post_event(struct nes_cm_event *event);
+static int nes_disconnect(struct nes_qp *nesqp, int abrupt);
+static void nes_disconnect_worker(struct work_struct *work);
+
+static int send_mpa_request(struct nes_cm_node *, struct sk_buff *);
+static int send_mpa_reject(struct nes_cm_node *);
+static int send_syn(struct nes_cm_node *, u32, struct sk_buff *);
+static int send_reset(struct nes_cm_node *, struct sk_buff *);
+static int send_ack(struct nes_cm_node *cm_node, struct sk_buff *skb);
+static int send_fin(struct nes_cm_node *cm_node, struct sk_buff *skb);
+static void process_packet(struct nes_cm_node *, struct sk_buff *, struct nes_cm_core *);
+
+static void active_open_err(struct nes_cm_node *, struct sk_buff *, int);
+static void passive_open_err(struct nes_cm_node *, struct sk_buff *, int);
+static void cleanup_retrans_entry(struct nes_cm_node *);
+static void handle_rcv_mpa(struct nes_cm_node *, struct sk_buff *);
+static void free_retrans_entry(struct nes_cm_node *cm_node);
+static int handle_tcp_options(struct nes_cm_node *cm_node, struct tcphdr *tcph, struct sk_buff *skb, int optionsize, int passive);
+
+/* CM event handler functions */
+static void cm_event_connected(struct nes_cm_event *);
+static void cm_event_connect_error(struct nes_cm_event *);
+static void cm_event_reset(struct nes_cm_event *);
+static void cm_event_mpa_req(struct nes_cm_event *);
+static void cm_event_mpa_reject(struct nes_cm_event *);
+static void handle_recv_entry(struct nes_cm_node *cm_node, u32 rem_node);
+
+/* MPA build functions */
+static int cm_build_mpa_frame(struct nes_cm_node *, u8 **, u16 *, u8 *, u8);
+static void build_mpa_v2(struct nes_cm_node *, void *, u8);
+static void build_mpa_v1(struct nes_cm_node *, void *, u8);
+static void build_rdma0_msg(struct nes_cm_node *, struct nes_qp **);
+
+static void print_core(struct nes_cm_core *core);
+static void record_ird_ord(struct nes_cm_node *, u16, u16);
+
+/* External CM API Interface */
+/* instance of function pointers for client API */
+/* set address of this instance to cm_core->cm_ops at cm_core alloc */
+static struct nes_cm_ops nes_cm_api = {
+ mini_cm_accelerated,
+ mini_cm_listen,
+ mini_cm_del_listen,
+ mini_cm_connect,
+ mini_cm_close,
+ mini_cm_accept,
+ mini_cm_reject,
+ mini_cm_recv_pkt,
+ mini_cm_dealloc_core,
+ mini_cm_get,
+ mini_cm_set
+};
+
+static struct nes_cm_core *g_cm_core;
+
+atomic_t cm_connects;
+atomic_t cm_accepts;
+atomic_t cm_disconnects;
+atomic_t cm_closes;
+atomic_t cm_connecteds;
+atomic_t cm_connect_reqs;
+atomic_t cm_rejects;
+
+int nes_add_ref_cm_node(struct nes_cm_node *cm_node)
+{
+ return add_ref_cm_node(cm_node);
+}
+
+int nes_rem_ref_cm_node(struct nes_cm_node *cm_node)
+{
+ return rem_ref_cm_node(cm_node->cm_core, cm_node);
+}
+/**
+ * create_event
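+ *
+ * Allocate a CM event of the given type for the node, record the
+ * local/remote address information and queue it via nes_cm_post_event().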
+ */
+static struct nes_cm_event *create_event(struct nes_cm_node * cm_node,
+ enum nes_cm_event_type type)
+{
+ struct nes_cm_event *event;
+
+ if (!cm_node->cm_id)
+ return NULL;
+
+ /* allocate an empty event */
+ event = kzalloc(sizeof(*event), GFP_ATOMIC);
+
+ if (!event)
+ return NULL;
+
+ event->type = type;
+ event->cm_node = cm_node;
+ event->cm_info.rem_addr = cm_node->rem_addr;
+ event->cm_info.loc_addr = cm_node->loc_addr;
+ event->cm_info.rem_port = cm_node->rem_port;
+ event->cm_info.loc_port = cm_node->loc_port;
+ event->cm_info.cm_id = cm_node->cm_id;
+
+ nes_debug(NES_DBG_CM, "cm_node=%p Created event=%p, type=%u, "
+ "dst_addr=%08x[%x], src_addr=%08x[%x]\n",
+ cm_node, event, type, event->cm_info.loc_addr,
+ event->cm_info.loc_port, event->cm_info.rem_addr,
+ event->cm_info.rem_port);
+
+ nes_cm_post_event(event);
+ return event;
+}
+
+
+/**
+ * send_mpa_request
+ */
+static int send_mpa_request(struct nes_cm_node *cm_node, struct sk_buff *skb)
+{
+ u8 start_addr = 0;
+ u8 *start_ptr = &start_addr;
+ u8 **start_buff = &start_ptr;
+ u16 buff_len = 0;
+
+ if (!skb) {
+ nes_debug(NES_DBG_CM, "skb set to NULL\n");
+ return -1;
+ }
+
+ /* send an MPA Request frame */
+ cm_build_mpa_frame(cm_node, start_buff, &buff_len, NULL, MPA_KEY_REQUEST);
+ form_cm_frame(skb, cm_node, NULL, 0, *start_buff, buff_len, SET_ACK);
+
+ return schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0);
+}
+
+
+
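+/**
+ * send_mpa_reject - send an MPA reject frame and move to FIN_WAIT1
+ */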
+static int send_mpa_reject(struct nes_cm_node *cm_node)
+{
+ struct sk_buff *skb = NULL;
+ u8 start_addr = 0;
+ u8 *start_ptr = &start_addr;
+ u8 **start_buff = &start_ptr;
+ u16 buff_len = 0;
+ struct ietf_mpa_v1 *mpa_frame;
+
+ skb = dev_alloc_skb(MAX_CM_BUFFER);
+ if (!skb) {
+ nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
+ return -ENOMEM;
+ }
+
+ /* send an MPA reject frame */
+ cm_build_mpa_frame(cm_node, start_buff, &buff_len, NULL, MPA_KEY_REPLY);
+ mpa_frame = (struct ietf_mpa_v1 *)*start_buff;
+ mpa_frame->flags |= IETF_MPA_FLAGS_REJECT;
+ form_cm_frame(skb, cm_node, NULL, 0, *start_buff, buff_len, SET_ACK | SET_FIN);
+
+ cm_node->state = NES_CM_STATE_FIN_WAIT1;
+ return schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0);
+}
+
+
+/**
+ * parse_mpa - process a received TCP payload; we expect an
+ * IETF MPA frame
+ */
+static int parse_mpa(struct nes_cm_node *cm_node, u8 *buffer, u32 *type,
+ u32 len)
+{
+ struct ietf_mpa_v1 *mpa_frame;
+ struct ietf_mpa_v2 *mpa_v2_frame;
+ struct ietf_rtr_msg *rtr_msg;
+ int mpa_hdr_len;
+ int priv_data_len;
+
+ *type = NES_MPA_REQUEST_ACCEPT;
+
+ /* assume req frame is in tcp data payload */
+ if (len < sizeof(struct ietf_mpa_v1)) {
+ nes_debug(NES_DBG_CM, "The received ietf buffer was too small (%x)\n", len);
+ return -EINVAL;
+ }
+
+ /* points to the beginning of the frame, which could be MPA V1 or V2 */
+ mpa_frame = (struct ietf_mpa_v1 *)buffer;
+ mpa_hdr_len = sizeof(struct ietf_mpa_v1);
+ priv_data_len = ntohs(mpa_frame->priv_data_len);
+
+	/* make sure the MPA private data length does not exceed 512 bytes */
+ if (priv_data_len > IETF_MAX_PRIV_DATA_LEN) {
+ nes_debug(NES_DBG_CM, "The received Length of Private"
+ " Data field exceeds 512 octets\n");
+ return -EINVAL;
+ }
+	/*
+	 * make sure the MPA receiver can interoperate with the
+	 * received MPA version and MPA key information
+	 */
+ if (mpa_frame->rev != IETF_MPA_V1 && mpa_frame->rev != IETF_MPA_V2) {
+ nes_debug(NES_DBG_CM, "The received mpa version"
+ " is not supported\n");
+ return -EINVAL;
+ }
+ /*
+ * backwards compatibility only
+ */
+ if (mpa_frame->rev > cm_node->mpa_frame_rev) {
+		nes_debug(NES_DBG_CM, "The received mpa version"
+			" is higher than the locally supported version\n");
+ return -EINVAL;
+ } else {
+ cm_node->mpa_frame_rev = mpa_frame->rev;
+ }
+
+ if (cm_node->state != NES_CM_STATE_MPAREQ_SENT) {
+ if (memcmp(mpa_frame->key, IEFT_MPA_KEY_REQ, IETF_MPA_KEY_SIZE)) {
+			nes_debug(NES_DBG_CM, "Unexpected MPA key received (expected REQ key)\n");
+ return -EINVAL;
+ }
+ } else {
+ if (memcmp(mpa_frame->key, IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE)) {
+			nes_debug(NES_DBG_CM, "Unexpected MPA key received (expected REP key)\n");
+ return -EINVAL;
+ }
+ }
+
+ if (priv_data_len + mpa_hdr_len != len) {
+		nes_debug(NES_DBG_CM, "The received ietf buffer has an"
+			" inconsistent length (%x + %x != %x)\n",
+ priv_data_len, mpa_hdr_len, len);
+ return -EINVAL;
+ }
+ /* make sure it does not exceed the max size */
+ if (len > MAX_CM_BUFFER) {
+ nes_debug(NES_DBG_CM, "The received ietf buffer was too large"
+			" (%x + %x = %x)\n",
+ priv_data_len, mpa_hdr_len, len);
+ return -EINVAL;
+ }
+
+ cm_node->mpa_frame_size = priv_data_len;
+
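+	/* MPA v2 appends an RTR message to the v1 header; parse its IRD/ORD
+	 * values and the requested RDMA0 operation */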
+ switch (mpa_frame->rev) {
+ case IETF_MPA_V2: {
+ u16 ird_size;
+ u16 ord_size;
+ u16 rtr_ctrl_ird;
+ u16 rtr_ctrl_ord;
+
+ mpa_v2_frame = (struct ietf_mpa_v2 *)buffer;
+ mpa_hdr_len += IETF_RTR_MSG_SIZE;
+ cm_node->mpa_frame_size -= IETF_RTR_MSG_SIZE;
+ rtr_msg = &mpa_v2_frame->rtr_msg;
+
+ /* parse rtr message */
+ rtr_ctrl_ird = ntohs(rtr_msg->ctrl_ird);
+ rtr_ctrl_ord = ntohs(rtr_msg->ctrl_ord);
+ ird_size = rtr_ctrl_ird & IETF_NO_IRD_ORD;
+ ord_size = rtr_ctrl_ord & IETF_NO_IRD_ORD;
+
+ if (!(rtr_ctrl_ird & IETF_PEER_TO_PEER)) {
+ /* send reset */
+ return -EINVAL;
+ }
+ if (ird_size == IETF_NO_IRD_ORD || ord_size == IETF_NO_IRD_ORD)
+ cm_node->mpav2_ird_ord = IETF_NO_IRD_ORD;
+
+ if (cm_node->mpav2_ird_ord != IETF_NO_IRD_ORD) {
+ /* responder */
+ if (cm_node->state != NES_CM_STATE_MPAREQ_SENT) {
+ /* we are still negotiating */
+ if (ord_size > NES_MAX_IRD) {
+ cm_node->ird_size = NES_MAX_IRD;
+ } else {
+ cm_node->ird_size = ord_size;
+ if (ord_size == 0 &&
+ (rtr_ctrl_ord & IETF_RDMA0_READ)) {
+ cm_node->ird_size = 1;
+ nes_debug(NES_DBG_CM,
+ "%s: Remote peer doesn't support RDMA0_READ (ord=%u)\n",
+ __func__, ord_size);
+ }
+ }
+ if (ird_size > NES_MAX_ORD)
+ cm_node->ord_size = NES_MAX_ORD;
+ else
+ cm_node->ord_size = ird_size;
+ } else { /* initiator */
+ if (ord_size > NES_MAX_IRD) {
+ nes_debug(NES_DBG_CM,
+					"%s: Unable to support the requested ORD size (ord=%u)\n",
+ __func__, ord_size);
+ return -EINVAL;
+ }
+ cm_node->ird_size = ord_size;
+
+ if (ird_size > NES_MAX_ORD) {
+ cm_node->ord_size = NES_MAX_ORD;
+ } else {
+ if (ird_size == 0 &&
+ (rtr_ctrl_ord & IETF_RDMA0_READ)) {
+ nes_debug(NES_DBG_CM,
+ "%s: Remote peer doesn't support RDMA0_READ (ird=%u)\n",
+ __func__, ird_size);
+ return -EINVAL;
+ } else {
+ cm_node->ord_size = ird_size;
+ }
+ }
+ }
+ }
+
+ if (rtr_ctrl_ord & IETF_RDMA0_READ) {
+ cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO;
+
+ } else if (rtr_ctrl_ord & IETF_RDMA0_WRITE) {
+ cm_node->send_rdma0_op = SEND_RDMA_WRITE_ZERO;
+ } else { /* Not supported RDMA0 operation */
+ return -EINVAL;
+ }
+ break;
+ }
+ case IETF_MPA_V1:
+ default:
+ break;
+ }
+
+ /* copy entire MPA frame to our cm_node's frame */
+ memcpy(cm_node->mpa_frame_buf, buffer + mpa_hdr_len, cm_node->mpa_frame_size);
+
+ if (mpa_frame->flags & IETF_MPA_FLAGS_REJECT)
+ *type = NES_MPA_REQUEST_REJECT;
+ return 0;
+}
+
+
+/**
+ * form_cm_frame - build a CM frame (Ethernet, IP and TCP headers plus
+ * optional TCP options and data) in the given skb, using node info.
+ */
+static void form_cm_frame(struct sk_buff *skb,
+ struct nes_cm_node *cm_node, void *options, u32 optionsize,
+ void *data, u32 datasize, u8 flags)
+{
+ struct tcphdr *tcph;
+ struct iphdr *iph;
+ struct ethhdr *ethh;
+ u8 *buf;
+ u16 packetsize = sizeof(*iph);
+
+ packetsize += sizeof(*tcph);
+ packetsize += optionsize + datasize;
+
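+	/* reset the skb and lay down Ethernet, IP and TCP headers back to back */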
+ skb_trim(skb, 0);
+ memset(skb->data, 0x00, ETH_HLEN + sizeof(*iph) + sizeof(*tcph));
+
+ buf = skb_put(skb, packetsize + ETH_HLEN);
+
+ ethh = (struct ethhdr *)buf;
+ buf += ETH_HLEN;
+
+ iph = (struct iphdr *)buf;
+ buf += sizeof(*iph);
+ tcph = (struct tcphdr *)buf;
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, ETH_HLEN);
+ skb_set_transport_header(skb, ETH_HLEN + sizeof(*iph));
+ buf += sizeof(*tcph);
+
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ if (!(cm_node->netdev->features & NETIF_F_IP_CSUM))
+ skb->ip_summed = CHECKSUM_NONE;
+ skb->protocol = htons(0x800);
+ skb->data_len = 0;
+ skb->mac_len = ETH_HLEN;
+
+ memcpy(ethh->h_dest, cm_node->rem_mac, ETH_ALEN);
+ memcpy(ethh->h_source, cm_node->loc_mac, ETH_ALEN);
+ ethh->h_proto = htons(0x0800);
+
+ iph->version = IPVERSION;
+	iph->ihl = 5; /* 5 * 4-byte words, IP header len */
+ iph->tos = 0;
+ iph->tot_len = htons(packetsize);
+ iph->id = htons(++cm_node->tcp_cntxt.loc_id);
+
+ iph->frag_off = htons(0x4000);
+ iph->ttl = 0x40;
+ iph->protocol = 0x06; /* IPPROTO_TCP */
+
+ iph->saddr = htonl(cm_node->mapped_loc_addr);
+ iph->daddr = htonl(cm_node->mapped_rem_addr);
+
+ tcph->source = htons(cm_node->mapped_loc_port);
+ tcph->dest = htons(cm_node->mapped_rem_port);
+ tcph->seq = htonl(cm_node->tcp_cntxt.loc_seq_num);
+
+ if (flags & SET_ACK) {
+ cm_node->tcp_cntxt.loc_ack_num = cm_node->tcp_cntxt.rcv_nxt;
+ tcph->ack_seq = htonl(cm_node->tcp_cntxt.loc_ack_num);
+ tcph->ack = 1;
+ } else {
+ tcph->ack_seq = 0;
+ }
+
+ if (flags & SET_SYN) {
+ cm_node->tcp_cntxt.loc_seq_num++;
+ tcph->syn = 1;
+ } else {
+ cm_node->tcp_cntxt.loc_seq_num += datasize;
+ }
+
+ if (flags & SET_FIN) {
+ cm_node->tcp_cntxt.loc_seq_num++;
+ tcph->fin = 1;
+ }
+
+ if (flags & SET_RST)
+ tcph->rst = 1;
+
+ tcph->doff = (u16)((sizeof(*tcph) + optionsize + 3) >> 2);
+ tcph->window = htons(cm_node->tcp_cntxt.rcv_wnd);
+ tcph->urg_ptr = 0;
+ if (optionsize)
+ memcpy(buf, options, optionsize);
+ buf += optionsize;
+ if (datasize)
+ memcpy(buf, data, datasize);
+
+ skb_shinfo(skb)->nr_frags = 0;
+ cm_packets_created++;
+}
+
+/*
+ * nes_create_sockaddr - Record ip addr and tcp port in a sockaddr struct
+ */
+static void nes_create_sockaddr(__be32 ip_addr, __be16 port,
+ struct sockaddr_storage *addr)
+{
+ struct sockaddr_in *nes_sockaddr = (struct sockaddr_in *)addr;
+ nes_sockaddr->sin_family = AF_INET;
+ memcpy(&nes_sockaddr->sin_addr.s_addr, &ip_addr, sizeof(__be32));
+ nes_sockaddr->sin_port = port;
+}
+
+/*
+ * nes_create_mapinfo - Create a mapinfo object in the port mapper database
+ */
+static int nes_create_mapinfo(struct nes_cm_info *cm_info)
+{
+ struct sockaddr_storage local_sockaddr;
+ struct sockaddr_storage mapped_sockaddr;
+
+ nes_create_sockaddr(htonl(cm_info->loc_addr), htons(cm_info->loc_port),
+ &local_sockaddr);
+ nes_create_sockaddr(htonl(cm_info->mapped_loc_addr),
+ htons(cm_info->mapped_loc_port), &mapped_sockaddr);
+
+ return iwpm_create_mapinfo(&local_sockaddr,
+ &mapped_sockaddr, RDMA_NL_NES);
+}
+
+/*
+ * nes_remove_mapinfo - Remove a mapinfo object from the port mapper database
+ * and send a remove mapping op message to
+ * the userspace port mapper
+ */
+static int nes_remove_mapinfo(u32 loc_addr, u16 loc_port,
+ u32 mapped_loc_addr, u16 mapped_loc_port)
+{
+ struct sockaddr_storage local_sockaddr;
+ struct sockaddr_storage mapped_sockaddr;
+
+ nes_create_sockaddr(htonl(loc_addr), htons(loc_port), &local_sockaddr);
+ nes_create_sockaddr(htonl(mapped_loc_addr), htons(mapped_loc_port),
+ &mapped_sockaddr);
+
+ iwpm_remove_mapinfo(&local_sockaddr, &mapped_sockaddr);
+ return iwpm_remove_mapping(&local_sockaddr, RDMA_NL_NES);
+}
+
+/*
+ * nes_form_pm_msg - Form a port mapper message with mapping info
+ */
+static void nes_form_pm_msg(struct nes_cm_info *cm_info,
+ struct iwpm_sa_data *pm_msg)
+{
+ nes_create_sockaddr(htonl(cm_info->loc_addr), htons(cm_info->loc_port),
+ &pm_msg->loc_addr);
+ nes_create_sockaddr(htonl(cm_info->rem_addr), htons(cm_info->rem_port),
+ &pm_msg->rem_addr);
+}
+
+/*
+ * nes_form_reg_msg - Form a port mapper message with dev info
+ */
+static void nes_form_reg_msg(struct nes_vnic *nesvnic,
+ struct iwpm_dev_data *pm_msg)
+{
+ memcpy(pm_msg->dev_name, nesvnic->nesibdev->ibdev.name,
+ IWPM_DEVNAME_SIZE);
+ memcpy(pm_msg->if_name, nesvnic->netdev->name, IWPM_IFNAME_SIZE);
+}
+
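+/*
+ * record_sockaddr_info - extract the IPv4 address and port (host order)
+ * from a sockaddr_storage
+ */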
+static void record_sockaddr_info(struct sockaddr_storage *addr_info,
+ nes_addr_t *ip_addr, u16 *port_num)
+{
+ struct sockaddr_in *in_addr = (struct sockaddr_in *)addr_info;
+
+ if (in_addr->sin_family == AF_INET) {
+ *ip_addr = ntohl(in_addr->sin_addr.s_addr);
+ *port_num = ntohs(in_addr->sin_port);
+ }
+}
+
+/*
+ * nes_record_pm_msg - Save the received mapping info
+ */
+static void nes_record_pm_msg(struct nes_cm_info *cm_info,
+ struct iwpm_sa_data *pm_msg)
+{
+ record_sockaddr_info(&pm_msg->mapped_loc_addr,
+ &cm_info->mapped_loc_addr, &cm_info->mapped_loc_port);
+
+ record_sockaddr_info(&pm_msg->mapped_rem_addr,
+ &cm_info->mapped_rem_addr, &cm_info->mapped_rem_port);
+}
+
+/*
+ * nes_get_remote_addr - Get the address info of the remote connecting peer
+ */
+static int nes_get_remote_addr(struct nes_cm_node *cm_node)
+{
+ struct sockaddr_storage mapped_loc_addr, mapped_rem_addr;
+ struct sockaddr_storage remote_addr;
+ int ret;
+
+ nes_create_sockaddr(htonl(cm_node->mapped_loc_addr),
+ htons(cm_node->mapped_loc_port), &mapped_loc_addr);
+ nes_create_sockaddr(htonl(cm_node->mapped_rem_addr),
+ htons(cm_node->mapped_rem_port), &mapped_rem_addr);
+
+ ret = iwpm_get_remote_info(&mapped_loc_addr, &mapped_rem_addr,
+ &remote_addr, RDMA_NL_NES);
+ if (ret)
+ nes_debug(NES_DBG_CM, "Unable to find remote peer address info\n");
+ else
+ record_sockaddr_info(&remote_addr, &cm_node->rem_addr,
+ &cm_node->rem_port);
+ return ret;
+}
+
+/**
+ * print_core - dump a cm core
+ */
+static void print_core(struct nes_cm_core *core)
+{
+ nes_debug(NES_DBG_CM, "---------------------------------------------\n");
+ nes_debug(NES_DBG_CM, "CM Core -- (core = %p )\n", core);
+ if (!core)
+ return;
+ nes_debug(NES_DBG_CM, "---------------------------------------------\n");
+
+ nes_debug(NES_DBG_CM, "State : %u \n", core->state);
+
+ nes_debug(NES_DBG_CM, "Listen Nodes : %u \n", atomic_read(&core->listen_node_cnt));
+ nes_debug(NES_DBG_CM, "Active Nodes : %u \n", atomic_read(&core->node_cnt));
+
+ nes_debug(NES_DBG_CM, "core : %p \n", core);
+
+ nes_debug(NES_DBG_CM, "-------------- end core ---------------\n");
+}
+
+static void record_ird_ord(struct nes_cm_node *cm_node,
+ u16 conn_ird, u16 conn_ord)
+{
+ if (conn_ird > NES_MAX_IRD)
+ conn_ird = NES_MAX_IRD;
+
+ if (conn_ord > NES_MAX_ORD)
+ conn_ord = NES_MAX_ORD;
+
+ cm_node->ird_size = conn_ird;
+ cm_node->ord_size = conn_ord;
+}
+
+/**
+ * cm_build_mpa_frame - build a MPA V1 frame or MPA V2 frame
+ */
+static int cm_build_mpa_frame(struct nes_cm_node *cm_node, u8 **start_buff,
+ u16 *buff_len, u8 *pci_mem, u8 mpa_key)
+{
+ int ret = 0;
+
+ *start_buff = (pci_mem) ? pci_mem : &cm_node->mpa_frame_buf[0];
+
+ switch (cm_node->mpa_frame_rev) {
+ case IETF_MPA_V1:
+ *start_buff = (u8 *)*start_buff + sizeof(struct ietf_rtr_msg);
+ *buff_len = sizeof(struct ietf_mpa_v1) + cm_node->mpa_frame_size;
+ build_mpa_v1(cm_node, *start_buff, mpa_key);
+ break;
+ case IETF_MPA_V2:
+ *buff_len = sizeof(struct ietf_mpa_v2) + cm_node->mpa_frame_size;
+ build_mpa_v2(cm_node, *start_buff, mpa_key);
+ break;
+ default:
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
+/**
+ * build_mpa_v2 - build a MPA V2 frame
+ */
+static void build_mpa_v2(struct nes_cm_node *cm_node,
+ void *start_addr, u8 mpa_key)
+{
+ struct ietf_mpa_v2 *mpa_frame = (struct ietf_mpa_v2 *)start_addr;
+ struct ietf_rtr_msg *rtr_msg = &mpa_frame->rtr_msg;
+ u16 ctrl_ird;
+ u16 ctrl_ord;
+
+ /* initialize the upper 5 bytes of the frame */
+ build_mpa_v1(cm_node, start_addr, mpa_key);
+ mpa_frame->flags |= IETF_MPA_V2_FLAG; /* set a bit to indicate MPA V2 */
+ mpa_frame->priv_data_len += htons(IETF_RTR_MSG_SIZE);
+
+ /* initialize RTR msg */
+ if (cm_node->mpav2_ird_ord == IETF_NO_IRD_ORD) {
+ ctrl_ird = IETF_NO_IRD_ORD;
+ ctrl_ord = IETF_NO_IRD_ORD;
+ } else {
+ ctrl_ird = cm_node->ird_size & IETF_NO_IRD_ORD;
+ ctrl_ord = cm_node->ord_size & IETF_NO_IRD_ORD;
+ }
+ ctrl_ird |= IETF_PEER_TO_PEER;
+ ctrl_ird |= IETF_FLPDU_ZERO_LEN;
+
+ switch (mpa_key) {
+ case MPA_KEY_REQUEST:
+ ctrl_ord |= IETF_RDMA0_WRITE;
+ ctrl_ord |= IETF_RDMA0_READ;
+ break;
+ case MPA_KEY_REPLY:
+ switch (cm_node->send_rdma0_op) {
+ case SEND_RDMA_WRITE_ZERO:
+ ctrl_ord |= IETF_RDMA0_WRITE;
+ break;
+ case SEND_RDMA_READ_ZERO:
+ ctrl_ord |= IETF_RDMA0_READ;
+ break;
+ }
+ }
+ rtr_msg->ctrl_ird = htons(ctrl_ird);
+ rtr_msg->ctrl_ord = htons(ctrl_ord);
+}
+
+/**
+ * build_mpa_v1 - build a MPA V1 frame
+ */
+static void build_mpa_v1(struct nes_cm_node *cm_node, void *start_addr, u8 mpa_key)
+{
+ struct ietf_mpa_v1 *mpa_frame = (struct ietf_mpa_v1 *)start_addr;
+
+ switch (mpa_key) {
+ case MPA_KEY_REQUEST:
+ memcpy(mpa_frame->key, IEFT_MPA_KEY_REQ, IETF_MPA_KEY_SIZE);
+ break;
+ case MPA_KEY_REPLY:
+ memcpy(mpa_frame->key, IEFT_MPA_KEY_REP, IETF_MPA_KEY_SIZE);
+ break;
+ }
+ mpa_frame->flags = IETF_MPA_FLAGS_CRC;
+ mpa_frame->rev = cm_node->mpa_frame_rev;
+ mpa_frame->priv_data_len = htons(cm_node->mpa_frame_size);
+}
+
+static void build_rdma0_msg(struct nes_cm_node *cm_node, struct nes_qp **nesqp_addr)
+{
+ u64 u64temp;
+ struct nes_qp *nesqp = *nesqp_addr;
+ struct nes_hw_qp_wqe *wqe = &nesqp->hwqp.sq_vbase[0];
+
+ u64temp = (unsigned long)nesqp->nesuqp_addr;
+ u64temp |= NES_SW_CONTEXT_ALIGN >> 1;
+ set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX, u64temp);
+
+ wqe->wqe_words[NES_IWARP_SQ_WQE_FRAG0_LOW_IDX] = 0;
+ wqe->wqe_words[NES_IWARP_SQ_WQE_FRAG0_HIGH_IDX] = 0;
+
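+	/* the RDMA0 operation negotiated in the MPA v2 RTR exchange selects a
+	 * zero-length RDMA write or read as the first WQE */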
+ switch (cm_node->send_rdma0_op) {
+ case SEND_RDMA_WRITE_ZERO:
+ nes_debug(NES_DBG_CM, "Sending first write.\n");
+ wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] =
+ cpu_to_le32(NES_IWARP_SQ_OP_RDMAW);
+ wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX] = 0;
+ wqe->wqe_words[NES_IWARP_SQ_WQE_LENGTH0_IDX] = 0;
+ wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = 0;
+ break;
+
+ case SEND_RDMA_READ_ZERO:
+ default:
+ if (cm_node->send_rdma0_op != SEND_RDMA_READ_ZERO)
+ WARN(1, "Unsupported RDMA0 len operation=%u\n",
+ cm_node->send_rdma0_op);
+ nes_debug(NES_DBG_CM, "Sending first rdma operation.\n");
+ wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] =
+ cpu_to_le32(NES_IWARP_SQ_OP_RDMAR);
+ wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX] = 1;
+ wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_TO_HIGH_IDX] = 0;
+ wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX] = 0;
+ wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_STAG_IDX] = 1;
+ wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = 1;
+ break;
+ }
+
+ if (nesqp->sq_kmapped) {
+ nesqp->sq_kmapped = 0;
+ kunmap(nesqp->page);
+ }
+
+	/* use the reserved spot on the WQ for the extra first WQE */
+ nesqp->nesqp_context->ird_ord_sizes &= cpu_to_le32(~(NES_QPCONTEXT_ORDIRD_LSMM_PRESENT |
+ NES_QPCONTEXT_ORDIRD_WRPDU |
+ NES_QPCONTEXT_ORDIRD_ALSMM));
+ nesqp->skip_lsmm = 1;
+ nesqp->hwqp.sq_tail = 0;
+}
+
+/**
+ * schedule_nes_timer
+ * note - cm_node must be protected before calling this: take a reference
+ * with add_ref_cm_node(cm_node) beforehand and release it afterwards with
+ * rem_ref_cm_node(cm_core, cm_node).
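+ *
+ * A minimal usage sketch (illustrative only, not taken from a caller):
+ *
+ *	add_ref_cm_node(cm_node);
+ *	ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0);
+ *	...
+ *	rem_ref_cm_node(cm_node->cm_core, cm_node);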
+ */
+int schedule_nes_timer(struct nes_cm_node *cm_node, struct sk_buff *skb,
+ enum nes_timer_type type, int send_retrans,
+ int close_when_complete)
+{
+ unsigned long flags;
+ struct nes_cm_core *cm_core = cm_node->cm_core;
+ struct nes_timer_entry *new_send;
+ int ret = 0;
+
+ new_send = kzalloc(sizeof(*new_send), GFP_ATOMIC);
+ if (!new_send)
+ return -ENOMEM;
+
+ /* new_send->timetosend = currenttime */
+ new_send->retrycount = NES_DEFAULT_RETRYS;
+ new_send->retranscount = NES_DEFAULT_RETRANS;
+ new_send->skb = skb;
+ new_send->timetosend = jiffies;
+ new_send->type = type;
+ new_send->netdev = cm_node->netdev;
+ new_send->send_retrans = send_retrans;
+ new_send->close_when_complete = close_when_complete;
+
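+	/* a CLOSE-type entry parks on recv_entry for deferred teardown;
+	 * a SEND-type entry drives (re)transmission below */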
+ if (type == NES_TIMER_TYPE_CLOSE) {
+ new_send->timetosend += (HZ / 10);
+ if (cm_node->recv_entry) {
+ kfree(new_send);
+ WARN_ON(1);
+ return -EINVAL;
+ }
+ cm_node->recv_entry = new_send;
+ }
+
+ if (type == NES_TIMER_TYPE_SEND) {
+ new_send->seq_num = ntohl(tcp_hdr(skb)->seq);
+ atomic_inc(&new_send->skb->users);
+ spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
+ cm_node->send_entry = new_send;
+ add_ref_cm_node(cm_node);
+ spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags);
+ new_send->timetosend = jiffies + NES_RETRY_TIMEOUT;
+
+ ret = nes_nic_cm_xmit(new_send->skb, cm_node->netdev);
+ if (ret != NETDEV_TX_OK) {
+ nes_debug(NES_DBG_CM, "Error sending packet %p "
+ "(jiffies = %lu)\n", new_send, jiffies);
+ new_send->timetosend = jiffies;
+ ret = NETDEV_TX_OK;
+ } else {
+ cm_packets_sent++;
+ if (!send_retrans) {
+ cleanup_retrans_entry(cm_node);
+ if (close_when_complete)
+ rem_ref_cm_node(cm_core, cm_node);
+ return ret;
+ }
+ }
+ }
+
+ if (!timer_pending(&cm_core->tcp_timer))
+ mod_timer(&cm_core->tcp_timer, new_send->timetosend);
+
+ return ret;
+}
+
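+/*
+ * nes_retrans_expired - the retransmit budget for a node is exhausted;
+ * tear the connection down according to its current state
+ */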
+static void nes_retrans_expired(struct nes_cm_node *cm_node)
+{
+ struct iw_cm_id *cm_id = cm_node->cm_id;
+ enum nes_cm_node_state state = cm_node->state;
+ cm_node->state = NES_CM_STATE_CLOSED;
+
+ switch (state) {
+ case NES_CM_STATE_SYN_RCVD:
+ case NES_CM_STATE_CLOSING:
+ rem_ref_cm_node(cm_node->cm_core, cm_node);
+ break;
+ case NES_CM_STATE_LAST_ACK:
+ case NES_CM_STATE_FIN_WAIT1:
+ if (cm_node->cm_id)
+ cm_id->rem_ref(cm_id);
+ send_reset(cm_node, NULL);
+ break;
+ default:
+ add_ref_cm_node(cm_node);
+ send_reset(cm_node, NULL);
+ create_event(cm_node, NES_CM_EVENT_ABORTED);
+ }
+}
+
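+/*
+ * handle_recv_entry - complete a pending NES_TIMER_TYPE_CLOSE entry:
+ * disconnect the attached QP if there is one, otherwise drop the
+ * TIME_WAIT node reference
+ */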
+static void handle_recv_entry(struct nes_cm_node *cm_node, u32 rem_node)
+{
+ struct nes_timer_entry *recv_entry = cm_node->recv_entry;
+ struct iw_cm_id *cm_id = cm_node->cm_id;
+ struct nes_qp *nesqp;
+ unsigned long qplockflags;
+
+ if (!recv_entry)
+ return;
+ nesqp = (struct nes_qp *)recv_entry->skb;
+ if (nesqp) {
+ spin_lock_irqsave(&nesqp->lock, qplockflags);
+ if (nesqp->cm_id) {
+ nes_debug(NES_DBG_CM, "QP%u: cm_id = %p, "
+ "refcount = %d: HIT A "
+ "NES_TIMER_TYPE_CLOSE with something "
+ "to do!!!\n", nesqp->hwqp.qp_id, cm_id,
+ atomic_read(&nesqp->refcount));
+ nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED;
+ nesqp->last_aeq = NES_AEQE_AEID_RESET_SENT;
+ nesqp->ibqp_state = IB_QPS_ERR;
+ spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+ nes_cm_disconn(nesqp);
+ } else {
+ spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+ nes_debug(NES_DBG_CM, "QP%u: cm_id = %p, "
+ "refcount = %d: HIT A "
+ "NES_TIMER_TYPE_CLOSE with nothing "
+ "to do!!!\n", nesqp->hwqp.qp_id, cm_id,
+ atomic_read(&nesqp->refcount));
+ }
+ } else if (rem_node) {
+ /* TIME_WAIT state */
+ rem_ref_cm_node(cm_node->cm_core, cm_node);
+ }
+ if (cm_node->cm_id)
+ cm_id->rem_ref(cm_id);
+ kfree(recv_entry);
+ cm_node->recv_entry = NULL;
+}
+
+/**
+ * nes_cm_timer_tick
+ */
+static void nes_cm_timer_tick(unsigned long pass)
+{
+ unsigned long flags;
+ unsigned long nexttimeout = jiffies + NES_LONG_TIME;
+ struct nes_cm_node *cm_node;
+ struct nes_timer_entry *send_entry, *recv_entry;
+ struct list_head *list_core_temp;
+ struct list_head *list_node;
+ struct nes_cm_core *cm_core = g_cm_core;
+ u32 settimer = 0;
+ unsigned long timetosend;
+ int ret = NETDEV_TX_OK;
+
+ struct list_head timer_list;
+
+ INIT_LIST_HEAD(&timer_list);
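+	/* pass 1: under ht_lock, reference every node with a pending timer
+	 * and collect it on a private list */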
+ spin_lock_irqsave(&cm_core->ht_lock, flags);
+
+ list_for_each_safe(list_node, list_core_temp,
+ &cm_core->connected_nodes) {
+ cm_node = container_of(list_node, struct nes_cm_node, list);
+ if ((cm_node->recv_entry) || (cm_node->send_entry)) {
+ add_ref_cm_node(cm_node);
+ list_add(&cm_node->timer_entry, &timer_list);
+ }
+ }
+ spin_unlock_irqrestore(&cm_core->ht_lock, flags);
+
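+	/* pass 2: service each collected node's receive and retransmit
+	 * timers without holding ht_lock */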
+ list_for_each_safe(list_node, list_core_temp, &timer_list) {
+ cm_node = container_of(list_node, struct nes_cm_node,
+ timer_entry);
+ recv_entry = cm_node->recv_entry;
+
+ if (recv_entry) {
+ if (time_after(recv_entry->timetosend, jiffies)) {
+ if (nexttimeout > recv_entry->timetosend ||
+ !settimer) {
+ nexttimeout = recv_entry->timetosend;
+ settimer = 1;
+ }
+ } else {
+ handle_recv_entry(cm_node, 1);
+ }
+ }
+
+ spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
+ do {
+ send_entry = cm_node->send_entry;
+ if (!send_entry)
+ break;
+ if (time_after(send_entry->timetosend, jiffies)) {
+ if (cm_node->state != NES_CM_STATE_TSA) {
+ if ((nexttimeout >
+ send_entry->timetosend) ||
+ !settimer) {
+ nexttimeout =
+ send_entry->timetosend;
+ settimer = 1;
+ }
+ } else {
+ free_retrans_entry(cm_node);
+ }
+ break;
+ }
+
+ if ((cm_node->state == NES_CM_STATE_TSA) ||
+ (cm_node->state == NES_CM_STATE_CLOSED)) {
+ free_retrans_entry(cm_node);
+ break;
+ }
+
+ if (!send_entry->retranscount ||
+ !send_entry->retrycount) {
+ cm_packets_dropped++;
+ free_retrans_entry(cm_node);
+
+ spin_unlock_irqrestore(
+ &cm_node->retrans_list_lock, flags);
+ nes_retrans_expired(cm_node);
+ cm_node->state = NES_CM_STATE_CLOSED;
+ spin_lock_irqsave(&cm_node->retrans_list_lock,
+ flags);
+ break;
+ }
+ atomic_inc(&send_entry->skb->users);
+ cm_packets_retrans++;
+ nes_debug(NES_DBG_CM, "Retransmitting send_entry %p "
+ "for node %p, jiffies = %lu, time to send = "
+ "%lu, retranscount = %u, send_entry->seq_num = "
+ "0x%08X, cm_node->tcp_cntxt.rem_ack_num = "
+ "0x%08X\n", send_entry, cm_node, jiffies,
+ send_entry->timetosend,
+ send_entry->retranscount,
+ send_entry->seq_num,
+ cm_node->tcp_cntxt.rem_ack_num);
+
+ spin_unlock_irqrestore(&cm_node->retrans_list_lock,
+ flags);
+ ret = nes_nic_cm_xmit(send_entry->skb, cm_node->netdev);
+ spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
+ if (ret != NETDEV_TX_OK) {
+ nes_debug(NES_DBG_CM, "rexmit failed for "
+ "node=%p\n", cm_node);
+ cm_packets_bounced++;
+ send_entry->retrycount--;
+ nexttimeout = jiffies + NES_SHORT_TIME;
+ settimer = 1;
+ break;
+ } else {
+ cm_packets_sent++;
+ }
+ nes_debug(NES_DBG_CM, "Packet Sent: retrans count = "
+ "%u, retry count = %u.\n",
+ send_entry->retranscount,
+ send_entry->retrycount);
+ if (send_entry->send_retrans) {
+ send_entry->retranscount--;
+ timetosend = (NES_RETRY_TIMEOUT <<
+ (NES_DEFAULT_RETRANS - send_entry->retranscount));
+
+ send_entry->timetosend = jiffies +
+ min(timetosend, NES_MAX_TIMEOUT);
+ if (nexttimeout > send_entry->timetosend ||
+ !settimer) {
+ nexttimeout = send_entry->timetosend;
+ settimer = 1;
+ }
+ } else {
+ int close_when_complete;
+ close_when_complete =
+ send_entry->close_when_complete;
+ nes_debug(NES_DBG_CM, "cm_node=%p state=%d\n",
+ cm_node, cm_node->state);
+ free_retrans_entry(cm_node);
+ if (close_when_complete)
+ rem_ref_cm_node(cm_node->cm_core,
+ cm_node);
+ }
+ } while (0);
+
+ spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags);
+ rem_ref_cm_node(cm_node->cm_core, cm_node);
+ }
+
+ if (settimer) {
+ if (!timer_pending(&cm_core->tcp_timer))
+ mod_timer(&cm_core->tcp_timer, nexttimeout);
+ }
+}
+
+
+/**
+ * send_syn
+ */
+static int send_syn(struct nes_cm_node *cm_node, u32 sendack,
+ struct sk_buff *skb)
+{
+ int ret;
+ int flags = SET_SYN;
+ char optionsbuffer[sizeof(struct option_mss) +
+ sizeof(struct option_windowscale) + sizeof(struct option_base) +
+ TCP_OPTIONS_PADDING];
+
+ int optionssize = 0;
+ /* Sending MSS option */
+ union all_known_options *options;
+
+ if (!cm_node)
+ return -EINVAL;
+
+ options = (union all_known_options *)&optionsbuffer[optionssize];
+ options->as_mss.optionnum = OPTION_NUMBER_MSS;
+ options->as_mss.length = sizeof(struct option_mss);
+ options->as_mss.mss = htons(cm_node->tcp_cntxt.mss);
+ optionssize += sizeof(struct option_mss);
+
+ options = (union all_known_options *)&optionsbuffer[optionssize];
+ options->as_windowscale.optionnum = OPTION_NUMBER_WINDOW_SCALE;
+ options->as_windowscale.length = sizeof(struct option_windowscale);
+ options->as_windowscale.shiftcount = cm_node->tcp_cntxt.rcv_wscale;
+ optionssize += sizeof(struct option_windowscale);
+
+ if (sendack && !(NES_DRV_OPT_SUPRESS_OPTION_BC & nes_drv_opt)) {
+ options = (union all_known_options *)&optionsbuffer[optionssize];
+ options->as_base.optionnum = OPTION_NUMBER_WRITE0;
+ options->as_base.length = sizeof(struct option_base);
+ optionssize += sizeof(struct option_base);
+ /* we need the size to be a multiple of 4 */
+ options = (union all_known_options *)&optionsbuffer[optionssize];
+ options->as_end = 1;
+ optionssize += 1;
+ options = (union all_known_options *)&optionsbuffer[optionssize];
+ options->as_end = 1;
+ optionssize += 1;
+ }
+
+ options = (union all_known_options *)&optionsbuffer[optionssize];
+ options->as_end = OPTION_NUMBER_END;
+ optionssize += 1;
+
+ if (!skb)
+ skb = dev_alloc_skb(MAX_CM_BUFFER);
+ if (!skb) {
+ nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
+ return -1;
+ }
+
+ if (sendack)
+ flags |= SET_ACK;
+
+ form_cm_frame(skb, cm_node, optionsbuffer, optionssize, NULL, 0, flags);
+ ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0);
+
+ return ret;
+}
+
+
+/**
+ * send_reset
+ */
+static int send_reset(struct nes_cm_node *cm_node, struct sk_buff *skb)
+{
+ int ret;
+ int flags = SET_RST | SET_ACK;
+
+ if (!skb)
+ skb = dev_alloc_skb(MAX_CM_BUFFER);
+ if (!skb) {
+ nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
+ return -ENOMEM;
+ }
+
+ form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, flags);
+ ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 0, 1);
+
+ return ret;
+}
+
+
+/**
+ * send_ack
+ */
+static int send_ack(struct nes_cm_node *cm_node, struct sk_buff *skb)
+{
+ int ret;
+
+ if (!skb)
+ skb = dev_alloc_skb(MAX_CM_BUFFER);
+
+ if (!skb) {
+ nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
+ return -1;
+ }
+
+ form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, SET_ACK);
+ ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 0, 0);
+
+ return ret;
+}
+
+
+/**
+ * send_fin
+ */
+static int send_fin(struct nes_cm_node *cm_node, struct sk_buff *skb)
+{
+ int ret;
+
+ /* if we didn't get a frame get one */
+ if (!skb)
+ skb = dev_alloc_skb(MAX_CM_BUFFER);
+
+ if (!skb) {
+ nes_debug(NES_DBG_CM, "Failed to get a Free pkt\n");
+ return -1;
+ }
+
+ form_cm_frame(skb, cm_node, NULL, 0, NULL, 0, SET_ACK | SET_FIN);
+ ret = schedule_nes_timer(cm_node, skb, NES_TIMER_TYPE_SEND, 1, 0);
+
+ return ret;
+}
+
+
+/**
+ * find_node - find the connected cm node matching the given address/port quad
+ */
+static struct nes_cm_node *find_node(struct nes_cm_core *cm_core,
+ u16 rem_port, nes_addr_t rem_addr, u16 loc_port, nes_addr_t loc_addr)
+{
+ unsigned long flags;
+ struct list_head *hte;
+ struct nes_cm_node *cm_node;
+
+ /* get a handle on the hte */
+ hte = &cm_core->connected_nodes;
+
+ /* walk list and find cm_node associated with this session ID */
+ spin_lock_irqsave(&cm_core->ht_lock, flags);
+ list_for_each_entry(cm_node, hte, list) {
+ /* compare quad, return node handle if a match */
+ nes_debug(NES_DBG_CM, "finding node %x:%x =? %x:%x ^ %x:%x =? %x:%x\n",
+ cm_node->loc_addr, cm_node->loc_port,
+ loc_addr, loc_port,
+ cm_node->rem_addr, cm_node->rem_port,
+ rem_addr, rem_port);
+ if ((cm_node->mapped_loc_addr == loc_addr) &&
+ (cm_node->mapped_loc_port == loc_port) &&
+ (cm_node->mapped_rem_addr == rem_addr) &&
+ (cm_node->mapped_rem_port == rem_port)) {
+
+ add_ref_cm_node(cm_node);
+ spin_unlock_irqrestore(&cm_core->ht_lock, flags);
+ return cm_node;
+ }
+ }
+ spin_unlock_irqrestore(&cm_core->ht_lock, flags);
+
+ /* no owner node */
+ return NULL;
+}
+
+
+/**
+ * find_listener - find a listener on this addr-port pair
+ */
+static struct nes_cm_listener *find_listener(struct nes_cm_core *cm_core,
+ nes_addr_t dst_addr, u16 dst_port,
+ enum nes_cm_listener_state listener_state, int local)
+{
+ unsigned long flags;
+ struct nes_cm_listener *listen_node;
+ nes_addr_t listen_addr;
+ u16 listen_port;
+
+ /* walk list and find cm_node associated with this session ID */
+ spin_lock_irqsave(&cm_core->listen_list_lock, flags);
+ list_for_each_entry(listen_node, &cm_core->listen_list.list, list) {
+ if (local) {
+ listen_addr = listen_node->loc_addr;
+ listen_port = listen_node->loc_port;
+ } else {
+ listen_addr = listen_node->mapped_loc_addr;
+ listen_port = listen_node->mapped_loc_port;
+ }
+ /* compare node pair, return node handle if a match */
+ if (((listen_addr == dst_addr) ||
+ listen_addr == 0x00000000) &&
+ (listen_port == dst_port) &&
+ (listener_state & listen_node->listener_state)) {
+ atomic_inc(&listen_node->ref_count);
+ spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
+ return listen_node;
+ }
+ }
+ spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
+
+ /* no listener */
+ return NULL;
+}
+
+/**
+ * add_hte_node - add a cm node to the hash table
+ */
+static int add_hte_node(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node)
+{
+ unsigned long flags;
+ struct list_head *hte;
+
+ if (!cm_node || !cm_core)
+ return -EINVAL;
+
+ nes_debug(NES_DBG_CM, "Adding Node %p to Active Connection HT\n",
+ cm_node);
+
+ spin_lock_irqsave(&cm_core->ht_lock, flags);
+
+ /* get a handle on the hash table element (list head for this slot) */
+ hte = &cm_core->connected_nodes;
+ list_add_tail(&cm_node->list, hte);
+ atomic_inc(&cm_core->ht_node_cnt);
+
+ spin_unlock_irqrestore(&cm_core->ht_lock, flags);
+
+ return 0;
+}
+
+
+/**
+ * mini_cm_dec_refcnt_listen
+ */
+static int mini_cm_dec_refcnt_listen(struct nes_cm_core *cm_core,
+ struct nes_cm_listener *listener, int free_hanging_nodes)
+{
+ int ret = -EINVAL;
+ int err = 0;
+ unsigned long flags;
+ struct list_head *list_pos = NULL;
+ struct list_head *list_temp = NULL;
+ struct nes_cm_node *cm_node = NULL;
+ struct list_head reset_list;
+
+ nes_debug(NES_DBG_CM, "attempting listener= %p free_nodes= %d, "
+ "refcnt=%d\n", listener, free_hanging_nodes,
+ atomic_read(&listener->ref_count));
+ /* free non-accelerated child nodes for this listener */
+ INIT_LIST_HEAD(&reset_list);
+ if (free_hanging_nodes) {
+ spin_lock_irqsave(&cm_core->ht_lock, flags);
+ list_for_each_safe(list_pos, list_temp,
+ &g_cm_core->connected_nodes) {
+ cm_node = container_of(list_pos, struct nes_cm_node,
+ list);
+ if ((cm_node->listener == listener) &&
+ (!cm_node->accelerated)) {
+ add_ref_cm_node(cm_node);
+ list_add(&cm_node->reset_entry, &reset_list);
+ }
+ }
+ spin_unlock_irqrestore(&cm_core->ht_lock, flags);
+ }
+
+ list_for_each_safe(list_pos, list_temp, &reset_list) {
+ cm_node = container_of(list_pos, struct nes_cm_node,
+ reset_entry);
+ {
+ struct nes_cm_node *loopback = cm_node->loopbackpartner;
+ enum nes_cm_node_state old_state;
+ if (NES_CM_STATE_FIN_WAIT1 <= cm_node->state) {
+ rem_ref_cm_node(cm_node->cm_core, cm_node);
+ } else {
+ if (!loopback) {
+ cleanup_retrans_entry(cm_node);
+ err = send_reset(cm_node, NULL);
+ if (err) {
+ cm_node->state =
+ NES_CM_STATE_CLOSED;
+ WARN_ON(1);
+ } else {
+ old_state = cm_node->state;
+ cm_node->state = NES_CM_STATE_LISTENER_DESTROYED;
+ if (old_state != NES_CM_STATE_MPAREQ_RCVD)
+ rem_ref_cm_node(
+ cm_node->cm_core,
+ cm_node);
+ }
+ } else {
+ struct nes_cm_event event;
+
+ event.cm_node = loopback;
+ event.cm_info.rem_addr =
+ loopback->rem_addr;
+ event.cm_info.loc_addr =
+ loopback->loc_addr;
+ event.cm_info.rem_port =
+ loopback->rem_port;
+ event.cm_info.loc_port =
+ loopback->loc_port;
+ event.cm_info.cm_id = loopback->cm_id;
+ add_ref_cm_node(loopback);
+ loopback->state = NES_CM_STATE_CLOSED;
+ cm_event_connect_error(&event);
+ cm_node->state = NES_CM_STATE_LISTENER_DESTROYED;
+
+ rem_ref_cm_node(cm_node->cm_core,
+ cm_node);
+
+ }
+ }
+ }
+ }
+
+ spin_lock_irqsave(&cm_core->listen_list_lock, flags);
+ if (!atomic_dec_return(&listener->ref_count)) {
+ list_del(&listener->list);
+
+ /* decrement our listen node count */
+ atomic_dec(&cm_core->listen_node_cnt);
+
+ spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
+
+ if (listener->nesvnic) {
+ nes_manage_apbvt(listener->nesvnic,
+ listener->mapped_loc_port,
+ PCI_FUNC(listener->nesvnic->nesdev->pcidev->devfn),
+ NES_MANAGE_APBVT_DEL);
+
+ nes_remove_mapinfo(listener->loc_addr,
+ listener->loc_port,
+ listener->mapped_loc_addr,
+ listener->mapped_loc_port);
+ nes_debug(NES_DBG_NLMSG,
+ "Delete APBVT mapped_loc_port = %04X\n",
+ listener->mapped_loc_port);
+ }
+
+ nes_debug(NES_DBG_CM, "destroying listener (%p)\n", listener);
+
+ kfree(listener);
+ listener = NULL;
+ ret = 0;
+ atomic_inc(&cm_listens_destroyed);
+ } else {
+ spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
+ }
+ if (listener) {
+ if (atomic_read(&listener->pend_accepts_cnt) > 0)
+ nes_debug(NES_DBG_CM, "destroying listener (%p)"
+ " with non-zero pending accepts=%u\n",
+ listener, atomic_read(&listener->pend_accepts_cnt));
+ }
+
+ return ret;
+}
+
+
+/**
+ * mini_cm_del_listen
+ */
+static int mini_cm_del_listen(struct nes_cm_core *cm_core,
+ struct nes_cm_listener *listener)
+{
+ listener->listener_state = NES_CM_LISTENER_PASSIVE_STATE;
+ listener->cm_id = NULL; /* going to be destroyed pretty soon */
+ return mini_cm_dec_refcnt_listen(cm_core, listener, 1);
+}
+
+
+/**
+ * mini_cm_accelerated
+ */
+static inline int mini_cm_accelerated(struct nes_cm_core *cm_core,
+ struct nes_cm_node *cm_node)
+{
+ cm_node->accelerated = 1;
+
+ if (cm_node->accept_pend) {
+ BUG_ON(!cm_node->listener);
+ atomic_dec(&cm_node->listener->pend_accepts_cnt);
+ cm_node->accept_pend = 0;
+ BUG_ON(atomic_read(&cm_node->listener->pend_accepts_cnt) < 0);
+ }
+
+ if (!timer_pending(&cm_core->tcp_timer))
+ mod_timer(&cm_core->tcp_timer, (jiffies + NES_SHORT_TIME));
+
+ return 0;
+}
+
+
+/**
+ * nes_addr_resolve_neigh
+ */
+static int nes_addr_resolve_neigh(struct nes_vnic *nesvnic, u32 dst_ip, int arpindex)
+{
+ struct rtable *rt;
+ struct neighbour *neigh;
+ int rc = arpindex;
+ struct net_device *netdev;
+ struct nes_adapter *nesadapter = nesvnic->nesdev->nesadapter;
+
+ rt = ip_route_output(&init_net, htonl(dst_ip), 0, 0, 0);
+ if (IS_ERR(rt)) {
+ printk(KERN_ERR "%s: ip_route_output_key failed for 0x%08X\n",
+ __func__, dst_ip);
+ return rc;
+ }
+
+ if (netif_is_bond_slave(nesvnic->netdev))
+ netdev = netdev_master_upper_dev_get(nesvnic->netdev);
+ else
+ netdev = nesvnic->netdev;
+
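+	/* look up the neighbour entry for the route's gateway on the egress
+	 * device (the bond master when the interface is a slave) */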
+ neigh = neigh_lookup(&arp_tbl, &rt->rt_gateway, netdev);
+
+ rcu_read_lock();
+ if (neigh) {
+ if (neigh->nud_state & NUD_VALID) {
+ nes_debug(NES_DBG_CM, "Neighbor MAC address for 0x%08X"
+ " is %pM, Gateway is 0x%08X \n", dst_ip,
+ neigh->ha, ntohl(rt->rt_gateway));
+
+ if (arpindex >= 0) {
+ if (ether_addr_equal(nesadapter->arp_table[arpindex].mac_addr, neigh->ha)) {
+ /* Mac address same as in nes_arp_table */
+ goto out;
+ }
+
+ nes_manage_arp_cache(nesvnic->netdev,
+ nesadapter->arp_table[arpindex].mac_addr,
+ dst_ip, NES_ARP_DELETE);
+ }
+
+ nes_manage_arp_cache(nesvnic->netdev, neigh->ha,
+ dst_ip, NES_ARP_ADD);
+ rc = nes_arp_table(nesvnic->nesdev, dst_ip, NULL,
+ NES_ARP_RESOLVE);
+ } else {
+ neigh_event_send(neigh, NULL);
+ }
+ }
+out:
+ rcu_read_unlock();
+
+ if (neigh)
+ neigh_release(neigh);
+
+ ip_rt_put(rt);
+ return rc;
+}
+
+/**
+ * make_cm_node - create a new instance of a cm node
+ */
+static struct nes_cm_node *make_cm_node(struct nes_cm_core *cm_core,
+ struct nes_vnic *nesvnic, struct nes_cm_info *cm_info,
+ struct nes_cm_listener *listener)
+{
+ struct nes_cm_node *cm_node;
+ struct timespec ts;
+ int oldarpindex = 0;
+ int arpindex = 0;
+ struct nes_device *nesdev;
+ struct nes_adapter *nesadapter;
+
+ /* create an hte and cm_node for this instance */
+ cm_node = kzalloc(sizeof(*cm_node), GFP_ATOMIC);
+ if (!cm_node)
+ return NULL;
+
+ /* set our node specific transport info */
+ if (listener) {
+ cm_node->loc_addr = listener->loc_addr;
+ cm_node->loc_port = listener->loc_port;
+ } else {
+ cm_node->loc_addr = cm_info->loc_addr;
+ cm_node->loc_port = cm_info->loc_port;
+ }
+ cm_node->rem_addr = cm_info->rem_addr;
+ cm_node->rem_port = cm_info->rem_port;
+
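+	/* the mapped_* quad is the port-mapper translated address actually
+	 * used on the wire */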
+ cm_node->mapped_loc_addr = cm_info->mapped_loc_addr;
+ cm_node->mapped_rem_addr = cm_info->mapped_rem_addr;
+ cm_node->mapped_loc_port = cm_info->mapped_loc_port;
+ cm_node->mapped_rem_port = cm_info->mapped_rem_port;
+
+ cm_node->mpa_frame_rev = mpa_version;
+ cm_node->send_rdma0_op = SEND_RDMA_READ_ZERO;
+ cm_node->mpav2_ird_ord = 0;
+ cm_node->ird_size = 0;
+ cm_node->ord_size = 0;
+
+ nes_debug(NES_DBG_CM, "Make node addresses : loc = %pI4:%x, rem = %pI4:%x\n",
+ &cm_node->loc_addr, cm_node->loc_port,
+ &cm_node->rem_addr, cm_node->rem_port);
+ cm_node->listener = listener;
+ cm_node->netdev = nesvnic->netdev;
+ cm_node->cm_id = cm_info->cm_id;
+ memcpy(cm_node->loc_mac, nesvnic->netdev->dev_addr, ETH_ALEN);
+
+ nes_debug(NES_DBG_CM, "listener=%p, cm_id=%p\n", cm_node->listener,
+ cm_node->cm_id);
+
+ spin_lock_init(&cm_node->retrans_list_lock);
+
+ cm_node->loopbackpartner = NULL;
+ atomic_set(&cm_node->ref_count, 1);
+ /* associate our parent CM core */
+ cm_node->cm_core = cm_core;
+ cm_node->tcp_cntxt.loc_id = NES_CM_DEF_LOCAL_ID;
+ cm_node->tcp_cntxt.rcv_wscale = NES_CM_DEFAULT_RCV_WND_SCALE;
+ cm_node->tcp_cntxt.rcv_wnd = NES_CM_DEFAULT_RCV_WND_SCALED >>
+ NES_CM_DEFAULT_RCV_WND_SCALE;
+ ts = current_kernel_time();
+ cm_node->tcp_cntxt.loc_seq_num = htonl(ts.tv_nsec);
+ cm_node->tcp_cntxt.mss = nesvnic->max_frame_size - sizeof(struct iphdr) -
+ sizeof(struct tcphdr) - ETH_HLEN - VLAN_HLEN;
+ cm_node->tcp_cntxt.rcv_nxt = 0;
+	/* get a unique session ID; add thread_id to an upcounter to handle races */
+ atomic_inc(&cm_core->node_cnt);
+ cm_node->conn_type = cm_info->conn_type;
+ cm_node->apbvt_set = 0;
+ cm_node->accept_pend = 0;
+
+ cm_node->nesvnic = nesvnic;
+ /* get some device handles, for arp lookup */
+ nesdev = nesvnic->nesdev;
+ nesadapter = nesdev->nesadapter;
+
+ cm_node->loopbackpartner = NULL;
+
+ /* get the mac addr for the remote node */
+ oldarpindex = nes_arp_table(nesdev, cm_node->mapped_rem_addr,
+ NULL, NES_ARP_RESOLVE);
+ arpindex = nes_addr_resolve_neigh(nesvnic,
+ cm_node->mapped_rem_addr, oldarpindex);
+ if (arpindex < 0) {
+ kfree(cm_node);
+ return NULL;
+ }
+
+ /* copy the mac addr to node context */
+ memcpy(cm_node->rem_mac, nesadapter->arp_table[arpindex].mac_addr, ETH_ALEN);
+ nes_debug(NES_DBG_CM, "Remote mac addr from arp table: %pM\n",
+ cm_node->rem_mac);
+
+ add_hte_node(cm_core, cm_node);
+ atomic_inc(&cm_nodes_created);
+
+ return cm_node;
+}
+
+
+/**
+ * add_ref_cm_node - take a reference on a cm node
+ */
+static int add_ref_cm_node(struct nes_cm_node *cm_node)
+{
+ atomic_inc(&cm_node->ref_count);
+ return 0;
+}
+
+
+/**
+ * rem_ref_cm_node - drop a reference on a cm node and destroy it when the
+ * count reaches zero
+ */
+static int rem_ref_cm_node(struct nes_cm_core *cm_core,
+ struct nes_cm_node *cm_node)
+{
+ unsigned long flags;
+ struct nes_qp *nesqp;
+
+ if (!cm_node)
+ return -EINVAL;
+
+ spin_lock_irqsave(&cm_node->cm_core->ht_lock, flags);
+ if (atomic_dec_return(&cm_node->ref_count)) {
+ spin_unlock_irqrestore(&cm_node->cm_core->ht_lock, flags);
+ return 0;
+ }
+ list_del(&cm_node->list);
+ atomic_dec(&cm_core->ht_node_cnt);
+ spin_unlock_irqrestore(&cm_node->cm_core->ht_lock, flags);
+
+ /* if the node is destroyed before connection was accelerated */
+ if (!cm_node->accelerated && cm_node->accept_pend) {
+ BUG_ON(!cm_node->listener);
+ atomic_dec(&cm_node->listener->pend_accepts_cnt);
+ BUG_ON(atomic_read(&cm_node->listener->pend_accepts_cnt) < 0);
+ }
+ WARN_ON(cm_node->send_entry);
+ if (cm_node->recv_entry)
+ handle_recv_entry(cm_node, 0);
+ if (cm_node->listener) {
+ mini_cm_dec_refcnt_listen(cm_core, cm_node->listener, 0);
+ } else {
+ if (cm_node->apbvt_set && cm_node->nesvnic) {
+ nes_manage_apbvt(cm_node->nesvnic, cm_node->mapped_loc_port,
+ PCI_FUNC(cm_node->nesvnic->nesdev->pcidev->devfn),
+ NES_MANAGE_APBVT_DEL);
+ }
+ nes_debug(NES_DBG_NLMSG, "Delete APBVT mapped_loc_port = %04X\n",
+ cm_node->mapped_loc_port);
+ nes_remove_mapinfo(cm_node->loc_addr, cm_node->loc_port,
+ cm_node->mapped_loc_addr, cm_node->mapped_loc_port);
+ }
+
+ atomic_dec(&cm_core->node_cnt);
+ atomic_inc(&cm_nodes_destroyed);
+ nesqp = cm_node->nesqp;
+ if (nesqp) {
+ nesqp->cm_node = NULL;
+ nes_rem_ref(&nesqp->ibqp);
+ cm_node->nesqp = NULL;
+ }
+
+ kfree(cm_node);
+ return 0;
+}
+
+/**
+ * process_options
+ */
+static int process_options(struct nes_cm_node *cm_node, u8 *optionsloc,
+ u32 optionsize, u32 syn_packet)
+{
+ u32 tmp;
+ u32 offset = 0;
+ union all_known_options *all_options;
+ char got_mss_option = 0;
+
+ while (offset < optionsize) {
+ all_options = (union all_known_options *)(optionsloc + offset);
+ switch (all_options->as_base.optionnum) {
+ case OPTION_NUMBER_END:
+ offset = optionsize;
+ break;
+ case OPTION_NUMBER_NONE:
+ offset += 1;
+ continue;
+ case OPTION_NUMBER_MSS:
+ nes_debug(NES_DBG_CM, "%s: MSS Length: %d Offset: %d "
+ "Size: %d\n", __func__,
+ all_options->as_mss.length, offset, optionsize);
+ got_mss_option = 1;
+ if (all_options->as_mss.length != 4) {
+ return 1;
+ } else {
+ tmp = ntohs(all_options->as_mss.mss);
+ if (tmp > 0 && tmp <
+ cm_node->tcp_cntxt.mss)
+ cm_node->tcp_cntxt.mss = tmp;
+ }
+ break;
+ case OPTION_NUMBER_WINDOW_SCALE:
+ cm_node->tcp_cntxt.snd_wscale =
+ all_options->as_windowscale.shiftcount;
+ break;
+ default:
+ nes_debug(NES_DBG_CM, "TCP Option not understood: %x\n",
+ all_options->as_base.optionnum);
+ break;
+ }
+ offset += all_options->as_base.length;
+ }
+ if ((!got_mss_option) && (syn_packet))
+ cm_node->tcp_cntxt.mss = NES_CM_DEFAULT_MSS;
+ return 0;
+}
+
+static void drop_packet(struct sk_buff *skb)
+{
+ atomic_inc(&cm_accel_dropped_pkts);
+ dev_kfree_skb_any(skb);
+}
+
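+/*
+ * handle_fin_pkt - advance the connection state machine for a received FIN
+ */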
+static void handle_fin_pkt(struct nes_cm_node *cm_node)
+{
+ nes_debug(NES_DBG_CM, "Received FIN, cm_node = %p, state = %u. "
+ "refcnt=%d\n", cm_node, cm_node->state,
+ atomic_read(&cm_node->ref_count));
+ switch (cm_node->state) {
+ case NES_CM_STATE_SYN_RCVD:
+ case NES_CM_STATE_SYN_SENT:
+ case NES_CM_STATE_ESTABLISHED:
+ case NES_CM_STATE_MPAREJ_RCVD:
+ cm_node->tcp_cntxt.rcv_nxt++;
+ cleanup_retrans_entry(cm_node);
+ cm_node->state = NES_CM_STATE_LAST_ACK;
+ send_fin(cm_node, NULL);
+ break;
+ case NES_CM_STATE_MPAREQ_SENT:
+ create_event(cm_node, NES_CM_EVENT_ABORTED);
+ cm_node->tcp_cntxt.rcv_nxt++;
+ cleanup_retrans_entry(cm_node);
+ cm_node->state = NES_CM_STATE_CLOSED;
+ add_ref_cm_node(cm_node);
+ send_reset(cm_node, NULL);
+ break;
+ case NES_CM_STATE_FIN_WAIT1:
+ cm_node->tcp_cntxt.rcv_nxt++;
+ cleanup_retrans_entry(cm_node);
+ cm_node->state = NES_CM_STATE_CLOSING;
+ send_ack(cm_node, NULL);
+		/* Simultaneous close: wait for the ACK.
+		 * Once it arrives, send nothing further and
+		 * just remove the node. */
+ break;
+ case NES_CM_STATE_FIN_WAIT2:
+ cm_node->tcp_cntxt.rcv_nxt++;
+ cleanup_retrans_entry(cm_node);
+ cm_node->state = NES_CM_STATE_TIME_WAIT;
+ send_ack(cm_node, NULL);
+ schedule_nes_timer(cm_node, NULL, NES_TIMER_TYPE_CLOSE, 1, 0);
+ break;
+ case NES_CM_STATE_TIME_WAIT:
+ cm_node->tcp_cntxt.rcv_nxt++;
+ cleanup_retrans_entry(cm_node);
+ cm_node->state = NES_CM_STATE_CLOSED;
+ rem_ref_cm_node(cm_node->cm_core, cm_node);
+ break;
+ case NES_CM_STATE_TSA:
+ default:
+ nes_debug(NES_DBG_CM, "Error Rcvd FIN for node-%p state = %d\n",
+ cm_node, cm_node->state);
+ break;
+ }
+}
+
+
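+/*
+ * handle_rst_pkt - handle a received RST according to the current CM state
+ */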
+static void handle_rst_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
+ struct tcphdr *tcph)
+{
+
+	int reset = 0;	/* whether to send a reset on error */
+ atomic_inc(&cm_resets_recvd);
+ nes_debug(NES_DBG_CM, "Received Reset, cm_node = %p, state = %u."
+ " refcnt=%d\n", cm_node, cm_node->state,
+ atomic_read(&cm_node->ref_count));
+ cleanup_retrans_entry(cm_node);
+ switch (cm_node->state) {
+ case NES_CM_STATE_SYN_SENT:
+ case NES_CM_STATE_MPAREQ_SENT:
+ nes_debug(NES_DBG_CM, "%s[%u] create abort for cm_node=%p "
+ "listener=%p state=%d\n", __func__, __LINE__, cm_node,
+ cm_node->listener, cm_node->state);
+ switch (cm_node->mpa_frame_rev) {
+ case IETF_MPA_V2:
+ cm_node->mpa_frame_rev = IETF_MPA_V1;
+ /* send a syn and goto syn sent state */
+ cm_node->state = NES_CM_STATE_SYN_SENT;
+ if (send_syn(cm_node, 0, NULL)) {
+ active_open_err(cm_node, skb, reset);
+ }
+ break;
+ case IETF_MPA_V1:
+ default:
+ active_open_err(cm_node, skb, reset);
+ break;
+ }
+ break;
+ case NES_CM_STATE_MPAREQ_RCVD:
+ atomic_inc(&cm_node->passive_state);
+ dev_kfree_skb_any(skb);
+ break;
+ case NES_CM_STATE_ESTABLISHED:
+ case NES_CM_STATE_SYN_RCVD:
+ case NES_CM_STATE_LISTENING:
+ nes_debug(NES_DBG_CM, "Bad state %s[%u]\n", __func__, __LINE__);
+ passive_open_err(cm_node, skb, reset);
+ break;
+ case NES_CM_STATE_TSA:
+ active_open_err(cm_node, skb, reset);
+ break;
+ case NES_CM_STATE_CLOSED:
+ drop_packet(skb);
+ break;
+ case NES_CM_STATE_FIN_WAIT2:
+ case NES_CM_STATE_FIN_WAIT1:
+ case NES_CM_STATE_LAST_ACK:
+ cm_node->cm_id->rem_ref(cm_node->cm_id);
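+		/* fall through */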
+ case NES_CM_STATE_TIME_WAIT:
+ cm_node->state = NES_CM_STATE_CLOSED;
+ rem_ref_cm_node(cm_node->cm_core, cm_node);
+ drop_packet(skb);
+ break;
+ default:
+ drop_packet(skb);
+ break;
+ }
+}
+
+
+static void handle_rcv_mpa(struct nes_cm_node *cm_node, struct sk_buff *skb)
+{
+ int ret = 0;
+ int datasize = skb->len;
+ u8 *dataloc = skb->data;
+
+ enum nes_cm_event_type type = NES_CM_EVENT_UNKNOWN;
+ u32 res_type;
+
+ ret = parse_mpa(cm_node, dataloc, &res_type, datasize);
+ if (ret) {
+ nes_debug(NES_DBG_CM, "didn't like MPA Request\n");
+ if (cm_node->state == NES_CM_STATE_MPAREQ_SENT) {
+ nes_debug(NES_DBG_CM, "%s[%u] create abort for "
+ "cm_node=%p listener=%p state=%d\n", __func__,
+ __LINE__, cm_node, cm_node->listener,
+ cm_node->state);
+ active_open_err(cm_node, skb, 1);
+ } else {
+ passive_open_err(cm_node, skb, 1);
+ }
+ return;
+ }
+
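+	/* a well-formed MPA frame advances the state machine: the passive
+	 * side moves to MPAREQ_RCVD, the active side to TSA or MPAREJ_RCVD */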
+ switch (cm_node->state) {
+ case NES_CM_STATE_ESTABLISHED:
+ if (res_type == NES_MPA_REQUEST_REJECT)
+			/* Receiving an MPA reject here is wrong: this is a
+			 * passive open, and a reject can only arrive on an
+			 * active open. */
+ WARN_ON(1);
+ cm_node->state = NES_CM_STATE_MPAREQ_RCVD;
+ type = NES_CM_EVENT_MPA_REQ;
+ atomic_set(&cm_node->passive_state,
+ NES_PASSIVE_STATE_INDICATED);
+ break;
+ case NES_CM_STATE_MPAREQ_SENT:
+ cleanup_retrans_entry(cm_node);
+ if (res_type == NES_MPA_REQUEST_REJECT) {
+ type = NES_CM_EVENT_MPA_REJECT;
+ cm_node->state = NES_CM_STATE_MPAREJ_RCVD;
+ } else {
+ type = NES_CM_EVENT_CONNECTED;
+ cm_node->state = NES_CM_STATE_TSA;
+ }
+
+ break;
+ default:
+ WARN_ON(1);
+ break;
+ }
+ dev_kfree_skb_any(skb);
+ create_event(cm_node, type);
+}
+
+static void indicate_pkt_err(struct nes_cm_node *cm_node, struct sk_buff *skb)
+{
+ switch (cm_node->state) {
+ case NES_CM_STATE_SYN_SENT:
+ case NES_CM_STATE_MPAREQ_SENT:
+ nes_debug(NES_DBG_CM, "%s[%u] create abort for cm_node=%p "
+ "listener=%p state=%d\n", __func__, __LINE__, cm_node,
+ cm_node->listener, cm_node->state);
+ active_open_err(cm_node, skb, 1);
+ break;
+ case NES_CM_STATE_ESTABLISHED:
+ case NES_CM_STATE_SYN_RCVD:
+ passive_open_err(cm_node, skb, 1);
+ break;
+ case NES_CM_STATE_TSA:
+ default:
+ drop_packet(skb);
+ }
+}
+
+static int check_syn(struct nes_cm_node *cm_node, struct tcphdr *tcph,
+ struct sk_buff *skb)
+{
+ int err;
+
+ err = ((ntohl(tcph->ack_seq) == cm_node->tcp_cntxt.loc_seq_num)) ? 0 : 1;
+ if (err)
+ active_open_err(cm_node, skb, 1);
+
+ return err;
+}
+
+static int check_seq(struct nes_cm_node *cm_node, struct tcphdr *tcph,
+ struct sk_buff *skb)
+{
+ int err = 0;
+ u32 seq;
+ u32 ack_seq;
+ u32 loc_seq_num = cm_node->tcp_cntxt.loc_seq_num;
+ u32 rcv_nxt = cm_node->tcp_cntxt.rcv_nxt;
+ u32 rcv_wnd;
+
+ seq = ntohl(tcph->seq);
+ ack_seq = ntohl(tcph->ack_seq);
+ rcv_wnd = cm_node->tcp_cntxt.rcv_wnd;
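+	/* accept only segments whose ACK matches our next send sequence and
+	 * whose sequence number lies inside the receive window */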
+ if (ack_seq != loc_seq_num)
+ err = 1;
+ else if (!between(seq, rcv_nxt, (rcv_nxt + rcv_wnd)))
+ err = 1;
+ if (err) {
+ nes_debug(NES_DBG_CM, "%s[%u] create abort for cm_node=%p "
+ "listener=%p state=%d\n", __func__, __LINE__, cm_node,
+ cm_node->listener, cm_node->state);
+ indicate_pkt_err(cm_node, skb);
+ nes_debug(NES_DBG_CM, "seq ERROR cm_node =%p seq=0x%08X "
+ "rcv_nxt=0x%08X rcv_wnd=0x%x\n", cm_node, seq, rcv_nxt,
+ rcv_wnd);
+ }
+ return err;
+}
+
+/*
+ * handle_syn_pkt() is for the passive node. The SYN packet arrives when a
+ * node is created from a listener; a retransmitted SYN is simply dropped.
+ */
+static void handle_syn_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
+ struct tcphdr *tcph)
+{
+ int ret;
+ u32 inc_sequence;
+ int optionsize;
+
+ optionsize = (tcph->doff << 2) - sizeof(struct tcphdr);
+ skb_trim(skb, 0);
+ inc_sequence = ntohl(tcph->seq);
+
+ switch (cm_node->state) {
+ case NES_CM_STATE_SYN_SENT:
+ case NES_CM_STATE_MPAREQ_SENT:
+ /* Rcvd syn on active open connection*/
+ active_open_err(cm_node, skb, 1);
+ break;
+ case NES_CM_STATE_LISTENING:
+ /* Passive OPEN */
+ if (atomic_read(&cm_node->listener->pend_accepts_cnt) >
+ cm_node->listener->backlog) {
+ nes_debug(NES_DBG_CM, "drop syn due to backlog "
+ "pressure \n");
+ cm_backlog_drops++;
+ passive_open_err(cm_node, skb, 0);
+ break;
+ }
+ ret = handle_tcp_options(cm_node, tcph, skb, optionsize,
+ 1);
+ if (ret) {
+ passive_open_err(cm_node, skb, 0);
+ /* drop pkt */
+ break;
+ }
+ cm_node->tcp_cntxt.rcv_nxt = inc_sequence + 1;
+ BUG_ON(cm_node->send_entry);
+ cm_node->accept_pend = 1;
+ atomic_inc(&cm_node->listener->pend_accepts_cnt);
+
+ cm_node->state = NES_CM_STATE_SYN_RCVD;
+ send_syn(cm_node, 1, skb);
+ break;
+ case NES_CM_STATE_CLOSED:
+ cleanup_retrans_entry(cm_node);
+ add_ref_cm_node(cm_node);
+ send_reset(cm_node, skb);
+ break;
+ case NES_CM_STATE_TSA:
+ case NES_CM_STATE_ESTABLISHED:
+ case NES_CM_STATE_FIN_WAIT1:
+ case NES_CM_STATE_FIN_WAIT2:
+ case NES_CM_STATE_MPAREQ_RCVD:
+ case NES_CM_STATE_LAST_ACK:
+ case NES_CM_STATE_CLOSING:
+ case NES_CM_STATE_UNKNOWN:
+ default:
+ drop_packet(skb);
+ break;
+ }
+}
+
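+/*
+ * handle_synack_pkt - active open path: a SYN-ACK to our SYN lets us
+ * complete the handshake and send the MPA request
+ */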
+static void handle_synack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
+ struct tcphdr *tcph)
+{
+ int ret;
+ u32 inc_sequence;
+ int optionsize;
+
+ optionsize = (tcph->doff << 2) - sizeof(struct tcphdr);
+ skb_trim(skb, 0);
+ inc_sequence = ntohl(tcph->seq);
+ switch (cm_node->state) {
+ case NES_CM_STATE_SYN_SENT:
+ cleanup_retrans_entry(cm_node);
+ /* active open */
+ if (check_syn(cm_node, tcph, skb))
+ return;
+ cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq);
+ /* setup options */
+ ret = handle_tcp_options(cm_node, tcph, skb, optionsize, 0);
+ if (ret) {
+ nes_debug(NES_DBG_CM, "cm_node=%p tcp_options failed\n",
+ cm_node);
+ break;
+ }
+ cleanup_retrans_entry(cm_node);
+ cm_node->tcp_cntxt.rcv_nxt = inc_sequence + 1;
+ send_mpa_request(cm_node, skb);
+ cm_node->state = NES_CM_STATE_MPAREQ_SENT;
+ break;
+ case NES_CM_STATE_MPAREQ_RCVD:
+ /* passive open, so should not be here */
+ passive_open_err(cm_node, skb, 1);
+ break;
+ case NES_CM_STATE_LISTENING:
+ cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq);
+ cleanup_retrans_entry(cm_node);
+ cm_node->state = NES_CM_STATE_CLOSED;
+ send_reset(cm_node, skb);
+ break;
+ case NES_CM_STATE_CLOSED:
+ cm_node->tcp_cntxt.loc_seq_num = ntohl(tcph->ack_seq);
+ cleanup_retrans_entry(cm_node);
+ add_ref_cm_node(cm_node);
+ send_reset(cm_node, skb);
+ break;
+ case NES_CM_STATE_ESTABLISHED:
+ case NES_CM_STATE_FIN_WAIT1:
+ case NES_CM_STATE_FIN_WAIT2:
+ case NES_CM_STATE_LAST_ACK:
+ case NES_CM_STATE_TSA:
+ case NES_CM_STATE_CLOSING:
+ case NES_CM_STATE_UNKNOWN:
+ case NES_CM_STATE_MPAREQ_SENT:
+ default:
+ drop_packet(skb);
+ break;
+ }
+}
+
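+/*
+ * handle_ack_pkt - validate the sequence numbers and process an ACK,
+ * which may also carry MPA payload data
+ */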
+static int handle_ack_pkt(struct nes_cm_node *cm_node, struct sk_buff *skb,
+ struct tcphdr *tcph)
+{
+ int datasize = 0;
+ u32 inc_sequence;
+ int ret = 0;
+ int optionsize;
+
+ optionsize = (tcph->doff << 2) - sizeof(struct tcphdr);
+
+ if (check_seq(cm_node, tcph, skb))
+ return -EINVAL;
+
+ skb_pull(skb, tcph->doff << 2);
+ inc_sequence = ntohl(tcph->seq);
+ datasize = skb->len;
+ switch (cm_node->state) {
+ case NES_CM_STATE_SYN_RCVD:
+ /* Passive OPEN */
+ cleanup_retrans_entry(cm_node);
+ ret = handle_tcp_options(cm_node, tcph, skb, optionsize, 1);
+ if (ret)
+ break;
+ cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq);
+ cm_node->state = NES_CM_STATE_ESTABLISHED;
+ if (datasize) {
+ cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize;
+ nes_get_remote_addr(cm_node);
+ handle_rcv_mpa(cm_node, skb);
+ } else { /* rcvd ACK only */
+ dev_kfree_skb_any(skb);
+ }
+ break;
+ case NES_CM_STATE_ESTABLISHED:
+ /* Passive OPEN */
+ cleanup_retrans_entry(cm_node);
+ if (datasize) {
+ cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize;
+ handle_rcv_mpa(cm_node, skb);
+ } else {
+ drop_packet(skb);
+ }
+ break;
+ case NES_CM_STATE_MPAREQ_SENT:
+ cm_node->tcp_cntxt.rem_ack_num = ntohl(tcph->ack_seq);
+ if (datasize) {
+ cm_node->tcp_cntxt.rcv_nxt = inc_sequence + datasize;
+ handle_rcv_mpa(cm_node, skb);
+ } else { /* Could be just an ack pkt.. */
+ dev_kfree_skb_any(skb);
+ }
+ break;
+ case NES_CM_STATE_LISTENING:
+ cleanup_retrans_entry(cm_node);
+ cm_node->state = NES_CM_STATE_CLOSED;
+ send_reset(cm_node, skb);
+ break;
+ case NES_CM_STATE_CLOSED:
+ cleanup_retrans_entry(cm_node);
+ add_ref_cm_node(cm_node);
+ send_reset(cm_node, skb);
+ break;
+ case NES_CM_STATE_LAST_ACK:
+ case NES_CM_STATE_CLOSING:
+ cleanup_retrans_entry(cm_node);
+ cm_node->state = NES_CM_STATE_CLOSED;
+ cm_node->cm_id->rem_ref(cm_node->cm_id);
+ rem_ref_cm_node(cm_node->cm_core, cm_node);
+ drop_packet(skb);
+ break;
+ case NES_CM_STATE_FIN_WAIT1:
+ cleanup_retrans_entry(cm_node);
+ drop_packet(skb);
+ cm_node->state = NES_CM_STATE_FIN_WAIT2;
+ break;
+ case NES_CM_STATE_SYN_SENT:
+ case NES_CM_STATE_FIN_WAIT2:
+ case NES_CM_STATE_TSA:
+ case NES_CM_STATE_MPAREQ_RCVD:
+ case NES_CM_STATE_UNKNOWN:
+ default:
+ cleanup_retrans_entry(cm_node);
+ drop_packet(skb);
+ break;
+ }
+ return ret;
+}
+
+
+
+static int handle_tcp_options(struct nes_cm_node *cm_node, struct tcphdr *tcph,
+ struct sk_buff *skb, int optionsize, int passive)
+{
+ u8 *optionsloc = (u8 *)&tcph[1];
+
+ if (optionsize) {
+ if (process_options(cm_node, optionsloc, optionsize,
+ (u32)tcph->syn)) {
+ nes_debug(NES_DBG_CM, "%s: Node %p, Sending RESET\n",
+ __func__, cm_node);
+ if (passive)
+ passive_open_err(cm_node, skb, 1);
+ else
+ active_open_err(cm_node, skb, 1);
+ return 1;
+ }
+ }
+
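+ /* Scale the advertised window by the peer's window-scale shift and track
+ * the largest window seen; max_snd_wnd is later copied into the hardware
+ * QP context by nes_cm_init_tsa_conn(). */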
+ cm_node->tcp_cntxt.snd_wnd = ntohs(tcph->window) <<
+ cm_node->tcp_cntxt.snd_wscale;
+
+ if (cm_node->tcp_cntxt.snd_wnd > cm_node->tcp_cntxt.max_snd_wnd)
+ cm_node->tcp_cntxt.max_snd_wnd = cm_node->tcp_cntxt.snd_wnd;
+ return 0;
+}
+
+/*
+ * active_open_err() sends a reset (taking an extra node reference) when the
+ * reset flag is set, otherwise it frees the skb; in both cases the node is
+ * moved to CLOSED and an ABORTED event is posted.
+ */
+static void active_open_err(struct nes_cm_node *cm_node, struct sk_buff *skb,
+ int reset)
+{
+ cleanup_retrans_entry(cm_node);
+ if (reset) {
+ nes_debug(NES_DBG_CM, "ERROR active err called for cm_node=%p, "
+ "state=%d\n", cm_node, cm_node->state);
+ add_ref_cm_node(cm_node);
+ send_reset(cm_node, skb);
+ } else {
+ dev_kfree_skb_any(skb);
+ }
+
+ cm_node->state = NES_CM_STATE_CLOSED;
+ create_event(cm_node, NES_CM_EVENT_ABORTED);
+}
+
+/*
+ * passive_open_err() will either do a reset() or will free up the skb and
+ * remove the cm_node.
+ */
+static void passive_open_err(struct nes_cm_node *cm_node, struct sk_buff *skb,
+ int reset)
+{
+ cleanup_retrans_entry(cm_node);
+ cm_node->state = NES_CM_STATE_CLOSED;
+ if (reset) {
+ nes_debug(NES_DBG_CM, "passive_open_err sending RST for "
+ "cm_node=%p state =%d\n", cm_node, cm_node->state);
+ send_reset(cm_node, skb);
+ } else {
+ dev_kfree_skb_any(skb);
+ rem_ref_cm_node(cm_node->cm_core, cm_node);
+ }
+}
+
+/*
+ * free_retrans_entry() assumes that the retrans_list_lock has
+ * been acquired before calling.
+ */
+static void free_retrans_entry(struct nes_cm_node *cm_node)
+{
+ struct nes_timer_entry *send_entry;
+
+ send_entry = cm_node->send_entry;
+ if (send_entry) {
+ cm_node->send_entry = NULL;
+ dev_kfree_skb_any(send_entry->skb);
+ kfree(send_entry);
+ rem_ref_cm_node(cm_node->cm_core, cm_node);
+ }
+}
+
+static void cleanup_retrans_entry(struct nes_cm_node *cm_node)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&cm_node->retrans_list_lock, flags);
+ free_retrans_entry(cm_node);
+ spin_unlock_irqrestore(&cm_node->retrans_list_lock, flags);
+}
+
+/**
+ * process_packet
+ * Classify the received TCP segment and dispatch it to the handler for the
+ * node's current state; the handlers consume or free the skb.
+ */
+static void process_packet(struct nes_cm_node *cm_node, struct sk_buff *skb,
+ struct nes_cm_core *cm_core)
+{
+ enum nes_tcpip_pkt_type pkt_type = NES_PKT_TYPE_UNKNOWN;
+ struct tcphdr *tcph = tcp_hdr(skb);
+ u32 fin_set = 0;
+ int ret = 0;
+
+ skb_pull(skb, ip_hdr(skb)->ihl << 2);
+
+ nes_debug(NES_DBG_CM, "process_packet: cm_node=%p state =%d syn=%d "
+ "ack=%d rst=%d fin=%d\n", cm_node, cm_node->state, tcph->syn,
+ tcph->ack, tcph->rst, tcph->fin);
+
+ if (tcph->rst) {
+ pkt_type = NES_PKT_TYPE_RST;
+ } else if (tcph->syn) {
+ pkt_type = NES_PKT_TYPE_SYN;
+ if (tcph->ack)
+ pkt_type = NES_PKT_TYPE_SYNACK;
+ } else if (tcph->ack) {
+ pkt_type = NES_PKT_TYPE_ACK;
+ }
+ if (tcph->fin)
+ fin_set = 1;
+
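+ /* Dispatch on the dominant flag (RST, then SYN/SYN-ACK, then ACK); a FIN
+ * is handled after successful ACK processing, or on its own when the
+ * packet is otherwise unclassified and its sequence number checks out. */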
+ switch (pkt_type) {
+ case NES_PKT_TYPE_SYN:
+ handle_syn_pkt(cm_node, skb, tcph);
+ break;
+ case NES_PKT_TYPE_SYNACK:
+ handle_synack_pkt(cm_node, skb, tcph);
+ break;
+ case NES_PKT_TYPE_ACK:
+ ret = handle_ack_pkt(cm_node, skb, tcph);
+ if (fin_set && !ret)
+ handle_fin_pkt(cm_node);
+ break;
+ case NES_PKT_TYPE_RST:
+ handle_rst_pkt(cm_node, skb, tcph);
+ break;
+ default:
+ if ((fin_set) && (!check_seq(cm_node, tcph, skb)))
+ handle_fin_pkt(cm_node);
+ drop_packet(skb);
+ break;
+ }
+}
+
+/**
+ * mini_cm_listen - create a listen node with params
+ */
+static struct nes_cm_listener *mini_cm_listen(struct nes_cm_core *cm_core,
+ struct nes_vnic *nesvnic, struct nes_cm_info *cm_info)
+{
+ struct nes_cm_listener *listener;
+ struct iwpm_dev_data pm_reg_msg;
+ struct iwpm_sa_data pm_msg;
+ unsigned long flags;
+ int iwpm_err = 0;
+
+ nes_debug(NES_DBG_CM, "Search for 0x%08x : 0x%04x\n",
+ cm_info->loc_addr, cm_info->loc_port);
+
+ /* cannot have multiple matching listeners */
+ listener = find_listener(cm_core, cm_info->loc_addr, cm_info->loc_port,
+ NES_CM_LISTENER_EITHER_STATE, 1);
+
+ if (listener && listener->listener_state == NES_CM_LISTENER_ACTIVE_STATE) {
+ /* find_listener() already incremented the ref count; drop it here */
+ atomic_dec(&listener->ref_count);
+ nes_debug(NES_DBG_CM, "Not creating listener since it already exists\n");
+ return NULL;
+ }
+
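+ /* A brand-new listener is registered with the iWarp port mapper before it
+ * is added to the listen list; mapper failures are only logged and the
+ * listener falls back to its unmapped address and port. */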
+ if (!listener) {
+ nes_form_reg_msg(nesvnic, &pm_reg_msg);
+ iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_NES);
+ if (iwpm_err) {
+ nes_debug(NES_DBG_NLMSG,
+ "Port Mapper reg pid fail (err = %d).\n", iwpm_err);
+ }
+ if (iwpm_valid_pid() && !iwpm_err) {
+ nes_form_pm_msg(cm_info, &pm_msg);
+ iwpm_err = iwpm_add_mapping(&pm_msg, RDMA_NL_NES);
+ if (iwpm_err)
+ nes_debug(NES_DBG_NLMSG,
+ "Port Mapper query fail (err = %d).\n", iwpm_err);
+ else
+ nes_record_pm_msg(cm_info, &pm_msg);
+ }
+
+ /* create a CM listen node (a "half" connection node used to match incoming traffic) */
+ listener = kzalloc(sizeof(*listener), GFP_ATOMIC);
+ if (!listener) {
+ nes_debug(NES_DBG_CM, "Not creating listener memory allocation failed\n");
+ return NULL;
+ }
+
+ listener->loc_addr = cm_info->loc_addr;
+ listener->loc_port = cm_info->loc_port;
+ listener->mapped_loc_addr = cm_info->mapped_loc_addr;
+ listener->mapped_loc_port = cm_info->mapped_loc_port;
+ listener->reused_node = 0;
+
+ atomic_set(&listener->ref_count, 1);
+ }
+ /* passive case */
+ /* find_listener() already incremented the ref count */
+ else {
+ listener->reused_node = 1;
+ }
+
+ listener->cm_id = cm_info->cm_id;
+ atomic_set(&listener->pend_accepts_cnt, 0);
+ listener->cm_core = cm_core;
+ listener->nesvnic = nesvnic;
+ atomic_inc(&cm_core->node_cnt);
+
+ listener->conn_type = cm_info->conn_type;
+ listener->backlog = cm_info->backlog;
+ listener->listener_state = NES_CM_LISTENER_ACTIVE_STATE;
+
+ if (!listener->reused_node) {
+ spin_lock_irqsave(&cm_core->listen_list_lock, flags);
+ list_add(&listener->list, &cm_core->listen_list.list);
+ spin_unlock_irqrestore(&cm_core->listen_list_lock, flags);
+ atomic_inc(&cm_core->listen_node_cnt);
+ }
+
+ nes_debug(NES_DBG_CM, "Api - listen(): addr=0x%08X, port=0x%04x,"
+ " listener = %p, backlog = %d, cm_id = %p.\n",
+ cm_info->loc_addr, cm_info->loc_port,
+ listener, listener->backlog, listener->cm_id);
+
+ return listener;
+}
+
+
+/**
+ * mini_cm_connect - make a connection node with params
+ */
+static struct nes_cm_node *mini_cm_connect(struct nes_cm_core *cm_core,
+ struct nes_vnic *nesvnic, u16 private_data_len,
+ void *private_data, struct nes_cm_info *cm_info)
+{
+ int ret = 0;
+ struct nes_cm_node *cm_node;
+ struct nes_cm_listener *loopbackremotelistener;
+ struct nes_cm_node *loopbackremotenode;
+ struct nes_cm_info loopback_cm_info;
+ u8 *start_buff;
+
+ /* create a CM connection node */
+ cm_node = make_cm_node(cm_core, nesvnic, cm_info, NULL);
+ if (!cm_node)
+ return NULL;
+
+ /* set our node side to client (active) side */
+ cm_node->tcp_cntxt.client = 1;
+ cm_node->tcp_cntxt.rcv_wscale = NES_CM_DEFAULT_RCV_WND_SCALE;
+
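+ /* Loopback fast path: when both ends are local, skip the wire exchange and
+ * cross-wire the two cm_nodes' sequence numbers, windows and window-scale
+ * values directly, then hand the passive side an MPA_REQ event. */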
+ if (cm_info->loc_addr == cm_info->rem_addr) {
+ loopbackremotelistener = find_listener(cm_core,
+ cm_node->mapped_loc_addr, cm_node->mapped_rem_port,
+ NES_CM_LISTENER_ACTIVE_STATE, 0);
+ if (loopbackremotelistener == NULL) {
+ create_event(cm_node, NES_CM_EVENT_ABORTED);
+ } else {
+ loopback_cm_info = *cm_info;
+ loopback_cm_info.loc_port = cm_info->rem_port;
+ loopback_cm_info.rem_port = cm_info->loc_port;
+ loopback_cm_info.mapped_loc_port =
+ cm_info->mapped_rem_port;
+ loopback_cm_info.mapped_rem_port =
+ cm_info->mapped_loc_port;
+ loopback_cm_info.cm_id = loopbackremotelistener->cm_id;
+ loopbackremotenode = make_cm_node(cm_core, nesvnic,
+ &loopback_cm_info, loopbackremotelistener);
+ if (!loopbackremotenode) {
+ rem_ref_cm_node(cm_node->cm_core, cm_node);
+ return NULL;
+ }
+ atomic_inc(&cm_loopbacks);
+ loopbackremotenode->loopbackpartner = cm_node;
+ loopbackremotenode->tcp_cntxt.rcv_wscale =
+ NES_CM_DEFAULT_RCV_WND_SCALE;
+ cm_node->loopbackpartner = loopbackremotenode;
+ memcpy(loopbackremotenode->mpa_frame_buf, private_data,
+ private_data_len);
+ loopbackremotenode->mpa_frame_size = private_data_len;
+
+ /* we are done handling this state. */
+ /* set node to a TSA state */
+ cm_node->state = NES_CM_STATE_TSA;
+ cm_node->tcp_cntxt.rcv_nxt =
+ loopbackremotenode->tcp_cntxt.loc_seq_num;
+ loopbackremotenode->tcp_cntxt.rcv_nxt =
+ cm_node->tcp_cntxt.loc_seq_num;
+ cm_node->tcp_cntxt.max_snd_wnd =
+ loopbackremotenode->tcp_cntxt.rcv_wnd;
+ loopbackremotenode->tcp_cntxt.max_snd_wnd =
+ cm_node->tcp_cntxt.rcv_wnd;
+ cm_node->tcp_cntxt.snd_wnd =
+ loopbackremotenode->tcp_cntxt.rcv_wnd;
+ loopbackremotenode->tcp_cntxt.snd_wnd =
+ cm_node->tcp_cntxt.rcv_wnd;
+ cm_node->tcp_cntxt.snd_wscale =
+ loopbackremotenode->tcp_cntxt.rcv_wscale;
+ loopbackremotenode->tcp_cntxt.snd_wscale =
+ cm_node->tcp_cntxt.rcv_wscale;
+ loopbackremotenode->state = NES_CM_STATE_MPAREQ_RCVD;
+ create_event(loopbackremotenode, NES_CM_EVENT_MPA_REQ);
+ }
+ return cm_node;
+ }
+
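+ /* Normal (non-loopback) active open: stage the private data after the MPA
+ * v2 header in the node's frame buffer and start the handshake with a SYN. */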
+ start_buff = &cm_node->mpa_frame_buf[0] + sizeof(struct ietf_mpa_v2);
+ cm_node->mpa_frame_size = private_data_len;
+
+ memcpy(start_buff, private_data, private_data_len);
+
+ /* send a syn and goto syn sent state */
+ cm_node->state = NES_CM_STATE_SYN_SENT;
+ ret = send_syn(cm_node, 0, NULL);
+
+ if (ret) {
+ /* error sending the SYN; free up the cm_node struct */
+ nes_debug(NES_DBG_CM, "Api - connect() FAILED: dest "
+ "addr=0x%08X, port=0x%04x, cm_node=%p, cm_id = %p.\n",
+ cm_node->rem_addr, cm_node->rem_port, cm_node,
+ cm_node->cm_id);
+ rem_ref_cm_node(cm_node->cm_core, cm_node);
+ cm_node = NULL;
+ }
+
+ if (cm_node) {
+ nes_debug(NES_DBG_CM, "Api - connect(): dest addr=0x%08X,"
+ "port=0x%04x, cm_node=%p, cm_id = %p.\n",
+ cm_node->rem_addr, cm_node->rem_port, cm_node,
+ cm_node->cm_id);
+ }
+
+ return cm_node;
+}
+
+
+/**
+ * mini_cm_accept - accept a connection
+ * This function is never called
+ */
+static int mini_cm_accept(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node)
+{
+ return 0;
+}
+
+
+/**
+ * mini_cm_reject - reject and teardown a connection
+ */
+static int mini_cm_reject(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node)
+{
+ int ret = 0;
+ int err = 0;
+ int passive_state;
+ struct nes_cm_event event;
+ struct iw_cm_id *cm_id = cm_node->cm_id;
+ struct nes_cm_node *loopback = cm_node->loopbackpartner;
+
+ nes_debug(NES_DBG_CM, "%s cm_node=%p type=%d state=%d\n",
+ __func__, cm_node, cm_node->tcp_cntxt.client, cm_node->state);
+
+ if (cm_node->tcp_cntxt.client)
+ return ret;
+ cleanup_retrans_entry(cm_node);
+
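+ /* passive_state is a small hand-off counter: if another path has already
+ * flagged a reset (NES_SEND_RESET_EVENT) the node is simply released,
+ * otherwise an MPA reject is sent (falling back to a reset on failure). */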
+ if (!loopback) {
+ passive_state = atomic_add_return(1, &cm_node->passive_state);
+ if (passive_state == NES_SEND_RESET_EVENT) {
+ cm_node->state = NES_CM_STATE_CLOSED;
+ rem_ref_cm_node(cm_core, cm_node);
+ } else {
+ if (cm_node->state == NES_CM_STATE_LISTENER_DESTROYED) {
+ rem_ref_cm_node(cm_core, cm_node);
+ } else {
+ ret = send_mpa_reject(cm_node);
+ if (ret) {
+ cm_node->state = NES_CM_STATE_CLOSED;
+ err = send_reset(cm_node, NULL);
+ if (err)
+ WARN_ON(1);
+ } else {
+ cm_id->add_ref(cm_id);
+ }
+ }
+ }
+ } else {
+ cm_node->cm_id = NULL;
+ if (cm_node->state == NES_CM_STATE_LISTENER_DESTROYED) {
+ rem_ref_cm_node(cm_core, cm_node);
+ rem_ref_cm_node(cm_core, loopback);
+ } else {
+ event.cm_node = loopback;
+ event.cm_info.rem_addr = loopback->rem_addr;
+ event.cm_info.loc_addr = loopback->loc_addr;
+ event.cm_info.rem_port = loopback->rem_port;
+ event.cm_info.loc_port = loopback->loc_port;
+ event.cm_info.cm_id = loopback->cm_id;
+ cm_event_mpa_reject(&event);
+ rem_ref_cm_node(cm_core, cm_node);
+ loopback->state = NES_CM_STATE_CLOSING;
+
+ cm_id = loopback->cm_id;
+ rem_ref_cm_node(cm_core, loopback);
+ cm_id->rem_ref(cm_id);
+ }
+ }
+
+ return ret;
+}
+
+
+/**
+ * mini_cm_close
+ */
+static int mini_cm_close(struct nes_cm_core *cm_core, struct nes_cm_node *cm_node)
+{
+ int ret = 0;
+
+ if (!cm_core || !cm_node)
+ return -EINVAL;
+
+ switch (cm_node->state) {
+ case NES_CM_STATE_SYN_RCVD:
+ case NES_CM_STATE_SYN_SENT:
+ case NES_CM_STATE_ONE_SIDE_ESTABLISHED:
+ case NES_CM_STATE_ESTABLISHED:
+ case NES_CM_STATE_ACCEPTING:
+ case NES_CM_STATE_MPAREQ_SENT:
+ case NES_CM_STATE_MPAREQ_RCVD:
+ cleanup_retrans_entry(cm_node);
+ send_reset(cm_node, NULL);
+ break;
+ case NES_CM_STATE_CLOSE_WAIT:
+ cm_node->state = NES_CM_STATE_LAST_ACK;
+ send_fin(cm_node, NULL);
+ break;
+ case NES_CM_STATE_FIN_WAIT1:
+ case NES_CM_STATE_FIN_WAIT2:
+ case NES_CM_STATE_LAST_ACK:
+ case NES_CM_STATE_TIME_WAIT:
+ case NES_CM_STATE_CLOSING:
+ ret = -1;
+ break;
+ case NES_CM_STATE_LISTENING:
+ cleanup_retrans_entry(cm_node);
+ send_reset(cm_node, NULL);
+ break;
+ case NES_CM_STATE_MPAREJ_RCVD:
+ case NES_CM_STATE_UNKNOWN:
+ case NES_CM_STATE_INITED:
+ case NES_CM_STATE_CLOSED:
+ case NES_CM_STATE_LISTENER_DESTROYED:
+ ret = rem_ref_cm_node(cm_core, cm_node);
+ break;
+ case NES_CM_STATE_TSA:
+ if (cm_node->send_entry)
+ printk(KERN_ERR "ERROR Close got called from STATE_TSA "
+ "send_entry=%p\n", cm_node->send_entry);
+ ret = rem_ref_cm_node(cm_core, cm_node);
+ break;
+ }
+ return ret;
+}
+
+
+/**
+ * mini_cm_recv_pkt - receive an Ethernet packet and run it through the
+ * CM node state machine
+ */
+static int mini_cm_recv_pkt(struct nes_cm_core *cm_core,
+ struct nes_vnic *nesvnic, struct sk_buff *skb)
+{
+ struct nes_cm_node *cm_node = NULL;
+ struct nes_cm_listener *listener = NULL;
+ struct iphdr *iph;
+ struct tcphdr *tcph;
+ struct nes_cm_info nfo;
+ int skb_handled = 1;
+ __be32 tmp_daddr, tmp_saddr;
+
+ if (!skb)
+ return 0;
+ if (skb->len < sizeof(struct iphdr) + sizeof(struct tcphdr))
+ return 0;
+
+ iph = (struct iphdr *)skb->data;
+ tcph = (struct tcphdr *)(skb->data + sizeof(struct iphdr));
+
+ nfo.loc_addr = ntohl(iph->daddr);
+ nfo.loc_port = ntohs(tcph->dest);
+ nfo.rem_addr = ntohl(iph->saddr);
+ nfo.rem_port = ntohs(tcph->source);
+
+ /* If a port mapper is in use, the addresses in the packet are already the mapped ones */
+ nfo.mapped_loc_addr = ntohl(iph->daddr);
+ nfo.mapped_loc_port = ntohs(tcph->dest);
+ nfo.mapped_rem_addr = ntohl(iph->saddr);
+ nfo.mapped_rem_port = ntohs(tcph->source);
+
+ tmp_daddr = cpu_to_be32(iph->daddr);
+ tmp_saddr = cpu_to_be32(iph->saddr);
+
+ nes_debug(NES_DBG_CM, "Received packet: dest=%pI4:0x%04X src=%pI4:0x%04X\n",
+ &tmp_daddr, tcph->dest, &tmp_saddr, tcph->source);
+
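+ /* The do { } while (0) below is just a breakable scope: unknown nodes,
+ * missing listeners and allocation failures all break out early. */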
+ do {
+ cm_node = find_node(cm_core,
+ nfo.mapped_rem_port, nfo.mapped_rem_addr,
+ nfo.mapped_loc_port, nfo.mapped_loc_addr);
+
+ if (!cm_node) {
+ /* Without an existing node, only packets */
+ /* for a passive open (SYN only) are accepted */
+ if ((!tcph->syn) || (tcph->ack)) {
+ skb_handled = 0;
+ break;
+ }
+ listener = find_listener(cm_core, nfo.mapped_loc_addr,
+ nfo.mapped_loc_port,
+ NES_CM_LISTENER_ACTIVE_STATE, 0);
+ if (!listener) {
+ nfo.cm_id = NULL;
+ nfo.conn_type = 0;
+ nes_debug(NES_DBG_CM, "Unable to find listener for the pkt\n");
+ skb_handled = 0;
+ break;
+ }
+ nfo.cm_id = listener->cm_id;
+ nfo.conn_type = listener->conn_type;
+ cm_node = make_cm_node(cm_core, nesvnic, &nfo,
+ listener);
+ if (!cm_node) {
+ nes_debug(NES_DBG_CM, "Unable to allocate "
+ "node\n");
+ cm_packets_dropped++;
+ atomic_dec(&listener->ref_count);
+ dev_kfree_skb_any(skb);
+ break;
+ }
+ if (!tcph->rst && !tcph->fin) {
+ cm_node->state = NES_CM_STATE_LISTENING;
+ } else {
+ cm_packets_dropped++;
+ rem_ref_cm_node(cm_core, cm_node);
+ dev_kfree_skb_any(skb);
+ break;
+ }
+ add_ref_cm_node(cm_node);
+ } else if (cm_node->state == NES_CM_STATE_TSA) {
+ if (cm_node->nesqp->pau_mode)
+ nes_queue_mgt_skbs(skb, nesvnic, cm_node->nesqp);
+ else {
+ rem_ref_cm_node(cm_core, cm_node);
+ atomic_inc(&cm_accel_dropped_pkts);
+ dev_kfree_skb_any(skb);
+ }
+ break;
+ }
+ skb_reset_network_header(skb);
+ skb_set_transport_header(skb, sizeof(*tcph));
+ skb->len = ntohs(iph->tot_len);
+ process_packet(cm_node, skb, cm_core);
+ rem_ref_cm_node(cm_core, cm_node);
+ } while (0);
+ return skb_handled;
+}
+
+
+/**
+ * nes_cm_alloc_core - allocate a top level instance of a cm core
+ */
+static struct nes_cm_core *nes_cm_alloc_core(void)
+{
+ struct nes_cm_core *cm_core;
+
+ /* setup the CM core */
+ /* alloc top level core control structure */
+ cm_core = kzalloc(sizeof(*cm_core), GFP_KERNEL);
+ if (!cm_core)
+ return NULL;
+
+ INIT_LIST_HEAD(&cm_core->connected_nodes);
+ init_timer(&cm_core->tcp_timer);
+ cm_core->tcp_timer.function = nes_cm_timer_tick;
+
+ cm_core->mtu = NES_CM_DEFAULT_MTU;
+ cm_core->state = NES_CM_STATE_INITED;
+ cm_core->free_tx_pkt_max = NES_CM_DEFAULT_FREE_PKTS;
+
+ atomic_set(&cm_core->events_posted, 0);
+
+ cm_core->api = &nes_cm_api;
+
+ spin_lock_init(&cm_core->ht_lock);
+ spin_lock_init(&cm_core->listen_list_lock);
+
+ INIT_LIST_HEAD(&cm_core->listen_list.list);
+
+ nes_debug(NES_DBG_CM, "Init CM Core completed -- cm_core=%p\n", cm_core);
+
+ nes_debug(NES_DBG_CM, "Enable QUEUE EVENTS\n");
+ cm_core->event_wq = create_singlethread_workqueue("nesewq");
+ cm_core->post_event = nes_cm_post_event;
+ nes_debug(NES_DBG_CM, "Enable QUEUE DISCONNECTS\n");
+ cm_core->disconn_wq = create_singlethread_workqueue("nesdwq");
+
+ print_core(cm_core);
+ return cm_core;
+}
+
+
+/**
+ * mini_cm_dealloc_core - deallocate a top level instance of a cm core
+ */
+static int mini_cm_dealloc_core(struct nes_cm_core *cm_core)
+{
+ nes_debug(NES_DBG_CM, "De-Alloc CM Core (%p)\n", cm_core);
+
+ if (!cm_core)
+ return -EINVAL;
+
+ barrier();
+
+ if (timer_pending(&cm_core->tcp_timer))
+ del_timer(&cm_core->tcp_timer);
+
+ destroy_workqueue(cm_core->event_wq);
+ destroy_workqueue(cm_core->disconn_wq);
+ nes_debug(NES_DBG_CM, "\n");
+ kfree(cm_core);
+
+ return 0;
+}
+
+
+/**
+ * mini_cm_get
+ */
+static int mini_cm_get(struct nes_cm_core *cm_core)
+{
+ return cm_core->state;
+}
+
+
+/**
+ * mini_cm_set
+ */
+static int mini_cm_set(struct nes_cm_core *cm_core, u32 type, u32 value)
+{
+ int ret = 0;
+
+ switch (type) {
+ case NES_CM_SET_PKT_SIZE:
+ cm_core->mtu = value;
+ break;
+ case NES_CM_SET_FREE_PKT_Q_SIZE:
+ cm_core->free_tx_pkt_max = value;
+ break;
+ default:
+ /* unknown set option */
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+
+/**
+ * nes_cm_init_tsa_conn - set up the HW QP context; MPA frames must have been
+ * successfully exchanged before this is called
+ */
+static int nes_cm_init_tsa_conn(struct nes_qp *nesqp, struct nes_cm_node *cm_node)
+{
+ int ret = 0;
+
+ if (!nesqp)
+ return -EINVAL;
+
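+ /* Copy the final software TCP state (sequence numbers, windows, scale
+ * factors, MSS) into the hardware QP context so the NIC can take over the
+ * connection once it enters the accelerated (TSA) state. */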
+ nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_IPV4 |
+ NES_QPCONTEXT_MISC_NO_NAGLE | NES_QPCONTEXT_MISC_DO_NOT_FRAG |
+ NES_QPCONTEXT_MISC_DROS);
+
+ if (cm_node->tcp_cntxt.snd_wscale || cm_node->tcp_cntxt.rcv_wscale)
+ nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_WSCALE);
+
+ nesqp->nesqp_context->misc2 |= cpu_to_le32(64 << NES_QPCONTEXT_MISC2_TTL_SHIFT);
+
+ nesqp->nesqp_context->mss |= cpu_to_le32(((u32)cm_node->tcp_cntxt.mss) << 16);
+
+ nesqp->nesqp_context->tcp_state_flow_label |= cpu_to_le32(
+ (u32)NES_QPCONTEXT_TCPSTATE_EST << NES_QPCONTEXT_TCPFLOW_TCP_STATE_SHIFT);
+
+ nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32(
+ (cm_node->tcp_cntxt.snd_wscale << NES_QPCONTEXT_PDWSCALE_SND_WSCALE_SHIFT) &
+ NES_QPCONTEXT_PDWSCALE_SND_WSCALE_MASK);
+
+ nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32(
+ (cm_node->tcp_cntxt.rcv_wscale << NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_SHIFT) &
+ NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_MASK);
+
+ nesqp->nesqp_context->keepalive = cpu_to_le32(0x80);
+ nesqp->nesqp_context->ts_recent = 0;
+ nesqp->nesqp_context->ts_age = 0;
+ nesqp->nesqp_context->snd_nxt = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num);
+ nesqp->nesqp_context->snd_wnd = cpu_to_le32(cm_node->tcp_cntxt.snd_wnd);
+ nesqp->nesqp_context->rcv_nxt = cpu_to_le32(cm_node->tcp_cntxt.rcv_nxt);
+ nesqp->nesqp_context->rcv_wnd = cpu_to_le32(cm_node->tcp_cntxt.rcv_wnd <<
+ cm_node->tcp_cntxt.rcv_wscale);
+ nesqp->nesqp_context->snd_max = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num);
+ nesqp->nesqp_context->snd_una = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num);
+ nesqp->nesqp_context->srtt = 0;
+ nesqp->nesqp_context->rttvar = cpu_to_le32(0x6);
+ nesqp->nesqp_context->ssthresh = cpu_to_le32(0x3FFFC000);
+ nesqp->nesqp_context->cwnd = cpu_to_le32(2 * cm_node->tcp_cntxt.mss);
+ nesqp->nesqp_context->snd_wl1 = cpu_to_le32(cm_node->tcp_cntxt.rcv_nxt);
+ nesqp->nesqp_context->snd_wl2 = cpu_to_le32(cm_node->tcp_cntxt.loc_seq_num);
+ nesqp->nesqp_context->max_snd_wnd = cpu_to_le32(cm_node->tcp_cntxt.max_snd_wnd);
+
+ nes_debug(NES_DBG_CM, "QP%u: rcv_nxt = 0x%08X, snd_nxt = 0x%08X,"
+ " Setting MSS to %u, PDWscale = 0x%08X, rcv_wnd = %u, context misc = 0x%08X.\n",
+ nesqp->hwqp.qp_id, le32_to_cpu(nesqp->nesqp_context->rcv_nxt),
+ le32_to_cpu(nesqp->nesqp_context->snd_nxt),
+ cm_node->tcp_cntxt.mss, le32_to_cpu(nesqp->nesqp_context->pd_index_wscale),
+ le32_to_cpu(nesqp->nesqp_context->rcv_wnd),
+ le32_to_cpu(nesqp->nesqp_context->misc));
+ nes_debug(NES_DBG_CM, " snd_wnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->snd_wnd));
+ nes_debug(NES_DBG_CM, " snd_cwnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->cwnd));
+ nes_debug(NES_DBG_CM, " max_swnd = 0x%08X.\n", le32_to_cpu(nesqp->nesqp_context->max_snd_wnd));
+
+ nes_debug(NES_DBG_CM, "Change cm_node state to TSA\n");
+ cm_node->state = NES_CM_STATE_TSA;
+
+ return ret;
+}
+
+
+/**
+ * nes_cm_disconn
+ */
+int nes_cm_disconn(struct nes_qp *nesqp)
+{
+ struct disconn_work *work;
+
+ work = kzalloc(sizeof *work, GFP_ATOMIC);
+ if (!work)
+ return -ENOMEM; /* Timer will clean up */
+
+ nes_add_ref(&nesqp->ibqp);
+ work->nesqp = nesqp;
+ INIT_WORK(&work->work, nes_disconnect_worker);
+ queue_work(g_cm_core->disconn_wq, &work->work);
+ return 0;
+}
+
+
+/**
+ * nes_disconnect_worker
+ */
+static void nes_disconnect_worker(struct work_struct *work)
+{
+ struct disconn_work *dwork = container_of(work, struct disconn_work, work);
+ struct nes_qp *nesqp = dwork->nesqp;
+
+ kfree(dwork);
+ nes_debug(NES_DBG_CM, "processing AEQE id 0x%04X for QP%u.\n",
+ nesqp->last_aeq, nesqp->hwqp.qp_id);
+ nes_cm_disconn_true(nesqp);
+ nes_rem_ref(&nesqp->ibqp);
+}
+
+
+/**
+ * nes_cm_disconn_true
+ */
+static int nes_cm_disconn_true(struct nes_qp *nesqp)
+{
+ unsigned long flags;
+ int ret = 0;
+ struct iw_cm_id *cm_id;
+ struct iw_cm_event cm_event;
+ struct nes_vnic *nesvnic;
+ u16 last_ae;
+ u8 original_hw_tcp_state;
+ u8 original_ibqp_state;
+ int disconn_status = 0;
+ int issue_disconn = 0;
+ int issue_close = 0;
+ int issue_flush = 0;
+ u32 flush_q = NES_CQP_FLUSH_RQ;
+ struct ib_event ibevent;
+
+ if (!nesqp) {
+ nes_debug(NES_DBG_CM, "disconnect_worker nesqp is NULL\n");
+ return -1;
+ }
+
+ spin_lock_irqsave(&nesqp->lock, flags);
+ cm_id = nesqp->cm_id;
+ /* make sure we haven't already closed this connection */
+ if (!cm_id) {
+ nes_debug(NES_DBG_CM, "QP%u disconnect_worker cmid is NULL\n",
+ nesqp->hwqp.qp_id);
+ spin_unlock_irqrestore(&nesqp->lock, flags);
+ return -1;
+ }
+
+ nesvnic = to_nesvnic(nesqp->ibqp.device);
+ nes_debug(NES_DBG_CM, "Disconnecting QP%u\n", nesqp->hwqp.qp_id);
+
+ original_hw_tcp_state = nesqp->hw_tcp_state;
+ original_ibqp_state = nesqp->ibqp_state;
+ last_ae = nesqp->last_aeq;
+
+ if (nesqp->term_flags) {
+ issue_disconn = 1;
+ issue_close = 1;
+ nesqp->cm_id = NULL;
+ del_timer(&nesqp->terminate_timer);
+ if (nesqp->flush_issued == 0) {
+ nesqp->flush_issued = 1;
+ issue_flush = 1;
+ }
+ } else if ((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) ||
+ ((original_ibqp_state == IB_QPS_RTS) &&
+ (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) {
+ issue_disconn = 1;
+ if (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET)
+ disconn_status = -ECONNRESET;
+ }
+
+ if (((original_hw_tcp_state == NES_AEQE_TCP_STATE_CLOSED) ||
+ (original_hw_tcp_state == NES_AEQE_TCP_STATE_TIME_WAIT) ||
+ (last_ae == NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) ||
+ (last_ae == NES_AEQE_AEID_LLP_CONNECTION_RESET))) {
+ issue_close = 1;
+ nesqp->cm_id = NULL;
+ if (nesqp->flush_issued == 0) {
+ nesqp->flush_issued = 1;
+ issue_flush = 1;
+ }
+ }
+
+ spin_unlock_irqrestore(&nesqp->lock, flags);
+
+ if ((issue_flush) && (nesqp->destroyed == 0)) {
+ /* Flush the queue(s) */
+ if (nesqp->hw_iwarp_state >= NES_AEQE_IWARP_STATE_TERMINATE)
+ flush_q |= NES_CQP_FLUSH_SQ;
+ flush_wqes(nesvnic->nesdev, nesqp, flush_q, 1);
+
+ if (nesqp->term_flags) {
+ ibevent.device = nesqp->ibqp.device;
+ ibevent.event = nesqp->terminate_eventtype;
+ ibevent.element.qp = &nesqp->ibqp;
+ if (nesqp->ibqp.event_handler)
+ nesqp->ibqp.event_handler(&ibevent, nesqp->ibqp.qp_context);
+ }
+ }
+
+ if ((cm_id) && (cm_id->event_handler)) {
+ if (issue_disconn) {
+ atomic_inc(&cm_disconnects);
+ cm_event.event = IW_CM_EVENT_DISCONNECT;
+ cm_event.status = disconn_status;
+ cm_event.local_addr = cm_id->local_addr;
+ cm_event.remote_addr = cm_id->remote_addr;
+ cm_event.private_data = NULL;
+ cm_event.private_data_len = 0;
+
+ nes_debug(NES_DBG_CM, "Generating a CM Disconnect Event"
+ " for QP%u, SQ Head = %u, SQ Tail = %u. "
+ "cm_id = %p, refcount = %u.\n",
+ nesqp->hwqp.qp_id, nesqp->hwqp.sq_head,
+ nesqp->hwqp.sq_tail, cm_id,
+ atomic_read(&nesqp->refcount));
+
+ ret = cm_id->event_handler(cm_id, &cm_event);
+ if (ret)
+ nes_debug(NES_DBG_CM, "OFA CM event_handler "
+ "returned, ret=%d\n", ret);
+ }
+
+ if (issue_close) {
+ atomic_inc(&cm_closes);
+ nes_disconnect(nesqp, 1);
+
+ cm_id->provider_data = nesqp;
+ /* Send up the close complete event */
+ cm_event.event = IW_CM_EVENT_CLOSE;
+ cm_event.status = 0;
+ cm_event.provider_data = cm_id->provider_data;
+ cm_event.local_addr = cm_id->local_addr;
+ cm_event.remote_addr = cm_id->remote_addr;
+ cm_event.private_data = NULL;
+ cm_event.private_data_len = 0;
+
+ ret = cm_id->event_handler(cm_id, &cm_event);
+ if (ret)
+ nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret);
+
+ cm_id->rem_ref(cm_id);
+ }
+ }
+
+ return 0;
+}
+
+
+/**
+ * nes_disconnect
+ */
+static int nes_disconnect(struct nes_qp *nesqp, int abrupt)
+{
+ int ret = 0;
+ struct nes_vnic *nesvnic;
+ struct nes_device *nesdev;
+ struct nes_ib_device *nesibdev;
+
+ nesvnic = to_nesvnic(nesqp->ibqp.device);
+ if (!nesvnic)
+ return -EINVAL;
+
+ nesdev = nesvnic->nesdev;
+ nesibdev = nesvnic->nesibdev;
+
+ nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n",
+ netdev_refcnt_read(nesvnic->netdev));
+
+ if (nesqp->active_conn) {
+
+ /* indicate this connection is NOT active */
+ nesqp->active_conn = 0;
+ } else {
+ /* Need to free the Last Streaming Mode Message */
+ if (nesqp->ietf_frame) {
+ if (nesqp->lsmm_mr)
+ nesibdev->ibdev.dereg_mr(nesqp->lsmm_mr);
+ pci_free_consistent(nesdev->pcidev,
+ nesqp->private_data_len + nesqp->ietf_frame_size,
+ nesqp->ietf_frame, nesqp->ietf_frame_pbase);
+ }
+ }
+
+ /* close the CM node down if it is still active */
+ if (nesqp->cm_node) {
+ nes_debug(NES_DBG_CM, "Call close API\n");
+
+ g_cm_core->api->close(g_cm_core, nesqp->cm_node);
+ }
+
+ return ret;
+}
+
+
+/**
+ * nes_accept
+ */
+int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
+{
+ u64 u64temp;
+ struct ib_qp *ibqp;
+ struct nes_qp *nesqp;
+ struct nes_vnic *nesvnic;
+ struct nes_device *nesdev;
+ struct nes_cm_node *cm_node;
+ struct nes_adapter *adapter;
+ struct ib_qp_attr attr;
+ struct iw_cm_event cm_event;
+ struct nes_hw_qp_wqe *wqe;
+ struct nes_v4_quad nes_quad;
+ u32 crc_value;
+ int ret;
+ int passive_state;
+ struct nes_ib_device *nesibdev;
+ struct ib_mr *ibmr = NULL;
+ struct ib_phys_buf ibphysbuf;
+ struct nes_pd *nespd;
+ u64 tagged_offset;
+ u8 mpa_frame_offset = 0;
+ struct ietf_mpa_v2 *mpa_v2_frame;
+ u8 start_addr = 0;
+ u8 *start_ptr = &start_addr;
+ u8 **start_buff = &start_ptr;
+ u16 buff_len = 0;
+ struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr;
+ struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr;
+
+ ibqp = nes_get_qp(cm_id->device, conn_param->qpn);
+ if (!ibqp)
+ return -EINVAL;
+
+ /* get all our handles */
+ nesqp = to_nesqp(ibqp);
+ nesvnic = to_nesvnic(nesqp->ibqp.device);
+ nesdev = nesvnic->nesdev;
+ adapter = nesdev->nesadapter;
+
+ cm_node = (struct nes_cm_node *)cm_id->provider_data;
+ nes_debug(NES_DBG_CM, "nes_accept: cm_node= %p nesvnic=%p, netdev=%p,"
+ "%s\n", cm_node, nesvnic, nesvnic->netdev,
+ nesvnic->netdev->name);
+
+ if (NES_CM_STATE_LISTENER_DESTROYED == cm_node->state) {
+ if (cm_node->loopbackpartner)
+ rem_ref_cm_node(cm_node->cm_core, cm_node->loopbackpartner);
+ rem_ref_cm_node(cm_node->cm_core, cm_node);
+ return -EINVAL;
+ }
+
+ passive_state = atomic_add_return(1, &cm_node->passive_state);
+ if (passive_state == NES_SEND_RESET_EVENT) {
+ rem_ref_cm_node(cm_node->cm_core, cm_node);
+ return -ECONNRESET;
+ }
+ /* associate the node with the QP */
+ nesqp->cm_node = (void *)cm_node;
+ cm_node->nesqp = nesqp;
+
+
+ nes_debug(NES_DBG_CM, "QP%u, cm_node=%p, jiffies = %lu listener = %p\n",
+ nesqp->hwqp.qp_id, cm_node, jiffies, cm_node->listener);
+ atomic_inc(&cm_accepts);
+
+ nes_debug(NES_DBG_CM, "netdev refcnt = %u.\n",
+ netdev_refcnt_read(nesvnic->netdev));
+
+ nesqp->ietf_frame_size = sizeof(struct ietf_mpa_v2);
+ /* allocate the ietf frame and space for private data */
+ nesqp->ietf_frame = pci_alloc_consistent(nesdev->pcidev,
+ nesqp->ietf_frame_size + conn_param->private_data_len,
+ &nesqp->ietf_frame_pbase);
+
+ if (!nesqp->ietf_frame) {
+ nes_debug(NES_DBG_CM, "Unable to allocate memory for private data\n");
+ return -ENOMEM;
+ }
+ mpa_v2_frame = (struct ietf_mpa_v2 *)nesqp->ietf_frame;
+
+ if (cm_node->mpa_frame_rev == IETF_MPA_V1)
+ mpa_frame_offset = 4;
+
+ if (cm_node->mpa_frame_rev == IETF_MPA_V1 ||
+ cm_node->mpav2_ird_ord == IETF_NO_IRD_ORD) {
+ record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord);
+ }
+
+ memcpy(mpa_v2_frame->priv_data, conn_param->private_data,
+ conn_param->private_data_len);
+
+ cm_build_mpa_frame(cm_node, start_buff, &buff_len, nesqp->ietf_frame, MPA_KEY_REPLY);
+ nesqp->private_data_len = conn_param->private_data_len;
+
+ /* set up our first outgoing iWARP send WQE (the IETF frame response) */
+ wqe = &nesqp->hwqp.sq_vbase[0];
+
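+ /* For a non-loopback peer the MPA reply becomes the Last Streaming Mode
+ * Message (LSMM): the reply buffer is registered as a physical MR and
+ * described in the first SQ WQE so the adapter can stream it out. */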
+ if (raddr->sin_addr.s_addr != laddr->sin_addr.s_addr) {
+ u64temp = (unsigned long)nesqp;
+ nesibdev = nesvnic->nesibdev;
+ nespd = nesqp->nespd;
+ ibphysbuf.addr = nesqp->ietf_frame_pbase + mpa_frame_offset;
+ ibphysbuf.size = buff_len;
+ tagged_offset = (u64)(unsigned long)*start_buff;
+ ibmr = nesibdev->ibdev.reg_phys_mr((struct ib_pd *)nespd,
+ &ibphysbuf, 1,
+ IB_ACCESS_LOCAL_WRITE,
+ &tagged_offset);
+ if (!ibmr) {
+ nes_debug(NES_DBG_CM, "Unable to register memory region"
+ "for lSMM for cm_node = %p \n",
+ cm_node);
+ pci_free_consistent(nesdev->pcidev,
+ nesqp->private_data_len + nesqp->ietf_frame_size,
+ nesqp->ietf_frame, nesqp->ietf_frame_pbase);
+ return -ENOMEM;
+ }
+
+ ibmr->pd = &nespd->ibpd;
+ ibmr->device = nespd->ibpd.device;
+ nesqp->lsmm_mr = ibmr;
+
+ u64temp |= NES_SW_CONTEXT_ALIGN >> 1;
+ set_wqe_64bit_value(wqe->wqe_words,
+ NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX,
+ u64temp);
+ wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] =
+ cpu_to_le32(NES_IWARP_SQ_WQE_STREAMING |
+ NES_IWARP_SQ_WQE_WRPDU);
+ wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX] =
+ cpu_to_le32(buff_len);
+ set_wqe_64bit_value(wqe->wqe_words,
+ NES_IWARP_SQ_WQE_FRAG0_LOW_IDX,
+ (u64)(unsigned long)(*start_buff));
+ wqe->wqe_words[NES_IWARP_SQ_WQE_LENGTH0_IDX] =
+ cpu_to_le32(buff_len);
+ wqe->wqe_words[NES_IWARP_SQ_WQE_STAG0_IDX] = ibmr->lkey;
+ if (nesqp->sq_kmapped) {
+ nesqp->sq_kmapped = 0;
+ kunmap(nesqp->page);
+ }
+
+ nesqp->nesqp_context->ird_ord_sizes |=
+ cpu_to_le32(NES_QPCONTEXT_ORDIRD_LSMM_PRESENT |
+ NES_QPCONTEXT_ORDIRD_WRPDU);
+ } else {
+ nesqp->nesqp_context->ird_ord_sizes |=
+ cpu_to_le32(NES_QPCONTEXT_ORDIRD_WRPDU);
+ }
+ nesqp->skip_lsmm = 1;
+
+ /* Cache the cm_id in the qp */
+ nesqp->cm_id = cm_id;
+ cm_node->cm_id = cm_id;
+
+ /* nesqp->cm_node = (void *)cm_id->provider_data; */
+ cm_id->provider_data = nesqp;
+ nesqp->active_conn = 0;
+
+ if (cm_node->state == NES_CM_STATE_TSA)
+ nes_debug(NES_DBG_CM, "Already state = TSA for cm_node=%p\n",
+ cm_node);
+
+ nes_cm_init_tsa_conn(nesqp, cm_node);
+
+ nesqp->nesqp_context->tcpPorts[0] =
+ cpu_to_le16(cm_node->mapped_loc_port);
+ nesqp->nesqp_context->tcpPorts[1] =
+ cpu_to_le16(cm_node->mapped_rem_port);
+
+ nesqp->nesqp_context->ip0 = cpu_to_le32(cm_node->mapped_rem_addr);
+
+ nesqp->nesqp_context->misc2 |= cpu_to_le32(
+ (u32)PCI_FUNC(nesdev->pcidev->devfn) <<
+ NES_QPCONTEXT_MISC2_SRC_IP_SHIFT);
+
+ nesqp->nesqp_context->arp_index_vlan |=
+ cpu_to_le32(nes_arp_table(nesdev,
+ le32_to_cpu(nesqp->nesqp_context->ip0), NULL,
+ NES_ARP_RESOLVE) << 16);
+
+ nesqp->nesqp_context->ts_val_delta = cpu_to_le32(
+ jiffies - nes_read_indexed(nesdev, NES_IDX_TCP_NOW));
+
+ nesqp->nesqp_context->ird_index = cpu_to_le32(nesqp->hwqp.qp_id);
+
+ nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32(
+ ((u32)1 << NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT));
+ nesqp->nesqp_context->ird_ord_sizes |=
+ cpu_to_le32((u32)cm_node->ord_size);
+
+ memset(&nes_quad, 0, sizeof(nes_quad));
+ nes_quad.DstIpAdrIndex =
+ cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24);
+ nes_quad.SrcIpadr = htonl(cm_node->mapped_rem_addr);
+ nes_quad.TcpPorts[0] = htons(cm_node->mapped_rem_port);
+ nes_quad.TcpPorts[1] = htons(cm_node->mapped_loc_port);
+
+ /* Produce hash key */
+ crc_value = get_crc_value(&nes_quad);
+ nesqp->hte_index = cpu_to_be32(crc_value ^ 0xffffffff);
+ nes_debug(NES_DBG_CM, "HTE Index = 0x%08X, CRC = 0x%08X\n",
+ nesqp->hte_index, nesqp->hte_index & adapter->hte_index_mask);
+
+ nesqp->hte_index &= adapter->hte_index_mask;
+ nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index);
+
+ cm_node->cm_core->api->accelerated(cm_node->cm_core, cm_node);
+
+ nes_debug(NES_DBG_CM, "QP%u, Destination IP = 0x%08X:0x%04X, local = "
+ "0x%08X:0x%04X, rcv_nxt=0x%08X, snd_nxt=0x%08X, mpa + "
+ "private data length=%u.\n", nesqp->hwqp.qp_id,
+ ntohl(raddr->sin_addr.s_addr), ntohs(raddr->sin_port),
+ ntohl(laddr->sin_addr.s_addr), ntohs(laddr->sin_port),
+ le32_to_cpu(nesqp->nesqp_context->rcv_nxt),
+ le32_to_cpu(nesqp->nesqp_context->snd_nxt),
+ buff_len);
+
+ /* notify OF layer that accept event was successful */
+ cm_id->add_ref(cm_id);
+ nes_add_ref(&nesqp->ibqp);
+
+ cm_event.event = IW_CM_EVENT_ESTABLISHED;
+ cm_event.status = 0;
+ cm_event.provider_data = (void *)nesqp;
+ cm_event.local_addr = cm_id->local_addr;
+ cm_event.remote_addr = cm_id->remote_addr;
+ cm_event.private_data = NULL;
+ cm_event.private_data_len = 0;
+ cm_event.ird = cm_node->ird_size;
+ cm_event.ord = cm_node->ord_size;
+
+ ret = cm_id->event_handler(cm_id, &cm_event);
+ attr.qp_state = IB_QPS_RTS;
+ nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL);
+ if (cm_node->loopbackpartner) {
+ cm_node->loopbackpartner->mpa_frame_size =
+ nesqp->private_data_len;
+ /* copy entire MPA frame to our cm_node's frame */
+ memcpy(cm_node->loopbackpartner->mpa_frame_buf,
+ conn_param->private_data, conn_param->private_data_len);
+ create_event(cm_node->loopbackpartner, NES_CM_EVENT_CONNECTED);
+ }
+ if (ret)
+ printk(KERN_ERR "%s[%u] OFA CM event_handler returned, "
+ "ret=%d\n", __func__, __LINE__, ret);
+
+ return 0;
+}
+
+
+/**
+ * nes_reject
+ */
+int nes_reject(struct iw_cm_id *cm_id, const void *pdata, u8 pdata_len)
+{
+ struct nes_cm_node *cm_node;
+ struct nes_cm_node *loopback;
+ struct nes_cm_core *cm_core;
+ u8 *start_buff;
+
+ atomic_inc(&cm_rejects);
+ cm_node = (struct nes_cm_node *)cm_id->provider_data;
+ loopback = cm_node->loopbackpartner;
+ cm_core = cm_node->cm_core;
+ cm_node->cm_id = cm_id;
+
+ if (pdata_len + sizeof(struct ietf_mpa_v2) > MAX_CM_BUFFER)
+ return -EINVAL;
+
+ if (loopback) {
+ memcpy(&loopback->mpa_frame.priv_data, pdata, pdata_len);
+ loopback->mpa_frame.priv_data_len = pdata_len;
+ loopback->mpa_frame_size = pdata_len;
+ } else {
+ start_buff = &cm_node->mpa_frame_buf[0] + sizeof(struct ietf_mpa_v2);
+ cm_node->mpa_frame_size = pdata_len;
+ memcpy(start_buff, pdata, pdata_len);
+ }
+ return cm_core->api->reject(cm_core, cm_node);
+}
+
+
+/**
+ * nes_connect
+ * set up and launch a CM connect node
+ */
+int nes_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param)
+{
+ struct ib_qp *ibqp;
+ struct nes_qp *nesqp;
+ struct nes_vnic *nesvnic;
+ struct nes_device *nesdev;
+ struct nes_cm_node *cm_node;
+ struct nes_cm_info cm_info;
+ int apbvt_set = 0;
+ struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr;
+ struct sockaddr_in *raddr = (struct sockaddr_in *)&cm_id->remote_addr;
+ struct iwpm_dev_data pm_reg_msg;
+ struct iwpm_sa_data pm_msg;
+ int iwpm_err = 0;
+
+ if (cm_id->remote_addr.ss_family != AF_INET)
+ return -ENOSYS;
+ ibqp = nes_get_qp(cm_id->device, conn_param->qpn);
+ if (!ibqp)
+ return -EINVAL;
+ nesqp = to_nesqp(ibqp);
+ if (!nesqp)
+ return -EINVAL;
+ nesvnic = to_nesvnic(nesqp->ibqp.device);
+ if (!nesvnic)
+ return -EINVAL;
+ nesdev = nesvnic->nesdev;
+ if (!nesdev)
+ return -EINVAL;
+
+ if (!laddr->sin_port || !raddr->sin_port)
+ return -EINVAL;
+
+ nes_debug(NES_DBG_CM, "QP%u, current IP = 0x%08X, Destination IP = "
+ "0x%08X:0x%04X, local = 0x%08X:0x%04X.\n", nesqp->hwqp.qp_id,
+ ntohl(nesvnic->local_ipaddr), ntohl(raddr->sin_addr.s_addr),
+ ntohs(raddr->sin_port), ntohl(laddr->sin_addr.s_addr),
+ ntohs(laddr->sin_port));
+
+ atomic_inc(&cm_connects);
+ nesqp->active_conn = 1;
+
+ /* cache the cm_id in the qp */
+ nesqp->cm_id = cm_id;
+ cm_id->provider_data = nesqp;
+ nesqp->private_data_len = conn_param->private_data_len;
+
+ nes_debug(NES_DBG_CM, "requested ord = 0x%08X.\n", (u32)conn_param->ord);
+ nes_debug(NES_DBG_CM, "mpa private data len =%u\n",
+ conn_param->private_data_len);
+
+ /* set up the connection params for the node */
+ cm_info.loc_addr = ntohl(laddr->sin_addr.s_addr);
+ cm_info.loc_port = ntohs(laddr->sin_port);
+ cm_info.rem_addr = ntohl(raddr->sin_addr.s_addr);
+ cm_info.rem_port = ntohs(raddr->sin_port);
+ cm_info.cm_id = cm_id;
+ cm_info.conn_type = NES_CM_IWARP_CONN_TYPE;
+
+ /* Default to the unmapped peer information; the port mapper may override it below */
+ cm_info.mapped_loc_addr = cm_info.loc_addr;
+ cm_info.mapped_loc_port = cm_info.loc_port;
+ cm_info.mapped_rem_addr = cm_info.rem_addr;
+ cm_info.mapped_rem_port = cm_info.rem_port;
+
+ nes_form_reg_msg(nesvnic, &pm_reg_msg);
+ iwpm_err = iwpm_register_pid(&pm_reg_msg, RDMA_NL_NES);
+ if (iwpm_err) {
+ nes_debug(NES_DBG_NLMSG,
+ "Port Mapper reg pid fail (err = %d).\n", iwpm_err);
+ }
+ if (iwpm_valid_pid() && !iwpm_err) {
+ nes_form_pm_msg(&cm_info, &pm_msg);
+ iwpm_err = iwpm_add_and_query_mapping(&pm_msg, RDMA_NL_NES);
+ if (iwpm_err)
+ nes_debug(NES_DBG_NLMSG,
+ "Port Mapper query fail (err = %d).\n", iwpm_err);
+ else
+ nes_record_pm_msg(&cm_info, &pm_msg);
+ }
+
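+ /* An APBVT entry is only needed when the peer is remote; a loopback
+ * connection to our own address skips it. */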
+ if (laddr->sin_addr.s_addr != raddr->sin_addr.s_addr) {
+ nes_manage_apbvt(nesvnic, cm_info.mapped_loc_port,
+ PCI_FUNC(nesdev->pcidev->devfn), NES_MANAGE_APBVT_ADD);
+ apbvt_set = 1;
+ }
+
+ if (nes_create_mapinfo(&cm_info))
+ return -ENOMEM;
+
+ cm_id->add_ref(cm_id);
+
+ /* create a connect CM node connection */
+ cm_node = g_cm_core->api->connect(g_cm_core, nesvnic,
+ conn_param->private_data_len, (void *)conn_param->private_data,
+ &cm_info);
+ if (!cm_node) {
+ if (apbvt_set)
+ nes_manage_apbvt(nesvnic, cm_info.mapped_loc_port,
+ PCI_FUNC(nesdev->pcidev->devfn),
+ NES_MANAGE_APBVT_DEL);
+
+ nes_debug(NES_DBG_NLMSG, "Delete mapped_loc_port = %04X\n",
+ cm_info.mapped_loc_port);
+ nes_remove_mapinfo(cm_info.loc_addr, cm_info.loc_port,
+ cm_info.mapped_loc_addr, cm_info.mapped_loc_port);
+ cm_id->rem_ref(cm_id);
+ return -ENOMEM;
+ }
+
+ record_ird_ord(cm_node, (u16)conn_param->ird, (u16)conn_param->ord);
+ if (cm_node->send_rdma0_op == SEND_RDMA_READ_ZERO &&
+ cm_node->ord_size == 0)
+ cm_node->ord_size = 1;
+
+ cm_node->apbvt_set = apbvt_set;
+ nesqp->cm_node = cm_node;
+ cm_node->nesqp = nesqp;
+ nes_add_ref(&nesqp->ibqp);
+
+ return 0;
+}
+
+
+/**
+ * nes_create_listen
+ */
+int nes_create_listen(struct iw_cm_id *cm_id, int backlog)
+{
+ struct nes_vnic *nesvnic;
+ struct nes_cm_listener *cm_node;
+ struct nes_cm_info cm_info;
+ int err;
+ struct sockaddr_in *laddr = (struct sockaddr_in *)&cm_id->local_addr;
+
+ nes_debug(NES_DBG_CM, "cm_id = %p, local port = 0x%04X.\n",
+ cm_id, ntohs(laddr->sin_port));
+
+ if (cm_id->local_addr.ss_family != AF_INET)
+ return -ENOSYS;
+ nesvnic = to_nesvnic(cm_id->device);
+ if (!nesvnic)
+ return -EINVAL;
+
+ nes_debug(NES_DBG_CM, "nesvnic=%p, netdev=%p, %s\n",
+ nesvnic, nesvnic->netdev, nesvnic->netdev->name);
+
+ nes_debug(NES_DBG_CM, "nesvnic->local_ipaddr=0x%08x, sin_addr.s_addr=0x%08x\n",
+ nesvnic->local_ipaddr, laddr->sin_addr.s_addr);
+
+ /* setup listen params in our api call struct */
+ cm_info.loc_addr = ntohl(nesvnic->local_ipaddr);
+ cm_info.loc_port = ntohs(laddr->sin_port);
+ cm_info.backlog = backlog;
+ cm_info.cm_id = cm_id;
+
+ cm_info.conn_type = NES_CM_IWARP_CONN_TYPE;
+
+ /* Default to the unmapped info; the listen API may update it via the port mapper */
+ cm_info.mapped_loc_addr = cm_info.loc_addr;
+ cm_info.mapped_loc_port = cm_info.loc_port;
+
+ cm_node = g_cm_core->api->listen(g_cm_core, nesvnic, &cm_info);
+ if (!cm_node) {
+ printk(KERN_ERR "%s[%u] Error returned from listen API call\n",
+ __func__, __LINE__);
+ return -ENOMEM;
+ }
+
+ cm_id->provider_data = cm_node;
+
+ if (!cm_node->reused_node) {
+ if (nes_create_mapinfo(&cm_info))
+ return -ENOMEM;
+
+ err = nes_manage_apbvt(nesvnic, cm_node->mapped_loc_port,
+ PCI_FUNC(nesvnic->nesdev->pcidev->devfn),
+ NES_MANAGE_APBVT_ADD);
+ if (err) {
+ printk(KERN_ERR "nes_manage_apbvt call returned %d.\n",
+ err);
+ g_cm_core->api->stop_listener(g_cm_core, (void *)cm_node);
+ return err;
+ }
+ atomic_inc(&cm_listens_created);
+ }
+
+ cm_id->add_ref(cm_id);
+ cm_id->provider_data = (void *)cm_node;
+
+
+ return 0;
+}
+
+
+/**
+ * nes_destroy_listen
+ */
+int nes_destroy_listen(struct iw_cm_id *cm_id)
+{
+ if (cm_id->provider_data)
+ g_cm_core->api->stop_listener(g_cm_core, cm_id->provider_data);
+ else
+ nes_debug(NES_DBG_CM, "cm_id->provider_data was NULL\n");
+
+ cm_id->rem_ref(cm_id);
+
+ return 0;
+}
+
+
+/**
+ * nes_cm_recv
+ */
+int nes_cm_recv(struct sk_buff *skb, struct net_device *netdevice)
+{
+ int rc = 0;
+
+ cm_packets_received++;
+ if ((g_cm_core) && (g_cm_core->api))
+ rc = g_cm_core->api->recv_pkt(g_cm_core, netdev_priv(netdevice), skb);
+ else
+ nes_debug(NES_DBG_CM, "Unable to process packet for CM,"
+ " cm is not setup properly.\n");
+
+ return rc;
+}
+
+
+/**
+ * nes_cm_start
+ * Start and init a cm core module
+ */
+int nes_cm_start(void)
+{
+ nes_debug(NES_DBG_CM, "\n");
+ /* create the primary CM core, pass this handle to subsequent core inits */
+ g_cm_core = nes_cm_alloc_core();
+ if (g_cm_core)
+ return 0;
+ else
+ return -ENOMEM;
+}
+
+
+/**
+ * nes_cm_stop
+ * stop and dealloc all cm core instances
+ */
+int nes_cm_stop(void)
+{
+ g_cm_core->api->destroy_cm_core(g_cm_core);
+ return 0;
+}
+
+
+/**
+ * cm_event_connected
+ * handle a connected event, setup QPs and HW
+ */
+static void cm_event_connected(struct nes_cm_event *event)
+{
+ struct nes_qp *nesqp;
+ struct nes_vnic *nesvnic;
+ struct nes_device *nesdev;
+ struct nes_cm_node *cm_node;
+ struct nes_adapter *nesadapter;
+ struct ib_qp_attr attr;
+ struct iw_cm_id *cm_id;
+ struct iw_cm_event cm_event;
+ struct nes_v4_quad nes_quad;
+ u32 crc_value;
+ int ret;
+ struct sockaddr_in *laddr;
+ struct sockaddr_in *raddr;
+ struct sockaddr_in *cm_event_laddr;
+
+ /* get all our handles */
+ cm_node = event->cm_node;
+ cm_id = cm_node->cm_id;
+ nes_debug(NES_DBG_CM, "cm_event_connected - %p - cm_id = %p\n", cm_node, cm_id);
+ nesqp = (struct nes_qp *)cm_id->provider_data;
+ nesvnic = to_nesvnic(nesqp->ibqp.device);
+ nesdev = nesvnic->nesdev;
+ nesadapter = nesdev->nesadapter;
+ laddr = (struct sockaddr_in *)&cm_id->local_addr;
+ raddr = (struct sockaddr_in *)&cm_id->remote_addr;
+ cm_event_laddr = (struct sockaddr_in *)&cm_event.local_addr;
+
+ if (nesqp->destroyed)
+ return;
+ atomic_inc(&cm_connecteds);
+ nes_debug(NES_DBG_CM, "QP%u attempting to connect to 0x%08X:0x%04X on"
+ " local port 0x%04X. jiffies = %lu.\n",
+ nesqp->hwqp.qp_id, ntohl(raddr->sin_addr.s_addr),
+ ntohs(raddr->sin_port), ntohs(laddr->sin_port), jiffies);
+
+ nes_cm_init_tsa_conn(nesqp, cm_node);
+
+ /* set the QP tsa context */
+ nesqp->nesqp_context->tcpPorts[0] =
+ cpu_to_le16(cm_node->mapped_loc_port);
+ nesqp->nesqp_context->tcpPorts[1] =
+ cpu_to_le16(cm_node->mapped_rem_port);
+ nesqp->nesqp_context->ip0 = cpu_to_le32(cm_node->mapped_rem_addr);
+
+ nesqp->nesqp_context->misc2 |= cpu_to_le32(
+ (u32)PCI_FUNC(nesdev->pcidev->devfn) <<
+ NES_QPCONTEXT_MISC2_SRC_IP_SHIFT);
+ nesqp->nesqp_context->arp_index_vlan |= cpu_to_le32(
+ nes_arp_table(nesdev,
+ le32_to_cpu(nesqp->nesqp_context->ip0),
+ NULL, NES_ARP_RESOLVE) << 16);
+ nesqp->nesqp_context->ts_val_delta = cpu_to_le32(
+ jiffies - nes_read_indexed(nesdev, NES_IDX_TCP_NOW));
+ nesqp->nesqp_context->ird_index = cpu_to_le32(nesqp->hwqp.qp_id);
+ nesqp->nesqp_context->ird_ord_sizes |=
+ cpu_to_le32((u32)1 <<
+ NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT);
+ nesqp->nesqp_context->ird_ord_sizes |=
+ cpu_to_le32((u32)cm_node->ord_size);
+
+ /* Adjust tail for not having a LSMM */
+ /*nesqp->hwqp.sq_tail = 1;*/
+
+ build_rdma0_msg(cm_node, &nesqp);
+
+ nes_write32(nesdev->regs + NES_WQE_ALLOC,
+ (1 << 24) | 0x00800000 | nesqp->hwqp.qp_id);
+
+ memset(&nes_quad, 0, sizeof(nes_quad));
+
+ nes_quad.DstIpAdrIndex =
+ cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24);
+ nes_quad.SrcIpadr = htonl(cm_node->mapped_rem_addr);
+ nes_quad.TcpPorts[0] = htons(cm_node->mapped_rem_port);
+ nes_quad.TcpPorts[1] = htons(cm_node->mapped_loc_port);
+
+ /* Produce hash key */
+ crc_value = get_crc_value(&nes_quad);
+ nesqp->hte_index = cpu_to_be32(crc_value ^ 0xffffffff);
+ nes_debug(NES_DBG_CM, "HTE Index = 0x%08X, After CRC = 0x%08X\n",
+ nesqp->hte_index, nesqp->hte_index & nesadapter->hte_index_mask);
+
+ nesqp->hte_index &= nesadapter->hte_index_mask;
+ nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index);
+
+ nesqp->ietf_frame = &cm_node->mpa_frame;
+ nesqp->private_data_len = (u8)cm_node->mpa_frame_size;
+ cm_node->cm_core->api->accelerated(cm_node->cm_core, cm_node);
+
+ /* notify OF layer we successfully created the requested connection */
+ cm_event.event = IW_CM_EVENT_CONNECT_REPLY;
+ cm_event.status = 0;
+ cm_event.provider_data = cm_id->provider_data;
+ cm_event_laddr->sin_family = AF_INET;
+ cm_event_laddr->sin_port = laddr->sin_port;
+ cm_event.remote_addr = cm_id->remote_addr;
+
+ cm_event.private_data = (void *)event->cm_node->mpa_frame_buf;
+ cm_event.private_data_len = (u8)event->cm_node->mpa_frame_size;
+ cm_event.ird = cm_node->ird_size;
+ cm_event.ord = cm_node->ord_size;
+
+ cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr);
+ ret = cm_id->event_handler(cm_id, &cm_event);
+ nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret);
+
+ if (ret)
+ printk(KERN_ERR "%s[%u] OFA CM event_handler returned, "
+ "ret=%d\n", __func__, __LINE__, ret);
+ attr.qp_state = IB_QPS_RTS;
+ nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL);
+
+ nes_debug(NES_DBG_CM, "Exiting connect thread for QP%u. jiffies = "
+ "%lu\n", nesqp->hwqp.qp_id, jiffies);
+
+ return;
+}
+
+
+/**
+ * cm_event_connect_error
+ */
+static void cm_event_connect_error(struct nes_cm_event *event)
+{
+ struct nes_qp *nesqp;
+ struct iw_cm_id *cm_id;
+ struct iw_cm_event cm_event;
+ /* struct nes_cm_info cm_info; */
+ int ret;
+
+ if (!event->cm_node)
+ return;
+
+ cm_id = event->cm_node->cm_id;
+ if (!cm_id)
+ return;
+
+ nes_debug(NES_DBG_CM, "cm_node=%p, cm_id=%p\n", event->cm_node, cm_id);
+ nesqp = cm_id->provider_data;
+
+ if (!nesqp)
+ return;
+
+ /* notify OF layer about this connection error event */
+ /* cm_id->rem_ref(cm_id); */
+ nesqp->cm_id = NULL;
+ cm_id->provider_data = NULL;
+ cm_event.event = IW_CM_EVENT_CONNECT_REPLY;
+ cm_event.status = -ECONNRESET;
+ cm_event.provider_data = cm_id->provider_data;
+ cm_event.local_addr = cm_id->local_addr;
+ cm_event.remote_addr = cm_id->remote_addr;
+ cm_event.private_data = NULL;
+ cm_event.private_data_len = 0;
+
+#ifdef CONFIG_INFINIBAND_NES_DEBUG
+ {
+ struct sockaddr_in *cm_event_laddr = (struct sockaddr_in *)
+ &cm_event.local_addr;
+ struct sockaddr_in *cm_event_raddr = (struct sockaddr_in *)
+ &cm_event.remote_addr;
+ nes_debug(NES_DBG_CM, "call CM_EVENT REJECTED, local_addr=%08x, remote_addr=%08x\n",
+ cm_event_laddr->sin_addr.s_addr, cm_event_raddr->sin_addr.s_addr);
+ }
+#endif
+
+ ret = cm_id->event_handler(cm_id, &cm_event);
+ nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret);
+ if (ret)
+ printk(KERN_ERR "%s[%u] OFA CM event_handler returned, "
+ "ret=%d\n", __func__, __LINE__, ret);
+ cm_id->rem_ref(cm_id);
+
+ rem_ref_cm_node(event->cm_node->cm_core, event->cm_node);
+ return;
+}
+
+
+/**
+ * cm_event_reset
+ */
+static void cm_event_reset(struct nes_cm_event *event)
+{
+ struct nes_qp *nesqp;
+ struct iw_cm_id *cm_id;
+ struct iw_cm_event cm_event;
+ /* struct nes_cm_info cm_info; */
+ int ret;
+
+ if (!event->cm_node)
+ return;
+
+ if (!event->cm_node->cm_id)
+ return;
+
+ cm_id = event->cm_node->cm_id;
+
+ nes_debug(NES_DBG_CM, "%p - cm_id = %p\n", event->cm_node, cm_id);
+ nesqp = cm_id->provider_data;
+ if (!nesqp)
+ return;
+
+ nesqp->cm_id = NULL;
+ /* cm_id->provider_data = NULL; */
+ cm_event.event = IW_CM_EVENT_DISCONNECT;
+ cm_event.status = -ECONNRESET;
+ cm_event.provider_data = cm_id->provider_data;
+ cm_event.local_addr = cm_id->local_addr;
+ cm_event.remote_addr = cm_id->remote_addr;
+ cm_event.private_data = NULL;
+ cm_event.private_data_len = 0;
+
+ cm_id->add_ref(cm_id);
+ ret = cm_id->event_handler(cm_id, &cm_event);
+ atomic_inc(&cm_closes);
+ cm_event.event = IW_CM_EVENT_CLOSE;
+ cm_event.status = 0;
+ cm_event.provider_data = cm_id->provider_data;
+ cm_event.local_addr = cm_id->local_addr;
+ cm_event.remote_addr = cm_id->remote_addr;
+ cm_event.private_data = NULL;
+ cm_event.private_data_len = 0;
+ nes_debug(NES_DBG_CM, "NODE %p Generating CLOSE\n", event->cm_node);
+ ret = cm_id->event_handler(cm_id, &cm_event);
+
+ nes_debug(NES_DBG_CM, "OFA CM event_handler returned, ret=%d\n", ret);
+
+
+ /* notify OF layer about this connection error event */
+ cm_id->rem_ref(cm_id);
+
+ return;
+}
+
+
+/**
+ * cm_event_mpa_req
+ */
+static void cm_event_mpa_req(struct nes_cm_event *event)
+{
+ struct iw_cm_id *cm_id;
+ struct iw_cm_event cm_event;
+ int ret;
+ struct nes_cm_node *cm_node;
+ struct sockaddr_in *cm_event_laddr = (struct sockaddr_in *)
+ &cm_event.local_addr;
+ struct sockaddr_in *cm_event_raddr = (struct sockaddr_in *)
+ &cm_event.remote_addr;
+
+ cm_node = event->cm_node;
+ if (!cm_node)
+ return;
+ cm_id = cm_node->cm_id;
+
+ atomic_inc(&cm_connect_reqs);
+ nes_debug(NES_DBG_CM, "cm_node = %p - cm_id = %p, jiffies = %lu\n",
+ cm_node, cm_id, jiffies);
+
+ cm_event.event = IW_CM_EVENT_CONNECT_REQUEST;
+ cm_event.status = 0;
+ cm_event.provider_data = (void *)cm_node;
+
+ cm_event_laddr->sin_family = AF_INET;
+ cm_event_laddr->sin_port = htons(event->cm_info.loc_port);
+ cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.loc_addr);
+
+ cm_event_raddr->sin_family = AF_INET;
+ cm_event_raddr->sin_port = htons(event->cm_info.rem_port);
+ cm_event_raddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr);
+ cm_event.private_data = cm_node->mpa_frame_buf;
+ cm_event.private_data_len = (u8)cm_node->mpa_frame_size;
+ if (cm_node->mpa_frame_rev == IETF_MPA_V1) {
+ cm_event.ird = NES_MAX_IRD;
+ cm_event.ord = NES_MAX_ORD;
+ } else {
+ cm_event.ird = cm_node->ird_size;
+ cm_event.ord = cm_node->ord_size;
+ }
+
+ ret = cm_id->event_handler(cm_id, &cm_event);
+ if (ret)
+ printk(KERN_ERR "%s[%u] OFA CM event_handler returned, ret=%d\n",
+ __func__, __LINE__, ret);
+ return;
+}
+
+
+static void cm_event_mpa_reject(struct nes_cm_event *event)
+{
+ struct iw_cm_id *cm_id;
+ struct iw_cm_event cm_event;
+ struct nes_cm_node *cm_node;
+ int ret;
+ struct sockaddr_in *cm_event_laddr = (struct sockaddr_in *)
+ &cm_event.local_addr;
+ struct sockaddr_in *cm_event_raddr = (struct sockaddr_in *)
+ &cm_event.remote_addr;
+
+ cm_node = event->cm_node;
+ if (!cm_node)
+ return;
+ cm_id = cm_node->cm_id;
+
+ atomic_inc(&cm_connect_reqs);
+ nes_debug(NES_DBG_CM, "cm_node = %p - cm_id = %p, jiffies = %lu\n",
+ cm_node, cm_id, jiffies);
+
+ cm_event.event = IW_CM_EVENT_CONNECT_REPLY;
+ cm_event.status = -ECONNREFUSED;
+ cm_event.provider_data = cm_id->provider_data;
+
+ cm_event_laddr->sin_family = AF_INET;
+ cm_event_laddr->sin_port = htons(event->cm_info.loc_port);
+ cm_event_laddr->sin_addr.s_addr = htonl(event->cm_info.loc_addr);
+
+ cm_event_raddr->sin_family = AF_INET;
+ cm_event_raddr->sin_port = htons(event->cm_info.rem_port);
+ cm_event_raddr->sin_addr.s_addr = htonl(event->cm_info.rem_addr);
+
+ cm_event.private_data = cm_node->mpa_frame_buf;
+ cm_event.private_data_len = (u8)cm_node->mpa_frame_size;
+
+ nes_debug(NES_DBG_CM, "call CM_EVENT_MPA_REJECTED, local_addr=%08x, "
+ "remove_addr=%08x\n",
+ cm_event_laddr->sin_addr.s_addr,
+ cm_event_raddr->sin_addr.s_addr);
+
+ ret = cm_id->event_handler(cm_id, &cm_event);
+ if (ret)
+ printk(KERN_ERR "%s[%u] OFA CM event_handler returned, ret=%d\n",
+ __func__, __LINE__, ret);
+
+ return;
+}
+
+
+static void nes_cm_event_handler(struct work_struct *);
+
+/**
+ * nes_cm_post_event
+ * post an event to the cm event handler
+ */
+static int nes_cm_post_event(struct nes_cm_event *event)
+{
+ atomic_inc(&event->cm_node->cm_core->events_posted);
+ add_ref_cm_node(event->cm_node);
+ event->cm_info.cm_id->add_ref(event->cm_info.cm_id);
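+ /* The references taken above (events_posted, cm_node, cm_id) are dropped
+ * by nes_cm_event_handler() after the work item has run. */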
+ INIT_WORK(&event->event_work, nes_cm_event_handler);
+ nes_debug(NES_DBG_CM, "cm_node=%p queue_work, event=%p\n",
+ event->cm_node, event);
+
+ queue_work(event->cm_node->cm_core->event_wq, &event->event_work);
+
+ nes_debug(NES_DBG_CM, "Exit\n");
+ return 0;
+}
+
+
+/**
+ * nes_cm_event_handler
+ * worker function to handle cm events
+ * will free instance of nes_cm_event
+ */
+static void nes_cm_event_handler(struct work_struct *work)
+{
+ struct nes_cm_event *event = container_of(work, struct nes_cm_event,
+ event_work);
+ struct nes_cm_core *cm_core;
+
+ if ((!event) || (!event->cm_node) || (!event->cm_node->cm_core))
+ return;
+
+ cm_core = event->cm_node->cm_core;
+ nes_debug(NES_DBG_CM, "event=%p, event->type=%u, events posted=%u\n",
+ event, event->type, atomic_read(&cm_core->events_posted));
+
+ switch (event->type) {
+ case NES_CM_EVENT_MPA_REQ:
+ cm_event_mpa_req(event);
+ nes_debug(NES_DBG_CM, "cm_node=%p CM Event: MPA REQUEST\n",
+ event->cm_node);
+ break;
+ case NES_CM_EVENT_RESET:
+ nes_debug(NES_DBG_CM, "cm_node = %p CM Event: RESET\n",
+ event->cm_node);
+ cm_event_reset(event);
+ break;
+ case NES_CM_EVENT_CONNECTED:
+ if ((!event->cm_node->cm_id) ||
+ (event->cm_node->state != NES_CM_STATE_TSA))
+ break;
+ cm_event_connected(event);
+ nes_debug(NES_DBG_CM, "CM Event: CONNECTED\n");
+ break;
+ case NES_CM_EVENT_MPA_REJECT:
+ if ((!event->cm_node->cm_id) ||
+ (event->cm_node->state == NES_CM_STATE_TSA))
+ break;
+ cm_event_mpa_reject(event);
+ nes_debug(NES_DBG_CM, "CM Event: REJECT\n");
+ break;
+
+ case NES_CM_EVENT_ABORTED:
+ if ((!event->cm_node->cm_id) ||
+ (event->cm_node->state == NES_CM_STATE_TSA))
+ break;
+ cm_event_connect_error(event);
+ nes_debug(NES_DBG_CM, "CM Event: ABORTED\n");
+ break;
+ case NES_CM_EVENT_DROPPED_PKT:
+ nes_debug(NES_DBG_CM, "CM Event: DROPPED PKT\n");
+ break;
+ default:
+ nes_debug(NES_DBG_CM, "CM Event: UNKNOWN EVENT TYPE\n");
+ break;
+ }
+
+ atomic_dec(&cm_core->events_posted);
+ event->cm_info.cm_id->rem_ref(event->cm_info.cm_id);
+ rem_ref_cm_node(cm_core, event->cm_node);
+ kfree(event);
+
+ return;
+}
diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h
new file mode 100644
index 000000000..f522cf639
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_cm.h
@@ -0,0 +1,476 @@
+/*
+ * Copyright (c) 2006 - 2014 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef NES_CM_H
+#define NES_CM_H
+
+#define QUEUE_EVENTS
+
+#define NES_MANAGE_APBVT_DEL 0
+#define NES_MANAGE_APBVT_ADD 1
+
+#define NES_MPA_REQUEST_ACCEPT 1
+#define NES_MPA_REQUEST_REJECT 2
+
+/* IETF MPA -- defines, enums, structs */
+#define IEFT_MPA_KEY_REQ "MPA ID Req Frame"
+#define IEFT_MPA_KEY_REP "MPA ID Rep Frame"
+#define IETF_MPA_KEY_SIZE 16
+#define IETF_MPA_VERSION 1
+#define IETF_MAX_PRIV_DATA_LEN 512
+#define IETF_MPA_FRAME_SIZE 20
+#define IETF_RTR_MSG_SIZE 4
+#define IETF_MPA_V2_FLAG 0x10
+
+/* IETF RTR MSG Fields */
+#define IETF_PEER_TO_PEER 0x8000
+#define IETF_FLPDU_ZERO_LEN 0x4000
+#define IETF_RDMA0_WRITE 0x8000
+#define IETF_RDMA0_READ 0x4000
+#define IETF_NO_IRD_ORD 0x3FFF
+#define NES_MAX_IRD 0x40
+#define NES_MAX_ORD 0x7F
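+
+/*
+ * Editor's note: in the MPA v2 RTR message below, the low 14 bits of
+ * ctrl_ird/ctrl_ord carry the IRD/ORD values (IETF_NO_IRD_ORD is the
+ * mask), while the upper bits carry the peer-to-peer and zero-length-FPDU
+ * flags (ctrl_ird) and the RDMA0 first-operation write/read selection
+ * (ctrl_ord) defined above.
+ */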
+
+enum ietf_mpa_flags {
+ IETF_MPA_FLAGS_MARKERS = 0x80, /* receive Markers */
+ IETF_MPA_FLAGS_CRC = 0x40, /* CRC in use */
+ IETF_MPA_FLAGS_REJECT = 0x20, /* Reject */
+};
+
+struct ietf_mpa_v1 {
+ u8 key[IETF_MPA_KEY_SIZE];
+ u8 flags;
+ u8 rev;
+ __be16 priv_data_len;
+ u8 priv_data[0];
+};
+
+#define ietf_mpa_req_resp_frame ietf_mpa_frame
+
+struct ietf_rtr_msg {
+ __be16 ctrl_ird;
+ __be16 ctrl_ord;
+};
+
+struct ietf_mpa_v2 {
+ u8 key[IETF_MPA_KEY_SIZE];
+ u8 flags;
+ u8 rev;
+ __be16 priv_data_len;
+ struct ietf_rtr_msg rtr_msg;
+ u8 priv_data[0];
+};
+
+struct nes_v4_quad {
+ u32 rsvd0;
+ __le32 DstIpAdrIndex; /* Only most significant 5 bits are valid */
+ __be32 SrcIpadr;
+ __be16 TcpPorts[2]; /* src is low, dest is high */
+};
+
+struct nes_cm_node;
+enum nes_timer_type {
+ NES_TIMER_TYPE_SEND,
+ NES_TIMER_TYPE_RECV,
+ NES_TIMER_NODE_CLEANUP,
+ NES_TIMER_TYPE_CLOSE,
+};
+
+#define NES_PASSIVE_STATE_INDICATED 0
+#define NES_DO_NOT_SEND_RESET_EVENT 1
+#define NES_SEND_RESET_EVENT 2
+
+#define MAX_NES_IFS 4
+
+#define SET_ACK 1
+#define SET_SYN 2
+#define SET_FIN 4
+#define SET_RST 8
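+
+/*
+ * Editor's note: the SET_* values are bit flags that are OR'ed together
+ * when building outgoing TCP segments, e.g. (SET_SYN | SET_ACK) for a
+ * SYN-ACK or (SET_ACK | SET_FIN) when closing.
+ */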
+
+#define TCP_OPTIONS_PADDING 3
+
+struct option_base {
+ u8 optionnum;
+ u8 length;
+};
+
+enum option_numbers {
+ OPTION_NUMBER_END,
+ OPTION_NUMBER_NONE,
+ OPTION_NUMBER_MSS,
+ OPTION_NUMBER_WINDOW_SCALE,
+ OPTION_NUMBER_SACK_PERM,
+ OPTION_NUMBER_SACK,
+ OPTION_NUMBER_WRITE0 = 0xbc
+};
+
+struct option_mss {
+ u8 optionnum;
+ u8 length;
+ __be16 mss;
+};
+
+struct option_windowscale {
+ u8 optionnum;
+ u8 length;
+ u8 shiftcount;
+};
+
+union all_known_options {
+ char as_end;
+ struct option_base as_base;
+ struct option_mss as_mss;
+ struct option_windowscale as_windowscale;
+};
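+
+/*
+ * Editor's sketch (not driver code): walking a received TCP options block
+ * with the types above.  Assumes optloc points at the first option byte
+ * and optsize is the total number of option bytes; a NOP
+ * (OPTION_NUMBER_NONE) has no length byte, hence the bare offset++:
+ *
+ *	while (offset < optsize) {
+ *		union all_known_options *opt =
+ *			(union all_known_options *)(optloc + offset);
+ *		if (opt->as_end == OPTION_NUMBER_END)
+ *			break;
+ *		if (opt->as_base.optionnum == OPTION_NUMBER_NONE) {
+ *			offset++;
+ *			continue;
+ *		}
+ *		switch (opt->as_base.optionnum) {
+ *		case OPTION_NUMBER_MSS:
+ *			mss = ntohs(opt->as_mss.mss);
+ *			break;
+ *		case OPTION_NUMBER_WINDOW_SCALE:
+ *			wscale = opt->as_windowscale.shiftcount;
+ *			break;
+ *		}
+ *		offset += opt->as_base.length;
+ *	}
+ */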
+
+struct nes_timer_entry {
+ struct list_head list;
+ unsigned long timetosend; /* jiffies */
+ struct sk_buff *skb;
+ u32 type;
+ u32 retrycount;
+ u32 retranscount;
+ u32 context;
+ u32 seq_num;
+ u32 send_retrans;
+ int close_when_complete;
+ struct net_device *netdev;
+};
+
+#define NES_DEFAULT_RETRYS 64
+#define NES_DEFAULT_RETRANS 8
+#ifdef CONFIG_INFINIBAND_NES_DEBUG
+#define NES_RETRY_TIMEOUT (1000*HZ/1000)
+#else
+#define NES_RETRY_TIMEOUT (3000*HZ/1000)
+#endif
+#define NES_SHORT_TIME (10)
+#define NES_LONG_TIME (2000*HZ/1000)
+#define NES_MAX_TIMEOUT ((unsigned long) (12*HZ))
+
+#define NES_CM_HASHTABLE_SIZE 1024
+#define NES_CM_TCP_TIMER_INTERVAL 3000
+#define NES_CM_DEFAULT_MTU 1540
+#define NES_CM_DEFAULT_FRAME_CNT 10
+#define NES_CM_THREAD_STACK_SIZE 256
+#define NES_CM_DEFAULT_RCV_WND 64240 /* before we know that window scaling is allowed */
+#define NES_CM_DEFAULT_RCV_WND_SCALED 256960 /* after we know that window scaling is allowed */
+#define NES_CM_DEFAULT_RCV_WND_SCALE 2
+#define NES_CM_DEFAULT_FREE_PKTS 0x000A
+#define NES_CM_FREE_PKT_LO_WATERMARK 2
+
+#define NES_CM_DEFAULT_MSS 536
+
+#define NES_CM_DEF_SEQ 0x159bf75f
+#define NES_CM_DEF_LOCAL_ID 0x3b47
+
+#define NES_CM_DEF_SEQ2 0x18ed5740
+#define NES_CM_DEF_LOCAL_ID2 0xb807
+#define MAX_CM_BUFFER (IETF_MPA_FRAME_SIZE + IETF_RTR_MSG_SIZE + IETF_MAX_PRIV_DATA_LEN)
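+
+/*
+ * With the sizes above this works out to 20 + 4 + 512 = 536 bytes, large
+ * enough for either MPA revision plus the maximum private data payload;
+ * mpa_frame_buf[] in struct nes_cm_node (below) is sized with it.
+ */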
+
+typedef u32 nes_addr_t;
+
+#define nes_cm_tsa_context nes_qp_context
+
+struct nes_qp;
+
+/* cm node transition states */
+enum nes_cm_node_state {
+ NES_CM_STATE_UNKNOWN,
+ NES_CM_STATE_INITED,
+ NES_CM_STATE_LISTENING,
+ NES_CM_STATE_SYN_RCVD,
+ NES_CM_STATE_SYN_SENT,
+ NES_CM_STATE_ONE_SIDE_ESTABLISHED,
+ NES_CM_STATE_ESTABLISHED,
+ NES_CM_STATE_ACCEPTING,
+ NES_CM_STATE_MPAREQ_SENT,
+ NES_CM_STATE_MPAREQ_RCVD,
+ NES_CM_STATE_MPAREJ_RCVD,
+ NES_CM_STATE_TSA,
+ NES_CM_STATE_FIN_WAIT1,
+ NES_CM_STATE_FIN_WAIT2,
+ NES_CM_STATE_CLOSE_WAIT,
+ NES_CM_STATE_TIME_WAIT,
+ NES_CM_STATE_LAST_ACK,
+ NES_CM_STATE_CLOSING,
+ NES_CM_STATE_LISTENER_DESTROYED,
+ NES_CM_STATE_CLOSED
+};
+
+enum mpa_frame_version {
+ IETF_MPA_V1 = 1,
+ IETF_MPA_V2 = 2
+};
+
+enum mpa_frame_key {
+ MPA_KEY_REQUEST,
+ MPA_KEY_REPLY
+};
+
+enum send_rdma0 {
+ SEND_RDMA_READ_ZERO = 1,
+ SEND_RDMA_WRITE_ZERO = 2
+};
+
+enum nes_tcpip_pkt_type {
+ NES_PKT_TYPE_UNKNOWN,
+ NES_PKT_TYPE_SYN,
+ NES_PKT_TYPE_SYNACK,
+ NES_PKT_TYPE_ACK,
+ NES_PKT_TYPE_FIN,
+ NES_PKT_TYPE_RST
+};
+
+
+/* type of nes connection */
+enum nes_cm_conn_type {
+ NES_CM_IWARP_CONN_TYPE,
+};
+
+/* CM context params */
+struct nes_cm_tcp_context {
+ u8 client;
+
+ u32 loc_seq_num;
+ u32 loc_ack_num;
+ u32 rem_ack_num;
+ u32 rcv_nxt;
+
+ u32 loc_id;
+ u32 rem_id;
+
+ u32 snd_wnd;
+ u32 max_snd_wnd;
+
+ u32 rcv_wnd;
+ u32 mss;
+ u8 snd_wscale;
+ u8 rcv_wscale;
+
+ struct nes_cm_tsa_context tsa_cntxt;
+ struct timeval sent_ts;
+};
+
+
+enum nes_cm_listener_state {
+ NES_CM_LISTENER_PASSIVE_STATE = 1,
+ NES_CM_LISTENER_ACTIVE_STATE = 2,
+ NES_CM_LISTENER_EITHER_STATE = 3
+};
+
+struct nes_cm_listener {
+ struct list_head list;
+ struct nes_cm_core *cm_core;
+ u8 loc_mac[ETH_ALEN];
+ nes_addr_t loc_addr, mapped_loc_addr;
+ u16 loc_port, mapped_loc_port;
+ struct iw_cm_id *cm_id;
+ enum nes_cm_conn_type conn_type;
+ atomic_t ref_count;
+ struct nes_vnic *nesvnic;
+ atomic_t pend_accepts_cnt;
+ int backlog;
+ enum nes_cm_listener_state listener_state;
+ u32 reused_node;
+};
+
+/* per connection node and node state information */
+struct nes_cm_node {
+ nes_addr_t loc_addr, rem_addr;
+ nes_addr_t mapped_loc_addr, mapped_rem_addr;
+ u16 loc_port, rem_port;
+ u16 mapped_loc_port, mapped_rem_port;
+
+ u8 loc_mac[ETH_ALEN];
+ u8 rem_mac[ETH_ALEN];
+
+ enum nes_cm_node_state state;
+ struct nes_cm_tcp_context tcp_cntxt;
+ struct nes_cm_core *cm_core;
+ struct sk_buff_head resend_list;
+ atomic_t ref_count;
+ struct net_device *netdev;
+
+ struct nes_cm_node *loopbackpartner;
+
+ struct nes_timer_entry *send_entry;
+ struct nes_timer_entry *recv_entry;
+ spinlock_t retrans_list_lock;
+ enum send_rdma0 send_rdma0_op;
+
+ union {
+ struct ietf_mpa_v1 mpa_frame;
+ struct ietf_mpa_v2 mpa_v2_frame;
+ u8 mpa_frame_buf[MAX_CM_BUFFER];
+ };
+ enum mpa_frame_version mpa_frame_rev;
+ u16 ird_size;
+ u16 ord_size;
+ u16 mpav2_ird_ord;
+
+ u16 mpa_frame_size;
+ struct iw_cm_id *cm_id;
+ struct list_head list;
+ int accelerated;
+ struct nes_cm_listener *listener;
+ enum nes_cm_conn_type conn_type;
+ struct nes_vnic *nesvnic;
+ int apbvt_set;
+ int accept_pend;
+ struct list_head timer_entry;
+ struct list_head reset_entry;
+ struct nes_qp *nesqp;
+ atomic_t passive_state;
+};
+
+/*
+ * structure for client or CM to fill when making CM api calls.
+ * - only need to set relevant data, based on op.
+ */
+struct nes_cm_info {
+ union {
+ struct iw_cm_id *cm_id;
+ struct net_device *netdev;
+ };
+
+ u16 loc_port;
+ u16 rem_port;
+ nes_addr_t loc_addr;
+ nes_addr_t rem_addr;
+ u16 mapped_loc_port;
+ u16 mapped_rem_port;
+ nes_addr_t mapped_loc_addr;
+ nes_addr_t mapped_rem_addr;
+
+ enum nes_cm_conn_type conn_type;
+ int backlog;
+};
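+
+/*
+ * Editor's sketch: filling nes_cm_info for a listen request.  Only the
+ * fields relevant to the op need to be set; the variable names and the
+ * ntohl/ntohs byte-order handling here are assumptions for illustration,
+ * the driver's own call sites in nes_cm.c are authoritative:
+ *
+ *	struct nes_cm_info cm_info;
+ *
+ *	memset(&cm_info, 0, sizeof(cm_info));
+ *	cm_info.cm_id = cm_id;
+ *	cm_info.loc_addr = ntohl(laddr->sin_addr.s_addr);
+ *	cm_info.loc_port = ntohs(laddr->sin_port);
+ *	cm_info.conn_type = NES_CM_IWARP_CONN_TYPE;
+ *	cm_info.backlog = backlog;
+ */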
+
+/* CM event codes */
+enum nes_cm_event_type {
+ NES_CM_EVENT_UNKNOWN,
+ NES_CM_EVENT_ESTABLISHED,
+ NES_CM_EVENT_MPA_REQ,
+ NES_CM_EVENT_MPA_CONNECT,
+ NES_CM_EVENT_MPA_ACCEPT,
+ NES_CM_EVENT_MPA_REJECT,
+ NES_CM_EVENT_MPA_ESTABLISHED,
+ NES_CM_EVENT_CONNECTED,
+ NES_CM_EVENT_CLOSED,
+ NES_CM_EVENT_RESET,
+ NES_CM_EVENT_DROPPED_PKT,
+ NES_CM_EVENT_CLOSE_IMMED,
+ NES_CM_EVENT_CLOSE_HARD,
+ NES_CM_EVENT_CLOSE_CLEAN,
+ NES_CM_EVENT_ABORTED,
+ NES_CM_EVENT_SEND_FIRST
+};
+
+/* event to post to CM event handler */
+struct nes_cm_event {
+ enum nes_cm_event_type type;
+
+ struct nes_cm_info cm_info;
+ struct work_struct event_work;
+ struct nes_cm_node *cm_node;
+};
+
+struct nes_cm_core {
+ enum nes_cm_node_state state;
+
+ atomic_t listen_node_cnt;
+ struct nes_cm_node listen_list;
+ spinlock_t listen_list_lock;
+
+ u32 mtu;
+ u32 free_tx_pkt_max;
+ u32 rx_pkt_posted;
+ atomic_t ht_node_cnt;
+ struct list_head connected_nodes;
+ /* struct list_head hashtable[NES_CM_HASHTABLE_SIZE]; */
+ spinlock_t ht_lock;
+
+ struct timer_list tcp_timer;
+
+ struct nes_cm_ops *api;
+
+ int (*post_event)(struct nes_cm_event *event);
+ atomic_t events_posted;
+ struct workqueue_struct *event_wq;
+ struct workqueue_struct *disconn_wq;
+
+ atomic_t node_cnt;
+ u64 aborted_connects;
+ u32 options;
+
+ struct nes_cm_node *current_listen_node;
+};
+
+
+#define NES_CM_SET_PKT_SIZE (1 << 1)
+#define NES_CM_SET_FREE_PKT_Q_SIZE (1 << 2)
+
+/* CM ops/API for client interface */
+struct nes_cm_ops {
+ int (*accelerated)(struct nes_cm_core *, struct nes_cm_node *);
+ struct nes_cm_listener * (*listen)(struct nes_cm_core *, struct nes_vnic *,
+ struct nes_cm_info *);
+ int (*stop_listener)(struct nes_cm_core *, struct nes_cm_listener *);
+ struct nes_cm_node * (*connect)(struct nes_cm_core *,
+ struct nes_vnic *, u16, void *,
+ struct nes_cm_info *);
+ int (*close)(struct nes_cm_core *, struct nes_cm_node *);
+ int (*accept)(struct nes_cm_core *, struct nes_cm_node *);
+ int (*reject)(struct nes_cm_core *, struct nes_cm_node *);
+ int (*recv_pkt)(struct nes_cm_core *, struct nes_vnic *,
+ struct sk_buff *);
+ int (*destroy_cm_core)(struct nes_cm_core *);
+ int (*get)(struct nes_cm_core *);
+ int (*set)(struct nes_cm_core *, u32, u32);
+};
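+
+/*
+ * Editor's note: struct nes_cm_core keeps a pointer to this table in its
+ * api member, so the iWARP CM entry points below are expected to dispatch
+ * through it; a sketch of an active connect (illustrative only, argument
+ * names are assumptions):
+ *
+ *	cm_node = cm_core->api->connect(cm_core, nesvnic,
+ *					conn_param->private_data_len,
+ *					(void *)conn_param->private_data,
+ *					&cm_info);
+ */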
+
+int schedule_nes_timer(struct nes_cm_node *, struct sk_buff *,
+ enum nes_timer_type, int, int);
+
+int nes_accept(struct iw_cm_id *, struct iw_cm_conn_param *);
+int nes_reject(struct iw_cm_id *, const void *, u8);
+int nes_connect(struct iw_cm_id *, struct iw_cm_conn_param *);
+int nes_create_listen(struct iw_cm_id *, int);
+int nes_destroy_listen(struct iw_cm_id *);
+
+int nes_cm_recv(struct sk_buff *, struct net_device *);
+int nes_cm_start(void);
+int nes_cm_stop(void);
+int nes_add_ref_cm_node(struct nes_cm_node *cm_node);
+int nes_rem_ref_cm_node(struct nes_cm_node *cm_node);
+
+#endif /* NES_CM_H */
diff --git a/drivers/infiniband/hw/nes/nes_context.h b/drivers/infiniband/hw/nes/nes_context.h
new file mode 100644
index 000000000..a69eef16d
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_context.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef NES_CONTEXT_H
+#define NES_CONTEXT_H
+
+struct nes_qp_context {
+ __le32 misc;
+ __le32 cqs;
+ __le32 sq_addr_low;
+ __le32 sq_addr_high;
+ __le32 rq_addr_low;
+ __le32 rq_addr_high;
+ __le32 misc2;
+ __le16 tcpPorts[2];
+ __le32 ip0;
+ __le32 ip1;
+ __le32 ip2;
+ __le32 ip3;
+ __le32 mss;
+ __le32 arp_index_vlan;
+ __le32 tcp_state_flow_label;
+ __le32 pd_index_wscale;
+ __le32 keepalive;
+ u32 ts_recent;
+ u32 ts_age;
+ __le32 snd_nxt;
+ __le32 snd_wnd;
+ __le32 rcv_nxt;
+ __le32 rcv_wnd;
+ __le32 snd_max;
+ __le32 snd_una;
+ u32 srtt;
+ __le32 rttvar;
+ __le32 ssthresh;
+ __le32 cwnd;
+ __le32 snd_wl1;
+ __le32 snd_wl2;
+ __le32 max_snd_wnd;
+ __le32 ts_val_delta;
+ u32 retransmit;
+ u32 probe_cnt;
+ u32 hte_index;
+ __le32 q2_addr_low;
+ __le32 q2_addr_high;
+ __le32 ird_index;
+ u32 Rsvd3;
+ __le32 ird_ord_sizes;
+ u32 mrkr_offset;
+ __le32 aeq_token_low;
+ __le32 aeq_token_high;
+};
+
+/* QP Context Misc Field */
+
+#define NES_QPCONTEXT_MISC_IWARP_VER_MASK 0x00000003
+#define NES_QPCONTEXT_MISC_IWARP_VER_SHIFT 0
+#define NES_QPCONTEXT_MISC_EFB_SIZE_MASK 0x000000C0
+#define NES_QPCONTEXT_MISC_EFB_SIZE_SHIFT 6
+#define NES_QPCONTEXT_MISC_RQ_SIZE_MASK 0x00000300
+#define NES_QPCONTEXT_MISC_RQ_SIZE_SHIFT 8
+#define NES_QPCONTEXT_MISC_SQ_SIZE_MASK 0x00000c00
+#define NES_QPCONTEXT_MISC_SQ_SIZE_SHIFT 10
+#define NES_QPCONTEXT_MISC_PCI_FCN_MASK 0x00007000
+#define NES_QPCONTEXT_MISC_PCI_FCN_SHIFT 12
+#define NES_QPCONTEXT_MISC_DUP_ACKS_MASK 0x00070000
+#define NES_QPCONTEXT_MISC_DUP_ACKS_SHIFT 16
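+
+/*
+ * Editor's note: each MASK/SHIFT pair above describes a bit-field inside
+ * the little-endian misc word of struct nes_qp_context; a field is read
+ * and rewritten as in this sketch (illustrative only, the pointer name is
+ * an assumption and the size encoding comes from enum nes_qp_acc_wq_sizes
+ * below):
+ *
+ *	u32 misc = le32_to_cpu(nesqp_context->misc);
+ *	u32 sq_size = (misc & NES_QPCONTEXT_MISC_SQ_SIZE_MASK) >>
+ *		      NES_QPCONTEXT_MISC_SQ_SIZE_SHIFT;
+ *
+ *	misc &= ~NES_QPCONTEXT_MISC_SQ_SIZE_MASK;
+ *	misc |= (HCONTEXT_TSA_WQ_SIZE_128 <<
+ *		 NES_QPCONTEXT_MISC_SQ_SIZE_SHIFT) &
+ *		NES_QPCONTEXT_MISC_SQ_SIZE_MASK;
+ *	nesqp_context->misc = cpu_to_le32(misc);
+ */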
+
+enum nes_qp_context_misc_bits {
+ NES_QPCONTEXT_MISC_RX_WQE_SIZE = 0x00000004,
+ NES_QPCONTEXT_MISC_IPV4 = 0x00000008,
+ NES_QPCONTEXT_MISC_DO_NOT_FRAG = 0x00000010,
+ NES_QPCONTEXT_MISC_INSERT_VLAN = 0x00000020,
+ NES_QPCONTEXT_MISC_DROS = 0x00008000,
+ NES_QPCONTEXT_MISC_WSCALE = 0x00080000,
+ NES_QPCONTEXT_MISC_KEEPALIVE = 0x00100000,
+ NES_QPCONTEXT_MISC_TIMESTAMP = 0x00200000,
+ NES_QPCONTEXT_MISC_SACK = 0x00400000,
+ NES_QPCONTEXT_MISC_RDMA_WRITE_EN = 0x00800000,
+ NES_QPCONTEXT_MISC_RDMA_READ_EN = 0x01000000,
+ NES_QPCONTEXT_MISC_WBIND_EN = 0x10000000,
+ NES_QPCONTEXT_MISC_FAST_REGISTER_EN = 0x20000000,
+ NES_QPCONTEXT_MISC_PRIV_EN = 0x40000000,
+ NES_QPCONTEXT_MISC_NO_NAGLE = 0x80000000
+};
+
+enum nes_qp_acc_wq_sizes {
+ HCONTEXT_TSA_WQ_SIZE_4 = 0,
+ HCONTEXT_TSA_WQ_SIZE_32 = 1,
+ HCONTEXT_TSA_WQ_SIZE_128 = 2,
+ HCONTEXT_TSA_WQ_SIZE_512 = 3
+};
+
+/* QP Context Misc2 Fields */
+#define NES_QPCONTEXT_MISC2_TTL_MASK 0x000000ff
+#define NES_QPCONTEXT_MISC2_TTL_SHIFT 0
+#define NES_QPCONTEXT_MISC2_HOP_LIMIT_MASK 0x000000ff
+#define NES_QPCONTEXT_MISC2_HOP_LIMIT_SHIFT 0
+#define NES_QPCONTEXT_MISC2_LIMIT_MASK 0x00000300
+#define NES_QPCONTEXT_MISC2_LIMIT_SHIFT 8
+#define NES_QPCONTEXT_MISC2_NIC_INDEX_MASK 0x0000fc00
+#define NES_QPCONTEXT_MISC2_NIC_INDEX_SHIFT 10
+#define NES_QPCONTEXT_MISC2_SRC_IP_MASK 0x001f0000
+#define NES_QPCONTEXT_MISC2_SRC_IP_SHIFT 16
+#define NES_QPCONTEXT_MISC2_TOS_MASK 0xff000000
+#define NES_QPCONTEXT_MISC2_TOS_SHIFT 24
+#define NES_QPCONTEXT_MISC2_TRAFFIC_CLASS_MASK 0xff000000
+#define NES_QPCONTEXT_MISC2_TRAFFIC_CLASS_SHIFT 24
+
+/* QP Context Tcp State/Flow Label Fields */
+#define NES_QPCONTEXT_TCPFLOW_FLOW_LABEL_MASK 0x000fffff
+#define NES_QPCONTEXT_TCPFLOW_FLOW_LABEL_SHIFT 0
+#define NES_QPCONTEXT_TCPFLOW_TCP_STATE_MASK 0xf0000000
+#define NES_QPCONTEXT_TCPFLOW_TCP_STATE_SHIFT 28
+
+enum nes_qp_tcp_state {
+ NES_QPCONTEXT_TCPSTATE_CLOSED = 1,
+ NES_QPCONTEXT_TCPSTATE_EST = 5,
+ NES_QPCONTEXT_TCPSTATE_TIME_WAIT = 11,
+};
+
+/* QP Context PD Index/wscale Fields */
+#define NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_MASK 0x0000000f
+#define NES_QPCONTEXT_PDWSCALE_RCV_WSCALE_SHIFT 0
+#define NES_QPCONTEXT_PDWSCALE_SND_WSCALE_MASK 0x00000f00
+#define NES_QPCONTEXT_PDWSCALE_SND_WSCALE_SHIFT 8
+#define NES_QPCONTEXT_PDWSCALE_PDINDEX_MASK 0xffff0000
+#define NES_QPCONTEXT_PDWSCALE_PDINDEX_SHIFT 16
+
+/* QP Context Keepalive Fields */
+#define NES_QPCONTEXT_KEEPALIVE_DELTA_MASK 0x0000ffff
+#define NES_QPCONTEXT_KEEPALIVE_DELTA_SHIFT 0
+#define NES_QPCONTEXT_KEEPALIVE_PROBE_CNT_MASK 0x00ff0000
+#define NES_QPCONTEXT_KEEPALIVE_PROBE_CNT_SHIFT 16
+#define NES_QPCONTEXT_KEEPALIVE_INTV_MASK 0xff000000
+#define NES_QPCONTEXT_KEEPALIVE_INTV_SHIFT 24
+
+/* QP Context ORD/IRD Fields */
+#define NES_QPCONTEXT_ORDIRD_ORDSIZE_MASK 0x0000007f
+#define NES_QPCONTEXT_ORDIRD_ORDSIZE_SHIFT 0
+#define NES_QPCONTEXT_ORDIRD_IRDSIZE_MASK 0x00030000
+#define NES_QPCONTEXT_ORDIRD_IRDSIZE_SHIFT 16
+#define NES_QPCONTEXT_ORDIRD_IWARP_MODE_MASK 0x30000000
+#define NES_QPCONTEXT_ORDIRD_IWARP_MODE_SHIFT 28
+
+enum nes_ord_ird_bits {
+ NES_QPCONTEXT_ORDIRD_WRPDU = 0x02000000,
+ NES_QPCONTEXT_ORDIRD_LSMM_PRESENT = 0x04000000,
+ NES_QPCONTEXT_ORDIRD_ALSMM = 0x08000000,
+ NES_QPCONTEXT_ORDIRD_AAH = 0x40000000,
+ NES_QPCONTEXT_ORDIRD_RNMC = 0x80000000
+};
+
+enum nes_iwarp_qp_state {
+ NES_QPCONTEXT_IWARP_STATE_NONEXIST = 0,
+ NES_QPCONTEXT_IWARP_STATE_IDLE = 1,
+ NES_QPCONTEXT_IWARP_STATE_RTS = 2,
+ NES_QPCONTEXT_IWARP_STATE_CLOSING = 3,
+ NES_QPCONTEXT_IWARP_STATE_TERMINATE = 5,
+ NES_QPCONTEXT_IWARP_STATE_ERROR = 6
+};
+
+
+#endif /* NES_CONTEXT_H */
diff --git a/drivers/infiniband/hw/nes/nes_hw.c b/drivers/infiniband/hw/nes/nes_hw.c
new file mode 100644
index 000000000..02120d340
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_hw.c
@@ -0,0 +1,3937 @@
+/*
+ * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/if_vlan.h>
+#include <linux/inet_lro.h>
+#include <linux/slab.h>
+
+#include "nes.h"
+
+static unsigned int nes_lro_max_aggr = NES_LRO_MAX_AGGR;
+module_param(nes_lro_max_aggr, uint, 0444);
+MODULE_PARM_DESC(nes_lro_max_aggr, "NIC LRO max packet aggregation");
+
+static int wide_ppm_offset;
+module_param(wide_ppm_offset, int, 0644);
+MODULE_PARM_DESC(wide_ppm_offset, "Increase CX4 interface clock ppm offset, 0=100ppm (default), 1=300ppm");
+
+static u32 crit_err_count;
+u32 int_mod_timer_init;
+u32 int_mod_cq_depth_256;
+u32 int_mod_cq_depth_128;
+u32 int_mod_cq_depth_32;
+u32 int_mod_cq_depth_24;
+u32 int_mod_cq_depth_16;
+u32 int_mod_cq_depth_4;
+u32 int_mod_cq_depth_1;
+static const u8 nes_max_critical_error_count = 100;
+
+static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq);
+static void nes_init_csr_ne020(struct nes_device *nesdev, u8 hw_rev, u8 port_count);
+static int nes_init_serdes(struct nes_device *nesdev, u8 hw_rev, u8 port_count,
+ struct nes_adapter *nesadapter, u8 OneG_Mode);
+static void nes_nic_napi_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq);
+static void nes_process_aeq(struct nes_device *nesdev, struct nes_hw_aeq *aeq);
+static void nes_process_ceq(struct nes_device *nesdev, struct nes_hw_ceq *ceq);
+static void nes_process_iwarp_aeqe(struct nes_device *nesdev,
+ struct nes_hw_aeqe *aeqe);
+static void process_critical_error(struct nes_device *nesdev);
+static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number);
+static unsigned int nes_reset_adapter_ne020(struct nes_device *nesdev, u8 *OneG_Mode);
+static void nes_terminate_start_timer(struct nes_qp *nesqp);
+
+#ifdef CONFIG_INFINIBAND_NES_DEBUG
+static unsigned char *nes_iwarp_state_str[] = {
+ "Non-Existent",
+ "Idle",
+ "RTS",
+ "Closing",
+ "RSVD1",
+ "Terminate",
+ "Error",
+ "RSVD2",
+};
+
+static unsigned char *nes_tcp_state_str[] = {
+ "Non-Existent",
+ "Closed",
+ "Listen",
+ "SYN Sent",
+ "SYN Rcvd",
+ "Established",
+ "Close Wait",
+ "FIN Wait 1",
+ "Closing",
+ "Last Ack",
+ "FIN Wait 2",
+ "Time Wait",
+ "RSVD1",
+ "RSVD2",
+ "RSVD3",
+ "RSVD4",
+};
+#endif
+
+static inline void print_ip(struct nes_cm_node *cm_node)
+{
+ unsigned char *rem_addr;
+ if (cm_node) {
+ rem_addr = (unsigned char *)&cm_node->rem_addr;
+ printk(KERN_ERR PFX "Remote IP addr: %pI4\n", rem_addr);
+ }
+}
+
+/**
+ * nes_nic_init_timer_defaults
+ */
+void nes_nic_init_timer_defaults(struct nes_device *nesdev, u8 jumbomode)
+{
+ unsigned long flags;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer;
+
+ spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags);
+
+ shared_timer->timer_in_use_min = NES_NIC_FAST_TIMER_LOW;
+ shared_timer->timer_in_use_max = NES_NIC_FAST_TIMER_HIGH;
+ if (jumbomode) {
+ shared_timer->threshold_low = DEFAULT_JUMBO_NES_QL_LOW;
+ shared_timer->threshold_target = DEFAULT_JUMBO_NES_QL_TARGET;
+ shared_timer->threshold_high = DEFAULT_JUMBO_NES_QL_HIGH;
+ } else {
+ shared_timer->threshold_low = DEFAULT_NES_QL_LOW;
+ shared_timer->threshold_target = DEFAULT_NES_QL_TARGET;
+ shared_timer->threshold_high = DEFAULT_NES_QL_HIGH;
+ }
+
+ /* todo use netdev->mtu to set thresholds */
+ spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
+}
+
+
+/**
+ * nes_nic_init_timer
+ */
+static void nes_nic_init_timer(struct nes_device *nesdev)
+{
+ unsigned long flags;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer;
+
+ spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags);
+
+ if (shared_timer->timer_in_use_old == 0) {
+ nesdev->deepcq_count = 0;
+ shared_timer->timer_direction_upward = 0;
+ shared_timer->timer_direction_downward = 0;
+ shared_timer->timer_in_use = NES_NIC_FAST_TIMER;
+ shared_timer->timer_in_use_old = 0;
+
+ }
+ if (shared_timer->timer_in_use != shared_timer->timer_in_use_old) {
+ shared_timer->timer_in_use_old = shared_timer->timer_in_use;
+ nes_write32(nesdev->regs+NES_PERIODIC_CONTROL,
+ 0x80000000 | ((u32)(shared_timer->timer_in_use*8)));
+ }
+ /* todo use netdev->mtu to set thresholds */
+ spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
+}
+
+
+/**
+ * nes_nic_tune_timer
+ */
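+/*
+ * Editor's note: this is a simple hysteresis-based interrupt moderation
+ * tuner.  The CQE count seen since the last run is compared against the
+ * low/target/high thresholds set up in nes_nic_init_timer_defaults();
+ * several consecutive readings in the same direction (tracked by the
+ * timer_direction_* counters) are required before timer_in_use is nudged
+ * up or down, and the result is clamped between threshold_low and
+ * threshold_high at the end.
+ */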
+static void nes_nic_tune_timer(struct nes_device *nesdev)
+{
+ unsigned long flags;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer;
+ u16 cq_count = nesdev->currcq_count;
+
+ spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags);
+
+ if (shared_timer->cq_count_old <= cq_count)
+ shared_timer->cq_direction_downward = 0;
+ else
+ shared_timer->cq_direction_downward++;
+ shared_timer->cq_count_old = cq_count;
+ if (shared_timer->cq_direction_downward > NES_NIC_CQ_DOWNWARD_TREND) {
+ if (cq_count <= shared_timer->threshold_low &&
+ shared_timer->threshold_low > 4) {
+ shared_timer->threshold_low = shared_timer->threshold_low / 2;
+ shared_timer->cq_direction_downward = 0;
+ nesdev->currcq_count = 0;
+ spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
+ return;
+ }
+ }
+
+ if (cq_count > 1) {
+ nesdev->deepcq_count += cq_count;
+ if (cq_count <= shared_timer->threshold_low) { /* increase timer gently */
+ shared_timer->timer_direction_upward++;
+ shared_timer->timer_direction_downward = 0;
+ } else if (cq_count <= shared_timer->threshold_target) { /* balanced */
+ shared_timer->timer_direction_upward = 0;
+ shared_timer->timer_direction_downward = 0;
+ } else if (cq_count <= shared_timer->threshold_high) { /* decrease timer gently */
+ shared_timer->timer_direction_downward++;
+ shared_timer->timer_direction_upward = 0;
+ } else if (cq_count <= (shared_timer->threshold_high) * 2) {
+ shared_timer->timer_in_use -= 2;
+ shared_timer->timer_direction_upward = 0;
+ shared_timer->timer_direction_downward++;
+ } else {
+ shared_timer->timer_in_use -= 4;
+ shared_timer->timer_direction_upward = 0;
+ shared_timer->timer_direction_downward++;
+ }
+
+ if (shared_timer->timer_direction_upward > 3) { /* using history */
+ shared_timer->timer_in_use += 3;
+ shared_timer->timer_direction_upward = 0;
+ shared_timer->timer_direction_downward = 0;
+ }
+ if (shared_timer->timer_direction_downward > 5) { /* using history */
+ shared_timer->timer_in_use -= 4;
+ shared_timer->timer_direction_downward = 0;
+ shared_timer->timer_direction_upward = 0;
+ }
+ }
+
+ /* boundary checking */
+ if (shared_timer->timer_in_use > shared_timer->threshold_high)
+ shared_timer->timer_in_use = shared_timer->threshold_high;
+ else if (shared_timer->timer_in_use < shared_timer->threshold_low)
+ shared_timer->timer_in_use = shared_timer->threshold_low;
+
+ nesdev->currcq_count = 0;
+
+ spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
+}
+
+
+/**
+ * nes_init_adapter - initialize adapter
+ */
+struct nes_adapter *nes_init_adapter(struct nes_device *nesdev, u8 hw_rev)
+{
+ struct nes_adapter *nesadapter = NULL;
+ unsigned long num_pds;
+ u32 u32temp;
+ u32 port_count;
+ u16 max_rq_wrs;
+ u16 max_sq_wrs;
+ u32 max_mr;
+ u32 max_256pbl;
+ u32 max_4kpbl;
+ u32 max_qp;
+ u32 max_irrq;
+ u32 max_cq;
+ u32 hte_index_mask;
+ u32 adapter_size;
+ u32 arp_table_size;
+ u16 vendor_id;
+ u16 device_id;
+ u8 OneG_Mode;
+ u8 func_index;
+
+ /* search the list of existing adapters */
+ list_for_each_entry(nesadapter, &nes_adapter_list, list) {
+ nes_debug(NES_DBG_INIT, "Searching Adapter list for PCI devfn = 0x%X,"
+ " adapter PCI slot/bus = %u/%u, pci devices PCI slot/bus = %u/%u, .\n",
+ nesdev->pcidev->devfn,
+ PCI_SLOT(nesadapter->devfn),
+ nesadapter->bus_number,
+ PCI_SLOT(nesdev->pcidev->devfn),
+ nesdev->pcidev->bus->number );
+ if ((PCI_SLOT(nesadapter->devfn) == PCI_SLOT(nesdev->pcidev->devfn)) &&
+ (nesadapter->bus_number == nesdev->pcidev->bus->number)) {
+ nesadapter->ref_count++;
+ return nesadapter;
+ }
+ }
+
+ /* no adapter found */
+ num_pds = pci_resource_len(nesdev->pcidev, BAR_1) >> PAGE_SHIFT;
+ if ((hw_rev != NE020_REV) && (hw_rev != NE020_REV1)) {
+ nes_debug(NES_DBG_INIT, "NE020 driver detected unknown hardware revision 0x%x\n",
+ hw_rev);
+ return NULL;
+ }
+
+ nes_debug(NES_DBG_INIT, "Determine Soft Reset, QP_control=0x%x, CPU0=0x%x, CPU1=0x%x, CPU2=0x%x\n",
+ nes_read_indexed(nesdev, NES_IDX_QP_CONTROL + PCI_FUNC(nesdev->pcidev->devfn) * 8),
+ nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS),
+ nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS + 4),
+ nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS + 8));
+
+ nes_debug(NES_DBG_INIT, "Reset and init NE020\n");
+
+
+ if ((port_count = nes_reset_adapter_ne020(nesdev, &OneG_Mode)) == 0)
+ return NULL;
+
+ max_qp = nes_read_indexed(nesdev, NES_IDX_QP_CTX_SIZE);
+ nes_debug(NES_DBG_INIT, "QP_CTX_SIZE=%u\n", max_qp);
+
+ u32temp = nes_read_indexed(nesdev, NES_IDX_QUAD_HASH_TABLE_SIZE);
+ if (max_qp > ((u32)1 << (u32temp & 0x001f))) {
+ nes_debug(NES_DBG_INIT, "Reducing Max QPs to %u due to hash table size = 0x%08X\n",
+ max_qp, u32temp);
+ max_qp = (u32)1 << (u32temp & 0x001f);
+ }
+
+ hte_index_mask = ((u32)1 << ((u32temp & 0x001f)+1))-1;
+ nes_debug(NES_DBG_INIT, "Max QP = %u, hte_index_mask = 0x%08X.\n",
+ max_qp, hte_index_mask);
+
+ u32temp = nes_read_indexed(nesdev, NES_IDX_IRRQ_COUNT);
+
+ max_irrq = 1 << (u32temp & 0x001f);
+
+ if (max_qp > max_irrq) {
+ max_qp = max_irrq;
+ nes_debug(NES_DBG_INIT, "Reducing Max QPs to %u due to Available Q1s.\n",
+ max_qp);
+ }
+
+ /* there should be no reason to allocate more pds than qps */
+ if (num_pds > max_qp)
+ num_pds = max_qp;
+
+ u32temp = nes_read_indexed(nesdev, NES_IDX_MRT_SIZE);
+ max_mr = (u32)8192 << (u32temp & 0x7);
+
+ u32temp = nes_read_indexed(nesdev, NES_IDX_PBL_REGION_SIZE);
+ max_256pbl = (u32)1 << (u32temp & 0x0000001f);
+ max_4kpbl = (u32)1 << ((u32temp >> 16) & 0x0000001f);
+ max_cq = nes_read_indexed(nesdev, NES_IDX_CQ_CTX_SIZE);
+
+ u32temp = nes_read_indexed(nesdev, NES_IDX_ARP_CACHE_SIZE);
+ arp_table_size = 1 << u32temp;
+
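+ /*
+ * Editor's note: the adapter struct, all of its resource bitmaps (QPs,
+ * MRs, CQs, PDs, ARP entries) and the qp_table pointer array are carved
+ * out of a single allocation.  adapter_size below is the long-aligned
+ * struct size plus those trailing arrays; the allocated_* and qp_table
+ * pointers are aimed into that block further down.
+ */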
+ adapter_size = (sizeof(struct nes_adapter) +
+ (sizeof(unsigned long)-1)) & (~(sizeof(unsigned long)-1));
+ adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_qp);
+ adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_mr);
+ adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(max_cq);
+ adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(num_pds);
+ adapter_size += sizeof(unsigned long) * BITS_TO_LONGS(arp_table_size);
+ adapter_size += sizeof(struct nes_qp **) * max_qp;
+
+ /* allocate a new adapter struct */
+ nesadapter = kzalloc(adapter_size, GFP_KERNEL);
+ if (nesadapter == NULL) {
+ return NULL;
+ }
+
+ nes_debug(NES_DBG_INIT, "Allocating new nesadapter @ %p, size = %u (actual size = %u).\n",
+ nesadapter, (u32)sizeof(struct nes_adapter), adapter_size);
+
+ if (nes_read_eeprom_values(nesdev, nesadapter)) {
+ printk(KERN_ERR PFX "Unable to read EEPROM data.\n");
+ kfree(nesadapter);
+ return NULL;
+ }
+
+ nesadapter->vendor_id = (((u32) nesadapter->mac_addr_high) << 8) |
+ (nesadapter->mac_addr_low >> 24);
+
+ pci_bus_read_config_word(nesdev->pcidev->bus, nesdev->pcidev->devfn,
+ PCI_DEVICE_ID, &device_id);
+ nesadapter->vendor_part_id = device_id;
+
+ if (nes_init_serdes(nesdev, hw_rev, port_count, nesadapter,
+ OneG_Mode)) {
+ kfree(nesadapter);
+ return NULL;
+ }
+ nes_init_csr_ne020(nesdev, hw_rev, port_count);
+
+ memset(nesadapter->pft_mcast_map, 255,
+ sizeof nesadapter->pft_mcast_map);
+
+ /* populate the new nesadapter */
+ nesadapter->devfn = nesdev->pcidev->devfn;
+ nesadapter->bus_number = nesdev->pcidev->bus->number;
+ nesadapter->ref_count = 1;
+ nesadapter->timer_int_req = 0xffff0000;
+ nesadapter->OneG_Mode = OneG_Mode;
+ nesadapter->doorbell_start = nesdev->doorbell_region;
+
+ /* nesadapter->tick_delta = clk_divisor; */
+ nesadapter->hw_rev = hw_rev;
+ nesadapter->port_count = port_count;
+
+ nesadapter->max_qp = max_qp;
+ nesadapter->hte_index_mask = hte_index_mask;
+ nesadapter->max_irrq = max_irrq;
+ nesadapter->max_mr = max_mr;
+ nesadapter->max_256pbl = max_256pbl - 1;
+ nesadapter->max_4kpbl = max_4kpbl - 1;
+ nesadapter->max_cq = max_cq;
+ nesadapter->free_256pbl = max_256pbl - 1;
+ nesadapter->free_4kpbl = max_4kpbl - 1;
+ nesadapter->max_pd = num_pds;
+ nesadapter->arp_table_size = arp_table_size;
+
+ nesadapter->et_pkt_rate_low = NES_TIMER_ENABLE_LIMIT;
+ if (nes_drv_opt & NES_DRV_OPT_DISABLE_INT_MOD) {
+ nesadapter->et_use_adaptive_rx_coalesce = 0;
+ nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT;
+ nesadapter->et_rx_coalesce_usecs_irq = interrupt_mod_interval;
+ } else {
+ nesadapter->et_use_adaptive_rx_coalesce = 1;
+ nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT_DYNAMIC;
+ nesadapter->et_rx_coalesce_usecs_irq = 0;
+ printk(PFX "%s: Using Adaptive Interrupt Moderation\n", __func__);
+ }
+ /* Setup and enable the periodic timer */
+ if (nesadapter->et_rx_coalesce_usecs_irq)
+ nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, 0x80000000 |
+ ((u32)(nesadapter->et_rx_coalesce_usecs_irq * 8)));
+ else
+ nes_write32(nesdev->regs+NES_PERIODIC_CONTROL, 0x00000000);
+
+ nesadapter->base_pd = 1;
+
+ nesadapter->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY |
+ IB_DEVICE_MEM_WINDOW |
+ IB_DEVICE_MEM_MGT_EXTENSIONS;
+
+ nesadapter->allocated_qps = (unsigned long *)&(((unsigned char *)nesadapter)
+ [(sizeof(struct nes_adapter)+(sizeof(unsigned long)-1))&(~(sizeof(unsigned long)-1))]);
+ nesadapter->allocated_cqs = &nesadapter->allocated_qps[BITS_TO_LONGS(max_qp)];
+ nesadapter->allocated_mrs = &nesadapter->allocated_cqs[BITS_TO_LONGS(max_cq)];
+ nesadapter->allocated_pds = &nesadapter->allocated_mrs[BITS_TO_LONGS(max_mr)];
+ nesadapter->allocated_arps = &nesadapter->allocated_pds[BITS_TO_LONGS(num_pds)];
+ nesadapter->qp_table = (struct nes_qp **)(&nesadapter->allocated_arps[BITS_TO_LONGS(arp_table_size)]);
+
+
+ /* mark the usual suspect QPs, MR and CQs as in use */
+ for (u32temp = 0; u32temp < NES_FIRST_QPN; u32temp++) {
+ set_bit(u32temp, nesadapter->allocated_qps);
+ set_bit(u32temp, nesadapter->allocated_cqs);
+ }
+ set_bit(0, nesadapter->allocated_mrs);
+
+ for (u32temp = 0; u32temp < 20; u32temp++)
+ set_bit(u32temp, nesadapter->allocated_pds);
+ u32temp = nes_read_indexed(nesdev, NES_IDX_QP_MAX_CFG_SIZES);
+
+ max_rq_wrs = ((u32temp >> 8) & 3);
+ switch (max_rq_wrs) {
+ case 0:
+ max_rq_wrs = 4;
+ break;
+ case 1:
+ max_rq_wrs = 16;
+ break;
+ case 2:
+ max_rq_wrs = 32;
+ break;
+ case 3:
+ max_rq_wrs = 512;
+ break;
+ }
+
+ max_sq_wrs = (u32temp & 3);
+ switch (max_sq_wrs) {
+ case 0:
+ max_sq_wrs = 4;
+ break;
+ case 1:
+ max_sq_wrs = 16;
+ break;
+ case 2:
+ max_sq_wrs = 32;
+ break;
+ case 3:
+ max_sq_wrs = 512;
+ break;
+ }
+ nesadapter->max_qp_wr = min(max_rq_wrs, max_sq_wrs);
+ nesadapter->max_irrq_wr = (u32temp >> 16) & 3;
+
+ nesadapter->max_sge = 4;
+ nesadapter->max_cqe = 32766;
+
+ if (nes_read_eeprom_values(nesdev, nesadapter)) {
+ printk(KERN_ERR PFX "Unable to read EEPROM data.\n");
+ kfree(nesadapter);
+ return NULL;
+ }
+
+ u32temp = nes_read_indexed(nesdev, NES_IDX_TCP_TIMER_CONFIG);
+ nes_write_indexed(nesdev, NES_IDX_TCP_TIMER_CONFIG,
+ (u32temp & 0xff000000) | (nesadapter->tcp_timer_core_clk_divisor & 0x00ffffff));
+
+ /* setup port configuration */
+ if (nesadapter->port_count == 1) {
+ nesadapter->log_port = 0x00000000;
+ if (nes_drv_opt & NES_DRV_OPT_DUAL_LOGICAL_PORT)
+ nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000002);
+ else
+ nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000003);
+ } else {
+ if (nesadapter->phy_type[0] == NES_PHY_TYPE_PUMA_1G) {
+ nesadapter->log_port = 0x000000D8;
+ } else {
+ if (nesadapter->port_count == 2)
+ nesadapter->log_port = 0x00000044;
+ else
+ nesadapter->log_port = 0x000000e4;
+ }
+ nes_write_indexed(nesdev, NES_IDX_TX_POOL_SIZE, 0x00000003);
+ }
+
+ nes_write_indexed(nesdev, NES_IDX_NIC_LOGPORT_TO_PHYPORT,
+ nesadapter->log_port);
+ nes_debug(NES_DBG_INIT, "Probe time, LOG2PHY=%u\n",
+ nes_read_indexed(nesdev, NES_IDX_NIC_LOGPORT_TO_PHYPORT));
+
+ spin_lock_init(&nesadapter->resource_lock);
+ spin_lock_init(&nesadapter->phy_lock);
+ spin_lock_init(&nesadapter->pbl_lock);
+ spin_lock_init(&nesadapter->periodic_timer_lock);
+
+ INIT_LIST_HEAD(&nesadapter->nesvnic_list[0]);
+ INIT_LIST_HEAD(&nesadapter->nesvnic_list[1]);
+ INIT_LIST_HEAD(&nesadapter->nesvnic_list[2]);
+ INIT_LIST_HEAD(&nesadapter->nesvnic_list[3]);
+
+ if ((!nesadapter->OneG_Mode) && (nesadapter->port_count == 2)) {
+ u32 pcs_control_status0, pcs_control_status1;
+ u32 reset_value;
+ u32 i = 0;
+ u32 int_cnt = 0;
+ u32 ext_cnt = 0;
+ unsigned long flags;
+ u32 j = 0;
+
+ pcs_control_status0 = nes_read_indexed(nesdev,
+ NES_IDX_PHY_PCS_CONTROL_STATUS0);
+ pcs_control_status1 = nes_read_indexed(nesdev,
+ NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
+
+ for (i = 0; i < NES_MAX_LINK_CHECK; i++) {
+ pcs_control_status0 = nes_read_indexed(nesdev,
+ NES_IDX_PHY_PCS_CONTROL_STATUS0);
+ pcs_control_status1 = nes_read_indexed(nesdev,
+ NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
+ if ((0x0F000100 == (pcs_control_status0 & 0x0F000100))
+ || (0x0F000100 == (pcs_control_status1 & 0x0F000100)))
+ int_cnt++;
+ msleep(1);
+ }
+ if (int_cnt > 1) {
+ spin_lock_irqsave(&nesadapter->phy_lock, flags);
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F0C8);
+ mh_detected++;
+ reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET);
+ reset_value |= 0x0000003d;
+ nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value);
+
+ while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET)
+ & 0x00000040) != 0x00000040) && (j++ < 5000));
+ spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+
+ pcs_control_status0 = nes_read_indexed(nesdev,
+ NES_IDX_PHY_PCS_CONTROL_STATUS0);
+ pcs_control_status1 = nes_read_indexed(nesdev,
+ NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
+
+ for (i = 0; i < NES_MAX_LINK_CHECK; i++) {
+ pcs_control_status0 = nes_read_indexed(nesdev,
+ NES_IDX_PHY_PCS_CONTROL_STATUS0);
+ pcs_control_status1 = nes_read_indexed(nesdev,
+ NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
+ if ((0x0F000100 == (pcs_control_status0 & 0x0F000100))
+ || (0x0F000100 == (pcs_control_status1 & 0x0F000100))) {
+ if (++ext_cnt > int_cnt) {
+ spin_lock_irqsave(&nesadapter->phy_lock, flags);
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1,
+ 0x0000F088);
+ mh_detected++;
+ reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET);
+ reset_value |= 0x0000003d;
+ nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value);
+
+ while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET)
+ & 0x00000040) != 0x00000040) && (j++ < 5000));
+ spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+ break;
+ }
+ }
+ msleep(1);
+ }
+ }
+ }
+
+ if (nesadapter->hw_rev == NE020_REV) {
+ init_timer(&nesadapter->mh_timer);
+ nesadapter->mh_timer.function = nes_mh_fix;
+ nesadapter->mh_timer.expires = jiffies + (HZ/5); /* 200 ms */
+ nesadapter->mh_timer.data = (unsigned long)nesdev;
+ add_timer(&nesadapter->mh_timer);
+ } else {
+ nes_write32(nesdev->regs+NES_INTF_INT_STAT, 0x0f000000);
+ }
+
+ init_timer(&nesadapter->lc_timer);
+ nesadapter->lc_timer.function = nes_clc;
+ nesadapter->lc_timer.expires = jiffies + 3600 * HZ; /* 1 hour */
+ nesadapter->lc_timer.data = (unsigned long)nesdev;
+ add_timer(&nesadapter->lc_timer);
+
+ list_add_tail(&nesadapter->list, &nes_adapter_list);
+
+ for (func_index = 0; func_index < 8; func_index++) {
+ pci_bus_read_config_word(nesdev->pcidev->bus,
+ PCI_DEVFN(PCI_SLOT(nesdev->pcidev->devfn),
+ func_index), 0, &vendor_id);
+ if (vendor_id == 0xffff)
+ break;
+ }
+ nes_debug(NES_DBG_INIT, "%s %d functions found for %s.\n", __func__,
+ func_index, pci_name(nesdev->pcidev));
+ nesadapter->adapter_fcn_count = func_index;
+
+ return nesadapter;
+}
+
+
+/**
+ * nes_reset_adapter_ne020
+ */
+static unsigned int nes_reset_adapter_ne020(struct nes_device *nesdev, u8 *OneG_Mode)
+{
+ u32 port_count;
+ u32 u32temp;
+ u32 i;
+
+ u32temp = nes_read32(nesdev->regs+NES_SOFTWARE_RESET);
+ port_count = ((u32temp & 0x00000300) >> 8) + 1;
+ /* TODO: assuming that both SERDES are set the same for now */
+ *OneG_Mode = (u32temp & 0x00003c00) ? 0 : 1;
+ nes_debug(NES_DBG_INIT, "Initial Software Reset = 0x%08X, port_count=%u\n",
+ u32temp, port_count);
+ if (*OneG_Mode)
+ nes_debug(NES_DBG_INIT, "Running in 1G mode.\n");
+ u32temp &= 0xff00ffc0;
+ switch (port_count) {
+ case 1:
+ u32temp |= 0x00ee0000;
+ break;
+ case 2:
+ u32temp |= 0x00cc0000;
+ break;
+ case 4:
+ u32temp |= 0x00000000;
+ break;
+ default:
+ return 0;
+ }
+
+ /* check and do full reset if needed */
+ if (nes_read_indexed(nesdev, NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8))) {
+ nes_debug(NES_DBG_INIT, "Issuing Full Soft reset = 0x%08X\n", u32temp | 0xd);
+ nes_write32(nesdev->regs+NES_SOFTWARE_RESET, u32temp | 0xd);
+
+ i = 0;
+ while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) & 0x00000040) == 0) && i++ < 10000)
+ mdelay(1);
+ if (i > 10000) {
+ nes_debug(NES_DBG_INIT, "Did not see full soft reset done.\n");
+ return 0;
+ }
+
+ i = 0;
+ while ((nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS) != 0x80) && i++ < 10000)
+ mdelay(1);
+ if (i > 10000) {
+ printk(KERN_ERR PFX "Internal CPU not ready, status = %02X\n",
+ nes_read_indexed(nesdev, NES_IDX_INT_CPU_STATUS));
+ return 0;
+ }
+ }
+
+ /* port reset */
+ switch (port_count) {
+ case 1:
+ u32temp |= 0x00ee0010;
+ break;
+ case 2:
+ u32temp |= 0x00cc0030;
+ break;
+ case 4:
+ u32temp |= 0x00000030;
+ break;
+ }
+
+ nes_debug(NES_DBG_INIT, "Issuing Port Soft reset = 0x%08X\n", u32temp | 0xd);
+ nes_write32(nesdev->regs+NES_SOFTWARE_RESET, u32temp | 0xd);
+
+ i = 0;
+ while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET) & 0x00000040) == 0) && i++ < 10000)
+ mdelay(1);
+ if (i > 10000) {
+ nes_debug(NES_DBG_INIT, "Did not see port soft reset done.\n");
+ return 0;
+ }
+
+ /* serdes 0 */
+ i = 0;
+ while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0)
+ & 0x0000000f)) != 0x0000000f) && i++ < 5000)
+ mdelay(1);
+ if (i > 5000) {
+ nes_debug(NES_DBG_INIT, "Serdes 0 not ready, status=%x\n", u32temp);
+ return 0;
+ }
+
+ /* serdes 1 */
+ if (port_count > 1) {
+ i = 0;
+ while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS1)
+ & 0x0000000f)) != 0x0000000f) && i++ < 5000)
+ mdelay(1);
+ if (i > 5000) {
+ nes_debug(NES_DBG_INIT, "Serdes 1 not ready, status=%x\n", u32temp);
+ return 0;
+ }
+ }
+
+ return port_count;
+}
+
+
+/**
+ * nes_init_serdes
+ */
+static int nes_init_serdes(struct nes_device *nesdev, u8 hw_rev, u8 port_count,
+ struct nes_adapter *nesadapter, u8 OneG_Mode)
+{
+ int i;
+ u32 u32temp;
+ u32 sds;
+
+ if (hw_rev != NE020_REV) {
+ /* init serdes 0 */
+ switch (nesadapter->phy_type[0]) {
+ case NES_PHY_TYPE_CX4:
+ if (wide_ppm_offset)
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000FFFAA);
+ else
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF);
+ break;
+ case NES_PHY_TYPE_KR:
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF);
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x00000000);
+ break;
+ case NES_PHY_TYPE_PUMA_1G:
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF);
+ sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0);
+ sds |= 0x00000100;
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, sds);
+ break;
+ default:
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000FF);
+ break;
+ }
+
+ if (!OneG_Mode)
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE0, 0x11110000);
+
+ if (port_count < 2)
+ return 0;
+
+ /* init serdes 1 */
+ if (!(OneG_Mode && (nesadapter->phy_type[1] != NES_PHY_TYPE_PUMA_1G)))
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000000FF);
+
+ switch (nesadapter->phy_type[1]) {
+ case NES_PHY_TYPE_ARGUS:
+ case NES_PHY_TYPE_SFP_D:
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x00000000);
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x00000000);
+ break;
+ case NES_PHY_TYPE_CX4:
+ if (wide_ppm_offset)
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000FFFAA);
+ break;
+ case NES_PHY_TYPE_KR:
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x00000000);
+ break;
+ case NES_PHY_TYPE_PUMA_1G:
+ sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1);
+ sds |= 0x00000100;
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, sds);
+ }
+ if (!OneG_Mode) {
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE1, 0x11110000);
+ sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1);
+ sds &= 0xFFFFFFBF;
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, sds);
+ }
+ } else {
+ /* init serdes 0 */
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, 0x00000008);
+ i = 0;
+ while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0)
+ & 0x0000000f)) != 0x0000000f) && i++ < 5000)
+ mdelay(1);
+ if (i > 5000) {
+ nes_debug(NES_DBG_PHY, "Init: serdes 0 not ready, status=%x\n", u32temp);
+ return 1;
+ }
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x000bdef7);
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE0, 0x9ce73000);
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE0, 0x0ff00000);
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET0, 0x00000000);
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS0, 0x00000000);
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0, 0x00000000);
+ if (OneG_Mode)
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0182222);
+ else
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0042222);
+
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000ff);
+ if (port_count > 1) {
+ /* init serdes 1 */
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x00000048);
+ i = 0;
+ while (((u32temp = (nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS1)
+ & 0x0000000f)) != 0x0000000f) && (i++ < 5000))
+ mdelay(1);
+ if (i > 5000) {
+ printk("%s: Init: serdes 1 not ready, status=%x\n", __func__, u32temp);
+ /* return 1; */
+ }
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP1, 0x000bdef7);
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE1, 0x9ce73000);
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE1, 0x0ff00000);
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET1, 0x00000000);
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS1, 0x00000000);
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL1, 0x00000000);
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL1, 0xf0002222);
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL1, 0x000000ff);
+ }
+ }
+ return 0;
+}
+
+
+/**
+ * nes_init_csr_ne020
+ * Initialize registers for ne020 hardware
+ */
+static void nes_init_csr_ne020(struct nes_device *nesdev, u8 hw_rev, u8 port_count)
+{
+ u32 u32temp;
+
+ nes_debug(NES_DBG_INIT, "port_count=%d\n", port_count);
+
+ nes_write_indexed(nesdev, 0x000001E4, 0x00000007);
+ /* nes_write_indexed(nesdev, 0x000001E8, 0x000208C4); */
+ nes_write_indexed(nesdev, 0x000001E8, 0x00020874);
+ nes_write_indexed(nesdev, 0x000001D8, 0x00048002);
+ /* nes_write_indexed(nesdev, 0x000001D8, 0x0004B002); */
+ nes_write_indexed(nesdev, 0x000001FC, 0x00050005);
+ nes_write_indexed(nesdev, 0x00000600, 0x55555555);
+ nes_write_indexed(nesdev, 0x00000604, 0x55555555);
+
+ /* TODO: move these MAC register settings to NIC bringup */
+ nes_write_indexed(nesdev, 0x00002000, 0x00000001);
+ nes_write_indexed(nesdev, 0x00002004, 0x00000001);
+ nes_write_indexed(nesdev, 0x00002008, 0x0000FFFF);
+ nes_write_indexed(nesdev, 0x0000200C, 0x00000001);
+ nes_write_indexed(nesdev, 0x00002010, 0x000003c1);
+ nes_write_indexed(nesdev, 0x0000201C, 0x75345678);
+ if (port_count > 1) {
+ nes_write_indexed(nesdev, 0x00002200, 0x00000001);
+ nes_write_indexed(nesdev, 0x00002204, 0x00000001);
+ nes_write_indexed(nesdev, 0x00002208, 0x0000FFFF);
+ nes_write_indexed(nesdev, 0x0000220C, 0x00000001);
+ nes_write_indexed(nesdev, 0x00002210, 0x000003c1);
+ nes_write_indexed(nesdev, 0x0000221C, 0x75345678);
+ nes_write_indexed(nesdev, 0x00000908, 0x20000001);
+ }
+ if (port_count > 2) {
+ nes_write_indexed(nesdev, 0x00002400, 0x00000001);
+ nes_write_indexed(nesdev, 0x00002404, 0x00000001);
+ nes_write_indexed(nesdev, 0x00002408, 0x0000FFFF);
+ nes_write_indexed(nesdev, 0x0000240C, 0x00000001);
+ nes_write_indexed(nesdev, 0x00002410, 0x000003c1);
+ nes_write_indexed(nesdev, 0x0000241C, 0x75345678);
+ nes_write_indexed(nesdev, 0x00000910, 0x20000001);
+
+ nes_write_indexed(nesdev, 0x00002600, 0x00000001);
+ nes_write_indexed(nesdev, 0x00002604, 0x00000001);
+ nes_write_indexed(nesdev, 0x00002608, 0x0000FFFF);
+ nes_write_indexed(nesdev, 0x0000260C, 0x00000001);
+ nes_write_indexed(nesdev, 0x00002610, 0x000003c1);
+ nes_write_indexed(nesdev, 0x0000261C, 0x75345678);
+ nes_write_indexed(nesdev, 0x00000918, 0x20000001);
+ }
+
+ nes_write_indexed(nesdev, 0x00005000, 0x00018000);
+ /* nes_write_indexed(nesdev, 0x00005000, 0x00010000); */
+ nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG1, (wqm_quanta << 1) |
+ 0x00000001);
+ nes_write_indexed(nesdev, 0x00005008, 0x1F1F1F1F);
+ nes_write_indexed(nesdev, 0x00005010, 0x1F1F1F1F);
+ nes_write_indexed(nesdev, 0x00005018, 0x1F1F1F1F);
+ nes_write_indexed(nesdev, 0x00005020, 0x1F1F1F1F);
+ nes_write_indexed(nesdev, 0x00006090, 0xFFFFFFFF);
+
+ /* TODO: move this to code, get from EEPROM */
+ nes_write_indexed(nesdev, 0x00000900, 0x20000001);
+ nes_write_indexed(nesdev, 0x000060C0, 0x0000028e);
+ nes_write_indexed(nesdev, 0x000060C8, 0x00000020);
+
+ nes_write_indexed(nesdev, 0x000001EC, 0x7b2625a0);
+ /* nes_write_indexed(nesdev, 0x000001EC, 0x5f2625a0); */
+
+ if (hw_rev != NE020_REV) {
+ u32temp = nes_read_indexed(nesdev, 0x000008e8);
+ u32temp |= 0x80000000;
+ nes_write_indexed(nesdev, 0x000008e8, u32temp);
+ u32temp = nes_read_indexed(nesdev, 0x000021f8);
+ u32temp &= 0x7fffffff;
+ u32temp |= 0x7fff0010;
+ nes_write_indexed(nesdev, 0x000021f8, u32temp);
+ if (port_count > 1) {
+ u32temp = nes_read_indexed(nesdev, 0x000023f8);
+ u32temp &= 0x7fffffff;
+ u32temp |= 0x7fff0010;
+ nes_write_indexed(nesdev, 0x000023f8, u32temp);
+ }
+ }
+}
+
+
+/**
+ * nes_destroy_adapter - destroy the adapter structure
+ */
+void nes_destroy_adapter(struct nes_adapter *nesadapter)
+{
+ struct nes_adapter *tmp_adapter;
+
+ list_for_each_entry(tmp_adapter, &nes_adapter_list, list) {
+ nes_debug(NES_DBG_SHUTDOWN, "Nes Adapter list entry = 0x%p.\n",
+ tmp_adapter);
+ }
+
+ nesadapter->ref_count--;
+ if (!nesadapter->ref_count) {
+ if (nesadapter->hw_rev == NE020_REV) {
+ del_timer(&nesadapter->mh_timer);
+ }
+ del_timer(&nesadapter->lc_timer);
+
+ list_del(&nesadapter->list);
+ kfree(nesadapter);
+ }
+}
+
+
+/**
+ * nes_init_cqp
+ */
+int nes_init_cqp(struct nes_device *nesdev)
+{
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ struct nes_hw_cqp_qp_context *cqp_qp_context;
+ struct nes_hw_cqp_wqe *cqp_wqe;
+ struct nes_hw_ceq *ceq;
+ struct nes_hw_ceq *nic_ceq;
+ struct nes_hw_aeq *aeq;
+ void *vmem;
+ dma_addr_t pmem;
+ u32 count = 0;
+ u32 cqp_head;
+ u64 u64temp;
+ u32 u32temp;
+
+ /* allocate CQP memory */
+ /* Need to add max_cq to the aeq size once cq overflow checking is added back */
+ /* SQ is 512 byte aligned, others are 256 byte aligned */
+ nesdev->cqp_mem_size = 512 +
+ (sizeof(struct nes_hw_cqp_wqe) * NES_CQP_SQ_SIZE) +
+ (sizeof(struct nes_hw_cqe) * NES_CCQ_SIZE) +
+ max(((u32)sizeof(struct nes_hw_ceqe) * NES_CCEQ_SIZE), (u32)256) +
+ max(((u32)sizeof(struct nes_hw_ceqe) * NES_NIC_CEQ_SIZE), (u32)256) +
+ (sizeof(struct nes_hw_aeqe) * nesadapter->max_qp) +
+ sizeof(struct nes_hw_cqp_qp_context);
+
+ nesdev->cqp_vbase = pci_zalloc_consistent(nesdev->pcidev,
+ nesdev->cqp_mem_size,
+ &nesdev->cqp_pbase);
+ if (!nesdev->cqp_vbase) {
+ nes_debug(NES_DBG_INIT, "Unable to allocate memory for host descriptor rings\n");
+ return -ENOMEM;
+ }
+
+ /* Allocate twice as many CQP request structures as the SQ size */
+ nesdev->nes_cqp_requests = kzalloc(sizeof(struct nes_cqp_request) *
+ 2 * NES_CQP_SQ_SIZE, GFP_KERNEL);
+ if (nesdev->nes_cqp_requests == NULL) {
+ nes_debug(NES_DBG_INIT, "Unable to allocate memory CQP request entries.\n");
+ pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, nesdev->cqp.sq_vbase,
+ nesdev->cqp.sq_pbase);
+ return -ENOMEM;
+ }
+
+ nes_debug(NES_DBG_INIT, "Allocated CQP structures at %p (phys = %016lX), size = %u.\n",
+ nesdev->cqp_vbase, (unsigned long)nesdev->cqp_pbase, nesdev->cqp_mem_size);
+
+ spin_lock_init(&nesdev->cqp.lock);
+ init_waitqueue_head(&nesdev->cqp.waitq);
+
+ /* Setup Various Structures */
+ vmem = (void *)(((unsigned long)nesdev->cqp_vbase + (512 - 1)) &
+ ~(unsigned long)(512 - 1));
+ pmem = (dma_addr_t)(((unsigned long long)nesdev->cqp_pbase + (512 - 1)) &
+ ~(unsigned long long)(512 - 1));
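+
+ /*
+ * Editor's note: the (addr + 511) & ~511 arithmetic above rounds the
+ * start of the shared buffer up to the next 512-byte boundary for the
+ * CQP SQ; the CCQ, the two CEQs and the AEQ are then laid out
+ * back-to-back after it, which is why cqp_mem_size reserved an extra
+ * 512 bytes up front.
+ */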
+
+ nesdev->cqp.sq_vbase = vmem;
+ nesdev->cqp.sq_pbase = pmem;
+ nesdev->cqp.sq_size = NES_CQP_SQ_SIZE;
+ nesdev->cqp.sq_head = 0;
+ nesdev->cqp.sq_tail = 0;
+ nesdev->cqp.qp_id = PCI_FUNC(nesdev->pcidev->devfn);
+
+ vmem += (sizeof(struct nes_hw_cqp_wqe) * nesdev->cqp.sq_size);
+ pmem += (sizeof(struct nes_hw_cqp_wqe) * nesdev->cqp.sq_size);
+
+ nesdev->ccq.cq_vbase = vmem;
+ nesdev->ccq.cq_pbase = pmem;
+ nesdev->ccq.cq_size = NES_CCQ_SIZE;
+ nesdev->ccq.cq_head = 0;
+ nesdev->ccq.ce_handler = nes_cqp_ce_handler;
+ nesdev->ccq.cq_number = PCI_FUNC(nesdev->pcidev->devfn);
+
+ vmem += (sizeof(struct nes_hw_cqe) * nesdev->ccq.cq_size);
+ pmem += (sizeof(struct nes_hw_cqe) * nesdev->ccq.cq_size);
+
+ nesdev->ceq_index = PCI_FUNC(nesdev->pcidev->devfn);
+ ceq = &nesadapter->ceq[nesdev->ceq_index];
+ ceq->ceq_vbase = vmem;
+ ceq->ceq_pbase = pmem;
+ ceq->ceq_size = NES_CCEQ_SIZE;
+ ceq->ceq_head = 0;
+
+ vmem += max(((u32)sizeof(struct nes_hw_ceqe) * ceq->ceq_size), (u32)256);
+ pmem += max(((u32)sizeof(struct nes_hw_ceqe) * ceq->ceq_size), (u32)256);
+
+ nesdev->nic_ceq_index = PCI_FUNC(nesdev->pcidev->devfn) + 8;
+ nic_ceq = &nesadapter->ceq[nesdev->nic_ceq_index];
+ nic_ceq->ceq_vbase = vmem;
+ nic_ceq->ceq_pbase = pmem;
+ nic_ceq->ceq_size = NES_NIC_CEQ_SIZE;
+ nic_ceq->ceq_head = 0;
+
+ vmem += max(((u32)sizeof(struct nes_hw_ceqe) * nic_ceq->ceq_size), (u32)256);
+ pmem += max(((u32)sizeof(struct nes_hw_ceqe) * nic_ceq->ceq_size), (u32)256);
+
+ aeq = &nesadapter->aeq[PCI_FUNC(nesdev->pcidev->devfn)];
+ aeq->aeq_vbase = vmem;
+ aeq->aeq_pbase = pmem;
+ aeq->aeq_size = nesadapter->max_qp;
+ aeq->aeq_head = 0;
+
+ /* Setup QP Context */
+ vmem += (sizeof(struct nes_hw_aeqe) * aeq->aeq_size);
+ pmem += (sizeof(struct nes_hw_aeqe) * aeq->aeq_size);
+
+ cqp_qp_context = vmem;
+ cqp_qp_context->context_words[0] =
+ cpu_to_le32((PCI_FUNC(nesdev->pcidev->devfn) << 12) + (2 << 10));
+ cqp_qp_context->context_words[1] = 0;
+ cqp_qp_context->context_words[2] = cpu_to_le32((u32)nesdev->cqp.sq_pbase);
+ cqp_qp_context->context_words[3] = cpu_to_le32(((u64)nesdev->cqp.sq_pbase) >> 32);
+
+
+ /* Write the address to Create CQP */
+ if (sizeof(dma_addr_t) > 4) {
+ nes_write_indexed(nesdev,
+ NES_IDX_CREATE_CQP_HIGH + (PCI_FUNC(nesdev->pcidev->devfn) * 8),
+ ((u64)pmem) >> 32);
+ } else {
+ nes_write_indexed(nesdev,
+ NES_IDX_CREATE_CQP_HIGH + (PCI_FUNC(nesdev->pcidev->devfn) * 8), 0);
+ }
+ nes_write_indexed(nesdev,
+ NES_IDX_CREATE_CQP_LOW + (PCI_FUNC(nesdev->pcidev->devfn) * 8),
+ (u32)pmem);
+
+ INIT_LIST_HEAD(&nesdev->cqp_avail_reqs);
+ INIT_LIST_HEAD(&nesdev->cqp_pending_reqs);
+
+ for (count = 0; count < 2*NES_CQP_SQ_SIZE; count++) {
+ init_waitqueue_head(&nesdev->nes_cqp_requests[count].waitq);
+ list_add_tail(&nesdev->nes_cqp_requests[count].list, &nesdev->cqp_avail_reqs);
+ }
+
+ /* Write Create CCQ WQE */
+ cqp_head = nesdev->cqp.sq_head++;
+ cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+ nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
+ (NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID |
+ NES_CQP_CQ_CHK_OVERFLOW | ((u32)nesdev->ccq.cq_size << 16)));
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
+ (nesdev->ccq.cq_number |
+ ((u32)nesdev->ceq_index << 16)));
+ u64temp = (u64)nesdev->ccq.cq_pbase;
+ set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
+ cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0;
+ u64temp = (unsigned long)&nesdev->ccq;
+ cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] =
+ cpu_to_le32((u32)(u64temp >> 1));
+ cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] =
+ cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF);
+ cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0;
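+ /*
+ * Editor's note: the CQ context pointer is stored shifted right by one
+ * bit and split across the low/high context words ((u64temp >> 1) and
+ * ((u64temp >> 33) & 0x7FFFFFFF)); bit 31 of the high word is left
+ * clear, presumably reserved by the hardware.
+ */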
+
+ /* Write Create CEQ WQE */
+ cqp_head = nesdev->cqp.sq_head++;
+ cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+ nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
+ (NES_CQP_CREATE_CEQ + ((u32)nesdev->ceq_index << 8)));
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CEQ_WQE_ELEMENT_COUNT_IDX, ceq->ceq_size);
+ u64temp = (u64)ceq->ceq_pbase;
+ set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
+
+ /* Write Create AEQ WQE */
+ cqp_head = nesdev->cqp.sq_head++;
+ cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+ nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
+ (NES_CQP_CREATE_AEQ + ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 8)));
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_AEQ_WQE_ELEMENT_COUNT_IDX, aeq->aeq_size);
+ u64temp = (u64)aeq->aeq_pbase;
+ set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
+
+ /* Write Create NIC CEQ WQE */
+ cqp_head = nesdev->cqp.sq_head++;
+ cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+ nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
+ (NES_CQP_CREATE_CEQ + ((u32)nesdev->nic_ceq_index << 8)));
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CEQ_WQE_ELEMENT_COUNT_IDX, nic_ceq->ceq_size);
+ u64temp = (u64)nic_ceq->ceq_pbase;
+ set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
+
+	/* Poll until the CQP has been created */
+ count = 0;
+ do {
+ if (count++ > 1000) {
+ printk(KERN_ERR PFX "Error creating CQP\n");
+ pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size,
+ nesdev->cqp_vbase, nesdev->cqp_pbase);
+ return -1;
+ }
+ udelay(10);
+ } while (!(nes_read_indexed(nesdev,
+ NES_IDX_QP_CONTROL + (PCI_FUNC(nesdev->pcidev->devfn) * 8)) & (1 << 8)));
+
+ nes_debug(NES_DBG_INIT, "CQP Status = 0x%08X\n", nes_read_indexed(nesdev,
+ NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8)));
+
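+	/* Ring the CQP doorbell for the four WQEs posted above (CCQ, CEQ, AEQ and NIC CEQ) */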
+ u32temp = 0x04800000;
+ nes_write32(nesdev->regs+NES_WQE_ALLOC, u32temp | nesdev->cqp.qp_id);
+
+ /* wait for the CCQ, CEQ, and AEQ to get created */
+ count = 0;
+ do {
+ if (count++ > 1000) {
+ printk(KERN_ERR PFX "Error creating CCQ, CEQ, and AEQ\n");
+ pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size,
+ nesdev->cqp_vbase, nesdev->cqp_pbase);
+ return -1;
+ }
+ udelay(10);
+ } while (((nes_read_indexed(nesdev,
+ NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8)) & (15<<8)) != (15<<8)));
+
+ /* dump the QP status value */
+ nes_debug(NES_DBG_INIT, "QP Status = 0x%08X\n", nes_read_indexed(nesdev,
+ NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8)));
+
+ nesdev->cqp.sq_tail++;
+
+ return 0;
+}
+
+
+/**
+ * nes_destroy_cqp
+ */
+int nes_destroy_cqp(struct nes_device *nesdev)
+{
+ struct nes_hw_cqp_wqe *cqp_wqe;
+ u32 count = 0;
+ u32 cqp_head;
+ unsigned long flags;
+
+ do {
+ if (count++ > 1000)
+ break;
+ udelay(10);
+ } while (!(nesdev->cqp.sq_head == nesdev->cqp.sq_tail));
+
+ /* Reset CCQ */
+ nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_RESET |
+ nesdev->ccq.cq_number);
+
+ /* Disable device interrupts */
+ nes_write32(nesdev->regs+NES_INT_MASK, 0x7fffffff);
+
+ spin_lock_irqsave(&nesdev->cqp.lock, flags);
+
+ /* Destroy the AEQ */
+ cqp_head = nesdev->cqp.sq_head++;
+ nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
+ cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+ cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_AEQ |
+ ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 8));
+ cqp_wqe->wqe_words[NES_CQP_WQE_COMP_CTX_HIGH_IDX] = 0;
+
+ /* Destroy the NIC CEQ */
+ cqp_head = nesdev->cqp.sq_head++;
+ nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
+ cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+ cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CEQ |
+ ((u32)nesdev->nic_ceq_index << 8));
+
+ /* Destroy the CEQ */
+ cqp_head = nesdev->cqp.sq_head++;
+ nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
+ cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+ cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CEQ |
+ (nesdev->ceq_index << 8));
+
+ /* Destroy the CCQ */
+ cqp_head = nesdev->cqp.sq_head++;
+ nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
+ cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+ cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_CQ);
+ cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesdev->ccq.cq_number |
+ ((u32)nesdev->ceq_index << 16));
+
+ /* Destroy CQP */
+ cqp_head = nesdev->cqp.sq_head++;
+ nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
+ cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+ cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_DESTROY_QP |
+ NES_CQP_QP_TYPE_CQP);
+ cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesdev->cqp.qp_id);
+
+ barrier();
+ /* Ring doorbell (5 WQEs) */
+ nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x05800000 | nesdev->cqp.qp_id);
+
+ spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+
+ /* wait for the CCQ, CEQ, and AEQ to get destroyed */
+ count = 0;
+ do {
+ if (count++ > 1000) {
+ printk(KERN_ERR PFX "Function%d: Error destroying CCQ, CEQ, and AEQ\n",
+ PCI_FUNC(nesdev->pcidev->devfn));
+ break;
+ }
+ udelay(10);
+ } while (((nes_read_indexed(nesdev,
+ NES_IDX_QP_CONTROL + (PCI_FUNC(nesdev->pcidev->devfn)*8)) & (15 << 8)) != 0));
+
+ /* dump the QP status value */
+ nes_debug(NES_DBG_SHUTDOWN, "Function%d: QP Status = 0x%08X\n",
+ PCI_FUNC(nesdev->pcidev->devfn),
+ nes_read_indexed(nesdev,
+ NES_IDX_QP_CONTROL+(PCI_FUNC(nesdev->pcidev->devfn)*8)));
+
+ kfree(nesdev->nes_cqp_requests);
+
+ /* Free the control structures */
+ pci_free_consistent(nesdev->pcidev, nesdev->cqp_mem_size, nesdev->cqp.sq_vbase,
+ nesdev->cqp.sq_pbase);
+
+ return 0;
+}
+
+
+/**
+ * nes_init_1g_phy
+ */
+static int nes_init_1g_phy(struct nes_device *nesdev, u8 phy_type, u8 phy_index)
+{
+ u32 counter = 0;
+ u16 phy_data;
+ int ret = 0;
+
+ nes_read_1G_phy_reg(nesdev, 1, phy_index, &phy_data);
+ nes_write_1G_phy_reg(nesdev, 23, phy_index, 0xb000);
+
+ /* Reset the PHY */
+ nes_write_1G_phy_reg(nesdev, 0, phy_index, 0x8000);
+ udelay(100);
+ counter = 0;
+ do {
+ nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
+ if (counter++ > 100) {
+ ret = -1;
+ break;
+ }
+ } while (phy_data & 0x8000);
+
+ /* Setting no phy loopback */
+ phy_data &= 0xbfff;
+ phy_data |= 0x1140;
+ nes_write_1G_phy_reg(nesdev, 0, phy_index, phy_data);
+ nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
+ nes_read_1G_phy_reg(nesdev, 0x17, phy_index, &phy_data);
+ nes_read_1G_phy_reg(nesdev, 0x1e, phy_index, &phy_data);
+
+ /* Setting the interrupt mask */
+ nes_read_1G_phy_reg(nesdev, 0x19, phy_index, &phy_data);
+ nes_write_1G_phy_reg(nesdev, 0x19, phy_index, 0xffee);
+ nes_read_1G_phy_reg(nesdev, 0x19, phy_index, &phy_data);
+
+ /* turning on flow control */
+ nes_read_1G_phy_reg(nesdev, 4, phy_index, &phy_data);
+ nes_write_1G_phy_reg(nesdev, 4, phy_index, (phy_data & ~(0x03E0)) | 0xc00);
+ nes_read_1G_phy_reg(nesdev, 4, phy_index, &phy_data);
+
+ /* Clear Half duplex */
+ nes_read_1G_phy_reg(nesdev, 9, phy_index, &phy_data);
+ nes_write_1G_phy_reg(nesdev, 9, phy_index, phy_data & ~(0x0100));
+ nes_read_1G_phy_reg(nesdev, 9, phy_index, &phy_data);
+
+ nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
+ nes_write_1G_phy_reg(nesdev, 0, phy_index, phy_data | 0x0300);
+
+ return ret;
+}
+
+
+/**
+ * nes_init_2025_phy
+ */
+static int nes_init_2025_phy(struct nes_device *nesdev, u8 phy_type, u8 phy_index)
+{
+ u32 temp_phy_data = 0;
+ u32 temp_phy_data2 = 0;
+ u32 counter = 0;
+ u32 sds;
+ u32 mac_index = nesdev->mac_index;
+ int ret = 0;
+ unsigned int first_attempt = 1;
+
+ /* Check firmware heartbeat */
+ nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
+ temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+ udelay(1500);
+ nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
+ temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+
+ if (temp_phy_data != temp_phy_data2) {
+ nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd);
+ temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+ if ((temp_phy_data & 0xff) > 0x20)
+ return 0;
+ printk(PFX "Reinitialize external PHY\n");
+ }
+
+ /* no heartbeat, configure the PHY */
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0x0000, 0x8000);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc300, 0x0000);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052);
+
+ switch (phy_type) {
+ case NES_PHY_TYPE_ARGUS:
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x000C);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0008);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0001);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0098);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00);
+
+ /* setup LEDs */
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x0007);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x000A);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0009);
+ break;
+
+ case NES_PHY_TYPE_SFP_D:
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x0004);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0038);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0013);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0098);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00);
+
+ /* setup LEDs */
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x0007);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x000A);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0009);
+ break;
+
+ case NES_PHY_TYPE_KR:
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc316, 0x000A);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc318, 0x0052);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc302, 0x000C);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc319, 0x0010);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0027, 0x0013);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc31a, 0x0080);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0026, 0x0E00);
+
+ /* setup LEDs */
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd006, 0x000B);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd007, 0x0003);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd008, 0x0004);
+
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0022, 0x406D);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0023, 0x0020);
+ break;
+ }
+
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0x0028, 0xA528);
+
+ /* Bring PHY out of reset */
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc300, 0x0002);
+
+ /* Check for heartbeat */
+ counter = 0;
+ mdelay(690);
+ nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
+ temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+ do {
+ if (counter++ > 150) {
+ printk(PFX "No PHY heartbeat\n");
+ break;
+ }
+ mdelay(1);
+ nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7ee);
+ temp_phy_data2 = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+ } while ((temp_phy_data2 == temp_phy_data));
+
+ /* wait for tracking */
+ counter = 0;
+ do {
+ nes_read_10G_phy_reg(nesdev, phy_index, 0x3, 0xd7fd);
+ temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+ if (counter++ > 300) {
+ if (((temp_phy_data & 0xff) == 0x0) && first_attempt) {
+ first_attempt = 0;
+ counter = 0;
+ /* reset AMCC PHY and try again */
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0xe854, 0x00c0);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x3, 0xe854, 0x0040);
+ continue;
+ } else {
+ ret = 1;
+ break;
+ }
+ }
+ mdelay(10);
+ } while ((temp_phy_data & 0xff) < 0x30);
+
+ /* setup signal integrity */
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xd003, 0x0000);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00D, 0x00FE);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00E, 0x0032);
+ if (phy_type == NES_PHY_TYPE_KR) {
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00F, 0x000C);
+ } else {
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xF00F, 0x0002);
+ nes_write_10G_phy_reg(nesdev, phy_index, 0x1, 0xc314, 0x0063);
+ }
+
+ /* reset serdes */
+ sds = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200);
+ sds |= 0x1;
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200, sds);
+ sds &= 0xfffffffe;
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0 + mac_index * 0x200, sds);
+
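+	/* Wait (bounded to ~5000 reads) for the reset-complete status bit */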
+ counter = 0;
+ while (((nes_read32(nesdev->regs + NES_SOFTWARE_RESET) & 0x00000040) != 0x00000040)
+ && (counter++ < 5000))
+ ;
+
+ return ret;
+}
+
+
+/**
+ * nes_init_phy
+ */
+int nes_init_phy(struct nes_device *nesdev)
+{
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ u32 mac_index = nesdev->mac_index;
+ u32 tx_config = 0;
+ unsigned long flags;
+ u8 phy_type = nesadapter->phy_type[mac_index];
+ u8 phy_index = nesadapter->phy_index[mac_index];
+ int ret = 0;
+
+ tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG);
+ if (phy_type == NES_PHY_TYPE_1G) {
+ /* setup 1G MDIO operation */
+ tx_config &= 0xFFFFFFE3;
+ tx_config |= 0x04;
+ } else {
+ /* setup 10G MDIO operation */
+ tx_config &= 0xFFFFFFE3;
+ tx_config |= 0x1D;
+ }
+ nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config);
+
+ spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags);
+
+ switch (phy_type) {
+ case NES_PHY_TYPE_1G:
+ ret = nes_init_1g_phy(nesdev, phy_type, phy_index);
+ break;
+ case NES_PHY_TYPE_ARGUS:
+ case NES_PHY_TYPE_SFP_D:
+ case NES_PHY_TYPE_KR:
+ ret = nes_init_2025_phy(nesdev, phy_type, phy_index);
+ break;
+ }
+
+ spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags);
+
+ return ret;
+}
+
+
+/**
+ * nes_replenish_nic_rq
+ */
+static void nes_replenish_nic_rq(struct nes_vnic *nesvnic)
+{
+ unsigned long flags;
+ dma_addr_t bus_address;
+ struct sk_buff *skb;
+ struct nes_hw_nic_rq_wqe *nic_rqe;
+ struct nes_hw_nic *nesnic;
+ struct nes_device *nesdev;
+ struct nes_rskb_cb *cb;
+ u32 rx_wqes_posted = 0;
+
+ nesnic = &nesvnic->nic;
+ nesdev = nesvnic->nesdev;
+ spin_lock_irqsave(&nesnic->rq_lock, flags);
+	if (nesnic->replenishing_rq != 0) {
+ if (((nesnic->rq_size-1) == atomic_read(&nesvnic->rx_skbs_needed)) &&
+ (atomic_read(&nesvnic->rx_skb_timer_running) == 0)) {
+ atomic_set(&nesvnic->rx_skb_timer_running, 1);
+ spin_unlock_irqrestore(&nesnic->rq_lock, flags);
+ nesvnic->rq_wqes_timer.expires = jiffies + (HZ/2); /* 1/2 second */
+ add_timer(&nesvnic->rq_wqes_timer);
+ } else
+ spin_unlock_irqrestore(&nesnic->rq_lock, flags);
+ return;
+ }
+ nesnic->replenishing_rq = 1;
+ spin_unlock_irqrestore(&nesnic->rq_lock, flags);
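+	/* Post fresh receive buffers until the RQ is replenished or allocation fails */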
+ do {
+ skb = dev_alloc_skb(nesvnic->max_frame_size);
+ if (skb) {
+ skb->dev = nesvnic->netdev;
+
+ bus_address = pci_map_single(nesdev->pcidev,
+ skb->data, nesvnic->max_frame_size, PCI_DMA_FROMDEVICE);
+ cb = (struct nes_rskb_cb *)&skb->cb[0];
+ cb->busaddr = bus_address;
+ cb->maplen = nesvnic->max_frame_size;
+
+ nic_rqe = &nesnic->rq_vbase[nesvnic->nic.rq_head];
+ nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] =
+ cpu_to_le32(nesvnic->max_frame_size);
+ nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0;
+ nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] =
+ cpu_to_le32((u32)bus_address);
+ nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] =
+ cpu_to_le32((u32)((u64)bus_address >> 32));
+ nesnic->rx_skb[nesnic->rq_head] = skb;
+ nesnic->rq_head++;
+ nesnic->rq_head &= nesnic->rq_size - 1;
+ atomic_dec(&nesvnic->rx_skbs_needed);
+ barrier();
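+			/* Ring the RQ doorbell in batches of 255 posted WQEs */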
+ if (++rx_wqes_posted == 255) {
+ nes_write32(nesdev->regs+NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesnic->qp_id);
+ rx_wqes_posted = 0;
+ }
+ } else {
+ spin_lock_irqsave(&nesnic->rq_lock, flags);
+ if (((nesnic->rq_size-1) == atomic_read(&nesvnic->rx_skbs_needed)) &&
+ (atomic_read(&nesvnic->rx_skb_timer_running) == 0)) {
+ atomic_set(&nesvnic->rx_skb_timer_running, 1);
+ spin_unlock_irqrestore(&nesnic->rq_lock, flags);
+ nesvnic->rq_wqes_timer.expires = jiffies + (HZ/2); /* 1/2 second */
+ add_timer(&nesvnic->rq_wqes_timer);
+ } else
+ spin_unlock_irqrestore(&nesnic->rq_lock, flags);
+ break;
+ }
+ } while (atomic_read(&nesvnic->rx_skbs_needed));
+ barrier();
+ if (rx_wqes_posted)
+ nes_write32(nesdev->regs+NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesnic->qp_id);
+ nesnic->replenishing_rq = 0;
+}
+
+
+/**
+ * nes_rq_wqes_timeout
+ */
+static void nes_rq_wqes_timeout(unsigned long parm)
+{
+ struct nes_vnic *nesvnic = (struct nes_vnic *)parm;
+ printk("%s: Timer fired.\n", __func__);
+ atomic_set(&nesvnic->rx_skb_timer_running, 0);
+ if (atomic_read(&nesvnic->rx_skbs_needed))
+ nes_replenish_nic_rq(nesvnic);
+}
+
+
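+/**
+ * nes_lro_get_skb_hdr
+ */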
+static int nes_lro_get_skb_hdr(struct sk_buff *skb, void **iphdr,
+ void **tcph, u64 *hdr_flags, void *priv)
+{
+ unsigned int ip_len;
+ struct iphdr *iph;
+ skb_reset_network_header(skb);
+ iph = ip_hdr(skb);
+ if (iph->protocol != IPPROTO_TCP)
+ return -1;
+ ip_len = ip_hdrlen(skb);
+ skb_set_transport_header(skb, ip_len);
+ *tcph = tcp_hdr(skb);
+
+ *hdr_flags = LRO_IPV4 | LRO_TCP;
+ *iphdr = iph;
+ return 0;
+}
+
+
+/**
+ * nes_init_nic_qp
+ */
+int nes_init_nic_qp(struct nes_device *nesdev, struct net_device *netdev)
+{
+ struct nes_hw_cqp_wqe *cqp_wqe;
+ struct nes_hw_nic_sq_wqe *nic_sqe;
+ struct nes_hw_nic_qp_context *nic_context;
+ struct sk_buff *skb;
+ struct nes_hw_nic_rq_wqe *nic_rqe;
+ struct nes_vnic *nesvnic = netdev_priv(netdev);
+ unsigned long flags;
+ void *vmem;
+ dma_addr_t pmem;
+ u64 u64temp;
+ int ret;
+ u32 cqp_head;
+ u32 counter;
+ u32 wqe_count;
+ struct nes_rskb_cb *cb;
+	u8 jumbomode = 0;
+
+ /* Allocate fragment, SQ, RQ, and CQ; Reuse CEQ based on the PCI function */
+ nesvnic->nic_mem_size = 256 +
+ (NES_NIC_WQ_SIZE * sizeof(struct nes_first_frag)) +
+ (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe)) +
+ (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe)) +
+ (NES_NIC_WQ_SIZE * 2 * sizeof(struct nes_hw_nic_cqe)) +
+ sizeof(struct nes_hw_nic_qp_context);
+
+ nesvnic->nic_vbase = pci_zalloc_consistent(nesdev->pcidev,
+ nesvnic->nic_mem_size,
+ &nesvnic->nic_pbase);
+ if (!nesvnic->nic_vbase) {
+ nes_debug(NES_DBG_INIT, "Unable to allocate memory for NIC host descriptor rings\n");
+ return -ENOMEM;
+ }
+ nes_debug(NES_DBG_INIT, "Allocated NIC QP structures at %p (phys = %016lX), size = %u.\n",
+ nesvnic->nic_vbase, (unsigned long)nesvnic->nic_pbase, nesvnic->nic_mem_size);
+
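+	/* Align the descriptor rings on a 256-byte boundary */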
+ vmem = (void *)(((unsigned long)nesvnic->nic_vbase + (256 - 1)) &
+ ~(unsigned long)(256 - 1));
+ pmem = (dma_addr_t)(((unsigned long long)nesvnic->nic_pbase + (256 - 1)) &
+ ~(unsigned long long)(256 - 1));
+
+ /* Setup the first Fragment buffers */
+ nesvnic->nic.first_frag_vbase = vmem;
+
+ for (counter = 0; counter < NES_NIC_WQ_SIZE; counter++) {
+ nesvnic->nic.frag_paddr[counter] = pmem;
+ pmem += sizeof(struct nes_first_frag);
+ }
+
+ /* setup the SQ */
+ vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_first_frag));
+
+ nesvnic->nic.sq_vbase = (void *)vmem;
+ nesvnic->nic.sq_pbase = pmem;
+ nesvnic->nic.sq_head = 0;
+ nesvnic->nic.sq_tail = 0;
+ nesvnic->nic.sq_size = NES_NIC_WQ_SIZE;
+ for (counter = 0; counter < NES_NIC_WQ_SIZE; counter++) {
+ nic_sqe = &nesvnic->nic.sq_vbase[counter];
+ nic_sqe->wqe_words[NES_NIC_SQ_WQE_MISC_IDX] =
+ cpu_to_le32(NES_NIC_SQ_WQE_DISABLE_CHKSUM |
+ NES_NIC_SQ_WQE_COMPLETION);
+ nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX] =
+ cpu_to_le32((u32)NES_FIRST_FRAG_SIZE << 16);
+ nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX] =
+ cpu_to_le32((u32)nesvnic->nic.frag_paddr[counter]);
+ nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX] =
+ cpu_to_le32((u32)((u64)nesvnic->nic.frag_paddr[counter] >> 32));
+ }
+
+ nesvnic->get_cqp_request = nes_get_cqp_request;
+ nesvnic->post_cqp_request = nes_post_cqp_request;
+ nesvnic->mcrq_mcast_filter = NULL;
+
+ spin_lock_init(&nesvnic->nic.rq_lock);
+
+ /* setup the RQ */
+ vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe));
+ pmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_sq_wqe));
+
+ nesvnic->nic.rq_vbase = vmem;
+ nesvnic->nic.rq_pbase = pmem;
+ nesvnic->nic.rq_head = 0;
+ nesvnic->nic.rq_tail = 0;
+ nesvnic->nic.rq_size = NES_NIC_WQ_SIZE;
+
+ /* setup the CQ */
+ vmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe));
+ pmem += (NES_NIC_WQ_SIZE * sizeof(struct nes_hw_nic_rq_wqe));
+
+ if (nesdev->nesadapter->netdev_count > 2)
+ nesvnic->mcrq_qp_id = nesvnic->nic_index + 32;
+ else
+ nesvnic->mcrq_qp_id = nesvnic->nic.qp_id + 4;
+
+ nesvnic->nic_cq.cq_vbase = vmem;
+ nesvnic->nic_cq.cq_pbase = pmem;
+ nesvnic->nic_cq.cq_head = 0;
+ nesvnic->nic_cq.cq_size = NES_NIC_WQ_SIZE * 2;
+
+ nesvnic->nic_cq.ce_handler = nes_nic_napi_ce_handler;
+
+ /* Send CreateCQ request to CQP */
+ spin_lock_irqsave(&nesdev->cqp.lock, flags);
+ cqp_head = nesdev->cqp.sq_head;
+
+ cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+ nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+
+ cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(
+ NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID |
+ ((u32)nesvnic->nic_cq.cq_size << 16));
+ cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(
+ nesvnic->nic_cq.cq_number | ((u32)nesdev->nic_ceq_index << 16));
+ u64temp = (u64)nesvnic->nic_cq.cq_pbase;
+ set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
+ cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0;
+ u64temp = (unsigned long)&nesvnic->nic_cq;
+ cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] = cpu_to_le32((u32)(u64temp >> 1));
+ cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] =
+ cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF);
+ cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0;
+ if (++cqp_head >= nesdev->cqp.sq_size)
+ cqp_head = 0;
+ cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+ nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+
+ /* Send CreateQP request to CQP */
+ nic_context = (void *)(&nesvnic->nic_cq.cq_vbase[nesvnic->nic_cq.cq_size]);
+ nic_context->context_words[NES_NIC_CTX_MISC_IDX] =
+ cpu_to_le32((u32)NES_NIC_CTX_SIZE |
+ ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 12));
+ nes_debug(NES_DBG_INIT, "RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x%08X, RX_WINDOW_BUFFER_SIZE = 0x%08X\n",
+ nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE),
+ nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE));
+ if (nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE) != 0) {
+ nic_context->context_words[NES_NIC_CTX_MISC_IDX] |= cpu_to_le32(NES_NIC_BACK_STORE);
+ }
+
+ u64temp = (u64)nesvnic->nic.sq_pbase;
+ nic_context->context_words[NES_NIC_CTX_SQ_LOW_IDX] = cpu_to_le32((u32)u64temp);
+ nic_context->context_words[NES_NIC_CTX_SQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32));
+ u64temp = (u64)nesvnic->nic.rq_pbase;
+ nic_context->context_words[NES_NIC_CTX_RQ_LOW_IDX] = cpu_to_le32((u32)u64temp);
+ nic_context->context_words[NES_NIC_CTX_RQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32));
+
+ cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_CREATE_QP |
+ NES_CQP_QP_TYPE_NIC);
+ cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesvnic->nic.qp_id);
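+	/* The QP context immediately follows the CQ ring, so point at cq_pbase plus the ring size */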
+ u64temp = (u64)nesvnic->nic_cq.cq_pbase +
+ (nesvnic->nic_cq.cq_size * sizeof(struct nes_hw_nic_cqe));
+ set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
+
+ if (++cqp_head >= nesdev->cqp.sq_size)
+ cqp_head = 0;
+ nesdev->cqp.sq_head = cqp_head;
+
+ barrier();
+
+ /* Ring doorbell (2 WQEs) */
+ nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id);
+
+ spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+ nes_debug(NES_DBG_INIT, "Waiting for create NIC QP%u to complete.\n",
+ nesvnic->nic.qp_id);
+
+ ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head),
+ NES_EVENT_TIMEOUT);
+ nes_debug(NES_DBG_INIT, "Create NIC QP%u completed, wait_event_timeout ret = %u.\n",
+ nesvnic->nic.qp_id, ret);
+ if (!ret) {
+ nes_debug(NES_DBG_INIT, "NIC QP%u create timeout expired\n", nesvnic->nic.qp_id);
+ pci_free_consistent(nesdev->pcidev, nesvnic->nic_mem_size, nesvnic->nic_vbase,
+ nesvnic->nic_pbase);
+ return -EIO;
+ }
+
+ /* Populate the RQ */
+ for (counter = 0; counter < (NES_NIC_WQ_SIZE - 1); counter++) {
+ skb = dev_alloc_skb(nesvnic->max_frame_size);
+ if (!skb) {
+ nes_debug(NES_DBG_INIT, "%s: out of memory for receive skb\n", netdev->name);
+
+ nes_destroy_nic_qp(nesvnic);
+ return -ENOMEM;
+ }
+
+ skb->dev = netdev;
+
+ pmem = pci_map_single(nesdev->pcidev, skb->data,
+ nesvnic->max_frame_size, PCI_DMA_FROMDEVICE);
+ cb = (struct nes_rskb_cb *)&skb->cb[0];
+ cb->busaddr = pmem;
+ cb->maplen = nesvnic->max_frame_size;
+
+ nic_rqe = &nesvnic->nic.rq_vbase[counter];
+ nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = cpu_to_le32(nesvnic->max_frame_size);
+ nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0;
+ nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] = cpu_to_le32((u32)pmem);
+ nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = cpu_to_le32((u32)((u64)pmem >> 32));
+ nesvnic->nic.rx_skb[counter] = skb;
+ }
+
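+	/* Hand the posted receive WQEs to hardware, at most 255 per doorbell write */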
+ wqe_count = NES_NIC_WQ_SIZE - 1;
+ nesvnic->nic.rq_head = wqe_count;
+ barrier();
+ do {
+ counter = min(wqe_count, ((u32)255));
+ wqe_count -= counter;
+ nes_write32(nesdev->regs+NES_WQE_ALLOC, (counter << 24) | nesvnic->nic.qp_id);
+ } while (wqe_count);
+ init_timer(&nesvnic->rq_wqes_timer);
+ nesvnic->rq_wqes_timer.function = nes_rq_wqes_timeout;
+ nesvnic->rq_wqes_timer.data = (unsigned long)nesvnic;
+ nes_debug(NES_DBG_INIT, "NAPI support Enabled\n");
+	if (nesdev->nesadapter->et_use_adaptive_rx_coalesce) {
+ nes_nic_init_timer(nesdev);
+ if (netdev->mtu > 1500)
+ jumbomode = 1;
+ nes_nic_init_timer_defaults(nesdev, jumbomode);
+ }
+ if ((nesdev->nesadapter->allow_unaligned_fpdus) &&
+ (nes_init_mgt_qp(nesdev, netdev, nesvnic))) {
+ nes_debug(NES_DBG_INIT, "%s: Out of memory for pau nic\n", netdev->name);
+ nes_destroy_nic_qp(nesvnic);
+ return -ENOMEM;
+ }
+
+ nesvnic->lro_mgr.max_aggr = nes_lro_max_aggr;
+ nesvnic->lro_mgr.max_desc = NES_MAX_LRO_DESCRIPTORS;
+ nesvnic->lro_mgr.lro_arr = nesvnic->lro_desc;
+ nesvnic->lro_mgr.get_skb_header = nes_lro_get_skb_hdr;
+ nesvnic->lro_mgr.features = LRO_F_NAPI | LRO_F_EXTRACT_VLAN_ID;
+ nesvnic->lro_mgr.dev = netdev;
+ nesvnic->lro_mgr.ip_summed = CHECKSUM_UNNECESSARY;
+ nesvnic->lro_mgr.ip_summed_aggr = CHECKSUM_UNNECESSARY;
+ return 0;
+}
+
+
+/**
+ * nes_destroy_nic_qp
+ */
+void nes_destroy_nic_qp(struct nes_vnic *nesvnic)
+{
+ u64 u64temp;
+ dma_addr_t bus_address;
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_hw_cqp_wqe *cqp_wqe;
+ struct nes_hw_nic_sq_wqe *nic_sqe;
+ __le16 *wqe_fragment_length;
+ u16 wqe_fragment_index;
+ u32 cqp_head;
+ u32 wqm_cfg0;
+ unsigned long flags;
+ struct sk_buff *rx_skb;
+ struct nes_rskb_cb *cb;
+ int ret;
+
+ if (nesdev->nesadapter->allow_unaligned_fpdus)
+ nes_destroy_mgt(nesvnic);
+
+ /* clear wqe stall before destroying NIC QP */
+ wqm_cfg0 = nes_read_indexed(nesdev, NES_IDX_WQM_CONFIG0);
+ nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG0, wqm_cfg0 & 0xFFFF7FFF);
+
+ /* Free remaining NIC receive buffers */
+ while (nesvnic->nic.rq_head != nesvnic->nic.rq_tail) {
+ rx_skb = nesvnic->nic.rx_skb[nesvnic->nic.rq_tail];
+ cb = (struct nes_rskb_cb *)&rx_skb->cb[0];
+ pci_unmap_single(nesdev->pcidev, cb->busaddr, cb->maplen,
+ PCI_DMA_FROMDEVICE);
+
+ dev_kfree_skb(nesvnic->nic.rx_skb[nesvnic->nic.rq_tail++]);
+ nesvnic->nic.rq_tail &= (nesvnic->nic.rq_size - 1);
+ }
+
+ /* Free remaining NIC transmit buffers */
+ while (nesvnic->nic.sq_head != nesvnic->nic.sq_tail) {
+ nic_sqe = &nesvnic->nic.sq_vbase[nesvnic->nic.sq_tail];
+ wqe_fragment_index = 1;
+ wqe_fragment_length = (__le16 *)
+ &nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX];
+ /* bump past the vlan tag */
+ wqe_fragment_length++;
+ if (le16_to_cpu(wqe_fragment_length[wqe_fragment_index]) != 0) {
+ u64temp = (u64)le32_to_cpu(
+ nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX+
+ wqe_fragment_index*2]);
+ u64temp += ((u64)le32_to_cpu(
+ nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX
+ + wqe_fragment_index*2]))<<32;
+ bus_address = (dma_addr_t)u64temp;
+ if (test_and_clear_bit(nesvnic->nic.sq_tail,
+ nesvnic->nic.first_frag_overflow)) {
+ pci_unmap_single(nesdev->pcidev,
+ bus_address,
+ le16_to_cpu(wqe_fragment_length[
+ wqe_fragment_index++]),
+ PCI_DMA_TODEVICE);
+ }
+ for (; wqe_fragment_index < 5; wqe_fragment_index++) {
+ if (wqe_fragment_length[wqe_fragment_index]) {
+ u64temp = le32_to_cpu(
+ nic_sqe->wqe_words[
+ NES_NIC_SQ_WQE_FRAG0_LOW_IDX+
+ wqe_fragment_index*2]);
+ u64temp += ((u64)le32_to_cpu(
+ nic_sqe->wqe_words[
+ NES_NIC_SQ_WQE_FRAG0_HIGH_IDX+
+ wqe_fragment_index*2]))<<32;
+ bus_address = (dma_addr_t)u64temp;
+ pci_unmap_page(nesdev->pcidev,
+ bus_address,
+ le16_to_cpu(
+ wqe_fragment_length[
+ wqe_fragment_index]),
+ PCI_DMA_TODEVICE);
+ } else
+ break;
+ }
+ }
+ if (nesvnic->nic.tx_skb[nesvnic->nic.sq_tail])
+ dev_kfree_skb(
+ nesvnic->nic.tx_skb[nesvnic->nic.sq_tail]);
+
+ nesvnic->nic.sq_tail = (nesvnic->nic.sq_tail + 1)
+ & (nesvnic->nic.sq_size - 1);
+ }
+
+ spin_lock_irqsave(&nesdev->cqp.lock, flags);
+
+ /* Destroy NIC QP */
+ cqp_head = nesdev->cqp.sq_head;
+ cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+ nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
+ (NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_NIC));
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
+ nesvnic->nic.qp_id);
+
+ if (++cqp_head >= nesdev->cqp.sq_size)
+ cqp_head = 0;
+
+ cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+
+ /* Destroy NIC CQ */
+ nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
+ (NES_CQP_DESTROY_CQ | ((u32)nesvnic->nic_cq.cq_size << 16)));
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
+ (nesvnic->nic_cq.cq_number | ((u32)nesdev->nic_ceq_index << 16)));
+
+ if (++cqp_head >= nesdev->cqp.sq_size)
+ cqp_head = 0;
+
+ nesdev->cqp.sq_head = cqp_head;
+ barrier();
+
+ /* Ring doorbell (2 WQEs) */
+ nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id);
+
+ spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+ nes_debug(NES_DBG_SHUTDOWN, "Waiting for CQP, cqp_head=%u, cqp.sq_head=%u,"
+ " cqp.sq_tail=%u, cqp.sq_size=%u\n",
+ cqp_head, nesdev->cqp.sq_head,
+ nesdev->cqp.sq_tail, nesdev->cqp.sq_size);
+
+ ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head),
+ NES_EVENT_TIMEOUT);
+
+ nes_debug(NES_DBG_SHUTDOWN, "Destroy NIC QP returned, wait_event_timeout ret = %u, cqp_head=%u,"
+ " cqp.sq_head=%u, cqp.sq_tail=%u\n",
+ ret, cqp_head, nesdev->cqp.sq_head, nesdev->cqp.sq_tail);
+ if (!ret) {
+ nes_debug(NES_DBG_SHUTDOWN, "NIC QP%u destroy timeout expired\n",
+ nesvnic->nic.qp_id);
+ }
+
+ pci_free_consistent(nesdev->pcidev, nesvnic->nic_mem_size, nesvnic->nic_vbase,
+ nesvnic->nic_pbase);
+
+ /* restore old wqm_cfg0 value */
+ nes_write_indexed(nesdev, NES_IDX_WQM_CONFIG0, wqm_cfg0);
+}
+
+/**
+ * nes_napi_isr
+ */
+int nes_napi_isr(struct nes_device *nesdev)
+{
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ u32 int_stat;
+
+ if (nesdev->napi_isr_ran) {
+ /* interrupt status has already been read in ISR */
+ int_stat = nesdev->int_stat;
+ } else {
+ int_stat = nes_read32(nesdev->regs + NES_INT_STAT);
+ nesdev->int_stat = int_stat;
+ nesdev->napi_isr_ran = 1;
+ }
+
+ int_stat &= nesdev->int_req;
+	/* if only NIC completion interrupts are pending, process them here; otherwise wait for the DPC */
+ if ((int_stat) && ((int_stat & 0x0000ff00) == int_stat)) {
+ nesdev->napi_isr_ran = 0;
+ nes_write32(nesdev->regs + NES_INT_STAT,
+ (int_stat &
+ ~(NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0 | NES_INT_MAC1 | NES_INT_MAC2 | NES_INT_MAC3)));
+
+ /* Process the CEQs */
+ nes_process_ceq(nesdev, &nesdev->nesadapter->ceq[nesdev->nic_ceq_index]);
+
+ if (unlikely((((nesadapter->et_rx_coalesce_usecs_irq) &&
+ (!nesadapter->et_use_adaptive_rx_coalesce)) ||
+ ((nesadapter->et_use_adaptive_rx_coalesce) &&
+ (nesdev->deepcq_count > nesadapter->et_pkt_rate_low))))) {
+ if ((nesdev->int_req & NES_INT_TIMER) == 0) {
+ /* Enable Periodic timer interrupts */
+ nesdev->int_req |= NES_INT_TIMER;
+ /* ack any pending periodic timer interrupts so we don't get an immediate interrupt */
+ /* TODO: need to also ack other unused periodic timer values, get from nesadapter */
+ nes_write32(nesdev->regs+NES_TIMER_STAT,
+ nesdev->timer_int_req | ~(nesdev->nesadapter->timer_int_req));
+ nes_write32(nesdev->regs+NES_INTF_INT_MASK,
+ ~(nesdev->intf_int_req | NES_INTF_PERIODIC_TIMER));
+ }
+
+			if (unlikely(nesadapter->et_use_adaptive_rx_coalesce)) {
+ nes_nic_init_timer(nesdev);
+ }
+ /* Enable interrupts, except CEQs */
+ nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req));
+ } else {
+ /* Enable interrupts, make sure timer is off */
+ nesdev->int_req &= ~NES_INT_TIMER;
+ nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req));
+ nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req);
+ }
+ nesdev->deepcq_count = 0;
+ return 1;
+ } else {
+ return 0;
+ }
+}
+
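+/**
+ * process_critical_error
+ */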
+static void process_critical_error(struct nes_device *nesdev)
+{
+ u32 debug_error;
+ u32 nes_idx_debug_error_masks0 = 0;
+ u16 error_module = 0;
+
+ debug_error = nes_read_indexed(nesdev, NES_IDX_DEBUG_ERROR_CONTROL_STATUS);
+ printk(KERN_ERR PFX "Critical Error reported by device!!! 0x%02X\n",
+ (u16)debug_error);
+ nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_CONTROL_STATUS,
+ 0x01010000 | (debug_error & 0x0000ffff));
+ if (crit_err_count++ > 10)
+ nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS1, 1 << 0x17);
+ error_module = (u16) (debug_error & 0x1F00) >> 8;
+ if (++nesdev->nesadapter->crit_error_count[error_module-1] >=
+ nes_max_critical_error_count) {
+ printk(KERN_ERR PFX "Masking off critical error for module "
+ "0x%02X\n", (u16)error_module);
+ nes_idx_debug_error_masks0 = nes_read_indexed(nesdev,
+ NES_IDX_DEBUG_ERROR_MASKS0);
+ nes_write_indexed(nesdev, NES_IDX_DEBUG_ERROR_MASKS0,
+ nes_idx_debug_error_masks0 | (1 << error_module));
+ }
+}
+
+/**
+ * nes_dpc
+ */
+void nes_dpc(unsigned long param)
+{
+ struct nes_device *nesdev = (struct nes_device *)param;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ u32 counter;
+ u32 loop_counter = 0;
+ u32 int_status_bit;
+ u32 int_stat;
+ u32 timer_stat;
+ u32 temp_int_stat;
+ u32 intf_int_stat;
+ u32 processed_intf_int = 0;
+ u16 processed_timer_int = 0;
+ u16 completion_ints = 0;
+ u16 timer_ints = 0;
+
+ /* nes_debug(NES_DBG_ISR, "\n"); */
+
+ do {
+ timer_stat = 0;
+ if (nesdev->napi_isr_ran) {
+ nesdev->napi_isr_ran = 0;
+ int_stat = nesdev->int_stat;
+ } else
+ int_stat = nes_read32(nesdev->regs+NES_INT_STAT);
+ if (processed_intf_int != 0)
+ int_stat &= nesdev->int_req & ~NES_INT_INTF;
+ else
+ int_stat &= nesdev->int_req;
+ if (processed_timer_int == 0) {
+ processed_timer_int = 1;
+ if (int_stat & NES_INT_TIMER) {
+ timer_stat = nes_read32(nesdev->regs + NES_TIMER_STAT);
+ if ((timer_stat & nesdev->timer_int_req) == 0) {
+ int_stat &= ~NES_INT_TIMER;
+ }
+ }
+ } else {
+ int_stat &= ~NES_INT_TIMER;
+ }
+
+ if (int_stat) {
+ if (int_stat & ~(NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0|
+ NES_INT_MAC1|NES_INT_MAC2 | NES_INT_MAC3)) {
+ /* Ack the interrupts */
+ nes_write32(nesdev->regs+NES_INT_STAT,
+ (int_stat & ~(NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0|
+ NES_INT_MAC1 | NES_INT_MAC2 | NES_INT_MAC3)));
+ }
+
+ temp_int_stat = int_stat;
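+			/* Service a CEQ for each of the low 16 status bits that is set */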
+ for (counter = 0, int_status_bit = 1; counter < 16; counter++) {
+ if (int_stat & int_status_bit) {
+ nes_process_ceq(nesdev, &nesadapter->ceq[counter]);
+ temp_int_stat &= ~int_status_bit;
+ completion_ints = 1;
+ }
+ if (!(temp_int_stat & 0x0000ffff))
+ break;
+ int_status_bit <<= 1;
+ }
+
+ /* Process the AEQ for this pci function */
+ int_status_bit = 1 << (16 + PCI_FUNC(nesdev->pcidev->devfn));
+ if (int_stat & int_status_bit) {
+ nes_process_aeq(nesdev, &nesadapter->aeq[PCI_FUNC(nesdev->pcidev->devfn)]);
+ }
+
+ /* Process the MAC interrupt for this pci function */
+ int_status_bit = 1 << (24 + nesdev->mac_index);
+ if (int_stat & int_status_bit) {
+ nes_process_mac_intr(nesdev, nesdev->mac_index);
+ }
+
+ if (int_stat & NES_INT_TIMER) {
+ if (timer_stat & nesdev->timer_int_req) {
+ nes_write32(nesdev->regs + NES_TIMER_STAT,
+ (timer_stat & nesdev->timer_int_req) |
+ ~(nesdev->nesadapter->timer_int_req));
+ timer_ints = 1;
+ }
+ }
+
+ if (int_stat & NES_INT_INTF) {
+ processed_intf_int = 1;
+ intf_int_stat = nes_read32(nesdev->regs+NES_INTF_INT_STAT);
+ intf_int_stat &= nesdev->intf_int_req;
+ if (NES_INTF_INT_CRITERR & intf_int_stat) {
+ process_critical_error(nesdev);
+ }
+ if (NES_INTF_INT_PCIERR & intf_int_stat) {
+ printk(KERN_ERR PFX "PCI Error reported by device!!!\n");
+ BUG();
+ }
+ if (NES_INTF_INT_AEQ_OFLOW & intf_int_stat) {
+ printk(KERN_ERR PFX "AEQ Overflow reported by device!!!\n");
+ BUG();
+ }
+ nes_write32(nesdev->regs+NES_INTF_INT_STAT, intf_int_stat);
+ }
+
+ if (int_stat & NES_INT_TSW) {
+ }
+ }
+		/* Don't use the interface interrupt bit to stay in the loop */
+ int_stat &= ~NES_INT_INTF | NES_INT_TIMER | NES_INT_MAC0 |
+ NES_INT_MAC1 | NES_INT_MAC2 | NES_INT_MAC3;
+ } while ((int_stat != 0) && (loop_counter++ < MAX_DPC_ITERATIONS));
+
+ if (timer_ints == 1) {
+ if ((nesadapter->et_rx_coalesce_usecs_irq) || (nesadapter->et_use_adaptive_rx_coalesce)) {
+ if (completion_ints == 0) {
+ nesdev->timer_only_int_count++;
+				if (nesdev->timer_only_int_count >= nesadapter->timer_int_limit) {
+ nesdev->timer_only_int_count = 0;
+ nesdev->int_req &= ~NES_INT_TIMER;
+ nes_write32(nesdev->regs + NES_INTF_INT_MASK, ~(nesdev->intf_int_req));
+ nes_write32(nesdev->regs + NES_INT_MASK, ~nesdev->int_req);
+ } else {
+ nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req));
+ }
+ } else {
+				if (unlikely(nesadapter->et_use_adaptive_rx_coalesce)) {
+ nes_nic_init_timer(nesdev);
+ }
+ nesdev->timer_only_int_count = 0;
+ nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req));
+ }
+ } else {
+ nesdev->timer_only_int_count = 0;
+ nesdev->int_req &= ~NES_INT_TIMER;
+ nes_write32(nesdev->regs+NES_INTF_INT_MASK, ~(nesdev->intf_int_req));
+ nes_write32(nesdev->regs+NES_TIMER_STAT,
+ nesdev->timer_int_req | ~(nesdev->nesadapter->timer_int_req));
+ nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req);
+ }
+ } else {
+		if ((completion_ints == 1) &&
+			(((nesadapter->et_rx_coalesce_usecs_irq) &&
+			(!nesadapter->et_use_adaptive_rx_coalesce)) ||
+			((nesdev->deepcq_count > nesadapter->et_pkt_rate_low) &&
+			(nesadapter->et_use_adaptive_rx_coalesce)))) {
+ /* nes_debug(NES_DBG_ISR, "Enabling periodic timer interrupt.\n" ); */
+ nesdev->timer_only_int_count = 0;
+ nesdev->int_req |= NES_INT_TIMER;
+ nes_write32(nesdev->regs+NES_TIMER_STAT,
+ nesdev->timer_int_req | ~(nesdev->nesadapter->timer_int_req));
+ nes_write32(nesdev->regs+NES_INTF_INT_MASK,
+ ~(nesdev->intf_int_req | NES_INTF_PERIODIC_TIMER));
+ nes_write32(nesdev->regs+NES_INT_MASK, 0x0000ffff | (~nesdev->int_req));
+ } else {
+ nes_write32(nesdev->regs+NES_INT_MASK, ~nesdev->int_req);
+ }
+ }
+ nesdev->deepcq_count = 0;
+}
+
+
+/**
+ * nes_process_ceq
+ */
+static void nes_process_ceq(struct nes_device *nesdev, struct nes_hw_ceq *ceq)
+{
+ u64 u64temp;
+ struct nes_hw_cq *cq;
+ u32 head;
+ u32 ceq_size;
+
+ /* nes_debug(NES_DBG_CQ, "\n"); */
+ head = ceq->ceq_head;
+ ceq_size = ceq->ceq_size;
+
+ do {
+ if (le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX]) &
+ NES_CEQE_VALID) {
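+			/* The CQ pointer was stored right-shifted by one; reassemble and shift it back */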
+ u64temp = (((u64)(le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX]))) << 32) |
+ ((u64)(le32_to_cpu(ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_LOW_IDX])));
+ u64temp <<= 1;
+ cq = *((struct nes_hw_cq **)&u64temp);
+ /* nes_debug(NES_DBG_CQ, "pCQ = %p\n", cq); */
+ barrier();
+ ceq->ceq_vbase[head].ceqe_words[NES_CEQE_CQ_CTX_HIGH_IDX] = 0;
+
+ /* call the event handler */
+ cq->ce_handler(nesdev, cq);
+
+ if (++head >= ceq_size)
+ head = 0;
+ } else {
+ break;
+ }
+
+ } while (1);
+
+ ceq->ceq_head = head;
+}
+
+
+/**
+ * nes_process_aeq
+ */
+static void nes_process_aeq(struct nes_device *nesdev, struct nes_hw_aeq *aeq)
+{
+ /* u64 u64temp; */
+ u32 head;
+ u32 aeq_size;
+ u32 aeqe_misc;
+ u32 aeqe_cq_id;
+ struct nes_hw_aeqe volatile *aeqe;
+
+ head = aeq->aeq_head;
+ aeq_size = aeq->aeq_size;
+
+ do {
+ aeqe = &aeq->aeq_vbase[head];
+ if ((le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]) & NES_AEQE_VALID) == 0)
+ break;
+ aeqe_misc = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]);
+ aeqe_cq_id = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]);
+ if (aeqe_misc & (NES_AEQE_QP|NES_AEQE_CQ)) {
+ if (aeqe_cq_id >= NES_FIRST_QPN) {
+ /* dealing with an accelerated QP related AE */
+ /*
+ * u64temp = (((u64)(le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX]))) << 32) |
+ * ((u64)(le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX])));
+ */
+ nes_process_iwarp_aeqe(nesdev, (struct nes_hw_aeqe *)aeqe);
+ } else {
+ /* TODO: dealing with a CQP related AE */
+ nes_debug(NES_DBG_AEQ, "Processing CQP related AE, misc = 0x%04X\n",
+ (u16)(aeqe_misc >> 16));
+ }
+ }
+
+ aeqe->aeqe_words[NES_AEQE_MISC_IDX] = 0;
+
+ if (++head >= aeq_size)
+ head = 0;
+
+ nes_write32(nesdev->regs + NES_AEQ_ALLOC, 1 << 16);
+	} while (1);
+ aeq->aeq_head = head;
+}
+
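+/**
+ * nes_reset_link
+ */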
+static void nes_reset_link(struct nes_device *nesdev, u32 mac_index)
+{
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ u32 reset_value;
+	u32 i = 0;
+ u32 u32temp;
+
+ if (nesadapter->hw_rev == NE020_REV) {
+ return;
+ }
+ mh_detected++;
+
+ reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET);
+
+ if ((mac_index == 0) || ((mac_index == 1) && (nesadapter->OneG_Mode)))
+ reset_value |= 0x0000001d;
+ else
+ reset_value |= 0x0000002d;
+
+ if (4 <= (nesadapter->link_interrupt_count[mac_index] / ((u16)NES_MAX_LINK_INTERRUPTS))) {
+ if ((!nesadapter->OneG_Mode) && (nesadapter->port_count == 2)) {
+ nesadapter->link_interrupt_count[0] = 0;
+ nesadapter->link_interrupt_count[1] = 0;
+ u32temp = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1);
+ if (0x00000040 & u32temp)
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F088);
+ else
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F0C8);
+
+ reset_value |= 0x0000003d;
+ }
+ nesadapter->link_interrupt_count[mac_index] = 0;
+ }
+
+ nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value);
+
+ while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET)
+ & 0x00000040) != 0x00000040) && (i++ < 5000));
+
+ if (0x0000003d == (reset_value & 0x0000003d)) {
+ u32 pcs_control_status0, pcs_control_status1;
+
+ for (i = 0; i < 10; i++) {
+ pcs_control_status0 = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0);
+ pcs_control_status1 = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
+ if (((0x0F000000 == (pcs_control_status0 & 0x0F000000))
+ && (pcs_control_status0 & 0x00100000))
+ || ((0x0F000000 == (pcs_control_status1 & 0x0F000000))
+ && (pcs_control_status1 & 0x00100000)))
+ continue;
+ else
+ break;
+ }
+ if (10 == i) {
+ u32temp = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1);
+ if (0x00000040 & u32temp)
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F088);
+ else
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL1, 0x0000F0C8);
+
+ nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value);
+
+ while (((nes_read32(nesdev->regs + NES_SOFTWARE_RESET)
+ & 0x00000040) != 0x00000040) && (i++ < 5000));
+ }
+ }
+}
+
+/**
+ * nes_process_mac_intr
+ */
+static void nes_process_mac_intr(struct nes_device *nesdev, u32 mac_number)
+{
+ unsigned long flags;
+ u32 pcs_control_status;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ struct nes_vnic *nesvnic;
+ u32 mac_status;
+ u32 mac_index = nesdev->mac_index;
+ u32 u32temp;
+ u16 phy_data;
+ u16 temp_phy_data;
+ u32 pcs_val = 0x0f0f0000;
+ u32 pcs_mask = 0x0f1f0000;
+ u32 cdr_ctrl;
+
+ spin_lock_irqsave(&nesadapter->phy_lock, flags);
+ if (nesadapter->mac_sw_state[mac_number] != NES_MAC_SW_IDLE) {
+ spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+ return;
+ }
+ nesadapter->mac_sw_state[mac_number] = NES_MAC_SW_INTERRUPT;
+
+ /* ack the MAC interrupt */
+ mac_status = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (mac_index * 0x200));
+ /* Clear the interrupt */
+ nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (mac_index * 0x200), mac_status);
+
+ nes_debug(NES_DBG_PHY, "MAC%u interrupt status = 0x%X.\n", mac_number, mac_status);
+
+ if (mac_status & (NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT)) {
+ nesdev->link_status_interrupts++;
+ if (0 == (++nesadapter->link_interrupt_count[mac_index] % ((u16)NES_MAX_LINK_INTERRUPTS)))
+ nes_reset_link(nesdev, mac_index);
+
+ /* read the PHY interrupt status register */
+ if ((nesadapter->OneG_Mode) &&
+ (nesadapter->phy_type[mac_index] != NES_PHY_TYPE_PUMA_1G)) {
+ do {
+ nes_read_1G_phy_reg(nesdev, 0x1a,
+ nesadapter->phy_index[mac_index], &phy_data);
+ nes_debug(NES_DBG_PHY, "Phy%d data from register 0x1a = 0x%X.\n",
+ nesadapter->phy_index[mac_index], phy_data);
+			} while (phy_data & 0x8000);
+
+ temp_phy_data = 0;
+ do {
+ nes_read_1G_phy_reg(nesdev, 0x11,
+ nesadapter->phy_index[mac_index], &phy_data);
+ nes_debug(NES_DBG_PHY, "Phy%d data from register 0x11 = 0x%X.\n",
+ nesadapter->phy_index[mac_index], phy_data);
+ if (temp_phy_data == phy_data)
+ break;
+ temp_phy_data = phy_data;
+ } while (1);
+
+ nes_read_1G_phy_reg(nesdev, 0x1e,
+ nesadapter->phy_index[mac_index], &phy_data);
+ nes_debug(NES_DBG_PHY, "Phy%d data from register 0x1e = 0x%X.\n",
+ nesadapter->phy_index[mac_index], phy_data);
+
+ nes_read_1G_phy_reg(nesdev, 1,
+ nesadapter->phy_index[mac_index], &phy_data);
+ nes_debug(NES_DBG_PHY, "1G phy%u data from register 1 = 0x%X\n",
+ nesadapter->phy_index[mac_index], phy_data);
+
+ if (temp_phy_data & 0x1000) {
+ nes_debug(NES_DBG_PHY, "The Link is up according to the PHY\n");
+ phy_data = 4;
+ } else {
+ nes_debug(NES_DBG_PHY, "The Link is down according to the PHY\n");
+ }
+ }
+ nes_debug(NES_DBG_PHY, "Eth SERDES Common Status: 0=0x%08X, 1=0x%08X\n",
+ nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0),
+ nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0+0x200));
+
+ if (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_PUMA_1G) {
+ switch (mac_index) {
+ case 1:
+ case 3:
+ pcs_control_status = nes_read_indexed(nesdev,
+ NES_IDX_PHY_PCS_CONTROL_STATUS0 + 0x200);
+ break;
+ default:
+ pcs_control_status = nes_read_indexed(nesdev,
+ NES_IDX_PHY_PCS_CONTROL_STATUS0);
+ break;
+ }
+ } else {
+ pcs_control_status = nes_read_indexed(nesdev,
+ NES_IDX_PHY_PCS_CONTROL_STATUS0 + ((mac_index & 1) * 0x200));
+ pcs_control_status = nes_read_indexed(nesdev,
+ NES_IDX_PHY_PCS_CONTROL_STATUS0 + ((mac_index & 1) * 0x200));
+ }
+
+ nes_debug(NES_DBG_PHY, "PCS PHY Control/Status%u: 0x%08X\n",
+ mac_index, pcs_control_status);
+ if ((nesadapter->OneG_Mode) &&
+ (nesadapter->phy_type[mac_index] != NES_PHY_TYPE_PUMA_1G)) {
+ u32temp = 0x01010000;
+ if (nesadapter->port_count > 2) {
+ u32temp |= 0x02020000;
+ }
+			if ((pcs_control_status & u32temp) != u32temp) {
+ phy_data = 0;
+ nes_debug(NES_DBG_PHY, "PCS says the link is down\n");
+ }
+ } else {
+ switch (nesadapter->phy_type[mac_index]) {
+ case NES_PHY_TYPE_ARGUS:
+ case NES_PHY_TYPE_SFP_D:
+ case NES_PHY_TYPE_KR:
+ /* clear the alarms */
+ nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0x0008);
+ nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc001);
+ nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc002);
+ nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc005);
+ nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 4, 0xc006);
+ nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9003);
+ nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9004);
+ nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9005);
+ /* check link status */
+ nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9003);
+ temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+
+ nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021);
+ nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+ nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021);
+ phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+
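+				/* Report link up (0x4) only if the first status read is zero and the second reads 0x8000 */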
+ phy_data = (!temp_phy_data && (phy_data == 0x8000)) ? 0x4 : 0x0;
+
+ nes_debug(NES_DBG_PHY, "%s: Phy data = 0x%04X, link was %s.\n",
+ __func__, phy_data, nesadapter->mac_link_down[mac_index] ? "DOWN" : "UP");
+ break;
+
+ case NES_PHY_TYPE_PUMA_1G:
+ if (mac_index < 2)
+ pcs_val = pcs_mask = 0x01010000;
+ else
+ pcs_val = pcs_mask = 0x02020000;
+ /* fall through */
+ default:
+ phy_data = (pcs_val == (pcs_control_status & pcs_mask)) ? 0x4 : 0x0;
+ break;
+ }
+ }
+
+ if (phy_data & 0x0004) {
+ if (wide_ppm_offset &&
+ (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_CX4) &&
+ (nesadapter->hw_rev != NE020_REV)) {
+ cdr_ctrl = nes_read_indexed(nesdev,
+ NES_IDX_ETH_SERDES_CDR_CONTROL0 +
+ mac_index * 0x200);
+ nes_write_indexed(nesdev,
+ NES_IDX_ETH_SERDES_CDR_CONTROL0 +
+ mac_index * 0x200,
+ cdr_ctrl | 0x000F0000);
+ }
+ nesadapter->mac_link_down[mac_index] = 0;
+ list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) {
+ nes_debug(NES_DBG_PHY, "The Link is UP!!. linkup was %d\n",
+ nesvnic->linkup);
+ if (nesvnic->linkup == 0) {
+ printk(PFX "The Link is now up for port %s, netdev %p.\n",
+ nesvnic->netdev->name, nesvnic->netdev);
+ if (netif_queue_stopped(nesvnic->netdev))
+ netif_start_queue(nesvnic->netdev);
+ nesvnic->linkup = 1;
+ netif_carrier_on(nesvnic->netdev);
+
+ spin_lock(&nesvnic->port_ibevent_lock);
+ if (nesvnic->of_device_registered) {
+ if (nesdev->iw_status == 0) {
+ nesdev->iw_status = 1;
+ nes_port_ibevent(nesvnic);
+ }
+ }
+ spin_unlock(&nesvnic->port_ibevent_lock);
+ }
+ }
+ } else {
+ if (wide_ppm_offset &&
+ (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_CX4) &&
+ (nesadapter->hw_rev != NE020_REV)) {
+ cdr_ctrl = nes_read_indexed(nesdev,
+ NES_IDX_ETH_SERDES_CDR_CONTROL0 +
+ mac_index * 0x200);
+ nes_write_indexed(nesdev,
+ NES_IDX_ETH_SERDES_CDR_CONTROL0 +
+ mac_index * 0x200,
+ cdr_ctrl & 0xFFF0FFFF);
+ }
+ nesadapter->mac_link_down[mac_index] = 1;
+ list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) {
+ nes_debug(NES_DBG_PHY, "The Link is Down!!. linkup was %d\n",
+ nesvnic->linkup);
+ if (nesvnic->linkup == 1) {
+ printk(PFX "The Link is now down for port %s, netdev %p.\n",
+ nesvnic->netdev->name, nesvnic->netdev);
+ if (!(netif_queue_stopped(nesvnic->netdev)))
+ netif_stop_queue(nesvnic->netdev);
+ nesvnic->linkup = 0;
+ netif_carrier_off(nesvnic->netdev);
+
+ spin_lock(&nesvnic->port_ibevent_lock);
+ if (nesvnic->of_device_registered) {
+ if (nesdev->iw_status == 1) {
+ nesdev->iw_status = 0;
+ nes_port_ibevent(nesvnic);
+ }
+ }
+ spin_unlock(&nesvnic->port_ibevent_lock);
+ }
+ }
+ }
+ if (nesadapter->phy_type[mac_index] == NES_PHY_TYPE_SFP_D) {
+ nesdev->link_recheck = 1;
+ mod_delayed_work(system_wq, &nesdev->work,
+ NES_LINK_RECHECK_DELAY);
+ }
+ }
+
+ spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+
+ nesadapter->mac_sw_state[mac_number] = NES_MAC_SW_IDLE;
+}
+
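+/**
+ * nes_recheck_link_status
+ */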
+void nes_recheck_link_status(struct work_struct *work)
+{
+ unsigned long flags;
+ struct nes_device *nesdev = container_of(work, struct nes_device, work.work);
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ struct nes_vnic *nesvnic;
+ u32 mac_index = nesdev->mac_index;
+ u16 phy_data;
+ u16 temp_phy_data;
+
+ spin_lock_irqsave(&nesadapter->phy_lock, flags);
+
+ /* check link status */
+ nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 1, 0x9003);
+ temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+
+ nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021);
+ nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+ nes_read_10G_phy_reg(nesdev, nesadapter->phy_index[mac_index], 3, 0x0021);
+ phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+
+ phy_data = (!temp_phy_data && (phy_data == 0x8000)) ? 0x4 : 0x0;
+
+ nes_debug(NES_DBG_PHY, "%s: Phy data = 0x%04X, link was %s.\n",
+ __func__, phy_data,
+ nesadapter->mac_link_down[mac_index] ? "DOWN" : "UP");
+
+ if (phy_data & 0x0004) {
+ nesadapter->mac_link_down[mac_index] = 0;
+ list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) {
+ if (nesvnic->linkup == 0) {
+ printk(PFX "The Link is now up for port %s, netdev %p.\n",
+ nesvnic->netdev->name, nesvnic->netdev);
+ if (netif_queue_stopped(nesvnic->netdev))
+ netif_start_queue(nesvnic->netdev);
+ nesvnic->linkup = 1;
+ netif_carrier_on(nesvnic->netdev);
+
+ spin_lock(&nesvnic->port_ibevent_lock);
+ if (nesvnic->of_device_registered) {
+ if (nesdev->iw_status == 0) {
+ nesdev->iw_status = 1;
+ nes_port_ibevent(nesvnic);
+ }
+ }
+ spin_unlock(&nesvnic->port_ibevent_lock);
+ }
+ }
+
+ } else {
+ nesadapter->mac_link_down[mac_index] = 1;
+ list_for_each_entry(nesvnic, &nesadapter->nesvnic_list[mac_index], list) {
+ if (nesvnic->linkup == 1) {
+ printk(PFX "The Link is now down for port %s, netdev %p.\n",
+ nesvnic->netdev->name, nesvnic->netdev);
+ if (!(netif_queue_stopped(nesvnic->netdev)))
+ netif_stop_queue(nesvnic->netdev);
+ nesvnic->linkup = 0;
+ netif_carrier_off(nesvnic->netdev);
+
+ spin_lock(&nesvnic->port_ibevent_lock);
+ if (nesvnic->of_device_registered) {
+ if (nesdev->iw_status == 1) {
+ nesdev->iw_status = 0;
+ nes_port_ibevent(nesvnic);
+ }
+ }
+ spin_unlock(&nesvnic->port_ibevent_lock);
+ }
+ }
+ }
+ if (nesdev->link_recheck++ < NES_LINK_RECHECK_MAX)
+ schedule_delayed_work(&nesdev->work, NES_LINK_RECHECK_DELAY);
+ else
+ nesdev->link_recheck = 0;
+
+ spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+}
+
+
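+/**
+ * nes_nic_napi_ce_handler
+ */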
+static void nes_nic_napi_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq)
+{
+ struct nes_vnic *nesvnic = container_of(cq, struct nes_vnic, nic_cq);
+
+ napi_schedule(&nesvnic->napi);
+}
+
+
+/* MAX_RQES_TO_PROCESS defines the maximum number of receive completions to
+ * process before leaving nes_nic_ce_handler.
+ */
+#define MAX_RQES_TO_PROCESS 384
+
+/**
+ * nes_nic_ce_handler
+ */
+void nes_nic_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq)
+{
+ u64 u64temp;
+ dma_addr_t bus_address;
+ struct nes_hw_nic *nesnic;
+ struct nes_vnic *nesvnic = container_of(cq, struct nes_vnic, nic_cq);
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ struct nes_hw_nic_rq_wqe *nic_rqe;
+ struct nes_hw_nic_sq_wqe *nic_sqe;
+ struct sk_buff *skb;
+ struct sk_buff *rx_skb;
+ struct nes_rskb_cb *cb;
+ __le16 *wqe_fragment_length;
+ u32 head;
+ u32 cq_size;
+ u32 rx_pkt_size;
+	u32 cqe_count = 0;
+ u32 cqe_errv;
+ u32 cqe_misc;
+ u16 wqe_fragment_index = 1; /* first fragment (0) is used by copy buffer */
+ u16 vlan_tag;
+ u16 pkt_type;
+ u16 rqes_processed = 0;
+ u8 sq_cqes = 0;
+ u8 nes_use_lro = 0;
+
+ head = cq->cq_head;
+ cq_size = cq->cq_size;
+ cq->cqes_pending = 1;
+ if (nesvnic->netdev->features & NETIF_F_LRO)
+ nes_use_lro = 1;
+ do {
+ if (le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]) &
+ NES_NIC_CQE_VALID) {
+ nesnic = &nesvnic->nic;
+ cqe_misc = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]);
+ if (cqe_misc & NES_NIC_CQE_SQ) {
+ sq_cqes++;
+ wqe_fragment_index = 1;
+ nic_sqe = &nesnic->sq_vbase[nesnic->sq_tail];
+ skb = nesnic->tx_skb[nesnic->sq_tail];
+ wqe_fragment_length = (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX];
+ /* bump past the vlan tag */
+ wqe_fragment_length++;
+ if (le16_to_cpu(wqe_fragment_length[wqe_fragment_index]) != 0) {
+ u64temp = (u64) le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX +
+ wqe_fragment_index * 2]);
+ u64temp += ((u64)le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX +
+ wqe_fragment_index * 2])) << 32;
+ bus_address = (dma_addr_t)u64temp;
+ if (test_and_clear_bit(nesnic->sq_tail, nesnic->first_frag_overflow)) {
+ pci_unmap_single(nesdev->pcidev,
+ bus_address,
+ le16_to_cpu(wqe_fragment_length[wqe_fragment_index++]),
+ PCI_DMA_TODEVICE);
+ }
+ for (; wqe_fragment_index < 5; wqe_fragment_index++) {
+ if (wqe_fragment_length[wqe_fragment_index]) {
+ u64temp = le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX +
+ wqe_fragment_index * 2]);
+ u64temp += ((u64)le32_to_cpu(nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_HIGH_IDX
+							+ wqe_fragment_index * 2])) << 32;
+ bus_address = (dma_addr_t)u64temp;
+ pci_unmap_page(nesdev->pcidev,
+ bus_address,
+ le16_to_cpu(wqe_fragment_length[wqe_fragment_index]),
+ PCI_DMA_TODEVICE);
+ } else
+ break;
+ }
+ }
+ if (skb)
+ dev_kfree_skb_any(skb);
+ nesnic->sq_tail++;
+ nesnic->sq_tail &= nesnic->sq_size-1;
+ if (sq_cqes > 128) {
+ barrier();
+ /* restart the queue if it had been stopped */
+ if (netif_queue_stopped(nesvnic->netdev))
+ netif_wake_queue(nesvnic->netdev);
+ sq_cqes = 0;
+ }
+ } else {
+				rqes_processed++;
+
+ cq->rx_cqes_completed++;
+ cq->rx_pkts_indicated++;
+ rx_pkt_size = cqe_misc & 0x0000ffff;
+ nic_rqe = &nesnic->rq_vbase[nesnic->rq_tail];
+ /* Get the skb */
+ rx_skb = nesnic->rx_skb[nesnic->rq_tail];
+ nic_rqe = &nesnic->rq_vbase[nesvnic->nic.rq_tail];
+ bus_address = (dma_addr_t)le32_to_cpu(nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX]);
+ bus_address += ((u64)le32_to_cpu(nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX])) << 32;
+ pci_unmap_single(nesdev->pcidev, bus_address,
+ nesvnic->max_frame_size, PCI_DMA_FROMDEVICE);
+ cb = (struct nes_rskb_cb *)&rx_skb->cb[0];
+ cb->busaddr = 0;
+ /* rx_skb->tail = rx_skb->data + rx_pkt_size; */
+ /* rx_skb->len = rx_pkt_size; */
+ rx_skb->len = 0; /* TODO: see if this is necessary */
+ skb_put(rx_skb, rx_pkt_size);
+ rx_skb->protocol = eth_type_trans(rx_skb, nesvnic->netdev);
+ nesnic->rq_tail++;
+ nesnic->rq_tail &= nesnic->rq_size - 1;
+
+ atomic_inc(&nesvnic->rx_skbs_needed);
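+ /* Replenish the receive queue once at least half of its buffers have been consumed */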
+ if (atomic_read(&nesvnic->rx_skbs_needed) > (nesvnic->nic.rq_size>>1)) {
+ nes_write32(nesdev->regs+NES_CQE_ALLOC,
+ cq->cq_number | (cqe_count << 16));
+ /* nesadapter->tune_timer.cq_count += cqe_count; */
+ nesdev->currcq_count += cqe_count;
+ cqe_count = 0;
+ nes_replenish_nic_rq(nesvnic);
+ }
+ pkt_type = (u16)(le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_TAG_PKT_TYPE_IDX]));
+ cqe_errv = (cqe_misc & NES_NIC_CQE_ERRV_MASK) >> NES_NIC_CQE_ERRV_SHIFT;
+ rx_skb->ip_summed = CHECKSUM_NONE;
+
+ if ((NES_PKT_TYPE_TCPV4_BITS == (pkt_type & NES_PKT_TYPE_TCPV4_MASK)) ||
+ (NES_PKT_TYPE_UDPV4_BITS == (pkt_type & NES_PKT_TYPE_UDPV4_MASK))) {
+ if ((cqe_errv &
+ (NES_NIC_ERRV_BITS_IPV4_CSUM_ERR | NES_NIC_ERRV_BITS_TCPUDP_CSUM_ERR |
+ NES_NIC_ERRV_BITS_IPH_ERR | NES_NIC_ERRV_BITS_WQE_OVERRUN)) == 0) {
+ if (nesvnic->netdev->features & NETIF_F_RXCSUM)
+ rx_skb->ip_summed = CHECKSUM_UNNECESSARY;
+ } else
+ nes_debug(NES_DBG_CQ, "%s: unsuccessfully checksummed TCP or UDP packet."
+ " errv = 0x%X, pkt_type = 0x%X.\n",
+ nesvnic->netdev->name, cqe_errv, pkt_type);
+
+ } else if ((pkt_type & NES_PKT_TYPE_IPV4_MASK) == NES_PKT_TYPE_IPV4_BITS) {
+ if ((cqe_errv &
+ (NES_NIC_ERRV_BITS_IPV4_CSUM_ERR | NES_NIC_ERRV_BITS_IPH_ERR |
+ NES_NIC_ERRV_BITS_WQE_OVERRUN)) == 0) {
+ if (nesvnic->netdev->features & NETIF_F_RXCSUM) {
+ rx_skb->ip_summed = CHECKSUM_UNNECESSARY;
+ /* nes_debug(NES_DBG_CQ, "%s: Reporting successfully checksummed IPv4 packet.\n",
+ nesvnic->netdev->name); */
+ }
+ } else
+ nes_debug(NES_DBG_CQ, "%s: unsuccessfully checksummed TCP or UDP packet."
+ " errv = 0x%X, pkt_type = 0x%X.\n",
+ nesvnic->netdev->name, cqe_errv, pkt_type);
+ }
+ /* nes_debug(NES_DBG_CQ, "pkt_type=%x, APBVT_MASK=%x\n",
+ pkt_type, (pkt_type & NES_PKT_TYPE_APBVT_MASK)); */
+
+ if ((pkt_type & NES_PKT_TYPE_APBVT_MASK) == NES_PKT_TYPE_APBVT_BITS) {
+ if (nes_cm_recv(rx_skb, nesvnic->netdev))
+ rx_skb = NULL;
+ }
+ if (rx_skb == NULL)
+ goto skip_rx_indicate0;
+
+
+ if (cqe_misc & NES_NIC_CQE_TAG_VALID) {
+ vlan_tag = (u16)(le32_to_cpu(
+ cq->cq_vbase[head].cqe_words[NES_NIC_CQE_TAG_PKT_TYPE_IDX])
+ >> 16);
+ nes_debug(NES_DBG_CQ, "%s: Reporting stripped VLAN packet. Tag = 0x%04X\n",
+ nesvnic->netdev->name, vlan_tag);
+
+ __vlan_hwaccel_put_tag(rx_skb, htons(ETH_P_8021Q), vlan_tag);
+ }
+ if (nes_use_lro)
+ lro_receive_skb(&nesvnic->lro_mgr, rx_skb, NULL);
+ else
+ netif_receive_skb(rx_skb);
+
+skip_rx_indicate0:
+ ;
+ /* nesvnic->netstats.rx_packets++; */
+ /* nesvnic->netstats.rx_bytes += rx_pkt_size; */
+ }
+
+ cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX] = 0;
+ /* Accounting... */
+ cqe_count++;
+ if (++head >= cq_size)
+ head = 0;
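+ /* Return completed CQEs to the hardware in batches rather than all at once */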
+ if (cqe_count == 255) {
+ /* Replenish Nic CQ */
+ nes_write32(nesdev->regs+NES_CQE_ALLOC,
+ cq->cq_number | (cqe_count << 16));
+ /* nesdev->nesadapter->tune_timer.cq_count += cqe_count; */
+ nesdev->currcq_count += cqe_count;
+ cqe_count = 0;
+ }
+
+ if (cq->rx_cqes_completed >= nesvnic->budget)
+ break;
+ } else {
+ cq->cqes_pending = 0;
+ break;
+ }
+
+ } while (1);
+
+ if (nes_use_lro)
+ lro_flush_all(&nesvnic->lro_mgr);
+ if (sq_cqes) {
+ barrier();
+ /* restart the queue if it had been stopped */
+ if (netif_queue_stopped(nesvnic->netdev))
+ netif_wake_queue(nesvnic->netdev);
+ }
+ cq->cq_head = head;
+ /* nes_debug(NES_DBG_CQ, "CQ%u Processed = %u cqes, new head = %u.\n",
+ cq->cq_number, cqe_count, cq->cq_head); */
+ cq->cqe_allocs_pending = cqe_count;
+ if (unlikely(nesadapter->et_use_adaptive_rx_coalesce)) {
+ /* nesdev->nesadapter->tune_timer.cq_count += cqe_count; */
+ nesdev->currcq_count += cqe_count;
+ nes_nic_tune_timer(nesdev);
+ }
+ if (atomic_read(&nesvnic->rx_skbs_needed))
+ nes_replenish_nic_rq(nesvnic);
+}
+
+
+
+/**
+ * nes_cqp_ce_handler - process completions on the Control QP's CQ and post pending CQP requests
+ */
+static void nes_cqp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *cq)
+{
+ u64 u64temp;
+ unsigned long flags;
+ struct nes_hw_cqp *cqp = NULL;
+ struct nes_cqp_request *cqp_request;
+ struct nes_hw_cqp_wqe *cqp_wqe;
+ u32 head;
+ u32 cq_size;
+ u32 cqe_count=0;
+ u32 error_code;
+ u32 opcode;
+ u32 ctx_index;
+ /* u32 counter; */
+
+ head = cq->cq_head;
+ cq_size = cq->cq_size;
+
+ do {
+ /* process the CQE */
+ /* nes_debug(NES_DBG_CQP, "head=%u cqe_words=%08X\n", head,
+ le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX])); */
+
+ opcode = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX]);
+ if (opcode & NES_CQE_VALID) {
+ cqp = &nesdev->cqp;
+
+ error_code = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_ERROR_CODE_IDX]);
+ if (error_code) {
+ nes_debug(NES_DBG_CQP, "Bad Completion code for opcode 0x%02X from CQP,"
+ " Major/Minor codes = 0x%04X:%04X.\n",
+ le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX])&0x3f,
+ (u16)(error_code >> 16),
+ (u16)error_code);
+ }
+
+ u64temp = (((u64)(le32_to_cpu(cq->cq_vbase[head].
+ cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX]))) << 32) |
+ ((u64)(le32_to_cpu(cq->cq_vbase[head].
+ cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX])));
+
+ cqp_request = (struct nes_cqp_request *)(unsigned long)u64temp;
+ if (cqp_request) {
+ if (cqp_request->waiting) {
+ /* nes_debug(NES_DBG_CQP, "%s: Waking up requestor\n"); */
+ cqp_request->major_code = (u16)(error_code >> 16);
+ cqp_request->minor_code = (u16)error_code;
+ barrier();
+ cqp_request->request_done = 1;
+ wake_up(&cqp_request->waitq);
+ nes_put_cqp_request(nesdev, cqp_request);
+ } else {
+ if (cqp_request->callback)
+ cqp_request->cqp_callback(nesdev, cqp_request);
+ nes_free_cqp_request(nesdev, cqp_request);
+ }
+ } else {
+ wake_up(&nesdev->cqp.waitq);
+ }
+
+ cq->cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX] = 0;
+ nes_write32(nesdev->regs + NES_CQE_ALLOC, cq->cq_number | (1 << 16));
+ if (++cqp->sq_tail >= cqp->sq_size)
+ cqp->sq_tail = 0;
+
+ /* Accounting... */
+ cqe_count++;
+ if (++head >= cq_size)
+ head = 0;
+ } else {
+ break;
+ }
+ } while (1);
+ cq->cq_head = head;
+
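+ /* CQP SQ slots were just freed; post any pending CQP requests that now fit */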
+ spin_lock_irqsave(&nesdev->cqp.lock, flags);
+ while ((!list_empty(&nesdev->cqp_pending_reqs)) &&
+ ((((nesdev->cqp.sq_tail+nesdev->cqp.sq_size)-nesdev->cqp.sq_head) &
+ (nesdev->cqp.sq_size - 1)) != 1)) {
+ cqp_request = list_entry(nesdev->cqp_pending_reqs.next,
+ struct nes_cqp_request, list);
+ list_del_init(&cqp_request->list);
+ head = nesdev->cqp.sq_head++;
+ nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
+ cqp_wqe = &nesdev->cqp.sq_vbase[head];
+ memcpy(cqp_wqe, &cqp_request->cqp_wqe, sizeof(*cqp_wqe));
+ barrier();
+
+ opcode = cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX];
+ if ((opcode & NES_CQP_OPCODE_MASK) == NES_CQP_DOWNLOAD_SEGMENT)
+ ctx_index = NES_CQP_WQE_DL_COMP_CTX_LOW_IDX;
+ else
+ ctx_index = NES_CQP_WQE_COMP_CTX_LOW_IDX;
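+ /* Stash the request pointer in the WQE completion context so the CQE handler can retrieve it */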
+ cqp_wqe->wqe_words[ctx_index] =
+ cpu_to_le32((u32)((unsigned long)cqp_request));
+ cqp_wqe->wqe_words[ctx_index + 1] =
+ cpu_to_le32((u32)(upper_32_bits((unsigned long)cqp_request)));
+ nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) put on CQPs SQ wqe%u.\n",
+ cqp_request, le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f, head);
+ /* Ring doorbell (1 WQEs) */
+ barrier();
+ nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x01800000 | nesdev->cqp.qp_id);
+ }
+ spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+
+ /* Arm the CCQ */
+ nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
+ cq->cq_number);
+ nes_read32(nesdev->regs+NES_CQE_ALLOC);
+}
+
+static u8 *locate_mpa(u8 *pkt, u32 aeq_info)
+{
+ if (aeq_info & NES_AEQE_Q2_DATA_ETHERNET) {
+ /* skip over ethernet header */
+ pkt += ETH_HLEN;
+
+ /* Skip over the IP header; the low nibble of the first byte is the IHL in 32-bit words */
+ pkt += 4 * (pkt[0] & 0x0f);
+ /* Skip over the TCP header; the high nibble of byte 12 is the data offset in 32-bit words */
+ pkt += 4 * ((pkt[12] >> 4) & 0x0f);
+ }
+ return pkt;
+}
+
+/* Determine if incoming error pkt is rdma layer */
+static u32 iwarp_opcode(struct nes_qp *nesqp, u32 aeq_info)
+{
+ u8 *pkt;
+ u16 *mpa;
+ u32 opcode = 0xffffffff;
+
+ if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) {
+ pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET;
+ mpa = (u16 *)locate_mpa(pkt, aeq_info);
+ opcode = be16_to_cpu(mpa[1]) & 0xf;
+ }
+
+ return opcode;
+}
+
+/* Build iWARP terminate header */
+static int nes_bld_terminate_hdr(struct nes_qp *nesqp, u16 async_event_id, u32 aeq_info)
+{
+ u8 *pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET;
+ u16 ddp_seg_len;
+ int copy_len = 0;
+ u8 is_tagged = 0;
+ u8 flush_code = 0;
+ struct nes_terminate_hdr *termhdr;
+
+ termhdr = (struct nes_terminate_hdr *)nesqp->hwqp.q2_vbase;
+ memset(termhdr, 0, 64);
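+ /* The terminate header is built at the start of Q2; the offending frame, if captured, lies at BAD_FRAME_OFFSET */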
+
+ if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) {
+
+ /* Use data from offending packet to fill in ddp & rdma hdrs */
+ pkt = locate_mpa(pkt, aeq_info);
+ ddp_seg_len = be16_to_cpu(*(u16 *)pkt);
+ if (ddp_seg_len) {
+ copy_len = 2;
+ termhdr->hdrct = DDP_LEN_FLAG;
+ if (pkt[2] & 0x80) {
+ is_tagged = 1;
+ if (ddp_seg_len >= TERM_DDP_LEN_TAGGED) {
+ copy_len += TERM_DDP_LEN_TAGGED;
+ termhdr->hdrct |= DDP_HDR_FLAG;
+ }
+ } else {
+ if (ddp_seg_len >= TERM_DDP_LEN_UNTAGGED) {
+ copy_len += TERM_DDP_LEN_UNTAGGED;
+ termhdr->hdrct |= DDP_HDR_FLAG;
+ }
+
+ if (ddp_seg_len >= (TERM_DDP_LEN_UNTAGGED + TERM_RDMA_LEN)) {
+ if ((pkt[3] & RDMA_OPCODE_MASK) == RDMA_READ_REQ_OPCODE) {
+ copy_len += TERM_RDMA_LEN;
+ termhdr->hdrct |= RDMA_HDR_FLAG;
+ }
+ }
+ }
+ }
+ }
+
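+ /* Map the asynchronous event to an IB flush code and an iWARP terminate layer/error code */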
+ switch (async_event_id) {
+ case NES_AEQE_AEID_AMP_UNALLOCATED_STAG:
+ switch (iwarp_opcode(nesqp, aeq_info)) {
+ case IWARP_OPCODE_WRITE:
+ flush_code = IB_WC_LOC_PROT_ERR;
+ termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER;
+ termhdr->error_code = DDP_TAGGED_INV_STAG;
+ break;
+ default:
+ flush_code = IB_WC_REM_ACCESS_ERR;
+ termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
+ termhdr->error_code = RDMAP_INV_STAG;
+ }
+ break;
+ case NES_AEQE_AEID_AMP_INVALID_STAG:
+ flush_code = IB_WC_REM_ACCESS_ERR;
+ termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
+ termhdr->error_code = RDMAP_INV_STAG;
+ break;
+ case NES_AEQE_AEID_AMP_BAD_QP:
+ flush_code = IB_WC_LOC_QP_OP_ERR;
+ termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER;
+ termhdr->error_code = DDP_UNTAGGED_INV_QN;
+ break;
+ case NES_AEQE_AEID_AMP_BAD_STAG_KEY:
+ case NES_AEQE_AEID_AMP_BAD_STAG_INDEX:
+ switch (iwarp_opcode(nesqp, aeq_info)) {
+ case IWARP_OPCODE_SEND_INV:
+ case IWARP_OPCODE_SEND_SE_INV:
+ flush_code = IB_WC_REM_OP_ERR;
+ termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP;
+ termhdr->error_code = RDMAP_CANT_INV_STAG;
+ break;
+ default:
+ flush_code = IB_WC_REM_ACCESS_ERR;
+ termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
+ termhdr->error_code = RDMAP_INV_STAG;
+ }
+ break;
+ case NES_AEQE_AEID_AMP_BOUNDS_VIOLATION:
+ if (aeq_info & (NES_AEQE_Q2_DATA_ETHERNET | NES_AEQE_Q2_DATA_MPA)) {
+ flush_code = IB_WC_LOC_PROT_ERR;
+ termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER;
+ termhdr->error_code = DDP_TAGGED_BOUNDS;
+ } else {
+ flush_code = IB_WC_REM_ACCESS_ERR;
+ termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
+ termhdr->error_code = RDMAP_INV_BOUNDS;
+ }
+ break;
+ case NES_AEQE_AEID_AMP_RIGHTS_VIOLATION:
+ case NES_AEQE_AEID_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS:
+ case NES_AEQE_AEID_PRIV_OPERATION_DENIED:
+ flush_code = IB_WC_REM_ACCESS_ERR;
+ termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
+ termhdr->error_code = RDMAP_ACCESS;
+ break;
+ case NES_AEQE_AEID_AMP_TO_WRAP:
+ flush_code = IB_WC_REM_ACCESS_ERR;
+ termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
+ termhdr->error_code = RDMAP_TO_WRAP;
+ break;
+ case NES_AEQE_AEID_AMP_BAD_PD:
+ switch (iwarp_opcode(nesqp, aeq_info)) {
+ case IWARP_OPCODE_WRITE:
+ flush_code = IB_WC_LOC_PROT_ERR;
+ termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER;
+ termhdr->error_code = DDP_TAGGED_UNASSOC_STAG;
+ break;
+ case IWARP_OPCODE_SEND_INV:
+ case IWARP_OPCODE_SEND_SE_INV:
+ flush_code = IB_WC_REM_ACCESS_ERR;
+ termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
+ termhdr->error_code = RDMAP_CANT_INV_STAG;
+ break;
+ default:
+ flush_code = IB_WC_REM_ACCESS_ERR;
+ termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_PROT;
+ termhdr->error_code = RDMAP_UNASSOC_STAG;
+ }
+ break;
+ case NES_AEQE_AEID_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH:
+ flush_code = IB_WC_LOC_LEN_ERR;
+ termhdr->layer_etype = (LAYER_MPA << 4) | DDP_LLP;
+ termhdr->error_code = MPA_MARKER;
+ break;
+ case NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR:
+ flush_code = IB_WC_GENERAL_ERR;
+ termhdr->layer_etype = (LAYER_MPA << 4) | DDP_LLP;
+ termhdr->error_code = MPA_CRC;
+ break;
+ case NES_AEQE_AEID_LLP_SEGMENT_TOO_LARGE:
+ case NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL:
+ flush_code = IB_WC_LOC_LEN_ERR;
+ termhdr->layer_etype = (LAYER_DDP << 4) | DDP_CATASTROPHIC;
+ termhdr->error_code = DDP_CATASTROPHIC_LOCAL;
+ break;
+ case NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC:
+ case NES_AEQE_AEID_DDP_NO_L_BIT:
+ flush_code = IB_WC_FATAL_ERR;
+ termhdr->layer_etype = (LAYER_DDP << 4) | DDP_CATASTROPHIC;
+ termhdr->error_code = DDP_CATASTROPHIC_LOCAL;
+ break;
+ case NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN:
+ case NES_AEQE_AEID_DDP_INVALID_MSN_RANGE_IS_NOT_VALID:
+ flush_code = IB_WC_GENERAL_ERR;
+ termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER;
+ termhdr->error_code = DDP_UNTAGGED_INV_MSN_RANGE;
+ break;
+ case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER:
+ flush_code = IB_WC_LOC_LEN_ERR;
+ termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER;
+ termhdr->error_code = DDP_UNTAGGED_INV_TOO_LONG;
+ break;
+ case NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION:
+ flush_code = IB_WC_GENERAL_ERR;
+ if (is_tagged) {
+ termhdr->layer_etype = (LAYER_DDP << 4) | DDP_TAGGED_BUFFER;
+ termhdr->error_code = DDP_TAGGED_INV_DDP_VER;
+ } else {
+ termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER;
+ termhdr->error_code = DDP_UNTAGGED_INV_DDP_VER;
+ }
+ break;
+ case NES_AEQE_AEID_DDP_UBE_INVALID_MO:
+ flush_code = IB_WC_GENERAL_ERR;
+ termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER;
+ termhdr->error_code = DDP_UNTAGGED_INV_MO;
+ break;
+ case NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE:
+ flush_code = IB_WC_REM_OP_ERR;
+ termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER;
+ termhdr->error_code = DDP_UNTAGGED_INV_MSN_NO_BUF;
+ break;
+ case NES_AEQE_AEID_DDP_UBE_INVALID_QN:
+ flush_code = IB_WC_GENERAL_ERR;
+ termhdr->layer_etype = (LAYER_DDP << 4) | DDP_UNTAGGED_BUFFER;
+ termhdr->error_code = DDP_UNTAGGED_INV_QN;
+ break;
+ case NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION:
+ flush_code = IB_WC_GENERAL_ERR;
+ termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP;
+ termhdr->error_code = RDMAP_INV_RDMAP_VER;
+ break;
+ case NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE:
+ flush_code = IB_WC_LOC_QP_OP_ERR;
+ termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP;
+ termhdr->error_code = RDMAP_UNEXPECTED_OP;
+ break;
+ default:
+ flush_code = IB_WC_FATAL_ERR;
+ termhdr->layer_etype = (LAYER_RDMA << 4) | RDMAP_REMOTE_OP;
+ termhdr->error_code = RDMAP_UNSPECIFIED;
+ break;
+ }
+
+ if (copy_len)
+ memcpy(termhdr + 1, pkt, copy_len);
+
+ if ((flush_code) && ((NES_AEQE_INBOUND_RDMA & aeq_info) == 0)) {
+ if (aeq_info & NES_AEQE_SQ)
+ nesqp->term_sq_flush_code = flush_code;
+ else
+ nesqp->term_rq_flush_code = flush_code;
+ }
+
+ return sizeof(struct nes_terminate_hdr) + copy_len;
+}
+
+static void nes_terminate_connection(struct nes_device *nesdev, struct nes_qp *nesqp,
+ struct nes_hw_aeqe *aeqe, enum ib_event_type eventtype)
+{
+ u64 context;
+ unsigned long flags;
+ u32 aeq_info;
+ u16 async_event_id;
+ u8 tcp_state;
+ u8 iwarp_state;
+ u32 termlen = 0;
+ u32 mod_qp_flags = NES_CQP_QP_IWARP_STATE_TERMINATE |
+ NES_CQP_QP_TERM_DONT_SEND_FIN;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+
+ if (nesqp->term_flags & NES_TERM_SENT)
+ return; /* Sanity check */
+
+ aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]);
+ tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT;
+ iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT;
+ async_event_id = (u16)aeq_info;
+
+ context = (unsigned long)nesadapter->qp_table[le32_to_cpu(
+ aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]) - NES_FIRST_QPN];
+ if (!context) {
+ WARN_ON(!context);
+ return;
+ }
+
+ nesqp = (struct nes_qp *)(unsigned long)context;
+ spin_lock_irqsave(&nesqp->lock, flags);
+ nesqp->hw_iwarp_state = iwarp_state;
+ nesqp->hw_tcp_state = tcp_state;
+ nesqp->last_aeq = async_event_id;
+ nesqp->terminate_eventtype = eventtype;
+ spin_unlock_irqrestore(&nesqp->lock, flags);
+
+ if (nesadapter->send_term_ok)
+ termlen = nes_bld_terminate_hdr(nesqp, async_event_id, aeq_info);
+ else
+ mod_qp_flags |= NES_CQP_QP_TERM_DONT_SEND_TERM_MSG;
+
+ if (!nesdev->iw_status) {
+ nesqp->term_flags = NES_TERM_DONE;
+ nes_hw_modify_qp(nesdev, nesqp, NES_CQP_QP_IWARP_STATE_ERROR, 0, 0);
+ nes_cm_disconn(nesqp);
+ } else {
+ nes_terminate_start_timer(nesqp);
+ nesqp->term_flags |= NES_TERM_SENT;
+ nes_hw_modify_qp(nesdev, nesqp, mod_qp_flags, termlen, 0);
+ }
+}
+
+static void nes_terminate_send_fin(struct nes_device *nesdev,
+ struct nes_qp *nesqp, struct nes_hw_aeqe *aeqe)
+{
+ u32 aeq_info;
+ u16 async_event_id;
+ u8 tcp_state;
+ u8 iwarp_state;
+ unsigned long flags;
+
+ aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]);
+ tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT;
+ iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT;
+ async_event_id = (u16)aeq_info;
+
+ spin_lock_irqsave(&nesqp->lock, flags);
+ nesqp->hw_iwarp_state = iwarp_state;
+ nesqp->hw_tcp_state = tcp_state;
+ nesqp->last_aeq = async_event_id;
+ spin_unlock_irqrestore(&nesqp->lock, flags);
+
+ /* Send the fin only */
+ nes_hw_modify_qp(nesdev, nesqp, NES_CQP_QP_IWARP_STATE_TERMINATE |
+ NES_CQP_QP_TERM_DONT_SEND_TERM_MSG, 0, 0);
+}
+
+/* Cleanup after a terminate sent or received */
+static void nes_terminate_done(struct nes_qp *nesqp, int timeout_occurred)
+{
+ u32 next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR;
+ unsigned long flags;
+ struct nes_vnic *nesvnic = to_nesvnic(nesqp->ibqp.device);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ u8 first_time = 0;
+
+ spin_lock_irqsave(&nesqp->lock, flags);
+ if (nesqp->hte_added) {
+ nesqp->hte_added = 0;
+ next_iwarp_state |= NES_CQP_QP_DEL_HTE;
+ }
+
+ first_time = (nesqp->term_flags & NES_TERM_DONE) == 0;
+ nesqp->term_flags |= NES_TERM_DONE;
+ spin_unlock_irqrestore(&nesqp->lock, flags);
+
+ /* Make sure we go through this only once */
+ if (first_time) {
+ if (timeout_occurred == 0)
+ del_timer(&nesqp->terminate_timer);
+ else
+ next_iwarp_state |= NES_CQP_QP_RESET;
+
+ nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0);
+ nes_cm_disconn(nesqp);
+ }
+}
+
+static void nes_terminate_received(struct nes_device *nesdev,
+ struct nes_qp *nesqp, struct nes_hw_aeqe *aeqe)
+{
+ u32 aeq_info;
+ u8 *pkt;
+ u32 *mpa;
+ u8 ddp_ctl;
+ u8 rdma_ctl;
+ u16 aeq_id = 0;
+
+ aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]);
+ if (aeq_info & NES_AEQE_Q2_DATA_WRITTEN) {
+ /* Terminate is not a performance path, so the silicon did not
+ * validate the frame - do it now.
+ */
+ pkt = nesqp->hwqp.q2_vbase + BAD_FRAME_OFFSET;
+ mpa = (u32 *)locate_mpa(pkt, aeq_info);
+ ddp_ctl = (be32_to_cpu(mpa[0]) >> 8) & 0xff;
+ rdma_ctl = be32_to_cpu(mpa[0]) & 0xff;
+ if ((ddp_ctl & 0xc0) != 0x40)
+ aeq_id = NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC;
+ else if ((ddp_ctl & 0x03) != 1)
+ aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION;
+ else if (be32_to_cpu(mpa[2]) != 2)
+ aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_QN;
+ else if (be32_to_cpu(mpa[3]) != 1)
+ aeq_id = NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN;
+ else if (be32_to_cpu(mpa[4]) != 0)
+ aeq_id = NES_AEQE_AEID_DDP_UBE_INVALID_MO;
+ else if ((rdma_ctl & 0xc0) != 0x40)
+ aeq_id = NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION;
+
+ if (aeq_id) {
+ /* Bad terminate recvd - send back a terminate */
+ aeq_info = (aeq_info & 0xffff0000) | aeq_id;
+ aeqe->aeqe_words[NES_AEQE_MISC_IDX] = cpu_to_le32(aeq_info);
+ nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL);
+ return;
+ }
+ }
+
+ nesqp->term_flags |= NES_TERM_RCVD;
+ nesqp->terminate_eventtype = IB_EVENT_QP_FATAL;
+ nes_terminate_start_timer(nesqp);
+ nes_terminate_send_fin(nesdev, nesqp, aeqe);
+}
+
+/* Timeout routine in case terminate fails to complete */
+void nes_terminate_timeout(unsigned long context)
+{
+ struct nes_qp *nesqp = (struct nes_qp *)(unsigned long)context;
+
+ nes_terminate_done(nesqp, 1);
+}
+
+/* Set a timer in case hw cannot complete the terminate sequence */
+static void nes_terminate_start_timer(struct nes_qp *nesqp)
+{
+ mod_timer(&nesqp->terminate_timer, (jiffies + HZ));
+}
+
+/**
+ * nes_process_iwarp_aeqe - handle one iWARP asynchronous event queue entry
+ */
+static void nes_process_iwarp_aeqe(struct nes_device *nesdev,
+ struct nes_hw_aeqe *aeqe)
+{
+ u64 context;
+ unsigned long flags;
+ struct nes_qp *nesqp;
+ struct nes_hw_cq *hw_cq;
+ struct nes_cq *nescq;
+ int resource_allocated;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ u32 aeq_info;
+ u32 next_iwarp_state = 0;
+ u32 aeqe_cq_id;
+ u16 async_event_id;
+ u8 tcp_state;
+ u8 iwarp_state;
+ struct ib_event ibevent;
+
+ nes_debug(NES_DBG_AEQ, "\n");
+ aeq_info = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_MISC_IDX]);
+ if ((NES_AEQE_INBOUND_RDMA & aeq_info) || (!(NES_AEQE_QP & aeq_info))) {
+ context = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_LOW_IDX]);
+ context += ((u64)le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_CTXT_HIGH_IDX])) << 32;
+ } else {
+ context = (unsigned long)nesadapter->qp_table[le32_to_cpu(
+ aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]) - NES_FIRST_QPN];
+ BUG_ON(!context);
+ }
+
+ /* context is nesqp unless async_event_id == CQ ERROR */
+ nesqp = (struct nes_qp *)(unsigned long)context;
+ async_event_id = (u16)aeq_info;
+ tcp_state = (aeq_info & NES_AEQE_TCP_STATE_MASK) >> NES_AEQE_TCP_STATE_SHIFT;
+ iwarp_state = (aeq_info & NES_AEQE_IWARP_STATE_MASK) >> NES_AEQE_IWARP_STATE_SHIFT;
+ nes_debug(NES_DBG_AEQ, "aeid = 0x%04X, qp-cq id = %d, aeqe = %p,"
+ " Tcp state = %s, iWARP state = %s\n",
+ async_event_id,
+ le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]), aeqe,
+ nes_tcp_state_str[tcp_state], nes_iwarp_state_str[iwarp_state]);
+
+ aeqe_cq_id = le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]);
+ if (aeq_info & NES_AEQE_QP) {
+ if (!nes_is_resource_allocated(nesadapter,
+ nesadapter->allocated_qps,
+ aeqe_cq_id))
+ return;
+ }
+
+ switch (async_event_id) {
+ case NES_AEQE_AEID_LLP_FIN_RECEIVED:
+ if (nesqp->term_flags)
+ return; /* Ignore it, wait for close complete */
+
+ if (atomic_inc_return(&nesqp->close_timer_started) == 1) {
+ if ((tcp_state == NES_AEQE_TCP_STATE_CLOSE_WAIT) &&
+ (nesqp->ibqp_state == IB_QPS_RTS)) {
+ spin_lock_irqsave(&nesqp->lock, flags);
+ nesqp->hw_iwarp_state = iwarp_state;
+ nesqp->hw_tcp_state = tcp_state;
+ nesqp->last_aeq = async_event_id;
+ next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING;
+ nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING;
+ spin_unlock_irqrestore(&nesqp->lock, flags);
+ nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0);
+ nes_cm_disconn(nesqp);
+ }
+ nesqp->cm_id->add_ref(nesqp->cm_id);
+ schedule_nes_timer(nesqp->cm_node, (struct sk_buff *)nesqp,
+ NES_TIMER_TYPE_CLOSE, 1, 0);
+ nes_debug(NES_DBG_AEQ, "QP%u Not decrementing QP refcount (%d),"
+ " need ae to finish up, original_last_aeq = 0x%04X."
+ " last_aeq = 0x%04X, scheduling timer. TCP state = %d\n",
+ nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
+ async_event_id, nesqp->last_aeq, tcp_state);
+ }
+ break;
+ case NES_AEQE_AEID_LLP_CLOSE_COMPLETE:
+ spin_lock_irqsave(&nesqp->lock, flags);
+ nesqp->hw_iwarp_state = iwarp_state;
+ nesqp->hw_tcp_state = tcp_state;
+ nesqp->last_aeq = async_event_id;
+ spin_unlock_irqrestore(&nesqp->lock, flags);
+ nes_cm_disconn(nesqp);
+ break;
+
+ case NES_AEQE_AEID_RESET_SENT:
+ tcp_state = NES_AEQE_TCP_STATE_CLOSED;
+ spin_lock_irqsave(&nesqp->lock, flags);
+ nesqp->hw_iwarp_state = iwarp_state;
+ nesqp->hw_tcp_state = tcp_state;
+ nesqp->last_aeq = async_event_id;
+ nesqp->hte_added = 0;
+ spin_unlock_irqrestore(&nesqp->lock, flags);
+ next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR | NES_CQP_QP_DEL_HTE;
+ nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 0);
+ nes_cm_disconn(nesqp);
+ break;
+
+ case NES_AEQE_AEID_LLP_CONNECTION_RESET:
+ if (atomic_read(&nesqp->close_timer_started))
+ return;
+ spin_lock_irqsave(&nesqp->lock, flags);
+ nesqp->hw_iwarp_state = iwarp_state;
+ nesqp->hw_tcp_state = tcp_state;
+ nesqp->last_aeq = async_event_id;
+ spin_unlock_irqrestore(&nesqp->lock, flags);
+ nes_cm_disconn(nesqp);
+ break;
+
+ case NES_AEQE_AEID_TERMINATE_SENT:
+ nes_terminate_send_fin(nesdev, nesqp, aeqe);
+ break;
+
+ case NES_AEQE_AEID_LLP_TERMINATE_RECEIVED:
+ nes_terminate_received(nesdev, nesqp, aeqe);
+ break;
+
+ case NES_AEQE_AEID_AMP_BAD_STAG_KEY:
+ case NES_AEQE_AEID_AMP_BAD_STAG_INDEX:
+ case NES_AEQE_AEID_AMP_UNALLOCATED_STAG:
+ case NES_AEQE_AEID_AMP_INVALID_STAG:
+ case NES_AEQE_AEID_AMP_RIGHTS_VIOLATION:
+ case NES_AEQE_AEID_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS:
+ case NES_AEQE_AEID_PRIV_OPERATION_DENIED:
+ case NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER:
+ case NES_AEQE_AEID_AMP_BOUNDS_VIOLATION:
+ case NES_AEQE_AEID_AMP_TO_WRAP:
+ printk(KERN_ERR PFX "QP[%u] async_event_id=0x%04X IB_EVENT_QP_ACCESS_ERR\n",
+ nesqp->hwqp.qp_id, async_event_id);
+ nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_ACCESS_ERR);
+ break;
+
+ case NES_AEQE_AEID_LLP_SEGMENT_TOO_LARGE:
+ case NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL:
+ case NES_AEQE_AEID_DDP_UBE_INVALID_MO:
+ case NES_AEQE_AEID_DDP_UBE_INVALID_QN:
+ if (iwarp_opcode(nesqp, aeq_info) > IWARP_OPCODE_TERM) {
+ aeq_info &= 0xffff0000;
+ aeq_info |= NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE;
+ aeqe->aeqe_words[NES_AEQE_MISC_IDX] = cpu_to_le32(aeq_info);
+ }
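+ /* fall through */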
+
+ case NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE:
+ case NES_AEQE_AEID_LLP_TOO_MANY_RETRIES:
+ case NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE:
+ case NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR:
+ case NES_AEQE_AEID_AMP_BAD_QP:
+ case NES_AEQE_AEID_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH:
+ case NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC:
+ case NES_AEQE_AEID_DDP_NO_L_BIT:
+ case NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN:
+ case NES_AEQE_AEID_DDP_INVALID_MSN_RANGE_IS_NOT_VALID:
+ case NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION:
+ case NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION:
+ case NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE:
+ case NES_AEQE_AEID_AMP_BAD_PD:
+ case NES_AEQE_AEID_AMP_FASTREG_SHARED:
+ case NES_AEQE_AEID_AMP_FASTREG_VALID_STAG:
+ case NES_AEQE_AEID_AMP_FASTREG_MW_STAG:
+ case NES_AEQE_AEID_AMP_FASTREG_INVALID_RIGHTS:
+ case NES_AEQE_AEID_AMP_FASTREG_PBL_TABLE_OVERFLOW:
+ case NES_AEQE_AEID_AMP_FASTREG_INVALID_LENGTH:
+ case NES_AEQE_AEID_AMP_INVALIDATE_SHARED:
+ case NES_AEQE_AEID_AMP_INVALIDATE_MR_WITH_BOUND_WINDOWS:
+ case NES_AEQE_AEID_AMP_MWBIND_VALID_STAG:
+ case NES_AEQE_AEID_AMP_MWBIND_OF_MR_STAG:
+ case NES_AEQE_AEID_AMP_MWBIND_TO_ZERO_BASED_STAG:
+ case NES_AEQE_AEID_AMP_MWBIND_TO_MW_STAG:
+ case NES_AEQE_AEID_AMP_MWBIND_INVALID_RIGHTS:
+ case NES_AEQE_AEID_AMP_MWBIND_INVALID_BOUNDS:
+ case NES_AEQE_AEID_AMP_MWBIND_TO_INVALID_PARENT:
+ case NES_AEQE_AEID_AMP_MWBIND_BIND_DISABLED:
+ case NES_AEQE_AEID_BAD_CLOSE:
+ case NES_AEQE_AEID_RDMA_READ_WHILE_ORD_ZERO:
+ case NES_AEQE_AEID_STAG_ZERO_INVALID:
+ case NES_AEQE_AEID_ROE_INVALID_RDMA_READ_REQUEST:
+ case NES_AEQE_AEID_ROE_INVALID_RDMA_WRITE_OR_READ_RESP:
+ printk(KERN_ERR PFX "QP[%u] async_event_id=0x%04X IB_EVENT_QP_FATAL\n",
+ nesqp->hwqp.qp_id, async_event_id);
+ print_ip(nesqp->cm_node);
+ if (!atomic_read(&nesqp->close_timer_started))
+ nes_terminate_connection(nesdev, nesqp, aeqe, IB_EVENT_QP_FATAL);
+ break;
+
+ case NES_AEQE_AEID_CQ_OPERATION_ERROR:
+ context <<= 1;
+ nes_debug(NES_DBG_AEQ, "Processing an NES_AEQE_AEID_CQ_OPERATION_ERROR event on CQ%u, %p\n",
+ le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]), (void *)(unsigned long)context);
+ resource_allocated = nes_is_resource_allocated(nesadapter, nesadapter->allocated_cqs,
+ le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]));
+ if (resource_allocated) {
+ printk(KERN_ERR PFX "%s: Processing an NES_AEQE_AEID_CQ_OPERATION_ERROR event on CQ%u\n",
+ __func__, le32_to_cpu(aeqe->aeqe_words[NES_AEQE_COMP_QP_CQ_ID_IDX]));
+ hw_cq = (struct nes_hw_cq *)(unsigned long)context;
+ if (hw_cq) {
+ nescq = container_of(hw_cq, struct nes_cq, hw_cq);
+ if (nescq->ibcq.event_handler) {
+ ibevent.device = nescq->ibcq.device;
+ ibevent.event = IB_EVENT_CQ_ERR;
+ ibevent.element.cq = &nescq->ibcq;
+ nescq->ibcq.event_handler(&ibevent, nescq->ibcq.cq_context);
+ }
+ }
+ }
+ break;
+
+ default:
+ nes_debug(NES_DBG_AEQ, "Processing an iWARP related AE for QP, misc = 0x%04X\n",
+ async_event_id);
+ break;
+ }
+
+}
+
+/**
+ * nes_iwarp_ce_handler - acknowledge an iWARP CQ event and notify its consumer
+ */
+void nes_iwarp_ce_handler(struct nes_device *nesdev, struct nes_hw_cq *hw_cq)
+{
+ struct nes_cq *nescq = container_of(hw_cq, struct nes_cq, hw_cq);
+
+ /* nes_debug(NES_DBG_CQ, "Processing completion event for iWARP CQ%u.\n",
+ nescq->hw_cq.cq_number); */
+ nes_write32(nesdev->regs+NES_CQ_ACK, nescq->hw_cq.cq_number);
+
+ if (nescq->ibcq.comp_handler)
+ nescq->ibcq.comp_handler(&nescq->ibcq, nescq->ibcq.cq_context);
+
+ return;
+}
+
+
+/**
+ * nes_manage_apbvt - add or delete an accelerated port (APBVT) entry via the CQP
+ */
+int nes_manage_apbvt(struct nes_vnic *nesvnic, u32 accel_local_port,
+ u32 nic_index, u32 add_port)
+{
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_hw_cqp_wqe *cqp_wqe;
+ struct nes_cqp_request *cqp_request;
+ int ret = 0;
+ u16 major_code;
+
+ /* Send manage APBVT request to CQP */
+ cqp_request = nes_get_cqp_request(nesdev);
+ if (cqp_request == NULL) {
+ nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n");
+ return -ENOMEM;
+ }
+ cqp_request->waiting = 1;
+ cqp_wqe = &cqp_request->cqp_wqe;
+
+ nes_debug(NES_DBG_QP, "%s APBV for local port=%u(0x%04x), nic_index=%u\n",
+ (add_port == NES_MANAGE_APBVT_ADD) ? "ADD" : "DEL",
+ accel_local_port, accel_local_port, nic_index);
+
+ nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, (NES_CQP_MANAGE_APBVT |
+ ((add_port == NES_MANAGE_APBVT_ADD) ? NES_CQP_APBVT_ADD : 0)));
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
+ ((nic_index << NES_CQP_APBVT_NIC_SHIFT) | accel_local_port));
+
+ nes_debug(NES_DBG_QP, "Waiting for CQP completion for APBVT.\n");
+
+ atomic_set(&cqp_request->refcount, 2);
+ nes_post_cqp_request(nesdev, cqp_request);
+
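+ /* Only ADD requests wait for the CQP completion */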
+ if (add_port == NES_MANAGE_APBVT_ADD)
+ ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
+ NES_EVENT_TIMEOUT);
+ nes_debug(NES_DBG_QP, "Completed, ret=%u, CQP Major:Minor codes = 0x%04X:0x%04X\n",
+ ret, cqp_request->major_code, cqp_request->minor_code);
+ major_code = cqp_request->major_code;
+
+ nes_put_cqp_request(nesdev, cqp_request);
+
+ if (!ret)
+ return -ETIME;
+ else if (major_code)
+ return -EIO;
+ else
+ return 0;
+}
+
+
+/**
+ * nes_manage_arp_cache - add or remove a hardware ARP cache entry via the CQP
+ */
+void nes_manage_arp_cache(struct net_device *netdev, unsigned char *mac_addr,
+ u32 ip_addr, u32 action)
+{
+ struct nes_hw_cqp_wqe *cqp_wqe;
+ struct nes_vnic *nesvnic = netdev_priv(netdev);
+ struct nes_device *nesdev;
+ struct nes_cqp_request *cqp_request;
+ int arp_index;
+
+ nesdev = nesvnic->nesdev;
+ arp_index = nes_arp_table(nesdev, ip_addr, mac_addr, action);
+ if (arp_index == -1) {
+ return;
+ }
+
+ /* update the ARP entry */
+ cqp_request = nes_get_cqp_request(nesdev);
+ if (cqp_request == NULL) {
+ nes_debug(NES_DBG_NETDEV, "Failed to get a cqp_request.\n");
+ return;
+ }
+ cqp_request->waiting = 0;
+ cqp_wqe = &cqp_request->cqp_wqe;
+ nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+
+ cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(
+ NES_CQP_MANAGE_ARP_CACHE | NES_CQP_ARP_PERM);
+ cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(
+ (u32)PCI_FUNC(nesdev->pcidev->devfn) << NES_CQP_ARP_AEQ_INDEX_SHIFT);
+ cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(arp_index);
+
+ if (action == NES_ARP_ADD) {
+ cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_ARP_VALID);
+ cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_ADDR_LOW_IDX] = cpu_to_le32(
+ (((u32)mac_addr[2]) << 24) | (((u32)mac_addr[3]) << 16) |
+ (((u32)mac_addr[4]) << 8) | (u32)mac_addr[5]);
+ cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_HIGH_IDX] = cpu_to_le32(
+ (((u32)mac_addr[0]) << 16) | (u32)mac_addr[1]);
+ } else {
+ cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_ADDR_LOW_IDX] = 0;
+ cqp_wqe->wqe_words[NES_CQP_ARP_WQE_MAC_HIGH_IDX] = 0;
+ }
+
+ nes_debug(NES_DBG_NETDEV, "Not waiting for CQP, cqp.sq_head=%u, cqp.sq_tail=%u\n",
+ nesdev->cqp.sq_head, nesdev->cqp.sq_tail);
+
+ atomic_set(&cqp_request->refcount, 1);
+ nes_post_cqp_request(nesdev, cqp_request);
+}
+
+
+/**
+ * flush_wqes - flush a QP's send and/or receive work queues via the CQP
+ */
+void flush_wqes(struct nes_device *nesdev, struct nes_qp *nesqp,
+ u32 which_wq, u32 wait_completion)
+{
+ struct nes_cqp_request *cqp_request;
+ struct nes_hw_cqp_wqe *cqp_wqe;
+ u32 sq_code = (NES_IWARP_CQE_MAJOR_FLUSH << 16) | NES_IWARP_CQE_MINOR_FLUSH;
+ u32 rq_code = (NES_IWARP_CQE_MAJOR_FLUSH << 16) | NES_IWARP_CQE_MINOR_FLUSH;
+ int ret;
+
+ cqp_request = nes_get_cqp_request(nesdev);
+ if (cqp_request == NULL) {
+ nes_debug(NES_DBG_QP, "Failed to get a cqp_request.\n");
+ return;
+ }
+ if (wait_completion) {
+ cqp_request->waiting = 1;
+ atomic_set(&cqp_request->refcount, 2);
+ } else {
+ cqp_request->waiting = 0;
+ }
+ cqp_wqe = &cqp_request->cqp_wqe;
+ nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+
+ /* If wqe in error was identified, set code to be put into cqe */
+ if ((nesqp->term_sq_flush_code) && (which_wq & NES_CQP_FLUSH_SQ)) {
+ which_wq |= NES_CQP_FLUSH_MAJ_MIN;
+ sq_code = (CQE_MAJOR_DRV << 16) | nesqp->term_sq_flush_code;
+ nesqp->term_sq_flush_code = 0;
+ }
+
+ if ((nesqp->term_rq_flush_code) && (which_wq & NES_CQP_FLUSH_RQ)) {
+ which_wq |= NES_CQP_FLUSH_MAJ_MIN;
+ rq_code = (CQE_MAJOR_DRV << 16) | nesqp->term_rq_flush_code;
+ nesqp->term_rq_flush_code = 0;
+ }
+
+ if (which_wq & NES_CQP_FLUSH_MAJ_MIN) {
+ cqp_wqe->wqe_words[NES_CQP_QP_WQE_FLUSH_SQ_CODE] = cpu_to_le32(sq_code);
+ cqp_wqe->wqe_words[NES_CQP_QP_WQE_FLUSH_RQ_CODE] = cpu_to_le32(rq_code);
+ }
+
+ cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] =
+ cpu_to_le32(NES_CQP_FLUSH_WQES | which_wq);
+ cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(nesqp->hwqp.qp_id);
+
+ nes_post_cqp_request(nesdev, cqp_request);
+
+ if (wait_completion) {
+ /* Wait for CQP */
+ ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
+ NES_EVENT_TIMEOUT);
+ nes_debug(NES_DBG_QP, "Flush SQ QP WQEs completed, ret=%u,"
+ " CQP Major:Minor codes = 0x%04X:0x%04X\n",
+ ret, cqp_request->major_code, cqp_request->minor_code);
+ nes_put_cqp_request(nesdev, cqp_request);
+ }
+}
diff --git a/drivers/infiniband/hw/nes/nes_hw.h b/drivers/infiniband/hw/nes/nes_hw.h
new file mode 100644
index 000000000..d748e4b31
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_hw.h
@@ -0,0 +1,1392 @@
+/*
+* Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved.
+*
+* This software is available to you under a choice of one of two
+* licenses. You may choose to be licensed under the terms of the GNU
+* General Public License (GPL) Version 2, available from the file
+* COPYING in the main directory of this source tree, or the
+* OpenIB.org BSD license below:
+*
+* Redistribution and use in source and binary forms, with or
+* without modification, are permitted provided that the following
+* conditions are met:
+*
+* - Redistributions of source code must retain the above
+* copyright notice, this list of conditions and the following
+* disclaimer.
+*
+* - Redistributions in binary form must reproduce the above
+* copyright notice, this list of conditions and the following
+* disclaimer in the documentation and/or other materials
+* provided with the distribution.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+
+#ifndef __NES_HW_H
+#define __NES_HW_H
+
+#include <linux/inet_lro.h>
+
+#define NES_PHY_TYPE_CX4 1
+#define NES_PHY_TYPE_1G 2
+#define NES_PHY_TYPE_ARGUS 4
+#define NES_PHY_TYPE_PUMA_1G 5
+#define NES_PHY_TYPE_PUMA_10G 6
+#define NES_PHY_TYPE_GLADIUS 7
+#define NES_PHY_TYPE_SFP_D 8
+#define NES_PHY_TYPE_KR 9
+
+#define NES_MULTICAST_PF_MAX 8
+#define NES_A0 3
+
+#define NES_ENABLE_PAU 0x07000001
+#define NES_DISABLE_PAU 0x07000000
+#define NES_PAU_COUNTER 10
+#define NES_CQP_OPCODE_MASK 0x3f
+
+enum pci_regs {
+ NES_INT_STAT = 0x0000,
+ NES_INT_MASK = 0x0004,
+ NES_INT_PENDING = 0x0008,
+ NES_INTF_INT_STAT = 0x000C,
+ NES_INTF_INT_MASK = 0x0010,
+ NES_TIMER_STAT = 0x0014,
+ NES_PERIODIC_CONTROL = 0x0018,
+ NES_ONE_SHOT_CONTROL = 0x001C,
+ NES_EEPROM_COMMAND = 0x0020,
+ NES_EEPROM_DATA = 0x0024,
+ NES_FLASH_COMMAND = 0x0028,
+ NES_FLASH_DATA = 0x002C,
+ NES_SOFTWARE_RESET = 0x0030,
+ NES_CQ_ACK = 0x0034,
+ NES_WQE_ALLOC = 0x0040,
+ NES_CQE_ALLOC = 0x0044,
+ NES_AEQ_ALLOC = 0x0048
+};
+
+enum indexed_regs {
+ NES_IDX_CREATE_CQP_LOW = 0x0000,
+ NES_IDX_CREATE_CQP_HIGH = 0x0004,
+ NES_IDX_QP_CONTROL = 0x0040,
+ NES_IDX_FLM_CONTROL = 0x0080,
+ NES_IDX_INT_CPU_STATUS = 0x00a0,
+ NES_IDX_GPR_TRIGGER = 0x00bc,
+ NES_IDX_GPIO_CONTROL = 0x00f0,
+ NES_IDX_GPIO_DATA = 0x00f4,
+ NES_IDX_GPR2 = 0x010c,
+ NES_IDX_TCP_CONFIG0 = 0x01e4,
+ NES_IDX_TCP_TIMER_CONFIG = 0x01ec,
+ NES_IDX_TCP_NOW = 0x01f0,
+ NES_IDX_QP_MAX_CFG_SIZES = 0x0200,
+ NES_IDX_QP_CTX_SIZE = 0x0218,
+ NES_IDX_TCP_TIMER_SIZE0 = 0x0238,
+ NES_IDX_TCP_TIMER_SIZE1 = 0x0240,
+ NES_IDX_ARP_CACHE_SIZE = 0x0258,
+ NES_IDX_CQ_CTX_SIZE = 0x0260,
+ NES_IDX_MRT_SIZE = 0x0278,
+ NES_IDX_PBL_REGION_SIZE = 0x0280,
+ NES_IDX_IRRQ_COUNT = 0x02b0,
+ NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x02f0,
+ NES_IDX_RX_WINDOW_BUFFER_SIZE = 0x0300,
+ NES_IDX_DST_IP_ADDR = 0x0400,
+ NES_IDX_PCIX_DIAG = 0x08e8,
+ NES_IDX_MPP_DEBUG = 0x0a00,
+ NES_IDX_PORT_RX_DISCARDS = 0x0a30,
+ NES_IDX_PORT_TX_DISCARDS = 0x0a34,
+ NES_IDX_MPP_LB_DEBUG = 0x0b00,
+ NES_IDX_DENALI_CTL_22 = 0x1058,
+ NES_IDX_MAC_TX_CONTROL = 0x2000,
+ NES_IDX_MAC_TX_CONFIG = 0x2004,
+ NES_IDX_MAC_TX_PAUSE_QUANTA = 0x2008,
+ NES_IDX_MAC_RX_CONTROL = 0x200c,
+ NES_IDX_MAC_RX_CONFIG = 0x2010,
+ NES_IDX_MAC_EXACT_MATCH_BOTTOM = 0x201c,
+ NES_IDX_MAC_MDIO_CONTROL = 0x2084,
+ NES_IDX_MAC_TX_OCTETS_LOW = 0x2100,
+ NES_IDX_MAC_TX_OCTETS_HIGH = 0x2104,
+ NES_IDX_MAC_TX_FRAMES_LOW = 0x2108,
+ NES_IDX_MAC_TX_FRAMES_HIGH = 0x210c,
+ NES_IDX_MAC_TX_PAUSE_FRAMES = 0x2118,
+ NES_IDX_MAC_TX_ERRORS = 0x2138,
+ NES_IDX_MAC_RX_OCTETS_LOW = 0x213c,
+ NES_IDX_MAC_RX_OCTETS_HIGH = 0x2140,
+ NES_IDX_MAC_RX_FRAMES_LOW = 0x2144,
+ NES_IDX_MAC_RX_FRAMES_HIGH = 0x2148,
+ NES_IDX_MAC_RX_BC_FRAMES_LOW = 0x214c,
+ NES_IDX_MAC_RX_MC_FRAMES_HIGH = 0x2150,
+ NES_IDX_MAC_RX_PAUSE_FRAMES = 0x2154,
+ NES_IDX_MAC_RX_SHORT_FRAMES = 0x2174,
+ NES_IDX_MAC_RX_OVERSIZED_FRAMES = 0x2178,
+ NES_IDX_MAC_RX_JABBER_FRAMES = 0x217c,
+ NES_IDX_MAC_RX_CRC_ERR_FRAMES = 0x2180,
+ NES_IDX_MAC_RX_LENGTH_ERR_FRAMES = 0x2184,
+ NES_IDX_MAC_RX_SYMBOL_ERR_FRAMES = 0x2188,
+ NES_IDX_MAC_INT_STATUS = 0x21f0,
+ NES_IDX_MAC_INT_MASK = 0x21f4,
+ NES_IDX_PHY_PCS_CONTROL_STATUS0 = 0x2800,
+ NES_IDX_PHY_PCS_CONTROL_STATUS1 = 0x2a00,
+ NES_IDX_ETH_SERDES_COMMON_CONTROL0 = 0x2808,
+ NES_IDX_ETH_SERDES_COMMON_CONTROL1 = 0x2a08,
+ NES_IDX_ETH_SERDES_COMMON_STATUS0 = 0x280c,
+ NES_IDX_ETH_SERDES_COMMON_STATUS1 = 0x2a0c,
+ NES_IDX_ETH_SERDES_TX_EMP0 = 0x2810,
+ NES_IDX_ETH_SERDES_TX_EMP1 = 0x2a10,
+ NES_IDX_ETH_SERDES_TX_DRIVE0 = 0x2814,
+ NES_IDX_ETH_SERDES_TX_DRIVE1 = 0x2a14,
+ NES_IDX_ETH_SERDES_RX_MODE0 = 0x2818,
+ NES_IDX_ETH_SERDES_RX_MODE1 = 0x2a18,
+ NES_IDX_ETH_SERDES_RX_SIGDET0 = 0x281c,
+ NES_IDX_ETH_SERDES_RX_SIGDET1 = 0x2a1c,
+ NES_IDX_ETH_SERDES_BYPASS0 = 0x2820,
+ NES_IDX_ETH_SERDES_BYPASS1 = 0x2a20,
+ NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0 = 0x2824,
+ NES_IDX_ETH_SERDES_LOOPBACK_CONTROL1 = 0x2a24,
+ NES_IDX_ETH_SERDES_RX_EQ_CONTROL0 = 0x2828,
+ NES_IDX_ETH_SERDES_RX_EQ_CONTROL1 = 0x2a28,
+ NES_IDX_ETH_SERDES_RX_EQ_STATUS0 = 0x282c,
+ NES_IDX_ETH_SERDES_RX_EQ_STATUS1 = 0x2a2c,
+ NES_IDX_ETH_SERDES_CDR_RESET0 = 0x2830,
+ NES_IDX_ETH_SERDES_CDR_RESET1 = 0x2a30,
+ NES_IDX_ETH_SERDES_CDR_CONTROL0 = 0x2834,
+ NES_IDX_ETH_SERDES_CDR_CONTROL1 = 0x2a34,
+ NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE0 = 0x2838,
+ NES_IDX_ETH_SERDES_TX_HIGHZ_LANE_MODE1 = 0x2a38,
+ NES_IDX_ENDNODE0_NSTAT_RX_DISCARD = 0x3080,
+ NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO = 0x3000,
+ NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI = 0x3004,
+ NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO = 0x3008,
+ NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI = 0x300c,
+ NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO = 0x7000,
+ NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI = 0x7004,
+ NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO = 0x7008,
+ NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI = 0x700c,
+ NES_IDX_WQM_CONFIG0 = 0x5000,
+ NES_IDX_WQM_CONFIG1 = 0x5004,
+ NES_IDX_CM_CONFIG = 0x5100,
+ NES_IDX_NIC_LOGPORT_TO_PHYPORT = 0x6000,
+ NES_IDX_NIC_PHYPORT_TO_USW = 0x6008,
+ NES_IDX_NIC_ACTIVE = 0x6010,
+ NES_IDX_NIC_UNICAST_ALL = 0x6018,
+ NES_IDX_NIC_MULTICAST_ALL = 0x6020,
+ NES_IDX_NIC_MULTICAST_ENABLE = 0x6028,
+ NES_IDX_NIC_BROADCAST_ON = 0x6030,
+ NES_IDX_USED_CHUNKS_TX = 0x60b0,
+ NES_IDX_TX_POOL_SIZE = 0x60b8,
+ NES_IDX_QUAD_HASH_TABLE_SIZE = 0x6148,
+ NES_IDX_PERFECT_FILTER_LOW = 0x6200,
+ NES_IDX_PERFECT_FILTER_HIGH = 0x6204,
+ NES_IDX_IPV4_TCP_REXMITS = 0x7080,
+ NES_IDX_DEBUG_ERROR_CONTROL_STATUS = 0x913c,
+ NES_IDX_DEBUG_ERROR_MASKS0 = 0x9140,
+ NES_IDX_DEBUG_ERROR_MASKS1 = 0x9144,
+ NES_IDX_DEBUG_ERROR_MASKS2 = 0x9148,
+ NES_IDX_DEBUG_ERROR_MASKS3 = 0x914c,
+ NES_IDX_DEBUG_ERROR_MASKS4 = 0x9150,
+ NES_IDX_DEBUG_ERROR_MASKS5 = 0x9154,
+};
+
+#define NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE 1
+#define NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE (1 << 17)
+
+enum nes_cqp_opcodes {
+ NES_CQP_CREATE_QP = 0x00,
+ NES_CQP_MODIFY_QP = 0x01,
+ NES_CQP_DESTROY_QP = 0x02,
+ NES_CQP_CREATE_CQ = 0x03,
+ NES_CQP_MODIFY_CQ = 0x04,
+ NES_CQP_DESTROY_CQ = 0x05,
+ NES_CQP_ALLOCATE_STAG = 0x09,
+ NES_CQP_REGISTER_STAG = 0x0a,
+ NES_CQP_QUERY_STAG = 0x0b,
+ NES_CQP_REGISTER_SHARED_STAG = 0x0c,
+ NES_CQP_DEALLOCATE_STAG = 0x0d,
+ NES_CQP_MANAGE_ARP_CACHE = 0x0f,
+ NES_CQP_DOWNLOAD_SEGMENT = 0x10,
+ NES_CQP_SUSPEND_QPS = 0x11,
+ NES_CQP_UPLOAD_CONTEXT = 0x13,
+ NES_CQP_CREATE_CEQ = 0x16,
+ NES_CQP_DESTROY_CEQ = 0x18,
+ NES_CQP_CREATE_AEQ = 0x19,
+ NES_CQP_DESTROY_AEQ = 0x1b,
+ NES_CQP_LMI_ACCESS = 0x20,
+ NES_CQP_FLUSH_WQES = 0x22,
+ NES_CQP_MANAGE_APBVT = 0x23,
+ NES_CQP_MANAGE_QUAD_HASH = 0x25
+};
+
+enum nes_cqp_wqe_word_idx {
+ NES_CQP_WQE_OPCODE_IDX = 0,
+ NES_CQP_WQE_ID_IDX = 1,
+ NES_CQP_WQE_COMP_CTX_LOW_IDX = 2,
+ NES_CQP_WQE_COMP_CTX_HIGH_IDX = 3,
+ NES_CQP_WQE_COMP_SCRATCH_LOW_IDX = 4,
+ NES_CQP_WQE_COMP_SCRATCH_HIGH_IDX = 5,
+};
+
+enum nes_cqp_wqe_word_download_idx { /* format differs from other cqp ops */
+ NES_CQP_WQE_DL_OPCODE_IDX = 0,
+ NES_CQP_WQE_DL_COMP_CTX_LOW_IDX = 1,
+ NES_CQP_WQE_DL_COMP_CTX_HIGH_IDX = 2,
+ NES_CQP_WQE_DL_LENGTH_0_TOTAL_IDX = 3
+ /* For index values 4-15 use NES_NIC_SQ_WQE_ values */
+};
+
+enum nes_cqp_cq_wqeword_idx {
+ NES_CQP_CQ_WQE_PBL_LOW_IDX = 6,
+ NES_CQP_CQ_WQE_PBL_HIGH_IDX = 7,
+ NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX = 8,
+ NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX = 9,
+ NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX = 10,
+};
+
+enum nes_cqp_stag_wqeword_idx {
+ NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX = 1,
+ NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX = 6,
+ NES_CQP_STAG_WQE_LEN_LOW_IDX = 7,
+ NES_CQP_STAG_WQE_STAG_IDX = 8,
+ NES_CQP_STAG_WQE_VA_LOW_IDX = 10,
+ NES_CQP_STAG_WQE_VA_HIGH_IDX = 11,
+ NES_CQP_STAG_WQE_PA_LOW_IDX = 12,
+ NES_CQP_STAG_WQE_PA_HIGH_IDX = 13,
+ NES_CQP_STAG_WQE_PBL_LEN_IDX = 14
+};
+
+#define NES_CQP_OP_LOGICAL_PORT_SHIFT 26
+#define NES_CQP_OP_IWARP_STATE_SHIFT 28
+#define NES_CQP_OP_TERMLEN_SHIFT 28
+
+enum nes_cqp_qp_bits {
+ NES_CQP_QP_ARP_VALID = (1<<8),
+ NES_CQP_QP_WINBUF_VALID = (1<<9),
+ NES_CQP_QP_CONTEXT_VALID = (1<<10),
+ NES_CQP_QP_ORD_VALID = (1<<11),
+ NES_CQP_QP_WINBUF_DATAIND_EN = (1<<12),
+ NES_CQP_QP_VIRT_WQS = (1<<13),
+ NES_CQP_QP_DEL_HTE = (1<<14),
+ NES_CQP_QP_CQS_VALID = (1<<15),
+ NES_CQP_QP_TYPE_TSA = 0,
+ NES_CQP_QP_TYPE_IWARP = (1<<16),
+ NES_CQP_QP_TYPE_CQP = (4<<16),
+ NES_CQP_QP_TYPE_NIC = (5<<16),
+ NES_CQP_QP_MSS_CHG = (1<<20),
+ NES_CQP_QP_STATIC_RESOURCES = (1<<21),
+ NES_CQP_QP_IGNORE_MW_BOUND = (1<<22),
+ NES_CQP_QP_VWQ_USE_LMI = (1<<23),
+ NES_CQP_QP_IWARP_STATE_IDLE = (1<<NES_CQP_OP_IWARP_STATE_SHIFT),
+ NES_CQP_QP_IWARP_STATE_RTS = (2<<NES_CQP_OP_IWARP_STATE_SHIFT),
+ NES_CQP_QP_IWARP_STATE_CLOSING = (3<<NES_CQP_OP_IWARP_STATE_SHIFT),
+ NES_CQP_QP_IWARP_STATE_TERMINATE = (5<<NES_CQP_OP_IWARP_STATE_SHIFT),
+ NES_CQP_QP_IWARP_STATE_ERROR = (6<<NES_CQP_OP_IWARP_STATE_SHIFT),
+ NES_CQP_QP_IWARP_STATE_MASK = (7<<NES_CQP_OP_IWARP_STATE_SHIFT),
+ NES_CQP_QP_TERM_DONT_SEND_FIN = (1<<24),
+ NES_CQP_QP_TERM_DONT_SEND_TERM_MSG = (1<<25),
+ NES_CQP_QP_RESET = (1<<31),
+};
+
+enum nes_cqp_qp_wqe_word_idx {
+ NES_CQP_QP_WQE_CONTEXT_LOW_IDX = 6,
+ NES_CQP_QP_WQE_CONTEXT_HIGH_IDX = 7,
+ NES_CQP_QP_WQE_FLUSH_SQ_CODE = 8,
+ NES_CQP_QP_WQE_FLUSH_RQ_CODE = 9,
+ NES_CQP_QP_WQE_NEW_MSS_IDX = 15,
+};
+
+enum nes_nic_ctx_bits {
+ NES_NIC_CTX_RQ_SIZE_32 = (3<<8),
+ NES_NIC_CTX_RQ_SIZE_512 = (3<<8),
+ NES_NIC_CTX_SQ_SIZE_32 = (1<<10),
+ NES_NIC_CTX_SQ_SIZE_512 = (3<<10),
+};
+
+enum nes_nic_qp_ctx_word_idx {
+ NES_NIC_CTX_MISC_IDX = 0,
+ NES_NIC_CTX_SQ_LOW_IDX = 2,
+ NES_NIC_CTX_SQ_HIGH_IDX = 3,
+ NES_NIC_CTX_RQ_LOW_IDX = 4,
+ NES_NIC_CTX_RQ_HIGH_IDX = 5,
+};
+
+enum nes_cqp_cq_bits {
+ NES_CQP_CQ_CEQE_MASK = (1<<9),
+ NES_CQP_CQ_CEQ_VALID = (1<<10),
+ NES_CQP_CQ_RESIZE = (1<<11),
+ NES_CQP_CQ_CHK_OVERFLOW = (1<<12),
+ NES_CQP_CQ_4KB_CHUNK = (1<<14),
+ NES_CQP_CQ_VIRT = (1<<15),
+};
+
+enum nes_cqp_stag_bits {
+ NES_CQP_STAG_VA_TO = (1<<9),
+ NES_CQP_STAG_DEALLOC_PBLS = (1<<10),
+ NES_CQP_STAG_PBL_BLK_SIZE = (1<<11),
+ NES_CQP_STAG_MR = (1<<13),
+ NES_CQP_STAG_RIGHTS_LOCAL_READ = (1<<16),
+ NES_CQP_STAG_RIGHTS_LOCAL_WRITE = (1<<17),
+ NES_CQP_STAG_RIGHTS_REMOTE_READ = (1<<18),
+ NES_CQP_STAG_RIGHTS_REMOTE_WRITE = (1<<19),
+ NES_CQP_STAG_RIGHTS_WINDOW_BIND = (1<<20),
+ NES_CQP_STAG_REM_ACC_EN = (1<<21),
+ NES_CQP_STAG_LEAVE_PENDING = (1<<31),
+};
+
+enum nes_cqp_ceq_wqeword_idx {
+ NES_CQP_CEQ_WQE_ELEMENT_COUNT_IDX = 1,
+ NES_CQP_CEQ_WQE_PBL_LOW_IDX = 6,
+ NES_CQP_CEQ_WQE_PBL_HIGH_IDX = 7,
+};
+
+enum nes_cqp_ceq_bits {
+ NES_CQP_CEQ_4KB_CHUNK = (1<<14),
+ NES_CQP_CEQ_VIRT = (1<<15),
+};
+
+enum nes_cqp_aeq_wqeword_idx {
+ NES_CQP_AEQ_WQE_ELEMENT_COUNT_IDX = 1,
+ NES_CQP_AEQ_WQE_PBL_LOW_IDX = 6,
+ NES_CQP_AEQ_WQE_PBL_HIGH_IDX = 7,
+};
+
+enum nes_cqp_aeq_bits {
+ NES_CQP_AEQ_4KB_CHUNK = (1<<14),
+ NES_CQP_AEQ_VIRT = (1<<15),
+};
+
+enum nes_cqp_lmi_wqeword_idx {
+ NES_CQP_LMI_WQE_LMI_OFFSET_IDX = 1,
+ NES_CQP_LMI_WQE_FRAG_LOW_IDX = 8,
+ NES_CQP_LMI_WQE_FRAG_HIGH_IDX = 9,
+ NES_CQP_LMI_WQE_FRAG_LEN_IDX = 10,
+};
+
+enum nes_cqp_arp_wqeword_idx {
+ NES_CQP_ARP_WQE_MAC_ADDR_LOW_IDX = 6,
+ NES_CQP_ARP_WQE_MAC_HIGH_IDX = 7,
+ NES_CQP_ARP_WQE_REACHABILITY_MAX_IDX = 1,
+};
+
+enum nes_cqp_upload_wqeword_idx {
+ NES_CQP_UPLOAD_WQE_CTXT_LOW_IDX = 6,
+ NES_CQP_UPLOAD_WQE_CTXT_HIGH_IDX = 7,
+ NES_CQP_UPLOAD_WQE_HTE_IDX = 8,
+};
+
+enum nes_cqp_arp_bits {
+ NES_CQP_ARP_VALID = (1<<8),
+ NES_CQP_ARP_PERM = (1<<9),
+};
+
+enum nes_cqp_flush_bits {
+ NES_CQP_FLUSH_SQ = (1<<30),
+ NES_CQP_FLUSH_RQ = (1<<31),
+ NES_CQP_FLUSH_MAJ_MIN = (1<<28),
+};
+
+enum nes_cqe_opcode_bits {
+ NES_CQE_STAG_VALID = (1<<6),
+ NES_CQE_ERROR = (1<<7),
+ NES_CQE_SQ = (1<<8),
+ NES_CQE_SE = (1<<9),
+ NES_CQE_PSH = (1<<29),
+ NES_CQE_FIN = (1<<30),
+ NES_CQE_VALID = (1<<31),
+};
+
+
+enum nes_cqe_word_idx {
+ NES_CQE_PAYLOAD_LENGTH_IDX = 0,
+ NES_CQE_COMP_COMP_CTX_LOW_IDX = 2,
+ NES_CQE_COMP_COMP_CTX_HIGH_IDX = 3,
+ NES_CQE_INV_STAG_IDX = 4,
+ NES_CQE_QP_ID_IDX = 5,
+ NES_CQE_ERROR_CODE_IDX = 6,
+ NES_CQE_OPCODE_IDX = 7,
+};
+
+enum nes_ceqe_word_idx {
+ NES_CEQE_CQ_CTX_LOW_IDX = 0,
+ NES_CEQE_CQ_CTX_HIGH_IDX = 1,
+};
+
+enum nes_ceqe_status_bit {
+ NES_CEQE_VALID = (1<<31),
+};
+
+enum nes_int_bits {
+ NES_INT_CEQ0 = (1<<0),
+ NES_INT_CEQ1 = (1<<1),
+ NES_INT_CEQ2 = (1<<2),
+ NES_INT_CEQ3 = (1<<3),
+ NES_INT_CEQ4 = (1<<4),
+ NES_INT_CEQ5 = (1<<5),
+ NES_INT_CEQ6 = (1<<6),
+ NES_INT_CEQ7 = (1<<7),
+ NES_INT_CEQ8 = (1<<8),
+ NES_INT_CEQ9 = (1<<9),
+ NES_INT_CEQ10 = (1<<10),
+ NES_INT_CEQ11 = (1<<11),
+ NES_INT_CEQ12 = (1<<12),
+ NES_INT_CEQ13 = (1<<13),
+ NES_INT_CEQ14 = (1<<14),
+ NES_INT_CEQ15 = (1<<15),
+ NES_INT_AEQ0 = (1<<16),
+ NES_INT_AEQ1 = (1<<17),
+ NES_INT_AEQ2 = (1<<18),
+ NES_INT_AEQ3 = (1<<19),
+ NES_INT_AEQ4 = (1<<20),
+ NES_INT_AEQ5 = (1<<21),
+ NES_INT_AEQ6 = (1<<22),
+ NES_INT_AEQ7 = (1<<23),
+ NES_INT_MAC0 = (1<<24),
+ NES_INT_MAC1 = (1<<25),
+ NES_INT_MAC2 = (1<<26),
+ NES_INT_MAC3 = (1<<27),
+ NES_INT_TSW = (1<<28),
+ NES_INT_TIMER = (1<<29),
+ NES_INT_INTF = (1<<30),
+};
+
+enum nes_intf_int_bits {
+ NES_INTF_INT_PCIERR = (1<<0),
+ NES_INTF_PERIODIC_TIMER = (1<<2),
+ NES_INTF_ONE_SHOT_TIMER = (1<<3),
+ NES_INTF_INT_CRITERR = (1<<14),
+ NES_INTF_INT_AEQ0_OFLOW = (1<<16),
+ NES_INTF_INT_AEQ1_OFLOW = (1<<17),
+ NES_INTF_INT_AEQ2_OFLOW = (1<<18),
+ NES_INTF_INT_AEQ3_OFLOW = (1<<19),
+ NES_INTF_INT_AEQ4_OFLOW = (1<<20),
+ NES_INTF_INT_AEQ5_OFLOW = (1<<21),
+ NES_INTF_INT_AEQ6_OFLOW = (1<<22),
+ NES_INTF_INT_AEQ7_OFLOW = (1<<23),
+ NES_INTF_INT_AEQ_OFLOW = (0xff<<16),
+};
+
+enum nes_mac_int_bits {
+ NES_MAC_INT_LINK_STAT_CHG = (1<<1),
+ NES_MAC_INT_XGMII_EXT = (1<<2),
+ NES_MAC_INT_TX_UNDERFLOW = (1<<6),
+ NES_MAC_INT_TX_ERROR = (1<<7),
+};
+
+enum nes_cqe_allocate_bits {
+ NES_CQE_ALLOC_INC_SELECT = (1<<28),
+ NES_CQE_ALLOC_NOTIFY_NEXT = (1<<29),
+ NES_CQE_ALLOC_NOTIFY_SE = (1<<30),
+ NES_CQE_ALLOC_RESET = (1<<31),
+};
+
+enum nes_nic_rq_wqe_word_idx {
+ NES_NIC_RQ_WQE_LENGTH_1_0_IDX = 0,
+ NES_NIC_RQ_WQE_LENGTH_3_2_IDX = 1,
+ NES_NIC_RQ_WQE_FRAG0_LOW_IDX = 2,
+ NES_NIC_RQ_WQE_FRAG0_HIGH_IDX = 3,
+ NES_NIC_RQ_WQE_FRAG1_LOW_IDX = 4,
+ NES_NIC_RQ_WQE_FRAG1_HIGH_IDX = 5,
+ NES_NIC_RQ_WQE_FRAG2_LOW_IDX = 6,
+ NES_NIC_RQ_WQE_FRAG2_HIGH_IDX = 7,
+ NES_NIC_RQ_WQE_FRAG3_LOW_IDX = 8,
+ NES_NIC_RQ_WQE_FRAG3_HIGH_IDX = 9,
+};
+
+enum nes_nic_sq_wqe_word_idx {
+ NES_NIC_SQ_WQE_MISC_IDX = 0,
+ NES_NIC_SQ_WQE_TOTAL_LENGTH_IDX = 1,
+ NES_NIC_SQ_WQE_LSO_INFO_IDX = 2,
+ NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX = 3,
+ NES_NIC_SQ_WQE_LENGTH_2_1_IDX = 4,
+ NES_NIC_SQ_WQE_LENGTH_4_3_IDX = 5,
+ NES_NIC_SQ_WQE_FRAG0_LOW_IDX = 6,
+ NES_NIC_SQ_WQE_FRAG0_HIGH_IDX = 7,
+ NES_NIC_SQ_WQE_FRAG1_LOW_IDX = 8,
+ NES_NIC_SQ_WQE_FRAG1_HIGH_IDX = 9,
+ NES_NIC_SQ_WQE_FRAG2_LOW_IDX = 10,
+ NES_NIC_SQ_WQE_FRAG2_HIGH_IDX = 11,
+ NES_NIC_SQ_WQE_FRAG3_LOW_IDX = 12,
+ NES_NIC_SQ_WQE_FRAG3_HIGH_IDX = 13,
+ NES_NIC_SQ_WQE_FRAG4_LOW_IDX = 14,
+ NES_NIC_SQ_WQE_FRAG4_HIGH_IDX = 15,
+};
+
+enum nes_iwarp_sq_wqe_word_idx {
+ NES_IWARP_SQ_WQE_MISC_IDX = 0,
+ NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX = 1,
+ NES_IWARP_SQ_WQE_COMP_CTX_LOW_IDX = 2,
+ NES_IWARP_SQ_WQE_COMP_CTX_HIGH_IDX = 3,
+ NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX = 4,
+ NES_IWARP_SQ_WQE_COMP_SCRATCH_HIGH_IDX = 5,
+ NES_IWARP_SQ_WQE_INV_STAG_LOW_IDX = 7,
+ NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX = 8,
+ NES_IWARP_SQ_WQE_RDMA_TO_HIGH_IDX = 9,
+ NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX = 10,
+ NES_IWARP_SQ_WQE_RDMA_STAG_IDX = 11,
+ NES_IWARP_SQ_WQE_IMM_DATA_START_IDX = 12,
+ NES_IWARP_SQ_WQE_FRAG0_LOW_IDX = 16,
+ NES_IWARP_SQ_WQE_FRAG0_HIGH_IDX = 17,
+ NES_IWARP_SQ_WQE_LENGTH0_IDX = 18,
+ NES_IWARP_SQ_WQE_STAG0_IDX = 19,
+ NES_IWARP_SQ_WQE_FRAG1_LOW_IDX = 20,
+ NES_IWARP_SQ_WQE_FRAG1_HIGH_IDX = 21,
+ NES_IWARP_SQ_WQE_LENGTH1_IDX = 22,
+ NES_IWARP_SQ_WQE_STAG1_IDX = 23,
+ NES_IWARP_SQ_WQE_FRAG2_LOW_IDX = 24,
+ NES_IWARP_SQ_WQE_FRAG2_HIGH_IDX = 25,
+ NES_IWARP_SQ_WQE_LENGTH2_IDX = 26,
+ NES_IWARP_SQ_WQE_STAG2_IDX = 27,
+ NES_IWARP_SQ_WQE_FRAG3_LOW_IDX = 28,
+ NES_IWARP_SQ_WQE_FRAG3_HIGH_IDX = 29,
+ NES_IWARP_SQ_WQE_LENGTH3_IDX = 30,
+ NES_IWARP_SQ_WQE_STAG3_IDX = 31,
+};
+
+enum nes_iwarp_sq_bind_wqe_word_idx {
+ NES_IWARP_SQ_BIND_WQE_MR_IDX = 6,
+ NES_IWARP_SQ_BIND_WQE_MW_IDX = 7,
+ NES_IWARP_SQ_BIND_WQE_LENGTH_LOW_IDX = 8,
+ NES_IWARP_SQ_BIND_WQE_LENGTH_HIGH_IDX = 9,
+ NES_IWARP_SQ_BIND_WQE_VA_FBO_LOW_IDX = 10,
+ NES_IWARP_SQ_BIND_WQE_VA_FBO_HIGH_IDX = 11,
+};
+
+enum nes_iwarp_sq_fmr_wqe_word_idx {
+ NES_IWARP_SQ_FMR_WQE_MR_STAG_IDX = 7,
+ NES_IWARP_SQ_FMR_WQE_LENGTH_LOW_IDX = 8,
+ NES_IWARP_SQ_FMR_WQE_LENGTH_HIGH_IDX = 9,
+ NES_IWARP_SQ_FMR_WQE_VA_FBO_LOW_IDX = 10,
+ NES_IWARP_SQ_FMR_WQE_VA_FBO_HIGH_IDX = 11,
+ NES_IWARP_SQ_FMR_WQE_PBL_ADDR_LOW_IDX = 12,
+ NES_IWARP_SQ_FMR_WQE_PBL_ADDR_HIGH_IDX = 13,
+ NES_IWARP_SQ_FMR_WQE_PBL_LENGTH_IDX = 14,
+};
+
+enum nes_iwarp_sq_fmr_opcodes {
+ NES_IWARP_SQ_FMR_WQE_ZERO_BASED = (1<<6),
+ NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_4K = (0<<7),
+ NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_2M = (1<<7),
+ NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_READ = (1<<16),
+ NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_WRITE = (1<<17),
+ NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_READ = (1<<18),
+ NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_WRITE = (1<<19),
+ NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_WINDOW_BIND = (1<<20),
+};
+
+#define NES_IWARP_SQ_FMR_WQE_MR_LENGTH_HIGH_MASK 0xFF
+
+enum nes_iwarp_sq_locinv_wqe_word_idx {
+ NES_IWARP_SQ_LOCINV_WQE_INV_STAG_IDX = 6,
+};
+
+enum nes_iwarp_rq_wqe_word_idx {
+ NES_IWARP_RQ_WQE_TOTAL_PAYLOAD_IDX = 1,
+ NES_IWARP_RQ_WQE_COMP_CTX_LOW_IDX = 2,
+ NES_IWARP_RQ_WQE_COMP_CTX_HIGH_IDX = 3,
+ NES_IWARP_RQ_WQE_COMP_SCRATCH_LOW_IDX = 4,
+ NES_IWARP_RQ_WQE_COMP_SCRATCH_HIGH_IDX = 5,
+ NES_IWARP_RQ_WQE_FRAG0_LOW_IDX = 8,
+ NES_IWARP_RQ_WQE_FRAG0_HIGH_IDX = 9,
+ NES_IWARP_RQ_WQE_LENGTH0_IDX = 10,
+ NES_IWARP_RQ_WQE_STAG0_IDX = 11,
+ NES_IWARP_RQ_WQE_FRAG1_LOW_IDX = 12,
+ NES_IWARP_RQ_WQE_FRAG1_HIGH_IDX = 13,
+ NES_IWARP_RQ_WQE_LENGTH1_IDX = 14,
+ NES_IWARP_RQ_WQE_STAG1_IDX = 15,
+ NES_IWARP_RQ_WQE_FRAG2_LOW_IDX = 16,
+ NES_IWARP_RQ_WQE_FRAG2_HIGH_IDX = 17,
+ NES_IWARP_RQ_WQE_LENGTH2_IDX = 18,
+ NES_IWARP_RQ_WQE_STAG2_IDX = 19,
+ NES_IWARP_RQ_WQE_FRAG3_LOW_IDX = 20,
+ NES_IWARP_RQ_WQE_FRAG3_HIGH_IDX = 21,
+ NES_IWARP_RQ_WQE_LENGTH3_IDX = 22,
+ NES_IWARP_RQ_WQE_STAG3_IDX = 23,
+};
+
+enum nes_nic_sq_wqe_bits {
+ NES_NIC_SQ_WQE_PHDR_CS_READY = (1<<21),
+ NES_NIC_SQ_WQE_LSO_ENABLE = (1<<22),
+ NES_NIC_SQ_WQE_TAGVALUE_ENABLE = (1<<23),
+ NES_NIC_SQ_WQE_DISABLE_CHKSUM = (1<<30),
+ NES_NIC_SQ_WQE_COMPLETION = (1<<31),
+};
+
+enum nes_nic_cqe_word_idx {
+ NES_NIC_CQE_ACCQP_ID_IDX = 0,
+ NES_NIC_CQE_HASH_RCVNXT = 1,
+ NES_NIC_CQE_TAG_PKT_TYPE_IDX = 2,
+ NES_NIC_CQE_MISC_IDX = 3,
+};
+
+#define NES_PKT_TYPE_APBVT_BITS 0xC112
+#define NES_PKT_TYPE_APBVT_MASK 0xff3e
+
+#define NES_PKT_TYPE_PVALID_BITS 0x10000000
+#define NES_PKT_TYPE_PVALID_MASK 0x30000000
+
+#define NES_PKT_TYPE_TCPV4_BITS 0x0110
+#define NES_PKT_TYPE_TCPV4_MASK 0x3f30
+
+#define NES_PKT_TYPE_UDPV4_BITS 0x0210
+#define NES_PKT_TYPE_UDPV4_MASK 0x3f30
+
+#define NES_PKT_TYPE_IPV4_BITS 0x0010
+#define NES_PKT_TYPE_IPV4_MASK 0x3f30
+
+#define NES_PKT_TYPE_OTHER_BITS 0x0000
+#define NES_PKT_TYPE_OTHER_MASK 0x0030
+
+#define NES_NIC_CQE_ERRV_SHIFT 16
+enum nes_nic_ev_bits {
+ NES_NIC_ERRV_BITS_MODE = (1<<0),
+ NES_NIC_ERRV_BITS_IPV4_CSUM_ERR = (1<<1),
+ NES_NIC_ERRV_BITS_TCPUDP_CSUM_ERR = (1<<2),
+ NES_NIC_ERRV_BITS_WQE_OVERRUN = (1<<3),
+ NES_NIC_ERRV_BITS_IPH_ERR = (1<<4),
+};
+
+enum nes_nic_cqe_bits {
+ NES_NIC_CQE_ERRV_MASK = (0xff<<NES_NIC_CQE_ERRV_SHIFT),
+ NES_NIC_CQE_SQ = (1<<24),
+ NES_NIC_CQE_ACCQP_PORT = (1<<28),
+ NES_NIC_CQE_ACCQP_VALID = (1<<29),
+ NES_NIC_CQE_TAG_VALID = (1<<30),
+ NES_NIC_CQE_VALID = (1<<31),
+};
+
+enum nes_aeqe_word_idx {
+ NES_AEQE_COMP_CTXT_LOW_IDX = 0,
+ NES_AEQE_COMP_CTXT_HIGH_IDX = 1,
+ NES_AEQE_COMP_QP_CQ_ID_IDX = 2,
+ NES_AEQE_MISC_IDX = 3,
+};
+
+enum nes_aeqe_bits {
+ NES_AEQE_QP = (1<<16),
+ NES_AEQE_CQ = (1<<17),
+ NES_AEQE_SQ = (1<<18),
+ NES_AEQE_INBOUND_RDMA = (1<<19),
+ NES_AEQE_IWARP_STATE_MASK = (7<<20),
+ NES_AEQE_TCP_STATE_MASK = (0xf<<24),
+ NES_AEQE_Q2_DATA_WRITTEN = (0x3<<28),
+ NES_AEQE_VALID = (1<<31),
+};
+
+#define NES_AEQE_IWARP_STATE_SHIFT 20
+#define NES_AEQE_TCP_STATE_SHIFT 24
+#define NES_AEQE_Q2_DATA_ETHERNET (1<<28)
+#define NES_AEQE_Q2_DATA_MPA (1<<29)
+
+enum nes_aeqe_iwarp_state {
+ NES_AEQE_IWARP_STATE_NON_EXISTANT = 0,
+ NES_AEQE_IWARP_STATE_IDLE = 1,
+ NES_AEQE_IWARP_STATE_RTS = 2,
+ NES_AEQE_IWARP_STATE_CLOSING = 3,
+ NES_AEQE_IWARP_STATE_TERMINATE = 5,
+ NES_AEQE_IWARP_STATE_ERROR = 6
+};
+
+enum nes_aeqe_tcp_state {
+ NES_AEQE_TCP_STATE_NON_EXISTANT = 0,
+ NES_AEQE_TCP_STATE_CLOSED = 1,
+ NES_AEQE_TCP_STATE_LISTEN = 2,
+ NES_AEQE_TCP_STATE_SYN_SENT = 3,
+ NES_AEQE_TCP_STATE_SYN_RCVD = 4,
+ NES_AEQE_TCP_STATE_ESTABLISHED = 5,
+ NES_AEQE_TCP_STATE_CLOSE_WAIT = 6,
+ NES_AEQE_TCP_STATE_FIN_WAIT_1 = 7,
+ NES_AEQE_TCP_STATE_CLOSING = 8,
+ NES_AEQE_TCP_STATE_LAST_ACK = 9,
+ NES_AEQE_TCP_STATE_FIN_WAIT_2 = 10,
+ NES_AEQE_TCP_STATE_TIME_WAIT = 11
+};
+
+enum nes_aeqe_aeid {
+ NES_AEQE_AEID_AMP_UNALLOCATED_STAG = 0x0102,
+ NES_AEQE_AEID_AMP_INVALID_STAG = 0x0103,
+ NES_AEQE_AEID_AMP_BAD_QP = 0x0104,
+ NES_AEQE_AEID_AMP_BAD_PD = 0x0105,
+ NES_AEQE_AEID_AMP_BAD_STAG_KEY = 0x0106,
+ NES_AEQE_AEID_AMP_BAD_STAG_INDEX = 0x0107,
+ NES_AEQE_AEID_AMP_BOUNDS_VIOLATION = 0x0108,
+ NES_AEQE_AEID_AMP_RIGHTS_VIOLATION = 0x0109,
+ NES_AEQE_AEID_AMP_TO_WRAP = 0x010a,
+ NES_AEQE_AEID_AMP_FASTREG_SHARED = 0x010b,
+ NES_AEQE_AEID_AMP_FASTREG_VALID_STAG = 0x010c,
+ NES_AEQE_AEID_AMP_FASTREG_MW_STAG = 0x010d,
+ NES_AEQE_AEID_AMP_FASTREG_INVALID_RIGHTS = 0x010e,
+ NES_AEQE_AEID_AMP_FASTREG_PBL_TABLE_OVERFLOW = 0x010f,
+ NES_AEQE_AEID_AMP_FASTREG_INVALID_LENGTH = 0x0110,
+ NES_AEQE_AEID_AMP_INVALIDATE_SHARED = 0x0111,
+ NES_AEQE_AEID_AMP_INVALIDATE_NO_REMOTE_ACCESS_RIGHTS = 0x0112,
+ NES_AEQE_AEID_AMP_INVALIDATE_MR_WITH_BOUND_WINDOWS = 0x0113,
+ NES_AEQE_AEID_AMP_MWBIND_VALID_STAG = 0x0114,
+ NES_AEQE_AEID_AMP_MWBIND_OF_MR_STAG = 0x0115,
+ NES_AEQE_AEID_AMP_MWBIND_TO_ZERO_BASED_STAG = 0x0116,
+ NES_AEQE_AEID_AMP_MWBIND_TO_MW_STAG = 0x0117,
+ NES_AEQE_AEID_AMP_MWBIND_INVALID_RIGHTS = 0x0118,
+ NES_AEQE_AEID_AMP_MWBIND_INVALID_BOUNDS = 0x0119,
+ NES_AEQE_AEID_AMP_MWBIND_TO_INVALID_PARENT = 0x011a,
+ NES_AEQE_AEID_AMP_MWBIND_BIND_DISABLED = 0x011b,
+ NES_AEQE_AEID_BAD_CLOSE = 0x0201,
+ NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE = 0x0202,
+ NES_AEQE_AEID_CQ_OPERATION_ERROR = 0x0203,
+ NES_AEQE_AEID_PRIV_OPERATION_DENIED = 0x0204,
+ NES_AEQE_AEID_RDMA_READ_WHILE_ORD_ZERO = 0x0205,
+ NES_AEQE_AEID_STAG_ZERO_INVALID = 0x0206,
+ NES_AEQE_AEID_DDP_INVALID_MSN_GAP_IN_MSN = 0x0301,
+ NES_AEQE_AEID_DDP_INVALID_MSN_RANGE_IS_NOT_VALID = 0x0302,
+ NES_AEQE_AEID_DDP_UBE_DDP_MESSAGE_TOO_LONG_FOR_AVAILABLE_BUFFER = 0x0303,
+ NES_AEQE_AEID_DDP_UBE_INVALID_DDP_VERSION = 0x0304,
+ NES_AEQE_AEID_DDP_UBE_INVALID_MO = 0x0305,
+ NES_AEQE_AEID_DDP_UBE_INVALID_MSN_NO_BUFFER_AVAILABLE = 0x0306,
+ NES_AEQE_AEID_DDP_UBE_INVALID_QN = 0x0307,
+ NES_AEQE_AEID_DDP_NO_L_BIT = 0x0308,
+ NES_AEQE_AEID_RDMAP_ROE_INVALID_RDMAP_VERSION = 0x0311,
+ NES_AEQE_AEID_RDMAP_ROE_UNEXPECTED_OPCODE = 0x0312,
+ NES_AEQE_AEID_ROE_INVALID_RDMA_READ_REQUEST = 0x0313,
+ NES_AEQE_AEID_ROE_INVALID_RDMA_WRITE_OR_READ_RESP = 0x0314,
+ NES_AEQE_AEID_INVALID_ARP_ENTRY = 0x0401,
+ NES_AEQE_AEID_INVALID_TCP_OPTION_RCVD = 0x0402,
+ NES_AEQE_AEID_STALE_ARP_ENTRY = 0x0403,
+ NES_AEQE_AEID_LLP_CLOSE_COMPLETE = 0x0501,
+ NES_AEQE_AEID_LLP_CONNECTION_RESET = 0x0502,
+ NES_AEQE_AEID_LLP_FIN_RECEIVED = 0x0503,
+ NES_AEQE_AEID_LLP_RECEIVED_MARKER_AND_LENGTH_FIELDS_DONT_MATCH = 0x0504,
+ NES_AEQE_AEID_LLP_RECEIVED_MPA_CRC_ERROR = 0x0505,
+ NES_AEQE_AEID_LLP_SEGMENT_TOO_LARGE = 0x0506,
+ NES_AEQE_AEID_LLP_SEGMENT_TOO_SMALL = 0x0507,
+ NES_AEQE_AEID_LLP_SYN_RECEIVED = 0x0508,
+ NES_AEQE_AEID_LLP_TERMINATE_RECEIVED = 0x0509,
+ NES_AEQE_AEID_LLP_TOO_MANY_RETRIES = 0x050a,
+ NES_AEQE_AEID_LLP_TOO_MANY_KEEPALIVE_RETRIES = 0x050b,
+ NES_AEQE_AEID_RESET_SENT = 0x0601,
+ NES_AEQE_AEID_TERMINATE_SENT = 0x0602,
+ NES_AEQE_AEID_DDP_LCE_LOCAL_CATASTROPHIC = 0x0700
+};
+
+enum nes_iwarp_sq_opcodes {
+ NES_IWARP_SQ_WQE_WRPDU = (1<<15),
+ NES_IWARP_SQ_WQE_PSH = (1<<21),
+ NES_IWARP_SQ_WQE_STREAMING = (1<<23),
+ NES_IWARP_SQ_WQE_IMM_DATA = (1<<28),
+ NES_IWARP_SQ_WQE_READ_FENCE = (1<<29),
+ NES_IWARP_SQ_WQE_LOCAL_FENCE = (1<<30),
+ NES_IWARP_SQ_WQE_SIGNALED_COMPL = (1<<31),
+};
+
+enum nes_iwarp_sq_wqe_bits {
+ NES_IWARP_SQ_OP_RDMAW = 0,
+ NES_IWARP_SQ_OP_RDMAR = 1,
+ NES_IWARP_SQ_OP_SEND = 3,
+ NES_IWARP_SQ_OP_SENDINV = 4,
+ NES_IWARP_SQ_OP_SENDSE = 5,
+ NES_IWARP_SQ_OP_SENDSEINV = 6,
+ NES_IWARP_SQ_OP_BIND = 8,
+ NES_IWARP_SQ_OP_FAST_REG = 9,
+ NES_IWARP_SQ_OP_LOCINV = 10,
+ NES_IWARP_SQ_OP_RDMAR_LOCINV = 11,
+ NES_IWARP_SQ_OP_NOP = 12,
+};
+
+enum nes_iwarp_cqe_major_code {
+ NES_IWARP_CQE_MAJOR_FLUSH = 1,
+ NES_IWARP_CQE_MAJOR_DRV = 0x8000
+};
+
+enum nes_iwarp_cqe_minor_code {
+ NES_IWARP_CQE_MINOR_FLUSH = 1
+};
+
+#define NES_EEPROM_READ_REQUEST (1<<16)
+#define NES_MAC_ADDR_VALID (1<<20)
+
+/*
+ * NES index registers init values.
+ */
+struct nes_init_values {
+ u32 index;
+ u32 data;
+ u8 wrt;
+};
+
+/*
+ * NES registers in BAR0.
+ */
+struct nes_pci_regs {
+ u32 int_status;
+ u32 int_mask;
+ u32 int_pending;
+ u32 intf_int_status;
+ u32 intf_int_mask;
+ u32 other_regs[59]; /* pad out to 256 bytes for now */
+};
+
+#define NES_CQP_SQ_SIZE 128
+#define NES_CCQ_SIZE 128
+#define NES_NIC_WQ_SIZE 512
+#define NES_NIC_CTX_SIZE ((NES_NIC_CTX_RQ_SIZE_512) | (NES_NIC_CTX_SQ_SIZE_512))
+#define NES_NIC_BACK_STORE 0x00038000
+
+struct nes_device;
+
+struct nes_hw_nic_qp_context {
+ __le32 context_words[6];
+};
+
+struct nes_hw_nic_sq_wqe {
+ __le32 wqe_words[16];
+};
+
+struct nes_hw_nic_rq_wqe {
+ __le32 wqe_words[16];
+};
+
+struct nes_hw_nic_cqe {
+ __le32 cqe_words[4];
+};
+
+struct nes_hw_cqp_qp_context {
+ __le32 context_words[4];
+};
+
+struct nes_hw_cqp_wqe {
+ __le32 wqe_words[16];
+};
+
+struct nes_hw_qp_wqe {
+ __le32 wqe_words[32];
+};
+
+struct nes_hw_cqe {
+ __le32 cqe_words[8];
+};
+
+struct nes_hw_ceqe {
+ __le32 ceqe_words[2];
+};
+
+struct nes_hw_aeqe {
+ __le32 aeqe_words[4];
+};
+
+struct nes_cqp_request {
+ union {
+ u64 cqp_callback_context;
+ void *cqp_callback_pointer;
+ };
+ wait_queue_head_t waitq;
+ struct nes_hw_cqp_wqe cqp_wqe;
+ struct list_head list;
+ atomic_t refcount;
+ void (*cqp_callback)(struct nes_device *nesdev, struct nes_cqp_request *cqp_request);
+ u16 major_code;
+ u16 minor_code;
+ u8 waiting;
+ u8 request_done;
+ u8 dynamic;
+ u8 callback;
+};
+
+struct nes_hw_cqp {
+ struct nes_hw_cqp_wqe *sq_vbase;
+ dma_addr_t sq_pbase;
+ spinlock_t lock;
+ wait_queue_head_t waitq;
+ u16 qp_id;
+ u16 sq_head;
+ u16 sq_tail;
+ u16 sq_size;
+};
+
+#define NES_FIRST_FRAG_SIZE 128
+struct nes_first_frag {
+ u8 buffer[NES_FIRST_FRAG_SIZE];
+};
+
+struct nes_hw_nic {
+ struct nes_first_frag *first_frag_vbase; /* virtual address of first frags */
+ struct nes_hw_nic_sq_wqe *sq_vbase; /* virtual address of sq */
+ struct nes_hw_nic_rq_wqe *rq_vbase; /* virtual address of rq */
+ struct sk_buff *tx_skb[NES_NIC_WQ_SIZE];
+ struct sk_buff *rx_skb[NES_NIC_WQ_SIZE];
+ dma_addr_t frag_paddr[NES_NIC_WQ_SIZE];
+ unsigned long first_frag_overflow[BITS_TO_LONGS(NES_NIC_WQ_SIZE)];
+ dma_addr_t sq_pbase; /* PCI memory for host rings */
+ dma_addr_t rq_pbase; /* PCI memory for host rings */
+
+ u16 qp_id;
+ u16 sq_head;
+ u16 sq_tail;
+ u16 sq_size;
+ u16 rq_head;
+ u16 rq_tail;
+ u16 rq_size;
+ u8 replenishing_rq;
+ u8 reserved;
+
+ spinlock_t rq_lock;
+};
+
+struct nes_hw_nic_cq {
+ struct nes_hw_nic_cqe volatile *cq_vbase; /* PCI memory for host rings */
+ void (*ce_handler)(struct nes_device *nesdev, struct nes_hw_nic_cq *cq);
+ dma_addr_t cq_pbase; /* PCI memory for host rings */
+ int rx_cqes_completed;
+ int cqe_allocs_pending;
+ int rx_pkts_indicated;
+ u16 cq_head;
+ u16 cq_size;
+ u16 cq_number;
+ u8 cqes_pending;
+};
+
+struct nes_hw_qp {
+ struct nes_hw_qp_wqe *sq_vbase; /* PCI memory for host rings */
+ struct nes_hw_qp_wqe *rq_vbase; /* PCI memory for host rings */
+ void *q2_vbase; /* PCI memory for host rings */
+ dma_addr_t sq_pbase; /* PCI memory for host rings */
+ dma_addr_t rq_pbase; /* PCI memory for host rings */
+ dma_addr_t q2_pbase; /* PCI memory for host rings */
+ u32 qp_id;
+ u16 sq_head;
+ u16 sq_tail;
+ u16 sq_size;
+ u16 rq_head;
+ u16 rq_tail;
+ u16 rq_size;
+ u8 rq_encoded_size;
+ u8 sq_encoded_size;
+};
+
+struct nes_hw_cq {
+ struct nes_hw_cqe *cq_vbase; /* PCI memory for host rings */
+ void (*ce_handler)(struct nes_device *nesdev, struct nes_hw_cq *cq);
+ dma_addr_t cq_pbase; /* PCI memory for host rings */
+ u16 cq_head;
+ u16 cq_size;
+ u16 cq_number;
+};
+
+struct nes_hw_ceq {
+ struct nes_hw_ceqe volatile *ceq_vbase; /* PCI memory for host rings */
+ dma_addr_t ceq_pbase; /* PCI memory for host rings */
+ u16 ceq_head;
+ u16 ceq_size;
+};
+
+struct nes_hw_aeq {
+ struct nes_hw_aeqe volatile *aeq_vbase; /* PCI memory for host rings */
+ dma_addr_t aeq_pbase; /* PCI memory for host rings */
+ u16 aeq_head;
+ u16 aeq_size;
+};
+
+struct nic_qp_map {
+ u8 qpid;
+ u8 nic_index;
+ u8 logical_port;
+ u8 is_hnic;
+};
+
+#define NES_CQP_ARP_AEQ_INDEX_MASK 0x000f0000
+#define NES_CQP_ARP_AEQ_INDEX_SHIFT 16
+
+#define NES_CQP_APBVT_ADD 0x00008000
+#define NES_CQP_APBVT_NIC_SHIFT 16
+
+#define NES_ARP_ADD 1
+#define NES_ARP_DELETE 2
+#define NES_ARP_RESOLVE 3
+
+#define NES_MAC_SW_IDLE 0
+#define NES_MAC_SW_INTERRUPT 1
+#define NES_MAC_SW_MH 2
+
+struct nes_arp_entry {
+ u32 ip_addr;
+ u8 mac_addr[ETH_ALEN];
+};
+
+#define NES_NIC_FAST_TIMER 96
+#define NES_NIC_FAST_TIMER_LOW 40
+#define NES_NIC_FAST_TIMER_HIGH 1000
+#define DEFAULT_NES_QL_HIGH 256
+#define DEFAULT_NES_QL_LOW 16
+#define DEFAULT_NES_QL_TARGET 64
+#define DEFAULT_JUMBO_NES_QL_LOW 12
+#define DEFAULT_JUMBO_NES_QL_TARGET 40
+#define DEFAULT_JUMBO_NES_QL_HIGH 128
+#define NES_NIC_CQ_DOWNWARD_TREND 16
+#define NES_PFT_SIZE 48
+
+#define NES_MGT_WQ_COUNT 32
+#define NES_MGT_CTX_SIZE ((NES_NIC_CTX_RQ_SIZE_32) | (NES_NIC_CTX_SQ_SIZE_32))
+#define NES_MGT_QP_OFFSET 36
+#define NES_MGT_QP_COUNT 4
+
+struct nes_hw_tune_timer {
+ /* u16 cq_count; */
+ u16 threshold_low;
+ u16 threshold_target;
+ u16 threshold_high;
+ u16 timer_in_use;
+ u16 timer_in_use_old;
+ u16 timer_in_use_min;
+ u16 timer_in_use_max;
+ u8 timer_direction_upward;
+ u8 timer_direction_downward;
+ u16 cq_count_old;
+ u8 cq_direction_downward;
+};
+
+#define NES_TIMER_INT_LIMIT 2
+#define NES_TIMER_INT_LIMIT_DYNAMIC 10
+#define NES_TIMER_ENABLE_LIMIT 4
+#define NES_MAX_LINK_INTERRUPTS 128
+#define NES_MAX_LINK_CHECK 200
+#define NES_MAX_LRO_DESCRIPTORS 32
+#define NES_LRO_MAX_AGGR 64
+
+struct nes_adapter {
+ u64 fw_ver;
+ unsigned long *allocated_qps;
+ unsigned long *allocated_cqs;
+ unsigned long *allocated_mrs;
+ unsigned long *allocated_pds;
+ unsigned long *allocated_arps;
+ struct nes_qp **qp_table;
+ struct workqueue_struct *work_q;
+
+ struct list_head list;
+ struct list_head active_listeners;
+ /* list of the netdevs associated with each logical port */
+ struct list_head nesvnic_list[4];
+
+ struct timer_list mh_timer;
+ struct timer_list lc_timer;
+ struct work_struct work;
+ spinlock_t resource_lock;
+ spinlock_t phy_lock;
+ spinlock_t pbl_lock;
+ spinlock_t periodic_timer_lock;
+
+ struct nes_arp_entry arp_table[NES_MAX_ARP_TABLE_SIZE];
+
+ /* Adapter CEQ and AEQs */
+ struct nes_hw_ceq ceq[16];
+ struct nes_hw_aeq aeq[8];
+
+ struct nes_hw_tune_timer tune_timer;
+
+ unsigned long doorbell_start;
+
+ u32 hw_rev;
+ u32 vendor_id;
+ u32 vendor_part_id;
+ u32 device_cap_flags;
+ u32 tick_delta;
+ u32 timer_int_req;
+ u32 arp_table_size;
+ u32 next_arp_index;
+
+ u32 max_mr;
+ u32 max_256pbl;
+ u32 max_4kpbl;
+ u32 free_256pbl;
+ u32 free_4kpbl;
+ u32 max_mr_size;
+ u32 max_qp;
+ u32 next_qp;
+ u32 max_irrq;
+ u32 max_qp_wr;
+ u32 max_sge;
+ u32 max_cq;
+ u32 next_cq;
+ u32 max_cqe;
+ u32 max_pd;
+ u32 base_pd;
+ u32 next_pd;
+ u32 hte_index_mask;
+
+ /* EEPROM information */
+ u32 rx_pool_size;
+ u32 tx_pool_size;
+ u32 rx_threshold;
+ u32 tcp_timer_core_clk_divisor;
+ u32 iwarp_config;
+ u32 cm_config;
+ u32 sws_timer_config;
+ u32 tcp_config1;
+ u32 wqm_wat;
+ u32 core_clock;
+ u32 firmware_version;
+ u32 eeprom_version;
+
+ u32 nic_rx_eth_route_err;
+
+ u32 et_rx_coalesce_usecs;
+ u32 et_rx_max_coalesced_frames;
+ u32 et_rx_coalesce_usecs_irq;
+ u32 et_rx_max_coalesced_frames_irq;
+ u32 et_pkt_rate_low;
+ u32 et_rx_coalesce_usecs_low;
+ u32 et_rx_max_coalesced_frames_low;
+ u32 et_pkt_rate_high;
+ u32 et_rx_coalesce_usecs_high;
+ u32 et_rx_max_coalesced_frames_high;
+ u32 et_rate_sample_interval;
+ u32 timer_int_limit;
+ u32 wqm_quanta;
+ u8 allow_unaligned_fpdus;
+
+ /* Adapter base MAC address */
+ u32 mac_addr_low;
+ u16 mac_addr_high;
+
+ u16 firmware_eeprom_offset;
+ u16 software_eeprom_offset;
+
+ u16 max_irrq_wr;
+
+ /* pd config for each port */
+ u16 pd_config_size[4];
+ u16 pd_config_base[4];
+
+ u16 link_interrupt_count[4];
+ u8 crit_error_count[32];
+
+ /* the phy index for each port */
+ u8 phy_index[4];
+ u8 mac_sw_state[4];
+ u8 mac_link_down[4];
+ u8 phy_type[4];
+ u8 log_port;
+
+ /* PCI information */
+ unsigned int devfn;
+ unsigned char bus_number;
+ unsigned char OneG_Mode;
+
+ unsigned char ref_count;
+ u8 netdev_count;
+ u8 netdev_max; /* from host nic address count in EEPROM */
+ u8 port_count;
+ u8 virtwq;
+ u8 send_term_ok;
+ u8 et_use_adaptive_rx_coalesce;
+ u8 adapter_fcn_count;
+ u8 pft_mcast_map[NES_PFT_SIZE];
+};
+
+struct nes_pbl {
+ u64 *pbl_vbase;
+ dma_addr_t pbl_pbase;
+ struct page *page;
+ unsigned long user_base;
+ u32 pbl_size;
+ struct list_head list;
+ /* TODO: need to add list for two level tables */
+};
+
+#define NES_4K_PBL_CHUNK_SIZE 4096
+
+struct nes_fast_mr_wqe_pbl {
+ u64 *kva;
+ dma_addr_t paddr;
+};
+
+struct nes_ib_fast_reg_page_list {
+ struct ib_fast_reg_page_list ibfrpl;
+ struct nes_fast_mr_wqe_pbl nes_wqe_pbl;
+ u64 pbl;
+};
+
+struct nes_listener {
+ struct work_struct work;
+ struct workqueue_struct *wq;
+ struct nes_vnic *nesvnic;
+ struct iw_cm_id *cm_id;
+ struct list_head list;
+ unsigned long socket;
+ u8 accept_failed;
+};
+
+struct nes_ib_device;
+
+#define NES_EVENT_DELAY msecs_to_jiffies(100)
+
+struct nes_vnic {
+ struct nes_ib_device *nesibdev;
+ u64 sq_full;
+ u64 tso_requests;
+ u64 segmented_tso_requests;
+ u64 linearized_skbs;
+ u64 tx_sw_dropped;
+ u64 endnode_nstat_rx_discard;
+ u64 endnode_nstat_rx_octets;
+ u64 endnode_nstat_rx_frames;
+ u64 endnode_nstat_tx_octets;
+ u64 endnode_nstat_tx_frames;
+ u64 endnode_ipv4_tcp_retransmits;
+ /* void *mem; */
+ struct nes_device *nesdev;
+ struct net_device *netdev;
+ atomic_t rx_skbs_needed;
+ atomic_t rx_skb_timer_running;
+ int budget;
+ u32 msg_enable;
+ /* u32 tx_avail; */
+ __be32 local_ipaddr;
+ struct napi_struct napi;
+ spinlock_t tx_lock; /* could use netdev tx lock? */
+ struct timer_list rq_wqes_timer;
+ u32 nic_mem_size;
+ void *nic_vbase;
+ dma_addr_t nic_pbase;
+ struct nes_hw_nic nic;
+ struct nes_hw_nic_cq nic_cq;
+ u32 mcrq_qp_id;
+ struct nes_ucontext *mcrq_ucontext;
+ struct nes_cqp_request* (*get_cqp_request)(struct nes_device *nesdev);
+ void (*post_cqp_request)(struct nes_device*, struct nes_cqp_request *);
+ int (*mcrq_mcast_filter)( struct nes_vnic* nesvnic, __u8* dmi_addr );
+ struct net_device_stats netstats;
+ /* used to put the netdev on the adapter's logical port list */
+ struct list_head list;
+ u16 max_frame_size;
+ u8 netdev_open;
+ u8 linkup;
+ u8 logical_port;
+ u8 netdev_index; /* might not be needed, indexes nesdev->netdev */
+ u8 perfect_filter_index;
+ u8 nic_index;
+ u8 qp_nic_index[4];
+ u8 next_qp_nic_index;
+ u8 of_device_registered;
+ u8 rdma_enabled;
+ u32 lro_max_aggr;
+ struct net_lro_mgr lro_mgr;
+ struct net_lro_desc lro_desc[NES_MAX_LRO_DESCRIPTORS];
+ struct timer_list event_timer;
+ enum ib_event_type delayed_event;
+ enum ib_event_type last_dispatched_event;
+ spinlock_t port_ibevent_lock;
+ u32 mgt_mem_size;
+ void *mgt_vbase;
+ dma_addr_t mgt_pbase;
+ struct nes_vnic_mgt *mgtvnic[NES_MGT_QP_COUNT];
+ struct task_struct *mgt_thread;
+ wait_queue_head_t mgt_wait_queue;
+ struct sk_buff_head mgt_skb_list;
+
+};
+
+struct nes_ib_device {
+ struct ib_device ibdev;
+ struct nes_vnic *nesvnic;
+
+ /* Virtual RNIC Limits */
+ u32 max_mr;
+ u32 max_qp;
+ u32 max_cq;
+ u32 max_pd;
+ u32 num_mr;
+ u32 num_qp;
+ u32 num_cq;
+ u32 num_pd;
+};
+
+enum nes_hdrct_flags {
+ DDP_LEN_FLAG = 0x80,
+ DDP_HDR_FLAG = 0x40,
+ RDMA_HDR_FLAG = 0x20
+};
+
+enum nes_term_layers {
+ LAYER_RDMA = 0,
+ LAYER_DDP = 1,
+ LAYER_MPA = 2
+};
+
+enum nes_term_error_types {
+ RDMAP_CATASTROPHIC = 0,
+ RDMAP_REMOTE_PROT = 1,
+ RDMAP_REMOTE_OP = 2,
+ DDP_CATASTROPHIC = 0,
+ DDP_TAGGED_BUFFER = 1,
+ DDP_UNTAGGED_BUFFER = 2,
+ DDP_LLP = 3
+};
+
+enum nes_term_rdma_errors {
+ RDMAP_INV_STAG = 0x00,
+ RDMAP_INV_BOUNDS = 0x01,
+ RDMAP_ACCESS = 0x02,
+ RDMAP_UNASSOC_STAG = 0x03,
+ RDMAP_TO_WRAP = 0x04,
+ RDMAP_INV_RDMAP_VER = 0x05,
+ RDMAP_UNEXPECTED_OP = 0x06,
+ RDMAP_CATASTROPHIC_LOCAL = 0x07,
+ RDMAP_CATASTROPHIC_GLOBAL = 0x08,
+ RDMAP_CANT_INV_STAG = 0x09,
+ RDMAP_UNSPECIFIED = 0xff
+};
+
+enum nes_term_ddp_errors {
+ DDP_CATASTROPHIC_LOCAL = 0x00,
+ DDP_TAGGED_INV_STAG = 0x00,
+ DDP_TAGGED_BOUNDS = 0x01,
+ DDP_TAGGED_UNASSOC_STAG = 0x02,
+ DDP_TAGGED_TO_WRAP = 0x03,
+ DDP_TAGGED_INV_DDP_VER = 0x04,
+ DDP_UNTAGGED_INV_QN = 0x01,
+ DDP_UNTAGGED_INV_MSN_NO_BUF = 0x02,
+ DDP_UNTAGGED_INV_MSN_RANGE = 0x03,
+ DDP_UNTAGGED_INV_MO = 0x04,
+ DDP_UNTAGGED_INV_TOO_LONG = 0x05,
+ DDP_UNTAGGED_INV_DDP_VER = 0x06
+};
+
+enum nes_term_mpa_errors {
+ MPA_CLOSED = 0x01,
+ MPA_CRC = 0x02,
+ MPA_MARKER = 0x03,
+ MPA_REQ_RSP = 0x04,
+};
+
+struct nes_terminate_hdr {
+ u8 layer_etype;
+ u8 error_code;
+ u8 hdrct;
+ u8 rsvd;
+};
+
+/* Used to determine how to fill in terminate error codes */
+#define IWARP_OPCODE_WRITE 0
+#define IWARP_OPCODE_READREQ 1
+#define IWARP_OPCODE_READRSP 2
+#define IWARP_OPCODE_SEND 3
+#define IWARP_OPCODE_SEND_INV 4
+#define IWARP_OPCODE_SEND_SE 5
+#define IWARP_OPCODE_SEND_SE_INV 6
+#define IWARP_OPCODE_TERM 7
+
+/* These values are used only during terminate processing */
+#define TERM_DDP_LEN_TAGGED 14
+#define TERM_DDP_LEN_UNTAGGED 18
+#define TERM_RDMA_LEN 28
+#define RDMA_OPCODE_MASK 0x0f
+#define RDMA_READ_REQ_OPCODE 1
+#define BAD_FRAME_OFFSET 64
+#define CQE_MAJOR_DRV 0x8000
+
+/* Used for link status recheck after interrupt processing */
+#define NES_LINK_RECHECK_DELAY msecs_to_jiffies(50)
+#define NES_LINK_RECHECK_MAX 60
+
+#endif /* __NES_HW_H */
diff --git a/drivers/infiniband/hw/nes/nes_mgt.c b/drivers/infiniband/hw/nes/nes_mgt.c
new file mode 100644
index 000000000..416645259
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_mgt.c
@@ -0,0 +1,1160 @@
+/*
+ * Copyright (c) 2006 - 2011 Intel-NE, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/skbuff.h>
+#include <linux/etherdevice.h>
+#include <linux/kthread.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <net/tcp.h>
+#include "nes.h"
+#include "nes_mgt.h"
+
+atomic_t pau_qps_created;
+atomic_t pau_qps_destroyed;
+
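+/**
+ * nes_replenish_mgt_rq - refill the mgt receive queue with freshly mapped skbs
+ */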
+static void nes_replenish_mgt_rq(struct nes_vnic_mgt *mgtvnic)
+{
+ unsigned long flags;
+ dma_addr_t bus_address;
+ struct sk_buff *skb;
+ struct nes_hw_nic_rq_wqe *nic_rqe;
+ struct nes_hw_mgt *nesmgt;
+ struct nes_device *nesdev;
+ struct nes_rskb_cb *cb;
+ u32 rx_wqes_posted = 0;
+
+ nesmgt = &mgtvnic->mgt;
+ nesdev = mgtvnic->nesvnic->nesdev;
+ spin_lock_irqsave(&nesmgt->rq_lock, flags);
+ if (nesmgt->replenishing_rq != 0) {
+ if (((nesmgt->rq_size - 1) == atomic_read(&mgtvnic->rx_skbs_needed)) &&
+ (atomic_read(&mgtvnic->rx_skb_timer_running) == 0)) {
+ atomic_set(&mgtvnic->rx_skb_timer_running, 1);
+ spin_unlock_irqrestore(&nesmgt->rq_lock, flags);
+ mgtvnic->rq_wqes_timer.expires = jiffies + (HZ / 2); /* 1/2 second */
+ add_timer(&mgtvnic->rq_wqes_timer);
+ } else {
+ spin_unlock_irqrestore(&nesmgt->rq_lock, flags);
+ }
+ return;
+ }
+ nesmgt->replenishing_rq = 1;
+ spin_unlock_irqrestore(&nesmgt->rq_lock, flags);
+ do {
+ skb = dev_alloc_skb(mgtvnic->nesvnic->max_frame_size);
+ if (skb) {
+ skb->dev = mgtvnic->nesvnic->netdev;
+
+ bus_address = pci_map_single(nesdev->pcidev,
+ skb->data, mgtvnic->nesvnic->max_frame_size, PCI_DMA_FROMDEVICE);
+ cb = (struct nes_rskb_cb *)&skb->cb[0];
+ cb->busaddr = bus_address;
+ cb->maplen = mgtvnic->nesvnic->max_frame_size;
+
+ nic_rqe = &nesmgt->rq_vbase[mgtvnic->mgt.rq_head];
+ nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] =
+ cpu_to_le32(mgtvnic->nesvnic->max_frame_size);
+ nic_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0;
+ nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] =
+ cpu_to_le32((u32)bus_address);
+ nic_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] =
+ cpu_to_le32((u32)((u64)bus_address >> 32));
+ nesmgt->rx_skb[nesmgt->rq_head] = skb;
+ nesmgt->rq_head++;
+ nesmgt->rq_head &= nesmgt->rq_size - 1;
+ atomic_dec(&mgtvnic->rx_skbs_needed);
+ barrier();
+ if (++rx_wqes_posted == 255) {
+ nes_write32(nesdev->regs + NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesmgt->qp_id);
+ rx_wqes_posted = 0;
+ }
+ } else {
+ spin_lock_irqsave(&nesmgt->rq_lock, flags);
+ if (((nesmgt->rq_size - 1) == atomic_read(&mgtvnic->rx_skbs_needed)) &&
+ (atomic_read(&mgtvnic->rx_skb_timer_running) == 0)) {
+ atomic_set(&mgtvnic->rx_skb_timer_running, 1);
+ spin_unlock_irqrestore(&nesmgt->rq_lock, flags);
+ mgtvnic->rq_wqes_timer.expires = jiffies + (HZ / 2); /* 1/2 second */
+ add_timer(&mgtvnic->rq_wqes_timer);
+ } else {
+ spin_unlock_irqrestore(&nesmgt->rq_lock, flags);
+ }
+ break;
+ }
+ } while (atomic_read(&mgtvnic->rx_skbs_needed));
+ barrier();
+ if (rx_wqes_posted)
+ nes_write32(nesdev->regs + NES_WQE_ALLOC, (rx_wqes_posted << 24) | nesmgt->qp_id);
+ nesmgt->replenishing_rq = 0;
+}
+
+/**
+ * nes_mgt_rq_wqes_timeout - timer handler that retries replenishing the mgt RQ
+ */
+static void nes_mgt_rq_wqes_timeout(unsigned long parm)
+{
+ struct nes_vnic_mgt *mgtvnic = (struct nes_vnic_mgt *)parm;
+
+ atomic_set(&mgtvnic->rx_skb_timer_running, 0);
+ if (atomic_read(&mgtvnic->rx_skbs_needed))
+ nes_replenish_mgt_rq(mgtvnic);
+}
+
+/**
+ * nes_mgt_free_skb - unmap and free skb
+ */
+static void nes_mgt_free_skb(struct nes_device *nesdev, struct sk_buff *skb, u32 dir)
+{
+ struct nes_rskb_cb *cb;
+
+ cb = (struct nes_rskb_cb *)&skb->cb[0];
+ pci_unmap_single(nesdev->pcidev, cb->busaddr, cb->maplen, dir);
+ cb->busaddr = 0;
+ dev_kfree_skb_any(skb);
+}
+
+/**
+ * nes_download_callback - handle download completions
+ */
+static void nes_download_callback(struct nes_device *nesdev, struct nes_cqp_request *cqp_request)
+{
+ struct pau_fpdu_info *fpdu_info = cqp_request->cqp_callback_pointer;
+ struct nes_qp *nesqp = fpdu_info->nesqp;
+ struct sk_buff *skb;
+ int i;
+
+ for (i = 0; i < fpdu_info->frag_cnt; i++) {
+ skb = fpdu_info->frags[i].skb;
+ if (fpdu_info->frags[i].cmplt) {
+ nes_mgt_free_skb(nesdev, skb, PCI_DMA_TODEVICE);
+ nes_rem_ref_cm_node(nesqp->cm_node);
+ }
+ }
+
+ if (fpdu_info->hdr_vbase)
+ pci_free_consistent(nesdev->pcidev, fpdu_info->hdr_len,
+ fpdu_info->hdr_vbase, fpdu_info->hdr_pbase);
+ kfree(fpdu_info);
+}
+
+/**
+ * nes_get_seq - Get the seq, ack_seq and window from the packet
+ */
+static u32 nes_get_seq(struct sk_buff *skb, u32 *ack, u16 *wnd, u32 *fin_rcvd, u32 *rst_rcvd)
+{
+ struct nes_rskb_cb *cb = (struct nes_rskb_cb *)&skb->cb[0];
+ struct iphdr *iph = (struct iphdr *)(cb->data_start + ETH_HLEN);
+ struct tcphdr *tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl));
+
+ *ack = be32_to_cpu(tcph->ack_seq);
+ *wnd = be16_to_cpu(tcph->window);
+ *fin_rcvd = tcph->fin;
+ *rst_rcvd = tcph->rst;
+ return be32_to_cpu(tcph->seq);
+}
+
+/**
+ * nes_get_next_skb - Get the next skb based on where the current skb is in the queue
+ */
+static struct sk_buff *nes_get_next_skb(struct nes_device *nesdev, struct nes_qp *nesqp,
+ struct sk_buff *skb, u32 nextseq, u32 *ack,
+ u16 *wnd, u32 *fin_rcvd, u32 *rst_rcvd)
+{
+ u32 seq;
+ bool processacks;
+ struct sk_buff *old_skb;
+
+ if (skb) {
+ /* Continue processing fpdu */
+ if (skb->next == (struct sk_buff *)&nesqp->pau_list)
+ goto out;
+ skb = skb->next;
+ processacks = false;
+ } else {
+ /* Starting a new one */
+ if (skb_queue_empty(&nesqp->pau_list))
+ goto out;
+ skb = skb_peek(&nesqp->pau_list);
+ processacks = true;
+ }
+
+ while (1) {
+ if (skb_queue_empty(&nesqp->pau_list))
+ goto out;
+
+ seq = nes_get_seq(skb, ack, wnd, fin_rcvd, rst_rcvd);
+ if (seq == nextseq) {
+ if (skb->len || processacks)
+ break;
+ } else if (after(seq, nextseq)) {
+ goto out;
+ }
+
+ old_skb = skb;
+ skb = skb->next;
+ skb_unlink(old_skb, &nesqp->pau_list);
+ nes_mgt_free_skb(nesdev, old_skb, PCI_DMA_TODEVICE);
+ nes_rem_ref_cm_node(nesqp->cm_node);
+ if (skb == (struct sk_buff *)&nesqp->pau_list)
+ goto out;
+ }
+ return skb;
+
+out:
+ return NULL;
+}
+
+/**
+ * get_fpdu_info - Find the next complete fpdu and return its fragments.
+ */
+static int get_fpdu_info(struct nes_device *nesdev, struct nes_qp *nesqp,
+ struct pau_fpdu_info **pau_fpdu_info)
+{
+ struct sk_buff *skb;
+ struct iphdr *iph;
+ struct tcphdr *tcph;
+ struct nes_rskb_cb *cb;
+ struct pau_fpdu_info *fpdu_info = NULL;
+ struct pau_fpdu_frag frags[MAX_FPDU_FRAGS];
+ u32 fpdu_len = 0;
+ u32 tmp_len;
+ int frag_cnt = 0;
+ u32 tot_len;
+ u32 frag_tot;
+ u32 ack;
+ u32 fin_rcvd;
+ u32 rst_rcvd;
+ u16 wnd;
+ int i;
+ int rc = 0;
+
+ *pau_fpdu_info = NULL;
+
+ skb = nes_get_next_skb(nesdev, nesqp, NULL, nesqp->pau_rcv_nxt, &ack, &wnd, &fin_rcvd, &rst_rcvd);
+ if (!skb)
+ goto out;
+
+ cb = (struct nes_rskb_cb *)&skb->cb[0];
+ if (skb->len) {
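+ /* MPA length field plus framing bytes, rounded up to a 4-byte boundary */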
+ fpdu_len = be16_to_cpu(*(__be16 *) skb->data) + MPA_FRAMING;
+ fpdu_len = (fpdu_len + 3) & 0xfffffffc;
+ tmp_len = fpdu_len;
+
+ /* See if we have all of the fpdu */
+ frag_tot = 0;
+ memset(&frags, 0, sizeof frags);
+ for (i = 0; i < MAX_FPDU_FRAGS; i++) {
+ frags[i].physaddr = cb->busaddr;
+ frags[i].physaddr += skb->data - cb->data_start;
+ frags[i].frag_len = min(tmp_len, skb->len);
+ frags[i].skb = skb;
+ frags[i].cmplt = (skb->len == frags[i].frag_len);
+ frag_tot += frags[i].frag_len;
+ frag_cnt++;
+
+ tmp_len -= frags[i].frag_len;
+ if (tmp_len == 0)
+ break;
+
+ skb = nes_get_next_skb(nesdev, nesqp, skb,
+ nesqp->pau_rcv_nxt + frag_tot, &ack, &wnd, &fin_rcvd, &rst_rcvd);
+ if (!skb)
+ goto out;
+ if (rst_rcvd) {
+ /* rst received in the middle of fpdu */
+ for (; i >= 0; i--) {
+ skb_unlink(frags[i].skb, &nesqp->pau_list);
+ nes_mgt_free_skb(nesdev, frags[i].skb, PCI_DMA_TODEVICE);
+ }
+ cb = (struct nes_rskb_cb *)&skb->cb[0];
+ frags[0].physaddr = cb->busaddr;
+ frags[0].physaddr += skb->data - cb->data_start;
+ frags[0].frag_len = skb->len;
+ frags[0].skb = skb;
+ frags[0].cmplt = true;
+ frag_cnt = 1;
+ break;
+ }
+
+ cb = (struct nes_rskb_cb *)&skb->cb[0];
+ }
+ } else {
+ /* no data */
+ frags[0].physaddr = cb->busaddr;
+ frags[0].frag_len = 0;
+ frags[0].skb = skb;
+ frags[0].cmplt = true;
+ frag_cnt = 1;
+ }
+
+ /* Found one */
+ fpdu_info = kzalloc(sizeof(*fpdu_info), GFP_ATOMIC);
+ if (fpdu_info == NULL) {
+ nes_debug(NES_DBG_PAU, "Failed to alloc a fpdu_info.\n");
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ fpdu_info->cqp_request = nes_get_cqp_request(nesdev);
+ if (fpdu_info->cqp_request == NULL) {
+ nes_debug(NES_DBG_PAU, "Failed to get a cqp_request.\n");
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ cb = (struct nes_rskb_cb *)&frags[0].skb->cb[0];
+ iph = (struct iphdr *)(cb->data_start + ETH_HLEN);
+ tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl));
+ fpdu_info->hdr_len = (((unsigned char *)tcph) + 4 * (tcph->doff)) - cb->data_start;
+ fpdu_info->data_len = fpdu_len;
+ tot_len = fpdu_info->hdr_len + fpdu_len - ETH_HLEN;
+
+ if (frags[0].cmplt) {
+ fpdu_info->hdr_pbase = cb->busaddr;
+ fpdu_info->hdr_vbase = NULL;
+ } else {
+ fpdu_info->hdr_vbase = pci_alloc_consistent(nesdev->pcidev,
+ fpdu_info->hdr_len, &fpdu_info->hdr_pbase);
+ if (!fpdu_info->hdr_vbase) {
+ nes_debug(NES_DBG_PAU, "Unable to allocate memory for pau first frag\n");
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ /* Copy hdrs, adjusting len and seqnum */
+ memcpy(fpdu_info->hdr_vbase, cb->data_start, fpdu_info->hdr_len);
+ iph = (struct iphdr *)(fpdu_info->hdr_vbase + ETH_HLEN);
+ tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl));
+ }
+
+ iph->tot_len = cpu_to_be16(tot_len);
+ iph->saddr = cpu_to_be32(0x7f000001);
+
+ tcph->seq = cpu_to_be32(nesqp->pau_rcv_nxt);
+ tcph->ack_seq = cpu_to_be32(ack);
+ tcph->window = cpu_to_be16(wnd);
+
+ nesqp->pau_rcv_nxt += fpdu_len + fin_rcvd;
+
+ memcpy(fpdu_info->frags, frags, sizeof(fpdu_info->frags));
+ fpdu_info->frag_cnt = frag_cnt;
+ fpdu_info->nesqp = nesqp;
+ *pau_fpdu_info = fpdu_info;
+
+ /* Update skbs for next pass */
+ for (i = 0; i < frag_cnt; i++) {
+ cb = (struct nes_rskb_cb *)&frags[i].skb->cb[0];
+ skb_pull(frags[i].skb, frags[i].frag_len);
+
+ if (frags[i].skb->len == 0) {
+ /* Pull skb off the list - it will be freed in the callback */
+ if (!skb_queue_empty(&nesqp->pau_list))
+ skb_unlink(frags[i].skb, &nesqp->pau_list);
+ } else {
+ /* Last skb still has data so update the seq */
+ iph = (struct iphdr *)(cb->data_start + ETH_HLEN);
+ tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl));
+ tcph->seq = cpu_to_be32(nesqp->pau_rcv_nxt);
+ }
+ }
+
+out:
+ if (rc) {
+ if (fpdu_info) {
+ if (fpdu_info->cqp_request)
+ nes_put_cqp_request(nesdev, fpdu_info->cqp_request);
+ kfree(fpdu_info);
+ }
+ }
+ return rc;
+}
+
+/**
+ * forward_fpdus - send complete fpdus, one at a time
+ */
+static int forward_fpdus(struct nes_vnic *nesvnic, struct nes_qp *nesqp)
+{
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct pau_fpdu_info *fpdu_info;
+ struct nes_hw_cqp_wqe *cqp_wqe;
+ struct nes_cqp_request *cqp_request;
+ unsigned long flags;
+ u64 u64tmp;
+ u32 u32tmp;
+ int rc;
+
+ while (1) {
+ spin_lock_irqsave(&nesqp->pau_lock, flags);
+ rc = get_fpdu_info(nesdev, nesqp, &fpdu_info);
+ if (rc || (fpdu_info == NULL)) {
+ spin_unlock_irqrestore(&nesqp->pau_lock, flags);
+ return rc;
+ }
+
+ cqp_request = fpdu_info->cqp_request;
+ cqp_wqe = &cqp_request->cqp_wqe;
+ nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_DL_OPCODE_IDX,
+ NES_CQP_DOWNLOAD_SEGMENT |
+ (((u32)nesvnic->logical_port) << NES_CQP_OP_LOGICAL_PORT_SHIFT));
+
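+ /* upper 16 bits: header length, lower 16 bits: total (header + data) length */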
+ u32tmp = fpdu_info->hdr_len << 16;
+ u32tmp |= fpdu_info->hdr_len + (u32)fpdu_info->data_len;
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_DL_LENGTH_0_TOTAL_IDX,
+ u32tmp);
+
+ u32tmp = (fpdu_info->frags[1].frag_len << 16) | fpdu_info->frags[0].frag_len;
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_LENGTH_2_1_IDX,
+ u32tmp);
+
+ u32tmp = (fpdu_info->frags[3].frag_len << 16) | fpdu_info->frags[2].frag_len;
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_LENGTH_4_3_IDX,
+ u32tmp);
+
+ u64tmp = (u64)fpdu_info->hdr_pbase;
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX,
+ lower_32_bits(u64tmp));
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_HIGH_IDX,
+ upper_32_bits(u64tmp));
+
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX,
+ lower_32_bits(fpdu_info->frags[0].physaddr));
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_HIGH_IDX,
+ upper_32_bits(fpdu_info->frags[0].physaddr));
+
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG2_LOW_IDX,
+ lower_32_bits(fpdu_info->frags[1].physaddr));
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG2_HIGH_IDX,
+ upper_32_bits(fpdu_info->frags[1].physaddr));
+
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG3_LOW_IDX,
+ lower_32_bits(fpdu_info->frags[2].physaddr));
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG3_HIGH_IDX,
+ upper_32_bits(fpdu_info->frags[2].physaddr));
+
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG4_LOW_IDX,
+ lower_32_bits(fpdu_info->frags[3].physaddr));
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_NIC_SQ_WQE_FRAG4_HIGH_IDX,
+ upper_32_bits(fpdu_info->frags[3].physaddr));
+
+ cqp_request->cqp_callback_pointer = fpdu_info;
+ cqp_request->callback = 1;
+ cqp_request->cqp_callback = nes_download_callback;
+
+ atomic_set(&cqp_request->refcount, 1);
+ nes_post_cqp_request(nesdev, cqp_request);
+ spin_unlock_irqrestore(&nesqp->pau_lock, flags);
+ }
+
+ return 0;
+}
+
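+/**
+ * process_fpdus - keep forwarding fpdus until no more work is pending for this qp
+ */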
+static void process_fpdus(struct nes_vnic *nesvnic, struct nes_qp *nesqp)
+{
+ int again = 1;
+ unsigned long flags;
+
+ do {
+ /* Ignore rc - if it failed, tcp retries will cause it to try again */
+ forward_fpdus(nesvnic, nesqp);
+
+ spin_lock_irqsave(&nesqp->pau_lock, flags);
+ if (nesqp->pau_pending) {
+ nesqp->pau_pending = 0;
+ } else {
+ nesqp->pau_busy = 0;
+ again = 0;
+ }
+
+ spin_unlock_irqrestore(&nesqp->pau_lock, flags);
+ } while (again);
+}
+
+/**
+ * queue_fpdus - Handle fpdus that the hw passed up to the sw
+ */
+static void queue_fpdus(struct sk_buff *skb, struct nes_vnic *nesvnic, struct nes_qp *nesqp)
+{
+ struct sk_buff *tmpskb;
+ struct nes_rskb_cb *cb;
+ struct iphdr *iph;
+ struct tcphdr *tcph;
+ unsigned char *tcph_end;
+ u32 rcv_nxt;
+ u32 rcv_wnd;
+ u32 seqnum;
+ u32 len;
+ bool process_it = false;
+ unsigned long flags;
+
+ /* Move data ptr to after tcp header */
+ iph = (struct iphdr *)skb->data;
+ tcph = (struct tcphdr *)(((char *)iph) + (4 * iph->ihl));
+ seqnum = be32_to_cpu(tcph->seq);
+ tcph_end = (((char *)tcph) + (4 * tcph->doff));
+
+ len = be16_to_cpu(iph->tot_len);
+ if (skb->len > len)
+ skb_trim(skb, len);
+ skb_pull(skb, tcph_end - skb->data);
+
+ /* Initialize tracking values */
+ cb = (struct nes_rskb_cb *)&skb->cb[0];
+ cb->seqnum = seqnum;
+
+ /* Make sure data is in the receive window */
+ rcv_nxt = nesqp->pau_rcv_nxt;
+ rcv_wnd = le32_to_cpu(nesqp->nesqp_context->rcv_wnd);
+ if (!between(seqnum, rcv_nxt, (rcv_nxt + rcv_wnd))) {
+ nes_mgt_free_skb(nesvnic->nesdev, skb, PCI_DMA_TODEVICE);
+ nes_rem_ref_cm_node(nesqp->cm_node);
+ return;
+ }
+
+ spin_lock_irqsave(&nesqp->pau_lock, flags);
+
+ if (nesqp->pau_busy)
+ nesqp->pau_pending = 1;
+ else
+ nesqp->pau_busy = 1;
+
+ /* Queue skb by sequence number */
+ if (skb_queue_len(&nesqp->pau_list) == 0) {
+ skb_queue_head(&nesqp->pau_list, skb);
+ } else {
+ tmpskb = nesqp->pau_list.next;
+ while (tmpskb != (struct sk_buff *)&nesqp->pau_list) {
+ cb = (struct nes_rskb_cb *)&tmpskb->cb[0];
+ if (before(seqnum, cb->seqnum))
+ break;
+ tmpskb = tmpskb->next;
+ }
+ skb_insert(tmpskb, skb, &nesqp->pau_list);
+ }
+ if (nesqp->pau_state == PAU_READY)
+ process_it = true;
+ spin_unlock_irqrestore(&nesqp->pau_lock, flags);
+
+ if (process_it)
+ process_fpdus(nesvnic, nesqp);
+
+ return;
+}
+
+/**
+ * mgt_thread - Handle mgt skbs in a safe context
+ */
+static int mgt_thread(void *context)
+{
+ struct nes_vnic *nesvnic = context;
+ struct sk_buff *skb;
+ struct nes_rskb_cb *cb;
+
+ while (!kthread_should_stop()) {
+ wait_event_interruptible(nesvnic->mgt_wait_queue,
+ skb_queue_len(&nesvnic->mgt_skb_list) || kthread_should_stop());
+ while ((skb_queue_len(&nesvnic->mgt_skb_list)) && !kthread_should_stop()) {
+ skb = skb_dequeue(&nesvnic->mgt_skb_list);
+ cb = (struct nes_rskb_cb *)&skb->cb[0];
+ cb->data_start = skb->data - ETH_HLEN;
+ cb->busaddr = pci_map_single(nesvnic->nesdev->pcidev, cb->data_start,
+ nesvnic->max_frame_size, PCI_DMA_TODEVICE);
+ queue_fpdus(skb, nesvnic, cb->nesqp);
+ }
+ }
+
+ /* Closing down so delete any entries on the queue */
+ while (skb_queue_len(&nesvnic->mgt_skb_list)) {
+ skb = skb_dequeue(&nesvnic->mgt_skb_list);
+ cb = (struct nes_rskb_cb *)&skb->cb[0];
+ nes_rem_ref_cm_node(cb->nesqp->cm_node);
+ dev_kfree_skb_any(skb);
+ }
+ return 0;
+}
+
+/**
+ * nes_queue_mgt_skbs - Queue skb so it can be handled in a thread context
+ */
+void nes_queue_mgt_skbs(struct sk_buff *skb, struct nes_vnic *nesvnic, struct nes_qp *nesqp)
+{
+ struct nes_rskb_cb *cb;
+
+ cb = (struct nes_rskb_cb *)&skb->cb[0];
+ cb->nesqp = nesqp;
+ skb_queue_tail(&nesvnic->mgt_skb_list, skb);
+ wake_up_interruptible(&nesvnic->mgt_wait_queue);
+}
+
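+/**
+ * nes_destroy_pau_qp - free any skbs still queued when a pau qp is destroyed
+ */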
+void nes_destroy_pau_qp(struct nes_device *nesdev, struct nes_qp *nesqp)
+{
+ struct sk_buff *skb;
+ unsigned long flags;
+ atomic_inc(&pau_qps_destroyed);
+
+ /* Free packets that have not yet been forwarded */
+ /* Lock is acquired by skb_dequeue when removing the skb */
+ spin_lock_irqsave(&nesqp->pau_lock, flags);
+ while (skb_queue_len(&nesqp->pau_list)) {
+ skb = skb_dequeue(&nesqp->pau_list);
+ nes_mgt_free_skb(nesdev, skb, PCI_DMA_TODEVICE);
+ nes_rem_ref_cm_node(nesqp->cm_node);
+ }
+ spin_unlock_irqrestore(&nesqp->pau_lock, flags);
+}
+
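+/**
+ * nes_chg_qh_handler - CQP callback that steps the quad hash change through its states
+ */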
+static void nes_chg_qh_handler(struct nes_device *nesdev, struct nes_cqp_request *cqp_request)
+{
+ struct pau_qh_chg *qh_chg = cqp_request->cqp_callback_pointer;
+ struct nes_cqp_request *new_request;
+ struct nes_hw_cqp_wqe *cqp_wqe;
+ struct nes_adapter *nesadapter;
+ struct nes_qp *nesqp;
+ struct nes_v4_quad nes_quad;
+ u32 crc_value;
+ u64 u64temp;
+
+ nesadapter = nesdev->nesadapter;
+ nesqp = qh_chg->nesqp;
+
+ /* Should we handle the bad completion */
+ if (cqp_request->major_code)
+ WARN(1, PFX "Invalid cqp_request major_code=0x%x\n",
+ cqp_request->major_code);
+
+ switch (nesqp->pau_state) {
+ case PAU_DEL_QH:
+ /* Old hash code deleted, now set the new one */
+ nesqp->pau_state = PAU_ADD_LB_QH;
+ new_request = nes_get_cqp_request(nesdev);
+ if (new_request == NULL) {
+ nes_debug(NES_DBG_PAU, "Failed to get a new_request.\n");
+ WARN_ON(1);
+ return;
+ }
+
+ memset(&nes_quad, 0, sizeof(nes_quad));
+ nes_quad.DstIpAdrIndex =
+ cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) << 24);
+ nes_quad.SrcIpadr = cpu_to_be32(0x7f000001);
+ nes_quad.TcpPorts[0] = swab16(nesqp->nesqp_context->tcpPorts[1]);
+ nes_quad.TcpPorts[1] = swab16(nesqp->nesqp_context->tcpPorts[0]);
+
+ /* Produce hash key */
+ crc_value = get_crc_value(&nes_quad);
+ nesqp->hte_index = cpu_to_be32(crc_value ^ 0xffffffff);
+ nes_debug(NES_DBG_PAU, "new HTE Index = 0x%08X, CRC = 0x%08X\n",
+ nesqp->hte_index, nesqp->hte_index & nesadapter->hte_index_mask);
+
+ nesqp->hte_index &= nesadapter->hte_index_mask;
+ nesqp->nesqp_context->hte_index = cpu_to_le32(nesqp->hte_index);
+ nesqp->nesqp_context->ip0 = cpu_to_le32(0x7f000001);
+ nesqp->nesqp_context->rcv_nxt = cpu_to_le32(nesqp->pau_rcv_nxt);
+
+ cqp_wqe = &new_request->cqp_wqe;
+ nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+ set_wqe_32bit_value(cqp_wqe->wqe_words,
+ NES_CQP_WQE_OPCODE_IDX, NES_CQP_MANAGE_QUAD_HASH |
+ NES_CQP_QP_TYPE_IWARP | NES_CQP_QP_CONTEXT_VALID | NES_CQP_QP_IWARP_STATE_RTS);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id);
+ u64temp = (u64)nesqp->nesqp_context_pbase;
+ set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
+
+ nes_debug(NES_DBG_PAU, "Waiting for CQP completion for adding the quad hash.\n");
+
+ new_request->cqp_callback_pointer = qh_chg;
+ new_request->callback = 1;
+ new_request->cqp_callback = nes_chg_qh_handler;
+ atomic_set(&new_request->refcount, 1);
+ nes_post_cqp_request(nesdev, new_request);
+ break;
+
+ case PAU_ADD_LB_QH:
+ /* Start processing the queued fpdus */
+ nesqp->pau_state = PAU_READY;
+ process_fpdus(qh_chg->nesvnic, qh_chg->nesqp);
+ kfree(qh_chg);
+ break;
+ }
+}
+
+/**
+ * nes_change_quad_hash - delete the qp's quad hash entry and re-add it as a loopback entry
+ */
+static int nes_change_quad_hash(struct nes_device *nesdev,
+ struct nes_vnic *nesvnic, struct nes_qp *nesqp)
+{
+ struct nes_cqp_request *cqp_request = NULL;
+ struct pau_qh_chg *qh_chg = NULL;
+ u64 u64temp;
+ struct nes_hw_cqp_wqe *cqp_wqe;
+ int ret = 0;
+
+ cqp_request = nes_get_cqp_request(nesdev);
+ if (cqp_request == NULL) {
+ nes_debug(NES_DBG_PAU, "Failed to get a cqp_request.\n");
+ ret = -ENOMEM;
+ goto chg_qh_err;
+ }
+
+ qh_chg = kmalloc(sizeof *qh_chg, GFP_ATOMIC);
+ if (qh_chg == NULL) {
+ nes_debug(NES_DBG_PAU, "Failed to get a cqp_request.\n");
+ ret = -ENOMEM;
+ goto chg_qh_err;
+ }
+ qh_chg->nesdev = nesdev;
+ qh_chg->nesvnic = nesvnic;
+ qh_chg->nesqp = nesqp;
+ nesqp->pau_state = PAU_DEL_QH;
+
+ cqp_wqe = &cqp_request->cqp_wqe;
+ nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+ set_wqe_32bit_value(cqp_wqe->wqe_words,
+ NES_CQP_WQE_OPCODE_IDX, NES_CQP_MANAGE_QUAD_HASH | NES_CQP_QP_DEL_HTE |
+ NES_CQP_QP_TYPE_IWARP | NES_CQP_QP_CONTEXT_VALID | NES_CQP_QP_IWARP_STATE_RTS);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id);
+ u64temp = (u64)nesqp->nesqp_context_pbase;
+ set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
+
+ nes_debug(NES_DBG_PAU, "Waiting for CQP completion for deleting the quad hash.\n");
+
+ cqp_request->cqp_callback_pointer = qh_chg;
+ cqp_request->callback = 1;
+ cqp_request->cqp_callback = nes_chg_qh_handler;
+ atomic_set(&cqp_request->refcount, 1);
+ nes_post_cqp_request(nesdev, cqp_request);
+
+ return ret;
+
+chg_qh_err:
+ kfree(qh_chg);
+ if (cqp_request)
+ nes_put_cqp_request(nesdev, cqp_request);
+ return ret;
+}
+
+/**
+ * nes_mgt_ce_handler
+ * This management code deals with any packed and unaligned (pau) fpdus
+ * that the hardware cannot handle.
+ */
+static void nes_mgt_ce_handler(struct nes_device *nesdev, struct nes_hw_nic_cq *cq)
+{
+ struct nes_vnic_mgt *mgtvnic = container_of(cq, struct nes_vnic_mgt, mgt_cq);
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ u32 head;
+ u32 cq_size;
+ u32 cqe_count = 0;
+ u32 cqe_misc;
+ u32 qp_id = 0;
+ u32 skbs_needed;
+ unsigned long context;
+ struct nes_qp *nesqp;
+ struct sk_buff *rx_skb;
+ struct nes_rskb_cb *cb;
+
+ head = cq->cq_head;
+ cq_size = cq->cq_size;
+
+ while (1) {
+ cqe_misc = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX]);
+ if (!(cqe_misc & NES_NIC_CQE_VALID))
+ break;
+
+ nesqp = NULL;
+ if (cqe_misc & NES_NIC_CQE_ACCQP_VALID) {
+ qp_id = le32_to_cpu(cq->cq_vbase[head].cqe_words[NES_NIC_CQE_ACCQP_ID_IDX]);
+ qp_id &= 0x001fffff;
+ if (qp_id < nesadapter->max_qp) {
+ context = (unsigned long)nesadapter->qp_table[qp_id - NES_FIRST_QPN];
+ nesqp = (struct nes_qp *)context;
+ }
+ }
+
+ if (nesqp) {
+ if (nesqp->pau_mode == false) {
+ nesqp->pau_mode = true; /* First time for this qp */
+ nesqp->pau_rcv_nxt = le32_to_cpu(
+ cq->cq_vbase[head].cqe_words[NES_NIC_CQE_HASH_RCVNXT]);
+ skb_queue_head_init(&nesqp->pau_list);
+ spin_lock_init(&nesqp->pau_lock);
+ atomic_inc(&pau_qps_created);
+ nes_change_quad_hash(nesdev, mgtvnic->nesvnic, nesqp);
+ }
+
+ rx_skb = mgtvnic->mgt.rx_skb[mgtvnic->mgt.rq_tail];
+ rx_skb->len = 0;
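+ /* the low 16 bits of the cqe misc word carry the received frame length */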
+ skb_put(rx_skb, cqe_misc & 0x0000ffff);
+ rx_skb->protocol = eth_type_trans(rx_skb, mgtvnic->nesvnic->netdev);
+ cb = (struct nes_rskb_cb *)&rx_skb->cb[0];
+ pci_unmap_single(nesdev->pcidev, cb->busaddr, cb->maplen, PCI_DMA_FROMDEVICE);
+ cb->busaddr = 0;
+ mgtvnic->mgt.rq_tail++;
+ mgtvnic->mgt.rq_tail &= mgtvnic->mgt.rq_size - 1;
+
+ nes_add_ref_cm_node(nesqp->cm_node);
+ nes_queue_mgt_skbs(rx_skb, mgtvnic->nesvnic, nesqp);
+ } else {
+ printk(KERN_ERR PFX "Invalid QP %d for packed/unaligned handling\n", qp_id);
+ }
+
+ cq->cq_vbase[head].cqe_words[NES_NIC_CQE_MISC_IDX] = 0;
+ cqe_count++;
+ if (++head >= cq_size)
+ head = 0;
+
+ if (cqe_count == 255) {
+ /* Replenish mgt CQ */
+ nes_write32(nesdev->regs + NES_CQE_ALLOC, cq->cq_number | (cqe_count << 16));
+ nesdev->currcq_count += cqe_count;
+ cqe_count = 0;
+ }
+
+ skbs_needed = atomic_inc_return(&mgtvnic->rx_skbs_needed);
+ if (skbs_needed > (mgtvnic->mgt.rq_size >> 1))
+ nes_replenish_mgt_rq(mgtvnic);
+ }
+
+ cq->cq_head = head;
+ nes_write32(nesdev->regs + NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
+ cq->cq_number | (cqe_count << 16));
+ nes_read32(nesdev->regs + NES_CQE_ALLOC);
+ nesdev->currcq_count += cqe_count;
+}
+
+/**
+ * nes_init_mgt_qp - allocate and bring up the mgt QPs used for pau fpdu handling
+ */
+int nes_init_mgt_qp(struct nes_device *nesdev, struct net_device *netdev, struct nes_vnic *nesvnic)
+{
+ struct nes_vnic_mgt *mgtvnic;
+ u32 counter;
+ void *vmem;
+ dma_addr_t pmem;
+ struct nes_hw_cqp_wqe *cqp_wqe;
+ u32 cqp_head;
+ unsigned long flags;
+ struct nes_hw_nic_qp_context *mgt_context;
+ u64 u64temp;
+ struct nes_hw_nic_rq_wqe *mgt_rqe;
+ struct sk_buff *skb;
+ u32 wqe_count;
+ struct nes_rskb_cb *cb;
+ u32 mgt_mem_size;
+ void *mgt_vbase;
+ dma_addr_t mgt_pbase;
+ int i;
+ int ret;
+
+ /* Allocate space for all of the mgt QPs at once */
+ mgtvnic = kzalloc(NES_MGT_QP_COUNT * sizeof(struct nes_vnic_mgt), GFP_KERNEL);
+ if (mgtvnic == NULL) {
+ nes_debug(NES_DBG_INIT, "Unable to allocate memory for mgt structure\n");
+ return -ENOMEM;
+ }
+
+ /* Allocate fragment, RQ, and CQ; Reuse CEQ based on the PCI function */
+ /* We are not sending from this NIC so sq is not allocated */
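+ /* The extra 256 bytes allow the rings to be aligned to a 256-byte boundary below */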
+ mgt_mem_size = 256 +
+ (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_rq_wqe)) +
+ (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_cqe)) +
+ sizeof(struct nes_hw_nic_qp_context);
+ mgt_mem_size = (mgt_mem_size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
+ mgt_vbase = pci_alloc_consistent(nesdev->pcidev, NES_MGT_QP_COUNT * mgt_mem_size, &mgt_pbase);
+ if (!mgt_vbase) {
+ kfree(mgtvnic);
+ nes_debug(NES_DBG_INIT, "Unable to allocate memory for mgt host descriptor rings\n");
+ return -ENOMEM;
+ }
+
+ nesvnic->mgt_mem_size = NES_MGT_QP_COUNT * mgt_mem_size;
+ nesvnic->mgt_vbase = mgt_vbase;
+ nesvnic->mgt_pbase = mgt_pbase;
+
+ skb_queue_head_init(&nesvnic->mgt_skb_list);
+ init_waitqueue_head(&nesvnic->mgt_wait_queue);
+ nesvnic->mgt_thread = kthread_run(mgt_thread, nesvnic, "nes_mgt_thread");
+
+ for (i = 0; i < NES_MGT_QP_COUNT; i++) {
+ mgtvnic->nesvnic = nesvnic;
+ mgtvnic->mgt.qp_id = nesdev->mac_index + NES_MGT_QP_OFFSET + i;
+ memset(mgt_vbase, 0, mgt_mem_size);
+ nes_debug(NES_DBG_INIT, "Allocated mgt QP structures at %p (phys = %016lX), size = %u.\n",
+ mgt_vbase, (unsigned long)mgt_pbase, mgt_mem_size);
+
+ vmem = (void *)(((unsigned long)mgt_vbase + (256 - 1)) &
+ ~(unsigned long)(256 - 1));
+ pmem = (dma_addr_t)(((unsigned long long)mgt_pbase + (256 - 1)) &
+ ~(unsigned long long)(256 - 1));
+
+ spin_lock_init(&mgtvnic->mgt.rq_lock);
+
+ /* setup the RQ */
+ mgtvnic->mgt.rq_vbase = vmem;
+ mgtvnic->mgt.rq_pbase = pmem;
+ mgtvnic->mgt.rq_head = 0;
+ mgtvnic->mgt.rq_tail = 0;
+ mgtvnic->mgt.rq_size = NES_MGT_WQ_COUNT;
+
+ /* setup the CQ */
+ vmem += (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_rq_wqe));
+ pmem += (NES_MGT_WQ_COUNT * sizeof(struct nes_hw_nic_rq_wqe));
+
+ mgtvnic->mgt_cq.cq_number = mgtvnic->mgt.qp_id;
+ mgtvnic->mgt_cq.cq_vbase = vmem;
+ mgtvnic->mgt_cq.cq_pbase = pmem;
+ mgtvnic->mgt_cq.cq_head = 0;
+ mgtvnic->mgt_cq.cq_size = NES_MGT_WQ_COUNT;
+
+ mgtvnic->mgt_cq.ce_handler = nes_mgt_ce_handler;
+
+ /* Send CreateCQ request to CQP */
+ spin_lock_irqsave(&nesdev->cqp.lock, flags);
+ cqp_head = nesdev->cqp.sq_head;
+
+ cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+ nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+
+ cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(
+ NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID |
+ ((u32)mgtvnic->mgt_cq.cq_size << 16));
+ cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(
+ mgtvnic->mgt_cq.cq_number | ((u32)nesdev->ceq_index << 16));
+ u64temp = (u64)mgtvnic->mgt_cq.cq_pbase;
+ set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
+ cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0;
+ u64temp = (unsigned long)&mgtvnic->mgt_cq;
+ cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] = cpu_to_le32((u32)(u64temp >> 1));
+ cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] =
+ cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF);
+ cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0;
+
+ if (++cqp_head >= nesdev->cqp.sq_size)
+ cqp_head = 0;
+ cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+ nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+
+ /* Send CreateQP request to CQP */
+ mgt_context = (void *)(&mgtvnic->mgt_cq.cq_vbase[mgtvnic->mgt_cq.cq_size]);
+ mgt_context->context_words[NES_NIC_CTX_MISC_IDX] =
+ cpu_to_le32((u32)NES_MGT_CTX_SIZE |
+ ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 12));
+ nes_debug(NES_DBG_INIT, "RX_WINDOW_BUFFER_PAGE_TABLE_SIZE = 0x%08X, RX_WINDOW_BUFFER_SIZE = 0x%08X\n",
+ nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_PAGE_TABLE_SIZE),
+ nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE));
+ if (nes_read_indexed(nesdev, NES_IDX_RX_WINDOW_BUFFER_SIZE) != 0)
+ mgt_context->context_words[NES_NIC_CTX_MISC_IDX] |= cpu_to_le32(NES_NIC_BACK_STORE);
+
+ u64temp = (u64)mgtvnic->mgt.rq_pbase;
+ mgt_context->context_words[NES_NIC_CTX_SQ_LOW_IDX] = cpu_to_le32((u32)u64temp);
+ mgt_context->context_words[NES_NIC_CTX_SQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32));
+ u64temp = (u64)mgtvnic->mgt.rq_pbase;
+ mgt_context->context_words[NES_NIC_CTX_RQ_LOW_IDX] = cpu_to_le32((u32)u64temp);
+ mgt_context->context_words[NES_NIC_CTX_RQ_HIGH_IDX] = cpu_to_le32((u32)(u64temp >> 32));
+
+ cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] = cpu_to_le32(NES_CQP_CREATE_QP |
+ NES_CQP_QP_TYPE_NIC);
+ cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX] = cpu_to_le32(mgtvnic->mgt.qp_id);
+ u64temp = (u64)mgtvnic->mgt_cq.cq_pbase +
+ (mgtvnic->mgt_cq.cq_size * sizeof(struct nes_hw_nic_cqe));
+ set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
+
+ if (++cqp_head >= nesdev->cqp.sq_size)
+ cqp_head = 0;
+ nesdev->cqp.sq_head = cqp_head;
+
+ barrier();
+
+ /* Ring doorbell (2 WQEs) */
+ nes_write32(nesdev->regs + NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id);
+
+ spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+ nes_debug(NES_DBG_INIT, "Waiting for create MGT QP%u to complete.\n",
+ mgtvnic->mgt.qp_id);
+
+ ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head),
+ NES_EVENT_TIMEOUT);
+ nes_debug(NES_DBG_INIT, "Create MGT QP%u completed, wait_event_timeout ret = %u.\n",
+ mgtvnic->mgt.qp_id, ret);
+ if (!ret) {
+ nes_debug(NES_DBG_INIT, "MGT QP%u create timeout expired\n", mgtvnic->mgt.qp_id);
+ if (i == 0) {
+ pci_free_consistent(nesdev->pcidev, nesvnic->mgt_mem_size, nesvnic->mgt_vbase,
+ nesvnic->mgt_pbase);
+ kfree(mgtvnic);
+ } else {
+ nes_destroy_mgt(nesvnic);
+ }
+ return -EIO;
+ }
+
+ /* Populate the RQ */
+ for (counter = 0; counter < (NES_MGT_WQ_COUNT - 1); counter++) {
+ skb = dev_alloc_skb(nesvnic->max_frame_size);
+ if (!skb) {
+ nes_debug(NES_DBG_INIT, "%s: out of memory for receive skb\n", netdev->name);
+ return -ENOMEM;
+ }
+
+ skb->dev = netdev;
+
+ pmem = pci_map_single(nesdev->pcidev, skb->data,
+ nesvnic->max_frame_size, PCI_DMA_FROMDEVICE);
+ cb = (struct nes_rskb_cb *)&skb->cb[0];
+ cb->busaddr = pmem;
+ cb->maplen = nesvnic->max_frame_size;
+
+ mgt_rqe = &mgtvnic->mgt.rq_vbase[counter];
+ mgt_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_1_0_IDX] = cpu_to_le32((u32)nesvnic->max_frame_size);
+ mgt_rqe->wqe_words[NES_NIC_RQ_WQE_LENGTH_3_2_IDX] = 0;
+ mgt_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_LOW_IDX] = cpu_to_le32((u32)pmem);
+ mgt_rqe->wqe_words[NES_NIC_RQ_WQE_FRAG0_HIGH_IDX] = cpu_to_le32((u32)((u64)pmem >> 32));
+ mgtvnic->mgt.rx_skb[counter] = skb;
+ }
+
+ init_timer(&mgtvnic->rq_wqes_timer);
+ mgtvnic->rq_wqes_timer.function = nes_mgt_rq_wqes_timeout;
+ mgtvnic->rq_wqes_timer.data = (unsigned long)mgtvnic;
+
+ wqe_count = NES_MGT_WQ_COUNT - 1;
+ mgtvnic->mgt.rq_head = wqe_count;
+ barrier();
+ do {
+ counter = min(wqe_count, ((u32)255));
+ wqe_count -= counter;
+ nes_write32(nesdev->regs + NES_WQE_ALLOC, (counter << 24) | mgtvnic->mgt.qp_id);
+ } while (wqe_count);
+
+ nes_write32(nesdev->regs + NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
+ mgtvnic->mgt_cq.cq_number);
+ nes_read32(nesdev->regs + NES_CQE_ALLOC);
+
+ mgt_vbase += mgt_mem_size;
+ mgt_pbase += mgt_mem_size;
+ nesvnic->mgtvnic[i] = mgtvnic++;
+ }
+ return 0;
+}
+
+
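+/**
+ * nes_destroy_mgt - tear down the mgt QPs/CQs and free their host memory
+ */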
+void nes_destroy_mgt(struct nes_vnic *nesvnic)
+{
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_vnic_mgt *mgtvnic;
+ struct nes_vnic_mgt *first_mgtvnic;
+ unsigned long flags;
+ struct nes_hw_cqp_wqe *cqp_wqe;
+ u32 cqp_head;
+ struct sk_buff *rx_skb;
+ int i;
+ int ret;
+
+ kthread_stop(nesvnic->mgt_thread);
+
+ /* Free remaining NIC receive buffers */
+ first_mgtvnic = nesvnic->mgtvnic[0];
+ for (i = 0; i < NES_MGT_QP_COUNT; i++) {
+ mgtvnic = nesvnic->mgtvnic[i];
+ if (mgtvnic == NULL)
+ continue;
+
+ while (mgtvnic->mgt.rq_head != mgtvnic->mgt.rq_tail) {
+ rx_skb = mgtvnic->mgt.rx_skb[mgtvnic->mgt.rq_tail];
+ nes_mgt_free_skb(nesdev, rx_skb, PCI_DMA_FROMDEVICE);
+ mgtvnic->mgt.rq_tail++;
+ mgtvnic->mgt.rq_tail &= (mgtvnic->mgt.rq_size - 1);
+ }
+
+ spin_lock_irqsave(&nesdev->cqp.lock, flags);
+
+ /* Destroy NIC QP */
+ cqp_head = nesdev->cqp.sq_head;
+ cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+ nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
+ (NES_CQP_DESTROY_QP | NES_CQP_QP_TYPE_NIC));
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
+ mgtvnic->mgt.qp_id);
+
+ if (++cqp_head >= nesdev->cqp.sq_size)
+ cqp_head = 0;
+
+ cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+
+ /* Destroy NIC CQ */
+ nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
+ (NES_CQP_DESTROY_CQ | ((u32)mgtvnic->mgt_cq.cq_size << 16)));
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
+ (mgtvnic->mgt_cq.cq_number | ((u32)nesdev->ceq_index << 16)));
+
+ if (++cqp_head >= nesdev->cqp.sq_size)
+ cqp_head = 0;
+
+ nesdev->cqp.sq_head = cqp_head;
+ barrier();
+
+ /* Ring doorbell (2 WQEs) */
+ nes_write32(nesdev->regs + NES_WQE_ALLOC, 0x02800000 | nesdev->cqp.qp_id);
+
+ spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+ nes_debug(NES_DBG_SHUTDOWN, "Waiting for CQP, cqp_head=%u, cqp.sq_head=%u,"
+ " cqp.sq_tail=%u, cqp.sq_size=%u\n",
+ cqp_head, nesdev->cqp.sq_head,
+ nesdev->cqp.sq_tail, nesdev->cqp.sq_size);
+
+ ret = wait_event_timeout(nesdev->cqp.waitq, (nesdev->cqp.sq_tail == cqp_head),
+ NES_EVENT_TIMEOUT);
+
+ nes_debug(NES_DBG_SHUTDOWN, "Destroy MGT QP returned, wait_event_timeout ret = %u, cqp_head=%u,"
+ " cqp.sq_head=%u, cqp.sq_tail=%u\n",
+ ret, cqp_head, nesdev->cqp.sq_head, nesdev->cqp.sq_tail);
+ if (!ret)
+ nes_debug(NES_DBG_SHUTDOWN, "MGT QP%u destroy timeout expired\n",
+ mgtvnic->mgt.qp_id);
+
+ nesvnic->mgtvnic[i] = NULL;
+ }
+
+ if (nesvnic->mgt_vbase) {
+ pci_free_consistent(nesdev->pcidev, nesvnic->mgt_mem_size, nesvnic->mgt_vbase,
+ nesvnic->mgt_pbase);
+ nesvnic->mgt_vbase = NULL;
+ nesvnic->mgt_pbase = 0;
+ }
+
+ kfree(first_mgtvnic);
+}
diff --git a/drivers/infiniband/hw/nes/nes_mgt.h b/drivers/infiniband/hw/nes/nes_mgt.h
new file mode 100644
index 000000000..4f7f701c4
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_mgt.h
@@ -0,0 +1,97 @@
+/*
+* Copyright (c) 2006 - 2011 Intel-NE, Inc. All rights reserved.
+*
+* This software is available to you under a choice of one of two
+* licenses. You may choose to be licensed under the terms of the GNU
+* General Public License (GPL) Version 2, available from the file
+* COPYING in the main directory of this source tree, or the
+* OpenIB.org BSD license below:
+*
+* Redistribution and use in source and binary forms, with or
+* without modification, are permitted provided that the following
+* conditions are met:
+*
+* - Redistributions of source code must retain the above
+* copyright notice, this list of conditions and the following
+* disclaimer.
+*
+* - Redistributions in binary form must reproduce the above
+* copyright notice, this list of conditions and the following
+* disclaimer in the documentation and/or other materials
+* provided with the distribution.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*/
+
+#ifndef __NES_MGT_H
+#define __NES_MGT_H
+
+#define MPA_FRAMING 6 /* length is 2 bytes, crc is 4 bytes */
+
+int nes_init_mgt_qp(struct nes_device *nesdev, struct net_device *netdev, struct nes_vnic *nesvnic);
+void nes_queue_mgt_skbs(struct sk_buff *skb, struct nes_vnic *nesvnic, struct nes_qp *nesqp);
+void nes_destroy_mgt(struct nes_vnic *nesvnic);
+void nes_destroy_pau_qp(struct nes_device *nesdev, struct nes_qp *nesqp);
+
+struct nes_hw_mgt {
+ struct nes_hw_nic_rq_wqe *rq_vbase; /* virtual address of rq */
+ dma_addr_t rq_pbase; /* PCI memory for host rings */
+ struct sk_buff *rx_skb[NES_NIC_WQ_SIZE];
+ u16 qp_id;
+ u16 sq_head;
+ u16 rq_head;
+ u16 rq_tail;
+ u16 rq_size;
+ u8 replenishing_rq;
+ u8 reserved;
+ spinlock_t rq_lock;
+};
+
+struct nes_vnic_mgt {
+ struct nes_vnic *nesvnic;
+ struct nes_hw_mgt mgt;
+ struct nes_hw_nic_cq mgt_cq;
+ atomic_t rx_skbs_needed;
+ struct timer_list rq_wqes_timer;
+ atomic_t rx_skb_timer_running;
+};
+
+#define MAX_FPDU_FRAGS 4
+struct pau_fpdu_frag {
+ struct sk_buff *skb;
+ u64 physaddr;
+ u32 frag_len;
+ bool cmplt;
+};
+
+struct pau_fpdu_info {
+ struct nes_qp *nesqp;
+ struct nes_cqp_request *cqp_request;
+ void *hdr_vbase;
+ dma_addr_t hdr_pbase;
+ int hdr_len;
+ u16 data_len;
+ u16 frag_cnt;
+ struct pau_fpdu_frag frags[MAX_FPDU_FRAGS];
+};
+
+enum pau_qh_state {
+ PAU_DEL_QH,
+ PAU_ADD_LB_QH,
+ PAU_READY
+};
+
+struct pau_qh_chg {
+ struct nes_device *nesdev;
+ struct nes_vnic *nesvnic;
+ struct nes_qp *nesqp;
+};
+
+#endif /* __NES_MGT_H */
diff --git a/drivers/infiniband/hw/nes/nes_nic.c b/drivers/infiniband/hw/nes/nes_nic.c
new file mode 100644
index 000000000..70acda91e
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_nic.c
@@ -0,0 +1,1883 @@
+/*
+ * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/if_arp.h>
+#include <linux/if_vlan.h>
+#include <linux/ethtool.h>
+#include <linux/slab.h>
+#include <net/tcp.h>
+
+#include <net/inet_common.h>
+#include <linux/inet.h>
+
+#include "nes.h"
+
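+/* Per-PCI-function NIC QP maps; nes_netdev_init() consumes the qpid, nic_index and logical_port of each entry */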
+static struct nic_qp_map nic_qp_mapping_0[] = {
+ {16,0,0,1},{24,4,0,0},{28,8,0,0},{32,12,0,0},
+ {20,2,2,1},{26,6,2,0},{30,10,2,0},{34,14,2,0},
+ {18,1,1,1},{25,5,1,0},{29,9,1,0},{33,13,1,0},
+ {22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0}
+};
+
+static struct nic_qp_map nic_qp_mapping_1[] = {
+ {18,1,1,1},{25,5,1,0},{29,9,1,0},{33,13,1,0},
+ {22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0}
+};
+
+static struct nic_qp_map nic_qp_mapping_2[] = {
+ {20,2,2,1},{26,6,2,0},{30,10,2,0},{34,14,2,0}
+};
+
+static struct nic_qp_map nic_qp_mapping_3[] = {
+ {22,3,3,1},{27,7,3,0},{31,11,3,0},{35,15,3,0}
+};
+
+static struct nic_qp_map nic_qp_mapping_4[] = {
+ {28,8,0,0},{32,12,0,0}
+};
+
+static struct nic_qp_map nic_qp_mapping_5[] = {
+ {29,9,1,0},{33,13,1,0}
+};
+
+static struct nic_qp_map nic_qp_mapping_6[] = {
+ {30,10,2,0},{34,14,2,0}
+};
+
+static struct nic_qp_map nic_qp_mapping_7[] = {
+ {31,11,3,0},{35,15,3,0}
+};
+
+static struct nic_qp_map *nic_qp_mapping_per_function[] = {
+ nic_qp_mapping_0, nic_qp_mapping_1, nic_qp_mapping_2, nic_qp_mapping_3,
+ nic_qp_mapping_4, nic_qp_mapping_5, nic_qp_mapping_6, nic_qp_mapping_7
+};
+
+static const u32 default_msg = NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_LINK
+ | NETIF_MSG_IFUP | NETIF_MSG_IFDOWN;
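+/* A debug value of -1 makes netif_msg_init() fall back to default_msg */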
+static int debug = -1;
+static int nics_per_function = 1;
+
+/**
+ * nes_netdev_poll
+ */
+static int nes_netdev_poll(struct napi_struct *napi, int budget)
+{
+ struct nes_vnic *nesvnic = container_of(napi, struct nes_vnic, napi);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_hw_nic_cq *nescq = &nesvnic->nic_cq;
+
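+ /* Reset per-poll CQ accounting before running the completion handler */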
+ nesvnic->budget = budget;
+ nescq->cqes_pending = 0;
+ nescq->rx_cqes_completed = 0;
+ nescq->cqe_allocs_pending = 0;
+ nescq->rx_pkts_indicated = 0;
+
+ nes_nic_ce_handler(nesdev, nescq);
+
+ if (nescq->cqes_pending == 0) {
+ napi_complete(napi);
+ /* clear out completed cqes and arm */
+ nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
+ nescq->cq_number | (nescq->cqe_allocs_pending << 16));
+ nes_read32(nesdev->regs+NES_CQE_ALLOC);
+ } else {
+ /* clear out completed cqes but don't arm */
+ nes_write32(nesdev->regs+NES_CQE_ALLOC,
+ nescq->cq_number | (nescq->cqe_allocs_pending << 16));
+ nes_debug(NES_DBG_NETDEV, "%s: exiting with work pending\n",
+ nesvnic->netdev->name);
+ }
+ return nescq->rx_pkts_indicated;
+}
+
+
+/**
+ * nes_netdev_open - Activate the network interface; ifconfig
+ * ethx up.
+ */
+static int nes_netdev_open(struct net_device *netdev)
+{
+ u32 macaddr_low;
+ u16 macaddr_high;
+ struct nes_vnic *nesvnic = netdev_priv(netdev);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ int ret;
+ int i;
+ struct nes_vnic *first_nesvnic = NULL;
+ u32 nic_active_bit;
+ u32 nic_active;
+ struct list_head *list_pos, *list_temp;
+ unsigned long flags;
+
+ assert(nesdev != NULL);
+
+ if (nesvnic->netdev_open == 1)
+ return 0;
+
+ if (netif_msg_ifup(nesvnic))
+ printk(KERN_INFO PFX "%s: enabling interface\n", netdev->name);
+
+ ret = nes_init_nic_qp(nesdev, netdev);
+ if (ret) {
+ return ret;
+ }
+
+ netif_carrier_off(netdev);
+ netif_stop_queue(netdev);
+
+ if ((!nesvnic->of_device_registered) && (nesvnic->rdma_enabled)) {
+ nesvnic->nesibdev = nes_init_ofa_device(netdev);
+ if (nesvnic->nesibdev == NULL) {
+ printk(KERN_ERR PFX "%s: nesvnic->nesibdev alloc failed\n", netdev->name);
+ } else {
+ nesvnic->nesibdev->nesvnic = nesvnic;
+ ret = nes_register_ofa_device(nesvnic->nesibdev);
+ if (ret) {
+ printk(KERN_ERR PFX "%s: Unable to register RDMA device, ret = %d\n",
+ netdev->name, ret);
+ }
+ }
+ }
+ /* Set packet filters */
+ nic_active_bit = 1 << nesvnic->nic_index;
+ nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_ACTIVE);
+ nic_active |= nic_active_bit;
+ nes_write_indexed(nesdev, NES_IDX_NIC_ACTIVE, nic_active);
+ nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE);
+ nic_active |= nic_active_bit;
+ nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE, nic_active);
+ nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON);
+ nic_active |= nic_active_bit;
+ nes_write_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON, nic_active);
+
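+ /* Split the station address into the 16-bit high / 32-bit low halves written to the perfect filter registers */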
+ macaddr_high = ((u16)netdev->dev_addr[0]) << 8;
+ macaddr_high += (u16)netdev->dev_addr[1];
+
+ macaddr_low = ((u32)netdev->dev_addr[2]) << 24;
+ macaddr_low += ((u32)netdev->dev_addr[3]) << 16;
+ macaddr_low += ((u32)netdev->dev_addr[4]) << 8;
+ macaddr_low += (u32)netdev->dev_addr[5];
+
+ /* Program the various MAC regs */
+ for (i = 0; i < NES_MAX_PORT_COUNT; i++) {
+ if (nesvnic->qp_nic_index[i] == 0xf) {
+ break;
+ }
+ nes_debug(NES_DBG_NETDEV, "i=%d, perfect filter table index= %d, PERF FILTER LOW"
+ " (Addr:%08X) = %08X, HIGH = %08X.\n",
+ i, nesvnic->qp_nic_index[i],
+ NES_IDX_PERFECT_FILTER_LOW+
+ (nesvnic->qp_nic_index[i] * 8),
+ macaddr_low,
+ (u32)macaddr_high | NES_MAC_ADDR_VALID |
+ ((((u32)nesvnic->nic_index) << 16)));
+ nes_write_indexed(nesdev,
+ NES_IDX_PERFECT_FILTER_LOW + (nesvnic->qp_nic_index[i] * 8),
+ macaddr_low);
+ nes_write_indexed(nesdev,
+ NES_IDX_PERFECT_FILTER_HIGH + (nesvnic->qp_nic_index[i] * 8),
+ (u32)macaddr_high | NES_MAC_ADDR_VALID |
+ ((((u32)nesvnic->nic_index) << 16)));
+ }
+
+
+ nes_write32(nesdev->regs+NES_CQE_ALLOC, NES_CQE_ALLOC_NOTIFY_NEXT |
+ nesvnic->nic_cq.cq_number);
+ nes_read32(nesdev->regs+NES_CQE_ALLOC);
+ list_for_each_safe(list_pos, list_temp, &nesdev->nesadapter->nesvnic_list[nesdev->mac_index]) {
+ first_nesvnic = container_of(list_pos, struct nes_vnic, list);
+ if (first_nesvnic->netdev_open == 1)
+ break;
+ }
+ if (first_nesvnic->netdev_open == 0) {
+ nes_debug(NES_DBG_INIT, "Setting up MAC interrupt mask.\n");
+ nes_write_indexed(nesdev, NES_IDX_MAC_INT_MASK + (0x200 * nesdev->mac_index),
+ ~(NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT |
+ NES_MAC_INT_TX_UNDERFLOW | NES_MAC_INT_TX_ERROR));
+ first_nesvnic = nesvnic;
+ }
+
+ if (first_nesvnic->linkup) {
+ /* Enable network packets */
+ nesvnic->linkup = 1;
+ netif_start_queue(netdev);
+ netif_carrier_on(netdev);
+ }
+
+ spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags);
+ if (nesdev->nesadapter->phy_type[nesdev->mac_index] == NES_PHY_TYPE_SFP_D) {
+ nesdev->link_recheck = 1;
+ mod_delayed_work(system_wq, &nesdev->work,
+ NES_LINK_RECHECK_DELAY);
+ }
+ spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags);
+
+ spin_lock_irqsave(&nesvnic->port_ibevent_lock, flags);
+ if (nesvnic->of_device_registered) {
+ nesdev->nesadapter->send_term_ok = 1;
+ if (nesvnic->linkup == 1) {
+ if (nesdev->iw_status == 0) {
+ nesdev->iw_status = 1;
+ nes_port_ibevent(nesvnic);
+ }
+ } else {
+ nesdev->iw_status = 0;
+ }
+ }
+ spin_unlock_irqrestore(&nesvnic->port_ibevent_lock, flags);
+
+ napi_enable(&nesvnic->napi);
+ nesvnic->netdev_open = 1;
+
+ return 0;
+}
+
+
+/**
+ * nes_netdev_stop
+ */
+static int nes_netdev_stop(struct net_device *netdev)
+{
+ struct nes_vnic *nesvnic = netdev_priv(netdev);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ u32 nic_active_mask;
+ u32 nic_active;
+ struct nes_vnic *first_nesvnic = NULL;
+ struct list_head *list_pos, *list_temp;
+ unsigned long flags;
+
+ nes_debug(NES_DBG_SHUTDOWN, "nesvnic=%p, nesdev=%p, netdev=%p %s\n",
+ nesvnic, nesdev, netdev, netdev->name);
+ if (nesvnic->netdev_open == 0)
+ return 0;
+
+ if (netif_msg_ifdown(nesvnic))
+ printk(KERN_INFO PFX "%s: disabling interface\n", netdev->name);
+ netif_carrier_off(netdev);
+
+ /* Disable network packets */
+ napi_disable(&nesvnic->napi);
+ netif_stop_queue(netdev);
+ list_for_each_safe(list_pos, list_temp, &nesdev->nesadapter->nesvnic_list[nesdev->mac_index]) {
+ first_nesvnic = container_of(list_pos, struct nes_vnic, list);
+ if ((first_nesvnic->netdev_open == 1) && (first_nesvnic != nesvnic))
+ break;
+ }
+
+ if ((first_nesvnic->netdev_open == 1) && (first_nesvnic != nesvnic) &&
+ (PCI_FUNC(first_nesvnic->nesdev->pcidev->devfn) !=
+ PCI_FUNC(nesvnic->nesdev->pcidev->devfn))) {
+ nes_write_indexed(nesdev, NES_IDX_MAC_INT_MASK+
+ (0x200*nesdev->mac_index), 0xffffffff);
+ nes_write_indexed(first_nesvnic->nesdev,
+ NES_IDX_MAC_INT_MASK+
+ (0x200*first_nesvnic->nesdev->mac_index),
+ ~(NES_MAC_INT_LINK_STAT_CHG | NES_MAC_INT_XGMII_EXT |
+ NES_MAC_INT_TX_UNDERFLOW | NES_MAC_INT_TX_ERROR));
+ } else {
+ nes_write_indexed(nesdev, NES_IDX_MAC_INT_MASK+(0x200*nesdev->mac_index), 0xffffffff);
+ }
+
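+ /* Clear this vnic's enable bit in each of the packet filter registers */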
+ nic_active_mask = ~((u32)(1 << nesvnic->nic_index));
+ nes_write_indexed(nesdev, NES_IDX_PERFECT_FILTER_HIGH+
+ (nesvnic->perfect_filter_index*8), 0);
+ nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_ACTIVE);
+ nic_active &= nic_active_mask;
+ nes_write_indexed(nesdev, NES_IDX_NIC_ACTIVE, nic_active);
+ nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL);
+ nic_active &= nic_active_mask;
+ nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active);
+ nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE);
+ nic_active &= nic_active_mask;
+ nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ENABLE, nic_active);
+ nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL);
+ nic_active &= nic_active_mask;
+ nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active);
+ nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON);
+ nic_active &= nic_active_mask;
+ nes_write_indexed(nesdev, NES_IDX_NIC_BROADCAST_ON, nic_active);
+
+ spin_lock_irqsave(&nesvnic->port_ibevent_lock, flags);
+ if (nesvnic->of_device_registered) {
+ nesdev->nesadapter->send_term_ok = 0;
+ nesdev->iw_status = 0;
+ if (nesvnic->linkup == 1)
+ nes_port_ibevent(nesvnic);
+ }
+ del_timer_sync(&nesvnic->event_timer);
+ nesvnic->event_timer.function = NULL;
+ spin_unlock_irqrestore(&nesvnic->port_ibevent_lock, flags);
+
+ nes_destroy_nic_qp(nesvnic);
+
+ nesvnic->netdev_open = 0;
+
+ return 0;
+}
+
+
+/**
+ * nes_nic_send
+ */
+static int nes_nic_send(struct sk_buff *skb, struct net_device *netdev)
+{
+ struct nes_vnic *nesvnic = netdev_priv(netdev);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_hw_nic *nesnic = &nesvnic->nic;
+ struct nes_hw_nic_sq_wqe *nic_sqe;
+ struct tcphdr *tcph;
+ __le16 *wqe_fragment_length;
+ u32 wqe_misc;
+ u16 wqe_fragment_index = 1; /* first fragment (0) is used by copy buffer */
+ u16 skb_fragment_index;
+ dma_addr_t bus_address;
+
+ nic_sqe = &nesnic->sq_vbase[nesnic->sq_head];
+ wqe_fragment_length = (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX];
+
+ /* setup the VLAN tag if present */
+ if (skb_vlan_tag_present(skb)) {
+ nes_debug(NES_DBG_NIC_TX, "%s: VLAN packet to send... VLAN = %08X\n",
+ netdev->name, skb_vlan_tag_get(skb));
+ wqe_misc = NES_NIC_SQ_WQE_TAGVALUE_ENABLE;
+ wqe_fragment_length[0] = (__force __le16) skb_vlan_tag_get(skb);
+ } else
+ wqe_misc = 0;
+
+ /* bump past the vlan tag */
+ wqe_fragment_length++;
+ /* wqe_fragment_address = (u64 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_FRAG0_LOW_IDX]; */
+ wqe_misc |= NES_NIC_SQ_WQE_COMPLETION;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ if (skb_is_gso(skb)) {
+ tcph = tcp_hdr(skb);
+ /* nes_debug(NES_DBG_NIC_TX, "%s: TSO request... is_gso = %u seg size = %u\n",
+ netdev->name, skb_is_gso(skb), skb_shinfo(skb)->gso_size); */
+ wqe_misc |= NES_NIC_SQ_WQE_LSO_ENABLE | (u16)skb_shinfo(skb)->gso_size;
+ set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_LSO_INFO_IDX,
+ ((u32)tcph->doff) |
+ (((u32)(((unsigned char *)tcph) - skb->data)) << 4));
+ }
+ } else { /* CHECKSUM_HW */
+ wqe_misc |= NES_NIC_SQ_WQE_DISABLE_CHKSUM;
+ }
+
+ set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_TOTAL_LENGTH_IDX,
+ skb->len);
+ memcpy(&nesnic->first_frag_vbase[nesnic->sq_head].buffer,
+ skb->data, min(((unsigned int)NES_FIRST_FRAG_SIZE), skb_headlen(skb)));
+ wqe_fragment_length[0] = cpu_to_le16(min(((unsigned int)NES_FIRST_FRAG_SIZE),
+ skb_headlen(skb)));
+ wqe_fragment_length[1] = 0;
+ if (skb_headlen(skb) > NES_FIRST_FRAG_SIZE) {
+ if ((skb_shinfo(skb)->nr_frags + 1) > 4) {
+ nes_debug(NES_DBG_NIC_TX, "%s: Packet with %u fragments not sent, skb_headlen=%u\n",
+ netdev->name, skb_shinfo(skb)->nr_frags + 2, skb_headlen(skb));
+ kfree_skb(skb);
+ nesvnic->tx_sw_dropped++;
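+ /* skb has already been freed; nes_netdev_start_xmit() treats any non-NETDEV_TX_OK return from here as consumed */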
+ return NETDEV_TX_LOCKED;
+ }
+ set_bit(nesnic->sq_head, nesnic->first_frag_overflow);
+ bus_address = pci_map_single(nesdev->pcidev, skb->data + NES_FIRST_FRAG_SIZE,
+ skb_headlen(skb) - NES_FIRST_FRAG_SIZE, PCI_DMA_TODEVICE);
+ wqe_fragment_length[wqe_fragment_index++] =
+ cpu_to_le16(skb_headlen(skb) - NES_FIRST_FRAG_SIZE);
+ wqe_fragment_length[wqe_fragment_index] = 0;
+ set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX,
+ ((u64)(bus_address)));
+ nesnic->tx_skb[nesnic->sq_head] = skb;
+ }
+
+ if (skb_headlen(skb) == skb->len) {
+ if (skb_headlen(skb) <= NES_FIRST_FRAG_SIZE) {
+ nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_2_1_IDX] = 0;
+ nesnic->tx_skb[nesnic->sq_head] = skb;
+ }
+ } else {
+ /* Deal with Fragments */
+ nesnic->tx_skb[nesnic->sq_head] = skb;
+ for (skb_fragment_index = 0; skb_fragment_index < skb_shinfo(skb)->nr_frags;
+ skb_fragment_index++) {
+ skb_frag_t *frag =
+ &skb_shinfo(skb)->frags[skb_fragment_index];
+ bus_address = skb_frag_dma_map(&nesdev->pcidev->dev,
+ frag, 0, skb_frag_size(frag),
+ DMA_TO_DEVICE);
+ wqe_fragment_length[wqe_fragment_index] =
+ cpu_to_le16(skb_frag_size(&skb_shinfo(skb)->frags[skb_fragment_index]));
+ set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX+(2*wqe_fragment_index),
+ bus_address);
+ wqe_fragment_index++;
+ if (wqe_fragment_index < 5)
+ wqe_fragment_length[wqe_fragment_index] = 0;
+ }
+ }
+
+ set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_MISC_IDX, wqe_misc);
+ nesnic->sq_head++;
+ nesnic->sq_head &= nesnic->sq_size - 1;
+
+ return NETDEV_TX_OK;
+}
+
+
+/**
+ * nes_netdev_start_xmit
+ */
+static int nes_netdev_start_xmit(struct sk_buff *skb, struct net_device *netdev)
+{
+ struct nes_vnic *nesvnic = netdev_priv(netdev);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_hw_nic *nesnic = &nesvnic->nic;
+ struct nes_hw_nic_sq_wqe *nic_sqe;
+ struct tcphdr *tcph;
+ /* struct udphdr *udph; */
+#define NES_MAX_TSO_FRAGS MAX_SKB_FRAGS
+ /* 64K segment plus overflow on each side */
+ dma_addr_t tso_bus_address[NES_MAX_TSO_FRAGS];
+ dma_addr_t bus_address;
+ u32 tso_frag_index;
+ u32 tso_frag_count;
+ u32 tso_wqe_length;
+ u32 curr_tcp_seq;
+ u32 wqe_count=1;
+ u32 send_rc;
+ struct iphdr *iph;
+ __le16 *wqe_fragment_length;
+ u32 nr_frags;
+ u32 original_first_length;
+ /* u64 *wqe_fragment_address; */
+ /* first fragment (0) is used by copy buffer */
+ u16 wqe_fragment_index=1;
+ u16 hoffset;
+ u16 nhoffset;
+ u16 wqes_needed;
+ u16 wqes_available;
+ u32 wqe_misc;
+
+ /*
+ * nes_debug(NES_DBG_NIC_TX, "%s Request to tx NIC packet length %u, headlen %u,"
+ * " (%u frags), tso_size=%u\n",
+ * netdev->name, skb->len, skb_headlen(skb),
+ * skb_shinfo(skb)->nr_frags, skb_is_gso(skb));
+ */
+
+ if (!netif_carrier_ok(netdev))
+ return NETDEV_TX_OK;
+
+ if (netif_queue_stopped(netdev))
+ return NETDEV_TX_BUSY;
+
+ /* Check if SQ is full */
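+ /* Full when (sq_tail - sq_head) mod sq_size == 1, i.e. only the reserved empty slot remains */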
+ if ((((nesnic->sq_tail+(nesnic->sq_size*2))-nesnic->sq_head) & (nesnic->sq_size - 1)) == 1) {
+ if (!netif_queue_stopped(netdev)) {
+ netif_stop_queue(netdev);
+ barrier();
+ if ((((((volatile u16)nesnic->sq_tail)+(nesnic->sq_size*2))-nesnic->sq_head) & (nesnic->sq_size - 1)) != 1) {
+ netif_start_queue(netdev);
+ goto sq_no_longer_full;
+ }
+ }
+ nesvnic->sq_full++;
+ return NETDEV_TX_BUSY;
+ }
+
+sq_no_longer_full:
+ nr_frags = skb_shinfo(skb)->nr_frags;
+ if (skb_headlen(skb) > NES_FIRST_FRAG_SIZE) {
+ nr_frags++;
+ }
+ /* Check if too many fragments */
+ if (unlikely((nr_frags > 4))) {
+ if (skb_is_gso(skb)) {
+ nesvnic->segmented_tso_requests++;
+ nesvnic->tso_requests++;
+ /* Basically 4 fragments available per WQE with extended fragments */
+ wqes_needed = nr_frags >> 2;
+ wqes_needed += (nr_frags&3)?1:0;
+ wqes_available = (((nesnic->sq_tail+nesnic->sq_size)-nesnic->sq_head) - 1) &
+ (nesnic->sq_size - 1);
+
+ if (unlikely(wqes_needed > wqes_available)) {
+ if (!netif_queue_stopped(netdev)) {
+ netif_stop_queue(netdev);
+ barrier();
+ wqes_available = (((((volatile u16)nesnic->sq_tail)+nesnic->sq_size)-nesnic->sq_head) - 1) &
+ (nesnic->sq_size - 1);
+ if (wqes_needed <= wqes_available) {
+ netif_start_queue(netdev);
+ goto tso_sq_no_longer_full;
+ }
+ }
+ nesvnic->sq_full++;
+ nes_debug(NES_DBG_NIC_TX, "%s: HNIC SQ full- TSO request has too many frags!\n",
+ netdev->name);
+ return NETDEV_TX_BUSY;
+ }
+tso_sq_no_longer_full:
+ /* Map all the buffers */
+ for (tso_frag_count=0; tso_frag_count < skb_shinfo(skb)->nr_frags;
+ tso_frag_count++) {
+ skb_frag_t *frag =
+ &skb_shinfo(skb)->frags[tso_frag_count];
+ tso_bus_address[tso_frag_count] =
+ skb_frag_dma_map(&nesdev->pcidev->dev,
+ frag, 0, skb_frag_size(frag),
+ DMA_TO_DEVICE);
+ }
+
+ tso_frag_index = 0;
+ curr_tcp_seq = ntohl(tcp_hdr(skb)->seq);
+ hoffset = skb_transport_header(skb) - skb->data;
+ nhoffset = skb_network_header(skb) - skb->data;
+ original_first_length = hoffset + ((((struct tcphdr *)skb_transport_header(skb))->doff)<<2);
+
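+ /* Build one WQE per group of up to four fragments; each WQE gets its own header copy with the TCP sequence advanced, SYN cleared after the first segment and FIN/PSH/RST/URG cleared on all but the last */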
+ for (wqe_count=0; wqe_count<((u32)wqes_needed); wqe_count++) {
+ tso_wqe_length = 0;
+ nic_sqe = &nesnic->sq_vbase[nesnic->sq_head];
+ wqe_fragment_length =
+ (__le16 *)&nic_sqe->wqe_words[NES_NIC_SQ_WQE_LENGTH_0_TAG_IDX];
+ /* setup the VLAN tag if present */
+ if (skb_vlan_tag_present(skb)) {
+ nes_debug(NES_DBG_NIC_TX, "%s: VLAN packet to send... VLAN = %08X\n",
+ netdev->name,
+ skb_vlan_tag_get(skb));
+ wqe_misc = NES_NIC_SQ_WQE_TAGVALUE_ENABLE;
+ wqe_fragment_length[0] = (__force __le16) skb_vlan_tag_get(skb);
+ } else
+ wqe_misc = 0;
+
+ /* bump past the vlan tag */
+ wqe_fragment_length++;
+
+ /* Assumes header totally fits in allocated buffer and is in first fragment */
+ if (original_first_length > NES_FIRST_FRAG_SIZE) {
+ nes_debug(NES_DBG_NIC_TX, "ERROR: SKB header too big, headlen=%u, FIRST_FRAG_SIZE=%u\n",
+ original_first_length, NES_FIRST_FRAG_SIZE);
+ nes_debug(NES_DBG_NIC_TX, "%s Request to tx NIC packet length %u, headlen %u,"
+ " (%u frags), is_gso = %u tso_size=%u\n",
+ netdev->name,
+ skb->len, skb_headlen(skb),
+ skb_shinfo(skb)->nr_frags, skb_is_gso(skb), skb_shinfo(skb)->gso_size);
+ }
+ memcpy(&nesnic->first_frag_vbase[nesnic->sq_head].buffer,
+ skb->data, min(((unsigned int)NES_FIRST_FRAG_SIZE),
+ original_first_length));
+ iph = (struct iphdr *)
+ (&nesnic->first_frag_vbase[nesnic->sq_head].buffer[nhoffset]);
+ tcph = (struct tcphdr *)
+ (&nesnic->first_frag_vbase[nesnic->sq_head].buffer[hoffset]);
+ if ((wqe_count+1)!=(u32)wqes_needed) {
+ tcph->fin = 0;
+ tcph->psh = 0;
+ tcph->rst = 0;
+ tcph->urg = 0;
+ }
+ if (wqe_count) {
+ tcph->syn = 0;
+ }
+ tcph->seq = htonl(curr_tcp_seq);
+ wqe_fragment_length[0] = cpu_to_le16(min(((unsigned int)NES_FIRST_FRAG_SIZE),
+ original_first_length));
+
+ wqe_fragment_index = 1;
+ if ((wqe_count==0) && (skb_headlen(skb) > original_first_length)) {
+ set_bit(nesnic->sq_head, nesnic->first_frag_overflow);
+ bus_address = pci_map_single(nesdev->pcidev, skb->data + original_first_length,
+ skb_headlen(skb) - original_first_length, PCI_DMA_TODEVICE);
+ wqe_fragment_length[wqe_fragment_index++] =
+ cpu_to_le16(skb_headlen(skb) - original_first_length);
+ wqe_fragment_length[wqe_fragment_index] = 0;
+ set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG1_LOW_IDX,
+ bus_address);
+ tso_wqe_length += skb_headlen(skb) -
+ original_first_length;
+ }
+ while (wqe_fragment_index < 5) {
+ wqe_fragment_length[wqe_fragment_index] =
+ cpu_to_le16(skb_frag_size(&skb_shinfo(skb)->frags[tso_frag_index]));
+ set_wqe_64bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_FRAG0_LOW_IDX+(2*wqe_fragment_index),
+ (u64)tso_bus_address[tso_frag_index]);
+ wqe_fragment_index++;
+ tso_wqe_length += skb_frag_size(&skb_shinfo(skb)->frags[tso_frag_index++]);
+ if (wqe_fragment_index < 5)
+ wqe_fragment_length[wqe_fragment_index] = 0;
+ if (tso_frag_index == tso_frag_count)
+ break;
+ }
+ if ((wqe_count+1) == (u32)wqes_needed) {
+ nesnic->tx_skb[nesnic->sq_head] = skb;
+ } else {
+ nesnic->tx_skb[nesnic->sq_head] = NULL;
+ }
+ wqe_misc |= NES_NIC_SQ_WQE_COMPLETION | (u16)skb_shinfo(skb)->gso_size;
+ if ((tso_wqe_length + original_first_length) > skb_shinfo(skb)->gso_size) {
+ wqe_misc |= NES_NIC_SQ_WQE_LSO_ENABLE;
+ } else {
+ iph->tot_len = htons(tso_wqe_length + original_first_length - nhoffset);
+ }
+
+ set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_MISC_IDX,
+ wqe_misc);
+ set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_LSO_INFO_IDX,
+ ((u32)tcph->doff) | (((u32)hoffset) << 4));
+
+ set_wqe_32bit_value(nic_sqe->wqe_words, NES_NIC_SQ_WQE_TOTAL_LENGTH_IDX,
+ tso_wqe_length + original_first_length);
+ curr_tcp_seq += tso_wqe_length;
+ nesnic->sq_head++;
+ nesnic->sq_head &= nesnic->sq_size-1;
+ }
+ } else {
+ nesvnic->linearized_skbs++;
+ hoffset = skb_transport_header(skb) - skb->data;
+ nhoffset = skb_network_header(skb) - skb->data;
+ skb_linearize(skb);
+ skb_set_transport_header(skb, hoffset);
+ skb_set_network_header(skb, nhoffset);
+ send_rc = nes_nic_send(skb, netdev);
+ if (send_rc != NETDEV_TX_OK)
+ return NETDEV_TX_OK;
+ }
+ } else {
+ send_rc = nes_nic_send(skb, netdev);
+ if (send_rc != NETDEV_TX_OK)
+ return NETDEV_TX_OK;
+ }
+
+ barrier();
+
+ if (wqe_count)
+ nes_write32(nesdev->regs+NES_WQE_ALLOC,
+ (wqe_count << 24) | (1 << 23) | nesvnic->nic.qp_id);
+
+ netdev->trans_start = jiffies;
+
+ return NETDEV_TX_OK;
+}
+
+
+/**
+ * nes_netdev_get_stats
+ */
+static struct net_device_stats *nes_netdev_get_stats(struct net_device *netdev)
+{
+ struct nes_vnic *nesvnic = netdev_priv(netdev);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ u64 u64temp;
+ u32 u32temp;
+
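+ /* Fold the hardware endnode and MAC counters into the running netstats totals */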
+ u32temp = nes_read_indexed(nesdev,
+ NES_IDX_ENDNODE0_NSTAT_RX_DISCARD + (nesvnic->nic_index*0x200));
+ nesvnic->netstats.rx_dropped += u32temp;
+ nesvnic->endnode_nstat_rx_discard += u32temp;
+
+ u64temp = (u64)nes_read_indexed(nesdev,
+ NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO + (nesvnic->nic_index*0x200));
+ u64temp += ((u64)nes_read_indexed(nesdev,
+ NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI + (nesvnic->nic_index*0x200))) << 32;
+
+ nesvnic->endnode_nstat_rx_octets += u64temp;
+ nesvnic->netstats.rx_bytes += u64temp;
+
+ u64temp = (u64)nes_read_indexed(nesdev,
+ NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO + (nesvnic->nic_index*0x200));
+ u64temp += ((u64)nes_read_indexed(nesdev,
+ NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI + (nesvnic->nic_index*0x200))) << 32;
+
+ nesvnic->endnode_nstat_rx_frames += u64temp;
+ nesvnic->netstats.rx_packets += u64temp;
+
+ u64temp = (u64)nes_read_indexed(nesdev,
+ NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO + (nesvnic->nic_index*0x200));
+ u64temp += ((u64)nes_read_indexed(nesdev,
+ NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI + (nesvnic->nic_index*0x200))) << 32;
+
+ nesvnic->endnode_nstat_tx_octets += u64temp;
+ nesvnic->netstats.tx_bytes += u64temp;
+
+ u64temp = (u64)nes_read_indexed(nesdev,
+ NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO + (nesvnic->nic_index*0x200));
+ u64temp += ((u64)nes_read_indexed(nesdev,
+ NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI + (nesvnic->nic_index*0x200))) << 32;
+
+ nesvnic->endnode_nstat_tx_frames += u64temp;
+ nesvnic->netstats.tx_packets += u64temp;
+
+ u32temp = nes_read_indexed(nesdev,
+ NES_IDX_MAC_RX_SHORT_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+ nesvnic->netstats.rx_dropped += u32temp;
+ nesvnic->nesdev->mac_rx_errors += u32temp;
+ nesvnic->nesdev->mac_rx_short_frames += u32temp;
+
+ u32temp = nes_read_indexed(nesdev,
+ NES_IDX_MAC_RX_OVERSIZED_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+ nesvnic->netstats.rx_dropped += u32temp;
+ nesvnic->nesdev->mac_rx_errors += u32temp;
+ nesvnic->nesdev->mac_rx_oversized_frames += u32temp;
+
+ u32temp = nes_read_indexed(nesdev,
+ NES_IDX_MAC_RX_JABBER_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+ nesvnic->netstats.rx_dropped += u32temp;
+ nesvnic->nesdev->mac_rx_errors += u32temp;
+ nesvnic->nesdev->mac_rx_jabber_frames += u32temp;
+
+ u32temp = nes_read_indexed(nesdev,
+ NES_IDX_MAC_RX_SYMBOL_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+ nesvnic->netstats.rx_dropped += u32temp;
+ nesvnic->nesdev->mac_rx_errors += u32temp;
+ nesvnic->nesdev->mac_rx_symbol_err_frames += u32temp;
+
+ u32temp = nes_read_indexed(nesdev,
+ NES_IDX_MAC_RX_LENGTH_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+ nesvnic->netstats.rx_length_errors += u32temp;
+ nesvnic->nesdev->mac_rx_errors += u32temp;
+
+ u32temp = nes_read_indexed(nesdev,
+ NES_IDX_MAC_RX_CRC_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+ nesvnic->nesdev->mac_rx_errors += u32temp;
+ nesvnic->nesdev->mac_rx_crc_errors += u32temp;
+ nesvnic->netstats.rx_crc_errors += u32temp;
+
+ u32temp = nes_read_indexed(nesdev,
+ NES_IDX_MAC_TX_ERRORS + (nesvnic->nesdev->mac_index*0x200));
+ nesvnic->nesdev->mac_tx_errors += u32temp;
+ nesvnic->netstats.tx_errors += u32temp;
+
+ return &nesvnic->netstats;
+}
+
+
+/**
+ * nes_netdev_tx_timeout
+ */
+static void nes_netdev_tx_timeout(struct net_device *netdev)
+{
+ struct nes_vnic *nesvnic = netdev_priv(netdev);
+
+ if (netif_msg_timer(nesvnic))
+ nes_debug(NES_DBG_NIC_TX, "%s: tx timeout\n", netdev->name);
+}
+
+
+/**
+ * nes_netdev_set_mac_address
+ */
+static int nes_netdev_set_mac_address(struct net_device *netdev, void *p)
+{
+ struct nes_vnic *nesvnic = netdev_priv(netdev);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct sockaddr *mac_addr = p;
+ int i;
+ u32 macaddr_low;
+ u16 macaddr_high;
+
+ if (!is_valid_ether_addr(mac_addr->sa_data))
+ return -EADDRNOTAVAIL;
+
+ memcpy(netdev->dev_addr, mac_addr->sa_data, netdev->addr_len);
+ printk(PFX "%s: Address length = %d, Address = %pM\n",
+ __func__, netdev->addr_len, mac_addr->sa_data);
+ macaddr_high = ((u16)netdev->dev_addr[0]) << 8;
+ macaddr_high += (u16)netdev->dev_addr[1];
+ macaddr_low = ((u32)netdev->dev_addr[2]) << 24;
+ macaddr_low += ((u32)netdev->dev_addr[3]) << 16;
+ macaddr_low += ((u32)netdev->dev_addr[4]) << 8;
+ macaddr_low += (u32)netdev->dev_addr[5];
+
+ for (i = 0; i < NES_MAX_PORT_COUNT; i++) {
+ if (nesvnic->qp_nic_index[i] == 0xf) {
+ break;
+ }
+ nes_write_indexed(nesdev,
+ NES_IDX_PERFECT_FILTER_LOW + (nesvnic->qp_nic_index[i] * 8),
+ macaddr_low);
+ nes_write_indexed(nesdev,
+ NES_IDX_PERFECT_FILTER_HIGH + (nesvnic->qp_nic_index[i] * 8),
+ (u32)macaddr_high | NES_MAC_ADDR_VALID |
+ ((((u32)nesvnic->nic_index) << 16)));
+ }
+ return 0;
+}
+
+
+static void set_allmulti(struct nes_device *nesdev, u32 nic_active_bit)
+{
+ u32 nic_active;
+
+ nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL);
+ nic_active |= nic_active_bit;
+ nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active);
+ nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL);
+ nic_active &= ~nic_active_bit;
+ nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active);
+}
+
+#define get_addr(addrs, index) ((addrs) + (index) * ETH_ALEN)
+
+/**
+ * nes_netdev_set_multicast_list
+ */
+static void nes_netdev_set_multicast_list(struct net_device *netdev)
+{
+ struct nes_vnic *nesvnic = netdev_priv(netdev);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_adapter *nesadapter = nesvnic->nesdev->nesadapter;
+ u32 nic_active_bit;
+ u32 nic_active;
+ u32 perfect_filter_register_address;
+ u32 macaddr_low;
+ u16 macaddr_high;
+ u8 mc_all_on = 0;
+ u8 mc_index;
+ int mc_nic_index = -1;
+ u8 pft_entries_preallocated = max(nesadapter->adapter_fcn_count *
+ nics_per_function, 4);
+ u8 max_pft_entries_available = NES_PFT_SIZE - pft_entries_preallocated;
+ unsigned long flags;
+ int mc_count = netdev_mc_count(netdev);
+
+ spin_lock_irqsave(&nesadapter->resource_lock, flags);
+ nic_active_bit = 1 << nesvnic->nic_index;
+
+ if (netdev->flags & IFF_PROMISC) {
+ nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL);
+ nic_active |= nic_active_bit;
+ nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active);
+ nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL);
+ nic_active |= nic_active_bit;
+ nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active);
+ mc_all_on = 1;
+ } else if ((netdev->flags & IFF_ALLMULTI) ||
+ (nesvnic->nic_index > 3)) {
+ set_allmulti(nesdev, nic_active_bit);
+ mc_all_on = 1;
+ } else {
+ nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL);
+ nic_active &= ~nic_active_bit;
+ nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL, nic_active);
+ nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL);
+ nic_active &= ~nic_active_bit;
+ nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active);
+ }
+
+ nes_debug(NES_DBG_NIC_RX, "Number of MC entries = %d, Promiscuous = %d, All Multicast = %d.\n",
+ mc_count, !!(netdev->flags & IFF_PROMISC),
+ !!(netdev->flags & IFF_ALLMULTI));
+ if (!mc_all_on) {
+ char *addrs;
+ int i;
+ struct netdev_hw_addr *ha;
+
+ addrs = kmalloc(ETH_ALEN * mc_count, GFP_ATOMIC);
+ if (!addrs) {
+ set_allmulti(nesdev, nic_active_bit);
+ goto unlock;
+ }
+ i = 0;
+ netdev_for_each_mc_addr(ha, netdev)
+ memcpy(get_addr(addrs, i++), ha->addr, ETH_ALEN);
+
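+ /* Multicast entries start after the perfect filter slots preallocated for unicast addresses */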
+ perfect_filter_register_address = NES_IDX_PERFECT_FILTER_LOW +
+ pft_entries_preallocated * 0x8;
+ for (i = 0, mc_index = 0; mc_index < max_pft_entries_available;
+ mc_index++) {
+ while (i < mc_count && nesvnic->mcrq_mcast_filter &&
+ ((mc_nic_index = nesvnic->mcrq_mcast_filter(nesvnic,
+ get_addr(addrs, i++))) == 0));
+ if (mc_nic_index < 0)
+ mc_nic_index = nesvnic->nic_index;
+ while (nesadapter->pft_mcast_map[mc_index] < 16 &&
+ nesadapter->pft_mcast_map[mc_index] !=
+ nesvnic->nic_index &&
+ mc_index < max_pft_entries_available) {
+ nes_debug(NES_DBG_NIC_RX,
+ "mc_index=%d skipping nic_index=%d, "
+ "used for=%d \n", mc_index,
+ nesvnic->nic_index,
+ nesadapter->pft_mcast_map[mc_index]);
+ mc_index++;
+ }
+ if (mc_index >= max_pft_entries_available)
+ break;
+ if (i < mc_count) {
+ char *addr = get_addr(addrs, i++);
+
+ nes_debug(NES_DBG_NIC_RX, "Assigning MC Address %pM to register 0x%04X nic_idx=%d\n",
+ addr,
+ perfect_filter_register_address+(mc_index * 8),
+ mc_nic_index);
+ macaddr_high = ((u8) addr[0]) << 8;
+ macaddr_high += (u8) addr[1];
+ macaddr_low = ((u8) addr[2]) << 24;
+ macaddr_low += ((u8) addr[3]) << 16;
+ macaddr_low += ((u8) addr[4]) << 8;
+ macaddr_low += (u8) addr[5];
+
+ nes_write_indexed(nesdev,
+ perfect_filter_register_address+(mc_index * 8),
+ macaddr_low);
+ nes_write_indexed(nesdev,
+ perfect_filter_register_address+4+(mc_index * 8),
+ (u32)macaddr_high | NES_MAC_ADDR_VALID |
+ ((((u32)(1<<mc_nic_index)) << 16)));
+ nesadapter->pft_mcast_map[mc_index] =
+ nesvnic->nic_index;
+ } else {
+ nes_debug(NES_DBG_NIC_RX, "Clearing MC Address at register 0x%04X\n",
+ perfect_filter_register_address+(mc_index * 8));
+ nes_write_indexed(nesdev,
+ perfect_filter_register_address+4+(mc_index * 8),
+ 0);
+ nesadapter->pft_mcast_map[mc_index] = 255;
+ }
+ }
+ kfree(addrs);
+ /* PFT is not large enough */
+ if (i < mc_count)
+ set_allmulti(nesdev, nic_active_bit);
+ }
+
+unlock:
+ spin_unlock_irqrestore(&nesadapter->resource_lock, flags);
+}
+
+
+/**
+ * nes_netdev_change_mtu
+ */
+static int nes_netdev_change_mtu(struct net_device *netdev, int new_mtu)
+{
+ struct nes_vnic *nesvnic = netdev_priv(netdev);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ int ret = 0;
+ u8 jumbomode = 0;
+ u32 nic_active;
+ u32 nic_active_bit;
+ u32 uc_all_active;
+ u32 mc_all_active;
+
+ if ((new_mtu < ETH_ZLEN) || (new_mtu > max_mtu))
+ return -EINVAL;
+
+ netdev->mtu = new_mtu;
+ nesvnic->max_frame_size = new_mtu + VLAN_ETH_HLEN;
+
+ if (netdev->mtu > 1500) {
+ jumbomode=1;
+ }
+ nes_nic_init_timer_defaults(nesdev, jumbomode);
+
+ if (netif_running(netdev)) {
+ nic_active_bit = 1 << nesvnic->nic_index;
+ mc_all_active = nes_read_indexed(nesdev,
+ NES_IDX_NIC_MULTICAST_ALL) & nic_active_bit;
+ uc_all_active = nes_read_indexed(nesdev,
+ NES_IDX_NIC_UNICAST_ALL) & nic_active_bit;
+
+ nes_netdev_stop(netdev);
+ nes_netdev_open(netdev);
+
+ nic_active = nes_read_indexed(nesdev,
+ NES_IDX_NIC_MULTICAST_ALL);
+ nic_active |= mc_all_active;
+ nes_write_indexed(nesdev, NES_IDX_NIC_MULTICAST_ALL,
+ nic_active);
+
+ nic_active = nes_read_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL);
+ nic_active |= uc_all_active;
+ nes_write_indexed(nesdev, NES_IDX_NIC_UNICAST_ALL, nic_active);
+ }
+
+ return ret;
+}
+
+
+static const char nes_ethtool_stringset[][ETH_GSTRING_LEN] = {
+ "Link Change Interrupts",
+ "Linearized SKBs",
+ "T/GSO Requests",
+ "Pause Frames Sent",
+ "Pause Frames Received",
+ "Internal Routing Errors",
+ "SQ SW Dropped SKBs",
+ "SQ Full",
+ "Segmented TSO Requests",
+ "Rx Symbol Errors",
+ "Rx Jabber Errors",
+ "Rx Oversized Frames",
+ "Rx Short Frames",
+ "Rx Length Errors",
+ "Rx CRC Errors",
+ "Rx Port Discard",
+ "Endnode Rx Discards",
+ "Endnode Rx Octets",
+ "Endnode Rx Frames",
+ "Endnode Tx Octets",
+ "Endnode Tx Frames",
+ "Tx Errors",
+ "mh detected",
+ "mh pauses",
+ "Retransmission Count",
+ "CM Connects",
+ "CM Accepts",
+ "Disconnects",
+ "Connected Events",
+ "Connect Requests",
+ "CM Rejects",
+ "ModifyQP Timeouts",
+ "CreateQPs",
+ "SW DestroyQPs",
+ "DestroyQPs",
+ "CM Closes",
+ "CM Packets Sent",
+ "CM Packets Bounced",
+ "CM Packets Created",
+ "CM Packets Rcvd",
+ "CM Packets Dropped",
+ "CM Packets Retrans",
+ "CM Listens Created",
+ "CM Listens Destroyed",
+ "CM Backlog Drops",
+ "CM Loopbacks",
+ "CM Nodes Created",
+ "CM Nodes Destroyed",
+ "CM Accel Drops",
+ "CM Resets Received",
+ "Free 4Kpbls",
+ "Free 256pbls",
+ "Timer Inits",
+ "LRO aggregated",
+ "LRO flushed",
+ "LRO no_desc",
+ "PAU CreateQPs",
+ "PAU DestroyQPs",
+};
+#define NES_ETHTOOL_STAT_COUNT ARRAY_SIZE(nes_ethtool_stringset)
+
+
+/**
+ * nes_netdev_get_sset_count
+ */
+static int nes_netdev_get_sset_count(struct net_device *netdev, int stringset)
+{
+ if (stringset == ETH_SS_STATS)
+ return NES_ETHTOOL_STAT_COUNT;
+ else
+ return -EINVAL;
+}
+
+
+/**
+ * nes_netdev_get_strings
+ */
+static void nes_netdev_get_strings(struct net_device *netdev, u32 stringset,
+ u8 *ethtool_strings)
+{
+ if (stringset == ETH_SS_STATS)
+ memcpy(ethtool_strings,
+ &nes_ethtool_stringset,
+ sizeof(nes_ethtool_stringset));
+}
+
+
+/**
+ * nes_netdev_get_ethtool_stats
+ */
+
+static void nes_netdev_get_ethtool_stats(struct net_device *netdev,
+ struct ethtool_stats *target_ethtool_stats, u64 *target_stat_values)
+{
+ u64 u64temp;
+ struct nes_vnic *nesvnic = netdev_priv(netdev);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ u32 nic_count;
+ u32 u32temp;
+ u32 index = 0;
+
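+ /* The values below must be written in the same order as nes_ethtool_stringset[] */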
+ target_ethtool_stats->n_stats = NES_ETHTOOL_STAT_COUNT;
+ target_stat_values[index] = nesvnic->nesdev->link_status_interrupts;
+ target_stat_values[++index] = nesvnic->linearized_skbs;
+ target_stat_values[++index] = nesvnic->tso_requests;
+
+ u32temp = nes_read_indexed(nesdev,
+ NES_IDX_MAC_TX_PAUSE_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+ nesvnic->nesdev->mac_pause_frames_sent += u32temp;
+ target_stat_values[++index] = nesvnic->nesdev->mac_pause_frames_sent;
+
+ u32temp = nes_read_indexed(nesdev,
+ NES_IDX_MAC_RX_PAUSE_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+ nesvnic->nesdev->mac_pause_frames_received += u32temp;
+
+ u32temp = nes_read_indexed(nesdev,
+ NES_IDX_PORT_RX_DISCARDS + (nesvnic->nesdev->mac_index*0x40));
+ nesvnic->nesdev->port_rx_discards += u32temp;
+ nesvnic->netstats.rx_dropped += u32temp;
+
+ u32temp = nes_read_indexed(nesdev,
+ NES_IDX_PORT_TX_DISCARDS + (nesvnic->nesdev->mac_index*0x40));
+ nesvnic->nesdev->port_tx_discards += u32temp;
+ nesvnic->netstats.tx_dropped += u32temp;
+
+ u32temp = nes_read_indexed(nesdev,
+ NES_IDX_MAC_RX_SHORT_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+ nesvnic->netstats.rx_dropped += u32temp;
+ nesvnic->nesdev->mac_rx_errors += u32temp;
+ nesvnic->nesdev->mac_rx_short_frames += u32temp;
+
+ u32temp = nes_read_indexed(nesdev,
+ NES_IDX_MAC_RX_OVERSIZED_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+ nesvnic->netstats.rx_dropped += u32temp;
+ nesvnic->nesdev->mac_rx_errors += u32temp;
+ nesvnic->nesdev->mac_rx_oversized_frames += u32temp;
+
+ u32temp = nes_read_indexed(nesdev,
+ NES_IDX_MAC_RX_JABBER_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+ nesvnic->netstats.rx_dropped += u32temp;
+ nesvnic->nesdev->mac_rx_errors += u32temp;
+ nesvnic->nesdev->mac_rx_jabber_frames += u32temp;
+
+ u32temp = nes_read_indexed(nesdev,
+ NES_IDX_MAC_RX_SYMBOL_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+ nesvnic->netstats.rx_dropped += u32temp;
+ nesvnic->nesdev->mac_rx_errors += u32temp;
+ nesvnic->nesdev->mac_rx_symbol_err_frames += u32temp;
+
+ u32temp = nes_read_indexed(nesdev,
+ NES_IDX_MAC_RX_LENGTH_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+ nesvnic->netstats.rx_length_errors += u32temp;
+ nesvnic->nesdev->mac_rx_errors += u32temp;
+
+ u32temp = nes_read_indexed(nesdev,
+ NES_IDX_MAC_RX_CRC_ERR_FRAMES + (nesvnic->nesdev->mac_index*0x200));
+ nesvnic->nesdev->mac_rx_errors += u32temp;
+ nesvnic->nesdev->mac_rx_crc_errors += u32temp;
+ nesvnic->netstats.rx_crc_errors += u32temp;
+
+ u32temp = nes_read_indexed(nesdev,
+ NES_IDX_MAC_TX_ERRORS + (nesvnic->nesdev->mac_index*0x200));
+ nesvnic->nesdev->mac_tx_errors += u32temp;
+ nesvnic->netstats.tx_errors += u32temp;
+
+ for (nic_count = 0; nic_count < NES_MAX_PORT_COUNT; nic_count++) {
+ if (nesvnic->qp_nic_index[nic_count] == 0xf)
+ break;
+
+ u32temp = nes_read_indexed(nesdev,
+ NES_IDX_ENDNODE0_NSTAT_RX_DISCARD +
+ (nesvnic->qp_nic_index[nic_count]*0x200));
+ nesvnic->netstats.rx_dropped += u32temp;
+ nesvnic->endnode_nstat_rx_discard += u32temp;
+
+ u64temp = (u64)nes_read_indexed(nesdev,
+ NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_LO +
+ (nesvnic->qp_nic_index[nic_count]*0x200));
+ u64temp += ((u64)nes_read_indexed(nesdev,
+ NES_IDX_ENDNODE0_NSTAT_RX_OCTETS_HI +
+ (nesvnic->qp_nic_index[nic_count]*0x200))) << 32;
+
+ nesvnic->endnode_nstat_rx_octets += u64temp;
+ nesvnic->netstats.rx_bytes += u64temp;
+
+ u64temp = (u64)nes_read_indexed(nesdev,
+ NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_LO +
+ (nesvnic->qp_nic_index[nic_count]*0x200));
+ u64temp += ((u64)nes_read_indexed(nesdev,
+ NES_IDX_ENDNODE0_NSTAT_RX_FRAMES_HI +
+ (nesvnic->qp_nic_index[nic_count]*0x200))) << 32;
+
+ nesvnic->endnode_nstat_rx_frames += u64temp;
+ nesvnic->netstats.rx_packets += u64temp;
+
+ u64temp = (u64)nes_read_indexed(nesdev,
+ NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_LO +
+ (nesvnic->qp_nic_index[nic_count]*0x200));
+ u64temp += ((u64)nes_read_indexed(nesdev,
+ NES_IDX_ENDNODE0_NSTAT_TX_OCTETS_HI +
+ (nesvnic->qp_nic_index[nic_count]*0x200))) << 32;
+
+ nesvnic->endnode_nstat_tx_octets += u64temp;
+ nesvnic->netstats.tx_bytes += u64temp;
+
+ u64temp = (u64)nes_read_indexed(nesdev,
+ NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_LO +
+ (nesvnic->qp_nic_index[nic_count]*0x200));
+ u64temp += ((u64)nes_read_indexed(nesdev,
+ NES_IDX_ENDNODE0_NSTAT_TX_FRAMES_HI +
+ (nesvnic->qp_nic_index[nic_count]*0x200))) << 32;
+
+ nesvnic->endnode_nstat_tx_frames += u64temp;
+ nesvnic->netstats.tx_packets += u64temp;
+
+ u32temp = nes_read_indexed(nesdev,
+ NES_IDX_IPV4_TCP_REXMITS + (nesvnic->qp_nic_index[nic_count]*0x200));
+ nesvnic->endnode_ipv4_tcp_retransmits += u32temp;
+ }
+
+ target_stat_values[++index] = nesvnic->nesdev->mac_pause_frames_received;
+ target_stat_values[++index] = nesdev->nesadapter->nic_rx_eth_route_err;
+ target_stat_values[++index] = nesvnic->tx_sw_dropped;
+ target_stat_values[++index] = nesvnic->sq_full;
+ target_stat_values[++index] = nesvnic->segmented_tso_requests;
+ target_stat_values[++index] = nesvnic->nesdev->mac_rx_symbol_err_frames;
+ target_stat_values[++index] = nesvnic->nesdev->mac_rx_jabber_frames;
+ target_stat_values[++index] = nesvnic->nesdev->mac_rx_oversized_frames;
+ target_stat_values[++index] = nesvnic->nesdev->mac_rx_short_frames;
+ target_stat_values[++index] = nesvnic->netstats.rx_length_errors;
+ target_stat_values[++index] = nesvnic->nesdev->mac_rx_crc_errors;
+ target_stat_values[++index] = nesvnic->nesdev->port_rx_discards;
+ target_stat_values[++index] = nesvnic->endnode_nstat_rx_discard;
+ target_stat_values[++index] = nesvnic->endnode_nstat_rx_octets;
+ target_stat_values[++index] = nesvnic->endnode_nstat_rx_frames;
+ target_stat_values[++index] = nesvnic->endnode_nstat_tx_octets;
+ target_stat_values[++index] = nesvnic->endnode_nstat_tx_frames;
+ target_stat_values[++index] = nesvnic->nesdev->mac_tx_errors;
+ target_stat_values[++index] = mh_detected;
+ target_stat_values[++index] = mh_pauses_sent;
+ target_stat_values[++index] = nesvnic->endnode_ipv4_tcp_retransmits;
+ target_stat_values[++index] = atomic_read(&cm_connects);
+ target_stat_values[++index] = atomic_read(&cm_accepts);
+ target_stat_values[++index] = atomic_read(&cm_disconnects);
+ target_stat_values[++index] = atomic_read(&cm_connecteds);
+ target_stat_values[++index] = atomic_read(&cm_connect_reqs);
+ target_stat_values[++index] = atomic_read(&cm_rejects);
+ target_stat_values[++index] = atomic_read(&mod_qp_timouts);
+ target_stat_values[++index] = atomic_read(&qps_created);
+ target_stat_values[++index] = atomic_read(&sw_qps_destroyed);
+ target_stat_values[++index] = atomic_read(&qps_destroyed);
+ target_stat_values[++index] = atomic_read(&cm_closes);
+ target_stat_values[++index] = cm_packets_sent;
+ target_stat_values[++index] = cm_packets_bounced;
+ target_stat_values[++index] = cm_packets_created;
+ target_stat_values[++index] = cm_packets_received;
+ target_stat_values[++index] = cm_packets_dropped;
+ target_stat_values[++index] = cm_packets_retrans;
+ target_stat_values[++index] = atomic_read(&cm_listens_created);
+ target_stat_values[++index] = atomic_read(&cm_listens_destroyed);
+ target_stat_values[++index] = cm_backlog_drops;
+ target_stat_values[++index] = atomic_read(&cm_loopbacks);
+ target_stat_values[++index] = atomic_read(&cm_nodes_created);
+ target_stat_values[++index] = atomic_read(&cm_nodes_destroyed);
+ target_stat_values[++index] = atomic_read(&cm_accel_dropped_pkts);
+ target_stat_values[++index] = atomic_read(&cm_resets_recvd);
+ target_stat_values[++index] = nesadapter->free_4kpbl;
+ target_stat_values[++index] = nesadapter->free_256pbl;
+ target_stat_values[++index] = int_mod_timer_init;
+ target_stat_values[++index] = nesvnic->lro_mgr.stats.aggregated;
+ target_stat_values[++index] = nesvnic->lro_mgr.stats.flushed;
+ target_stat_values[++index] = nesvnic->lro_mgr.stats.no_desc;
+ target_stat_values[++index] = atomic_read(&pau_qps_created);
+ target_stat_values[++index] = atomic_read(&pau_qps_destroyed);
+}
+
+/**
+ * nes_netdev_get_drvinfo
+ */
+static void nes_netdev_get_drvinfo(struct net_device *netdev,
+ struct ethtool_drvinfo *drvinfo)
+{
+ struct nes_vnic *nesvnic = netdev_priv(netdev);
+ struct nes_adapter *nesadapter = nesvnic->nesdev->nesadapter;
+
+ strlcpy(drvinfo->driver, DRV_NAME, sizeof(drvinfo->driver));
+ strlcpy(drvinfo->bus_info, pci_name(nesvnic->nesdev->pcidev),
+ sizeof(drvinfo->bus_info));
+ snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
+ "%u.%u", nesadapter->firmware_version >> 16,
+ nesadapter->firmware_version & 0x000000ff);
+ strlcpy(drvinfo->version, DRV_VERSION, sizeof(drvinfo->version));
+ drvinfo->testinfo_len = 0;
+ drvinfo->eedump_len = 0;
+ drvinfo->regdump_len = 0;
+}
+
+
+/**
+ * nes_netdev_set_coalesce
+ */
+static int nes_netdev_set_coalesce(struct net_device *netdev,
+ struct ethtool_coalesce *et_coalesce)
+{
+ struct nes_vnic *nesvnic = netdev_priv(netdev);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer;
+ unsigned long flags;
+
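+ /* Update the shared tune-timer thresholds under the periodic timer lock */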
+ spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags);
+ if (et_coalesce->rx_max_coalesced_frames_low) {
+ shared_timer->threshold_low = et_coalesce->rx_max_coalesced_frames_low;
+ }
+ if (et_coalesce->rx_max_coalesced_frames_irq) {
+ shared_timer->threshold_target = et_coalesce->rx_max_coalesced_frames_irq;
+ }
+ if (et_coalesce->rx_max_coalesced_frames_high) {
+ shared_timer->threshold_high = et_coalesce->rx_max_coalesced_frames_high;
+ }
+ if (et_coalesce->rx_coalesce_usecs_low) {
+ shared_timer->timer_in_use_min = et_coalesce->rx_coalesce_usecs_low;
+ }
+ if (et_coalesce->rx_coalesce_usecs_high) {
+ shared_timer->timer_in_use_max = et_coalesce->rx_coalesce_usecs_high;
+ }
+ spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
+
+ /* using this to drive total interrupt moderation */
+ nesadapter->et_rx_coalesce_usecs_irq = et_coalesce->rx_coalesce_usecs_irq;
+ if (et_coalesce->use_adaptive_rx_coalesce) {
+ nesadapter->et_use_adaptive_rx_coalesce = 1;
+ nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT_DYNAMIC;
+ nesadapter->et_rx_coalesce_usecs_irq = 0;
+ if (et_coalesce->pkt_rate_low) {
+ nesadapter->et_pkt_rate_low = et_coalesce->pkt_rate_low;
+ }
+ } else {
+ nesadapter->et_use_adaptive_rx_coalesce = 0;
+ nesadapter->timer_int_limit = NES_TIMER_INT_LIMIT;
+ if (nesadapter->et_rx_coalesce_usecs_irq) {
+ nes_write32(nesdev->regs+NES_PERIODIC_CONTROL,
+ 0x80000000 | ((u32)(nesadapter->et_rx_coalesce_usecs_irq*8)));
+ }
+ }
+ return 0;
+}
+
+
+/**
+ * nes_netdev_get_coalesce
+ */
+static int nes_netdev_get_coalesce(struct net_device *netdev,
+ struct ethtool_coalesce *et_coalesce)
+{
+ struct nes_vnic *nesvnic = netdev_priv(netdev);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ struct ethtool_coalesce temp_et_coalesce;
+ struct nes_hw_tune_timer *shared_timer = &nesadapter->tune_timer;
+ unsigned long flags;
+
+ memset(&temp_et_coalesce, 0, sizeof(temp_et_coalesce));
+ temp_et_coalesce.rx_coalesce_usecs_irq = nesadapter->et_rx_coalesce_usecs_irq;
+ temp_et_coalesce.use_adaptive_rx_coalesce = nesadapter->et_use_adaptive_rx_coalesce;
+ temp_et_coalesce.rate_sample_interval = nesadapter->et_rate_sample_interval;
+ temp_et_coalesce.pkt_rate_low = nesadapter->et_pkt_rate_low;
+ spin_lock_irqsave(&nesadapter->periodic_timer_lock, flags);
+ temp_et_coalesce.rx_max_coalesced_frames_low = shared_timer->threshold_low;
+ temp_et_coalesce.rx_max_coalesced_frames_irq = shared_timer->threshold_target;
+ temp_et_coalesce.rx_max_coalesced_frames_high = shared_timer->threshold_high;
+ temp_et_coalesce.rx_coalesce_usecs_low = shared_timer->timer_in_use_min;
+ temp_et_coalesce.rx_coalesce_usecs_high = shared_timer->timer_in_use_max;
+ if (nesadapter->et_use_adaptive_rx_coalesce) {
+ temp_et_coalesce.rx_coalesce_usecs_irq = shared_timer->timer_in_use;
+ }
+ spin_unlock_irqrestore(&nesadapter->periodic_timer_lock, flags);
+ memcpy(et_coalesce, &temp_et_coalesce, sizeof(*et_coalesce));
+ return 0;
+}
+
+
+/**
+ * nes_netdev_get_pauseparam
+ */
+static void nes_netdev_get_pauseparam(struct net_device *netdev,
+ struct ethtool_pauseparam *et_pauseparam)
+{
+ struct nes_vnic *nesvnic = netdev_priv(netdev);
+
+ et_pauseparam->autoneg = 0;
+ et_pauseparam->rx_pause = (nesvnic->nesdev->disable_rx_flow_control == 0) ? 1:0;
+ et_pauseparam->tx_pause = (nesvnic->nesdev->disable_tx_flow_control == 0) ? 1:0;
+}
+
+
+/**
+ * nes_netdev_set_pauseparam
+ */
+static int nes_netdev_set_pauseparam(struct net_device *netdev,
+ struct ethtool_pauseparam *et_pauseparam)
+{
+ struct nes_vnic *nesvnic = netdev_priv(netdev);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ u32 u32temp;
+
+ if (et_pauseparam->autoneg) {
+ /* TODO: should return unsupported */
+ return 0;
+ }
+ if ((et_pauseparam->tx_pause == 1) && (nesdev->disable_tx_flow_control == 1)) {
+ u32temp = nes_read_indexed(nesdev,
+ NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200));
+ u32temp |= NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE;
+ nes_write_indexed(nesdev,
+ NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200), u32temp);
+ nesdev->disable_tx_flow_control = 0;
+ } else if ((et_pauseparam->tx_pause == 0) && (nesdev->disable_tx_flow_control == 0)) {
+ u32temp = nes_read_indexed(nesdev,
+ NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200));
+ u32temp &= ~NES_IDX_MAC_TX_CONFIG_ENABLE_PAUSE;
+ nes_write_indexed(nesdev,
+ NES_IDX_MAC_TX_CONFIG + (nesdev->mac_index*0x200), u32temp);
+ nesdev->disable_tx_flow_control = 1;
+ }
+ if ((et_pauseparam->rx_pause == 1) && (nesdev->disable_rx_flow_control == 1)) {
+ u32temp = nes_read_indexed(nesdev,
+ NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40));
+ u32temp &= ~NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE;
+ nes_write_indexed(nesdev,
+ NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40), u32temp);
+ nesdev->disable_rx_flow_control = 0;
+ } else if ((et_pauseparam->rx_pause == 0) && (nesdev->disable_rx_flow_control == 0)) {
+ u32temp = nes_read_indexed(nesdev,
+ NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40));
+ u32temp |= NES_IDX_MPP_DEBUG_PORT_DISABLE_PAUSE;
+ nes_write_indexed(nesdev,
+ NES_IDX_MPP_DEBUG + (nesdev->mac_index*0x40), u32temp);
+ nesdev->disable_rx_flow_control = 1;
+ }
+
+ return 0;
+}
+
+
+/**
+ * nes_netdev_get_settings
+ */
+static int nes_netdev_get_settings(struct net_device *netdev, struct ethtool_cmd *et_cmd)
+{
+ struct nes_vnic *nesvnic = netdev_priv(netdev);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ u32 mac_index = nesdev->mac_index;
+ u8 phy_type = nesadapter->phy_type[mac_index];
+ u8 phy_index = nesadapter->phy_index[mac_index];
+ u16 phy_data;
+
+ et_cmd->duplex = DUPLEX_FULL;
+ et_cmd->port = PORT_MII;
+ et_cmd->maxtxpkt = 511;
+ et_cmd->maxrxpkt = 511;
+
+ if (nesadapter->OneG_Mode) {
+ ethtool_cmd_speed_set(et_cmd, SPEED_1000);
+ if (phy_type == NES_PHY_TYPE_PUMA_1G) {
+ et_cmd->supported = SUPPORTED_1000baseT_Full;
+ et_cmd->advertising = ADVERTISED_1000baseT_Full;
+ et_cmd->autoneg = AUTONEG_DISABLE;
+ et_cmd->transceiver = XCVR_INTERNAL;
+ et_cmd->phy_address = mac_index;
+ } else {
+ unsigned long flags;
+ et_cmd->supported = SUPPORTED_1000baseT_Full
+ | SUPPORTED_Autoneg;
+ et_cmd->advertising = ADVERTISED_1000baseT_Full
+ | ADVERTISED_Autoneg;
+ spin_lock_irqsave(&nesadapter->phy_lock, flags);
+ nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
+ spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+ if (phy_data & 0x1000)
+ et_cmd->autoneg = AUTONEG_ENABLE;
+ else
+ et_cmd->autoneg = AUTONEG_DISABLE;
+ et_cmd->transceiver = XCVR_EXTERNAL;
+ et_cmd->phy_address = phy_index;
+ }
+ return 0;
+ }
+ if ((phy_type == NES_PHY_TYPE_ARGUS) ||
+ (phy_type == NES_PHY_TYPE_SFP_D) ||
+ (phy_type == NES_PHY_TYPE_KR)) {
+ et_cmd->transceiver = XCVR_EXTERNAL;
+ et_cmd->port = PORT_FIBRE;
+ et_cmd->supported = SUPPORTED_FIBRE;
+ et_cmd->advertising = ADVERTISED_FIBRE;
+ et_cmd->phy_address = phy_index;
+ } else {
+ et_cmd->transceiver = XCVR_INTERNAL;
+ et_cmd->supported = SUPPORTED_10000baseT_Full;
+ et_cmd->advertising = ADVERTISED_10000baseT_Full;
+ et_cmd->phy_address = mac_index;
+ }
+ ethtool_cmd_speed_set(et_cmd, SPEED_10000);
+ et_cmd->autoneg = AUTONEG_DISABLE;
+ return 0;
+}
+
+
+/**
+ * nes_netdev_set_settings
+ */
+static int nes_netdev_set_settings(struct net_device *netdev, struct ethtool_cmd *et_cmd)
+{
+ struct nes_vnic *nesvnic = netdev_priv(netdev);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+
+ if ((nesadapter->OneG_Mode) &&
+ (nesadapter->phy_type[nesdev->mac_index] != NES_PHY_TYPE_PUMA_1G)) {
+ unsigned long flags;
+ u16 phy_data;
+ u8 phy_index = nesadapter->phy_index[nesdev->mac_index];
+
+ spin_lock_irqsave(&nesadapter->phy_lock, flags);
+ nes_read_1G_phy_reg(nesdev, 0, phy_index, &phy_data);
+ if (et_cmd->autoneg) {
+ /* Turn on Full duplex, Autoneg, and restart autonegotiation */
+ phy_data |= 0x1300;
+ } else {
+ /* Turn off autoneg */
+ phy_data &= ~0x1000;
+ }
+ nes_write_1G_phy_reg(nesdev, 0, phy_index, phy_data);
+ spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+ }
+
+ return 0;
+}
+
+
+static const struct ethtool_ops nes_ethtool_ops = {
+ .get_link = ethtool_op_get_link,
+ .get_settings = nes_netdev_get_settings,
+ .set_settings = nes_netdev_set_settings,
+ .get_strings = nes_netdev_get_strings,
+ .get_sset_count = nes_netdev_get_sset_count,
+ .get_ethtool_stats = nes_netdev_get_ethtool_stats,
+ .get_drvinfo = nes_netdev_get_drvinfo,
+ .get_coalesce = nes_netdev_get_coalesce,
+ .set_coalesce = nes_netdev_set_coalesce,
+ .get_pauseparam = nes_netdev_get_pauseparam,
+ .set_pauseparam = nes_netdev_set_pauseparam,
+};
+
+static void nes_vlan_mode(struct net_device *netdev, struct nes_device *nesdev, netdev_features_t features)
+{
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ u32 u32temp;
+ unsigned long flags;
+
+ spin_lock_irqsave(&nesadapter->phy_lock, flags);
+
+ nes_debug(NES_DBG_NETDEV, "%s: %s\n", __func__, netdev->name);
+
+ /* Enable/Disable VLAN Stripping */
+ u32temp = nes_read_indexed(nesdev, NES_IDX_PCIX_DIAG);
+ if (features & NETIF_F_HW_VLAN_CTAG_RX)
+ u32temp &= 0xfdffffff;
+ else
+ u32temp |= 0x02000000;
+
+ nes_write_indexed(nesdev, NES_IDX_PCIX_DIAG, u32temp);
+ spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+}
+
+static netdev_features_t nes_fix_features(struct net_device *netdev, netdev_features_t features)
+{
+ /*
+ * Since there is no support for separate rx/tx vlan accel
+ * enable/disable make sure tx flag is always in same state as rx.
+ */
+ if (features & NETIF_F_HW_VLAN_CTAG_RX)
+ features |= NETIF_F_HW_VLAN_CTAG_TX;
+ else
+ features &= ~NETIF_F_HW_VLAN_CTAG_TX;
+
+ return features;
+}
+
+static int nes_set_features(struct net_device *netdev, netdev_features_t features)
+{
+ struct nes_vnic *nesvnic = netdev_priv(netdev);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ u32 changed = netdev->features ^ features;
+
+ if (changed & NETIF_F_HW_VLAN_CTAG_RX)
+ nes_vlan_mode(netdev, nesdev, features);
+
+ return 0;
+}
+
+static const struct net_device_ops nes_netdev_ops = {
+ .ndo_open = nes_netdev_open,
+ .ndo_stop = nes_netdev_stop,
+ .ndo_start_xmit = nes_netdev_start_xmit,
+ .ndo_get_stats = nes_netdev_get_stats,
+ .ndo_tx_timeout = nes_netdev_tx_timeout,
+ .ndo_set_mac_address = nes_netdev_set_mac_address,
+ .ndo_set_rx_mode = nes_netdev_set_multicast_list,
+ .ndo_change_mtu = nes_netdev_change_mtu,
+ .ndo_validate_addr = eth_validate_addr,
+ .ndo_fix_features = nes_fix_features,
+ .ndo_set_features = nes_set_features,
+};
+
+/**
+ * nes_netdev_init - initialize network device
+ */
+struct net_device *nes_netdev_init(struct nes_device *nesdev,
+ void __iomem *mmio_addr)
+{
+ u64 u64temp;
+ struct nes_vnic *nesvnic;
+ struct net_device *netdev;
+ struct nic_qp_map *curr_qp_map;
+ u8 phy_type = nesdev->nesadapter->phy_type[nesdev->mac_index];
+
+ netdev = alloc_etherdev(sizeof(struct nes_vnic));
+ if (!netdev) {
+ printk(KERN_ERR PFX "nesvnic etherdev alloc failed\n");
+ return NULL;
+ }
+ nesvnic = netdev_priv(netdev);
+
+ nes_debug(NES_DBG_INIT, "netdev = %p, %s\n", netdev, netdev->name);
+
+ SET_NETDEV_DEV(netdev, &nesdev->pcidev->dev);
+
+ netdev->watchdog_timeo = NES_TX_TIMEOUT;
+ netdev->irq = nesdev->pcidev->irq;
+ netdev->mtu = ETH_DATA_LEN;
+ netdev->hard_header_len = ETH_HLEN;
+ netdev->addr_len = ETH_ALEN;
+ netdev->type = ARPHRD_ETHER;
+ netdev->netdev_ops = &nes_netdev_ops;
+ netdev->ethtool_ops = &nes_ethtool_ops;
+ netif_napi_add(netdev, &nesvnic->napi, nes_netdev_poll, 128);
+ nes_debug(NES_DBG_INIT, "Enabling VLAN Insert/Delete.\n");
+
+ /* Fill in the port structure */
+ nesvnic->netdev = netdev;
+ nesvnic->nesdev = nesdev;
+ nesvnic->msg_enable = netif_msg_init(debug, default_msg);
+ nesvnic->netdev_index = nesdev->netdev_count;
+ nesvnic->perfect_filter_index = nesdev->nesadapter->netdev_count;
+ nesvnic->max_frame_size = netdev->mtu + netdev->hard_header_len + VLAN_HLEN;
+
+ curr_qp_map = nic_qp_mapping_per_function[PCI_FUNC(nesdev->pcidev->devfn)];
+ nesvnic->nic.qp_id = curr_qp_map[nesdev->netdev_count].qpid;
+ nesvnic->nic_index = curr_qp_map[nesdev->netdev_count].nic_index;
+ nesvnic->logical_port = curr_qp_map[nesdev->netdev_count].logical_port;
+
+ /* Setup the burned in MAC address */
+ u64temp = (u64)nesdev->nesadapter->mac_addr_low;
+ u64temp += ((u64)nesdev->nesadapter->mac_addr_high) << 32;
+ u64temp += nesvnic->nic_index;
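+ /* The 48-bit address is the adapter's base MAC plus this NIC's index,
+ * written out most-significant byte first */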
+ netdev->dev_addr[0] = (u8)(u64temp>>40);
+ netdev->dev_addr[1] = (u8)(u64temp>>32);
+ netdev->dev_addr[2] = (u8)(u64temp>>24);
+ netdev->dev_addr[3] = (u8)(u64temp>>16);
+ netdev->dev_addr[4] = (u8)(u64temp>>8);
+ netdev->dev_addr[5] = (u8)u64temp;
+
+ netdev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_RXCSUM | NETIF_F_HW_VLAN_CTAG_RX;
+ if ((nesvnic->logical_port < 2) || (nesdev->nesadapter->hw_rev != NE020_REV))
+ netdev->hw_features |= NETIF_F_TSO;
+
+ netdev->features = netdev->hw_features | NETIF_F_HIGHDMA | NETIF_F_HW_VLAN_CTAG_TX;
+ netdev->hw_features |= NETIF_F_LRO;
+
+ nes_debug(NES_DBG_INIT, "nesvnic = %p, reported features = 0x%lX, QPid = %d,"
+ " nic_index = %d, logical_port = %d, mac_index = %d.\n",
+ nesvnic, (unsigned long)netdev->features, nesvnic->nic.qp_id,
+ nesvnic->nic_index, nesvnic->logical_port, nesdev->mac_index);
+
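+ /* Assign the NIC QP indices this netdev may use; 0xf marks an unused slot */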
+ if (nesvnic->nesdev->nesadapter->port_count == 1 &&
+ nesvnic->nesdev->nesadapter->adapter_fcn_count == 1) {
+
+ nesvnic->qp_nic_index[0] = nesvnic->nic_index;
+ nesvnic->qp_nic_index[1] = nesvnic->nic_index + 1;
+ if (nes_drv_opt & NES_DRV_OPT_DUAL_LOGICAL_PORT) {
+ nesvnic->qp_nic_index[2] = 0xf;
+ nesvnic->qp_nic_index[3] = 0xf;
+ } else {
+ nesvnic->qp_nic_index[2] = nesvnic->nic_index + 2;
+ nesvnic->qp_nic_index[3] = nesvnic->nic_index + 3;
+ }
+ } else {
+ if (nesvnic->nesdev->nesadapter->port_count == 2 ||
+ (nesvnic->nesdev->nesadapter->port_count == 1 &&
+ nesvnic->nesdev->nesadapter->adapter_fcn_count == 2)) {
+ nesvnic->qp_nic_index[0] = nesvnic->nic_index;
+ nesvnic->qp_nic_index[1] = nesvnic->nic_index
+ + 2;
+ nesvnic->qp_nic_index[2] = 0xf;
+ nesvnic->qp_nic_index[3] = 0xf;
+ } else {
+ nesvnic->qp_nic_index[0] = nesvnic->nic_index;
+ nesvnic->qp_nic_index[1] = 0xf;
+ nesvnic->qp_nic_index[2] = 0xf;
+ nesvnic->qp_nic_index[3] = 0xf;
+ }
+ }
+ nesvnic->next_qp_nic_index = 0;
+
+ if (nesdev->netdev_count == 0) {
+ nesvnic->rdma_enabled = 1;
+ } else {
+ nesvnic->rdma_enabled = 0;
+ }
+ nesvnic->nic_cq.cq_number = nesvnic->nic.qp_id;
+ init_timer(&nesvnic->event_timer);
+ nesvnic->event_timer.function = NULL;
+ spin_lock_init(&nesvnic->tx_lock);
+ spin_lock_init(&nesvnic->port_ibevent_lock);
+ nesdev->netdev[nesdev->netdev_count] = netdev;
+
+ nes_debug(NES_DBG_INIT, "Adding nesvnic (%p) to the adapters nesvnic_list for MAC%d.\n",
+ nesvnic, nesdev->mac_index);
+ list_add_tail(&nesvnic->list, &nesdev->nesadapter->nesvnic_list[nesdev->mac_index]);
+
+ if ((nesdev->netdev_count == 0) &&
+ ((PCI_FUNC(nesdev->pcidev->devfn) == nesdev->mac_index) ||
+ ((phy_type == NES_PHY_TYPE_PUMA_1G) &&
+ (((PCI_FUNC(nesdev->pcidev->devfn) == 1) && (nesdev->mac_index == 2)) ||
+ ((PCI_FUNC(nesdev->pcidev->devfn) == 2) && (nesdev->mac_index == 1)))))) {
+ u32 u32temp;
+ u32 link_mask = 0;
+ u32 link_val = 0;
+ u16 temp_phy_data;
+ u16 phy_data = 0;
+ unsigned long flags;
+
+ u32temp = nes_read_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 +
+ (0x200 * (nesdev->mac_index & 1)));
+ if (phy_type != NES_PHY_TYPE_PUMA_1G) {
+ u32temp |= 0x00200000;
+ nes_write_indexed(nesdev, NES_IDX_PHY_PCS_CONTROL_STATUS0 +
+ (0x200 * (nesdev->mac_index & 1)), u32temp);
+ }
+
+ /* Check and set linkup here. This is for back to back */
+ /* configuration where second port won't get link interrupt */
+ switch (phy_type) {
+ case NES_PHY_TYPE_PUMA_1G:
+ if (nesdev->mac_index < 2) {
+ link_mask = 0x01010000;
+ link_val = 0x01010000;
+ } else {
+ link_mask = 0x02020000;
+ link_val = 0x02020000;
+ }
+ break;
+ case NES_PHY_TYPE_SFP_D:
+ spin_lock_irqsave(&nesdev->nesadapter->phy_lock, flags);
+ nes_read_10G_phy_reg(nesdev,
+ nesdev->nesadapter->phy_index[nesdev->mac_index],
+ 1, 0x9003);
+ temp_phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+ nes_read_10G_phy_reg(nesdev,
+ nesdev->nesadapter->phy_index[nesdev->mac_index],
+ 3, 0x0021);
+ nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+ nes_read_10G_phy_reg(nesdev,
+ nesdev->nesadapter->phy_index[nesdev->mac_index],
+ 3, 0x0021);
+ phy_data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+ spin_unlock_irqrestore(&nesdev->nesadapter->phy_lock, flags);
+ phy_data = (!temp_phy_data && (phy_data == 0x8000)) ? 0x4 : 0x0;
+ break;
+ default:
+ link_mask = 0x0f1f0000;
+ link_val = 0x0f0f0000;
+ break;
+ }
+
+ u32temp = nes_read_indexed(nesdev,
+ NES_IDX_PHY_PCS_CONTROL_STATUS0 +
+ (0x200 * (nesdev->mac_index & 1)));
+
+ if (phy_type == NES_PHY_TYPE_SFP_D) {
+ if (phy_data & 0x0004)
+ nesvnic->linkup = 1;
+ } else {
+ if ((u32temp & link_mask) == link_val)
+ nesvnic->linkup = 1;
+ }
+
+ /* clear the MAC interrupt status, assumes direct logical to physical mapping */
+ u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (0x200 * nesdev->mac_index));
+ nes_debug(NES_DBG_INIT, "Phy interrupt status = 0x%X.\n", u32temp);
+ nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS + (0x200 * nesdev->mac_index), u32temp);
+
+ nes_init_phy(nesdev);
+ }
+
+ nes_vlan_mode(netdev, nesdev, netdev->features);
+
+ return netdev;
+}
+
+
+/**
+ * nes_netdev_destroy - destroy network device structure
+ */
+void nes_netdev_destroy(struct net_device *netdev)
+{
+ struct nes_vnic *nesvnic = netdev_priv(netdev);
+
+ /* make sure 'stop' method is called by Linux stack */
+ /* nes_netdev_stop(netdev); */
+
+ list_del(&nesvnic->list);
+
+ if (nesvnic->of_device_registered) {
+ nes_destroy_ofa_device(nesvnic->nesibdev);
+ }
+
+ free_netdev(netdev);
+}
+
+
+/**
+ * nes_nic_cm_xmit -- CM calls this to send out packets
+ */
+int nes_nic_cm_xmit(struct sk_buff *skb, struct net_device *netdev)
+{
+ int ret;
+
+ skb->dev = netdev;
+ ret = dev_queue_xmit(skb);
+ if (ret) {
+ nes_debug(NES_DBG_CM, "Bad return code from dev_queue_xmit %d\n", ret);
+ }
+
+ return ret;
+}
diff --git a/drivers/infiniband/hw/nes/nes_user.h b/drivers/infiniband/hw/nes/nes_user.h
new file mode 100644
index 000000000..529c421bb
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_user.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef NES_USER_H
+#define NES_USER_H
+
+#include <linux/types.h>
+
+#define NES_ABI_USERSPACE_VER 2
+#define NES_ABI_KERNEL_VER 2
+
+/*
+ * Make sure that all structs defined in this file remain laid out so
+ * that they pack the same way on 32-bit and 64-bit architectures (to
+ * avoid incompatibility between 32-bit userspace and 64-bit kernels).
+ * In particular do not use pointer types -- pass pointers in __u64
+ * instead.
+ */
+
+struct nes_alloc_ucontext_req {
+ __u32 reserved32;
+ __u8 userspace_ver;
+ __u8 reserved8[3];
+};
+
+struct nes_alloc_ucontext_resp {
+ __u32 max_pds; /* maximum pds allowed for this user process */
+ __u32 max_qps; /* maximum qps allowed for this user process */
+ __u32 wq_size; /* size of the WQs (sq+rq) allocated to the mmaped area */
+ __u8 virtwq; /* flag to indicate if virtual WQ are to be used or not */
+ __u8 kernel_ver;
+ __u8 reserved[2];
+};
+
+struct nes_alloc_pd_resp {
+ __u32 pd_id;
+ __u32 mmap_db_index;
+};
+
+struct nes_create_cq_req {
+ __u64 user_cq_buffer;
+ __u32 mcrqf;
+ __u8 reserved[4];
+};
+
+struct nes_create_qp_req {
+ __u64 user_wqe_buffers;
+ __u64 user_qp_buffer;
+};
+
+enum iwnes_memreg_type {
+ IWNES_MEMREG_TYPE_MEM = 0x0000,
+ IWNES_MEMREG_TYPE_QP = 0x0001,
+ IWNES_MEMREG_TYPE_CQ = 0x0002,
+ IWNES_MEMREG_TYPE_MW = 0x0003,
+ IWNES_MEMREG_TYPE_FMR = 0x0004,
+ IWNES_MEMREG_TYPE_FMEM = 0x0005,
+};
+
+struct nes_mem_reg_req {
+ __u32 reg_type; /* indicates if id is memory, QP or CQ */
+ __u32 reserved;
+};
+
+struct nes_create_cq_resp {
+ __u32 cq_id;
+ __u32 cq_size;
+ __u32 mmap_db_index;
+ __u32 reserved;
+};
+
+struct nes_create_qp_resp {
+ __u32 qp_id;
+ __u32 actual_sq_size;
+ __u32 actual_rq_size;
+ __u32 mmap_sq_db_index;
+ __u32 mmap_rq_db_index;
+ __u32 nes_drv_opt;
+};
+
+#endif /* NES_USER_H */
diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c
new file mode 100644
index 000000000..2042c0f29
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_utils.c
@@ -0,0 +1,972 @@
+/*
+ * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/mii.h>
+#include <linux/if_vlan.h>
+#include <linux/slab.h>
+#include <linux/crc32.h>
+#include <linux/in.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/init.h>
+
+#include <asm/io.h>
+#include <asm/irq.h>
+#include <asm/byteorder.h>
+
+#include "nes.h"
+
+static u16 nes_read16_eeprom(void __iomem *addr, u16 offset);
+
+u32 mh_detected;
+u32 mh_pauses_sent;
+
+static u32 nes_set_pau(struct nes_device *nesdev)
+{
+ u32 ret = 0;
+ u32 counter;
+
+ nes_write_indexed(nesdev, NES_IDX_GPR2, NES_ENABLE_PAU);
+ nes_write_indexed(nesdev, NES_IDX_GPR_TRIGGER, 1);
+
+ for (counter = 0; counter < NES_PAU_COUNTER; counter++) {
+ udelay(30);
+ if (!nes_read_indexed(nesdev, NES_IDX_GPR2)) {
+ printk(KERN_INFO PFX "PAU is supported.\n");
+ break;
+ }
+ nes_write_indexed(nesdev, NES_IDX_GPR_TRIGGER, 1);
+ }
+ if (counter == NES_PAU_COUNTER) {
+ printk(KERN_INFO PFX "PAU is not supported.\n");
+ return -EPERM;
+ }
+ return ret;
+}
+
+/**
+ * nes_read_eeprom_values -
+ */
+int nes_read_eeprom_values(struct nes_device *nesdev, struct nes_adapter *nesadapter)
+{
+ u32 mac_addr_low;
+ u16 mac_addr_high;
+ u16 eeprom_data;
+ u16 eeprom_offset;
+ u16 next_section_address;
+ u16 sw_section_ver;
+ u8 major_ver = 0;
+ u8 minor_ver = 0;
+
+ /* TODO: deal with EEPROM endian issues */
+ if (nesadapter->firmware_eeprom_offset == 0) {
+ /* Read the EEPROM Parameters */
+ eeprom_data = nes_read16_eeprom(nesdev->regs, 0);
+ nes_debug(NES_DBG_HW, "EEPROM Offset 0 = 0x%04X\n", eeprom_data);
+ eeprom_offset = 2 + (((eeprom_data & 0x007f) << 3) <<
+ ((eeprom_data & 0x0080) >> 7));
+ nes_debug(NES_DBG_HW, "Firmware Offset = 0x%04X\n", eeprom_offset);
+ nesadapter->firmware_eeprom_offset = eeprom_offset;
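+ /* Section signatures are two ASCII characters, e.g. 0x5746 == "WF" */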
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 4);
+ if (eeprom_data != 0x5746) {
+ nes_debug(NES_DBG_HW, "Not a valid Firmware Image = 0x%04X\n", eeprom_data);
+ return -1;
+ }
+
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
+ nes_debug(NES_DBG_HW, "EEPROM Offset %u = 0x%04X\n",
+ eeprom_offset + 2, eeprom_data);
+ eeprom_offset += ((eeprom_data & 0x00ff) << 3) << ((eeprom_data & 0x0100) >> 8);
+ nes_debug(NES_DBG_HW, "Software Offset = 0x%04X\n", eeprom_offset);
+ nesadapter->software_eeprom_offset = eeprom_offset;
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 4);
+ if (eeprom_data != 0x5753) {
+ printk("Not a valid Software Image = 0x%04X\n", eeprom_data);
+ return -1;
+ }
+ sw_section_ver = nes_read16_eeprom(nesdev->regs, nesadapter->software_eeprom_offset + 6);
+ nes_debug(NES_DBG_HW, "Software section version number = 0x%04X\n",
+ sw_section_ver);
+
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
+ nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n",
+ eeprom_offset + 2, eeprom_data);
+ next_section_address = eeprom_offset + (((eeprom_data & 0x00ff) << 3) <<
+ ((eeprom_data & 0x0100) >> 8));
+ eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
+ if (eeprom_data != 0x414d) {
+ nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x414d but was 0x%04X\n",
+ eeprom_data);
+ goto no_fw_rev;
+ }
+ eeprom_offset = next_section_address;
+
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
+ nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n",
+ eeprom_offset + 2, eeprom_data);
+ next_section_address = eeprom_offset + (((eeprom_data & 0x00ff) << 3) <<
+ ((eeprom_data & 0x0100) >> 8));
+ eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
+ if (eeprom_data != 0x4f52) {
+ nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x4f52 but was 0x%04X\n",
+ eeprom_data);
+ goto no_fw_rev;
+ }
+ eeprom_offset = next_section_address;
+
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
+ nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n",
+ eeprom_offset + 2, eeprom_data);
+ next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3);
+ eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
+ if (eeprom_data != 0x5746) {
+ nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x5746 but was 0x%04X\n",
+ eeprom_data);
+ goto no_fw_rev;
+ }
+ eeprom_offset = next_section_address;
+
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
+ nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n",
+ eeprom_offset + 2, eeprom_data);
+ next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3);
+ eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
+ if (eeprom_data != 0x5753) {
+ nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x5753 but was 0x%04X\n",
+ eeprom_data);
+ goto no_fw_rev;
+ }
+ eeprom_offset = next_section_address;
+
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
+ nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n",
+ eeprom_offset + 2, eeprom_data);
+ next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3);
+ eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
+ if (eeprom_data != 0x414d) {
+ nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x414d but was 0x%04X\n",
+ eeprom_data);
+ goto no_fw_rev;
+ }
+ eeprom_offset = next_section_address;
+
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset + 2);
+ nes_debug(NES_DBG_HW, "EEPROM Offset %u (next section) = 0x%04X\n",
+ eeprom_offset + 2, eeprom_data);
+ next_section_address = eeprom_offset + ((eeprom_data & 0x00ff) << 3);
+ eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 4);
+ if (eeprom_data != 0x464e) {
+ nes_debug(NES_DBG_HW, "EEPROM Changed offset should be 0x464e but was 0x%04X\n",
+ eeprom_data);
+ goto no_fw_rev;
+ }
+ eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 8);
+ printk(PFX "Firmware version %u.%u\n", (u8)(eeprom_data>>8), (u8)eeprom_data);
+ major_ver = (u8)(eeprom_data >> 8);
+ minor_ver = (u8)(eeprom_data);
+
+ if (nes_drv_opt & NES_DRV_OPT_DISABLE_VIRT_WQ) {
+ nes_debug(NES_DBG_HW, "Virtual WQs have been disabled\n");
+ } else if (((major_ver == 2) && (minor_ver > 21)) || ((major_ver > 2) && (major_ver != 255))) {
+ nesadapter->virtwq = 1;
+ }
+ if (((major_ver == 3) && (minor_ver >= 16)) || (major_ver > 3))
+ nesadapter->send_term_ok = 1;
+
+ if (nes_drv_opt & NES_DRV_OPT_ENABLE_PAU) {
+ if (!nes_set_pau(nesdev))
+ nesadapter->allow_unaligned_fpdus = 1;
+ }
+
+ nesadapter->firmware_version = (((u32)(u8)(eeprom_data>>8)) << 16) +
+ (u32)((u8)eeprom_data);
+
+ eeprom_data = nes_read16_eeprom(nesdev->regs, next_section_address + 10);
+ printk(PFX "EEPROM version %u.%u\n", (u8)(eeprom_data>>8), (u8)eeprom_data);
+ nesadapter->eeprom_version = (((u32)(u8)(eeprom_data>>8)) << 16) +
+ (u32)((u8)eeprom_data);
+
+no_fw_rev:
+ /* eeprom is valid */
+ eeprom_offset = nesadapter->software_eeprom_offset;
+ eeprom_offset += 8;
+ nesadapter->netdev_max = (u8)nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ eeprom_offset += 2;
+ mac_addr_high = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ eeprom_offset += 2;
+ mac_addr_low = (u32)nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ eeprom_offset += 2;
+ mac_addr_low <<= 16;
+ mac_addr_low += (u32)nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ nes_debug(NES_DBG_HW, "Base MAC Address = 0x%04X%08X\n",
+ mac_addr_high, mac_addr_low);
+ nes_debug(NES_DBG_HW, "MAC Address count = %u\n", nesadapter->netdev_max);
+
+ nesadapter->mac_addr_low = mac_addr_low;
+ nesadapter->mac_addr_high = mac_addr_high;
+
+ /* Read the Phy Type array */
+ eeprom_offset += 10;
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ nesadapter->phy_type[0] = (u8)(eeprom_data >> 8);
+ nesadapter->phy_type[1] = (u8)eeprom_data;
+
+ /* Read the port array */
+ eeprom_offset += 2;
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ nesadapter->phy_type[2] = (u8)(eeprom_data >> 8);
+ nesadapter->phy_type[3] = (u8)eeprom_data;
+ /* port_count is set by soft reset reg */
+ nes_debug(NES_DBG_HW, "port_count = %u, port 0 -> %u, port 1 -> %u,"
+ " port 2 -> %u, port 3 -> %u\n",
+ nesadapter->port_count,
+ nesadapter->phy_type[0], nesadapter->phy_type[1],
+ nesadapter->phy_type[2], nesadapter->phy_type[3]);
+
+ /* Read PD config array */
+ eeprom_offset += 10;
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ nesadapter->pd_config_size[0] = eeprom_data;
+ eeprom_offset += 2;
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ nesadapter->pd_config_base[0] = eeprom_data;
+ nes_debug(NES_DBG_HW, "PD0 config, size=0x%04x, base=0x%04x\n",
+ nesadapter->pd_config_size[0], nesadapter->pd_config_base[0]);
+
+ eeprom_offset += 2;
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ nesadapter->pd_config_size[1] = eeprom_data;
+ eeprom_offset += 2;
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ nesadapter->pd_config_base[1] = eeprom_data;
+ nes_debug(NES_DBG_HW, "PD1 config, size=0x%04x, base=0x%04x\n",
+ nesadapter->pd_config_size[1], nesadapter->pd_config_base[1]);
+
+ eeprom_offset += 2;
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ nesadapter->pd_config_size[2] = eeprom_data;
+ eeprom_offset += 2;
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ nesadapter->pd_config_base[2] = eeprom_data;
+ nes_debug(NES_DBG_HW, "PD2 config, size=0x%04x, base=0x%04x\n",
+ nesadapter->pd_config_size[2], nesadapter->pd_config_base[2]);
+
+ eeprom_offset += 2;
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ nesadapter->pd_config_size[3] = eeprom_data;
+ eeprom_offset += 2;
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ nesadapter->pd_config_base[3] = eeprom_data;
+ nes_debug(NES_DBG_HW, "PD3 config, size=0x%04x, base=0x%04x\n",
+ nesadapter->pd_config_size[3], nesadapter->pd_config_base[3]);
+
+ /* Read Rx Pool Size */
+ eeprom_offset += 22; /* 46 */
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ eeprom_offset += 2;
+ nesadapter->rx_pool_size = (((u32)eeprom_data) << 16) +
+ nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ nes_debug(NES_DBG_HW, "rx_pool_size = 0x%08X\n", nesadapter->rx_pool_size);
+
+ eeprom_offset += 2;
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ eeprom_offset += 2;
+ nesadapter->tx_pool_size = (((u32)eeprom_data) << 16) +
+ nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ nes_debug(NES_DBG_HW, "tx_pool_size = 0x%08X\n", nesadapter->tx_pool_size);
+
+ eeprom_offset += 2;
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ eeprom_offset += 2;
+ nesadapter->rx_threshold = (((u32)eeprom_data) << 16) +
+ nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ nes_debug(NES_DBG_HW, "rx_threshold = 0x%08X\n", nesadapter->rx_threshold);
+
+ eeprom_offset += 2;
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ eeprom_offset += 2;
+ nesadapter->tcp_timer_core_clk_divisor = (((u32)eeprom_data) << 16) +
+ nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ nes_debug(NES_DBG_HW, "tcp_timer_core_clk_divisor = 0x%08X\n",
+ nesadapter->tcp_timer_core_clk_divisor);
+
+ eeprom_offset += 2;
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ eeprom_offset += 2;
+ nesadapter->iwarp_config = (((u32)eeprom_data) << 16) +
+ nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ nes_debug(NES_DBG_HW, "iwarp_config = 0x%08X\n", nesadapter->iwarp_config);
+
+ eeprom_offset += 2;
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ eeprom_offset += 2;
+ nesadapter->cm_config = (((u32)eeprom_data) << 16) +
+ nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ nes_debug(NES_DBG_HW, "cm_config = 0x%08X\n", nesadapter->cm_config);
+
+ eeprom_offset += 2;
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ eeprom_offset += 2;
+ nesadapter->sws_timer_config = (((u32)eeprom_data) << 16) +
+ nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ nes_debug(NES_DBG_HW, "sws_timer_config = 0x%08X\n", nesadapter->sws_timer_config);
+
+ eeprom_offset += 2;
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ eeprom_offset += 2;
+ nesadapter->tcp_config1 = (((u32)eeprom_data) << 16) +
+ nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ nes_debug(NES_DBG_HW, "tcp_config1 = 0x%08X\n", nesadapter->tcp_config1);
+
+ eeprom_offset += 2;
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ eeprom_offset += 2;
+ nesadapter->wqm_wat = (((u32)eeprom_data) << 16) +
+ nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ nes_debug(NES_DBG_HW, "wqm_wat = 0x%08X\n", nesadapter->wqm_wat);
+
+ eeprom_offset += 2;
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ eeprom_offset += 2;
+ nesadapter->core_clock = (((u32)eeprom_data) << 16) +
+ nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ nes_debug(NES_DBG_HW, "core_clock = 0x%08X\n", nesadapter->core_clock);
+
+ if ((sw_section_ver) && (nesadapter->hw_rev != NE020_REV)) {
+ eeprom_offset += 2;
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ nesadapter->phy_index[0] = (eeprom_data & 0xff00)>>8;
+ nesadapter->phy_index[1] = eeprom_data & 0x00ff;
+ eeprom_offset += 2;
+ eeprom_data = nes_read16_eeprom(nesdev->regs, eeprom_offset);
+ nesadapter->phy_index[2] = (eeprom_data & 0xff00)>>8;
+ nesadapter->phy_index[3] = eeprom_data & 0x00ff;
+ } else {
+ nesadapter->phy_index[0] = 4;
+ nesadapter->phy_index[1] = 5;
+ nesadapter->phy_index[2] = 6;
+ nesadapter->phy_index[3] = 7;
+ }
+ nes_debug(NES_DBG_HW, "Phy address map = 0 > %u, 1 > %u, 2 > %u, 3 > %u\n",
+ nesadapter->phy_index[0], nesadapter->phy_index[1],
+ nesadapter->phy_index[2], nesadapter->phy_index[3]);
+ }
+
+ return 0;
+}
+
+
+/**
+ * nes_read16_eeprom
+ */
+static u16 nes_read16_eeprom(void __iomem *addr, u16 offset)
+{
+ writel(NES_EEPROM_READ_REQUEST + (offset >> 1),
+ (void __iomem *)addr + NES_EEPROM_COMMAND);
+
+ do {
+ } while (readl((void __iomem *)addr + NES_EEPROM_COMMAND) &
+ NES_EEPROM_READ_REQUEST);
+
+ return readw((void __iomem *)addr + NES_EEPROM_DATA);
+}
+
+
+/**
+ * nes_write_1G_phy_reg
+ */
+void nes_write_1G_phy_reg(struct nes_device *nesdev, u8 phy_reg, u8 phy_addr, u16 data)
+{
+ u32 u32temp;
+ u32 counter;
+
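+ /* MDIO control word: command bits, write data in the low 16 bits,
+ * register number at bit 18, PHY address at bit 23 */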
+ nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
+ 0x50020000 | data | ((u32)phy_reg << 18) | ((u32)phy_addr << 23));
+ for (counter = 0; counter < 100 ; counter++) {
+ udelay(30);
+ u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
+ if (u32temp & 1) {
+ /* nes_debug(NES_DBG_PHY, "Phy interrupt status = 0x%X.\n", u32temp); */
+ nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
+ break;
+ }
+ }
+ if (!(u32temp & 1))
+ nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
+ u32temp);
+}
+
+
+/**
+ * nes_read_1G_phy_reg
+ * Issues the MDIO read and returns the register value through *data.
+ */
+void nes_read_1G_phy_reg(struct nes_device *nesdev, u8 phy_reg, u8 phy_addr, u16 *data)
+{
+ u32 u32temp;
+ u32 counter;
+
+ /* nes_debug(NES_DBG_PHY, "phy addr = %d, mac_index = %d\n",
+ phy_addr, nesdev->mac_index); */
+
+ nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
+ 0x60020000 | ((u32)phy_reg << 18) | ((u32)phy_addr << 23));
+ for (counter = 0; counter < 100 ; counter++) {
+ udelay(30);
+ u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
+ if (u32temp & 1) {
+ /* nes_debug(NES_DBG_PHY, "Phy interrupt status = 0x%X.\n", u32temp); */
+ nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
+ break;
+ }
+ }
+ if (!(u32temp & 1)) {
+ nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
+ u32temp);
+ *data = 0xffff;
+ } else {
+ *data = (u16)nes_read_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL);
+ }
+}
+
+
+/**
+ * nes_write_10G_phy_reg
+ */
+void nes_write_10G_phy_reg(struct nes_device *nesdev, u16 phy_addr, u8 dev_addr, u16 phy_reg,
+ u16 data)
+{
+ u32 port_addr;
+ u32 u32temp;
+ u32 counter;
+
+ port_addr = phy_addr;
+
+ /* set address */
+ nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
+ 0x00020000 | (u32)phy_reg | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23));
+ for (counter = 0; counter < 100 ; counter++) {
+ udelay(30);
+ u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
+ if (u32temp & 1) {
+ nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
+ break;
+ }
+ }
+ if (!(u32temp & 1))
+ nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
+ u32temp);
+
+ /* set data */
+ nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
+ 0x10020000 | (u32)data | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23));
+ for (counter = 0; counter < 100 ; counter++) {
+ udelay(30);
+ u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
+ if (u32temp & 1) {
+ nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
+ break;
+ }
+ }
+ if (!(u32temp & 1))
+ nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
+ u32temp);
+}
+
+
+/**
+ * nes_read_10G_phy_reg
+ * This routine only issues the read, the data must be read
+ * separately.
+ */
+void nes_read_10G_phy_reg(struct nes_device *nesdev, u8 phy_addr, u8 dev_addr, u16 phy_reg)
+{
+ u32 port_addr;
+ u32 u32temp;
+ u32 counter;
+
+ port_addr = phy_addr;
+
+ /* set address */
+ nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
+ 0x00020000 | (u32)phy_reg | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23));
+ for (counter = 0; counter < 100 ; counter++) {
+ udelay(30);
+ u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
+ if (u32temp & 1) {
+ nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
+ break;
+ }
+ }
+ if (!(u32temp & 1))
+ nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
+ u32temp);
+
+ /* issue read */
+ nes_write_indexed(nesdev, NES_IDX_MAC_MDIO_CONTROL,
+ 0x30020000 | (((u32)dev_addr) << 18) | (((u32)port_addr) << 23));
+ for (counter = 0; counter < 100 ; counter++) {
+ udelay(30);
+ u32temp = nes_read_indexed(nesdev, NES_IDX_MAC_INT_STATUS);
+ if (u32temp & 1) {
+ nes_write_indexed(nesdev, NES_IDX_MAC_INT_STATUS, 1);
+ break;
+ }
+ }
+ if (!(u32temp & 1))
+ nes_debug(NES_DBG_PHY, "Phy is not responding. interrupt status = 0x%X.\n",
+ u32temp);
+}
+
+
+/**
+ * nes_get_cqp_request
+ */
+struct nes_cqp_request *nes_get_cqp_request(struct nes_device *nesdev)
+{
+ unsigned long flags;
+ struct nes_cqp_request *cqp_request = NULL;
+
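+ /* Try the preallocated free list first; fall back to a dynamic GFP_ATOMIC allocation */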
+ if (!list_empty(&nesdev->cqp_avail_reqs)) {
+ spin_lock_irqsave(&nesdev->cqp.lock, flags);
+ if (!list_empty(&nesdev->cqp_avail_reqs)) {
+ cqp_request = list_entry(nesdev->cqp_avail_reqs.next,
+ struct nes_cqp_request, list);
+ list_del_init(&cqp_request->list);
+ }
+ spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+ }
+ if (cqp_request == NULL) {
+ cqp_request = kzalloc(sizeof(struct nes_cqp_request), GFP_ATOMIC);
+ if (cqp_request) {
+ cqp_request->dynamic = 1;
+ INIT_LIST_HEAD(&cqp_request->list);
+ }
+ }
+
+ if (cqp_request) {
+ init_waitqueue_head(&cqp_request->waitq);
+ cqp_request->waiting = 0;
+ cqp_request->request_done = 0;
+ cqp_request->callback = 0;
+ nes_debug(NES_DBG_CQP, "Got cqp request %p from the available list\n",
+ cqp_request);
+ } else
+ printk(KERN_ERR PFX "%s: Could not allocated a CQP request.\n",
+ __func__);
+
+ return cqp_request;
+}
+
+void nes_free_cqp_request(struct nes_device *nesdev,
+ struct nes_cqp_request *cqp_request)
+{
+ unsigned long flags;
+
+ nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X) freed.\n",
+ cqp_request,
+ le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX]) & 0x3f);
+
+ if (cqp_request->dynamic) {
+ kfree(cqp_request);
+ } else {
+ spin_lock_irqsave(&nesdev->cqp.lock, flags);
+ list_add_tail(&cqp_request->list, &nesdev->cqp_avail_reqs);
+ spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+ }
+}
+
+void nes_put_cqp_request(struct nes_device *nesdev,
+ struct nes_cqp_request *cqp_request)
+{
+ if (atomic_dec_and_test(&cqp_request->refcount))
+ nes_free_cqp_request(nesdev, cqp_request);
+}
+
+
+/**
+ * nes_post_cqp_request
+ */
+void nes_post_cqp_request(struct nes_device *nesdev,
+ struct nes_cqp_request *cqp_request)
+{
+ struct nes_hw_cqp_wqe *cqp_wqe;
+ unsigned long flags;
+ u32 cqp_head;
+ u64 u64temp;
+ u32 opcode;
+ int ctx_index = NES_CQP_WQE_COMP_CTX_LOW_IDX;
+
+ spin_lock_irqsave(&nesdev->cqp.lock, flags);
+
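+ /* Post to the hardware CQP SQ only if a slot is free and no earlier
+ * requests are pending; otherwise add this one to the pending list. */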
+ if (((((nesdev->cqp.sq_tail+(nesdev->cqp.sq_size*2))-nesdev->cqp.sq_head) &
+ (nesdev->cqp.sq_size - 1)) != 1)
+ && (list_empty(&nesdev->cqp_pending_reqs))) {
+ cqp_head = nesdev->cqp.sq_head++;
+ nesdev->cqp.sq_head &= nesdev->cqp.sq_size-1;
+ cqp_wqe = &nesdev->cqp.sq_vbase[cqp_head];
+ memcpy(cqp_wqe, &cqp_request->cqp_wqe, sizeof(*cqp_wqe));
+ opcode = le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX]);
+ if ((opcode & NES_CQP_OPCODE_MASK) == NES_CQP_DOWNLOAD_SEGMENT)
+ ctx_index = NES_CQP_WQE_DL_COMP_CTX_LOW_IDX;
+ barrier();
+ u64temp = (unsigned long)cqp_request;
+ set_wqe_64bit_value(cqp_wqe->wqe_words, ctx_index, u64temp);
+ nes_debug(NES_DBG_CQP, "CQP request (opcode 0x%02X), line 1 = 0x%08X put on CQPs SQ,"
+ " request = %p, cqp_head = %u, cqp_tail = %u, cqp_size = %u,"
+ " waiting = %d, refcount = %d.\n",
+ opcode & NES_CQP_OPCODE_MASK,
+ le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_ID_IDX]), cqp_request,
+ nesdev->cqp.sq_head, nesdev->cqp.sq_tail, nesdev->cqp.sq_size,
+ cqp_request->waiting, atomic_read(&cqp_request->refcount));
+
+ barrier();
+
+ /* Ring doorbell (1 WQE) */
+ nes_write32(nesdev->regs+NES_WQE_ALLOC, 0x01800000 | nesdev->cqp.qp_id);
+
+ barrier();
+ } else {
+ nes_debug(NES_DBG_CQP, "CQP request %p (opcode 0x%02X), line 1 = 0x%08X"
+ " put on the pending queue.\n",
+ cqp_request,
+ le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_OPCODE_IDX])&0x3f,
+ le32_to_cpu(cqp_request->cqp_wqe.wqe_words[NES_CQP_WQE_ID_IDX]));
+ list_add_tail(&cqp_request->list, &nesdev->cqp_pending_reqs);
+ }
+
+ spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+
+ return;
+}
+
+/**
+ * nes_arp_table
+ */
+int nes_arp_table(struct nes_device *nesdev, u32 ip_addr, u8 *mac_addr, u32 action)
+{
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ int arp_index;
+ int err = 0;
+ __be32 tmp_addr;
+
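+ /* Look for an existing ARP table entry for this IP address */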
+ for (arp_index = 0; (u32) arp_index < nesadapter->arp_table_size; arp_index++) {
+ if (nesadapter->arp_table[arp_index].ip_addr == ip_addr)
+ break;
+ }
+
+ if (action == NES_ARP_ADD) {
+ if (arp_index != nesadapter->arp_table_size) {
+ return -1;
+ }
+
+ arp_index = 0;
+ err = nes_alloc_resource(nesadapter, nesadapter->allocated_arps,
+ nesadapter->arp_table_size, (u32 *)&arp_index, &nesadapter->next_arp_index, NES_RESOURCE_ARP);
+ if (err) {
+ nes_debug(NES_DBG_NETDEV, "nes_alloc_resource returned error = %u\n", err);
+ return err;
+ }
+ nes_debug(NES_DBG_NETDEV, "ADD, arp_index=%d\n", arp_index);
+
+ nesadapter->arp_table[arp_index].ip_addr = ip_addr;
+ memcpy(nesadapter->arp_table[arp_index].mac_addr, mac_addr, ETH_ALEN);
+ return arp_index;
+ }
+
+ /* DELETE or RESOLVE */
+ if (arp_index == nesadapter->arp_table_size) {
+ tmp_addr = cpu_to_be32(ip_addr);
+ nes_debug(NES_DBG_NETDEV, "MAC for %pI4 not in ARP table - cannot %s\n",
+ &tmp_addr, action == NES_ARP_RESOLVE ? "resolve" : "delete");
+ return -1;
+ }
+
+ if (action == NES_ARP_RESOLVE) {
+ nes_debug(NES_DBG_NETDEV, "RESOLVE, arp_index=%d\n", arp_index);
+ return arp_index;
+ }
+
+ if (action == NES_ARP_DELETE) {
+ nes_debug(NES_DBG_NETDEV, "DELETE, arp_index=%d\n", arp_index);
+ nesadapter->arp_table[arp_index].ip_addr = 0;
+ memset(nesadapter->arp_table[arp_index].mac_addr, 0x00, ETH_ALEN);
+ nes_free_resource(nesadapter, nesadapter->allocated_arps, arp_index);
+ return arp_index;
+ }
+
+ return -1;
+}
+
+
+/**
+ * nes_mh_fix
+ */
+void nes_mh_fix(unsigned long parm)
+{
+ unsigned long flags;
+ struct nes_device *nesdev = (struct nes_device *)parm;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ struct nes_vnic *nesvnic;
+ u32 used_chunks_tx;
+ u32 temp_used_chunks_tx;
+ u32 temp_last_used_chunks_tx;
+ u32 used_chunks_mask;
+ u32 mac_tx_frames_low;
+ u32 mac_tx_frames_high;
+ u32 mac_tx_pauses;
+ u32 serdes_status;
+ u32 reset_value;
+ u32 tx_control;
+ u32 tx_config;
+ u32 tx_pause_quanta;
+ u32 rx_control;
+ u32 rx_config;
+ u32 mac_exact_match;
+ u32 mpp_debug;
+ u32 i = 0;
+ u32 chunks_tx_progress = 0;
+
+ spin_lock_irqsave(&nesadapter->phy_lock, flags);
+ if ((nesadapter->mac_sw_state[0] != NES_MAC_SW_IDLE) || (nesadapter->mac_link_down[0])) {
+ spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+ goto no_mh_work;
+ }
+ nesadapter->mac_sw_state[0] = NES_MAC_SW_MH;
+ spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
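+ /* One-shot check: if the MAC shows no transmit progress and a forced
+ * pause frame does not go out, reset and reprogram the MAC and SerDes. */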
+ do {
+ mac_tx_frames_low = nes_read_indexed(nesdev, NES_IDX_MAC_TX_FRAMES_LOW);
+ mac_tx_frames_high = nes_read_indexed(nesdev, NES_IDX_MAC_TX_FRAMES_HIGH);
+ mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES);
+ used_chunks_tx = nes_read_indexed(nesdev, NES_IDX_USED_CHUNKS_TX);
+ nesdev->mac_pause_frames_sent += mac_tx_pauses;
+ used_chunks_mask = 0;
+ temp_used_chunks_tx = used_chunks_tx;
+ temp_last_used_chunks_tx = nesdev->last_used_chunks_tx;
+
+ if (nesdev->netdev[0]) {
+ nesvnic = netdev_priv(nesdev->netdev[0]);
+ } else {
+ break;
+ }
+
+ for (i = 0; i < 4; i++) {
+ used_chunks_mask <<= 8;
+ if (nesvnic->qp_nic_index[i] != 0xff) {
+ used_chunks_mask |= 0xff;
+ if ((temp_used_chunks_tx & 0xff) < (temp_last_used_chunks_tx & 0xff))
+ chunks_tx_progress = 1;
+ }
+ temp_used_chunks_tx >>= 8;
+ temp_last_used_chunks_tx >>= 8;
+ }
+ if (mac_tx_frames_low || mac_tx_frames_high ||
+ !(used_chunks_tx & used_chunks_mask) ||
+ !(nesdev->last_used_chunks_tx & used_chunks_mask) ||
+ chunks_tx_progress) {
+ nesdev->last_used_chunks_tx = used_chunks_tx;
+ break;
+ }
+ nesdev->last_used_chunks_tx = used_chunks_tx;
+ barrier();
+
+ nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, 0x00000005);
+ mh_pauses_sent++;
+ mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES);
+ if (mac_tx_pauses) {
+ nesdev->mac_pause_frames_sent += mac_tx_pauses;
+ break;
+ }
+
+ tx_control = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONTROL);
+ tx_config = nes_read_indexed(nesdev, NES_IDX_MAC_TX_CONFIG);
+ tx_pause_quanta = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_QUANTA);
+ rx_control = nes_read_indexed(nesdev, NES_IDX_MAC_RX_CONTROL);
+ rx_config = nes_read_indexed(nesdev, NES_IDX_MAC_RX_CONFIG);
+ mac_exact_match = nes_read_indexed(nesdev, NES_IDX_MAC_EXACT_MATCH_BOTTOM);
+ mpp_debug = nes_read_indexed(nesdev, NES_IDX_MPP_DEBUG);
+
+ /* one last-ditch effort to avoid a false positive */
+ mac_tx_pauses = nes_read_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_FRAMES);
+ if (mac_tx_pauses) {
+ nesdev->last_mac_tx_pauses = nesdev->mac_pause_frames_sent;
+ nes_debug(NES_DBG_HW, "failsafe caught slow outbound pause\n");
+ break;
+ }
+ mh_detected++;
+
+ nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, 0x00000000);
+ nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, 0x00000000);
+ reset_value = nes_read32(nesdev->regs+NES_SOFTWARE_RESET);
+
+ nes_write32(nesdev->regs+NES_SOFTWARE_RESET, reset_value | 0x0000001d);
+
+ while (((nes_read32(nesdev->regs+NES_SOFTWARE_RESET)
+ & 0x00000040) != 0x00000040) && (i++ < 5000)) {
+ /* mdelay(1); */
+ }
+
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_CONTROL0, 0x00000008);
+ serdes_status = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_COMMON_STATUS0);
+
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_EMP0, 0x000bdef7);
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_TX_DRIVE0, 0x9ce73000);
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_MODE0, 0x0ff00000);
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_SIGDET0, 0x00000000);
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_BYPASS0, 0x00000000);
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_LOOPBACK_CONTROL0, 0x00000000);
+ if (nesadapter->OneG_Mode) {
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0182222);
+ } else {
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_CONTROL0, 0xf0042222);
+ }
+ serdes_status = nes_read_indexed(nesdev, NES_IDX_ETH_SERDES_RX_EQ_STATUS0);
+ nes_write_indexed(nesdev, NES_IDX_ETH_SERDES_CDR_CONTROL0, 0x000000ff);
+
+ nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONTROL, tx_control);
+ nes_write_indexed(nesdev, NES_IDX_MAC_TX_CONFIG, tx_config);
+ nes_write_indexed(nesdev, NES_IDX_MAC_TX_PAUSE_QUANTA, tx_pause_quanta);
+ nes_write_indexed(nesdev, NES_IDX_MAC_RX_CONTROL, rx_control);
+ nes_write_indexed(nesdev, NES_IDX_MAC_RX_CONFIG, rx_config);
+ nes_write_indexed(nesdev, NES_IDX_MAC_EXACT_MATCH_BOTTOM, mac_exact_match);
+ nes_write_indexed(nesdev, NES_IDX_MPP_DEBUG, mpp_debug);
+
+ } while (0);
+
+ nesadapter->mac_sw_state[0] = NES_MAC_SW_IDLE;
+no_mh_work:
+ nesdev->nesadapter->mh_timer.expires = jiffies + (HZ/5);
+ add_timer(&nesdev->nesadapter->mh_timer);
+}
+
+/**
+ * nes_clc
+ */
+void nes_clc(unsigned long parm)
+{
+ unsigned long flags;
+ struct nes_device *nesdev = (struct nes_device *)parm;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+
+ spin_lock_irqsave(&nesadapter->phy_lock, flags);
+ nesadapter->link_interrupt_count[0] = 0;
+ nesadapter->link_interrupt_count[1] = 0;
+ nesadapter->link_interrupt_count[2] = 0;
+ nesadapter->link_interrupt_count[3] = 0;
+ spin_unlock_irqrestore(&nesadapter->phy_lock, flags);
+
+ nesadapter->lc_timer.expires = jiffies + 3600 * HZ; /* 1 hour */
+ add_timer(&nesadapter->lc_timer);
+}
+
+
+/**
+ * nes_dump_mem
+ */
+void nes_dump_mem(unsigned int dump_debug_level, void *addr, int length)
+{
+ char xlate[] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
+ 'a', 'b', 'c', 'd', 'e', 'f'};
+ char *ptr;
+ char hex_buf[80];
+ char ascii_buf[20];
+ int num_char;
+ int num_ascii;
+ int num_hex;
+
+ if (!(nes_debug_level & dump_debug_level)) {
+ return;
+ }
+
+ ptr = addr;
+ if (length > 0x100) {
+ nes_debug(dump_debug_level, "Length truncated from %x to %x\n", length, 0x100);
+ length = 0x100;
+ }
+ nes_debug(dump_debug_level, "Address=0x%p, length=0x%x (%d)\n", ptr, length, length);
+
+ memset(ascii_buf, 0, 20);
+ memset(hex_buf, 0, 80);
+
+ num_ascii = 0;
+ num_hex = 0;
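+ /* Build 16-byte lines: hex bytes on the left, printable ASCII on the right */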
+ for (num_char = 0; num_char < length; num_char++) {
+ if (num_ascii == 8) {
+ ascii_buf[num_ascii++] = ' ';
+ hex_buf[num_hex++] = '-';
+ hex_buf[num_hex++] = ' ';
+ }
+
+ if (*ptr < 0x20 || *ptr > 0x7e)
+ ascii_buf[num_ascii++] = '.';
+ else
+ ascii_buf[num_ascii++] = *ptr;
+ hex_buf[num_hex++] = xlate[((*ptr & 0xf0) >> 4)];
+ hex_buf[num_hex++] = xlate[*ptr & 0x0f];
+ hex_buf[num_hex++] = ' ';
+ ptr++;
+
+ if (num_ascii >= 17) {
+ /* output line and reset */
+ nes_debug(dump_debug_level, " %s | %s\n", hex_buf, ascii_buf);
+ memset(ascii_buf, 0, 20);
+ memset(hex_buf, 0, 80);
+ num_ascii = 0;
+ num_hex = 0;
+ }
+ }
+
+ /* output the rest */
+ if (num_ascii) {
+ while (num_ascii < 17) {
+ if (num_ascii == 8) {
+ hex_buf[num_hex++] = ' ';
+ hex_buf[num_hex++] = ' ';
+ }
+ hex_buf[num_hex++] = ' ';
+ hex_buf[num_hex++] = ' ';
+ hex_buf[num_hex++] = ' ';
+ num_ascii++;
+ }
+
+ nes_debug(dump_debug_level, " %s | %s\n", hex_buf, ascii_buf);
+ }
+}
diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c
new file mode 100644
index 000000000..c0d0296e7
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_verbs.c
@@ -0,0 +1,4054 @@
+/*
+ * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/random.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <asm/byteorder.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/iw_cm.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "nes.h"
+
+#include <rdma/ib_umem.h>
+
+atomic_t mod_qp_timouts;
+atomic_t qps_created;
+atomic_t sw_qps_destroyed;
+
+static void nes_unregister_ofa_device(struct nes_ib_device *nesibdev);
+
+/**
+ * nes_alloc_mw
+ */
+static struct ib_mw *nes_alloc_mw(struct ib_pd *ibpd, enum ib_mw_type type)
+{
+ struct nes_pd *nespd = to_nespd(ibpd);
+ struct nes_vnic *nesvnic = to_nesvnic(ibpd->device);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ struct nes_cqp_request *cqp_request;
+ struct nes_mr *nesmr;
+ struct ib_mw *ibmw;
+ struct nes_hw_cqp_wqe *cqp_wqe;
+ int ret;
+ u32 stag;
+ u32 stag_index = 0;
+ u32 next_stag_index = 0;
+ u32 driver_key = 0;
+ u8 stag_key = 0;
+
+ if (type != IB_MW_TYPE_1)
+ return ERR_PTR(-EINVAL);
+
+ get_random_bytes(&next_stag_index, sizeof(next_stag_index));
+ stag_key = (u8)next_stag_index;
+
+ driver_key = 0;
+
+ next_stag_index >>= 8;
+ next_stag_index %= nesadapter->max_mr;
+
+ ret = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs,
+ nesadapter->max_mr, &stag_index, &next_stag_index, NES_RESOURCE_MW);
+ if (ret) {
+ return ERR_PTR(ret);
+ }
+
+ nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL);
+ if (!nesmr) {
+ nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+ return ERR_PTR(-ENOMEM);
+ }
+
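+ /* Compose the STag: resource index in the upper 24 bits plus an 8-bit random key */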
+ stag = stag_index << 8;
+ stag |= driver_key;
+ stag += (u32)stag_key;
+
+ nes_debug(NES_DBG_MR, "Registering STag 0x%08X, index = 0x%08X\n",
+ stag, stag_index);
+
+ /* Register the region with the adapter */
+ cqp_request = nes_get_cqp_request(nesdev);
+ if (cqp_request == NULL) {
+ kfree(nesmr);
+ nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ cqp_request->waiting = 1;
+ cqp_wqe = &cqp_request->cqp_wqe;
+
+ cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] =
+ cpu_to_le32( NES_CQP_ALLOCATE_STAG | NES_CQP_STAG_RIGHTS_REMOTE_READ |
+ NES_CQP_STAG_RIGHTS_REMOTE_WRITE | NES_CQP_STAG_VA_TO |
+ NES_CQP_STAG_REM_ACC_EN);
+
+ nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX, (nespd->pd_id & 0x00007fff));
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag);
+
+ atomic_set(&cqp_request->refcount, 2);
+ nes_post_cqp_request(nesdev, cqp_request);
+
+ /* Wait for CQP */
+ ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
+ NES_EVENT_TIMEOUT);
+ nes_debug(NES_DBG_MR, "Register STag 0x%08X completed, wait_event_timeout ret = %u,"
+ " CQP Major:Minor codes = 0x%04X:0x%04X.\n",
+ stag, ret, cqp_request->major_code, cqp_request->minor_code);
+ if ((!ret) || (cqp_request->major_code)) {
+ nes_put_cqp_request(nesdev, cqp_request);
+ kfree(nesmr);
+ nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+ if (!ret) {
+ return ERR_PTR(-ETIME);
+ } else {
+ return ERR_PTR(-ENOMEM);
+ }
+ }
+ nes_put_cqp_request(nesdev, cqp_request);
+
+ nesmr->ibmw.rkey = stag;
+ nesmr->mode = IWNES_MEMREG_TYPE_MW;
+ ibmw = &nesmr->ibmw;
+ nesmr->pbl_4k = 0;
+ nesmr->pbls_used = 0;
+
+ return ibmw;
+}
+
+
+/**
+ * nes_dealloc_mw
+ */
+static int nes_dealloc_mw(struct ib_mw *ibmw)
+{
+ struct nes_mr *nesmr = to_nesmw(ibmw);
+ struct nes_vnic *nesvnic = to_nesvnic(ibmw->device);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ struct nes_hw_cqp_wqe *cqp_wqe;
+ struct nes_cqp_request *cqp_request;
+ int err = 0;
+ int ret;
+
+ /* Deallocate the window with the adapter */
+ cqp_request = nes_get_cqp_request(nesdev);
+ if (cqp_request == NULL) {
+ nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n");
+ return -ENOMEM;
+ }
+ cqp_request->waiting = 1;
+ cqp_wqe = &cqp_request->cqp_wqe;
+ nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, NES_CQP_DEALLOCATE_STAG);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, ibmw->rkey);
+
+ atomic_set(&cqp_request->refcount, 2);
+ nes_post_cqp_request(nesdev, cqp_request);
+
+ /* Wait for CQP */
+ nes_debug(NES_DBG_MR, "Waiting for deallocate STag 0x%08X to complete.\n",
+ ibmw->rkey);
+ ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done),
+ NES_EVENT_TIMEOUT);
+ nes_debug(NES_DBG_MR, "Deallocate STag completed, wait_event_timeout ret = %u,"
+ " CQP Major:Minor codes = 0x%04X:0x%04X.\n",
+ ret, cqp_request->major_code, cqp_request->minor_code);
+ if (!ret)
+ err = -ETIME;
+ else if (cqp_request->major_code)
+ err = -EIO;
+
+ nes_put_cqp_request(nesdev, cqp_request);
+
+ nes_free_resource(nesadapter, nesadapter->allocated_mrs,
+ (ibmw->rkey & 0x0fffff00) >> 8);
+ kfree(nesmr);
+
+ return err;
+}
+
+
+/**
+ * nes_bind_mw
+ */
+static int nes_bind_mw(struct ib_qp *ibqp, struct ib_mw *ibmw,
+ struct ib_mw_bind *ibmw_bind)
+{
+ u64 u64temp;
+ struct nes_vnic *nesvnic = to_nesvnic(ibqp->device);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ /* struct nes_mr *nesmr = to_nesmw(ibmw); */
+ struct nes_qp *nesqp = to_nesqp(ibqp);
+ struct nes_hw_qp_wqe *wqe;
+ unsigned long flags = 0;
+ u32 head;
+ u32 wqe_misc = 0;
+ u32 qsize;
+
+ if (nesqp->ibqp_state > IB_QPS_RTS)
+ return -EINVAL;
+
+ spin_lock_irqsave(&nesqp->lock, flags);
+
+ head = nesqp->hwqp.sq_head;
+ qsize = nesqp->hwqp.sq_tail;
+
+ /* Check for SQ overflow */
+ if (((head + (2 * qsize) - nesqp->hwqp.sq_tail) % qsize) == (qsize - 1)) {
+ spin_unlock_irqrestore(&nesqp->lock, flags);
+ return -ENOMEM;
+ }
+
+ wqe = &nesqp->hwqp.sq_vbase[head];
+ /* nes_debug(NES_DBG_MR, "processing sq wqe at %p, head = %u.\n", wqe, head); */
+ nes_fill_init_qp_wqe(wqe, nesqp, head);
+ u64temp = ibmw_bind->wr_id;
+ set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX, u64temp);
+ wqe_misc = NES_IWARP_SQ_OP_BIND;
+
+ wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE;
+
+ if (ibmw_bind->send_flags & IB_SEND_SIGNALED)
+ wqe_misc |= NES_IWARP_SQ_WQE_SIGNALED_COMPL;
+
+ if (ibmw_bind->bind_info.mw_access_flags & IB_ACCESS_REMOTE_WRITE)
+ wqe_misc |= NES_CQP_STAG_RIGHTS_REMOTE_WRITE;
+ if (ibmw_bind->bind_info.mw_access_flags & IB_ACCESS_REMOTE_READ)
+ wqe_misc |= NES_CQP_STAG_RIGHTS_REMOTE_READ;
+
+ set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_MISC_IDX, wqe_misc);
+ set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_MR_IDX,
+ ibmw_bind->bind_info.mr->lkey);
+ set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_MW_IDX, ibmw->rkey);
+ set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_LENGTH_LOW_IDX,
+ ibmw_bind->bind_info.length);
+ wqe->wqe_words[NES_IWARP_SQ_BIND_WQE_LENGTH_HIGH_IDX] = 0;
+ u64temp = (u64)ibmw_bind->bind_info.addr;
+ set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_VA_FBO_LOW_IDX, u64temp);
+
+ head++;
+ if (head >= qsize)
+ head = 0;
+
+ nesqp->hwqp.sq_head = head;
+ barrier();
+
+ nes_write32(nesdev->regs+NES_WQE_ALLOC,
+ (1 << 24) | 0x00800000 | nesqp->hwqp.qp_id);
+
+ spin_unlock_irqrestore(&nesqp->lock, flags);
+
+ return 0;
+}
+
+
+/*
+ * nes_alloc_fast_mr
+ */
+static int alloc_fast_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd,
+ u32 stag, u32 page_count)
+{
+ struct nes_hw_cqp_wqe *cqp_wqe;
+ struct nes_cqp_request *cqp_request;
+ unsigned long flags;
+ int ret;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ u32 opcode = 0;
+ u16 major_code;
+ u64 region_length = page_count * PAGE_SIZE;
+
+
+ cqp_request = nes_get_cqp_request(nesdev);
+ if (cqp_request == NULL) {
+ nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n");
+ return -ENOMEM;
+ }
+ nes_debug(NES_DBG_MR, "alloc_fast_reg_mr: page_count = %d, "
+ "region_length = %llu\n",
+ page_count, region_length);
+ cqp_request->waiting = 1;
+ cqp_wqe = &cqp_request->cqp_wqe;
+
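+ /* Reserve one 4K PBL up front; it is returned below if the CQP request fails */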
+ spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+ if (nesadapter->free_4kpbl > 0) {
+ nesadapter->free_4kpbl--;
+ spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+ } else {
+ /* No 4kpbl's available: */
+ spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+ nes_debug(NES_DBG_MR, "Out of Pbls\n");
+ nes_free_cqp_request(nesdev, cqp_request);
+ return -ENOMEM;
+ }
+
+ opcode = NES_CQP_ALLOCATE_STAG | NES_CQP_STAG_MR |
+ NES_CQP_STAG_PBL_BLK_SIZE | NES_CQP_STAG_VA_TO |
+ NES_CQP_STAG_REM_ACC_EN;
+ /*
+ * The current OFED API does not support the zero-based TO option.
+ * If it is added, the NES_CQP_STAG_VA* option will need to change.
+ * Also, the API cannot create an MR that is restricted to local
+ * access only while disallowing the SQ op from overriding it, so
+ * the remote-access enable must be set here.
+ */
+
+ nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX, 1);
+
+ cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] =
+ cpu_to_le32((u32)(region_length >> 8) & 0xff000000);
+ cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] |=
+ cpu_to_le32(nespd->pd_id & 0x00007fff);
+
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag);
+ set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_VA_LOW_IDX, 0);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_LOW_IDX, 0);
+ set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, 0);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_LEN_IDX, (page_count * 8));
+ cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_STAG_PBL_BLK_SIZE);
+ barrier();
+
+ atomic_set(&cqp_request->refcount, 2);
+ nes_post_cqp_request(nesdev, cqp_request);
+
+ /* Wait for CQP */
+ ret = wait_event_timeout(cqp_request->waitq,
+ (0 != cqp_request->request_done),
+ NES_EVENT_TIMEOUT);
+
+ nes_debug(NES_DBG_MR, "Allocate STag 0x%08X completed, "
+ "wait_event_timeout ret = %u, CQP Major:Minor codes = "
+ "0x%04X:0x%04X.\n", stag, ret, cqp_request->major_code,
+ cqp_request->minor_code);
+ major_code = cqp_request->major_code;
+ nes_put_cqp_request(nesdev, cqp_request);
+
+ if (!ret || major_code) {
+ spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+ nesadapter->free_4kpbl++;
+ spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+ }
+
+ if (!ret)
+ return -ETIME;
+ else if (major_code)
+ return -EIO;
+ return 0;
+}
+
+/*
+ * nes_alloc_fast_reg_mr
+ */
+static struct ib_mr *nes_alloc_fast_reg_mr(struct ib_pd *ibpd, int max_page_list_len)
+{
+ struct nes_pd *nespd = to_nespd(ibpd);
+ struct nes_vnic *nesvnic = to_nesvnic(ibpd->device);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+
+ u32 next_stag_index;
+ u8 stag_key = 0;
+ u32 driver_key = 0;
+ int err = 0;
+ u32 stag_index = 0;
+ struct nes_mr *nesmr;
+ u32 stag;
+ int ret;
+ struct ib_mr *ibmr;
+ /*
+ * Note: we always use a fixed-length PBL that fits in a single 4K
+ * page, so the fast_reg_mr operation always knows the size of the PBL.
+ */
+ if (max_page_list_len > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64)))
+ return ERR_PTR(-E2BIG);
+
+ get_random_bytes(&next_stag_index, sizeof(next_stag_index));
+ stag_key = (u8)next_stag_index;
+ next_stag_index >>= 8;
+ next_stag_index %= nesadapter->max_mr;
+
+ err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs,
+ nesadapter->max_mr, &stag_index,
+ &next_stag_index, NES_RESOURCE_FAST_MR);
+ if (err)
+ return ERR_PTR(err);
+
+ nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL);
+ if (!nesmr) {
+ nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+ return ERR_PTR(-ENOMEM);
+ }
+
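+ /*
+ * STag layout, as composed below: the allocated resource index sits
+ * in bits 31:8 and the 8-bit consumer key in the low byte (driver_key
+ * is always 0 here).
+ */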
+ stag = stag_index << 8;
+ stag |= driver_key;
+ stag += (u32)stag_key;
+
+ nes_debug(NES_DBG_MR, "Allocating STag 0x%08X index = 0x%08X\n",
+ stag, stag_index);
+
+ ret = alloc_fast_reg_mr(nesdev, nespd, stag, max_page_list_len);
+
+ if (ret == 0) {
+ nesmr->ibmr.rkey = stag;
+ nesmr->ibmr.lkey = stag;
+ nesmr->mode = IWNES_MEMREG_TYPE_FMEM;
+ ibmr = &nesmr->ibmr;
+ } else {
+ kfree(nesmr);
+ nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+ ibmr = ERR_PTR(-ENOMEM);
+ }
+ return ibmr;
+}
+
+/*
+ * nes_alloc_fast_reg_page_list
+ */
+static struct ib_fast_reg_page_list *nes_alloc_fast_reg_page_list(
+ struct ib_device *ibdev,
+ int page_list_len)
+{
+ struct nes_vnic *nesvnic = to_nesvnic(ibdev);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct ib_fast_reg_page_list *pifrpl;
+ struct nes_ib_fast_reg_page_list *pnesfrpl;
+
+ if (page_list_len > (NES_4K_PBL_CHUNK_SIZE / sizeof(u64)))
+ return ERR_PTR(-E2BIG);
+ /*
+ * Allocate the nes_ib_fast_reg_page_list structure, which holds the
+ * ib_fast_reg_page_list structure and the PBL table.
+ */
+ pnesfrpl = kmalloc(sizeof(struct nes_ib_fast_reg_page_list) +
+ page_list_len * sizeof(u64), GFP_KERNEL);
+
+ if (!pnesfrpl)
+ return ERR_PTR(-ENOMEM);
+
+ pifrpl = &pnesfrpl->ibfrpl;
+ pifrpl->page_list = &pnesfrpl->pbl;
+ pifrpl->max_page_list_len = page_list_len;
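+ /*
+ * Two copies of the page list end up existing: the pbl[] array
+ * embedded in nes_ib_fast_reg_page_list (what ibfrpl.page_list points
+ * at, filled in by the consumer) and the DMA-coherent WQE PBL
+ * allocated below, which is presumably the copy handed to the
+ * hardware at fast-register time.
+ */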
+ /*
+ * Allocate the WQE PBL
+ */
+ pnesfrpl->nes_wqe_pbl.kva = pci_alloc_consistent(nesdev->pcidev,
+ page_list_len * sizeof(u64),
+ &pnesfrpl->nes_wqe_pbl.paddr);
+
+ if (!pnesfrpl->nes_wqe_pbl.kva) {
+ kfree(pnesfrpl);
+ return ERR_PTR(-ENOMEM);
+ }
+ nes_debug(NES_DBG_MR, "nes_alloc_fast_reg_page_list: nes_frpl = %p, "
+ "ibfrpl = %p, ibfrpl.page_list = %p, pbl.kva = %p, "
+ "pbl.paddr = %llx\n", pnesfrpl, &pnesfrpl->ibfrpl,
+ pnesfrpl->ibfrpl.page_list, pnesfrpl->nes_wqe_pbl.kva,
+ (unsigned long long) pnesfrpl->nes_wqe_pbl.paddr);
+
+ return pifrpl;
+}
+
+/*
+ * nes_free_fast_reg_page_list
+ */
+static void nes_free_fast_reg_page_list(struct ib_fast_reg_page_list *pifrpl)
+{
+ struct nes_vnic *nesvnic = to_nesvnic(pifrpl->device);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_ib_fast_reg_page_list *pnesfrpl;
+
+ pnesfrpl = container_of(pifrpl, struct nes_ib_fast_reg_page_list, ibfrpl);
+ /*
+ * Free the WQE PBL.
+ */
+ pci_free_consistent(nesdev->pcidev,
+ pifrpl->max_page_list_len * sizeof(u64),
+ pnesfrpl->nes_wqe_pbl.kva,
+ pnesfrpl->nes_wqe_pbl.paddr);
+ /*
+ * Free the PBL structure
+ */
+ kfree(pnesfrpl);
+}
+
+/**
+ * nes_query_device
+ */
+static int nes_query_device(struct ib_device *ibdev, struct ib_device_attr *props)
+{
+ struct nes_vnic *nesvnic = to_nesvnic(ibdev);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_ib_device *nesibdev = nesvnic->nesibdev;
+
+ memset(props, 0, sizeof(*props));
+ memcpy(&props->sys_image_guid, nesvnic->netdev->dev_addr, 6);
+
+ props->fw_ver = nesdev->nesadapter->firmware_version;
+ props->device_cap_flags = nesdev->nesadapter->device_cap_flags;
+ props->vendor_id = nesdev->nesadapter->vendor_id;
+ props->vendor_part_id = nesdev->nesadapter->vendor_part_id;
+ props->hw_ver = nesdev->nesadapter->hw_rev;
+ props->max_mr_size = 0x80000000;
+ props->max_qp = nesibdev->max_qp;
+ props->max_qp_wr = nesdev->nesadapter->max_qp_wr - 2;
+ props->max_sge = nesdev->nesadapter->max_sge;
+ props->max_cq = nesibdev->max_cq;
+ props->max_cqe = nesdev->nesadapter->max_cqe;
+ props->max_mr = nesibdev->max_mr;
+ props->max_mw = nesibdev->max_mr;
+ props->max_pd = nesibdev->max_pd;
+ props->max_sge_rd = 1;
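+ /*
+ * max_irrq_wr holds an encoded value (0..3); translate it into the
+ * RDMA-read depths (IRD/ORD) reported to consumers.
+ */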
+ switch (nesdev->nesadapter->max_irrq_wr) {
+ case 0:
+ props->max_qp_rd_atom = 2;
+ break;
+ case 1:
+ props->max_qp_rd_atom = 8;
+ break;
+ case 2:
+ props->max_qp_rd_atom = 32;
+ break;
+ case 3:
+ props->max_qp_rd_atom = 64;
+ break;
+ default:
+ props->max_qp_rd_atom = 0;
+ }
+ props->max_qp_init_rd_atom = props->max_qp_rd_atom;
+ props->atomic_cap = IB_ATOMIC_NONE;
+ props->max_map_per_fmr = 1;
+
+ return 0;
+}
+
+
+/**
+ * nes_query_port
+ */
+static int nes_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props)
+{
+ struct nes_vnic *nesvnic = to_nesvnic(ibdev);
+ struct net_device *netdev = nesvnic->netdev;
+
+ memset(props, 0, sizeof(*props));
+
+ props->max_mtu = IB_MTU_4096;
+
+ if (netdev->mtu >= 4096)
+ props->active_mtu = IB_MTU_4096;
+ else if (netdev->mtu >= 2048)
+ props->active_mtu = IB_MTU_2048;
+ else if (netdev->mtu >= 1024)
+ props->active_mtu = IB_MTU_1024;
+ else if (netdev->mtu >= 512)
+ props->active_mtu = IB_MTU_512;
+ else
+ props->active_mtu = IB_MTU_256;
+
+ props->lid = 1;
+ props->lmc = 0;
+ props->sm_lid = 0;
+ props->sm_sl = 0;
+ if (netif_queue_stopped(netdev))
+ props->state = IB_PORT_DOWN;
+ else if (nesvnic->linkup)
+ props->state = IB_PORT_ACTIVE;
+ else
+ props->state = IB_PORT_DOWN;
+ props->phys_state = 0;
+ props->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_REINIT_SUP |
+ IB_PORT_VENDOR_CLASS_SUP | IB_PORT_BOOT_MGMT_SUP;
+ props->gid_tbl_len = 1;
+ props->pkey_tbl_len = 1;
+ props->qkey_viol_cntr = 0;
+ props->active_width = IB_WIDTH_4X;
+ props->active_speed = IB_SPEED_SDR;
+ props->max_msg_sz = 0x80000000;
+
+ return 0;
+}
+
+
+/**
+ * nes_query_pkey
+ */
+static int nes_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
+{
+ *pkey = 0;
+ return 0;
+}
+
+
+/**
+ * nes_query_gid
+ */
+static int nes_query_gid(struct ib_device *ibdev, u8 port,
+ int index, union ib_gid *gid)
+{
+ struct nes_vnic *nesvnic = to_nesvnic(ibdev);
+
+ memset(&(gid->raw[0]), 0, sizeof(gid->raw));
+ memcpy(&(gid->raw[0]), nesvnic->netdev->dev_addr, 6);
+
+ return 0;
+}
+
+
+/**
+ * nes_alloc_ucontext - Allocate the user context data structure. This keeps track
+ * of all objects associated with a particular user-mode client.
+ */
+static struct ib_ucontext *nes_alloc_ucontext(struct ib_device *ibdev,
+ struct ib_udata *udata)
+{
+ struct nes_vnic *nesvnic = to_nesvnic(ibdev);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ struct nes_alloc_ucontext_req req;
+ struct nes_alloc_ucontext_resp uresp;
+ struct nes_ucontext *nes_ucontext;
+ struct nes_ib_device *nesibdev = nesvnic->nesibdev;
+
+
+ if (ib_copy_from_udata(&req, udata, sizeof(struct nes_alloc_ucontext_req))) {
+ printk(KERN_ERR PFX "Invalid structure size on allocate user context.\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (req.userspace_ver != NES_ABI_USERSPACE_VER) {
+ printk(KERN_ERR PFX "Invalid userspace driver version detected. Detected version %d, should be %d\n",
+ req.userspace_ver, NES_ABI_USERSPACE_VER);
+ return ERR_PTR(-EINVAL);
+ }
+
+
+ memset(&uresp, 0, sizeof uresp);
+
+ uresp.max_qps = nesibdev->max_qp;
+ uresp.max_pds = nesibdev->max_pd;
+ uresp.wq_size = nesdev->nesadapter->max_qp_wr * 2;
+ uresp.virtwq = nesadapter->virtwq;
+ uresp.kernel_ver = NES_ABI_KERNEL_VER;
+
+ nes_ucontext = kzalloc(sizeof *nes_ucontext, GFP_KERNEL);
+ if (!nes_ucontext)
+ return ERR_PTR(-ENOMEM);
+
+ nes_ucontext->nesdev = nesdev;
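+ /*
+ * mmap offset layout (see nes_mmap()): page offsets below
+ * mmap_wq_offset select user doorbell pages (one slot is handed out
+ * per PD in nes_alloc_pd()); offsets at or above it map WQ memory,
+ * and mmap_cq_offset appears to mark where CQ mappings would begin.
+ */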
+ nes_ucontext->mmap_wq_offset = uresp.max_pds;
+ nes_ucontext->mmap_cq_offset = nes_ucontext->mmap_wq_offset +
+ ((sizeof(struct nes_hw_qp_wqe) * uresp.max_qps * 2) + PAGE_SIZE-1) /
+ PAGE_SIZE;
+
+
+ if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) {
+ kfree(nes_ucontext);
+ return ERR_PTR(-EFAULT);
+ }
+
+ INIT_LIST_HEAD(&nes_ucontext->cq_reg_mem_list);
+ INIT_LIST_HEAD(&nes_ucontext->qp_reg_mem_list);
+ atomic_set(&nes_ucontext->usecnt, 1);
+ return &nes_ucontext->ibucontext;
+}
+
+
+/**
+ * nes_dealloc_ucontext
+ */
+static int nes_dealloc_ucontext(struct ib_ucontext *context)
+{
+ /* struct nes_vnic *nesvnic = to_nesvnic(context->device); */
+ /* struct nes_device *nesdev = nesvnic->nesdev; */
+ struct nes_ucontext *nes_ucontext = to_nesucontext(context);
+
+ if (!atomic_dec_and_test(&nes_ucontext->usecnt))
+ return 0;
+ kfree(nes_ucontext);
+ return 0;
+}
+
+
+/**
+ * nes_mmap
+ */
+static int nes_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+ unsigned long index;
+ struct nes_vnic *nesvnic = to_nesvnic(context->device);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ /* struct nes_adapter *nesadapter = nesdev->nesadapter; */
+ struct nes_ucontext *nes_ucontext;
+ struct nes_qp *nesqp;
+
+ nes_ucontext = to_nesucontext(context);
+
+
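+ /*
+ * Two kinds of mappings are served here: offsets at or above
+ * mmap_wq_offset map a QP's work-queue memory, anything below that
+ * maps one of the user doorbell pages.
+ */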
+ if (vma->vm_pgoff >= nes_ucontext->mmap_wq_offset) {
+ index = (vma->vm_pgoff - nes_ucontext->mmap_wq_offset) * PAGE_SIZE;
+ index /= ((sizeof(struct nes_hw_qp_wqe) * nesdev->nesadapter->max_qp_wr * 2) +
+ PAGE_SIZE-1) & (~(PAGE_SIZE-1));
+ if (!test_bit(index, nes_ucontext->allocated_wqs)) {
+ nes_debug(NES_DBG_MMAP, "wq %lu not allocated\n", index);
+ return -EFAULT;
+ }
+ nesqp = nes_ucontext->mmap_nesqp[index];
+ if (nesqp == NULL) {
+ nes_debug(NES_DBG_MMAP, "wq %lu has a NULL QP base.\n", index);
+ return -EFAULT;
+ }
+ if (remap_pfn_range(vma, vma->vm_start,
+ virt_to_phys(nesqp->hwqp.sq_vbase) >> PAGE_SHIFT,
+ vma->vm_end - vma->vm_start,
+ vma->vm_page_prot)) {
+ nes_debug(NES_DBG_MMAP, "remap_pfn_range failed.\n");
+ return -EAGAIN;
+ }
+ vma->vm_private_data = nesqp;
+ return 0;
+ } else {
+ index = vma->vm_pgoff;
+ if (!test_bit(index, nes_ucontext->allocated_doorbells))
+ return -EFAULT;
+
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+ if (io_remap_pfn_range(vma, vma->vm_start,
+ (nesdev->doorbell_start +
+ ((nes_ucontext->mmap_db_index[index] - nesdev->base_doorbell_index) * 4096))
+ >> PAGE_SHIFT, PAGE_SIZE, vma->vm_page_prot))
+ return -EAGAIN;
+ vma->vm_private_data = nes_ucontext;
+ return 0;
+ }
+
+ return -ENOSYS;
+}
+
+
+/**
+ * nes_alloc_pd
+ */
+static struct ib_pd *nes_alloc_pd(struct ib_device *ibdev,
+ struct ib_ucontext *context, struct ib_udata *udata)
+{
+ struct nes_pd *nespd;
+ struct nes_vnic *nesvnic = to_nesvnic(ibdev);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ struct nes_ucontext *nesucontext;
+ struct nes_alloc_pd_resp uresp;
+ u32 pd_num = 0;
+ int err;
+
+ nes_debug(NES_DBG_PD, "nesvnic=%p, netdev=%p %s, ibdev=%p, context=%p, netdev refcnt=%u\n",
+ nesvnic, nesdev->netdev[0], nesdev->netdev[0]->name, ibdev, context,
+ netdev_refcnt_read(nesvnic->netdev));
+
+ err = nes_alloc_resource(nesadapter, nesadapter->allocated_pds,
+ nesadapter->max_pd, &pd_num, &nesadapter->next_pd, NES_RESOURCE_PD);
+ if (err) {
+ return ERR_PTR(err);
+ }
+
+ nespd = kzalloc(sizeof (struct nes_pd), GFP_KERNEL);
+ if (!nespd) {
+ nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ nes_debug(NES_DBG_PD, "Allocating PD (%p) for ib device %s\n",
+ nespd, nesvnic->nesibdev->ibdev.name);
+
+ nespd->pd_id = (pd_num << (PAGE_SHIFT-12)) + nesadapter->base_pd;
+
+ if (context) {
+ nesucontext = to_nesucontext(context);
+ nespd->mmap_db_index = find_next_zero_bit(nesucontext->allocated_doorbells,
+ NES_MAX_USER_DB_REGIONS, nesucontext->first_free_db);
+ nes_debug(NES_DBG_PD, "find_next_zero_bit on doorbells returned %u, mapping pd_id %u.\n",
+ nespd->mmap_db_index, nespd->pd_id);
+ if (nespd->mmap_db_index >= NES_MAX_USER_DB_REGIONS) {
+ nes_debug(NES_DBG_PD, "mmap_db_index > MAX\n");
+ nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num);
+ kfree(nespd);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ uresp.pd_id = nespd->pd_id;
+ uresp.mmap_db_index = nespd->mmap_db_index;
+ if (ib_copy_to_udata(udata, &uresp, sizeof (struct nes_alloc_pd_resp))) {
+ nes_free_resource(nesadapter, nesadapter->allocated_pds, pd_num);
+ kfree(nespd);
+ return ERR_PTR(-EFAULT);
+ }
+
+ set_bit(nespd->mmap_db_index, nesucontext->allocated_doorbells);
+ nesucontext->mmap_db_index[nespd->mmap_db_index] = nespd->pd_id;
+ nesucontext->first_free_db = nespd->mmap_db_index + 1;
+ }
+
+ nes_debug(NES_DBG_PD, "PD%u structure located @%p.\n", nespd->pd_id, nespd);
+ return &nespd->ibpd;
+}
+
+
+/**
+ * nes_dealloc_pd
+ */
+static int nes_dealloc_pd(struct ib_pd *ibpd)
+{
+ struct nes_ucontext *nesucontext;
+ struct nes_pd *nespd = to_nespd(ibpd);
+ struct nes_vnic *nesvnic = to_nesvnic(ibpd->device);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+
+ if ((ibpd->uobject) && (ibpd->uobject->context)) {
+ nesucontext = to_nesucontext(ibpd->uobject->context);
+ nes_debug(NES_DBG_PD, "Clearing bit %u from allocated doorbells\n",
+ nespd->mmap_db_index);
+ clear_bit(nespd->mmap_db_index, nesucontext->allocated_doorbells);
+ nesucontext->mmap_db_index[nespd->mmap_db_index] = 0;
+ if (nesucontext->first_free_db > nespd->mmap_db_index) {
+ nesucontext->first_free_db = nespd->mmap_db_index;
+ }
+ }
+
+ nes_debug(NES_DBG_PD, "Deallocating PD%u structure located @%p.\n",
+ nespd->pd_id, nespd);
+ nes_free_resource(nesadapter, nesadapter->allocated_pds,
+ (nespd->pd_id-nesadapter->base_pd)>>(PAGE_SHIFT-12));
+ kfree(nespd);
+
+ return 0;
+}
+
+
+/**
+ * nes_create_ah
+ */
+static struct ib_ah *nes_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr)
+{
+ return ERR_PTR(-ENOSYS);
+}
+
+
+/**
+ * nes_destroy_ah
+ */
+static int nes_destroy_ah(struct ib_ah *ah)
+{
+ return -ENOSYS;
+}
+
+
+/**
+ * nes_get_encoded_size
+ */
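+/*
+ * Ring sizes the hardware understands (per the checks below): a
+ * requested size is rounded up to 32, 128 or 512 and encoded as 1, 2
+ * or 3; anything larger yields 0, which callers treat as invalid.
+ */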
+static inline u8 nes_get_encoded_size(int *size)
+{
+ u8 encoded_size = 0;
+ if (*size <= 32) {
+ *size = 32;
+ encoded_size = 1;
+ } else if (*size <= 128) {
+ *size = 128;
+ encoded_size = 2;
+ } else if (*size <= 512) {
+ *size = 512;
+ encoded_size = 3;
+ }
+ return encoded_size;
+}
+
+
+
+/**
+ * nes_setup_virt_qp
+ */
+static int nes_setup_virt_qp(struct nes_qp *nesqp, struct nes_pbl *nespbl,
+ struct nes_vnic *nesvnic, int sq_size, int rq_size)
+{
+ unsigned long flags;
+ void *mem;
+ __le64 *pbl = NULL;
+ __le64 *tpbl;
+ __le64 *pblbuffer;
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ u32 pbl_entries;
+ u8 rq_pbl_entries;
+ u8 sq_pbl_entries;
+
+ pbl_entries = nespbl->pbl_size >> 3;
+ nes_debug(NES_DBG_QP, "Userspace PBL, pbl_size=%u, pbl_entries = %d pbl_vbase=%p, pbl_pbase=%lx\n",
+ nespbl->pbl_size, pbl_entries,
+ (void *)nespbl->pbl_vbase,
+ (unsigned long) nespbl->pbl_pbase);
+ pbl = (__le64 *) nespbl->pbl_vbase; /* points to first pbl entry */
+ /* Now set the sq_vbase and rq_vbase addresses: the first PBL entry */
+ /* gives the SQ base, and the RQ entries follow the SQ entries. */
+ rq_pbl_entries = (rq_size * sizeof(struct nes_hw_qp_wqe)) >> 12;
+ sq_pbl_entries = (sq_size * sizeof(struct nes_hw_qp_wqe)) >> 12;
+ nesqp->hwqp.sq_pbase = (le32_to_cpu(((__le32 *)pbl)[0])) | ((u64)((le32_to_cpu(((__le32 *)pbl)[1]))) << 32);
+ if (!nespbl->page) {
+ nes_debug(NES_DBG_QP, "QP nespbl->page is NULL\n");
+ kfree(nespbl);
+ return -ENOMEM;
+ }
+
+ nesqp->hwqp.sq_vbase = kmap(nespbl->page);
+ nesqp->page = nespbl->page;
+ if (!nesqp->hwqp.sq_vbase) {
+ nes_debug(NES_DBG_QP, "QP sq_vbase kmap failed\n");
+ kfree(nespbl);
+ return -ENOMEM;
+ }
+
+ /* To get to the RQ base, skip over the PBL entries used by the SQ. */
+ pbl += sq_pbl_entries;
+ nesqp->hwqp.rq_pbase = (le32_to_cpu(((__le32 *)pbl)[0])) | ((u64)((le32_to_cpu(((__le32 *)pbl)[1]))) << 32);
+ /* nesqp->hwqp.rq_vbase = bus_to_virt(*pbl); */
+ /*nesqp->hwqp.rq_vbase = phys_to_virt(*pbl); */
+
+ nes_debug(NES_DBG_QP, "QP sq_vbase= %p sq_pbase=%lx rq_vbase=%p rq_pbase=%lx\n",
+ nesqp->hwqp.sq_vbase, (unsigned long) nesqp->hwqp.sq_pbase,
+ nesqp->hwqp.rq_vbase, (unsigned long) nesqp->hwqp.rq_pbase);
+ spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+ if (!nesadapter->free_256pbl) {
+ pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase,
+ nespbl->pbl_pbase);
+ spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+ kunmap(nesqp->page);
+ kfree(nespbl);
+ return -ENOMEM;
+ }
+ nesadapter->free_256pbl--;
+ spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+
+ nesqp->pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 256, &nesqp->pbl_pbase);
+ pblbuffer = nesqp->pbl_vbase;
+ if (!nesqp->pbl_vbase) {
+ /* memory allocated during nes_reg_user_mr() */
+ pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase,
+ nespbl->pbl_pbase);
+ kfree(nespbl);
+ spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+ nesadapter->free_256pbl++;
+ spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+ kunmap(nesqp->page);
+ return -ENOMEM;
+ }
+ memset(nesqp->pbl_vbase, 0, 256);
+ /* Fill in the page addresses in the PBL buffer. */
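+ /* SQ page entries go in the second half of the 256-byte PBL (offset
+ * 16 entries) and RQ page entries in the first half, matching the
+ * two copy loops below. */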
+ tpbl = pblbuffer + 16;
+ pbl = (__le64 *)nespbl->pbl_vbase;
+ while (sq_pbl_entries--)
+ *tpbl++ = *pbl++;
+ tpbl = pblbuffer;
+ while (rq_pbl_entries--)
+ *tpbl++ = *pbl++;
+
+ /* done with memory allocated during nes_reg_user_mr() */
+ pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase,
+ nespbl->pbl_pbase);
+ kfree(nespbl);
+
+ nesqp->qp_mem_size =
+ max((u32)sizeof(struct nes_qp_context), ((u32)256)) + 256; /* this is Q2 */
+ /* Round up to a multiple of a page */
+ nesqp->qp_mem_size += PAGE_SIZE - 1;
+ nesqp->qp_mem_size &= ~(PAGE_SIZE - 1);
+
+ mem = pci_alloc_consistent(nesdev->pcidev, nesqp->qp_mem_size,
+ &nesqp->hwqp.q2_pbase);
+
+ if (!mem) {
+ pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase);
+ nesqp->pbl_vbase = NULL;
+ spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+ nesadapter->free_256pbl++;
+ spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+ kunmap(nesqp->page);
+ return -ENOMEM;
+ }
+ nesqp->sq_kmapped = 1;
+ nesqp->hwqp.q2_vbase = mem;
+ mem += 256;
+ memset(nesqp->hwqp.q2_vbase, 0, 256);
+ nesqp->nesqp_context = mem;
+ memset(nesqp->nesqp_context, 0, sizeof(*nesqp->nesqp_context));
+ nesqp->nesqp_context_pbase = nesqp->hwqp.q2_pbase + 256;
+
+ return 0;
+}
+
+
+/**
+ * nes_setup_mmap_qp
+ */
+static int nes_setup_mmap_qp(struct nes_qp *nesqp, struct nes_vnic *nesvnic,
+ int sq_size, int rq_size)
+{
+ void *mem;
+ struct nes_device *nesdev = nesvnic->nesdev;
+
+ nesqp->qp_mem_size = (sizeof(struct nes_hw_qp_wqe) * sq_size) +
+ (sizeof(struct nes_hw_qp_wqe) * rq_size) +
+ max((u32)sizeof(struct nes_qp_context), ((u32)256)) +
+ 256; /* this is Q2 */
+ /* Round up to a multiple of a page */
+ nesqp->qp_mem_size += PAGE_SIZE - 1;
+ nesqp->qp_mem_size &= ~(PAGE_SIZE - 1);
+
+ mem = pci_alloc_consistent(nesdev->pcidev, nesqp->qp_mem_size,
+ &nesqp->hwqp.sq_pbase);
+ if (!mem)
+ return -ENOMEM;
+ nes_debug(NES_DBG_QP, "PCI consistent memory for "
+ "host descriptor rings located @ %p (pa = 0x%08lX.) size = %u.\n",
+ mem, (unsigned long)nesqp->hwqp.sq_pbase, nesqp->qp_mem_size);
+
+ memset(mem, 0, nesqp->qp_mem_size);
+
+ nesqp->hwqp.sq_vbase = mem;
+ mem += sizeof(struct nes_hw_qp_wqe) * sq_size;
+
+ nesqp->hwqp.rq_vbase = mem;
+ nesqp->hwqp.rq_pbase = nesqp->hwqp.sq_pbase +
+ sizeof(struct nes_hw_qp_wqe) * sq_size;
+ mem += sizeof(struct nes_hw_qp_wqe) * rq_size;
+
+ nesqp->hwqp.q2_vbase = mem;
+ nesqp->hwqp.q2_pbase = nesqp->hwqp.rq_pbase +
+ sizeof(struct nes_hw_qp_wqe) * rq_size;
+ mem += 256;
+ memset(nesqp->hwqp.q2_vbase, 0, 256);
+
+ nesqp->nesqp_context = mem;
+ nesqp->nesqp_context_pbase = nesqp->hwqp.q2_pbase + 256;
+ memset(nesqp->nesqp_context, 0, sizeof(*nesqp->nesqp_context));
+ return 0;
+}
+
+
+/**
+ * nes_free_qp_mem - free the QP's pci_alloc_consistent() memory
+ */
+static inline void nes_free_qp_mem(struct nes_device *nesdev,
+ struct nes_qp *nesqp, int virt_wqs)
+{
+ unsigned long flags;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ if (!virt_wqs) {
+ pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size,
+ nesqp->hwqp.sq_vbase, nesqp->hwqp.sq_pbase);
+ } else {
+ spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+ nesadapter->free_256pbl++;
+ spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+ pci_free_consistent(nesdev->pcidev, nesqp->qp_mem_size, nesqp->hwqp.q2_vbase, nesqp->hwqp.q2_pbase);
+ pci_free_consistent(nesdev->pcidev, 256, nesqp->pbl_vbase, nesqp->pbl_pbase);
+ nesqp->pbl_vbase = NULL;
+ if (nesqp->sq_kmapped) {
+ nesqp->sq_kmapped = 0;
+ kunmap(nesqp->page);
+ }
+ }
+}
+
+
+/**
+ * nes_create_qp
+ */
+static struct ib_qp *nes_create_qp(struct ib_pd *ibpd,
+ struct ib_qp_init_attr *init_attr, struct ib_udata *udata)
+{
+ u64 u64temp= 0;
+ u64 u64nesqp = 0;
+ struct nes_pd *nespd = to_nespd(ibpd);
+ struct nes_vnic *nesvnic = to_nesvnic(ibpd->device);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ struct nes_qp *nesqp;
+ struct nes_cq *nescq;
+ struct nes_ucontext *nes_ucontext;
+ struct nes_hw_cqp_wqe *cqp_wqe;
+ struct nes_cqp_request *cqp_request;
+ struct nes_create_qp_req req;
+ struct nes_create_qp_resp uresp;
+ struct nes_pbl *nespbl = NULL;
+ u32 qp_num = 0;
+ u32 opcode = 0;
+ /* u32 counter = 0; */
+ void *mem;
+ unsigned long flags;
+ int ret;
+ int err;
+ int virt_wqs = 0;
+ int sq_size;
+ int rq_size;
+ u8 sq_encoded_size;
+ u8 rq_encoded_size;
+ /* int counter; */
+
+ if (init_attr->create_flags)
+ return ERR_PTR(-EINVAL);
+
+ atomic_inc(&qps_created);
+ switch (init_attr->qp_type) {
+ case IB_QPT_RC:
+ if (nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) {
+ init_attr->cap.max_inline_data = 0;
+ } else {
+ init_attr->cap.max_inline_data = 64;
+ }
+ sq_size = init_attr->cap.max_send_wr;
+ rq_size = init_attr->cap.max_recv_wr;
+
+ /* check if the encoded sizes are OK or not... */
+ sq_encoded_size = nes_get_encoded_size(&sq_size);
+ rq_encoded_size = nes_get_encoded_size(&rq_size);
+
+ if ((!sq_encoded_size) || (!rq_encoded_size)) {
+ nes_debug(NES_DBG_QP, "ERROR bad rq (%u) or sq (%u) size\n",
+ rq_size, sq_size);
+ return ERR_PTR(-EINVAL);
+ }
+
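+ /*
+ * Report the usable depths back to the consumer; the ring sizes
+ * above include slots the driver appears to keep in reserve (two on
+ * the SQ, one on the RQ).
+ */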
+ init_attr->cap.max_send_wr = sq_size - 2;
+ init_attr->cap.max_recv_wr = rq_size - 1;
+ nes_debug(NES_DBG_QP, "RQ size=%u, SQ Size=%u\n", rq_size, sq_size);
+
+ ret = nes_alloc_resource(nesadapter, nesadapter->allocated_qps,
+ nesadapter->max_qp, &qp_num, &nesadapter->next_qp, NES_RESOURCE_QP);
+ if (ret) {
+ return ERR_PTR(ret);
+ }
+
+ /* Need 512 (actually now 1024) byte alignment on this structure */
+ mem = kzalloc(sizeof(*nesqp)+NES_SW_CONTEXT_ALIGN-1, GFP_KERNEL);
+ if (!mem) {
+ nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
+ nes_debug(NES_DBG_QP, "Unable to allocate QP\n");
+ return ERR_PTR(-ENOMEM);
+ }
+ u64nesqp = (unsigned long)mem;
+ u64nesqp += ((u64)NES_SW_CONTEXT_ALIGN) - 1;
+ u64temp = ((u64)NES_SW_CONTEXT_ALIGN) - 1;
+ u64nesqp &= ~u64temp;
+ nesqp = (struct nes_qp *)(unsigned long)u64nesqp;
+ /* nes_debug(NES_DBG_QP, "nesqp=%p, allocated buffer=%p. Rounded to closest %u\n",
+ nesqp, mem, NES_SW_CONTEXT_ALIGN); */
+ nesqp->allocated_buffer = mem;
+
+ if (udata) {
+ if (ib_copy_from_udata(&req, udata, sizeof(struct nes_create_qp_req))) {
+ nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
+ kfree(nesqp->allocated_buffer);
+ nes_debug(NES_DBG_QP, "ib_copy_from_udata() failed\n");
+ return ERR_PTR(-EFAULT);
+ }
+ if (req.user_wqe_buffers) {
+ virt_wqs = 1;
+ }
+ if (req.user_qp_buffer)
+ nesqp->nesuqp_addr = req.user_qp_buffer;
+ if ((ibpd->uobject) && (ibpd->uobject->context)) {
+ nesqp->user_mode = 1;
+ nes_ucontext = to_nesucontext(ibpd->uobject->context);
+ if (virt_wqs) {
+ err = 1;
+ list_for_each_entry(nespbl, &nes_ucontext->qp_reg_mem_list, list) {
+ if (nespbl->user_base == (unsigned long)req.user_wqe_buffers) {
+ list_del(&nespbl->list);
+ err = 0;
+ nes_debug(NES_DBG_QP, "Found PBL for virtual QP. nespbl=%p. user_base=0x%lx\n",
+ nespbl, nespbl->user_base);
+ break;
+ }
+ }
+ if (err) {
+ nes_debug(NES_DBG_QP, "Didn't Find PBL for virtual QP. address = %llx.\n",
+ (unsigned long long)req.user_wqe_buffers);
+ nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
+ kfree(nesqp->allocated_buffer);
+ return ERR_PTR(-EFAULT);
+ }
+ }
+
+ nes_ucontext = to_nesucontext(ibpd->uobject->context);
+ nesqp->mmap_sq_db_index =
+ find_next_zero_bit(nes_ucontext->allocated_wqs,
+ NES_MAX_USER_WQ_REGIONS, nes_ucontext->first_free_wq);
+ /* nes_debug(NES_DBG_QP, "find_first_zero_biton wqs returned %u\n",
+ nespd->mmap_db_index); */
+ if (nesqp->mmap_sq_db_index >= NES_MAX_USER_WQ_REGIONS) {
+ nes_debug(NES_DBG_QP,
+ "db index > max user regions, failing create QP\n");
+ nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
+ if (virt_wqs) {
+ pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase,
+ nespbl->pbl_pbase);
+ kfree(nespbl);
+ }
+ kfree(nesqp->allocated_buffer);
+ return ERR_PTR(-ENOMEM);
+ }
+ set_bit(nesqp->mmap_sq_db_index, nes_ucontext->allocated_wqs);
+ nes_ucontext->mmap_nesqp[nesqp->mmap_sq_db_index] = nesqp;
+ nes_ucontext->first_free_wq = nesqp->mmap_sq_db_index + 1;
+ } else {
+ nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
+ kfree(nesqp->allocated_buffer);
+ return ERR_PTR(-EFAULT);
+ }
+ }
+ err = (!virt_wqs) ? nes_setup_mmap_qp(nesqp, nesvnic, sq_size, rq_size) :
+ nes_setup_virt_qp(nesqp, nespbl, nesvnic, sq_size, rq_size);
+ if (err) {
+ nes_debug(NES_DBG_QP,
+ "error getting qp mem, code = %d\n", err);
+ nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
+ kfree(nesqp->allocated_buffer);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ nesqp->hwqp.sq_size = sq_size;
+ nesqp->hwqp.sq_encoded_size = sq_encoded_size;
+ nesqp->hwqp.sq_head = 1;
+ nesqp->hwqp.rq_size = rq_size;
+ nesqp->hwqp.rq_encoded_size = rq_encoded_size;
+ /* nes_debug(NES_DBG_QP, "nesqp->nesqp_context_pbase = %p\n",
+ (void *)nesqp->nesqp_context_pbase);
+ */
+ nesqp->hwqp.qp_id = qp_num;
+ nesqp->ibqp.qp_num = nesqp->hwqp.qp_id;
+ nesqp->nespd = nespd;
+
+ nescq = to_nescq(init_attr->send_cq);
+ nesqp->nesscq = nescq;
+ nescq = to_nescq(init_attr->recv_cq);
+ nesqp->nesrcq = nescq;
+
+ nesqp->nesqp_context->misc |= cpu_to_le32((u32)PCI_FUNC(nesdev->pcidev->devfn) <<
+ NES_QPCONTEXT_MISC_PCI_FCN_SHIFT);
+ nesqp->nesqp_context->misc |= cpu_to_le32((u32)nesqp->hwqp.rq_encoded_size <<
+ NES_QPCONTEXT_MISC_RQ_SIZE_SHIFT);
+ nesqp->nesqp_context->misc |= cpu_to_le32((u32)nesqp->hwqp.sq_encoded_size <<
+ NES_QPCONTEXT_MISC_SQ_SIZE_SHIFT);
+ if (!udata) {
+ nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_PRIV_EN);
+ nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_FAST_REGISTER_EN);
+ }
+ nesqp->nesqp_context->cqs = cpu_to_le32(nesqp->nesscq->hw_cq.cq_number +
+ ((u32)nesqp->nesrcq->hw_cq.cq_number << 16));
+ u64temp = (u64)nesqp->hwqp.sq_pbase;
+ nesqp->nesqp_context->sq_addr_low = cpu_to_le32((u32)u64temp);
+ nesqp->nesqp_context->sq_addr_high = cpu_to_le32((u32)(u64temp >> 32));
+
+
+ if (!virt_wqs) {
+ u64temp = (u64)nesqp->hwqp.sq_pbase;
+ nesqp->nesqp_context->sq_addr_low = cpu_to_le32((u32)u64temp);
+ nesqp->nesqp_context->sq_addr_high = cpu_to_le32((u32)(u64temp >> 32));
+ u64temp = (u64)nesqp->hwqp.rq_pbase;
+ nesqp->nesqp_context->rq_addr_low = cpu_to_le32((u32)u64temp);
+ nesqp->nesqp_context->rq_addr_high = cpu_to_le32((u32)(u64temp >> 32));
+ } else {
+ u64temp = (u64)nesqp->pbl_pbase;
+ nesqp->nesqp_context->rq_addr_low = cpu_to_le32((u32)u64temp);
+ nesqp->nesqp_context->rq_addr_high = cpu_to_le32((u32)(u64temp >> 32));
+ }
+
+ /* nes_debug(NES_DBG_QP, "next_qp_nic_index=%u, using nic_index=%d\n",
+ nesvnic->next_qp_nic_index,
+ nesvnic->qp_nic_index[nesvnic->next_qp_nic_index]); */
+ spin_lock_irqsave(&nesdev->cqp.lock, flags);
+ nesqp->nesqp_context->misc2 |= cpu_to_le32(
+ (u32)nesvnic->qp_nic_index[nesvnic->next_qp_nic_index] <<
+ NES_QPCONTEXT_MISC2_NIC_INDEX_SHIFT);
+ nesvnic->next_qp_nic_index++;
+ if ((nesvnic->next_qp_nic_index > 3) ||
+ (nesvnic->qp_nic_index[nesvnic->next_qp_nic_index] == 0xf)) {
+ nesvnic->next_qp_nic_index = 0;
+ }
+ spin_unlock_irqrestore(&nesdev->cqp.lock, flags);
+
+ nesqp->nesqp_context->pd_index_wscale |= cpu_to_le32((u32)nesqp->nespd->pd_id << 16);
+ u64temp = (u64)nesqp->hwqp.q2_pbase;
+ nesqp->nesqp_context->q2_addr_low = cpu_to_le32((u32)u64temp);
+ nesqp->nesqp_context->q2_addr_high = cpu_to_le32((u32)(u64temp >> 32));
+ nesqp->nesqp_context->aeq_token_low = cpu_to_le32((u32)((unsigned long)(nesqp)));
+ nesqp->nesqp_context->aeq_token_high = cpu_to_le32((u32)(upper_32_bits((unsigned long)(nesqp))));
+ nesqp->nesqp_context->ird_ord_sizes = cpu_to_le32(NES_QPCONTEXT_ORDIRD_ALSMM |
+ NES_QPCONTEXT_ORDIRD_AAH |
+ ((((u32)nesadapter->max_irrq_wr) <<
+ NES_QPCONTEXT_ORDIRD_IRDSIZE_SHIFT) & NES_QPCONTEXT_ORDIRD_IRDSIZE_MASK));
+ if (disable_mpa_crc) {
+ nes_debug(NES_DBG_QP, "Disabling MPA crc checking due to module option.\n");
+ nesqp->nesqp_context->ird_ord_sizes |= cpu_to_le32(NES_QPCONTEXT_ORDIRD_RNMC);
+ }
+
+
+ /* Create the QP */
+ cqp_request = nes_get_cqp_request(nesdev);
+ if (cqp_request == NULL) {
+ nes_debug(NES_DBG_QP, "Failed to get a cqp_request\n");
+ nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
+ nes_free_qp_mem(nesdev, nesqp, virt_wqs);
+ kfree(nesqp->allocated_buffer);
+ return ERR_PTR(-ENOMEM);
+ }
+ cqp_request->waiting = 1;
+ cqp_wqe = &cqp_request->cqp_wqe;
+
+ if (!virt_wqs) {
+ opcode = NES_CQP_CREATE_QP | NES_CQP_QP_TYPE_IWARP |
+ NES_CQP_QP_IWARP_STATE_IDLE;
+ } else {
+ opcode = NES_CQP_CREATE_QP | NES_CQP_QP_TYPE_IWARP | NES_CQP_QP_VIRT_WQS |
+ NES_CQP_QP_IWARP_STATE_IDLE;
+ }
+ opcode |= NES_CQP_QP_CQS_VALID;
+ nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id);
+
+ u64temp = (u64)nesqp->nesqp_context_pbase;
+ set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, u64temp);
+
+ atomic_set(&cqp_request->refcount, 2);
+ nes_post_cqp_request(nesdev, cqp_request);
+
+ /* Wait for CQP */
+ nes_debug(NES_DBG_QP, "Waiting for create iWARP QP%u to complete.\n",
+ nesqp->hwqp.qp_id);
+ ret = wait_event_timeout(cqp_request->waitq,
+ (cqp_request->request_done != 0), NES_EVENT_TIMEOUT);
+ nes_debug(NES_DBG_QP, "Create iwarp QP%u completed, wait_event_timeout ret=%u,"
+ " nesdev->cqp_head = %u, nesdev->cqp.sq_tail = %u,"
+ " CQP Major:Minor codes = 0x%04X:0x%04X.\n",
+ nesqp->hwqp.qp_id, ret, nesdev->cqp.sq_head, nesdev->cqp.sq_tail,
+ cqp_request->major_code, cqp_request->minor_code);
+ if ((!ret) || (cqp_request->major_code)) {
+ nes_put_cqp_request(nesdev, cqp_request);
+ nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
+ nes_free_qp_mem(nesdev, nesqp, virt_wqs);
+ kfree(nesqp->allocated_buffer);
+ if (!ret) {
+ return ERR_PTR(-ETIME);
+ } else {
+ return ERR_PTR(-EIO);
+ }
+ }
+
+ nes_put_cqp_request(nesdev, cqp_request);
+
+ if (ibpd->uobject) {
+ uresp.mmap_sq_db_index = nesqp->mmap_sq_db_index;
+ uresp.mmap_rq_db_index = 0;
+ uresp.actual_sq_size = sq_size;
+ uresp.actual_rq_size = rq_size;
+ uresp.qp_id = nesqp->hwqp.qp_id;
+ uresp.nes_drv_opt = nes_drv_opt;
+ if (ib_copy_to_udata(udata, &uresp, sizeof uresp)) {
+ nes_free_resource(nesadapter, nesadapter->allocated_qps, qp_num);
+ nes_free_qp_mem(nesdev, nesqp, virt_wqs);
+ kfree(nesqp->allocated_buffer);
+ return ERR_PTR(-EFAULT);
+ }
+ }
+
+ nes_debug(NES_DBG_QP, "QP%u structure located @%p. Size = %u.\n",
+ nesqp->hwqp.qp_id, nesqp, (u32)sizeof(*nesqp));
+ spin_lock_init(&nesqp->lock);
+ nes_add_ref(&nesqp->ibqp);
+ break;
+ default:
+ nes_debug(NES_DBG_QP, "Invalid QP type: %d\n", init_attr->qp_type);
+ return ERR_PTR(-EINVAL);
+ }
+
+ nesqp->sig_all = (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR);
+ init_timer(&nesqp->terminate_timer);
+ nesqp->terminate_timer.function = nes_terminate_timeout;
+ nesqp->terminate_timer.data = (unsigned long)nesqp;
+
+ /* update the QP table */
+ nesdev->nesadapter->qp_table[nesqp->hwqp.qp_id-NES_FIRST_QPN] = nesqp;
+ nes_debug(NES_DBG_QP, "netdev refcnt=%u\n",
+ netdev_refcnt_read(nesvnic->netdev));
+
+ return &nesqp->ibqp;
+}
+
+/**
+ * nes_clean_cq
+ */
+static void nes_clean_cq(struct nes_qp *nesqp, struct nes_cq *nescq)
+{
+ u32 cq_head;
+ u32 lo;
+ u32 hi;
+ u64 u64temp;
+ unsigned long flags = 0;
+
+ spin_lock_irqsave(&nescq->lock, flags);
+
+ cq_head = nescq->hw_cq.cq_head;
+ while (le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_OPCODE_IDX]) & NES_CQE_VALID) {
+ rmb();
+ lo = le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]);
+ hi = le32_to_cpu(nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX]);
+ u64temp = (((u64)hi) << 32) | ((u64)lo);
+ u64temp &= ~(NES_SW_CONTEXT_ALIGN-1);
+ if (u64temp == (u64)(unsigned long)nesqp) {
+ /* Zero the context value so cqe will be ignored */
+ nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX] = 0;
+ nescq->hw_cq.cq_vbase[cq_head].cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX] = 0;
+ }
+
+ if (++cq_head >= nescq->hw_cq.cq_size)
+ cq_head = 0;
+ }
+
+ spin_unlock_irqrestore(&nescq->lock, flags);
+}
+
+
+/**
+ * nes_destroy_qp
+ */
+static int nes_destroy_qp(struct ib_qp *ibqp)
+{
+ struct nes_qp *nesqp = to_nesqp(ibqp);
+ struct nes_ucontext *nes_ucontext;
+ struct ib_qp_attr attr;
+ struct iw_cm_id *cm_id;
+ struct iw_cm_event cm_event;
+ int ret = 0;
+
+ atomic_inc(&sw_qps_destroyed);
+ nesqp->destroyed = 1;
+
+ /* Blow away the connection if it exists. */
+ if (nesqp->ibqp_state >= IB_QPS_INIT && nesqp->ibqp_state <= IB_QPS_RTS) {
+ /* if (nesqp->ibqp_state == IB_QPS_RTS) { */
+ attr.qp_state = IB_QPS_ERR;
+ nes_modify_qp(&nesqp->ibqp, &attr, IB_QP_STATE, NULL);
+ }
+
+ if (((nesqp->ibqp_state == IB_QPS_INIT) ||
+ (nesqp->ibqp_state == IB_QPS_RTR)) && (nesqp->cm_id)) {
+ cm_id = nesqp->cm_id;
+ cm_event.event = IW_CM_EVENT_CONNECT_REPLY;
+ cm_event.status = -ETIMEDOUT;
+ cm_event.local_addr = cm_id->local_addr;
+ cm_event.remote_addr = cm_id->remote_addr;
+ cm_event.private_data = NULL;
+ cm_event.private_data_len = 0;
+
+ nes_debug(NES_DBG_QP, "Generating a CM Timeout Event for "
+ "QP%u. cm_id = %p, refcount = %u. \n",
+ nesqp->hwqp.qp_id, cm_id, atomic_read(&nesqp->refcount));
+
+ cm_id->rem_ref(cm_id);
+ ret = cm_id->event_handler(cm_id, &cm_event);
+ if (ret)
+ nes_debug(NES_DBG_QP, "OFA CM event_handler returned, ret=%d\n", ret);
+ }
+
+ if (nesqp->user_mode) {
+ if ((ibqp->uobject)&&(ibqp->uobject->context)) {
+ nes_ucontext = to_nesucontext(ibqp->uobject->context);
+ clear_bit(nesqp->mmap_sq_db_index, nes_ucontext->allocated_wqs);
+ nes_ucontext->mmap_nesqp[nesqp->mmap_sq_db_index] = NULL;
+ if (nes_ucontext->first_free_wq > nesqp->mmap_sq_db_index) {
+ nes_ucontext->first_free_wq = nesqp->mmap_sq_db_index;
+ }
+ }
+ if (nesqp->pbl_pbase && nesqp->sq_kmapped) {
+ nesqp->sq_kmapped = 0;
+ kunmap(nesqp->page);
+ }
+ } else {
+ /* Clean any pending completions from the cq(s) */
+ if (nesqp->nesscq)
+ nes_clean_cq(nesqp, nesqp->nesscq);
+
+ if ((nesqp->nesrcq) && (nesqp->nesrcq != nesqp->nesscq))
+ nes_clean_cq(nesqp, nesqp->nesrcq);
+ }
+ nes_rem_ref(&nesqp->ibqp);
+ return 0;
+}
+
+
+/**
+ * nes_create_cq
+ */
+static struct ib_cq *nes_create_cq(struct ib_device *ibdev, int entries,
+ int comp_vector,
+ struct ib_ucontext *context, struct ib_udata *udata)
+{
+ u64 u64temp;
+ struct nes_vnic *nesvnic = to_nesvnic(ibdev);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ struct nes_cq *nescq;
+ struct nes_ucontext *nes_ucontext = NULL;
+ struct nes_cqp_request *cqp_request;
+ void *mem = NULL;
+ struct nes_hw_cqp_wqe *cqp_wqe;
+ struct nes_pbl *nespbl = NULL;
+ struct nes_create_cq_req req;
+ struct nes_create_cq_resp resp;
+ u32 cq_num = 0;
+ u32 opcode = 0;
+ u32 pbl_entries = 1;
+ int err;
+ unsigned long flags;
+ int ret;
+
+ if (entries > nesadapter->max_cqe)
+ return ERR_PTR(-EINVAL);
+
+ err = nes_alloc_resource(nesadapter, nesadapter->allocated_cqs,
+ nesadapter->max_cq, &cq_num, &nesadapter->next_cq, NES_RESOURCE_CQ);
+ if (err) {
+ return ERR_PTR(err);
+ }
+
+ nescq = kzalloc(sizeof(struct nes_cq), GFP_KERNEL);
+ if (!nescq) {
+ nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+ nes_debug(NES_DBG_CQ, "Unable to allocate nes_cq struct\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
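+ /*
+ * The hardware CQ is sized one entry larger than requested and never
+ * smaller than 5 entries, as the max() below enforces.
+ */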
+ nescq->hw_cq.cq_size = max(entries + 1, 5);
+ nescq->hw_cq.cq_number = cq_num;
+ nescq->ibcq.cqe = nescq->hw_cq.cq_size - 1;
+
+
+ if (context) {
+ nes_ucontext = to_nesucontext(context);
+ if (ib_copy_from_udata(&req, udata, sizeof (struct nes_create_cq_req))) {
+ nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+ kfree(nescq);
+ return ERR_PTR(-EFAULT);
+ }
+ nesvnic->mcrq_ucontext = nes_ucontext;
+ nes_ucontext->mcrqf = req.mcrqf;
+ if (nes_ucontext->mcrqf) {
+ if (nes_ucontext->mcrqf & 0x80000000)
+ nescq->hw_cq.cq_number = nesvnic->nic.qp_id + 28 + 2 * ((nes_ucontext->mcrqf & 0xf) - 1);
+ else if (nes_ucontext->mcrqf & 0x40000000)
+ nescq->hw_cq.cq_number = nes_ucontext->mcrqf & 0xffff;
+ else
+ nescq->hw_cq.cq_number = nesvnic->mcrq_qp_id + nes_ucontext->mcrqf-1;
+ nescq->mcrqf = nes_ucontext->mcrqf;
+ nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+ }
+ nes_debug(NES_DBG_CQ, "CQ Virtual Address = %08lX, size = %u.\n",
+ (unsigned long)req.user_cq_buffer, entries);
+ err = 1;
+ list_for_each_entry(nespbl, &nes_ucontext->cq_reg_mem_list, list) {
+ if (nespbl->user_base == (unsigned long )req.user_cq_buffer) {
+ list_del(&nespbl->list);
+ err = 0;
+ nes_debug(NES_DBG_CQ, "Found PBL for virtual CQ. nespbl=%p.\n",
+ nespbl);
+ break;
+ }
+ }
+ if (err) {
+ nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+ kfree(nescq);
+ return ERR_PTR(-EFAULT);
+ }
+
+ pbl_entries = nespbl->pbl_size >> 3;
+ nescq->cq_mem_size = 0;
+ } else {
+ nescq->cq_mem_size = nescq->hw_cq.cq_size * sizeof(struct nes_hw_cqe);
+ nes_debug(NES_DBG_CQ, "Attempting to allocate pci memory (%u entries, %u bytes) for CQ%u.\n",
+ entries, nescq->cq_mem_size, nescq->hw_cq.cq_number);
+
+ /* allocate the physical buffer space */
+ mem = pci_zalloc_consistent(nesdev->pcidev, nescq->cq_mem_size,
+ &nescq->hw_cq.cq_pbase);
+ if (!mem) {
+ printk(KERN_ERR PFX "Unable to allocate pci memory for cq\n");
+ nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+ kfree(nescq);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ nescq->hw_cq.cq_vbase = mem;
+ nescq->hw_cq.cq_head = 0;
+ nes_debug(NES_DBG_CQ, "CQ%u virtual address @ %p, phys = 0x%08X\n",
+ nescq->hw_cq.cq_number, nescq->hw_cq.cq_vbase,
+ (u32)nescq->hw_cq.cq_pbase);
+ }
+
+ nescq->hw_cq.ce_handler = nes_iwarp_ce_handler;
+ spin_lock_init(&nescq->lock);
+
+ /* send CreateCQ request to CQP */
+ cqp_request = nes_get_cqp_request(nesdev);
+ if (cqp_request == NULL) {
+ nes_debug(NES_DBG_CQ, "Failed to get a cqp_request.\n");
+ if (!context)
+ pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem,
+ nescq->hw_cq.cq_pbase);
+ else {
+ pci_free_consistent(nesdev->pcidev, nespbl->pbl_size,
+ nespbl->pbl_vbase, nespbl->pbl_pbase);
+ kfree(nespbl);
+ }
+
+ nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+ kfree(nescq);
+ return ERR_PTR(-ENOMEM);
+ }
+ cqp_request->waiting = 1;
+ cqp_wqe = &cqp_request->cqp_wqe;
+
+ opcode = NES_CQP_CREATE_CQ | NES_CQP_CQ_CEQ_VALID |
+ NES_CQP_CQ_CHK_OVERFLOW |
+ NES_CQP_CQ_CEQE_MASK | ((u32)nescq->hw_cq.cq_size << 16);
+
+ spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+
+ if (pbl_entries != 1) {
+ if (pbl_entries > 32) {
+ /* use 4k pbl */
+ nes_debug(NES_DBG_CQ, "pbl_entries=%u, use a 4k PBL\n", pbl_entries);
+ if (nesadapter->free_4kpbl == 0) {
+ spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+ nes_free_cqp_request(nesdev, cqp_request);
+ if (!context)
+ pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem,
+ nescq->hw_cq.cq_pbase);
+ else {
+ pci_free_consistent(nesdev->pcidev, nespbl->pbl_size,
+ nespbl->pbl_vbase, nespbl->pbl_pbase);
+ kfree(nespbl);
+ }
+ nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+ kfree(nescq);
+ return ERR_PTR(-ENOMEM);
+ } else {
+ opcode |= (NES_CQP_CQ_VIRT | NES_CQP_CQ_4KB_CHUNK);
+ nescq->virtual_cq = 2;
+ nesadapter->free_4kpbl--;
+ }
+ } else {
+ /* use 256 byte pbl */
+ nes_debug(NES_DBG_CQ, "pbl_entries=%u, use a 256 byte PBL\n", pbl_entries);
+ if (nesadapter->free_256pbl == 0) {
+ spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+ nes_free_cqp_request(nesdev, cqp_request);
+ if (!context)
+ pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem,
+ nescq->hw_cq.cq_pbase);
+ else {
+ pci_free_consistent(nesdev->pcidev, nespbl->pbl_size,
+ nespbl->pbl_vbase, nespbl->pbl_pbase);
+ kfree(nespbl);
+ }
+ nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+ kfree(nescq);
+ return ERR_PTR(-ENOMEM);
+ } else {
+ opcode |= NES_CQP_CQ_VIRT;
+ nescq->virtual_cq = 1;
+ nesadapter->free_256pbl--;
+ }
+ }
+ }
+
+ spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+
+ nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
+ (nescq->hw_cq.cq_number | ((u32)nesdev->ceq_index << 16)));
+
+ if (context) {
+ if (pbl_entries != 1)
+ u64temp = (u64)nespbl->pbl_pbase;
+ else
+ u64temp = le64_to_cpu(nespbl->pbl_vbase[0]);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX,
+ nes_ucontext->mmap_db_index[0]);
+ } else {
+ u64temp = (u64)nescq->hw_cq.cq_pbase;
+ cqp_wqe->wqe_words[NES_CQP_CQ_WQE_DOORBELL_INDEX_HIGH_IDX] = 0;
+ }
+ set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_CQ_WQE_PBL_LOW_IDX, u64temp);
+ cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] = 0;
+ u64temp = (u64)(unsigned long)&nescq->hw_cq;
+ cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_LOW_IDX] =
+ cpu_to_le32((u32)(u64temp >> 1));
+ cqp_wqe->wqe_words[NES_CQP_CQ_WQE_CQ_CONTEXT_HIGH_IDX] =
+ cpu_to_le32(((u32)((u64temp) >> 33)) & 0x7FFFFFFF);
+
+ atomic_set(&cqp_request->refcount, 2);
+ nes_post_cqp_request(nesdev, cqp_request);
+
+ /* Wait for CQP */
+ nes_debug(NES_DBG_CQ, "Waiting for create iWARP CQ%u to complete.\n",
+ nescq->hw_cq.cq_number);
+ ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done),
+ NES_EVENT_TIMEOUT * 2);
+ nes_debug(NES_DBG_CQ, "Create iWARP CQ%u completed, wait_event_timeout ret = %d.\n",
+ nescq->hw_cq.cq_number, ret);
+ if ((!ret) || (cqp_request->major_code)) {
+ nes_put_cqp_request(nesdev, cqp_request);
+ if (!context)
+ pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size, mem,
+ nescq->hw_cq.cq_pbase);
+ else {
+ pci_free_consistent(nesdev->pcidev, nespbl->pbl_size,
+ nespbl->pbl_vbase, nespbl->pbl_pbase);
+ kfree(nespbl);
+ }
+ nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+ kfree(nescq);
+ return ERR_PTR(-EIO);
+ }
+ nes_put_cqp_request(nesdev, cqp_request);
+
+ if (context) {
+ /* free the nespbl */
+ pci_free_consistent(nesdev->pcidev, nespbl->pbl_size, nespbl->pbl_vbase,
+ nespbl->pbl_pbase);
+ kfree(nespbl);
+ resp.cq_id = nescq->hw_cq.cq_number;
+ resp.cq_size = nescq->hw_cq.cq_size;
+ resp.mmap_db_index = 0;
+ if (ib_copy_to_udata(udata, &resp, sizeof resp - sizeof resp.reserved)) {
+ nes_free_resource(nesadapter, nesadapter->allocated_cqs, cq_num);
+ kfree(nescq);
+ return ERR_PTR(-EFAULT);
+ }
+ }
+
+ return &nescq->ibcq;
+}
+
+
+/**
+ * nes_destroy_cq
+ */
+static int nes_destroy_cq(struct ib_cq *ib_cq)
+{
+ struct nes_cq *nescq;
+ struct nes_device *nesdev;
+ struct nes_vnic *nesvnic;
+ struct nes_adapter *nesadapter;
+ struct nes_hw_cqp_wqe *cqp_wqe;
+ struct nes_cqp_request *cqp_request;
+ unsigned long flags;
+ u32 opcode = 0;
+ int ret;
+
+ if (ib_cq == NULL)
+ return 0;
+
+ nescq = to_nescq(ib_cq);
+ nesvnic = to_nesvnic(ib_cq->device);
+ nesdev = nesvnic->nesdev;
+ nesadapter = nesdev->nesadapter;
+
+ nes_debug(NES_DBG_CQ, "Destroy CQ%u\n", nescq->hw_cq.cq_number);
+
+ /* Send DestroyCQ request to CQP */
+ cqp_request = nes_get_cqp_request(nesdev);
+ if (cqp_request == NULL) {
+ nes_debug(NES_DBG_CQ, "Failed to get a cqp_request.\n");
+ return -ENOMEM;
+ }
+ cqp_request->waiting = 1;
+ cqp_wqe = &cqp_request->cqp_wqe;
+ opcode = NES_CQP_DESTROY_CQ | (nescq->hw_cq.cq_size << 16);
+ spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+ if (nescq->virtual_cq == 1) {
+ nesadapter->free_256pbl++;
+ if (nesadapter->free_256pbl > nesadapter->max_256pbl) {
+ printk(KERN_ERR PFX "%s: free 256B PBLs(%u) has exceeded the max(%u)\n",
+ __func__, nesadapter->free_256pbl, nesadapter->max_256pbl);
+ }
+ } else if (nescq->virtual_cq == 2) {
+ nesadapter->free_4kpbl++;
+ if (nesadapter->free_4kpbl > nesadapter->max_4kpbl) {
+ printk(KERN_ERR PFX "%s: free 4K PBLs(%u) has exceeded the max(%u)\n",
+ __func__, nesadapter->free_4kpbl, nesadapter->max_4kpbl);
+ }
+ opcode |= NES_CQP_CQ_4KB_CHUNK;
+ }
+
+ spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+
+ nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX,
+ (nescq->hw_cq.cq_number | ((u32)PCI_FUNC(nesdev->pcidev->devfn) << 16)));
+ if (!nescq->mcrqf)
+ nes_free_resource(nesadapter, nesadapter->allocated_cqs, nescq->hw_cq.cq_number);
+
+ atomic_set(&cqp_request->refcount, 2);
+ nes_post_cqp_request(nesdev, cqp_request);
+
+ /* Wait for CQP */
+ nes_debug(NES_DBG_CQ, "Waiting for destroy iWARP CQ%u to complete.\n",
+ nescq->hw_cq.cq_number);
+ ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done),
+ NES_EVENT_TIMEOUT);
+ nes_debug(NES_DBG_CQ, "Destroy iWARP CQ%u completed, wait_event_timeout ret = %u,"
+ " CQP Major:Minor codes = 0x%04X:0x%04X.\n",
+ nescq->hw_cq.cq_number, ret, cqp_request->major_code,
+ cqp_request->minor_code);
+ if (!ret) {
+ nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy timeout expired\n",
+ nescq->hw_cq.cq_number);
+ ret = -ETIME;
+ } else if (cqp_request->major_code) {
+ nes_debug(NES_DBG_CQ, "iWARP CQ%u destroy failed\n",
+ nescq->hw_cq.cq_number);
+ ret = -EIO;
+ } else {
+ ret = 0;
+ }
+ nes_put_cqp_request(nesdev, cqp_request);
+
+ if (nescq->cq_mem_size)
+ pci_free_consistent(nesdev->pcidev, nescq->cq_mem_size,
+ nescq->hw_cq.cq_vbase, nescq->hw_cq.cq_pbase);
+ kfree(nescq);
+
+ return ret;
+}
+
+/**
+ * root_256
+ */
+static u32 root_256(struct nes_device *nesdev,
+ struct nes_root_vpbl *root_vpbl,
+ struct nes_root_vpbl *new_root,
+ u16 pbl_count_4k)
+{
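+ /*
+ * Rebuild the root PBL so the region can be described with 256-byte
+ * PBL blocks: every 4K leaf PBL is referenced as sixteen consecutive
+ * 256-byte chunks (each chunk holds 32 eight-byte entries).
+ */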
+ u64 leaf_pbl;
+ int i, j, k;
+
+ if (pbl_count_4k == 1) {
+ new_root->pbl_vbase = pci_alloc_consistent(nesdev->pcidev,
+ 512, &new_root->pbl_pbase);
+
+ if (new_root->pbl_vbase == NULL)
+ return 0;
+
+ leaf_pbl = (u64)root_vpbl->pbl_pbase;
+ for (i = 0; i < 16; i++) {
+ new_root->pbl_vbase[i].pa_low =
+ cpu_to_le32((u32)leaf_pbl);
+ new_root->pbl_vbase[i].pa_high =
+ cpu_to_le32((u32)((((u64)leaf_pbl) >> 32)));
+ leaf_pbl += 256;
+ }
+ } else {
+ for (i = 3; i >= 0; i--) {
+ j = i * 16;
+ root_vpbl->pbl_vbase[j] = root_vpbl->pbl_vbase[i];
+ leaf_pbl = le32_to_cpu(root_vpbl->pbl_vbase[j].pa_low) +
+ (((u64)le32_to_cpu(root_vpbl->pbl_vbase[j].pa_high))
+ << 32);
+ for (k = 1; k < 16; k++) {
+ leaf_pbl += 256;
+ root_vpbl->pbl_vbase[j + k].pa_low =
+ cpu_to_le32((u32)leaf_pbl);
+ root_vpbl->pbl_vbase[j + k].pa_high =
+ cpu_to_le32((u32)((((u64)leaf_pbl) >> 32)));
+ }
+ }
+ }
+
+ return 1;
+}
+
+
+/**
+ * nes_reg_mr
+ */
+static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd,
+ u32 stag, u64 region_length, struct nes_root_vpbl *root_vpbl,
+ dma_addr_t single_buffer, u16 pbl_count_4k,
+ u16 residual_page_count_4k, int acc, u64 *iova_start,
+ u16 *actual_pbl_cnt, u8 *used_4k_pbls)
+{
+ struct nes_hw_cqp_wqe *cqp_wqe;
+ struct nes_cqp_request *cqp_request;
+ unsigned long flags;
+ int ret;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ uint pg_cnt = 0;
+ u16 pbl_count_256 = 0;
+ u16 pbl_count = 0;
+ u8 use_256_pbls = 0;
+ u8 use_4k_pbls = 0;
+ u16 use_two_level = (pbl_count_4k > 1) ? 1 : 0;
+ struct nes_root_vpbl new_root = { 0, NULL, NULL };
+ u32 opcode = 0;
+ u16 major_code;
+
+ /* Register the region with the adapter */
+ cqp_request = nes_get_cqp_request(nesdev);
+ if (cqp_request == NULL) {
+ nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n");
+ return -ENOMEM;
+ }
+ cqp_request->waiting = 1;
+ cqp_wqe = &cqp_request->cqp_wqe;
+
+ if (pbl_count_4k) {
+ spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+
+ pg_cnt = ((pbl_count_4k - 1) * 512) + residual_page_count_4k;
+ pbl_count_256 = (pg_cnt + 31) / 32;
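+ /*
+ * Pick a PBL block size: regions of up to 32 pages use 256-byte PBLs
+ * when available, otherwise 4K; regions of up to 2048 pages prefer 4K
+ * PBLs while more than half of them remain free, then two-level
+ * 256-byte PBLs, then 4K; larger regions always need 4K PBLs.
+ */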
+ if (pg_cnt <= 32) {
+ if (pbl_count_256 <= nesadapter->free_256pbl)
+ use_256_pbls = 1;
+ else if (pbl_count_4k <= nesadapter->free_4kpbl)
+ use_4k_pbls = 1;
+ } else if (pg_cnt <= 2048) {
+ if (((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) &&
+ (nesadapter->free_4kpbl > (nesadapter->max_4kpbl >> 1))) {
+ use_4k_pbls = 1;
+ } else if ((pbl_count_256 + 1) <= nesadapter->free_256pbl) {
+ use_256_pbls = 1;
+ use_two_level = 1;
+ } else if ((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) {
+ use_4k_pbls = 1;
+ }
+ } else {
+ if ((pbl_count_4k + 1) <= nesadapter->free_4kpbl)
+ use_4k_pbls = 1;
+ }
+
+ if (use_256_pbls) {
+ pbl_count = pbl_count_256;
+ nesadapter->free_256pbl -= pbl_count + use_two_level;
+ } else if (use_4k_pbls) {
+ pbl_count = pbl_count_4k;
+ nesadapter->free_4kpbl -= pbl_count + use_two_level;
+ } else {
+ spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+ nes_debug(NES_DBG_MR, "Out of Pbls\n");
+ nes_free_cqp_request(nesdev, cqp_request);
+ return -ENOMEM;
+ }
+
+ spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+ }
+
+ if (use_256_pbls && use_two_level) {
+ if (root_256(nesdev, root_vpbl, &new_root, pbl_count_4k) == 1) {
+ if (new_root.pbl_pbase != 0)
+ root_vpbl = &new_root;
+ } else {
+ spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+ nesadapter->free_256pbl += pbl_count_256 + use_two_level;
+ use_256_pbls = 0;
+
+ if (pbl_count_4k == 1)
+ use_two_level = 0;
+ pbl_count = pbl_count_4k;
+
+ if ((pbl_count_4k + use_two_level) <= nesadapter->free_4kpbl) {
+ nesadapter->free_4kpbl -= pbl_count + use_two_level;
+ use_4k_pbls = 1;
+ }
+ spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+
+ if (use_4k_pbls == 0)
+ return -ENOMEM;
+ }
+ }
+
+ opcode = NES_CQP_REGISTER_STAG | NES_CQP_STAG_RIGHTS_LOCAL_READ |
+ NES_CQP_STAG_VA_TO | NES_CQP_STAG_MR;
+ if (acc & IB_ACCESS_LOCAL_WRITE)
+ opcode |= NES_CQP_STAG_RIGHTS_LOCAL_WRITE;
+ if (acc & IB_ACCESS_REMOTE_WRITE)
+ opcode |= NES_CQP_STAG_RIGHTS_REMOTE_WRITE | NES_CQP_STAG_REM_ACC_EN;
+ if (acc & IB_ACCESS_REMOTE_READ)
+ opcode |= NES_CQP_STAG_RIGHTS_REMOTE_READ | NES_CQP_STAG_REM_ACC_EN;
+ if (acc & IB_ACCESS_MW_BIND)
+ opcode |= NES_CQP_STAG_RIGHTS_WINDOW_BIND | NES_CQP_STAG_REM_ACC_EN;
+
+ nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX, opcode);
+ set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_VA_LOW_IDX, *iova_start);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_LEN_LOW_IDX, region_length);
+
+ cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] =
+ cpu_to_le32((u32)(region_length >> 8) & 0xff000000);
+ cqp_wqe->wqe_words[NES_CQP_STAG_WQE_LEN_HIGH_PD_IDX] |=
+ cpu_to_le32(nespd->pd_id & 0x00007fff);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, stag);
+
+ if (pbl_count == 0) {
+ set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, single_buffer);
+ } else {
+ set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PA_LOW_IDX, root_vpbl->pbl_pbase);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_BLK_COUNT_IDX, pbl_count);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_PBL_LEN_IDX, (pg_cnt * 8));
+
+ if (use_4k_pbls)
+ cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX] |= cpu_to_le32(NES_CQP_STAG_PBL_BLK_SIZE);
+ }
+ barrier();
+
+ atomic_set(&cqp_request->refcount, 2);
+ nes_post_cqp_request(nesdev, cqp_request);
+
+ /* Wait for CQP */
+ ret = wait_event_timeout(cqp_request->waitq, (0 != cqp_request->request_done),
+ NES_EVENT_TIMEOUT);
+ nes_debug(NES_DBG_MR, "Register STag 0x%08X completed, wait_event_timeout ret = %u,"
+ " CQP Major:Minor codes = 0x%04X:0x%04X.\n",
+ stag, ret, cqp_request->major_code, cqp_request->minor_code);
+ major_code = cqp_request->major_code;
+ nes_put_cqp_request(nesdev, cqp_request);
+
+ if ((!ret || major_code) && pbl_count != 0) {
+ spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+ if (use_256_pbls)
+ nesadapter->free_256pbl += pbl_count + use_two_level;
+ else if (use_4k_pbls)
+ nesadapter->free_4kpbl += pbl_count + use_two_level;
+ spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+ }
+ if (new_root.pbl_pbase)
+ pci_free_consistent(nesdev->pcidev, 512, new_root.pbl_vbase,
+ new_root.pbl_pbase);
+
+ if (!ret)
+ return -ETIME;
+ else if (major_code)
+ return -EIO;
+
+ *actual_pbl_cnt = pbl_count + use_two_level;
+ *used_4k_pbls = use_4k_pbls;
+ return 0;
+}
+
+
+/**
+ * nes_reg_phys_mr
+ */
+static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd,
+ struct ib_phys_buf *buffer_list, int num_phys_buf, int acc,
+ u64 * iova_start)
+{
+ u64 region_length;
+ struct nes_pd *nespd = to_nespd(ib_pd);
+ struct nes_vnic *nesvnic = to_nesvnic(ib_pd->device);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ struct nes_mr *nesmr;
+ struct ib_mr *ibmr;
+ struct nes_vpbl vpbl;
+ struct nes_root_vpbl root_vpbl;
+ u32 stag;
+ u32 i;
+ unsigned long mask;
+ u32 stag_index = 0;
+ u32 next_stag_index = 0;
+ u32 driver_key = 0;
+ u32 root_pbl_index = 0;
+ u32 cur_pbl_index = 0;
+ int err = 0;
+ int ret = 0;
+ u16 pbl_count = 0;
+ u8 single_page = 1;
+ u8 stag_key = 0;
+
+ region_length = 0;
+ vpbl.pbl_vbase = NULL;
+ root_vpbl.pbl_vbase = NULL;
+ root_vpbl.pbl_pbase = 0;
+
+ get_random_bytes(&next_stag_index, sizeof(next_stag_index));
+ stag_key = (u8)next_stag_index;
+
+ driver_key = 0;
+
+ next_stag_index >>= 8;
+ next_stag_index %= nesadapter->max_mr;
+ if (num_phys_buf > (1024*512)) {
+ return ERR_PTR(-E2BIG);
+ }
+
+ if ((buffer_list[0].addr ^ *iova_start) & ~PAGE_MASK)
+ return ERR_PTR(-EINVAL);
+
+ err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, nesadapter->max_mr,
+ &stag_index, &next_stag_index, NES_RESOURCE_PHYS_MR);
+ if (err) {
+ return ERR_PTR(err);
+ }
+
+ nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL);
+ if (!nesmr) {
+ nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ for (i = 0; i < num_phys_buf; i++) {
+
+ if ((i & 0x01FF) == 0) {
+ if (root_pbl_index == 1) {
+ /* Allocate the root PBL */
+ root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 8192,
+ &root_vpbl.pbl_pbase);
+ nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n",
+ root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase);
+ if (!root_vpbl.pbl_vbase) {
+ pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
+ vpbl.pbl_pbase);
+ nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+ kfree(nesmr);
+ return ERR_PTR(-ENOMEM);
+ }
+ root_vpbl.leaf_vpbl = kzalloc(sizeof(*root_vpbl.leaf_vpbl)*1024, GFP_KERNEL);
+ if (!root_vpbl.leaf_vpbl) {
+ pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase,
+ root_vpbl.pbl_pbase);
+ pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
+ vpbl.pbl_pbase);
+ nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+ kfree(nesmr);
+ return ERR_PTR(-ENOMEM);
+ }
+ root_vpbl.pbl_vbase[0].pa_low = cpu_to_le32((u32)vpbl.pbl_pbase);
+ root_vpbl.pbl_vbase[0].pa_high =
+ cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32)));
+ root_vpbl.leaf_vpbl[0] = vpbl;
+ }
+ /* Allocate a 4K buffer for the PBL */
+ vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096,
+ &vpbl.pbl_pbase);
+ nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%016lX\n",
+ vpbl.pbl_vbase, (unsigned long)vpbl.pbl_pbase);
+ if (!vpbl.pbl_vbase) {
+ nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+ ibmr = ERR_PTR(-ENOMEM);
+ kfree(nesmr);
+ goto reg_phys_err;
+ }
+ /* Fill in the root table */
+ if (1 <= root_pbl_index) {
+ root_vpbl.pbl_vbase[root_pbl_index].pa_low =
+ cpu_to_le32((u32)vpbl.pbl_pbase);
+ root_vpbl.pbl_vbase[root_pbl_index].pa_high =
+ cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32)));
+ root_vpbl.leaf_vpbl[root_pbl_index] = vpbl;
+ }
+ root_pbl_index++;
+ cur_pbl_index = 0;
+ }
+
+ mask = !buffer_list[i].size;
+ if (i != 0)
+ mask |= buffer_list[i].addr;
+ if (i != num_phys_buf - 1)
+ mask |= buffer_list[i].addr + buffer_list[i].size;
+
+ if (mask & ~PAGE_MASK) {
+ nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+ nes_debug(NES_DBG_MR, "Invalid buffer addr or size\n");
+ ibmr = ERR_PTR(-EINVAL);
+ kfree(nesmr);
+ goto reg_phys_err;
+ }
+
+ region_length += buffer_list[i].size;
+ if ((i != 0) && (single_page)) {
+ if ((buffer_list[i-1].addr+PAGE_SIZE) != buffer_list[i].addr)
+ single_page = 0;
+ }
+ vpbl.pbl_vbase[cur_pbl_index].pa_low = cpu_to_le32((u32)buffer_list[i].addr & PAGE_MASK);
+ vpbl.pbl_vbase[cur_pbl_index++].pa_high =
+ cpu_to_le32((u32)((((u64)buffer_list[i].addr) >> 32)));
+ }
+
+ stag = stag_index << 8;
+ stag |= driver_key;
+ stag += (u32)stag_key;
+
+ nes_debug(NES_DBG_MR, "Registering STag 0x%08X, VA = 0x%016lX,"
+ " length = 0x%016lX, index = 0x%08X\n",
+ stag, (unsigned long)*iova_start, (unsigned long)region_length, stag_index);
+
+ /* Make the leaf PBL the root if only one PBL */
+ if (root_pbl_index == 1) {
+ root_vpbl.pbl_pbase = vpbl.pbl_pbase;
+ }
+
+ if (single_page) {
+ pbl_count = 0;
+ } else {
+ pbl_count = root_pbl_index;
+ }
+ ret = nes_reg_mr(nesdev, nespd, stag, region_length, &root_vpbl,
+ buffer_list[0].addr, pbl_count, (u16)cur_pbl_index, acc, iova_start,
+ &nesmr->pbls_used, &nesmr->pbl_4k);
+
+ if (ret == 0) {
+ nesmr->ibmr.rkey = stag;
+ nesmr->ibmr.lkey = stag;
+ nesmr->mode = IWNES_MEMREG_TYPE_MEM;
+ ibmr = &nesmr->ibmr;
+ } else {
+ kfree(nesmr);
+ ibmr = ERR_PTR(-ENOMEM);
+ }
+
+ reg_phys_err:
+ /* free the resources */
+ if (root_pbl_index == 1) {
+ /* single PBL case */
+ pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, vpbl.pbl_pbase);
+ } else {
+ for (i=0; i<root_pbl_index; i++) {
+ pci_free_consistent(nesdev->pcidev, 4096, root_vpbl.leaf_vpbl[i].pbl_vbase,
+ root_vpbl.leaf_vpbl[i].pbl_pbase);
+ }
+ kfree(root_vpbl.leaf_vpbl);
+ pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase,
+ root_vpbl.pbl_pbase);
+ }
+
+ return ibmr;
+}
+
+
+/**
+ * nes_get_dma_mr - register a DMA memory region for local access
+ */
+static struct ib_mr *nes_get_dma_mr(struct ib_pd *pd, int acc)
+{
+ struct ib_phys_buf bl;
+ u64 kva = 0;
+
+ nes_debug(NES_DBG_MR, "\n");
+
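+ /* One physical buffer at address 0 spanning the adapter's 40-bit range. */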
+ bl.size = (u64)0xffffffffffULL;
+ bl.addr = 0;
+ return nes_reg_phys_mr(pd, &bl, 1, acc, &kva);
+}
+
+
+/**
+ * nes_reg_user_mr - register a user-space memory region
+ */
+static struct ib_mr *nes_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt, int acc, struct ib_udata *udata)
+{
+ u64 iova_start;
+ __le64 *pbl;
+ u64 region_length;
+ dma_addr_t last_dma_addr = 0;
+ dma_addr_t first_dma_addr = 0;
+ struct nes_pd *nespd = to_nespd(pd);
+ struct nes_vnic *nesvnic = to_nesvnic(pd->device);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ struct ib_mr *ibmr = ERR_PTR(-EINVAL);
+ struct scatterlist *sg;
+ struct nes_ucontext *nes_ucontext;
+ struct nes_pbl *nespbl;
+ struct nes_mr *nesmr;
+ struct ib_umem *region;
+ struct nes_mem_reg_req req;
+ struct nes_vpbl vpbl;
+ struct nes_root_vpbl root_vpbl;
+ int entry, page_index;
+ int page_count = 0;
+ int err, pbl_depth = 0;
+ int chunk_pages;
+ int ret;
+ u32 stag;
+ u32 stag_index = 0;
+ u32 next_stag_index;
+ u32 driver_key;
+ u32 root_pbl_index = 0;
+ u32 cur_pbl_index = 0;
+ u32 skip_pages;
+ u16 pbl_count;
+ u8 single_page = 1;
+ u8 stag_key;
+ int first_page = 1;
+
+ region = ib_umem_get(pd->uobject->context, start, length, acc, 0);
+ if (IS_ERR(region)) {
+ return (struct ib_mr *)region;
+ }
+
+ nes_debug(NES_DBG_MR, "User base = 0x%lX, Virt base = 0x%lX, length = %u,"
+ " offset = %u, page size = %u.\n",
+ (unsigned long int)start, (unsigned long int)virt, (u32)length,
+ ib_umem_offset(region), region->page_size);
+
+ skip_pages = ((u32)ib_umem_offset(region)) >> 12;
+
+ if (ib_copy_from_udata(&req, udata, sizeof(req))) {
+ ib_umem_release(region);
+ return ERR_PTR(-EFAULT);
+ }
+ nes_debug(NES_DBG_MR, "Memory Registration type = %08X.\n", req.reg_type);
+
+ switch (req.reg_type) {
+ case IWNES_MEMREG_TYPE_MEM:
+ pbl_depth = 0;
+ region_length = 0;
+ vpbl.pbl_vbase = NULL;
+ root_vpbl.pbl_vbase = NULL;
+ root_vpbl.pbl_pbase = 0;
+
+ get_random_bytes(&next_stag_index, sizeof(next_stag_index));
+ stag_key = (u8)next_stag_index;
+
+ driver_key = next_stag_index & 0x70000000;
+
+ next_stag_index >>= 8;
+ next_stag_index %= nesadapter->max_mr;
+
+ err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs,
+ nesadapter->max_mr, &stag_index, &next_stag_index, NES_RESOURCE_USER_MR);
+ if (err) {
+ ib_umem_release(region);
+ return ERR_PTR(err);
+ }
+
+ nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL);
+ if (!nesmr) {
+ ib_umem_release(region);
+ nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+ return ERR_PTR(-ENOMEM);
+ }
+ nesmr->region = region;
+
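+ /*
+  * Walk the pinned umem scatterlist and pack the DMA address of every
+  * 4KB page into leaf PBLs, switching to a two-level root PBL once
+  * more than 512 pages have been collected.
+  */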
+ for_each_sg(region->sg_head.sgl, sg, region->nmap, entry) {
+ if (sg_dma_address(sg) & ~PAGE_MASK) {
+ ib_umem_release(region);
+ nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+ nes_debug(NES_DBG_MR, "Unaligned Memory Buffer: 0x%x\n",
+ (unsigned int) sg_dma_address(sg));
+ ibmr = ERR_PTR(-EINVAL);
+ kfree(nesmr);
+ goto reg_user_mr_err;
+ }
+
+ if (!sg_dma_len(sg)) {
+ ib_umem_release(region);
+ nes_free_resource(nesadapter, nesadapter->allocated_mrs,
+ stag_index);
+ nes_debug(NES_DBG_MR, "Invalid Buffer Size\n");
+ ibmr = ERR_PTR(-EINVAL);
+ kfree(nesmr);
+ goto reg_user_mr_err;
+ }
+
+ region_length += sg_dma_len(sg);
+ chunk_pages = sg_dma_len(sg) >> 12;
+ region_length -= skip_pages << 12;
+ for (page_index = skip_pages; page_index < chunk_pages; page_index++) {
+ skip_pages = 0;
+ if ((page_count != 0) && (page_count << 12) - (ib_umem_offset(region) & (4096 - 1)) >= region->length)
+ goto enough_pages;
+ if ((page_count&0x01FF) == 0) {
+ if (page_count >= 1024 * 512) {
+ ib_umem_release(region);
+ nes_free_resource(nesadapter,
+ nesadapter->allocated_mrs, stag_index);
+ kfree(nesmr);
+ ibmr = ERR_PTR(-E2BIG);
+ goto reg_user_mr_err;
+ }
+ if (root_pbl_index == 1) {
+ root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev,
+ 8192, &root_vpbl.pbl_pbase);
+ nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n",
+ root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase);
+ if (!root_vpbl.pbl_vbase) {
+ ib_umem_release(region);
+ pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
+ vpbl.pbl_pbase);
+ nes_free_resource(nesadapter, nesadapter->allocated_mrs,
+ stag_index);
+ kfree(nesmr);
+ ibmr = ERR_PTR(-ENOMEM);
+ goto reg_user_mr_err;
+ }
+ root_vpbl.leaf_vpbl = kzalloc(sizeof(*root_vpbl.leaf_vpbl)*1024,
+ GFP_KERNEL);
+ if (!root_vpbl.leaf_vpbl) {
+ ib_umem_release(region);
+ pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase,
+ root_vpbl.pbl_pbase);
+ pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
+ vpbl.pbl_pbase);
+ nes_free_resource(nesadapter, nesadapter->allocated_mrs,
+ stag_index);
+ kfree(nesmr);
+ ibmr = ERR_PTR(-ENOMEM);
+ goto reg_user_mr_err;
+ }
+ root_vpbl.pbl_vbase[0].pa_low =
+ cpu_to_le32((u32)vpbl.pbl_pbase);
+ root_vpbl.pbl_vbase[0].pa_high =
+ cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32)));
+ root_vpbl.leaf_vpbl[0] = vpbl;
+ }
+ vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096,
+ &vpbl.pbl_pbase);
+ nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%08X\n",
+ vpbl.pbl_vbase, (unsigned int)vpbl.pbl_pbase);
+ if (!vpbl.pbl_vbase) {
+ ib_umem_release(region);
+ nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index);
+ ibmr = ERR_PTR(-ENOMEM);
+ kfree(nesmr);
+ goto reg_user_mr_err;
+ }
+ if (1 <= root_pbl_index) {
+ root_vpbl.pbl_vbase[root_pbl_index].pa_low =
+ cpu_to_le32((u32)vpbl.pbl_pbase);
+ root_vpbl.pbl_vbase[root_pbl_index].pa_high =
+ cpu_to_le32((u32)((((u64)vpbl.pbl_pbase)>>32)));
+ root_vpbl.leaf_vpbl[root_pbl_index] = vpbl;
+ }
+ root_pbl_index++;
+ cur_pbl_index = 0;
+ }
+ if (single_page) {
+ if (page_count != 0) {
+ if ((last_dma_addr+4096) !=
+ (sg_dma_address(sg)+
+ (page_index*4096)))
+ single_page = 0;
+ last_dma_addr = sg_dma_address(sg)+
+ (page_index*4096);
+ } else {
+ first_dma_addr = sg_dma_address(sg)+
+ (page_index*4096);
+ last_dma_addr = first_dma_addr;
+ }
+ }
+
+ vpbl.pbl_vbase[cur_pbl_index].pa_low =
+ cpu_to_le32((u32)(sg_dma_address(sg)+
+ (page_index*4096)));
+ vpbl.pbl_vbase[cur_pbl_index].pa_high =
+ cpu_to_le32((u32)((((u64)(sg_dma_address(sg)+
+ (page_index*4096))) >> 32)));
+ cur_pbl_index++;
+ page_count++;
+ }
+ }
+
+ enough_pages:
+ nes_debug(NES_DBG_MR, "calculating stag, stag_index=0x%08x, driver_key=0x%08x,"
+ " stag_key=0x%08x\n",
+ stag_index, driver_key, stag_key);
+ stag = stag_index << 8;
+ stag |= driver_key;
+ stag += (u32)stag_key;
+
+ iova_start = virt;
+ /* Make the leaf PBL the root if only one PBL */
+ if (root_pbl_index == 1) {
+ root_vpbl.pbl_pbase = vpbl.pbl_pbase;
+ }
+
+ if (single_page) {
+ pbl_count = 0;
+ } else {
+ pbl_count = root_pbl_index;
+ first_dma_addr = 0;
+ }
+ nes_debug(NES_DBG_MR, "Registering STag 0x%08X, VA = 0x%08X, length = 0x%08X,"
+ " index = 0x%08X, region->length=0x%08llx, pbl_count = %u\n",
+ stag, (unsigned int)iova_start,
+ (unsigned int)region_length, stag_index,
+ (unsigned long long)region->length, pbl_count);
+ ret = nes_reg_mr(nesdev, nespd, stag, region->length, &root_vpbl,
+ first_dma_addr, pbl_count, (u16)cur_pbl_index, acc,
+ &iova_start, &nesmr->pbls_used, &nesmr->pbl_4k);
+
+ nes_debug(NES_DBG_MR, "ret=%d\n", ret);
+
+ if (ret == 0) {
+ nesmr->ibmr.rkey = stag;
+ nesmr->ibmr.lkey = stag;
+ nesmr->mode = IWNES_MEMREG_TYPE_MEM;
+ ibmr = &nesmr->ibmr;
+ } else {
+ ib_umem_release(region);
+ kfree(nesmr);
+ ibmr = ERR_PTR(-ENOMEM);
+ }
+
+ reg_user_mr_err:
+ /* free the resources */
+ if (root_pbl_index == 1) {
+ pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase,
+ vpbl.pbl_pbase);
+ } else {
+ for (page_index=0; page_index<root_pbl_index; page_index++) {
+ pci_free_consistent(nesdev->pcidev, 4096,
+ root_vpbl.leaf_vpbl[page_index].pbl_vbase,
+ root_vpbl.leaf_vpbl[page_index].pbl_pbase);
+ }
+ kfree(root_vpbl.leaf_vpbl);
+ pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase,
+ root_vpbl.pbl_pbase);
+ }
+
+ nes_debug(NES_DBG_MR, "Leaving, ibmr=%p", ibmr);
+
+ return ibmr;
+ case IWNES_MEMREG_TYPE_QP:
+ case IWNES_MEMREG_TYPE_CQ:
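+ /*
+  * QP and CQ registrations only pin the user pages and build a flat
+  * page list; the PBL is queued on the ucontext so the subsequent
+  * create_qp/create_cq call can pick it up.
+  */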
+ if (!region->length) {
+ nes_debug(NES_DBG_MR, "Unable to register zero length region for CQ\n");
+ ib_umem_release(region);
+ return ERR_PTR(-EINVAL);
+ }
+ nespbl = kzalloc(sizeof(*nespbl), GFP_KERNEL);
+ if (!nespbl) {
+ nes_debug(NES_DBG_MR, "Unable to allocate PBL\n");
+ ib_umem_release(region);
+ return ERR_PTR(-ENOMEM);
+ }
+ nesmr = kzalloc(sizeof(*nesmr), GFP_KERNEL);
+ if (!nesmr) {
+ ib_umem_release(region);
+ kfree(nespbl);
+ nes_debug(NES_DBG_MR, "Unable to allocate nesmr\n");
+ return ERR_PTR(-ENOMEM);
+ }
+ nesmr->region = region;
+ nes_ucontext = to_nesucontext(pd->uobject->context);
+ pbl_depth = region->length >> 12;
+ pbl_depth += (region->length & (4096-1)) ? 1 : 0;
+ nespbl->pbl_size = pbl_depth*sizeof(u64);
+ if (req.reg_type == IWNES_MEMREG_TYPE_QP) {
+ nes_debug(NES_DBG_MR, "Attempting to allocate QP PBL memory");
+ } else {
+ nes_debug(NES_DBG_MR, "Attempting to allocate CP PBL memory");
+ }
+
+ nes_debug(NES_DBG_MR, " %u bytes, %u entries.\n",
+ nespbl->pbl_size, pbl_depth);
+ pbl = pci_alloc_consistent(nesdev->pcidev, nespbl->pbl_size,
+ &nespbl->pbl_pbase);
+ if (!pbl) {
+ ib_umem_release(region);
+ kfree(nesmr);
+ kfree(nespbl);
+ nes_debug(NES_DBG_MR, "Unable to allocate PBL memory\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ nespbl->pbl_vbase = (u64 *)pbl;
+ nespbl->user_base = start;
+ nes_debug(NES_DBG_MR, "Allocated PBL memory, %u bytes, pbl_pbase=%lx,"
+ " pbl_vbase=%p user_base=0x%lx\n",
+ nespbl->pbl_size, (unsigned long) nespbl->pbl_pbase,
+ (void *) nespbl->pbl_vbase, nespbl->user_base);
+
+ for_each_sg(region->sg_head.sgl, sg, region->nmap, entry) {
+ chunk_pages = sg_dma_len(sg) >> 12;
+ chunk_pages += (sg_dma_len(sg) & (4096-1)) ? 1 : 0;
+ if (first_page) {
+ nespbl->page = sg_page(sg);
+ first_page = 0;
+ }
+
+ for (page_index = 0; page_index < chunk_pages; page_index++) {
+ ((__le32 *)pbl)[0] = cpu_to_le32((u32)
+ (sg_dma_address(sg)+
+ (page_index*4096)));
+ ((__le32 *)pbl)[1] = cpu_to_le32(((u64)
+ (sg_dma_address(sg)+
+ (page_index*4096)))>>32);
+ nes_debug(NES_DBG_MR, "pbl=%p, *pbl=0x%016llx, 0x%08x%08x\n", pbl,
+ (unsigned long long)*pbl,
+ le32_to_cpu(((__le32 *)pbl)[1]), le32_to_cpu(((__le32 *)pbl)[0]));
+ pbl++;
+ }
+ }
+
+ if (req.reg_type == IWNES_MEMREG_TYPE_QP) {
+ list_add_tail(&nespbl->list, &nes_ucontext->qp_reg_mem_list);
+ } else {
+ list_add_tail(&nespbl->list, &nes_ucontext->cq_reg_mem_list);
+ }
+ nesmr->ibmr.rkey = -1;
+ nesmr->ibmr.lkey = -1;
+ nesmr->mode = req.reg_type;
+ return &nesmr->ibmr;
+ }
+
+ ib_umem_release(region);
+ return ERR_PTR(-ENOSYS);
+}
+
+
+/**
+ * nes_dereg_mr - deregister a memory region and release its STag and PBLs
+ */
+static int nes_dereg_mr(struct ib_mr *ib_mr)
+{
+ struct nes_mr *nesmr = to_nesmr(ib_mr);
+ struct nes_vnic *nesvnic = to_nesvnic(ib_mr->device);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ struct nes_hw_cqp_wqe *cqp_wqe;
+ struct nes_cqp_request *cqp_request;
+ unsigned long flags;
+ int ret;
+ u16 major_code;
+ u16 minor_code;
+
+ if (nesmr->region) {
+ ib_umem_release(nesmr->region);
+ }
+ if (nesmr->mode != IWNES_MEMREG_TYPE_MEM) {
+ kfree(nesmr);
+ return 0;
+ }
+
+ /* Deallocate the region with the adapter */
+
+ cqp_request = nes_get_cqp_request(nesdev);
+ if (cqp_request == NULL) {
+ nes_debug(NES_DBG_MR, "Failed to get a cqp_request.\n");
+ return -ENOMEM;
+ }
+ cqp_request->waiting = 1;
+ cqp_wqe = &cqp_request->cqp_wqe;
+
+ nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
+ NES_CQP_DEALLOCATE_STAG | NES_CQP_STAG_VA_TO |
+ NES_CQP_STAG_DEALLOC_PBLS | NES_CQP_STAG_MR);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_STAG_WQE_STAG_IDX, ib_mr->rkey);
+
+ atomic_set(&cqp_request->refcount, 2);
+ nes_post_cqp_request(nesdev, cqp_request);
+
+ /* Wait for CQP */
+ nes_debug(NES_DBG_MR, "Waiting for deallocate STag 0x%08X completed\n", ib_mr->rkey);
+ ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
+ NES_EVENT_TIMEOUT);
+ nes_debug(NES_DBG_MR, "Deallocate STag 0x%08X completed, wait_event_timeout ret = %u,"
+ " CQP Major:Minor codes = 0x%04X:0x%04X\n",
+ ib_mr->rkey, ret, cqp_request->major_code, cqp_request->minor_code);
+
+ major_code = cqp_request->major_code;
+ minor_code = cqp_request->minor_code;
+
+ nes_put_cqp_request(nesdev, cqp_request);
+
+ if (!ret) {
+ nes_debug(NES_DBG_MR, "Timeout waiting to destroy STag,"
+ " ib_mr=%p, rkey = 0x%08X\n",
+ ib_mr, ib_mr->rkey);
+ return -ETIME;
+ } else if (major_code) {
+ nes_debug(NES_DBG_MR, "Error (0x%04X:0x%04X) while attempting"
+ " to destroy STag, ib_mr=%p, rkey = 0x%08X\n",
+ major_code, minor_code, ib_mr, ib_mr->rkey);
+ return -EIO;
+ }
+
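+ /* Return the MR's PBL entries to the adapter's free 4KB/256-byte pools. */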
+ if (nesmr->pbls_used != 0) {
+ spin_lock_irqsave(&nesadapter->pbl_lock, flags);
+ if (nesmr->pbl_4k) {
+ nesadapter->free_4kpbl += nesmr->pbls_used;
+ if (nesadapter->free_4kpbl > nesadapter->max_4kpbl)
+ printk(KERN_ERR PFX "free 4KB PBLs(%u) has "
+ "exceeded the max(%u)\n",
+ nesadapter->free_4kpbl,
+ nesadapter->max_4kpbl);
+ } else {
+ nesadapter->free_256pbl += nesmr->pbls_used;
+ if (nesadapter->free_256pbl > nesadapter->max_256pbl)
+ printk(KERN_ERR PFX "free 256B PBLs(%u) has "
+ "exceeded the max(%u)\n",
+ nesadapter->free_256pbl,
+ nesadapter->max_256pbl);
+ }
+ spin_unlock_irqrestore(&nesadapter->pbl_lock, flags);
+ }
+ nes_free_resource(nesadapter, nesadapter->allocated_mrs,
+ (ib_mr->rkey & 0x0fffff00) >> 8);
+
+ kfree(nesmr);
+
+ return 0;
+}
+
+
+/**
+ * show_rev
+ */
+static ssize_t show_rev(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct nes_ib_device *nesibdev =
+ container_of(dev, struct nes_ib_device, ibdev.dev);
+ struct nes_vnic *nesvnic = nesibdev->nesvnic;
+
+ nes_debug(NES_DBG_INIT, "\n");
+ return sprintf(buf, "%x\n", nesvnic->nesdev->nesadapter->hw_rev);
+}
+
+
+/**
+ * show_fw_ver
+ */
+static ssize_t show_fw_ver(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct nes_ib_device *nesibdev =
+ container_of(dev, struct nes_ib_device, ibdev.dev);
+ struct nes_vnic *nesvnic = nesibdev->nesvnic;
+
+ nes_debug(NES_DBG_INIT, "\n");
+ return sprintf(buf, "%u.%u\n",
+ (nesvnic->nesdev->nesadapter->firmware_version >> 16),
+ (nesvnic->nesdev->nesadapter->firmware_version & 0x000000ff));
+}
+
+
+/**
+ * show_hca
+ */
+static ssize_t show_hca(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ nes_debug(NES_DBG_INIT, "\n");
+ return sprintf(buf, "NES020\n");
+}
+
+
+/**
+ * show_board
+ */
+static ssize_t show_board(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ nes_debug(NES_DBG_INIT, "\n");
+ return sprintf(buf, "%.*s\n", 32, "NES020 Board ID");
+}
+
+
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_board, NULL);
+
+static struct device_attribute *nes_dev_attributes[] = {
+ &dev_attr_hw_rev,
+ &dev_attr_fw_ver,
+ &dev_attr_hca_type,
+ &dev_attr_board_id
+};
+
+
+/**
+ * nes_query_qp - report QP attributes and capabilities
+ */
+static int nes_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_qp_init_attr *init_attr)
+{
+ struct nes_qp *nesqp = to_nesqp(ibqp);
+
+ nes_debug(NES_DBG_QP, "\n");
+
+ attr->qp_access_flags = 0;
+ attr->cap.max_send_wr = nesqp->hwqp.sq_size;
+ attr->cap.max_recv_wr = nesqp->hwqp.rq_size;
+ attr->cap.max_recv_sge = 1;
+ if (nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA)
+ attr->cap.max_inline_data = 0;
+ else
+ attr->cap.max_inline_data = 64;
+
+ init_attr->event_handler = nesqp->ibqp.event_handler;
+ init_attr->qp_context = nesqp->ibqp.qp_context;
+ init_attr->send_cq = nesqp->ibqp.send_cq;
+ init_attr->recv_cq = nesqp->ibqp.recv_cq;
+ init_attr->srq = nesqp->ibqp.srq;
+ init_attr->cap = attr->cap;
+
+ return 0;
+}
+
+
+/**
+ * nes_hw_modify_qp - issue a CQP Modify QP request to the adapter
+ */
+int nes_hw_modify_qp(struct nes_device *nesdev, struct nes_qp *nesqp,
+ u32 next_iwarp_state, u32 termlen, u32 wait_completion)
+{
+ struct nes_hw_cqp_wqe *cqp_wqe;
+ /* struct iw_cm_id *cm_id = nesqp->cm_id; */
+ /* struct iw_cm_event cm_event; */
+ struct nes_cqp_request *cqp_request;
+ int ret;
+ u16 major_code;
+
+ nes_debug(NES_DBG_MOD_QP, "QP%u, refcount=%d\n",
+ nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount));
+
+ cqp_request = nes_get_cqp_request(nesdev);
+ if (cqp_request == NULL) {
+ nes_debug(NES_DBG_MOD_QP, "Failed to get a cqp_request.\n");
+ return -ENOMEM;
+ }
+ if (wait_completion) {
+ cqp_request->waiting = 1;
+ } else {
+ cqp_request->waiting = 0;
+ }
+ cqp_wqe = &cqp_request->cqp_wqe;
+
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_OPCODE_IDX,
+ NES_CQP_MODIFY_QP | NES_CQP_QP_TYPE_IWARP | next_iwarp_state);
+ nes_debug(NES_DBG_MOD_QP, "using next_iwarp_state=%08x, wqe_words=%08x\n",
+ next_iwarp_state, le32_to_cpu(cqp_wqe->wqe_words[NES_CQP_WQE_OPCODE_IDX]));
+ nes_fill_init_cqp_wqe(cqp_wqe, nesdev);
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_WQE_ID_IDX, nesqp->hwqp.qp_id);
+ set_wqe_64bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_CONTEXT_LOW_IDX, (u64)nesqp->nesqp_context_pbase);
+
+ /* If sending a terminate message, fill in the length (in words) */
+ if (((next_iwarp_state & NES_CQP_QP_IWARP_STATE_MASK) == NES_CQP_QP_IWARP_STATE_TERMINATE) &&
+ !(next_iwarp_state & NES_CQP_QP_TERM_DONT_SEND_TERM_MSG)) {
+ termlen = ((termlen + 3) >> 2) << NES_CQP_OP_TERMLEN_SHIFT;
+ set_wqe_32bit_value(cqp_wqe->wqe_words, NES_CQP_QP_WQE_NEW_MSS_IDX, termlen);
+ }
+
+ atomic_set(&cqp_request->refcount, 2);
+ nes_post_cqp_request(nesdev, cqp_request);
+
+ /* Wait for CQP */
+ if (wait_completion) {
+ /* nes_debug(NES_DBG_MOD_QP, "Waiting for modify iWARP QP%u to complete.\n",
+ nesqp->hwqp.qp_id); */
+ ret = wait_event_timeout(cqp_request->waitq, (cqp_request->request_done != 0),
+ NES_EVENT_TIMEOUT);
+ nes_debug(NES_DBG_MOD_QP, "Modify iwarp QP%u completed, wait_event_timeout ret=%u, "
+ "CQP Major:Minor codes = 0x%04X:0x%04X.\n",
+ nesqp->hwqp.qp_id, ret, cqp_request->major_code, cqp_request->minor_code);
+ major_code = cqp_request->major_code;
+ if (major_code) {
+ nes_debug(NES_DBG_MOD_QP, "Modify iwarp QP%u failed"
+ "CQP Major:Minor codes = 0x%04X:0x%04X, intended next state = 0x%08X.\n",
+ nesqp->hwqp.qp_id, cqp_request->major_code,
+ cqp_request->minor_code, next_iwarp_state);
+ }
+
+ nes_put_cqp_request(nesdev, cqp_request);
+
+ if (!ret)
+ return -ETIME;
+ else if (major_code)
+ return -EIO;
+ else
+ return 0;
+ } else {
+ return 0;
+ }
+}
+
+
+/**
+ * nes_modify_qp - modify QP state and access flags
+ */
+int nes_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ struct nes_qp *nesqp = to_nesqp(ibqp);
+ struct nes_vnic *nesvnic = to_nesvnic(ibqp->device);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ /* u32 cqp_head; */
+ /* u32 counter; */
+ u32 next_iwarp_state = 0;
+ int err;
+ unsigned long qplockflags;
+ int ret;
+ u16 original_last_aeq;
+ u8 issue_modify_qp = 0;
+ u8 dont_wait = 0;
+
+ nes_debug(NES_DBG_MOD_QP, "QP%u: QP State=%u, cur QP State=%u,"
+ " iwarp_state=0x%X, refcount=%d\n",
+ nesqp->hwqp.qp_id, attr->qp_state, nesqp->ibqp_state,
+ nesqp->iwarp_state, atomic_read(&nesqp->refcount));
+
+ spin_lock_irqsave(&nesqp->lock, qplockflags);
+
+ nes_debug(NES_DBG_MOD_QP, "QP%u: hw_iwarp_state=0x%X, hw_tcp_state=0x%X,"
+ " QP Access Flags=0x%X, attr_mask = 0x%0x\n",
+ nesqp->hwqp.qp_id, nesqp->hw_iwarp_state,
+ nesqp->hw_tcp_state, attr->qp_access_flags, attr_mask);
+
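+ /*
+  * Map the requested IB QP state onto the equivalent iWARP/CQP state,
+  * rejecting transitions that are invalid for the current state.
+  */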
+ if (attr_mask & IB_QP_STATE) {
+ switch (attr->qp_state) {
+ case IB_QPS_INIT:
+ nes_debug(NES_DBG_MOD_QP, "QP%u: new state = init\n",
+ nesqp->hwqp.qp_id);
+ if (nesqp->iwarp_state > (u32)NES_CQP_QP_IWARP_STATE_IDLE) {
+ spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+ return -EINVAL;
+ }
+ next_iwarp_state = NES_CQP_QP_IWARP_STATE_IDLE;
+ issue_modify_qp = 1;
+ break;
+ case IB_QPS_RTR:
+ nes_debug(NES_DBG_MOD_QP, "QP%u: new state = rtr\n",
+ nesqp->hwqp.qp_id);
+ if (nesqp->iwarp_state > (u32)NES_CQP_QP_IWARP_STATE_IDLE) {
+ spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+ return -EINVAL;
+ }
+ next_iwarp_state = NES_CQP_QP_IWARP_STATE_IDLE;
+ issue_modify_qp = 1;
+ break;
+ case IB_QPS_RTS:
+ nes_debug(NES_DBG_MOD_QP, "QP%u: new state = rts\n",
+ nesqp->hwqp.qp_id);
+ if (nesqp->iwarp_state > (u32)NES_CQP_QP_IWARP_STATE_RTS) {
+ spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+ return -EINVAL;
+ }
+ if (nesqp->cm_id == NULL) {
+ nes_debug(NES_DBG_MOD_QP, "QP%u: Failing attempt to move QP to RTS without a CM_ID. \n",
+ nesqp->hwqp.qp_id );
+ spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+ return -EINVAL;
+ }
+ next_iwarp_state = NES_CQP_QP_IWARP_STATE_RTS;
+ if (nesqp->iwarp_state != NES_CQP_QP_IWARP_STATE_RTS)
+ next_iwarp_state |= NES_CQP_QP_CONTEXT_VALID |
+ NES_CQP_QP_ARP_VALID | NES_CQP_QP_ORD_VALID;
+ issue_modify_qp = 1;
+ nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_ESTABLISHED;
+ nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_RTS;
+ nesqp->hte_added = 1;
+ break;
+ case IB_QPS_SQD:
+ issue_modify_qp = 1;
+ nes_debug(NES_DBG_MOD_QP, "QP%u: new state=closing. SQ head=%u, SQ tail=%u\n",
+ nesqp->hwqp.qp_id, nesqp->hwqp.sq_head, nesqp->hwqp.sq_tail);
+ if (nesqp->iwarp_state == (u32)NES_CQP_QP_IWARP_STATE_CLOSING) {
+ spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+ return 0;
+ } else {
+ if (nesqp->iwarp_state > (u32)NES_CQP_QP_IWARP_STATE_CLOSING) {
+ nes_debug(NES_DBG_MOD_QP, "QP%u: State change to closing"
+ " ignored due to current iWARP state\n",
+ nesqp->hwqp.qp_id);
+ spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+ return -EINVAL;
+ }
+ if (nesqp->hw_iwarp_state != NES_AEQE_IWARP_STATE_RTS) {
+ nes_debug(NES_DBG_MOD_QP, "QP%u: State change to closing"
+ " already done based on hw state.\n",
+ nesqp->hwqp.qp_id);
+ issue_modify_qp = 0;
+ }
+ switch (nesqp->hw_iwarp_state) {
+ case NES_AEQE_IWARP_STATE_CLOSING:
+ next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING;
+ break;
+ case NES_AEQE_IWARP_STATE_TERMINATE:
+ next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE;
+ break;
+ case NES_AEQE_IWARP_STATE_ERROR:
+ next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR;
+ break;
+ default:
+ next_iwarp_state = NES_CQP_QP_IWARP_STATE_CLOSING;
+ nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_CLOSING;
+ break;
+ }
+ }
+ break;
+ case IB_QPS_SQE:
+ nes_debug(NES_DBG_MOD_QP, "QP%u: new state = terminate\n",
+ nesqp->hwqp.qp_id);
+ if (nesqp->iwarp_state >= (u32)NES_CQP_QP_IWARP_STATE_TERMINATE) {
+ spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+ return -EINVAL;
+ }
+ /* next_iwarp_state = (NES_CQP_QP_IWARP_STATE_TERMINATE | 0x02000000); */
+ next_iwarp_state = NES_CQP_QP_IWARP_STATE_TERMINATE;
+ nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_TERMINATE;
+ issue_modify_qp = 1;
+ break;
+ case IB_QPS_ERR:
+ case IB_QPS_RESET:
+ if (nesqp->iwarp_state == (u32)NES_CQP_QP_IWARP_STATE_ERROR) {
+ spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+ return -EINVAL;
+ }
+ nes_debug(NES_DBG_MOD_QP, "QP%u: new state = error\n",
+ nesqp->hwqp.qp_id);
+ if (nesqp->term_flags)
+ del_timer(&nesqp->terminate_timer);
+
+ next_iwarp_state = NES_CQP_QP_IWARP_STATE_ERROR;
+ /* next_iwarp_state = (NES_CQP_QP_IWARP_STATE_TERMINATE | 0x02000000); */
+ if (nesqp->hte_added) {
+ nes_debug(NES_DBG_MOD_QP, "set CQP_QP_DEL_HTE\n");
+ next_iwarp_state |= NES_CQP_QP_DEL_HTE;
+ nesqp->hte_added = 0;
+ }
+ if ((nesqp->hw_tcp_state > NES_AEQE_TCP_STATE_CLOSED) &&
+ (nesdev->iw_status) &&
+ (nesqp->hw_tcp_state != NES_AEQE_TCP_STATE_TIME_WAIT)) {
+ next_iwarp_state |= NES_CQP_QP_RESET;
+ } else {
+ nes_debug(NES_DBG_MOD_QP, "QP%u NOT setting NES_CQP_QP_RESET since TCP state = %u\n",
+ nesqp->hwqp.qp_id, nesqp->hw_tcp_state);
+ dont_wait = 1;
+ }
+ issue_modify_qp = 1;
+ nesqp->hw_iwarp_state = NES_AEQE_IWARP_STATE_ERROR;
+ break;
+ default:
+ spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+ return -EINVAL;
+ }
+
+ nesqp->ibqp_state = attr->qp_state;
+ nesqp->iwarp_state = next_iwarp_state & NES_CQP_QP_IWARP_STATE_MASK;
+ nes_debug(NES_DBG_MOD_QP, "Change nesqp->iwarp_state=%08x\n",
+ nesqp->iwarp_state);
+ }
+
+ if (attr_mask & IB_QP_ACCESS_FLAGS) {
+ if (attr->qp_access_flags & IB_ACCESS_LOCAL_WRITE) {
+ nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN |
+ NES_QPCONTEXT_MISC_RDMA_READ_EN);
+ issue_modify_qp = 1;
+ }
+ if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) {
+ nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN);
+ issue_modify_qp = 1;
+ }
+ if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) {
+ nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_READ_EN);
+ issue_modify_qp = 1;
+ }
+ if (attr->qp_access_flags & IB_ACCESS_MW_BIND) {
+ nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_WBIND_EN);
+ issue_modify_qp = 1;
+ }
+
+ if (nesqp->user_mode) {
+ nesqp->nesqp_context->misc |= cpu_to_le32(NES_QPCONTEXT_MISC_RDMA_WRITE_EN |
+ NES_QPCONTEXT_MISC_RDMA_READ_EN);
+ issue_modify_qp = 1;
+ }
+ }
+
+ original_last_aeq = nesqp->last_aeq;
+ spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+
+ nes_debug(NES_DBG_MOD_QP, "issue_modify_qp=%u\n", issue_modify_qp);
+
+ ret = 0;
+
+
+ if (issue_modify_qp) {
+ nes_debug(NES_DBG_MOD_QP, "call nes_hw_modify_qp\n");
+ ret = nes_hw_modify_qp(nesdev, nesqp, next_iwarp_state, 0, 1);
+ if (ret)
+ nes_debug(NES_DBG_MOD_QP, "nes_hw_modify_qp (next_iwarp_state = 0x%08X)"
+ " failed for QP%u.\n",
+ next_iwarp_state, nesqp->hwqp.qp_id);
+
+ }
+
+ if ((issue_modify_qp) && (nesqp->ibqp_state > IB_QPS_RTS)) {
+ nes_debug(NES_DBG_MOD_QP, "QP%u Issued ModifyQP refcount (%d),"
+ " original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n",
+ nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
+ original_last_aeq, nesqp->last_aeq);
+ if (!ret || original_last_aeq != NES_AEQE_AEID_RDMAP_ROE_BAD_LLP_CLOSE) {
+ if (dont_wait) {
+ if (nesqp->cm_id && nesqp->hw_tcp_state != 0) {
+ nes_debug(NES_DBG_MOD_QP, "QP%u Queuing fake disconnect for QP refcount (%d),"
+ " original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n",
+ nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
+ original_last_aeq, nesqp->last_aeq);
+ /* this one is for the cm_disconnect thread */
+ spin_lock_irqsave(&nesqp->lock, qplockflags);
+ nesqp->hw_tcp_state = NES_AEQE_TCP_STATE_CLOSED;
+ nesqp->last_aeq = NES_AEQE_AEID_RESET_SENT;
+ spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+ nes_cm_disconn(nesqp);
+ } else {
+ nes_debug(NES_DBG_MOD_QP, "QP%u No fake disconnect, QP refcount=%d\n",
+ nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount));
+ }
+ } else {
+ spin_lock_irqsave(&nesqp->lock, qplockflags);
+ if (nesqp->cm_id) {
+ /* These two are for the timer thread */
+ if (atomic_inc_return(&nesqp->close_timer_started) == 1) {
+ nesqp->cm_id->add_ref(nesqp->cm_id);
+ nes_debug(NES_DBG_MOD_QP, "QP%u Not decrementing QP refcount (%d),"
+ " need ae to finish up, original_last_aeq = 0x%04X."
+ " last_aeq = 0x%04X, scheduling timer.\n",
+ nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
+ original_last_aeq, nesqp->last_aeq);
+ schedule_nes_timer(nesqp->cm_node, (struct sk_buff *) nesqp, NES_TIMER_TYPE_CLOSE, 1, 0);
+ }
+ spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+ } else {
+ spin_unlock_irqrestore(&nesqp->lock, qplockflags);
+ nes_debug(NES_DBG_MOD_QP, "QP%u Not decrementing QP refcount (%d),"
+ " need ae to finish up, original_last_aeq = 0x%04X."
+ " last_aeq = 0x%04X.\n",
+ nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
+ original_last_aeq, nesqp->last_aeq);
+ }
+ }
+ } else {
+ nes_debug(NES_DBG_MOD_QP, "QP%u Decrementing QP refcount (%d), No ae to finish up,"
+ " original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n",
+ nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
+ original_last_aeq, nesqp->last_aeq);
+ }
+ } else {
+ nes_debug(NES_DBG_MOD_QP, "QP%u Decrementing QP refcount (%d), No ae to finish up,"
+ " original_last_aeq = 0x%04X. last_aeq = 0x%04X.\n",
+ nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount),
+ original_last_aeq, nesqp->last_aeq);
+ }
+
+ err = 0;
+
+ nes_debug(NES_DBG_MOD_QP, "QP%u Leaving, refcount=%d\n",
+ nesqp->hwqp.qp_id, atomic_read(&nesqp->refcount));
+
+ return err;
+}
+
+
+/**
+ * nes_multicast_attach - multicast attach (not supported)
+ */
+static int nes_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ nes_debug(NES_DBG_INIT, "\n");
+ return -ENOSYS;
+}
+
+
+/**
+ * nes_multicast_detach - multicast detach (not supported)
+ */
+static int nes_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ nes_debug(NES_DBG_INIT, "\n");
+ return -ENOSYS;
+}
+
+
+/**
+ * nes_process_mad - MAD processing (not supported)
+ */
+static int nes_process_mad(struct ib_device *ibdev, int mad_flags,
+ u8 port_num, struct ib_wc *in_wc, struct ib_grh *in_grh,
+ struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+ nes_debug(NES_DBG_INIT, "\n");
+ return -ENOSYS;
+}
+
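+ /*
+  * Copy a work request's scatter/gather list into the WQE fragment
+  * slots and record the total payload length.
+  */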
+static inline void
+fill_wqe_sg_send(struct nes_hw_qp_wqe *wqe, struct ib_send_wr *ib_wr, u32 uselkey)
+{
+ int sge_index;
+ int total_payload_length = 0;
+ for (sge_index = 0; sge_index < ib_wr->num_sge; sge_index++) {
+ set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_FRAG0_LOW_IDX+(sge_index*4),
+ ib_wr->sg_list[sge_index].addr);
+ set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_LENGTH0_IDX + (sge_index*4),
+ ib_wr->sg_list[sge_index].length);
+ if (uselkey)
+ set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX + (sge_index*4),
+ (ib_wr->sg_list[sge_index].lkey));
+ else
+ set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX + (sge_index*4), 0);
+
+ total_payload_length += ib_wr->sg_list[sge_index].length;
+ }
+ nes_debug(NES_DBG_IW_TX, "UC UC UC, sending total_payload_length=%u \n",
+ total_payload_length);
+ set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX,
+ total_payload_length);
+}
+
+/**
+ * nes_post_send - post work requests to the send queue
+ */
+static int nes_post_send(struct ib_qp *ibqp, struct ib_send_wr *ib_wr,
+ struct ib_send_wr **bad_wr)
+{
+ u64 u64temp;
+ unsigned long flags = 0;
+ struct nes_vnic *nesvnic = to_nesvnic(ibqp->device);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_qp *nesqp = to_nesqp(ibqp);
+ struct nes_hw_qp_wqe *wqe;
+ int err = 0;
+ u32 qsize = nesqp->hwqp.sq_size;
+ u32 head;
+ u32 wqe_misc = 0;
+ u32 wqe_count = 0;
+ u32 counter;
+
+ if (nesqp->ibqp_state > IB_QPS_RTS) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ spin_lock_irqsave(&nesqp->lock, flags);
+
+ head = nesqp->hwqp.sq_head;
+
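+ /*
+  * Convert each work request into a hardware SQ WQE; the WQE allocate
+  * doorbell is rung once per batch after the list has been walked.
+  */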
+ while (ib_wr) {
+ /* Check for QP error */
+ if (nesqp->term_flags) {
+ err = -EINVAL;
+ break;
+ }
+
+ /* Check for SQ overflow */
+ if (((head + (2 * qsize) - nesqp->hwqp.sq_tail) % qsize) == (qsize - 1)) {
+ err = -ENOMEM;
+ break;
+ }
+
+ wqe = &nesqp->hwqp.sq_vbase[head];
+ /* nes_debug(NES_DBG_IW_TX, "processing sq wqe for QP%u at %p, head = %u.\n",
+ nesqp->hwqp.qp_id, wqe, head); */
+ nes_fill_init_qp_wqe(wqe, nesqp, head);
+ u64temp = (u64)(ib_wr->wr_id);
+ set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX,
+ u64temp);
+ switch (ib_wr->opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_INV:
+ if (IB_WR_SEND == ib_wr->opcode) {
+ if (ib_wr->send_flags & IB_SEND_SOLICITED)
+ wqe_misc = NES_IWARP_SQ_OP_SENDSE;
+ else
+ wqe_misc = NES_IWARP_SQ_OP_SEND;
+ } else {
+ if (ib_wr->send_flags & IB_SEND_SOLICITED)
+ wqe_misc = NES_IWARP_SQ_OP_SENDSEINV;
+ else
+ wqe_misc = NES_IWARP_SQ_OP_SENDINV;
+
+ set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_INV_STAG_LOW_IDX,
+ ib_wr->ex.invalidate_rkey);
+ }
+
+ if (ib_wr->num_sge > nesdev->nesadapter->max_sge) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (ib_wr->send_flags & IB_SEND_FENCE)
+ wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE;
+
+ if ((ib_wr->send_flags & IB_SEND_INLINE) &&
+ ((nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) == 0) &&
+ (ib_wr->sg_list[0].length <= 64)) {
+ memcpy(&wqe->wqe_words[NES_IWARP_SQ_WQE_IMM_DATA_START_IDX],
+ (void *)(unsigned long)ib_wr->sg_list[0].addr, ib_wr->sg_list[0].length);
+ set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX,
+ ib_wr->sg_list[0].length);
+ wqe_misc |= NES_IWARP_SQ_WQE_IMM_DATA;
+ } else {
+ fill_wqe_sg_send(wqe, ib_wr, 1);
+ }
+
+ break;
+ case IB_WR_RDMA_WRITE:
+ wqe_misc = NES_IWARP_SQ_OP_RDMAW;
+ if (ib_wr->num_sge > nesdev->nesadapter->max_sge) {
+ nes_debug(NES_DBG_IW_TX, "Exceeded max sge, ib_wr=%u, max=%u\n",
+ ib_wr->num_sge, nesdev->nesadapter->max_sge);
+ err = -EINVAL;
+ break;
+ }
+
+ if (ib_wr->send_flags & IB_SEND_FENCE)
+ wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE;
+
+ set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_STAG_IDX,
+ ib_wr->wr.rdma.rkey);
+ set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX,
+ ib_wr->wr.rdma.remote_addr);
+
+ if ((ib_wr->send_flags & IB_SEND_INLINE) &&
+ ((nes_drv_opt & NES_DRV_OPT_NO_INLINE_DATA) == 0) &&
+ (ib_wr->sg_list[0].length <= 64)) {
+ memcpy(&wqe->wqe_words[NES_IWARP_SQ_WQE_IMM_DATA_START_IDX],
+ (void *)(unsigned long)ib_wr->sg_list[0].addr, ib_wr->sg_list[0].length);
+ set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX,
+ ib_wr->sg_list[0].length);
+ wqe_misc |= NES_IWARP_SQ_WQE_IMM_DATA;
+ } else {
+ fill_wqe_sg_send(wqe, ib_wr, 1);
+ }
+
+ wqe->wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX] =
+ wqe->wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX];
+ break;
+ case IB_WR_RDMA_READ:
+ case IB_WR_RDMA_READ_WITH_INV:
+ /* iWARP only supports 1 sge for RDMA reads */
+ if (ib_wr->num_sge > 1) {
+ nes_debug(NES_DBG_IW_TX, "Exceeded max sge, ib_wr=%u, max=1\n",
+ ib_wr->num_sge);
+ err = -EINVAL;
+ break;
+ }
+ if (ib_wr->opcode == IB_WR_RDMA_READ) {
+ wqe_misc = NES_IWARP_SQ_OP_RDMAR;
+ } else {
+ wqe_misc = NES_IWARP_SQ_OP_RDMAR_LOCINV;
+ set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_INV_STAG_LOW_IDX,
+ ib_wr->ex.invalidate_rkey);
+ }
+
+ set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_TO_LOW_IDX,
+ ib_wr->wr.rdma.remote_addr);
+ set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_STAG_IDX,
+ ib_wr->wr.rdma.rkey);
+ set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX,
+ ib_wr->sg_list->length);
+ set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_FRAG0_LOW_IDX,
+ ib_wr->sg_list->addr);
+ set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_STAG0_IDX,
+ ib_wr->sg_list->lkey);
+ break;
+ case IB_WR_LOCAL_INV:
+ wqe_misc = NES_IWARP_SQ_OP_LOCINV;
+ set_wqe_32bit_value(wqe->wqe_words,
+ NES_IWARP_SQ_LOCINV_WQE_INV_STAG_IDX,
+ ib_wr->ex.invalidate_rkey);
+ break;
+ case IB_WR_FAST_REG_MR:
+ {
+ int i;
+ int flags = ib_wr->wr.fast_reg.access_flags;
+ struct nes_ib_fast_reg_page_list *pnesfrpl =
+ container_of(ib_wr->wr.fast_reg.page_list,
+ struct nes_ib_fast_reg_page_list,
+ ibfrpl);
+ u64 *src_page_list = pnesfrpl->ibfrpl.page_list;
+ u64 *dst_page_list = pnesfrpl->nes_wqe_pbl.kva;
+
+ if (ib_wr->wr.fast_reg.page_list_len >
+ (NES_4K_PBL_CHUNK_SIZE / sizeof(u64))) {
+ nes_debug(NES_DBG_IW_TX, "SQ_FMR: bad page_list_len\n");
+ err = -EINVAL;
+ break;
+ }
+ wqe_misc = NES_IWARP_SQ_OP_FAST_REG;
+ set_wqe_64bit_value(wqe->wqe_words,
+ NES_IWARP_SQ_FMR_WQE_VA_FBO_LOW_IDX,
+ ib_wr->wr.fast_reg.iova_start);
+ set_wqe_32bit_value(wqe->wqe_words,
+ NES_IWARP_SQ_FMR_WQE_LENGTH_LOW_IDX,
+ ib_wr->wr.fast_reg.length);
+ set_wqe_32bit_value(wqe->wqe_words,
+ NES_IWARP_SQ_FMR_WQE_LENGTH_HIGH_IDX, 0);
+ set_wqe_32bit_value(wqe->wqe_words,
+ NES_IWARP_SQ_FMR_WQE_MR_STAG_IDX,
+ ib_wr->wr.fast_reg.rkey);
+ /* Set page size: */
+ if (ib_wr->wr.fast_reg.page_shift == 12) {
+ wqe_misc |= NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_4K;
+ } else if (ib_wr->wr.fast_reg.page_shift == 21) {
+ wqe_misc |= NES_IWARP_SQ_FMR_WQE_PAGE_SIZE_2M;
+ } else {
+ nes_debug(NES_DBG_IW_TX, "Invalid page shift,"
+ " ib_wr=%u, max=1\n", ib_wr->num_sge);
+ err = -EINVAL;
+ break;
+ }
+ /* Set access_flags */
+ wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_READ;
+ if (flags & IB_ACCESS_LOCAL_WRITE)
+ wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_LOCAL_WRITE;
+
+ if (flags & IB_ACCESS_REMOTE_WRITE)
+ wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_WRITE;
+
+ if (flags & IB_ACCESS_REMOTE_READ)
+ wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_REMOTE_READ;
+
+ if (flags & IB_ACCESS_MW_BIND)
+ wqe_misc |= NES_IWARP_SQ_FMR_WQE_RIGHTS_ENABLE_WINDOW_BIND;
+
+ /* Fill in PBL info: */
+ if (ib_wr->wr.fast_reg.page_list_len >
+ pnesfrpl->ibfrpl.max_page_list_len) {
+ nes_debug(NES_DBG_IW_TX, "Invalid page list length,"
+ " ib_wr=%p, value=%u, max=%u\n",
+ ib_wr, ib_wr->wr.fast_reg.page_list_len,
+ pnesfrpl->ibfrpl.max_page_list_len);
+ err = -EINVAL;
+ break;
+ }
+
+ set_wqe_64bit_value(wqe->wqe_words,
+ NES_IWARP_SQ_FMR_WQE_PBL_ADDR_LOW_IDX,
+ pnesfrpl->nes_wqe_pbl.paddr);
+
+ set_wqe_32bit_value(wqe->wqe_words,
+ NES_IWARP_SQ_FMR_WQE_PBL_LENGTH_IDX,
+ ib_wr->wr.fast_reg.page_list_len * 8);
+
+ for (i = 0; i < ib_wr->wr.fast_reg.page_list_len; i++)
+ dst_page_list[i] = cpu_to_le64(src_page_list[i]);
+
+ nes_debug(NES_DBG_IW_TX, "SQ_FMR: iova_start: %llx, "
+ "length: %d, rkey: %0x, pgl_paddr: %llx, "
+ "page_list_len: %u, wqe_misc: %x\n",
+ (unsigned long long) ib_wr->wr.fast_reg.iova_start,
+ ib_wr->wr.fast_reg.length,
+ ib_wr->wr.fast_reg.rkey,
+ (unsigned long long) pnesfrpl->nes_wqe_pbl.paddr,
+ ib_wr->wr.fast_reg.page_list_len,
+ wqe_misc);
+ break;
+ }
+ default:
+ /* error */
+ err = -EINVAL;
+ break;
+ }
+
+ if (err)
+ break;
+
+ if ((ib_wr->send_flags & IB_SEND_SIGNALED) || nesqp->sig_all)
+ wqe_misc |= NES_IWARP_SQ_WQE_SIGNALED_COMPL;
+
+ wqe->wqe_words[NES_IWARP_SQ_WQE_MISC_IDX] = cpu_to_le32(wqe_misc);
+
+ ib_wr = ib_wr->next;
+ head++;
+ wqe_count++;
+ if (head >= qsize)
+ head = 0;
+
+ }
+
+ nesqp->hwqp.sq_head = head;
+ barrier();
+ while (wqe_count) {
+ counter = min(wqe_count, ((u32)255));
+ wqe_count -= counter;
+ nes_write32(nesdev->regs + NES_WQE_ALLOC,
+ (counter << 24) | 0x00800000 | nesqp->hwqp.qp_id);
+ }
+
+ spin_unlock_irqrestore(&nesqp->lock, flags);
+
+out:
+ if (err)
+ *bad_wr = ib_wr;
+ return err;
+}
+
+
+/**
+ * nes_post_recv - post work requests to the receive queue
+ */
+static int nes_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *ib_wr,
+ struct ib_recv_wr **bad_wr)
+{
+ u64 u64temp;
+ unsigned long flags = 0;
+ struct nes_vnic *nesvnic = to_nesvnic(ibqp->device);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_qp *nesqp = to_nesqp(ibqp);
+ struct nes_hw_qp_wqe *wqe;
+ int err = 0;
+ int sge_index;
+ u32 qsize = nesqp->hwqp.rq_size;
+ u32 head;
+ u32 wqe_count = 0;
+ u32 counter;
+ u32 total_payload_length;
+
+ if (nesqp->ibqp_state > IB_QPS_RTS) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ spin_lock_irqsave(&nesqp->lock, flags);
+
+ head = nesqp->hwqp.rq_head;
+
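+ /*
+  * Convert each receive work request into an RQ WQE; the doorbell is
+  * rung once per batch after the list has been walked.
+  */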
+ while (ib_wr) {
+ /* Check for QP error */
+ if (nesqp->term_flags) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (ib_wr->num_sge > nesdev->nesadapter->max_sge) {
+ err = -EINVAL;
+ break;
+ }
+ /* Check for RQ overflow */
+ if (((head + (2 * qsize) - nesqp->hwqp.rq_tail) % qsize) == (qsize - 1)) {
+ err = -ENOMEM;
+ break;
+ }
+
+ nes_debug(NES_DBG_IW_RX, "ibwr sge count = %u.\n", ib_wr->num_sge);
+ wqe = &nesqp->hwqp.rq_vbase[head];
+
+ /* nes_debug(NES_DBG_IW_RX, "QP%u:processing rq wqe at %p, head = %u.\n",
+ nesqp->hwqp.qp_id, wqe, head); */
+ nes_fill_init_qp_wqe(wqe, nesqp, head);
+ u64temp = (u64)(ib_wr->wr_id);
+ set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX,
+ u64temp);
+ total_payload_length = 0;
+ for (sge_index=0; sge_index < ib_wr->num_sge; sge_index++) {
+ set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_FRAG0_LOW_IDX+(sge_index*4),
+ ib_wr->sg_list[sge_index].addr);
+ set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_LENGTH0_IDX+(sge_index*4),
+ ib_wr->sg_list[sge_index].length);
+ set_wqe_32bit_value(wqe->wqe_words,NES_IWARP_RQ_WQE_STAG0_IDX+(sge_index*4),
+ ib_wr->sg_list[sge_index].lkey);
+
+ total_payload_length += ib_wr->sg_list[sge_index].length;
+ }
+ set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_RQ_WQE_TOTAL_PAYLOAD_IDX,
+ total_payload_length);
+
+ ib_wr = ib_wr->next;
+ head++;
+ wqe_count++;
+ if (head >= qsize)
+ head = 0;
+ }
+
+ nesqp->hwqp.rq_head = head;
+ barrier();
+ while (wqe_count) {
+ counter = min(wqe_count, ((u32)255));
+ wqe_count -= counter;
+ nes_write32(nesdev->regs+NES_WQE_ALLOC, (counter<<24) | nesqp->hwqp.qp_id);
+ }
+
+ spin_unlock_irqrestore(&nesqp->lock, flags);
+
+out:
+ if (err)
+ *bad_wr = ib_wr;
+ return err;
+}
+
+
+/**
+ * nes_poll_cq - poll for completed work requests
+ */
+static int nes_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
+{
+ u64 u64temp;
+ u64 wrid;
+ unsigned long flags = 0;
+ struct nes_vnic *nesvnic = to_nesvnic(ibcq->device);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_cq *nescq = to_nescq(ibcq);
+ struct nes_qp *nesqp;
+ struct nes_hw_cqe cqe;
+ u32 head;
+ u32 wq_tail = 0;
+ u32 cq_size;
+ u32 cqe_count = 0;
+ u32 wqe_index;
+ u32 u32temp;
+ u32 move_cq_head = 1;
+ u32 err_code;
+
+ nes_debug(NES_DBG_CQ, "\n");
+
+ spin_lock_irqsave(&nescq->lock, flags);
+
+ head = nescq->hw_cq.cq_head;
+ cq_size = nescq->hw_cq.cq_size;
+
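+ /*
+  * Consume valid CQEs up to num_entries.  On an error completion with
+  * work requests still pending, the CQ head is not advanced and the
+  * CQE is rewritten to point at the next WQE, so the remainder are
+  * reported as flushed on later iterations.
+  */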
+ while (cqe_count < num_entries) {
+ if ((le32_to_cpu(nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX]) &
+ NES_CQE_VALID) == 0)
+ break;
+
+ /*
+ * Make sure we read CQ entry contents *after*
+ * we've checked the valid bit.
+ */
+ rmb();
+
+ cqe = nescq->hw_cq.cq_vbase[head];
+ u32temp = le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]);
+ wqe_index = u32temp & (nesdev->nesadapter->max_qp_wr - 1);
+ u32temp &= ~(NES_SW_CONTEXT_ALIGN-1);
+ /* parse CQE, get completion context from WQE (either rq or sq) */
+ u64temp = (((u64)(le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_HIGH_IDX])))<<32) |
+ ((u64)u32temp);
+
+ if (u64temp) {
+ nesqp = (struct nes_qp *)(unsigned long)u64temp;
+ memset(entry, 0, sizeof *entry);
+ if (cqe.cqe_words[NES_CQE_ERROR_CODE_IDX] == 0) {
+ entry->status = IB_WC_SUCCESS;
+ } else {
+ err_code = le32_to_cpu(cqe.cqe_words[NES_CQE_ERROR_CODE_IDX]);
+ if (NES_IWARP_CQE_MAJOR_DRV == (err_code >> 16)) {
+ entry->status = err_code & 0x0000ffff;
+
+ /* The rest of the cqe's will be marked as flushed */
+ nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_ERROR_CODE_IDX] =
+ cpu_to_le32((NES_IWARP_CQE_MAJOR_FLUSH << 16) |
+ NES_IWARP_CQE_MINOR_FLUSH);
+ } else
+ entry->status = IB_WC_WR_FLUSH_ERR;
+ }
+
+ entry->qp = &nesqp->ibqp;
+ entry->src_qp = nesqp->hwqp.qp_id;
+
+ if (le32_to_cpu(cqe.cqe_words[NES_CQE_OPCODE_IDX]) & NES_CQE_SQ) {
+ if (nesqp->skip_lsmm) {
+ nesqp->skip_lsmm = 0;
+ nesqp->hwqp.sq_tail++;
+ }
+
+ /* Working on a SQ Completion*/
+ wrid = (((u64)(cpu_to_le32((u32)nesqp->hwqp.sq_vbase[wqe_index].
+ wqe_words[NES_IWARP_SQ_WQE_COMP_SCRATCH_HIGH_IDX]))) << 32) |
+ ((u64)(cpu_to_le32((u32)nesqp->hwqp.sq_vbase[wqe_index].
+ wqe_words[NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX])));
+ entry->byte_len = le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index].
+ wqe_words[NES_IWARP_SQ_WQE_TOTAL_PAYLOAD_IDX]);
+
+ switch (le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index].
+ wqe_words[NES_IWARP_SQ_WQE_MISC_IDX]) & 0x3f) {
+ case NES_IWARP_SQ_OP_RDMAW:
+ nes_debug(NES_DBG_CQ, "Operation = RDMA WRITE.\n");
+ entry->opcode = IB_WC_RDMA_WRITE;
+ break;
+ case NES_IWARP_SQ_OP_RDMAR:
+ nes_debug(NES_DBG_CQ, "Operation = RDMA READ.\n");
+ entry->opcode = IB_WC_RDMA_READ;
+ entry->byte_len = le32_to_cpu(nesqp->hwqp.sq_vbase[wqe_index].
+ wqe_words[NES_IWARP_SQ_WQE_RDMA_LENGTH_IDX]);
+ break;
+ case NES_IWARP_SQ_OP_SENDINV:
+ case NES_IWARP_SQ_OP_SENDSEINV:
+ case NES_IWARP_SQ_OP_SEND:
+ case NES_IWARP_SQ_OP_SENDSE:
+ nes_debug(NES_DBG_CQ, "Operation = Send.\n");
+ entry->opcode = IB_WC_SEND;
+ break;
+ case NES_IWARP_SQ_OP_LOCINV:
+ entry->opcode = IB_WC_LOCAL_INV;
+ break;
+ case NES_IWARP_SQ_OP_FAST_REG:
+ entry->opcode = IB_WC_FAST_REG_MR;
+ break;
+ }
+
+ nesqp->hwqp.sq_tail = (wqe_index+1)&(nesqp->hwqp.sq_size - 1);
+ if ((entry->status != IB_WC_SUCCESS) && (nesqp->hwqp.sq_tail != nesqp->hwqp.sq_head)) {
+ move_cq_head = 0;
+ wq_tail = nesqp->hwqp.sq_tail;
+ }
+ } else {
+ /* Working on a RQ Completion*/
+ entry->byte_len = le32_to_cpu(cqe.cqe_words[NES_CQE_PAYLOAD_LENGTH_IDX]);
+ wrid = ((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wqe_index].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_LOW_IDX]))) |
+ ((u64)(le32_to_cpu(nesqp->hwqp.rq_vbase[wqe_index].wqe_words[NES_IWARP_RQ_WQE_COMP_SCRATCH_HIGH_IDX]))<<32);
+ entry->opcode = IB_WC_RECV;
+
+ nesqp->hwqp.rq_tail = (wqe_index+1)&(nesqp->hwqp.rq_size - 1);
+ if ((entry->status != IB_WC_SUCCESS) && (nesqp->hwqp.rq_tail != nesqp->hwqp.rq_head)) {
+ move_cq_head = 0;
+ wq_tail = nesqp->hwqp.rq_tail;
+ }
+ }
+
+ entry->wr_id = wrid;
+ entry++;
+ cqe_count++;
+ }
+
+ if (move_cq_head) {
+ nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_OPCODE_IDX] = 0;
+ if (++head >= cq_size)
+ head = 0;
+ nescq->polled_completions++;
+
+ if ((nescq->polled_completions > (cq_size / 2)) ||
+ (nescq->polled_completions == 255)) {
+ nes_debug(NES_DBG_CQ, "CQ%u Issuing CQE Allocate since more than half of cqes"
+ " are pending %u of %u.\n",
+ nescq->hw_cq.cq_number, nescq->polled_completions, cq_size);
+ nes_write32(nesdev->regs+NES_CQE_ALLOC,
+ nescq->hw_cq.cq_number | (nescq->polled_completions << 16));
+ nescq->polled_completions = 0;
+ }
+ } else {
+ /* Update the wqe index and set status to flush */
+ wqe_index = le32_to_cpu(cqe.cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX]);
+ wqe_index = (wqe_index & (~(nesdev->nesadapter->max_qp_wr - 1))) | wq_tail;
+ nescq->hw_cq.cq_vbase[head].cqe_words[NES_CQE_COMP_COMP_CTX_LOW_IDX] =
+ cpu_to_le32(wqe_index);
+ move_cq_head = 1; /* ready for next pass */
+ }
+ }
+
+ if (nescq->polled_completions) {
+ nes_write32(nesdev->regs+NES_CQE_ALLOC,
+ nescq->hw_cq.cq_number | (nescq->polled_completions << 16));
+ nescq->polled_completions = 0;
+ }
+
+ nescq->hw_cq.cq_head = head;
+ nes_debug(NES_DBG_CQ, "Reporting %u completions for CQ%u.\n",
+ cqe_count, nescq->hw_cq.cq_number);
+
+ spin_unlock_irqrestore(&nescq->lock, flags);
+
+ return cqe_count;
+}
+
+
+/**
+ * nes_req_notify_cq - arm the CQ for completion notifications
+ */
+static int nes_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
+{
+ struct nes_vnic *nesvnic = to_nesvnic(ibcq->device);
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_cq *nescq = to_nescq(ibcq);
+ u32 cq_arm;
+
+ nes_debug(NES_DBG_CQ, "Requesting notification for CQ%u.\n",
+ nescq->hw_cq.cq_number);
+
+ cq_arm = nescq->hw_cq.cq_number;
+ if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_NEXT_COMP)
+ cq_arm |= NES_CQE_ALLOC_NOTIFY_NEXT;
+ else if ((notify_flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
+ cq_arm |= NES_CQE_ALLOC_NOTIFY_SE;
+ else
+ return -EINVAL;
+
+ nes_write32(nesdev->regs+NES_CQE_ALLOC, cq_arm);
+ nes_read32(nesdev->regs+NES_CQE_ALLOC);
+
+ return 0;
+}
+
+
+/**
+ * nes_init_ofa_device - allocate and initialize the ib_device for a netdev
+ */
+struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev)
+{
+ struct nes_ib_device *nesibdev;
+ struct nes_vnic *nesvnic = netdev_priv(netdev);
+ struct nes_device *nesdev = nesvnic->nesdev;
+
+ nesibdev = (struct nes_ib_device *)ib_alloc_device(sizeof(struct nes_ib_device));
+ if (nesibdev == NULL) {
+ return NULL;
+ }
+ strlcpy(nesibdev->ibdev.name, "nes%d", IB_DEVICE_NAME_MAX);
+ nesibdev->ibdev.owner = THIS_MODULE;
+
+ nesibdev->ibdev.node_type = RDMA_NODE_RNIC;
+ memset(&nesibdev->ibdev.node_guid, 0, sizeof(nesibdev->ibdev.node_guid));
+ memcpy(&nesibdev->ibdev.node_guid, netdev->dev_addr, 6);
+
+ nesibdev->ibdev.uverbs_cmd_mask =
+ (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+ (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_REG_MR) |
+ (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_AH) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_AH) |
+ (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+ (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+ (1ull << IB_USER_VERBS_CMD_ALLOC_MW) |
+ (1ull << IB_USER_VERBS_CMD_BIND_MW) |
+ (1ull << IB_USER_VERBS_CMD_DEALLOC_MW) |
+ (1ull << IB_USER_VERBS_CMD_POST_RECV) |
+ (1ull << IB_USER_VERBS_CMD_POST_SEND);
+
+ nesibdev->ibdev.phys_port_cnt = 1;
+ nesibdev->ibdev.num_comp_vectors = 1;
+ nesibdev->ibdev.dma_device = &nesdev->pcidev->dev;
+ nesibdev->ibdev.dev.parent = &nesdev->pcidev->dev;
+ nesibdev->ibdev.query_device = nes_query_device;
+ nesibdev->ibdev.query_port = nes_query_port;
+ nesibdev->ibdev.query_pkey = nes_query_pkey;
+ nesibdev->ibdev.query_gid = nes_query_gid;
+ nesibdev->ibdev.alloc_ucontext = nes_alloc_ucontext;
+ nesibdev->ibdev.dealloc_ucontext = nes_dealloc_ucontext;
+ nesibdev->ibdev.mmap = nes_mmap;
+ nesibdev->ibdev.alloc_pd = nes_alloc_pd;
+ nesibdev->ibdev.dealloc_pd = nes_dealloc_pd;
+ nesibdev->ibdev.create_ah = nes_create_ah;
+ nesibdev->ibdev.destroy_ah = nes_destroy_ah;
+ nesibdev->ibdev.create_qp = nes_create_qp;
+ nesibdev->ibdev.modify_qp = nes_modify_qp;
+ nesibdev->ibdev.query_qp = nes_query_qp;
+ nesibdev->ibdev.destroy_qp = nes_destroy_qp;
+ nesibdev->ibdev.create_cq = nes_create_cq;
+ nesibdev->ibdev.destroy_cq = nes_destroy_cq;
+ nesibdev->ibdev.poll_cq = nes_poll_cq;
+ nesibdev->ibdev.get_dma_mr = nes_get_dma_mr;
+ nesibdev->ibdev.reg_phys_mr = nes_reg_phys_mr;
+ nesibdev->ibdev.reg_user_mr = nes_reg_user_mr;
+ nesibdev->ibdev.dereg_mr = nes_dereg_mr;
+ nesibdev->ibdev.alloc_mw = nes_alloc_mw;
+ nesibdev->ibdev.dealloc_mw = nes_dealloc_mw;
+ nesibdev->ibdev.bind_mw = nes_bind_mw;
+
+ nesibdev->ibdev.alloc_fast_reg_mr = nes_alloc_fast_reg_mr;
+ nesibdev->ibdev.alloc_fast_reg_page_list = nes_alloc_fast_reg_page_list;
+ nesibdev->ibdev.free_fast_reg_page_list = nes_free_fast_reg_page_list;
+
+ nesibdev->ibdev.attach_mcast = nes_multicast_attach;
+ nesibdev->ibdev.detach_mcast = nes_multicast_detach;
+ nesibdev->ibdev.process_mad = nes_process_mad;
+
+ nesibdev->ibdev.req_notify_cq = nes_req_notify_cq;
+ nesibdev->ibdev.post_send = nes_post_send;
+ nesibdev->ibdev.post_recv = nes_post_recv;
+
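+ /* The iWARP CM callbacks live in a separately allocated iw_cm_verbs struct. */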
+ nesibdev->ibdev.iwcm = kzalloc(sizeof(*nesibdev->ibdev.iwcm), GFP_KERNEL);
+ if (nesibdev->ibdev.iwcm == NULL) {
+ ib_dealloc_device(&nesibdev->ibdev);
+ return NULL;
+ }
+ nesibdev->ibdev.iwcm->add_ref = nes_add_ref;
+ nesibdev->ibdev.iwcm->rem_ref = nes_rem_ref;
+ nesibdev->ibdev.iwcm->get_qp = nes_get_qp;
+ nesibdev->ibdev.iwcm->connect = nes_connect;
+ nesibdev->ibdev.iwcm->accept = nes_accept;
+ nesibdev->ibdev.iwcm->reject = nes_reject;
+ nesibdev->ibdev.iwcm->create_listen = nes_create_listen;
+ nesibdev->ibdev.iwcm->destroy_listen = nes_destroy_listen;
+
+ return nesibdev;
+}
+
+
+/**
+ * nes_handle_delayed_event - timer callback that dispatches a coalesced port event
+ */
+static void nes_handle_delayed_event(unsigned long data)
+{
+ struct nes_vnic *nesvnic = (void *) data;
+
+ if (nesvnic->delayed_event != nesvnic->last_dispatched_event) {
+ struct ib_event event;
+
+ event.device = &nesvnic->nesibdev->ibdev;
+ if (!event.device)
+ goto stop_timer;
+ event.event = nesvnic->delayed_event;
+ event.element.port_num = nesvnic->logical_port + 1;
+ ib_dispatch_event(&event);
+ }
+
+stop_timer:
+ nesvnic->event_timer.function = NULL;
+}
+
+
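+/*
+ * Dispatch a port ACTIVE/ERR event for the vnic.  Rapid link flaps are
+ * coalesced: the first event is delivered immediately and a timer is armed
+ * so that only the most recent state change within NES_EVENT_DELAY is
+ * dispatched afterwards (see nes_handle_delayed_event above).
+ */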
+void nes_port_ibevent(struct nes_vnic *nesvnic)
+{
+ struct nes_ib_device *nesibdev = nesvnic->nesibdev;
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct ib_event event;
+ event.device = &nesibdev->ibdev;
+ event.element.port_num = nesvnic->logical_port + 1;
+ event.event = nesdev->iw_status ? IB_EVENT_PORT_ACTIVE : IB_EVENT_PORT_ERR;
+
+ if (!nesvnic->event_timer.function) {
+ ib_dispatch_event(&event);
+ nesvnic->last_dispatched_event = event.event;
+ nesvnic->event_timer.function = nes_handle_delayed_event;
+ nesvnic->event_timer.data = (unsigned long) nesvnic;
+ nesvnic->event_timer.expires = jiffies + NES_EVENT_DELAY;
+ add_timer(&nesvnic->event_timer);
+ } else {
+ mod_timer(&nesvnic->event_timer, jiffies + NES_EVENT_DELAY);
+ }
+ nesvnic->delayed_event = event.event;
+}
+
+
+/**
+ * nes_destroy_ofa_device - unregister and free the ib_device
+ */
+void nes_destroy_ofa_device(struct nes_ib_device *nesibdev)
+{
+ if (nesibdev == NULL)
+ return;
+
+ nes_unregister_ofa_device(nesibdev);
+
+ kfree(nesibdev->ibdev.iwcm);
+ ib_dealloc_device(&nesibdev->ibdev);
+}
+
+
+/**
+ * nes_register_ofa_device - register the ib_device and create its sysfs files
+ */
+int nes_register_ofa_device(struct nes_ib_device *nesibdev)
+{
+ struct nes_vnic *nesvnic = nesibdev->nesvnic;
+ struct nes_device *nesdev = nesvnic->nesdev;
+ struct nes_adapter *nesadapter = nesdev->nesadapter;
+ int i, ret;
+
+ ret = ib_register_device(&nesvnic->nesibdev->ibdev, NULL);
+ if (ret) {
+ return ret;
+ }
+
+ /* Get the resources allocated to this device */
+ nesibdev->max_cq = (nesadapter->max_cq-NES_FIRST_QPN) / nesadapter->port_count;
+ nesibdev->max_mr = nesadapter->max_mr / nesadapter->port_count;
+ nesibdev->max_qp = (nesadapter->max_qp-NES_FIRST_QPN) / nesadapter->port_count;
+ nesibdev->max_pd = nesadapter->max_pd / nesadapter->port_count;
+
+ for (i = 0; i < ARRAY_SIZE(nes_dev_attributes); ++i) {
+ ret = device_create_file(&nesibdev->ibdev.dev, nes_dev_attributes[i]);
+ if (ret) {
+ while (i > 0) {
+ i--;
+ device_remove_file(&nesibdev->ibdev.dev,
+ nes_dev_attributes[i]);
+ }
+ ib_unregister_device(&nesibdev->ibdev);
+ return ret;
+ }
+ }
+
+ nesvnic->of_device_registered = 1;
+
+ return 0;
+}
+
+
+/**
+ * nes_unregister_ofa_device - remove sysfs files and unregister the ib_device
+ */
+static void nes_unregister_ofa_device(struct nes_ib_device *nesibdev)
+{
+ struct nes_vnic *nesvnic = nesibdev->nesvnic;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(nes_dev_attributes); ++i) {
+ device_remove_file(&nesibdev->ibdev.dev, nes_dev_attributes[i]);
+ }
+
+ if (nesvnic->of_device_registered) {
+ ib_unregister_device(&nesibdev->ibdev);
+ }
+
+ nesvnic->of_device_registered = 0;
+}
diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h
new file mode 100644
index 000000000..309b31c31
--- /dev/null
+++ b/drivers/infiniband/hw/nes/nes_verbs.h
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2006 - 2011 Intel Corporation. All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef NES_VERBS_H
+#define NES_VERBS_H
+
+struct nes_device;
+
+#define NES_MAX_USER_DB_REGIONS 4096
+#define NES_MAX_USER_WQ_REGIONS 4096
+
+#define NES_TERM_SENT 0x01
+#define NES_TERM_RCVD 0x02
+#define NES_TERM_DONE 0x04
+
+struct nes_ucontext {
+ struct ib_ucontext ibucontext;
+ struct nes_device *nesdev;
+ unsigned long mmap_wq_offset;
+ unsigned long mmap_cq_offset; /* to be removed */
+ int index; /* rnic index (minor) */
+ unsigned long allocated_doorbells[BITS_TO_LONGS(NES_MAX_USER_DB_REGIONS)];
+ u16 mmap_db_index[NES_MAX_USER_DB_REGIONS];
+ u16 first_free_db;
+ unsigned long allocated_wqs[BITS_TO_LONGS(NES_MAX_USER_WQ_REGIONS)];
+ struct nes_qp *mmap_nesqp[NES_MAX_USER_WQ_REGIONS];
+ u16 first_free_wq;
+ struct list_head cq_reg_mem_list;
+ struct list_head qp_reg_mem_list;
+ u32 mcrqf;
+ atomic_t usecnt;
+};
+
+struct nes_pd {
+ struct ib_pd ibpd;
+ u16 pd_id;
+ atomic_t sqp_count;
+ u16 mmap_db_index;
+};
+
+struct nes_mr {
+ union {
+ struct ib_mr ibmr;
+ struct ib_mw ibmw;
+ struct ib_fmr ibfmr;
+ };
+ struct ib_umem *region;
+ u16 pbls_used;
+ u8 mode;
+ u8 pbl_4k;
+};
+
+struct nes_hw_pb {
+ __le32 pa_low;
+ __le32 pa_high;
+};
+
+struct nes_vpbl {
+ dma_addr_t pbl_pbase;
+ struct nes_hw_pb *pbl_vbase;
+};
+
+struct nes_root_vpbl {
+ dma_addr_t pbl_pbase;
+ struct nes_hw_pb *pbl_vbase;
+ struct nes_vpbl *leaf_vpbl;
+};
+
+struct nes_fmr {
+ struct nes_mr nesmr;
+ u32 leaf_pbl_cnt;
+ struct nes_root_vpbl root_vpbl;
+ struct ib_qp *ib_qp;
+ int access_rights;
+ struct ib_fmr_attr attr;
+};
+
+struct nes_av;
+
+struct nes_cq {
+ struct ib_cq ibcq;
+ struct nes_hw_cq hw_cq;
+ u32 polled_completions;
+ u32 cq_mem_size;
+ spinlock_t lock;
+ u8 virtual_cq;
+ u8 pad[3];
+ u32 mcrqf;
+};
+
+struct nes_wq {
+ spinlock_t lock;
+};
+
+struct disconn_work {
+ struct work_struct work;
+ struct nes_qp *nesqp;
+};
+
+struct iw_cm_id;
+struct ietf_mpa_frame;
+
+struct nes_qp {
+ struct ib_qp ibqp;
+ void *allocated_buffer;
+ struct iw_cm_id *cm_id;
+ struct nes_cq *nesscq;
+ struct nes_cq *nesrcq;
+ struct nes_pd *nespd;
+ void *cm_node; /* handle of the node this QP is associated with */
+ void *ietf_frame;
+ u8 ietf_frame_size;
+ dma_addr_t ietf_frame_pbase;
+ struct ib_mr *lsmm_mr;
+ struct nes_hw_qp hwqp;
+ struct work_struct work;
+ enum ib_qp_state ibqp_state;
+ u32 iwarp_state;
+ u32 hte_index;
+ u32 last_aeq;
+ u32 qp_mem_size;
+ atomic_t refcount;
+ atomic_t close_timer_started;
+ u32 mmap_sq_db_index;
+ u32 mmap_rq_db_index;
+ spinlock_t lock;
+ spinlock_t pau_lock;
+ struct nes_qp_context *nesqp_context;
+ dma_addr_t nesqp_context_pbase;
+ void *pbl_vbase;
+ dma_addr_t pbl_pbase;
+ struct page *page;
+ struct timer_list terminate_timer;
+ enum ib_event_type terminate_eventtype;
+ struct sk_buff_head pau_list;
+ u32 pau_rcv_nxt;
+ u16 active_conn:1;
+ u16 skip_lsmm:1;
+ u16 user_mode:1;
+ u16 hte_added:1;
+ u16 flush_issued:1;
+ u16 destroyed:1;
+ u16 sig_all:1;
+ u16 pau_mode:1;
+ u16 rsvd:8;
+ u16 private_data_len;
+ u16 term_sq_flush_code;
+ u16 term_rq_flush_code;
+ u8 hw_iwarp_state;
+ u8 hw_tcp_state;
+ u8 term_flags;
+ u8 sq_kmapped;
+ u8 pau_busy;
+ u8 pau_pending;
+ u8 pau_state;
+ __u64 nesuqp_addr;
+};
+#endif /* NES_VERBS_H */
diff --git a/drivers/infiniband/hw/ocrdma/Kconfig b/drivers/infiniband/hw/ocrdma/Kconfig
new file mode 100644
index 000000000..c0cddc019
--- /dev/null
+++ b/drivers/infiniband/hw/ocrdma/Kconfig
@@ -0,0 +1,8 @@
+config INFINIBAND_OCRDMA
+ tristate "Emulex One Connect HCA support"
+ depends on ETHERNET && NETDEVICES && PCI && INET && (IPV6 || IPV6=n)
+ select NET_VENDOR_EMULEX
+ select BE2NET
+ ---help---
+ This driver provides low-level InfiniBand over Ethernet
+ support for Emulex One Connect host channel adapters (HCAs).
diff --git a/drivers/infiniband/hw/ocrdma/Makefile b/drivers/infiniband/hw/ocrdma/Makefile
new file mode 100644
index 000000000..d1bfd4f4c
--- /dev/null
+++ b/drivers/infiniband/hw/ocrdma/Makefile
@@ -0,0 +1,5 @@
+ccflags-y := -Idrivers/net/ethernet/emulex/benet
+
+obj-$(CONFIG_INFINIBAND_OCRDMA) += ocrdma.o
+
+ocrdma-y := ocrdma_main.o ocrdma_verbs.o ocrdma_hw.o ocrdma_ah.o ocrdma_stats.o
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma.h b/drivers/infiniband/hw/ocrdma/ocrdma.h
new file mode 100644
index 000000000..b396344fa
--- /dev/null
+++ b/drivers/infiniband/hw/ocrdma/ocrdma.h
@@ -0,0 +1,579 @@
+/*******************************************************************
+ * This file is part of the Emulex RoCE Device Driver for *
+ * RoCE (RDMA over Converged Ethernet) adapters. *
+ * Copyright (C) 2008-2012 Emulex. All rights reserved. *
+ * EMULEX and SLI are trademarks of Emulex. *
+ * www.emulex.com *
+ * *
+ * This program is free software; you can redistribute it and/or *
+ * modify it under the terms of version 2 of the GNU General *
+ * Public License as published by the Free Software Foundation. *
+ * This program is distributed in the hope that it will be useful. *
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND *
+ * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, *
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE *
+ * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD *
+ * TO BE LEGALLY INVALID. See the GNU General Public License for *
+ * more details, a copy of which can be found in the file COPYING *
+ * included with this package. *
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ *******************************************************************/
+
+#ifndef __OCRDMA_H__
+#define __OCRDMA_H__
+
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_addr.h>
+
+#include <be_roce.h>
+#include "ocrdma_sli.h"
+
+#define OCRDMA_ROCE_DRV_VERSION "10.6.0.0"
+
+#define OCRDMA_ROCE_DRV_DESC "Emulex OneConnect RoCE Driver"
+#define OCRDMA_NODE_DESC "Emulex OneConnect RoCE HCA"
+
+#define OC_NAME_SH OCRDMA_NODE_DESC "(Skyhawk)"
+#define OC_NAME_UNKNOWN OCRDMA_NODE_DESC "(Unknown)"
+
+#define OC_SKH_DEVICE_PF 0x720
+#define OC_SKH_DEVICE_VF 0x728
+#define OCRDMA_MAX_AH 512
+
+#define OCRDMA_UVERBS(CMD_NAME) (1ull << IB_USER_VERBS_CMD_##CMD_NAME)
+
+#define convert_to_64bit(lo, hi) ((u64)hi << 32 | (u64)lo)
+#define EQ_INTR_PER_SEC_THRSH_HI 150000
+#define EQ_INTR_PER_SEC_THRSH_LOW 100000
+#define EQ_AIC_MAX_EQD 20
+#define EQ_AIC_MIN_EQD 0
+
+void ocrdma_eqd_set_task(struct work_struct *work);
+
+struct ocrdma_dev_attr {
+ u8 fw_ver[32];
+ u32 vendor_id;
+ u32 device_id;
+ u16 max_pd;
+ u16 max_dpp_pds;
+ u16 max_cq;
+ u16 max_cqe;
+ u16 max_qp;
+ u16 max_wqe;
+ u16 max_rqe;
+ u16 max_srq;
+ u32 max_inline_data;
+ int max_send_sge;
+ int max_recv_sge;
+ int max_srq_sge;
+ int max_rdma_sge;
+ int max_mr;
+ u64 max_mr_size;
+ u32 max_num_mr_pbl;
+ int max_mw;
+ int max_fmr;
+ int max_map_per_fmr;
+ int max_pages_per_frmr;
+ u16 max_ord_per_qp;
+ u16 max_ird_per_qp;
+
+ int device_cap_flags;
+ u8 cq_overflow_detect;
+ u8 srq_supported;
+
+ u32 wqe_size;
+ u32 rqe_size;
+ u32 ird_page_size;
+ u8 local_ca_ack_delay;
+ u8 ird;
+ u8 num_ird_pages;
+};
+
+struct ocrdma_dma_mem {
+ void *va;
+ dma_addr_t pa;
+ u32 size;
+};
+
+struct ocrdma_pbl {
+ void *va;
+ dma_addr_t pa;
+};
+
+struct ocrdma_queue_info {
+ void *va;
+ dma_addr_t dma;
+ u32 size;
+ u16 len;
+ u16 entry_size; /* Size of an element in the queue */
+ u16 id; /* qid, where to ring the doorbell. */
+ u16 head, tail;
+ bool created;
+};
+
+struct ocrdma_aic_obj { /* Adaptive interrupt coalescing (AIC) info */
+ u32 prev_eqd;
+ u64 eq_intr_cnt;
+ u64 prev_eq_intr_cnt;
+};
+
+struct ocrdma_eq {
+ struct ocrdma_queue_info q;
+ u32 vector;
+ int cq_cnt;
+ struct ocrdma_dev *dev;
+ char irq_name[32];
+ struct ocrdma_aic_obj aic_obj;
+};
+
+struct ocrdma_mq {
+ struct ocrdma_queue_info sq;
+ struct ocrdma_queue_info cq;
+ bool rearm_cq;
+};
+
+struct mqe_ctx {
+ struct mutex lock; /* for serializing mailbox commands on MQ */
+ wait_queue_head_t cmd_wait;
+ u32 tag;
+ u16 cqe_status;
+ u16 ext_status;
+ bool cmd_done;
+ bool fw_error_state;
+};
+
+struct ocrdma_hw_mr {
+ u32 lkey;
+ u8 fr_mr;
+ u8 remote_atomic;
+ u8 remote_rd;
+ u8 remote_wr;
+ u8 local_rd;
+ u8 local_wr;
+ u8 mw_bind;
+ u8 rsvd;
+ u64 len;
+ struct ocrdma_pbl *pbl_table;
+ u32 num_pbls;
+ u32 num_pbes;
+ u32 pbl_size;
+ u32 pbe_size;
+ u64 fbo;
+ u64 va;
+};
+
+struct ocrdma_mr {
+ struct ib_mr ibmr;
+ struct ib_umem *umem;
+ struct ocrdma_hw_mr hwmr;
+};
+
+struct ocrdma_stats {
+ u8 type;
+ struct ocrdma_dev *dev;
+};
+
+struct ocrdma_pd_resource_mgr {
+ u32 pd_norm_start;
+ u16 pd_norm_count;
+ u16 pd_norm_thrsh;
+ u16 max_normal_pd;
+ u32 pd_dpp_start;
+ u16 pd_dpp_count;
+ u16 pd_dpp_thrsh;
+ u16 max_dpp_pd;
+ u16 dpp_page_index;
+ unsigned long *pd_norm_bitmap;
+ unsigned long *pd_dpp_bitmap;
+ bool pd_prealloc_valid;
+};
+
+struct stats_mem {
+ struct ocrdma_mqe mqe;
+ void *va;
+ dma_addr_t pa;
+ u32 size;
+ char *debugfs_mem;
+};
+
+struct phy_info {
+ u16 auto_speeds_supported;
+ u16 fixed_speeds_supported;
+ u16 phy_type;
+ u16 interface_type;
+};
+
+struct ocrdma_dev {
+ struct ib_device ibdev;
+ struct ocrdma_dev_attr attr;
+
+ struct mutex dev_lock; /* provides synchronized access to device data */
+ spinlock_t flush_q_lock ____cacheline_aligned;
+
+ struct ocrdma_cq **cq_tbl;
+ struct ocrdma_qp **qp_tbl;
+
+ struct ocrdma_eq *eq_tbl;
+ int eq_cnt;
+ struct delayed_work eqd_work;
+ u16 base_eqid;
+ u16 max_eq;
+
+ union ib_gid *sgid_tbl;
+ /* provides synchronization for the sgid table when gid
+ * entries are updated by the notifier.
+ */
+ spinlock_t sgid_lock;
+
+ int gsi_qp_created;
+ struct ocrdma_cq *gsi_sqcq;
+ struct ocrdma_cq *gsi_rqcq;
+
+ struct {
+ struct ocrdma_av *va;
+ dma_addr_t pa;
+ u32 size;
+ u32 num_ah;
+ /* provide synchronization for av
+ * entry allocations.
+ */
+ spinlock_t lock;
+ u32 ahid;
+ struct ocrdma_pbl pbl;
+ } av_tbl;
+
+ void *mbx_cmd;
+ struct ocrdma_mq mq;
+ struct mqe_ctx mqe_ctx;
+
+ struct be_dev_info nic_info;
+ struct phy_info phy;
+ char model_number[32];
+ u32 hba_port_num;
+
+ struct list_head entry;
+ struct rcu_head rcu;
+ int id;
+ u64 *stag_arr;
+ u8 sl; /* service level */
+ bool pfc_state;
+ atomic_t update_sl;
+ u16 pvid;
+ u32 asic_id;
+
+ ulong last_stats_time;
+ struct mutex stats_lock; /* provide synch for debugfs operations */
+ struct stats_mem stats_mem;
+ struct ocrdma_stats rsrc_stats;
+ struct ocrdma_stats rx_stats;
+ struct ocrdma_stats wqe_stats;
+ struct ocrdma_stats tx_stats;
+ struct ocrdma_stats db_err_stats;
+ struct ocrdma_stats tx_qp_err_stats;
+ struct ocrdma_stats rx_qp_err_stats;
+ struct ocrdma_stats tx_dbg_stats;
+ struct ocrdma_stats rx_dbg_stats;
+ struct ocrdma_stats driver_stats;
+ struct ocrdma_stats reset_stats;
+ struct dentry *dir;
+ atomic_t async_err_stats[OCRDMA_MAX_ASYNC_ERRORS];
+ atomic_t cqe_err_stats[OCRDMA_MAX_CQE_ERR];
+ struct ocrdma_pd_resource_mgr *pd_mgr;
+};
+
+struct ocrdma_cq {
+ struct ib_cq ibcq;
+ struct ocrdma_cqe *va;
+ u32 phase;
+ u32 getp; /* index of the next pending wr to
+ * return to the stack; wraps around
+ * at max_hw_cqe
+ */
+ u32 max_hw_cqe;
+ bool phase_change;
+ bool deferred_arm, deferred_sol;
+ bool first_arm;
+
+ spinlock_t cq_lock ____cacheline_aligned; /* provides synchronization
+ * for cq polling
+ */
+ /* synchronizes the cq completion handler invoked from multiple contexts */
+ spinlock_t comp_handler_lock ____cacheline_aligned;
+ u16 id;
+ u16 eqn;
+
+ struct ocrdma_ucontext *ucontext;
+ dma_addr_t pa;
+ u32 len;
+ u32 cqe_cnt;
+
+ /* head of all qp's sq and rq for which cqes need to be flushed
+ * by the software.
+ */
+ struct list_head sq_head, rq_head;
+};
+
+struct ocrdma_pd {
+ struct ib_pd ibpd;
+ struct ocrdma_ucontext *uctx;
+ u32 id;
+ int num_dpp_qp;
+ u32 dpp_page;
+ bool dpp_enabled;
+};
+
+struct ocrdma_ah {
+ struct ib_ah ibah;
+ struct ocrdma_av *av;
+ u16 sgid_index;
+ u32 id;
+};
+
+struct ocrdma_qp_hwq_info {
+ u8 *va; /* virtual address */
+ u32 max_sges;
+ u32 head, tail;
+ u32 entry_size;
+ u32 max_cnt;
+ u32 max_wqe_idx;
+ u16 dbid; /* qid, where to ring the doorbell. */
+ u32 len;
+ dma_addr_t pa;
+};
+
+struct ocrdma_srq {
+ struct ib_srq ibsrq;
+ u8 __iomem *db;
+ struct ocrdma_qp_hwq_info rq;
+ u64 *rqe_wr_id_tbl;
+ u32 *idx_bit_fields;
+ u32 bit_fields_len;
+
+ /* provides synchronization for multiple contexts posting rqes */
+ spinlock_t q_lock ____cacheline_aligned;
+
+ struct ocrdma_pd *pd;
+ u32 id;
+};
+
+struct ocrdma_qp {
+ struct ib_qp ibqp;
+
+ u8 __iomem *sq_db;
+ struct ocrdma_qp_hwq_info sq;
+ struct {
+ uint64_t wrid;
+ uint16_t dpp_wqe_idx;
+ uint16_t dpp_wqe;
+ uint8_t signaled;
+ uint8_t rsvd[3];
+ } *wqe_wr_id_tbl;
+ u32 max_inline_data;
+
+ /* provides synchronization for multiple contexts posting wqes and rqes */
+ spinlock_t q_lock ____cacheline_aligned;
+ struct ocrdma_cq *sq_cq;
+ /* list maintained per CQ to flush SQ errors */
+ struct list_head sq_entry;
+
+ u8 __iomem *rq_db;
+ struct ocrdma_qp_hwq_info rq;
+ u64 *rqe_wr_id_tbl;
+ struct ocrdma_cq *rq_cq;
+ struct ocrdma_srq *srq;
+ /* list maintained per CQ to flush RQ errors */
+ struct list_head rq_entry;
+
+ enum ocrdma_qp_state state; /* QP state */
+ int cap_flags;
+ u32 max_ord, max_ird;
+
+ u32 id;
+ struct ocrdma_pd *pd;
+
+ enum ib_qp_type qp_type;
+
+ int sgid_idx;
+ u32 qkey;
+ bool dpp_enabled;
+ u8 *ird_q_va;
+ bool signaled;
+};
+
+struct ocrdma_ucontext {
+ struct ib_ucontext ibucontext;
+
+ struct list_head mm_head;
+ struct mutex mm_list_lock; /* protects list entries of mm type */
+ struct ocrdma_pd *cntxt_pd;
+ int pd_in_use;
+
+ struct {
+ u32 *va;
+ dma_addr_t pa;
+ u32 len;
+ } ah_tbl;
+};
+
+struct ocrdma_mm {
+ struct {
+ u64 phy_addr;
+ unsigned long len;
+ } key;
+ struct list_head entry;
+};
+
+static inline struct ocrdma_dev *get_ocrdma_dev(struct ib_device *ibdev)
+{
+ return container_of(ibdev, struct ocrdma_dev, ibdev);
+}
+
+static inline struct ocrdma_ucontext *get_ocrdma_ucontext(struct ib_ucontext
+ *ibucontext)
+{
+ return container_of(ibucontext, struct ocrdma_ucontext, ibucontext);
+}
+
+static inline struct ocrdma_pd *get_ocrdma_pd(struct ib_pd *ibpd)
+{
+ return container_of(ibpd, struct ocrdma_pd, ibpd);
+}
+
+static inline struct ocrdma_cq *get_ocrdma_cq(struct ib_cq *ibcq)
+{
+ return container_of(ibcq, struct ocrdma_cq, ibcq);
+}
+
+static inline struct ocrdma_qp *get_ocrdma_qp(struct ib_qp *ibqp)
+{
+ return container_of(ibqp, struct ocrdma_qp, ibqp);
+}
+
+static inline struct ocrdma_mr *get_ocrdma_mr(struct ib_mr *ibmr)
+{
+ return container_of(ibmr, struct ocrdma_mr, ibmr);
+}
+
+static inline struct ocrdma_ah *get_ocrdma_ah(struct ib_ah *ibah)
+{
+ return container_of(ibah, struct ocrdma_ah, ibah);
+}
+
+static inline struct ocrdma_srq *get_ocrdma_srq(struct ib_srq *ibsrq)
+{
+ return container_of(ibsrq, struct ocrdma_srq, ibsrq);
+}
+
+static inline int is_cqe_valid(struct ocrdma_cq *cq, struct ocrdma_cqe *cqe)
+{
+ int cqe_valid;
+ cqe_valid = le32_to_cpu(cqe->flags_status_srcqpn) & OCRDMA_CQE_VALID;
+ return (cqe_valid == cq->phase);
+}
+
+static inline int is_cqe_for_sq(struct ocrdma_cqe *cqe)
+{
+ return (le32_to_cpu(cqe->flags_status_srcqpn) &
+ OCRDMA_CQE_QTYPE) ? 0 : 1;
+}
+
+static inline int is_cqe_invalidated(struct ocrdma_cqe *cqe)
+{
+ return (le32_to_cpu(cqe->flags_status_srcqpn) &
+ OCRDMA_CQE_INVALIDATE) ? 1 : 0;
+}
+
+static inline int is_cqe_imm(struct ocrdma_cqe *cqe)
+{
+ return (le32_to_cpu(cqe->flags_status_srcqpn) &
+ OCRDMA_CQE_IMM) ? 1 : 0;
+}
+
+static inline int is_cqe_wr_imm(struct ocrdma_cqe *cqe)
+{
+ return (le32_to_cpu(cqe->flags_status_srcqpn) &
+ OCRDMA_CQE_WRITE_IMM) ? 1 : 0;
+}
+
+static inline int ocrdma_resolve_dmac(struct ocrdma_dev *dev,
+ struct ib_ah_attr *ah_attr, u8 *mac_addr)
+{
+ struct in6_addr in6;
+
+ memcpy(&in6, ah_attr->grh.dgid.raw, sizeof(in6));
+ if (rdma_is_multicast_addr(&in6))
+ rdma_get_mcast_mac(&in6, mac_addr);
+ else if (rdma_link_local_addr(&in6))
+ rdma_get_ll_mac(&in6, mac_addr);
+ else
+ memcpy(mac_addr, ah_attr->dmac, ETH_ALEN);
+ return 0;
+}
+
+static inline char *hca_name(struct ocrdma_dev *dev)
+{
+ switch (dev->nic_info.pdev->device) {
+ case OC_SKH_DEVICE_PF:
+ case OC_SKH_DEVICE_VF:
+ return OC_NAME_SH;
+ default:
+ return OC_NAME_UNKNOWN;
+ }
+}
+
+static inline int ocrdma_get_eq_table_index(struct ocrdma_dev *dev,
+ int eqid)
+{
+ int indx;
+
+ for (indx = 0; indx < dev->eq_cnt; indx++) {
+ if (dev->eq_tbl[indx].q.id == eqid)
+ return indx;
+ }
+
+ return -EINVAL;
+}
+
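+/* Lazily read the SLI ASIC ID from PCI config space on first use and
+ * return the ASIC generation number encoded in it.
+ */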
+static inline u8 ocrdma_get_asic_type(struct ocrdma_dev *dev)
+{
+ if (dev->nic_info.dev_family == 0xF && !dev->asic_id) {
+ pci_read_config_dword(
+ dev->nic_info.pdev,
+ OCRDMA_SLI_ASIC_ID_OFFSET, &dev->asic_id);
+ }
+
+ return (dev->asic_id & OCRDMA_SLI_ASIC_GEN_NUM_MASK) >>
+ OCRDMA_SLI_ASIC_GEN_NUM_SHIFT;
+}
+
+static inline u8 ocrdma_get_pfc_prio(u8 *pfc, u8 prio)
+{
+ return *(pfc + prio);
+}
+
+static inline u8 ocrdma_get_app_prio(u8 *app_prio, u8 prio)
+{
+ return *(app_prio + prio);
+}
+
+static inline u8 ocrdma_is_enabled_and_synced(u32 state)
+{ /* May also be used to interpret TC-state, QCN-state,
+ * Appl-state and Logical-link-state in the future.
+ */
+ return (state & OCRDMA_STATE_FLAG_ENABLED) &&
+ (state & OCRDMA_STATE_FLAG_SYNC);
+}
+
+#endif
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_abi.h b/drivers/infiniband/hw/ocrdma/ocrdma_abi.h
new file mode 100644
index 000000000..1554cca57
--- /dev/null
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_abi.h
@@ -0,0 +1,134 @@
+/*******************************************************************
+ * This file is part of the Emulex RoCE Device Driver for *
+ * RoCE (RDMA over Converged Ethernet) adapters. *
+ * Copyright (C) 2008-2012 Emulex. All rights reserved. *
+ * EMULEX and SLI are trademarks of Emulex. *
+ * www.emulex.com *
+ * *
+ * This program is free software; you can redistribute it and/or *
+ * modify it under the terms of version 2 of the GNU General *
+ * Public License as published by the Free Software Foundation. *
+ * This program is distributed in the hope that it will be useful. *
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND *
+ * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, *
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE *
+ * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD *
+ * TO BE LEGALLY INVALID. See the GNU General Public License for *
+ * more details, a copy of which can be found in the file COPYING *
+ * included with this package. *
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ *******************************************************************/
+
+#ifndef __OCRDMA_ABI_H__
+#define __OCRDMA_ABI_H__
+
+#define OCRDMA_ABI_VERSION 2
+#define OCRDMA_BE_ROCE_ABI_VERSION 1
+/* user/kernel communication data structures. */
+
+struct ocrdma_alloc_ucontext_resp {
+ u32 dev_id;
+ u32 wqe_size;
+ u32 max_inline_data;
+ u32 dpp_wqe_size;
+ u64 ah_tbl_page;
+ u32 ah_tbl_len;
+ u32 rqe_size;
+ u8 fw_ver[32];
+ /* for future use/new features in progress */
+ u64 rsvd1;
+ u64 rsvd2;
+};
+
+struct ocrdma_alloc_pd_ureq {
+ u64 rsvd1;
+};
+
+struct ocrdma_alloc_pd_uresp {
+ u32 id;
+ u32 dpp_enabled;
+ u32 dpp_page_addr_hi;
+ u32 dpp_page_addr_lo;
+ u64 rsvd1;
+};
+
+struct ocrdma_create_cq_ureq {
+ u32 dpp_cq;
+ u32 rsvd; /* pad */
+};
+
+#define MAX_CQ_PAGES 8
+struct ocrdma_create_cq_uresp {
+ u32 cq_id;
+ u32 page_size;
+ u32 num_pages;
+ u32 max_hw_cqe;
+ u64 page_addr[MAX_CQ_PAGES];
+ u64 db_page_addr;
+ u32 db_page_size;
+ u32 phase_change;
+ /* for future use/new features in progress */
+ u64 rsvd1;
+ u64 rsvd2;
+};
+
+#define MAX_QP_PAGES 8
+#define MAX_UD_AV_PAGES 8
+
+struct ocrdma_create_qp_ureq {
+ u8 enable_dpp_cq;
+ u8 rsvd;
+ u16 dpp_cq_id;
+ u32 rsvd1; /* pad */
+};
+
+struct ocrdma_create_qp_uresp {
+ u16 qp_id;
+ u16 sq_dbid;
+ u16 rq_dbid;
+ u16 resv0; /* pad */
+ u32 sq_page_size;
+ u32 rq_page_size;
+ u32 num_sq_pages;
+ u32 num_rq_pages;
+ u64 sq_page_addr[MAX_QP_PAGES];
+ u64 rq_page_addr[MAX_QP_PAGES];
+ u64 db_page_addr;
+ u32 db_page_size;
+ u32 dpp_credit;
+ u32 dpp_offset;
+ u32 num_wqe_allocated;
+ u32 num_rqe_allocated;
+ u32 db_sq_offset;
+ u32 db_rq_offset;
+ u32 db_shift;
+ u64 rsvd[11];
+} __packed;
+
+struct ocrdma_create_srq_uresp {
+ u16 rq_dbid;
+ u16 resv0; /* pad */
+ u32 resv1;
+
+ u32 rq_page_size;
+ u32 num_rq_pages;
+
+ u64 rq_page_addr[MAX_QP_PAGES];
+ u64 db_page_addr;
+
+ u32 db_page_size;
+ u32 num_rqe_allocated;
+ u32 db_rq_offset;
+ u32 db_shift;
+
+ u64 rsvd2;
+ u64 rsvd3;
+};
+
+#endif /* __OCRDMA_ABI_H__ */
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
new file mode 100644
index 000000000..f5a5ea836
--- /dev/null
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c
@@ -0,0 +1,227 @@
+/*******************************************************************
+ * This file is part of the Emulex RoCE Device Driver for *
+ * RoCE (RDMA over Converged Ethernet) adapters. *
+ * Copyright (C) 2008-2012 Emulex. All rights reserved. *
+ * EMULEX and SLI are trademarks of Emulex. *
+ * www.emulex.com *
+ * *
+ * This program is free software; you can redistribute it and/or *
+ * modify it under the terms of version 2 of the GNU General *
+ * Public License as published by the Free Software Foundation. *
+ * This program is distributed in the hope that it will be useful. *
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND *
+ * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, *
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE *
+ * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD *
+ * TO BE LEGALLY INVALID. See the GNU General Public License for *
+ * more details, a copy of which can be found in the file COPYING *
+ * included with this package. *
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ *******************************************************************/
+
+#include <net/neighbour.h>
+#include <net/netevent.h>
+
+#include <rdma/ib_addr.h>
+#include <rdma/ib_mad.h>
+
+#include "ocrdma.h"
+#include "ocrdma_verbs.h"
+#include "ocrdma_ah.h"
+#include "ocrdma_hw.h"
+#include "ocrdma_stats.h"
+
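+/* The 802.1Q priority code point (PCP) occupies the top three bits of the
+ * VLAN TCI, hence the shift of 13 when folding the service level into the
+ * vlan tag below.
+ */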
+#define OCRDMA_VID_PCP_SHIFT 0xD
+
+static inline int set_av_attr(struct ocrdma_dev *dev, struct ocrdma_ah *ah,
+ struct ib_ah_attr *attr, union ib_gid *sgid,
+ int pdid, bool *isvlan)
+{
+ int status = 0;
+ u16 vlan_tag;
+ struct ocrdma_eth_vlan eth;
+ struct ocrdma_grh grh;
+ int eth_sz;
+
+ memset(&eth, 0, sizeof(eth));
+ memset(&grh, 0, sizeof(grh));
+
+ /* VLAN */
+ vlan_tag = attr->vlan_id;
+ if (!vlan_tag || (vlan_tag > 0xFFF))
+ vlan_tag = dev->pvid;
+ if (vlan_tag || dev->pfc_state) {
+ if (!vlan_tag) {
+ pr_err("ocrdma%d:Using VLAN with PFC is recommended\n",
+ dev->id);
+ pr_err("ocrdma%d:Using VLAN 0 for this connection\n",
+ dev->id);
+ }
+ eth.eth_type = cpu_to_be16(0x8100);
+ eth.roce_eth_type = cpu_to_be16(OCRDMA_ROCE_ETH_TYPE);
+ vlan_tag |= (dev->sl & 0x07) << OCRDMA_VID_PCP_SHIFT;
+ eth.vlan_tag = cpu_to_be16(vlan_tag);
+ eth_sz = sizeof(struct ocrdma_eth_vlan);
+ *isvlan = true;
+ } else {
+ eth.eth_type = cpu_to_be16(OCRDMA_ROCE_ETH_TYPE);
+ eth_sz = sizeof(struct ocrdma_eth_basic);
+ }
+ /* MAC */
+ memcpy(&eth.smac[0], &dev->nic_info.mac_addr[0], ETH_ALEN);
+ status = ocrdma_resolve_dmac(dev, attr, &eth.dmac[0]);
+ if (status)
+ return status;
+ ah->sgid_index = attr->grh.sgid_index;
+ memcpy(&grh.sgid[0], sgid->raw, sizeof(union ib_gid));
+ memcpy(&grh.dgid[0], attr->grh.dgid.raw, sizeof(attr->grh.dgid.raw));
+
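+ /* The GRH mirrors an IPv6 header: version 6 in the top nibble,
+ * followed by the traffic class and flow label.
+ */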
+ grh.tclass_flow = cpu_to_be32((6 << 28) |
+ (attr->grh.traffic_class << 24) |
+ attr->grh.flow_label);
+ /* 0x1b is next header value in GRH */
+ grh.pdid_hoplimit = cpu_to_be32((pdid << 16) |
+ (0x1b << 8) | attr->grh.hop_limit);
+ /* Eth HDR */
+ memcpy(&ah->av->eth_hdr, &eth, eth_sz);
+ memcpy((u8 *)ah->av + eth_sz, &grh, sizeof(struct ocrdma_grh));
+ if (*isvlan)
+ ah->av->valid |= OCRDMA_AV_VLAN_VALID;
+ ah->av->valid = cpu_to_le32(ah->av->valid);
+ return status;
+}
+
+struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr)
+{
+ u32 *ahid_addr;
+ bool isvlan = false;
+ int status;
+ struct ocrdma_ah *ah;
+ struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
+ struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
+ union ib_gid sgid;
+
+ if (!(attr->ah_flags & IB_AH_GRH))
+ return ERR_PTR(-EINVAL);
+
+ if (atomic_cmpxchg(&dev->update_sl, 1, 0))
+ ocrdma_init_service_level(dev);
+ ah = kzalloc(sizeof(*ah), GFP_ATOMIC);
+ if (!ah)
+ return ERR_PTR(-ENOMEM);
+
+ status = ocrdma_alloc_av(dev, ah);
+ if (status)
+ goto av_err;
+
+ status = ocrdma_query_gid(&dev->ibdev, 1, attr->grh.sgid_index, &sgid);
+ if (status) {
+ pr_err("%s(): Failed to query sgid, status = %d\n",
+ __func__, status);
+ goto av_conf_err;
+ }
+
+ if ((pd->uctx) &&
+ (!rdma_is_multicast_addr((struct in6_addr *)attr->grh.dgid.raw)) &&
+ (!rdma_link_local_addr((struct in6_addr *)attr->grh.dgid.raw))) {
+ status = rdma_addr_find_dmac_by_grh(&sgid, &attr->grh.dgid,
+ attr->dmac, &attr->vlan_id);
+ if (status) {
+ pr_err("%s(): Failed to resolve dmac from gid."
+ "status = %d\n", __func__, status);
+ goto av_conf_err;
+ }
+ }
+
+ status = set_av_attr(dev, ah, attr, &sgid, pd->id, &isvlan);
+ if (status)
+ goto av_conf_err;
+
+ /* if pd is for the user process, pass the ah_id to user space */
+ if ((pd->uctx) && (pd->uctx->ah_tbl.va)) {
+ ahid_addr = pd->uctx->ah_tbl.va + attr->dlid;
+ *ahid_addr = 0;
+ *ahid_addr |= ah->id & OCRDMA_AH_ID_MASK;
+ if (isvlan)
+ *ahid_addr |= (OCRDMA_AH_VLAN_VALID_MASK <<
+ OCRDMA_AH_VLAN_VALID_SHIFT);
+ }
+
+ return &ah->ibah;
+
+av_conf_err:
+ ocrdma_free_av(dev, ah);
+av_err:
+ kfree(ah);
+ return ERR_PTR(status);
+}
+
+int ocrdma_destroy_ah(struct ib_ah *ibah)
+{
+ struct ocrdma_ah *ah = get_ocrdma_ah(ibah);
+ struct ocrdma_dev *dev = get_ocrdma_dev(ibah->device);
+
+ ocrdma_free_av(dev, ah);
+ kfree(ah);
+ return 0;
+}
+
+int ocrdma_query_ah(struct ib_ah *ibah, struct ib_ah_attr *attr)
+{
+ struct ocrdma_ah *ah = get_ocrdma_ah(ibah);
+ struct ocrdma_av *av = ah->av;
+ struct ocrdma_grh *grh;
+ attr->ah_flags |= IB_AH_GRH;
+ if (ah->av->valid & OCRDMA_AV_VALID) {
+ grh = (struct ocrdma_grh *)((u8 *)ah->av +
+ sizeof(struct ocrdma_eth_vlan));
+ attr->sl = be16_to_cpu(av->eth_hdr.vlan_tag) >> 13;
+ } else {
+ grh = (struct ocrdma_grh *)((u8 *)ah->av +
+ sizeof(struct ocrdma_eth_basic));
+ attr->sl = 0;
+ }
+ memcpy(&attr->grh.dgid.raw[0], &grh->dgid[0], sizeof(grh->dgid));
+ attr->grh.sgid_index = ah->sgid_index;
+ attr->grh.hop_limit = be32_to_cpu(grh->pdid_hoplimit) & 0xff;
+ attr->grh.traffic_class = be32_to_cpu(grh->tclass_flow) >> 24;
+ attr->grh.flow_label = be32_to_cpu(grh->tclass_flow) & 0x00ffffffff;
+ return 0;
+}
+
+int ocrdma_modify_ah(struct ib_ah *ibah, struct ib_ah_attr *attr)
+{
+ /* modify_ah is unsupported */
+ return -ENOSYS;
+}
+
+int ocrdma_process_mad(struct ib_device *ibdev,
+ int process_mad_flags,
+ u8 port_num,
+ struct ib_wc *in_wc,
+ struct ib_grh *in_grh,
+ struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+ int status;
+ struct ocrdma_dev *dev;
+
+ switch (in_mad->mad_hdr.mgmt_class) {
+ case IB_MGMT_CLASS_PERF_MGMT:
+ dev = get_ocrdma_dev(ibdev);
+ if (!ocrdma_pma_counters(dev, out_mad))
+ status = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+ else
+ status = IB_MAD_RESULT_SUCCESS;
+ break;
+ default:
+ status = IB_MAD_RESULT_SUCCESS;
+ break;
+ }
+ return status;
+}
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.h b/drivers/infiniband/hw/ocrdma/ocrdma_ah.h
new file mode 100644
index 000000000..726a87cf2
--- /dev/null
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.h
@@ -0,0 +1,48 @@
+/*******************************************************************
+ * This file is part of the Emulex RoCE Device Driver for *
+ * RoCE (RDMA over Converged Ethernet) adapters. *
+ * Copyright (C) 2008-2012 Emulex. All rights reserved. *
+ * EMULEX and SLI are trademarks of Emulex. *
+ * www.emulex.com *
+ * *
+ * This program is free software; you can redistribute it and/or *
+ * modify it under the terms of version 2 of the GNU General *
+ * Public License as published by the Free Software Foundation. *
+ * This program is distributed in the hope that it will be useful. *
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND *
+ * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, *
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE *
+ * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD *
+ * TO BE LEGALLY INVALID. See the GNU General Public License for *
+ * more details, a copy of which can be found in the file COPYING *
+ * included with this package. *
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ *******************************************************************/
+
+#ifndef __OCRDMA_AH_H__
+#define __OCRDMA_AH_H__
+
+enum {
+ OCRDMA_AH_ID_MASK = 0x3FF,
+ OCRDMA_AH_VLAN_VALID_MASK = 0x01,
+ OCRDMA_AH_VLAN_VALID_SHIFT = 0x1F
+};
+
+struct ib_ah *ocrdma_create_ah(struct ib_pd *, struct ib_ah_attr *);
+int ocrdma_destroy_ah(struct ib_ah *);
+int ocrdma_query_ah(struct ib_ah *, struct ib_ah_attr *);
+int ocrdma_modify_ah(struct ib_ah *, struct ib_ah_attr *);
+
+int ocrdma_process_mad(struct ib_device *,
+ int process_mad_flags,
+ u8 port_num,
+ struct ib_wc *in_wc,
+ struct ib_grh *in_grh,
+ struct ib_mad *in_mad, struct ib_mad *out_mad);
+#endif /* __OCRDMA_AH_H__ */
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.c b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
new file mode 100644
index 000000000..47615ff33
--- /dev/null
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.c
@@ -0,0 +1,3172 @@
+/*******************************************************************
+ * This file is part of the Emulex RoCE Device Driver for *
+ * RoCE (RDMA over Converged Ethernet) CNA Adapters. *
+ * Copyright (C) 2008-2012 Emulex. All rights reserved. *
+ * EMULEX and SLI are trademarks of Emulex. *
+ * www.emulex.com *
+ * *
+ * This program is free software; you can redistribute it and/or *
+ * modify it under the terms of version 2 of the GNU General *
+ * Public License as published by the Free Software Foundation. *
+ * This program is distributed in the hope that it will be useful. *
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND *
+ * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, *
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE *
+ * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD *
+ * TO BE LEGALLY INVALID. See the GNU General Public License for *
+ * more details, a copy of which can be found in the file COPYING *
+ * included with this package. *
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ *******************************************************************/
+
+#include <linux/sched.h>
+#include <linux/interrupt.h>
+#include <linux/log2.h>
+#include <linux/dma-mapping.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+
+#include "ocrdma.h"
+#include "ocrdma_hw.h"
+#include "ocrdma_verbs.h"
+#include "ocrdma_ah.h"
+
+enum mbx_status {
+ OCRDMA_MBX_STATUS_FAILED = 1,
+ OCRDMA_MBX_STATUS_ILLEGAL_FIELD = 3,
+ OCRDMA_MBX_STATUS_OOR = 100,
+ OCRDMA_MBX_STATUS_INVALID_PD = 101,
+ OCRDMA_MBX_STATUS_PD_INUSE = 102,
+ OCRDMA_MBX_STATUS_INVALID_CQ = 103,
+ OCRDMA_MBX_STATUS_INVALID_QP = 104,
+ OCRDMA_MBX_STATUS_INVALID_LKEY = 105,
+ OCRDMA_MBX_STATUS_ORD_EXCEEDS = 106,
+ OCRDMA_MBX_STATUS_IRD_EXCEEDS = 107,
+ OCRDMA_MBX_STATUS_SENDQ_WQE_EXCEEDS = 108,
+ OCRDMA_MBX_STATUS_RECVQ_RQE_EXCEEDS = 109,
+ OCRDMA_MBX_STATUS_SGE_SEND_EXCEEDS = 110,
+ OCRDMA_MBX_STATUS_SGE_WRITE_EXCEEDS = 111,
+ OCRDMA_MBX_STATUS_SGE_RECV_EXCEEDS = 112,
+ OCRDMA_MBX_STATUS_INVALID_STATE_CHANGE = 113,
+ OCRDMA_MBX_STATUS_MW_BOUND = 114,
+ OCRDMA_MBX_STATUS_INVALID_VA = 115,
+ OCRDMA_MBX_STATUS_INVALID_LENGTH = 116,
+ OCRDMA_MBX_STATUS_INVALID_FBO = 117,
+ OCRDMA_MBX_STATUS_INVALID_ACC_RIGHTS = 118,
+ OCRDMA_MBX_STATUS_INVALID_PBE_SIZE = 119,
+ OCRDMA_MBX_STATUS_INVALID_PBL_ENTRY = 120,
+ OCRDMA_MBX_STATUS_INVALID_PBL_SHIFT = 121,
+ OCRDMA_MBX_STATUS_INVALID_SRQ_ID = 129,
+ OCRDMA_MBX_STATUS_SRQ_ERROR = 133,
+ OCRDMA_MBX_STATUS_RQE_EXCEEDS = 134,
+ OCRDMA_MBX_STATUS_MTU_EXCEEDS = 135,
+ OCRDMA_MBX_STATUS_MAX_QP_EXCEEDS = 136,
+ OCRDMA_MBX_STATUS_SRQ_LIMIT_EXCEEDS = 137,
+ OCRDMA_MBX_STATUS_SRQ_SIZE_UNDERUNS = 138,
+ OCRDMA_MBX_STATUS_QP_BOUND = 130,
+ OCRDMA_MBX_STATUS_INVALID_CHANGE = 139,
+ OCRDMA_MBX_STATUS_ATOMIC_OPS_UNSUP = 140,
+ OCRDMA_MBX_STATUS_INVALID_RNR_NAK_TIMER = 141,
+ OCRDMA_MBX_STATUS_MW_STILL_BOUND = 142,
+ OCRDMA_MBX_STATUS_PKEY_INDEX_INVALID = 143,
+ OCRDMA_MBX_STATUS_PKEY_INDEX_EXCEEDS = 144
+};
+
+enum additional_status {
+ OCRDMA_MBX_ADDI_STATUS_INSUFFICIENT_RESOURCES = 22
+};
+
+enum cqe_status {
+ OCRDMA_MBX_CQE_STATUS_INSUFFICIENT_PRIVILEDGES = 1,
+ OCRDMA_MBX_CQE_STATUS_INVALID_PARAMETER = 2,
+ OCRDMA_MBX_CQE_STATUS_INSUFFICIENT_RESOURCES = 3,
+ OCRDMA_MBX_CQE_STATUS_QUEUE_FLUSHING = 4,
+ OCRDMA_MBX_CQE_STATUS_DMA_FAILED = 5
+};
+
+static inline void *ocrdma_get_eqe(struct ocrdma_eq *eq)
+{
+ return eq->q.va + (eq->q.tail * sizeof(struct ocrdma_eqe));
+}
+
+static inline void ocrdma_eq_inc_tail(struct ocrdma_eq *eq)
+{
+ eq->q.tail = (eq->q.tail + 1) & (OCRDMA_EQ_LEN - 1);
+}
+
+static inline void *ocrdma_get_mcqe(struct ocrdma_dev *dev)
+{
+ struct ocrdma_mcqe *cqe = (struct ocrdma_mcqe *)
+ (dev->mq.cq.va + (dev->mq.cq.tail * sizeof(struct ocrdma_mcqe)));
+
+ if (!(le32_to_cpu(cqe->valid_ae_cmpl_cons) & OCRDMA_MCQE_VALID_MASK))
+ return NULL;
+ return cqe;
+}
+
+static inline void ocrdma_mcq_inc_tail(struct ocrdma_dev *dev)
+{
+ dev->mq.cq.tail = (dev->mq.cq.tail + 1) & (OCRDMA_MQ_CQ_LEN - 1);
+}
+
+static inline struct ocrdma_mqe *ocrdma_get_mqe(struct ocrdma_dev *dev)
+{
+ return dev->mq.sq.va + (dev->mq.sq.head * sizeof(struct ocrdma_mqe));
+}
+
+static inline void ocrdma_mq_inc_head(struct ocrdma_dev *dev)
+{
+ dev->mq.sq.head = (dev->mq.sq.head + 1) & (OCRDMA_MQ_LEN - 1);
+}
+
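+/* The mailbox response is read back from the MQE slot the command was
+ * posted from; mqe_ctx.tag records that slot index.
+ */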
+static inline void *ocrdma_get_mqe_rsp(struct ocrdma_dev *dev)
+{
+ return dev->mq.sq.va + (dev->mqe_ctx.tag * sizeof(struct ocrdma_mqe));
+}
+
+enum ib_qp_state get_ibqp_state(enum ocrdma_qp_state qps)
+{
+ switch (qps) {
+ case OCRDMA_QPS_RST:
+ return IB_QPS_RESET;
+ case OCRDMA_QPS_INIT:
+ return IB_QPS_INIT;
+ case OCRDMA_QPS_RTR:
+ return IB_QPS_RTR;
+ case OCRDMA_QPS_RTS:
+ return IB_QPS_RTS;
+ case OCRDMA_QPS_SQD:
+ case OCRDMA_QPS_SQ_DRAINING:
+ return IB_QPS_SQD;
+ case OCRDMA_QPS_SQE:
+ return IB_QPS_SQE;
+ case OCRDMA_QPS_ERR:
+ return IB_QPS_ERR;
+ }
+ return IB_QPS_ERR;
+}
+
+static enum ocrdma_qp_state get_ocrdma_qp_state(enum ib_qp_state qps)
+{
+ switch (qps) {
+ case IB_QPS_RESET:
+ return OCRDMA_QPS_RST;
+ case IB_QPS_INIT:
+ return OCRDMA_QPS_INIT;
+ case IB_QPS_RTR:
+ return OCRDMA_QPS_RTR;
+ case IB_QPS_RTS:
+ return OCRDMA_QPS_RTS;
+ case IB_QPS_SQD:
+ return OCRDMA_QPS_SQD;
+ case IB_QPS_SQE:
+ return OCRDMA_QPS_SQE;
+ case IB_QPS_ERR:
+ return OCRDMA_QPS_ERR;
+ }
+ return OCRDMA_QPS_ERR;
+}
+
+static int ocrdma_get_mbx_errno(u32 status)
+{
+ int err_num;
+ u8 mbox_status = (status & OCRDMA_MBX_RSP_STATUS_MASK) >>
+ OCRDMA_MBX_RSP_STATUS_SHIFT;
+ u8 add_status = (status & OCRDMA_MBX_RSP_ASTATUS_MASK) >>
+ OCRDMA_MBX_RSP_ASTATUS_SHIFT;
+
+ switch (mbox_status) {
+ case OCRDMA_MBX_STATUS_OOR:
+ case OCRDMA_MBX_STATUS_MAX_QP_EXCEEDS:
+ err_num = -EAGAIN;
+ break;
+
+ case OCRDMA_MBX_STATUS_INVALID_PD:
+ case OCRDMA_MBX_STATUS_INVALID_CQ:
+ case OCRDMA_MBX_STATUS_INVALID_SRQ_ID:
+ case OCRDMA_MBX_STATUS_INVALID_QP:
+ case OCRDMA_MBX_STATUS_INVALID_CHANGE:
+ case OCRDMA_MBX_STATUS_MTU_EXCEEDS:
+ case OCRDMA_MBX_STATUS_INVALID_RNR_NAK_TIMER:
+ case OCRDMA_MBX_STATUS_PKEY_INDEX_INVALID:
+ case OCRDMA_MBX_STATUS_PKEY_INDEX_EXCEEDS:
+ case OCRDMA_MBX_STATUS_ILLEGAL_FIELD:
+ case OCRDMA_MBX_STATUS_INVALID_PBL_ENTRY:
+ case OCRDMA_MBX_STATUS_INVALID_LKEY:
+ case OCRDMA_MBX_STATUS_INVALID_VA:
+ case OCRDMA_MBX_STATUS_INVALID_LENGTH:
+ case OCRDMA_MBX_STATUS_INVALID_FBO:
+ case OCRDMA_MBX_STATUS_INVALID_ACC_RIGHTS:
+ case OCRDMA_MBX_STATUS_INVALID_PBE_SIZE:
+ case OCRDMA_MBX_STATUS_ATOMIC_OPS_UNSUP:
+ case OCRDMA_MBX_STATUS_SRQ_ERROR:
+ case OCRDMA_MBX_STATUS_SRQ_SIZE_UNDERUNS:
+ err_num = -EINVAL;
+ break;
+
+ case OCRDMA_MBX_STATUS_PD_INUSE:
+ case OCRDMA_MBX_STATUS_QP_BOUND:
+ case OCRDMA_MBX_STATUS_MW_STILL_BOUND:
+ case OCRDMA_MBX_STATUS_MW_BOUND:
+ err_num = -EBUSY;
+ break;
+
+ case OCRDMA_MBX_STATUS_RECVQ_RQE_EXCEEDS:
+ case OCRDMA_MBX_STATUS_SGE_RECV_EXCEEDS:
+ case OCRDMA_MBX_STATUS_RQE_EXCEEDS:
+ case OCRDMA_MBX_STATUS_SRQ_LIMIT_EXCEEDS:
+ case OCRDMA_MBX_STATUS_ORD_EXCEEDS:
+ case OCRDMA_MBX_STATUS_IRD_EXCEEDS:
+ case OCRDMA_MBX_STATUS_SENDQ_WQE_EXCEEDS:
+ case OCRDMA_MBX_STATUS_SGE_SEND_EXCEEDS:
+ case OCRDMA_MBX_STATUS_SGE_WRITE_EXCEEDS:
+ err_num = -ENOBUFS;
+ break;
+
+ case OCRDMA_MBX_STATUS_FAILED:
+ switch (add_status) {
+ case OCRDMA_MBX_ADDI_STATUS_INSUFFICIENT_RESOURCES:
+ err_num = -EAGAIN;
+ break;
+ default:
+ err_num = -EFAULT;
+ break;
+ }
+ /* do not fall through to the outer default; it would overwrite err_num */
+ break;
+ default:
+ err_num = -EFAULT;
+ }
+ return err_num;
+}
+
+char *port_speed_string(struct ocrdma_dev *dev)
+{
+ char *str = "";
+ u16 speeds_supported;
+
+ speeds_supported = dev->phy.fixed_speeds_supported |
+ dev->phy.auto_speeds_supported;
+ if (speeds_supported & OCRDMA_PHY_SPEED_40GBPS)
+ str = "40Gbps ";
+ else if (speeds_supported & OCRDMA_PHY_SPEED_10GBPS)
+ str = "10Gbps ";
+ else if (speeds_supported & OCRDMA_PHY_SPEED_1GBPS)
+ str = "1Gbps ";
+
+ return str;
+}
+
+static int ocrdma_get_mbx_cqe_errno(u16 cqe_status)
+{
+ int err_num = -EINVAL;
+
+ switch (cqe_status) {
+ case OCRDMA_MBX_CQE_STATUS_INSUFFICIENT_PRIVILEDGES:
+ err_num = -EPERM;
+ break;
+ case OCRDMA_MBX_CQE_STATUS_INVALID_PARAMETER:
+ err_num = -EINVAL;
+ break;
+ case OCRDMA_MBX_CQE_STATUS_INSUFFICIENT_RESOURCES:
+ case OCRDMA_MBX_CQE_STATUS_QUEUE_FLUSHING:
+ err_num = -EINVAL;
+ break;
+ case OCRDMA_MBX_CQE_STATUS_DMA_FAILED:
+ default:
+ err_num = -EINVAL;
+ break;
+ }
+ return err_num;
+}
+
+void ocrdma_ring_cq_db(struct ocrdma_dev *dev, u16 cq_id, bool armed,
+ bool solicited, u16 cqe_popped)
+{
+ u32 val = cq_id & OCRDMA_DB_CQ_RING_ID_MASK;
+
+ val |= ((cq_id & OCRDMA_DB_CQ_RING_ID_EXT_MASK) <<
+ OCRDMA_DB_CQ_RING_ID_EXT_MASK_SHIFT);
+
+ if (armed)
+ val |= (1 << OCRDMA_DB_CQ_REARM_SHIFT);
+ if (solicited)
+ val |= (1 << OCRDMA_DB_CQ_SOLICIT_SHIFT);
+ val |= (cqe_popped << OCRDMA_DB_CQ_NUM_POPPED_SHIFT);
+ iowrite32(val, dev->nic_info.db + OCRDMA_DB_CQ_OFFSET);
+}
+
+static void ocrdma_ring_mq_db(struct ocrdma_dev *dev)
+{
+ u32 val = 0;
+
+ val |= dev->mq.sq.id & OCRDMA_MQ_ID_MASK;
+ val |= 1 << OCRDMA_MQ_NUM_MQE_SHIFT;
+ iowrite32(val, dev->nic_info.db + OCRDMA_DB_MQ_OFFSET);
+}
+
+static void ocrdma_ring_eq_db(struct ocrdma_dev *dev, u16 eq_id,
+ bool arm, bool clear_int, u16 num_eqe)
+{
+ u32 val = 0;
+
+ val |= eq_id & OCRDMA_EQ_ID_MASK;
+ val |= ((eq_id & OCRDMA_EQ_ID_EXT_MASK) << OCRDMA_EQ_ID_EXT_MASK_SHIFT);
+ if (arm)
+ val |= (1 << OCRDMA_REARM_SHIFT);
+ if (clear_int)
+ val |= (1 << OCRDMA_EQ_CLR_SHIFT);
+ val |= (1 << OCRDMA_EQ_TYPE_SHIFT);
+ val |= (num_eqe << OCRDMA_NUM_EQE_SHIFT);
+ iowrite32(val, dev->nic_info.db + OCRDMA_DB_EQ_OFFSET);
+}
+
+static void ocrdma_init_mch(struct ocrdma_mbx_hdr *cmd_hdr,
+ u8 opcode, u8 subsys, u32 cmd_len)
+{
+ cmd_hdr->subsys_op = (opcode | (subsys << OCRDMA_MCH_SUBSYS_SHIFT));
+ cmd_hdr->timeout = 20; /* seconds */
+ cmd_hdr->cmd_len = cmd_len - sizeof(struct ocrdma_mbx_hdr);
+}
+
+static void *ocrdma_init_emb_mqe(u8 opcode, u32 cmd_len)
+{
+ struct ocrdma_mqe *mqe;
+
+ mqe = kzalloc(sizeof(struct ocrdma_mqe), GFP_KERNEL);
+ if (!mqe)
+ return NULL;
+ mqe->hdr.spcl_sge_cnt_emb |=
+ (OCRDMA_MQE_EMBEDDED << OCRDMA_MQE_HDR_EMB_SHIFT) &
+ OCRDMA_MQE_HDR_EMB_MASK;
+ mqe->hdr.pyld_len = cmd_len - sizeof(struct ocrdma_mqe_hdr);
+
+ ocrdma_init_mch(&mqe->u.emb_req.mch, opcode, OCRDMA_SUBSYS_ROCE,
+ mqe->hdr.pyld_len);
+ return mqe;
+}
+
+static void ocrdma_free_q(struct ocrdma_dev *dev, struct ocrdma_queue_info *q)
+{
+ dma_free_coherent(&dev->nic_info.pdev->dev, q->size, q->va, q->dma);
+}
+
+static int ocrdma_alloc_q(struct ocrdma_dev *dev,
+ struct ocrdma_queue_info *q, u16 len, u16 entry_size)
+{
+ memset(q, 0, sizeof(*q));
+ q->len = len;
+ q->entry_size = entry_size;
+ q->size = len * entry_size;
+ q->va = dma_alloc_coherent(&dev->nic_info.pdev->dev, q->size,
+ &q->dma, GFP_KERNEL);
+ if (!q->va)
+ return -ENOMEM;
+ memset(q->va, 0, q->size);
+ return 0;
+}
+
+static void ocrdma_build_q_pages(struct ocrdma_pa *q_pa, int cnt,
+ dma_addr_t host_pa, int hw_page_size)
+{
+ int i;
+
+ for (i = 0; i < cnt; i++) {
+ q_pa[i].lo = (u32) (host_pa & 0xffffffff);
+ q_pa[i].hi = (u32) upper_32_bits(host_pa);
+ host_pa += hw_page_size;
+ }
+}
+
+static int ocrdma_mbx_delete_q(struct ocrdma_dev *dev,
+ struct ocrdma_queue_info *q, int queue_type)
+{
+ u8 opcode = 0;
+ int status;
+ struct ocrdma_delete_q_req *cmd = dev->mbx_cmd;
+
+ switch (queue_type) {
+ case QTYPE_MCCQ:
+ opcode = OCRDMA_CMD_DELETE_MQ;
+ break;
+ case QTYPE_CQ:
+ opcode = OCRDMA_CMD_DELETE_CQ;
+ break;
+ case QTYPE_EQ:
+ opcode = OCRDMA_CMD_DELETE_EQ;
+ break;
+ default:
+ BUG();
+ }
+ memset(cmd, 0, sizeof(*cmd));
+ ocrdma_init_mch(&cmd->req, opcode, OCRDMA_SUBSYS_COMMON, sizeof(*cmd));
+ cmd->id = q->id;
+
+ status = be_roce_mcc_cmd(dev->nic_info.netdev,
+ cmd, sizeof(*cmd), NULL, NULL);
+ if (!status)
+ q->created = false;
+ return status;
+}
+
+static int ocrdma_mbx_create_eq(struct ocrdma_dev *dev, struct ocrdma_eq *eq)
+{
+ int status;
+ struct ocrdma_create_eq_req *cmd = dev->mbx_cmd;
+ struct ocrdma_create_eq_rsp *rsp = dev->mbx_cmd;
+
+ memset(cmd, 0, sizeof(*cmd));
+ ocrdma_init_mch(&cmd->req, OCRDMA_CMD_CREATE_EQ, OCRDMA_SUBSYS_COMMON,
+ sizeof(*cmd));
+
+ cmd->req.rsvd_version = 2;
+ cmd->num_pages = 4;
+ cmd->valid = OCRDMA_CREATE_EQ_VALID;
+ cmd->cnt = 4 << OCRDMA_CREATE_EQ_CNT_SHIFT;
+
+ ocrdma_build_q_pages(&cmd->pa[0], cmd->num_pages, eq->q.dma,
+ PAGE_SIZE_4K);
+ status = be_roce_mcc_cmd(dev->nic_info.netdev, cmd, sizeof(*cmd), NULL,
+ NULL);
+ if (!status) {
+ eq->q.id = rsp->vector_eqid & 0xffff;
+ eq->vector = (rsp->vector_eqid >> 16) & 0xffff;
+ eq->q.created = true;
+ }
+ return status;
+}
+
+static int ocrdma_create_eq(struct ocrdma_dev *dev,
+ struct ocrdma_eq *eq, u16 q_len)
+{
+ int status;
+
+ status = ocrdma_alloc_q(dev, &eq->q, OCRDMA_EQ_LEN,
+ sizeof(struct ocrdma_eqe));
+ if (status)
+ return status;
+
+ status = ocrdma_mbx_create_eq(dev, eq);
+ if (status)
+ goto mbx_err;
+ eq->dev = dev;
+ ocrdma_ring_eq_db(dev, eq->q.id, true, true, 0);
+
+ return 0;
+mbx_err:
+ ocrdma_free_q(dev, &eq->q);
+ return status;
+}
+
+int ocrdma_get_irq(struct ocrdma_dev *dev, struct ocrdma_eq *eq)
+{
+ int irq;
+
+ if (dev->nic_info.intr_mode == BE_INTERRUPT_MODE_INTX)
+ irq = dev->nic_info.pdev->irq;
+ else
+ irq = dev->nic_info.msix.vector_list[eq->vector];
+ return irq;
+}
+
+static void _ocrdma_destroy_eq(struct ocrdma_dev *dev, struct ocrdma_eq *eq)
+{
+ if (eq->q.created) {
+ ocrdma_mbx_delete_q(dev, &eq->q, QTYPE_EQ);
+ ocrdma_free_q(dev, &eq->q);
+ }
+}
+
+static void ocrdma_destroy_eq(struct ocrdma_dev *dev, struct ocrdma_eq *eq)
+{
+ int irq;
+
+ /* disarm the EQ so that no interrupts are generated while it is
+ * being freed and the EQ delete is in progress.
+ */
+ ocrdma_ring_eq_db(dev, eq->q.id, false, false, 0);
+
+ irq = ocrdma_get_irq(dev, eq);
+ free_irq(irq, eq);
+ _ocrdma_destroy_eq(dev, eq);
+}
+
+static void ocrdma_destroy_eqs(struct ocrdma_dev *dev)
+{
+ int i;
+
+ for (i = 0; i < dev->eq_cnt; i++)
+ ocrdma_destroy_eq(dev, &dev->eq_tbl[i]);
+}
+
+static int ocrdma_mbx_mq_cq_create(struct ocrdma_dev *dev,
+ struct ocrdma_queue_info *cq,
+ struct ocrdma_queue_info *eq)
+{
+ struct ocrdma_create_cq_cmd *cmd = dev->mbx_cmd;
+ struct ocrdma_create_cq_cmd_rsp *rsp = dev->mbx_cmd;
+ int status;
+
+ memset(cmd, 0, sizeof(*cmd));
+ ocrdma_init_mch(&cmd->req, OCRDMA_CMD_CREATE_CQ,
+ OCRDMA_SUBSYS_COMMON, sizeof(*cmd));
+
+ cmd->req.rsvd_version = OCRDMA_CREATE_CQ_VER2;
+ cmd->pgsz_pgcnt = (cq->size / OCRDMA_MIN_Q_PAGE_SIZE) <<
+ OCRDMA_CREATE_CQ_PAGE_SIZE_SHIFT;
+ cmd->pgsz_pgcnt |= PAGES_4K_SPANNED(cq->va, cq->size);
+
+ cmd->ev_cnt_flags = OCRDMA_CREATE_CQ_DEF_FLAGS;
+ cmd->eqn = eq->id;
+ cmd->pdid_cqecnt = cq->size / sizeof(struct ocrdma_mcqe);
+
+ ocrdma_build_q_pages(&cmd->pa[0], cq->size / OCRDMA_MIN_Q_PAGE_SIZE,
+ cq->dma, PAGE_SIZE_4K);
+ status = be_roce_mcc_cmd(dev->nic_info.netdev,
+ cmd, sizeof(*cmd), NULL, NULL);
+ if (!status) {
+ cq->id = (u16) (rsp->cq_id & OCRDMA_CREATE_CQ_RSP_CQ_ID_MASK);
+ cq->created = true;
+ }
+ return status;
+}
+
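+/* Encode the ring length as fls(len), i.e. log2(len) + 1 for a power-of-two
+ * length; an encoding of 16 wraps to 0, presumably because the hardware
+ * ring-size field is only four bits wide.
+ */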
+static u32 ocrdma_encoded_q_len(int q_len)
+{
+ u32 len_encoded = fls(q_len); /* log2(len) + 1 */
+
+ if (len_encoded == 16)
+ len_encoded = 0;
+ return len_encoded;
+}
+
+static int ocrdma_mbx_create_mq(struct ocrdma_dev *dev,
+ struct ocrdma_queue_info *mq,
+ struct ocrdma_queue_info *cq)
+{
+ int num_pages, status;
+ struct ocrdma_create_mq_req *cmd = dev->mbx_cmd;
+ struct ocrdma_create_mq_rsp *rsp = dev->mbx_cmd;
+ struct ocrdma_pa *pa;
+
+ memset(cmd, 0, sizeof(*cmd));
+ num_pages = PAGES_4K_SPANNED(mq->va, mq->size);
+
+ ocrdma_init_mch(&cmd->req, OCRDMA_CMD_CREATE_MQ_EXT,
+ OCRDMA_SUBSYS_COMMON, sizeof(*cmd));
+ cmd->req.rsvd_version = 1;
+ cmd->cqid_pages = num_pages;
+ cmd->cqid_pages |= (cq->id << OCRDMA_CREATE_MQ_CQ_ID_SHIFT);
+ cmd->async_cqid_valid = OCRDMA_CREATE_MQ_ASYNC_CQ_VALID;
+
+ cmd->async_event_bitmap = BIT(OCRDMA_ASYNC_GRP5_EVE_CODE);
+ cmd->async_event_bitmap |= BIT(OCRDMA_ASYNC_RDMA_EVE_CODE);
+
+ cmd->async_cqid_ringsize = cq->id;
+ cmd->async_cqid_ringsize |= (ocrdma_encoded_q_len(mq->len) <<
+ OCRDMA_CREATE_MQ_RING_SIZE_SHIFT);
+ cmd->valid = OCRDMA_CREATE_MQ_VALID;
+ pa = &cmd->pa[0];
+
+ ocrdma_build_q_pages(pa, num_pages, mq->dma, PAGE_SIZE_4K);
+ status = be_roce_mcc_cmd(dev->nic_info.netdev,
+ cmd, sizeof(*cmd), NULL, NULL);
+ if (!status) {
+ mq->id = rsp->id;
+ mq->created = true;
+ }
+ return status;
+}
+
+static int ocrdma_create_mq(struct ocrdma_dev *dev)
+{
+ int status;
+
+ /* Alloc completion queue for Mailbox queue */
+ status = ocrdma_alloc_q(dev, &dev->mq.cq, OCRDMA_MQ_CQ_LEN,
+ sizeof(struct ocrdma_mcqe));
+ if (status)
+ goto alloc_err;
+
+ dev->eq_tbl[0].cq_cnt++;
+ status = ocrdma_mbx_mq_cq_create(dev, &dev->mq.cq, &dev->eq_tbl[0].q);
+ if (status)
+ goto mbx_cq_free;
+
+ memset(&dev->mqe_ctx, 0, sizeof(dev->mqe_ctx));
+ init_waitqueue_head(&dev->mqe_ctx.cmd_wait);
+ mutex_init(&dev->mqe_ctx.lock);
+
+ /* Alloc Mailbox queue */
+ status = ocrdma_alloc_q(dev, &dev->mq.sq, OCRDMA_MQ_LEN,
+ sizeof(struct ocrdma_mqe));
+ if (status)
+ goto mbx_cq_destroy;
+ status = ocrdma_mbx_create_mq(dev, &dev->mq.sq, &dev->mq.cq);
+ if (status)
+ goto mbx_q_free;
+ ocrdma_ring_cq_db(dev, dev->mq.cq.id, true, false, 0);
+ return 0;
+
+mbx_q_free:
+ ocrdma_free_q(dev, &dev->mq.sq);
+mbx_cq_destroy:
+ ocrdma_mbx_delete_q(dev, &dev->mq.cq, QTYPE_CQ);
+mbx_cq_free:
+ ocrdma_free_q(dev, &dev->mq.cq);
+alloc_err:
+ return status;
+}
+
+static void ocrdma_destroy_mq(struct ocrdma_dev *dev)
+{
+ struct ocrdma_queue_info *mbxq, *cq;
+
+ /* mqe_ctx lock synchronizes with any other pending cmds. */
+ mutex_lock(&dev->mqe_ctx.lock);
+ mbxq = &dev->mq.sq;
+ if (mbxq->created) {
+ ocrdma_mbx_delete_q(dev, mbxq, QTYPE_MCCQ);
+ ocrdma_free_q(dev, mbxq);
+ }
+ mutex_unlock(&dev->mqe_ctx.lock);
+
+ cq = &dev->mq.cq;
+ if (cq->created) {
+ ocrdma_mbx_delete_q(dev, cq, QTYPE_CQ);
+ ocrdma_free_q(dev, cq);
+ }
+}
+
+static void ocrdma_process_qpcat_error(struct ocrdma_dev *dev,
+ struct ocrdma_qp *qp)
+{
+ enum ib_qp_state new_ib_qps = IB_QPS_ERR;
+ enum ib_qp_state old_ib_qps;
+
+ if (qp == NULL)
+ BUG();
+ ocrdma_qp_state_change(qp, new_ib_qps, &old_ib_qps);
+}
+
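+/* Translate an async RDMA MCQE into an ib_event and deliver it to the
+ * registered QP, CQ or SRQ event handler, or dispatch it device-wide for
+ * fatal events.
+ */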
+static void ocrdma_dispatch_ibevent(struct ocrdma_dev *dev,
+ struct ocrdma_ae_mcqe *cqe)
+{
+ struct ocrdma_qp *qp = NULL;
+ struct ocrdma_cq *cq = NULL;
+ struct ib_event ib_evt;
+ int cq_event = 0;
+ int qp_event = 1;
+ int srq_event = 0;
+ int dev_event = 0;
+ int type = (cqe->valid_ae_event & OCRDMA_AE_MCQE_EVENT_TYPE_MASK) >>
+ OCRDMA_AE_MCQE_EVENT_TYPE_SHIFT;
+
+ if (cqe->qpvalid_qpid & OCRDMA_AE_MCQE_QPVALID)
+ qp = dev->qp_tbl[cqe->qpvalid_qpid & OCRDMA_AE_MCQE_QPID_MASK];
+ if (cqe->cqvalid_cqid & OCRDMA_AE_MCQE_CQVALID)
+ cq = dev->cq_tbl[cqe->cqvalid_cqid & OCRDMA_AE_MCQE_CQID_MASK];
+
+ memset(&ib_evt, 0, sizeof(ib_evt));
+
+ ib_evt.device = &dev->ibdev;
+
+ switch (type) {
+ case OCRDMA_CQ_ERROR:
+ ib_evt.element.cq = &cq->ibcq;
+ ib_evt.event = IB_EVENT_CQ_ERR;
+ cq_event = 1;
+ qp_event = 0;
+ break;
+ case OCRDMA_CQ_OVERRUN_ERROR:
+ ib_evt.element.cq = &cq->ibcq;
+ ib_evt.event = IB_EVENT_CQ_ERR;
+ cq_event = 1;
+ qp_event = 0;
+ break;
+ case OCRDMA_CQ_QPCAT_ERROR:
+ ib_evt.element.qp = &qp->ibqp;
+ ib_evt.event = IB_EVENT_QP_FATAL;
+ ocrdma_process_qpcat_error(dev, qp);
+ break;
+ case OCRDMA_QP_ACCESS_ERROR:
+ ib_evt.element.qp = &qp->ibqp;
+ ib_evt.event = IB_EVENT_QP_ACCESS_ERR;
+ break;
+ case OCRDMA_QP_COMM_EST_EVENT:
+ ib_evt.element.qp = &qp->ibqp;
+ ib_evt.event = IB_EVENT_COMM_EST;
+ break;
+ case OCRDMA_SQ_DRAINED_EVENT:
+ ib_evt.element.qp = &qp->ibqp;
+ ib_evt.event = IB_EVENT_SQ_DRAINED;
+ break;
+ case OCRDMA_DEVICE_FATAL_EVENT:
+ ib_evt.element.port_num = 1;
+ ib_evt.event = IB_EVENT_DEVICE_FATAL;
+ qp_event = 0;
+ dev_event = 1;
+ break;
+ case OCRDMA_SRQCAT_ERROR:
+ ib_evt.element.srq = &qp->srq->ibsrq;
+ ib_evt.event = IB_EVENT_SRQ_ERR;
+ srq_event = 1;
+ qp_event = 0;
+ break;
+ case OCRDMA_SRQ_LIMIT_EVENT:
+ ib_evt.element.srq = &qp->srq->ibsrq;
+ ib_evt.event = IB_EVENT_SRQ_LIMIT_REACHED;
+ srq_event = 1;
+ qp_event = 0;
+ break;
+ case OCRDMA_QP_LAST_WQE_EVENT:
+ ib_evt.element.qp = &qp->ibqp;
+ ib_evt.event = IB_EVENT_QP_LAST_WQE_REACHED;
+ break;
+ default:
+ cq_event = 0;
+ qp_event = 0;
+ srq_event = 0;
+ dev_event = 0;
+ pr_err("%s() unknown type=0x%x\n", __func__, type);
+ break;
+ }
+
+ if (type < OCRDMA_MAX_ASYNC_ERRORS)
+ atomic_inc(&dev->async_err_stats[type]);
+
+ if (qp_event) {
+ if (qp->ibqp.event_handler)
+ qp->ibqp.event_handler(&ib_evt, qp->ibqp.qp_context);
+ } else if (cq_event) {
+ if (cq->ibcq.event_handler)
+ cq->ibcq.event_handler(&ib_evt, cq->ibcq.cq_context);
+ } else if (srq_event) {
+ if (qp->srq->ibsrq.event_handler)
+ qp->srq->ibsrq.event_handler(&ib_evt,
+ qp->srq->ibsrq.
+ srq_context);
+ } else if (dev_event) {
+ pr_err("%s: Fatal event received\n", dev->ibdev.name);
+ ib_dispatch_event(&ib_evt);
+ }
+
+}
+
+static void ocrdma_process_grp5_aync(struct ocrdma_dev *dev,
+ struct ocrdma_ae_mcqe *cqe)
+{
+ struct ocrdma_ae_pvid_mcqe *evt;
+ int type = (cqe->valid_ae_event & OCRDMA_AE_MCQE_EVENT_TYPE_MASK) >>
+ OCRDMA_AE_MCQE_EVENT_TYPE_SHIFT;
+
+ switch (type) {
+ case OCRDMA_ASYNC_EVENT_PVID_STATE:
+ evt = (struct ocrdma_ae_pvid_mcqe *)cqe;
+ if ((evt->tag_enabled & OCRDMA_AE_PVID_MCQE_ENABLED_MASK) >>
+ OCRDMA_AE_PVID_MCQE_ENABLED_SHIFT)
+ dev->pvid = ((evt->tag_enabled &
+ OCRDMA_AE_PVID_MCQE_TAG_MASK) >>
+ OCRDMA_AE_PVID_MCQE_TAG_SHIFT);
+ break;
+
+ case OCRDMA_ASYNC_EVENT_COS_VALUE:
+ atomic_set(&dev->update_sl, 1);
+ break;
+ default:
+ /* Events we are not interested in. */
+ break;
+ }
+}
+
+static void ocrdma_process_acqe(struct ocrdma_dev *dev, void *ae_cqe)
+{
+ /* async CQE processing */
+ struct ocrdma_ae_mcqe *cqe = ae_cqe;
+ u32 evt_code = (cqe->valid_ae_event & OCRDMA_AE_MCQE_EVENT_CODE_MASK) >>
+ OCRDMA_AE_MCQE_EVENT_CODE_SHIFT;
+
+ if (evt_code == OCRDMA_ASYNC_RDMA_EVE_CODE)
+ ocrdma_dispatch_ibevent(dev, cqe);
+ else if (evt_code == OCRDMA_ASYNC_GRP5_EVE_CODE)
+ ocrdma_process_grp5_aync(dev, cqe);
+ else
+ pr_err("%s(%d) invalid evt code=0x%x\n", __func__,
+ dev->id, evt_code);
+}
+
+static void ocrdma_process_mcqe(struct ocrdma_dev *dev, struct ocrdma_mcqe *cqe)
+{
+ if (dev->mqe_ctx.tag == cqe->tag_lo && dev->mqe_ctx.cmd_done == false) {
+ dev->mqe_ctx.cqe_status = (cqe->status &
+ OCRDMA_MCQE_STATUS_MASK) >> OCRDMA_MCQE_STATUS_SHIFT;
+ dev->mqe_ctx.ext_status =
+ (cqe->status & OCRDMA_MCQE_ESTATUS_MASK)
+ >> OCRDMA_MCQE_ESTATUS_SHIFT;
+ dev->mqe_ctx.cmd_done = true;
+ wake_up(&dev->mqe_ctx.cmd_wait);
+ } else
+ pr_err("%s() cqe for invalid tag0x%x.expected=0x%x\n",
+ __func__, cqe->tag_lo, dev->mqe_ctx.tag);
+}
+
+static int ocrdma_mq_cq_handler(struct ocrdma_dev *dev, u16 cq_id)
+{
+ u16 cqe_popped = 0;
+ struct ocrdma_mcqe *cqe;
+
+ while (1) {
+ cqe = ocrdma_get_mcqe(dev);
+ if (cqe == NULL)
+ break;
+ ocrdma_le32_to_cpu(cqe, sizeof(*cqe));
+ cqe_popped += 1;
+ if (cqe->valid_ae_cmpl_cons & OCRDMA_MCQE_AE_MASK)
+ ocrdma_process_acqe(dev, cqe);
+ else if (cqe->valid_ae_cmpl_cons & OCRDMA_MCQE_CMPL_MASK)
+ ocrdma_process_mcqe(dev, cqe);
+ memset(cqe, 0, sizeof(struct ocrdma_mcqe));
+ ocrdma_mcq_inc_tail(dev);
+ }
+ ocrdma_ring_cq_db(dev, dev->mq.cq.id, true, false, cqe_popped);
+ return 0;
+}
+
+static struct ocrdma_cq *_ocrdma_qp_buddy_cq_handler(struct ocrdma_dev *dev,
+ struct ocrdma_cq *cq, bool sq)
+{
+ struct ocrdma_qp *qp;
+ struct list_head *cur;
+ struct ocrdma_cq *bcq = NULL;
+ struct list_head *head = sq?(&cq->sq_head):(&cq->rq_head);
+
+ list_for_each(cur, head) {
+ if (sq)
+ qp = list_entry(cur, struct ocrdma_qp, sq_entry);
+ else
+ qp = list_entry(cur, struct ocrdma_qp, rq_entry);
+
+ if (qp->srq)
+ continue;
+ /* if the wq and rq share the same cq, then the comp_handler
+ * has already been invoked.
+ */
+ if (qp->sq_cq == qp->rq_cq)
+ continue;
+ /* if completion came on sq, rq's cq is buddy cq.
+ * if completion came on rq, sq's cq is buddy cq.
+ */
+ if (qp->sq_cq == cq)
+ bcq = qp->rq_cq;
+ else
+ bcq = qp->sq_cq;
+ return bcq;
+ }
+ return NULL;
+}
+
+static void ocrdma_qp_buddy_cq_handler(struct ocrdma_dev *dev,
+ struct ocrdma_cq *cq)
+{
+ unsigned long flags;
+ struct ocrdma_cq *bcq = NULL;
+
+ /* Go through the list of QPs in error state which are using this CQ
+ * and invoke their callback handlers to trigger CQE processing for
+ * error/flushed CQEs. It is rare to find more than a few entries in
+ * this list as most consumers stop after getting an error CQE.
+ * The list is traversed only until a matching buddy cq is found for a QP.
+ */
+ spin_lock_irqsave(&dev->flush_q_lock, flags);
+ /* Check if buddy CQ is present.
+ * true - Check for SQ CQ
+ * false - Check for RQ CQ
+ */
+ bcq = _ocrdma_qp_buddy_cq_handler(dev, cq, true);
+ if (bcq == NULL)
+ bcq = _ocrdma_qp_buddy_cq_handler(dev, cq, false);
+ spin_unlock_irqrestore(&dev->flush_q_lock, flags);
+
+ /* if there is valid buddy cq, look for its completion handler */
+ if (bcq && bcq->ibcq.comp_handler) {
+ spin_lock_irqsave(&bcq->comp_handler_lock, flags);
+ (*bcq->ibcq.comp_handler) (&bcq->ibcq, bcq->ibcq.cq_context);
+ spin_unlock_irqrestore(&bcq->comp_handler_lock, flags);
+ }
+}
+
+static void ocrdma_qp_cq_handler(struct ocrdma_dev *dev, u16 cq_idx)
+{
+ unsigned long flags;
+ struct ocrdma_cq *cq;
+
+ if (cq_idx >= OCRDMA_MAX_CQ)
+ BUG();
+
+ cq = dev->cq_tbl[cq_idx];
+ if (cq == NULL)
+ return;
+
+ if (cq->ibcq.comp_handler) {
+ spin_lock_irqsave(&cq->comp_handler_lock, flags);
+ (*cq->ibcq.comp_handler) (&cq->ibcq, cq->ibcq.cq_context);
+ spin_unlock_irqrestore(&cq->comp_handler_lock, flags);
+ }
+ ocrdma_qp_buddy_cq_handler(dev, cq);
+}
+
+static void ocrdma_cq_handler(struct ocrdma_dev *dev, u16 cq_id)
+{
+ /* process the MQ-CQE. */
+ if (cq_id == dev->mq.cq.id)
+ ocrdma_mq_cq_handler(dev, cq_id);
+ else
+ ocrdma_qp_cq_handler(dev, cq_id);
+}
+
+static irqreturn_t ocrdma_irq_handler(int irq, void *handle)
+{
+ struct ocrdma_eq *eq = handle;
+ struct ocrdma_dev *dev = eq->dev;
+ struct ocrdma_eqe eqe;
+ struct ocrdma_eqe *ptr;
+ u16 cq_id;
+ u8 mcode;
+ int budget = eq->cq_cnt;
+
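+ /* Poll EQEs until an invalid entry is seen or the budget (one per
+ * CQ bound to this EQ) runs out; each consumed EQE is acknowledged
+ * on the EQ doorbell and, when it describes a CQ event, dispatched
+ * to the owning CQ handler.
+ */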
+ do {
+ ptr = ocrdma_get_eqe(eq);
+ eqe = *ptr;
+ ocrdma_le32_to_cpu(&eqe, sizeof(eqe));
+ mcode = (eqe.id_valid & OCRDMA_EQE_MAJOR_CODE_MASK)
+ >> OCRDMA_EQE_MAJOR_CODE_SHIFT;
+ if (mcode == OCRDMA_MAJOR_CODE_SENTINAL)
+ pr_err("EQ full on eqid = 0x%x, eqe = 0x%x\n",
+ eq->q.id, eqe.id_valid);
+ if ((eqe.id_valid & OCRDMA_EQE_VALID_MASK) == 0)
+ break;
+
+ ptr->id_valid = 0;
+ /* ring the EQ doorbell as soon as the EQE is consumed. */
+ ocrdma_ring_eq_db(dev, eq->q.id, false, true, 1);
+ /* check whether it is a CQE or not. */
+ if ((eqe.id_valid & OCRDMA_EQE_FOR_CQE_MASK) == 0) {
+ cq_id = eqe.id_valid >> OCRDMA_EQE_RESOURCE_ID_SHIFT;
+ ocrdma_cq_handler(dev, cq_id);
+ }
+ ocrdma_eq_inc_tail(eq);
+
+ /* There can be a stale EQE after the last bound CQ is
+ * destroyed. EQE valid and budget == 0 implies this.
+ */
+ if (budget)
+ budget--;
+
+ } while (budget);
+
+ eq->aic_obj.eq_intr_cnt++;
+ ocrdma_ring_eq_db(dev, eq->q.id, true, true, 0);
+ return IRQ_HANDLED;
+}
+
+static void ocrdma_post_mqe(struct ocrdma_dev *dev, struct ocrdma_mqe *cmd)
+{
+ struct ocrdma_mqe *mqe;
+
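+ /* Tag the command with the current MQ SQ head so that the MCQE
+ * completion in ocrdma_process_mcqe() can be matched back to it.
+ */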
+ dev->mqe_ctx.tag = dev->mq.sq.head;
+ dev->mqe_ctx.cmd_done = false;
+ mqe = ocrdma_get_mqe(dev);
+ cmd->hdr.tag_lo = dev->mq.sq.head;
+ ocrdma_copy_cpu_to_le32(mqe, cmd, sizeof(*mqe));
+ /* make sure descriptor is written before ringing doorbell */
+ wmb();
+ ocrdma_mq_inc_head(dev);
+ ocrdma_ring_mq_db(dev);
+}
+
+static int ocrdma_wait_mqe_cmpl(struct ocrdma_dev *dev)
+{
+ long status;
+ /* 30 sec timeout */
+ status = wait_event_timeout(dev->mqe_ctx.cmd_wait,
+ (dev->mqe_ctx.cmd_done != false),
+ msecs_to_jiffies(30000));
+ if (status)
+ return 0;
+
+ dev->mqe_ctx.fw_error_state = true;
+ pr_err("%s(%d) mailbox timeout: fw not responding\n",
+ __func__, dev->id);
+ return -1;
+}
+
+/* issue a mailbox command on the MQ */
+static int ocrdma_mbx_cmd(struct ocrdma_dev *dev, struct ocrdma_mqe *mqe)
+{
+ int status = 0;
+ u16 cqe_status, ext_status;
+ struct ocrdma_mqe *rsp_mqe;
+ struct ocrdma_mbx_rsp *rsp = NULL;
+
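+ /* Mailbox commands are strictly serialized: post the MQE, sleep
+ * until ocrdma_process_mcqe() signals completion, then pick up the
+ * CQE/extended status and copy the response MQE back.
+ */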
+ mutex_lock(&dev->mqe_ctx.lock);
+ if (dev->mqe_ctx.fw_error_state)
+ goto mbx_err;
+ ocrdma_post_mqe(dev, mqe);
+ status = ocrdma_wait_mqe_cmpl(dev);
+ if (status)
+ goto mbx_err;
+ cqe_status = dev->mqe_ctx.cqe_status;
+ ext_status = dev->mqe_ctx.ext_status;
+ rsp_mqe = ocrdma_get_mqe_rsp(dev);
+ ocrdma_copy_le32_to_cpu(mqe, rsp_mqe, (sizeof(*mqe)));
+ if ((mqe->hdr.spcl_sge_cnt_emb & OCRDMA_MQE_HDR_EMB_MASK) >>
+ OCRDMA_MQE_HDR_EMB_SHIFT)
+ rsp = &mqe->u.rsp;
+
+ if (cqe_status || ext_status) {
+ pr_err("%s() cqe_status=0x%x, ext_status=0x%x,",
+ __func__, cqe_status, ext_status);
+ if (rsp) {
+ /* This is for embedded cmds. */
+ pr_err("opcode=0x%x, subsystem=0x%x\n",
+ (rsp->subsys_op & OCRDMA_MBX_RSP_OPCODE_MASK) >>
+ OCRDMA_MBX_RSP_OPCODE_SHIFT,
+ (rsp->subsys_op & OCRDMA_MBX_RSP_SUBSYS_MASK) >>
+ OCRDMA_MBX_RSP_SUBSYS_SHIFT);
+ }
+ status = ocrdma_get_mbx_cqe_errno(cqe_status);
+ goto mbx_err;
+ }
+ /* For non-embedded commands, rsp errors are handled in ocrdma_nonemb_mbx_cmd */
+ if (rsp && (mqe->u.rsp.status & OCRDMA_MBX_RSP_STATUS_MASK))
+ status = ocrdma_get_mbx_errno(mqe->u.rsp.status);
+mbx_err:
+ mutex_unlock(&dev->mqe_ctx.lock);
+ return status;
+}
+
+static int ocrdma_nonemb_mbx_cmd(struct ocrdma_dev *dev, struct ocrdma_mqe *mqe,
+ void *payload_va)
+{
+ int status = 0;
+ struct ocrdma_mbx_rsp *rsp = payload_va;
+
+ if ((mqe->hdr.spcl_sge_cnt_emb & OCRDMA_MQE_HDR_EMB_MASK) >>
+ OCRDMA_MQE_HDR_EMB_SHIFT)
+ BUG();
+
+ status = ocrdma_mbx_cmd(dev, mqe);
+ if (!status)
+ /* For non-embedded commands, only CQE failures are handled in
+ * ocrdma_mbx_cmd; we still need to check for RSP errors.
+ */
+ if (rsp->status & OCRDMA_MBX_RSP_STATUS_MASK)
+ status = ocrdma_get_mbx_errno(rsp->status);
+
+ if (status)
+ pr_err("opcode=0x%x, subsystem=0x%x\n",
+ (rsp->subsys_op & OCRDMA_MBX_RSP_OPCODE_MASK) >>
+ OCRDMA_MBX_RSP_OPCODE_SHIFT,
+ (rsp->subsys_op & OCRDMA_MBX_RSP_SUBSYS_MASK) >>
+ OCRDMA_MBX_RSP_SUBSYS_SHIFT);
+ return status;
+}
+
+static void ocrdma_get_attr(struct ocrdma_dev *dev,
+ struct ocrdma_dev_attr *attr,
+ struct ocrdma_mbx_query_config *rsp)
+{
+ attr->max_pd =
+ (rsp->max_pd_ca_ack_delay & OCRDMA_MBX_QUERY_CFG_MAX_PD_MASK) >>
+ OCRDMA_MBX_QUERY_CFG_MAX_PD_SHIFT;
+ attr->max_dpp_pds =
+ (rsp->max_dpp_pds_credits & OCRDMA_MBX_QUERY_CFG_MAX_DPP_PDS_MASK) >>
+ OCRDMA_MBX_QUERY_CFG_MAX_DPP_PDS_OFFSET;
+ attr->max_qp =
+ (rsp->qp_srq_cq_ird_ord & OCRDMA_MBX_QUERY_CFG_MAX_QP_MASK) >>
+ OCRDMA_MBX_QUERY_CFG_MAX_QP_SHIFT;
+ attr->max_srq =
+ (rsp->max_srq_rpir_qps & OCRDMA_MBX_QUERY_CFG_MAX_SRQ_MASK) >>
+ OCRDMA_MBX_QUERY_CFG_MAX_SRQ_OFFSET;
+ attr->max_send_sge = ((rsp->max_write_send_sge &
+ OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_MASK) >>
+ OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_SHIFT);
+ attr->max_recv_sge = (rsp->max_write_send_sge &
+ OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_MASK) >>
+ OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_SHIFT;
+ attr->max_srq_sge = (rsp->max_srq_rqe_sge &
+ OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_MASK) >>
+ OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_OFFSET;
+ attr->max_rdma_sge = (rsp->max_write_send_sge &
+ OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_MASK) >>
+ OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_SHIFT;
+ attr->max_ord_per_qp = (rsp->max_ird_ord_per_qp &
+ OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_MASK) >>
+ OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_SHIFT;
+ attr->max_ird_per_qp = (rsp->max_ird_ord_per_qp &
+ OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_MASK) >>
+ OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_SHIFT;
+ attr->cq_overflow_detect = (rsp->qp_srq_cq_ird_ord &
+ OCRDMA_MBX_QUERY_CFG_CQ_OVERFLOW_MASK) >>
+ OCRDMA_MBX_QUERY_CFG_CQ_OVERFLOW_SHIFT;
+ attr->srq_supported = (rsp->qp_srq_cq_ird_ord &
+ OCRDMA_MBX_QUERY_CFG_SRQ_SUPPORTED_MASK) >>
+ OCRDMA_MBX_QUERY_CFG_SRQ_SUPPORTED_SHIFT;
+ attr->local_ca_ack_delay = (rsp->max_pd_ca_ack_delay &
+ OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_MASK) >>
+ OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_SHIFT;
+ attr->max_mw = rsp->max_mw;
+ attr->max_mr = rsp->max_mr;
+ attr->max_mr_size = ((u64)rsp->max_mr_size_hi << 32) |
+ rsp->max_mr_size_lo;
+ attr->max_fmr = 0;
+ attr->max_pages_per_frmr = rsp->max_pages_per_frmr;
+ attr->max_num_mr_pbl = rsp->max_num_mr_pbl;
+ attr->max_cqe = rsp->max_cq_cqes_per_cq &
+ OCRDMA_MBX_QUERY_CFG_MAX_CQES_PER_CQ_MASK;
+ attr->max_cq = (rsp->max_cq_cqes_per_cq &
+ OCRDMA_MBX_QUERY_CFG_MAX_CQ_MASK) >>
+ OCRDMA_MBX_QUERY_CFG_MAX_CQ_OFFSET;
+ attr->wqe_size = ((rsp->wqe_rqe_stride_max_dpp_cqs &
+ OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_MASK) >>
+ OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_OFFSET) *
+ OCRDMA_WQE_STRIDE;
+ attr->rqe_size = ((rsp->wqe_rqe_stride_max_dpp_cqs &
+ OCRDMA_MBX_QUERY_CFG_MAX_RQE_SIZE_MASK) >>
+ OCRDMA_MBX_QUERY_CFG_MAX_RQE_SIZE_OFFSET) *
+ OCRDMA_WQE_STRIDE;
+ attr->max_inline_data =
+ attr->wqe_size - (sizeof(struct ocrdma_hdr_wqe) +
+ sizeof(struct ocrdma_sge));
+ if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) {
+ attr->ird = 1;
+ attr->ird_page_size = OCRDMA_MIN_Q_PAGE_SIZE;
+ attr->num_ird_pages = MAX_OCRDMA_IRD_PAGES;
+ }
+ dev->attr.max_wqe = rsp->max_wqes_rqes_per_q >>
+ OCRDMA_MBX_QUERY_CFG_MAX_WQES_PER_WQ_OFFSET;
+ dev->attr.max_rqe = rsp->max_wqes_rqes_per_q &
+ OCRDMA_MBX_QUERY_CFG_MAX_RQES_PER_RQ_MASK;
+}
+
+static int ocrdma_check_fw_config(struct ocrdma_dev *dev,
+ struct ocrdma_fw_conf_rsp *conf)
+{
+ u32 fn_mode;
+
+ fn_mode = conf->fn_mode & OCRDMA_FN_MODE_RDMA;
+ if (fn_mode != OCRDMA_FN_MODE_RDMA)
+ return -EINVAL;
+ dev->base_eqid = conf->base_eqid;
+ dev->max_eq = conf->max_eq;
+ return 0;
+}
+
+/* can be issued only during init time. */
+static int ocrdma_mbx_query_fw_ver(struct ocrdma_dev *dev)
+{
+ int status = -ENOMEM;
+ struct ocrdma_mqe *cmd;
+ struct ocrdma_fw_ver_rsp *rsp;
+
+ cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_GET_FW_VER, sizeof(*cmd));
+ if (!cmd)
+ return -ENOMEM;
+ ocrdma_init_mch((struct ocrdma_mbx_hdr *)&cmd->u.cmd[0],
+ OCRDMA_CMD_GET_FW_VER,
+ OCRDMA_SUBSYS_COMMON, sizeof(*cmd));
+
+ status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+ if (status)
+ goto mbx_err;
+ rsp = (struct ocrdma_fw_ver_rsp *)cmd;
+ memset(&dev->attr.fw_ver[0], 0, sizeof(dev->attr.fw_ver));
+ memcpy(&dev->attr.fw_ver[0], &rsp->running_ver[0],
+ sizeof(rsp->running_ver));
+ ocrdma_le32_to_cpu(dev->attr.fw_ver, sizeof(rsp->running_ver));
+mbx_err:
+ kfree(cmd);
+ return status;
+}
+
+/* can be issued only during init time. */
+static int ocrdma_mbx_query_fw_config(struct ocrdma_dev *dev)
+{
+ int status = -ENOMEM;
+ struct ocrdma_mqe *cmd;
+ struct ocrdma_fw_conf_rsp *rsp;
+
+ cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_GET_FW_CONFIG, sizeof(*cmd));
+ if (!cmd)
+ return -ENOMEM;
+ ocrdma_init_mch((struct ocrdma_mbx_hdr *)&cmd->u.cmd[0],
+ OCRDMA_CMD_GET_FW_CONFIG,
+ OCRDMA_SUBSYS_COMMON, sizeof(*cmd));
+ status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+ if (status)
+ goto mbx_err;
+ rsp = (struct ocrdma_fw_conf_rsp *)cmd;
+ status = ocrdma_check_fw_config(dev, rsp);
+mbx_err:
+ kfree(cmd);
+ return status;
+}
+
+int ocrdma_mbx_rdma_stats(struct ocrdma_dev *dev, bool reset)
+{
+ struct ocrdma_rdma_stats_req *req = dev->stats_mem.va;
+ struct ocrdma_mqe *mqe = &dev->stats_mem.mqe;
+ struct ocrdma_rdma_stats_resp *old_stats;
+ int status;
+
+ old_stats = kmalloc(sizeof(*old_stats), GFP_KERNEL);
+ if (old_stats == NULL)
+ return -ENOMEM;
+
+ memset(mqe, 0, sizeof(*mqe));
+ mqe->hdr.pyld_len = dev->stats_mem.size;
+ mqe->hdr.spcl_sge_cnt_emb |=
+ (1 << OCRDMA_MQE_HDR_SGE_CNT_SHIFT) &
+ OCRDMA_MQE_HDR_SGE_CNT_MASK;
+ mqe->u.nonemb_req.sge[0].pa_lo = (u32) (dev->stats_mem.pa & 0xffffffff);
+ mqe->u.nonemb_req.sge[0].pa_hi = (u32) upper_32_bits(dev->stats_mem.pa);
+ mqe->u.nonemb_req.sge[0].len = dev->stats_mem.size;
+
+ /* Cache the old stats */
+ memcpy(old_stats, req, sizeof(struct ocrdma_rdma_stats_resp));
+ memset(req, 0, dev->stats_mem.size);
+
+ ocrdma_init_mch((struct ocrdma_mbx_hdr *)req,
+ OCRDMA_CMD_GET_RDMA_STATS,
+ OCRDMA_SUBSYS_ROCE,
+ dev->stats_mem.size);
+ if (reset)
+ req->reset_stats = reset;
+
+ status = ocrdma_nonemb_mbx_cmd(dev, mqe, dev->stats_mem.va);
+ if (status)
+ /* Copy from cache, if mbox fails */
+ memcpy(req, old_stats, sizeof(struct ocrdma_rdma_stats_resp));
+ else
+ ocrdma_le32_to_cpu(req, dev->stats_mem.size);
+
+ kfree(old_stats);
+ return status;
+}
+
+static int ocrdma_mbx_get_ctrl_attribs(struct ocrdma_dev *dev)
+{
+ int status = -ENOMEM;
+ struct ocrdma_dma_mem dma;
+ struct ocrdma_mqe *mqe;
+ struct ocrdma_get_ctrl_attribs_rsp *ctrl_attr_rsp;
+ struct mgmt_hba_attribs *hba_attribs;
+
+ mqe = kzalloc(sizeof(struct ocrdma_mqe), GFP_KERNEL);
+ if (!mqe)
+ return status;
+
+ dma.size = sizeof(struct ocrdma_get_ctrl_attribs_rsp);
+ dma.va = dma_alloc_coherent(&dev->nic_info.pdev->dev,
+ dma.size, &dma.pa, GFP_KERNEL);
+ if (!dma.va)
+ goto free_mqe;
+
+ mqe->hdr.pyld_len = dma.size;
+ mqe->hdr.spcl_sge_cnt_emb |=
+ (1 << OCRDMA_MQE_HDR_SGE_CNT_SHIFT) &
+ OCRDMA_MQE_HDR_SGE_CNT_MASK;
+ mqe->u.nonemb_req.sge[0].pa_lo = (u32) (dma.pa & 0xffffffff);
+ mqe->u.nonemb_req.sge[0].pa_hi = (u32) upper_32_bits(dma.pa);
+ mqe->u.nonemb_req.sge[0].len = dma.size;
+
+ memset(dma.va, 0, dma.size);
+ ocrdma_init_mch((struct ocrdma_mbx_hdr *)dma.va,
+ OCRDMA_CMD_GET_CTRL_ATTRIBUTES,
+ OCRDMA_SUBSYS_COMMON,
+ dma.size);
+
+ status = ocrdma_nonemb_mbx_cmd(dev, mqe, dma.va);
+ if (!status) {
+ ctrl_attr_rsp = (struct ocrdma_get_ctrl_attribs_rsp *)dma.va;
+ hba_attribs = &ctrl_attr_rsp->ctrl_attribs.hba_attribs;
+
+ dev->hba_port_num = (hba_attribs->ptpnum_maxdoms_hbast_cv &
+ OCRDMA_HBA_ATTRB_PTNUM_MASK)
+ >> OCRDMA_HBA_ATTRB_PTNUM_SHIFT;
+ strncpy(dev->model_number,
+ hba_attribs->controller_model_number, 31);
+ }
+ dma_free_coherent(&dev->nic_info.pdev->dev, dma.size, dma.va, dma.pa);
+free_mqe:
+ kfree(mqe);
+ return status;
+}
+
+static int ocrdma_mbx_query_dev(struct ocrdma_dev *dev)
+{
+ int status = -ENOMEM;
+ struct ocrdma_mbx_query_config *rsp;
+ struct ocrdma_mqe *cmd;
+
+ cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_QUERY_CONFIG, sizeof(*cmd));
+ if (!cmd)
+ return status;
+ status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+ if (status)
+ goto mbx_err;
+ rsp = (struct ocrdma_mbx_query_config *)cmd;
+ ocrdma_get_attr(dev, &dev->attr, rsp);
+mbx_err:
+ kfree(cmd);
+ return status;
+}
+
+int ocrdma_mbx_get_link_speed(struct ocrdma_dev *dev, u8 *lnk_speed)
+{
+ int status = -ENOMEM;
+ struct ocrdma_get_link_speed_rsp *rsp;
+ struct ocrdma_mqe *cmd;
+
+ cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_QUERY_NTWK_LINK_CONFIG_V1,
+ sizeof(*cmd));
+ if (!cmd)
+ return status;
+ ocrdma_init_mch((struct ocrdma_mbx_hdr *)&cmd->u.cmd[0],
+ OCRDMA_CMD_QUERY_NTWK_LINK_CONFIG_V1,
+ OCRDMA_SUBSYS_COMMON, sizeof(*cmd));
+
+ ((struct ocrdma_mbx_hdr *)cmd->u.cmd)->rsvd_version = 0x1;
+
+ status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+ if (status)
+ goto mbx_err;
+
+ rsp = (struct ocrdma_get_link_speed_rsp *)cmd;
+ *lnk_speed = (rsp->pflt_pps_ld_pnum & OCRDMA_PHY_PS_MASK)
+ >> OCRDMA_PHY_PS_SHIFT;
+
+mbx_err:
+ kfree(cmd);
+ return status;
+}
+
+static int ocrdma_mbx_get_phy_info(struct ocrdma_dev *dev)
+{
+ int status = -ENOMEM;
+ struct ocrdma_mqe *cmd;
+ struct ocrdma_get_phy_info_rsp *rsp;
+
+ cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_PHY_DETAILS, sizeof(*cmd));
+ if (!cmd)
+ return status;
+
+ ocrdma_init_mch((struct ocrdma_mbx_hdr *)&cmd->u.cmd[0],
+ OCRDMA_CMD_PHY_DETAILS, OCRDMA_SUBSYS_COMMON,
+ sizeof(*cmd));
+
+ status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+ if (status)
+ goto mbx_err;
+
+ rsp = (struct ocrdma_get_phy_info_rsp *)cmd;
+ dev->phy.phy_type =
+ (rsp->ityp_ptyp & OCRDMA_PHY_TYPE_MASK);
+ dev->phy.interface_type =
+ (rsp->ityp_ptyp & OCRDMA_IF_TYPE_MASK)
+ >> OCRDMA_IF_TYPE_SHIFT;
+ dev->phy.auto_speeds_supported =
+ (rsp->fspeed_aspeed & OCRDMA_ASPEED_SUPP_MASK);
+ dev->phy.fixed_speeds_supported =
+ (rsp->fspeed_aspeed & OCRDMA_FSPEED_SUPP_MASK)
+ >> OCRDMA_FSPEED_SUPP_SHIFT;
+mbx_err:
+ kfree(cmd);
+ return status;
+}
+
+int ocrdma_mbx_alloc_pd(struct ocrdma_dev *dev, struct ocrdma_pd *pd)
+{
+ int status = -ENOMEM;
+ struct ocrdma_alloc_pd *cmd;
+ struct ocrdma_alloc_pd_rsp *rsp;
+
+ cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_ALLOC_PD, sizeof(*cmd));
+ if (!cmd)
+ return status;
+ if (pd->dpp_enabled)
+ cmd->enable_dpp_rsvd |= OCRDMA_ALLOC_PD_ENABLE_DPP;
+ status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+ if (status)
+ goto mbx_err;
+ rsp = (struct ocrdma_alloc_pd_rsp *)cmd;
+ pd->id = rsp->dpp_page_pdid & OCRDMA_ALLOC_PD_RSP_PDID_MASK;
+ if (rsp->dpp_page_pdid & OCRDMA_ALLOC_PD_RSP_DPP) {
+ pd->dpp_enabled = true;
+ pd->dpp_page = rsp->dpp_page_pdid >>
+ OCRDMA_ALLOC_PD_RSP_DPP_PAGE_SHIFT;
+ } else {
+ pd->dpp_enabled = false;
+ pd->num_dpp_qp = 0;
+ }
+mbx_err:
+ kfree(cmd);
+ return status;
+}
+
+int ocrdma_mbx_dealloc_pd(struct ocrdma_dev *dev, struct ocrdma_pd *pd)
+{
+ int status = -ENOMEM;
+ struct ocrdma_dealloc_pd *cmd;
+
+ cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DEALLOC_PD, sizeof(*cmd));
+ if (!cmd)
+ return status;
+ cmd->id = pd->id;
+ status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+ kfree(cmd);
+ return status;
+}
+
+
+static int ocrdma_mbx_alloc_pd_range(struct ocrdma_dev *dev)
+{
+ int status = -ENOMEM;
+ size_t pd_bitmap_size;
+ struct ocrdma_alloc_pd_range *cmd;
+ struct ocrdma_alloc_pd_range_rsp *rsp;
+
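+ /* Pre-allocate two PD ranges from firmware: an optional DPP PD
+ * range followed by the normal PD range. A bitmap sized to the
+ * returned PD count is kept per range; the pre-allocated pool is
+ * enabled only if at least one bitmap could be allocated.
+ */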
+ /* Pre allocate the DPP PDs */
+ if (dev->attr.max_dpp_pds) {
+ cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_ALLOC_PD_RANGE,
+ sizeof(*cmd));
+ if (!cmd)
+ return -ENOMEM;
+ cmd->pd_count = dev->attr.max_dpp_pds;
+ cmd->enable_dpp_rsvd |= OCRDMA_ALLOC_PD_ENABLE_DPP;
+ status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+ rsp = (struct ocrdma_alloc_pd_range_rsp *)cmd;
+
+ if (!status && (rsp->dpp_page_pdid & OCRDMA_ALLOC_PD_RSP_DPP) &&
+ rsp->pd_count) {
+ dev->pd_mgr->dpp_page_index = rsp->dpp_page_pdid >>
+ OCRDMA_ALLOC_PD_RSP_DPP_PAGE_SHIFT;
+ dev->pd_mgr->pd_dpp_start = rsp->dpp_page_pdid &
+ OCRDMA_ALLOC_PD_RNG_RSP_START_PDID_MASK;
+ dev->pd_mgr->max_dpp_pd = rsp->pd_count;
+ pd_bitmap_size =
+ BITS_TO_LONGS(rsp->pd_count) * sizeof(long);
+ dev->pd_mgr->pd_dpp_bitmap = kzalloc(pd_bitmap_size,
+ GFP_KERNEL);
+ }
+ kfree(cmd);
+ }
+
+ cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_ALLOC_PD_RANGE, sizeof(*cmd));
+ if (!cmd)
+ return -ENOMEM;
+
+ cmd->pd_count = dev->attr.max_pd - dev->attr.max_dpp_pds;
+ status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+ rsp = (struct ocrdma_alloc_pd_range_rsp *)cmd;
+ if (!status && rsp->pd_count) {
+ dev->pd_mgr->pd_norm_start = rsp->dpp_page_pdid &
+ OCRDMA_ALLOC_PD_RNG_RSP_START_PDID_MASK;
+ dev->pd_mgr->max_normal_pd = rsp->pd_count;
+ pd_bitmap_size = BITS_TO_LONGS(rsp->pd_count) * sizeof(long);
+ dev->pd_mgr->pd_norm_bitmap = kzalloc(pd_bitmap_size,
+ GFP_KERNEL);
+ }
+ kfree(cmd);
+
+ if (dev->pd_mgr->pd_norm_bitmap || dev->pd_mgr->pd_dpp_bitmap) {
+ /* Enable PD resource manager */
+ dev->pd_mgr->pd_prealloc_valid = true;
+ return 0;
+ }
+ return status;
+}
+
+static void ocrdma_mbx_dealloc_pd_range(struct ocrdma_dev *dev)
+{
+ struct ocrdma_dealloc_pd_range *cmd;
+
+ /* return normal PDs to firmware */
+ cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DEALLOC_PD_RANGE, sizeof(*cmd));
+ if (!cmd)
+ goto mbx_err;
+
+ if (dev->pd_mgr->max_normal_pd) {
+ cmd->start_pd_id = dev->pd_mgr->pd_norm_start;
+ cmd->pd_count = dev->pd_mgr->max_normal_pd;
+ ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+ }
+
+ if (dev->pd_mgr->max_dpp_pd) {
+ kfree(cmd);
+ /* return DPP PDs to firmware */
+ cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DEALLOC_PD_RANGE,
+ sizeof(*cmd));
+ if (!cmd)
+ goto mbx_err;
+
+ cmd->start_pd_id = dev->pd_mgr->pd_dpp_start;
+ cmd->pd_count = dev->pd_mgr->max_dpp_pd;
+ ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+ }
+mbx_err:
+ kfree(cmd);
+}
+
+void ocrdma_alloc_pd_pool(struct ocrdma_dev *dev)
+{
+ int status;
+
+ dev->pd_mgr = kzalloc(sizeof(struct ocrdma_pd_resource_mgr),
+ GFP_KERNEL);
+ if (!dev->pd_mgr) {
+ pr_err("%s(%d)Memory allocation failure.\n", __func__, dev->id);
+ return;
+ }
+ status = ocrdma_mbx_alloc_pd_range(dev);
+ if (status) {
+ pr_err("%s(%d) Unable to initialize PD pool, using default.\n",
+ __func__, dev->id);
+ }
+}
+
+static void ocrdma_free_pd_pool(struct ocrdma_dev *dev)
+{
+ ocrdma_mbx_dealloc_pd_range(dev);
+ kfree(dev->pd_mgr->pd_norm_bitmap);
+ kfree(dev->pd_mgr->pd_dpp_bitmap);
+ kfree(dev->pd_mgr);
+}
+
+static int ocrdma_build_q_conf(u32 *num_entries, int entry_size,
+ int *num_pages, int *page_size)
+{
+ int i;
+ int mem_size;
+
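+ /* Round the entry count up to a power of two, find the smallest
+ * supported queue size that holds it, and split that memory into
+ * at most OCRDMA_MAX_Q_PAGES equally sized pages.
+ */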
+ *num_entries = roundup_pow_of_two(*num_entries);
+ mem_size = *num_entries * entry_size;
+ /* find the lowest possible multiplier */
+ for (i = 0; i < OCRDMA_MAX_Q_PAGE_SIZE_CNT; i++) {
+ if (mem_size <= (OCRDMA_Q_PAGE_BASE_SIZE << i))
+ break;
+ }
+ if (i >= OCRDMA_MAX_Q_PAGE_SIZE_CNT)
+ return -EINVAL;
+ mem_size = roundup(mem_size,
+ ((OCRDMA_Q_PAGE_BASE_SIZE << i) / OCRDMA_MAX_Q_PAGES));
+ *num_pages =
+ mem_size / ((OCRDMA_Q_PAGE_BASE_SIZE << i) / OCRDMA_MAX_Q_PAGES);
+ *page_size = ((OCRDMA_Q_PAGE_BASE_SIZE << i) / OCRDMA_MAX_Q_PAGES);
+ *num_entries = mem_size / entry_size;
+ return 0;
+}
+
+static int ocrdma_mbx_create_ah_tbl(struct ocrdma_dev *dev)
+{
+ int i;
+ int status = 0;
+ int max_ah;
+ struct ocrdma_create_ah_tbl *cmd;
+ struct ocrdma_create_ah_tbl_rsp *rsp;
+ struct pci_dev *pdev = dev->nic_info.pdev;
+ dma_addr_t pa;
+ struct ocrdma_pbe *pbes;
+
+ cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_CREATE_AH_TBL, sizeof(*cmd));
+ if (!cmd)
+ return status;
+
+ max_ah = OCRDMA_MAX_AH;
+ dev->av_tbl.size = sizeof(struct ocrdma_av) * max_ah;
+
+ /* number of PBEs in PBL */
+ cmd->ah_conf = (OCRDMA_AH_TBL_PAGES <<
+ OCRDMA_CREATE_AH_NUM_PAGES_SHIFT) &
+ OCRDMA_CREATE_AH_NUM_PAGES_MASK;
+
+ /* page size */
+ for (i = 0; i < OCRDMA_MAX_Q_PAGE_SIZE_CNT; i++) {
+ if (PAGE_SIZE == (OCRDMA_MIN_Q_PAGE_SIZE << i))
+ break;
+ }
+ cmd->ah_conf |= (i << OCRDMA_CREATE_AH_PAGE_SIZE_SHIFT) &
+ OCRDMA_CREATE_AH_PAGE_SIZE_MASK;
+
+ /* ah_entry size */
+ cmd->ah_conf |= (sizeof(struct ocrdma_av) <<
+ OCRDMA_CREATE_AH_ENTRY_SIZE_SHIFT) &
+ OCRDMA_CREATE_AH_ENTRY_SIZE_MASK;
+
+ dev->av_tbl.pbl.va = dma_alloc_coherent(&pdev->dev, PAGE_SIZE,
+ &dev->av_tbl.pbl.pa,
+ GFP_KERNEL);
+ if (dev->av_tbl.pbl.va == NULL)
+ goto mem_err;
+
+ dev->av_tbl.va = dma_alloc_coherent(&pdev->dev, dev->av_tbl.size,
+ &pa, GFP_KERNEL);
+ if (dev->av_tbl.va == NULL)
+ goto mem_err_ah;
+ dev->av_tbl.pa = pa;
+ dev->av_tbl.num_ah = max_ah;
+ memset(dev->av_tbl.va, 0, dev->av_tbl.size);
+
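+ /* Populate the PBL with the DMA addresses of the AH table pages. */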
+ pbes = (struct ocrdma_pbe *)dev->av_tbl.pbl.va;
+ for (i = 0; i < dev->av_tbl.size / OCRDMA_MIN_Q_PAGE_SIZE; i++) {
+ pbes[i].pa_lo = (u32)cpu_to_le32(pa & 0xffffffff);
+ pbes[i].pa_hi = (u32)cpu_to_le32(upper_32_bits(pa));
+ pa += PAGE_SIZE;
+ }
+ cmd->tbl_addr[0].lo = (u32)(dev->av_tbl.pbl.pa & 0xFFFFFFFF);
+ cmd->tbl_addr[0].hi = (u32)upper_32_bits(dev->av_tbl.pbl.pa);
+ status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+ if (status)
+ goto mbx_err;
+ rsp = (struct ocrdma_create_ah_tbl_rsp *)cmd;
+ dev->av_tbl.ahid = rsp->ahid & 0xFFFF;
+ kfree(cmd);
+ return 0;
+
+mbx_err:
+ dma_free_coherent(&pdev->dev, dev->av_tbl.size, dev->av_tbl.va,
+ dev->av_tbl.pa);
+ dev->av_tbl.va = NULL;
+mem_err_ah:
+ dma_free_coherent(&pdev->dev, PAGE_SIZE, dev->av_tbl.pbl.va,
+ dev->av_tbl.pbl.pa);
+ dev->av_tbl.pbl.va = NULL;
+ dev->av_tbl.size = 0;
+mem_err:
+ kfree(cmd);
+ return status;
+}
+
+static void ocrdma_mbx_delete_ah_tbl(struct ocrdma_dev *dev)
+{
+ struct ocrdma_delete_ah_tbl *cmd;
+ struct pci_dev *pdev = dev->nic_info.pdev;
+
+ if (dev->av_tbl.va == NULL)
+ return;
+
+ cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DELETE_AH_TBL, sizeof(*cmd));
+ if (!cmd)
+ return;
+ cmd->ahid = dev->av_tbl.ahid;
+
+ ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+ dma_free_coherent(&pdev->dev, dev->av_tbl.size, dev->av_tbl.va,
+ dev->av_tbl.pa);
+ dev->av_tbl.va = NULL;
+ dma_free_coherent(&pdev->dev, PAGE_SIZE, dev->av_tbl.pbl.va,
+ dev->av_tbl.pbl.pa);
+ kfree(cmd);
+}
+
+/* Multiple CQs share an EQ. This routine returns the least used EQ
+ * to associate with the CQ, which distributes the interrupt
+ * processing and CPU load across the associated EQs, vectors and
+ * hence the CPUs.
+ */
+static u16 ocrdma_bind_eq(struct ocrdma_dev *dev)
+{
+ int i, selected_eq = 0, cq_cnt = 0;
+ u16 eq_id;
+
+ mutex_lock(&dev->dev_lock);
+ cq_cnt = dev->eq_tbl[0].cq_cnt;
+ eq_id = dev->eq_tbl[0].q.id;
+ /* find the EQ which has the least number of
+ * CQs associated with it.
+ */
+ for (i = 0; i < dev->eq_cnt; i++) {
+ if (dev->eq_tbl[i].cq_cnt < cq_cnt) {
+ cq_cnt = dev->eq_tbl[i].cq_cnt;
+ eq_id = dev->eq_tbl[i].q.id;
+ selected_eq = i;
+ }
+ }
+ dev->eq_tbl[selected_eq].cq_cnt += 1;
+ mutex_unlock(&dev->dev_lock);
+ return eq_id;
+}
+
+static void ocrdma_unbind_eq(struct ocrdma_dev *dev, u16 eq_id)
+{
+ int i;
+
+ mutex_lock(&dev->dev_lock);
+ i = ocrdma_get_eq_table_index(dev, eq_id);
+ if (i == -EINVAL)
+ BUG();
+ dev->eq_tbl[i].cq_cnt -= 1;
+ mutex_unlock(&dev->dev_lock);
+}
+
+int ocrdma_mbx_create_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq,
+ int entries, int dpp_cq, u16 pd_id)
+{
+ int status = -ENOMEM;
+ int max_hw_cqe;
+ struct pci_dev *pdev = dev->nic_info.pdev;
+ struct ocrdma_create_cq *cmd;
+ struct ocrdma_create_cq_rsp *rsp;
+ u32 hw_pages, cqe_size, page_size, cqe_count;
+
+ if (entries > dev->attr.max_cqe) {
+ pr_err("%s(%d) max_cqe=0x%x, requester_cqe=0x%x\n",
+ __func__, dev->id, dev->attr.max_cqe, entries);
+ return -EINVAL;
+ }
+ if (dpp_cq && (ocrdma_get_asic_type(dev) != OCRDMA_ASIC_GEN_SKH_R))
+ return -EINVAL;
+
+ if (dpp_cq) {
+ cq->max_hw_cqe = 1;
+ max_hw_cqe = 1;
+ cqe_size = OCRDMA_DPP_CQE_SIZE;
+ hw_pages = 1;
+ } else {
+ cq->max_hw_cqe = dev->attr.max_cqe;
+ max_hw_cqe = dev->attr.max_cqe;
+ cqe_size = sizeof(struct ocrdma_cqe);
+ hw_pages = OCRDMA_CREATE_CQ_MAX_PAGES;
+ }
+
+ cq->len = roundup(max_hw_cqe * cqe_size, OCRDMA_MIN_Q_PAGE_SIZE);
+
+ cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_CREATE_CQ, sizeof(*cmd));
+ if (!cmd)
+ return -ENOMEM;
+ ocrdma_init_mch(&cmd->cmd.req, OCRDMA_CMD_CREATE_CQ,
+ OCRDMA_SUBSYS_COMMON, sizeof(*cmd));
+ cq->va = dma_alloc_coherent(&pdev->dev, cq->len, &cq->pa, GFP_KERNEL);
+ if (!cq->va) {
+ status = -ENOMEM;
+ goto mem_err;
+ }
+ memset(cq->va, 0, cq->len);
+ page_size = cq->len / hw_pages;
+ cmd->cmd.pgsz_pgcnt = (page_size / OCRDMA_MIN_Q_PAGE_SIZE) <<
+ OCRDMA_CREATE_CQ_PAGE_SIZE_SHIFT;
+ cmd->cmd.pgsz_pgcnt |= hw_pages;
+ cmd->cmd.ev_cnt_flags = OCRDMA_CREATE_CQ_DEF_FLAGS;
+
+ cq->eqn = ocrdma_bind_eq(dev);
+ cmd->cmd.req.rsvd_version = OCRDMA_CREATE_CQ_VER3;
+ cqe_count = cq->len / cqe_size;
+ cq->cqe_cnt = cqe_count;
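+ /* Encode the CQE count for the firmware: 256/512/1024 entries map
+ * to codes 0/1/2, more than 1024 entries uses code 3, and any
+ * other count is rejected.
+ */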
+ if (cqe_count > 1024) {
+ /* Set cnt to 3 to indicate more than 1024 cq entries */
+ cmd->cmd.ev_cnt_flags |= (0x3 << OCRDMA_CREATE_CQ_CNT_SHIFT);
+ } else {
+ u8 count = 0;
+ switch (cqe_count) {
+ case 256:
+ count = 0;
+ break;
+ case 512:
+ count = 1;
+ break;
+ case 1024:
+ count = 2;
+ break;
+ default:
+ goto mbx_err;
+ }
+ cmd->cmd.ev_cnt_flags |= (count << OCRDMA_CREATE_CQ_CNT_SHIFT);
+ }
+ /* the EQ is shared among all the consumer CQs. */
+ cmd->cmd.eqn = cq->eqn;
+ if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) {
+ if (dpp_cq)
+ cmd->cmd.pgsz_pgcnt |= OCRDMA_CREATE_CQ_DPP <<
+ OCRDMA_CREATE_CQ_TYPE_SHIFT;
+ cq->phase_change = false;
+ cmd->cmd.pdid_cqecnt = (cq->len / cqe_size);
+ } else {
+ cmd->cmd.pdid_cqecnt = (cq->len / cqe_size) - 1;
+ cmd->cmd.ev_cnt_flags |= OCRDMA_CREATE_CQ_FLAGS_AUTO_VALID;
+ cq->phase_change = true;
+ }
+
+ /* pd_id valid only for v3 */
+ cmd->cmd.pdid_cqecnt |= (pd_id <<
+ OCRDMA_CREATE_CQ_CMD_PDID_SHIFT);
+ ocrdma_build_q_pages(&cmd->cmd.pa[0], hw_pages, cq->pa, page_size);
+ status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+ if (status)
+ goto mbx_err;
+
+ rsp = (struct ocrdma_create_cq_rsp *)cmd;
+ cq->id = (u16) (rsp->rsp.cq_id & OCRDMA_CREATE_CQ_RSP_CQ_ID_MASK);
+ kfree(cmd);
+ return 0;
+mbx_err:
+ ocrdma_unbind_eq(dev, cq->eqn);
+ dma_free_coherent(&pdev->dev, cq->len, cq->va, cq->pa);
+mem_err:
+ kfree(cmd);
+ return status;
+}
+
+int ocrdma_mbx_destroy_cq(struct ocrdma_dev *dev, struct ocrdma_cq *cq)
+{
+ int status = -ENOMEM;
+ struct ocrdma_destroy_cq *cmd;
+
+ cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DELETE_CQ, sizeof(*cmd));
+ if (!cmd)
+ return status;
+ ocrdma_init_mch(&cmd->req, OCRDMA_CMD_DELETE_CQ,
+ OCRDMA_SUBSYS_COMMON, sizeof(*cmd));
+
+ cmd->bypass_flush_qid |=
+ (cq->id << OCRDMA_DESTROY_CQ_QID_SHIFT) &
+ OCRDMA_DESTROY_CQ_QID_MASK;
+
+ status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+ ocrdma_unbind_eq(dev, cq->eqn);
+ dma_free_coherent(&dev->nic_info.pdev->dev, cq->len, cq->va, cq->pa);
+ kfree(cmd);
+ return status;
+}
+
+int ocrdma_mbx_alloc_lkey(struct ocrdma_dev *dev, struct ocrdma_hw_mr *hwmr,
+ u32 pdid, int addr_check)
+{
+ int status = -ENOMEM;
+ struct ocrdma_alloc_lkey *cmd;
+ struct ocrdma_alloc_lkey_rsp *rsp;
+
+ cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_ALLOC_LKEY, sizeof(*cmd));
+ if (!cmd)
+ return status;
+ cmd->pdid = pdid;
+ cmd->pbl_sz_flags |= addr_check;
+ cmd->pbl_sz_flags |= (hwmr->fr_mr << OCRDMA_ALLOC_LKEY_FMR_SHIFT);
+ cmd->pbl_sz_flags |=
+ (hwmr->remote_wr << OCRDMA_ALLOC_LKEY_REMOTE_WR_SHIFT);
+ cmd->pbl_sz_flags |=
+ (hwmr->remote_rd << OCRDMA_ALLOC_LKEY_REMOTE_RD_SHIFT);
+ cmd->pbl_sz_flags |=
+ (hwmr->local_wr << OCRDMA_ALLOC_LKEY_LOCAL_WR_SHIFT);
+ cmd->pbl_sz_flags |=
+ (hwmr->remote_atomic << OCRDMA_ALLOC_LKEY_REMOTE_ATOMIC_SHIFT);
+ cmd->pbl_sz_flags |=
+ (hwmr->num_pbls << OCRDMA_ALLOC_LKEY_PBL_SIZE_SHIFT);
+
+ status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+ if (status)
+ goto mbx_err;
+ rsp = (struct ocrdma_alloc_lkey_rsp *)cmd;
+ hwmr->lkey = rsp->lrkey;
+mbx_err:
+ kfree(cmd);
+ return status;
+}
+
+int ocrdma_mbx_dealloc_lkey(struct ocrdma_dev *dev, int fr_mr, u32 lkey)
+{
+ int status = -ENOMEM;
+ struct ocrdma_dealloc_lkey *cmd;
+
+ cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DEALLOC_LKEY, sizeof(*cmd));
+ if (!cmd)
+ return -ENOMEM;
+ cmd->lkey = lkey;
+ cmd->rsvd_frmr = fr_mr ? 1 : 0;
+ status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+ if (status)
+ goto mbx_err;
+mbx_err:
+ kfree(cmd);
+ return status;
+}
+
+static int ocrdma_mbx_reg_mr(struct ocrdma_dev *dev, struct ocrdma_hw_mr *hwmr,
+ u32 pdid, u32 pbl_cnt, u32 pbe_size, u32 last)
+{
+ int status = -ENOMEM;
+ int i;
+ struct ocrdma_reg_nsmr *cmd;
+ struct ocrdma_reg_nsmr_rsp *rsp;
+
+ cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_REGISTER_NSMR, sizeof(*cmd));
+ if (!cmd)
+ return -ENOMEM;
+ cmd->num_pbl_pdid =
+ pdid | (hwmr->num_pbls << OCRDMA_REG_NSMR_NUM_PBL_SHIFT);
+ cmd->fr_mr = hwmr->fr_mr;
+
+ cmd->flags_hpage_pbe_sz |= (hwmr->remote_wr <<
+ OCRDMA_REG_NSMR_REMOTE_WR_SHIFT);
+ cmd->flags_hpage_pbe_sz |= (hwmr->remote_rd <<
+ OCRDMA_REG_NSMR_REMOTE_RD_SHIFT);
+ cmd->flags_hpage_pbe_sz |= (hwmr->local_wr <<
+ OCRDMA_REG_NSMR_LOCAL_WR_SHIFT);
+ cmd->flags_hpage_pbe_sz |= (hwmr->remote_atomic <<
+ OCRDMA_REG_NSMR_REMOTE_ATOMIC_SHIFT);
+ cmd->flags_hpage_pbe_sz |= (hwmr->mw_bind <<
+ OCRDMA_REG_NSMR_BIND_MEMWIN_SHIFT);
+ cmd->flags_hpage_pbe_sz |= (last << OCRDMA_REG_NSMR_LAST_SHIFT);
+
+ cmd->flags_hpage_pbe_sz |= (hwmr->pbe_size / OCRDMA_MIN_HPAGE_SIZE);
+ cmd->flags_hpage_pbe_sz |= (hwmr->pbl_size / OCRDMA_MIN_HPAGE_SIZE) <<
+ OCRDMA_REG_NSMR_HPAGE_SIZE_SHIFT;
+ cmd->totlen_low = hwmr->len;
+ cmd->totlen_high = upper_32_bits(hwmr->len);
+ cmd->fbo_low = (u32) (hwmr->fbo & 0xffffffff);
+ cmd->fbo_high = (u32) upper_32_bits(hwmr->fbo);
+ cmd->va_loaddr = (u32) hwmr->va;
+ cmd->va_hiaddr = (u32) upper_32_bits(hwmr->va);
+
+ for (i = 0; i < pbl_cnt; i++) {
+ cmd->pbl[i].lo = (u32) (hwmr->pbl_table[i].pa & 0xffffffff);
+ cmd->pbl[i].hi = upper_32_bits(hwmr->pbl_table[i].pa);
+ }
+ status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+ if (status)
+ goto mbx_err;
+ rsp = (struct ocrdma_reg_nsmr_rsp *)cmd;
+ hwmr->lkey = rsp->lrkey;
+mbx_err:
+ kfree(cmd);
+ return status;
+}
+
+static int ocrdma_mbx_reg_mr_cont(struct ocrdma_dev *dev,
+ struct ocrdma_hw_mr *hwmr, u32 pbl_cnt,
+ u32 pbl_offset, u32 last)
+{
+ int status = -ENOMEM;
+ int i;
+ struct ocrdma_reg_nsmr_cont *cmd;
+
+ cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_REGISTER_NSMR_CONT, sizeof(*cmd));
+ if (!cmd)
+ return -ENOMEM;
+ cmd->lrkey = hwmr->lkey;
+ cmd->num_pbl_offset = (pbl_cnt << OCRDMA_REG_NSMR_CONT_NUM_PBL_SHIFT) |
+ (pbl_offset & OCRDMA_REG_NSMR_CONT_PBL_SHIFT_MASK);
+ cmd->last = last << OCRDMA_REG_NSMR_CONT_LAST_SHIFT;
+
+ for (i = 0; i < pbl_cnt; i++) {
+ cmd->pbl[i].lo =
+ (u32) (hwmr->pbl_table[i + pbl_offset].pa & 0xffffffff);
+ cmd->pbl[i].hi =
+ upper_32_bits(hwmr->pbl_table[i + pbl_offset].pa);
+ }
+ status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+ if (status)
+ goto mbx_err;
+mbx_err:
+ kfree(cmd);
+ return status;
+}
+
+int ocrdma_reg_mr(struct ocrdma_dev *dev,
+ struct ocrdma_hw_mr *hwmr, u32 pdid, int acc)
+{
+ int status;
+ u32 last = 0;
+ u32 cur_pbl_cnt, pbl_offset;
+ u32 pending_pbl_cnt = hwmr->num_pbls;
+
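+ /* Register the MR in chunks of at most MAX_OCRDMA_NSMR_PBL PBLs:
+ * the first chunk uses REGISTER_NSMR, later chunks use
+ * REGISTER_NSMR_CONT, and the final chunk sets the "last" flag.
+ */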
+ pbl_offset = 0;
+ cur_pbl_cnt = min(pending_pbl_cnt, MAX_OCRDMA_NSMR_PBL);
+ if (cur_pbl_cnt == pending_pbl_cnt)
+ last = 1;
+
+ status = ocrdma_mbx_reg_mr(dev, hwmr, pdid,
+ cur_pbl_cnt, hwmr->pbe_size, last);
+ if (status) {
+ pr_err("%s() status=%d\n", __func__, status);
+ return status;
+ }
+ /* if there are no more PBLs to register, then exit. */
+ if (last)
+ return 0;
+
+ while (!last) {
+ pbl_offset += cur_pbl_cnt;
+ pending_pbl_cnt -= cur_pbl_cnt;
+ cur_pbl_cnt = min(pending_pbl_cnt, MAX_OCRDMA_NSMR_PBL);
+ /* if we reach the end of the PBLs, we need to set the last
+ * bit, indicating there are no more PBLs to register for this
+ * memory key.
+ */
+ if (cur_pbl_cnt == pending_pbl_cnt)
+ last = 1;
+
+ status = ocrdma_mbx_reg_mr_cont(dev, hwmr, cur_pbl_cnt,
+ pbl_offset, last);
+ if (status)
+ break;
+ }
+ if (status)
+ pr_err("%s() err. status=%d\n", __func__, status);
+
+ return status;
+}
+
+bool ocrdma_is_qp_in_sq_flushlist(struct ocrdma_cq *cq, struct ocrdma_qp *qp)
+{
+ struct ocrdma_qp *tmp;
+ bool found = false;
+ list_for_each_entry(tmp, &cq->sq_head, sq_entry) {
+ if (qp == tmp) {
+ found = true;
+ break;
+ }
+ }
+ return found;
+}
+
+bool ocrdma_is_qp_in_rq_flushlist(struct ocrdma_cq *cq, struct ocrdma_qp *qp)
+{
+ struct ocrdma_qp *tmp;
+ bool found = false;
+ list_for_each_entry(tmp, &cq->rq_head, rq_entry) {
+ if (qp == tmp) {
+ found = true;
+ break;
+ }
+ }
+ return found;
+}
+
+void ocrdma_flush_qp(struct ocrdma_qp *qp)
+{
+ bool found;
+ unsigned long flags;
+ struct ocrdma_dev *dev = get_ocrdma_dev(qp->ibqp.device);
+
+ spin_lock_irqsave(&dev->flush_q_lock, flags);
+ found = ocrdma_is_qp_in_sq_flushlist(qp->sq_cq, qp);
+ if (!found)
+ list_add_tail(&qp->sq_entry, &qp->sq_cq->sq_head);
+ if (!qp->srq) {
+ found = ocrdma_is_qp_in_rq_flushlist(qp->rq_cq, qp);
+ if (!found)
+ list_add_tail(&qp->rq_entry, &qp->rq_cq->rq_head);
+ }
+ spin_unlock_irqrestore(&dev->flush_q_lock, flags);
+}
+
+static void ocrdma_init_hwq_ptr(struct ocrdma_qp *qp)
+{
+ qp->sq.head = 0;
+ qp->sq.tail = 0;
+ qp->rq.head = 0;
+ qp->rq.tail = 0;
+}
+
+int ocrdma_qp_state_change(struct ocrdma_qp *qp, enum ib_qp_state new_ib_state,
+ enum ib_qp_state *old_ib_state)
+{
+ unsigned long flags;
+ int status = 0;
+ enum ocrdma_qp_state new_state;
+ new_state = get_ocrdma_qp_state(new_ib_state);
+
+ /* sync with wqe and rqe posting */
+ spin_lock_irqsave(&qp->q_lock, flags);
+
+ if (old_ib_state)
+ *old_ib_state = get_ibqp_state(qp->state);
+ if (new_state == qp->state) {
+ spin_unlock_irqrestore(&qp->q_lock, flags);
+ return 1;
+ }
+
+
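+ /* A move to INIT resets the SQ/RQ head and tail pointers and
+ * removes the QP from the flush lists; a move to ERR adds it to
+ * the flush lists so its CQEs get flushed.
+ */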
+ if (new_state == OCRDMA_QPS_INIT) {
+ ocrdma_init_hwq_ptr(qp);
+ ocrdma_del_flush_qp(qp);
+ } else if (new_state == OCRDMA_QPS_ERR) {
+ ocrdma_flush_qp(qp);
+ }
+
+ qp->state = new_state;
+
+ spin_unlock_irqrestore(&qp->q_lock, flags);
+ return status;
+}
+
+static u32 ocrdma_set_create_qp_mbx_access_flags(struct ocrdma_qp *qp)
+{
+ u32 flags = 0;
+ if (qp->cap_flags & OCRDMA_QP_INB_RD)
+ flags |= OCRDMA_CREATE_QP_REQ_INB_RDEN_MASK;
+ if (qp->cap_flags & OCRDMA_QP_INB_WR)
+ flags |= OCRDMA_CREATE_QP_REQ_INB_WREN_MASK;
+ if (qp->cap_flags & OCRDMA_QP_MW_BIND)
+ flags |= OCRDMA_CREATE_QP_REQ_BIND_MEMWIN_MASK;
+ if (qp->cap_flags & OCRDMA_QP_LKEY0)
+ flags |= OCRDMA_CREATE_QP_REQ_ZERO_LKEYEN_MASK;
+ if (qp->cap_flags & OCRDMA_QP_FAST_REG)
+ flags |= OCRDMA_CREATE_QP_REQ_FMR_EN_MASK;
+ return flags;
+}
+
+static int ocrdma_set_create_qp_sq_cmd(struct ocrdma_create_qp_req *cmd,
+ struct ib_qp_init_attr *attrs,
+ struct ocrdma_qp *qp)
+{
+ int status;
+ u32 len, hw_pages, hw_page_size;
+ dma_addr_t pa;
+ struct ocrdma_pd *pd = qp->pd;
+ struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device);
+ struct pci_dev *pdev = dev->nic_info.pdev;
+ u32 max_wqe_allocated;
+ u32 max_sges = attrs->cap.max_send_sge;
+
+ /* QP1 may exceed 127 */
+ max_wqe_allocated = min_t(u32, attrs->cap.max_send_wr + 1,
+ dev->attr.max_wqe);
+
+ status = ocrdma_build_q_conf(&max_wqe_allocated,
+ dev->attr.wqe_size, &hw_pages, &hw_page_size);
+ if (status) {
+ pr_err("%s() req. max_send_wr=0x%x\n", __func__,
+ max_wqe_allocated);
+ return -EINVAL;
+ }
+ qp->sq.max_cnt = max_wqe_allocated;
+ len = (hw_pages * hw_page_size);
+
+ qp->sq.va = dma_alloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL);
+ if (!qp->sq.va)
+ return -EINVAL;
+ memset(qp->sq.va, 0, len);
+ qp->sq.len = len;
+ qp->sq.pa = pa;
+ qp->sq.entry_size = dev->attr.wqe_size;
+ ocrdma_build_q_pages(&cmd->wq_addr[0], hw_pages, pa, hw_page_size);
+
+ cmd->type_pgsz_pdn |= (ilog2(hw_page_size / OCRDMA_MIN_Q_PAGE_SIZE)
+ << OCRDMA_CREATE_QP_REQ_SQ_PAGE_SIZE_SHIFT);
+ cmd->num_wq_rq_pages |= (hw_pages <<
+ OCRDMA_CREATE_QP_REQ_NUM_WQ_PAGES_SHIFT) &
+ OCRDMA_CREATE_QP_REQ_NUM_WQ_PAGES_MASK;
+ cmd->max_sge_send_write |= (max_sges <<
+ OCRDMA_CREATE_QP_REQ_MAX_SGE_SEND_SHIFT) &
+ OCRDMA_CREATE_QP_REQ_MAX_SGE_SEND_MASK;
+ cmd->max_sge_send_write |= (max_sges <<
+ OCRDMA_CREATE_QP_REQ_MAX_SGE_WRITE_SHIFT) &
+ OCRDMA_CREATE_QP_REQ_MAX_SGE_WRITE_MASK;
+ cmd->max_wqe_rqe |= (ilog2(qp->sq.max_cnt) <<
+ OCRDMA_CREATE_QP_REQ_MAX_WQE_SHIFT) &
+ OCRDMA_CREATE_QP_REQ_MAX_WQE_MASK;
+ cmd->wqe_rqe_size |= (dev->attr.wqe_size <<
+ OCRDMA_CREATE_QP_REQ_WQE_SIZE_SHIFT) &
+ OCRDMA_CREATE_QP_REQ_WQE_SIZE_MASK;
+ return 0;
+}
+
+static int ocrdma_set_create_qp_rq_cmd(struct ocrdma_create_qp_req *cmd,
+ struct ib_qp_init_attr *attrs,
+ struct ocrdma_qp *qp)
+{
+ int status;
+ u32 len, hw_pages, hw_page_size;
+ dma_addr_t pa = 0;
+ struct ocrdma_pd *pd = qp->pd;
+ struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device);
+ struct pci_dev *pdev = dev->nic_info.pdev;
+ u32 max_rqe_allocated = attrs->cap.max_recv_wr + 1;
+
+ status = ocrdma_build_q_conf(&max_rqe_allocated, dev->attr.rqe_size,
+ &hw_pages, &hw_page_size);
+ if (status) {
+ pr_err("%s() req. max_recv_wr=0x%x\n", __func__,
+ attrs->cap.max_recv_wr + 1);
+ return status;
+ }
+ qp->rq.max_cnt = max_rqe_allocated;
+ len = (hw_pages * hw_page_size);
+
+ qp->rq.va = dma_alloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL);
+ if (!qp->rq.va)
+ return -ENOMEM;
+ memset(qp->rq.va, 0, len);
+ qp->rq.pa = pa;
+ qp->rq.len = len;
+ qp->rq.entry_size = dev->attr.rqe_size;
+
+ ocrdma_build_q_pages(&cmd->rq_addr[0], hw_pages, pa, hw_page_size);
+ cmd->type_pgsz_pdn |= (ilog2(hw_page_size / OCRDMA_MIN_Q_PAGE_SIZE) <<
+ OCRDMA_CREATE_QP_REQ_RQ_PAGE_SIZE_SHIFT);
+ cmd->num_wq_rq_pages |=
+ (hw_pages << OCRDMA_CREATE_QP_REQ_NUM_RQ_PAGES_SHIFT) &
+ OCRDMA_CREATE_QP_REQ_NUM_RQ_PAGES_MASK;
+ cmd->max_sge_recv_flags |= (attrs->cap.max_recv_sge <<
+ OCRDMA_CREATE_QP_REQ_MAX_SGE_RECV_SHIFT) &
+ OCRDMA_CREATE_QP_REQ_MAX_SGE_RECV_MASK;
+ cmd->max_wqe_rqe |= (ilog2(qp->rq.max_cnt) <<
+ OCRDMA_CREATE_QP_REQ_MAX_RQE_SHIFT) &
+ OCRDMA_CREATE_QP_REQ_MAX_RQE_MASK;
+ cmd->wqe_rqe_size |= (dev->attr.rqe_size <<
+ OCRDMA_CREATE_QP_REQ_RQE_SIZE_SHIFT) &
+ OCRDMA_CREATE_QP_REQ_RQE_SIZE_MASK;
+ return 0;
+}
+
+static void ocrdma_set_create_qp_dpp_cmd(struct ocrdma_create_qp_req *cmd,
+ struct ocrdma_pd *pd,
+ struct ocrdma_qp *qp,
+ u8 enable_dpp_cq, u16 dpp_cq_id)
+{
+ pd->num_dpp_qp--;
+ qp->dpp_enabled = true;
+ cmd->max_sge_recv_flags |= OCRDMA_CREATE_QP_REQ_ENABLE_DPP_MASK;
+ if (!enable_dpp_cq)
+ return;
+ cmd->max_sge_recv_flags |= OCRDMA_CREATE_QP_REQ_ENABLE_DPP_MASK;
+ cmd->dpp_credits_cqid = dpp_cq_id;
+ cmd->dpp_credits_cqid |= OCRDMA_CREATE_QP_REQ_DPP_CREDIT_LIMIT <<
+ OCRDMA_CREATE_QP_REQ_DPP_CREDIT_SHIFT;
+}
+
+static int ocrdma_set_create_qp_ird_cmd(struct ocrdma_create_qp_req *cmd,
+ struct ocrdma_qp *qp)
+{
+ struct ocrdma_pd *pd = qp->pd;
+ struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device);
+ struct pci_dev *pdev = dev->nic_info.pdev;
+ dma_addr_t pa = 0;
+ int ird_page_size = dev->attr.ird_page_size;
+ int ird_q_len = dev->attr.num_ird_pages * ird_page_size;
+ struct ocrdma_hdr_wqe *rqe;
+ int i = 0;
+
+ if (dev->attr.ird == 0)
+ return 0;
+
+ qp->ird_q_va = dma_alloc_coherent(&pdev->dev, ird_q_len,
+ &pa, GFP_KERNEL);
+ if (!qp->ird_q_va)
+ return -ENOMEM;
+ memset(qp->ird_q_va, 0, ird_q_len);
+ ocrdma_build_q_pages(&cmd->ird_addr[0], dev->attr.num_ird_pages,
+ pa, ird_page_size);
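+ /* Pre-format each IRD queue entry as an LKEY-type WQE header with
+ * a WQE size and next-WQE size of 8.
+ */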
+ for (; i < ird_q_len / dev->attr.rqe_size; i++) {
+ rqe = (struct ocrdma_hdr_wqe *)(qp->ird_q_va +
+ (i * dev->attr.rqe_size));
+ rqe->cw = 0;
+ rqe->cw |= 2;
+ rqe->cw |= (OCRDMA_TYPE_LKEY << OCRDMA_WQE_TYPE_SHIFT);
+ rqe->cw |= (8 << OCRDMA_WQE_SIZE_SHIFT);
+ rqe->cw |= (8 << OCRDMA_WQE_NXT_WQE_SIZE_SHIFT);
+ }
+ return 0;
+}
+
+static void ocrdma_get_create_qp_rsp(struct ocrdma_create_qp_rsp *rsp,
+ struct ocrdma_qp *qp,
+ struct ib_qp_init_attr *attrs,
+ u16 *dpp_offset, u16 *dpp_credit_lmt)
+{
+ u32 max_wqe_allocated, max_rqe_allocated;
+ qp->id = rsp->qp_id & OCRDMA_CREATE_QP_RSP_QP_ID_MASK;
+ qp->rq.dbid = rsp->sq_rq_id & OCRDMA_CREATE_QP_RSP_RQ_ID_MASK;
+ qp->sq.dbid = rsp->sq_rq_id >> OCRDMA_CREATE_QP_RSP_SQ_ID_SHIFT;
+ qp->max_ird = rsp->max_ord_ird & OCRDMA_CREATE_QP_RSP_MAX_IRD_MASK;
+ qp->max_ord = (rsp->max_ord_ird >> OCRDMA_CREATE_QP_RSP_MAX_ORD_SHIFT);
+ qp->dpp_enabled = false;
+ if (rsp->dpp_response & OCRDMA_CREATE_QP_RSP_DPP_ENABLED_MASK) {
+ qp->dpp_enabled = true;
+ *dpp_credit_lmt = (rsp->dpp_response &
+ OCRDMA_CREATE_QP_RSP_DPP_CREDITS_MASK) >>
+ OCRDMA_CREATE_QP_RSP_DPP_CREDITS_SHIFT;
+ *dpp_offset = (rsp->dpp_response &
+ OCRDMA_CREATE_QP_RSP_DPP_PAGE_OFFSET_MASK) >>
+ OCRDMA_CREATE_QP_RSP_DPP_PAGE_OFFSET_SHIFT;
+ }
+ max_wqe_allocated =
+ rsp->max_wqe_rqe >> OCRDMA_CREATE_QP_RSP_MAX_WQE_SHIFT;
+ max_wqe_allocated = 1 << max_wqe_allocated;
+ max_rqe_allocated = 1 << ((u16)rsp->max_wqe_rqe);
+
+ qp->sq.max_cnt = max_wqe_allocated;
+ qp->sq.max_wqe_idx = max_wqe_allocated - 1;
+
+ if (!attrs->srq) {
+ qp->rq.max_cnt = max_rqe_allocated;
+ qp->rq.max_wqe_idx = max_rqe_allocated - 1;
+ }
+}
+
+int ocrdma_mbx_create_qp(struct ocrdma_qp *qp, struct ib_qp_init_attr *attrs,
+ u8 enable_dpp_cq, u16 dpp_cq_id, u16 *dpp_offset,
+ u16 *dpp_credit_lmt)
+{
+ int status = -ENOMEM;
+ u32 flags = 0;
+ struct ocrdma_pd *pd = qp->pd;
+ struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device);
+ struct pci_dev *pdev = dev->nic_info.pdev;
+ struct ocrdma_cq *cq;
+ struct ocrdma_create_qp_req *cmd;
+ struct ocrdma_create_qp_rsp *rsp;
+ int qptype;
+
+ switch (attrs->qp_type) {
+ case IB_QPT_GSI:
+ qptype = OCRDMA_QPT_GSI;
+ break;
+ case IB_QPT_RC:
+ qptype = OCRDMA_QPT_RC;
+ break;
+ case IB_QPT_UD:
+ qptype = OCRDMA_QPT_UD;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_CREATE_QP, sizeof(*cmd));
+ if (!cmd)
+ return status;
+ cmd->type_pgsz_pdn |= (qptype << OCRDMA_CREATE_QP_REQ_QPT_SHIFT) &
+ OCRDMA_CREATE_QP_REQ_QPT_MASK;
+ status = ocrdma_set_create_qp_sq_cmd(cmd, attrs, qp);
+ if (status)
+ goto sq_err;
+
+ if (attrs->srq) {
+ struct ocrdma_srq *srq = get_ocrdma_srq(attrs->srq);
+ cmd->max_sge_recv_flags |= OCRDMA_CREATE_QP_REQ_USE_SRQ_MASK;
+ cmd->rq_addr[0].lo = srq->id;
+ qp->srq = srq;
+ } else {
+ status = ocrdma_set_create_qp_rq_cmd(cmd, attrs, qp);
+ if (status)
+ goto rq_err;
+ }
+
+ status = ocrdma_set_create_qp_ird_cmd(cmd, qp);
+ if (status)
+ goto mbx_err;
+
+ cmd->type_pgsz_pdn |= (pd->id << OCRDMA_CREATE_QP_REQ_PD_ID_SHIFT) &
+ OCRDMA_CREATE_QP_REQ_PD_ID_MASK;
+
+ flags = ocrdma_set_create_qp_mbx_access_flags(qp);
+
+ cmd->max_sge_recv_flags |= flags;
+ cmd->max_ord_ird |= (dev->attr.max_ord_per_qp <<
+ OCRDMA_CREATE_QP_REQ_MAX_ORD_SHIFT) &
+ OCRDMA_CREATE_QP_REQ_MAX_ORD_MASK;
+ cmd->max_ord_ird |= (dev->attr.max_ird_per_qp <<
+ OCRDMA_CREATE_QP_REQ_MAX_IRD_SHIFT) &
+ OCRDMA_CREATE_QP_REQ_MAX_IRD_MASK;
+ cq = get_ocrdma_cq(attrs->send_cq);
+ cmd->wq_rq_cqid |= (cq->id << OCRDMA_CREATE_QP_REQ_WQ_CQID_SHIFT) &
+ OCRDMA_CREATE_QP_REQ_WQ_CQID_MASK;
+ qp->sq_cq = cq;
+ cq = get_ocrdma_cq(attrs->recv_cq);
+ cmd->wq_rq_cqid |= (cq->id << OCRDMA_CREATE_QP_REQ_RQ_CQID_SHIFT) &
+ OCRDMA_CREATE_QP_REQ_RQ_CQID_MASK;
+ qp->rq_cq = cq;
+
+ if (pd->dpp_enabled && attrs->cap.max_inline_data && pd->num_dpp_qp &&
+ (attrs->cap.max_inline_data <= dev->attr.max_inline_data)) {
+ ocrdma_set_create_qp_dpp_cmd(cmd, pd, qp, enable_dpp_cq,
+ dpp_cq_id);
+ }
+
+ status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+ if (status)
+ goto mbx_err;
+ rsp = (struct ocrdma_create_qp_rsp *)cmd;
+ ocrdma_get_create_qp_rsp(rsp, qp, attrs, dpp_offset, dpp_credit_lmt);
+ qp->state = OCRDMA_QPS_RST;
+ kfree(cmd);
+ return 0;
+mbx_err:
+ if (qp->rq.va)
+ dma_free_coherent(&pdev->dev, qp->rq.len, qp->rq.va, qp->rq.pa);
+rq_err:
+ pr_err("%s(%d) rq_err\n", __func__, dev->id);
+ dma_free_coherent(&pdev->dev, qp->sq.len, qp->sq.va, qp->sq.pa);
+sq_err:
+ pr_err("%s(%d) sq_err\n", __func__, dev->id);
+ kfree(cmd);
+ return status;
+}
+
+int ocrdma_mbx_query_qp(struct ocrdma_dev *dev, struct ocrdma_qp *qp,
+ struct ocrdma_qp_params *param)
+{
+ int status = -ENOMEM;
+ struct ocrdma_query_qp *cmd;
+ struct ocrdma_query_qp_rsp *rsp;
+
+ cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_QUERY_QP, sizeof(*rsp));
+ if (!cmd)
+ return status;
+ cmd->qp_id = qp->id;
+ status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+ if (status)
+ goto mbx_err;
+ rsp = (struct ocrdma_query_qp_rsp *)cmd;
+ memcpy(param, &rsp->params, sizeof(struct ocrdma_qp_params));
+mbx_err:
+ kfree(cmd);
+ return status;
+}
+
+static int ocrdma_set_av_params(struct ocrdma_qp *qp,
+ struct ocrdma_modify_qp *cmd,
+ struct ib_qp_attr *attrs,
+ int attr_mask)
+{
+ int status;
+ struct ib_ah_attr *ah_attr = &attrs->ah_attr;
+ union ib_gid sgid, zgid;
+ u32 vlan_id = 0xFFFF;
+ u8 mac_addr[6];
+ struct ocrdma_dev *dev = get_ocrdma_dev(qp->ibqp.device);
+
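+ /* An address vector is valid only with a GRH; below, the SGID,
+ * destination MAC and (optionally) VLAN are resolved and copied
+ * into the modify-QP command in LE format.
+ */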
+ if ((ah_attr->ah_flags & IB_AH_GRH) == 0)
+ return -EINVAL;
+ if (atomic_cmpxchg(&dev->update_sl, 1, 0))
+ ocrdma_init_service_level(dev);
+ cmd->params.tclass_sq_psn |=
+ (ah_attr->grh.traffic_class << OCRDMA_QP_PARAMS_TCLASS_SHIFT);
+ cmd->params.rnt_rc_sl_fl |=
+ (ah_attr->grh.flow_label & OCRDMA_QP_PARAMS_FLOW_LABEL_MASK);
+ cmd->params.rnt_rc_sl_fl |= (ah_attr->sl << OCRDMA_QP_PARAMS_SL_SHIFT);
+ cmd->params.hop_lmt_rq_psn |=
+ (ah_attr->grh.hop_limit << OCRDMA_QP_PARAMS_HOP_LMT_SHIFT);
+ cmd->flags |= OCRDMA_QP_PARA_FLOW_LBL_VALID;
+ memcpy(&cmd->params.dgid[0], &ah_attr->grh.dgid.raw[0],
+ sizeof(cmd->params.dgid));
+ status = ocrdma_query_gid(&dev->ibdev, 1,
+ ah_attr->grh.sgid_index, &sgid);
+ if (status)
+ return status;
+
+ memset(&zgid, 0, sizeof(zgid));
+ if (!memcmp(&sgid, &zgid, sizeof(zgid)))
+ return -EINVAL;
+
+ qp->sgid_idx = ah_attr->grh.sgid_index;
+ memcpy(&cmd->params.sgid[0], &sgid.raw[0], sizeof(cmd->params.sgid));
+ status = ocrdma_resolve_dmac(dev, ah_attr, &mac_addr[0]);
+ if (status)
+ return status;
+ cmd->params.dmac_b0_to_b3 = mac_addr[0] | (mac_addr[1] << 8) |
+ (mac_addr[2] << 16) | (mac_addr[3] << 24);
+ /* convert them to LE format. */
+ ocrdma_cpu_to_le32(&cmd->params.dgid[0], sizeof(cmd->params.dgid));
+ ocrdma_cpu_to_le32(&cmd->params.sgid[0], sizeof(cmd->params.sgid));
+ cmd->params.vlan_dmac_b4_to_b5 = mac_addr[4] | (mac_addr[5] << 8);
+ if (attr_mask & IB_QP_VID) {
+ vlan_id = attrs->vlan_id;
+ } else if (dev->pfc_state) {
+ vlan_id = 0;
+ pr_err("ocrdma%d:Using VLAN with PFC is recommended\n",
+ dev->id);
+ pr_err("ocrdma%d:Using VLAN 0 for this connection\n",
+ dev->id);
+ }
+
+ if (vlan_id < 0x1000) {
+ cmd->params.vlan_dmac_b4_to_b5 |=
+ vlan_id << OCRDMA_QP_PARAMS_VLAN_SHIFT;
+ cmd->flags |= OCRDMA_QP_PARA_VLAN_EN_VALID;
+ cmd->params.rnt_rc_sl_fl |=
+ (dev->sl & 0x07) << OCRDMA_QP_PARAMS_SL_SHIFT;
+ }
+
+ return 0;
+}
+
+static int ocrdma_set_qp_params(struct ocrdma_qp *qp,
+ struct ocrdma_modify_qp *cmd,
+ struct ib_qp_attr *attrs, int attr_mask)
+{
+ int status = 0;
+ struct ocrdma_dev *dev = get_ocrdma_dev(qp->ibqp.device);
+
+ if (attr_mask & IB_QP_PKEY_INDEX) {
+ cmd->params.path_mtu_pkey_indx |= (attrs->pkey_index &
+ OCRDMA_QP_PARAMS_PKEY_INDEX_MASK);
+ cmd->flags |= OCRDMA_QP_PARA_PKEY_VALID;
+ }
+ if (attr_mask & IB_QP_QKEY) {
+ qp->qkey = attrs->qkey;
+ cmd->params.qkey = attrs->qkey;
+ cmd->flags |= OCRDMA_QP_PARA_QKEY_VALID;
+ }
+ if (attr_mask & IB_QP_AV) {
+ status = ocrdma_set_av_params(qp, cmd, attrs, attr_mask);
+ if (status)
+ return status;
+ } else if (qp->qp_type == IB_QPT_GSI || qp->qp_type == IB_QPT_UD) {
+ /* set the default mac address for UD, GSI QPs */
+ cmd->params.dmac_b0_to_b3 = dev->nic_info.mac_addr[0] |
+ (dev->nic_info.mac_addr[1] << 8) |
+ (dev->nic_info.mac_addr[2] << 16) |
+ (dev->nic_info.mac_addr[3] << 24);
+ cmd->params.vlan_dmac_b4_to_b5 = dev->nic_info.mac_addr[4] |
+ (dev->nic_info.mac_addr[5] << 8);
+ }
+ if ((attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) &&
+ attrs->en_sqd_async_notify) {
+ cmd->params.max_sge_recv_flags |=
+ OCRDMA_QP_PARAMS_FLAGS_SQD_ASYNC;
+ cmd->flags |= OCRDMA_QP_PARA_DST_QPN_VALID;
+ }
+ if (attr_mask & IB_QP_DEST_QPN) {
+ cmd->params.ack_to_rnr_rtc_dest_qpn |= (attrs->dest_qp_num &
+ OCRDMA_QP_PARAMS_DEST_QPN_MASK);
+ cmd->flags |= OCRDMA_QP_PARA_DST_QPN_VALID;
+ }
+ if (attr_mask & IB_QP_PATH_MTU) {
+ if (attrs->path_mtu < IB_MTU_512 ||
+ attrs->path_mtu > IB_MTU_4096) {
+ pr_err("ocrdma%d: IB MTU %d is not supported\n",
+ dev->id, ib_mtu_enum_to_int(attrs->path_mtu));
+ status = -EINVAL;
+ goto pmtu_err;
+ }
+ cmd->params.path_mtu_pkey_indx |=
+ (ib_mtu_enum_to_int(attrs->path_mtu) <<
+ OCRDMA_QP_PARAMS_PATH_MTU_SHIFT) &
+ OCRDMA_QP_PARAMS_PATH_MTU_MASK;
+ cmd->flags |= OCRDMA_QP_PARA_PMTU_VALID;
+ }
+ if (attr_mask & IB_QP_TIMEOUT) {
+ cmd->params.ack_to_rnr_rtc_dest_qpn |= attrs->timeout <<
+ OCRDMA_QP_PARAMS_ACK_TIMEOUT_SHIFT;
+ cmd->flags |= OCRDMA_QP_PARA_ACK_TO_VALID;
+ }
+ if (attr_mask & IB_QP_RETRY_CNT) {
+ cmd->params.rnt_rc_sl_fl |= (attrs->retry_cnt <<
+ OCRDMA_QP_PARAMS_RETRY_CNT_SHIFT) &
+ OCRDMA_QP_PARAMS_RETRY_CNT_MASK;
+ cmd->flags |= OCRDMA_QP_PARA_RETRY_CNT_VALID;
+ }
+ if (attr_mask & IB_QP_MIN_RNR_TIMER) {
+ cmd->params.rnt_rc_sl_fl |= (attrs->min_rnr_timer <<
+ OCRDMA_QP_PARAMS_RNR_NAK_TIMER_SHIFT) &
+ OCRDMA_QP_PARAMS_RNR_NAK_TIMER_MASK;
+ cmd->flags |= OCRDMA_QP_PARA_RNT_VALID;
+ }
+ if (attr_mask & IB_QP_RNR_RETRY) {
+ cmd->params.ack_to_rnr_rtc_dest_qpn |= (attrs->rnr_retry <<
+ OCRDMA_QP_PARAMS_RNR_RETRY_CNT_SHIFT)
+ & OCRDMA_QP_PARAMS_RNR_RETRY_CNT_MASK;
+ cmd->flags |= OCRDMA_QP_PARA_RRC_VALID;
+ }
+ if (attr_mask & IB_QP_SQ_PSN) {
+ cmd->params.tclass_sq_psn |= (attrs->sq_psn & 0x00ffffff);
+ cmd->flags |= OCRDMA_QP_PARA_SQPSN_VALID;
+ }
+ if (attr_mask & IB_QP_RQ_PSN) {
+ cmd->params.hop_lmt_rq_psn |= (attrs->rq_psn & 0x00ffffff);
+ cmd->flags |= OCRDMA_QP_PARA_RQPSN_VALID;
+ }
+ if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
+ if (attrs->max_rd_atomic > dev->attr.max_ord_per_qp) {
+ status = -EINVAL;
+ goto pmtu_err;
+ }
+ qp->max_ord = attrs->max_rd_atomic;
+ cmd->flags |= OCRDMA_QP_PARA_MAX_ORD_VALID;
+ }
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
+ if (attrs->max_dest_rd_atomic > dev->attr.max_ird_per_qp) {
+ status = -EINVAL;
+ goto pmtu_err;
+ }
+ qp->max_ird = attrs->max_dest_rd_atomic;
+ cmd->flags |= OCRDMA_QP_PARA_MAX_IRD_VALID;
+ }
+ cmd->params.max_ord_ird = (qp->max_ord <<
+ OCRDMA_QP_PARAMS_MAX_ORD_SHIFT) |
+ (qp->max_ird & OCRDMA_QP_PARAMS_MAX_IRD_MASK);
+pmtu_err:
+ return status;
+}
+
+int ocrdma_mbx_modify_qp(struct ocrdma_dev *dev, struct ocrdma_qp *qp,
+ struct ib_qp_attr *attrs, int attr_mask)
+{
+ int status = -ENOMEM;
+ struct ocrdma_modify_qp *cmd;
+
+ cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_MODIFY_QP, sizeof(*cmd));
+ if (!cmd)
+ return status;
+
+ cmd->params.id = qp->id;
+ cmd->flags = 0;
+ if (attr_mask & IB_QP_STATE) {
+ cmd->params.max_sge_recv_flags |=
+ (get_ocrdma_qp_state(attrs->qp_state) <<
+ OCRDMA_QP_PARAMS_STATE_SHIFT) &
+ OCRDMA_QP_PARAMS_STATE_MASK;
+ cmd->flags |= OCRDMA_QP_PARA_QPS_VALID;
+ } else {
+ cmd->params.max_sge_recv_flags |=
+ (qp->state << OCRDMA_QP_PARAMS_STATE_SHIFT) &
+ OCRDMA_QP_PARAMS_STATE_MASK;
+ }
+
+ status = ocrdma_set_qp_params(qp, cmd, attrs, attr_mask);
+ if (status)
+ goto mbx_err;
+ status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+ if (status)
+ goto mbx_err;
+
+mbx_err:
+ kfree(cmd);
+ return status;
+}
+
+int ocrdma_mbx_destroy_qp(struct ocrdma_dev *dev, struct ocrdma_qp *qp)
+{
+ int status = -ENOMEM;
+ struct ocrdma_destroy_qp *cmd;
+ struct pci_dev *pdev = dev->nic_info.pdev;
+
+ cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DELETE_QP, sizeof(*cmd));
+ if (!cmd)
+ return status;
+ cmd->qp_id = qp->id;
+ status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+ if (status)
+ goto mbx_err;
+
+mbx_err:
+ kfree(cmd);
+ if (qp->sq.va)
+ dma_free_coherent(&pdev->dev, qp->sq.len, qp->sq.va, qp->sq.pa);
+ if (!qp->srq && qp->rq.va)
+ dma_free_coherent(&pdev->dev, qp->rq.len, qp->rq.va, qp->rq.pa);
+ if (qp->dpp_enabled)
+ qp->pd->num_dpp_qp++;
+ return status;
+}
+
+int ocrdma_mbx_create_srq(struct ocrdma_dev *dev, struct ocrdma_srq *srq,
+ struct ib_srq_init_attr *srq_attr,
+ struct ocrdma_pd *pd)
+{
+ int status = -ENOMEM;
+ int hw_pages, hw_page_size;
+ int len;
+ struct ocrdma_create_srq_rsp *rsp;
+ struct ocrdma_create_srq *cmd;
+ dma_addr_t pa;
+ struct pci_dev *pdev = dev->nic_info.pdev;
+ u32 max_rqe_allocated;
+
+ cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_CREATE_SRQ, sizeof(*cmd));
+ if (!cmd)
+ return status;
+
+ cmd->pgsz_pdid = pd->id & OCRDMA_CREATE_SRQ_PD_ID_MASK;
+ max_rqe_allocated = srq_attr->attr.max_wr + 1;
+ status = ocrdma_build_q_conf(&max_rqe_allocated,
+ dev->attr.rqe_size,
+ &hw_pages, &hw_page_size);
+ if (status) {
+ pr_err("%s() req. max_wr=0x%x\n", __func__,
+ srq_attr->attr.max_wr);
+ status = -EINVAL;
+ goto ret;
+ }
+ len = hw_pages * hw_page_size;
+ srq->rq.va = dma_alloc_coherent(&pdev->dev, len, &pa, GFP_KERNEL);
+ if (!srq->rq.va) {
+ status = -ENOMEM;
+ goto ret;
+ }
+ ocrdma_build_q_pages(&cmd->rq_addr[0], hw_pages, pa, hw_page_size);
+
+ srq->rq.entry_size = dev->attr.rqe_size;
+ srq->rq.pa = pa;
+ srq->rq.len = len;
+ srq->rq.max_cnt = max_rqe_allocated;
+
+ cmd->max_sge_rqe = ilog2(max_rqe_allocated);
+ cmd->max_sge_rqe |= srq_attr->attr.max_sge <<
+ OCRDMA_CREATE_SRQ_MAX_SGE_RECV_SHIFT;
+
+ cmd->pgsz_pdid |= (ilog2(hw_page_size / OCRDMA_MIN_Q_PAGE_SIZE)
+ << OCRDMA_CREATE_SRQ_PG_SZ_SHIFT);
+ cmd->pages_rqe_sz |= (dev->attr.rqe_size
+ << OCRDMA_CREATE_SRQ_RQE_SIZE_SHIFT)
+ & OCRDMA_CREATE_SRQ_RQE_SIZE_MASK;
+ cmd->pages_rqe_sz |= hw_pages << OCRDMA_CREATE_SRQ_NUM_RQ_PAGES_SHIFT;
+
+ status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+ if (status)
+ goto mbx_err;
+ rsp = (struct ocrdma_create_srq_rsp *)cmd;
+ srq->id = rsp->id;
+ srq->rq.dbid = rsp->id;
+ max_rqe_allocated = ((rsp->max_sge_rqe_allocated &
+ OCRDMA_CREATE_SRQ_RSP_MAX_RQE_ALLOCATED_MASK) >>
+ OCRDMA_CREATE_SRQ_RSP_MAX_RQE_ALLOCATED_SHIFT);
+ max_rqe_allocated = (1 << max_rqe_allocated);
+ srq->rq.max_cnt = max_rqe_allocated;
+ srq->rq.max_wqe_idx = max_rqe_allocated - 1;
+ srq->rq.max_sges = (rsp->max_sge_rqe_allocated &
+ OCRDMA_CREATE_SRQ_RSP_MAX_SGE_RECV_ALLOCATED_MASK) >>
+ OCRDMA_CREATE_SRQ_RSP_MAX_SGE_RECV_ALLOCATED_SHIFT;
+ goto ret;
+mbx_err:
+ dma_free_coherent(&pdev->dev, srq->rq.len, srq->rq.va, pa);
+ret:
+ kfree(cmd);
+ return status;
+}
+
+int ocrdma_mbx_modify_srq(struct ocrdma_srq *srq, struct ib_srq_attr *srq_attr)
+{
+ int status = -ENOMEM;
+ struct ocrdma_modify_srq *cmd;
+ struct ocrdma_pd *pd = srq->pd;
+ struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device);
+
+ cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_MODIFY_SRQ, sizeof(*cmd));
+ if (!cmd)
+ return status;
+ cmd->id = srq->id;
+ cmd->limit_max_rqe |= srq_attr->srq_limit <<
+ OCRDMA_MODIFY_SRQ_LIMIT_SHIFT;
+ status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+ kfree(cmd);
+ return status;
+}
+
+int ocrdma_mbx_query_srq(struct ocrdma_srq *srq, struct ib_srq_attr *srq_attr)
+{
+ int status = -ENOMEM;
+ struct ocrdma_query_srq *cmd;
+ struct ocrdma_dev *dev = get_ocrdma_dev(srq->ibsrq.device);
+
+ cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_QUERY_SRQ, sizeof(*cmd));
+ if (!cmd)
+ return status;
+ cmd->id = srq->rq.dbid;
+ status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+ if (status == 0) {
+ struct ocrdma_query_srq_rsp *rsp =
+ (struct ocrdma_query_srq_rsp *)cmd;
+ srq_attr->max_sge =
+ rsp->srq_lmt_max_sge &
+ OCRDMA_QUERY_SRQ_RSP_MAX_SGE_RECV_MASK;
+ srq_attr->max_wr =
+ rsp->max_rqe_pdid >> OCRDMA_QUERY_SRQ_RSP_MAX_RQE_SHIFT;
+ srq_attr->srq_limit = rsp->srq_lmt_max_sge >>
+ OCRDMA_QUERY_SRQ_RSP_SRQ_LIMIT_SHIFT;
+ }
+ kfree(cmd);
+ return status;
+}
+
+int ocrdma_mbx_destroy_srq(struct ocrdma_dev *dev, struct ocrdma_srq *srq)
+{
+ int status = -ENOMEM;
+ struct ocrdma_destroy_srq *cmd;
+ struct pci_dev *pdev = dev->nic_info.pdev;
+ cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_DELETE_SRQ, sizeof(*cmd));
+ if (!cmd)
+ return status;
+ cmd->id = srq->id;
+ status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+ if (srq->rq.va)
+ dma_free_coherent(&pdev->dev, srq->rq.len,
+ srq->rq.va, srq->rq.pa);
+ kfree(cmd);
+ return status;
+}
+
+static int ocrdma_mbx_get_dcbx_config(struct ocrdma_dev *dev, u32 ptype,
+ struct ocrdma_dcbx_cfg *dcbxcfg)
+{
+ int status = 0;
+ dma_addr_t pa;
+ struct ocrdma_mqe cmd;
+
+ struct ocrdma_get_dcbx_cfg_req *req = NULL;
+ struct ocrdma_get_dcbx_cfg_rsp *rsp = NULL;
+ struct pci_dev *pdev = dev->nic_info.pdev;
+ struct ocrdma_mqe_sge *mqe_sge = cmd.u.nonemb_req.sge;
+
+ memset(&cmd, 0, sizeof(struct ocrdma_mqe));
+ cmd.hdr.pyld_len = max_t (u32, sizeof(struct ocrdma_get_dcbx_cfg_rsp),
+ sizeof(struct ocrdma_get_dcbx_cfg_req));
+ req = dma_alloc_coherent(&pdev->dev, cmd.hdr.pyld_len, &pa, GFP_KERNEL);
+ if (!req) {
+ status = -ENOMEM;
+ goto mem_err;
+ }
+
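+ /* Non-embedded command: the request/response shares one DMA buffer
+ * that is described to the firmware through a single SGE.
+ */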
+ cmd.hdr.spcl_sge_cnt_emb |= (1 << OCRDMA_MQE_HDR_SGE_CNT_SHIFT) &
+ OCRDMA_MQE_HDR_SGE_CNT_MASK;
+ mqe_sge->pa_lo = (u32) (pa & 0xFFFFFFFFUL);
+ mqe_sge->pa_hi = (u32) upper_32_bits(pa);
+ mqe_sge->len = cmd.hdr.pyld_len;
+
+ memset(req, 0, sizeof(struct ocrdma_get_dcbx_cfg_req));
+ ocrdma_init_mch(&req->hdr, OCRDMA_CMD_GET_DCBX_CONFIG,
+ OCRDMA_SUBSYS_DCBX, cmd.hdr.pyld_len);
+ req->param_type = ptype;
+
+ status = ocrdma_mbx_cmd(dev, &cmd);
+ if (status)
+ goto mbx_err;
+
+ rsp = (struct ocrdma_get_dcbx_cfg_rsp *)req;
+ ocrdma_le32_to_cpu(rsp, sizeof(struct ocrdma_get_dcbx_cfg_rsp));
+ memcpy(dcbxcfg, &rsp->cfg, sizeof(struct ocrdma_dcbx_cfg));
+
+mbx_err:
+ dma_free_coherent(&pdev->dev, cmd.hdr.pyld_len, req, pa);
+mem_err:
+ return status;
+}
+
+#define OCRDMA_MAX_SERVICE_LEVEL_INDEX 0x08
+#define OCRDMA_DEFAULT_SERVICE_LEVEL 0x05
+
+static int ocrdma_parse_dcbxcfg_rsp(struct ocrdma_dev *dev, int ptype,
+ struct ocrdma_dcbx_cfg *dcbxcfg,
+ u8 *srvc_lvl)
+{
+ int status = -EINVAL, indx, slindx;
+ int ventry_cnt;
+ struct ocrdma_app_parameter *app_param;
+ u8 valid, proto_sel;
+ u8 app_prio, pfc_prio;
+ u16 proto;
+
+ if (!(dcbxcfg->tcv_aev_opv_st & OCRDMA_DCBX_STATE_MASK)) {
+ pr_info("%s ocrdma%d DCBX is disabled\n",
+ dev_name(&dev->nic_info.pdev->dev), dev->id);
+ goto out;
+ }
+
+ if (!ocrdma_is_enabled_and_synced(dcbxcfg->pfc_state)) {
+ pr_info("%s ocrdma%d priority flow control (%s) is %s%s\n",
+ dev_name(&dev->nic_info.pdev->dev), dev->id,
+ (ptype > 0 ? "operational" : "admin"),
+ (dcbxcfg->pfc_state & OCRDMA_STATE_FLAG_ENABLED) ?
+ "enabled" : "disabled",
+ (dcbxcfg->pfc_state & OCRDMA_STATE_FLAG_SYNC) ?
+ "" : ", not sync'ed");
+ goto out;
+ } else {
+ pr_info("%s ocrdma%d priority flow control is enabled and sync'ed\n",
+ dev_name(&dev->nic_info.pdev->dev), dev->id);
+ }
+
+ ventry_cnt = (dcbxcfg->tcv_aev_opv_st >>
+ OCRDMA_DCBX_APP_ENTRY_SHIFT)
+ & OCRDMA_DCBX_STATE_MASK;
+
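+ /* scan the advertised application parameter entries for the RoCE
+  * protocol; the first priority that is both application-mapped and
+  * PFC-enabled becomes the service level.
+  */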
+ for (indx = 0; indx < ventry_cnt; indx++) {
+ app_param = &dcbxcfg->app_param[indx];
+ valid = (app_param->valid_proto_app >>
+ OCRDMA_APP_PARAM_VALID_SHIFT)
+ & OCRDMA_APP_PARAM_VALID_MASK;
+ proto_sel = (app_param->valid_proto_app
+ >> OCRDMA_APP_PARAM_PROTO_SEL_SHIFT)
+ & OCRDMA_APP_PARAM_PROTO_SEL_MASK;
+ proto = app_param->valid_proto_app &
+ OCRDMA_APP_PARAM_APP_PROTO_MASK;
+
+ if (valid && proto == OCRDMA_APP_PROTO_ROCE &&
+     proto_sel == OCRDMA_PROTO_SELECT_L2) {
+ for (slindx = 0; slindx <
+ OCRDMA_MAX_SERVICE_LEVEL_INDEX; slindx++) {
+ app_prio = ocrdma_get_app_prio(
+ (u8 *)app_param->app_prio,
+ slindx);
+ pfc_prio = ocrdma_get_pfc_prio(
+ (u8 *)dcbxcfg->pfc_prio,
+ slindx);
+
+ if (app_prio && pfc_prio) {
+ *srvc_lvl = slindx;
+ status = 0;
+ goto out;
+ }
+ }
+ if (slindx == OCRDMA_MAX_SERVICE_LEVEL_INDEX) {
+ pr_info("%s ocrdma%d application priority not set for 0x%x protocol\n",
+ dev_name(&dev->nic_info.pdev->dev),
+ dev->id, proto);
+ }
+ }
+ }
+
+out:
+ return status;
+}
+
+void ocrdma_init_service_level(struct ocrdma_dev *dev)
+{
+ int status = 0, indx;
+ struct ocrdma_dcbx_cfg dcbxcfg;
+ u8 srvc_lvl = OCRDMA_DEFAULT_SERVICE_LEVEL;
+ int ptype = OCRDMA_PARAMETER_TYPE_OPER;
+
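+ /* try the operational DCBX parameters first and fall back to the admin
+  * parameters; if neither yields a usable service level, the default is
+  * kept.
+  */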
+ for (indx = 0; indx < 2; indx++) {
+ status = ocrdma_mbx_get_dcbx_config(dev, ptype, &dcbxcfg);
+ if (status) {
+ pr_err("%s(): status=%d\n", __func__, status);
+ ptype = OCRDMA_PARAMETER_TYPE_ADMIN;
+ continue;
+ }
+
+ status = ocrdma_parse_dcbxcfg_rsp(dev, ptype,
+ &dcbxcfg, &srvc_lvl);
+ if (status) {
+ ptype = OCRDMA_PARAMETER_TYPE_ADMIN;
+ continue;
+ }
+
+ break;
+ }
+
+ if (status)
+ pr_info("%s ocrdma%d service level default\n",
+ dev_name(&dev->nic_info.pdev->dev), dev->id);
+ else
+ pr_info("%s ocrdma%d service level %d\n",
+ dev_name(&dev->nic_info.pdev->dev), dev->id,
+ srvc_lvl);
+
+ dev->pfc_state = ocrdma_is_enabled_and_synced(dcbxcfg.pfc_state);
+ dev->sl = srvc_lvl;
+}
+
+int ocrdma_alloc_av(struct ocrdma_dev *dev, struct ocrdma_ah *ah)
+{
+ int i;
+ int status = -EINVAL;
+ struct ocrdma_av *av;
+ unsigned long flags;
+
+ av = dev->av_tbl.va;
+ spin_lock_irqsave(&dev->av_tbl.lock, flags);
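+ /* linear scan of the preallocated AV table for a free entry */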
+ for (i = 0; i < dev->av_tbl.num_ah; i++) {
+ if (av->valid == 0) {
+ av->valid = OCRDMA_AV_VALID;
+ ah->av = av;
+ ah->id = i;
+ status = 0;
+ break;
+ }
+ av++;
+ }
+ if (i == dev->av_tbl.num_ah)
+ status = -EAGAIN;
+ spin_unlock_irqrestore(&dev->av_tbl.lock, flags);
+ return status;
+}
+
+int ocrdma_free_av(struct ocrdma_dev *dev, struct ocrdma_ah *ah)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&dev->av_tbl.lock, flags);
+ ah->av->valid = 0;
+ spin_unlock_irqrestore(&dev->av_tbl.lock, flags);
+ return 0;
+}
+
+static int ocrdma_create_eqs(struct ocrdma_dev *dev)
+{
+ int num_eq, i, status = 0;
+ int irq;
+ unsigned long flags = 0;
+
+ num_eq = dev->nic_info.msix.num_vectors -
+ dev->nic_info.msix.start_vector;
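+ /* one EQ per available MSI-X vector, capped at the number of online
+  * CPUs; legacy INTx mode uses a single EQ on a shared interrupt line.
+  */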
+ if (dev->nic_info.intr_mode == BE_INTERRUPT_MODE_INTX) {
+ num_eq = 1;
+ flags = IRQF_SHARED;
+ } else {
+ num_eq = min_t(u32, num_eq, num_online_cpus());
+ }
+
+ if (!num_eq)
+ return -EINVAL;
+
+ dev->eq_tbl = kzalloc(sizeof(struct ocrdma_eq) * num_eq, GFP_KERNEL);
+ if (!dev->eq_tbl)
+ return -ENOMEM;
+
+ for (i = 0; i < num_eq; i++) {
+ status = ocrdma_create_eq(dev, &dev->eq_tbl[i],
+ OCRDMA_EQ_LEN);
+ if (status) {
+ status = -EINVAL;
+ break;
+ }
+ sprintf(dev->eq_tbl[i].irq_name, "ocrdma%d-%d",
+ dev->id, i);
+ irq = ocrdma_get_irq(dev, &dev->eq_tbl[i]);
+ status = request_irq(irq, ocrdma_irq_handler, flags,
+ dev->eq_tbl[i].irq_name,
+ &dev->eq_tbl[i]);
+ if (status)
+ goto done;
+ dev->eq_cnt += 1;
+ }
+ /* one eq is sufficient for data path to work */
+ return 0;
+done:
+ ocrdma_destroy_eqs(dev);
+ return status;
+}
+
+static int ocrdma_mbx_modify_eqd(struct ocrdma_dev *dev, struct ocrdma_eq *eq,
+ int num)
+{
+ int i, status = -ENOMEM;
+ struct ocrdma_modify_eqd_req *cmd;
+
+ cmd = ocrdma_init_emb_mqe(OCRDMA_CMD_MODIFY_EQ_DELAY, sizeof(*cmd));
+ if (!cmd)
+ return status;
+
+ ocrdma_init_mch(&cmd->cmd.req, OCRDMA_CMD_MODIFY_EQ_DELAY,
+ OCRDMA_SUBSYS_COMMON, sizeof(*cmd));
+
+ cmd->cmd.num_eq = num;
+ for (i = 0; i < num; i++) {
+ cmd->cmd.set_eqd[i].eq_id = eq[i].q.id;
+ cmd->cmd.set_eqd[i].phase = 0;
+ cmd->cmd.set_eqd[i].delay_multiplier =
+ (eq[i].aic_obj.prev_eqd * 65)/100;
+ }
+ status = ocrdma_mbx_cmd(dev, (struct ocrdma_mqe *)cmd);
+ kfree(cmd);
+ return status;
+}
+
+static int ocrdma_modify_eqd(struct ocrdma_dev *dev, struct ocrdma_eq *eq,
+ int num)
+{
+ int num_eqs, i = 0;
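+ /* the mailbox command carries at most 8 EQ delay entries, so issue
+  * the update in chunks of 8.
+  */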
+ if (num > 8) {
+ while (num) {
+ num_eqs = min(num, 8);
+ ocrdma_mbx_modify_eqd(dev, &eq[i], num_eqs);
+ i += num_eqs;
+ num -= num_eqs;
+ }
+ } else {
+ ocrdma_mbx_modify_eqd(dev, eq, num);
+ }
+ return 0;
+}
+
+void ocrdma_eqd_set_task(struct work_struct *work)
+{
+ struct ocrdma_dev *dev =
+ container_of(work, struct ocrdma_dev, eqd_work.work);
+ struct ocrdma_eq *eq = NULL;
+ int i, num = 0, status = -EINVAL;
+ u64 eq_intr;
+
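+ /* simple adaptive interrupt coalescing: an EQ whose interrupt rate
+  * crosses the high threshold jumps to the maximum delay, and one that
+  * falls below the low threshold drops back to the minimum delay.
+  */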
+ for (i = 0; i < dev->eq_cnt; i++) {
+ eq = &dev->eq_tbl[i];
+ if (eq->aic_obj.eq_intr_cnt > eq->aic_obj.prev_eq_intr_cnt) {
+ eq_intr = eq->aic_obj.eq_intr_cnt -
+ eq->aic_obj.prev_eq_intr_cnt;
+ if ((eq_intr > EQ_INTR_PER_SEC_THRSH_HI) &&
+ (eq->aic_obj.prev_eqd == EQ_AIC_MIN_EQD)) {
+ eq->aic_obj.prev_eqd = EQ_AIC_MAX_EQD;
+ num++;
+ } else if ((eq_intr < EQ_INTR_PER_SEC_THRSH_LOW) &&
+ (eq->aic_obj.prev_eqd == EQ_AIC_MAX_EQD)) {
+ eq->aic_obj.prev_eqd = EQ_AIC_MIN_EQD;
+ num++;
+ }
+ }
+ eq->aic_obj.prev_eq_intr_cnt = eq->aic_obj.eq_intr_cnt;
+ }
+
+ if (num)
+ status = ocrdma_modify_eqd(dev, &dev->eq_tbl[0], num);
+ schedule_delayed_work(&dev->eqd_work, msecs_to_jiffies(1000));
+}
+
+int ocrdma_init_hw(struct ocrdma_dev *dev)
+{
+ int status;
+
+ /* create the eqs */
+ status = ocrdma_create_eqs(dev);
+ if (status)
+ goto qpeq_err;
+ status = ocrdma_create_mq(dev);
+ if (status)
+ goto mq_err;
+ status = ocrdma_mbx_query_fw_config(dev);
+ if (status)
+ goto conf_err;
+ status = ocrdma_mbx_query_dev(dev);
+ if (status)
+ goto conf_err;
+ status = ocrdma_mbx_query_fw_ver(dev);
+ if (status)
+ goto conf_err;
+ status = ocrdma_mbx_create_ah_tbl(dev);
+ if (status)
+ goto conf_err;
+ status = ocrdma_mbx_get_phy_info(dev);
+ if (status)
+ goto info_attrb_err;
+ status = ocrdma_mbx_get_ctrl_attribs(dev);
+ if (status)
+ goto info_attrb_err;
+
+ return 0;
+
+info_attrb_err:
+ ocrdma_mbx_delete_ah_tbl(dev);
+conf_err:
+ ocrdma_destroy_mq(dev);
+mq_err:
+ ocrdma_destroy_eqs(dev);
+qpeq_err:
+ pr_err("%s() status=%d\n", __func__, status);
+ return status;
+}
+
+void ocrdma_cleanup_hw(struct ocrdma_dev *dev)
+{
+ ocrdma_free_pd_pool(dev);
+ ocrdma_mbx_delete_ah_tbl(dev);
+
+ /* cleanup the control path */
+ ocrdma_destroy_mq(dev);
+
+ /* cleanup the eqs */
+ ocrdma_destroy_eqs(dev);
+}
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_hw.h b/drivers/infiniband/hw/ocrdma/ocrdma_hw.h
new file mode 100644
index 000000000..e905972fc
--- /dev/null
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_hw.h
@@ -0,0 +1,142 @@
+/*******************************************************************
+ * This file is part of the Emulex RoCE Device Driver for *
+ * RoCE (RDMA over Converged Ethernet) CNA Adapters. *
+ * Copyright (C) 2008-2012 Emulex. All rights reserved. *
+ * EMULEX and SLI are trademarks of Emulex. *
+ * www.emulex.com *
+ * *
+ * This program is free software; you can redistribute it and/or *
+ * modify it under the terms of version 2 of the GNU General *
+ * Public License as published by the Free Software Foundation. *
+ * This program is distributed in the hope that it will be useful. *
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND *
+ * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, *
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE *
+ * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD *
+ * TO BE LEGALLY INVALID. See the GNU General Public License for *
+ * more details, a copy of which can be found in the file COPYING *
+ * included with this package. *
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ *******************************************************************/
+
+#ifndef __OCRDMA_HW_H__
+#define __OCRDMA_HW_H__
+
+#include "ocrdma_sli.h"
+
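+/* In-place endianness helpers for mailbox payloads: the hardware expects
+ * little-endian words, so the in-place variants are no-ops on little-endian
+ * hosts and the copy variants reduce to memcpy().
+ */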
+static inline void ocrdma_cpu_to_le32(void *dst, u32 len)
+{
+#ifdef __BIG_ENDIAN
+ int i = 0;
+ u32 *src_ptr = dst;
+ u32 *dst_ptr = dst;
+ for (; i < (len / sizeof(u32)); i++)
+ *(dst_ptr + i) = cpu_to_le32p(src_ptr + i);
+#endif
+}
+
+static inline void ocrdma_le32_to_cpu(void *dst, u32 len)
+{
+#ifdef __BIG_ENDIAN
+ int i = 0;
+ u32 *src_ptr = dst;
+ u32 *dst_ptr = dst;
+ for (; i < (len / sizeof(u32)); i++)
+ *(dst_ptr + i) = le32_to_cpu(*(src_ptr + i));
+#endif
+}
+
+static inline void ocrdma_copy_cpu_to_le32(void *dst, void *src, u32 len)
+{
+#ifdef __BIG_ENDIAN
+ int i = 0;
+ u32 *src_ptr = src;
+ u32 *dst_ptr = dst;
+ for (; i < (len / sizeof(u32)); i++)
+ *(dst_ptr + i) = cpu_to_le32p(src_ptr + i);
+#else
+ memcpy(dst, src, len);
+#endif
+}
+
+static inline void ocrdma_copy_le32_to_cpu(void *dst, void *src, u32 len)
+{
+#ifdef __BIG_ENDIAN
+ int i = 0;
+ u32 *src_ptr = src;
+ u32 *dst_ptr = dst;
+ for (; i < len / sizeof(u32); i++)
+ *(dst_ptr + i) = le32_to_cpu(*(src_ptr + i));
+#else
+ memcpy(dst, src, len);
+#endif
+}
+
+static inline u64 ocrdma_get_db_addr(struct ocrdma_dev *dev, u32 pdid)
+{
+ return dev->nic_info.unmapped_db + (pdid * dev->nic_info.db_page_size);
+}
+
+int ocrdma_init_hw(struct ocrdma_dev *);
+void ocrdma_cleanup_hw(struct ocrdma_dev *);
+
+enum ib_qp_state get_ibqp_state(enum ocrdma_qp_state qps);
+void ocrdma_ring_cq_db(struct ocrdma_dev *, u16 cq_id, bool armed,
+ bool solicited, u16 cqe_popped);
+
+/* verbs specific mailbox commands */
+int ocrdma_mbx_get_link_speed(struct ocrdma_dev *dev, u8 *lnk_speed);
+int ocrdma_query_config(struct ocrdma_dev *,
+ struct ocrdma_mbx_query_config *config);
+
+int ocrdma_mbx_alloc_pd(struct ocrdma_dev *, struct ocrdma_pd *);
+int ocrdma_mbx_dealloc_pd(struct ocrdma_dev *, struct ocrdma_pd *);
+
+int ocrdma_mbx_alloc_lkey(struct ocrdma_dev *, struct ocrdma_hw_mr *hwmr,
+ u32 pd_id, int addr_check);
+int ocrdma_mbx_dealloc_lkey(struct ocrdma_dev *, int fmr, u32 lkey);
+
+int ocrdma_reg_mr(struct ocrdma_dev *, struct ocrdma_hw_mr *hwmr,
+ u32 pd_id, int acc);
+int ocrdma_mbx_create_cq(struct ocrdma_dev *, struct ocrdma_cq *,
+ int entries, int dpp_cq, u16 pd_id);
+int ocrdma_mbx_destroy_cq(struct ocrdma_dev *, struct ocrdma_cq *);
+
+int ocrdma_mbx_create_qp(struct ocrdma_qp *, struct ib_qp_init_attr *attrs,
+ u8 enable_dpp_cq, u16 dpp_cq_id, u16 *dpp_offset,
+ u16 *dpp_credit_lmt);
+int ocrdma_mbx_modify_qp(struct ocrdma_dev *, struct ocrdma_qp *,
+ struct ib_qp_attr *attrs, int attr_mask);
+int ocrdma_mbx_query_qp(struct ocrdma_dev *, struct ocrdma_qp *,
+ struct ocrdma_qp_params *param);
+int ocrdma_mbx_destroy_qp(struct ocrdma_dev *, struct ocrdma_qp *);
+int ocrdma_mbx_create_srq(struct ocrdma_dev *, struct ocrdma_srq *,
+ struct ib_srq_init_attr *,
+ struct ocrdma_pd *);
+int ocrdma_mbx_modify_srq(struct ocrdma_srq *, struct ib_srq_attr *);
+int ocrdma_mbx_query_srq(struct ocrdma_srq *, struct ib_srq_attr *);
+int ocrdma_mbx_destroy_srq(struct ocrdma_dev *, struct ocrdma_srq *);
+
+int ocrdma_alloc_av(struct ocrdma_dev *, struct ocrdma_ah *);
+int ocrdma_free_av(struct ocrdma_dev *, struct ocrdma_ah *);
+
+int ocrdma_qp_state_change(struct ocrdma_qp *, enum ib_qp_state new_state,
+ enum ib_qp_state *old_ib_state);
+bool ocrdma_is_qp_in_sq_flushlist(struct ocrdma_cq *, struct ocrdma_qp *);
+bool ocrdma_is_qp_in_rq_flushlist(struct ocrdma_cq *, struct ocrdma_qp *);
+void ocrdma_flush_qp(struct ocrdma_qp *);
+int ocrdma_get_irq(struct ocrdma_dev *dev, struct ocrdma_eq *eq);
+
+int ocrdma_mbx_rdma_stats(struct ocrdma_dev *, bool reset);
+char *port_speed_string(struct ocrdma_dev *dev);
+void ocrdma_init_service_level(struct ocrdma_dev *);
+void ocrdma_alloc_pd_pool(struct ocrdma_dev *dev);
+void ocrdma_free_pd_range(struct ocrdma_dev *dev);
+
+#endif /* __OCRDMA_HW_H__ */
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
new file mode 100644
index 000000000..7a2b59aca
--- /dev/null
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c
@@ -0,0 +1,682 @@
+/*******************************************************************
+ * This file is part of the Emulex RoCE Device Driver for *
+ * RoCE (RDMA over Converged Ethernet) adapters. *
+ * Copyright (C) 2008-2012 Emulex. All rights reserved. *
+ * EMULEX and SLI are trademarks of Emulex. *
+ * www.emulex.com *
+ * *
+ * This program is free software; you can redistribute it and/or *
+ * modify it under the terms of version 2 of the GNU General *
+ * Public License as published by the Free Software Foundation. *
+ * This program is distributed in the hope that it will be useful. *
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND *
+ * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, *
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE *
+ * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD *
+ * TO BE LEGALLY INVALID. See the GNU General Public License for *
+ * more details, a copy of which can be found in the file COPYING *
+ * included with this package. *
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ *******************************************************************/
+
+#include <linux/module.h>
+#include <linux/idr.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_addr.h>
+
+#include <linux/netdevice.h>
+#include <net/addrconf.h>
+
+#include "ocrdma.h"
+#include "ocrdma_verbs.h"
+#include "ocrdma_ah.h"
+#include "be_roce.h"
+#include "ocrdma_hw.h"
+#include "ocrdma_stats.h"
+#include "ocrdma_abi.h"
+
+MODULE_VERSION(OCRDMA_ROCE_DRV_VERSION);
+MODULE_DESCRIPTION(OCRDMA_ROCE_DRV_DESC " " OCRDMA_ROCE_DRV_VERSION);
+MODULE_AUTHOR("Emulex Corporation");
+MODULE_LICENSE("GPL");
+
+static LIST_HEAD(ocrdma_dev_list);
+static DEFINE_SPINLOCK(ocrdma_devlist_lock);
+static DEFINE_IDR(ocrdma_dev_id);
+
+static union ib_gid ocrdma_zero_sgid;
+
+void ocrdma_get_guid(struct ocrdma_dev *dev, u8 *guid)
+{
+ u8 mac_addr[6];
+
+ memcpy(&mac_addr[0], &dev->nic_info.mac_addr[0], ETH_ALEN);
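+ /* derive a modified EUI-64 GUID from the MAC address: flip the
+  * universal/local bit and insert 0xFF 0xFE in the middle.
+  */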
+ guid[0] = mac_addr[0] ^ 2;
+ guid[1] = mac_addr[1];
+ guid[2] = mac_addr[2];
+ guid[3] = 0xff;
+ guid[4] = 0xfe;
+ guid[5] = mac_addr[3];
+ guid[6] = mac_addr[4];
+ guid[7] = mac_addr[5];
+}
+
+static bool ocrdma_add_sgid(struct ocrdma_dev *dev, union ib_gid *new_sgid)
+{
+ int i;
+ unsigned long flags;
+
+ memset(&ocrdma_zero_sgid, 0, sizeof(union ib_gid));
+
+ spin_lock_irqsave(&dev->sgid_lock, flags);
+ for (i = 0; i < OCRDMA_MAX_SGID; i++) {
+ if (!memcmp(&dev->sgid_tbl[i], &ocrdma_zero_sgid,
+ sizeof(union ib_gid))) {
+ /* found free entry */
+ memcpy(&dev->sgid_tbl[i], new_sgid,
+ sizeof(union ib_gid));
+ spin_unlock_irqrestore(&dev->sgid_lock, flags);
+ return true;
+ } else if (!memcmp(&dev->sgid_tbl[i], new_sgid,
+ sizeof(union ib_gid))) {
+ /* entry already present, no addition is required. */
+ spin_unlock_irqrestore(&dev->sgid_lock, flags);
+ return false;
+ }
+ }
+ spin_unlock_irqrestore(&dev->sgid_lock, flags);
+ return false;
+}
+
+static bool ocrdma_del_sgid(struct ocrdma_dev *dev, union ib_gid *sgid)
+{
+ bool found = false;
+ int i;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->sgid_lock, flags);
+ /* first is default sgid, which cannot be deleted. */
+ for (i = 1; i < OCRDMA_MAX_SGID; i++) {
+ if (!memcmp(&dev->sgid_tbl[i], sgid, sizeof(union ib_gid))) {
+ /* found matching entry */
+ memset(&dev->sgid_tbl[i], 0, sizeof(union ib_gid));
+ found = true;
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&dev->sgid_lock, flags);
+ return found;
+}
+
+static int ocrdma_addr_event(unsigned long event, struct net_device *netdev,
+ union ib_gid *gid)
+{
+ struct ib_event gid_event;
+ struct ocrdma_dev *dev;
+ bool found = false;
+ bool updated = false;
+ bool is_vlan = false;
+
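+ /* address events on a VLAN interface are attributed to the underlying
+  * real device.
+  */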
+ is_vlan = netdev->priv_flags & IFF_802_1Q_VLAN;
+ if (is_vlan)
+ netdev = rdma_vlan_dev_real_dev(netdev);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(dev, &ocrdma_dev_list, entry) {
+ if (dev->nic_info.netdev == netdev) {
+ found = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ if (!found)
+ return NOTIFY_DONE;
+
+ mutex_lock(&dev->dev_lock);
+ switch (event) {
+ case NETDEV_UP:
+ updated = ocrdma_add_sgid(dev, gid);
+ break;
+ case NETDEV_DOWN:
+ updated = ocrdma_del_sgid(dev, gid);
+ break;
+ default:
+ break;
+ }
+ if (updated) {
+ /* GID table updated, notify the consumers about it */
+ gid_event.device = &dev->ibdev;
+ gid_event.element.port_num = 1;
+ gid_event.event = IB_EVENT_GID_CHANGE;
+ ib_dispatch_event(&gid_event);
+ }
+ mutex_unlock(&dev->dev_lock);
+ return NOTIFY_OK;
+}
+
+static int ocrdma_inetaddr_event(struct notifier_block *notifier,
+ unsigned long event, void *ptr)
+{
+ struct in_ifaddr *ifa = ptr;
+ union ib_gid gid;
+ struct net_device *netdev = ifa->ifa_dev->dev;
+
+ ipv6_addr_set_v4mapped(ifa->ifa_address, (struct in6_addr *)&gid);
+ return ocrdma_addr_event(event, netdev, &gid);
+}
+
+static struct notifier_block ocrdma_inetaddr_notifier = {
+ .notifier_call = ocrdma_inetaddr_event
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+
+static int ocrdma_inet6addr_event(struct notifier_block *notifier,
+ unsigned long event, void *ptr)
+{
+ struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr;
+ union ib_gid *gid = (union ib_gid *)&ifa->addr;
+ struct net_device *netdev = ifa->idev->dev;
+ return ocrdma_addr_event(event, netdev, gid);
+}
+
+static struct notifier_block ocrdma_inet6addr_notifier = {
+ .notifier_call = ocrdma_inet6addr_event
+};
+
+#endif /* CONFIG_IPV6 */
+
+static enum rdma_link_layer ocrdma_link_layer(struct ib_device *device,
+ u8 port_num)
+{
+ return IB_LINK_LAYER_ETHERNET;
+}
+
+static int ocrdma_register_device(struct ocrdma_dev *dev)
+{
+ strlcpy(dev->ibdev.name, "ocrdma%d", IB_DEVICE_NAME_MAX);
+ ocrdma_get_guid(dev, (u8 *)&dev->ibdev.node_guid);
+ memcpy(dev->ibdev.node_desc, OCRDMA_NODE_DESC,
+ sizeof(OCRDMA_NODE_DESC));
+ dev->ibdev.owner = THIS_MODULE;
+ dev->ibdev.uverbs_abi_ver = OCRDMA_ABI_VERSION;
+ dev->ibdev.uverbs_cmd_mask =
+ OCRDMA_UVERBS(GET_CONTEXT) |
+ OCRDMA_UVERBS(QUERY_DEVICE) |
+ OCRDMA_UVERBS(QUERY_PORT) |
+ OCRDMA_UVERBS(ALLOC_PD) |
+ OCRDMA_UVERBS(DEALLOC_PD) |
+ OCRDMA_UVERBS(REG_MR) |
+ OCRDMA_UVERBS(DEREG_MR) |
+ OCRDMA_UVERBS(CREATE_COMP_CHANNEL) |
+ OCRDMA_UVERBS(CREATE_CQ) |
+ OCRDMA_UVERBS(RESIZE_CQ) |
+ OCRDMA_UVERBS(DESTROY_CQ) |
+ OCRDMA_UVERBS(REQ_NOTIFY_CQ) |
+ OCRDMA_UVERBS(CREATE_QP) |
+ OCRDMA_UVERBS(MODIFY_QP) |
+ OCRDMA_UVERBS(QUERY_QP) |
+ OCRDMA_UVERBS(DESTROY_QP) |
+ OCRDMA_UVERBS(POLL_CQ) |
+ OCRDMA_UVERBS(POST_SEND) |
+ OCRDMA_UVERBS(POST_RECV);
+
+ dev->ibdev.uverbs_cmd_mask |=
+ OCRDMA_UVERBS(CREATE_AH) |
+ OCRDMA_UVERBS(MODIFY_AH) |
+ OCRDMA_UVERBS(QUERY_AH) |
+ OCRDMA_UVERBS(DESTROY_AH);
+
+ dev->ibdev.node_type = RDMA_NODE_IB_CA;
+ dev->ibdev.phys_port_cnt = 1;
+ dev->ibdev.num_comp_vectors = dev->eq_cnt;
+
+ /* mandatory verbs. */
+ dev->ibdev.query_device = ocrdma_query_device;
+ dev->ibdev.query_port = ocrdma_query_port;
+ dev->ibdev.modify_port = ocrdma_modify_port;
+ dev->ibdev.query_gid = ocrdma_query_gid;
+ dev->ibdev.get_link_layer = ocrdma_link_layer;
+ dev->ibdev.alloc_pd = ocrdma_alloc_pd;
+ dev->ibdev.dealloc_pd = ocrdma_dealloc_pd;
+
+ dev->ibdev.create_cq = ocrdma_create_cq;
+ dev->ibdev.destroy_cq = ocrdma_destroy_cq;
+ dev->ibdev.resize_cq = ocrdma_resize_cq;
+
+ dev->ibdev.create_qp = ocrdma_create_qp;
+ dev->ibdev.modify_qp = ocrdma_modify_qp;
+ dev->ibdev.query_qp = ocrdma_query_qp;
+ dev->ibdev.destroy_qp = ocrdma_destroy_qp;
+
+ dev->ibdev.query_pkey = ocrdma_query_pkey;
+ dev->ibdev.create_ah = ocrdma_create_ah;
+ dev->ibdev.destroy_ah = ocrdma_destroy_ah;
+ dev->ibdev.query_ah = ocrdma_query_ah;
+ dev->ibdev.modify_ah = ocrdma_modify_ah;
+
+ dev->ibdev.poll_cq = ocrdma_poll_cq;
+ dev->ibdev.post_send = ocrdma_post_send;
+ dev->ibdev.post_recv = ocrdma_post_recv;
+ dev->ibdev.req_notify_cq = ocrdma_arm_cq;
+
+ dev->ibdev.get_dma_mr = ocrdma_get_dma_mr;
+ dev->ibdev.reg_phys_mr = ocrdma_reg_kernel_mr;
+ dev->ibdev.dereg_mr = ocrdma_dereg_mr;
+ dev->ibdev.reg_user_mr = ocrdma_reg_user_mr;
+
+ dev->ibdev.alloc_fast_reg_mr = ocrdma_alloc_frmr;
+ dev->ibdev.alloc_fast_reg_page_list = ocrdma_alloc_frmr_page_list;
+ dev->ibdev.free_fast_reg_page_list = ocrdma_free_frmr_page_list;
+
+ /* mandatory to support user space verbs consumer. */
+ dev->ibdev.alloc_ucontext = ocrdma_alloc_ucontext;
+ dev->ibdev.dealloc_ucontext = ocrdma_dealloc_ucontext;
+ dev->ibdev.mmap = ocrdma_mmap;
+ dev->ibdev.dma_device = &dev->nic_info.pdev->dev;
+
+ dev->ibdev.process_mad = ocrdma_process_mad;
+
+ if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) {
+ dev->ibdev.uverbs_cmd_mask |=
+ OCRDMA_UVERBS(CREATE_SRQ) |
+ OCRDMA_UVERBS(MODIFY_SRQ) |
+ OCRDMA_UVERBS(QUERY_SRQ) |
+ OCRDMA_UVERBS(DESTROY_SRQ) |
+ OCRDMA_UVERBS(POST_SRQ_RECV);
+
+ dev->ibdev.create_srq = ocrdma_create_srq;
+ dev->ibdev.modify_srq = ocrdma_modify_srq;
+ dev->ibdev.query_srq = ocrdma_query_srq;
+ dev->ibdev.destroy_srq = ocrdma_destroy_srq;
+ dev->ibdev.post_srq_recv = ocrdma_post_srq_recv;
+ }
+ return ib_register_device(&dev->ibdev, NULL);
+}
+
+static int ocrdma_alloc_resources(struct ocrdma_dev *dev)
+{
+ mutex_init(&dev->dev_lock);
+ dev->sgid_tbl = kzalloc(sizeof(union ib_gid) *
+ OCRDMA_MAX_SGID, GFP_KERNEL);
+ if (!dev->sgid_tbl)
+ goto alloc_err;
+ spin_lock_init(&dev->sgid_lock);
+
+ dev->cq_tbl = kzalloc(sizeof(struct ocrdma_cq *) *
+ OCRDMA_MAX_CQ, GFP_KERNEL);
+ if (!dev->cq_tbl)
+ goto alloc_err;
+
+ if (dev->attr.max_qp) {
+ dev->qp_tbl = kzalloc(sizeof(struct ocrdma_qp *) *
+ OCRDMA_MAX_QP, GFP_KERNEL);
+ if (!dev->qp_tbl)
+ goto alloc_err;
+ }
+
+ dev->stag_arr = kzalloc(sizeof(u64) * OCRDMA_MAX_STAG, GFP_KERNEL);
+ if (dev->stag_arr == NULL)
+ goto alloc_err;
+
+ ocrdma_alloc_pd_pool(dev);
+
+ spin_lock_init(&dev->av_tbl.lock);
+ spin_lock_init(&dev->flush_q_lock);
+ return 0;
+alloc_err:
+ pr_err("%s(%d) error.\n", __func__, dev->id);
+ return -ENOMEM;
+}
+
+static void ocrdma_free_resources(struct ocrdma_dev *dev)
+{
+ kfree(dev->stag_arr);
+ kfree(dev->qp_tbl);
+ kfree(dev->cq_tbl);
+ kfree(dev->sgid_tbl);
+}
+
+/* OCRDMA sysfs interface */
+static ssize_t show_rev(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct ocrdma_dev *dev = dev_get_drvdata(device);
+
+ return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->nic_info.pdev->vendor);
+}
+
+static ssize_t show_fw_ver(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct ocrdma_dev *dev = dev_get_drvdata(device);
+
+ return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->attr.fw_ver[0]);
+}
+
+static ssize_t show_hca_type(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct ocrdma_dev *dev = dev_get_drvdata(device);
+
+ return scnprintf(buf, PAGE_SIZE, "%s\n", &dev->model_number[0]);
+}
+
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca_type, NULL);
+
+static struct device_attribute *ocrdma_attributes[] = {
+ &dev_attr_hw_rev,
+ &dev_attr_fw_ver,
+ &dev_attr_hca_type
+};
+
+static void ocrdma_remove_sysfiles(struct ocrdma_dev *dev)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(ocrdma_attributes); i++)
+ device_remove_file(&dev->ibdev.dev, ocrdma_attributes[i]);
+}
+
+static void ocrdma_add_default_sgid(struct ocrdma_dev *dev)
+{
+ /* GID Index 0 - Invariant manufacturer-assigned EUI-64 */
+ union ib_gid *sgid = &dev->sgid_tbl[0];
+
+ sgid->global.subnet_prefix = cpu_to_be64(0xfe80000000000000LL);
+ ocrdma_get_guid(dev, &sgid->raw[8]);
+}
+
+static void ocrdma_init_ipv4_gids(struct ocrdma_dev *dev,
+ struct net_device *net)
+{
+ struct in_device *in_dev;
+ union ib_gid gid;
+ in_dev = in_dev_get(net);
+ if (in_dev) {
+ for_ifa(in_dev) {
+ ipv6_addr_set_v4mapped(ifa->ifa_address,
+ (struct in6_addr *)&gid);
+ ocrdma_add_sgid(dev, &gid);
+ }
+ endfor_ifa(in_dev);
+ in_dev_put(in_dev);
+ }
+}
+
+static void ocrdma_init_ipv6_gids(struct ocrdma_dev *dev,
+ struct net_device *net)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ struct inet6_dev *in6_dev;
+ union ib_gid *pgid;
+ struct inet6_ifaddr *ifp;
+ in6_dev = in6_dev_get(net);
+ if (in6_dev) {
+ read_lock_bh(&in6_dev->lock);
+ list_for_each_entry(ifp, &in6_dev->addr_list, if_list) {
+ pgid = (union ib_gid *)&ifp->addr;
+ ocrdma_add_sgid(dev, pgid);
+ }
+ read_unlock_bh(&in6_dev->lock);
+ in6_dev_put(in6_dev);
+ }
+#endif
+}
+
+static void ocrdma_init_gid_table(struct ocrdma_dev *dev)
+{
+ struct net_device *net_dev;
+
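+ /* seed the SGID table from every interface (including VLAN uppers)
+  * whose real device matches the netdev this RoCE device is bound to.
+  */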
+ for_each_netdev(&init_net, net_dev) {
+ struct net_device *real_dev = rdma_vlan_dev_real_dev(net_dev) ?
+ rdma_vlan_dev_real_dev(net_dev) : net_dev;
+
+ if (real_dev == dev->nic_info.netdev) {
+ ocrdma_add_default_sgid(dev);
+ ocrdma_init_ipv4_gids(dev, net_dev);
+ ocrdma_init_ipv6_gids(dev, net_dev);
+ }
+ }
+}
+
+static struct ocrdma_dev *ocrdma_add(struct be_dev_info *dev_info)
+{
+ int status = 0, i;
+ struct ocrdma_dev *dev;
+
+ dev = (struct ocrdma_dev *)ib_alloc_device(sizeof(struct ocrdma_dev));
+ if (!dev) {
+ pr_err("Unable to allocate ib device\n");
+ return NULL;
+ }
+ dev->mbx_cmd = kzalloc(sizeof(struct ocrdma_mqe_emb_cmd), GFP_KERNEL);
+ if (!dev->mbx_cmd)
+ goto idr_err;
+
+ memcpy(&dev->nic_info, dev_info, sizeof(*dev_info));
+ dev->id = idr_alloc(&ocrdma_dev_id, NULL, 0, 0, GFP_KERNEL);
+ if (dev->id < 0)
+ goto idr_err;
+
+ status = ocrdma_init_hw(dev);
+ if (status)
+ goto init_err;
+
+ status = ocrdma_alloc_resources(dev);
+ if (status)
+ goto alloc_err;
+
+ ocrdma_init_service_level(dev);
+ ocrdma_init_gid_table(dev);
+ status = ocrdma_register_device(dev);
+ if (status)
+ goto alloc_err;
+
+ for (i = 0; i < ARRAY_SIZE(ocrdma_attributes); i++)
+ if (device_create_file(&dev->ibdev.dev, ocrdma_attributes[i]))
+ goto sysfs_err;
+ spin_lock(&ocrdma_devlist_lock);
+ list_add_tail_rcu(&dev->entry, &ocrdma_dev_list);
+ spin_unlock(&ocrdma_devlist_lock);
+ /* Init stats */
+ ocrdma_add_port_stats(dev);
+ /* Interrupt Moderation */
+ INIT_DELAYED_WORK(&dev->eqd_work, ocrdma_eqd_set_task);
+ schedule_delayed_work(&dev->eqd_work, msecs_to_jiffies(1000));
+
+ pr_info("%s %s: %s \"%s\" port %d\n",
+ dev_name(&dev->nic_info.pdev->dev), hca_name(dev),
+ port_speed_string(dev), dev->model_number,
+ dev->hba_port_num);
+ pr_info("%s ocrdma%d driver loaded successfully\n",
+ dev_name(&dev->nic_info.pdev->dev), dev->id);
+ return dev;
+
+sysfs_err:
+ ocrdma_remove_sysfiles(dev);
+alloc_err:
+ ocrdma_free_resources(dev);
+ ocrdma_cleanup_hw(dev);
+init_err:
+ idr_remove(&ocrdma_dev_id, dev->id);
+idr_err:
+ kfree(dev->mbx_cmd);
+ ib_dealloc_device(&dev->ibdev);
+ pr_err("%s() leaving. ret=%d\n", __func__, status);
+ return NULL;
+}
+
+static void ocrdma_remove_free(struct rcu_head *rcu)
+{
+ struct ocrdma_dev *dev = container_of(rcu, struct ocrdma_dev, rcu);
+
+ idr_remove(&ocrdma_dev_id, dev->id);
+ kfree(dev->mbx_cmd);
+ ib_dealloc_device(&dev->ibdev);
+}
+
+static void ocrdma_remove(struct ocrdma_dev *dev)
+{
+ /* first unregister with stack to stop all the active traffic
+ * of the registered clients.
+ */
+ cancel_delayed_work_sync(&dev->eqd_work);
+ ocrdma_remove_sysfiles(dev);
+ ib_unregister_device(&dev->ibdev);
+
+ ocrdma_rem_port_stats(dev);
+
+ spin_lock(&ocrdma_devlist_lock);
+ list_del_rcu(&dev->entry);
+ spin_unlock(&ocrdma_devlist_lock);
+
+ ocrdma_free_resources(dev);
+ ocrdma_cleanup_hw(dev);
+
+ call_rcu(&dev->rcu, ocrdma_remove_free);
+}
+
+static int ocrdma_open(struct ocrdma_dev *dev)
+{
+ struct ib_event port_event;
+
+ port_event.event = IB_EVENT_PORT_ACTIVE;
+ port_event.element.port_num = 1;
+ port_event.device = &dev->ibdev;
+ ib_dispatch_event(&port_event);
+ return 0;
+}
+
+static int ocrdma_close(struct ocrdma_dev *dev)
+{
+ int i;
+ struct ocrdma_qp *qp, **cur_qp;
+ struct ib_event err_event;
+ struct ib_qp_attr attrs;
+ int attr_mask = IB_QP_STATE;
+
+ attrs.qp_state = IB_QPS_ERR;
+ mutex_lock(&dev->dev_lock);
+ if (dev->qp_tbl) {
+ cur_qp = dev->qp_tbl;
+ for (i = 0; i < OCRDMA_MAX_QP; i++) {
+ qp = cur_qp[i];
+ if (qp && qp->ibqp.qp_type != IB_QPT_GSI) {
+ /* change the QP state to ERROR */
+ _ocrdma_modify_qp(&qp->ibqp, &attrs, attr_mask);
+
+ err_event.event = IB_EVENT_QP_FATAL;
+ err_event.element.qp = &qp->ibqp;
+ err_event.device = &dev->ibdev;
+ ib_dispatch_event(&err_event);
+ }
+ }
+ }
+ mutex_unlock(&dev->dev_lock);
+
+ err_event.event = IB_EVENT_PORT_ERR;
+ err_event.element.port_num = 1;
+ err_event.device = &dev->ibdev;
+ ib_dispatch_event(&err_event);
+ return 0;
+}
+
+static void ocrdma_shutdown(struct ocrdma_dev *dev)
+{
+ ocrdma_close(dev);
+ ocrdma_remove(dev);
+}
+
+/* Event handling via the NIC driver ensures that all NIC-specific
+ * initialization is done before the RoCE driver notifies the event
+ * to the stack.
+ */
+static void ocrdma_event_handler(struct ocrdma_dev *dev, u32 event)
+{
+ switch (event) {
+ case BE_DEV_UP:
+ ocrdma_open(dev);
+ break;
+ case BE_DEV_DOWN:
+ ocrdma_close(dev);
+ break;
+ case BE_DEV_SHUTDOWN:
+ ocrdma_shutdown(dev);
+ break;
+ }
+}
+
+static struct ocrdma_driver ocrdma_drv = {
+ .name = "ocrdma_driver",
+ .add = ocrdma_add,
+ .remove = ocrdma_remove,
+ .state_change_handler = ocrdma_event_handler,
+ .be_abi_version = OCRDMA_BE_ROCE_ABI_VERSION,
+};
+
+static void ocrdma_unregister_inet6addr_notifier(void)
+{
+#if IS_ENABLED(CONFIG_IPV6)
+ unregister_inet6addr_notifier(&ocrdma_inet6addr_notifier);
+#endif
+}
+
+static void ocrdma_unregister_inetaddr_notifier(void)
+{
+ unregister_inetaddr_notifier(&ocrdma_inetaddr_notifier);
+}
+
+static int __init ocrdma_init_module(void)
+{
+ int status;
+
+ ocrdma_init_debugfs();
+
+ status = register_inetaddr_notifier(&ocrdma_inetaddr_notifier);
+ if (status)
+ return status;
+
+#if IS_ENABLED(CONFIG_IPV6)
+ status = register_inet6addr_notifier(&ocrdma_inet6addr_notifier);
+ if (status)
+ goto err_notifier6;
+#endif
+
+ status = be_roce_register_driver(&ocrdma_drv);
+ if (status)
+ goto err_be_reg;
+
+ return 0;
+
+err_be_reg:
+#if IS_ENABLED(CONFIG_IPV6)
+ ocrdma_unregister_inet6addr_notifier();
+err_notifier6:
+#endif
+ ocrdma_unregister_inetaddr_notifier();
+ return status;
+}
+
+static void __exit ocrdma_exit_module(void)
+{
+ be_roce_unregister_driver(&ocrdma_drv);
+ ocrdma_unregister_inet6addr_notifier();
+ ocrdma_unregister_inetaddr_notifier();
+ ocrdma_rem_debugfs();
+}
+
+module_init(ocrdma_init_module);
+module_exit(ocrdma_exit_module);
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_sli.h b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h
new file mode 100644
index 000000000..02ad0aee9
--- /dev/null
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_sli.h
@@ -0,0 +1,2173 @@
+/*******************************************************************
+ * This file is part of the Emulex RoCE Device Driver for *
+ * RoCE (RDMA over Converged Ethernet) adapters. *
+ * Copyright (C) 2008-2012 Emulex. All rights reserved. *
+ * EMULEX and SLI are trademarks of Emulex. *
+ * www.emulex.com *
+ * *
+ * This program is free software; you can redistribute it and/or *
+ * modify it under the terms of version 2 of the GNU General *
+ * Public License as published by the Free Software Foundation. *
+ * This program is distributed in the hope that it will be useful. *
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND *
+ * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, *
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE *
+ * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD *
+ * TO BE LEGALLY INVALID. See the GNU General Public License for *
+ * more details, a copy of which can be found in the file COPYING *
+ * included with this package. *
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ *******************************************************************/
+
+#ifndef __OCRDMA_SLI_H__
+#define __OCRDMA_SLI_H__
+
+enum {
+ OCRDMA_ASIC_GEN_SKH_R = 0x04,
+ OCRDMA_ASIC_GEN_LANCER = 0x0B
+};
+
+enum {
+ OCRDMA_ASIC_REV_A0 = 0x00,
+ OCRDMA_ASIC_REV_B0 = 0x10,
+ OCRDMA_ASIC_REV_C0 = 0x20
+};
+
+#define OCRDMA_SUBSYS_ROCE 10
+enum {
+ OCRDMA_CMD_QUERY_CONFIG = 1,
+ OCRDMA_CMD_ALLOC_PD = 2,
+ OCRDMA_CMD_DEALLOC_PD = 3,
+
+ OCRDMA_CMD_CREATE_AH_TBL = 4,
+ OCRDMA_CMD_DELETE_AH_TBL = 5,
+
+ OCRDMA_CMD_CREATE_QP = 6,
+ OCRDMA_CMD_QUERY_QP = 7,
+ OCRDMA_CMD_MODIFY_QP = 8,
+ OCRDMA_CMD_DELETE_QP = 9,
+
+ OCRDMA_CMD_RSVD1 = 10,
+ OCRDMA_CMD_ALLOC_LKEY = 11,
+ OCRDMA_CMD_DEALLOC_LKEY = 12,
+ OCRDMA_CMD_REGISTER_NSMR = 13,
+ OCRDMA_CMD_REREGISTER_NSMR = 14,
+ OCRDMA_CMD_REGISTER_NSMR_CONT = 15,
+ OCRDMA_CMD_QUERY_NSMR = 16,
+ OCRDMA_CMD_ALLOC_MW = 17,
+ OCRDMA_CMD_QUERY_MW = 18,
+
+ OCRDMA_CMD_CREATE_SRQ = 19,
+ OCRDMA_CMD_QUERY_SRQ = 20,
+ OCRDMA_CMD_MODIFY_SRQ = 21,
+ OCRDMA_CMD_DELETE_SRQ = 22,
+
+ OCRDMA_CMD_ATTACH_MCAST = 23,
+ OCRDMA_CMD_DETACH_MCAST = 24,
+
+ OCRDMA_CMD_CREATE_RBQ = 25,
+ OCRDMA_CMD_DESTROY_RBQ = 26,
+
+ OCRDMA_CMD_GET_RDMA_STATS = 27,
+ OCRDMA_CMD_ALLOC_PD_RANGE = 28,
+ OCRDMA_CMD_DEALLOC_PD_RANGE = 29,
+
+ OCRDMA_CMD_MAX
+};
+
+#define OCRDMA_SUBSYS_COMMON 1
+enum {
+ OCRDMA_CMD_QUERY_NTWK_LINK_CONFIG_V1 = 5,
+ OCRDMA_CMD_CREATE_CQ = 12,
+ OCRDMA_CMD_CREATE_EQ = 13,
+ OCRDMA_CMD_CREATE_MQ = 21,
+ OCRDMA_CMD_GET_CTRL_ATTRIBUTES = 32,
+ OCRDMA_CMD_GET_FW_VER = 35,
+ OCRDMA_CMD_MODIFY_EQ_DELAY = 41,
+ OCRDMA_CMD_DELETE_MQ = 53,
+ OCRDMA_CMD_DELETE_CQ = 54,
+ OCRDMA_CMD_DELETE_EQ = 55,
+ OCRDMA_CMD_GET_FW_CONFIG = 58,
+ OCRDMA_CMD_CREATE_MQ_EXT = 90,
+ OCRDMA_CMD_PHY_DETAILS = 102
+};
+
+enum {
+ QTYPE_EQ = 1,
+ QTYPE_CQ = 2,
+ QTYPE_MCCQ = 3
+};
+
+#define OCRDMA_MAX_SGID 16
+
+#define OCRDMA_MAX_QP 2048
+#define OCRDMA_MAX_CQ 2048
+#define OCRDMA_MAX_STAG 16384
+
+enum {
+ OCRDMA_DB_RQ_OFFSET = 0xE0,
+ OCRDMA_DB_GEN2_RQ_OFFSET = 0x100,
+ OCRDMA_DB_SQ_OFFSET = 0x60,
+ OCRDMA_DB_GEN2_SQ_OFFSET = 0x1C0,
+ OCRDMA_DB_SRQ_OFFSET = OCRDMA_DB_RQ_OFFSET,
+ OCRDMA_DB_GEN2_SRQ_OFFSET = OCRDMA_DB_GEN2_RQ_OFFSET,
+ OCRDMA_DB_CQ_OFFSET = 0x120,
+ OCRDMA_DB_EQ_OFFSET = OCRDMA_DB_CQ_OFFSET,
+ OCRDMA_DB_MQ_OFFSET = 0x140,
+
+ OCRDMA_DB_SQ_SHIFT = 16,
+ OCRDMA_DB_RQ_SHIFT = 24
+};
+
+#define OCRDMA_DB_CQ_RING_ID_MASK 0x3FF /* bits 0 - 9 */
+#define OCRDMA_DB_CQ_RING_ID_EXT_MASK 0x0C00 /* qid bits 10-11 */
+/* the two msbits of the qid are placed at doorbell bits 11-12 */
+#define OCRDMA_DB_CQ_RING_ID_EXT_MASK_SHIFT 0x1
+#define OCRDMA_DB_CQ_NUM_POPPED_SHIFT 16 /* bits 16 - 28 */
+/* Rearm bit */
+#define OCRDMA_DB_CQ_REARM_SHIFT 29 /* bit 29 */
+/* solicited bit */
+#define OCRDMA_DB_CQ_SOLICIT_SHIFT 31 /* bit 31 */
+
+#define OCRDMA_EQ_ID_MASK 0x1FF /* bits 0 - 8 */
+#define OCRDMA_EQ_ID_EXT_MASK 0x3e00 /* bits 9-13 */
+#define OCRDMA_EQ_ID_EXT_MASK_SHIFT 2 /* qid bits 9-13 at 11-15 */
+
+/* Clear the interrupt for this eq */
+#define OCRDMA_EQ_CLR_SHIFT 9 /* bit 9 */
+/* Must be 1 */
+#define OCRDMA_EQ_TYPE_SHIFT 10 /* bit 10 */
+/* Number of event entries processed */
+#define OCRDMA_NUM_EQE_SHIFT 16 /* bits 16 - 28 */
+/* Rearm bit */
+#define OCRDMA_REARM_SHIFT 29 /* bit 29 */
+
+#define OCRDMA_MQ_ID_MASK 0x7FF /* bits 0 - 10 */
+/* Number of entries posted */
+#define OCRDMA_MQ_NUM_MQE_SHIFT 16 /* bits 16 - 29 */
+
+#define OCRDMA_MIN_HPAGE_SIZE 4096
+
+#define OCRDMA_MIN_Q_PAGE_SIZE 4096
+#define OCRDMA_MAX_Q_PAGES 8
+
+#define OCRDMA_SLI_ASIC_ID_OFFSET 0x9C
+#define OCRDMA_SLI_ASIC_REV_MASK 0x000000FF
+#define OCRDMA_SLI_ASIC_GEN_NUM_MASK 0x0000FF00
+#define OCRDMA_SLI_ASIC_GEN_NUM_SHIFT 0x08
+/*
+ * 0: 4K Bytes
+ * 1: 8K Bytes
+ * 2: 16K Bytes
+ * 3: 32K Bytes
+ * 4: 64K Bytes
+ * 5: 128K Bytes
+ * 6: 256K Bytes
+ * 7: 512K Bytes
+ */
+#define OCRDMA_MAX_Q_PAGE_SIZE_CNT 8
+#define OCRDMA_Q_PAGE_BASE_SIZE (OCRDMA_MIN_Q_PAGE_SIZE * OCRDMA_MAX_Q_PAGES)
+
+#define MAX_OCRDMA_QP_PAGES 8
+#define OCRDMA_MAX_WQE_MEM_SIZE (MAX_OCRDMA_QP_PAGES * OCRDMA_MIN_HQ_PAGE_SIZE)
+
+#define OCRDMA_CREATE_CQ_MAX_PAGES 4
+#define OCRDMA_DPP_CQE_SIZE 4
+
+#define OCRDMA_GEN2_MAX_CQE 1024
+#define OCRDMA_GEN2_CQ_PAGE_SIZE 4096
+#define OCRDMA_GEN2_WQE_SIZE 256
+#define OCRDMA_MAX_CQE 4095
+#define OCRDMA_CQ_PAGE_SIZE 16384
+#define OCRDMA_WQE_SIZE 128
+#define OCRDMA_WQE_STRIDE 8
+#define OCRDMA_WQE_ALIGN_BYTES 16
+
+#define MAX_OCRDMA_SRQ_PAGES MAX_OCRDMA_QP_PAGES
+
+enum {
+ OCRDMA_MCH_OPCODE_SHIFT = 0,
+ OCRDMA_MCH_OPCODE_MASK = 0xFF,
+ OCRDMA_MCH_SUBSYS_SHIFT = 8,
+ OCRDMA_MCH_SUBSYS_MASK = 0xFF00
+};
+
+/* mailbox cmd header */
+struct ocrdma_mbx_hdr {
+ u32 subsys_op;
+ u32 timeout; /* in seconds */
+ u32 cmd_len;
+ u32 rsvd_version;
+};
+
+enum {
+ OCRDMA_MBX_RSP_OPCODE_SHIFT = 0,
+ OCRDMA_MBX_RSP_OPCODE_MASK = 0xFF,
+ OCRDMA_MBX_RSP_SUBSYS_SHIFT = 8,
+ OCRDMA_MBX_RSP_SUBSYS_MASK = 0xFF << OCRDMA_MBX_RSP_SUBSYS_SHIFT,
+
+ OCRDMA_MBX_RSP_STATUS_SHIFT = 0,
+ OCRDMA_MBX_RSP_STATUS_MASK = 0xFF,
+ OCRDMA_MBX_RSP_ASTATUS_SHIFT = 8,
+ OCRDMA_MBX_RSP_ASTATUS_MASK = 0xFF << OCRDMA_MBX_RSP_ASTATUS_SHIFT
+};
+
+/* mailbox cmd response */
+struct ocrdma_mbx_rsp {
+ u32 subsys_op;
+ u32 status;
+ u32 rsp_len;
+ u32 add_rsp_len;
+};
+
+enum {
+ OCRDMA_MQE_EMBEDDED = 1,
+ OCRDMA_MQE_NONEMBEDDED = 0
+};
+
+struct ocrdma_mqe_sge {
+ u32 pa_lo;
+ u32 pa_hi;
+ u32 len;
+};
+
+enum {
+ OCRDMA_MQE_HDR_EMB_SHIFT = 0,
+ OCRDMA_MQE_HDR_EMB_MASK = BIT(0),
+ OCRDMA_MQE_HDR_SGE_CNT_SHIFT = 3,
+ OCRDMA_MQE_HDR_SGE_CNT_MASK = 0x1F << OCRDMA_MQE_HDR_SGE_CNT_SHIFT,
+ OCRDMA_MQE_HDR_SPECIAL_SHIFT = 24,
+ OCRDMA_MQE_HDR_SPECIAL_MASK = 0xFF << OCRDMA_MQE_HDR_SPECIAL_SHIFT
+};
+
+struct ocrdma_mqe_hdr {
+ u32 spcl_sge_cnt_emb;
+ u32 pyld_len;
+ u32 tag_lo;
+ u32 tag_hi;
+ u32 rsvd3;
+};
+
+struct ocrdma_mqe_emb_cmd {
+ struct ocrdma_mbx_hdr mch;
+ u8 pyld[220];
+};
+
+struct ocrdma_mqe {
+ struct ocrdma_mqe_hdr hdr;
+ union {
+ struct ocrdma_mqe_emb_cmd emb_req;
+ struct {
+ struct ocrdma_mqe_sge sge[19];
+ } nonemb_req;
+ u8 cmd[236];
+ struct ocrdma_mbx_rsp rsp;
+ } u;
+};
+
+#define OCRDMA_EQ_LEN 4096
+#define OCRDMA_MQ_CQ_LEN 256
+#define OCRDMA_MQ_LEN 128
+
+#define PAGE_SHIFT_4K 12
+#define PAGE_SIZE_4K (1 << PAGE_SHIFT_4K)
+
+/* Returns number of pages spanned by the data starting at the given addr */
+#define PAGES_4K_SPANNED(_address, size) \
+ ((u32)((((size_t)(_address) & (PAGE_SIZE_4K - 1)) + \
+ (size) + (PAGE_SIZE_4K - 1)) >> PAGE_SHIFT_4K))
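+/* e.g. an 8K buffer starting at offset 0x100 within a page spans 3 pages */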
+
+struct ocrdma_delete_q_req {
+ struct ocrdma_mbx_hdr req;
+ u32 id;
+};
+
+struct ocrdma_pa {
+ u32 lo;
+ u32 hi;
+};
+
+#define MAX_OCRDMA_EQ_PAGES 8
+struct ocrdma_create_eq_req {
+ struct ocrdma_mbx_hdr req;
+ u32 num_pages;
+ u32 valid;
+ u32 cnt;
+ u32 delay;
+ u32 rsvd;
+ struct ocrdma_pa pa[MAX_OCRDMA_EQ_PAGES];
+};
+
+enum {
+ OCRDMA_CREATE_EQ_VALID = BIT(29),
+ OCRDMA_CREATE_EQ_CNT_SHIFT = 26,
+ OCRDMA_CREATE_CQ_DELAY_SHIFT = 13,
+};
+
+struct ocrdma_create_eq_rsp {
+ struct ocrdma_mbx_rsp rsp;
+ u32 vector_eqid;
+};
+
+#define OCRDMA_EQ_MINOR_OTHER 0x1
+
+struct ocrmda_set_eqd {
+ u32 eq_id;
+ u32 phase;
+ u32 delay_multiplier;
+};
+
+struct ocrdma_modify_eqd_cmd {
+ struct ocrdma_mbx_hdr req;
+ u32 num_eq;
+ struct ocrmda_set_eqd set_eqd[8];
+} __packed;
+
+struct ocrdma_modify_eqd_req {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_modify_eqd_cmd cmd;
+};
+
+
+struct ocrdma_modify_eq_delay_rsp {
+ struct ocrdma_mbx_rsp hdr;
+ u32 rsvd0;
+} __packed;
+
+enum {
+ OCRDMA_MCQE_STATUS_SHIFT = 0,
+ OCRDMA_MCQE_STATUS_MASK = 0xFFFF,
+ OCRDMA_MCQE_ESTATUS_SHIFT = 16,
+ OCRDMA_MCQE_ESTATUS_MASK = 0xFFFF << OCRDMA_MCQE_ESTATUS_SHIFT,
+ OCRDMA_MCQE_CONS_SHIFT = 27,
+ OCRDMA_MCQE_CONS_MASK = BIT(27),
+ OCRDMA_MCQE_CMPL_SHIFT = 28,
+ OCRDMA_MCQE_CMPL_MASK = BIT(28),
+ OCRDMA_MCQE_AE_SHIFT = 30,
+ OCRDMA_MCQE_AE_MASK = BIT(30),
+ OCRDMA_MCQE_VALID_SHIFT = 31,
+ OCRDMA_MCQE_VALID_MASK = BIT(31)
+};
+
+struct ocrdma_mcqe {
+ u32 status;
+ u32 tag_lo;
+ u32 tag_hi;
+ u32 valid_ae_cmpl_cons;
+};
+
+enum {
+ OCRDMA_AE_MCQE_QPVALID = BIT(31),
+ OCRDMA_AE_MCQE_QPID_MASK = 0xFFFF,
+
+ OCRDMA_AE_MCQE_CQVALID = BIT(31),
+ OCRDMA_AE_MCQE_CQID_MASK = 0xFFFF,
+ OCRDMA_AE_MCQE_VALID = BIT(31),
+ OCRDMA_AE_MCQE_AE = BIT(30),
+ OCRDMA_AE_MCQE_EVENT_TYPE_SHIFT = 16,
+ OCRDMA_AE_MCQE_EVENT_TYPE_MASK =
+ 0xFF << OCRDMA_AE_MCQE_EVENT_TYPE_SHIFT,
+ OCRDMA_AE_MCQE_EVENT_CODE_SHIFT = 8,
+ OCRDMA_AE_MCQE_EVENT_CODE_MASK =
+ 0xFF << OCRDMA_AE_MCQE_EVENT_CODE_SHIFT
+};
+struct ocrdma_ae_mcqe {
+ u32 qpvalid_qpid;
+ u32 cqvalid_cqid;
+ u32 evt_tag;
+ u32 valid_ae_event;
+};
+
+enum {
+ OCRDMA_AE_PVID_MCQE_ENABLED_SHIFT = 0,
+ OCRDMA_AE_PVID_MCQE_ENABLED_MASK = 0xFF,
+ OCRDMA_AE_PVID_MCQE_TAG_SHIFT = 16,
+ OCRDMA_AE_PVID_MCQE_TAG_MASK = 0xFFFF << OCRDMA_AE_PVID_MCQE_TAG_SHIFT
+};
+
+struct ocrdma_ae_pvid_mcqe {
+ u32 tag_enabled;
+ u32 event_tag;
+ u32 rsvd1;
+ u32 rsvd2;
+};
+
+enum {
+ OCRDMA_AE_MPA_MCQE_REQ_ID_SHIFT = 16,
+ OCRDMA_AE_MPA_MCQE_REQ_ID_MASK = 0xFFFF <<
+ OCRDMA_AE_MPA_MCQE_REQ_ID_SHIFT,
+
+ OCRDMA_AE_MPA_MCQE_EVENT_CODE_SHIFT = 8,
+ OCRDMA_AE_MPA_MCQE_EVENT_CODE_MASK = 0xFF <<
+ OCRDMA_AE_MPA_MCQE_EVENT_CODE_SHIFT,
+ OCRDMA_AE_MPA_MCQE_EVENT_TYPE_SHIFT = 16,
+ OCRDMA_AE_MPA_MCQE_EVENT_TYPE_MASK = 0xFF <<
+ OCRDMA_AE_MPA_MCQE_EVENT_TYPE_SHIFT,
+ OCRDMA_AE_MPA_MCQE_EVENT_AE_SHIFT = 30,
+ OCRDMA_AE_MPA_MCQE_EVENT_AE_MASK = BIT(30),
+ OCRDMA_AE_MPA_MCQE_EVENT_VALID_SHIFT = 31,
+ OCRDMA_AE_MPA_MCQE_EVENT_VALID_MASK = BIT(31)
+};
+
+struct ocrdma_ae_mpa_mcqe {
+ u32 req_id;
+ u32 w1;
+ u32 w2;
+ u32 valid_ae_event;
+};
+
+enum {
+ OCRDMA_AE_QP_MCQE_NEW_QP_STATE_SHIFT = 0,
+ OCRDMA_AE_QP_MCQE_NEW_QP_STATE_MASK = 0xFFFF,
+ OCRDMA_AE_QP_MCQE_QP_ID_SHIFT = 16,
+ OCRDMA_AE_QP_MCQE_QP_ID_MASK = 0xFFFF <<
+ OCRDMA_AE_QP_MCQE_QP_ID_SHIFT,
+
+ OCRDMA_AE_QP_MCQE_EVENT_CODE_SHIFT = 8,
+ OCRDMA_AE_QP_MCQE_EVENT_CODE_MASK = 0xFF <<
+ OCRDMA_AE_QP_MCQE_EVENT_CODE_SHIFT,
+ OCRDMA_AE_QP_MCQE_EVENT_TYPE_SHIFT = 16,
+ OCRDMA_AE_QP_MCQE_EVENT_TYPE_MASK = 0xFF <<
+ OCRDMA_AE_QP_MCQE_EVENT_TYPE_SHIFT,
+ OCRDMA_AE_QP_MCQE_EVENT_AE_SHIFT = 30,
+ OCRDMA_AE_QP_MCQE_EVENT_AE_MASK = BIT(30),
+ OCRDMA_AE_QP_MCQE_EVENT_VALID_SHIFT = 31,
+ OCRDMA_AE_QP_MCQE_EVENT_VALID_MASK = BIT(31)
+};
+
+struct ocrdma_ae_qp_mcqe {
+ u32 qp_id_state;
+ u32 w1;
+ u32 w2;
+ u32 valid_ae_event;
+};
+
+#define OCRDMA_ASYNC_RDMA_EVE_CODE 0x14
+#define OCRDMA_ASYNC_GRP5_EVE_CODE 0x5
+
+enum ocrdma_async_grp5_events {
+ OCRDMA_ASYNC_EVENT_QOS_VALUE = 0x01,
+ OCRDMA_ASYNC_EVENT_COS_VALUE = 0x02,
+ OCRDMA_ASYNC_EVENT_PVID_STATE = 0x03
+};
+
+enum OCRDMA_ASYNC_EVENT_TYPE {
+ OCRDMA_CQ_ERROR = 0x00,
+ OCRDMA_CQ_OVERRUN_ERROR = 0x01,
+ OCRDMA_CQ_QPCAT_ERROR = 0x02,
+ OCRDMA_QP_ACCESS_ERROR = 0x03,
+ OCRDMA_QP_COMM_EST_EVENT = 0x04,
+ OCRDMA_SQ_DRAINED_EVENT = 0x05,
+ OCRDMA_DEVICE_FATAL_EVENT = 0x08,
+ OCRDMA_SRQCAT_ERROR = 0x0E,
+ OCRDMA_SRQ_LIMIT_EVENT = 0x0F,
+ OCRDMA_QP_LAST_WQE_EVENT = 0x10,
+
+ OCRDMA_MAX_ASYNC_ERRORS
+};
+
+/* mailbox command request and responses */
+enum {
+ OCRDMA_MBX_QUERY_CFG_CQ_OVERFLOW_SHIFT = 2,
+ OCRDMA_MBX_QUERY_CFG_CQ_OVERFLOW_MASK = BIT(2),
+ OCRDMA_MBX_QUERY_CFG_SRQ_SUPPORTED_SHIFT = 3,
+ OCRDMA_MBX_QUERY_CFG_SRQ_SUPPORTED_MASK = BIT(3),
+ OCRDMA_MBX_QUERY_CFG_MAX_QP_SHIFT = 8,
+ OCRDMA_MBX_QUERY_CFG_MAX_QP_MASK = 0xFFFFFF <<
+ OCRDMA_MBX_QUERY_CFG_MAX_QP_SHIFT,
+
+ OCRDMA_MBX_QUERY_CFG_MAX_PD_SHIFT = 16,
+ OCRDMA_MBX_QUERY_CFG_MAX_PD_MASK = 0xFFFF <<
+ OCRDMA_MBX_QUERY_CFG_MAX_PD_SHIFT,
+ OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_SHIFT = 8,
+ OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_MASK = 0xFF <<
+ OCRDMA_MBX_QUERY_CFG_CA_ACK_DELAY_SHIFT,
+
+ OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_SHIFT = 0,
+ OCRDMA_MBX_QUERY_CFG_MAX_SEND_SGE_MASK = 0xFFFF,
+ OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_SHIFT = 16,
+ OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_MASK = 0xFFFF <<
+ OCRDMA_MBX_QUERY_CFG_MAX_WRITE_SGE_SHIFT,
+
+ OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_SHIFT = 0,
+ OCRDMA_MBX_QUERY_CFG_MAX_ORD_PER_QP_MASK = 0xFFFF,
+ OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_SHIFT = 16,
+ OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_MASK = 0xFFFF <<
+ OCRDMA_MBX_QUERY_CFG_MAX_IRD_PER_QP_SHIFT,
+
+ OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_OFFSET = 24,
+ OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_MASK = 0xFF <<
+ OCRDMA_MBX_QUERY_CFG_MAX_WQE_SIZE_OFFSET,
+ OCRDMA_MBX_QUERY_CFG_MAX_RQE_SIZE_OFFSET = 16,
+ OCRDMA_MBX_QUERY_CFG_MAX_RQE_SIZE_MASK = 0xFF <<
+ OCRDMA_MBX_QUERY_CFG_MAX_RQE_SIZE_OFFSET,
+ OCRDMA_MBX_QUERY_CFG_MAX_DPP_CQES_OFFSET = 0,
+ OCRDMA_MBX_QUERY_CFG_MAX_DPP_CQES_MASK = 0xFFFF <<
+ OCRDMA_MBX_QUERY_CFG_MAX_DPP_CQES_OFFSET,
+
+ OCRDMA_MBX_QUERY_CFG_MAX_SRQ_OFFSET = 16,
+ OCRDMA_MBX_QUERY_CFG_MAX_SRQ_MASK = 0xFFFF <<
+ OCRDMA_MBX_QUERY_CFG_MAX_SRQ_OFFSET,
+ OCRDMA_MBX_QUERY_CFG_MAX_RPIR_QPS_OFFSET = 0,
+ OCRDMA_MBX_QUERY_CFG_MAX_RPIR_QPS_MASK = 0xFFFF <<
+ OCRDMA_MBX_QUERY_CFG_MAX_RPIR_QPS_OFFSET,
+
+ OCRDMA_MBX_QUERY_CFG_MAX_DPP_PDS_OFFSET = 16,
+ OCRDMA_MBX_QUERY_CFG_MAX_DPP_PDS_MASK = 0xFFFF <<
+ OCRDMA_MBX_QUERY_CFG_MAX_DPP_PDS_OFFSET,
+ OCRDMA_MBX_QUERY_CFG_MAX_DPP_CREDITS_OFFSET = 0,
+ OCRDMA_MBX_QUERY_CFG_MAX_DPP_CREDITS_MASK = 0xFFFF <<
+ OCRDMA_MBX_QUERY_CFG_MAX_DPP_CREDITS_OFFSET,
+
+ OCRDMA_MBX_QUERY_CFG_MAX_DPP_QPS_OFFSET = 0,
+ OCRDMA_MBX_QUERY_CFG_MAX_DPP_QPS_MASK = 0xFFFF <<
+ OCRDMA_MBX_QUERY_CFG_MAX_DPP_QPS_OFFSET,
+
+ OCRDMA_MBX_QUERY_CFG_MAX_WQES_PER_WQ_OFFSET = 16,
+ OCRDMA_MBX_QUERY_CFG_MAX_WQES_PER_WQ_MASK = 0xFFFF <<
+ OCRDMA_MBX_QUERY_CFG_MAX_WQES_PER_WQ_OFFSET,
+ OCRDMA_MBX_QUERY_CFG_MAX_RQES_PER_RQ_OFFSET = 0,
+ OCRDMA_MBX_QUERY_CFG_MAX_RQES_PER_RQ_MASK = 0xFFFF <<
+ OCRDMA_MBX_QUERY_CFG_MAX_RQES_PER_RQ_OFFSET,
+
+ OCRDMA_MBX_QUERY_CFG_MAX_CQ_OFFSET = 16,
+ OCRDMA_MBX_QUERY_CFG_MAX_CQ_MASK = 0xFFFF <<
+ OCRDMA_MBX_QUERY_CFG_MAX_CQ_OFFSET,
+ OCRDMA_MBX_QUERY_CFG_MAX_CQES_PER_CQ_OFFSET = 0,
+ OCRDMA_MBX_QUERY_CFG_MAX_CQES_PER_CQ_MASK = 0xFFFF <<
+ OCRDMA_MBX_QUERY_CFG_MAX_CQES_PER_CQ_OFFSET,
+
+ OCRDMA_MBX_QUERY_CFG_MAX_SRQ_RQE_OFFSET = 16,
+ OCRDMA_MBX_QUERY_CFG_MAX_SRQ_RQE_MASK = 0xFFFF <<
+ OCRDMA_MBX_QUERY_CFG_MAX_SRQ_RQE_OFFSET,
+ OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_OFFSET = 0,
+ OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_MASK = 0xFFFF <<
+ OCRDMA_MBX_QUERY_CFG_MAX_SRQ_SGE_OFFSET,
+};
+
+struct ocrdma_mbx_query_config {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_rsp rsp;
+ u32 qp_srq_cq_ird_ord;
+ u32 max_pd_ca_ack_delay;
+ u32 max_write_send_sge;
+ u32 max_ird_ord_per_qp;
+ u32 max_shared_ird_ord;
+ u32 max_mr;
+ u32 max_mr_size_hi;
+ u32 max_mr_size_lo;
+ u32 max_num_mr_pbl;
+ u32 max_mw;
+ u32 max_fmr;
+ u32 max_pages_per_frmr;
+ u32 max_mcast_group;
+ u32 max_mcast_qp_attach;
+ u32 max_total_mcast_qp_attach;
+ u32 wqe_rqe_stride_max_dpp_cqs;
+ u32 max_srq_rpir_qps;
+ u32 max_dpp_pds_credits;
+ u32 max_dpp_credits_pds_per_pd;
+ u32 max_wqes_rqes_per_q;
+ u32 max_cq_cqes_per_cq;
+ u32 max_srq_rqe_sge;
+};
+
+struct ocrdma_fw_ver_rsp {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_rsp rsp;
+
+ u8 running_ver[32];
+};
+
+struct ocrdma_fw_conf_rsp {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_rsp rsp;
+
+ u32 config_num;
+ u32 asic_revision;
+ u32 phy_port;
+ u32 fn_mode;
+ struct {
+ u32 mode;
+ u32 nic_wqid_base;
+ u32 nic_wq_tot;
+ u32 prot_wqid_base;
+ u32 prot_wq_tot;
+ u32 prot_rqid_base;
+ u32 prot_rqid_tot;
+ u32 rsvd[6];
+ } ulp[2];
+ u32 fn_capabilities;
+ u32 rsvd1;
+ u32 rsvd2;
+ u32 base_eqid;
+ u32 max_eq;
+};
+
+enum {
+ OCRDMA_FN_MODE_RDMA = 0x4
+};
+
+enum {
+ OCRDMA_IF_TYPE_MASK = 0xFFFF0000,
+ OCRDMA_IF_TYPE_SHIFT = 0x10,
+ OCRDMA_PHY_TYPE_MASK = 0x0000FFFF,
+ OCRDMA_FUTURE_DETAILS_MASK = 0xFFFF0000,
+ OCRDMA_FUTURE_DETAILS_SHIFT = 0x10,
+ OCRDMA_EX_PHY_DETAILS_MASK = 0x0000FFFF,
+ OCRDMA_FSPEED_SUPP_MASK = 0xFFFF0000,
+ OCRDMA_FSPEED_SUPP_SHIFT = 0x10,
+ OCRDMA_ASPEED_SUPP_MASK = 0x0000FFFF
+};
+
+struct ocrdma_get_phy_info_rsp {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_rsp rsp;
+
+ u32 ityp_ptyp;
+ u32 misc_params;
+ u32 ftrdtl_exphydtl;
+ u32 fspeed_aspeed;
+ u32 future_use[2];
+};
+
+enum {
+ OCRDMA_PHY_SPEED_ZERO = 0x0,
+ OCRDMA_PHY_SPEED_10MBPS = 0x1,
+ OCRDMA_PHY_SPEED_100MBPS = 0x2,
+ OCRDMA_PHY_SPEED_1GBPS = 0x4,
+ OCRDMA_PHY_SPEED_10GBPS = 0x8,
+ OCRDMA_PHY_SPEED_40GBPS = 0x20
+};
+
+enum {
+ OCRDMA_PORT_NUM_MASK = 0x3F,
+ OCRDMA_PT_MASK = 0xC0,
+ OCRDMA_PT_SHIFT = 0x6,
+ OCRDMA_LINK_DUP_MASK = 0x0000FF00,
+ OCRDMA_LINK_DUP_SHIFT = 0x8,
+ OCRDMA_PHY_PS_MASK = 0x00FF0000,
+ OCRDMA_PHY_PS_SHIFT = 0x10,
+ OCRDMA_PHY_PFLT_MASK = 0xFF000000,
+ OCRDMA_PHY_PFLT_SHIFT = 0x18,
+ OCRDMA_QOS_LNKSP_MASK = 0xFFFF0000,
+ OCRDMA_QOS_LNKSP_SHIFT = 0x10,
+ OCRDMA_LLST_MASK = 0xFF,
+ OCRDMA_PLFC_MASK = 0x00000400,
+ OCRDMA_PLFC_SHIFT = 0x8,
+ OCRDMA_PLRFC_MASK = 0x00000200,
+ OCRDMA_PLRFC_SHIFT = 0x8,
+ OCRDMA_PLTFC_MASK = 0x00000100,
+ OCRDMA_PLTFC_SHIFT = 0x8
+};
+
+struct ocrdma_get_link_speed_rsp {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_rsp rsp;
+
+ u32 pflt_pps_ld_pnum;
+ u32 qos_lsp;
+ u32 res_lls;
+};
+
+enum {
+ OCRDMA_PHYS_LINK_SPEED_ZERO = 0x0,
+ OCRDMA_PHYS_LINK_SPEED_10MBPS = 0x1,
+ OCRDMA_PHYS_LINK_SPEED_100MBPS = 0x2,
+ OCRDMA_PHYS_LINK_SPEED_1GBPS = 0x3,
+ OCRDMA_PHYS_LINK_SPEED_10GBPS = 0x4,
+ OCRDMA_PHYS_LINK_SPEED_20GBPS = 0x5,
+ OCRDMA_PHYS_LINK_SPEED_25GBPS = 0x6,
+ OCRDMA_PHYS_LINK_SPEED_40GBPS = 0x7,
+ OCRDMA_PHYS_LINK_SPEED_100GBPS = 0x8
+};
+
+enum {
+ OCRDMA_CREATE_CQ_VER2 = 2,
+ OCRDMA_CREATE_CQ_VER3 = 3,
+
+ OCRDMA_CREATE_CQ_PAGE_CNT_MASK = 0xFFFF,
+ OCRDMA_CREATE_CQ_PAGE_SIZE_SHIFT = 16,
+ OCRDMA_CREATE_CQ_PAGE_SIZE_MASK = 0xFF,
+
+ OCRDMA_CREATE_CQ_COALESCWM_SHIFT = 12,
+ OCRDMA_CREATE_CQ_COALESCWM_MASK = BIT(13) | BIT(12),
+ OCRDMA_CREATE_CQ_FLAGS_NODELAY = BIT(14),
+ OCRDMA_CREATE_CQ_FLAGS_AUTO_VALID = BIT(15),
+
+ OCRDMA_CREATE_CQ_EQ_ID_MASK = 0xFFFF,
+ OCRDMA_CREATE_CQ_CQE_COUNT_MASK = 0xFFFF
+};
+
+enum {
+ OCRDMA_CREATE_CQ_VER0 = 0,
+ OCRDMA_CREATE_CQ_DPP = 1,
+ OCRDMA_CREATE_CQ_TYPE_SHIFT = 24,
+ OCRDMA_CREATE_CQ_EQID_SHIFT = 22,
+
+ OCRDMA_CREATE_CQ_CNT_SHIFT = 27,
+ OCRDMA_CREATE_CQ_FLAGS_VALID = BIT(29),
+ OCRDMA_CREATE_CQ_FLAGS_EVENTABLE = BIT(31),
+ OCRDMA_CREATE_CQ_DEF_FLAGS = OCRDMA_CREATE_CQ_FLAGS_VALID |
+ OCRDMA_CREATE_CQ_FLAGS_EVENTABLE |
+ OCRDMA_CREATE_CQ_FLAGS_NODELAY
+};
+
+struct ocrdma_create_cq_cmd {
+ struct ocrdma_mbx_hdr req;
+ u32 pgsz_pgcnt;
+ u32 ev_cnt_flags;
+ u32 eqn;
+ u32 pdid_cqecnt;
+ u32 rsvd6;
+ struct ocrdma_pa pa[OCRDMA_CREATE_CQ_MAX_PAGES];
+};
+
+struct ocrdma_create_cq {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_create_cq_cmd cmd;
+};
+
+enum {
+ OCRDMA_CREATE_CQ_CMD_PDID_SHIFT = 0x10
+};
+
+enum {
+ OCRDMA_CREATE_CQ_RSP_CQ_ID_MASK = 0xFFFF
+};
+
+struct ocrdma_create_cq_cmd_rsp {
+ struct ocrdma_mbx_rsp rsp;
+ u32 cq_id;
+};
+
+struct ocrdma_create_cq_rsp {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_create_cq_cmd_rsp rsp;
+};
+
+enum {
+ OCRDMA_CREATE_MQ_V0_CQ_ID_SHIFT = 22,
+ OCRDMA_CREATE_MQ_CQ_ID_SHIFT = 16,
+ OCRDMA_CREATE_MQ_RING_SIZE_SHIFT = 16,
+ OCRDMA_CREATE_MQ_VALID = BIT(31),
+ OCRDMA_CREATE_MQ_ASYNC_CQ_VALID = BIT(0)
+};
+
+struct ocrdma_create_mq_req {
+ struct ocrdma_mbx_hdr req;
+ u32 cqid_pages;
+ u32 async_event_bitmap;
+ u32 async_cqid_ringsize;
+ u32 valid;
+ u32 async_cqid_valid;
+ u32 rsvd;
+ struct ocrdma_pa pa[8];
+};
+
+struct ocrdma_create_mq_rsp {
+ struct ocrdma_mbx_rsp rsp;
+ u32 id;
+};
+
+enum {
+ OCRDMA_DESTROY_CQ_QID_SHIFT = 0,
+ OCRDMA_DESTROY_CQ_QID_MASK = 0xFFFF,
+ OCRDMA_DESTROY_CQ_QID_BYPASS_FLUSH_SHIFT = 16,
+ OCRDMA_DESTROY_CQ_QID_BYPASS_FLUSH_MASK = 0xFFFF <<
+ OCRDMA_DESTROY_CQ_QID_BYPASS_FLUSH_SHIFT
+};
+
+struct ocrdma_destroy_cq {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_hdr req;
+
+ u32 bypass_flush_qid;
+};
+
+struct ocrdma_destroy_cq_rsp {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_rsp rsp;
+};
+
+enum {
+ OCRDMA_QPT_GSI = 1,
+ OCRDMA_QPT_RC = 2,
+ OCRDMA_QPT_UD = 4,
+};
+
+enum {
+ OCRDMA_CREATE_QP_REQ_PD_ID_SHIFT = 0,
+ OCRDMA_CREATE_QP_REQ_PD_ID_MASK = 0xFFFF,
+ OCRDMA_CREATE_QP_REQ_SQ_PAGE_SIZE_SHIFT = 16,
+ OCRDMA_CREATE_QP_REQ_RQ_PAGE_SIZE_SHIFT = 19,
+ OCRDMA_CREATE_QP_REQ_QPT_SHIFT = 29,
+ OCRDMA_CREATE_QP_REQ_QPT_MASK = BIT(31) | BIT(30) | BIT(29),
+
+ OCRDMA_CREATE_QP_REQ_MAX_RQE_SHIFT = 0,
+ OCRDMA_CREATE_QP_REQ_MAX_RQE_MASK = 0xFFFF,
+ OCRDMA_CREATE_QP_REQ_MAX_WQE_SHIFT = 16,
+ OCRDMA_CREATE_QP_REQ_MAX_WQE_MASK = 0xFFFF <<
+ OCRDMA_CREATE_QP_REQ_MAX_WQE_SHIFT,
+
+ OCRDMA_CREATE_QP_REQ_MAX_SGE_WRITE_SHIFT = 0,
+ OCRDMA_CREATE_QP_REQ_MAX_SGE_WRITE_MASK = 0xFFFF,
+ OCRDMA_CREATE_QP_REQ_MAX_SGE_SEND_SHIFT = 16,
+ OCRDMA_CREATE_QP_REQ_MAX_SGE_SEND_MASK = 0xFFFF <<
+ OCRDMA_CREATE_QP_REQ_MAX_SGE_SEND_SHIFT,
+
+ OCRDMA_CREATE_QP_REQ_FMR_EN_SHIFT = 0,
+ OCRDMA_CREATE_QP_REQ_FMR_EN_MASK = BIT(0),
+ OCRDMA_CREATE_QP_REQ_ZERO_LKEYEN_SHIFT = 1,
+ OCRDMA_CREATE_QP_REQ_ZERO_LKEYEN_MASK = BIT(1),
+ OCRDMA_CREATE_QP_REQ_BIND_MEMWIN_SHIFT = 2,
+ OCRDMA_CREATE_QP_REQ_BIND_MEMWIN_MASK = BIT(2),
+ OCRDMA_CREATE_QP_REQ_INB_WREN_SHIFT = 3,
+ OCRDMA_CREATE_QP_REQ_INB_WREN_MASK = BIT(3),
+ OCRDMA_CREATE_QP_REQ_INB_RDEN_SHIFT = 4,
+ OCRDMA_CREATE_QP_REQ_INB_RDEN_MASK = BIT(4),
+ OCRDMA_CREATE_QP_REQ_USE_SRQ_SHIFT = 5,
+ OCRDMA_CREATE_QP_REQ_USE_SRQ_MASK = BIT(5),
+ OCRDMA_CREATE_QP_REQ_ENABLE_RPIR_SHIFT = 6,
+ OCRDMA_CREATE_QP_REQ_ENABLE_RPIR_MASK = BIT(6),
+ OCRDMA_CREATE_QP_REQ_ENABLE_DPP_SHIFT = 7,
+ OCRDMA_CREATE_QP_REQ_ENABLE_DPP_MASK = BIT(7),
+ OCRDMA_CREATE_QP_REQ_ENABLE_DPP_CQ_SHIFT = 8,
+ OCRDMA_CREATE_QP_REQ_ENABLE_DPP_CQ_MASK = BIT(8),
+ OCRDMA_CREATE_QP_REQ_MAX_SGE_RECV_SHIFT = 16,
+ OCRDMA_CREATE_QP_REQ_MAX_SGE_RECV_MASK = 0xFFFF <<
+ OCRDMA_CREATE_QP_REQ_MAX_SGE_RECV_SHIFT,
+
+ OCRDMA_CREATE_QP_REQ_MAX_IRD_SHIFT = 0,
+ OCRDMA_CREATE_QP_REQ_MAX_IRD_MASK = 0xFFFF,
+ OCRDMA_CREATE_QP_REQ_MAX_ORD_SHIFT = 16,
+ OCRDMA_CREATE_QP_REQ_MAX_ORD_MASK = 0xFFFF <<
+ OCRDMA_CREATE_QP_REQ_MAX_ORD_SHIFT,
+
+ OCRDMA_CREATE_QP_REQ_NUM_RQ_PAGES_SHIFT = 0,
+ OCRDMA_CREATE_QP_REQ_NUM_RQ_PAGES_MASK = 0xFFFF,
+ OCRDMA_CREATE_QP_REQ_NUM_WQ_PAGES_SHIFT = 16,
+ OCRDMA_CREATE_QP_REQ_NUM_WQ_PAGES_MASK = 0xFFFF <<
+ OCRDMA_CREATE_QP_REQ_NUM_WQ_PAGES_SHIFT,
+
+ OCRDMA_CREATE_QP_REQ_RQE_SIZE_SHIFT = 0,
+ OCRDMA_CREATE_QP_REQ_RQE_SIZE_MASK = 0xFFFF,
+ OCRDMA_CREATE_QP_REQ_WQE_SIZE_SHIFT = 16,
+ OCRDMA_CREATE_QP_REQ_WQE_SIZE_MASK = 0xFFFF <<
+ OCRDMA_CREATE_QP_REQ_WQE_SIZE_SHIFT,
+
+ OCRDMA_CREATE_QP_REQ_RQ_CQID_SHIFT = 0,
+ OCRDMA_CREATE_QP_REQ_RQ_CQID_MASK = 0xFFFF,
+ OCRDMA_CREATE_QP_REQ_WQ_CQID_SHIFT = 16,
+ OCRDMA_CREATE_QP_REQ_WQ_CQID_MASK = 0xFFFF <<
+ OCRDMA_CREATE_QP_REQ_WQ_CQID_SHIFT,
+
+ OCRDMA_CREATE_QP_REQ_DPP_CQPID_SHIFT = 0,
+ OCRDMA_CREATE_QP_REQ_DPP_CQPID_MASK = 0xFFFF,
+ OCRDMA_CREATE_QP_REQ_DPP_CREDIT_SHIFT = 16,
+ OCRDMA_CREATE_QP_REQ_DPP_CREDIT_MASK = 0xFFFF <<
+ OCRDMA_CREATE_QP_REQ_DPP_CREDIT_SHIFT
+};
+
+enum {
+ OCRDMA_CREATE_QP_REQ_DPP_CREDIT_LIMIT = 16,
+ OCRDMA_CREATE_QP_RSP_DPP_PAGE_SHIFT = 1
+};
+
+#define MAX_OCRDMA_IRD_PAGES 4
+
+enum ocrdma_qp_flags {
+ OCRDMA_QP_MW_BIND = 1,
+ OCRDMA_QP_LKEY0 = (1 << 1),
+ OCRDMA_QP_FAST_REG = (1 << 2),
+ OCRDMA_QP_INB_RD = (1 << 6),
+ OCRDMA_QP_INB_WR = (1 << 7),
+};
+
+enum ocrdma_qp_state {
+ OCRDMA_QPS_RST = 0,
+ OCRDMA_QPS_INIT = 1,
+ OCRDMA_QPS_RTR = 2,
+ OCRDMA_QPS_RTS = 3,
+ OCRDMA_QPS_SQE = 4,
+ OCRDMA_QPS_SQ_DRAINING = 5,
+ OCRDMA_QPS_ERR = 6,
+ OCRDMA_QPS_SQD = 7
+};
+
+struct ocrdma_create_qp_req {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_hdr req;
+
+ u32 type_pgsz_pdn;
+ u32 max_wqe_rqe;
+ u32 max_sge_send_write;
+ u32 max_sge_recv_flags;
+ u32 max_ord_ird;
+ u32 num_wq_rq_pages;
+ u32 wqe_rqe_size;
+ u32 wq_rq_cqid;
+ struct ocrdma_pa wq_addr[MAX_OCRDMA_QP_PAGES];
+ struct ocrdma_pa rq_addr[MAX_OCRDMA_QP_PAGES];
+ u32 dpp_credits_cqid;
+ u32 rpir_lkey;
+ struct ocrdma_pa ird_addr[MAX_OCRDMA_IRD_PAGES];
+};
+
+enum {
+ OCRDMA_CREATE_QP_RSP_QP_ID_SHIFT = 0,
+ OCRDMA_CREATE_QP_RSP_QP_ID_MASK = 0xFFFF,
+
+ OCRDMA_CREATE_QP_RSP_MAX_RQE_SHIFT = 0,
+ OCRDMA_CREATE_QP_RSP_MAX_RQE_MASK = 0xFFFF,
+ OCRDMA_CREATE_QP_RSP_MAX_WQE_SHIFT = 16,
+ OCRDMA_CREATE_QP_RSP_MAX_WQE_MASK = 0xFFFF <<
+ OCRDMA_CREATE_QP_RSP_MAX_WQE_SHIFT,
+
+ OCRDMA_CREATE_QP_RSP_MAX_SGE_WRITE_SHIFT = 0,
+ OCRDMA_CREATE_QP_RSP_MAX_SGE_WRITE_MASK = 0xFFFF,
+ OCRDMA_CREATE_QP_RSP_MAX_SGE_SEND_SHIFT = 16,
+ OCRDMA_CREATE_QP_RSP_MAX_SGE_SEND_MASK = 0xFFFF <<
+ OCRDMA_CREATE_QP_RSP_MAX_SGE_SEND_SHIFT,
+
+ OCRDMA_CREATE_QP_RSP_MAX_SGE_RECV_SHIFT = 16,
+ OCRDMA_CREATE_QP_RSP_MAX_SGE_RECV_MASK = 0xFFFF <<
+ OCRDMA_CREATE_QP_RSP_MAX_SGE_RECV_SHIFT,
+
+ OCRDMA_CREATE_QP_RSP_MAX_IRD_SHIFT = 0,
+ OCRDMA_CREATE_QP_RSP_MAX_IRD_MASK = 0xFFFF,
+ OCRDMA_CREATE_QP_RSP_MAX_ORD_SHIFT = 16,
+ OCRDMA_CREATE_QP_RSP_MAX_ORD_MASK = 0xFFFF <<
+ OCRDMA_CREATE_QP_RSP_MAX_ORD_SHIFT,
+
+ OCRDMA_CREATE_QP_RSP_RQ_ID_SHIFT = 0,
+ OCRDMA_CREATE_QP_RSP_RQ_ID_MASK = 0xFFFF,
+ OCRDMA_CREATE_QP_RSP_SQ_ID_SHIFT = 16,
+ OCRDMA_CREATE_QP_RSP_SQ_ID_MASK = 0xFFFF <<
+ OCRDMA_CREATE_QP_RSP_SQ_ID_SHIFT,
+
+ OCRDMA_CREATE_QP_RSP_DPP_ENABLED_MASK = BIT(0),
+ OCRDMA_CREATE_QP_RSP_DPP_PAGE_OFFSET_SHIFT = 1,
+ OCRDMA_CREATE_QP_RSP_DPP_PAGE_OFFSET_MASK = 0x7FFF <<
+ OCRDMA_CREATE_QP_RSP_DPP_PAGE_OFFSET_SHIFT,
+ OCRDMA_CREATE_QP_RSP_DPP_CREDITS_SHIFT = 16,
+ OCRDMA_CREATE_QP_RSP_DPP_CREDITS_MASK = 0xFFFF <<
+ OCRDMA_CREATE_QP_RSP_DPP_CREDITS_SHIFT,
+};
+
+struct ocrdma_create_qp_rsp {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_rsp rsp;
+
+ u32 qp_id;
+ u32 max_wqe_rqe;
+ u32 max_sge_send_write;
+ u32 max_sge_recv;
+ u32 max_ord_ird;
+ u32 sq_rq_id;
+ u32 dpp_response;
+};
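Decoding a response word is the mirror operation: mask first, then shift right. Two hedged accessors for the fields above (the names are illustrative only, not part of the driver):

static inline u16 ocrdma_sketch_rsp_qp_id(const struct ocrdma_create_qp_rsp *rsp)
{
	return rsp->qp_id & OCRDMA_CREATE_QP_RSP_QP_ID_MASK;
}

static inline u16 ocrdma_sketch_rsp_dpp_offset(const struct ocrdma_create_qp_rsp *rsp)
{
	return (rsp->dpp_response & OCRDMA_CREATE_QP_RSP_DPP_PAGE_OFFSET_MASK) >>
		OCRDMA_CREATE_QP_RSP_DPP_PAGE_OFFSET_SHIFT;
}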
+
+struct ocrdma_destroy_qp {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_hdr req;
+ u32 qp_id;
+};
+
+struct ocrdma_destroy_qp_rsp {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_rsp rsp;
+};
+
+enum {
+ OCRDMA_MODIFY_QP_ID_SHIFT = 0,
+ OCRDMA_MODIFY_QP_ID_MASK = 0xFFFF,
+
+ OCRDMA_QP_PARA_QPS_VALID = BIT(0),
+ OCRDMA_QP_PARA_SQD_ASYNC_VALID = BIT(1),
+ OCRDMA_QP_PARA_PKEY_VALID = BIT(2),
+ OCRDMA_QP_PARA_QKEY_VALID = BIT(3),
+ OCRDMA_QP_PARA_PMTU_VALID = BIT(4),
+ OCRDMA_QP_PARA_ACK_TO_VALID = BIT(5),
+ OCRDMA_QP_PARA_RETRY_CNT_VALID = BIT(6),
+ OCRDMA_QP_PARA_RRC_VALID = BIT(7),
+ OCRDMA_QP_PARA_RQPSN_VALID = BIT(8),
+ OCRDMA_QP_PARA_MAX_IRD_VALID = BIT(9),
+ OCRDMA_QP_PARA_MAX_ORD_VALID = BIT(10),
+ OCRDMA_QP_PARA_RNT_VALID = BIT(11),
+ OCRDMA_QP_PARA_SQPSN_VALID = BIT(12),
+ OCRDMA_QP_PARA_DST_QPN_VALID = BIT(13),
+ OCRDMA_QP_PARA_MAX_WQE_VALID = BIT(14),
+ OCRDMA_QP_PARA_MAX_RQE_VALID = BIT(15),
+ OCRDMA_QP_PARA_SGE_SEND_VALID = BIT(16),
+ OCRDMA_QP_PARA_SGE_RECV_VALID = BIT(17),
+ OCRDMA_QP_PARA_SGE_WR_VALID = BIT(18),
+ OCRDMA_QP_PARA_INB_RDEN_VALID = BIT(19),
+ OCRDMA_QP_PARA_INB_WREN_VALID = BIT(20),
+ OCRDMA_QP_PARA_FLOW_LBL_VALID = BIT(21),
+ OCRDMA_QP_PARA_BIND_EN_VALID = BIT(22),
+ OCRDMA_QP_PARA_ZLKEY_EN_VALID = BIT(23),
+ OCRDMA_QP_PARA_FMR_EN_VALID = BIT(24),
+ OCRDMA_QP_PARA_INBAT_EN_VALID = BIT(25),
+ OCRDMA_QP_PARA_VLAN_EN_VALID = BIT(26),
+
+ OCRDMA_MODIFY_QP_FLAGS_RD = BIT(0),
+ OCRDMA_MODIFY_QP_FLAGS_WR = BIT(1),
+ OCRDMA_MODIFY_QP_FLAGS_SEND = BIT(2),
+ OCRDMA_MODIFY_QP_FLAGS_ATOMIC = BIT(3)
+};
+
+enum {
+ OCRDMA_QP_PARAMS_SRQ_ID_SHIFT = 0,
+ OCRDMA_QP_PARAMS_SRQ_ID_MASK = 0xFFFF,
+
+ OCRDMA_QP_PARAMS_MAX_RQE_SHIFT = 0,
+ OCRDMA_QP_PARAMS_MAX_RQE_MASK = 0xFFFF,
+ OCRDMA_QP_PARAMS_MAX_WQE_SHIFT = 16,
+ OCRDMA_QP_PARAMS_MAX_WQE_MASK = 0xFFFF <<
+ OCRDMA_QP_PARAMS_MAX_WQE_SHIFT,
+
+ OCRDMA_QP_PARAMS_MAX_SGE_WRITE_SHIFT = 0,
+ OCRDMA_QP_PARAMS_MAX_SGE_WRITE_MASK = 0xFFFF,
+ OCRDMA_QP_PARAMS_MAX_SGE_SEND_SHIFT = 16,
+ OCRDMA_QP_PARAMS_MAX_SGE_SEND_MASK = 0xFFFF <<
+ OCRDMA_QP_PARAMS_MAX_SGE_SEND_SHIFT,
+
+ OCRDMA_QP_PARAMS_FLAGS_FMR_EN = BIT(0),
+ OCRDMA_QP_PARAMS_FLAGS_LKEY_0_EN = BIT(1),
+ OCRDMA_QP_PARAMS_FLAGS_BIND_MW_EN = BIT(2),
+ OCRDMA_QP_PARAMS_FLAGS_INBWR_EN = BIT(3),
+ OCRDMA_QP_PARAMS_FLAGS_INBRD_EN = BIT(4),
+ OCRDMA_QP_PARAMS_STATE_SHIFT = 5,
+ OCRDMA_QP_PARAMS_STATE_MASK = BIT(5) | BIT(6) | BIT(7),
+ OCRDMA_QP_PARAMS_FLAGS_SQD_ASYNC = BIT(8),
+ OCRDMA_QP_PARAMS_FLAGS_INB_ATEN = BIT(9),
+ OCRDMA_QP_PARAMS_MAX_SGE_RECV_SHIFT = 16,
+ OCRDMA_QP_PARAMS_MAX_SGE_RECV_MASK = 0xFFFF <<
+ OCRDMA_QP_PARAMS_MAX_SGE_RECV_SHIFT,
+
+ OCRDMA_QP_PARAMS_MAX_IRD_SHIFT = 0,
+ OCRDMA_QP_PARAMS_MAX_IRD_MASK = 0xFFFF,
+ OCRDMA_QP_PARAMS_MAX_ORD_SHIFT = 16,
+ OCRDMA_QP_PARAMS_MAX_ORD_MASK = 0xFFFF <<
+ OCRDMA_QP_PARAMS_MAX_ORD_SHIFT,
+
+ OCRDMA_QP_PARAMS_RQ_CQID_SHIFT = 0,
+ OCRDMA_QP_PARAMS_RQ_CQID_MASK = 0xFFFF,
+ OCRDMA_QP_PARAMS_WQ_CQID_SHIFT = 16,
+ OCRDMA_QP_PARAMS_WQ_CQID_MASK = 0xFFFF <<
+ OCRDMA_QP_PARAMS_WQ_CQID_SHIFT,
+
+ OCRDMA_QP_PARAMS_RQ_PSN_SHIFT = 0,
+ OCRDMA_QP_PARAMS_RQ_PSN_MASK = 0xFFFFFF,
+ OCRDMA_QP_PARAMS_HOP_LMT_SHIFT = 24,
+ OCRDMA_QP_PARAMS_HOP_LMT_MASK = 0xFF <<
+ OCRDMA_QP_PARAMS_HOP_LMT_SHIFT,
+
+ OCRDMA_QP_PARAMS_SQ_PSN_SHIFT = 0,
+ OCRDMA_QP_PARAMS_SQ_PSN_MASK = 0xFFFFFF,
+ OCRDMA_QP_PARAMS_TCLASS_SHIFT = 24,
+ OCRDMA_QP_PARAMS_TCLASS_MASK = 0xFF <<
+ OCRDMA_QP_PARAMS_TCLASS_SHIFT,
+
+ OCRDMA_QP_PARAMS_DEST_QPN_SHIFT = 0,
+ OCRDMA_QP_PARAMS_DEST_QPN_MASK = 0xFFFFFF,
+ OCRDMA_QP_PARAMS_RNR_RETRY_CNT_SHIFT = 24,
+ OCRDMA_QP_PARAMS_RNR_RETRY_CNT_MASK = 0x7 <<
+ OCRDMA_QP_PARAMS_RNR_RETRY_CNT_SHIFT,
+ OCRDMA_QP_PARAMS_ACK_TIMEOUT_SHIFT = 27,
+ OCRDMA_QP_PARAMS_ACK_TIMEOUT_MASK = 0x1F <<
+ OCRDMA_QP_PARAMS_ACK_TIMEOUT_SHIFT,
+
+ OCRDMA_QP_PARAMS_PKEY_INDEX_SHIFT = 0,
+ OCRDMA_QP_PARAMS_PKEY_INDEX_MASK = 0xFFFF,
+ OCRDMA_QP_PARAMS_PATH_MTU_SHIFT = 18,
+ OCRDMA_QP_PARAMS_PATH_MTU_MASK = 0x3FFF <<
+ OCRDMA_QP_PARAMS_PATH_MTU_SHIFT,
+
+ OCRDMA_QP_PARAMS_FLOW_LABEL_SHIFT = 0,
+ OCRDMA_QP_PARAMS_FLOW_LABEL_MASK = 0xFFFFF,
+ OCRDMA_QP_PARAMS_SL_SHIFT = 20,
+ OCRDMA_QP_PARAMS_SL_MASK = 0xF <<
+ OCRDMA_QP_PARAMS_SL_SHIFT,
+ OCRDMA_QP_PARAMS_RETRY_CNT_SHIFT = 24,
+ OCRDMA_QP_PARAMS_RETRY_CNT_MASK = 0x7 <<
+ OCRDMA_QP_PARAMS_RETRY_CNT_SHIFT,
+ OCRDMA_QP_PARAMS_RNR_NAK_TIMER_SHIFT = 27,
+ OCRDMA_QP_PARAMS_RNR_NAK_TIMER_MASK = 0x1F <<
+ OCRDMA_QP_PARAMS_RNR_NAK_TIMER_SHIFT,
+
+ OCRDMA_QP_PARAMS_DMAC_B4_TO_B5_SHIFT = 0,
+ OCRDMA_QP_PARAMS_DMAC_B4_TO_B5_MASK = 0xFFFF,
+ OCRDMA_QP_PARAMS_VLAN_SHIFT = 16,
+ OCRDMA_QP_PARAMS_VLAN_MASK = 0xFFFF <<
+ OCRDMA_QP_PARAMS_VLAN_SHIFT
+};
+
+struct ocrdma_qp_params {
+ u32 id;
+ u32 max_wqe_rqe;
+ u32 max_sge_send_write;
+ u32 max_sge_recv_flags;
+ u32 max_ord_ird;
+ u32 wq_rq_cqid;
+ u32 hop_lmt_rq_psn;
+ u32 tclass_sq_psn;
+ u32 ack_to_rnr_rtc_dest_qpn;
+ u32 path_mtu_pkey_indx;
+ u32 rnt_rc_sl_fl;
+ u8 sgid[16];
+ u8 dgid[16];
+ u32 dmac_b0_to_b3;
+ u32 vlan_dmac_b4_to_b5;
+ u32 qkey;
+};
+
+
+struct ocrdma_modify_qp {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_hdr req;
+
+ struct ocrdma_qp_params params;
+ u32 flags;
+ u32 rdma_flags;
+ u32 num_outstanding_atomic_rd;
+};
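flags carries one OCRDMA_QP_PARA_*_VALID bit per attribute actually filled in params, so firmware knows which words to honour. A sketch of a pure state transition, assuming only the definitions above (the helper name is hypothetical):

static void ocrdma_sketch_set_qp_state(struct ocrdma_modify_qp *cmd,
				       enum ocrdma_qp_state new_state)
{
	cmd->params.max_sge_recv_flags &= ~OCRDMA_QP_PARAMS_STATE_MASK;
	cmd->params.max_sge_recv_flags |=
		((u32)new_state << OCRDMA_QP_PARAMS_STATE_SHIFT) &
		OCRDMA_QP_PARAMS_STATE_MASK;
	cmd->flags |= OCRDMA_QP_PARA_QPS_VALID;
}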
+
+enum {
+ OCRDMA_MODIFY_QP_RSP_MAX_RQE_SHIFT = 0,
+ OCRDMA_MODIFY_QP_RSP_MAX_RQE_MASK = 0xFFFF,
+ OCRDMA_MODIFY_QP_RSP_MAX_WQE_SHIFT = 16,
+ OCRDMA_MODIFY_QP_RSP_MAX_WQE_MASK = 0xFFFF <<
+ OCRDMA_MODIFY_QP_RSP_MAX_WQE_SHIFT,
+
+ OCRDMA_MODIFY_QP_RSP_MAX_IRD_SHIFT = 0,
+ OCRDMA_MODIFY_QP_RSP_MAX_IRD_MASK = 0xFFFF,
+ OCRDMA_MODIFY_QP_RSP_MAX_ORD_SHIFT = 16,
+ OCRDMA_MODIFY_QP_RSP_MAX_ORD_MASK = 0xFFFF <<
+ OCRDMA_MODIFY_QP_RSP_MAX_ORD_SHIFT
+};
+
+struct ocrdma_modify_qp_rsp {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_rsp rsp;
+
+ u32 max_wqe_rqe;
+ u32 max_ord_ird;
+};
+
+struct ocrdma_query_qp {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_hdr req;
+
+#define OCRDMA_QUERY_UP_QP_ID_SHIFT 0
+#define OCRDMA_QUERY_UP_QP_ID_MASK 0xFFFFFF
+ u32 qp_id;
+};
+
+struct ocrdma_query_qp_rsp {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_rsp rsp;
+ struct ocrdma_qp_params params;
+ u32 dpp_credits_cqid;
+ u32 rbq_id;
+};
+
+enum {
+ OCRDMA_CREATE_SRQ_PD_ID_SHIFT = 0,
+ OCRDMA_CREATE_SRQ_PD_ID_MASK = 0xFFFF,
+ OCRDMA_CREATE_SRQ_PG_SZ_SHIFT = 16,
+ OCRDMA_CREATE_SRQ_PG_SZ_MASK = 0x3 <<
+ OCRDMA_CREATE_SRQ_PG_SZ_SHIFT,
+
+ OCRDMA_CREATE_SRQ_MAX_RQE_SHIFT = 0,
+ OCRDMA_CREATE_SRQ_MAX_SGE_RECV_SHIFT = 16,
+ OCRDMA_CREATE_SRQ_MAX_SGE_RECV_MASK = 0xFFFF <<
+ OCRDMA_CREATE_SRQ_MAX_SGE_RECV_SHIFT,
+
+ OCRDMA_CREATE_SRQ_RQE_SIZE_SHIFT = 0,
+ OCRDMA_CREATE_SRQ_RQE_SIZE_MASK = 0xFFFF,
+ OCRDMA_CREATE_SRQ_NUM_RQ_PAGES_SHIFT = 16,
+ OCRDMA_CREATE_SRQ_NUM_RQ_PAGES_MASK = 0xFFFF <<
+ OCRDMA_CREATE_SRQ_NUM_RQ_PAGES_SHIFT
+};
+
+struct ocrdma_create_srq {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_hdr req;
+
+ u32 pgsz_pdid;
+ u32 max_sge_rqe;
+ u32 pages_rqe_sz;
+ struct ocrdma_pa rq_addr[MAX_OCRDMA_SRQ_PAGES];
+};
+
+enum {
+ OCRDMA_CREATE_SRQ_RSP_SRQ_ID_SHIFT = 0,
+ OCRDMA_CREATE_SRQ_RSP_SRQ_ID_MASK = 0xFFFFFF,
+
+ OCRDMA_CREATE_SRQ_RSP_MAX_RQE_ALLOCATED_SHIFT = 0,
+ OCRDMA_CREATE_SRQ_RSP_MAX_RQE_ALLOCATED_MASK = 0xFFFF,
+ OCRDMA_CREATE_SRQ_RSP_MAX_SGE_RECV_ALLOCATED_SHIFT = 16,
+ OCRDMA_CREATE_SRQ_RSP_MAX_SGE_RECV_ALLOCATED_MASK = 0xFFFF <<
+ OCRDMA_CREATE_SRQ_RSP_MAX_SGE_RECV_ALLOCATED_SHIFT
+};
+
+struct ocrdma_create_srq_rsp {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_rsp rsp;
+
+ u32 id;
+ u32 max_sge_rqe_allocated;
+};
+
+enum {
+ OCRDMA_MODIFY_SRQ_ID_SHIFT = 0,
+ OCRDMA_MODIFY_SRQ_ID_MASK = 0xFFFFFF,
+
+ OCRDMA_MODIFY_SRQ_MAX_RQE_SHIFT = 0,
+ OCRDMA_MODIFY_SRQ_MAX_RQE_MASK = 0xFFFF,
+ OCRDMA_MODIFY_SRQ_LIMIT_SHIFT = 16,
+ OCRDMA_MODIFY_SRQ_LIMIT_MASK = 0xFFFF <<
+ OCRDMA_MODIFY_SRQ_LIMIT_SHIFT
+};
+
+struct ocrdma_modify_srq {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_rsp rep;
+
+ u32 id;
+ u32 limit_max_rqe;
+};
+
+enum {
+ OCRDMA_QUERY_SRQ_ID_SHIFT = 0,
+ OCRDMA_QUERY_SRQ_ID_MASK = 0xFFFFFF
+};
+
+struct ocrdma_query_srq {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_rsp req;
+
+ u32 id;
+};
+
+enum {
+ OCRDMA_QUERY_SRQ_RSP_PD_ID_SHIFT = 0,
+ OCRDMA_QUERY_SRQ_RSP_PD_ID_MASK = 0xFFFF,
+ OCRDMA_QUERY_SRQ_RSP_MAX_RQE_SHIFT = 16,
+ OCRDMA_QUERY_SRQ_RSP_MAX_RQE_MASK = 0xFFFF <<
+ OCRDMA_QUERY_SRQ_RSP_MAX_RQE_SHIFT,
+
+ OCRDMA_QUERY_SRQ_RSP_MAX_SGE_RECV_SHIFT = 0,
+ OCRDMA_QUERY_SRQ_RSP_MAX_SGE_RECV_MASK = 0xFFFF,
+ OCRDMA_QUERY_SRQ_RSP_SRQ_LIMIT_SHIFT = 16,
+ OCRDMA_QUERY_SRQ_RSP_SRQ_LIMIT_MASK = 0xFFFF <<
+ OCRDMA_QUERY_SRQ_RSP_SRQ_LIMIT_SHIFT
+};
+
+struct ocrdma_query_srq_rsp {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_rsp req;
+
+ u32 max_rqe_pdid;
+ u32 srq_lmt_max_sge;
+};
+
+enum {
+ OCRDMA_DESTROY_SRQ_ID_SHIFT = 0,
+ OCRDMA_DESTROY_SRQ_ID_MASK = 0xFFFFFF
+};
+
+struct ocrdma_destroy_srq {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_rsp req;
+
+ u32 id;
+};
+
+enum {
+ OCRDMA_ALLOC_PD_ENABLE_DPP = BIT(16),
+ OCRDMA_DPP_PAGE_SIZE = 4096
+};
+
+struct ocrdma_alloc_pd {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_hdr req;
+ u32 enable_dpp_rsvd;
+};
+
+enum {
+ OCRDMA_ALLOC_PD_RSP_DPP = BIT(16),
+ OCRDMA_ALLOC_PD_RSP_DPP_PAGE_SHIFT = 20,
+ OCRDMA_ALLOC_PD_RSP_PDID_MASK = 0xFFFF,
+};
+
+struct ocrdma_alloc_pd_rsp {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_rsp rsp;
+ u32 dpp_page_pdid;
+};
+
+struct ocrdma_dealloc_pd {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_hdr req;
+ u32 id;
+};
+
+struct ocrdma_dealloc_pd_rsp {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_rsp rsp;
+};
+
+struct ocrdma_alloc_pd_range {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_hdr req;
+ u32 enable_dpp_rsvd;
+ u32 pd_count;
+};
+
+struct ocrdma_alloc_pd_range_rsp {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_rsp rsp;
+ u32 dpp_page_pdid;
+ u32 pd_count;
+};
+
+enum {
+ OCRDMA_ALLOC_PD_RNG_RSP_START_PDID_MASK = 0xFFFF,
+};
+
+struct ocrdma_dealloc_pd_range {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_hdr req;
+ u32 start_pd_id;
+ u32 pd_count;
+};
+
+struct ocrdma_dealloc_pd_range_rsp {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_hdr req;
+ u32 rsvd;
+};
+
+enum {
+ OCRDMA_ADDR_CHECK_ENABLE = 1,
+ OCRDMA_ADDR_CHECK_DISABLE = 0
+};
+
+enum {
+ OCRDMA_ALLOC_LKEY_PD_ID_SHIFT = 0,
+ OCRDMA_ALLOC_LKEY_PD_ID_MASK = 0xFFFF,
+
+ OCRDMA_ALLOC_LKEY_ADDR_CHECK_SHIFT = 0,
+ OCRDMA_ALLOC_LKEY_ADDR_CHECK_MASK = BIT(0),
+ OCRDMA_ALLOC_LKEY_FMR_SHIFT = 1,
+ OCRDMA_ALLOC_LKEY_FMR_MASK = BIT(1),
+ OCRDMA_ALLOC_LKEY_REMOTE_INV_SHIFT = 2,
+ OCRDMA_ALLOC_LKEY_REMOTE_INV_MASK = BIT(2),
+ OCRDMA_ALLOC_LKEY_REMOTE_WR_SHIFT = 3,
+ OCRDMA_ALLOC_LKEY_REMOTE_WR_MASK = BIT(3),
+ OCRDMA_ALLOC_LKEY_REMOTE_RD_SHIFT = 4,
+ OCRDMA_ALLOC_LKEY_REMOTE_RD_MASK = BIT(4),
+ OCRDMA_ALLOC_LKEY_LOCAL_WR_SHIFT = 5,
+ OCRDMA_ALLOC_LKEY_LOCAL_WR_MASK = BIT(5),
+ OCRDMA_ALLOC_LKEY_REMOTE_ATOMIC_MASK = BIT(6),
+ OCRDMA_ALLOC_LKEY_REMOTE_ATOMIC_SHIFT = 6,
+ OCRDMA_ALLOC_LKEY_PBL_SIZE_SHIFT = 16,
+ OCRDMA_ALLOC_LKEY_PBL_SIZE_MASK = 0xFFFF <<
+ OCRDMA_ALLOC_LKEY_PBL_SIZE_SHIFT
+};
+
+struct ocrdma_alloc_lkey {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_hdr req;
+
+ u32 pdid;
+ u32 pbl_sz_flags;
+};
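pbl_sz_flags mixes the access-permission bits (low byte) with the PBL size (upper half-word). A hedged sketch of filling the command, using only the constants defined above:

static void ocrdma_sketch_fill_alloc_lkey(struct ocrdma_alloc_lkey *cmd,
					  u16 pd_id, u16 pbl_size, bool remote_wr)
{
	cmd->pdid = pd_id & OCRDMA_ALLOC_LKEY_PD_ID_MASK;
	cmd->pbl_sz_flags = ((u32)pbl_size << OCRDMA_ALLOC_LKEY_PBL_SIZE_SHIFT) &
			    OCRDMA_ALLOC_LKEY_PBL_SIZE_MASK;
	cmd->pbl_sz_flags |= OCRDMA_ALLOC_LKEY_LOCAL_WR_MASK;
	if (remote_wr)
		cmd->pbl_sz_flags |= OCRDMA_ALLOC_LKEY_REMOTE_WR_MASK;
}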
+
+struct ocrdma_alloc_lkey_rsp {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_rsp rsp;
+
+ u32 lrkey;
+ u32 num_pbl_rsvd;
+};
+
+struct ocrdma_dealloc_lkey {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_hdr req;
+
+ u32 lkey;
+ u32 rsvd_frmr;
+};
+
+struct ocrdma_dealloc_lkey_rsp {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_rsp rsp;
+};
+
+#define MAX_OCRDMA_NSMR_PBL (u32)22
+#define MAX_OCRDMA_PBL_SIZE 65536
+#define MAX_OCRDMA_PBL_PER_LKEY 32767
+
+enum {
+ OCRDMA_REG_NSMR_LRKEY_INDEX_SHIFT = 0,
+ OCRDMA_REG_NSMR_LRKEY_INDEX_MASK = 0xFFFFFF,
+ OCRDMA_REG_NSMR_LRKEY_SHIFT = 24,
+ OCRDMA_REG_NSMR_LRKEY_MASK = 0xFF <<
+ OCRDMA_REG_NSMR_LRKEY_SHIFT,
+
+ OCRDMA_REG_NSMR_PD_ID_SHIFT = 0,
+ OCRDMA_REG_NSMR_PD_ID_MASK = 0xFFFF,
+ OCRDMA_REG_NSMR_NUM_PBL_SHIFT = 16,
+ OCRDMA_REG_NSMR_NUM_PBL_MASK = 0xFFFF <<
+ OCRDMA_REG_NSMR_NUM_PBL_SHIFT,
+
+ OCRDMA_REG_NSMR_PBE_SIZE_SHIFT = 0,
+ OCRDMA_REG_NSMR_PBE_SIZE_MASK = 0xFFFF,
+ OCRDMA_REG_NSMR_HPAGE_SIZE_SHIFT = 16,
+ OCRDMA_REG_NSMR_HPAGE_SIZE_MASK = 0xFF <<
+ OCRDMA_REG_NSMR_HPAGE_SIZE_SHIFT,
+ OCRDMA_REG_NSMR_BIND_MEMWIN_SHIFT = 24,
+ OCRDMA_REG_NSMR_BIND_MEMWIN_MASK = BIT(24),
+ OCRDMA_REG_NSMR_ZB_SHIFT = 25,
+ OCRDMA_REG_NSMR_ZB_SHIFT_MASK = BIT(25),
+ OCRDMA_REG_NSMR_REMOTE_INV_SHIFT = 26,
+ OCRDMA_REG_NSMR_REMOTE_INV_MASK = BIT(26),
+ OCRDMA_REG_NSMR_REMOTE_WR_SHIFT = 27,
+ OCRDMA_REG_NSMR_REMOTE_WR_MASK = BIT(27),
+ OCRDMA_REG_NSMR_REMOTE_RD_SHIFT = 28,
+ OCRDMA_REG_NSMR_REMOTE_RD_MASK = BIT(28),
+ OCRDMA_REG_NSMR_LOCAL_WR_SHIFT = 29,
+ OCRDMA_REG_NSMR_LOCAL_WR_MASK = BIT(29),
+ OCRDMA_REG_NSMR_REMOTE_ATOMIC_SHIFT = 30,
+ OCRDMA_REG_NSMR_REMOTE_ATOMIC_MASK = BIT(30),
+ OCRDMA_REG_NSMR_LAST_SHIFT = 31,
+ OCRDMA_REG_NSMR_LAST_MASK = BIT(31)
+};
+
+struct ocrdma_reg_nsmr {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_hdr cmd;
+
+ u32 fr_mr;
+ u32 num_pbl_pdid;
+ u32 flags_hpage_pbe_sz;
+ u32 totlen_low;
+ u32 totlen_high;
+ u32 fbo_low;
+ u32 fbo_high;
+ u32 va_loaddr;
+ u32 va_hiaddr;
+ struct ocrdma_pa pbl[MAX_OCRDMA_NSMR_PBL];
+};
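num_pbl_pdid packs the PD id into the low half-word and the PBL count into the high one; lengths and virtual addresses are split into 32-bit halves because every mailbox word is a u32. A sketch under those assumptions (helper name is hypothetical):

static void ocrdma_sketch_fill_reg_nsmr(struct ocrdma_reg_nsmr *cmd,
					u16 pd_id, u16 num_pbl, u64 len, u64 va)
{
	cmd->num_pbl_pdid = (pd_id & OCRDMA_REG_NSMR_PD_ID_MASK) |
			    (((u32)num_pbl << OCRDMA_REG_NSMR_NUM_PBL_SHIFT) &
			     OCRDMA_REG_NSMR_NUM_PBL_MASK);
	cmd->totlen_low = lower_32_bits(len);
	cmd->totlen_high = upper_32_bits(len);
	cmd->va_loaddr = lower_32_bits(va);
	cmd->va_hiaddr = upper_32_bits(va);
}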
+
+enum {
+ OCRDMA_REG_NSMR_CONT_PBL_SHIFT = 0,
+ OCRDMA_REG_NSMR_CONT_PBL_SHIFT_MASK = 0xFFFF,
+ OCRDMA_REG_NSMR_CONT_NUM_PBL_SHIFT = 16,
+ OCRDMA_REG_NSMR_CONT_NUM_PBL_MASK = 0xFFFF <<
+ OCRDMA_REG_NSMR_CONT_NUM_PBL_SHIFT,
+
+ OCRDMA_REG_NSMR_CONT_LAST_SHIFT = 31,
+ OCRDMA_REG_NSMR_CONT_LAST_MASK = BIT(31)
+};
+
+struct ocrdma_reg_nsmr_cont {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_hdr cmd;
+
+ u32 lrkey;
+ u32 num_pbl_offset;
+ u32 last;
+
+ struct ocrdma_pa pbl[MAX_OCRDMA_NSMR_PBL];
+};
+
+struct ocrdma_pbe {
+ u32 pa_hi;
+ u32 pa_lo;
+};
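Each page-buffer entry is one DMA page address split across two u32 words. A minimal sketch of populating a PBL from a page list; the cpu_to_le32() conversion is an assumption based on the other little-endian mailbox payloads:

static void ocrdma_sketch_build_pbl(struct ocrdma_pbe *pbl,
				    const dma_addr_t *pages, int num_pages)
{
	int i;

	for (i = 0; i < num_pages; i++) {
		pbl[i].pa_lo = cpu_to_le32(lower_32_bits(pages[i]));
		pbl[i].pa_hi = cpu_to_le32(upper_32_bits(pages[i]));
	}
}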
+
+enum {
+ OCRDMA_REG_NSMR_RSP_NUM_PBL_SHIFT = 16,
+ OCRDMA_REG_NSMR_RSP_NUM_PBL_MASK = 0xFFFF0000
+};
+struct ocrdma_reg_nsmr_rsp {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_rsp rsp;
+
+ u32 lrkey;
+ u32 num_pbl;
+};
+
+enum {
+ OCRDMA_REG_NSMR_CONT_RSP_LRKEY_INDEX_SHIFT = 0,
+ OCRDMA_REG_NSMR_CONT_RSP_LRKEY_INDEX_MASK = 0xFFFFFF,
+ OCRDMA_REG_NSMR_CONT_RSP_LRKEY_SHIFT = 24,
+ OCRDMA_REG_NSMR_CONT_RSP_LRKEY_MASK = 0xFF <<
+ OCRDMA_REG_NSMR_CONT_RSP_LRKEY_SHIFT,
+
+ OCRDMA_REG_NSMR_CONT_RSP_NUM_PBL_SHIFT = 16,
+ OCRDMA_REG_NSMR_CONT_RSP_NUM_PBL_MASK = 0xFFFF <<
+ OCRDMA_REG_NSMR_CONT_RSP_NUM_PBL_SHIFT
+};
+
+struct ocrdma_reg_nsmr_cont_rsp {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_rsp rsp;
+
+ u32 lrkey_key_index;
+ u32 num_pbl;
+};
+
+enum {
+ OCRDMA_ALLOC_MW_PD_ID_SHIFT = 0,
+ OCRDMA_ALLOC_MW_PD_ID_MASK = 0xFFFF
+};
+
+struct ocrdma_alloc_mw {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_hdr req;
+
+ u32 pdid;
+};
+
+enum {
+ OCRDMA_ALLOC_MW_RSP_LRKEY_INDEX_SHIFT = 0,
+ OCRDMA_ALLOC_MW_RSP_LRKEY_INDEX_MASK = 0xFFFFFF
+};
+
+struct ocrdma_alloc_mw_rsp {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_rsp rsp;
+
+ u32 lrkey_index;
+};
+
+struct ocrdma_attach_mcast {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_hdr req;
+ u32 qp_id;
+ u8 mgid[16];
+ u32 mac_b0_to_b3;
+ u32 vlan_mac_b4_to_b5;
+};
+
+struct ocrdma_attach_mcast_rsp {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_rsp rsp;
+};
+
+struct ocrdma_detach_mcast {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_hdr req;
+ u32 qp_id;
+ u8 mgid[16];
+ u32 mac_b0_to_b3;
+ u32 vlan_mac_b4_to_b5;
+};
+
+struct ocrdma_detach_mcast_rsp {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_rsp rsp;
+};
+
+enum {
+ OCRDMA_CREATE_AH_NUM_PAGES_SHIFT = 19,
+ OCRDMA_CREATE_AH_NUM_PAGES_MASK = 0xF <<
+ OCRDMA_CREATE_AH_NUM_PAGES_SHIFT,
+
+ OCRDMA_CREATE_AH_PAGE_SIZE_SHIFT = 16,
+ OCRDMA_CREATE_AH_PAGE_SIZE_MASK = 0x7 <<
+ OCRDMA_CREATE_AH_PAGE_SIZE_SHIFT,
+
+ OCRDMA_CREATE_AH_ENTRY_SIZE_SHIFT = 23,
+ OCRDMA_CREATE_AH_ENTRY_SIZE_MASK = 0x1FF <<
+ OCRDMA_CREATE_AH_ENTRY_SIZE_SHIFT,
+};
+
+#define OCRDMA_AH_TBL_PAGES 8
+
+struct ocrdma_create_ah_tbl {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_hdr req;
+
+ u32 ah_conf;
+ struct ocrdma_pa tbl_addr[OCRDMA_AH_TBL_PAGES];
+};
+
+struct ocrdma_create_ah_tbl_rsp {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_rsp rsp;
+ u32 ahid;
+};
+
+struct ocrdma_delete_ah_tbl {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_hdr req;
+ u32 ahid;
+};
+
+struct ocrdma_delete_ah_tbl_rsp {
+ struct ocrdma_mqe_hdr hdr;
+ struct ocrdma_mbx_rsp rsp;
+};
+
+enum {
+ OCRDMA_EQE_VALID_SHIFT = 0,
+ OCRDMA_EQE_VALID_MASK = BIT(0),
+ OCRDMA_EQE_MAJOR_CODE_MASK = 0x0E,
+ OCRDMA_EQE_MAJOR_CODE_SHIFT = 0x01,
+ OCRDMA_EQE_FOR_CQE_MASK = 0xFFFE,
+ OCRDMA_EQE_RESOURCE_ID_SHIFT = 16,
+ OCRDMA_EQE_RESOURCE_ID_MASK = 0xFFFF <<
+ OCRDMA_EQE_RESOURCE_ID_SHIFT,
+};
+
+enum major_code {
+ OCRDMA_MAJOR_CODE_COMPLETION = 0x00,
+ OCRDMA_MAJOR_CODE_SENTINAL = 0x01
+};
+
+struct ocrdma_eqe {
+ u32 id_valid;
+};
+
+enum OCRDMA_CQE_STATUS {
+ OCRDMA_CQE_SUCCESS = 0,
+ OCRDMA_CQE_LOC_LEN_ERR,
+ OCRDMA_CQE_LOC_QP_OP_ERR,
+ OCRDMA_CQE_LOC_EEC_OP_ERR,
+ OCRDMA_CQE_LOC_PROT_ERR,
+ OCRDMA_CQE_WR_FLUSH_ERR,
+ OCRDMA_CQE_MW_BIND_ERR,
+ OCRDMA_CQE_BAD_RESP_ERR,
+ OCRDMA_CQE_LOC_ACCESS_ERR,
+ OCRDMA_CQE_REM_INV_REQ_ERR,
+ OCRDMA_CQE_REM_ACCESS_ERR,
+ OCRDMA_CQE_REM_OP_ERR,
+ OCRDMA_CQE_RETRY_EXC_ERR,
+ OCRDMA_CQE_RNR_RETRY_EXC_ERR,
+ OCRDMA_CQE_LOC_RDD_VIOL_ERR,
+ OCRDMA_CQE_REM_INV_RD_REQ_ERR,
+ OCRDMA_CQE_REM_ABORT_ERR,
+ OCRDMA_CQE_INV_EECN_ERR,
+ OCRDMA_CQE_INV_EEC_STATE_ERR,
+ OCRDMA_CQE_FATAL_ERR,
+ OCRDMA_CQE_RESP_TIMEOUT_ERR,
+ OCRDMA_CQE_GENERAL_ERR,
+
+ OCRDMA_MAX_CQE_ERR
+};
+
+enum {
+ /* w0 */
+ OCRDMA_CQE_WQEIDX_SHIFT = 0,
+ OCRDMA_CQE_WQEIDX_MASK = 0xFFFF,
+
+ /* w1 */
+ OCRDMA_CQE_UD_XFER_LEN_SHIFT = 16,
+ OCRDMA_CQE_PKEY_SHIFT = 0,
+ OCRDMA_CQE_PKEY_MASK = 0xFFFF,
+
+ /* w2 */
+ OCRDMA_CQE_QPN_SHIFT = 0,
+ OCRDMA_CQE_QPN_MASK = 0x0000FFFF,
+
+ OCRDMA_CQE_BUFTAG_SHIFT = 16,
+ OCRDMA_CQE_BUFTAG_MASK = 0xFFFF << OCRDMA_CQE_BUFTAG_SHIFT,
+
+ /* w3 */
+ OCRDMA_CQE_UD_STATUS_SHIFT = 24,
+ OCRDMA_CQE_UD_STATUS_MASK = 0x7 << OCRDMA_CQE_UD_STATUS_SHIFT,
+ OCRDMA_CQE_STATUS_SHIFT = 16,
+ OCRDMA_CQE_STATUS_MASK = 0xFF << OCRDMA_CQE_STATUS_SHIFT,
+ OCRDMA_CQE_VALID = BIT(31),
+ OCRDMA_CQE_INVALIDATE = BIT(30),
+ OCRDMA_CQE_QTYPE = BIT(29),
+ OCRDMA_CQE_IMM = BIT(28),
+ OCRDMA_CQE_WRITE_IMM = BIT(27),
+ OCRDMA_CQE_QTYPE_SQ = 0,
+ OCRDMA_CQE_QTYPE_RQ = 1,
+ OCRDMA_CQE_SRCQP_MASK = 0xFFFFFF
+};
+
+struct ocrdma_cqe {
+ union {
+ /* w0 to w2 */
+ struct {
+ u32 wqeidx;
+ u32 bytes_xfered;
+ u32 qpn;
+ } wq;
+ struct {
+ u32 lkey_immdt;
+ u32 rxlen;
+ u32 buftag_qpn;
+ } rq;
+ struct {
+ u32 lkey_immdt;
+ u32 rxlen_pkey;
+ u32 buftag_qpn;
+ } ud;
+ struct {
+ u32 word_0;
+ u32 word_1;
+ u32 qpn;
+ } cmn;
+ };
+ u32 flags_status_srcqpn; /* w3 */
+};
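All of the ownership and status decoding happens on the final word. A hedged sketch of how a poll routine would test validity and pull out the status code (le32_to_cpu() assumed for the hardware-written word; names are illustrative):

static bool ocrdma_sketch_cqe_valid(const struct ocrdma_cqe *cqe)
{
	return !!(le32_to_cpu(cqe->flags_status_srcqpn) & OCRDMA_CQE_VALID);
}

static int ocrdma_sketch_cqe_status(const struct ocrdma_cqe *cqe)
{
	u32 w3 = le32_to_cpu(cqe->flags_status_srcqpn);

	return (w3 & OCRDMA_CQE_STATUS_MASK) >> OCRDMA_CQE_STATUS_SHIFT;
}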
+
+struct ocrdma_sge {
+ u32 addr_hi;
+ u32 addr_lo;
+ u32 lrkey;
+ u32 len;
+};
+
+enum {
+ OCRDMA_FLAG_SIG = 0x1,
+ OCRDMA_FLAG_INV = 0x2,
+ OCRDMA_FLAG_FENCE_L = 0x4,
+ OCRDMA_FLAG_FENCE_R = 0x8,
+ OCRDMA_FLAG_SOLICIT = 0x10,
+ OCRDMA_FLAG_IMM = 0x20,
+ OCRDMA_FLAG_AH_VLAN_PR = 0x40,
+
+ /* Stag flags */
+ OCRDMA_LKEY_FLAG_LOCAL_WR = 0x1,
+ OCRDMA_LKEY_FLAG_REMOTE_RD = 0x2,
+ OCRDMA_LKEY_FLAG_REMOTE_WR = 0x4,
+ OCRDMA_LKEY_FLAG_VATO = 0x8,
+};
+
+enum OCRDMA_WQE_OPCODE {
+ OCRDMA_WRITE = 0x06,
+ OCRDMA_READ = 0x0C,
+ OCRDMA_RESV0 = 0x02,
+ OCRDMA_SEND = 0x00,
+ OCRDMA_CMP_SWP = 0x14,
+ OCRDMA_BIND_MW = 0x10,
+ OCRDMA_FR_MR = 0x11,
+ OCRDMA_RESV1 = 0x0A,
+ OCRDMA_LKEY_INV = 0x15,
+ OCRDMA_FETCH_ADD = 0x13,
+ OCRDMA_POST_RQ = 0x12
+};
+
+enum {
+ OCRDMA_TYPE_INLINE = 0x0,
+ OCRDMA_TYPE_LKEY = 0x1,
+};
+
+enum {
+ OCRDMA_WQE_OPCODE_SHIFT = 0,
+ OCRDMA_WQE_OPCODE_MASK = 0x0000001F,
+ OCRDMA_WQE_FLAGS_SHIFT = 5,
+ OCRDMA_WQE_TYPE_SHIFT = 16,
+ OCRDMA_WQE_TYPE_MASK = 0x00030000,
+ OCRDMA_WQE_SIZE_SHIFT = 18,
+ OCRDMA_WQE_SIZE_MASK = 0xFF,
+ OCRDMA_WQE_NXT_WQE_SIZE_SHIFT = 25,
+
+ OCRDMA_WQE_LKEY_FLAGS_SHIFT = 0,
+ OCRDMA_WQE_LKEY_FLAGS_MASK = 0xF
+};
+
+/* header WQE for all the SQ and RQ operations */
+struct ocrdma_hdr_wqe {
+ u32 cw;
+ union {
+ u32 rsvd_tag;
+ u32 rsvd_lkey_flags;
+ };
+ union {
+ u32 immdt;
+ u32 lkey;
+ };
+ u32 total_len;
+};
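The control word cw carries the opcode, the OCRDMA_FLAG_* bits, the SGE type and the WQE size. The sketch below only illustrates the bit layout; the units the hardware expects for the size field are not spelled out in this header, so treat that part as an assumption:

static void ocrdma_sketch_build_send_cw(struct ocrdma_hdr_wqe *hdr,
					u32 wqe_size, bool signaled)
{
	hdr->cw = OCRDMA_SEND & OCRDMA_WQE_OPCODE_MASK;
	hdr->cw |= ((u32)OCRDMA_TYPE_LKEY << OCRDMA_WQE_TYPE_SHIFT) &
		   OCRDMA_WQE_TYPE_MASK;
	hdr->cw |= (wqe_size & OCRDMA_WQE_SIZE_MASK) << OCRDMA_WQE_SIZE_SHIFT;
	if (signaled)
		hdr->cw |= OCRDMA_FLAG_SIG << OCRDMA_WQE_FLAGS_SHIFT;
}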
+
+struct ocrdma_ewqe_ud_hdr {
+ u32 rsvd_dest_qpn;
+ u32 qkey;
+ u32 rsvd_ahid;
+ u32 rsvd;
+};
+
+/* extended wqe that follows the hdr_wqe for a Fast Memory Register request */
+struct ocrdma_ewqe_fr {
+ u32 va_hi;
+ u32 va_lo;
+ u32 fbo_hi;
+ u32 fbo_lo;
+ u32 size_sge;
+ u32 num_sges;
+ u32 rsvd;
+ u32 rsvd2;
+};
+
+struct ocrdma_eth_basic {
+ u8 dmac[6];
+ u8 smac[6];
+ __be16 eth_type;
+} __packed;
+
+struct ocrdma_eth_vlan {
+ u8 dmac[6];
+ u8 smac[6];
+ __be16 eth_type;
+ __be16 vlan_tag;
+#define OCRDMA_ROCE_ETH_TYPE 0x8915
+ __be16 roce_eth_type;
+} __packed;
+
+struct ocrdma_grh {
+ __be32 tclass_flow;
+ __be32 pdid_hoplimit;
+ u8 sgid[16];
+ u8 dgid[16];
+ u16 rsvd;
+} __packed;
+
+#define OCRDMA_AV_VALID BIT(7)
+#define OCRDMA_AV_VLAN_VALID BIT(1)
+
+struct ocrdma_av {
+ struct ocrdma_eth_vlan eth_hdr;
+ struct ocrdma_grh grh;
+ u32 valid;
+} __packed;
+
+struct ocrdma_rsrc_stats {
+ u32 dpp_pds;
+ u32 non_dpp_pds;
+ u32 rc_dpp_qps;
+ u32 uc_dpp_qps;
+ u32 ud_dpp_qps;
+ u32 rc_non_dpp_qps;
+ u32 rsvd;
+ u32 uc_non_dpp_qps;
+ u32 ud_non_dpp_qps;
+ u32 rsvd1;
+ u32 srqs;
+ u32 rbqs;
+ u32 r64K_nsmr;
+ u32 r64K_to_2M_nsmr;
+ u32 r2M_to_44M_nsmr;
+ u32 r44M_to_1G_nsmr;
+ u32 r1G_to_4G_nsmr;
+ u32 nsmr_count_4G_to_32G;
+ u32 r32G_to_64G_nsmr;
+ u32 r64G_to_128G_nsmr;
+ u32 r128G_to_higher_nsmr;
+ u32 embedded_nsmr;
+ u32 frmr;
+ u32 prefetch_qps;
+ u32 ondemand_qps;
+ u32 phy_mr;
+ u32 mw;
+ u32 rsvd2[7];
+};
+
+struct ocrdma_db_err_stats {
+ u32 sq_doorbell_errors;
+ u32 cq_doorbell_errors;
+ u32 rq_srq_doorbell_errors;
+ u32 cq_overflow_errors;
+ u32 rsvd[4];
+};
+
+struct ocrdma_wqe_stats {
+ u32 large_send_rc_wqes_lo;
+ u32 large_send_rc_wqes_hi;
+ u32 large_write_rc_wqes_lo;
+ u32 large_write_rc_wqes_hi;
+ u32 rsvd[4];
+ u32 read_wqes_lo;
+ u32 read_wqes_hi;
+ u32 frmr_wqes_lo;
+ u32 frmr_wqes_hi;
+ u32 mw_bind_wqes_lo;
+ u32 mw_bind_wqes_hi;
+ u32 invalidate_wqes_lo;
+ u32 invalidate_wqes_hi;
+ u32 rsvd1[2];
+ u32 dpp_wqe_drops;
+ u32 rsvd2[5];
+};
+
+struct ocrdma_tx_stats {
+ u32 send_pkts_lo;
+ u32 send_pkts_hi;
+ u32 write_pkts_lo;
+ u32 write_pkts_hi;
+ u32 read_pkts_lo;
+ u32 read_pkts_hi;
+ u32 read_rsp_pkts_lo;
+ u32 read_rsp_pkts_hi;
+ u32 ack_pkts_lo;
+ u32 ack_pkts_hi;
+ u32 send_bytes_lo;
+ u32 send_bytes_hi;
+ u32 write_bytes_lo;
+ u32 write_bytes_hi;
+ u32 read_req_bytes_lo;
+ u32 read_req_bytes_hi;
+ u32 read_rsp_bytes_lo;
+ u32 read_rsp_bytes_hi;
+ u32 ack_timeouts;
+ u32 rsvd[5];
+};
+
+
+struct ocrdma_tx_qp_err_stats {
+ u32 local_length_errors;
+ u32 local_protection_errors;
+ u32 local_qp_operation_errors;
+ u32 retry_count_exceeded_errors;
+ u32 rnr_retry_count_exceeded_errors;
+ u32 rsvd[3];
+};
+
+struct ocrdma_rx_stats {
+ u32 roce_frame_bytes_lo;
+ u32 roce_frame_bytes_hi;
+ u32 roce_frame_icrc_drops;
+ u32 roce_frame_payload_len_drops;
+ u32 ud_drops;
+ u32 qp1_drops;
+ u32 psn_error_request_packets;
+ u32 psn_error_resp_packets;
+ u32 rnr_nak_timeouts;
+ u32 rnr_nak_receives;
+ u32 roce_frame_rxmt_drops;
+ u32 nak_count_psn_sequence_errors;
+ u32 rc_drop_count_lookup_errors;
+ u32 rq_rnr_naks;
+ u32 srq_rnr_naks;
+ u32 roce_frames_lo;
+ u32 roce_frames_hi;
+ u32 rsvd;
+};
+
+struct ocrdma_rx_qp_err_stats {
+ u32 nak_invalid_request_errors;
+ u32 nak_remote_operation_errors;
+ u32 nak_count_remote_access_errors;
+ u32 local_length_errors;
+ u32 local_protection_errors;
+ u32 local_qp_operation_errors;
+ u32 rsvd[2];
+};
+
+struct ocrdma_tx_dbg_stats {
+ u32 data[100];
+};
+
+struct ocrdma_rx_dbg_stats {
+ u32 data[200];
+};
+
+struct ocrdma_rdma_stats_req {
+ struct ocrdma_mbx_hdr hdr;
+ u8 reset_stats;
+ u8 rsvd[3];
+} __packed;
+
+struct ocrdma_rdma_stats_resp {
+ struct ocrdma_mbx_hdr hdr;
+ struct ocrdma_rsrc_stats act_rsrc_stats;
+ struct ocrdma_rsrc_stats th_rsrc_stats;
+ struct ocrdma_db_err_stats db_err_stats;
+ struct ocrdma_wqe_stats wqe_stats;
+ struct ocrdma_tx_stats tx_stats;
+ struct ocrdma_tx_qp_err_stats tx_qp_err_stats;
+ struct ocrdma_rx_stats rx_stats;
+ struct ocrdma_rx_qp_err_stats rx_qp_err_stats;
+ struct ocrdma_tx_dbg_stats tx_dbg_stats;
+ struct ocrdma_rx_dbg_stats rx_dbg_stats;
+} __packed;
+
+enum {
+ OCRDMA_HBA_ATTRB_EPROM_VER_LO_MASK = 0xFF,
+ OCRDMA_HBA_ATTRB_EPROM_VER_HI_MASK = 0xFF00,
+ OCRDMA_HBA_ATTRB_EPROM_VER_HI_SHIFT = 0x08,
+ OCRDMA_HBA_ATTRB_CDBLEN_MASK = 0xFFFF,
+ OCRDMA_HBA_ATTRB_ASIC_REV_MASK = 0xFF0000,
+ OCRDMA_HBA_ATTRB_ASIC_REV_SHIFT = 0x10,
+ OCRDMA_HBA_ATTRB_GUID0_MASK = 0xFF000000,
+ OCRDMA_HBA_ATTRB_GUID0_SHIFT = 0x18,
+ OCRDMA_HBA_ATTRB_GUID13_MASK = 0xFF,
+ OCRDMA_HBA_ATTRB_GUID14_MASK = 0xFF00,
+ OCRDMA_HBA_ATTRB_GUID14_SHIFT = 0x08,
+ OCRDMA_HBA_ATTRB_GUID15_MASK = 0xFF0000,
+ OCRDMA_HBA_ATTRB_GUID15_SHIFT = 0x10,
+ OCRDMA_HBA_ATTRB_PCNT_MASK = 0xFF000000,
+ OCRDMA_HBA_ATTRB_PCNT_SHIFT = 0x18,
+ OCRDMA_HBA_ATTRB_LDTOUT_MASK = 0xFFFF,
+ OCRDMA_HBA_ATTRB_ISCSI_VER_MASK = 0xFF0000,
+ OCRDMA_HBA_ATTRB_ISCSI_VER_SHIFT = 0x10,
+ OCRDMA_HBA_ATTRB_MFUNC_DEV_MASK = 0xFF000000,
+ OCRDMA_HBA_ATTRB_MFUNC_DEV_SHIFT = 0x18,
+ OCRDMA_HBA_ATTRB_CV_MASK = 0xFF,
+ OCRDMA_HBA_ATTRB_HBA_ST_MASK = 0xFF00,
+ OCRDMA_HBA_ATTRB_HBA_ST_SHIFT = 0x08,
+ OCRDMA_HBA_ATTRB_MAX_DOMS_MASK = 0xFF0000,
+ OCRDMA_HBA_ATTRB_MAX_DOMS_SHIFT = 0x10,
+ OCRDMA_HBA_ATTRB_PTNUM_MASK = 0x3F000000,
+ OCRDMA_HBA_ATTRB_PTNUM_SHIFT = 0x18,
+ OCRDMA_HBA_ATTRB_PT_MASK = 0xC0000000,
+ OCRDMA_HBA_ATTRB_PT_SHIFT = 0x1E,
+ OCRDMA_HBA_ATTRB_ISCSI_FET_MASK = 0xFF,
+ OCRDMA_HBA_ATTRB_ASIC_GEN_MASK = 0xFF00,
+ OCRDMA_HBA_ATTRB_ASIC_GEN_SHIFT = 0x08,
+ OCRDMA_HBA_ATTRB_PCI_VID_MASK = 0xFFFF,
+ OCRDMA_HBA_ATTRB_PCI_DID_MASK = 0xFFFF0000,
+ OCRDMA_HBA_ATTRB_PCI_DID_SHIFT = 0x10,
+ OCRDMA_HBA_ATTRB_PCI_SVID_MASK = 0xFFFF,
+ OCRDMA_HBA_ATTRB_PCI_SSID_MASK = 0xFFFF0000,
+ OCRDMA_HBA_ATTRB_PCI_SSID_SHIFT = 0x10,
+ OCRDMA_HBA_ATTRB_PCI_BUSNUM_MASK = 0xFF,
+ OCRDMA_HBA_ATTRB_PCI_DEVNUM_MASK = 0xFF00,
+ OCRDMA_HBA_ATTRB_PCI_DEVNUM_SHIFT = 0x08,
+ OCRDMA_HBA_ATTRB_PCI_FUNCNUM_MASK = 0xFF0000,
+ OCRDMA_HBA_ATTRB_PCI_FUNCNUM_SHIFT = 0x10,
+ OCRDMA_HBA_ATTRB_IF_TYPE_MASK = 0xFF000000,
+ OCRDMA_HBA_ATTRB_IF_TYPE_SHIFT = 0x18,
+ OCRDMA_HBA_ATTRB_NETFIL_MASK = 0xFF
+};
+
+struct mgmt_hba_attribs {
+ u8 flashrom_version_string[32];
+ u8 manufacturer_name[32];
+ u32 supported_modes;
+ u32 rsvd_eprom_verhi_verlo;
+ u32 mbx_ds_ver;
+ u32 epfw_ds_ver;
+ u8 ncsi_ver_string[12];
+ u32 default_extended_timeout;
+ u8 controller_model_number[32];
+ u8 controller_description[64];
+ u8 controller_serial_number[32];
+ u8 ip_version_string[32];
+ u8 firmware_version_string[32];
+ u8 bios_version_string[32];
+ u8 redboot_version_string[32];
+ u8 driver_version_string[32];
+ u8 fw_on_flash_version_string[32];
+ u32 functionalities_supported;
+ u32 guid0_asicrev_cdblen;
+ u8 generational_guid[12];
+ u32 portcnt_guid15;
+ u32 mfuncdev_iscsi_ldtout;
+ u32 ptpnum_maxdoms_hbast_cv;
+ u32 firmware_post_status;
+ u32 hba_mtu[8];
+ u32 res_asicgen_iscsi_feaures;
+ u32 rsvd1[3];
+};
+
+struct mgmt_controller_attrib {
+ struct mgmt_hba_attribs hba_attribs;
+ u32 pci_did_vid;
+ u32 pci_ssid_svid;
+ u32 ityp_fnum_devnum_bnum;
+ u32 uid_hi;
+ u32 uid_lo;
+ u32 res_nnetfil;
+ u32 rsvd0[4];
+};
+
+struct ocrdma_get_ctrl_attribs_rsp {
+ struct ocrdma_mbx_hdr hdr;
+ struct mgmt_controller_attrib ctrl_attribs;
+};
+
+#define OCRDMA_SUBSYS_DCBX 0x10
+
+enum OCRDMA_DCBX_OPCODE {
+ OCRDMA_CMD_GET_DCBX_CONFIG = 0x01
+};
+
+enum OCRDMA_DCBX_PARAM_TYPE {
+ OCRDMA_PARAMETER_TYPE_ADMIN = 0x00,
+ OCRDMA_PARAMETER_TYPE_OPER = 0x01,
+ OCRDMA_PARAMETER_TYPE_PEER = 0x02
+};
+
+enum OCRDMA_DCBX_APP_PROTO {
+ OCRDMA_APP_PROTO_ROCE = 0x8915
+};
+
+enum OCRDMA_DCBX_PROTO {
+ OCRDMA_PROTO_SELECT_L2 = 0x00,
+ OCRDMA_PROTO_SELECT_L4 = 0x01
+};
+
+enum OCRDMA_DCBX_APP_PARAM {
+ OCRDMA_APP_PARAM_APP_PROTO_MASK = 0xFFFF,
+ OCRDMA_APP_PARAM_PROTO_SEL_MASK = 0xFF,
+ OCRDMA_APP_PARAM_PROTO_SEL_SHIFT = 0x10,
+ OCRDMA_APP_PARAM_VALID_MASK = 0xFF,
+ OCRDMA_APP_PARAM_VALID_SHIFT = 0x18
+};
+
+enum OCRDMA_DCBX_STATE_FLAGS {
+ OCRDMA_STATE_FLAG_ENABLED = 0x01,
+ OCRDMA_STATE_FLAG_ADDVERTISED = 0x02,
+ OCRDMA_STATE_FLAG_WILLING = 0x04,
+ OCRDMA_STATE_FLAG_SYNC = 0x08,
+ OCRDMA_STATE_FLAG_UNSUPPORTED = 0x40000000,
+ OCRDMA_STATE_FLAG_NEG_FAILD = 0x80000000
+};
+
+enum OCRDMA_TCV_AEV_OPV_ST {
+ OCRDMA_DCBX_TC_SUPPORT_MASK = 0xFF,
+ OCRDMA_DCBX_TC_SUPPORT_SHIFT = 0x18,
+ OCRDMA_DCBX_APP_ENTRY_SHIFT = 0x10,
+ OCRDMA_DCBX_OP_PARAM_SHIFT = 0x08,
+ OCRDMA_DCBX_STATE_MASK = 0xFF
+};
+
+struct ocrdma_app_parameter {
+ u32 valid_proto_app;
+ u32 oui;
+ u32 app_prio[2];
+};
+
+struct ocrdma_dcbx_cfg {
+ u32 tcv_aev_opv_st;
+ u32 tc_state;
+ u32 pfc_state;
+ u32 qcn_state;
+ u32 appl_state;
+ u32 ll_state;
+ u32 tc_bw[2];
+ u32 tc_prio[8];
+ u32 pfc_prio[2];
+ struct ocrdma_app_parameter app_param[15];
+};
+
+struct ocrdma_get_dcbx_cfg_req {
+ struct ocrdma_mbx_hdr hdr;
+ u32 param_type;
+} __packed;
+
+struct ocrdma_get_dcbx_cfg_rsp {
+ struct ocrdma_mbx_rsp hdr;
+ struct ocrdma_dcbx_cfg cfg;
+} __packed;
+
+#endif /* __OCRDMA_SLI_H__ */
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.c b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c
new file mode 100644
index 000000000..48d7ef51a
--- /dev/null
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.c
@@ -0,0 +1,857 @@
+/*******************************************************************
+ * This file is part of the Emulex RoCE Device Driver for *
+ * RoCE (RDMA over Converged Ethernet) adapters. *
+ * Copyright (C) 2008-2014 Emulex. All rights reserved. *
+ * EMULEX and SLI are trademarks of Emulex. *
+ * www.emulex.com *
+ * *
+ * This program is free software; you can redistribute it and/or *
+ * modify it under the terms of version 2 of the GNU General *
+ * Public License as published by the Free Software Foundation. *
+ * This program is distributed in the hope that it will be useful. *
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND *
+ * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, *
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE *
+ * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD *
+ * TO BE LEGALLY INVALID. See the GNU General Public License for *
+ * more details, a copy of which can be found in the file COPYING *
+ * included with this package. *
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ *******************************************************************/
+
+#include <rdma/ib_addr.h>
+#include <rdma/ib_pma.h>
+#include "ocrdma_stats.h"
+
+static struct dentry *ocrdma_dbgfs_dir;
+
+static int ocrdma_add_stat(char *start, char *pcur,
+ char *name, u64 count)
+{
+ char buff[128] = {0};
+ int cpy_len = 0;
+
+ snprintf(buff, 128, "%s: %llu\n", name, count);
+ cpy_len = strlen(buff);
+
+ if (pcur + cpy_len > start + OCRDMA_MAX_DBGFS_MEM) {
+ pr_err("%s: No space in stats buff\n", __func__);
+ return 0;
+ }
+
+ memcpy(pcur, buff, cpy_len);
+ return cpy_len;
+}
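The return value is the number of bytes appended (or 0 on overflow), so callers keep a cursor and chain calls, which is exactly what the formatting routines below do. A small illustration of the same pattern with a hypothetical name/counter array:

static char *ocrdma_sketch_dump_counters(char *buf, char **names,
					 u64 *counters, int count)
{
	char *pcur = buf;
	int i;

	/* buf must point at OCRDMA_MAX_DBGFS_MEM bytes of zeroed memory */
	for (i = 0; i < count; i++)
		pcur += ocrdma_add_stat(buf, pcur, names[i], counters[i]);
	return buf;
}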
+
+static bool ocrdma_alloc_stats_mem(struct ocrdma_dev *dev)
+{
+ struct stats_mem *mem = &dev->stats_mem;
+
+ /* Alloc mbox command mem */
+ mem->size = max_t(u32, sizeof(struct ocrdma_rdma_stats_req),
+ sizeof(struct ocrdma_rdma_stats_resp));
+
+ mem->va = dma_alloc_coherent(&dev->nic_info.pdev->dev, mem->size,
+ &mem->pa, GFP_KERNEL);
+ if (!mem->va) {
+ pr_err("%s: stats mbox allocation failed\n", __func__);
+ return false;
+ }
+
+ memset(mem->va, 0, mem->size);
+
+ /* Alloc debugfs mem */
+ mem->debugfs_mem = kzalloc(OCRDMA_MAX_DBGFS_MEM, GFP_KERNEL);
+ if (!mem->debugfs_mem) {
+ pr_err("%s: stats debugfs mem allocation failed\n", __func__);
+ return false;
+ }
+
+ return true;
+}
+
+static void ocrdma_release_stats_mem(struct ocrdma_dev *dev)
+{
+ struct stats_mem *mem = &dev->stats_mem;
+
+ if (mem->va)
+ dma_free_coherent(&dev->nic_info.pdev->dev, mem->size,
+ mem->va, mem->pa);
+ kfree(mem->debugfs_mem);
+}
+
+static char *ocrdma_resource_stats(struct ocrdma_dev *dev)
+{
+ char *stats = dev->stats_mem.debugfs_mem, *pcur;
+ struct ocrdma_rdma_stats_resp *rdma_stats =
+ (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va;
+ struct ocrdma_rsrc_stats *rsrc_stats = &rdma_stats->act_rsrc_stats;
+
+ memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM));
+
+ pcur = stats;
+ pcur += ocrdma_add_stat(stats, pcur, "active_dpp_pds",
+ (u64)rsrc_stats->dpp_pds);
+ pcur += ocrdma_add_stat(stats, pcur, "active_non_dpp_pds",
+ (u64)rsrc_stats->non_dpp_pds);
+ pcur += ocrdma_add_stat(stats, pcur, "active_rc_dpp_qps",
+ (u64)rsrc_stats->rc_dpp_qps);
+ pcur += ocrdma_add_stat(stats, pcur, "active_uc_dpp_qps",
+ (u64)rsrc_stats->uc_dpp_qps);
+ pcur += ocrdma_add_stat(stats, pcur, "active_ud_dpp_qps",
+ (u64)rsrc_stats->ud_dpp_qps);
+ pcur += ocrdma_add_stat(stats, pcur, "active_rc_non_dpp_qps",
+ (u64)rsrc_stats->rc_non_dpp_qps);
+ pcur += ocrdma_add_stat(stats, pcur, "active_uc_non_dpp_qps",
+ (u64)rsrc_stats->uc_non_dpp_qps);
+ pcur += ocrdma_add_stat(stats, pcur, "active_ud_non_dpp_qps",
+ (u64)rsrc_stats->ud_non_dpp_qps);
+ pcur += ocrdma_add_stat(stats, pcur, "active_srqs",
+ (u64)rsrc_stats->srqs);
+ pcur += ocrdma_add_stat(stats, pcur, "active_rbqs",
+ (u64)rsrc_stats->rbqs);
+ pcur += ocrdma_add_stat(stats, pcur, "active_64K_nsmr",
+ (u64)rsrc_stats->r64K_nsmr);
+ pcur += ocrdma_add_stat(stats, pcur, "active_64K_to_2M_nsmr",
+ (u64)rsrc_stats->r64K_to_2M_nsmr);
+ pcur += ocrdma_add_stat(stats, pcur, "active_2M_to_44M_nsmr",
+ (u64)rsrc_stats->r2M_to_44M_nsmr);
+ pcur += ocrdma_add_stat(stats, pcur, "active_44M_to_1G_nsmr",
+ (u64)rsrc_stats->r44M_to_1G_nsmr);
+ pcur += ocrdma_add_stat(stats, pcur, "active_1G_to_4G_nsmr",
+ (u64)rsrc_stats->r1G_to_4G_nsmr);
+ pcur += ocrdma_add_stat(stats, pcur, "active_nsmr_count_4G_to_32G",
+ (u64)rsrc_stats->nsmr_count_4G_to_32G);
+ pcur += ocrdma_add_stat(stats, pcur, "active_32G_to_64G_nsmr",
+ (u64)rsrc_stats->r32G_to_64G_nsmr);
+ pcur += ocrdma_add_stat(stats, pcur, "active_64G_to_128G_nsmr",
+ (u64)rsrc_stats->r64G_to_128G_nsmr);
+ pcur += ocrdma_add_stat(stats, pcur, "active_128G_to_higher_nsmr",
+ (u64)rsrc_stats->r128G_to_higher_nsmr);
+ pcur += ocrdma_add_stat(stats, pcur, "active_embedded_nsmr",
+ (u64)rsrc_stats->embedded_nsmr);
+ pcur += ocrdma_add_stat(stats, pcur, "active_frmr",
+ (u64)rsrc_stats->frmr);
+ pcur += ocrdma_add_stat(stats, pcur, "active_prefetch_qps",
+ (u64)rsrc_stats->prefetch_qps);
+ pcur += ocrdma_add_stat(stats, pcur, "active_ondemand_qps",
+ (u64)rsrc_stats->ondemand_qps);
+ pcur += ocrdma_add_stat(stats, pcur, "active_phy_mr",
+ (u64)rsrc_stats->phy_mr);
+ pcur += ocrdma_add_stat(stats, pcur, "active_mw",
+ (u64)rsrc_stats->mw);
+
+ /* Print the threshold stats */
+ rsrc_stats = &rdma_stats->th_rsrc_stats;
+
+ pcur += ocrdma_add_stat(stats, pcur, "threshold_dpp_pds",
+ (u64)rsrc_stats->dpp_pds);
+ pcur += ocrdma_add_stat(stats, pcur, "threshold_non_dpp_pds",
+ (u64)rsrc_stats->non_dpp_pds);
+ pcur += ocrdma_add_stat(stats, pcur, "threshold_rc_dpp_qps",
+ (u64)rsrc_stats->rc_dpp_qps);
+ pcur += ocrdma_add_stat(stats, pcur, "threshold_uc_dpp_qps",
+ (u64)rsrc_stats->uc_dpp_qps);
+ pcur += ocrdma_add_stat(stats, pcur, "threshold_ud_dpp_qps",
+ (u64)rsrc_stats->ud_dpp_qps);
+ pcur += ocrdma_add_stat(stats, pcur, "threshold_rc_non_dpp_qps",
+ (u64)rsrc_stats->rc_non_dpp_qps);
+ pcur += ocrdma_add_stat(stats, pcur, "threshold_uc_non_dpp_qps",
+ (u64)rsrc_stats->uc_non_dpp_qps);
+ pcur += ocrdma_add_stat(stats, pcur, "threshold_ud_non_dpp_qps",
+ (u64)rsrc_stats->ud_non_dpp_qps);
+ pcur += ocrdma_add_stat(stats, pcur, "threshold_srqs",
+ (u64)rsrc_stats->srqs);
+ pcur += ocrdma_add_stat(stats, pcur, "threshold_rbqs",
+ (u64)rsrc_stats->rbqs);
+ pcur += ocrdma_add_stat(stats, pcur, "threshold_64K_nsmr",
+ (u64)rsrc_stats->r64K_nsmr);
+ pcur += ocrdma_add_stat(stats, pcur, "threshold_64K_to_2M_nsmr",
+ (u64)rsrc_stats->r64K_to_2M_nsmr);
+ pcur += ocrdma_add_stat(stats, pcur, "threshold_2M_to_44M_nsmr",
+ (u64)rsrc_stats->r2M_to_44M_nsmr);
+ pcur += ocrdma_add_stat(stats, pcur, "threshold_44M_to_1G_nsmr",
+ (u64)rsrc_stats->r44M_to_1G_nsmr);
+ pcur += ocrdma_add_stat(stats, pcur, "threshold_1G_to_4G_nsmr",
+ (u64)rsrc_stats->r1G_to_4G_nsmr);
+ pcur += ocrdma_add_stat(stats, pcur, "threshold_nsmr_count_4G_to_32G",
+ (u64)rsrc_stats->nsmr_count_4G_to_32G);
+ pcur += ocrdma_add_stat(stats, pcur, "threshold_32G_to_64G_nsmr",
+ (u64)rsrc_stats->r32G_to_64G_nsmr);
+ pcur += ocrdma_add_stat(stats, pcur, "threshold_64G_to_128G_nsmr",
+ (u64)rsrc_stats->r64G_to_128G_nsmr);
+ pcur += ocrdma_add_stat(stats, pcur, "threshold_128G_to_higher_nsmr",
+ (u64)rsrc_stats->r128G_to_higher_nsmr);
+ pcur += ocrdma_add_stat(stats, pcur, "threshold_embedded_nsmr",
+ (u64)rsrc_stats->embedded_nsmr);
+ pcur += ocrdma_add_stat(stats, pcur, "threshold_frmr",
+ (u64)rsrc_stats->frmr);
+ pcur += ocrdma_add_stat(stats, pcur, "threshold_prefetch_qps",
+ (u64)rsrc_stats->prefetch_qps);
+ pcur += ocrdma_add_stat(stats, pcur, "threshold_ondemand_qps",
+ (u64)rsrc_stats->ondemand_qps);
+ pcur += ocrdma_add_stat(stats, pcur, "threshold_phy_mr",
+ (u64)rsrc_stats->phy_mr);
+ pcur += ocrdma_add_stat(stats, pcur, "threshold_mw",
+ (u64)rsrc_stats->mw);
+ return stats;
+}
+
+static char *ocrdma_rx_stats(struct ocrdma_dev *dev)
+{
+ char *stats = dev->stats_mem.debugfs_mem, *pcur;
+ struct ocrdma_rdma_stats_resp *rdma_stats =
+ (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va;
+ struct ocrdma_rx_stats *rx_stats = &rdma_stats->rx_stats;
+
+ memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM));
+
+ pcur = stats;
+ pcur += ocrdma_add_stat
+ (stats, pcur, "roce_frame_bytes",
+ convert_to_64bit(rx_stats->roce_frame_bytes_lo,
+ rx_stats->roce_frame_bytes_hi));
+ pcur += ocrdma_add_stat(stats, pcur, "roce_frame_icrc_drops",
+ (u64)rx_stats->roce_frame_icrc_drops);
+ pcur += ocrdma_add_stat(stats, pcur, "roce_frame_payload_len_drops",
+ (u64)rx_stats->roce_frame_payload_len_drops);
+ pcur += ocrdma_add_stat(stats, pcur, "ud_drops",
+ (u64)rx_stats->ud_drops);
+ pcur += ocrdma_add_stat(stats, pcur, "qp1_drops",
+ (u64)rx_stats->qp1_drops);
+ pcur += ocrdma_add_stat(stats, pcur, "psn_error_request_packets",
+ (u64)rx_stats->psn_error_request_packets);
+ pcur += ocrdma_add_stat(stats, pcur, "psn_error_resp_packets",
+ (u64)rx_stats->psn_error_resp_packets);
+ pcur += ocrdma_add_stat(stats, pcur, "rnr_nak_timeouts",
+ (u64)rx_stats->rnr_nak_timeouts);
+ pcur += ocrdma_add_stat(stats, pcur, "rnr_nak_receives",
+ (u64)rx_stats->rnr_nak_receives);
+ pcur += ocrdma_add_stat(stats, pcur, "roce_frame_rxmt_drops",
+ (u64)rx_stats->roce_frame_rxmt_drops);
+ pcur += ocrdma_add_stat(stats, pcur, "nak_count_psn_sequence_errors",
+ (u64)rx_stats->nak_count_psn_sequence_errors);
+ pcur += ocrdma_add_stat(stats, pcur, "rc_drop_count_lookup_errors",
+ (u64)rx_stats->rc_drop_count_lookup_errors);
+ pcur += ocrdma_add_stat(stats, pcur, "rq_rnr_naks",
+ (u64)rx_stats->rq_rnr_naks);
+ pcur += ocrdma_add_stat(stats, pcur, "srq_rnr_naks",
+ (u64)rx_stats->srq_rnr_naks);
+ pcur += ocrdma_add_stat(stats, pcur, "roce_frames",
+ convert_to_64bit(rx_stats->roce_frames_lo,
+ rx_stats->roce_frames_hi));
+
+ return stats;
+}
+
+static u64 ocrdma_sysfs_rcv_pkts(struct ocrdma_dev *dev)
+{
+ struct ocrdma_rdma_stats_resp *rdma_stats =
+ (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va;
+ struct ocrdma_rx_stats *rx_stats = &rdma_stats->rx_stats;
+
+ return convert_to_64bit(rx_stats->roce_frames_lo,
+ rx_stats->roce_frames_hi) + (u64)rx_stats->roce_frame_icrc_drops
+ + (u64)rx_stats->roce_frame_payload_len_drops;
+}
+
+static u64 ocrdma_sysfs_rcv_data(struct ocrdma_dev *dev)
+{
+ struct ocrdma_rdma_stats_resp *rdma_stats =
+ (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va;
+ struct ocrdma_rx_stats *rx_stats = &rdma_stats->rx_stats;
+
+ return (convert_to_64bit(rx_stats->roce_frame_bytes_lo,
+ rx_stats->roce_frame_bytes_hi))/4;
+}
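convert_to_64bit() is not defined in this file; it comes from the included ocrdma_stats.h (only that header's opening license block appears at the end of this excerpt). From the call sites it clearly joins a lo/hi pair of 32-bit firmware counters into one 64-bit value, presumably along these lines. The divide by four above matches the PMA convention of reporting Port{Rcv,Xmit}Data in 32-bit words, though that reading is an inference, not something this file states.

/* A sketch of what that helper must reduce to: */
static inline u64 sketch_convert_to_64bit(u32 lo, u32 hi)
{
	return ((u64)hi << 32) | (u64)lo;
}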
+
+static char *ocrdma_tx_stats(struct ocrdma_dev *dev)
+{
+ char *stats = dev->stats_mem.debugfs_mem, *pcur;
+ struct ocrdma_rdma_stats_resp *rdma_stats =
+ (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va;
+ struct ocrdma_tx_stats *tx_stats = &rdma_stats->tx_stats;
+
+ memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM));
+
+ pcur = stats;
+ pcur += ocrdma_add_stat(stats, pcur, "send_pkts",
+ convert_to_64bit(tx_stats->send_pkts_lo,
+ tx_stats->send_pkts_hi));
+ pcur += ocrdma_add_stat(stats, pcur, "write_pkts",
+ convert_to_64bit(tx_stats->write_pkts_lo,
+ tx_stats->write_pkts_hi));
+ pcur += ocrdma_add_stat(stats, pcur, "read_pkts",
+ convert_to_64bit(tx_stats->read_pkts_lo,
+ tx_stats->read_pkts_hi));
+ pcur += ocrdma_add_stat(stats, pcur, "read_rsp_pkts",
+ convert_to_64bit(tx_stats->read_rsp_pkts_lo,
+ tx_stats->read_rsp_pkts_hi));
+ pcur += ocrdma_add_stat(stats, pcur, "ack_pkts",
+ convert_to_64bit(tx_stats->ack_pkts_lo,
+ tx_stats->ack_pkts_hi));
+ pcur += ocrdma_add_stat(stats, pcur, "send_bytes",
+ convert_to_64bit(tx_stats->send_bytes_lo,
+ tx_stats->send_bytes_hi));
+ pcur += ocrdma_add_stat(stats, pcur, "write_bytes",
+ convert_to_64bit(tx_stats->write_bytes_lo,
+ tx_stats->write_bytes_hi));
+ pcur += ocrdma_add_stat(stats, pcur, "read_req_bytes",
+ convert_to_64bit(tx_stats->read_req_bytes_lo,
+ tx_stats->read_req_bytes_hi));
+ pcur += ocrdma_add_stat(stats, pcur, "read_rsp_bytes",
+ convert_to_64bit(tx_stats->read_rsp_bytes_lo,
+ tx_stats->read_rsp_bytes_hi));
+ pcur += ocrdma_add_stat(stats, pcur, "ack_timeouts",
+ (u64)tx_stats->ack_timeouts);
+
+ return stats;
+}
+
+static u64 ocrdma_sysfs_xmit_pkts(struct ocrdma_dev *dev)
+{
+ struct ocrdma_rdma_stats_resp *rdma_stats =
+ (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va;
+ struct ocrdma_tx_stats *tx_stats = &rdma_stats->tx_stats;
+
+ return (convert_to_64bit(tx_stats->send_pkts_lo,
+ tx_stats->send_pkts_hi) +
+ convert_to_64bit(tx_stats->write_pkts_lo, tx_stats->write_pkts_hi) +
+ convert_to_64bit(tx_stats->read_pkts_lo, tx_stats->read_pkts_hi) +
+ convert_to_64bit(tx_stats->read_rsp_pkts_lo,
+ tx_stats->read_rsp_pkts_hi) +
+ convert_to_64bit(tx_stats->ack_pkts_lo, tx_stats->ack_pkts_hi));
+}
+
+static u64 ocrdma_sysfs_xmit_data(struct ocrdma_dev *dev)
+{
+ struct ocrdma_rdma_stats_resp *rdma_stats =
+ (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va;
+ struct ocrdma_tx_stats *tx_stats = &rdma_stats->tx_stats;
+
+ return (convert_to_64bit(tx_stats->send_bytes_lo,
+ tx_stats->send_bytes_hi) +
+ convert_to_64bit(tx_stats->write_bytes_lo,
+ tx_stats->write_bytes_hi) +
+ convert_to_64bit(tx_stats->read_req_bytes_lo,
+ tx_stats->read_req_bytes_hi) +
+ convert_to_64bit(tx_stats->read_rsp_bytes_lo,
+ tx_stats->read_rsp_bytes_hi))/4;
+}
+
+static char *ocrdma_wqe_stats(struct ocrdma_dev *dev)
+{
+ char *stats = dev->stats_mem.debugfs_mem, *pcur;
+ struct ocrdma_rdma_stats_resp *rdma_stats =
+ (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va;
+ struct ocrdma_wqe_stats *wqe_stats = &rdma_stats->wqe_stats;
+
+ memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM));
+
+ pcur = stats;
+ pcur += ocrdma_add_stat(stats, pcur, "large_send_rc_wqes",
+ convert_to_64bit(wqe_stats->large_send_rc_wqes_lo,
+ wqe_stats->large_send_rc_wqes_hi));
+ pcur += ocrdma_add_stat(stats, pcur, "large_write_rc_wqes",
+ convert_to_64bit(wqe_stats->large_write_rc_wqes_lo,
+ wqe_stats->large_write_rc_wqes_hi));
+ pcur += ocrdma_add_stat(stats, pcur, "read_wqes",
+ convert_to_64bit(wqe_stats->read_wqes_lo,
+ wqe_stats->read_wqes_hi));
+ pcur += ocrdma_add_stat(stats, pcur, "frmr_wqes",
+ convert_to_64bit(wqe_stats->frmr_wqes_lo,
+ wqe_stats->frmr_wqes_hi));
+ pcur += ocrdma_add_stat(stats, pcur, "mw_bind_wqes",
+ convert_to_64bit(wqe_stats->mw_bind_wqes_lo,
+ wqe_stats->mw_bind_wqes_hi));
+ pcur += ocrdma_add_stat(stats, pcur, "invalidate_wqes",
+ convert_to_64bit(wqe_stats->invalidate_wqes_lo,
+ wqe_stats->invalidate_wqes_hi));
+ pcur += ocrdma_add_stat(stats, pcur, "dpp_wqe_drops",
+ (u64)wqe_stats->dpp_wqe_drops);
+ return stats;
+}
+
+static char *ocrdma_db_errstats(struct ocrdma_dev *dev)
+{
+ char *stats = dev->stats_mem.debugfs_mem, *pcur;
+ struct ocrdma_rdma_stats_resp *rdma_stats =
+ (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va;
+ struct ocrdma_db_err_stats *db_err_stats = &rdma_stats->db_err_stats;
+
+ memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM));
+
+ pcur = stats;
+ pcur += ocrdma_add_stat(stats, pcur, "sq_doorbell_errors",
+ (u64)db_err_stats->sq_doorbell_errors);
+ pcur += ocrdma_add_stat(stats, pcur, "cq_doorbell_errors",
+ (u64)db_err_stats->cq_doorbell_errors);
+ pcur += ocrdma_add_stat(stats, pcur, "rq_srq_doorbell_errors",
+ (u64)db_err_stats->rq_srq_doorbell_errors);
+ pcur += ocrdma_add_stat(stats, pcur, "cq_overflow_errors",
+ (u64)db_err_stats->cq_overflow_errors);
+ return stats;
+}
+
+static char *ocrdma_rxqp_errstats(struct ocrdma_dev *dev)
+{
+ char *stats = dev->stats_mem.debugfs_mem, *pcur;
+ struct ocrdma_rdma_stats_resp *rdma_stats =
+ (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va;
+ struct ocrdma_rx_qp_err_stats *rx_qp_err_stats =
+ &rdma_stats->rx_qp_err_stats;
+
+ memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM));
+
+ pcur = stats;
+ pcur += ocrdma_add_stat(stats, pcur, "nak_invalid_requst_errors",
+ (u64)rx_qp_err_stats->nak_invalid_requst_errors);
+ pcur += ocrdma_add_stat(stats, pcur, "nak_remote_operation_errors",
+ (u64)rx_qp_err_stats->nak_remote_operation_errors);
+ pcur += ocrdma_add_stat(stats, pcur, "nak_count_remote_access_errors",
+ (u64)rx_qp_err_stats->nak_count_remote_access_errors);
+ pcur += ocrdma_add_stat(stats, pcur, "local_length_errors",
+ (u64)rx_qp_err_stats->local_length_errors);
+ pcur += ocrdma_add_stat(stats, pcur, "local_protection_errors",
+ (u64)rx_qp_err_stats->local_protection_errors);
+ pcur += ocrdma_add_stat(stats, pcur, "local_qp_operation_errors",
+ (u64)rx_qp_err_stats->local_qp_operation_errors);
+ return stats;
+}
+
+static char *ocrdma_txqp_errstats(struct ocrdma_dev *dev)
+{
+ char *stats = dev->stats_mem.debugfs_mem, *pcur;
+ struct ocrdma_rdma_stats_resp *rdma_stats =
+ (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va;
+ struct ocrdma_tx_qp_err_stats *tx_qp_err_stats =
+ &rdma_stats->tx_qp_err_stats;
+
+ memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM));
+
+ pcur = stats;
+ pcur += ocrdma_add_stat(stats, pcur, "local_length_errors",
+ (u64)tx_qp_err_stats->local_length_errors);
+ pcur += ocrdma_add_stat(stats, pcur, "local_protection_errors",
+ (u64)tx_qp_err_stats->local_protection_errors);
+ pcur += ocrdma_add_stat(stats, pcur, "local_qp_operation_errors",
+ (u64)tx_qp_err_stats->local_qp_operation_errors);
+ pcur += ocrdma_add_stat(stats, pcur, "retry_count_exceeded_errors",
+ (u64)tx_qp_err_stats->retry_count_exceeded_errors);
+ pcur += ocrdma_add_stat(stats, pcur, "rnr_retry_count_exceeded_errors",
+ (u64)tx_qp_err_stats->rnr_retry_count_exceeded_errors);
+ return stats;
+}
+
+static char *ocrdma_tx_dbg_stats(struct ocrdma_dev *dev)
+{
+ int i;
+ char *pstats = dev->stats_mem.debugfs_mem;
+ struct ocrdma_rdma_stats_resp *rdma_stats =
+ (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va;
+ struct ocrdma_tx_dbg_stats *tx_dbg_stats =
+ &rdma_stats->tx_dbg_stats;
+
+ memset(pstats, 0, (OCRDMA_MAX_DBGFS_MEM));
+
+ for (i = 0; i < 100; i++)
+ pstats += snprintf(pstats, 80, "DW[%d] = 0x%x\n", i,
+ tx_dbg_stats->data[i]);
+
+ return dev->stats_mem.debugfs_mem;
+}
+
+static char *ocrdma_rx_dbg_stats(struct ocrdma_dev *dev)
+{
+ int i;
+ char *pstats = dev->stats_mem.debugfs_mem;
+ struct ocrdma_rdma_stats_resp *rdma_stats =
+ (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va;
+ struct ocrdma_rx_dbg_stats *rx_dbg_stats =
+ &rdma_stats->rx_dbg_stats;
+
+ memset(pstats, 0, (OCRDMA_MAX_DBGFS_MEM));
+
+ for (i = 0; i < 200; i++)
+ pstats += snprintf(pstats, 80, "DW[%d] = 0x%x\n", i,
+ rx_dbg_stats->data[i]);
+
+ return dev->stats_mem.debugfs_mem;
+}
+
+static char *ocrdma_driver_dbg_stats(struct ocrdma_dev *dev)
+{
+ char *stats = dev->stats_mem.debugfs_mem, *pcur;
+
+
+ memset(stats, 0, (OCRDMA_MAX_DBGFS_MEM));
+
+ pcur = stats;
+ pcur += ocrdma_add_stat(stats, pcur, "async_cq_err",
+ (u64)(dev->async_err_stats
+ [OCRDMA_CQ_ERROR].counter));
+ pcur += ocrdma_add_stat(stats, pcur, "async_cq_overrun_err",
+ (u64)dev->async_err_stats
+ [OCRDMA_CQ_OVERRUN_ERROR].counter);
+ pcur += ocrdma_add_stat(stats, pcur, "async_cq_qpcat_err",
+ (u64)dev->async_err_stats
+ [OCRDMA_CQ_QPCAT_ERROR].counter);
+ pcur += ocrdma_add_stat(stats, pcur, "async_qp_access_err",
+ (u64)dev->async_err_stats
+ [OCRDMA_QP_ACCESS_ERROR].counter);
+ pcur += ocrdma_add_stat(stats, pcur, "async_qp_commm_est_evt",
+ (u64)dev->async_err_stats
+ [OCRDMA_QP_COMM_EST_EVENT].counter);
+ pcur += ocrdma_add_stat(stats, pcur, "async_sq_drained_evt",
+ (u64)dev->async_err_stats
+ [OCRDMA_SQ_DRAINED_EVENT].counter);
+ pcur += ocrdma_add_stat(stats, pcur, "async_dev_fatal_evt",
+ (u64)dev->async_err_stats
+ [OCRDMA_DEVICE_FATAL_EVENT].counter);
+ pcur += ocrdma_add_stat(stats, pcur, "async_srqcat_err",
+ (u64)dev->async_err_stats
+ [OCRDMA_SRQCAT_ERROR].counter);
+ pcur += ocrdma_add_stat(stats, pcur, "async_srq_limit_evt",
+ (u64)dev->async_err_stats
+ [OCRDMA_SRQ_LIMIT_EVENT].counter);
+ pcur += ocrdma_add_stat(stats, pcur, "async_qp_last_wqe_evt",
+ (u64)dev->async_err_stats
+ [OCRDMA_QP_LAST_WQE_EVENT].counter);
+
+ pcur += ocrdma_add_stat(stats, pcur, "cqe_loc_len_err",
+ (u64)dev->cqe_err_stats
+ [OCRDMA_CQE_LOC_LEN_ERR].counter);
+ pcur += ocrdma_add_stat(stats, pcur, "cqe_loc_qp_op_err",
+ (u64)dev->cqe_err_stats
+ [OCRDMA_CQE_LOC_QP_OP_ERR].counter);
+ pcur += ocrdma_add_stat(stats, pcur, "cqe_loc_eec_op_err",
+ (u64)dev->cqe_err_stats
+ [OCRDMA_CQE_LOC_EEC_OP_ERR].counter);
+ pcur += ocrdma_add_stat(stats, pcur, "cqe_loc_prot_err",
+ (u64)dev->cqe_err_stats
+ [OCRDMA_CQE_LOC_PROT_ERR].counter);
+ pcur += ocrdma_add_stat(stats, pcur, "cqe_wr_flush_err",
+ (u64)dev->cqe_err_stats
+ [OCRDMA_CQE_WR_FLUSH_ERR].counter);
+ pcur += ocrdma_add_stat(stats, pcur, "cqe_mw_bind_err",
+ (u64)dev->cqe_err_stats
+ [OCRDMA_CQE_MW_BIND_ERR].counter);
+ pcur += ocrdma_add_stat(stats, pcur, "cqe_bad_resp_err",
+ (u64)dev->cqe_err_stats
+ [OCRDMA_CQE_BAD_RESP_ERR].counter);
+ pcur += ocrdma_add_stat(stats, pcur, "cqe_loc_access_err",
+ (u64)dev->cqe_err_stats
+ [OCRDMA_CQE_LOC_ACCESS_ERR].counter);
+ pcur += ocrdma_add_stat(stats, pcur, "cqe_rem_inv_req_err",
+ (u64)dev->cqe_err_stats
+ [OCRDMA_CQE_REM_INV_REQ_ERR].counter);
+ pcur += ocrdma_add_stat(stats, pcur, "cqe_rem_access_err",
+ (u64)dev->cqe_err_stats
+ [OCRDMA_CQE_REM_ACCESS_ERR].counter);
+ pcur += ocrdma_add_stat(stats, pcur, "cqe_rem_op_err",
+ (u64)dev->cqe_err_stats
+ [OCRDMA_CQE_REM_OP_ERR].counter);
+ pcur += ocrdma_add_stat(stats, pcur, "cqe_retry_exc_err",
+ (u64)dev->cqe_err_stats
+ [OCRDMA_CQE_RETRY_EXC_ERR].counter);
+ pcur += ocrdma_add_stat(stats, pcur, "cqe_rnr_retry_exc_err",
+ (u64)dev->cqe_err_stats
+ [OCRDMA_CQE_RNR_RETRY_EXC_ERR].counter);
+ pcur += ocrdma_add_stat(stats, pcur, "cqe_loc_rdd_viol_err",
+ (u64)dev->cqe_err_stats
+ [OCRDMA_CQE_LOC_RDD_VIOL_ERR].counter);
+ pcur += ocrdma_add_stat(stats, pcur, "cqe_rem_inv_rd_req_err",
+ (u64)dev->cqe_err_stats
+ [OCRDMA_CQE_REM_INV_RD_REQ_ERR].counter);
+ pcur += ocrdma_add_stat(stats, pcur, "cqe_rem_abort_err",
+ (u64)dev->cqe_err_stats
+ [OCRDMA_CQE_REM_ABORT_ERR].counter);
+ pcur += ocrdma_add_stat(stats, pcur, "cqe_inv_eecn_err",
+ (u64)dev->cqe_err_stats
+ [OCRDMA_CQE_INV_EECN_ERR].counter);
+ pcur += ocrdma_add_stat(stats, pcur, "cqe_inv_eec_state_err",
+ (u64)dev->cqe_err_stats
+ [OCRDMA_CQE_INV_EEC_STATE_ERR].counter);
+ pcur += ocrdma_add_stat(stats, pcur, "cqe_fatal_err",
+ (u64)dev->cqe_err_stats
+ [OCRDMA_CQE_FATAL_ERR].counter);
+ pcur += ocrdma_add_stat(stats, pcur, "cqe_resp_timeout_err",
+ (u64)dev->cqe_err_stats
+ [OCRDMA_CQE_RESP_TIMEOUT_ERR].counter);
+ pcur += ocrdma_add_stat(stats, pcur, "cqe_general_err",
+ (u64)dev->cqe_err_stats
+ [OCRDMA_CQE_GENERAL_ERR].counter);
+ return stats;
+}
+
+static void ocrdma_update_stats(struct ocrdma_dev *dev)
+{
+ ulong now = jiffies, secs;
+ int status = 0;
+ struct ocrdma_rdma_stats_resp *rdma_stats =
+ (struct ocrdma_rdma_stats_resp *)dev->stats_mem.va;
+ struct ocrdma_rsrc_stats *rsrc_stats = &rdma_stats->act_rsrc_stats;
+
+ secs = jiffies_to_msecs(now - dev->last_stats_time) / 1000U;
+ if (secs) {
+ /* update */
+ status = ocrdma_mbx_rdma_stats(dev, false);
+ if (status)
+ pr_err("%s: stats mbox failed with status = %d\n",
+ __func__, status);
+ /* Update PD counters from PD resource manager */
+ if (dev->pd_mgr->pd_prealloc_valid) {
+ rsrc_stats->dpp_pds = dev->pd_mgr->pd_dpp_count;
+ rsrc_stats->non_dpp_pds = dev->pd_mgr->pd_norm_count;
+ /* Threshold stats */
+ rsrc_stats = &rdma_stats->th_rsrc_stats;
+ rsrc_stats->dpp_pds = dev->pd_mgr->pd_dpp_thrsh;
+ rsrc_stats->non_dpp_pds = dev->pd_mgr->pd_norm_thrsh;
+ }
+ dev->last_stats_time = jiffies;
+ }
+}
+
+static ssize_t ocrdma_dbgfs_ops_write(struct file *filp,
+ const char __user *buffer,
+ size_t count, loff_t *ppos)
+{
+ char tmp_str[32];
+ long reset;
+ int status = 0;
+ struct ocrdma_stats *pstats = filp->private_data;
+ struct ocrdma_dev *dev = pstats->dev;
+
+ if (count > 32)
+ goto err;
+
+ if (copy_from_user(tmp_str, buffer, count))
+ goto err;
+
+ tmp_str[count-1] = '\0';
+ if (kstrtol(tmp_str, 10, &reset))
+ goto err;
+
+ switch (pstats->type) {
+ case OCRDMA_RESET_STATS:
+ if (reset) {
+ status = ocrdma_mbx_rdma_stats(dev, true);
+ if (status) {
+ pr_err("Failed to reset stats = %d", status);
+ goto err;
+ }
+ }
+ break;
+ default:
+ goto err;
+ }
+
+ return count;
+err:
+ return -EFAULT;
+}
+
+int ocrdma_pma_counters(struct ocrdma_dev *dev,
+ struct ib_mad *out_mad)
+{
+ struct ib_pma_portcounters *pma_cnt;
+
+ memset(out_mad->data, 0, sizeof out_mad->data);
+ pma_cnt = (void *)(out_mad->data + 40);
+ ocrdma_update_stats(dev);
+
+ pma_cnt->port_xmit_data = cpu_to_be32(ocrdma_sysfs_xmit_data(dev));
+ pma_cnt->port_rcv_data = cpu_to_be32(ocrdma_sysfs_rcv_data(dev));
+ pma_cnt->port_xmit_packets = cpu_to_be32(ocrdma_sysfs_xmit_pkts(dev));
+ pma_cnt->port_rcv_packets = cpu_to_be32(ocrdma_sysfs_rcv_pkts(dev));
+ return 0;
+}
+
+static ssize_t ocrdma_dbgfs_ops_read(struct file *filp, char __user *buffer,
+ size_t usr_buf_len, loff_t *ppos)
+{
+ struct ocrdma_stats *pstats = filp->private_data;
+ struct ocrdma_dev *dev = pstats->dev;
+ ssize_t status = 0;
+ char *data = NULL;
+
+ /* No partial reads */
+ if (*ppos != 0)
+ return 0;
+
+ mutex_lock(&dev->stats_lock);
+
+ ocrdma_update_stats(dev);
+
+ switch (pstats->type) {
+ case OCRDMA_RSRC_STATS:
+ data = ocrdma_resource_stats(dev);
+ break;
+ case OCRDMA_RXSTATS:
+ data = ocrdma_rx_stats(dev);
+ break;
+ case OCRDMA_WQESTATS:
+ data = ocrdma_wqe_stats(dev);
+ break;
+ case OCRDMA_TXSTATS:
+ data = ocrdma_tx_stats(dev);
+ break;
+ case OCRDMA_DB_ERRSTATS:
+ data = ocrdma_db_errstats(dev);
+ break;
+ case OCRDMA_RXQP_ERRSTATS:
+ data = ocrdma_rxqp_errstats(dev);
+ break;
+ case OCRDMA_TXQP_ERRSTATS:
+ data = ocrdma_txqp_errstats(dev);
+ break;
+ case OCRDMA_TX_DBG_STATS:
+ data = ocrdma_tx_dbg_stats(dev);
+ break;
+ case OCRDMA_RX_DBG_STATS:
+ data = ocrdma_rx_dbg_stats(dev);
+ break;
+ case OCRDMA_DRV_STATS:
+ data = ocrdma_driver_dbg_stats(dev);
+ break;
+
+ default:
+ status = -EFAULT;
+ goto exit;
+ }
+
+ if (usr_buf_len < strlen(data)) {
+ status = -ENOSPC;
+ goto exit;
+ }
+
+ status = simple_read_from_buffer(buffer, usr_buf_len, ppos, data,
+ strlen(data));
+exit:
+ mutex_unlock(&dev->stats_lock);
+ return status;
+}
+
+static const struct file_operations ocrdma_dbg_ops = {
+ .owner = THIS_MODULE,
+ .open = simple_open,
+ .read = ocrdma_dbgfs_ops_read,
+ .write = ocrdma_dbgfs_ops_write,
+};
+
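+/* Create the per-device debugfs directory (named after the ibdev) with
+ * one file per stats group, all sharing ocrdma_dbg_ops, then allocate
+ * the DMA buffer used by the stats mailbox command. Any failure tears
+ * the whole directory down again.
+ */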
+void ocrdma_add_port_stats(struct ocrdma_dev *dev)
+{
+ if (!ocrdma_dbgfs_dir)
+ return;
+
+ /* Create per-port stats base dir */
+ dev->dir = debugfs_create_dir(dev->ibdev.name, ocrdma_dbgfs_dir);
+ if (!dev->dir)
+ goto err;
+
+ dev->rsrc_stats.type = OCRDMA_RSRC_STATS;
+ dev->rsrc_stats.dev = dev;
+ if (!debugfs_create_file("resource_stats", S_IRUSR, dev->dir,
+ &dev->rsrc_stats, &ocrdma_dbg_ops))
+ goto err;
+
+ dev->rx_stats.type = OCRDMA_RXSTATS;
+ dev->rx_stats.dev = dev;
+ if (!debugfs_create_file("rx_stats", S_IRUSR, dev->dir,
+ &dev->rx_stats, &ocrdma_dbg_ops))
+ goto err;
+
+ dev->wqe_stats.type = OCRDMA_WQESTATS;
+ dev->wqe_stats.dev = dev;
+ if (!debugfs_create_file("wqe_stats", S_IRUSR, dev->dir,
+ &dev->wqe_stats, &ocrdma_dbg_ops))
+ goto err;
+
+ dev->tx_stats.type = OCRDMA_TXSTATS;
+ dev->tx_stats.dev = dev;
+ if (!debugfs_create_file("tx_stats", S_IRUSR, dev->dir,
+ &dev->tx_stats, &ocrdma_dbg_ops))
+ goto err;
+
+ dev->db_err_stats.type = OCRDMA_DB_ERRSTATS;
+ dev->db_err_stats.dev = dev;
+ if (!debugfs_create_file("db_err_stats", S_IRUSR, dev->dir,
+ &dev->db_err_stats, &ocrdma_dbg_ops))
+ goto err;
+
+
+ dev->tx_qp_err_stats.type = OCRDMA_TXQP_ERRSTATS;
+ dev->tx_qp_err_stats.dev = dev;
+ if (!debugfs_create_file("tx_qp_err_stats", S_IRUSR, dev->dir,
+ &dev->tx_qp_err_stats, &ocrdma_dbg_ops))
+ goto err;
+
+ dev->rx_qp_err_stats.type = OCRDMA_RXQP_ERRSTATS;
+ dev->rx_qp_err_stats.dev = dev;
+ if (!debugfs_create_file("rx_qp_err_stats", S_IRUSR, dev->dir,
+ &dev->rx_qp_err_stats, &ocrdma_dbg_ops))
+ goto err;
+
+
+ dev->tx_dbg_stats.type = OCRDMA_TX_DBG_STATS;
+ dev->tx_dbg_stats.dev = dev;
+ if (!debugfs_create_file("tx_dbg_stats", S_IRUSR, dev->dir,
+ &dev->tx_dbg_stats, &ocrdma_dbg_ops))
+ goto err;
+
+ dev->rx_dbg_stats.type = OCRDMA_RX_DBG_STATS;
+ dev->rx_dbg_stats.dev = dev;
+ if (!debugfs_create_file("rx_dbg_stats", S_IRUSR, dev->dir,
+ &dev->rx_dbg_stats, &ocrdma_dbg_ops))
+ goto err;
+
+ dev->driver_stats.type = OCRDMA_DRV_STATS;
+ dev->driver_stats.dev = dev;
+ if (!debugfs_create_file("driver_dbg_stats", S_IRUSR, dev->dir,
+ &dev->driver_stats, &ocrdma_dbg_ops))
+ goto err;
+
+ dev->reset_stats.type = OCRDMA_RESET_STATS;
+ dev->reset_stats.dev = dev;
+ if (!debugfs_create_file("reset_stats", S_IRUSR, dev->dir,
+ &dev->reset_stats, &ocrdma_dbg_ops))
+ goto err;
+
+ /* Now create dma_mem for stats mbx command */
+ if (!ocrdma_alloc_stats_mem(dev))
+ goto err;
+
+ mutex_init(&dev->stats_lock);
+
+ return;
+err:
+ ocrdma_release_stats_mem(dev);
+ debugfs_remove_recursive(dev->dir);
+ dev->dir = NULL;
+}
+
+void ocrdma_rem_port_stats(struct ocrdma_dev *dev)
+{
+ if (!dev->dir)
+ return;
+ mutex_destroy(&dev->stats_lock);
+ ocrdma_release_stats_mem(dev);
+ debugfs_remove(dev->dir);
+}
+
+void ocrdma_init_debugfs(void)
+{
+ /* Create base dir in debugfs root dir */
+ ocrdma_dbgfs_dir = debugfs_create_dir("ocrdma", NULL);
+}
+
+void ocrdma_rem_debugfs(void)
+{
+ debugfs_remove_recursive(ocrdma_dbgfs_dir);
+}
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_stats.h b/drivers/infiniband/hw/ocrdma/ocrdma_stats.h
new file mode 100644
index 000000000..091edd68a
--- /dev/null
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_stats.h
@@ -0,0 +1,58 @@
+/*******************************************************************
+ * This file is part of the Emulex RoCE Device Driver for *
+ * RoCE (RDMA over Converged Ethernet) adapters. *
+ * Copyright (C) 2008-2014 Emulex. All rights reserved. *
+ * EMULEX and SLI are trademarks of Emulex. *
+ * www.emulex.com *
+ * *
+ * This program is free software; you can redistribute it and/or *
+ * modify it under the terms of version 2 of the GNU General *
+ * Public License as published by the Free Software Foundation. *
+ * This program is distributed in the hope that it will be useful. *
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND *
+ * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, *
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE *
+ * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD *
+ * TO BE LEGALLY INVALID. See the GNU General Public License for *
+ * more details, a copy of which can be found in the file COPYING *
+ * included with this package. *
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ *******************************************************************/
+
+#ifndef __OCRDMA_STATS_H__
+#define __OCRDMA_STATS_H__
+
+#include <linux/debugfs.h>
+#include "ocrdma.h"
+#include "ocrdma_hw.h"
+
+#define OCRDMA_MAX_DBGFS_MEM 4096
+
+enum OCRDMA_STATS_TYPE {
+ OCRDMA_RSRC_STATS,
+ OCRDMA_RXSTATS,
+ OCRDMA_WQESTATS,
+ OCRDMA_TXSTATS,
+ OCRDMA_DB_ERRSTATS,
+ OCRDMA_RXQP_ERRSTATS,
+ OCRDMA_TXQP_ERRSTATS,
+ OCRDMA_TX_DBG_STATS,
+ OCRDMA_RX_DBG_STATS,
+ OCRDMA_DRV_STATS,
+ OCRDMA_RESET_STATS
+};
+
+void ocrdma_rem_debugfs(void);
+void ocrdma_init_debugfs(void);
+void ocrdma_rem_port_stats(struct ocrdma_dev *dev);
+void ocrdma_add_port_stats(struct ocrdma_dev *dev);
+int ocrdma_pma_counters(struct ocrdma_dev *dev,
+ struct ib_mad *out_mad);
+
+#endif /* __OCRDMA_STATS_H__ */
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
new file mode 100644
index 000000000..219f2122f
--- /dev/null
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c
@@ -0,0 +1,3194 @@
+/*******************************************************************
+ * This file is part of the Emulex RoCE Device Driver for *
+ * RoCE (RDMA over Converged Ethernet) adapters. *
+ * Copyright (C) 2008-2012 Emulex. All rights reserved. *
+ * EMULEX and SLI are trademarks of Emulex. *
+ * www.emulex.com *
+ * *
+ * This program is free software; you can redistribute it and/or *
+ * modify it under the terms of version 2 of the GNU General *
+ * Public License as published by the Free Software Foundation. *
+ * This program is distributed in the hope that it will be useful. *
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND *
+ * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, *
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE *
+ * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD *
+ * TO BE LEGALLY INVALID. See the GNU General Public License for *
+ * more details, a copy of which can be found in the file COPYING *
+ * included with this package. *
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ *******************************************************************/
+
+#include <linux/dma-mapping.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_user_verbs.h>
+#include <rdma/iw_cm.h>
+#include <rdma/ib_umem.h>
+#include <rdma/ib_addr.h>
+
+#include "ocrdma.h"
+#include "ocrdma_hw.h"
+#include "ocrdma_verbs.h"
+#include "ocrdma_abi.h"
+
+int ocrdma_query_pkey(struct ib_device *ibdev, u8 port, u16 index, u16 *pkey)
+{
+ if (index > 1)
+ return -EINVAL;
+
+ *pkey = 0xffff;
+ return 0;
+}
+
+int ocrdma_query_gid(struct ib_device *ibdev, u8 port,
+ int index, union ib_gid *sgid)
+{
+ struct ocrdma_dev *dev;
+
+ dev = get_ocrdma_dev(ibdev);
+ memset(sgid, 0, sizeof(*sgid));
+ if (index >= OCRDMA_MAX_SGID)
+ return -EINVAL;
+
+ memcpy(sgid, &dev->sgid_tbl[index], sizeof(*sgid));
+
+ return 0;
+}
+
+int ocrdma_query_device(struct ib_device *ibdev, struct ib_device_attr *attr)
+{
+ struct ocrdma_dev *dev = get_ocrdma_dev(ibdev);
+
+ memset(attr, 0, sizeof *attr);
+ memcpy(&attr->fw_ver, &dev->attr.fw_ver[0],
+ min(sizeof(dev->attr.fw_ver), sizeof(attr->fw_ver)));
+ ocrdma_get_guid(dev, (u8 *)&attr->sys_image_guid);
+ attr->max_mr_size = dev->attr.max_mr_size;
+ attr->page_size_cap = 0xffff000;
+ attr->vendor_id = dev->nic_info.pdev->vendor;
+ attr->vendor_part_id = dev->nic_info.pdev->device;
+ attr->hw_ver = dev->asic_id;
+ attr->max_qp = dev->attr.max_qp;
+ attr->max_ah = OCRDMA_MAX_AH;
+ attr->max_qp_wr = dev->attr.max_wqe;
+
+ attr->device_cap_flags = IB_DEVICE_CURR_QP_STATE_MOD |
+ IB_DEVICE_RC_RNR_NAK_GEN |
+ IB_DEVICE_SHUTDOWN_PORT |
+ IB_DEVICE_SYS_IMAGE_GUID |
+ IB_DEVICE_LOCAL_DMA_LKEY |
+ IB_DEVICE_MEM_MGT_EXTENSIONS;
+ attr->max_sge = min(dev->attr.max_send_sge, dev->attr.max_srq_sge);
+ attr->max_sge_rd = 0;
+ attr->max_cq = dev->attr.max_cq;
+ attr->max_cqe = dev->attr.max_cqe;
+ attr->max_mr = dev->attr.max_mr;
+ attr->max_mw = dev->attr.max_mw;
+ attr->max_pd = dev->attr.max_pd;
+ attr->atomic_cap = 0;
+ attr->max_fmr = 0;
+ attr->max_map_per_fmr = 0;
+ attr->max_qp_rd_atom =
+ min(dev->attr.max_ord_per_qp, dev->attr.max_ird_per_qp);
+ attr->max_qp_init_rd_atom = dev->attr.max_ord_per_qp;
+ attr->max_srq = dev->attr.max_srq;
+ attr->max_srq_sge = dev->attr.max_srq_sge;
+ attr->max_srq_wr = dev->attr.max_rqe;
+ attr->local_ca_ack_delay = dev->attr.local_ca_ack_delay;
+ attr->max_fast_reg_page_list_len = dev->attr.max_pages_per_frmr;
+ attr->max_pkeys = 1;
+ return 0;
+}
+
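+/* Map the PHY link speed reported by firmware to an approximate IB
+ * speed/width pair (e.g. 10G -> QDR x1, 40G -> QDR x4); fall back to
+ * SDR x1 if the query fails or the speed is unknown.
+ */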
+static inline void get_link_speed_and_width(struct ocrdma_dev *dev,
+ u8 *ib_speed, u8 *ib_width)
+{
+ int status;
+ u8 speed;
+
+ status = ocrdma_mbx_get_link_speed(dev, &speed);
+ if (status)
+ speed = OCRDMA_PHYS_LINK_SPEED_ZERO;
+
+ switch (speed) {
+ case OCRDMA_PHYS_LINK_SPEED_1GBPS:
+ *ib_speed = IB_SPEED_SDR;
+ *ib_width = IB_WIDTH_1X;
+ break;
+
+ case OCRDMA_PHYS_LINK_SPEED_10GBPS:
+ *ib_speed = IB_SPEED_QDR;
+ *ib_width = IB_WIDTH_1X;
+ break;
+
+ case OCRDMA_PHYS_LINK_SPEED_20GBPS:
+ *ib_speed = IB_SPEED_DDR;
+ *ib_width = IB_WIDTH_4X;
+ break;
+
+ case OCRDMA_PHYS_LINK_SPEED_40GBPS:
+ *ib_speed = IB_SPEED_QDR;
+ *ib_width = IB_WIDTH_4X;
+ break;
+
+ default:
+ /* Unsupported */
+ *ib_speed = IB_SPEED_SDR;
+ *ib_width = IB_WIDTH_1X;
+ }
+}
+
+int ocrdma_query_port(struct ib_device *ibdev,
+ u8 port, struct ib_port_attr *props)
+{
+ enum ib_port_state port_state;
+ struct ocrdma_dev *dev;
+ struct net_device *netdev;
+
+ dev = get_ocrdma_dev(ibdev);
+ if (port > 1) {
+ pr_err("%s(%d) invalid_port=0x%x\n", __func__,
+ dev->id, port);
+ return -EINVAL;
+ }
+ netdev = dev->nic_info.netdev;
+ if (netif_running(netdev) && netif_oper_up(netdev)) {
+ port_state = IB_PORT_ACTIVE;
+ props->phys_state = 5;
+ } else {
+ port_state = IB_PORT_DOWN;
+ props->phys_state = 3;
+ }
+ props->max_mtu = IB_MTU_4096;
+ props->active_mtu = iboe_get_mtu(netdev->mtu);
+ props->lid = 0;
+ props->lmc = 0;
+ props->sm_lid = 0;
+ props->sm_sl = 0;
+ props->state = port_state;
+ props->port_cap_flags =
+ IB_PORT_CM_SUP |
+ IB_PORT_REINIT_SUP |
+ IB_PORT_DEVICE_MGMT_SUP | IB_PORT_VENDOR_CLASS_SUP | IB_PORT_IP_BASED_GIDS;
+ props->gid_tbl_len = OCRDMA_MAX_SGID;
+ props->pkey_tbl_len = 1;
+ props->bad_pkey_cntr = 0;
+ props->qkey_viol_cntr = 0;
+ get_link_speed_and_width(dev, &props->active_speed,
+ &props->active_width);
+ props->max_msg_sz = 0x80000000;
+ props->max_vl_num = 4;
+ return 0;
+}
+
+int ocrdma_modify_port(struct ib_device *ibdev, u8 port, int mask,
+ struct ib_port_modify *props)
+{
+ struct ocrdma_dev *dev;
+
+ dev = get_ocrdma_dev(ibdev);
+ if (port > 1) {
+ pr_err("%s(%d) invalid_port=0x%x\n", __func__, dev->id, port);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int ocrdma_add_mmap(struct ocrdma_ucontext *uctx, u64 phy_addr,
+ unsigned long len)
+{
+ struct ocrdma_mm *mm;
+
+ mm = kzalloc(sizeof(*mm), GFP_KERNEL);
+ if (mm == NULL)
+ return -ENOMEM;
+ mm->key.phy_addr = phy_addr;
+ mm->key.len = len;
+ INIT_LIST_HEAD(&mm->entry);
+
+ mutex_lock(&uctx->mm_list_lock);
+ list_add_tail(&mm->entry, &uctx->mm_head);
+ mutex_unlock(&uctx->mm_list_lock);
+ return 0;
+}
+
+static void ocrdma_del_mmap(struct ocrdma_ucontext *uctx, u64 phy_addr,
+ unsigned long len)
+{
+ struct ocrdma_mm *mm, *tmp;
+
+ mutex_lock(&uctx->mm_list_lock);
+ list_for_each_entry_safe(mm, tmp, &uctx->mm_head, entry) {
+ if (len != mm->key.len && phy_addr != mm->key.phy_addr)
+ continue;
+
+ list_del(&mm->entry);
+ kfree(mm);
+ break;
+ }
+ mutex_unlock(&uctx->mm_list_lock);
+}
+
+static bool ocrdma_search_mmap(struct ocrdma_ucontext *uctx, u64 phy_addr,
+ unsigned long len)
+{
+ bool found = false;
+ struct ocrdma_mm *mm;
+
+ mutex_lock(&uctx->mm_list_lock);
+ list_for_each_entry(mm, &uctx->mm_head, entry) {
+ if (len != mm->key.len && phy_addr != mm->key.phy_addr)
+ continue;
+
+ found = true;
+ break;
+ }
+ mutex_unlock(&uctx->mm_list_lock);
+ return found;
+}
+
+
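+/* Allocate a PD index from the preallocated DPP or normal PD bitmap
+ * and track the high-water mark (thrsh) reported in the threshold
+ * resource stats. Callers hold dev->dev_lock.
+ */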
+static u16 _ocrdma_pd_mgr_get_bitmap(struct ocrdma_dev *dev, bool dpp_pool)
+{
+ u16 pd_bitmap_idx = 0;
+ const unsigned long *pd_bitmap;
+
+ if (dpp_pool) {
+ pd_bitmap = dev->pd_mgr->pd_dpp_bitmap;
+ pd_bitmap_idx = find_first_zero_bit(pd_bitmap,
+ dev->pd_mgr->max_dpp_pd);
+ __set_bit(pd_bitmap_idx, dev->pd_mgr->pd_dpp_bitmap);
+ dev->pd_mgr->pd_dpp_count++;
+ if (dev->pd_mgr->pd_dpp_count > dev->pd_mgr->pd_dpp_thrsh)
+ dev->pd_mgr->pd_dpp_thrsh = dev->pd_mgr->pd_dpp_count;
+ } else {
+ pd_bitmap = dev->pd_mgr->pd_norm_bitmap;
+ pd_bitmap_idx = find_first_zero_bit(pd_bitmap,
+ dev->pd_mgr->max_normal_pd);
+ __set_bit(pd_bitmap_idx, dev->pd_mgr->pd_norm_bitmap);
+ dev->pd_mgr->pd_norm_count++;
+ if (dev->pd_mgr->pd_norm_count > dev->pd_mgr->pd_norm_thrsh)
+ dev->pd_mgr->pd_norm_thrsh = dev->pd_mgr->pd_norm_count;
+ }
+ return pd_bitmap_idx;
+}
+
+static int _ocrdma_pd_mgr_put_bitmap(struct ocrdma_dev *dev, u16 pd_id,
+ bool dpp_pool)
+{
+ u16 pd_count;
+ u16 pd_bit_index;
+
+ pd_count = dpp_pool ? dev->pd_mgr->pd_dpp_count :
+ dev->pd_mgr->pd_norm_count;
+ if (pd_count == 0)
+ return -EINVAL;
+
+ if (dpp_pool) {
+ pd_bit_index = pd_id - dev->pd_mgr->pd_dpp_start;
+ if (pd_bit_index >= dev->pd_mgr->max_dpp_pd) {
+ return -EINVAL;
+ } else {
+ __clear_bit(pd_bit_index, dev->pd_mgr->pd_dpp_bitmap);
+ dev->pd_mgr->pd_dpp_count--;
+ }
+ } else {
+ pd_bit_index = pd_id - dev->pd_mgr->pd_norm_start;
+ if (pd_bit_index >= dev->pd_mgr->max_normal_pd) {
+ return -EINVAL;
+ } else {
+ __clear_bit(pd_bit_index, dev->pd_mgr->pd_norm_bitmap);
+ dev->pd_mgr->pd_norm_count--;
+ }
+ }
+
+ return 0;
+}
+
+static u8 ocrdma_put_pd_num(struct ocrdma_dev *dev, u16 pd_id,
+ bool dpp_pool)
+{
+ int status;
+
+ mutex_lock(&dev->dev_lock);
+ status = _ocrdma_pd_mgr_put_bitmap(dev, pd_id, dpp_pool);
+ mutex_unlock(&dev->dev_lock);
+ return status;
+}
+
+static int ocrdma_get_pd_num(struct ocrdma_dev *dev, struct ocrdma_pd *pd)
+{
+ u16 pd_idx = 0;
+ int status = 0;
+
+ mutex_lock(&dev->dev_lock);
+ if (pd->dpp_enabled) {
+ /* try allocating DPP PD, if not available then normal PD */
+ if (dev->pd_mgr->pd_dpp_count < dev->pd_mgr->max_dpp_pd) {
+ pd_idx = _ocrdma_pd_mgr_get_bitmap(dev, true);
+ pd->id = dev->pd_mgr->pd_dpp_start + pd_idx;
+ pd->dpp_page = dev->pd_mgr->dpp_page_index + pd_idx;
+ } else if (dev->pd_mgr->pd_norm_count <
+ dev->pd_mgr->max_normal_pd) {
+ pd_idx = _ocrdma_pd_mgr_get_bitmap(dev, false);
+ pd->id = dev->pd_mgr->pd_norm_start + pd_idx;
+ pd->dpp_enabled = false;
+ } else {
+ status = -EINVAL;
+ }
+ } else {
+ if (dev->pd_mgr->pd_norm_count < dev->pd_mgr->max_normal_pd) {
+ pd_idx = _ocrdma_pd_mgr_get_bitmap(dev, false);
+ pd->id = dev->pd_mgr->pd_norm_start + pd_idx;
+ } else {
+ status = -EINVAL;
+ }
+ }
+ mutex_unlock(&dev->dev_lock);
+ return status;
+}
+
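+/* Allocate a PD, taking it from the preallocated PD pool when the
+ * firmware supports that, otherwise via the mailbox command. A DPP PD
+ * request falls back to a normal PD if the DPP allocation fails.
+ */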
+static struct ocrdma_pd *_ocrdma_alloc_pd(struct ocrdma_dev *dev,
+ struct ocrdma_ucontext *uctx,
+ struct ib_udata *udata)
+{
+ struct ocrdma_pd *pd = NULL;
+ int status = 0;
+
+ pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+ if (!pd)
+ return ERR_PTR(-ENOMEM);
+
+ if (udata && uctx && dev->attr.max_dpp_pds) {
+ pd->dpp_enabled =
+ ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R;
+ pd->num_dpp_qp =
+ pd->dpp_enabled ? (dev->nic_info.db_page_size /
+ dev->attr.wqe_size) : 0;
+ }
+
+ if (dev->pd_mgr->pd_prealloc_valid) {
+ status = ocrdma_get_pd_num(dev, pd);
+ return (status == 0) ? pd : ERR_PTR(status);
+ }
+
+retry:
+ status = ocrdma_mbx_alloc_pd(dev, pd);
+ if (status) {
+ if (pd->dpp_enabled) {
+ pd->dpp_enabled = false;
+ pd->num_dpp_qp = 0;
+ goto retry;
+ } else {
+ kfree(pd);
+ return ERR_PTR(status);
+ }
+ }
+
+ return pd;
+}
+
+static inline int is_ucontext_pd(struct ocrdma_ucontext *uctx,
+ struct ocrdma_pd *pd)
+{
+ return (uctx->cntxt_pd == pd ? true : false);
+}
+
+static int _ocrdma_dealloc_pd(struct ocrdma_dev *dev,
+ struct ocrdma_pd *pd)
+{
+ int status = 0;
+
+ if (dev->pd_mgr->pd_prealloc_valid)
+ status = ocrdma_put_pd_num(dev, pd->id, pd->dpp_enabled);
+ else
+ status = ocrdma_mbx_dealloc_pd(dev, pd);
+
+ kfree(pd);
+ return status;
+}
+
+static int ocrdma_alloc_ucontext_pd(struct ocrdma_dev *dev,
+ struct ocrdma_ucontext *uctx,
+ struct ib_udata *udata)
+{
+ int status = 0;
+
+ uctx->cntxt_pd = _ocrdma_alloc_pd(dev, uctx, udata);
+ if (IS_ERR(uctx->cntxt_pd)) {
+ status = PTR_ERR(uctx->cntxt_pd);
+ uctx->cntxt_pd = NULL;
+ goto err;
+ }
+
+ uctx->cntxt_pd->uctx = uctx;
+ uctx->cntxt_pd->ibpd.device = &dev->ibdev;
+err:
+ return status;
+}
+
+static int ocrdma_dealloc_ucontext_pd(struct ocrdma_ucontext *uctx)
+{
+ struct ocrdma_pd *pd = uctx->cntxt_pd;
+ struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device);
+
+ if (uctx->pd_in_use) {
+ pr_err("%s(%d) Freeing in use pdid=0x%x.\n",
+ __func__, dev->id, pd->id);
+ }
+ uctx->cntxt_pd = NULL;
+ (void)_ocrdma_dealloc_pd(dev, pd);
+ return 0;
+}
+
+static struct ocrdma_pd *ocrdma_get_ucontext_pd(struct ocrdma_ucontext *uctx)
+{
+ struct ocrdma_pd *pd = NULL;
+
+ mutex_lock(&uctx->mm_list_lock);
+ if (!uctx->pd_in_use) {
+ uctx->pd_in_use = true;
+ pd = uctx->cntxt_pd;
+ }
+ mutex_unlock(&uctx->mm_list_lock);
+
+ return pd;
+}
+
+static void ocrdma_release_ucontext_pd(struct ocrdma_ucontext *uctx)
+{
+ mutex_lock(&uctx->mm_list_lock);
+ uctx->pd_in_use = false;
+ mutex_unlock(&uctx->mm_list_lock);
+}
+
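+/* Allocate a per-process ucontext: a DMA-coherent AH table that user
+ * space can mmap (registered in the context's mmap key list) plus a
+ * dedicated context PD, and return the device limits via udata.
+ */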
+struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *ibdev,
+ struct ib_udata *udata)
+{
+ int status;
+ struct ocrdma_ucontext *ctx;
+ struct ocrdma_alloc_ucontext_resp resp;
+ struct ocrdma_dev *dev = get_ocrdma_dev(ibdev);
+ struct pci_dev *pdev = dev->nic_info.pdev;
+ u32 map_len = roundup(sizeof(u32) * 2048, PAGE_SIZE);
+
+ if (!udata)
+ return ERR_PTR(-EFAULT);
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return ERR_PTR(-ENOMEM);
+ INIT_LIST_HEAD(&ctx->mm_head);
+ mutex_init(&ctx->mm_list_lock);
+
+ ctx->ah_tbl.va = dma_alloc_coherent(&pdev->dev, map_len,
+ &ctx->ah_tbl.pa, GFP_KERNEL);
+ if (!ctx->ah_tbl.va) {
+ kfree(ctx);
+ return ERR_PTR(-ENOMEM);
+ }
+ memset(ctx->ah_tbl.va, 0, map_len);
+ ctx->ah_tbl.len = map_len;
+
+ memset(&resp, 0, sizeof(resp));
+ resp.ah_tbl_len = ctx->ah_tbl.len;
+ resp.ah_tbl_page = virt_to_phys(ctx->ah_tbl.va);
+
+ status = ocrdma_add_mmap(ctx, resp.ah_tbl_page, resp.ah_tbl_len);
+ if (status)
+ goto map_err;
+
+ status = ocrdma_alloc_ucontext_pd(dev, ctx, udata);
+ if (status)
+ goto pd_err;
+
+ resp.dev_id = dev->id;
+ resp.max_inline_data = dev->attr.max_inline_data;
+ resp.wqe_size = dev->attr.wqe_size;
+ resp.rqe_size = dev->attr.rqe_size;
+ resp.dpp_wqe_size = dev->attr.wqe_size;
+
+ memcpy(resp.fw_ver, dev->attr.fw_ver, sizeof(resp.fw_ver));
+ status = ib_copy_to_udata(udata, &resp, sizeof(resp));
+ if (status)
+ goto cpy_err;
+ return &ctx->ibucontext;
+
+cpy_err:
+pd_err:
+ ocrdma_del_mmap(ctx, ctx->ah_tbl.pa, ctx->ah_tbl.len);
+map_err:
+ dma_free_coherent(&pdev->dev, ctx->ah_tbl.len, ctx->ah_tbl.va,
+ ctx->ah_tbl.pa);
+ kfree(ctx);
+ return ERR_PTR(status);
+}
+
+int ocrdma_dealloc_ucontext(struct ib_ucontext *ibctx)
+{
+ int status = 0;
+ struct ocrdma_mm *mm, *tmp;
+ struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ibctx);
+ struct ocrdma_dev *dev = get_ocrdma_dev(ibctx->device);
+ struct pci_dev *pdev = dev->nic_info.pdev;
+
+ status = ocrdma_dealloc_ucontext_pd(uctx);
+
+ ocrdma_del_mmap(uctx, uctx->ah_tbl.pa, uctx->ah_tbl.len);
+ dma_free_coherent(&pdev->dev, uctx->ah_tbl.len, uctx->ah_tbl.va,
+ uctx->ah_tbl.pa);
+
+ list_for_each_entry_safe(mm, tmp, &uctx->mm_head, entry) {
+ list_del(&mm->entry);
+ kfree(mm);
+ }
+ kfree(uctx);
+ return status;
+}
+
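+/* mmap handler: the offset must match a key previously registered via
+ * ocrdma_add_mmap(). Doorbell pages are mapped non-cached, the DPP
+ * region write-combined and queue memory with normal caching.
+ */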
+int ocrdma_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+ struct ocrdma_ucontext *ucontext = get_ocrdma_ucontext(context);
+ struct ocrdma_dev *dev = get_ocrdma_dev(context->device);
+ unsigned long vm_page = vma->vm_pgoff << PAGE_SHIFT;
+ u64 unmapped_db = (u64) dev->nic_info.unmapped_db;
+ unsigned long len = (vma->vm_end - vma->vm_start);
+ int status = 0;
+ bool found;
+
+ if (vma->vm_start & (PAGE_SIZE - 1))
+ return -EINVAL;
+ found = ocrdma_search_mmap(ucontext, vma->vm_pgoff << PAGE_SHIFT, len);
+ if (!found)
+ return -EINVAL;
+
+ if ((vm_page >= unmapped_db) && (vm_page <= (unmapped_db +
+ dev->nic_info.db_total_size)) &&
+ (len <= dev->nic_info.db_page_size)) {
+ if (vma->vm_flags & VM_READ)
+ return -EPERM;
+
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+ status = io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+ len, vma->vm_page_prot);
+ } else if (dev->nic_info.dpp_unmapped_len &&
+ (vm_page >= (u64) dev->nic_info.dpp_unmapped_addr) &&
+ (vm_page <= (u64) (dev->nic_info.dpp_unmapped_addr +
+ dev->nic_info.dpp_unmapped_len)) &&
+ (len <= dev->nic_info.dpp_unmapped_len)) {
+ if (vma->vm_flags & VM_READ)
+ return -EPERM;
+
+ vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+ status = io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+ len, vma->vm_page_prot);
+ } else {
+ status = remap_pfn_range(vma, vma->vm_start,
+ vma->vm_pgoff, len, vma->vm_page_prot);
+ }
+ return status;
+}
+
+static int ocrdma_copy_pd_uresp(struct ocrdma_dev *dev, struct ocrdma_pd *pd,
+ struct ib_ucontext *ib_ctx,
+ struct ib_udata *udata)
+{
+ int status;
+ u64 db_page_addr;
+ u64 dpp_page_addr = 0;
+ u32 db_page_size;
+ struct ocrdma_alloc_pd_uresp rsp;
+ struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ib_ctx);
+
+ memset(&rsp, 0, sizeof(rsp));
+ rsp.id = pd->id;
+ rsp.dpp_enabled = pd->dpp_enabled;
+ db_page_addr = ocrdma_get_db_addr(dev, pd->id);
+ db_page_size = dev->nic_info.db_page_size;
+
+ status = ocrdma_add_mmap(uctx, db_page_addr, db_page_size);
+ if (status)
+ return status;
+
+ if (pd->dpp_enabled) {
+ dpp_page_addr = dev->nic_info.dpp_unmapped_addr +
+ (pd->id * PAGE_SIZE);
+ status = ocrdma_add_mmap(uctx, dpp_page_addr,
+ PAGE_SIZE);
+ if (status)
+ goto dpp_map_err;
+ rsp.dpp_page_addr_hi = upper_32_bits(dpp_page_addr);
+ rsp.dpp_page_addr_lo = dpp_page_addr;
+ }
+
+ status = ib_copy_to_udata(udata, &rsp, sizeof(rsp));
+ if (status)
+ goto ucopy_err;
+
+ pd->uctx = uctx;
+ return 0;
+
+ucopy_err:
+ if (pd->dpp_enabled)
+ ocrdma_del_mmap(pd->uctx, dpp_page_addr, PAGE_SIZE);
+dpp_map_err:
+ ocrdma_del_mmap(pd->uctx, db_page_addr, db_page_size);
+ return status;
+}
+
+struct ib_pd *ocrdma_alloc_pd(struct ib_device *ibdev,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct ocrdma_dev *dev = get_ocrdma_dev(ibdev);
+ struct ocrdma_pd *pd;
+ struct ocrdma_ucontext *uctx = NULL;
+ int status;
+ u8 is_uctx_pd = false;
+
+ if (udata && context) {
+ uctx = get_ocrdma_ucontext(context);
+ pd = ocrdma_get_ucontext_pd(uctx);
+ if (pd) {
+ is_uctx_pd = true;
+ goto pd_mapping;
+ }
+ }
+
+ pd = _ocrdma_alloc_pd(dev, uctx, udata);
+ if (IS_ERR(pd)) {
+ status = PTR_ERR(pd);
+ goto exit;
+ }
+
+pd_mapping:
+ if (udata && context) {
+ status = ocrdma_copy_pd_uresp(dev, pd, context, udata);
+ if (status)
+ goto err;
+ }
+ return &pd->ibpd;
+
+err:
+ if (is_uctx_pd) {
+ ocrdma_release_ucontext_pd(uctx);
+ } else {
+ status = _ocrdma_dealloc_pd(dev, pd);
+ }
+exit:
+ return ERR_PTR(status);
+}
+
+int ocrdma_dealloc_pd(struct ib_pd *ibpd)
+{
+ struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
+ struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
+ struct ocrdma_ucontext *uctx = NULL;
+ int status = 0;
+ u64 usr_db;
+
+ uctx = pd->uctx;
+ if (uctx) {
+ u64 dpp_db = dev->nic_info.dpp_unmapped_addr +
+ (pd->id * PAGE_SIZE);
+ if (pd->dpp_enabled)
+ ocrdma_del_mmap(pd->uctx, dpp_db, PAGE_SIZE);
+ usr_db = ocrdma_get_db_addr(dev, pd->id);
+ ocrdma_del_mmap(pd->uctx, usr_db, dev->nic_info.db_page_size);
+
+ if (is_ucontext_pd(uctx, pd)) {
+ ocrdma_release_ucontext_pd(uctx);
+ return status;
+ }
+ }
+ status = _ocrdma_dealloc_pd(dev, pd);
+ return status;
+}
+
+static int ocrdma_alloc_lkey(struct ocrdma_dev *dev, struct ocrdma_mr *mr,
+ u32 pdid, int acc, u32 num_pbls, u32 addr_check)
+{
+ int status;
+
+ mr->hwmr.fr_mr = 0;
+ mr->hwmr.local_rd = 1;
+ mr->hwmr.remote_rd = (acc & IB_ACCESS_REMOTE_READ) ? 1 : 0;
+ mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0;
+ mr->hwmr.local_wr = (acc & IB_ACCESS_LOCAL_WRITE) ? 1 : 0;
+ mr->hwmr.mw_bind = (acc & IB_ACCESS_MW_BIND) ? 1 : 0;
+ mr->hwmr.remote_atomic = (acc & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0;
+ mr->hwmr.num_pbls = num_pbls;
+
+ status = ocrdma_mbx_alloc_lkey(dev, &mr->hwmr, pdid, addr_check);
+ if (status)
+ return status;
+
+ mr->ibmr.lkey = mr->hwmr.lkey;
+ if (mr->hwmr.remote_wr || mr->hwmr.remote_rd)
+ mr->ibmr.rkey = mr->hwmr.lkey;
+ return 0;
+}
+
+struct ib_mr *ocrdma_get_dma_mr(struct ib_pd *ibpd, int acc)
+{
+ int status;
+ struct ocrdma_mr *mr;
+ struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
+ struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
+
+ if (acc & IB_ACCESS_REMOTE_WRITE && !(acc & IB_ACCESS_LOCAL_WRITE)) {
+ pr_err("%s err, invalid access rights\n", __func__);
+ return ERR_PTR(-EINVAL);
+ }
+
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ status = ocrdma_alloc_lkey(dev, mr, pd->id, acc, 0,
+ OCRDMA_ADDR_CHECK_DISABLE);
+ if (status) {
+ kfree(mr);
+ return ERR_PTR(status);
+ }
+
+ return &mr->ibmr;
+}
+
+static void ocrdma_free_mr_pbl_tbl(struct ocrdma_dev *dev,
+ struct ocrdma_hw_mr *mr)
+{
+ struct pci_dev *pdev = dev->nic_info.pdev;
+ int i = 0;
+
+ if (mr->pbl_table) {
+ for (i = 0; i < mr->num_pbls; i++) {
+ if (!mr->pbl_table[i].va)
+ continue;
+ dma_free_coherent(&pdev->dev, mr->pbl_size,
+ mr->pbl_table[i].va,
+ mr->pbl_table[i].pa);
+ }
+ kfree(mr->pbl_table);
+ mr->pbl_table = NULL;
+ }
+}
+
+static int ocrdma_get_pbl_info(struct ocrdma_dev *dev, struct ocrdma_mr *mr,
+ u32 num_pbes)
+{
+ u32 num_pbls = 0;
+ u32 idx = 0;
+ int status = 0;
+ u32 pbl_size;
+
+ do {
+ pbl_size = OCRDMA_MIN_HPAGE_SIZE * (1 << idx);
+ if (pbl_size > MAX_OCRDMA_PBL_SIZE) {
+ status = -EFAULT;
+ break;
+ }
+ num_pbls = roundup(num_pbes, (pbl_size / sizeof(u64)));
+ num_pbls = num_pbls / (pbl_size / sizeof(u64));
+ idx++;
+ } while (num_pbls >= dev->attr.max_num_mr_pbl);
+
+ mr->hwmr.num_pbes = num_pbes;
+ mr->hwmr.num_pbls = num_pbls;
+ mr->hwmr.pbl_size = pbl_size;
+ return status;
+}
+
+static int ocrdma_build_pbl_tbl(struct ocrdma_dev *dev, struct ocrdma_hw_mr *mr)
+{
+ int status = 0;
+ int i;
+ u32 dma_len = mr->pbl_size;
+ struct pci_dev *pdev = dev->nic_info.pdev;
+ void *va;
+ dma_addr_t pa;
+
+ mr->pbl_table = kzalloc(sizeof(struct ocrdma_pbl) *
+ mr->num_pbls, GFP_KERNEL);
+
+ if (!mr->pbl_table)
+ return -ENOMEM;
+
+ for (i = 0; i < mr->num_pbls; i++) {
+ va = dma_alloc_coherent(&pdev->dev, dma_len, &pa, GFP_KERNEL);
+ if (!va) {
+ ocrdma_free_mr_pbl_tbl(dev, mr);
+ status = -ENOMEM;
+ break;
+ }
+ memset(va, 0, dma_len);
+ mr->pbl_table[i].va = va;
+ mr->pbl_table[i].pa = pa;
+ }
+ return status;
+}
+
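+/* Walk the umem scatterlist and write the 64-bit page addresses (PBEs)
+ * into the PBL pages, moving on to the next PBL once the current one
+ * is full.
+ */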
+static void build_user_pbes(struct ocrdma_dev *dev, struct ocrdma_mr *mr,
+ u32 num_pbes)
+{
+ struct ocrdma_pbe *pbe;
+ struct scatterlist *sg;
+ struct ocrdma_pbl *pbl_tbl = mr->hwmr.pbl_table;
+ struct ib_umem *umem = mr->umem;
+ int shift, pg_cnt, pages, pbe_cnt, entry, total_num_pbes = 0;
+
+ if (!mr->hwmr.num_pbes)
+ return;
+
+ pbe = (struct ocrdma_pbe *)pbl_tbl->va;
+ pbe_cnt = 0;
+
+ shift = ilog2(umem->page_size);
+
+ for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+ pages = sg_dma_len(sg) >> shift;
+ for (pg_cnt = 0; pg_cnt < pages; pg_cnt++) {
+ /* store the page address in pbe */
+ pbe->pa_lo =
+ cpu_to_le32(sg_dma_address
+ (sg) +
+ (umem->page_size * pg_cnt));
+ pbe->pa_hi =
+ cpu_to_le32(upper_32_bits
+ ((sg_dma_address
+ (sg) +
+ umem->page_size * pg_cnt)));
+ pbe_cnt += 1;
+ total_num_pbes += 1;
+ pbe++;
+
+ /* if done building pbes, return; the caller issues the mbx cmd. */
+ if (total_num_pbes == num_pbes)
+ return;
+
+ /* if the current pbl is full, move to the next pbl. */
+ if (pbe_cnt ==
+ (mr->hwmr.pbl_size / sizeof(u64))) {
+ pbl_tbl++;
+ pbe = (struct ocrdma_pbe *)pbl_tbl->va;
+ pbe_cnt = 0;
+ }
+
+ }
+ }
+}
+
+struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *ibpd, u64 start, u64 len,
+ u64 usr_addr, int acc, struct ib_udata *udata)
+{
+ int status = -ENOMEM;
+ struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
+ struct ocrdma_mr *mr;
+ struct ocrdma_pd *pd;
+ u32 num_pbes;
+
+ pd = get_ocrdma_pd(ibpd);
+
+ if (acc & IB_ACCESS_REMOTE_WRITE && !(acc & IB_ACCESS_LOCAL_WRITE))
+ return ERR_PTR(-EINVAL);
+
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(status);
+ mr->umem = ib_umem_get(ibpd->uobject->context, start, len, acc, 0);
+ if (IS_ERR(mr->umem)) {
+ status = -EFAULT;
+ goto umem_err;
+ }
+ num_pbes = ib_umem_page_count(mr->umem);
+ status = ocrdma_get_pbl_info(dev, mr, num_pbes);
+ if (status)
+ goto umem_err;
+
+ mr->hwmr.pbe_size = mr->umem->page_size;
+ mr->hwmr.fbo = ib_umem_offset(mr->umem);
+ mr->hwmr.va = usr_addr;
+ mr->hwmr.len = len;
+ mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0;
+ mr->hwmr.remote_rd = (acc & IB_ACCESS_REMOTE_READ) ? 1 : 0;
+ mr->hwmr.local_wr = (acc & IB_ACCESS_LOCAL_WRITE) ? 1 : 0;
+ mr->hwmr.local_rd = 1;
+ mr->hwmr.remote_atomic = (acc & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0;
+ status = ocrdma_build_pbl_tbl(dev, &mr->hwmr);
+ if (status)
+ goto umem_err;
+ build_user_pbes(dev, mr, num_pbes);
+ status = ocrdma_reg_mr(dev, &mr->hwmr, pd->id, acc);
+ if (status)
+ goto mbx_err;
+ mr->ibmr.lkey = mr->hwmr.lkey;
+ if (mr->hwmr.remote_wr || mr->hwmr.remote_rd)
+ mr->ibmr.rkey = mr->hwmr.lkey;
+
+ return &mr->ibmr;
+
+mbx_err:
+ ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr);
+umem_err:
+ kfree(mr);
+ return ERR_PTR(status);
+}
+
+int ocrdma_dereg_mr(struct ib_mr *ib_mr)
+{
+ struct ocrdma_mr *mr = get_ocrdma_mr(ib_mr);
+ struct ocrdma_dev *dev = get_ocrdma_dev(ib_mr->device);
+
+ (void) ocrdma_mbx_dealloc_lkey(dev, mr->hwmr.fr_mr, mr->hwmr.lkey);
+
+ ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr);
+
+ /* it could be user registered memory. */
+ if (mr->umem)
+ ib_umem_release(mr->umem);
+ kfree(mr);
+
+ /* Don't stop cleanup, in case FW is unresponsive */
+ if (dev->mqe_ctx.fw_error_state) {
+ pr_err("%s(%d) fw not responding.\n",
+ __func__, dev->id);
+ }
+ return 0;
+}
+
+static int ocrdma_copy_cq_uresp(struct ocrdma_dev *dev, struct ocrdma_cq *cq,
+ struct ib_udata *udata,
+ struct ib_ucontext *ib_ctx)
+{
+ int status;
+ struct ocrdma_ucontext *uctx = get_ocrdma_ucontext(ib_ctx);
+ struct ocrdma_create_cq_uresp uresp;
+
+ memset(&uresp, 0, sizeof(uresp));
+ uresp.cq_id = cq->id;
+ uresp.page_size = PAGE_ALIGN(cq->len);
+ uresp.num_pages = 1;
+ uresp.max_hw_cqe = cq->max_hw_cqe;
+ uresp.page_addr[0] = virt_to_phys(cq->va);
+ uresp.db_page_addr = ocrdma_get_db_addr(dev, uctx->cntxt_pd->id);
+ uresp.db_page_size = dev->nic_info.db_page_size;
+ uresp.phase_change = cq->phase_change ? 1 : 0;
+ status = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+ if (status) {
+ pr_err("%s(%d) copy error cqid=0x%x.\n",
+ __func__, dev->id, cq->id);
+ goto err;
+ }
+ status = ocrdma_add_mmap(uctx, uresp.db_page_addr, uresp.db_page_size);
+ if (status)
+ goto err;
+ status = ocrdma_add_mmap(uctx, uresp.page_addr[0], uresp.page_size);
+ if (status) {
+ ocrdma_del_mmap(uctx, uresp.db_page_addr, uresp.db_page_size);
+ goto err;
+ }
+ cq->ucontext = uctx;
+err:
+ return status;
+}
+
+struct ib_cq *ocrdma_create_cq(struct ib_device *ibdev, int entries, int vector,
+ struct ib_ucontext *ib_ctx,
+ struct ib_udata *udata)
+{
+ struct ocrdma_cq *cq;
+ struct ocrdma_dev *dev = get_ocrdma_dev(ibdev);
+ struct ocrdma_ucontext *uctx = NULL;
+ u16 pd_id = 0;
+ int status;
+ struct ocrdma_create_cq_ureq ureq;
+
+ if (udata) {
+ if (ib_copy_from_udata(&ureq, udata, sizeof(ureq)))
+ return ERR_PTR(-EFAULT);
+ } else
+ ureq.dpp_cq = 0;
+ cq = kzalloc(sizeof(*cq), GFP_KERNEL);
+ if (!cq)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock_init(&cq->cq_lock);
+ spin_lock_init(&cq->comp_handler_lock);
+ INIT_LIST_HEAD(&cq->sq_head);
+ INIT_LIST_HEAD(&cq->rq_head);
+ cq->first_arm = true;
+
+ if (ib_ctx) {
+ uctx = get_ocrdma_ucontext(ib_ctx);
+ pd_id = uctx->cntxt_pd->id;
+ }
+
+ status = ocrdma_mbx_create_cq(dev, cq, entries, ureq.dpp_cq, pd_id);
+ if (status) {
+ kfree(cq);
+ return ERR_PTR(status);
+ }
+ if (ib_ctx) {
+ status = ocrdma_copy_cq_uresp(dev, cq, udata, ib_ctx);
+ if (status)
+ goto ctx_err;
+ }
+ cq->phase = OCRDMA_CQE_VALID;
+ dev->cq_tbl[cq->id] = cq;
+ return &cq->ibcq;
+
+ctx_err:
+ ocrdma_mbx_destroy_cq(dev, cq);
+ kfree(cq);
+ return ERR_PTR(status);
+}
+
+int ocrdma_resize_cq(struct ib_cq *ibcq, int new_cnt,
+ struct ib_udata *udata)
+{
+ int status = 0;
+ struct ocrdma_cq *cq = get_ocrdma_cq(ibcq);
+
+ if (new_cnt < 1 || new_cnt > cq->max_hw_cqe) {
+ status = -EINVAL;
+ return status;
+ }
+ ibcq->cqe = new_cnt;
+ return status;
+}
+
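+/* Count the CQEs still marked valid in the CQ and ring its doorbell
+ * for them, so they are acknowledged to the hardware before the CQ is
+ * destroyed.
+ */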
+static void ocrdma_flush_cq(struct ocrdma_cq *cq)
+{
+ int cqe_cnt;
+ int valid_count = 0;
+ unsigned long flags;
+
+ struct ocrdma_dev *dev = get_ocrdma_dev(cq->ibcq.device);
+ struct ocrdma_cqe *cqe = NULL;
+
+ cqe = cq->va;
+ cqe_cnt = cq->cqe_cnt;
+
+ /* The last irq might have scheduled a polling thread;
+ * sync up with it before hard flushing.
+ */
+ spin_lock_irqsave(&cq->cq_lock, flags);
+ while (cqe_cnt) {
+ if (is_cqe_valid(cq, cqe))
+ valid_count++;
+ cqe++;
+ cqe_cnt--;
+ }
+ ocrdma_ring_cq_db(dev, cq->id, false, false, valid_count);
+ spin_unlock_irqrestore(&cq->cq_lock, flags);
+}
+
+int ocrdma_destroy_cq(struct ib_cq *ibcq)
+{
+ struct ocrdma_cq *cq = get_ocrdma_cq(ibcq);
+ struct ocrdma_eq *eq = NULL;
+ struct ocrdma_dev *dev = get_ocrdma_dev(ibcq->device);
+ int pdid = 0;
+ u32 irq, indx;
+
+ dev->cq_tbl[cq->id] = NULL;
+ indx = ocrdma_get_eq_table_index(dev, cq->eqn);
+ if (indx == -EINVAL)
+ BUG();
+
+ eq = &dev->eq_tbl[indx];
+ irq = ocrdma_get_irq(dev, eq);
+ synchronize_irq(irq);
+ ocrdma_flush_cq(cq);
+
+ (void)ocrdma_mbx_destroy_cq(dev, cq);
+ if (cq->ucontext) {
+ pdid = cq->ucontext->cntxt_pd->id;
+ ocrdma_del_mmap(cq->ucontext, (u64) cq->pa,
+ PAGE_ALIGN(cq->len));
+ ocrdma_del_mmap(cq->ucontext,
+ ocrdma_get_db_addr(dev, pdid),
+ dev->nic_info.db_page_size);
+ }
+
+ kfree(cq);
+ return 0;
+}
+
+static int ocrdma_add_qpn_map(struct ocrdma_dev *dev, struct ocrdma_qp *qp)
+{
+ int status = -EINVAL;
+
+ if (qp->id < OCRDMA_MAX_QP && dev->qp_tbl[qp->id] == NULL) {
+ dev->qp_tbl[qp->id] = qp;
+ status = 0;
+ }
+ return status;
+}
+
+static void ocrdma_del_qpn_map(struct ocrdma_dev *dev, struct ocrdma_qp *qp)
+{
+ dev->qp_tbl[qp->id] = NULL;
+}
+
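+/* Validate the requested QP type and capabilities against the device
+ * limits; only one GSI QP may exist, user space may not create it,
+ * and consumer QPs may not share the GSI QP's CQs.
+ */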
+static int ocrdma_check_qp_params(struct ib_pd *ibpd, struct ocrdma_dev *dev,
+ struct ib_qp_init_attr *attrs)
+{
+ if ((attrs->qp_type != IB_QPT_GSI) &&
+ (attrs->qp_type != IB_QPT_RC) &&
+ (attrs->qp_type != IB_QPT_UC) &&
+ (attrs->qp_type != IB_QPT_UD)) {
+ pr_err("%s(%d) unsupported qp type=0x%x requested\n",
+ __func__, dev->id, attrs->qp_type);
+ return -EINVAL;
+ }
+ /* Skip the check for QP1 to support CM size of 128 */
+ if ((attrs->qp_type != IB_QPT_GSI) &&
+ (attrs->cap.max_send_wr > dev->attr.max_wqe)) {
+ pr_err("%s(%d) unsupported send_wr=0x%x requested\n",
+ __func__, dev->id, attrs->cap.max_send_wr);
+ pr_err("%s(%d) supported send_wr=0x%x\n",
+ __func__, dev->id, dev->attr.max_wqe);
+ return -EINVAL;
+ }
+ if (!attrs->srq && (attrs->cap.max_recv_wr > dev->attr.max_rqe)) {
+ pr_err("%s(%d) unsupported recv_wr=0x%x requested\n",
+ __func__, dev->id, attrs->cap.max_recv_wr);
+ pr_err("%s(%d) supported recv_wr=0x%x\n",
+ __func__, dev->id, dev->attr.max_rqe);
+ return -EINVAL;
+ }
+ if (attrs->cap.max_inline_data > dev->attr.max_inline_data) {
+ pr_err("%s(%d) unsupported inline data size=0x%x requested\n",
+ __func__, dev->id, attrs->cap.max_inline_data);
+ pr_err("%s(%d) supported inline data size=0x%x\n",
+ __func__, dev->id, dev->attr.max_inline_data);
+ return -EINVAL;
+ }
+ if (attrs->cap.max_send_sge > dev->attr.max_send_sge) {
+ pr_err("%s(%d) unsupported send_sge=0x%x requested\n",
+ __func__, dev->id, attrs->cap.max_send_sge);
+ pr_err("%s(%d) supported send_sge=0x%x\n",
+ __func__, dev->id, dev->attr.max_send_sge);
+ return -EINVAL;
+ }
+ if (attrs->cap.max_recv_sge > dev->attr.max_recv_sge) {
+ pr_err("%s(%d) unsupported recv_sge=0x%x requested\n",
+ __func__, dev->id, attrs->cap.max_recv_sge);
+ pr_err("%s(%d) supported recv_sge=0x%x\n",
+ __func__, dev->id, dev->attr.max_recv_sge);
+ return -EINVAL;
+ }
+ /* unprivileged user space cannot create special QP */
+ if (ibpd->uobject && attrs->qp_type == IB_QPT_GSI) {
+ pr_err
+ ("%s(%d) Userspace can't create special QPs of type=0x%x\n",
+ __func__, dev->id, attrs->qp_type);
+ return -EINVAL;
+ }
+ /* allow creating only one GSI type of QP */
+ if (attrs->qp_type == IB_QPT_GSI && dev->gsi_qp_created) {
+ pr_err("%s(%d) GSI special QPs already created.\n",
+ __func__, dev->id);
+ return -EINVAL;
+ }
+ /* verify consumer QPs are not trying to use GSI QP's CQ */
+ if ((attrs->qp_type != IB_QPT_GSI) && (dev->gsi_qp_created)) {
+ if ((dev->gsi_sqcq == get_ocrdma_cq(attrs->send_cq)) ||
+ (dev->gsi_rqcq == get_ocrdma_cq(attrs->recv_cq))) {
+ pr_err("%s(%d) Consumer QP cannot use GSI CQs.\n",
+ __func__, dev->id);
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
+
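+/* Fill the create-QP udata response with the queue ids, queue page
+ * addresses and doorbell layout, and register those pages in the
+ * ucontext mmap list so the user library can map them.
+ */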
+static int ocrdma_copy_qp_uresp(struct ocrdma_qp *qp,
+ struct ib_udata *udata, int dpp_offset,
+ int dpp_credit_lmt, int srq)
+{
+ int status = 0;
+ u64 usr_db;
+ struct ocrdma_create_qp_uresp uresp;
+ struct ocrdma_pd *pd = qp->pd;
+ struct ocrdma_dev *dev = get_ocrdma_dev(pd->ibpd.device);
+
+ memset(&uresp, 0, sizeof(uresp));
+ usr_db = dev->nic_info.unmapped_db +
+ (pd->id * dev->nic_info.db_page_size);
+ uresp.qp_id = qp->id;
+ uresp.sq_dbid = qp->sq.dbid;
+ uresp.num_sq_pages = 1;
+ uresp.sq_page_size = PAGE_ALIGN(qp->sq.len);
+ uresp.sq_page_addr[0] = virt_to_phys(qp->sq.va);
+ uresp.num_wqe_allocated = qp->sq.max_cnt;
+ if (!srq) {
+ uresp.rq_dbid = qp->rq.dbid;
+ uresp.num_rq_pages = 1;
+ uresp.rq_page_size = PAGE_ALIGN(qp->rq.len);
+ uresp.rq_page_addr[0] = virt_to_phys(qp->rq.va);
+ uresp.num_rqe_allocated = qp->rq.max_cnt;
+ }
+ uresp.db_page_addr = usr_db;
+ uresp.db_page_size = dev->nic_info.db_page_size;
+ uresp.db_sq_offset = OCRDMA_DB_GEN2_SQ_OFFSET;
+ uresp.db_rq_offset = OCRDMA_DB_GEN2_RQ_OFFSET;
+ uresp.db_shift = OCRDMA_DB_RQ_SHIFT;
+
+ if (qp->dpp_enabled) {
+ uresp.dpp_credit = dpp_credit_lmt;
+ uresp.dpp_offset = dpp_offset;
+ }
+ status = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+ if (status) {
+ pr_err("%s(%d) user copy error.\n", __func__, dev->id);
+ goto err;
+ }
+ status = ocrdma_add_mmap(pd->uctx, uresp.sq_page_addr[0],
+ uresp.sq_page_size);
+ if (status)
+ goto err;
+
+ if (!srq) {
+ status = ocrdma_add_mmap(pd->uctx, uresp.rq_page_addr[0],
+ uresp.rq_page_size);
+ if (status)
+ goto rq_map_err;
+ }
+ return status;
+rq_map_err:
+ ocrdma_del_mmap(pd->uctx, uresp.sq_page_addr[0], uresp.sq_page_size);
+err:
+ return status;
+}
+
+static void ocrdma_set_qp_db(struct ocrdma_dev *dev, struct ocrdma_qp *qp,
+ struct ocrdma_pd *pd)
+{
+ if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) {
+ qp->sq_db = dev->nic_info.db +
+ (pd->id * dev->nic_info.db_page_size) +
+ OCRDMA_DB_GEN2_SQ_OFFSET;
+ qp->rq_db = dev->nic_info.db +
+ (pd->id * dev->nic_info.db_page_size) +
+ OCRDMA_DB_GEN2_RQ_OFFSET;
+ } else {
+ qp->sq_db = dev->nic_info.db +
+ (pd->id * dev->nic_info.db_page_size) +
+ OCRDMA_DB_SQ_OFFSET;
+ qp->rq_db = dev->nic_info.db +
+ (pd->id * dev->nic_info.db_page_size) +
+ OCRDMA_DB_RQ_OFFSET;
+ }
+}
+
+static int ocrdma_alloc_wr_id_tbl(struct ocrdma_qp *qp)
+{
+ qp->wqe_wr_id_tbl =
+ kzalloc(sizeof(*(qp->wqe_wr_id_tbl)) * qp->sq.max_cnt,
+ GFP_KERNEL);
+ if (qp->wqe_wr_id_tbl == NULL)
+ return -ENOMEM;
+ qp->rqe_wr_id_tbl =
+ kzalloc(sizeof(u64) * qp->rq.max_cnt, GFP_KERNEL);
+ if (qp->rqe_wr_id_tbl == NULL)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void ocrdma_set_qp_init_params(struct ocrdma_qp *qp,
+ struct ocrdma_pd *pd,
+ struct ib_qp_init_attr *attrs)
+{
+ qp->pd = pd;
+ spin_lock_init(&qp->q_lock);
+ INIT_LIST_HEAD(&qp->sq_entry);
+ INIT_LIST_HEAD(&qp->rq_entry);
+
+ qp->qp_type = attrs->qp_type;
+ qp->cap_flags = OCRDMA_QP_INB_RD | OCRDMA_QP_INB_WR;
+ qp->max_inline_data = attrs->cap.max_inline_data;
+ qp->sq.max_sges = attrs->cap.max_send_sge;
+ qp->rq.max_sges = attrs->cap.max_recv_sge;
+ qp->state = OCRDMA_QPS_RST;
+ qp->signaled = (attrs->sq_sig_type == IB_SIGNAL_ALL_WR) ? true : false;
+}
+
+static void ocrdma_store_gsi_qp_cq(struct ocrdma_dev *dev,
+ struct ib_qp_init_attr *attrs)
+{
+ if (attrs->qp_type == IB_QPT_GSI) {
+ dev->gsi_qp_created = 1;
+ dev->gsi_sqcq = get_ocrdma_cq(attrs->send_cq);
+ dev->gsi_rqcq = get_ocrdma_cq(attrs->recv_cq);
+ }
+}
+
+struct ib_qp *ocrdma_create_qp(struct ib_pd *ibpd,
+ struct ib_qp_init_attr *attrs,
+ struct ib_udata *udata)
+{
+ int status;
+ struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
+ struct ocrdma_qp *qp;
+ struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
+ struct ocrdma_create_qp_ureq ureq;
+ u16 dpp_credit_lmt, dpp_offset;
+
+ status = ocrdma_check_qp_params(ibpd, dev, attrs);
+ if (status)
+ goto gen_err;
+
+ memset(&ureq, 0, sizeof(ureq));
+ if (udata) {
+ if (ib_copy_from_udata(&ureq, udata, sizeof(ureq)))
+ return ERR_PTR(-EFAULT);
+ }
+ qp = kzalloc(sizeof(*qp), GFP_KERNEL);
+ if (!qp) {
+ status = -ENOMEM;
+ goto gen_err;
+ }
+ ocrdma_set_qp_init_params(qp, pd, attrs);
+ if (udata == NULL)
+ qp->cap_flags |= (OCRDMA_QP_MW_BIND | OCRDMA_QP_LKEY0 |
+ OCRDMA_QP_FAST_REG);
+
+ mutex_lock(&dev->dev_lock);
+ status = ocrdma_mbx_create_qp(qp, attrs, ureq.enable_dpp_cq,
+ ureq.dpp_cq_id,
+ &dpp_offset, &dpp_credit_lmt);
+ if (status)
+ goto mbx_err;
+
+ /* user space QPs' wr_id tables are managed in the user library */
+ if (udata == NULL) {
+ status = ocrdma_alloc_wr_id_tbl(qp);
+ if (status)
+ goto map_err;
+ }
+
+ status = ocrdma_add_qpn_map(dev, qp);
+ if (status)
+ goto map_err;
+ ocrdma_set_qp_db(dev, qp, pd);
+ if (udata) {
+ status = ocrdma_copy_qp_uresp(qp, udata, dpp_offset,
+ dpp_credit_lmt,
+ (attrs->srq != NULL));
+ if (status)
+ goto cpy_err;
+ }
+ ocrdma_store_gsi_qp_cq(dev, attrs);
+ qp->ibqp.qp_num = qp->id;
+ mutex_unlock(&dev->dev_lock);
+ return &qp->ibqp;
+
+cpy_err:
+ ocrdma_del_qpn_map(dev, qp);
+map_err:
+ ocrdma_mbx_destroy_qp(dev, qp);
+mbx_err:
+ mutex_unlock(&dev->dev_lock);
+ kfree(qp->wqe_wr_id_tbl);
+ kfree(qp->rqe_wr_id_tbl);
+ kfree(qp);
+ pr_err("%s(%d) error=%d\n", __func__, dev->id, status);
+gen_err:
+ return ERR_PTR(status);
+}
+
+int _ocrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask)
+{
+ int status = 0;
+ struct ocrdma_qp *qp;
+ struct ocrdma_dev *dev;
+ enum ib_qp_state old_qps;
+
+ qp = get_ocrdma_qp(ibqp);
+ dev = get_ocrdma_dev(ibqp->device);
+ if (attr_mask & IB_QP_STATE)
+ status = ocrdma_qp_state_change(qp, attr->qp_state, &old_qps);
+ /* if the new and previous states are the same, the hw doesn't
+ * need to know about it.
+ */
+ if (status < 0)
+ return status;
+ status = ocrdma_mbx_modify_qp(dev, qp, attr, attr_mask);
+
+ return status;
+}
+
+int ocrdma_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ unsigned long flags;
+ int status = -EINVAL;
+ struct ocrdma_qp *qp;
+ struct ocrdma_dev *dev;
+ enum ib_qp_state old_qps, new_qps;
+
+ qp = get_ocrdma_qp(ibqp);
+ dev = get_ocrdma_dev(ibqp->device);
+
+ /* synchronize with multiple contexts trying to change or retrieve qp state */
+ mutex_lock(&dev->dev_lock);
+ /* synchronize with wqe, rqe posting and cqe processing contexts */
+ spin_lock_irqsave(&qp->q_lock, flags);
+ old_qps = get_ibqp_state(qp->state);
+ if (attr_mask & IB_QP_STATE)
+ new_qps = attr->qp_state;
+ else
+ new_qps = old_qps;
+ spin_unlock_irqrestore(&qp->q_lock, flags);
+
+ if (!ib_modify_qp_is_ok(old_qps, new_qps, ibqp->qp_type, attr_mask,
+ IB_LINK_LAYER_ETHERNET)) {
+ pr_err("%s(%d) invalid attribute mask=0x%x specified for\n"
+ "qpn=0x%x of type=0x%x old_qps=0x%x, new_qps=0x%x\n",
+ __func__, dev->id, attr_mask, qp->id, ibqp->qp_type,
+ old_qps, new_qps);
+ goto param_err;
+ }
+
+ status = _ocrdma_modify_qp(ibqp, attr, attr_mask);
+ if (status > 0)
+ status = 0;
+param_err:
+ mutex_unlock(&dev->dev_lock);
+ return status;
+}
+
+static enum ib_mtu ocrdma_mtu_int_to_enum(u16 mtu)
+{
+ switch (mtu) {
+ case 256:
+ return IB_MTU_256;
+ case 512:
+ return IB_MTU_512;
+ case 1024:
+ return IB_MTU_1024;
+ case 2048:
+ return IB_MTU_2048;
+ case 4096:
+ return IB_MTU_4096;
+ default:
+ return IB_MTU_1024;
+ }
+}
+
+static int ocrdma_to_ib_qp_acc_flags(int qp_cap_flags)
+{
+ int ib_qp_acc_flags = 0;
+
+ if (qp_cap_flags & OCRDMA_QP_INB_WR)
+ ib_qp_acc_flags |= IB_ACCESS_REMOTE_WRITE;
+ if (qp_cap_flags & OCRDMA_QP_INB_RD)
+ ib_qp_acc_flags |= IB_ACCESS_LOCAL_WRITE;
+ return ib_qp_acc_flags;
+}
+
+int ocrdma_query_qp(struct ib_qp *ibqp,
+ struct ib_qp_attr *qp_attr,
+ int attr_mask, struct ib_qp_init_attr *qp_init_attr)
+{
+ int status;
+ u32 qp_state;
+ struct ocrdma_qp_params params;
+ struct ocrdma_qp *qp = get_ocrdma_qp(ibqp);
+ struct ocrdma_dev *dev = get_ocrdma_dev(ibqp->device);
+
+ memset(&params, 0, sizeof(params));
+ mutex_lock(&dev->dev_lock);
+ status = ocrdma_mbx_query_qp(dev, qp, &params);
+ mutex_unlock(&dev->dev_lock);
+ if (status)
+ goto mbx_err;
+ if (qp->qp_type == IB_QPT_UD)
+ qp_attr->qkey = params.qkey;
+ qp_attr->path_mtu =
+ ocrdma_mtu_int_to_enum(params.path_mtu_pkey_indx &
+ OCRDMA_QP_PARAMS_PATH_MTU_MASK) >>
+ OCRDMA_QP_PARAMS_PATH_MTU_SHIFT;
+ qp_attr->path_mig_state = IB_MIG_MIGRATED;
+ qp_attr->rq_psn = params.hop_lmt_rq_psn & OCRDMA_QP_PARAMS_RQ_PSN_MASK;
+ qp_attr->sq_psn = params.tclass_sq_psn & OCRDMA_QP_PARAMS_SQ_PSN_MASK;
+ qp_attr->dest_qp_num =
+ params.ack_to_rnr_rtc_dest_qpn & OCRDMA_QP_PARAMS_DEST_QPN_MASK;
+
+ qp_attr->qp_access_flags = ocrdma_to_ib_qp_acc_flags(qp->cap_flags);
+ qp_attr->cap.max_send_wr = qp->sq.max_cnt - 1;
+ qp_attr->cap.max_recv_wr = qp->rq.max_cnt - 1;
+ qp_attr->cap.max_send_sge = qp->sq.max_sges;
+ qp_attr->cap.max_recv_sge = qp->rq.max_sges;
+ qp_attr->cap.max_inline_data = qp->max_inline_data;
+ qp_init_attr->cap = qp_attr->cap;
+ memcpy(&qp_attr->ah_attr.grh.dgid, &params.dgid[0],
+ sizeof(params.dgid));
+ qp_attr->ah_attr.grh.flow_label = params.rnt_rc_sl_fl &
+ OCRDMA_QP_PARAMS_FLOW_LABEL_MASK;
+ qp_attr->ah_attr.grh.sgid_index = qp->sgid_idx;
+ qp_attr->ah_attr.grh.hop_limit = (params.hop_lmt_rq_psn &
+ OCRDMA_QP_PARAMS_HOP_LMT_MASK) >>
+ OCRDMA_QP_PARAMS_HOP_LMT_SHIFT;
+ qp_attr->ah_attr.grh.traffic_class = (params.tclass_sq_psn &
+ OCRDMA_QP_PARAMS_TCLASS_MASK) >>
+ OCRDMA_QP_PARAMS_TCLASS_SHIFT;
+
+ qp_attr->ah_attr.ah_flags = IB_AH_GRH;
+ qp_attr->ah_attr.port_num = 1;
+ qp_attr->ah_attr.sl = (params.rnt_rc_sl_fl &
+ OCRDMA_QP_PARAMS_SL_MASK) >>
+ OCRDMA_QP_PARAMS_SL_SHIFT;
+ qp_attr->timeout = (params.ack_to_rnr_rtc_dest_qpn &
+ OCRDMA_QP_PARAMS_ACK_TIMEOUT_MASK) >>
+ OCRDMA_QP_PARAMS_ACK_TIMEOUT_SHIFT;
+ qp_attr->rnr_retry = (params.ack_to_rnr_rtc_dest_qpn &
+ OCRDMA_QP_PARAMS_RNR_RETRY_CNT_MASK) >>
+ OCRDMA_QP_PARAMS_RNR_RETRY_CNT_SHIFT;
+ qp_attr->retry_cnt =
+ (params.rnt_rc_sl_fl & OCRDMA_QP_PARAMS_RETRY_CNT_MASK) >>
+ OCRDMA_QP_PARAMS_RETRY_CNT_SHIFT;
+ qp_attr->min_rnr_timer = 0;
+ qp_attr->pkey_index = 0;
+ qp_attr->port_num = 1;
+ qp_attr->ah_attr.src_path_bits = 0;
+ qp_attr->ah_attr.static_rate = 0;
+ qp_attr->alt_pkey_index = 0;
+ qp_attr->alt_port_num = 0;
+ qp_attr->alt_timeout = 0;
+ memset(&qp_attr->alt_ah_attr, 0, sizeof(qp_attr->alt_ah_attr));
+ qp_state = (params.max_sge_recv_flags & OCRDMA_QP_PARAMS_STATE_MASK) >>
+ OCRDMA_QP_PARAMS_STATE_SHIFT;
+ qp_attr->qp_state = get_ibqp_state(qp_state);
+ qp_attr->cur_qp_state = qp_attr->qp_state;
+ qp_attr->sq_draining = (qp_state == OCRDMA_QPS_SQ_DRAINING) ? 1 : 0;
+ qp_attr->max_dest_rd_atomic =
+ params.max_ord_ird >> OCRDMA_QP_PARAMS_MAX_ORD_SHIFT;
+ qp_attr->max_rd_atomic =
+ params.max_ord_ird & OCRDMA_QP_PARAMS_MAX_IRD_MASK;
+ qp_attr->en_sqd_async_notify = (params.max_sge_recv_flags &
+ OCRDMA_QP_PARAMS_FLAGS_SQD_ASYNC) ? 1 : 0;
+ /* Sync driver QP state with FW */
+ ocrdma_qp_state_change(qp, qp_attr->qp_state, NULL);
+mbx_err:
+ return status;
+}
+
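+/* Helpers for the circular hardware queues: head and tail wrap at
+ * max_wqe_idx, and the SRQ index bitmap (initialised to all ones)
+ * tracks which RQE slots are free; ocrdma_srq_toggle_bit() flips a
+ * slot's state.
+ */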
+static void ocrdma_srq_toggle_bit(struct ocrdma_srq *srq, unsigned int idx)
+{
+ unsigned int i = idx / 32;
+ u32 mask = (1U << (idx % 32));
+
+ srq->idx_bit_fields[i] ^= mask;
+}
+
+static int ocrdma_hwq_free_cnt(struct ocrdma_qp_hwq_info *q)
+{
+ return ((q->max_wqe_idx - q->head) + q->tail) % q->max_cnt;
+}
+
+static int is_hw_sq_empty(struct ocrdma_qp *qp)
+{
+ return (qp->sq.tail == qp->sq.head);
+}
+
+static int is_hw_rq_empty(struct ocrdma_qp *qp)
+{
+ return (qp->rq.tail == qp->rq.head);
+}
+
+static void *ocrdma_hwq_head(struct ocrdma_qp_hwq_info *q)
+{
+ return q->va + (q->head * q->entry_size);
+}
+
+static void *ocrdma_hwq_head_from_idx(struct ocrdma_qp_hwq_info *q,
+ u32 idx)
+{
+ return q->va + (idx * q->entry_size);
+}
+
+static void ocrdma_hwq_inc_head(struct ocrdma_qp_hwq_info *q)
+{
+ q->head = (q->head + 1) & q->max_wqe_idx;
+}
+
+static void ocrdma_hwq_inc_tail(struct ocrdma_qp_hwq_info *q)
+{
+ q->tail = (q->tail + 1) & q->max_wqe_idx;
+}
+
+/* discard the cqes for a given QP */
+static void ocrdma_discard_cqes(struct ocrdma_qp *qp, struct ocrdma_cq *cq)
+{
+ unsigned long cq_flags;
+ unsigned long flags;
+ int discard_cnt = 0;
+ u32 cur_getp, stop_getp;
+ struct ocrdma_cqe *cqe;
+ u32 qpn = 0, wqe_idx = 0;
+
+ spin_lock_irqsave(&cq->cq_lock, cq_flags);
+
+ /* traverse the CQEs in the hw CQ, find the CQEs matching the
+ * given qp and mark them discarded by clearing their qpn.
+ * The doorbell is rung in poll_cq() since we don't complete
+ * CQEs out of order.
+ */
+
+ cur_getp = cq->getp;
+ /* find up to where we reap the cq. */
+ stop_getp = cur_getp;
+ do {
+ if (is_hw_sq_empty(qp) && (!qp->srq && is_hw_rq_empty(qp)))
+ break;
+
+ cqe = cq->va + cur_getp;
+ /* exit if (a) we are done reaping the whole hw cq, or
+ * (b) the qp's sq/rq becomes empty.
+ */
+ qpn = cqe->cmn.qpn & OCRDMA_CQE_QPN_MASK;
+ /* if previously discarded cqe found, skip that too. */
+ /* check for matching qp */
+ if (qpn == 0 || qpn != qp->id)
+ goto skip_cqe;
+
+ if (is_cqe_for_sq(cqe)) {
+ ocrdma_hwq_inc_tail(&qp->sq);
+ } else {
+ if (qp->srq) {
+ wqe_idx = (le32_to_cpu(cqe->rq.buftag_qpn) >>
+ OCRDMA_CQE_BUFTAG_SHIFT) &
+ qp->srq->rq.max_wqe_idx;
+ if (wqe_idx < 1)
+ BUG();
+ spin_lock_irqsave(&qp->srq->q_lock, flags);
+ ocrdma_hwq_inc_tail(&qp->srq->rq);
+ ocrdma_srq_toggle_bit(qp->srq, wqe_idx - 1);
+ spin_unlock_irqrestore(&qp->srq->q_lock, flags);
+
+ } else {
+ ocrdma_hwq_inc_tail(&qp->rq);
+ }
+ }
+ /* mark cqe discarded so that it is not picked up later
+ * in the poll_cq().
+ */
+ discard_cnt += 1;
+ cqe->cmn.qpn = 0;
+skip_cqe:
+ cur_getp = (cur_getp + 1) % cq->max_hw_cqe;
+ } while (cur_getp != stop_getp);
+ spin_unlock_irqrestore(&cq->cq_lock, cq_flags);
+}
+
+void ocrdma_del_flush_qp(struct ocrdma_qp *qp)
+{
+ int found = false;
+ unsigned long flags;
+ struct ocrdma_dev *dev = get_ocrdma_dev(qp->ibqp.device);
+ /* sync with any active CQ poll */
+
+ spin_lock_irqsave(&dev->flush_q_lock, flags);
+ found = ocrdma_is_qp_in_sq_flushlist(qp->sq_cq, qp);
+ if (found)
+ list_del(&qp->sq_entry);
+ if (!qp->srq) {
+ found = ocrdma_is_qp_in_rq_flushlist(qp->rq_cq, qp);
+ if (found)
+ list_del(&qp->rq_entry);
+ }
+ spin_unlock_irqrestore(&dev->flush_q_lock, flags);
+}
+
+int ocrdma_destroy_qp(struct ib_qp *ibqp)
+{
+ struct ocrdma_pd *pd;
+ struct ocrdma_qp *qp;
+ struct ocrdma_dev *dev;
+ struct ib_qp_attr attrs;
+ int attr_mask;
+ unsigned long flags;
+
+ qp = get_ocrdma_qp(ibqp);
+ dev = get_ocrdma_dev(ibqp->device);
+
+ pd = qp->pd;
+
+ /* change the QP state to ERROR */
+ if (qp->state != OCRDMA_QPS_RST) {
+ attrs.qp_state = IB_QPS_ERR;
+ attr_mask = IB_QP_STATE;
+ _ocrdma_modify_qp(ibqp, &attrs, attr_mask);
+ }
+ /* ensure that CQEs for a newly created QP (whose id may be the
+ * same as one just destroyed) don't get discarded until the old
+ * QP's CQEs are discarded.
+ */
+ mutex_lock(&dev->dev_lock);
+ (void) ocrdma_mbx_destroy_qp(dev, qp);
+
+ /*
+ * acquire CQ lock while destroy is in progress, in order to
+ * protect against processing in-flight CQEs for this QP.
+ */
+ spin_lock_irqsave(&qp->sq_cq->cq_lock, flags);
+ if (qp->rq_cq && (qp->rq_cq != qp->sq_cq))
+ spin_lock(&qp->rq_cq->cq_lock);
+
+ ocrdma_del_qpn_map(dev, qp);
+
+ if (qp->rq_cq && (qp->rq_cq != qp->sq_cq))
+ spin_unlock(&qp->rq_cq->cq_lock);
+ spin_unlock_irqrestore(&qp->sq_cq->cq_lock, flags);
+
+ if (!pd->uctx) {
+ ocrdma_discard_cqes(qp, qp->sq_cq);
+ ocrdma_discard_cqes(qp, qp->rq_cq);
+ }
+ mutex_unlock(&dev->dev_lock);
+
+ if (pd->uctx) {
+ ocrdma_del_mmap(pd->uctx, (u64) qp->sq.pa,
+ PAGE_ALIGN(qp->sq.len));
+ if (!qp->srq)
+ ocrdma_del_mmap(pd->uctx, (u64) qp->rq.pa,
+ PAGE_ALIGN(qp->rq.len));
+ }
+
+ ocrdma_del_flush_qp(qp);
+
+ kfree(qp->wqe_wr_id_tbl);
+ kfree(qp->rqe_wr_id_tbl);
+ kfree(qp);
+ return 0;
+}
+
+static int ocrdma_copy_srq_uresp(struct ocrdma_dev *dev, struct ocrdma_srq *srq,
+ struct ib_udata *udata)
+{
+ int status;
+ struct ocrdma_create_srq_uresp uresp;
+
+ memset(&uresp, 0, sizeof(uresp));
+ uresp.rq_dbid = srq->rq.dbid;
+ uresp.num_rq_pages = 1;
+ uresp.rq_page_addr[0] = virt_to_phys(srq->rq.va);
+ uresp.rq_page_size = srq->rq.len;
+ uresp.db_page_addr = dev->nic_info.unmapped_db +
+ (srq->pd->id * dev->nic_info.db_page_size);
+ uresp.db_page_size = dev->nic_info.db_page_size;
+ uresp.num_rqe_allocated = srq->rq.max_cnt;
+ if (ocrdma_get_asic_type(dev) == OCRDMA_ASIC_GEN_SKH_R) {
+ uresp.db_rq_offset = OCRDMA_DB_GEN2_RQ_OFFSET;
+ uresp.db_shift = 24;
+ } else {
+ uresp.db_rq_offset = OCRDMA_DB_RQ_OFFSET;
+ uresp.db_shift = 16;
+ }
+
+ status = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
+ if (status)
+ return status;
+ status = ocrdma_add_mmap(srq->pd->uctx, uresp.rq_page_addr[0],
+ uresp.rq_page_size);
+ if (status)
+ return status;
+ return status;
+}
+
+struct ib_srq *ocrdma_create_srq(struct ib_pd *ibpd,
+ struct ib_srq_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ int status = -ENOMEM;
+ struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
+ struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
+ struct ocrdma_srq *srq;
+
+ if (init_attr->attr.max_sge > dev->attr.max_recv_sge)
+ return ERR_PTR(-EINVAL);
+ if (init_attr->attr.max_wr > dev->attr.max_rqe)
+ return ERR_PTR(-EINVAL);
+
+ srq = kzalloc(sizeof(*srq), GFP_KERNEL);
+ if (!srq)
+ return ERR_PTR(status);
+
+ spin_lock_init(&srq->q_lock);
+ srq->pd = pd;
+ srq->db = dev->nic_info.db + (pd->id * dev->nic_info.db_page_size);
+ status = ocrdma_mbx_create_srq(dev, srq, init_attr, pd);
+ if (status)
+ goto err;
+
+ if (udata == NULL) {
+ srq->rqe_wr_id_tbl = kzalloc(sizeof(u64) * srq->rq.max_cnt,
+ GFP_KERNEL);
+ if (srq->rqe_wr_id_tbl == NULL)
+ goto arm_err;
+
+ srq->bit_fields_len = (srq->rq.max_cnt / 32) +
+ (srq->rq.max_cnt % 32 ? 1 : 0);
+ srq->idx_bit_fields =
+ kmalloc(srq->bit_fields_len * sizeof(u32), GFP_KERNEL);
+ if (srq->idx_bit_fields == NULL)
+ goto arm_err;
+ memset(srq->idx_bit_fields, 0xff,
+ srq->bit_fields_len * sizeof(u32));
+ }
+
+ if (init_attr->attr.srq_limit) {
+ status = ocrdma_mbx_modify_srq(srq, &init_attr->attr);
+ if (status)
+ goto arm_err;
+ }
+
+ if (udata) {
+ status = ocrdma_copy_srq_uresp(dev, srq, udata);
+ if (status)
+ goto arm_err;
+ }
+
+ return &srq->ibsrq;
+
+arm_err:
+ ocrdma_mbx_destroy_srq(dev, srq);
+err:
+ kfree(srq->rqe_wr_id_tbl);
+ kfree(srq->idx_bit_fields);
+ kfree(srq);
+ return ERR_PTR(status);
+}
+
+int ocrdma_modify_srq(struct ib_srq *ibsrq,
+ struct ib_srq_attr *srq_attr,
+ enum ib_srq_attr_mask srq_attr_mask,
+ struct ib_udata *udata)
+{
+ int status = 0;
+ struct ocrdma_srq *srq;
+
+ srq = get_ocrdma_srq(ibsrq);
+ if (srq_attr_mask & IB_SRQ_MAX_WR)
+ status = -EINVAL;
+ else
+ status = ocrdma_mbx_modify_srq(srq, srq_attr);
+ return status;
+}
+
+int ocrdma_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
+{
+ int status;
+ struct ocrdma_srq *srq;
+
+ srq = get_ocrdma_srq(ibsrq);
+ status = ocrdma_mbx_query_srq(srq, srq_attr);
+ return status;
+}
+
+int ocrdma_destroy_srq(struct ib_srq *ibsrq)
+{
+ int status;
+ struct ocrdma_srq *srq;
+ struct ocrdma_dev *dev = get_ocrdma_dev(ibsrq->device);
+
+ srq = get_ocrdma_srq(ibsrq);
+
+ status = ocrdma_mbx_destroy_srq(dev, srq);
+
+ if (srq->pd->uctx)
+ ocrdma_del_mmap(srq->pd->uctx, (u64) srq->rq.pa,
+ PAGE_ALIGN(srq->rq.len));
+
+ kfree(srq->idx_bit_fields);
+ kfree(srq->rqe_wr_id_tbl);
+ kfree(srq);
+ return status;
+}
+
+/* unprivileged verbs and their support functions. */
+static void ocrdma_build_ud_hdr(struct ocrdma_qp *qp,
+ struct ocrdma_hdr_wqe *hdr,
+ struct ib_send_wr *wr)
+{
+ struct ocrdma_ewqe_ud_hdr *ud_hdr =
+ (struct ocrdma_ewqe_ud_hdr *)(hdr + 1);
+ struct ocrdma_ah *ah = get_ocrdma_ah(wr->wr.ud.ah);
+
+ ud_hdr->rsvd_dest_qpn = wr->wr.ud.remote_qpn;
+ if (qp->qp_type == IB_QPT_GSI)
+ ud_hdr->qkey = qp->qkey;
+ else
+ ud_hdr->qkey = wr->wr.ud.remote_qkey;
+ ud_hdr->rsvd_ahid = ah->id;
+ if (ah->av->valid & OCRDMA_AV_VLAN_VALID)
+ hdr->cw |= (OCRDMA_FLAG_AH_VLAN_PR << OCRDMA_WQE_FLAGS_SHIFT);
+}
+
+static void ocrdma_build_sges(struct ocrdma_hdr_wqe *hdr,
+ struct ocrdma_sge *sge, int num_sge,
+ struct ib_sge *sg_list)
+{
+ int i;
+
+ for (i = 0; i < num_sge; i++) {
+ sge[i].lrkey = sg_list[i].lkey;
+ sge[i].addr_lo = sg_list[i].addr;
+ sge[i].addr_hi = upper_32_bits(sg_list[i].addr);
+ sge[i].len = sg_list[i].length;
+ hdr->total_len += sg_list[i].length;
+ }
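+	/* a single zeroed SGE entry is still written when there are no SGEs */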
+ if (num_sge == 0)
+ memset(sge, 0, sizeof(*sge));
+}
+
+static inline uint32_t ocrdma_sglist_len(struct ib_sge *sg_list, int num_sge)
+{
+ uint32_t total_len = 0, i;
+
+ for (i = 0; i < num_sge; i++)
+ total_len += sg_list[i].length;
+ return total_len;
+}
+
+
+static int ocrdma_build_inline_sges(struct ocrdma_qp *qp,
+ struct ocrdma_hdr_wqe *hdr,
+ struct ocrdma_sge *sge,
+ struct ib_send_wr *wr, u32 wqe_size)
+{
+ int i;
+ char *dpp_addr;
+
+ if (wr->send_flags & IB_SEND_INLINE && qp->qp_type != IB_QPT_UD) {
+ hdr->total_len = ocrdma_sglist_len(wr->sg_list, wr->num_sge);
+ if (unlikely(hdr->total_len > qp->max_inline_data)) {
+ pr_err("%s() supported_len=0x%x,\n"
+ " unsupported len req=0x%x\n", __func__,
+ qp->max_inline_data, hdr->total_len);
+ return -EINVAL;
+ }
+ dpp_addr = (char *)sge;
+ for (i = 0; i < wr->num_sge; i++) {
+ memcpy(dpp_addr,
+ (void *)(unsigned long)wr->sg_list[i].addr,
+ wr->sg_list[i].length);
+ dpp_addr += wr->sg_list[i].length;
+ }
+
+ wqe_size += roundup(hdr->total_len, OCRDMA_WQE_ALIGN_BYTES);
+		if (hdr->total_len == 0)
+ wqe_size += sizeof(struct ocrdma_sge);
+ hdr->cw |= (OCRDMA_TYPE_INLINE << OCRDMA_WQE_TYPE_SHIFT);
+ } else {
+ ocrdma_build_sges(hdr, sge, wr->num_sge, wr->sg_list);
+ if (wr->num_sge)
+ wqe_size += (wr->num_sge * sizeof(struct ocrdma_sge));
+ else
+ wqe_size += sizeof(struct ocrdma_sge);
+ hdr->cw |= (OCRDMA_TYPE_LKEY << OCRDMA_WQE_TYPE_SHIFT);
+ }
+ hdr->cw |= ((wqe_size / OCRDMA_WQE_STRIDE) << OCRDMA_WQE_SIZE_SHIFT);
+ return 0;
+}
+
+static int ocrdma_build_send(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr,
+ struct ib_send_wr *wr)
+{
+ int status;
+ struct ocrdma_sge *sge;
+ u32 wqe_size = sizeof(*hdr);
+
+ if (qp->qp_type == IB_QPT_UD || qp->qp_type == IB_QPT_GSI) {
+ ocrdma_build_ud_hdr(qp, hdr, wr);
+ sge = (struct ocrdma_sge *)(hdr + 2);
+ wqe_size += sizeof(struct ocrdma_ewqe_ud_hdr);
+ } else {
+ sge = (struct ocrdma_sge *)(hdr + 1);
+ }
+
+ status = ocrdma_build_inline_sges(qp, hdr, sge, wr, wqe_size);
+ return status;
+}
+
+static int ocrdma_build_write(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr,
+ struct ib_send_wr *wr)
+{
+ int status;
+ struct ocrdma_sge *ext_rw = (struct ocrdma_sge *)(hdr + 1);
+ struct ocrdma_sge *sge = ext_rw + 1;
+ u32 wqe_size = sizeof(*hdr) + sizeof(*ext_rw);
+
+ status = ocrdma_build_inline_sges(qp, hdr, sge, wr, wqe_size);
+ if (status)
+ return status;
+ ext_rw->addr_lo = wr->wr.rdma.remote_addr;
+ ext_rw->addr_hi = upper_32_bits(wr->wr.rdma.remote_addr);
+ ext_rw->lrkey = wr->wr.rdma.rkey;
+ ext_rw->len = hdr->total_len;
+ return 0;
+}
+
+static void ocrdma_build_read(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr,
+ struct ib_send_wr *wr)
+{
+ struct ocrdma_sge *ext_rw = (struct ocrdma_sge *)(hdr + 1);
+ struct ocrdma_sge *sge = ext_rw + 1;
+ u32 wqe_size = ((wr->num_sge + 1) * sizeof(struct ocrdma_sge)) +
+ sizeof(struct ocrdma_hdr_wqe);
+
+ ocrdma_build_sges(hdr, sge, wr->num_sge, wr->sg_list);
+ hdr->cw |= ((wqe_size / OCRDMA_WQE_STRIDE) << OCRDMA_WQE_SIZE_SHIFT);
+ hdr->cw |= (OCRDMA_READ << OCRDMA_WQE_OPCODE_SHIFT);
+ hdr->cw |= (OCRDMA_TYPE_LKEY << OCRDMA_WQE_TYPE_SHIFT);
+
+ ext_rw->addr_lo = wr->wr.rdma.remote_addr;
+ ext_rw->addr_hi = upper_32_bits(wr->wr.rdma.remote_addr);
+ ext_rw->lrkey = wr->wr.rdma.rkey;
+ ext_rw->len = hdr->total_len;
+}
+
+static void build_frmr_pbes(struct ib_send_wr *wr, struct ocrdma_pbl *pbl_tbl,
+ struct ocrdma_hw_mr *hwmr)
+{
+ int i;
+ u64 buf_addr = 0;
+ int num_pbes;
+ struct ocrdma_pbe *pbe;
+
+ pbe = (struct ocrdma_pbe *)pbl_tbl->va;
+ num_pbes = 0;
+
+ /* go through the OS phy regions & fill hw pbe entries into pbls. */
+ for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) {
+		/* the number of pbes can be more than one per OS buf
+		 * when buffers are of different sizes;
+		 * split the ib_buf into one or more pbes.
+ */
+ buf_addr = wr->wr.fast_reg.page_list->page_list[i];
+ pbe->pa_lo = cpu_to_le32((u32) (buf_addr & PAGE_MASK));
+ pbe->pa_hi = cpu_to_le32((u32) upper_32_bits(buf_addr));
+ num_pbes += 1;
+ pbe++;
+
+ /* if the pbl is full storing the pbes,
+ * move to next pbl.
+ */
+ if (num_pbes == (hwmr->pbl_size/sizeof(u64))) {
+ pbl_tbl++;
+ pbe = (struct ocrdma_pbe *)pbl_tbl->va;
+ }
+ }
+ return;
+}
+
+static int get_encoded_page_size(int pg_sz)
+{
+	/* Max size is 256M (4096 << 16); return i such that pg_sz == 4096 << i */
+ int i = 0;
+ for (; i < 17; i++)
+ if (pg_sz == (4096 << i))
+ break;
+ return i;
+}
+
+
+static int ocrdma_build_fr(struct ocrdma_qp *qp, struct ocrdma_hdr_wqe *hdr,
+ struct ib_send_wr *wr)
+{
+ u64 fbo;
+ struct ocrdma_ewqe_fr *fast_reg = (struct ocrdma_ewqe_fr *)(hdr + 1);
+ struct ocrdma_mr *mr;
+ struct ocrdma_dev *dev = get_ocrdma_dev(qp->ibqp.device);
+ u32 wqe_size = sizeof(*fast_reg) + sizeof(*hdr);
+
+ wqe_size = roundup(wqe_size, OCRDMA_WQE_ALIGN_BYTES);
+
+ if (wr->wr.fast_reg.page_list_len > dev->attr.max_pages_per_frmr)
+ return -EINVAL;
+
+ hdr->cw |= (OCRDMA_FR_MR << OCRDMA_WQE_OPCODE_SHIFT);
+ hdr->cw |= ((wqe_size / OCRDMA_WQE_STRIDE) << OCRDMA_WQE_SIZE_SHIFT);
+
+ if (wr->wr.fast_reg.page_list_len == 0)
+ BUG();
+ if (wr->wr.fast_reg.access_flags & IB_ACCESS_LOCAL_WRITE)
+ hdr->rsvd_lkey_flags |= OCRDMA_LKEY_FLAG_LOCAL_WR;
+ if (wr->wr.fast_reg.access_flags & IB_ACCESS_REMOTE_WRITE)
+ hdr->rsvd_lkey_flags |= OCRDMA_LKEY_FLAG_REMOTE_WR;
+ if (wr->wr.fast_reg.access_flags & IB_ACCESS_REMOTE_READ)
+ hdr->rsvd_lkey_flags |= OCRDMA_LKEY_FLAG_REMOTE_RD;
+ hdr->lkey = wr->wr.fast_reg.rkey;
+ hdr->total_len = wr->wr.fast_reg.length;
+
+ fbo = wr->wr.fast_reg.iova_start -
+ (wr->wr.fast_reg.page_list->page_list[0] & PAGE_MASK);
+
+ fast_reg->va_hi = upper_32_bits(wr->wr.fast_reg.iova_start);
+ fast_reg->va_lo = (u32) (wr->wr.fast_reg.iova_start & 0xffffffff);
+ fast_reg->fbo_hi = upper_32_bits(fbo);
+ fast_reg->fbo_lo = (u32) fbo & 0xffffffff;
+ fast_reg->num_sges = wr->wr.fast_reg.page_list_len;
+ fast_reg->size_sge =
+ get_encoded_page_size(1 << wr->wr.fast_reg.page_shift);
+ mr = (struct ocrdma_mr *) (unsigned long)
+ dev->stag_arr[(hdr->lkey >> 8) & (OCRDMA_MAX_STAG - 1)];
+ build_frmr_pbes(wr, mr->hwmr.pbl_table, &mr->hwmr);
+ return 0;
+}
+
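+/* ring the SQ doorbell so the adapter starts processing the posted WQE(s) */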
+static void ocrdma_ring_sq_db(struct ocrdma_qp *qp)
+{
+ u32 val = qp->sq.dbid | (1 << OCRDMA_DB_SQ_SHIFT);
+
+ iowrite32(val, qp->sq_db);
+}
+
+int ocrdma_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr)
+{
+ int status = 0;
+ struct ocrdma_qp *qp = get_ocrdma_qp(ibqp);
+ struct ocrdma_hdr_wqe *hdr;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->q_lock, flags);
+ if (qp->state != OCRDMA_QPS_RTS && qp->state != OCRDMA_QPS_SQD) {
+ spin_unlock_irqrestore(&qp->q_lock, flags);
+ *bad_wr = wr;
+ return -EINVAL;
+ }
+
+ while (wr) {
+ if (qp->qp_type == IB_QPT_UD &&
+ (wr->opcode != IB_WR_SEND &&
+ wr->opcode != IB_WR_SEND_WITH_IMM)) {
+ *bad_wr = wr;
+ status = -EINVAL;
+ break;
+ }
+ if (ocrdma_hwq_free_cnt(&qp->sq) == 0 ||
+ wr->num_sge > qp->sq.max_sges) {
+ *bad_wr = wr;
+ status = -ENOMEM;
+ break;
+ }
+ hdr = ocrdma_hwq_head(&qp->sq);
+ hdr->cw = 0;
+ if (wr->send_flags & IB_SEND_SIGNALED || qp->signaled)
+ hdr->cw |= (OCRDMA_FLAG_SIG << OCRDMA_WQE_FLAGS_SHIFT);
+ if (wr->send_flags & IB_SEND_FENCE)
+ hdr->cw |=
+ (OCRDMA_FLAG_FENCE_L << OCRDMA_WQE_FLAGS_SHIFT);
+ if (wr->send_flags & IB_SEND_SOLICITED)
+ hdr->cw |=
+ (OCRDMA_FLAG_SOLICIT << OCRDMA_WQE_FLAGS_SHIFT);
+ hdr->total_len = 0;
+ switch (wr->opcode) {
+ case IB_WR_SEND_WITH_IMM:
+ hdr->cw |= (OCRDMA_FLAG_IMM << OCRDMA_WQE_FLAGS_SHIFT);
+ hdr->immdt = ntohl(wr->ex.imm_data);
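+			/* fall through */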
+ case IB_WR_SEND:
+ hdr->cw |= (OCRDMA_SEND << OCRDMA_WQE_OPCODE_SHIFT);
+			status = ocrdma_build_send(qp, hdr, wr);
+ break;
+ case IB_WR_SEND_WITH_INV:
+ hdr->cw |= (OCRDMA_FLAG_INV << OCRDMA_WQE_FLAGS_SHIFT);
+ hdr->cw |= (OCRDMA_SEND << OCRDMA_WQE_OPCODE_SHIFT);
+ hdr->lkey = wr->ex.invalidate_rkey;
+ status = ocrdma_build_send(qp, hdr, wr);
+ break;
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ hdr->cw |= (OCRDMA_FLAG_IMM << OCRDMA_WQE_FLAGS_SHIFT);
+ hdr->immdt = ntohl(wr->ex.imm_data);
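+			/* fall through */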
+ case IB_WR_RDMA_WRITE:
+ hdr->cw |= (OCRDMA_WRITE << OCRDMA_WQE_OPCODE_SHIFT);
+ status = ocrdma_build_write(qp, hdr, wr);
+ break;
+ case IB_WR_RDMA_READ:
+ ocrdma_build_read(qp, hdr, wr);
+ break;
+ case IB_WR_LOCAL_INV:
+ hdr->cw |=
+ (OCRDMA_LKEY_INV << OCRDMA_WQE_OPCODE_SHIFT);
+ hdr->cw |= ((sizeof(struct ocrdma_hdr_wqe) +
+ sizeof(struct ocrdma_sge)) /
+ OCRDMA_WQE_STRIDE) << OCRDMA_WQE_SIZE_SHIFT;
+ hdr->lkey = wr->ex.invalidate_rkey;
+ break;
+ case IB_WR_FAST_REG_MR:
+ status = ocrdma_build_fr(qp, hdr, wr);
+ break;
+ default:
+ status = -EINVAL;
+ break;
+ }
+ if (status) {
+ *bad_wr = wr;
+ break;
+ }
+ if (wr->send_flags & IB_SEND_SIGNALED || qp->signaled)
+ qp->wqe_wr_id_tbl[qp->sq.head].signaled = 1;
+ else
+ qp->wqe_wr_id_tbl[qp->sq.head].signaled = 0;
+ qp->wqe_wr_id_tbl[qp->sq.head].wrid = wr->wr_id;
+ ocrdma_cpu_to_le32(hdr, ((hdr->cw >> OCRDMA_WQE_SIZE_SHIFT) &
+ OCRDMA_WQE_SIZE_MASK) * OCRDMA_WQE_STRIDE);
+ /* make sure wqe is written before adapter can access it */
+ wmb();
+ /* inform hw to start processing it */
+ ocrdma_ring_sq_db(qp);
+
+ /* update pointer, counter for next wr */
+ ocrdma_hwq_inc_head(&qp->sq);
+ wr = wr->next;
+ }
+ spin_unlock_irqrestore(&qp->q_lock, flags);
+ return status;
+}
+
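+/* ring the RQ doorbell for a newly posted RQE */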
+static void ocrdma_ring_rq_db(struct ocrdma_qp *qp)
+{
+ u32 val = qp->rq.dbid | (1 << OCRDMA_DB_RQ_SHIFT);
+
+ iowrite32(val, qp->rq_db);
+}
+
+static void ocrdma_build_rqe(struct ocrdma_hdr_wqe *rqe, struct ib_recv_wr *wr,
+ u16 tag)
+{
+ u32 wqe_size = 0;
+ struct ocrdma_sge *sge;
+ if (wr->num_sge)
+ wqe_size = (wr->num_sge * sizeof(*sge)) + sizeof(*rqe);
+ else
+ wqe_size = sizeof(*sge) + sizeof(*rqe);
+
+ rqe->cw = ((wqe_size / OCRDMA_WQE_STRIDE) <<
+ OCRDMA_WQE_SIZE_SHIFT);
+ rqe->cw |= (OCRDMA_FLAG_SIG << OCRDMA_WQE_FLAGS_SHIFT);
+ rqe->cw |= (OCRDMA_TYPE_LKEY << OCRDMA_WQE_TYPE_SHIFT);
+ rqe->total_len = 0;
+ rqe->rsvd_tag = tag;
+ sge = (struct ocrdma_sge *)(rqe + 1);
+ ocrdma_build_sges(rqe, sge, wr->num_sge, wr->sg_list);
+ ocrdma_cpu_to_le32(rqe, wqe_size);
+}
+
+int ocrdma_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ int status = 0;
+ unsigned long flags;
+ struct ocrdma_qp *qp = get_ocrdma_qp(ibqp);
+ struct ocrdma_hdr_wqe *rqe;
+
+ spin_lock_irqsave(&qp->q_lock, flags);
+ if (qp->state == OCRDMA_QPS_RST || qp->state == OCRDMA_QPS_ERR) {
+ spin_unlock_irqrestore(&qp->q_lock, flags);
+ *bad_wr = wr;
+ return -EINVAL;
+ }
+ while (wr) {
+ if (ocrdma_hwq_free_cnt(&qp->rq) == 0 ||
+ wr->num_sge > qp->rq.max_sges) {
+ *bad_wr = wr;
+ status = -ENOMEM;
+ break;
+ }
+ rqe = ocrdma_hwq_head(&qp->rq);
+ ocrdma_build_rqe(rqe, wr, 0);
+
+ qp->rqe_wr_id_tbl[qp->rq.head] = wr->wr_id;
+ /* make sure rqe is written before adapter can access it */
+ wmb();
+
+ /* inform hw to start processing it */
+ ocrdma_ring_rq_db(qp);
+
+ /* update pointer, counter for next wr */
+ ocrdma_hwq_inc_head(&qp->rq);
+ wr = wr->next;
+ }
+ spin_unlock_irqrestore(&qp->q_lock, flags);
+ return status;
+}
+
+/* cqes for an srq's rqes can potentially arrive out of order.
+ * The index gives the entry in the shadow table where the wr_id is
+ * stored; the tag/index is returned in the cqe to reference back
+ * to the given rqe.
+ */
+static int ocrdma_srq_get_idx(struct ocrdma_srq *srq)
+{
+ int row = 0;
+ int indx = 0;
+
+ for (row = 0; row < srq->bit_fields_len; row++) {
+ if (srq->idx_bit_fields[row]) {
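+			/* ffs() returns a 1-based bit position */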
+ indx = ffs(srq->idx_bit_fields[row]);
+ indx = (row * 32) + (indx - 1);
+ if (indx >= srq->rq.max_cnt)
+ BUG();
+ ocrdma_srq_toggle_bit(srq, indx);
+ break;
+ }
+ }
+
+ if (row == srq->bit_fields_len)
+ BUG();
+ return indx + 1; /* Use from index 1 */
+}
+
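+/* ring the SRQ doorbell for a newly posted RQE */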
+static void ocrdma_ring_srq_db(struct ocrdma_srq *srq)
+{
+ u32 val = srq->rq.dbid | (1 << 16);
+
+ iowrite32(val, srq->db + OCRDMA_DB_GEN2_SRQ_OFFSET);
+}
+
+int ocrdma_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ int status = 0;
+ unsigned long flags;
+ struct ocrdma_srq *srq;
+ struct ocrdma_hdr_wqe *rqe;
+ u16 tag;
+
+ srq = get_ocrdma_srq(ibsrq);
+
+ spin_lock_irqsave(&srq->q_lock, flags);
+ while (wr) {
+ if (ocrdma_hwq_free_cnt(&srq->rq) == 0 ||
+ wr->num_sge > srq->rq.max_sges) {
+ status = -ENOMEM;
+ *bad_wr = wr;
+ break;
+ }
+ tag = ocrdma_srq_get_idx(srq);
+ rqe = ocrdma_hwq_head(&srq->rq);
+ ocrdma_build_rqe(rqe, wr, tag);
+
+ srq->rqe_wr_id_tbl[tag] = wr->wr_id;
+ /* make sure rqe is written before adapter can perform DMA */
+ wmb();
+ /* inform hw to start processing it */
+ ocrdma_ring_srq_db(srq);
+ /* update pointer, counter for next wr */
+ ocrdma_hwq_inc_head(&srq->rq);
+ wr = wr->next;
+ }
+ spin_unlock_irqrestore(&srq->q_lock, flags);
+ return status;
+}
+
+static enum ib_wc_status ocrdma_to_ibwc_err(u16 status)
+{
+ enum ib_wc_status ibwc_status;
+
+ switch (status) {
+ case OCRDMA_CQE_GENERAL_ERR:
+ ibwc_status = IB_WC_GENERAL_ERR;
+ break;
+ case OCRDMA_CQE_LOC_LEN_ERR:
+ ibwc_status = IB_WC_LOC_LEN_ERR;
+ break;
+ case OCRDMA_CQE_LOC_QP_OP_ERR:
+ ibwc_status = IB_WC_LOC_QP_OP_ERR;
+ break;
+ case OCRDMA_CQE_LOC_EEC_OP_ERR:
+ ibwc_status = IB_WC_LOC_EEC_OP_ERR;
+ break;
+ case OCRDMA_CQE_LOC_PROT_ERR:
+ ibwc_status = IB_WC_LOC_PROT_ERR;
+ break;
+ case OCRDMA_CQE_WR_FLUSH_ERR:
+ ibwc_status = IB_WC_WR_FLUSH_ERR;
+ break;
+ case OCRDMA_CQE_MW_BIND_ERR:
+ ibwc_status = IB_WC_MW_BIND_ERR;
+ break;
+ case OCRDMA_CQE_BAD_RESP_ERR:
+ ibwc_status = IB_WC_BAD_RESP_ERR;
+ break;
+ case OCRDMA_CQE_LOC_ACCESS_ERR:
+ ibwc_status = IB_WC_LOC_ACCESS_ERR;
+ break;
+ case OCRDMA_CQE_REM_INV_REQ_ERR:
+ ibwc_status = IB_WC_REM_INV_REQ_ERR;
+ break;
+ case OCRDMA_CQE_REM_ACCESS_ERR:
+ ibwc_status = IB_WC_REM_ACCESS_ERR;
+ break;
+ case OCRDMA_CQE_REM_OP_ERR:
+ ibwc_status = IB_WC_REM_OP_ERR;
+ break;
+ case OCRDMA_CQE_RETRY_EXC_ERR:
+ ibwc_status = IB_WC_RETRY_EXC_ERR;
+ break;
+ case OCRDMA_CQE_RNR_RETRY_EXC_ERR:
+ ibwc_status = IB_WC_RNR_RETRY_EXC_ERR;
+ break;
+ case OCRDMA_CQE_LOC_RDD_VIOL_ERR:
+ ibwc_status = IB_WC_LOC_RDD_VIOL_ERR;
+ break;
+ case OCRDMA_CQE_REM_INV_RD_REQ_ERR:
+ ibwc_status = IB_WC_REM_INV_RD_REQ_ERR;
+ break;
+ case OCRDMA_CQE_REM_ABORT_ERR:
+ ibwc_status = IB_WC_REM_ABORT_ERR;
+ break;
+ case OCRDMA_CQE_INV_EECN_ERR:
+ ibwc_status = IB_WC_INV_EECN_ERR;
+ break;
+ case OCRDMA_CQE_INV_EEC_STATE_ERR:
+ ibwc_status = IB_WC_INV_EEC_STATE_ERR;
+ break;
+ case OCRDMA_CQE_FATAL_ERR:
+ ibwc_status = IB_WC_FATAL_ERR;
+ break;
+ case OCRDMA_CQE_RESP_TIMEOUT_ERR:
+ ibwc_status = IB_WC_RESP_TIMEOUT_ERR;
+ break;
+ default:
+ ibwc_status = IB_WC_GENERAL_ERR;
+ break;
+ }
+ return ibwc_status;
+}
+
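+/* fill the work completion from the SQ WQE at wqe_idx */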
+static void ocrdma_update_wc(struct ocrdma_qp *qp, struct ib_wc *ibwc,
+ u32 wqe_idx)
+{
+ struct ocrdma_hdr_wqe *hdr;
+ struct ocrdma_sge *rw;
+ int opcode;
+
+ hdr = ocrdma_hwq_head_from_idx(&qp->sq, wqe_idx);
+
+ ibwc->wr_id = qp->wqe_wr_id_tbl[wqe_idx].wrid;
+ /* Undo the hdr->cw swap */
+ opcode = le32_to_cpu(hdr->cw) & OCRDMA_WQE_OPCODE_MASK;
+ switch (opcode) {
+ case OCRDMA_WRITE:
+ ibwc->opcode = IB_WC_RDMA_WRITE;
+ break;
+ case OCRDMA_READ:
+ rw = (struct ocrdma_sge *)(hdr + 1);
+ ibwc->opcode = IB_WC_RDMA_READ;
+ ibwc->byte_len = rw->len;
+ break;
+ case OCRDMA_SEND:
+ ibwc->opcode = IB_WC_SEND;
+ break;
+ case OCRDMA_FR_MR:
+ ibwc->opcode = IB_WC_FAST_REG_MR;
+ break;
+ case OCRDMA_LKEY_INV:
+ ibwc->opcode = IB_WC_LOCAL_INV;
+ break;
+ default:
+ ibwc->status = IB_WC_GENERAL_ERR;
+ pr_err("%s() invalid opcode received = 0x%x\n",
+ __func__, hdr->cw & OCRDMA_WQE_OPCODE_MASK);
+ break;
+ }
+}
+
+static void ocrdma_set_cqe_status_flushed(struct ocrdma_qp *qp,
+ struct ocrdma_cqe *cqe)
+{
+ if (is_cqe_for_sq(cqe)) {
+ cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu(
+ cqe->flags_status_srcqpn) &
+ ~OCRDMA_CQE_STATUS_MASK);
+ cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu(
+ cqe->flags_status_srcqpn) |
+ (OCRDMA_CQE_WR_FLUSH_ERR <<
+ OCRDMA_CQE_STATUS_SHIFT));
+ } else {
+ if (qp->qp_type == IB_QPT_UD || qp->qp_type == IB_QPT_GSI) {
+ cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu(
+ cqe->flags_status_srcqpn) &
+ ~OCRDMA_CQE_UD_STATUS_MASK);
+ cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu(
+ cqe->flags_status_srcqpn) |
+ (OCRDMA_CQE_WR_FLUSH_ERR <<
+ OCRDMA_CQE_UD_STATUS_SHIFT));
+ } else {
+ cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu(
+ cqe->flags_status_srcqpn) &
+ ~OCRDMA_CQE_STATUS_MASK);
+ cqe->flags_status_srcqpn = cpu_to_le32(le32_to_cpu(
+ cqe->flags_status_srcqpn) |
+ (OCRDMA_CQE_WR_FLUSH_ERR <<
+ OCRDMA_CQE_STATUS_SHIFT));
+ }
+ }
+}
+
+static bool ocrdma_update_err_cqe(struct ib_wc *ibwc, struct ocrdma_cqe *cqe,
+ struct ocrdma_qp *qp, int status)
+{
+ bool expand = false;
+
+ ibwc->byte_len = 0;
+ ibwc->qp = &qp->ibqp;
+ ibwc->status = ocrdma_to_ibwc_err(status);
+
+ ocrdma_flush_qp(qp);
+ ocrdma_qp_state_change(qp, IB_QPS_ERR, NULL);
+
+	/* if wqes/rqes are pending for which cqes need to be returned,
+	 * trigger expanding them.
+ */
+ if (!is_hw_rq_empty(qp) || !is_hw_sq_empty(qp)) {
+ expand = true;
+ ocrdma_set_cqe_status_flushed(qp, cqe);
+ }
+ return expand;
+}
+
+static int ocrdma_update_err_rcqe(struct ib_wc *ibwc, struct ocrdma_cqe *cqe,
+ struct ocrdma_qp *qp, int status)
+{
+ ibwc->opcode = IB_WC_RECV;
+ ibwc->wr_id = qp->rqe_wr_id_tbl[qp->rq.tail];
+ ocrdma_hwq_inc_tail(&qp->rq);
+
+ return ocrdma_update_err_cqe(ibwc, cqe, qp, status);
+}
+
+static int ocrdma_update_err_scqe(struct ib_wc *ibwc, struct ocrdma_cqe *cqe,
+ struct ocrdma_qp *qp, int status)
+{
+ ocrdma_update_wc(qp, ibwc, qp->sq.tail);
+ ocrdma_hwq_inc_tail(&qp->sq);
+
+ return ocrdma_update_err_cqe(ibwc, cqe, qp, status);
+}
+
+
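+/* handle an error send cqe; the return value ("expand") tells the caller to
+ * poll the same hw cqe again to generate further flush completions.
+ */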
+static bool ocrdma_poll_err_scqe(struct ocrdma_qp *qp,
+ struct ocrdma_cqe *cqe, struct ib_wc *ibwc,
+ bool *polled, bool *stop)
+{
+ bool expand;
+ struct ocrdma_dev *dev = get_ocrdma_dev(qp->ibqp.device);
+ int status = (le32_to_cpu(cqe->flags_status_srcqpn) &
+ OCRDMA_CQE_STATUS_MASK) >> OCRDMA_CQE_STATUS_SHIFT;
+ if (status < OCRDMA_MAX_CQE_ERR)
+ atomic_inc(&dev->cqe_err_stats[status]);
+
+	/* when the hw sq is empty but the rq is not, keep the cqe
+	 * in order to get the cq event again.
+ */
+ if (is_hw_sq_empty(qp) && !is_hw_rq_empty(qp)) {
+ /* when cq for rq and sq is same, it is safe to return
+ * flush cqe for RQEs.
+ */
+ if (!qp->srq && (qp->sq_cq == qp->rq_cq)) {
+ *polled = true;
+ status = OCRDMA_CQE_WR_FLUSH_ERR;
+ expand = ocrdma_update_err_rcqe(ibwc, cqe, qp, status);
+ } else {
+ /* stop processing further cqe as this cqe is used for
+ * triggering cq event on buddy cq of RQ.
+ * When QP is destroyed, this cqe will be removed
+ * from the cq's hardware q.
+ */
+ *polled = false;
+ *stop = true;
+ expand = false;
+ }
+ } else if (is_hw_sq_empty(qp)) {
+ /* Do nothing */
+ expand = false;
+ *polled = false;
+ *stop = false;
+ } else {
+ *polled = true;
+ expand = ocrdma_update_err_scqe(ibwc, cqe, qp, status);
+ }
+ return expand;
+}
+
+static bool ocrdma_poll_success_scqe(struct ocrdma_qp *qp,
+ struct ocrdma_cqe *cqe,
+ struct ib_wc *ibwc, bool *polled)
+{
+ bool expand = false;
+ int tail = qp->sq.tail;
+ u32 wqe_idx;
+
+ if (!qp->wqe_wr_id_tbl[tail].signaled) {
+ *polled = false; /* WC cannot be consumed yet */
+ } else {
+ ibwc->status = IB_WC_SUCCESS;
+ ibwc->wc_flags = 0;
+ ibwc->qp = &qp->ibqp;
+ ocrdma_update_wc(qp, ibwc, tail);
+ *polled = true;
+ }
+ wqe_idx = (le32_to_cpu(cqe->wq.wqeidx) &
+ OCRDMA_CQE_WQEIDX_MASK) & qp->sq.max_wqe_idx;
+ if (tail != wqe_idx)
+ expand = true; /* Coalesced CQE can't be consumed yet */
+
+ ocrdma_hwq_inc_tail(&qp->sq);
+ return expand;
+}
+
+static bool ocrdma_poll_scqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe,
+ struct ib_wc *ibwc, bool *polled, bool *stop)
+{
+ int status;
+ bool expand;
+
+ status = (le32_to_cpu(cqe->flags_status_srcqpn) &
+ OCRDMA_CQE_STATUS_MASK) >> OCRDMA_CQE_STATUS_SHIFT;
+
+ if (status == OCRDMA_CQE_SUCCESS)
+ expand = ocrdma_poll_success_scqe(qp, cqe, ibwc, polled);
+ else
+ expand = ocrdma_poll_err_scqe(qp, cqe, ibwc, polled, stop);
+ return expand;
+}
+
+static int ocrdma_update_ud_rcqe(struct ib_wc *ibwc, struct ocrdma_cqe *cqe)
+{
+ int status;
+
+ status = (le32_to_cpu(cqe->flags_status_srcqpn) &
+ OCRDMA_CQE_UD_STATUS_MASK) >> OCRDMA_CQE_UD_STATUS_SHIFT;
+ ibwc->src_qp = le32_to_cpu(cqe->flags_status_srcqpn) &
+ OCRDMA_CQE_SRCQP_MASK;
+ ibwc->pkey_index = le32_to_cpu(cqe->ud.rxlen_pkey) &
+ OCRDMA_CQE_PKEY_MASK;
+ ibwc->wc_flags = IB_WC_GRH;
+ ibwc->byte_len = (le32_to_cpu(cqe->ud.rxlen_pkey) >>
+ OCRDMA_CQE_UD_XFER_LEN_SHIFT);
+ return status;
+}
+
+static void ocrdma_update_free_srq_cqe(struct ib_wc *ibwc,
+ struct ocrdma_cqe *cqe,
+ struct ocrdma_qp *qp)
+{
+ unsigned long flags;
+ struct ocrdma_srq *srq;
+ u32 wqe_idx;
+
+ srq = get_ocrdma_srq(qp->ibqp.srq);
+ wqe_idx = (le32_to_cpu(cqe->rq.buftag_qpn) >>
+ OCRDMA_CQE_BUFTAG_SHIFT) & srq->rq.max_wqe_idx;
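+	/* index 0 is never handed out by ocrdma_srq_get_idx() */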
+ if (wqe_idx < 1)
+ BUG();
+
+ ibwc->wr_id = srq->rqe_wr_id_tbl[wqe_idx];
+ spin_lock_irqsave(&srq->q_lock, flags);
+ ocrdma_srq_toggle_bit(srq, wqe_idx - 1);
+ spin_unlock_irqrestore(&srq->q_lock, flags);
+ ocrdma_hwq_inc_tail(&srq->rq);
+}
+
+static bool ocrdma_poll_err_rcqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe,
+ struct ib_wc *ibwc, bool *polled, bool *stop,
+ int status)
+{
+ bool expand;
+ struct ocrdma_dev *dev = get_ocrdma_dev(qp->ibqp.device);
+
+ if (status < OCRDMA_MAX_CQE_ERR)
+ atomic_inc(&dev->cqe_err_stats[status]);
+
+	/* when the hw rq is empty but the sq is not, keep the cqe
+	 * to get the cq event again.
+ */
+ if (is_hw_rq_empty(qp) && !is_hw_sq_empty(qp)) {
+ if (!qp->srq && (qp->sq_cq == qp->rq_cq)) {
+ *polled = true;
+ status = OCRDMA_CQE_WR_FLUSH_ERR;
+ expand = ocrdma_update_err_scqe(ibwc, cqe, qp, status);
+ } else {
+ *polled = false;
+ *stop = true;
+ expand = false;
+ }
+ } else if (is_hw_rq_empty(qp)) {
+ /* Do nothing */
+ expand = false;
+ *polled = false;
+ *stop = false;
+ } else {
+ *polled = true;
+ expand = ocrdma_update_err_rcqe(ibwc, cqe, qp, status);
+ }
+ return expand;
+}
+
+static void ocrdma_poll_success_rcqe(struct ocrdma_qp *qp,
+ struct ocrdma_cqe *cqe, struct ib_wc *ibwc)
+{
+ ibwc->opcode = IB_WC_RECV;
+ ibwc->qp = &qp->ibqp;
+ ibwc->status = IB_WC_SUCCESS;
+
+ if (qp->qp_type == IB_QPT_UD || qp->qp_type == IB_QPT_GSI)
+ ocrdma_update_ud_rcqe(ibwc, cqe);
+ else
+ ibwc->byte_len = le32_to_cpu(cqe->rq.rxlen);
+
+ if (is_cqe_imm(cqe)) {
+ ibwc->ex.imm_data = htonl(le32_to_cpu(cqe->rq.lkey_immdt));
+ ibwc->wc_flags |= IB_WC_WITH_IMM;
+ } else if (is_cqe_wr_imm(cqe)) {
+ ibwc->opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ ibwc->ex.imm_data = htonl(le32_to_cpu(cqe->rq.lkey_immdt));
+ ibwc->wc_flags |= IB_WC_WITH_IMM;
+ } else if (is_cqe_invalidated(cqe)) {
+ ibwc->ex.invalidate_rkey = le32_to_cpu(cqe->rq.lkey_immdt);
+ ibwc->wc_flags |= IB_WC_WITH_INVALIDATE;
+ }
+ if (qp->ibqp.srq) {
+ ocrdma_update_free_srq_cqe(ibwc, cqe, qp);
+ } else {
+ ibwc->wr_id = qp->rqe_wr_id_tbl[qp->rq.tail];
+ ocrdma_hwq_inc_tail(&qp->rq);
+ }
+}
+
+static bool ocrdma_poll_rcqe(struct ocrdma_qp *qp, struct ocrdma_cqe *cqe,
+ struct ib_wc *ibwc, bool *polled, bool *stop)
+{
+ int status;
+ bool expand = false;
+
+ ibwc->wc_flags = 0;
+ if (qp->qp_type == IB_QPT_UD || qp->qp_type == IB_QPT_GSI) {
+ status = (le32_to_cpu(cqe->flags_status_srcqpn) &
+ OCRDMA_CQE_UD_STATUS_MASK) >>
+ OCRDMA_CQE_UD_STATUS_SHIFT;
+ } else {
+ status = (le32_to_cpu(cqe->flags_status_srcqpn) &
+ OCRDMA_CQE_STATUS_MASK) >> OCRDMA_CQE_STATUS_SHIFT;
+ }
+
+ if (status == OCRDMA_CQE_SUCCESS) {
+ *polled = true;
+ ocrdma_poll_success_rcqe(qp, cqe, ibwc);
+ } else {
+ expand = ocrdma_poll_err_rcqe(qp, cqe, ibwc, polled, stop,
+ status);
+ }
+ return expand;
+}
+
+static void ocrdma_change_cq_phase(struct ocrdma_cq *cq, struct ocrdma_cqe *cqe,
+ u16 cur_getp)
+{
+ if (cq->phase_change) {
+ if (cur_getp == 0)
+ cq->phase = (~cq->phase & OCRDMA_CQE_VALID);
+ } else {
+ /* clear valid bit */
+ cqe->flags_status_srcqpn = 0;
+ }
+}
+
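+/* walk the hardware CQ ring and translate valid cqes into ib_wc entries */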
+static int ocrdma_poll_hwcq(struct ocrdma_cq *cq, int num_entries,
+ struct ib_wc *ibwc)
+{
+ u16 qpn = 0;
+ int i = 0;
+ bool expand = false;
+ int polled_hw_cqes = 0;
+ struct ocrdma_qp *qp = NULL;
+ struct ocrdma_dev *dev = get_ocrdma_dev(cq->ibcq.device);
+ struct ocrdma_cqe *cqe;
+	u16 cur_getp;
+	bool polled = false;
+	bool stop = false;
+
+ cur_getp = cq->getp;
+ while (num_entries) {
+ cqe = cq->va + cur_getp;
+ /* check whether valid cqe or not */
+ if (!is_cqe_valid(cq, cqe))
+ break;
+ qpn = (le32_to_cpu(cqe->cmn.qpn) & OCRDMA_CQE_QPN_MASK);
+ /* ignore discarded cqe */
+ if (qpn == 0)
+ goto skip_cqe;
+ qp = dev->qp_tbl[qpn];
+ BUG_ON(qp == NULL);
+
+ if (is_cqe_for_sq(cqe)) {
+ expand = ocrdma_poll_scqe(qp, cqe, ibwc, &polled,
+ &stop);
+ } else {
+ expand = ocrdma_poll_rcqe(qp, cqe, ibwc, &polled,
+ &stop);
+ }
+ if (expand)
+ goto expand_cqe;
+ if (stop)
+ goto stop_cqe;
+ /* clear qpn to avoid duplicate processing by discard_cqe() */
+ cqe->cmn.qpn = 0;
+skip_cqe:
+ polled_hw_cqes += 1;
+ cur_getp = (cur_getp + 1) % cq->max_hw_cqe;
+ ocrdma_change_cq_phase(cq, cqe, cur_getp);
+expand_cqe:
+ if (polled) {
+ num_entries -= 1;
+ i += 1;
+ ibwc = ibwc + 1;
+ polled = false;
+ }
+ }
+stop_cqe:
+ cq->getp = cur_getp;
+ if (cq->deferred_arm) {
+ ocrdma_ring_cq_db(dev, cq->id, true, cq->deferred_sol,
+ polled_hw_cqes);
+ cq->deferred_arm = false;
+ cq->deferred_sol = false;
+ } else {
+ /* We need to pop the CQE. No need to arm */
+ ocrdma_ring_cq_db(dev, cq->id, false, cq->deferred_sol,
+ polled_hw_cqes);
+ cq->deferred_sol = false;
+ }
+
+ return i;
+}
+
+/* insert error cqe if the QP's SQ or RQ's CQ matches the CQ under poll. */
+static int ocrdma_add_err_cqe(struct ocrdma_cq *cq, int num_entries,
+ struct ocrdma_qp *qp, struct ib_wc *ibwc)
+{
+ int err_cqes = 0;
+
+ while (num_entries) {
+ if (is_hw_sq_empty(qp) && is_hw_rq_empty(qp))
+ break;
+ if (!is_hw_sq_empty(qp) && qp->sq_cq == cq) {
+ ocrdma_update_wc(qp, ibwc, qp->sq.tail);
+ ocrdma_hwq_inc_tail(&qp->sq);
+ } else if (!is_hw_rq_empty(qp) && qp->rq_cq == cq) {
+ ibwc->wr_id = qp->rqe_wr_id_tbl[qp->rq.tail];
+ ocrdma_hwq_inc_tail(&qp->rq);
+ } else {
+ return err_cqes;
+ }
+ ibwc->byte_len = 0;
+ ibwc->status = IB_WC_WR_FLUSH_ERR;
+ ibwc = ibwc + 1;
+ err_cqes += 1;
+ num_entries -= 1;
+ }
+ return err_cqes;
+}
+
+int ocrdma_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
+{
+ int cqes_to_poll = num_entries;
+ struct ocrdma_cq *cq = get_ocrdma_cq(ibcq);
+ struct ocrdma_dev *dev = get_ocrdma_dev(ibcq->device);
+ int num_os_cqe = 0, err_cqes = 0;
+ struct ocrdma_qp *qp;
+ unsigned long flags;
+
+ /* poll cqes from adapter CQ */
+ spin_lock_irqsave(&cq->cq_lock, flags);
+ num_os_cqe = ocrdma_poll_hwcq(cq, cqes_to_poll, wc);
+ spin_unlock_irqrestore(&cq->cq_lock, flags);
+ cqes_to_poll -= num_os_cqe;
+
+ if (cqes_to_poll) {
+ wc = wc + num_os_cqe;
+		/* the adapter returns a single error cqe when a qp moves to
+		 * the error state, so insert error cqes with wc_status
+		 * FLUSHED for the pending WQEs and RQEs of every QP whose
+		 * SQ or RQ uses this CQ.
+ */
+ spin_lock_irqsave(&dev->flush_q_lock, flags);
+ list_for_each_entry(qp, &cq->sq_head, sq_entry) {
+ if (cqes_to_poll == 0)
+ break;
+ err_cqes = ocrdma_add_err_cqe(cq, cqes_to_poll, qp, wc);
+ cqes_to_poll -= err_cqes;
+ num_os_cqe += err_cqes;
+ wc = wc + err_cqes;
+ }
+ spin_unlock_irqrestore(&dev->flush_q_lock, flags);
+ }
+ return num_os_cqe;
+}
+
+int ocrdma_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags cq_flags)
+{
+ struct ocrdma_cq *cq = get_ocrdma_cq(ibcq);
+ struct ocrdma_dev *dev = get_ocrdma_dev(ibcq->device);
+ u16 cq_id;
+ unsigned long flags;
+ bool arm_needed = false, sol_needed = false;
+
+ cq_id = cq->id;
+
+ spin_lock_irqsave(&cq->cq_lock, flags);
+ if (cq_flags & IB_CQ_NEXT_COMP || cq_flags & IB_CQ_SOLICITED)
+ arm_needed = true;
+ if (cq_flags & IB_CQ_SOLICITED)
+ sol_needed = true;
+
+ if (cq->first_arm) {
+ ocrdma_ring_cq_db(dev, cq_id, arm_needed, sol_needed, 0);
+ cq->first_arm = false;
+ }
+
+ cq->deferred_arm = true;
+ cq->deferred_sol = sol_needed;
+ spin_unlock_irqrestore(&cq->cq_lock, flags);
+
+ return 0;
+}
+
+struct ib_mr *ocrdma_alloc_frmr(struct ib_pd *ibpd, int max_page_list_len)
+{
+ int status;
+ struct ocrdma_mr *mr;
+ struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
+ struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
+
+ if (max_page_list_len > dev->attr.max_pages_per_frmr)
+ return ERR_PTR(-EINVAL);
+
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(-ENOMEM);
+
+ status = ocrdma_get_pbl_info(dev, mr, max_page_list_len);
+ if (status)
+ goto pbl_err;
+ mr->hwmr.fr_mr = 1;
+ mr->hwmr.remote_rd = 0;
+ mr->hwmr.remote_wr = 0;
+ mr->hwmr.local_rd = 0;
+ mr->hwmr.local_wr = 0;
+ mr->hwmr.mw_bind = 0;
+ status = ocrdma_build_pbl_tbl(dev, &mr->hwmr);
+ if (status)
+ goto pbl_err;
+ status = ocrdma_reg_mr(dev, &mr->hwmr, pd->id, 0);
+ if (status)
+ goto mbx_err;
+ mr->ibmr.rkey = mr->hwmr.lkey;
+ mr->ibmr.lkey = mr->hwmr.lkey;
+ dev->stag_arr[(mr->hwmr.lkey >> 8) & (OCRDMA_MAX_STAG - 1)] =
+ (unsigned long) mr;
+ return &mr->ibmr;
+mbx_err:
+ ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr);
+pbl_err:
+ kfree(mr);
+ return ERR_PTR(-ENOMEM);
+}
+
+struct ib_fast_reg_page_list *ocrdma_alloc_frmr_page_list(struct ib_device
+ *ibdev,
+ int page_list_len)
+{
+ struct ib_fast_reg_page_list *frmr_list;
+ int size;
+
+ size = sizeof(*frmr_list) + (page_list_len * sizeof(u64));
+ frmr_list = kzalloc(size, GFP_KERNEL);
+ if (!frmr_list)
+ return ERR_PTR(-ENOMEM);
+ frmr_list->page_list = (u64 *)(frmr_list + 1);
+ return frmr_list;
+}
+
+void ocrdma_free_frmr_page_list(struct ib_fast_reg_page_list *page_list)
+{
+ kfree(page_list);
+}
+
+#define MAX_KERNEL_PBE_SIZE 65536
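+/* choose a common PBE size for the buffer list and return the number of PBEs
+ * needed to cover it; returns 0 if the buffers cannot be described this way.
+ */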
+static inline int count_kernel_pbes(struct ib_phys_buf *buf_list,
+ int buf_cnt, u32 *pbe_size)
+{
+ u64 total_size = 0;
+ u64 buf_size = 0;
+ int i;
+ *pbe_size = roundup(buf_list[0].size, PAGE_SIZE);
+ *pbe_size = roundup_pow_of_two(*pbe_size);
+
+ /* find the smallest PBE size that we can have */
+ for (i = 0; i < buf_cnt; i++) {
+		/* the first addr may not be page aligned, so skip checking it */
+ if ((i != 0) && ((buf_list[i].addr & ~PAGE_MASK) ||
+ (buf_list[i].size & ~PAGE_MASK))) {
+ return 0;
+ }
+
+		/* if the configured PBE size is greater than the chosen one,
+ * reduce the PBE size.
+ */
+ buf_size = roundup(buf_list[i].size, PAGE_SIZE);
+		/* pbe_size has to be a power-of-two multiple of 4K: 1, 2, 4, 8, ... */
+ buf_size = roundup_pow_of_two(buf_size);
+ if (*pbe_size > buf_size)
+ *pbe_size = buf_size;
+
+ total_size += buf_size;
+ }
+ *pbe_size = *pbe_size > MAX_KERNEL_PBE_SIZE ?
+ (MAX_KERNEL_PBE_SIZE) : (*pbe_size);
+
+ /* num_pbes = total_size / (*pbe_size); this is implemented below. */
+
+ return total_size >> ilog2(*pbe_size);
+}
+
+static void build_kernel_pbes(struct ib_phys_buf *buf_list, int ib_buf_cnt,
+ u32 pbe_size, struct ocrdma_pbl *pbl_tbl,
+ struct ocrdma_hw_mr *hwmr)
+{
+ int i;
+ int idx;
+ int pbes_per_buf = 0;
+ u64 buf_addr = 0;
+ int num_pbes;
+ struct ocrdma_pbe *pbe;
+ int total_num_pbes = 0;
+
+ if (!hwmr->num_pbes)
+ return;
+
+ pbe = (struct ocrdma_pbe *)pbl_tbl->va;
+ num_pbes = 0;
+
+ /* go through the OS phy regions & fill hw pbe entries into pbls. */
+ for (i = 0; i < ib_buf_cnt; i++) {
+ buf_addr = buf_list[i].addr;
+ pbes_per_buf =
+ roundup_pow_of_two(roundup(buf_list[i].size, PAGE_SIZE)) /
+ pbe_size;
+ hwmr->len += buf_list[i].size;
+		/* the number of pbes can be more than one per OS buf
+		 * when buffers are of different sizes;
+		 * split the ib_buf into one or more pbes.
+ */
+ for (idx = 0; idx < pbes_per_buf; idx++) {
+			/* we always program page aligned addresses;
+			 * the first unaligned address is taken care of by the fbo.
+ */
+ if (i == 0) {
+				/* for a non-zero fbo, assign the
+ * start of the page.
+ */
+ pbe->pa_lo =
+ cpu_to_le32((u32) (buf_addr & PAGE_MASK));
+ pbe->pa_hi =
+ cpu_to_le32((u32) upper_32_bits(buf_addr));
+ } else {
+ pbe->pa_lo =
+ cpu_to_le32((u32) (buf_addr & 0xffffffff));
+ pbe->pa_hi =
+ cpu_to_le32((u32) upper_32_bits(buf_addr));
+ }
+ buf_addr += pbe_size;
+ num_pbes += 1;
+ total_num_pbes += 1;
+ pbe++;
+
+ if (total_num_pbes == hwmr->num_pbes)
+ goto mr_tbl_done;
+ /* if the pbl is full storing the pbes,
+ * move to next pbl.
+ */
+ if (num_pbes == (hwmr->pbl_size/sizeof(u64))) {
+ pbl_tbl++;
+ pbe = (struct ocrdma_pbe *)pbl_tbl->va;
+ num_pbes = 0;
+ }
+ }
+ }
+mr_tbl_done:
+ return;
+}
+
+struct ib_mr *ocrdma_reg_kernel_mr(struct ib_pd *ibpd,
+ struct ib_phys_buf *buf_list,
+ int buf_cnt, int acc, u64 *iova_start)
+{
+ int status = -ENOMEM;
+ struct ocrdma_mr *mr;
+ struct ocrdma_pd *pd = get_ocrdma_pd(ibpd);
+ struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device);
+ u32 num_pbes;
+ u32 pbe_size = 0;
+
+ if ((acc & IB_ACCESS_REMOTE_WRITE) && !(acc & IB_ACCESS_LOCAL_WRITE))
+ return ERR_PTR(-EINVAL);
+
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr)
+ return ERR_PTR(status);
+
+ num_pbes = count_kernel_pbes(buf_list, buf_cnt, &pbe_size);
+ if (num_pbes == 0) {
+ status = -EINVAL;
+ goto pbl_err;
+ }
+ status = ocrdma_get_pbl_info(dev, mr, num_pbes);
+ if (status)
+ goto pbl_err;
+
+ mr->hwmr.pbe_size = pbe_size;
+ mr->hwmr.fbo = *iova_start - (buf_list[0].addr & PAGE_MASK);
+ mr->hwmr.va = *iova_start;
+ mr->hwmr.local_rd = 1;
+ mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0;
+ mr->hwmr.remote_rd = (acc & IB_ACCESS_REMOTE_READ) ? 1 : 0;
+ mr->hwmr.local_wr = (acc & IB_ACCESS_LOCAL_WRITE) ? 1 : 0;
+ mr->hwmr.remote_atomic = (acc & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0;
+ mr->hwmr.mw_bind = (acc & IB_ACCESS_MW_BIND) ? 1 : 0;
+
+ status = ocrdma_build_pbl_tbl(dev, &mr->hwmr);
+ if (status)
+ goto pbl_err;
+ build_kernel_pbes(buf_list, buf_cnt, pbe_size, mr->hwmr.pbl_table,
+ &mr->hwmr);
+ status = ocrdma_reg_mr(dev, &mr->hwmr, pd->id, acc);
+ if (status)
+ goto mbx_err;
+
+ mr->ibmr.lkey = mr->hwmr.lkey;
+ if (mr->hwmr.remote_wr || mr->hwmr.remote_rd)
+ mr->ibmr.rkey = mr->hwmr.lkey;
+ return &mr->ibmr;
+
+mbx_err:
+ ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr);
+pbl_err:
+ kfree(mr);
+ return ERR_PTR(status);
+}
diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
new file mode 100644
index 000000000..b8f7853fd
--- /dev/null
+++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h
@@ -0,0 +1,99 @@
+/*******************************************************************
+ * This file is part of the Emulex RoCE Device Driver for *
+ * RoCE (RDMA over Converged Ethernet) adapters. *
+ * Copyright (C) 2008-2012 Emulex. All rights reserved. *
+ * EMULEX and SLI are trademarks of Emulex. *
+ * www.emulex.com *
+ * *
+ * This program is free software; you can redistribute it and/or *
+ * modify it under the terms of version 2 of the GNU General *
+ * Public License as published by the Free Software Foundation. *
+ * This program is distributed in the hope that it will be useful. *
+ * ALL EXPRESS OR IMPLIED CONDITIONS, REPRESENTATIONS AND *
+ * WARRANTIES, INCLUDING ANY IMPLIED WARRANTY OF MERCHANTABILITY, *
+ * FITNESS FOR A PARTICULAR PURPOSE, OR NON-INFRINGEMENT, ARE *
+ * DISCLAIMED, EXCEPT TO THE EXTENT THAT SUCH DISCLAIMERS ARE HELD *
+ * TO BE LEGALLY INVALID. See the GNU General Public License for *
+ * more details, a copy of which can be found in the file COPYING *
+ * included with this package. *
+ *
+ * Contact Information:
+ * linux-drivers@emulex.com
+ *
+ * Emulex
+ * 3333 Susan Street
+ * Costa Mesa, CA 92626
+ *******************************************************************/
+
+#ifndef __OCRDMA_VERBS_H__
+#define __OCRDMA_VERBS_H__
+
+int ocrdma_post_send(struct ib_qp *, struct ib_send_wr *,
+ struct ib_send_wr **bad_wr);
+int ocrdma_post_recv(struct ib_qp *, struct ib_recv_wr *,
+ struct ib_recv_wr **bad_wr);
+
+int ocrdma_poll_cq(struct ib_cq *, int num_entries, struct ib_wc *wc);
+int ocrdma_arm_cq(struct ib_cq *, enum ib_cq_notify_flags flags);
+
+int ocrdma_query_device(struct ib_device *, struct ib_device_attr *props);
+int ocrdma_query_port(struct ib_device *, u8 port, struct ib_port_attr *props);
+int ocrdma_modify_port(struct ib_device *, u8 port, int mask,
+ struct ib_port_modify *props);
+
+void ocrdma_get_guid(struct ocrdma_dev *, u8 *guid);
+int ocrdma_query_gid(struct ib_device *, u8 port,
+ int index, union ib_gid *gid);
+int ocrdma_query_pkey(struct ib_device *, u8 port, u16 index, u16 *pkey);
+
+struct ib_ucontext *ocrdma_alloc_ucontext(struct ib_device *,
+ struct ib_udata *);
+int ocrdma_dealloc_ucontext(struct ib_ucontext *);
+
+int ocrdma_mmap(struct ib_ucontext *, struct vm_area_struct *vma);
+
+struct ib_pd *ocrdma_alloc_pd(struct ib_device *,
+ struct ib_ucontext *, struct ib_udata *);
+int ocrdma_dealloc_pd(struct ib_pd *pd);
+
+struct ib_cq *ocrdma_create_cq(struct ib_device *, int entries, int vector,
+ struct ib_ucontext *, struct ib_udata *);
+int ocrdma_resize_cq(struct ib_cq *, int cqe, struct ib_udata *);
+int ocrdma_destroy_cq(struct ib_cq *);
+
+struct ib_qp *ocrdma_create_qp(struct ib_pd *,
+ struct ib_qp_init_attr *attrs,
+ struct ib_udata *);
+int _ocrdma_modify_qp(struct ib_qp *, struct ib_qp_attr *attr,
+ int attr_mask);
+int ocrdma_modify_qp(struct ib_qp *, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata);
+int ocrdma_query_qp(struct ib_qp *,
+ struct ib_qp_attr *qp_attr,
+ int qp_attr_mask, struct ib_qp_init_attr *);
+int ocrdma_destroy_qp(struct ib_qp *);
+void ocrdma_del_flush_qp(struct ocrdma_qp *qp);
+
+struct ib_srq *ocrdma_create_srq(struct ib_pd *, struct ib_srq_init_attr *,
+ struct ib_udata *);
+int ocrdma_modify_srq(struct ib_srq *, struct ib_srq_attr *,
+ enum ib_srq_attr_mask, struct ib_udata *);
+int ocrdma_query_srq(struct ib_srq *, struct ib_srq_attr *);
+int ocrdma_destroy_srq(struct ib_srq *);
+int ocrdma_post_srq_recv(struct ib_srq *, struct ib_recv_wr *,
+ struct ib_recv_wr **bad_recv_wr);
+
+int ocrdma_dereg_mr(struct ib_mr *);
+struct ib_mr *ocrdma_get_dma_mr(struct ib_pd *, int acc);
+struct ib_mr *ocrdma_reg_kernel_mr(struct ib_pd *,
+ struct ib_phys_buf *buffer_list,
+ int num_phys_buf, int acc, u64 *iova_start);
+struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *, u64 start, u64 length,
+ u64 virt, int acc, struct ib_udata *);
+struct ib_mr *ocrdma_alloc_frmr(struct ib_pd *pd, int max_page_list_len);
+struct ib_fast_reg_page_list *ocrdma_alloc_frmr_page_list(struct ib_device
+ *ibdev,
+ int page_list_len);
+void ocrdma_free_frmr_page_list(struct ib_fast_reg_page_list *page_list);
+
+#endif /* __OCRDMA_VERBS_H__ */
diff --git a/drivers/infiniband/hw/qib/Kconfig b/drivers/infiniband/hw/qib/Kconfig
new file mode 100644
index 000000000..495be0978
--- /dev/null
+++ b/drivers/infiniband/hw/qib/Kconfig
@@ -0,0 +1,15 @@
+config INFINIBAND_QIB
+ tristate "Intel PCIe HCA support"
+ depends on 64BIT
+ ---help---
+ This is a low-level driver for Intel PCIe QLE InfiniBand host
+ channel adapters. This driver does not support the Intel
+ HyperTransport card (model QHT7140).
+
+config INFINIBAND_QIB_DCA
+ bool "QIB DCA support"
+ depends on INFINIBAND_QIB && DCA && SMP && !(INFINIBAND_QIB=y && DCA=m)
+ default y
+ ---help---
+ Setting this enables DCA support on some Intel chip sets
+ with the iba7322 HCA.
diff --git a/drivers/infiniband/hw/qib/Makefile b/drivers/infiniband/hw/qib/Makefile
new file mode 100644
index 000000000..57f8103e5
--- /dev/null
+++ b/drivers/infiniband/hw/qib/Makefile
@@ -0,0 +1,16 @@
+obj-$(CONFIG_INFINIBAND_QIB) += ib_qib.o
+
+ib_qib-y := qib_cq.o qib_diag.o qib_dma.o qib_driver.o qib_eeprom.o \
+ qib_file_ops.o qib_fs.o qib_init.o qib_intr.o qib_keys.o \
+ qib_mad.o qib_mmap.o qib_mr.o qib_pcie.o qib_pio_copy.o \
+ qib_qp.o qib_qsfp.o qib_rc.o qib_ruc.o qib_sdma.o qib_srq.o \
+ qib_sysfs.o qib_twsi.o qib_tx.o qib_uc.o qib_ud.o \
+ qib_user_pages.o qib_user_sdma.o qib_verbs_mcast.o qib_iba7220.o \
+ qib_sd7220.o qib_iba7322.o qib_verbs.o
+
+# 6120 has no fallback if no MSI interrupts, others can do INTx
+ib_qib-$(CONFIG_PCI_MSI) += qib_iba6120.o
+
+ib_qib-$(CONFIG_X86_64) += qib_wc_x86_64.o
+ib_qib-$(CONFIG_PPC64) += qib_wc_ppc64.o
+ib_qib-$(CONFIG_DEBUG_FS) += qib_debugfs.o
diff --git a/drivers/infiniband/hw/qib/qib.h b/drivers/infiniband/hw/qib/qib.h
new file mode 100644
index 000000000..7df16f74b
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib.h
@@ -0,0 +1,1543 @@
+#ifndef _QIB_KERNEL_H
+#define _QIB_KERNEL_H
+/*
+ * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This header file is the base header file for qlogic_ib kernel code
+ * qib_user.h serves a similar purpose for user code.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/dma-mapping.h>
+#include <linux/mutex.h>
+#include <linux/list.h>
+#include <linux/scatterlist.h>
+#include <linux/slab.h>
+#include <linux/io.h>
+#include <linux/fs.h>
+#include <linux/completion.h>
+#include <linux/kref.h>
+#include <linux/sched.h>
+#include <linux/kthread.h>
+
+#include "qib_common.h"
+#include "qib_verbs.h"
+
+/* only s/w major version of QLogic_IB we can handle */
+#define QIB_CHIP_VERS_MAJ 2U
+
+/* don't care about this except printing */
+#define QIB_CHIP_VERS_MIN 0U
+
+/* The Organization Unique Identifier (Mfg code), and its position in GUID */
+#define QIB_OUI 0x001175
+#define QIB_OUI_LSB 40
+
+/*
+ * per-driver stats, i.e. stats that are neither device- nor port-specific,
+ * or that are summed over all of the devices and ports.
+ * They are described by name via the ipathfs filesystem, so the layout
+ * and number of elements can change without breaking compatibility.
+ * If members are added or deleted, qib_statnames[] in qib_fs.c must
+ * be changed to match.
+ */
+struct qlogic_ib_stats {
+ __u64 sps_ints; /* number of interrupts handled */
+ __u64 sps_errints; /* number of error interrupts */
+ __u64 sps_txerrs; /* tx-related packet errors */
+ __u64 sps_rcverrs; /* non-crc rcv packet errors */
+ __u64 sps_hwerrs; /* hardware errors reported (parity, etc.) */
+ __u64 sps_nopiobufs; /* no pio bufs avail from kernel */
+ __u64 sps_ctxts; /* number of contexts currently open */
+ __u64 sps_lenerrs; /* number of kernel packets where RHF != LRH len */
+ __u64 sps_buffull;
+ __u64 sps_hdrfull;
+};
+
+extern struct qlogic_ib_stats qib_stats;
+extern const struct pci_error_handlers qib_pci_err_handler;
+
+#define QIB_CHIP_SWVERSION QIB_CHIP_VERS_MAJ
+/*
+ * First-cut criterion for "device is active" is
+ * two thousand dwords combined Tx, Rx traffic per
+ * 5-second interval. SMA packets are 64 dwords,
+ * and occur "a few per second", presumably each way.
+ */
+#define QIB_TRAFFIC_ACTIVE_THRESHOLD (2000)
+
+/*
+ * Struct used to indicate which errors are logged in each of the
+ * error-counters that are logged to EEPROM. A counter is incremented
+ * _once_ (saturating at 255) for each event with any bits set in
+ * the error or hwerror register masks below.
+ */
+#define QIB_EEP_LOG_CNT (4)
+struct qib_eep_log_mask {
+ u64 errs_to_log;
+ u64 hwerrs_to_log;
+};
+
+/*
+ * Below contains all data related to a single context (formerly called port).
+ */
+
+#ifdef CONFIG_DEBUG_FS
+struct qib_opcode_stats_perctx;
+#endif
+
+struct qib_ctxtdata {
+ void **rcvegrbuf;
+ dma_addr_t *rcvegrbuf_phys;
+ /* rcvhdrq base, needs mmap before useful */
+ void *rcvhdrq;
+ /* kernel virtual address where hdrqtail is updated */
+ void *rcvhdrtail_kvaddr;
+ /*
+ * temp buffer for expected send setup, allocated at open, instead
+ * of each setup call
+ */
+ void *tid_pg_list;
+ /*
+ * Shared page for kernel to signal user processes that send buffers
+ * need disarming. The process should call QIB_CMD_DISARM_BUFS
+ * or QIB_CMD_ACK_EVENT with IPATH_EVENT_DISARM_BUFS set.
+ */
+ unsigned long *user_event_mask;
+ /* when waiting for rcv or pioavail */
+ wait_queue_head_t wait;
+ /*
+	 * rcvegr bufs base, physical, must fit in 44 bits
+	 * (so that mmap64 of a 44 bit address works for 32 bit programs)
+ */
+ dma_addr_t rcvegr_phys;
+ /* mmap of hdrq, must fit in 44 bits */
+ dma_addr_t rcvhdrq_phys;
+ dma_addr_t rcvhdrqtailaddr_phys;
+
+ /*
+ * number of opens (including slave sub-contexts) on this instance
+ * (ignoring forks, dup, etc. for now)
+ */
+ int cnt;
+ /*
+ * how much space to leave at start of eager TID entries for
+ * protocol use, on each TID
+ */
+ /* instead of calculating it */
+ unsigned ctxt;
+ /* local node of context */
+ int node_id;
+ /* non-zero if ctxt is being shared. */
+ u16 subctxt_cnt;
+ /* non-zero if ctxt is being shared. */
+ u16 subctxt_id;
+ /* number of eager TID entries. */
+ u16 rcvegrcnt;
+ /* index of first eager TID entry. */
+ u16 rcvegr_tid_base;
+ /* number of pio bufs for this ctxt (all procs, if shared) */
+ u32 piocnt;
+ /* first pio buffer for this ctxt */
+ u32 pio_base;
+ /* chip offset of PIO buffers for this ctxt */
+ u32 piobufs;
+ /* how many alloc_pages() chunks in rcvegrbuf_pages */
+ u32 rcvegrbuf_chunks;
+ /* how many egrbufs per chunk */
+ u16 rcvegrbufs_perchunk;
+ /* ilog2 of above */
+ u16 rcvegrbufs_perchunk_shift;
+ /* order for rcvegrbuf_pages */
+ size_t rcvegrbuf_size;
+ /* rcvhdrq size (for freeing) */
+ size_t rcvhdrq_size;
+ /* per-context flags for fileops/intr communication */
+ unsigned long flag;
+ /* next expected TID to check when looking for free */
+ u32 tidcursor;
+ /* WAIT_RCV that timed out, no interrupt */
+ u32 rcvwait_to;
+ /* WAIT_PIO that timed out, no interrupt */
+ u32 piowait_to;
+ /* WAIT_RCV already happened, no wait */
+ u32 rcvnowait;
+ /* WAIT_PIO already happened, no wait */
+ u32 pionowait;
+ /* total number of polled urgent packets */
+ u32 urgent;
+ /* saved total number of polled urgent packets for poll edge trigger */
+ u32 urgent_poll;
+ /* pid of process using this ctxt */
+ pid_t pid;
+ pid_t subpid[QLOGIC_IB_MAX_SUBCTXT];
+ /* same size as task_struct .comm[], command that opened context */
+ char comm[16];
+ /* pkeys set by this use of this ctxt */
+ u16 pkeys[4];
+ /* so file ops can get at unit */
+ struct qib_devdata *dd;
+ /* so funcs that need physical port can get it easily */
+ struct qib_pportdata *ppd;
+ /* A page of memory for rcvhdrhead, rcvegrhead, rcvegrtail * N */
+ void *subctxt_uregbase;
+ /* An array of pages for the eager receive buffers * N */
+ void *subctxt_rcvegrbuf;
+ /* An array of pages for the eager header queue entries * N */
+ void *subctxt_rcvhdr_base;
+ /* The version of the library which opened this ctxt */
+ u32 userversion;
+ /* Bitmask of active slaves */
+ u32 active_slaves;
+ /* Type of packets or conditions we want to poll for */
+ u16 poll_type;
+ /* receive packet sequence counter */
+ u8 seq_cnt;
+ u8 redirect_seq_cnt;
+ /* ctxt rcvhdrq head offset */
+ u32 head;
+ /* lookaside fields */
+ struct qib_qp *lookaside_qp;
+ u32 lookaside_qpn;
+ /* QPs waiting for context processing */
+ struct list_head qp_wait_list;
+#ifdef CONFIG_DEBUG_FS
+ /* verbs stats per CTX */
+ struct qib_opcode_stats_perctx *opstats;
+#endif
+};
+
+struct qib_sge_state;
+
+struct qib_sdma_txreq {
+ int flags;
+ int sg_count;
+ dma_addr_t addr;
+ void (*callback)(struct qib_sdma_txreq *, int);
+ u16 start_idx; /* sdma private */
+ u16 next_descq_idx; /* sdma private */
+ struct list_head list; /* sdma private */
+};
+
+struct qib_sdma_desc {
+ __le64 qw[2];
+};
+
+struct qib_verbs_txreq {
+ struct qib_sdma_txreq txreq;
+ struct qib_qp *qp;
+ struct qib_swqe *wqe;
+ u32 dwords;
+ u16 hdr_dwords;
+ u16 hdr_inx;
+ struct qib_pio_header *align_buf;
+ struct qib_mregion *mr;
+ struct qib_sge_state *ss;
+};
+
+#define QIB_SDMA_TXREQ_F_USELARGEBUF 0x1
+#define QIB_SDMA_TXREQ_F_HEADTOHOST 0x2
+#define QIB_SDMA_TXREQ_F_INTREQ 0x4
+#define QIB_SDMA_TXREQ_F_FREEBUF 0x8
+#define QIB_SDMA_TXREQ_F_FREEDESC 0x10
+
+#define QIB_SDMA_TXREQ_S_OK 0
+#define QIB_SDMA_TXREQ_S_SENDERROR 1
+#define QIB_SDMA_TXREQ_S_ABORTED 2
+#define QIB_SDMA_TXREQ_S_SHUTDOWN 3
+
+/*
+ * Get/Set IB link-level config parameters for f_get/set_ib_cfg()
+ * Mostly for MADs that set or query link parameters, also ipath
+ * config interfaces
+ */
+#define QIB_IB_CFG_LIDLMC 0 /* LID (LS16b) and Mask (MS16b) */
+#define QIB_IB_CFG_LWID_ENB 2 /* allowed Link-width */
+#define QIB_IB_CFG_LWID 3 /* currently active Link-width */
+#define QIB_IB_CFG_SPD_ENB 4 /* allowed Link speeds */
+#define QIB_IB_CFG_SPD 5 /* current Link spd */
+#define QIB_IB_CFG_RXPOL_ENB 6 /* Auto-RX-polarity enable */
+#define QIB_IB_CFG_LREV_ENB 7 /* Auto-Lane-reversal enable */
+#define QIB_IB_CFG_LINKLATENCY 8 /* Link Latency (IB1.2 only) */
+#define QIB_IB_CFG_HRTBT 9 /* IB heartbeat off/enable/auto; DDR/QDR only */
+#define QIB_IB_CFG_OP_VLS 10 /* operational VLs */
+#define QIB_IB_CFG_VL_HIGH_CAP 11 /* num of VL high priority weights */
+#define QIB_IB_CFG_VL_LOW_CAP 12 /* num of VL low priority weights */
+#define QIB_IB_CFG_OVERRUN_THRESH 13 /* IB overrun threshold */
+#define QIB_IB_CFG_PHYERR_THRESH 14 /* IB PHY error threshold */
+#define QIB_IB_CFG_LINKDEFAULT 15 /* IB link default (sleep/poll) */
+#define QIB_IB_CFG_PKEYS 16 /* update partition keys */
+#define QIB_IB_CFG_MTU 17 /* update MTU in IBC */
+#define QIB_IB_CFG_LSTATE 18 /* update linkcmd and linkinitcmd in IBC */
+#define QIB_IB_CFG_VL_HIGH_LIMIT 19
+#define QIB_IB_CFG_PMA_TICKS 20 /* PMA sample tick resolution */
+#define QIB_IB_CFG_PORT 21 /* switch port we are connected to */
+
+/*
+ * for CFG_LSTATE: LINKCMD in upper 16 bits, LINKINITCMD in lower 16
+ * IB_LINKINITCMD_POLL and SLEEP are also used as set/get values for
+ * QIB_IB_CFG_LINKDEFAULT cmd
+ */
+#define IB_LINKCMD_DOWN (0 << 16)
+#define IB_LINKCMD_ARMED (1 << 16)
+#define IB_LINKCMD_ACTIVE (2 << 16)
+#define IB_LINKINITCMD_NOP 0
+#define IB_LINKINITCMD_POLL 1
+#define IB_LINKINITCMD_SLEEP 2
+#define IB_LINKINITCMD_DISABLE 3
+
+/*
+ * valid states passed to qib_set_linkstate() user call
+ */
+#define QIB_IB_LINKDOWN 0
+#define QIB_IB_LINKARM 1
+#define QIB_IB_LINKACTIVE 2
+#define QIB_IB_LINKDOWN_ONLY 3
+#define QIB_IB_LINKDOWN_SLEEP 4
+#define QIB_IB_LINKDOWN_DISABLE 5
+
+/*
+ * These 7 values (SDR, DDR, and QDR may be ORed for auto-speed
+ * negotiation) are used for the 3rd argument to path_f_set_ib_cfg
+ * with cmd QIB_IB_CFG_SPD_ENB, by direct calls or via sysfs. They
+ * are also the possible values for qib_link_speed_enabled and active.
+ * The values were chosen to match values used within the IB spec.
+ */
+#define QIB_IB_SDR 1
+#define QIB_IB_DDR 2
+#define QIB_IB_QDR 4
+
+#define QIB_DEFAULT_MTU 4096
+
+/* max number of IB ports supported per HCA */
+#define QIB_MAX_IB_PORTS 2
+
+/*
+ * Possible IB config parameters for f_get/set_ib_table()
+ */
+#define QIB_IB_TBL_VL_HIGH_ARB 1 /* Get/set VL high priority weights */
+#define QIB_IB_TBL_VL_LOW_ARB 2 /* Get/set VL low priority weights */
+
+/*
+ * Possible "operations" for f_rcvctrl(ppd, op, ctxt)
+ * these are bits so they can be combined, e.g.
+ * QIB_RCVCTRL_INTRAVAIL_ENB | QIB_RCVCTRL_CTXT_ENB
+ */
+#define QIB_RCVCTRL_TAILUPD_ENB 0x01
+#define QIB_RCVCTRL_TAILUPD_DIS 0x02
+#define QIB_RCVCTRL_CTXT_ENB 0x04
+#define QIB_RCVCTRL_CTXT_DIS 0x08
+#define QIB_RCVCTRL_INTRAVAIL_ENB 0x10
+#define QIB_RCVCTRL_INTRAVAIL_DIS 0x20
+#define QIB_RCVCTRL_PKEY_ENB 0x40 /* Note, default is enabled */
+#define QIB_RCVCTRL_PKEY_DIS 0x80
+#define QIB_RCVCTRL_BP_ENB 0x0100
+#define QIB_RCVCTRL_BP_DIS 0x0200
+#define QIB_RCVCTRL_TIDFLOW_ENB 0x0400
+#define QIB_RCVCTRL_TIDFLOW_DIS 0x0800
+
+/*
+ * Possible "operations" for f_sendctrl(ppd, op, var)
+ * these are bits so they can be combined, e.g.
+ * QIB_SENDCTRL_BUFAVAIL_ENB | QIB_SENDCTRL_ENB
+ * Some operations (e.g. DISARM, ABORT) are known to
+ * be "one-shot", so do not modify shadow.
+ */
+#define QIB_SENDCTRL_DISARM (0x1000)
+#define QIB_SENDCTRL_DISARM_BUF(bufn) ((bufn) | QIB_SENDCTRL_DISARM)
+ /* available (0x2000) */
+#define QIB_SENDCTRL_AVAIL_DIS (0x4000)
+#define QIB_SENDCTRL_AVAIL_ENB (0x8000)
+#define QIB_SENDCTRL_AVAIL_BLIP (0x10000)
+#define QIB_SENDCTRL_SEND_DIS (0x20000)
+#define QIB_SENDCTRL_SEND_ENB (0x40000)
+#define QIB_SENDCTRL_FLUSH (0x80000)
+#define QIB_SENDCTRL_CLEAR (0x100000)
+#define QIB_SENDCTRL_DISARM_ALL (0x200000)
+
+/*
+ * These are the generic indices for requesting per-port
+ * counter values via the f_portcntr function. They
+ * are always returned as 64 bit values, although most
+ * are 32 bit counters.
+ */
+/* send-related counters */
+#define QIBPORTCNTR_PKTSEND 0U
+#define QIBPORTCNTR_WORDSEND 1U
+#define QIBPORTCNTR_PSXMITDATA 2U
+#define QIBPORTCNTR_PSXMITPKTS 3U
+#define QIBPORTCNTR_PSXMITWAIT 4U
+#define QIBPORTCNTR_SENDSTALL 5U
+/* receive-related counters */
+#define QIBPORTCNTR_PKTRCV 6U
+#define QIBPORTCNTR_PSRCVDATA 7U
+#define QIBPORTCNTR_PSRCVPKTS 8U
+#define QIBPORTCNTR_RCVEBP 9U
+#define QIBPORTCNTR_RCVOVFL 10U
+#define QIBPORTCNTR_WORDRCV 11U
+/* IB link related error counters */
+#define QIBPORTCNTR_RXLOCALPHYERR 12U
+#define QIBPORTCNTR_RXVLERR 13U
+#define QIBPORTCNTR_ERRICRC 14U
+#define QIBPORTCNTR_ERRVCRC 15U
+#define QIBPORTCNTR_ERRLPCRC 16U
+#define QIBPORTCNTR_BADFORMAT 17U
+#define QIBPORTCNTR_ERR_RLEN 18U
+#define QIBPORTCNTR_IBSYMBOLERR 19U
+#define QIBPORTCNTR_INVALIDRLEN 20U
+#define QIBPORTCNTR_UNSUPVL 21U
+#define QIBPORTCNTR_EXCESSBUFOVFL 22U
+#define QIBPORTCNTR_ERRLINK 23U
+#define QIBPORTCNTR_IBLINKDOWN 24U
+#define QIBPORTCNTR_IBLINKERRRECOV 25U
+#define QIBPORTCNTR_LLI 26U
+/* other error counters */
+#define QIBPORTCNTR_RXDROPPKT 27U
+#define QIBPORTCNTR_VL15PKTDROP 28U
+#define QIBPORTCNTR_ERRPKEY 29U
+#define QIBPORTCNTR_KHDROVFL 30U
+/* sampling counters (these are actually control registers) */
+#define QIBPORTCNTR_PSINTERVAL 31U
+#define QIBPORTCNTR_PSSTART 32U
+#define QIBPORTCNTR_PSSTAT 33U
+
+/* how often we check for packet activity for "power on hours" (in seconds) */
+#define ACTIVITY_TIMER 5
+
+#define MAX_NAME_SIZE 64
+
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+struct qib_irq_notify;
+#endif
+
+struct qib_msix_entry {
+ struct msix_entry msix;
+ void *arg;
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+ int dca;
+ int rcv;
+ struct qib_irq_notify *notifier;
+#endif
+ char name[MAX_NAME_SIZE];
+ cpumask_var_t mask;
+};
+
+/* Below is an opaque struct. Each chip (device) can maintain
+ * private data needed for its operation, but not germane to the
+ * rest of the driver. For convenience, we define another that
+ * is chip-specific, per-port.
+ */
+struct qib_chip_specific;
+struct qib_chipport_specific;
+
+enum qib_sdma_states {
+ qib_sdma_state_s00_hw_down,
+ qib_sdma_state_s10_hw_start_up_wait,
+ qib_sdma_state_s20_idle,
+ qib_sdma_state_s30_sw_clean_up_wait,
+ qib_sdma_state_s40_hw_clean_up_wait,
+ qib_sdma_state_s50_hw_halt_wait,
+ qib_sdma_state_s99_running,
+};
+
+enum qib_sdma_events {
+ qib_sdma_event_e00_go_hw_down,
+ qib_sdma_event_e10_go_hw_start,
+ qib_sdma_event_e20_hw_started,
+ qib_sdma_event_e30_go_running,
+ qib_sdma_event_e40_sw_cleaned,
+ qib_sdma_event_e50_hw_cleaned,
+ qib_sdma_event_e60_hw_halted,
+ qib_sdma_event_e70_go_idle,
+ qib_sdma_event_e7220_err_halted,
+ qib_sdma_event_e7322_err_halted,
+ qib_sdma_event_e90_timer_tick,
+};
+
+extern char *qib_sdma_state_names[];
+extern char *qib_sdma_event_names[];
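Both name arrays are indexed by the enum values above, which makes debug output straightforward. A sketch, assuming the arrays are populated by the driver's SDMA code as the externs imply, and using the qib_pportdata definition that follows:

/* Illustrative only: report the current SDMA state by name. */
static inline void example_log_sdma_state(struct qib_pportdata *ppd)
{
	pr_info("SDMA state: %s\n",
		qib_sdma_state_names[ppd->sdma_state.current_state]);
}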
+
+struct sdma_set_state_action {
+ unsigned op_enable:1;
+ unsigned op_intenable:1;
+ unsigned op_halt:1;
+ unsigned op_drain:1;
+ unsigned go_s99_running_tofalse:1;
+ unsigned go_s99_running_totrue:1;
+};
+
+struct qib_sdma_state {
+ struct kref kref;
+ struct completion comp;
+ enum qib_sdma_states current_state;
+ struct sdma_set_state_action *set_state_action;
+ unsigned current_op;
+ unsigned go_s99_running;
+ unsigned first_sendbuf;
+ unsigned last_sendbuf; /* really last +1 */
+ /* debugging/devel */
+ enum qib_sdma_states previous_state;
+ unsigned previous_op;
+ enum qib_sdma_events last_event;
+};
+
+struct xmit_wait {
+ struct timer_list timer;
+ u64 counter;
+ u8 flags;
+ struct cache {
+ u64 psxmitdata;
+ u64 psrcvdata;
+ u64 psxmitpkts;
+ u64 psrcvpkts;
+ u64 psxmitwait;
+ } counter_cache;
+};
+
+/*
+ * The structure below encapsulates data relevant to a physical IB Port.
+ * Current chips support only one such port, but the separation
+ * clarifies things a bit. Note that to conform to IB conventions,
+ * port-numbers are one-based. The first or only port is port1.
+ */
+struct qib_pportdata {
+ struct qib_ibport ibport_data;
+
+ struct qib_devdata *dd;
+ struct qib_chippport_specific *cpspec; /* chip-specific per-port */
+ struct kobject pport_kobj;
+ struct kobject pport_cc_kobj;
+ struct kobject sl2vl_kobj;
+ struct kobject diagc_kobj;
+
+ /* GUID for this interface, in network order */
+ __be64 guid;
+
+ /* QIB_POLL, etc. link-state specific flags, per port */
+ u32 lflags;
+ /* qib_lflags driver is waiting for */
+ u32 state_wanted;
+ spinlock_t lflags_lock;
+
+ /* ref count for each pkey */
+ atomic_t pkeyrefs[4];
+
+ /*
+ * this address is mapped readonly into user processes so they can
+ * get status cheaply, whenever they want. One qword of status per port
+ */
+ u64 *statusp;
+
+ /* SendDMA related entries */
+
+ /* read mostly */
+ struct qib_sdma_desc *sdma_descq;
+ struct workqueue_struct *qib_wq;
+ struct qib_sdma_state sdma_state;
+ dma_addr_t sdma_descq_phys;
+ volatile __le64 *sdma_head_dma; /* DMA'ed by chip */
+ dma_addr_t sdma_head_phys;
+ u16 sdma_descq_cnt;
+
+ /* read/write using lock */
+ spinlock_t sdma_lock ____cacheline_aligned_in_smp;
+ struct list_head sdma_activelist;
+ struct list_head sdma_userpending;
+ u64 sdma_descq_added;
+ u64 sdma_descq_removed;
+ u16 sdma_descq_tail;
+ u16 sdma_descq_head;
+ u8 sdma_generation;
+ u8 sdma_intrequest;
+
+ struct tasklet_struct sdma_sw_clean_up_task
+ ____cacheline_aligned_in_smp;
+
+ wait_queue_head_t state_wait; /* for state_wanted */
+
+ /* HoL blocking for SMP replies */
+ unsigned hol_state;
+ struct timer_list hol_timer;
+
+ /*
+ * Shadow copies of registers; size indicates read access size.
+ * Most of them are readonly, but some are write-only registers,
+ * where we manipulate the bits in the shadow copy, and then write
+ * the shadow copy to qlogic_ib.
+ *
+ * We deliberately make most of these 32 bits, since they have
+ * restricted range. For any that we read, we want to generate 32
+ * bit accesses, since Opteron will generate 2 separate 32 bit HT
+ * transactions for a 64 bit read, and we want to avoid unnecessary
+ * bus transactions.
+ */
+
+ /* This is the 64 bit group */
+ /* last ibcstatus. opaque outside chip-specific code */
+ u64 lastibcstat;
+
+ /* these are the "32 bit" regs */
+
+ /*
+ * the following two are 32-bit bitmasks, but {test,clear,set}_bit
+ * all expect bit fields to be "unsigned long"
+ */
+ unsigned long p_rcvctrl; /* shadow per-port rcvctrl */
+ unsigned long p_sendctrl; /* shadow per-port sendctrl */
+
+ u32 ibmtu; /* The MTU programmed for this unit */
+ /*
+ * Current max size IB packet (in bytes) including IB headers, that
+ * we can send. Changes when ibmtu changes.
+ */
+ u32 ibmaxlen;
+ /*
+ * ibmaxlen at init time, limited by chip and by receive buffer
+ * size. Not changed after init.
+ */
+ u32 init_ibmaxlen;
+ /* LID programmed for this instance */
+ u16 lid;
+ /* list of pkeys programmed; 0 if not set */
+ u16 pkeys[4];
+ /* LID mask control */
+ u8 lmc;
+ u8 link_width_supported;
+ u8 link_speed_supported;
+ u8 link_width_enabled;
+ u8 link_speed_enabled;
+ u8 link_width_active;
+ u8 link_speed_active;
+ u8 vls_supported;
+ u8 vls_operational;
+ /* Rx Polarity inversion (compensate for ~tx on partner) */
+ u8 rx_pol_inv;
+
+ u8 hw_pidx; /* physical port index */
+ u8 port; /* IB port number and index into dd->pports - 1 */
+
+ u8 delay_mult;
+
+ /* used to override LED behavior */
+ u8 led_override; /* Substituted for normal value, if non-zero */
+ u16 led_override_timeoff; /* delta to next timer event */
+ u8 led_override_vals[2]; /* Alternates per blink-frame */
+ u8 led_override_phase; /* Just counts, LSB picks from vals[] */
+ atomic_t led_override_timer_active;
+ /* Used to flash LEDs in override mode */
+ struct timer_list led_override_timer;
+ struct xmit_wait cong_stats;
+ struct timer_list symerr_clear_timer;
+
+ /* Synchronize access between driver writes and sysfs reads */
+ spinlock_t cc_shadow_lock
+ ____cacheline_aligned_in_smp;
+
+ /* Shadow copy of the congestion control table */
+ struct cc_table_shadow *ccti_entries_shadow;
+
+ /* Shadow copy of the congestion control entries */
+ struct ib_cc_congestion_setting_attr_shadow *congestion_entries_shadow;
+
+ /* List of congestion control table entries */
+ struct ib_cc_table_entry_shadow *ccti_entries;
+
+ /* 16 congestion entries with each entry corresponding to a SL */
+ struct ib_cc_congestion_entry_shadow *congestion_entries;
+
+ /* Maximum number of congestion control entries that the agent expects
+ * the manager to send.
+ */
+ u16 cc_supported_table_entries;
+
+ /* Total number of congestion control table entries */
+ u16 total_cct_entry;
+
+ /* Bit map identifying service level */
+ u16 cc_sl_control_map;
+
+ /* maximum congestion control table index */
+ u16 ccti_limit;
+
+ /* CA's max number of 64 entry units in the congestion control table */
+ u8 cc_max_table_entries;
+};
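Since IB port numbers are one-based while the pport array (in qib_devdata, below) is zero-based, code that walks ports typically loops pidx over 0..num_pports-1 and reports ppd->port (which equals pidx + 1) as the IB port number. A minimal sketch (helper name invented):

/* Illustrative only: visit every physical IB port of a device. */
static inline void example_for_each_port(struct qib_devdata *dd)
{
	u8 pidx;

	for (pidx = 0; pidx < dd->num_pports; ++pidx) {
		struct qib_pportdata *ppd = dd->pport + pidx;

		pr_info("IB port %u: lid 0x%x\n", ppd->port, ppd->lid);
	}
}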
+
+/* Observers. Not to be taken lightly, possibly not to ship. */
+/*
+ * If a diag read or write is to (bottom <= offset <= top),
+ * the "hoook" is called, allowing, e.g. shadows to be
+ * updated in sync with the driver. struct diag_observer
+ * is the "visible" part.
+ */
+struct diag_observer;
+
+typedef int (*diag_hook) (struct qib_devdata *dd,
+ const struct diag_observer *op,
+ u32 offs, u64 *data, u64 mask, int only_32);
+
+struct diag_observer {
+ diag_hook hook;
+ u32 bottom;
+ u32 top;
+};
+
+extern int qib_register_observer(struct qib_devdata *dd,
+ const struct diag_observer *op);
+
+/* Only declared here, not defined. Private to diags */
+struct diag_observer_list_elt;
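A hedged sketch of how an observer could be hooked up using only the typedef and prototype above; the register window, hook body, and return value are placeholders, since the return semantics are internal to the diag code:

/* Illustrative only: watch diag accesses to a hypothetical 8-byte window. */
static int example_diag_hook(struct qib_devdata *dd,
			     const struct diag_observer *op,
			     u32 offs, u64 *data, u64 mask, int only_32)
{
	pr_info("diag access at offset 0x%x, mask 0x%llx, %s-bit\n",
		offs, (unsigned long long)mask, only_32 ? "32" : "64");
	return 0;	/* placeholder; actual semantics are diag-internal */
}

static const struct diag_observer example_observer = {
	.hook	= example_diag_hook,
	.bottom	= 0x100,	/* hypothetical register range */
	.top	= 0x107,
};

/* at init time, something like: qib_register_observer(dd, &example_observer); */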
+
+/* device data struct now contains only "general per-device" info.
+ * fields related to a physical IB port are in a qib_pportdata struct
+ * (described above), while fields only used by a particular chip-type are in
+ * a qib_chipdata struct, whose contents are opaque to this file.
+ */
+struct qib_devdata {
+ struct qib_ibdev verbs_dev; /* must be first */
+ struct list_head list;
+ /* pointers to related structs for this device */
+ /* pci access data structure */
+ struct pci_dev *pcidev;
+ struct cdev *user_cdev;
+ struct cdev *diag_cdev;
+ struct device *user_device;
+ struct device *diag_device;
+
+ /* mem-mapped pointer to base of chip regs */
+ u64 __iomem *kregbase;
+ /* end of mem-mapped chip space excluding sendbuf and user regs */
+ u64 __iomem *kregend;
+ /* physical address of chip for io_remap, etc. */
+ resource_size_t physaddr;
+ /* qib_cfgctxts pointers */
+ struct qib_ctxtdata **rcd; /* Receive Context Data */
+
+ /* qib_pportdata, points to array of (physical) port-specific
+ * data structs, indexed by pidx (0..n-1)
+ */
+ struct qib_pportdata *pport;
+ struct qib_chip_specific *cspec; /* chip-specific */
+
+ /* kvirt address of 1st 2k pio buffer */
+ void __iomem *pio2kbase;
+ /* kvirt address of 1st 4k pio buffer */
+ void __iomem *pio4kbase;
+ /* mem-mapped pointer to base of PIO buffers (if using WC PAT) */
+ void __iomem *piobase;
+ /* mem-mapped pointer to base of user chip regs (if using WC PAT) */
+ u64 __iomem *userbase;
+ void __iomem *piovl15base; /* base of VL15 buffers, if not WC */
+ /*
+ * points to area where PIOavail registers will be DMA'ed.
+ * Has to be on a page of its own, because the page will be
+ * mapped into user program space. This copy is *ONLY* ever
+ * written by DMA, not by the driver! Need a copy per device
+ * when we get to multiple devices
+ */
+ volatile __le64 *pioavailregs_dma; /* DMA'ed by chip */
+ /* physical address where updates occur */
+ dma_addr_t pioavailregs_phys;
+
+ /* device-specific implementations of functions needed by
+ * common code. Contrary to previous consensus, we can't
+ * really just point to a device-specific table, because we
+ * may need to "bend", e.g. *_f_put_tid
+ */
+ /* fallback to alternate interrupt type if possible */
+ int (*f_intr_fallback)(struct qib_devdata *);
+ /* hard reset chip */
+ int (*f_reset)(struct qib_devdata *);
+ void (*f_quiet_serdes)(struct qib_pportdata *);
+ int (*f_bringup_serdes)(struct qib_pportdata *);
+ int (*f_early_init)(struct qib_devdata *);
+ void (*f_clear_tids)(struct qib_devdata *, struct qib_ctxtdata *);
+ void (*f_put_tid)(struct qib_devdata *, u64 __iomem*,
+ u32, unsigned long);
+ void (*f_cleanup)(struct qib_devdata *);
+ void (*f_setextled)(struct qib_pportdata *, u32);
+ /* fill out chip-specific fields */
+ int (*f_get_base_info)(struct qib_ctxtdata *, struct qib_base_info *);
+ /* free irq */
+ void (*f_free_irq)(struct qib_devdata *);
+ struct qib_message_header *(*f_get_msgheader)
+ (struct qib_devdata *, __le32 *);
+ void (*f_config_ctxts)(struct qib_devdata *);
+ int (*f_get_ib_cfg)(struct qib_pportdata *, int);
+ int (*f_set_ib_cfg)(struct qib_pportdata *, int, u32);
+ int (*f_set_ib_loopback)(struct qib_pportdata *, const char *);
+ int (*f_get_ib_table)(struct qib_pportdata *, int, void *);
+ int (*f_set_ib_table)(struct qib_pportdata *, int, void *);
+ u32 (*f_iblink_state)(u64);
+ u8 (*f_ibphys_portstate)(u64);
+ void (*f_xgxs_reset)(struct qib_pportdata *);
+ /* per chip actions needed for IB Link up/down changes */
+ int (*f_ib_updown)(struct qib_pportdata *, int, u64);
+ u32 __iomem *(*f_getsendbuf)(struct qib_pportdata *, u64, u32 *);
+ /* Read/modify/write of GPIO pins (potentially chip-specific) */
+ int (*f_gpio_mod)(struct qib_devdata *dd, u32 out, u32 dir,
+ u32 mask);
+ /* Enable writes to config EEPROM (if supported) */
+ int (*f_eeprom_wen)(struct qib_devdata *dd, int wen);
+ /*
+ * modify rcvctrl shadow[s] and write to appropriate chip-regs.
+ * see above QIB_RCVCTRL_xxx_ENB/DIS for operations.
+ * (ctxt == -1) means "all contexts", only meaningful for
+ * clearing. Could remove if chip_spec shutdown properly done.
+ */
+ void (*f_rcvctrl)(struct qib_pportdata *, unsigned int op,
+ int ctxt);
+ /* Read/modify/write sendctrl appropriately for op and port. */
+ void (*f_sendctrl)(struct qib_pportdata *, u32 op);
+ void (*f_set_intr_state)(struct qib_devdata *, u32);
+ void (*f_set_armlaunch)(struct qib_devdata *, u32);
+ void (*f_wantpiobuf_intr)(struct qib_devdata *, u32);
+ int (*f_late_initreg)(struct qib_devdata *);
+ int (*f_init_sdma_regs)(struct qib_pportdata *);
+ u16 (*f_sdma_gethead)(struct qib_pportdata *);
+ int (*f_sdma_busy)(struct qib_pportdata *);
+ void (*f_sdma_update_tail)(struct qib_pportdata *, u16);
+ void (*f_sdma_set_desc_cnt)(struct qib_pportdata *, unsigned);
+ void (*f_sdma_sendctrl)(struct qib_pportdata *, unsigned);
+ void (*f_sdma_hw_clean_up)(struct qib_pportdata *);
+ void (*f_sdma_hw_start_up)(struct qib_pportdata *);
+ void (*f_sdma_init_early)(struct qib_pportdata *);
+ void (*f_set_cntr_sample)(struct qib_pportdata *, u32, u32);
+ void (*f_update_usrhead)(struct qib_ctxtdata *, u64, u32, u32, u32);
+ u32 (*f_hdrqempty)(struct qib_ctxtdata *);
+ u64 (*f_portcntr)(struct qib_pportdata *, u32);
+ u32 (*f_read_cntrs)(struct qib_devdata *, loff_t, char **,
+ u64 **);
+ u32 (*f_read_portcntrs)(struct qib_devdata *, loff_t, u32,
+ char **, u64 **);
+ u32 (*f_setpbc_control)(struct qib_pportdata *, u32, u8, u8);
+ void (*f_initvl15_bufs)(struct qib_devdata *);
+ void (*f_init_ctxt)(struct qib_ctxtdata *);
+ void (*f_txchk_change)(struct qib_devdata *, u32, u32, u32,
+ struct qib_ctxtdata *);
+ void (*f_writescratch)(struct qib_devdata *, u32);
+ int (*f_tempsense_rd)(struct qib_devdata *, int regnum);
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+ int (*f_notify_dca)(struct qib_devdata *, unsigned long event);
+#endif
+
+ char *boardname; /* human readable board info */
+
+ /* template for writing TIDs */
+ u64 tidtemplate;
+ /* value to write to free TIDs */
+ u64 tidinvalid;
+
+ /* number of registers used for pioavail */
+ u32 pioavregs;
+ /* device (not port) flags, basically device capabilities */
+ u32 flags;
+ /* last buffer for user use */
+ u32 lastctxt_piobuf;
+
+ /* reset value */
+ u64 z_int_counter;
+ /* percpu intcounter */
+ u64 __percpu *int_counter;
+
+ /* pio bufs allocated per ctxt */
+ u32 pbufsctxt;
+ /* if remainder on bufs/ctxt, ctxts < extrabuf get 1 extra */
+ u32 ctxts_extrabuf;
+ /*
+ * number of ctxts configured as max; zero means use the number the
+ * chip supports; configuring fewer gives more pio bufs/ctxt, etc.
+ */
+ u32 cfgctxts;
+ /*
+ * number of ctxts available for PSM open
+ */
+ u32 freectxts;
+
+ /*
+ * hint that we should update pioavailshadow before
+ * looking for a PIO buffer
+ */
+ u32 upd_pio_shadow;
+
+ /* internal debugging stats */
+ u32 maxpkts_call;
+ u32 avgpkts_call;
+ u64 nopiobufs;
+
+ /* PCI Vendor ID (here for NodeInfo) */
+ u16 vendorid;
+ /* PCI Device ID (here for NodeInfo) */
+ u16 deviceid;
+ /* for write combining settings */
+ int wc_cookie;
+ unsigned long wc_base;
+ unsigned long wc_len;
+
+ /* shadow copy of struct page *'s for exp tid pages */
+ struct page **pageshadow;
+ /* shadow copy of dma handles for exp tid pages */
+ dma_addr_t *physshadow;
+ u64 __iomem *egrtidbase;
+ spinlock_t sendctrl_lock; /* protect changes to sendctrl shadow */
+ /* around rcd and (user ctxts) ctxt_cnt use (intr vs free) */
+ spinlock_t uctxt_lock; /* rcd and user context changes */
+ /*
+ * per unit status, see also portdata statusp
+ * mapped readonly into user processes so they can get unit and
+ * IB link status cheaply
+ */
+ u64 *devstatusp;
+ char *freezemsg; /* freeze msg if hw error put chip in freeze */
+ u32 freezelen; /* max length of freezemsg */
+ /* timer used to prevent stats overflow, error throttling, etc. */
+ struct timer_list stats_timer;
+
+ /* timer to verify interrupts work, and fallback if possible */
+ struct timer_list intrchk_timer;
+ unsigned long ureg_align; /* user register alignment */
+
+ /*
+ * Protects pioavailshadow, pioavailkernel, pio_need_disarm, and
+ * pio_writing.
+ */
+ spinlock_t pioavail_lock;
+ /*
+ * index of last buffer to optimize search for next
+ */
+ u32 last_pio;
+ /*
+ * min kernel pio buffer to optimize search
+ */
+ u32 min_kernel_pio;
+ /*
+ * Shadow copies of registers; size indicates read access size.
+ * Most of them are readonly, but some are write-only registers,
+ * where we manipulate the bits in the shadow copy, and then write
+ * the shadow copy to qlogic_ib.
+ *
+ * We deliberately make most of these 32 bits, since they have
+ * restricted range. For any that we read, we want to generate 32
+ * bit accesses, since Opteron will generate 2 separate 32 bit HT
+ * transactions for a 64 bit read, and we want to avoid unnecessary
+ * bus transactions.
+ */
+
+ /* This is the 64 bit group */
+
+ unsigned long pioavailshadow[6];
+ /* bitmap of send buffers available for the kernel to use with PIO. */
+ unsigned long pioavailkernel[6];
+ /* bitmap of send buffers which need to be disarmed. */
+ unsigned long pio_need_disarm[3];
+ /* bitmap of send buffers which are being written to. */
+ unsigned long pio_writing[3];
+ /* kr_revision shadow */
+ u64 revision;
+ /* Base GUID for device (from eeprom, network order) */
+ __be64 base_guid;
+
+ /*
+ * kr_sendpiobufbase value (chip offset of pio buffers), and the
+ * base of the 2KB buffers (user processes only use 2K)
+ */
+ u64 piobufbase;
+ u32 pio2k_bufbase;
+
+ /* these are the "32 bit" regs */
+
+ /* number of GUIDs in the flash for this interface */
+ u32 nguid;
+ /*
+ * the following two are 32-bit bitmasks, but {test,clear,set}_bit
+ * all expect bit fields to be "unsigned long"
+ */
+ unsigned long rcvctrl; /* shadow per device rcvctrl */
+ unsigned long sendctrl; /* shadow per device sendctrl */
+
+ /* value we put in kr_rcvhdrcnt */
+ u32 rcvhdrcnt;
+ /* value we put in kr_rcvhdrsize */
+ u32 rcvhdrsize;
+ /* value we put in kr_rcvhdrentsize */
+ u32 rcvhdrentsize;
+ /* kr_ctxtcnt value */
+ u32 ctxtcnt;
+ /* kr_pagealign value */
+ u32 palign;
+ /* number of "2KB" PIO buffers */
+ u32 piobcnt2k;
+ /* size in bytes of "2KB" PIO buffers */
+ u32 piosize2k;
+ /* max usable size in dwords of a "2KB" PIO buffer before going "4KB" */
+ u32 piosize2kmax_dwords;
+ /* number of "4KB" PIO buffers */
+ u32 piobcnt4k;
+ /* size in bytes of "4KB" PIO buffers */
+ u32 piosize4k;
+ /* kr_rcvegrbase value */
+ u32 rcvegrbase;
+ /* kr_rcvtidbase value */
+ u32 rcvtidbase;
+ /* kr_rcvtidcnt value */
+ u32 rcvtidcnt;
+ /* kr_userregbase */
+ u32 uregbase;
+ /* shadow the control register contents */
+ u32 control;
+
+ /* chip address space used by 4k pio buffers */
+ u32 align4k;
+ /* size of each rcvegrbuffer */
+ u16 rcvegrbufsize;
+ /* log2 of above */
+ u16 rcvegrbufsize_shift;
+ /* localbus width (1, 2, 4, 8, 16, 32) from config space */
+ u32 lbus_width;
+ /* localbus speed in MHz */
+ u32 lbus_speed;
+ int unit; /* unit # of this chip */
+
+ /* start of CHIP_SPEC move to chipspec, but need code changes */
+ /* low and high portions of MSI capability/vector */
+ u32 msi_lo;
+ /* saved after PCIe init for restore after reset */
+ u32 msi_hi;
+ /* MSI data (vector) saved for restore */
+ u16 msi_data;
+ /* so we can rewrite it after a chip reset */
+ u32 pcibar0;
+ /* so we can rewrite it after a chip reset */
+ u32 pcibar1;
+ u64 rhdrhead_intr_off;
+
+ /*
+ * ASCII serial number, from flash, large enough for original
+ * all digit strings, and longer QLogic serial number format
+ */
+ u8 serial[16];
+ /* human readable board version */
+ u8 boardversion[96];
+ u8 lbus_info[32]; /* human readable localbus info */
+ /* chip major rev, from qib_revision */
+ u8 majrev;
+ /* chip minor rev, from qib_revision */
+ u8 minrev;
+
+ /* Misc small ints */
+ /* Number of physical ports available */
+ u8 num_pports;
+ /* Lowest context number which can be used by user processes */
+ u8 first_user_ctxt;
+ u8 n_krcv_queues;
+ u8 qpn_mask;
+ u8 skip_kctxt_mask;
+
+ u16 rhf_offset; /* offset of RHF within receive header entry */
+
+ /*
+ * GPIO pins for twsi-connected devices, and device code for eeprom
+ */
+ u8 gpio_sda_num;
+ u8 gpio_scl_num;
+ u8 twsi_eeprom_dev;
+ u8 board_atten;
+
+ /* Support (including locks) for EEPROM logging of errors and time */
+ /* control access to actual counters, timer */
+ spinlock_t eep_st_lock;
+ /* control high-level access to EEPROM */
+ struct mutex eep_lock;
+ uint64_t traffic_wds;
+ /*
+ * masks for which bits of errs, hwerrs that cause
+ * each of the counters to increment.
+ */
+ struct qib_eep_log_mask eep_st_masks[QIB_EEP_LOG_CNT];
+ struct qib_diag_client *diag_client;
+ spinlock_t qib_diag_trans_lock; /* protect diag observer ops */
+ struct diag_observer_list_elt *diag_observer_list;
+
+ u8 psxmitwait_supported;
+ /* cycle length of PS* counters in HW (in picoseconds) */
+ u16 psxmitwait_check_rate;
+ /* high volume overflow errors deferred to tasklet */
+ struct tasklet_struct error_tasklet;
+ /* per device cq worker */
+ struct kthread_worker *worker;
+
+ int assigned_node_id; /* NUMA node closest to HCA */
+};
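Per the f_rcvctrl comment above, passing ctxt == -1 applies the operation to all contexts. A hedged sketch of driving the indirection (the choice of op bit is arbitrary here):

/* Illustrative only: disable TID flow on every context of a port. */
static inline void example_rcvctrl_all_ctxts(struct qib_pportdata *ppd)
{
	ppd->dd->f_rcvctrl(ppd, QIB_RCVCTRL_TIDFLOW_DIS, -1);
}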
+
+/* hol_state values */
+#define QIB_HOL_UP 0
+#define QIB_HOL_INIT 1
+
+#define QIB_SDMA_SENDCTRL_OP_ENABLE (1U << 0)
+#define QIB_SDMA_SENDCTRL_OP_INTENABLE (1U << 1)
+#define QIB_SDMA_SENDCTRL_OP_HALT (1U << 2)
+#define QIB_SDMA_SENDCTRL_OP_CLEANUP (1U << 3)
+#define QIB_SDMA_SENDCTRL_OP_DRAIN (1U << 4)
+
+/* operation types for f_txchk_change() */
+#define TXCHK_CHG_TYPE_DIS1 3
+#define TXCHK_CHG_TYPE_ENAB1 2
+#define TXCHK_CHG_TYPE_KERN 1
+#define TXCHK_CHG_TYPE_USER 0
+
+#define QIB_CHASE_TIME msecs_to_jiffies(145)
+#define QIB_CHASE_DIS_TIME msecs_to_jiffies(160)
+
+/* Private data for file operations */
+struct qib_filedata {
+ struct qib_ctxtdata *rcd;
+ unsigned subctxt;
+ unsigned tidcursor;
+ struct qib_user_sdma_queue *pq;
+ int rec_cpu_num; /* for cpu affinity; -1 if none */
+};
+
+extern struct list_head qib_dev_list;
+extern spinlock_t qib_devs_lock;
+extern struct qib_devdata *qib_lookup(int unit);
+extern u32 qib_cpulist_count;
+extern unsigned long *qib_cpulist;
+
+extern unsigned qib_cc_table_size;
+int qib_init(struct qib_devdata *, int);
+int init_chip_wc_pat(struct qib_devdata *dd, u32);
+int qib_enable_wc(struct qib_devdata *dd);
+void qib_disable_wc(struct qib_devdata *dd);
+int qib_count_units(int *npresentp, int *nupp);
+int qib_count_active_units(void);
+
+int qib_cdev_init(int minor, const char *name,
+ const struct file_operations *fops,
+ struct cdev **cdevp, struct device **devp);
+void qib_cdev_cleanup(struct cdev **cdevp, struct device **devp);
+int qib_dev_init(void);
+void qib_dev_cleanup(void);
+
+int qib_diag_add(struct qib_devdata *);
+void qib_diag_remove(struct qib_devdata *);
+void qib_handle_e_ibstatuschanged(struct qib_pportdata *, u64);
+void qib_sdma_update_tail(struct qib_pportdata *, u16); /* hold sdma_lock */
+
+int qib_decode_err(struct qib_devdata *dd, char *buf, size_t blen, u64 err);
+void qib_bad_intrstatus(struct qib_devdata *);
+void qib_handle_urcv(struct qib_devdata *, u64);
+
+/* clean up any per-chip chip-specific stuff */
+void qib_chip_cleanup(struct qib_devdata *);
+/* clean up any chip type-specific stuff */
+void qib_chip_done(void);
+
+/* check to see if we have to force ordering for write combining */
+int qib_unordered_wc(void);
+void qib_pio_copy(void __iomem *to, const void *from, size_t count);
+
+void qib_disarm_piobufs(struct qib_devdata *, unsigned, unsigned);
+int qib_disarm_piobufs_ifneeded(struct qib_ctxtdata *);
+void qib_disarm_piobufs_set(struct qib_devdata *, unsigned long *, unsigned);
+void qib_cancel_sends(struct qib_pportdata *);
+
+int qib_create_rcvhdrq(struct qib_devdata *, struct qib_ctxtdata *);
+int qib_setup_eagerbufs(struct qib_ctxtdata *);
+void qib_set_ctxtcnt(struct qib_devdata *);
+int qib_create_ctxts(struct qib_devdata *dd);
+struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *, u32, int);
+int qib_init_pportdata(struct qib_pportdata *, struct qib_devdata *, u8, u8);
+void qib_free_ctxtdata(struct qib_devdata *, struct qib_ctxtdata *);
+
+u32 qib_kreceive(struct qib_ctxtdata *, u32 *, u32 *);
+int qib_reset_device(int);
+int qib_wait_linkstate(struct qib_pportdata *, u32, int);
+int qib_set_linkstate(struct qib_pportdata *, u8);
+int qib_set_mtu(struct qib_pportdata *, u16);
+int qib_set_lid(struct qib_pportdata *, u32, u8);
+void qib_hol_down(struct qib_pportdata *);
+void qib_hol_init(struct qib_pportdata *);
+void qib_hol_up(struct qib_pportdata *);
+void qib_hol_event(unsigned long);
+void qib_disable_after_error(struct qib_devdata *);
+int qib_set_uevent_bits(struct qib_pportdata *, const int);
+
+/* for use in system calls, where we want to know device type, etc. */
+#define ctxt_fp(fp) \
+ (((struct qib_filedata *)(fp)->private_data)->rcd)
+#define subctxt_fp(fp) \
+ (((struct qib_filedata *)(fp)->private_data)->subctxt)
+#define tidcursor_fp(fp) \
+ (((struct qib_filedata *)(fp)->private_data)->tidcursor)
+#define user_sdma_queue_fp(fp) \
+ (((struct qib_filedata *)(fp)->private_data)->pq)
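These accessors just dereference the qib_filedata hung off file->private_data. A minimal sketch of their intended use inside a device file operation (assuming the usual kernel fs headers; the helper name is invented):

/* Illustrative only: typical use from within a file operation. */
static inline int example_fp_use(struct file *fp)
{
	struct qib_ctxtdata *rcd = ctxt_fp(fp);
	unsigned int subctxt = subctxt_fp(fp);

	return rcd ? (int)subctxt : -EINVAL;
}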
+
+static inline struct qib_devdata *dd_from_ppd(struct qib_pportdata *ppd)
+{
+ return ppd->dd;
+}
+
+static inline struct qib_devdata *dd_from_dev(struct qib_ibdev *dev)
+{
+ return container_of(dev, struct qib_devdata, verbs_dev);
+}
+
+static inline struct qib_devdata *dd_from_ibdev(struct ib_device *ibdev)
+{
+ return dd_from_dev(to_idev(ibdev));
+}
+
+static inline struct qib_pportdata *ppd_from_ibp(struct qib_ibport *ibp)
+{
+ return container_of(ibp, struct qib_pportdata, ibport_data);
+}
+
+static inline struct qib_ibport *to_iport(struct ib_device *ibdev, u8 port)
+{
+ struct qib_devdata *dd = dd_from_ibdev(ibdev);
+ unsigned pidx = port - 1; /* IB number port from 1, hdw from 0 */
+
+ WARN_ON(pidx >= dd->num_pports);
+ return &dd->pport[pidx].ibport_data;
+}
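These helpers chain together, so verbs code that starts from an ib_device and a port number can recover the qib per-port structure. A sketch (helper name invented):

/* Illustrative only: from (ibdev, port) down to the per-port LID. */
static inline u16 example_port_lid(struct ib_device *ibdev, u8 port)
{
	struct qib_ibport *ibp = to_iport(ibdev, port);
	struct qib_pportdata *ppd = ppd_from_ibp(ibp);

	return ppd->lid;
}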
+
+/*
+ * values for dd->flags (_device_ related flags)
+ */
+#define QIB_HAS_LINK_LATENCY 0x1 /* supports link latency (IB 1.2) */
+#define QIB_INITTED 0x2 /* chip and driver up and initted */
+#define QIB_DOING_RESET 0x4 /* in the middle of doing chip reset */
+#define QIB_PRESENT 0x8 /* chip accesses can be done */
+#define QIB_PIO_FLUSH_WC 0x10 /* Needs Write combining flush for PIO */
+#define QIB_HAS_THRESH_UPDATE 0x40
+#define QIB_HAS_SDMA_TIMEOUT 0x80
+#define QIB_USE_SPCL_TRIG 0x100 /* SpecialTrigger launch enabled */
+#define QIB_NODMA_RTAIL 0x200 /* rcvhdrtail register is not DMA'ed */
+#define QIB_HAS_INTX 0x800 /* Supports INTx interrupts */
+#define QIB_HAS_SEND_DMA 0x1000 /* Supports Send DMA */
+#define QIB_HAS_VLSUPP 0x2000 /* Supports multiple VLs; PBC different */
+#define QIB_HAS_HDRSUPP 0x4000 /* Supports header suppression */
+#define QIB_BADINTR 0x8000 /* severe interrupt problems */
+#define QIB_DCA_ENABLED 0x10000 /* Direct Cache Access enabled */
+#define QIB_HAS_QSFP 0x20000 /* device (card instance) has QSFP */
+
+/*
+ * values for ppd->lflags (_ib_port_ related flags)
+ */
+#define QIBL_LINKV 0x1 /* IB link state valid */
+#define QIBL_LINKDOWN 0x8 /* IB link is down */
+#define QIBL_LINKINIT 0x10 /* IB link level is up */
+#define QIBL_LINKARMED 0x20 /* IB link is ARMED */
+#define QIBL_LINKACTIVE 0x40 /* IB link is ACTIVE */
+/* leave a gap for more IB-link state */
+#define QIBL_IB_AUTONEG_INPROG 0x1000 /* non-IBTA DDR/QDR neg active */
+#define QIBL_IB_AUTONEG_FAILED 0x2000 /* non-IBTA DDR/QDR neg failed */
+#define QIBL_IB_LINK_DISABLED 0x4000 /* Linkdown-disable forced,
+ * Do not try to bring up */
+#define QIBL_IB_FORCE_NOTIFY 0x8000 /* force notify on next ib change */
+
+/* IB dword length mask in PBC (lower 11 bits); same for all chips */
+#define QIB_PBC_LENGTH_MASK ((1 << 11) - 1)
+
+
+/* ctxt_flag bit offsets */
+ /* waiting for a packet to arrive */
+#define QIB_CTXT_WAITING_RCV 2
+ /* master has not finished initializing */
+#define QIB_CTXT_MASTER_UNINIT 4
+ /* waiting for an urgent packet to arrive */
+#define QIB_CTXT_WAITING_URG 5
+
+/* free up any allocated data at close */
+void qib_free_data(struct qib_ctxtdata *dd);
+void qib_chg_pioavailkernel(struct qib_devdata *, unsigned, unsigned,
+ u32, struct qib_ctxtdata *);
+struct qib_devdata *qib_init_iba7322_funcs(struct pci_dev *,
+ const struct pci_device_id *);
+struct qib_devdata *qib_init_iba7220_funcs(struct pci_dev *,
+ const struct pci_device_id *);
+struct qib_devdata *qib_init_iba6120_funcs(struct pci_dev *,
+ const struct pci_device_id *);
+void qib_free_devdata(struct qib_devdata *);
+struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra);
+
+#define QIB_TWSI_NO_DEV 0xFF
+/* Below qib_twsi_ functions must be called with eep_lock held */
+int qib_twsi_reset(struct qib_devdata *dd);
+int qib_twsi_blk_rd(struct qib_devdata *dd, int dev, int addr, void *buffer,
+ int len);
+int qib_twsi_blk_wr(struct qib_devdata *dd, int dev, int addr,
+ const void *buffer, int len);
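As the comment above requires, the TWSI helpers run under eep_lock (the struct mutex in qib_devdata). A hedged sketch of a serialized block read; the flash offset of 0 is purely illustrative:

/* Illustrative only: serialize a TWSI block read with eep_lock. */
static inline int example_twsi_read(struct qib_devdata *dd, void *buf, int len)
{
	int ret;

	mutex_lock(&dd->eep_lock);
	ret = qib_twsi_blk_rd(dd, dd->twsi_eeprom_dev, 0, buf, len);
	mutex_unlock(&dd->eep_lock);
	return ret;
}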
+void qib_get_eeprom_info(struct qib_devdata *);
+#define qib_inc_eeprom_err(dd, eidx, incr)
+void qib_dump_lookup_output_queue(struct qib_devdata *);
+void qib_force_pio_avail_update(struct qib_devdata *);
+void qib_clear_symerror_on_linkup(unsigned long opaque);
+
+/*
+ * Set LED override, only the two LSBs have "public" meaning, but
+ * any non-zero value substitutes them for the Link and LinkTrain
+ * LED states.
+ */
+#define QIB_LED_PHYS 1 /* Physical (linktraining) GREEN LED */
+#define QIB_LED_LOG 2 /* Logical (link) YELLOW LED */
+void qib_set_led_override(struct qib_pportdata *ppd, unsigned int val);
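Per the comment, only the two LSBs have defined meaning and any non-zero value replaces the normal LED behavior (zero presumably restores it). A sketch:

/* Illustrative only: force both LEDs on via the override mechanism. */
static inline void example_force_leds_on(struct qib_pportdata *ppd)
{
	qib_set_led_override(ppd, QIB_LED_PHYS | QIB_LED_LOG);
}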
+
+/* send dma routines */
+int qib_setup_sdma(struct qib_pportdata *);
+void qib_teardown_sdma(struct qib_pportdata *);
+void __qib_sdma_intr(struct qib_pportdata *);
+void qib_sdma_intr(struct qib_pportdata *);
+void qib_user_sdma_send_desc(struct qib_pportdata *dd,
+ struct list_head *pktlist);
+int qib_sdma_verbs_send(struct qib_pportdata *, struct qib_sge_state *,
+ u32, struct qib_verbs_txreq *);
+/* ppd->sdma_lock should be locked before calling this. */
+int qib_sdma_make_progress(struct qib_pportdata *dd);
+
+static inline int qib_sdma_empty(const struct qib_pportdata *ppd)
+{
+ return ppd->sdma_descq_added == ppd->sdma_descq_removed;
+}
+
+/* must be called under qib_sdma_lock */
+static inline u16 qib_sdma_descq_freecnt(const struct qib_pportdata *ppd)
+{
+ return ppd->sdma_descq_cnt -
+ (ppd->sdma_descq_added - ppd->sdma_descq_removed) - 1;
+}
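Both qib_sdma_descq_freecnt() and qib_sdma_make_progress() expect ppd->sdma_lock to be held, so callers wrap them in the usual irqsave pattern. A hedged sketch:

/* Illustrative only: check descriptor space and nudge the SDMA engine. */
static inline u16 example_sdma_poke(struct qib_pportdata *ppd)
{
	unsigned long flags;
	u16 free;

	spin_lock_irqsave(&ppd->sdma_lock, flags);
	free = qib_sdma_descq_freecnt(ppd);
	if (!free)
		qib_sdma_make_progress(ppd);
	spin_unlock_irqrestore(&ppd->sdma_lock, flags);

	return free;
}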
+
+static inline int __qib_sdma_running(struct qib_pportdata *ppd)
+{
+ return ppd->sdma_state.current_state == qib_sdma_state_s99_running;
+}
+int qib_sdma_running(struct qib_pportdata *);
+void dump_sdma_state(struct qib_pportdata *ppd);
+void __qib_sdma_process_event(struct qib_pportdata *, enum qib_sdma_events);
+void qib_sdma_process_event(struct qib_pportdata *, enum qib_sdma_events);
+
+/*
+ * number of words used for protocol header if not set by qib_userinit()
+ */
+#define QIB_DFLT_RCVHDRSIZE 9
+
+/*
+ * We need to be able to handle an IB header of at least 24 dwords.
+ * We need the rcvhdrq large enough to handle largest IB header, but
+ * still have room for a 2KB MTU standard IB packet.
+ * Additionally, some processor/memory controller combinations
+ * benefit quite strongly from having the DMA'ed data be cacheline
+ * aligned and a cacheline multiple, so we set the size to 32 dwords
+ * (2 64-byte primary cachelines for pretty much all processors of
+ * interest). The alignment hurts nothing, other than using somewhat
+ * more memory.
+ */
+#define QIB_RCVHDR_ENTSIZE 32
+
+int qib_get_user_pages(unsigned long, size_t, struct page **);
+void qib_release_user_pages(struct page **, size_t);
+int qib_eeprom_read(struct qib_devdata *, u8, void *, int);
+int qib_eeprom_write(struct qib_devdata *, u8, const void *, int);
+u32 __iomem *qib_getsendbuf_range(struct qib_devdata *, u32 *, u32, u32);
+void qib_sendbuf_done(struct qib_devdata *, unsigned);
+
+static inline void qib_clear_rcvhdrtail(const struct qib_ctxtdata *rcd)
+{
+ *((u64 *) rcd->rcvhdrtail_kvaddr) = 0ULL;
+}
+
+static inline u32 qib_get_rcvhdrtail(const struct qib_ctxtdata *rcd)
+{
+ /*
+ * volatile because it's a DMA target from the chip, the routine is
+ * inlined, and we don't want register caching or reordering.
+ */
+ return (u32) le64_to_cpu(
+ *((volatile __le64 *)rcd->rcvhdrtail_kvaddr)); /* DMA'ed */
+}
+
+static inline u32 qib_get_hdrqtail(const struct qib_ctxtdata *rcd)
+{
+ const struct qib_devdata *dd = rcd->dd;
+ u32 hdrqtail;
+
+ if (dd->flags & QIB_NODMA_RTAIL) {
+ __le32 *rhf_addr;
+ u32 seq;
+
+ rhf_addr = (__le32 *) rcd->rcvhdrq +
+ rcd->head + dd->rhf_offset;
+ seq = qib_hdrget_seq(rhf_addr);
+ hdrqtail = rcd->head;
+ if (seq == rcd->seq_cnt)
+ hdrqtail++;
+ } else
+ hdrqtail = qib_get_rcvhdrtail(rcd);
+
+ return hdrqtail;
+}
+
+/*
+ * sysfs interface.
+ */
+
+extern const char ib_qib_version[];
+
+int qib_device_create(struct qib_devdata *);
+void qib_device_remove(struct qib_devdata *);
+
+int qib_create_port_files(struct ib_device *ibdev, u8 port_num,
+ struct kobject *kobj);
+int qib_verbs_register_sysfs(struct qib_devdata *);
+void qib_verbs_unregister_sysfs(struct qib_devdata *);
+/* Hook for sysfs read of QSFP */
+extern int qib_qsfp_dump(struct qib_pportdata *ppd, char *buf, int len);
+
+int __init qib_init_qibfs(void);
+int __exit qib_exit_qibfs(void);
+
+int qibfs_add(struct qib_devdata *);
+int qibfs_remove(struct qib_devdata *);
+
+int qib_pcie_init(struct pci_dev *, const struct pci_device_id *);
+int qib_pcie_ddinit(struct qib_devdata *, struct pci_dev *,
+ const struct pci_device_id *);
+void qib_pcie_ddcleanup(struct qib_devdata *);
+int qib_pcie_params(struct qib_devdata *, u32, u32 *, struct qib_msix_entry *);
+int qib_reinit_intr(struct qib_devdata *);
+void qib_enable_intx(struct pci_dev *);
+void qib_nomsi(struct qib_devdata *);
+void qib_nomsix(struct qib_devdata *);
+void qib_pcie_getcmd(struct qib_devdata *, u16 *, u8 *, u8 *);
+void qib_pcie_reenable(struct qib_devdata *, u16, u8, u8);
+/* interrupts for device */
+u64 qib_int_counter(struct qib_devdata *);
+/* interrupt for all devices */
+u64 qib_sps_ints(void);
+
+/*
+ * dma_addr wrappers - all 0's invalid for hw
+ */
+dma_addr_t qib_map_page(struct pci_dev *, struct page *, unsigned long,
+ size_t, int);
+const char *qib_get_unit_name(int unit);
+
+/*
+ * Flush write combining store buffers (if present) and perform a write
+ * barrier.
+ */
+static inline void qib_flush_wc(void)
+{
+#if defined(CONFIG_X86_64)
+ asm volatile("sfence" : : : "memory");
+#else
+ wmb(); /* no reorder around wc flush */
+#endif
+}
+
+/* global module parameter variables */
+extern unsigned qib_ibmtu;
+extern ushort qib_cfgctxts;
+extern ushort qib_num_cfg_vls;
+extern ushort qib_mini_init; /* If set, do few (ideally 0) writes to chip */
+extern unsigned qib_n_krcv_queues;
+extern unsigned qib_sdma_fetch_arb;
+extern unsigned qib_compat_ddr_negotiate;
+extern int qib_special_trigger;
+extern unsigned qib_numa_aware;
+
+extern struct mutex qib_mutex;
+
+/* Number of seconds before our card status check... */
+#define STATUS_TIMEOUT 60
+
+#define QIB_DRV_NAME "ib_qib"
+#define QIB_USER_MINOR_BASE 0
+#define QIB_TRACE_MINOR 127
+#define QIB_DIAGPKT_MINOR 128
+#define QIB_DIAG_MINOR_BASE 129
+#define QIB_NMINORS 255
+
+#define PCI_VENDOR_ID_PATHSCALE 0x1fc1
+#define PCI_VENDOR_ID_QLOGIC 0x1077
+#define PCI_DEVICE_ID_QLOGIC_IB_6120 0x10
+#define PCI_DEVICE_ID_QLOGIC_IB_7220 0x7220
+#define PCI_DEVICE_ID_QLOGIC_IB_7322 0x7322
+
+/*
+ * qib_early_err is used (only!) to print early errors before devdata is
+ * allocated, or when dd->pcidev may not be valid, and at the tail end of
+ * cleanup when devdata may have been freed, etc. qib_dev_porterr is
+ * the same as qib_dev_err, but is used when the message really needs
+ * the IB port# to be definitive as to what's happening.
+ * All of these go to the trace log, and the trace log entry is done
+ * first to avoid possible serial port delays from printk.
+ */
+#define qib_early_err(dev, fmt, ...) \
+ dev_err(dev, fmt, ##__VA_ARGS__)
+
+#define qib_dev_err(dd, fmt, ...) \
+ dev_err(&(dd)->pcidev->dev, "%s: " fmt, \
+ qib_get_unit_name((dd)->unit), ##__VA_ARGS__)
+
+#define qib_dev_warn(dd, fmt, ...) \
+ dev_warn(&(dd)->pcidev->dev, "%s: " fmt, \
+ qib_get_unit_name((dd)->unit), ##__VA_ARGS__)
+
+#define qib_dev_porterr(dd, port, fmt, ...) \
+ dev_err(&(dd)->pcidev->dev, "%s: IB%u:%u " fmt, \
+ qib_get_unit_name((dd)->unit), (dd)->unit, (port), \
+ ##__VA_ARGS__)
+
+#define qib_devinfo(pcidev, fmt, ...) \
+ dev_info(&(pcidev)->dev, fmt, ##__VA_ARGS__)
+
+/*
+ * this is used for formatting hw error messages...
+ */
+struct qib_hwerror_msgs {
+ u64 mask;
+ const char *msg;
+ size_t sz;
+};
+
+#define QLOGIC_IB_HWE_MSG(a, b) { .mask = a, .msg = b }
+
+/* in qib_intr.c... */
+void qib_format_hwerrors(u64 hwerrs,
+ const struct qib_hwerror_msgs *hwerrmsgs,
+ size_t nhwerrmsgs, char *msg, size_t lmsg);
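A hedged sketch of how a table of these entries might be built with QLOGIC_IB_HWE_MSG() and handed to qib_format_hwerrors(); the mask and message are invented for illustration:

/* Illustrative only: decode a hardware-error bitmask into a message buffer. */
static const struct qib_hwerror_msgs example_hwe_msgs[] = {
	QLOGIC_IB_HWE_MSG(0x1ULL, "Example hardware error, bit 0"),
};

static inline void example_decode_hwerrs(u64 hwerrs, char *msg, size_t lmsg)
{
	qib_format_hwerrors(hwerrs, example_hwe_msgs,
			    ARRAY_SIZE(example_hwe_msgs), msg, lmsg);
}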
+#endif /* _QIB_KERNEL_H */
diff --git a/drivers/infiniband/hw/qib/qib_6120_regs.h b/drivers/infiniband/hw/qib/qib_6120_regs.h
new file mode 100644
index 000000000..e16cb6f7d
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_6120_regs.h
@@ -0,0 +1,977 @@
+/*
+ * Copyright (c) 2008, 2009, 2010 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* This file is mechanically generated from RTL. Any hand-edits will be lost! */
+
+#define QIB_6120_Revision_OFFS 0x0
+#define QIB_6120_Revision_R_Simulator_LSB 0x3F
+#define QIB_6120_Revision_R_Simulator_RMASK 0x1
+#define QIB_6120_Revision_Reserved_LSB 0x28
+#define QIB_6120_Revision_Reserved_RMASK 0x7FFFFF
+#define QIB_6120_Revision_BoardID_LSB 0x20
+#define QIB_6120_Revision_BoardID_RMASK 0xFF
+#define QIB_6120_Revision_R_SW_LSB 0x18
+#define QIB_6120_Revision_R_SW_RMASK 0xFF
+#define QIB_6120_Revision_R_Arch_LSB 0x10
+#define QIB_6120_Revision_R_Arch_RMASK 0xFF
+#define QIB_6120_Revision_R_ChipRevMajor_LSB 0x8
+#define QIB_6120_Revision_R_ChipRevMajor_RMASK 0xFF
+#define QIB_6120_Revision_R_ChipRevMinor_LSB 0x0
+#define QIB_6120_Revision_R_ChipRevMinor_RMASK 0xFF
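These generated names follow a fixed pattern: _LSB is the bit position of a field within the 64-bit register and _RMASK is the mask applied after shifting right by that amount. A hedged sketch of extracting one field from a raw Revision readout:

/* Illustrative only: pull the major chip revision out of a Revision value. */
static inline unsigned int example_chiprev_major(u64 revision)
{
	return (unsigned int)((revision >> QIB_6120_Revision_R_ChipRevMajor_LSB) &
			      QIB_6120_Revision_R_ChipRevMajor_RMASK);
}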
+
+#define QIB_6120_Control_OFFS 0x8
+#define QIB_6120_Control_TxLatency_LSB 0x4
+#define QIB_6120_Control_TxLatency_RMASK 0x1
+#define QIB_6120_Control_PCIERetryBufDiagEn_LSB 0x3
+#define QIB_6120_Control_PCIERetryBufDiagEn_RMASK 0x1
+#define QIB_6120_Control_LinkEn_LSB 0x2
+#define QIB_6120_Control_LinkEn_RMASK 0x1
+#define QIB_6120_Control_FreezeMode_LSB 0x1
+#define QIB_6120_Control_FreezeMode_RMASK 0x1
+#define QIB_6120_Control_SyncReset_LSB 0x0
+#define QIB_6120_Control_SyncReset_RMASK 0x1
+
+#define QIB_6120_PageAlign_OFFS 0x10
+
+#define QIB_6120_PortCnt_OFFS 0x18
+
+#define QIB_6120_SendRegBase_OFFS 0x30
+
+#define QIB_6120_UserRegBase_OFFS 0x38
+
+#define QIB_6120_CntrRegBase_OFFS 0x40
+
+#define QIB_6120_Scratch_OFFS 0x48
+#define QIB_6120_Scratch_TopHalf_LSB 0x20
+#define QIB_6120_Scratch_TopHalf_RMASK 0xFFFFFFFF
+#define QIB_6120_Scratch_BottomHalf_LSB 0x0
+#define QIB_6120_Scratch_BottomHalf_RMASK 0xFFFFFFFF
+
+#define QIB_6120_IntBlocked_OFFS 0x60
+#define QIB_6120_IntBlocked_ErrorIntBlocked_LSB 0x1F
+#define QIB_6120_IntBlocked_ErrorIntBlocked_RMASK 0x1
+#define QIB_6120_IntBlocked_PioSetIntBlocked_LSB 0x1E
+#define QIB_6120_IntBlocked_PioSetIntBlocked_RMASK 0x1
+#define QIB_6120_IntBlocked_PioBufAvailIntBlocked_LSB 0x1D
+#define QIB_6120_IntBlocked_PioBufAvailIntBlocked_RMASK 0x1
+#define QIB_6120_IntBlocked_assertGPIOIntBlocked_LSB 0x1C
+#define QIB_6120_IntBlocked_assertGPIOIntBlocked_RMASK 0x1
+#define QIB_6120_IntBlocked_Reserved_LSB 0xF
+#define QIB_6120_IntBlocked_Reserved_RMASK 0x1FFF
+#define QIB_6120_IntBlocked_RcvAvail4IntBlocked_LSB 0x10
+#define QIB_6120_IntBlocked_RcvAvail4IntBlocked_RMASK 0x1
+#define QIB_6120_IntBlocked_RcvAvail3IntBlocked_LSB 0xF
+#define QIB_6120_IntBlocked_RcvAvail3IntBlocked_RMASK 0x1
+#define QIB_6120_IntBlocked_RcvAvail2IntBlocked_LSB 0xE
+#define QIB_6120_IntBlocked_RcvAvail2IntBlocked_RMASK 0x1
+#define QIB_6120_IntBlocked_RcvAvail1IntBlocked_LSB 0xD
+#define QIB_6120_IntBlocked_RcvAvail1IntBlocked_RMASK 0x1
+#define QIB_6120_IntBlocked_RcvAvail0IntBlocked_LSB 0xC
+#define QIB_6120_IntBlocked_RcvAvail0IntBlocked_RMASK 0x1
+#define QIB_6120_IntBlocked_Reserved1_LSB 0x5
+#define QIB_6120_IntBlocked_Reserved1_RMASK 0x7F
+#define QIB_6120_IntBlocked_RcvUrg4IntBlocked_LSB 0x4
+#define QIB_6120_IntBlocked_RcvUrg4IntBlocked_RMASK 0x1
+#define QIB_6120_IntBlocked_RcvUrg3IntBlocked_LSB 0x3
+#define QIB_6120_IntBlocked_RcvUrg3IntBlocked_RMASK 0x1
+#define QIB_6120_IntBlocked_RcvUrg2IntBlocked_LSB 0x2
+#define QIB_6120_IntBlocked_RcvUrg2IntBlocked_RMASK 0x1
+#define QIB_6120_IntBlocked_RcvUrg1IntBlocked_LSB 0x1
+#define QIB_6120_IntBlocked_RcvUrg1IntBlocked_RMASK 0x1
+#define QIB_6120_IntBlocked_RcvUrg0IntBlocked_LSB 0x0
+#define QIB_6120_IntBlocked_RcvUrg0IntBlocked_RMASK 0x1
+
+#define QIB_6120_IntMask_OFFS 0x68
+#define QIB_6120_IntMask_ErrorIntMask_LSB 0x1F
+#define QIB_6120_IntMask_ErrorIntMask_RMASK 0x1
+#define QIB_6120_IntMask_PioSetIntMask_LSB 0x1E
+#define QIB_6120_IntMask_PioSetIntMask_RMASK 0x1
+#define QIB_6120_IntMask_PioBufAvailIntMask_LSB 0x1D
+#define QIB_6120_IntMask_PioBufAvailIntMask_RMASK 0x1
+#define QIB_6120_IntMask_assertGPIOIntMask_LSB 0x1C
+#define QIB_6120_IntMask_assertGPIOIntMask_RMASK 0x1
+#define QIB_6120_IntMask_Reserved_LSB 0x11
+#define QIB_6120_IntMask_Reserved_RMASK 0x7FF
+#define QIB_6120_IntMask_RcvAvail4IntMask_LSB 0x10
+#define QIB_6120_IntMask_RcvAvail4IntMask_RMASK 0x1
+#define QIB_6120_IntMask_RcvAvail3IntMask_LSB 0xF
+#define QIB_6120_IntMask_RcvAvail3IntMask_RMASK 0x1
+#define QIB_6120_IntMask_RcvAvail2IntMask_LSB 0xE
+#define QIB_6120_IntMask_RcvAvail2IntMask_RMASK 0x1
+#define QIB_6120_IntMask_RcvAvail1IntMask_LSB 0xD
+#define QIB_6120_IntMask_RcvAvail1IntMask_RMASK 0x1
+#define QIB_6120_IntMask_RcvAvail0IntMask_LSB 0xC
+#define QIB_6120_IntMask_RcvAvail0IntMask_RMASK 0x1
+#define QIB_6120_IntMask_Reserved1_LSB 0x5
+#define QIB_6120_IntMask_Reserved1_RMASK 0x7F
+#define QIB_6120_IntMask_RcvUrg4IntMask_LSB 0x4
+#define QIB_6120_IntMask_RcvUrg4IntMask_RMASK 0x1
+#define QIB_6120_IntMask_RcvUrg3IntMask_LSB 0x3
+#define QIB_6120_IntMask_RcvUrg3IntMask_RMASK 0x1
+#define QIB_6120_IntMask_RcvUrg2IntMask_LSB 0x2
+#define QIB_6120_IntMask_RcvUrg2IntMask_RMASK 0x1
+#define QIB_6120_IntMask_RcvUrg1IntMask_LSB 0x1
+#define QIB_6120_IntMask_RcvUrg1IntMask_RMASK 0x1
+#define QIB_6120_IntMask_RcvUrg0IntMask_LSB 0x0
+#define QIB_6120_IntMask_RcvUrg0IntMask_RMASK 0x1
+
+#define QIB_6120_IntStatus_OFFS 0x70
+#define QIB_6120_IntStatus_Error_LSB 0x1F
+#define QIB_6120_IntStatus_Error_RMASK 0x1
+#define QIB_6120_IntStatus_PioSent_LSB 0x1E
+#define QIB_6120_IntStatus_PioSent_RMASK 0x1
+#define QIB_6120_IntStatus_PioBufAvail_LSB 0x1D
+#define QIB_6120_IntStatus_PioBufAvail_RMASK 0x1
+#define QIB_6120_IntStatus_assertGPIO_LSB 0x1C
+#define QIB_6120_IntStatus_assertGPIO_RMASK 0x1
+#define QIB_6120_IntStatus_Reserved_LSB 0xF
+#define QIB_6120_IntStatus_Reserved_RMASK 0x1FFF
+#define QIB_6120_IntStatus_RcvAvail4_LSB 0x10
+#define QIB_6120_IntStatus_RcvAvail4_RMASK 0x1
+#define QIB_6120_IntStatus_RcvAvail3_LSB 0xF
+#define QIB_6120_IntStatus_RcvAvail3_RMASK 0x1
+#define QIB_6120_IntStatus_RcvAvail2_LSB 0xE
+#define QIB_6120_IntStatus_RcvAvail2_RMASK 0x1
+#define QIB_6120_IntStatus_RcvAvail1_LSB 0xD
+#define QIB_6120_IntStatus_RcvAvail1_RMASK 0x1
+#define QIB_6120_IntStatus_RcvAvail0_LSB 0xC
+#define QIB_6120_IntStatus_RcvAvail0_RMASK 0x1
+#define QIB_6120_IntStatus_Reserved1_LSB 0x5
+#define QIB_6120_IntStatus_Reserved1_RMASK 0x7F
+#define QIB_6120_IntStatus_RcvUrg4_LSB 0x4
+#define QIB_6120_IntStatus_RcvUrg4_RMASK 0x1
+#define QIB_6120_IntStatus_RcvUrg3_LSB 0x3
+#define QIB_6120_IntStatus_RcvUrg3_RMASK 0x1
+#define QIB_6120_IntStatus_RcvUrg2_LSB 0x2
+#define QIB_6120_IntStatus_RcvUrg2_RMASK 0x1
+#define QIB_6120_IntStatus_RcvUrg1_LSB 0x1
+#define QIB_6120_IntStatus_RcvUrg1_RMASK 0x1
+#define QIB_6120_IntStatus_RcvUrg0_LSB 0x0
+#define QIB_6120_IntStatus_RcvUrg0_RMASK 0x1
+
+#define QIB_6120_IntClear_OFFS 0x78
+#define QIB_6120_IntClear_ErrorIntClear_LSB 0x1F
+#define QIB_6120_IntClear_ErrorIntClear_RMASK 0x1
+#define QIB_6120_IntClear_PioSetIntClear_LSB 0x1E
+#define QIB_6120_IntClear_PioSetIntClear_RMASK 0x1
+#define QIB_6120_IntClear_PioBufAvailIntClear_LSB 0x1D
+#define QIB_6120_IntClear_PioBufAvailIntClear_RMASK 0x1
+#define QIB_6120_IntClear_assertGPIOIntClear_LSB 0x1C
+#define QIB_6120_IntClear_assertGPIOIntClear_RMASK 0x1
+#define QIB_6120_IntClear_Reserved_LSB 0xF
+#define QIB_6120_IntClear_Reserved_RMASK 0x1FFF
+#define QIB_6120_IntClear_RcvAvail4IntClear_LSB 0x10
+#define QIB_6120_IntClear_RcvAvail4IntClear_RMASK 0x1
+#define QIB_6120_IntClear_RcvAvail3IntClear_LSB 0xF
+#define QIB_6120_IntClear_RcvAvail3IntClear_RMASK 0x1
+#define QIB_6120_IntClear_RcvAvail2IntClear_LSB 0xE
+#define QIB_6120_IntClear_RcvAvail2IntClear_RMASK 0x1
+#define QIB_6120_IntClear_RcvAvail1IntClear_LSB 0xD
+#define QIB_6120_IntClear_RcvAvail1IntClear_RMASK 0x1
+#define QIB_6120_IntClear_RcvAvail0IntClear_LSB 0xC
+#define QIB_6120_IntClear_RcvAvail0IntClear_RMASK 0x1
+#define QIB_6120_IntClear_Reserved1_LSB 0x5
+#define QIB_6120_IntClear_Reserved1_RMASK 0x7F
+#define QIB_6120_IntClear_RcvUrg4IntClear_LSB 0x4
+#define QIB_6120_IntClear_RcvUrg4IntClear_RMASK 0x1
+#define QIB_6120_IntClear_RcvUrg3IntClear_LSB 0x3
+#define QIB_6120_IntClear_RcvUrg3IntClear_RMASK 0x1
+#define QIB_6120_IntClear_RcvUrg2IntClear_LSB 0x2
+#define QIB_6120_IntClear_RcvUrg2IntClear_RMASK 0x1
+#define QIB_6120_IntClear_RcvUrg1IntClear_LSB 0x1
+#define QIB_6120_IntClear_RcvUrg1IntClear_RMASK 0x1
+#define QIB_6120_IntClear_RcvUrg0IntClear_LSB 0x0
+#define QIB_6120_IntClear_RcvUrg0IntClear_RMASK 0x1
+
+#define QIB_6120_ErrMask_OFFS 0x80
+#define QIB_6120_ErrMask_Reserved_LSB 0x34
+#define QIB_6120_ErrMask_Reserved_RMASK 0xFFF
+#define QIB_6120_ErrMask_HardwareErrMask_LSB 0x33
+#define QIB_6120_ErrMask_HardwareErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_ResetNegatedMask_LSB 0x32
+#define QIB_6120_ErrMask_ResetNegatedMask_RMASK 0x1
+#define QIB_6120_ErrMask_InvalidAddrErrMask_LSB 0x31
+#define QIB_6120_ErrMask_InvalidAddrErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_IBStatusChangedMask_LSB 0x30
+#define QIB_6120_ErrMask_IBStatusChangedMask_RMASK 0x1
+#define QIB_6120_ErrMask_Reserved1_LSB 0x26
+#define QIB_6120_ErrMask_Reserved1_RMASK 0x3FF
+#define QIB_6120_ErrMask_SendUnsupportedVLErrMask_LSB 0x25
+#define QIB_6120_ErrMask_SendUnsupportedVLErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_SendUnexpectedPktNumErrMask_LSB 0x24
+#define QIB_6120_ErrMask_SendUnexpectedPktNumErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_SendPioArmLaunchErrMask_LSB 0x23
+#define QIB_6120_ErrMask_SendPioArmLaunchErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_SendDroppedDataPktErrMask_LSB 0x22
+#define QIB_6120_ErrMask_SendDroppedDataPktErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_SendDroppedSmpPktErrMask_LSB 0x21
+#define QIB_6120_ErrMask_SendDroppedSmpPktErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_SendPktLenErrMask_LSB 0x20
+#define QIB_6120_ErrMask_SendPktLenErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_SendUnderRunErrMask_LSB 0x1F
+#define QIB_6120_ErrMask_SendUnderRunErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_SendMaxPktLenErrMask_LSB 0x1E
+#define QIB_6120_ErrMask_SendMaxPktLenErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_SendMinPktLenErrMask_LSB 0x1D
+#define QIB_6120_ErrMask_SendMinPktLenErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_Reserved2_LSB 0x12
+#define QIB_6120_ErrMask_Reserved2_RMASK 0x7FF
+#define QIB_6120_ErrMask_RcvIBLostLinkErrMask_LSB 0x11
+#define QIB_6120_ErrMask_RcvIBLostLinkErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvHdrErrMask_LSB 0x10
+#define QIB_6120_ErrMask_RcvHdrErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvHdrLenErrMask_LSB 0xF
+#define QIB_6120_ErrMask_RcvHdrLenErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvBadTidErrMask_LSB 0xE
+#define QIB_6120_ErrMask_RcvBadTidErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvHdrFullErrMask_LSB 0xD
+#define QIB_6120_ErrMask_RcvHdrFullErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvEgrFullErrMask_LSB 0xC
+#define QIB_6120_ErrMask_RcvEgrFullErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvBadVersionErrMask_LSB 0xB
+#define QIB_6120_ErrMask_RcvBadVersionErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvIBFlowErrMask_LSB 0xA
+#define QIB_6120_ErrMask_RcvIBFlowErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvEBPErrMask_LSB 0x9
+#define QIB_6120_ErrMask_RcvEBPErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvUnsupportedVLErrMask_LSB 0x8
+#define QIB_6120_ErrMask_RcvUnsupportedVLErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvUnexpectedCharErrMask_LSB 0x7
+#define QIB_6120_ErrMask_RcvUnexpectedCharErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvShortPktLenErrMask_LSB 0x6
+#define QIB_6120_ErrMask_RcvShortPktLenErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvLongPktLenErrMask_LSB 0x5
+#define QIB_6120_ErrMask_RcvLongPktLenErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvMaxPktLenErrMask_LSB 0x4
+#define QIB_6120_ErrMask_RcvMaxPktLenErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvMinPktLenErrMask_LSB 0x3
+#define QIB_6120_ErrMask_RcvMinPktLenErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvICRCErrMask_LSB 0x2
+#define QIB_6120_ErrMask_RcvICRCErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvVCRCErrMask_LSB 0x1
+#define QIB_6120_ErrMask_RcvVCRCErrMask_RMASK 0x1
+#define QIB_6120_ErrMask_RcvFormatErrMask_LSB 0x0
+#define QIB_6120_ErrMask_RcvFormatErrMask_RMASK 0x1
+
+#define QIB_6120_ErrStatus_OFFS 0x88
+#define QIB_6120_ErrStatus_Reserved_LSB 0x34
+#define QIB_6120_ErrStatus_Reserved_RMASK 0xFFF
+#define QIB_6120_ErrStatus_HardwareErr_LSB 0x33
+#define QIB_6120_ErrStatus_HardwareErr_RMASK 0x1
+#define QIB_6120_ErrStatus_ResetNegated_LSB 0x32
+#define QIB_6120_ErrStatus_ResetNegated_RMASK 0x1
+#define QIB_6120_ErrStatus_InvalidAddrErr_LSB 0x31
+#define QIB_6120_ErrStatus_InvalidAddrErr_RMASK 0x1
+#define QIB_6120_ErrStatus_IBStatusChanged_LSB 0x30
+#define QIB_6120_ErrStatus_IBStatusChanged_RMASK 0x1
+#define QIB_6120_ErrStatus_Reserved1_LSB 0x26
+#define QIB_6120_ErrStatus_Reserved1_RMASK 0x3FF
+#define QIB_6120_ErrStatus_SendUnsupportedVLErr_LSB 0x25
+#define QIB_6120_ErrStatus_SendUnsupportedVLErr_RMASK 0x1
+#define QIB_6120_ErrStatus_SendUnexpectedPktNumErr_LSB 0x24
+#define QIB_6120_ErrStatus_SendUnexpectedPktNumErr_RMASK 0x1
+#define QIB_6120_ErrStatus_SendPioArmLaunchErr_LSB 0x23
+#define QIB_6120_ErrStatus_SendPioArmLaunchErr_RMASK 0x1
+#define QIB_6120_ErrStatus_SendDroppedDataPktErr_LSB 0x22
+#define QIB_6120_ErrStatus_SendDroppedDataPktErr_RMASK 0x1
+#define QIB_6120_ErrStatus_SendDroppedSmpPktErr_LSB 0x21
+#define QIB_6120_ErrStatus_SendDroppedSmpPktErr_RMASK 0x1
+#define QIB_6120_ErrStatus_SendPktLenErr_LSB 0x20
+#define QIB_6120_ErrStatus_SendPktLenErr_RMASK 0x1
+#define QIB_6120_ErrStatus_SendUnderRunErr_LSB 0x1F
+#define QIB_6120_ErrStatus_SendUnderRunErr_RMASK 0x1
+#define QIB_6120_ErrStatus_SendMaxPktLenErr_LSB 0x1E
+#define QIB_6120_ErrStatus_SendMaxPktLenErr_RMASK 0x1
+#define QIB_6120_ErrStatus_SendMinPktLenErr_LSB 0x1D
+#define QIB_6120_ErrStatus_SendMinPktLenErr_RMASK 0x1
+#define QIB_6120_ErrStatus_Reserved2_LSB 0x12
+#define QIB_6120_ErrStatus_Reserved2_RMASK 0x7FF
+#define QIB_6120_ErrStatus_RcvIBLostLinkErr_LSB 0x11
+#define QIB_6120_ErrStatus_RcvIBLostLinkErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvHdrErr_LSB 0x10
+#define QIB_6120_ErrStatus_RcvHdrErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvHdrLenErr_LSB 0xF
+#define QIB_6120_ErrStatus_RcvHdrLenErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvBadTidErr_LSB 0xE
+#define QIB_6120_ErrStatus_RcvBadTidErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvHdrFullErr_LSB 0xD
+#define QIB_6120_ErrStatus_RcvHdrFullErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvEgrFullErr_LSB 0xC
+#define QIB_6120_ErrStatus_RcvEgrFullErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvBadVersionErr_LSB 0xB
+#define QIB_6120_ErrStatus_RcvBadVersionErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvIBFlowErr_LSB 0xA
+#define QIB_6120_ErrStatus_RcvIBFlowErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvEBPErr_LSB 0x9
+#define QIB_6120_ErrStatus_RcvEBPErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvUnsupportedVLErr_LSB 0x8
+#define QIB_6120_ErrStatus_RcvUnsupportedVLErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvUnexpectedCharErr_LSB 0x7
+#define QIB_6120_ErrStatus_RcvUnexpectedCharErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvShortPktLenErr_LSB 0x6
+#define QIB_6120_ErrStatus_RcvShortPktLenErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvLongPktLenErr_LSB 0x5
+#define QIB_6120_ErrStatus_RcvLongPktLenErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvMaxPktLenErr_LSB 0x4
+#define QIB_6120_ErrStatus_RcvMaxPktLenErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvMinPktLenErr_LSB 0x3
+#define QIB_6120_ErrStatus_RcvMinPktLenErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvICRCErr_LSB 0x2
+#define QIB_6120_ErrStatus_RcvICRCErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvVCRCErr_LSB 0x1
+#define QIB_6120_ErrStatus_RcvVCRCErr_RMASK 0x1
+#define QIB_6120_ErrStatus_RcvFormatErr_LSB 0x0
+#define QIB_6120_ErrStatus_RcvFormatErr_RMASK 0x1
+
+#define QIB_6120_ErrClear_OFFS 0x90
+#define QIB_6120_ErrClear_Reserved_LSB 0x34
+#define QIB_6120_ErrClear_Reserved_RMASK 0xFFF
+#define QIB_6120_ErrClear_HardwareErrClear_LSB 0x33
+#define QIB_6120_ErrClear_HardwareErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_ResetNegatedClear_LSB 0x32
+#define QIB_6120_ErrClear_ResetNegatedClear_RMASK 0x1
+#define QIB_6120_ErrClear_InvalidAddrErrClear_LSB 0x31
+#define QIB_6120_ErrClear_InvalidAddrErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_IBStatusChangedClear_LSB 0x30
+#define QIB_6120_ErrClear_IBStatusChangedClear_RMASK 0x1
+#define QIB_6120_ErrClear_Reserved1_LSB 0x26
+#define QIB_6120_ErrClear_Reserved1_RMASK 0x3FF
+#define QIB_6120_ErrClear_SendUnsupportedVLErrClear_LSB 0x25
+#define QIB_6120_ErrClear_SendUnsupportedVLErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_SendUnexpectedPktNumErrClear_LSB 0x24
+#define QIB_6120_ErrClear_SendUnexpectedPktNumErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_SendPioArmLaunchErrClear_LSB 0x23
+#define QIB_6120_ErrClear_SendPioArmLaunchErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_SendDroppedDataPktErrClear_LSB 0x22
+#define QIB_6120_ErrClear_SendDroppedDataPktErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_SendDroppedSmpPktErrClear_LSB 0x21
+#define QIB_6120_ErrClear_SendDroppedSmpPktErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_SendPktLenErrClear_LSB 0x20
+#define QIB_6120_ErrClear_SendPktLenErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_SendUnderRunErrClear_LSB 0x1F
+#define QIB_6120_ErrClear_SendUnderRunErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_SendMaxPktLenErrClear_LSB 0x1E
+#define QIB_6120_ErrClear_SendMaxPktLenErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_SendMinPktLenErrClear_LSB 0x1D
+#define QIB_6120_ErrClear_SendMinPktLenErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_Reserved2_LSB 0x12
+#define QIB_6120_ErrClear_Reserved2_RMASK 0x7FF
+#define QIB_6120_ErrClear_RcvIBLostLinkErrClear_LSB 0x11
+#define QIB_6120_ErrClear_RcvIBLostLinkErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvHdrErrClear_LSB 0x10
+#define QIB_6120_ErrClear_RcvHdrErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvHdrLenErrClear_LSB 0xF
+#define QIB_6120_ErrClear_RcvHdrLenErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvBadTidErrClear_LSB 0xE
+#define QIB_6120_ErrClear_RcvBadTidErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvHdrFullErrClear_LSB 0xD
+#define QIB_6120_ErrClear_RcvHdrFullErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvEgrFullErrClear_LSB 0xC
+#define QIB_6120_ErrClear_RcvEgrFullErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvBadVersionErrClear_LSB 0xB
+#define QIB_6120_ErrClear_RcvBadVersionErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvIBFlowErrClear_LSB 0xA
+#define QIB_6120_ErrClear_RcvIBFlowErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvEBPErrClear_LSB 0x9
+#define QIB_6120_ErrClear_RcvEBPErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvUnsupportedVLErrClear_LSB 0x8
+#define QIB_6120_ErrClear_RcvUnsupportedVLErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvUnexpectedCharErrClear_LSB 0x7
+#define QIB_6120_ErrClear_RcvUnexpectedCharErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvShortPktLenErrClear_LSB 0x6
+#define QIB_6120_ErrClear_RcvShortPktLenErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvLongPktLenErrClear_LSB 0x5
+#define QIB_6120_ErrClear_RcvLongPktLenErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvMaxPktLenErrClear_LSB 0x4
+#define QIB_6120_ErrClear_RcvMaxPktLenErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvMinPktLenErrClear_LSB 0x3
+#define QIB_6120_ErrClear_RcvMinPktLenErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvICRCErrClear_LSB 0x2
+#define QIB_6120_ErrClear_RcvICRCErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvVCRCErrClear_LSB 0x1
+#define QIB_6120_ErrClear_RcvVCRCErrClear_RMASK 0x1
+#define QIB_6120_ErrClear_RcvFormatErrClear_LSB 0x0
+#define QIB_6120_ErrClear_RcvFormatErrClear_RMASK 0x1
+
+#define QIB_6120_HwErrMask_OFFS 0x98
+#define QIB_6120_HwErrMask_IBCBusFromSPCParityErrMask_LSB 0x3F
+#define QIB_6120_HwErrMask_IBCBusFromSPCParityErrMask_RMASK 0x1
+#define QIB_6120_HwErrMask_IBCBusToSPCParityErrMask_LSB 0x3E
+#define QIB_6120_HwErrMask_IBCBusToSPCParityErrMask_RMASK 0x1
+#define QIB_6120_HwErrMask_Reserved_LSB 0x3D
+#define QIB_6120_HwErrMask_Reserved_RMASK 0x1
+#define QIB_6120_HwErrMask_IBSerdesPClkNotDetectMask_LSB 0x3C
+#define QIB_6120_HwErrMask_IBSerdesPClkNotDetectMask_RMASK 0x1
+#define QIB_6120_HwErrMask_PCIESerdesQ0PClkNotDetectMask_LSB 0x3B
+#define QIB_6120_HwErrMask_PCIESerdesQ0PClkNotDetectMask_RMASK 0x1
+#define QIB_6120_HwErrMask_PCIESerdesQ1PClkNotDetectMask_LSB 0x3A
+#define QIB_6120_HwErrMask_PCIESerdesQ1PClkNotDetectMask_RMASK 0x1
+#define QIB_6120_HwErrMask_Reserved1_LSB 0x39
+#define QIB_6120_HwErrMask_Reserved1_RMASK 0x1
+#define QIB_6120_HwErrMask_IBPLLrfSlipMask_LSB 0x38
+#define QIB_6120_HwErrMask_IBPLLrfSlipMask_RMASK 0x1
+#define QIB_6120_HwErrMask_IBPLLfbSlipMask_LSB 0x37
+#define QIB_6120_HwErrMask_IBPLLfbSlipMask_RMASK 0x1
+#define QIB_6120_HwErrMask_PowerOnBISTFailedMask_LSB 0x36
+#define QIB_6120_HwErrMask_PowerOnBISTFailedMask_RMASK 0x1
+#define QIB_6120_HwErrMask_Reserved2_LSB 0x33
+#define QIB_6120_HwErrMask_Reserved2_RMASK 0x7
+#define QIB_6120_HwErrMask_RXEMemParityErrMask_LSB 0x2C
+#define QIB_6120_HwErrMask_RXEMemParityErrMask_RMASK 0x7F
+#define QIB_6120_HwErrMask_TXEMemParityErrMask_LSB 0x28
+#define QIB_6120_HwErrMask_TXEMemParityErrMask_RMASK 0xF
+#define QIB_6120_HwErrMask_Reserved3_LSB 0x22
+#define QIB_6120_HwErrMask_Reserved3_RMASK 0x3F
+#define QIB_6120_HwErrMask_PCIeBusParityErrMask_LSB 0x1F
+#define QIB_6120_HwErrMask_PCIeBusParityErrMask_RMASK 0x7
+#define QIB_6120_HwErrMask_PcieCplTimeoutMask_LSB 0x1E
+#define QIB_6120_HwErrMask_PcieCplTimeoutMask_RMASK 0x1
+#define QIB_6120_HwErrMask_PoisonedTLPMask_LSB 0x1D
+#define QIB_6120_HwErrMask_PoisonedTLPMask_RMASK 0x1
+#define QIB_6120_HwErrMask_Reserved4_LSB 0x6
+#define QIB_6120_HwErrMask_Reserved4_RMASK 0x7FFFFF
+#define QIB_6120_HwErrMask_PCIeMemParityErrMask_LSB 0x0
+#define QIB_6120_HwErrMask_PCIeMemParityErrMask_RMASK 0x3F
+
+#define QIB_6120_HwErrStatus_OFFS 0xA0
+#define QIB_6120_HwErrStatus_IBCBusFromSPCParityErr_LSB 0x3F
+#define QIB_6120_HwErrStatus_IBCBusFromSPCParityErr_RMASK 0x1
+#define QIB_6120_HwErrStatus_IBCBusToSPCParityErr_LSB 0x3E
+#define QIB_6120_HwErrStatus_IBCBusToSPCParityErr_RMASK 0x1
+#define QIB_6120_HwErrStatus_Reserved_LSB 0x3D
+#define QIB_6120_HwErrStatus_Reserved_RMASK 0x1
+#define QIB_6120_HwErrStatus_IBSerdesPClkNotDetect_LSB 0x3C
+#define QIB_6120_HwErrStatus_IBSerdesPClkNotDetect_RMASK 0x1
+#define QIB_6120_HwErrStatus_PCIESerdesQ0PClkNotDetect_LSB 0x3B
+#define QIB_6120_HwErrStatus_PCIESerdesQ0PClkNotDetect_RMASK 0x1
+#define QIB_6120_HwErrStatus_PCIESerdesQ1PClkNotDetect_LSB 0x3A
+#define QIB_6120_HwErrStatus_PCIESerdesQ1PClkNotDetect_RMASK 0x1
+#define QIB_6120_HwErrStatus_Reserved1_LSB 0x39
+#define QIB_6120_HwErrStatus_Reserved1_RMASK 0x1
+#define QIB_6120_HwErrStatus_IBPLLrfSlip_LSB 0x38
+#define QIB_6120_HwErrStatus_IBPLLrfSlip_RMASK 0x1
+#define QIB_6120_HwErrStatus_IBPLLfbSlip_LSB 0x37
+#define QIB_6120_HwErrStatus_IBPLLfbSlip_RMASK 0x1
+#define QIB_6120_HwErrStatus_PowerOnBISTFailed_LSB 0x36
+#define QIB_6120_HwErrStatus_PowerOnBISTFailed_RMASK 0x1
+#define QIB_6120_HwErrStatus_Reserved2_LSB 0x33
+#define QIB_6120_HwErrStatus_Reserved2_RMASK 0x7
+#define QIB_6120_HwErrStatus_RXEMemParity_LSB 0x2C
+#define QIB_6120_HwErrStatus_RXEMemParity_RMASK 0x7F
+#define QIB_6120_HwErrStatus_TXEMemParity_LSB 0x28
+#define QIB_6120_HwErrStatus_TXEMemParity_RMASK 0xF
+#define QIB_6120_HwErrStatus_Reserved3_LSB 0x22
+#define QIB_6120_HwErrStatus_Reserved3_RMASK 0x3F
+#define QIB_6120_HwErrStatus_PCIeBusParity_LSB 0x1F
+#define QIB_6120_HwErrStatus_PCIeBusParity_RMASK 0x7
+#define QIB_6120_HwErrStatus_PcieCplTimeout_LSB 0x1E
+#define QIB_6120_HwErrStatus_PcieCplTimeout_RMASK 0x1
+#define QIB_6120_HwErrStatus_PoisenedTLP_LSB 0x1D
+#define QIB_6120_HwErrStatus_PoisenedTLP_RMASK 0x1
+#define QIB_6120_HwErrStatus_Reserved4_LSB 0x6
+#define QIB_6120_HwErrStatus_Reserved4_RMASK 0x7FFFFF
+#define QIB_6120_HwErrStatus_PCIeMemParity_LSB 0x0
+#define QIB_6120_HwErrStatus_PCIeMemParity_RMASK 0x3F
+
+#define QIB_6120_HwErrClear_OFFS 0xA8
+#define QIB_6120_HwErrClear_IBCBusFromSPCParityErrClear_LSB 0x3F
+#define QIB_6120_HwErrClear_IBCBusFromSPCParityErrClear_RMASK 0x1
+#define QIB_6120_HwErrClear_IBCBusToSPCparityErrClear_LSB 0x3E
+#define QIB_6120_HwErrClear_IBCBusToSPCparityErrClear_RMASK 0x1
+#define QIB_6120_HwErrClear_Reserved_LSB 0x3D
+#define QIB_6120_HwErrClear_Reserved_RMASK 0x1
+#define QIB_6120_HwErrClear_IBSerdesPClkNotDetectClear_LSB 0x3C
+#define QIB_6120_HwErrClear_IBSerdesPClkNotDetectClear_RMASK 0x1
+#define QIB_6120_HwErrClear_PCIESerdesQ0PClkNotDetectClear_LSB 0x3B
+#define QIB_6120_HwErrClear_PCIESerdesQ0PClkNotDetectClear_RMASK 0x1
+#define QIB_6120_HwErrClear_PCIESerdesQ1PClkNotDetectClear_LSB 0x3A
+#define QIB_6120_HwErrClear_PCIESerdesQ1PClkNotDetectClear_RMASK 0x1
+#define QIB_6120_HwErrClear_Reserved1_LSB 0x39
+#define QIB_6120_HwErrClear_Reserved1_RMASK 0x1
+#define QIB_6120_HwErrClear_IBPLLrfSlipClear_LSB 0x38
+#define QIB_6120_HwErrClear_IBPLLrfSlipClear_RMASK 0x1
+#define QIB_6120_HwErrClear_IBPLLfbSlipClear_LSB 0x37
+#define QIB_6120_HwErrClear_IBPLLfbSlipClear_RMASK 0x1
+#define QIB_6120_HwErrClear_PowerOnBISTFailedClear_LSB 0x36
+#define QIB_6120_HwErrClear_PowerOnBISTFailedClear_RMASK 0x1
+#define QIB_6120_HwErrClear_Reserved2_LSB 0x33
+#define QIB_6120_HwErrClear_Reserved2_RMASK 0x7
+#define QIB_6120_HwErrClear_RXEMemParityClear_LSB 0x2C
+#define QIB_6120_HwErrClear_RXEMemParityClear_RMASK 0x7F
+#define QIB_6120_HwErrClear_TXEMemParityClear_LSB 0x28
+#define QIB_6120_HwErrClear_TXEMemParityClear_RMASK 0xF
+#define QIB_6120_HwErrClear_Reserved3_LSB 0x22
+#define QIB_6120_HwErrClear_Reserved3_RMASK 0x3F
+#define QIB_6120_HwErrClear_PCIeBusParityClr_LSB 0x1F
+#define QIB_6120_HwErrClear_PCIeBusParityClr_RMASK 0x7
+#define QIB_6120_HwErrClear_PcieCplTimeoutClear_LSB 0x1E
+#define QIB_6120_HwErrClear_PcieCplTimeoutClear_RMASK 0x1
+#define QIB_6120_HwErrClear_PoisonedTLPClear_LSB 0x1D
+#define QIB_6120_HwErrClear_PoisonedTLPClear_RMASK 0x1
+#define QIB_6120_HwErrClear_Reserved4_LSB 0x6
+#define QIB_6120_HwErrClear_Reserved4_RMASK 0x7FFFFF
+#define QIB_6120_HwErrClear_PCIeMemParityClr_LSB 0x0
+#define QIB_6120_HwErrClear_PCIeMemParityClr_RMASK 0x3F
+
+#define QIB_6120_HwDiagCtrl_OFFS 0xB0
+#define QIB_6120_HwDiagCtrl_ForceIBCBusFromSPCParityErr_LSB 0x3F
+#define QIB_6120_HwDiagCtrl_ForceIBCBusFromSPCParityErr_RMASK 0x1
+#define QIB_6120_HwDiagCtrl_ForceIBCBusToSPCParityErr_LSB 0x3E
+#define QIB_6120_HwDiagCtrl_ForceIBCBusToSPCParityErr_RMASK 0x1
+#define QIB_6120_HwDiagCtrl_CounterWrEnable_LSB 0x3D
+#define QIB_6120_HwDiagCtrl_CounterWrEnable_RMASK 0x1
+#define QIB_6120_HwDiagCtrl_CounterDisable_LSB 0x3C
+#define QIB_6120_HwDiagCtrl_CounterDisable_RMASK 0x1
+#define QIB_6120_HwDiagCtrl_Reserved_LSB 0x33
+#define QIB_6120_HwDiagCtrl_Reserved_RMASK 0x1FF
+#define QIB_6120_HwDiagCtrl_ForceRxMemParityErr_LSB 0x2C
+#define QIB_6120_HwDiagCtrl_ForceRxMemParityErr_RMASK 0x7F
+#define QIB_6120_HwDiagCtrl_ForceTxMemparityErr_LSB 0x28
+#define QIB_6120_HwDiagCtrl_ForceTxMemparityErr_RMASK 0xF
+#define QIB_6120_HwDiagCtrl_Reserved1_LSB 0x23
+#define QIB_6120_HwDiagCtrl_Reserved1_RMASK 0x1F
+#define QIB_6120_HwDiagCtrl_forcePCIeBusParity_LSB 0x1F
+#define QIB_6120_HwDiagCtrl_forcePCIeBusParity_RMASK 0xF
+#define QIB_6120_HwDiagCtrl_Reserved2_LSB 0x6
+#define QIB_6120_HwDiagCtrl_Reserved2_RMASK 0x1FFFFFF
+#define QIB_6120_HwDiagCtrl_forcePCIeMemParity_LSB 0x0
+#define QIB_6120_HwDiagCtrl_forcePCIeMemParity_RMASK 0x3F
+
+#define QIB_6120_IBCStatus_OFFS 0xC0
+#define QIB_6120_IBCStatus_TxCreditOk_LSB 0x1F
+#define QIB_6120_IBCStatus_TxCreditOk_RMASK 0x1
+#define QIB_6120_IBCStatus_TxReady_LSB 0x1E
+#define QIB_6120_IBCStatus_TxReady_RMASK 0x1
+#define QIB_6120_IBCStatus_Reserved_LSB 0x7
+#define QIB_6120_IBCStatus_Reserved_RMASK 0x7FFFFF
+#define QIB_6120_IBCStatus_LinkState_LSB 0x4
+#define QIB_6120_IBCStatus_LinkState_RMASK 0x7
+#define QIB_6120_IBCStatus_LinkTrainingState_LSB 0x0
+#define QIB_6120_IBCStatus_LinkTrainingState_RMASK 0xF
+
+#define QIB_6120_IBCCtrl_OFFS 0xC8
+#define QIB_6120_IBCCtrl_Loopback_LSB 0x3F
+#define QIB_6120_IBCCtrl_Loopback_RMASK 0x1
+#define QIB_6120_IBCCtrl_LinkDownDefaultState_LSB 0x3E
+#define QIB_6120_IBCCtrl_LinkDownDefaultState_RMASK 0x1
+#define QIB_6120_IBCCtrl_Reserved_LSB 0x2B
+#define QIB_6120_IBCCtrl_Reserved_RMASK 0x7FFFF
+#define QIB_6120_IBCCtrl_CreditScale_LSB 0x28
+#define QIB_6120_IBCCtrl_CreditScale_RMASK 0x7
+#define QIB_6120_IBCCtrl_OverrunThreshold_LSB 0x24
+#define QIB_6120_IBCCtrl_OverrunThreshold_RMASK 0xF
+#define QIB_6120_IBCCtrl_PhyerrThreshold_LSB 0x20
+#define QIB_6120_IBCCtrl_PhyerrThreshold_RMASK 0xF
+#define QIB_6120_IBCCtrl_Reserved1_LSB 0x1F
+#define QIB_6120_IBCCtrl_Reserved1_RMASK 0x1
+#define QIB_6120_IBCCtrl_MaxPktLen_LSB 0x14
+#define QIB_6120_IBCCtrl_MaxPktLen_RMASK 0x7FF
+#define QIB_6120_IBCCtrl_LinkCmd_LSB 0x12
+#define QIB_6120_IBCCtrl_LinkCmd_RMASK 0x3
+#define QIB_6120_IBCCtrl_LinkInitCmd_LSB 0x10
+#define QIB_6120_IBCCtrl_LinkInitCmd_RMASK 0x3
+#define QIB_6120_IBCCtrl_FlowCtrlWaterMark_LSB 0x8
+#define QIB_6120_IBCCtrl_FlowCtrlWaterMark_RMASK 0xFF
+#define QIB_6120_IBCCtrl_FlowCtrlPeriod_LSB 0x0
+#define QIB_6120_IBCCtrl_FlowCtrlPeriod_RMASK 0xFF
+
+#define QIB_6120_EXTStatus_OFFS 0xD0
+#define QIB_6120_EXTStatus_GPIOIn_LSB 0x30
+#define QIB_6120_EXTStatus_GPIOIn_RMASK 0xFFFF
+#define QIB_6120_EXTStatus_Reserved_LSB 0x20
+#define QIB_6120_EXTStatus_Reserved_RMASK 0xFFFF
+#define QIB_6120_EXTStatus_Reserved1_LSB 0x10
+#define QIB_6120_EXTStatus_Reserved1_RMASK 0xFFFF
+#define QIB_6120_EXTStatus_MemBISTFoundErr_LSB 0xF
+#define QIB_6120_EXTStatus_MemBISTFoundErr_RMASK 0x1
+#define QIB_6120_EXTStatus_MemBISTEndTest_LSB 0xE
+#define QIB_6120_EXTStatus_MemBISTEndTest_RMASK 0x1
+#define QIB_6120_EXTStatus_Reserved2_LSB 0x0
+#define QIB_6120_EXTStatus_Reserved2_RMASK 0x3FFF
+
+#define QIB_6120_EXTCtrl_OFFS 0xD8
+#define QIB_6120_EXTCtrl_GPIOOe_LSB 0x30
+#define QIB_6120_EXTCtrl_GPIOOe_RMASK 0xFFFF
+#define QIB_6120_EXTCtrl_GPIOInvert_LSB 0x20
+#define QIB_6120_EXTCtrl_GPIOInvert_RMASK 0xFFFF
+#define QIB_6120_EXTCtrl_Reserved_LSB 0x4
+#define QIB_6120_EXTCtrl_Reserved_RMASK 0xFFFFFFF
+#define QIB_6120_EXTCtrl_LEDPriPortGreenOn_LSB 0x3
+#define QIB_6120_EXTCtrl_LEDPriPortGreenOn_RMASK 0x1
+#define QIB_6120_EXTCtrl_LEDPriPortYellowOn_LSB 0x2
+#define QIB_6120_EXTCtrl_LEDPriPortYellowOn_RMASK 0x1
+#define QIB_6120_EXTCtrl_LEDGblOkGreenOn_LSB 0x1
+#define QIB_6120_EXTCtrl_LEDGblOkGreenOn_RMASK 0x1
+#define QIB_6120_EXTCtrl_LEDGblErrRedOff_LSB 0x0
+#define QIB_6120_EXTCtrl_LEDGblErrRedOff_RMASK 0x1
+
+#define QIB_6120_GPIOOut_OFFS 0xE0
+
+#define QIB_6120_GPIOMask_OFFS 0xE8
+
+#define QIB_6120_GPIOStatus_OFFS 0xF0
+
+#define QIB_6120_GPIOClear_OFFS 0xF8
+
+#define QIB_6120_RcvCtrl_OFFS 0x100
+#define QIB_6120_RcvCtrl_TailUpd_LSB 0x1F
+#define QIB_6120_RcvCtrl_TailUpd_RMASK 0x1
+#define QIB_6120_RcvCtrl_RcvPartitionKeyDisable_LSB 0x1E
+#define QIB_6120_RcvCtrl_RcvPartitionKeyDisable_RMASK 0x1
+#define QIB_6120_RcvCtrl_Reserved_LSB 0x15
+#define QIB_6120_RcvCtrl_Reserved_RMASK 0x1FF
+#define QIB_6120_RcvCtrl_IntrAvail_LSB 0x10
+#define QIB_6120_RcvCtrl_IntrAvail_RMASK 0x1F
+#define QIB_6120_RcvCtrl_Reserved1_LSB 0x9
+#define QIB_6120_RcvCtrl_Reserved1_RMASK 0x7F
+#define QIB_6120_RcvCtrl_Reserved2_LSB 0x5
+#define QIB_6120_RcvCtrl_Reserved2_RMASK 0xF
+#define QIB_6120_RcvCtrl_PortEnable_LSB 0x0
+#define QIB_6120_RcvCtrl_PortEnable_RMASK 0x1F
+
+#define QIB_6120_RcvBTHQP_OFFS 0x108
+#define QIB_6120_RcvBTHQP_BTHQP_Mask_LSB 0x1E
+#define QIB_6120_RcvBTHQP_BTHQP_Mask_RMASK 0x3
+#define QIB_6120_RcvBTHQP_Reserved_LSB 0x18
+#define QIB_6120_RcvBTHQP_Reserved_RMASK 0x3F
+#define QIB_6120_RcvBTHQP_RcvBTHQP_LSB 0x0
+#define QIB_6120_RcvBTHQP_RcvBTHQP_RMASK 0xFFFFFF
+
+#define QIB_6120_RcvHdrSize_OFFS 0x110
+
+#define QIB_6120_RcvHdrCnt_OFFS 0x118
+
+#define QIB_6120_RcvHdrEntSize_OFFS 0x120
+
+#define QIB_6120_RcvTIDBase_OFFS 0x128
+
+#define QIB_6120_RcvTIDCnt_OFFS 0x130
+
+#define QIB_6120_RcvEgrBase_OFFS 0x138
+
+#define QIB_6120_RcvEgrCnt_OFFS 0x140
+
+#define QIB_6120_RcvBufBase_OFFS 0x148
+
+#define QIB_6120_RcvBufSize_OFFS 0x150
+
+#define QIB_6120_RxIntMemBase_OFFS 0x158
+
+#define QIB_6120_RxIntMemSize_OFFS 0x160
+
+#define QIB_6120_RcvPartitionKey_OFFS 0x168
+
+#define QIB_6120_RcvPktLEDCnt_OFFS 0x178
+#define QIB_6120_RcvPktLEDCnt_ONperiod_LSB 0x20
+#define QIB_6120_RcvPktLEDCnt_ONperiod_RMASK 0xFFFFFFFF
+#define QIB_6120_RcvPktLEDCnt_OFFperiod_LSB 0x0
+#define QIB_6120_RcvPktLEDCnt_OFFperiod_RMASK 0xFFFFFFFF
+
+#define QIB_6120_SendCtrl_OFFS 0x1C0
+#define QIB_6120_SendCtrl_Disarm_LSB 0x1F
+#define QIB_6120_SendCtrl_Disarm_RMASK 0x1
+#define QIB_6120_SendCtrl_Reserved_LSB 0x17
+#define QIB_6120_SendCtrl_Reserved_RMASK 0xFF
+#define QIB_6120_SendCtrl_DisarmPIOBuf_LSB 0x10
+#define QIB_6120_SendCtrl_DisarmPIOBuf_RMASK 0x7F
+#define QIB_6120_SendCtrl_Reserved1_LSB 0x4
+#define QIB_6120_SendCtrl_Reserved1_RMASK 0xFFF
+#define QIB_6120_SendCtrl_PIOEnable_LSB 0x3
+#define QIB_6120_SendCtrl_PIOEnable_RMASK 0x1
+#define QIB_6120_SendCtrl_PIOBufAvailUpd_LSB 0x2
+#define QIB_6120_SendCtrl_PIOBufAvailUpd_RMASK 0x1
+#define QIB_6120_SendCtrl_PIOIntBufAvail_LSB 0x1
+#define QIB_6120_SendCtrl_PIOIntBufAvail_RMASK 0x1
+#define QIB_6120_SendCtrl_Abort_LSB 0x0
+#define QIB_6120_SendCtrl_Abort_RMASK 0x1
+
+#define QIB_6120_SendPIOBufBase_OFFS 0x1C8
+#define QIB_6120_SendPIOBufBase_Reserved_LSB 0x35
+#define QIB_6120_SendPIOBufBase_Reserved_RMASK 0x7FF
+#define QIB_6120_SendPIOBufBase_BaseAddr_LargePIO_LSB 0x20
+#define QIB_6120_SendPIOBufBase_BaseAddr_LargePIO_RMASK 0x1FFFFF
+#define QIB_6120_SendPIOBufBase_Reserved1_LSB 0x15
+#define QIB_6120_SendPIOBufBase_Reserved1_RMASK 0x7FF
+#define QIB_6120_SendPIOBufBase_BaseAddr_SmallPIO_LSB 0x0
+#define QIB_6120_SendPIOBufBase_BaseAddr_SmallPIO_RMASK 0x1FFFFF
+
+#define QIB_6120_SendPIOSize_OFFS 0x1D0
+#define QIB_6120_SendPIOSize_Reserved_LSB 0x2D
+#define QIB_6120_SendPIOSize_Reserved_RMASK 0xFFFFF
+#define QIB_6120_SendPIOSize_Size_LargePIO_LSB 0x20
+#define QIB_6120_SendPIOSize_Size_LargePIO_RMASK 0x1FFF
+#define QIB_6120_SendPIOSize_Reserved1_LSB 0xC
+#define QIB_6120_SendPIOSize_Reserved1_RMASK 0xFFFFF
+#define QIB_6120_SendPIOSize_Size_SmallPIO_LSB 0x0
+#define QIB_6120_SendPIOSize_Size_SmallPIO_RMASK 0xFFF
+
+#define QIB_6120_SendPIOBufCnt_OFFS 0x1D8
+#define QIB_6120_SendPIOBufCnt_Reserved_LSB 0x24
+#define QIB_6120_SendPIOBufCnt_Reserved_RMASK 0xFFFFFFF
+#define QIB_6120_SendPIOBufCnt_Num_LargePIO_LSB 0x20
+#define QIB_6120_SendPIOBufCnt_Num_LargePIO_RMASK 0xF
+#define QIB_6120_SendPIOBufCnt_Reserved1_LSB 0x9
+#define QIB_6120_SendPIOBufCnt_Reserved1_RMASK 0x7FFFFF
+#define QIB_6120_SendPIOBufCnt_Num_SmallPIO_LSB 0x0
+#define QIB_6120_SendPIOBufCnt_Num_SmallPIO_RMASK 0x1FF
+
+#define QIB_6120_SendPIOAvailAddr_OFFS 0x1E0
+#define QIB_6120_SendPIOAvailAddr_SendPIOAvailAddr_LSB 0x6
+#define QIB_6120_SendPIOAvailAddr_SendPIOAvailAddr_RMASK 0x3FFFFFFFF
+#define QIB_6120_SendPIOAvailAddr_Reserved_LSB 0x0
+#define QIB_6120_SendPIOAvailAddr_Reserved_RMASK 0x3F
+
+#define QIB_6120_SendBufErr0_OFFS 0x240
+#define QIB_6120_SendBufErr0_SendBufErrPIO_63_0_LSB 0x0
+#define QIB_6120_SendBufErr0_SendBufErrPIO_63_0_RMASK 0x0
+
+#define QIB_6120_RcvHdrAddr0_OFFS 0x280
+#define QIB_6120_RcvHdrAddr0_RcvHdrAddr0_LSB 0x2
+#define QIB_6120_RcvHdrAddr0_RcvHdrAddr0_RMASK 0x3FFFFFFFFF
+#define QIB_6120_RcvHdrAddr0_Reserved_LSB 0x0
+#define QIB_6120_RcvHdrAddr0_Reserved_RMASK 0x3
+
+#define QIB_6120_RcvHdrTailAddr0_OFFS 0x300
+#define QIB_6120_RcvHdrTailAddr0_RcvHdrTailAddr0_LSB 0x2
+#define QIB_6120_RcvHdrTailAddr0_RcvHdrTailAddr0_RMASK 0x3FFFFFFFFF
+#define QIB_6120_RcvHdrTailAddr0_Reserved_LSB 0x0
+#define QIB_6120_RcvHdrTailAddr0_Reserved_RMASK 0x3
+
+#define QIB_6120_SerdesCfg0_OFFS 0x3C0
+#define QIB_6120_SerdesCfg0_DisableIBTxIdleDetect_LSB 0x3F
+#define QIB_6120_SerdesCfg0_DisableIBTxIdleDetect_RMASK 0x1
+#define QIB_6120_SerdesCfg0_Reserved_LSB 0x38
+#define QIB_6120_SerdesCfg0_Reserved_RMASK 0x7F
+#define QIB_6120_SerdesCfg0_RxEqCtl_LSB 0x36
+#define QIB_6120_SerdesCfg0_RxEqCtl_RMASK 0x3
+#define QIB_6120_SerdesCfg0_TxTermAdj_LSB 0x34
+#define QIB_6120_SerdesCfg0_TxTermAdj_RMASK 0x3
+#define QIB_6120_SerdesCfg0_RxTermAdj_LSB 0x32
+#define QIB_6120_SerdesCfg0_RxTermAdj_RMASK 0x3
+#define QIB_6120_SerdesCfg0_TermAdj1_LSB 0x31
+#define QIB_6120_SerdesCfg0_TermAdj1_RMASK 0x1
+#define QIB_6120_SerdesCfg0_TermAdj0_LSB 0x30
+#define QIB_6120_SerdesCfg0_TermAdj0_RMASK 0x1
+#define QIB_6120_SerdesCfg0_LPBKA_LSB 0x2F
+#define QIB_6120_SerdesCfg0_LPBKA_RMASK 0x1
+#define QIB_6120_SerdesCfg0_LPBKB_LSB 0x2E
+#define QIB_6120_SerdesCfg0_LPBKB_RMASK 0x1
+#define QIB_6120_SerdesCfg0_LPBKC_LSB 0x2D
+#define QIB_6120_SerdesCfg0_LPBKC_RMASK 0x1
+#define QIB_6120_SerdesCfg0_LPBKD_LSB 0x2C
+#define QIB_6120_SerdesCfg0_LPBKD_RMASK 0x1
+#define QIB_6120_SerdesCfg0_PW_LSB 0x2B
+#define QIB_6120_SerdesCfg0_PW_RMASK 0x1
+#define QIB_6120_SerdesCfg0_RefSel_LSB 0x29
+#define QIB_6120_SerdesCfg0_RefSel_RMASK 0x3
+#define QIB_6120_SerdesCfg0_ParReset_LSB 0x28
+#define QIB_6120_SerdesCfg0_ParReset_RMASK 0x1
+#define QIB_6120_SerdesCfg0_ParLPBK_LSB 0x27
+#define QIB_6120_SerdesCfg0_ParLPBK_RMASK 0x1
+#define QIB_6120_SerdesCfg0_OffsetEn_LSB 0x26
+#define QIB_6120_SerdesCfg0_OffsetEn_RMASK 0x1
+#define QIB_6120_SerdesCfg0_Offset_LSB 0x1E
+#define QIB_6120_SerdesCfg0_Offset_RMASK 0xFF
+#define QIB_6120_SerdesCfg0_L2PwrDn_LSB 0x1D
+#define QIB_6120_SerdesCfg0_L2PwrDn_RMASK 0x1
+#define QIB_6120_SerdesCfg0_ResetPLL_LSB 0x1C
+#define QIB_6120_SerdesCfg0_ResetPLL_RMASK 0x1
+#define QIB_6120_SerdesCfg0_RxTermEnX_LSB 0x18
+#define QIB_6120_SerdesCfg0_RxTermEnX_RMASK 0xF
+#define QIB_6120_SerdesCfg0_BeaconTxEnX_LSB 0x14
+#define QIB_6120_SerdesCfg0_BeaconTxEnX_RMASK 0xF
+#define QIB_6120_SerdesCfg0_RxDetEnX_LSB 0x10
+#define QIB_6120_SerdesCfg0_RxDetEnX_RMASK 0xF
+#define QIB_6120_SerdesCfg0_TxIdeEnX_LSB 0xC
+#define QIB_6120_SerdesCfg0_TxIdeEnX_RMASK 0xF
+#define QIB_6120_SerdesCfg0_RxIdleEnX_LSB 0x8
+#define QIB_6120_SerdesCfg0_RxIdleEnX_RMASK 0xF
+#define QIB_6120_SerdesCfg0_L1PwrDnA_LSB 0x7
+#define QIB_6120_SerdesCfg0_L1PwrDnA_RMASK 0x1
+#define QIB_6120_SerdesCfg0_L1PwrDnB_LSB 0x6
+#define QIB_6120_SerdesCfg0_L1PwrDnB_RMASK 0x1
+#define QIB_6120_SerdesCfg0_L1PwrDnC_LSB 0x5
+#define QIB_6120_SerdesCfg0_L1PwrDnC_RMASK 0x1
+#define QIB_6120_SerdesCfg0_L1PwrDnD_LSB 0x4
+#define QIB_6120_SerdesCfg0_L1PwrDnD_RMASK 0x1
+#define QIB_6120_SerdesCfg0_ResetA_LSB 0x3
+#define QIB_6120_SerdesCfg0_ResetA_RMASK 0x1
+#define QIB_6120_SerdesCfg0_ResetB_LSB 0x2
+#define QIB_6120_SerdesCfg0_ResetB_RMASK 0x1
+#define QIB_6120_SerdesCfg0_ResetC_LSB 0x1
+#define QIB_6120_SerdesCfg0_ResetC_RMASK 0x1
+#define QIB_6120_SerdesCfg0_ResetD_LSB 0x0
+#define QIB_6120_SerdesCfg0_ResetD_RMASK 0x1
+
+#define QIB_6120_SerdesStat_OFFS 0x3D0
+#define QIB_6120_SerdesStat_Reserved_LSB 0xC
+#define QIB_6120_SerdesStat_Reserved_RMASK 0xFFFFFFFFFFFFF
+#define QIB_6120_SerdesStat_BeaconDetA_LSB 0xB
+#define QIB_6120_SerdesStat_BeaconDetA_RMASK 0x1
+#define QIB_6120_SerdesStat_BeaconDetB_LSB 0xA
+#define QIB_6120_SerdesStat_BeaconDetB_RMASK 0x1
+#define QIB_6120_SerdesStat_BeaconDetC_LSB 0x9
+#define QIB_6120_SerdesStat_BeaconDetC_RMASK 0x1
+#define QIB_6120_SerdesStat_BeaconDetD_LSB 0x8
+#define QIB_6120_SerdesStat_BeaconDetD_RMASK 0x1
+#define QIB_6120_SerdesStat_RxDetA_LSB 0x7
+#define QIB_6120_SerdesStat_RxDetA_RMASK 0x1
+#define QIB_6120_SerdesStat_RxDetB_LSB 0x6
+#define QIB_6120_SerdesStat_RxDetB_RMASK 0x1
+#define QIB_6120_SerdesStat_RxDetC_LSB 0x5
+#define QIB_6120_SerdesStat_RxDetC_RMASK 0x1
+#define QIB_6120_SerdesStat_RxDetD_LSB 0x4
+#define QIB_6120_SerdesStat_RxDetD_RMASK 0x1
+#define QIB_6120_SerdesStat_TxIdleDetA_LSB 0x3
+#define QIB_6120_SerdesStat_TxIdleDetA_RMASK 0x1
+#define QIB_6120_SerdesStat_TxIdleDetB_LSB 0x2
+#define QIB_6120_SerdesStat_TxIdleDetB_RMASK 0x1
+#define QIB_6120_SerdesStat_TxIdleDetC_LSB 0x1
+#define QIB_6120_SerdesStat_TxIdleDetC_RMASK 0x1
+#define QIB_6120_SerdesStat_TxIdleDetD_LSB 0x0
+#define QIB_6120_SerdesStat_TxIdleDetD_RMASK 0x1
+
+#define QIB_6120_XGXSCfg_OFFS 0x3D8
+#define QIB_6120_XGXSCfg_ArmLaunchErrorDisable_LSB 0x3F
+#define QIB_6120_XGXSCfg_ArmLaunchErrorDisable_RMASK 0x1
+#define QIB_6120_XGXSCfg_Reserved_LSB 0x17
+#define QIB_6120_XGXSCfg_Reserved_RMASK 0xFFFFFFFFFF
+#define QIB_6120_XGXSCfg_polarity_inv_LSB 0x13
+#define QIB_6120_XGXSCfg_polarity_inv_RMASK 0xF
+#define QIB_6120_XGXSCfg_link_sync_mask_LSB 0x9
+#define QIB_6120_XGXSCfg_link_sync_mask_RMASK 0x3FF
+#define QIB_6120_XGXSCfg_port_addr_LSB 0x4
+#define QIB_6120_XGXSCfg_port_addr_RMASK 0x1F
+#define QIB_6120_XGXSCfg_mdd_30_LSB 0x3
+#define QIB_6120_XGXSCfg_mdd_30_RMASK 0x1
+#define QIB_6120_XGXSCfg_xcv_resetn_LSB 0x2
+#define QIB_6120_XGXSCfg_xcv_resetn_RMASK 0x1
+#define QIB_6120_XGXSCfg_Reserved1_LSB 0x1
+#define QIB_6120_XGXSCfg_Reserved1_RMASK 0x1
+#define QIB_6120_XGXSCfg_tx_rx_resetn_LSB 0x0
+#define QIB_6120_XGXSCfg_tx_rx_resetn_RMASK 0x1
+
+#define QIB_6120_LBIntCnt_OFFS 0x12000
+
+#define QIB_6120_LBFlowStallCnt_OFFS 0x12008
+
+#define QIB_6120_TxUnsupVLErrCnt_OFFS 0x12018
+
+#define QIB_6120_TxDataPktCnt_OFFS 0x12020
+
+#define QIB_6120_TxFlowPktCnt_OFFS 0x12028
+
+#define QIB_6120_TxDwordCnt_OFFS 0x12030
+
+#define QIB_6120_TxLenErrCnt_OFFS 0x12038
+
+#define QIB_6120_TxMaxMinLenErrCnt_OFFS 0x12040
+
+#define QIB_6120_TxUnderrunCnt_OFFS 0x12048
+
+#define QIB_6120_TxFlowStallCnt_OFFS 0x12050
+
+#define QIB_6120_TxDroppedPktCnt_OFFS 0x12058
+
+#define QIB_6120_RxDroppedPktCnt_OFFS 0x12060
+
+#define QIB_6120_RxDataPktCnt_OFFS 0x12068
+
+#define QIB_6120_RxFlowPktCnt_OFFS 0x12070
+
+#define QIB_6120_RxDwordCnt_OFFS 0x12078
+
+#define QIB_6120_RxLenErrCnt_OFFS 0x12080
+
+#define QIB_6120_RxMaxMinLenErrCnt_OFFS 0x12088
+
+#define QIB_6120_RxICRCErrCnt_OFFS 0x12090
+
+#define QIB_6120_RxVCRCErrCnt_OFFS 0x12098
+
+#define QIB_6120_RxFlowCtrlErrCnt_OFFS 0x120A0
+
+#define QIB_6120_RxBadFormatCnt_OFFS 0x120A8
+
+#define QIB_6120_RxLinkProblemCnt_OFFS 0x120B0
+
+#define QIB_6120_RxEBPCnt_OFFS 0x120B8
+
+#define QIB_6120_RxLPCRCErrCnt_OFFS 0x120C0
+
+#define QIB_6120_RxBufOvflCnt_OFFS 0x120C8
+
+#define QIB_6120_RxTIDFullErrCnt_OFFS 0x120D0
+
+#define QIB_6120_RxTIDValidErrCnt_OFFS 0x120D8
+
+#define QIB_6120_RxPKeyMismatchCnt_OFFS 0x120E0
+
+#define QIB_6120_RxP0HdrEgrOvflCnt_OFFS 0x120E8
+
+#define QIB_6120_IBStatusChangeCnt_OFFS 0x12140
+
+#define QIB_6120_IBLinkErrRecoveryCnt_OFFS 0x12148
+
+#define QIB_6120_IBLinkDownedCnt_OFFS 0x12150
+
+#define QIB_6120_IBSymbolErrCnt_OFFS 0x12158
+
+#define QIB_6120_PcieRetryBufDiagQwordCnt_OFFS 0x12170
+
+#define QIB_6120_RcvEgrArray0_OFFS 0x14000
+
+#define QIB_6120_RcvTIDArray0_OFFS 0x54000
+
+#define QIB_6120_PIOLaunchFIFO_OFFS 0x64000
+
+#define QIB_6120_SendPIOpbcCache_OFFS 0x64800
+
+#define QIB_6120_RcvBuf1_OFFS 0x72000
+
+#define QIB_6120_RcvBuf2_OFFS 0x75000
+
+#define QIB_6120_RcvFlags_OFFS 0x77000
+
+#define QIB_6120_RcvLookupBuf1_OFFS 0x79000
+
+#define QIB_6120_RcvDMABuf_OFFS 0x7B000
+
+#define QIB_6120_MiscRXEIntMem_OFFS 0x7C000
+
+#define QIB_6120_PCIERcvBuf_OFFS 0x80000
+
+#define QIB_6120_PCIERetryBuf_OFFS 0x82000
+
+#define QIB_6120_PCIERcvBufRdToWrAddr_OFFS 0x84000
+
+#define QIB_6120_PIOBuf0_MA_OFFS 0x100000
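
The register definitions above (and the 7220 set further below) follow one convention: for every field, <Reg>_<Field>_LSB gives the bit position of the field's least significant bit and <Reg>_<Field>_RMASK gives the right-justified mask of the field's width. As a minimal, self-contained sketch of how such a pair can be combined to read or update a field, consider the following; the field_get/field_put helpers and the sample value are hypothetical illustrations, not part of the driver.

/*
 * Illustrative sketch only: decode and update a field using the generated
 * _LSB/_RMASK pairs.  The helpers and the sample register value are
 * hypothetical.
 */
#include <stdint.h>
#include <stdio.h>

#define QIB_6120_IBCStatus_LinkState_LSB 0x4
#define QIB_6120_IBCStatus_LinkState_RMASK 0x7

/* Extract a right-justified field value from a raw 64-bit register. */
static uint64_t field_get(uint64_t regval, unsigned int lsb, uint64_t rmask)
{
	return (regval >> lsb) & rmask;
}

/* Replace a field, leaving the other bits of the register untouched. */
static uint64_t field_put(uint64_t regval, unsigned int lsb, uint64_t rmask,
			  uint64_t val)
{
	regval &= ~(rmask << lsb);
	return regval | ((val & rmask) << lsb);
}

int main(void)
{
	uint64_t ibcstatus = 0x35;	/* hypothetical raw register value */
	uint64_t state = field_get(ibcstatus,
				   QIB_6120_IBCStatus_LinkState_LSB,
				   QIB_6120_IBCStatus_LinkState_RMASK);

	printf("LinkState = %llu\n", (unsigned long long)state);
	ibcstatus = field_put(ibcstatus, QIB_6120_IBCStatus_LinkState_LSB,
			      QIB_6120_IBCStatus_LinkState_RMASK, 0x2);
	printf("updated IBCStatus = 0x%llx\n", (unsigned long long)ibcstatus);
	return 0;
}
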
diff --git a/drivers/infiniband/hw/qib/qib_7220.h b/drivers/infiniband/hw/qib/qib_7220.h
new file mode 100644
index 000000000..a5356cb42
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_7220.h
@@ -0,0 +1,149 @@
+#ifndef _QIB_7220_H
+#define _QIB_7220_H
+/*
+ * Copyright (c) 2007, 2009, 2010 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Register definitions, auto-generated from the chip RTL. */
+#include "qib_7220_regs.h"
+
+/* The number of eager receive TIDs for context zero. */
+#define IBA7220_KRCVEGRCNT 2048U
+
+#define IB_7220_LT_STATE_CFGRCVFCFG 0x09
+#define IB_7220_LT_STATE_CFGWAITRMT 0x0a
+#define IB_7220_LT_STATE_TXREVLANES 0x0d
+#define IB_7220_LT_STATE_CFGENH 0x10
+
+struct qib_chip_specific {
+ u64 __iomem *cregbase;
+ u64 *cntrs;
+ u64 *portcntrs;
+ spinlock_t sdepb_lock; /* serdes EPB bus */
+ spinlock_t rcvmod_lock; /* protect rcvctrl shadow changes */
+ spinlock_t gpio_lock; /* RMW of shadows/regs for ExtCtrl and GPIO */
+ u64 hwerrmask;
+ u64 errormask;
+ u64 gpio_out; /* shadow of kr_gpio_out, for rmw ops */
+ u64 gpio_mask; /* shadow the gpio mask register */
+ u64 extctrl; /* shadow the gpio output enable, etc... */
+ u32 ncntrs;
+ u32 nportcntrs;
+ u32 cntrnamelen;
+ u32 portcntrnamelen;
+ u32 numctxts;
+ u32 rcvegrcnt;
+ u32 autoneg_tries;
+ u32 serdes_first_init_done;
+ u32 sdmabufcnt;
+ u32 lastbuf_for_pio;
+ u32 updthresh; /* current AvailUpdThld */
+ u32 updthresh_dflt; /* default AvailUpdThld */
+ int irq;
+ u8 presets_needed;
+ u8 relock_timer_active;
+ char emsgbuf[128];
+ char sdmamsgbuf[192];
+ char bitsmsgbuf[64];
+ struct timer_list relock_timer;
+ unsigned int relock_interval; /* in jiffies */
+};
+
+struct qib_chippport_specific {
+ struct qib_pportdata pportdata;
+ wait_queue_head_t autoneg_wait;
+ struct delayed_work autoneg_work;
+ struct timer_list chase_timer;
+ /*
+ * These 5 fields are used to establish deltas for IB symbol
+ * errors and link recovery errors. They can be reported on
+ * some chips during link negotiation prior to INIT, and with
+ * DDR when faking DDR negotiations with non-IBTA switches.
+ * The chip counters are adjusted at driver unload if there is
+ * a non-zero delta.
+ */
+ u64 ibdeltainprog;
+ u64 ibsymdelta;
+ u64 ibsymsnap;
+ u64 iblnkerrdelta;
+ u64 iblnkerrsnap;
+ u64 ibcctrl; /* kr_ibcctrl shadow */
+ u64 ibcddrctrl; /* kr_ibcddrctrl shadow */
+ unsigned long chase_end;
+ u32 last_delay_mult;
+};
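
The comment above on ibdeltainprog/ibsymsnap/ibsymdelta describes a snapshot-and-delta scheme: counts accumulated while the link is (re)negotiating are remembered so they can be subtracted from the chip counters at driver unload. Purely as a hedged illustration of that bookkeeping (not the driver's code; the counter-read helper and the exact snapshot timing are assumptions), the pattern could look roughly like this:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for reading the chip's IB symbol-error counter. */
static uint64_t hw_symbol_errs = 7;
static uint64_t read_symbol_err_counter(void)
{
	return hw_symbol_errs;
}

struct port_deltas {
	uint64_t ibdeltainprog;	/* non-zero while a delta is being tracked */
	uint64_t ibsymsnap;	/* counter snapshot when tracking started */
	uint64_t ibsymdelta;	/* accumulated count to subtract at unload */
};

/* Called when errors should stop being charged (e.g. negotiation starts). */
static void delta_start(struct port_deltas *p)
{
	p->ibdeltainprog = 1;
	p->ibsymsnap = read_symbol_err_counter();
}

/* Called when the link is stable again; fold the interval into the delta. */
static void delta_end(struct port_deltas *p)
{
	if (p->ibdeltainprog) {
		p->ibsymdelta += read_symbol_err_counter() - p->ibsymsnap;
		p->ibdeltainprog = 0;
	}
}

int main(void)
{
	struct port_deltas p = { 0, 0, 0 };

	delta_start(&p);
	hw_symbol_errs += 5;	/* errors seen during negotiation */
	delta_end(&p);

	/* At unload, the chip counter would be reduced by this amount. */
	printf("symbol-error delta: %llu\n", (unsigned long long)p.ibsymdelta);
	return 0;
}
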
+
+/*
+ * This header file provides the declarations and common definitions
+ * for (mostly) manipulation of the SerDes blocks within the IBA7220.
+ * The functions declared should only be called from within other
+ * 7220-related files such as qib_iba7220.c or qib_sd7220.c.
+ */
+int qib_sd7220_presets(struct qib_devdata *dd);
+int qib_sd7220_init(struct qib_devdata *dd);
+void qib_sd7220_clr_ibpar(struct qib_devdata *);
+/*
+ * The value below is used for the sdnum parameter, selecting one of the
+ * two SerDes sections used for PCIe, or the single SerDes used for IB,
+ * which is the only one currently used.
+ */
+#define IB_7220_SERDES 2
+
+static inline u32 qib_read_kreg32(const struct qib_devdata *dd,
+ const u16 regno)
+{
+ if (!dd->kregbase || !(dd->flags & QIB_PRESENT))
+ return -1;
+ return readl((u32 __iomem *)&dd->kregbase[regno]);
+}
+
+static inline u64 qib_read_kreg64(const struct qib_devdata *dd,
+ const u16 regno)
+{
+ if (!dd->kregbase || !(dd->flags & QIB_PRESENT))
+ return -1;
+
+ return readq(&dd->kregbase[regno]);
+}
+
+static inline void qib_write_kreg(const struct qib_devdata *dd,
+ const u16 regno, u64 value)
+{
+ if (dd->kregbase)
+ writeq(value, &dd->kregbase[regno]);
+}
+
+void set_7220_relock_poll(struct qib_devdata *, int);
+void shutdown_7220_relock_poll(struct qib_devdata *);
+void toggle_7220_rclkrls(struct qib_devdata *);
+
+
+#endif /* _QIB_7220_H */
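
The inline accessors above index dd->kregbase as an array of 64-bit registers (readq of &kregbase[regno]), so a byte offset from the generated *_regs.h files corresponds to a regno of OFFS / sizeof(u64). Below is a small self-contained sketch of that conversion combined with a field decode; the KREG_IDX name, the mock register array, and the sample value are illustrative assumptions, not driver code.

#include <stdint.h>
#include <stdio.h>

#define QIB_7220_IntStatus_OFFS 0x70
#define QIB_7220_IntStatus_RcvAvail0_LSB 0x0
#define QIB_7220_IntStatus_RcvAvail0_RMASK 0x1

/* Hypothetical helper: convert a byte offset into a 64-bit register index. */
#define KREG_IDX(offs)	((offs) / sizeof(uint64_t))

int main(void)
{
	/* Mock of the chip's 64-bit kernel register space (kregbase). */
	uint64_t kregbase[0x100] = { 0 };

	/* Pretend the chip has latched "receive available" for context 0. */
	kregbase[KREG_IDX(QIB_7220_IntStatus_OFFS)] |=
		(uint64_t)1 << QIB_7220_IntStatus_RcvAvail0_LSB;

	uint64_t intstatus = kregbase[KREG_IDX(QIB_7220_IntStatus_OFFS)];
	uint64_t rcvavail0 = (intstatus >> QIB_7220_IntStatus_RcvAvail0_LSB) &
			     QIB_7220_IntStatus_RcvAvail0_RMASK;

	printf("RcvAvail0 = %llu\n", (unsigned long long)rcvavail0);
	return 0;
}
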
diff --git a/drivers/infiniband/hw/qib/qib_7220_regs.h b/drivers/infiniband/hw/qib/qib_7220_regs.h
new file mode 100644
index 000000000..0da5bb750
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_7220_regs.h
@@ -0,0 +1,1496 @@
+/*
+ * Copyright (c) 2008, 2009, 2010 QLogic Corporation. All rights reserved.
+ *
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+/* This file is mechanically generated from RTL. Any hand-edits will be lost! */
+
+#define QIB_7220_Revision_OFFS 0x0
+#define QIB_7220_Revision_R_Simulator_LSB 0x3F
+#define QIB_7220_Revision_R_Simulator_RMASK 0x1
+#define QIB_7220_Revision_R_Emulation_LSB 0x3E
+#define QIB_7220_Revision_R_Emulation_RMASK 0x1
+#define QIB_7220_Revision_R_Emulation_Revcode_LSB 0x28
+#define QIB_7220_Revision_R_Emulation_Revcode_RMASK 0x3FFFFF
+#define QIB_7220_Revision_BoardID_LSB 0x20
+#define QIB_7220_Revision_BoardID_RMASK 0xFF
+#define QIB_7220_Revision_R_SW_LSB 0x18
+#define QIB_7220_Revision_R_SW_RMASK 0xFF
+#define QIB_7220_Revision_R_Arch_LSB 0x10
+#define QIB_7220_Revision_R_Arch_RMASK 0xFF
+#define QIB_7220_Revision_R_ChipRevMajor_LSB 0x8
+#define QIB_7220_Revision_R_ChipRevMajor_RMASK 0xFF
+#define QIB_7220_Revision_R_ChipRevMinor_LSB 0x0
+#define QIB_7220_Revision_R_ChipRevMinor_RMASK 0xFF
+
+#define QIB_7220_Control_OFFS 0x8
+#define QIB_7220_Control_SyncResetExceptPcieIRAMRST_LSB 0x7
+#define QIB_7220_Control_SyncResetExceptPcieIRAMRST_RMASK 0x1
+#define QIB_7220_Control_PCIECplQDiagEn_LSB 0x6
+#define QIB_7220_Control_PCIECplQDiagEn_RMASK 0x1
+#define QIB_7220_Control_Reserved_LSB 0x5
+#define QIB_7220_Control_Reserved_RMASK 0x1
+#define QIB_7220_Control_TxLatency_LSB 0x4
+#define QIB_7220_Control_TxLatency_RMASK 0x1
+#define QIB_7220_Control_PCIERetryBufDiagEn_LSB 0x3
+#define QIB_7220_Control_PCIERetryBufDiagEn_RMASK 0x1
+#define QIB_7220_Control_LinkEn_LSB 0x2
+#define QIB_7220_Control_LinkEn_RMASK 0x1
+#define QIB_7220_Control_FreezeMode_LSB 0x1
+#define QIB_7220_Control_FreezeMode_RMASK 0x1
+#define QIB_7220_Control_SyncReset_LSB 0x0
+#define QIB_7220_Control_SyncReset_RMASK 0x1
+
+#define QIB_7220_PageAlign_OFFS 0x10
+
+#define QIB_7220_PortCnt_OFFS 0x18
+
+#define QIB_7220_SendRegBase_OFFS 0x30
+
+#define QIB_7220_UserRegBase_OFFS 0x38
+
+#define QIB_7220_CntrRegBase_OFFS 0x40
+
+#define QIB_7220_Scratch_OFFS 0x48
+
+#define QIB_7220_IntMask_OFFS 0x68
+#define QIB_7220_IntMask_SDmaIntMask_LSB 0x3F
+#define QIB_7220_IntMask_SDmaIntMask_RMASK 0x1
+#define QIB_7220_IntMask_SDmaDisabledMasked_LSB 0x3E
+#define QIB_7220_IntMask_SDmaDisabledMasked_RMASK 0x1
+#define QIB_7220_IntMask_Reserved_LSB 0x31
+#define QIB_7220_IntMask_Reserved_RMASK 0x1FFF
+#define QIB_7220_IntMask_RcvUrg16IntMask_LSB 0x30
+#define QIB_7220_IntMask_RcvUrg16IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg15IntMask_LSB 0x2F
+#define QIB_7220_IntMask_RcvUrg15IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg14IntMask_LSB 0x2E
+#define QIB_7220_IntMask_RcvUrg14IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg13IntMask_LSB 0x2D
+#define QIB_7220_IntMask_RcvUrg13IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg12IntMask_LSB 0x2C
+#define QIB_7220_IntMask_RcvUrg12IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg11IntMask_LSB 0x2B
+#define QIB_7220_IntMask_RcvUrg11IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg10IntMask_LSB 0x2A
+#define QIB_7220_IntMask_RcvUrg10IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg9IntMask_LSB 0x29
+#define QIB_7220_IntMask_RcvUrg9IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg8IntMask_LSB 0x28
+#define QIB_7220_IntMask_RcvUrg8IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg7IntMask_LSB 0x27
+#define QIB_7220_IntMask_RcvUrg7IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg6IntMask_LSB 0x26
+#define QIB_7220_IntMask_RcvUrg6IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg5IntMask_LSB 0x25
+#define QIB_7220_IntMask_RcvUrg5IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg4IntMask_LSB 0x24
+#define QIB_7220_IntMask_RcvUrg4IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg3IntMask_LSB 0x23
+#define QIB_7220_IntMask_RcvUrg3IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg2IntMask_LSB 0x22
+#define QIB_7220_IntMask_RcvUrg2IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg1IntMask_LSB 0x21
+#define QIB_7220_IntMask_RcvUrg1IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvUrg0IntMask_LSB 0x20
+#define QIB_7220_IntMask_RcvUrg0IntMask_RMASK 0x1
+#define QIB_7220_IntMask_ErrorIntMask_LSB 0x1F
+#define QIB_7220_IntMask_ErrorIntMask_RMASK 0x1
+#define QIB_7220_IntMask_PioSetIntMask_LSB 0x1E
+#define QIB_7220_IntMask_PioSetIntMask_RMASK 0x1
+#define QIB_7220_IntMask_PioBufAvailIntMask_LSB 0x1D
+#define QIB_7220_IntMask_PioBufAvailIntMask_RMASK 0x1
+#define QIB_7220_IntMask_assertGPIOIntMask_LSB 0x1C
+#define QIB_7220_IntMask_assertGPIOIntMask_RMASK 0x1
+#define QIB_7220_IntMask_IBSerdesTrimDoneIntMask_LSB 0x1B
+#define QIB_7220_IntMask_IBSerdesTrimDoneIntMask_RMASK 0x1
+#define QIB_7220_IntMask_JIntMask_LSB 0x1A
+#define QIB_7220_IntMask_JIntMask_RMASK 0x1
+#define QIB_7220_IntMask_Reserved1_LSB 0x11
+#define QIB_7220_IntMask_Reserved1_RMASK 0x1FF
+#define QIB_7220_IntMask_RcvAvail16IntMask_LSB 0x10
+#define QIB_7220_IntMask_RcvAvail16IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail15IntMask_LSB 0xF
+#define QIB_7220_IntMask_RcvAvail15IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail14IntMask_LSB 0xE
+#define QIB_7220_IntMask_RcvAvail14IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail13IntMask_LSB 0xD
+#define QIB_7220_IntMask_RcvAvail13IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail12IntMask_LSB 0xC
+#define QIB_7220_IntMask_RcvAvail12IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail11IntMask_LSB 0xB
+#define QIB_7220_IntMask_RcvAvail11IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail10IntMask_LSB 0xA
+#define QIB_7220_IntMask_RcvAvail10IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail9IntMask_LSB 0x9
+#define QIB_7220_IntMask_RcvAvail9IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail8IntMask_LSB 0x8
+#define QIB_7220_IntMask_RcvAvail8IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail7IntMask_LSB 0x7
+#define QIB_7220_IntMask_RcvAvail7IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail6IntMask_LSB 0x6
+#define QIB_7220_IntMask_RcvAvail6IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail5IntMask_LSB 0x5
+#define QIB_7220_IntMask_RcvAvail5IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail4IntMask_LSB 0x4
+#define QIB_7220_IntMask_RcvAvail4IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail3IntMask_LSB 0x3
+#define QIB_7220_IntMask_RcvAvail3IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail2IntMask_LSB 0x2
+#define QIB_7220_IntMask_RcvAvail2IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail1IntMask_LSB 0x1
+#define QIB_7220_IntMask_RcvAvail1IntMask_RMASK 0x1
+#define QIB_7220_IntMask_RcvAvail0IntMask_LSB 0x0
+#define QIB_7220_IntMask_RcvAvail0IntMask_RMASK 0x1
+
+#define QIB_7220_IntStatus_OFFS 0x70
+#define QIB_7220_IntStatus_SDmaInt_LSB 0x3F
+#define QIB_7220_IntStatus_SDmaInt_RMASK 0x1
+#define QIB_7220_IntStatus_SDmaDisabled_LSB 0x3E
+#define QIB_7220_IntStatus_SDmaDisabled_RMASK 0x1
+#define QIB_7220_IntStatus_Reserved_LSB 0x31
+#define QIB_7220_IntStatus_Reserved_RMASK 0x1FFF
+#define QIB_7220_IntStatus_RcvUrg16_LSB 0x30
+#define QIB_7220_IntStatus_RcvUrg16_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg15_LSB 0x2F
+#define QIB_7220_IntStatus_RcvUrg15_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg14_LSB 0x2E
+#define QIB_7220_IntStatus_RcvUrg14_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg13_LSB 0x2D
+#define QIB_7220_IntStatus_RcvUrg13_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg12_LSB 0x2C
+#define QIB_7220_IntStatus_RcvUrg12_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg11_LSB 0x2B
+#define QIB_7220_IntStatus_RcvUrg11_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg10_LSB 0x2A
+#define QIB_7220_IntStatus_RcvUrg10_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg9_LSB 0x29
+#define QIB_7220_IntStatus_RcvUrg9_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg8_LSB 0x28
+#define QIB_7220_IntStatus_RcvUrg8_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg7_LSB 0x27
+#define QIB_7220_IntStatus_RcvUrg7_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg6_LSB 0x26
+#define QIB_7220_IntStatus_RcvUrg6_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg5_LSB 0x25
+#define QIB_7220_IntStatus_RcvUrg5_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg4_LSB 0x24
+#define QIB_7220_IntStatus_RcvUrg4_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg3_LSB 0x23
+#define QIB_7220_IntStatus_RcvUrg3_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg2_LSB 0x22
+#define QIB_7220_IntStatus_RcvUrg2_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg1_LSB 0x21
+#define QIB_7220_IntStatus_RcvUrg1_RMASK 0x1
+#define QIB_7220_IntStatus_RcvUrg0_LSB 0x20
+#define QIB_7220_IntStatus_RcvUrg0_RMASK 0x1
+#define QIB_7220_IntStatus_Error_LSB 0x1F
+#define QIB_7220_IntStatus_Error_RMASK 0x1
+#define QIB_7220_IntStatus_PioSent_LSB 0x1E
+#define QIB_7220_IntStatus_PioSent_RMASK 0x1
+#define QIB_7220_IntStatus_PioBufAvail_LSB 0x1D
+#define QIB_7220_IntStatus_PioBufAvail_RMASK 0x1
+#define QIB_7220_IntStatus_assertGPIO_LSB 0x1C
+#define QIB_7220_IntStatus_assertGPIO_RMASK 0x1
+#define QIB_7220_IntStatus_IBSerdesTrimDone_LSB 0x1B
+#define QIB_7220_IntStatus_IBSerdesTrimDone_RMASK 0x1
+#define QIB_7220_IntStatus_JInt_LSB 0x1A
+#define QIB_7220_IntStatus_JInt_RMASK 0x1
+#define QIB_7220_IntStatus_Reserved1_LSB 0x11
+#define QIB_7220_IntStatus_Reserved1_RMASK 0x1FF
+#define QIB_7220_IntStatus_RcvAvail16_LSB 0x10
+#define QIB_7220_IntStatus_RcvAvail16_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail15_LSB 0xF
+#define QIB_7220_IntStatus_RcvAvail15_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail14_LSB 0xE
+#define QIB_7220_IntStatus_RcvAvail14_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail13_LSB 0xD
+#define QIB_7220_IntStatus_RcvAvail13_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail12_LSB 0xC
+#define QIB_7220_IntStatus_RcvAvail12_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail11_LSB 0xB
+#define QIB_7220_IntStatus_RcvAvail11_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail10_LSB 0xA
+#define QIB_7220_IntStatus_RcvAvail10_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail9_LSB 0x9
+#define QIB_7220_IntStatus_RcvAvail9_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail8_LSB 0x8
+#define QIB_7220_IntStatus_RcvAvail8_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail7_LSB 0x7
+#define QIB_7220_IntStatus_RcvAvail7_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail6_LSB 0x6
+#define QIB_7220_IntStatus_RcvAvail6_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail5_LSB 0x5
+#define QIB_7220_IntStatus_RcvAvail5_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail4_LSB 0x4
+#define QIB_7220_IntStatus_RcvAvail4_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail3_LSB 0x3
+#define QIB_7220_IntStatus_RcvAvail3_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail2_LSB 0x2
+#define QIB_7220_IntStatus_RcvAvail2_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail1_LSB 0x1
+#define QIB_7220_IntStatus_RcvAvail1_RMASK 0x1
+#define QIB_7220_IntStatus_RcvAvail0_LSB 0x0
+#define QIB_7220_IntStatus_RcvAvail0_RMASK 0x1
+
+#define QIB_7220_IntClear_OFFS 0x78
+#define QIB_7220_IntClear_SDmaIntClear_LSB 0x3F
+#define QIB_7220_IntClear_SDmaIntClear_RMASK 0x1
+#define QIB_7220_IntClear_SDmaDisabledClear_LSB 0x3E
+#define QIB_7220_IntClear_SDmaDisabledClear_RMASK 0x1
+#define QIB_7220_IntClear_Reserved_LSB 0x31
+#define QIB_7220_IntClear_Reserved_RMASK 0x1FFF
+#define QIB_7220_IntClear_RcvUrg16IntClear_LSB 0x30
+#define QIB_7220_IntClear_RcvUrg16IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg15IntClear_LSB 0x2F
+#define QIB_7220_IntClear_RcvUrg15IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg14IntClear_LSB 0x2E
+#define QIB_7220_IntClear_RcvUrg14IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg13IntClear_LSB 0x2D
+#define QIB_7220_IntClear_RcvUrg13IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg12IntClear_LSB 0x2C
+#define QIB_7220_IntClear_RcvUrg12IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg11IntClear_LSB 0x2B
+#define QIB_7220_IntClear_RcvUrg11IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg10IntClear_LSB 0x2A
+#define QIB_7220_IntClear_RcvUrg10IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg9IntClear_LSB 0x29
+#define QIB_7220_IntClear_RcvUrg9IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg8IntClear_LSB 0x28
+#define QIB_7220_IntClear_RcvUrg8IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg7IntClear_LSB 0x27
+#define QIB_7220_IntClear_RcvUrg7IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg6IntClear_LSB 0x26
+#define QIB_7220_IntClear_RcvUrg6IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg5IntClear_LSB 0x25
+#define QIB_7220_IntClear_RcvUrg5IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg4IntClear_LSB 0x24
+#define QIB_7220_IntClear_RcvUrg4IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg3IntClear_LSB 0x23
+#define QIB_7220_IntClear_RcvUrg3IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg2IntClear_LSB 0x22
+#define QIB_7220_IntClear_RcvUrg2IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg1IntClear_LSB 0x21
+#define QIB_7220_IntClear_RcvUrg1IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvUrg0IntClear_LSB 0x20
+#define QIB_7220_IntClear_RcvUrg0IntClear_RMASK 0x1
+#define QIB_7220_IntClear_ErrorIntClear_LSB 0x1F
+#define QIB_7220_IntClear_ErrorIntClear_RMASK 0x1
+#define QIB_7220_IntClear_PioSetIntClear_LSB 0x1E
+#define QIB_7220_IntClear_PioSetIntClear_RMASK 0x1
+#define QIB_7220_IntClear_PioBufAvailIntClear_LSB 0x1D
+#define QIB_7220_IntClear_PioBufAvailIntClear_RMASK 0x1
+#define QIB_7220_IntClear_assertGPIOIntClear_LSB 0x1C
+#define QIB_7220_IntClear_assertGPIOIntClear_RMASK 0x1
+#define QIB_7220_IntClear_IBSerdesTrimDoneClear_LSB 0x1B
+#define QIB_7220_IntClear_IBSerdesTrimDoneClear_RMASK 0x1
+#define QIB_7220_IntClear_JIntClear_LSB 0x1A
+#define QIB_7220_IntClear_JIntClear_RMASK 0x1
+#define QIB_7220_IntClear_Reserved1_LSB 0x11
+#define QIB_7220_IntClear_Reserved1_RMASK 0x1FF
+#define QIB_7220_IntClear_RcvAvail16IntClear_LSB 0x10
+#define QIB_7220_IntClear_RcvAvail16IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail15IntClear_LSB 0xF
+#define QIB_7220_IntClear_RcvAvail15IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail14IntClear_LSB 0xE
+#define QIB_7220_IntClear_RcvAvail14IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail13IntClear_LSB 0xD
+#define QIB_7220_IntClear_RcvAvail13IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail12IntClear_LSB 0xC
+#define QIB_7220_IntClear_RcvAvail12IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail11IntClear_LSB 0xB
+#define QIB_7220_IntClear_RcvAvail11IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail10IntClear_LSB 0xA
+#define QIB_7220_IntClear_RcvAvail10IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail9IntClear_LSB 0x9
+#define QIB_7220_IntClear_RcvAvail9IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail8IntClear_LSB 0x8
+#define QIB_7220_IntClear_RcvAvail8IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail7IntClear_LSB 0x7
+#define QIB_7220_IntClear_RcvAvail7IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail6IntClear_LSB 0x6
+#define QIB_7220_IntClear_RcvAvail6IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail5IntClear_LSB 0x5
+#define QIB_7220_IntClear_RcvAvail5IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail4IntClear_LSB 0x4
+#define QIB_7220_IntClear_RcvAvail4IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail3IntClear_LSB 0x3
+#define QIB_7220_IntClear_RcvAvail3IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail2IntClear_LSB 0x2
+#define QIB_7220_IntClear_RcvAvail2IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail1IntClear_LSB 0x1
+#define QIB_7220_IntClear_RcvAvail1IntClear_RMASK 0x1
+#define QIB_7220_IntClear_RcvAvail0IntClear_LSB 0x0
+#define QIB_7220_IntClear_RcvAvail0IntClear_RMASK 0x1
+
+#define QIB_7220_ErrMask_OFFS 0x80
+#define QIB_7220_ErrMask_Reserved_LSB 0x36
+#define QIB_7220_ErrMask_Reserved_RMASK 0x3FF
+#define QIB_7220_ErrMask_InvalidEEPCmdMask_LSB 0x35
+#define QIB_7220_ErrMask_InvalidEEPCmdMask_RMASK 0x1
+#define QIB_7220_ErrMask_SDmaDescAddrMisalignErrMask_LSB 0x34
+#define QIB_7220_ErrMask_SDmaDescAddrMisalignErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_HardwareErrMask_LSB 0x33
+#define QIB_7220_ErrMask_HardwareErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_ResetNegatedMask_LSB 0x32
+#define QIB_7220_ErrMask_ResetNegatedMask_RMASK 0x1
+#define QIB_7220_ErrMask_InvalidAddrErrMask_LSB 0x31
+#define QIB_7220_ErrMask_InvalidAddrErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_IBStatusChangedMask_LSB 0x30
+#define QIB_7220_ErrMask_IBStatusChangedMask_RMASK 0x1
+#define QIB_7220_ErrMask_SDmaUnexpDataErrMask_LSB 0x2F
+#define QIB_7220_ErrMask_SDmaUnexpDataErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SDmaMissingDwErrMask_LSB 0x2E
+#define QIB_7220_ErrMask_SDmaMissingDwErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SDmaDwEnErrMask_LSB 0x2D
+#define QIB_7220_ErrMask_SDmaDwEnErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SDmaRpyTagErrMask_LSB 0x2C
+#define QIB_7220_ErrMask_SDmaRpyTagErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SDma1stDescErrMask_LSB 0x2B
+#define QIB_7220_ErrMask_SDma1stDescErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SDmaBaseErrMask_LSB 0x2A
+#define QIB_7220_ErrMask_SDmaBaseErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SDmaTailOutOfBoundErrMask_LSB 0x29
+#define QIB_7220_ErrMask_SDmaTailOutOfBoundErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SDmaOutOfBoundErrMask_LSB 0x28
+#define QIB_7220_ErrMask_SDmaOutOfBoundErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SDmaGenMismatchErrMask_LSB 0x27
+#define QIB_7220_ErrMask_SDmaGenMismatchErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SendBufMisuseErrMask_LSB 0x26
+#define QIB_7220_ErrMask_SendBufMisuseErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SendUnsupportedVLErrMask_LSB 0x25
+#define QIB_7220_ErrMask_SendUnsupportedVLErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SendUnexpectedPktNumErrMask_LSB 0x24
+#define QIB_7220_ErrMask_SendUnexpectedPktNumErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SendPioArmLaunchErrMask_LSB 0x23
+#define QIB_7220_ErrMask_SendPioArmLaunchErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SendDroppedDataPktErrMask_LSB 0x22
+#define QIB_7220_ErrMask_SendDroppedDataPktErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SendDroppedSmpPktErrMask_LSB 0x21
+#define QIB_7220_ErrMask_SendDroppedSmpPktErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SendPktLenErrMask_LSB 0x20
+#define QIB_7220_ErrMask_SendPktLenErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SendUnderRunErrMask_LSB 0x1F
+#define QIB_7220_ErrMask_SendUnderRunErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SendMaxPktLenErrMask_LSB 0x1E
+#define QIB_7220_ErrMask_SendMaxPktLenErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SendMinPktLenErrMask_LSB 0x1D
+#define QIB_7220_ErrMask_SendMinPktLenErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SDmaDisabledErrMask_LSB 0x1C
+#define QIB_7220_ErrMask_SDmaDisabledErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_SendSpecialTriggerErrMask_LSB 0x1B
+#define QIB_7220_ErrMask_SendSpecialTriggerErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_Reserved1_LSB 0x12
+#define QIB_7220_ErrMask_Reserved1_RMASK 0x1FF
+#define QIB_7220_ErrMask_RcvIBLostLinkErrMask_LSB 0x11
+#define QIB_7220_ErrMask_RcvIBLostLinkErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvHdrErrMask_LSB 0x10
+#define QIB_7220_ErrMask_RcvHdrErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvHdrLenErrMask_LSB 0xF
+#define QIB_7220_ErrMask_RcvHdrLenErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvBadTidErrMask_LSB 0xE
+#define QIB_7220_ErrMask_RcvBadTidErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvHdrFullErrMask_LSB 0xD
+#define QIB_7220_ErrMask_RcvHdrFullErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvEgrFullErrMask_LSB 0xC
+#define QIB_7220_ErrMask_RcvEgrFullErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvBadVersionErrMask_LSB 0xB
+#define QIB_7220_ErrMask_RcvBadVersionErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvIBFlowErrMask_LSB 0xA
+#define QIB_7220_ErrMask_RcvIBFlowErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvEBPErrMask_LSB 0x9
+#define QIB_7220_ErrMask_RcvEBPErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvUnsupportedVLErrMask_LSB 0x8
+#define QIB_7220_ErrMask_RcvUnsupportedVLErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvUnexpectedCharErrMask_LSB 0x7
+#define QIB_7220_ErrMask_RcvUnexpectedCharErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvShortPktLenErrMask_LSB 0x6
+#define QIB_7220_ErrMask_RcvShortPktLenErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvLongPktLenErrMask_LSB 0x5
+#define QIB_7220_ErrMask_RcvLongPktLenErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvMaxPktLenErrMask_LSB 0x4
+#define QIB_7220_ErrMask_RcvMaxPktLenErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvMinPktLenErrMask_LSB 0x3
+#define QIB_7220_ErrMask_RcvMinPktLenErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvICRCErrMask_LSB 0x2
+#define QIB_7220_ErrMask_RcvICRCErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvVCRCErrMask_LSB 0x1
+#define QIB_7220_ErrMask_RcvVCRCErrMask_RMASK 0x1
+#define QIB_7220_ErrMask_RcvFormatErrMask_LSB 0x0
+#define QIB_7220_ErrMask_RcvFormatErrMask_RMASK 0x1
+
+#define QIB_7220_ErrStatus_OFFS 0x88
+#define QIB_7220_ErrStatus_Reserved_LSB 0x36
+#define QIB_7220_ErrStatus_Reserved_RMASK 0x3FF
+#define QIB_7220_ErrStatus_InvalidEEPCmdErr_LSB 0x35
+#define QIB_7220_ErrStatus_InvalidEEPCmdErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SDmaDescAddrMisalignErr_LSB 0x34
+#define QIB_7220_ErrStatus_SDmaDescAddrMisalignErr_RMASK 0x1
+#define QIB_7220_ErrStatus_HardwareErr_LSB 0x33
+#define QIB_7220_ErrStatus_HardwareErr_RMASK 0x1
+#define QIB_7220_ErrStatus_ResetNegated_LSB 0x32
+#define QIB_7220_ErrStatus_ResetNegated_RMASK 0x1
+#define QIB_7220_ErrStatus_InvalidAddrErr_LSB 0x31
+#define QIB_7220_ErrStatus_InvalidAddrErr_RMASK 0x1
+#define QIB_7220_ErrStatus_IBStatusChanged_LSB 0x30
+#define QIB_7220_ErrStatus_IBStatusChanged_RMASK 0x1
+#define QIB_7220_ErrStatus_SDmaUnexpDataErr_LSB 0x2F
+#define QIB_7220_ErrStatus_SDmaUnexpDataErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SDmaMissingDwErr_LSB 0x2E
+#define QIB_7220_ErrStatus_SDmaMissingDwErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SDmaDwEnErr_LSB 0x2D
+#define QIB_7220_ErrStatus_SDmaDwEnErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SDmaRpyTagErr_LSB 0x2C
+#define QIB_7220_ErrStatus_SDmaRpyTagErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SDma1stDescErr_LSB 0x2B
+#define QIB_7220_ErrStatus_SDma1stDescErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SDmaBaseErr_LSB 0x2A
+#define QIB_7220_ErrStatus_SDmaBaseErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SDmaTailOutOfBoundErr_LSB 0x29
+#define QIB_7220_ErrStatus_SDmaTailOutOfBoundErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SDmaOutOfBoundErr_LSB 0x28
+#define QIB_7220_ErrStatus_SDmaOutOfBoundErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SDmaGenMismatchErr_LSB 0x27
+#define QIB_7220_ErrStatus_SDmaGenMismatchErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SendBufMisuseErr_LSB 0x26
+#define QIB_7220_ErrStatus_SendBufMisuseErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SendUnsupportedVLErr_LSB 0x25
+#define QIB_7220_ErrStatus_SendUnsupportedVLErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SendUnexpectedPktNumErr_LSB 0x24
+#define QIB_7220_ErrStatus_SendUnexpectedPktNumErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SendPioArmLaunchErr_LSB 0x23
+#define QIB_7220_ErrStatus_SendPioArmLaunchErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SendDroppedDataPktErr_LSB 0x22
+#define QIB_7220_ErrStatus_SendDroppedDataPktErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SendDroppedSmpPktErr_LSB 0x21
+#define QIB_7220_ErrStatus_SendDroppedSmpPktErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SendPktLenErr_LSB 0x20
+#define QIB_7220_ErrStatus_SendPktLenErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SendUnderRunErr_LSB 0x1F
+#define QIB_7220_ErrStatus_SendUnderRunErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SendMaxPktLenErr_LSB 0x1E
+#define QIB_7220_ErrStatus_SendMaxPktLenErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SendMinPktLenErr_LSB 0x1D
+#define QIB_7220_ErrStatus_SendMinPktLenErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SDmaDisabledErr_LSB 0x1C
+#define QIB_7220_ErrStatus_SDmaDisabledErr_RMASK 0x1
+#define QIB_7220_ErrStatus_SendSpecialTriggerErr_LSB 0x1B
+#define QIB_7220_ErrStatus_SendSpecialTriggerErr_RMASK 0x1
+#define QIB_7220_ErrStatus_Reserved1_LSB 0x12
+#define QIB_7220_ErrStatus_Reserved1_RMASK 0x1FF
+#define QIB_7220_ErrStatus_RcvIBLostLinkErr_LSB 0x11
+#define QIB_7220_ErrStatus_RcvIBLostLinkErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvHdrErr_LSB 0x10
+#define QIB_7220_ErrStatus_RcvHdrErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvHdrLenErr_LSB 0xF
+#define QIB_7220_ErrStatus_RcvHdrLenErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvBadTidErr_LSB 0xE
+#define QIB_7220_ErrStatus_RcvBadTidErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvHdrFullErr_LSB 0xD
+#define QIB_7220_ErrStatus_RcvHdrFullErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvEgrFullErr_LSB 0xC
+#define QIB_7220_ErrStatus_RcvEgrFullErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvBadVersionErr_LSB 0xB
+#define QIB_7220_ErrStatus_RcvBadVersionErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvIBFlowErr_LSB 0xA
+#define QIB_7220_ErrStatus_RcvIBFlowErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvEBPErr_LSB 0x9
+#define QIB_7220_ErrStatus_RcvEBPErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvUnsupportedVLErr_LSB 0x8
+#define QIB_7220_ErrStatus_RcvUnsupportedVLErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvUnexpectedCharErr_LSB 0x7
+#define QIB_7220_ErrStatus_RcvUnexpectedCharErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvShortPktLenErr_LSB 0x6
+#define QIB_7220_ErrStatus_RcvShortPktLenErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvLongPktLenErr_LSB 0x5
+#define QIB_7220_ErrStatus_RcvLongPktLenErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvMaxPktLenErr_LSB 0x4
+#define QIB_7220_ErrStatus_RcvMaxPktLenErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvMinPktLenErr_LSB 0x3
+#define QIB_7220_ErrStatus_RcvMinPktLenErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvICRCErr_LSB 0x2
+#define QIB_7220_ErrStatus_RcvICRCErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvVCRCErr_LSB 0x1
+#define QIB_7220_ErrStatus_RcvVCRCErr_RMASK 0x1
+#define QIB_7220_ErrStatus_RcvFormatErr_LSB 0x0
+#define QIB_7220_ErrStatus_RcvFormatErr_RMASK 0x1
+
+#define QIB_7220_ErrClear_OFFS 0x90
+#define QIB_7220_ErrClear_Reserved_LSB 0x36
+#define QIB_7220_ErrClear_Reserved_RMASK 0x3FF
+#define QIB_7220_ErrClear_InvalidEEPCmdErrClear_LSB 0x35
+#define QIB_7220_ErrClear_InvalidEEPCmdErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SDmaDescAddrMisalignErrClear_LSB 0x34
+#define QIB_7220_ErrClear_SDmaDescAddrMisalignErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_HardwareErrClear_LSB 0x33
+#define QIB_7220_ErrClear_HardwareErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_ResetNegatedClear_LSB 0x32
+#define QIB_7220_ErrClear_ResetNegatedClear_RMASK 0x1
+#define QIB_7220_ErrClear_InvalidAddrErrClear_LSB 0x31
+#define QIB_7220_ErrClear_InvalidAddrErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_IBStatusChangedClear_LSB 0x30
+#define QIB_7220_ErrClear_IBStatusChangedClear_RMASK 0x1
+#define QIB_7220_ErrClear_SDmaUnexpDataErrClear_LSB 0x2F
+#define QIB_7220_ErrClear_SDmaUnexpDataErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SDmaMissingDwErrClear_LSB 0x2E
+#define QIB_7220_ErrClear_SDmaMissingDwErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SDmaDwEnErrClear_LSB 0x2D
+#define QIB_7220_ErrClear_SDmaDwEnErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SDmaRpyTagErrClear_LSB 0x2C
+#define QIB_7220_ErrClear_SDmaRpyTagErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SDma1stDescErrClear_LSB 0x2B
+#define QIB_7220_ErrClear_SDma1stDescErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SDmaBaseErrClear_LSB 0x2A
+#define QIB_7220_ErrClear_SDmaBaseErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SDmaTailOutOfBoundErrClear_LSB 0x29
+#define QIB_7220_ErrClear_SDmaTailOutOfBoundErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SDmaOutOfBoundErrClear_LSB 0x28
+#define QIB_7220_ErrClear_SDmaOutOfBoundErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SDmaGenMismatchErrClear_LSB 0x27
+#define QIB_7220_ErrClear_SDmaGenMismatchErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SendBufMisuseErrClear_LSB 0x26
+#define QIB_7220_ErrClear_SendBufMisuseErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SendUnsupportedVLErrClear_LSB 0x25
+#define QIB_7220_ErrClear_SendUnsupportedVLErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SendUnexpectedPktNumErrClear_LSB 0x24
+#define QIB_7220_ErrClear_SendUnexpectedPktNumErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SendPioArmLaunchErrClear_LSB 0x23
+#define QIB_7220_ErrClear_SendPioArmLaunchErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SendDroppedDataPktErrClear_LSB 0x22
+#define QIB_7220_ErrClear_SendDroppedDataPktErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SendDroppedSmpPktErrClear_LSB 0x21
+#define QIB_7220_ErrClear_SendDroppedSmpPktErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SendPktLenErrClear_LSB 0x20
+#define QIB_7220_ErrClear_SendPktLenErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SendUnderRunErrClear_LSB 0x1F
+#define QIB_7220_ErrClear_SendUnderRunErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SendMaxPktLenErrClear_LSB 0x1E
+#define QIB_7220_ErrClear_SendMaxPktLenErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SendMinPktLenErrClear_LSB 0x1D
+#define QIB_7220_ErrClear_SendMinPktLenErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SDmaDisabledErrClear_LSB 0x1C
+#define QIB_7220_ErrClear_SDmaDisabledErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_SendSpecialTriggerErrClear_LSB 0x1B
+#define QIB_7220_ErrClear_SendSpecialTriggerErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_Reserved1_LSB 0x12
+#define QIB_7220_ErrClear_Reserved1_RMASK 0x1FF
+#define QIB_7220_ErrClear_RcvIBLostLinkErrClear_LSB 0x11
+#define QIB_7220_ErrClear_RcvIBLostLinkErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvHdrErrClear_LSB 0x10
+#define QIB_7220_ErrClear_RcvHdrErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvHdrLenErrClear_LSB 0xF
+#define QIB_7220_ErrClear_RcvHdrLenErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvBadTidErrClear_LSB 0xE
+#define QIB_7220_ErrClear_RcvBadTidErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvHdrFullErrClear_LSB 0xD
+#define QIB_7220_ErrClear_RcvHdrFullErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvEgrFullErrClear_LSB 0xC
+#define QIB_7220_ErrClear_RcvEgrFullErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvBadVersionErrClear_LSB 0xB
+#define QIB_7220_ErrClear_RcvBadVersionErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvIBFlowErrClear_LSB 0xA
+#define QIB_7220_ErrClear_RcvIBFlowErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvEBPErrClear_LSB 0x9
+#define QIB_7220_ErrClear_RcvEBPErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvUnsupportedVLErrClear_LSB 0x8
+#define QIB_7220_ErrClear_RcvUnsupportedVLErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvUnexpectedCharErrClear_LSB 0x7
+#define QIB_7220_ErrClear_RcvUnexpectedCharErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvShortPktLenErrClear_LSB 0x6
+#define QIB_7220_ErrClear_RcvShortPktLenErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvLongPktLenErrClear_LSB 0x5
+#define QIB_7220_ErrClear_RcvLongPktLenErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvMaxPktLenErrClear_LSB 0x4
+#define QIB_7220_ErrClear_RcvMaxPktLenErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvMinPktLenErrClear_LSB 0x3
+#define QIB_7220_ErrClear_RcvMinPktLenErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvICRCErrClear_LSB 0x2
+#define QIB_7220_ErrClear_RcvICRCErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvVCRCErrClear_LSB 0x1
+#define QIB_7220_ErrClear_RcvVCRCErrClear_RMASK 0x1
+#define QIB_7220_ErrClear_RcvFormatErrClear_LSB 0x0
+#define QIB_7220_ErrClear_RcvFormatErrClear_RMASK 0x1
+
+#define QIB_7220_HwErrMask_OFFS 0x98
+#define QIB_7220_HwErrMask_IBCBusFromSPCParityErrMask_LSB 0x3F
+#define QIB_7220_HwErrMask_IBCBusFromSPCParityErrMask_RMASK 0x1
+#define QIB_7220_HwErrMask_IBCBusToSPCParityErrMask_LSB 0x3E
+#define QIB_7220_HwErrMask_IBCBusToSPCParityErrMask_RMASK 0x1
+#define QIB_7220_HwErrMask_Clk_uC_PLLNotLockedMask_LSB 0x3D
+#define QIB_7220_HwErrMask_Clk_uC_PLLNotLockedMask_RMASK 0x1
+#define QIB_7220_HwErrMask_IBSerdesPClkNotDetectMask_LSB 0x3C
+#define QIB_7220_HwErrMask_IBSerdesPClkNotDetectMask_RMASK 0x1
+#define QIB_7220_HwErrMask_PCIESerdesQ3PClkNotDetectMask_LSB 0x3B
+#define QIB_7220_HwErrMask_PCIESerdesQ3PClkNotDetectMask_RMASK 0x1
+#define QIB_7220_HwErrMask_PCIESerdesQ2PClkNotDetectMask_LSB 0x3A
+#define QIB_7220_HwErrMask_PCIESerdesQ2PClkNotDetectMask_RMASK 0x1
+#define QIB_7220_HwErrMask_PCIESerdesQ1PClkNotDetectMask_LSB 0x39
+#define QIB_7220_HwErrMask_PCIESerdesQ1PClkNotDetectMask_RMASK 0x1
+#define QIB_7220_HwErrMask_PCIESerdesQ0PClkNotDetectMask_LSB 0x38
+#define QIB_7220_HwErrMask_PCIESerdesQ0PClkNotDetectMask_RMASK 0x1
+#define QIB_7220_HwErrMask_Reserved_LSB 0x37
+#define QIB_7220_HwErrMask_Reserved_RMASK 0x1
+#define QIB_7220_HwErrMask_PowerOnBISTFailedMask_LSB 0x36
+#define QIB_7220_HwErrMask_PowerOnBISTFailedMask_RMASK 0x1
+#define QIB_7220_HwErrMask_Reserved1_LSB 0x33
+#define QIB_7220_HwErrMask_Reserved1_RMASK 0x7
+#define QIB_7220_HwErrMask_RXEMemParityErrMask_LSB 0x2C
+#define QIB_7220_HwErrMask_RXEMemParityErrMask_RMASK 0x7F
+#define QIB_7220_HwErrMask_TXEMemParityErrMask_LSB 0x28
+#define QIB_7220_HwErrMask_TXEMemParityErrMask_RMASK 0xF
+#define QIB_7220_HwErrMask_DDSRXEQMemoryParityErrMask_LSB 0x27
+#define QIB_7220_HwErrMask_DDSRXEQMemoryParityErrMask_RMASK 0x1
+#define QIB_7220_HwErrMask_IB_uC_MemoryParityErrMask_LSB 0x26
+#define QIB_7220_HwErrMask_IB_uC_MemoryParityErrMask_RMASK 0x1
+#define QIB_7220_HwErrMask_PCIEOct1_uC_MemoryParityErrMask_LSB 0x25
+#define QIB_7220_HwErrMask_PCIEOct1_uC_MemoryParityErrMask_RMASK 0x1
+#define QIB_7220_HwErrMask_PCIEOct0_uC_MemoryParityErrMask_LSB 0x24
+#define QIB_7220_HwErrMask_PCIEOct0_uC_MemoryParityErrMask_RMASK 0x1
+#define QIB_7220_HwErrMask_Reserved2_LSB 0x22
+#define QIB_7220_HwErrMask_Reserved2_RMASK 0x3
+#define QIB_7220_HwErrMask_PCIeBusParityErrMask_LSB 0x1F
+#define QIB_7220_HwErrMask_PCIeBusParityErrMask_RMASK 0x7
+#define QIB_7220_HwErrMask_PcieCplTimeoutMask_LSB 0x1E
+#define QIB_7220_HwErrMask_PcieCplTimeoutMask_RMASK 0x1
+#define QIB_7220_HwErrMask_PoisonedTLPMask_LSB 0x1D
+#define QIB_7220_HwErrMask_PoisonedTLPMask_RMASK 0x1
+#define QIB_7220_HwErrMask_SDmaMemReadErrMask_LSB 0x1C
+#define QIB_7220_HwErrMask_SDmaMemReadErrMask_RMASK 0x1
+#define QIB_7220_HwErrMask_Reserved3_LSB 0x8
+#define QIB_7220_HwErrMask_Reserved3_RMASK 0xFFFFF
+#define QIB_7220_HwErrMask_PCIeMemParityErrMask_LSB 0x0
+#define QIB_7220_HwErrMask_PCIeMemParityErrMask_RMASK 0xFF
+
+#define QIB_7220_HwErrStatus_OFFS 0xA0
+#define QIB_7220_HwErrStatus_IBCBusFromSPCParityErr_LSB 0x3F
+#define QIB_7220_HwErrStatus_IBCBusFromSPCParityErr_RMASK 0x1
+#define QIB_7220_HwErrStatus_IBCBusToSPCParityErr_LSB 0x3E
+#define QIB_7220_HwErrStatus_IBCBusToSPCParityErr_RMASK 0x1
+#define QIB_7220_HwErrStatus_Clk_uC_PLLNotLocked_LSB 0x3D
+#define QIB_7220_HwErrStatus_Clk_uC_PLLNotLocked_RMASK 0x1
+#define QIB_7220_HwErrStatus_IBSerdesPClkNotDetect_LSB 0x3C
+#define QIB_7220_HwErrStatus_IBSerdesPClkNotDetect_RMASK 0x1
+#define QIB_7220_HwErrStatus_PCIESerdesQ3PClkNotDetect_LSB 0x3B
+#define QIB_7220_HwErrStatus_PCIESerdesQ3PClkNotDetect_RMASK 0x1
+#define QIB_7220_HwErrStatus_PCIESerdesQ2PClkNotDetect_LSB 0x3A
+#define QIB_7220_HwErrStatus_PCIESerdesQ2PClkNotDetect_RMASK 0x1
+#define QIB_7220_HwErrStatus_PCIESerdesQ1PClkNotDetect_LSB 0x39
+#define QIB_7220_HwErrStatus_PCIESerdesQ1PClkNotDetect_RMASK 0x1
+#define QIB_7220_HwErrStatus_PCIESerdesQ0PClkNotDetect_LSB 0x38
+#define QIB_7220_HwErrStatus_PCIESerdesQ0PClkNotDetect_RMASK 0x1
+#define QIB_7220_HwErrStatus_Reserved_LSB 0x37
+#define QIB_7220_HwErrStatus_Reserved_RMASK 0x1
+#define QIB_7220_HwErrStatus_PowerOnBISTFailed_LSB 0x36
+#define QIB_7220_HwErrStatus_PowerOnBISTFailed_RMASK 0x1
+#define QIB_7220_HwErrStatus_Reserved1_LSB 0x33
+#define QIB_7220_HwErrStatus_Reserved1_RMASK 0x7
+#define QIB_7220_HwErrStatus_RXEMemParity_LSB 0x2C
+#define QIB_7220_HwErrStatus_RXEMemParity_RMASK 0x7F
+#define QIB_7220_HwErrStatus_TXEMemParity_LSB 0x28
+#define QIB_7220_HwErrStatus_TXEMemParity_RMASK 0xF
+#define QIB_7220_HwErrStatus_DDSRXEQMemoryParityErr_LSB 0x27
+#define QIB_7220_HwErrStatus_DDSRXEQMemoryParityErr_RMASK 0x1
+#define QIB_7220_HwErrStatus_IB_uC_MemoryParityErr_LSB 0x26
+#define QIB_7220_HwErrStatus_IB_uC_MemoryParityErr_RMASK 0x1
+#define QIB_7220_HwErrStatus_PCIE_uC_Oct1MemoryParityErr_LSB 0x25
+#define QIB_7220_HwErrStatus_PCIE_uC_Oct1MemoryParityErr_RMASK 0x1
+#define QIB_7220_HwErrStatus_PCIE_uC_Oct0MemoryParityErr_LSB 0x24
+#define QIB_7220_HwErrStatus_PCIE_uC_Oct0MemoryParityErr_RMASK 0x1
+#define QIB_7220_HwErrStatus_Reserved2_LSB 0x22
+#define QIB_7220_HwErrStatus_Reserved2_RMASK 0x3
+#define QIB_7220_HwErrStatus_PCIeBusParity_LSB 0x1F
+#define QIB_7220_HwErrStatus_PCIeBusParity_RMASK 0x7
+#define QIB_7220_HwErrStatus_PcieCplTimeout_LSB 0x1E
+#define QIB_7220_HwErrStatus_PcieCplTimeout_RMASK 0x1
+#define QIB_7220_HwErrStatus_PoisenedTLP_LSB 0x1D
+#define QIB_7220_HwErrStatus_PoisenedTLP_RMASK 0x1
+#define QIB_7220_HwErrStatus_SDmaMemReadErr_LSB 0x1C
+#define QIB_7220_HwErrStatus_SDmaMemReadErr_RMASK 0x1
+#define QIB_7220_HwErrStatus_Reserved3_LSB 0x8
+#define QIB_7220_HwErrStatus_Reserved3_RMASK 0xFFFFF
+#define QIB_7220_HwErrStatus_PCIeMemParity_LSB 0x0
+#define QIB_7220_HwErrStatus_PCIeMemParity_RMASK 0xFF
+
+#define QIB_7220_HwErrClear_OFFS 0xA8
+#define QIB_7220_HwErrClear_IBCBusFromSPCParityErrClear_LSB 0x3F
+#define QIB_7220_HwErrClear_IBCBusFromSPCParityErrClear_RMASK 0x1
+#define QIB_7220_HwErrClear_IBCBusToSPCparityErrClear_LSB 0x3E
+#define QIB_7220_HwErrClear_IBCBusToSPCparityErrClear_RMASK 0x1
+#define QIB_7220_HwErrClear_Clk_uC_PLLNotLockedClear_LSB 0x3D
+#define QIB_7220_HwErrClear_Clk_uC_PLLNotLockedClear_RMASK 0x1
+#define QIB_7220_HwErrClear_IBSerdesPClkNotDetectClear_LSB 0x3C
+#define QIB_7220_HwErrClear_IBSerdesPClkNotDetectClear_RMASK 0x1
+#define QIB_7220_HwErrClear_PCIESerdesQ3PClkNotDetectClear_LSB 0x3B
+#define QIB_7220_HwErrClear_PCIESerdesQ3PClkNotDetectClear_RMASK 0x1
+#define QIB_7220_HwErrClear_PCIESerdesQ2PClkNotDetectClear_LSB 0x3A
+#define QIB_7220_HwErrClear_PCIESerdesQ2PClkNotDetectClear_RMASK 0x1
+#define QIB_7220_HwErrClear_PCIESerdesQ1PClkNotDetectClear_LSB 0x39
+#define QIB_7220_HwErrClear_PCIESerdesQ1PClkNotDetectClear_RMASK 0x1
+#define QIB_7220_HwErrClear_PCIESerdesQ0PClkNotDetectClear_LSB 0x38
+#define QIB_7220_HwErrClear_PCIESerdesQ0PClkNotDetectClear_RMASK 0x1
+#define QIB_7220_HwErrClear_Reserved_LSB 0x37
+#define QIB_7220_HwErrClear_Reserved_RMASK 0x1
+#define QIB_7220_HwErrClear_PowerOnBISTFailedClear_LSB 0x36
+#define QIB_7220_HwErrClear_PowerOnBISTFailedClear_RMASK 0x1
+#define QIB_7220_HwErrClear_Reserved1_LSB 0x33
+#define QIB_7220_HwErrClear_Reserved1_RMASK 0x7
+#define QIB_7220_HwErrClear_RXEMemParityClear_LSB 0x2C
+#define QIB_7220_HwErrClear_RXEMemParityClear_RMASK 0x7F
+#define QIB_7220_HwErrClear_TXEMemParityClear_LSB 0x28
+#define QIB_7220_HwErrClear_TXEMemParityClear_RMASK 0xF
+#define QIB_7220_HwErrClear_DDSRXEQMemoryParityErrClear_LSB 0x27
+#define QIB_7220_HwErrClear_DDSRXEQMemoryParityErrClear_RMASK 0x1
+#define QIB_7220_HwErrClear_IB_uC_MemoryParityErrClear_LSB 0x26
+#define QIB_7220_HwErrClear_IB_uC_MemoryParityErrClear_RMASK 0x1
+#define QIB_7220_HwErrClear_PCIE_uC_Oct1MemoryParityErrClear_LSB 0x25
+#define QIB_7220_HwErrClear_PCIE_uC_Oct1MemoryParityErrClear_RMASK 0x1
+#define QIB_7220_HwErrClear_PCIE_uC_Oct0MemoryParityErrClear_LSB 0x24
+#define QIB_7220_HwErrClear_PCIE_uC_Oct0MemoryParityErrClear_RMASK 0x1
+#define QIB_7220_HwErrClear_Reserved2_LSB 0x22
+#define QIB_7220_HwErrClear_Reserved2_RMASK 0x3
+#define QIB_7220_HwErrClear_PCIeBusParityClr_LSB 0x1F
+#define QIB_7220_HwErrClear_PCIeBusParityClr_RMASK 0x7
+#define QIB_7220_HwErrClear_PcieCplTimeoutClear_LSB 0x1E
+#define QIB_7220_HwErrClear_PcieCplTimeoutClear_RMASK 0x1
+#define QIB_7220_HwErrClear_PoisonedTLPClear_LSB 0x1D
+#define QIB_7220_HwErrClear_PoisonedTLPClear_RMASK 0x1
+#define QIB_7220_HwErrClear_SDmaMemReadErrClear_LSB 0x1C
+#define QIB_7220_HwErrClear_SDmaMemReadErrClear_RMASK 0x1
+#define QIB_7220_HwErrClear_Reserved3_LSB 0x8
+#define QIB_7220_HwErrClear_Reserved3_RMASK 0xFFFFF
+#define QIB_7220_HwErrClear_PCIeMemParityClr_LSB 0x0
+#define QIB_7220_HwErrClear_PCIeMemParityClr_RMASK 0xFF
+
+#define QIB_7220_HwDiagCtrl_OFFS 0xB0
+#define QIB_7220_HwDiagCtrl_ForceIBCBusFromSPCParityErr_LSB 0x3F
+#define QIB_7220_HwDiagCtrl_ForceIBCBusFromSPCParityErr_RMASK 0x1
+#define QIB_7220_HwDiagCtrl_ForceIBCBusToSPCParityErr_LSB 0x3E
+#define QIB_7220_HwDiagCtrl_ForceIBCBusToSPCParityErr_RMASK 0x1
+#define QIB_7220_HwDiagCtrl_CounterWrEnable_LSB 0x3D
+#define QIB_7220_HwDiagCtrl_CounterWrEnable_RMASK 0x1
+#define QIB_7220_HwDiagCtrl_CounterDisable_LSB 0x3C
+#define QIB_7220_HwDiagCtrl_CounterDisable_RMASK 0x1
+#define QIB_7220_HwDiagCtrl_Reserved_LSB 0x33
+#define QIB_7220_HwDiagCtrl_Reserved_RMASK 0x1FF
+#define QIB_7220_HwDiagCtrl_ForceRxMemParityErr_LSB 0x2C
+#define QIB_7220_HwDiagCtrl_ForceRxMemParityErr_RMASK 0x7F
+#define QIB_7220_HwDiagCtrl_ForceTxMemparityErr_LSB 0x28
+#define QIB_7220_HwDiagCtrl_ForceTxMemparityErr_RMASK 0xF
+#define QIB_7220_HwDiagCtrl_ForceDDSRXEQMemoryParityErr_LSB 0x27
+#define QIB_7220_HwDiagCtrl_ForceDDSRXEQMemoryParityErr_RMASK 0x1
+#define QIB_7220_HwDiagCtrl_ForceIB_uC_MemoryParityErr_LSB 0x26
+#define QIB_7220_HwDiagCtrl_ForceIB_uC_MemoryParityErr_RMASK 0x1
+#define QIB_7220_HwDiagCtrl_ForcePCIE_uC_Oct1MemoryParityErr_LSB 0x25
+#define QIB_7220_HwDiagCtrl_ForcePCIE_uC_Oct1MemoryParityErr_RMASK 0x1
+#define QIB_7220_HwDiagCtrl_ForcePCIE_uC_Oct0MemoryParityErr_LSB 0x24
+#define QIB_7220_HwDiagCtrl_ForcePCIE_uC_Oct0MemoryParityErr_RMASK 0x1
+#define QIB_7220_HwDiagCtrl_Reserved1_LSB 0x23
+#define QIB_7220_HwDiagCtrl_Reserved1_RMASK 0x1
+#define QIB_7220_HwDiagCtrl_forcePCIeBusParity_LSB 0x1F
+#define QIB_7220_HwDiagCtrl_forcePCIeBusParity_RMASK 0xF
+#define QIB_7220_HwDiagCtrl_Reserved2_LSB 0x8
+#define QIB_7220_HwDiagCtrl_Reserved2_RMASK 0x7FFFFF
+#define QIB_7220_HwDiagCtrl_forcePCIeMemParity_LSB 0x0
+#define QIB_7220_HwDiagCtrl_forcePCIeMemParity_RMASK 0xFF
+
+#define QIB_7220_REG_0000B8_OFFS 0xB8
+
+#define QIB_7220_IBCStatus_OFFS 0xC0
+#define QIB_7220_IBCStatus_TxCreditOk_LSB 0x1F
+#define QIB_7220_IBCStatus_TxCreditOk_RMASK 0x1
+#define QIB_7220_IBCStatus_TxReady_LSB 0x1E
+#define QIB_7220_IBCStatus_TxReady_RMASK 0x1
+#define QIB_7220_IBCStatus_Reserved_LSB 0xE
+#define QIB_7220_IBCStatus_Reserved_RMASK 0xFFFF
+#define QIB_7220_IBCStatus_IBTxLaneReversed_LSB 0xD
+#define QIB_7220_IBCStatus_IBTxLaneReversed_RMASK 0x1
+#define QIB_7220_IBCStatus_IBRxLaneReversed_LSB 0xC
+#define QIB_7220_IBCStatus_IBRxLaneReversed_RMASK 0x1
+#define QIB_7220_IBCStatus_IB_SERDES_TRIM_DONE_LSB 0xB
+#define QIB_7220_IBCStatus_IB_SERDES_TRIM_DONE_RMASK 0x1
+#define QIB_7220_IBCStatus_DDS_RXEQ_FAIL_LSB 0xA
+#define QIB_7220_IBCStatus_DDS_RXEQ_FAIL_RMASK 0x1
+#define QIB_7220_IBCStatus_LinkWidthActive_LSB 0x9
+#define QIB_7220_IBCStatus_LinkWidthActive_RMASK 0x1
+#define QIB_7220_IBCStatus_LinkSpeedActive_LSB 0x8
+#define QIB_7220_IBCStatus_LinkSpeedActive_RMASK 0x1
+#define QIB_7220_IBCStatus_LinkState_LSB 0x5
+#define QIB_7220_IBCStatus_LinkState_RMASK 0x7
+#define QIB_7220_IBCStatus_LinkTrainingState_LSB 0x0
+#define QIB_7220_IBCStatus_LinkTrainingState_RMASK 0x1F
+
+#define QIB_7220_IBCCtrl_OFFS 0xC8
+#define QIB_7220_IBCCtrl_Loopback_LSB 0x3F
+#define QIB_7220_IBCCtrl_Loopback_RMASK 0x1
+#define QIB_7220_IBCCtrl_LinkDownDefaultState_LSB 0x3E
+#define QIB_7220_IBCCtrl_LinkDownDefaultState_RMASK 0x1
+#define QIB_7220_IBCCtrl_Reserved_LSB 0x2B
+#define QIB_7220_IBCCtrl_Reserved_RMASK 0x7FFFF
+#define QIB_7220_IBCCtrl_CreditScale_LSB 0x28
+#define QIB_7220_IBCCtrl_CreditScale_RMASK 0x7
+#define QIB_7220_IBCCtrl_OverrunThreshold_LSB 0x24
+#define QIB_7220_IBCCtrl_OverrunThreshold_RMASK 0xF
+#define QIB_7220_IBCCtrl_PhyerrThreshold_LSB 0x20
+#define QIB_7220_IBCCtrl_PhyerrThreshold_RMASK 0xF
+#define QIB_7220_IBCCtrl_MaxPktLen_LSB 0x15
+#define QIB_7220_IBCCtrl_MaxPktLen_RMASK 0x7FF
+#define QIB_7220_IBCCtrl_LinkCmd_LSB 0x13
+#define QIB_7220_IBCCtrl_LinkCmd_RMASK 0x3
+#define QIB_7220_IBCCtrl_LinkInitCmd_LSB 0x10
+#define QIB_7220_IBCCtrl_LinkInitCmd_RMASK 0x7
+#define QIB_7220_IBCCtrl_FlowCtrlWaterMark_LSB 0x8
+#define QIB_7220_IBCCtrl_FlowCtrlWaterMark_RMASK 0xFF
+#define QIB_7220_IBCCtrl_FlowCtrlPeriod_LSB 0x0
+#define QIB_7220_IBCCtrl_FlowCtrlPeriod_RMASK 0xFF
+
+#define QIB_7220_EXTStatus_OFFS 0xD0
+#define QIB_7220_EXTStatus_GPIOIn_LSB 0x30
+#define QIB_7220_EXTStatus_GPIOIn_RMASK 0xFFFF
+#define QIB_7220_EXTStatus_Reserved_LSB 0x20
+#define QIB_7220_EXTStatus_Reserved_RMASK 0xFFFF
+#define QIB_7220_EXTStatus_Reserved1_LSB 0x10
+#define QIB_7220_EXTStatus_Reserved1_RMASK 0xFFFF
+#define QIB_7220_EXTStatus_MemBISTDisabled_LSB 0xF
+#define QIB_7220_EXTStatus_MemBISTDisabled_RMASK 0x1
+#define QIB_7220_EXTStatus_MemBISTEndTest_LSB 0xE
+#define QIB_7220_EXTStatus_MemBISTEndTest_RMASK 0x1
+#define QIB_7220_EXTStatus_Reserved2_LSB 0x0
+#define QIB_7220_EXTStatus_Reserved2_RMASK 0x3FFF
+
+#define QIB_7220_EXTCtrl_OFFS 0xD8
+#define QIB_7220_EXTCtrl_GPIOOe_LSB 0x30
+#define QIB_7220_EXTCtrl_GPIOOe_RMASK 0xFFFF
+#define QIB_7220_EXTCtrl_GPIOInvert_LSB 0x20
+#define QIB_7220_EXTCtrl_GPIOInvert_RMASK 0xFFFF
+#define QIB_7220_EXTCtrl_Reserved_LSB 0x4
+#define QIB_7220_EXTCtrl_Reserved_RMASK 0xFFFFFFF
+#define QIB_7220_EXTCtrl_LEDPriPortGreenOn_LSB 0x3
+#define QIB_7220_EXTCtrl_LEDPriPortGreenOn_RMASK 0x1
+#define QIB_7220_EXTCtrl_LEDPriPortYellowOn_LSB 0x2
+#define QIB_7220_EXTCtrl_LEDPriPortYellowOn_RMASK 0x1
+#define QIB_7220_EXTCtrl_LEDGblOkGreenOn_LSB 0x1
+#define QIB_7220_EXTCtrl_LEDGblOkGreenOn_RMASK 0x1
+#define QIB_7220_EXTCtrl_LEDGblErrRedOff_LSB 0x0
+#define QIB_7220_EXTCtrl_LEDGblErrRedOff_RMASK 0x1
+
+#define QIB_7220_GPIOOut_OFFS 0xE0
+
+#define QIB_7220_GPIOMask_OFFS 0xE8
+
+#define QIB_7220_GPIOStatus_OFFS 0xF0
+
+#define QIB_7220_GPIOClear_OFFS 0xF8
+
+#define QIB_7220_RcvCtrl_OFFS 0x100
+#define QIB_7220_RcvCtrl_Reserved_LSB 0x27
+#define QIB_7220_RcvCtrl_Reserved_RMASK 0x1FFFFFF
+#define QIB_7220_RcvCtrl_RcvQPMapEnable_LSB 0x26
+#define QIB_7220_RcvCtrl_RcvQPMapEnable_RMASK 0x1
+#define QIB_7220_RcvCtrl_PortCfg_LSB 0x24
+#define QIB_7220_RcvCtrl_PortCfg_RMASK 0x3
+#define QIB_7220_RcvCtrl_TailUpd_LSB 0x23
+#define QIB_7220_RcvCtrl_TailUpd_RMASK 0x1
+#define QIB_7220_RcvCtrl_RcvPartitionKeyDisable_LSB 0x22
+#define QIB_7220_RcvCtrl_RcvPartitionKeyDisable_RMASK 0x1
+#define QIB_7220_RcvCtrl_IntrAvail_LSB 0x11
+#define QIB_7220_RcvCtrl_IntrAvail_RMASK 0x1FFFF
+#define QIB_7220_RcvCtrl_PortEnable_LSB 0x0
+#define QIB_7220_RcvCtrl_PortEnable_RMASK 0x1FFFF
+
+#define QIB_7220_RcvBTHQP_OFFS 0x108
+#define QIB_7220_RcvBTHQP_Reserved_LSB 0x18
+#define QIB_7220_RcvBTHQP_Reserved_RMASK 0xFF
+#define QIB_7220_RcvBTHQP_RcvBTHQP_LSB 0x0
+#define QIB_7220_RcvBTHQP_RcvBTHQP_RMASK 0xFFFFFF
+
+#define QIB_7220_RcvHdrSize_OFFS 0x110
+
+#define QIB_7220_RcvHdrCnt_OFFS 0x118
+
+#define QIB_7220_RcvHdrEntSize_OFFS 0x120
+
+#define QIB_7220_RcvTIDBase_OFFS 0x128
+
+#define QIB_7220_RcvTIDCnt_OFFS 0x130
+
+#define QIB_7220_RcvEgrBase_OFFS 0x138
+
+#define QIB_7220_RcvEgrCnt_OFFS 0x140
+
+#define QIB_7220_RcvBufBase_OFFS 0x148
+
+#define QIB_7220_RcvBufSize_OFFS 0x150
+
+#define QIB_7220_RxIntMemBase_OFFS 0x158
+
+#define QIB_7220_RxIntMemSize_OFFS 0x160
+
+#define QIB_7220_RcvPartitionKey_OFFS 0x168
+
+#define QIB_7220_RcvQPMulticastPort_OFFS 0x170
+#define QIB_7220_RcvQPMulticastPort_Reserved_LSB 0x5
+#define QIB_7220_RcvQPMulticastPort_Reserved_RMASK 0x7FFFFFFFFFFFFFF
+#define QIB_7220_RcvQPMulticastPort_RcvQpMcPort_LSB 0x0
+#define QIB_7220_RcvQPMulticastPort_RcvQpMcPort_RMASK 0x1F
+
+#define QIB_7220_RcvPktLEDCnt_OFFS 0x178
+#define QIB_7220_RcvPktLEDCnt_ONperiod_LSB 0x20
+#define QIB_7220_RcvPktLEDCnt_ONperiod_RMASK 0xFFFFFFFF
+#define QIB_7220_RcvPktLEDCnt_OFFperiod_LSB 0x0
+#define QIB_7220_RcvPktLEDCnt_OFFperiod_RMASK 0xFFFFFFFF
+
+#define QIB_7220_IBCDDRCtrl_OFFS 0x180
+#define QIB_7220_IBCDDRCtrl_IB_DLID_MASK_LSB 0x30
+#define QIB_7220_IBCDDRCtrl_IB_DLID_MASK_RMASK 0xFFFF
+#define QIB_7220_IBCDDRCtrl_IB_DLID_LSB 0x20
+#define QIB_7220_IBCDDRCtrl_IB_DLID_RMASK 0xFFFF
+#define QIB_7220_IBCDDRCtrl_Reserved_LSB 0x1B
+#define QIB_7220_IBCDDRCtrl_Reserved_RMASK 0x1F
+#define QIB_7220_IBCDDRCtrl_HRTBT_REQ_LSB 0x1A
+#define QIB_7220_IBCDDRCtrl_HRTBT_REQ_RMASK 0x1
+#define QIB_7220_IBCDDRCtrl_HRTBT_PORT_LSB 0x12
+#define QIB_7220_IBCDDRCtrl_HRTBT_PORT_RMASK 0xFF
+#define QIB_7220_IBCDDRCtrl_HRTBT_AUTO_LSB 0x11
+#define QIB_7220_IBCDDRCtrl_HRTBT_AUTO_RMASK 0x1
+#define QIB_7220_IBCDDRCtrl_HRTBT_ENB_LSB 0x10
+#define QIB_7220_IBCDDRCtrl_HRTBT_ENB_RMASK 0x1
+#define QIB_7220_IBCDDRCtrl_SD_DDS_LSB 0xC
+#define QIB_7220_IBCDDRCtrl_SD_DDS_RMASK 0xF
+#define QIB_7220_IBCDDRCtrl_SD_DDSV_LSB 0xB
+#define QIB_7220_IBCDDRCtrl_SD_DDSV_RMASK 0x1
+#define QIB_7220_IBCDDRCtrl_SD_ADD_ENB_LSB 0xA
+#define QIB_7220_IBCDDRCtrl_SD_ADD_ENB_RMASK 0x1
+#define QIB_7220_IBCDDRCtrl_SD_RX_EQUAL_ENABLE_LSB 0x9
+#define QIB_7220_IBCDDRCtrl_SD_RX_EQUAL_ENABLE_RMASK 0x1
+#define QIB_7220_IBCDDRCtrl_IB_LANE_REV_SUPPORTED_LSB 0x8
+#define QIB_7220_IBCDDRCtrl_IB_LANE_REV_SUPPORTED_RMASK 0x1
+#define QIB_7220_IBCDDRCtrl_IB_POLARITY_REV_SUPP_LSB 0x7
+#define QIB_7220_IBCDDRCtrl_IB_POLARITY_REV_SUPP_RMASK 0x1
+#define QIB_7220_IBCDDRCtrl_IB_NUM_CHANNELS_LSB 0x5
+#define QIB_7220_IBCDDRCtrl_IB_NUM_CHANNELS_RMASK 0x3
+#define QIB_7220_IBCDDRCtrl_SD_SPEED_QDR_LSB 0x4
+#define QIB_7220_IBCDDRCtrl_SD_SPEED_QDR_RMASK 0x1
+#define QIB_7220_IBCDDRCtrl_SD_SPEED_DDR_LSB 0x3
+#define QIB_7220_IBCDDRCtrl_SD_SPEED_DDR_RMASK 0x1
+#define QIB_7220_IBCDDRCtrl_SD_SPEED_SDR_LSB 0x2
+#define QIB_7220_IBCDDRCtrl_SD_SPEED_SDR_RMASK 0x1
+#define QIB_7220_IBCDDRCtrl_SD_SPEED_LSB 0x1
+#define QIB_7220_IBCDDRCtrl_SD_SPEED_RMASK 0x1
+#define QIB_7220_IBCDDRCtrl_IB_ENHANCED_MODE_LSB 0x0
+#define QIB_7220_IBCDDRCtrl_IB_ENHANCED_MODE_RMASK 0x1
+
+#define QIB_7220_HRTBT_GUID_OFFS 0x188
+
+#define QIB_7220_IBCDDRCtrl2_OFFS 0x1A0
+#define QIB_7220_IBCDDRCtrl2_IB_BACK_PORCH_LSB 0x5
+#define QIB_7220_IBCDDRCtrl2_IB_BACK_PORCH_RMASK 0x1F
+#define QIB_7220_IBCDDRCtrl2_IB_FRONT_PORCH_LSB 0x0
+#define QIB_7220_IBCDDRCtrl2_IB_FRONT_PORCH_RMASK 0x1F
+
+#define QIB_7220_IBCDDRStatus_OFFS 0x1A8
+#define QIB_7220_IBCDDRStatus_heartbeat_timed_out_LSB 0x24
+#define QIB_7220_IBCDDRStatus_heartbeat_timed_out_RMASK 0x1
+#define QIB_7220_IBCDDRStatus_heartbeat_crosstalk_LSB 0x20
+#define QIB_7220_IBCDDRStatus_heartbeat_crosstalk_RMASK 0xF
+#define QIB_7220_IBCDDRStatus_RxEqLocalDevice_LSB 0x1E
+#define QIB_7220_IBCDDRStatus_RxEqLocalDevice_RMASK 0x3
+#define QIB_7220_IBCDDRStatus_ReqDDSLocalFromRmt_LSB 0x1A
+#define QIB_7220_IBCDDRStatus_ReqDDSLocalFromRmt_RMASK 0xF
+#define QIB_7220_IBCDDRStatus_LinkRoundTripLatency_LSB 0x0
+#define QIB_7220_IBCDDRStatus_LinkRoundTripLatency_RMASK 0x3FFFFFF
+
+#define QIB_7220_JIntReload_OFFS 0x1B0
+#define QIB_7220_JIntReload_J_limit_reload_LSB 0x10
+#define QIB_7220_JIntReload_J_limit_reload_RMASK 0xFFFF
+#define QIB_7220_JIntReload_J_reload_LSB 0x0
+#define QIB_7220_JIntReload_J_reload_RMASK 0xFFFF
+
+#define QIB_7220_IBNCModeCtrl_OFFS 0x1B8
+#define QIB_7220_IBNCModeCtrl_Reserved_LSB 0x1A
+#define QIB_7220_IBNCModeCtrl_Reserved_RMASK 0x3FFFFFFFFF
+#define QIB_7220_IBNCModeCtrl_TSMCode_TS2_LSB 0x11
+#define QIB_7220_IBNCModeCtrl_TSMCode_TS2_RMASK 0x1FF
+#define QIB_7220_IBNCModeCtrl_TSMCode_TS1_LSB 0x8
+#define QIB_7220_IBNCModeCtrl_TSMCode_TS1_RMASK 0x1FF
+#define QIB_7220_IBNCModeCtrl_Reserved1_LSB 0x3
+#define QIB_7220_IBNCModeCtrl_Reserved1_RMASK 0x1F
+#define QIB_7220_IBNCModeCtrl_TSMEnable_ignore_TSM_on_rx_LSB 0x2
+#define QIB_7220_IBNCModeCtrl_TSMEnable_ignore_TSM_on_rx_RMASK 0x1
+#define QIB_7220_IBNCModeCtrl_TSMEnable_send_TS2_LSB 0x1
+#define QIB_7220_IBNCModeCtrl_TSMEnable_send_TS2_RMASK 0x1
+#define QIB_7220_IBNCModeCtrl_TSMEnable_send_TS1_LSB 0x0
+#define QIB_7220_IBNCModeCtrl_TSMEnable_send_TS1_RMASK 0x1
+
+#define QIB_7220_SendCtrl_OFFS 0x1C0
+#define QIB_7220_SendCtrl_Disarm_LSB 0x1F
+#define QIB_7220_SendCtrl_Disarm_RMASK 0x1
+#define QIB_7220_SendCtrl_Reserved_LSB 0x1D
+#define QIB_7220_SendCtrl_Reserved_RMASK 0x3
+#define QIB_7220_SendCtrl_AvailUpdThld_LSB 0x18
+#define QIB_7220_SendCtrl_AvailUpdThld_RMASK 0x1F
+#define QIB_7220_SendCtrl_DisarmPIOBuf_LSB 0x10
+#define QIB_7220_SendCtrl_DisarmPIOBuf_RMASK 0xFF
+#define QIB_7220_SendCtrl_Reserved1_LSB 0xD
+#define QIB_7220_SendCtrl_Reserved1_RMASK 0x7
+#define QIB_7220_SendCtrl_SDmaHalt_LSB 0xC
+#define QIB_7220_SendCtrl_SDmaHalt_RMASK 0x1
+#define QIB_7220_SendCtrl_SDmaEnable_LSB 0xB
+#define QIB_7220_SendCtrl_SDmaEnable_RMASK 0x1
+#define QIB_7220_SendCtrl_SDmaSingleDescriptor_LSB 0xA
+#define QIB_7220_SendCtrl_SDmaSingleDescriptor_RMASK 0x1
+#define QIB_7220_SendCtrl_SDmaIntEnable_LSB 0x9
+#define QIB_7220_SendCtrl_SDmaIntEnable_RMASK 0x1
+#define QIB_7220_SendCtrl_Reserved2_LSB 0x5
+#define QIB_7220_SendCtrl_Reserved2_RMASK 0xF
+#define QIB_7220_SendCtrl_SSpecialTriggerEn_LSB 0x4
+#define QIB_7220_SendCtrl_SSpecialTriggerEn_RMASK 0x1
+#define QIB_7220_SendCtrl_SPioEnable_LSB 0x3
+#define QIB_7220_SendCtrl_SPioEnable_RMASK 0x1
+#define QIB_7220_SendCtrl_SendBufAvailUpd_LSB 0x2
+#define QIB_7220_SendCtrl_SendBufAvailUpd_RMASK 0x1
+#define QIB_7220_SendCtrl_SendIntBufAvail_LSB 0x1
+#define QIB_7220_SendCtrl_SendIntBufAvail_RMASK 0x1
+#define QIB_7220_SendCtrl_Abort_LSB 0x0
+#define QIB_7220_SendCtrl_Abort_RMASK 0x1
+
+#define QIB_7220_SendBufBase_OFFS 0x1C8
+#define QIB_7220_SendBufBase_Reserved_LSB 0x35
+#define QIB_7220_SendBufBase_Reserved_RMASK 0x7FF
+#define QIB_7220_SendBufBase_BaseAddr_LargePIO_LSB 0x20
+#define QIB_7220_SendBufBase_BaseAddr_LargePIO_RMASK 0x1FFFFF
+#define QIB_7220_SendBufBase_Reserved1_LSB 0x15
+#define QIB_7220_SendBufBase_Reserved1_RMASK 0x7FF
+#define QIB_7220_SendBufBase_BaseAddr_SmallPIO_LSB 0x0
+#define QIB_7220_SendBufBase_BaseAddr_SmallPIO_RMASK 0x1FFFFF
+
+#define QIB_7220_SendBufSize_OFFS 0x1D0
+#define QIB_7220_SendBufSize_Reserved_LSB 0x2D
+#define QIB_7220_SendBufSize_Reserved_RMASK 0xFFFFF
+#define QIB_7220_SendBufSize_Size_LargePIO_LSB 0x20
+#define QIB_7220_SendBufSize_Size_LargePIO_RMASK 0x1FFF
+#define QIB_7220_SendBufSize_Reserved1_LSB 0xC
+#define QIB_7220_SendBufSize_Reserved1_RMASK 0xFFFFF
+#define QIB_7220_SendBufSize_Size_SmallPIO_LSB 0x0
+#define QIB_7220_SendBufSize_Size_SmallPIO_RMASK 0xFFF
+
+#define QIB_7220_SendBufCnt_OFFS 0x1D8
+#define QIB_7220_SendBufCnt_Reserved_LSB 0x24
+#define QIB_7220_SendBufCnt_Reserved_RMASK 0xFFFFFFF
+#define QIB_7220_SendBufCnt_Num_LargeBuffers_LSB 0x20
+#define QIB_7220_SendBufCnt_Num_LargeBuffers_RMASK 0xF
+#define QIB_7220_SendBufCnt_Reserved1_LSB 0x9
+#define QIB_7220_SendBufCnt_Reserved1_RMASK 0x7FFFFF
+#define QIB_7220_SendBufCnt_Num_SmallBuffers_LSB 0x0
+#define QIB_7220_SendBufCnt_Num_SmallBuffers_RMASK 0x1FF
+
+#define QIB_7220_SendBufAvailAddr_OFFS 0x1E0
+#define QIB_7220_SendBufAvailAddr_SendBufAvailAddr_LSB 0x6
+#define QIB_7220_SendBufAvailAddr_SendBufAvailAddr_RMASK 0x3FFFFFFFF
+#define QIB_7220_SendBufAvailAddr_Reserved_LSB 0x0
+#define QIB_7220_SendBufAvailAddr_Reserved_RMASK 0x3F
+
+#define QIB_7220_TxIntMemBase_OFFS 0x1E8
+
+#define QIB_7220_TxIntMemSize_OFFS 0x1F0
+
+#define QIB_7220_SendDmaBase_OFFS 0x1F8
+#define QIB_7220_SendDmaBase_Reserved_LSB 0x30
+#define QIB_7220_SendDmaBase_Reserved_RMASK 0xFFFF
+#define QIB_7220_SendDmaBase_SendDmaBase_LSB 0x0
+#define QIB_7220_SendDmaBase_SendDmaBase_RMASK 0xFFFFFFFFFFFF
+
+#define QIB_7220_SendDmaLenGen_OFFS 0x200
+#define QIB_7220_SendDmaLenGen_Reserved_LSB 0x13
+#define QIB_7220_SendDmaLenGen_Reserved_RMASK 0x1FFFFFFFFFFF
+#define QIB_7220_SendDmaLenGen_Generation_LSB 0x10
+#define QIB_7220_SendDmaLenGen_Generation_MSB 0x12
+#define QIB_7220_SendDmaLenGen_Generation_RMASK 0x7
+#define QIB_7220_SendDmaLenGen_Length_LSB 0x0
+#define QIB_7220_SendDmaLenGen_Length_RMASK 0xFFFF
+
+#define QIB_7220_SendDmaTail_OFFS 0x208
+#define QIB_7220_SendDmaTail_Reserved_LSB 0x10
+#define QIB_7220_SendDmaTail_Reserved_RMASK 0xFFFFFFFFFFFF
+#define QIB_7220_SendDmaTail_SendDmaTail_LSB 0x0
+#define QIB_7220_SendDmaTail_SendDmaTail_RMASK 0xFFFF
+
+#define QIB_7220_SendDmaHead_OFFS 0x210
+#define QIB_7220_SendDmaHead_Reserved_LSB 0x30
+#define QIB_7220_SendDmaHead_Reserved_RMASK 0xFFFF
+#define QIB_7220_SendDmaHead_InternalSendDmaHead_LSB 0x20
+#define QIB_7220_SendDmaHead_InternalSendDmaHead_RMASK 0xFFFF
+#define QIB_7220_SendDmaHead_Reserved1_LSB 0x10
+#define QIB_7220_SendDmaHead_Reserved1_RMASK 0xFFFF
+#define QIB_7220_SendDmaHead_SendDmaHead_LSB 0x0
+#define QIB_7220_SendDmaHead_SendDmaHead_RMASK 0xFFFF
+
+#define QIB_7220_SendDmaHeadAddr_OFFS 0x218
+#define QIB_7220_SendDmaHeadAddr_Reserved_LSB 0x30
+#define QIB_7220_SendDmaHeadAddr_Reserved_RMASK 0xFFFF
+#define QIB_7220_SendDmaHeadAddr_SendDmaHeadAddr_LSB 0x0
+#define QIB_7220_SendDmaHeadAddr_SendDmaHeadAddr_RMASK 0xFFFFFFFFFFFF
+
+#define QIB_7220_SendDmaBufMask0_OFFS 0x220
+#define QIB_7220_SendDmaBufMask0_BufMask_63_0_LSB 0x0
+#define QIB_7220_SendDmaBufMask0_BufMask_63_0_RMASK 0x0
+
+#define QIB_7220_SendDmaStatus_OFFS 0x238
+#define QIB_7220_SendDmaStatus_ScoreBoardDrainInProg_LSB 0x3F
+#define QIB_7220_SendDmaStatus_ScoreBoardDrainInProg_RMASK 0x1
+#define QIB_7220_SendDmaStatus_AbortInProg_LSB 0x3E
+#define QIB_7220_SendDmaStatus_AbortInProg_RMASK 0x1
+#define QIB_7220_SendDmaStatus_InternalSDmaEnable_LSB 0x3D
+#define QIB_7220_SendDmaStatus_InternalSDmaEnable_RMASK 0x1
+#define QIB_7220_SendDmaStatus_ScbDescIndex_13_0_LSB 0x2F
+#define QIB_7220_SendDmaStatus_ScbDescIndex_13_0_RMASK 0x3FFF
+#define QIB_7220_SendDmaStatus_RpyLowAddr_6_0_LSB 0x28
+#define QIB_7220_SendDmaStatus_RpyLowAddr_6_0_RMASK 0x7F
+#define QIB_7220_SendDmaStatus_RpyTag_7_0_LSB 0x20
+#define QIB_7220_SendDmaStatus_RpyTag_7_0_RMASK 0xFF
+#define QIB_7220_SendDmaStatus_ScbFull_LSB 0x1F
+#define QIB_7220_SendDmaStatus_ScbFull_RMASK 0x1
+#define QIB_7220_SendDmaStatus_ScbEmpty_LSB 0x1E
+#define QIB_7220_SendDmaStatus_ScbEmpty_RMASK 0x1
+#define QIB_7220_SendDmaStatus_ScbEntryValid_LSB 0x1D
+#define QIB_7220_SendDmaStatus_ScbEntryValid_RMASK 0x1
+#define QIB_7220_SendDmaStatus_ScbFetchDescFlag_LSB 0x1C
+#define QIB_7220_SendDmaStatus_ScbFetchDescFlag_RMASK 0x1
+#define QIB_7220_SendDmaStatus_SplFifoReadyToGo_LSB 0x1B
+#define QIB_7220_SendDmaStatus_SplFifoReadyToGo_RMASK 0x1
+#define QIB_7220_SendDmaStatus_SplFifoDisarmed_LSB 0x1A
+#define QIB_7220_SendDmaStatus_SplFifoDisarmed_RMASK 0x1
+#define QIB_7220_SendDmaStatus_SplFifoEmpty_LSB 0x19
+#define QIB_7220_SendDmaStatus_SplFifoEmpty_RMASK 0x1
+#define QIB_7220_SendDmaStatus_SplFifoFull_LSB 0x18
+#define QIB_7220_SendDmaStatus_SplFifoFull_RMASK 0x1
+#define QIB_7220_SendDmaStatus_SplFifoBufNum_LSB 0x10
+#define QIB_7220_SendDmaStatus_SplFifoBufNum_RMASK 0xFF
+#define QIB_7220_SendDmaStatus_SplFifoDescIndex_LSB 0x0
+#define QIB_7220_SendDmaStatus_SplFifoDescIndex_RMASK 0xFFFF
+
+#define QIB_7220_SendBufErr0_OFFS 0x240
+#define QIB_7220_SendBufErr0_SendBufErr_63_0_LSB 0x0
+#define QIB_7220_SendBufErr0_SendBufErr_63_0_RMASK 0x0
+
+#define QIB_7220_RcvHdrAddr0_OFFS 0x270
+#define QIB_7220_RcvHdrAddr0_RcvHdrAddr0_LSB 0x2
+#define QIB_7220_RcvHdrAddr0_RcvHdrAddr0_RMASK 0x3FFFFFFFFF
+#define QIB_7220_RcvHdrAddr0_Reserved_LSB 0x0
+#define QIB_7220_RcvHdrAddr0_Reserved_RMASK 0x3
+
+#define QIB_7220_RcvHdrTailAddr0_OFFS 0x300
+#define QIB_7220_RcvHdrTailAddr0_RcvHdrTailAddr0_LSB 0x2
+#define QIB_7220_RcvHdrTailAddr0_RcvHdrTailAddr0_RMASK 0x3FFFFFFFFF
+#define QIB_7220_RcvHdrTailAddr0_Reserved_LSB 0x0
+#define QIB_7220_RcvHdrTailAddr0_Reserved_RMASK 0x3
+
+#define QIB_7220_ibsd_epb_access_ctrl_OFFS 0x3C0
+#define QIB_7220_ibsd_epb_access_ctrl_sw_ib_epb_req_granted_LSB 0x8
+#define QIB_7220_ibsd_epb_access_ctrl_sw_ib_epb_req_granted_RMASK 0x1
+#define QIB_7220_ibsd_epb_access_ctrl_Reserved_LSB 0x1
+#define QIB_7220_ibsd_epb_access_ctrl_Reserved_RMASK 0x7F
+#define QIB_7220_ibsd_epb_access_ctrl_sw_ib_epb_req_LSB 0x0
+#define QIB_7220_ibsd_epb_access_ctrl_sw_ib_epb_req_RMASK 0x1
+
+#define QIB_7220_ibsd_epb_transaction_reg_OFFS 0x3C8
+#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_rdy_LSB 0x1F
+#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_rdy_RMASK 0x1
+#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_req_error_LSB 0x1E
+#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_req_error_RMASK 0x1
+#define QIB_7220_ibsd_epb_transaction_reg_Reserved_LSB 0x1D
+#define QIB_7220_ibsd_epb_transaction_reg_Reserved_RMASK 0x1
+#define QIB_7220_ibsd_epb_transaction_reg_mem_data_parity_LSB 0x1C
+#define QIB_7220_ibsd_epb_transaction_reg_mem_data_parity_RMASK 0x1
+#define QIB_7220_ibsd_epb_transaction_reg_Reserved1_LSB 0x1B
+#define QIB_7220_ibsd_epb_transaction_reg_Reserved1_RMASK 0x1
+#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_cs_LSB 0x19
+#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_cs_RMASK 0x3
+#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_read_write_LSB 0x18
+#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_read_write_RMASK 0x1
+#define QIB_7220_ibsd_epb_transaction_reg_Reserved2_LSB 0x17
+#define QIB_7220_ibsd_epb_transaction_reg_Reserved2_RMASK 0x1
+#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_address_LSB 0x8
+#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_address_RMASK 0x7FFF
+#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_data_LSB 0x0
+#define QIB_7220_ibsd_epb_transaction_reg_ib_epb_data_RMASK 0xFF
+
+#define QIB_7220_XGXSCfg_OFFS 0x3D8
+#define QIB_7220_XGXSCfg_sel_link_down_for_fctrl_lane_sync_reset_LSB 0x3F
+#define QIB_7220_XGXSCfg_sel_link_down_for_fctrl_lane_sync_reset_RMASK 0x1
+#define QIB_7220_XGXSCfg_Reserved_LSB 0x13
+#define QIB_7220_XGXSCfg_Reserved_RMASK 0xFFFFFFFFFFF
+#define QIB_7220_XGXSCfg_link_sync_mask_LSB 0x9
+#define QIB_7220_XGXSCfg_link_sync_mask_RMASK 0x3FF
+#define QIB_7220_XGXSCfg_Reserved1_LSB 0x3
+#define QIB_7220_XGXSCfg_Reserved1_RMASK 0x3F
+#define QIB_7220_XGXSCfg_xcv_reset_LSB 0x2
+#define QIB_7220_XGXSCfg_xcv_reset_RMASK 0x1
+#define QIB_7220_XGXSCfg_Reserved2_LSB 0x1
+#define QIB_7220_XGXSCfg_Reserved2_RMASK 0x1
+#define QIB_7220_XGXSCfg_tx_rx_reset_LSB 0x0
+#define QIB_7220_XGXSCfg_tx_rx_reset_RMASK 0x1
+
+#define QIB_7220_IBSerDesCtrl_OFFS 0x3E0
+#define QIB_7220_IBSerDesCtrl_Reserved_LSB 0x2D
+#define QIB_7220_IBSerDesCtrl_Reserved_RMASK 0x7FFFF
+#define QIB_7220_IBSerDesCtrl_INT_uC_LSB 0x2C
+#define QIB_7220_IBSerDesCtrl_INT_uC_RMASK 0x1
+#define QIB_7220_IBSerDesCtrl_CKSEL_uC_LSB 0x2A
+#define QIB_7220_IBSerDesCtrl_CKSEL_uC_RMASK 0x3
+#define QIB_7220_IBSerDesCtrl_PLLN_LSB 0x28
+#define QIB_7220_IBSerDesCtrl_PLLN_RMASK 0x3
+#define QIB_7220_IBSerDesCtrl_PLLM_LSB 0x25
+#define QIB_7220_IBSerDesCtrl_PLLM_RMASK 0x7
+#define QIB_7220_IBSerDesCtrl_TXOBPD_LSB 0x24
+#define QIB_7220_IBSerDesCtrl_TXOBPD_RMASK 0x1
+#define QIB_7220_IBSerDesCtrl_TWC_LSB 0x23
+#define QIB_7220_IBSerDesCtrl_TWC_RMASK 0x1
+#define QIB_7220_IBSerDesCtrl_RXIDLE_LSB 0x22
+#define QIB_7220_IBSerDesCtrl_RXIDLE_RMASK 0x1
+#define QIB_7220_IBSerDesCtrl_RXINV_LSB 0x21
+#define QIB_7220_IBSerDesCtrl_RXINV_RMASK 0x1
+#define QIB_7220_IBSerDesCtrl_TXINV_LSB 0x20
+#define QIB_7220_IBSerDesCtrl_TXINV_RMASK 0x1
+#define QIB_7220_IBSerDesCtrl_Reserved1_LSB 0x12
+#define QIB_7220_IBSerDesCtrl_Reserved1_RMASK 0x3FFF
+#define QIB_7220_IBSerDesCtrl_NumSerDesRegsToWrForRXEQ_LSB 0xD
+#define QIB_7220_IBSerDesCtrl_NumSerDesRegsToWrForRXEQ_RMASK 0x1F
+#define QIB_7220_IBSerDesCtrl_NumSerDesRegsToWrForDDS_LSB 0x8
+#define QIB_7220_IBSerDesCtrl_NumSerDesRegsToWrForDDS_RMASK 0x1F
+#define QIB_7220_IBSerDesCtrl_Reserved2_LSB 0x1
+#define QIB_7220_IBSerDesCtrl_Reserved2_RMASK 0x7F
+#define QIB_7220_IBSerDesCtrl_ResetIB_uC_Core_LSB 0x0
+#define QIB_7220_IBSerDesCtrl_ResetIB_uC_Core_RMASK 0x1
+
+#define QIB_7220_pciesd_epb_access_ctrl_OFFS 0x400
+#define QIB_7220_pciesd_epb_access_ctrl_sw_pcie_epb_req_granted_LSB 0x8
+#define QIB_7220_pciesd_epb_access_ctrl_sw_pcie_epb_req_granted_RMASK 0x1
+#define QIB_7220_pciesd_epb_access_ctrl_Reserved_LSB 0x3
+#define QIB_7220_pciesd_epb_access_ctrl_Reserved_RMASK 0x1F
+#define QIB_7220_pciesd_epb_access_ctrl_sw_pcieepb_star_en_LSB 0x1
+#define QIB_7220_pciesd_epb_access_ctrl_sw_pcieepb_star_en_RMASK 0x3
+#define QIB_7220_pciesd_epb_access_ctrl_sw_pcie_epb_req_LSB 0x0
+#define QIB_7220_pciesd_epb_access_ctrl_sw_pcie_epb_req_RMASK 0x1
+
+#define QIB_7220_pciesd_epb_transaction_reg_OFFS 0x408
+#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_rdy_LSB 0x1F
+#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_rdy_RMASK 0x1
+#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_req_error_LSB 0x1E
+#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_req_error_RMASK 0x1
+#define QIB_7220_pciesd_epb_transaction_reg_Reserved_LSB 0x1D
+#define QIB_7220_pciesd_epb_transaction_reg_Reserved_RMASK 0x1
+#define QIB_7220_pciesd_epb_transaction_reg_mem_data_parity_LSB 0x1C
+#define QIB_7220_pciesd_epb_transaction_reg_mem_data_parity_RMASK 0x1
+#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_cs_LSB 0x19
+#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_cs_RMASK 0x7
+#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_read_write_LSB 0x18
+#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_read_write_RMASK 0x1
+#define QIB_7220_pciesd_epb_transaction_reg_Reserved1_LSB 0x17
+#define QIB_7220_pciesd_epb_transaction_reg_Reserved1_RMASK 0x1
+#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_address_LSB 0x8
+#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_address_RMASK 0x7FFF
+#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_data_LSB 0x0
+#define QIB_7220_pciesd_epb_transaction_reg_pcie_epb_data_RMASK 0xFF
+
+#define QIB_7220_SerDes_DDSRXEQ0_OFFS 0x500
+#define QIB_7220_SerDes_DDSRXEQ0_reg_addr_LSB 0x4
+#define QIB_7220_SerDes_DDSRXEQ0_reg_addr_RMASK 0x3F
+#define QIB_7220_SerDes_DDSRXEQ0_element_num_LSB 0x0
+#define QIB_7220_SerDes_DDSRXEQ0_element_num_RMASK 0xF
+
+#define QIB_7220_LBIntCnt_OFFS 0x13000
+
+#define QIB_7220_LBFlowStallCnt_OFFS 0x13008
+
+#define QIB_7220_TxSDmaDescCnt_OFFS 0x13010
+
+#define QIB_7220_TxUnsupVLErrCnt_OFFS 0x13018
+
+#define QIB_7220_TxDataPktCnt_OFFS 0x13020
+
+#define QIB_7220_TxFlowPktCnt_OFFS 0x13028
+
+#define QIB_7220_TxDwordCnt_OFFS 0x13030
+
+#define QIB_7220_TxLenErrCnt_OFFS 0x13038
+
+#define QIB_7220_TxMaxMinLenErrCnt_OFFS 0x13040
+
+#define QIB_7220_TxUnderrunCnt_OFFS 0x13048
+
+#define QIB_7220_TxFlowStallCnt_OFFS 0x13050
+
+#define QIB_7220_TxDroppedPktCnt_OFFS 0x13058
+
+#define QIB_7220_RxDroppedPktCnt_OFFS 0x13060
+
+#define QIB_7220_RxDataPktCnt_OFFS 0x13068
+
+#define QIB_7220_RxFlowPktCnt_OFFS 0x13070
+
+#define QIB_7220_RxDwordCnt_OFFS 0x13078
+
+#define QIB_7220_RxLenErrCnt_OFFS 0x13080
+
+#define QIB_7220_RxMaxMinLenErrCnt_OFFS 0x13088
+
+#define QIB_7220_RxICRCErrCnt_OFFS 0x13090
+
+#define QIB_7220_RxVCRCErrCnt_OFFS 0x13098
+
+#define QIB_7220_RxFlowCtrlViolCnt_OFFS 0x130A0
+
+#define QIB_7220_RxVersionErrCnt_OFFS 0x130A8
+
+#define QIB_7220_RxLinkMalformCnt_OFFS 0x130B0
+
+#define QIB_7220_RxEBPCnt_OFFS 0x130B8
+
+#define QIB_7220_RxLPCRCErrCnt_OFFS 0x130C0
+
+#define QIB_7220_RxBufOvflCnt_OFFS 0x130C8
+
+#define QIB_7220_RxTIDFullErrCnt_OFFS 0x130D0
+
+#define QIB_7220_RxTIDValidErrCnt_OFFS 0x130D8
+
+#define QIB_7220_RxPKeyMismatchCnt_OFFS 0x130E0
+
+#define QIB_7220_RxP0HdrEgrOvflCnt_OFFS 0x130E8
+
+#define QIB_7220_IBStatusChangeCnt_OFFS 0x13170
+
+#define QIB_7220_IBLinkErrRecoveryCnt_OFFS 0x13178
+
+#define QIB_7220_IBLinkDownedCnt_OFFS 0x13180
+
+#define QIB_7220_IBSymbolErrCnt_OFFS 0x13188
+
+#define QIB_7220_RxVL15DroppedPktCnt_OFFS 0x13190
+
+#define QIB_7220_RxOtherLocalPhyErrCnt_OFFS 0x13198
+
+#define QIB_7220_PcieRetryBufDiagQwordCnt_OFFS 0x131A0
+
+#define QIB_7220_ExcessBufferOvflCnt_OFFS 0x131A8
+
+#define QIB_7220_LocalLinkIntegrityErrCnt_OFFS 0x131B0
+
+#define QIB_7220_RxVlErrCnt_OFFS 0x131B8
+
+#define QIB_7220_RxDlidFltrCnt_OFFS 0x131C0
+
+#define QIB_7220_CNT_0131C8_OFFS 0x131C8
+
+#define QIB_7220_PSStat_OFFS 0x13200
+
+#define QIB_7220_PSStart_OFFS 0x13208
+
+#define QIB_7220_PSInterval_OFFS 0x13210
+
+#define QIB_7220_PSRcvDataCount_OFFS 0x13218
+
+#define QIB_7220_PSRcvPktsCount_OFFS 0x13220
+
+#define QIB_7220_PSXmitDataCount_OFFS 0x13228
+
+#define QIB_7220_PSXmitPktsCount_OFFS 0x13230
+
+#define QIB_7220_PSXmitWaitCount_OFFS 0x13238
+
+#define QIB_7220_CNT_013240_OFFS 0x13240
+
+#define QIB_7220_RcvEgrArray_OFFS 0x14000
+
+#define QIB_7220_MEM_038000_OFFS 0x38000
+
+#define QIB_7220_RcvTIDArray0_OFFS 0x53000
+
+#define QIB_7220_PIOLaunchFIFO_OFFS 0x64000
+
+#define QIB_7220_MEM_064480_OFFS 0x64480
+
+#define QIB_7220_SendPIOpbcCache_OFFS 0x64800
+
+#define QIB_7220_MEM_064C80_OFFS 0x64C80
+
+#define QIB_7220_PreLaunchFIFO_OFFS 0x65000
+
+#define QIB_7220_MEM_065080_OFFS 0x65080
+
+#define QIB_7220_ScoreBoard_OFFS 0x65400
+
+#define QIB_7220_MEM_065440_OFFS 0x65440
+
+#define QIB_7220_DescriptorFIFO_OFFS 0x65800
+
+#define QIB_7220_MEM_065880_OFFS 0x65880
+
+#define QIB_7220_RcvBuf1_OFFS 0x72000
+
+#define QIB_7220_MEM_074800_OFFS 0x74800
+
+#define QIB_7220_RcvBuf2_OFFS 0x75000
+
+#define QIB_7220_MEM_076400_OFFS 0x76400
+
+#define QIB_7220_RcvFlags_OFFS 0x77000
+
+#define QIB_7220_MEM_078400_OFFS 0x78400
+
+#define QIB_7220_RcvLookupBuf1_OFFS 0x79000
+
+#define QIB_7220_MEM_07A400_OFFS 0x7A400
+
+#define QIB_7220_RcvDMADatBuf_OFFS 0x7B000
+
+#define QIB_7220_RcvDMAHdrBuf_OFFS 0x7B800
+
+#define QIB_7220_MiscRXEIntMem_OFFS 0x7C000
+
+#define QIB_7220_MEM_07D400_OFFS 0x7D400
+
+#define QIB_7220_PCIERcvBuf_OFFS 0x80000
+
+#define QIB_7220_PCIERetryBuf_OFFS 0x84000
+
+#define QIB_7220_PCIERcvBufRdToWrAddr_OFFS 0x88000
+
+#define QIB_7220_PCIECplBuf_OFFS 0x90000
+
+#define QIB_7220_IBSerDesMappTable_OFFS 0x94000
+
+#define QIB_7220_MEM_095000_OFFS 0x95000
+
+#define QIB_7220_SendBuf0_MA_OFFS 0x100000
+
+#define QIB_7220_MEM_1A0000_OFFS 0x1A0000
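
The _OFFS/_LSB/_RMASK triplets above encode one field per register: the register's byte offset, the field's least-significant bit, and the mask that applies after shifting the 64-bit register value right by that bit. The C sketch below (not part of this commit) shows how such macros are typically combined, using the QIB_7220_IBCStatus LinkState and QIB_7220_SendCtrl SDmaEnable fields defined above; the `kregbase` mapping and the plain readq()/writeq() accessors are stand-ins for whatever MMIO helpers the driver actually provides.

/*
 * Illustrative sketch only: read a field as
 *     field = (readq(base + _OFFS) >> _LSB) & _RMASK
 * and write one by clearing its bit range and OR-ing the new value
 * back into position.  `kregbase` is an assumed ioremap()ed mapping
 * of the chip's register space.
 */
#include <linux/io.h>
#include <linux/types.h>

static u32 qib_7220_get_link_state(void __iomem *kregbase)
{
	u64 ibcstatus = readq(kregbase + QIB_7220_IBCStatus_OFFS);

	/* Shift the field down to bit 0, then mask to its width. */
	return (u32)((ibcstatus >> QIB_7220_IBCStatus_LinkState_LSB) &
		     QIB_7220_IBCStatus_LinkState_RMASK);
}

static void qib_7220_set_sdma_enable(void __iomem *kregbase, bool enable)
{
	u64 sendctrl = readq(kregbase + QIB_7220_SendCtrl_OFFS);

	/* Clear the SDmaEnable field, then OR in the new (masked) value. */
	sendctrl &= ~((u64)QIB_7220_SendCtrl_SDmaEnable_RMASK <<
		      QIB_7220_SendCtrl_SDmaEnable_LSB);
	sendctrl |= ((u64)(enable ? 1 : 0) &
		     QIB_7220_SendCtrl_SDmaEnable_RMASK) <<
		    QIB_7220_SendCtrl_SDmaEnable_LSB;
	writeq(sendctrl, kregbase + QIB_7220_SendCtrl_OFFS);
}
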
diff --git a/drivers/infiniband/hw/qib/qib_7322_regs.h b/drivers/infiniband/hw/qib/qib_7322_regs.h
new file mode 100644
index 000000000..32dc81ff8
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_7322_regs.h
@@ -0,0 +1,3163 @@
+/*
+ * Copyright (c) 2008, 2009, 2010 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* This file is mechanically generated from RTL. Any hand-edits will be lost! */
+
+#define QIB_7322_Revision_OFFS 0x0
+#define QIB_7322_Revision_DEF 0x0000000002010601
+#define QIB_7322_Revision_R_Simulator_LSB 0x3F
+#define QIB_7322_Revision_R_Simulator_MSB 0x3F
+#define QIB_7322_Revision_R_Simulator_RMASK 0x1
+#define QIB_7322_Revision_R_Emulation_LSB 0x3E
+#define QIB_7322_Revision_R_Emulation_MSB 0x3E
+#define QIB_7322_Revision_R_Emulation_RMASK 0x1
+#define QIB_7322_Revision_R_Emulation_Revcode_LSB 0x28
+#define QIB_7322_Revision_R_Emulation_Revcode_MSB 0x3D
+#define QIB_7322_Revision_R_Emulation_Revcode_RMASK 0x3FFFFF
+#define QIB_7322_Revision_BoardID_LSB 0x20
+#define QIB_7322_Revision_BoardID_MSB 0x27
+#define QIB_7322_Revision_BoardID_RMASK 0xFF
+#define QIB_7322_Revision_R_SW_LSB 0x18
+#define QIB_7322_Revision_R_SW_MSB 0x1F
+#define QIB_7322_Revision_R_SW_RMASK 0xFF
+#define QIB_7322_Revision_R_Arch_LSB 0x10
+#define QIB_7322_Revision_R_Arch_MSB 0x17
+#define QIB_7322_Revision_R_Arch_RMASK 0xFF
+#define QIB_7322_Revision_R_ChipRevMajor_LSB 0x8
+#define QIB_7322_Revision_R_ChipRevMajor_MSB 0xF
+#define QIB_7322_Revision_R_ChipRevMajor_RMASK 0xFF
+#define QIB_7322_Revision_R_ChipRevMinor_LSB 0x0
+#define QIB_7322_Revision_R_ChipRevMinor_MSB 0x7
+#define QIB_7322_Revision_R_ChipRevMinor_RMASK 0xFF
+
+#define QIB_7322_Control_OFFS 0x8
+#define QIB_7322_Control_DEF 0x0000000000000000
+#define QIB_7322_Control_PCIECplQDiagEn_LSB 0x6
+#define QIB_7322_Control_PCIECplQDiagEn_MSB 0x6
+#define QIB_7322_Control_PCIECplQDiagEn_RMASK 0x1
+#define QIB_7322_Control_PCIEPostQDiagEn_LSB 0x5
+#define QIB_7322_Control_PCIEPostQDiagEn_MSB 0x5
+#define QIB_7322_Control_PCIEPostQDiagEn_RMASK 0x1
+#define QIB_7322_Control_SDmaDescFetchPriorityEn_LSB 0x4
+#define QIB_7322_Control_SDmaDescFetchPriorityEn_MSB 0x4
+#define QIB_7322_Control_SDmaDescFetchPriorityEn_RMASK 0x1
+#define QIB_7322_Control_PCIERetryBufDiagEn_LSB 0x3
+#define QIB_7322_Control_PCIERetryBufDiagEn_MSB 0x3
+#define QIB_7322_Control_PCIERetryBufDiagEn_RMASK 0x1
+#define QIB_7322_Control_FreezeMode_LSB 0x1
+#define QIB_7322_Control_FreezeMode_MSB 0x1
+#define QIB_7322_Control_FreezeMode_RMASK 0x1
+#define QIB_7322_Control_SyncReset_LSB 0x0
+#define QIB_7322_Control_SyncReset_MSB 0x0
+#define QIB_7322_Control_SyncReset_RMASK 0x1
+
+#define QIB_7322_PageAlign_OFFS 0x10
+#define QIB_7322_PageAlign_DEF 0x0000000000001000
+
+#define QIB_7322_ContextCnt_OFFS 0x18
+#define QIB_7322_ContextCnt_DEF 0x0000000000000012
+
+#define QIB_7322_Scratch_OFFS 0x20
+#define QIB_7322_Scratch_DEF 0x0000000000000000
+
+#define QIB_7322_CntrRegBase_OFFS 0x28
+#define QIB_7322_CntrRegBase_DEF 0x0000000000011000
+
+#define QIB_7322_SendRegBase_OFFS 0x30
+#define QIB_7322_SendRegBase_DEF 0x0000000000003000
+
+#define QIB_7322_UserRegBase_OFFS 0x38
+#define QIB_7322_UserRegBase_DEF 0x0000000000200000
+
+#define QIB_7322_IntMask_OFFS 0x68
+#define QIB_7322_IntMask_DEF 0x0000000000000000
+#define QIB_7322_IntMask_SDmaIntMask_1_LSB 0x3F
+#define QIB_7322_IntMask_SDmaIntMask_1_MSB 0x3F
+#define QIB_7322_IntMask_SDmaIntMask_1_RMASK 0x1
+#define QIB_7322_IntMask_SDmaIntMask_0_LSB 0x3E
+#define QIB_7322_IntMask_SDmaIntMask_0_MSB 0x3E
+#define QIB_7322_IntMask_SDmaIntMask_0_RMASK 0x1
+#define QIB_7322_IntMask_SDmaProgressIntMask_1_LSB 0x3D
+#define QIB_7322_IntMask_SDmaProgressIntMask_1_MSB 0x3D
+#define QIB_7322_IntMask_SDmaProgressIntMask_1_RMASK 0x1
+#define QIB_7322_IntMask_SDmaProgressIntMask_0_LSB 0x3C
+#define QIB_7322_IntMask_SDmaProgressIntMask_0_MSB 0x3C
+#define QIB_7322_IntMask_SDmaProgressIntMask_0_RMASK 0x1
+#define QIB_7322_IntMask_SDmaIdleIntMask_1_LSB 0x3B
+#define QIB_7322_IntMask_SDmaIdleIntMask_1_MSB 0x3B
+#define QIB_7322_IntMask_SDmaIdleIntMask_1_RMASK 0x1
+#define QIB_7322_IntMask_SDmaIdleIntMask_0_LSB 0x3A
+#define QIB_7322_IntMask_SDmaIdleIntMask_0_MSB 0x3A
+#define QIB_7322_IntMask_SDmaIdleIntMask_0_RMASK 0x1
+#define QIB_7322_IntMask_SDmaCleanupDoneMask_1_LSB 0x39
+#define QIB_7322_IntMask_SDmaCleanupDoneMask_1_MSB 0x39
+#define QIB_7322_IntMask_SDmaCleanupDoneMask_1_RMASK 0x1
+#define QIB_7322_IntMask_SDmaCleanupDoneMask_0_LSB 0x38
+#define QIB_7322_IntMask_SDmaCleanupDoneMask_0_MSB 0x38
+#define QIB_7322_IntMask_SDmaCleanupDoneMask_0_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg17IntMask_LSB 0x31
+#define QIB_7322_IntMask_RcvUrg17IntMask_MSB 0x31
+#define QIB_7322_IntMask_RcvUrg17IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg16IntMask_LSB 0x30
+#define QIB_7322_IntMask_RcvUrg16IntMask_MSB 0x30
+#define QIB_7322_IntMask_RcvUrg16IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg15IntMask_LSB 0x2F
+#define QIB_7322_IntMask_RcvUrg15IntMask_MSB 0x2F
+#define QIB_7322_IntMask_RcvUrg15IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg14IntMask_LSB 0x2E
+#define QIB_7322_IntMask_RcvUrg14IntMask_MSB 0x2E
+#define QIB_7322_IntMask_RcvUrg14IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg13IntMask_LSB 0x2D
+#define QIB_7322_IntMask_RcvUrg13IntMask_MSB 0x2D
+#define QIB_7322_IntMask_RcvUrg13IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg12IntMask_LSB 0x2C
+#define QIB_7322_IntMask_RcvUrg12IntMask_MSB 0x2C
+#define QIB_7322_IntMask_RcvUrg12IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg11IntMask_LSB 0x2B
+#define QIB_7322_IntMask_RcvUrg11IntMask_MSB 0x2B
+#define QIB_7322_IntMask_RcvUrg11IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg10IntMask_LSB 0x2A
+#define QIB_7322_IntMask_RcvUrg10IntMask_MSB 0x2A
+#define QIB_7322_IntMask_RcvUrg10IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg9IntMask_LSB 0x29
+#define QIB_7322_IntMask_RcvUrg9IntMask_MSB 0x29
+#define QIB_7322_IntMask_RcvUrg9IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg8IntMask_LSB 0x28
+#define QIB_7322_IntMask_RcvUrg8IntMask_MSB 0x28
+#define QIB_7322_IntMask_RcvUrg8IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg7IntMask_LSB 0x27
+#define QIB_7322_IntMask_RcvUrg7IntMask_MSB 0x27
+#define QIB_7322_IntMask_RcvUrg7IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg6IntMask_LSB 0x26
+#define QIB_7322_IntMask_RcvUrg6IntMask_MSB 0x26
+#define QIB_7322_IntMask_RcvUrg6IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg5IntMask_LSB 0x25
+#define QIB_7322_IntMask_RcvUrg5IntMask_MSB 0x25
+#define QIB_7322_IntMask_RcvUrg5IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg4IntMask_LSB 0x24
+#define QIB_7322_IntMask_RcvUrg4IntMask_MSB 0x24
+#define QIB_7322_IntMask_RcvUrg4IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg3IntMask_LSB 0x23
+#define QIB_7322_IntMask_RcvUrg3IntMask_MSB 0x23
+#define QIB_7322_IntMask_RcvUrg3IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg2IntMask_LSB 0x22
+#define QIB_7322_IntMask_RcvUrg2IntMask_MSB 0x22
+#define QIB_7322_IntMask_RcvUrg2IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg1IntMask_LSB 0x21
+#define QIB_7322_IntMask_RcvUrg1IntMask_MSB 0x21
+#define QIB_7322_IntMask_RcvUrg1IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvUrg0IntMask_LSB 0x20
+#define QIB_7322_IntMask_RcvUrg0IntMask_MSB 0x20
+#define QIB_7322_IntMask_RcvUrg0IntMask_RMASK 0x1
+#define QIB_7322_IntMask_ErrIntMask_1_LSB 0x1F
+#define QIB_7322_IntMask_ErrIntMask_1_MSB 0x1F
+#define QIB_7322_IntMask_ErrIntMask_1_RMASK 0x1
+#define QIB_7322_IntMask_ErrIntMask_0_LSB 0x1E
+#define QIB_7322_IntMask_ErrIntMask_0_MSB 0x1E
+#define QIB_7322_IntMask_ErrIntMask_0_RMASK 0x1
+#define QIB_7322_IntMask_ErrIntMask_LSB 0x1D
+#define QIB_7322_IntMask_ErrIntMask_MSB 0x1D
+#define QIB_7322_IntMask_ErrIntMask_RMASK 0x1
+#define QIB_7322_IntMask_AssertGPIOIntMask_LSB 0x1C
+#define QIB_7322_IntMask_AssertGPIOIntMask_MSB 0x1C
+#define QIB_7322_IntMask_AssertGPIOIntMask_RMASK 0x1
+#define QIB_7322_IntMask_SendDoneIntMask_1_LSB 0x19
+#define QIB_7322_IntMask_SendDoneIntMask_1_MSB 0x19
+#define QIB_7322_IntMask_SendDoneIntMask_1_RMASK 0x1
+#define QIB_7322_IntMask_SendDoneIntMask_0_LSB 0x18
+#define QIB_7322_IntMask_SendDoneIntMask_0_MSB 0x18
+#define QIB_7322_IntMask_SendDoneIntMask_0_RMASK 0x1
+#define QIB_7322_IntMask_SendBufAvailIntMask_LSB 0x17
+#define QIB_7322_IntMask_SendBufAvailIntMask_MSB 0x17
+#define QIB_7322_IntMask_SendBufAvailIntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail17IntMask_LSB 0x11
+#define QIB_7322_IntMask_RcvAvail17IntMask_MSB 0x11
+#define QIB_7322_IntMask_RcvAvail17IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail16IntMask_LSB 0x10
+#define QIB_7322_IntMask_RcvAvail16IntMask_MSB 0x10
+#define QIB_7322_IntMask_RcvAvail16IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail15IntMask_LSB 0xF
+#define QIB_7322_IntMask_RcvAvail15IntMask_MSB 0xF
+#define QIB_7322_IntMask_RcvAvail15IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail14IntMask_LSB 0xE
+#define QIB_7322_IntMask_RcvAvail14IntMask_MSB 0xE
+#define QIB_7322_IntMask_RcvAvail14IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail13IntMask_LSB 0xD
+#define QIB_7322_IntMask_RcvAvail13IntMask_MSB 0xD
+#define QIB_7322_IntMask_RcvAvail13IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail12IntMask_LSB 0xC
+#define QIB_7322_IntMask_RcvAvail12IntMask_MSB 0xC
+#define QIB_7322_IntMask_RcvAvail12IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail11IntMask_LSB 0xB
+#define QIB_7322_IntMask_RcvAvail11IntMask_MSB 0xB
+#define QIB_7322_IntMask_RcvAvail11IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail10IntMask_LSB 0xA
+#define QIB_7322_IntMask_RcvAvail10IntMask_MSB 0xA
+#define QIB_7322_IntMask_RcvAvail10IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail9IntMask_LSB 0x9
+#define QIB_7322_IntMask_RcvAvail9IntMask_MSB 0x9
+#define QIB_7322_IntMask_RcvAvail9IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail8IntMask_LSB 0x8
+#define QIB_7322_IntMask_RcvAvail8IntMask_MSB 0x8
+#define QIB_7322_IntMask_RcvAvail8IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail7IntMask_LSB 0x7
+#define QIB_7322_IntMask_RcvAvail7IntMask_MSB 0x7
+#define QIB_7322_IntMask_RcvAvail7IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail6IntMask_LSB 0x6
+#define QIB_7322_IntMask_RcvAvail6IntMask_MSB 0x6
+#define QIB_7322_IntMask_RcvAvail6IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail5IntMask_LSB 0x5
+#define QIB_7322_IntMask_RcvAvail5IntMask_MSB 0x5
+#define QIB_7322_IntMask_RcvAvail5IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail4IntMask_LSB 0x4
+#define QIB_7322_IntMask_RcvAvail4IntMask_MSB 0x4
+#define QIB_7322_IntMask_RcvAvail4IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail3IntMask_LSB 0x3
+#define QIB_7322_IntMask_RcvAvail3IntMask_MSB 0x3
+#define QIB_7322_IntMask_RcvAvail3IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail2IntMask_LSB 0x2
+#define QIB_7322_IntMask_RcvAvail2IntMask_MSB 0x2
+#define QIB_7322_IntMask_RcvAvail2IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail1IntMask_LSB 0x1
+#define QIB_7322_IntMask_RcvAvail1IntMask_MSB 0x1
+#define QIB_7322_IntMask_RcvAvail1IntMask_RMASK 0x1
+#define QIB_7322_IntMask_RcvAvail0IntMask_LSB 0x0
+#define QIB_7322_IntMask_RcvAvail0IntMask_MSB 0x0
+#define QIB_7322_IntMask_RcvAvail0IntMask_RMASK 0x1
+
+#define QIB_7322_IntStatus_OFFS 0x70
+#define QIB_7322_IntStatus_DEF 0x0000000000000000
+#define QIB_7322_IntStatus_SDmaInt_1_LSB 0x3F
+#define QIB_7322_IntStatus_SDmaInt_1_MSB 0x3F
+#define QIB_7322_IntStatus_SDmaInt_1_RMASK 0x1
+#define QIB_7322_IntStatus_SDmaInt_0_LSB 0x3E
+#define QIB_7322_IntStatus_SDmaInt_0_MSB 0x3E
+#define QIB_7322_IntStatus_SDmaInt_0_RMASK 0x1
+#define QIB_7322_IntStatus_SDmaProgressInt_1_LSB 0x3D
+#define QIB_7322_IntStatus_SDmaProgressInt_1_MSB 0x3D
+#define QIB_7322_IntStatus_SDmaProgressInt_1_RMASK 0x1
+#define QIB_7322_IntStatus_SDmaProgressInt_0_LSB 0x3C
+#define QIB_7322_IntStatus_SDmaProgressInt_0_MSB 0x3C
+#define QIB_7322_IntStatus_SDmaProgressInt_0_RMASK 0x1
+#define QIB_7322_IntStatus_SDmaIdleInt_1_LSB 0x3B
+#define QIB_7322_IntStatus_SDmaIdleInt_1_MSB 0x3B
+#define QIB_7322_IntStatus_SDmaIdleInt_1_RMASK 0x1
+#define QIB_7322_IntStatus_SDmaIdleInt_0_LSB 0x3A
+#define QIB_7322_IntStatus_SDmaIdleInt_0_MSB 0x3A
+#define QIB_7322_IntStatus_SDmaIdleInt_0_RMASK 0x1
+#define QIB_7322_IntStatus_SDmaCleanupDone_1_LSB 0x39
+#define QIB_7322_IntStatus_SDmaCleanupDone_1_MSB 0x39
+#define QIB_7322_IntStatus_SDmaCleanupDone_1_RMASK 0x1
+#define QIB_7322_IntStatus_SDmaCleanupDone_0_LSB 0x38
+#define QIB_7322_IntStatus_SDmaCleanupDone_0_MSB 0x38
+#define QIB_7322_IntStatus_SDmaCleanupDone_0_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg17_LSB 0x31
+#define QIB_7322_IntStatus_RcvUrg17_MSB 0x31
+#define QIB_7322_IntStatus_RcvUrg17_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg16_LSB 0x30
+#define QIB_7322_IntStatus_RcvUrg16_MSB 0x30
+#define QIB_7322_IntStatus_RcvUrg16_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg15_LSB 0x2F
+#define QIB_7322_IntStatus_RcvUrg15_MSB 0x2F
+#define QIB_7322_IntStatus_RcvUrg15_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg14_LSB 0x2E
+#define QIB_7322_IntStatus_RcvUrg14_MSB 0x2E
+#define QIB_7322_IntStatus_RcvUrg14_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg13_LSB 0x2D
+#define QIB_7322_IntStatus_RcvUrg13_MSB 0x2D
+#define QIB_7322_IntStatus_RcvUrg13_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg12_LSB 0x2C
+#define QIB_7322_IntStatus_RcvUrg12_MSB 0x2C
+#define QIB_7322_IntStatus_RcvUrg12_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg11_LSB 0x2B
+#define QIB_7322_IntStatus_RcvUrg11_MSB 0x2B
+#define QIB_7322_IntStatus_RcvUrg11_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg10_LSB 0x2A
+#define QIB_7322_IntStatus_RcvUrg10_MSB 0x2A
+#define QIB_7322_IntStatus_RcvUrg10_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg9_LSB 0x29
+#define QIB_7322_IntStatus_RcvUrg9_MSB 0x29
+#define QIB_7322_IntStatus_RcvUrg9_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg8_LSB 0x28
+#define QIB_7322_IntStatus_RcvUrg8_MSB 0x28
+#define QIB_7322_IntStatus_RcvUrg8_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg7_LSB 0x27
+#define QIB_7322_IntStatus_RcvUrg7_MSB 0x27
+#define QIB_7322_IntStatus_RcvUrg7_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg6_LSB 0x26
+#define QIB_7322_IntStatus_RcvUrg6_MSB 0x26
+#define QIB_7322_IntStatus_RcvUrg6_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg5_LSB 0x25
+#define QIB_7322_IntStatus_RcvUrg5_MSB 0x25
+#define QIB_7322_IntStatus_RcvUrg5_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg4_LSB 0x24
+#define QIB_7322_IntStatus_RcvUrg4_MSB 0x24
+#define QIB_7322_IntStatus_RcvUrg4_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg3_LSB 0x23
+#define QIB_7322_IntStatus_RcvUrg3_MSB 0x23
+#define QIB_7322_IntStatus_RcvUrg3_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg2_LSB 0x22
+#define QIB_7322_IntStatus_RcvUrg2_MSB 0x22
+#define QIB_7322_IntStatus_RcvUrg2_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg1_LSB 0x21
+#define QIB_7322_IntStatus_RcvUrg1_MSB 0x21
+#define QIB_7322_IntStatus_RcvUrg1_RMASK 0x1
+#define QIB_7322_IntStatus_RcvUrg0_LSB 0x20
+#define QIB_7322_IntStatus_RcvUrg0_MSB 0x20
+#define QIB_7322_IntStatus_RcvUrg0_RMASK 0x1
+#define QIB_7322_IntStatus_Err_1_LSB 0x1F
+#define QIB_7322_IntStatus_Err_1_MSB 0x1F
+#define QIB_7322_IntStatus_Err_1_RMASK 0x1
+#define QIB_7322_IntStatus_Err_0_LSB 0x1E
+#define QIB_7322_IntStatus_Err_0_MSB 0x1E
+#define QIB_7322_IntStatus_Err_0_RMASK 0x1
+#define QIB_7322_IntStatus_Err_LSB 0x1D
+#define QIB_7322_IntStatus_Err_MSB 0x1D
+#define QIB_7322_IntStatus_Err_RMASK 0x1
+#define QIB_7322_IntStatus_AssertGPIO_LSB 0x1C
+#define QIB_7322_IntStatus_AssertGPIO_MSB 0x1C
+#define QIB_7322_IntStatus_AssertGPIO_RMASK 0x1
+#define QIB_7322_IntStatus_SendDone_1_LSB 0x19
+#define QIB_7322_IntStatus_SendDone_1_MSB 0x19
+#define QIB_7322_IntStatus_SendDone_1_RMASK 0x1
+#define QIB_7322_IntStatus_SendDone_0_LSB 0x18
+#define QIB_7322_IntStatus_SendDone_0_MSB 0x18
+#define QIB_7322_IntStatus_SendDone_0_RMASK 0x1
+#define QIB_7322_IntStatus_SendBufAvail_LSB 0x17
+#define QIB_7322_IntStatus_SendBufAvail_MSB 0x17
+#define QIB_7322_IntStatus_SendBufAvail_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail17_LSB 0x11
+#define QIB_7322_IntStatus_RcvAvail17_MSB 0x11
+#define QIB_7322_IntStatus_RcvAvail17_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail16_LSB 0x10
+#define QIB_7322_IntStatus_RcvAvail16_MSB 0x10
+#define QIB_7322_IntStatus_RcvAvail16_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail15_LSB 0xF
+#define QIB_7322_IntStatus_RcvAvail15_MSB 0xF
+#define QIB_7322_IntStatus_RcvAvail15_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail14_LSB 0xE
+#define QIB_7322_IntStatus_RcvAvail14_MSB 0xE
+#define QIB_7322_IntStatus_RcvAvail14_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail13_LSB 0xD
+#define QIB_7322_IntStatus_RcvAvail13_MSB 0xD
+#define QIB_7322_IntStatus_RcvAvail13_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail12_LSB 0xC
+#define QIB_7322_IntStatus_RcvAvail12_MSB 0xC
+#define QIB_7322_IntStatus_RcvAvail12_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail11_LSB 0xB
+#define QIB_7322_IntStatus_RcvAvail11_MSB 0xB
+#define QIB_7322_IntStatus_RcvAvail11_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail10_LSB 0xA
+#define QIB_7322_IntStatus_RcvAvail10_MSB 0xA
+#define QIB_7322_IntStatus_RcvAvail10_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail9_LSB 0x9
+#define QIB_7322_IntStatus_RcvAvail9_MSB 0x9
+#define QIB_7322_IntStatus_RcvAvail9_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail8_LSB 0x8
+#define QIB_7322_IntStatus_RcvAvail8_MSB 0x8
+#define QIB_7322_IntStatus_RcvAvail8_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail7_LSB 0x7
+#define QIB_7322_IntStatus_RcvAvail7_MSB 0x7
+#define QIB_7322_IntStatus_RcvAvail7_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail6_LSB 0x6
+#define QIB_7322_IntStatus_RcvAvail6_MSB 0x6
+#define QIB_7322_IntStatus_RcvAvail6_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail5_LSB 0x5
+#define QIB_7322_IntStatus_RcvAvail5_MSB 0x5
+#define QIB_7322_IntStatus_RcvAvail5_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail4_LSB 0x4
+#define QIB_7322_IntStatus_RcvAvail4_MSB 0x4
+#define QIB_7322_IntStatus_RcvAvail4_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail3_LSB 0x3
+#define QIB_7322_IntStatus_RcvAvail3_MSB 0x3
+#define QIB_7322_IntStatus_RcvAvail3_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail2_LSB 0x2
+#define QIB_7322_IntStatus_RcvAvail2_MSB 0x2
+#define QIB_7322_IntStatus_RcvAvail2_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail1_LSB 0x1
+#define QIB_7322_IntStatus_RcvAvail1_MSB 0x1
+#define QIB_7322_IntStatus_RcvAvail1_RMASK 0x1
+#define QIB_7322_IntStatus_RcvAvail0_LSB 0x0
+#define QIB_7322_IntStatus_RcvAvail0_MSB 0x0
+#define QIB_7322_IntStatus_RcvAvail0_RMASK 0x1
+
+#define QIB_7322_IntClear_OFFS 0x78
+#define QIB_7322_IntClear_DEF 0x0000000000000000
+#define QIB_7322_IntClear_SDmaIntClear_1_LSB 0x3F
+#define QIB_7322_IntClear_SDmaIntClear_1_MSB 0x3F
+#define QIB_7322_IntClear_SDmaIntClear_1_RMASK 0x1
+#define QIB_7322_IntClear_SDmaIntClear_0_LSB 0x3E
+#define QIB_7322_IntClear_SDmaIntClear_0_MSB 0x3E
+#define QIB_7322_IntClear_SDmaIntClear_0_RMASK 0x1
+#define QIB_7322_IntClear_SDmaProgressIntClear_1_LSB 0x3D
+#define QIB_7322_IntClear_SDmaProgressIntClear_1_MSB 0x3D
+#define QIB_7322_IntClear_SDmaProgressIntClear_1_RMASK 0x1
+#define QIB_7322_IntClear_SDmaProgressIntClear_0_LSB 0x3C
+#define QIB_7322_IntClear_SDmaProgressIntClear_0_MSB 0x3C
+#define QIB_7322_IntClear_SDmaProgressIntClear_0_RMASK 0x1
+#define QIB_7322_IntClear_SDmaIdleIntClear_1_LSB 0x3B
+#define QIB_7322_IntClear_SDmaIdleIntClear_1_MSB 0x3B
+#define QIB_7322_IntClear_SDmaIdleIntClear_1_RMASK 0x1
+#define QIB_7322_IntClear_SDmaIdleIntClear_0_LSB 0x3A
+#define QIB_7322_IntClear_SDmaIdleIntClear_0_MSB 0x3A
+#define QIB_7322_IntClear_SDmaIdleIntClear_0_RMASK 0x1
+#define QIB_7322_IntClear_SDmaCleanupDoneClear_1_LSB 0x39
+#define QIB_7322_IntClear_SDmaCleanupDoneClear_1_MSB 0x39
+#define QIB_7322_IntClear_SDmaCleanupDoneClear_1_RMASK 0x1
+#define QIB_7322_IntClear_SDmaCleanupDoneClear_0_LSB 0x38
+#define QIB_7322_IntClear_SDmaCleanupDoneClear_0_MSB 0x38
+#define QIB_7322_IntClear_SDmaCleanupDoneClear_0_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg17IntClear_LSB 0x31
+#define QIB_7322_IntClear_RcvUrg17IntClear_MSB 0x31
+#define QIB_7322_IntClear_RcvUrg17IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg16IntClear_LSB 0x30
+#define QIB_7322_IntClear_RcvUrg16IntClear_MSB 0x30
+#define QIB_7322_IntClear_RcvUrg16IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg15IntClear_LSB 0x2F
+#define QIB_7322_IntClear_RcvUrg15IntClear_MSB 0x2F
+#define QIB_7322_IntClear_RcvUrg15IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg14IntClear_LSB 0x2E
+#define QIB_7322_IntClear_RcvUrg14IntClear_MSB 0x2E
+#define QIB_7322_IntClear_RcvUrg14IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg13IntClear_LSB 0x2D
+#define QIB_7322_IntClear_RcvUrg13IntClear_MSB 0x2D
+#define QIB_7322_IntClear_RcvUrg13IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg12IntClear_LSB 0x2C
+#define QIB_7322_IntClear_RcvUrg12IntClear_MSB 0x2C
+#define QIB_7322_IntClear_RcvUrg12IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg11IntClear_LSB 0x2B
+#define QIB_7322_IntClear_RcvUrg11IntClear_MSB 0x2B
+#define QIB_7322_IntClear_RcvUrg11IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg10IntClear_LSB 0x2A
+#define QIB_7322_IntClear_RcvUrg10IntClear_MSB 0x2A
+#define QIB_7322_IntClear_RcvUrg10IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg9IntClear_LSB 0x29
+#define QIB_7322_IntClear_RcvUrg9IntClear_MSB 0x29
+#define QIB_7322_IntClear_RcvUrg9IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg8IntClear_LSB 0x28
+#define QIB_7322_IntClear_RcvUrg8IntClear_MSB 0x28
+#define QIB_7322_IntClear_RcvUrg8IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg7IntClear_LSB 0x27
+#define QIB_7322_IntClear_RcvUrg7IntClear_MSB 0x27
+#define QIB_7322_IntClear_RcvUrg7IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg6IntClear_LSB 0x26
+#define QIB_7322_IntClear_RcvUrg6IntClear_MSB 0x26
+#define QIB_7322_IntClear_RcvUrg6IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg5IntClear_LSB 0x25
+#define QIB_7322_IntClear_RcvUrg5IntClear_MSB 0x25
+#define QIB_7322_IntClear_RcvUrg5IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg4IntClear_LSB 0x24
+#define QIB_7322_IntClear_RcvUrg4IntClear_MSB 0x24
+#define QIB_7322_IntClear_RcvUrg4IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg3IntClear_LSB 0x23
+#define QIB_7322_IntClear_RcvUrg3IntClear_MSB 0x23
+#define QIB_7322_IntClear_RcvUrg3IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg2IntClear_LSB 0x22
+#define QIB_7322_IntClear_RcvUrg2IntClear_MSB 0x22
+#define QIB_7322_IntClear_RcvUrg2IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg1IntClear_LSB 0x21
+#define QIB_7322_IntClear_RcvUrg1IntClear_MSB 0x21
+#define QIB_7322_IntClear_RcvUrg1IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvUrg0IntClear_LSB 0x20
+#define QIB_7322_IntClear_RcvUrg0IntClear_MSB 0x20
+#define QIB_7322_IntClear_RcvUrg0IntClear_RMASK 0x1
+#define QIB_7322_IntClear_ErrIntClear_1_LSB 0x1F
+#define QIB_7322_IntClear_ErrIntClear_1_MSB 0x1F
+#define QIB_7322_IntClear_ErrIntClear_1_RMASK 0x1
+#define QIB_7322_IntClear_ErrIntClear_0_LSB 0x1E
+#define QIB_7322_IntClear_ErrIntClear_0_MSB 0x1E
+#define QIB_7322_IntClear_ErrIntClear_0_RMASK 0x1
+#define QIB_7322_IntClear_ErrIntClear_LSB 0x1D
+#define QIB_7322_IntClear_ErrIntClear_MSB 0x1D
+#define QIB_7322_IntClear_ErrIntClear_RMASK 0x1
+#define QIB_7322_IntClear_AssertGPIOIntClear_LSB 0x1C
+#define QIB_7322_IntClear_AssertGPIOIntClear_MSB 0x1C
+#define QIB_7322_IntClear_AssertGPIOIntClear_RMASK 0x1
+#define QIB_7322_IntClear_SendDoneIntClear_1_LSB 0x19
+#define QIB_7322_IntClear_SendDoneIntClear_1_MSB 0x19
+#define QIB_7322_IntClear_SendDoneIntClear_1_RMASK 0x1
+#define QIB_7322_IntClear_SendDoneIntClear_0_LSB 0x18
+#define QIB_7322_IntClear_SendDoneIntClear_0_MSB 0x18
+#define QIB_7322_IntClear_SendDoneIntClear_0_RMASK 0x1
+#define QIB_7322_IntClear_SendBufAvailIntClear_LSB 0x17
+#define QIB_7322_IntClear_SendBufAvailIntClear_MSB 0x17
+#define QIB_7322_IntClear_SendBufAvailIntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail17IntClear_LSB 0x11
+#define QIB_7322_IntClear_RcvAvail17IntClear_MSB 0x11
+#define QIB_7322_IntClear_RcvAvail17IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail16IntClear_LSB 0x10
+#define QIB_7322_IntClear_RcvAvail16IntClear_MSB 0x10
+#define QIB_7322_IntClear_RcvAvail16IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail15IntClear_LSB 0xF
+#define QIB_7322_IntClear_RcvAvail15IntClear_MSB 0xF
+#define QIB_7322_IntClear_RcvAvail15IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail14IntClear_LSB 0xE
+#define QIB_7322_IntClear_RcvAvail14IntClear_MSB 0xE
+#define QIB_7322_IntClear_RcvAvail14IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail13IntClear_LSB 0xD
+#define QIB_7322_IntClear_RcvAvail13IntClear_MSB 0xD
+#define QIB_7322_IntClear_RcvAvail13IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail12IntClear_LSB 0xC
+#define QIB_7322_IntClear_RcvAvail12IntClear_MSB 0xC
+#define QIB_7322_IntClear_RcvAvail12IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail11IntClear_LSB 0xB
+#define QIB_7322_IntClear_RcvAvail11IntClear_MSB 0xB
+#define QIB_7322_IntClear_RcvAvail11IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail10IntClear_LSB 0xA
+#define QIB_7322_IntClear_RcvAvail10IntClear_MSB 0xA
+#define QIB_7322_IntClear_RcvAvail10IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail9IntClear_LSB 0x9
+#define QIB_7322_IntClear_RcvAvail9IntClear_MSB 0x9
+#define QIB_7322_IntClear_RcvAvail9IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail8IntClear_LSB 0x8
+#define QIB_7322_IntClear_RcvAvail8IntClear_MSB 0x8
+#define QIB_7322_IntClear_RcvAvail8IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail7IntClear_LSB 0x7
+#define QIB_7322_IntClear_RcvAvail7IntClear_MSB 0x7
+#define QIB_7322_IntClear_RcvAvail7IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail6IntClear_LSB 0x6
+#define QIB_7322_IntClear_RcvAvail6IntClear_MSB 0x6
+#define QIB_7322_IntClear_RcvAvail6IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail5IntClear_LSB 0x5
+#define QIB_7322_IntClear_RcvAvail5IntClear_MSB 0x5
+#define QIB_7322_IntClear_RcvAvail5IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail4IntClear_LSB 0x4
+#define QIB_7322_IntClear_RcvAvail4IntClear_MSB 0x4
+#define QIB_7322_IntClear_RcvAvail4IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail3IntClear_LSB 0x3
+#define QIB_7322_IntClear_RcvAvail3IntClear_MSB 0x3
+#define QIB_7322_IntClear_RcvAvail3IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail2IntClear_LSB 0x2
+#define QIB_7322_IntClear_RcvAvail2IntClear_MSB 0x2
+#define QIB_7322_IntClear_RcvAvail2IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail1IntClear_LSB 0x1
+#define QIB_7322_IntClear_RcvAvail1IntClear_MSB 0x1
+#define QIB_7322_IntClear_RcvAvail1IntClear_RMASK 0x1
+#define QIB_7322_IntClear_RcvAvail0IntClear_LSB 0x0
+#define QIB_7322_IntClear_RcvAvail0IntClear_MSB 0x0
+#define QIB_7322_IntClear_RcvAvail0IntClear_RMASK 0x1
+
+#define QIB_7322_ErrMask_OFFS 0x80
+#define QIB_7322_ErrMask_DEF 0x0000000000000000
+#define QIB_7322_ErrMask_ResetNegatedMask_LSB 0x3F
+#define QIB_7322_ErrMask_ResetNegatedMask_MSB 0x3F
+#define QIB_7322_ErrMask_ResetNegatedMask_RMASK 0x1
+#define QIB_7322_ErrMask_HardwareErrMask_LSB 0x3E
+#define QIB_7322_ErrMask_HardwareErrMask_MSB 0x3E
+#define QIB_7322_ErrMask_HardwareErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_InvalidAddrErrMask_LSB 0x3D
+#define QIB_7322_ErrMask_InvalidAddrErrMask_MSB 0x3D
+#define QIB_7322_ErrMask_InvalidAddrErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_SDmaVL15ErrMask_LSB 0x38
+#define QIB_7322_ErrMask_SDmaVL15ErrMask_MSB 0x38
+#define QIB_7322_ErrMask_SDmaVL15ErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_SBufVL15MisUseErrMask_LSB 0x37
+#define QIB_7322_ErrMask_SBufVL15MisUseErrMask_MSB 0x37
+#define QIB_7322_ErrMask_SBufVL15MisUseErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_InvalidEEPCmdMask_LSB 0x35
+#define QIB_7322_ErrMask_InvalidEEPCmdMask_MSB 0x35
+#define QIB_7322_ErrMask_InvalidEEPCmdMask_RMASK 0x1
+#define QIB_7322_ErrMask_RcvContextShareErrMask_LSB 0x34
+#define QIB_7322_ErrMask_RcvContextShareErrMask_MSB 0x34
+#define QIB_7322_ErrMask_RcvContextShareErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_SendVLMismatchErrMask_LSB 0x24
+#define QIB_7322_ErrMask_SendVLMismatchErrMask_MSB 0x24
+#define QIB_7322_ErrMask_SendVLMismatchErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_SendArmLaunchErrMask_LSB 0x23
+#define QIB_7322_ErrMask_SendArmLaunchErrMask_MSB 0x23
+#define QIB_7322_ErrMask_SendArmLaunchErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_SendSpecialTriggerErrMask_LSB 0x1B
+#define QIB_7322_ErrMask_SendSpecialTriggerErrMask_MSB 0x1B
+#define QIB_7322_ErrMask_SendSpecialTriggerErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_SDmaWrongPortErrMask_LSB 0x1A
+#define QIB_7322_ErrMask_SDmaWrongPortErrMask_MSB 0x1A
+#define QIB_7322_ErrMask_SDmaWrongPortErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_SDmaBufMaskDuplicateErrMask_LSB 0x19
+#define QIB_7322_ErrMask_SDmaBufMaskDuplicateErrMask_MSB 0x19
+#define QIB_7322_ErrMask_SDmaBufMaskDuplicateErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_RcvHdrFullErrMask_LSB 0xD
+#define QIB_7322_ErrMask_RcvHdrFullErrMask_MSB 0xD
+#define QIB_7322_ErrMask_RcvHdrFullErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_RcvEgrFullErrMask_LSB 0xC
+#define QIB_7322_ErrMask_RcvEgrFullErrMask_MSB 0xC
+#define QIB_7322_ErrMask_RcvEgrFullErrMask_RMASK 0x1
+
+#define QIB_7322_ErrStatus_OFFS 0x88
+#define QIB_7322_ErrStatus_DEF 0x0000000000000000
+#define QIB_7322_ErrStatus_ResetNegated_LSB 0x3F
+#define QIB_7322_ErrStatus_ResetNegated_MSB 0x3F
+#define QIB_7322_ErrStatus_ResetNegated_RMASK 0x1
+#define QIB_7322_ErrStatus_HardwareErr_LSB 0x3E
+#define QIB_7322_ErrStatus_HardwareErr_MSB 0x3E
+#define QIB_7322_ErrStatus_HardwareErr_RMASK 0x1
+#define QIB_7322_ErrStatus_InvalidAddrErr_LSB 0x3D
+#define QIB_7322_ErrStatus_InvalidAddrErr_MSB 0x3D
+#define QIB_7322_ErrStatus_InvalidAddrErr_RMASK 0x1
+#define QIB_7322_ErrStatus_SDmaVL15Err_LSB 0x38
+#define QIB_7322_ErrStatus_SDmaVL15Err_MSB 0x38
+#define QIB_7322_ErrStatus_SDmaVL15Err_RMASK 0x1
+#define QIB_7322_ErrStatus_SBufVL15MisUseErr_LSB 0x37
+#define QIB_7322_ErrStatus_SBufVL15MisUseErr_MSB 0x37
+#define QIB_7322_ErrStatus_SBufVL15MisUseErr_RMASK 0x1
+#define QIB_7322_ErrStatus_InvalidEEPCmdErr_LSB 0x35
+#define QIB_7322_ErrStatus_InvalidEEPCmdErr_MSB 0x35
+#define QIB_7322_ErrStatus_InvalidEEPCmdErr_RMASK 0x1
+#define QIB_7322_ErrStatus_RcvContextShareErr_LSB 0x34
+#define QIB_7322_ErrStatus_RcvContextShareErr_MSB 0x34
+#define QIB_7322_ErrStatus_RcvContextShareErr_RMASK 0x1
+#define QIB_7322_ErrStatus_SendVLMismatchErr_LSB 0x24
+#define QIB_7322_ErrStatus_SendVLMismatchErr_MSB 0x24
+#define QIB_7322_ErrStatus_SendVLMismatchErr_RMASK 0x1
+#define QIB_7322_ErrStatus_SendArmLaunchErr_LSB 0x23
+#define QIB_7322_ErrStatus_SendArmLaunchErr_MSB 0x23
+#define QIB_7322_ErrStatus_SendArmLaunchErr_RMASK 0x1
+#define QIB_7322_ErrStatus_SendSpecialTriggerErr_LSB 0x1B
+#define QIB_7322_ErrStatus_SendSpecialTriggerErr_MSB 0x1B
+#define QIB_7322_ErrStatus_SendSpecialTriggerErr_RMASK 0x1
+#define QIB_7322_ErrStatus_SDmaWrongPortErr_LSB 0x1A
+#define QIB_7322_ErrStatus_SDmaWrongPortErr_MSB 0x1A
+#define QIB_7322_ErrStatus_SDmaWrongPortErr_RMASK 0x1
+#define QIB_7322_ErrStatus_SDmaBufMaskDuplicateErr_LSB 0x19
+#define QIB_7322_ErrStatus_SDmaBufMaskDuplicateErr_MSB 0x19
+#define QIB_7322_ErrStatus_SDmaBufMaskDuplicateErr_RMASK 0x1
+#define QIB_7322_ErrStatus_RcvHdrFullErr_LSB 0xD
+#define QIB_7322_ErrStatus_RcvHdrFullErr_MSB 0xD
+#define QIB_7322_ErrStatus_RcvHdrFullErr_RMASK 0x1
+#define QIB_7322_ErrStatus_RcvEgrFullErr_LSB 0xC
+#define QIB_7322_ErrStatus_RcvEgrFullErr_MSB 0xC
+#define QIB_7322_ErrStatus_RcvEgrFullErr_RMASK 0x1
+
+#define QIB_7322_ErrClear_OFFS 0x90
+#define QIB_7322_ErrClear_DEF 0x0000000000000000
+#define QIB_7322_ErrClear_ResetNegatedClear_LSB 0x3F
+#define QIB_7322_ErrClear_ResetNegatedClear_MSB 0x3F
+#define QIB_7322_ErrClear_ResetNegatedClear_RMASK 0x1
+#define QIB_7322_ErrClear_HardwareErrClear_LSB 0x3E
+#define QIB_7322_ErrClear_HardwareErrClear_MSB 0x3E
+#define QIB_7322_ErrClear_HardwareErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_InvalidAddrErrClear_LSB 0x3D
+#define QIB_7322_ErrClear_InvalidAddrErrClear_MSB 0x3D
+#define QIB_7322_ErrClear_InvalidAddrErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_SDmaVL15ErrClear_LSB 0x38
+#define QIB_7322_ErrClear_SDmaVL15ErrClear_MSB 0x38
+#define QIB_7322_ErrClear_SDmaVL15ErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_SBufVL15MisUseErrClear_LSB 0x37
+#define QIB_7322_ErrClear_SBufVL15MisUseErrClear_MSB 0x37
+#define QIB_7322_ErrClear_SBufVL15MisUseErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_InvalidEEPCmdErrClear_LSB 0x35
+#define QIB_7322_ErrClear_InvalidEEPCmdErrClear_MSB 0x35
+#define QIB_7322_ErrClear_InvalidEEPCmdErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_RcvContextShareErrClear_LSB 0x34
+#define QIB_7322_ErrClear_RcvContextShareErrClear_MSB 0x34
+#define QIB_7322_ErrClear_RcvContextShareErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_SendVLMismatchErrMask_LSB 0x24
+#define QIB_7322_ErrClear_SendVLMismatchErrMask_MSB 0x24
+#define QIB_7322_ErrClear_SendVLMismatchErrMask_RMASK 0x1
+#define QIB_7322_ErrClear_SendArmLaunchErrClear_LSB 0x23
+#define QIB_7322_ErrClear_SendArmLaunchErrClear_MSB 0x23
+#define QIB_7322_ErrClear_SendArmLaunchErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_SendSpecialTriggerErrClear_LSB 0x1B
+#define QIB_7322_ErrClear_SendSpecialTriggerErrClear_MSB 0x1B
+#define QIB_7322_ErrClear_SendSpecialTriggerErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_SDmaWrongPortErrClear_LSB 0x1A
+#define QIB_7322_ErrClear_SDmaWrongPortErrClear_MSB 0x1A
+#define QIB_7322_ErrClear_SDmaWrongPortErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_SDmaBufMaskDuplicateErrClear_LSB 0x19
+#define QIB_7322_ErrClear_SDmaBufMaskDuplicateErrClear_MSB 0x19
+#define QIB_7322_ErrClear_SDmaBufMaskDuplicateErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_RcvHdrFullErrClear_LSB 0xD
+#define QIB_7322_ErrClear_RcvHdrFullErrClear_MSB 0xD
+#define QIB_7322_ErrClear_RcvHdrFullErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_RcvEgrFullErrClear_LSB 0xC
+#define QIB_7322_ErrClear_RcvEgrFullErrClear_MSB 0xC
+#define QIB_7322_ErrClear_RcvEgrFullErrClear_RMASK 0x1
+
+#define QIB_7322_HwErrMask_OFFS 0x98
+#define QIB_7322_HwErrMask_DEF 0x0000000000000000
+#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_1_LSB 0x3F
+#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_1_MSB 0x3F
+#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_1_RMASK 0x1
+#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_0_LSB 0x3E
+#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_0_MSB 0x3E
+#define QIB_7322_HwErrMask_IBSerdesPClkNotDetectMask_0_RMASK 0x1
+#define QIB_7322_HwErrMask_PCIESerdesPClkNotDetectMask_LSB 0x37
+#define QIB_7322_HwErrMask_PCIESerdesPClkNotDetectMask_MSB 0x37
+#define QIB_7322_HwErrMask_PCIESerdesPClkNotDetectMask_RMASK 0x1
+#define QIB_7322_HwErrMask_PowerOnBISTFailedMask_LSB 0x36
+#define QIB_7322_HwErrMask_PowerOnBISTFailedMask_MSB 0x36
+#define QIB_7322_HwErrMask_PowerOnBISTFailedMask_RMASK 0x1
+#define QIB_7322_HwErrMask_TempsenseTholdReachedMask_LSB 0x35
+#define QIB_7322_HwErrMask_TempsenseTholdReachedMask_MSB 0x35
+#define QIB_7322_HwErrMask_TempsenseTholdReachedMask_RMASK 0x1
+#define QIB_7322_HwErrMask_MemoryErrMask_LSB 0x30
+#define QIB_7322_HwErrMask_MemoryErrMask_MSB 0x30
+#define QIB_7322_HwErrMask_MemoryErrMask_RMASK 0x1
+#define QIB_7322_HwErrMask_pcie_phy_txParityErr_LSB 0x22
+#define QIB_7322_HwErrMask_pcie_phy_txParityErr_MSB 0x22
+#define QIB_7322_HwErrMask_pcie_phy_txParityErr_RMASK 0x1
+#define QIB_7322_HwErrMask_PCIeBusParityErrMask_LSB 0x1F
+#define QIB_7322_HwErrMask_PCIeBusParityErrMask_MSB 0x21
+#define QIB_7322_HwErrMask_PCIeBusParityErrMask_RMASK 0x7
+#define QIB_7322_HwErrMask_PcieCplTimeoutMask_LSB 0x1E
+#define QIB_7322_HwErrMask_PcieCplTimeoutMask_MSB 0x1E
+#define QIB_7322_HwErrMask_PcieCplTimeoutMask_RMASK 0x1
+#define QIB_7322_HwErrMask_PciePoisonedTLPMask_LSB 0x1D
+#define QIB_7322_HwErrMask_PciePoisonedTLPMask_MSB 0x1D
+#define QIB_7322_HwErrMask_PciePoisonedTLPMask_RMASK 0x1
+#define QIB_7322_HwErrMask_SDmaMemReadErrMask_1_LSB 0x1C
+#define QIB_7322_HwErrMask_SDmaMemReadErrMask_1_MSB 0x1C
+#define QIB_7322_HwErrMask_SDmaMemReadErrMask_1_RMASK 0x1
+#define QIB_7322_HwErrMask_SDmaMemReadErrMask_0_LSB 0x1B
+#define QIB_7322_HwErrMask_SDmaMemReadErrMask_0_MSB 0x1B
+#define QIB_7322_HwErrMask_SDmaMemReadErrMask_0_RMASK 0x1
+#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_1_LSB 0xF
+#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_1_MSB 0xF
+#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_1_RMASK 0x1
+#define QIB_7322_HwErrMask_IBCBusToSPCParityErrMask_1_LSB 0xE
+#define QIB_7322_HwErrMask_IBCBusToSPCParityErrMask_1_MSB 0xE
+#define QIB_7322_HwErrMask_IBCBusToSPCParityErrMask_1_RMASK 0x1
+#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_0_LSB 0xD
+#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_0_MSB 0xD
+#define QIB_7322_HwErrMask_IBCBusFromSPCParityErrMask_0_RMASK 0x1
+#define QIB_7322_HwErrMask_statusValidNoEopMask_LSB 0xC
+#define QIB_7322_HwErrMask_statusValidNoEopMask_MSB 0xC
+#define QIB_7322_HwErrMask_statusValidNoEopMask_RMASK 0x1
+#define QIB_7322_HwErrMask_LATriggeredMask_LSB 0xB
+#define QIB_7322_HwErrMask_LATriggeredMask_MSB 0xB
+#define QIB_7322_HwErrMask_LATriggeredMask_RMASK 0x1
+
+#define QIB_7322_HwErrStatus_OFFS 0xA0
+#define QIB_7322_HwErrStatus_DEF 0x0000000000000000
+#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_1_LSB 0x3F
+#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_1_MSB 0x3F
+#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_1_RMASK 0x1
+#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_0_LSB 0x3E
+#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_0_MSB 0x3E
+#define QIB_7322_HwErrStatus_IBSerdesPClkNotDetect_0_RMASK 0x1
+#define QIB_7322_HwErrStatus_PCIESerdesPClkNotDetect_LSB 0x37
+#define QIB_7322_HwErrStatus_PCIESerdesPClkNotDetect_MSB 0x37
+#define QIB_7322_HwErrStatus_PCIESerdesPClkNotDetect_RMASK 0x1
+#define QIB_7322_HwErrStatus_PowerOnBISTFailed_LSB 0x36
+#define QIB_7322_HwErrStatus_PowerOnBISTFailed_MSB 0x36
+#define QIB_7322_HwErrStatus_PowerOnBISTFailed_RMASK 0x1
+#define QIB_7322_HwErrStatus_TempsenseTholdReached_LSB 0x35
+#define QIB_7322_HwErrStatus_TempsenseTholdReached_MSB 0x35
+#define QIB_7322_HwErrStatus_TempsenseTholdReached_RMASK 0x1
+#define QIB_7322_HwErrStatus_MemoryErr_LSB 0x30
+#define QIB_7322_HwErrStatus_MemoryErr_MSB 0x30
+#define QIB_7322_HwErrStatus_MemoryErr_RMASK 0x1
+#define QIB_7322_HwErrStatus_pcie_phy_txParityErr_LSB 0x22
+#define QIB_7322_HwErrStatus_pcie_phy_txParityErr_MSB 0x22
+#define QIB_7322_HwErrStatus_pcie_phy_txParityErr_RMASK 0x1
+#define QIB_7322_HwErrStatus_PCIeBusParity_LSB 0x1F
+#define QIB_7322_HwErrStatus_PCIeBusParity_MSB 0x21
+#define QIB_7322_HwErrStatus_PCIeBusParity_RMASK 0x7
+#define QIB_7322_HwErrStatus_PcieCplTimeout_LSB 0x1E
+#define QIB_7322_HwErrStatus_PcieCplTimeout_MSB 0x1E
+#define QIB_7322_HwErrStatus_PcieCplTimeout_RMASK 0x1
+#define QIB_7322_HwErrStatus_PciePoisonedTLP_LSB 0x1D
+#define QIB_7322_HwErrStatus_PciePoisonedTLP_MSB 0x1D
+#define QIB_7322_HwErrStatus_PciePoisonedTLP_RMASK 0x1
+#define QIB_7322_HwErrStatus_SDmaMemReadErr_1_LSB 0x1C
+#define QIB_7322_HwErrStatus_SDmaMemReadErr_1_MSB 0x1C
+#define QIB_7322_HwErrStatus_SDmaMemReadErr_1_RMASK 0x1
+#define QIB_7322_HwErrStatus_SDmaMemReadErr_0_LSB 0x1B
+#define QIB_7322_HwErrStatus_SDmaMemReadErr_0_MSB 0x1B
+#define QIB_7322_HwErrStatus_SDmaMemReadErr_0_RMASK 0x1
+#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_1_LSB 0xF
+#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_1_MSB 0xF
+#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_1_RMASK 0x1
+#define QIB_7322_HwErrStatus_IBCBusToSPCParityErr_1_LSB 0xE
+#define QIB_7322_HwErrStatus_IBCBusToSPCParityErr_1_MSB 0xE
+#define QIB_7322_HwErrStatus_IBCBusToSPCParityErr_1_RMASK 0x1
+#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_0_LSB 0xD
+#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_0_MSB 0xD
+#define QIB_7322_HwErrStatus_IBCBusFromSPCParityErr_0_RMASK 0x1
+#define QIB_7322_HwErrStatus_statusValidNoEop_LSB 0xC
+#define QIB_7322_HwErrStatus_statusValidNoEop_MSB 0xC
+#define QIB_7322_HwErrStatus_statusValidNoEop_RMASK 0x1
+#define QIB_7322_HwErrStatus_LATriggered_LSB 0xB
+#define QIB_7322_HwErrStatus_LATriggered_MSB 0xB
+#define QIB_7322_HwErrStatus_LATriggered_RMASK 0x1
+
+#define QIB_7322_HwErrClear_OFFS 0xA8
+#define QIB_7322_HwErrClear_DEF 0x0000000000000000
+#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_1_LSB 0x3F
+#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_1_MSB 0x3F
+#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_1_RMASK 0x1
+#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_0_LSB 0x3E
+#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_0_MSB 0x3E
+#define QIB_7322_HwErrClear_IBSerdesPClkNotDetectClear_0_RMASK 0x1
+#define QIB_7322_HwErrClear_PCIESerdesPClkNotDetectClear_LSB 0x37
+#define QIB_7322_HwErrClear_PCIESerdesPClkNotDetectClear_MSB 0x37
+#define QIB_7322_HwErrClear_PCIESerdesPClkNotDetectClear_RMASK 0x1
+#define QIB_7322_HwErrClear_PowerOnBISTFailedClear_LSB 0x36
+#define QIB_7322_HwErrClear_PowerOnBISTFailedClear_MSB 0x36
+#define QIB_7322_HwErrClear_PowerOnBISTFailedClear_RMASK 0x1
+#define QIB_7322_HwErrClear_TempsenseTholdReachedClear_LSB 0x35
+#define QIB_7322_HwErrClear_TempsenseTholdReachedClear_MSB 0x35
+#define QIB_7322_HwErrClear_TempsenseTholdReachedClear_RMASK 0x1
+#define QIB_7322_HwErrClear_MemoryErrClear_LSB 0x30
+#define QIB_7322_HwErrClear_MemoryErrClear_MSB 0x30
+#define QIB_7322_HwErrClear_MemoryErrClear_RMASK 0x1
+#define QIB_7322_HwErrClear_pcie_phy_txParityErr_LSB 0x22
+#define QIB_7322_HwErrClear_pcie_phy_txParityErr_MSB 0x22
+#define QIB_7322_HwErrClear_pcie_phy_txParityErr_RMASK 0x1
+#define QIB_7322_HwErrClear_PCIeBusParityClear_LSB 0x1F
+#define QIB_7322_HwErrClear_PCIeBusParityClear_MSB 0x21
+#define QIB_7322_HwErrClear_PCIeBusParityClear_RMASK 0x7
+#define QIB_7322_HwErrClear_PcieCplTimeoutClear_LSB 0x1E
+#define QIB_7322_HwErrClear_PcieCplTimeoutClear_MSB 0x1E
+#define QIB_7322_HwErrClear_PcieCplTimeoutClear_RMASK 0x1
+#define QIB_7322_HwErrClear_PciePoisonedTLPClear_LSB 0x1D
+#define QIB_7322_HwErrClear_PciePoisonedTLPClear_MSB 0x1D
+#define QIB_7322_HwErrClear_PciePoisonedTLPClear_RMASK 0x1
+#define QIB_7322_HwErrClear_SDmaMemReadErrClear_1_LSB 0x1C
+#define QIB_7322_HwErrClear_SDmaMemReadErrClear_1_MSB 0x1C
+#define QIB_7322_HwErrClear_SDmaMemReadErrClear_1_RMASK 0x1
+#define QIB_7322_HwErrClear_SDmaMemReadErrClear_0_LSB 0x1B
+#define QIB_7322_HwErrClear_SDmaMemReadErrClear_0_MSB 0x1B
+#define QIB_7322_HwErrClear_SDmaMemReadErrClear_0_RMASK 0x1
+#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_1_LSB 0xF
+#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_1_MSB 0xF
+#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_1_RMASK 0x1
+#define QIB_7322_HwErrClear_IBCBusToSPCParityErrClear_1_LSB 0xE
+#define QIB_7322_HwErrClear_IBCBusToSPCParityErrClear_1_MSB 0xE
+#define QIB_7322_HwErrClear_IBCBusToSPCParityErrClear_1_RMASK 0x1
+#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_0_LSB 0xD
+#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_0_MSB 0xD
+#define QIB_7322_HwErrClear_IBCBusFromSPCParityErrClear_0_RMASK 0x1
+#define QIB_7322_HwErrClear_statusValidNoEopClear_LSB 0xC
+#define QIB_7322_HwErrClear_statusValidNoEopClear_MSB 0xC
+#define QIB_7322_HwErrClear_statusValidNoEopClear_RMASK 0x1
+#define QIB_7322_HwErrClear_LATriggeredClear_LSB 0xB
+#define QIB_7322_HwErrClear_LATriggeredClear_MSB 0xB
+#define QIB_7322_HwErrClear_LATriggeredClear_RMASK 0x1
+
+#define QIB_7322_HwDiagCtrl_OFFS 0xB0
+#define QIB_7322_HwDiagCtrl_DEF 0x0000000000000000
+#define QIB_7322_HwDiagCtrl_Diagnostic_LSB 0x3F
+#define QIB_7322_HwDiagCtrl_Diagnostic_MSB 0x3F
+#define QIB_7322_HwDiagCtrl_Diagnostic_RMASK 0x1
+#define QIB_7322_HwDiagCtrl_CounterWrEnable_LSB 0x3D
+#define QIB_7322_HwDiagCtrl_CounterWrEnable_MSB 0x3D
+#define QIB_7322_HwDiagCtrl_CounterWrEnable_RMASK 0x1
+#define QIB_7322_HwDiagCtrl_CounterDisable_LSB 0x3C
+#define QIB_7322_HwDiagCtrl_CounterDisable_MSB 0x3C
+#define QIB_7322_HwDiagCtrl_CounterDisable_RMASK 0x1
+#define QIB_7322_HwDiagCtrl_forcePCIeBusParity_LSB 0x1F
+#define QIB_7322_HwDiagCtrl_forcePCIeBusParity_MSB 0x22
+#define QIB_7322_HwDiagCtrl_forcePCIeBusParity_RMASK 0xF
+#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_1_LSB 0xF
+#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_1_MSB 0xF
+#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_1_RMASK 0x1
+#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_1_LSB 0xE
+#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_1_MSB 0xE
+#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_1_RMASK 0x1
+#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_0_LSB 0xD
+#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_0_MSB 0xD
+#define QIB_7322_HwDiagCtrl_ForceIBCBusFromSPCParityErr_0_RMASK 0x1
+#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_0_LSB 0xC
+#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_0_MSB 0xC
+#define QIB_7322_HwDiagCtrl_ForceIBCBusToSPCParityErr_0_RMASK 0x1
+
+#define QIB_7322_EXTStatus_OFFS 0xC0
+#define QIB_7322_EXTStatus_DEF 0x000000000000X000
+#define QIB_7322_EXTStatus_GPIOIn_LSB 0x30
+#define QIB_7322_EXTStatus_GPIOIn_MSB 0x3F
+#define QIB_7322_EXTStatus_GPIOIn_RMASK 0xFFFF
+#define QIB_7322_EXTStatus_MemBISTDisabled_LSB 0xF
+#define QIB_7322_EXTStatus_MemBISTDisabled_MSB 0xF
+#define QIB_7322_EXTStatus_MemBISTDisabled_RMASK 0x1
+#define QIB_7322_EXTStatus_MemBISTEndTest_LSB 0xE
+#define QIB_7322_EXTStatus_MemBISTEndTest_MSB 0xE
+#define QIB_7322_EXTStatus_MemBISTEndTest_RMASK 0x1
+
+#define QIB_7322_EXTCtrl_OFFS 0xC8
+#define QIB_7322_EXTCtrl_DEF 0x0000000000000000
+#define QIB_7322_EXTCtrl_GPIOOe_LSB 0x30
+#define QIB_7322_EXTCtrl_GPIOOe_MSB 0x3F
+#define QIB_7322_EXTCtrl_GPIOOe_RMASK 0xFFFF
+#define QIB_7322_EXTCtrl_GPIOInvert_LSB 0x20
+#define QIB_7322_EXTCtrl_GPIOInvert_MSB 0x2F
+#define QIB_7322_EXTCtrl_GPIOInvert_RMASK 0xFFFF
+#define QIB_7322_EXTCtrl_LEDPort1GreenOn_LSB 0x3
+#define QIB_7322_EXTCtrl_LEDPort1GreenOn_MSB 0x3
+#define QIB_7322_EXTCtrl_LEDPort1GreenOn_RMASK 0x1
+#define QIB_7322_EXTCtrl_LEDPort1YellowOn_LSB 0x2
+#define QIB_7322_EXTCtrl_LEDPort1YellowOn_MSB 0x2
+#define QIB_7322_EXTCtrl_LEDPort1YellowOn_RMASK 0x1
+#define QIB_7322_EXTCtrl_LEDPort0GreenOn_LSB 0x1
+#define QIB_7322_EXTCtrl_LEDPort0GreenOn_MSB 0x1
+#define QIB_7322_EXTCtrl_LEDPort0GreenOn_RMASK 0x1
+#define QIB_7322_EXTCtrl_LEDPort0YellowOn_LSB 0x0
+#define QIB_7322_EXTCtrl_LEDPort0YellowOn_MSB 0x0
+#define QIB_7322_EXTCtrl_LEDPort0YellowOn_RMASK 0x1
+
+#define QIB_7322_GPIOOut_OFFS 0xE0
+#define QIB_7322_GPIOOut_DEF 0x0000000000000000
+
+#define QIB_7322_GPIOMask_OFFS 0xE8
+#define QIB_7322_GPIOMask_DEF 0x0000000000000000
+
+#define QIB_7322_GPIOStatus_OFFS 0xF0
+#define QIB_7322_GPIOStatus_DEF 0x0000000000000000
+
+#define QIB_7322_GPIOClear_OFFS 0xF8
+#define QIB_7322_GPIOClear_DEF 0x0000000000000000
+
+#define QIB_7322_RcvCtrl_OFFS 0x100
+#define QIB_7322_RcvCtrl_DEF 0x0000000000000000
+#define QIB_7322_RcvCtrl_TidReDirect_LSB 0x30
+#define QIB_7322_RcvCtrl_TidReDirect_MSB 0x3F
+#define QIB_7322_RcvCtrl_TidReDirect_RMASK 0xFFFF
+#define QIB_7322_RcvCtrl_TailUpd_LSB 0x2F
+#define QIB_7322_RcvCtrl_TailUpd_MSB 0x2F
+#define QIB_7322_RcvCtrl_TailUpd_RMASK 0x1
+#define QIB_7322_RcvCtrl_XrcTypeCode_LSB 0x2C
+#define QIB_7322_RcvCtrl_XrcTypeCode_MSB 0x2E
+#define QIB_7322_RcvCtrl_XrcTypeCode_RMASK 0x7
+#define QIB_7322_RcvCtrl_TidFlowEnable_LSB 0x2B
+#define QIB_7322_RcvCtrl_TidFlowEnable_MSB 0x2B
+#define QIB_7322_RcvCtrl_TidFlowEnable_RMASK 0x1
+#define QIB_7322_RcvCtrl_ContextCfg_LSB 0x29
+#define QIB_7322_RcvCtrl_ContextCfg_MSB 0x2A
+#define QIB_7322_RcvCtrl_ContextCfg_RMASK 0x3
+#define QIB_7322_RcvCtrl_IntrAvail_LSB 0x14
+#define QIB_7322_RcvCtrl_IntrAvail_MSB 0x25
+#define QIB_7322_RcvCtrl_IntrAvail_RMASK 0x3FFFF
+#define QIB_7322_RcvCtrl_dontDropRHQFull_LSB 0x0
+#define QIB_7322_RcvCtrl_dontDropRHQFull_MSB 0x11
+#define QIB_7322_RcvCtrl_dontDropRHQFull_RMASK 0x3FFFF
+
+#define QIB_7322_RcvHdrSize_OFFS 0x110
+#define QIB_7322_RcvHdrSize_DEF 0x0000000000000000
+
+#define QIB_7322_RcvHdrCnt_OFFS 0x118
+#define QIB_7322_RcvHdrCnt_DEF 0x0000000000000000
+
+#define QIB_7322_RcvHdrEntSize_OFFS 0x120
+#define QIB_7322_RcvHdrEntSize_DEF 0x0000000000000000
+
+#define QIB_7322_RcvTIDBase_OFFS 0x128
+#define QIB_7322_RcvTIDBase_DEF 0x0000000000050000
+
+#define QIB_7322_RcvTIDCnt_OFFS 0x130
+#define QIB_7322_RcvTIDCnt_DEF 0x0000000000000200
+
+#define QIB_7322_RcvEgrBase_OFFS 0x138
+#define QIB_7322_RcvEgrBase_DEF 0x0000000000014000
+
+#define QIB_7322_RcvEgrCnt_OFFS 0x140
+#define QIB_7322_RcvEgrCnt_DEF 0x0000000000001000
+
+#define QIB_7322_RcvBufBase_OFFS 0x148
+#define QIB_7322_RcvBufBase_DEF 0x0000000000080000
+
+#define QIB_7322_RcvBufSize_OFFS 0x150
+#define QIB_7322_RcvBufSize_DEF 0x0000000000005000
+
+#define QIB_7322_RxIntMemBase_OFFS 0x158
+#define QIB_7322_RxIntMemBase_DEF 0x0000000000077000
+
+#define QIB_7322_RxIntMemSize_OFFS 0x160
+#define QIB_7322_RxIntMemSize_DEF 0x0000000000007000
+
+#define QIB_7322_feature_mask_OFFS 0x190
+#define QIB_7322_feature_mask_DEF 0x00000000000000XX
+
+#define QIB_7322_active_feature_mask_OFFS 0x198
+#define QIB_7322_active_feature_mask_DEF 0x00000000000000XX
+#define QIB_7322_active_feature_mask_Port1_QDR_Enabled_LSB 0x5
+#define QIB_7322_active_feature_mask_Port1_QDR_Enabled_MSB 0x5
+#define QIB_7322_active_feature_mask_Port1_QDR_Enabled_RMASK 0x1
+#define QIB_7322_active_feature_mask_Port1_DDR_Enabled_LSB 0x4
+#define QIB_7322_active_feature_mask_Port1_DDR_Enabled_MSB 0x4
+#define QIB_7322_active_feature_mask_Port1_DDR_Enabled_RMASK 0x1
+#define QIB_7322_active_feature_mask_Port1_SDR_Enabled_LSB 0x3
+#define QIB_7322_active_feature_mask_Port1_SDR_Enabled_MSB 0x3
+#define QIB_7322_active_feature_mask_Port1_SDR_Enabled_RMASK 0x1
+#define QIB_7322_active_feature_mask_Port0_QDR_Enabled_LSB 0x2
+#define QIB_7322_active_feature_mask_Port0_QDR_Enabled_MSB 0x2
+#define QIB_7322_active_feature_mask_Port0_QDR_Enabled_RMASK 0x1
+#define QIB_7322_active_feature_mask_Port0_DDR_Enabled_LSB 0x1
+#define QIB_7322_active_feature_mask_Port0_DDR_Enabled_MSB 0x1
+#define QIB_7322_active_feature_mask_Port0_DDR_Enabled_RMASK 0x1
+#define QIB_7322_active_feature_mask_Port0_SDR_Enabled_LSB 0x0
+#define QIB_7322_active_feature_mask_Port0_SDR_Enabled_MSB 0x0
+#define QIB_7322_active_feature_mask_Port0_SDR_Enabled_RMASK 0x1
+
+#define QIB_7322_SendCtrl_OFFS 0x1C0
+#define QIB_7322_SendCtrl_DEF 0x0000000000000000
+#define QIB_7322_SendCtrl_Disarm_LSB 0x1F
+#define QIB_7322_SendCtrl_Disarm_MSB 0x1F
+#define QIB_7322_SendCtrl_Disarm_RMASK 0x1
+#define QIB_7322_SendCtrl_SendBufAvailPad64Byte_LSB 0x1D
+#define QIB_7322_SendCtrl_SendBufAvailPad64Byte_MSB 0x1D
+#define QIB_7322_SendCtrl_SendBufAvailPad64Byte_RMASK 0x1
+#define QIB_7322_SendCtrl_AvailUpdThld_LSB 0x18
+#define QIB_7322_SendCtrl_AvailUpdThld_MSB 0x1C
+#define QIB_7322_SendCtrl_AvailUpdThld_RMASK 0x1F
+#define QIB_7322_SendCtrl_DisarmSendBuf_LSB 0x10
+#define QIB_7322_SendCtrl_DisarmSendBuf_MSB 0x17
+#define QIB_7322_SendCtrl_DisarmSendBuf_RMASK 0xFF
+#define QIB_7322_SendCtrl_SpecialTriggerEn_LSB 0x4
+#define QIB_7322_SendCtrl_SpecialTriggerEn_MSB 0x4
+#define QIB_7322_SendCtrl_SpecialTriggerEn_RMASK 0x1
+#define QIB_7322_SendCtrl_SendBufAvailUpd_LSB 0x2
+#define QIB_7322_SendCtrl_SendBufAvailUpd_MSB 0x2
+#define QIB_7322_SendCtrl_SendBufAvailUpd_RMASK 0x1
+#define QIB_7322_SendCtrl_SendIntBufAvail_LSB 0x1
+#define QIB_7322_SendCtrl_SendIntBufAvail_MSB 0x1
+#define QIB_7322_SendCtrl_SendIntBufAvail_RMASK 0x1
+
+#define QIB_7322_SendBufBase_OFFS 0x1C8
+#define QIB_7322_SendBufBase_DEF 0x0018000000100000
+#define QIB_7322_SendBufBase_BaseAddr_LargePIO_LSB 0x20
+#define QIB_7322_SendBufBase_BaseAddr_LargePIO_MSB 0x34
+#define QIB_7322_SendBufBase_BaseAddr_LargePIO_RMASK 0x1FFFFF
+#define QIB_7322_SendBufBase_BaseAddr_SmallPIO_LSB 0x0
+#define QIB_7322_SendBufBase_BaseAddr_SmallPIO_MSB 0x14
+#define QIB_7322_SendBufBase_BaseAddr_SmallPIO_RMASK 0x1FFFFF
+
+#define QIB_7322_SendBufSize_OFFS 0x1D0
+#define QIB_7322_SendBufSize_DEF 0x0000108000000880
+#define QIB_7322_SendBufSize_Size_LargePIO_LSB 0x20
+#define QIB_7322_SendBufSize_Size_LargePIO_MSB 0x2C
+#define QIB_7322_SendBufSize_Size_LargePIO_RMASK 0x1FFF
+#define QIB_7322_SendBufSize_Size_SmallPIO_LSB 0x0
+#define QIB_7322_SendBufSize_Size_SmallPIO_MSB 0xB
+#define QIB_7322_SendBufSize_Size_SmallPIO_RMASK 0xFFF
+
+#define QIB_7322_SendBufCnt_OFFS 0x1D8
+#define QIB_7322_SendBufCnt_DEF 0x0000002000000080
+#define QIB_7322_SendBufCnt_Num_LargeBuffers_LSB 0x20
+#define QIB_7322_SendBufCnt_Num_LargeBuffers_MSB 0x25
+#define QIB_7322_SendBufCnt_Num_LargeBuffers_RMASK 0x3F
+#define QIB_7322_SendBufCnt_Num_SmallBuffers_LSB 0x0
+#define QIB_7322_SendBufCnt_Num_SmallBuffers_MSB 0x8
+#define QIB_7322_SendBufCnt_Num_SmallBuffers_RMASK 0x1FF
+
+#define QIB_7322_SendBufAvailAddr_OFFS 0x1E0
+#define QIB_7322_SendBufAvailAddr_DEF 0x0000000000000000
+#define QIB_7322_SendBufAvailAddr_SendBufAvailAddr_LSB 0x6
+#define QIB_7322_SendBufAvailAddr_SendBufAvailAddr_MSB 0x27
+#define QIB_7322_SendBufAvailAddr_SendBufAvailAddr_RMASK 0x3FFFFFFFF
+
+#define QIB_7322_SendBufErr0_OFFS 0x240
+#define QIB_7322_SendBufErr0_DEF 0x0000000000000000
+#define QIB_7322_SendBufErr0_SendBufErr_63_0_LSB 0x0
+#define QIB_7322_SendBufErr0_SendBufErr_63_0_MSB 0x3F
+#define QIB_7322_SendBufErr0_SendBufErr_63_0_RMASK 0x0
+
+#define QIB_7322_AvailUpdCount_OFFS 0x268
+#define QIB_7322_AvailUpdCount_DEF 0x0000000000000000
+#define QIB_7322_AvailUpdCount_AvailUpdCount_LSB 0x0
+#define QIB_7322_AvailUpdCount_AvailUpdCount_MSB 0x4
+#define QIB_7322_AvailUpdCount_AvailUpdCount_RMASK 0x1F
+
+#define QIB_7322_RcvHdrAddr0_OFFS 0x280
+#define QIB_7322_RcvHdrAddr0_DEF 0x0000000000000000
+#define QIB_7322_RcvHdrAddr0_RcvHdrAddr_LSB 0x2
+#define QIB_7322_RcvHdrAddr0_RcvHdrAddr_MSB 0x27
+#define QIB_7322_RcvHdrAddr0_RcvHdrAddr_RMASK 0x3FFFFFFFFF
+
+#define QIB_7322_RcvHdrTailAddr0_OFFS 0x340
+#define QIB_7322_RcvHdrTailAddr0_DEF 0x0000000000000000
+#define QIB_7322_RcvHdrTailAddr0_RcvHdrTailAddr_LSB 0x2
+#define QIB_7322_RcvHdrTailAddr0_RcvHdrTailAddr_MSB 0x27
+#define QIB_7322_RcvHdrTailAddr0_RcvHdrTailAddr_RMASK 0x3FFFFFFFFF
+
+#define QIB_7322_ahb_access_ctrl_OFFS 0x460
+#define QIB_7322_ahb_access_ctrl_DEF 0x0000000000000000
+#define QIB_7322_ahb_access_ctrl_sw_sel_ahb_trgt_LSB 0x1
+#define QIB_7322_ahb_access_ctrl_sw_sel_ahb_trgt_MSB 0x2
+#define QIB_7322_ahb_access_ctrl_sw_sel_ahb_trgt_RMASK 0x3
+#define QIB_7322_ahb_access_ctrl_sw_ahb_sel_LSB 0x0
+#define QIB_7322_ahb_access_ctrl_sw_ahb_sel_MSB 0x0
+#define QIB_7322_ahb_access_ctrl_sw_ahb_sel_RMASK 0x1
+
+#define QIB_7322_ahb_transaction_reg_OFFS 0x468
+#define QIB_7322_ahb_transaction_reg_DEF 0x0000000080000000
+#define QIB_7322_ahb_transaction_reg_ahb_data_LSB 0x20
+#define QIB_7322_ahb_transaction_reg_ahb_data_MSB 0x3F
+#define QIB_7322_ahb_transaction_reg_ahb_data_RMASK 0xFFFFFFFF
+#define QIB_7322_ahb_transaction_reg_ahb_rdy_LSB 0x1F
+#define QIB_7322_ahb_transaction_reg_ahb_rdy_MSB 0x1F
+#define QIB_7322_ahb_transaction_reg_ahb_rdy_RMASK 0x1
+#define QIB_7322_ahb_transaction_reg_ahb_req_err_LSB 0x1E
+#define QIB_7322_ahb_transaction_reg_ahb_req_err_MSB 0x1E
+#define QIB_7322_ahb_transaction_reg_ahb_req_err_RMASK 0x1
+#define QIB_7322_ahb_transaction_reg_write_not_read_LSB 0x1B
+#define QIB_7322_ahb_transaction_reg_write_not_read_MSB 0x1B
+#define QIB_7322_ahb_transaction_reg_write_not_read_RMASK 0x1
+#define QIB_7322_ahb_transaction_reg_ahb_address_LSB 0x10
+#define QIB_7322_ahb_transaction_reg_ahb_address_MSB 0x1A
+#define QIB_7322_ahb_transaction_reg_ahb_address_RMASK 0x7FF
+
+#define QIB_7322_SPC_JTAG_ACCESS_REG_OFFS 0x470
+#define QIB_7322_SPC_JTAG_ACCESS_REG_DEF 0x0000000000000001
+#define QIB_7322_SPC_JTAG_ACCESS_REG_SPC_JTAG_ACCESS_EN_LSB 0xA
+#define QIB_7322_SPC_JTAG_ACCESS_REG_SPC_JTAG_ACCESS_EN_MSB 0xA
+#define QIB_7322_SPC_JTAG_ACCESS_REG_SPC_JTAG_ACCESS_EN_RMASK 0x1
+#define QIB_7322_SPC_JTAG_ACCESS_REG_bist_en_LSB 0x5
+#define QIB_7322_SPC_JTAG_ACCESS_REG_bist_en_MSB 0x9
+#define QIB_7322_SPC_JTAG_ACCESS_REG_bist_en_RMASK 0x1F
+#define QIB_7322_SPC_JTAG_ACCESS_REG_opcode_LSB 0x3
+#define QIB_7322_SPC_JTAG_ACCESS_REG_opcode_MSB 0x4
+#define QIB_7322_SPC_JTAG_ACCESS_REG_opcode_RMASK 0x3
+#define QIB_7322_SPC_JTAG_ACCESS_REG_tdi_LSB 0x2
+#define QIB_7322_SPC_JTAG_ACCESS_REG_tdi_MSB 0x2
+#define QIB_7322_SPC_JTAG_ACCESS_REG_tdi_RMASK 0x1
+#define QIB_7322_SPC_JTAG_ACCESS_REG_tdo_LSB 0x1
+#define QIB_7322_SPC_JTAG_ACCESS_REG_tdo_MSB 0x1
+#define QIB_7322_SPC_JTAG_ACCESS_REG_tdo_RMASK 0x1
+#define QIB_7322_SPC_JTAG_ACCESS_REG_rdy_LSB 0x0
+#define QIB_7322_SPC_JTAG_ACCESS_REG_rdy_MSB 0x0
+#define QIB_7322_SPC_JTAG_ACCESS_REG_rdy_RMASK 0x1
+
+#define QIB_7322_SendCheckMask0_OFFS 0x4C0
+#define QIB_7322_SendCheckMask0_DEF 0x0000000000000000
+#define QIB_7322_SendCheckMask0_SendCheckMask_63_32_LSB 0x0
+#define QIB_7322_SendCheckMask0_SendCheckMask_63_32_MSB 0x3F
+#define QIB_7322_SendCheckMask0_SendCheckMask_63_32_RMASK 0x0
+
+#define QIB_7322_SendGRHCheckMask0_OFFS 0x4E0
+#define QIB_7322_SendGRHCheckMask0_DEF 0x0000000000000000
+#define QIB_7322_SendGRHCheckMask0_SendGRHCheckMask_63_32_LSB 0x0
+#define QIB_7322_SendGRHCheckMask0_SendGRHCheckMask_63_32_MSB 0x3F
+#define QIB_7322_SendGRHCheckMask0_SendGRHCheckMask_63_32_RMASK 0x0
+
+#define QIB_7322_SendIBPacketMask0_OFFS 0x500
+#define QIB_7322_SendIBPacketMask0_DEF 0x0000000000000000
+#define QIB_7322_SendIBPacketMask0_SendIBPacketMask_63_32_LSB 0x0
+#define QIB_7322_SendIBPacketMask0_SendIBPacketMask_63_32_MSB 0x3F
+#define QIB_7322_SendIBPacketMask0_SendIBPacketMask_63_32_RMASK 0x0
+
+#define QIB_7322_IntRedirect0_OFFS 0x540
+#define QIB_7322_IntRedirect0_DEF 0x0000000000000000
+#define QIB_7322_IntRedirect0_vec11_LSB 0x37
+#define QIB_7322_IntRedirect0_vec11_MSB 0x3B
+#define QIB_7322_IntRedirect0_vec11_RMASK 0x1F
+#define QIB_7322_IntRedirect0_vec10_LSB 0x32
+#define QIB_7322_IntRedirect0_vec10_MSB 0x36
+#define QIB_7322_IntRedirect0_vec10_RMASK 0x1F
+#define QIB_7322_IntRedirect0_vec9_LSB 0x2D
+#define QIB_7322_IntRedirect0_vec9_MSB 0x31
+#define QIB_7322_IntRedirect0_vec9_RMASK 0x1F
+#define QIB_7322_IntRedirect0_vec8_LSB 0x28
+#define QIB_7322_IntRedirect0_vec8_MSB 0x2C
+#define QIB_7322_IntRedirect0_vec8_RMASK 0x1F
+#define QIB_7322_IntRedirect0_vec7_LSB 0x23
+#define QIB_7322_IntRedirect0_vec7_MSB 0x27
+#define QIB_7322_IntRedirect0_vec7_RMASK 0x1F
+#define QIB_7322_IntRedirect0_vec6_LSB 0x1E
+#define QIB_7322_IntRedirect0_vec6_MSB 0x22
+#define QIB_7322_IntRedirect0_vec6_RMASK 0x1F
+#define QIB_7322_IntRedirect0_vec5_LSB 0x19
+#define QIB_7322_IntRedirect0_vec5_MSB 0x1D
+#define QIB_7322_IntRedirect0_vec5_RMASK 0x1F
+#define QIB_7322_IntRedirect0_vec4_LSB 0x14
+#define QIB_7322_IntRedirect0_vec4_MSB 0x18
+#define QIB_7322_IntRedirect0_vec4_RMASK 0x1F
+#define QIB_7322_IntRedirect0_vec3_LSB 0xF
+#define QIB_7322_IntRedirect0_vec3_MSB 0x13
+#define QIB_7322_IntRedirect0_vec3_RMASK 0x1F
+#define QIB_7322_IntRedirect0_vec2_LSB 0xA
+#define QIB_7322_IntRedirect0_vec2_MSB 0xE
+#define QIB_7322_IntRedirect0_vec2_RMASK 0x1F
+#define QIB_7322_IntRedirect0_vec1_LSB 0x5
+#define QIB_7322_IntRedirect0_vec1_MSB 0x9
+#define QIB_7322_IntRedirect0_vec1_RMASK 0x1F
+#define QIB_7322_IntRedirect0_vec0_LSB 0x0
+#define QIB_7322_IntRedirect0_vec0_MSB 0x4
+#define QIB_7322_IntRedirect0_vec0_RMASK 0x1F
+
+#define QIB_7322_Int_Granted_OFFS 0x570
+#define QIB_7322_Int_Granted_DEF 0x0000000000000000
+
+#define QIB_7322_vec_clr_without_int_OFFS 0x578
+#define QIB_7322_vec_clr_without_int_DEF 0x0000000000000000
+
+#define QIB_7322_DCACtrlA_OFFS 0x580
+#define QIB_7322_DCACtrlA_DEF 0x0000000000000000
+#define QIB_7322_DCACtrlA_SendDMAHead1DCAEnable_LSB 0x4
+#define QIB_7322_DCACtrlA_SendDMAHead1DCAEnable_MSB 0x4
+#define QIB_7322_DCACtrlA_SendDMAHead1DCAEnable_RMASK 0x1
+#define QIB_7322_DCACtrlA_SendDMAHead0DCAEnable_LSB 0x3
+#define QIB_7322_DCACtrlA_SendDMAHead0DCAEnable_MSB 0x3
+#define QIB_7322_DCACtrlA_SendDMAHead0DCAEnable_RMASK 0x1
+#define QIB_7322_DCACtrlA_RcvTailUpdDCAEnable_LSB 0x2
+#define QIB_7322_DCACtrlA_RcvTailUpdDCAEnable_MSB 0x2
+#define QIB_7322_DCACtrlA_RcvTailUpdDCAEnable_RMASK 0x1
+#define QIB_7322_DCACtrlA_EagerDCAEnable_LSB 0x1
+#define QIB_7322_DCACtrlA_EagerDCAEnable_MSB 0x1
+#define QIB_7322_DCACtrlA_EagerDCAEnable_RMASK 0x1
+#define QIB_7322_DCACtrlA_RcvHdrqDCAEnable_LSB 0x0
+#define QIB_7322_DCACtrlA_RcvHdrqDCAEnable_MSB 0x0
+#define QIB_7322_DCACtrlA_RcvHdrqDCAEnable_RMASK 0x1
+
+#define QIB_7322_DCACtrlB_OFFS 0x588
+#define QIB_7322_DCACtrlB_DEF 0x0000000000000000
+#define QIB_7322_DCACtrlB_RcvHdrq3DCAXfrCnt_LSB 0x36
+#define QIB_7322_DCACtrlB_RcvHdrq3DCAXfrCnt_MSB 0x3B
+#define QIB_7322_DCACtrlB_RcvHdrq3DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlB_RcvHdrq3DCAOPH_LSB 0x2E
+#define QIB_7322_DCACtrlB_RcvHdrq3DCAOPH_MSB 0x35
+#define QIB_7322_DCACtrlB_RcvHdrq3DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlB_RcvHdrq2DCAXfrCnt_LSB 0x28
+#define QIB_7322_DCACtrlB_RcvHdrq2DCAXfrCnt_MSB 0x2D
+#define QIB_7322_DCACtrlB_RcvHdrq2DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlB_RcvHdrq2DCAOPH_LSB 0x20
+#define QIB_7322_DCACtrlB_RcvHdrq2DCAOPH_MSB 0x27
+#define QIB_7322_DCACtrlB_RcvHdrq2DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlB_RcvHdrq1DCAXfrCnt_LSB 0x16
+#define QIB_7322_DCACtrlB_RcvHdrq1DCAXfrCnt_MSB 0x1B
+#define QIB_7322_DCACtrlB_RcvHdrq1DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlB_RcvHdrq1DCAOPH_LSB 0xE
+#define QIB_7322_DCACtrlB_RcvHdrq1DCAOPH_MSB 0x15
+#define QIB_7322_DCACtrlB_RcvHdrq1DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlB_RcvHdrq0DCAXfrCnt_LSB 0x8
+#define QIB_7322_DCACtrlB_RcvHdrq0DCAXfrCnt_MSB 0xD
+#define QIB_7322_DCACtrlB_RcvHdrq0DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlB_RcvHdrq0DCAOPH_LSB 0x0
+#define QIB_7322_DCACtrlB_RcvHdrq0DCAOPH_MSB 0x7
+#define QIB_7322_DCACtrlB_RcvHdrq0DCAOPH_RMASK 0xFF
+
+#define QIB_7322_DCACtrlC_OFFS 0x590
+#define QIB_7322_DCACtrlC_DEF 0x0000000000000000
+#define QIB_7322_DCACtrlC_RcvHdrq7DCAXfrCnt_LSB 0x36
+#define QIB_7322_DCACtrlC_RcvHdrq7DCAXfrCnt_MSB 0x3B
+#define QIB_7322_DCACtrlC_RcvHdrq7DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlC_RcvHdrq7DCAOPH_LSB 0x2E
+#define QIB_7322_DCACtrlC_RcvHdrq7DCAOPH_MSB 0x35
+#define QIB_7322_DCACtrlC_RcvHdrq7DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlC_RcvHdrq6DCAXfrCnt_LSB 0x28
+#define QIB_7322_DCACtrlC_RcvHdrq6DCAXfrCnt_MSB 0x2D
+#define QIB_7322_DCACtrlC_RcvHdrq6DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlC_RcvHdrq6DCAOPH_LSB 0x20
+#define QIB_7322_DCACtrlC_RcvHdrq6DCAOPH_MSB 0x27
+#define QIB_7322_DCACtrlC_RcvHdrq6DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlC_RcvHdrq5DCAXfrCnt_LSB 0x16
+#define QIB_7322_DCACtrlC_RcvHdrq5DCAXfrCnt_MSB 0x1B
+#define QIB_7322_DCACtrlC_RcvHdrq5DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlC_RcvHdrq5DCAOPH_LSB 0xE
+#define QIB_7322_DCACtrlC_RcvHdrq5DCAOPH_MSB 0x15
+#define QIB_7322_DCACtrlC_RcvHdrq5DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlC_RcvHdrq4DCAXfrCnt_LSB 0x8
+#define QIB_7322_DCACtrlC_RcvHdrq4DCAXfrCnt_MSB 0xD
+#define QIB_7322_DCACtrlC_RcvHdrq4DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlC_RcvHdrq4DCAOPH_LSB 0x0
+#define QIB_7322_DCACtrlC_RcvHdrq4DCAOPH_MSB 0x7
+#define QIB_7322_DCACtrlC_RcvHdrq4DCAOPH_RMASK 0xFF
+
+#define QIB_7322_DCACtrlD_OFFS 0x598
+#define QIB_7322_DCACtrlD_DEF 0x0000000000000000
+#define QIB_7322_DCACtrlD_RcvHdrq11DCAXfrCnt_LSB 0x36
+#define QIB_7322_DCACtrlD_RcvHdrq11DCAXfrCnt_MSB 0x3B
+#define QIB_7322_DCACtrlD_RcvHdrq11DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlD_RcvHdrq11DCAOPH_LSB 0x2E
+#define QIB_7322_DCACtrlD_RcvHdrq11DCAOPH_MSB 0x35
+#define QIB_7322_DCACtrlD_RcvHdrq11DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlD_RcvHdrq10DCAXfrCnt_LSB 0x28
+#define QIB_7322_DCACtrlD_RcvHdrq10DCAXfrCnt_MSB 0x2D
+#define QIB_7322_DCACtrlD_RcvHdrq10DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlD_RcvHdrq10DCAOPH_LSB 0x20
+#define QIB_7322_DCACtrlD_RcvHdrq10DCAOPH_MSB 0x27
+#define QIB_7322_DCACtrlD_RcvHdrq10DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlD_RcvHdrq9DCAXfrCnt_LSB 0x16
+#define QIB_7322_DCACtrlD_RcvHdrq9DCAXfrCnt_MSB 0x1B
+#define QIB_7322_DCACtrlD_RcvHdrq9DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlD_RcvHdrq9DCAOPH_LSB 0xE
+#define QIB_7322_DCACtrlD_RcvHdrq9DCAOPH_MSB 0x15
+#define QIB_7322_DCACtrlD_RcvHdrq9DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlD_RcvHdrq8DCAXfrCnt_LSB 0x8
+#define QIB_7322_DCACtrlD_RcvHdrq8DCAXfrCnt_MSB 0xD
+#define QIB_7322_DCACtrlD_RcvHdrq8DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlD_RcvHdrq8DCAOPH_LSB 0x0
+#define QIB_7322_DCACtrlD_RcvHdrq8DCAOPH_MSB 0x7
+#define QIB_7322_DCACtrlD_RcvHdrq8DCAOPH_RMASK 0xFF
+
+#define QIB_7322_DCACtrlE_OFFS 0x5A0
+#define QIB_7322_DCACtrlE_DEF 0x0000000000000000
+#define QIB_7322_DCACtrlE_RcvHdrq15DCAXfrCnt_LSB 0x36
+#define QIB_7322_DCACtrlE_RcvHdrq15DCAXfrCnt_MSB 0x3B
+#define QIB_7322_DCACtrlE_RcvHdrq15DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlE_RcvHdrq15DCAOPH_LSB 0x2E
+#define QIB_7322_DCACtrlE_RcvHdrq15DCAOPH_MSB 0x35
+#define QIB_7322_DCACtrlE_RcvHdrq15DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlE_RcvHdrq14DCAXfrCnt_LSB 0x28
+#define QIB_7322_DCACtrlE_RcvHdrq14DCAXfrCnt_MSB 0x2D
+#define QIB_7322_DCACtrlE_RcvHdrq14DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlE_RcvHdrq14DCAOPH_LSB 0x20
+#define QIB_7322_DCACtrlE_RcvHdrq14DCAOPH_MSB 0x27
+#define QIB_7322_DCACtrlE_RcvHdrq14DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlE_RcvHdrq13DCAXfrCnt_LSB 0x16
+#define QIB_7322_DCACtrlE_RcvHdrq13DCAXfrCnt_MSB 0x1B
+#define QIB_7322_DCACtrlE_RcvHdrq13DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlE_RcvHdrq13DCAOPH_LSB 0xE
+#define QIB_7322_DCACtrlE_RcvHdrq13DCAOPH_MSB 0x15
+#define QIB_7322_DCACtrlE_RcvHdrq13DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlE_RcvHdrq12DCAXfrCnt_LSB 0x8
+#define QIB_7322_DCACtrlE_RcvHdrq12DCAXfrCnt_MSB 0xD
+#define QIB_7322_DCACtrlE_RcvHdrq12DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlE_RcvHdrq12DCAOPH_LSB 0x0
+#define QIB_7322_DCACtrlE_RcvHdrq12DCAOPH_MSB 0x7
+#define QIB_7322_DCACtrlE_RcvHdrq12DCAOPH_RMASK 0xFF
+
+#define QIB_7322_DCACtrlF_OFFS 0x5A8
+#define QIB_7322_DCACtrlF_DEF 0x0000000000000000
+#define QIB_7322_DCACtrlF_SendDma1DCAOPH_LSB 0x28
+#define QIB_7322_DCACtrlF_SendDma1DCAOPH_MSB 0x2F
+#define QIB_7322_DCACtrlF_SendDma1DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlF_SendDma0DCAOPH_LSB 0x20
+#define QIB_7322_DCACtrlF_SendDma0DCAOPH_MSB 0x27
+#define QIB_7322_DCACtrlF_SendDma0DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlF_RcvHdrq17DCAXfrCnt_LSB 0x16
+#define QIB_7322_DCACtrlF_RcvHdrq17DCAXfrCnt_MSB 0x1B
+#define QIB_7322_DCACtrlF_RcvHdrq17DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlF_RcvHdrq17DCAOPH_LSB 0xE
+#define QIB_7322_DCACtrlF_RcvHdrq17DCAOPH_MSB 0x15
+#define QIB_7322_DCACtrlF_RcvHdrq17DCAOPH_RMASK 0xFF
+#define QIB_7322_DCACtrlF_RcvHdrq16DCAXfrCnt_LSB 0x8
+#define QIB_7322_DCACtrlF_RcvHdrq16DCAXfrCnt_MSB 0xD
+#define QIB_7322_DCACtrlF_RcvHdrq16DCAXfrCnt_RMASK 0x3F
+#define QIB_7322_DCACtrlF_RcvHdrq16DCAOPH_LSB 0x0
+#define QIB_7322_DCACtrlF_RcvHdrq16DCAOPH_MSB 0x7
+#define QIB_7322_DCACtrlF_RcvHdrq16DCAOPH_RMASK 0xFF
+
+#define QIB_7322_RcvAvailTimeOut0_OFFS 0xC00
+#define QIB_7322_RcvAvailTimeOut0_DEF 0x0000000000000000
+#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOCount_LSB 0x10
+#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOCount_MSB 0x1F
+#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOCount_RMASK 0xFFFF
+#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOReload_LSB 0x0
+#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOReload_MSB 0xF
+#define QIB_7322_RcvAvailTimeOut0_RcvAvailTOReload_RMASK 0xFFFF
+
+#define QIB_7322_CntrRegBase_0_OFFS 0x1028
+#define QIB_7322_CntrRegBase_0_DEF 0x0000000000012000
+
+#define QIB_7322_ErrMask_0_OFFS 0x1080
+#define QIB_7322_ErrMask_0_DEF 0x0000000000000000
+#define QIB_7322_ErrMask_0_IBStatusChangedMask_LSB 0x3A
+#define QIB_7322_ErrMask_0_IBStatusChangedMask_MSB 0x3A
+#define QIB_7322_ErrMask_0_IBStatusChangedMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SHeadersErrMask_LSB 0x39
+#define QIB_7322_ErrMask_0_SHeadersErrMask_MSB 0x39
+#define QIB_7322_ErrMask_0_SHeadersErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_VL15BufMisuseErrMask_LSB 0x36
+#define QIB_7322_ErrMask_0_VL15BufMisuseErrMask_MSB 0x36
+#define QIB_7322_ErrMask_0_VL15BufMisuseErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SDmaHaltErrMask_LSB 0x31
+#define QIB_7322_ErrMask_0_SDmaHaltErrMask_MSB 0x31
+#define QIB_7322_ErrMask_0_SDmaHaltErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SDmaDescAddrMisalignErrMask_LSB 0x30
+#define QIB_7322_ErrMask_0_SDmaDescAddrMisalignErrMask_MSB 0x30
+#define QIB_7322_ErrMask_0_SDmaDescAddrMisalignErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SDmaUnexpDataErrMask_LSB 0x2F
+#define QIB_7322_ErrMask_0_SDmaUnexpDataErrMask_MSB 0x2F
+#define QIB_7322_ErrMask_0_SDmaUnexpDataErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SDmaMissingDwErrMask_LSB 0x2E
+#define QIB_7322_ErrMask_0_SDmaMissingDwErrMask_MSB 0x2E
+#define QIB_7322_ErrMask_0_SDmaMissingDwErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SDmaDwEnErrMask_LSB 0x2D
+#define QIB_7322_ErrMask_0_SDmaDwEnErrMask_MSB 0x2D
+#define QIB_7322_ErrMask_0_SDmaDwEnErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SDmaRpyTagErrMask_LSB 0x2C
+#define QIB_7322_ErrMask_0_SDmaRpyTagErrMask_MSB 0x2C
+#define QIB_7322_ErrMask_0_SDmaRpyTagErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SDma1stDescErrMask_LSB 0x2B
+#define QIB_7322_ErrMask_0_SDma1stDescErrMask_MSB 0x2B
+#define QIB_7322_ErrMask_0_SDma1stDescErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SDmaBaseErrMask_LSB 0x2A
+#define QIB_7322_ErrMask_0_SDmaBaseErrMask_MSB 0x2A
+#define QIB_7322_ErrMask_0_SDmaBaseErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SDmaTailOutOfBoundErrMask_LSB 0x29
+#define QIB_7322_ErrMask_0_SDmaTailOutOfBoundErrMask_MSB 0x29
+#define QIB_7322_ErrMask_0_SDmaTailOutOfBoundErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SDmaOutOfBoundErrMask_LSB 0x28
+#define QIB_7322_ErrMask_0_SDmaOutOfBoundErrMask_MSB 0x28
+#define QIB_7322_ErrMask_0_SDmaOutOfBoundErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SDmaGenMismatchErrMask_LSB 0x27
+#define QIB_7322_ErrMask_0_SDmaGenMismatchErrMask_MSB 0x27
+#define QIB_7322_ErrMask_0_SDmaGenMismatchErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SendBufMisuseErrMask_LSB 0x26
+#define QIB_7322_ErrMask_0_SendBufMisuseErrMask_MSB 0x26
+#define QIB_7322_ErrMask_0_SendBufMisuseErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SendUnsupportedVLErrMask_LSB 0x25
+#define QIB_7322_ErrMask_0_SendUnsupportedVLErrMask_MSB 0x25
+#define QIB_7322_ErrMask_0_SendUnsupportedVLErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SendUnexpectedPktNumErrMask_LSB 0x24
+#define QIB_7322_ErrMask_0_SendUnexpectedPktNumErrMask_MSB 0x24
+#define QIB_7322_ErrMask_0_SendUnexpectedPktNumErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SendDroppedDataPktErrMask_LSB 0x22
+#define QIB_7322_ErrMask_0_SendDroppedDataPktErrMask_MSB 0x22
+#define QIB_7322_ErrMask_0_SendDroppedDataPktErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SendDroppedSmpPktErrMask_LSB 0x21
+#define QIB_7322_ErrMask_0_SendDroppedSmpPktErrMask_MSB 0x21
+#define QIB_7322_ErrMask_0_SendDroppedSmpPktErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SendPktLenErrMask_LSB 0x20
+#define QIB_7322_ErrMask_0_SendPktLenErrMask_MSB 0x20
+#define QIB_7322_ErrMask_0_SendPktLenErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SendUnderRunErrMask_LSB 0x1F
+#define QIB_7322_ErrMask_0_SendUnderRunErrMask_MSB 0x1F
+#define QIB_7322_ErrMask_0_SendUnderRunErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SendMaxPktLenErrMask_LSB 0x1E
+#define QIB_7322_ErrMask_0_SendMaxPktLenErrMask_MSB 0x1E
+#define QIB_7322_ErrMask_0_SendMaxPktLenErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_SendMinPktLenErrMask_LSB 0x1D
+#define QIB_7322_ErrMask_0_SendMinPktLenErrMask_MSB 0x1D
+#define QIB_7322_ErrMask_0_SendMinPktLenErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvIBLostLinkErrMask_LSB 0x11
+#define QIB_7322_ErrMask_0_RcvIBLostLinkErrMask_MSB 0x11
+#define QIB_7322_ErrMask_0_RcvIBLostLinkErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvHdrErrMask_LSB 0x10
+#define QIB_7322_ErrMask_0_RcvHdrErrMask_MSB 0x10
+#define QIB_7322_ErrMask_0_RcvHdrErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvHdrLenErrMask_LSB 0xF
+#define QIB_7322_ErrMask_0_RcvHdrLenErrMask_MSB 0xF
+#define QIB_7322_ErrMask_0_RcvHdrLenErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvBadTidErrMask_LSB 0xE
+#define QIB_7322_ErrMask_0_RcvBadTidErrMask_MSB 0xE
+#define QIB_7322_ErrMask_0_RcvBadTidErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvBadVersionErrMask_LSB 0xB
+#define QIB_7322_ErrMask_0_RcvBadVersionErrMask_MSB 0xB
+#define QIB_7322_ErrMask_0_RcvBadVersionErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvIBFlowErrMask_LSB 0xA
+#define QIB_7322_ErrMask_0_RcvIBFlowErrMask_MSB 0xA
+#define QIB_7322_ErrMask_0_RcvIBFlowErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvEBPErrMask_LSB 0x9
+#define QIB_7322_ErrMask_0_RcvEBPErrMask_MSB 0x9
+#define QIB_7322_ErrMask_0_RcvEBPErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvUnsupportedVLErrMask_LSB 0x8
+#define QIB_7322_ErrMask_0_RcvUnsupportedVLErrMask_MSB 0x8
+#define QIB_7322_ErrMask_0_RcvUnsupportedVLErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvUnexpectedCharErrMask_LSB 0x7
+#define QIB_7322_ErrMask_0_RcvUnexpectedCharErrMask_MSB 0x7
+#define QIB_7322_ErrMask_0_RcvUnexpectedCharErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvShortPktLenErrMask_LSB 0x6
+#define QIB_7322_ErrMask_0_RcvShortPktLenErrMask_MSB 0x6
+#define QIB_7322_ErrMask_0_RcvShortPktLenErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvLongPktLenErrMask_LSB 0x5
+#define QIB_7322_ErrMask_0_RcvLongPktLenErrMask_MSB 0x5
+#define QIB_7322_ErrMask_0_RcvLongPktLenErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvMaxPktLenErrMask_LSB 0x4
+#define QIB_7322_ErrMask_0_RcvMaxPktLenErrMask_MSB 0x4
+#define QIB_7322_ErrMask_0_RcvMaxPktLenErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvMinPktLenErrMask_LSB 0x3
+#define QIB_7322_ErrMask_0_RcvMinPktLenErrMask_MSB 0x3
+#define QIB_7322_ErrMask_0_RcvMinPktLenErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvICRCErrMask_LSB 0x2
+#define QIB_7322_ErrMask_0_RcvICRCErrMask_MSB 0x2
+#define QIB_7322_ErrMask_0_RcvICRCErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvVCRCErrMask_LSB 0x1
+#define QIB_7322_ErrMask_0_RcvVCRCErrMask_MSB 0x1
+#define QIB_7322_ErrMask_0_RcvVCRCErrMask_RMASK 0x1
+#define QIB_7322_ErrMask_0_RcvFormatErrMask_LSB 0x0
+#define QIB_7322_ErrMask_0_RcvFormatErrMask_MSB 0x0
+#define QIB_7322_ErrMask_0_RcvFormatErrMask_RMASK 0x1
+
+#define QIB_7322_ErrStatus_0_OFFS 0x1088
+#define QIB_7322_ErrStatus_0_DEF 0x0000000000000000
+#define QIB_7322_ErrStatus_0_IBStatusChanged_LSB 0x3A
+#define QIB_7322_ErrStatus_0_IBStatusChanged_MSB 0x3A
+#define QIB_7322_ErrStatus_0_IBStatusChanged_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SHeadersErr_LSB 0x39
+#define QIB_7322_ErrStatus_0_SHeadersErr_MSB 0x39
+#define QIB_7322_ErrStatus_0_SHeadersErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_VL15BufMisuseErr_LSB 0x36
+#define QIB_7322_ErrStatus_0_VL15BufMisuseErr_MSB 0x36
+#define QIB_7322_ErrStatus_0_VL15BufMisuseErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SDmaHaltErr_LSB 0x31
+#define QIB_7322_ErrStatus_0_SDmaHaltErr_MSB 0x31
+#define QIB_7322_ErrStatus_0_SDmaHaltErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SDmaDescAddrMisalignErr_LSB 0x30
+#define QIB_7322_ErrStatus_0_SDmaDescAddrMisalignErr_MSB 0x30
+#define QIB_7322_ErrStatus_0_SDmaDescAddrMisalignErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SDmaUnexpDataErr_LSB 0x2F
+#define QIB_7322_ErrStatus_0_SDmaUnexpDataErr_MSB 0x2F
+#define QIB_7322_ErrStatus_0_SDmaUnexpDataErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SDmaMissingDwErr_LSB 0x2E
+#define QIB_7322_ErrStatus_0_SDmaMissingDwErr_MSB 0x2E
+#define QIB_7322_ErrStatus_0_SDmaMissingDwErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SDmaDwEnErr_LSB 0x2D
+#define QIB_7322_ErrStatus_0_SDmaDwEnErr_MSB 0x2D
+#define QIB_7322_ErrStatus_0_SDmaDwEnErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SDmaRpyTagErr_LSB 0x2C
+#define QIB_7322_ErrStatus_0_SDmaRpyTagErr_MSB 0x2C
+#define QIB_7322_ErrStatus_0_SDmaRpyTagErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SDma1stDescErr_LSB 0x2B
+#define QIB_7322_ErrStatus_0_SDma1stDescErr_MSB 0x2B
+#define QIB_7322_ErrStatus_0_SDma1stDescErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SDmaBaseErr_LSB 0x2A
+#define QIB_7322_ErrStatus_0_SDmaBaseErr_MSB 0x2A
+#define QIB_7322_ErrStatus_0_SDmaBaseErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SDmaTailOutOfBoundErr_LSB 0x29
+#define QIB_7322_ErrStatus_0_SDmaTailOutOfBoundErr_MSB 0x29
+#define QIB_7322_ErrStatus_0_SDmaTailOutOfBoundErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SDmaOutOfBoundErr_LSB 0x28
+#define QIB_7322_ErrStatus_0_SDmaOutOfBoundErr_MSB 0x28
+#define QIB_7322_ErrStatus_0_SDmaOutOfBoundErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SDmaGenMismatchErr_LSB 0x27
+#define QIB_7322_ErrStatus_0_SDmaGenMismatchErr_MSB 0x27
+#define QIB_7322_ErrStatus_0_SDmaGenMismatchErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SendBufMisuseErr_LSB 0x26
+#define QIB_7322_ErrStatus_0_SendBufMisuseErr_MSB 0x26
+#define QIB_7322_ErrStatus_0_SendBufMisuseErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SendUnsupportedVLErr_LSB 0x25
+#define QIB_7322_ErrStatus_0_SendUnsupportedVLErr_MSB 0x25
+#define QIB_7322_ErrStatus_0_SendUnsupportedVLErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SendUnexpectedPktNumErr_LSB 0x24
+#define QIB_7322_ErrStatus_0_SendUnexpectedPktNumErr_MSB 0x24
+#define QIB_7322_ErrStatus_0_SendUnexpectedPktNumErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SendDroppedDataPktErr_LSB 0x22
+#define QIB_7322_ErrStatus_0_SendDroppedDataPktErr_MSB 0x22
+#define QIB_7322_ErrStatus_0_SendDroppedDataPktErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SendDroppedSmpPktErr_LSB 0x21
+#define QIB_7322_ErrStatus_0_SendDroppedSmpPktErr_MSB 0x21
+#define QIB_7322_ErrStatus_0_SendDroppedSmpPktErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SendPktLenErr_LSB 0x20
+#define QIB_7322_ErrStatus_0_SendPktLenErr_MSB 0x20
+#define QIB_7322_ErrStatus_0_SendPktLenErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SendUnderRunErr_LSB 0x1F
+#define QIB_7322_ErrStatus_0_SendUnderRunErr_MSB 0x1F
+#define QIB_7322_ErrStatus_0_SendUnderRunErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SendMaxPktLenErr_LSB 0x1E
+#define QIB_7322_ErrStatus_0_SendMaxPktLenErr_MSB 0x1E
+#define QIB_7322_ErrStatus_0_SendMaxPktLenErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_SendMinPktLenErr_LSB 0x1D
+#define QIB_7322_ErrStatus_0_SendMinPktLenErr_MSB 0x1D
+#define QIB_7322_ErrStatus_0_SendMinPktLenErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvIBLostLinkErr_LSB 0x11
+#define QIB_7322_ErrStatus_0_RcvIBLostLinkErr_MSB 0x11
+#define QIB_7322_ErrStatus_0_RcvIBLostLinkErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvHdrErr_LSB 0x10
+#define QIB_7322_ErrStatus_0_RcvHdrErr_MSB 0x10
+#define QIB_7322_ErrStatus_0_RcvHdrErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvHdrLenErr_LSB 0xF
+#define QIB_7322_ErrStatus_0_RcvHdrLenErr_MSB 0xF
+#define QIB_7322_ErrStatus_0_RcvHdrLenErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvBadTidErr_LSB 0xE
+#define QIB_7322_ErrStatus_0_RcvBadTidErr_MSB 0xE
+#define QIB_7322_ErrStatus_0_RcvBadTidErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvBadVersionErr_LSB 0xB
+#define QIB_7322_ErrStatus_0_RcvBadVersionErr_MSB 0xB
+#define QIB_7322_ErrStatus_0_RcvBadVersionErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvIBFlowErr_LSB 0xA
+#define QIB_7322_ErrStatus_0_RcvIBFlowErr_MSB 0xA
+#define QIB_7322_ErrStatus_0_RcvIBFlowErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvEBPErr_LSB 0x9
+#define QIB_7322_ErrStatus_0_RcvEBPErr_MSB 0x9
+#define QIB_7322_ErrStatus_0_RcvEBPErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvUnsupportedVLErr_LSB 0x8
+#define QIB_7322_ErrStatus_0_RcvUnsupportedVLErr_MSB 0x8
+#define QIB_7322_ErrStatus_0_RcvUnsupportedVLErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvUnexpectedCharErr_LSB 0x7
+#define QIB_7322_ErrStatus_0_RcvUnexpectedCharErr_MSB 0x7
+#define QIB_7322_ErrStatus_0_RcvUnexpectedCharErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvShortPktLenErr_LSB 0x6
+#define QIB_7322_ErrStatus_0_RcvShortPktLenErr_MSB 0x6
+#define QIB_7322_ErrStatus_0_RcvShortPktLenErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvLongPktLenErr_LSB 0x5
+#define QIB_7322_ErrStatus_0_RcvLongPktLenErr_MSB 0x5
+#define QIB_7322_ErrStatus_0_RcvLongPktLenErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvMaxPktLenErr_LSB 0x4
+#define QIB_7322_ErrStatus_0_RcvMaxPktLenErr_MSB 0x4
+#define QIB_7322_ErrStatus_0_RcvMaxPktLenErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvMinPktLenErr_LSB 0x3
+#define QIB_7322_ErrStatus_0_RcvMinPktLenErr_MSB 0x3
+#define QIB_7322_ErrStatus_0_RcvMinPktLenErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvICRCErr_LSB 0x2
+#define QIB_7322_ErrStatus_0_RcvICRCErr_MSB 0x2
+#define QIB_7322_ErrStatus_0_RcvICRCErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvVCRCErr_LSB 0x1
+#define QIB_7322_ErrStatus_0_RcvVCRCErr_MSB 0x1
+#define QIB_7322_ErrStatus_0_RcvVCRCErr_RMASK 0x1
+#define QIB_7322_ErrStatus_0_RcvFormatErr_LSB 0x0
+#define QIB_7322_ErrStatus_0_RcvFormatErr_MSB 0x0
+#define QIB_7322_ErrStatus_0_RcvFormatErr_RMASK 0x1
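The ErrStatus_0 fields above are single-bit flags: each is located by its _LSB/_MSB pair and bounded by its _RMASK. As a minimal sketch of how such a pair is meant to be consumed (the helper name and the errstatus variable below are illustrative, not part of the imported header), a caller shifts the 64-bit register value down by the _LSB and masks with the _RMASK:

/* Illustrative only: test the RcvICRCErr flag in a previously read
 * ErrStatus_0 value ("errstatus" stands for whatever 64-bit value was
 * read from the register at QIB_7322_ErrStatus_0_OFFS). */
static inline int qib_7322_rcv_icrc_err(unsigned long long errstatus)
{
	return (errstatus >> QIB_7322_ErrStatus_0_RcvICRCErr_LSB) &
		QIB_7322_ErrStatus_0_RcvICRCErr_RMASK;
}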
+
+#define QIB_7322_ErrClear_0_OFFS 0x1090
+#define QIB_7322_ErrClear_0_DEF 0x0000000000000000
+#define QIB_7322_ErrClear_0_IBStatusChangedClear_LSB 0x3A
+#define QIB_7322_ErrClear_0_IBStatusChangedClear_MSB 0x3A
+#define QIB_7322_ErrClear_0_IBStatusChangedClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SHeadersErrClear_LSB 0x39
+#define QIB_7322_ErrClear_0_SHeadersErrClear_MSB 0x39
+#define QIB_7322_ErrClear_0_SHeadersErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_VL15BufMisuseErrClear_LSB 0x36
+#define QIB_7322_ErrClear_0_VL15BufMisuseErrClear_MSB 0x36
+#define QIB_7322_ErrClear_0_VL15BufMisuseErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SDmaHaltErrClear_LSB 0x31
+#define QIB_7322_ErrClear_0_SDmaHaltErrClear_MSB 0x31
+#define QIB_7322_ErrClear_0_SDmaHaltErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SDmaDescAddrMisalignErrClear_LSB 0x30
+#define QIB_7322_ErrClear_0_SDmaDescAddrMisalignErrClear_MSB 0x30
+#define QIB_7322_ErrClear_0_SDmaDescAddrMisalignErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SDmaUnexpDataErrClear_LSB 0x2F
+#define QIB_7322_ErrClear_0_SDmaUnexpDataErrClear_MSB 0x2F
+#define QIB_7322_ErrClear_0_SDmaUnexpDataErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SDmaMissingDwErrClear_LSB 0x2E
+#define QIB_7322_ErrClear_0_SDmaMissingDwErrClear_MSB 0x2E
+#define QIB_7322_ErrClear_0_SDmaMissingDwErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SDmaDwEnErrClear_LSB 0x2D
+#define QIB_7322_ErrClear_0_SDmaDwEnErrClear_MSB 0x2D
+#define QIB_7322_ErrClear_0_SDmaDwEnErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SDmaRpyTagErrClear_LSB 0x2C
+#define QIB_7322_ErrClear_0_SDmaRpyTagErrClear_MSB 0x2C
+#define QIB_7322_ErrClear_0_SDmaRpyTagErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SDma1stDescErrClear_LSB 0x2B
+#define QIB_7322_ErrClear_0_SDma1stDescErrClear_MSB 0x2B
+#define QIB_7322_ErrClear_0_SDma1stDescErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SDmaBaseErrClear_LSB 0x2A
+#define QIB_7322_ErrClear_0_SDmaBaseErrClear_MSB 0x2A
+#define QIB_7322_ErrClear_0_SDmaBaseErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SDmaTailOutOfBoundErrClear_LSB 0x29
+#define QIB_7322_ErrClear_0_SDmaTailOutOfBoundErrClear_MSB 0x29
+#define QIB_7322_ErrClear_0_SDmaTailOutOfBoundErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SDmaOutOfBoundErrClear_LSB 0x28
+#define QIB_7322_ErrClear_0_SDmaOutOfBoundErrClear_MSB 0x28
+#define QIB_7322_ErrClear_0_SDmaOutOfBoundErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SDmaGenMismatchErrClear_LSB 0x27
+#define QIB_7322_ErrClear_0_SDmaGenMismatchErrClear_MSB 0x27
+#define QIB_7322_ErrClear_0_SDmaGenMismatchErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SendBufMisuseErrClear_LSB 0x26
+#define QIB_7322_ErrClear_0_SendBufMisuseErrClear_MSB 0x26
+#define QIB_7322_ErrClear_0_SendBufMisuseErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SendUnsupportedVLErrClear_LSB 0x25
+#define QIB_7322_ErrClear_0_SendUnsupportedVLErrClear_MSB 0x25
+#define QIB_7322_ErrClear_0_SendUnsupportedVLErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SendUnexpectedPktNumErrClear_LSB 0x24
+#define QIB_7322_ErrClear_0_SendUnexpectedPktNumErrClear_MSB 0x24
+#define QIB_7322_ErrClear_0_SendUnexpectedPktNumErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SendDroppedDataPktErrClear_LSB 0x22
+#define QIB_7322_ErrClear_0_SendDroppedDataPktErrClear_MSB 0x22
+#define QIB_7322_ErrClear_0_SendDroppedDataPktErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SendDroppedSmpPktErrClear_LSB 0x21
+#define QIB_7322_ErrClear_0_SendDroppedSmpPktErrClear_MSB 0x21
+#define QIB_7322_ErrClear_0_SendDroppedSmpPktErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SendPktLenErrClear_LSB 0x20
+#define QIB_7322_ErrClear_0_SendPktLenErrClear_MSB 0x20
+#define QIB_7322_ErrClear_0_SendPktLenErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SendUnderRunErrClear_LSB 0x1F
+#define QIB_7322_ErrClear_0_SendUnderRunErrClear_MSB 0x1F
+#define QIB_7322_ErrClear_0_SendUnderRunErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SendMaxPktLenErrClear_LSB 0x1E
+#define QIB_7322_ErrClear_0_SendMaxPktLenErrClear_MSB 0x1E
+#define QIB_7322_ErrClear_0_SendMaxPktLenErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_SendMinPktLenErrClear_LSB 0x1D
+#define QIB_7322_ErrClear_0_SendMinPktLenErrClear_MSB 0x1D
+#define QIB_7322_ErrClear_0_SendMinPktLenErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvIBLostLinkErrClear_LSB 0x11
+#define QIB_7322_ErrClear_0_RcvIBLostLinkErrClear_MSB 0x11
+#define QIB_7322_ErrClear_0_RcvIBLostLinkErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvHdrErrClear_LSB 0x10
+#define QIB_7322_ErrClear_0_RcvHdrErrClear_MSB 0x10
+#define QIB_7322_ErrClear_0_RcvHdrErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvHdrLenErrClear_LSB 0xF
+#define QIB_7322_ErrClear_0_RcvHdrLenErrClear_MSB 0xF
+#define QIB_7322_ErrClear_0_RcvHdrLenErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvBadTidErrClear_LSB 0xE
+#define QIB_7322_ErrClear_0_RcvBadTidErrClear_MSB 0xE
+#define QIB_7322_ErrClear_0_RcvBadTidErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvBadVersionErrClear_LSB 0xB
+#define QIB_7322_ErrClear_0_RcvBadVersionErrClear_MSB 0xB
+#define QIB_7322_ErrClear_0_RcvBadVersionErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvIBFlowErrClear_LSB 0xA
+#define QIB_7322_ErrClear_0_RcvIBFlowErrClear_MSB 0xA
+#define QIB_7322_ErrClear_0_RcvIBFlowErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvEBPErrClear_LSB 0x9
+#define QIB_7322_ErrClear_0_RcvEBPErrClear_MSB 0x9
+#define QIB_7322_ErrClear_0_RcvEBPErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvUnsupportedVLErrClear_LSB 0x8
+#define QIB_7322_ErrClear_0_RcvUnsupportedVLErrClear_MSB 0x8
+#define QIB_7322_ErrClear_0_RcvUnsupportedVLErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvUnexpectedCharErrClear_LSB 0x7
+#define QIB_7322_ErrClear_0_RcvUnexpectedCharErrClear_MSB 0x7
+#define QIB_7322_ErrClear_0_RcvUnexpectedCharErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvShortPktLenErrClear_LSB 0x6
+#define QIB_7322_ErrClear_0_RcvShortPktLenErrClear_MSB 0x6
+#define QIB_7322_ErrClear_0_RcvShortPktLenErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvLongPktLenErrClear_LSB 0x5
+#define QIB_7322_ErrClear_0_RcvLongPktLenErrClear_MSB 0x5
+#define QIB_7322_ErrClear_0_RcvLongPktLenErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvMaxPktLenErrClear_LSB 0x4
+#define QIB_7322_ErrClear_0_RcvMaxPktLenErrClear_MSB 0x4
+#define QIB_7322_ErrClear_0_RcvMaxPktLenErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvMinPktLenErrClear_LSB 0x3
+#define QIB_7322_ErrClear_0_RcvMinPktLenErrClear_MSB 0x3
+#define QIB_7322_ErrClear_0_RcvMinPktLenErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvICRCErrClear_LSB 0x2
+#define QIB_7322_ErrClear_0_RcvICRCErrClear_MSB 0x2
+#define QIB_7322_ErrClear_0_RcvICRCErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvVCRCErrClear_LSB 0x1
+#define QIB_7322_ErrClear_0_RcvVCRCErrClear_MSB 0x1
+#define QIB_7322_ErrClear_0_RcvVCRCErrClear_RMASK 0x1
+#define QIB_7322_ErrClear_0_RcvFormatErrClear_LSB 0x0
+#define QIB_7322_ErrClear_0_RcvFormatErrClear_MSB 0x0
+#define QIB_7322_ErrClear_0_RcvFormatErrClear_RMASK 0x1
+
+#define QIB_7322_TXEStatus_0_OFFS 0x10B8
+#define QIB_7322_TXEStatus_0_DEF 0x0000000XC00080FF
+#define QIB_7322_TXEStatus_0_TXE_IBC_Idle_LSB 0x1F
+#define QIB_7322_TXEStatus_0_TXE_IBC_Idle_MSB 0x1F
+#define QIB_7322_TXEStatus_0_TXE_IBC_Idle_RMASK 0x1
+#define QIB_7322_TXEStatus_0_RmFifoEmpty_LSB 0x1E
+#define QIB_7322_TXEStatus_0_RmFifoEmpty_MSB 0x1E
+#define QIB_7322_TXEStatus_0_RmFifoEmpty_RMASK 0x1
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL15_LSB 0xF
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL15_MSB 0xF
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL15_RMASK 0x1
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL7_LSB 0x7
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL7_MSB 0x7
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL7_RMASK 0x1
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL6_LSB 0x6
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL6_MSB 0x6
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL6_RMASK 0x1
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL5_LSB 0x5
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL5_MSB 0x5
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL5_RMASK 0x1
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL4_LSB 0x4
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL4_MSB 0x4
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL4_RMASK 0x1
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL3_LSB 0x3
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL3_MSB 0x3
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL3_RMASK 0x1
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL2_LSB 0x2
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL2_MSB 0x2
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL2_RMASK 0x1
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL1_LSB 0x1
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL1_MSB 0x1
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL1_RMASK 0x1
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL0_LSB 0x0
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL0_MSB 0x0
+#define QIB_7322_TXEStatus_0_LaFifoEmpty_VL0_RMASK 0x1
+
+#define QIB_7322_RcvCtrl_0_OFFS 0x1100
+#define QIB_7322_RcvCtrl_0_DEF 0x0000000000000000
+#define QIB_7322_RcvCtrl_0_RcvResetCredit_LSB 0x2A
+#define QIB_7322_RcvCtrl_0_RcvResetCredit_MSB 0x2A
+#define QIB_7322_RcvCtrl_0_RcvResetCredit_RMASK 0x1
+#define QIB_7322_RcvCtrl_0_RcvPartitionKeyDisable_LSB 0x29
+#define QIB_7322_RcvCtrl_0_RcvPartitionKeyDisable_MSB 0x29
+#define QIB_7322_RcvCtrl_0_RcvPartitionKeyDisable_RMASK 0x1
+#define QIB_7322_RcvCtrl_0_RcvQPMapEnable_LSB 0x28
+#define QIB_7322_RcvCtrl_0_RcvQPMapEnable_MSB 0x28
+#define QIB_7322_RcvCtrl_0_RcvQPMapEnable_RMASK 0x1
+#define QIB_7322_RcvCtrl_0_RcvIBPortEnable_LSB 0x27
+#define QIB_7322_RcvCtrl_0_RcvIBPortEnable_MSB 0x27
+#define QIB_7322_RcvCtrl_0_RcvIBPortEnable_RMASK 0x1
+#define QIB_7322_RcvCtrl_0_ContextEnableUser_LSB 0x2
+#define QIB_7322_RcvCtrl_0_ContextEnableUser_MSB 0x11
+#define QIB_7322_RcvCtrl_0_ContextEnableUser_RMASK 0xFFFF
+#define QIB_7322_RcvCtrl_0_ContextEnableKernel_LSB 0x0
+#define QIB_7322_RcvCtrl_0_ContextEnableKernel_MSB 0x0
+#define QIB_7322_RcvCtrl_0_ContextEnableKernel_RMASK 0x1
+
+#define QIB_7322_RcvBTHQP_0_OFFS 0x1108
+#define QIB_7322_RcvBTHQP_0_DEF 0x0000000000000000
+#define QIB_7322_RcvBTHQP_0_RcvBTHQP_LSB 0x0
+#define QIB_7322_RcvBTHQP_0_RcvBTHQP_MSB 0x17
+#define QIB_7322_RcvBTHQP_0_RcvBTHQP_RMASK 0xFFFFFF
+
+#define QIB_7322_RcvQPMapTableA_0_OFFS 0x1110
+#define QIB_7322_RcvQPMapTableA_0_DEF 0x0000000000000000
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext5_LSB 0x19
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext5_MSB 0x1D
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext5_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext4_LSB 0x14
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext4_MSB 0x18
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext4_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext3_LSB 0xF
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext3_MSB 0x13
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext3_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext2_LSB 0xA
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext2_MSB 0xE
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext2_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext1_LSB 0x5
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext1_MSB 0x9
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext1_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext0_LSB 0x0
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext0_MSB 0x4
+#define QIB_7322_RcvQPMapTableA_0_RcvQPMapContext0_RMASK 0x1F
+
+#define QIB_7322_RcvQPMapTableB_0_OFFS 0x1118
+#define QIB_7322_RcvQPMapTableB_0_DEF 0x0000000000000000
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext11_LSB 0x19
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext11_MSB 0x1D
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext11_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext10_LSB 0x14
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext10_MSB 0x18
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext10_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext9_LSB 0xF
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext9_MSB 0x13
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext9_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext8_LSB 0xA
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext8_MSB 0xE
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext8_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext7_LSB 0x5
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext7_MSB 0x9
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext7_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext6_LSB 0x0
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext6_MSB 0x4
+#define QIB_7322_RcvQPMapTableB_0_RcvQPMapContext6_RMASK 0x1F
+
+#define QIB_7322_RcvQPMapTableC_0_OFFS 0x1120
+#define QIB_7322_RcvQPMapTableC_0_DEF 0x0000000000000000
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext17_LSB 0x19
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext17_MSB 0x1D
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext17_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext16_LSB 0x14
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext16_MSB 0x18
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext16_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext15_LSB 0xF
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext15_MSB 0x13
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext15_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext14_LSB 0xA
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext14_MSB 0xE
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext14_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext13_LSB 0x5
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext13_MSB 0x9
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext13_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext12_LSB 0x0
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext12_MSB 0x4
+#define QIB_7322_RcvQPMapTableC_0_RcvQPMapContext12_RMASK 0x1F
+
+#define QIB_7322_RcvQPMapTableD_0_OFFS 0x1128
+#define QIB_7322_RcvQPMapTableD_0_DEF 0x0000000000000000
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext23_LSB 0x19
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext23_MSB 0x1D
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext23_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext22_LSB 0x14
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext22_MSB 0x18
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext22_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext21_LSB 0xF
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext21_MSB 0x13
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext21_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext20_LSB 0xA
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext20_MSB 0xE
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext20_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext19_LSB 0x5
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext19_MSB 0x9
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext19_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext18_LSB 0x0
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext18_MSB 0x4
+#define QIB_7322_RcvQPMapTableD_0_RcvQPMapContext18_RMASK 0x1F
+
+#define QIB_7322_RcvQPMapTableE_0_OFFS 0x1130
+#define QIB_7322_RcvQPMapTableE_0_DEF 0x0000000000000000
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext29_LSB 0x19
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext29_MSB 0x1D
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext29_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext28_LSB 0x14
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext28_MSB 0x18
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext28_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext27_LSB 0xF
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext27_MSB 0x13
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext27_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext26_LSB 0xA
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext26_MSB 0xE
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext26_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext25_LSB 0x5
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext25_MSB 0x9
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext25_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext24_LSB 0x0
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext24_MSB 0x4
+#define QIB_7322_RcvQPMapTableE_0_RcvQPMapContext24_RMASK 0x1F
+
+#define QIB_7322_RcvQPMapTableF_0_OFFS 0x1138
+#define QIB_7322_RcvQPMapTableF_0_DEF 0x0000000000000000
+#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext31_LSB 0x5
+#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext31_MSB 0x9
+#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext31_RMASK 0x1F
+#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext30_LSB 0x0
+#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext30_MSB 0x4
+#define QIB_7322_RcvQPMapTableF_0_RcvQPMapContext30_RMASK 0x1F
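Taken together, RcvQPMapTableA_0 through RcvQPMapTableF_0 pack the 32 RcvQPMapContext entries six per 64-bit register in 5-bit fields (RMASK 0x1F) on a 5-bit stride, with the tables 8 bytes apart. A minimal sketch of the index arithmetic that layout implies (the helper name is illustrative, not from the imported header):

/* Illustrative only: for map entry i (0..31), compute the register
 * offset and the bit position of its 5-bit field, following the
 * LSB spacing (0x0, 0x5, 0xA, 0xF, 0x14, 0x19) and the 8-byte table
 * stride shown in the defines above. */
static inline void qib_7322_qpmap_entry(unsigned int i,
					unsigned long long *offs,
					unsigned int *lsb)
{
	*offs = QIB_7322_RcvQPMapTableA_0_OFFS + 8ULL * (i / 6);
	*lsb  = (i % 6) * 5;
}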
+
+#define QIB_7322_PSStat_0_OFFS 0x1140
+#define QIB_7322_PSStat_0_DEF 0x0000000000000000
+
+#define QIB_7322_PSStart_0_OFFS 0x1148
+#define QIB_7322_PSStart_0_DEF 0x0000000000000000
+
+#define QIB_7322_PSInterval_0_OFFS 0x1150
+#define QIB_7322_PSInterval_0_DEF 0x0000000000000000
+
+#define QIB_7322_RcvStatus_0_OFFS 0x1160
+#define QIB_7322_RcvStatus_0_DEF 0x0000000000000000
+#define QIB_7322_RcvStatus_0_DmaeqBlockingContext_LSB 0x1
+#define QIB_7322_RcvStatus_0_DmaeqBlockingContext_MSB 0x5
+#define QIB_7322_RcvStatus_0_DmaeqBlockingContext_RMASK 0x1F
+#define QIB_7322_RcvStatus_0_RxPktInProgress_LSB 0x0
+#define QIB_7322_RcvStatus_0_RxPktInProgress_MSB 0x0
+#define QIB_7322_RcvStatus_0_RxPktInProgress_RMASK 0x1
+
+#define QIB_7322_RcvPartitionKey_0_OFFS 0x1168
+#define QIB_7322_RcvPartitionKey_0_DEF 0x0000000000000000
+
+#define QIB_7322_RcvQPMulticastContext_0_OFFS 0x1170
+#define QIB_7322_RcvQPMulticastContext_0_DEF 0x0000000000000000
+#define QIB_7322_RcvQPMulticastContext_0_RcvQpMcContext_LSB 0x0
+#define QIB_7322_RcvQPMulticastContext_0_RcvQpMcContext_MSB 0x4
+#define QIB_7322_RcvQPMulticastContext_0_RcvQpMcContext_RMASK 0x1F
+
+#define QIB_7322_RcvPktLEDCnt_0_OFFS 0x1178
+#define QIB_7322_RcvPktLEDCnt_0_DEF 0x0000000000000000
+#define QIB_7322_RcvPktLEDCnt_0_ONperiod_LSB 0x20
+#define QIB_7322_RcvPktLEDCnt_0_ONperiod_MSB 0x3F
+#define QIB_7322_RcvPktLEDCnt_0_ONperiod_RMASK 0xFFFFFFFF
+#define QIB_7322_RcvPktLEDCnt_0_OFFperiod_LSB 0x0
+#define QIB_7322_RcvPktLEDCnt_0_OFFperiod_MSB 0x1F
+#define QIB_7322_RcvPktLEDCnt_0_OFFperiod_RMASK 0xFFFFFFFF
+
+#define QIB_7322_SendDmaIdleCnt_0_OFFS 0x1180
+#define QIB_7322_SendDmaIdleCnt_0_DEF 0x0000000000000000
+#define QIB_7322_SendDmaIdleCnt_0_SendDmaIdleCnt_LSB 0x0
+#define QIB_7322_SendDmaIdleCnt_0_SendDmaIdleCnt_MSB 0xF
+#define QIB_7322_SendDmaIdleCnt_0_SendDmaIdleCnt_RMASK 0xFFFF
+
+#define QIB_7322_SendDmaReloadCnt_0_OFFS 0x1188
+#define QIB_7322_SendDmaReloadCnt_0_DEF 0x0000000000000000
+#define QIB_7322_SendDmaReloadCnt_0_SendDmaReloadCnt_LSB 0x0
+#define QIB_7322_SendDmaReloadCnt_0_SendDmaReloadCnt_MSB 0xF
+#define QIB_7322_SendDmaReloadCnt_0_SendDmaReloadCnt_RMASK 0xFFFF
+
+#define QIB_7322_SendDmaDescCnt_0_OFFS 0x1190
+#define QIB_7322_SendDmaDescCnt_0_DEF 0x0000000000000000
+#define QIB_7322_SendDmaDescCnt_0_SendDmaDescCnt_LSB 0x0
+#define QIB_7322_SendDmaDescCnt_0_SendDmaDescCnt_MSB 0xF
+#define QIB_7322_SendDmaDescCnt_0_SendDmaDescCnt_RMASK 0xFFFF
+
+#define QIB_7322_SendCtrl_0_OFFS 0x11C0
+#define QIB_7322_SendCtrl_0_DEF 0x0000000000000000
+#define QIB_7322_SendCtrl_0_IBVLArbiterEn_LSB 0xF
+#define QIB_7322_SendCtrl_0_IBVLArbiterEn_MSB 0xF
+#define QIB_7322_SendCtrl_0_IBVLArbiterEn_RMASK 0x1
+#define QIB_7322_SendCtrl_0_TxeDrainRmFifo_LSB 0xE
+#define QIB_7322_SendCtrl_0_TxeDrainRmFifo_MSB 0xE
+#define QIB_7322_SendCtrl_0_TxeDrainRmFifo_RMASK 0x1
+#define QIB_7322_SendCtrl_0_TxeDrainLaFifo_LSB 0xD
+#define QIB_7322_SendCtrl_0_TxeDrainLaFifo_MSB 0xD
+#define QIB_7322_SendCtrl_0_TxeDrainLaFifo_RMASK 0x1
+#define QIB_7322_SendCtrl_0_SDmaHalt_LSB 0xC
+#define QIB_7322_SendCtrl_0_SDmaHalt_MSB 0xC
+#define QIB_7322_SendCtrl_0_SDmaHalt_RMASK 0x1
+#define QIB_7322_SendCtrl_0_SDmaEnable_LSB 0xB
+#define QIB_7322_SendCtrl_0_SDmaEnable_MSB 0xB
+#define QIB_7322_SendCtrl_0_SDmaEnable_RMASK 0x1
+#define QIB_7322_SendCtrl_0_SDmaSingleDescriptor_LSB 0xA
+#define QIB_7322_SendCtrl_0_SDmaSingleDescriptor_MSB 0xA
+#define QIB_7322_SendCtrl_0_SDmaSingleDescriptor_RMASK 0x1
+#define QIB_7322_SendCtrl_0_SDmaIntEnable_LSB 0x9
+#define QIB_7322_SendCtrl_0_SDmaIntEnable_MSB 0x9
+#define QIB_7322_SendCtrl_0_SDmaIntEnable_RMASK 0x1
+#define QIB_7322_SendCtrl_0_SDmaCleanup_LSB 0x8
+#define QIB_7322_SendCtrl_0_SDmaCleanup_MSB 0x8
+#define QIB_7322_SendCtrl_0_SDmaCleanup_RMASK 0x1
+#define QIB_7322_SendCtrl_0_ForceCreditUpToDate_LSB 0x7
+#define QIB_7322_SendCtrl_0_ForceCreditUpToDate_MSB 0x7
+#define QIB_7322_SendCtrl_0_ForceCreditUpToDate_RMASK 0x1
+#define QIB_7322_SendCtrl_0_SendEnable_LSB 0x3
+#define QIB_7322_SendCtrl_0_SendEnable_MSB 0x3
+#define QIB_7322_SendCtrl_0_SendEnable_RMASK 0x1
+#define QIB_7322_SendCtrl_0_TxeBypassIbc_LSB 0x1
+#define QIB_7322_SendCtrl_0_TxeBypassIbc_MSB 0x1
+#define QIB_7322_SendCtrl_0_TxeBypassIbc_RMASK 0x1
+#define QIB_7322_SendCtrl_0_TxeAbortIbc_LSB 0x0
+#define QIB_7322_SendCtrl_0_TxeAbortIbc_MSB 0x0
+#define QIB_7322_SendCtrl_0_TxeAbortIbc_RMASK 0x1
+
+#define QIB_7322_SendDmaBase_0_OFFS 0x11F8
+#define QIB_7322_SendDmaBase_0_DEF 0x0000000000000000
+#define QIB_7322_SendDmaBase_0_SendDmaBase_LSB 0x0
+#define QIB_7322_SendDmaBase_0_SendDmaBase_MSB 0x2F
+#define QIB_7322_SendDmaBase_0_SendDmaBase_RMASK 0xFFFFFFFFFFFF
+
+#define QIB_7322_SendDmaLenGen_0_OFFS 0x1200
+#define QIB_7322_SendDmaLenGen_0_DEF 0x0000000000000000
+#define QIB_7322_SendDmaLenGen_0_Generation_LSB 0x10
+#define QIB_7322_SendDmaLenGen_0_Generation_MSB 0x12
+#define QIB_7322_SendDmaLenGen_0_Generation_RMASK 0x7
+#define QIB_7322_SendDmaLenGen_0_Length_LSB 0x0
+#define QIB_7322_SendDmaLenGen_0_Length_MSB 0xF
+#define QIB_7322_SendDmaLenGen_0_Length_RMASK 0xFFFF
+
+#define QIB_7322_SendDmaTail_0_OFFS 0x1208
+#define QIB_7322_SendDmaTail_0_DEF 0x0000000000000000
+#define QIB_7322_SendDmaTail_0_SendDmaTail_LSB 0x0
+#define QIB_7322_SendDmaTail_0_SendDmaTail_MSB 0xF
+#define QIB_7322_SendDmaTail_0_SendDmaTail_RMASK 0xFFFF
+
+#define QIB_7322_SendDmaHead_0_OFFS 0x1210
+#define QIB_7322_SendDmaHead_0_DEF 0x0000000000000000
+#define QIB_7322_SendDmaHead_0_InternalSendDmaHead_LSB 0x20
+#define QIB_7322_SendDmaHead_0_InternalSendDmaHead_MSB 0x2F
+#define QIB_7322_SendDmaHead_0_InternalSendDmaHead_RMASK 0xFFFF
+#define QIB_7322_SendDmaHead_0_SendDmaHead_LSB 0x0
+#define QIB_7322_SendDmaHead_0_SendDmaHead_MSB 0xF
+#define QIB_7322_SendDmaHead_0_SendDmaHead_RMASK 0xFFFF
+
+#define QIB_7322_SendDmaHeadAddr_0_OFFS 0x1218
+#define QIB_7322_SendDmaHeadAddr_0_DEF 0x0000000000000000
+#define QIB_7322_SendDmaHeadAddr_0_SendDmaHeadAddr_LSB 0x0
+#define QIB_7322_SendDmaHeadAddr_0_SendDmaHeadAddr_MSB 0x2F
+#define QIB_7322_SendDmaHeadAddr_0_SendDmaHeadAddr_RMASK 0xFFFFFFFFFFFF
+
+#define QIB_7322_SendDmaBufMask0_0_OFFS 0x1220
+#define QIB_7322_SendDmaBufMask0_0_DEF 0x0000000000000000
+#define QIB_7322_SendDmaBufMask0_0_BufMask_63_0_LSB 0x0
+#define QIB_7322_SendDmaBufMask0_0_BufMask_63_0_MSB 0x3F
+#define QIB_7322_SendDmaBufMask0_0_BufMask_63_0_RMASK 0x0
+
+#define QIB_7322_SendDmaStatus_0_OFFS 0x1238
+#define QIB_7322_SendDmaStatus_0_DEF 0x0000000042000000
+#define QIB_7322_SendDmaStatus_0_ScoreBoardDrainInProg_LSB 0x3F
+#define QIB_7322_SendDmaStatus_0_ScoreBoardDrainInProg_MSB 0x3F
+#define QIB_7322_SendDmaStatus_0_ScoreBoardDrainInProg_RMASK 0x1
+#define QIB_7322_SendDmaStatus_0_HaltInProg_LSB 0x3E
+#define QIB_7322_SendDmaStatus_0_HaltInProg_MSB 0x3E
+#define QIB_7322_SendDmaStatus_0_HaltInProg_RMASK 0x1
+#define QIB_7322_SendDmaStatus_0_InternalSDmaHalt_LSB 0x3D
+#define QIB_7322_SendDmaStatus_0_InternalSDmaHalt_MSB 0x3D
+#define QIB_7322_SendDmaStatus_0_InternalSDmaHalt_RMASK 0x1
+#define QIB_7322_SendDmaStatus_0_ScbDescIndex_13_0_LSB 0x2F
+#define QIB_7322_SendDmaStatus_0_ScbDescIndex_13_0_MSB 0x3C
+#define QIB_7322_SendDmaStatus_0_ScbDescIndex_13_0_RMASK 0x3FFF
+#define QIB_7322_SendDmaStatus_0_RpyLowAddr_6_0_LSB 0x28
+#define QIB_7322_SendDmaStatus_0_RpyLowAddr_6_0_MSB 0x2E
+#define QIB_7322_SendDmaStatus_0_RpyLowAddr_6_0_RMASK 0x7F
+#define QIB_7322_SendDmaStatus_0_RpyTag_7_0_LSB 0x20
+#define QIB_7322_SendDmaStatus_0_RpyTag_7_0_MSB 0x27
+#define QIB_7322_SendDmaStatus_0_RpyTag_7_0_RMASK 0xFF
+#define QIB_7322_SendDmaStatus_0_ScbFull_LSB 0x1F
+#define QIB_7322_SendDmaStatus_0_ScbFull_MSB 0x1F
+#define QIB_7322_SendDmaStatus_0_ScbFull_RMASK 0x1
+#define QIB_7322_SendDmaStatus_0_ScbEmpty_LSB 0x1E
+#define QIB_7322_SendDmaStatus_0_ScbEmpty_MSB 0x1E
+#define QIB_7322_SendDmaStatus_0_ScbEmpty_RMASK 0x1
+#define QIB_7322_SendDmaStatus_0_ScbEntryValid_LSB 0x1D
+#define QIB_7322_SendDmaStatus_0_ScbEntryValid_MSB 0x1D
+#define QIB_7322_SendDmaStatus_0_ScbEntryValid_RMASK 0x1
+#define QIB_7322_SendDmaStatus_0_ScbFetchDescFlag_LSB 0x1C
+#define QIB_7322_SendDmaStatus_0_ScbFetchDescFlag_MSB 0x1C
+#define QIB_7322_SendDmaStatus_0_ScbFetchDescFlag_RMASK 0x1
+#define QIB_7322_SendDmaStatus_0_SplFifoReadyToGo_LSB 0x1B
+#define QIB_7322_SendDmaStatus_0_SplFifoReadyToGo_MSB 0x1B
+#define QIB_7322_SendDmaStatus_0_SplFifoReadyToGo_RMASK 0x1
+#define QIB_7322_SendDmaStatus_0_SplFifoDisarmed_LSB 0x1A
+#define QIB_7322_SendDmaStatus_0_SplFifoDisarmed_MSB 0x1A
+#define QIB_7322_SendDmaStatus_0_SplFifoDisarmed_RMASK 0x1
+#define QIB_7322_SendDmaStatus_0_SplFifoEmpty_LSB 0x19
+#define QIB_7322_SendDmaStatus_0_SplFifoEmpty_MSB 0x19
+#define QIB_7322_SendDmaStatus_0_SplFifoEmpty_RMASK 0x1
+#define QIB_7322_SendDmaStatus_0_SplFifoFull_LSB 0x18
+#define QIB_7322_SendDmaStatus_0_SplFifoFull_MSB 0x18
+#define QIB_7322_SendDmaStatus_0_SplFifoFull_RMASK 0x1
+#define QIB_7322_SendDmaStatus_0_SplFifoBufNum_LSB 0x10
+#define QIB_7322_SendDmaStatus_0_SplFifoBufNum_MSB 0x17
+#define QIB_7322_SendDmaStatus_0_SplFifoBufNum_RMASK 0xFF
+#define QIB_7322_SendDmaStatus_0_SplFifoDescIndex_LSB 0x0
+#define QIB_7322_SendDmaStatus_0_SplFifoDescIndex_MSB 0xF
+#define QIB_7322_SendDmaStatus_0_SplFifoDescIndex_RMASK 0xFFFF
+
+#define QIB_7322_SendDmaPriorityThld_0_OFFS 0x1258
+#define QIB_7322_SendDmaPriorityThld_0_DEF 0x0000000000000000
+#define QIB_7322_SendDmaPriorityThld_0_PriorityThreshold_LSB 0x0
+#define QIB_7322_SendDmaPriorityThld_0_PriorityThreshold_MSB 0x3
+#define QIB_7322_SendDmaPriorityThld_0_PriorityThreshold_RMASK 0xF
+
+#define QIB_7322_SendHdrErrSymptom_0_OFFS 0x1260
+#define QIB_7322_SendHdrErrSymptom_0_DEF 0x0000000000000000
+#define QIB_7322_SendHdrErrSymptom_0_NonKeyPacket_LSB 0x6
+#define QIB_7322_SendHdrErrSymptom_0_NonKeyPacket_MSB 0x6
+#define QIB_7322_SendHdrErrSymptom_0_NonKeyPacket_RMASK 0x1
+#define QIB_7322_SendHdrErrSymptom_0_GRHFail_LSB 0x5
+#define QIB_7322_SendHdrErrSymptom_0_GRHFail_MSB 0x5
+#define QIB_7322_SendHdrErrSymptom_0_GRHFail_RMASK 0x1
+#define QIB_7322_SendHdrErrSymptom_0_PkeyFail_LSB 0x4
+#define QIB_7322_SendHdrErrSymptom_0_PkeyFail_MSB 0x4
+#define QIB_7322_SendHdrErrSymptom_0_PkeyFail_RMASK 0x1
+#define QIB_7322_SendHdrErrSymptom_0_QPFail_LSB 0x3
+#define QIB_7322_SendHdrErrSymptom_0_QPFail_MSB 0x3
+#define QIB_7322_SendHdrErrSymptom_0_QPFail_RMASK 0x1
+#define QIB_7322_SendHdrErrSymptom_0_SLIDFail_LSB 0x2
+#define QIB_7322_SendHdrErrSymptom_0_SLIDFail_MSB 0x2
+#define QIB_7322_SendHdrErrSymptom_0_SLIDFail_RMASK 0x1
+#define QIB_7322_SendHdrErrSymptom_0_RawIPV6_LSB 0x1
+#define QIB_7322_SendHdrErrSymptom_0_RawIPV6_MSB 0x1
+#define QIB_7322_SendHdrErrSymptom_0_RawIPV6_RMASK 0x1
+#define QIB_7322_SendHdrErrSymptom_0_PacketTooSmall_LSB 0x0
+#define QIB_7322_SendHdrErrSymptom_0_PacketTooSmall_MSB 0x0
+#define QIB_7322_SendHdrErrSymptom_0_PacketTooSmall_RMASK 0x1
+
+#define QIB_7322_RxCreditVL0_0_OFFS 0x1280
+#define QIB_7322_RxCreditVL0_0_DEF 0x0000000000000000
+#define QIB_7322_RxCreditVL0_0_RxBufrConsumedVL_LSB 0x10
+#define QIB_7322_RxCreditVL0_0_RxBufrConsumedVL_MSB 0x1B
+#define QIB_7322_RxCreditVL0_0_RxBufrConsumedVL_RMASK 0xFFF
+#define QIB_7322_RxCreditVL0_0_RxMaxCreditVL_LSB 0x0
+#define QIB_7322_RxCreditVL0_0_RxMaxCreditVL_MSB 0xB
+#define QIB_7322_RxCreditVL0_0_RxMaxCreditVL_RMASK 0xFFF
+
+#define QIB_7322_SendDmaBufUsed0_0_OFFS 0x1480
+#define QIB_7322_SendDmaBufUsed0_0_DEF 0x0000000000000000
+#define QIB_7322_SendDmaBufUsed0_0_BufUsed_63_0_LSB 0x0
+#define QIB_7322_SendDmaBufUsed0_0_BufUsed_63_0_MSB 0x3F
+#define QIB_7322_SendDmaBufUsed0_0_BufUsed_63_0_RMASK 0x0
+
+#define QIB_7322_SendCheckControl_0_OFFS 0x14A8
+#define QIB_7322_SendCheckControl_0_DEF 0x0000000000000000
+#define QIB_7322_SendCheckControl_0_PKey_En_LSB 0x4
+#define QIB_7322_SendCheckControl_0_PKey_En_MSB 0x4
+#define QIB_7322_SendCheckControl_0_PKey_En_RMASK 0x1
+#define QIB_7322_SendCheckControl_0_BTHQP_En_LSB 0x3
+#define QIB_7322_SendCheckControl_0_BTHQP_En_MSB 0x3
+#define QIB_7322_SendCheckControl_0_BTHQP_En_RMASK 0x1
+#define QIB_7322_SendCheckControl_0_SLID_En_LSB 0x2
+#define QIB_7322_SendCheckControl_0_SLID_En_MSB 0x2
+#define QIB_7322_SendCheckControl_0_SLID_En_RMASK 0x1
+#define QIB_7322_SendCheckControl_0_RawIPV6_En_LSB 0x1
+#define QIB_7322_SendCheckControl_0_RawIPV6_En_MSB 0x1
+#define QIB_7322_SendCheckControl_0_RawIPV6_En_RMASK 0x1
+#define QIB_7322_SendCheckControl_0_PacketTooSmall_En_LSB 0x0
+#define QIB_7322_SendCheckControl_0_PacketTooSmall_En_MSB 0x0
+#define QIB_7322_SendCheckControl_0_PacketTooSmall_En_RMASK 0x1
+
+#define QIB_7322_SendIBSLIDMask_0_OFFS 0x14B0
+#define QIB_7322_SendIBSLIDMask_0_DEF 0x0000000000000000
+#define QIB_7322_SendIBSLIDMask_0_SendIBSLIDMask_15_0_LSB 0x0
+#define QIB_7322_SendIBSLIDMask_0_SendIBSLIDMask_15_0_MSB 0xF
+#define QIB_7322_SendIBSLIDMask_0_SendIBSLIDMask_15_0_RMASK 0xFFFF
+
+#define QIB_7322_SendIBSLIDAssign_0_OFFS 0x14B8
+#define QIB_7322_SendIBSLIDAssign_0_DEF 0x0000000000000000
+#define QIB_7322_SendIBSLIDAssign_0_SendIBSLIDAssign_15_0_LSB 0x0
+#define QIB_7322_SendIBSLIDAssign_0_SendIBSLIDAssign_15_0_MSB 0xF
+#define QIB_7322_SendIBSLIDAssign_0_SendIBSLIDAssign_15_0_RMASK 0xFFFF
+
+#define QIB_7322_IBCStatusA_0_OFFS 0x1540
+#define QIB_7322_IBCStatusA_0_DEF 0x0000000000000X02
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL7_LSB 0x27
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL7_MSB 0x27
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL7_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL6_LSB 0x26
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL6_MSB 0x26
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL6_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL5_LSB 0x25
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL5_MSB 0x25
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL5_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL4_LSB 0x24
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL4_MSB 0x24
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL4_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL3_LSB 0x23
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL3_MSB 0x23
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL3_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL2_LSB 0x22
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL2_MSB 0x22
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL2_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL1_LSB 0x21
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL1_MSB 0x21
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL1_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL0_LSB 0x20
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL0_MSB 0x20
+#define QIB_7322_IBCStatusA_0_TxCreditOk_VL0_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_TxReady_LSB 0x1E
+#define QIB_7322_IBCStatusA_0_TxReady_MSB 0x1E
+#define QIB_7322_IBCStatusA_0_TxReady_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_LinkSpeedQDR_LSB 0x1D
+#define QIB_7322_IBCStatusA_0_LinkSpeedQDR_MSB 0x1D
+#define QIB_7322_IBCStatusA_0_LinkSpeedQDR_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_ScrambleCapRemote_LSB 0xF
+#define QIB_7322_IBCStatusA_0_ScrambleCapRemote_MSB 0xF
+#define QIB_7322_IBCStatusA_0_ScrambleCapRemote_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_ScrambleEn_LSB 0xE
+#define QIB_7322_IBCStatusA_0_ScrambleEn_MSB 0xE
+#define QIB_7322_IBCStatusA_0_ScrambleEn_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_IBTxLaneReversed_LSB 0xD
+#define QIB_7322_IBCStatusA_0_IBTxLaneReversed_MSB 0xD
+#define QIB_7322_IBCStatusA_0_IBTxLaneReversed_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_IBRxLaneReversed_LSB 0xC
+#define QIB_7322_IBCStatusA_0_IBRxLaneReversed_MSB 0xC
+#define QIB_7322_IBCStatusA_0_IBRxLaneReversed_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_DDS_RXEQ_FAIL_LSB 0xA
+#define QIB_7322_IBCStatusA_0_DDS_RXEQ_FAIL_MSB 0xA
+#define QIB_7322_IBCStatusA_0_DDS_RXEQ_FAIL_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_LinkWidthActive_LSB 0x9
+#define QIB_7322_IBCStatusA_0_LinkWidthActive_MSB 0x9
+#define QIB_7322_IBCStatusA_0_LinkWidthActive_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_LinkSpeedActive_LSB 0x8
+#define QIB_7322_IBCStatusA_0_LinkSpeedActive_MSB 0x8
+#define QIB_7322_IBCStatusA_0_LinkSpeedActive_RMASK 0x1
+#define QIB_7322_IBCStatusA_0_LinkState_LSB 0x5
+#define QIB_7322_IBCStatusA_0_LinkState_MSB 0x7
+#define QIB_7322_IBCStatusA_0_LinkState_RMASK 0x7
+#define QIB_7322_IBCStatusA_0_LinkTrainingState_LSB 0x0
+#define QIB_7322_IBCStatusA_0_LinkTrainingState_MSB 0x4
+#define QIB_7322_IBCStatusA_0_LinkTrainingState_RMASK 0x1F
+
+#define QIB_7322_IBCStatusB_0_OFFS 0x1548
+#define QIB_7322_IBCStatusB_0_DEF 0x00000000XXXXXXXX
+#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_debug_LSB 0x27
+#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_debug_MSB 0x27
+#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_debug_RMASK 0x1
+#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_reached_threshold_LSB 0x26
+#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_reached_threshold_MSB 0x26
+#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_reached_threshold_RMASK 0x1
+#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_started_LSB 0x25
+#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_started_MSB 0x25
+#define QIB_7322_IBCStatusB_0_ibsd_adaptation_timer_started_RMASK 0x1
+#define QIB_7322_IBCStatusB_0_heartbeat_timed_out_LSB 0x24
+#define QIB_7322_IBCStatusB_0_heartbeat_timed_out_MSB 0x24
+#define QIB_7322_IBCStatusB_0_heartbeat_timed_out_RMASK 0x1
+#define QIB_7322_IBCStatusB_0_heartbeat_crosstalk_LSB 0x20
+#define QIB_7322_IBCStatusB_0_heartbeat_crosstalk_MSB 0x23
+#define QIB_7322_IBCStatusB_0_heartbeat_crosstalk_RMASK 0xF
+#define QIB_7322_IBCStatusB_0_RxEqLocalDevice_LSB 0x1E
+#define QIB_7322_IBCStatusB_0_RxEqLocalDevice_MSB 0x1F
+#define QIB_7322_IBCStatusB_0_RxEqLocalDevice_RMASK 0x3
+#define QIB_7322_IBCStatusB_0_ReqDDSLocalFromRmt_LSB 0x1A
+#define QIB_7322_IBCStatusB_0_ReqDDSLocalFromRmt_MSB 0x1D
+#define QIB_7322_IBCStatusB_0_ReqDDSLocalFromRmt_RMASK 0xF
+#define QIB_7322_IBCStatusB_0_LinkRoundTripLatency_LSB 0x0
+#define QIB_7322_IBCStatusB_0_LinkRoundTripLatency_MSB 0x19
+#define QIB_7322_IBCStatusB_0_LinkRoundTripLatency_RMASK 0x3FFFFFF
+
+#define QIB_7322_IBCCtrlA_0_OFFS 0x1560
+#define QIB_7322_IBCCtrlA_0_DEF 0x0000000000000000
+#define QIB_7322_IBCCtrlA_0_Loopback_LSB 0x3F
+#define QIB_7322_IBCCtrlA_0_Loopback_MSB 0x3F
+#define QIB_7322_IBCCtrlA_0_Loopback_RMASK 0x1
+#define QIB_7322_IBCCtrlA_0_LinkDownDefaultState_LSB 0x3E
+#define QIB_7322_IBCCtrlA_0_LinkDownDefaultState_MSB 0x3E
+#define QIB_7322_IBCCtrlA_0_LinkDownDefaultState_RMASK 0x1
+#define QIB_7322_IBCCtrlA_0_IBLinkEn_LSB 0x3D
+#define QIB_7322_IBCCtrlA_0_IBLinkEn_MSB 0x3D
+#define QIB_7322_IBCCtrlA_0_IBLinkEn_RMASK 0x1
+#define QIB_7322_IBCCtrlA_0_IBStatIntReductionEn_LSB 0x3C
+#define QIB_7322_IBCCtrlA_0_IBStatIntReductionEn_MSB 0x3C
+#define QIB_7322_IBCCtrlA_0_IBStatIntReductionEn_RMASK 0x1
+#define QIB_7322_IBCCtrlA_0_NumVLane_LSB 0x30
+#define QIB_7322_IBCCtrlA_0_NumVLane_MSB 0x32
+#define QIB_7322_IBCCtrlA_0_NumVLane_RMASK 0x7
+#define QIB_7322_IBCCtrlA_0_OverrunThreshold_LSB 0x24
+#define QIB_7322_IBCCtrlA_0_OverrunThreshold_MSB 0x27
+#define QIB_7322_IBCCtrlA_0_OverrunThreshold_RMASK 0xF
+#define QIB_7322_IBCCtrlA_0_PhyerrThreshold_LSB 0x20
+#define QIB_7322_IBCCtrlA_0_PhyerrThreshold_MSB 0x23
+#define QIB_7322_IBCCtrlA_0_PhyerrThreshold_RMASK 0xF
+#define QIB_7322_IBCCtrlA_0_MaxPktLen_LSB 0x15
+#define QIB_7322_IBCCtrlA_0_MaxPktLen_MSB 0x1F
+#define QIB_7322_IBCCtrlA_0_MaxPktLen_RMASK 0x7FF
+#define QIB_7322_IBCCtrlA_0_LinkCmd_LSB 0x13
+#define QIB_7322_IBCCtrlA_0_LinkCmd_MSB 0x14
+#define QIB_7322_IBCCtrlA_0_LinkCmd_RMASK 0x3
+#define QIB_7322_IBCCtrlA_0_LinkInitCmd_LSB 0x10
+#define QIB_7322_IBCCtrlA_0_LinkInitCmd_MSB 0x12
+#define QIB_7322_IBCCtrlA_0_LinkInitCmd_RMASK 0x7
+#define QIB_7322_IBCCtrlA_0_FlowCtrlWaterMark_LSB 0x8
+#define QIB_7322_IBCCtrlA_0_FlowCtrlWaterMark_MSB 0xF
+#define QIB_7322_IBCCtrlA_0_FlowCtrlWaterMark_RMASK 0xFF
+#define QIB_7322_IBCCtrlA_0_FlowCtrlPeriod_LSB 0x0
+#define QIB_7322_IBCCtrlA_0_FlowCtrlPeriod_MSB 0x7
+#define QIB_7322_IBCCtrlA_0_FlowCtrlPeriod_RMASK 0xFF
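For the multi-bit fields the same naming convention gives the write-side pattern: mask the value with _RMASK and shift it up by _LSB. A minimal sketch (the helper name is illustrative, not from the imported header) using the 11-bit MaxPktLen field of IBCCtrlA_0 defined above:

/* Illustrative only: place a packet-length value into the MaxPktLen
 * field of an IBCCtrlA_0 write value.  The mask keeps the value inside
 * the 11-bit field; the shift moves it to bit 0x15. */
static inline unsigned long long qib_7322_ibcctrla_maxpktlen(unsigned int pktlen)
{
	return (unsigned long long)(pktlen & QIB_7322_IBCCtrlA_0_MaxPktLen_RMASK)
		<< QIB_7322_IBCCtrlA_0_MaxPktLen_LSB;
}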
+
+#define QIB_7322_IBCCtrlB_0_OFFS 0x1568
+#define QIB_7322_IBCCtrlB_0_DEF 0x00000000000305FF
+#define QIB_7322_IBCCtrlB_0_IB_DLID_MASK_LSB 0x30
+#define QIB_7322_IBCCtrlB_0_IB_DLID_MASK_MSB 0x3F
+#define QIB_7322_IBCCtrlB_0_IB_DLID_MASK_RMASK 0xFFFF
+#define QIB_7322_IBCCtrlB_0_IB_DLID_LSB 0x20
+#define QIB_7322_IBCCtrlB_0_IB_DLID_MSB 0x2F
+#define QIB_7322_IBCCtrlB_0_IB_DLID_RMASK 0xFFFF
+#define QIB_7322_IBCCtrlB_0_IB_ENABLE_FILT_DPKT_LSB 0x1B
+#define QIB_7322_IBCCtrlB_0_IB_ENABLE_FILT_DPKT_MSB 0x1B
+#define QIB_7322_IBCCtrlB_0_IB_ENABLE_FILT_DPKT_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_HRTBT_REQ_LSB 0x1A
+#define QIB_7322_IBCCtrlB_0_HRTBT_REQ_MSB 0x1A
+#define QIB_7322_IBCCtrlB_0_HRTBT_REQ_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_HRTBT_PORT_LSB 0x12
+#define QIB_7322_IBCCtrlB_0_HRTBT_PORT_MSB 0x19
+#define QIB_7322_IBCCtrlB_0_HRTBT_PORT_RMASK 0xFF
+#define QIB_7322_IBCCtrlB_0_HRTBT_AUTO_LSB 0x11
+#define QIB_7322_IBCCtrlB_0_HRTBT_AUTO_MSB 0x11
+#define QIB_7322_IBCCtrlB_0_HRTBT_AUTO_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_HRTBT_ENB_LSB 0x10
+#define QIB_7322_IBCCtrlB_0_HRTBT_ENB_MSB 0x10
+#define QIB_7322_IBCCtrlB_0_HRTBT_ENB_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_SD_DDS_LSB 0xC
+#define QIB_7322_IBCCtrlB_0_SD_DDS_MSB 0xF
+#define QIB_7322_IBCCtrlB_0_SD_DDS_RMASK 0xF
+#define QIB_7322_IBCCtrlB_0_SD_DDSV_LSB 0xB
+#define QIB_7322_IBCCtrlB_0_SD_DDSV_MSB 0xB
+#define QIB_7322_IBCCtrlB_0_SD_DDSV_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_SD_ADD_ENB_LSB 0xA
+#define QIB_7322_IBCCtrlB_0_SD_ADD_ENB_MSB 0xA
+#define QIB_7322_IBCCtrlB_0_SD_ADD_ENB_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_SD_RX_EQUAL_ENABLE_LSB 0x9
+#define QIB_7322_IBCCtrlB_0_SD_RX_EQUAL_ENABLE_MSB 0x9
+#define QIB_7322_IBCCtrlB_0_SD_RX_EQUAL_ENABLE_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_IB_LANE_REV_SUPPORTED_LSB 0x8
+#define QIB_7322_IBCCtrlB_0_IB_LANE_REV_SUPPORTED_MSB 0x8
+#define QIB_7322_IBCCtrlB_0_IB_LANE_REV_SUPPORTED_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_IB_POLARITY_REV_SUPP_LSB 0x7
+#define QIB_7322_IBCCtrlB_0_IB_POLARITY_REV_SUPP_MSB 0x7
+#define QIB_7322_IBCCtrlB_0_IB_POLARITY_REV_SUPP_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_IB_NUM_CHANNELS_LSB 0x5
+#define QIB_7322_IBCCtrlB_0_IB_NUM_CHANNELS_MSB 0x6
+#define QIB_7322_IBCCtrlB_0_IB_NUM_CHANNELS_RMASK 0x3
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_QDR_LSB 0x4
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_QDR_MSB 0x4
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_QDR_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_DDR_LSB 0x3
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_DDR_MSB 0x3
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_DDR_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_SDR_LSB 0x2
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_SDR_MSB 0x2
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_SDR_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_LSB 0x1
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_MSB 0x1
+#define QIB_7322_IBCCtrlB_0_SD_SPEED_RMASK 0x1
+#define QIB_7322_IBCCtrlB_0_IB_ENHANCED_MODE_LSB 0x0
+#define QIB_7322_IBCCtrlB_0_IB_ENHANCED_MODE_MSB 0x0
+#define QIB_7322_IBCCtrlB_0_IB_ENHANCED_MODE_RMASK 0x1
+
+#define QIB_7322_IBCCtrlC_0_OFFS 0x1570
+#define QIB_7322_IBCCtrlC_0_DEF 0x0000000000000301
+#define QIB_7322_IBCCtrlC_0_IB_BACK_PORCH_LSB 0x5
+#define QIB_7322_IBCCtrlC_0_IB_BACK_PORCH_MSB 0x9
+#define QIB_7322_IBCCtrlC_0_IB_BACK_PORCH_RMASK 0x1F
+#define QIB_7322_IBCCtrlC_0_IB_FRONT_PORCH_LSB 0x0
+#define QIB_7322_IBCCtrlC_0_IB_FRONT_PORCH_MSB 0x4
+#define QIB_7322_IBCCtrlC_0_IB_FRONT_PORCH_RMASK 0x1F
+
+#define QIB_7322_HRTBT_GUID_0_OFFS 0x1588
+#define QIB_7322_HRTBT_GUID_0_DEF 0x0000000000000000
+
+#define QIB_7322_IB_SDTEST_IF_TX_0_OFFS 0x1590
+#define QIB_7322_IB_SDTEST_IF_TX_0_DEF 0x0000000000000000
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_RX_CFG_LSB 0x30
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_RX_CFG_MSB 0x3F
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_RX_CFG_RMASK 0xFFFF
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_TX_CFG_LSB 0x20
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_TX_CFG_MSB 0x2F
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_TX_CFG_RMASK 0xFFFF
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_SPEED_LSB 0xD
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_SPEED_MSB 0xF
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_SPEED_RMASK 0x7
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_OPCODE_LSB 0xB
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_OPCODE_MSB 0xC
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_TX_OPCODE_RMASK 0x3
+#define QIB_7322_IB_SDTEST_IF_TX_0_CREDIT_CHANGE_LSB 0x4
+#define QIB_7322_IB_SDTEST_IF_TX_0_CREDIT_CHANGE_MSB 0x4
+#define QIB_7322_IB_SDTEST_IF_TX_0_CREDIT_CHANGE_RMASK 0x1
+#define QIB_7322_IB_SDTEST_IF_TX_0_VL_CAP_LSB 0x2
+#define QIB_7322_IB_SDTEST_IF_TX_0_VL_CAP_MSB 0x3
+#define QIB_7322_IB_SDTEST_IF_TX_0_VL_CAP_RMASK 0x3
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_3_TX_VALID_LSB 0x1
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_3_TX_VALID_MSB 0x1
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_3_TX_VALID_RMASK 0x1
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_T_TX_VALID_LSB 0x0
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_T_TX_VALID_MSB 0x0
+#define QIB_7322_IB_SDTEST_IF_TX_0_TS_T_TX_VALID_RMASK 0x1
+
+#define QIB_7322_IB_SDTEST_IF_RX_0_OFFS 0x1598
+#define QIB_7322_IB_SDTEST_IF_RX_0_DEF 0x0000000000000000
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_RX_CFG_LSB 0x30
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_RX_CFG_MSB 0x3F
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_RX_CFG_RMASK 0xFFFF
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_TX_CFG_LSB 0x20
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_TX_CFG_MSB 0x2F
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_TX_CFG_RMASK 0xFFFF
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_B_LSB 0x18
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_B_MSB 0x1F
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_B_RMASK 0xFF
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_A_LSB 0x10
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_A_MSB 0x17
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_RX_A_RMASK 0xFF
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_3_RX_VALID_LSB 0x1
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_3_RX_VALID_MSB 0x1
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_3_RX_VALID_RMASK 0x1
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_T_RX_VALID_LSB 0x0
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_T_RX_VALID_MSB 0x0
+#define QIB_7322_IB_SDTEST_IF_RX_0_TS_T_RX_VALID_RMASK 0x1
+
+#define QIB_7322_IBNCModeCtrl_0_OFFS 0x15B8
+#define QIB_7322_IBNCModeCtrl_0_DEF 0x0000000000000000
+#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteForce_LSB 0x22
+#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteForce_MSB 0x22
+#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteForce_RMASK 0x1
+#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteMask_LSB 0x21
+#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteMask_MSB 0x21
+#define QIB_7322_IBNCModeCtrl_0_ScrambleCapRemoteMask_RMASK 0x1
+#define QIB_7322_IBNCModeCtrl_0_ScrambleCapLocal_LSB 0x20
+#define QIB_7322_IBNCModeCtrl_0_ScrambleCapLocal_MSB 0x20
+#define QIB_7322_IBNCModeCtrl_0_ScrambleCapLocal_RMASK 0x1
+#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS2_LSB 0x11
+#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS2_MSB 0x19
+#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS2_RMASK 0x1FF
+#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS1_LSB 0x8
+#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS1_MSB 0x10
+#define QIB_7322_IBNCModeCtrl_0_TSMCode_TS1_RMASK 0x1FF
+#define QIB_7322_IBNCModeCtrl_0_TSMEnable_ignore_TSM_on_rx_LSB 0x2
+#define QIB_7322_IBNCModeCtrl_0_TSMEnable_ignore_TSM_on_rx_MSB 0x2
+#define QIB_7322_IBNCModeCtrl_0_TSMEnable_ignore_TSM_on_rx_RMASK 0x1
+#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS2_LSB 0x1
+#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS2_MSB 0x1
+#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS2_RMASK 0x1
+#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS1_LSB 0x0
+#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS1_MSB 0x0
+#define QIB_7322_IBNCModeCtrl_0_TSMEnable_send_TS1_RMASK 0x1
+
+#define QIB_7322_IBSerdesStatus_0_OFFS 0x15D0
+#define QIB_7322_IBSerdesStatus_0_DEF 0x0000000000000000
+
+#define QIB_7322_IBPCSConfig_0_OFFS 0x15D8
+#define QIB_7322_IBPCSConfig_0_DEF 0x0000000000000007
+#define QIB_7322_IBPCSConfig_0_link_sync_mask_LSB 0x9
+#define QIB_7322_IBPCSConfig_0_link_sync_mask_MSB 0x12
+#define QIB_7322_IBPCSConfig_0_link_sync_mask_RMASK 0x3FF
+#define QIB_7322_IBPCSConfig_0_xcv_rreset_LSB 0x2
+#define QIB_7322_IBPCSConfig_0_xcv_rreset_MSB 0x2
+#define QIB_7322_IBPCSConfig_0_xcv_rreset_RMASK 0x1
+#define QIB_7322_IBPCSConfig_0_xcv_treset_LSB 0x1
+#define QIB_7322_IBPCSConfig_0_xcv_treset_MSB 0x1
+#define QIB_7322_IBPCSConfig_0_xcv_treset_RMASK 0x1
+#define QIB_7322_IBPCSConfig_0_tx_rx_reset_LSB 0x0
+#define QIB_7322_IBPCSConfig_0_tx_rx_reset_MSB 0x0
+#define QIB_7322_IBPCSConfig_0_tx_rx_reset_RMASK 0x1
+
+#define QIB_7322_IBSerdesCtrl_0_OFFS 0x15E0
+#define QIB_7322_IBSerdesCtrl_0_DEF 0x0000000000FFA00F
+#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_QDR_LSB 0x1A
+#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_QDR_MSB 0x1A
+#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_QDR_RMASK 0x1
+#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_DDR_LSB 0x19
+#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_DDR_MSB 0x19
+#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_DDR_RMASK 0x1
+#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_SDR_LSB 0x18
+#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_SDR_MSB 0x18
+#define QIB_7322_IBSerdesCtrl_0_DISABLE_RXLATOFF_SDR_RMASK 0x1
+#define QIB_7322_IBSerdesCtrl_0_CHANNEL_RESET_N_LSB 0x14
+#define QIB_7322_IBSerdesCtrl_0_CHANNEL_RESET_N_MSB 0x17
+#define QIB_7322_IBSerdesCtrl_0_CHANNEL_RESET_N_RMASK 0xF
+#define QIB_7322_IBSerdesCtrl_0_CGMODE_LSB 0x10
+#define QIB_7322_IBSerdesCtrl_0_CGMODE_MSB 0x13
+#define QIB_7322_IBSerdesCtrl_0_CGMODE_RMASK 0xF
+#define QIB_7322_IBSerdesCtrl_0_IB_LAT_MODE_LSB 0xF
+#define QIB_7322_IBSerdesCtrl_0_IB_LAT_MODE_MSB 0xF
+#define QIB_7322_IBSerdesCtrl_0_IB_LAT_MODE_RMASK 0x1
+#define QIB_7322_IBSerdesCtrl_0_RXLOSEN_LSB 0xD
+#define QIB_7322_IBSerdesCtrl_0_RXLOSEN_MSB 0xD
+#define QIB_7322_IBSerdesCtrl_0_RXLOSEN_RMASK 0x1
+#define QIB_7322_IBSerdesCtrl_0_LPEN_LSB 0xC
+#define QIB_7322_IBSerdesCtrl_0_LPEN_MSB 0xC
+#define QIB_7322_IBSerdesCtrl_0_LPEN_RMASK 0x1
+#define QIB_7322_IBSerdesCtrl_0_PLLPD_LSB 0xB
+#define QIB_7322_IBSerdesCtrl_0_PLLPD_MSB 0xB
+#define QIB_7322_IBSerdesCtrl_0_PLLPD_RMASK 0x1
+#define QIB_7322_IBSerdesCtrl_0_TXPD_LSB 0xA
+#define QIB_7322_IBSerdesCtrl_0_TXPD_MSB 0xA
+#define QIB_7322_IBSerdesCtrl_0_TXPD_RMASK 0x1
+#define QIB_7322_IBSerdesCtrl_0_RXPD_LSB 0x9
+#define QIB_7322_IBSerdesCtrl_0_RXPD_MSB 0x9
+#define QIB_7322_IBSerdesCtrl_0_RXPD_RMASK 0x1
+#define QIB_7322_IBSerdesCtrl_0_TXIDLE_LSB 0x8
+#define QIB_7322_IBSerdesCtrl_0_TXIDLE_MSB 0x8
+#define QIB_7322_IBSerdesCtrl_0_TXIDLE_RMASK 0x1
+#define QIB_7322_IBSerdesCtrl_0_CMODE_LSB 0x0
+#define QIB_7322_IBSerdesCtrl_0_CMODE_MSB 0x6
+#define QIB_7322_IBSerdesCtrl_0_CMODE_RMASK 0x7F
+
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_OFFS 0x1600
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_DEF 0x0000000000000000
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_tx_override_deemphasis_select_LSB 0x1F
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_tx_override_deemphasis_select_MSB 0x1F
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_tx_override_deemphasis_select_RMASK 0x1
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_reset_tx_deemphasis_override_LSB 0x1E
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_reset_tx_deemphasis_override_MSB 0x1E
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_reset_tx_deemphasis_override_RMASK 0x1
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txampcntl_d2a_LSB 0xE
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txampcntl_d2a_MSB 0x11
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txampcntl_d2a_RMASK 0xF
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txc0_ena_LSB 0x9
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txc0_ena_MSB 0xD
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txc0_ena_RMASK 0x1F
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcp1_ena_LSB 0x5
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcp1_ena_MSB 0x8
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcp1_ena_RMASK 0xF
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_xtra_emph0_LSB 0x3
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_xtra_emph0_MSB 0x4
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_xtra_emph0_RMASK 0x3
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_ena_LSB 0x0
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_ena_MSB 0x2
+#define QIB_7322_IBSD_TX_DEEMPHASIS_OVERRIDE_0_txcn1_ena_RMASK 0x7
+
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_OFFS 0x1640
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_DEF 0x0000000000000000
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch3_LSB 0x27
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch3_MSB 0x27
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch3_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch2_LSB 0x26
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch2_MSB 0x26
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch2_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch1_LSB 0x25
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch1_MSB 0x25
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch1_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch0_LSB 0x24
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch0_MSB 0x24
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenagain_sdr_ch0_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch3_LSB 0x23
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch3_MSB 0x23
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch3_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch2_LSB 0x22
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch2_MSB 0x22
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch2_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch1_LSB 0x21
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch1_MSB 0x21
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch1_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch0_LSB 0x20
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch0_MSB 0x20
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenale_sdr_ch0_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch3_LSB 0x18
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch3_MSB 0x1F
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch3_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch2_LSB 0x10
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch2_MSB 0x17
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch2_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch1_LSB 0x8
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch1_MSB 0xF
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch1_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch0_LSB 0x0
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch0_MSB 0x7
+#define QIB_7322_ADAPT_DISABLE_STATIC_SDR_0_static_disable_rxenadfe_sdr_ch0_RMASK 0xFF
+
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_OFFS 0x1648
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_DEF 0x0000000000000000
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch3_LSB 0x27
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch3_MSB 0x27
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch3_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch2_LSB 0x26
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch2_MSB 0x26
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch2_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch1_LSB 0x25
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch1_MSB 0x25
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch1_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch0_LSB 0x24
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch0_MSB 0x24
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenagain_sdr_ch0_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch3_LSB 0x23
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch3_MSB 0x23
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch3_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch2_LSB 0x22
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch2_MSB 0x22
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch2_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch1_LSB 0x21
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch1_MSB 0x21
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch1_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch0_LSB 0x20
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch0_MSB 0x20
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenale_sdr_ch0_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch3_LSB 0x18
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch3_MSB 0x1F
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch3_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch2_LSB 0x10
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch2_MSB 0x17
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch2_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch1_LSB 0x8
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch1_MSB 0xF
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch1_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch0_LSB 0x0
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch0_MSB 0x7
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_SDR_0_dyn_disable_rxenadfe_sdr_ch0_RMASK 0xFF
+
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_OFFS 0x1650
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_DEF 0x0000000000000000
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch3_LSB 0x27
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch3_MSB 0x27
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch3_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch2_LSB 0x26
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch2_MSB 0x26
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch2_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch1_LSB 0x25
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch1_MSB 0x25
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch1_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch0_LSB 0x24
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch0_MSB 0x24
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenagain_ddr_ch0_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch3_LSB 0x23
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch3_MSB 0x23
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch3_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch2_LSB 0x22
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch2_MSB 0x22
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch2_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch1_LSB 0x21
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch1_MSB 0x21
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch1_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch0_LSB 0x20
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch0_MSB 0x20
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenale_ddr_ch0_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch3_LSB 0x18
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch3_MSB 0x1F
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch3_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch2_LSB 0x10
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch2_MSB 0x17
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch2_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch1_LSB 0x8
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch1_MSB 0xF
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch1_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch0_LSB 0x0
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch0_MSB 0x7
+#define QIB_7322_ADAPT_DISABLE_STATIC_DDR_0_static_disable_rxenadfe_ddr_ch0_RMASK 0xFF
+
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_OFFS 0x1658
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_DEF 0x0000000000000000
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch3_LSB 0x27
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch3_MSB 0x27
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch3_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch2_LSB 0x26
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch2_MSB 0x26
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch2_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch1_LSB 0x25
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch1_MSB 0x25
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch1_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch0_LSB 0x24
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch0_MSB 0x24
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenagain_ddr_ch0_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch3_LSB 0x23
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch3_MSB 0x23
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch3_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch2_LSB 0x22
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch2_MSB 0x22
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch2_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch1_LSB 0x21
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch1_MSB 0x21
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch1_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch0_LSB 0x20
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch0_MSB 0x20
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenale_ddr_ch0_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch3_LSB 0x18
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch3_MSB 0x1F
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch3_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch2_LSB 0x10
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch2_MSB 0x17
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch2_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch1_LSB 0x8
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch1_MSB 0xF
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch1_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch0_LSB 0x0
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch0_MSB 0x7
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_DDR_0_dyn_disable_rxenadfe_ddr_ch0_RMASK 0xFF
+
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_OFFS 0x1660
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_DEF 0x0000000000000000
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch3_LSB 0x27
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch3_MSB 0x27
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch3_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch2_LSB 0x26
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch2_MSB 0x26
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch2_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch1_LSB 0x25
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch1_MSB 0x25
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch1_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch0_LSB 0x24
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch0_MSB 0x24
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenagain_qdr_ch0_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch3_LSB 0x23
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch3_MSB 0x23
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch3_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch2_LSB 0x22
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch2_MSB 0x22
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch2_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch1_LSB 0x21
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch1_MSB 0x21
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch1_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch0_LSB 0x20
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch0_MSB 0x20
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenale_qdr_ch0_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch3_LSB 0x18
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch3_MSB 0x1F
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch3_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch2_LSB 0x10
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch2_MSB 0x17
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch2_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch1_LSB 0x8
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch1_MSB 0xF
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch1_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch0_LSB 0x0
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch0_MSB 0x7
+#define QIB_7322_ADAPT_DISABLE_STATIC_QDR_0_static_disable_rxenadfe_qdr_ch0_RMASK 0xFF
+
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_OFFS 0x1668
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_DEF 0x0000000000000000
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch3_LSB 0x27
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch3_MSB 0x27
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch3_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch2_LSB 0x26
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch2_MSB 0x26
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch2_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch1_LSB 0x25
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch1_MSB 0x25
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch1_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch0_LSB 0x24
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch0_MSB 0x24
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenagain_qdr_ch0_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch3_LSB 0x23
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch3_MSB 0x23
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch3_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch2_LSB 0x22
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch2_MSB 0x22
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch2_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch1_LSB 0x21
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch1_MSB 0x21
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch1_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch0_LSB 0x20
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch0_MSB 0x20
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenale_qdr_ch0_RMASK 0x1
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch3_LSB 0x18
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch3_MSB 0x1F
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch3_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch2_LSB 0x10
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch2_MSB 0x17
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch2_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch1_LSB 0x8
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch1_MSB 0xF
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch1_RMASK 0xFF
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch0_LSB 0x0
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch0_MSB 0x7
+#define QIB_7322_ADAPT_DISABLE_DYNAMIC_QDR_0_dyn_disable_rxenadfe_qdr_ch0_RMASK 0xFF
+
+#define QIB_7322_ADAPT_DISABLE_TIMER_THRESHOLD_0_OFFS 0x1670
+#define QIB_7322_ADAPT_DISABLE_TIMER_THRESHOLD_0_DEF 0x0000000000000000
+
+#define QIB_7322_HighPriorityLimit_0_OFFS 0x1BC0
+#define QIB_7322_HighPriorityLimit_0_DEF 0x0000000000000000
+#define QIB_7322_HighPriorityLimit_0_Limit_LSB 0x0
+#define QIB_7322_HighPriorityLimit_0_Limit_MSB 0x7
+#define QIB_7322_HighPriorityLimit_0_Limit_RMASK 0xFF
+
+#define QIB_7322_LowPriority0_0_OFFS 0x1C00
+#define QIB_7322_LowPriority0_0_DEF 0x0000000000000000
+#define QIB_7322_LowPriority0_0_VirtualLane_LSB 0x10
+#define QIB_7322_LowPriority0_0_VirtualLane_MSB 0x12
+#define QIB_7322_LowPriority0_0_VirtualLane_RMASK 0x7
+#define QIB_7322_LowPriority0_0_Weight_LSB 0x0
+#define QIB_7322_LowPriority0_0_Weight_MSB 0x7
+#define QIB_7322_LowPriority0_0_Weight_RMASK 0xFF
+
+#define QIB_7322_HighPriority0_0_OFFS 0x1E00
+#define QIB_7322_HighPriority0_0_DEF 0x0000000000000000
+#define QIB_7322_HighPriority0_0_VirtualLane_LSB 0x10
+#define QIB_7322_HighPriority0_0_VirtualLane_MSB 0x12
+#define QIB_7322_HighPriority0_0_VirtualLane_RMASK 0x7
+#define QIB_7322_HighPriority0_0_Weight_LSB 0x0
+#define QIB_7322_HighPriority0_0_Weight_MSB 0x7
+#define QIB_7322_HighPriority0_0_Weight_RMASK 0xFF
+
+#define QIB_7322_CntrRegBase_1_OFFS 0x2028
+#define QIB_7322_CntrRegBase_1_DEF 0x0000000000013000
+
+#define QIB_7322_RcvQPMulticastContext_1_OFFS 0x2170
+
+#define QIB_7322_SendCtrl_1_OFFS 0x21C0
+
+#define QIB_7322_SendBufAvail0_OFFS 0x3000
+#define QIB_7322_SendBufAvail0_DEF 0x0000000000000000
+#define QIB_7322_SendBufAvail0_SendBuf_31_0_LSB 0x0
+#define QIB_7322_SendBufAvail0_SendBuf_31_0_MSB 0x3F
+#define QIB_7322_SendBufAvail0_SendBuf_31_0_RMASK 0x0
+
+#define QIB_7322_MsixTable_OFFS 0x8000
+#define QIB_7322_MsixTable_DEF 0x0000000000000000
+
+#define QIB_7322_MsixPba_OFFS 0x9000
+#define QIB_7322_MsixPba_DEF 0x0000000000000000
+
+#define QIB_7322_LAMemory_OFFS 0xA000
+#define QIB_7322_LAMemory_DEF 0x0000000000000000
+
+#define QIB_7322_LBIntCnt_OFFS 0x11000
+#define QIB_7322_LBIntCnt_DEF 0x0000000000000000
+
+#define QIB_7322_LBFlowStallCnt_OFFS 0x11008
+#define QIB_7322_LBFlowStallCnt_DEF 0x0000000000000000
+
+#define QIB_7322_RxTIDFullErrCnt_OFFS 0x110D0
+#define QIB_7322_RxTIDFullErrCnt_DEF 0x0000000000000000
+
+#define QIB_7322_RxTIDValidErrCnt_OFFS 0x110D8
+#define QIB_7322_RxTIDValidErrCnt_DEF 0x0000000000000000
+
+#define QIB_7322_RxP0HdrEgrOvflCnt_OFFS 0x110E8
+#define QIB_7322_RxP0HdrEgrOvflCnt_DEF 0x0000000000000000
+
+#define QIB_7322_PcieRetryBufDiagQwordCnt_OFFS 0x111A0
+#define QIB_7322_PcieRetryBufDiagQwordCnt_DEF 0x0000000000000000
+
+#define QIB_7322_RxTidFlowDropCnt_OFFS 0x111E0
+#define QIB_7322_RxTidFlowDropCnt_DEF 0x0000000000000000
+
+#define QIB_7322_LBIntCnt_0_OFFS 0x12000
+#define QIB_7322_LBIntCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_TxCreditUpToDateTimeOut_0_OFFS 0x12008
+#define QIB_7322_TxCreditUpToDateTimeOut_0_DEF 0x0000000000000000
+
+#define QIB_7322_TxSDmaDescCnt_0_OFFS 0x12010
+#define QIB_7322_TxSDmaDescCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_TxUnsupVLErrCnt_0_OFFS 0x12018
+#define QIB_7322_TxUnsupVLErrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_TxDataPktCnt_0_OFFS 0x12020
+#define QIB_7322_TxDataPktCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_TxFlowPktCnt_0_OFFS 0x12028
+#define QIB_7322_TxFlowPktCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_TxDwordCnt_0_OFFS 0x12030
+#define QIB_7322_TxDwordCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_TxLenErrCnt_0_OFFS 0x12038
+#define QIB_7322_TxLenErrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_TxMaxMinLenErrCnt_0_OFFS 0x12040
+#define QIB_7322_TxMaxMinLenErrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_TxUnderrunCnt_0_OFFS 0x12048
+#define QIB_7322_TxUnderrunCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_TxFlowStallCnt_0_OFFS 0x12050
+#define QIB_7322_TxFlowStallCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_TxDroppedPktCnt_0_OFFS 0x12058
+#define QIB_7322_TxDroppedPktCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxDroppedPktCnt_0_OFFS 0x12060
+#define QIB_7322_RxDroppedPktCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxDataPktCnt_0_OFFS 0x12068
+#define QIB_7322_RxDataPktCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxFlowPktCnt_0_OFFS 0x12070
+#define QIB_7322_RxFlowPktCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxDwordCnt_0_OFFS 0x12078
+#define QIB_7322_RxDwordCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxLenErrCnt_0_OFFS 0x12080
+#define QIB_7322_RxLenErrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxMaxMinLenErrCnt_0_OFFS 0x12088
+#define QIB_7322_RxMaxMinLenErrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxICRCErrCnt_0_OFFS 0x12090
+#define QIB_7322_RxICRCErrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxVCRCErrCnt_0_OFFS 0x12098
+#define QIB_7322_RxVCRCErrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxFlowCtrlViolCnt_0_OFFS 0x120A0
+#define QIB_7322_RxFlowCtrlViolCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxVersionErrCnt_0_OFFS 0x120A8
+#define QIB_7322_RxVersionErrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxLinkMalformCnt_0_OFFS 0x120B0
+#define QIB_7322_RxLinkMalformCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxEBPCnt_0_OFFS 0x120B8
+#define QIB_7322_RxEBPCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxLPCRCErrCnt_0_OFFS 0x120C0
+#define QIB_7322_RxLPCRCErrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxBufOvflCnt_0_OFFS 0x120C8
+#define QIB_7322_RxBufOvflCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxLenTruncateCnt_0_OFFS 0x120D0
+#define QIB_7322_RxLenTruncateCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxPKeyMismatchCnt_0_OFFS 0x120E0
+#define QIB_7322_RxPKeyMismatchCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_IBLinkDownedCnt_0_OFFS 0x12180
+#define QIB_7322_IBLinkDownedCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_IBSymbolErrCnt_0_OFFS 0x12188
+#define QIB_7322_IBSymbolErrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_IBStatusChangeCnt_0_OFFS 0x12190
+#define QIB_7322_IBStatusChangeCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_IBLinkErrRecoveryCnt_0_OFFS 0x12198
+#define QIB_7322_IBLinkErrRecoveryCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_ExcessBufferOvflCnt_0_OFFS 0x121A8
+#define QIB_7322_ExcessBufferOvflCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_LocalLinkIntegrityErrCnt_0_OFFS 0x121B0
+#define QIB_7322_LocalLinkIntegrityErrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxVlErrCnt_0_OFFS 0x121B8
+#define QIB_7322_RxVlErrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxDlidFltrCnt_0_OFFS 0x121C0
+#define QIB_7322_RxDlidFltrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxVL15DroppedPktCnt_0_OFFS 0x121C8
+#define QIB_7322_RxVL15DroppedPktCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxOtherLocalPhyErrCnt_0_OFFS 0x121D0
+#define QIB_7322_RxOtherLocalPhyErrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_RxQPInvalidContextCnt_0_OFFS 0x121D8
+#define QIB_7322_RxQPInvalidContextCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_TxHeadersErrCnt_0_OFFS 0x121F8
+#define QIB_7322_TxHeadersErrCnt_0_DEF 0x0000000000000000
+
+#define QIB_7322_PSRcvDataCount_0_OFFS 0x12218
+#define QIB_7322_PSRcvDataCount_0_DEF 0x0000000000000000
+
+#define QIB_7322_PSRcvPktsCount_0_OFFS 0x12220
+#define QIB_7322_PSRcvPktsCount_0_DEF 0x0000000000000000
+
+#define QIB_7322_PSXmitDataCount_0_OFFS 0x12228
+#define QIB_7322_PSXmitDataCount_0_DEF 0x0000000000000000
+
+#define QIB_7322_PSXmitPktsCount_0_OFFS 0x12230
+#define QIB_7322_PSXmitPktsCount_0_DEF 0x0000000000000000
+
+#define QIB_7322_PSXmitWaitCount_0_OFFS 0x12238
+#define QIB_7322_PSXmitWaitCount_0_DEF 0x0000000000000000
+
+#define QIB_7322_LBIntCnt_1_OFFS 0x13000
+#define QIB_7322_LBIntCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_TxCreditUpToDateTimeOut_1_OFFS 0x13008
+#define QIB_7322_TxCreditUpToDateTimeOut_1_DEF 0x0000000000000000
+
+#define QIB_7322_TxSDmaDescCnt_1_OFFS 0x13010
+#define QIB_7322_TxSDmaDescCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_TxUnsupVLErrCnt_1_OFFS 0x13018
+#define QIB_7322_TxUnsupVLErrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_TxDataPktCnt_1_OFFS 0x13020
+#define QIB_7322_TxDataPktCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_TxFlowPktCnt_1_OFFS 0x13028
+#define QIB_7322_TxFlowPktCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_TxDwordCnt_1_OFFS 0x13030
+#define QIB_7322_TxDwordCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_TxLenErrCnt_1_OFFS 0x13038
+#define QIB_7322_TxLenErrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_TxMaxMinLenErrCnt_1_OFFS 0x13040
+#define QIB_7322_TxMaxMinLenErrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_TxUnderrunCnt_1_OFFS 0x13048
+#define QIB_7322_TxUnderrunCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_TxFlowStallCnt_1_OFFS 0x13050
+#define QIB_7322_TxFlowStallCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_TxDroppedPktCnt_1_OFFS 0x13058
+#define QIB_7322_TxDroppedPktCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxDroppedPktCnt_1_OFFS 0x13060
+#define QIB_7322_RxDroppedPktCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxDataPktCnt_1_OFFS 0x13068
+#define QIB_7322_RxDataPktCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxFlowPktCnt_1_OFFS 0x13070
+#define QIB_7322_RxFlowPktCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxDwordCnt_1_OFFS 0x13078
+#define QIB_7322_RxDwordCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxLenErrCnt_1_OFFS 0x13080
+#define QIB_7322_RxLenErrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxMaxMinLenErrCnt_1_OFFS 0x13088
+#define QIB_7322_RxMaxMinLenErrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxICRCErrCnt_1_OFFS 0x13090
+#define QIB_7322_RxICRCErrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxVCRCErrCnt_1_OFFS 0x13098
+#define QIB_7322_RxVCRCErrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxFlowCtrlViolCnt_1_OFFS 0x130A0
+#define QIB_7322_RxFlowCtrlViolCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxVersionErrCnt_1_OFFS 0x130A8
+#define QIB_7322_RxVersionErrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxLinkMalformCnt_1_OFFS 0x130B0
+#define QIB_7322_RxLinkMalformCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxEBPCnt_1_OFFS 0x130B8
+#define QIB_7322_RxEBPCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxLPCRCErrCnt_1_OFFS 0x130C0
+#define QIB_7322_RxLPCRCErrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxBufOvflCnt_1_OFFS 0x130C8
+#define QIB_7322_RxBufOvflCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxLenTruncateCnt_1_OFFS 0x130D0
+#define QIB_7322_RxLenTruncateCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxPKeyMismatchCnt_1_OFFS 0x130E0
+#define QIB_7322_RxPKeyMismatchCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_IBLinkDownedCnt_1_OFFS 0x13180
+#define QIB_7322_IBLinkDownedCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_IBSymbolErrCnt_1_OFFS 0x13188
+#define QIB_7322_IBSymbolErrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_IBStatusChangeCnt_1_OFFS 0x13190
+#define QIB_7322_IBStatusChangeCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_IBLinkErrRecoveryCnt_1_OFFS 0x13198
+#define QIB_7322_IBLinkErrRecoveryCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_ExcessBufferOvflCnt_1_OFFS 0x131A8
+#define QIB_7322_ExcessBufferOvflCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_LocalLinkIntegrityErrCnt_1_OFFS 0x131B0
+#define QIB_7322_LocalLinkIntegrityErrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxVlErrCnt_1_OFFS 0x131B8
+#define QIB_7322_RxVlErrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxDlidFltrCnt_1_OFFS 0x131C0
+#define QIB_7322_RxDlidFltrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxVL15DroppedPktCnt_1_OFFS 0x131C8
+#define QIB_7322_RxVL15DroppedPktCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxOtherLocalPhyErrCnt_1_OFFS 0x131D0
+#define QIB_7322_RxOtherLocalPhyErrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_RxQPInvalidContextCnt_1_OFFS 0x131D8
+#define QIB_7322_RxQPInvalidContextCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_TxHeadersErrCnt_1_OFFS 0x131F8
+#define QIB_7322_TxHeadersErrCnt_1_DEF 0x0000000000000000
+
+#define QIB_7322_PSRcvDataCount_1_OFFS 0x13218
+#define QIB_7322_PSRcvDataCount_1_DEF 0x0000000000000000
+
+#define QIB_7322_PSRcvPktsCount_1_OFFS 0x13220
+#define QIB_7322_PSRcvPktsCount_1_DEF 0x0000000000000000
+
+#define QIB_7322_PSXmitDataCount_1_OFFS 0x13228
+#define QIB_7322_PSXmitDataCount_1_DEF 0x0000000000000000
+
+#define QIB_7322_PSXmitPktsCount_1_OFFS 0x13230
+#define QIB_7322_PSXmitPktsCount_1_DEF 0x0000000000000000
+
+#define QIB_7322_PSXmitWaitCount_1_OFFS 0x13238
+#define QIB_7322_PSXmitWaitCount_1_DEF 0x0000000000000000
+
+#define QIB_7322_RcvEgrArray_OFFS 0x14000
+#define QIB_7322_RcvEgrArray_DEF 0x0000000000000000
+#define QIB_7322_RcvEgrArray_RT_BufSize_LSB 0x25
+#define QIB_7322_RcvEgrArray_RT_BufSize_MSB 0x27
+#define QIB_7322_RcvEgrArray_RT_BufSize_RMASK 0x7
+#define QIB_7322_RcvEgrArray_RT_Addr_LSB 0x0
+#define QIB_7322_RcvEgrArray_RT_Addr_MSB 0x24
+#define QIB_7322_RcvEgrArray_RT_Addr_RMASK 0x1FFFFFFFFF
+
+#define QIB_7322_RcvTIDArray0_OFFS 0x50000
+#define QIB_7322_RcvTIDArray0_DEF 0x0000000000000000
+#define QIB_7322_RcvTIDArray0_RT_BufSize_LSB 0x25
+#define QIB_7322_RcvTIDArray0_RT_BufSize_MSB 0x27
+#define QIB_7322_RcvTIDArray0_RT_BufSize_RMASK 0x7
+#define QIB_7322_RcvTIDArray0_RT_Addr_LSB 0x0
+#define QIB_7322_RcvTIDArray0_RT_Addr_MSB 0x24
+#define QIB_7322_RcvTIDArray0_RT_Addr_RMASK 0x1FFFFFFFFF
+
+#define QIB_7322_IBSD_DDS_MAP_TABLE_0_OFFS 0xD0000
+#define QIB_7322_IBSD_DDS_MAP_TABLE_0_DEF 0x0000000000000000
+
+#define QIB_7322_RcvHdrTail0_OFFS 0x200000
+#define QIB_7322_RcvHdrTail0_DEF 0x0000000000000000
+
+#define QIB_7322_RcvHdrHead0_OFFS 0x200008
+#define QIB_7322_RcvHdrHead0_DEF 0x0000000000000000
+#define QIB_7322_RcvHdrHead0_counter_LSB 0x20
+#define QIB_7322_RcvHdrHead0_counter_MSB 0x2F
+#define QIB_7322_RcvHdrHead0_counter_RMASK 0xFFFF
+#define QIB_7322_RcvHdrHead0_RcvHeadPointer_LSB 0x0
+#define QIB_7322_RcvHdrHead0_RcvHeadPointer_MSB 0x1F
+#define QIB_7322_RcvHdrHead0_RcvHeadPointer_RMASK 0xFFFFFFFF
+
+#define QIB_7322_RcvEgrIndexTail0_OFFS 0x200010
+#define QIB_7322_RcvEgrIndexTail0_DEF 0x0000000000000000
+
+#define QIB_7322_RcvEgrIndexHead0_OFFS 0x200018
+#define QIB_7322_RcvEgrIndexHead0_DEF 0x0000000000000000
+
+#define QIB_7322_RcvTIDFlowTable0_OFFS 0x201000
+#define QIB_7322_RcvTIDFlowTable0_DEF 0x0000000000000000
+#define QIB_7322_RcvTIDFlowTable0_GenMismatch_LSB 0x1C
+#define QIB_7322_RcvTIDFlowTable0_GenMismatch_MSB 0x1C
+#define QIB_7322_RcvTIDFlowTable0_GenMismatch_RMASK 0x1
+#define QIB_7322_RcvTIDFlowTable0_SeqMismatch_LSB 0x1B
+#define QIB_7322_RcvTIDFlowTable0_SeqMismatch_MSB 0x1B
+#define QIB_7322_RcvTIDFlowTable0_SeqMismatch_RMASK 0x1
+#define QIB_7322_RcvTIDFlowTable0_KeepOnGenErr_LSB 0x16
+#define QIB_7322_RcvTIDFlowTable0_KeepOnGenErr_MSB 0x16
+#define QIB_7322_RcvTIDFlowTable0_KeepOnGenErr_RMASK 0x1
+#define QIB_7322_RcvTIDFlowTable0_KeepAfterSeqErr_LSB 0x15
+#define QIB_7322_RcvTIDFlowTable0_KeepAfterSeqErr_MSB 0x15
+#define QIB_7322_RcvTIDFlowTable0_KeepAfterSeqErr_RMASK 0x1
+#define QIB_7322_RcvTIDFlowTable0_HdrSuppEnabled_LSB 0x14
+#define QIB_7322_RcvTIDFlowTable0_HdrSuppEnabled_MSB 0x14
+#define QIB_7322_RcvTIDFlowTable0_HdrSuppEnabled_RMASK 0x1
+#define QIB_7322_RcvTIDFlowTable0_FlowValid_LSB 0x13
+#define QIB_7322_RcvTIDFlowTable0_FlowValid_MSB 0x13
+#define QIB_7322_RcvTIDFlowTable0_FlowValid_RMASK 0x1
+#define QIB_7322_RcvTIDFlowTable0_GenVal_LSB 0xB
+#define QIB_7322_RcvTIDFlowTable0_GenVal_MSB 0x12
+#define QIB_7322_RcvTIDFlowTable0_GenVal_RMASK 0xFF
+#define QIB_7322_RcvTIDFlowTable0_SeqNum_LSB 0x0
+#define QIB_7322_RcvTIDFlowTable0_SeqNum_MSB 0xA
+#define QIB_7322_RcvTIDFlowTable0_SeqNum_RMASK 0x7FF
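Every register in the block above follows one naming convention: _OFFS is the register's byte offset from the mapped chip base, _LSB/_MSB bound a field's bit span, and _RMASK is the field's mask once the value has been shifted down to bit 0. The sketch below shows how a field can be extracted with those macros; the readq-style accessor and the kregbase pointer are illustrative assumptions, not part of this patch.

#include <stdint.h>

/* Illustrative accessor: read the 64-bit register at byte offset 'offs'
 * from the mapped register base (assumed already ioremapped). */
static inline uint64_t qib_read_kreg(const volatile uint64_t *kregbase,
				     uint64_t offs)
{
	return kregbase[offs / sizeof(uint64_t)];
}

/* Extract one field using the _OFFS/_LSB/_RMASK convention. */
static inline uint64_t qib_read_field(const volatile uint64_t *kregbase,
				      uint64_t offs, unsigned int lsb,
				      uint64_t rmask)
{
	return (qib_read_kreg(kregbase, offs) >> lsb) & rmask;
}

/* Example: the receive header head pointer of context 0. */
#define QIB_EXAMPLE_RCVHDRHEAD0(base)					\
	qib_read_field((base), QIB_7322_RcvHdrHead0_OFFS,		\
		       QIB_7322_RcvHdrHead0_RcvHeadPointer_LSB,		\
		       QIB_7322_RcvHdrHead0_RcvHeadPointer_RMASK)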
diff --git a/drivers/infiniband/hw/qib/qib_common.h b/drivers/infiniband/hw/qib/qib_common.h
new file mode 100644
index 000000000..4fb78abd8
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_common.h
@@ -0,0 +1,812 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation.
+ * All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _QIB_COMMON_H
+#define _QIB_COMMON_H
+
+/*
+ * This file contains defines, structures, etc. that are used
+ * to communicate between kernel and user code.
+ */
+
+/* This is the IEEE-assigned OUI for QLogic Inc. QLogic_IB */
+#define QIB_SRC_OUI_1 0x00
+#define QIB_SRC_OUI_2 0x11
+#define QIB_SRC_OUI_3 0x75
+
+/* version of protocol header (known to chip also). In the long run,
+ * we should be able to generate and accept a range of version numbers;
+ * for now we only accept one, and it's compiled in.
+ */
+#define IPS_PROTO_VERSION 2
+
+/*
+ * These are compile time constants that you may want to enable or disable
+ * if you are trying to debug problems with code or performance.
+ * QIB_VERBOSE_TRACING define as 1 if you want additional tracing in
+ * fastpath code
+ * QIB_TRACE_REGWRITES define as 1 if you want register writes to be
+ * traced in fastpath code
+ * _QIB_TRACING define as 0 if you want to remove all tracing in a
+ * compilation unit
+ */
+
+/*
+ * The value in the BTH QP field that QLogic_IB uses to differentiate
+ * a qlogic_ib protocol IB packet from standard IB transport.
+ * It needs to be even (0x656b78), because the LSB is sometimes
+ * used for the MSB of context. The change may cause a problem
+ * interoperating with older software.
+ */
+#define QIB_KD_QP 0x656b78
+
+/*
+ * These are the status bits readable (in ascii form, 64bit value)
+ * from the "status" sysfs file. For binary compatibility, values
+ * must remain as is; removed states can be reused for different
+ * purposes.
+ */
+#define QIB_STATUS_INITTED 0x1 /* basic initialization done */
+/* Chip has been found and initted */
+#define QIB_STATUS_CHIP_PRESENT 0x20
+/* IB link is at ACTIVE, usable for data traffic */
+#define QIB_STATUS_IB_READY 0x40
+/* link is configured, LID, MTU, etc. have been set */
+#define QIB_STATUS_IB_CONF 0x80
+/* A Fatal hardware error has occurred. */
+#define QIB_STATUS_HWERROR 0x200
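Since the status word is exported through sysfs as an ASCII 64-bit value, a user-space tool can parse it and test the bits above. A minimal sketch; the sysfs path and the numeric base of the printed value are assumptions for illustration only.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Parse the ASCII status word and report whether the IB link is both
 * configured and ready.  Base 0 accepts decimal or 0x-prefixed hex. */
static int qib_link_ready(const char *status_path)
{
	char buf[64];
	FILE *f = fopen(status_path, "r");
	uint64_t status;

	if (!f)
		return -1;
	if (!fgets(buf, sizeof(buf), f)) {
		fclose(f);
		return -1;
	}
	fclose(f);

	status = strtoull(buf, NULL, 0);
	return (status & QIB_STATUS_IB_CONF) &&
	       (status & QIB_STATUS_IB_READY);
}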
+
+/*
+ * The list of usermode accessible registers. Also see Reg_* later in file.
+ */
+enum qib_ureg {
+ /* (RO) DMA RcvHdr to be used next. */
+ ur_rcvhdrtail = 0,
+ /* (RW) RcvHdr entry to be processed next by host. */
+ ur_rcvhdrhead = 1,
+ /* (RO) Index of the next Eager buffer to use. */
+ ur_rcvegrindextail = 2,
+ /* (RW) Eager TID to be processed next */
+ ur_rcvegrindexhead = 3,
+ /* For internal use only; max register number. */
+ _QIB_UregMax
+};
+
+/* bit values for spi_runtime_flags */
+#define QIB_RUNTIME_PCIE 0x0002
+#define QIB_RUNTIME_FORCE_WC_ORDER 0x0004
+#define QIB_RUNTIME_RCVHDR_COPY 0x0008
+#define QIB_RUNTIME_MASTER 0x0010
+#define QIB_RUNTIME_RCHK 0x0020
+#define QIB_RUNTIME_NODMA_RTAIL 0x0080
+#define QIB_RUNTIME_SPECIAL_TRIGGER 0x0100
+#define QIB_RUNTIME_SDMA 0x0200
+#define QIB_RUNTIME_FORCE_PIOAVAIL 0x0400
+#define QIB_RUNTIME_PIO_REGSWAPPED 0x0800
+#define QIB_RUNTIME_CTXT_MSB_IN_QP 0x1000
+#define QIB_RUNTIME_CTXT_REDIRECT 0x2000
+#define QIB_RUNTIME_HDRSUPP 0x4000
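These flags arrive in the spi_runtime_flags word of struct qib_base_info (defined just below) and are plain OR'd bit values, so feature tests are simple mask checks. A small sketch; the helper names are illustrative only.

/* Bit tests on the runtime-flags word the driver returns. */
static int qib_uses_dma_rtail(const struct qib_base_info *bi)
{
	return !(bi->spi_runtime_flags & QIB_RUNTIME_NODMA_RTAIL);
}

static int qib_has_send_dma(const struct qib_base_info *bi)
{
	return !!(bi->spi_runtime_flags & QIB_RUNTIME_SDMA);
}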
+
+/*
+ * This structure is returned by qib_userinit() immediately after
+ * open to get implementation-specific info, and info specific to this
+ * instance.
+ *
+ * This struct must have explicit pad fields where type sizes
+ * may result in different alignments between 32 and 64 bit
+ * programs, since the 64 bit kernel requires the user code
+ * to have matching offsets.
+ */
+struct qib_base_info {
+ /* version of hardware, for feature checking. */
+ __u32 spi_hw_version;
+ /* version of software, for feature checking. */
+ __u32 spi_sw_version;
+ /* QLogic_IB context assigned, goes into sent packets */
+ __u16 spi_ctxt;
+ __u16 spi_subctxt;
+ /*
+ * IB MTU; a packet's IB data must be less than this.
+ * The MTU is in bytes, and will be a multiple of 4 bytes.
+ */
+ __u32 spi_mtu;
+ /*
+ * Size of a PIO buffer. Any given packet's total size must be less
+ * than this (in words). Included is the starting control word, so
+ * if 513 is returned, then total pkt size is 512 words or less.
+ */
+ __u32 spi_piosize;
+ /* size of the TID cache in qlogic_ib, in entries */
+ __u32 spi_tidcnt;
+ /* size of the TID Eager list in qlogic_ib, in entries */
+ __u32 spi_tidegrcnt;
+ /* size of a single receive header queue entry in words. */
+ __u32 spi_rcvhdrent_size;
+ /*
+ * Count of receive header queue entries allocated.
+ * This may be less than the spu_rcvhdrcnt passed in.
+ */
+ __u32 spi_rcvhdr_cnt;
+
+ /* per-chip and other runtime features bitmap (QIB_RUNTIME_*) */
+ __u32 spi_runtime_flags;
+
+ /* address where hardware receive header queue is mapped */
+ __u64 spi_rcvhdr_base;
+
+ /* user program. */
+
+ /* base address of eager TID receive buffers used by hardware. */
+ __u64 spi_rcv_egrbufs;
+
+ /* Allocated by initialization code, not by protocol. */
+
+ /*
+ * Size of each TID buffer in host memory, starting at
+ * spi_rcv_egrbufs. The buffers are virtually contiguous.
+ */
+ __u32 spi_rcv_egrbufsize;
+ /*
+ * The special QP (queue pair) value that identifies a qlogic_ib
+ * protocol packet from standard IB packets. More, probably much
+ * more, to be added.
+ */
+ __u32 spi_qpair;
+
+ /*
+ * User register base for init code, not to be used directly by
+ * protocol or applications. Always points to chip registers,
+ * for normal or shared context.
+ */
+ __u64 spi_uregbase;
+ /*
+ * Maximum buffer size in bytes that can be used in a single TID
+ * entry (assuming the buffer is aligned to this boundary). This is
+ * the minimum of what the hardware and software support Guaranteed
+ * to be a power of 2.
+ */
+ __u32 spi_tid_maxsize;
+ /*
+ * alignment of each pio send buffer (byte count
+ * to add to spi_piobufbase to get to second buffer)
+ */
+ __u32 spi_pioalign;
+ /*
+ * The index of the first pio buffer available to this process;
+ * needed to do lookup in spi_pioavailaddr; not added to
+ * spi_piobufbase.
+ */
+ __u32 spi_pioindex;
+ /* number of buffers mapped for this process */
+ __u32 spi_piocnt;
+
+ /*
+ * Base address of writeonly pio buffers for this process.
+ * Each buffer has spi_piosize words, and is aligned on spi_pioalign
+ * boundaries. spi_piocnt buffers are mapped from this address
+ */
+ __u64 spi_piobufbase;
+
+ /*
+ * Base address of readonly memory copy of the pioavail registers.
+ * There are 2 bits for each buffer.
+ */
+ __u64 spi_pioavailaddr;
+
+ /*
+ * Address where driver updates a copy of the interface and driver
+ * status (QIB_STATUS_*) as a 64 bit value. It's followed by a
+ * link status qword (formerly combined with driver status), then a
+ * string indicating hardware error, if there was one.
+ */
+ __u64 spi_status;
+
+ /* number of chip ctxts available to user processes */
+ __u32 spi_nctxts;
+ __u16 spi_unit; /* unit number of chip we are using */
+ __u16 spi_port; /* IB port number we are using */
+ /* num bufs in each contiguous set */
+ __u32 spi_rcv_egrperchunk;
+ /* size in bytes of each contiguous set */
+ __u32 spi_rcv_egrchunksize;
+ /* total size of mmap to cover full rcvegrbuffers */
+ __u32 spi_rcv_egrbuftotlen;
+ __u32 spi_rhf_offset; /* dword offset in hdrqent for rcvhdr flags */
+ /* address of readonly memory copy of the rcvhdrq tail register. */
+ __u64 spi_rcvhdr_tailaddr;
+
+ /*
+ * shared memory pages for subctxts if ctxt is shared; these cover
+ * all the processes in the group sharing a single context.
+ * All have enough space for the num_subcontexts value on this job.
+ */
+ __u64 spi_subctxt_uregbase;
+ __u64 spi_subctxt_rcvegrbuf;
+ __u64 spi_subctxt_rcvhdr_base;
+
+ /* shared memory page for send buffer disarm status */
+ __u64 spi_sendbuf_status;
+} __aligned(8);
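Taken together, spi_piobufbase, spi_pioalign and spi_piocnt describe a dense array of send buffers, so the address of the i-th mapped PIO buffer is just the base plus i times the alignment. A minimal sketch of that arithmetic, with a bounds check; it illustrates the layout the comments describe and is not driver code from this patch.

#include <stddef.h>
#include <stdint.h>

/* Return a pointer to PIO buffer 'i' of this process, or NULL if the
 * index is outside the spi_piocnt buffers that were mapped.  Buffers
 * start at spi_piobufbase and are spi_pioalign bytes apart. */
static volatile uint32_t *qib_pio_buf(const struct qib_base_info *bi,
				      unsigned int i)
{
	if (i >= bi->spi_piocnt)
		return NULL;
	return (volatile uint32_t *)(uintptr_t)
		(bi->spi_piobufbase + (uint64_t)i * bi->spi_pioalign);
}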
+
+/*
+ * This version number is given to the driver by the user code during
+ * initialization in the spu_userversion field of qib_user_info, so
+ * the driver can check for compatibility with user code.
+ *
+ * The major version changes when data structures
+ * change in an incompatible way. The driver must be the same or higher
+ * for initialization to succeed. In some cases, a higher version
+ * driver will not interoperate with older software, and initialization
+ * will return an error.
+ */
+#define QIB_USER_SWMAJOR 1
+
+/*
+ * Minor version differences are always compatible
+ * within a major version; however, if the user software is newer
+ * than the driver software, some new features and/or structure fields
+ * may not be implemented; the user code must deal with this if it
+ * cares, or it must abort after initialization reports the difference.
+ */
+#define QIB_USER_SWMINOR 13
+
+#define QIB_USER_SWVERSION ((QIB_USER_SWMAJOR << 16) | QIB_USER_SWMINOR)
+
+#ifndef QIB_KERN_TYPE
+#define QIB_KERN_TYPE 0
+#endif
+
+/*
+ * Similarly, this is the kernel version going back to the user. It's
+ * slightly different, in that we want to tell if the driver was built as
+ * part of a QLogic release, or from driver sources at openfabrics.org,
+ * kernel.org, or a standard distribution, for support reasons.
+ * The high bit is 0 for non-QLogic and 1 for QLogic-built/supplied.
+ *
+ * It's returned by the driver to the user code during initialization in the
+ * spi_sw_version field of qib_base_info, so the user code can in turn
+ * check for compatibility with the kernel.
+*/
+#define QIB_KERN_SWVERSION ((QIB_KERN_TYPE << 31) | QIB_USER_SWVERSION)
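Both version words pack a major number in the upper 16 bits and a minor number in the lower 16, with the kernel additionally carrying QIB_KERN_TYPE in bit 31. The compatibility rule described above (majors must match; a minor mismatch is tolerable) can therefore be checked with a few shifts. A sketch, with helper names that are illustrative only.

#include <stdint.h>

#define QIB_SWVERSION_MAJOR(v)	(((v) >> 16) & 0x7fff)	/* bit 31 is the build-type flag */
#define QIB_SWVERSION_MINOR(v)	((v) & 0xffff)
#define QIB_SWVERSION_QLOGIC(v)	(((v) >> 31) & 0x1)

/* Check the spi_sw_version value the driver returned.
 * Returns <0 if the major versions differ (incompatible), 0 if the
 * driver's minor is older than the user's (some features may be
 * missing), and >0 if the driver is at least as new as the user. */
static int qib_check_swversion(uint32_t kern_swversion)
{
	if (QIB_SWVERSION_MAJOR(kern_swversion) != QIB_USER_SWMAJOR)
		return -1;
	if (QIB_SWVERSION_MINOR(kern_swversion) < QIB_USER_SWMINOR)
		return 0;
	return 1;
}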
+
+/*
+ * Define the driver version number. This is something that refers only
+ * to the driver itself, not the software interfaces it supports.
+ */
+#define QIB_DRIVER_VERSION_BASE "1.11"
+
+/* create the final driver version string */
+#ifdef QIB_IDSTR
+#define QIB_DRIVER_VERSION QIB_DRIVER_VERSION_BASE " " QIB_IDSTR
+#else
+#define QIB_DRIVER_VERSION QIB_DRIVER_VERSION_BASE
+#endif
+
+/*
+ * If the unit is specified via open, HCA choice is fixed. If port is
+ * specified, it's also fixed. Otherwise we try to spread contexts
+ * across ports and HCAs, using different algorithms. WITHIN is
+ * the old default, prior to this mechanism.
+ */
+#define QIB_PORT_ALG_ACROSS 0 /* round robin contexts across HCAs, then
+ * ports; this is the default */
+#define QIB_PORT_ALG_WITHIN 1 /* use all contexts on an HCA (round robin
+ * active ports within), then next HCA */
+#define QIB_PORT_ALG_COUNT 2 /* number of algorithm choices */
+
+/*
+ * This structure is passed to qib_userinit() to tell the driver where
+ * user code buffers are, sizes, etc. The offsets and sizes of the
+ * fields must remain unchanged, for binary compatibility. It can
+ * be extended if userversion is changed, so user code can tell, if needed.
+ */
+struct qib_user_info {
+ /*
+ * version of user software, to detect compatibility issues.
+ * Should be set to QIB_USER_SWVERSION.
+ */
+ __u32 spu_userversion;
+
+ __u32 _spu_unused2;
+
+ /* size of struct base_info to write to */
+ __u32 spu_base_info_size;
+
+ __u32 spu_port_alg; /* which QIB_PORT_ALG_*; unused for user minor < 11 */
+
+ /*
+ * If two or more processes wish to share a context, each process
+ * must set the spu_subctxt_cnt and spu_subctxt_id to the same
+ * values. The only restriction on the spu_subctxt_id is that
+ * it be unique for a given node.
+ */
+ __u16 spu_subctxt_cnt;
+ __u16 spu_subctxt_id;
+
+ __u32 spu_port; /* IB port requested by user if > 0 */
+
+ /*
+ * address of struct base_info to write to
+ */
+ __u64 spu_base_info;
+
+} __aligned(8);
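A group of processes that wants to share one context fills this structure once per process, using identical spu_subctxt_cnt/spu_subctxt_id values across the group, before handing it to the driver through the QIB_CMD_ASSIGN_CTXT/QIB_CMD_USER_INIT commands defined just below. A hedged sketch; the helper name and the way the base-info buffer is supplied are illustrative assumptions.

#include <stdint.h>
#include <string.h>

/* Prepare a qib_user_info for 'cnt' cooperating processes sharing one
 * context.  'base' receives the driver's qib_base_info reply; 'job_id'
 * must be identical across the group and unique on this node. */
static void qib_fill_user_info(struct qib_user_info *ui,
			       struct qib_base_info *base,
			       __u16 cnt, __u16 job_id, __u32 port)
{
	memset(ui, 0, sizeof(*ui));
	ui->spu_userversion = QIB_USER_SWVERSION;
	ui->spu_base_info_size = sizeof(*base);
	ui->spu_port_alg = QIB_PORT_ALG_ACROSS;
	ui->spu_subctxt_cnt = cnt;
	ui->spu_subctxt_id = job_id;
	ui->spu_port = port;		/* 0 lets the driver choose */
	ui->spu_base_info = (__u64)(uintptr_t)base;
}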
+
+/* User commands. */
+
+/* 16 available, was: old set up userspace (for old user code) */
+#define QIB_CMD_CTXT_INFO 17 /* find out what resources we got */
+#define QIB_CMD_RECV_CTRL 18 /* control receipt of packets */
+#define QIB_CMD_TID_UPDATE 19 /* update expected TID entries */
+#define QIB_CMD_TID_FREE 20 /* free expected TID entries */
+#define QIB_CMD_SET_PART_KEY 21 /* add partition key */
+/* 22 available, was: return info on slave processes (for old user code) */
+#define QIB_CMD_ASSIGN_CTXT 23 /* allocate HCA and ctxt */
+#define QIB_CMD_USER_INIT 24 /* set up userspace */
+#define QIB_CMD_UNUSED_1 25
+#define QIB_CMD_UNUSED_2 26
+#define QIB_CMD_PIOAVAILUPD 27 /* force an update of PIOAvail reg */
+#define QIB_CMD_POLL_TYPE 28 /* set the kind of polling we want */
+#define QIB_CMD_ARMLAUNCH_CTRL 29 /* armlaunch detection control */
+/* 30 is unused */
+#define QIB_CMD_SDMA_INFLIGHT 31 /* sdma inflight counter request */
+#define QIB_CMD_SDMA_COMPLETE 32 /* sdma completion counter request */
+/* 33 available, was a testing feature */
+#define QIB_CMD_DISARM_BUFS 34 /* disarm send buffers w/ errors */
+#define QIB_CMD_ACK_EVENT 35 /* ack & clear bits */
+#define QIB_CMD_CPUS_LIST 36 /* list of cpus allocated, for pinned
+ * processes: qib_cpus_list */
+
+/*
+ * QIB_CMD_ACK_EVENT obsoletes QIB_CMD_DISARM_BUFS, but we keep it for
+ * compatibility with libraries from previous releases. The ACK_EVENT
+ * will take appropriate driver action (if any, just DISARM for now),
+ * then clear the bits passed in as part of the mask. These bits are
+ * in the first 64bit word at spi_sendbuf_status, and are passed to
+ * the driver in the event_mask union as well.
+ */
+#define _QIB_EVENT_DISARM_BUFS_BIT 0
+#define _QIB_EVENT_LINKDOWN_BIT 1
+#define _QIB_EVENT_LID_CHANGE_BIT 2
+#define _QIB_EVENT_LMC_CHANGE_BIT 3
+#define _QIB_EVENT_SL2VL_CHANGE_BIT 4
+#define _QIB_MAX_EVENT_BIT _QIB_EVENT_SL2VL_CHANGE_BIT
+
+#define QIB_EVENT_DISARM_BUFS_BIT (1UL << _QIB_EVENT_DISARM_BUFS_BIT)
+#define QIB_EVENT_LINKDOWN_BIT (1UL << _QIB_EVENT_LINKDOWN_BIT)
+#define QIB_EVENT_LID_CHANGE_BIT (1UL << _QIB_EVENT_LID_CHANGE_BIT)
+#define QIB_EVENT_LMC_CHANGE_BIT (1UL << _QIB_EVENT_LMC_CHANGE_BIT)
+#define QIB_EVENT_SL2VL_CHANGE_BIT (1UL << _QIB_EVENT_SL2VL_CHANGE_BIT)
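Because the event bits live both in the first qword at spi_sendbuf_status and in the event_mask handed to QIB_CMD_ACK_EVENT, acknowledging events amounts to reading the status qword, picking the bits that were handled, and passing the same bits back. A sketch that only prepares the command (struct qib_cmd is defined further down in this file); how the command is delivered to the driver is not spelled out in this header.

/* Build a QIB_CMD_ACK_EVENT request for whatever event bits are set in
 * the first qword of the spi_sendbuf_status page.  Delivery of the
 * command to the driver is left to the caller. */
static void qib_prepare_event_ack(struct qib_cmd *cmd,
				  const volatile __u64 *sendbuf_status)
{
	__u64 pending = *sendbuf_status &
		(QIB_EVENT_DISARM_BUFS_BIT | QIB_EVENT_LINKDOWN_BIT |
		 QIB_EVENT_LID_CHANGE_BIT | QIB_EVENT_LMC_CHANGE_BIT |
		 QIB_EVENT_SL2VL_CHANGE_BIT);

	cmd->type = QIB_CMD_ACK_EVENT;
	cmd->cmd.event_mask = pending;
}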
+
+
+/*
+ * Poll types
+ */
+#define QIB_POLL_TYPE_ANYRCV 0x0
+#define QIB_POLL_TYPE_URGENT 0x1
+
+struct qib_ctxt_info {
+ __u16 num_active; /* number of active units */
+ __u16 unit; /* unit (chip) assigned to caller */
+ __u16 port; /* IB port assigned to caller (1-based) */
+ __u16 ctxt; /* ctxt on unit assigned to caller */
+ __u16 subctxt; /* subctxt on unit assigned to caller */
+ __u16 num_ctxts; /* number of ctxts available on unit */
+ __u16 num_subctxts; /* number of subctxts opened on ctxt */
+ __u16 rec_cpu; /* cpu # for affinity (ffff if none) */
+};
+
+struct qib_tid_info {
+ __u32 tidcnt;
+ /* make structure same size in 32 and 64 bit */
+ __u32 tid__unused;
+ /* virtual address of first page in transfer */
+ __u64 tidvaddr;
+ /* pointer (same size 32/64 bit) to __u16 tid array */
+ __u64 tidlist;
+
+ /*
+ * pointer (same size 32/64 bit) to bitmap of TIDs used
+ * for this call; checked for being large enough at open
+ */
+ __u64 tidmap;
+};
+
+struct qib_cmd {
+ __u32 type; /* command type */
+ union {
+ struct qib_tid_info tid_info;
+ struct qib_user_info user_info;
+
+ /*
+ * address in userspace where we should put the sdma
+ * inflight counter
+ */
+ __u64 sdma_inflight;
+ /*
+ * address in userspace where we should put the sdma
+ * completion counter
+ */
+ __u64 sdma_complete;
+ /* address in userspace of struct qib_ctxt_info to
+ write result to */
+ __u64 ctxt_info;
+ /* enable/disable receipt of packets */
+ __u32 recv_ctrl;
+ /* enable/disable armlaunch errors (non-zero to enable) */
+ __u32 armlaunch_ctrl;
+ /* partition key to set */
+ __u16 part_key;
+ /* user address of __u32 bitmask of active slaves */
+ __u64 slave_mask_addr;
+ /* type of polling we want */
+ __u16 poll_type;
+ /* back pressure enable bit for one particular context */
+ __u8 ctxt_bp;
+ /* qib_user_event_ack(), IPATH_EVENT_* bits */
+ __u64 event_mask;
+ } cmd;
+};
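The command structure is a tag plus a union: set type to one of the QIB_CMD_* values and fill the matching union member. For example, asking the driver to describe the context it assigned (QIB_CMD_CTXT_INFO) just points ctxt_info at a user buffer. The delivery mechanism itself is not part of this header, so the sketch below stops at filling the structure; the helper name is illustrative.

#include <stdint.h>

/* Request the context description: the driver writes a struct
 * qib_ctxt_info to the user address carried in the command. */
static void qib_prepare_ctxt_info(struct qib_cmd *cmd,
				  struct qib_ctxt_info *info)
{
	cmd->type = QIB_CMD_CTXT_INFO;
	cmd->cmd.ctxt_info = (__u64)(uintptr_t)info;
}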
+
+struct qib_iovec {
+ /* Pointer to data, but same size 32 and 64 bit */
+ __u64 iov_base;
+
+ /*
+ * Length of data; don't need 64 bits, but want
+ * qib_sendpkt to remain same size as before 32 bit changes, so...
+ */
+ __u64 iov_len;
+};
+
+/*
+ * Describes a single packet for send. Each packet can have one or more
+ * buffers, but the total length (exclusive of IB headers) must be less
+ * than the MTU, and if using the PIO method, entire packet length,
+ * including IB headers, must be less than the qib_piosize value (words).
+ * Use of this necessitates including sys/uio.h
+ */
+struct __qib_sendpkt {
+ __u32 sps_flags; /* flags for packet (TBD) */
+ __u32 sps_cnt; /* number of entries to use in sps_iov */
+ /* array of iov's describing packet. TEMPORARY */
+ struct qib_iovec sps_iov[4];
+};
+
+/*
+ * Diagnostics can send a packet by "writing" the following
+ * struct to the diag data special file.  This allows a custom
+ * pbc (+ static rate) qword, so that special modes and deliberate
+ * changes to CRCs can be used. The elements were also re-ordered
+ * for better alignment and to avoid padding issues.
+ */
+#define _DIAG_XPKT_VERS 3
+struct qib_diag_xpkt {
+ __u16 version;
+ __u16 unit;
+ __u16 port;
+ __u16 len;
+ __u64 data;
+ __u64 pbc_wd;
+};
+
+/*
+ * Data layout in I2C flash (for GUID, etc.)
+ * All fields are little-endian binary unless otherwise stated
+ */
+#define QIB_FLASH_VERSION 2
+struct qib_flash {
+ /* flash layout version (QIB_FLASH_VERSION) */
+ __u8 if_fversion;
+ /* checksum protecting if_length bytes */
+ __u8 if_csum;
+ /*
+ * valid length (in use, protected by if_csum), including
+ * if_fversion and if_csum themselves
+ */
+ __u8 if_length;
+ /* the GUID, in network order */
+ __u8 if_guid[8];
+ /* number of GUIDs to use, starting from if_guid */
+ __u8 if_numguid;
+ /* the (last 10 characters of) board serial number, in ASCII */
+ char if_serial[12];
+ /* board mfg date (YYYYMMDD ASCII) */
+ char if_mfgdate[8];
+ /* last board rework/test date (YYYYMMDD ASCII) */
+ char if_testdate[8];
+ /* logging of error counts, TBD */
+ __u8 if_errcntp[4];
+ /* powered on hours, updated at driver unload */
+ __u8 if_powerhour[2];
+ /* ASCII free-form comment field */
+ char if_comment[32];
+ /* Backwards compatible prefix for longer QLogic Serial Numbers */
+ char if_sprefix[4];
+ /* 82 bytes used, min flash size is 128 bytes */
+ __u8 if_future[46];
+};
+
+/*
+ * These are the counters implemented in the chip, and are listed in order.
+ * The InterCaps naming is taken straight from the chip spec.
+ */
+struct qlogic_ib_counters {
+ __u64 LBIntCnt;
+ __u64 LBFlowStallCnt;
+ __u64 TxSDmaDescCnt; /* was Reserved1 */
+ __u64 TxUnsupVLErrCnt;
+ __u64 TxDataPktCnt;
+ __u64 TxFlowPktCnt;
+ __u64 TxDwordCnt;
+ __u64 TxLenErrCnt;
+ __u64 TxMaxMinLenErrCnt;
+ __u64 TxUnderrunCnt;
+ __u64 TxFlowStallCnt;
+ __u64 TxDroppedPktCnt;
+ __u64 RxDroppedPktCnt;
+ __u64 RxDataPktCnt;
+ __u64 RxFlowPktCnt;
+ __u64 RxDwordCnt;
+ __u64 RxLenErrCnt;
+ __u64 RxMaxMinLenErrCnt;
+ __u64 RxICRCErrCnt;
+ __u64 RxVCRCErrCnt;
+ __u64 RxFlowCtrlErrCnt;
+ __u64 RxBadFormatCnt;
+ __u64 RxLinkProblemCnt;
+ __u64 RxEBPCnt;
+ __u64 RxLPCRCErrCnt;
+ __u64 RxBufOvflCnt;
+ __u64 RxTIDFullErrCnt;
+ __u64 RxTIDValidErrCnt;
+ __u64 RxPKeyMismatchCnt;
+ __u64 RxP0HdrEgrOvflCnt;
+ __u64 RxP1HdrEgrOvflCnt;
+ __u64 RxP2HdrEgrOvflCnt;
+ __u64 RxP3HdrEgrOvflCnt;
+ __u64 RxP4HdrEgrOvflCnt;
+ __u64 RxP5HdrEgrOvflCnt;
+ __u64 RxP6HdrEgrOvflCnt;
+ __u64 RxP7HdrEgrOvflCnt;
+ __u64 RxP8HdrEgrOvflCnt;
+ __u64 RxP9HdrEgrOvflCnt;
+ __u64 RxP10HdrEgrOvflCnt;
+ __u64 RxP11HdrEgrOvflCnt;
+ __u64 RxP12HdrEgrOvflCnt;
+ __u64 RxP13HdrEgrOvflCnt;
+ __u64 RxP14HdrEgrOvflCnt;
+ __u64 RxP15HdrEgrOvflCnt;
+ __u64 RxP16HdrEgrOvflCnt;
+ __u64 IBStatusChangeCnt;
+ __u64 IBLinkErrRecoveryCnt;
+ __u64 IBLinkDownedCnt;
+ __u64 IBSymbolErrCnt;
+ __u64 RxVL15DroppedPktCnt;
+ __u64 RxOtherLocalPhyErrCnt;
+ __u64 PcieRetryBufDiagQwordCnt;
+ __u64 ExcessBufferOvflCnt;
+ __u64 LocalLinkIntegrityErrCnt;
+ __u64 RxVlErrCnt;
+ __u64 RxDlidFltrCnt;
+};
+
+/*
+ * The next set of defines is for packet headers, and chip register
+ * and memory bits that are visible to and/or used by user-mode software.
+ */
+
+/* RcvHdrFlags bits */
+#define QLOGIC_IB_RHF_LENGTH_MASK 0x7FF
+#define QLOGIC_IB_RHF_LENGTH_SHIFT 0
+#define QLOGIC_IB_RHF_RCVTYPE_MASK 0x7
+#define QLOGIC_IB_RHF_RCVTYPE_SHIFT 11
+#define QLOGIC_IB_RHF_EGRINDEX_MASK 0xFFF
+#define QLOGIC_IB_RHF_EGRINDEX_SHIFT 16
+#define QLOGIC_IB_RHF_SEQ_MASK 0xF
+#define QLOGIC_IB_RHF_SEQ_SHIFT 0
+#define QLOGIC_IB_RHF_HDRQ_OFFSET_MASK 0x7FF
+#define QLOGIC_IB_RHF_HDRQ_OFFSET_SHIFT 4
+#define QLOGIC_IB_RHF_H_ICRCERR 0x80000000
+#define QLOGIC_IB_RHF_H_VCRCERR 0x40000000
+#define QLOGIC_IB_RHF_H_PARITYERR 0x20000000
+#define QLOGIC_IB_RHF_H_LENERR 0x10000000
+#define QLOGIC_IB_RHF_H_MTUERR 0x08000000
+#define QLOGIC_IB_RHF_H_IHDRERR 0x04000000
+#define QLOGIC_IB_RHF_H_TIDERR 0x02000000
+#define QLOGIC_IB_RHF_H_MKERR 0x01000000
+#define QLOGIC_IB_RHF_H_IBERR 0x00800000
+#define QLOGIC_IB_RHF_H_ERR_MASK 0xFF800000
+#define QLOGIC_IB_RHF_L_USE_EGR 0x80000000
+#define QLOGIC_IB_RHF_L_SWA 0x00008000
+#define QLOGIC_IB_RHF_L_SWB 0x00004000
+
+/* qlogic_ib header fields */
+#define QLOGIC_IB_I_VERS_MASK 0xF
+#define QLOGIC_IB_I_VERS_SHIFT 28
+#define QLOGIC_IB_I_CTXT_MASK 0xF
+#define QLOGIC_IB_I_CTXT_SHIFT 24
+#define QLOGIC_IB_I_TID_MASK 0x7FF
+#define QLOGIC_IB_I_TID_SHIFT 13
+#define QLOGIC_IB_I_OFFSET_MASK 0x1FFF
+#define QLOGIC_IB_I_OFFSET_SHIFT 0
+
+/* K_PktFlags bits */
+#define QLOGIC_IB_KPF_INTR 0x1
+#define QLOGIC_IB_KPF_SUBCTXT_MASK 0x3
+#define QLOGIC_IB_KPF_SUBCTXT_SHIFT 1
+
+#define QLOGIC_IB_MAX_SUBCTXT 4
+
+/* SendPIO per-buffer control */
+#define QLOGIC_IB_SP_TEST 0x40
+#define QLOGIC_IB_SP_TESTEBP 0x20
+#define QLOGIC_IB_SP_TRIGGER_SHIFT 15
+
+/* SendPIOAvail bits */
+#define QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT 1
+#define QLOGIC_IB_SENDPIOAVAIL_CHECK_SHIFT 0
+
+/* qlogic_ib header format */
+struct qib_header {
+ /*
+ * Version - 4 bits, Context - 4 bits, TID - 10 bits and Offset -
+ * 14 bits before ECO change ~28 Dec 03. After that, Vers 4,
+ * Context 4, TID 11, offset 13.
+ */
+ __le32 ver_ctxt_tid_offset;
+ __le16 chksum;
+ __le16 pkt_flags;
+};
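+
+/*
+ * Illustrative sketch (not part of this header): unpacking the fields
+ * of ver_ctxt_tid_offset for a received header hdr, using the
+ * QLOGIC_IB_I_* masks and shifts defined above, after conversion
+ * from little endian.
+ *
+ *	__u32 w    = __le32_to_cpu(hdr->ver_ctxt_tid_offset);
+ *	__u32 vers = (w >> QLOGIC_IB_I_VERS_SHIFT) & QLOGIC_IB_I_VERS_MASK;
+ *	__u32 ctxt = (w >> QLOGIC_IB_I_CTXT_SHIFT) & QLOGIC_IB_I_CTXT_MASK;
+ *	__u32 tid  = (w >> QLOGIC_IB_I_TID_SHIFT) & QLOGIC_IB_I_TID_MASK;
+ *	__u32 offs = (w >> QLOGIC_IB_I_OFFSET_SHIFT) & QLOGIC_IB_I_OFFSET_MASK;
+ */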
+
+/*
+ * qlogic_ib user message header format.
+ * This structure contains the first 4 fields common to all protocols
+ * that employ qlogic_ib.
+ */
+struct qib_message_header {
+ __be16 lrh[4];
+ __be32 bth[3];
+ /* fields below this point are in host byte order */
+ struct qib_header iph;
+ /* fields below are simplified, but should match PSM */
+ /* some are accessed by the driver when packet splitting is needed */
+ __u8 sub_opcode;
+ __u8 flags;
+ __u16 commidx;
+ __u32 ack_seq_num;
+ __u8 flowid;
+ __u8 hdr_dlen;
+ __u16 mqhdr;
+ __u32 uwords[4];
+};
+
+/* sequence number bits for message */
+union qib_seqnum {
+ struct {
+ __u32 seq:11;
+ __u32 gen:8;
+ __u32 flow:5;
+ };
+ struct {
+ __u32 pkt:16;
+ __u32 msg:8;
+ };
+ __u32 val;
+};
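+
+/*
+ * Illustrative sketch (not part of this header): the union gives
+ * overlapping views of the same 24 bits of sequence state.  Bit-field
+ * layout is compiler-dependent, so driver and user library must agree.
+ *
+ *	union qib_seqnum sn = { .val = 0 };
+ *	sn.seq = 5;  sn.gen = 2;  sn.flow = 1;	   per-flow view
+ *	use sn.pkt / sn.msg for the packet-count view,
+ *	or sn.val to move all the bits at once.
+ */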
+
+/* qib receiving-dma tid-session-member */
+struct qib_tid_session_member {
+ __u16 tid;
+ __u16 offset;
+ __u16 length;
+};
+
+/* IB - LRH header consts */
+#define QIB_LRH_GRH 0x0003 /* 1. word of IB LRH - next header: GRH */
+#define QIB_LRH_BTH 0x0002 /* 1. word of IB LRH - next header: BTH */
+
+/* misc. */
+#define SIZE_OF_CRC 1
+
+#define QIB_DEFAULT_P_KEY 0xFFFF
+#define QIB_PERMISSIVE_LID 0xFFFF
+#define QIB_AETH_CREDIT_SHIFT 24
+#define QIB_AETH_CREDIT_MASK 0x1F
+#define QIB_AETH_CREDIT_INVAL 0x1F
+#define QIB_PSN_MASK 0xFFFFFF
+#define QIB_MSN_MASK 0xFFFFFF
+#define QIB_QPN_MASK 0xFFFFFF
+#define QIB_MULTICAST_LID_BASE 0xC000
+#define QIB_EAGER_TID_ID QLOGIC_IB_I_TID_MASK
+#define QIB_MULTICAST_QPN 0xFFFFFF
+
+/* Receive Header Queue: receive type (from qlogic_ib) */
+#define RCVHQ_RCV_TYPE_EXPECTED 0
+#define RCVHQ_RCV_TYPE_EAGER 1
+#define RCVHQ_RCV_TYPE_NON_KD 2
+#define RCVHQ_RCV_TYPE_ERROR 3
+
+#define QIB_HEADER_QUEUE_WORDS 9
+
+/* Functions for extracting fields from rcvhdrq entries for the driver. */
+static inline __u32 qib_hdrget_err_flags(const __le32 *rbuf)
+{
+ return __le32_to_cpu(rbuf[1]) & QLOGIC_IB_RHF_H_ERR_MASK;
+}
+
+static inline __u32 qib_hdrget_rcv_type(const __le32 *rbuf)
+{
+ return (__le32_to_cpu(rbuf[0]) >> QLOGIC_IB_RHF_RCVTYPE_SHIFT) &
+ QLOGIC_IB_RHF_RCVTYPE_MASK;
+}
+
+static inline __u32 qib_hdrget_length_in_bytes(const __le32 *rbuf)
+{
+ return ((__le32_to_cpu(rbuf[0]) >> QLOGIC_IB_RHF_LENGTH_SHIFT) &
+ QLOGIC_IB_RHF_LENGTH_MASK) << 2;
+}
+
+static inline __u32 qib_hdrget_index(const __le32 *rbuf)
+{
+ return (__le32_to_cpu(rbuf[0]) >> QLOGIC_IB_RHF_EGRINDEX_SHIFT) &
+ QLOGIC_IB_RHF_EGRINDEX_MASK;
+}
+
+static inline __u32 qib_hdrget_seq(const __le32 *rbuf)
+{
+ return (__le32_to_cpu(rbuf[1]) >> QLOGIC_IB_RHF_SEQ_SHIFT) &
+ QLOGIC_IB_RHF_SEQ_MASK;
+}
+
+static inline __u32 qib_hdrget_offset(const __le32 *rbuf)
+{
+ return (__le32_to_cpu(rbuf[1]) >> QLOGIC_IB_RHF_HDRQ_OFFSET_SHIFT) &
+ QLOGIC_IB_RHF_HDRQ_OFFSET_MASK;
+}
+
+static inline __u32 qib_hdrget_use_egr_buf(const __le32 *rbuf)
+{
+ return __le32_to_cpu(rbuf[0]) & QLOGIC_IB_RHF_L_USE_EGR;
+}
+
+static inline __u32 qib_hdrget_qib_ver(__le32 hdrword)
+{
+ return (__le32_to_cpu(hdrword) >> QLOGIC_IB_I_VERS_SHIFT) &
+ QLOGIC_IB_I_VERS_MASK;
+}
+
+#endif /* _QIB_COMMON_H */
diff --git a/drivers/infiniband/hw/qib/qib_cq.c b/drivers/infiniband/hw/qib/qib_cq.c
new file mode 100644
index 000000000..ab4e11cfa
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_cq.c
@@ -0,0 +1,540 @@
+/*
+ * Copyright (c) 2013 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006, 2007, 2008, 2010 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/kthread.h>
+
+#include "qib_verbs.h"
+#include "qib.h"
+
+/**
+ * qib_cq_enter - add a new entry to the completion queue
+ * @cq: completion queue
+ * @entry: work completion entry to add
+ * @solicited: true if @entry is a solicited entry
+ *
+ * This may be called with qp->s_lock held.
+ */
+void qib_cq_enter(struct qib_cq *cq, struct ib_wc *entry, int solicited)
+{
+ struct qib_cq_wc *wc;
+ unsigned long flags;
+ u32 head;
+ u32 next;
+
+ spin_lock_irqsave(&cq->lock, flags);
+
+ /*
+ * Note that the head pointer might be writable by user processes.
+ * Take care to verify it is a sane value.
+ */
+ wc = cq->queue;
+ head = wc->head;
+ if (head >= (unsigned) cq->ibcq.cqe) {
+ head = cq->ibcq.cqe;
+ next = 0;
+ } else
+ next = head + 1;
+ if (unlikely(next == wc->tail)) {
+ spin_unlock_irqrestore(&cq->lock, flags);
+ if (cq->ibcq.event_handler) {
+ struct ib_event ev;
+
+ ev.device = cq->ibcq.device;
+ ev.element.cq = &cq->ibcq;
+ ev.event = IB_EVENT_CQ_ERR;
+ cq->ibcq.event_handler(&ev, cq->ibcq.cq_context);
+ }
+ return;
+ }
+ if (cq->ip) {
+ wc->uqueue[head].wr_id = entry->wr_id;
+ wc->uqueue[head].status = entry->status;
+ wc->uqueue[head].opcode = entry->opcode;
+ wc->uqueue[head].vendor_err = entry->vendor_err;
+ wc->uqueue[head].byte_len = entry->byte_len;
+ wc->uqueue[head].ex.imm_data =
+ (__u32 __force)entry->ex.imm_data;
+ wc->uqueue[head].qp_num = entry->qp->qp_num;
+ wc->uqueue[head].src_qp = entry->src_qp;
+ wc->uqueue[head].wc_flags = entry->wc_flags;
+ wc->uqueue[head].pkey_index = entry->pkey_index;
+ wc->uqueue[head].slid = entry->slid;
+ wc->uqueue[head].sl = entry->sl;
+ wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits;
+ wc->uqueue[head].port_num = entry->port_num;
+ /* Make sure entry is written before the head index. */
+ smp_wmb();
+ } else
+ wc->kqueue[head] = *entry;
+ wc->head = next;
+
+ if (cq->notify == IB_CQ_NEXT_COMP ||
+ (cq->notify == IB_CQ_SOLICITED &&
+ (solicited || entry->status != IB_WC_SUCCESS))) {
+ struct kthread_worker *worker;
+ /*
+ * This will cause send_complete() to be called in
+ * another thread.
+ */
+ smp_rmb();
+ worker = cq->dd->worker;
+ if (likely(worker)) {
+ cq->notify = IB_CQ_NONE;
+ cq->triggered++;
+ queue_kthread_work(worker, &cq->comptask);
+ }
+ }
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+}
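+
+/*
+ * Illustrative sketch (not part of the driver): the consumer side of
+ * the user-mapped completion ring filled in above.  This assumes a
+ * user library has mmap()ed a structure mirroring qib_cq_wc (head,
+ * tail and an array of struct ib_uverbs_wc); names and barriers here
+ * are illustrative, not the actual user-library implementation.
+ *
+ *	u32 tail = q->tail;
+ *	while (tail != q->head) {		   head written by kernel
+ *		rmb();			pairs with the smp_wmb() above
+ *		process(&q->uqueue[tail]);
+ *		tail = (tail >= cqe) ? 0 : tail + 1;
+ *		q->tail = tail;		   tell the kernel the slot is free
+ *	}
+ */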
+
+/**
+ * qib_poll_cq - poll for work completion entries
+ * @ibcq: the completion queue to poll
+ * @num_entries: the maximum number of entries to return
+ * @entry: pointer to array where work completions are placed
+ *
+ * Returns the number of completion entries polled.
+ *
+ * This may be called from interrupt context. Also called by ib_poll_cq()
+ * in the generic verbs code.
+ */
+int qib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
+{
+ struct qib_cq *cq = to_icq(ibcq);
+ struct qib_cq_wc *wc;
+ unsigned long flags;
+ int npolled;
+ u32 tail;
+
+ /* The kernel can only poll a kernel completion queue */
+ if (cq->ip) {
+ npolled = -EINVAL;
+ goto bail;
+ }
+
+ spin_lock_irqsave(&cq->lock, flags);
+
+ wc = cq->queue;
+ tail = wc->tail;
+ if (tail > (u32) cq->ibcq.cqe)
+ tail = (u32) cq->ibcq.cqe;
+ for (npolled = 0; npolled < num_entries; ++npolled, ++entry) {
+ if (tail == wc->head)
+ break;
+ /* The kernel doesn't need a RMB since it has the lock. */
+ *entry = wc->kqueue[tail];
+ if (tail >= cq->ibcq.cqe)
+ tail = 0;
+ else
+ tail++;
+ }
+ wc->tail = tail;
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+bail:
+ return npolled;
+}
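+
+/*
+ * Illustrative sketch (not part of the driver): a typical kernel
+ * consumer drains completions through the generic ib_poll_cq(),
+ * which lands here for qib devices.  Names are illustrative.
+ *
+ *	struct ib_wc wc[16];
+ *	int i, n;
+ *
+ *	while ((n = ib_poll_cq(cq, 16, wc)) > 0)
+ *		for (i = 0; i < n; i++)
+ *			handle_completion(&wc[i]);
+ */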
+
+static void send_complete(struct kthread_work *work)
+{
+ struct qib_cq *cq = container_of(work, struct qib_cq, comptask);
+
+ /*
+ * The completion handler will most likely rearm the notification
+ * and poll for all pending entries. If a new completion entry
+ * is added while we are in this routine, queue_kthread_work()
+ * won't call us again until we return, so we check triggered to
+ * see if we need to call the handler again.
+ */
+ for (;;) {
+ u8 triggered = cq->triggered;
+
+ /*
+ * IPoIB connected mode assumes the callback is from a
+ * soft IRQ. We simulate this by blocking "bottom halves".
+ * See the implementation for ipoib_cm_handle_tx_wc(),
+ * netif_tx_lock_bh() and netif_tx_lock().
+ */
+ local_bh_disable();
+ cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
+ local_bh_enable();
+
+ if (cq->triggered == triggered)
+ return;
+ }
+}
+
+/**
+ * qib_create_cq - create a completion queue
+ * @ibdev: the device this completion queue is attached to
+ * @entries: the minimum size of the completion queue
+ * @comp_vector: completion vector (unused by the QLogic_IB driver)
+ * @context: unused by the QLogic_IB driver
+ * @udata: user data for libibverbs.so
+ *
+ * Returns a pointer to the completion queue or negative errno values
+ * for failure.
+ *
+ * Called by ib_create_cq() in the generic verbs code.
+ */
+struct ib_cq *qib_create_cq(struct ib_device *ibdev, int entries,
+ int comp_vector, struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct qib_ibdev *dev = to_idev(ibdev);
+ struct qib_cq *cq;
+ struct qib_cq_wc *wc;
+ struct ib_cq *ret;
+ u32 sz;
+
+ if (entries < 1 || entries > ib_qib_max_cqes) {
+ ret = ERR_PTR(-EINVAL);
+ goto done;
+ }
+
+ /* Allocate the completion queue structure. */
+ cq = kmalloc(sizeof(*cq), GFP_KERNEL);
+ if (!cq) {
+ ret = ERR_PTR(-ENOMEM);
+ goto done;
+ }
+
+ /*
+ * Allocate the completion queue entries and head/tail pointers.
+ * This is allocated separately so that it can be resized and
+ * also mapped into user space.
+ * We need to use vmalloc() in order to support mmap and large
+ * numbers of entries.
+ */
+ sz = sizeof(*wc);
+ if (udata && udata->outlen >= sizeof(__u64))
+ sz += sizeof(struct ib_uverbs_wc) * (entries + 1);
+ else
+ sz += sizeof(struct ib_wc) * (entries + 1);
+ wc = vmalloc_user(sz);
+ if (!wc) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_cq;
+ }
+
+ /*
+ * Return the address of the WC as the offset to mmap.
+ * See qib_mmap() for details.
+ */
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ int err;
+
+ cq->ip = qib_create_mmap_info(dev, sz, context, wc);
+ if (!cq->ip) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_wc;
+ }
+
+ err = ib_copy_to_udata(udata, &cq->ip->offset,
+ sizeof(cq->ip->offset));
+ if (err) {
+ ret = ERR_PTR(err);
+ goto bail_ip;
+ }
+ } else
+ cq->ip = NULL;
+
+ spin_lock(&dev->n_cqs_lock);
+ if (dev->n_cqs_allocated == ib_qib_max_cqs) {
+ spin_unlock(&dev->n_cqs_lock);
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_ip;
+ }
+
+ dev->n_cqs_allocated++;
+ spin_unlock(&dev->n_cqs_lock);
+
+ if (cq->ip) {
+ spin_lock_irq(&dev->pending_lock);
+ list_add(&cq->ip->pending_mmaps, &dev->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+ }
+
+ /*
+ * ib_create_cq() will initialize cq->ibcq except for cq->ibcq.cqe.
+ * The number of entries should be >= the number requested, or we
+ * return an error.
+ */
+ cq->dd = dd_from_dev(dev);
+ cq->ibcq.cqe = entries;
+ cq->notify = IB_CQ_NONE;
+ cq->triggered = 0;
+ spin_lock_init(&cq->lock);
+ init_kthread_work(&cq->comptask, send_complete);
+ wc->head = 0;
+ wc->tail = 0;
+ cq->queue = wc;
+
+ ret = &cq->ibcq;
+
+ goto done;
+
+bail_ip:
+ kfree(cq->ip);
+bail_wc:
+ vfree(wc);
+bail_cq:
+ kfree(cq);
+done:
+ return ret;
+}
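+
+/*
+ * Illustrative sketch (not part of the driver): how a user library
+ * might map the completion queue memory using the offset returned
+ * through udata above.  The descriptor name (ctx_fd) and error
+ * handling are assumptions; see qib_mmap() for the kernel side.
+ *
+ *	__u64 offset;				   copied out via udata
+ *	void *q = mmap(NULL, size, PROT_READ | PROT_WRITE,
+ *		       MAP_SHARED, ctx_fd, (off_t) offset);
+ *	if (q == MAP_FAILED)
+ *		... handle error ...
+ */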
+
+/**
+ * qib_destroy_cq - destroy a completion queue
+ * @ibcq: the completion queue to destroy.
+ *
+ * Returns 0 for success.
+ *
+ * Called by ib_destroy_cq() in the generic verbs code.
+ */
+int qib_destroy_cq(struct ib_cq *ibcq)
+{
+ struct qib_ibdev *dev = to_idev(ibcq->device);
+ struct qib_cq *cq = to_icq(ibcq);
+
+ flush_kthread_work(&cq->comptask);
+ spin_lock(&dev->n_cqs_lock);
+ dev->n_cqs_allocated--;
+ spin_unlock(&dev->n_cqs_lock);
+ if (cq->ip)
+ kref_put(&cq->ip->ref, qib_release_mmap_info);
+ else
+ vfree(cq->queue);
+ kfree(cq);
+
+ return 0;
+}
+
+/**
+ * qib_req_notify_cq - change the notification type for a completion queue
+ * @ibcq: the completion queue
+ * @notify_flags: the type of notification to request
+ *
+ * Returns 0 for success.
+ *
+ * This may be called from interrupt context. Also called by
+ * ib_req_notify_cq() in the generic verbs code.
+ */
+int qib_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags)
+{
+ struct qib_cq *cq = to_icq(ibcq);
+ unsigned long flags;
+ int ret = 0;
+
+ spin_lock_irqsave(&cq->lock, flags);
+ /*
+ * Don't change IB_CQ_NEXT_COMP to IB_CQ_SOLICITED but allow
+ * any other transitions (see C11-31 and C11-32 in ch. 11.4.2.2).
+ */
+ if (cq->notify != IB_CQ_NEXT_COMP)
+ cq->notify = notify_flags & IB_CQ_SOLICITED_MASK;
+
+ if ((notify_flags & IB_CQ_REPORT_MISSED_EVENTS) &&
+ cq->queue->head != cq->queue->tail)
+ ret = 1;
+
+ spin_unlock_irqrestore(&cq->lock, flags);
+
+ return ret;
+}
+
+/**
+ * qib_resize_cq - change the size of the CQ
+ * @ibcq: the completion queue
+ *
+ * Returns 0 for success.
+ */
+int qib_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
+{
+ struct qib_cq *cq = to_icq(ibcq);
+ struct qib_cq_wc *old_wc;
+ struct qib_cq_wc *wc;
+ u32 head, tail, n;
+ int ret;
+ u32 sz;
+
+ if (cqe < 1 || cqe > ib_qib_max_cqes) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ /*
+ * Need to use vmalloc() if we want to support large #s of entries.
+ */
+ sz = sizeof(*wc);
+ if (udata && udata->outlen >= sizeof(__u64))
+ sz += sizeof(struct ib_uverbs_wc) * (cqe + 1);
+ else
+ sz += sizeof(struct ib_wc) * (cqe + 1);
+ wc = vmalloc_user(sz);
+ if (!wc) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ /* Check that we can write the offset to mmap. */
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ __u64 offset = 0;
+
+ ret = ib_copy_to_udata(udata, &offset, sizeof(offset));
+ if (ret)
+ goto bail_free;
+ }
+
+ spin_lock_irq(&cq->lock);
+ /*
+ * Make sure head and tail are sane since they
+ * might be user writable.
+ */
+ old_wc = cq->queue;
+ head = old_wc->head;
+ if (head > (u32) cq->ibcq.cqe)
+ head = (u32) cq->ibcq.cqe;
+ tail = old_wc->tail;
+ if (tail > (u32) cq->ibcq.cqe)
+ tail = (u32) cq->ibcq.cqe;
+ if (head < tail)
+ n = cq->ibcq.cqe + 1 + head - tail;
+ else
+ n = head - tail;
+ if (unlikely((u32)cqe < n)) {
+ ret = -EINVAL;
+ goto bail_unlock;
+ }
+ for (n = 0; tail != head; n++) {
+ if (cq->ip)
+ wc->uqueue[n] = old_wc->uqueue[tail];
+ else
+ wc->kqueue[n] = old_wc->kqueue[tail];
+ if (tail == (u32) cq->ibcq.cqe)
+ tail = 0;
+ else
+ tail++;
+ }
+ cq->ibcq.cqe = cqe;
+ wc->head = n;
+ wc->tail = 0;
+ cq->queue = wc;
+ spin_unlock_irq(&cq->lock);
+
+ vfree(old_wc);
+
+ if (cq->ip) {
+ struct qib_ibdev *dev = to_idev(ibcq->device);
+ struct qib_mmap_info *ip = cq->ip;
+
+ qib_update_mmap_info(dev, ip, sz, wc);
+
+ /*
+ * Return the offset to mmap.
+ * See qib_mmap() for details.
+ */
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ ret = ib_copy_to_udata(udata, &ip->offset,
+ sizeof(ip->offset));
+ if (ret)
+ goto bail;
+ }
+
+ spin_lock_irq(&dev->pending_lock);
+ if (list_empty(&ip->pending_mmaps))
+ list_add(&ip->pending_mmaps, &dev->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+ }
+
+ ret = 0;
+ goto bail;
+
+bail_unlock:
+ spin_unlock_irq(&cq->lock);
+bail_free:
+ vfree(wc);
+bail:
+ return ret;
+}
+
+int qib_cq_init(struct qib_devdata *dd)
+{
+ int ret = 0;
+ int cpu;
+ struct task_struct *task;
+
+ if (dd->worker)
+ return 0;
+ dd->worker = kzalloc(sizeof(*dd->worker), GFP_KERNEL);
+ if (!dd->worker)
+ return -ENOMEM;
+ init_kthread_worker(dd->worker);
+ task = kthread_create_on_node(
+ kthread_worker_fn,
+ dd->worker,
+ dd->assigned_node_id,
+ "qib_cq%d", dd->unit);
+ if (IS_ERR(task))
+ goto task_fail;
+ cpu = cpumask_first(cpumask_of_node(dd->assigned_node_id));
+ kthread_bind(task, cpu);
+ wake_up_process(task);
+out:
+ return ret;
+task_fail:
+ ret = PTR_ERR(task);
+ kfree(dd->worker);
+ dd->worker = NULL;
+ goto out;
+}
+
+void qib_cq_exit(struct qib_devdata *dd)
+{
+ struct kthread_worker *worker;
+
+ worker = dd->worker;
+ if (!worker)
+ return;
+ /* blocks future queuing from send_complete() */
+ dd->worker = NULL;
+ smp_wmb();
+ flush_kthread_worker(worker);
+ kthread_stop(worker->task);
+ kfree(worker);
+}
diff --git a/drivers/infiniband/hw/qib/qib_debugfs.c b/drivers/infiniband/hw/qib/qib_debugfs.c
new file mode 100644
index 000000000..5e75b43c5
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_debugfs.c
@@ -0,0 +1,283 @@
+#ifdef CONFIG_DEBUG_FS
+/*
+ * Copyright (c) 2013 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/kernel.h>
+#include <linux/export.h>
+
+#include "qib.h"
+#include "qib_verbs.h"
+#include "qib_debugfs.h"
+
+static struct dentry *qib_dbg_root;
+
+#define DEBUGFS_FILE(name) \
+static const struct seq_operations _##name##_seq_ops = { \
+ .start = _##name##_seq_start, \
+ .next = _##name##_seq_next, \
+ .stop = _##name##_seq_stop, \
+ .show = _##name##_seq_show \
+}; \
+static int _##name##_open(struct inode *inode, struct file *s) \
+{ \
+ struct seq_file *seq; \
+ int ret; \
+ ret = seq_open(s, &_##name##_seq_ops); \
+ if (ret) \
+ return ret; \
+ seq = s->private_data; \
+ seq->private = inode->i_private; \
+ return 0; \
+} \
+static const struct file_operations _##name##_file_ops = { \
+ .owner = THIS_MODULE, \
+ .open = _##name##_open, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = seq_release \
+};
+
+#define DEBUGFS_FILE_CREATE(name) \
+do { \
+ struct dentry *ent; \
+ ent = debugfs_create_file(#name , 0400, ibd->qib_ibdev_dbg, \
+ ibd, &_##name##_file_ops); \
+ if (!ent) \
+ pr_warn("create of " #name " failed\n"); \
+} while (0)
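+
+/*
+ * Illustrative note (not generated code): DEBUGFS_FILE(opcode_stats)
+ * expands roughly to
+ *
+ *	static const struct seq_operations _opcode_stats_seq_ops = { ... };
+ *	static int _opcode_stats_open(struct inode *inode, struct file *s);
+ *	static const struct file_operations _opcode_stats_file_ops = { ... };
+ *
+ * wiring the four _opcode_stats_seq_{start,next,stop,show} callbacks
+ * defined below into a seq_file.  The callbacks must therefore be
+ * defined before the DEBUGFS_FILE() invocation, and
+ * DEBUGFS_FILE_CREATE(opcode_stats) then registers the file under the
+ * per-device debugfs directory.
+ */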
+
+static void *_opcode_stats_seq_start(struct seq_file *s, loff_t *pos)
+{
+ struct qib_opcode_stats_perctx *opstats;
+
+ if (*pos >= ARRAY_SIZE(opstats->stats))
+ return NULL;
+ return pos;
+}
+
+static void *_opcode_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct qib_opcode_stats_perctx *opstats;
+
+ ++*pos;
+ if (*pos >= ARRAY_SIZE(opstats->stats))
+ return NULL;
+ return pos;
+}
+
+
+static void _opcode_stats_seq_stop(struct seq_file *s, void *v)
+{
+ /* nothing allocated */
+}
+
+static int _opcode_stats_seq_show(struct seq_file *s, void *v)
+{
+ loff_t *spos = v;
+ loff_t i = *spos, j;
+ u64 n_packets = 0, n_bytes = 0;
+ struct qib_ibdev *ibd = (struct qib_ibdev *)s->private;
+ struct qib_devdata *dd = dd_from_dev(ibd);
+
+ for (j = 0; j < dd->first_user_ctxt; j++) {
+ if (!dd->rcd[j])
+ continue;
+ n_packets += dd->rcd[j]->opstats->stats[i].n_packets;
+ n_bytes += dd->rcd[j]->opstats->stats[i].n_bytes;
+ }
+ if (!n_packets && !n_bytes)
+ return SEQ_SKIP;
+ seq_printf(s, "%02llx %llu/%llu\n", i,
+ (unsigned long long) n_packets,
+ (unsigned long long) n_bytes);
+
+ return 0;
+}
+
+DEBUGFS_FILE(opcode_stats)
+
+static void *_ctx_stats_seq_start(struct seq_file *s, loff_t *pos)
+{
+ struct qib_ibdev *ibd = (struct qib_ibdev *)s->private;
+ struct qib_devdata *dd = dd_from_dev(ibd);
+
+ if (!*pos)
+ return SEQ_START_TOKEN;
+ if (*pos >= dd->first_user_ctxt)
+ return NULL;
+ return pos;
+}
+
+static void *_ctx_stats_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+ struct qib_ibdev *ibd = (struct qib_ibdev *)s->private;
+ struct qib_devdata *dd = dd_from_dev(ibd);
+
+ if (v == SEQ_START_TOKEN)
+ return pos;
+
+ ++*pos;
+ if (*pos >= dd->first_user_ctxt)
+ return NULL;
+ return pos;
+}
+
+static void _ctx_stats_seq_stop(struct seq_file *s, void *v)
+{
+ /* nothing allocated */
+}
+
+static int _ctx_stats_seq_show(struct seq_file *s, void *v)
+{
+ loff_t *spos;
+ loff_t i, j;
+ u64 n_packets = 0;
+ struct qib_ibdev *ibd = (struct qib_ibdev *)s->private;
+ struct qib_devdata *dd = dd_from_dev(ibd);
+
+ if (v == SEQ_START_TOKEN) {
+ seq_puts(s, "Ctx:npkts\n");
+ return 0;
+ }
+
+ spos = v;
+ i = *spos;
+
+ if (!dd->rcd[i])
+ return SEQ_SKIP;
+
+ for (j = 0; j < ARRAY_SIZE(dd->rcd[i]->opstats->stats); j++)
+ n_packets += dd->rcd[i]->opstats->stats[j].n_packets;
+
+ if (!n_packets)
+ return SEQ_SKIP;
+
+ seq_printf(s, " %llu:%llu\n", i, n_packets);
+ return 0;
+}
+
+DEBUGFS_FILE(ctx_stats)
+
+static void *_qp_stats_seq_start(struct seq_file *s, loff_t *pos)
+{
+ struct qib_qp_iter *iter;
+ loff_t n = *pos;
+
+ rcu_read_lock();
+ iter = qib_qp_iter_init(s->private);
+ if (!iter)
+ return NULL;
+
+ while (n--) {
+ if (qib_qp_iter_next(iter)) {
+ kfree(iter);
+ return NULL;
+ }
+ }
+
+ return iter;
+}
+
+static void *_qp_stats_seq_next(struct seq_file *s, void *iter_ptr,
+ loff_t *pos)
+{
+ struct qib_qp_iter *iter = iter_ptr;
+
+ (*pos)++;
+
+ if (qib_qp_iter_next(iter)) {
+ kfree(iter);
+ return NULL;
+ }
+
+ return iter;
+}
+
+static void _qp_stats_seq_stop(struct seq_file *s, void *iter_ptr)
+{
+ rcu_read_unlock();
+}
+
+static int _qp_stats_seq_show(struct seq_file *s, void *iter_ptr)
+{
+ struct qib_qp_iter *iter = iter_ptr;
+
+ if (!iter)
+ return 0;
+
+ qib_qp_iter_print(s, iter);
+
+ return 0;
+}
+
+DEBUGFS_FILE(qp_stats)
+
+void qib_dbg_ibdev_init(struct qib_ibdev *ibd)
+{
+ char name[10];
+
+ snprintf(name, sizeof(name), "qib%d", dd_from_dev(ibd)->unit);
+ ibd->qib_ibdev_dbg = debugfs_create_dir(name, qib_dbg_root);
+ if (!ibd->qib_ibdev_dbg) {
+ pr_warn("create of %s failed\n", name);
+ return;
+ }
+ DEBUGFS_FILE_CREATE(opcode_stats);
+ DEBUGFS_FILE_CREATE(ctx_stats);
+ DEBUGFS_FILE_CREATE(qp_stats);
+}
+
+void qib_dbg_ibdev_exit(struct qib_ibdev *ibd)
+{
+ if (!qib_dbg_root)
+ goto out;
+ debugfs_remove_recursive(ibd->qib_ibdev_dbg);
+out:
+ ibd->qib_ibdev_dbg = NULL;
+}
+
+void qib_dbg_init(void)
+{
+ qib_dbg_root = debugfs_create_dir(QIB_DRV_NAME, NULL);
+ if (!qib_dbg_root)
+ pr_warn("init of debugfs failed\n");
+}
+
+void qib_dbg_exit(void)
+{
+ debugfs_remove_recursive(qib_dbg_root);
+ qib_dbg_root = NULL;
+}
+
+#endif
+
diff --git a/drivers/infiniband/hw/qib/qib_debugfs.h b/drivers/infiniband/hw/qib/qib_debugfs.h
new file mode 100644
index 000000000..7ae983a91
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_debugfs.h
@@ -0,0 +1,45 @@
+#ifndef _QIB_DEBUGFS_H
+#define _QIB_DEBUGFS_H
+
+#ifdef CONFIG_DEBUG_FS
+/*
+ * Copyright (c) 2013 Intel Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+struct qib_ibdev;
+void qib_dbg_ibdev_init(struct qib_ibdev *ibd);
+void qib_dbg_ibdev_exit(struct qib_ibdev *ibd);
+void qib_dbg_init(void);
+void qib_dbg_exit(void);
+
+#endif
+
+#endif /* _QIB_DEBUGFS_H */
diff --git a/drivers/infiniband/hw/qib/qib_diag.c b/drivers/infiniband/hw/qib/qib_diag.c
new file mode 100644
index 000000000..8c34b23e5
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_diag.c
@@ -0,0 +1,916 @@
+/*
+ * Copyright (c) 2012 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file contains support for diagnostic functions. It is accessed by
+ * opening the qib_diag device, normally minor number 129. Diagnostic use
+ * of the QLogic_IB chip may render the chip or board unusable until the
+ * driver is unloaded, or in some cases, until the system is rebooted.
+ *
+ * Accesses to the chip through this interface are not similar to going
+ * through the /sys/bus/pci resource mmap interface.
+ */
+
+#include <linux/io.h>
+#include <linux/pci.h>
+#include <linux/poll.h>
+#include <linux/vmalloc.h>
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+
+#include "qib.h"
+#include "qib_common.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) QIB_DRV_NAME ": " fmt
+
+/*
+ * Each client that opens the diag device must read then write
+ * offset 0, to prevent lossage from random cat or od. diag_state
+ * sequences this "handshake".
+ */
+enum diag_state { UNUSED = 0, OPENED, INIT, READY };
+
+/* State for an individual client. PID so children cannot abuse handshake */
+static struct qib_diag_client {
+ struct qib_diag_client *next;
+ struct qib_devdata *dd;
+ pid_t pid;
+ enum diag_state state;
+} *client_pool;
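+
+/*
+ * Illustrative sketch (not part of the driver): the handshake as seen
+ * by a diag client.  State advances OPENED -> INIT on the first 8-byte
+ * read at offset 0, and INIT -> READY on the following 8-byte write at
+ * offset 0 (see qib_diag_read() and qib_diag_write() below).  The
+ * device path and error handling are assumptions for illustration.
+ *
+ *	int fd = open("/dev/ipath_diag0", O_RDWR);
+ *	u64 v;
+ *	pread(fd, &v, sizeof(v), 0);	   OPENED -> INIT
+ *	pwrite(fd, &v, sizeof(v), 0);	   INIT -> READY
+ *	then arbitrary 32/64-bit aligned reads and writes are allowed
+ */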
+
+/*
+ * Get a client struct. Recycled if possible, else kmalloc.
+ * Must be called with qib_mutex held
+ */
+static struct qib_diag_client *get_client(struct qib_devdata *dd)
+{
+ struct qib_diag_client *dc;
+
+ dc = client_pool;
+ if (dc)
+ /* got one from the pool; remove it and use it */
+ client_pool = dc->next;
+ else
+ /* None in pool, alloc and init */
+ dc = kmalloc(sizeof(*dc), GFP_KERNEL);
+
+ if (dc) {
+ dc->next = NULL;
+ dc->dd = dd;
+ dc->pid = current->pid;
+ dc->state = OPENED;
+ }
+ return dc;
+}
+
+/*
+ * Return to pool. Must be called with qib_mutex held
+ */
+static void return_client(struct qib_diag_client *dc)
+{
+ struct qib_devdata *dd = dc->dd;
+ struct qib_diag_client *tdc, *rdc;
+
+ rdc = NULL;
+ if (dc == dd->diag_client) {
+ dd->diag_client = dc->next;
+ rdc = dc;
+ } else {
+ tdc = dc->dd->diag_client;
+ while (tdc) {
+ if (dc == tdc->next) {
+ tdc->next = dc->next;
+ rdc = dc;
+ break;
+ }
+ tdc = tdc->next;
+ }
+ }
+ if (rdc) {
+ rdc->state = UNUSED;
+ rdc->dd = NULL;
+ rdc->pid = 0;
+ rdc->next = client_pool;
+ client_pool = rdc;
+ }
+}
+
+static int qib_diag_open(struct inode *in, struct file *fp);
+static int qib_diag_release(struct inode *in, struct file *fp);
+static ssize_t qib_diag_read(struct file *fp, char __user *data,
+ size_t count, loff_t *off);
+static ssize_t qib_diag_write(struct file *fp, const char __user *data,
+ size_t count, loff_t *off);
+
+static const struct file_operations diag_file_ops = {
+ .owner = THIS_MODULE,
+ .write = qib_diag_write,
+ .read = qib_diag_read,
+ .open = qib_diag_open,
+ .release = qib_diag_release,
+ .llseek = default_llseek,
+};
+
+static atomic_t diagpkt_count = ATOMIC_INIT(0);
+static struct cdev *diagpkt_cdev;
+static struct device *diagpkt_device;
+
+static ssize_t qib_diagpkt_write(struct file *fp, const char __user *data,
+ size_t count, loff_t *off);
+
+static const struct file_operations diagpkt_file_ops = {
+ .owner = THIS_MODULE,
+ .write = qib_diagpkt_write,
+ .llseek = noop_llseek,
+};
+
+int qib_diag_add(struct qib_devdata *dd)
+{
+ char name[16];
+ int ret = 0;
+
+ if (atomic_inc_return(&diagpkt_count) == 1) {
+ ret = qib_cdev_init(QIB_DIAGPKT_MINOR, "ipath_diagpkt",
+ &diagpkt_file_ops, &diagpkt_cdev,
+ &diagpkt_device);
+ if (ret)
+ goto done;
+ }
+
+ snprintf(name, sizeof(name), "ipath_diag%d", dd->unit);
+ ret = qib_cdev_init(QIB_DIAG_MINOR_BASE + dd->unit, name,
+ &diag_file_ops, &dd->diag_cdev,
+ &dd->diag_device);
+done:
+ return ret;
+}
+
+static void qib_unregister_observers(struct qib_devdata *dd);
+
+void qib_diag_remove(struct qib_devdata *dd)
+{
+ struct qib_diag_client *dc;
+
+ if (atomic_dec_and_test(&diagpkt_count))
+ qib_cdev_cleanup(&diagpkt_cdev, &diagpkt_device);
+
+ qib_cdev_cleanup(&dd->diag_cdev, &dd->diag_device);
+
+ /*
+ * Return all diag_clients of this device. There should be none,
+ * as we are "guaranteed" that no clients are still open
+ */
+ while (dd->diag_client)
+ return_client(dd->diag_client);
+
+ /* Now clean up all unused client structs */
+ while (client_pool) {
+ dc = client_pool;
+ client_pool = dc->next;
+ kfree(dc);
+ }
+ /* Clean up observer list */
+ qib_unregister_observers(dd);
+}
+
+/* qib_remap_ioaddr32 - remap an offset into chip address space to __iomem *
+ *
+ * @dd: the qlogic_ib device
+ * @offs: the offset in chip-space
+ * @cntp: Pointer to max (byte) count for transfer starting at offset
+ * This returns a u32 __iomem * so it can be used for both 64 and 32-bit
+ * mapping. It is needed because with the use of PAT for control of
+ * write-combining, the logically contiguous address-space of the chip
+ * may be split into virtually non-contiguous spaces, with different
+ * attributes, which are then mapped to contiguous physical space
+ * based on the first BAR.
+ *
+ * The code below makes the same assumptions as were made in
+ * init_chip_wc_pat() (qib_init.c), copied here:
+ * Assumes chip address space looks like:
+ * - kregs + sregs + cregs + uregs (in any order)
+ * - piobufs (2K and 4K bufs in either order)
+ * or:
+ * - kregs + sregs + cregs (in any order)
+ * - piobufs (2K and 4K bufs in either order)
+ * - uregs
+ *
+ * If cntp is non-NULL, returns how many bytes from offset can be accessed
+ * Returns 0 if the offset is not mapped.
+ */
+static u32 __iomem *qib_remap_ioaddr32(struct qib_devdata *dd, u32 offset,
+ u32 *cntp)
+{
+ u32 kreglen;
+ u32 snd_bottom, snd_lim = 0;
+ u32 __iomem *krb32 = (u32 __iomem *)dd->kregbase;
+ u32 __iomem *map = NULL;
+ u32 cnt = 0;
+ u32 tot4k, offs4k;
+
+ /* First, simplest case, offset is within the first map. */
+ kreglen = (dd->kregend - dd->kregbase) * sizeof(u64);
+ if (offset < kreglen) {
+ map = krb32 + (offset / sizeof(u32));
+ cnt = kreglen - offset;
+ goto mapped;
+ }
+
+ /*
+ * Next check for user regs, the next most common case,
+ * and a cheap check because if they are not in the first map
+ * they are last in chip.
+ */
+ if (dd->userbase) {
+ /* If user regs mapped, they are after send, so set limit. */
+ u32 ulim = (dd->cfgctxts * dd->ureg_align) + dd->uregbase;
+
+ if (!dd->piovl15base)
+ snd_lim = dd->uregbase;
+ krb32 = (u32 __iomem *)dd->userbase;
+ if (offset >= dd->uregbase && offset < ulim) {
+ map = krb32 + (offset - dd->uregbase) / sizeof(u32);
+ cnt = ulim - offset;
+ goto mapped;
+ }
+ }
+
+ /*
+ * Lastly, check for offset within Send Buffers.
+ * This is gnarly because struct devdata is deliberately vague
+ * about things like 7322 VL15 buffers, and we are not in
+ * chip-specific code here, so should not make many assumptions.
+ * The one we _do_ make is that the only chip that has more sndbufs
+ * than we admit is the 7322, and it has userregs above that, so
+ * we know the snd_lim.
+ */
+ /* Assume 2K buffers are first. */
+ snd_bottom = dd->pio2k_bufbase;
+ if (snd_lim == 0) {
+ u32 tot2k = dd->piobcnt2k * ALIGN(dd->piosize2k, dd->palign);
+
+ snd_lim = snd_bottom + tot2k;
+ }
+ /* If 4k buffers exist, account for them by bumping
+ * appropriate limit.
+ */
+ tot4k = dd->piobcnt4k * dd->align4k;
+ offs4k = dd->piobufbase >> 32;
+ if (dd->piobcnt4k) {
+ if (snd_bottom > offs4k)
+ snd_bottom = offs4k;
+ else {
+ /* 4k above 2k. Bump snd_lim, if needed*/
+ if (!dd->userbase || dd->piovl15base)
+ snd_lim = offs4k + tot4k;
+ }
+ }
+ /*
+ * Judgement call: can we ignore the space between SendBuffs and
+ * UserRegs, where we would like to see vl15 buffs, but not more?
+ */
+ if (offset >= snd_bottom && offset < snd_lim) {
+ offset -= snd_bottom;
+ map = (u32 __iomem *)dd->piobase + (offset / sizeof(u32));
+ cnt = snd_lim - offset;
+ }
+
+ if (!map && offs4k && dd->piovl15base) {
+ snd_lim = offs4k + tot4k + 2 * dd->align4k;
+ if (offset >= (offs4k + tot4k) && offset < snd_lim) {
+ map = (u32 __iomem *)dd->piovl15base +
+ ((offset - (offs4k + tot4k)) / sizeof(u32));
+ cnt = snd_lim - offset;
+ }
+ }
+
+mapped:
+ if (cntp)
+ *cntp = cnt;
+ return map;
+}
+
+/*
+ * qib_read_umem64 - read a 64-bit quantity from the chip into user space
+ * @dd: the qlogic_ib device
+ * @uaddr: the location to store the data in user memory
+ * @regoffs: the offset from BAR0 (_NOT_ full pointer, anymore)
+ * @count: number of bytes to copy (multiple of 32 bits)
+ *
+ * This function also localizes all chip memory accesses.
+ * The copy should be written such that we read full cacheline packets
+ * from the chip. This is usually used for a single qword
+ *
+ * NOTE: This assumes the chip address is 64-bit aligned.
+ */
+static int qib_read_umem64(struct qib_devdata *dd, void __user *uaddr,
+ u32 regoffs, size_t count)
+{
+ const u64 __iomem *reg_addr;
+ const u64 __iomem *reg_end;
+ u32 limit;
+ int ret;
+
+ reg_addr = (const u64 __iomem *)qib_remap_ioaddr32(dd, regoffs, &limit);
+ if (reg_addr == NULL || limit == 0 || !(dd->flags & QIB_PRESENT)) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ if (count >= limit)
+ count = limit;
+ reg_end = reg_addr + (count / sizeof(u64));
+
+ /* not very efficient, but it works for now */
+ while (reg_addr < reg_end) {
+ u64 data = readq(reg_addr);
+
+ if (copy_to_user(uaddr, &data, sizeof(u64))) {
+ ret = -EFAULT;
+ goto bail;
+ }
+ reg_addr++;
+ uaddr += sizeof(u64);
+ }
+ ret = 0;
+bail:
+ return ret;
+}
+
+/*
+ * qib_write_umem64 - write a 64-bit quantity to the chip from user space
+ * @dd: the qlogic_ib device
+ * @regoffs: the offset from BAR0 (_NOT_ full pointer, anymore)
+ * @uaddr: the source of the data in user memory
+ * @count: the number of bytes to copy (multiple of 32 bits)
+ *
+ * This is usually used for a single qword
+ * NOTE: This assumes the chip address is 64-bit aligned.
+ */
+
+static int qib_write_umem64(struct qib_devdata *dd, u32 regoffs,
+ const void __user *uaddr, size_t count)
+{
+ u64 __iomem *reg_addr;
+ const u64 __iomem *reg_end;
+ u32 limit;
+ int ret;
+
+ reg_addr = (u64 __iomem *)qib_remap_ioaddr32(dd, regoffs, &limit);
+ if (reg_addr == NULL || limit == 0 || !(dd->flags & QIB_PRESENT)) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ if (count >= limit)
+ count = limit;
+ reg_end = reg_addr + (count / sizeof(u64));
+
+ /* not very efficient, but it works for now */
+ while (reg_addr < reg_end) {
+ u64 data;
+
+ if (copy_from_user(&data, uaddr, sizeof(data))) {
+ ret = -EFAULT;
+ goto bail;
+ }
+ writeq(data, reg_addr);
+
+ reg_addr++;
+ uaddr += sizeof(u64);
+ }
+ ret = 0;
+bail:
+ return ret;
+}
+
+/*
+ * qib_read_umem32 - read a 32-bit quantity from the chip into user space
+ * @dd: the qlogic_ib device
+ * @uaddr: the location to store the data in user memory
+ * @regoffs: the offset from BAR0 (_NOT_ full pointer, anymore)
+ * @count: number of bytes to copy
+ *
+ * read 32 bit values, not 64 bit; for memories that only
+ * support 32 bit reads; usually a single dword.
+ */
+static int qib_read_umem32(struct qib_devdata *dd, void __user *uaddr,
+ u32 regoffs, size_t count)
+{
+ const u32 __iomem *reg_addr;
+ const u32 __iomem *reg_end;
+ u32 limit;
+ int ret;
+
+ reg_addr = qib_remap_ioaddr32(dd, regoffs, &limit);
+ if (reg_addr == NULL || limit == 0 || !(dd->flags & QIB_PRESENT)) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ if (count >= limit)
+ count = limit;
+ reg_end = reg_addr + (count / sizeof(u32));
+
+ /* not very efficient, but it works for now */
+ while (reg_addr < reg_end) {
+ u32 data = readl(reg_addr);
+
+ if (copy_to_user(uaddr, &data, sizeof(data))) {
+ ret = -EFAULT;
+ goto bail;
+ }
+
+ reg_addr++;
+ uaddr += sizeof(u32);
+
+ }
+ ret = 0;
+bail:
+ return ret;
+}
+
+/*
+ * qib_write_umem32 - write a 32-bit quantity to the chip from user space
+ * @dd: the qlogic_ib device
+ * @regoffs: the offset from BAR0 (_NOT_ full pointer, anymore)
+ * @uaddr: the source of the data in user memory
+ * @count: number of bytes to copy
+ *
+ * write 32 bit values, not 64 bit; for memories that only
+ * support 32 bit write; usually a single dword.
+ */
+
+static int qib_write_umem32(struct qib_devdata *dd, u32 regoffs,
+ const void __user *uaddr, size_t count)
+{
+ u32 __iomem *reg_addr;
+ const u32 __iomem *reg_end;
+ u32 limit;
+ int ret;
+
+ reg_addr = qib_remap_ioaddr32(dd, regoffs, &limit);
+ if (reg_addr == NULL || limit == 0 || !(dd->flags & QIB_PRESENT)) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ if (count >= limit)
+ count = limit;
+ reg_end = reg_addr + (count / sizeof(u32));
+
+ while (reg_addr < reg_end) {
+ u32 data;
+
+ if (copy_from_user(&data, uaddr, sizeof(data))) {
+ ret = -EFAULT;
+ goto bail;
+ }
+ writel(data, reg_addr);
+
+ reg_addr++;
+ uaddr += sizeof(u32);
+ }
+ ret = 0;
+bail:
+ return ret;
+}
+
+static int qib_diag_open(struct inode *in, struct file *fp)
+{
+ int unit = iminor(in) - QIB_DIAG_MINOR_BASE;
+ struct qib_devdata *dd;
+ struct qib_diag_client *dc;
+ int ret;
+
+ mutex_lock(&qib_mutex);
+
+ dd = qib_lookup(unit);
+
+ if (dd == NULL || !(dd->flags & QIB_PRESENT) ||
+ !dd->kregbase) {
+ ret = -ENODEV;
+ goto bail;
+ }
+
+ dc = get_client(dd);
+ if (!dc) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+ dc->next = dd->diag_client;
+ dd->diag_client = dc;
+ fp->private_data = dc;
+ ret = 0;
+bail:
+ mutex_unlock(&qib_mutex);
+
+ return ret;
+}
+
+/**
+ * qib_diagpkt_write - write an IB packet
+ * @fp: the diag data device file pointer
+ * @data: qib_diag_pkt structure saying where to get the packet
+ * @count: size of data to write
+ * @off: unused by this code
+ */
+static ssize_t qib_diagpkt_write(struct file *fp,
+ const char __user *data,
+ size_t count, loff_t *off)
+{
+ u32 __iomem *piobuf;
+ u32 plen, pbufn, maxlen_reserve;
+ struct qib_diag_xpkt dp;
+ u32 *tmpbuf = NULL;
+ struct qib_devdata *dd;
+ struct qib_pportdata *ppd;
+ ssize_t ret = 0;
+
+ if (count != sizeof(dp)) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ if (copy_from_user(&dp, data, sizeof(dp))) {
+ ret = -EFAULT;
+ goto bail;
+ }
+
+ dd = qib_lookup(dp.unit);
+ if (!dd || !(dd->flags & QIB_PRESENT) || !dd->kregbase) {
+ ret = -ENODEV;
+ goto bail;
+ }
+ if (!(dd->flags & QIB_INITTED)) {
+ /* no hardware, freeze, etc. */
+ ret = -ENODEV;
+ goto bail;
+ }
+
+ if (dp.version != _DIAG_XPKT_VERS) {
+ qib_dev_err(dd, "Invalid version %u for diagpkt_write\n",
+ dp.version);
+ ret = -EINVAL;
+ goto bail;
+ }
+ /* send count must be an exact number of dwords */
+ if (dp.len & 3) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ if (!dp.port || dp.port > dd->num_pports) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ ppd = &dd->pport[dp.port - 1];
+
+ /*
+ * We need the total length before the first word is written, plus
+ * 2 dwords.  One dword is for padding so we get the full user data
+ * when not aligned on a word boundary; the other dword makes sure
+ * we have room for the ICRC, which gets tacked on later.
+ */
+ maxlen_reserve = 2 * sizeof(u32);
+ if (dp.len > ppd->ibmaxlen - maxlen_reserve) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ plen = sizeof(u32) + dp.len;
+
+ tmpbuf = vmalloc(plen);
+ if (!tmpbuf) {
+ qib_devinfo(dd->pcidev,
+ "Unable to allocate tmp buffer, failing\n");
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ if (copy_from_user(tmpbuf,
+ (const void __user *) (unsigned long) dp.data,
+ dp.len)) {
+ ret = -EFAULT;
+ goto bail;
+ }
+
+ plen >>= 2; /* in dwords */
+
+ if (dp.pbc_wd == 0)
+ dp.pbc_wd = plen;
+
+ piobuf = dd->f_getsendbuf(ppd, dp.pbc_wd, &pbufn);
+ if (!piobuf) {
+ ret = -EBUSY;
+ goto bail;
+ }
+ /* disarm it just to be extra sure */
+ dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(pbufn));
+
+ /* disable header check on pbufn for this packet */
+ dd->f_txchk_change(dd, pbufn, 1, TXCHK_CHG_TYPE_DIS1, NULL);
+
+ writeq(dp.pbc_wd, piobuf);
+ /*
+ * Copy all but the trigger word, then flush so it is written to
+ * the chip before the trigger word; then write the trigger word
+ * and flush again so the packet is sent.
+ */
+ if (dd->flags & QIB_PIO_FLUSH_WC) {
+ qib_flush_wc();
+ qib_pio_copy(piobuf + 2, tmpbuf, plen - 1);
+ qib_flush_wc();
+ __raw_writel(tmpbuf[plen - 1], piobuf + plen + 1);
+ } else
+ qib_pio_copy(piobuf + 2, tmpbuf, plen);
+
+ if (dd->flags & QIB_USE_SPCL_TRIG) {
+ u32 spcl_off = (pbufn >= dd->piobcnt2k) ? 2047 : 1023;
+
+ qib_flush_wc();
+ __raw_writel(0xaebecede, piobuf + spcl_off);
+ }
+
+ /*
+ * Ensure buffer is written to the chip, then re-enable
+ * header checks (if supported by chip). The txchk
+ * code will ensure seen by chip before returning.
+ */
+ qib_flush_wc();
+ qib_sendbuf_done(dd, pbufn);
+ dd->f_txchk_change(dd, pbufn, 1, TXCHK_CHG_TYPE_ENAB1, NULL);
+
+ ret = sizeof(dp);
+
+bail:
+ vfree(tmpbuf);
+ return ret;
+}
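+
+/*
+ * Illustrative sketch (not part of the driver): submitting a packet
+ * through the interface above.  The payload length must be a multiple
+ * of 4 and fit below ibmaxlen; fd refers to the "ipath_diagpkt"
+ * device created in qib_diag_add(), and the variable names are
+ * illustrative.
+ *
+ *	struct qib_diag_xpkt dp = {
+ *		.version = _DIAG_XPKT_VERS,
+ *		.unit	 = 0,
+ *		.port	 = 1,			   ports are 1-based
+ *		.len	 = payload_len,		   bytes, dword multiple
+ *		.data	 = (__u64) (unsigned long) payload,
+ *		.pbc_wd	 = 0,		   0: driver fills in the length
+ *	};
+ *	if (write(fd, &dp, sizeof(dp)) != sizeof(dp))
+ *		... handle error ...
+ */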
+
+static int qib_diag_release(struct inode *in, struct file *fp)
+{
+ mutex_lock(&qib_mutex);
+ return_client(fp->private_data);
+ fp->private_data = NULL;
+ mutex_unlock(&qib_mutex);
+ return 0;
+}
+
+/*
+ * Chip-specific code calls qib_register_observer() to register its
+ * interest in a specific address range.
+ */
+struct diag_observer_list_elt {
+ struct diag_observer_list_elt *next;
+ const struct diag_observer *op;
+};
+
+int qib_register_observer(struct qib_devdata *dd,
+ const struct diag_observer *op)
+{
+ struct diag_observer_list_elt *olp;
+ unsigned long flags;
+
+ if (!dd || !op)
+ return -EINVAL;
+ olp = vmalloc(sizeof(*olp));
+ if (!olp) {
+ pr_err("vmalloc for observer failed\n");
+ return -ENOMEM;
+ }
+
+ spin_lock_irqsave(&dd->qib_diag_trans_lock, flags);
+ olp->op = op;
+ olp->next = dd->diag_observer_list;
+ dd->diag_observer_list = olp;
+ spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags);
+
+ return 0;
+}
+
+/* Remove all registered observers when device is closed */
+static void qib_unregister_observers(struct qib_devdata *dd)
+{
+ struct diag_observer_list_elt *olp;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->qib_diag_trans_lock, flags);
+ olp = dd->diag_observer_list;
+ while (olp) {
+ /* Pop one observer, let go of lock */
+ dd->diag_observer_list = olp->next;
+ spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags);
+ vfree(olp);
+ /* try again. */
+ spin_lock_irqsave(&dd->qib_diag_trans_lock, flags);
+ olp = dd->diag_observer_list;
+ }
+ spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags);
+}
+
+/*
+ * Find the observer, if any, for the specified address.  The initial
+ * implementation is a simple stack of observers.  This must be called
+ * with the diag transaction lock held.
+ */
+static const struct diag_observer *diag_get_observer(struct qib_devdata *dd,
+ u32 addr)
+{
+ struct diag_observer_list_elt *olp;
+ const struct diag_observer *op = NULL;
+
+ olp = dd->diag_observer_list;
+ while (olp) {
+ op = olp->op;
+ if (addr >= op->bottom && addr <= op->top)
+ break;
+ olp = olp->next;
+ }
+ if (!olp)
+ op = NULL;
+
+ return op;
+}
+
+static ssize_t qib_diag_read(struct file *fp, char __user *data,
+ size_t count, loff_t *off)
+{
+ struct qib_diag_client *dc = fp->private_data;
+ struct qib_devdata *dd = dc->dd;
+ void __iomem *kreg_base;
+ ssize_t ret;
+
+ if (dc->pid != current->pid) {
+ ret = -EPERM;
+ goto bail;
+ }
+
+ kreg_base = dd->kregbase;
+
+ if (count == 0)
+ ret = 0;
+ else if ((count % 4) || (*off % 4))
+ /* address or length is not 32-bit aligned, hence invalid */
+ ret = -EINVAL;
+ else if (dc->state < READY && (*off || count != 8))
+ ret = -EINVAL; /* prevent cat /dev/qib_diag* */
+ else {
+ unsigned long flags;
+ u64 data64 = 0;
+ int use_32;
+ const struct diag_observer *op;
+
+ use_32 = (count % 8) || (*off % 8);
+ ret = -1;
+ spin_lock_irqsave(&dd->qib_diag_trans_lock, flags);
+ /*
+ * Check for observer on this address range.
+ * We only support a single 32- or 64-bit read
+ * via observer, currently.
+ */
+ op = diag_get_observer(dd, *off);
+ if (op) {
+ u32 offset = *off;
+
+ ret = op->hook(dd, op, offset, &data64, 0, use_32);
+ }
+ /*
+ * We need to release lock before any copy_to_user(),
+ * whether implicit in qib_read_umem* or explicit below.
+ */
+ spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags);
+ if (!op) {
+ if (use_32)
+ /*
+ * Address or length is not 64-bit aligned;
+ * do 32-bit rd
+ */
+ ret = qib_read_umem32(dd, data, (u32) *off,
+ count);
+ else
+ ret = qib_read_umem64(dd, data, (u32) *off,
+ count);
+ } else if (ret == count) {
+ /* Below finishes case where observer existed */
+ ret = copy_to_user(data, &data64, use_32 ?
+ sizeof(u32) : sizeof(u64));
+ if (ret)
+ ret = -EFAULT;
+ }
+ }
+
+ if (ret >= 0) {
+ *off += count;
+ ret = count;
+ if (dc->state == OPENED)
+ dc->state = INIT;
+ }
+bail:
+ return ret;
+}
+
+static ssize_t qib_diag_write(struct file *fp, const char __user *data,
+ size_t count, loff_t *off)
+{
+ struct qib_diag_client *dc = fp->private_data;
+ struct qib_devdata *dd = dc->dd;
+ void __iomem *kreg_base;
+ ssize_t ret;
+
+ if (dc->pid != current->pid) {
+ ret = -EPERM;
+ goto bail;
+ }
+
+ kreg_base = dd->kregbase;
+
+ if (count == 0)
+ ret = 0;
+ else if ((count % 4) || (*off % 4))
+ /* address or length is not 32-bit aligned, hence invalid */
+ ret = -EINVAL;
+ else if (dc->state < READY &&
+ ((*off || count != 8) || dc->state != INIT))
+ /* No writes except second-step of init seq */
+ ret = -EINVAL; /* before any other write allowed */
+ else {
+ unsigned long flags;
+ const struct diag_observer *op = NULL;
+ int use_32 = (count % 8) || (*off % 8);
+
+ /*
+ * Check for observer on this address range.
+ * We only support a single 32 or 64-bit write
+ * via observer, currently. This helps, because
+ * we would otherwise have to jump through hoops
+ * to make "diag transaction" meaningful when we
+ * cannot do a copy_from_user while holding the lock.
+ */
+ if (count == 4 || count == 8) {
+ u64 data64;
+ u32 offset = *off;
+
+ ret = copy_from_user(&data64, data, count);
+ if (ret) {
+ ret = -EFAULT;
+ goto bail;
+ }
+ spin_lock_irqsave(&dd->qib_diag_trans_lock, flags);
+ op = diag_get_observer(dd, *off);
+ if (op)
+ ret = op->hook(dd, op, offset, &data64, ~0Ull,
+ use_32);
+ spin_unlock_irqrestore(&dd->qib_diag_trans_lock, flags);
+ }
+
+ if (!op) {
+ if (use_32)
+ /*
+ * Address or length is not 64-bit aligned;
+ * do 32-bit write
+ */
+ ret = qib_write_umem32(dd, (u32) *off, data,
+ count);
+ else
+ ret = qib_write_umem64(dd, (u32) *off, data,
+ count);
+ }
+ }
+
+ if (ret >= 0) {
+ *off += count;
+ ret = count;
+ if (dc->state == INIT)
+ dc->state = READY; /* all read/write OK now */
+ }
+bail:
+ return ret;
+}
diff --git a/drivers/infiniband/hw/qib/qib_dma.c b/drivers/infiniband/hw/qib/qib_dma.c
new file mode 100644
index 000000000..59fe092b4
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_dma.c
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2006, 2009, 2010 QLogic, Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/types.h>
+#include <linux/scatterlist.h>
+
+#include "qib_verbs.h"
+
+#define BAD_DMA_ADDRESS ((u64) 0)
+
+/*
+ * The following functions implement driver specific replacements
+ * for the ib_dma_*() functions.
+ *
+ * These functions return kernel virtual addresses instead of
+ * device bus addresses since the driver uses the CPU to copy
+ * data instead of using hardware DMA.
+ */
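+/*
+ * For example, qib_dma_map_single() below simply returns the CPU
+ * address cast to u64; a returned value of 0 (BAD_DMA_ADDRESS) is the
+ * only failure indication, detected via qib_mapping_error().
+ */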
+
+static int qib_mapping_error(struct ib_device *dev, u64 dma_addr)
+{
+ return dma_addr == BAD_DMA_ADDRESS;
+}
+
+static u64 qib_dma_map_single(struct ib_device *dev, void *cpu_addr,
+ size_t size, enum dma_data_direction direction)
+{
+ BUG_ON(!valid_dma_direction(direction));
+ return (u64) cpu_addr;
+}
+
+static void qib_dma_unmap_single(struct ib_device *dev, u64 addr, size_t size,
+ enum dma_data_direction direction)
+{
+ BUG_ON(!valid_dma_direction(direction));
+}
+
+static u64 qib_dma_map_page(struct ib_device *dev, struct page *page,
+ unsigned long offset, size_t size,
+ enum dma_data_direction direction)
+{
+ u64 addr;
+
+ BUG_ON(!valid_dma_direction(direction));
+
+ if (offset + size > PAGE_SIZE) {
+ addr = BAD_DMA_ADDRESS;
+ goto done;
+ }
+
+ addr = (u64) page_address(page);
+ if (addr)
+ addr += offset;
+ /* TODO: handle highmem pages */
+
+done:
+ return addr;
+}
+
+static void qib_dma_unmap_page(struct ib_device *dev, u64 addr, size_t size,
+ enum dma_data_direction direction)
+{
+ BUG_ON(!valid_dma_direction(direction));
+}
+
+static int qib_map_sg(struct ib_device *dev, struct scatterlist *sgl,
+ int nents, enum dma_data_direction direction)
+{
+ struct scatterlist *sg;
+ u64 addr;
+ int i;
+ int ret = nents;
+
+ BUG_ON(!valid_dma_direction(direction));
+
+ for_each_sg(sgl, sg, nents, i) {
+ addr = (u64) page_address(sg_page(sg));
+ /* TODO: handle highmem pages */
+ if (!addr) {
+ ret = 0;
+ break;
+ }
+ sg->dma_address = addr + sg->offset;
+#ifdef CONFIG_NEED_SG_DMA_LENGTH
+ sg->dma_length = sg->length;
+#endif
+ }
+ return ret;
+}
+
+static void qib_unmap_sg(struct ib_device *dev,
+ struct scatterlist *sg, int nents,
+ enum dma_data_direction direction)
+{
+ BUG_ON(!valid_dma_direction(direction));
+}
+
+static void qib_sync_single_for_cpu(struct ib_device *dev, u64 addr,
+ size_t size, enum dma_data_direction dir)
+{
+}
+
+static void qib_sync_single_for_device(struct ib_device *dev, u64 addr,
+ size_t size,
+ enum dma_data_direction dir)
+{
+}
+
+static void *qib_dma_alloc_coherent(struct ib_device *dev, size_t size,
+ u64 *dma_handle, gfp_t flag)
+{
+ struct page *p;
+ void *addr = NULL;
+
+ p = alloc_pages(flag, get_order(size));
+ if (p)
+ addr = page_address(p);
+ if (dma_handle)
+ *dma_handle = (u64) addr;
+ return addr;
+}
+
+static void qib_dma_free_coherent(struct ib_device *dev, size_t size,
+ void *cpu_addr, u64 dma_handle)
+{
+ free_pages((unsigned long) cpu_addr, get_order(size));
+}
+
+struct ib_dma_mapping_ops qib_dma_mapping_ops = {
+ .mapping_error = qib_mapping_error,
+ .map_single = qib_dma_map_single,
+ .unmap_single = qib_dma_unmap_single,
+ .map_page = qib_dma_map_page,
+ .unmap_page = qib_dma_unmap_page,
+ .map_sg = qib_map_sg,
+ .unmap_sg = qib_unmap_sg,
+ .sync_single_for_cpu = qib_sync_single_for_cpu,
+ .sync_single_for_device = qib_sync_single_for_device,
+ .alloc_coherent = qib_dma_alloc_coherent,
+ .free_coherent = qib_dma_free_coherent
+};
diff --git a/drivers/infiniband/hw/qib/qib_driver.c b/drivers/infiniband/hw/qib/qib_driver.c
new file mode 100644
index 000000000..f58fdc3d2
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_driver.c
@@ -0,0 +1,820 @@
+/*
+ * Copyright (c) 2013 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/module.h>
+#include <linux/prefetch.h>
+
+#include "qib.h"
+
+/*
+ * The size has to be longer than this string, so we can append
+ * board/chip information to it in the init code.
+ */
+const char ib_qib_version[] = QIB_DRIVER_VERSION "\n";
+
+DEFINE_SPINLOCK(qib_devs_lock);
+LIST_HEAD(qib_dev_list);
+DEFINE_MUTEX(qib_mutex); /* general driver use */
+
+unsigned qib_ibmtu;
+module_param_named(ibmtu, qib_ibmtu, uint, S_IRUGO);
+MODULE_PARM_DESC(ibmtu, "Set max IB MTU (0=2KB, 1=256, 2=512, ... 5=4096)");
+
+unsigned qib_compat_ddr_negotiate = 1;
+module_param_named(compat_ddr_negotiate, qib_compat_ddr_negotiate, uint,
+ S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(compat_ddr_negotiate,
+ "Attempt pre-IBTA 1.2 DDR speed negotiation");
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Intel <ibsupport@intel.com>");
+MODULE_DESCRIPTION("Intel IB driver");
+MODULE_VERSION(QIB_DRIVER_VERSION);
+
+/*
+ * QIB_PIO_MAXIBHDR is the max IB header size allowed for in our
+ * PIO send buffers. This is well beyond anything currently
+ * defined in the InfiniBand spec.
+ */
+#define QIB_PIO_MAXIBHDR 128
+
+/*
+ * QIB_MAX_PKT_RECV is the max # of packets processed per receive interrupt.
+ */
+#define QIB_MAX_PKT_RECV 64
+
+struct qlogic_ib_stats qib_stats;
+
+const char *qib_get_unit_name(int unit)
+{
+ static char iname[16];
+
+ snprintf(iname, sizeof(iname), "infinipath%u", unit);
+ return iname;
+}
+
+/*
+ * Return count of units with at least one port ACTIVE.
+ */
+int qib_count_active_units(void)
+{
+ struct qib_devdata *dd;
+ struct qib_pportdata *ppd;
+ unsigned long flags;
+ int pidx, nunits_active = 0;
+
+ spin_lock_irqsave(&qib_devs_lock, flags);
+ list_for_each_entry(dd, &qib_dev_list, list) {
+ if (!(dd->flags & QIB_PRESENT) || !dd->kregbase)
+ continue;
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+ if (ppd->lid && (ppd->lflags & (QIBL_LINKINIT |
+ QIBL_LINKARMED | QIBL_LINKACTIVE))) {
+ nunits_active++;
+ break;
+ }
+ }
+ }
+ spin_unlock_irqrestore(&qib_devs_lock, flags);
+ return nunits_active;
+}
+
+/*
+ * Return the count of all units, optionally returning in the arguments
+ * the number of usable (present) units, and the number of
+ * ports that are up.
+ */
+int qib_count_units(int *npresentp, int *nupp)
+{
+ int nunits = 0, npresent = 0, nup = 0;
+ struct qib_devdata *dd;
+ unsigned long flags;
+ int pidx;
+ struct qib_pportdata *ppd;
+
+ spin_lock_irqsave(&qib_devs_lock, flags);
+
+ list_for_each_entry(dd, &qib_dev_list, list) {
+ nunits++;
+ if ((dd->flags & QIB_PRESENT) && dd->kregbase)
+ npresent++;
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+ if (ppd->lid && (ppd->lflags & (QIBL_LINKINIT |
+ QIBL_LINKARMED | QIBL_LINKACTIVE)))
+ nup++;
+ }
+ }
+
+ spin_unlock_irqrestore(&qib_devs_lock, flags);
+
+ if (npresentp)
+ *npresentp = npresent;
+ if (nupp)
+ *nupp = nup;
+
+ return nunits;
+}
+
+/**
+ * qib_wait_linkstate - wait for an IB link state change to occur
+ * @ppd: the qlogic_ib per-port data
+ * @state: the state to wait for
+ * @msecs: the number of milliseconds to wait
+ *
+ * Wait up to msecs milliseconds for an IB link state change to occur.
+ * For now, take the easy polling route. Currently used only by
+ * qib_set_linkstate. Returns 0 if the state is reached, otherwise
+ * -ETIMEDOUT. The state can have multiple bits set, to allow for any
+ * of several transitions.
+ */
+int qib_wait_linkstate(struct qib_pportdata *ppd, u32 state, int msecs)
+{
+ int ret;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ if (ppd->state_wanted) {
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ ret = -EBUSY;
+ goto bail;
+ }
+ ppd->state_wanted = state;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ wait_event_interruptible_timeout(ppd->state_wait,
+ (ppd->lflags & state),
+ msecs_to_jiffies(msecs));
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->state_wanted = 0;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+
+ if (!(ppd->lflags & state))
+ ret = -ETIMEDOUT;
+ else
+ ret = 0;
+bail:
+ return ret;
+}
+
+int qib_set_linkstate(struct qib_pportdata *ppd, u8 newstate)
+{
+ u32 lstate;
+ int ret;
+ struct qib_devdata *dd = ppd->dd;
+ unsigned long flags;
+
+ switch (newstate) {
+ case QIB_IB_LINKDOWN_ONLY:
+ dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE,
+ IB_LINKCMD_DOWN | IB_LINKINITCMD_NOP);
+ /* don't wait */
+ ret = 0;
+ goto bail;
+
+ case QIB_IB_LINKDOWN:
+ dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE,
+ IB_LINKCMD_DOWN | IB_LINKINITCMD_POLL);
+ /* don't wait */
+ ret = 0;
+ goto bail;
+
+ case QIB_IB_LINKDOWN_SLEEP:
+ dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE,
+ IB_LINKCMD_DOWN | IB_LINKINITCMD_SLEEP);
+ /* don't wait */
+ ret = 0;
+ goto bail;
+
+ case QIB_IB_LINKDOWN_DISABLE:
+ dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE,
+ IB_LINKCMD_DOWN | IB_LINKINITCMD_DISABLE);
+ /* don't wait */
+ ret = 0;
+ goto bail;
+
+ case QIB_IB_LINKARM:
+ if (ppd->lflags & QIBL_LINKARMED) {
+ ret = 0;
+ goto bail;
+ }
+ if (!(ppd->lflags & (QIBL_LINKINIT | QIBL_LINKACTIVE))) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ /*
+ * Since the port can be ACTIVE when we ask for ARMED,
+ * clear QIBL_LINKV so we can wait for a transition.
+ * If the link isn't ARMED, then something else happened
+ * and there is no point waiting for ARMED.
+ */
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags &= ~QIBL_LINKV;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE,
+ IB_LINKCMD_ARMED | IB_LINKINITCMD_NOP);
+ lstate = QIBL_LINKV;
+ break;
+
+ case QIB_IB_LINKACTIVE:
+ if (ppd->lflags & QIBL_LINKACTIVE) {
+ ret = 0;
+ goto bail;
+ }
+ if (!(ppd->lflags & QIBL_LINKARMED)) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LSTATE,
+ IB_LINKCMD_ACTIVE | IB_LINKINITCMD_NOP);
+ lstate = QIBL_LINKACTIVE;
+ break;
+
+ default:
+ ret = -EINVAL;
+ goto bail;
+ }
+ ret = qib_wait_linkstate(ppd, lstate, 10);
+
+bail:
+ return ret;
+}
+
+/*
+ * Get address of eager buffer from its index (allocated in chunks, not
+ * contiguous).
+ */
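+/*
+ * Illustrative example: with 8 buffers per chunk
+ * (rcvegrbufs_perchunk_shift == 3), eager index 13 selects chunk 1,
+ * entry 5 within that chunk.
+ */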
+static inline void *qib_get_egrbuf(const struct qib_ctxtdata *rcd, u32 etail)
+{
+ const u32 chunk = etail >> rcd->rcvegrbufs_perchunk_shift;
+ const u32 idx = etail & ((u32)rcd->rcvegrbufs_perchunk - 1);
+
+ return rcd->rcvegrbuf[chunk] + (idx << rcd->dd->rcvegrbufsize_shift);
+}
+
+/*
+ * Returns 1 if error was a CRC, else 0.
+ * Needed for some chips' synthesized error counters.
+ */
+static u32 qib_rcv_hdrerr(struct qib_ctxtdata *rcd, struct qib_pportdata *ppd,
+ u32 ctxt, u32 eflags, u32 l, u32 etail,
+ __le32 *rhf_addr, struct qib_message_header *rhdr)
+{
+ u32 ret = 0;
+
+ if (eflags & (QLOGIC_IB_RHF_H_ICRCERR | QLOGIC_IB_RHF_H_VCRCERR))
+ ret = 1;
+ else if (eflags == QLOGIC_IB_RHF_H_TIDERR) {
+ /* For TIDERR and RC QPs, preemptively schedule a NAK */
+ struct qib_ib_header *hdr = (struct qib_ib_header *) rhdr;
+ struct qib_other_headers *ohdr = NULL;
+ struct qib_ibport *ibp = &ppd->ibport_data;
+ struct qib_qp *qp = NULL;
+ u32 tlen = qib_hdrget_length_in_bytes(rhf_addr);
+ u16 lid = be16_to_cpu(hdr->lrh[1]);
+ int lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+ u32 qp_num;
+ u32 opcode;
+ u32 psn;
+ int diff;
+
+ /* Sanity check packet */
+ if (tlen < 24)
+ goto drop;
+
+ if (lid < QIB_MULTICAST_LID_BASE) {
+ lid &= ~((1 << ppd->lmc) - 1);
+ if (unlikely(lid != ppd->lid))
+ goto drop;
+ }
+
+ /* Check for GRH */
+ if (lnh == QIB_LRH_BTH)
+ ohdr = &hdr->u.oth;
+ else if (lnh == QIB_LRH_GRH) {
+ u32 vtf;
+
+ ohdr = &hdr->u.l.oth;
+ if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR)
+ goto drop;
+ vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow);
+ if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
+ goto drop;
+ } else
+ goto drop;
+
+ /* Get opcode and PSN from packet */
+ opcode = be32_to_cpu(ohdr->bth[0]);
+ opcode >>= 24;
+ psn = be32_to_cpu(ohdr->bth[2]);
+
+ /* Get the destination QP number. */
+ qp_num = be32_to_cpu(ohdr->bth[1]) & QIB_QPN_MASK;
+ if (qp_num != QIB_MULTICAST_QPN) {
+ int ruc_res;
+
+ qp = qib_lookup_qpn(ibp, qp_num);
+ if (!qp)
+ goto drop;
+
+ /*
+ * Handle only RC QPs - for other QP types drop error
+ * packet.
+ */
+ spin_lock(&qp->r_lock);
+
+ /* Check for valid receive state. */
+ if (!(ib_qib_state_ops[qp->state] &
+ QIB_PROCESS_RECV_OK)) {
+ ibp->n_pkt_drops++;
+ goto unlock;
+ }
+
+ switch (qp->ibqp.qp_type) {
+ case IB_QPT_RC:
+ ruc_res =
+ qib_ruc_check_hdr(
+ ibp, hdr,
+ lnh == QIB_LRH_GRH,
+ qp,
+ be32_to_cpu(ohdr->bth[0]));
+ if (ruc_res)
+ goto unlock;
+
+ /* Only deal with RDMA Writes for now */
+ if (opcode <
+ IB_OPCODE_RC_RDMA_READ_RESPONSE_FIRST) {
+ diff = qib_cmp24(psn, qp->r_psn);
+ if (!qp->r_nak_state && diff >= 0) {
+ ibp->n_rc_seqnak++;
+ qp->r_nak_state =
+ IB_NAK_PSN_ERROR;
+ /* Use the expected PSN. */
+ qp->r_ack_psn = qp->r_psn;
+ /*
+ * Wait to send the sequence
+ * NAK until all packets
+ * in the receive queue have
+ * been processed.
+ * Otherwise, we end up
+ * propagating congestion.
+ */
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |=
+ QIB_R_RSP_NAK;
+ atomic_inc(
+ &qp->refcount);
+ list_add_tail(
+ &qp->rspwait,
+ &rcd->qp_wait_list);
+ }
+ } /* Out of sequence NAK */
+ } /* QP Request NAKs */
+ break;
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ case IB_QPT_UD:
+ case IB_QPT_UC:
+ default:
+ /* For now don't handle any other QP types */
+ break;
+ }
+
+unlock:
+ spin_unlock(&qp->r_lock);
+ /*
+ * Notify qib_destroy_qp() if it is waiting
+ * for us to finish.
+ */
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ } /* Unicast QP */
+ } /* Valid packet with TIDErr */
+
+drop:
+ return ret;
+}
+
+/*
+ * qib_kreceive - receive a packet
+ * @rcd: the qlogic_ib context
+ * @llic: gets count of good packets needed to clear lli,
+ * (used with chips that need to track crcs for lli)
+ *
+ * Called from the interrupt handler for errors or receive interrupt.
+ * Returns the number of CRC error packets, needed by some chips for
+ * local link integrity tracking. CRCs are adjusted down by following
+ * good packets, if any, and the count of good packets is also tracked.
+ */
+u32 qib_kreceive(struct qib_ctxtdata *rcd, u32 *llic, u32 *npkts)
+{
+ struct qib_devdata *dd = rcd->dd;
+ struct qib_pportdata *ppd = rcd->ppd;
+ __le32 *rhf_addr;
+ void *ebuf;
+ const u32 rsize = dd->rcvhdrentsize; /* words */
+ const u32 maxcnt = dd->rcvhdrcnt * rsize; /* words */
+ u32 etail = -1, l, hdrqtail;
+ struct qib_message_header *hdr;
+ u32 eflags, etype, tlen, i = 0, updegr = 0, crcs = 0;
+ int last;
+ u64 lval;
+ struct qib_qp *qp, *nqp;
+
+ l = rcd->head;
+ rhf_addr = (__le32 *) rcd->rcvhdrq + l + dd->rhf_offset;
+ if (dd->flags & QIB_NODMA_RTAIL) {
+ u32 seq = qib_hdrget_seq(rhf_addr);
+
+ if (seq != rcd->seq_cnt)
+ goto bail;
+ hdrqtail = 0;
+ } else {
+ hdrqtail = qib_get_rcvhdrtail(rcd);
+ if (l == hdrqtail)
+ goto bail;
+ smp_rmb(); /* prevent speculative reads of dma'ed hdrq */
+ }
+
+ for (last = 0, i = 1; !last; i += !last) {
+ hdr = dd->f_get_msgheader(dd, rhf_addr);
+ eflags = qib_hdrget_err_flags(rhf_addr);
+ etype = qib_hdrget_rcv_type(rhf_addr);
+ /* total length */
+ tlen = qib_hdrget_length_in_bytes(rhf_addr);
+ ebuf = NULL;
+ if ((dd->flags & QIB_NODMA_RTAIL) ?
+ qib_hdrget_use_egr_buf(rhf_addr) :
+ (etype != RCVHQ_RCV_TYPE_EXPECTED)) {
+ etail = qib_hdrget_index(rhf_addr);
+ updegr = 1;
+ if (tlen > sizeof(*hdr) ||
+ etype >= RCVHQ_RCV_TYPE_NON_KD) {
+ ebuf = qib_get_egrbuf(rcd, etail);
+ prefetch_range(ebuf, tlen - sizeof(*hdr));
+ }
+ }
+ if (!eflags) {
+ u16 lrh_len = be16_to_cpu(hdr->lrh[2]) << 2;
+
+ if (lrh_len != tlen) {
+ qib_stats.sps_lenerrs++;
+ goto move_along;
+ }
+ }
+ if (etype == RCVHQ_RCV_TYPE_NON_KD && !eflags &&
+ ebuf == NULL &&
+ tlen > (dd->rcvhdrentsize - 2 + 1 -
+ qib_hdrget_offset(rhf_addr)) << 2) {
+ goto move_along;
+ }
+
+ /*
+ * Both tiderr and qibhdrerr are set for all plain IB
+ * packets; only qibhdrerr should be set.
+ */
+ if (unlikely(eflags))
+ crcs += qib_rcv_hdrerr(rcd, ppd, rcd->ctxt, eflags, l,
+ etail, rhf_addr, hdr);
+ else if (etype == RCVHQ_RCV_TYPE_NON_KD) {
+ qib_ib_rcv(rcd, hdr, ebuf, tlen);
+ if (crcs)
+ crcs--;
+ else if (llic && *llic)
+ --*llic;
+ }
+move_along:
+ l += rsize;
+ if (l >= maxcnt)
+ l = 0;
+ if (i == QIB_MAX_PKT_RECV)
+ last = 1;
+
+ rhf_addr = (__le32 *) rcd->rcvhdrq + l + dd->rhf_offset;
+ if (dd->flags & QIB_NODMA_RTAIL) {
+ u32 seq = qib_hdrget_seq(rhf_addr);
+
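+ /* expected sequence count cycles 1..13; advance our copy before comparing */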
+ if (++rcd->seq_cnt > 13)
+ rcd->seq_cnt = 1;
+ if (seq != rcd->seq_cnt)
+ last = 1;
+ } else if (l == hdrqtail)
+ last = 1;
+ /*
+ * Update head regs etc., every 16 packets, if not last pkt,
+ * to help prevent rcvhdrq overflows, when many packets
+ * are processed and queue is nearly full.
+ * Don't request an interrupt for intermediate updates.
+ */
+ lval = l;
+ if (!last && !(i & 0xf)) {
+ dd->f_update_usrhead(rcd, lval, updegr, etail, i);
+ updegr = 0;
+ }
+ }
+ /*
+ * Notify qib_destroy_qp() if it is waiting
+ * for lookaside_qp to finish.
+ */
+ if (rcd->lookaside_qp) {
+ if (atomic_dec_and_test(&rcd->lookaside_qp->refcount))
+ wake_up(&rcd->lookaside_qp->wait);
+ rcd->lookaside_qp = NULL;
+ }
+
+ rcd->head = l;
+
+ /*
+ * Iterate over all QPs waiting to respond.
+ * The list won't change since the IRQ is only run on one CPU.
+ */
+ list_for_each_entry_safe(qp, nqp, &rcd->qp_wait_list, rspwait) {
+ list_del_init(&qp->rspwait);
+ if (qp->r_flags & QIB_R_RSP_NAK) {
+ qp->r_flags &= ~QIB_R_RSP_NAK;
+ qib_send_rc_ack(qp);
+ }
+ if (qp->r_flags & QIB_R_RSP_SEND) {
+ unsigned long flags;
+
+ qp->r_flags &= ~QIB_R_RSP_SEND;
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (ib_qib_state_ops[qp->state] &
+ QIB_PROCESS_OR_FLUSH_SEND)
+ qib_schedule_send(qp);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ }
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ }
+
+bail:
+ /* Report number of packets consumed */
+ if (npkts)
+ *npkts = i;
+
+ /*
+ * Always write head at end, and setup rcv interrupt, even
+ * if no packets were processed.
+ */
+ lval = (u64)rcd->head | dd->rhdrhead_intr_off;
+ dd->f_update_usrhead(rcd, lval, updegr, etail, i);
+ return crcs;
+}
+
+/**
+ * qib_set_mtu - set the MTU
+ * @ppd: the perport data
+ * @arg: the new MTU
+ *
+ * We can handle "any" incoming size, the issue here is whether we
+ * need to restrict our outgoing size. For now, we don't do any
+ * sanity checking on this, and we don't deal with what happens to
+ * programs that are already running when the size changes.
+ * NOTE: changing the MTU will usually cause the IBC to go back to
+ * link INIT state...
+ */
+int qib_set_mtu(struct qib_pportdata *ppd, u16 arg)
+{
+ u32 piosize;
+ int ret, chk;
+
+ if (arg != 256 && arg != 512 && arg != 1024 && arg != 2048 &&
+ arg != 4096) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ chk = ib_mtu_enum_to_int(qib_ibmtu);
+ if (chk > 0 && arg > chk) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ piosize = ppd->ibmaxlen;
+ ppd->ibmtu = arg;
+
+ if (arg >= (piosize - QIB_PIO_MAXIBHDR)) {
+ /* Only if it's not the initial value (or reset to it) */
+ if (piosize != ppd->init_ibmaxlen) {
+ if (arg > piosize && arg <= ppd->init_ibmaxlen)
+ piosize = ppd->init_ibmaxlen - 2 * sizeof(u32);
+ ppd->ibmaxlen = piosize;
+ }
+ } else if ((arg + QIB_PIO_MAXIBHDR) != ppd->ibmaxlen) {
+ piosize = arg + QIB_PIO_MAXIBHDR - 2 * sizeof(u32);
+ ppd->ibmaxlen = piosize;
+ }
+
+ ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_MTU, 0);
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+int qib_set_lid(struct qib_pportdata *ppd, u32 lid, u8 lmc)
+{
+ struct qib_devdata *dd = ppd->dd;
+
+ ppd->lid = lid;
+ ppd->lmc = lmc;
+
+ dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LIDLMC,
+ lid | (~((1U << lmc) - 1)) << 16);
+
+ qib_devinfo(dd->pcidev, "IB%u:%u got a lid: 0x%x\n",
+ dd->unit, ppd->port, lid);
+
+ return 0;
+}
+
+/*
+ * The following routines deal with the "obviously simple" task of overriding
+ * the state of the LEDs, which normally indicate link physical and logical
+ * status. The complications arise in dealing with different hardware mappings
+ * and the board-dependent routine being called from interrupts,
+ * and then there's the requirement to _flash_ them.
+ */
+#define LED_OVER_FREQ_SHIFT 8
+#define LED_OVER_FREQ_MASK (0xFF<<LED_OVER_FREQ_SHIFT)
+/* Below is "non-zero" to force override, but both actual LEDs are off */
+#define LED_OVER_BOTH_OFF (8)
+
+static void qib_run_led_override(unsigned long opaque)
+{
+ struct qib_pportdata *ppd = (struct qib_pportdata *)opaque;
+ struct qib_devdata *dd = ppd->dd;
+ int timeoff;
+ int ph_idx;
+
+ if (!(dd->flags & QIB_INITTED))
+ return;
+
+ ph_idx = ppd->led_override_phase++ & 1;
+ ppd->led_override = ppd->led_override_vals[ph_idx];
+ timeoff = ppd->led_override_timeoff;
+
+ dd->f_setextled(ppd, 1);
+ /*
+ * don't re-fire the timer if user asked for it to be off; we let
+ * it fire one more time after they turn it off, to keep the logic simple.
+ */
+ if (ppd->led_override_vals[0] || ppd->led_override_vals[1])
+ mod_timer(&ppd->led_override_timer, jiffies + timeoff);
+}
+
+void qib_set_led_override(struct qib_pportdata *ppd, unsigned int val)
+{
+ struct qib_devdata *dd = ppd->dd;
+ int timeoff, freq;
+
+ if (!(dd->flags & QIB_INITTED))
+ return;
+
+ /* First check if we are blinking. If not, use 1HZ polling */
+ timeoff = HZ;
+ freq = (val & LED_OVER_FREQ_MASK) >> LED_OVER_FREQ_SHIFT;
+
+ if (freq) {
+ /* For blink, set each phase from one nybble of val */
+ ppd->led_override_vals[0] = val & 0xF;
+ ppd->led_override_vals[1] = (val >> 4) & 0xF;
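+ /* each blink phase lasts (HZ << 4) / freq jiffies, i.e. 16 / freq seconds */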
+ timeoff = (HZ << 4)/freq;
+ } else {
+ /* Non-blink: set both phases the same. */
+ ppd->led_override_vals[0] = val & 0xF;
+ ppd->led_override_vals[1] = val & 0xF;
+ }
+ ppd->led_override_timeoff = timeoff;
+
+ /*
+ * If the timer has not already been started, do so. Use a "quick"
+ * timeout so the function will be called soon, to look at our request.
+ */
+ if (atomic_inc_return(&ppd->led_override_timer_active) == 1) {
+ /* Need to start timer */
+ init_timer(&ppd->led_override_timer);
+ ppd->led_override_timer.function = qib_run_led_override;
+ ppd->led_override_timer.data = (unsigned long) ppd;
+ ppd->led_override_timer.expires = jiffies + 1;
+ add_timer(&ppd->led_override_timer);
+ } else {
+ if (ppd->led_override_vals[0] || ppd->led_override_vals[1])
+ mod_timer(&ppd->led_override_timer, jiffies + 1);
+ atomic_dec(&ppd->led_override_timer_active);
+ }
+}
+
+/**
+ * qib_reset_device - reset the chip if possible
+ * @unit: the device to reset
+ *
+ * Whether or not reset is successful, we attempt to re-initialize the chip
+ * (that is, much like a driver unload/reload). We clear the INITTED flag
+ * so that the various entry points will fail until we reinitialize. For
+ * now, we only allow this if no user contexts are open that use chip resources.
+ */
+int qib_reset_device(int unit)
+{
+ int ret, i;
+ struct qib_devdata *dd = qib_lookup(unit);
+ struct qib_pportdata *ppd;
+ unsigned long flags;
+ int pidx;
+
+ if (!dd) {
+ ret = -ENODEV;
+ goto bail;
+ }
+
+ qib_devinfo(dd->pcidev, "Reset on unit %u requested\n", unit);
+
+ if (!dd->kregbase || !(dd->flags & QIB_PRESENT)) {
+ qib_devinfo(dd->pcidev,
+ "Invalid unit number %u or not initialized or not present\n",
+ unit);
+ ret = -ENXIO;
+ goto bail;
+ }
+
+ spin_lock_irqsave(&dd->uctxt_lock, flags);
+ if (dd->rcd)
+ for (i = dd->first_user_ctxt; i < dd->cfgctxts; i++) {
+ if (!dd->rcd[i] || !dd->rcd[i]->cnt)
+ continue;
+ spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+ ret = -EBUSY;
+ goto bail;
+ }
+ spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+ if (atomic_read(&ppd->led_override_timer_active)) {
+ /* Need to stop LED timer, _then_ shut off LEDs */
+ del_timer_sync(&ppd->led_override_timer);
+ atomic_set(&ppd->led_override_timer_active, 0);
+ }
+
+ /* Shut off LEDs after we are sure timer is not running */
+ ppd->led_override = LED_OVER_BOTH_OFF;
+ dd->f_setextled(ppd, 0);
+ if (dd->flags & QIB_HAS_SEND_DMA)
+ qib_teardown_sdma(ppd);
+ }
+
+ ret = dd->f_reset(dd);
+ if (ret == 1)
+ ret = qib_init(dd, 1);
+ else
+ ret = -EAGAIN;
+ if (ret)
+ qib_dev_err(dd,
+ "Reinitialize unit %u after reset failed with %d\n",
+ unit, ret);
+ else
+ qib_devinfo(dd->pcidev,
+ "Reinitialized unit %u after resetting\n",
+ unit);
+
+bail:
+ return ret;
+}
diff --git a/drivers/infiniband/hw/qib/qib_eeprom.c b/drivers/infiniband/hw/qib/qib_eeprom.c
new file mode 100644
index 000000000..311ee6c3d
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_eeprom.c
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2012 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+
+#include "qib.h"
+
+/*
+ * Functions specific to the serial EEPROM on cards handled by ib_qib.
+ * The actual serial interface code is in qib_twsi.c. This file is a client.
+ */
+
+/**
+ * qib_eeprom_read - receives bytes from the eeprom via I2C
+ * @dd: the qlogic_ib device
+ * @eeprom_offset: address to read from
+ * @buff: where to store result
+ * @len: number of bytes to receive
+ */
+int qib_eeprom_read(struct qib_devdata *dd, u8 eeprom_offset,
+ void *buff, int len)
+{
+ int ret;
+
+ ret = mutex_lock_interruptible(&dd->eep_lock);
+ if (!ret) {
+ ret = qib_twsi_reset(dd);
+ if (ret)
+ qib_dev_err(dd, "EEPROM Reset for read failed\n");
+ else
+ ret = qib_twsi_blk_rd(dd, dd->twsi_eeprom_dev,
+ eeprom_offset, buff, len);
+ mutex_unlock(&dd->eep_lock);
+ }
+
+ return ret;
+}
+
+/*
+ * Actually update the eeprom, first doing write enable if
+ * needed, then restoring write enable state.
+ * Must be called with eep_lock held
+ */
+static int eeprom_write_with_enable(struct qib_devdata *dd, u8 offset,
+ const void *buf, int len)
+{
+ int ret, pwen;
+
+ pwen = dd->f_eeprom_wen(dd, 1);
+ ret = qib_twsi_reset(dd);
+ if (ret)
+ qib_dev_err(dd, "EEPROM Reset for write failed\n");
+ else
+ ret = qib_twsi_blk_wr(dd, dd->twsi_eeprom_dev,
+ offset, buf, len);
+ dd->f_eeprom_wen(dd, pwen);
+ return ret;
+}
+
+/**
+ * qib_eeprom_write - writes data to the eeprom via I2C
+ * @dd: the qlogic_ib device
+ * @eeprom_offset: where to place data
+ * @buff: data to write
+ * @len: number of bytes to write
+ */
+int qib_eeprom_write(struct qib_devdata *dd, u8 eeprom_offset,
+ const void *buff, int len)
+{
+ int ret;
+
+ ret = mutex_lock_interruptible(&dd->eep_lock);
+ if (!ret) {
+ ret = eeprom_write_with_enable(dd, eeprom_offset, buff, len);
+ mutex_unlock(&dd->eep_lock);
+ }
+
+ return ret;
+}
+
+static u8 flash_csum(struct qib_flash *ifp, int adjust)
+{
+ u8 *ip = (u8 *) ifp;
+ u8 csum = 0, len;
+
+ /*
+ * Limit length checksummed to max length of actual data.
+ * Checksum of erased eeprom will still be bad, but we avoid
+ * reading past the end of the buffer we were passed.
+ */
+ len = ifp->if_length;
+ if (len > sizeof(struct qib_flash))
+ len = sizeof(struct qib_flash);
+ while (len--)
+ csum += *ip++;
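+ /* exclude the stored checksum byte from the sum, then take the one's complement */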
+ csum -= ifp->if_csum;
+ csum = ~csum;
+ if (adjust)
+ ifp->if_csum = csum;
+
+ return csum;
+}
+
+/**
+ * qib_get_eeprom_info - get the GUID et al. from the TWSI EEPROM device
+ * @dd: the qlogic_ib device
+ *
+ * We have the capability to use the nguid field, and get
+ * the guid from the first chip's flash, to use for all of them.
+ */
+void qib_get_eeprom_info(struct qib_devdata *dd)
+{
+ void *buf;
+ struct qib_flash *ifp;
+ __be64 guid;
+ int len, eep_stat;
+ u8 csum, *bguid;
+ int t = dd->unit;
+ struct qib_devdata *dd0 = qib_lookup(0);
+
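+ /*
+ * If unit 0's flash carries GUIDs for multiple units, derive this
+ * unit's GUID by adding the unit number to the low octet of the base
+ * GUID, carrying into the higher octets but refusing to wrap into
+ * the OUI.
+ */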
+ if (t && dd0->nguid > 1 && t <= dd0->nguid) {
+ u8 oguid;
+
+ dd->base_guid = dd0->base_guid;
+ bguid = (u8 *) &dd->base_guid;
+
+ oguid = bguid[7];
+ bguid[7] += t;
+ if (oguid > bguid[7]) {
+ if (bguid[6] == 0xff) {
+ if (bguid[5] == 0xff) {
+ qib_dev_err(dd,
+ "Can't set %s GUID from base, wraps to OUI!\n",
+ qib_get_unit_name(t));
+ dd->base_guid = 0;
+ goto bail;
+ }
+ bguid[5]++;
+ }
+ bguid[6]++;
+ }
+ dd->nguid = 1;
+ goto bail;
+ }
+
+ /*
+ * Read full flash, not just currently used part, since it may have
+ * been written with a newer definition.
+ */
+ len = sizeof(struct qib_flash);
+ buf = vmalloc(len);
+ if (!buf) {
+ qib_dev_err(dd,
+ "Couldn't allocate memory to read %u bytes from eeprom for GUID\n",
+ len);
+ goto bail;
+ }
+
+ /*
+ * Use "public" eeprom read function, which does locking and
+ * figures out device. This will migrate to chip-specific.
+ */
+ eep_stat = qib_eeprom_read(dd, 0, buf, len);
+
+ if (eep_stat) {
+ qib_dev_err(dd, "Failed reading GUID from eeprom\n");
+ goto done;
+ }
+ ifp = (struct qib_flash *)buf;
+
+ csum = flash_csum(ifp, 0);
+ if (csum != ifp->if_csum) {
+ qib_devinfo(dd->pcidev,
+ "Bad I2C flash checksum: 0x%x, not 0x%x\n",
+ csum, ifp->if_csum);
+ goto done;
+ }
+ if (*(__be64 *) ifp->if_guid == cpu_to_be64(0) ||
+ *(__be64 *) ifp->if_guid == ~cpu_to_be64(0)) {
+ qib_dev_err(dd,
+ "Invalid GUID %llx from flash; ignoring\n",
+ *(unsigned long long *) ifp->if_guid);
+ /* don't allow GUID if all 0 or all 1's */
+ goto done;
+ }
+
+ /* complain, but allow it */
+ if (*(u64 *) ifp->if_guid == 0x100007511000000ULL)
+ qib_devinfo(dd->pcidev,
+ "Warning, GUID %llx is default, probably not correct!\n",
+ *(unsigned long long *) ifp->if_guid);
+
+ bguid = ifp->if_guid;
+ if (!bguid[0] && !bguid[1] && !bguid[2]) {
+ /*
+ * Original incorrect GUID format in flash; fix in
+ * core copy, by shifting up 2 octets; don't need to
+ * change top octet, since both it and shifted are 0.
+ */
+ bguid[1] = bguid[3];
+ bguid[2] = bguid[4];
+ bguid[3] = 0;
+ bguid[4] = 0;
+ guid = *(__be64 *) ifp->if_guid;
+ } else
+ guid = *(__be64 *) ifp->if_guid;
+ dd->base_guid = guid;
+ dd->nguid = ifp->if_numguid;
+ /*
+ * Things are slightly complicated by the desire to transparently
+ * support both the Pathscale 10-digit serial number and the QLogic
+ * 13-character version.
+ */
+ if ((ifp->if_fversion > 1) && ifp->if_sprefix[0] &&
+ ((u8 *) ifp->if_sprefix)[0] != 0xFF) {
+ char *snp = dd->serial;
+
+ /*
+ * This board has a Serial-prefix, which is stored
+ * elsewhere for backward-compatibility.
+ */
+ memcpy(snp, ifp->if_sprefix, sizeof(ifp->if_sprefix));
+ snp[sizeof(ifp->if_sprefix)] = '\0';
+ len = strlen(snp);
+ snp += len;
+ len = sizeof(dd->serial) - len;
+ if (len > sizeof(ifp->if_serial))
+ len = sizeof(ifp->if_serial);
+ memcpy(snp, ifp->if_serial, len);
+ } else {
+ memcpy(dd->serial, ifp->if_serial, sizeof(ifp->if_serial));
+ }
+ if (!strstr(ifp->if_comment, "Tested successfully"))
+ qib_dev_err(dd,
+ "Board SN %s did not pass functional test: %s\n",
+ dd->serial, ifp->if_comment);
+
+done:
+ vfree(buf);
+
+bail:;
+}
+
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
new file mode 100644
index 000000000..725881890
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -0,0 +1,2418 @@
+/*
+ * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/poll.h>
+#include <linux/cdev.h>
+#include <linux/swap.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/jiffies.h>
+#include <asm/pgtable.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/uio.h>
+
+#include "qib.h"
+#include "qib_common.h"
+#include "qib_user_sdma.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) QIB_DRV_NAME ": " fmt
+
+static int qib_open(struct inode *, struct file *);
+static int qib_close(struct inode *, struct file *);
+static ssize_t qib_write(struct file *, const char __user *, size_t, loff_t *);
+static ssize_t qib_write_iter(struct kiocb *, struct iov_iter *);
+static unsigned int qib_poll(struct file *, struct poll_table_struct *);
+static int qib_mmapf(struct file *, struct vm_area_struct *);
+
+/*
+ * This is really, really weird shit - write() and writev() here
+ * have completely unrelated semantics. Sucky userland ABI,
+ * film at 11.
+ */
+static const struct file_operations qib_file_ops = {
+ .owner = THIS_MODULE,
+ .write = qib_write,
+ .write_iter = qib_write_iter,
+ .open = qib_open,
+ .release = qib_close,
+ .poll = qib_poll,
+ .mmap = qib_mmapf,
+ .llseek = noop_llseek,
+};
+
+/*
+ * Convert kernel virtual addresses to physical addresses so they don't
+ * potentially conflict with the chip addresses used as mmap offsets.
+ * It doesn't really matter what mmap offset we use as long as we can
+ * interpret it correctly.
+ */
+static u64 cvt_kvaddr(void *p)
+{
+ struct page *page;
+ u64 paddr = 0;
+
+ page = vmalloc_to_page(p);
+ if (page)
+ paddr = page_to_pfn(page) << PAGE_SHIFT;
+
+ return paddr;
+}
+
+static int qib_get_base_info(struct file *fp, void __user *ubase,
+ size_t ubase_size)
+{
+ struct qib_ctxtdata *rcd = ctxt_fp(fp);
+ int ret = 0;
+ struct qib_base_info *kinfo = NULL;
+ struct qib_devdata *dd = rcd->dd;
+ struct qib_pportdata *ppd = rcd->ppd;
+ unsigned subctxt_cnt;
+ int shared, master;
+ size_t sz;
+
+ subctxt_cnt = rcd->subctxt_cnt;
+ if (!subctxt_cnt) {
+ shared = 0;
+ master = 0;
+ subctxt_cnt = 1;
+ } else {
+ shared = 1;
+ master = !subctxt_fp(fp);
+ }
+
+ sz = sizeof(*kinfo);
+ /* If context sharing is not requested, allow the old size structure */
+ if (!shared)
+ sz -= 7 * sizeof(u64);
+ if (ubase_size < sz) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ kinfo = kzalloc(sizeof(*kinfo), GFP_KERNEL);
+ if (kinfo == NULL) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ ret = dd->f_get_base_info(rcd, kinfo);
+ if (ret < 0)
+ goto bail;
+
+ kinfo->spi_rcvhdr_cnt = dd->rcvhdrcnt;
+ kinfo->spi_rcvhdrent_size = dd->rcvhdrentsize;
+ kinfo->spi_tidegrcnt = rcd->rcvegrcnt;
+ kinfo->spi_rcv_egrbufsize = dd->rcvegrbufsize;
+ /*
+ * have to mmap whole thing
+ */
+ kinfo->spi_rcv_egrbuftotlen =
+ rcd->rcvegrbuf_chunks * rcd->rcvegrbuf_size;
+ kinfo->spi_rcv_egrperchunk = rcd->rcvegrbufs_perchunk;
+ kinfo->spi_rcv_egrchunksize = kinfo->spi_rcv_egrbuftotlen /
+ rcd->rcvegrbuf_chunks;
+ kinfo->spi_tidcnt = dd->rcvtidcnt / subctxt_cnt;
+ if (master)
+ kinfo->spi_tidcnt += dd->rcvtidcnt % subctxt_cnt;
+ /*
+ * for this use, may be cfgctxts summed over all chips that
+ * are configured and present
+ */
+ kinfo->spi_nctxts = dd->cfgctxts;
+ /* unit (chip/board) our context is on */
+ kinfo->spi_unit = dd->unit;
+ kinfo->spi_port = ppd->port;
+ /* for now, only a single page */
+ kinfo->spi_tid_maxsize = PAGE_SIZE;
+
+ /*
+ * Doing this per context, and based on the skip value, etc. This has
+ * to be the actual buffer size, since the protocol code treats it
+ * as an array.
+ *
+ * These have to be set to user addresses in the user code via mmap.
+ * These values are used on return to user code for the mmap target
+ * addresses only. For 32 bit, same 44 bit address problem, so use
+ * the physical address, not virtual. Before 2.6.11, using the
+ * page_address() macro worked, but in 2.6.11, even that returns the
+ * full 64 bit address (upper bits all 1's). So far, using the
+ * physical addresses (or chip offsets, for chip mapping) works, but
+ * no doubt some future kernel release will change that, and we'll be
+ * on to yet another method of dealing with this.
+ * Normally only one of rcvhdr_tailaddr or rhf_offset is useful
+ * since the chips with non-zero rhf_offset don't normally
+ * enable tail register updates to host memory, but for testing,
+ * both can be enabled and used.
+ */
+ kinfo->spi_rcvhdr_base = (u64) rcd->rcvhdrq_phys;
+ kinfo->spi_rcvhdr_tailaddr = (u64) rcd->rcvhdrqtailaddr_phys;
+ kinfo->spi_rhf_offset = dd->rhf_offset;
+ kinfo->spi_rcv_egrbufs = (u64) rcd->rcvegr_phys;
+ kinfo->spi_pioavailaddr = (u64) dd->pioavailregs_phys;
+ /* setup per-unit (not port) status area for user programs */
+ kinfo->spi_status = (u64) kinfo->spi_pioavailaddr +
+ (char *) ppd->statusp -
+ (char *) dd->pioavailregs_dma;
+ kinfo->spi_uregbase = (u64) dd->uregbase + dd->ureg_align * rcd->ctxt;
+ if (!shared) {
+ kinfo->spi_piocnt = rcd->piocnt;
+ kinfo->spi_piobufbase = (u64) rcd->piobufs;
+ kinfo->spi_sendbuf_status = cvt_kvaddr(rcd->user_event_mask);
+ } else if (master) {
+ kinfo->spi_piocnt = (rcd->piocnt / subctxt_cnt) +
+ (rcd->piocnt % subctxt_cnt);
+ /* Master's PIO buffers are after all the slave's */
+ kinfo->spi_piobufbase = (u64) rcd->piobufs +
+ dd->palign *
+ (rcd->piocnt - kinfo->spi_piocnt);
+ } else {
+ unsigned slave = subctxt_fp(fp) - 1;
+
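+ /* each slave sub-context gets an equal share, offset by its index */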
+ kinfo->spi_piocnt = rcd->piocnt / subctxt_cnt;
+ kinfo->spi_piobufbase = (u64) rcd->piobufs +
+ dd->palign * kinfo->spi_piocnt * slave;
+ }
+
+ if (shared) {
+ kinfo->spi_sendbuf_status =
+ cvt_kvaddr(&rcd->user_event_mask[subctxt_fp(fp)]);
+ /* only spi_subctxt_* fields should be set in this block! */
+ kinfo->spi_subctxt_uregbase = cvt_kvaddr(rcd->subctxt_uregbase);
+
+ kinfo->spi_subctxt_rcvegrbuf =
+ cvt_kvaddr(rcd->subctxt_rcvegrbuf);
+ kinfo->spi_subctxt_rcvhdr_base =
+ cvt_kvaddr(rcd->subctxt_rcvhdr_base);
+ }
+
+ /*
+ * All user buffers are 2KB buffers. If we ever support
+ * giving 4KB buffers to user processes, this will need some
+ * work. Can't use piobufbase directly, because it has
+ * both 2K and 4K buffer base values.
+ */
+ kinfo->spi_pioindex = (kinfo->spi_piobufbase - dd->pio2k_bufbase) /
+ dd->palign;
+ kinfo->spi_pioalign = dd->palign;
+ kinfo->spi_qpair = QIB_KD_QP;
+ /*
+ * user mode PIO buffers are always 2KB, even when 4KB can
+ * be received, and sent via the kernel; this is ibmaxlen
+ * for 2K MTU.
+ */
+ kinfo->spi_piosize = dd->piosize2k - 2 * sizeof(u32);
+ kinfo->spi_mtu = ppd->ibmaxlen; /* maxlen, not ibmtu */
+ kinfo->spi_ctxt = rcd->ctxt;
+ kinfo->spi_subctxt = subctxt_fp(fp);
+ kinfo->spi_sw_version = QIB_KERN_SWVERSION;
+ kinfo->spi_sw_version |= 1U << 31; /* QLogic-built, not kernel.org */
+ kinfo->spi_hw_version = dd->revision;
+
+ if (master)
+ kinfo->spi_runtime_flags |= QIB_RUNTIME_MASTER;
+
+ sz = (ubase_size < sizeof(*kinfo)) ? ubase_size : sizeof(*kinfo);
+ if (copy_to_user(ubase, kinfo, sz))
+ ret = -EFAULT;
+bail:
+ kfree(kinfo);
+ return ret;
+}
+
+/**
+ * qib_tid_update - update a context TID
+ * @rcd: the context
+ * @fp: the qib device file
+ * @ti: the TID information
+ *
+ * The new implementation as of Oct 2004 is that the driver assigns
+ * the tid and returns it to the caller. To reduce search time, we
+ * keep a cursor for each context, walking the shadow tid array to find
+ * one that's not in use.
+ *
+ * For now, if we can't allocate the full list, we fail, although
+ * in the long run, we'll allocate as many as we can, and the
+ * caller will deal with that by trying the remaining pages later.
+ * That means that when we fail, we have to mark the tids as not in
+ * use again, in our shadow copy.
+ *
+ * It's up to the caller to free the tids when they are done.
+ * We'll unlock the pages as they free them.
+ *
+ * Also, right now we are locking one page at a time, but since
+ * the intended use of this routine is for a single group of
+ * virtually contiguous pages, that should change to improve
+ * performance.
+ */
+static int qib_tid_update(struct qib_ctxtdata *rcd, struct file *fp,
+ const struct qib_tid_info *ti)
+{
+ int ret = 0, ntids;
+ u32 tid, ctxttid, cnt, i, tidcnt, tidoff;
+ u16 *tidlist;
+ struct qib_devdata *dd = rcd->dd;
+ u64 physaddr;
+ unsigned long vaddr;
+ u64 __iomem *tidbase;
+ unsigned long tidmap[8];
+ struct page **pagep = NULL;
+ unsigned subctxt = subctxt_fp(fp);
+
+ if (!dd->pageshadow) {
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ cnt = ti->tidcnt;
+ if (!cnt) {
+ ret = -EFAULT;
+ goto done;
+ }
+ ctxttid = rcd->ctxt * dd->rcvtidcnt;
+ if (!rcd->subctxt_cnt) {
+ tidcnt = dd->rcvtidcnt;
+ tid = rcd->tidcursor;
+ tidoff = 0;
+ } else if (!subctxt) {
+ tidcnt = (dd->rcvtidcnt / rcd->subctxt_cnt) +
+ (dd->rcvtidcnt % rcd->subctxt_cnt);
+ tidoff = dd->rcvtidcnt - tidcnt;
+ ctxttid += tidoff;
+ tid = tidcursor_fp(fp);
+ } else {
+ tidcnt = dd->rcvtidcnt / rcd->subctxt_cnt;
+ tidoff = tidcnt * (subctxt - 1);
+ ctxttid += tidoff;
+ tid = tidcursor_fp(fp);
+ }
+ if (cnt > tidcnt) {
+ /* make sure it all fits in tid_pg_list */
+ qib_devinfo(dd->pcidev,
+ "Process tried to allocate %u TIDs, only trying max (%u)\n",
+ cnt, tidcnt);
+ cnt = tidcnt;
+ }
+ pagep = (struct page **) rcd->tid_pg_list;
+ tidlist = (u16 *) &pagep[dd->rcvtidcnt];
+ pagep += tidoff;
+ tidlist += tidoff;
+
+ memset(tidmap, 0, sizeof(tidmap));
+ /* before decrement; chip actual # */
+ ntids = tidcnt;
+ tidbase = (u64 __iomem *) (((char __iomem *) dd->kregbase) +
+ dd->rcvtidbase +
+ ctxttid * sizeof(*tidbase));
+
+ /* virtual address of first page in transfer */
+ vaddr = ti->tidvaddr;
+ if (!access_ok(VERIFY_WRITE, (void __user *) vaddr,
+ cnt * PAGE_SIZE)) {
+ ret = -EFAULT;
+ goto done;
+ }
+ ret = qib_get_user_pages(vaddr, cnt, pagep);
+ if (ret) {
+ /*
+ * if (ret == -EBUSY)
+ * We can't continue because the pagep array won't be
+ * initialized. This should never happen,
+ * unless perhaps the user has mpin'ed the pages
+ * themselves.
+ */
+ qib_devinfo(
+ dd->pcidev,
+ "Failed to lock addr %p, %u pages: errno %d\n",
+ (void *) vaddr, cnt, -ret);
+ goto done;
+ }
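+ /*
+ * For each locked page, scan forward from the cursor for a free TID
+ * slot, record it in the shadow arrays, and program it into the chip.
+ */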
+ for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) {
+ for (; ntids--; tid++) {
+ if (tid == tidcnt)
+ tid = 0;
+ if (!dd->pageshadow[ctxttid + tid])
+ break;
+ }
+ if (ntids < 0) {
+ /*
+ * Oops, wrapped all the way through their TIDs,
+ * and didn't have enough free; see comments at
+ * start of routine
+ */
+ i--; /* last tidlist[i] not filled in */
+ ret = -ENOMEM;
+ break;
+ }
+ tidlist[i] = tid + tidoff;
+ /* we "know" system pages and TID pages are same size */
+ dd->pageshadow[ctxttid + tid] = pagep[i];
+ dd->physshadow[ctxttid + tid] =
+ qib_map_page(dd->pcidev, pagep[i], 0, PAGE_SIZE,
+ PCI_DMA_FROMDEVICE);
+ /*
+ * don't need atomic or it's overhead
+ */
+ __set_bit(tid, tidmap);
+ physaddr = dd->physshadow[ctxttid + tid];
+ /* PERFORMANCE: below should almost certainly be cached */
+ dd->f_put_tid(dd, &tidbase[tid],
+ RCVHQ_RCV_TYPE_EXPECTED, physaddr);
+ /*
+ * don't check this tid in qib_ctxtshadow, since we
+ * just filled it in; start with the next one.
+ */
+ tid++;
+ }
+
+ if (ret) {
+ u32 limit;
+cleanup:
+ /* jump here if copy out of updated info failed... */
+ /* same code that's in qib_free_tid() */
+ limit = sizeof(tidmap) * BITS_PER_BYTE;
+ if (limit > tidcnt)
+ /* just in case size changes in future */
+ limit = tidcnt;
+ tid = find_first_bit((const unsigned long *)tidmap, limit);
+ for (; tid < limit; tid++) {
+ if (!test_bit(tid, tidmap))
+ continue;
+ if (dd->pageshadow[ctxttid + tid]) {
+ dma_addr_t phys;
+
+ phys = dd->physshadow[ctxttid + tid];
+ dd->physshadow[ctxttid + tid] = dd->tidinvalid;
+ /* PERFORMANCE: below should almost certainly
+ * be cached
+ */
+ dd->f_put_tid(dd, &tidbase[tid],
+ RCVHQ_RCV_TYPE_EXPECTED,
+ dd->tidinvalid);
+ pci_unmap_page(dd->pcidev, phys, PAGE_SIZE,
+ PCI_DMA_FROMDEVICE);
+ dd->pageshadow[ctxttid + tid] = NULL;
+ }
+ }
+ qib_release_user_pages(pagep, cnt);
+ } else {
+ /*
+ * Copy the updated array, with qib_tid's filled in, back
+ * to user. Since we did the copy in already, this "should
+ * never fail" If it does, we have to clean up...
+ */
+ if (copy_to_user((void __user *)
+ (unsigned long) ti->tidlist,
+ tidlist, cnt * sizeof(*tidlist))) {
+ ret = -EFAULT;
+ goto cleanup;
+ }
+ if (copy_to_user((void __user *) (unsigned long) ti->tidmap,
+ tidmap, sizeof(tidmap))) {
+ ret = -EFAULT;
+ goto cleanup;
+ }
+ if (tid == tidcnt)
+ tid = 0;
+ if (!rcd->subctxt_cnt)
+ rcd->tidcursor = tid;
+ else
+ tidcursor_fp(fp) = tid;
+ }
+
+done:
+ return ret;
+}
+
+/**
+ * qib_tid_free - free a context TID
+ * @rcd: the context
+ * @subctxt: the subcontext
+ * @ti: the TID info
+ *
+ * right now we are unlocking one page at a time, but since
+ * the intended use of this routine is for a single group of
+ * virtually contiguous pages, that should change to improve
+ * performance. We check that the TID is in range for this context
+ * but otherwise don't check validity; if user has an error and
+ * frees the wrong tid, it's only their own data that can thereby
+ * be corrupted. We do check that the TID was in use, for sanity.
+ * We always use our idea of the saved address, not the address that
+ * they pass in to us.
+ */
+static int qib_tid_free(struct qib_ctxtdata *rcd, unsigned subctxt,
+ const struct qib_tid_info *ti)
+{
+ int ret = 0;
+ u32 tid, ctxttid, cnt, limit, tidcnt;
+ struct qib_devdata *dd = rcd->dd;
+ u64 __iomem *tidbase;
+ unsigned long tidmap[8];
+
+ if (!dd->pageshadow) {
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ if (copy_from_user(tidmap, (void __user *)(unsigned long)ti->tidmap,
+ sizeof(tidmap))) {
+ ret = -EFAULT;
+ goto done;
+ }
+
+ ctxttid = rcd->ctxt * dd->rcvtidcnt;
+ if (!rcd->subctxt_cnt)
+ tidcnt = dd->rcvtidcnt;
+ else if (!subctxt) {
+ tidcnt = (dd->rcvtidcnt / rcd->subctxt_cnt) +
+ (dd->rcvtidcnt % rcd->subctxt_cnt);
+ ctxttid += dd->rcvtidcnt - tidcnt;
+ } else {
+ tidcnt = dd->rcvtidcnt / rcd->subctxt_cnt;
+ ctxttid += tidcnt * (subctxt - 1);
+ }
+ tidbase = (u64 __iomem *) ((char __iomem *)(dd->kregbase) +
+ dd->rcvtidbase +
+ ctxttid * sizeof(*tidbase));
+
+ limit = sizeof(tidmap) * BITS_PER_BYTE;
+ if (limit > tidcnt)
+ /* just in case size changes in future */
+ limit = tidcnt;
+ tid = find_first_bit(tidmap, limit);
+ for (cnt = 0; tid < limit; tid++) {
+ /*
+ * small optimization; if we detect a run of 3 or so without
+ * any set, use find_first_bit again. That's mainly to
+ * accelerate the case where we wrapped, so we have some at
+ * the beginning, and some at the end, and a big gap
+ * in the middle.
+ */
+ if (!test_bit(tid, tidmap))
+ continue;
+ cnt++;
+ if (dd->pageshadow[ctxttid + tid]) {
+ struct page *p;
+ dma_addr_t phys;
+
+ p = dd->pageshadow[ctxttid + tid];
+ dd->pageshadow[ctxttid + tid] = NULL;
+ phys = dd->physshadow[ctxttid + tid];
+ dd->physshadow[ctxttid + tid] = dd->tidinvalid;
+ /* PERFORMANCE: below should almost certainly be
+ * cached
+ */
+ dd->f_put_tid(dd, &tidbase[tid],
+ RCVHQ_RCV_TYPE_EXPECTED, dd->tidinvalid);
+ pci_unmap_page(dd->pcidev, phys, PAGE_SIZE,
+ PCI_DMA_FROMDEVICE);
+ qib_release_user_pages(&p, 1);
+ }
+ }
+done:
+ return ret;
+}
+
+/**
+ * qib_set_part_key - set a partition key
+ * @rcd: the context
+ * @key: the key
+ *
+ * We can have up to 4 active at a time (other than the default, which is
+ * always allowed). This is somewhat tricky, since multiple contexts may set
+ * the same key, so we reference count them, and clean up at exit. All 4
+ * partition keys are packed into a single qlogic_ib register. It's an
+ * error for a process to set the same pkey multiple times. We provide no
+ * mechanism to de-allocate a pkey at this time, we may eventually need to
+ * do that. I've used the atomic operations, and no locking, and only make
+ * a single pass through what's available. This should be more than
+ * adequate for some time. I'll think about spinlocks or the like if and as
+ * it's necessary.
+ */
+static int qib_set_part_key(struct qib_ctxtdata *rcd, u16 key)
+{
+ struct qib_pportdata *ppd = rcd->ppd;
+ int i, any = 0, pidx = -1;
+ u16 lkey = key & 0x7FFF;
+ int ret;
+
+ if (lkey == (QIB_DEFAULT_P_KEY & 0x7FFF)) {
+ /* nothing to do; this key always valid */
+ ret = 0;
+ goto bail;
+ }
+
+ if (!lkey) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ /*
+ * Set the full membership bit, because it has to be
+ * set in the register or the packet, and it seems
+ * cleaner to set in the register than to force all
+ * callers to set it.
+ */
+ key |= 0x8000;
+
+ for (i = 0; i < ARRAY_SIZE(rcd->pkeys); i++) {
+ if (!rcd->pkeys[i] && pidx == -1)
+ pidx = i;
+ if (rcd->pkeys[i] == key) {
+ ret = -EEXIST;
+ goto bail;
+ }
+ }
+ if (pidx == -1) {
+ ret = -EBUSY;
+ goto bail;
+ }
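+ /*
+ * First pass over the port's pkey table: take another reference if
+ * the key is already present, and reject a limited/full membership
+ * conflict.
+ */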
+ for (any = i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {
+ if (!ppd->pkeys[i]) {
+ any++;
+ continue;
+ }
+ if (ppd->pkeys[i] == key) {
+ atomic_t *pkrefs = &ppd->pkeyrefs[i];
+
+ if (atomic_inc_return(pkrefs) > 1) {
+ rcd->pkeys[pidx] = key;
+ ret = 0;
+ goto bail;
+ } else {
+ /*
+ * lost race, decrement count, catch below
+ */
+ atomic_dec(pkrefs);
+ any++;
+ }
+ }
+ if ((ppd->pkeys[i] & 0x7FFF) == lkey) {
+ /*
+ * It makes no sense to have both the limited and
+ * full membership PKEY set at the same time since
+ * the unlimited one will disable the limited one.
+ */
+ ret = -EEXIST;
+ goto bail;
+ }
+ }
+ if (!any) {
+ ret = -EBUSY;
+ goto bail;
+ }
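+ /*
+ * Second pass: claim a free slot in the port table and push the
+ * updated pkey set to the chip.
+ */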
+ for (any = i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {
+ if (!ppd->pkeys[i] &&
+ atomic_inc_return(&ppd->pkeyrefs[i]) == 1) {
+ rcd->pkeys[pidx] = key;
+ ppd->pkeys[i] = key;
+ (void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PKEYS, 0);
+ ret = 0;
+ goto bail;
+ }
+ }
+ ret = -EBUSY;
+
+bail:
+ return ret;
+}
+
+/**
+ * qib_manage_rcvq - manage a context's receive queue
+ * @rcd: the context
+ * @subctxt: the subcontext
+ * @start_stop: action to carry out
+ *
+ * start_stop == 0 disables receive on the context, for use in queue
+ * overflow conditions. start_stop==1 re-enables, to be used to
+ * re-init the software copy of the head register
+ */
+static int qib_manage_rcvq(struct qib_ctxtdata *rcd, unsigned subctxt,
+ int start_stop)
+{
+ struct qib_devdata *dd = rcd->dd;
+ unsigned int rcvctrl_op;
+
+ if (subctxt)
+ goto bail;
+ /* atomically clear receive enable ctxt. */
+ if (start_stop) {
+ /*
+ * On enable, force in-memory copy of the tail register to
+ * 0, so that protocol code doesn't have to worry about
+ * whether or not the chip has yet updated the in-memory
+ * copy or not on return from the system call. The chip
+ * always resets it's tail register back to 0 on a
+ * transition from disabled to enabled.
+ */
+ if (rcd->rcvhdrtail_kvaddr)
+ qib_clear_rcvhdrtail(rcd);
+ rcvctrl_op = QIB_RCVCTRL_CTXT_ENB;
+ } else
+ rcvctrl_op = QIB_RCVCTRL_CTXT_DIS;
+ dd->f_rcvctrl(rcd->ppd, rcvctrl_op, rcd->ctxt);
+ /* always; new head should be equal to new tail; see above */
+bail:
+ return 0;
+}
+
+static void qib_clean_part_key(struct qib_ctxtdata *rcd,
+ struct qib_devdata *dd)
+{
+ int i, j, pchanged = 0;
+ u64 oldpkey;
+ struct qib_pportdata *ppd = rcd->ppd;
+
+ /* for debugging only */
+ oldpkey = (u64) ppd->pkeys[0] |
+ ((u64) ppd->pkeys[1] << 16) |
+ ((u64) ppd->pkeys[2] << 32) |
+ ((u64) ppd->pkeys[3] << 48);
+
+ for (i = 0; i < ARRAY_SIZE(rcd->pkeys); i++) {
+ if (!rcd->pkeys[i])
+ continue;
+ for (j = 0; j < ARRAY_SIZE(ppd->pkeys); j++) {
+ /* check for match independent of the global bit */
+ if ((ppd->pkeys[j] & 0x7fff) !=
+ (rcd->pkeys[i] & 0x7fff))
+ continue;
+ if (atomic_dec_and_test(&ppd->pkeyrefs[j])) {
+ ppd->pkeys[j] = 0;
+ pchanged++;
+ }
+ break;
+ }
+ rcd->pkeys[i] = 0;
+ }
+ if (pchanged)
+ (void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PKEYS, 0);
+}
+
+/* common code for the mappings on dma_alloc_coherent mem */
+static int qib_mmap_mem(struct vm_area_struct *vma, struct qib_ctxtdata *rcd,
+ unsigned len, void *kvaddr, u32 write_ok, char *what)
+{
+ struct qib_devdata *dd = rcd->dd;
+ unsigned long pfn;
+ int ret;
+
+ if ((vma->vm_end - vma->vm_start) > len) {
+ qib_devinfo(dd->pcidev,
+ "FAIL on %s: len %lx > %x\n", what,
+ vma->vm_end - vma->vm_start, len);
+ ret = -EFAULT;
+ goto bail;
+ }
+
+ /*
+ * shared context user code requires rcvhdrq mapped r/w, others
+ * only allowed readonly mapping.
+ */
+ if (!write_ok) {
+ if (vma->vm_flags & VM_WRITE) {
+ qib_devinfo(dd->pcidev,
+ "%s must be mapped readonly\n", what);
+ ret = -EPERM;
+ goto bail;
+ }
+
+ /* don't allow them to later change with mprotect */
+ vma->vm_flags &= ~VM_MAYWRITE;
+ }
+
+ pfn = virt_to_phys(kvaddr) >> PAGE_SHIFT;
+ ret = remap_pfn_range(vma, vma->vm_start, pfn,
+ len, vma->vm_page_prot);
+ if (ret)
+ qib_devinfo(dd->pcidev,
+ "%s ctxt%u mmap of %lx, %x bytes failed: %d\n",
+ what, rcd->ctxt, pfn, len, ret);
+bail:
+ return ret;
+}
+
+static int mmap_ureg(struct vm_area_struct *vma, struct qib_devdata *dd,
+ u64 ureg)
+{
+ unsigned long phys;
+ unsigned long sz;
+ int ret;
+
+ /*
+ * This is real hardware, so use io_remap. This is the mechanism
+ * for the user process to update the head registers for their ctxt
+ * in the chip.
+ */
+ sz = dd->flags & QIB_HAS_HDRSUPP ? 2 * PAGE_SIZE : PAGE_SIZE;
+ if ((vma->vm_end - vma->vm_start) > sz) {
+ qib_devinfo(dd->pcidev,
+ "FAIL mmap userreg: reqlen %lx > PAGE\n",
+ vma->vm_end - vma->vm_start);
+ ret = -EFAULT;
+ } else {
+ phys = dd->physaddr + ureg;
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+
+ vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
+ ret = io_remap_pfn_range(vma, vma->vm_start,
+ phys >> PAGE_SHIFT,
+ vma->vm_end - vma->vm_start,
+ vma->vm_page_prot);
+ }
+ return ret;
+}
+
+static int mmap_piobufs(struct vm_area_struct *vma,
+ struct qib_devdata *dd,
+ struct qib_ctxtdata *rcd,
+ unsigned piobufs, unsigned piocnt)
+{
+ unsigned long phys;
+ int ret;
+
+ /*
+ * When we map the PIO buffers in the chip, we want to map them as
+ * writeonly, no read possible; unfortunately, x86 doesn't allow
+ * for this in hardware, but we still prevent users from asking
+ * for it.
+ */
+ if ((vma->vm_end - vma->vm_start) > (piocnt * dd->palign)) {
+ qib_devinfo(dd->pcidev,
+ "FAIL mmap piobufs: reqlen %lx > PAGE\n",
+ vma->vm_end - vma->vm_start);
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ phys = dd->physaddr + piobufs;
+
+#if defined(__powerpc__)
+ /* There isn't a generic way to specify writethrough mappings */
+ pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE;
+ pgprot_val(vma->vm_page_prot) |= _PAGE_WRITETHRU;
+ pgprot_val(vma->vm_page_prot) &= ~_PAGE_GUARDED;
+#endif
+
+ /*
+ * don't allow them to later change to readable with mprotect (for when
+ * not initially mapped readable, as is normally the case)
+ */
+ vma->vm_flags &= ~VM_MAYREAD;
+ vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
+
+ /* We used PAT if wc_cookie == 0 */
+ if (!dd->wc_cookie)
+ vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
+
+ ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT,
+ vma->vm_end - vma->vm_start,
+ vma->vm_page_prot);
+bail:
+ return ret;
+}
+
+static int mmap_rcvegrbufs(struct vm_area_struct *vma,
+ struct qib_ctxtdata *rcd)
+{
+ struct qib_devdata *dd = rcd->dd;
+ unsigned long start, size;
+ size_t total_size, i;
+ unsigned long pfn;
+ int ret;
+
+ size = rcd->rcvegrbuf_size;
+ total_size = rcd->rcvegrbuf_chunks * size;
+ if ((vma->vm_end - vma->vm_start) > total_size) {
+ qib_devinfo(dd->pcidev,
+ "FAIL on egr bufs: reqlen %lx > actual %lx\n",
+ vma->vm_end - vma->vm_start,
+ (unsigned long) total_size);
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ if (vma->vm_flags & VM_WRITE) {
+ qib_devinfo(dd->pcidev,
+ "Can't map eager buffers as writable (flags=%lx)\n",
+ vma->vm_flags);
+ ret = -EPERM;
+ goto bail;
+ }
+ /* don't allow them to later change to writeable with mprotect */
+ vma->vm_flags &= ~VM_MAYWRITE;
+
+ start = vma->vm_start;
+
+ for (i = 0; i < rcd->rcvegrbuf_chunks; i++, start += size) {
+ pfn = virt_to_phys(rcd->rcvegrbuf[i]) >> PAGE_SHIFT;
+ ret = remap_pfn_range(vma, start, pfn, size,
+ vma->vm_page_prot);
+ if (ret < 0)
+ goto bail;
+ }
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/*
+ * qib_file_vma_fault - handle a VMA page fault.
+ */
+static int qib_file_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct page *page;
+
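+ /*
+ * mmap_kvaddr() stored the kernel virtual address of the shared
+ * region in vm_pgoff, so the faulting page can be looked up
+ * directly in vmalloc space.
+ */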
+ page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT));
+ if (!page)
+ return VM_FAULT_SIGBUS;
+
+ get_page(page);
+ vmf->page = page;
+
+ return 0;
+}
+
+static struct vm_operations_struct qib_file_vm_ops = {
+ .fault = qib_file_vma_fault,
+};
+
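+/*
+ * Map one of the vmalloc()ed shared-subcontext regions into this process.
+ * Returns 1 if pgaddr matched one of those regions and the vma was set up
+ * for demand faulting, 0 if there was no match (the caller then tries the
+ * physical/chip mappings), or a negative errno on error.
+ */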
+static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr,
+ struct qib_ctxtdata *rcd, unsigned subctxt)
+{
+ struct qib_devdata *dd = rcd->dd;
+ unsigned subctxt_cnt;
+ unsigned long len;
+ void *addr;
+ size_t size;
+ int ret = 0;
+
+ subctxt_cnt = rcd->subctxt_cnt;
+ size = rcd->rcvegrbuf_chunks * rcd->rcvegrbuf_size;
+
+ /*
+ * Each process has all the subctxt uregbase, rcvhdrq, and
+ * rcvegrbufs mmapped - as an array for all the processes,
+ * and also separately for this process.
+ */
+ if (pgaddr == cvt_kvaddr(rcd->subctxt_uregbase)) {
+ addr = rcd->subctxt_uregbase;
+ size = PAGE_SIZE * subctxt_cnt;
+ } else if (pgaddr == cvt_kvaddr(rcd->subctxt_rcvhdr_base)) {
+ addr = rcd->subctxt_rcvhdr_base;
+ size = rcd->rcvhdrq_size * subctxt_cnt;
+ } else if (pgaddr == cvt_kvaddr(rcd->subctxt_rcvegrbuf)) {
+ addr = rcd->subctxt_rcvegrbuf;
+ size *= subctxt_cnt;
+ } else if (pgaddr == cvt_kvaddr(rcd->subctxt_uregbase +
+ PAGE_SIZE * subctxt)) {
+ addr = rcd->subctxt_uregbase + PAGE_SIZE * subctxt;
+ size = PAGE_SIZE;
+ } else if (pgaddr == cvt_kvaddr(rcd->subctxt_rcvhdr_base +
+ rcd->rcvhdrq_size * subctxt)) {
+ addr = rcd->subctxt_rcvhdr_base +
+ rcd->rcvhdrq_size * subctxt;
+ size = rcd->rcvhdrq_size;
+ } else if (pgaddr == cvt_kvaddr(&rcd->user_event_mask[subctxt])) {
+ addr = rcd->user_event_mask;
+ size = PAGE_SIZE;
+ } else if (pgaddr == cvt_kvaddr(rcd->subctxt_rcvegrbuf +
+ size * subctxt)) {
+ addr = rcd->subctxt_rcvegrbuf + size * subctxt;
+ /* rcvegrbufs are read-only on the slave */
+ if (vma->vm_flags & VM_WRITE) {
+ qib_devinfo(dd->pcidev,
+ "Can't map eager buffers as writable (flags=%lx)\n",
+ vma->vm_flags);
+ ret = -EPERM;
+ goto bail;
+ }
+ /*
+ * Don't allow permission to later change to writeable
+ * with mprotect.
+ */
+ vma->vm_flags &= ~VM_MAYWRITE;
+ } else
+ goto bail;
+ len = vma->vm_end - vma->vm_start;
+ if (len > size) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT;
+ vma->vm_ops = &qib_file_vm_ops;
+ vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+ ret = 1;
+
+bail:
+ return ret;
+}
+
+/**
+ * qib_mmapf - mmap various structures into user space
+ * @fp: the file pointer
+ * @vma: the VM area
+ *
+ * We use this to have a shared buffer between the kernel and the user code
+ * for the rcvhdr queue, egr buffers, and the per-context user regs and pio
+ * buffers in the chip. We have the open and close entries so we can bump
+ * the ref count and keep the driver from being unloaded while still mapped.
+ */
+static int qib_mmapf(struct file *fp, struct vm_area_struct *vma)
+{
+ struct qib_ctxtdata *rcd;
+ struct qib_devdata *dd;
+ u64 pgaddr, ureg;
+ unsigned piobufs, piocnt;
+ int ret, match = 1;
+
+ rcd = ctxt_fp(fp);
+ if (!rcd || !(vma->vm_flags & VM_SHARED)) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ dd = rcd->dd;
+
+ /*
+ * This maps the shared buffers set up by qib_do_user_init() and the
+ * per-context user registers into the user process. The address
+ * referred to by vm_pgoff is the file offset passed via mmap().
+ * For shared contexts, this is the kernel vmalloc() address of the
+ * pages to share with the master.
+ * For non-shared or master ctxts, this is a physical address.
+ * We only do one mmap for each space mapped.
+ */
+ pgaddr = vma->vm_pgoff << PAGE_SHIFT;
+
+ /*
+ * Check for 0 in case one of the allocations failed, but user
+ * called mmap anyway.
+ */
+ if (!pgaddr) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ /*
+ * Physical addresses must fit in 40 bits for our hardware.
+ * Check for kernel virtual addresses first, anything else must
+ * match a HW or memory address.
+ */
+ ret = mmap_kvaddr(vma, pgaddr, rcd, subctxt_fp(fp));
+ if (ret) {
+ if (ret > 0)
+ ret = 0;
+ goto bail;
+ }
+
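+ /*
+ * Work out which range of PIO buffers this caller may map: for a
+ * shared ctxt the master gets an equal share plus any remainder,
+ * at the end of the range; each slave gets an equal share.
+ */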
+ ureg = dd->uregbase + dd->ureg_align * rcd->ctxt;
+ if (!rcd->subctxt_cnt) {
+ /* ctxt is not shared */
+ piocnt = rcd->piocnt;
+ piobufs = rcd->piobufs;
+ } else if (!subctxt_fp(fp)) {
+ /* caller is the master */
+ piocnt = (rcd->piocnt / rcd->subctxt_cnt) +
+ (rcd->piocnt % rcd->subctxt_cnt);
+ piobufs = rcd->piobufs +
+ dd->palign * (rcd->piocnt - piocnt);
+ } else {
+ unsigned slave = subctxt_fp(fp) - 1;
+
+ /* caller is a slave */
+ piocnt = rcd->piocnt / rcd->subctxt_cnt;
+ piobufs = rcd->piobufs + dd->palign * piocnt * slave;
+ }
+
+ if (pgaddr == ureg)
+ ret = mmap_ureg(vma, dd, ureg);
+ else if (pgaddr == piobufs)
+ ret = mmap_piobufs(vma, dd, rcd, piobufs, piocnt);
+ else if (pgaddr == dd->pioavailregs_phys)
+ /* in-memory copy of pioavail registers */
+ ret = qib_mmap_mem(vma, rcd, PAGE_SIZE,
+ (void *) dd->pioavailregs_dma, 0,
+ "pioavail registers");
+ else if (pgaddr == rcd->rcvegr_phys)
+ ret = mmap_rcvegrbufs(vma, rcd);
+ else if (pgaddr == (u64) rcd->rcvhdrq_phys)
+ /*
+ * The rcvhdrq itself; multiple pages, contiguous
+ * from an i/o perspective. Shared contexts need
+ * to map r/w, so we allow writing.
+ */
+ ret = qib_mmap_mem(vma, rcd, rcd->rcvhdrq_size,
+ rcd->rcvhdrq, 1, "rcvhdrq");
+ else if (pgaddr == (u64) rcd->rcvhdrqtailaddr_phys)
+ /* in-memory copy of rcvhdrq tail register */
+ ret = qib_mmap_mem(vma, rcd, PAGE_SIZE,
+ rcd->rcvhdrtail_kvaddr, 0,
+ "rcvhdrq tail");
+ else
+ match = 0;
+ if (!match)
+ ret = -EINVAL;
+
+ vma->vm_private_data = NULL;
+
+ if (ret < 0)
+ qib_devinfo(dd->pcidev,
+ "mmap Failure %d: off %llx len %lx\n",
+ -ret, (unsigned long long)pgaddr,
+ vma->vm_end - vma->vm_start);
+bail:
+ return ret;
+}
+
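+/*
+ * Poll support: QIB_POLL_TYPE_URGENT reports POLLIN when the urgent
+ * counter has changed since the last poll; QIB_POLL_TYPE_ANYRCV reports
+ * POLLIN when the receive header queue is non-empty, otherwise it arms
+ * the receive-available interrupt. qib_poll() dispatches on poll_type.
+ */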
+static unsigned int qib_poll_urgent(struct qib_ctxtdata *rcd,
+ struct file *fp,
+ struct poll_table_struct *pt)
+{
+ struct qib_devdata *dd = rcd->dd;
+ unsigned pollflag;
+
+ poll_wait(fp, &rcd->wait, pt);
+
+ spin_lock_irq(&dd->uctxt_lock);
+ if (rcd->urgent != rcd->urgent_poll) {
+ pollflag = POLLIN | POLLRDNORM;
+ rcd->urgent_poll = rcd->urgent;
+ } else {
+ pollflag = 0;
+ set_bit(QIB_CTXT_WAITING_URG, &rcd->flag);
+ }
+ spin_unlock_irq(&dd->uctxt_lock);
+
+ return pollflag;
+}
+
+static unsigned int qib_poll_next(struct qib_ctxtdata *rcd,
+ struct file *fp,
+ struct poll_table_struct *pt)
+{
+ struct qib_devdata *dd = rcd->dd;
+ unsigned pollflag;
+
+ poll_wait(fp, &rcd->wait, pt);
+
+ spin_lock_irq(&dd->uctxt_lock);
+ if (dd->f_hdrqempty(rcd)) {
+ set_bit(QIB_CTXT_WAITING_RCV, &rcd->flag);
+ dd->f_rcvctrl(rcd->ppd, QIB_RCVCTRL_INTRAVAIL_ENB, rcd->ctxt);
+ pollflag = 0;
+ } else
+ pollflag = POLLIN | POLLRDNORM;
+ spin_unlock_irq(&dd->uctxt_lock);
+
+ return pollflag;
+}
+
+static unsigned int qib_poll(struct file *fp, struct poll_table_struct *pt)
+{
+ struct qib_ctxtdata *rcd;
+ unsigned pollflag;
+
+ rcd = ctxt_fp(fp);
+ if (!rcd)
+ pollflag = POLLERR;
+ else if (rcd->poll_type == QIB_POLL_TYPE_URGENT)
+ pollflag = qib_poll_urgent(rcd, fp, pt);
+ else if (rcd->poll_type == QIB_POLL_TYPE_ANYRCV)
+ pollflag = qib_poll_next(rcd, fp, pt);
+ else /* invalid */
+ pollflag = POLLERR;
+
+ return pollflag;
+}
+
+static void assign_ctxt_affinity(struct file *fp, struct qib_devdata *dd)
+{
+ struct qib_filedata *fd = fp->private_data;
+ const unsigned int weight = cpumask_weight(&current->cpus_allowed);
+ const struct cpumask *local_mask = cpumask_of_pcibus(dd->pcidev->bus);
+ int local_cpu;
+
+ /*
+ * If the process has NOT already set its affinity, select and
+ * reserve a processor for it on the local NUMA node.
+ */
+ if ((weight >= qib_cpulist_count) &&
+ (cpumask_weight(local_mask) <= qib_cpulist_count)) {
+ for_each_cpu(local_cpu, local_mask)
+ if (!test_and_set_bit(local_cpu, qib_cpulist)) {
+ fd->rec_cpu_num = local_cpu;
+ return;
+ }
+ }
+
+ /*
+ * If the process has NOT already set its affinity, select and
+ * reserve a processor for it, as a rendezvous for all
+ * users of the driver. If they don't actually later
+ * set affinity to this cpu, or set it to some other cpu,
+ * it just means that sooner or later we don't recommend
+ * a cpu, and let the scheduler do its best.
+ */
+ if (weight >= qib_cpulist_count) {
+ int cpu;
+
+ cpu = find_first_zero_bit(qib_cpulist,
+ qib_cpulist_count);
+ if (cpu == qib_cpulist_count)
+ qib_dev_err(dd,
+ "no cpus avail for affinity PID %u\n",
+ current->pid);
+ else {
+ __set_bit(cpu, qib_cpulist);
+ fd->rec_cpu_num = cpu;
+ }
+ }
+}
+
+/*
+ * Check that userland and driver are compatible for subcontexts.
+ */
+static int qib_compatible_subctxts(int user_swmajor, int user_swminor)
+{
+ /* this code is written long-hand for clarity */
+ if (QIB_USER_SWMAJOR != user_swmajor) {
+ /* no promise of compatibility if major mismatch */
+ return 0;
+ }
+ if (QIB_USER_SWMAJOR == 1) {
+ switch (QIB_USER_SWMINOR) {
+ case 0:
+ case 1:
+ case 2:
+ /* no subctxt implementation so cannot be compatible */
+ return 0;
+ case 3:
+ /* 3 is only compatible with itself */
+ return user_swminor == 3;
+ default:
+ /* >= 4 are compatible (or are expected to be) */
+ return user_swminor <= QIB_USER_SWMINOR;
+ }
+ }
+ /* make no promises yet for future major versions */
+ return 0;
+}
+
+static int init_subctxts(struct qib_devdata *dd,
+ struct qib_ctxtdata *rcd,
+ const struct qib_user_info *uinfo)
+{
+ int ret = 0;
+ unsigned num_subctxts;
+ size_t size;
+
+ /*
+ * If the user is requesting zero subctxts,
+ * skip the subctxt allocation.
+ */
+ if (uinfo->spu_subctxt_cnt <= 0)
+ goto bail;
+ num_subctxts = uinfo->spu_subctxt_cnt;
+
+ /* Check for subctxt compatibility */
+ if (!qib_compatible_subctxts(uinfo->spu_userversion >> 16,
+ uinfo->spu_userversion & 0xffff)) {
+ qib_devinfo(dd->pcidev,
+ "Mismatched user version (%d.%d) and driver version (%d.%d) while context sharing. Ensure that driver and library are from the same release.\n",
+ (int) (uinfo->spu_userversion >> 16),
+ (int) (uinfo->spu_userversion & 0xffff),
+ QIB_USER_SWMAJOR, QIB_USER_SWMINOR);
+ goto bail;
+ }
+ if (num_subctxts > QLOGIC_IB_MAX_SUBCTXT) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ rcd->subctxt_uregbase = vmalloc_user(PAGE_SIZE * num_subctxts);
+ if (!rcd->subctxt_uregbase) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+ /* Note: rcd->rcvhdrq_size isn't initialized yet. */
+ size = ALIGN(dd->rcvhdrcnt * dd->rcvhdrentsize *
+ sizeof(u32), PAGE_SIZE) * num_subctxts;
+ rcd->subctxt_rcvhdr_base = vmalloc_user(size);
+ if (!rcd->subctxt_rcvhdr_base) {
+ ret = -ENOMEM;
+ goto bail_ureg;
+ }
+
+ rcd->subctxt_rcvegrbuf = vmalloc_user(rcd->rcvegrbuf_chunks *
+ rcd->rcvegrbuf_size *
+ num_subctxts);
+ if (!rcd->subctxt_rcvegrbuf) {
+ ret = -ENOMEM;
+ goto bail_rhdr;
+ }
+
+ rcd->subctxt_cnt = uinfo->spu_subctxt_cnt;
+ rcd->subctxt_id = uinfo->spu_subctxt_id;
+ rcd->active_slaves = 1;
+ rcd->redirect_seq_cnt = 1;
+ set_bit(QIB_CTXT_MASTER_UNINIT, &rcd->flag);
+ goto bail;
+
+bail_rhdr:
+ vfree(rcd->subctxt_rcvhdr_base);
+bail_ureg:
+ vfree(rcd->subctxt_uregbase);
+ rcd->subctxt_uregbase = NULL;
+bail:
+ return ret;
+}
+
+static int setup_ctxt(struct qib_pportdata *ppd, int ctxt,
+ struct file *fp, const struct qib_user_info *uinfo)
+{
+ struct qib_filedata *fd = fp->private_data;
+ struct qib_devdata *dd = ppd->dd;
+ struct qib_ctxtdata *rcd;
+ void *ptmp = NULL;
+ int ret;
+ int numa_id;
+
+ assign_ctxt_affinity(fp, dd);
+
+ numa_id = qib_numa_aware ? ((fd->rec_cpu_num != -1) ?
+ cpu_to_node(fd->rec_cpu_num) :
+ numa_node_id()) : dd->assigned_node_id;
+
+ rcd = qib_create_ctxtdata(ppd, ctxt, numa_id);
+
+ /*
+ * Allocate memory for use in qib_tid_update() at open to
+ * reduce cost of expected send setup per message segment
+ */
+ if (rcd)
+ ptmp = kmalloc(dd->rcvtidcnt * sizeof(u16) +
+ dd->rcvtidcnt * sizeof(struct page **),
+ GFP_KERNEL);
+
+ if (!rcd || !ptmp) {
+ qib_dev_err(dd,
+ "Unable to allocate ctxtdata memory, failing open\n");
+ ret = -ENOMEM;
+ goto bailerr;
+ }
+ rcd->userversion = uinfo->spu_userversion;
+ ret = init_subctxts(dd, rcd, uinfo);
+ if (ret)
+ goto bailerr;
+ rcd->tid_pg_list = ptmp;
+ rcd->pid = current->pid;
+ init_waitqueue_head(&dd->rcd[ctxt]->wait);
+ strlcpy(rcd->comm, current->comm, sizeof(rcd->comm));
+ ctxt_fp(fp) = rcd;
+ qib_stats.sps_ctxts++;
+ dd->freectxts--;
+ ret = 0;
+ goto bail;
+
+bailerr:
+ if (fd->rec_cpu_num != -1)
+ __clear_bit(fd->rec_cpu_num, qib_cpulist);
+
+ dd->rcd[ctxt] = NULL;
+ kfree(rcd);
+ kfree(ptmp);
+bail:
+ return ret;
+}
+
+static inline int usable(struct qib_pportdata *ppd)
+{
+ struct qib_devdata *dd = ppd->dd;
+
+ return dd && (dd->flags & QIB_PRESENT) && dd->kregbase && ppd->lid &&
+ (ppd->lflags & QIBL_LINKACTIVE);
+}
+
+/*
+ * Select a context on the given device, either using a requested port
+ * or the port based on the context number.
+ */
+static int choose_port_ctxt(struct file *fp, struct qib_devdata *dd, u32 port,
+ const struct qib_user_info *uinfo)
+{
+ struct qib_pportdata *ppd = NULL;
+ int ret, ctxt;
+
+ if (port) {
+ if (!usable(dd->pport + port - 1)) {
+ ret = -ENETDOWN;
+ goto done;
+ } else
+ ppd = dd->pport + port - 1;
+ }
+ for (ctxt = dd->first_user_ctxt; ctxt < dd->cfgctxts && dd->rcd[ctxt];
+ ctxt++)
+ ;
+ if (ctxt == dd->cfgctxts) {
+ ret = -EBUSY;
+ goto done;
+ }
+ if (!ppd) {
+ u32 pidx = ctxt % dd->num_pports;
+
+ if (usable(dd->pport + pidx))
+ ppd = dd->pport + pidx;
+ else {
+ for (pidx = 0; pidx < dd->num_pports && !ppd;
+ pidx++)
+ if (usable(dd->pport + pidx))
+ ppd = dd->pport + pidx;
+ }
+ }
+ ret = ppd ? setup_ctxt(ppd, ctxt, fp, uinfo) : -ENETDOWN;
+done:
+ return ret;
+}
+
+static int find_free_ctxt(int unit, struct file *fp,
+ const struct qib_user_info *uinfo)
+{
+ struct qib_devdata *dd = qib_lookup(unit);
+ int ret;
+
+ if (!dd || (uinfo->spu_port && uinfo->spu_port > dd->num_pports))
+ ret = -ENODEV;
+ else
+ ret = choose_port_ctxt(fp, dd, uinfo->spu_port, uinfo);
+
+ return ret;
+}
+
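+/*
+ * Pick a device and context when none was specified. With
+ * QIB_PORT_ALG_ACROSS, spread load by choosing the device (with a usable
+ * port) that has the fewest contexts in use; otherwise pack contexts onto
+ * the first device that will accept one.
+ */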
+static int get_a_ctxt(struct file *fp, const struct qib_user_info *uinfo,
+ unsigned alg)
+{
+ struct qib_devdata *udd = NULL;
+ int ret = 0, devmax, npresent, nup, ndev, dusable = 0, i;
+ u32 port = uinfo->spu_port, ctxt;
+
+ devmax = qib_count_units(&npresent, &nup);
+ if (!npresent) {
+ ret = -ENXIO;
+ goto done;
+ }
+ if (nup == 0) {
+ ret = -ENETDOWN;
+ goto done;
+ }
+
+ if (alg == QIB_PORT_ALG_ACROSS) {
+ unsigned inuse = ~0U;
+
+ /* find device (with ACTIVE ports) with fewest ctxts in use */
+ for (ndev = 0; ndev < devmax; ndev++) {
+ struct qib_devdata *dd = qib_lookup(ndev);
+ unsigned cused = 0, cfree = 0, pusable = 0;
+
+ if (!dd)
+ continue;
+ if (port && port <= dd->num_pports &&
+ usable(dd->pport + port - 1))
+ pusable = 1;
+ else
+ for (i = 0; i < dd->num_pports; i++)
+ if (usable(dd->pport + i))
+ pusable++;
+ if (!pusable)
+ continue;
+ for (ctxt = dd->first_user_ctxt; ctxt < dd->cfgctxts;
+ ctxt++)
+ if (dd->rcd[ctxt])
+ cused++;
+ else
+ cfree++;
+ if (cfree && cused < inuse) {
+ udd = dd;
+ inuse = cused;
+ }
+ }
+ if (udd) {
+ ret = choose_port_ctxt(fp, udd, port, uinfo);
+ goto done;
+ }
+ } else {
+ for (ndev = 0; ndev < devmax; ndev++) {
+ struct qib_devdata *dd = qib_lookup(ndev);
+
+ if (dd) {
+ ret = choose_port_ctxt(fp, dd, port, uinfo);
+ if (!ret)
+ goto done;
+ if (ret == -EBUSY)
+ dusable++;
+ }
+ }
+ }
+ ret = dusable ? -EBUSY : -ENETDOWN;
+
+done:
+ return ret;
+}
+
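+/*
+ * Look for an already-open master context with a matching subctxt_id that
+ * this process can join. Returns 1 (and fills in the file state) on
+ * success, 0 if no match was found, or -EINVAL if a matching context
+ * exists but the sharing parameters or user version disagree, or the
+ * context is already full.
+ */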
+static int find_shared_ctxt(struct file *fp,
+ const struct qib_user_info *uinfo)
+{
+ int devmax, ndev, i;
+ int ret = 0;
+
+ devmax = qib_count_units(NULL, NULL);
+
+ for (ndev = 0; ndev < devmax; ndev++) {
+ struct qib_devdata *dd = qib_lookup(ndev);
+
+ /* device portion of usable() */
+ if (!(dd && (dd->flags & QIB_PRESENT) && dd->kregbase))
+ continue;
+ for (i = dd->first_user_ctxt; i < dd->cfgctxts; i++) {
+ struct qib_ctxtdata *rcd = dd->rcd[i];
+
+ /* Skip ctxts which are not yet open */
+ if (!rcd || !rcd->cnt)
+ continue;
+ /* Skip ctxt if it doesn't match the requested one */
+ if (rcd->subctxt_id != uinfo->spu_subctxt_id)
+ continue;
+ /* Verify the sharing process matches the master */
+ if (rcd->subctxt_cnt != uinfo->spu_subctxt_cnt ||
+ rcd->userversion != uinfo->spu_userversion ||
+ rcd->cnt >= rcd->subctxt_cnt) {
+ ret = -EINVAL;
+ goto done;
+ }
+ ctxt_fp(fp) = rcd;
+ subctxt_fp(fp) = rcd->cnt++;
+ rcd->subpid[subctxt_fp(fp)] = current->pid;
+ tidcursor_fp(fp) = 0;
+ rcd->active_slaves |= 1 << subctxt_fp(fp);
+ ret = 1;
+ goto done;
+ }
+ }
+
+done:
+ return ret;
+}
+
+static int qib_open(struct inode *in, struct file *fp)
+{
+ /* The real work is performed later in qib_assign_ctxt() */
+ fp->private_data = kzalloc(sizeof(struct qib_filedata), GFP_KERNEL);
+ if (fp->private_data) /* no cpu affinity by default */
+ ((struct qib_filedata *)fp->private_data)->rec_cpu_num = -1;
+ return fp->private_data ? 0 : -ENOMEM;
+}
+
+static int find_hca(unsigned int cpu, int *unit)
+{
+ int ret = 0, devmax, npresent, nup, ndev;
+
+ *unit = -1;
+
+ devmax = qib_count_units(&npresent, &nup);
+ if (!npresent) {
+ ret = -ENXIO;
+ goto done;
+ }
+ if (!nup) {
+ ret = -ENETDOWN;
+ goto done;
+ }
+ for (ndev = 0; ndev < devmax; ndev++) {
+ struct qib_devdata *dd = qib_lookup(ndev);
+
+ if (dd) {
+ if (pcibus_to_node(dd->pcidev->bus) < 0) {
+ ret = -EINVAL;
+ goto done;
+ }
+ if (cpu_to_node(cpu) ==
+ pcibus_to_node(dd->pcidev->bus)) {
+ *unit = ndev;
+ goto done;
+ }
+ }
+ }
+done:
+ return ret;
+}
+
+static int do_qib_user_sdma_queue_create(struct file *fp)
+{
+ struct qib_filedata *fd = fp->private_data;
+ struct qib_ctxtdata *rcd = fd->rcd;
+ struct qib_devdata *dd = rcd->dd;
+
+ if (dd->flags & QIB_HAS_SEND_DMA) {
+
+ fd->pq = qib_user_sdma_queue_create(&dd->pcidev->dev,
+ dd->unit,
+ rcd->ctxt,
+ fd->subctxt);
+ if (!fd->pq)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+/*
+ * Get the ctxt early, so we can set affinity prior to memory allocation.
+ */
+static int qib_assign_ctxt(struct file *fp, const struct qib_user_info *uinfo)
+{
+ int ret;
+ int i_minor;
+ unsigned swmajor, swminor, alg = QIB_PORT_ALG_ACROSS;
+
+ /* Check to be sure we haven't already initialized this file */
+ if (ctxt_fp(fp)) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ /* for now, if major version is different, bail */
+ swmajor = uinfo->spu_userversion >> 16;
+ if (swmajor != QIB_USER_SWMAJOR) {
+ ret = -ENODEV;
+ goto done;
+ }
+
+ swminor = uinfo->spu_userversion & 0xffff;
+
+ if (swminor >= 11 && uinfo->spu_port_alg < QIB_PORT_ALG_COUNT)
+ alg = uinfo->spu_port_alg;
+
+ mutex_lock(&qib_mutex);
+
+ if (qib_compatible_subctxts(swmajor, swminor) &&
+ uinfo->spu_subctxt_cnt) {
+ ret = find_shared_ctxt(fp, uinfo);
+ if (ret > 0) {
+ ret = do_qib_user_sdma_queue_create(fp);
+ if (!ret)
+ assign_ctxt_affinity(fp, (ctxt_fp(fp))->dd);
+ goto done_ok;
+ }
+ }
+
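+ /*
+ * A nonzero i_minor selects a specific unit; the wildcard minor lets
+ * the driver choose, preferring a unit on the caller's NUMA node when
+ * the process is pinned to a single, still-unreserved CPU.
+ */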
+ i_minor = iminor(file_inode(fp)) - QIB_USER_MINOR_BASE;
+ if (i_minor)
+ ret = find_free_ctxt(i_minor - 1, fp, uinfo);
+ else {
+ int unit;
+ const unsigned int cpu = cpumask_first(&current->cpus_allowed);
+ const unsigned int weight =
+ cpumask_weight(&current->cpus_allowed);
+
+ if (weight == 1 && !test_bit(cpu, qib_cpulist))
+ if (!find_hca(cpu, &unit) && unit >= 0)
+ if (!find_free_ctxt(unit, fp, uinfo)) {
+ ret = 0;
+ goto done_chk_sdma;
+ }
+ ret = get_a_ctxt(fp, uinfo, alg);
+ }
+
+done_chk_sdma:
+ if (!ret)
+ ret = do_qib_user_sdma_queue_create(fp);
+done_ok:
+ mutex_unlock(&qib_mutex);
+
+done:
+ return ret;
+}
+
+
+static int qib_do_user_init(struct file *fp,
+ const struct qib_user_info *uinfo)
+{
+ int ret;
+ struct qib_ctxtdata *rcd = ctxt_fp(fp);
+ struct qib_devdata *dd;
+ unsigned uctxt;
+
+ /* Subctxts don't need to initialize anything since master did it. */
+ if (subctxt_fp(fp)) {
+ ret = wait_event_interruptible(rcd->wait,
+ !test_bit(QIB_CTXT_MASTER_UNINIT, &rcd->flag));
+ goto bail;
+ }
+
+ dd = rcd->dd;
+
+ /* some ctxts may get extra buffers, calculate that here */
+ uctxt = rcd->ctxt - dd->first_user_ctxt;
+ if (uctxt < dd->ctxts_extrabuf) {
+ rcd->piocnt = dd->pbufsctxt + 1;
+ rcd->pio_base = rcd->piocnt * uctxt;
+ } else {
+ rcd->piocnt = dd->pbufsctxt;
+ rcd->pio_base = rcd->piocnt * uctxt +
+ dd->ctxts_extrabuf;
+ }
+
+ /*
+ * All user buffers are 2KB buffers. If we ever support
+ * giving 4KB buffers to user processes, this will need some
+ * work. Can't use piobufbase directly, because it has
+ * both 2K and 4K buffer base values. So check and handle.
+ */
+ if ((rcd->pio_base + rcd->piocnt) > dd->piobcnt2k) {
+ if (rcd->pio_base >= dd->piobcnt2k) {
+ qib_dev_err(dd,
+ "%u:ctxt%u: no 2KB buffers available\n",
+ dd->unit, rcd->ctxt);
+ ret = -ENOBUFS;
+ goto bail;
+ }
+ rcd->piocnt = dd->piobcnt2k - rcd->pio_base;
+ qib_dev_err(dd, "Ctxt%u: would use 4KB bufs, using %u\n",
+ rcd->ctxt, rcd->piocnt);
+ }
+
+ rcd->piobufs = dd->pio2k_bufbase + rcd->pio_base * dd->palign;
+ qib_chg_pioavailkernel(dd, rcd->pio_base, rcd->piocnt,
+ TXCHK_CHG_TYPE_USER, rcd);
+ /*
+ * try to ensure that processes start up with consistent avail update
+ * for their own range, at least. If the system is very quiet, the
+ * in-memory copy may be out of date at startup for this range of
+ * buffers when a context gets re-used. Do this after the chg_pioavail
+ * and before the rest of setup, so it's "almost certain" the dma
+ * will have occurred (we can't guarantee it 100%, but with this
+ * ordering it should be many nines), given how much else happens
+ * after this.
+ */
+ dd->f_sendctrl(dd->pport, QIB_SENDCTRL_AVAIL_BLIP);
+
+ /*
+ * Now allocate the rcvhdr Q and eager TIDs; skip the TID
+ * array for the time being. If rcd->ctxt exceeds what the chip
+ * supports, we will someday need extra code here to handle the
+ * overflow through ctxt 0.
+ */
+ ret = qib_create_rcvhdrq(dd, rcd);
+ if (!ret)
+ ret = qib_setup_eagerbufs(rcd);
+ if (ret)
+ goto bail_pio;
+
+ rcd->tidcursor = 0; /* start at beginning after open */
+
+ /* initialize poll variables... */
+ rcd->urgent = 0;
+ rcd->urgent_poll = 0;
+
+ /*
+ * Now enable the ctxt for receive.
+ * For chips that are set to DMA the tail register to memory
+ * when it changes (and when the update bit transitions from
+ * 0 to 1), we turn receive off and then back on. This will
+ * (very briefly) affect any other open ctxts, but the duration
+ * is very short, and therefore isn't an issue. We explicitly
+ * set the in-memory tail copy to 0 beforehand, so we don't have
+ * to wait to be sure the DMA update has happened (the chip
+ * resets head/tail to 0 on the transition to enable).
+ */
+ if (rcd->rcvhdrtail_kvaddr)
+ qib_clear_rcvhdrtail(rcd);
+
+ dd->f_rcvctrl(rcd->ppd, QIB_RCVCTRL_CTXT_ENB | QIB_RCVCTRL_TIDFLOW_ENB,
+ rcd->ctxt);
+
+ /* Notify any waiting slaves */
+ if (rcd->subctxt_cnt) {
+ clear_bit(QIB_CTXT_MASTER_UNINIT, &rcd->flag);
+ wake_up(&rcd->wait);
+ }
+ return 0;
+
+bail_pio:
+ qib_chg_pioavailkernel(dd, rcd->pio_base, rcd->piocnt,
+ TXCHK_CHG_TYPE_KERN, rcd);
+bail:
+ return ret;
+}
+
+/**
+ * unlock_expected_tids - unlock any expected TID entries the context still had in use
+ * @rcd: ctxt
+ *
+ * We don't actually update the chip here, because we do a bulk update
+ * below, using f_clear_tids.
+ */
+static void unlock_expected_tids(struct qib_ctxtdata *rcd)
+{
+ struct qib_devdata *dd = rcd->dd;
+ int ctxt_tidbase = rcd->ctxt * dd->rcvtidcnt;
+ int i, cnt = 0, maxtid = ctxt_tidbase + dd->rcvtidcnt;
+
+ for (i = ctxt_tidbase; i < maxtid; i++) {
+ struct page *p = dd->pageshadow[i];
+ dma_addr_t phys;
+
+ if (!p)
+ continue;
+
+ phys = dd->physshadow[i];
+ dd->physshadow[i] = dd->tidinvalid;
+ dd->pageshadow[i] = NULL;
+ pci_unmap_page(dd->pcidev, phys, PAGE_SIZE,
+ PCI_DMA_FROMDEVICE);
+ qib_release_user_pages(&p, 1);
+ cnt++;
+ }
+}
+
+static int qib_close(struct inode *in, struct file *fp)
+{
+ int ret = 0;
+ struct qib_filedata *fd;
+ struct qib_ctxtdata *rcd;
+ struct qib_devdata *dd;
+ unsigned long flags;
+ unsigned ctxt;
+ pid_t pid;
+
+ mutex_lock(&qib_mutex);
+
+ fd = fp->private_data;
+ fp->private_data = NULL;
+ rcd = fd->rcd;
+ if (!rcd) {
+ mutex_unlock(&qib_mutex);
+ goto bail;
+ }
+
+ dd = rcd->dd;
+
+ /* ensure all pio buffer writes in progress are flushed */
+ qib_flush_wc();
+
+ /* drain user sdma queue */
+ if (fd->pq) {
+ qib_user_sdma_queue_drain(rcd->ppd, fd->pq);
+ qib_user_sdma_queue_destroy(fd->pq);
+ }
+
+ if (fd->rec_cpu_num != -1)
+ __clear_bit(fd->rec_cpu_num, qib_cpulist);
+
+ if (--rcd->cnt) {
+ /*
+ * XXX If the master closes the context before the slave(s),
+ * revoke the mmap for the eager receive queue so
+ * the slave(s) don't wait for receive data forever.
+ */
+ rcd->active_slaves &= ~(1 << fd->subctxt);
+ rcd->subpid[fd->subctxt] = 0;
+ mutex_unlock(&qib_mutex);
+ goto bail;
+ }
+
+ /* early; no interrupt users after this */
+ spin_lock_irqsave(&dd->uctxt_lock, flags);
+ ctxt = rcd->ctxt;
+ dd->rcd[ctxt] = NULL;
+ pid = rcd->pid;
+ rcd->pid = 0;
+ spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+
+ if (rcd->rcvwait_to || rcd->piowait_to ||
+ rcd->rcvnowait || rcd->pionowait) {
+ rcd->rcvwait_to = 0;
+ rcd->piowait_to = 0;
+ rcd->rcvnowait = 0;
+ rcd->pionowait = 0;
+ }
+ if (rcd->flag)
+ rcd->flag = 0;
+
+ if (dd->kregbase) {
+ /* atomically clear receive enable ctxt and intr avail. */
+ dd->f_rcvctrl(rcd->ppd, QIB_RCVCTRL_CTXT_DIS |
+ QIB_RCVCTRL_INTRAVAIL_DIS, ctxt);
+
+ /* clean up the pkeys for this ctxt user */
+ qib_clean_part_key(rcd, dd);
+ qib_disarm_piobufs(dd, rcd->pio_base, rcd->piocnt);
+ qib_chg_pioavailkernel(dd, rcd->pio_base,
+ rcd->piocnt, TXCHK_CHG_TYPE_KERN, NULL);
+
+ dd->f_clear_tids(dd, rcd);
+
+ if (dd->pageshadow)
+ unlock_expected_tids(rcd);
+ qib_stats.sps_ctxts--;
+ dd->freectxts++;
+ }
+
+ mutex_unlock(&qib_mutex);
+ qib_free_ctxtdata(dd, rcd); /* after releasing the mutex */
+
+bail:
+ kfree(fd);
+ return ret;
+}
+
+static int qib_ctxt_info(struct file *fp, struct qib_ctxt_info __user *uinfo)
+{
+ struct qib_ctxt_info info;
+ int ret;
+ size_t sz;
+ struct qib_ctxtdata *rcd = ctxt_fp(fp);
+ struct qib_filedata *fd;
+
+ fd = fp->private_data;
+
+ info.num_active = qib_count_active_units();
+ info.unit = rcd->dd->unit;
+ info.port = rcd->ppd->port;
+ info.ctxt = rcd->ctxt;
+ info.subctxt = subctxt_fp(fp);
+ /* Number of user ctxts available for this device. */
+ info.num_ctxts = rcd->dd->cfgctxts - rcd->dd->first_user_ctxt;
+ info.num_subctxts = rcd->subctxt_cnt;
+ info.rec_cpu = fd->rec_cpu_num;
+ sz = sizeof(info);
+
+ if (copy_to_user(uinfo, &info, sz)) {
+ ret = -EFAULT;
+ goto bail;
+ }
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+static int qib_sdma_get_inflight(struct qib_user_sdma_queue *pq,
+ u32 __user *inflightp)
+{
+ const u32 val = qib_user_sdma_inflight_counter(pq);
+
+ if (put_user(val, inflightp))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int qib_sdma_get_complete(struct qib_pportdata *ppd,
+ struct qib_user_sdma_queue *pq,
+ u32 __user *completep)
+{
+ u32 val;
+ int err;
+
+ if (!pq)
+ return -EINVAL;
+
+ err = qib_user_sdma_make_progress(ppd, pq);
+ if (err < 0)
+ return err;
+
+ val = qib_user_sdma_complete_counter(pq);
+ if (put_user(val, completep))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int disarm_req_delay(struct qib_ctxtdata *rcd)
+{
+ int ret = 0;
+
+ if (!usable(rcd->ppd)) {
+ int i;
+ /*
+ * If the link is down, or otherwise not usable, delay
+ * the caller up to 30 seconds, so we don't thrash
+ * trying to get the chip back to ACTIVE, and set a
+ * flag so they make the call again.
+ */
+ if (rcd->user_event_mask) {
+ /*
+ * subctxt_cnt is 0 if not shared, so do base
+ * separately, first, then remaining subctxt, if any
+ */
+ set_bit(_QIB_EVENT_DISARM_BUFS_BIT,
+ &rcd->user_event_mask[0]);
+ for (i = 1; i < rcd->subctxt_cnt; i++)
+ set_bit(_QIB_EVENT_DISARM_BUFS_BIT,
+ &rcd->user_event_mask[i]);
+ }
+ for (i = 0; !usable(rcd->ppd) && i < 300; i++)
+ msleep(100);
+ ret = -ENETDOWN;
+ }
+ return ret;
+}
+
+/*
+ * Find all user contexts in use, and set the specified bit in their
+ * event mask.
+ * See also find_ctxt() for a similar use, that is specific to send buffers.
+ */
+int qib_set_uevent_bits(struct qib_pportdata *ppd, const int evtbit)
+{
+ struct qib_ctxtdata *rcd;
+ unsigned ctxt;
+ int ret = 0;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ppd->dd->uctxt_lock, flags);
+ for (ctxt = ppd->dd->first_user_ctxt; ctxt < ppd->dd->cfgctxts;
+ ctxt++) {
+ rcd = ppd->dd->rcd[ctxt];
+ if (!rcd)
+ continue;
+ if (rcd->user_event_mask) {
+ int i;
+ /*
+ * subctxt_cnt is 0 if not shared, so do base
+ * separately, first, then remaining subctxt, if any
+ */
+ set_bit(evtbit, &rcd->user_event_mask[0]);
+ for (i = 1; i < rcd->subctxt_cnt; i++)
+ set_bit(evtbit, &rcd->user_event_mask[i]);
+ }
+ ret = 1;
+ break;
+ }
+ spin_unlock_irqrestore(&ppd->dd->uctxt_lock, flags);
+
+ return ret;
+}
+
+/*
+ * Clear the event notifier events for this context.
+ * For the DISARM_BUFS case, we also take action (this obsoletes
+ * the older QIB_CMD_DISARM_BUFS, but we keep it for backwards
+ * compatibility).
+ * Other bits don't currently require actions, just atomically clear.
+ * User process then performs actions appropriate to bit having been
+ * set, if desired, and checks again in future.
+ */
+static int qib_user_event_ack(struct qib_ctxtdata *rcd, int subctxt,
+ unsigned long events)
+{
+ int ret = 0, i;
+
+ for (i = 0; i <= _QIB_MAX_EVENT_BIT; i++) {
+ if (!test_bit(i, &events))
+ continue;
+ if (i == _QIB_EVENT_DISARM_BUFS_BIT) {
+ (void)qib_disarm_piobufs_ifneeded(rcd);
+ ret = disarm_req_delay(rcd);
+ } else
+ clear_bit(i, &rcd->user_event_mask[subctxt]);
+ }
+ return ret;
+}
+
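+/*
+ * User commands arrive through write() as a struct qib_cmd: the type
+ * field is copied in first, then only the payload member that type
+ * requires. Every command except QIB_CMD_ASSIGN_CTXT needs a context
+ * already assigned to the file. On success, the number of bytes
+ * consumed is returned.
+ */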
+static ssize_t qib_write(struct file *fp, const char __user *data,
+ size_t count, loff_t *off)
+{
+ const struct qib_cmd __user *ucmd;
+ struct qib_ctxtdata *rcd;
+ const void __user *src;
+ size_t consumed, copy = 0;
+ struct qib_cmd cmd;
+ ssize_t ret = 0;
+ void *dest;
+
+ if (count < sizeof(cmd.type)) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ ucmd = (const struct qib_cmd __user *) data;
+
+ if (copy_from_user(&cmd.type, &ucmd->type, sizeof(cmd.type))) {
+ ret = -EFAULT;
+ goto bail;
+ }
+
+ consumed = sizeof(cmd.type);
+
+ switch (cmd.type) {
+ case QIB_CMD_ASSIGN_CTXT:
+ case QIB_CMD_USER_INIT:
+ copy = sizeof(cmd.cmd.user_info);
+ dest = &cmd.cmd.user_info;
+ src = &ucmd->cmd.user_info;
+ break;
+
+ case QIB_CMD_RECV_CTRL:
+ copy = sizeof(cmd.cmd.recv_ctrl);
+ dest = &cmd.cmd.recv_ctrl;
+ src = &ucmd->cmd.recv_ctrl;
+ break;
+
+ case QIB_CMD_CTXT_INFO:
+ copy = sizeof(cmd.cmd.ctxt_info);
+ dest = &cmd.cmd.ctxt_info;
+ src = &ucmd->cmd.ctxt_info;
+ break;
+
+ case QIB_CMD_TID_UPDATE:
+ case QIB_CMD_TID_FREE:
+ copy = sizeof(cmd.cmd.tid_info);
+ dest = &cmd.cmd.tid_info;
+ src = &ucmd->cmd.tid_info;
+ break;
+
+ case QIB_CMD_SET_PART_KEY:
+ copy = sizeof(cmd.cmd.part_key);
+ dest = &cmd.cmd.part_key;
+ src = &ucmd->cmd.part_key;
+ break;
+
+ case QIB_CMD_DISARM_BUFS:
+ case QIB_CMD_PIOAVAILUPD: /* force an update of PIOAvail reg */
+ copy = 0;
+ src = NULL;
+ dest = NULL;
+ break;
+
+ case QIB_CMD_POLL_TYPE:
+ copy = sizeof(cmd.cmd.poll_type);
+ dest = &cmd.cmd.poll_type;
+ src = &ucmd->cmd.poll_type;
+ break;
+
+ case QIB_CMD_ARMLAUNCH_CTRL:
+ copy = sizeof(cmd.cmd.armlaunch_ctrl);
+ dest = &cmd.cmd.armlaunch_ctrl;
+ src = &ucmd->cmd.armlaunch_ctrl;
+ break;
+
+ case QIB_CMD_SDMA_INFLIGHT:
+ copy = sizeof(cmd.cmd.sdma_inflight);
+ dest = &cmd.cmd.sdma_inflight;
+ src = &ucmd->cmd.sdma_inflight;
+ break;
+
+ case QIB_CMD_SDMA_COMPLETE:
+ copy = sizeof(cmd.cmd.sdma_complete);
+ dest = &cmd.cmd.sdma_complete;
+ src = &ucmd->cmd.sdma_complete;
+ break;
+
+ case QIB_CMD_ACK_EVENT:
+ copy = sizeof(cmd.cmd.event_mask);
+ dest = &cmd.cmd.event_mask;
+ src = &ucmd->cmd.event_mask;
+ break;
+
+ default:
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ if (copy) {
+ if ((count - consumed) < copy) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ if (copy_from_user(dest, src, copy)) {
+ ret = -EFAULT;
+ goto bail;
+ }
+ consumed += copy;
+ }
+
+ rcd = ctxt_fp(fp);
+ if (!rcd && cmd.type != QIB_CMD_ASSIGN_CTXT) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ switch (cmd.type) {
+ case QIB_CMD_ASSIGN_CTXT:
+ ret = qib_assign_ctxt(fp, &cmd.cmd.user_info);
+ if (ret)
+ goto bail;
+ break;
+
+ case QIB_CMD_USER_INIT:
+ ret = qib_do_user_init(fp, &cmd.cmd.user_info);
+ if (ret)
+ goto bail;
+ ret = qib_get_base_info(fp, (void __user *) (unsigned long)
+ cmd.cmd.user_info.spu_base_info,
+ cmd.cmd.user_info.spu_base_info_size);
+ break;
+
+ case QIB_CMD_RECV_CTRL:
+ ret = qib_manage_rcvq(rcd, subctxt_fp(fp), cmd.cmd.recv_ctrl);
+ break;
+
+ case QIB_CMD_CTXT_INFO:
+ ret = qib_ctxt_info(fp, (struct qib_ctxt_info __user *)
+ (unsigned long) cmd.cmd.ctxt_info);
+ break;
+
+ case QIB_CMD_TID_UPDATE:
+ ret = qib_tid_update(rcd, fp, &cmd.cmd.tid_info);
+ break;
+
+ case QIB_CMD_TID_FREE:
+ ret = qib_tid_free(rcd, subctxt_fp(fp), &cmd.cmd.tid_info);
+ break;
+
+ case QIB_CMD_SET_PART_KEY:
+ ret = qib_set_part_key(rcd, cmd.cmd.part_key);
+ break;
+
+ case QIB_CMD_DISARM_BUFS:
+ (void)qib_disarm_piobufs_ifneeded(rcd);
+ ret = disarm_req_delay(rcd);
+ break;
+
+ case QIB_CMD_PIOAVAILUPD:
+ qib_force_pio_avail_update(rcd->dd);
+ break;
+
+ case QIB_CMD_POLL_TYPE:
+ rcd->poll_type = cmd.cmd.poll_type;
+ break;
+
+ case QIB_CMD_ARMLAUNCH_CTRL:
+ rcd->dd->f_set_armlaunch(rcd->dd, cmd.cmd.armlaunch_ctrl);
+ break;
+
+ case QIB_CMD_SDMA_INFLIGHT:
+ ret = qib_sdma_get_inflight(user_sdma_queue_fp(fp),
+ (u32 __user *) (unsigned long)
+ cmd.cmd.sdma_inflight);
+ break;
+
+ case QIB_CMD_SDMA_COMPLETE:
+ ret = qib_sdma_get_complete(rcd->ppd,
+ user_sdma_queue_fp(fp),
+ (u32 __user *) (unsigned long)
+ cmd.cmd.sdma_complete);
+ break;
+
+ case QIB_CMD_ACK_EVENT:
+ ret = qib_user_event_ack(rcd, subctxt_fp(fp),
+ cmd.cmd.event_mask);
+ break;
+ }
+
+ if (ret >= 0)
+ ret = consumed;
+
+bail:
+ return ret;
+}
+
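+/*
+ * writev()/aio path: the iovec is handed straight to the user SDMA
+ * engine; only plain iovec iterators with a queue already created are
+ * accepted.
+ */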
+static ssize_t qib_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct qib_filedata *fp = iocb->ki_filp->private_data;
+ struct qib_ctxtdata *rcd = ctxt_fp(iocb->ki_filp);
+ struct qib_user_sdma_queue *pq = fp->pq;
+
+ if (!iter_is_iovec(from) || !from->nr_segs || !pq)
+ return -EINVAL;
+
+ return qib_user_sdma_writev(rcd, pq, from->iov, from->nr_segs);
+}
+
+static struct class *qib_class;
+static dev_t qib_dev;
+
+int qib_cdev_init(int minor, const char *name,
+ const struct file_operations *fops,
+ struct cdev **cdevp, struct device **devp)
+{
+ const dev_t dev = MKDEV(MAJOR(qib_dev), minor);
+ struct cdev *cdev;
+ struct device *device = NULL;
+ int ret;
+
+ cdev = cdev_alloc();
+ if (!cdev) {
+ pr_err("Could not allocate cdev for minor %d, %s\n",
+ minor, name);
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ cdev->owner = THIS_MODULE;
+ cdev->ops = fops;
+ kobject_set_name(&cdev->kobj, name);
+
+ ret = cdev_add(cdev, dev, 1);
+ if (ret < 0) {
+ pr_err("Could not add cdev for minor %d, %s (err %d)\n",
+ minor, name, -ret);
+ goto err_cdev;
+ }
+
+ device = device_create(qib_class, NULL, dev, NULL, "%s", name);
+ if (!IS_ERR(device))
+ goto done;
+ ret = PTR_ERR(device);
+ device = NULL;
+ pr_err("Could not create device for minor %d, %s (err %d)\n",
+ minor, name, -ret);
+err_cdev:
+ cdev_del(cdev);
+ cdev = NULL;
+done:
+ *cdevp = cdev;
+ *devp = device;
+ return ret;
+}
+
+void qib_cdev_cleanup(struct cdev **cdevp, struct device **devp)
+{
+ struct device *device = *devp;
+
+ if (device) {
+ device_unregister(device);
+ *devp = NULL;
+ }
+
+ if (*cdevp) {
+ cdev_del(*cdevp);
+ *cdevp = NULL;
+ }
+}
+
+static struct cdev *wildcard_cdev;
+static struct device *wildcard_device;
+
+int __init qib_dev_init(void)
+{
+ int ret;
+
+ ret = alloc_chrdev_region(&qib_dev, 0, QIB_NMINORS, QIB_DRV_NAME);
+ if (ret < 0) {
+ pr_err("Could not allocate chrdev region (err %d)\n", -ret);
+ goto done;
+ }
+
+ qib_class = class_create(THIS_MODULE, "ipath");
+ if (IS_ERR(qib_class)) {
+ ret = PTR_ERR(qib_class);
+ pr_err("Could not create device class (err %d)\n", -ret);
+ unregister_chrdev_region(qib_dev, QIB_NMINORS);
+ }
+
+done:
+ return ret;
+}
+
+void qib_dev_cleanup(void)
+{
+ if (qib_class) {
+ class_destroy(qib_class);
+ qib_class = NULL;
+ }
+
+ unregister_chrdev_region(qib_dev, QIB_NMINORS);
+}
+
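+/*
+ * Char device layout: the first unit also creates the wildcard "ipath"
+ * device at minor 0; each unit N then gets minor N + 1 as "ipathN".
+ * The wildcard is removed again when the last unit goes away.
+ */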
+static atomic_t user_count = ATOMIC_INIT(0);
+
+static void qib_user_remove(struct qib_devdata *dd)
+{
+ if (atomic_dec_return(&user_count) == 0)
+ qib_cdev_cleanup(&wildcard_cdev, &wildcard_device);
+
+ qib_cdev_cleanup(&dd->user_cdev, &dd->user_device);
+}
+
+static int qib_user_add(struct qib_devdata *dd)
+{
+ char name[10];
+ int ret;
+
+ if (atomic_inc_return(&user_count) == 1) {
+ ret = qib_cdev_init(0, "ipath", &qib_file_ops,
+ &wildcard_cdev, &wildcard_device);
+ if (ret)
+ goto done;
+ }
+
+ snprintf(name, sizeof(name), "ipath%d", dd->unit);
+ ret = qib_cdev_init(dd->unit + 1, name, &qib_file_ops,
+ &dd->user_cdev, &dd->user_device);
+ if (ret)
+ qib_user_remove(dd);
+done:
+ return ret;
+}
+
+/*
+ * Create per-unit files in /dev
+ */
+int qib_device_create(struct qib_devdata *dd)
+{
+ int r, ret;
+
+ r = qib_user_add(dd);
+ ret = qib_diag_add(dd);
+ if (r && !ret)
+ ret = r;
+ return ret;
+}
+
+/*
+ * Remove per-unit files in /dev
+ * Returns void; the core kernel returns no errors for this stuff.
+ */
+void qib_device_remove(struct qib_devdata *dd)
+{
+ qib_user_remove(dd);
+ qib_diag_remove(dd);
+}
diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c
new file mode 100644
index 000000000..bdd5d3857
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_fs.c
@@ -0,0 +1,621 @@
+/*
+ * Copyright (c) 2012 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/pagemap.h>
+#include <linux/init.h>
+#include <linux/namei.h>
+
+#include "qib.h"
+
+#define QIBFS_MAGIC 0x726a77
+
+static struct super_block *qib_super;
+
+#define private2dd(file) (file_inode(file)->i_private)
+
+static int qibfs_mknod(struct inode *dir, struct dentry *dentry,
+ umode_t mode, const struct file_operations *fops,
+ void *data)
+{
+ int error;
+ struct inode *inode = new_inode(dir->i_sb);
+
+ if (!inode) {
+ error = -EPERM;
+ goto bail;
+ }
+
+ inode->i_ino = get_next_ino();
+ inode->i_mode = mode;
+ inode->i_uid = GLOBAL_ROOT_UID;
+ inode->i_gid = GLOBAL_ROOT_GID;
+ inode->i_blocks = 0;
+ inode->i_atime = CURRENT_TIME;
+ inode->i_mtime = inode->i_atime;
+ inode->i_ctime = inode->i_atime;
+ inode->i_private = data;
+ if (S_ISDIR(mode)) {
+ inode->i_op = &simple_dir_inode_operations;
+ inc_nlink(inode);
+ inc_nlink(dir);
+ }
+
+ inode->i_fop = fops;
+
+ d_instantiate(dentry, inode);
+ error = 0;
+
+bail:
+ return error;
+}
+
+static int create_file(const char *name, umode_t mode,
+ struct dentry *parent, struct dentry **dentry,
+ const struct file_operations *fops, void *data)
+{
+ int error;
+
+ mutex_lock(&d_inode(parent)->i_mutex);
+ *dentry = lookup_one_len(name, parent, strlen(name));
+ if (!IS_ERR(*dentry))
+ error = qibfs_mknod(d_inode(parent), *dentry,
+ mode, fops, data);
+ else
+ error = PTR_ERR(*dentry);
+ mutex_unlock(&d_inode(parent)->i_mutex);
+
+ return error;
+}
+
+static ssize_t driver_stats_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ qib_stats.sps_ints = qib_sps_ints();
+ return simple_read_from_buffer(buf, count, ppos, &qib_stats,
+ sizeof(qib_stats));
+}
+
+/*
+ * driver stats field names, one line per stat, single string. Used by
+ * programs like ipathstats to print the stats in a way which works for
+ * different versions of drivers, without changing program source.
+ * If qlogic_ib_stats changes, this needs to change too. Names need to be
+ * 12 chars or less (w/o newline), for proper display by the ipathstats utility.
+ */
+static const char qib_statnames[] =
+ "KernIntr\n"
+ "ErrorIntr\n"
+ "Tx_Errs\n"
+ "Rcv_Errs\n"
+ "H/W_Errs\n"
+ "NoPIOBufs\n"
+ "CtxtsOpen\n"
+ "RcvLen_Errs\n"
+ "EgrBufFull\n"
+ "EgrHdrFull\n"
+ ;
+
+static ssize_t driver_names_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ return simple_read_from_buffer(buf, count, ppos, qib_statnames,
+ sizeof(qib_statnames) - 1); /* no null */
+}
+
+static const struct file_operations driver_ops[] = {
+ { .read = driver_stats_read, .llseek = generic_file_llseek, },
+ { .read = driver_names_read, .llseek = generic_file_llseek, },
+};
+
+/* read the per-device counters */
+static ssize_t dev_counters_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ u64 *counters;
+ size_t avail;
+ struct qib_devdata *dd = private2dd(file);
+
+ avail = dd->f_read_cntrs(dd, *ppos, NULL, &counters);
+ return simple_read_from_buffer(buf, count, ppos, counters, avail);
+}
+
+/* read the per-device counters */
+static ssize_t dev_names_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ char *names;
+ size_t avail;
+ struct qib_devdata *dd = private2dd(file);
+
+ avail = dd->f_read_cntrs(dd, *ppos, &names, NULL);
+ return simple_read_from_buffer(buf, count, ppos, names, avail);
+}
+
+static const struct file_operations cntr_ops[] = {
+ { .read = dev_counters_read, .llseek = generic_file_llseek, },
+ { .read = dev_names_read, .llseek = generic_file_llseek, },
+};
+
+/*
+ * Could use file_inode(file)->i_ino to figure out which file,
+ * instead of separate routine for each, but for now, this works...
+ */
+
+/* read the per-port names (same for each port) */
+static ssize_t portnames_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ char *names;
+ size_t avail;
+ struct qib_devdata *dd = private2dd(file);
+
+ avail = dd->f_read_portcntrs(dd, *ppos, 0, &names, NULL);
+ return simple_read_from_buffer(buf, count, ppos, names, avail);
+}
+
+/* read the per-port counters for port 1 (pidx 0) */
+static ssize_t portcntrs_1_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ u64 *counters;
+ size_t avail;
+ struct qib_devdata *dd = private2dd(file);
+
+ avail = dd->f_read_portcntrs(dd, *ppos, 0, NULL, &counters);
+ return simple_read_from_buffer(buf, count, ppos, counters, avail);
+}
+
+/* read the per-port counters for port 2 (pidx 1) */
+static ssize_t portcntrs_2_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ u64 *counters;
+ size_t avail;
+ struct qib_devdata *dd = private2dd(file);
+
+ avail = dd->f_read_portcntrs(dd, *ppos, 1, NULL, &counters);
+ return simple_read_from_buffer(buf, count, ppos, counters, avail);
+}
+
+static const struct file_operations portcntr_ops[] = {
+ { .read = portnames_read, .llseek = generic_file_llseek, },
+ { .read = portcntrs_1_read, .llseek = generic_file_llseek, },
+ { .read = portcntrs_2_read, .llseek = generic_file_llseek, },
+};
+
+/*
+ * read the per-port QSFP data for port 1 (pidx 0)
+ */
+static ssize_t qsfp_1_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct qib_devdata *dd = private2dd(file);
+ char *tmp;
+ int ret;
+
+ tmp = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+
+ ret = qib_qsfp_dump(dd->pport, tmp, PAGE_SIZE);
+ if (ret > 0)
+ ret = simple_read_from_buffer(buf, count, ppos, tmp, ret);
+ kfree(tmp);
+ return ret;
+}
+
+/*
+ * read the per-port QSFP data for port 2 (pidx 1)
+ */
+static ssize_t qsfp_2_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct qib_devdata *dd = private2dd(file);
+ char *tmp;
+ int ret;
+
+ if (dd->num_pports < 2)
+ return -ENODEV;
+
+ tmp = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (!tmp)
+ return -ENOMEM;
+
+ ret = qib_qsfp_dump(dd->pport + 1, tmp, PAGE_SIZE);
+ if (ret > 0)
+ ret = simple_read_from_buffer(buf, count, ppos, tmp, ret);
+ kfree(tmp);
+ return ret;
+}
+
+static const struct file_operations qsfp_ops[] = {
+ { .read = qsfp_1_read, .llseek = generic_file_llseek, },
+ { .read = qsfp_2_read, .llseek = generic_file_llseek, },
+};
+
+static ssize_t flash_read(struct file *file, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct qib_devdata *dd;
+ ssize_t ret;
+ loff_t pos;
+ char *tmp;
+
+ pos = *ppos;
+
+ if (pos < 0) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ if (pos >= sizeof(struct qib_flash)) {
+ ret = 0;
+ goto bail;
+ }
+
+ if (count > sizeof(struct qib_flash) - pos)
+ count = sizeof(struct qib_flash) - pos;
+
+ tmp = kmalloc(count, GFP_KERNEL);
+ if (!tmp) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ dd = private2dd(file);
+ if (qib_eeprom_read(dd, pos, tmp, count)) {
+ qib_dev_err(dd, "failed to read from flash\n");
+ ret = -ENXIO;
+ goto bail_tmp;
+ }
+
+ if (copy_to_user(buf, tmp, count)) {
+ ret = -EFAULT;
+ goto bail_tmp;
+ }
+
+ *ppos = pos + count;
+ ret = count;
+
+bail_tmp:
+ kfree(tmp);
+
+bail:
+ return ret;
+}
+
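+/*
+ * Writes must replace the whole EEPROM image: exactly
+ * sizeof(struct qib_flash) bytes at offset 0.
+ */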
+static ssize_t flash_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct qib_devdata *dd;
+ ssize_t ret;
+ loff_t pos;
+ char *tmp;
+
+ pos = *ppos;
+
+ if (pos != 0) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ if (count != sizeof(struct qib_flash)) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ tmp = kmalloc(count, GFP_KERNEL);
+ if (!tmp) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ if (copy_from_user(tmp, buf, count)) {
+ ret = -EFAULT;
+ goto bail_tmp;
+ }
+
+ dd = private2dd(file);
+ if (qib_eeprom_write(dd, pos, tmp, count)) {
+ ret = -ENXIO;
+ qib_dev_err(dd, "failed to write to flash\n");
+ goto bail_tmp;
+ }
+
+ *ppos = pos + count;
+ ret = count;
+
+bail_tmp:
+ kfree(tmp);
+
+bail:
+ return ret;
+}
+
+static const struct file_operations flash_ops = {
+ .read = flash_read,
+ .write = flash_write,
+ .llseek = default_llseek,
+};
+
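+/*
+ * Populate a per-unit directory with counters, counter_names,
+ * portcounter_names, a portNcounters file per port, a qsfpN file per
+ * port on QSFP-capable boards, and the writable flash file.
+ */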
+static int add_cntr_files(struct super_block *sb, struct qib_devdata *dd)
+{
+ struct dentry *dir, *tmp;
+ char unit[10];
+ int ret, i;
+
+ /* create the per-unit directory */
+ snprintf(unit, sizeof(unit), "%u", dd->unit);
+ ret = create_file(unit, S_IFDIR|S_IRUGO|S_IXUGO, sb->s_root, &dir,
+ &simple_dir_operations, dd);
+ if (ret) {
+ pr_err("create_file(%s) failed: %d\n", unit, ret);
+ goto bail;
+ }
+
+ /* create the files in the new directory */
+ ret = create_file("counters", S_IFREG|S_IRUGO, dir, &tmp,
+ &cntr_ops[0], dd);
+ if (ret) {
+ pr_err("create_file(%s/counters) failed: %d\n",
+ unit, ret);
+ goto bail;
+ }
+ ret = create_file("counter_names", S_IFREG|S_IRUGO, dir, &tmp,
+ &cntr_ops[1], dd);
+ if (ret) {
+ pr_err("create_file(%s/counter_names) failed: %d\n",
+ unit, ret);
+ goto bail;
+ }
+ ret = create_file("portcounter_names", S_IFREG|S_IRUGO, dir, &tmp,
+ &portcntr_ops[0], dd);
+ if (ret) {
+ pr_err("create_file(%s/%s) failed: %d\n",
+ unit, "portcounter_names", ret);
+ goto bail;
+ }
+ for (i = 1; i <= dd->num_pports; i++) {
+ char fname[24];
+
+ sprintf(fname, "port%dcounters", i);
+ /* create the files in the new directory */
+ ret = create_file(fname, S_IFREG|S_IRUGO, dir, &tmp,
+ &portcntr_ops[i], dd);
+ if (ret) {
+ pr_err("create_file(%s/%s) failed: %d\n",
+ unit, fname, ret);
+ goto bail;
+ }
+ if (!(dd->flags & QIB_HAS_QSFP))
+ continue;
+ sprintf(fname, "qsfp%d", i);
+ ret = create_file(fname, S_IFREG|S_IRUGO, dir, &tmp,
+ &qsfp_ops[i - 1], dd);
+ if (ret) {
+ pr_err("create_file(%s/%s) failed: %d\n",
+ unit, fname, ret);
+ goto bail;
+ }
+ }
+
+ ret = create_file("flash", S_IFREG|S_IWUSR|S_IRUGO, dir, &tmp,
+ &flash_ops, dd);
+ if (ret)
+ pr_err("create_file(%s/flash) failed: %d\n",
+ unit, ret);
+bail:
+ return ret;
+}
+
+static int remove_file(struct dentry *parent, char *name)
+{
+ struct dentry *tmp;
+ int ret;
+
+ tmp = lookup_one_len(name, parent, strlen(name));
+
+ if (IS_ERR(tmp)) {
+ ret = PTR_ERR(tmp);
+ goto bail;
+ }
+
+ spin_lock(&tmp->d_lock);
+ if (!d_unhashed(tmp) && d_really_is_positive(tmp)) {
+ __d_drop(tmp);
+ spin_unlock(&tmp->d_lock);
+ simple_unlink(d_inode(parent), tmp);
+ } else {
+ spin_unlock(&tmp->d_lock);
+ }
+ dput(tmp);
+
+ ret = 0;
+bail:
+ /*
+ * We don't expect clients to care about the return value, but
+ * it's there if they need it.
+ */
+ return ret;
+}
+
+static int remove_device_files(struct super_block *sb,
+ struct qib_devdata *dd)
+{
+ struct dentry *dir, *root;
+ char unit[10];
+ int ret, i;
+
+ root = dget(sb->s_root);
+ mutex_lock(&d_inode(root)->i_mutex);
+ snprintf(unit, sizeof(unit), "%u", dd->unit);
+ dir = lookup_one_len(unit, root, strlen(unit));
+
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
+ pr_err("Lookup of %s failed\n", unit);
+ goto bail;
+ }
+
+ mutex_lock(&d_inode(dir)->i_mutex);
+ remove_file(dir, "counters");
+ remove_file(dir, "counter_names");
+ remove_file(dir, "portcounter_names");
+ for (i = 0; i < dd->num_pports; i++) {
+ char fname[24];
+
+ sprintf(fname, "port%dcounters", i + 1);
+ remove_file(dir, fname);
+ if (dd->flags & QIB_HAS_QSFP) {
+ sprintf(fname, "qsfp%d", i + 1);
+ remove_file(dir, fname);
+ }
+ }
+ remove_file(dir, "flash");
+ mutex_unlock(&d_inode(dir)->i_mutex);
+ ret = simple_rmdir(d_inode(root), dir);
+ d_delete(dir);
+ dput(dir);
+
+bail:
+ mutex_unlock(&d_inode(root)->i_mutex);
+ dput(root);
+ return ret;
+}
+
+/*
+ * This fills everything in when the fs is mounted, to handle umount/mount
+ * after device init. The direct add_cntr_files() call handles adding
+ * them from the init code, when the fs is already mounted.
+ */
+static int qibfs_fill_super(struct super_block *sb, void *data, int silent)
+{
+ struct qib_devdata *dd, *tmp;
+ unsigned long flags;
+ int ret;
+
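+ /*
+ * Entries in files[] are indexed by inode number; simple_fill_super()
+ * creates the root directory at inode 1, so the named entries start
+ * at index 2.
+ */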
+ static struct tree_descr files[] = {
+ [2] = {"driver_stats", &driver_ops[0], S_IRUGO},
+ [3] = {"driver_stats_names", &driver_ops[1], S_IRUGO},
+ {""},
+ };
+
+ ret = simple_fill_super(sb, QIBFS_MAGIC, files);
+ if (ret) {
+ pr_err("simple_fill_super failed: %d\n", ret);
+ goto bail;
+ }
+
+ spin_lock_irqsave(&qib_devs_lock, flags);
+
+ list_for_each_entry_safe(dd, tmp, &qib_dev_list, list) {
+ spin_unlock_irqrestore(&qib_devs_lock, flags);
+ ret = add_cntr_files(sb, dd);
+ if (ret)
+ goto bail;
+ spin_lock_irqsave(&qib_devs_lock, flags);
+ }
+
+ spin_unlock_irqrestore(&qib_devs_lock, flags);
+
+bail:
+ return ret;
+}
+
+static struct dentry *qibfs_mount(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data)
+{
+ struct dentry *ret;
+
+ ret = mount_single(fs_type, flags, data, qibfs_fill_super);
+ if (!IS_ERR(ret))
+ qib_super = ret->d_sb;
+ return ret;
+}
+
+static void qibfs_kill_super(struct super_block *s)
+{
+ kill_litter_super(s);
+ qib_super = NULL;
+}
+
+int qibfs_add(struct qib_devdata *dd)
+{
+ int ret;
+
+ /*
+ * When the first unit is initialized, qib_super will not yet exist
+ * because nobody has yet tried to mount the filesystem, so we
+ * can't consider that an error; if an error occurs during the
+ * mount, that will produce a complaint, so this is OK.
+ * add_cntr_files() for all units is done at mount from
+ * qibfs_fill_super(), so one way or another, everything works.
+ */
+ if (qib_super == NULL)
+ ret = 0;
+ else
+ ret = add_cntr_files(qib_super, dd);
+ return ret;
+}
+
+int qibfs_remove(struct qib_devdata *dd)
+{
+ int ret = 0;
+
+ if (qib_super)
+ ret = remove_device_files(qib_super, dd);
+
+ return ret;
+}
+
+static struct file_system_type qibfs_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "ipathfs",
+ .mount = qibfs_mount,
+ .kill_sb = qibfs_kill_super,
+};
+MODULE_ALIAS_FS("ipathfs");
+
+int __init qib_init_qibfs(void)
+{
+ return register_filesystem(&qibfs_fs_type);
+}
+
+int __exit qib_exit_qibfs(void)
+{
+ return unregister_filesystem(&qibfs_fs_type);
+}
diff --git a/drivers/infiniband/hw/qib/qib_iba6120.c b/drivers/infiniband/hw/qib/qib_iba6120.c
new file mode 100644
index 000000000..4b927809d
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_iba6120.c
@@ -0,0 +1,3600 @@
+/*
+ * Copyright (c) 2013 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation.
+ * All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * This file contains all of the code that is specific to the
+ * QLogic_IB 6120 PCIe chip.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <rdma/ib_verbs.h>
+
+#include "qib.h"
+#include "qib_6120_regs.h"
+
+static void qib_6120_setup_setextled(struct qib_pportdata *, u32);
+static void sendctrl_6120_mod(struct qib_pportdata *ppd, u32 op);
+static u8 qib_6120_phys_portstate(u64);
+static u32 qib_6120_iblink_state(u64);
+
+/*
+ * This file contains all the chip-specific register information and
+ * access functions for the QLogic_IB 6120 PCI-Express chip.
+ */
+
+/* KREG_IDX uses machine-generated #defines */
+#define KREG_IDX(regname) (QIB_6120_##regname##_OFFS / sizeof(u64))
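+/*
+ * For example, kr_scratch == KREG_IDX(Scratch) expands to
+ * QIB_6120_Scratch_OFFS / sizeof(u64), i.e. the 64-bit word index of
+ * the Scratch register within the kregbase mapping, so it can be read
+ * with qib_read_kreg32(dd, kr_scratch) or qib_read_kreg64() below.
+ */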
+
+/* Use defines to tie machine-generated names to lower-case names */
+#define kr_extctrl KREG_IDX(EXTCtrl)
+#define kr_extstatus KREG_IDX(EXTStatus)
+#define kr_gpio_clear KREG_IDX(GPIOClear)
+#define kr_gpio_mask KREG_IDX(GPIOMask)
+#define kr_gpio_out KREG_IDX(GPIOOut)
+#define kr_gpio_status KREG_IDX(GPIOStatus)
+#define kr_rcvctrl KREG_IDX(RcvCtrl)
+#define kr_sendctrl KREG_IDX(SendCtrl)
+#define kr_partitionkey KREG_IDX(RcvPartitionKey)
+#define kr_hwdiagctrl KREG_IDX(HwDiagCtrl)
+#define kr_ibcstatus KREG_IDX(IBCStatus)
+#define kr_ibcctrl KREG_IDX(IBCCtrl)
+#define kr_sendbuffererror KREG_IDX(SendBufErr0)
+#define kr_rcvbthqp KREG_IDX(RcvBTHQP)
+#define kr_counterregbase KREG_IDX(CntrRegBase)
+#define kr_palign KREG_IDX(PageAlign)
+#define kr_rcvegrbase KREG_IDX(RcvEgrBase)
+#define kr_rcvegrcnt KREG_IDX(RcvEgrCnt)
+#define kr_rcvhdrcnt KREG_IDX(RcvHdrCnt)
+#define kr_rcvhdrentsize KREG_IDX(RcvHdrEntSize)
+#define kr_rcvhdrsize KREG_IDX(RcvHdrSize)
+#define kr_rcvtidbase KREG_IDX(RcvTIDBase)
+#define kr_rcvtidcnt KREG_IDX(RcvTIDCnt)
+#define kr_scratch KREG_IDX(Scratch)
+#define kr_sendpioavailaddr KREG_IDX(SendPIOAvailAddr)
+#define kr_sendpiobufbase KREG_IDX(SendPIOBufBase)
+#define kr_sendpiobufcnt KREG_IDX(SendPIOBufCnt)
+#define kr_sendpiosize KREG_IDX(SendPIOSize)
+#define kr_sendregbase KREG_IDX(SendRegBase)
+#define kr_userregbase KREG_IDX(UserRegBase)
+#define kr_control KREG_IDX(Control)
+#define kr_intclear KREG_IDX(IntClear)
+#define kr_intmask KREG_IDX(IntMask)
+#define kr_intstatus KREG_IDX(IntStatus)
+#define kr_errclear KREG_IDX(ErrClear)
+#define kr_errmask KREG_IDX(ErrMask)
+#define kr_errstatus KREG_IDX(ErrStatus)
+#define kr_hwerrclear KREG_IDX(HwErrClear)
+#define kr_hwerrmask KREG_IDX(HwErrMask)
+#define kr_hwerrstatus KREG_IDX(HwErrStatus)
+#define kr_revision KREG_IDX(Revision)
+#define kr_portcnt KREG_IDX(PortCnt)
+#define kr_serdes_cfg0 KREG_IDX(SerdesCfg0)
+#define kr_serdes_cfg1 (kr_serdes_cfg0 + 1)
+#define kr_serdes_stat KREG_IDX(SerdesStat)
+#define kr_xgxs_cfg KREG_IDX(XGXSCfg)
+
+/* These must only be written via qib_write_kreg_ctxt() */
+#define kr_rcvhdraddr KREG_IDX(RcvHdrAddr0)
+#define kr_rcvhdrtailaddr KREG_IDX(RcvHdrTailAddr0)
+
+#define CREG_IDX(regname) ((QIB_6120_##regname##_OFFS - \
+ QIB_6120_LBIntCnt_OFFS) / sizeof(u64))
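+/*
+ * Counter registers are indexed relative to LBIntCnt, the first counter
+ * in the block, so cr_lbint == CREG_IDX(LBIntCnt) == 0; they are read
+ * through dd->cspec->cregbase via read_6120_creg()/read_6120_creg32()
+ * below.
+ */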
+
+#define cr_badformat CREG_IDX(RxBadFormatCnt)
+#define cr_erricrc CREG_IDX(RxICRCErrCnt)
+#define cr_errlink CREG_IDX(RxLinkProblemCnt)
+#define cr_errlpcrc CREG_IDX(RxLPCRCErrCnt)
+#define cr_errpkey CREG_IDX(RxPKeyMismatchCnt)
+#define cr_rcvflowctrl_err CREG_IDX(RxFlowCtrlErrCnt)
+#define cr_err_rlen CREG_IDX(RxLenErrCnt)
+#define cr_errslen CREG_IDX(TxLenErrCnt)
+#define cr_errtidfull CREG_IDX(RxTIDFullErrCnt)
+#define cr_errtidvalid CREG_IDX(RxTIDValidErrCnt)
+#define cr_errvcrc CREG_IDX(RxVCRCErrCnt)
+#define cr_ibstatuschange CREG_IDX(IBStatusChangeCnt)
+#define cr_lbint CREG_IDX(LBIntCnt)
+#define cr_invalidrlen CREG_IDX(RxMaxMinLenErrCnt)
+#define cr_invalidslen CREG_IDX(TxMaxMinLenErrCnt)
+#define cr_lbflowstall CREG_IDX(LBFlowStallCnt)
+#define cr_pktrcv CREG_IDX(RxDataPktCnt)
+#define cr_pktrcvflowctrl CREG_IDX(RxFlowPktCnt)
+#define cr_pktsend CREG_IDX(TxDataPktCnt)
+#define cr_pktsendflow CREG_IDX(TxFlowPktCnt)
+#define cr_portovfl CREG_IDX(RxP0HdrEgrOvflCnt)
+#define cr_rcvebp CREG_IDX(RxEBPCnt)
+#define cr_rcvovfl CREG_IDX(RxBufOvflCnt)
+#define cr_senddropped CREG_IDX(TxDroppedPktCnt)
+#define cr_sendstall CREG_IDX(TxFlowStallCnt)
+#define cr_sendunderrun CREG_IDX(TxUnderrunCnt)
+#define cr_wordrcv CREG_IDX(RxDwordCnt)
+#define cr_wordsend CREG_IDX(TxDwordCnt)
+#define cr_txunsupvl CREG_IDX(TxUnsupVLErrCnt)
+#define cr_rxdroppkt CREG_IDX(RxDroppedPktCnt)
+#define cr_iblinkerrrecov CREG_IDX(IBLinkErrRecoveryCnt)
+#define cr_iblinkdown CREG_IDX(IBLinkDownedCnt)
+#define cr_ibsymbolerr CREG_IDX(IBSymbolErrCnt)
+
+#define SYM_RMASK(regname, fldname) ((u64) \
+ QIB_6120_##regname##_##fldname##_RMASK)
+#define SYM_MASK(regname, fldname) ((u64) \
+ QIB_6120_##regname##_##fldname##_RMASK << \
+ QIB_6120_##regname##_##fldname##_LSB)
+#define SYM_LSB(regname, fldname) (QIB_6120_##regname##_##fldname##_LSB)
+
+#define SYM_FIELD(value, regname, fldname) ((u64) \
+ (((value) >> SYM_LSB(regname, fldname)) & \
+ SYM_RMASK(regname, fldname)))
+#define ERR_MASK(fldname) SYM_MASK(ErrMask, fldname##Mask)
+#define HWE_MASK(fldname) SYM_MASK(HwErrMask, fldname##Mask)
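+/*
+ * SYM_RMASK() is a field's right-justified mask, SYM_MASK() shifts it
+ * into place in the register, and SYM_FIELD() extracts a field from a
+ * register value; e.g. ERR_MASK(HardwareErr) is simply
+ * SYM_MASK(ErrMask, HardwareErrMask), the in-place mask for that bit.
+ */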
+
+/* link training states, from IBC */
+#define IB_6120_LT_STATE_DISABLED 0x00
+#define IB_6120_LT_STATE_LINKUP 0x01
+#define IB_6120_LT_STATE_POLLACTIVE 0x02
+#define IB_6120_LT_STATE_POLLQUIET 0x03
+#define IB_6120_LT_STATE_SLEEPDELAY 0x04
+#define IB_6120_LT_STATE_SLEEPQUIET 0x05
+#define IB_6120_LT_STATE_CFGDEBOUNCE 0x08
+#define IB_6120_LT_STATE_CFGRCVFCFG 0x09
+#define IB_6120_LT_STATE_CFGWAITRMT 0x0a
+#define IB_6120_LT_STATE_CFGIDLE 0x0b
+#define IB_6120_LT_STATE_RECOVERRETRAIN 0x0c
+#define IB_6120_LT_STATE_RECOVERWAITRMT 0x0e
+#define IB_6120_LT_STATE_RECOVERIDLE 0x0f
+
+/* link state machine states from IBC */
+#define IB_6120_L_STATE_DOWN 0x0
+#define IB_6120_L_STATE_INIT 0x1
+#define IB_6120_L_STATE_ARM 0x2
+#define IB_6120_L_STATE_ACTIVE 0x3
+#define IB_6120_L_STATE_ACT_DEFER 0x4
+
+static const u8 qib_6120_physportstate[0x20] = {
+ [IB_6120_LT_STATE_DISABLED] = IB_PHYSPORTSTATE_DISABLED,
+ [IB_6120_LT_STATE_LINKUP] = IB_PHYSPORTSTATE_LINKUP,
+ [IB_6120_LT_STATE_POLLACTIVE] = IB_PHYSPORTSTATE_POLL,
+ [IB_6120_LT_STATE_POLLQUIET] = IB_PHYSPORTSTATE_POLL,
+ [IB_6120_LT_STATE_SLEEPDELAY] = IB_PHYSPORTSTATE_SLEEP,
+ [IB_6120_LT_STATE_SLEEPQUIET] = IB_PHYSPORTSTATE_SLEEP,
+ [IB_6120_LT_STATE_CFGDEBOUNCE] =
+ IB_PHYSPORTSTATE_CFG_TRAIN,
+ [IB_6120_LT_STATE_CFGRCVFCFG] =
+ IB_PHYSPORTSTATE_CFG_TRAIN,
+ [IB_6120_LT_STATE_CFGWAITRMT] =
+ IB_PHYSPORTSTATE_CFG_TRAIN,
+ [IB_6120_LT_STATE_CFGIDLE] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [IB_6120_LT_STATE_RECOVERRETRAIN] =
+ IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+ [IB_6120_LT_STATE_RECOVERWAITRMT] =
+ IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+ [IB_6120_LT_STATE_RECOVERIDLE] =
+ IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+ [0x10] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x11] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x12] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x13] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x14] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x15] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x16] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x17] = IB_PHYSPORTSTATE_CFG_TRAIN
+};
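+/*
+ * The table above is indexed by the 5-bit IBC link training state and
+ * is used by qib_6120_phys_portstate() (declared above) to translate
+ * the IBCStatus training state into generic IB_PHYSPORTSTATE_* values;
+ * entries not listed are zero-initialized.
+ */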
+
+
+struct qib_chip_specific {
+ u64 __iomem *cregbase;
+ u64 *cntrs;
+ u64 *portcntrs;
+ void *dummy_hdrq; /* used after ctxt close */
+ dma_addr_t dummy_hdrq_phys;
+ spinlock_t kernel_tid_lock; /* no back to back kernel TID writes */
+ spinlock_t user_tid_lock; /* no back to back user TID writes */
+ spinlock_t rcvmod_lock; /* protect rcvctrl shadow changes */
+ spinlock_t gpio_lock; /* RMW of shadows/regs for ExtCtrl and GPIO */
+ u64 hwerrmask;
+ u64 errormask;
+ u64 gpio_out; /* shadow of kr_gpio_out, for rmw ops */
+ u64 gpio_mask; /* shadow the gpio mask register */
+ u64 extctrl; /* shadow the gpio output enable, etc... */
+ /*
+ * these 5 fields are used to establish deltas for IB symbol
+ * errors and linkrecovery errors. They can be reported on
+ * some chips during link negotiation prior to INIT, and with
+ * DDR when faking DDR negotiations with non-IBTA switches.
+ * The chip counters are adjusted at driver unload if there is
+ * a non-zero delta.
+ */
+ u64 ibdeltainprog;
+ u64 ibsymdelta;
+ u64 ibsymsnap;
+ u64 iblnkerrdelta;
+ u64 iblnkerrsnap;
+ u64 ibcctrl; /* shadow for kr_ibcctrl */
+ u32 lastlinkrecov; /* link recovery issue */
+ int irq;
+ u32 cntrnamelen;
+ u32 portcntrnamelen;
+ u32 ncntrs;
+ u32 nportcntrs;
+ /* used with gpio interrupts to implement IB counters */
+ u32 rxfc_unsupvl_errs;
+ u32 overrun_thresh_errs;
+ /*
+ * these count only cases where _successive_ LocalLinkIntegrity
+ * errors were seen in the receive headers of IB standard packets
+ */
+ u32 lli_errs;
+ u32 lli_counter;
+ u64 lli_thresh;
+ u64 sword; /* total dwords sent (sample result) */
+ u64 rword; /* total dwords received (sample result) */
+ u64 spkts; /* total packets sent (sample result) */
+ u64 rpkts; /* total packets received (sample result) */
+ u64 xmit_wait; /* # of ticks no data sent (sample result) */
+ struct timer_list pma_timer;
+ char emsgbuf[128];
+ char bitsmsgbuf[64];
+ u8 pma_sample_status;
+};
+
+/* ibcctrl bits */
+#define QLOGIC_IB_IBCC_LINKINITCMD_DISABLE 1
+/* cycle through TS1/TS2 till OK */
+#define QLOGIC_IB_IBCC_LINKINITCMD_POLL 2
+/* wait for TS1, then go on */
+#define QLOGIC_IB_IBCC_LINKINITCMD_SLEEP 3
+#define QLOGIC_IB_IBCC_LINKINITCMD_SHIFT 16
+
+#define QLOGIC_IB_IBCC_LINKCMD_DOWN 1 /* move to 0x11 */
+#define QLOGIC_IB_IBCC_LINKCMD_ARMED 2 /* move to 0x21 */
+#define QLOGIC_IB_IBCC_LINKCMD_ACTIVE 3 /* move to 0x31 */
+#define QLOGIC_IB_IBCC_LINKCMD_SHIFT 18
+
+/*
+ * We could have a single register get/put routine, that takes a group type,
+ * but this is somewhat clearer and cleaner. It also gives us some error
+ * checking. 64 bit register reads should always work, but are inefficient
+ * on opteron (the northbridge always generates 2 separate HT 32 bit reads),
+ * so we use kreg32 wherever possible. User register and counter register
+ * reads are always 32 bit reads, so only one form of those routines.
+ */
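+
+/*
+ * In this file, qib_read_ureg32()/qib_write_ureg() handle the
+ * per-context user registers, qib_read_kreg32()/qib_read_kreg64()/
+ * qib_write_kreg() the kernel registers, and read_6120_creg()/
+ * read_6120_creg32()/write_6120_creg() the counter registers.
+ */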
+
+/**
+ * qib_read_ureg32 - read 32-bit virtualized per-context register
+ * @dd: device
+ * @regno: register number
+ * @ctxt: context number
+ *
+ * Return the contents of a register that is virtualized to be per context.
+ * Returns 0 if the chip is not present or not mapped (not
+ * distinguishable from valid contents at runtime; we may add a
+ * separate error variable at some point).
+ */
+static inline u32 qib_read_ureg32(const struct qib_devdata *dd,
+ enum qib_ureg regno, int ctxt)
+{
+ if (!dd->kregbase || !(dd->flags & QIB_PRESENT))
+ return 0;
+
+ if (dd->userbase)
+ return readl(regno + (u64 __iomem *)
+ ((char __iomem *)dd->userbase +
+ dd->ureg_align * ctxt));
+ else
+ return readl(regno + (u64 __iomem *)
+ (dd->uregbase +
+ (char __iomem *)dd->kregbase +
+ dd->ureg_align * ctxt));
+}
+
+/**
+ * qib_write_ureg - write 64-bit virtualized per-context register
+ * @dd: device
+ * @regno: register number
+ * @value: value
+ * @ctxt: context
+ *
+ * Write the contents of a register that is virtualized to be per context.
+ */
+static inline void qib_write_ureg(const struct qib_devdata *dd,
+ enum qib_ureg regno, u64 value, int ctxt)
+{
+ u64 __iomem *ubase;
+
+ if (dd->userbase)
+ ubase = (u64 __iomem *)
+ ((char __iomem *) dd->userbase +
+ dd->ureg_align * ctxt);
+ else
+ ubase = (u64 __iomem *)
+ (dd->uregbase +
+ (char __iomem *) dd->kregbase +
+ dd->ureg_align * ctxt);
+
+ if (dd->kregbase && (dd->flags & QIB_PRESENT))
+ writeq(value, &ubase[regno]);
+}
+
+static inline u32 qib_read_kreg32(const struct qib_devdata *dd,
+ const u16 regno)
+{
+ if (!dd->kregbase || !(dd->flags & QIB_PRESENT))
+ return -1;
+ return readl((u32 __iomem *)&dd->kregbase[regno]);
+}
+
+static inline u64 qib_read_kreg64(const struct qib_devdata *dd,
+ const u16 regno)
+{
+ if (!dd->kregbase || !(dd->flags & QIB_PRESENT))
+ return -1;
+
+ return readq(&dd->kregbase[regno]);
+}
+
+static inline void qib_write_kreg(const struct qib_devdata *dd,
+ const u16 regno, u64 value)
+{
+ if (dd->kregbase && (dd->flags & QIB_PRESENT))
+ writeq(value, &dd->kregbase[regno]);
+}
+
+/**
+ * qib_write_kreg_ctxt - write a device's per-ctxt 64-bit kernel register
+ * @dd: the qlogic_ib device
+ * @regno: the register number to write
+ * @ctxt: the context containing the register
+ * @value: the value to write
+ */
+static inline void qib_write_kreg_ctxt(const struct qib_devdata *dd,
+ const u16 regno, unsigned ctxt,
+ u64 value)
+{
+ qib_write_kreg(dd, regno + ctxt, value);
+}
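+/*
+ * The per-context registers (e.g. RcvHdrAddr0, RcvHdrTailAddr0 above)
+ * are laid out as consecutive 64-bit registers, one per context, which
+ * is why adding the context number to the base register index suffices.
+ */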
+
+static inline void write_6120_creg(const struct qib_devdata *dd,
+ u16 regno, u64 value)
+{
+ if (dd->cspec->cregbase && (dd->flags & QIB_PRESENT))
+ writeq(value, &dd->cspec->cregbase[regno]);
+}
+
+static inline u64 read_6120_creg(const struct qib_devdata *dd, u16 regno)
+{
+ if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT))
+ return 0;
+ return readq(&dd->cspec->cregbase[regno]);
+}
+
+static inline u32 read_6120_creg32(const struct qib_devdata *dd, u16 regno)
+{
+ if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT))
+ return 0;
+ return readl(&dd->cspec->cregbase[regno]);
+}
+
+/* kr_control bits */
+#define QLOGIC_IB_C_RESET 1U
+
+/* kr_intstatus, kr_intclear, kr_intmask bits */
+#define QLOGIC_IB_I_RCVURG_MASK ((1U << 5) - 1)
+#define QLOGIC_IB_I_RCVURG_SHIFT 0
+#define QLOGIC_IB_I_RCVAVAIL_MASK ((1U << 5) - 1)
+#define QLOGIC_IB_I_RCVAVAIL_SHIFT 12
+
+#define QLOGIC_IB_C_FREEZEMODE 0x00000002
+#define QLOGIC_IB_C_LINKENABLE 0x00000004
+#define QLOGIC_IB_I_ERROR 0x0000000080000000ULL
+#define QLOGIC_IB_I_SPIOSENT 0x0000000040000000ULL
+#define QLOGIC_IB_I_SPIOBUFAVAIL 0x0000000020000000ULL
+#define QLOGIC_IB_I_GPIO 0x0000000010000000ULL
+#define QLOGIC_IB_I_BITSEXTANT \
+ ((QLOGIC_IB_I_RCVURG_MASK << QLOGIC_IB_I_RCVURG_SHIFT) | \
+ (QLOGIC_IB_I_RCVAVAIL_MASK << \
+ QLOGIC_IB_I_RCVAVAIL_SHIFT) | \
+ QLOGIC_IB_I_ERROR | QLOGIC_IB_I_SPIOSENT | \
+ QLOGIC_IB_I_SPIOBUFAVAIL | QLOGIC_IB_I_GPIO)
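+/*
+ * IntStatus layout, per the masks above: five per-context RcvUrg bits
+ * starting at bit 0, five per-context RcvAvail bits starting at bit 12,
+ * and single bits for GPIO, PIO-buffer-available, PIO-sent and the
+ * error summary in bits 28..31.
+ */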
+
+/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */
+#define QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK 0x000000000000003fULL
+#define QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT 0
+#define QLOGIC_IB_HWE_PCIEPOISONEDTLP 0x0000000010000000ULL
+#define QLOGIC_IB_HWE_PCIECPLTIMEOUT 0x0000000020000000ULL
+#define QLOGIC_IB_HWE_PCIEBUSPARITYXTLH 0x0000000040000000ULL
+#define QLOGIC_IB_HWE_PCIEBUSPARITYXADM 0x0000000080000000ULL
+#define QLOGIC_IB_HWE_PCIEBUSPARITYRADM 0x0000000100000000ULL
+#define QLOGIC_IB_HWE_COREPLL_FBSLIP 0x0080000000000000ULL
+#define QLOGIC_IB_HWE_COREPLL_RFSLIP 0x0100000000000000ULL
+#define QLOGIC_IB_HWE_PCIE1PLLFAILED 0x0400000000000000ULL
+#define QLOGIC_IB_HWE_PCIE0PLLFAILED 0x0800000000000000ULL
+#define QLOGIC_IB_HWE_SERDESPLLFAILED 0x1000000000000000ULL
+
+
+/* kr_extstatus bits */
+#define QLOGIC_IB_EXTS_FREQSEL 0x2
+#define QLOGIC_IB_EXTS_SERDESSEL 0x4
+#define QLOGIC_IB_EXTS_MEMBIST_ENDTEST 0x0000000000004000
+#define QLOGIC_IB_EXTS_MEMBIST_FOUND 0x0000000000008000
+
+/* kr_xgxsconfig bits */
+#define QLOGIC_IB_XGXS_RESET 0x5ULL
+
+#define _QIB_GPIO_SDA_NUM 1
+#define _QIB_GPIO_SCL_NUM 0
+
+/* Bits in GPIO for the added IB link interrupts */
+#define GPIO_RXUVL_BIT 3
+#define GPIO_OVRUN_BIT 4
+#define GPIO_LLI_BIT 5
+#define GPIO_ERRINTR_MASK 0x38
+
+
+#define QLOGIC_IB_RT_BUFSIZE_MASK 0xe0000000ULL
+#define QLOGIC_IB_RT_BUFSIZE_SHIFTVAL(tid) \
+ ((((tid) & QLOGIC_IB_RT_BUFSIZE_MASK) >> 29) + 11 - 1)
+#define QLOGIC_IB_RT_BUFSIZE(tid) (1 << QLOGIC_IB_RT_BUFSIZE_SHIFTVAL(tid))
+#define QLOGIC_IB_RT_IS_VALID(tid) \
+ (((tid) & QLOGIC_IB_RT_BUFSIZE_MASK) && \
+ ((((tid) & QLOGIC_IB_RT_BUFSIZE_MASK) != QLOGIC_IB_RT_BUFSIZE_MASK)))
+#define QLOGIC_IB_RT_ADDR_MASK 0x1FFFFFFFULL /* 29 bits valid */
+#define QLOGIC_IB_RT_ADDR_SHIFT 10
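+/*
+ * In an expected-TID entry, the top three bits (QLOGIC_IB_RT_BUFSIZE_MASK)
+ * encode the receive buffer size as a power of two, 1 << (encoding + 10),
+ * so an encoding of 1 means a 2KB buffer; encodings of 0 and 7 are
+ * rejected by QLOGIC_IB_RT_IS_VALID(). The buffer address occupies the
+ * low 29 bits (QLOGIC_IB_RT_ADDR_MASK).
+ */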
+
+#define QLOGIC_IB_R_INTRAVAIL_SHIFT 16
+#define QLOGIC_IB_R_TAILUPD_SHIFT 31
+#define IBA6120_R_PKEY_DIS_SHIFT 30
+
+#define PBC_6120_VL15_SEND_CTRL (1ULL << 31) /* pbc; VL15; link_buf only */
+
+#define IBCBUSFRSPCPARITYERR HWE_MASK(IBCBusFromSPCParityErr)
+#define IBCBUSTOSPCPARITYERR HWE_MASK(IBCBusToSPCParityErr)
+
+#define SYM_MASK_BIT(regname, fldname, bit) ((u64) \
+ ((1ULL << (SYM_LSB(regname, fldname) + (bit)))))
+
+#define TXEMEMPARITYERR_PIOBUF \
+ SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 0)
+#define TXEMEMPARITYERR_PIOPBC \
+ SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 1)
+#define TXEMEMPARITYERR_PIOLAUNCHFIFO \
+ SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 2)
+
+#define RXEMEMPARITYERR_RCVBUF \
+ SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 0)
+#define RXEMEMPARITYERR_LOOKUPQ \
+ SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 1)
+#define RXEMEMPARITYERR_EXPTID \
+ SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 2)
+#define RXEMEMPARITYERR_EAGERTID \
+ SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 3)
+#define RXEMEMPARITYERR_FLAGBUF \
+ SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 4)
+#define RXEMEMPARITYERR_DATAINFO \
+ SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 5)
+#define RXEMEMPARITYERR_HDRINFO \
+ SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 6)
+
+/* 6120 specific hardware errors... */
+static const struct qib_hwerror_msgs qib_6120_hwerror_msgs[] = {
+ /* generic hardware errors */
+ QLOGIC_IB_HWE_MSG(IBCBUSFRSPCPARITYERR, "QIB2IB Parity"),
+ QLOGIC_IB_HWE_MSG(IBCBUSTOSPCPARITYERR, "IB2QIB Parity"),
+
+ QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOBUF,
+ "TXE PIOBUF Memory Parity"),
+ QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOPBC,
+ "TXE PIOPBC Memory Parity"),
+ QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOLAUNCHFIFO,
+ "TXE PIOLAUNCHFIFO Memory Parity"),
+
+ QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_RCVBUF,
+ "RXE RCVBUF Memory Parity"),
+ QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_LOOKUPQ,
+ "RXE LOOKUPQ Memory Parity"),
+ QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_EAGERTID,
+ "RXE EAGERTID Memory Parity"),
+ QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_EXPTID,
+ "RXE EXPTID Memory Parity"),
+ QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_FLAGBUF,
+ "RXE FLAGBUF Memory Parity"),
+ QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_DATAINFO,
+ "RXE DATAINFO Memory Parity"),
+ QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_HDRINFO,
+ "RXE HDRINFO Memory Parity"),
+
+ /* chip-specific hardware errors */
+ QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEPOISONEDTLP,
+ "PCIe Poisoned TLP"),
+ QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIECPLTIMEOUT,
+ "PCIe completion timeout"),
+ /*
+ * In practice, it's unlikely that we'll see PCIe PLL, bus parity,
+ * or memory parity failures, because most likely we
+ * won't be able to talk to the core of the chip. Nonetheless, we
+ * might see them, if they are in parts of the PCIe core that aren't
+ * essential.
+ */
+ QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE1PLLFAILED,
+ "PCIePLL1"),
+ QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE0PLLFAILED,
+ "PCIePLL0"),
+ QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYXTLH,
+ "PCIe XTLH core parity"),
+ QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYXADM,
+ "PCIe ADM TX core parity"),
+ QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYRADM,
+ "PCIe ADM RX core parity"),
+ QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_SERDESPLLFAILED,
+ "SerDes PLL"),
+};
+
+#define TXE_PIO_PARITY (TXEMEMPARITYERR_PIOBUF | TXEMEMPARITYERR_PIOPBC)
+#define _QIB_PLL_FAIL (QLOGIC_IB_HWE_COREPLL_FBSLIP | \
+ QLOGIC_IB_HWE_COREPLL_RFSLIP)
+
+/* masks for sanity-checking interrupt and error bits */
+#define IB_HWE_BITSEXTANT \
+ (HWE_MASK(RXEMemParityErr) | \
+ HWE_MASK(TXEMemParityErr) | \
+ (QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK << \
+ QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT) | \
+ QLOGIC_IB_HWE_PCIE1PLLFAILED | \
+ QLOGIC_IB_HWE_PCIE0PLLFAILED | \
+ QLOGIC_IB_HWE_PCIEPOISONEDTLP | \
+ QLOGIC_IB_HWE_PCIECPLTIMEOUT | \
+ QLOGIC_IB_HWE_PCIEBUSPARITYXTLH | \
+ QLOGIC_IB_HWE_PCIEBUSPARITYXADM | \
+ QLOGIC_IB_HWE_PCIEBUSPARITYRADM | \
+ HWE_MASK(PowerOnBISTFailed) | \
+ QLOGIC_IB_HWE_COREPLL_FBSLIP | \
+ QLOGIC_IB_HWE_COREPLL_RFSLIP | \
+ QLOGIC_IB_HWE_SERDESPLLFAILED | \
+ HWE_MASK(IBCBusToSPCParityErr) | \
+ HWE_MASK(IBCBusFromSPCParityErr))
+
+#define IB_E_BITSEXTANT \
+ (ERR_MASK(RcvFormatErr) | ERR_MASK(RcvVCRCErr) | \
+ ERR_MASK(RcvICRCErr) | ERR_MASK(RcvMinPktLenErr) | \
+ ERR_MASK(RcvMaxPktLenErr) | ERR_MASK(RcvLongPktLenErr) | \
+ ERR_MASK(RcvShortPktLenErr) | ERR_MASK(RcvUnexpectedCharErr) | \
+ ERR_MASK(RcvUnsupportedVLErr) | ERR_MASK(RcvEBPErr) | \
+ ERR_MASK(RcvIBFlowErr) | ERR_MASK(RcvBadVersionErr) | \
+ ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr) | \
+ ERR_MASK(RcvBadTidErr) | ERR_MASK(RcvHdrLenErr) | \
+ ERR_MASK(RcvHdrErr) | ERR_MASK(RcvIBLostLinkErr) | \
+ ERR_MASK(SendMinPktLenErr) | ERR_MASK(SendMaxPktLenErr) | \
+ ERR_MASK(SendUnderRunErr) | ERR_MASK(SendPktLenErr) | \
+ ERR_MASK(SendDroppedSmpPktErr) | \
+ ERR_MASK(SendDroppedDataPktErr) | \
+ ERR_MASK(SendPioArmLaunchErr) | \
+ ERR_MASK(SendUnexpectedPktNumErr) | \
+ ERR_MASK(SendUnsupportedVLErr) | ERR_MASK(IBStatusChanged) | \
+ ERR_MASK(InvalidAddrErr) | ERR_MASK(ResetNegated) | \
+ ERR_MASK(HardwareErr))
+
+#define QLOGIC_IB_E_PKTERRS ( \
+ ERR_MASK(SendPktLenErr) | \
+ ERR_MASK(SendDroppedDataPktErr) | \
+ ERR_MASK(RcvVCRCErr) | \
+ ERR_MASK(RcvICRCErr) | \
+ ERR_MASK(RcvShortPktLenErr) | \
+ ERR_MASK(RcvEBPErr))
+
+/* These are all rcv-related errors which we want to count for stats */
+#define E_SUM_PKTERRS \
+ (ERR_MASK(RcvHdrLenErr) | ERR_MASK(RcvBadTidErr) | \
+ ERR_MASK(RcvBadVersionErr) | ERR_MASK(RcvHdrErr) | \
+ ERR_MASK(RcvLongPktLenErr) | ERR_MASK(RcvShortPktLenErr) | \
+ ERR_MASK(RcvMaxPktLenErr) | ERR_MASK(RcvMinPktLenErr) | \
+ ERR_MASK(RcvFormatErr) | ERR_MASK(RcvUnsupportedVLErr) | \
+ ERR_MASK(RcvUnexpectedCharErr) | ERR_MASK(RcvEBPErr))
+
+/* These are all send-related errors which we want to count for stats */
+#define E_SUM_ERRS \
+ (ERR_MASK(SendPioArmLaunchErr) | \
+ ERR_MASK(SendUnexpectedPktNumErr) | \
+ ERR_MASK(SendDroppedDataPktErr) | \
+ ERR_MASK(SendDroppedSmpPktErr) | \
+ ERR_MASK(SendMaxPktLenErr) | ERR_MASK(SendUnsupportedVLErr) | \
+ ERR_MASK(SendMinPktLenErr) | ERR_MASK(SendPktLenErr) | \
+ ERR_MASK(InvalidAddrErr))
+
+/*
+ * This is similar to E_SUM_ERRS, but we can't ignore armlaunch errors,
+ * nor errors unrelated to freeze and cancelling buffers. We can't
+ * ignore armlaunch because more can occur while we are still cleaning
+ * up, and those need to be cancelled as they happen.
+ */
+#define E_SPKT_ERRS_IGNORE \
+ (ERR_MASK(SendDroppedDataPktErr) | \
+ ERR_MASK(SendDroppedSmpPktErr) | \
+ ERR_MASK(SendMaxPktLenErr) | ERR_MASK(SendMinPktLenErr) | \
+ ERR_MASK(SendPktLenErr))
+
+/*
+ * These are errors that can occur when the link changes state while
+ * a packet is being sent or received. This doesn't cover things
+ * like EBP or VCRC that can result from the link changing state while
+ * a packet is in flight, causing us to receive a "known bad" packet.
+ */
+#define E_SUM_LINK_PKTERRS \
+ (ERR_MASK(SendDroppedDataPktErr) | \
+ ERR_MASK(SendDroppedSmpPktErr) | \
+ ERR_MASK(SendMinPktLenErr) | ERR_MASK(SendPktLenErr) | \
+ ERR_MASK(RcvShortPktLenErr) | ERR_MASK(RcvMinPktLenErr) | \
+ ERR_MASK(RcvUnexpectedCharErr))
+
+static void qib_6120_put_tid_2(struct qib_devdata *, u64 __iomem *,
+ u32, unsigned long);
+
+/*
+ * On platforms using this chip, and not having ordered WC stores, we
+ * can get TXE parity errors due to speculative reads to the PIO buffers,
+ * and, due to a chip issue, this can result in (many) false parity error
+ * reports. So it's a debug print on those, and an info print on systems
+ * where the speculative reads don't occur.
+ */
+static void qib_6120_txe_recover(struct qib_devdata *dd)
+{
+ if (!qib_unordered_wc())
+ qib_devinfo(dd->pcidev,
+ "Recovering from TXE PIO parity error\n");
+}
+
+/* enable/disable chip from delivering interrupts */
+static void qib_6120_set_intr_state(struct qib_devdata *dd, u32 enable)
+{
+ if (enable) {
+ if (dd->flags & QIB_BADINTR)
+ return;
+ qib_write_kreg(dd, kr_intmask, ~0ULL);
+ /* force re-interrupt of any pending interrupts. */
+ qib_write_kreg(dd, kr_intclear, 0ULL);
+ } else
+ qib_write_kreg(dd, kr_intmask, 0ULL);
+}
+
+/*
+ * Try to cleanup as much as possible for anything that might have gone
+ * wrong while in freeze mode, such as pio buffers being written by user
+ * processes (causing armlaunch), send errors due to going into freeze mode,
+ * etc., and try to avoid causing extra interrupts while doing so.
+ * Forcibly update the in-memory pioavail register copies after cleanup
+ * because the chip won't do it while in freeze mode (the register values
+ * themselves are kept correct).
+ * Make sure that we don't lose any important interrupts by using the chip
+ * feature that says that writing 0 to a bit in *clear that is set in
+ * *status will cause an interrupt to be generated again (if allowed by
+ * the *mask value).
+ * This is in chip-specific code because of all of the register accesses,
+ * even though the details are similar on most chips
+ */
+static void qib_6120_clear_freeze(struct qib_devdata *dd)
+{
+ /* disable error interrupts, to avoid confusion */
+ qib_write_kreg(dd, kr_errmask, 0ULL);
+
+ /* also disable interrupts; errormask is sometimes overwritten */
+ qib_6120_set_intr_state(dd, 0);
+
+ qib_cancel_sends(dd->pport);
+
+ /* clear the freeze, and be sure chip saw it */
+ qib_write_kreg(dd, kr_control, dd->control);
+ qib_read_kreg32(dd, kr_scratch);
+
+ /* force in-memory update now we are out of freeze */
+ qib_force_pio_avail_update(dd);
+
+ /*
+ * force new interrupt if any hwerr, error or interrupt bits are
+ * still set, and clear "safe" send packet errors related to freeze
+ * and cancelling sends. Re-enable error interrupts before possible
+ * force of re-interrupt on pending interrupts.
+ */
+ qib_write_kreg(dd, kr_hwerrclear, 0ULL);
+ qib_write_kreg(dd, kr_errclear, E_SPKT_ERRS_IGNORE);
+ qib_write_kreg(dd, kr_errmask, dd->cspec->errormask);
+ qib_6120_set_intr_state(dd, 1);
+}
+
+/**
+ * qib_handle_6120_hwerrors - display hardware errors.
+ * @dd: the qlogic_ib device
+ * @msg: the output buffer
+ * @msgl: the size of the output buffer
+ *
+ * Most hardware errors are catastrophic, but for right now we'll
+ * print them and continue. We reuse the same message buffer as
+ * handle_6120_errors() to avoid excessive stack usage.
+ */
+static void qib_handle_6120_hwerrors(struct qib_devdata *dd, char *msg,
+ size_t msgl)
+{
+ u64 hwerrs;
+ u32 bits, ctrl;
+ int isfatal = 0;
+ char *bitsmsg;
+ int log_idx;
+
+ hwerrs = qib_read_kreg64(dd, kr_hwerrstatus);
+ if (!hwerrs)
+ return;
+ if (hwerrs == ~0ULL) {
+ qib_dev_err(dd,
+ "Read of hardware error status failed (all bits set); ignoring\n");
+ return;
+ }
+ qib_stats.sps_hwerrs++;
+
+ /* Always clear the error status register, except MEMBISTFAIL,
+ * regardless of whether we continue or stop using the chip.
+ * We want that set so we know it failed, even across driver reload.
+ * We'll still ignore it in the hwerrmask. We do this partly for
+ * diagnostics, but also for support */
+ qib_write_kreg(dd, kr_hwerrclear,
+ hwerrs & ~HWE_MASK(PowerOnBISTFailed));
+
+ hwerrs &= dd->cspec->hwerrmask;
+
+ /* We log some errors to EEPROM, check if we have any of those. */
+ for (log_idx = 0; log_idx < QIB_EEP_LOG_CNT; ++log_idx)
+ if (hwerrs & dd->eep_st_masks[log_idx].hwerrs_to_log)
+ qib_inc_eeprom_err(dd, log_idx, 1);
+
+ /*
+ * Make sure we get this much out, unless told to be quiet,
+ * or it's occurred within the last 5 seconds.
+ */
+ if (hwerrs & ~(TXE_PIO_PARITY | RXEMEMPARITYERR_EAGERTID))
+ qib_devinfo(dd->pcidev,
+ "Hardware error: hwerr=0x%llx (cleared)\n",
+ (unsigned long long) hwerrs);
+
+ if (hwerrs & ~IB_HWE_BITSEXTANT)
+ qib_dev_err(dd,
+ "hwerror interrupt with unknown errors %llx set\n",
+ (unsigned long long)(hwerrs & ~IB_HWE_BITSEXTANT));
+
+ ctrl = qib_read_kreg32(dd, kr_control);
+ if ((ctrl & QLOGIC_IB_C_FREEZEMODE) && !dd->diag_client) {
+ /*
+ * Parity errors in send memory are recoverable,
+ * just cancel the send (if indicated in sendbuffererror),
+ * count the occurrence, unfreeze (if no other handled
+ * hardware error bits are set), and continue. They can
+ * occur if a processor speculative read is done to the PIO
+ * buffer while we are sending a packet, for example.
+ */
+ if (hwerrs & TXE_PIO_PARITY) {
+ qib_6120_txe_recover(dd);
+ hwerrs &= ~TXE_PIO_PARITY;
+ }
+
+ if (!hwerrs) {
+ static u32 freeze_cnt;
+
+ freeze_cnt++;
+ qib_6120_clear_freeze(dd);
+ } else
+ isfatal = 1;
+ }
+
+ *msg = '\0';
+
+ if (hwerrs & HWE_MASK(PowerOnBISTFailed)) {
+ isfatal = 1;
+ strlcat(msg,
+ "[Memory BIST test failed, InfiniPath hardware unusable]",
+ msgl);
+ /* ignore from now on, so disable until driver reloaded */
+ dd->cspec->hwerrmask &= ~HWE_MASK(PowerOnBISTFailed);
+ qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask);
+ }
+
+ qib_format_hwerrors(hwerrs, qib_6120_hwerror_msgs,
+ ARRAY_SIZE(qib_6120_hwerror_msgs), msg, msgl);
+
+ bitsmsg = dd->cspec->bitsmsgbuf;
+ if (hwerrs & (QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK <<
+ QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT)) {
+ bits = (u32) ((hwerrs >>
+ QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT) &
+ QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK);
+ snprintf(bitsmsg, sizeof(dd->cspec->bitsmsgbuf),
+ "[PCIe Mem Parity Errs %x] ", bits);
+ strlcat(msg, bitsmsg, msgl);
+ }
+
+ if (hwerrs & _QIB_PLL_FAIL) {
+ isfatal = 1;
+ snprintf(bitsmsg, sizeof(dd->cspec->bitsmsgbuf),
+ "[PLL failed (%llx), InfiniPath hardware unusable]",
+ (unsigned long long) hwerrs & _QIB_PLL_FAIL);
+ strlcat(msg, bitsmsg, msgl);
+ /* ignore from now on, so disable until driver reloaded */
+ dd->cspec->hwerrmask &= ~(hwerrs & _QIB_PLL_FAIL);
+ qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask);
+ }
+
+ if (hwerrs & QLOGIC_IB_HWE_SERDESPLLFAILED) {
+ /*
+ * If it occurs, it is left masked since the external
+ * interface is unused
+ */
+ dd->cspec->hwerrmask &= ~QLOGIC_IB_HWE_SERDESPLLFAILED;
+ qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask);
+ }
+
+ if (hwerrs)
+ /*
+ * if any set that we aren't ignoring; only
+ * make the complaint once, in case it's stuck
+ * or recurring, and we get here multiple
+ * times.
+ */
+ qib_dev_err(dd, "%s hardware error\n", msg);
+ else
+ *msg = 0; /* recovered from all of them */
+
+ if (isfatal && !dd->diag_client) {
+ qib_dev_err(dd,
+ "Fatal Hardware Error, no longer usable, SN %.16s\n",
+ dd->serial);
+ /*
+ * for /sys status file and user programs to print; if no
+ * trailing brace is copied, we'll know it was truncated.
+ */
+ if (dd->freezemsg)
+ snprintf(dd->freezemsg, dd->freezelen,
+ "{%s}", msg);
+ qib_disable_after_error(dd);
+ }
+}
+
+/*
+ * Decode the error status into strings, deciding whether to always
+ * print it or not depending on "normal packet errors" vs everything
+ * else. Return 1 if "real" errors, otherwise 0 if only packet
+ * errors, so caller can decide what to print with the string.
+ */
+static int qib_decode_6120_err(struct qib_devdata *dd, char *buf, size_t blen,
+ u64 err)
+{
+ int iserr = 1;
+
+ *buf = '\0';
+ if (err & QLOGIC_IB_E_PKTERRS) {
+ if (!(err & ~QLOGIC_IB_E_PKTERRS))
+ iserr = 0;
+ if ((err & ERR_MASK(RcvICRCErr)) &&
+ !(err&(ERR_MASK(RcvVCRCErr)|ERR_MASK(RcvEBPErr))))
+ strlcat(buf, "CRC ", blen);
+ if (!iserr)
+ goto done;
+ }
+ if (err & ERR_MASK(RcvHdrLenErr))
+ strlcat(buf, "rhdrlen ", blen);
+ if (err & ERR_MASK(RcvBadTidErr))
+ strlcat(buf, "rbadtid ", blen);
+ if (err & ERR_MASK(RcvBadVersionErr))
+ strlcat(buf, "rbadversion ", blen);
+ if (err & ERR_MASK(RcvHdrErr))
+ strlcat(buf, "rhdr ", blen);
+ if (err & ERR_MASK(RcvLongPktLenErr))
+ strlcat(buf, "rlongpktlen ", blen);
+ if (err & ERR_MASK(RcvMaxPktLenErr))
+ strlcat(buf, "rmaxpktlen ", blen);
+ if (err & ERR_MASK(RcvMinPktLenErr))
+ strlcat(buf, "rminpktlen ", blen);
+ if (err & ERR_MASK(SendMinPktLenErr))
+ strlcat(buf, "sminpktlen ", blen);
+ if (err & ERR_MASK(RcvFormatErr))
+ strlcat(buf, "rformaterr ", blen);
+ if (err & ERR_MASK(RcvUnsupportedVLErr))
+ strlcat(buf, "runsupvl ", blen);
+ if (err & ERR_MASK(RcvUnexpectedCharErr))
+ strlcat(buf, "runexpchar ", blen);
+ if (err & ERR_MASK(RcvIBFlowErr))
+ strlcat(buf, "ribflow ", blen);
+ if (err & ERR_MASK(SendUnderRunErr))
+ strlcat(buf, "sunderrun ", blen);
+ if (err & ERR_MASK(SendPioArmLaunchErr))
+ strlcat(buf, "spioarmlaunch ", blen);
+ if (err & ERR_MASK(SendUnexpectedPktNumErr))
+ strlcat(buf, "sunexperrpktnum ", blen);
+ if (err & ERR_MASK(SendDroppedSmpPktErr))
+ strlcat(buf, "sdroppedsmppkt ", blen);
+ if (err & ERR_MASK(SendMaxPktLenErr))
+ strlcat(buf, "smaxpktlen ", blen);
+ if (err & ERR_MASK(SendUnsupportedVLErr))
+ strlcat(buf, "sunsupVL ", blen);
+ if (err & ERR_MASK(InvalidAddrErr))
+ strlcat(buf, "invalidaddr ", blen);
+ if (err & ERR_MASK(RcvEgrFullErr))
+ strlcat(buf, "rcvegrfull ", blen);
+ if (err & ERR_MASK(RcvHdrFullErr))
+ strlcat(buf, "rcvhdrfull ", blen);
+ if (err & ERR_MASK(IBStatusChanged))
+ strlcat(buf, "ibcstatuschg ", blen);
+ if (err & ERR_MASK(RcvIBLostLinkErr))
+ strlcat(buf, "riblostlink ", blen);
+ if (err & ERR_MASK(HardwareErr))
+ strlcat(buf, "hardware ", blen);
+ if (err & ERR_MASK(ResetNegated))
+ strlcat(buf, "reset ", blen);
+done:
+ return iserr;
+}
+
+/*
+ * Called when we might have an error that is specific to a particular
+ * PIO buffer, and may need to cancel that buffer, so it can be re-used.
+ */
+static void qib_disarm_6120_senderrbufs(struct qib_pportdata *ppd)
+{
+ unsigned long sbuf[2];
+ struct qib_devdata *dd = ppd->dd;
+
+ /*
+ * It's possible that sendbuffererror could have bits set; might
+ * have already done this as a result of hardware error handling.
+ */
+ sbuf[0] = qib_read_kreg64(dd, kr_sendbuffererror);
+ sbuf[1] = qib_read_kreg64(dd, kr_sendbuffererror + 1);
+
+ if (sbuf[0] || sbuf[1])
+ qib_disarm_piobufs_set(dd, sbuf,
+ dd->piobcnt2k + dd->piobcnt4k);
+}
+
+static int chk_6120_linkrecovery(struct qib_devdata *dd, u64 ibcs)
+{
+ int ret = 1;
+ u32 ibstate = qib_6120_iblink_state(ibcs);
+ u32 linkrecov = read_6120_creg32(dd, cr_iblinkerrrecov);
+
+ if (linkrecov != dd->cspec->lastlinkrecov) {
+ /* and no more until active again */
+ dd->cspec->lastlinkrecov = 0;
+ qib_set_linkstate(dd->pport, QIB_IB_LINKDOWN);
+ ret = 0;
+ }
+ if (ibstate == IB_PORT_ACTIVE)
+ dd->cspec->lastlinkrecov =
+ read_6120_creg32(dd, cr_iblinkerrrecov);
+ return ret;
+}
+
+static void handle_6120_errors(struct qib_devdata *dd, u64 errs)
+{
+ char *msg;
+ u64 ignore_this_time = 0;
+ u64 iserr = 0;
+ int log_idx;
+ struct qib_pportdata *ppd = dd->pport;
+ u64 mask;
+
+ /* don't report errors that are masked */
+ errs &= dd->cspec->errormask;
+ msg = dd->cspec->emsgbuf;
+
+ /* do these first, they are most important */
+ if (errs & ERR_MASK(HardwareErr))
+ qib_handle_6120_hwerrors(dd, msg, sizeof(dd->cspec->emsgbuf));
+ else
+ for (log_idx = 0; log_idx < QIB_EEP_LOG_CNT; ++log_idx)
+ if (errs & dd->eep_st_masks[log_idx].errs_to_log)
+ qib_inc_eeprom_err(dd, log_idx, 1);
+
+ if (errs & ~IB_E_BITSEXTANT)
+ qib_dev_err(dd,
+ "error interrupt with unknown errors %llx set\n",
+ (unsigned long long) (errs & ~IB_E_BITSEXTANT));
+
+ if (errs & E_SUM_ERRS) {
+ qib_disarm_6120_senderrbufs(ppd);
+ if ((errs & E_SUM_LINK_PKTERRS) &&
+ !(ppd->lflags & QIBL_LINKACTIVE)) {
+ /*
+ * This can happen when trying to bring the link
+ * up, but the IB link changes state at the "wrong"
+ * time. The IB logic then complains that the packet
+ * isn't valid. We don't want to confuse people, so
+ * we just don't print them, except at debug
+ */
+ ignore_this_time = errs & E_SUM_LINK_PKTERRS;
+ }
+ } else if ((errs & E_SUM_LINK_PKTERRS) &&
+ !(ppd->lflags & QIBL_LINKACTIVE)) {
+ /*
+ * This can happen when SMA is trying to bring the link
+ * up, but the IB link changes state at the "wrong" time.
+ * The IB logic then complains that the packet isn't
+ * valid. We don't want to confuse people, so we just
+ * don't print them, except at debug
+ */
+ ignore_this_time = errs & E_SUM_LINK_PKTERRS;
+ }
+
+ qib_write_kreg(dd, kr_errclear, errs);
+
+ errs &= ~ignore_this_time;
+ if (!errs)
+ goto done;
+
+ /*
+ * The ones we mask off are handled specially below
+ * or above.
+ */
+ mask = ERR_MASK(IBStatusChanged) | ERR_MASK(RcvEgrFullErr) |
+ ERR_MASK(RcvHdrFullErr) | ERR_MASK(HardwareErr);
+ qib_decode_6120_err(dd, msg, sizeof(dd->cspec->emsgbuf), errs & ~mask);
+
+ if (errs & E_SUM_PKTERRS)
+ qib_stats.sps_rcverrs++;
+ if (errs & E_SUM_ERRS)
+ qib_stats.sps_txerrs++;
+
+ iserr = errs & ~(E_SUM_PKTERRS | QLOGIC_IB_E_PKTERRS);
+
+ if (errs & ERR_MASK(IBStatusChanged)) {
+ u64 ibcs = qib_read_kreg64(dd, kr_ibcstatus);
+ u32 ibstate = qib_6120_iblink_state(ibcs);
+ int handle = 1;
+
+ if (ibstate != IB_PORT_INIT && dd->cspec->lastlinkrecov)
+ handle = chk_6120_linkrecovery(dd, ibcs);
+ /*
+ * Since going into a recovery state causes the link state
+ * to go down and since recovery is transitory, it is better
+ * if we "miss" ever seeing the link training state go into
+ * recovery (i.e., ignore this transition for link state
+ * special handling purposes) without updating lastibcstat.
+ */
+ if (handle && qib_6120_phys_portstate(ibcs) ==
+ IB_PHYSPORTSTATE_LINK_ERR_RECOVER)
+ handle = 0;
+ if (handle)
+ qib_handle_e_ibstatuschanged(ppd, ibcs);
+ }
+
+ if (errs & ERR_MASK(ResetNegated)) {
+ qib_dev_err(dd,
+ "Got reset, requires re-init (unload and reload driver)\n");
+ dd->flags &= ~QIB_INITTED; /* needs re-init */
+ /* mark as having had error */
+ *dd->devstatusp |= QIB_STATUS_HWERROR;
+ *dd->pport->statusp &= ~QIB_STATUS_IB_CONF;
+ }
+
+ if (*msg && iserr)
+ qib_dev_porterr(dd, ppd->port, "%s error\n", msg);
+
+ if (ppd->state_wanted & ppd->lflags)
+ wake_up_interruptible(&ppd->state_wait);
+
+ /*
+ * If there were hdrq or egrfull errors, wake up any processes
+ * waiting in poll. We used to try to check which contexts had
+ * the overflow, but given the cost of that and the chip reads
+ * to support it, it's better to just wake everybody up if we
+ * get an overflow; waiters can poll again if it's not them.
+ */
+ if (errs & (ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr))) {
+ qib_handle_urcv(dd, ~0U);
+ if (errs & ERR_MASK(RcvEgrFullErr))
+ qib_stats.sps_buffull++;
+ else
+ qib_stats.sps_hdrfull++;
+ }
+done:
+ return;
+}
+
+/**
+ * qib_6120_init_hwerrors - enable hardware errors
+ * @dd: the qlogic_ib device
+ *
+ * Now that we have finished initializing everything that might reasonably
+ * cause a hardware error, and cleared those error bits as they occurred,
+ * we can enable hardware errors in the mask (potentially enabling
+ * freeze mode), and enable hardware errors as errors (along with
+ * everything else) in errormask
+ */
+static void qib_6120_init_hwerrors(struct qib_devdata *dd)
+{
+ u64 val;
+ u64 extsval;
+
+ extsval = qib_read_kreg64(dd, kr_extstatus);
+
+ if (!(extsval & QLOGIC_IB_EXTS_MEMBIST_ENDTEST))
+ qib_dev_err(dd, "MemBIST did not complete!\n");
+
+ /* init so all hwerrors interrupt, and enter freeze; adjust below */
+ val = ~0ULL;
+ if (dd->minrev < 2) {
+ /*
+ * Avoid problem with internal interface bus parity
+ * checking. Fixed in Rev2.
+ */
+ val &= ~QLOGIC_IB_HWE_PCIEBUSPARITYRADM;
+ }
+ /* avoid a freeze-mode issue with some Intel CPUs' speculative reads */
+ val &= ~TXEMEMPARITYERR_PIOBUF;
+
+ dd->cspec->hwerrmask = val;
+
+ qib_write_kreg(dd, kr_hwerrclear, ~HWE_MASK(PowerOnBISTFailed));
+ qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask);
+
+ /* clear all */
+ qib_write_kreg(dd, kr_errclear, ~0ULL);
+ /* enable errors that are masked, at least this first time. */
+ qib_write_kreg(dd, kr_errmask, ~0ULL);
+ dd->cspec->errormask = qib_read_kreg64(dd, kr_errmask);
+ /* clear any interrupts up to this point (ints still not enabled) */
+ qib_write_kreg(dd, kr_intclear, ~0ULL);
+
+ qib_write_kreg(dd, kr_rcvbthqp,
+ dd->qpn_mask << (QIB_6120_RcvBTHQP_BTHQP_Mask_LSB - 1) |
+ QIB_KD_QP);
+}
+
+/*
+ * Disable and enable the armlaunch error. Used for PIO bandwidth testing
+ * on chips that are count-based, rather than trigger-based. There is no
+ * reference counting, but that's also fine, given the intended use.
+ * Only chip-specific because it's all register accesses
+ */
+static void qib_set_6120_armlaunch(struct qib_devdata *dd, u32 enable)
+{
+ if (enable) {
+ qib_write_kreg(dd, kr_errclear,
+ ERR_MASK(SendPioArmLaunchErr));
+ dd->cspec->errormask |= ERR_MASK(SendPioArmLaunchErr);
+ } else
+ dd->cspec->errormask &= ~ERR_MASK(SendPioArmLaunchErr);
+ qib_write_kreg(dd, kr_errmask, dd->cspec->errormask);
+}
+
+/*
+ * Formerly took parameter <which> in pre-shifted,
+ * pre-merged form with LinkCmd and LinkInitCmd
+ * together, and assuming the zero was NOP.
+ */
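+/*
+ * For example, qib_6120_quiet_serdes() below calls
+ * qib_set_ib_6120_lstate(ppd, 0, QLOGIC_IB_IBCC_LINKINITCMD_DISABLE)
+ * at unload time to take the link down and keep link-recovery from
+ * bringing it back up.
+ */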
+static void qib_set_ib_6120_lstate(struct qib_pportdata *ppd, u16 linkcmd,
+ u16 linitcmd)
+{
+ u64 mod_wd;
+ struct qib_devdata *dd = ppd->dd;
+ unsigned long flags;
+
+ if (linitcmd == QLOGIC_IB_IBCC_LINKINITCMD_DISABLE) {
+ /*
+ * If we are told to disable, note that so link-recovery
+ * code does not attempt to bring us back up.
+ */
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags |= QIBL_IB_LINK_DISABLED;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ } else if (linitcmd || linkcmd == QLOGIC_IB_IBCC_LINKCMD_DOWN) {
+ /*
+ * Any other linkinitcmd will lead to LINKDOWN and then
+ * to INIT (if all is well), so clear flag to let
+ * link-recovery code attempt to bring us back up.
+ */
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags &= ~QIBL_IB_LINK_DISABLED;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ }
+
+ mod_wd = (linkcmd << QLOGIC_IB_IBCC_LINKCMD_SHIFT) |
+ (linitcmd << QLOGIC_IB_IBCC_LINKINITCMD_SHIFT);
+
+ qib_write_kreg(dd, kr_ibcctrl, dd->cspec->ibcctrl | mod_wd);
+ /* write to chip to prevent back-to-back writes of control reg */
+ qib_write_kreg(dd, kr_scratch, 0);
+}
+
+/**
+ * qib_6120_bringup_serdes - bring up the serdes
+ * @dd: the qlogic_ib device
+ */
+static int qib_6120_bringup_serdes(struct qib_pportdata *ppd)
+{
+ struct qib_devdata *dd = ppd->dd;
+ u64 val, config1, prev_val, hwstat, ibc;
+
+ /* Put IBC in reset, sends disabled */
+ dd->control &= ~QLOGIC_IB_C_LINKENABLE;
+ qib_write_kreg(dd, kr_control, 0ULL);
+
+ dd->cspec->ibdeltainprog = 1;
+ dd->cspec->ibsymsnap = read_6120_creg32(dd, cr_ibsymbolerr);
+ dd->cspec->iblnkerrsnap = read_6120_creg32(dd, cr_iblinkerrrecov);
+
+ /* flowcontrolwatermark is in units of KBytes */
+ ibc = 0x5ULL << SYM_LSB(IBCCtrl, FlowCtrlWaterMark);
+ /*
+ * How often flowctrl sent. More or less in usecs; balance against
+ * watermark value, so that in theory senders always get a flow
+ * control update in time to not let the IB link go idle.
+ */
+ ibc |= 0x3ULL << SYM_LSB(IBCCtrl, FlowCtrlPeriod);
+ /* max error tolerance */
+ dd->cspec->lli_thresh = 0xf;
+ ibc |= (u64) dd->cspec->lli_thresh << SYM_LSB(IBCCtrl, PhyerrThreshold);
+ /* use "real" buffer space for IB credit flow control */
+ ibc |= 4ULL << SYM_LSB(IBCCtrl, CreditScale);
+ ibc |= 0xfULL << SYM_LSB(IBCCtrl, OverrunThreshold);
+ /*
+ * set initial max size pkt IBC will send, including ICRC; it's the
+ * PIO buffer size in dwords, less 1; also see qib_set_mtu()
+ */
+ ibc |= ((u64)(ppd->ibmaxlen >> 2) + 1) << SYM_LSB(IBCCtrl, MaxPktLen);
+ dd->cspec->ibcctrl = ibc; /* without linkcmd or linkinitcmd! */
+
+ /* initially come up waiting for TS1, without sending anything. */
+ val = dd->cspec->ibcctrl | (QLOGIC_IB_IBCC_LINKINITCMD_DISABLE <<
+ QLOGIC_IB_IBCC_LINKINITCMD_SHIFT);
+ qib_write_kreg(dd, kr_ibcctrl, val);
+
+ val = qib_read_kreg64(dd, kr_serdes_cfg0);
+ config1 = qib_read_kreg64(dd, kr_serdes_cfg1);
+
+ /*
+ * Force reset on, also set rxdetect enable. Must do before reading
+ * serdesstatus at least for simulation, or some of the bits in
+ * serdes status will come back as undefined and cause simulation
+ * failures
+ */
+ val |= SYM_MASK(SerdesCfg0, ResetPLL) |
+ SYM_MASK(SerdesCfg0, RxDetEnX) |
+ (SYM_MASK(SerdesCfg0, L1PwrDnA) |
+ SYM_MASK(SerdesCfg0, L1PwrDnB) |
+ SYM_MASK(SerdesCfg0, L1PwrDnC) |
+ SYM_MASK(SerdesCfg0, L1PwrDnD));
+ qib_write_kreg(dd, kr_serdes_cfg0, val);
+ /* be sure chip saw it */
+ qib_read_kreg64(dd, kr_scratch);
+ udelay(5); /* need pll reset set at least for a bit */
+ /*
+ * after PLL is reset, set the per-lane Resets and TxIdle and
+ * clear the PLL reset and rxdetect (to get falling edge).
+ * Leave L1PWR bits set (permanently)
+ */
+ val &= ~(SYM_MASK(SerdesCfg0, RxDetEnX) |
+ SYM_MASK(SerdesCfg0, ResetPLL) |
+ (SYM_MASK(SerdesCfg0, L1PwrDnA) |
+ SYM_MASK(SerdesCfg0, L1PwrDnB) |
+ SYM_MASK(SerdesCfg0, L1PwrDnC) |
+ SYM_MASK(SerdesCfg0, L1PwrDnD)));
+ val |= (SYM_MASK(SerdesCfg0, ResetA) |
+ SYM_MASK(SerdesCfg0, ResetB) |
+ SYM_MASK(SerdesCfg0, ResetC) |
+ SYM_MASK(SerdesCfg0, ResetD)) |
+ SYM_MASK(SerdesCfg0, TxIdeEnX);
+ qib_write_kreg(dd, kr_serdes_cfg0, val);
+ /* be sure chip saw it */
+ (void) qib_read_kreg64(dd, kr_scratch);
+ /* need PLL reset clear for at least 11 usec before lane
+ * resets cleared; give it a few more to be sure */
+ udelay(15);
+ val &= ~((SYM_MASK(SerdesCfg0, ResetA) |
+ SYM_MASK(SerdesCfg0, ResetB) |
+ SYM_MASK(SerdesCfg0, ResetC) |
+ SYM_MASK(SerdesCfg0, ResetD)) |
+ SYM_MASK(SerdesCfg0, TxIdeEnX));
+
+ qib_write_kreg(dd, kr_serdes_cfg0, val);
+ /* be sure chip saw it */
+ (void) qib_read_kreg64(dd, kr_scratch);
+
+ val = qib_read_kreg64(dd, kr_xgxs_cfg);
+ prev_val = val;
+ if (val & QLOGIC_IB_XGXS_RESET)
+ val &= ~QLOGIC_IB_XGXS_RESET;
+ if (SYM_FIELD(val, XGXSCfg, polarity_inv) != ppd->rx_pol_inv) {
+ /* need to compensate for Tx inversion in partner */
+ val &= ~SYM_MASK(XGXSCfg, polarity_inv);
+ val |= (u64)ppd->rx_pol_inv << SYM_LSB(XGXSCfg, polarity_inv);
+ }
+ if (val != prev_val)
+ qib_write_kreg(dd, kr_xgxs_cfg, val);
+
+ val = qib_read_kreg64(dd, kr_serdes_cfg0);
+
+ /* clear current and de-emphasis bits */
+ config1 &= ~0x0ffffffff00ULL;
+ /* set current to 20ma */
+ config1 |= 0x00000000000ULL;
+ /* set de-emphasis to -5.68dB */
+ config1 |= 0x0cccc000000ULL;
+ qib_write_kreg(dd, kr_serdes_cfg1, config1);
+
+ /* base and port guid same for single port */
+ ppd->guid = dd->base_guid;
+
+ /*
+ * the process of setting and un-resetting the serdes normally
+ * causes a serdes PLL error, so check for that and clear it
+ * here. Also clear the hwerr bit in errstatus, but not others.
+ */
+ hwstat = qib_read_kreg64(dd, kr_hwerrstatus);
+ if (hwstat) {
+ /* should just have PLL, clear all set, in any case */
+ qib_write_kreg(dd, kr_hwerrclear, hwstat);
+ qib_write_kreg(dd, kr_errclear, ERR_MASK(HardwareErr));
+ }
+
+ dd->control |= QLOGIC_IB_C_LINKENABLE;
+ dd->control &= ~QLOGIC_IB_C_FREEZEMODE;
+ qib_write_kreg(dd, kr_control, dd->control);
+
+ return 0;
+}
+
+/**
+ * qib_6120_quiet_serdes - set serdes to txidle
+ * @ppd: physical port of the qlogic_ib device
+ * Called when driver is being unloaded
+ */
+static void qib_6120_quiet_serdes(struct qib_pportdata *ppd)
+{
+ struct qib_devdata *dd = ppd->dd;
+ u64 val;
+
+ qib_set_ib_6120_lstate(ppd, 0, QLOGIC_IB_IBCC_LINKINITCMD_DISABLE);
+
+ /* disable IBC */
+ dd->control &= ~QLOGIC_IB_C_LINKENABLE;
+ qib_write_kreg(dd, kr_control,
+ dd->control | QLOGIC_IB_C_FREEZEMODE);
+
+ if (dd->cspec->ibsymdelta || dd->cspec->iblnkerrdelta ||
+ dd->cspec->ibdeltainprog) {
+ u64 diagc;
+
+ /* enable counter writes */
+ diagc = qib_read_kreg64(dd, kr_hwdiagctrl);
+ qib_write_kreg(dd, kr_hwdiagctrl,
+ diagc | SYM_MASK(HwDiagCtrl, CounterWrEnable));
+
+ if (dd->cspec->ibsymdelta || dd->cspec->ibdeltainprog) {
+ val = read_6120_creg32(dd, cr_ibsymbolerr);
+ if (dd->cspec->ibdeltainprog)
+ val -= val - dd->cspec->ibsymsnap;
+ val -= dd->cspec->ibsymdelta;
+ write_6120_creg(dd, cr_ibsymbolerr, val);
+ }
+ if (dd->cspec->iblnkerrdelta || dd->cspec->ibdeltainprog) {
+ val = read_6120_creg32(dd, cr_iblinkerrrecov);
+ if (dd->cspec->ibdeltainprog)
+ val -= val - dd->cspec->iblnkerrsnap;
+ val -= dd->cspec->iblnkerrdelta;
+ write_6120_creg(dd, cr_iblinkerrrecov, val);
+ }
+
+ /* and disable counter writes */
+ qib_write_kreg(dd, kr_hwdiagctrl, diagc);
+ }
+
+ val = qib_read_kreg64(dd, kr_serdes_cfg0);
+ val |= SYM_MASK(SerdesCfg0, TxIdeEnX);
+ qib_write_kreg(dd, kr_serdes_cfg0, val);
+}
+
+/**
+ * qib_6120_setup_setextled - set the state of the two external LEDs
+ * @ppd: physical port of the qlogic_ib device
+ * @on: whether the link is up or not
+ *
+ * The exact combo of LEDs if on is true is determined by looking
+ * at the ibcstatus.
+ *
+ * These LEDs indicate the physical and logical state of IB link.
+ * For this chip (at least with recommended board pinouts), LED1
+ * is Yellow (logical state) and LED2 is Green (physical state).
+ *
+ * Note: We try to match the Mellanox HCA LED behavior as best
+ * we can. Green indicates physical link state is OK (something is
+ * plugged in, and we can train).
+ * Amber indicates the link is logically up (ACTIVE).
+ * Mellanox further blinks the amber LED to indicate data packet
+ * activity, but we have no hardware support for that, so it would
+ * require waking up every 10-20 msecs and checking the counters
+ * on the chip, and then turning the LED off if appropriate. That's
+ * visible overhead, so not something we will do.
+ *
+ */
+static void qib_6120_setup_setextled(struct qib_pportdata *ppd, u32 on)
+{
+ u64 extctl, val, lst, ltst;
+ unsigned long flags;
+ struct qib_devdata *dd = ppd->dd;
+
+ /*
+ * The diags use the LED to indicate diag info, so we leave
+ * the external LED alone when the diags are running.
+ */
+ if (dd->diag_client)
+ return;
+
+ /* Allow override of LED display, e.g. for locating the system in a rack */
+ if (ppd->led_override) {
+ ltst = (ppd->led_override & QIB_LED_PHYS) ?
+ IB_PHYSPORTSTATE_LINKUP : IB_PHYSPORTSTATE_DISABLED;
+ lst = (ppd->led_override & QIB_LED_LOG) ?
+ IB_PORT_ACTIVE : IB_PORT_DOWN;
+ } else if (on) {
+ val = qib_read_kreg64(dd, kr_ibcstatus);
+ ltst = qib_6120_phys_portstate(val);
+ lst = qib_6120_iblink_state(val);
+ } else {
+ ltst = 0;
+ lst = 0;
+ }
+
+ spin_lock_irqsave(&dd->cspec->gpio_lock, flags);
+ extctl = dd->cspec->extctrl & ~(SYM_MASK(EXTCtrl, LEDPriPortGreenOn) |
+ SYM_MASK(EXTCtrl, LEDPriPortYellowOn));
+
+ if (ltst == IB_PHYSPORTSTATE_LINKUP)
+ extctl |= SYM_MASK(EXTCtrl, LEDPriPortYellowOn);
+ if (lst == IB_PORT_ACTIVE)
+ extctl |= SYM_MASK(EXTCtrl, LEDPriPortGreenOn);
+ dd->cspec->extctrl = extctl;
+ qib_write_kreg(dd, kr_extctrl, extctl);
+ spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags);
+}
+
+static void qib_6120_free_irq(struct qib_devdata *dd)
+{
+ if (dd->cspec->irq) {
+ free_irq(dd->cspec->irq, dd);
+ dd->cspec->irq = 0;
+ }
+ qib_nomsi(dd);
+}
+
+/**
+ * qib_6120_setup_cleanup - clean up any chip-specific stuff
+ * @dd: the qlogic_ib device
+ *
+ * This is called during driver unload.
+ */
+static void qib_6120_setup_cleanup(struct qib_devdata *dd)
+{
+ qib_6120_free_irq(dd);
+ kfree(dd->cspec->cntrs);
+ kfree(dd->cspec->portcntrs);
+ if (dd->cspec->dummy_hdrq) {
+ dma_free_coherent(&dd->pcidev->dev,
+ ALIGN(dd->rcvhdrcnt *
+ dd->rcvhdrentsize *
+ sizeof(u32), PAGE_SIZE),
+ dd->cspec->dummy_hdrq,
+ dd->cspec->dummy_hdrq_phys);
+ dd->cspec->dummy_hdrq = NULL;
+ }
+}
+
+static void qib_wantpiobuf_6120_intr(struct qib_devdata *dd, u32 needint)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->sendctrl_lock, flags);
+ if (needint)
+ dd->sendctrl |= SYM_MASK(SendCtrl, PIOIntBufAvail);
+ else
+ dd->sendctrl &= ~SYM_MASK(SendCtrl, PIOIntBufAvail);
+ qib_write_kreg(dd, kr_sendctrl, dd->sendctrl);
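+ /*
+ * As with kr_ibcctrl elsewhere in this file, the scratch write below
+ * avoids back-to-back writes of the control register.
+ */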
+ qib_write_kreg(dd, kr_scratch, 0ULL);
+ spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+}
+
+/*
+ * handle errors and unusual events first, separate function
+ * to improve cache hits for fast path interrupt handling
+ */
+static noinline void unlikely_6120_intr(struct qib_devdata *dd, u64 istat)
+{
+ if (unlikely(istat & ~QLOGIC_IB_I_BITSEXTANT))
+ qib_dev_err(dd, "interrupt with unknown interrupts %Lx set\n",
+ istat & ~QLOGIC_IB_I_BITSEXTANT);
+
+ if (istat & QLOGIC_IB_I_ERROR) {
+ u64 estat = 0;
+
+ qib_stats.sps_errints++;
+ estat = qib_read_kreg64(dd, kr_errstatus);
+ if (!estat)
+ qib_devinfo(dd->pcidev,
+ "error interrupt (%Lx), but no error bits set!\n",
+ istat);
+ handle_6120_errors(dd, estat);
+ }
+
+ if (istat & QLOGIC_IB_I_GPIO) {
+ u32 gpiostatus;
+ u32 to_clear = 0;
+
+ /*
+ * GPIO_3..5 on IBA6120 Rev2 chips indicate
+ * errors that we need to count.
+ */
+ gpiostatus = qib_read_kreg32(dd, kr_gpio_status);
+ /* First the error-counter case. */
+ if (gpiostatus & GPIO_ERRINTR_MASK) {
+ /* want to clear the bits we see asserted. */
+ to_clear |= (gpiostatus & GPIO_ERRINTR_MASK);
+
+ /*
+ * Count appropriately, clear bits out of our copy,
+ * as they have been "handled".
+ */
+ if (gpiostatus & (1 << GPIO_RXUVL_BIT))
+ dd->cspec->rxfc_unsupvl_errs++;
+ if (gpiostatus & (1 << GPIO_OVRUN_BIT))
+ dd->cspec->overrun_thresh_errs++;
+ if (gpiostatus & (1 << GPIO_LLI_BIT))
+ dd->cspec->lli_errs++;
+ gpiostatus &= ~GPIO_ERRINTR_MASK;
+ }
+ if (gpiostatus) {
+ /*
+ * Some unexpected bits remain. If they could have
+ * caused the interrupt, complain and clear.
+ * To avoid repetition of this condition, also clear
+ * the mask. It is almost certainly due to error.
+ */
+ const u32 mask = qib_read_kreg32(dd, kr_gpio_mask);
+
+ /*
+ * Also check that the chip reflects our shadow, and report
+ * issues. If they caused the interrupt, we will suppress
+ * further ones by refreshing from the shadow.
+ */
+ if (mask & gpiostatus) {
+ to_clear |= (gpiostatus & mask);
+ dd->cspec->gpio_mask &= ~(gpiostatus & mask);
+ qib_write_kreg(dd, kr_gpio_mask,
+ dd->cspec->gpio_mask);
+ }
+ }
+ if (to_clear)
+ qib_write_kreg(dd, kr_gpio_clear, (u64) to_clear);
+ }
+}
+
+static irqreturn_t qib_6120intr(int irq, void *data)
+{
+ struct qib_devdata *dd = data;
+ irqreturn_t ret;
+ u32 istat, ctxtrbits, rmask, crcs = 0;
+ unsigned i;
+
+ if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) {
+ /*
+ * This return value is not great, but we do not want the
+ * interrupt core code to remove our interrupt handler
+ * because we don't appear to be handling an interrupt
+ * during a chip reset.
+ */
+ ret = IRQ_HANDLED;
+ goto bail;
+ }
+
+ istat = qib_read_kreg32(dd, kr_intstatus);
+
+ if (unlikely(!istat)) {
+ ret = IRQ_NONE; /* not our interrupt, or already handled */
+ goto bail;
+ }
+ if (unlikely(istat == -1)) {
+ qib_bad_intrstatus(dd);
+ /* don't know if it was our interrupt or not */
+ ret = IRQ_NONE;
+ goto bail;
+ }
+
+ this_cpu_inc(*dd->int_counter);
+
+ if (unlikely(istat & (~QLOGIC_IB_I_BITSEXTANT |
+ QLOGIC_IB_I_GPIO | QLOGIC_IB_I_ERROR)))
+ unlikely_6120_intr(dd, istat);
+
+ /*
+ * Clear the interrupt bits we found set, relatively early, so we
+ * "know" the chip will have seen this by the time we process
+ * the queue, and will re-interrupt if necessary. The processor
+ * itself won't take the interrupt again until we return.
+ */
+ qib_write_kreg(dd, kr_intclear, istat);
+
+ /*
+ * Handle kernel receive queues before checking for pio buffers
+ * available since receives can overflow; piobuf waiters can afford
+ * a few extra cycles, since they were waiting anyway.
+ */
+ ctxtrbits = istat &
+ ((QLOGIC_IB_I_RCVAVAIL_MASK << QLOGIC_IB_I_RCVAVAIL_SHIFT) |
+ (QLOGIC_IB_I_RCVURG_MASK << QLOGIC_IB_I_RCVURG_SHIFT));
+ if (ctxtrbits) {
+ rmask = (1U << QLOGIC_IB_I_RCVAVAIL_SHIFT) |
+ (1U << QLOGIC_IB_I_RCVURG_SHIFT);
+ for (i = 0; i < dd->first_user_ctxt; i++) {
+ if (ctxtrbits & rmask) {
+ ctxtrbits &= ~rmask;
+ crcs += qib_kreceive(dd->rcd[i],
+ &dd->cspec->lli_counter,
+ NULL);
+ }
+ rmask <<= 1;
+ }
+ if (crcs) {
+ u32 cntr = dd->cspec->lli_counter;
+
+ cntr += crcs;
+ if (cntr) {
+ if (cntr > dd->cspec->lli_thresh) {
+ dd->cspec->lli_counter = 0;
+ dd->cspec->lli_errs++;
+ } else
+ dd->cspec->lli_counter += cntr;
+ }
+ }
+
+
+ if (ctxtrbits) {
+ ctxtrbits =
+ (ctxtrbits >> QLOGIC_IB_I_RCVAVAIL_SHIFT) |
+ (ctxtrbits >> QLOGIC_IB_I_RCVURG_SHIFT);
+ qib_handle_urcv(dd, ctxtrbits);
+ }
+ }
+
+ if ((istat & QLOGIC_IB_I_SPIOBUFAVAIL) && (dd->flags & QIB_INITTED))
+ qib_ib_piobufavail(dd);
+
+ ret = IRQ_HANDLED;
+bail:
+ return ret;
+}
+
+/*
+ * Set up our chip-specific interrupt handler
+ * The interrupt type has already been setup, so
+ * we just need to do the registration and error checking.
+ */
+static void qib_setup_6120_interrupt(struct qib_devdata *dd)
+{
+ /*
+ * If the chip supports added error indication via GPIO pins,
+ * enable interrupts on those bits so the interrupt routine
+ * can count the events. Also set flag so interrupt routine
+ * can know they are expected.
+ */
+ if (SYM_FIELD(dd->revision, Revision_R,
+ ChipRevMinor) > 1) {
+ /* Rev2+ reports extra errors via internal GPIO pins */
+ dd->cspec->gpio_mask |= GPIO_ERRINTR_MASK;
+ qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask);
+ }
+
+ if (!dd->cspec->irq)
+ qib_dev_err(dd,
+ "irq is 0, BIOS error? Interrupts won't work\n");
+ else {
+ int ret;
+
+ ret = request_irq(dd->cspec->irq, qib_6120intr, 0,
+ QIB_DRV_NAME, dd);
+ if (ret)
+ qib_dev_err(dd,
+ "Couldn't setup interrupt (irq=%d): %d\n",
+ dd->cspec->irq, ret);
+ }
+}
+
+/**
+ * pe_boardname - fill in the board name
+ * @dd: the qlogic_ib device
+ *
+ * info is based on the board revision register
+ */
+static void pe_boardname(struct qib_devdata *dd)
+{
+ char *n;
+ u32 boardid, namelen;
+
+ boardid = SYM_FIELD(dd->revision, Revision,
+ BoardID);
+
+ switch (boardid) {
+ case 2:
+ n = "InfiniPath_QLE7140";
+ break;
+ default:
+ qib_dev_err(dd, "Unknown 6120 board with ID %u\n", boardid);
+ n = "Unknown_InfiniPath_6120";
+ break;
+ }
+ namelen = strlen(n) + 1;
+ dd->boardname = kmalloc(namelen, GFP_KERNEL);
+ if (!dd->boardname)
+ qib_dev_err(dd, "Failed allocation for board name: %s\n", n);
+ else
+ snprintf(dd->boardname, namelen, "%s", n);
+
+ if (dd->majrev != 4 || !dd->minrev || dd->minrev > 2)
+ qib_dev_err(dd,
+ "Unsupported InfiniPath hardware revision %u.%u!\n",
+ dd->majrev, dd->minrev);
+
+ snprintf(dd->boardversion, sizeof(dd->boardversion),
+ "ChipABI %u.%u, %s, InfiniPath%u %u.%u, SW Compat %u\n",
+ QIB_CHIP_VERS_MAJ, QIB_CHIP_VERS_MIN, dd->boardname,
+ (unsigned)SYM_FIELD(dd->revision, Revision_R, Arch),
+ dd->majrev, dd->minrev,
+ (unsigned)SYM_FIELD(dd->revision, Revision_R, SW));
+
+}
+
+/*
+ * This routine sleeps, so it can only be called from user context, not
+ * from interrupt context. If we need interrupt context, we can split
+ * it into two routines.
+ */
+static int qib_6120_setup_reset(struct qib_devdata *dd)
+{
+ u64 val;
+ int i;
+ int ret;
+ u16 cmdval;
+ u8 int_line, clinesz;
+
+ qib_pcie_getcmd(dd, &cmdval, &int_line, &clinesz);
+
+ /* Use ERROR so it shows up in logs, etc. */
+ qib_dev_err(dd, "Resetting InfiniPath unit %u\n", dd->unit);
+
+ /* no interrupts till re-initted */
+ qib_6120_set_intr_state(dd, 0);
+
+ dd->cspec->ibdeltainprog = 0;
+ dd->cspec->ibsymdelta = 0;
+ dd->cspec->iblnkerrdelta = 0;
+
+ /*
+ * Keep chip from being accessed until we are ready. Use
+ * writeq() directly, to allow the write even though QIB_PRESENT
+ * isn't set.
+ */
+ dd->flags &= ~(QIB_INITTED | QIB_PRESENT);
+ /* so we check interrupts work again */
+ dd->z_int_counter = qib_int_counter(dd);
+ val = dd->control | QLOGIC_IB_C_RESET;
+ writeq(val, &dd->kregbase[kr_control]);
+ mb(); /* prevent compiler re-ordering around actual reset */
+
+ for (i = 1; i <= 5; i++) {
+ /*
+ * Allow MBIST, etc. to complete; longer on each retry.
+ * We sometimes get machine checks from bus timeout if no
+ * response, so for now, make it *really* long.
+ */
+ msleep(1000 + (1 + i) * 2000);
+
+ qib_pcie_reenable(dd, cmdval, int_line, clinesz);
+
+ /*
+ * Use readq directly, so we don't need to mark it as PRESENT
+ * until we get a successful indication that all is well.
+ */
+ val = readq(&dd->kregbase[kr_revision]);
+ if (val == dd->revision) {
+ dd->flags |= QIB_PRESENT; /* it's back */
+ ret = qib_reinit_intr(dd);
+ goto bail;
+ }
+ }
+ ret = 0; /* failed */
+
+bail:
+ if (ret) {
+ if (qib_pcie_params(dd, dd->lbus_width, NULL, NULL))
+ qib_dev_err(dd,
+ "Reset failed to setup PCIe or interrupts; continuing anyway\n");
+ /* clear the reset error, init error/hwerror mask */
+ qib_6120_init_hwerrors(dd);
+ /* for Rev2 error interrupts; nop for rev 1 */
+ qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask);
+ /* clear the reset error, init error/hwerror mask */
+ qib_6120_init_hwerrors(dd);
+ }
+ return ret;
+}
+
+/**
+ * qib_6120_put_tid - write a TID in chip
+ * @dd: the qlogic_ib device
+ * @tidptr: pointer to the expected TID (in chip) to update
+ * @type: RCVHQ_RCV_TYPE_EAGER (1) for eager, RCVHQ_RCV_TYPE_EXPECTED (0)
+ * for expected
+ * @pa: physical address of in memory buffer; tidinvalid if freeing
+ *
+ * This exists as a separate routine to allow for special locking etc.
+ * It's used both for the full cleanup on exit and for the normal
+ * setup and teardown.
+ */
+static void qib_6120_put_tid(struct qib_devdata *dd, u64 __iomem *tidptr,
+ u32 type, unsigned long pa)
+{
+ u32 __iomem *tidp32 = (u32 __iomem *)tidptr;
+ unsigned long flags;
+ int tidx;
+ spinlock_t *tidlockp; /* select appropriate spinlock */
+
+ if (!dd->kregbase)
+ return;
+
+ if (pa != dd->tidinvalid) {
+ if (pa & ((1U << 11) - 1)) {
+ qib_dev_err(dd, "Physaddr %lx not 2KB aligned!\n",
+ pa);
+ return;
+ }
+ pa >>= 11;
+ if (pa & ~QLOGIC_IB_RT_ADDR_MASK) {
+ qib_dev_err(dd,
+ "Physical page address 0x%lx larger than supported\n",
+ pa);
+ return;
+ }
+
+ if (type == RCVHQ_RCV_TYPE_EAGER)
+ pa |= dd->tidtemplate;
+ else /* for now, always full 4KB page */
+ pa |= 2 << 29;
+ }
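+ /*
+ * At this point pa is the 32-bit TID word written below:
+ * roughly, the low bits (within QLOGIC_IB_RT_ADDR_MASK) hold
+ * the 2KB page number (physaddr >> 11), and bits 29-30 hold a
+ * buffer-size code (1 = 2KB, 2 = 4KB) taken from tidtemplate
+ * for eager entries, or forced to the 4KB code for expected
+ * entries, as set up in qib_6120_tidtemplate() below.
+ */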
+
+ /*
+ * Avoid chip issue by writing the scratch register
+ * before and after the TID, and with an io write barrier.
+ * We use a spinlock around the writes, so they can't intermix
+ * with other TID (eager or expected) writes (the chip problem
+ * is triggered by back to back TID writes). Unfortunately, this
+ * call can be done from interrupt level for the ctxt 0 eager TIDs,
+ * so we have to use irqsave locks.
+ */
+ /*
+ * Assumes tidptr always > egrtidbase
+ * if type == RCVHQ_RCV_TYPE_EAGER.
+ */
+ tidx = tidptr - dd->egrtidbase;
+
+ tidlockp = (type == RCVHQ_RCV_TYPE_EAGER && tidx < dd->rcvhdrcnt)
+ ? &dd->cspec->kernel_tid_lock : &dd->cspec->user_tid_lock;
+ spin_lock_irqsave(tidlockp, flags);
+ qib_write_kreg(dd, kr_scratch, 0xfeeddeaf);
+ writel(pa, tidp32);
+ qib_write_kreg(dd, kr_scratch, 0xdeadbeef);
+ mmiowb();
+ spin_unlock_irqrestore(tidlockp, flags);
+}
+
+/**
+ * qib_6120_put_tid_2 - write a TID in chip, Revision 2 or higher
+ * @dd: the qlogic_ib device
+ * @tidptr: pointer to the expected TID (in chip) to update
+ * @type: RCVHQ_RCV_TYPE_EAGER (1) for eager, RCVHQ_RCV_TYPE_EXPECTED (0)
+ * for expected
+ * @pa: physical address of in memory buffer; tidinvalid if freeing
+ *
+ * This exists as a separate routine to allow for selection of the
+ * appropriate "flavor". The static calls in cleanup just use the
+ * revision-agnostic form, as they are not performance critical.
+ */
+static void qib_6120_put_tid_2(struct qib_devdata *dd, u64 __iomem *tidptr,
+ u32 type, unsigned long pa)
+{
+ u32 __iomem *tidp32 = (u32 __iomem *)tidptr;
+ u32 tidx;
+
+ if (!dd->kregbase)
+ return;
+
+ if (pa != dd->tidinvalid) {
+ if (pa & ((1U << 11) - 1)) {
+ qib_dev_err(dd, "Physaddr %lx not 2KB aligned!\n",
+ pa);
+ return;
+ }
+ pa >>= 11;
+ if (pa & ~QLOGIC_IB_RT_ADDR_MASK) {
+ qib_dev_err(dd,
+ "Physical page address 0x%lx larger than supported\n",
+ pa);
+ return;
+ }
+
+ if (type == RCVHQ_RCV_TYPE_EAGER)
+ pa |= dd->tidtemplate;
+ else /* for now, always full 4KB page */
+ pa |= 2 << 29;
+ }
+ tidx = tidptr - dd->egrtidbase;
+ writel(pa, tidp32);
+ mmiowb();
+}
+
+
+/**
+ * qib_6120_clear_tids - clear all TID entries for a context, expected and eager
+ * @dd: the qlogic_ib device
+ * @rcd: the context data
+ *
+ * clear all TID entries for a context, expected and eager.
+ * Used from qib_close(). On this chip, TIDs are only 32 bits,
+ * not 64, but they are still on 64 bit boundaries, so tidbase
+ * is declared as u64 * for the pointer math, even though we write 32 bits
+ */
+static void qib_6120_clear_tids(struct qib_devdata *dd,
+ struct qib_ctxtdata *rcd)
+{
+ u64 __iomem *tidbase;
+ unsigned long tidinv;
+ u32 ctxt;
+ int i;
+
+ if (!dd->kregbase || !rcd)
+ return;
+
+ ctxt = rcd->ctxt;
+
+ tidinv = dd->tidinvalid;
+ tidbase = (u64 __iomem *)
+ ((char __iomem *)(dd->kregbase) +
+ dd->rcvtidbase +
+ ctxt * dd->rcvtidcnt * sizeof(*tidbase));
+
+ for (i = 0; i < dd->rcvtidcnt; i++)
+ /* use func pointer because could be one of two funcs */
+ dd->f_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EXPECTED,
+ tidinv);
+
+ tidbase = (u64 __iomem *)
+ ((char __iomem *)(dd->kregbase) +
+ dd->rcvegrbase +
+ rcd->rcvegr_tid_base * sizeof(*tidbase));
+
+ for (i = 0; i < rcd->rcvegrcnt; i++)
+ /* use func pointer because could be one of two funcs */
+ dd->f_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EAGER,
+ tidinv);
+}
+
+/**
+ * qib_6120_tidtemplate - setup constants for TID updates
+ * @dd: the qlogic_ib device
+ *
+ * We setup stuff that we use a lot, to avoid calculating each time
+ */
+static void qib_6120_tidtemplate(struct qib_devdata *dd)
+{
+ u32 egrsize = dd->rcvegrbufsize;
+
+ /*
+ * For now, we always allocate 4KB buffers (at init) so we can
+ * receive max size packets. We may want a module parameter to
+ * specify 2KB or 4KB and/or make it per ctxt instead of per device
+ * for those who want to reduce memory footprint. Note that the
+ * rcvhdrentsize size must be large enough to hold the largest
+ * IB header (currently 96 bytes) that we expect to handle (plus of
+ * course the 2 dwords of RHF).
+ */
+ if (egrsize == 2048)
+ dd->tidtemplate = 1U << 29;
+ else if (egrsize == 4096)
+ dd->tidtemplate = 2U << 29;
+ dd->tidinvalid = 0;
+}
+
+int __attribute__((weak)) qib_unordered_wc(void)
+{
+ return 0;
+}
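+/*
+ * The weak definition above is only a default; an architecture with
+ * unordered write-combining can supply a strong qib_unordered_wc()
+ * returning nonzero, which makes the driver set
+ * QIB_RUNTIME_FORCE_WC_ORDER for user contexts (below) and
+ * QIB_PIO_FLUSH_WC in init_6120_variables().
+ */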
+
+/**
+ * qib_6120_get_base_info - set chip-specific flags for user code
+ * @rcd: the qlogic_ib ctxt
+ * @kinfo: qib_base_info pointer
+ *
+ * We set the PCIE flag because the lower bandwidth on PCIe vs
+ * HyperTransport can affect some user packet algorithms.
+ */
+static int qib_6120_get_base_info(struct qib_ctxtdata *rcd,
+ struct qib_base_info *kinfo)
+{
+ if (qib_unordered_wc())
+ kinfo->spi_runtime_flags |= QIB_RUNTIME_FORCE_WC_ORDER;
+
+ kinfo->spi_runtime_flags |= QIB_RUNTIME_PCIE |
+ QIB_RUNTIME_FORCE_PIOAVAIL | QIB_RUNTIME_PIO_REGSWAPPED;
+ return 0;
+}
+
+
+static struct qib_message_header *
+qib_6120_get_msgheader(struct qib_devdata *dd, __le32 *rhf_addr)
+{
+ return (struct qib_message_header *)
+ &rhf_addr[sizeof(u64) / sizeof(u32)];
+}
+
+static void qib_6120_config_ctxts(struct qib_devdata *dd)
+{
+ dd->ctxtcnt = qib_read_kreg32(dd, kr_portcnt);
+ if (qib_n_krcv_queues > 1) {
+ dd->first_user_ctxt = qib_n_krcv_queues * dd->num_pports;
+ if (dd->first_user_ctxt > dd->ctxtcnt)
+ dd->first_user_ctxt = dd->ctxtcnt;
+ dd->qpn_mask = dd->first_user_ctxt <= 2 ? 2 : 6;
+ } else
+ dd->first_user_ctxt = dd->num_pports;
+ dd->n_krcv_queues = dd->first_user_ctxt;
+}
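+/*
+ * For example: with the default single kernel receive queue,
+ * first_user_ctxt above is just num_pports (1 on a 6120) and
+ * qpn_mask is left alone. Asking for two kernel receive queues
+ * gives first_user_ctxt = 2 and qpn_mask = 2; three or more give
+ * qpn_mask = 6.
+ */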
+
+static void qib_update_6120_usrhead(struct qib_ctxtdata *rcd, u64 hd,
+ u32 updegr, u32 egrhd, u32 npkts)
+{
+ if (updegr)
+ qib_write_ureg(rcd->dd, ur_rcvegrindexhead, egrhd, rcd->ctxt);
+ mmiowb();
+ qib_write_ureg(rcd->dd, ur_rcvhdrhead, hd, rcd->ctxt);
+ mmiowb();
+}
+
+static u32 qib_6120_hdrqempty(struct qib_ctxtdata *rcd)
+{
+ u32 head, tail;
+
+ head = qib_read_ureg32(rcd->dd, ur_rcvhdrhead, rcd->ctxt);
+ if (rcd->rcvhdrtail_kvaddr)
+ tail = qib_get_rcvhdrtail(rcd);
+ else
+ tail = qib_read_ureg32(rcd->dd, ur_rcvhdrtail, rcd->ctxt);
+ return head == tail;
+}
+
+/*
+ * Used when we close any ctxt, for DMA already in flight
+ * at close. Can't be done until we know hdrq size, so not
+ * early in chip init.
+ */
+static void alloc_dummy_hdrq(struct qib_devdata *dd)
+{
+ dd->cspec->dummy_hdrq = dma_alloc_coherent(&dd->pcidev->dev,
+ dd->rcd[0]->rcvhdrq_size,
+ &dd->cspec->dummy_hdrq_phys,
+ GFP_ATOMIC | __GFP_COMP);
+ if (!dd->cspec->dummy_hdrq) {
+ qib_devinfo(dd->pcidev, "Couldn't allocate dummy hdrq\n");
+ /* fallback to just 0'ing */
+ dd->cspec->dummy_hdrq_phys = 0UL;
+ }
+}
+
+/*
+ * Modify the RCVCTRL register in a chip-specific way. This
+ * is a function because bit positions and (future) register
+ * locations are chip-specific, but the needed operations are
+ * generic. <op> is a bit-mask because we often want to
+ * do multiple modifications.
+ */
+static void rcvctrl_6120_mod(struct qib_pportdata *ppd, unsigned int op,
+ int ctxt)
+{
+ struct qib_devdata *dd = ppd->dd;
+ u64 mask, val;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags);
+
+ if (op & QIB_RCVCTRL_TAILUPD_ENB)
+ dd->rcvctrl |= (1ULL << QLOGIC_IB_R_TAILUPD_SHIFT);
+ if (op & QIB_RCVCTRL_TAILUPD_DIS)
+ dd->rcvctrl &= ~(1ULL << QLOGIC_IB_R_TAILUPD_SHIFT);
+ if (op & QIB_RCVCTRL_PKEY_ENB)
+ dd->rcvctrl &= ~(1ULL << IBA6120_R_PKEY_DIS_SHIFT);
+ if (op & QIB_RCVCTRL_PKEY_DIS)
+ dd->rcvctrl |= (1ULL << IBA6120_R_PKEY_DIS_SHIFT);
+ if (ctxt < 0)
+ mask = (1ULL << dd->ctxtcnt) - 1;
+ else
+ mask = (1ULL << ctxt);
+ if (op & QIB_RCVCTRL_CTXT_ENB) {
+ /* always done for specific ctxt */
+ dd->rcvctrl |= (mask << SYM_LSB(RcvCtrl, PortEnable));
+ if (!(dd->flags & QIB_NODMA_RTAIL))
+ dd->rcvctrl |= 1ULL << QLOGIC_IB_R_TAILUPD_SHIFT;
+ /* Write these registers before the context is enabled. */
+ qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr, ctxt,
+ dd->rcd[ctxt]->rcvhdrqtailaddr_phys);
+ qib_write_kreg_ctxt(dd, kr_rcvhdraddr, ctxt,
+ dd->rcd[ctxt]->rcvhdrq_phys);
+
+ if (ctxt == 0 && !dd->cspec->dummy_hdrq)
+ alloc_dummy_hdrq(dd);
+ }
+ if (op & QIB_RCVCTRL_CTXT_DIS)
+ dd->rcvctrl &= ~(mask << SYM_LSB(RcvCtrl, PortEnable));
+ if (op & QIB_RCVCTRL_INTRAVAIL_ENB)
+ dd->rcvctrl |= (mask << QLOGIC_IB_R_INTRAVAIL_SHIFT);
+ if (op & QIB_RCVCTRL_INTRAVAIL_DIS)
+ dd->rcvctrl &= ~(mask << QLOGIC_IB_R_INTRAVAIL_SHIFT);
+ qib_write_kreg(dd, kr_rcvctrl, dd->rcvctrl);
+ if ((op & QIB_RCVCTRL_INTRAVAIL_ENB) && dd->rhdrhead_intr_off) {
+ /* arm rcv interrupt */
+ val = qib_read_ureg32(dd, ur_rcvhdrhead, ctxt) |
+ dd->rhdrhead_intr_off;
+ qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt);
+ }
+ if (op & QIB_RCVCTRL_CTXT_ENB) {
+ /*
+ * Init the context registers also; if we were
+ * disabled, tail and head should both be zero
+ * already from the enable, but since we don't
+ * know, we have to do it explicitly.
+ */
+ val = qib_read_ureg32(dd, ur_rcvegrindextail, ctxt);
+ qib_write_ureg(dd, ur_rcvegrindexhead, val, ctxt);
+
+ val = qib_read_ureg32(dd, ur_rcvhdrtail, ctxt);
+ dd->rcd[ctxt]->head = val;
+ /* If kctxt, interrupt on next receive. */
+ if (ctxt < dd->first_user_ctxt)
+ val |= dd->rhdrhead_intr_off;
+ qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt);
+ }
+ if (op & QIB_RCVCTRL_CTXT_DIS) {
+ /*
+ * Be paranoid, and never write 0's to these, just use an
+ * unused page. Of course,
+ * rcvhdraddr points to a large chunk of memory, so this
+ * could still trash things, but at least it won't trash
+ * page 0, and by disabling the ctxt, it should stop "soon",
+ * even if a packet or two is already in flight after we
+ * disabled the ctxt. Only 6120 has this issue.
+ */
+ if (ctxt >= 0) {
+ qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr, ctxt,
+ dd->cspec->dummy_hdrq_phys);
+ qib_write_kreg_ctxt(dd, kr_rcvhdraddr, ctxt,
+ dd->cspec->dummy_hdrq_phys);
+ } else {
+ unsigned i;
+
+ for (i = 0; i < dd->cfgctxts; i++) {
+ qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr,
+ i, dd->cspec->dummy_hdrq_phys);
+ qib_write_kreg_ctxt(dd, kr_rcvhdraddr,
+ i, dd->cspec->dummy_hdrq_phys);
+ }
+ }
+ }
+ spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags);
+}
+
+/*
+ * Modify the SENDCTRL register in a chip-specific way. This
+ * is a function because there may be multiple such registers with
+ * slightly different layouts. Only the operations actually used
+ * are implemented so far.
+ * The chip requires no back-to-back sendctrl writes, so write the
+ * scratch register after writing sendctrl.
+ */
+static void sendctrl_6120_mod(struct qib_pportdata *ppd, u32 op)
+{
+ struct qib_devdata *dd = ppd->dd;
+ u64 tmp_dd_sendctrl;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->sendctrl_lock, flags);
+
+ /* First the ones that are "sticky", saved in shadow */
+ if (op & QIB_SENDCTRL_CLEAR)
+ dd->sendctrl = 0;
+ if (op & QIB_SENDCTRL_SEND_DIS)
+ dd->sendctrl &= ~SYM_MASK(SendCtrl, PIOEnable);
+ else if (op & QIB_SENDCTRL_SEND_ENB)
+ dd->sendctrl |= SYM_MASK(SendCtrl, PIOEnable);
+ if (op & QIB_SENDCTRL_AVAIL_DIS)
+ dd->sendctrl &= ~SYM_MASK(SendCtrl, PIOBufAvailUpd);
+ else if (op & QIB_SENDCTRL_AVAIL_ENB)
+ dd->sendctrl |= SYM_MASK(SendCtrl, PIOBufAvailUpd);
+
+ if (op & QIB_SENDCTRL_DISARM_ALL) {
+ u32 i, last;
+
+ tmp_dd_sendctrl = dd->sendctrl;
+ /*
+ * disarm any that are not yet launched, disabling sends
+ * and updates until done.
+ */
+ last = dd->piobcnt2k + dd->piobcnt4k;
+ tmp_dd_sendctrl &=
+ ~(SYM_MASK(SendCtrl, PIOEnable) |
+ SYM_MASK(SendCtrl, PIOBufAvailUpd));
+ for (i = 0; i < last; i++) {
+ qib_write_kreg(dd, kr_sendctrl, tmp_dd_sendctrl |
+ SYM_MASK(SendCtrl, Disarm) | i);
+ qib_write_kreg(dd, kr_scratch, 0);
+ }
+ }
+
+ tmp_dd_sendctrl = dd->sendctrl;
+
+ if (op & QIB_SENDCTRL_FLUSH)
+ tmp_dd_sendctrl |= SYM_MASK(SendCtrl, Abort);
+ if (op & QIB_SENDCTRL_DISARM)
+ tmp_dd_sendctrl |= SYM_MASK(SendCtrl, Disarm) |
+ ((op & QIB_6120_SendCtrl_DisarmPIOBuf_RMASK) <<
+ SYM_LSB(SendCtrl, DisarmPIOBuf));
+ if (op & QIB_SENDCTRL_AVAIL_BLIP)
+ tmp_dd_sendctrl &= ~SYM_MASK(SendCtrl, PIOBufAvailUpd);
+
+ qib_write_kreg(dd, kr_sendctrl, tmp_dd_sendctrl);
+ qib_write_kreg(dd, kr_scratch, 0);
+
+ if (op & QIB_SENDCTRL_AVAIL_BLIP) {
+ qib_write_kreg(dd, kr_sendctrl, dd->sendctrl);
+ qib_write_kreg(dd, kr_scratch, 0);
+ }
+
+ spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+
+ if (op & QIB_SENDCTRL_FLUSH) {
+ u32 v;
+ /*
+ * ensure writes have hit chip, then do a few
+ * more reads, to allow DMA of pioavail registers
+ * to occur, so in-memory copy is in sync with
+ * the chip. Not always safe to sleep.
+ */
+ v = qib_read_kreg32(dd, kr_scratch);
+ qib_write_kreg(dd, kr_scratch, v);
+ v = qib_read_kreg32(dd, kr_scratch);
+ qib_write_kreg(dd, kr_scratch, v);
+ qib_read_kreg32(dd, kr_scratch);
+ }
+}
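+/*
+ * Typical use of the above, as in get_6120_link_buf() later in this
+ * file:
+ *
+ * sendctrl_6120_mod(ppd, QIB_SENDCTRL_DISARM_ALL |
+ *                  QIB_SENDCTRL_FLUSH | QIB_SENDCTRL_AVAIL_BLIP);
+ *
+ * This disarms every PIO buffer, aborts anything queued, and blips
+ * PIOBufAvailUpd so the in-memory buffer-available shadow gets
+ * refreshed by the chip.
+ */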
+
+/**
+ * qib_portcntr_6120 - read a per-port counter
+ * @ppd: the qlogic_ib port data
+ * @reg: the counter to snapshot
+ */
+static u64 qib_portcntr_6120(struct qib_pportdata *ppd, u32 reg)
+{
+ u64 ret = 0ULL;
+ struct qib_devdata *dd = ppd->dd;
+ u16 creg;
+ /* 0xffff for unimplemented or synthesized counters */
+ static const u16 xlator[] = {
+ [QIBPORTCNTR_PKTSEND] = cr_pktsend,
+ [QIBPORTCNTR_WORDSEND] = cr_wordsend,
+ [QIBPORTCNTR_PSXMITDATA] = 0xffff,
+ [QIBPORTCNTR_PSXMITPKTS] = 0xffff,
+ [QIBPORTCNTR_PSXMITWAIT] = 0xffff,
+ [QIBPORTCNTR_SENDSTALL] = cr_sendstall,
+ [QIBPORTCNTR_PKTRCV] = cr_pktrcv,
+ [QIBPORTCNTR_PSRCVDATA] = 0xffff,
+ [QIBPORTCNTR_PSRCVPKTS] = 0xffff,
+ [QIBPORTCNTR_RCVEBP] = cr_rcvebp,
+ [QIBPORTCNTR_RCVOVFL] = cr_rcvovfl,
+ [QIBPORTCNTR_WORDRCV] = cr_wordrcv,
+ [QIBPORTCNTR_RXDROPPKT] = cr_rxdroppkt,
+ [QIBPORTCNTR_RXLOCALPHYERR] = 0xffff,
+ [QIBPORTCNTR_RXVLERR] = 0xffff,
+ [QIBPORTCNTR_ERRICRC] = cr_erricrc,
+ [QIBPORTCNTR_ERRVCRC] = cr_errvcrc,
+ [QIBPORTCNTR_ERRLPCRC] = cr_errlpcrc,
+ [QIBPORTCNTR_BADFORMAT] = cr_badformat,
+ [QIBPORTCNTR_ERR_RLEN] = cr_err_rlen,
+ [QIBPORTCNTR_IBSYMBOLERR] = cr_ibsymbolerr,
+ [QIBPORTCNTR_INVALIDRLEN] = cr_invalidrlen,
+ [QIBPORTCNTR_UNSUPVL] = cr_txunsupvl,
+ [QIBPORTCNTR_EXCESSBUFOVFL] = 0xffff,
+ [QIBPORTCNTR_ERRLINK] = cr_errlink,
+ [QIBPORTCNTR_IBLINKDOWN] = cr_iblinkdown,
+ [QIBPORTCNTR_IBLINKERRRECOV] = cr_iblinkerrrecov,
+ [QIBPORTCNTR_LLI] = 0xffff,
+ [QIBPORTCNTR_PSINTERVAL] = 0xffff,
+ [QIBPORTCNTR_PSSTART] = 0xffff,
+ [QIBPORTCNTR_PSSTAT] = 0xffff,
+ [QIBPORTCNTR_VL15PKTDROP] = 0xffff,
+ [QIBPORTCNTR_ERRPKEY] = cr_errpkey,
+ [QIBPORTCNTR_KHDROVFL] = 0xffff,
+ };
+
+ if (reg >= ARRAY_SIZE(xlator)) {
+ qib_devinfo(ppd->dd->pcidev,
+ "Unimplemented portcounter %u\n", reg);
+ goto done;
+ }
+ creg = xlator[reg];
+
+ /* handle counter requests not implemented as chip counters */
+ if (reg == QIBPORTCNTR_LLI)
+ ret = dd->cspec->lli_errs;
+ else if (reg == QIBPORTCNTR_EXCESSBUFOVFL)
+ ret = dd->cspec->overrun_thresh_errs;
+ else if (reg == QIBPORTCNTR_KHDROVFL) {
+ int i;
+
+ /* sum over all kernel contexts */
+ for (i = 0; i < dd->first_user_ctxt; i++)
+ ret += read_6120_creg32(dd, cr_portovfl + i);
+ } else if (reg == QIBPORTCNTR_PSSTAT)
+ ret = dd->cspec->pma_sample_status;
+ if (creg == 0xffff)
+ goto done;
+
+ /*
+ * Only the fast-incrementing counters are 64 bit; use 32-bit reads
+ * for the rest to avoid two independent reads when on Opteron.
+ */
+ if (creg == cr_wordsend || creg == cr_wordrcv ||
+ creg == cr_pktsend || creg == cr_pktrcv)
+ ret = read_6120_creg(dd, creg);
+ else
+ ret = read_6120_creg32(dd, creg);
+ if (creg == cr_ibsymbolerr) {
+ if (dd->cspec->ibdeltainprog)
+ ret -= ret - dd->cspec->ibsymsnap;
+ ret -= dd->cspec->ibsymdelta;
+ } else if (creg == cr_iblinkerrrecov) {
+ if (dd->cspec->ibdeltainprog)
+ ret -= ret - dd->cspec->iblnkerrsnap;
+ ret -= dd->cspec->iblnkerrdelta;
+ }
+ if (reg == QIBPORTCNTR_RXDROPPKT) /* add special cased count */
+ ret += dd->cspec->rxfc_unsupvl_errs;
+
+done:
+ return ret;
+}
+
+/*
+ * Device counter names (not port-specific), one line per stat,
+ * single string. Used by utilities like ipathstats to print the stats
+ * in a way which works for different versions of drivers, without changing
+ * the utility. Names need to be 12 chars or less (w/o newline), for proper
+ * display by the utility.
+ * Non-error counters are first.
+ * The start of the "error" counters is indicated by a leading "E " on the
+ * first "error" counter, and doesn't count toward the label length.
+ * The EgrOvfl list needs to be last so we can truncate it at the configured
+ * context count for the device.
+ * cntr6120indices contains the corresponding register indices.
+ */
+static const char cntr6120names[] =
+ "Interrupts\n"
+ "HostBusStall\n"
+ "E RxTIDFull\n"
+ "RxTIDInvalid\n"
+ "Ctxt0EgrOvfl\n"
+ "Ctxt1EgrOvfl\n"
+ "Ctxt2EgrOvfl\n"
+ "Ctxt3EgrOvfl\n"
+ "Ctxt4EgrOvfl\n";
+
+static const size_t cntr6120indices[] = {
+ cr_lbint,
+ cr_lbflowstall,
+ cr_errtidfull,
+ cr_errtidvalid,
+ cr_portovfl + 0,
+ cr_portovfl + 1,
+ cr_portovfl + 2,
+ cr_portovfl + 3,
+ cr_portovfl + 4,
+};
+
+/*
+ * same as cntr6120names and cntr6120indices, but for port-specific counters.
+ * portcntr6120indices is somewhat complicated by some registers needing
+ * adjustments of various kinds, and those are ORed with _PORT_VIRT_FLAG
+ */
+static const char portcntr6120names[] =
+ "TxPkt\n"
+ "TxFlowPkt\n"
+ "TxWords\n"
+ "RxPkt\n"
+ "RxFlowPkt\n"
+ "RxWords\n"
+ "TxFlowStall\n"
+ "E IBStatusChng\n"
+ "IBLinkDown\n"
+ "IBLnkRecov\n"
+ "IBRxLinkErr\n"
+ "IBSymbolErr\n"
+ "RxLLIErr\n"
+ "RxBadFormat\n"
+ "RxBadLen\n"
+ "RxBufOvrfl\n"
+ "RxEBP\n"
+ "RxFlowCtlErr\n"
+ "RxICRCerr\n"
+ "RxLPCRCerr\n"
+ "RxVCRCerr\n"
+ "RxInvalLen\n"
+ "RxInvalPKey\n"
+ "RxPktDropped\n"
+ "TxBadLength\n"
+ "TxDropped\n"
+ "TxInvalLen\n"
+ "TxUnderrun\n"
+ "TxUnsupVL\n"
+ ;
+
+#define _PORT_VIRT_FLAG 0x8000 /* "virtual", need adjustments */
+static const size_t portcntr6120indices[] = {
+ QIBPORTCNTR_PKTSEND | _PORT_VIRT_FLAG,
+ cr_pktsendflow,
+ QIBPORTCNTR_WORDSEND | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_PKTRCV | _PORT_VIRT_FLAG,
+ cr_pktrcvflowctrl,
+ QIBPORTCNTR_WORDRCV | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_SENDSTALL | _PORT_VIRT_FLAG,
+ cr_ibstatuschange,
+ QIBPORTCNTR_IBLINKDOWN | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_IBLINKERRRECOV | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_ERRLINK | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_IBSYMBOLERR | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_LLI | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_BADFORMAT | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_ERR_RLEN | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_RCVOVFL | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_RCVEBP | _PORT_VIRT_FLAG,
+ cr_rcvflowctrl_err,
+ QIBPORTCNTR_ERRICRC | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_ERRLPCRC | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_ERRVCRC | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_INVALIDRLEN | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_ERRPKEY | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_RXDROPPKT | _PORT_VIRT_FLAG,
+ cr_invalidslen,
+ cr_senddropped,
+ cr_errslen,
+ cr_sendunderrun,
+ cr_txunsupvl,
+};
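+/*
+ * Entries above carrying _PORT_VIRT_FLAG are resolved through
+ * qib_portcntr_6120(), so they pick up the software adjustments
+ * (e.g. the ibsymdelta/iblnkerrdelta corrections); plain entries
+ * are read directly as chip counter registers. See
+ * qib_read_6120portcntrs() below.
+ */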
+
+/* do all the setup to make the counter reads efficient later */
+static void init_6120_cntrnames(struct qib_devdata *dd)
+{
+ int i, j = 0;
+ char *s;
+
+ for (i = 0, s = (char *)cntr6120names; s && j <= dd->cfgctxts;
+ i++) {
+ /* we always have at least one counter before the egrovfl */
+ if (!j && !strncmp("Ctxt0EgrOvfl", s + 1, 12))
+ j = 1;
+ s = strchr(s + 1, '\n');
+ if (s && j)
+ j++;
+ }
+ dd->cspec->ncntrs = i;
+ if (!s)
+ /* full list; size is without terminating null */
+ dd->cspec->cntrnamelen = sizeof(cntr6120names) - 1;
+ else
+ dd->cspec->cntrnamelen = 1 + s - cntr6120names;
+ dd->cspec->cntrs = kmalloc(dd->cspec->ncntrs
+ * sizeof(u64), GFP_KERNEL);
+ if (!dd->cspec->cntrs)
+ qib_dev_err(dd, "Failed allocation for counters\n");
+
+ for (i = 0, s = (char *)portcntr6120names; s; i++)
+ s = strchr(s + 1, '\n');
+ dd->cspec->nportcntrs = i - 1;
+ dd->cspec->portcntrnamelen = sizeof(portcntr6120names) - 1;
+ dd->cspec->portcntrs = kmalloc(dd->cspec->nportcntrs
+ * sizeof(u64), GFP_KERNEL);
+ if (!dd->cspec->portcntrs)
+ qib_dev_err(dd, "Failed allocation for portcounters\n");
+}
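+/*
+ * The two read routines below follow a simple protocol: a caller
+ * first asks for the name string (namep != NULL) and then for the
+ * values (cntrp), pairing line i of the names with slot i of the
+ * returned u64 array; a return of 0 means everything has been read.
+ */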
+
+static u32 qib_read_6120cntrs(struct qib_devdata *dd, loff_t pos, char **namep,
+ u64 **cntrp)
+{
+ u32 ret;
+
+ if (namep) {
+ ret = dd->cspec->cntrnamelen;
+ if (pos >= ret)
+ ret = 0; /* final read after getting everything */
+ else
+ *namep = (char *)cntr6120names;
+ } else {
+ u64 *cntr = dd->cspec->cntrs;
+ int i;
+
+ ret = dd->cspec->ncntrs * sizeof(u64);
+ if (!cntr || pos >= ret) {
+ /* everything read, or couldn't get memory */
+ ret = 0;
+ goto done;
+ }
+ if (pos >= ret) {
+ ret = 0; /* final read after getting everything */
+ goto done;
+ }
+ *cntrp = cntr;
+ for (i = 0; i < dd->cspec->ncntrs; i++)
+ *cntr++ = read_6120_creg32(dd, cntr6120indices[i]);
+ }
+done:
+ return ret;
+}
+
+static u32 qib_read_6120portcntrs(struct qib_devdata *dd, loff_t pos, u32 port,
+ char **namep, u64 **cntrp)
+{
+ u32 ret;
+
+ if (namep) {
+ ret = dd->cspec->portcntrnamelen;
+ if (pos >= ret)
+ ret = 0; /* final read after getting everything */
+ else
+ *namep = (char *)portcntr6120names;
+ } else {
+ u64 *cntr = dd->cspec->portcntrs;
+ struct qib_pportdata *ppd = &dd->pport[port];
+ int i;
+
+ ret = dd->cspec->nportcntrs * sizeof(u64);
+ if (!cntr || pos >= ret) {
+ /* everything read, or couldn't get memory */
+ ret = 0;
+ goto done;
+ }
+ *cntrp = cntr;
+ for (i = 0; i < dd->cspec->nportcntrs; i++) {
+ if (portcntr6120indices[i] & _PORT_VIRT_FLAG)
+ *cntr++ = qib_portcntr_6120(ppd,
+ portcntr6120indices[i] &
+ ~_PORT_VIRT_FLAG);
+ else
+ *cntr++ = read_6120_creg32(dd,
+ portcntr6120indices[i]);
+ }
+ }
+done:
+ return ret;
+}
+
+static void qib_chk_6120_errormask(struct qib_devdata *dd)
+{
+ static u32 fixed;
+ u32 ctrl;
+ unsigned long errormask;
+ unsigned long hwerrs;
+
+ if (!dd->cspec->errormask || !(dd->flags & QIB_INITTED))
+ return;
+
+ errormask = qib_read_kreg64(dd, kr_errmask);
+
+ if (errormask == dd->cspec->errormask)
+ return;
+ fixed++;
+
+ hwerrs = qib_read_kreg64(dd, kr_hwerrstatus);
+ ctrl = qib_read_kreg32(dd, kr_control);
+
+ qib_write_kreg(dd, kr_errmask,
+ dd->cspec->errormask);
+
+ if ((hwerrs & dd->cspec->hwerrmask) ||
+ (ctrl & QLOGIC_IB_C_FREEZEMODE)) {
+ qib_write_kreg(dd, kr_hwerrclear, 0ULL);
+ qib_write_kreg(dd, kr_errclear, 0ULL);
+ /* force re-interrupt of pending events, just in case */
+ qib_write_kreg(dd, kr_intclear, 0ULL);
+ qib_devinfo(dd->pcidev,
+ "errormask fixed(%u) %lx->%lx, ctrl %x hwerr %lx\n",
+ fixed, errormask, (unsigned long)dd->cspec->errormask,
+ ctrl, hwerrs);
+ }
+}
+
+/**
+ * qib_get_6120_faststats - get word counters from chip before they overflow
+ * @opaque: contains a pointer to the qlogic_ib device qib_devdata
+ *
+ * This needs more work; in particular, a decision on whether we really
+ * need traffic_wds done the way it is.
+ * Called from add_timer.
+ */
+static void qib_get_6120_faststats(unsigned long opaque)
+{
+ struct qib_devdata *dd = (struct qib_devdata *) opaque;
+ struct qib_pportdata *ppd = dd->pport;
+ unsigned long flags;
+ u64 traffic_wds;
+
+ /*
+ * don't access the chip while running diags, or memory diags can
+ * fail
+ */
+ if (!(dd->flags & QIB_INITTED) || dd->diag_client)
+ /* but re-arm the timer, for the diags case; won't hurt otherwise */
+ goto done;
+
+ /*
+ * We now try to maintain an activity timer, based on traffic
+ * exceeding a threshold, so we need to check the word-counts
+ * even if they are 64-bit.
+ */
+ traffic_wds = qib_portcntr_6120(ppd, cr_wordsend) +
+ qib_portcntr_6120(ppd, cr_wordrcv);
+ spin_lock_irqsave(&dd->eep_st_lock, flags);
+ traffic_wds -= dd->traffic_wds;
+ dd->traffic_wds += traffic_wds;
+ spin_unlock_irqrestore(&dd->eep_st_lock, flags);
+
+ qib_chk_6120_errormask(dd);
+done:
+ mod_timer(&dd->stats_timer, jiffies + HZ * ACTIVITY_TIMER);
+}
+
+/* no interrupt fallback for these chips */
+static int qib_6120_nointr_fallback(struct qib_devdata *dd)
+{
+ return 0;
+}
+
+/*
+ * reset the XGXS (between serdes and IBC). Slightly less intrusive
+ * than resetting the IBC or external link state, and useful in some
+ * cases to cause some retraining. To do this right, we reset IBC
+ * as well.
+ */
+static void qib_6120_xgxs_reset(struct qib_pportdata *ppd)
+{
+ u64 val, prev_val;
+ struct qib_devdata *dd = ppd->dd;
+
+ prev_val = qib_read_kreg64(dd, kr_xgxs_cfg);
+ val = prev_val | QLOGIC_IB_XGXS_RESET;
+ prev_val &= ~QLOGIC_IB_XGXS_RESET; /* be sure */
+ qib_write_kreg(dd, kr_control,
+ dd->control & ~QLOGIC_IB_C_LINKENABLE);
+ qib_write_kreg(dd, kr_xgxs_cfg, val);
+ qib_read_kreg32(dd, kr_scratch);
+ qib_write_kreg(dd, kr_xgxs_cfg, prev_val);
+ qib_write_kreg(dd, kr_control, dd->control);
+}
+
+static int qib_6120_get_ib_cfg(struct qib_pportdata *ppd, int which)
+{
+ int ret;
+
+ switch (which) {
+ case QIB_IB_CFG_LWID:
+ ret = ppd->link_width_active;
+ break;
+
+ case QIB_IB_CFG_SPD:
+ ret = ppd->link_speed_active;
+ break;
+
+ case QIB_IB_CFG_LWID_ENB:
+ ret = ppd->link_width_enabled;
+ break;
+
+ case QIB_IB_CFG_SPD_ENB:
+ ret = ppd->link_speed_enabled;
+ break;
+
+ case QIB_IB_CFG_OP_VLS:
+ ret = ppd->vls_operational;
+ break;
+
+ case QIB_IB_CFG_VL_HIGH_CAP:
+ ret = 0;
+ break;
+
+ case QIB_IB_CFG_VL_LOW_CAP:
+ ret = 0;
+ break;
+
+ case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
+ ret = SYM_FIELD(ppd->dd->cspec->ibcctrl, IBCCtrl,
+ OverrunThreshold);
+ break;
+
+ case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
+ ret = SYM_FIELD(ppd->dd->cspec->ibcctrl, IBCCtrl,
+ PhyerrThreshold);
+ break;
+
+ case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
+ /* will only take effect when the link state changes */
+ ret = (ppd->dd->cspec->ibcctrl &
+ SYM_MASK(IBCCtrl, LinkDownDefaultState)) ?
+ IB_LINKINITCMD_SLEEP : IB_LINKINITCMD_POLL;
+ break;
+
+ case QIB_IB_CFG_HRTBT: /* Get Heartbeat off/enable/auto */
+ ret = 0; /* no heartbeat on this chip */
+ break;
+
+ case QIB_IB_CFG_PMA_TICKS:
+ ret = 250; /* 1 usec. */
+ break;
+
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ return ret;
+}
+
+/*
+ * We assume range checking is already done, if needed.
+ */
+static int qib_6120_set_ib_cfg(struct qib_pportdata *ppd, int which, u32 val)
+{
+ struct qib_devdata *dd = ppd->dd;
+ int ret = 0;
+ u64 val64;
+ u16 lcmd, licmd;
+
+ switch (which) {
+ case QIB_IB_CFG_LWID_ENB:
+ ppd->link_width_enabled = val;
+ break;
+
+ case QIB_IB_CFG_SPD_ENB:
+ ppd->link_speed_enabled = val;
+ break;
+
+ case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
+ val64 = SYM_FIELD(dd->cspec->ibcctrl, IBCCtrl,
+ OverrunThreshold);
+ if (val64 != val) {
+ dd->cspec->ibcctrl &=
+ ~SYM_MASK(IBCCtrl, OverrunThreshold);
+ dd->cspec->ibcctrl |= (u64) val <<
+ SYM_LSB(IBCCtrl, OverrunThreshold);
+ qib_write_kreg(dd, kr_ibcctrl, dd->cspec->ibcctrl);
+ qib_write_kreg(dd, kr_scratch, 0);
+ }
+ break;
+
+ case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
+ val64 = SYM_FIELD(dd->cspec->ibcctrl, IBCCtrl,
+ PhyerrThreshold);
+ if (val64 != val) {
+ dd->cspec->ibcctrl &=
+ ~SYM_MASK(IBCCtrl, PhyerrThreshold);
+ dd->cspec->ibcctrl |= (u64) val <<
+ SYM_LSB(IBCCtrl, PhyerrThreshold);
+ qib_write_kreg(dd, kr_ibcctrl, dd->cspec->ibcctrl);
+ qib_write_kreg(dd, kr_scratch, 0);
+ }
+ break;
+
+ case QIB_IB_CFG_PKEYS: /* update pkeys */
+ val64 = (u64) ppd->pkeys[0] | ((u64) ppd->pkeys[1] << 16) |
+ ((u64) ppd->pkeys[2] << 32) |
+ ((u64) ppd->pkeys[3] << 48);
+ qib_write_kreg(dd, kr_partitionkey, val64);
+ break;
+
+ case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
+ /* will only take effect when the link state changes */
+ if (val == IB_LINKINITCMD_POLL)
+ dd->cspec->ibcctrl &=
+ ~SYM_MASK(IBCCtrl, LinkDownDefaultState);
+ else /* SLEEP */
+ dd->cspec->ibcctrl |=
+ SYM_MASK(IBCCtrl, LinkDownDefaultState);
+ qib_write_kreg(dd, kr_ibcctrl, dd->cspec->ibcctrl);
+ qib_write_kreg(dd, kr_scratch, 0);
+ break;
+
+ case QIB_IB_CFG_MTU: /* update the MTU in IBC */
+ /*
+ * Update our housekeeping variables, and set IBC max
+ * size, same as init code; max IBC is max we allow in
+ * buffer, less the qword pbc, plus 1 for ICRC, in dwords.
+ * Set even if it's unchanged; print debug message only
+ * on changes.
+ */
+ val = (ppd->ibmaxlen >> 2) + 1;
+ dd->cspec->ibcctrl &= ~SYM_MASK(IBCCtrl, MaxPktLen);
+ dd->cspec->ibcctrl |= (u64)val <<
+ SYM_LSB(IBCCtrl, MaxPktLen);
+ qib_write_kreg(dd, kr_ibcctrl, dd->cspec->ibcctrl);
+ qib_write_kreg(dd, kr_scratch, 0);
+ break;
+
+ case QIB_IB_CFG_LSTATE: /* set the IB link state */
+ switch (val & 0xffff0000) {
+ case IB_LINKCMD_DOWN:
+ lcmd = QLOGIC_IB_IBCC_LINKCMD_DOWN;
+ if (!dd->cspec->ibdeltainprog) {
+ dd->cspec->ibdeltainprog = 1;
+ dd->cspec->ibsymsnap =
+ read_6120_creg32(dd, cr_ibsymbolerr);
+ dd->cspec->iblnkerrsnap =
+ read_6120_creg32(dd, cr_iblinkerrrecov);
+ }
+ break;
+
+ case IB_LINKCMD_ARMED:
+ lcmd = QLOGIC_IB_IBCC_LINKCMD_ARMED;
+ break;
+
+ case IB_LINKCMD_ACTIVE:
+ lcmd = QLOGIC_IB_IBCC_LINKCMD_ACTIVE;
+ break;
+
+ default:
+ ret = -EINVAL;
+ qib_dev_err(dd, "bad linkcmd req 0x%x\n", val >> 16);
+ goto bail;
+ }
+ switch (val & 0xffff) {
+ case IB_LINKINITCMD_NOP:
+ licmd = 0;
+ break;
+
+ case IB_LINKINITCMD_POLL:
+ licmd = QLOGIC_IB_IBCC_LINKINITCMD_POLL;
+ break;
+
+ case IB_LINKINITCMD_SLEEP:
+ licmd = QLOGIC_IB_IBCC_LINKINITCMD_SLEEP;
+ break;
+
+ case IB_LINKINITCMD_DISABLE:
+ licmd = QLOGIC_IB_IBCC_LINKINITCMD_DISABLE;
+ break;
+
+ default:
+ ret = -EINVAL;
+ qib_dev_err(dd, "bad linkinitcmd req 0x%x\n",
+ val & 0xffff);
+ goto bail;
+ }
+ qib_set_ib_6120_lstate(ppd, lcmd, licmd);
+ goto bail;
+
+ case QIB_IB_CFG_HRTBT:
+ ret = -EINVAL;
+ break;
+
+ default:
+ ret = -EINVAL;
+ }
+bail:
+ return ret;
+}
+
+static int qib_6120_set_loopback(struct qib_pportdata *ppd, const char *what)
+{
+ int ret = 0;
+
+ if (!strncmp(what, "ibc", 3)) {
+ ppd->dd->cspec->ibcctrl |= SYM_MASK(IBCCtrl, Loopback);
+ qib_devinfo(ppd->dd->pcidev, "Enabling IB%u:%u IBC loopback\n",
+ ppd->dd->unit, ppd->port);
+ } else if (!strncmp(what, "off", 3)) {
+ ppd->dd->cspec->ibcctrl &= ~SYM_MASK(IBCCtrl, Loopback);
+ qib_devinfo(ppd->dd->pcidev,
+ "Disabling IB%u:%u IBC loopback (normal)\n",
+ ppd->dd->unit, ppd->port);
+ } else
+ ret = -EINVAL;
+ if (!ret) {
+ qib_write_kreg(ppd->dd, kr_ibcctrl, ppd->dd->cspec->ibcctrl);
+ qib_write_kreg(ppd->dd, kr_scratch, 0);
+ }
+ return ret;
+}
+
+static void pma_6120_timer(unsigned long data)
+{
+ struct qib_pportdata *ppd = (struct qib_pportdata *)data;
+ struct qib_chip_specific *cs = ppd->dd->cspec;
+ struct qib_ibport *ibp = &ppd->ibport_data;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ibp->lock, flags);
+ if (cs->pma_sample_status == IB_PMA_SAMPLE_STATUS_STARTED) {
+ cs->pma_sample_status = IB_PMA_SAMPLE_STATUS_RUNNING;
+ qib_snapshot_counters(ppd, &cs->sword, &cs->rword,
+ &cs->spkts, &cs->rpkts, &cs->xmit_wait);
+ mod_timer(&cs->pma_timer,
+ jiffies + usecs_to_jiffies(ibp->pma_sample_interval));
+ } else if (cs->pma_sample_status == IB_PMA_SAMPLE_STATUS_RUNNING) {
+ u64 ta, tb, tc, td, te;
+
+ cs->pma_sample_status = IB_PMA_SAMPLE_STATUS_DONE;
+ qib_snapshot_counters(ppd, &ta, &tb, &tc, &td, &te);
+
+ cs->sword = ta - cs->sword;
+ cs->rword = tb - cs->rword;
+ cs->spkts = tc - cs->spkts;
+ cs->rpkts = td - cs->rpkts;
+ cs->xmit_wait = te - cs->xmit_wait;
+ }
+ spin_unlock_irqrestore(&ibp->lock, flags);
+}
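+/*
+ * PMA sampling driven by the timer above is a small state machine:
+ * STARTED (waiting out the sample-start delay) -> RUNNING (baseline
+ * counters snapshotted) -> DONE (deltas for the sample interval left
+ * in sword/rword/spkts/rpkts/xmit_wait). qib_set_cntr_6120_sample()
+ * below kicks it off or forces it back to DONE.
+ */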
+
+/*
+ * Note that the caller has the ibp->lock held.
+ */
+static void qib_set_cntr_6120_sample(struct qib_pportdata *ppd, u32 intv,
+ u32 start)
+{
+ struct qib_chip_specific *cs = ppd->dd->cspec;
+
+ if (start && intv) {
+ cs->pma_sample_status = IB_PMA_SAMPLE_STATUS_STARTED;
+ mod_timer(&cs->pma_timer, jiffies + usecs_to_jiffies(start));
+ } else if (intv) {
+ cs->pma_sample_status = IB_PMA_SAMPLE_STATUS_RUNNING;
+ qib_snapshot_counters(ppd, &cs->sword, &cs->rword,
+ &cs->spkts, &cs->rpkts, &cs->xmit_wait);
+ mod_timer(&cs->pma_timer, jiffies + usecs_to_jiffies(intv));
+ } else {
+ cs->pma_sample_status = IB_PMA_SAMPLE_STATUS_DONE;
+ cs->sword = 0;
+ cs->rword = 0;
+ cs->spkts = 0;
+ cs->rpkts = 0;
+ cs->xmit_wait = 0;
+ }
+}
+
+static u32 qib_6120_iblink_state(u64 ibcs)
+{
+ u32 state = (u32)SYM_FIELD(ibcs, IBCStatus, LinkState);
+
+ switch (state) {
+ case IB_6120_L_STATE_INIT:
+ state = IB_PORT_INIT;
+ break;
+ case IB_6120_L_STATE_ARM:
+ state = IB_PORT_ARMED;
+ break;
+ case IB_6120_L_STATE_ACTIVE:
+ /* fall through */
+ case IB_6120_L_STATE_ACT_DEFER:
+ state = IB_PORT_ACTIVE;
+ break;
+ default: /* fall through */
+ case IB_6120_L_STATE_DOWN:
+ state = IB_PORT_DOWN;
+ break;
+ }
+ return state;
+}
+
+/* returns the IBTA port state, rather than the IBC link training state */
+static u8 qib_6120_phys_portstate(u64 ibcs)
+{
+ u8 state = (u8)SYM_FIELD(ibcs, IBCStatus, LinkTrainingState);
+ return qib_6120_physportstate[state];
+}
+
+static int qib_6120_ib_updown(struct qib_pportdata *ppd, int ibup, u64 ibcs)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags &= ~QIBL_IB_FORCE_NOTIFY;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+
+ if (ibup) {
+ if (ppd->dd->cspec->ibdeltainprog) {
+ ppd->dd->cspec->ibdeltainprog = 0;
+ ppd->dd->cspec->ibsymdelta +=
+ read_6120_creg32(ppd->dd, cr_ibsymbolerr) -
+ ppd->dd->cspec->ibsymsnap;
+ ppd->dd->cspec->iblnkerrdelta +=
+ read_6120_creg32(ppd->dd, cr_iblinkerrrecov) -
+ ppd->dd->cspec->iblnkerrsnap;
+ }
+ qib_hol_init(ppd);
+ } else {
+ ppd->dd->cspec->lli_counter = 0;
+ if (!ppd->dd->cspec->ibdeltainprog) {
+ ppd->dd->cspec->ibdeltainprog = 1;
+ ppd->dd->cspec->ibsymsnap =
+ read_6120_creg32(ppd->dd, cr_ibsymbolerr);
+ ppd->dd->cspec->iblnkerrsnap =
+ read_6120_creg32(ppd->dd, cr_iblinkerrrecov);
+ }
+ qib_hol_down(ppd);
+ }
+
+ qib_6120_setup_setextled(ppd, ibup);
+
+ return 0;
+}
+
+/* Does read/modify/write to appropriate registers to
+ * set output and direction bits selected by mask.
+ * These are in their canonical positions (e.g. lsb of
+ * dir will end up in D48 of extctrl on existing chips).
+ * Returns contents of GP Inputs.
+ */
+static int gpio_6120_mod(struct qib_devdata *dd, u32 out, u32 dir, u32 mask)
+{
+ u64 read_val, new_out;
+ unsigned long flags;
+
+ if (mask) {
+ /* some bits being written, lock access to GPIO */
+ dir &= mask;
+ out &= mask;
+ spin_lock_irqsave(&dd->cspec->gpio_lock, flags);
+ dd->cspec->extctrl &= ~((u64)mask << SYM_LSB(EXTCtrl, GPIOOe));
+ dd->cspec->extctrl |= ((u64) dir << SYM_LSB(EXTCtrl, GPIOOe));
+ new_out = (dd->cspec->gpio_out & ~mask) | out;
+
+ qib_write_kreg(dd, kr_extctrl, dd->cspec->extctrl);
+ qib_write_kreg(dd, kr_gpio_out, new_out);
+ dd->cspec->gpio_out = new_out;
+ spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags);
+ }
+ /*
+ * It is unlikely that a read at this time would get valid
+ * data on a pin whose direction line was set in the same
+ * call to this function. We include the read here because
+ * that allows us to potentially combine a change on one pin with
+ * a read on another, and because the old code did something like
+ * this.
+ */
+ read_val = qib_read_kreg64(dd, kr_extstatus);
+ return SYM_FIELD(read_val, EXTStatus, GPIOIn);
+}
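+/*
+ * With mask == 0 the register writes above are skipped entirely, so
+ * for example
+ *
+ * pins = gpio_6120_mod(dd, 0, 0, 0);
+ *
+ * is just a read of the current GPIO input pins.
+ */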
+
+/*
+ * Read fundamental info we need to use the chip. These are
+ * the registers that describe chip capabilities, and are
+ * saved in shadow registers.
+ */
+static void get_6120_chip_params(struct qib_devdata *dd)
+{
+ u64 val;
+ u32 piobufs;
+ int mtu;
+
+ dd->uregbase = qib_read_kreg32(dd, kr_userregbase);
+
+ dd->rcvtidcnt = qib_read_kreg32(dd, kr_rcvtidcnt);
+ dd->rcvtidbase = qib_read_kreg32(dd, kr_rcvtidbase);
+ dd->rcvegrbase = qib_read_kreg32(dd, kr_rcvegrbase);
+ dd->palign = qib_read_kreg32(dd, kr_palign);
+ dd->piobufbase = qib_read_kreg64(dd, kr_sendpiobufbase);
+ dd->pio2k_bufbase = dd->piobufbase & 0xffffffff;
+
+ dd->rcvhdrcnt = qib_read_kreg32(dd, kr_rcvegrcnt);
+
+ val = qib_read_kreg64(dd, kr_sendpiosize);
+ dd->piosize2k = val & ~0U;
+ dd->piosize4k = val >> 32;
+
+ mtu = ib_mtu_enum_to_int(qib_ibmtu);
+ if (mtu == -1)
+ mtu = QIB_DEFAULT_MTU;
+ dd->pport->ibmtu = (u32)mtu;
+
+ val = qib_read_kreg64(dd, kr_sendpiobufcnt);
+ dd->piobcnt2k = val & ~0U;
+ dd->piobcnt4k = val >> 32;
+ dd->last_pio = dd->piobcnt4k + dd->piobcnt2k - 1;
+ /* these may be adjusted in init_chip_wc_pat() */
+ dd->pio2kbase = (u32 __iomem *)
+ (((char __iomem *)dd->kregbase) + dd->pio2k_bufbase);
+ if (dd->piobcnt4k) {
+ dd->pio4kbase = (u32 __iomem *)
+ (((char __iomem *) dd->kregbase) +
+ (dd->piobufbase >> 32));
+ /*
+ * 4K buffers take 2 pages; we use roundup just to be
+ * paranoid; we calculate it once here, rather than on
+ * every buf allocate
+ */
+ dd->align4k = ALIGN(dd->piosize4k, dd->palign);
+ }
+
+ piobufs = dd->piobcnt4k + dd->piobcnt2k;
+
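+ /*
+ * Each 64-bit pioavail shadow register covers 32 send buffers
+ * (2 bits per buffer), so round the buffer count up to a
+ * multiple of 32 and divide to get the register count.
+ */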
+ dd->pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2) /
+ (sizeof(u64) * BITS_PER_BYTE / 2);
+}
+
+/*
+ * The chip base addresses in cspec and cpspec have to be set
+ * after possible init_chip_wc_pat(), rather than in
+ * get_6120_chip_params(), so this is split out as a separate function.
+ */
+static void set_6120_baseaddrs(struct qib_devdata *dd)
+{
+ u32 cregbase;
+
+ cregbase = qib_read_kreg32(dd, kr_counterregbase);
+ dd->cspec->cregbase = (u64 __iomem *)
+ ((char __iomem *) dd->kregbase + cregbase);
+
+ dd->egrtidbase = (u64 __iomem *)
+ ((char __iomem *) dd->kregbase + dd->rcvegrbase);
+}
+
+/*
+ * Write the final few registers that depend on some of the
+ * init setup. Done late in init, just before bringing up
+ * the serdes.
+ */
+static int qib_late_6120_initreg(struct qib_devdata *dd)
+{
+ int ret = 0;
+ u64 val;
+
+ qib_write_kreg(dd, kr_rcvhdrentsize, dd->rcvhdrentsize);
+ qib_write_kreg(dd, kr_rcvhdrsize, dd->rcvhdrsize);
+ qib_write_kreg(dd, kr_rcvhdrcnt, dd->rcvhdrcnt);
+ qib_write_kreg(dd, kr_sendpioavailaddr, dd->pioavailregs_phys);
+ val = qib_read_kreg64(dd, kr_sendpioavailaddr);
+ if (val != dd->pioavailregs_phys) {
+ qib_dev_err(dd,
+ "Catastrophic software error, SendPIOAvailAddr written as %lx, read back as %llx\n",
+ (unsigned long) dd->pioavailregs_phys,
+ (unsigned long long) val);
+ ret = -EINVAL;
+ }
+ return ret;
+}
+
+static int init_6120_variables(struct qib_devdata *dd)
+{
+ int ret = 0;
+ struct qib_pportdata *ppd;
+ u32 sbufs;
+
+ ppd = (struct qib_pportdata *)(dd + 1);
+ dd->pport = ppd;
+ dd->num_pports = 1;
+
+ dd->cspec = (struct qib_chip_specific *)(ppd + dd->num_pports);
+ ppd->cpspec = NULL; /* not used in this chip */
+
+ spin_lock_init(&dd->cspec->kernel_tid_lock);
+ spin_lock_init(&dd->cspec->user_tid_lock);
+ spin_lock_init(&dd->cspec->rcvmod_lock);
+ spin_lock_init(&dd->cspec->gpio_lock);
+
+ /* we haven't yet set QIB_PRESENT, so use read directly */
+ dd->revision = readq(&dd->kregbase[kr_revision]);
+
+ if ((dd->revision & 0xffffffffU) == 0xffffffffU) {
+ qib_dev_err(dd,
+ "Revision register read failure, giving up initialization\n");
+ ret = -ENODEV;
+ goto bail;
+ }
+ dd->flags |= QIB_PRESENT; /* now register routines work */
+
+ dd->majrev = (u8) SYM_FIELD(dd->revision, Revision_R,
+ ChipRevMajor);
+ dd->minrev = (u8) SYM_FIELD(dd->revision, Revision_R,
+ ChipRevMinor);
+
+ get_6120_chip_params(dd);
+ pe_boardname(dd); /* fill in boardname */
+
+ /*
+ * GPIO bits for TWSI data and clock,
+ * used for serial EEPROM.
+ */
+ dd->gpio_sda_num = _QIB_GPIO_SDA_NUM;
+ dd->gpio_scl_num = _QIB_GPIO_SCL_NUM;
+ dd->twsi_eeprom_dev = QIB_TWSI_NO_DEV;
+
+ if (qib_unordered_wc())
+ dd->flags |= QIB_PIO_FLUSH_WC;
+
+ /*
+ * EEPROM error log 0 is TXE Parity errors. 1 is RXE Parity.
+ * 2 is Some Misc, 3 is reserved for future.
+ */
+ dd->eep_st_masks[0].hwerrs_to_log = HWE_MASK(TXEMemParityErr);
+
+ /* Ignore errors in PIO/PBC on systems with unordered write-combining */
+ if (qib_unordered_wc())
+ dd->eep_st_masks[0].hwerrs_to_log &= ~TXE_PIO_PARITY;
+
+ dd->eep_st_masks[1].hwerrs_to_log = HWE_MASK(RXEMemParityErr);
+
+ dd->eep_st_masks[2].errs_to_log = ERR_MASK(ResetNegated);
+
+ ret = qib_init_pportdata(ppd, dd, 0, 1);
+ if (ret)
+ goto bail;
+ ppd->link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X;
+ ppd->link_speed_supported = QIB_IB_SDR;
+ ppd->link_width_enabled = IB_WIDTH_4X;
+ ppd->link_speed_enabled = ppd->link_speed_supported;
+ /* these can't change for this chip, so set once */
+ ppd->link_width_active = ppd->link_width_enabled;
+ ppd->link_speed_active = ppd->link_speed_enabled;
+ ppd->vls_supported = IB_VL_VL0;
+ ppd->vls_operational = ppd->vls_supported;
+
+ dd->rcvhdrentsize = QIB_RCVHDR_ENTSIZE;
+ dd->rcvhdrsize = QIB_DFLT_RCVHDRSIZE;
+ dd->rhf_offset = 0;
+
+ /* we always allocate at least 2048 bytes for eager buffers */
+ ret = ib_mtu_enum_to_int(qib_ibmtu);
+ dd->rcvegrbufsize = ret != -1 ? max(ret, 2048) : QIB_DEFAULT_MTU;
+ BUG_ON(!is_power_of_2(dd->rcvegrbufsize));
+ dd->rcvegrbufsize_shift = ilog2(dd->rcvegrbufsize);
+
+ qib_6120_tidtemplate(dd);
+
+ /*
+ * We can request a receive interrupt for 1 or
+ * more packets from current offset. For now, we set this
+ * up for a single packet.
+ */
+ dd->rhdrhead_intr_off = 1ULL << 32;
+
+ /* setup the stats timer; the add_timer is done at end of init */
+ init_timer(&dd->stats_timer);
+ dd->stats_timer.function = qib_get_6120_faststats;
+ dd->stats_timer.data = (unsigned long) dd;
+
+ init_timer(&dd->cspec->pma_timer);
+ dd->cspec->pma_timer.function = pma_6120_timer;
+ dd->cspec->pma_timer.data = (unsigned long) ppd;
+
+ dd->ureg_align = qib_read_kreg32(dd, kr_palign);
+
+ dd->piosize2kmax_dwords = dd->piosize2k >> 2;
+ qib_6120_config_ctxts(dd);
+ qib_set_ctxtcnt(dd);
+
+ ret = init_chip_wc_pat(dd, 0);
+ if (ret)
+ goto bail;
+ set_6120_baseaddrs(dd); /* set chip access pointers now */
+
+ ret = 0;
+ if (qib_mini_init)
+ goto bail;
+
+ qib_num_cfg_vls = 1; /* if any 6120's, only one VL */
+
+ ret = qib_create_ctxts(dd);
+ init_6120_cntrnames(dd);
+
+ /* use all of 4KB buffers for the kernel, otherwise 16 */
+ sbufs = dd->piobcnt4k ? dd->piobcnt4k : 16;
+
+ dd->lastctxt_piobuf = dd->piobcnt2k + dd->piobcnt4k - sbufs;
+ dd->pbufsctxt = dd->lastctxt_piobuf /
+ (dd->cfgctxts - dd->first_user_ctxt);
+
+ if (ret)
+ goto bail;
+bail:
+ return ret;
+}
+
+/*
+ * For this chip, we want to use the same buffer every time
+ * when we are trying to bring the link up (they are always VL15
+ * packets). At that link state the packet should always go out immediately
+ * (or at least be discarded at the tx interface if the link is down).
+ * If it doesn't, and the buffer isn't available, that means some other
+ * sender has gotten ahead of us, and is preventing our packet from going
+ * out. In that case, we flush all packets, and try again. If that still
+ * fails, we fail the request, and hope things work the next time around.
+ *
+ * We don't need very complicated heuristics on whether the packet had
+ * time to go out or not, since even at SDR 1X, it goes out in very short
+ * time periods, covered by the chip reads done here and as part of the
+ * flush.
+ */
+static u32 __iomem *get_6120_link_buf(struct qib_pportdata *ppd, u32 *bnum)
+{
+ u32 __iomem *buf;
+ u32 lbuf = ppd->dd->piobcnt2k + ppd->dd->piobcnt4k - 1;
+
+ /*
+ * always blip to get avail list updated, since it's almost
+ * always needed, and is fairly cheap.
+ */
+ sendctrl_6120_mod(ppd->dd->pport, QIB_SENDCTRL_AVAIL_BLIP);
+ qib_read_kreg64(ppd->dd, kr_scratch); /* extra chip flush */
+ buf = qib_getsendbuf_range(ppd->dd, bnum, lbuf, lbuf);
+ if (buf)
+ goto done;
+
+ sendctrl_6120_mod(ppd, QIB_SENDCTRL_DISARM_ALL | QIB_SENDCTRL_FLUSH |
+ QIB_SENDCTRL_AVAIL_BLIP);
+ ppd->dd->upd_pio_shadow = 1; /* update our idea of what's busy */
+ qib_read_kreg64(ppd->dd, kr_scratch); /* extra chip flush */
+ buf = qib_getsendbuf_range(ppd->dd, bnum, lbuf, lbuf);
+done:
+ return buf;
+}
+
+static u32 __iomem *qib_6120_getsendbuf(struct qib_pportdata *ppd, u64 pbc,
+ u32 *pbufnum)
+{
+ u32 first, last, plen = pbc & QIB_PBC_LENGTH_MASK;
+ struct qib_devdata *dd = ppd->dd;
+ u32 __iomem *buf;
+
+ if (((pbc >> 32) & PBC_6120_VL15_SEND_CTRL) &&
+ !(ppd->lflags & (QIBL_IB_AUTONEG_INPROG | QIBL_LINKACTIVE)))
+ buf = get_6120_link_buf(ppd, pbufnum);
+ else {
+
+ if ((plen + 1) > dd->piosize2kmax_dwords)
+ first = dd->piobcnt2k;
+ else
+ first = 0;
+ /* try 4k if all 2k busy, so same last for both sizes */
+ last = dd->piobcnt2k + dd->piobcnt4k - 1;
+ buf = qib_getsendbuf_range(dd, pbufnum, first, last);
+ }
+ return buf;
+}
+
+static int init_sdma_6120_regs(struct qib_pportdata *ppd)
+{
+ return -ENODEV;
+}
+
+static u16 qib_sdma_6120_gethead(struct qib_pportdata *ppd)
+{
+ return 0;
+}
+
+static int qib_sdma_6120_busy(struct qib_pportdata *ppd)
+{
+ return 0;
+}
+
+static void qib_sdma_update_6120_tail(struct qib_pportdata *ppd, u16 tail)
+{
+}
+
+static void qib_6120_sdma_sendctrl(struct qib_pportdata *ppd, unsigned op)
+{
+}
+
+static void qib_sdma_set_6120_desc_cnt(struct qib_pportdata *ppd, unsigned cnt)
+{
+}
+
+/*
+ * the pbc doesn't need a VL15 indicator, but we need it for link_buf.
+ * The chip ignores the bit if set.
+ */
+static u32 qib_6120_setpbc_control(struct qib_pportdata *ppd, u32 plen,
+ u8 srate, u8 vl)
+{
+ return vl == 15 ? PBC_6120_VL15_SEND_CTRL : 0;
+}
+
+static void qib_6120_initvl15_bufs(struct qib_devdata *dd)
+{
+}
+
+static void qib_6120_init_ctxt(struct qib_ctxtdata *rcd)
+{
+ rcd->rcvegrcnt = rcd->dd->rcvhdrcnt;
+ rcd->rcvegr_tid_base = rcd->ctxt * rcd->rcvegrcnt;
+}
+
+static void qib_6120_txchk_change(struct qib_devdata *dd, u32 start,
+ u32 len, u32 avail, struct qib_ctxtdata *rcd)
+{
+}
+
+static void writescratch(struct qib_devdata *dd, u32 val)
+{
+ (void) qib_write_kreg(dd, kr_scratch, val);
+}
+
+static int qib_6120_tempsense_rd(struct qib_devdata *dd, int regnum)
+{
+ return -ENXIO;
+}
+
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+static int qib_6120_notify_dca(struct qib_devdata *dd, unsigned long event)
+{
+ return 0;
+}
+#endif
+
+/* Dummy function, as 6120 boards never disable EEPROM Write */
+static int qib_6120_eeprom_wen(struct qib_devdata *dd, int wen)
+{
+ return 1;
+}
+
+/**
+ * qib_init_iba6120_funcs - set up the chip-specific function pointers
+ * @pdev: pci_dev of the qlogic_ib device
+ * @ent: pci_device_id matching this chip
+ *
+ * This is global, and is called directly at init to set up the
+ * chip-specific function pointers for later use.
+ *
+ * It also allocates/partially-inits the qib_devdata struct for
+ * this device.
+ */
+struct qib_devdata *qib_init_iba6120_funcs(struct pci_dev *pdev,
+ const struct pci_device_id *ent)
+{
+ struct qib_devdata *dd;
+ int ret;
+
+ dd = qib_alloc_devdata(pdev, sizeof(struct qib_pportdata) +
+ sizeof(struct qib_chip_specific));
+ if (IS_ERR(dd))
+ goto bail;
+
+ dd->f_bringup_serdes = qib_6120_bringup_serdes;
+ dd->f_cleanup = qib_6120_setup_cleanup;
+ dd->f_clear_tids = qib_6120_clear_tids;
+ dd->f_free_irq = qib_6120_free_irq;
+ dd->f_get_base_info = qib_6120_get_base_info;
+ dd->f_get_msgheader = qib_6120_get_msgheader;
+ dd->f_getsendbuf = qib_6120_getsendbuf;
+ dd->f_gpio_mod = gpio_6120_mod;
+ dd->f_eeprom_wen = qib_6120_eeprom_wen;
+ dd->f_hdrqempty = qib_6120_hdrqempty;
+ dd->f_ib_updown = qib_6120_ib_updown;
+ dd->f_init_ctxt = qib_6120_init_ctxt;
+ dd->f_initvl15_bufs = qib_6120_initvl15_bufs;
+ dd->f_intr_fallback = qib_6120_nointr_fallback;
+ dd->f_late_initreg = qib_late_6120_initreg;
+ dd->f_setpbc_control = qib_6120_setpbc_control;
+ dd->f_portcntr = qib_portcntr_6120;
+ dd->f_put_tid = (dd->minrev >= 2) ?
+ qib_6120_put_tid_2 :
+ qib_6120_put_tid;
+ dd->f_quiet_serdes = qib_6120_quiet_serdes;
+ dd->f_rcvctrl = rcvctrl_6120_mod;
+ dd->f_read_cntrs = qib_read_6120cntrs;
+ dd->f_read_portcntrs = qib_read_6120portcntrs;
+ dd->f_reset = qib_6120_setup_reset;
+ dd->f_init_sdma_regs = init_sdma_6120_regs;
+ dd->f_sdma_busy = qib_sdma_6120_busy;
+ dd->f_sdma_gethead = qib_sdma_6120_gethead;
+ dd->f_sdma_sendctrl = qib_6120_sdma_sendctrl;
+ dd->f_sdma_set_desc_cnt = qib_sdma_set_6120_desc_cnt;
+ dd->f_sdma_update_tail = qib_sdma_update_6120_tail;
+ dd->f_sendctrl = sendctrl_6120_mod;
+ dd->f_set_armlaunch = qib_set_6120_armlaunch;
+ dd->f_set_cntr_sample = qib_set_cntr_6120_sample;
+ dd->f_iblink_state = qib_6120_iblink_state;
+ dd->f_ibphys_portstate = qib_6120_phys_portstate;
+ dd->f_get_ib_cfg = qib_6120_get_ib_cfg;
+ dd->f_set_ib_cfg = qib_6120_set_ib_cfg;
+ dd->f_set_ib_loopback = qib_6120_set_loopback;
+ dd->f_set_intr_state = qib_6120_set_intr_state;
+ dd->f_setextled = qib_6120_setup_setextled;
+ dd->f_txchk_change = qib_6120_txchk_change;
+ dd->f_update_usrhead = qib_update_6120_usrhead;
+ dd->f_wantpiobuf_intr = qib_wantpiobuf_6120_intr;
+ dd->f_xgxs_reset = qib_6120_xgxs_reset;
+ dd->f_writescratch = writescratch;
+ dd->f_tempsense_rd = qib_6120_tempsense_rd;
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+ dd->f_notify_dca = qib_6120_notify_dca;
+#endif
+ /*
+ * Do remaining pcie setup and save pcie values in dd.
+ * Any error printing is already done by the init code.
+ * On return, we have the chip mapped and accessible,
+ * but chip registers are not set up until start of
+ * init_6120_variables.
+ */
+ ret = qib_pcie_ddinit(dd, pdev, ent);
+ if (ret < 0)
+ goto bail_free;
+
+ /* initialize chip-specific variables */
+ ret = init_6120_variables(dd);
+ if (ret)
+ goto bail_cleanup;
+
+ if (qib_mini_init)
+ goto bail;
+
+ if (qib_pcie_params(dd, 8, NULL, NULL))
+ qib_dev_err(dd,
+ "Failed to setup PCIe or interrupts; continuing anyway\n");
+ dd->cspec->irq = pdev->irq; /* save IRQ */
+
+ /* clear diagctrl register, in case diags were running and crashed */
+ qib_write_kreg(dd, kr_hwdiagctrl, 0);
+
+ if (qib_read_kreg64(dd, kr_hwerrstatus) &
+ QLOGIC_IB_HWE_SERDESPLLFAILED)
+ qib_write_kreg(dd, kr_hwerrclear,
+ QLOGIC_IB_HWE_SERDESPLLFAILED);
+
+ /* setup interrupt handler (interrupt type handled above) */
+ qib_setup_6120_interrupt(dd);
+ /* Note that qpn_mask is set by qib_6120_config_ctxts() first */
+ qib_6120_init_hwerrors(dd);
+
+ goto bail;
+
+bail_cleanup:
+ qib_pcie_ddcleanup(dd);
+bail_free:
+ qib_free_devdata(dd);
+ dd = ERR_PTR(ret);
+bail:
+ return dd;
+}
diff --git a/drivers/infiniband/hw/qib/qib_iba7220.c b/drivers/infiniband/hw/qib/qib_iba7220.c
new file mode 100644
index 000000000..00b2af211
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_iba7220.c
@@ -0,0 +1,4657 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation.
+ * All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * This file contains all of the code that is specific to the
+ * QLogic_IB 7220 chip (except that specific to the SerDes)
+ */
+
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/io.h>
+#include <rdma/ib_verbs.h>
+
+#include "qib.h"
+#include "qib_7220.h"
+
+static void qib_setup_7220_setextled(struct qib_pportdata *, u32);
+static void qib_7220_handle_hwerrors(struct qib_devdata *, char *, size_t);
+static void sendctrl_7220_mod(struct qib_pportdata *ppd, u32 op);
+static u32 qib_7220_iblink_state(u64);
+static u8 qib_7220_phys_portstate(u64);
+static void qib_sdma_update_7220_tail(struct qib_pportdata *, u16);
+static void qib_set_ib_7220_lstate(struct qib_pportdata *, u16, u16);
+
+/*
+ * This file contains almost all the chip-specific register information and
+ * access functions for the QLogic QLogic_IB 7220 PCI-Express chip, with the
+ * exception of SerDes support, which is in qib_sd7220.c.
+ */
+
+/* Below uses machine-generated qib_chipnum_regs.h file */
+#define KREG_IDX(regname) (QIB_7220_##regname##_OFFS / sizeof(u64))
+
+/* Use defines to tie machine-generated names to lower-case names */
+#define kr_control KREG_IDX(Control)
+#define kr_counterregbase KREG_IDX(CntrRegBase)
+#define kr_errclear KREG_IDX(ErrClear)
+#define kr_errmask KREG_IDX(ErrMask)
+#define kr_errstatus KREG_IDX(ErrStatus)
+#define kr_extctrl KREG_IDX(EXTCtrl)
+#define kr_extstatus KREG_IDX(EXTStatus)
+#define kr_gpio_clear KREG_IDX(GPIOClear)
+#define kr_gpio_mask KREG_IDX(GPIOMask)
+#define kr_gpio_out KREG_IDX(GPIOOut)
+#define kr_gpio_status KREG_IDX(GPIOStatus)
+#define kr_hrtbt_guid KREG_IDX(HRTBT_GUID)
+#define kr_hwdiagctrl KREG_IDX(HwDiagCtrl)
+#define kr_hwerrclear KREG_IDX(HwErrClear)
+#define kr_hwerrmask KREG_IDX(HwErrMask)
+#define kr_hwerrstatus KREG_IDX(HwErrStatus)
+#define kr_ibcctrl KREG_IDX(IBCCtrl)
+#define kr_ibcddrctrl KREG_IDX(IBCDDRCtrl)
+#define kr_ibcddrstatus KREG_IDX(IBCDDRStatus)
+#define kr_ibcstatus KREG_IDX(IBCStatus)
+#define kr_ibserdesctrl KREG_IDX(IBSerDesCtrl)
+#define kr_intclear KREG_IDX(IntClear)
+#define kr_intmask KREG_IDX(IntMask)
+#define kr_intstatus KREG_IDX(IntStatus)
+#define kr_ncmodectrl KREG_IDX(IBNCModeCtrl)
+#define kr_palign KREG_IDX(PageAlign)
+#define kr_partitionkey KREG_IDX(RcvPartitionKey)
+#define kr_portcnt KREG_IDX(PortCnt)
+#define kr_rcvbthqp KREG_IDX(RcvBTHQP)
+#define kr_rcvctrl KREG_IDX(RcvCtrl)
+#define kr_rcvegrbase KREG_IDX(RcvEgrBase)
+#define kr_rcvegrcnt KREG_IDX(RcvEgrCnt)
+#define kr_rcvhdrcnt KREG_IDX(RcvHdrCnt)
+#define kr_rcvhdrentsize KREG_IDX(RcvHdrEntSize)
+#define kr_rcvhdrsize KREG_IDX(RcvHdrSize)
+#define kr_rcvpktledcnt KREG_IDX(RcvPktLEDCnt)
+#define kr_rcvtidbase KREG_IDX(RcvTIDBase)
+#define kr_rcvtidcnt KREG_IDX(RcvTIDCnt)
+#define kr_revision KREG_IDX(Revision)
+#define kr_scratch KREG_IDX(Scratch)
+#define kr_sendbuffererror KREG_IDX(SendBufErr0)
+#define kr_sendctrl KREG_IDX(SendCtrl)
+#define kr_senddmabase KREG_IDX(SendDmaBase)
+#define kr_senddmabufmask0 KREG_IDX(SendDmaBufMask0)
+#define kr_senddmabufmask1 (KREG_IDX(SendDmaBufMask0) + 1)
+#define kr_senddmabufmask2 (KREG_IDX(SendDmaBufMask0) + 2)
+#define kr_senddmahead KREG_IDX(SendDmaHead)
+#define kr_senddmaheadaddr KREG_IDX(SendDmaHeadAddr)
+#define kr_senddmalengen KREG_IDX(SendDmaLenGen)
+#define kr_senddmastatus KREG_IDX(SendDmaStatus)
+#define kr_senddmatail KREG_IDX(SendDmaTail)
+#define kr_sendpioavailaddr KREG_IDX(SendBufAvailAddr)
+#define kr_sendpiobufbase KREG_IDX(SendBufBase)
+#define kr_sendpiobufcnt KREG_IDX(SendBufCnt)
+#define kr_sendpiosize KREG_IDX(SendBufSize)
+#define kr_sendregbase KREG_IDX(SendRegBase)
+#define kr_userregbase KREG_IDX(UserRegBase)
+#define kr_xgxs_cfg KREG_IDX(XGXSCfg)
+
+/* These must only be written via qib_write_kreg_ctxt() */
+#define kr_rcvhdraddr KREG_IDX(RcvHdrAddr0)
+#define kr_rcvhdrtailaddr KREG_IDX(RcvHdrTailAddr0)
+
+
+#define CREG_IDX(regname) ((QIB_7220_##regname##_OFFS - \
+ QIB_7220_LBIntCnt_OFFS) / sizeof(u64))
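+/*
+ * Illustrative note: LBIntCnt is the base of the counter block, so
+ * CREG_IDX(LBIntCnt) evaluates to 0 and cr_lbint below indexes the first
+ * 64-bit counter; all other cr_* values are offsets relative to it.
+ */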
+
+#define cr_badformat CREG_IDX(RxVersionErrCnt)
+#define cr_erricrc CREG_IDX(RxICRCErrCnt)
+#define cr_errlink CREG_IDX(RxLinkMalformCnt)
+#define cr_errlpcrc CREG_IDX(RxLPCRCErrCnt)
+#define cr_errpkey CREG_IDX(RxPKeyMismatchCnt)
+#define cr_rcvflowctrl_err CREG_IDX(RxFlowCtrlViolCnt)
+#define cr_err_rlen CREG_IDX(RxLenErrCnt)
+#define cr_errslen CREG_IDX(TxLenErrCnt)
+#define cr_errtidfull CREG_IDX(RxTIDFullErrCnt)
+#define cr_errtidvalid CREG_IDX(RxTIDValidErrCnt)
+#define cr_errvcrc CREG_IDX(RxVCRCErrCnt)
+#define cr_ibstatuschange CREG_IDX(IBStatusChangeCnt)
+#define cr_lbint CREG_IDX(LBIntCnt)
+#define cr_invalidrlen CREG_IDX(RxMaxMinLenErrCnt)
+#define cr_invalidslen CREG_IDX(TxMaxMinLenErrCnt)
+#define cr_lbflowstall CREG_IDX(LBFlowStallCnt)
+#define cr_pktrcv CREG_IDX(RxDataPktCnt)
+#define cr_pktrcvflowctrl CREG_IDX(RxFlowPktCnt)
+#define cr_pktsend CREG_IDX(TxDataPktCnt)
+#define cr_pktsendflow CREG_IDX(TxFlowPktCnt)
+#define cr_portovfl CREG_IDX(RxP0HdrEgrOvflCnt)
+#define cr_rcvebp CREG_IDX(RxEBPCnt)
+#define cr_rcvovfl CREG_IDX(RxBufOvflCnt)
+#define cr_senddropped CREG_IDX(TxDroppedPktCnt)
+#define cr_sendstall CREG_IDX(TxFlowStallCnt)
+#define cr_sendunderrun CREG_IDX(TxUnderrunCnt)
+#define cr_wordrcv CREG_IDX(RxDwordCnt)
+#define cr_wordsend CREG_IDX(TxDwordCnt)
+#define cr_txunsupvl CREG_IDX(TxUnsupVLErrCnt)
+#define cr_rxdroppkt CREG_IDX(RxDroppedPktCnt)
+#define cr_iblinkerrrecov CREG_IDX(IBLinkErrRecoveryCnt)
+#define cr_iblinkdown CREG_IDX(IBLinkDownedCnt)
+#define cr_ibsymbolerr CREG_IDX(IBSymbolErrCnt)
+#define cr_vl15droppedpkt CREG_IDX(RxVL15DroppedPktCnt)
+#define cr_rxotherlocalphyerr CREG_IDX(RxOtherLocalPhyErrCnt)
+#define cr_excessbufferovfl CREG_IDX(ExcessBufferOvflCnt)
+#define cr_locallinkintegrityerr CREG_IDX(LocalLinkIntegrityErrCnt)
+#define cr_rxvlerr CREG_IDX(RxVlErrCnt)
+#define cr_rxdlidfltr CREG_IDX(RxDlidFltrCnt)
+#define cr_psstat CREG_IDX(PSStat)
+#define cr_psstart CREG_IDX(PSStart)
+#define cr_psinterval CREG_IDX(PSInterval)
+#define cr_psrcvdatacount CREG_IDX(PSRcvDataCount)
+#define cr_psrcvpktscount CREG_IDX(PSRcvPktsCount)
+#define cr_psxmitdatacount CREG_IDX(PSXmitDataCount)
+#define cr_psxmitpktscount CREG_IDX(PSXmitPktsCount)
+#define cr_psxmitwaitcount CREG_IDX(PSXmitWaitCount)
+#define cr_txsdmadesc CREG_IDX(TxSDmaDescCnt)
+#define cr_pcieretrydiag CREG_IDX(PcieRetryBufDiagQwordCnt)
+
+#define SYM_RMASK(regname, fldname) ((u64) \
+ QIB_7220_##regname##_##fldname##_RMASK)
+#define SYM_MASK(regname, fldname) ((u64) \
+ QIB_7220_##regname##_##fldname##_RMASK << \
+ QIB_7220_##regname##_##fldname##_LSB)
+#define SYM_LSB(regname, fldname) (QIB_7220_##regname##_##fldname##_LSB)
+#define SYM_FIELD(value, regname, fldname) ((u64) \
+ (((value) >> SYM_LSB(regname, fldname)) & \
+ SYM_RMASK(regname, fldname)))
+#define ERR_MASK(fldname) SYM_MASK(ErrMask, fldname##Mask)
+#define HWE_MASK(fldname) SYM_MASK(HwErrMask, fldname##Mask)
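+/*
+ * Illustrative expansions only: ERR_MASK(RcvEBPErr) is
+ * SYM_MASK(ErrMask, RcvEBPErrMask), the field's RMASK shifted to its LSB
+ * position, and SYM_FIELD(ibcs, IBCStatus, LinkTrainingState) extracts
+ * the link training state field from an IBCStatus value.
+ */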
+
+/* ibcctrl bits */
+#define QLOGIC_IB_IBCC_LINKINITCMD_DISABLE 1
+/* cycle through TS1/TS2 till OK */
+#define QLOGIC_IB_IBCC_LINKINITCMD_POLL 2
+/* wait for TS1, then go on */
+#define QLOGIC_IB_IBCC_LINKINITCMD_SLEEP 3
+#define QLOGIC_IB_IBCC_LINKINITCMD_SHIFT 16
+
+#define QLOGIC_IB_IBCC_LINKCMD_DOWN 1 /* move to 0x11 */
+#define QLOGIC_IB_IBCC_LINKCMD_ARMED 2 /* move to 0x21 */
+#define QLOGIC_IB_IBCC_LINKCMD_ACTIVE 3 /* move to 0x31 */
+
+#define BLOB_7220_IBCHG 0x81
+
+/*
+ * We could have a single register get/put routine, that takes a group type,
+ * but this is somewhat clearer and cleaner. It also gives us some error
+ * checking. 64-bit register reads should always work, but are inefficient
+ * on Opteron (the northbridge always generates 2 separate HT 32-bit reads),
+ * so we use kreg32 wherever possible. User register and counter register
+ * reads are always 32-bit reads, so there is only one form of those routines.
+ */
+
+/**
+ * qib_read_ureg32 - read 32-bit virtualized per-context register
+ * @dd: device
+ * @regno: register number
+ * @ctxt: context number
+ *
+ * Return the contents of a register that is virtualized to be per context.
+ * Returns 0 on errors (not distinguishable from valid contents at
+ * runtime; we may add a separate error variable at some point).
+ */
+static inline u32 qib_read_ureg32(const struct qib_devdata *dd,
+ enum qib_ureg regno, int ctxt)
+{
+ if (!dd->kregbase || !(dd->flags & QIB_PRESENT))
+ return 0;
+
+ if (dd->userbase)
+ return readl(regno + (u64 __iomem *)
+ ((char __iomem *)dd->userbase +
+ dd->ureg_align * ctxt));
+ else
+ return readl(regno + (u64 __iomem *)
+ (dd->uregbase +
+ (char __iomem *)dd->kregbase +
+ dd->ureg_align * ctxt));
+}
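+/*
+ * Note (explanatory only): in both branches above the register address is
+ * the per-context user register base + dd->ureg_align * ctxt, plus regno
+ * 64-bit words; only the base differs, depending on whether the user
+ * registers have their own mapping (dd->userbase) or live inside the
+ * kernel register mapping (dd->kregbase + dd->uregbase).
+ */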
+
+/**
+ * qib_write_ureg - write 32-bit virtualized per-context register
+ * @dd: device
+ * @regno: register number
+ * @value: value
+ * @ctxt: context
+ *
+ * Write the contents of a register that is virtualized to be per context.
+ */
+static inline void qib_write_ureg(const struct qib_devdata *dd,
+ enum qib_ureg regno, u64 value, int ctxt)
+{
+ u64 __iomem *ubase;
+
+ if (dd->userbase)
+ ubase = (u64 __iomem *)
+ ((char __iomem *) dd->userbase +
+ dd->ureg_align * ctxt);
+ else
+ ubase = (u64 __iomem *)
+ (dd->uregbase +
+ (char __iomem *) dd->kregbase +
+ dd->ureg_align * ctxt);
+
+ if (dd->kregbase && (dd->flags & QIB_PRESENT))
+ writeq(value, &ubase[regno]);
+}
+
+/**
+ * qib_write_kreg_ctxt - write a device's per-ctxt 64-bit kernel register
+ * @dd: the qlogic_ib device
+ * @regno: the register number to write
+ * @ctxt: the context containing the register
+ * @value: the value to write
+ */
+static inline void qib_write_kreg_ctxt(const struct qib_devdata *dd,
+ const u16 regno, unsigned ctxt,
+ u64 value)
+{
+ qib_write_kreg(dd, regno + ctxt, value);
+}
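+/*
+ * Illustrative use (addr is a placeholder for the DMA address being
+ * programmed): qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr, ctxt, addr)
+ * writes RcvHdrTailAddr0 + ctxt, since the per-context registers are
+ * laid out consecutively and regno + ctxt selects the one for this
+ * context.
+ */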
+
+static inline void write_7220_creg(const struct qib_devdata *dd,
+ u16 regno, u64 value)
+{
+ if (dd->cspec->cregbase && (dd->flags & QIB_PRESENT))
+ writeq(value, &dd->cspec->cregbase[regno]);
+}
+
+static inline u64 read_7220_creg(const struct qib_devdata *dd, u16 regno)
+{
+ if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT))
+ return 0;
+ return readq(&dd->cspec->cregbase[regno]);
+}
+
+static inline u32 read_7220_creg32(const struct qib_devdata *dd, u16 regno)
+{
+ if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT))
+ return 0;
+ return readl(&dd->cspec->cregbase[regno]);
+}
+
+/* kr_revision bits */
+#define QLOGIC_IB_R_EMULATORREV_MASK ((1ULL << 22) - 1)
+#define QLOGIC_IB_R_EMULATORREV_SHIFT 40
+
+/* kr_control bits */
+#define QLOGIC_IB_C_RESET (1U << 7)
+
+/* kr_intstatus, kr_intclear, kr_intmask bits */
+#define QLOGIC_IB_I_RCVURG_MASK ((1ULL << 17) - 1)
+#define QLOGIC_IB_I_RCVURG_SHIFT 32
+#define QLOGIC_IB_I_RCVAVAIL_MASK ((1ULL << 17) - 1)
+#define QLOGIC_IB_I_RCVAVAIL_SHIFT 0
+#define QLOGIC_IB_I_SERDESTRIMDONE (1ULL << 27)
+
+#define QLOGIC_IB_C_FREEZEMODE 0x00000002
+#define QLOGIC_IB_C_LINKENABLE 0x00000004
+
+#define QLOGIC_IB_I_SDMAINT 0x8000000000000000ULL
+#define QLOGIC_IB_I_SDMADISABLED 0x4000000000000000ULL
+#define QLOGIC_IB_I_ERROR 0x0000000080000000ULL
+#define QLOGIC_IB_I_SPIOSENT 0x0000000040000000ULL
+#define QLOGIC_IB_I_SPIOBUFAVAIL 0x0000000020000000ULL
+#define QLOGIC_IB_I_GPIO 0x0000000010000000ULL
+
+/* variables for sanity checking interrupt and errors */
+#define QLOGIC_IB_I_BITSEXTANT \
+ (QLOGIC_IB_I_SDMAINT | QLOGIC_IB_I_SDMADISABLED | \
+ (QLOGIC_IB_I_RCVURG_MASK << QLOGIC_IB_I_RCVURG_SHIFT) | \
+ (QLOGIC_IB_I_RCVAVAIL_MASK << \
+ QLOGIC_IB_I_RCVAVAIL_SHIFT) | \
+ QLOGIC_IB_I_ERROR | QLOGIC_IB_I_SPIOSENT | \
+ QLOGIC_IB_I_SPIOBUFAVAIL | QLOGIC_IB_I_GPIO | \
+ QLOGIC_IB_I_SERDESTRIMDONE)
+
+#define IB_HWE_BITSEXTANT \
+ (HWE_MASK(RXEMemParityErr) | \
+ HWE_MASK(TXEMemParityErr) | \
+ (QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK << \
+ QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT) | \
+ QLOGIC_IB_HWE_PCIE1PLLFAILED | \
+ QLOGIC_IB_HWE_PCIE0PLLFAILED | \
+ QLOGIC_IB_HWE_PCIEPOISONEDTLP | \
+ QLOGIC_IB_HWE_PCIECPLTIMEOUT | \
+ QLOGIC_IB_HWE_PCIEBUSPARITYXTLH | \
+ QLOGIC_IB_HWE_PCIEBUSPARITYXADM | \
+ QLOGIC_IB_HWE_PCIEBUSPARITYRADM | \
+ HWE_MASK(PowerOnBISTFailed) | \
+ QLOGIC_IB_HWE_COREPLL_FBSLIP | \
+ QLOGIC_IB_HWE_COREPLL_RFSLIP | \
+ QLOGIC_IB_HWE_SERDESPLLFAILED | \
+ HWE_MASK(IBCBusToSPCParityErr) | \
+ HWE_MASK(IBCBusFromSPCParityErr) | \
+ QLOGIC_IB_HWE_PCIECPLDATAQUEUEERR | \
+ QLOGIC_IB_HWE_PCIECPLHDRQUEUEERR | \
+ QLOGIC_IB_HWE_SDMAMEMREADERR | \
+ QLOGIC_IB_HWE_CLK_UC_PLLNOTLOCKED | \
+ QLOGIC_IB_HWE_PCIESERDESQ0PCLKNOTDETECT | \
+ QLOGIC_IB_HWE_PCIESERDESQ1PCLKNOTDETECT | \
+ QLOGIC_IB_HWE_PCIESERDESQ2PCLKNOTDETECT | \
+ QLOGIC_IB_HWE_PCIESERDESQ3PCLKNOTDETECT | \
+ QLOGIC_IB_HWE_DDSRXEQMEMORYPARITYERR | \
+ QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR | \
+ QLOGIC_IB_HWE_PCIE_UC_OCT0MEMORYPARITYERR | \
+ QLOGIC_IB_HWE_PCIE_UC_OCT1MEMORYPARITYERR)
+
+#define IB_E_BITSEXTANT \
+ (ERR_MASK(RcvFormatErr) | ERR_MASK(RcvVCRCErr) | \
+ ERR_MASK(RcvICRCErr) | ERR_MASK(RcvMinPktLenErr) | \
+ ERR_MASK(RcvMaxPktLenErr) | ERR_MASK(RcvLongPktLenErr) | \
+ ERR_MASK(RcvShortPktLenErr) | ERR_MASK(RcvUnexpectedCharErr) | \
+ ERR_MASK(RcvUnsupportedVLErr) | ERR_MASK(RcvEBPErr) | \
+ ERR_MASK(RcvIBFlowErr) | ERR_MASK(RcvBadVersionErr) | \
+ ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr) | \
+ ERR_MASK(RcvBadTidErr) | ERR_MASK(RcvHdrLenErr) | \
+ ERR_MASK(RcvHdrErr) | ERR_MASK(RcvIBLostLinkErr) | \
+ ERR_MASK(SendSpecialTriggerErr) | \
+ ERR_MASK(SDmaDisabledErr) | ERR_MASK(SendMinPktLenErr) | \
+ ERR_MASK(SendMaxPktLenErr) | ERR_MASK(SendUnderRunErr) | \
+ ERR_MASK(SendPktLenErr) | ERR_MASK(SendDroppedSmpPktErr) | \
+ ERR_MASK(SendDroppedDataPktErr) | \
+ ERR_MASK(SendPioArmLaunchErr) | \
+ ERR_MASK(SendUnexpectedPktNumErr) | \
+ ERR_MASK(SendUnsupportedVLErr) | ERR_MASK(SendBufMisuseErr) | \
+ ERR_MASK(SDmaGenMismatchErr) | ERR_MASK(SDmaOutOfBoundErr) | \
+ ERR_MASK(SDmaTailOutOfBoundErr) | ERR_MASK(SDmaBaseErr) | \
+ ERR_MASK(SDma1stDescErr) | ERR_MASK(SDmaRpyTagErr) | \
+ ERR_MASK(SDmaDwEnErr) | ERR_MASK(SDmaMissingDwErr) | \
+ ERR_MASK(SDmaUnexpDataErr) | \
+ ERR_MASK(IBStatusChanged) | ERR_MASK(InvalidAddrErr) | \
+ ERR_MASK(ResetNegated) | ERR_MASK(HardwareErr) | \
+ ERR_MASK(SDmaDescAddrMisalignErr) | \
+ ERR_MASK(InvalidEEPCmd))
+
+/* kr_hwerrclear, kr_hwerrmask, kr_hwerrstatus, bits */
+#define QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK 0x00000000000000ffULL
+#define QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT 0
+#define QLOGIC_IB_HWE_PCIEPOISONEDTLP 0x0000000010000000ULL
+#define QLOGIC_IB_HWE_PCIECPLTIMEOUT 0x0000000020000000ULL
+#define QLOGIC_IB_HWE_PCIEBUSPARITYXTLH 0x0000000040000000ULL
+#define QLOGIC_IB_HWE_PCIEBUSPARITYXADM 0x0000000080000000ULL
+#define QLOGIC_IB_HWE_PCIEBUSPARITYRADM 0x0000000100000000ULL
+#define QLOGIC_IB_HWE_COREPLL_FBSLIP 0x0080000000000000ULL
+#define QLOGIC_IB_HWE_COREPLL_RFSLIP 0x0100000000000000ULL
+#define QLOGIC_IB_HWE_PCIE1PLLFAILED 0x0400000000000000ULL
+#define QLOGIC_IB_HWE_PCIE0PLLFAILED 0x0800000000000000ULL
+#define QLOGIC_IB_HWE_SERDESPLLFAILED 0x1000000000000000ULL
+/* specific to this chip */
+#define QLOGIC_IB_HWE_PCIECPLDATAQUEUEERR 0x0000000000000040ULL
+#define QLOGIC_IB_HWE_PCIECPLHDRQUEUEERR 0x0000000000000080ULL
+#define QLOGIC_IB_HWE_SDMAMEMREADERR 0x0000000010000000ULL
+#define QLOGIC_IB_HWE_CLK_UC_PLLNOTLOCKED 0x2000000000000000ULL
+#define QLOGIC_IB_HWE_PCIESERDESQ0PCLKNOTDETECT 0x0100000000000000ULL
+#define QLOGIC_IB_HWE_PCIESERDESQ1PCLKNOTDETECT 0x0200000000000000ULL
+#define QLOGIC_IB_HWE_PCIESERDESQ2PCLKNOTDETECT 0x0400000000000000ULL
+#define QLOGIC_IB_HWE_PCIESERDESQ3PCLKNOTDETECT 0x0800000000000000ULL
+#define QLOGIC_IB_HWE_DDSRXEQMEMORYPARITYERR 0x0000008000000000ULL
+#define QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR 0x0000004000000000ULL
+#define QLOGIC_IB_HWE_PCIE_UC_OCT0MEMORYPARITYERR 0x0000001000000000ULL
+#define QLOGIC_IB_HWE_PCIE_UC_OCT1MEMORYPARITYERR 0x0000002000000000ULL
+
+#define IBA7220_IBCC_LINKCMD_SHIFT 19
+
+/* kr_ibcddrctrl bits */
+#define IBA7220_IBC_DLIDLMC_MASK 0xFFFFFFFFUL
+#define IBA7220_IBC_DLIDLMC_SHIFT 32
+
+#define IBA7220_IBC_HRTBT_MASK (SYM_RMASK(IBCDDRCtrl, HRTBT_AUTO) | \
+ SYM_RMASK(IBCDDRCtrl, HRTBT_ENB))
+#define IBA7220_IBC_HRTBT_SHIFT SYM_LSB(IBCDDRCtrl, HRTBT_ENB)
+
+#define IBA7220_IBC_LANE_REV_SUPPORTED (1<<8)
+#define IBA7220_IBC_LREV_MASK 1
+#define IBA7220_IBC_LREV_SHIFT 8
+#define IBA7220_IBC_RXPOL_MASK 1
+#define IBA7220_IBC_RXPOL_SHIFT 7
+#define IBA7220_IBC_WIDTH_SHIFT 5
+#define IBA7220_IBC_WIDTH_MASK 0x3
+#define IBA7220_IBC_WIDTH_1X_ONLY (0 << IBA7220_IBC_WIDTH_SHIFT)
+#define IBA7220_IBC_WIDTH_4X_ONLY (1 << IBA7220_IBC_WIDTH_SHIFT)
+#define IBA7220_IBC_WIDTH_AUTONEG (2 << IBA7220_IBC_WIDTH_SHIFT)
+#define IBA7220_IBC_SPEED_AUTONEG (1 << 1)
+#define IBA7220_IBC_SPEED_SDR (1 << 2)
+#define IBA7220_IBC_SPEED_DDR (1 << 3)
+#define IBA7220_IBC_SPEED_AUTONEG_MASK (0x7 << 1)
+#define IBA7220_IBC_IBTA_1_2_MASK (1)
+
+/* kr_ibcddrstatus */
+/* link latency shift is 0, don't bother defining */
+#define IBA7220_DDRSTAT_LINKLAT_MASK 0x3ffffff
+
+/* kr_extstatus bits */
+#define QLOGIC_IB_EXTS_FREQSEL 0x2
+#define QLOGIC_IB_EXTS_SERDESSEL 0x4
+#define QLOGIC_IB_EXTS_MEMBIST_ENDTEST 0x0000000000004000
+#define QLOGIC_IB_EXTS_MEMBIST_DISABLED 0x0000000000008000
+
+/* kr_xgxsconfig bits */
+#define QLOGIC_IB_XGXS_RESET 0x5ULL
+#define QLOGIC_IB_XGXS_FC_SAFE (1ULL << 63)
+
+/* kr_rcvpktledcnt */
+#define IBA7220_LEDBLINK_ON_SHIFT 32 /* 4ns period on after packet */
+#define IBA7220_LEDBLINK_OFF_SHIFT 0 /* 4ns period off before next on */
+
+#define _QIB_GPIO_SDA_NUM 1
+#define _QIB_GPIO_SCL_NUM 0
+#define QIB_TWSI_EEPROM_DEV 0xA2 /* All Production 7220 cards. */
+#define QIB_TWSI_TEMP_DEV 0x98
+
+/* HW counter clock is at 4nsec */
+#define QIB_7220_PSXMITWAIT_CHECK_RATE 4000
+
+#define IBA7220_R_INTRAVAIL_SHIFT 17
+#define IBA7220_R_PKEY_DIS_SHIFT 34
+#define IBA7220_R_TAILUPD_SHIFT 35
+#define IBA7220_R_CTXTCFG_SHIFT 36
+
+#define IBA7220_HDRHEAD_PKTINT_SHIFT 32 /* interrupt cnt in upper 32 bits */
+
+/*
+ * The size bits give us 2^N, in KB units. A value of 0 marks the entry
+ * as invalid, and 7 is reserved. We currently use only 2KB and 4KB.
+ */
+#define IBA7220_TID_SZ_SHIFT 37 /* shift to 3bit size selector */
+#define IBA7220_TID_SZ_2K (1UL << IBA7220_TID_SZ_SHIFT) /* 2KB */
+#define IBA7220_TID_SZ_4K (2UL << IBA7220_TID_SZ_SHIFT) /* 4KB */
+#define IBA7220_TID_PA_SHIFT 11U /* TID addr in chip stored w/o low bits */
+#define PBC_7220_VL15_SEND (1ULL << 63) /* pbc; VL15, no credit check */
+#define PBC_7220_VL15_SEND_CTRL (1ULL << 31) /* control version of same */
+
+#define AUTONEG_TRIES 5 /* sequential retries to negotiate DDR */
+
+/* packet rate matching delay multiplier */
+static u8 rate_to_delay[2][2] = {
+ /* 1x, 4x */
+ { 8, 2 }, /* SDR */
+ { 4, 1 } /* DDR */
+};
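+/*
+ * Illustrative indexing: the first index selects speed (0 = SDR, 1 = DDR)
+ * and the second selects width (0 = 1X, 1 = 4X), so rate_to_delay[0][0]
+ * is 8 for SDR/1X and rate_to_delay[1][1] is 1 for DDR/4X.
+ */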
+
+static u8 ib_rate_to_delay[IB_RATE_120_GBPS + 1] = {
+ [IB_RATE_2_5_GBPS] = 8,
+ [IB_RATE_5_GBPS] = 4,
+ [IB_RATE_10_GBPS] = 2,
+ [IB_RATE_20_GBPS] = 1
+};
+
+#define IBA7220_LINKSPEED_SHIFT SYM_LSB(IBCStatus, LinkSpeedActive)
+#define IBA7220_LINKWIDTH_SHIFT SYM_LSB(IBCStatus, LinkWidthActive)
+
+/* link training states, from IBC */
+#define IB_7220_LT_STATE_DISABLED 0x00
+#define IB_7220_LT_STATE_LINKUP 0x01
+#define IB_7220_LT_STATE_POLLACTIVE 0x02
+#define IB_7220_LT_STATE_POLLQUIET 0x03
+#define IB_7220_LT_STATE_SLEEPDELAY 0x04
+#define IB_7220_LT_STATE_SLEEPQUIET 0x05
+#define IB_7220_LT_STATE_CFGDEBOUNCE 0x08
+#define IB_7220_LT_STATE_CFGRCVFCFG 0x09
+#define IB_7220_LT_STATE_CFGWAITRMT 0x0a
+#define IB_7220_LT_STATE_CFGIDLE 0x0b
+#define IB_7220_LT_STATE_RECOVERRETRAIN 0x0c
+#define IB_7220_LT_STATE_RECOVERWAITRMT 0x0e
+#define IB_7220_LT_STATE_RECOVERIDLE 0x0f
+
+/* link state machine states from IBC */
+#define IB_7220_L_STATE_DOWN 0x0
+#define IB_7220_L_STATE_INIT 0x1
+#define IB_7220_L_STATE_ARM 0x2
+#define IB_7220_L_STATE_ACTIVE 0x3
+#define IB_7220_L_STATE_ACT_DEFER 0x4
+
+static const u8 qib_7220_physportstate[0x20] = {
+ [IB_7220_LT_STATE_DISABLED] = IB_PHYSPORTSTATE_DISABLED,
+ [IB_7220_LT_STATE_LINKUP] = IB_PHYSPORTSTATE_LINKUP,
+ [IB_7220_LT_STATE_POLLACTIVE] = IB_PHYSPORTSTATE_POLL,
+ [IB_7220_LT_STATE_POLLQUIET] = IB_PHYSPORTSTATE_POLL,
+ [IB_7220_LT_STATE_SLEEPDELAY] = IB_PHYSPORTSTATE_SLEEP,
+ [IB_7220_LT_STATE_SLEEPQUIET] = IB_PHYSPORTSTATE_SLEEP,
+ [IB_7220_LT_STATE_CFGDEBOUNCE] =
+ IB_PHYSPORTSTATE_CFG_TRAIN,
+ [IB_7220_LT_STATE_CFGRCVFCFG] =
+ IB_PHYSPORTSTATE_CFG_TRAIN,
+ [IB_7220_LT_STATE_CFGWAITRMT] =
+ IB_PHYSPORTSTATE_CFG_TRAIN,
+ [IB_7220_LT_STATE_CFGIDLE] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [IB_7220_LT_STATE_RECOVERRETRAIN] =
+ IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+ [IB_7220_LT_STATE_RECOVERWAITRMT] =
+ IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+ [IB_7220_LT_STATE_RECOVERIDLE] =
+ IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+ [0x10] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x11] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x12] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x13] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x14] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x15] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x16] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x17] = IB_PHYSPORTSTATE_CFG_TRAIN
+};
+
+int qib_special_trigger;
+module_param_named(special_trigger, qib_special_trigger, int, S_IRUGO);
+MODULE_PARM_DESC(special_trigger, "Enable SpecialTrigger arm/launch");
+
+#define IBCBUSFRSPCPARITYERR HWE_MASK(IBCBusFromSPCParityErr)
+#define IBCBUSTOSPCPARITYERR HWE_MASK(IBCBusToSPCParityErr)
+
+#define SYM_MASK_BIT(regname, fldname, bit) ((u64) \
+ (1ULL << (SYM_LSB(regname, fldname) + (bit))))
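+/*
+ * Illustrative expansion only: SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 0)
+ * is 1ULL << SYM_LSB(HwErrMask, TXEMemParityErrMask), i.e. bit 0 of the
+ * TXEMemParityErr field, used for the TXEMEMPARITYERR_* masks below.
+ */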
+
+#define TXEMEMPARITYERR_PIOBUF \
+ SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 0)
+#define TXEMEMPARITYERR_PIOPBC \
+ SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 1)
+#define TXEMEMPARITYERR_PIOLAUNCHFIFO \
+ SYM_MASK_BIT(HwErrMask, TXEMemParityErrMask, 2)
+
+#define RXEMEMPARITYERR_RCVBUF \
+ SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 0)
+#define RXEMEMPARITYERR_LOOKUPQ \
+ SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 1)
+#define RXEMEMPARITYERR_EXPTID \
+ SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 2)
+#define RXEMEMPARITYERR_EAGERTID \
+ SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 3)
+#define RXEMEMPARITYERR_FLAGBUF \
+ SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 4)
+#define RXEMEMPARITYERR_DATAINFO \
+ SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 5)
+#define RXEMEMPARITYERR_HDRINFO \
+ SYM_MASK_BIT(HwErrMask, RXEMemParityErrMask, 6)
+
+/* 7220 specific hardware errors... */
+static const struct qib_hwerror_msgs qib_7220_hwerror_msgs[] = {
+ /* generic hardware errors */
+ QLOGIC_IB_HWE_MSG(IBCBUSFRSPCPARITYERR, "QIB2IB Parity"),
+ QLOGIC_IB_HWE_MSG(IBCBUSTOSPCPARITYERR, "IB2QIB Parity"),
+
+ QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOBUF,
+ "TXE PIOBUF Memory Parity"),
+ QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOPBC,
+ "TXE PIOPBC Memory Parity"),
+ QLOGIC_IB_HWE_MSG(TXEMEMPARITYERR_PIOLAUNCHFIFO,
+ "TXE PIOLAUNCHFIFO Memory Parity"),
+
+ QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_RCVBUF,
+ "RXE RCVBUF Memory Parity"),
+ QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_LOOKUPQ,
+ "RXE LOOKUPQ Memory Parity"),
+ QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_EAGERTID,
+ "RXE EAGERTID Memory Parity"),
+ QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_EXPTID,
+ "RXE EXPTID Memory Parity"),
+ QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_FLAGBUF,
+ "RXE FLAGBUF Memory Parity"),
+ QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_DATAINFO,
+ "RXE DATAINFO Memory Parity"),
+ QLOGIC_IB_HWE_MSG(RXEMEMPARITYERR_HDRINFO,
+ "RXE HDRINFO Memory Parity"),
+
+ /* chip-specific hardware errors */
+ QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEPOISONEDTLP,
+ "PCIe Poisoned TLP"),
+ QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIECPLTIMEOUT,
+ "PCIe completion timeout"),
+ /*
+ * In practice, it's unlikely that we'll see PCIe PLL, or bus
+ * parity or memory parity error failures, because most likely we
+ * won't be able to talk to the core of the chip. Nonetheless, we
+ * might see them, if they are in parts of the PCIe core that aren't
+ * essential.
+ */
+ QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE1PLLFAILED,
+ "PCIePLL1"),
+ QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE0PLLFAILED,
+ "PCIePLL0"),
+ QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYXTLH,
+ "PCIe XTLH core parity"),
+ QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYXADM,
+ "PCIe ADM TX core parity"),
+ QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIEBUSPARITYRADM,
+ "PCIe ADM RX core parity"),
+ QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_SERDESPLLFAILED,
+ "SerDes PLL"),
+ QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIECPLDATAQUEUEERR,
+ "PCIe cpl data queue"),
+ QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIECPLHDRQUEUEERR,
+ "PCIe cpl header queue"),
+ QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_SDMAMEMREADERR,
+ "Send DMA memory read"),
+ QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_CLK_UC_PLLNOTLOCKED,
+ "uC PLL clock not locked"),
+ QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIESERDESQ0PCLKNOTDETECT,
+ "PCIe serdes Q0 no clock"),
+ QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIESERDESQ1PCLKNOTDETECT,
+ "PCIe serdes Q1 no clock"),
+ QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIESERDESQ2PCLKNOTDETECT,
+ "PCIe serdes Q2 no clock"),
+ QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIESERDESQ3PCLKNOTDETECT,
+ "PCIe serdes Q3 no clock"),
+ QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_DDSRXEQMEMORYPARITYERR,
+ "DDS RXEQ memory parity"),
+ QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR,
+ "IB uC memory parity"),
+ QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE_UC_OCT0MEMORYPARITYERR,
+ "PCIe uC oct0 memory parity"),
+ QLOGIC_IB_HWE_MSG(QLOGIC_IB_HWE_PCIE_UC_OCT1MEMORYPARITYERR,
+ "PCIe uC oct1 memory parity"),
+};
+
+#define RXE_PARITY (RXEMEMPARITYERR_EAGERTID|RXEMEMPARITYERR_EXPTID)
+
+#define QLOGIC_IB_E_PKTERRS (\
+ ERR_MASK(SendPktLenErr) | \
+ ERR_MASK(SendDroppedDataPktErr) | \
+ ERR_MASK(RcvVCRCErr) | \
+ ERR_MASK(RcvICRCErr) | \
+ ERR_MASK(RcvShortPktLenErr) | \
+ ERR_MASK(RcvEBPErr))
+
+/* Convenience for decoding Send DMA errors */
+#define QLOGIC_IB_E_SDMAERRS ( \
+ ERR_MASK(SDmaGenMismatchErr) | \
+ ERR_MASK(SDmaOutOfBoundErr) | \
+ ERR_MASK(SDmaTailOutOfBoundErr) | ERR_MASK(SDmaBaseErr) | \
+ ERR_MASK(SDma1stDescErr) | ERR_MASK(SDmaRpyTagErr) | \
+ ERR_MASK(SDmaDwEnErr) | ERR_MASK(SDmaMissingDwErr) | \
+ ERR_MASK(SDmaUnexpDataErr) | \
+ ERR_MASK(SDmaDescAddrMisalignErr) | \
+ ERR_MASK(SDmaDisabledErr) | \
+ ERR_MASK(SendBufMisuseErr))
+
+/* These are all rcv-related errors which we want to count for stats */
+#define E_SUM_PKTERRS \
+ (ERR_MASK(RcvHdrLenErr) | ERR_MASK(RcvBadTidErr) | \
+ ERR_MASK(RcvBadVersionErr) | ERR_MASK(RcvHdrErr) | \
+ ERR_MASK(RcvLongPktLenErr) | ERR_MASK(RcvShortPktLenErr) | \
+ ERR_MASK(RcvMaxPktLenErr) | ERR_MASK(RcvMinPktLenErr) | \
+ ERR_MASK(RcvFormatErr) | ERR_MASK(RcvUnsupportedVLErr) | \
+ ERR_MASK(RcvUnexpectedCharErr) | ERR_MASK(RcvEBPErr))
+
+/* These are all send-related errors which we want to count for stats */
+#define E_SUM_ERRS \
+ (ERR_MASK(SendPioArmLaunchErr) | ERR_MASK(SendUnexpectedPktNumErr) | \
+ ERR_MASK(SendDroppedDataPktErr) | ERR_MASK(SendDroppedSmpPktErr) | \
+ ERR_MASK(SendMaxPktLenErr) | ERR_MASK(SendUnsupportedVLErr) | \
+ ERR_MASK(SendMinPktLenErr) | ERR_MASK(SendPktLenErr) | \
+ ERR_MASK(InvalidAddrErr))
+
+/*
+ * This is similar to E_SUM_ERRS, but we can't ignore armlaunch errors or
+ * errors not related to freeze and cancelling buffers. We can't ignore
+ * armlaunch because we could get more while still cleaning up, and we
+ * need to cancel those as they happen.
+ */
+#define E_SPKT_ERRS_IGNORE \
+ (ERR_MASK(SendDroppedDataPktErr) | ERR_MASK(SendDroppedSmpPktErr) | \
+ ERR_MASK(SendMaxPktLenErr) | ERR_MASK(SendMinPktLenErr) | \
+ ERR_MASK(SendPktLenErr))
+
+/*
+ * These are errors that can occur when the link changes state while
+ * a packet is being sent or received. This doesn't cover things
+ * like EBP or VCRC errors, which can result from the link changing
+ * state during a send, so that we receive a "known bad" packet.
+ */
+#define E_SUM_LINK_PKTERRS \
+ (ERR_MASK(SendDroppedDataPktErr) | ERR_MASK(SendDroppedSmpPktErr) | \
+ ERR_MASK(SendMinPktLenErr) | ERR_MASK(SendPktLenErr) | \
+ ERR_MASK(RcvShortPktLenErr) | ERR_MASK(RcvMinPktLenErr) | \
+ ERR_MASK(RcvUnexpectedCharErr))
+
+static void autoneg_7220_work(struct work_struct *);
+static u32 __iomem *qib_7220_getsendbuf(struct qib_pportdata *, u64, u32 *);
+
+/*
+ * Called when we might have an error that is specific to a particular
+ * PIO buffer, and may need to cancel that buffer, so it can be re-used;
+ * we don't need to force the update of pioavail here.
+ */
+static void qib_disarm_7220_senderrbufs(struct qib_pportdata *ppd)
+{
+ unsigned long sbuf[3];
+ struct qib_devdata *dd = ppd->dd;
+
+ /*
+ * It's possible that sendbuffererror could have bits set; might
+ * have already done this as a result of hardware error handling.
+ */
+ /* read these before writing errorclear */
+ sbuf[0] = qib_read_kreg64(dd, kr_sendbuffererror);
+ sbuf[1] = qib_read_kreg64(dd, kr_sendbuffererror + 1);
+ sbuf[2] = qib_read_kreg64(dd, kr_sendbuffererror + 2);
+
+ if (sbuf[0] || sbuf[1] || sbuf[2])
+ qib_disarm_piobufs_set(dd, sbuf,
+ dd->piobcnt2k + dd->piobcnt4k);
+}
+
+static void qib_7220_txe_recover(struct qib_devdata *dd)
+{
+ qib_devinfo(dd->pcidev, "Recovering from TXE PIO parity error\n");
+ qib_disarm_7220_senderrbufs(dd->pport);
+}
+
+/*
+ * This is called with interrupts disabled and sdma_lock held.
+ */
+static void qib_7220_sdma_sendctrl(struct qib_pportdata *ppd, unsigned op)
+{
+ struct qib_devdata *dd = ppd->dd;
+ u64 set_sendctrl = 0;
+ u64 clr_sendctrl = 0;
+
+ if (op & QIB_SDMA_SENDCTRL_OP_ENABLE)
+ set_sendctrl |= SYM_MASK(SendCtrl, SDmaEnable);
+ else
+ clr_sendctrl |= SYM_MASK(SendCtrl, SDmaEnable);
+
+ if (op & QIB_SDMA_SENDCTRL_OP_INTENABLE)
+ set_sendctrl |= SYM_MASK(SendCtrl, SDmaIntEnable);
+ else
+ clr_sendctrl |= SYM_MASK(SendCtrl, SDmaIntEnable);
+
+ if (op & QIB_SDMA_SENDCTRL_OP_HALT)
+ set_sendctrl |= SYM_MASK(SendCtrl, SDmaHalt);
+ else
+ clr_sendctrl |= SYM_MASK(SendCtrl, SDmaHalt);
+
+ spin_lock(&dd->sendctrl_lock);
+
+ dd->sendctrl |= set_sendctrl;
+ dd->sendctrl &= ~clr_sendctrl;
+
+ qib_write_kreg(dd, kr_sendctrl, dd->sendctrl);
+ qib_write_kreg(dd, kr_scratch, 0);
+
+ spin_unlock(&dd->sendctrl_lock);
+}
+
+static void qib_decode_7220_sdma_errs(struct qib_pportdata *ppd,
+ u64 err, char *buf, size_t blen)
+{
+ static const struct {
+ u64 err;
+ const char *msg;
+ } errs[] = {
+ { ERR_MASK(SDmaGenMismatchErr),
+ "SDmaGenMismatch" },
+ { ERR_MASK(SDmaOutOfBoundErr),
+ "SDmaOutOfBound" },
+ { ERR_MASK(SDmaTailOutOfBoundErr),
+ "SDmaTailOutOfBound" },
+ { ERR_MASK(SDmaBaseErr),
+ "SDmaBase" },
+ { ERR_MASK(SDma1stDescErr),
+ "SDma1stDesc" },
+ { ERR_MASK(SDmaRpyTagErr),
+ "SDmaRpyTag" },
+ { ERR_MASK(SDmaDwEnErr),
+ "SDmaDwEn" },
+ { ERR_MASK(SDmaMissingDwErr),
+ "SDmaMissingDw" },
+ { ERR_MASK(SDmaUnexpDataErr),
+ "SDmaUnexpData" },
+ { ERR_MASK(SDmaDescAddrMisalignErr),
+ "SDmaDescAddrMisalign" },
+ { ERR_MASK(SendBufMisuseErr),
+ "SendBufMisuse" },
+ { ERR_MASK(SDmaDisabledErr),
+ "SDmaDisabled" },
+ };
+ int i;
+ size_t bidx = 0;
+
+ for (i = 0; i < ARRAY_SIZE(errs); i++) {
+ if (err & errs[i].err)
+ bidx += scnprintf(buf + bidx, blen - bidx,
+ "%s ", errs[i].msg);
+ }
+}
+
+/*
+ * This is called as part of link-down cleanup to disarm and flush
+ * all send buffers so that SMP packets can be sent.
+ */
+static void qib_7220_sdma_hw_clean_up(struct qib_pportdata *ppd)
+{
+ /* This will trigger the Abort interrupt */
+ sendctrl_7220_mod(ppd, QIB_SENDCTRL_DISARM_ALL | QIB_SENDCTRL_FLUSH |
+ QIB_SENDCTRL_AVAIL_BLIP);
+ ppd->dd->upd_pio_shadow = 1; /* update our idea of what's busy */
+}
+
+static void qib_sdma_7220_setlengen(struct qib_pportdata *ppd)
+{
+ /*
+ * Write SendDmaLenGen twice: first with the generation MSB clear,
+ * then with it set, to enable generation checking and load the
+ * internal generation counter.
+ */
+ qib_write_kreg(ppd->dd, kr_senddmalengen, ppd->sdma_descq_cnt);
+ qib_write_kreg(ppd->dd, kr_senddmalengen,
+ ppd->sdma_descq_cnt |
+ (1ULL << QIB_7220_SendDmaLenGen_Generation_MSB));
+}
+
+static void qib_7220_sdma_hw_start_up(struct qib_pportdata *ppd)
+{
+ qib_sdma_7220_setlengen(ppd);
+ qib_sdma_update_7220_tail(ppd, 0); /* Set SendDmaTail */
+ ppd->sdma_head_dma[0] = 0;
+}
+
+#define DISABLES_SDMA ( \
+ ERR_MASK(SDmaDisabledErr) | \
+ ERR_MASK(SDmaBaseErr) | \
+ ERR_MASK(SDmaTailOutOfBoundErr) | \
+ ERR_MASK(SDmaOutOfBoundErr) | \
+ ERR_MASK(SDma1stDescErr) | \
+ ERR_MASK(SDmaRpyTagErr) | \
+ ERR_MASK(SDmaGenMismatchErr) | \
+ ERR_MASK(SDmaDescAddrMisalignErr) | \
+ ERR_MASK(SDmaMissingDwErr) | \
+ ERR_MASK(SDmaDwEnErr))
+
+static void sdma_7220_errors(struct qib_pportdata *ppd, u64 errs)
+{
+ unsigned long flags;
+ struct qib_devdata *dd = ppd->dd;
+ char *msg;
+
+ errs &= QLOGIC_IB_E_SDMAERRS;
+
+ msg = dd->cspec->sdmamsgbuf;
+ qib_decode_7220_sdma_errs(ppd, errs, msg,
+ sizeof(dd->cspec->sdmamsgbuf));
+ spin_lock_irqsave(&ppd->sdma_lock, flags);
+
+ if (errs & ERR_MASK(SendBufMisuseErr)) {
+ unsigned long sbuf[3];
+
+ sbuf[0] = qib_read_kreg64(dd, kr_sendbuffererror);
+ sbuf[1] = qib_read_kreg64(dd, kr_sendbuffererror + 1);
+ sbuf[2] = qib_read_kreg64(dd, kr_sendbuffererror + 2);
+
+ qib_dev_err(ppd->dd,
+ "IB%u:%u SendBufMisuse: %04lx %016lx %016lx\n",
+ ppd->dd->unit, ppd->port, sbuf[2], sbuf[1],
+ sbuf[0]);
+ }
+
+ if (errs & ERR_MASK(SDmaUnexpDataErr))
+ qib_dev_err(dd, "IB%u:%u SDmaUnexpData\n", ppd->dd->unit,
+ ppd->port);
+
+ switch (ppd->sdma_state.current_state) {
+ case qib_sdma_state_s00_hw_down:
+ /* not expecting any interrupts */
+ break;
+
+ case qib_sdma_state_s10_hw_start_up_wait:
+ /* handled in intr path */
+ break;
+
+ case qib_sdma_state_s20_idle:
+ /* not expecting any interrupts */
+ break;
+
+ case qib_sdma_state_s30_sw_clean_up_wait:
+ /* not expecting any interrupts */
+ break;
+
+ case qib_sdma_state_s40_hw_clean_up_wait:
+ if (errs & ERR_MASK(SDmaDisabledErr))
+ __qib_sdma_process_event(ppd,
+ qib_sdma_event_e50_hw_cleaned);
+ break;
+
+ case qib_sdma_state_s50_hw_halt_wait:
+ /* handled in intr path */
+ break;
+
+ case qib_sdma_state_s99_running:
+ if (errs & DISABLES_SDMA)
+ __qib_sdma_process_event(ppd,
+ qib_sdma_event_e7220_err_halted);
+ break;
+ }
+
+ spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+}
+
+/*
+ * Decode the error status into strings, deciding whether to always
+ * print it or not, depending on "normal packet errors" vs everything
+ * else. Return 1 if "real" errors, otherwise 0 if only packet
+ * errors, so caller can decide what to print with the string.
+ */
+static int qib_decode_7220_err(struct qib_devdata *dd, char *buf, size_t blen,
+ u64 err)
+{
+ int iserr = 1;
+
+ *buf = '\0';
+ if (err & QLOGIC_IB_E_PKTERRS) {
+ if (!(err & ~QLOGIC_IB_E_PKTERRS))
+ iserr = 0;
+ if ((err & ERR_MASK(RcvICRCErr)) &&
+ !(err & (ERR_MASK(RcvVCRCErr) | ERR_MASK(RcvEBPErr))))
+ strlcat(buf, "CRC ", blen);
+ if (!iserr)
+ goto done;
+ }
+ if (err & ERR_MASK(RcvHdrLenErr))
+ strlcat(buf, "rhdrlen ", blen);
+ if (err & ERR_MASK(RcvBadTidErr))
+ strlcat(buf, "rbadtid ", blen);
+ if (err & ERR_MASK(RcvBadVersionErr))
+ strlcat(buf, "rbadversion ", blen);
+ if (err & ERR_MASK(RcvHdrErr))
+ strlcat(buf, "rhdr ", blen);
+ if (err & ERR_MASK(SendSpecialTriggerErr))
+ strlcat(buf, "sendspecialtrigger ", blen);
+ if (err & ERR_MASK(RcvLongPktLenErr))
+ strlcat(buf, "rlongpktlen ", blen);
+ if (err & ERR_MASK(RcvMaxPktLenErr))
+ strlcat(buf, "rmaxpktlen ", blen);
+ if (err & ERR_MASK(RcvMinPktLenErr))
+ strlcat(buf, "rminpktlen ", blen);
+ if (err & ERR_MASK(SendMinPktLenErr))
+ strlcat(buf, "sminpktlen ", blen);
+ if (err & ERR_MASK(RcvFormatErr))
+ strlcat(buf, "rformaterr ", blen);
+ if (err & ERR_MASK(RcvUnsupportedVLErr))
+ strlcat(buf, "runsupvl ", blen);
+ if (err & ERR_MASK(RcvUnexpectedCharErr))
+ strlcat(buf, "runexpchar ", blen);
+ if (err & ERR_MASK(RcvIBFlowErr))
+ strlcat(buf, "ribflow ", blen);
+ if (err & ERR_MASK(SendUnderRunErr))
+ strlcat(buf, "sunderrun ", blen);
+ if (err & ERR_MASK(SendPioArmLaunchErr))
+ strlcat(buf, "spioarmlaunch ", blen);
+ if (err & ERR_MASK(SendUnexpectedPktNumErr))
+ strlcat(buf, "sunexperrpktnum ", blen);
+ if (err & ERR_MASK(SendDroppedSmpPktErr))
+ strlcat(buf, "sdroppedsmppkt ", blen);
+ if (err & ERR_MASK(SendMaxPktLenErr))
+ strlcat(buf, "smaxpktlen ", blen);
+ if (err & ERR_MASK(SendUnsupportedVLErr))
+ strlcat(buf, "sunsupVL ", blen);
+ if (err & ERR_MASK(InvalidAddrErr))
+ strlcat(buf, "invalidaddr ", blen);
+ if (err & ERR_MASK(RcvEgrFullErr))
+ strlcat(buf, "rcvegrfull ", blen);
+ if (err & ERR_MASK(RcvHdrFullErr))
+ strlcat(buf, "rcvhdrfull ", blen);
+ if (err & ERR_MASK(IBStatusChanged))
+ strlcat(buf, "ibcstatuschg ", blen);
+ if (err & ERR_MASK(RcvIBLostLinkErr))
+ strlcat(buf, "riblostlink ", blen);
+ if (err & ERR_MASK(HardwareErr))
+ strlcat(buf, "hardware ", blen);
+ if (err & ERR_MASK(ResetNegated))
+ strlcat(buf, "reset ", blen);
+ if (err & QLOGIC_IB_E_SDMAERRS)
+ qib_decode_7220_sdma_errs(dd->pport, err, buf, blen);
+ if (err & ERR_MASK(InvalidEEPCmd))
+ strlcat(buf, "invalideepromcmd ", blen);
+done:
+ return iserr;
+}
+
+static void reenable_7220_chase(unsigned long opaque)
+{
+ struct qib_pportdata *ppd = (struct qib_pportdata *)opaque;
+
+ ppd->cpspec->chase_timer.expires = 0;
+ qib_set_ib_7220_lstate(ppd, QLOGIC_IB_IBCC_LINKCMD_DOWN,
+ QLOGIC_IB_IBCC_LINKINITCMD_POLL);
+}
+
+static void handle_7220_chase(struct qib_pportdata *ppd, u64 ibcst)
+{
+ u8 ibclt;
+ unsigned long tnow;
+
+ ibclt = (u8)SYM_FIELD(ibcst, IBCStatus, LinkTrainingState);
+
+ /*
+ * Detect and handle the state chase issue, where we can
+ * get stuck if we are unlucky on timing on both sides of
+ * the link. If we are, we disable, set a timer, and
+ * then re-enable.
+ */
+ switch (ibclt) {
+ case IB_7220_LT_STATE_CFGRCVFCFG:
+ case IB_7220_LT_STATE_CFGWAITRMT:
+ case IB_7220_LT_STATE_TXREVLANES:
+ case IB_7220_LT_STATE_CFGENH:
+ tnow = jiffies;
+ if (ppd->cpspec->chase_end &&
+ time_after(tnow, ppd->cpspec->chase_end)) {
+ ppd->cpspec->chase_end = 0;
+ qib_set_ib_7220_lstate(ppd,
+ QLOGIC_IB_IBCC_LINKCMD_DOWN,
+ QLOGIC_IB_IBCC_LINKINITCMD_DISABLE);
+ ppd->cpspec->chase_timer.expires = jiffies +
+ QIB_CHASE_DIS_TIME;
+ add_timer(&ppd->cpspec->chase_timer);
+ } else if (!ppd->cpspec->chase_end)
+ ppd->cpspec->chase_end = tnow + QIB_CHASE_TIME;
+ break;
+
+ default:
+ ppd->cpspec->chase_end = 0;
+ break;
+ }
+}
+
+static void handle_7220_errors(struct qib_devdata *dd, u64 errs)
+{
+ char *msg;
+ u64 ignore_this_time = 0;
+ u64 iserr = 0;
+ int log_idx;
+ struct qib_pportdata *ppd = dd->pport;
+ u64 mask;
+
+ /* don't report errors that are masked */
+ errs &= dd->cspec->errormask;
+ msg = dd->cspec->emsgbuf;
+
+ /* do these first, they are most important */
+ if (errs & ERR_MASK(HardwareErr))
+ qib_7220_handle_hwerrors(dd, msg, sizeof(dd->cspec->emsgbuf));
+ else
+ for (log_idx = 0; log_idx < QIB_EEP_LOG_CNT; ++log_idx)
+ if (errs & dd->eep_st_masks[log_idx].errs_to_log)
+ qib_inc_eeprom_err(dd, log_idx, 1);
+
+ if (errs & QLOGIC_IB_E_SDMAERRS)
+ sdma_7220_errors(ppd, errs);
+
+ if (errs & ~IB_E_BITSEXTANT)
+ qib_dev_err(dd,
+ "error interrupt with unknown errors %llx set\n",
+ (unsigned long long) (errs & ~IB_E_BITSEXTANT));
+
+ if (errs & E_SUM_ERRS) {
+ qib_disarm_7220_senderrbufs(ppd);
+ if ((errs & E_SUM_LINK_PKTERRS) &&
+ !(ppd->lflags & QIBL_LINKACTIVE)) {
+ /*
+ * This can happen when trying to bring the link
+ * up, but the IB link changes state at the "wrong"
+ * time. The IB logic then complains that the packet
+ * isn't valid. We don't want to confuse people, so
+ * we just don't print them, except at debug
+ */
+ ignore_this_time = errs & E_SUM_LINK_PKTERRS;
+ }
+ } else if ((errs & E_SUM_LINK_PKTERRS) &&
+ !(ppd->lflags & QIBL_LINKACTIVE)) {
+ /*
+ * This can happen when SMA is trying to bring the link
+ * up, but the IB link changes state at the "wrong" time.
+ * The IB logic then complains that the packet isn't
+ * valid. We don't want to confuse people, so we just
+ * don't print them, except at debug
+ */
+ ignore_this_time = errs & E_SUM_LINK_PKTERRS;
+ }
+
+ qib_write_kreg(dd, kr_errclear, errs);
+
+ errs &= ~ignore_this_time;
+ if (!errs)
+ goto done;
+
+ /*
+ * The ones we mask off are handled specially below
+ * or above. Also mask SDMADISABLED by default as it
+ * is too chatty.
+ */
+ mask = ERR_MASK(IBStatusChanged) |
+ ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr) |
+ ERR_MASK(HardwareErr) | ERR_MASK(SDmaDisabledErr);
+
+ qib_decode_7220_err(dd, msg, sizeof(dd->cspec->emsgbuf), errs & ~mask);
+
+ if (errs & E_SUM_PKTERRS)
+ qib_stats.sps_rcverrs++;
+ if (errs & E_SUM_ERRS)
+ qib_stats.sps_txerrs++;
+ iserr = errs & ~(E_SUM_PKTERRS | QLOGIC_IB_E_PKTERRS |
+ ERR_MASK(SDmaDisabledErr));
+
+ if (errs & ERR_MASK(IBStatusChanged)) {
+ u64 ibcs;
+
+ ibcs = qib_read_kreg64(dd, kr_ibcstatus);
+ if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG))
+ handle_7220_chase(ppd, ibcs);
+
+ /* Update our picture of width and speed from chip */
+ ppd->link_width_active =
+ ((ibcs >> IBA7220_LINKWIDTH_SHIFT) & 1) ?
+ IB_WIDTH_4X : IB_WIDTH_1X;
+ ppd->link_speed_active =
+ ((ibcs >> IBA7220_LINKSPEED_SHIFT) & 1) ?
+ QIB_IB_DDR : QIB_IB_SDR;
+
+ /*
+ * Since going into a recovery state causes the link state
+ * to go down and since recovery is transitory, it is better
+ * if we "miss" ever seeing the link training state go into
+ * recovery (i.e., ignore this transition for link state
+ * special handling purposes) without updating lastibcstat.
+ */
+ if (qib_7220_phys_portstate(ibcs) !=
+ IB_PHYSPORTSTATE_LINK_ERR_RECOVER)
+ qib_handle_e_ibstatuschanged(ppd, ibcs);
+ }
+
+ if (errs & ERR_MASK(ResetNegated)) {
+ qib_dev_err(dd,
+ "Got reset, requires re-init (unload and reload driver)\n");
+ dd->flags &= ~QIB_INITTED; /* needs re-init */
+ /* mark as having had error */
+ *dd->devstatusp |= QIB_STATUS_HWERROR;
+ *dd->pport->statusp &= ~QIB_STATUS_IB_CONF;
+ }
+
+ if (*msg && iserr)
+ qib_dev_porterr(dd, ppd->port, "%s error\n", msg);
+
+ if (ppd->state_wanted & ppd->lflags)
+ wake_up_interruptible(&ppd->state_wait);
+
+ /*
+ * If there were hdrq or egrfull errors, wake up any processes
+ * waiting in poll. We used to try to check which contexts had
+ * the overflow, but given the cost of that and the chip reads
+ * to support it, it's better to just wake everybody up if we
+ * get an overflow; waiters can poll again if it's not them.
+ */
+ if (errs & (ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr))) {
+ qib_handle_urcv(dd, ~0U);
+ if (errs & ERR_MASK(RcvEgrFullErr))
+ qib_stats.sps_buffull++;
+ else
+ qib_stats.sps_hdrfull++;
+ }
+done:
+ return;
+}
+
+/* enable/disable chip from delivering interrupts */
+static void qib_7220_set_intr_state(struct qib_devdata *dd, u32 enable)
+{
+ if (enable) {
+ if (dd->flags & QIB_BADINTR)
+ return;
+ qib_write_kreg(dd, kr_intmask, ~0ULL);
+ /* force re-interrupt of any pending interrupts. */
+ qib_write_kreg(dd, kr_intclear, 0ULL);
+ } else
+ qib_write_kreg(dd, kr_intmask, 0ULL);
+}
+
+/*
+ * Try to cleanup as much as possible for anything that might have gone
+ * wrong while in freeze mode, such as pio buffers being written by user
+ * processes (causing armlaunch), send errors due to going into freeze mode,
+ * etc., and try to avoid causing extra interrupts while doing so.
+ * Forcibly update the in-memory pioavail register copies after cleanup
+ * because the chip won't do it while in freeze mode (the register values
+ * themselves are kept correct).
+ * Make sure that we don't lose any important interrupts by using the chip
+ * feature that says that writing 0 to a bit in *clear that is set in
+ * *status will cause an interrupt to be generated again (if allowed by
+ * the *mask value).
+ * This is in chip-specific code because of all of the register accesses,
+ * even though the details are similar on most chips.
+ */
+static void qib_7220_clear_freeze(struct qib_devdata *dd)
+{
+ /* disable error interrupts, to avoid confusion */
+ qib_write_kreg(dd, kr_errmask, 0ULL);
+
+ /* also disable interrupts; errormask is sometimes overwritten */
+ qib_7220_set_intr_state(dd, 0);
+
+ qib_cancel_sends(dd->pport);
+
+ /* clear the freeze, and be sure chip saw it */
+ qib_write_kreg(dd, kr_control, dd->control);
+ qib_read_kreg32(dd, kr_scratch);
+
+ /* force in-memory update now we are out of freeze */
+ qib_force_pio_avail_update(dd);
+
+ /*
+ * force new interrupt if any hwerr, error or interrupt bits are
+ * still set, and clear "safe" send packet errors related to freeze
+ * and cancelling sends. Re-enable error interrupts before possible
+ * force of re-interrupt on pending interrupts.
+ */
+ qib_write_kreg(dd, kr_hwerrclear, 0ULL);
+ qib_write_kreg(dd, kr_errclear, E_SPKT_ERRS_IGNORE);
+ qib_write_kreg(dd, kr_errmask, dd->cspec->errormask);
+ qib_7220_set_intr_state(dd, 1);
+}
+
+/**
+ * qib_7220_handle_hwerrors - display hardware errors.
+ * @dd: the qlogic_ib device
+ * @msg: the output buffer
+ * @msgl: the size of the output buffer
+ *
+ * Use same msg buffer as regular errors to avoid excessive stack
+ * use. Most hardware errors are catastrophic, but for right now,
+ * we'll print them and continue. We reuse the same message buffer as
+ * handle_7220_errors() to avoid excessive stack usage.
+ */
+static void qib_7220_handle_hwerrors(struct qib_devdata *dd, char *msg,
+ size_t msgl)
+{
+ u64 hwerrs;
+ u32 bits, ctrl;
+ int isfatal = 0;
+ char *bitsmsg;
+ int log_idx;
+
+ hwerrs = qib_read_kreg64(dd, kr_hwerrstatus);
+ if (!hwerrs)
+ goto bail;
+ if (hwerrs == ~0ULL) {
+ qib_dev_err(dd,
+ "Read of hardware error status failed (all bits set); ignoring\n");
+ goto bail;
+ }
+ qib_stats.sps_hwerrs++;
+
+ /*
+ * Always clear the error status register, except MEMBISTFAIL,
+ * regardless of whether we continue or stop using the chip.
+ * We want that set so we know it failed, even across driver reload.
+ * We'll still ignore it in the hwerrmask. We do this partly for
+ * diagnostics, but also for support.
+ */
+ qib_write_kreg(dd, kr_hwerrclear,
+ hwerrs & ~HWE_MASK(PowerOnBISTFailed));
+
+ hwerrs &= dd->cspec->hwerrmask;
+
+ /* We log some errors to EEPROM, check if we have any of those. */
+ for (log_idx = 0; log_idx < QIB_EEP_LOG_CNT; ++log_idx)
+ if (hwerrs & dd->eep_st_masks[log_idx].hwerrs_to_log)
+ qib_inc_eeprom_err(dd, log_idx, 1);
+ if (hwerrs & ~(TXEMEMPARITYERR_PIOBUF | TXEMEMPARITYERR_PIOPBC |
+ RXE_PARITY))
+ qib_devinfo(dd->pcidev,
+ "Hardware error: hwerr=0x%llx (cleared)\n",
+ (unsigned long long) hwerrs);
+
+ if (hwerrs & ~IB_HWE_BITSEXTANT)
+ qib_dev_err(dd,
+ "hwerror interrupt with unknown errors %llx set\n",
+ (unsigned long long) (hwerrs & ~IB_HWE_BITSEXTANT));
+
+ if (hwerrs & QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR)
+ qib_sd7220_clr_ibpar(dd);
+
+ ctrl = qib_read_kreg32(dd, kr_control);
+ if ((ctrl & QLOGIC_IB_C_FREEZEMODE) && !dd->diag_client) {
+ /*
+ * Parity errors in send memory are recoverable by h/w
+ * just do housekeeping, exit freeze mode and continue.
+ */
+ if (hwerrs & (TXEMEMPARITYERR_PIOBUF |
+ TXEMEMPARITYERR_PIOPBC)) {
+ qib_7220_txe_recover(dd);
+ hwerrs &= ~(TXEMEMPARITYERR_PIOBUF |
+ TXEMEMPARITYERR_PIOPBC);
+ }
+ if (hwerrs)
+ isfatal = 1;
+ else
+ qib_7220_clear_freeze(dd);
+ }
+
+ *msg = '\0';
+
+ if (hwerrs & HWE_MASK(PowerOnBISTFailed)) {
+ isfatal = 1;
+ strlcat(msg,
+ "[Memory BIST test failed, InfiniPath hardware unusable]",
+ msgl);
+ /* ignore from now on, so disable until driver reloaded */
+ dd->cspec->hwerrmask &= ~HWE_MASK(PowerOnBISTFailed);
+ qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask);
+ }
+
+ qib_format_hwerrors(hwerrs, qib_7220_hwerror_msgs,
+ ARRAY_SIZE(qib_7220_hwerror_msgs), msg, msgl);
+
+ bitsmsg = dd->cspec->bitsmsgbuf;
+ if (hwerrs & (QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK <<
+ QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT)) {
+ bits = (u32) ((hwerrs >>
+ QLOGIC_IB_HWE_PCIEMEMPARITYERR_SHIFT) &
+ QLOGIC_IB_HWE_PCIEMEMPARITYERR_MASK);
+ snprintf(bitsmsg, sizeof(dd->cspec->bitsmsgbuf),
+ "[PCIe Mem Parity Errs %x] ", bits);
+ strlcat(msg, bitsmsg, msgl);
+ }
+
+#define _QIB_PLL_FAIL (QLOGIC_IB_HWE_COREPLL_FBSLIP | \
+ QLOGIC_IB_HWE_COREPLL_RFSLIP)
+
+ if (hwerrs & _QIB_PLL_FAIL) {
+ isfatal = 1;
+ snprintf(bitsmsg, sizeof(dd->cspec->bitsmsgbuf),
+ "[PLL failed (%llx), InfiniPath hardware unusable]",
+ (unsigned long long) hwerrs & _QIB_PLL_FAIL);
+ strlcat(msg, bitsmsg, msgl);
+ /* ignore from now on, so disable until driver reloaded */
+ dd->cspec->hwerrmask &= ~(hwerrs & _QIB_PLL_FAIL);
+ qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask);
+ }
+
+ if (hwerrs & QLOGIC_IB_HWE_SERDESPLLFAILED) {
+ /*
+ * If it occurs, it is left masked since the external
+ * interface is unused.
+ */
+ dd->cspec->hwerrmask &= ~QLOGIC_IB_HWE_SERDESPLLFAILED;
+ qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask);
+ }
+
+ qib_dev_err(dd, "%s hardware error\n", msg);
+
+ if (isfatal && !dd->diag_client) {
+ qib_dev_err(dd,
+ "Fatal Hardware Error, no longer usable, SN %.16s\n",
+ dd->serial);
+ /*
+ * For /sys status file and user programs to print; if no
+ * trailing brace is copied, we'll know it was truncated.
+ */
+ if (dd->freezemsg)
+ snprintf(dd->freezemsg, dd->freezelen,
+ "{%s}", msg);
+ qib_disable_after_error(dd);
+ }
+bail:;
+}
+
+/**
+ * qib_7220_init_hwerrors - enable hardware errors
+ * @dd: the qlogic_ib device
+ *
+ * Now that we have finished initializing everything that might reasonably
+ * cause a hardware error, and cleared those error bits as they occur,
+ * we can enable hardware errors in the mask (potentially enabling
+ * freeze mode), and enable hardware errors as errors (along with
+ * everything else) in errormask.
+ */
+static void qib_7220_init_hwerrors(struct qib_devdata *dd)
+{
+ u64 val;
+ u64 extsval;
+
+ extsval = qib_read_kreg64(dd, kr_extstatus);
+
+ if (!(extsval & (QLOGIC_IB_EXTS_MEMBIST_ENDTEST |
+ QLOGIC_IB_EXTS_MEMBIST_DISABLED)))
+ qib_dev_err(dd, "MemBIST did not complete!\n");
+ if (extsval & QLOGIC_IB_EXTS_MEMBIST_DISABLED)
+ qib_devinfo(dd->pcidev, "MemBIST is disabled.\n");
+
+ val = ~0ULL; /* default to all hwerrors become interrupts, */
+
+ val &= ~QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR;
+ dd->cspec->hwerrmask = val;
+
+ qib_write_kreg(dd, kr_hwerrclear, ~HWE_MASK(PowerOnBISTFailed));
+ qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask);
+
+ /* clear all */
+ qib_write_kreg(dd, kr_errclear, ~0ULL);
+ /* enable errors that are masked, at least this first time. */
+ qib_write_kreg(dd, kr_errmask, ~0ULL);
+ dd->cspec->errormask = qib_read_kreg64(dd, kr_errmask);
+ /* clear any interrupts up to this point (ints still not enabled) */
+ qib_write_kreg(dd, kr_intclear, ~0ULL);
+}
+
+/*
+ * Disable and enable the armlaunch error. Used for PIO bandwidth testing
+ * on chips that are count-based, rather than trigger-based. There is no
+ * reference counting, but that's also fine, given the intended use.
+ * This is chip-specific only because it's all register accesses.
+ */
+static void qib_set_7220_armlaunch(struct qib_devdata *dd, u32 enable)
+{
+ if (enable) {
+ qib_write_kreg(dd, kr_errclear, ERR_MASK(SendPioArmLaunchErr));
+ dd->cspec->errormask |= ERR_MASK(SendPioArmLaunchErr);
+ } else
+ dd->cspec->errormask &= ~ERR_MASK(SendPioArmLaunchErr);
+ qib_write_kreg(dd, kr_errmask, dd->cspec->errormask);
+}
+
+/*
+ * Formerly took parameter <which> in pre-shifted,
+ * pre-merged form with LinkCmd and LinkInitCmd
+ * together, and assuming the zero was NOP.
+ */
+static void qib_set_ib_7220_lstate(struct qib_pportdata *ppd, u16 linkcmd,
+ u16 linitcmd)
+{
+ u64 mod_wd;
+ struct qib_devdata *dd = ppd->dd;
+ unsigned long flags;
+
+ if (linitcmd == QLOGIC_IB_IBCC_LINKINITCMD_DISABLE) {
+ /*
+ * If we are told to disable, note that so link-recovery
+ * code does not attempt to bring us back up.
+ */
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags |= QIBL_IB_LINK_DISABLED;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ } else if (linitcmd || linkcmd == QLOGIC_IB_IBCC_LINKCMD_DOWN) {
+ /*
+ * Any other linkinitcmd will lead to LINKDOWN and then
+ * to INIT (if all is well), so clear flag to let
+ * link-recovery code attempt to bring us back up.
+ */
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags &= ~QIBL_IB_LINK_DISABLED;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ }
+
+ mod_wd = (linkcmd << IBA7220_IBCC_LINKCMD_SHIFT) |
+ (linitcmd << QLOGIC_IB_IBCC_LINKINITCMD_SHIFT);
+
+ qib_write_kreg(dd, kr_ibcctrl, ppd->cpspec->ibcctrl | mod_wd);
+ /* write to chip to prevent back-to-back writes of ibc reg */
+ qib_write_kreg(dd, kr_scratch, 0);
+}
+
+/*
+ * All detailed interaction with the SerDes has been moved to qib_sd7220.c
+ *
+ * The portion of IBA7220-specific bringup_serdes() that actually deals with
+ * registers and memory within the SerDes itself is qib_sd7220_init().
+ */
+
+/**
+ * qib_7220_bringup_serdes - bring up the serdes
+ * @ppd: physical port on the qlogic_ib device
+ */
+static int qib_7220_bringup_serdes(struct qib_pportdata *ppd)
+{
+ struct qib_devdata *dd = ppd->dd;
+ u64 val, prev_val, guid, ibc;
+ int ret = 0;
+
+ /* Put IBC in reset, sends disabled */
+ dd->control &= ~QLOGIC_IB_C_LINKENABLE;
+ qib_write_kreg(dd, kr_control, 0ULL);
+
+ if (qib_compat_ddr_negotiate) {
+ ppd->cpspec->ibdeltainprog = 1;
+ ppd->cpspec->ibsymsnap = read_7220_creg32(dd, cr_ibsymbolerr);
+ ppd->cpspec->iblnkerrsnap =
+ read_7220_creg32(dd, cr_iblinkerrrecov);
+ }
+
+ /* flowcontrolwatermark is in units of KBytes */
+ ibc = 0x5ULL << SYM_LSB(IBCCtrl, FlowCtrlWaterMark);
+ /*
+ * How often flowctrl sent. More or less in usecs; balance against
+ * watermark value, so that in theory senders always get a flow
+ * control update in time to not let the IB link go idle.
+ */
+ ibc |= 0x3ULL << SYM_LSB(IBCCtrl, FlowCtrlPeriod);
+ /* max error tolerance */
+ ibc |= 0xfULL << SYM_LSB(IBCCtrl, PhyerrThreshold);
+ /* use "real" buffer space for */
+ ibc |= 4ULL << SYM_LSB(IBCCtrl, CreditScale);
+ /* IB credit flow control. */
+ ibc |= 0xfULL << SYM_LSB(IBCCtrl, OverrunThreshold);
+ /*
+ * set initial max size pkt IBC will send, including ICRC; it's the
+ * PIO buffer size in dwords, less 1; also see qib_set_mtu()
+ */
+ ibc |= ((u64)(ppd->ibmaxlen >> 2) + 1) << SYM_LSB(IBCCtrl, MaxPktLen);
+ ppd->cpspec->ibcctrl = ibc; /* without linkcmd or linkinitcmd! */
+
+ /* initially come up waiting for TS1, without sending anything. */
+ val = ppd->cpspec->ibcctrl | (QLOGIC_IB_IBCC_LINKINITCMD_DISABLE <<
+ QLOGIC_IB_IBCC_LINKINITCMD_SHIFT);
+ qib_write_kreg(dd, kr_ibcctrl, val);
+
+ if (!ppd->cpspec->ibcddrctrl) {
+ /* not on re-init after reset */
+ ppd->cpspec->ibcddrctrl = qib_read_kreg64(dd, kr_ibcddrctrl);
+
+ if (ppd->link_speed_enabled == (QIB_IB_SDR | QIB_IB_DDR))
+ ppd->cpspec->ibcddrctrl |=
+ IBA7220_IBC_SPEED_AUTONEG_MASK |
+ IBA7220_IBC_IBTA_1_2_MASK;
+ else
+ ppd->cpspec->ibcddrctrl |=
+ ppd->link_speed_enabled == QIB_IB_DDR ?
+ IBA7220_IBC_SPEED_DDR : IBA7220_IBC_SPEED_SDR;
+ if ((ppd->link_width_enabled & (IB_WIDTH_1X | IB_WIDTH_4X)) ==
+ (IB_WIDTH_1X | IB_WIDTH_4X))
+ ppd->cpspec->ibcddrctrl |= IBA7220_IBC_WIDTH_AUTONEG;
+ else
+ ppd->cpspec->ibcddrctrl |=
+ ppd->link_width_enabled == IB_WIDTH_4X ?
+ IBA7220_IBC_WIDTH_4X_ONLY :
+ IBA7220_IBC_WIDTH_1X_ONLY;
+
+ /* always enable these on driver reload, not sticky */
+ ppd->cpspec->ibcddrctrl |=
+ IBA7220_IBC_RXPOL_MASK << IBA7220_IBC_RXPOL_SHIFT;
+ ppd->cpspec->ibcddrctrl |=
+ IBA7220_IBC_HRTBT_MASK << IBA7220_IBC_HRTBT_SHIFT;
+
+ /* enable automatic lane reversal detection for receive */
+ ppd->cpspec->ibcddrctrl |= IBA7220_IBC_LANE_REV_SUPPORTED;
+ } else
+ /* write to chip to prevent back-to-back writes of ibc reg */
+ qib_write_kreg(dd, kr_scratch, 0);
+
+ qib_write_kreg(dd, kr_ibcddrctrl, ppd->cpspec->ibcddrctrl);
+ qib_write_kreg(dd, kr_scratch, 0);
+
+ qib_write_kreg(dd, kr_ncmodectrl, 0Ull);
+ qib_write_kreg(dd, kr_scratch, 0);
+
+ ret = qib_sd7220_init(dd);
+
+ val = qib_read_kreg64(dd, kr_xgxs_cfg);
+ prev_val = val;
+ val |= QLOGIC_IB_XGXS_FC_SAFE;
+ if (val != prev_val) {
+ qib_write_kreg(dd, kr_xgxs_cfg, val);
+ qib_read_kreg32(dd, kr_scratch);
+ }
+ if (val & QLOGIC_IB_XGXS_RESET)
+ val &= ~QLOGIC_IB_XGXS_RESET;
+ if (val != prev_val)
+ qib_write_kreg(dd, kr_xgxs_cfg, val);
+
+ /* first time through, set port guid */
+ if (!ppd->guid)
+ ppd->guid = dd->base_guid;
+ guid = be64_to_cpu(ppd->guid);
+
+ qib_write_kreg(dd, kr_hrtbt_guid, guid);
+ if (!ret) {
+ dd->control |= QLOGIC_IB_C_LINKENABLE;
+ qib_write_kreg(dd, kr_control, dd->control);
+ } else
+ /* write to chip to prevent back-to-back writes of ibc reg */
+ qib_write_kreg(dd, kr_scratch, 0);
+ return ret;
+}
+
+/**
+ * qib_7220_quiet_serdes - set serdes to txidle
+ * @ppd: physical port of the qlogic_ib device
+ * Called when driver is being unloaded
+ */
+static void qib_7220_quiet_serdes(struct qib_pportdata *ppd)
+{
+ u64 val;
+ struct qib_devdata *dd = ppd->dd;
+ unsigned long flags;
+
+ /* disable IBC */
+ dd->control &= ~QLOGIC_IB_C_LINKENABLE;
+ qib_write_kreg(dd, kr_control,
+ dd->control | QLOGIC_IB_C_FREEZEMODE);
+
+ ppd->cpspec->chase_end = 0;
+ if (ppd->cpspec->chase_timer.data) /* if initted */
+ del_timer_sync(&ppd->cpspec->chase_timer);
+
+ if (ppd->cpspec->ibsymdelta || ppd->cpspec->iblnkerrdelta ||
+ ppd->cpspec->ibdeltainprog) {
+ u64 diagc;
+
+ /* enable counter writes */
+ diagc = qib_read_kreg64(dd, kr_hwdiagctrl);
+ qib_write_kreg(dd, kr_hwdiagctrl,
+ diagc | SYM_MASK(HwDiagCtrl, CounterWrEnable));
+
+ if (ppd->cpspec->ibsymdelta || ppd->cpspec->ibdeltainprog) {
+ val = read_7220_creg32(dd, cr_ibsymbolerr);
+ if (ppd->cpspec->ibdeltainprog)
+ val -= val - ppd->cpspec->ibsymsnap;
+ val -= ppd->cpspec->ibsymdelta;
+ write_7220_creg(dd, cr_ibsymbolerr, val);
+ }
+ if (ppd->cpspec->iblnkerrdelta || ppd->cpspec->ibdeltainprog) {
+ val = read_7220_creg32(dd, cr_iblinkerrrecov);
+ if (ppd->cpspec->ibdeltainprog)
+ val -= val - ppd->cpspec->iblnkerrsnap;
+ val -= ppd->cpspec->iblnkerrdelta;
+ write_7220_creg(dd, cr_iblinkerrrecov, val);
+ }
+
+ /* and disable counter writes */
+ qib_write_kreg(dd, kr_hwdiagctrl, diagc);
+ }
+ qib_set_ib_7220_lstate(ppd, 0, QLOGIC_IB_IBCC_LINKINITCMD_DISABLE);
+
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags &= ~QIBL_IB_AUTONEG_INPROG;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ wake_up(&ppd->cpspec->autoneg_wait);
+ cancel_delayed_work_sync(&ppd->cpspec->autoneg_work);
+
+ shutdown_7220_relock_poll(ppd->dd);
+ val = qib_read_kreg64(ppd->dd, kr_xgxs_cfg);
+ val |= QLOGIC_IB_XGXS_RESET;
+ qib_write_kreg(ppd->dd, kr_xgxs_cfg, val);
+}
+
+/**
+ * qib_setup_7220_setextled - set the state of the two external LEDs
+ * @ppd: physical port of the qlogic_ib device
+ * @on: whether the link is up or not
+ *
+ * The exact combination of LEDs lit when @on is true is determined
+ * by looking at the ibcstatus.
+ *
+ * These LEDs indicate the physical and logical state of the IB link.
+ * For this chip (at least with recommended board pinouts), LED1
+ * is Yellow (logical state) and LED2 is Green (physical state).
+ *
+ * Note: We try to match the Mellanox HCA LED behavior as best
+ * we can. Green indicates physical link state is OK (something is
+ * plugged in, and we can train).
+ * Amber indicates the link is logically up (ACTIVE).
+ * Mellanox further blinks the amber LED to indicate data packet
+ * activity, but we have no hardware support for that, so it would
+ * require waking up every 10-20 msecs and checking the counters
+ * on the chip, and then turning the LED off if appropriate. That's
+ * visible overhead, so not something we will do.
+ *
+ */
+static void qib_setup_7220_setextled(struct qib_pportdata *ppd, u32 on)
+{
+ struct qib_devdata *dd = ppd->dd;
+ u64 extctl, ledblink = 0, val, lst, ltst;
+ unsigned long flags;
+
+ /*
+ * The diags use the LED to indicate diag info, so we leave
+ * the external LED alone when the diags are running.
+ */
+ if (dd->diag_client)
+ return;
+
+ if (ppd->led_override) {
+ ltst = (ppd->led_override & QIB_LED_PHYS) ?
+ IB_PHYSPORTSTATE_LINKUP : IB_PHYSPORTSTATE_DISABLED,
+ lst = (ppd->led_override & QIB_LED_LOG) ?
+ IB_PORT_ACTIVE : IB_PORT_DOWN;
+ } else if (on) {
+ val = qib_read_kreg64(dd, kr_ibcstatus);
+ ltst = qib_7220_phys_portstate(val);
+ lst = qib_7220_iblink_state(val);
+ } else {
+ ltst = 0;
+ lst = 0;
+ }
+
+ spin_lock_irqsave(&dd->cspec->gpio_lock, flags);
+ extctl = dd->cspec->extctrl & ~(SYM_MASK(EXTCtrl, LEDPriPortGreenOn) |
+ SYM_MASK(EXTCtrl, LEDPriPortYellowOn));
+ if (ltst == IB_PHYSPORTSTATE_LINKUP) {
+ extctl |= SYM_MASK(EXTCtrl, LEDPriPortGreenOn);
+ /*
+ * Counts are in chip clock (4 ns) periods.
+ * This is roughly 1/16 sec (66.6 ms) on and
+ * 3/16 sec (187.5 ms) off, with packets rcvd.
+ */
+ ledblink = ((66600 * 1000UL / 4) << IBA7220_LEDBLINK_ON_SHIFT)
+ | ((187500 * 1000UL / 4) << IBA7220_LEDBLINK_OFF_SHIFT);
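+ /*
+ * For reference: 66.6 ms is 66,600,000 ns, which at 4 ns per
+ * chip clock gives the 66600 * 1000 / 4 = 16,650,000 on-periods
+ * above; 187.5 ms likewise gives 46,875,000 off-periods.
+ */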
+ }
+ if (lst == IB_PORT_ACTIVE)
+ extctl |= SYM_MASK(EXTCtrl, LEDPriPortYellowOn);
+ dd->cspec->extctrl = extctl;
+ qib_write_kreg(dd, kr_extctrl, extctl);
+ spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags);
+
+ if (ledblink) /* blink the LED on packet receive */
+ qib_write_kreg(dd, kr_rcvpktledcnt, ledblink);
+}
+
+static void qib_7220_free_irq(struct qib_devdata *dd)
+{
+ if (dd->cspec->irq) {
+ free_irq(dd->cspec->irq, dd);
+ dd->cspec->irq = 0;
+ }
+ qib_nomsi(dd);
+}
+
+/*
+ * qib_setup_7220_cleanup - clean up any chip-specific stuff
+ * @dd: the qlogic_ib device
+ *
+ * This is called during driver unload.
+ *
+ */
+static void qib_setup_7220_cleanup(struct qib_devdata *dd)
+{
+ qib_7220_free_irq(dd);
+ kfree(dd->cspec->cntrs);
+ kfree(dd->cspec->portcntrs);
+}
+
+/*
+ * This is only called for SDmaInt.
+ * SDmaDisabled is handled on the error path.
+ */
+static void sdma_7220_intr(struct qib_pportdata *ppd, u64 istat)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ppd->sdma_lock, flags);
+
+ switch (ppd->sdma_state.current_state) {
+ case qib_sdma_state_s00_hw_down:
+ break;
+
+ case qib_sdma_state_s10_hw_start_up_wait:
+ __qib_sdma_process_event(ppd, qib_sdma_event_e20_hw_started);
+ break;
+
+ case qib_sdma_state_s20_idle:
+ break;
+
+ case qib_sdma_state_s30_sw_clean_up_wait:
+ break;
+
+ case qib_sdma_state_s40_hw_clean_up_wait:
+ break;
+
+ case qib_sdma_state_s50_hw_halt_wait:
+ __qib_sdma_process_event(ppd, qib_sdma_event_e60_hw_halted);
+ break;
+
+ case qib_sdma_state_s99_running:
+ /* too chatty to print here */
+ __qib_sdma_intr(ppd);
+ break;
+ }
+ spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+}
+
+static void qib_wantpiobuf_7220_intr(struct qib_devdata *dd, u32 needint)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->sendctrl_lock, flags);
+ if (needint) {
+ if (!(dd->sendctrl & SYM_MASK(SendCtrl, SendBufAvailUpd)))
+ goto done;
+ /*
+ * blip the availupd off, next write will be on, so
+ * we ensure an avail update, regardless of threshold or
+ * buffers becoming free, whenever we want an interrupt
+ */
+ qib_write_kreg(dd, kr_sendctrl, dd->sendctrl &
+ ~SYM_MASK(SendCtrl, SendBufAvailUpd));
+ qib_write_kreg(dd, kr_scratch, 0ULL);
+ dd->sendctrl |= SYM_MASK(SendCtrl, SendIntBufAvail);
+ } else
+ dd->sendctrl &= ~SYM_MASK(SendCtrl, SendIntBufAvail);
+ qib_write_kreg(dd, kr_sendctrl, dd->sendctrl);
+ qib_write_kreg(dd, kr_scratch, 0ULL);
+done:
+ spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+}
+
+/*
+ * Handle errors and unusual events first, separate function
+ * to improve cache hits for fast path interrupt handling.
+ */
+static noinline void unlikely_7220_intr(struct qib_devdata *dd, u64 istat)
+{
+ if (unlikely(istat & ~QLOGIC_IB_I_BITSEXTANT))
+ qib_dev_err(dd,
+ "interrupt with unknown interrupts %Lx set\n",
+ istat & ~QLOGIC_IB_I_BITSEXTANT);
+
+ if (istat & QLOGIC_IB_I_GPIO) {
+ u32 gpiostatus;
+
+ /*
+ * Boards for this chip currently don't use GPIO interrupts,
+ * so clear by writing GPIOstatus to GPIOclear, and complain
+ * to alert developer. To avoid endless repeats, clear
+ * the bits in the mask, since there is some kind of
+ * programming error or chip problem.
+ */
+ gpiostatus = qib_read_kreg32(dd, kr_gpio_status);
+ /*
+ * In theory, writing GPIOstatus to GPIOclear could
+ * have a bad side-effect on some diagnostic that wanted
+ * to poll for a status-change, but the various shadows
+ * make that problematic at best. Diags will just suppress
+ * all GPIO interrupts during such tests.
+ */
+ qib_write_kreg(dd, kr_gpio_clear, gpiostatus);
+
+ if (gpiostatus) {
+ const u32 mask = qib_read_kreg32(dd, kr_gpio_mask);
+ u32 gpio_irq = mask & gpiostatus;
+
+ /*
+ * A bit set in status and (chip) Mask register
+ * would cause an interrupt. Since we are not
+ * expecting any, report it. Also check that the
+ * chip reflects our shadow, report issues,
+ * and refresh from the shadow.
+ */
+ /*
+ * Clear any troublemakers, and update chip
+ * from shadow
+ */
+ dd->cspec->gpio_mask &= ~gpio_irq;
+ qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask);
+ }
+ }
+
+ if (istat & QLOGIC_IB_I_ERROR) {
+ u64 estat;
+
+ qib_stats.sps_errints++;
+ estat = qib_read_kreg64(dd, kr_errstatus);
+ if (!estat)
+ qib_devinfo(dd->pcidev,
+ "error interrupt (%Lx), but no error bits set!\n",
+ istat);
+ else
+ handle_7220_errors(dd, estat);
+ }
+}
+
+static irqreturn_t qib_7220intr(int irq, void *data)
+{
+ struct qib_devdata *dd = data;
+ irqreturn_t ret;
+ u64 istat;
+ u64 ctxtrbits;
+ u64 rmask;
+ unsigned i;
+
+ if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) {
+ /*
+ * This return value is not great, but we do not want the
+ * interrupt core code to remove our interrupt handler
+ * because we don't appear to be handling an interrupt
+ * during a chip reset.
+ */
+ ret = IRQ_HANDLED;
+ goto bail;
+ }
+
+ istat = qib_read_kreg64(dd, kr_intstatus);
+
+ if (unlikely(!istat)) {
+ ret = IRQ_NONE; /* not our interrupt, or already handled */
+ goto bail;
+ }
+ if (unlikely(istat == -1)) {
+ qib_bad_intrstatus(dd);
+ /* don't know if it was our interrupt or not */
+ ret = IRQ_NONE;
+ goto bail;
+ }
+
+ this_cpu_inc(*dd->int_counter);
+ if (unlikely(istat & (~QLOGIC_IB_I_BITSEXTANT |
+ QLOGIC_IB_I_GPIO | QLOGIC_IB_I_ERROR)))
+ unlikely_7220_intr(dd, istat);
+
+ /*
+ * Clear the interrupt bits we found set, relatively early, so we
+ * "know" know the chip will have seen this by the time we process
+ * the queue, and will re-interrupt if necessary. The processor
+ * itself won't take the interrupt again until we return.
+ */
+ qib_write_kreg(dd, kr_intclear, istat);
+
+ /*
+ * Handle kernel receive queues before checking for pio buffers
+ * available since receives can overflow; piobuf waiters can afford
+ * a few extra cycles, since they were waiting anyway.
+ */
+ ctxtrbits = istat &
+ ((QLOGIC_IB_I_RCVAVAIL_MASK << QLOGIC_IB_I_RCVAVAIL_SHIFT) |
+ (QLOGIC_IB_I_RCVURG_MASK << QLOGIC_IB_I_RCVURG_SHIFT));
+ if (ctxtrbits) {
+ rmask = (1ULL << QLOGIC_IB_I_RCVAVAIL_SHIFT) |
+ (1ULL << QLOGIC_IB_I_RCVURG_SHIFT);
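+ /*
+ * rmask holds the RcvAvail and RcvUrg bits for context 0;
+ * shifting it left once per iteration walks both bits across
+ * the kernel contexts in lockstep.
+ */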
+ for (i = 0; i < dd->first_user_ctxt; i++) {
+ if (ctxtrbits & rmask) {
+ ctxtrbits &= ~rmask;
+ qib_kreceive(dd->rcd[i], NULL, NULL);
+ }
+ rmask <<= 1;
+ }
+ if (ctxtrbits) {
+ ctxtrbits =
+ (ctxtrbits >> QLOGIC_IB_I_RCVAVAIL_SHIFT) |
+ (ctxtrbits >> QLOGIC_IB_I_RCVURG_SHIFT);
+ qib_handle_urcv(dd, ctxtrbits);
+ }
+ }
+
+ /* only call for SDmaInt */
+ if (istat & QLOGIC_IB_I_SDMAINT)
+ sdma_7220_intr(dd->pport, istat);
+
+ if ((istat & QLOGIC_IB_I_SPIOBUFAVAIL) && (dd->flags & QIB_INITTED))
+ qib_ib_piobufavail(dd);
+
+ ret = IRQ_HANDLED;
+bail:
+ return ret;
+}
+
+/*
+ * Set up our chip-specific interrupt handler.
+ * The interrupt type has already been setup, so
+ * we just need to do the registration and error checking.
+ * If we are using MSI interrupts, we may fall back to
+ * INTx later, if the interrupt handler doesn't get called
+ * within 1/2 second (see verify_interrupt()).
+ */
+static void qib_setup_7220_interrupt(struct qib_devdata *dd)
+{
+ if (!dd->cspec->irq)
+ qib_dev_err(dd,
+ "irq is 0, BIOS error? Interrupts won't work\n");
+ else {
+ int ret = request_irq(dd->cspec->irq, qib_7220intr,
+ dd->msi_lo ? 0 : IRQF_SHARED,
+ QIB_DRV_NAME, dd);
+
+ if (ret)
+ qib_dev_err(dd,
+ "Couldn't setup %s interrupt (irq=%d): %d\n",
+ dd->msi_lo ? "MSI" : "INTx",
+ dd->cspec->irq, ret);
+ }
+}
+
+/**
+ * qib_7220_boardname - fill in the board name
+ * @dd: the qlogic_ib device
+ *
+ * info is based on the board revision register
+ */
+static void qib_7220_boardname(struct qib_devdata *dd)
+{
+ char *n;
+ u32 boardid, namelen;
+
+ boardid = SYM_FIELD(dd->revision, Revision,
+ BoardID);
+
+ switch (boardid) {
+ case 1:
+ n = "InfiniPath_QLE7240";
+ break;
+ case 2:
+ n = "InfiniPath_QLE7280";
+ break;
+ default:
+ qib_dev_err(dd, "Unknown 7220 board with ID %u\n", boardid);
+ n = "Unknown_InfiniPath_7220";
+ break;
+ }
+
+ namelen = strlen(n) + 1;
+ dd->boardname = kmalloc(namelen, GFP_KERNEL);
+ if (!dd->boardname)
+ qib_dev_err(dd, "Failed allocation for board name: %s\n", n);
+ else
+ snprintf(dd->boardname, namelen, "%s", n);
+
+ if (dd->majrev != 5 || !dd->minrev || dd->minrev > 2)
+ qib_dev_err(dd,
+ "Unsupported InfiniPath hardware revision %u.%u!\n",
+ dd->majrev, dd->minrev);
+
+ snprintf(dd->boardversion, sizeof(dd->boardversion),
+ "ChipABI %u.%u, %s, InfiniPath%u %u.%u, SW Compat %u\n",
+ QIB_CHIP_VERS_MAJ, QIB_CHIP_VERS_MIN, dd->boardname,
+ (unsigned)SYM_FIELD(dd->revision, Revision_R, Arch),
+ dd->majrev, dd->minrev,
+ (unsigned)SYM_FIELD(dd->revision, Revision_R, SW));
+}
+
+/*
+ * This routine sleeps, so it can only be called from user context, not
+ * from interrupt context.
+ */
+static int qib_setup_7220_reset(struct qib_devdata *dd)
+{
+ u64 val;
+ int i;
+ int ret;
+ u16 cmdval;
+ u8 int_line, clinesz;
+ unsigned long flags;
+
+ qib_pcie_getcmd(dd, &cmdval, &int_line, &clinesz);
+
+ /* Use dev_err so it shows up in logs, etc. */
+ qib_dev_err(dd, "Resetting InfiniPath unit %u\n", dd->unit);
+
+ /* no interrupts till re-initted */
+ qib_7220_set_intr_state(dd, 0);
+
+ dd->pport->cpspec->ibdeltainprog = 0;
+ dd->pport->cpspec->ibsymdelta = 0;
+ dd->pport->cpspec->iblnkerrdelta = 0;
+
+ /*
+ * Keep chip from being accessed until we are ready. Use
+ * writeq() directly, to allow the write even though QIB_PRESENT
+ * isn't set.
+ */
+ dd->flags &= ~(QIB_INITTED | QIB_PRESENT);
+ /* so we check interrupts work again */
+ dd->z_int_counter = qib_int_counter(dd);
+ val = dd->control | QLOGIC_IB_C_RESET;
+ writeq(val, &dd->kregbase[kr_control]);
+ mb(); /* prevent compiler reordering around actual reset */
+
+ for (i = 1; i <= 5; i++) {
+ /*
+ * Allow MBIST, etc. to complete; longer on each retry.
+ * We sometimes get machine checks from bus timeout if no
+ * response, so for now, make it *really* long.
+ */
+ msleep(1000 + (1 + i) * 2000);
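+ /* i.e. waits of 5, 7, 9, 11 and 13 seconds across the five tries */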
+
+ qib_pcie_reenable(dd, cmdval, int_line, clinesz);
+
+ /*
+ * Use readq directly, so we don't need to mark it as PRESENT
+ * until we get a successful indication that all is well.
+ */
+ val = readq(&dd->kregbase[kr_revision]);
+ if (val == dd->revision) {
+ dd->flags |= QIB_PRESENT; /* it's back */
+ ret = qib_reinit_intr(dd);
+ goto bail;
+ }
+ }
+ ret = 0; /* failed */
+
+bail:
+ if (ret) {
+ if (qib_pcie_params(dd, dd->lbus_width, NULL, NULL))
+ qib_dev_err(dd,
+ "Reset failed to setup PCIe or interrupts; continuing anyway\n");
+
+ /* hold IBC in reset, no sends, etc till later */
+ qib_write_kreg(dd, kr_control, 0ULL);
+
+ /* clear the reset error, init error/hwerror mask */
+ qib_7220_init_hwerrors(dd);
+
+ /* do setup similar to speed or link-width changes */
+ if (dd->pport->cpspec->ibcddrctrl & IBA7220_IBC_IBTA_1_2_MASK)
+ dd->cspec->presets_needed = 1;
+ spin_lock_irqsave(&dd->pport->lflags_lock, flags);
+ dd->pport->lflags |= QIBL_IB_FORCE_NOTIFY;
+ dd->pport->lflags &= ~QIBL_IB_AUTONEG_FAILED;
+ spin_unlock_irqrestore(&dd->pport->lflags_lock, flags);
+ }
+
+ return ret;
+}
+
+/**
+ * qib_7220_put_tid - write a TID to the chip
+ * @dd: the qlogic_ib device
+ * @tidptr: pointer to the expected TID (in chip) to update
+ * @type: RCVHQ_RCV_TYPE_EAGER or RCVHQ_RCV_TYPE_EXPECTED
+ * @pa: physical address of in memory buffer; tidinvalid if freeing
+ */
+static void qib_7220_put_tid(struct qib_devdata *dd, u64 __iomem *tidptr,
+ u32 type, unsigned long pa)
+{
+ if (pa != dd->tidinvalid) {
+ u64 chippa = pa >> IBA7220_TID_PA_SHIFT;
+
+ /* paranoia checks */
+ if (pa != (chippa << IBA7220_TID_PA_SHIFT)) {
+ qib_dev_err(dd, "Physaddr %lx not 2KB aligned!\n",
+ pa);
+ return;
+ }
+ if (chippa >= (1UL << IBA7220_TID_SZ_SHIFT)) {
+ qib_dev_err(dd,
+ "Physical page address 0x%lx larger than supported\n",
+ pa);
+ return;
+ }
+
+ if (type == RCVHQ_RCV_TYPE_EAGER)
+ chippa |= dd->tidtemplate;
+ else /* for now, always full 4KB page */
+ chippa |= IBA7220_TID_SZ_4K;
+ pa = chippa;
+ }
+ writeq(pa, tidptr);
+ mmiowb();
+}
+
+/**
+ * qib_7220_clear_tids - clear all TID entries for a ctxt, expected and eager
+ * @dd: the qlogic_ib device
+ * @rcd: the context data
+ *
+ * clear all TID entries for a ctxt, expected and eager.
+ * Used from qib_close(). On this chip, TIDs are only 32 bits,
+ * not 64, but they are still on 64 bit boundaries, so tidbase
+ * is declared as u64 * for the pointer math, even though we write 32 bits
+ */
+static void qib_7220_clear_tids(struct qib_devdata *dd,
+ struct qib_ctxtdata *rcd)
+{
+ u64 __iomem *tidbase;
+ unsigned long tidinv;
+ u32 ctxt;
+ int i;
+
+ if (!dd->kregbase || !rcd)
+ return;
+
+ ctxt = rcd->ctxt;
+
+ tidinv = dd->tidinvalid;
+ tidbase = (u64 __iomem *)
+ ((char __iomem *)(dd->kregbase) +
+ dd->rcvtidbase +
+ ctxt * dd->rcvtidcnt * sizeof(*tidbase));
+
+ for (i = 0; i < dd->rcvtidcnt; i++)
+ qib_7220_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EXPECTED,
+ tidinv);
+
+ tidbase = (u64 __iomem *)
+ ((char __iomem *)(dd->kregbase) +
+ dd->rcvegrbase +
+ rcd->rcvegr_tid_base * sizeof(*tidbase));
+
+ for (i = 0; i < rcd->rcvegrcnt; i++)
+ qib_7220_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EAGER,
+ tidinv);
+}
+
+/**
+ * qib_7220_tidtemplate - setup constants for TID updates
+ * @dd: the qlogic_ib device
+ *
+ * We set up values that we use a lot, to avoid recalculating them each time
+ */
+static void qib_7220_tidtemplate(struct qib_devdata *dd)
+{
+ if (dd->rcvegrbufsize == 2048)
+ dd->tidtemplate = IBA7220_TID_SZ_2K;
+ else if (dd->rcvegrbufsize == 4096)
+ dd->tidtemplate = IBA7220_TID_SZ_4K;
+ dd->tidinvalid = 0;
+}
+
+/**
+ * qib_7220_get_base_info - set chip-specific flags for user code
+ * @rcd: the qlogic_ib ctxt
+ * @kinfo: qib_base_info pointer
+ *
+ * We set the PCIE flag because the lower bandwidth on PCIe vs
+ * HyperTransport can affect some user packet algorithms.
+ */
+static int qib_7220_get_base_info(struct qib_ctxtdata *rcd,
+ struct qib_base_info *kinfo)
+{
+ kinfo->spi_runtime_flags |= QIB_RUNTIME_PCIE |
+ QIB_RUNTIME_NODMA_RTAIL | QIB_RUNTIME_SDMA;
+
+ if (rcd->dd->flags & QIB_USE_SPCL_TRIG)
+ kinfo->spi_runtime_flags |= QIB_RUNTIME_SPECIAL_TRIGGER;
+
+ return 0;
+}
+
+static struct qib_message_header *
+qib_7220_get_msgheader(struct qib_devdata *dd, __le32 *rhf_addr)
+{
+ u32 offset = qib_hdrget_offset(rhf_addr);
+
+ return (struct qib_message_header *)
+ (rhf_addr - dd->rhf_offset + offset);
+}
+
+static void qib_7220_config_ctxts(struct qib_devdata *dd)
+{
+ unsigned long flags;
+ u32 nchipctxts;
+
+ nchipctxts = qib_read_kreg32(dd, kr_portcnt);
+ dd->cspec->numctxts = nchipctxts;
+ if (qib_n_krcv_queues > 1) {
+ dd->qpn_mask = 0x3e;
+ dd->first_user_ctxt = qib_n_krcv_queues * dd->num_pports;
+ if (dd->first_user_ctxt > nchipctxts)
+ dd->first_user_ctxt = nchipctxts;
+ } else
+ dd->first_user_ctxt = dd->num_pports;
+ dd->n_krcv_queues = dd->first_user_ctxt;
+
+ if (!qib_cfgctxts) {
+ int nctxts = dd->first_user_ctxt + num_online_cpus();
+
+ if (nctxts <= 5)
+ dd->ctxtcnt = 5;
+ else if (nctxts <= 9)
+ dd->ctxtcnt = 9;
+ else if (nctxts <= nchipctxts)
+ dd->ctxtcnt = nchipctxts;
+ } else if (qib_cfgctxts <= nchipctxts)
+ dd->ctxtcnt = qib_cfgctxts;
+ if (!dd->ctxtcnt) /* none of the above, set to max */
+ dd->ctxtcnt = nchipctxts;
+
+ /*
+ * Chip can be configured for 5, 9, or 17 ctxts, and choice
+ * affects number of eager TIDs per ctxt (1K, 2K, 4K).
+ * Lock to be paranoid about later motion, etc.
+ */
+ spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags);
+ if (dd->ctxtcnt > 9)
+ dd->rcvctrl |= 2ULL << IBA7220_R_CTXTCFG_SHIFT;
+ else if (dd->ctxtcnt > 5)
+ dd->rcvctrl |= 1ULL << IBA7220_R_CTXTCFG_SHIFT;
+ /* else configure for default 5 receive ctxts */
+ if (dd->qpn_mask)
+ dd->rcvctrl |= 1ULL << QIB_7220_RcvCtrl_RcvQPMapEnable_LSB;
+ qib_write_kreg(dd, kr_rcvctrl, dd->rcvctrl);
+ spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags);
+
+ /* kr_rcvegrcnt changes based on the number of contexts enabled */
+ dd->cspec->rcvegrcnt = qib_read_kreg32(dd, kr_rcvegrcnt);
+ dd->rcvhdrcnt = max(dd->cspec->rcvegrcnt, IBA7220_KRCVEGRCNT);
+}
+
+static int qib_7220_get_ib_cfg(struct qib_pportdata *ppd, int which)
+{
+ int lsb, ret = 0;
+ u64 maskr; /* right-justified mask */
+
+ switch (which) {
+ case QIB_IB_CFG_LWID_ENB: /* Get allowed Link-width */
+ ret = ppd->link_width_enabled;
+ goto done;
+
+ case QIB_IB_CFG_LWID: /* Get currently active Link-width */
+ ret = ppd->link_width_active;
+ goto done;
+
+ case QIB_IB_CFG_SPD_ENB: /* Get allowed Link speeds */
+ ret = ppd->link_speed_enabled;
+ goto done;
+
+ case QIB_IB_CFG_SPD: /* Get current Link spd */
+ ret = ppd->link_speed_active;
+ goto done;
+
+ case QIB_IB_CFG_RXPOL_ENB: /* Get Auto-RX-polarity enable */
+ lsb = IBA7220_IBC_RXPOL_SHIFT;
+ maskr = IBA7220_IBC_RXPOL_MASK;
+ break;
+
+ case QIB_IB_CFG_LREV_ENB: /* Get Auto-Lane-reversal enable */
+ lsb = IBA7220_IBC_LREV_SHIFT;
+ maskr = IBA7220_IBC_LREV_MASK;
+ break;
+
+ case QIB_IB_CFG_LINKLATENCY:
+ ret = qib_read_kreg64(ppd->dd, kr_ibcddrstatus)
+ & IBA7220_DDRSTAT_LINKLAT_MASK;
+ goto done;
+
+ case QIB_IB_CFG_OP_VLS:
+ ret = ppd->vls_operational;
+ goto done;
+
+ case QIB_IB_CFG_VL_HIGH_CAP:
+ ret = 0;
+ goto done;
+
+ case QIB_IB_CFG_VL_LOW_CAP:
+ ret = 0;
+ goto done;
+
+ case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
+ ret = SYM_FIELD(ppd->cpspec->ibcctrl, IBCCtrl,
+ OverrunThreshold);
+ goto done;
+
+ case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
+ ret = SYM_FIELD(ppd->cpspec->ibcctrl, IBCCtrl,
+ PhyerrThreshold);
+ goto done;
+
+ case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
+ /* will only take effect when the link state changes */
+ ret = (ppd->cpspec->ibcctrl &
+ SYM_MASK(IBCCtrl, LinkDownDefaultState)) ?
+ IB_LINKINITCMD_SLEEP : IB_LINKINITCMD_POLL;
+ goto done;
+
+ case QIB_IB_CFG_HRTBT: /* Get Heartbeat off/enable/auto */
+ lsb = IBA7220_IBC_HRTBT_SHIFT;
+ maskr = IBA7220_IBC_HRTBT_MASK;
+ break;
+
+ case QIB_IB_CFG_PMA_TICKS:
+ /*
+ * 0x00 = 10x link transfer rate, or 4 nsec for 2.5 Gb/s.
+ * Since the clock is always 250 MHz, the value is 1 or 0.
+ */
+ ret = (ppd->link_speed_active == QIB_IB_DDR);
+ goto done;
+
+ default:
+ ret = -EINVAL;
+ goto done;
+ }
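+ /*
+ * The cases that break out of the switch (rather than jumping
+ * to done) all read a right-justified field out of the
+ * ibcddrctrl shadow: shift down by lsb, then mask.
+ */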
+ ret = (int)((ppd->cpspec->ibcddrctrl >> lsb) & maskr);
+done:
+ return ret;
+}
+
+static int qib_7220_set_ib_cfg(struct qib_pportdata *ppd, int which, u32 val)
+{
+ struct qib_devdata *dd = ppd->dd;
+ u64 maskr; /* right-justified mask */
+ int lsb, ret = 0, setforce = 0;
+ u16 lcmd, licmd;
+ unsigned long flags;
+ u32 tmp = 0;
+
+ switch (which) {
+ case QIB_IB_CFG_LIDLMC:
+ /*
+ * Set LID and LMC. Combined to avoid a possible hazard; the
+ * caller puts the LMC in the 16 MSbits and the DLID in the
+ * 16 LSbits of val.
+ */
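+ /*
+ * Purely as an illustration (the callers are elsewhere and not
+ * shown here), the packed value looks roughly like
+ * val = (lmc << 16) | dlid;
+ */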
+ lsb = IBA7220_IBC_DLIDLMC_SHIFT;
+ maskr = IBA7220_IBC_DLIDLMC_MASK;
+ break;
+
+ case QIB_IB_CFG_LWID_ENB: /* set allowed Link-width */
+ /*
+ * As with speed, only write the actual register if
+ * the link is currently down, otherwise takes effect
+ * on next link change.
+ */
+ ppd->link_width_enabled = val;
+ if (!(ppd->lflags & QIBL_LINKDOWN))
+ goto bail;
+ /*
+ * We set the QIBL_IB_FORCE_NOTIFY bit so updown
+ * will get called because we want to update
+ * link_width_active, and the change may not take
+ * effect for some time (if we are in POLL), so this
+ * flag will force the updown routine to be called
+ * on the next ibstatuschange down interrupt, even
+ * if it's not a down->up transition.
+ */
+ val--; /* convert from IB to chip */
+ maskr = IBA7220_IBC_WIDTH_MASK;
+ lsb = IBA7220_IBC_WIDTH_SHIFT;
+ setforce = 1;
+ break;
+
+ case QIB_IB_CFG_SPD_ENB: /* set allowed Link speeds */
+ /*
+ * If we turn off IB1.2, need to preset SerDes defaults,
+ * but not right now. Set a flag for the next time
+ * we command the link down. As with width, only write the
+ * actual register if the link is currently down, otherwise
+ * takes effect on next link change. Since setting is being
+ * explicitly requested (via MAD or sysfs), clear autoneg
+ * failure status if speed autoneg is enabled.
+ */
+ ppd->link_speed_enabled = val;
+ if ((ppd->cpspec->ibcddrctrl & IBA7220_IBC_IBTA_1_2_MASK) &&
+ !(val & (val - 1)))
+ dd->cspec->presets_needed = 1;
+ if (!(ppd->lflags & QIBL_LINKDOWN))
+ goto bail;
+ /*
+ * We set the QIBL_IB_FORCE_NOTIFY bit so updown
+ * will get called because we want to update
+ * link_speed_active, and the change may not take
+ * effect for some time (if we are in POLL), so this
+ * flag will force the updown routine to be called
+ * on the next ibstatuschange down interrupt, even
+ * if it's not a down->up transition.
+ */
+ if (val == (QIB_IB_SDR | QIB_IB_DDR)) {
+ val = IBA7220_IBC_SPEED_AUTONEG_MASK |
+ IBA7220_IBC_IBTA_1_2_MASK;
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags &= ~QIBL_IB_AUTONEG_FAILED;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ } else
+ val = val == QIB_IB_DDR ?
+ IBA7220_IBC_SPEED_DDR : IBA7220_IBC_SPEED_SDR;
+ maskr = IBA7220_IBC_SPEED_AUTONEG_MASK |
+ IBA7220_IBC_IBTA_1_2_MASK;
+ /* IBTA 1.2 mode + speed bits are contiguous */
+ lsb = SYM_LSB(IBCDDRCtrl, IB_ENHANCED_MODE);
+ setforce = 1;
+ break;
+
+ case QIB_IB_CFG_RXPOL_ENB: /* set Auto-RX-polarity enable */
+ lsb = IBA7220_IBC_RXPOL_SHIFT;
+ maskr = IBA7220_IBC_RXPOL_MASK;
+ break;
+
+ case QIB_IB_CFG_LREV_ENB: /* set Auto-Lane-reversal enable */
+ lsb = IBA7220_IBC_LREV_SHIFT;
+ maskr = IBA7220_IBC_LREV_MASK;
+ break;
+
+ case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
+ maskr = SYM_FIELD(ppd->cpspec->ibcctrl, IBCCtrl,
+ OverrunThreshold);
+ if (maskr != val) {
+ ppd->cpspec->ibcctrl &=
+ ~SYM_MASK(IBCCtrl, OverrunThreshold);
+ ppd->cpspec->ibcctrl |= (u64) val <<
+ SYM_LSB(IBCCtrl, OverrunThreshold);
+ qib_write_kreg(dd, kr_ibcctrl, ppd->cpspec->ibcctrl);
+ qib_write_kreg(dd, kr_scratch, 0);
+ }
+ goto bail;
+
+ case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
+ maskr = SYM_FIELD(ppd->cpspec->ibcctrl, IBCCtrl,
+ PhyerrThreshold);
+ if (maskr != val) {
+ ppd->cpspec->ibcctrl &=
+ ~SYM_MASK(IBCCtrl, PhyerrThreshold);
+ ppd->cpspec->ibcctrl |= (u64) val <<
+ SYM_LSB(IBCCtrl, PhyerrThreshold);
+ qib_write_kreg(dd, kr_ibcctrl, ppd->cpspec->ibcctrl);
+ qib_write_kreg(dd, kr_scratch, 0);
+ }
+ goto bail;
+
+ case QIB_IB_CFG_PKEYS: /* update pkeys */
+ maskr = (u64) ppd->pkeys[0] | ((u64) ppd->pkeys[1] << 16) |
+ ((u64) ppd->pkeys[2] << 32) |
+ ((u64) ppd->pkeys[3] << 48);
+ qib_write_kreg(dd, kr_partitionkey, maskr);
+ goto bail;
+
+ case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
+ /* will only take effect when the link state changes */
+ if (val == IB_LINKINITCMD_POLL)
+ ppd->cpspec->ibcctrl &=
+ ~SYM_MASK(IBCCtrl, LinkDownDefaultState);
+ else /* SLEEP */
+ ppd->cpspec->ibcctrl |=
+ SYM_MASK(IBCCtrl, LinkDownDefaultState);
+ qib_write_kreg(dd, kr_ibcctrl, ppd->cpspec->ibcctrl);
+ qib_write_kreg(dd, kr_scratch, 0);
+ goto bail;
+
+ case QIB_IB_CFG_MTU: /* update the MTU in IBC */
+ /*
+ * Update our housekeeping variables, and set IBC max
+ * size, same as init code; max IBC is max we allow in
+ * buffer, less the qword pbc, plus 1 for ICRC, in dwords
+ * Set even if it's unchanged, print debug message only
+ * on changes.
+ */
+ val = (ppd->ibmaxlen >> 2) + 1;
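+ /*
+ * ibmaxlen is a byte count; >> 2 converts it to dwords and the
+ * + 1 accounts for the trailing ICRC dword.
+ */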
+ ppd->cpspec->ibcctrl &= ~SYM_MASK(IBCCtrl, MaxPktLen);
+ ppd->cpspec->ibcctrl |= (u64)val << SYM_LSB(IBCCtrl, MaxPktLen);
+ qib_write_kreg(dd, kr_ibcctrl, ppd->cpspec->ibcctrl);
+ qib_write_kreg(dd, kr_scratch, 0);
+ goto bail;
+
+ case QIB_IB_CFG_LSTATE: /* set the IB link state */
+ switch (val & 0xffff0000) {
+ case IB_LINKCMD_DOWN:
+ lcmd = QLOGIC_IB_IBCC_LINKCMD_DOWN;
+ if (!ppd->cpspec->ibdeltainprog &&
+ qib_compat_ddr_negotiate) {
+ ppd->cpspec->ibdeltainprog = 1;
+ ppd->cpspec->ibsymsnap =
+ read_7220_creg32(dd, cr_ibsymbolerr);
+ ppd->cpspec->iblnkerrsnap =
+ read_7220_creg32(dd, cr_iblinkerrrecov);
+ }
+ break;
+
+ case IB_LINKCMD_ARMED:
+ lcmd = QLOGIC_IB_IBCC_LINKCMD_ARMED;
+ break;
+
+ case IB_LINKCMD_ACTIVE:
+ lcmd = QLOGIC_IB_IBCC_LINKCMD_ACTIVE;
+ break;
+
+ default:
+ ret = -EINVAL;
+ qib_dev_err(dd, "bad linkcmd req 0x%x\n", val >> 16);
+ goto bail;
+ }
+ switch (val & 0xffff) {
+ case IB_LINKINITCMD_NOP:
+ licmd = 0;
+ break;
+
+ case IB_LINKINITCMD_POLL:
+ licmd = QLOGIC_IB_IBCC_LINKINITCMD_POLL;
+ break;
+
+ case IB_LINKINITCMD_SLEEP:
+ licmd = QLOGIC_IB_IBCC_LINKINITCMD_SLEEP;
+ break;
+
+ case IB_LINKINITCMD_DISABLE:
+ licmd = QLOGIC_IB_IBCC_LINKINITCMD_DISABLE;
+ ppd->cpspec->chase_end = 0;
+ /*
+ * stop state chase counter and timer, if running.
+ * wait for a pending timer, but don't clear .data (ppd)!
+ */
+ if (ppd->cpspec->chase_timer.expires) {
+ del_timer_sync(&ppd->cpspec->chase_timer);
+ ppd->cpspec->chase_timer.expires = 0;
+ }
+ break;
+
+ default:
+ ret = -EINVAL;
+ qib_dev_err(dd, "bad linkinitcmd req 0x%x\n",
+ val & 0xffff);
+ goto bail;
+ }
+ qib_set_ib_7220_lstate(ppd, lcmd, licmd);
+
+ maskr = IBA7220_IBC_WIDTH_MASK;
+ lsb = IBA7220_IBC_WIDTH_SHIFT;
+ tmp = (ppd->cpspec->ibcddrctrl >> lsb) & maskr;
+ /* If the width active on the chip does not match the
+ * width in the shadow register, write the new active
+ * width to the chip.
+ * We don't have to worry about speed as the speed is taken
+ * care of by set_7220_ibspeed_fast called by ib_updown.
+ */
+ if (ppd->link_width_enabled-1 != tmp) {
+ ppd->cpspec->ibcddrctrl &= ~(maskr << lsb);
+ ppd->cpspec->ibcddrctrl |=
+ (((u64)(ppd->link_width_enabled-1) & maskr) <<
+ lsb);
+ qib_write_kreg(dd, kr_ibcddrctrl,
+ ppd->cpspec->ibcddrctrl);
+ qib_write_kreg(dd, kr_scratch, 0);
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags |= QIBL_IB_FORCE_NOTIFY;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ }
+ goto bail;
+
+ case QIB_IB_CFG_HRTBT: /* set Heartbeat off/enable/auto */
+ if (val > IBA7220_IBC_HRTBT_MASK) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ lsb = IBA7220_IBC_HRTBT_SHIFT;
+ maskr = IBA7220_IBC_HRTBT_MASK;
+ break;
+
+ default:
+ ret = -EINVAL;
+ goto bail;
+ }
+ ppd->cpspec->ibcddrctrl &= ~(maskr << lsb);
+ ppd->cpspec->ibcddrctrl |= (((u64) val & maskr) << lsb);
+ qib_write_kreg(dd, kr_ibcddrctrl, ppd->cpspec->ibcddrctrl);
+ qib_write_kreg(dd, kr_scratch, 0);
+ if (setforce) {
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags |= QIBL_IB_FORCE_NOTIFY;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ }
+bail:
+ return ret;
+}
+
+static int qib_7220_set_loopback(struct qib_pportdata *ppd, const char *what)
+{
+ int ret = 0;
+ u64 val, ddr;
+
+ if (!strncmp(what, "ibc", 3)) {
+ ppd->cpspec->ibcctrl |= SYM_MASK(IBCCtrl, Loopback);
+ val = 0; /* disable heart beat, so link will come up */
+ qib_devinfo(ppd->dd->pcidev, "Enabling IB%u:%u IBC loopback\n",
+ ppd->dd->unit, ppd->port);
+ } else if (!strncmp(what, "off", 3)) {
+ ppd->cpspec->ibcctrl &= ~SYM_MASK(IBCCtrl, Loopback);
+ /* enable heart beat again */
+ val = IBA7220_IBC_HRTBT_MASK << IBA7220_IBC_HRTBT_SHIFT;
+ qib_devinfo(ppd->dd->pcidev,
+ "Disabling IB%u:%u IBC loopback (normal)\n",
+ ppd->dd->unit, ppd->port);
+ } else
+ ret = -EINVAL;
+ if (!ret) {
+ qib_write_kreg(ppd->dd, kr_ibcctrl, ppd->cpspec->ibcctrl);
+ ddr = ppd->cpspec->ibcddrctrl & ~(IBA7220_IBC_HRTBT_MASK
+ << IBA7220_IBC_HRTBT_SHIFT);
+ ppd->cpspec->ibcddrctrl = ddr | val;
+ qib_write_kreg(ppd->dd, kr_ibcddrctrl,
+ ppd->cpspec->ibcddrctrl);
+ qib_write_kreg(ppd->dd, kr_scratch, 0);
+ }
+ return ret;
+}
+
+static void qib_update_7220_usrhead(struct qib_ctxtdata *rcd, u64 hd,
+ u32 updegr, u32 egrhd, u32 npkts)
+{
+ if (updegr)
+ qib_write_ureg(rcd->dd, ur_rcvegrindexhead, egrhd, rcd->ctxt);
+ mmiowb();
+ qib_write_ureg(rcd->dd, ur_rcvhdrhead, hd, rcd->ctxt);
+ mmiowb();
+}
+
+static u32 qib_7220_hdrqempty(struct qib_ctxtdata *rcd)
+{
+ u32 head, tail;
+
+ head = qib_read_ureg32(rcd->dd, ur_rcvhdrhead, rcd->ctxt);
+ if (rcd->rcvhdrtail_kvaddr)
+ tail = qib_get_rcvhdrtail(rcd);
+ else
+ tail = qib_read_ureg32(rcd->dd, ur_rcvhdrtail, rcd->ctxt);
+ return head == tail;
+}
+
+/*
+ * Modify the RCVCTRL register in a chip-specific way. This
+ * is a function because bit positions and (future) register
+ * location are chip-specific, but the needed operations are
+ * generic. <op> is a bit-mask because we often want to
+ * do multiple modifications.
+ */
+static void rcvctrl_7220_mod(struct qib_pportdata *ppd, unsigned int op,
+ int ctxt)
+{
+ struct qib_devdata *dd = ppd->dd;
+ u64 mask, val;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags);
+ if (op & QIB_RCVCTRL_TAILUPD_ENB)
+ dd->rcvctrl |= (1ULL << IBA7220_R_TAILUPD_SHIFT);
+ if (op & QIB_RCVCTRL_TAILUPD_DIS)
+ dd->rcvctrl &= ~(1ULL << IBA7220_R_TAILUPD_SHIFT);
+ if (op & QIB_RCVCTRL_PKEY_ENB)
+ dd->rcvctrl &= ~(1ULL << IBA7220_R_PKEY_DIS_SHIFT);
+ if (op & QIB_RCVCTRL_PKEY_DIS)
+ dd->rcvctrl |= (1ULL << IBA7220_R_PKEY_DIS_SHIFT);
+ if (ctxt < 0)
+ mask = (1ULL << dd->ctxtcnt) - 1;
+ else
+ mask = (1ULL << ctxt);
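+ /*
+ * A negative ctxt means the operation applies to all configured
+ * contexts; otherwise only the single context's bit is touched.
+ */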
+ if (op & QIB_RCVCTRL_CTXT_ENB) {
+ /* always done for specific ctxt */
+ dd->rcvctrl |= (mask << SYM_LSB(RcvCtrl, PortEnable));
+ if (!(dd->flags & QIB_NODMA_RTAIL))
+ dd->rcvctrl |= 1ULL << IBA7220_R_TAILUPD_SHIFT;
+ /* Write these registers before the context is enabled. */
+ qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr, ctxt,
+ dd->rcd[ctxt]->rcvhdrqtailaddr_phys);
+ qib_write_kreg_ctxt(dd, kr_rcvhdraddr, ctxt,
+ dd->rcd[ctxt]->rcvhdrq_phys);
+ dd->rcd[ctxt]->seq_cnt = 1;
+ }
+ if (op & QIB_RCVCTRL_CTXT_DIS)
+ dd->rcvctrl &= ~(mask << SYM_LSB(RcvCtrl, PortEnable));
+ if (op & QIB_RCVCTRL_INTRAVAIL_ENB)
+ dd->rcvctrl |= (mask << IBA7220_R_INTRAVAIL_SHIFT);
+ if (op & QIB_RCVCTRL_INTRAVAIL_DIS)
+ dd->rcvctrl &= ~(mask << IBA7220_R_INTRAVAIL_SHIFT);
+ qib_write_kreg(dd, kr_rcvctrl, dd->rcvctrl);
+ if ((op & QIB_RCVCTRL_INTRAVAIL_ENB) && dd->rhdrhead_intr_off) {
+ /* arm rcv interrupt */
+ val = qib_read_ureg32(dd, ur_rcvhdrhead, ctxt) |
+ dd->rhdrhead_intr_off;
+ qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt);
+ }
+ if (op & QIB_RCVCTRL_CTXT_ENB) {
+ /*
+ * Init the context registers also; if we were
+ * disabled, tail and head should both be zero
+ * already from the enable, but since we don't
+ * know, we have to do it explicitly.
+ */
+ val = qib_read_ureg32(dd, ur_rcvegrindextail, ctxt);
+ qib_write_ureg(dd, ur_rcvegrindexhead, val, ctxt);
+
+ val = qib_read_ureg32(dd, ur_rcvhdrtail, ctxt);
+ dd->rcd[ctxt]->head = val;
+ /* If kctxt, interrupt on next receive. */
+ if (ctxt < dd->first_user_ctxt)
+ val |= dd->rhdrhead_intr_off;
+ qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt);
+ }
+ if (op & QIB_RCVCTRL_CTXT_DIS) {
+ if (ctxt >= 0) {
+ qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr, ctxt, 0);
+ qib_write_kreg_ctxt(dd, kr_rcvhdraddr, ctxt, 0);
+ } else {
+ unsigned i;
+
+ for (i = 0; i < dd->cfgctxts; i++) {
+ qib_write_kreg_ctxt(dd, kr_rcvhdrtailaddr,
+ i, 0);
+ qib_write_kreg_ctxt(dd, kr_rcvhdraddr, i, 0);
+ }
+ }
+ }
+ spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags);
+}
+
+/*
+ * Modify the SENDCTRL register in a chip-specific way. This
+ * is a function because there may be multiple such registers with
+ * slightly different layouts. To start, we assume the
+ * "canonical" register layout of the first chips.
+ * The chip requires no back-to-back sendctrl writes, so we write
+ * the scratch register after writing sendctrl.
+ */
+static void sendctrl_7220_mod(struct qib_pportdata *ppd, u32 op)
+{
+ struct qib_devdata *dd = ppd->dd;
+ u64 tmp_dd_sendctrl;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->sendctrl_lock, flags);
+
+ /* First the ones that are "sticky", saved in shadow */
+ if (op & QIB_SENDCTRL_CLEAR)
+ dd->sendctrl = 0;
+ if (op & QIB_SENDCTRL_SEND_DIS)
+ dd->sendctrl &= ~SYM_MASK(SendCtrl, SPioEnable);
+ else if (op & QIB_SENDCTRL_SEND_ENB) {
+ dd->sendctrl |= SYM_MASK(SendCtrl, SPioEnable);
+ if (dd->flags & QIB_USE_SPCL_TRIG)
+ dd->sendctrl |= SYM_MASK(SendCtrl,
+ SSpecialTriggerEn);
+ }
+ if (op & QIB_SENDCTRL_AVAIL_DIS)
+ dd->sendctrl &= ~SYM_MASK(SendCtrl, SendBufAvailUpd);
+ else if (op & QIB_SENDCTRL_AVAIL_ENB)
+ dd->sendctrl |= SYM_MASK(SendCtrl, SendBufAvailUpd);
+
+ if (op & QIB_SENDCTRL_DISARM_ALL) {
+ u32 i, last;
+
+ tmp_dd_sendctrl = dd->sendctrl;
+ /*
+ * disarm any that are not yet launched, disabling sends
+ * and updates until done.
+ */
+ last = dd->piobcnt2k + dd->piobcnt4k;
+ tmp_dd_sendctrl &=
+ ~(SYM_MASK(SendCtrl, SPioEnable) |
+ SYM_MASK(SendCtrl, SendBufAvailUpd));
+ for (i = 0; i < last; i++) {
+ qib_write_kreg(dd, kr_sendctrl,
+ tmp_dd_sendctrl |
+ SYM_MASK(SendCtrl, Disarm) | i);
+ qib_write_kreg(dd, kr_scratch, 0);
+ }
+ }
+
+ tmp_dd_sendctrl = dd->sendctrl;
+
+ if (op & QIB_SENDCTRL_FLUSH)
+ tmp_dd_sendctrl |= SYM_MASK(SendCtrl, Abort);
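+ /*
+ * For QIB_SENDCTRL_DISARM, the low bits of op carry the number
+ * of the PIO buffer to disarm (see the QIB_SENDCTRL_DISARM_BUF()
+ * usage in autoneg_7220_sendpkt() below).
+ */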
+ if (op & QIB_SENDCTRL_DISARM)
+ tmp_dd_sendctrl |= SYM_MASK(SendCtrl, Disarm) |
+ ((op & QIB_7220_SendCtrl_DisarmPIOBuf_RMASK) <<
+ SYM_LSB(SendCtrl, DisarmPIOBuf));
+ if ((op & QIB_SENDCTRL_AVAIL_BLIP) &&
+ (dd->sendctrl & SYM_MASK(SendCtrl, SendBufAvailUpd)))
+ tmp_dd_sendctrl &= ~SYM_MASK(SendCtrl, SendBufAvailUpd);
+
+ qib_write_kreg(dd, kr_sendctrl, tmp_dd_sendctrl);
+ qib_write_kreg(dd, kr_scratch, 0);
+
+ if (op & QIB_SENDCTRL_AVAIL_BLIP) {
+ qib_write_kreg(dd, kr_sendctrl, dd->sendctrl);
+ qib_write_kreg(dd, kr_scratch, 0);
+ }
+
+ spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+
+ if (op & QIB_SENDCTRL_FLUSH) {
+ u32 v;
+ /*
+ * ensure writes have hit chip, then do a few
+ * more reads, to allow DMA of pioavail registers
+ * to occur, so in-memory copy is in sync with
+ * the chip. Not always safe to sleep.
+ */
+ v = qib_read_kreg32(dd, kr_scratch);
+ qib_write_kreg(dd, kr_scratch, v);
+ v = qib_read_kreg32(dd, kr_scratch);
+ qib_write_kreg(dd, kr_scratch, v);
+ qib_read_kreg32(dd, kr_scratch);
+ }
+}
+
+/**
+ * qib_portcntr_7220 - read a per-port counter
+ * @ppd: the qlogic_ib port
+ * @reg: the counter to snapshot
+ */
+static u64 qib_portcntr_7220(struct qib_pportdata *ppd, u32 reg)
+{
+ u64 ret = 0ULL;
+ struct qib_devdata *dd = ppd->dd;
+ u16 creg;
+ /* 0xffff for unimplemented or synthesized counters */
+ static const u16 xlator[] = {
+ [QIBPORTCNTR_PKTSEND] = cr_pktsend,
+ [QIBPORTCNTR_WORDSEND] = cr_wordsend,
+ [QIBPORTCNTR_PSXMITDATA] = cr_psxmitdatacount,
+ [QIBPORTCNTR_PSXMITPKTS] = cr_psxmitpktscount,
+ [QIBPORTCNTR_PSXMITWAIT] = cr_psxmitwaitcount,
+ [QIBPORTCNTR_SENDSTALL] = cr_sendstall,
+ [QIBPORTCNTR_PKTRCV] = cr_pktrcv,
+ [QIBPORTCNTR_PSRCVDATA] = cr_psrcvdatacount,
+ [QIBPORTCNTR_PSRCVPKTS] = cr_psrcvpktscount,
+ [QIBPORTCNTR_RCVEBP] = cr_rcvebp,
+ [QIBPORTCNTR_RCVOVFL] = cr_rcvovfl,
+ [QIBPORTCNTR_WORDRCV] = cr_wordrcv,
+ [QIBPORTCNTR_RXDROPPKT] = cr_rxdroppkt,
+ [QIBPORTCNTR_RXLOCALPHYERR] = cr_rxotherlocalphyerr,
+ [QIBPORTCNTR_RXVLERR] = cr_rxvlerr,
+ [QIBPORTCNTR_ERRICRC] = cr_erricrc,
+ [QIBPORTCNTR_ERRVCRC] = cr_errvcrc,
+ [QIBPORTCNTR_ERRLPCRC] = cr_errlpcrc,
+ [QIBPORTCNTR_BADFORMAT] = cr_badformat,
+ [QIBPORTCNTR_ERR_RLEN] = cr_err_rlen,
+ [QIBPORTCNTR_IBSYMBOLERR] = cr_ibsymbolerr,
+ [QIBPORTCNTR_INVALIDRLEN] = cr_invalidrlen,
+ [QIBPORTCNTR_UNSUPVL] = cr_txunsupvl,
+ [QIBPORTCNTR_EXCESSBUFOVFL] = cr_excessbufferovfl,
+ [QIBPORTCNTR_ERRLINK] = cr_errlink,
+ [QIBPORTCNTR_IBLINKDOWN] = cr_iblinkdown,
+ [QIBPORTCNTR_IBLINKERRRECOV] = cr_iblinkerrrecov,
+ [QIBPORTCNTR_LLI] = cr_locallinkintegrityerr,
+ [QIBPORTCNTR_PSINTERVAL] = cr_psinterval,
+ [QIBPORTCNTR_PSSTART] = cr_psstart,
+ [QIBPORTCNTR_PSSTAT] = cr_psstat,
+ [QIBPORTCNTR_VL15PKTDROP] = cr_vl15droppedpkt,
+ [QIBPORTCNTR_ERRPKEY] = cr_errpkey,
+ [QIBPORTCNTR_KHDROVFL] = 0xffff,
+ };
+
+ if (reg >= ARRAY_SIZE(xlator)) {
+ qib_devinfo(ppd->dd->pcidev,
+ "Unimplemented portcounter %u\n", reg);
+ goto done;
+ }
+ creg = xlator[reg];
+
+ if (reg == QIBPORTCNTR_KHDROVFL) {
+ int i;
+
+ /* sum over all kernel contexts */
+ for (i = 0; i < dd->first_user_ctxt; i++)
+ ret += read_7220_creg32(dd, cr_portovfl + i);
+ }
+ if (creg == 0xffff)
+ goto done;
+
+ /*
+ * only fast-incrementing counters are 64-bit; use 32-bit reads to
+ * avoid two independent reads when on Opteron
+ */
+ if ((creg == cr_wordsend || creg == cr_wordrcv ||
+ creg == cr_pktsend || creg == cr_pktrcv))
+ ret = read_7220_creg(dd, creg);
+ else
+ ret = read_7220_creg32(dd, creg);
+ if (creg == cr_ibsymbolerr) {
+ if (dd->pport->cpspec->ibdeltainprog)
+ ret -= ret - ppd->cpspec->ibsymsnap;
+ ret -= dd->pport->cpspec->ibsymdelta;
+ } else if (creg == cr_iblinkerrrecov) {
+ if (dd->pport->cpspec->ibdeltainprog)
+ ret -= ret - ppd->cpspec->iblnkerrsnap;
+ ret -= dd->pport->cpspec->iblnkerrdelta;
+ }
+done:
+ return ret;
+}
+
+/*
+ * Device counter names (not port-specific), one line per stat,
+ * single string. Used by utilities like ipathstats to print the stats
+ * in a way which works for different versions of drivers, without changing
+ * the utility. Names need to be 12 chars or less (w/o newline), for proper
+ * display by utility.
+ * Non-error counters are first.
+ * Start of "error" conters is indicated by a leading "E " on the first
+ * "error" counter, and doesn't count in label length.
+ * The EgrOvfl list needs to be last so we truncate them at the configured
+ * context count for the device.
+ * cntr7220indices contains the corresponding register indices.
+ */
+static const char cntr7220names[] =
+ "Interrupts\n"
+ "HostBusStall\n"
+ "E RxTIDFull\n"
+ "RxTIDInvalid\n"
+ "Ctxt0EgrOvfl\n"
+ "Ctxt1EgrOvfl\n"
+ "Ctxt2EgrOvfl\n"
+ "Ctxt3EgrOvfl\n"
+ "Ctxt4EgrOvfl\n"
+ "Ctxt5EgrOvfl\n"
+ "Ctxt6EgrOvfl\n"
+ "Ctxt7EgrOvfl\n"
+ "Ctxt8EgrOvfl\n"
+ "Ctxt9EgrOvfl\n"
+ "Ctx10EgrOvfl\n"
+ "Ctx11EgrOvfl\n"
+ "Ctx12EgrOvfl\n"
+ "Ctx13EgrOvfl\n"
+ "Ctx14EgrOvfl\n"
+ "Ctx15EgrOvfl\n"
+ "Ctx16EgrOvfl\n";
+
+static const size_t cntr7220indices[] = {
+ cr_lbint,
+ cr_lbflowstall,
+ cr_errtidfull,
+ cr_errtidvalid,
+ cr_portovfl + 0,
+ cr_portovfl + 1,
+ cr_portovfl + 2,
+ cr_portovfl + 3,
+ cr_portovfl + 4,
+ cr_portovfl + 5,
+ cr_portovfl + 6,
+ cr_portovfl + 7,
+ cr_portovfl + 8,
+ cr_portovfl + 9,
+ cr_portovfl + 10,
+ cr_portovfl + 11,
+ cr_portovfl + 12,
+ cr_portovfl + 13,
+ cr_portovfl + 14,
+ cr_portovfl + 15,
+ cr_portovfl + 16,
+};
+
+/*
+ * Same as cntr7220names and cntr7220indices, but for port-specific counters.
+ * portcntr7220indices is somewhat complicated by some registers needing
+ * adjustments of various kinds, and those are ORed with _PORT_VIRT_FLAG.
+ */
+static const char portcntr7220names[] =
+ "TxPkt\n"
+ "TxFlowPkt\n"
+ "TxWords\n"
+ "RxPkt\n"
+ "RxFlowPkt\n"
+ "RxWords\n"
+ "TxFlowStall\n"
+ "TxDmaDesc\n" /* 7220 and 7322-only */
+ "E RxDlidFltr\n" /* 7220 and 7322-only */
+ "IBStatusChng\n"
+ "IBLinkDown\n"
+ "IBLnkRecov\n"
+ "IBRxLinkErr\n"
+ "IBSymbolErr\n"
+ "RxLLIErr\n"
+ "RxBadFormat\n"
+ "RxBadLen\n"
+ "RxBufOvrfl\n"
+ "RxEBP\n"
+ "RxFlowCtlErr\n"
+ "RxICRCerr\n"
+ "RxLPCRCerr\n"
+ "RxVCRCerr\n"
+ "RxInvalLen\n"
+ "RxInvalPKey\n"
+ "RxPktDropped\n"
+ "TxBadLength\n"
+ "TxDropped\n"
+ "TxInvalLen\n"
+ "TxUnderrun\n"
+ "TxUnsupVL\n"
+ "RxLclPhyErr\n" /* 7220 and 7322-only */
+ "RxVL15Drop\n" /* 7220 and 7322-only */
+ "RxVlErr\n" /* 7220 and 7322-only */
+ "XcessBufOvfl\n" /* 7220 and 7322-only */
+ ;
+
+#define _PORT_VIRT_FLAG 0x8000 /* "virtual", need adjustments */
+static const size_t portcntr7220indices[] = {
+ QIBPORTCNTR_PKTSEND | _PORT_VIRT_FLAG,
+ cr_pktsendflow,
+ QIBPORTCNTR_WORDSEND | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_PKTRCV | _PORT_VIRT_FLAG,
+ cr_pktrcvflowctrl,
+ QIBPORTCNTR_WORDRCV | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_SENDSTALL | _PORT_VIRT_FLAG,
+ cr_txsdmadesc,
+ cr_rxdlidfltr,
+ cr_ibstatuschange,
+ QIBPORTCNTR_IBLINKDOWN | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_IBLINKERRRECOV | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_ERRLINK | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_IBSYMBOLERR | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_LLI | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_BADFORMAT | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_ERR_RLEN | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_RCVOVFL | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_RCVEBP | _PORT_VIRT_FLAG,
+ cr_rcvflowctrl_err,
+ QIBPORTCNTR_ERRICRC | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_ERRLPCRC | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_ERRVCRC | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_INVALIDRLEN | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_ERRPKEY | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_RXDROPPKT | _PORT_VIRT_FLAG,
+ cr_invalidslen,
+ cr_senddropped,
+ cr_errslen,
+ cr_sendunderrun,
+ cr_txunsupvl,
+ QIBPORTCNTR_RXLOCALPHYERR | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_VL15PKTDROP | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_RXVLERR | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_EXCESSBUFOVFL | _PORT_VIRT_FLAG,
+};
+
+/* do all the setup to make the counter reads efficient later */
+static void init_7220_cntrnames(struct qib_devdata *dd)
+{
+ int i, j = 0;
+ char *s;
+
+ for (i = 0, s = (char *)cntr7220names; s && j <= dd->cfgctxts;
+ i++) {
+ /* we always have at least one counter before the egrovfl */
+ if (!j && !strncmp("Ctxt0EgrOvfl", s + 1, 12))
+ j = 1;
+ s = strchr(s + 1, '\n');
+ if (s && j)
+ j++;
+ }
+ dd->cspec->ncntrs = i;
+ if (!s)
+ /* full list; size is without terminating null */
+ dd->cspec->cntrnamelen = sizeof(cntr7220names) - 1;
+ else
+ dd->cspec->cntrnamelen = 1 + s - cntr7220names;
+ dd->cspec->cntrs = kmalloc(dd->cspec->ncntrs
+ * sizeof(u64), GFP_KERNEL);
+ if (!dd->cspec->cntrs)
+ qib_dev_err(dd, "Failed allocation for counters\n");
+
+ for (i = 0, s = (char *)portcntr7220names; s; i++)
+ s = strchr(s + 1, '\n');
+ dd->cspec->nportcntrs = i - 1;
+ dd->cspec->portcntrnamelen = sizeof(portcntr7220names) - 1;
+ dd->cspec->portcntrs = kmalloc(dd->cspec->nportcntrs
+ * sizeof(u64), GFP_KERNEL);
+ if (!dd->cspec->portcntrs)
+ qib_dev_err(dd, "Failed allocation for portcounters\n");
+}
+
+static u32 qib_read_7220cntrs(struct qib_devdata *dd, loff_t pos, char **namep,
+ u64 **cntrp)
+{
+ u32 ret;
+
+ if (!dd->cspec->cntrs) {
+ ret = 0;
+ goto done;
+ }
+
+ if (namep) {
+ *namep = (char *)cntr7220names;
+ ret = dd->cspec->cntrnamelen;
+ if (pos >= ret)
+ ret = 0; /* final read after getting everything */
+ } else {
+ u64 *cntr = dd->cspec->cntrs;
+ int i;
+
+ ret = dd->cspec->ncntrs * sizeof(u64);
+ if (!cntr || pos >= ret) {
+ /* everything read, or couldn't get memory */
+ ret = 0;
+ goto done;
+ }
+
+ *cntrp = cntr;
+ for (i = 0; i < dd->cspec->ncntrs; i++)
+ *cntr++ = read_7220_creg32(dd, cntr7220indices[i]);
+ }
+done:
+ return ret;
+}
+
+static u32 qib_read_7220portcntrs(struct qib_devdata *dd, loff_t pos, u32 port,
+ char **namep, u64 **cntrp)
+{
+ u32 ret;
+
+ if (!dd->cspec->portcntrs) {
+ ret = 0;
+ goto done;
+ }
+ if (namep) {
+ *namep = (char *)portcntr7220names;
+ ret = dd->cspec->portcntrnamelen;
+ if (pos >= ret)
+ ret = 0; /* final read after getting everything */
+ } else {
+ u64 *cntr = dd->cspec->portcntrs;
+ struct qib_pportdata *ppd = &dd->pport[port];
+ int i;
+
+ ret = dd->cspec->nportcntrs * sizeof(u64);
+ if (!cntr || pos >= ret) {
+ /* everything read, or couldn't get memory */
+ ret = 0;
+ goto done;
+ }
+ *cntrp = cntr;
+ for (i = 0; i < dd->cspec->nportcntrs; i++) {
+ if (portcntr7220indices[i] & _PORT_VIRT_FLAG)
+ *cntr++ = qib_portcntr_7220(ppd,
+ portcntr7220indices[i] &
+ ~_PORT_VIRT_FLAG);
+ else
+ *cntr++ = read_7220_creg32(dd,
+ portcntr7220indices[i]);
+ }
+ }
+done:
+ return ret;
+}
+
+/**
+ * qib_get_7220_faststats - get word counters from chip before they overflow
+ * @opaque: contains a pointer to the qlogic_ib device qib_devdata
+ *
+ * This needs more work; in particular, a decision on whether we really
+ * need traffic_wds maintained the way it is.
+ * Called from add_timer.
+ */
+static void qib_get_7220_faststats(unsigned long opaque)
+{
+ struct qib_devdata *dd = (struct qib_devdata *) opaque;
+ struct qib_pportdata *ppd = dd->pport;
+ unsigned long flags;
+ u64 traffic_wds;
+
+ /*
+ * don't access the chip while running diags, or memory diags can
+ * fail
+ */
+ if (!(dd->flags & QIB_INITTED) || dd->diag_client)
+ /* but re-arm the timer, for diags case; won't hurt other */
+ goto done;
+
+ /*
+ * We now try to maintain an activity timer, based on traffic
+ * exceeding a threshold, so we need to check the word-counts
+ * even if they are 64-bit.
+ */
+ traffic_wds = qib_portcntr_7220(ppd, cr_wordsend) +
+ qib_portcntr_7220(ppd, cr_wordrcv);
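+ /*
+ * traffic_wds currently holds the cumulative chip word count;
+ * the subtraction below turns it into the delta for this
+ * interval, which is then folded back in so dd->traffic_wds
+ * keeps tracking the running total.
+ */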
+ spin_lock_irqsave(&dd->eep_st_lock, flags);
+ traffic_wds -= dd->traffic_wds;
+ dd->traffic_wds += traffic_wds;
+ spin_unlock_irqrestore(&dd->eep_st_lock, flags);
+done:
+ mod_timer(&dd->stats_timer, jiffies + HZ * ACTIVITY_TIMER);
+}
+
+/*
+ * If we are using MSI, try to fallback to INTx.
+ */
+static int qib_7220_intr_fallback(struct qib_devdata *dd)
+{
+ if (!dd->msi_lo)
+ return 0;
+
+ qib_devinfo(dd->pcidev,
+ "MSI interrupt not detected, trying INTx interrupts\n");
+ qib_7220_free_irq(dd);
+ qib_enable_intx(dd->pcidev);
+ /*
+ * Some newer kernels require free_irq before disable_msi,
+ * and irq can be changed during disable and INTx enable
+ * and we need to therefore use the pcidev->irq value,
+ * not our saved MSI value.
+ */
+ dd->cspec->irq = dd->pcidev->irq;
+ qib_setup_7220_interrupt(dd);
+ return 1;
+}
+
+/*
+ * Reset the XGXS (between serdes and IBC). Slightly less intrusive
+ * than resetting the IBC or external link state, and useful in some
+ * cases to cause some retraining. To do this right, we reset IBC
+ * as well.
+ */
+static void qib_7220_xgxs_reset(struct qib_pportdata *ppd)
+{
+ u64 val, prev_val;
+ struct qib_devdata *dd = ppd->dd;
+
+ prev_val = qib_read_kreg64(dd, kr_xgxs_cfg);
+ val = prev_val | QLOGIC_IB_XGXS_RESET;
+ prev_val &= ~QLOGIC_IB_XGXS_RESET; /* be sure */
+ qib_write_kreg(dd, kr_control,
+ dd->control & ~QLOGIC_IB_C_LINKENABLE);
+ qib_write_kreg(dd, kr_xgxs_cfg, val);
+ qib_read_kreg32(dd, kr_scratch);
+ qib_write_kreg(dd, kr_xgxs_cfg, prev_val);
+ qib_write_kreg(dd, kr_control, dd->control);
+}
+
+/*
+ * For this chip, we want to use the same buffer every time
+ * we are trying to bring the link up (they are always VL15
+ * packets). At that link state the packet should always go out immediately
+ * (or at least be discarded at the tx interface if the link is down).
+ * If it doesn't, and the buffer isn't available, that means some other
+ * sender has gotten ahead of us, and is preventing our packet from going
+ * out. In that case, we flush all packets, and try again. If that still
+ * fails, we fail the request, and hope things work the next time around.
+ *
+ * We don't need very complicated heuristics on whether the packet had
+ * time to go out or not, since even at SDR 1X, it goes out in very short
+ * time periods, covered by the chip reads done here and as part of the
+ * flush.
+ */
+static u32 __iomem *get_7220_link_buf(struct qib_pportdata *ppd, u32 *bnum)
+{
+ u32 __iomem *buf;
+ u32 lbuf = ppd->dd->cspec->lastbuf_for_pio;
+ int do_cleanup;
+ unsigned long flags;
+
+ /*
+ * always blip to get avail list updated, since it's almost
+ * always needed, and is fairly cheap.
+ */
+ sendctrl_7220_mod(ppd->dd->pport, QIB_SENDCTRL_AVAIL_BLIP);
+ qib_read_kreg64(ppd->dd, kr_scratch); /* extra chip flush */
+ buf = qib_getsendbuf_range(ppd->dd, bnum, lbuf, lbuf);
+ if (buf)
+ goto done;
+
+ spin_lock_irqsave(&ppd->sdma_lock, flags);
+ if (ppd->sdma_state.current_state == qib_sdma_state_s20_idle &&
+ ppd->sdma_state.current_state != qib_sdma_state_s00_hw_down) {
+ __qib_sdma_process_event(ppd, qib_sdma_event_e00_go_hw_down);
+ do_cleanup = 0;
+ } else {
+ do_cleanup = 1;
+ qib_7220_sdma_hw_clean_up(ppd);
+ }
+ spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+
+ if (do_cleanup) {
+ qib_read_kreg64(ppd->dd, kr_scratch); /* extra chip flush */
+ buf = qib_getsendbuf_range(ppd->dd, bnum, lbuf, lbuf);
+ }
+done:
+ return buf;
+}
+
+/*
+ * This code for non-IBTA-compliant IB speed negotiation is only known to
+ * work for the SDR to DDR transition, and only between an HCA and a switch
+ * with recent firmware. It is based on observed heuristics, rather than
+ * actual knowledge of the non-compliant speed negotiation.
+ * It has a number of hard-coded fields, since the hope is to rewrite this
+ * when a spec is available on how the negotiation is intended to work.
+ */
+static void autoneg_7220_sendpkt(struct qib_pportdata *ppd, u32 *hdr,
+ u32 dcnt, u32 *data)
+{
+ int i;
+ u64 pbc;
+ u32 __iomem *piobuf;
+ u32 pnum;
+ struct qib_devdata *dd = ppd->dd;
+
+ i = 0;
+ pbc = 7 + dcnt + 1; /* 7 dwords of header, dcnt dwords of data, 1 dword icrc */
+ pbc |= PBC_7220_VL15_SEND;
+ while (!(piobuf = get_7220_link_buf(ppd, &pnum))) {
+ if (i++ > 5)
+ return;
+ udelay(2);
+ }
+ sendctrl_7220_mod(dd->pport, QIB_SENDCTRL_DISARM_BUF(pnum));
+ writeq(pbc, piobuf);
+ qib_flush_wc();
+ qib_pio_copy(piobuf + 2, hdr, 7);
+ qib_pio_copy(piobuf + 9, data, dcnt);
+ if (dd->flags & QIB_USE_SPCL_TRIG) {
+ u32 spcl_off = (pnum >= dd->piobcnt2k) ? 2047 : 1023;
+
+ qib_flush_wc();
+ __raw_writel(0xaebecede, piobuf + spcl_off);
+ }
+ qib_flush_wc();
+ qib_sendbuf_done(dd, pnum);
+}
+
+/*
+ * _start packet gets sent twice at start, _done gets sent twice at end
+ */
+static void autoneg_7220_send(struct qib_pportdata *ppd, int which)
+{
+ struct qib_devdata *dd = ppd->dd;
+ static u32 swapped;
+ u32 dw, i, hcnt, dcnt, *data;
+ static u32 hdr[7] = { 0xf002ffff, 0x48ffff, 0x6400abba };
+ static u32 madpayload_start[0x40] = {
+ 0x1810103, 0x1, 0x0, 0x0, 0x2c90000, 0x2c9, 0x0, 0x0,
+ 0xffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x1, 0x1388, 0x15e, 0x1, /* rest 0's */
+ };
+ static u32 madpayload_done[0x40] = {
+ 0x1810103, 0x1, 0x0, 0x0, 0x2c90000, 0x2c9, 0x0, 0x0,
+ 0xffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x40000001, 0x1388, 0x15e, /* rest 0's */
+ };
+
+ dcnt = ARRAY_SIZE(madpayload_start);
+ hcnt = ARRAY_SIZE(hdr);
+ if (!swapped) {
+ /* for maintainability, do it at runtime */
+ for (i = 0; i < hcnt; i++) {
+ dw = (__force u32) cpu_to_be32(hdr[i]);
+ hdr[i] = dw;
+ }
+ for (i = 0; i < dcnt; i++) {
+ dw = (__force u32) cpu_to_be32(madpayload_start[i]);
+ madpayload_start[i] = dw;
+ dw = (__force u32) cpu_to_be32(madpayload_done[i]);
+ madpayload_done[i] = dw;
+ }
+ swapped = 1;
+ }
+
+ data = which ? madpayload_done : madpayload_start;
+
+ autoneg_7220_sendpkt(ppd, hdr, dcnt, data);
+ qib_read_kreg64(dd, kr_scratch);
+ udelay(2);
+ autoneg_7220_sendpkt(ppd, hdr, dcnt, data);
+ qib_read_kreg64(dd, kr_scratch);
+ udelay(2);
+}
+
+/*
+ * Do the absolute minimum to cause an IB speed change, and make it
+ * ready, but don't actually trigger the change. The caller will
+ * do that when ready (if link is in Polling training state, it will
+ * happen immediately, otherwise when link next goes down)
+ *
+ * This routine should only be used as part of the DDR autonegotiation
+ * code for devices that are not compliant with IB 1.2 (or code that
+ * fixes things up for same).
+ *
+ * When the link has gone down and autoneg is enabled, or autoneg has
+ * failed and we give up until next time, we set both speeds, and
+ * then we want IBTA enabled as well as "use max enabled speed".
+ */
+static void set_7220_ibspeed_fast(struct qib_pportdata *ppd, u32 speed)
+{
+ ppd->cpspec->ibcddrctrl &= ~(IBA7220_IBC_SPEED_AUTONEG_MASK |
+ IBA7220_IBC_IBTA_1_2_MASK);
+
+ if (speed == (QIB_IB_SDR | QIB_IB_DDR))
+ ppd->cpspec->ibcddrctrl |= IBA7220_IBC_SPEED_AUTONEG_MASK |
+ IBA7220_IBC_IBTA_1_2_MASK;
+ else
+ ppd->cpspec->ibcddrctrl |= speed == QIB_IB_DDR ?
+ IBA7220_IBC_SPEED_DDR : IBA7220_IBC_SPEED_SDR;
+
+ qib_write_kreg(ppd->dd, kr_ibcddrctrl, ppd->cpspec->ibcddrctrl);
+ qib_write_kreg(ppd->dd, kr_scratch, 0);
+}
+
+/*
+ * This routine is only used when we are not talking to another
+ * IB 1.2-compliant device that we think can do DDR.
+ * (This includes all existing switch chips as of Oct 2007.)
+ * 1.2-compliant devices go directly to DDR prior to reaching INIT
+ */
+static void try_7220_autoneg(struct qib_pportdata *ppd)
+{
+ unsigned long flags;
+
+ /*
+ * Required for older non-IB1.2 DDR switches. Newer
+ * non-IB-compliant switches don't need it, but so far,
+ * aren't bothered by it either. The value written is a "magic constant".
+ */
+ qib_write_kreg(ppd->dd, kr_ncmodectrl, 0x3b9dc07);
+
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags |= QIBL_IB_AUTONEG_INPROG;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ autoneg_7220_send(ppd, 0);
+ set_7220_ibspeed_fast(ppd, QIB_IB_DDR);
+
+ toggle_7220_rclkrls(ppd->dd);
+ /* 2 msec is minimum length of a poll cycle */
+ queue_delayed_work(ib_wq, &ppd->cpspec->autoneg_work,
+ msecs_to_jiffies(2));
+}
+
+/*
+ * Handle the empirically determined mechanism for auto-negotiation
+ * of DDR speed with switches.
+ */
+static void autoneg_7220_work(struct work_struct *work)
+{
+ struct qib_pportdata *ppd;
+ struct qib_devdata *dd;
+ u64 startms;
+ u32 i;
+ unsigned long flags;
+
+ ppd = &container_of(work, struct qib_chippport_specific,
+ autoneg_work.work)->pportdata;
+ dd = ppd->dd;
+
+ startms = jiffies_to_msecs(jiffies);
+
+ /*
+ * Busy wait for this first part, it should be at most a
+ * few hundred usec, since we scheduled ourselves for 2msec.
+ */
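+ /* the loop below caps the busy wait at 25 * 100 usec, ~2.5 msec */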
+ for (i = 0; i < 25; i++) {
+ if (SYM_FIELD(ppd->lastibcstat, IBCStatus, LinkTrainingState)
+ == IB_7220_LT_STATE_POLLQUIET) {
+ qib_set_linkstate(ppd, QIB_IB_LINKDOWN_DISABLE);
+ break;
+ }
+ udelay(100);
+ }
+
+ if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG))
+ goto done; /* we got there early or told to stop */
+
+ /* we expect this to timeout */
+ if (wait_event_timeout(ppd->cpspec->autoneg_wait,
+ !(ppd->lflags & QIBL_IB_AUTONEG_INPROG),
+ msecs_to_jiffies(90)))
+ goto done;
+
+ toggle_7220_rclkrls(dd);
+
+ /* we expect this to timeout */
+ if (wait_event_timeout(ppd->cpspec->autoneg_wait,
+ !(ppd->lflags & QIBL_IB_AUTONEG_INPROG),
+ msecs_to_jiffies(1700)))
+ goto done;
+
+ set_7220_ibspeed_fast(ppd, QIB_IB_SDR);
+ toggle_7220_rclkrls(dd);
+
+ /*
+ * Wait up to 250 msec for link to train and get to INIT at DDR;
+ * this should terminate early.
+ */
+ wait_event_timeout(ppd->cpspec->autoneg_wait,
+ !(ppd->lflags & QIBL_IB_AUTONEG_INPROG),
+ msecs_to_jiffies(250));
+done:
+ if (ppd->lflags & QIBL_IB_AUTONEG_INPROG) {
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags &= ~QIBL_IB_AUTONEG_INPROG;
+ if (dd->cspec->autoneg_tries == AUTONEG_TRIES) {
+ ppd->lflags |= QIBL_IB_AUTONEG_FAILED;
+ dd->cspec->autoneg_tries = 0;
+ }
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ set_7220_ibspeed_fast(ppd, ppd->link_speed_enabled);
+ }
+}
+
+static u32 qib_7220_iblink_state(u64 ibcs)
+{
+ u32 state = (u32)SYM_FIELD(ibcs, IBCStatus, LinkState);
+
+ switch (state) {
+ case IB_7220_L_STATE_INIT:
+ state = IB_PORT_INIT;
+ break;
+ case IB_7220_L_STATE_ARM:
+ state = IB_PORT_ARMED;
+ break;
+ case IB_7220_L_STATE_ACTIVE:
+ /* fall through */
+ case IB_7220_L_STATE_ACT_DEFER:
+ state = IB_PORT_ACTIVE;
+ break;
+ default: /* fall through */
+ case IB_7220_L_STATE_DOWN:
+ state = IB_PORT_DOWN;
+ break;
+ }
+ return state;
+}
+
+/* returns the IBTA port state, rather than the IBC link training state */
+static u8 qib_7220_phys_portstate(u64 ibcs)
+{
+ u8 state = (u8)SYM_FIELD(ibcs, IBCStatus, LinkTrainingState);
+ return qib_7220_physportstate[state];
+}
+
+static int qib_7220_ib_updown(struct qib_pportdata *ppd, int ibup, u64 ibcs)
+{
+ int ret = 0, symadj = 0;
+ struct qib_devdata *dd = ppd->dd;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags &= ~QIBL_IB_FORCE_NOTIFY;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+
+ if (!ibup) {
+ /*
+ * When the link goes down we don't want AEQ running, so it
+ * won't interfere with IBC training, etc., and we need
+ * to go back to the static SerDes preset values.
+ */
+ if (!(ppd->lflags & (QIBL_IB_AUTONEG_FAILED |
+ QIBL_IB_AUTONEG_INPROG)))
+ set_7220_ibspeed_fast(ppd, ppd->link_speed_enabled);
+ if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) {
+ qib_sd7220_presets(dd);
+ qib_cancel_sends(ppd); /* initial disarm, etc. */
+ spin_lock_irqsave(&ppd->sdma_lock, flags);
+ if (__qib_sdma_running(ppd))
+ __qib_sdma_process_event(ppd,
+ qib_sdma_event_e70_go_idle);
+ spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+ }
+		/* this might be better in qib_sd7220_presets() */
+ set_7220_relock_poll(dd, ibup);
+ } else {
+ if (qib_compat_ddr_negotiate &&
+ !(ppd->lflags & (QIBL_IB_AUTONEG_FAILED |
+ QIBL_IB_AUTONEG_INPROG)) &&
+ ppd->link_speed_active == QIB_IB_SDR &&
+ (ppd->link_speed_enabled & (QIB_IB_DDR | QIB_IB_SDR)) ==
+ (QIB_IB_DDR | QIB_IB_SDR) &&
+ dd->cspec->autoneg_tries < AUTONEG_TRIES) {
+ /* we are SDR, and DDR auto-negotiation enabled */
+ ++dd->cspec->autoneg_tries;
+ if (!ppd->cpspec->ibdeltainprog) {
+ ppd->cpspec->ibdeltainprog = 1;
+ ppd->cpspec->ibsymsnap = read_7220_creg32(dd,
+ cr_ibsymbolerr);
+ ppd->cpspec->iblnkerrsnap = read_7220_creg32(dd,
+ cr_iblinkerrrecov);
+ }
+ try_7220_autoneg(ppd);
+ ret = 1; /* no other IB status change processing */
+ } else if ((ppd->lflags & QIBL_IB_AUTONEG_INPROG) &&
+ ppd->link_speed_active == QIB_IB_SDR) {
+ autoneg_7220_send(ppd, 1);
+ set_7220_ibspeed_fast(ppd, QIB_IB_DDR);
+ udelay(2);
+ toggle_7220_rclkrls(dd);
+ ret = 1; /* no other IB status change processing */
+ } else {
+ if ((ppd->lflags & QIBL_IB_AUTONEG_INPROG) &&
+ (ppd->link_speed_active & QIB_IB_DDR)) {
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags &= ~(QIBL_IB_AUTONEG_INPROG |
+ QIBL_IB_AUTONEG_FAILED);
+ spin_unlock_irqrestore(&ppd->lflags_lock,
+ flags);
+ dd->cspec->autoneg_tries = 0;
+ /* re-enable SDR, for next link down */
+ set_7220_ibspeed_fast(ppd,
+ ppd->link_speed_enabled);
+ wake_up(&ppd->cpspec->autoneg_wait);
+ symadj = 1;
+ } else if (ppd->lflags & QIBL_IB_AUTONEG_FAILED) {
+ /*
+ * Clear autoneg failure flag, and do setup
+ * so we'll try next time link goes down and
+ * back to INIT (possibly connected to a
+ * different device).
+ */
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags &= ~QIBL_IB_AUTONEG_FAILED;
+ spin_unlock_irqrestore(&ppd->lflags_lock,
+ flags);
+ ppd->cpspec->ibcddrctrl |=
+ IBA7220_IBC_IBTA_1_2_MASK;
+ qib_write_kreg(dd, kr_ncmodectrl, 0);
+ symadj = 1;
+ }
+ }
+
+ if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG))
+ symadj = 1;
+
+ if (!ret) {
+ ppd->delay_mult = rate_to_delay
+ [(ibcs >> IBA7220_LINKSPEED_SHIFT) & 1]
+ [(ibcs >> IBA7220_LINKWIDTH_SHIFT) & 1];
+
+ set_7220_relock_poll(dd, ibup);
+ spin_lock_irqsave(&ppd->sdma_lock, flags);
+ /*
+ * Unlike 7322, the 7220 needs this, due to lack of
+ * interrupt in some cases when we have sdma active
+ * when the link goes down.
+ */
+ if (ppd->sdma_state.current_state !=
+ qib_sdma_state_s20_idle)
+ __qib_sdma_process_event(ppd,
+ qib_sdma_event_e00_go_hw_down);
+ spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+ }
+ }
+
+ if (symadj) {
+ if (ppd->cpspec->ibdeltainprog) {
+ ppd->cpspec->ibdeltainprog = 0;
+ ppd->cpspec->ibsymdelta += read_7220_creg32(ppd->dd,
+ cr_ibsymbolerr) - ppd->cpspec->ibsymsnap;
+ ppd->cpspec->iblnkerrdelta += read_7220_creg32(ppd->dd,
+ cr_iblinkerrrecov) - ppd->cpspec->iblnkerrsnap;
+ }
+ } else if (!ibup && qib_compat_ddr_negotiate &&
+ !ppd->cpspec->ibdeltainprog &&
+ !(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) {
+ ppd->cpspec->ibdeltainprog = 1;
+ ppd->cpspec->ibsymsnap = read_7220_creg32(ppd->dd,
+ cr_ibsymbolerr);
+ ppd->cpspec->iblnkerrsnap = read_7220_creg32(ppd->dd,
+ cr_iblinkerrrecov);
+ }
+
+ if (!ret)
+ qib_setup_7220_setextled(ppd, ibup);
+ return ret;
+}
+
+/*
+ * Does read/modify/write to appropriate registers to
+ * set output and direction bits selected by mask.
+ * These are in their canonical positions (e.g. lsb of
+ * dir will end up in D48 of extctrl on existing chips).
+ * Returns contents of GP Inputs.
+ */
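+/*
+ * For example (illustrative only): gpio_7220_mod(dd, 1 << pin, 1 << pin,
+ * 1 << pin) drives "pin" high as an output, while gpio_7220_mod(dd, 0, 0, 0)
+ * performs no write at all and simply samples the GPIO inputs.
+ */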
+static int gpio_7220_mod(struct qib_devdata *dd, u32 out, u32 dir, u32 mask)
+{
+ u64 read_val, new_out;
+ unsigned long flags;
+
+ if (mask) {
+ /* some bits being written, lock access to GPIO */
+ dir &= mask;
+ out &= mask;
+ spin_lock_irqsave(&dd->cspec->gpio_lock, flags);
+ dd->cspec->extctrl &= ~((u64)mask << SYM_LSB(EXTCtrl, GPIOOe));
+ dd->cspec->extctrl |= ((u64) dir << SYM_LSB(EXTCtrl, GPIOOe));
+ new_out = (dd->cspec->gpio_out & ~mask) | out;
+
+ qib_write_kreg(dd, kr_extctrl, dd->cspec->extctrl);
+ qib_write_kreg(dd, kr_gpio_out, new_out);
+ dd->cspec->gpio_out = new_out;
+ spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags);
+ }
+ /*
+ * It is unlikely that a read at this time would get valid
+ * data on a pin whose direction line was set in the same
+ * call to this function. We include the read here because
+ * that allows us to potentially combine a change on one pin with
+ * a read on another, and because the old code did something like
+ * this.
+ */
+ read_val = qib_read_kreg64(dd, kr_extstatus);
+ return SYM_FIELD(read_val, EXTStatus, GPIOIn);
+}
+
+/*
+ * Read fundamental info we need to use the chip. These are
+ * the registers that describe chip capabilities, and are
+ * saved in shadow registers.
+ */
+static void get_7220_chip_params(struct qib_devdata *dd)
+{
+ u64 val;
+ u32 piobufs;
+ int mtu;
+
+ dd->uregbase = qib_read_kreg32(dd, kr_userregbase);
+
+ dd->rcvtidcnt = qib_read_kreg32(dd, kr_rcvtidcnt);
+ dd->rcvtidbase = qib_read_kreg32(dd, kr_rcvtidbase);
+ dd->rcvegrbase = qib_read_kreg32(dd, kr_rcvegrbase);
+ dd->palign = qib_read_kreg32(dd, kr_palign);
+ dd->piobufbase = qib_read_kreg64(dd, kr_sendpiobufbase);
+ dd->pio2k_bufbase = dd->piobufbase & 0xffffffff;
+
+ val = qib_read_kreg64(dd, kr_sendpiosize);
+ dd->piosize2k = val & ~0U;
+ dd->piosize4k = val >> 32;
+
+ mtu = ib_mtu_enum_to_int(qib_ibmtu);
+ if (mtu == -1)
+ mtu = QIB_DEFAULT_MTU;
+ dd->pport->ibmtu = (u32)mtu;
+
+ val = qib_read_kreg64(dd, kr_sendpiobufcnt);
+ dd->piobcnt2k = val & ~0U;
+ dd->piobcnt4k = val >> 32;
+ /* these may be adjusted in init_chip_wc_pat() */
+ dd->pio2kbase = (u32 __iomem *)
+ ((char __iomem *) dd->kregbase + dd->pio2k_bufbase);
+ if (dd->piobcnt4k) {
+ dd->pio4kbase = (u32 __iomem *)
+ ((char __iomem *) dd->kregbase +
+ (dd->piobufbase >> 32));
+		/*
+		 * 4K buffers take 2 pages; we use roundup just to be
+		 * paranoid; we calculate it once here, rather than on
+		 * every buf allocate
+		 */
+ dd->align4k = ALIGN(dd->piosize4k, dd->palign);
+ }
+
+ piobufs = dd->piobcnt4k + dd->piobcnt2k;
+
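+	/*
+	 * Each 64-bit pioavail register holds 2 status bits per send buffer,
+	 * i.e. 32 buffers per register, so this is just
+	 * DIV_ROUND_UP(piobufs, 32); e.g. 144 + 16 buffers -> 5 registers
+	 * (buffer counts here are illustrative only).
+	 */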
+ dd->pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2) /
+ (sizeof(u64) * BITS_PER_BYTE / 2);
+}
+
+/*
+ * The chip base addresses in cspec and cpspec have to be set
+ * after possible init_chip_wc_pat(), rather than in
+ * get_7220_chip_params(), so split out as a separate function.
+ */
+static void set_7220_baseaddrs(struct qib_devdata *dd)
+{
+ u32 cregbase;
+ /* init after possible re-map in init_chip_wc_pat() */
+ cregbase = qib_read_kreg32(dd, kr_counterregbase);
+ dd->cspec->cregbase = (u64 __iomem *)
+ ((char __iomem *) dd->kregbase + cregbase);
+
+ dd->egrtidbase = (u64 __iomem *)
+ ((char __iomem *) dd->kregbase + dd->rcvegrbase);
+}
+
+
+#define SENDCTRL_SHADOWED (SYM_MASK(SendCtrl, SendIntBufAvail) | \
+ SYM_MASK(SendCtrl, SPioEnable) | \
+ SYM_MASK(SendCtrl, SSpecialTriggerEn) | \
+ SYM_MASK(SendCtrl, SendBufAvailUpd) | \
+ SYM_MASK(SendCtrl, AvailUpdThld) | \
+ SYM_MASK(SendCtrl, SDmaEnable) | \
+ SYM_MASK(SendCtrl, SDmaIntEnable) | \
+ SYM_MASK(SendCtrl, SDmaHalt) | \
+ SYM_MASK(SendCtrl, SDmaSingleDescriptor))
+
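+/*
+ * Diag observer hook for kr_sendctrl.  For partially-masked reads, the
+ * unmasked bits come from the chip register (with a complaint if the
+ * SENDCTRL_SHADOWED bits disagree with the software shadow); for writes,
+ * the shadowed bits are folded into dd->sendctrl while the remaining bits
+ * are written straight through.  Returns the access width in bytes.
+ */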
+static int sendctrl_hook(struct qib_devdata *dd,
+ const struct diag_observer *op,
+ u32 offs, u64 *data, u64 mask, int only_32)
+{
+ unsigned long flags;
+ unsigned idx = offs / sizeof(u64);
+ u64 local_data, all_bits;
+
+ if (idx != kr_sendctrl) {
+ qib_dev_err(dd, "SendCtrl Hook called with offs %X, %s-bit\n",
+ offs, only_32 ? "32" : "64");
+ return 0;
+ }
+
+ all_bits = ~0ULL;
+ if (only_32)
+ all_bits >>= 32;
+ spin_lock_irqsave(&dd->sendctrl_lock, flags);
+ if ((mask & all_bits) != all_bits) {
+ /*
+ * At least some mask bits are zero, so we need
+ * to read. The judgement call is whether from
+ * reg or shadow. First-cut: read reg, and complain
+ * if any bits which should be shadowed are different
+ * from their shadowed value.
+ */
+ if (only_32)
+ local_data = (u64)qib_read_kreg32(dd, idx);
+ else
+ local_data = qib_read_kreg64(dd, idx);
+ qib_dev_err(dd, "Sendctrl -> %X, Shad -> %X\n",
+ (u32)local_data, (u32)dd->sendctrl);
+ if ((local_data & SENDCTRL_SHADOWED) !=
+ (dd->sendctrl & SENDCTRL_SHADOWED))
+ qib_dev_err(dd, "Sendctrl read: %X shadow is %X\n",
+ (u32)local_data, (u32) dd->sendctrl);
+ *data = (local_data & ~mask) | (*data & mask);
+ }
+ if (mask) {
+ /*
+ * At least some mask bits are one, so we need
+ * to write, but only shadow some bits.
+ */
+ u64 sval, tval; /* Shadowed, transient */
+
+ /*
+ * New shadow val is bits we don't want to touch,
+ * ORed with bits we do, that are intended for shadow.
+ */
+ sval = (dd->sendctrl & ~mask);
+ sval |= *data & SENDCTRL_SHADOWED & mask;
+ dd->sendctrl = sval;
+ tval = sval | (*data & ~SENDCTRL_SHADOWED & mask);
+ qib_dev_err(dd, "Sendctrl <- %X, Shad <- %X\n",
+ (u32)tval, (u32)sval);
+ qib_write_kreg(dd, kr_sendctrl, tval);
+		qib_write_kreg(dd, kr_scratch, 0ULL);
+ }
+ spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+
+ return only_32 ? 4 : 8;
+}
+
+static const struct diag_observer sendctrl_observer = {
+ sendctrl_hook, kr_sendctrl * sizeof(u64),
+ kr_sendctrl * sizeof(u64)
+};
+
+/*
+ * write the final few registers that depend on some of the
+ * init setup. Done late in init, just before bringing up
+ * the serdes.
+ */
+static int qib_late_7220_initreg(struct qib_devdata *dd)
+{
+ int ret = 0;
+ u64 val;
+
+ qib_write_kreg(dd, kr_rcvhdrentsize, dd->rcvhdrentsize);
+ qib_write_kreg(dd, kr_rcvhdrsize, dd->rcvhdrsize);
+ qib_write_kreg(dd, kr_rcvhdrcnt, dd->rcvhdrcnt);
+ qib_write_kreg(dd, kr_sendpioavailaddr, dd->pioavailregs_phys);
+ val = qib_read_kreg64(dd, kr_sendpioavailaddr);
+ if (val != dd->pioavailregs_phys) {
+ qib_dev_err(dd,
+ "Catastrophic software error, SendPIOAvailAddr written as %lx, read back as %llx\n",
+ (unsigned long) dd->pioavailregs_phys,
+ (unsigned long long) val);
+ ret = -EINVAL;
+ }
+ qib_register_observer(dd, &sendctrl_observer);
+ return ret;
+}
+
+static int qib_init_7220_variables(struct qib_devdata *dd)
+{
+ struct qib_chippport_specific *cpspec;
+ struct qib_pportdata *ppd;
+ int ret = 0;
+ u32 sbufs, updthresh;
+
+ cpspec = (struct qib_chippport_specific *)(dd + 1);
+ ppd = &cpspec->pportdata;
+ dd->pport = ppd;
+ dd->num_pports = 1;
+
+ dd->cspec = (struct qib_chip_specific *)(cpspec + dd->num_pports);
+ ppd->cpspec = cpspec;
+
+ spin_lock_init(&dd->cspec->sdepb_lock);
+ spin_lock_init(&dd->cspec->rcvmod_lock);
+ spin_lock_init(&dd->cspec->gpio_lock);
+
+ /* we haven't yet set QIB_PRESENT, so use read directly */
+ dd->revision = readq(&dd->kregbase[kr_revision]);
+
+ if ((dd->revision & 0xffffffffU) == 0xffffffffU) {
+ qib_dev_err(dd,
+ "Revision register read failure, giving up initialization\n");
+ ret = -ENODEV;
+ goto bail;
+ }
+ dd->flags |= QIB_PRESENT; /* now register routines work */
+
+ dd->majrev = (u8) SYM_FIELD(dd->revision, Revision_R,
+ ChipRevMajor);
+ dd->minrev = (u8) SYM_FIELD(dd->revision, Revision_R,
+ ChipRevMinor);
+
+ get_7220_chip_params(dd);
+ qib_7220_boardname(dd);
+
+ /*
+ * GPIO bits for TWSI data and clock,
+ * used for serial EEPROM.
+ */
+ dd->gpio_sda_num = _QIB_GPIO_SDA_NUM;
+ dd->gpio_scl_num = _QIB_GPIO_SCL_NUM;
+ dd->twsi_eeprom_dev = QIB_TWSI_EEPROM_DEV;
+
+ dd->flags |= QIB_HAS_INTX | QIB_HAS_LINK_LATENCY |
+ QIB_NODMA_RTAIL | QIB_HAS_THRESH_UPDATE;
+ dd->flags |= qib_special_trigger ?
+ QIB_USE_SPCL_TRIG : QIB_HAS_SEND_DMA;
+
+ /*
+ * EEPROM error log 0 is TXE Parity errors. 1 is RXE Parity.
+	 * 2 is some miscellaneous errors, and 3 is reserved for future use.
+ */
+ dd->eep_st_masks[0].hwerrs_to_log = HWE_MASK(TXEMemParityErr);
+
+ dd->eep_st_masks[1].hwerrs_to_log = HWE_MASK(RXEMemParityErr);
+
+ dd->eep_st_masks[2].errs_to_log = ERR_MASK(ResetNegated);
+
+ init_waitqueue_head(&cpspec->autoneg_wait);
+ INIT_DELAYED_WORK(&cpspec->autoneg_work, autoneg_7220_work);
+
+ ret = qib_init_pportdata(ppd, dd, 0, 1);
+ if (ret)
+ goto bail;
+ ppd->link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X;
+ ppd->link_speed_supported = QIB_IB_SDR | QIB_IB_DDR;
+
+ ppd->link_width_enabled = ppd->link_width_supported;
+ ppd->link_speed_enabled = ppd->link_speed_supported;
+ /*
+ * Set the initial values to reasonable default, will be set
+ * for real when link is up.
+ */
+ ppd->link_width_active = IB_WIDTH_4X;
+ ppd->link_speed_active = QIB_IB_SDR;
+ ppd->delay_mult = rate_to_delay[0][1];
+ ppd->vls_supported = IB_VL_VL0;
+ ppd->vls_operational = ppd->vls_supported;
+
+ if (!qib_mini_init)
+ qib_write_kreg(dd, kr_rcvbthqp, QIB_KD_QP);
+
+ init_timer(&ppd->cpspec->chase_timer);
+ ppd->cpspec->chase_timer.function = reenable_7220_chase;
+ ppd->cpspec->chase_timer.data = (unsigned long)ppd;
+
+ qib_num_cfg_vls = 1; /* if any 7220's, only one VL */
+
+ dd->rcvhdrentsize = QIB_RCVHDR_ENTSIZE;
+ dd->rcvhdrsize = QIB_DFLT_RCVHDRSIZE;
+ dd->rhf_offset =
+ dd->rcvhdrentsize - sizeof(u64) / sizeof(u32);
+
+ /* we always allocate at least 2048 bytes for eager buffers */
+ ret = ib_mtu_enum_to_int(qib_ibmtu);
+ dd->rcvegrbufsize = ret != -1 ? max(ret, 2048) : QIB_DEFAULT_MTU;
+ BUG_ON(!is_power_of_2(dd->rcvegrbufsize));
+ dd->rcvegrbufsize_shift = ilog2(dd->rcvegrbufsize);
+
+ qib_7220_tidtemplate(dd);
+
+ /*
+ * We can request a receive interrupt for 1 or
+ * more packets from current offset. For now, we set this
+ * up for a single packet.
+ */
+ dd->rhdrhead_intr_off = 1ULL << 32;
+
+ /* setup the stats timer; the add_timer is done at end of init */
+ init_timer(&dd->stats_timer);
+ dd->stats_timer.function = qib_get_7220_faststats;
+ dd->stats_timer.data = (unsigned long) dd;
+ dd->stats_timer.expires = jiffies + ACTIVITY_TIMER * HZ;
+
+ /*
+ * Control[4] has been added to change the arbitration within
+ * the SDMA engine between favoring data fetches over descriptor
+ * fetches. qib_sdma_fetch_arb==0 gives data fetches priority.
+ */
+ if (qib_sdma_fetch_arb)
+ dd->control |= 1 << 4;
+
+ dd->ureg_align = 0x10000; /* 64KB alignment */
+
+ dd->piosize2kmax_dwords = (dd->piosize2k >> 2)-1;
+ qib_7220_config_ctxts(dd);
+ qib_set_ctxtcnt(dd); /* needed for PAT setup */
+
+ ret = init_chip_wc_pat(dd, 0);
+ if (ret)
+ goto bail;
+ set_7220_baseaddrs(dd); /* set chip access pointers now */
+
+ ret = 0;
+ if (qib_mini_init)
+ goto bail;
+
+ ret = qib_create_ctxts(dd);
+ init_7220_cntrnames(dd);
+
+	/* Use all of the 4KB buffers for kernel SDMA (zero if !SDMA).
+	 * Reserve the update threshold amount, or 3, whichever is greater,
+	 * for other kernel use such as sending SMI, MAD, and ACKs; if we
+	 * aren't enabling SDMA, all the 4k bufs go to the kernel instead.
+	 * If the reservation were less than the update threshold, we could
+	 * wait a long time for an update.  Coded this way because we
+	 * sometimes change the update threshold for various reasons,
+	 * and we want this to remain robust.
+	 */
+ updthresh = 8U; /* update threshold */
+ if (dd->flags & QIB_HAS_SEND_DMA) {
+ dd->cspec->sdmabufcnt = dd->piobcnt4k;
+ sbufs = updthresh > 3 ? updthresh : 3;
+ } else {
+ dd->cspec->sdmabufcnt = 0;
+ sbufs = dd->piobcnt4k;
+ }
+
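+	/*
+	 * Resulting send buffer layout, roughly (low to high): buffers up to
+	 * lastctxt_piobuf are parceled out to contexts (pbufsctxt per user
+	 * context), the sbufs just above them stay reserved for kernel
+	 * special sends (SMI, MAD, ACKs), and the top sdmabufcnt buffers in
+	 * the 4K range belong to the SDMA engine.
+	 */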
+ dd->cspec->lastbuf_for_pio = dd->piobcnt2k + dd->piobcnt4k -
+ dd->cspec->sdmabufcnt;
+ dd->lastctxt_piobuf = dd->cspec->lastbuf_for_pio - sbufs;
+ dd->cspec->lastbuf_for_pio--; /* range is <= , not < */
+ dd->last_pio = dd->cspec->lastbuf_for_pio;
+ dd->pbufsctxt = dd->lastctxt_piobuf /
+ (dd->cfgctxts - dd->first_user_ctxt);
+
+	/*
+	 * If we are at 16 user contexts, we will have only 7 sbufs
+	 * per context, so drop the update threshold to match.  We
+	 * want to update before we actually run out, so at low
+	 * pbufs/ctxt give ourselves some margin.
+	 */
+ if ((dd->pbufsctxt - 2) < updthresh)
+ updthresh = dd->pbufsctxt - 2;
+
+ dd->cspec->updthresh_dflt = updthresh;
+ dd->cspec->updthresh = updthresh;
+
+ /* before full enable, no interrupts, no locking needed */
+ dd->sendctrl |= (updthresh & SYM_RMASK(SendCtrl, AvailUpdThld))
+ << SYM_LSB(SendCtrl, AvailUpdThld);
+
+ dd->psxmitwait_supported = 1;
+ dd->psxmitwait_check_rate = QIB_7220_PSXMITWAIT_CHECK_RATE;
+bail:
+ return ret;
+}
+
+static u32 __iomem *qib_7220_getsendbuf(struct qib_pportdata *ppd, u64 pbc,
+ u32 *pbufnum)
+{
+ u32 first, last, plen = pbc & QIB_PBC_LENGTH_MASK;
+ struct qib_devdata *dd = ppd->dd;
+ u32 __iomem *buf;
+
+ if (((pbc >> 32) & PBC_7220_VL15_SEND_CTRL) &&
+ !(ppd->lflags & (QIBL_IB_AUTONEG_INPROG | QIBL_LINKACTIVE)))
+ buf = get_7220_link_buf(ppd, pbufnum);
+ else {
+ if ((plen + 1) > dd->piosize2kmax_dwords)
+ first = dd->piobcnt2k;
+ else
+ first = 0;
+ /* try 4k if all 2k busy, so same last for both sizes */
+ last = dd->cspec->lastbuf_for_pio;
+ buf = qib_getsendbuf_range(dd, pbufnum, first, last);
+ }
+ return buf;
+}
+
+/* these 2 "counters" are really control registers, and are always RW */
+static void qib_set_cntr_7220_sample(struct qib_pportdata *ppd, u32 intv,
+ u32 start)
+{
+ write_7220_creg(ppd->dd, cr_psinterval, intv);
+ write_7220_creg(ppd->dd, cr_psstart, start);
+}
+
+/*
+ * NOTE: no real attempt is made to generalize the SDMA stuff.
+ * At some point "soon" we will have a new, more generalized
+ * set of SDMA interfaces, and then we'll clean this up.
+ */
+
+/* Must be called with sdma_lock held, or before init finished */
+static void qib_sdma_update_7220_tail(struct qib_pportdata *ppd, u16 tail)
+{
+ /* Commit writes to memory and advance the tail on the chip */
+ wmb();
+ ppd->sdma_descq_tail = tail;
+ qib_write_kreg(ppd->dd, kr_senddmatail, tail);
+}
+
+static void qib_sdma_set_7220_desc_cnt(struct qib_pportdata *ppd, unsigned cnt)
+{
+}
+
+static struct sdma_set_state_action sdma_7220_action_table[] = {
+ [qib_sdma_state_s00_hw_down] = {
+ .op_enable = 0,
+ .op_intenable = 0,
+ .op_halt = 0,
+ .go_s99_running_tofalse = 1,
+ },
+ [qib_sdma_state_s10_hw_start_up_wait] = {
+ .op_enable = 1,
+ .op_intenable = 1,
+ .op_halt = 1,
+ },
+ [qib_sdma_state_s20_idle] = {
+ .op_enable = 1,
+ .op_intenable = 1,
+ .op_halt = 1,
+ },
+ [qib_sdma_state_s30_sw_clean_up_wait] = {
+ .op_enable = 0,
+ .op_intenable = 1,
+ .op_halt = 0,
+ },
+ [qib_sdma_state_s40_hw_clean_up_wait] = {
+ .op_enable = 1,
+ .op_intenable = 1,
+ .op_halt = 1,
+ },
+ [qib_sdma_state_s50_hw_halt_wait] = {
+ .op_enable = 1,
+ .op_intenable = 1,
+ .op_halt = 1,
+ },
+ [qib_sdma_state_s99_running] = {
+ .op_enable = 1,
+ .op_intenable = 1,
+ .op_halt = 0,
+ .go_s99_running_totrue = 1,
+ },
+};
+
+static void qib_7220_sdma_init_early(struct qib_pportdata *ppd)
+{
+ ppd->sdma_state.set_state_action = sdma_7220_action_table;
+}
+
+static int init_sdma_7220_regs(struct qib_pportdata *ppd)
+{
+ struct qib_devdata *dd = ppd->dd;
+ unsigned i, n;
+ u64 senddmabufmask[3] = { 0 };
+
+ /* Set SendDmaBase */
+ qib_write_kreg(dd, kr_senddmabase, ppd->sdma_descq_phys);
+ qib_sdma_7220_setlengen(ppd);
+ qib_sdma_update_7220_tail(ppd, 0); /* Set SendDmaTail */
+ /* Set SendDmaHeadAddr */
+ qib_write_kreg(dd, kr_senddmaheadaddr, ppd->sdma_head_phys);
+
+ /*
+ * Reserve all the former "kernel" piobufs, using high number range
+ * so we get as many 4K buffers as possible
+ */
+ n = dd->piobcnt2k + dd->piobcnt4k;
+ i = n - dd->cspec->sdmabufcnt;
+
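+	/*
+	 * i.e. the highest-numbered sdmabufcnt buffers (indices
+	 * [n - sdmabufcnt, n)) are handed to the SDMA engine by setting
+	 * their bits in the three 64-bit SendDmaBufMask registers
+	 * (word = index / 64, bit = index % 64).
+	 */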
+ for (; i < n; ++i) {
+ unsigned word = i / 64;
+ unsigned bit = i & 63;
+
+ BUG_ON(word >= 3);
+ senddmabufmask[word] |= 1ULL << bit;
+ }
+ qib_write_kreg(dd, kr_senddmabufmask0, senddmabufmask[0]);
+ qib_write_kreg(dd, kr_senddmabufmask1, senddmabufmask[1]);
+ qib_write_kreg(dd, kr_senddmabufmask2, senddmabufmask[2]);
+
+ ppd->sdma_state.first_sendbuf = i;
+ ppd->sdma_state.last_sendbuf = n;
+
+ return 0;
+}
+
+/* sdma_lock must be held */
+static u16 qib_sdma_7220_gethead(struct qib_pportdata *ppd)
+{
+ struct qib_devdata *dd = ppd->dd;
+ int sane;
+ int use_dmahead;
+ u16 swhead;
+ u16 swtail;
+ u16 cnt;
+ u16 hwhead;
+
+ use_dmahead = __qib_sdma_running(ppd) &&
+ (dd->flags & QIB_HAS_SDMA_TIMEOUT);
+retry:
+ hwhead = use_dmahead ?
+ (u16)le64_to_cpu(*ppd->sdma_head_dma) :
+ (u16)qib_read_kreg32(dd, kr_senddmahead);
+
+ swhead = ppd->sdma_descq_head;
+ swtail = ppd->sdma_descq_tail;
+ cnt = ppd->sdma_descq_cnt;
+
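+	/*
+	 * Sanity-check the hardware head against the software head/tail:
+	 * in ring order the hardware head must lie between swhead and
+	 * swtail (inclusive), allowing for wrap at sdma_descq_cnt; when
+	 * the ring is empty it must equal swhead.
+	 */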
+ if (swhead < swtail) {
+ /* not wrapped */
+ sane = (hwhead >= swhead) & (hwhead <= swtail);
+ } else if (swhead > swtail) {
+ /* wrapped around */
+ sane = ((hwhead >= swhead) && (hwhead < cnt)) ||
+ (hwhead <= swtail);
+ } else {
+ /* empty */
+ sane = (hwhead == swhead);
+ }
+
+ if (unlikely(!sane)) {
+ if (use_dmahead) {
+ /* try one more time, directly from the register */
+ use_dmahead = 0;
+ goto retry;
+ }
+ /* assume no progress */
+ hwhead = swhead;
+ }
+
+ return hwhead;
+}
+
+static int qib_sdma_7220_busy(struct qib_pportdata *ppd)
+{
+ u64 hwstatus = qib_read_kreg64(ppd->dd, kr_senddmastatus);
+
+ return (hwstatus & SYM_MASK(SendDmaStatus, ScoreBoardDrainInProg)) ||
+ (hwstatus & SYM_MASK(SendDmaStatus, AbortInProg)) ||
+ (hwstatus & SYM_MASK(SendDmaStatus, InternalSDmaEnable)) ||
+ !(hwstatus & SYM_MASK(SendDmaStatus, ScbEmpty));
+}
+
+/*
+ * Compute the amount of delay before sending the next packet if the
+ * port's send rate differs from the static rate set for the QP.
+ * Since the delay affects this packet but the amount of the delay is
+ * based on the length of the previous packet, use the last delay computed
+ * and save the delay count for this packet to be used next time
+ * we get here.
+ */
+static u32 qib_7220_setpbc_control(struct qib_pportdata *ppd, u32 plen,
+ u8 srate, u8 vl)
+{
+ u8 snd_mult = ppd->delay_mult;
+ u8 rcv_mult = ib_rate_to_delay[srate];
+ u32 ret = ppd->cpspec->last_delay_mult;
+
+ ppd->cpspec->last_delay_mult = (rcv_mult > snd_mult) ?
+ (plen * (rcv_mult - snd_mult) + 1) >> 1 : 0;
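+	/*
+	 * Example (values illustrative): with rcv_mult 16 (QP static rate
+	 * 2.5 Gb/s) and snd_mult 2, a 100-dword packet stores a delay of
+	 * (100 * 14 + 1) >> 1 = 700 for the following packet, while the
+	 * current packet uses whatever was computed last time.
+	 */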
+
+ /* Indicate VL15, if necessary */
+ if (vl == 15)
+ ret |= PBC_7220_VL15_SEND_CTRL;
+ return ret;
+}
+
+static void qib_7220_initvl15_bufs(struct qib_devdata *dd)
+{
+}
+
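+/*
+ * Eager TID layout: context 0 (kernel) owns the first IBA7220_KRCVEGRCNT
+ * eager TIDs starting at base 0; each later context N owns rcvegrcnt TIDs
+ * starting at IBA7220_KRCVEGRCNT + (N - 1) * rcvegrcnt.
+ */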
+static void qib_7220_init_ctxt(struct qib_ctxtdata *rcd)
+{
+ if (!rcd->ctxt) {
+ rcd->rcvegrcnt = IBA7220_KRCVEGRCNT;
+ rcd->rcvegr_tid_base = 0;
+ } else {
+ rcd->rcvegrcnt = rcd->dd->cspec->rcvegrcnt;
+ rcd->rcvegr_tid_base = IBA7220_KRCVEGRCNT +
+ (rcd->ctxt - 1) * rcd->rcvegrcnt;
+ }
+}
+
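+/*
+ * Adjust the PIO-avail update threshold when send buffers are assigned to
+ * a user context (TXCHK_CHG_TYPE_USER) or returned to the kernel
+ * (TXCHK_CHG_TYPE_KERN): lower the threshold so a subcontext's share of
+ * piocnt can still trigger updates, or restore the default once no
+ * remaining subcontext allotment requires a lower one.
+ */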
+static void qib_7220_txchk_change(struct qib_devdata *dd, u32 start,
+ u32 len, u32 which, struct qib_ctxtdata *rcd)
+{
+ int i;
+ unsigned long flags;
+
+ switch (which) {
+ case TXCHK_CHG_TYPE_KERN:
+ /* see if we need to raise avail update threshold */
+ spin_lock_irqsave(&dd->uctxt_lock, flags);
+ for (i = dd->first_user_ctxt;
+ dd->cspec->updthresh != dd->cspec->updthresh_dflt
+ && i < dd->cfgctxts; i++)
+ if (dd->rcd[i] && dd->rcd[i]->subctxt_cnt &&
+ ((dd->rcd[i]->piocnt / dd->rcd[i]->subctxt_cnt) - 1)
+ < dd->cspec->updthresh_dflt)
+ break;
+ spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+ if (i == dd->cfgctxts) {
+ spin_lock_irqsave(&dd->sendctrl_lock, flags);
+ dd->cspec->updthresh = dd->cspec->updthresh_dflt;
+ dd->sendctrl &= ~SYM_MASK(SendCtrl, AvailUpdThld);
+ dd->sendctrl |= (dd->cspec->updthresh &
+ SYM_RMASK(SendCtrl, AvailUpdThld)) <<
+ SYM_LSB(SendCtrl, AvailUpdThld);
+ spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+ sendctrl_7220_mod(dd->pport, QIB_SENDCTRL_AVAIL_BLIP);
+ }
+ break;
+ case TXCHK_CHG_TYPE_USER:
+ spin_lock_irqsave(&dd->sendctrl_lock, flags);
+ if (rcd && rcd->subctxt_cnt && ((rcd->piocnt
+ / rcd->subctxt_cnt) - 1) < dd->cspec->updthresh) {
+ dd->cspec->updthresh = (rcd->piocnt /
+ rcd->subctxt_cnt) - 1;
+ dd->sendctrl &= ~SYM_MASK(SendCtrl, AvailUpdThld);
+ dd->sendctrl |= (dd->cspec->updthresh &
+ SYM_RMASK(SendCtrl, AvailUpdThld))
+ << SYM_LSB(SendCtrl, AvailUpdThld);
+ spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+ sendctrl_7220_mod(dd->pport, QIB_SENDCTRL_AVAIL_BLIP);
+ } else
+ spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+ break;
+ }
+}
+
+static void writescratch(struct qib_devdata *dd, u32 val)
+{
+ qib_write_kreg(dd, kr_scratch, val);
+}
+
+#define VALID_TS_RD_REG_MASK 0xBF
+/**
+ * qib_7220_tempsense_rd - read register of temp sensor via TWSI
+ * @dd: the qlogic_ib device
+ * @regnum: register to read from
+ *
+ * returns reg contents (0..255) or < 0 for error
+ */
+static int qib_7220_tempsense_rd(struct qib_devdata *dd, int regnum)
+{
+ int ret;
+ u8 rdata;
+
+ if (regnum > 7) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ /* return a bogus value for (the one) register we do not have */
+ if (!((1 << regnum) & VALID_TS_RD_REG_MASK)) {
+ ret = 0;
+ goto bail;
+ }
+
+ ret = mutex_lock_interruptible(&dd->eep_lock);
+ if (ret)
+ goto bail;
+
+ ret = qib_twsi_blk_rd(dd, QIB_TWSI_TEMP_DEV, regnum, &rdata, 1);
+ if (!ret)
+ ret = rdata;
+
+ mutex_unlock(&dd->eep_lock);
+
+ /*
+ * There are three possibilities here:
+ * ret is actual value (0..255)
+ * ret is -ENXIO or -EINVAL from twsi code or this file
+ * ret is -EINTR from mutex_lock_interruptible.
+ */
+bail:
+ return ret;
+}
+
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+static int qib_7220_notify_dca(struct qib_devdata *dd, unsigned long event)
+{
+ return 0;
+}
+#endif
+
+/* Dummy function, as 7220 boards never disable EEPROM Write */
+static int qib_7220_eeprom_wen(struct qib_devdata *dd, int wen)
+{
+ return 1;
+}
+
+/**
+ * qib_init_iba7220_funcs - set up the chip-specific function pointers
+ * @pdev: the pci_dev for the qlogic_ib device
+ * @ent: pci_device_id struct for this dev
+ *
+ * This is global, and is called directly at init to set up the
+ * chip-specific function pointers for later use.
+ */
+struct qib_devdata *qib_init_iba7220_funcs(struct pci_dev *pdev,
+ const struct pci_device_id *ent)
+{
+ struct qib_devdata *dd;
+ int ret;
+ u32 boardid, minwidth;
+
+ dd = qib_alloc_devdata(pdev, sizeof(struct qib_chip_specific) +
+ sizeof(struct qib_chippport_specific));
+ if (IS_ERR(dd))
+ goto bail;
+
+ dd->f_bringup_serdes = qib_7220_bringup_serdes;
+ dd->f_cleanup = qib_setup_7220_cleanup;
+ dd->f_clear_tids = qib_7220_clear_tids;
+ dd->f_free_irq = qib_7220_free_irq;
+ dd->f_get_base_info = qib_7220_get_base_info;
+ dd->f_get_msgheader = qib_7220_get_msgheader;
+ dd->f_getsendbuf = qib_7220_getsendbuf;
+ dd->f_gpio_mod = gpio_7220_mod;
+ dd->f_eeprom_wen = qib_7220_eeprom_wen;
+ dd->f_hdrqempty = qib_7220_hdrqempty;
+ dd->f_ib_updown = qib_7220_ib_updown;
+ dd->f_init_ctxt = qib_7220_init_ctxt;
+ dd->f_initvl15_bufs = qib_7220_initvl15_bufs;
+ dd->f_intr_fallback = qib_7220_intr_fallback;
+ dd->f_late_initreg = qib_late_7220_initreg;
+ dd->f_setpbc_control = qib_7220_setpbc_control;
+ dd->f_portcntr = qib_portcntr_7220;
+ dd->f_put_tid = qib_7220_put_tid;
+ dd->f_quiet_serdes = qib_7220_quiet_serdes;
+ dd->f_rcvctrl = rcvctrl_7220_mod;
+ dd->f_read_cntrs = qib_read_7220cntrs;
+ dd->f_read_portcntrs = qib_read_7220portcntrs;
+ dd->f_reset = qib_setup_7220_reset;
+ dd->f_init_sdma_regs = init_sdma_7220_regs;
+ dd->f_sdma_busy = qib_sdma_7220_busy;
+ dd->f_sdma_gethead = qib_sdma_7220_gethead;
+ dd->f_sdma_sendctrl = qib_7220_sdma_sendctrl;
+ dd->f_sdma_set_desc_cnt = qib_sdma_set_7220_desc_cnt;
+ dd->f_sdma_update_tail = qib_sdma_update_7220_tail;
+ dd->f_sdma_hw_clean_up = qib_7220_sdma_hw_clean_up;
+ dd->f_sdma_hw_start_up = qib_7220_sdma_hw_start_up;
+ dd->f_sdma_init_early = qib_7220_sdma_init_early;
+ dd->f_sendctrl = sendctrl_7220_mod;
+ dd->f_set_armlaunch = qib_set_7220_armlaunch;
+ dd->f_set_cntr_sample = qib_set_cntr_7220_sample;
+ dd->f_iblink_state = qib_7220_iblink_state;
+ dd->f_ibphys_portstate = qib_7220_phys_portstate;
+ dd->f_get_ib_cfg = qib_7220_get_ib_cfg;
+ dd->f_set_ib_cfg = qib_7220_set_ib_cfg;
+ dd->f_set_ib_loopback = qib_7220_set_loopback;
+ dd->f_set_intr_state = qib_7220_set_intr_state;
+ dd->f_setextled = qib_setup_7220_setextled;
+ dd->f_txchk_change = qib_7220_txchk_change;
+ dd->f_update_usrhead = qib_update_7220_usrhead;
+ dd->f_wantpiobuf_intr = qib_wantpiobuf_7220_intr;
+ dd->f_xgxs_reset = qib_7220_xgxs_reset;
+ dd->f_writescratch = writescratch;
+ dd->f_tempsense_rd = qib_7220_tempsense_rd;
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+ dd->f_notify_dca = qib_7220_notify_dca;
+#endif
+ /*
+ * Do remaining pcie setup and save pcie values in dd.
+ * Any error printing is already done by the init code.
+ * On return, we have the chip mapped, but chip registers
+ * are not set up until start of qib_init_7220_variables.
+ */
+ ret = qib_pcie_ddinit(dd, pdev, ent);
+ if (ret < 0)
+ goto bail_free;
+
+ /* initialize chip-specific variables */
+ ret = qib_init_7220_variables(dd);
+ if (ret)
+ goto bail_cleanup;
+
+ if (qib_mini_init)
+ goto bail;
+
+ boardid = SYM_FIELD(dd->revision, Revision,
+ BoardID);
+ switch (boardid) {
+ case 0:
+ case 2:
+ case 10:
+ case 12:
+ minwidth = 16; /* x16 capable boards */
+ break;
+ default:
+ minwidth = 8; /* x8 capable boards */
+ break;
+ }
+ if (qib_pcie_params(dd, minwidth, NULL, NULL))
+ qib_dev_err(dd,
+ "Failed to setup PCIe or interrupts; continuing anyway\n");
+
+ /* save IRQ for possible later use */
+ dd->cspec->irq = pdev->irq;
+
+ if (qib_read_kreg64(dd, kr_hwerrstatus) &
+ QLOGIC_IB_HWE_SERDESPLLFAILED)
+ qib_write_kreg(dd, kr_hwerrclear,
+ QLOGIC_IB_HWE_SERDESPLLFAILED);
+
+ /* setup interrupt handler (interrupt type handled above) */
+ qib_setup_7220_interrupt(dd);
+ qib_7220_init_hwerrors(dd);
+
+ /* clear diagctrl register, in case diags were running and crashed */
+ qib_write_kreg(dd, kr_hwdiagctrl, 0);
+
+ goto bail;
+
+bail_cleanup:
+ qib_pcie_ddcleanup(dd);
+bail_free:
+ qib_free_devdata(dd);
+ dd = ERR_PTR(ret);
+bail:
+ return dd;
+}
diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c
new file mode 100644
index 000000000..f32b4628e
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_iba7322.c
@@ -0,0 +1,8573 @@
+/*
+ * Copyright (c) 2012 Intel Corporation. All rights reserved.
+ * Copyright (c) 2008 - 2012 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file contains all of the code that is specific to the
+ * InfiniPath 7322 chip
+ */
+
+#include <linux/interrupt.h>
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/io.h>
+#include <linux/jiffies.h>
+#include <linux/module.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_smi.h>
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+#include <linux/dca.h>
+#endif
+
+#include "qib.h"
+#include "qib_7322_regs.h"
+#include "qib_qsfp.h"
+
+#include "qib_mad.h"
+#include "qib_verbs.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) QIB_DRV_NAME " " fmt
+
+static void qib_setup_7322_setextled(struct qib_pportdata *, u32);
+static void qib_7322_handle_hwerrors(struct qib_devdata *, char *, size_t);
+static void sendctrl_7322_mod(struct qib_pportdata *ppd, u32 op);
+static irqreturn_t qib_7322intr(int irq, void *data);
+static irqreturn_t qib_7322bufavail(int irq, void *data);
+static irqreturn_t sdma_intr(int irq, void *data);
+static irqreturn_t sdma_idle_intr(int irq, void *data);
+static irqreturn_t sdma_progress_intr(int irq, void *data);
+static irqreturn_t sdma_cleanup_intr(int irq, void *data);
+static void qib_7322_txchk_change(struct qib_devdata *, u32, u32, u32,
+ struct qib_ctxtdata *rcd);
+static u8 qib_7322_phys_portstate(u64);
+static u32 qib_7322_iblink_state(u64);
+static void qib_set_ib_7322_lstate(struct qib_pportdata *ppd, u16 linkcmd,
+ u16 linitcmd);
+static void force_h1(struct qib_pportdata *);
+static void adj_tx_serdes(struct qib_pportdata *);
+static u32 qib_7322_setpbc_control(struct qib_pportdata *, u32, u8, u8);
+static void qib_7322_mini_pcs_reset(struct qib_pportdata *);
+
+static u32 ahb_mod(struct qib_devdata *, int, int, int, u32, u32);
+static void ibsd_wr_allchans(struct qib_pportdata *, int, unsigned, unsigned);
+static void serdes_7322_los_enable(struct qib_pportdata *, int);
+static int serdes_7322_init_old(struct qib_pportdata *);
+static int serdes_7322_init_new(struct qib_pportdata *);
+static void dump_sdma_7322_state(struct qib_pportdata *);
+
+#define BMASK(msb, lsb) (((1 << ((msb) + 1 - (lsb))) - 1) << (lsb))
+
+/* LE2 serdes values for different cases */
+#define LE2_DEFAULT 5
+#define LE2_5m 4
+#define LE2_QME 0
+
+/* Below is special-purpose, so only really works for the IB SerDes blocks. */
+#define IBSD(hw_pidx) (hw_pidx + 2)
+
+/* these are variables for documentation and experimentation purposes */
+static const unsigned rcv_int_timeout = 375;
+static const unsigned rcv_int_count = 16;
+static const unsigned sdma_idle_cnt = 64;
+
+/* Time to stop altering Rx Equalization parameters, after link up. */
+#define RXEQ_DISABLE_MSECS 2500
+
+/*
+ * Number of VLs we are configured to use (to allow for more
+ * credits per vl, etc.)
+ */
+ushort qib_num_cfg_vls = 2;
+module_param_named(num_vls, qib_num_cfg_vls, ushort, S_IRUGO);
+MODULE_PARM_DESC(num_vls, "Set number of Virtual Lanes to use (1-8)");
+
+static ushort qib_chase = 1;
+module_param_named(chase, qib_chase, ushort, S_IRUGO);
+MODULE_PARM_DESC(chase, "Enable state chase handling");
+
+static ushort qib_long_atten = 10; /* 10 dB ~= 5m length */
+module_param_named(long_attenuation, qib_long_atten, ushort, S_IRUGO);
+MODULE_PARM_DESC(long_attenuation,
+ "attenuation cutoff (dB) for long copper cable setup");
+
+static ushort qib_singleport;
+module_param_named(singleport, qib_singleport, ushort, S_IRUGO);
+MODULE_PARM_DESC(singleport, "Use only IB port 1; more per-port buffer space");
+
+static ushort qib_krcvq01_no_msi;
+module_param_named(krcvq01_no_msi, qib_krcvq01_no_msi, ushort, S_IRUGO);
+MODULE_PARM_DESC(krcvq01_no_msi, "No MSI for kctx < 2");
+
+/*
+ * Receive header queue sizes
+ */
+static unsigned qib_rcvhdrcnt;
+module_param_named(rcvhdrcnt, qib_rcvhdrcnt, uint, S_IRUGO);
+MODULE_PARM_DESC(rcvhdrcnt, "receive header count");
+
+static unsigned qib_rcvhdrsize;
+module_param_named(rcvhdrsize, qib_rcvhdrsize, uint, S_IRUGO);
+MODULE_PARM_DESC(rcvhdrsize, "receive header size in 32-bit words");
+
+static unsigned qib_rcvhdrentsize;
+module_param_named(rcvhdrentsize, qib_rcvhdrentsize, uint, S_IRUGO);
+MODULE_PARM_DESC(rcvhdrentsize, "receive header entry size in 32-bit words");
+
+#define MAX_ATTEN_LEN 64 /* plenty for any real system */
+/* for read back, default index is ~5m copper cable */
+static char txselect_list[MAX_ATTEN_LEN] = "10";
+static struct kparam_string kp_txselect = {
+ .string = txselect_list,
+ .maxlen = MAX_ATTEN_LEN
+};
+static int setup_txselect(const char *, struct kernel_param *);
+module_param_call(txselect, setup_txselect, param_get_string,
+ &kp_txselect, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(txselect,
+ "Tx serdes indices (for no QSFP or invalid QSFP data)");
+
+#define BOARD_QME7342 5
+#define BOARD_QMH7342 6
+#define BOARD_QMH7360 9
+#define IS_QMH(dd) (SYM_FIELD((dd)->revision, Revision, BoardID) == \
+ BOARD_QMH7342)
+#define IS_QME(dd) (SYM_FIELD((dd)->revision, Revision, BoardID) == \
+ BOARD_QME7342)
+
+#define KREG_IDX(regname) (QIB_7322_##regname##_OFFS / sizeof(u64))
+
+#define KREG_IBPORT_IDX(regname) ((QIB_7322_##regname##_0_OFFS / sizeof(u64)))
+
+#define MASK_ACROSS(lsb, msb) \
+ (((1ULL << ((msb) + 1 - (lsb))) - 1) << (lsb))
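+/* e.g. MASK_ACROSS(2, 5) == 0x3c: four consecutive bits starting at bit 2 */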
+
+#define SYM_RMASK(regname, fldname) ((u64) \
+ QIB_7322_##regname##_##fldname##_RMASK)
+
+#define SYM_MASK(regname, fldname) ((u64) \
+ QIB_7322_##regname##_##fldname##_RMASK << \
+ QIB_7322_##regname##_##fldname##_LSB)
+
+#define SYM_FIELD(value, regname, fldname) ((u64) \
+ (((value) >> SYM_LSB(regname, fldname)) & \
+ SYM_RMASK(regname, fldname)))
+
+/* useful for things like LaFifoEmpty_0...7, TxCreditOK_0...7, etc. */
+#define SYM_FIELD_ACROSS(value, regname, fldname, nbits) \
+ (((value) >> SYM_LSB(regname, fldname)) & MASK_ACROSS(0, nbits))
+
+#define HWE_MASK(fldname) SYM_MASK(HwErrMask, fldname##Mask)
+#define ERR_MASK(fldname) SYM_MASK(ErrMask, fldname##Mask)
+#define ERR_MASK_N(fldname) SYM_MASK(ErrMask_0, fldname##Mask)
+#define INT_MASK(fldname) SYM_MASK(IntMask, fldname##IntMask)
+#define INT_MASK_P(fldname, port) SYM_MASK(IntMask, fldname##IntMask##_##port)
+/* Below because most, but not all, fields of IntMask have that full suffix */
+#define INT_MASK_PM(fldname, port) SYM_MASK(IntMask, fldname##Mask##_##port)
+
+
+#define SYM_LSB(regname, fldname) (QIB_7322_##regname##_##fldname##_LSB)
+
+/*
+ * The size bits give us 2^N, in KB units.  0 marks the entry as invalid,
+ * and 7 is reserved.  We currently use only 2KB and 4KB.
+ */
+#define IBA7322_TID_SZ_SHIFT QIB_7322_RcvTIDArray0_RT_BufSize_LSB
+#define IBA7322_TID_SZ_2K (1UL<<IBA7322_TID_SZ_SHIFT) /* 2KB */
+#define IBA7322_TID_SZ_4K (2UL<<IBA7322_TID_SZ_SHIFT) /* 4KB */
+#define IBA7322_TID_PA_SHIFT 11U /* TID addr in chip stored w/o low bits */
+
+#define SendIBSLIDAssignMask \
+ QIB_7322_SendIBSLIDAssign_0_SendIBSLIDAssign_15_0_RMASK
+#define SendIBSLMCMask \
+ QIB_7322_SendIBSLIDMask_0_SendIBSLIDMask_15_0_RMASK
+
+#define ExtLED_IB1_YEL SYM_MASK(EXTCtrl, LEDPort0YellowOn)
+#define ExtLED_IB1_GRN SYM_MASK(EXTCtrl, LEDPort0GreenOn)
+#define ExtLED_IB2_YEL SYM_MASK(EXTCtrl, LEDPort1YellowOn)
+#define ExtLED_IB2_GRN SYM_MASK(EXTCtrl, LEDPort1GreenOn)
+#define ExtLED_IB1_MASK (ExtLED_IB1_YEL | ExtLED_IB1_GRN)
+#define ExtLED_IB2_MASK (ExtLED_IB2_YEL | ExtLED_IB2_GRN)
+
+#define _QIB_GPIO_SDA_NUM 1
+#define _QIB_GPIO_SCL_NUM 0
+#define QIB_EEPROM_WEN_NUM 14
+#define QIB_TWSI_EEPROM_DEV 0xA2 /* All Production 7322 cards. */
+
+/* HW counter clock is at 4nsec */
+#define QIB_7322_PSXMITWAIT_CHECK_RATE 4000
+
+/* full speed IB port 1 only */
+#define PORT_SPD_CAP (QIB_IB_SDR | QIB_IB_DDR | QIB_IB_QDR)
+#define PORT_SPD_CAP_SHIFT 3
+
+/* full speed featuremask, both ports */
+#define DUAL_PORT_CAP (PORT_SPD_CAP | (PORT_SPD_CAP << PORT_SPD_CAP_SHIFT))
+
+/*
+ * This file contains almost all the chip-specific register information and
+ * access functions for the FAKED QLogic InfiniPath 7322 PCI-Express chip.
+ */
+
+/* Use defines to tie machine-generated names to lower-case names */
+#define kr_contextcnt KREG_IDX(ContextCnt)
+#define kr_control KREG_IDX(Control)
+#define kr_counterregbase KREG_IDX(CntrRegBase)
+#define kr_errclear KREG_IDX(ErrClear)
+#define kr_errmask KREG_IDX(ErrMask)
+#define kr_errstatus KREG_IDX(ErrStatus)
+#define kr_extctrl KREG_IDX(EXTCtrl)
+#define kr_extstatus KREG_IDX(EXTStatus)
+#define kr_gpio_clear KREG_IDX(GPIOClear)
+#define kr_gpio_mask KREG_IDX(GPIOMask)
+#define kr_gpio_out KREG_IDX(GPIOOut)
+#define kr_gpio_status KREG_IDX(GPIOStatus)
+#define kr_hwdiagctrl KREG_IDX(HwDiagCtrl)
+#define kr_debugportval KREG_IDX(DebugPortValueReg)
+#define kr_fmask KREG_IDX(feature_mask)
+#define kr_act_fmask KREG_IDX(active_feature_mask)
+#define kr_hwerrclear KREG_IDX(HwErrClear)
+#define kr_hwerrmask KREG_IDX(HwErrMask)
+#define kr_hwerrstatus KREG_IDX(HwErrStatus)
+#define kr_intclear KREG_IDX(IntClear)
+#define kr_intmask KREG_IDX(IntMask)
+#define kr_intredirect KREG_IDX(IntRedirect0)
+#define kr_intstatus KREG_IDX(IntStatus)
+#define kr_pagealign KREG_IDX(PageAlign)
+#define kr_rcvavailtimeout KREG_IDX(RcvAvailTimeOut0)
+#define kr_rcvctrl KREG_IDX(RcvCtrl) /* Common, but chip also has per-port */
+#define kr_rcvegrbase KREG_IDX(RcvEgrBase)
+#define kr_rcvegrcnt KREG_IDX(RcvEgrCnt)
+#define kr_rcvhdrcnt KREG_IDX(RcvHdrCnt)
+#define kr_rcvhdrentsize KREG_IDX(RcvHdrEntSize)
+#define kr_rcvhdrsize KREG_IDX(RcvHdrSize)
+#define kr_rcvtidbase KREG_IDX(RcvTIDBase)
+#define kr_rcvtidcnt KREG_IDX(RcvTIDCnt)
+#define kr_revision KREG_IDX(Revision)
+#define kr_scratch KREG_IDX(Scratch)
+#define kr_sendbuffererror KREG_IDX(SendBufErr0) /* and base for 1 and 2 */
+#define kr_sendcheckmask KREG_IDX(SendCheckMask0) /* and 1, 2 */
+#define kr_sendctrl KREG_IDX(SendCtrl)
+#define kr_sendgrhcheckmask KREG_IDX(SendGRHCheckMask0) /* and 1, 2 */
+#define kr_sendibpktmask KREG_IDX(SendIBPacketMask0) /* and 1, 2 */
+#define kr_sendpioavailaddr KREG_IDX(SendBufAvailAddr)
+#define kr_sendpiobufbase KREG_IDX(SendBufBase)
+#define kr_sendpiobufcnt KREG_IDX(SendBufCnt)
+#define kr_sendpiosize KREG_IDX(SendBufSize)
+#define kr_sendregbase KREG_IDX(SendRegBase)
+#define kr_sendbufavail0 KREG_IDX(SendBufAvail0)
+#define kr_userregbase KREG_IDX(UserRegBase)
+#define kr_intgranted KREG_IDX(Int_Granted)
+#define kr_vecclr_wo_int KREG_IDX(vec_clr_without_int)
+#define kr_intblocked KREG_IDX(IntBlocked)
+#define kr_r_access KREG_IDX(SPC_JTAG_ACCESS_REG)
+
+/*
+ * per-port kernel registers. Access only with qib_read_kreg_port()
+ * or qib_write_kreg_port()
+ */
+#define krp_errclear KREG_IBPORT_IDX(ErrClear)
+#define krp_errmask KREG_IBPORT_IDX(ErrMask)
+#define krp_errstatus KREG_IBPORT_IDX(ErrStatus)
+#define krp_highprio_0 KREG_IBPORT_IDX(HighPriority0)
+#define krp_highprio_limit KREG_IBPORT_IDX(HighPriorityLimit)
+#define krp_hrtbt_guid KREG_IBPORT_IDX(HRTBT_GUID)
+#define krp_ib_pcsconfig KREG_IBPORT_IDX(IBPCSConfig)
+#define krp_ibcctrl_a KREG_IBPORT_IDX(IBCCtrlA)
+#define krp_ibcctrl_b KREG_IBPORT_IDX(IBCCtrlB)
+#define krp_ibcctrl_c KREG_IBPORT_IDX(IBCCtrlC)
+#define krp_ibcstatus_a KREG_IBPORT_IDX(IBCStatusA)
+#define krp_ibcstatus_b KREG_IBPORT_IDX(IBCStatusB)
+#define krp_txestatus KREG_IBPORT_IDX(TXEStatus)
+#define krp_lowprio_0 KREG_IBPORT_IDX(LowPriority0)
+#define krp_ncmodectrl KREG_IBPORT_IDX(IBNCModeCtrl)
+#define krp_partitionkey KREG_IBPORT_IDX(RcvPartitionKey)
+#define krp_psinterval KREG_IBPORT_IDX(PSInterval)
+#define krp_psstart KREG_IBPORT_IDX(PSStart)
+#define krp_psstat KREG_IBPORT_IDX(PSStat)
+#define krp_rcvbthqp KREG_IBPORT_IDX(RcvBTHQP)
+#define krp_rcvctrl KREG_IBPORT_IDX(RcvCtrl)
+#define krp_rcvpktledcnt KREG_IBPORT_IDX(RcvPktLEDCnt)
+#define krp_rcvqpmaptable KREG_IBPORT_IDX(RcvQPMapTableA)
+#define krp_rxcreditvl0 KREG_IBPORT_IDX(RxCreditVL0)
+#define krp_rxcreditvl15 (KREG_IBPORT_IDX(RxCreditVL0)+15)
+#define krp_sendcheckcontrol KREG_IBPORT_IDX(SendCheckControl)
+#define krp_sendctrl KREG_IBPORT_IDX(SendCtrl)
+#define krp_senddmabase KREG_IBPORT_IDX(SendDmaBase)
+#define krp_senddmabufmask0 KREG_IBPORT_IDX(SendDmaBufMask0)
+#define krp_senddmabufmask1 (KREG_IBPORT_IDX(SendDmaBufMask0) + 1)
+#define krp_senddmabufmask2 (KREG_IBPORT_IDX(SendDmaBufMask0) + 2)
+#define krp_senddmabuf_use0 KREG_IBPORT_IDX(SendDmaBufUsed0)
+#define krp_senddmabuf_use1 (KREG_IBPORT_IDX(SendDmaBufUsed0) + 1)
+#define krp_senddmabuf_use2 (KREG_IBPORT_IDX(SendDmaBufUsed0) + 2)
+#define krp_senddmadesccnt KREG_IBPORT_IDX(SendDmaDescCnt)
+#define krp_senddmahead KREG_IBPORT_IDX(SendDmaHead)
+#define krp_senddmaheadaddr KREG_IBPORT_IDX(SendDmaHeadAddr)
+#define krp_senddmaidlecnt KREG_IBPORT_IDX(SendDmaIdleCnt)
+#define krp_senddmalengen KREG_IBPORT_IDX(SendDmaLenGen)
+#define krp_senddmaprioritythld KREG_IBPORT_IDX(SendDmaPriorityThld)
+#define krp_senddmareloadcnt KREG_IBPORT_IDX(SendDmaReloadCnt)
+#define krp_senddmastatus KREG_IBPORT_IDX(SendDmaStatus)
+#define krp_senddmatail KREG_IBPORT_IDX(SendDmaTail)
+#define krp_sendhdrsymptom KREG_IBPORT_IDX(SendHdrErrSymptom)
+#define krp_sendslid KREG_IBPORT_IDX(SendIBSLIDAssign)
+#define krp_sendslidmask KREG_IBPORT_IDX(SendIBSLIDMask)
+#define krp_ibsdtestiftx KREG_IBPORT_IDX(IB_SDTEST_IF_TX)
+#define krp_adapt_dis_timer KREG_IBPORT_IDX(ADAPT_DISABLE_TIMER_THRESHOLD)
+#define krp_tx_deemph_override KREG_IBPORT_IDX(IBSD_TX_DEEMPHASIS_OVERRIDE)
+#define krp_serdesctrl KREG_IBPORT_IDX(IBSerdesCtrl)
+
+/*
+ * Per-context kernel registers. Access only with qib_read_kreg_ctxt()
+ * or qib_write_kreg_ctxt()
+ */
+#define krc_rcvhdraddr KREG_IDX(RcvHdrAddr0)
+#define krc_rcvhdrtailaddr KREG_IDX(RcvHdrTailAddr0)
+
+/*
+ * TID Flow table, per context. Reduces
+ * number of hdrq updates to one per flow (or on errors).
+ * context 0 and 1 share same memory, but have distinct
+ * addresses. Since for now, we never use expected sends
+ * on kernel contexts, we don't worry about that (we initialize
+ * those entries for ctxt 0/1 on driver load twice, for example).
+ */
+#define NUM_TIDFLOWS_CTXT 0x20 /* 0x20 per context; have to hardcode */
+#define ur_rcvflowtable (KREG_IDX(RcvTIDFlowTable0) - KREG_IDX(RcvHdrTail0))
+
+/* these are the error bits in the tid flows, and are W1C */
+#define TIDFLOW_ERRBITS ( \
+ (SYM_MASK(RcvTIDFlowTable0, GenMismatch) << \
+ SYM_LSB(RcvTIDFlowTable0, GenMismatch)) | \
+ (SYM_MASK(RcvTIDFlowTable0, SeqMismatch) << \
+ SYM_LSB(RcvTIDFlowTable0, SeqMismatch)))
+
+/* Most (not all) Counters are per-IBport.
+ * Requires LBIntCnt is at offset 0 in the group
+ */
+#define CREG_IDX(regname) \
+((QIB_7322_##regname##_0_OFFS - QIB_7322_LBIntCnt_OFFS) / sizeof(u64))
+
+#define crp_badformat CREG_IDX(RxVersionErrCnt)
+#define crp_err_rlen CREG_IDX(RxLenErrCnt)
+#define crp_erricrc CREG_IDX(RxICRCErrCnt)
+#define crp_errlink CREG_IDX(RxLinkMalformCnt)
+#define crp_errlpcrc CREG_IDX(RxLPCRCErrCnt)
+#define crp_errpkey CREG_IDX(RxPKeyMismatchCnt)
+#define crp_errvcrc CREG_IDX(RxVCRCErrCnt)
+#define crp_excessbufferovfl CREG_IDX(ExcessBufferOvflCnt)
+#define crp_iblinkdown CREG_IDX(IBLinkDownedCnt)
+#define crp_iblinkerrrecov CREG_IDX(IBLinkErrRecoveryCnt)
+#define crp_ibstatuschange CREG_IDX(IBStatusChangeCnt)
+#define crp_ibsymbolerr CREG_IDX(IBSymbolErrCnt)
+#define crp_invalidrlen CREG_IDX(RxMaxMinLenErrCnt)
+#define crp_locallinkintegrityerr CREG_IDX(LocalLinkIntegrityErrCnt)
+#define crp_pktrcv CREG_IDX(RxDataPktCnt)
+#define crp_pktrcvflowctrl CREG_IDX(RxFlowPktCnt)
+#define crp_pktsend CREG_IDX(TxDataPktCnt)
+#define crp_pktsendflow CREG_IDX(TxFlowPktCnt)
+#define crp_psrcvdatacount CREG_IDX(PSRcvDataCount)
+#define crp_psrcvpktscount CREG_IDX(PSRcvPktsCount)
+#define crp_psxmitdatacount CREG_IDX(PSXmitDataCount)
+#define crp_psxmitpktscount CREG_IDX(PSXmitPktsCount)
+#define crp_psxmitwaitcount CREG_IDX(PSXmitWaitCount)
+#define crp_rcvebp CREG_IDX(RxEBPCnt)
+#define crp_rcvflowctrlviol CREG_IDX(RxFlowCtrlViolCnt)
+#define crp_rcvovfl CREG_IDX(RxBufOvflCnt)
+#define crp_rxdlidfltr CREG_IDX(RxDlidFltrCnt)
+#define crp_rxdroppkt CREG_IDX(RxDroppedPktCnt)
+#define crp_rxotherlocalphyerr CREG_IDX(RxOtherLocalPhyErrCnt)
+#define crp_rxqpinvalidctxt CREG_IDX(RxQPInvalidContextCnt)
+#define crp_rxvlerr CREG_IDX(RxVlErrCnt)
+#define crp_sendstall CREG_IDX(TxFlowStallCnt)
+#define crp_txdroppedpkt CREG_IDX(TxDroppedPktCnt)
+#define crp_txhdrerr CREG_IDX(TxHeadersErrCnt)
+#define crp_txlenerr CREG_IDX(TxLenErrCnt)
+#define crp_txminmaxlenerr CREG_IDX(TxMaxMinLenErrCnt)
+#define crp_txsdmadesc CREG_IDX(TxSDmaDescCnt)
+#define crp_txunderrun CREG_IDX(TxUnderrunCnt)
+#define crp_txunsupvl CREG_IDX(TxUnsupVLErrCnt)
+#define crp_vl15droppedpkt CREG_IDX(RxVL15DroppedPktCnt)
+#define crp_wordrcv CREG_IDX(RxDwordCnt)
+#define crp_wordsend CREG_IDX(TxDwordCnt)
+#define crp_tx_creditstalls CREG_IDX(TxCreditUpToDateTimeOut)
+
+/* these are the (few) counters that are not port-specific */
+#define CREG_DEVIDX(regname) ((QIB_7322_##regname##_OFFS - \
+ QIB_7322_LBIntCnt_OFFS) / sizeof(u64))
+#define cr_base_egrovfl CREG_DEVIDX(RxP0HdrEgrOvflCnt)
+#define cr_lbint CREG_DEVIDX(LBIntCnt)
+#define cr_lbstall CREG_DEVIDX(LBFlowStallCnt)
+#define cr_pcieretrydiag CREG_DEVIDX(PcieRetryBufDiagQwordCnt)
+#define cr_rxtidflowdrop CREG_DEVIDX(RxTidFlowDropCnt)
+#define cr_tidfull CREG_DEVIDX(RxTIDFullErrCnt)
+#define cr_tidinvalid CREG_DEVIDX(RxTIDValidErrCnt)
+
+/* no chip register for # of IB ports supported, so define */
+#define NUM_IB_PORTS 2
+
+/* 1 VL15 buffer per hardware IB port, no register for this, so define */
+#define NUM_VL15_BUFS NUM_IB_PORTS
+
+/*
+ * Contexts 0 and 1 are special, and there is no chip register that
+ * defines this value, so we have to define it here.
+ * These are all allocated to either context 0 or 1 for single-port
+ * hardware configurations; otherwise each gets half.
+ */
+#define KCTXT0_EGRCNT 2048
+
+/* values for vl and port fields in PBC, 7322-specific */
+#define PBC_PORT_SEL_LSB 26
+#define PBC_PORT_SEL_RMASK 1
+#define PBC_VL_NUM_LSB 27
+#define PBC_VL_NUM_RMASK 7
+#define PBC_7322_VL15_SEND (1ULL << 63) /* pbc; VL15, no credit check */
+#define PBC_7322_VL15_SEND_CTRL (1ULL << 31) /* control version of same */
+
+static u8 ib_rate_to_delay[IB_RATE_120_GBPS + 1] = {
+ [IB_RATE_2_5_GBPS] = 16,
+ [IB_RATE_5_GBPS] = 8,
+ [IB_RATE_10_GBPS] = 4,
+ [IB_RATE_20_GBPS] = 2,
+ [IB_RATE_30_GBPS] = 2,
+ [IB_RATE_40_GBPS] = 1
+};
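+/*
+ * These multipliers are (roughly) inversely proportional to line rate,
+ * normalized so that 40 Gb/s maps to 1; they feed the send-rate throttling
+ * done in this chip's setpbc_control path for QPs whose static rate is
+ * slower than the port.
+ */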
+
+#define IBA7322_LINKSPEED_SHIFT SYM_LSB(IBCStatusA_0, LinkSpeedActive)
+#define IBA7322_LINKWIDTH_SHIFT SYM_LSB(IBCStatusA_0, LinkWidthActive)
+
+/* link training states, from IBC */
+#define IB_7322_LT_STATE_DISABLED 0x00
+#define IB_7322_LT_STATE_LINKUP 0x01
+#define IB_7322_LT_STATE_POLLACTIVE 0x02
+#define IB_7322_LT_STATE_POLLQUIET 0x03
+#define IB_7322_LT_STATE_SLEEPDELAY 0x04
+#define IB_7322_LT_STATE_SLEEPQUIET 0x05
+#define IB_7322_LT_STATE_CFGDEBOUNCE 0x08
+#define IB_7322_LT_STATE_CFGRCVFCFG 0x09
+#define IB_7322_LT_STATE_CFGWAITRMT 0x0a
+#define IB_7322_LT_STATE_CFGIDLE 0x0b
+#define IB_7322_LT_STATE_RECOVERRETRAIN 0x0c
+#define IB_7322_LT_STATE_TXREVLANES 0x0d
+#define IB_7322_LT_STATE_RECOVERWAITRMT 0x0e
+#define IB_7322_LT_STATE_RECOVERIDLE 0x0f
+#define IB_7322_LT_STATE_CFGENH 0x10
+#define IB_7322_LT_STATE_CFGTEST 0x11
+#define IB_7322_LT_STATE_CFGWAITRMTTEST 0x12
+#define IB_7322_LT_STATE_CFGWAITENH 0x13
+
+/* link state machine states from IBC */
+#define IB_7322_L_STATE_DOWN 0x0
+#define IB_7322_L_STATE_INIT 0x1
+#define IB_7322_L_STATE_ARM 0x2
+#define IB_7322_L_STATE_ACTIVE 0x3
+#define IB_7322_L_STATE_ACT_DEFER 0x4
+
+static const u8 qib_7322_physportstate[0x20] = {
+ [IB_7322_LT_STATE_DISABLED] = IB_PHYSPORTSTATE_DISABLED,
+ [IB_7322_LT_STATE_LINKUP] = IB_PHYSPORTSTATE_LINKUP,
+ [IB_7322_LT_STATE_POLLACTIVE] = IB_PHYSPORTSTATE_POLL,
+ [IB_7322_LT_STATE_POLLQUIET] = IB_PHYSPORTSTATE_POLL,
+ [IB_7322_LT_STATE_SLEEPDELAY] = IB_PHYSPORTSTATE_SLEEP,
+ [IB_7322_LT_STATE_SLEEPQUIET] = IB_PHYSPORTSTATE_SLEEP,
+ [IB_7322_LT_STATE_CFGDEBOUNCE] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [IB_7322_LT_STATE_CFGRCVFCFG] =
+ IB_PHYSPORTSTATE_CFG_TRAIN,
+ [IB_7322_LT_STATE_CFGWAITRMT] =
+ IB_PHYSPORTSTATE_CFG_TRAIN,
+ [IB_7322_LT_STATE_CFGIDLE] = IB_PHYSPORTSTATE_CFG_IDLE,
+ [IB_7322_LT_STATE_RECOVERRETRAIN] =
+ IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+ [IB_7322_LT_STATE_RECOVERWAITRMT] =
+ IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+ [IB_7322_LT_STATE_RECOVERIDLE] =
+ IB_PHYSPORTSTATE_LINK_ERR_RECOVER,
+ [IB_7322_LT_STATE_CFGENH] = IB_PHYSPORTSTATE_CFG_ENH,
+ [IB_7322_LT_STATE_CFGTEST] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [IB_7322_LT_STATE_CFGWAITRMTTEST] =
+ IB_PHYSPORTSTATE_CFG_TRAIN,
+ [IB_7322_LT_STATE_CFGWAITENH] =
+ IB_PHYSPORTSTATE_CFG_WAIT_ENH,
+ [0x14] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x15] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x16] = IB_PHYSPORTSTATE_CFG_TRAIN,
+ [0x17] = IB_PHYSPORTSTATE_CFG_TRAIN
+};
+
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+struct qib_irq_notify {
+ int rcv;
+ void *arg;
+ struct irq_affinity_notify notify;
+};
+#endif
+
+struct qib_chip_specific {
+ u64 __iomem *cregbase;
+ u64 *cntrs;
+ spinlock_t rcvmod_lock; /* protect rcvctrl shadow changes */
+ spinlock_t gpio_lock; /* RMW of shadows/regs for ExtCtrl and GPIO */
+ u64 main_int_mask; /* clear bits which have dedicated handlers */
+ u64 int_enable_mask; /* for per port interrupts in single port mode */
+ u64 errormask;
+ u64 hwerrmask;
+ u64 gpio_out; /* shadow of kr_gpio_out, for rmw ops */
+ u64 gpio_mask; /* shadow the gpio mask register */
+ u64 extctrl; /* shadow the gpio output enable, etc... */
+ u32 ncntrs;
+ u32 nportcntrs;
+ u32 cntrnamelen;
+ u32 portcntrnamelen;
+ u32 numctxts;
+ u32 rcvegrcnt;
+ u32 updthresh; /* current AvailUpdThld */
+ u32 updthresh_dflt; /* default AvailUpdThld */
+ u32 r1;
+ int irq;
+ u32 num_msix_entries;
+ u32 sdmabufcnt;
+ u32 lastbuf_for_pio;
+ u32 stay_in_freeze;
+ u32 recovery_ports_initted;
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+ u32 dca_ctrl;
+ int rhdr_cpu[18];
+ int sdma_cpu[2];
+ u64 dca_rcvhdr_ctrl[5]; /* B, C, D, E, F */
+#endif
+ struct qib_msix_entry *msix_entries;
+ unsigned long *sendchkenable;
+ unsigned long *sendgrhchk;
+ unsigned long *sendibchk;
+ u32 rcvavail_timeout[18];
+ char emsgbuf[128]; /* for device error interrupt msg buffer */
+};
+
+/* Table of Tx Emphasis entries in "human readable" form. */
+struct txdds_ent {
+ u8 amp;
+ u8 pre;
+ u8 main;
+ u8 post;
+};
+
+struct vendor_txdds_ent {
+ u8 oui[QSFP_VOUI_LEN];
+ u8 *partnum;
+ struct txdds_ent sdr;
+ struct txdds_ent ddr;
+ struct txdds_ent qdr;
+};
+
+static void write_tx_serdes_param(struct qib_pportdata *, struct txdds_ent *);
+
+#define TXDDS_TABLE_SZ 16 /* number of entries per speed in onchip table */
+#define TXDDS_EXTRA_SZ 18 /* number of extra tx settings entries */
+#define TXDDS_MFG_SZ 2 /* number of mfg tx settings entries */
+#define SERDES_CHANS 4 /* yes, it's obvious, but one less magic number */
+
+#define H1_FORCE_VAL 8
+#define H1_FORCE_QME 1 /* may be overridden via setup_txselect() */
+#define H1_FORCE_QMH 7 /* may be overridden via setup_txselect() */
+
+/* The static and dynamic registers are paired, and the pairs indexed by spd */
+#define krp_static_adapt_dis(spd) (KREG_IBPORT_IDX(ADAPT_DISABLE_STATIC_SDR) \
+ + ((spd) * 2))
+
+#define QDR_DFE_DISABLE_DELAY 4000 /* msec after LINKUP */
+#define QDR_STATIC_ADAPT_DOWN 0xf0f0f0f0ULL /* link down, H1-H4 QDR adapts */
+#define QDR_STATIC_ADAPT_DOWN_R1 0ULL /* r1 link down, H1-H4 QDR adapts */
+#define QDR_STATIC_ADAPT_INIT 0xffffffffffULL /* up, disable H0,H1-8, LE */
+#define QDR_STATIC_ADAPT_INIT_R1 0xf0ffffffffULL /* r1 up, disable H0,H1-8 */
+
+struct qib_chippport_specific {
+ u64 __iomem *kpregbase;
+ u64 __iomem *cpregbase;
+ u64 *portcntrs;
+ struct qib_pportdata *ppd;
+ wait_queue_head_t autoneg_wait;
+ struct delayed_work autoneg_work;
+ struct delayed_work ipg_work;
+ struct timer_list chase_timer;
+ /*
+ * these 5 fields are used to establish deltas for IB symbol
+	 * errors and link recovery errors. They can be reported on
+ * some chips during link negotiation prior to INIT, and with
+ * DDR when faking DDR negotiations with non-IBTA switches.
+ * The chip counters are adjusted at driver unload if there is
+ * a non-zero delta.
+ */
+ u64 ibdeltainprog;
+ u64 ibsymdelta;
+ u64 ibsymsnap;
+ u64 iblnkerrdelta;
+ u64 iblnkerrsnap;
+ u64 iblnkdownsnap;
+ u64 iblnkdowndelta;
+ u64 ibmalfdelta;
+ u64 ibmalfsnap;
+ u64 ibcctrl_a; /* krp_ibcctrl_a shadow */
+ u64 ibcctrl_b; /* krp_ibcctrl_b shadow */
+ unsigned long qdr_dfe_time;
+ unsigned long chase_end;
+ u32 autoneg_tries;
+ u32 recovery_init;
+ u32 qdr_dfe_on;
+ u32 qdr_reforce;
+ /*
+ * Per-bay per-channel rcv QMH H1 values and Tx values for QDR.
+ * entry zero is unused, to simplify indexing
+ */
+ u8 h1_val;
+ u8 no_eep; /* txselect table index to use if no qsfp info */
+ u8 ipg_tries;
+ u8 ibmalfusesnap;
+ struct qib_qsfp_data qsfp_data;
+ char epmsgbuf[192]; /* for port error interrupt msg buffer */
+ char sdmamsgbuf[192]; /* for per-port sdma error messages */
+};
+
+static struct {
+ const char *name;
+ irq_handler_t handler;
+ int lsb;
+ int port; /* 0 if not port-specific, else port # */
+ int dca;
+} irq_table[] = {
+	{ "", qib_7322intr, -1, 0, 0 },
+	{ " (buf avail)", qib_7322bufavail,
+		SYM_LSB(IntStatus, SendBufAvail), 0, 0 },
+	{ " (sdma 0)", sdma_intr,
+		SYM_LSB(IntStatus, SDmaInt_0), 1, 1 },
+	{ " (sdma 1)", sdma_intr,
+		SYM_LSB(IntStatus, SDmaInt_1), 2, 1 },
+	{ " (sdmaI 0)", sdma_idle_intr,
+		SYM_LSB(IntStatus, SDmaIdleInt_0), 1, 1 },
+	{ " (sdmaI 1)", sdma_idle_intr,
+		SYM_LSB(IntStatus, SDmaIdleInt_1), 2, 1 },
+	{ " (sdmaP 0)", sdma_progress_intr,
+		SYM_LSB(IntStatus, SDmaProgressInt_0), 1, 1 },
+	{ " (sdmaP 1)", sdma_progress_intr,
+		SYM_LSB(IntStatus, SDmaProgressInt_1), 2, 1 },
+	{ " (sdmaC 0)", sdma_cleanup_intr,
+		SYM_LSB(IntStatus, SDmaCleanupDone_0), 1, 0 },
+	{ " (sdmaC 1)", sdma_cleanup_intr,
+		SYM_LSB(IntStatus, SDmaCleanupDone_1), 2, 0 },
+};
+
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+
+static const struct dca_reg_map {
+ int shadow_inx;
+ int lsb;
+ u64 mask;
+ u16 regno;
+} dca_rcvhdr_reg_map[] = {
+	{ 0, SYM_LSB(DCACtrlB, RcvHdrq0DCAOPH),
+	  ~SYM_MASK(DCACtrlB, RcvHdrq0DCAOPH), KREG_IDX(DCACtrlB) },
+	{ 0, SYM_LSB(DCACtrlB, RcvHdrq1DCAOPH),
+	  ~SYM_MASK(DCACtrlB, RcvHdrq1DCAOPH), KREG_IDX(DCACtrlB) },
+	{ 0, SYM_LSB(DCACtrlB, RcvHdrq2DCAOPH),
+	  ~SYM_MASK(DCACtrlB, RcvHdrq2DCAOPH), KREG_IDX(DCACtrlB) },
+	{ 0, SYM_LSB(DCACtrlB, RcvHdrq3DCAOPH),
+	  ~SYM_MASK(DCACtrlB, RcvHdrq3DCAOPH), KREG_IDX(DCACtrlB) },
+	{ 1, SYM_LSB(DCACtrlC, RcvHdrq4DCAOPH),
+	  ~SYM_MASK(DCACtrlC, RcvHdrq4DCAOPH), KREG_IDX(DCACtrlC) },
+	{ 1, SYM_LSB(DCACtrlC, RcvHdrq5DCAOPH),
+	  ~SYM_MASK(DCACtrlC, RcvHdrq5DCAOPH), KREG_IDX(DCACtrlC) },
+	{ 1, SYM_LSB(DCACtrlC, RcvHdrq6DCAOPH),
+	  ~SYM_MASK(DCACtrlC, RcvHdrq6DCAOPH), KREG_IDX(DCACtrlC) },
+	{ 1, SYM_LSB(DCACtrlC, RcvHdrq7DCAOPH),
+	  ~SYM_MASK(DCACtrlC, RcvHdrq7DCAOPH), KREG_IDX(DCACtrlC) },
+	{ 2, SYM_LSB(DCACtrlD, RcvHdrq8DCAOPH),
+	  ~SYM_MASK(DCACtrlD, RcvHdrq8DCAOPH), KREG_IDX(DCACtrlD) },
+	{ 2, SYM_LSB(DCACtrlD, RcvHdrq9DCAOPH),
+	  ~SYM_MASK(DCACtrlD, RcvHdrq9DCAOPH), KREG_IDX(DCACtrlD) },
+	{ 2, SYM_LSB(DCACtrlD, RcvHdrq10DCAOPH),
+	  ~SYM_MASK(DCACtrlD, RcvHdrq10DCAOPH), KREG_IDX(DCACtrlD) },
+	{ 2, SYM_LSB(DCACtrlD, RcvHdrq11DCAOPH),
+	  ~SYM_MASK(DCACtrlD, RcvHdrq11DCAOPH), KREG_IDX(DCACtrlD) },
+	{ 3, SYM_LSB(DCACtrlE, RcvHdrq12DCAOPH),
+	  ~SYM_MASK(DCACtrlE, RcvHdrq12DCAOPH), KREG_IDX(DCACtrlE) },
+	{ 3, SYM_LSB(DCACtrlE, RcvHdrq13DCAOPH),
+	  ~SYM_MASK(DCACtrlE, RcvHdrq13DCAOPH), KREG_IDX(DCACtrlE) },
+	{ 3, SYM_LSB(DCACtrlE, RcvHdrq14DCAOPH),
+	  ~SYM_MASK(DCACtrlE, RcvHdrq14DCAOPH), KREG_IDX(DCACtrlE) },
+	{ 3, SYM_LSB(DCACtrlE, RcvHdrq15DCAOPH),
+	  ~SYM_MASK(DCACtrlE, RcvHdrq15DCAOPH), KREG_IDX(DCACtrlE) },
+	{ 4, SYM_LSB(DCACtrlF, RcvHdrq16DCAOPH),
+	  ~SYM_MASK(DCACtrlF, RcvHdrq16DCAOPH), KREG_IDX(DCACtrlF) },
+	{ 4, SYM_LSB(DCACtrlF, RcvHdrq17DCAOPH),
+	  ~SYM_MASK(DCACtrlF, RcvHdrq17DCAOPH), KREG_IDX(DCACtrlF) },
+};
+#endif
+
+/* ibcctrl bits */
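+/* leave the link disabled */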
+#define QLOGIC_IB_IBCC_LINKINITCMD_DISABLE 1
+/* cycle through TS1/TS2 till OK */
+#define QLOGIC_IB_IBCC_LINKINITCMD_POLL 2
+/* wait for TS1, then go on */
+#define QLOGIC_IB_IBCC_LINKINITCMD_SLEEP 3
+#define QLOGIC_IB_IBCC_LINKINITCMD_SHIFT 16
+
+#define QLOGIC_IB_IBCC_LINKCMD_DOWN 1 /* move to 0x11 */
+#define QLOGIC_IB_IBCC_LINKCMD_ARMED 2 /* move to 0x21 */
+#define QLOGIC_IB_IBCC_LINKCMD_ACTIVE 3 /* move to 0x31 */
+
+#define BLOB_7322_IBCHG 0x101
+
+static inline void qib_write_kreg(const struct qib_devdata *dd,
+ const u32 regno, u64 value);
+static inline u32 qib_read_kreg32(const struct qib_devdata *, const u32);
+static void write_7322_initregs(struct qib_devdata *);
+static void write_7322_init_portregs(struct qib_pportdata *);
+static void setup_7322_link_recovery(struct qib_pportdata *, u32);
+static void check_7322_rxe_status(struct qib_pportdata *);
+static u32 __iomem *qib_7322_getsendbuf(struct qib_pportdata *, u64, u32 *);
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+static void qib_setup_dca(struct qib_devdata *dd);
+static void setup_dca_notifier(struct qib_devdata *dd,
+ struct qib_msix_entry *m);
+static void reset_dca_notifier(struct qib_devdata *dd,
+ struct qib_msix_entry *m);
+#endif
+
+/**
+ * qib_read_ureg32 - read 32-bit virtualized per-context register
+ * @dd: device
+ * @regno: register number
+ * @ctxt: context number
+ *
+ * Return the contents of a register that is virtualized to be per context.
+ * Returns 0 if the device is not present (not distinguishable from valid
+ * contents at runtime; we may add a separate error variable at some point).
+ */
+static inline u32 qib_read_ureg32(const struct qib_devdata *dd,
+ enum qib_ureg regno, int ctxt)
+{
+ if (!dd->kregbase || !(dd->flags & QIB_PRESENT))
+ return 0;
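+	/*
+	 * Per-context register space starts at userbase when it is mapped,
+	 * else at kregbase + uregbase, with one ureg_align stride per context.
+	 */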
+ return readl(regno + (u64 __iomem *)(
+ (dd->ureg_align * ctxt) + (dd->userbase ?
+ (char __iomem *)dd->userbase :
+ (char __iomem *)dd->kregbase + dd->uregbase)));
+}
+
+/**
+ * qib_read_ureg - read virtualized per-context register
+ * @dd: device
+ * @regno: register number
+ * @ctxt: context number
+ *
+ * Return the contents of a register that is virtualized to be per context.
+ * Returns 0 if the device is not present (not distinguishable from valid
+ * contents at runtime; we may add a separate error variable at some point).
+ */
+static inline u64 qib_read_ureg(const struct qib_devdata *dd,
+ enum qib_ureg regno, int ctxt)
+{
+
+ if (!dd->kregbase || !(dd->flags & QIB_PRESENT))
+ return 0;
+ return readq(regno + (u64 __iomem *)(
+ (dd->ureg_align * ctxt) + (dd->userbase ?
+ (char __iomem *)dd->userbase :
+ (char __iomem *)dd->kregbase + dd->uregbase)));
+}
+
+/**
+ * qib_write_ureg - write virtualized per-context register
+ * @dd: device
+ * @regno: register number
+ * @value: value
+ * @ctxt: context
+ *
+ * Write the contents of a register that is virtualized to be per context.
+ */
+static inline void qib_write_ureg(const struct qib_devdata *dd,
+ enum qib_ureg regno, u64 value, int ctxt)
+{
+ u64 __iomem *ubase;
+
+ if (dd->userbase)
+ ubase = (u64 __iomem *)
+ ((char __iomem *) dd->userbase +
+ dd->ureg_align * ctxt);
+ else
+ ubase = (u64 __iomem *)
+ (dd->uregbase +
+ (char __iomem *) dd->kregbase +
+ dd->ureg_align * ctxt);
+
+ if (dd->kregbase && (dd->flags & QIB_PRESENT))
+ writeq(value, &ubase[regno]);
+}
+
+static inline u32 qib_read_kreg32(const struct qib_devdata *dd,
+ const u32 regno)
+{
+ if (!dd->kregbase || !(dd->flags & QIB_PRESENT))
+ return -1;
+ return readl((u32 __iomem *) &dd->kregbase[regno]);
+}
+
+static inline u64 qib_read_kreg64(const struct qib_devdata *dd,
+ const u32 regno)
+{
+ if (!dd->kregbase || !(dd->flags & QIB_PRESENT))
+ return -1;
+ return readq(&dd->kregbase[regno]);
+}
+
+static inline void qib_write_kreg(const struct qib_devdata *dd,
+ const u32 regno, u64 value)
+{
+ if (dd->kregbase && (dd->flags & QIB_PRESENT))
+ writeq(value, &dd->kregbase[regno]);
+}
+
+/*
+ * Not many sanity checks for the port-specific kernel register routines,
+ * since they are only used when it's known to be safe.
+ */
+static inline u64 qib_read_kreg_port(const struct qib_pportdata *ppd,
+ const u16 regno)
+{
+ if (!ppd->cpspec->kpregbase || !(ppd->dd->flags & QIB_PRESENT))
+ return 0ULL;
+ return readq(&ppd->cpspec->kpregbase[regno]);
+}
+
+static inline void qib_write_kreg_port(const struct qib_pportdata *ppd,
+ const u16 regno, u64 value)
+{
+ if (ppd->cpspec && ppd->dd && ppd->cpspec->kpregbase &&
+ (ppd->dd->flags & QIB_PRESENT))
+ writeq(value, &ppd->cpspec->kpregbase[regno]);
+}
+
+/**
+ * qib_write_kreg_ctxt - write a device's per-ctxt 64-bit kernel register
+ * @dd: the qlogic_ib device
+ * @regno: the register number to write
+ * @ctxt: the context containing the register
+ * @value: the value to write
+ */
+static inline void qib_write_kreg_ctxt(const struct qib_devdata *dd,
+ const u16 regno, unsigned ctxt,
+ u64 value)
+{
+ qib_write_kreg(dd, regno + ctxt, value);
+}
+
+static inline u64 read_7322_creg(const struct qib_devdata *dd, u16 regno)
+{
+	if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT))
+		return 0;
+	return readq(&dd->cspec->cregbase[regno]);
+}
+
+static inline u32 read_7322_creg32(const struct qib_devdata *dd, u16 regno)
+{
+	if (!dd->cspec->cregbase || !(dd->flags & QIB_PRESENT))
+		return 0;
+	return readl(&dd->cspec->cregbase[regno]);
+}
+
+static inline void write_7322_creg_port(const struct qib_pportdata *ppd,
+ u16 regno, u64 value)
+{
+ if (ppd->cpspec && ppd->cpspec->cpregbase &&
+ (ppd->dd->flags & QIB_PRESENT))
+ writeq(value, &ppd->cpspec->cpregbase[regno]);
+}
+
+static inline u64 read_7322_creg_port(const struct qib_pportdata *ppd,
+ u16 regno)
+{
+ if (!ppd->cpspec || !ppd->cpspec->cpregbase ||
+ !(ppd->dd->flags & QIB_PRESENT))
+ return 0;
+ return readq(&ppd->cpspec->cpregbase[regno]);
+}
+
+static inline u32 read_7322_creg32_port(const struct qib_pportdata *ppd,
+ u16 regno)
+{
+ if (!ppd->cpspec || !ppd->cpspec->cpregbase ||
+ !(ppd->dd->flags & QIB_PRESENT))
+ return 0;
+ return readl(&ppd->cpspec->cpregbase[regno]);
+}
+
+/* bits in Control register */
+#define QLOGIC_IB_C_RESET SYM_MASK(Control, SyncReset)
+#define QLOGIC_IB_C_SDMAFETCHPRIOEN SYM_MASK(Control, SDmaDescFetchPriorityEn)
+
+/* bits in general interrupt regs */
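+/* MASK_ACROSS(0, 17) spans the chip's 18 receive contexts */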
+#define QIB_I_RCVURG_LSB SYM_LSB(IntMask, RcvUrg0IntMask)
+#define QIB_I_RCVURG_RMASK MASK_ACROSS(0, 17)
+#define QIB_I_RCVURG_MASK (QIB_I_RCVURG_RMASK << QIB_I_RCVURG_LSB)
+#define QIB_I_RCVAVAIL_LSB SYM_LSB(IntMask, RcvAvail0IntMask)
+#define QIB_I_RCVAVAIL_RMASK MASK_ACROSS(0, 17)
+#define QIB_I_RCVAVAIL_MASK (QIB_I_RCVAVAIL_RMASK << QIB_I_RCVAVAIL_LSB)
+#define QIB_I_C_ERROR INT_MASK(Err)
+
+#define QIB_I_SPIOSENT (INT_MASK_P(SendDone, 0) | INT_MASK_P(SendDone, 1))
+#define QIB_I_SPIOBUFAVAIL INT_MASK(SendBufAvail)
+#define QIB_I_GPIO INT_MASK(AssertGPIO)
+#define QIB_I_P_SDMAINT(pidx) \
+ (INT_MASK_P(SDma, pidx) | INT_MASK_P(SDmaIdle, pidx) | \
+ INT_MASK_P(SDmaProgress, pidx) | \
+ INT_MASK_PM(SDmaCleanupDone, pidx))
+
+/* Interrupt bits that are "per port" */
+#define QIB_I_P_BITSEXTANT(pidx) \
+ (INT_MASK_P(Err, pidx) | INT_MASK_P(SendDone, pidx) | \
+ INT_MASK_P(SDma, pidx) | INT_MASK_P(SDmaIdle, pidx) | \
+ INT_MASK_P(SDmaProgress, pidx) | \
+ INT_MASK_PM(SDmaCleanupDone, pidx))
+
+/* Interrupt bits that are common to a device */
+/* currently unused: QIB_I_SPIOSENT */
+#define QIB_I_C_BITSEXTANT \
+ (QIB_I_RCVURG_MASK | QIB_I_RCVAVAIL_MASK | \
+ QIB_I_SPIOSENT | \
+ QIB_I_C_ERROR | QIB_I_SPIOBUFAVAIL | QIB_I_GPIO)
+
+#define QIB_I_BITSEXTANT (QIB_I_C_BITSEXTANT | \
+ QIB_I_P_BITSEXTANT(0) | QIB_I_P_BITSEXTANT(1))
+
+/*
+ * Error bits that are "per port".
+ */
+#define QIB_E_P_IBSTATUSCHANGED ERR_MASK_N(IBStatusChanged)
+#define QIB_E_P_SHDR ERR_MASK_N(SHeadersErr)
+#define QIB_E_P_VL15_BUF_MISUSE ERR_MASK_N(VL15BufMisuseErr)
+#define QIB_E_P_SND_BUF_MISUSE ERR_MASK_N(SendBufMisuseErr)
+#define QIB_E_P_SUNSUPVL ERR_MASK_N(SendUnsupportedVLErr)
+#define QIB_E_P_SUNEXP_PKTNUM ERR_MASK_N(SendUnexpectedPktNumErr)
+#define QIB_E_P_SDROP_DATA ERR_MASK_N(SendDroppedDataPktErr)
+#define QIB_E_P_SDROP_SMP ERR_MASK_N(SendDroppedSmpPktErr)
+#define QIB_E_P_SPKTLEN ERR_MASK_N(SendPktLenErr)
+#define QIB_E_P_SUNDERRUN ERR_MASK_N(SendUnderRunErr)
+#define QIB_E_P_SMAXPKTLEN ERR_MASK_N(SendMaxPktLenErr)
+#define QIB_E_P_SMINPKTLEN ERR_MASK_N(SendMinPktLenErr)
+#define QIB_E_P_RIBLOSTLINK ERR_MASK_N(RcvIBLostLinkErr)
+#define QIB_E_P_RHDR ERR_MASK_N(RcvHdrErr)
+#define QIB_E_P_RHDRLEN ERR_MASK_N(RcvHdrLenErr)
+#define QIB_E_P_RBADTID ERR_MASK_N(RcvBadTidErr)
+#define QIB_E_P_RBADVERSION ERR_MASK_N(RcvBadVersionErr)
+#define QIB_E_P_RIBFLOW ERR_MASK_N(RcvIBFlowErr)
+#define QIB_E_P_REBP ERR_MASK_N(RcvEBPErr)
+#define QIB_E_P_RUNSUPVL ERR_MASK_N(RcvUnsupportedVLErr)
+#define QIB_E_P_RUNEXPCHAR ERR_MASK_N(RcvUnexpectedCharErr)
+#define QIB_E_P_RSHORTPKTLEN ERR_MASK_N(RcvShortPktLenErr)
+#define QIB_E_P_RLONGPKTLEN ERR_MASK_N(RcvLongPktLenErr)
+#define QIB_E_P_RMAXPKTLEN ERR_MASK_N(RcvMaxPktLenErr)
+#define QIB_E_P_RMINPKTLEN ERR_MASK_N(RcvMinPktLenErr)
+#define QIB_E_P_RICRC ERR_MASK_N(RcvICRCErr)
+#define QIB_E_P_RVCRC ERR_MASK_N(RcvVCRCErr)
+#define QIB_E_P_RFORMATERR ERR_MASK_N(RcvFormatErr)
+
+#define QIB_E_P_SDMA1STDESC ERR_MASK_N(SDma1stDescErr)
+#define QIB_E_P_SDMABASE ERR_MASK_N(SDmaBaseErr)
+#define QIB_E_P_SDMADESCADDRMISALIGN ERR_MASK_N(SDmaDescAddrMisalignErr)
+#define QIB_E_P_SDMADWEN ERR_MASK_N(SDmaDwEnErr)
+#define QIB_E_P_SDMAGENMISMATCH ERR_MASK_N(SDmaGenMismatchErr)
+#define QIB_E_P_SDMAHALT ERR_MASK_N(SDmaHaltErr)
+#define QIB_E_P_SDMAMISSINGDW ERR_MASK_N(SDmaMissingDwErr)
+#define QIB_E_P_SDMAOUTOFBOUND ERR_MASK_N(SDmaOutOfBoundErr)
+#define QIB_E_P_SDMARPYTAG ERR_MASK_N(SDmaRpyTagErr)
+#define QIB_E_P_SDMATAILOUTOFBOUND ERR_MASK_N(SDmaTailOutOfBoundErr)
+#define QIB_E_P_SDMAUNEXPDATA ERR_MASK_N(SDmaUnexpDataErr)
+
+/* Error bits that are common to a device */
+#define QIB_E_RESET ERR_MASK(ResetNegated)
+#define QIB_E_HARDWARE ERR_MASK(HardwareErr)
+#define QIB_E_INVALIDADDR ERR_MASK(InvalidAddrErr)
+
+
+/*
+ * Per-chip (rather than per-port) errors. Most of these either do
+ * nothing but trigger a print (because they self-recover, or
+ * always occur in tandem with other errors that handle the
+ * issue), or indicate errors with no recovery, where we still
+ * want to know that they happened.
+ */
+#define QIB_E_SBUF_VL15_MISUSE ERR_MASK(SBufVL15MisUseErr)
+#define QIB_E_BADEEP ERR_MASK(InvalidEEPCmd)
+#define QIB_E_VLMISMATCH ERR_MASK(SendVLMismatchErr)
+#define QIB_E_ARMLAUNCH ERR_MASK(SendArmLaunchErr)
+#define QIB_E_SPCLTRIG ERR_MASK(SendSpecialTriggerErr)
+#define QIB_E_RRCVHDRFULL ERR_MASK(RcvHdrFullErr)
+#define QIB_E_RRCVEGRFULL ERR_MASK(RcvEgrFullErr)
+#define QIB_E_RCVCTXTSHARE ERR_MASK(RcvContextShareErr)
+
+/* SDMA chip errors (not per port)
+ * QIB_E_SDMA_BUF_DUP needs no special handling, because we will also get
+ * the SDMAHALT error immediately, so we just print the dup error via the
+ * E_AUTO mechanism. This is true of most of the per-port fatal errors
+ * as well, but since this is port-independent, by definition, it's
+ * handled a bit differently. SDMA_VL15 and SDMA_WRONG_PORT are per
+ * packet send errors, and so are handled in the same manner as other
+ * per-packet errors.
+ */
+#define QIB_E_SDMA_VL15 ERR_MASK(SDmaVL15Err)
+#define QIB_E_SDMA_WRONG_PORT ERR_MASK(SDmaWrongPortErr)
+#define QIB_E_SDMA_BUF_DUP ERR_MASK(SDmaBufMaskDuplicateErr)
+
+/*
+ * Below is functionally equivalent to the legacy QLOGIC_IB_E_PKTERRS;
+ * it is used to print "common" packet errors.
+ */
+#define QIB_E_P_PKTERRS (QIB_E_P_SPKTLEN |\
+ QIB_E_P_SDROP_DATA | QIB_E_P_RVCRC |\
+ QIB_E_P_RICRC | QIB_E_P_RSHORTPKTLEN |\
+ QIB_E_P_VL15_BUF_MISUSE | QIB_E_P_SHDR | \
+ QIB_E_P_REBP)
+
+/* Error bits that are packet-related (Receive, per-port) */
+#define QIB_E_P_RPKTERRS (\
+ QIB_E_P_RHDRLEN | QIB_E_P_RBADTID | \
+ QIB_E_P_RBADVERSION | QIB_E_P_RHDR | \
+ QIB_E_P_RLONGPKTLEN | QIB_E_P_RSHORTPKTLEN |\
+ QIB_E_P_RMAXPKTLEN | QIB_E_P_RMINPKTLEN | \
+ QIB_E_P_RFORMATERR | QIB_E_P_RUNSUPVL | \
+ QIB_E_P_RUNEXPCHAR | QIB_E_P_RIBFLOW | QIB_E_P_REBP)
+
+/*
+ * Error bits that are Send-related (per port)
+ * (ARMLAUNCH excluded from E_SPKTERRS because it gets special handling).
+ * All of these potentially need to have a buffer disarmed
+ */
+#define QIB_E_P_SPKTERRS (\
+ QIB_E_P_SUNEXP_PKTNUM |\
+ QIB_E_P_SDROP_DATA | QIB_E_P_SDROP_SMP |\
+ QIB_E_P_SMAXPKTLEN |\
+ QIB_E_P_VL15_BUF_MISUSE | QIB_E_P_SHDR | \
+ QIB_E_P_SMINPKTLEN | QIB_E_P_SPKTLEN | \
+ QIB_E_P_SND_BUF_MISUSE | QIB_E_P_SUNSUPVL)
+
+#define QIB_E_SPKTERRS ( \
+ QIB_E_SBUF_VL15_MISUSE | QIB_E_VLMISMATCH | \
+ ERR_MASK_N(SendUnsupportedVLErr) | \
+ QIB_E_SPCLTRIG | QIB_E_SDMA_VL15 | QIB_E_SDMA_WRONG_PORT)
+
+#define QIB_E_P_SDMAERRS ( \
+ QIB_E_P_SDMAHALT | \
+ QIB_E_P_SDMADESCADDRMISALIGN | \
+ QIB_E_P_SDMAUNEXPDATA | \
+ QIB_E_P_SDMAMISSINGDW | \
+ QIB_E_P_SDMADWEN | \
+ QIB_E_P_SDMARPYTAG | \
+ QIB_E_P_SDMA1STDESC | \
+ QIB_E_P_SDMABASE | \
+ QIB_E_P_SDMATAILOUTOFBOUND | \
+ QIB_E_P_SDMAOUTOFBOUND | \
+ QIB_E_P_SDMAGENMISMATCH)
+
+/*
+ * This sets some bits more than once, but makes it more obvious which
+ * bits are not handled under other categories, and the repeat definition
+ * is not a problem.
+ */
+#define QIB_E_P_BITSEXTANT ( \
+ QIB_E_P_SPKTERRS | QIB_E_P_PKTERRS | QIB_E_P_RPKTERRS | \
+ QIB_E_P_RIBLOSTLINK | QIB_E_P_IBSTATUSCHANGED | \
+ QIB_E_P_SND_BUF_MISUSE | QIB_E_P_SUNDERRUN | \
+ QIB_E_P_SHDR | QIB_E_P_VL15_BUF_MISUSE | QIB_E_P_SDMAERRS \
+ )
+
+/*
+ * These are errors that can occur when the link
+ * changes state while a packet is being sent or received. This doesn't
+ * cover things like EBP or VCRC that can be the result of the sender
+ * having the link change state, so we receive a "known bad" packet.
+ * All of these are "per port", so renamed:
+ */
+#define QIB_E_P_LINK_PKTERRS (\
+ QIB_E_P_SDROP_DATA | QIB_E_P_SDROP_SMP |\
+ QIB_E_P_SMINPKTLEN | QIB_E_P_SPKTLEN |\
+ QIB_E_P_RSHORTPKTLEN | QIB_E_P_RMINPKTLEN |\
+ QIB_E_P_RUNEXPCHAR)
+
+/*
+ * This sets some bits more than once, but makes it more obvious which
+ * bits are not handled under other categories (such as QIB_E_SPKTERRS),
+ * and the repeat definition is not a problem.
+ */
+#define QIB_E_C_BITSEXTANT (\
+ QIB_E_HARDWARE | QIB_E_INVALIDADDR | QIB_E_BADEEP |\
+ QIB_E_ARMLAUNCH | QIB_E_VLMISMATCH | QIB_E_RRCVHDRFULL |\
+ QIB_E_RRCVEGRFULL | QIB_E_RESET | QIB_E_SBUF_VL15_MISUSE)
+
+/* Likewise, E_SPKT_ERRS_IGNORE is neutered (defined as 0) */
+#define E_SPKT_ERRS_IGNORE 0
+
+#define QIB_EXTS_MEMBIST_DISABLED \
+ SYM_MASK(EXTStatus, MemBISTDisabled)
+#define QIB_EXTS_MEMBIST_ENDTEST \
+ SYM_MASK(EXTStatus, MemBISTEndTest)
+
+#define QIB_E_SPIOARMLAUNCH \
+ ERR_MASK(SendArmLaunchErr)
+
+#define IBA7322_IBCC_LINKINITCMD_MASK SYM_RMASK(IBCCtrlA_0, LinkInitCmd)
+#define IBA7322_IBCC_LINKCMD_SHIFT SYM_LSB(IBCCtrlA_0, LinkCmd)
+
+/*
+ * IBTA_1_2 is set when multiple speeds are enabled (normal),
+ * and also if forced QDR (only QDR enabled). It's enabled for the
+ * forced QDR case so that scrambling will be enabled by the TS3
+ * exchange, when supported by both sides of the link.
+ */
+#define IBA7322_IBC_IBTA_1_2_MASK SYM_MASK(IBCCtrlB_0, IB_ENHANCED_MODE)
+#define IBA7322_IBC_MAX_SPEED_MASK SYM_MASK(IBCCtrlB_0, SD_SPEED)
+#define IBA7322_IBC_SPEED_QDR SYM_MASK(IBCCtrlB_0, SD_SPEED_QDR)
+#define IBA7322_IBC_SPEED_DDR SYM_MASK(IBCCtrlB_0, SD_SPEED_DDR)
+#define IBA7322_IBC_SPEED_SDR SYM_MASK(IBCCtrlB_0, SD_SPEED_SDR)
+#define IBA7322_IBC_SPEED_MASK (SYM_MASK(IBCCtrlB_0, SD_SPEED_SDR) | \
+ SYM_MASK(IBCCtrlB_0, SD_SPEED_DDR) | SYM_MASK(IBCCtrlB_0, SD_SPEED_QDR))
+#define IBA7322_IBC_SPEED_LSB SYM_LSB(IBCCtrlB_0, SD_SPEED_SDR)
+
+#define IBA7322_LEDBLINK_OFF_SHIFT SYM_LSB(RcvPktLEDCnt_0, OFFperiod)
+#define IBA7322_LEDBLINK_ON_SHIFT SYM_LSB(RcvPktLEDCnt_0, ONperiod)
+
+#define IBA7322_IBC_WIDTH_AUTONEG SYM_MASK(IBCCtrlB_0, IB_NUM_CHANNELS)
+#define IBA7322_IBC_WIDTH_4X_ONLY (1<<SYM_LSB(IBCCtrlB_0, IB_NUM_CHANNELS))
+#define IBA7322_IBC_WIDTH_1X_ONLY (0<<SYM_LSB(IBCCtrlB_0, IB_NUM_CHANNELS))
+
+#define IBA7322_IBC_RXPOL_MASK SYM_MASK(IBCCtrlB_0, IB_POLARITY_REV_SUPP)
+#define IBA7322_IBC_RXPOL_LSB SYM_LSB(IBCCtrlB_0, IB_POLARITY_REV_SUPP)
+#define IBA7322_IBC_HRTBT_MASK (SYM_MASK(IBCCtrlB_0, HRTBT_AUTO) | \
+ SYM_MASK(IBCCtrlB_0, HRTBT_ENB))
+#define IBA7322_IBC_HRTBT_RMASK (IBA7322_IBC_HRTBT_MASK >> \
+ SYM_LSB(IBCCtrlB_0, HRTBT_ENB))
+#define IBA7322_IBC_HRTBT_LSB SYM_LSB(IBCCtrlB_0, HRTBT_ENB)
+
+#define IBA7322_REDIRECT_VEC_PER_REG 12
+
+#define IBA7322_SENDCHK_PKEY SYM_MASK(SendCheckControl_0, PKey_En)
+#define IBA7322_SENDCHK_BTHQP SYM_MASK(SendCheckControl_0, BTHQP_En)
+#define IBA7322_SENDCHK_SLID SYM_MASK(SendCheckControl_0, SLID_En)
+#define IBA7322_SENDCHK_RAW_IPV6 SYM_MASK(SendCheckControl_0, RawIPV6_En)
+#define IBA7322_SENDCHK_MINSZ SYM_MASK(SendCheckControl_0, PacketTooSmall_En)
+
+#define AUTONEG_TRIES 3 /* sequential retries to negotiate DDR */
+
+#define HWE_AUTO(fldname) { .mask = SYM_MASK(HwErrMask, fldname##Mask), \
+ .msg = #fldname , .sz = sizeof(#fldname) }
+#define HWE_AUTO_P(fldname, port) { .mask = SYM_MASK(HwErrMask, \
+ fldname##Mask##_##port), .msg = #fldname , .sz = sizeof(#fldname) }
+static const struct qib_hwerror_msgs qib_7322_hwerror_msgs[] = {
+ HWE_AUTO_P(IBSerdesPClkNotDetect, 1),
+ HWE_AUTO_P(IBSerdesPClkNotDetect, 0),
+ HWE_AUTO(PCIESerdesPClkNotDetect),
+ HWE_AUTO(PowerOnBISTFailed),
+ HWE_AUTO(TempsenseTholdReached),
+ HWE_AUTO(MemoryErr),
+ HWE_AUTO(PCIeBusParityErr),
+ HWE_AUTO(PcieCplTimeout),
+ HWE_AUTO(PciePoisonedTLP),
+ HWE_AUTO_P(SDmaMemReadErr, 1),
+ HWE_AUTO_P(SDmaMemReadErr, 0),
+ HWE_AUTO_P(IBCBusFromSPCParityErr, 1),
+ HWE_AUTO_P(IBCBusToSPCParityErr, 1),
+ HWE_AUTO_P(IBCBusFromSPCParityErr, 0),
+ HWE_AUTO(statusValidNoEop),
+ HWE_AUTO(LATriggered),
+ { .mask = 0, .sz = 0 }
+};
+
+#define E_AUTO(fldname) { .mask = SYM_MASK(ErrMask, fldname##Mask), \
+ .msg = #fldname, .sz = sizeof(#fldname) }
+#define E_P_AUTO(fldname) { .mask = SYM_MASK(ErrMask_0, fldname##Mask), \
+ .msg = #fldname, .sz = sizeof(#fldname) }
+static const struct qib_hwerror_msgs qib_7322error_msgs[] = {
+ E_AUTO(RcvEgrFullErr),
+ E_AUTO(RcvHdrFullErr),
+ E_AUTO(ResetNegated),
+ E_AUTO(HardwareErr),
+ E_AUTO(InvalidAddrErr),
+ E_AUTO(SDmaVL15Err),
+ E_AUTO(SBufVL15MisUseErr),
+ E_AUTO(InvalidEEPCmd),
+ E_AUTO(RcvContextShareErr),
+ E_AUTO(SendVLMismatchErr),
+ E_AUTO(SendArmLaunchErr),
+ E_AUTO(SendSpecialTriggerErr),
+ E_AUTO(SDmaWrongPortErr),
+ E_AUTO(SDmaBufMaskDuplicateErr),
+ { .mask = 0, .sz = 0 }
+};
+
+static const struct qib_hwerror_msgs qib_7322p_error_msgs[] = {
+ E_P_AUTO(IBStatusChanged),
+ E_P_AUTO(SHeadersErr),
+ E_P_AUTO(VL15BufMisuseErr),
+	/*
+	 * SDmaHaltErr is not really an error; report it with a clearer message.
+	 */
+ {.mask = SYM_MASK(ErrMask_0, SDmaHaltErrMask), .msg = "SDmaHalted",
+ .sz = 11},
+ E_P_AUTO(SDmaDescAddrMisalignErr),
+ E_P_AUTO(SDmaUnexpDataErr),
+ E_P_AUTO(SDmaMissingDwErr),
+ E_P_AUTO(SDmaDwEnErr),
+ E_P_AUTO(SDmaRpyTagErr),
+ E_P_AUTO(SDma1stDescErr),
+ E_P_AUTO(SDmaBaseErr),
+ E_P_AUTO(SDmaTailOutOfBoundErr),
+ E_P_AUTO(SDmaOutOfBoundErr),
+ E_P_AUTO(SDmaGenMismatchErr),
+ E_P_AUTO(SendBufMisuseErr),
+ E_P_AUTO(SendUnsupportedVLErr),
+ E_P_AUTO(SendUnexpectedPktNumErr),
+ E_P_AUTO(SendDroppedDataPktErr),
+ E_P_AUTO(SendDroppedSmpPktErr),
+ E_P_AUTO(SendPktLenErr),
+ E_P_AUTO(SendUnderRunErr),
+ E_P_AUTO(SendMaxPktLenErr),
+ E_P_AUTO(SendMinPktLenErr),
+ E_P_AUTO(RcvIBLostLinkErr),
+ E_P_AUTO(RcvHdrErr),
+ E_P_AUTO(RcvHdrLenErr),
+ E_P_AUTO(RcvBadTidErr),
+ E_P_AUTO(RcvBadVersionErr),
+ E_P_AUTO(RcvIBFlowErr),
+ E_P_AUTO(RcvEBPErr),
+ E_P_AUTO(RcvUnsupportedVLErr),
+ E_P_AUTO(RcvUnexpectedCharErr),
+ E_P_AUTO(RcvShortPktLenErr),
+ E_P_AUTO(RcvLongPktLenErr),
+ E_P_AUTO(RcvMaxPktLenErr),
+ E_P_AUTO(RcvMinPktLenErr),
+ E_P_AUTO(RcvICRCErr),
+ E_P_AUTO(RcvVCRCErr),
+ E_P_AUTO(RcvFormatErr),
+ { .mask = 0, .sz = 0 }
+};
+
+/*
+ * Below generates "auto-message" for interrupts not specific to any port or
+ * context
+ */
+#define INTR_AUTO(fldname) { .mask = SYM_MASK(IntMask, fldname##Mask), \
+ .msg = #fldname, .sz = sizeof(#fldname) }
+/* Below generates "auto-message" for interrupts specific to a port */
+#define INTR_AUTO_P(fldname) { .mask = MASK_ACROSS(\
+ SYM_LSB(IntMask, fldname##Mask##_0), \
+ SYM_LSB(IntMask, fldname##Mask##_1)), \
+ .msg = #fldname "_P", .sz = sizeof(#fldname "_P") }
+/* For some reason, the SerDesTrimDone bits are reversed */
+#define INTR_AUTO_PI(fldname) { .mask = MASK_ACROSS(\
+ SYM_LSB(IntMask, fldname##Mask##_1), \
+ SYM_LSB(IntMask, fldname##Mask##_0)), \
+ .msg = #fldname "_P", .sz = sizeof(#fldname "_P") }
+/*
+ * Below generates "auto-message" for interrupts specific to a context,
+ * with ctxt-number appended
+ */
+#define INTR_AUTO_C(fldname) { .mask = MASK_ACROSS(\
+ SYM_LSB(IntMask, fldname##0IntMask), \
+ SYM_LSB(IntMask, fldname##17IntMask)), \
+ .msg = #fldname "_C", .sz = sizeof(#fldname "_C") }
+
+static const struct qib_hwerror_msgs qib_7322_intr_msgs[] = {
+ INTR_AUTO_P(SDmaInt),
+ INTR_AUTO_P(SDmaProgressInt),
+ INTR_AUTO_P(SDmaIdleInt),
+ INTR_AUTO_P(SDmaCleanupDone),
+ INTR_AUTO_C(RcvUrg),
+ INTR_AUTO_P(ErrInt),
+ INTR_AUTO(ErrInt), /* non-port-specific errs */
+ INTR_AUTO(AssertGPIOInt),
+ INTR_AUTO_P(SendDoneInt),
+ INTR_AUTO(SendBufAvailInt),
+ INTR_AUTO_C(RcvAvail),
+ { .mask = 0, .sz = 0 }
+};
+
+#define TXSYMPTOM_AUTO_P(fldname) \
+ { .mask = SYM_MASK(SendHdrErrSymptom_0, fldname), \
+ .msg = #fldname, .sz = sizeof(#fldname) }
+static const struct qib_hwerror_msgs hdrchk_msgs[] = {
+ TXSYMPTOM_AUTO_P(NonKeyPacket),
+ TXSYMPTOM_AUTO_P(GRHFail),
+ TXSYMPTOM_AUTO_P(PkeyFail),
+ TXSYMPTOM_AUTO_P(QPFail),
+ TXSYMPTOM_AUTO_P(SLIDFail),
+ TXSYMPTOM_AUTO_P(RawIPV6),
+ TXSYMPTOM_AUTO_P(PacketTooSmall),
+ { .mask = 0, .sz = 0 }
+};
+
+#define IBA7322_HDRHEAD_PKTINT_SHIFT 32 /* interrupt cnt in upper 32 bits */
+
+/*
+ * Called when we might have an error that is specific to a particular
+ * PIO buffer, and may need to cancel that buffer, so it can be re-used,
+ * and we don't need to force the update of pioavail.
+ */
+static void qib_disarm_7322_senderrbufs(struct qib_pportdata *ppd)
+{
+ struct qib_devdata *dd = ppd->dd;
+ u32 i;
+ int any;
+ u32 piobcnt = dd->piobcnt2k + dd->piobcnt4k + NUM_VL15_BUFS;
+ u32 regcnt = (piobcnt + BITS_PER_LONG - 1) / BITS_PER_LONG;
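+	/* shadow of the SendBufErr registers; the code assumes regcnt <= 4 */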
+ unsigned long sbuf[4];
+
+ /*
+ * It's possible that sendbuffererror could have bits set; might
+ * have already done this as a result of hardware error handling.
+ */
+ any = 0;
+ for (i = 0; i < regcnt; ++i) {
+ sbuf[i] = qib_read_kreg64(dd, kr_sendbuffererror + i);
+ if (sbuf[i]) {
+ any = 1;
+ qib_write_kreg(dd, kr_sendbuffererror + i, sbuf[i]);
+ }
+ }
+
+ if (any)
+ qib_disarm_piobufs_set(dd, sbuf, piobcnt);
+}
+
+/* No txe_recover yet, if ever */
+
+/* No decode__errors yet */
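+/*
+ * Decode the bits in "errs" into a comma-separated list of the matching
+ * message strings from "msp".  For a multi-bit mask, a "_<n>" suffix gives
+ * the offset of the set bit within the mask; any remaining unmatched bits
+ * are appended in hex after "MORE:".
+ */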
+static void err_decode(char *msg, size_t len, u64 errs,
+ const struct qib_hwerror_msgs *msp)
+{
+ u64 these, lmask;
+ int took, multi, n = 0;
+
+ while (errs && msp && msp->mask) {
+ multi = (msp->mask & (msp->mask - 1));
+ while (errs & msp->mask) {
+ these = (errs & msp->mask);
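+			/* lmask is the lowest set bit of "these" */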
+ lmask = (these & (these - 1)) ^ these;
+ if (len) {
+ if (n++) {
+ /* separate the strings */
+ *msg++ = ',';
+ len--;
+ }
+ BUG_ON(!msp->sz);
+ /* msp->sz counts the nul */
+ took = min_t(size_t, msp->sz - (size_t)1, len);
+ memcpy(msg, msp->msg, took);
+ len -= took;
+ msg += took;
+ if (len)
+ *msg = '\0';
+ }
+ errs &= ~lmask;
+ if (len && multi) {
+ /* More than one bit this mask */
+ int idx = -1;
+
+ while (lmask & msp->mask) {
+ ++idx;
+ lmask >>= 1;
+ }
+ took = scnprintf(msg, len, "_%d", idx);
+ len -= took;
+ msg += took;
+ }
+ }
+ ++msp;
+ }
+ /* If some bits are left, show in hex. */
+ if (len && errs)
+ snprintf(msg, len, "%sMORE:%llX", n ? "," : "",
+ (unsigned long long) errs);
+}
+
+/* only called if r1 set */
+static void flush_fifo(struct qib_pportdata *ppd)
+{
+ struct qib_devdata *dd = ppd->dd;
+ u32 __iomem *piobuf;
+ u32 bufn;
+ u32 *hdr;
+ u64 pbc;
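+	/* 7 header dwords: LRH (2) + BTH (3) + DETH (2) */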
+ const unsigned hdrwords = 7;
+ static struct qib_ib_header ibhdr = {
+ .lrh[0] = cpu_to_be16(0xF000 | QIB_LRH_BTH),
+ .lrh[1] = IB_LID_PERMISSIVE,
+ .lrh[2] = cpu_to_be16(hdrwords + SIZE_OF_CRC),
+ .lrh[3] = IB_LID_PERMISSIVE,
+ .u.oth.bth[0] = cpu_to_be32(
+ (IB_OPCODE_UD_SEND_ONLY << 24) | QIB_DEFAULT_P_KEY),
+ .u.oth.bth[1] = cpu_to_be32(0),
+ .u.oth.bth[2] = cpu_to_be32(0),
+ .u.oth.u.ud.deth[0] = cpu_to_be32(0),
+ .u.oth.u.ud.deth[1] = cpu_to_be32(0),
+ };
+
+ /*
+ * Send a dummy VL15 packet to flush the launch FIFO.
+ * This will not actually be sent since the TxeBypassIbc bit is set.
+ */
+ pbc = PBC_7322_VL15_SEND |
+ (((u64)ppd->hw_pidx) << (PBC_PORT_SEL_LSB + 32)) |
+ (hdrwords + SIZE_OF_CRC);
+ piobuf = qib_7322_getsendbuf(ppd, pbc, &bufn);
+ if (!piobuf)
+ return;
+ writeq(pbc, piobuf);
+ hdr = (u32 *) &ibhdr;
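+	/*
+	 * With write combining, copy all but the last header word, flush,
+	 * then write the last word separately, flushing before and after.
+	 */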
+ if (dd->flags & QIB_PIO_FLUSH_WC) {
+ qib_flush_wc();
+ qib_pio_copy(piobuf + 2, hdr, hdrwords - 1);
+ qib_flush_wc();
+ __raw_writel(hdr[hdrwords - 1], piobuf + hdrwords + 1);
+ qib_flush_wc();
+ } else
+ qib_pio_copy(piobuf + 2, hdr, hdrwords);
+ qib_sendbuf_done(dd, bufn);
+}
+
+/*
+ * This is called with interrupts disabled and sdma_lock held.
+ */
+static void qib_7322_sdma_sendctrl(struct qib_pportdata *ppd, unsigned op)
+{
+ struct qib_devdata *dd = ppd->dd;
+ u64 set_sendctrl = 0;
+ u64 clr_sendctrl = 0;
+
+ if (op & QIB_SDMA_SENDCTRL_OP_ENABLE)
+ set_sendctrl |= SYM_MASK(SendCtrl_0, SDmaEnable);
+ else
+ clr_sendctrl |= SYM_MASK(SendCtrl_0, SDmaEnable);
+
+ if (op & QIB_SDMA_SENDCTRL_OP_INTENABLE)
+ set_sendctrl |= SYM_MASK(SendCtrl_0, SDmaIntEnable);
+ else
+ clr_sendctrl |= SYM_MASK(SendCtrl_0, SDmaIntEnable);
+
+ if (op & QIB_SDMA_SENDCTRL_OP_HALT)
+ set_sendctrl |= SYM_MASK(SendCtrl_0, SDmaHalt);
+ else
+ clr_sendctrl |= SYM_MASK(SendCtrl_0, SDmaHalt);
+
+ if (op & QIB_SDMA_SENDCTRL_OP_DRAIN)
+ set_sendctrl |= SYM_MASK(SendCtrl_0, TxeBypassIbc) |
+ SYM_MASK(SendCtrl_0, TxeAbortIbc) |
+ SYM_MASK(SendCtrl_0, TxeDrainRmFifo);
+ else
+ clr_sendctrl |= SYM_MASK(SendCtrl_0, TxeBypassIbc) |
+ SYM_MASK(SendCtrl_0, TxeAbortIbc) |
+ SYM_MASK(SendCtrl_0, TxeDrainRmFifo);
+
+ spin_lock(&dd->sendctrl_lock);
+
+ /* If we are draining everything, block sends first */
+ if (op & QIB_SDMA_SENDCTRL_OP_DRAIN) {
+ ppd->p_sendctrl &= ~SYM_MASK(SendCtrl_0, SendEnable);
+ qib_write_kreg_port(ppd, krp_sendctrl, ppd->p_sendctrl);
+ qib_write_kreg(dd, kr_scratch, 0);
+ }
+
+ ppd->p_sendctrl |= set_sendctrl;
+ ppd->p_sendctrl &= ~clr_sendctrl;
+
+ if (op & QIB_SDMA_SENDCTRL_OP_CLEANUP)
+ qib_write_kreg_port(ppd, krp_sendctrl,
+ ppd->p_sendctrl |
+ SYM_MASK(SendCtrl_0, SDmaCleanup));
+ else
+ qib_write_kreg_port(ppd, krp_sendctrl, ppd->p_sendctrl);
+ qib_write_kreg(dd, kr_scratch, 0);
+
+ if (op & QIB_SDMA_SENDCTRL_OP_DRAIN) {
+ ppd->p_sendctrl |= SYM_MASK(SendCtrl_0, SendEnable);
+ qib_write_kreg_port(ppd, krp_sendctrl, ppd->p_sendctrl);
+ qib_write_kreg(dd, kr_scratch, 0);
+ }
+
+ spin_unlock(&dd->sendctrl_lock);
+
+ if ((op & QIB_SDMA_SENDCTRL_OP_DRAIN) && ppd->dd->cspec->r1)
+ flush_fifo(ppd);
+}
+
+static void qib_7322_sdma_hw_clean_up(struct qib_pportdata *ppd)
+{
+ __qib_sdma_process_event(ppd, qib_sdma_event_e50_hw_cleaned);
+}
+
+static void qib_sdma_7322_setlengen(struct qib_pportdata *ppd)
+{
+ /*
+ * Set SendDmaLenGen and clear and set
+ * the MSB of the generation count to enable generation checking
+ * and load the internal generation counter.
+ */
+ qib_write_kreg_port(ppd, krp_senddmalengen, ppd->sdma_descq_cnt);
+ qib_write_kreg_port(ppd, krp_senddmalengen,
+ ppd->sdma_descq_cnt |
+ (1ULL << QIB_7322_SendDmaLenGen_0_Generation_MSB));
+}
+
+/*
+ * Must be called with sdma_lock held, or before init finished.
+ */
+static void qib_sdma_update_7322_tail(struct qib_pportdata *ppd, u16 tail)
+{
+ /* Commit writes to memory and advance the tail on the chip */
+ wmb();
+ ppd->sdma_descq_tail = tail;
+ qib_write_kreg_port(ppd, krp_senddmatail, tail);
+}
+
+/*
+ * This is called with interrupts disabled and sdma_lock held.
+ */
+static void qib_7322_sdma_hw_start_up(struct qib_pportdata *ppd)
+{
+ /*
+ * Drain all FIFOs.
+ * The hardware doesn't require this but we do it so that verbs
+ * and user applications don't wait for link active to send stale
+ * data.
+ */
+ sendctrl_7322_mod(ppd, QIB_SENDCTRL_FLUSH);
+
+ qib_sdma_7322_setlengen(ppd);
+ qib_sdma_update_7322_tail(ppd, 0); /* Set SendDmaTail */
+ ppd->sdma_head_dma[0] = 0;
+ qib_7322_sdma_sendctrl(ppd,
+ ppd->sdma_state.current_op | QIB_SDMA_SENDCTRL_OP_CLEANUP);
+}
+
+#define DISABLES_SDMA ( \
+ QIB_E_P_SDMAHALT | \
+ QIB_E_P_SDMADESCADDRMISALIGN | \
+ QIB_E_P_SDMAMISSINGDW | \
+ QIB_E_P_SDMADWEN | \
+ QIB_E_P_SDMARPYTAG | \
+ QIB_E_P_SDMA1STDESC | \
+ QIB_E_P_SDMABASE | \
+ QIB_E_P_SDMATAILOUTOFBOUND | \
+ QIB_E_P_SDMAOUTOFBOUND | \
+ QIB_E_P_SDMAGENMISMATCH)
+
+static void sdma_7322_p_errors(struct qib_pportdata *ppd, u64 errs)
+{
+ unsigned long flags;
+ struct qib_devdata *dd = ppd->dd;
+
+ errs &= QIB_E_P_SDMAERRS;
+ err_decode(ppd->cpspec->sdmamsgbuf, sizeof(ppd->cpspec->sdmamsgbuf),
+ errs, qib_7322p_error_msgs);
+
+ if (errs & QIB_E_P_SDMAUNEXPDATA)
+ qib_dev_err(dd, "IB%u:%u SDmaUnexpData\n", dd->unit,
+ ppd->port);
+
+ spin_lock_irqsave(&ppd->sdma_lock, flags);
+
+ if (errs != QIB_E_P_SDMAHALT) {
+ /* SDMA errors have QIB_E_P_SDMAHALT and another bit set */
+ qib_dev_porterr(dd, ppd->port,
+ "SDMA %s 0x%016llx %s\n",
+ qib_sdma_state_names[ppd->sdma_state.current_state],
+ errs, ppd->cpspec->sdmamsgbuf);
+ dump_sdma_7322_state(ppd);
+ }
+
+ switch (ppd->sdma_state.current_state) {
+ case qib_sdma_state_s00_hw_down:
+ break;
+
+ case qib_sdma_state_s10_hw_start_up_wait:
+ if (errs & QIB_E_P_SDMAHALT)
+ __qib_sdma_process_event(ppd,
+ qib_sdma_event_e20_hw_started);
+ break;
+
+ case qib_sdma_state_s20_idle:
+ break;
+
+ case qib_sdma_state_s30_sw_clean_up_wait:
+ break;
+
+ case qib_sdma_state_s40_hw_clean_up_wait:
+ if (errs & QIB_E_P_SDMAHALT)
+ __qib_sdma_process_event(ppd,
+ qib_sdma_event_e50_hw_cleaned);
+ break;
+
+ case qib_sdma_state_s50_hw_halt_wait:
+ if (errs & QIB_E_P_SDMAHALT)
+ __qib_sdma_process_event(ppd,
+ qib_sdma_event_e60_hw_halted);
+ break;
+
+ case qib_sdma_state_s99_running:
+ __qib_sdma_process_event(ppd, qib_sdma_event_e7322_err_halted);
+ __qib_sdma_process_event(ppd, qib_sdma_event_e60_hw_halted);
+ break;
+ }
+
+ spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+}
+
+/*
+ * handle per-device errors (not per-port errors)
+ */
+static noinline void handle_7322_errors(struct qib_devdata *dd)
+{
+ char *msg;
+ u64 iserr = 0;
+ u64 errs;
+ u64 mask;
+ int log_idx;
+
+ qib_stats.sps_errints++;
+ errs = qib_read_kreg64(dd, kr_errstatus);
+ if (!errs) {
+ qib_devinfo(dd->pcidev,
+ "device error interrupt, but no error bits set!\n");
+ goto done;
+ }
+
+ /* don't report errors that are masked */
+ errs &= dd->cspec->errormask;
+ msg = dd->cspec->emsgbuf;
+
+ /* do these first, they are most important */
+ if (errs & QIB_E_HARDWARE) {
+ *msg = '\0';
+ qib_7322_handle_hwerrors(dd, msg, sizeof(dd->cspec->emsgbuf));
+ } else
+ for (log_idx = 0; log_idx < QIB_EEP_LOG_CNT; ++log_idx)
+ if (errs & dd->eep_st_masks[log_idx].errs_to_log)
+ qib_inc_eeprom_err(dd, log_idx, 1);
+
+ if (errs & QIB_E_SPKTERRS) {
+ qib_disarm_7322_senderrbufs(dd->pport);
+ qib_stats.sps_txerrs++;
+ } else if (errs & QIB_E_INVALIDADDR)
+ qib_stats.sps_txerrs++;
+ else if (errs & QIB_E_ARMLAUNCH) {
+ qib_stats.sps_txerrs++;
+ qib_disarm_7322_senderrbufs(dd->pport);
+ }
+ qib_write_kreg(dd, kr_errclear, errs);
+
+ /*
+ * The ones we mask off are handled specially below
+ * or above. Also mask SDMADISABLED by default as it
+ * is too chatty.
+ */
+ mask = QIB_E_HARDWARE;
+ *msg = '\0';
+
+ err_decode(msg, sizeof(dd->cspec->emsgbuf), errs & ~mask,
+ qib_7322error_msgs);
+
+ /*
+ * Getting reset is a tragedy for all ports. Mark the device
+ * _and_ the ports as "offline" in a way meaningful to each.
+ */
+ if (errs & QIB_E_RESET) {
+ int pidx;
+
+ qib_dev_err(dd,
+ "Got reset, requires re-init (unload and reload driver)\n");
+ dd->flags &= ~QIB_INITTED; /* needs re-init */
+ /* mark as having had error */
+ *dd->devstatusp |= QIB_STATUS_HWERROR;
+ for (pidx = 0; pidx < dd->num_pports; ++pidx)
+ if (dd->pport[pidx].link_speed_supported)
+ *dd->pport[pidx].statusp &= ~QIB_STATUS_IB_CONF;
+ }
+
+ if (*msg && iserr)
+ qib_dev_err(dd, "%s error\n", msg);
+
+ /*
+ * If there were hdrq or egrfull errors, wake up any processes
+ * waiting in poll. We used to try to check which contexts had
+ * the overflow, but given the cost of that and the chip reads
+ * to support it, it's better to just wake everybody up if we
+ * get an overflow; waiters can poll again if it's not them.
+ */
+ if (errs & (ERR_MASK(RcvEgrFullErr) | ERR_MASK(RcvHdrFullErr))) {
+ qib_handle_urcv(dd, ~0U);
+ if (errs & ERR_MASK(RcvEgrFullErr))
+ qib_stats.sps_buffull++;
+ else
+ qib_stats.sps_hdrfull++;
+ }
+
+done:
+ return;
+}
+
+static void qib_error_tasklet(unsigned long data)
+{
+ struct qib_devdata *dd = (struct qib_devdata *)data;
+
+ handle_7322_errors(dd);
+ qib_write_kreg(dd, kr_errmask, dd->cspec->errormask);
+}
+
+static void reenable_chase(unsigned long opaque)
+{
+ struct qib_pportdata *ppd = (struct qib_pportdata *)opaque;
+
+ ppd->cpspec->chase_timer.expires = 0;
+ qib_set_ib_7322_lstate(ppd, QLOGIC_IB_IBCC_LINKCMD_DOWN,
+ QLOGIC_IB_IBCC_LINKINITCMD_POLL);
+}
+
+static void disable_chase(struct qib_pportdata *ppd, unsigned long tnow,
+ u8 ibclt)
+{
+ ppd->cpspec->chase_end = 0;
+
+ if (!qib_chase)
+ return;
+
+ qib_set_ib_7322_lstate(ppd, QLOGIC_IB_IBCC_LINKCMD_DOWN,
+ QLOGIC_IB_IBCC_LINKINITCMD_DISABLE);
+ ppd->cpspec->chase_timer.expires = jiffies + QIB_CHASE_DIS_TIME;
+ add_timer(&ppd->cpspec->chase_timer);
+}
+
+static void handle_serdes_issues(struct qib_pportdata *ppd, u64 ibcst)
+{
+ u8 ibclt;
+ unsigned long tnow;
+
+ ibclt = (u8)SYM_FIELD(ibcst, IBCStatusA_0, LinkTrainingState);
+
+ /*
+ * Detect and handle the state chase issue, where we can
+ * get stuck if we are unlucky on timing on both sides of
+ * the link. If we are, we disable, set a timer, and
+ * then re-enable.
+ */
+ switch (ibclt) {
+ case IB_7322_LT_STATE_CFGRCVFCFG:
+ case IB_7322_LT_STATE_CFGWAITRMT:
+ case IB_7322_LT_STATE_TXREVLANES:
+ case IB_7322_LT_STATE_CFGENH:
+ tnow = jiffies;
+ if (ppd->cpspec->chase_end &&
+ time_after(tnow, ppd->cpspec->chase_end))
+ disable_chase(ppd, tnow, ibclt);
+ else if (!ppd->cpspec->chase_end)
+ ppd->cpspec->chase_end = tnow + QIB_CHASE_TIME;
+ break;
+ default:
+ ppd->cpspec->chase_end = 0;
+ break;
+ }
+
+ if (((ibclt >= IB_7322_LT_STATE_CFGTEST &&
+ ibclt <= IB_7322_LT_STATE_CFGWAITENH) ||
+ ibclt == IB_7322_LT_STATE_LINKUP) &&
+ (ibcst & SYM_MASK(IBCStatusA_0, LinkSpeedQDR))) {
+ force_h1(ppd);
+ ppd->cpspec->qdr_reforce = 1;
+ if (!ppd->dd->cspec->r1)
+ serdes_7322_los_enable(ppd, 0);
+ } else if (ppd->cpspec->qdr_reforce &&
+ (ibcst & SYM_MASK(IBCStatusA_0, LinkSpeedQDR)) &&
+ (ibclt == IB_7322_LT_STATE_CFGENH ||
+ ibclt == IB_7322_LT_STATE_CFGIDLE ||
+ ibclt == IB_7322_LT_STATE_LINKUP))
+ force_h1(ppd);
+
+ if ((IS_QMH(ppd->dd) || IS_QME(ppd->dd)) &&
+ ppd->link_speed_enabled == QIB_IB_QDR &&
+ (ibclt == IB_7322_LT_STATE_CFGTEST ||
+ ibclt == IB_7322_LT_STATE_CFGENH ||
+ (ibclt >= IB_7322_LT_STATE_POLLACTIVE &&
+ ibclt <= IB_7322_LT_STATE_SLEEPQUIET)))
+ adj_tx_serdes(ppd);
+
+ if (ibclt != IB_7322_LT_STATE_LINKUP) {
+ u8 ltstate = qib_7322_phys_portstate(ibcst);
+ u8 pibclt = (u8)SYM_FIELD(ppd->lastibcstat, IBCStatusA_0,
+ LinkTrainingState);
+ if (!ppd->dd->cspec->r1 &&
+ pibclt == IB_7322_LT_STATE_LINKUP &&
+ ltstate != IB_PHYSPORTSTATE_LINK_ERR_RECOVER &&
+ ltstate != IB_PHYSPORTSTATE_RECOVERY_RETRAIN &&
+ ltstate != IB_PHYSPORTSTATE_RECOVERY_WAITRMT &&
+ ltstate != IB_PHYSPORTSTATE_RECOVERY_IDLE)
+			/* If the link went down (but not into recovery),
+			 * turn LOS back on */
+ serdes_7322_los_enable(ppd, 1);
+ if (!ppd->cpspec->qdr_dfe_on &&
+ ibclt <= IB_7322_LT_STATE_SLEEPQUIET) {
+ ppd->cpspec->qdr_dfe_on = 1;
+ ppd->cpspec->qdr_dfe_time = 0;
+ /* On link down, reenable QDR adaptation */
+ qib_write_kreg_port(ppd, krp_static_adapt_dis(2),
+ ppd->dd->cspec->r1 ?
+ QDR_STATIC_ADAPT_DOWN_R1 :
+ QDR_STATIC_ADAPT_DOWN);
+ pr_info(
+ "IB%u:%u re-enabled QDR adaptation ibclt %x\n",
+ ppd->dd->unit, ppd->port, ibclt);
+ }
+ }
+}
+
+static int qib_7322_set_ib_cfg(struct qib_pportdata *, int, u32);
+
+/*
+ * This is per-pport error handling.
+ * It will likely get its own MSIx interrupt (one for each port,
+ * although just a single handler).
+ */
+static noinline void handle_7322_p_errors(struct qib_pportdata *ppd)
+{
+ char *msg;
+ u64 ignore_this_time = 0, iserr = 0, errs, fmask;
+ struct qib_devdata *dd = ppd->dd;
+
+ /* do this as soon as possible */
+ fmask = qib_read_kreg64(dd, kr_act_fmask);
+ if (!fmask)
+ check_7322_rxe_status(ppd);
+
+ errs = qib_read_kreg_port(ppd, krp_errstatus);
+ if (!errs)
+ qib_devinfo(dd->pcidev,
+ "Port%d error interrupt, but no error bits set!\n",
+ ppd->port);
+ if (!fmask)
+ errs &= ~QIB_E_P_IBSTATUSCHANGED;
+ if (!errs)
+ goto done;
+
+ msg = ppd->cpspec->epmsgbuf;
+ *msg = '\0';
+
+ if (errs & ~QIB_E_P_BITSEXTANT) {
+ err_decode(msg, sizeof(ppd->cpspec->epmsgbuf),
+ errs & ~QIB_E_P_BITSEXTANT, qib_7322p_error_msgs);
+ if (!*msg)
+ snprintf(msg, sizeof(ppd->cpspec->epmsgbuf),
+ "no others");
+ qib_dev_porterr(dd, ppd->port,
+ "error interrupt with unknown errors 0x%016Lx set (and %s)\n",
+ (errs & ~QIB_E_P_BITSEXTANT), msg);
+ *msg = '\0';
+ }
+
+ if (errs & QIB_E_P_SHDR) {
+ u64 symptom;
+
+ /* determine cause, then write to clear */
+ symptom = qib_read_kreg_port(ppd, krp_sendhdrsymptom);
+ qib_write_kreg_port(ppd, krp_sendhdrsymptom, 0);
+ err_decode(msg, sizeof(ppd->cpspec->epmsgbuf), symptom,
+ hdrchk_msgs);
+ *msg = '\0';
+ /* senderrbuf cleared in SPKTERRS below */
+ }
+
+ if (errs & QIB_E_P_SPKTERRS) {
+ if ((errs & QIB_E_P_LINK_PKTERRS) &&
+ !(ppd->lflags & QIBL_LINKACTIVE)) {
+ /*
+ * This can happen when trying to bring the link
+ * up, but the IB link changes state at the "wrong"
+ * time. The IB logic then complains that the packet
+ * isn't valid. We don't want to confuse people, so
+ * we just don't print them, except at debug
+ */
+ err_decode(msg, sizeof(ppd->cpspec->epmsgbuf),
+ (errs & QIB_E_P_LINK_PKTERRS),
+ qib_7322p_error_msgs);
+ *msg = '\0';
+ ignore_this_time = errs & QIB_E_P_LINK_PKTERRS;
+ }
+ qib_disarm_7322_senderrbufs(ppd);
+ } else if ((errs & QIB_E_P_LINK_PKTERRS) &&
+ !(ppd->lflags & QIBL_LINKACTIVE)) {
+ /*
+ * This can happen when SMA is trying to bring the link
+ * up, but the IB link changes state at the "wrong" time.
+ * The IB logic then complains that the packet isn't
+ * valid. We don't want to confuse people, so we just
+ * don't print them, except at debug
+ */
+ err_decode(msg, sizeof(ppd->cpspec->epmsgbuf), errs,
+ qib_7322p_error_msgs);
+ ignore_this_time = errs & QIB_E_P_LINK_PKTERRS;
+ *msg = '\0';
+ }
+
+ qib_write_kreg_port(ppd, krp_errclear, errs);
+
+ errs &= ~ignore_this_time;
+ if (!errs)
+ goto done;
+
+ if (errs & QIB_E_P_RPKTERRS)
+ qib_stats.sps_rcverrs++;
+ if (errs & QIB_E_P_SPKTERRS)
+ qib_stats.sps_txerrs++;
+
+ iserr = errs & ~(QIB_E_P_RPKTERRS | QIB_E_P_PKTERRS);
+
+ if (errs & QIB_E_P_SDMAERRS)
+ sdma_7322_p_errors(ppd, errs);
+
+ if (errs & QIB_E_P_IBSTATUSCHANGED) {
+ u64 ibcs;
+ u8 ltstate;
+
+ ibcs = qib_read_kreg_port(ppd, krp_ibcstatus_a);
+ ltstate = qib_7322_phys_portstate(ibcs);
+
+ if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG))
+ handle_serdes_issues(ppd, ibcs);
+ if (!(ppd->cpspec->ibcctrl_a &
+ SYM_MASK(IBCCtrlA_0, IBStatIntReductionEn))) {
+ /*
+ * We got our interrupt, so init code should be
+ * happy and not try alternatives. Now squelch
+ * other "chatter" from link-negotiation (pre Init)
+ */
+ ppd->cpspec->ibcctrl_a |=
+ SYM_MASK(IBCCtrlA_0, IBStatIntReductionEn);
+ qib_write_kreg_port(ppd, krp_ibcctrl_a,
+ ppd->cpspec->ibcctrl_a);
+ }
+
+ /* Update our picture of width and speed from chip */
+ ppd->link_width_active =
+ (ibcs & SYM_MASK(IBCStatusA_0, LinkWidthActive)) ?
+ IB_WIDTH_4X : IB_WIDTH_1X;
+ ppd->link_speed_active = (ibcs & SYM_MASK(IBCStatusA_0,
+ LinkSpeedQDR)) ? QIB_IB_QDR : (ibcs &
+ SYM_MASK(IBCStatusA_0, LinkSpeedActive)) ?
+ QIB_IB_DDR : QIB_IB_SDR;
+
+ if ((ppd->lflags & QIBL_IB_LINK_DISABLED) && ltstate !=
+ IB_PHYSPORTSTATE_DISABLED)
+ qib_set_ib_7322_lstate(ppd, 0,
+ QLOGIC_IB_IBCC_LINKINITCMD_DISABLE);
+ else
+ /*
+ * Since going into a recovery state causes the link
+ * state to go down and since recovery is transitory,
+ * it is better if we "miss" ever seeing the link
+ * training state go into recovery (i.e., ignore this
+ * transition for link state special handling purposes)
+ * without updating lastibcstat.
+ */
+ if (ltstate != IB_PHYSPORTSTATE_LINK_ERR_RECOVER &&
+ ltstate != IB_PHYSPORTSTATE_RECOVERY_RETRAIN &&
+ ltstate != IB_PHYSPORTSTATE_RECOVERY_WAITRMT &&
+ ltstate != IB_PHYSPORTSTATE_RECOVERY_IDLE)
+ qib_handle_e_ibstatuschanged(ppd, ibcs);
+ }
+ if (*msg && iserr)
+ qib_dev_porterr(dd, ppd->port, "%s error\n", msg);
+
+ if (ppd->state_wanted & ppd->lflags)
+ wake_up_interruptible(&ppd->state_wait);
+done:
+ return;
+}
+
+/* enable/disable chip from delivering interrupts */
+static void qib_7322_set_intr_state(struct qib_devdata *dd, u32 enable)
+{
+ if (enable) {
+ if (dd->flags & QIB_BADINTR)
+ return;
+ qib_write_kreg(dd, kr_intmask, dd->cspec->int_enable_mask);
+ /* cause any pending enabled interrupts to be re-delivered */
+ qib_write_kreg(dd, kr_intclear, 0ULL);
+ if (dd->cspec->num_msix_entries) {
+ /* and same for MSIx */
+ u64 val = qib_read_kreg64(dd, kr_intgranted);
+
+ if (val)
+ qib_write_kreg(dd, kr_intgranted, val);
+ }
+ } else
+ qib_write_kreg(dd, kr_intmask, 0ULL);
+}
+
+/*
+ * Try to cleanup as much as possible for anything that might have gone
+ * wrong while in freeze mode, such as pio buffers being written by user
+ * processes (causing armlaunch), send errors due to going into freeze mode,
+ * etc., and try to avoid causing extra interrupts while doing so.
+ * Forcibly update the in-memory pioavail register copies after cleanup
+ * because the chip won't do it while in freeze mode (the register values
+ * themselves are kept correct).
+ * Make sure that we don't lose any important interrupts by using the chip
+ * feature that says that writing 0 to a bit in *clear that is set in
+ * *status will cause an interrupt to be generated again (if allowed by
+ * the *mask value).
+ * This is in chip-specific code because of all of the register accesses,
+ * even though the details are similar on most chips.
+ */
+static void qib_7322_clear_freeze(struct qib_devdata *dd)
+{
+ int pidx;
+
+ /* disable error interrupts, to avoid confusion */
+ qib_write_kreg(dd, kr_errmask, 0ULL);
+
+ for (pidx = 0; pidx < dd->num_pports; ++pidx)
+ if (dd->pport[pidx].link_speed_supported)
+ qib_write_kreg_port(dd->pport + pidx, krp_errmask,
+ 0ULL);
+
+	/* also disable interrupts; errormask is sometimes overwritten */
+ qib_7322_set_intr_state(dd, 0);
+
+ /* clear the freeze, and be sure chip saw it */
+ qib_write_kreg(dd, kr_control, dd->control);
+ qib_read_kreg32(dd, kr_scratch);
+
+ /*
+ * Force new interrupt if any hwerr, error or interrupt bits are
+ * still set, and clear "safe" send packet errors related to freeze
+ * and cancelling sends. Re-enable error interrupts before possible
+ * force of re-interrupt on pending interrupts.
+ */
+ qib_write_kreg(dd, kr_hwerrclear, 0ULL);
+ qib_write_kreg(dd, kr_errclear, E_SPKT_ERRS_IGNORE);
+ qib_write_kreg(dd, kr_errmask, dd->cspec->errormask);
+ /* We need to purge per-port errs and reset mask, too */
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ if (!dd->pport[pidx].link_speed_supported)
+ continue;
+ qib_write_kreg_port(dd->pport + pidx, krp_errclear, ~0Ull);
+ qib_write_kreg_port(dd->pport + pidx, krp_errmask, ~0Ull);
+ }
+ qib_7322_set_intr_state(dd, 1);
+}
+
+/* no error handling to speak of */
+/**
+ * qib_7322_handle_hwerrors - display hardware errors.
+ * @dd: the qlogic_ib device
+ * @msg: the output buffer
+ * @msgl: the size of the output buffer
+ *
+ * Most hardware errors are catastrophic, but for right now,
+ * we'll print them and continue. We reuse the same message buffer as
+ * qib_handle_errors() to avoid excessive stack usage.
+ */
+static void qib_7322_handle_hwerrors(struct qib_devdata *dd, char *msg,
+ size_t msgl)
+{
+ u64 hwerrs;
+ u32 ctrl;
+ int isfatal = 0;
+
+ hwerrs = qib_read_kreg64(dd, kr_hwerrstatus);
+ if (!hwerrs)
+ goto bail;
+ if (hwerrs == ~0ULL) {
+ qib_dev_err(dd,
+ "Read of hardware error status failed (all bits set); ignoring\n");
+ goto bail;
+ }
+ qib_stats.sps_hwerrs++;
+
+ /* Always clear the error status register, except BIST fail */
+ qib_write_kreg(dd, kr_hwerrclear, hwerrs &
+ ~HWE_MASK(PowerOnBISTFailed));
+
+ hwerrs &= dd->cspec->hwerrmask;
+
+ /* no EEPROM logging, yet */
+
+ if (hwerrs)
+ qib_devinfo(dd->pcidev,
+ "Hardware error: hwerr=0x%llx (cleared)\n",
+ (unsigned long long) hwerrs);
+
+ ctrl = qib_read_kreg32(dd, kr_control);
+ if ((ctrl & SYM_MASK(Control, FreezeMode)) && !dd->diag_client) {
+ /*
+ * No recovery yet...
+ */
+ if ((hwerrs & ~HWE_MASK(LATriggered)) ||
+ dd->cspec->stay_in_freeze) {
+			/*
+			 * If any bits we aren't ignoring are set, only make
+			 * the complaint once, in case it's stuck or recurring,
+			 * and we get here multiple times.
+			 * Force the link down, so the switch knows, and the
+			 * LEDs are turned off.
+			 */
+ if (dd->flags & QIB_INITTED)
+ isfatal = 1;
+ } else
+ qib_7322_clear_freeze(dd);
+ }
+
+ if (hwerrs & HWE_MASK(PowerOnBISTFailed)) {
+ isfatal = 1;
+ strlcpy(msg,
+ "[Memory BIST test failed, InfiniPath hardware unusable]",
+ msgl);
+ /* ignore from now on, so disable until driver reloaded */
+ dd->cspec->hwerrmask &= ~HWE_MASK(PowerOnBISTFailed);
+ qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask);
+ }
+
+ err_decode(msg, msgl, hwerrs, qib_7322_hwerror_msgs);
+
+ /* Ignore esoteric PLL failures et al. */
+
+ qib_dev_err(dd, "%s hardware error\n", msg);
+
+ if (hwerrs &
+ (SYM_MASK(HwErrMask, SDmaMemReadErrMask_0) |
+ SYM_MASK(HwErrMask, SDmaMemReadErrMask_1))) {
+ int pidx = 0;
+ int err;
+ unsigned long flags;
+ struct qib_pportdata *ppd = dd->pport;
+
+ for (; pidx < dd->num_pports; ++pidx, ppd++) {
+ err = 0;
+ if (pidx == 0 && (hwerrs &
+ SYM_MASK(HwErrMask, SDmaMemReadErrMask_0)))
+ err++;
+ if (pidx == 1 && (hwerrs &
+ SYM_MASK(HwErrMask, SDmaMemReadErrMask_1)))
+ err++;
+ if (err) {
+ spin_lock_irqsave(&ppd->sdma_lock, flags);
+ dump_sdma_7322_state(ppd);
+ spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+ }
+ }
+ }
+
+ if (isfatal && !dd->diag_client) {
+ qib_dev_err(dd,
+ "Fatal Hardware Error, no longer usable, SN %.16s\n",
+ dd->serial);
+ /*
+ * for /sys status file and user programs to print; if no
+ * trailing brace is copied, we'll know it was truncated.
+ */
+ if (dd->freezemsg)
+ snprintf(dd->freezemsg, dd->freezelen,
+ "{%s}", msg);
+ qib_disable_after_error(dd);
+ }
+bail:;
+}
+
+/**
+ * qib_7322_init_hwerrors - enable hardware errors
+ * @dd: the qlogic_ib device
+ *
+ * now that we have finished initializing everything that might reasonably
+ * cause a hardware error, and cleared those error bits as they occur,
+ * we can enable hardware errors in the mask (potentially enabling
+ * freeze mode), and enable hardware errors as errors (along with
+ * everything else) in errormask
+ */
+static void qib_7322_init_hwerrors(struct qib_devdata *dd)
+{
+ int pidx;
+ u64 extsval;
+
+ extsval = qib_read_kreg64(dd, kr_extstatus);
+ if (!(extsval & (QIB_EXTS_MEMBIST_DISABLED |
+ QIB_EXTS_MEMBIST_ENDTEST)))
+ qib_dev_err(dd, "MemBIST did not complete!\n");
+
+ /* never clear BIST failure, so reported on each driver load */
+ qib_write_kreg(dd, kr_hwerrclear, ~HWE_MASK(PowerOnBISTFailed));
+ qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask);
+
+ /* clear all */
+ qib_write_kreg(dd, kr_errclear, ~0ULL);
+ /* enable errors that are masked, at least this first time. */
+ qib_write_kreg(dd, kr_errmask, ~0ULL);
+ dd->cspec->errormask = qib_read_kreg64(dd, kr_errmask);
+ for (pidx = 0; pidx < dd->num_pports; ++pidx)
+ if (dd->pport[pidx].link_speed_supported)
+ qib_write_kreg_port(dd->pport + pidx, krp_errmask,
+ ~0ULL);
+}
+
+/*
+ * Disable and enable the armlaunch error. Used for PIO bandwidth testing
+ * on chips that are count-based, rather than trigger-based. There is no
+ * reference counting, but that's also fine, given the intended use.
+ * Only chip-specific because it's all register accesses
+ */
+static void qib_set_7322_armlaunch(struct qib_devdata *dd, u32 enable)
+{
+ if (enable) {
+ qib_write_kreg(dd, kr_errclear, QIB_E_SPIOARMLAUNCH);
+ dd->cspec->errormask |= QIB_E_SPIOARMLAUNCH;
+ } else
+ dd->cspec->errormask &= ~QIB_E_SPIOARMLAUNCH;
+ qib_write_kreg(dd, kr_errmask, dd->cspec->errormask);
+}
+
+/*
+ * Formerly took parameter <which> in pre-shifted,
+ * pre-merged form with LinkCmd and LinkInitCmd
+ * together, and assuming the zero was NOP.
+ */
+static void qib_set_ib_7322_lstate(struct qib_pportdata *ppd, u16 linkcmd,
+ u16 linitcmd)
+{
+ u64 mod_wd;
+ struct qib_devdata *dd = ppd->dd;
+ unsigned long flags;
+
+ if (linitcmd == QLOGIC_IB_IBCC_LINKINITCMD_DISABLE) {
+ /*
+ * If we are told to disable, note that so link-recovery
+ * code does not attempt to bring us back up.
+ * Also reset everything that we can, so we start
+ * completely clean when re-enabled (before we
+ * actually issue the disable to the IBC)
+ */
+ qib_7322_mini_pcs_reset(ppd);
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags |= QIBL_IB_LINK_DISABLED;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ } else if (linitcmd || linkcmd == QLOGIC_IB_IBCC_LINKCMD_DOWN) {
+ /*
+ * Any other linkinitcmd will lead to LINKDOWN and then
+ * to INIT (if all is well), so clear flag to let
+ * link-recovery code attempt to bring us back up.
+ */
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags &= ~QIBL_IB_LINK_DISABLED;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ /*
+ * Clear status change interrupt reduction so the
+ * new state is seen.
+ */
+ ppd->cpspec->ibcctrl_a &=
+ ~SYM_MASK(IBCCtrlA_0, IBStatIntReductionEn);
+ }
+
+ mod_wd = (linkcmd << IBA7322_IBCC_LINKCMD_SHIFT) |
+ (linitcmd << QLOGIC_IB_IBCC_LINKINITCMD_SHIFT);
+
+ qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a |
+ mod_wd);
+ /* write to chip to prevent back-to-back writes of ibc reg */
+ qib_write_kreg(dd, kr_scratch, 0);
+
+}
+
+/*
+ * The total RCV buffer memory is 64KB, used for both ports, and is
+ * in units of 64 bytes (same as IB flow control credit unit).
+ * The consumedVL unit in the same registers are in 32 byte units!
+ * So, a VL15 packet needs 4.50 IB credits, and 9 rx buffer chunks,
+ * and we can therefore allocate just 9 IB credits for 2 VL15 packets
+ * in krp_rxcreditvl15, rather than 10.
+ */
+#define RCV_BUF_UNITSZ 64
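+/* each port's share of the 64KB receive buffer, in 64-byte units */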
+#define NUM_RCV_BUF_UNITS(dd) ((64 * 1024) / (RCV_BUF_UNITSZ * dd->num_pports))
+
+static void set_vls(struct qib_pportdata *ppd)
+{
+ int i, numvls, totcred, cred_vl, vl0extra;
+ struct qib_devdata *dd = ppd->dd;
+ u64 val;
+
+ numvls = qib_num_vls(ppd->vls_operational);
+
+ /*
+	 * Set up per-VL credits. Below is a kluge based on these assumptions:
+ * 1) port is disabled at the time early_init is called.
+ * 2) give VL15 17 credits, for two max-plausible packets.
+ * 3) Give VL0-N the rest, with any rounding excess used for VL0
+ */
+ /* 2 VL15 packets @ 288 bytes each (including IB headers) */
+ totcred = NUM_RCV_BUF_UNITS(dd);
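+	/* 2 * 288 bytes rounded up to 64-byte units: 9 units for VL15 */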
+ cred_vl = (2 * 288 + RCV_BUF_UNITSZ - 1) / RCV_BUF_UNITSZ;
+ totcred -= cred_vl;
+ qib_write_kreg_port(ppd, krp_rxcreditvl15, (u64) cred_vl);
+ cred_vl = totcred / numvls;
+ vl0extra = totcred - cred_vl * numvls;
+ qib_write_kreg_port(ppd, krp_rxcreditvl0, cred_vl + vl0extra);
+ for (i = 1; i < numvls; i++)
+ qib_write_kreg_port(ppd, krp_rxcreditvl0 + i, cred_vl);
+ for (; i < 8; i++) /* no buffer space for other VLs */
+ qib_write_kreg_port(ppd, krp_rxcreditvl0 + i, 0);
+
+ /* Notify IBC that credits need to be recalculated */
+ val = qib_read_kreg_port(ppd, krp_ibsdtestiftx);
+ val |= SYM_MASK(IB_SDTEST_IF_TX_0, CREDIT_CHANGE);
+ qib_write_kreg_port(ppd, krp_ibsdtestiftx, val);
+ qib_write_kreg(dd, kr_scratch, 0ULL);
+ val &= ~SYM_MASK(IB_SDTEST_IF_TX_0, CREDIT_CHANGE);
+ qib_write_kreg_port(ppd, krp_ibsdtestiftx, val);
+
+ for (i = 0; i < numvls; i++)
+ val = qib_read_kreg_port(ppd, krp_rxcreditvl0 + i);
+ val = qib_read_kreg_port(ppd, krp_rxcreditvl15);
+
+ /* Change the number of operational VLs */
+ ppd->cpspec->ibcctrl_a = (ppd->cpspec->ibcctrl_a &
+ ~SYM_MASK(IBCCtrlA_0, NumVLane)) |
+ ((u64)(numvls - 1) << SYM_LSB(IBCCtrlA_0, NumVLane));
+ qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a);
+ qib_write_kreg(dd, kr_scratch, 0ULL);
+}
+
+/*
+ * The code that deals with actual SerDes is in serdes_7322_init().
+ * Compared to the code for iba7220, it is minimal.
+ */
+static int serdes_7322_init(struct qib_pportdata *ppd);
+
+/**
+ * qib_7322_bringup_serdes - bring up the serdes
+ * @ppd: physical port on the qlogic_ib device
+ */
+static int qib_7322_bringup_serdes(struct qib_pportdata *ppd)
+{
+ struct qib_devdata *dd = ppd->dd;
+ u64 val, guid, ibc;
+ unsigned long flags;
+ int ret = 0;
+
+ /*
+ * SerDes model not in Pd, but still need to
+ * set up much of IBCCtrl and IBCDDRCtrl; move elsewhere
+ * eventually.
+ */
+ /* Put IBC in reset, sends disabled (should be in reset already) */
+ ppd->cpspec->ibcctrl_a &= ~SYM_MASK(IBCCtrlA_0, IBLinkEn);
+ qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a);
+ qib_write_kreg(dd, kr_scratch, 0ULL);
+
+ /* ensure previous Tx parameters are not still forced */
+ qib_write_kreg_port(ppd, krp_tx_deemph_override,
+ SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0,
+ reset_tx_deemphasis_override));
+
+ if (qib_compat_ddr_negotiate) {
+ ppd->cpspec->ibdeltainprog = 1;
+ ppd->cpspec->ibsymsnap = read_7322_creg32_port(ppd,
+ crp_ibsymbolerr);
+ ppd->cpspec->iblnkerrsnap = read_7322_creg32_port(ppd,
+ crp_iblinkerrrecov);
+ }
+
+ /* flowcontrolwatermark is in units of KBytes */
+ ibc = 0x5ULL << SYM_LSB(IBCCtrlA_0, FlowCtrlWaterMark);
+ /*
+ * Flow control is sent this often, even if no changes in
+ * buffer space occur. Units are 128ns for this chip.
+ * Set to 3 usec (24 * 128 ns = 3.072 us).
+ */
+ ibc |= 24ULL << SYM_LSB(IBCCtrlA_0, FlowCtrlPeriod);
+ /* max error tolerance */
+ ibc |= 0xfULL << SYM_LSB(IBCCtrlA_0, PhyerrThreshold);
+ /* IB credit flow control. */
+ ibc |= 0xfULL << SYM_LSB(IBCCtrlA_0, OverrunThreshold);
+ /*
+ * set initial max size pkt IBC will send, including ICRC; it's the
+ * PIO buffer size in dwords, less 1; also see qib_set_mtu()
+ */
+ ibc |= ((u64)(ppd->ibmaxlen >> 2) + 1) <<
+ SYM_LSB(IBCCtrlA_0, MaxPktLen);
+ ppd->cpspec->ibcctrl_a = ibc; /* without linkcmd or linkinitcmd! */
+
+ /*
+ * Reset the PCS interface to the serdes (and also ibc, which is still
+ * in reset from above). Writes new value of ibcctrl_a as last step.
+ */
+ qib_7322_mini_pcs_reset(ppd);
+
+ if (!ppd->cpspec->ibcctrl_b) {
+ unsigned lse = ppd->link_speed_enabled;
+
+ /*
+ * Not on re-init after reset, establish shadow
+ * and force initial config.
+ */
+ ppd->cpspec->ibcctrl_b = qib_read_kreg_port(ppd,
+ krp_ibcctrl_b);
+ ppd->cpspec->ibcctrl_b &= ~(IBA7322_IBC_SPEED_QDR |
+ IBA7322_IBC_SPEED_DDR |
+ IBA7322_IBC_SPEED_SDR |
+ IBA7322_IBC_WIDTH_AUTONEG |
+ SYM_MASK(IBCCtrlB_0, IB_LANE_REV_SUPPORTED));
+ if (lse & (lse - 1)) /* Multiple speeds enabled */
+ ppd->cpspec->ibcctrl_b |=
+ (lse << IBA7322_IBC_SPEED_LSB) |
+ IBA7322_IBC_IBTA_1_2_MASK |
+ IBA7322_IBC_MAX_SPEED_MASK;
+ else
+ ppd->cpspec->ibcctrl_b |= (lse == QIB_IB_QDR) ?
+ IBA7322_IBC_SPEED_QDR |
+ IBA7322_IBC_IBTA_1_2_MASK :
+ (lse == QIB_IB_DDR) ?
+ IBA7322_IBC_SPEED_DDR :
+ IBA7322_IBC_SPEED_SDR;
+ if ((ppd->link_width_enabled & (IB_WIDTH_1X | IB_WIDTH_4X)) ==
+ (IB_WIDTH_1X | IB_WIDTH_4X))
+ ppd->cpspec->ibcctrl_b |= IBA7322_IBC_WIDTH_AUTONEG;
+ else
+ ppd->cpspec->ibcctrl_b |=
+ ppd->link_width_enabled == IB_WIDTH_4X ?
+ IBA7322_IBC_WIDTH_4X_ONLY :
+ IBA7322_IBC_WIDTH_1X_ONLY;
+
+ /* always enable these on driver reload, not sticky */
+ ppd->cpspec->ibcctrl_b |= (IBA7322_IBC_RXPOL_MASK |
+ IBA7322_IBC_HRTBT_MASK);
+ }
+ qib_write_kreg_port(ppd, krp_ibcctrl_b, ppd->cpspec->ibcctrl_b);
+
+ /* setup so we have more time at CFGTEST to change H1 */
+ val = qib_read_kreg_port(ppd, krp_ibcctrl_c);
+ val &= ~SYM_MASK(IBCCtrlC_0, IB_FRONT_PORCH);
+ val |= 0xfULL << SYM_LSB(IBCCtrlC_0, IB_FRONT_PORCH);
+ qib_write_kreg_port(ppd, krp_ibcctrl_c, val);
+
+ serdes_7322_init(ppd);
+
+ guid = be64_to_cpu(ppd->guid);
+ if (!guid) {
+ if (dd->base_guid)
+ guid = be64_to_cpu(dd->base_guid) + ppd->port - 1;
+ ppd->guid = cpu_to_be64(guid);
+ }
+
+ qib_write_kreg_port(ppd, krp_hrtbt_guid, guid);
+ /* write to chip to prevent back-to-back writes of ibc reg */
+ qib_write_kreg(dd, kr_scratch, 0);
+
+ /* Enable port */
+ ppd->cpspec->ibcctrl_a |= SYM_MASK(IBCCtrlA_0, IBLinkEn);
+ set_vls(ppd);
+
+ /* initially come up DISABLED, without sending anything. */
+ val = ppd->cpspec->ibcctrl_a | (QLOGIC_IB_IBCC_LINKINITCMD_DISABLE <<
+ QLOGIC_IB_IBCC_LINKINITCMD_SHIFT);
+ qib_write_kreg_port(ppd, krp_ibcctrl_a, val);
+ qib_write_kreg(dd, kr_scratch, 0ULL);
+ /* clear the linkinit cmds */
+ ppd->cpspec->ibcctrl_a = val & ~SYM_MASK(IBCCtrlA_0, LinkInitCmd);
+
+ /* be paranoid against later code motion, etc. */
+ spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags);
+ ppd->p_rcvctrl |= SYM_MASK(RcvCtrl_0, RcvIBPortEnable);
+ qib_write_kreg_port(ppd, krp_rcvctrl, ppd->p_rcvctrl);
+ spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags);
+
+ /* Also enable IBSTATUSCHG interrupt. */
+ val = qib_read_kreg_port(ppd, krp_errmask);
+ qib_write_kreg_port(ppd, krp_errmask,
+ val | ERR_MASK_N(IBStatusChanged));
+
+ /* Always zero until we start messing with SerDes for real */
+ return ret;
+}
+
+/**
+ * qib_7322_mini_quiet_serdes - set serdes to txidle
+ * @ppd: physical port on the qlogic_ib device
+ * Called when driver is being unloaded
+ */
+static void qib_7322_mini_quiet_serdes(struct qib_pportdata *ppd)
+{
+ u64 val;
+ unsigned long flags;
+
+ qib_set_ib_7322_lstate(ppd, 0, QLOGIC_IB_IBCC_LINKINITCMD_DISABLE);
+
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags &= ~QIBL_IB_AUTONEG_INPROG;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ wake_up(&ppd->cpspec->autoneg_wait);
+ cancel_delayed_work_sync(&ppd->cpspec->autoneg_work);
+ if (ppd->dd->cspec->r1)
+ cancel_delayed_work_sync(&ppd->cpspec->ipg_work);
+
+ ppd->cpspec->chase_end = 0;
+ if (ppd->cpspec->chase_timer.data) /* if initted */
+ del_timer_sync(&ppd->cpspec->chase_timer);
+
+ /*
+ * Despite the name, actually disables IBC as well. Do it when
+ * we are as sure as possible that no more packets can be
+ * received, following the down and the PCS reset.
+ * The actual disabling happens in qib_7322_mini_pcs_reset(),
+ * along with the PCS being reset.
+ */
+ ppd->cpspec->ibcctrl_a &= ~SYM_MASK(IBCCtrlA_0, IBLinkEn);
+ qib_7322_mini_pcs_reset(ppd);
+
+ /*
+ * Update the adjusted counters so the adjustment persists
+ * across driver reload.
+ */
+ if (ppd->cpspec->ibsymdelta || ppd->cpspec->iblnkerrdelta ||
+ ppd->cpspec->ibdeltainprog || ppd->cpspec->iblnkdowndelta) {
+ struct qib_devdata *dd = ppd->dd;
+ u64 diagc;
+
+ /* enable counter writes */
+ diagc = qib_read_kreg64(dd, kr_hwdiagctrl);
+ qib_write_kreg(dd, kr_hwdiagctrl,
+ diagc | SYM_MASK(HwDiagCtrl, CounterWrEnable));
+
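+ /*
+ * Illustration of the adjustment: if crp_ibsymbolerr currently
+ * reads 100 (or read 100 at snapshot time, if a negotiation is
+ * still in progress) and 7 symbol errors have been attributed to
+ * link-speed negotiation (ibsymdelta), then 93 is written back, so
+ * those errors are not reported as real link errors after a reload.
+ */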
+ if (ppd->cpspec->ibsymdelta || ppd->cpspec->ibdeltainprog) {
+ val = read_7322_creg32_port(ppd, crp_ibsymbolerr);
+ if (ppd->cpspec->ibdeltainprog)
+ val -= val - ppd->cpspec->ibsymsnap;
+ val -= ppd->cpspec->ibsymdelta;
+ write_7322_creg_port(ppd, crp_ibsymbolerr, val);
+ }
+ if (ppd->cpspec->iblnkerrdelta || ppd->cpspec->ibdeltainprog) {
+ val = read_7322_creg32_port(ppd, crp_iblinkerrrecov);
+ if (ppd->cpspec->ibdeltainprog)
+ val -= val - ppd->cpspec->iblnkerrsnap;
+ val -= ppd->cpspec->iblnkerrdelta;
+ write_7322_creg_port(ppd, crp_iblinkerrrecov, val);
+ }
+ if (ppd->cpspec->iblnkdowndelta) {
+ val = read_7322_creg32_port(ppd, crp_iblinkdown);
+ val += ppd->cpspec->iblnkdowndelta;
+ write_7322_creg_port(ppd, crp_iblinkdown, val);
+ }
+ /*
+ * No need to save ibmalfdelta since IB perfcounters
+ * are cleared on driver reload.
+ */
+
+ /* and disable counter writes */
+ qib_write_kreg(dd, kr_hwdiagctrl, diagc);
+ }
+}
+
+/**
+ * qib_setup_7322_setextled - set the state of the two external LEDs
+ * @ppd: physical port on the qlogic_ib device
+ * @on: whether the link is up or not
+ *
+ * The exact combination of LEDs lit when @on is true is determined
+ * by looking at the ibcstatus.
+ *
+ * These LEDs indicate the physical and logical state of IB link.
+ * For this chip (at least with recommended board pinouts), LED1
+ * is Yellow (logical state) and LED2 is Green (physical state),
+ *
+ * Note: We try to match the Mellanox HCA LED behavior as best
+ * we can. Green indicates physical link state is OK (something is
+ * plugged in, and we can train).
+ * Amber indicates the link is logically up (ACTIVE).
+ * Mellanox further blinks the amber LED to indicate data packet
+ * activity, but we have no hardware support for that, so it would
+ * require waking up every 10-20 msecs and checking the counters
+ * on the chip, and then turning the LED off if appropriate. That's
+ * visible overhead, so not something we will do.
+ */
+static void qib_setup_7322_setextled(struct qib_pportdata *ppd, u32 on)
+{
+ struct qib_devdata *dd = ppd->dd;
+ u64 extctl, ledblink = 0, val;
+ unsigned long flags;
+ int yel, grn;
+
+ /*
+ * The diags use the LED to indicate diag info, so we leave
+ * the external LED alone when the diags are running.
+ */
+ if (dd->diag_client)
+ return;
+
+ /* Allow override of LED display, e.g. for locating the system in a rack */
+ if (ppd->led_override) {
+ grn = (ppd->led_override & QIB_LED_PHYS);
+ yel = (ppd->led_override & QIB_LED_LOG);
+ } else if (on) {
+ val = qib_read_kreg_port(ppd, krp_ibcstatus_a);
+ grn = qib_7322_phys_portstate(val) ==
+ IB_PHYSPORTSTATE_LINKUP;
+ yel = qib_7322_iblink_state(val) == IB_PORT_ACTIVE;
+ } else {
+ grn = 0;
+ yel = 0;
+ }
+
+ spin_lock_irqsave(&dd->cspec->gpio_lock, flags);
+ extctl = dd->cspec->extctrl & (ppd->port == 1 ?
+ ~ExtLED_IB1_MASK : ~ExtLED_IB2_MASK);
+ if (grn) {
+ extctl |= ppd->port == 1 ? ExtLED_IB1_GRN : ExtLED_IB2_GRN;
+ /*
+ * Counts are in chip clock (4ns) periods.
+ * This is 66.6 ms (about 1/15 sec) on,
+ * 3/16 sec (187.5 ms) off, with packets rcvd.
+ */
+ ledblink = ((66600 * 1000UL / 4) << IBA7322_LEDBLINK_ON_SHIFT) |
+ ((187500 * 1000UL / 4) << IBA7322_LEDBLINK_OFF_SHIFT);
+ }
+ if (yel)
+ extctl |= ppd->port == 1 ? ExtLED_IB1_YEL : ExtLED_IB2_YEL;
+ dd->cspec->extctrl = extctl;
+ qib_write_kreg(dd, kr_extctrl, dd->cspec->extctrl);
+ spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags);
+
+ if (ledblink) /* blink the LED on packet receive */
+ qib_write_kreg_port(ppd, krp_rcvpktledcnt, ledblink);
+}
+
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+
+static int qib_7322_notify_dca(struct qib_devdata *dd, unsigned long event)
+{
+ switch (event) {
+ case DCA_PROVIDER_ADD:
+ if (dd->flags & QIB_DCA_ENABLED)
+ break;
+ if (!dca_add_requester(&dd->pcidev->dev)) {
+ qib_devinfo(dd->pcidev, "DCA enabled\n");
+ dd->flags |= QIB_DCA_ENABLED;
+ qib_setup_dca(dd);
+ }
+ break;
+ case DCA_PROVIDER_REMOVE:
+ if (dd->flags & QIB_DCA_ENABLED) {
+ dca_remove_requester(&dd->pcidev->dev);
+ dd->flags &= ~QIB_DCA_ENABLED;
+ dd->cspec->dca_ctrl = 0;
+ qib_write_kreg(dd, KREG_IDX(DCACtrlA),
+ dd->cspec->dca_ctrl);
+ }
+ break;
+ }
+ return 0;
+}
+
+static void qib_update_rhdrq_dca(struct qib_ctxtdata *rcd, int cpu)
+{
+ struct qib_devdata *dd = rcd->dd;
+ struct qib_chip_specific *cspec = dd->cspec;
+
+ if (!(dd->flags & QIB_DCA_ENABLED))
+ return;
+ if (cspec->rhdr_cpu[rcd->ctxt] != cpu) {
+ const struct dca_reg_map *rmp;
+
+ cspec->rhdr_cpu[rcd->ctxt] = cpu;
+ rmp = &dca_rcvhdr_reg_map[rcd->ctxt];
+ cspec->dca_rcvhdr_ctrl[rmp->shadow_inx] &= rmp->mask;
+ cspec->dca_rcvhdr_ctrl[rmp->shadow_inx] |=
+ (u64) dca3_get_tag(&dd->pcidev->dev, cpu) << rmp->lsb;
+ qib_devinfo(dd->pcidev,
+ "Ctxt %d cpu %d dca %llx\n", rcd->ctxt, cpu,
+ (long long) cspec->dca_rcvhdr_ctrl[rmp->shadow_inx]);
+ qib_write_kreg(dd, rmp->regno,
+ cspec->dca_rcvhdr_ctrl[rmp->shadow_inx]);
+ cspec->dca_ctrl |= SYM_MASK(DCACtrlA, RcvHdrqDCAEnable);
+ qib_write_kreg(dd, KREG_IDX(DCACtrlA), cspec->dca_ctrl);
+ }
+}
+
+static void qib_update_sdma_dca(struct qib_pportdata *ppd, int cpu)
+{
+ struct qib_devdata *dd = ppd->dd;
+ struct qib_chip_specific *cspec = dd->cspec;
+ unsigned pidx = ppd->port - 1;
+
+ if (!(dd->flags & QIB_DCA_ENABLED))
+ return;
+ if (cspec->sdma_cpu[pidx] != cpu) {
+ cspec->sdma_cpu[pidx] = cpu;
+ cspec->dca_rcvhdr_ctrl[4] &= ~(ppd->hw_pidx ?
+ SYM_MASK(DCACtrlF, SendDma1DCAOPH) :
+ SYM_MASK(DCACtrlF, SendDma0DCAOPH));
+ cspec->dca_rcvhdr_ctrl[4] |=
+ (u64) dca3_get_tag(&dd->pcidev->dev, cpu) <<
+ (ppd->hw_pidx ?
+ SYM_LSB(DCACtrlF, SendDma1DCAOPH) :
+ SYM_LSB(DCACtrlF, SendDma0DCAOPH));
+ qib_devinfo(dd->pcidev,
+ "sdma %d cpu %d dca %llx\n", ppd->hw_pidx, cpu,
+ (long long) cspec->dca_rcvhdr_ctrl[4]);
+ qib_write_kreg(dd, KREG_IDX(DCACtrlF),
+ cspec->dca_rcvhdr_ctrl[4]);
+ cspec->dca_ctrl |= ppd->hw_pidx ?
+ SYM_MASK(DCACtrlA, SendDMAHead1DCAEnable) :
+ SYM_MASK(DCACtrlA, SendDMAHead0DCAEnable);
+ qib_write_kreg(dd, KREG_IDX(DCACtrlA), cspec->dca_ctrl);
+ }
+}
+
+static void qib_setup_dca(struct qib_devdata *dd)
+{
+ struct qib_chip_specific *cspec = dd->cspec;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(cspec->rhdr_cpu); i++)
+ cspec->rhdr_cpu[i] = -1;
+ for (i = 0; i < ARRAY_SIZE(cspec->sdma_cpu); i++)
+ cspec->sdma_cpu[i] = -1;
+ cspec->dca_rcvhdr_ctrl[0] =
+ (1ULL << SYM_LSB(DCACtrlB, RcvHdrq0DCAXfrCnt)) |
+ (1ULL << SYM_LSB(DCACtrlB, RcvHdrq1DCAXfrCnt)) |
+ (1ULL << SYM_LSB(DCACtrlB, RcvHdrq2DCAXfrCnt)) |
+ (1ULL << SYM_LSB(DCACtrlB, RcvHdrq3DCAXfrCnt));
+ cspec->dca_rcvhdr_ctrl[1] =
+ (1ULL << SYM_LSB(DCACtrlC, RcvHdrq4DCAXfrCnt)) |
+ (1ULL << SYM_LSB(DCACtrlC, RcvHdrq5DCAXfrCnt)) |
+ (1ULL << SYM_LSB(DCACtrlC, RcvHdrq6DCAXfrCnt)) |
+ (1ULL << SYM_LSB(DCACtrlC, RcvHdrq7DCAXfrCnt));
+ cspec->dca_rcvhdr_ctrl[2] =
+ (1ULL << SYM_LSB(DCACtrlD, RcvHdrq8DCAXfrCnt)) |
+ (1ULL << SYM_LSB(DCACtrlD, RcvHdrq9DCAXfrCnt)) |
+ (1ULL << SYM_LSB(DCACtrlD, RcvHdrq10DCAXfrCnt)) |
+ (1ULL << SYM_LSB(DCACtrlD, RcvHdrq11DCAXfrCnt));
+ cspec->dca_rcvhdr_ctrl[3] =
+ (1ULL << SYM_LSB(DCACtrlE, RcvHdrq12DCAXfrCnt)) |
+ (1ULL << SYM_LSB(DCACtrlE, RcvHdrq13DCAXfrCnt)) |
+ (1ULL << SYM_LSB(DCACtrlE, RcvHdrq14DCAXfrCnt)) |
+ (1ULL << SYM_LSB(DCACtrlE, RcvHdrq15DCAXfrCnt));
+ cspec->dca_rcvhdr_ctrl[4] =
+ (1ULL << SYM_LSB(DCACtrlF, RcvHdrq16DCAXfrCnt)) |
+ (1ULL << SYM_LSB(DCACtrlF, RcvHdrq17DCAXfrCnt));
+ for (i = 0; i < ARRAY_SIZE(cspec->sdma_cpu); i++)
+ qib_write_kreg(dd, KREG_IDX(DCACtrlB) + i,
+ cspec->dca_rcvhdr_ctrl[i]);
+ for (i = 0; i < cspec->num_msix_entries; i++)
+ setup_dca_notifier(dd, &cspec->msix_entries[i]);
+}
+
+static void qib_irq_notifier_notify(struct irq_affinity_notify *notify,
+ const cpumask_t *mask)
+{
+ struct qib_irq_notify *n =
+ container_of(notify, struct qib_irq_notify, notify);
+ int cpu = cpumask_first(mask);
+
+ if (n->rcv) {
+ struct qib_ctxtdata *rcd = (struct qib_ctxtdata *)n->arg;
+
+ qib_update_rhdrq_dca(rcd, cpu);
+ } else {
+ struct qib_pportdata *ppd = (struct qib_pportdata *)n->arg;
+
+ qib_update_sdma_dca(ppd, cpu);
+ }
+}
+
+static void qib_irq_notifier_release(struct kref *ref)
+{
+ struct qib_irq_notify *n =
+ container_of(ref, struct qib_irq_notify, notify.kref);
+ struct qib_devdata *dd;
+
+ if (n->rcv) {
+ struct qib_ctxtdata *rcd = (struct qib_ctxtdata *)n->arg;
+
+ dd = rcd->dd;
+ } else {
+ struct qib_pportdata *ppd = (struct qib_pportdata *)n->arg;
+
+ dd = ppd->dd;
+ }
+ qib_devinfo(dd->pcidev,
+ "release on HCA notify 0x%p n 0x%p\n", ref, n);
+ kfree(n);
+}
+#endif
+
+/*
+ * Disable MSIx interrupt if enabled, call generic MSIx code
+ * to cleanup, and clear pending MSIx interrupts.
+ * Used for fallback to INTx, after reset, and when MSIx setup fails.
+ */
+static void qib_7322_nomsix(struct qib_devdata *dd)
+{
+ u64 intgranted;
+ int n;
+
+ dd->cspec->main_int_mask = ~0ULL;
+ n = dd->cspec->num_msix_entries;
+ if (n) {
+ int i;
+
+ dd->cspec->num_msix_entries = 0;
+ for (i = 0; i < n; i++) {
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+ reset_dca_notifier(dd, &dd->cspec->msix_entries[i]);
+#endif
+ irq_set_affinity_hint(
+ dd->cspec->msix_entries[i].msix.vector, NULL);
+ free_cpumask_var(dd->cspec->msix_entries[i].mask);
+ free_irq(dd->cspec->msix_entries[i].msix.vector,
+ dd->cspec->msix_entries[i].arg);
+ }
+ qib_nomsix(dd);
+ }
+ /* make sure no MSIx interrupts are left pending */
+ intgranted = qib_read_kreg64(dd, kr_intgranted);
+ if (intgranted)
+ qib_write_kreg(dd, kr_intgranted, intgranted);
+}
+
+static void qib_7322_free_irq(struct qib_devdata *dd)
+{
+ if (dd->cspec->irq) {
+ free_irq(dd->cspec->irq, dd);
+ dd->cspec->irq = 0;
+ }
+ qib_7322_nomsix(dd);
+}
+
+static void qib_setup_7322_cleanup(struct qib_devdata *dd)
+{
+ int i;
+
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+ if (dd->flags & QIB_DCA_ENABLED) {
+ dca_remove_requester(&dd->pcidev->dev);
+ dd->flags &= ~QIB_DCA_ENABLED;
+ dd->cspec->dca_ctrl = 0;
+ qib_write_kreg(dd, KREG_IDX(DCACtrlA), dd->cspec->dca_ctrl);
+ }
+#endif
+
+ qib_7322_free_irq(dd);
+ kfree(dd->cspec->cntrs);
+ kfree(dd->cspec->sendchkenable);
+ kfree(dd->cspec->sendgrhchk);
+ kfree(dd->cspec->sendibchk);
+ kfree(dd->cspec->msix_entries);
+ for (i = 0; i < dd->num_pports; i++) {
+ unsigned long flags;
+ u32 mask = QSFP_GPIO_MOD_PRS_N |
+ (QSFP_GPIO_MOD_PRS_N << QSFP_GPIO_PORT2_SHIFT);
+
+ kfree(dd->pport[i].cpspec->portcntrs);
+ if (dd->flags & QIB_HAS_QSFP) {
+ spin_lock_irqsave(&dd->cspec->gpio_lock, flags);
+ dd->cspec->gpio_mask &= ~mask;
+ qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask);
+ spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags);
+ qib_qsfp_deinit(&dd->pport[i].cpspec->qsfp_data);
+ }
+ if (dd->pport[i].ibport_data.smi_ah)
+ ib_destroy_ah(&dd->pport[i].ibport_data.smi_ah->ibah);
+ }
+}
+
+/* handle SDMA interrupts */
+static void sdma_7322_intr(struct qib_devdata *dd, u64 istat)
+{
+ struct qib_pportdata *ppd0 = &dd->pport[0];
+ struct qib_pportdata *ppd1 = &dd->pport[1];
+ u64 intr0 = istat & (INT_MASK_P(SDma, 0) |
+ INT_MASK_P(SDmaIdle, 0) | INT_MASK_P(SDmaProgress, 0));
+ u64 intr1 = istat & (INT_MASK_P(SDma, 1) |
+ INT_MASK_P(SDmaIdle, 1) | INT_MASK_P(SDmaProgress, 1));
+
+ if (intr0)
+ qib_sdma_intr(ppd0);
+ if (intr1)
+ qib_sdma_intr(ppd1);
+
+ if (istat & INT_MASK_PM(SDmaCleanupDone, 0))
+ qib_sdma_process_event(ppd0, qib_sdma_event_e20_hw_started);
+ if (istat & INT_MASK_PM(SDmaCleanupDone, 1))
+ qib_sdma_process_event(ppd1, qib_sdma_event_e20_hw_started);
+}
+
+/*
+ * Set or clear the Send buffer available interrupt enable bit.
+ */
+static void qib_wantpiobuf_7322_intr(struct qib_devdata *dd, u32 needint)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->sendctrl_lock, flags);
+ if (needint)
+ dd->sendctrl |= SYM_MASK(SendCtrl, SendIntBufAvail);
+ else
+ dd->sendctrl &= ~SYM_MASK(SendCtrl, SendIntBufAvail);
+ qib_write_kreg(dd, kr_sendctrl, dd->sendctrl);
+ qib_write_kreg(dd, kr_scratch, 0ULL);
+ spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+}
+
+/*
+ * Somehow got an interrupt with reserved bits set in interrupt status.
+ * Print a message so we know it happened, then clear them.
+ * keep mainline interrupt handler cache-friendly
+ */
+static noinline void unknown_7322_ibits(struct qib_devdata *dd, u64 istat)
+{
+ u64 kills;
+
+ kills = istat & ~QIB_I_BITSEXTANT;
+ qib_dev_err(dd,
+ "Clearing reserved interrupt(s) 0x%016llx\n",
+ (unsigned long long) kills);
+ qib_write_kreg(dd, kr_intmask, (dd->cspec->int_enable_mask & ~kills));
+}
+
+/* keep mainline interrupt handler cache-friendly */
+static noinline void unknown_7322_gpio_intr(struct qib_devdata *dd)
+{
+ u32 gpiostatus;
+ int handled = 0;
+ int pidx;
+
+ /*
+ * Boards for this chip currently don't use GPIO interrupts,
+ * so clear by writing GPIOstatus to GPIOclear, and complain
+ * to developer. To avoid endless repeats, clear
+ * the bits in the mask, since there is some kind of
+ * programming error or chip problem.
+ */
+ gpiostatus = qib_read_kreg32(dd, kr_gpio_status);
+ /*
+ * In theory, writing GPIOstatus to GPIOclear could
+ * have a bad side-effect on some diagnostic that wanted
+ * to poll for a status-change, but the various shadows
+ * make that problematic at best. Diags will just suppress
+ * all GPIO interrupts during such tests.
+ */
+ qib_write_kreg(dd, kr_gpio_clear, gpiostatus);
+ /*
+ * Check for QSFP MOD_PRS changes
+ * only works for single port if IB1 != pidx1
+ */
+ for (pidx = 0; pidx < dd->num_pports && (dd->flags & QIB_HAS_QSFP);
+ ++pidx) {
+ struct qib_pportdata *ppd;
+ struct qib_qsfp_data *qd;
+ u32 mask;
+
+ if (!dd->pport[pidx].link_speed_supported)
+ continue;
+ mask = QSFP_GPIO_MOD_PRS_N;
+ ppd = dd->pport + pidx;
+ mask <<= (QSFP_GPIO_PORT2_SHIFT * ppd->hw_pidx);
+ if (gpiostatus & dd->cspec->gpio_mask & mask) {
+ u64 pins;
+
+ qd = &ppd->cpspec->qsfp_data;
+ gpiostatus &= ~mask;
+ pins = qib_read_kreg64(dd, kr_extstatus);
+ pins >>= SYM_LSB(EXTStatus, GPIOIn);
+ if (!(pins & mask)) {
+ ++handled;
+ qd->t_insert = jiffies;
+ queue_work(ib_wq, &qd->work);
+ }
+ }
+ }
+
+ if (gpiostatus && !handled) {
+ const u32 mask = qib_read_kreg32(dd, kr_gpio_mask);
+ u32 gpio_irq = mask & gpiostatus;
+
+ /*
+ * Clear any troublemakers, and update chip from shadow
+ */
+ dd->cspec->gpio_mask &= ~gpio_irq;
+ qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask);
+ }
+}
+
+/*
+ * Handle errors and unusual events first, separate function
+ * to improve cache hits for fast path interrupt handling.
+ */
+static noinline void unlikely_7322_intr(struct qib_devdata *dd, u64 istat)
+{
+ if (istat & ~QIB_I_BITSEXTANT)
+ unknown_7322_ibits(dd, istat);
+ if (istat & QIB_I_GPIO)
+ unknown_7322_gpio_intr(dd);
+ if (istat & QIB_I_C_ERROR) {
+ qib_write_kreg(dd, kr_errmask, 0ULL);
+ tasklet_schedule(&dd->error_tasklet);
+ }
+ if (istat & INT_MASK_P(Err, 0) && dd->rcd[0])
+ handle_7322_p_errors(dd->rcd[0]->ppd);
+ if (istat & INT_MASK_P(Err, 1) && dd->rcd[1])
+ handle_7322_p_errors(dd->rcd[1]->ppd);
+}
+
+/*
+ * Dynamically adjust the rcv int timeout for a context based on incoming
+ * packet rate.
+ */
+static void adjust_rcv_timeout(struct qib_ctxtdata *rcd, int npkts)
+{
+ struct qib_devdata *dd = rcd->dd;
+ u32 timeout = dd->cspec->rcvavail_timeout[rcd->ctxt];
+
+ /*
+ * Dynamically adjust idle timeout on chip
+ * based on number of packets processed.
+ */
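+ /*
+ * Illustration (assuming typical module-parameter values such as
+ * rcv_int_count = 16 and rcv_int_timeout = 375): a context seeing
+ * fewer than 16 packets per interrupt has its timeout halved while
+ * it is above 2; one seeing 16 or more has it doubled, capped at 375.
+ */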
+ if (npkts < rcv_int_count && timeout > 2)
+ timeout >>= 1;
+ else if (npkts >= rcv_int_count && timeout < rcv_int_timeout)
+ timeout = min(timeout << 1, rcv_int_timeout);
+ else
+ return;
+
+ dd->cspec->rcvavail_timeout[rcd->ctxt] = timeout;
+ qib_write_kreg(dd, kr_rcvavailtimeout + rcd->ctxt, timeout);
+}
+
+/*
+ * This is the main interrupt handler.
+ * It will normally only be used for low frequency interrupts but may
+ * have to handle all interrupts if INTx is enabled or fewer than normal
+ * MSIx interrupts were allocated.
+ * This routine should ignore the interrupt bits for any of the
+ * dedicated MSIx handlers.
+ */
+static irqreturn_t qib_7322intr(int irq, void *data)
+{
+ struct qib_devdata *dd = data;
+ irqreturn_t ret;
+ u64 istat;
+ u64 ctxtrbits;
+ u64 rmask;
+ unsigned i;
+ u32 npkts;
+
+ if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT) {
+ /*
+ * This return value is not great, but we do not want the
+ * interrupt core code to remove our interrupt handler
+ * because we don't appear to be handling an interrupt
+ * during a chip reset.
+ */
+ ret = IRQ_HANDLED;
+ goto bail;
+ }
+
+ istat = qib_read_kreg64(dd, kr_intstatus);
+
+ if (unlikely(istat == ~0ULL)) {
+ qib_bad_intrstatus(dd);
+ qib_dev_err(dd, "Interrupt status all f's, skipping\n");
+ /* don't know if it was our interrupt or not */
+ ret = IRQ_NONE;
+ goto bail;
+ }
+
+ istat &= dd->cspec->main_int_mask;
+ if (unlikely(!istat)) {
+ /* already handled, or shared and not us */
+ ret = IRQ_NONE;
+ goto bail;
+ }
+
+ this_cpu_inc(*dd->int_counter);
+
+ /* handle "errors" of various kinds first, device ahead of port */
+ if (unlikely(istat & (~QIB_I_BITSEXTANT | QIB_I_GPIO |
+ QIB_I_C_ERROR | INT_MASK_P(Err, 0) |
+ INT_MASK_P(Err, 1))))
+ unlikely_7322_intr(dd, istat);
+
+ /*
+ * Clear the interrupt bits we found set, relatively early, so we
+ * "know" know the chip will have seen this by the time we process
+ * the queue, and will re-interrupt if necessary. The processor
+ * itself won't take the interrupt again until we return.
+ */
+ qib_write_kreg(dd, kr_intclear, istat);
+
+ /*
+ * Handle kernel receive queues before checking for pio buffers
+ * available since receives can overflow; piobuf waiters can afford
+ * a few extra cycles, since they were waiting anyway.
+ */
+ ctxtrbits = istat & (QIB_I_RCVAVAIL_MASK | QIB_I_RCVURG_MASK);
+ if (ctxtrbits) {
+ rmask = (1ULL << QIB_I_RCVAVAIL_LSB) |
+ (1ULL << QIB_I_RCVURG_LSB);
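+ /*
+ * Context i's "rcv available" and "rcv urgent" bits sit at
+ * (QIB_I_RCVAVAIL_LSB + i) and (QIB_I_RCVURG_LSB + i), so shifting
+ * rmask left by one per iteration tests both bits for each kernel
+ * context in turn.
+ */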
+ for (i = 0; i < dd->first_user_ctxt; i++) {
+ if (ctxtrbits & rmask) {
+ ctxtrbits &= ~rmask;
+ if (dd->rcd[i])
+ qib_kreceive(dd->rcd[i], NULL, &npkts);
+ }
+ rmask <<= 1;
+ }
+ if (ctxtrbits) {
+ ctxtrbits = (ctxtrbits >> QIB_I_RCVAVAIL_LSB) |
+ (ctxtrbits >> QIB_I_RCVURG_LSB);
+ qib_handle_urcv(dd, ctxtrbits);
+ }
+ }
+
+ if (istat & (QIB_I_P_SDMAINT(0) | QIB_I_P_SDMAINT(1)))
+ sdma_7322_intr(dd, istat);
+
+ if ((istat & QIB_I_SPIOBUFAVAIL) && (dd->flags & QIB_INITTED))
+ qib_ib_piobufavail(dd);
+
+ ret = IRQ_HANDLED;
+bail:
+ return ret;
+}
+
+/*
+ * Dedicated receive packet available interrupt handler.
+ */
+static irqreturn_t qib_7322pintr(int irq, void *data)
+{
+ struct qib_ctxtdata *rcd = data;
+ struct qib_devdata *dd = rcd->dd;
+ u32 npkts;
+
+ if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT)
+ /*
+ * This return value is not great, but we do not want the
+ * interrupt core code to remove our interrupt handler
+ * because we don't appear to be handling an interrupt
+ * during a chip reset.
+ */
+ return IRQ_HANDLED;
+
+ this_cpu_inc(*dd->int_counter);
+
+ /* Clear the interrupt bit we expect to be set. */
+ qib_write_kreg(dd, kr_intclear, ((1ULL << QIB_I_RCVAVAIL_LSB) |
+ (1ULL << QIB_I_RCVURG_LSB)) << rcd->ctxt);
+
+ qib_kreceive(rcd, NULL, &npkts);
+
+ return IRQ_HANDLED;
+}
+
+/*
+ * Dedicated Send buffer available interrupt handler.
+ */
+static irqreturn_t qib_7322bufavail(int irq, void *data)
+{
+ struct qib_devdata *dd = data;
+
+ if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT)
+ /*
+ * This return value is not great, but we do not want the
+ * interrupt core code to remove our interrupt handler
+ * because we don't appear to be handling an interrupt
+ * during a chip reset.
+ */
+ return IRQ_HANDLED;
+
+ this_cpu_inc(*dd->int_counter);
+
+ /* Clear the interrupt bit we expect to be set. */
+ qib_write_kreg(dd, kr_intclear, QIB_I_SPIOBUFAVAIL);
+
+ /* qib_ib_piobufavail() will clear the want PIO interrupt if needed */
+ if (dd->flags & QIB_INITTED)
+ qib_ib_piobufavail(dd);
+ else
+ qib_wantpiobuf_7322_intr(dd, 0);
+
+ return IRQ_HANDLED;
+}
+
+/*
+ * Dedicated Send DMA interrupt handler.
+ */
+static irqreturn_t sdma_intr(int irq, void *data)
+{
+ struct qib_pportdata *ppd = data;
+ struct qib_devdata *dd = ppd->dd;
+
+ if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT)
+ /*
+ * This return value is not great, but we do not want the
+ * interrupt core code to remove our interrupt handler
+ * because we don't appear to be handling an interrupt
+ * during a chip reset.
+ */
+ return IRQ_HANDLED;
+
+ this_cpu_inc(*dd->int_counter);
+
+ /* Clear the interrupt bit we expect to be set. */
+ qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ?
+ INT_MASK_P(SDma, 1) : INT_MASK_P(SDma, 0));
+ qib_sdma_intr(ppd);
+
+ return IRQ_HANDLED;
+}
+
+/*
+ * Dedicated Send DMA idle interrupt handler.
+ */
+static irqreturn_t sdma_idle_intr(int irq, void *data)
+{
+ struct qib_pportdata *ppd = data;
+ struct qib_devdata *dd = ppd->dd;
+
+ if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT)
+ /*
+ * This return value is not great, but we do not want the
+ * interrupt core code to remove our interrupt handler
+ * because we don't appear to be handling an interrupt
+ * during a chip reset.
+ */
+ return IRQ_HANDLED;
+
+ this_cpu_inc(*dd->int_counter);
+
+ /* Clear the interrupt bit we expect to be set. */
+ qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ?
+ INT_MASK_P(SDmaIdle, 1) : INT_MASK_P(SDmaIdle, 0));
+ qib_sdma_intr(ppd);
+
+ return IRQ_HANDLED;
+}
+
+/*
+ * Dedicated Send DMA progress interrupt handler.
+ */
+static irqreturn_t sdma_progress_intr(int irq, void *data)
+{
+ struct qib_pportdata *ppd = data;
+ struct qib_devdata *dd = ppd->dd;
+
+ if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT)
+ /*
+ * This return value is not great, but we do not want the
+ * interrupt core code to remove our interrupt handler
+ * because we don't appear to be handling an interrupt
+ * during a chip reset.
+ */
+ return IRQ_HANDLED;
+
+ this_cpu_inc(*dd->int_counter);
+
+ /* Clear the interrupt bit we expect to be set. */
+ qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ?
+ INT_MASK_P(SDmaProgress, 1) :
+ INT_MASK_P(SDmaProgress, 0));
+ qib_sdma_intr(ppd);
+
+ return IRQ_HANDLED;
+}
+
+/*
+ * Dedicated Send DMA cleanup interrupt handler.
+ */
+static irqreturn_t sdma_cleanup_intr(int irq, void *data)
+{
+ struct qib_pportdata *ppd = data;
+ struct qib_devdata *dd = ppd->dd;
+
+ if ((dd->flags & (QIB_PRESENT | QIB_BADINTR)) != QIB_PRESENT)
+ /*
+ * This return value is not great, but we do not want the
+ * interrupt core code to remove our interrupt handler
+ * because we don't appear to be handling an interrupt
+ * during a chip reset.
+ */
+ return IRQ_HANDLED;
+
+ this_cpu_inc(*dd->int_counter);
+
+ /* Clear the interrupt bit we expect to be set. */
+ qib_write_kreg(dd, kr_intclear, ppd->hw_pidx ?
+ INT_MASK_PM(SDmaCleanupDone, 1) :
+ INT_MASK_PM(SDmaCleanupDone, 0));
+ qib_sdma_process_event(ppd, qib_sdma_event_e20_hw_started);
+
+ return IRQ_HANDLED;
+}
+
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+
+static void reset_dca_notifier(struct qib_devdata *dd, struct qib_msix_entry *m)
+{
+ if (!m->dca)
+ return;
+ qib_devinfo(dd->pcidev,
+ "Disabling notifier on HCA %d irq %d\n",
+ dd->unit,
+ m->msix.vector);
+ irq_set_affinity_notifier(
+ m->msix.vector,
+ NULL);
+ m->notifier = NULL;
+}
+
+static void setup_dca_notifier(struct qib_devdata *dd, struct qib_msix_entry *m)
+{
+ struct qib_irq_notify *n;
+
+ if (!m->dca)
+ return;
+ n = kzalloc(sizeof(*n), GFP_KERNEL);
+ if (n) {
+ int ret;
+
+ m->notifier = n;
+ n->notify.irq = m->msix.vector;
+ n->notify.notify = qib_irq_notifier_notify;
+ n->notify.release = qib_irq_notifier_release;
+ n->arg = m->arg;
+ n->rcv = m->rcv;
+ qib_devinfo(dd->pcidev,
+ "set notifier irq %d rcv %d notify %p\n",
+ n->notify.irq, n->rcv, &n->notify);
+ ret = irq_set_affinity_notifier(
+ n->notify.irq,
+ &n->notify);
+ if (ret) {
+ m->notifier = NULL;
+ kfree(n);
+ }
+ }
+}
+
+#endif
+
+/*
+ * Set up our chip-specific interrupt handler.
+ * The interrupt type has already been set up, so
+ * we just need to do the registration and error checking.
+ * If we are using MSIx interrupts, we may fall back to
+ * INTx later, if the interrupt handler doesn't get called
+ * within 1/2 second (see verify_interrupt()).
+ */
+static void qib_setup_7322_interrupt(struct qib_devdata *dd, int clearpend)
+{
+ int ret, i, msixnum;
+ u64 redirect[6];
+ u64 mask;
+ const struct cpumask *local_mask;
+ int firstcpu, secondcpu = 0, currrcvcpu = 0;
+
+ if (!dd->num_pports)
+ return;
+
+ if (clearpend) {
+ /*
+ * if not switching interrupt types, be sure interrupts are
+ * disabled, and then clear anything pending at this point,
+ * because we are starting clean.
+ */
+ qib_7322_set_intr_state(dd, 0);
+
+ /* clear the reset error, init error/hwerror mask */
+ qib_7322_init_hwerrors(dd);
+
+ /* clear any interrupt bits that might be set */
+ qib_write_kreg(dd, kr_intclear, ~0ULL);
+
+ /* make sure no pending MSIx intr, and clear diag reg */
+ qib_write_kreg(dd, kr_intgranted, ~0ULL);
+ qib_write_kreg(dd, kr_vecclr_wo_int, ~0ULL);
+ }
+
+ if (!dd->cspec->num_msix_entries) {
+ /* Try to get INTx interrupt */
+try_intx:
+ if (!dd->pcidev->irq) {
+ qib_dev_err(dd,
+ "irq is 0, BIOS error? Interrupts won't work\n");
+ goto bail;
+ }
+ ret = request_irq(dd->pcidev->irq, qib_7322intr,
+ IRQF_SHARED, QIB_DRV_NAME, dd);
+ if (ret) {
+ qib_dev_err(dd,
+ "Couldn't setup INTx interrupt (irq=%d): %d\n",
+ dd->pcidev->irq, ret);
+ goto bail;
+ }
+ dd->cspec->irq = dd->pcidev->irq;
+ dd->cspec->main_int_mask = ~0ULL;
+ goto bail;
+ }
+
+ /* Try to get MSIx interrupts */
+ memset(redirect, 0, sizeof(redirect));
+ mask = ~0ULL;
+ msixnum = 0;
+ local_mask = cpumask_of_pcibus(dd->pcidev->bus);
+ firstcpu = cpumask_first(local_mask);
+ if (firstcpu >= nr_cpu_ids ||
+ cpumask_weight(local_mask) == num_online_cpus()) {
+ local_mask = topology_core_cpumask(0);
+ firstcpu = cpumask_first(local_mask);
+ }
+ if (firstcpu < nr_cpu_ids) {
+ secondcpu = cpumask_next(firstcpu, local_mask);
+ if (secondcpu >= nr_cpu_ids)
+ secondcpu = firstcpu;
+ currrcvcpu = secondcpu;
+ }
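+ /*
+ * Example of the resulting spread (CPU numbers purely illustrative):
+ * if the mask local to the PCIe bus is CPUs 8-15, firstcpu is 8 and
+ * secondcpu is 9; the non-receive vectors (errors, SDMA, buffer
+ * available) are hinted to CPU 8, while the per-context receive
+ * vectors are spread round-robin over CPUs 9..15, wrapping back to 9.
+ */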
+ for (i = 0; msixnum < dd->cspec->num_msix_entries; i++) {
+ irq_handler_t handler;
+ void *arg;
+ u64 val;
+ int lsb, reg, sh;
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+ int dca = 0;
+#endif
+
+ dd->cspec->msix_entries[msixnum].
+ name[sizeof(dd->cspec->msix_entries[msixnum].name) - 1]
+ = '\0';
+ if (i < ARRAY_SIZE(irq_table)) {
+ if (irq_table[i].port) {
+ /* skip if for a non-configured port */
+ if (irq_table[i].port > dd->num_pports)
+ continue;
+ arg = dd->pport + irq_table[i].port - 1;
+ } else
+ arg = dd;
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+ dca = irq_table[i].dca;
+#endif
+ lsb = irq_table[i].lsb;
+ handler = irq_table[i].handler;
+ snprintf(dd->cspec->msix_entries[msixnum].name,
+ sizeof(dd->cspec->msix_entries[msixnum].name)
+ - 1,
+ QIB_DRV_NAME "%d%s", dd->unit,
+ irq_table[i].name);
+ } else {
+ unsigned ctxt;
+
+ ctxt = i - ARRAY_SIZE(irq_table);
+ /* per krcvq context receive interrupt */
+ arg = dd->rcd[ctxt];
+ if (!arg)
+ continue;
+ if (qib_krcvq01_no_msi && ctxt < 2)
+ continue;
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+ dca = 1;
+#endif
+ lsb = QIB_I_RCVAVAIL_LSB + ctxt;
+ handler = qib_7322pintr;
+ snprintf(dd->cspec->msix_entries[msixnum].name,
+ sizeof(dd->cspec->msix_entries[msixnum].name)
+ - 1,
+ QIB_DRV_NAME "%d (kctx)", dd->unit);
+ }
+ ret = request_irq(
+ dd->cspec->msix_entries[msixnum].msix.vector,
+ handler, 0, dd->cspec->msix_entries[msixnum].name,
+ arg);
+ if (ret) {
+ /*
+ * Shouldn't happen since the enable said we could
+ * have as many as we are trying to set up here.
+ */
+ qib_dev_err(dd,
+ "Couldn't setup MSIx interrupt (vec=%d, irq=%d): %d\n",
+ msixnum,
+ dd->cspec->msix_entries[msixnum].msix.vector,
+ ret);
+ qib_7322_nomsix(dd);
+ goto try_intx;
+ }
+ dd->cspec->msix_entries[msixnum].arg = arg;
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+ dd->cspec->msix_entries[msixnum].dca = dca;
+ dd->cspec->msix_entries[msixnum].rcv =
+ handler == qib_7322pintr;
+#endif
+ if (lsb >= 0) {
+ reg = lsb / IBA7322_REDIRECT_VEC_PER_REG;
+ sh = (lsb % IBA7322_REDIRECT_VEC_PER_REG) *
+ SYM_LSB(IntRedirect0, vec1);
+ mask &= ~(1ULL << lsb);
+ redirect[reg] |= ((u64) msixnum) << sh;
+ }
+ val = qib_read_kreg64(dd, 2 * msixnum + 1 +
+ (QIB_7322_MsixTable_OFFS / sizeof(u64)));
+ if (firstcpu < nr_cpu_ids &&
+ zalloc_cpumask_var(
+ &dd->cspec->msix_entries[msixnum].mask,
+ GFP_KERNEL)) {
+ if (handler == qib_7322pintr) {
+ cpumask_set_cpu(currrcvcpu,
+ dd->cspec->msix_entries[msixnum].mask);
+ currrcvcpu = cpumask_next(currrcvcpu,
+ local_mask);
+ if (currrcvcpu >= nr_cpu_ids)
+ currrcvcpu = secondcpu;
+ } else {
+ cpumask_set_cpu(firstcpu,
+ dd->cspec->msix_entries[msixnum].mask);
+ }
+ irq_set_affinity_hint(
+ dd->cspec->msix_entries[msixnum].msix.vector,
+ dd->cspec->msix_entries[msixnum].mask);
+ }
+ msixnum++;
+ }
+ /* Initialize the vector mapping */
+ for (i = 0; i < ARRAY_SIZE(redirect); i++)
+ qib_write_kreg(dd, kr_intredirect + i, redirect[i]);
+ dd->cspec->main_int_mask = mask;
+ tasklet_init(&dd->error_tasklet, qib_error_tasklet,
+ (unsigned long)dd);
+bail:;
+}
+
+/**
+ * qib_7322_boardname - fill in the board name and note features
+ * @dd: the qlogic_ib device
+ *
+ * info will be based on the board revision register
+ */
+static unsigned qib_7322_boardname(struct qib_devdata *dd)
+{
+ /* Will need enumeration of board-types here */
+ char *n;
+ u32 boardid, namelen;
+ unsigned features = DUAL_PORT_CAP;
+
+ boardid = SYM_FIELD(dd->revision, Revision, BoardID);
+
+ switch (boardid) {
+ case 0:
+ n = "InfiniPath_QLE7342_Emulation";
+ break;
+ case 1:
+ n = "InfiniPath_QLE7340";
+ dd->flags |= QIB_HAS_QSFP;
+ features = PORT_SPD_CAP;
+ break;
+ case 2:
+ n = "InfiniPath_QLE7342";
+ dd->flags |= QIB_HAS_QSFP;
+ break;
+ case 3:
+ n = "InfiniPath_QMI7342";
+ break;
+ case 4:
+ n = "InfiniPath_Unsupported7342";
+ qib_dev_err(dd, "Unsupported version of QMH7342\n");
+ features = 0;
+ break;
+ case BOARD_QMH7342:
+ n = "InfiniPath_QMH7342";
+ features = 0x24;
+ break;
+ case BOARD_QME7342:
+ n = "InfiniPath_QME7342";
+ break;
+ case 8:
+ n = "InfiniPath_QME7362";
+ dd->flags |= QIB_HAS_QSFP;
+ break;
+ case BOARD_QMH7360:
+ n = "Intel IB QDR 1P FLR-QSFP Adptr";
+ dd->flags |= QIB_HAS_QSFP;
+ break;
+ case 15:
+ n = "InfiniPath_QLE7342_TEST";
+ dd->flags |= QIB_HAS_QSFP;
+ break;
+ default:
+ n = "InfiniPath_QLE73xy_UNKNOWN";
+ qib_dev_err(dd, "Unknown 7322 board type %u\n", boardid);
+ break;
+ }
+ dd->board_atten = 1; /* index into txdds_Xdr */
+
+ namelen = strlen(n) + 1;
+ dd->boardname = kmalloc(namelen, GFP_KERNEL);
+ if (!dd->boardname)
+ qib_dev_err(dd, "Failed allocation for board name: %s\n", n);
+ else
+ snprintf(dd->boardname, namelen, "%s", n);
+
+ snprintf(dd->boardversion, sizeof(dd->boardversion),
+ "ChipABI %u.%u, %s, InfiniPath%u %u.%u, SW Compat %u\n",
+ QIB_CHIP_VERS_MAJ, QIB_CHIP_VERS_MIN, dd->boardname,
+ (unsigned)SYM_FIELD(dd->revision, Revision_R, Arch),
+ dd->majrev, dd->minrev,
+ (unsigned)SYM_FIELD(dd->revision, Revision_R, SW));
+
+ if (qib_singleport && (features >> PORT_SPD_CAP_SHIFT) & PORT_SPD_CAP) {
+ qib_devinfo(dd->pcidev,
+ "IB%u: Forced to single port mode by module parameter\n",
+ dd->unit);
+ features &= PORT_SPD_CAP;
+ }
+
+ return features;
+}
+
+/*
+ * This routine sleeps, so it can only be called from user context, not
+ * from interrupt context.
+ */
+static int qib_do_7322_reset(struct qib_devdata *dd)
+{
+ u64 val;
+ u64 *msix_vecsave;
+ int i, msix_entries, ret = 1;
+ u16 cmdval;
+ u8 int_line, clinesz;
+ unsigned long flags;
+
+ /* Use dev_err so it shows up in logs, etc. */
+ qib_dev_err(dd, "Resetting InfiniPath unit %u\n", dd->unit);
+
+ qib_pcie_getcmd(dd, &cmdval, &int_line, &clinesz);
+
+ msix_entries = dd->cspec->num_msix_entries;
+
+ /* no interrupts till re-initted */
+ qib_7322_set_intr_state(dd, 0);
+
+ if (msix_entries) {
+ qib_7322_nomsix(dd);
+ /* can be up to 512 bytes, too big for stack */
+ msix_vecsave = kmalloc(2 * dd->cspec->num_msix_entries *
+ sizeof(u64), GFP_KERNEL);
+ if (!msix_vecsave)
+ qib_dev_err(dd, "No mem to save MSIx data\n");
+ } else
+ msix_vecsave = NULL;
+
+ /*
+ * Core PCI (as of 2.6.18) doesn't save or rewrite the full vector
+ * info that is set up by the BIOS, so we have to save and restore
+ * it ourselves. There is some risk something could change it,
+ * after we save it, but since we have disabled the MSIx, it
+ * shouldn't be touched...
+ */
+ for (i = 0; i < msix_entries; i++) {
+ u64 vecaddr, vecdata;
+
+ vecaddr = qib_read_kreg64(dd, 2 * i +
+ (QIB_7322_MsixTable_OFFS / sizeof(u64)));
+ vecdata = qib_read_kreg64(dd, 1 + 2 * i +
+ (QIB_7322_MsixTable_OFFS / sizeof(u64)));
+ if (msix_vecsave) {
+ msix_vecsave[2 * i] = vecaddr;
+ /* save it without the masked bit set */
+ msix_vecsave[1 + 2 * i] = vecdata & ~0x100000000ULL;
+ }
+ }
+
+ dd->pport->cpspec->ibdeltainprog = 0;
+ dd->pport->cpspec->ibsymdelta = 0;
+ dd->pport->cpspec->iblnkerrdelta = 0;
+ dd->pport->cpspec->ibmalfdelta = 0;
+ /* so we check interrupts work again */
+ dd->z_int_counter = qib_int_counter(dd);
+
+ /*
+ * Keep chip from being accessed until we are ready. Use
+ * writeq() directly, to allow the write even though QIB_PRESENT
+ * isn't set.
+ */
+ dd->flags &= ~(QIB_INITTED | QIB_PRESENT | QIB_BADINTR);
+ dd->flags |= QIB_DOING_RESET;
+ val = dd->control | QLOGIC_IB_C_RESET;
+ writeq(val, &dd->kregbase[kr_control]);
+
+ for (i = 1; i <= 5; i++) {
+ /*
+ * Allow MBIST, etc. to complete; longer on each retry (7s to 19s).
+ * We sometimes get machine checks from bus timeout if no
+ * response, so for now, make it *really* long.
+ */
+ msleep(1000 + (1 + i) * 3000);
+
+ qib_pcie_reenable(dd, cmdval, int_line, clinesz);
+
+ /*
+ * Use readq directly, so we don't need to mark it as PRESENT
+ * until we get a successful indication that all is well.
+ */
+ val = readq(&dd->kregbase[kr_revision]);
+ if (val == dd->revision)
+ break;
+ if (i == 5) {
+ qib_dev_err(dd,
+ "Failed to initialize after reset, unusable\n");
+ ret = 0;
+ goto bail;
+ }
+ }
+
+ dd->flags |= QIB_PRESENT; /* it's back */
+
+ if (msix_entries) {
+ /* restore the MSIx vector address and data if saved above */
+ for (i = 0; i < msix_entries; i++) {
+ dd->cspec->msix_entries[i].msix.entry = i;
+ if (!msix_vecsave || !msix_vecsave[2 * i])
+ continue;
+ qib_write_kreg(dd, 2 * i +
+ (QIB_7322_MsixTable_OFFS / sizeof(u64)),
+ msix_vecsave[2 * i]);
+ qib_write_kreg(dd, 1 + 2 * i +
+ (QIB_7322_MsixTable_OFFS / sizeof(u64)),
+ msix_vecsave[1 + 2 * i]);
+ }
+ }
+
+ /* initialize the remaining registers. */
+ for (i = 0; i < dd->num_pports; ++i)
+ write_7322_init_portregs(&dd->pport[i]);
+ write_7322_initregs(dd);
+
+ if (qib_pcie_params(dd, dd->lbus_width,
+ &dd->cspec->num_msix_entries,
+ dd->cspec->msix_entries))
+ qib_dev_err(dd,
+ "Reset failed to setup PCIe or interrupts; continuing anyway\n");
+
+ qib_setup_7322_interrupt(dd, 1);
+
+ for (i = 0; i < dd->num_pports; ++i) {
+ struct qib_pportdata *ppd = &dd->pport[i];
+
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags |= QIBL_IB_FORCE_NOTIFY;
+ ppd->lflags &= ~QIBL_IB_AUTONEG_FAILED;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ }
+
+bail:
+ dd->flags &= ~QIB_DOING_RESET; /* OK or not, no longer resetting */
+ kfree(msix_vecsave);
+ return ret;
+}
+
+/**
+ * qib_7322_put_tid - write a TID to the chip
+ * @dd: the qlogic_ib device
+ * @tidptr: pointer to the expected TID (in chip) to update
+ * @type: RCVHQ_RCV_TYPE_EAGER or RCVHQ_RCV_TYPE_EXPECTED
+ * @pa: physical address of in memory buffer; tidinvalid if freeing
+ */
+static void qib_7322_put_tid(struct qib_devdata *dd, u64 __iomem *tidptr,
+ u32 type, unsigned long pa)
+{
+ if (!(dd->flags & QIB_PRESENT))
+ return;
+ if (pa != dd->tidinvalid) {
+ u64 chippa = pa >> IBA7322_TID_PA_SHIFT;
+
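+ /*
+ * For illustration (address made up): a 2KB-aligned pa of
+ * 0x1234000 gives chippa = 0x1234000 >> 11 = 0x2468 (the shift
+ * matching the 2KB alignment check below), which then has either
+ * the eager tidtemplate bits or IBA7322_TID_SZ_4K OR'ed in before
+ * being written to the chip.
+ */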
+ /* paranoia checks */
+ if (pa != (chippa << IBA7322_TID_PA_SHIFT)) {
+ qib_dev_err(dd, "Physaddr %lx not 2KB aligned!\n",
+ pa);
+ return;
+ }
+ if (chippa >= (1UL << IBA7322_TID_SZ_SHIFT)) {
+ qib_dev_err(dd,
+ "Physical page address 0x%lx larger than supported\n",
+ pa);
+ return;
+ }
+
+ if (type == RCVHQ_RCV_TYPE_EAGER)
+ chippa |= dd->tidtemplate;
+ else /* for now, always full 4KB page */
+ chippa |= IBA7322_TID_SZ_4K;
+ pa = chippa;
+ }
+ writeq(pa, tidptr);
+ mmiowb();
+}
+
+/**
+ * qib_7322_clear_tids - clear all TID entries for a ctxt, expected and eager
+ * @dd: the qlogic_ib device
+ * @rcd: the qlogic_ib ctxt
+ *
+ * clear all TID entries for a ctxt, expected and eager.
+ * Used from qib_close().
+ */
+static void qib_7322_clear_tids(struct qib_devdata *dd,
+ struct qib_ctxtdata *rcd)
+{
+ u64 __iomem *tidbase;
+ unsigned long tidinv;
+ u32 ctxt;
+ int i;
+
+ if (!dd->kregbase || !rcd)
+ return;
+
+ ctxt = rcd->ctxt;
+
+ tidinv = dd->tidinvalid;
+ tidbase = (u64 __iomem *)
+ ((char __iomem *) dd->kregbase +
+ dd->rcvtidbase +
+ ctxt * dd->rcvtidcnt * sizeof(*tidbase));
+
+ for (i = 0; i < dd->rcvtidcnt; i++)
+ qib_7322_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EXPECTED,
+ tidinv);
+
+ tidbase = (u64 __iomem *)
+ ((char __iomem *) dd->kregbase +
+ dd->rcvegrbase +
+ rcd->rcvegr_tid_base * sizeof(*tidbase));
+
+ for (i = 0; i < rcd->rcvegrcnt; i++)
+ qib_7322_put_tid(dd, &tidbase[i], RCVHQ_RCV_TYPE_EAGER,
+ tidinv);
+}
+
+/**
+ * qib_7322_tidtemplate - setup constants for TID updates
+ * @dd: the qlogic_ib device
+ *
+ * We set up values we use a lot, to avoid recalculating them each time
+ */
+static void qib_7322_tidtemplate(struct qib_devdata *dd)
+{
+ /*
+ * For now, we always allocate 4KB buffers (at init) so we can
+ * receive max size packets. We may want a module parameter to
+ * specify 2KB or 4KB and/or make it per port instead of per device
+ * for those who want to reduce memory footprint. Note that the
+ * rcvhdrentsize size must be large enough to hold the largest
+ * IB header (currently 96 bytes) that we expect to handle (plus of
+ * course the 2 dwords of RHF).
+ */
+ if (dd->rcvegrbufsize == 2048)
+ dd->tidtemplate = IBA7322_TID_SZ_2K;
+ else if (dd->rcvegrbufsize == 4096)
+ dd->tidtemplate = IBA7322_TID_SZ_4K;
+ dd->tidinvalid = 0;
+}
+
+/**
+ * qib_7322_get_base_info - set chip-specific flags for user code
+ * @rcd: the qlogic_ib ctxt
+ * @kinfo: qib_base_info pointer
+ *
+ * We set the PCIE flag because the lower bandwidth on PCIe vs
+ * HyperTransport can affect some user packet algorithms.
+ */
+
+static int qib_7322_get_base_info(struct qib_ctxtdata *rcd,
+ struct qib_base_info *kinfo)
+{
+ kinfo->spi_runtime_flags |= QIB_RUNTIME_CTXT_MSB_IN_QP |
+ QIB_RUNTIME_PCIE | QIB_RUNTIME_NODMA_RTAIL |
+ QIB_RUNTIME_HDRSUPP | QIB_RUNTIME_SDMA;
+ if (rcd->dd->cspec->r1)
+ kinfo->spi_runtime_flags |= QIB_RUNTIME_RCHK;
+ if (rcd->dd->flags & QIB_USE_SPCL_TRIG)
+ kinfo->spi_runtime_flags |= QIB_RUNTIME_SPECIAL_TRIGGER;
+
+ return 0;
+}
+
+static struct qib_message_header *
+qib_7322_get_msgheader(struct qib_devdata *dd, __le32 *rhf_addr)
+{
+ u32 offset = qib_hdrget_offset(rhf_addr);
+
+ return (struct qib_message_header *)
+ (rhf_addr - dd->rhf_offset + offset);
+}
+
+/*
+ * Configure number of contexts.
+ */
+static void qib_7322_config_ctxts(struct qib_devdata *dd)
+{
+ unsigned long flags;
+ u32 nchipctxts;
+
+ nchipctxts = qib_read_kreg32(dd, kr_contextcnt);
+ dd->cspec->numctxts = nchipctxts;
+ if (qib_n_krcv_queues > 1 && dd->num_pports) {
+ dd->first_user_ctxt = NUM_IB_PORTS +
+ (qib_n_krcv_queues - 1) * dd->num_pports;
+ if (dd->first_user_ctxt > nchipctxts)
+ dd->first_user_ctxt = nchipctxts;
+ dd->n_krcv_queues = dd->first_user_ctxt / dd->num_pports;
+ } else {
+ dd->first_user_ctxt = NUM_IB_PORTS;
+ dd->n_krcv_queues = 1;
+ }
+
+ if (!qib_cfgctxts) {
+ int nctxts = dd->first_user_ctxt + num_online_cpus();
+
+ if (nctxts <= 6)
+ dd->ctxtcnt = 6;
+ else if (nctxts <= 10)
+ dd->ctxtcnt = 10;
+ else if (nctxts <= nchipctxts)
+ dd->ctxtcnt = nchipctxts;
+ } else if (qib_cfgctxts < dd->num_pports)
+ dd->ctxtcnt = dd->num_pports;
+ else if (qib_cfgctxts <= nchipctxts)
+ dd->ctxtcnt = qib_cfgctxts;
+ if (!dd->ctxtcnt) /* none of the above, set to max */
+ dd->ctxtcnt = nchipctxts;
+
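+ /*
+ * Example (assuming NUM_IB_PORTS is 2 for this dual-port chip):
+ * with qib_n_krcv_queues left at 1 and 8 CPUs online,
+ * first_user_ctxt is 2 and nctxts is 2 + 8 = 10, so ctxtcnt becomes
+ * 10 and the 10-context mode is selected below.
+ */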
+ /*
+ * Chip can be configured for 6, 10, or 18 ctxts, and choice
+ * affects number of eager TIDs per ctxt (1K, 2K, 4K).
+ * Lock to be paranoid about later motion, etc.
+ */
+ spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags);
+ if (dd->ctxtcnt > 10)
+ dd->rcvctrl |= 2ULL << SYM_LSB(RcvCtrl, ContextCfg);
+ else if (dd->ctxtcnt > 6)
+ dd->rcvctrl |= 1ULL << SYM_LSB(RcvCtrl, ContextCfg);
+ /* else configure for default 6 receive ctxts */
+
+ /* The XRC opcode is 5. */
+ dd->rcvctrl |= 5ULL << SYM_LSB(RcvCtrl, XrcTypeCode);
+
+ /*
+ * RcvCtrl *must* be written here so that the
+ * chip understands how to change rcvegrcnt below.
+ */
+ qib_write_kreg(dd, kr_rcvctrl, dd->rcvctrl);
+ spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags);
+
+ /* kr_rcvegrcnt changes based on the number of contexts enabled */
+ dd->cspec->rcvegrcnt = qib_read_kreg32(dd, kr_rcvegrcnt);
+ if (qib_rcvhdrcnt)
+ dd->rcvhdrcnt = max(dd->cspec->rcvegrcnt, qib_rcvhdrcnt);
+ else
+ dd->rcvhdrcnt = 2 * max(dd->cspec->rcvegrcnt,
+ dd->num_pports > 1 ? 1024U : 2048U);
+}
+
+static int qib_7322_get_ib_cfg(struct qib_pportdata *ppd, int which)
+{
+
+ int lsb, ret = 0;
+ u64 maskr; /* right-justified mask */
+
+ switch (which) {
+
+ case QIB_IB_CFG_LWID_ENB: /* Get allowed Link-width */
+ ret = ppd->link_width_enabled;
+ goto done;
+
+ case QIB_IB_CFG_LWID: /* Get currently active Link-width */
+ ret = ppd->link_width_active;
+ goto done;
+
+ case QIB_IB_CFG_SPD_ENB: /* Get allowed Link speeds */
+ ret = ppd->link_speed_enabled;
+ goto done;
+
+ case QIB_IB_CFG_SPD: /* Get current Link spd */
+ ret = ppd->link_speed_active;
+ goto done;
+
+ case QIB_IB_CFG_RXPOL_ENB: /* Get Auto-RX-polarity enable */
+ lsb = SYM_LSB(IBCCtrlB_0, IB_POLARITY_REV_SUPP);
+ maskr = SYM_RMASK(IBCCtrlB_0, IB_POLARITY_REV_SUPP);
+ break;
+
+ case QIB_IB_CFG_LREV_ENB: /* Get Auto-Lane-reversal enable */
+ lsb = SYM_LSB(IBCCtrlB_0, IB_LANE_REV_SUPPORTED);
+ maskr = SYM_RMASK(IBCCtrlB_0, IB_LANE_REV_SUPPORTED);
+ break;
+
+ case QIB_IB_CFG_LINKLATENCY:
+ ret = qib_read_kreg_port(ppd, krp_ibcstatus_b) &
+ SYM_MASK(IBCStatusB_0, LinkRoundTripLatency);
+ goto done;
+
+ case QIB_IB_CFG_OP_VLS:
+ ret = ppd->vls_operational;
+ goto done;
+
+ case QIB_IB_CFG_VL_HIGH_CAP:
+ ret = 16;
+ goto done;
+
+ case QIB_IB_CFG_VL_LOW_CAP:
+ ret = 16;
+ goto done;
+
+ case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
+ ret = SYM_FIELD(ppd->cpspec->ibcctrl_a, IBCCtrlA_0,
+ OverrunThreshold);
+ goto done;
+
+ case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
+ ret = SYM_FIELD(ppd->cpspec->ibcctrl_a, IBCCtrlA_0,
+ PhyerrThreshold);
+ goto done;
+
+ case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
+ /* will only take effect when the link state changes */
+ ret = (ppd->cpspec->ibcctrl_a &
+ SYM_MASK(IBCCtrlA_0, LinkDownDefaultState)) ?
+ IB_LINKINITCMD_SLEEP : IB_LINKINITCMD_POLL;
+ goto done;
+
+ case QIB_IB_CFG_HRTBT: /* Get Heartbeat off/enable/auto */
+ lsb = IBA7322_IBC_HRTBT_LSB;
+ maskr = IBA7322_IBC_HRTBT_RMASK; /* OR of AUTO and ENB */
+ break;
+
+ case QIB_IB_CFG_PMA_TICKS:
+ /*
+ * 0x00 = 10x link transfer rate or 4 nsec. for 2.5Gbs
+ * Since the clock is always 250MHz, the value is 3, 1 or 0.
+ */
+ if (ppd->link_speed_active == QIB_IB_QDR)
+ ret = 3;
+ else if (ppd->link_speed_active == QIB_IB_DDR)
+ ret = 1;
+ else
+ ret = 0;
+ goto done;
+
+ default:
+ ret = -EINVAL;
+ goto done;
+ }
+ ret = (int)((ppd->cpspec->ibcctrl_b >> lsb) & maskr);
+done:
+ return ret;
+}
+
+/*
+ * Below again cribbed liberally from older version. Do not lean
+ * heavily on it.
+ */
+#define IBA7322_IBC_DLIDLMC_SHIFT QIB_7322_IBCCtrlB_0_IB_DLID_LSB
+#define IBA7322_IBC_DLIDLMC_MASK (QIB_7322_IBCCtrlB_0_IB_DLID_RMASK \
+ | (QIB_7322_IBCCtrlB_0_IB_DLID_MASK_RMASK << 16))
+
+static int qib_7322_set_ib_cfg(struct qib_pportdata *ppd, int which, u32 val)
+{
+ struct qib_devdata *dd = ppd->dd;
+ u64 maskr; /* right-justified mask */
+ int lsb, ret = 0;
+ u16 lcmd, licmd;
+ unsigned long flags;
+
+ switch (which) {
+ case QIB_IB_CFG_LIDLMC:
+ /*
+ * Set LID and LMC. Combined to avoid possible hazard
+ * caller puts LMC in 16MSbits, DLID in 16LSbits of val
+ */
+ lsb = IBA7322_IBC_DLIDLMC_SHIFT;
+ maskr = IBA7322_IBC_DLIDLMC_MASK;
+ /*
+ * For header-checking, the SLID in the packet will
+ * be masked with SendIBSLMCMask, and compared
+ * with SendIBSLIDAssignMask. Make sure we do not
+ * set any bits not covered by the mask, or we get
+ * false-positives.
+ */
+ qib_write_kreg_port(ppd, krp_sendslid,
+ val & (val >> 16) & SendIBSLIDAssignMask);
+ qib_write_kreg_port(ppd, krp_sendslidmask,
+ (val >> 16) & SendIBSLMCMask);
+ break;
+
+ case QIB_IB_CFG_LWID_ENB: /* set allowed Link-width */
+ ppd->link_width_enabled = val;
+ /* convert IB value to chip register value */
+ if (val == IB_WIDTH_1X)
+ val = 0;
+ else if (val == IB_WIDTH_4X)
+ val = 1;
+ else
+ val = 3;
+ maskr = SYM_RMASK(IBCCtrlB_0, IB_NUM_CHANNELS);
+ lsb = SYM_LSB(IBCCtrlB_0, IB_NUM_CHANNELS);
+ break;
+
+ case QIB_IB_CFG_SPD_ENB: /* set allowed Link speeds */
+ /*
+ * As with width, only write the actual register if the
+ * link is currently down; otherwise it takes effect on the next
+ * link change. Since setting is being explicitly requested
+ * (via MAD or sysfs), clear autoneg failure status if speed
+ * autoneg is enabled.
+ */
+ ppd->link_speed_enabled = val;
+ val <<= IBA7322_IBC_SPEED_LSB;
+ maskr = IBA7322_IBC_SPEED_MASK | IBA7322_IBC_IBTA_1_2_MASK |
+ IBA7322_IBC_MAX_SPEED_MASK;
+ if (val & (val - 1)) {
+ /* Multiple speeds enabled */
+ val |= IBA7322_IBC_IBTA_1_2_MASK |
+ IBA7322_IBC_MAX_SPEED_MASK;
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags &= ~QIBL_IB_AUTONEG_FAILED;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ } else if (val & IBA7322_IBC_SPEED_QDR)
+ val |= IBA7322_IBC_IBTA_1_2_MASK;
+ /* IBTA 1.2 mode + min/max + speed bits are contiguous */
+ lsb = SYM_LSB(IBCCtrlB_0, IB_ENHANCED_MODE);
+ break;
+
+ case QIB_IB_CFG_RXPOL_ENB: /* set Auto-RX-polarity enable */
+ lsb = SYM_LSB(IBCCtrlB_0, IB_POLARITY_REV_SUPP);
+ maskr = SYM_RMASK(IBCCtrlB_0, IB_POLARITY_REV_SUPP);
+ break;
+
+ case QIB_IB_CFG_LREV_ENB: /* set Auto-Lane-reversal enable */
+ lsb = SYM_LSB(IBCCtrlB_0, IB_LANE_REV_SUPPORTED);
+ maskr = SYM_RMASK(IBCCtrlB_0, IB_LANE_REV_SUPPORTED);
+ break;
+
+ case QIB_IB_CFG_OVERRUN_THRESH: /* IB overrun threshold */
+ maskr = SYM_FIELD(ppd->cpspec->ibcctrl_a, IBCCtrlA_0,
+ OverrunThreshold);
+ if (maskr != val) {
+ ppd->cpspec->ibcctrl_a &=
+ ~SYM_MASK(IBCCtrlA_0, OverrunThreshold);
+ ppd->cpspec->ibcctrl_a |= (u64) val <<
+ SYM_LSB(IBCCtrlA_0, OverrunThreshold);
+ qib_write_kreg_port(ppd, krp_ibcctrl_a,
+ ppd->cpspec->ibcctrl_a);
+ qib_write_kreg(dd, kr_scratch, 0ULL);
+ }
+ goto bail;
+
+ case QIB_IB_CFG_PHYERR_THRESH: /* IB PHY error threshold */
+ maskr = SYM_FIELD(ppd->cpspec->ibcctrl_a, IBCCtrlA_0,
+ PhyerrThreshold);
+ if (maskr != val) {
+ ppd->cpspec->ibcctrl_a &=
+ ~SYM_MASK(IBCCtrlA_0, PhyerrThreshold);
+ ppd->cpspec->ibcctrl_a |= (u64) val <<
+ SYM_LSB(IBCCtrlA_0, PhyerrThreshold);
+ qib_write_kreg_port(ppd, krp_ibcctrl_a,
+ ppd->cpspec->ibcctrl_a);
+ qib_write_kreg(dd, kr_scratch, 0ULL);
+ }
+ goto bail;
+
+ case QIB_IB_CFG_PKEYS: /* update pkeys */
+ maskr = (u64) ppd->pkeys[0] | ((u64) ppd->pkeys[1] << 16) |
+ ((u64) ppd->pkeys[2] << 32) |
+ ((u64) ppd->pkeys[3] << 48);
+ qib_write_kreg_port(ppd, krp_partitionkey, maskr);
+ goto bail;
+
+ case QIB_IB_CFG_LINKDEFAULT: /* IB link default (sleep/poll) */
+ /* will only take effect when the link state changes */
+ if (val == IB_LINKINITCMD_POLL)
+ ppd->cpspec->ibcctrl_a &=
+ ~SYM_MASK(IBCCtrlA_0, LinkDownDefaultState);
+ else /* SLEEP */
+ ppd->cpspec->ibcctrl_a |=
+ SYM_MASK(IBCCtrlA_0, LinkDownDefaultState);
+ qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a);
+ qib_write_kreg(dd, kr_scratch, 0ULL);
+ goto bail;
+
+ case QIB_IB_CFG_MTU: /* update the MTU in IBC */
+ /*
+ * Update our housekeeping variables, and set IBC max
+ * size, same as init code; max IBC is max we allow in
+ * buffer, less the qword pbc, plus 1 for ICRC, in dwords
+ * Set even if it's unchanged, print debug message only
+ * on changes.
+ */
+ val = (ppd->ibmaxlen >> 2) + 1;
+ ppd->cpspec->ibcctrl_a &= ~SYM_MASK(IBCCtrlA_0, MaxPktLen);
+ ppd->cpspec->ibcctrl_a |= (u64)val <<
+ SYM_LSB(IBCCtrlA_0, MaxPktLen);
+ qib_write_kreg_port(ppd, krp_ibcctrl_a,
+ ppd->cpspec->ibcctrl_a);
+ qib_write_kreg(dd, kr_scratch, 0ULL);
+ goto bail;
+
+ case QIB_IB_CFG_LSTATE: /* set the IB link state */
+ switch (val & 0xffff0000) {
+ case IB_LINKCMD_DOWN:
+ lcmd = QLOGIC_IB_IBCC_LINKCMD_DOWN;
+ ppd->cpspec->ibmalfusesnap = 1;
+ ppd->cpspec->ibmalfsnap = read_7322_creg32_port(ppd,
+ crp_errlink);
+ if (!ppd->cpspec->ibdeltainprog &&
+ qib_compat_ddr_negotiate) {
+ ppd->cpspec->ibdeltainprog = 1;
+ ppd->cpspec->ibsymsnap =
+ read_7322_creg32_port(ppd,
+ crp_ibsymbolerr);
+ ppd->cpspec->iblnkerrsnap =
+ read_7322_creg32_port(ppd,
+ crp_iblinkerrrecov);
+ }
+ break;
+
+ case IB_LINKCMD_ARMED:
+ lcmd = QLOGIC_IB_IBCC_LINKCMD_ARMED;
+ if (ppd->cpspec->ibmalfusesnap) {
+ ppd->cpspec->ibmalfusesnap = 0;
+ ppd->cpspec->ibmalfdelta +=
+ read_7322_creg32_port(ppd,
+ crp_errlink) -
+ ppd->cpspec->ibmalfsnap;
+ }
+ break;
+
+ case IB_LINKCMD_ACTIVE:
+ lcmd = QLOGIC_IB_IBCC_LINKCMD_ACTIVE;
+ break;
+
+ default:
+ ret = -EINVAL;
+ qib_dev_err(dd, "bad linkcmd req 0x%x\n", val >> 16);
+ goto bail;
+ }
+ switch (val & 0xffff) {
+ case IB_LINKINITCMD_NOP:
+ licmd = 0;
+ break;
+
+ case IB_LINKINITCMD_POLL:
+ licmd = QLOGIC_IB_IBCC_LINKINITCMD_POLL;
+ break;
+
+ case IB_LINKINITCMD_SLEEP:
+ licmd = QLOGIC_IB_IBCC_LINKINITCMD_SLEEP;
+ break;
+
+ case IB_LINKINITCMD_DISABLE:
+ licmd = QLOGIC_IB_IBCC_LINKINITCMD_DISABLE;
+ ppd->cpspec->chase_end = 0;
+ /*
+ * stop state chase counter and timer, if running.
+ * wait for pending timer, but don't clear .data (ppd)!
+ */
+ if (ppd->cpspec->chase_timer.expires) {
+ del_timer_sync(&ppd->cpspec->chase_timer);
+ ppd->cpspec->chase_timer.expires = 0;
+ }
+ break;
+
+ default:
+ ret = -EINVAL;
+ qib_dev_err(dd, "bad linkinitcmd req 0x%x\n",
+ val & 0xffff);
+ goto bail;
+ }
+ qib_set_ib_7322_lstate(ppd, lcmd, licmd);
+ goto bail;
+
+ case QIB_IB_CFG_OP_VLS:
+ if (ppd->vls_operational != val) {
+ ppd->vls_operational = val;
+ set_vls(ppd);
+ }
+ goto bail;
+
+ case QIB_IB_CFG_VL_HIGH_LIMIT:
+ qib_write_kreg_port(ppd, krp_highprio_limit, val);
+ goto bail;
+
+ case QIB_IB_CFG_HRTBT: /* set Heartbeat off/enable/auto */
+ if (val > 3) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ lsb = IBA7322_IBC_HRTBT_LSB;
+ maskr = IBA7322_IBC_HRTBT_RMASK; /* OR of AUTO and ENB */
+ break;
+
+ case QIB_IB_CFG_PORT:
+ /* val is the port number of the switch we are connected to. */
+ if (ppd->dd->cspec->r1) {
+ cancel_delayed_work(&ppd->cpspec->ipg_work);
+ ppd->cpspec->ipg_tries = 0;
+ }
+ goto bail;
+
+ default:
+ ret = -EINVAL;
+ goto bail;
+ }
+ ppd->cpspec->ibcctrl_b &= ~(maskr << lsb);
+ ppd->cpspec->ibcctrl_b |= (((u64) val & maskr) << lsb);
+ qib_write_kreg_port(ppd, krp_ibcctrl_b, ppd->cpspec->ibcctrl_b);
+ qib_write_kreg(dd, kr_scratch, 0);
+bail:
+ return ret;
+}
+
+static int qib_7322_set_loopback(struct qib_pportdata *ppd, const char *what)
+{
+ int ret = 0;
+ u64 val, ctrlb;
+
+ /* only IBC loopback, may add serdes and xgxs loopbacks later */
+ if (!strncmp(what, "ibc", 3)) {
+ ppd->cpspec->ibcctrl_a |= SYM_MASK(IBCCtrlA_0,
+ Loopback);
+ val = 0; /* disable heart beat, so link will come up */
+ qib_devinfo(ppd->dd->pcidev, "Enabling IB%u:%u IBC loopback\n",
+ ppd->dd->unit, ppd->port);
+ } else if (!strncmp(what, "off", 3)) {
+ ppd->cpspec->ibcctrl_a &= ~SYM_MASK(IBCCtrlA_0,
+ Loopback);
+ /* enable heart beat again */
+ val = IBA7322_IBC_HRTBT_RMASK << IBA7322_IBC_HRTBT_LSB;
+ qib_devinfo(ppd->dd->pcidev,
+ "Disabling IB%u:%u IBC loopback (normal)\n",
+ ppd->dd->unit, ppd->port);
+ } else
+ ret = -EINVAL;
+ if (!ret) {
+ qib_write_kreg_port(ppd, krp_ibcctrl_a,
+ ppd->cpspec->ibcctrl_a);
+ ctrlb = ppd->cpspec->ibcctrl_b & ~(IBA7322_IBC_HRTBT_MASK
+ << IBA7322_IBC_HRTBT_LSB);
+ ppd->cpspec->ibcctrl_b = ctrlb | val;
+ qib_write_kreg_port(ppd, krp_ibcctrl_b,
+ ppd->cpspec->ibcctrl_b);
+ qib_write_kreg(ppd->dd, kr_scratch, 0);
+ }
+ return ret;
+}
+
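+/*
+ * Helpers for the VL arbitration tables. Each table is 16 consecutive
+ * per-port registers, and each register holds one {virtual lane, weight}
+ * element in its VirtualLane and Weight fields; get_vl_weights() unpacks
+ * them into an ib_vl_weight_elem array and set_vl_weights() packs them
+ * back, enabling the IB VL arbiter in SendCtrl if it was not already on.
+ */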
+static void get_vl_weights(struct qib_pportdata *ppd, unsigned regno,
+ struct ib_vl_weight_elem *vl)
+{
+ unsigned i;
+
+ for (i = 0; i < 16; i++, regno++, vl++) {
+ u32 val = qib_read_kreg_port(ppd, regno);
+
+ vl->vl = (val >> SYM_LSB(LowPriority0_0, VirtualLane)) &
+ SYM_RMASK(LowPriority0_0, VirtualLane);
+ vl->weight = (val >> SYM_LSB(LowPriority0_0, Weight)) &
+ SYM_RMASK(LowPriority0_0, Weight);
+ }
+}
+
+static void set_vl_weights(struct qib_pportdata *ppd, unsigned regno,
+ struct ib_vl_weight_elem *vl)
+{
+ unsigned i;
+
+ for (i = 0; i < 16; i++, regno++, vl++) {
+ u64 val;
+
+ val = ((vl->vl & SYM_RMASK(LowPriority0_0, VirtualLane)) <<
+ SYM_LSB(LowPriority0_0, VirtualLane)) |
+ ((vl->weight & SYM_RMASK(LowPriority0_0, Weight)) <<
+ SYM_LSB(LowPriority0_0, Weight));
+ qib_write_kreg_port(ppd, regno, val);
+ }
+ if (!(ppd->p_sendctrl & SYM_MASK(SendCtrl_0, IBVLArbiterEn))) {
+ struct qib_devdata *dd = ppd->dd;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->sendctrl_lock, flags);
+ ppd->p_sendctrl |= SYM_MASK(SendCtrl_0, IBVLArbiterEn);
+ qib_write_kreg_port(ppd, krp_sendctrl, ppd->p_sendctrl);
+ qib_write_kreg(dd, kr_scratch, 0);
+ spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+ }
+}
+
+static int qib_7322_get_ib_table(struct qib_pportdata *ppd, int which, void *t)
+{
+ switch (which) {
+ case QIB_IB_TBL_VL_HIGH_ARB:
+ get_vl_weights(ppd, krp_highprio_0, t);
+ break;
+
+ case QIB_IB_TBL_VL_LOW_ARB:
+ get_vl_weights(ppd, krp_lowprio_0, t);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int qib_7322_set_ib_table(struct qib_pportdata *ppd, int which, void *t)
+{
+ switch (which) {
+ case QIB_IB_TBL_VL_HIGH_ARB:
+ set_vl_weights(ppd, krp_highprio_0, t);
+ break;
+
+ case QIB_IB_TBL_VL_LOW_ARB:
+ set_vl_weights(ppd, krp_lowprio_0, t);
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static void qib_update_7322_usrhead(struct qib_ctxtdata *rcd, u64 hd,
+ u32 updegr, u32 egrhd, u32 npkts)
+{
+ /*
+ * Need to write timeout register before updating rcvhdrhead to ensure
+ * that the timer is enabled on reception of a packet.
+ */
+ if (hd >> IBA7322_HDRHEAD_PKTINT_SHIFT)
+ adjust_rcv_timeout(rcd, npkts);
+ if (updegr)
+ qib_write_ureg(rcd->dd, ur_rcvegrindexhead, egrhd, rcd->ctxt);
+ mmiowb();
+ qib_write_ureg(rcd->dd, ur_rcvhdrhead, hd, rcd->ctxt);
+ qib_write_ureg(rcd->dd, ur_rcvhdrhead, hd, rcd->ctxt);
+ mmiowb();
+}
+
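+/*
+ * Return nonzero if the receive header queue for this context is empty,
+ * i.e. the chip's rcvhdrhead equals the tail.  The tail comes from the
+ * DMA'd copy when rcvhdrtail_kvaddr is set up, otherwise it is read
+ * directly from the chip register.
+ */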
+static u32 qib_7322_hdrqempty(struct qib_ctxtdata *rcd)
+{
+ u32 head, tail;
+
+ head = qib_read_ureg32(rcd->dd, ur_rcvhdrhead, rcd->ctxt);
+ if (rcd->rcvhdrtail_kvaddr)
+ tail = qib_get_rcvhdrtail(rcd);
+ else
+ tail = qib_read_ureg32(rcd->dd, ur_rcvhdrtail, rcd->ctxt);
+ return head == tail;
+}
+
+#define RCVCTRL_COMMON_MODS (QIB_RCVCTRL_CTXT_ENB | \
+ QIB_RCVCTRL_CTXT_DIS | \
+ QIB_RCVCTRL_TIDFLOW_ENB | \
+ QIB_RCVCTRL_TIDFLOW_DIS | \
+ QIB_RCVCTRL_TAILUPD_ENB | \
+ QIB_RCVCTRL_TAILUPD_DIS | \
+ QIB_RCVCTRL_INTRAVAIL_ENB | \
+ QIB_RCVCTRL_INTRAVAIL_DIS | \
+ QIB_RCVCTRL_BP_ENB | \
+ QIB_RCVCTRL_BP_DIS)
+
+#define RCVCTRL_PORT_MODS (QIB_RCVCTRL_CTXT_ENB | \
+ QIB_RCVCTRL_CTXT_DIS | \
+ QIB_RCVCTRL_PKEY_DIS | \
+ QIB_RCVCTRL_PKEY_ENB)
+
+/*
+ * Modify the RCVCTRL register in a chip-specific way. This
+ * is a function because bit positions and (future) register
+ * locations are chip-specific, but the needed operations are
+ * generic. <op> is a bit-mask because we often want to
+ * do multiple modifications.
+ */
+static void rcvctrl_7322_mod(struct qib_pportdata *ppd, unsigned int op,
+ int ctxt)
+{
+ struct qib_devdata *dd = ppd->dd;
+ struct qib_ctxtdata *rcd;
+ u64 mask, val;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags);
+
+ if (op & QIB_RCVCTRL_TIDFLOW_ENB)
+ dd->rcvctrl |= SYM_MASK(RcvCtrl, TidFlowEnable);
+ if (op & QIB_RCVCTRL_TIDFLOW_DIS)
+ dd->rcvctrl &= ~SYM_MASK(RcvCtrl, TidFlowEnable);
+ if (op & QIB_RCVCTRL_TAILUPD_ENB)
+ dd->rcvctrl |= SYM_MASK(RcvCtrl, TailUpd);
+ if (op & QIB_RCVCTRL_TAILUPD_DIS)
+ dd->rcvctrl &= ~SYM_MASK(RcvCtrl, TailUpd);
+ if (op & QIB_RCVCTRL_PKEY_ENB)
+ ppd->p_rcvctrl &= ~SYM_MASK(RcvCtrl_0, RcvPartitionKeyDisable);
+ if (op & QIB_RCVCTRL_PKEY_DIS)
+ ppd->p_rcvctrl |= SYM_MASK(RcvCtrl_0, RcvPartitionKeyDisable);
+ if (ctxt < 0) {
+ mask = (1ULL << dd->ctxtcnt) - 1;
+ rcd = NULL;
+ } else {
+ mask = (1ULL << ctxt);
+ rcd = dd->rcd[ctxt];
+ }
+ if ((op & QIB_RCVCTRL_CTXT_ENB) && rcd) {
+ ppd->p_rcvctrl |=
+ (mask << SYM_LSB(RcvCtrl_0, ContextEnableKernel));
+ if (!(dd->flags & QIB_NODMA_RTAIL)) {
+ op |= QIB_RCVCTRL_TAILUPD_ENB; /* need reg write */
+ dd->rcvctrl |= SYM_MASK(RcvCtrl, TailUpd);
+ }
+ /* Write these registers before the context is enabled. */
+ qib_write_kreg_ctxt(dd, krc_rcvhdrtailaddr, ctxt,
+ rcd->rcvhdrqtailaddr_phys);
+ qib_write_kreg_ctxt(dd, krc_rcvhdraddr, ctxt,
+ rcd->rcvhdrq_phys);
+ rcd->seq_cnt = 1;
+ }
+ if (op & QIB_RCVCTRL_CTXT_DIS)
+ ppd->p_rcvctrl &=
+ ~(mask << SYM_LSB(RcvCtrl_0, ContextEnableKernel));
+ if (op & QIB_RCVCTRL_BP_ENB)
+ dd->rcvctrl |= mask << SYM_LSB(RcvCtrl, dontDropRHQFull);
+ if (op & QIB_RCVCTRL_BP_DIS)
+ dd->rcvctrl &= ~(mask << SYM_LSB(RcvCtrl, dontDropRHQFull));
+ if (op & QIB_RCVCTRL_INTRAVAIL_ENB)
+ dd->rcvctrl |= (mask << SYM_LSB(RcvCtrl, IntrAvail));
+ if (op & QIB_RCVCTRL_INTRAVAIL_DIS)
+ dd->rcvctrl &= ~(mask << SYM_LSB(RcvCtrl, IntrAvail));
+ /*
+ * Decide which registers to write depending on the ops enabled.
+ * Special case is "flush" (no bits set at all)
+ * which needs to write both.
+ */
+ if (op == 0 || (op & RCVCTRL_COMMON_MODS))
+ qib_write_kreg(dd, kr_rcvctrl, dd->rcvctrl);
+ if (op == 0 || (op & RCVCTRL_PORT_MODS))
+ qib_write_kreg_port(ppd, krp_rcvctrl, ppd->p_rcvctrl);
+ if ((op & QIB_RCVCTRL_CTXT_ENB) && dd->rcd[ctxt]) {
+ /*
+ * Init the context registers also; if we were
+ * disabled, tail and head should both be zero
+ * already from the enable, but since we don't
+ * know, we have to do it explicitly.
+ */
+ val = qib_read_ureg32(dd, ur_rcvegrindextail, ctxt);
+ qib_write_ureg(dd, ur_rcvegrindexhead, val, ctxt);
+
+ /* be sure enabling write seen; hd/tl should be 0 */
+ (void) qib_read_kreg32(dd, kr_scratch);
+ val = qib_read_ureg32(dd, ur_rcvhdrtail, ctxt);
+ dd->rcd[ctxt]->head = val;
+ /* If kctxt, interrupt on next receive. */
+ if (ctxt < dd->first_user_ctxt)
+ val |= dd->rhdrhead_intr_off;
+ qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt);
+ } else if ((op & QIB_RCVCTRL_INTRAVAIL_ENB) &&
+ dd->rcd[ctxt] && dd->rhdrhead_intr_off) {
+ /* arm rcv interrupt */
+ val = dd->rcd[ctxt]->head | dd->rhdrhead_intr_off;
+ qib_write_ureg(dd, ur_rcvhdrhead, val, ctxt);
+ }
+ if (op & QIB_RCVCTRL_CTXT_DIS) {
+ unsigned f;
+
+ /* Now that the context is disabled, clear these registers. */
+ if (ctxt >= 0) {
+ qib_write_kreg_ctxt(dd, krc_rcvhdrtailaddr, ctxt, 0);
+ qib_write_kreg_ctxt(dd, krc_rcvhdraddr, ctxt, 0);
+ for (f = 0; f < NUM_TIDFLOWS_CTXT; f++)
+ qib_write_ureg(dd, ur_rcvflowtable + f,
+ TIDFLOW_ERRBITS, ctxt);
+ } else {
+ unsigned i;
+
+ for (i = 0; i < dd->cfgctxts; i++) {
+ qib_write_kreg_ctxt(dd, krc_rcvhdrtailaddr,
+ i, 0);
+ qib_write_kreg_ctxt(dd, krc_rcvhdraddr, i, 0);
+ for (f = 0; f < NUM_TIDFLOWS_CTXT; f++)
+ qib_write_ureg(dd, ur_rcvflowtable + f,
+ TIDFLOW_ERRBITS, i);
+ }
+ }
+ }
+ spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags);
+}
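+
+/*
+ * Illustrative use of rcvctrl_7322_mod() (a sketch, not code taken from
+ * elsewhere in this file): enabling a kernel context together with its
+ * "interrupt available" bit would look like
+ *	rcvctrl_7322_mod(ppd, QIB_RCVCTRL_CTXT_ENB | QIB_RCVCTRL_INTRAVAIL_ENB,
+ *			 ctxt);
+ * while op == 0 is the "flush" case that rewrites both the common and
+ * per-port registers from their shadow copies.
+ */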
+
+/*
+ * Modify the SENDCTRL register in a chip-specific way. This
+ * is a function where there are multiple such registers with
+ * slightly different layouts.
+ * The chip doesn't allow back-to-back sendctrl writes, so write
+ * the scratch register after writing sendctrl.
+ *
+ * Which register is written depends on the operation.
+ * Most operate on the common register, while
+ * SEND_ENB and SEND_DIS operate on the per-port ones.
+ * SEND_ENB is included in common because it can change SPCL_TRIG
+ */
+#define SENDCTRL_COMMON_MODS (\
+ QIB_SENDCTRL_CLEAR | \
+ QIB_SENDCTRL_AVAIL_DIS | \
+ QIB_SENDCTRL_AVAIL_ENB | \
+ QIB_SENDCTRL_AVAIL_BLIP | \
+ QIB_SENDCTRL_DISARM | \
+ QIB_SENDCTRL_DISARM_ALL | \
+ QIB_SENDCTRL_SEND_ENB)
+
+#define SENDCTRL_PORT_MODS (\
+ QIB_SENDCTRL_CLEAR | \
+ QIB_SENDCTRL_SEND_ENB | \
+ QIB_SENDCTRL_SEND_DIS | \
+ QIB_SENDCTRL_FLUSH)
+
+static void sendctrl_7322_mod(struct qib_pportdata *ppd, u32 op)
+{
+ struct qib_devdata *dd = ppd->dd;
+ u64 tmp_dd_sendctrl;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->sendctrl_lock, flags);
+
+ /* First the dd ones that are "sticky", saved in shadow */
+ if (op & QIB_SENDCTRL_CLEAR)
+ dd->sendctrl = 0;
+ if (op & QIB_SENDCTRL_AVAIL_DIS)
+ dd->sendctrl &= ~SYM_MASK(SendCtrl, SendBufAvailUpd);
+ else if (op & QIB_SENDCTRL_AVAIL_ENB) {
+ dd->sendctrl |= SYM_MASK(SendCtrl, SendBufAvailUpd);
+ if (dd->flags & QIB_USE_SPCL_TRIG)
+ dd->sendctrl |= SYM_MASK(SendCtrl, SpecialTriggerEn);
+ }
+
+ /* Then the ppd ones that are "sticky", saved in shadow */
+ if (op & QIB_SENDCTRL_SEND_DIS)
+ ppd->p_sendctrl &= ~SYM_MASK(SendCtrl_0, SendEnable);
+ else if (op & QIB_SENDCTRL_SEND_ENB)
+ ppd->p_sendctrl |= SYM_MASK(SendCtrl_0, SendEnable);
+
+ if (op & QIB_SENDCTRL_DISARM_ALL) {
+ u32 i, last;
+
+ tmp_dd_sendctrl = dd->sendctrl;
+ last = dd->piobcnt2k + dd->piobcnt4k + NUM_VL15_BUFS;
+ /*
+ * Disarm any buffers that are not yet launched,
+ * disabling updates until done.
+ */
+ tmp_dd_sendctrl &= ~SYM_MASK(SendCtrl, SendBufAvailUpd);
+ for (i = 0; i < last; i++) {
+ qib_write_kreg(dd, kr_sendctrl,
+ tmp_dd_sendctrl |
+ SYM_MASK(SendCtrl, Disarm) | i);
+ qib_write_kreg(dd, kr_scratch, 0);
+ }
+ }
+
+ if (op & QIB_SENDCTRL_FLUSH) {
+ u64 tmp_ppd_sendctrl = ppd->p_sendctrl;
+
+ /*
+ * Now drain all the fifos. The Abort bit should never be
+ * needed, so for now, at least, we don't use it.
+ */
+ tmp_ppd_sendctrl |=
+ SYM_MASK(SendCtrl_0, TxeDrainRmFifo) |
+ SYM_MASK(SendCtrl_0, TxeDrainLaFifo) |
+ SYM_MASK(SendCtrl_0, TxeBypassIbc);
+ qib_write_kreg_port(ppd, krp_sendctrl, tmp_ppd_sendctrl);
+ qib_write_kreg(dd, kr_scratch, 0);
+ }
+
+ tmp_dd_sendctrl = dd->sendctrl;
+
+ if (op & QIB_SENDCTRL_DISARM)
+ tmp_dd_sendctrl |= SYM_MASK(SendCtrl, Disarm) |
+ ((op & QIB_7322_SendCtrl_DisarmSendBuf_RMASK) <<
+ SYM_LSB(SendCtrl, DisarmSendBuf));
+ if ((op & QIB_SENDCTRL_AVAIL_BLIP) &&
+ (dd->sendctrl & SYM_MASK(SendCtrl, SendBufAvailUpd)))
+ tmp_dd_sendctrl &= ~SYM_MASK(SendCtrl, SendBufAvailUpd);
+
+ if (op == 0 || (op & SENDCTRL_COMMON_MODS)) {
+ qib_write_kreg(dd, kr_sendctrl, tmp_dd_sendctrl);
+ qib_write_kreg(dd, kr_scratch, 0);
+ }
+
+ if (op == 0 || (op & SENDCTRL_PORT_MODS)) {
+ qib_write_kreg_port(ppd, krp_sendctrl, ppd->p_sendctrl);
+ qib_write_kreg(dd, kr_scratch, 0);
+ }
+
+ if (op & QIB_SENDCTRL_AVAIL_BLIP) {
+ qib_write_kreg(dd, kr_sendctrl, dd->sendctrl);
+ qib_write_kreg(dd, kr_scratch, 0);
+ }
+
+ spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+
+ if (op & QIB_SENDCTRL_FLUSH) {
+ u32 v;
+ /*
+ * ensure writes have hit chip, then do a few
+ * more reads, to allow DMA of pioavail registers
+ * to occur, so in-memory copy is in sync with
+ * the chip. Not always safe to sleep.
+ */
+ v = qib_read_kreg32(dd, kr_scratch);
+ qib_write_kreg(dd, kr_scratch, v);
+ v = qib_read_kreg32(dd, kr_scratch);
+ qib_write_kreg(dd, kr_scratch, v);
+ qib_read_kreg32(dd, kr_scratch);
+ }
+}
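+
+/*
+ * Illustrative use of sendctrl_7322_mod() (a sketch only): a caller that
+ * wants every unlaunched buffer disarmed and the per-port fifos drained
+ * would pass QIB_SENDCTRL_DISARM_ALL | QIB_SENDCTRL_FLUSH.  The
+ * QIB_SENDCTRL_AVAIL_BLIP op momentarily clears SendBufAvailUpd in the
+ * chip and then restores the shadowed value, forcing a fresh DMA of the
+ * pioavail registers.
+ */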
+
+#define _PORT_VIRT_FLAG 0x8000U /* "virtual", need adjustments */
+#define _PORT_64BIT_FLAG 0x10000U /* not "virtual", but 64bit */
+#define _PORT_CNTR_IDXMASK 0x7fffU /* mask off flags above */
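+
+/*
+ * The tables below OR these flags into counter register indices:
+ * _PORT_64BIT_FLAG marks counters read with full 64-bit accesses, and
+ * _PORT_VIRT_FLAG marks "virtual" entries that must go through
+ * qib_portcntr_7322() for adjustment; _PORT_CNTR_IDXMASK recovers the
+ * raw index.
+ */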
+
+/**
+ * qib_portcntr_7322 - read a per-port chip counter
+ * @ppd: the qlogic_ib pport
+ * @reg: the counter to read (not a chip offset)
+ */
+static u64 qib_portcntr_7322(struct qib_pportdata *ppd, u32 reg)
+{
+ struct qib_devdata *dd = ppd->dd;
+ u64 ret = 0ULL;
+ u16 creg;
+ /* 0xffff for unimplemented or synthesized counters */
+ static const u32 xlator[] = {
+ [QIBPORTCNTR_PKTSEND] = crp_pktsend | _PORT_64BIT_FLAG,
+ [QIBPORTCNTR_WORDSEND] = crp_wordsend | _PORT_64BIT_FLAG,
+ [QIBPORTCNTR_PSXMITDATA] = crp_psxmitdatacount,
+ [QIBPORTCNTR_PSXMITPKTS] = crp_psxmitpktscount,
+ [QIBPORTCNTR_PSXMITWAIT] = crp_psxmitwaitcount,
+ [QIBPORTCNTR_SENDSTALL] = crp_sendstall,
+ [QIBPORTCNTR_PKTRCV] = crp_pktrcv | _PORT_64BIT_FLAG,
+ [QIBPORTCNTR_PSRCVDATA] = crp_psrcvdatacount,
+ [QIBPORTCNTR_PSRCVPKTS] = crp_psrcvpktscount,
+ [QIBPORTCNTR_RCVEBP] = crp_rcvebp,
+ [QIBPORTCNTR_RCVOVFL] = crp_rcvovfl,
+ [QIBPORTCNTR_WORDRCV] = crp_wordrcv | _PORT_64BIT_FLAG,
+ [QIBPORTCNTR_RXDROPPKT] = 0xffff, /* not needed for 7322 */
+ [QIBPORTCNTR_RXLOCALPHYERR] = crp_rxotherlocalphyerr,
+ [QIBPORTCNTR_RXVLERR] = crp_rxvlerr,
+ [QIBPORTCNTR_ERRICRC] = crp_erricrc,
+ [QIBPORTCNTR_ERRVCRC] = crp_errvcrc,
+ [QIBPORTCNTR_ERRLPCRC] = crp_errlpcrc,
+ [QIBPORTCNTR_BADFORMAT] = crp_badformat,
+ [QIBPORTCNTR_ERR_RLEN] = crp_err_rlen,
+ [QIBPORTCNTR_IBSYMBOLERR] = crp_ibsymbolerr,
+ [QIBPORTCNTR_INVALIDRLEN] = crp_invalidrlen,
+ [QIBPORTCNTR_UNSUPVL] = crp_txunsupvl,
+ [QIBPORTCNTR_EXCESSBUFOVFL] = crp_excessbufferovfl,
+ [QIBPORTCNTR_ERRLINK] = crp_errlink,
+ [QIBPORTCNTR_IBLINKDOWN] = crp_iblinkdown,
+ [QIBPORTCNTR_IBLINKERRRECOV] = crp_iblinkerrrecov,
+ [QIBPORTCNTR_LLI] = crp_locallinkintegrityerr,
+ [QIBPORTCNTR_VL15PKTDROP] = crp_vl15droppedpkt,
+ [QIBPORTCNTR_ERRPKEY] = crp_errpkey,
+ /*
+ * the next 3 aren't really counters, but were implemented
+ * as counters in older chips, so still get accessed as
+ * though they were counters from this code.
+ */
+ [QIBPORTCNTR_PSINTERVAL] = krp_psinterval,
+ [QIBPORTCNTR_PSSTART] = krp_psstart,
+ [QIBPORTCNTR_PSSTAT] = krp_psstat,
+ /* pseudo-counter, summed for all ports */
+ [QIBPORTCNTR_KHDROVFL] = 0xffff,
+ };
+
+ if (reg >= ARRAY_SIZE(xlator)) {
+ qib_devinfo(ppd->dd->pcidev,
+ "Unimplemented portcounter %u\n", reg);
+ goto done;
+ }
+ creg = xlator[reg] & _PORT_CNTR_IDXMASK;
+
+ /* handle non-counters and special cases first */
+ if (reg == QIBPORTCNTR_KHDROVFL) {
+ int i;
+
+ /* sum over all kernel contexts (skip if mini_init) */
+ for (i = 0; dd->rcd && i < dd->first_user_ctxt; i++) {
+ struct qib_ctxtdata *rcd = dd->rcd[i];
+
+ if (!rcd || rcd->ppd != ppd)
+ continue;
+ ret += read_7322_creg32(dd, cr_base_egrovfl + i);
+ }
+ goto done;
+ } else if (reg == QIBPORTCNTR_RXDROPPKT) {
+ /*
+ * Used as part of the synthesis of port_rcv_errors
+ * in the verbs code for IBTA counters. Not needed for 7322,
+ * because all the errors are already counted by other cntrs.
+ */
+ goto done;
+ } else if (reg == QIBPORTCNTR_PSINTERVAL ||
+ reg == QIBPORTCNTR_PSSTART || reg == QIBPORTCNTR_PSSTAT) {
+ /* were counters in older chips, now per-port kernel regs */
+ ret = qib_read_kreg_port(ppd, creg);
+ goto done;
+ }
+
+ /*
+ * Only fast increment counters are 64 bits; use 32 bit reads to
+ * avoid two independent reads when on Opteron.
+ */
+ if (xlator[reg] & _PORT_64BIT_FLAG)
+ ret = read_7322_creg_port(ppd, creg);
+ else
+ ret = read_7322_creg32_port(ppd, creg);
+ if (creg == crp_ibsymbolerr) {
+ if (ppd->cpspec->ibdeltainprog)
+ ret -= ret - ppd->cpspec->ibsymsnap;
+ ret -= ppd->cpspec->ibsymdelta;
+ } else if (creg == crp_iblinkerrrecov) {
+ if (ppd->cpspec->ibdeltainprog)
+ ret -= ret - ppd->cpspec->iblnkerrsnap;
+ ret -= ppd->cpspec->iblnkerrdelta;
+ } else if (creg == crp_errlink)
+ ret -= ppd->cpspec->ibmalfdelta;
+ else if (creg == crp_iblinkdown)
+ ret += ppd->cpspec->iblnkdowndelta;
+done:
+ return ret;
+}
+
+/*
+ * Device counter names (not port-specific), one line per stat,
+ * single string. Used by utilities like ipathstats to print the stats
+ * in a way which works for different versions of drivers, without changing
+ * the utility. Names need to be 12 chars or less (w/o newline), for proper
+ * display by utility.
+ * Non-error counters are first.
+ * Start of "error" counters is indicated by a leading "E " on the first
+ * "error" counter, and doesn't count in label length.
+ * The EgrOvfl list needs to be last so we truncate them at the configured
+ * context count for the device.
+ * cntr7322indices contains the corresponding register indices.
+ */
+static const char cntr7322names[] =
+ "Interrupts\n"
+ "HostBusStall\n"
+ "E RxTIDFull\n"
+ "RxTIDInvalid\n"
+ "RxTIDFloDrop\n" /* 7322 only */
+ "Ctxt0EgrOvfl\n"
+ "Ctxt1EgrOvfl\n"
+ "Ctxt2EgrOvfl\n"
+ "Ctxt3EgrOvfl\n"
+ "Ctxt4EgrOvfl\n"
+ "Ctxt5EgrOvfl\n"
+ "Ctxt6EgrOvfl\n"
+ "Ctxt7EgrOvfl\n"
+ "Ctxt8EgrOvfl\n"
+ "Ctxt9EgrOvfl\n"
+ "Ctx10EgrOvfl\n"
+ "Ctx11EgrOvfl\n"
+ "Ctx12EgrOvfl\n"
+ "Ctx13EgrOvfl\n"
+ "Ctx14EgrOvfl\n"
+ "Ctx15EgrOvfl\n"
+ "Ctx16EgrOvfl\n"
+ "Ctx17EgrOvfl\n"
+ ;
+
+static const u32 cntr7322indices[] = {
+ cr_lbint | _PORT_64BIT_FLAG,
+ cr_lbstall | _PORT_64BIT_FLAG,
+ cr_tidfull,
+ cr_tidinvalid,
+ cr_rxtidflowdrop,
+ cr_base_egrovfl + 0,
+ cr_base_egrovfl + 1,
+ cr_base_egrovfl + 2,
+ cr_base_egrovfl + 3,
+ cr_base_egrovfl + 4,
+ cr_base_egrovfl + 5,
+ cr_base_egrovfl + 6,
+ cr_base_egrovfl + 7,
+ cr_base_egrovfl + 8,
+ cr_base_egrovfl + 9,
+ cr_base_egrovfl + 10,
+ cr_base_egrovfl + 11,
+ cr_base_egrovfl + 12,
+ cr_base_egrovfl + 13,
+ cr_base_egrovfl + 14,
+ cr_base_egrovfl + 15,
+ cr_base_egrovfl + 16,
+ cr_base_egrovfl + 17,
+};
+
+/*
+ * same as cntr7322names and cntr7322indices, but for port-specific counters.
+ * portcntr7322indices is somewhat complicated by some registers needing
+ * adjustments of various kinds, and those are ORed with _PORT_VIRT_FLAG
+ */
+static const char portcntr7322names[] =
+ "TxPkt\n"
+ "TxFlowPkt\n"
+ "TxWords\n"
+ "RxPkt\n"
+ "RxFlowPkt\n"
+ "RxWords\n"
+ "TxFlowStall\n"
+ "TxDmaDesc\n" /* 7220 and 7322-only */
+ "E RxDlidFltr\n" /* 7220 and 7322-only */
+ "IBStatusChng\n"
+ "IBLinkDown\n"
+ "IBLnkRecov\n"
+ "IBRxLinkErr\n"
+ "IBSymbolErr\n"
+ "RxLLIErr\n"
+ "RxBadFormat\n"
+ "RxBadLen\n"
+ "RxBufOvrfl\n"
+ "RxEBP\n"
+ "RxFlowCtlErr\n"
+ "RxICRCerr\n"
+ "RxLPCRCerr\n"
+ "RxVCRCerr\n"
+ "RxInvalLen\n"
+ "RxInvalPKey\n"
+ "RxPktDropped\n"
+ "TxBadLength\n"
+ "TxDropped\n"
+ "TxInvalLen\n"
+ "TxUnderrun\n"
+ "TxUnsupVL\n"
+ "RxLclPhyErr\n" /* 7220 and 7322-only from here down */
+ "RxVL15Drop\n"
+ "RxVlErr\n"
+ "XcessBufOvfl\n"
+ "RxQPBadCtxt\n" /* 7322-only from here down */
+ "TXBadHeader\n"
+ ;
+
+static const u32 portcntr7322indices[] = {
+ QIBPORTCNTR_PKTSEND | _PORT_VIRT_FLAG,
+ crp_pktsendflow,
+ QIBPORTCNTR_WORDSEND | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_PKTRCV | _PORT_VIRT_FLAG,
+ crp_pktrcvflowctrl,
+ QIBPORTCNTR_WORDRCV | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_SENDSTALL | _PORT_VIRT_FLAG,
+ crp_txsdmadesc | _PORT_64BIT_FLAG,
+ crp_rxdlidfltr,
+ crp_ibstatuschange,
+ QIBPORTCNTR_IBLINKDOWN | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_IBLINKERRRECOV | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_ERRLINK | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_IBSYMBOLERR | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_LLI | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_BADFORMAT | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_ERR_RLEN | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_RCVOVFL | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_RCVEBP | _PORT_VIRT_FLAG,
+ crp_rcvflowctrlviol,
+ QIBPORTCNTR_ERRICRC | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_ERRLPCRC | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_ERRVCRC | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_INVALIDRLEN | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_ERRPKEY | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_RXDROPPKT | _PORT_VIRT_FLAG,
+ crp_txminmaxlenerr,
+ crp_txdroppedpkt,
+ crp_txlenerr,
+ crp_txunderrun,
+ crp_txunsupvl,
+ QIBPORTCNTR_RXLOCALPHYERR | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_VL15PKTDROP | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_RXVLERR | _PORT_VIRT_FLAG,
+ QIBPORTCNTR_EXCESSBUFOVFL | _PORT_VIRT_FLAG,
+ crp_rxqpinvalidctxt,
+ crp_txhdrerr,
+};
+
+/* do all the setup to make the counter reads efficient later */
+static void init_7322_cntrnames(struct qib_devdata *dd)
+{
+ int i, j = 0;
+ char *s;
+
+ for (i = 0, s = (char *)cntr7322names; s && j <= dd->cfgctxts;
+ i++) {
+ /* we always have at least one counter before the egrovfl */
+ if (!j && !strncmp("Ctxt0EgrOvfl", s + 1, 12))
+ j = 1;
+ s = strchr(s + 1, '\n');
+ if (s && j)
+ j++;
+ }
+ dd->cspec->ncntrs = i;
+ if (!s)
+ /* full list; size is without terminating null */
+ dd->cspec->cntrnamelen = sizeof(cntr7322names) - 1;
+ else
+ dd->cspec->cntrnamelen = 1 + s - cntr7322names;
+ dd->cspec->cntrs = kmalloc(dd->cspec->ncntrs
+ * sizeof(u64), GFP_KERNEL);
+ if (!dd->cspec->cntrs)
+ qib_dev_err(dd, "Failed allocation for counters\n");
+
+ for (i = 0, s = (char *)portcntr7322names; s; i++)
+ s = strchr(s + 1, '\n');
+ dd->cspec->nportcntrs = i - 1;
+ dd->cspec->portcntrnamelen = sizeof(portcntr7322names) - 1;
+ for (i = 0; i < dd->num_pports; ++i) {
+ dd->pport[i].cpspec->portcntrs = kmalloc(dd->cspec->nportcntrs
+ * sizeof(u64), GFP_KERNEL);
+ if (!dd->pport[i].cpspec->portcntrs)
+ qib_dev_err(dd,
+ "Failed allocation for portcounters\n");
+ }
+}
+
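+/*
+ * Counter reads are done in two passes: with namep set, return the name
+ * blob and its length; otherwise snapshot all device counters into the
+ * preallocated cspec->cntrs array and return the byte count.  A pos at
+ * or beyond the returned length signals that the previous read got
+ * everything.
+ */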
+static u32 qib_read_7322cntrs(struct qib_devdata *dd, loff_t pos, char **namep,
+ u64 **cntrp)
+{
+ u32 ret;
+
+ if (namep) {
+ ret = dd->cspec->cntrnamelen;
+ if (pos >= ret)
+ ret = 0; /* final read after getting everything */
+ else
+ *namep = (char *) cntr7322names;
+ } else {
+ u64 *cntr = dd->cspec->cntrs;
+ int i;
+
+ ret = dd->cspec->ncntrs * sizeof(u64);
+ if (!cntr || pos >= ret) {
+ /* everything read, or couldn't get memory */
+ ret = 0;
+ goto done;
+ }
+ *cntrp = cntr;
+ for (i = 0; i < dd->cspec->ncntrs; i++)
+ if (cntr7322indices[i] & _PORT_64BIT_FLAG)
+ *cntr++ = read_7322_creg(dd,
+ cntr7322indices[i] &
+ _PORT_CNTR_IDXMASK);
+ else
+ *cntr++ = read_7322_creg32(dd,
+ cntr7322indices[i]);
+ }
+done:
+ return ret;
+}
+
+static u32 qib_read_7322portcntrs(struct qib_devdata *dd, loff_t pos, u32 port,
+ char **namep, u64 **cntrp)
+{
+ u32 ret;
+
+ if (namep) {
+ ret = dd->cspec->portcntrnamelen;
+ if (pos >= ret)
+ ret = 0; /* final read after getting everything */
+ else
+ *namep = (char *)portcntr7322names;
+ } else {
+ struct qib_pportdata *ppd = &dd->pport[port];
+ u64 *cntr = ppd->cpspec->portcntrs;
+ int i;
+
+ ret = dd->cspec->nportcntrs * sizeof(u64);
+ if (!cntr || pos >= ret) {
+ /* everything read, or couldn't get memory */
+ ret = 0;
+ goto done;
+ }
+ *cntrp = cntr;
+ for (i = 0; i < dd->cspec->nportcntrs; i++) {
+ if (portcntr7322indices[i] & _PORT_VIRT_FLAG)
+ *cntr++ = qib_portcntr_7322(ppd,
+ portcntr7322indices[i] &
+ _PORT_CNTR_IDXMASK);
+ else if (portcntr7322indices[i] & _PORT_64BIT_FLAG)
+ *cntr++ = read_7322_creg_port(ppd,
+ portcntr7322indices[i] &
+ _PORT_CNTR_IDXMASK);
+ else
+ *cntr++ = read_7322_creg32_port(ppd,
+ portcntr7322indices[i]);
+ }
+ }
+done:
+ return ret;
+}
+
+/**
+ * qib_get_7322_faststats - get word counters from chip before they overflow
+ * @opaque: contains a pointer to the qlogic_ib device qib_devdata
+ *
+ * VESTIGIAL: IBA7322 has no "small fast counters", so the only
+ * real purpose of this function is to maintain the notion of
+ * "active time", which in turn is only logged into the eeprom,
+ * which we don't have, yet, for 7322-based boards.
+ *
+ * called from add_timer
+ */
+static void qib_get_7322_faststats(unsigned long opaque)
+{
+ struct qib_devdata *dd = (struct qib_devdata *) opaque;
+ struct qib_pportdata *ppd;
+ unsigned long flags;
+ u64 traffic_wds;
+ int pidx;
+
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+
+ /*
+		 * If the port isn't enabled, isn't operational, or
+		 * diags are running (which can cause memory diags to fail),
+ * skip this port this time.
+ */
+ if (!ppd->link_speed_supported || !(dd->flags & QIB_INITTED)
+ || dd->diag_client)
+ continue;
+
+ /*
+ * Maintain an activity timer, based on traffic
+ * exceeding a threshold, so we need to check the word-counts
+ * even if they are 64-bit.
+ */
+ traffic_wds = qib_portcntr_7322(ppd, QIBPORTCNTR_WORDRCV) +
+ qib_portcntr_7322(ppd, QIBPORTCNTR_WORDSEND);
+ spin_lock_irqsave(&ppd->dd->eep_st_lock, flags);
+ traffic_wds -= ppd->dd->traffic_wds;
+ ppd->dd->traffic_wds += traffic_wds;
+ spin_unlock_irqrestore(&ppd->dd->eep_st_lock, flags);
+ if (ppd->cpspec->qdr_dfe_on && (ppd->link_speed_active &
+ QIB_IB_QDR) &&
+ (ppd->lflags & (QIBL_LINKINIT | QIBL_LINKARMED |
+ QIBL_LINKACTIVE)) &&
+ ppd->cpspec->qdr_dfe_time &&
+ time_is_before_jiffies(ppd->cpspec->qdr_dfe_time)) {
+ ppd->cpspec->qdr_dfe_on = 0;
+
+ qib_write_kreg_port(ppd, krp_static_adapt_dis(2),
+ ppd->dd->cspec->r1 ?
+ QDR_STATIC_ADAPT_INIT_R1 :
+ QDR_STATIC_ADAPT_INIT);
+ force_h1(ppd);
+ }
+ }
+ mod_timer(&dd->stats_timer, jiffies + HZ * ACTIVITY_TIMER);
+}
+
+/*
+ * If we were using MSIx, try to fallback to INTx.
+ */
+static int qib_7322_intr_fallback(struct qib_devdata *dd)
+{
+ if (!dd->cspec->num_msix_entries)
+ return 0; /* already using INTx */
+
+ qib_devinfo(dd->pcidev,
+ "MSIx interrupt not detected, trying INTx interrupts\n");
+ qib_7322_nomsix(dd);
+ qib_enable_intx(dd->pcidev);
+ qib_setup_7322_interrupt(dd, 0);
+ return 1;
+}
+
+/*
+ * Reset the XGXS (between serdes and IBC). Slightly less intrusive
+ * than resetting the IBC or external link state, and useful in some
+ * cases to cause some retraining. To do this right, we reset IBC
+ * as well, then return to previous state (which may still be in reset).
+ * NOTE: some callers of this "know" this writes the current value
+ * of cpspec->ibcctrl_a as part of its operation, so if that changes,
+ * check all callers.
+ */
+static void qib_7322_mini_pcs_reset(struct qib_pportdata *ppd)
+{
+ u64 val;
+ struct qib_devdata *dd = ppd->dd;
+ const u64 reset_bits = SYM_MASK(IBPCSConfig_0, xcv_rreset) |
+ SYM_MASK(IBPCSConfig_0, xcv_treset) |
+ SYM_MASK(IBPCSConfig_0, tx_rx_reset);
+
+ val = qib_read_kreg_port(ppd, krp_ib_pcsconfig);
+ qib_write_kreg(dd, kr_hwerrmask,
+ dd->cspec->hwerrmask & ~HWE_MASK(statusValidNoEop));
+ qib_write_kreg_port(ppd, krp_ibcctrl_a,
+ ppd->cpspec->ibcctrl_a &
+ ~SYM_MASK(IBCCtrlA_0, IBLinkEn));
+
+ qib_write_kreg_port(ppd, krp_ib_pcsconfig, val | reset_bits);
+ qib_read_kreg32(dd, kr_scratch);
+ qib_write_kreg_port(ppd, krp_ib_pcsconfig, val & ~reset_bits);
+ qib_write_kreg_port(ppd, krp_ibcctrl_a, ppd->cpspec->ibcctrl_a);
+ qib_write_kreg(dd, kr_scratch, 0ULL);
+ qib_write_kreg(dd, kr_hwerrclear,
+ SYM_MASK(HwErrClear, statusValidNoEopClear));
+ qib_write_kreg(dd, kr_hwerrmask, dd->cspec->hwerrmask);
+}
+
+/*
+ * This code for non-IBTA-compliant IB speed negotiation is only known to
+ * work for the SDR to DDR transition, and only between an HCA and a switch
+ * with recent firmware. It is based on observed heuristics, rather than
+ * actual knowledge of the non-compliant speed negotiation.
+ * It has a number of hard-coded fields, since the hope is to rewrite this
+ * when a spec is available on how the negotiation is intended to work.
+ */
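+/*
+ * autoneg_7322_sendpkt() hand-builds one such packet in a PIO send
+ * buffer: the PBC goes in as a 64-bit word at the start, the 7-dword
+ * header is copied at dword offset 2 and dcnt dwords of payload at
+ * offset 9, with the chip's header checking disabled around the send
+ * since the packet is deliberately non-standard.
+ */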
+static void autoneg_7322_sendpkt(struct qib_pportdata *ppd, u32 *hdr,
+ u32 dcnt, u32 *data)
+{
+ int i;
+ u64 pbc;
+ u32 __iomem *piobuf;
+ u32 pnum, control, len;
+ struct qib_devdata *dd = ppd->dd;
+
+ i = 0;
+ len = 7 + dcnt + 1; /* 7 dword header, dword data, icrc */
+ control = qib_7322_setpbc_control(ppd, len, 0, 15);
+ pbc = ((u64) control << 32) | len;
+ while (!(piobuf = qib_7322_getsendbuf(ppd, pbc, &pnum))) {
+ if (i++ > 15)
+ return;
+ udelay(2);
+ }
+ /* disable header check on this packet, since it can't be valid */
+ dd->f_txchk_change(dd, pnum, 1, TXCHK_CHG_TYPE_DIS1, NULL);
+ writeq(pbc, piobuf);
+ qib_flush_wc();
+ qib_pio_copy(piobuf + 2, hdr, 7);
+ qib_pio_copy(piobuf + 9, data, dcnt);
+ if (dd->flags & QIB_USE_SPCL_TRIG) {
+ u32 spcl_off = (pnum >= dd->piobcnt2k) ? 2047 : 1023;
+
+ qib_flush_wc();
+ __raw_writel(0xaebecede, piobuf + spcl_off);
+ }
+ qib_flush_wc();
+ qib_sendbuf_done(dd, pnum);
+ /* and re-enable hdr check */
+ dd->f_txchk_change(dd, pnum, 1, TXCHK_CHG_TYPE_ENAB1, NULL);
+}
+
+/*
+ * _start packet gets sent twice at start, _done gets sent twice at end.
+ */
+static void qib_autoneg_7322_send(struct qib_pportdata *ppd, int which)
+{
+ struct qib_devdata *dd = ppd->dd;
+ static u32 swapped;
+ u32 dw, i, hcnt, dcnt, *data;
+ static u32 hdr[7] = { 0xf002ffff, 0x48ffff, 0x6400abba };
+ static u32 madpayload_start[0x40] = {
+ 0x1810103, 0x1, 0x0, 0x0, 0x2c90000, 0x2c9, 0x0, 0x0,
+ 0xffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x1, 0x1388, 0x15e, 0x1, /* rest 0's */
+ };
+ static u32 madpayload_done[0x40] = {
+ 0x1810103, 0x1, 0x0, 0x0, 0x2c90000, 0x2c9, 0x0, 0x0,
+ 0xffffffff, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x40000001, 0x1388, 0x15e, /* rest 0's */
+ };
+
+ dcnt = ARRAY_SIZE(madpayload_start);
+ hcnt = ARRAY_SIZE(hdr);
+ if (!swapped) {
+ /* for maintainability, do it at runtime */
+ for (i = 0; i < hcnt; i++) {
+ dw = (__force u32) cpu_to_be32(hdr[i]);
+ hdr[i] = dw;
+ }
+ for (i = 0; i < dcnt; i++) {
+ dw = (__force u32) cpu_to_be32(madpayload_start[i]);
+ madpayload_start[i] = dw;
+ dw = (__force u32) cpu_to_be32(madpayload_done[i]);
+ madpayload_done[i] = dw;
+ }
+ swapped = 1;
+ }
+
+ data = which ? madpayload_done : madpayload_start;
+
+ autoneg_7322_sendpkt(ppd, hdr, dcnt, data);
+ qib_read_kreg64(dd, kr_scratch);
+ udelay(2);
+ autoneg_7322_sendpkt(ppd, hdr, dcnt, data);
+ qib_read_kreg64(dd, kr_scratch);
+ udelay(2);
+}
+
+/*
+ * Do the absolute minimum to cause an IB speed change, and make it
+ * ready, but don't actually trigger the change. The caller will
+ * do that when ready (if link is in Polling training state, it will
+ * happen immediately, otherwise when link next goes down)
+ *
+ * This routine should only be used as part of the DDR autonegotiation
+ * code for devices that are not compliant with IB 1.2 (or code that
+ * fixes things up for same).
+ *
+ * When the link has gone down and autoneg is enabled, or autoneg has
+ * failed and we give up until next time, we set both speeds, and
+ * then we want IBTA negotiation enabled as well as "use max enabled speed".
+ */
+static void set_7322_ibspeed_fast(struct qib_pportdata *ppd, u32 speed)
+{
+ u64 newctrlb;
+
+ newctrlb = ppd->cpspec->ibcctrl_b & ~(IBA7322_IBC_SPEED_MASK |
+ IBA7322_IBC_IBTA_1_2_MASK |
+ IBA7322_IBC_MAX_SPEED_MASK);
+
+ if (speed & (speed - 1)) /* multiple speeds */
+ newctrlb |= (speed << IBA7322_IBC_SPEED_LSB) |
+ IBA7322_IBC_IBTA_1_2_MASK |
+ IBA7322_IBC_MAX_SPEED_MASK;
+ else
+ newctrlb |= speed == QIB_IB_QDR ?
+ IBA7322_IBC_SPEED_QDR | IBA7322_IBC_IBTA_1_2_MASK :
+ ((speed == QIB_IB_DDR ?
+ IBA7322_IBC_SPEED_DDR : IBA7322_IBC_SPEED_SDR));
+
+ if (newctrlb == ppd->cpspec->ibcctrl_b)
+ return;
+
+ ppd->cpspec->ibcctrl_b = newctrlb;
+ qib_write_kreg_port(ppd, krp_ibcctrl_b, ppd->cpspec->ibcctrl_b);
+ qib_write_kreg(ppd->dd, kr_scratch, 0);
+}
+
+/*
+ * This routine is only used when we are not talking to another
+ * IB 1.2-compliant device that we think can do DDR.
+ * (This includes all existing switch chips as of Oct 2007.)
+ * 1.2-compliant devices go directly to DDR prior to reaching INIT
+ */
+static void try_7322_autoneg(struct qib_pportdata *ppd)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags |= QIBL_IB_AUTONEG_INPROG;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ qib_autoneg_7322_send(ppd, 0);
+ set_7322_ibspeed_fast(ppd, QIB_IB_DDR);
+ qib_7322_mini_pcs_reset(ppd);
+ /* 2 msec is minimum length of a poll cycle */
+ queue_delayed_work(ib_wq, &ppd->cpspec->autoneg_work,
+ msecs_to_jiffies(2));
+}
+
+/*
+ * Handle the empirically determined mechanism for auto-negotiation
+ * of DDR speed with switches.
+ */
+static void autoneg_7322_work(struct work_struct *work)
+{
+ struct qib_pportdata *ppd;
+ struct qib_devdata *dd;
+ u64 startms;
+ u32 i;
+ unsigned long flags;
+
+ ppd = container_of(work, struct qib_chippport_specific,
+ autoneg_work.work)->ppd;
+ dd = ppd->dd;
+
+ startms = jiffies_to_msecs(jiffies);
+
+ /*
+	 * Busy-wait for this first part; it should be at most a
+	 * few hundred usec, since we scheduled ourselves for 2 msec.
+ */
+ for (i = 0; i < 25; i++) {
+ if (SYM_FIELD(ppd->lastibcstat, IBCStatusA_0, LinkState)
+ == IB_7322_LT_STATE_POLLQUIET) {
+ qib_set_linkstate(ppd, QIB_IB_LINKDOWN_DISABLE);
+ break;
+ }
+ udelay(100);
+ }
+
+ if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG))
+ goto done; /* we got there early or told to stop */
+
+ /* we expect this to timeout */
+ if (wait_event_timeout(ppd->cpspec->autoneg_wait,
+ !(ppd->lflags & QIBL_IB_AUTONEG_INPROG),
+ msecs_to_jiffies(90)))
+ goto done;
+ qib_7322_mini_pcs_reset(ppd);
+
+ /* we expect this to timeout */
+ if (wait_event_timeout(ppd->cpspec->autoneg_wait,
+ !(ppd->lflags & QIBL_IB_AUTONEG_INPROG),
+ msecs_to_jiffies(1700)))
+ goto done;
+ qib_7322_mini_pcs_reset(ppd);
+
+ set_7322_ibspeed_fast(ppd, QIB_IB_SDR);
+
+ /*
+ * Wait up to 250 msec for link to train and get to INIT at DDR;
+ * this should terminate early.
+ */
+ wait_event_timeout(ppd->cpspec->autoneg_wait,
+ !(ppd->lflags & QIBL_IB_AUTONEG_INPROG),
+ msecs_to_jiffies(250));
+done:
+ if (ppd->lflags & QIBL_IB_AUTONEG_INPROG) {
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags &= ~QIBL_IB_AUTONEG_INPROG;
+ if (ppd->cpspec->autoneg_tries == AUTONEG_TRIES) {
+ ppd->lflags |= QIBL_IB_AUTONEG_FAILED;
+ ppd->cpspec->autoneg_tries = 0;
+ }
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ set_7322_ibspeed_fast(ppd, ppd->link_speed_enabled);
+ }
+}
+
+/*
+ * This routine is used to request IPG set in the QLogic switch.
+ * Only called if r1.
+ */
+static void try_7322_ipg(struct qib_pportdata *ppd)
+{
+ struct qib_ibport *ibp = &ppd->ibport_data;
+ struct ib_mad_send_buf *send_buf;
+ struct ib_mad_agent *agent;
+ struct ib_smp *smp;
+ unsigned delay;
+ int ret;
+
+ agent = ibp->send_agent;
+ if (!agent)
+ goto retry;
+
+ send_buf = ib_create_send_mad(agent, 0, 0, 0, IB_MGMT_MAD_HDR,
+ IB_MGMT_MAD_DATA, GFP_ATOMIC);
+ if (IS_ERR(send_buf))
+ goto retry;
+
+ if (!ibp->smi_ah) {
+ struct ib_ah *ah;
+
+ ah = qib_create_qp0_ah(ibp, be16_to_cpu(IB_LID_PERMISSIVE));
+ if (IS_ERR(ah))
+ ret = PTR_ERR(ah);
+ else {
+ send_buf->ah = ah;
+ ibp->smi_ah = to_iah(ah);
+ ret = 0;
+ }
+ } else {
+ send_buf->ah = &ibp->smi_ah->ibah;
+ ret = 0;
+ }
+
+ smp = send_buf->mad;
+ smp->base_version = IB_MGMT_BASE_VERSION;
+ smp->mgmt_class = IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE;
+ smp->class_version = 1;
+ smp->method = IB_MGMT_METHOD_SEND;
+ smp->hop_cnt = 1;
+ smp->attr_id = QIB_VENDOR_IPG;
+ smp->attr_mod = 0;
+
+ if (!ret)
+ ret = ib_post_send_mad(send_buf, NULL);
+ if (ret)
+ ib_free_send_mad(send_buf);
+retry:
+ delay = 2 << ppd->cpspec->ipg_tries;
+ queue_delayed_work(ib_wq, &ppd->cpspec->ipg_work,
+ msecs_to_jiffies(delay));
+}
+
+/*
+ * Timeout handler for setting IPG.
+ * Only called if r1.
+ */
+static void ipg_7322_work(struct work_struct *work)
+{
+ struct qib_pportdata *ppd;
+
+ ppd = container_of(work, struct qib_chippport_specific,
+ ipg_work.work)->ppd;
+ if ((ppd->lflags & (QIBL_LINKINIT | QIBL_LINKARMED | QIBL_LINKACTIVE))
+ && ++ppd->cpspec->ipg_tries <= 10)
+ try_7322_ipg(ppd);
+}
+
+static u32 qib_7322_iblink_state(u64 ibcs)
+{
+ u32 state = (u32)SYM_FIELD(ibcs, IBCStatusA_0, LinkState);
+
+ switch (state) {
+ case IB_7322_L_STATE_INIT:
+ state = IB_PORT_INIT;
+ break;
+ case IB_7322_L_STATE_ARM:
+ state = IB_PORT_ARMED;
+ break;
+ case IB_7322_L_STATE_ACTIVE:
+ /* fall through */
+ case IB_7322_L_STATE_ACT_DEFER:
+ state = IB_PORT_ACTIVE;
+ break;
+ default: /* fall through */
+ case IB_7322_L_STATE_DOWN:
+ state = IB_PORT_DOWN;
+ break;
+ }
+ return state;
+}
+
+/* returns the IBTA port state, rather than the IBC link training state */
+static u8 qib_7322_phys_portstate(u64 ibcs)
+{
+ u8 state = (u8)SYM_FIELD(ibcs, IBCStatusA_0, LinkTrainingState);
+ return qib_7322_physportstate[state];
+}
+
+static int qib_7322_ib_updown(struct qib_pportdata *ppd, int ibup, u64 ibcs)
+{
+ int ret = 0, symadj = 0;
+ unsigned long flags;
+ int mult;
+
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags &= ~QIBL_IB_FORCE_NOTIFY;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+
+ /* Update our picture of width and speed from chip */
+ if (ibcs & SYM_MASK(IBCStatusA_0, LinkSpeedQDR)) {
+ ppd->link_speed_active = QIB_IB_QDR;
+ mult = 4;
+ } else if (ibcs & SYM_MASK(IBCStatusA_0, LinkSpeedActive)) {
+ ppd->link_speed_active = QIB_IB_DDR;
+ mult = 2;
+ } else {
+ ppd->link_speed_active = QIB_IB_SDR;
+ mult = 1;
+ }
+ if (ibcs & SYM_MASK(IBCStatusA_0, LinkWidthActive)) {
+ ppd->link_width_active = IB_WIDTH_4X;
+ mult *= 4;
+ } else
+ ppd->link_width_active = IB_WIDTH_1X;
+ ppd->delay_mult = ib_rate_to_delay[mult_to_ib_rate(mult)];
+
+ if (!ibup) {
+ u64 clr;
+
+ /* Link went down. */
+ /* do IPG MAD again after linkdown, even if last time failed */
+ ppd->cpspec->ipg_tries = 0;
+ clr = qib_read_kreg_port(ppd, krp_ibcstatus_b) &
+ (SYM_MASK(IBCStatusB_0, heartbeat_timed_out) |
+ SYM_MASK(IBCStatusB_0, heartbeat_crosstalk));
+ if (clr)
+ qib_write_kreg_port(ppd, krp_ibcstatus_b, clr);
+ if (!(ppd->lflags & (QIBL_IB_AUTONEG_FAILED |
+ QIBL_IB_AUTONEG_INPROG)))
+ set_7322_ibspeed_fast(ppd, ppd->link_speed_enabled);
+ if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) {
+ struct qib_qsfp_data *qd =
+ &ppd->cpspec->qsfp_data;
+ /* unlock the Tx settings, speed may change */
+ qib_write_kreg_port(ppd, krp_tx_deemph_override,
+ SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0,
+ reset_tx_deemphasis_override));
+ qib_cancel_sends(ppd);
+ /* on link down, ensure sane pcs state */
+ qib_7322_mini_pcs_reset(ppd);
+ /* schedule the qsfp refresh which should turn the link
+ off */
+ if (ppd->dd->flags & QIB_HAS_QSFP) {
+ qd->t_insert = jiffies;
+ queue_work(ib_wq, &qd->work);
+ }
+ spin_lock_irqsave(&ppd->sdma_lock, flags);
+ if (__qib_sdma_running(ppd))
+ __qib_sdma_process_event(ppd,
+ qib_sdma_event_e70_go_idle);
+ spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+ }
+ clr = read_7322_creg32_port(ppd, crp_iblinkdown);
+ if (clr == ppd->cpspec->iblnkdownsnap)
+ ppd->cpspec->iblnkdowndelta++;
+ } else {
+ if (qib_compat_ddr_negotiate &&
+ !(ppd->lflags & (QIBL_IB_AUTONEG_FAILED |
+ QIBL_IB_AUTONEG_INPROG)) &&
+ ppd->link_speed_active == QIB_IB_SDR &&
+ (ppd->link_speed_enabled & QIB_IB_DDR)
+ && ppd->cpspec->autoneg_tries < AUTONEG_TRIES) {
+ /* we are SDR, and auto-negotiation enabled */
+ ++ppd->cpspec->autoneg_tries;
+ if (!ppd->cpspec->ibdeltainprog) {
+ ppd->cpspec->ibdeltainprog = 1;
+ ppd->cpspec->ibsymdelta +=
+ read_7322_creg32_port(ppd,
+ crp_ibsymbolerr) -
+ ppd->cpspec->ibsymsnap;
+ ppd->cpspec->iblnkerrdelta +=
+ read_7322_creg32_port(ppd,
+ crp_iblinkerrrecov) -
+ ppd->cpspec->iblnkerrsnap;
+ }
+ try_7322_autoneg(ppd);
+ ret = 1; /* no other IB status change processing */
+ } else if ((ppd->lflags & QIBL_IB_AUTONEG_INPROG) &&
+ ppd->link_speed_active == QIB_IB_SDR) {
+ qib_autoneg_7322_send(ppd, 1);
+ set_7322_ibspeed_fast(ppd, QIB_IB_DDR);
+ qib_7322_mini_pcs_reset(ppd);
+ udelay(2);
+ ret = 1; /* no other IB status change processing */
+ } else if ((ppd->lflags & QIBL_IB_AUTONEG_INPROG) &&
+ (ppd->link_speed_active & QIB_IB_DDR)) {
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags &= ~(QIBL_IB_AUTONEG_INPROG |
+ QIBL_IB_AUTONEG_FAILED);
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ ppd->cpspec->autoneg_tries = 0;
+ /* re-enable SDR, for next link down */
+ set_7322_ibspeed_fast(ppd, ppd->link_speed_enabled);
+ wake_up(&ppd->cpspec->autoneg_wait);
+ symadj = 1;
+ } else if (ppd->lflags & QIBL_IB_AUTONEG_FAILED) {
+ /*
+ * Clear autoneg failure flag, and do setup
+ * so we'll try next time link goes down and
+ * back to INIT (possibly connected to a
+ * different device).
+ */
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags &= ~QIBL_IB_AUTONEG_FAILED;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ ppd->cpspec->ibcctrl_b |= IBA7322_IBC_IBTA_1_2_MASK;
+ symadj = 1;
+ }
+ if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) {
+ symadj = 1;
+ if (ppd->dd->cspec->r1 && ppd->cpspec->ipg_tries <= 10)
+ try_7322_ipg(ppd);
+ if (!ppd->cpspec->recovery_init)
+ setup_7322_link_recovery(ppd, 0);
+ ppd->cpspec->qdr_dfe_time = jiffies +
+ msecs_to_jiffies(QDR_DFE_DISABLE_DELAY);
+ }
+ ppd->cpspec->ibmalfusesnap = 0;
+ ppd->cpspec->ibmalfsnap = read_7322_creg32_port(ppd,
+ crp_errlink);
+ }
+ if (symadj) {
+ ppd->cpspec->iblnkdownsnap =
+ read_7322_creg32_port(ppd, crp_iblinkdown);
+ if (ppd->cpspec->ibdeltainprog) {
+ ppd->cpspec->ibdeltainprog = 0;
+ ppd->cpspec->ibsymdelta += read_7322_creg32_port(ppd,
+ crp_ibsymbolerr) - ppd->cpspec->ibsymsnap;
+ ppd->cpspec->iblnkerrdelta += read_7322_creg32_port(ppd,
+ crp_iblinkerrrecov) - ppd->cpspec->iblnkerrsnap;
+ }
+ } else if (!ibup && qib_compat_ddr_negotiate &&
+ !ppd->cpspec->ibdeltainprog &&
+ !(ppd->lflags & QIBL_IB_AUTONEG_INPROG)) {
+ ppd->cpspec->ibdeltainprog = 1;
+ ppd->cpspec->ibsymsnap = read_7322_creg32_port(ppd,
+ crp_ibsymbolerr);
+ ppd->cpspec->iblnkerrsnap = read_7322_creg32_port(ppd,
+ crp_iblinkerrrecov);
+ }
+
+ if (!ret)
+ qib_setup_7322_setextled(ppd, ibup);
+ return ret;
+}
+
+/*
+ * Does read/modify/write to appropriate registers to
+ * set output and direction bits selected by mask.
+ * These are in their canonical positions (e.g. lsb of
+ * dir will end up in D48 of extctrl on existing chips).
+ * returns contents of GP Inputs.
+ */
+static int gpio_7322_mod(struct qib_devdata *dd, u32 out, u32 dir, u32 mask)
+{
+ u64 read_val, new_out;
+ unsigned long flags;
+
+ if (mask) {
+ /* some bits being written, lock access to GPIO */
+ dir &= mask;
+ out &= mask;
+ spin_lock_irqsave(&dd->cspec->gpio_lock, flags);
+ dd->cspec->extctrl &= ~((u64)mask << SYM_LSB(EXTCtrl, GPIOOe));
+ dd->cspec->extctrl |= ((u64) dir << SYM_LSB(EXTCtrl, GPIOOe));
+ new_out = (dd->cspec->gpio_out & ~mask) | out;
+
+ qib_write_kreg(dd, kr_extctrl, dd->cspec->extctrl);
+ qib_write_kreg(dd, kr_gpio_out, new_out);
+ dd->cspec->gpio_out = new_out;
+ spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags);
+ }
+ /*
+ * It is unlikely that a read at this time would get valid
+ * data on a pin whose direction line was set in the same
+ * call to this function. We include the read here because
+ * that allows us to potentially combine a change on one pin with
+ * a read on another, and because the old code did something like
+ * this.
+ */
+ read_val = qib_read_kreg64(dd, kr_extstatus);
+ return SYM_FIELD(read_val, EXTStatus, GPIOIn);
+}
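+
+/*
+ * Note: a mask of 0 turns gpio_7322_mod() into a pure read of the GPIO
+ * inputs (no register is written), which is how qib_7322_eeprom_wen()
+ * below samples the current write-enable state.
+ */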
+
+/* Enable writes to config EEPROM, if possible. Returns previous state */
+static int qib_7322_eeprom_wen(struct qib_devdata *dd, int wen)
+{
+ int prev_wen;
+ u32 mask;
+
+ mask = 1 << QIB_EEPROM_WEN_NUM;
+ prev_wen = ~gpio_7322_mod(dd, 0, 0, 0) >> QIB_EEPROM_WEN_NUM;
+ gpio_7322_mod(dd, wen ? 0 : mask, mask, mask);
+
+ return prev_wen & 1;
+}
+
+/*
+ * Read fundamental info we need to use the chip. These are
+ * the registers that describe chip capabilities, and are
+ * saved in shadow registers.
+ */
+static void get_7322_chip_params(struct qib_devdata *dd)
+{
+ u64 val;
+ u32 piobufs;
+ int mtu;
+
+ dd->palign = qib_read_kreg32(dd, kr_pagealign);
+
+ dd->uregbase = qib_read_kreg32(dd, kr_userregbase);
+
+ dd->rcvtidcnt = qib_read_kreg32(dd, kr_rcvtidcnt);
+ dd->rcvtidbase = qib_read_kreg32(dd, kr_rcvtidbase);
+ dd->rcvegrbase = qib_read_kreg32(dd, kr_rcvegrbase);
+ dd->piobufbase = qib_read_kreg64(dd, kr_sendpiobufbase);
+ dd->pio2k_bufbase = dd->piobufbase & 0xffffffff;
+
+ val = qib_read_kreg64(dd, kr_sendpiobufcnt);
+ dd->piobcnt2k = val & ~0U;
+ dd->piobcnt4k = val >> 32;
+ val = qib_read_kreg64(dd, kr_sendpiosize);
+ dd->piosize2k = val & ~0U;
+ dd->piosize4k = val >> 32;
+
+ mtu = ib_mtu_enum_to_int(qib_ibmtu);
+ if (mtu == -1)
+ mtu = QIB_DEFAULT_MTU;
+ dd->pport[0].ibmtu = (u32)mtu;
+ dd->pport[1].ibmtu = (u32)mtu;
+
+ /* these may be adjusted in init_chip_wc_pat() */
+ dd->pio2kbase = (u32 __iomem *)
+ ((char __iomem *) dd->kregbase + dd->pio2k_bufbase);
+ dd->pio4kbase = (u32 __iomem *)
+ ((char __iomem *) dd->kregbase +
+ (dd->piobufbase >> 32));
+ /*
+ * 4K buffers take 2 pages; we use roundup just to be
+ * paranoid; we calculate it once here, rather than on
+	 * every buffer allocation.
+ */
+ dd->align4k = ALIGN(dd->piosize4k, dd->palign);
+
+ piobufs = dd->piobcnt4k + dd->piobcnt2k + NUM_VL15_BUFS;
+
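+	/*
+	 * Each 64-bit pioavail register covers 32 send buffers (the
+	 * sizeof(u64) * BITS_PER_BYTE / 2 term, i.e. two bits per buffer),
+	 * so round the buffer count up to a multiple of 32 and divide.
+	 */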
+ dd->pioavregs = ALIGN(piobufs, sizeof(u64) * BITS_PER_BYTE / 2) /
+ (sizeof(u64) * BITS_PER_BYTE / 2);
+}
+
+/*
+ * The chip base addresses in cspec and cpspec have to be set
+ * after possible init_chip_wc_pat(), rather than in
+ * get_7322_chip_params(), so split out as separate function
+ */
+static void qib_7322_set_baseaddrs(struct qib_devdata *dd)
+{
+ u32 cregbase;
+
+ cregbase = qib_read_kreg32(dd, kr_counterregbase);
+
+ dd->cspec->cregbase = (u64 __iomem *)(cregbase +
+ (char __iomem *)dd->kregbase);
+
+ dd->egrtidbase = (u64 __iomem *)
+ ((char __iomem *) dd->kregbase + dd->rcvegrbase);
+
+ /* port registers are defined as relative to base of chip */
+ dd->pport[0].cpspec->kpregbase =
+ (u64 __iomem *)((char __iomem *)dd->kregbase);
+ dd->pport[1].cpspec->kpregbase =
+ (u64 __iomem *)(dd->palign +
+ (char __iomem *)dd->kregbase);
+ dd->pport[0].cpspec->cpregbase =
+ (u64 __iomem *)(qib_read_kreg_port(&dd->pport[0],
+ kr_counterregbase) + (char __iomem *)dd->kregbase);
+ dd->pport[1].cpspec->cpregbase =
+ (u64 __iomem *)(qib_read_kreg_port(&dd->pport[1],
+ kr_counterregbase) + (char __iomem *)dd->kregbase);
+}
+
+/*
+ * This is a fairly special-purpose observer, so we only support
+ * the port-specific parts of SendCtrl
+ */
+
+#define SENDCTRL_SHADOWED (SYM_MASK(SendCtrl_0, SendEnable) | \
+ SYM_MASK(SendCtrl_0, SDmaEnable) | \
+ SYM_MASK(SendCtrl_0, SDmaIntEnable) | \
+ SYM_MASK(SendCtrl_0, SDmaSingleDescriptor) | \
+ SYM_MASK(SendCtrl_0, SDmaHalt) | \
+ SYM_MASK(SendCtrl_0, IBVLArbiterEn) | \
+ SYM_MASK(SendCtrl_0, ForceCreditUpToDate))
+
+static int sendctrl_hook(struct qib_devdata *dd,
+ const struct diag_observer *op, u32 offs,
+ u64 *data, u64 mask, int only_32)
+{
+ unsigned long flags;
+ unsigned idx;
+ unsigned pidx;
+ struct qib_pportdata *ppd = NULL;
+ u64 local_data, all_bits;
+
+ /*
+ * The fixed correspondence between Physical ports and pports is
+ * severed. We need to hunt for the ppd that corresponds
+ * to the offset we got. And we have to do that without admitting
+ * we know the stride, apparently.
+ */
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ u64 __iomem *psptr;
+ u32 psoffs;
+
+ ppd = dd->pport + pidx;
+ if (!ppd->cpspec->kpregbase)
+ continue;
+
+ psptr = ppd->cpspec->kpregbase + krp_sendctrl;
+ psoffs = (u32) (psptr - dd->kregbase) * sizeof(*psptr);
+ if (psoffs == offs)
+ break;
+ }
+
+ /* If pport is not being managed by driver, just avoid shadows. */
+ if (pidx >= dd->num_pports)
+ ppd = NULL;
+
+ /* In any case, "idx" is flat index in kreg space */
+ idx = offs / sizeof(u64);
+
+ all_bits = ~0ULL;
+ if (only_32)
+ all_bits >>= 32;
+
+ spin_lock_irqsave(&dd->sendctrl_lock, flags);
+ if (!ppd || (mask & all_bits) != all_bits) {
+ /*
+ * At least some mask bits are zero, so we need
+ * to read. The judgement call is whether from
+ * reg or shadow. First-cut: read reg, and complain
+ * if any bits which should be shadowed are different
+ * from their shadowed value.
+ */
+ if (only_32)
+ local_data = (u64)qib_read_kreg32(dd, idx);
+ else
+ local_data = qib_read_kreg64(dd, idx);
+ *data = (local_data & ~mask) | (*data & mask);
+ }
+ if (mask) {
+ /*
+ * At least some mask bits are one, so we need
+ * to write, but only shadow some bits.
+ */
+ u64 sval, tval; /* Shadowed, transient */
+
+ /*
+ * New shadow val is bits we don't want to touch,
+ * ORed with bits we do, that are intended for shadow.
+ */
+ if (ppd) {
+ sval = ppd->p_sendctrl & ~mask;
+ sval |= *data & SENDCTRL_SHADOWED & mask;
+ ppd->p_sendctrl = sval;
+ } else
+ sval = *data & SENDCTRL_SHADOWED & mask;
+ tval = sval | (*data & ~SENDCTRL_SHADOWED & mask);
+ qib_write_kreg(dd, idx, tval);
+		qib_write_kreg(dd, kr_scratch, 0ULL);
+ }
+ spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+ return only_32 ? 4 : 8;
+}
+
+static const struct diag_observer sendctrl_0_observer = {
+ sendctrl_hook, KREG_IDX(SendCtrl_0) * sizeof(u64),
+ KREG_IDX(SendCtrl_0) * sizeof(u64)
+};
+
+static const struct diag_observer sendctrl_1_observer = {
+ sendctrl_hook, KREG_IDX(SendCtrl_1) * sizeof(u64),
+ KREG_IDX(SendCtrl_1) * sizeof(u64)
+};
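+
+/*
+ * Both observers are registered from qib_late_7322_initreg() below, so
+ * diag accesses to either port's SendCtrl are routed through
+ * sendctrl_hook() and kept coherent with the driver's shadow copies.
+ */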
+
+static ushort sdma_fetch_prio = 8;
+module_param_named(sdma_fetch_prio, sdma_fetch_prio, ushort, S_IRUGO);
+MODULE_PARM_DESC(sdma_fetch_prio, "SDMA descriptor fetch priority");
+
+/* Besides logging QSFP events, we set appropriate TxDDS values */
+static void init_txdds_table(struct qib_pportdata *ppd, int override);
+
+static void qsfp_7322_event(struct work_struct *work)
+{
+ struct qib_qsfp_data *qd;
+ struct qib_pportdata *ppd;
+ unsigned long pwrup;
+ unsigned long flags;
+ int ret;
+ u32 le2;
+
+ qd = container_of(work, struct qib_qsfp_data, work);
+ ppd = qd->ppd;
+ pwrup = qd->t_insert +
+ msecs_to_jiffies(QSFP_PWR_LAG_MSEC - QSFP_MODPRS_LAG_MSEC);
+
+ /* Delay for 20 msecs to allow ModPrs resistor to setup */
+ mdelay(QSFP_MODPRS_LAG_MSEC);
+
+ if (!qib_qsfp_mod_present(ppd)) {
+ ppd->cpspec->qsfp_data.modpresent = 0;
+ /* Set the physical link to disabled */
+ qib_set_ib_7322_lstate(ppd, 0,
+ QLOGIC_IB_IBCC_LINKINITCMD_DISABLE);
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags &= ~QIBL_LINKV;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ } else {
+ /*
+		 * Some QSFPs not only do not respond until the full power-up
+ * time, but may behave badly if we try. So hold off responding
+ * to insertion.
+ */
+ while (1) {
+ if (time_is_before_jiffies(pwrup))
+ break;
+ msleep(20);
+ }
+
+ ret = qib_refresh_qsfp_cache(ppd, &qd->cache);
+
+ /*
+ * Need to change LE2 back to defaults if we couldn't
+ * read the cable type (to handle cable swaps), so do this
+ * even on failure to read cable information. We don't
+ * get here for QME, so IS_QME check not needed here.
+ */
+ if (!ret && !ppd->dd->cspec->r1) {
+ if (QSFP_IS_ACTIVE_FAR(qd->cache.tech))
+ le2 = LE2_QME;
+ else if (qd->cache.atten[1] >= qib_long_atten &&
+ QSFP_IS_CU(qd->cache.tech))
+ le2 = LE2_5m;
+ else
+ le2 = LE2_DEFAULT;
+ } else
+ le2 = LE2_DEFAULT;
+ ibsd_wr_allchans(ppd, 13, (le2 << 7), BMASK(9, 7));
+ /*
+		 * We always change parameters, since we can choose
+ * values for cables without eeproms, and the cable may have
+ * changed from a cable with full or partial eeprom content
+ * to one with partial or no content.
+ */
+ init_txdds_table(ppd, 0);
+ /* The physical link is being re-enabled only when the
+ * previous state was DISABLED and the VALID bit is not
+ * set. This should only happen when the cable has been
+ * physically pulled. */
+ if (!ppd->cpspec->qsfp_data.modpresent &&
+ (ppd->lflags & (QIBL_LINKV | QIBL_IB_LINK_DISABLED))) {
+ ppd->cpspec->qsfp_data.modpresent = 1;
+ qib_set_ib_7322_lstate(ppd, 0,
+ QLOGIC_IB_IBCC_LINKINITCMD_SLEEP);
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags |= QIBL_LINKV;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ }
+ }
+}
+
+/*
+ * There is little we can do but complain to the user if QSFP
+ * initialization fails.
+ */
+static void qib_init_7322_qsfp(struct qib_pportdata *ppd)
+{
+ unsigned long flags;
+ struct qib_qsfp_data *qd = &ppd->cpspec->qsfp_data;
+ struct qib_devdata *dd = ppd->dd;
+ u64 mod_prs_bit = QSFP_GPIO_MOD_PRS_N;
+
+ mod_prs_bit <<= (QSFP_GPIO_PORT2_SHIFT * ppd->hw_pidx);
+ qd->ppd = ppd;
+ qib_qsfp_init(qd, qsfp_7322_event);
+ spin_lock_irqsave(&dd->cspec->gpio_lock, flags);
+ dd->cspec->extctrl |= (mod_prs_bit << SYM_LSB(EXTCtrl, GPIOInvert));
+ dd->cspec->gpio_mask |= mod_prs_bit;
+ qib_write_kreg(dd, kr_extctrl, dd->cspec->extctrl);
+ qib_write_kreg(dd, kr_gpio_mask, dd->cspec->gpio_mask);
+ spin_unlock_irqrestore(&dd->cspec->gpio_lock, flags);
+}
+
+/*
+ * called at device initialization time, and also if the txselect
+ * module parameter is changed. This is used for cables that don't
+ * have valid QSFP EEPROMs (not present, or attenuation is zero).
+ * We initialize to the default, then if there is a specific
+ * unit,port match, we use that (and set it immediately, for the
+ * current speed, if the link is at INIT or better).
+ * String format is "default# unit#,port#=# ... u,p=#"; the separator must
+ * be a SPACE character. A newline terminates. The u,p=# tuples may
+ * optionally have "u,p=#,#", where the final # is the H1 value.
+ * The last specific match is used (actually, all are used, but last
+ * one is the one that winds up set); if none at all, fall back on default.
+ */
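+/*
+ * Example (purely illustrative): a txselect string of
+ *	"5 0,1=9 0,2=10,7\n"
+ * selects table entry 5 as the default, entry 9 for unit 0 port 1, and
+ * entry 10 with an H1 value of 7 for unit 0 port 2.
+ */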
+static void set_no_qsfp_atten(struct qib_devdata *dd, int change)
+{
+ char *nxt, *str;
+ u32 pidx, unit, port, deflt, h1;
+ unsigned long val;
+ int any = 0, seth1;
+ int txdds_size;
+
+ str = txselect_list;
+
+ /* default number is validated in setup_txselect() */
+ deflt = simple_strtoul(str, &nxt, 0);
+ for (pidx = 0; pidx < dd->num_pports; ++pidx)
+ dd->pport[pidx].cpspec->no_eep = deflt;
+
+ txdds_size = TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ;
+ if (IS_QME(dd) || IS_QMH(dd))
+ txdds_size += TXDDS_MFG_SZ;
+
+ while (*nxt && nxt[1]) {
+ str = ++nxt;
+ unit = simple_strtoul(str, &nxt, 0);
+ if (nxt == str || !*nxt || *nxt != ',') {
+ while (*nxt && *nxt++ != ' ') /* skip to next, if any */
+ ;
+ continue;
+ }
+ str = ++nxt;
+ port = simple_strtoul(str, &nxt, 0);
+ if (nxt == str || *nxt != '=') {
+ while (*nxt && *nxt++ != ' ') /* skip to next, if any */
+ ;
+ continue;
+ }
+ str = ++nxt;
+ val = simple_strtoul(str, &nxt, 0);
+ if (nxt == str) {
+ while (*nxt && *nxt++ != ' ') /* skip to next, if any */
+ ;
+ continue;
+ }
+ if (val >= txdds_size)
+ continue;
+ seth1 = 0;
+		h1 = 0; /* gcc thinks it might be used uninitialized */
+ if (*nxt == ',' && nxt[1]) {
+ str = ++nxt;
+ h1 = (u32)simple_strtoul(str, &nxt, 0);
+ if (nxt == str)
+ while (*nxt && *nxt++ != ' ') /* skip */
+ ;
+ else
+ seth1 = 1;
+ }
+ for (pidx = 0; dd->unit == unit && pidx < dd->num_pports;
+ ++pidx) {
+ struct qib_pportdata *ppd = &dd->pport[pidx];
+
+ if (ppd->port != port || !ppd->link_speed_supported)
+ continue;
+ ppd->cpspec->no_eep = val;
+ if (seth1)
+ ppd->cpspec->h1_val = h1;
+ /* now change the IBC and serdes, overriding generic */
+ init_txdds_table(ppd, 1);
+ /* Re-enable the physical state machine on mezz boards
+ * now that the correct settings have been set.
+			 * QSFP boards are handled by the QSFP event handler */
+ if (IS_QMH(dd) || IS_QME(dd))
+ qib_set_ib_7322_lstate(ppd, 0,
+ QLOGIC_IB_IBCC_LINKINITCMD_SLEEP);
+ any++;
+ }
+ if (*nxt == '\n')
+ break; /* done */
+ }
+ if (change && !any) {
+ /* no specific setting, use the default.
+ * Change the IBC and serdes, but since it's
+ * general, don't override specific settings.
+ */
+ for (pidx = 0; pidx < dd->num_pports; ++pidx)
+ if (dd->pport[pidx].link_speed_supported)
+ init_txdds_table(&dd->pport[pidx], 0);
+ }
+}
+
+/* handle the txselect parameter changing */
+static int setup_txselect(const char *str, struct kernel_param *kp)
+{
+ struct qib_devdata *dd;
+ unsigned long val;
+ char *n;
+
+ if (strlen(str) >= MAX_ATTEN_LEN) {
+ pr_info("txselect_values string too long\n");
+ return -ENOSPC;
+ }
+ val = simple_strtoul(str, &n, 0);
+ if (n == str || val >= (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ +
+ TXDDS_MFG_SZ)) {
+ pr_info("txselect_values must start with a number < %d\n",
+ TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ + TXDDS_MFG_SZ);
+ return -EINVAL;
+ }
+ strcpy(txselect_list, str);
+
+ list_for_each_entry(dd, &qib_dev_list, list)
+ if (dd->deviceid == PCI_DEVICE_ID_QLOGIC_IB_7322)
+ set_no_qsfp_atten(dd, 1);
+ return 0;
+}
+
+/*
+ * Write the final few registers that depend on some of the
+ * init setup. Done late in init, just before bringing up
+ * the serdes.
+ */
+static int qib_late_7322_initreg(struct qib_devdata *dd)
+{
+ int ret = 0, n;
+ u64 val;
+
+ qib_write_kreg(dd, kr_rcvhdrentsize, dd->rcvhdrentsize);
+ qib_write_kreg(dd, kr_rcvhdrsize, dd->rcvhdrsize);
+ qib_write_kreg(dd, kr_rcvhdrcnt, dd->rcvhdrcnt);
+ qib_write_kreg(dd, kr_sendpioavailaddr, dd->pioavailregs_phys);
+ val = qib_read_kreg64(dd, kr_sendpioavailaddr);
+ if (val != dd->pioavailregs_phys) {
+ qib_dev_err(dd,
+ "Catastrophic software error, SendPIOAvailAddr written as %lx, read back as %llx\n",
+ (unsigned long) dd->pioavailregs_phys,
+ (unsigned long long) val);
+ ret = -EINVAL;
+ }
+
+ n = dd->piobcnt2k + dd->piobcnt4k + NUM_VL15_BUFS;
+ qib_7322_txchk_change(dd, 0, n, TXCHK_CHG_TYPE_KERN, NULL);
+	/* driver sends also get pkey, lid, etc. checking, to catch bugs */
+ qib_7322_txchk_change(dd, 0, n, TXCHK_CHG_TYPE_ENAB1, NULL);
+
+ qib_register_observer(dd, &sendctrl_0_observer);
+ qib_register_observer(dd, &sendctrl_1_observer);
+
+ dd->control &= ~QLOGIC_IB_C_SDMAFETCHPRIOEN;
+ qib_write_kreg(dd, kr_control, dd->control);
+ /*
+ * Set SendDmaFetchPriority and init Tx params, including
+ * QSFP handler on boards that have QSFP.
+ * First set our default attenuation entry for cables that
+ * don't have valid attenuation.
+ */
+ set_no_qsfp_atten(dd, 0);
+ for (n = 0; n < dd->num_pports; ++n) {
+ struct qib_pportdata *ppd = dd->pport + n;
+
+ qib_write_kreg_port(ppd, krp_senddmaprioritythld,
+ sdma_fetch_prio & 0xf);
+ /* Initialize qsfp if present on board. */
+ if (dd->flags & QIB_HAS_QSFP)
+ qib_init_7322_qsfp(ppd);
+ }
+ dd->control |= QLOGIC_IB_C_SDMAFETCHPRIOEN;
+ qib_write_kreg(dd, kr_control, dd->control);
+
+ return ret;
+}
+
+/* per IB port errors. */
+#define SENDCTRL_PIBP (MASK_ACROSS(0, 1) | MASK_ACROSS(3, 3) | \
+ MASK_ACROSS(8, 15))
+#define RCVCTRL_PIBP (MASK_ACROSS(0, 17) | MASK_ACROSS(39, 41))
+#define ERRS_PIBP (MASK_ACROSS(57, 58) | MASK_ACROSS(54, 54) | \
+ MASK_ACROSS(36, 49) | MASK_ACROSS(29, 34) | MASK_ACROSS(14, 17) | \
+ MASK_ACROSS(0, 11))
+
+/*
+ * Write the initialization per-port registers that need to be done at
+ * driver load and after reset completes (i.e., that aren't done as part
+ * of other init procedures called from qib_init.c).
+ * Some of these should be redundant on reset, but play safe.
+ */
+static void write_7322_init_portregs(struct qib_pportdata *ppd)
+{
+ u64 val;
+ int i;
+
+ if (!ppd->link_speed_supported) {
+ /* no buffer credits for this port */
+ for (i = 1; i < 8; i++)
+ qib_write_kreg_port(ppd, krp_rxcreditvl0 + i, 0);
+ qib_write_kreg_port(ppd, krp_ibcctrl_b, 0);
+ qib_write_kreg(ppd->dd, kr_scratch, 0);
+ return;
+ }
+
+ /*
+ * Set the number of supported virtual lanes in IBC,
+ * for flow control packet handling on unsupported VLs
+ */
+ val = qib_read_kreg_port(ppd, krp_ibsdtestiftx);
+ val &= ~SYM_MASK(IB_SDTEST_IF_TX_0, VL_CAP);
+ val |= (u64)(ppd->vls_supported - 1) <<
+ SYM_LSB(IB_SDTEST_IF_TX_0, VL_CAP);
+ qib_write_kreg_port(ppd, krp_ibsdtestiftx, val);
+
+ qib_write_kreg_port(ppd, krp_rcvbthqp, QIB_KD_QP);
+
+ /* enable tx header checking */
+ qib_write_kreg_port(ppd, krp_sendcheckcontrol, IBA7322_SENDCHK_PKEY |
+ IBA7322_SENDCHK_BTHQP | IBA7322_SENDCHK_SLID |
+ IBA7322_SENDCHK_RAW_IPV6 | IBA7322_SENDCHK_MINSZ);
+
+ qib_write_kreg_port(ppd, krp_ncmodectrl,
+ SYM_MASK(IBNCModeCtrl_0, ScrambleCapLocal));
+
+ /*
+ * Unconditionally clear the bufmask bits. If SDMA is
+ * enabled, we'll set them appropriately later.
+ */
+ qib_write_kreg_port(ppd, krp_senddmabufmask0, 0);
+ qib_write_kreg_port(ppd, krp_senddmabufmask1, 0);
+ qib_write_kreg_port(ppd, krp_senddmabufmask2, 0);
+ if (ppd->dd->cspec->r1)
+ ppd->p_sendctrl |= SYM_MASK(SendCtrl_0, ForceCreditUpToDate);
+}
+
+/*
+ * Write the initialization per-device registers that need to be done at
+ * driver load and after reset completes (i.e., that aren't done as part
+ * of other init procedures called from qib_init.c). Also write per-port
+ * registers that are affected by overall device config, such as QP mapping.
+ * Some of these should be redundant on reset, but play safe.
+ */
+static void write_7322_initregs(struct qib_devdata *dd)
+{
+ struct qib_pportdata *ppd;
+ int i, pidx;
+ u64 val;
+
+ /* Set Multicast QPs received by port 2 to map to context one. */
+ qib_write_kreg(dd, KREG_IDX(RcvQPMulticastContext_1), 1);
+
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ unsigned n, regno;
+ unsigned long flags;
+
+ if (dd->n_krcv_queues < 2 ||
+ !dd->pport[pidx].link_speed_supported)
+ continue;
+
+ ppd = &dd->pport[pidx];
+
+ /* be paranoid against later code motion, etc. */
+ spin_lock_irqsave(&dd->cspec->rcvmod_lock, flags);
+ ppd->p_rcvctrl |= SYM_MASK(RcvCtrl_0, RcvQPMapEnable);
+ spin_unlock_irqrestore(&dd->cspec->rcvmod_lock, flags);
+
+ /* Initialize QP to context mapping */
+ regno = krp_rcvqpmaptable;
+ val = 0;
+ if (dd->num_pports > 1)
+ n = dd->first_user_ctxt / dd->num_pports;
+ else
+ n = dd->first_user_ctxt - 1;
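+		/*
+		 * Each RcvQPMapTable register holds six 5-bit context
+		 * numbers; a register is written out every six entries,
+		 * 32 entries in all.
+		 */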
+ for (i = 0; i < 32; ) {
+ unsigned ctxt;
+
+ if (dd->num_pports > 1)
+ ctxt = (i % n) * dd->num_pports + pidx;
+ else if (i % n)
+ ctxt = (i % n) + 1;
+ else
+ ctxt = ppd->hw_pidx;
+ val |= ctxt << (5 * (i % 6));
+ i++;
+ if (i % 6 == 0) {
+ qib_write_kreg_port(ppd, regno, val);
+ val = 0;
+ regno++;
+ }
+ }
+ qib_write_kreg_port(ppd, regno, val);
+ }
+
+ /*
+	 * Set up interrupt mitigation for kernel contexts, but
+	 * not user contexts (user contexts use interrupts when
+	 * stalled waiting for any packet, so we want those interrupts
+	 * right away).
+ */
+ for (i = 0; i < dd->first_user_ctxt; i++) {
+ dd->cspec->rcvavail_timeout[i] = rcv_int_timeout;
+ qib_write_kreg(dd, kr_rcvavailtimeout + i, rcv_int_timeout);
+ }
+
+ /*
+	 * Initialize the rcvflow tables as disabled. Application code
+	 * will set up each flow as it uses the flow.
+ * Doesn't clear any of the error bits that might be set.
+ */
+ val = TIDFLOW_ERRBITS; /* these are W1C */
+ for (i = 0; i < dd->cfgctxts; i++) {
+ int flow;
+
+ for (flow = 0; flow < NUM_TIDFLOWS_CTXT; flow++)
+ qib_write_ureg(dd, ur_rcvflowtable+flow, val, i);
+ }
+
+ /*
+	 * Dual-port cards init to dual-port recovery, single-port cards
+	 * to the one port. Dual-port cards may later adjust to one port,
+	 * and then back to dual port if both ports are connected.
+	 */
+ if (dd->num_pports)
+ setup_7322_link_recovery(dd->pport, dd->num_pports > 1);
+}
+
+static int qib_init_7322_variables(struct qib_devdata *dd)
+{
+ struct qib_pportdata *ppd;
+ unsigned features, pidx, sbufcnt;
+ int ret, mtu;
+ u32 sbufs, updthresh;
+ resource_size_t vl15off;
+
+ /* pport structs are contiguous, allocated after devdata */
+ ppd = (struct qib_pportdata *)(dd + 1);
+ dd->pport = ppd;
+ ppd[0].dd = dd;
+ ppd[1].dd = dd;
+
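+	/*
+	 * chip_specific follows the two pport structs; the two
+	 * chippport_specific structs follow it in the same allocation.
+	 */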
+ dd->cspec = (struct qib_chip_specific *)(ppd + 2);
+
+ ppd[0].cpspec = (struct qib_chippport_specific *)(dd->cspec + 1);
+ ppd[1].cpspec = &ppd[0].cpspec[1];
+ ppd[0].cpspec->ppd = &ppd[0]; /* for autoneg_7322_work() */
+ ppd[1].cpspec->ppd = &ppd[1]; /* for autoneg_7322_work() */
+
+ spin_lock_init(&dd->cspec->rcvmod_lock);
+ spin_lock_init(&dd->cspec->gpio_lock);
+
+ /* we haven't yet set QIB_PRESENT, so use read directly */
+ dd->revision = readq(&dd->kregbase[kr_revision]);
+
+ if ((dd->revision & 0xffffffffU) == 0xffffffffU) {
+ qib_dev_err(dd,
+ "Revision register read failure, giving up initialization\n");
+ ret = -ENODEV;
+ goto bail;
+ }
+ dd->flags |= QIB_PRESENT; /* now register routines work */
+
+ dd->majrev = (u8) SYM_FIELD(dd->revision, Revision_R, ChipRevMajor);
+ dd->minrev = (u8) SYM_FIELD(dd->revision, Revision_R, ChipRevMinor);
+ dd->cspec->r1 = dd->minrev == 1;
+
+ get_7322_chip_params(dd);
+ features = qib_7322_boardname(dd);
+
+ /* now that piobcnt2k and 4k set, we can allocate these */
+ sbufcnt = dd->piobcnt2k + dd->piobcnt4k +
+ NUM_VL15_BUFS + BITS_PER_LONG - 1;
+ sbufcnt /= BITS_PER_LONG;
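+	/* sbufcnt is now the number of longs in each per-buffer bitmap */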
+ dd->cspec->sendchkenable = kmalloc(sbufcnt *
+ sizeof(*dd->cspec->sendchkenable), GFP_KERNEL);
+ dd->cspec->sendgrhchk = kmalloc(sbufcnt *
+ sizeof(*dd->cspec->sendgrhchk), GFP_KERNEL);
+ dd->cspec->sendibchk = kmalloc(sbufcnt *
+ sizeof(*dd->cspec->sendibchk), GFP_KERNEL);
+ if (!dd->cspec->sendchkenable || !dd->cspec->sendgrhchk ||
+ !dd->cspec->sendibchk) {
+ qib_dev_err(dd, "Failed allocation for hdrchk bitmaps\n");
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ ppd = dd->pport;
+
+ /*
+ * GPIO bits for TWSI data and clock,
+ * used for serial EEPROM.
+ */
+ dd->gpio_sda_num = _QIB_GPIO_SDA_NUM;
+ dd->gpio_scl_num = _QIB_GPIO_SCL_NUM;
+ dd->twsi_eeprom_dev = QIB_TWSI_EEPROM_DEV;
+
+ dd->flags |= QIB_HAS_INTX | QIB_HAS_LINK_LATENCY |
+ QIB_NODMA_RTAIL | QIB_HAS_VLSUPP | QIB_HAS_HDRSUPP |
+ QIB_HAS_THRESH_UPDATE |
+ (sdma_idle_cnt ? QIB_HAS_SDMA_TIMEOUT : 0);
+ dd->flags |= qib_special_trigger ?
+ QIB_USE_SPCL_TRIG : QIB_HAS_SEND_DMA;
+
+ /*
+ * Setup initial values. These may change when PAT is enabled, but
+ * we need these to do initial chip register accesses.
+ */
+ qib_7322_set_baseaddrs(dd);
+
+ mtu = ib_mtu_enum_to_int(qib_ibmtu);
+ if (mtu == -1)
+ mtu = QIB_DEFAULT_MTU;
+
+ dd->cspec->int_enable_mask = QIB_I_BITSEXTANT;
+ /* all hwerrors become interrupts, unless special purposed */
+ dd->cspec->hwerrmask = ~0ULL;
+ /* link_recovery setup causes these errors, so ignore them,
+ * other than clearing them when they occur */
+ dd->cspec->hwerrmask &=
+ ~(SYM_MASK(HwErrMask, IBSerdesPClkNotDetectMask_0) |
+ SYM_MASK(HwErrMask, IBSerdesPClkNotDetectMask_1) |
+ HWE_MASK(LATriggered));
+
+ for (pidx = 0; pidx < NUM_IB_PORTS; ++pidx) {
+ struct qib_chippport_specific *cp = ppd->cpspec;
+
+ ppd->link_speed_supported = features & PORT_SPD_CAP;
+ features >>= PORT_SPD_CAP_SHIFT;
+ if (!ppd->link_speed_supported) {
+ /* single port mode (7340, or configured) */
+ dd->skip_kctxt_mask |= 1 << pidx;
+ if (pidx == 0) {
+ /* Make sure port is disabled. */
+ qib_write_kreg_port(ppd, krp_rcvctrl, 0);
+ qib_write_kreg_port(ppd, krp_ibcctrl_a, 0);
+ ppd[0] = ppd[1];
+ dd->cspec->hwerrmask &= ~(SYM_MASK(HwErrMask,
+ IBSerdesPClkNotDetectMask_0)
+ | SYM_MASK(HwErrMask,
+ SDmaMemReadErrMask_0));
+ dd->cspec->int_enable_mask &= ~(
+ SYM_MASK(IntMask, SDmaCleanupDoneMask_0) |
+ SYM_MASK(IntMask, SDmaIdleIntMask_0) |
+ SYM_MASK(IntMask, SDmaProgressIntMask_0) |
+ SYM_MASK(IntMask, SDmaIntMask_0) |
+ SYM_MASK(IntMask, ErrIntMask_0) |
+ SYM_MASK(IntMask, SendDoneIntMask_0));
+ } else {
+ /* Make sure port is disabled. */
+ qib_write_kreg_port(ppd, krp_rcvctrl, 0);
+ qib_write_kreg_port(ppd, krp_ibcctrl_a, 0);
+ dd->cspec->hwerrmask &= ~(SYM_MASK(HwErrMask,
+ IBSerdesPClkNotDetectMask_1)
+ | SYM_MASK(HwErrMask,
+ SDmaMemReadErrMask_1));
+ dd->cspec->int_enable_mask &= ~(
+ SYM_MASK(IntMask, SDmaCleanupDoneMask_1) |
+ SYM_MASK(IntMask, SDmaIdleIntMask_1) |
+ SYM_MASK(IntMask, SDmaProgressIntMask_1) |
+ SYM_MASK(IntMask, SDmaIntMask_1) |
+ SYM_MASK(IntMask, ErrIntMask_1) |
+ SYM_MASK(IntMask, SendDoneIntMask_1));
+ }
+ continue;
+ }
+
+ dd->num_pports++;
+ ret = qib_init_pportdata(ppd, dd, pidx, dd->num_pports);
+ if (ret) {
+ dd->num_pports--;
+ goto bail;
+ }
+
+ ppd->link_width_supported = IB_WIDTH_1X | IB_WIDTH_4X;
+ ppd->link_width_enabled = IB_WIDTH_4X;
+ ppd->link_speed_enabled = ppd->link_speed_supported;
+ /*
+		 * Set the initial values to reasonable defaults; they will
+		 * be set for real when the link is up.
+ */
+ ppd->link_width_active = IB_WIDTH_4X;
+ ppd->link_speed_active = QIB_IB_SDR;
+ ppd->delay_mult = ib_rate_to_delay[IB_RATE_10_GBPS];
+ switch (qib_num_cfg_vls) {
+ case 1:
+ ppd->vls_supported = IB_VL_VL0;
+ break;
+ case 2:
+ ppd->vls_supported = IB_VL_VL0_1;
+ break;
+ default:
+ qib_devinfo(dd->pcidev,
+ "Invalid num_vls %u, using 4 VLs\n",
+ qib_num_cfg_vls);
+ qib_num_cfg_vls = 4;
+ /* fall through */
+ case 4:
+ ppd->vls_supported = IB_VL_VL0_3;
+ break;
+ case 8:
+ if (mtu <= 2048)
+ ppd->vls_supported = IB_VL_VL0_7;
+ else {
+ qib_devinfo(dd->pcidev,
+					"Invalid num_vls %u for MTU %d, using 4 VLs\n",
+ qib_num_cfg_vls, mtu);
+ ppd->vls_supported = IB_VL_VL0_3;
+ qib_num_cfg_vls = 4;
+ }
+ break;
+ }
+ ppd->vls_operational = ppd->vls_supported;
+
+ init_waitqueue_head(&cp->autoneg_wait);
+ INIT_DELAYED_WORK(&cp->autoneg_work,
+ autoneg_7322_work);
+ if (ppd->dd->cspec->r1)
+ INIT_DELAYED_WORK(&cp->ipg_work, ipg_7322_work);
+
+ /*
+ * For Mez and similar cards, no qsfp info, so do
+ * the "cable info" setup here. Can be overridden
+ * in adapter-specific routines.
+ */
+ if (!(dd->flags & QIB_HAS_QSFP)) {
+ if (!IS_QMH(dd) && !IS_QME(dd))
+ qib_devinfo(dd->pcidev,
+ "IB%u:%u: Unknown mezzanine card type\n",
+ dd->unit, ppd->port);
+ cp->h1_val = IS_QMH(dd) ? H1_FORCE_QMH : H1_FORCE_QME;
+ /*
+ * Choose center value as default tx serdes setting
+ * until changed through module parameter.
+ */
+ ppd->cpspec->no_eep = IS_QMH(dd) ?
+ TXDDS_TABLE_SZ + 2 : TXDDS_TABLE_SZ + 4;
+ } else
+ cp->h1_val = H1_FORCE_VAL;
+
+ /* Avoid writes to chip for mini_init */
+ if (!qib_mini_init)
+ write_7322_init_portregs(ppd);
+
+ init_timer(&cp->chase_timer);
+ cp->chase_timer.function = reenable_chase;
+ cp->chase_timer.data = (unsigned long)ppd;
+
+ ppd++;
+ }
+
+ dd->rcvhdrentsize = qib_rcvhdrentsize ?
+ qib_rcvhdrentsize : QIB_RCVHDR_ENTSIZE;
+ dd->rcvhdrsize = qib_rcvhdrsize ?
+ qib_rcvhdrsize : QIB_DFLT_RCVHDRSIZE;
+ dd->rhf_offset = dd->rcvhdrentsize - sizeof(u64) / sizeof(u32);
+
+ /* we always allocate at least 2048 bytes for eager buffers */
+ dd->rcvegrbufsize = max(mtu, 2048);
+ BUG_ON(!is_power_of_2(dd->rcvegrbufsize));
+ dd->rcvegrbufsize_shift = ilog2(dd->rcvegrbufsize);
+
+ qib_7322_tidtemplate(dd);
+
+ /*
+ * We can request a receive interrupt for 1 or
+ * more packets from current offset.
+ */
+ dd->rhdrhead_intr_off =
+ (u64) rcv_int_count << IBA7322_HDRHEAD_PKTINT_SHIFT;
+
+ /* setup the stats timer; the add_timer is done at end of init */
+ init_timer(&dd->stats_timer);
+ dd->stats_timer.function = qib_get_7322_faststats;
+ dd->stats_timer.data = (unsigned long) dd;
+
+ dd->ureg_align = 0x10000; /* 64KB alignment */
+
+ dd->piosize2kmax_dwords = dd->piosize2k >> 2;
+
+ qib_7322_config_ctxts(dd);
+ qib_set_ctxtcnt(dd);
+
+ /*
+ * We do not set WC on the VL15 buffers to avoid
+ * a rare problem with unaligned writes from
+ * interrupt-flushed store buffers, so we need
+ * to map those separately here. We can't solve
+ * this for the rarely used mtrr case.
+ */
+ ret = init_chip_wc_pat(dd, 0);
+ if (ret)
+ goto bail;
+
+ /* vl15 buffers start just after the 4k buffers */
+ vl15off = dd->physaddr + (dd->piobufbase >> 32) +
+ dd->piobcnt4k * dd->align4k;
+ dd->piovl15base = ioremap_nocache(vl15off,
+ NUM_VL15_BUFS * dd->align4k);
+ if (!dd->piovl15base) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ qib_7322_set_baseaddrs(dd); /* set chip access pointers now */
+
+ ret = 0;
+ if (qib_mini_init)
+ goto bail;
+ if (!dd->num_pports) {
+ qib_dev_err(dd, "No ports enabled, giving up initialization\n");
+ goto bail; /* no error, so can still figure out why err */
+ }
+
+ write_7322_initregs(dd);
+ ret = qib_create_ctxts(dd);
+ init_7322_cntrnames(dd);
+
+ updthresh = 8U; /* update threshold */
+
+	/*
+	 * Use all of the 4KB buffers for kernel SDMA (zero if !SDMA).
+	 * Reserve the larger of the update threshold or 3 buffers for
+	 * other kernel use, such as sending SMIs, MADs, and ACKs; if we
+	 * aren't enabling SDMA, use all the 4k bufs for the kernel.
+	 * If the reserve were less than the update threshold, we could
+	 * wait a long time for an update.  Coded this way because we
+	 * sometimes change the update threshold for various reasons,
+	 * and we want this to remain robust.
+	 */
+ if (dd->flags & QIB_HAS_SEND_DMA) {
+ dd->cspec->sdmabufcnt = dd->piobcnt4k;
+ sbufs = updthresh > 3 ? updthresh : 3;
+ } else {
+ dd->cspec->sdmabufcnt = 0;
+ sbufs = dd->piobcnt4k;
+ }
+ dd->cspec->lastbuf_for_pio = dd->piobcnt2k + dd->piobcnt4k -
+ dd->cspec->sdmabufcnt;
+ dd->lastctxt_piobuf = dd->cspec->lastbuf_for_pio - sbufs;
+ dd->cspec->lastbuf_for_pio--; /* range is <= , not < */
+ dd->last_pio = dd->cspec->lastbuf_for_pio;
+ dd->pbufsctxt = (dd->cfgctxts > dd->first_user_ctxt) ?
+ dd->lastctxt_piobuf / (dd->cfgctxts - dd->first_user_ctxt) : 0;
+
+ /*
+ * If we have 16 user contexts, we will have 7 sbufs
+ * per context, so reduce the update threshold to match. We
+ * want to update before we actually run out, at low pbufs/ctxt
+ * so give ourselves some margin.
+ */
+ if (dd->pbufsctxt >= 2 && dd->pbufsctxt - 2 < updthresh)
+ updthresh = dd->pbufsctxt - 2;
+ dd->cspec->updthresh_dflt = updthresh;
+ dd->cspec->updthresh = updthresh;
+
+ /* before full enable, no interrupts, no locking needed */
+ dd->sendctrl |= ((updthresh & SYM_RMASK(SendCtrl, AvailUpdThld))
+ << SYM_LSB(SendCtrl, AvailUpdThld)) |
+ SYM_MASK(SendCtrl, SendBufAvailPad64Byte);
+
+ dd->psxmitwait_supported = 1;
+ dd->psxmitwait_check_rate = QIB_7322_PSXMITWAIT_CHECK_RATE;
+bail:
+ if (!dd->ctxtcnt)
+ dd->ctxtcnt = 1; /* for other initialization code */
+
+ return ret;
+}
+
+static u32 __iomem *qib_7322_getsendbuf(struct qib_pportdata *ppd, u64 pbc,
+ u32 *pbufnum)
+{
+ u32 first, last, plen = pbc & QIB_PBC_LENGTH_MASK;
+ struct qib_devdata *dd = ppd->dd;
+
+ /* last is same for 2k and 4k, because we use 4k if all 2k busy */
+ if (pbc & PBC_7322_VL15_SEND) {
+ first = dd->piobcnt2k + dd->piobcnt4k + ppd->hw_pidx;
+ last = first;
+ } else {
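+		/* packets too big for a 2k buffer must come from the 4k pool */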
+ if ((plen + 1) > dd->piosize2kmax_dwords)
+ first = dd->piobcnt2k;
+ else
+ first = 0;
+ last = dd->cspec->lastbuf_for_pio;
+ }
+ return qib_getsendbuf_range(dd, pbufnum, first, last);
+}
+
+static void qib_set_cntr_7322_sample(struct qib_pportdata *ppd, u32 intv,
+ u32 start)
+{
+ qib_write_kreg_port(ppd, krp_psinterval, intv);
+ qib_write_kreg_port(ppd, krp_psstart, start);
+}
+
+/*
+ * Must be called with sdma_lock held, or before init finished.
+ */
+static void qib_sdma_set_7322_desc_cnt(struct qib_pportdata *ppd, unsigned cnt)
+{
+ qib_write_kreg_port(ppd, krp_senddmadesccnt, cnt);
+}
+
+/*
+ * sdma_lock should be acquired before calling this routine
+ */
+static void dump_sdma_7322_state(struct qib_pportdata *ppd)
+{
+ u64 reg, reg1, reg2;
+
+ reg = qib_read_kreg_port(ppd, krp_senddmastatus);
+ qib_dev_porterr(ppd->dd, ppd->port,
+ "SDMA senddmastatus: 0x%016llx\n", reg);
+
+ reg = qib_read_kreg_port(ppd, krp_sendctrl);
+ qib_dev_porterr(ppd->dd, ppd->port,
+ "SDMA sendctrl: 0x%016llx\n", reg);
+
+ reg = qib_read_kreg_port(ppd, krp_senddmabase);
+ qib_dev_porterr(ppd->dd, ppd->port,
+ "SDMA senddmabase: 0x%016llx\n", reg);
+
+ reg = qib_read_kreg_port(ppd, krp_senddmabufmask0);
+ reg1 = qib_read_kreg_port(ppd, krp_senddmabufmask1);
+ reg2 = qib_read_kreg_port(ppd, krp_senddmabufmask2);
+ qib_dev_porterr(ppd->dd, ppd->port,
+ "SDMA senddmabufmask 0:%llx 1:%llx 2:%llx\n",
+ reg, reg1, reg2);
+
+ /* get bufuse bits, clear them, and print them again if non-zero */
+ reg = qib_read_kreg_port(ppd, krp_senddmabuf_use0);
+ qib_write_kreg_port(ppd, krp_senddmabuf_use0, reg);
+	reg1 = qib_read_kreg_port(ppd, krp_senddmabuf_use1);
+	qib_write_kreg_port(ppd, krp_senddmabuf_use1, reg1);
+	reg2 = qib_read_kreg_port(ppd, krp_senddmabuf_use2);
+	qib_write_kreg_port(ppd, krp_senddmabuf_use2, reg2);
+ /* 0 and 1 should always be zero, so print as short form */
+ qib_dev_porterr(ppd->dd, ppd->port,
+ "SDMA current senddmabuf_use 0:%llx 1:%llx 2:%llx\n",
+ reg, reg1, reg2);
+ reg = qib_read_kreg_port(ppd, krp_senddmabuf_use0);
+ reg1 = qib_read_kreg_port(ppd, krp_senddmabuf_use1);
+ reg2 = qib_read_kreg_port(ppd, krp_senddmabuf_use2);
+ /* 0 and 1 should always be zero, so print as short form */
+ qib_dev_porterr(ppd->dd, ppd->port,
+ "SDMA cleared senddmabuf_use 0:%llx 1:%llx 2:%llx\n",
+ reg, reg1, reg2);
+
+ reg = qib_read_kreg_port(ppd, krp_senddmatail);
+ qib_dev_porterr(ppd->dd, ppd->port,
+ "SDMA senddmatail: 0x%016llx\n", reg);
+
+ reg = qib_read_kreg_port(ppd, krp_senddmahead);
+ qib_dev_porterr(ppd->dd, ppd->port,
+ "SDMA senddmahead: 0x%016llx\n", reg);
+
+ reg = qib_read_kreg_port(ppd, krp_senddmaheadaddr);
+ qib_dev_porterr(ppd->dd, ppd->port,
+ "SDMA senddmaheadaddr: 0x%016llx\n", reg);
+
+ reg = qib_read_kreg_port(ppd, krp_senddmalengen);
+ qib_dev_porterr(ppd->dd, ppd->port,
+ "SDMA senddmalengen: 0x%016llx\n", reg);
+
+ reg = qib_read_kreg_port(ppd, krp_senddmadesccnt);
+ qib_dev_porterr(ppd->dd, ppd->port,
+ "SDMA senddmadesccnt: 0x%016llx\n", reg);
+
+ reg = qib_read_kreg_port(ppd, krp_senddmaidlecnt);
+ qib_dev_porterr(ppd->dd, ppd->port,
+ "SDMA senddmaidlecnt: 0x%016llx\n", reg);
+
+ reg = qib_read_kreg_port(ppd, krp_senddmaprioritythld);
+ qib_dev_porterr(ppd->dd, ppd->port,
+		 "SDMA senddmaprioritythld: 0x%016llx\n", reg);
+
+ reg = qib_read_kreg_port(ppd, krp_senddmareloadcnt);
+ qib_dev_porterr(ppd->dd, ppd->port,
+ "SDMA senddmareloadcnt: 0x%016llx\n", reg);
+
+ dump_sdma_state(ppd);
+}
+
+static struct sdma_set_state_action sdma_7322_action_table[] = {
+ [qib_sdma_state_s00_hw_down] = {
+ .go_s99_running_tofalse = 1,
+ .op_enable = 0,
+ .op_intenable = 0,
+ .op_halt = 0,
+ .op_drain = 0,
+ },
+ [qib_sdma_state_s10_hw_start_up_wait] = {
+ .op_enable = 0,
+ .op_intenable = 1,
+ .op_halt = 1,
+ .op_drain = 0,
+ },
+ [qib_sdma_state_s20_idle] = {
+ .op_enable = 1,
+ .op_intenable = 1,
+ .op_halt = 1,
+ .op_drain = 0,
+ },
+ [qib_sdma_state_s30_sw_clean_up_wait] = {
+ .op_enable = 0,
+ .op_intenable = 1,
+ .op_halt = 1,
+ .op_drain = 0,
+ },
+ [qib_sdma_state_s40_hw_clean_up_wait] = {
+ .op_enable = 1,
+ .op_intenable = 1,
+ .op_halt = 1,
+ .op_drain = 0,
+ },
+ [qib_sdma_state_s50_hw_halt_wait] = {
+ .op_enable = 1,
+ .op_intenable = 1,
+ .op_halt = 1,
+ .op_drain = 1,
+ },
+ [qib_sdma_state_s99_running] = {
+ .op_enable = 1,
+ .op_intenable = 1,
+ .op_halt = 0,
+ .op_drain = 0,
+ .go_s99_running_totrue = 1,
+ },
+};
+
+static void qib_7322_sdma_init_early(struct qib_pportdata *ppd)
+{
+ ppd->sdma_state.set_state_action = sdma_7322_action_table;
+}
+
+static int init_sdma_7322_regs(struct qib_pportdata *ppd)
+{
+ struct qib_devdata *dd = ppd->dd;
+ unsigned lastbuf, erstbuf;
+ u64 senddmabufmask[3] = { 0 };
+ int n, ret = 0;
+
+ qib_write_kreg_port(ppd, krp_senddmabase, ppd->sdma_descq_phys);
+ qib_sdma_7322_setlengen(ppd);
+ qib_sdma_update_7322_tail(ppd, 0); /* Set SendDmaTail */
+ qib_write_kreg_port(ppd, krp_senddmareloadcnt, sdma_idle_cnt);
+ qib_write_kreg_port(ppd, krp_senddmadesccnt, 0);
+ qib_write_kreg_port(ppd, krp_senddmaheadaddr, ppd->sdma_head_phys);
+
+ if (dd->num_pports)
+ n = dd->cspec->sdmabufcnt / dd->num_pports; /* no remainder */
+ else
+ n = dd->cspec->sdmabufcnt; /* failsafe for init */
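+	/*
+	 * SDMA buffers sit at the top of the PIO buffer space.  Port 1
+	 * of a dual-port card takes the lower half of that region;
+	 * port 2 (or a single port) takes the top n buffers.
+	 */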
+ erstbuf = (dd->piobcnt2k + dd->piobcnt4k) -
+ ((dd->num_pports == 1 || ppd->port == 2) ? n :
+ dd->cspec->sdmabufcnt);
+ lastbuf = erstbuf + n;
+
+ ppd->sdma_state.first_sendbuf = erstbuf;
+ ppd->sdma_state.last_sendbuf = lastbuf;
+ for (; erstbuf < lastbuf; ++erstbuf) {
+ unsigned word = erstbuf / BITS_PER_LONG;
+ unsigned bit = erstbuf & (BITS_PER_LONG - 1);
+
+ BUG_ON(word >= 3);
+ senddmabufmask[word] |= 1ULL << bit;
+ }
+ qib_write_kreg_port(ppd, krp_senddmabufmask0, senddmabufmask[0]);
+ qib_write_kreg_port(ppd, krp_senddmabufmask1, senddmabufmask[1]);
+ qib_write_kreg_port(ppd, krp_senddmabufmask2, senddmabufmask[2]);
+ return ret;
+}
+
+/* sdma_lock must be held */
+static u16 qib_sdma_7322_gethead(struct qib_pportdata *ppd)
+{
+ struct qib_devdata *dd = ppd->dd;
+ int sane;
+ int use_dmahead;
+ u16 swhead;
+ u16 swtail;
+ u16 cnt;
+ u16 hwhead;
+
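+	/*
+	 * Prefer the head copy DMA'd to host memory while SDMA is
+	 * running; the sanity check below falls back to the chip
+	 * register if that copy looks inconsistent.
+	 */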
+ use_dmahead = __qib_sdma_running(ppd) &&
+ (dd->flags & QIB_HAS_SDMA_TIMEOUT);
+retry:
+ hwhead = use_dmahead ?
+ (u16) le64_to_cpu(*ppd->sdma_head_dma) :
+ (u16) qib_read_kreg_port(ppd, krp_senddmahead);
+
+ swhead = ppd->sdma_descq_head;
+ swtail = ppd->sdma_descq_tail;
+ cnt = ppd->sdma_descq_cnt;
+
+ if (swhead < swtail)
+ /* not wrapped */
+		sane = (hwhead >= swhead) && (hwhead <= swtail);
+ else if (swhead > swtail)
+ /* wrapped around */
+ sane = ((hwhead >= swhead) && (hwhead < cnt)) ||
+ (hwhead <= swtail);
+ else
+ /* empty */
+ sane = (hwhead == swhead);
+
+ if (unlikely(!sane)) {
+ if (use_dmahead) {
+ /* try one more time, directly from the register */
+ use_dmahead = 0;
+ goto retry;
+ }
+ /* proceed as if no progress */
+ hwhead = swhead;
+ }
+
+ return hwhead;
+}
+
+static int qib_sdma_7322_busy(struct qib_pportdata *ppd)
+{
+ u64 hwstatus = qib_read_kreg_port(ppd, krp_senddmastatus);
+
+ return (hwstatus & SYM_MASK(SendDmaStatus_0, ScoreBoardDrainInProg)) ||
+ (hwstatus & SYM_MASK(SendDmaStatus_0, HaltInProg)) ||
+ !(hwstatus & SYM_MASK(SendDmaStatus_0, InternalSDmaHalt)) ||
+ !(hwstatus & SYM_MASK(SendDmaStatus_0, ScbEmpty));
+}
+
+/*
+ * Compute the amount of delay before sending the next packet if the
+ * port's send rate differs from the static rate set for the QP.
+ * The delay affects the next packet and the amount of the delay is
+ * based on the length of this packet.
+ */
+static u32 qib_7322_setpbc_control(struct qib_pportdata *ppd, u32 plen,
+ u8 srate, u8 vl)
+{
+ u8 snd_mult = ppd->delay_mult;
+ u8 rcv_mult = ib_rate_to_delay[srate];
+ u32 ret;
+
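+	/*
+	 * Add a delay only if the QP's static rate is slower than the
+	 * port's send rate (larger delay multiplier); the delay scales
+	 * with the packet length.
+	 */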
+ ret = rcv_mult > snd_mult ? ((plen + 1) >> 1) * snd_mult : 0;
+
+ /* Indicate VL15, else set the VL in the control word */
+ if (vl == 15)
+ ret |= PBC_7322_VL15_SEND_CTRL;
+ else
+ ret |= vl << PBC_VL_NUM_LSB;
+ ret |= ((u32)(ppd->hw_pidx)) << PBC_PORT_SEL_LSB;
+
+ return ret;
+}
+
+/*
+ * Enable the per-port VL15 send buffers for use.
+ * They follow the rest of the buffers, without a config parameter.
+ * This was in initregs, but that is done before the shadow
+ * is set up, and this has to be done after the shadow is
+ * set up.
+ */
+static void qib_7322_initvl15_bufs(struct qib_devdata *dd)
+{
+ unsigned vl15bufs;
+
+ vl15bufs = dd->piobcnt2k + dd->piobcnt4k;
+ qib_chg_pioavailkernel(dd, vl15bufs, NUM_VL15_BUFS,
+ TXCHK_CHG_TYPE_KERN, NULL);
+}
+
+static void qib_7322_init_ctxt(struct qib_ctxtdata *rcd)
+{
+ if (rcd->ctxt < NUM_IB_PORTS) {
+ if (rcd->dd->num_pports > 1) {
+ rcd->rcvegrcnt = KCTXT0_EGRCNT / 2;
+ rcd->rcvegr_tid_base = rcd->ctxt ? rcd->rcvegrcnt : 0;
+ } else {
+ rcd->rcvegrcnt = KCTXT0_EGRCNT;
+ rcd->rcvegr_tid_base = 0;
+ }
+ } else {
+ rcd->rcvegrcnt = rcd->dd->cspec->rcvegrcnt;
+ rcd->rcvegr_tid_base = KCTXT0_EGRCNT +
+ (rcd->ctxt - NUM_IB_PORTS) * rcd->rcvegrcnt;
+ }
+}
+
+#define QTXSLEEPS 5000
+static void qib_7322_txchk_change(struct qib_devdata *dd, u32 start,
+ u32 len, u32 which, struct qib_ctxtdata *rcd)
+{
+ int i;
+ const int last = start + len - 1;
+ const int lastr = last / BITS_PER_LONG;
+ u32 sleeps = 0;
+ int wait = rcd != NULL;
+ unsigned long flags;
+
+ while (wait) {
+ unsigned long shadow;
+ int cstart, previ = -1;
+
+ /*
+ * when flipping from kernel to user, we can't change
+ * the checking type if the buffer is allocated to the
+ * driver. It's OK the other direction, because it's
+		 * from close, and we have just disarmed all the
+ * buffers. All the kernel to kernel changes are also
+ * OK.
+ */
+ for (cstart = start; cstart <= last; cstart++) {
+ i = ((2 * cstart) + QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT)
+ / BITS_PER_LONG;
+ if (i != previ) {
+ shadow = (unsigned long)
+ le64_to_cpu(dd->pioavailregs_dma[i]);
+ previ = i;
+ }
+ if (test_bit(((2 * cstart) +
+ QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT)
+ % BITS_PER_LONG, &shadow))
+ break;
+ }
+
+ if (cstart > last)
+ break;
+
+ if (sleeps == QTXSLEEPS)
+ break;
+ /* make sure we see an updated copy next time around */
+ sendctrl_7322_mod(dd->pport, QIB_SENDCTRL_AVAIL_BLIP);
+ sleeps++;
+ msleep(20);
+ }
+
+ switch (which) {
+ case TXCHK_CHG_TYPE_DIS1:
+ /*
+ * disable checking on a range; used by diags; just
+ * one buffer, but still written generically
+ */
+ for (i = start; i <= last; i++)
+ clear_bit(i, dd->cspec->sendchkenable);
+ break;
+
+ case TXCHK_CHG_TYPE_ENAB1:
+ /*
+ * (re)enable checking on a range; used by diags; just
+ * one buffer, but still written generically; read
+ * scratch to be sure buffer actually triggered, not
+ * just flushed from processor.
+ */
+ qib_read_kreg32(dd, kr_scratch);
+ for (i = start; i <= last; i++)
+ set_bit(i, dd->cspec->sendchkenable);
+ break;
+
+ case TXCHK_CHG_TYPE_KERN:
+ /* usable by kernel */
+ for (i = start; i <= last; i++) {
+ set_bit(i, dd->cspec->sendibchk);
+ clear_bit(i, dd->cspec->sendgrhchk);
+ }
+ spin_lock_irqsave(&dd->uctxt_lock, flags);
+ /* see if we need to raise avail update threshold */
+ for (i = dd->first_user_ctxt;
+ dd->cspec->updthresh != dd->cspec->updthresh_dflt
+ && i < dd->cfgctxts; i++)
+ if (dd->rcd[i] && dd->rcd[i]->subctxt_cnt &&
+ ((dd->rcd[i]->piocnt / dd->rcd[i]->subctxt_cnt) - 1)
+ < dd->cspec->updthresh_dflt)
+ break;
+ spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+ if (i == dd->cfgctxts) {
+ spin_lock_irqsave(&dd->sendctrl_lock, flags);
+ dd->cspec->updthresh = dd->cspec->updthresh_dflt;
+ dd->sendctrl &= ~SYM_MASK(SendCtrl, AvailUpdThld);
+ dd->sendctrl |= (dd->cspec->updthresh &
+ SYM_RMASK(SendCtrl, AvailUpdThld)) <<
+ SYM_LSB(SendCtrl, AvailUpdThld);
+ spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+ sendctrl_7322_mod(dd->pport, QIB_SENDCTRL_AVAIL_BLIP);
+ }
+ break;
+
+ case TXCHK_CHG_TYPE_USER:
+ /* for user process */
+ for (i = start; i <= last; i++) {
+ clear_bit(i, dd->cspec->sendibchk);
+ set_bit(i, dd->cspec->sendgrhchk);
+ }
+ spin_lock_irqsave(&dd->sendctrl_lock, flags);
+ if (rcd && rcd->subctxt_cnt && ((rcd->piocnt
+ / rcd->subctxt_cnt) - 1) < dd->cspec->updthresh) {
+ dd->cspec->updthresh = (rcd->piocnt /
+ rcd->subctxt_cnt) - 1;
+ dd->sendctrl &= ~SYM_MASK(SendCtrl, AvailUpdThld);
+ dd->sendctrl |= (dd->cspec->updthresh &
+ SYM_RMASK(SendCtrl, AvailUpdThld))
+ << SYM_LSB(SendCtrl, AvailUpdThld);
+ spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+ sendctrl_7322_mod(dd->pport, QIB_SENDCTRL_AVAIL_BLIP);
+ } else
+ spin_unlock_irqrestore(&dd->sendctrl_lock, flags);
+ break;
+
+ default:
+ break;
+ }
+
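+	/*
+	 * Push the updated shadow masks to the chip: for which >= 2 only
+	 * the send-check enable mask is written; for which < 2 the
+	 * GRH-check and IB-packet masks are written.
+	 */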
+ for (i = start / BITS_PER_LONG; which >= 2 && i <= lastr; ++i)
+ qib_write_kreg(dd, kr_sendcheckmask + i,
+ dd->cspec->sendchkenable[i]);
+
+ for (i = start / BITS_PER_LONG; which < 2 && i <= lastr; ++i) {
+ qib_write_kreg(dd, kr_sendgrhcheckmask + i,
+ dd->cspec->sendgrhchk[i]);
+ qib_write_kreg(dd, kr_sendibpktmask + i,
+ dd->cspec->sendibchk[i]);
+ }
+
+ /*
+ * Be sure whatever we did was seen by the chip and acted upon,
+ * before we return. Mostly important for which >= 2.
+ */
+ qib_read_kreg32(dd, kr_scratch);
+}
+
+
+/* useful for trigger analyzers, etc. */
+static void writescratch(struct qib_devdata *dd, u32 val)
+{
+ qib_write_kreg(dd, kr_scratch, val);
+}
+
+/* Dummy for now, use chip regs soon */
+static int qib_7322_tempsense_rd(struct qib_devdata *dd, int regnum)
+{
+ return -ENXIO;
+}
+
+/**
+ * qib_init_iba7322_funcs - set up the chip-specific function pointers
+ * @pdev: the pci_dev for the qlogic_ib device
+ * @ent: pci_device_id struct for this dev
+ *
+ * Also allocates, inits, and returns the devdata struct for this
+ * device instance
+ *
+ * This is global, and is called directly at init to set up the
+ * chip-specific function pointers for later use.
+ */
+struct qib_devdata *qib_init_iba7322_funcs(struct pci_dev *pdev,
+ const struct pci_device_id *ent)
+{
+ struct qib_devdata *dd;
+ int ret, i;
+ u32 tabsize, actual_cnt = 0;
+
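+	/*
+	 * The extra space is laid out in qib_init_7322_variables():
+	 * two pport structs, the chip_specific struct, then the two
+	 * chippport_specific structs.
+	 */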
+ dd = qib_alloc_devdata(pdev,
+ NUM_IB_PORTS * sizeof(struct qib_pportdata) +
+ sizeof(struct qib_chip_specific) +
+ NUM_IB_PORTS * sizeof(struct qib_chippport_specific));
+ if (IS_ERR(dd))
+ goto bail;
+
+ dd->f_bringup_serdes = qib_7322_bringup_serdes;
+ dd->f_cleanup = qib_setup_7322_cleanup;
+ dd->f_clear_tids = qib_7322_clear_tids;
+ dd->f_free_irq = qib_7322_free_irq;
+ dd->f_get_base_info = qib_7322_get_base_info;
+ dd->f_get_msgheader = qib_7322_get_msgheader;
+ dd->f_getsendbuf = qib_7322_getsendbuf;
+ dd->f_gpio_mod = gpio_7322_mod;
+ dd->f_eeprom_wen = qib_7322_eeprom_wen;
+ dd->f_hdrqempty = qib_7322_hdrqempty;
+ dd->f_ib_updown = qib_7322_ib_updown;
+ dd->f_init_ctxt = qib_7322_init_ctxt;
+ dd->f_initvl15_bufs = qib_7322_initvl15_bufs;
+ dd->f_intr_fallback = qib_7322_intr_fallback;
+ dd->f_late_initreg = qib_late_7322_initreg;
+ dd->f_setpbc_control = qib_7322_setpbc_control;
+ dd->f_portcntr = qib_portcntr_7322;
+ dd->f_put_tid = qib_7322_put_tid;
+ dd->f_quiet_serdes = qib_7322_mini_quiet_serdes;
+ dd->f_rcvctrl = rcvctrl_7322_mod;
+ dd->f_read_cntrs = qib_read_7322cntrs;
+ dd->f_read_portcntrs = qib_read_7322portcntrs;
+ dd->f_reset = qib_do_7322_reset;
+ dd->f_init_sdma_regs = init_sdma_7322_regs;
+ dd->f_sdma_busy = qib_sdma_7322_busy;
+ dd->f_sdma_gethead = qib_sdma_7322_gethead;
+ dd->f_sdma_sendctrl = qib_7322_sdma_sendctrl;
+ dd->f_sdma_set_desc_cnt = qib_sdma_set_7322_desc_cnt;
+ dd->f_sdma_update_tail = qib_sdma_update_7322_tail;
+ dd->f_sendctrl = sendctrl_7322_mod;
+ dd->f_set_armlaunch = qib_set_7322_armlaunch;
+ dd->f_set_cntr_sample = qib_set_cntr_7322_sample;
+ dd->f_iblink_state = qib_7322_iblink_state;
+ dd->f_ibphys_portstate = qib_7322_phys_portstate;
+ dd->f_get_ib_cfg = qib_7322_get_ib_cfg;
+ dd->f_set_ib_cfg = qib_7322_set_ib_cfg;
+ dd->f_set_ib_loopback = qib_7322_set_loopback;
+ dd->f_get_ib_table = qib_7322_get_ib_table;
+ dd->f_set_ib_table = qib_7322_set_ib_table;
+ dd->f_set_intr_state = qib_7322_set_intr_state;
+ dd->f_setextled = qib_setup_7322_setextled;
+ dd->f_txchk_change = qib_7322_txchk_change;
+ dd->f_update_usrhead = qib_update_7322_usrhead;
+ dd->f_wantpiobuf_intr = qib_wantpiobuf_7322_intr;
+ dd->f_xgxs_reset = qib_7322_mini_pcs_reset;
+ dd->f_sdma_hw_clean_up = qib_7322_sdma_hw_clean_up;
+ dd->f_sdma_hw_start_up = qib_7322_sdma_hw_start_up;
+ dd->f_sdma_init_early = qib_7322_sdma_init_early;
+ dd->f_writescratch = writescratch;
+ dd->f_tempsense_rd = qib_7322_tempsense_rd;
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+ dd->f_notify_dca = qib_7322_notify_dca;
+#endif
+ /*
+ * Do remaining PCIe setup and save PCIe values in dd.
+ * Any error printing is already done by the init code.
+ * On return, we have the chip mapped, but chip registers
+ * are not set up until start of qib_init_7322_variables.
+ */
+ ret = qib_pcie_ddinit(dd, pdev, ent);
+ if (ret < 0)
+ goto bail_free;
+
+ /* initialize chip-specific variables */
+ ret = qib_init_7322_variables(dd);
+ if (ret)
+ goto bail_cleanup;
+
+ if (qib_mini_init || !dd->num_pports)
+ goto bail;
+
+ /*
+ * Determine number of vectors we want; depends on port count
+ * and number of configured kernel receive queues actually used.
+ * Should also depend on whether sdma is enabled or not, but
+ * that's such a rare testing case it's not worth worrying about.
+ */
+ tabsize = dd->first_user_ctxt + ARRAY_SIZE(irq_table);
+ for (i = 0; i < tabsize; i++)
+ if ((i < ARRAY_SIZE(irq_table) &&
+ irq_table[i].port <= dd->num_pports) ||
+ (i >= ARRAY_SIZE(irq_table) &&
+ dd->rcd[i - ARRAY_SIZE(irq_table)]))
+ actual_cnt++;
+	/* reduce by kernel contexts < 2 when qib_krcvq01_no_msi is set */
+ if (qib_krcvq01_no_msi)
+ actual_cnt -= dd->num_pports;
+
+ tabsize = actual_cnt;
+ dd->cspec->msix_entries = kzalloc(tabsize *
+ sizeof(struct qib_msix_entry), GFP_KERNEL);
+ if (!dd->cspec->msix_entries) {
+ qib_dev_err(dd, "No memory for MSIx table\n");
+ tabsize = 0;
+ }
+ for (i = 0; i < tabsize; i++)
+ dd->cspec->msix_entries[i].msix.entry = i;
+
+ if (qib_pcie_params(dd, 8, &tabsize, dd->cspec->msix_entries))
+ qib_dev_err(dd,
+ "Failed to setup PCIe or interrupts; continuing anyway\n");
+ /* may be less than we wanted, if not enough available */
+ dd->cspec->num_msix_entries = tabsize;
+
+ /* setup interrupt handler */
+ qib_setup_7322_interrupt(dd, 1);
+
+ /* clear diagctrl register, in case diags were running and crashed */
+ qib_write_kreg(dd, kr_hwdiagctrl, 0);
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+ if (!dca_add_requester(&pdev->dev)) {
+ qib_devinfo(dd->pcidev, "DCA enabled\n");
+ dd->flags |= QIB_DCA_ENABLED;
+ qib_setup_dca(dd);
+ }
+#endif
+ goto bail;
+
+bail_cleanup:
+ qib_pcie_ddcleanup(dd);
+bail_free:
+ qib_free_devdata(dd);
+ dd = ERR_PTR(ret);
+bail:
+ return dd;
+}
+
+/*
+ * Set the table entry at the specified index from the table specified.
+ * There are 3 * TXDDS_TABLE_SZ entries in all per port, with the first
+ * TXDDS_TABLE_SZ for SDR, the next for DDR, and the last for QDR.
+ * 'idx' below addresses the correct entry, while its 4 LSBs select the
+ * corresponding entry (one of TXDDS_TABLE_SZ) from the selected table.
+ */
+#define DDS_ENT_AMP_LSB 14
+#define DDS_ENT_MAIN_LSB 9
+#define DDS_ENT_POST_LSB 5
+#define DDS_ENT_PRE_XTRA_LSB 3
+#define DDS_ENT_PRE_LSB 0
+
+/*
+ * Set one entry in the TxDDS table for the spec'd port.
+ * ridx picks one of the entries, while tp points
+ * to the appropriate table entry.
+ */
+static void set_txdds(struct qib_pportdata *ppd, int ridx,
+ const struct txdds_ent *tp)
+{
+ struct qib_devdata *dd = ppd->dd;
+ u32 pack_ent;
+ int regidx;
+
+ /* Get correct offset in chip-space, and in source table */
+ regidx = KREG_IBPORT_IDX(IBSD_DDS_MAP_TABLE) + ridx;
+ /*
+ * We do not use qib_write_kreg_port() because it was intended
+ * only for registers in the lower "port specific" pages.
+ * So do index calculation by hand.
+ */
+ if (ppd->hw_pidx)
+ regidx += (dd->palign / sizeof(u64));
+
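+	/* Pack the amp/main/pre/post fields at their defined LSB offsets */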
+ pack_ent = tp->amp << DDS_ENT_AMP_LSB;
+ pack_ent |= tp->main << DDS_ENT_MAIN_LSB;
+ pack_ent |= tp->pre << DDS_ENT_PRE_LSB;
+ pack_ent |= tp->post << DDS_ENT_POST_LSB;
+ qib_write_kreg(dd, regidx, pack_ent);
+ /* Prevent back-to-back writes by hitting scratch */
+ qib_write_kreg(ppd->dd, kr_scratch, 0);
+}
+
+static const struct vendor_txdds_ent vendor_txdds[] = {
+ { /* Amphenol 1m 30awg NoEq */
+ { 0x41, 0x50, 0x48 }, "584470002 ",
+ { 10, 0, 0, 5 }, { 10, 0, 0, 9 }, { 7, 1, 0, 13 },
+ },
+ { /* Amphenol 3m 28awg NoEq */
+ { 0x41, 0x50, 0x48 }, "584470004 ",
+ { 0, 0, 0, 8 }, { 0, 0, 0, 11 }, { 0, 1, 7, 15 },
+ },
+ { /* Finisar 3m OM2 Optical */
+ { 0x00, 0x90, 0x65 }, "FCBG410QB1C03-QL",
+ { 0, 0, 0, 3 }, { 0, 0, 0, 4 }, { 0, 0, 0, 13 },
+ },
+ { /* Finisar 30m OM2 Optical */
+ { 0x00, 0x90, 0x65 }, "FCBG410QB1C30-QL",
+ { 0, 0, 0, 1 }, { 0, 0, 0, 5 }, { 0, 0, 0, 11 },
+ },
+ { /* Finisar Default OM2 Optical */
+ { 0x00, 0x90, 0x65 }, NULL,
+ { 0, 0, 0, 2 }, { 0, 0, 0, 5 }, { 0, 0, 0, 12 },
+ },
+ { /* Gore 1m 30awg NoEq */
+ { 0x00, 0x21, 0x77 }, "QSN3300-1 ",
+ { 0, 0, 0, 6 }, { 0, 0, 0, 9 }, { 0, 1, 0, 15 },
+ },
+ { /* Gore 2m 30awg NoEq */
+ { 0x00, 0x21, 0x77 }, "QSN3300-2 ",
+ { 0, 0, 0, 8 }, { 0, 0, 0, 10 }, { 0, 1, 7, 15 },
+ },
+ { /* Gore 1m 28awg NoEq */
+ { 0x00, 0x21, 0x77 }, "QSN3800-1 ",
+ { 0, 0, 0, 6 }, { 0, 0, 0, 8 }, { 0, 1, 0, 15 },
+ },
+ { /* Gore 3m 28awg NoEq */
+ { 0x00, 0x21, 0x77 }, "QSN3800-3 ",
+ { 0, 0, 0, 9 }, { 0, 0, 0, 13 }, { 0, 1, 7, 15 },
+ },
+ { /* Gore 5m 24awg Eq */
+ { 0x00, 0x21, 0x77 }, "QSN7000-5 ",
+ { 0, 0, 0, 7 }, { 0, 0, 0, 9 }, { 0, 1, 3, 15 },
+ },
+ { /* Gore 7m 24awg Eq */
+ { 0x00, 0x21, 0x77 }, "QSN7000-7 ",
+ { 0, 0, 0, 9 }, { 0, 0, 0, 11 }, { 0, 2, 6, 15 },
+ },
+ { /* Gore 5m 26awg Eq */
+ { 0x00, 0x21, 0x77 }, "QSN7600-5 ",
+ { 0, 0, 0, 8 }, { 0, 0, 0, 11 }, { 0, 1, 9, 13 },
+ },
+ { /* Gore 7m 26awg Eq */
+ { 0x00, 0x21, 0x77 }, "QSN7600-7 ",
+ { 0, 0, 0, 8 }, { 0, 0, 0, 11 }, { 10, 1, 8, 15 },
+ },
+ { /* Intersil 12m 24awg Active */
+ { 0x00, 0x30, 0xB4 }, "QLX4000CQSFP1224",
+ { 0, 0, 0, 2 }, { 0, 0, 0, 5 }, { 0, 3, 0, 9 },
+ },
+ { /* Intersil 10m 28awg Active */
+ { 0x00, 0x30, 0xB4 }, "QLX4000CQSFP1028",
+ { 0, 0, 0, 6 }, { 0, 0, 0, 4 }, { 0, 2, 0, 2 },
+ },
+ { /* Intersil 7m 30awg Active */
+ { 0x00, 0x30, 0xB4 }, "QLX4000CQSFP0730",
+ { 0, 0, 0, 6 }, { 0, 0, 0, 4 }, { 0, 1, 0, 3 },
+ },
+ { /* Intersil 5m 32awg Active */
+ { 0x00, 0x30, 0xB4 }, "QLX4000CQSFP0532",
+ { 0, 0, 0, 6 }, { 0, 0, 0, 6 }, { 0, 2, 0, 8 },
+ },
+ { /* Intersil Default Active */
+ { 0x00, 0x30, 0xB4 }, NULL,
+ { 0, 0, 0, 6 }, { 0, 0, 0, 5 }, { 0, 2, 0, 5 },
+ },
+ { /* Luxtera 20m Active Optical */
+ { 0x00, 0x25, 0x63 }, NULL,
+ { 0, 0, 0, 5 }, { 0, 0, 0, 8 }, { 0, 2, 0, 12 },
+ },
+ { /* Molex 1M Cu loopback */
+ { 0x00, 0x09, 0x3A }, "74763-0025 ",
+ { 2, 2, 6, 15 }, { 2, 2, 6, 15 }, { 2, 2, 6, 15 },
+ },
+ { /* Molex 2m 28awg NoEq */
+ { 0x00, 0x09, 0x3A }, "74757-2201 ",
+ { 0, 0, 0, 6 }, { 0, 0, 0, 9 }, { 0, 1, 1, 15 },
+ },
+};
+
+static const struct txdds_ent txdds_sdr[TXDDS_TABLE_SZ] = {
+ /* amp, pre, main, post */
+ { 2, 2, 15, 6 }, /* Loopback */
+ { 0, 0, 0, 1 }, /* 2 dB */
+ { 0, 0, 0, 2 }, /* 3 dB */
+ { 0, 0, 0, 3 }, /* 4 dB */
+ { 0, 0, 0, 4 }, /* 5 dB */
+ { 0, 0, 0, 5 }, /* 6 dB */
+ { 0, 0, 0, 6 }, /* 7 dB */
+ { 0, 0, 0, 7 }, /* 8 dB */
+ { 0, 0, 0, 8 }, /* 9 dB */
+ { 0, 0, 0, 9 }, /* 10 dB */
+ { 0, 0, 0, 10 }, /* 11 dB */
+ { 0, 0, 0, 11 }, /* 12 dB */
+ { 0, 0, 0, 12 }, /* 13 dB */
+ { 0, 0, 0, 13 }, /* 14 dB */
+ { 0, 0, 0, 14 }, /* 15 dB */
+ { 0, 0, 0, 15 }, /* 16 dB */
+};
+
+static const struct txdds_ent txdds_ddr[TXDDS_TABLE_SZ] = {
+ /* amp, pre, main, post */
+ { 2, 2, 15, 6 }, /* Loopback */
+ { 0, 0, 0, 8 }, /* 2 dB */
+ { 0, 0, 0, 8 }, /* 3 dB */
+ { 0, 0, 0, 9 }, /* 4 dB */
+ { 0, 0, 0, 9 }, /* 5 dB */
+ { 0, 0, 0, 10 }, /* 6 dB */
+ { 0, 0, 0, 10 }, /* 7 dB */
+ { 0, 0, 0, 11 }, /* 8 dB */
+ { 0, 0, 0, 11 }, /* 9 dB */
+ { 0, 0, 0, 12 }, /* 10 dB */
+ { 0, 0, 0, 12 }, /* 11 dB */
+ { 0, 0, 0, 13 }, /* 12 dB */
+ { 0, 0, 0, 13 }, /* 13 dB */
+ { 0, 0, 0, 14 }, /* 14 dB */
+ { 0, 0, 0, 14 }, /* 15 dB */
+ { 0, 0, 0, 15 }, /* 16 dB */
+};
+
+static const struct txdds_ent txdds_qdr[TXDDS_TABLE_SZ] = {
+ /* amp, pre, main, post */
+ { 2, 2, 15, 6 }, /* Loopback */
+ { 0, 1, 0, 7 }, /* 2 dB (also QMH7342) */
+ { 0, 1, 0, 9 }, /* 3 dB (also QMH7342) */
+ { 0, 1, 0, 11 }, /* 4 dB */
+ { 0, 1, 0, 13 }, /* 5 dB */
+ { 0, 1, 0, 15 }, /* 6 dB */
+ { 0, 1, 3, 15 }, /* 7 dB */
+ { 0, 1, 7, 15 }, /* 8 dB */
+ { 0, 1, 7, 15 }, /* 9 dB */
+ { 0, 1, 8, 15 }, /* 10 dB */
+ { 0, 1, 9, 15 }, /* 11 dB */
+ { 0, 1, 10, 15 }, /* 12 dB */
+ { 0, 2, 6, 15 }, /* 13 dB */
+ { 0, 2, 7, 15 }, /* 14 dB */
+ { 0, 2, 8, 15 }, /* 15 dB */
+ { 0, 2, 9, 15 }, /* 16 dB */
+};
+
+/*
+ * extra entries for use with txselect, for indices >= TXDDS_TABLE_SZ.
+ * These are mostly used for mez cards going through connectors
+ * and backplane traces, but can be used to add other "unusual"
+ * table values as well.
+ */
+static const struct txdds_ent txdds_extra_sdr[TXDDS_EXTRA_SZ] = {
+ /* amp, pre, main, post */
+ { 0, 0, 0, 1 }, /* QMH7342 backplane settings */
+ { 0, 0, 0, 1 }, /* QMH7342 backplane settings */
+ { 0, 0, 0, 2 }, /* QMH7342 backplane settings */
+ { 0, 0, 0, 2 }, /* QMH7342 backplane settings */
+ { 0, 0, 0, 3 }, /* QMH7342 backplane settings */
+ { 0, 0, 0, 4 }, /* QMH7342 backplane settings */
+ { 0, 1, 4, 15 }, /* QME7342 backplane settings 1.0 */
+ { 0, 1, 3, 15 }, /* QME7342 backplane settings 1.0 */
+ { 0, 1, 0, 12 }, /* QME7342 backplane settings 1.0 */
+ { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.0 */
+ { 0, 1, 0, 9 }, /* QME7342 backplane settings 1.0 */
+ { 0, 1, 0, 14 }, /* QME7342 backplane settings 1.0 */
+ { 0, 1, 2, 15 }, /* QME7342 backplane settings 1.0 */
+ { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.1 */
+ { 0, 1, 0, 7 }, /* QME7342 backplane settings 1.1 */
+ { 0, 1, 0, 9 }, /* QME7342 backplane settings 1.1 */
+ { 0, 1, 0, 6 }, /* QME7342 backplane settings 1.1 */
+ { 0, 1, 0, 8 }, /* QME7342 backplane settings 1.1 */
+};
+
+static const struct txdds_ent txdds_extra_ddr[TXDDS_EXTRA_SZ] = {
+ /* amp, pre, main, post */
+ { 0, 0, 0, 7 }, /* QMH7342 backplane settings */
+ { 0, 0, 0, 7 }, /* QMH7342 backplane settings */
+ { 0, 0, 0, 8 }, /* QMH7342 backplane settings */
+ { 0, 0, 0, 8 }, /* QMH7342 backplane settings */
+ { 0, 0, 0, 9 }, /* QMH7342 backplane settings */
+ { 0, 0, 0, 10 }, /* QMH7342 backplane settings */
+ { 0, 1, 4, 15 }, /* QME7342 backplane settings 1.0 */
+ { 0, 1, 3, 15 }, /* QME7342 backplane settings 1.0 */
+ { 0, 1, 0, 12 }, /* QME7342 backplane settings 1.0 */
+ { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.0 */
+ { 0, 1, 0, 9 }, /* QME7342 backplane settings 1.0 */
+ { 0, 1, 0, 14 }, /* QME7342 backplane settings 1.0 */
+ { 0, 1, 2, 15 }, /* QME7342 backplane settings 1.0 */
+ { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.1 */
+ { 0, 1, 0, 7 }, /* QME7342 backplane settings 1.1 */
+ { 0, 1, 0, 9 }, /* QME7342 backplane settings 1.1 */
+ { 0, 1, 0, 6 }, /* QME7342 backplane settings 1.1 */
+ { 0, 1, 0, 8 }, /* QME7342 backplane settings 1.1 */
+};
+
+static const struct txdds_ent txdds_extra_qdr[TXDDS_EXTRA_SZ] = {
+ /* amp, pre, main, post */
+ { 0, 1, 0, 4 }, /* QMH7342 backplane settings */
+ { 0, 1, 0, 5 }, /* QMH7342 backplane settings */
+ { 0, 1, 0, 6 }, /* QMH7342 backplane settings */
+ { 0, 1, 0, 8 }, /* QMH7342 backplane settings */
+ { 0, 1, 0, 10 }, /* QMH7342 backplane settings */
+ { 0, 1, 0, 12 }, /* QMH7342 backplane settings */
+ { 0, 1, 4, 15 }, /* QME7342 backplane settings 1.0 */
+ { 0, 1, 3, 15 }, /* QME7342 backplane settings 1.0 */
+ { 0, 1, 0, 12 }, /* QME7342 backplane settings 1.0 */
+ { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.0 */
+ { 0, 1, 0, 9 }, /* QME7342 backplane settings 1.0 */
+ { 0, 1, 0, 14 }, /* QME7342 backplane settings 1.0 */
+ { 0, 1, 2, 15 }, /* QME7342 backplane settings 1.0 */
+ { 0, 1, 0, 11 }, /* QME7342 backplane settings 1.1 */
+ { 0, 1, 0, 7 }, /* QME7342 backplane settings 1.1 */
+ { 0, 1, 0, 9 }, /* QME7342 backplane settings 1.1 */
+ { 0, 1, 0, 6 }, /* QME7342 backplane settings 1.1 */
+ { 0, 1, 0, 8 }, /* QME7342 backplane settings 1.1 */
+};
+
+static const struct txdds_ent txdds_extra_mfg[TXDDS_MFG_SZ] = {
+ /* amp, pre, main, post */
+ { 0, 0, 0, 0 }, /* QME7342 mfg settings */
+ { 0, 0, 0, 6 }, /* QME7342 P2 mfg settings */
+};
+
+static const struct txdds_ent *get_atten_table(const struct txdds_ent *txdds,
+ unsigned atten)
+{
+ /*
+ * The attenuation table starts at 2dB for entry 1,
+ * with entry 0 being the loopback entry.
+ */
+ if (atten <= 2)
+ atten = 1;
+ else if (atten > TXDDS_TABLE_SZ)
+ atten = TXDDS_TABLE_SZ - 1;
+ else
+ atten--;
+ return txdds + atten;
+}
+
+/*
+ * if override is set, the module parameter txselect has a value
+ * for this specific port, so use it, rather than our normal mechanism.
+ */
+static void find_best_ent(struct qib_pportdata *ppd,
+ const struct txdds_ent **sdr_dds,
+ const struct txdds_ent **ddr_dds,
+ const struct txdds_ent **qdr_dds, int override)
+{
+ struct qib_qsfp_cache *qd = &ppd->cpspec->qsfp_data.cache;
+ int idx;
+
+ /* Search table of known cables */
+ for (idx = 0; !override && idx < ARRAY_SIZE(vendor_txdds); ++idx) {
+ const struct vendor_txdds_ent *v = vendor_txdds + idx;
+
+ if (!memcmp(v->oui, qd->oui, QSFP_VOUI_LEN) &&
+ (!v->partnum ||
+ !memcmp(v->partnum, qd->partnum, QSFP_PN_LEN))) {
+ *sdr_dds = &v->sdr;
+ *ddr_dds = &v->ddr;
+ *qdr_dds = &v->qdr;
+ return;
+ }
+ }
+
+ /* Active cables don't have attenuation so we only set SERDES
+ * settings to account for the attenuation of the board traces. */
+ if (!override && QSFP_IS_ACTIVE(qd->tech)) {
+ *sdr_dds = txdds_sdr + ppd->dd->board_atten;
+ *ddr_dds = txdds_ddr + ppd->dd->board_atten;
+ *qdr_dds = txdds_qdr + ppd->dd->board_atten;
+ return;
+ }
+
+ if (!override && QSFP_HAS_ATTEN(qd->tech) && (qd->atten[0] ||
+ qd->atten[1])) {
+ *sdr_dds = get_atten_table(txdds_sdr, qd->atten[0]);
+ *ddr_dds = get_atten_table(txdds_ddr, qd->atten[0]);
+ *qdr_dds = get_atten_table(txdds_qdr, qd->atten[1]);
+ return;
+ } else if (ppd->cpspec->no_eep < TXDDS_TABLE_SZ) {
+ /*
+ * If we have no (or incomplete) data from the cable
+ * EEPROM, or no QSFP, or override is set, use the
+		 * module parameter value to index into the attenuation
+ * table.
+ */
+ idx = ppd->cpspec->no_eep;
+ *sdr_dds = &txdds_sdr[idx];
+ *ddr_dds = &txdds_ddr[idx];
+ *qdr_dds = &txdds_qdr[idx];
+ } else if (ppd->cpspec->no_eep < (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ)) {
+ /* similar to above, but index into the "extra" table. */
+ idx = ppd->cpspec->no_eep - TXDDS_TABLE_SZ;
+ *sdr_dds = &txdds_extra_sdr[idx];
+ *ddr_dds = &txdds_extra_ddr[idx];
+ *qdr_dds = &txdds_extra_qdr[idx];
+ } else if ((IS_QME(ppd->dd) || IS_QMH(ppd->dd)) &&
+ ppd->cpspec->no_eep < (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ +
+ TXDDS_MFG_SZ)) {
+ idx = ppd->cpspec->no_eep - (TXDDS_TABLE_SZ + TXDDS_EXTRA_SZ);
+ pr_info("IB%u:%u use idx %u into txdds_mfg\n",
+ ppd->dd->unit, ppd->port, idx);
+ *sdr_dds = &txdds_extra_mfg[idx];
+ *ddr_dds = &txdds_extra_mfg[idx];
+ *qdr_dds = &txdds_extra_mfg[idx];
+ } else {
+ /* this shouldn't happen, it's range checked */
+ *sdr_dds = txdds_sdr + qib_long_atten;
+ *ddr_dds = txdds_ddr + qib_long_atten;
+ *qdr_dds = txdds_qdr + qib_long_atten;
+ }
+}
+
+static void init_txdds_table(struct qib_pportdata *ppd, int override)
+{
+ const struct txdds_ent *sdr_dds, *ddr_dds, *qdr_dds;
+ struct txdds_ent *dds;
+ int idx;
+ int single_ent = 0;
+
+ find_best_ent(ppd, &sdr_dds, &ddr_dds, &qdr_dds, override);
+
+ /* for mez cards or override, use the selected value for all entries */
+ if (!(ppd->dd->flags & QIB_HAS_QSFP) || override)
+ single_ent = 1;
+
+ /* Fill in the first entry with the best entry found. */
+ set_txdds(ppd, 0, sdr_dds);
+ set_txdds(ppd, TXDDS_TABLE_SZ, ddr_dds);
+ set_txdds(ppd, 2 * TXDDS_TABLE_SZ, qdr_dds);
+ if (ppd->lflags & (QIBL_LINKINIT | QIBL_LINKARMED |
+ QIBL_LINKACTIVE)) {
+ dds = (struct txdds_ent *)(ppd->link_speed_active ==
+ QIB_IB_QDR ? qdr_dds :
+ (ppd->link_speed_active ==
+ QIB_IB_DDR ? ddr_dds : sdr_dds));
+ write_tx_serdes_param(ppd, dds);
+ }
+
+ /* Fill in the remaining entries with the default table values. */
+ for (idx = 1; idx < ARRAY_SIZE(txdds_sdr); ++idx) {
+ set_txdds(ppd, idx, single_ent ? sdr_dds : txdds_sdr + idx);
+ set_txdds(ppd, idx + TXDDS_TABLE_SZ,
+ single_ent ? ddr_dds : txdds_ddr + idx);
+ set_txdds(ppd, idx + 2 * TXDDS_TABLE_SZ,
+ single_ent ? qdr_dds : txdds_qdr + idx);
+ }
+}
+
+#define KR_AHB_ACC KREG_IDX(ahb_access_ctrl)
+#define KR_AHB_TRANS KREG_IDX(ahb_transaction_reg)
+#define AHB_TRANS_RDY SYM_MASK(ahb_transaction_reg, ahb_rdy)
+#define AHB_ADDR_LSB SYM_LSB(ahb_transaction_reg, ahb_address)
+#define AHB_DATA_LSB SYM_LSB(ahb_transaction_reg, ahb_data)
+#define AHB_WR SYM_MASK(ahb_transaction_reg, write_not_read)
+#define AHB_TRANS_TRIES 10
+
+/*
+ * The chan argument is 0=chan0, 1=chan1, 2=pll, 3=chan2, 4=chan3,
+ * 5=subsystem, which is why most calls have "chan + (chan >> 1)"
+ * for the channel argument.
+ */
+static u32 ahb_mod(struct qib_devdata *dd, int quad, int chan, int addr,
+ u32 data, u32 mask)
+{
+ u32 rd_data, wr_data, sz_mask;
+ u64 trans, acc, prev_acc;
+ u32 ret = 0xBAD0BAD;
+ int tries;
+
+ prev_acc = qib_read_kreg64(dd, KR_AHB_ACC);
+ /* From this point on, make sure we return access */
+ acc = (quad << 1) | 1;
+ qib_write_kreg(dd, KR_AHB_ACC, acc);
+
+ for (tries = 1; tries < AHB_TRANS_TRIES; ++tries) {
+ trans = qib_read_kreg64(dd, KR_AHB_TRANS);
+ if (trans & AHB_TRANS_RDY)
+ break;
+ }
+ if (tries >= AHB_TRANS_TRIES) {
+ qib_dev_err(dd, "No ahb_rdy in %d tries\n", AHB_TRANS_TRIES);
+ goto bail;
+ }
+
+ /* If mask is not all 1s, we need to read, but different SerDes
+ * entities have different sizes
+ */
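+	/* registers in quad 1 are 32 bits wide; all others are 16 bits */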
+ sz_mask = (1UL << ((quad == 1) ? 32 : 16)) - 1;
+ wr_data = data & mask & sz_mask;
+ if ((~mask & sz_mask) != 0) {
+ trans = ((chan << 6) | addr) << (AHB_ADDR_LSB + 1);
+ qib_write_kreg(dd, KR_AHB_TRANS, trans);
+
+ for (tries = 1; tries < AHB_TRANS_TRIES; ++tries) {
+ trans = qib_read_kreg64(dd, KR_AHB_TRANS);
+ if (trans & AHB_TRANS_RDY)
+ break;
+ }
+ if (tries >= AHB_TRANS_TRIES) {
+ qib_dev_err(dd, "No Rd ahb_rdy in %d tries\n",
+ AHB_TRANS_TRIES);
+ goto bail;
+ }
+ /* Re-read in case host split reads and read data first */
+ trans = qib_read_kreg64(dd, KR_AHB_TRANS);
+ rd_data = (uint32_t)(trans >> AHB_DATA_LSB);
+ wr_data |= (rd_data & ~mask & sz_mask);
+ }
+
+ /* If mask is not zero, we need to write. */
+ if (mask & sz_mask) {
+ trans = ((chan << 6) | addr) << (AHB_ADDR_LSB + 1);
+ trans |= ((uint64_t)wr_data << AHB_DATA_LSB);
+ trans |= AHB_WR;
+ qib_write_kreg(dd, KR_AHB_TRANS, trans);
+
+ for (tries = 1; tries < AHB_TRANS_TRIES; ++tries) {
+ trans = qib_read_kreg64(dd, KR_AHB_TRANS);
+ if (trans & AHB_TRANS_RDY)
+ break;
+ }
+ if (tries >= AHB_TRANS_TRIES) {
+ qib_dev_err(dd, "No Wr ahb_rdy in %d tries\n",
+ AHB_TRANS_TRIES);
+ goto bail;
+ }
+ }
+ ret = wr_data;
+bail:
+ qib_write_kreg(dd, KR_AHB_ACC, prev_acc);
+ return ret;
+}
+
+static void ibsd_wr_allchans(struct qib_pportdata *ppd, int addr, unsigned data,
+ unsigned mask)
+{
+ struct qib_devdata *dd = ppd->dd;
+ int chan;
+ u32 rbc;
+
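+	/* write each channel, then read the register back (value unused) */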
+ for (chan = 0; chan < SERDES_CHANS; ++chan) {
+ ahb_mod(dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)), addr,
+ data, mask);
+ rbc = ahb_mod(dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)),
+ addr, 0, 0);
+ }
+}
+
+static void serdes_7322_los_enable(struct qib_pportdata *ppd, int enable)
+{
+ u64 data = qib_read_kreg_port(ppd, krp_serdesctrl);
+ u8 state = SYM_FIELD(data, IBSerdesCtrl_0, RXLOSEN);
+
+ if (enable && !state) {
+ pr_info("IB%u:%u Turning LOS on\n",
+ ppd->dd->unit, ppd->port);
+ data |= SYM_MASK(IBSerdesCtrl_0, RXLOSEN);
+ } else if (!enable && state) {
+ pr_info("IB%u:%u Turning LOS off\n",
+ ppd->dd->unit, ppd->port);
+ data &= ~SYM_MASK(IBSerdesCtrl_0, RXLOSEN);
+ }
+ qib_write_kreg_port(ppd, krp_serdesctrl, data);
+}
+
+static int serdes_7322_init(struct qib_pportdata *ppd)
+{
+ int ret = 0;
+
+ if (ppd->dd->cspec->r1)
+ ret = serdes_7322_init_old(ppd);
+ else
+ ret = serdes_7322_init_new(ppd);
+ return ret;
+}
+
+static int serdes_7322_init_old(struct qib_pportdata *ppd)
+{
+ u32 le_val;
+
+ /*
+ * Initialize the Tx DDS tables. Also done every QSFP event,
+ * for adapters with QSFP
+ */
+ init_txdds_table(ppd, 0);
+
+ /* ensure no tx overrides from earlier driver loads */
+ qib_write_kreg_port(ppd, krp_tx_deemph_override,
+ SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0,
+ reset_tx_deemphasis_override));
+
+ /* Patch some SerDes defaults to "Better for IB" */
+ /* Timing Loop Bandwidth: cdr_timing[11:9] = 0 */
+ ibsd_wr_allchans(ppd, 2, 0, BMASK(11, 9));
+
+ /* Termination: rxtermctrl_r2d addr 11 bits [12:11] = 1 */
+ ibsd_wr_allchans(ppd, 11, (1 << 11), BMASK(12, 11));
+ /* Enable LE2: rxle2en_r2a addr 13 bit [6] = 1 */
+ ibsd_wr_allchans(ppd, 13, (1 << 6), (1 << 6));
+
+ /* May be overridden in qsfp_7322_event */
+ le_val = IS_QME(ppd->dd) ? LE2_QME : LE2_DEFAULT;
+ ibsd_wr_allchans(ppd, 13, (le_val << 7), BMASK(9, 7));
+
+ /* enable LE1 adaptation for all but QME, which is disabled */
+ le_val = IS_QME(ppd->dd) ? 0 : 1;
+ ibsd_wr_allchans(ppd, 13, (le_val << 5), (1 << 5));
+
+ /* Clear cmode-override, may be set from older driver */
+ ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 0 << 14, 1 << 14);
+
+ /* Timing Recovery: rxtapsel addr 5 bits [9:8] = 0 */
+ ibsd_wr_allchans(ppd, 5, (0 << 8), BMASK(9, 8));
+
+ /* setup LoS params; these are subsystem, so chan == 5 */
+ /* LoS filter threshold_count on, ch 0-3, set to 8 */
+ ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 5, 8 << 11, BMASK(14, 11));
+ ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 7, 8 << 4, BMASK(7, 4));
+ ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 8, 8 << 11, BMASK(14, 11));
+ ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 8 << 4, BMASK(7, 4));
+
+ /* LoS filter threshold_count off, ch 0-3, set to 4 */
+ ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 6, 4 << 0, BMASK(3, 0));
+ ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 7, 4 << 8, BMASK(11, 8));
+ ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 9, 4 << 0, BMASK(3, 0));
+ ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 4 << 8, BMASK(11, 8));
+
+ /* LoS filter select enabled */
+ ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 9, 1 << 15, 1 << 15);
+
+ /* LoS target data: SDR=4, DDR=2, QDR=1 */
+ ibsd_wr_allchans(ppd, 14, (1 << 3), BMASK(5, 3)); /* QDR */
+ ibsd_wr_allchans(ppd, 20, (2 << 10), BMASK(12, 10)); /* DDR */
+ ibsd_wr_allchans(ppd, 20, (4 << 13), BMASK(15, 13)); /* SDR */
+
+ serdes_7322_los_enable(ppd, 1);
+
+	/* rxbistena; set to 0 to avoid effects of it switching later */
+ ibsd_wr_allchans(ppd, 9, 0 << 15, 1 << 15);
+
+ /* Configure 4 DFE taps, and only they adapt */
+ ibsd_wr_allchans(ppd, 16, 0 << 0, BMASK(1, 0));
+
+ /* gain hi stop 32 (22) (6:1) lo stop 7 (10:7) target 22 (13) (15:11) */
+ le_val = (ppd->dd->cspec->r1 || IS_QME(ppd->dd)) ? 0xb6c0 : 0x6bac;
+ ibsd_wr_allchans(ppd, 21, le_val, 0xfffe);
+
+ /*
+ * Set receive adaptation mode. SDR and DDR adaptation are
+ * always on, and QDR is initially enabled; later disabled.
+ */
+ qib_write_kreg_port(ppd, krp_static_adapt_dis(0), 0ULL);
+ qib_write_kreg_port(ppd, krp_static_adapt_dis(1), 0ULL);
+ qib_write_kreg_port(ppd, krp_static_adapt_dis(2),
+ ppd->dd->cspec->r1 ?
+ QDR_STATIC_ADAPT_DOWN_R1 : QDR_STATIC_ADAPT_DOWN);
+ ppd->cpspec->qdr_dfe_on = 1;
+
+ /* FLoop LOS gate: PPM filter enabled */
+ ibsd_wr_allchans(ppd, 38, 0 << 10, 1 << 10);
+
+ /* rx offset center enabled */
+ ibsd_wr_allchans(ppd, 12, 1 << 4, 1 << 4);
+
+ if (!ppd->dd->cspec->r1) {
+ ibsd_wr_allchans(ppd, 12, 1 << 12, 1 << 12);
+ ibsd_wr_allchans(ppd, 12, 2 << 8, 0x0f << 8);
+ }
+
+ /* Set the frequency loop bandwidth to 15 */
+ ibsd_wr_allchans(ppd, 2, 15 << 5, BMASK(8, 5));
+
+ return 0;
+}
+
+static int serdes_7322_init_new(struct qib_pportdata *ppd)
+{
+ unsigned long tend;
+ u32 le_val, rxcaldone;
+ int chan, chan_done = (1 << SERDES_CHANS) - 1;
+
+ /* Clear cmode-override, may be set from older driver */
+ ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 0 << 14, 1 << 14);
+
+ /* ensure no tx overrides from earlier driver loads */
+ qib_write_kreg_port(ppd, krp_tx_deemph_override,
+ SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0,
+ reset_tx_deemphasis_override));
+
+ /* START OF LSI SUGGESTED SERDES BRINGUP */
+ /* Reset - Calibration Setup */
+	/* Stop DFE adaptation */
+ ibsd_wr_allchans(ppd, 1, 0, BMASK(9, 1));
+ /* Disable LE1 */
+ ibsd_wr_allchans(ppd, 13, 0, BMASK(5, 5));
+ /* Disable autoadapt for LE1 */
+ ibsd_wr_allchans(ppd, 1, 0, BMASK(15, 15));
+ /* Disable LE2 */
+ ibsd_wr_allchans(ppd, 13, 0, BMASK(6, 6));
+ /* Disable VGA */
+ ibsd_wr_allchans(ppd, 5, 0, BMASK(0, 0));
+ /* Disable AFE Offset Cancel */
+ ibsd_wr_allchans(ppd, 12, 0, BMASK(12, 12));
+ /* Disable Timing Loop */
+ ibsd_wr_allchans(ppd, 2, 0, BMASK(3, 3));
+ /* Disable Frequency Loop */
+ ibsd_wr_allchans(ppd, 2, 0, BMASK(4, 4));
+ /* Disable Baseline Wander Correction */
+ ibsd_wr_allchans(ppd, 13, 0, BMASK(13, 13));
+ /* Disable RX Calibration */
+ ibsd_wr_allchans(ppd, 4, 0, BMASK(10, 10));
+ /* Disable RX Offset Calibration */
+ ibsd_wr_allchans(ppd, 12, 0, BMASK(4, 4));
+ /* Select BB CDR */
+ ibsd_wr_allchans(ppd, 2, (1 << 15), BMASK(15, 15));
+ /* CDR Step Size */
+ ibsd_wr_allchans(ppd, 5, 0, BMASK(9, 8));
+ /* Enable phase Calibration */
+ ibsd_wr_allchans(ppd, 12, (1 << 5), BMASK(5, 5));
+ /* DFE Bandwidth [2:14-12] */
+ ibsd_wr_allchans(ppd, 2, (4 << 12), BMASK(14, 12));
+ /* DFE Config (4 taps only) */
+ ibsd_wr_allchans(ppd, 16, 0, BMASK(1, 0));
+ /* Gain Loop Bandwidth */
+ if (!ppd->dd->cspec->r1) {
+ ibsd_wr_allchans(ppd, 12, 1 << 12, BMASK(12, 12));
+ ibsd_wr_allchans(ppd, 12, 2 << 8, BMASK(11, 8));
+ } else {
+ ibsd_wr_allchans(ppd, 19, (3 << 11), BMASK(13, 11));
+ }
+ /* Baseline Wander Correction Gain [13:4-0] (leave as default) */
+ /* Baseline Wander Correction Gain [3:7-5] (leave as default) */
+ /* Data Rate Select [5:7-6] (leave as default) */
+ /* RX Parallel Word Width [3:10-8] (leave as default) */
+
+	/* RX RESET */
+ /* Single- or Multi-channel reset */
+ /* RX Analog reset */
+ /* RX Digital reset */
+ ibsd_wr_allchans(ppd, 0, 0, BMASK(15, 13));
+ msleep(20);
+ /* RX Analog reset */
+ ibsd_wr_allchans(ppd, 0, (1 << 14), BMASK(14, 14));
+ msleep(20);
+ /* RX Digital reset */
+ ibsd_wr_allchans(ppd, 0, (1 << 13), BMASK(13, 13));
+ msleep(20);
+
+ /* setup LoS params; these are subsystem, so chan == 5 */
+ /* LoS filter threshold_count on, ch 0-3, set to 8 */
+ ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 5, 8 << 11, BMASK(14, 11));
+ ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 7, 8 << 4, BMASK(7, 4));
+ ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 8, 8 << 11, BMASK(14, 11));
+ ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 8 << 4, BMASK(7, 4));
+
+ /* LoS filter threshold_count off, ch 0-3, set to 4 */
+ ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 6, 4 << 0, BMASK(3, 0));
+ ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 7, 4 << 8, BMASK(11, 8));
+ ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 9, 4 << 0, BMASK(3, 0));
+ ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 10, 4 << 8, BMASK(11, 8));
+
+ /* LoS filter select enabled */
+ ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), 5, 9, 1 << 15, 1 << 15);
+
+ /* LoS target data: SDR=4, DDR=2, QDR=1 */
+ ibsd_wr_allchans(ppd, 14, (1 << 3), BMASK(5, 3)); /* QDR */
+ ibsd_wr_allchans(ppd, 20, (2 << 10), BMASK(12, 10)); /* DDR */
+ ibsd_wr_allchans(ppd, 20, (4 << 13), BMASK(15, 13)); /* SDR */
+
+ /* Turn on LOS on initial SERDES init */
+ serdes_7322_los_enable(ppd, 1);
+ /* FLoop LOS gate: PPM filter enabled */
+ ibsd_wr_allchans(ppd, 38, 0 << 10, 1 << 10);
+
+ /* RX LATCH CALIBRATION */
+ /* Enable Eyefinder Phase Calibration latch */
+ ibsd_wr_allchans(ppd, 15, 1, BMASK(0, 0));
+ /* Enable RX Offset Calibration latch */
+ ibsd_wr_allchans(ppd, 12, (1 << 4), BMASK(4, 4));
+ msleep(20);
+ /* Start Calibration */
+ ibsd_wr_allchans(ppd, 4, (1 << 10), BMASK(10, 10));
+ tend = jiffies + msecs_to_jiffies(500);
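+	/*
+	 * chan_done starts with one bit set per channel; a channel's bit is
+	 * cleared once it reports RX calibration done (bit 9 of register 25,
+	 * read through AHB channel address chan + (chan >> 1), i.e. 0, 1, 3, 4).
+	 * Poll for up to ~500 msec.
+	 */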
+ while (chan_done && !time_is_before_jiffies(tend)) {
+ msleep(20);
+ for (chan = 0; chan < SERDES_CHANS; ++chan) {
+ rxcaldone = ahb_mod(ppd->dd, IBSD(ppd->hw_pidx),
+ (chan + (chan >> 1)),
+ 25, 0, 0);
+ if ((~rxcaldone & (u32)BMASK(9, 9)) == 0 &&
+ (~chan_done & (1 << chan)) == 0)
+ chan_done &= ~(1 << chan);
+ }
+ }
+ if (chan_done) {
+ pr_info("Serdes %d calibration not done after .5 sec: 0x%x\n",
+ IBSD(ppd->hw_pidx), chan_done);
+ } else {
+ for (chan = 0; chan < SERDES_CHANS; ++chan) {
+ rxcaldone = ahb_mod(ppd->dd, IBSD(ppd->hw_pidx),
+ (chan + (chan >> 1)),
+ 25, 0, 0);
+ if ((~rxcaldone & (u32)BMASK(10, 10)) == 0)
+ pr_info("Serdes %d chan %d calibration failed\n",
+ IBSD(ppd->hw_pidx), chan);
+ }
+ }
+
+ /* Turn off Calibration */
+ ibsd_wr_allchans(ppd, 4, 0, BMASK(10, 10));
+ msleep(20);
+
+ /* BRING RX UP */
+ /* Set LE2 value (May be overridden in qsfp_7322_event) */
+ le_val = IS_QME(ppd->dd) ? LE2_QME : LE2_DEFAULT;
+ ibsd_wr_allchans(ppd, 13, (le_val << 7), BMASK(9, 7));
+ /* Set LE2 Loop bandwidth */
+ ibsd_wr_allchans(ppd, 3, (7 << 5), BMASK(7, 5));
+ /* Enable LE2 */
+ ibsd_wr_allchans(ppd, 13, (1 << 6), BMASK(6, 6));
+ msleep(20);
+ /* Enable H0 only */
+ ibsd_wr_allchans(ppd, 1, 1, BMASK(9, 1));
+ /* gain hi stop 32 (22) (6:1) lo stop 7 (10:7) target 22 (13) (15:11) */
+ le_val = (ppd->dd->cspec->r1 || IS_QME(ppd->dd)) ? 0xb6c0 : 0x6bac;
+ ibsd_wr_allchans(ppd, 21, le_val, 0xfffe);
+ /* Enable VGA */
+ ibsd_wr_allchans(ppd, 5, 0, BMASK(0, 0));
+ msleep(20);
+ /* Set Frequency Loop Bandwidth */
+ ibsd_wr_allchans(ppd, 2, (15 << 5), BMASK(8, 5));
+ /* Enable Frequency Loop */
+ ibsd_wr_allchans(ppd, 2, (1 << 4), BMASK(4, 4));
+ /* Set Timing Loop Bandwidth */
+ ibsd_wr_allchans(ppd, 2, 0, BMASK(11, 9));
+ /* Enable Timing Loop */
+ ibsd_wr_allchans(ppd, 2, (1 << 3), BMASK(3, 3));
+ msleep(50);
+ /* Enable DFE
+ * Set receive adaptation mode. SDR and DDR adaptation are
+ * always on, and QDR is initially enabled; later disabled.
+ */
+ qib_write_kreg_port(ppd, krp_static_adapt_dis(0), 0ULL);
+ qib_write_kreg_port(ppd, krp_static_adapt_dis(1), 0ULL);
+ qib_write_kreg_port(ppd, krp_static_adapt_dis(2),
+ ppd->dd->cspec->r1 ?
+ QDR_STATIC_ADAPT_DOWN_R1 : QDR_STATIC_ADAPT_DOWN);
+ ppd->cpspec->qdr_dfe_on = 1;
+ /* Disable LE1 */
+ ibsd_wr_allchans(ppd, 13, (0 << 5), (1 << 5));
+ /* Disable auto adapt for LE1 */
+ ibsd_wr_allchans(ppd, 1, (0 << 15), BMASK(15, 15));
+ msleep(20);
+ /* Enable AFE Offset Cancel */
+ ibsd_wr_allchans(ppd, 12, (1 << 12), BMASK(12, 12));
+ /* Enable Baseline Wander Correction */
+ ibsd_wr_allchans(ppd, 12, (1 << 13), BMASK(13, 13));
+ /* Termination: rxtermctrl_r2d addr 11 bits [12:11] = 1 */
+ ibsd_wr_allchans(ppd, 11, (1 << 11), BMASK(12, 11));
+ /* VGA output common mode */
+ ibsd_wr_allchans(ppd, 12, (3 << 2), BMASK(3, 2));
+
+ /*
+ * Initialize the Tx DDS tables. Also done every QSFP event,
+ * for adapters with QSFP
+ */
+ init_txdds_table(ppd, 0);
+
+ return 0;
+}
+
+/* start adjust QMH serdes parameters */
+
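+/*
+ * These helpers manually force the serdes H1 tap on r1 hardware:
+ * set_man_mode_h1() turns manual H1 mode on or off (register 1,
+ * bits [14:10]), set_man_code() writes the code into bits [14:9] of
+ * per-channel register 9, and clock_man() toggles bit 14 of register 4
+ * as a load clock.  force_h1() below runs that sequence on each channel.
+ */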
+static void set_man_code(struct qib_pportdata *ppd, int chan, int code)
+{
+ ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)),
+ 9, code << 9, 0x3f << 9);
+}
+
+static void set_man_mode_h1(struct qib_pportdata *ppd, int chan,
+ int enable, u32 tapenable)
+{
+ if (enable)
+ ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)),
+ 1, 3 << 10, 0x1f << 10);
+ else
+ ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)),
+ 1, 0, 0x1f << 10);
+}
+
+/* Set clock to 1, 0, 1, 0 */
+static void clock_man(struct qib_pportdata *ppd, int chan)
+{
+ ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)),
+ 4, 0x4000, 0x4000);
+ ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)),
+ 4, 0, 0x4000);
+ ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)),
+ 4, 0x4000, 0x4000);
+ ahb_mod(ppd->dd, IBSD(ppd->hw_pidx), (chan + (chan >> 1)),
+ 4, 0, 0x4000);
+}
+
+/*
+ * Write the current Tx serdes pre, post, main, amp settings into the serdes.
+ * The caller must pass the settings appropriate for the current speed,
+ * or not care if they are correct for the current speed.
+ */
+static void write_tx_serdes_param(struct qib_pportdata *ppd,
+ struct txdds_ent *txdds)
+{
+ u64 deemph;
+
+ deemph = qib_read_kreg_port(ppd, krp_tx_deemph_override);
+ /* field names for amp, main, post, pre, respectively */
+ deemph &= ~(SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, txampcntl_d2a) |
+ SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, txc0_ena) |
+ SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, txcp1_ena) |
+ SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0, txcn1_ena));
+
+ deemph |= SYM_MASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0,
+ tx_override_deemphasis_select);
+ deemph |= (txdds->amp & SYM_RMASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0,
+ txampcntl_d2a)) << SYM_LSB(IBSD_TX_DEEMPHASIS_OVERRIDE_0,
+ txampcntl_d2a);
+ deemph |= (txdds->main & SYM_RMASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0,
+ txc0_ena)) << SYM_LSB(IBSD_TX_DEEMPHASIS_OVERRIDE_0,
+ txc0_ena);
+ deemph |= (txdds->post & SYM_RMASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0,
+ txcp1_ena)) << SYM_LSB(IBSD_TX_DEEMPHASIS_OVERRIDE_0,
+ txcp1_ena);
+ deemph |= (txdds->pre & SYM_RMASK(IBSD_TX_DEEMPHASIS_OVERRIDE_0,
+ txcn1_ena)) << SYM_LSB(IBSD_TX_DEEMPHASIS_OVERRIDE_0,
+ txcn1_ena);
+ qib_write_kreg_port(ppd, krp_tx_deemph_override, deemph);
+}
+
+/*
+ * Set the parameters for mez cards on link bounce, so they are
+ * always exactly what was requested. Similar logic to init_txdds
+ * but does just the serdes.
+ */
+static void adj_tx_serdes(struct qib_pportdata *ppd)
+{
+ const struct txdds_ent *sdr_dds, *ddr_dds, *qdr_dds;
+ struct txdds_ent *dds;
+
+ find_best_ent(ppd, &sdr_dds, &ddr_dds, &qdr_dds, 1);
+ dds = (struct txdds_ent *)(ppd->link_speed_active == QIB_IB_QDR ?
+ qdr_dds : (ppd->link_speed_active == QIB_IB_DDR ?
+ ddr_dds : sdr_dds));
+ write_tx_serdes_param(ppd, dds);
+}
+
+/* set QDR forced value for H1, if needed */
+static void force_h1(struct qib_pportdata *ppd)
+{
+ int chan;
+
+ ppd->cpspec->qdr_reforce = 0;
+ if (!ppd->dd->cspec->r1)
+ return;
+
+ for (chan = 0; chan < SERDES_CHANS; chan++) {
+ set_man_mode_h1(ppd, chan, 1, 0);
+ set_man_code(ppd, chan, ppd->cpspec->h1_val);
+ clock_man(ppd, chan);
+ set_man_mode_h1(ppd, chan, 0, 0);
+ }
+}
+
+#define SJA_EN SYM_MASK(SPC_JTAG_ACCESS_REG, SPC_JTAG_ACCESS_EN)
+#define BISTEN_LSB SYM_LSB(SPC_JTAG_ACCESS_REG, bist_en)
+
+#define R_OPCODE_LSB 3
+#define R_OP_NOP 0
+#define R_OP_SHIFT 2
+#define R_OP_UPDATE 3
+#define R_TDI_LSB 2
+#define R_TDO_LSB 1
+#define R_RDY 1
+
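+/*
+ * kr_r_access is a bit-serial (JTAG-style, per SPC_JTAG_ACCESS_REG)
+ * interface: SJA_EN enables access, bisten selects a scan chain, and
+ * the SHIFT/UPDATE opcodes clock one bit per write through TDI/TDO.
+ * qib_r_grab(), qib_r_shift() and qib_r_update() below drive it to
+ * load the link-recovery patterns defined further down.
+ */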
+static int qib_r_grab(struct qib_devdata *dd)
+{
+ u64 val = SJA_EN;
+
+ qib_write_kreg(dd, kr_r_access, val);
+ qib_read_kreg32(dd, kr_scratch);
+ return 0;
+}
+
+/* qib_r_wait_for_rdy() not only waits for the ready bit, it
+ * returns the current state of R_TDO (or -1 if the ready bit never asserts)
+ */
+static int qib_r_wait_for_rdy(struct qib_devdata *dd)
+{
+ u64 val;
+ int timeout;
+
+ for (timeout = 0; timeout < 100 ; ++timeout) {
+ val = qib_read_kreg32(dd, kr_r_access);
+ if (val & R_RDY)
+ return (val >> R_TDO_LSB) & 1;
+ }
+ return -1;
+}
+
+static int qib_r_shift(struct qib_devdata *dd, int bisten,
+ int len, u8 *inp, u8 *outp)
+{
+ u64 valbase, val;
+ int ret, pos;
+
+ valbase = SJA_EN | (bisten << BISTEN_LSB) |
+ (R_OP_SHIFT << R_OPCODE_LSB);
+ ret = qib_r_wait_for_rdy(dd);
+ if (ret < 0)
+ goto bail;
+ for (pos = 0; pos < len; ++pos) {
+ val = valbase;
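+		/* ret still holds TDO as sampled by the last qib_r_wait_for_rdy() */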
+ if (outp) {
+ outp[pos >> 3] &= ~(1 << (pos & 7));
+ outp[pos >> 3] |= (ret << (pos & 7));
+ }
+ if (inp) {
+ int tdi = inp[pos >> 3] >> (pos & 7);
+
+ val |= ((tdi & 1) << R_TDI_LSB);
+ }
+ qib_write_kreg(dd, kr_r_access, val);
+ qib_read_kreg32(dd, kr_scratch);
+ ret = qib_r_wait_for_rdy(dd);
+ if (ret < 0)
+ break;
+ }
+ /* Restore to NOP between operations. */
+ val = SJA_EN | (bisten << BISTEN_LSB);
+ qib_write_kreg(dd, kr_r_access, val);
+ qib_read_kreg32(dd, kr_scratch);
+ ret = qib_r_wait_for_rdy(dd);
+
+ if (ret >= 0)
+ ret = pos;
+bail:
+ return ret;
+}
+
+static int qib_r_update(struct qib_devdata *dd, int bisten)
+{
+ u64 val;
+ int ret;
+
+ val = SJA_EN | (bisten << BISTEN_LSB) | (R_OP_UPDATE << R_OPCODE_LSB);
+ ret = qib_r_wait_for_rdy(dd);
+ if (ret >= 0) {
+ qib_write_kreg(dd, kr_r_access, val);
+ qib_read_kreg32(dd, kr_scratch);
+ }
+ return ret;
+}
+
+#define BISTEN_PORT_SEL 15
+#define LEN_PORT_SEL 625
+#define BISTEN_AT 17
+#define LEN_AT 156
+#define BISTEN_ETM 16
+#define LEN_ETM 632
+
+#define BIT2BYTE(x) (((x) + BITS_PER_BYTE - 1) / BITS_PER_BYTE)
+
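+/*
+ * The arrays below are raw scan-chain contents; the LEN_* values are
+ * bit counts, and qib_r_shift() shifts each byte out least-significant
+ * bit first.
+ */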
+/* these are common for all IB port use cases. */
+static u8 reset_at[BIT2BYTE(LEN_AT)] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00,
+};
+static u8 reset_atetm[BIT2BYTE(LEN_ETM)] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x80, 0xe3, 0x81, 0x73, 0x3c, 0x70, 0x8e,
+ 0x07, 0xce, 0xf1, 0xc0, 0x39, 0x1e, 0x38, 0xc7, 0x03, 0xe7,
+ 0x78, 0xe0, 0x1c, 0x0f, 0x9c, 0x7f, 0x80, 0x73, 0x0f, 0x70,
+ 0xde, 0x01, 0xce, 0x39, 0xc0, 0xf9, 0x06, 0x38, 0xd7, 0x00,
+ 0xe7, 0x19, 0xe0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00,
+};
+static u8 at[BIT2BYTE(LEN_AT)] = {
+ 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00,
+};
+
+/* used for IB1 or IB2, only one in use */
+static u8 atetm_1port[BIT2BYTE(LEN_ETM)] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x10, 0xf2, 0x80, 0x83, 0x1e, 0x38, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x50, 0xf4, 0x41, 0x00, 0x18, 0x78, 0xc8, 0x03,
+ 0x07, 0x7b, 0xa0, 0x3e, 0x00, 0x02, 0x00, 0x00, 0x18, 0x00,
+ 0x18, 0x00, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00,
+};
+
+/* used when both IB1 and IB2 are in use */
+static u8 atetm_2port[BIT2BYTE(LEN_ETM)] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x79,
+ 0xc0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0xf8, 0x80, 0x83, 0x1e, 0x38, 0xe0, 0x03, 0x05,
+ 0x7b, 0xa0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x80,
+ 0xa2, 0x0f, 0x50, 0xf4, 0x41, 0x00, 0x18, 0x78, 0xd1, 0x07,
+ 0x02, 0x7c, 0x80, 0x3e, 0x00, 0x02, 0x00, 0x00, 0x3e, 0x00,
+ 0x02, 0x00, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00,
+};
+
+/* used when only IB1 is in use */
+static u8 portsel_port1[BIT2BYTE(LEN_PORT_SEL)] = {
+ 0x32, 0x65, 0xa4, 0x7b, 0x10, 0x98, 0xdc, 0xfe, 0x13, 0x13,
+ 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x73, 0x0c, 0x0c, 0x0c,
+ 0x0c, 0x0c, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13,
+ 0x13, 0x78, 0x78, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13,
+ 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x74, 0x32,
+ 0x32, 0x32, 0x32, 0x32, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14,
+ 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14,
+ 0x14, 0x14, 0x9f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+};
+
+/* used when only IB2 is in use */
+static u8 portsel_port2[BIT2BYTE(LEN_PORT_SEL)] = {
+ 0x32, 0x65, 0xa4, 0x7b, 0x10, 0x98, 0xdc, 0xfe, 0x39, 0x39,
+ 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x73, 0x32, 0x32, 0x32,
+ 0x32, 0x32, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39,
+ 0x39, 0x78, 0x78, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39, 0x39,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x74, 0x32,
+ 0x32, 0x32, 0x32, 0x32, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a, 0x3a,
+ 0x3a, 0x3a, 0x9f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x01,
+};
+
+/* used when both IB1 and IB2 are in use */
+static u8 portsel_2port[BIT2BYTE(LEN_PORT_SEL)] = {
+ 0x32, 0xba, 0x54, 0x76, 0x10, 0x98, 0xdc, 0xfe, 0x13, 0x13,
+ 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x73, 0x0c, 0x0c, 0x0c,
+ 0x0c, 0x0c, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13,
+ 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13, 0x13,
+ 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x74, 0x32,
+ 0x32, 0x32, 0x32, 0x32, 0x14, 0x14, 0x14, 0x14, 0x14, 0x3a,
+ 0x3a, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14, 0x14,
+ 0x14, 0x14, 0x9f, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00,
+};
+
+/*
+ * Do setup to properly handle IB link recovery; if "both" is set, we
+ * are initializing to cover both ports; otherwise we are initializing
+ * to cover a single port card, or the port has reached INIT and we may
+ * need to switch coverage types.
+ */
+static void setup_7322_link_recovery(struct qib_pportdata *ppd, u32 both)
+{
+ u8 *portsel, *etm;
+ struct qib_devdata *dd = ppd->dd;
+
+ if (!ppd->dd->cspec->r1)
+ return;
+ if (!both) {
+ dd->cspec->recovery_ports_initted++;
+ ppd->cpspec->recovery_init = 1;
+ }
+ if (!both && dd->cspec->recovery_ports_initted == 1) {
+ portsel = ppd->port == 1 ? portsel_port1 : portsel_port2;
+ etm = atetm_1port;
+ } else {
+ portsel = portsel_2port;
+ etm = atetm_2port;
+ }
+
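+	/*
+	 * Load the chains in order: clear ETM and AT with their reset
+	 * patterns, load the port-select pattern, then the final AT and ETM
+	 * patterns, committing each shift with an UPDATE; any failure in the
+	 * sequence is reported as a single error.
+	 */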
+ if (qib_r_grab(dd) < 0 ||
+ qib_r_shift(dd, BISTEN_ETM, LEN_ETM, reset_atetm, NULL) < 0 ||
+ qib_r_update(dd, BISTEN_ETM) < 0 ||
+ qib_r_shift(dd, BISTEN_AT, LEN_AT, reset_at, NULL) < 0 ||
+ qib_r_update(dd, BISTEN_AT) < 0 ||
+ qib_r_shift(dd, BISTEN_PORT_SEL, LEN_PORT_SEL,
+ portsel, NULL) < 0 ||
+ qib_r_update(dd, BISTEN_PORT_SEL) < 0 ||
+ qib_r_shift(dd, BISTEN_AT, LEN_AT, at, NULL) < 0 ||
+ qib_r_update(dd, BISTEN_AT) < 0 ||
+ qib_r_shift(dd, BISTEN_ETM, LEN_ETM, etm, NULL) < 0 ||
+ qib_r_update(dd, BISTEN_ETM) < 0)
+ qib_dev_err(dd, "Failed IB link recovery setup\n");
+}
+
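+/*
+ * After a recovery event on a single-port setup, briefly assert
+ * FreezeMode and sample the active fault mask: if it reads back zero,
+ * mark the HCA unusable until a power cycle; otherwise clear the
+ * serdes-PClk-not-detected error, unfreeze, and take the IBC out of
+ * reset again.
+ */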
+static void check_7322_rxe_status(struct qib_pportdata *ppd)
+{
+ struct qib_devdata *dd = ppd->dd;
+ u64 fmask;
+
+ if (dd->cspec->recovery_ports_initted != 1)
+ return; /* rest doesn't apply to dualport */
+ qib_write_kreg(dd, kr_control, dd->control |
+ SYM_MASK(Control, FreezeMode));
+ (void)qib_read_kreg64(dd, kr_scratch);
+ udelay(3); /* ibcreset asserted 400ns, be sure that's over */
+ fmask = qib_read_kreg64(dd, kr_act_fmask);
+ if (!fmask) {
+ /*
+ * require a powercycle before we'll work again, and make
+ * sure we get no more interrupts, and don't turn off
+ * freeze.
+ */
+ ppd->dd->cspec->stay_in_freeze = 1;
+ qib_7322_set_intr_state(ppd->dd, 0);
+ qib_write_kreg(dd, kr_fmask, 0ULL);
+ qib_dev_err(dd, "HCA unusable until powercycled\n");
+ return; /* eventually reset */
+ }
+
+ qib_write_kreg(ppd->dd, kr_hwerrclear,
+ SYM_MASK(HwErrClear, IBSerdesPClkNotDetectClear_1));
+
+ /* don't do the full clear_freeze(), not needed for this */
+ qib_write_kreg(dd, kr_control, dd->control);
+ qib_read_kreg32(dd, kr_scratch);
+ /* take IBC out of reset */
+ if (ppd->link_speed_supported) {
+ ppd->cpspec->ibcctrl_a &=
+ ~SYM_MASK(IBCCtrlA_0, IBStatIntReductionEn);
+ qib_write_kreg_port(ppd, krp_ibcctrl_a,
+ ppd->cpspec->ibcctrl_a);
+ qib_read_kreg32(dd, kr_scratch);
+ if (ppd->lflags & QIBL_IB_LINK_DISABLED)
+ qib_set_ib_7322_lstate(ppd, 0,
+ QLOGIC_IB_IBCC_LINKINITCMD_DISABLE);
+ }
+}
diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c
new file mode 100644
index 000000000..7e00470ad
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_init.c
@@ -0,0 +1,1847 @@
+/*
+ * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/delay.h>
+#include <linux/idr.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+#include <linux/dca.h>
+#endif
+
+#include "qib.h"
+#include "qib_common.h"
+#include "qib_mad.h"
+#ifdef CONFIG_DEBUG_FS
+#include "qib_debugfs.h"
+#include "qib_verbs.h"
+#endif
+
+#undef pr_fmt
+#define pr_fmt(fmt) QIB_DRV_NAME ": " fmt
+
+/*
+ * Minimum buffers we want to have per context, after those used by the driver
+ */
+#define QIB_MIN_USER_CTXT_BUFCNT 7
+
+#define QLOGIC_IB_R_SOFTWARE_MASK 0xFF
+#define QLOGIC_IB_R_SOFTWARE_SHIFT 24
+#define QLOGIC_IB_R_EMULATOR_MASK (1ULL<<62)
+
+/*
+ * Number of ctxts we are configured to use (to allow for more pio
+ * buffers per ctxt, etc.) Zero means use chip value.
+ */
+ushort qib_cfgctxts;
+module_param_named(cfgctxts, qib_cfgctxts, ushort, S_IRUGO);
+MODULE_PARM_DESC(cfgctxts, "Set max number of contexts to use");
+
+unsigned qib_numa_aware;
+module_param_named(numa_aware, qib_numa_aware, uint, S_IRUGO);
+MODULE_PARM_DESC(numa_aware,
+ "0 -> PSM allocation close to HCA, 1 -> PSM allocation local to process");
+
+/*
+ * If set, do not write to any regs if avoidable; a hack to allow
+ * checking for deranged default register values.
+ */
+ushort qib_mini_init;
+module_param_named(mini_init, qib_mini_init, ushort, S_IRUGO);
+MODULE_PARM_DESC(mini_init, "If set, do minimal diag init");
+
+unsigned qib_n_krcv_queues;
+module_param_named(krcvqs, qib_n_krcv_queues, uint, S_IRUGO);
+MODULE_PARM_DESC(krcvqs, "number of kernel receive queues per IB port");
+
+unsigned qib_cc_table_size;
+module_param_named(cc_table_size, qib_cc_table_size, uint, S_IRUGO);
+MODULE_PARM_DESC(cc_table_size, "Congestion control table entries 0 (CCA disabled - default), min = 128, max = 1984");
+
+static void verify_interrupt(unsigned long);
+
+static struct idr qib_unit_table;
+u32 qib_cpulist_count;
+unsigned long *qib_cpulist;
+
+/* set number of contexts we'll actually use */
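+/*
+ * With cfgctxts unset, size for the kernel contexts plus one user
+ * context per online CPU, capped at what the chip supports; otherwise
+ * honor the parameter, clamped to the chip's context count (values
+ * below num_pports fall back to the chip value).
+ */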
+void qib_set_ctxtcnt(struct qib_devdata *dd)
+{
+ if (!qib_cfgctxts) {
+ dd->cfgctxts = dd->first_user_ctxt + num_online_cpus();
+ if (dd->cfgctxts > dd->ctxtcnt)
+ dd->cfgctxts = dd->ctxtcnt;
+ } else if (qib_cfgctxts < dd->num_pports)
+ dd->cfgctxts = dd->ctxtcnt;
+ else if (qib_cfgctxts <= dd->ctxtcnt)
+ dd->cfgctxts = qib_cfgctxts;
+ else
+ dd->cfgctxts = dd->ctxtcnt;
+ dd->freectxts = (dd->first_user_ctxt > dd->cfgctxts) ? 0 :
+ dd->cfgctxts - dd->first_user_ctxt;
+}
+
+/*
+ * Common code for creating the receive context array.
+ */
+int qib_create_ctxts(struct qib_devdata *dd)
+{
+ unsigned i;
+ int local_node_id = pcibus_to_node(dd->pcidev->bus);
+
+ if (local_node_id < 0)
+ local_node_id = numa_node_id();
+ dd->assigned_node_id = local_node_id;
+
+ /*
+ * Allocate full ctxtcnt array, rather than just cfgctxts, because
+ * cleanup iterates across all possible ctxts.
+ */
+ dd->rcd = kcalloc(dd->ctxtcnt, sizeof(*dd->rcd), GFP_KERNEL);
+ if (!dd->rcd) {
+ qib_dev_err(dd,
+ "Unable to allocate ctxtdata array, failing\n");
+ return -ENOMEM;
+ }
+
+ /* create (one or more) kctxt */
+ for (i = 0; i < dd->first_user_ctxt; ++i) {
+ struct qib_pportdata *ppd;
+ struct qib_ctxtdata *rcd;
+
+ if (dd->skip_kctxt_mask & (1 << i))
+ continue;
+
+ ppd = dd->pport + (i % dd->num_pports);
+
+ rcd = qib_create_ctxtdata(ppd, i, dd->assigned_node_id);
+ if (!rcd) {
+ qib_dev_err(dd,
+ "Unable to allocate ctxtdata for Kernel ctxt, failing\n");
+ kfree(dd->rcd);
+ dd->rcd = NULL;
+ return -ENOMEM;
+ }
+ rcd->pkeys[0] = QIB_DEFAULT_P_KEY;
+ rcd->seq_cnt = 1;
+ }
+ return 0;
+}
+
+/*
+ * Common code for user and kernel context setup.
+ */
+struct qib_ctxtdata *qib_create_ctxtdata(struct qib_pportdata *ppd, u32 ctxt,
+ int node_id)
+{
+ struct qib_devdata *dd = ppd->dd;
+ struct qib_ctxtdata *rcd;
+
+ rcd = kzalloc_node(sizeof(*rcd), GFP_KERNEL, node_id);
+ if (rcd) {
+ INIT_LIST_HEAD(&rcd->qp_wait_list);
+ rcd->node_id = node_id;
+ rcd->ppd = ppd;
+ rcd->dd = dd;
+ rcd->cnt = 1;
+ rcd->ctxt = ctxt;
+ dd->rcd[ctxt] = rcd;
+#ifdef CONFIG_DEBUG_FS
+ if (ctxt < dd->first_user_ctxt) { /* N/A for PSM contexts */
+ rcd->opstats = kzalloc_node(sizeof(*rcd->opstats),
+ GFP_KERNEL, node_id);
+ if (!rcd->opstats) {
+ kfree(rcd);
+ qib_dev_err(dd,
+ "Unable to allocate per ctxt stats buffer\n");
+ return NULL;
+ }
+ }
+#endif
+ dd->f_init_ctxt(rcd);
+
+ /*
+ * To avoid wasting a lot of memory, we allocate 32KB chunks
+ * of physically contiguous memory, advance through it until
+ * used up and then allocate more. Of course, we need
+ * memory to store those extra pointers, now. 32KB seems to
+ * be the most that is "safe" under memory pressure
+ * (creating large files and then copying them over
+ * NFS while doing lots of MPI jobs). The OOM killer can
+ * get invoked, even though we say we can sleep and this can
+ * cause significant system problems....
+ */
+ rcd->rcvegrbuf_size = 0x8000;
+ rcd->rcvegrbufs_perchunk =
+ rcd->rcvegrbuf_size / dd->rcvegrbufsize;
+ rcd->rcvegrbuf_chunks = (rcd->rcvegrcnt +
+ rcd->rcvegrbufs_perchunk - 1) /
+ rcd->rcvegrbufs_perchunk;
+ BUG_ON(!is_power_of_2(rcd->rcvegrbufs_perchunk));
+ rcd->rcvegrbufs_perchunk_shift =
+ ilog2(rcd->rcvegrbufs_perchunk);
+ }
+ return rcd;
+}
+
+/*
+ * Common code for initializing the physical port structure.
+ */
+int qib_init_pportdata(struct qib_pportdata *ppd, struct qib_devdata *dd,
+ u8 hw_pidx, u8 port)
+{
+ int size;
+
+ ppd->dd = dd;
+ ppd->hw_pidx = hw_pidx;
+ ppd->port = port; /* IB port number, not index */
+
+ spin_lock_init(&ppd->sdma_lock);
+ spin_lock_init(&ppd->lflags_lock);
+ spin_lock_init(&ppd->cc_shadow_lock);
+ init_waitqueue_head(&ppd->state_wait);
+
+ init_timer(&ppd->symerr_clear_timer);
+ ppd->symerr_clear_timer.function = qib_clear_symerror_on_linkup;
+ ppd->symerr_clear_timer.data = (unsigned long)ppd;
+
+ ppd->qib_wq = NULL;
+ ppd->ibport_data.pmastats =
+ alloc_percpu(struct qib_pma_counters);
+ if (!ppd->ibport_data.pmastats)
+ return -ENOMEM;
+
+ if (qib_cc_table_size < IB_CCT_MIN_ENTRIES)
+ goto bail;
+
+ ppd->cc_supported_table_entries = min(max_t(int, qib_cc_table_size,
+ IB_CCT_MIN_ENTRIES), IB_CCT_ENTRIES*IB_CC_TABLE_CAP_DEFAULT);
+
+ ppd->cc_max_table_entries =
+ ppd->cc_supported_table_entries/IB_CCT_ENTRIES;
+
+ size = IB_CC_TABLE_CAP_DEFAULT * sizeof(struct ib_cc_table_entry)
+ * IB_CCT_ENTRIES;
+ ppd->ccti_entries = kzalloc(size, GFP_KERNEL);
+ if (!ppd->ccti_entries) {
+ qib_dev_err(dd,
+ "failed to allocate congestion control table for port %d!\n",
+ port);
+ goto bail;
+ }
+
+ size = IB_CC_CCS_ENTRIES * sizeof(struct ib_cc_congestion_entry);
+ ppd->congestion_entries = kzalloc(size, GFP_KERNEL);
+ if (!ppd->congestion_entries) {
+ qib_dev_err(dd,
+ "failed to allocate congestion setting list for port %d!\n",
+ port);
+ goto bail_1;
+ }
+
+ size = sizeof(struct cc_table_shadow);
+ ppd->ccti_entries_shadow = kzalloc(size, GFP_KERNEL);
+ if (!ppd->ccti_entries_shadow) {
+ qib_dev_err(dd,
+ "failed to allocate shadow ccti list for port %d!\n",
+ port);
+ goto bail_2;
+ }
+
+ size = sizeof(struct ib_cc_congestion_setting_attr);
+ ppd->congestion_entries_shadow = kzalloc(size, GFP_KERNEL);
+ if (!ppd->congestion_entries_shadow) {
+ qib_dev_err(dd,
+ "failed to allocate shadow congestion setting list for port %d!\n",
+ port);
+ goto bail_3;
+ }
+
+ return 0;
+
+bail_3:
+ kfree(ppd->ccti_entries_shadow);
+ ppd->ccti_entries_shadow = NULL;
+bail_2:
+ kfree(ppd->congestion_entries);
+ ppd->congestion_entries = NULL;
+bail_1:
+ kfree(ppd->ccti_entries);
+ ppd->ccti_entries = NULL;
+bail:
+ /* User is intentionally disabling the congestion control agent */
+ if (!qib_cc_table_size)
+ return 0;
+
+ if (qib_cc_table_size < IB_CCT_MIN_ENTRIES) {
+ qib_cc_table_size = 0;
+ qib_dev_err(dd,
+ "Congestion Control table size %d less than minimum %d for port %d\n",
+ qib_cc_table_size, IB_CCT_MIN_ENTRIES, port);
+ }
+
+ qib_dev_err(dd, "Congestion Control Agent disabled for port %d\n",
+ port);
+ return 0;
+}
+
+static int init_pioavailregs(struct qib_devdata *dd)
+{
+ int ret, pidx;
+ u64 *status_page;
+
+ dd->pioavailregs_dma = dma_alloc_coherent(
+ &dd->pcidev->dev, PAGE_SIZE, &dd->pioavailregs_phys,
+ GFP_KERNEL);
+ if (!dd->pioavailregs_dma) {
+ qib_dev_err(dd,
+ "failed to allocate PIOavail reg area in memory\n");
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ /*
+ * We really want L2 cache aligned, but for current CPUs of
+ * interest, they are the same.
+ */
+ status_page = (u64 *)
+ ((char *) dd->pioavailregs_dma +
+ ((2 * L1_CACHE_BYTES +
+ dd->pioavregs * sizeof(u64)) & ~L1_CACHE_BYTES));
+ /* device status comes first, for backwards compatibility */
+ dd->devstatusp = status_page;
+ *status_page++ = 0;
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ dd->pport[pidx].statusp = status_page;
+ *status_page++ = 0;
+ }
+
+ /*
+ * Setup buffer to hold freeze and other messages, accessible to
+ * apps, following statusp. This is per-unit, not per port.
+ */
+ dd->freezemsg = (char *) status_page;
+ *dd->freezemsg = 0;
+ /* length of msg buffer is "whatever is left" */
+ ret = (char *) status_page - (char *) dd->pioavailregs_dma;
+ dd->freezelen = PAGE_SIZE - ret;
+
+ ret = 0;
+
+done:
+ return ret;
+}
+
+/**
+ * init_shadow_tids - allocate the shadow TID array
+ * @dd: the qlogic_ib device
+ *
+ * allocate the shadow TID array, so we can qib_munlock previous
+ * entries. It may make more sense to move the pageshadow to the
+ * ctxt data structure, so we only allocate memory for ctxts actually
+ * in use, since we are at 8k per ctxt now.
+ * We don't want failures here to prevent use of the driver/chip,
+ * so no return value.
+ */
+static void init_shadow_tids(struct qib_devdata *dd)
+{
+ struct page **pages;
+ dma_addr_t *addrs;
+
+ pages = vzalloc(dd->cfgctxts * dd->rcvtidcnt * sizeof(struct page *));
+ if (!pages) {
+ qib_dev_err(dd,
+ "failed to allocate shadow page * array, no expected sends!\n");
+ goto bail;
+ }
+
+ addrs = vzalloc(dd->cfgctxts * dd->rcvtidcnt * sizeof(dma_addr_t));
+ if (!addrs) {
+ qib_dev_err(dd,
+ "failed to allocate shadow dma handle array, no expected sends!\n");
+ goto bail_free;
+ }
+
+ dd->pageshadow = pages;
+ dd->physshadow = addrs;
+ return;
+
+bail_free:
+ vfree(pages);
+bail:
+ dd->pageshadow = NULL;
+}
+
+/*
+ * Do initialization for device that is only needed on
+ * first detect, not on resets.
+ */
+static int loadtime_init(struct qib_devdata *dd)
+{
+ int ret = 0;
+
+ if (((dd->revision >> QLOGIC_IB_R_SOFTWARE_SHIFT) &
+ QLOGIC_IB_R_SOFTWARE_MASK) != QIB_CHIP_SWVERSION) {
+ qib_dev_err(dd,
+			"Driver only handles version %d, chip swversion is %d (%llx), failing\n",
+ QIB_CHIP_SWVERSION,
+ (int)(dd->revision >>
+ QLOGIC_IB_R_SOFTWARE_SHIFT) &
+ QLOGIC_IB_R_SOFTWARE_MASK,
+ (unsigned long long) dd->revision);
+ ret = -ENOSYS;
+ goto done;
+ }
+
+ if (dd->revision & QLOGIC_IB_R_EMULATOR_MASK)
+ qib_devinfo(dd->pcidev, "%s", dd->boardversion);
+
+ spin_lock_init(&dd->pioavail_lock);
+ spin_lock_init(&dd->sendctrl_lock);
+ spin_lock_init(&dd->uctxt_lock);
+ spin_lock_init(&dd->qib_diag_trans_lock);
+ spin_lock_init(&dd->eep_st_lock);
+ mutex_init(&dd->eep_lock);
+
+ if (qib_mini_init)
+ goto done;
+
+ ret = init_pioavailregs(dd);
+ init_shadow_tids(dd);
+
+ qib_get_eeprom_info(dd);
+
+	/* set up timer (don't start yet) to verify we got an interrupt */
+ init_timer(&dd->intrchk_timer);
+ dd->intrchk_timer.function = verify_interrupt;
+ dd->intrchk_timer.data = (unsigned long) dd;
+
+ ret = qib_cq_init(dd);
+done:
+ return ret;
+}
+
+/**
+ * init_after_reset - re-initialize after a reset
+ * @dd: the qlogic_ib device
+ *
+ * sanity check at least some of the values after reset, and
+ * ensure no receive or transmit (explicitly, in case reset
+ * failed)
+ */
+static int init_after_reset(struct qib_devdata *dd)
+{
+ int i;
+
+ /*
+ * Ensure chip does no sends or receives, tail updates, or
+ * pioavail updates while we re-initialize. This is mostly
+ * for the driver data structures, not chip registers.
+ */
+ for (i = 0; i < dd->num_pports; ++i) {
+ /*
+ * ctxt == -1 means "all contexts". Only really safe for
+ * _dis_abling things, as here.
+ */
+ dd->f_rcvctrl(dd->pport + i, QIB_RCVCTRL_CTXT_DIS |
+ QIB_RCVCTRL_INTRAVAIL_DIS |
+ QIB_RCVCTRL_TAILUPD_DIS, -1);
+ /* Redundant across ports for some, but no big deal. */
+ dd->f_sendctrl(dd->pport + i, QIB_SENDCTRL_SEND_DIS |
+ QIB_SENDCTRL_AVAIL_DIS);
+ }
+
+ return 0;
+}
+
+static void enable_chip(struct qib_devdata *dd)
+{
+ u64 rcvmask;
+ int i;
+
+ /*
+ * Enable PIO send, and update of PIOavail regs to memory.
+ */
+ for (i = 0; i < dd->num_pports; ++i)
+ dd->f_sendctrl(dd->pport + i, QIB_SENDCTRL_SEND_ENB |
+ QIB_SENDCTRL_AVAIL_ENB);
+ /*
+ * Enable kernel ctxts' receive and receive interrupt.
+ * Other ctxts done as user opens and inits them.
+ */
+ rcvmask = QIB_RCVCTRL_CTXT_ENB | QIB_RCVCTRL_INTRAVAIL_ENB;
+ rcvmask |= (dd->flags & QIB_NODMA_RTAIL) ?
+ QIB_RCVCTRL_TAILUPD_DIS : QIB_RCVCTRL_TAILUPD_ENB;
+ for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) {
+ struct qib_ctxtdata *rcd = dd->rcd[i];
+
+ if (rcd)
+ dd->f_rcvctrl(rcd->ppd, rcvmask, i);
+ }
+}
+
+static void verify_interrupt(unsigned long opaque)
+{
+ struct qib_devdata *dd = (struct qib_devdata *) opaque;
+ u64 int_counter;
+
+ if (!dd)
+ return; /* being torn down */
+
+ /*
+ * If we don't have a lid or any interrupts, let the user know and
+ * don't bother checking again.
+ */
+ int_counter = qib_int_counter(dd) - dd->z_int_counter;
+ if (int_counter == 0) {
+ if (!dd->f_intr_fallback(dd))
+ dev_err(&dd->pcidev->dev,
+ "No interrupts detected, not usable.\n");
+ else /* re-arm the timer to see if fallback works */
+ mod_timer(&dd->intrchk_timer, jiffies + HZ/2);
+ }
+}
+
+static void init_piobuf_state(struct qib_devdata *dd)
+{
+ int i, pidx;
+ u32 uctxts;
+
+ /*
+ * Ensure all buffers are free, and fifos empty. Buffers
+ * are common, so only do once for port 0.
+ *
+ * After enable and qib_chg_pioavailkernel so we can safely
+ * enable pioavail updates and PIOENABLE. After this, packets
+ * are ready and able to go out.
+ */
+ dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_ALL);
+ for (pidx = 0; pidx < dd->num_pports; ++pidx)
+ dd->f_sendctrl(dd->pport + pidx, QIB_SENDCTRL_FLUSH);
+
+ /*
+	 * If not all sendbufs are used, add one extra to each of the lower
+ * numbered contexts. pbufsctxt and lastctxt_piobuf are
+ * calculated in chip-specific code because it may cause some
+ * chip-specific adjustments to be made.
+ */
+ uctxts = dd->cfgctxts - dd->first_user_ctxt;
+ dd->ctxts_extrabuf = dd->pbufsctxt ?
+ dd->lastctxt_piobuf - (dd->pbufsctxt * uctxts) : 0;
+
+ /*
+ * Set up the shadow copies of the piobufavail registers,
+ * which we compare against the chip registers for now, and
+ * the in memory DMA'ed copies of the registers.
+ * By now pioavail updates to memory should have occurred, so
+ * copy them into our working/shadow registers; this is in
+ * case something went wrong with abort, but mostly to get the
+ * initial values of the generation bit correct.
+ */
+ for (i = 0; i < dd->pioavregs; i++) {
+ __le64 tmp;
+
+ tmp = dd->pioavailregs_dma[i];
+ /*
+ * Don't need to worry about pioavailkernel here
+ * because we will call qib_chg_pioavailkernel() later
+ * in initialization, to busy out buffers as needed.
+ */
+ dd->pioavailshadow[i] = le64_to_cpu(tmp);
+ }
+ while (i < ARRAY_SIZE(dd->pioavailshadow))
+ dd->pioavailshadow[i++] = 0; /* for debugging sanity */
+
+ /* after pioavailshadow is setup */
+ qib_chg_pioavailkernel(dd, 0, dd->piobcnt2k + dd->piobcnt4k,
+ TXCHK_CHG_TYPE_KERN, NULL);
+ dd->f_initvl15_bufs(dd);
+}
+
+/**
+ * qib_create_workqueues - create per port workqueues
+ * @dd: the qlogic_ib device
+ */
+static int qib_create_workqueues(struct qib_devdata *dd)
+{
+ int pidx;
+ struct qib_pportdata *ppd;
+
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+ if (!ppd->qib_wq) {
+ char wq_name[8]; /* 3 + 2 + 1 + 1 + 1 */
+
+ snprintf(wq_name, sizeof(wq_name), "qib%d_%d",
+ dd->unit, pidx);
+ ppd->qib_wq =
+ create_singlethread_workqueue(wq_name);
+ if (!ppd->qib_wq)
+ goto wq_error;
+ }
+ }
+ return 0;
+wq_error:
+ pr_err("create_singlethread_workqueue failed for port %d\n",
+ pidx + 1);
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+ if (ppd->qib_wq) {
+ destroy_workqueue(ppd->qib_wq);
+ ppd->qib_wq = NULL;
+ }
+ }
+ return -ENOMEM;
+}
+
+static void qib_free_pportdata(struct qib_pportdata *ppd)
+{
+ free_percpu(ppd->ibport_data.pmastats);
+ ppd->ibport_data.pmastats = NULL;
+}
+
+/**
+ * qib_init - do the actual initialization sequence on the chip
+ * @dd: the qlogic_ib device
+ * @reinit: reinitializing, so don't allocate new memory
+ *
+ * Do the actual initialization sequence on the chip. This is done
+ * both from the init routine called from the PCI infrastructure, and
+ * when we reset the chip, or detect that it was reset internally,
+ * or it's administratively re-enabled.
+ *
+ * Memory allocation here and in called routines is only done in
+ * the first case (reinit == 0). We have to be careful, because even
+ * without memory allocation, we need to re-write all the chip registers,
+ * TIDs, etc. after the reset or enable has completed.
+ */
+int qib_init(struct qib_devdata *dd, int reinit)
+{
+ int ret = 0, pidx, lastfail = 0;
+ u32 portok = 0;
+ unsigned i;
+ struct qib_ctxtdata *rcd;
+ struct qib_pportdata *ppd;
+ unsigned long flags;
+
+ /* Set linkstate to unknown, so we can watch for a transition. */
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags &= ~(QIBL_LINKACTIVE | QIBL_LINKARMED |
+ QIBL_LINKDOWN | QIBL_LINKINIT |
+ QIBL_LINKV);
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ }
+
+ if (reinit)
+ ret = init_after_reset(dd);
+ else
+ ret = loadtime_init(dd);
+ if (ret)
+ goto done;
+
+ /* Bypass most chip-init, to get to device creation */
+ if (qib_mini_init)
+ return 0;
+
+ ret = dd->f_late_initreg(dd);
+ if (ret)
+ goto done;
+
+ /* dd->rcd can be NULL if early init failed */
+ for (i = 0; dd->rcd && i < dd->first_user_ctxt; ++i) {
+ /*
+ * Set up the (kernel) rcvhdr queue and egr TIDs. If doing
+ * re-init, the simplest way to handle this is to free
+ * existing, and re-allocate.
+ * Need to re-create rest of ctxt 0 ctxtdata as well.
+ */
+ rcd = dd->rcd[i];
+ if (!rcd)
+ continue;
+
+ lastfail = qib_create_rcvhdrq(dd, rcd);
+ if (!lastfail)
+ lastfail = qib_setup_eagerbufs(rcd);
+ if (lastfail) {
+ qib_dev_err(dd,
+ "failed to allocate kernel ctxt's rcvhdrq and/or egr bufs\n");
+ continue;
+ }
+ }
+
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ int mtu;
+
+ if (lastfail)
+ ret = lastfail;
+ ppd = dd->pport + pidx;
+ mtu = ib_mtu_enum_to_int(qib_ibmtu);
+ if (mtu == -1) {
+ mtu = QIB_DEFAULT_MTU;
+ qib_ibmtu = 0; /* don't leave invalid value */
+ }
+ /* set max we can ever have for this driver load */
+ ppd->init_ibmaxlen = min(mtu > 2048 ?
+ dd->piosize4k : dd->piosize2k,
+ dd->rcvegrbufsize +
+ (dd->rcvhdrentsize << 2));
+ /*
+ * Have to initialize ibmaxlen, but this will normally
+ * change immediately in qib_set_mtu().
+ */
+ ppd->ibmaxlen = ppd->init_ibmaxlen;
+ qib_set_mtu(ppd, mtu);
+
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags |= QIBL_IB_LINK_DISABLED;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+
+ lastfail = dd->f_bringup_serdes(ppd);
+ if (lastfail) {
+ qib_devinfo(dd->pcidev,
+ "Failed to bringup IB port %u\n", ppd->port);
+ lastfail = -ENETDOWN;
+ continue;
+ }
+
+ portok++;
+ }
+
+ if (!portok) {
+ /* none of the ports initialized */
+ if (!ret && lastfail)
+ ret = lastfail;
+ else if (!ret)
+ ret = -ENETDOWN;
+ /* but continue on, so we can debug cause */
+ }
+
+ enable_chip(dd);
+
+ init_piobuf_state(dd);
+
+done:
+ if (!ret) {
+ /* chip is OK for user apps; mark it as initialized */
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+ /*
+ * Set status even if port serdes is not initialized
+ * so that diags will work.
+ */
+ *ppd->statusp |= QIB_STATUS_CHIP_PRESENT |
+ QIB_STATUS_INITTED;
+ if (!ppd->link_speed_enabled)
+ continue;
+ if (dd->flags & QIB_HAS_SEND_DMA)
+ ret = qib_setup_sdma(ppd);
+ init_timer(&ppd->hol_timer);
+ ppd->hol_timer.function = qib_hol_event;
+ ppd->hol_timer.data = (unsigned long)ppd;
+ ppd->hol_state = QIB_HOL_UP;
+ }
+
+ /* now we can enable all interrupts from the chip */
+ dd->f_set_intr_state(dd, 1);
+
+ /*
+ * Setup to verify we get an interrupt, and fallback
+ * to an alternate if necessary and possible.
+ */
+ mod_timer(&dd->intrchk_timer, jiffies + HZ/2);
+ /* start stats retrieval timer */
+ mod_timer(&dd->stats_timer, jiffies + HZ * ACTIVITY_TIMER);
+ }
+
+ /* if ret is non-zero, we probably should do some cleanup here... */
+ return ret;
+}
+
+/*
+ * These next two routines are placeholders in case we don't have per-arch
+ * code for controlling write combining. If explicit control of write
+ * combining is not available, performance will probably be awful.
+ */
+
+int __attribute__((weak)) qib_enable_wc(struct qib_devdata *dd)
+{
+ return -EOPNOTSUPP;
+}
+
+void __attribute__((weak)) qib_disable_wc(struct qib_devdata *dd)
+{
+}
+
+static inline struct qib_devdata *__qib_lookup(int unit)
+{
+ return idr_find(&qib_unit_table, unit);
+}
+
+struct qib_devdata *qib_lookup(int unit)
+{
+ struct qib_devdata *dd;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qib_devs_lock, flags);
+ dd = __qib_lookup(unit);
+ spin_unlock_irqrestore(&qib_devs_lock, flags);
+
+ return dd;
+}
+
+/*
+ * Stop the timers during unit shutdown, or after an error late
+ * in initialization.
+ */
+static void qib_stop_timers(struct qib_devdata *dd)
+{
+ struct qib_pportdata *ppd;
+ int pidx;
+
+ if (dd->stats_timer.data) {
+ del_timer_sync(&dd->stats_timer);
+ dd->stats_timer.data = 0;
+ }
+ if (dd->intrchk_timer.data) {
+ del_timer_sync(&dd->intrchk_timer);
+ dd->intrchk_timer.data = 0;
+ }
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+ if (ppd->hol_timer.data)
+ del_timer_sync(&ppd->hol_timer);
+ if (ppd->led_override_timer.data) {
+ del_timer_sync(&ppd->led_override_timer);
+ atomic_set(&ppd->led_override_timer_active, 0);
+ }
+ if (ppd->symerr_clear_timer.data)
+ del_timer_sync(&ppd->symerr_clear_timer);
+ }
+}
+
+/**
+ * qib_shutdown_device - shut down a device
+ * @dd: the qlogic_ib device
+ *
+ * This is called to make the device quiet when we are about to
+ * unload the driver, and also when the device is administratively
+ * disabled. It does not free any data structures.
+ * Everything it does has to be setup again by qib_init(dd, 1)
+ */
+static void qib_shutdown_device(struct qib_devdata *dd)
+{
+ struct qib_pportdata *ppd;
+ unsigned pidx;
+
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+
+ spin_lock_irq(&ppd->lflags_lock);
+ ppd->lflags &= ~(QIBL_LINKDOWN | QIBL_LINKINIT |
+ QIBL_LINKARMED | QIBL_LINKACTIVE |
+ QIBL_LINKV);
+ spin_unlock_irq(&ppd->lflags_lock);
+ *ppd->statusp &= ~(QIB_STATUS_IB_CONF | QIB_STATUS_IB_READY);
+ }
+ dd->flags &= ~QIB_INITTED;
+
+ /* mask interrupts, but not errors */
+ dd->f_set_intr_state(dd, 0);
+
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+ dd->f_rcvctrl(ppd, QIB_RCVCTRL_TAILUPD_DIS |
+ QIB_RCVCTRL_CTXT_DIS |
+ QIB_RCVCTRL_INTRAVAIL_DIS |
+ QIB_RCVCTRL_PKEY_ENB, -1);
+ /*
+ * Gracefully stop all sends allowing any in progress to
+ * trickle out first.
+ */
+ dd->f_sendctrl(ppd, QIB_SENDCTRL_CLEAR);
+ }
+
+ /*
+ * Enough for anything that's going to trickle out to have actually
+ * done so.
+ */
+ udelay(20);
+
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ ppd = dd->pport + pidx;
+ dd->f_setextled(ppd, 0); /* make sure LEDs are off */
+
+ if (dd->flags & QIB_HAS_SEND_DMA)
+ qib_teardown_sdma(ppd);
+
+ dd->f_sendctrl(ppd, QIB_SENDCTRL_AVAIL_DIS |
+ QIB_SENDCTRL_SEND_DIS);
+ /*
+ * Clear SerdesEnable.
+ * We can't count on interrupts since we are stopping.
+ */
+ dd->f_quiet_serdes(ppd);
+
+ if (ppd->qib_wq) {
+ destroy_workqueue(ppd->qib_wq);
+ ppd->qib_wq = NULL;
+ }
+ qib_free_pportdata(ppd);
+ }
+
+}
+
+/**
+ * qib_free_ctxtdata - free a context's allocated data
+ * @dd: the qlogic_ib device
+ * @rcd: the ctxtdata structure
+ *
+ * free up any allocated data for a context
+ * This should not touch anything that would affect a simultaneous
+ * re-allocation of context data, because it is called after qib_mutex
+ * is released (and can be called from reinit as well).
+ * It should never change any chip state, or global driver state.
+ */
+void qib_free_ctxtdata(struct qib_devdata *dd, struct qib_ctxtdata *rcd)
+{
+ if (!rcd)
+ return;
+
+ if (rcd->rcvhdrq) {
+ dma_free_coherent(&dd->pcidev->dev, rcd->rcvhdrq_size,
+ rcd->rcvhdrq, rcd->rcvhdrq_phys);
+ rcd->rcvhdrq = NULL;
+ if (rcd->rcvhdrtail_kvaddr) {
+ dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+ rcd->rcvhdrtail_kvaddr,
+ rcd->rcvhdrqtailaddr_phys);
+ rcd->rcvhdrtail_kvaddr = NULL;
+ }
+ }
+ if (rcd->rcvegrbuf) {
+ unsigned e;
+
+ for (e = 0; e < rcd->rcvegrbuf_chunks; e++) {
+ void *base = rcd->rcvegrbuf[e];
+ size_t size = rcd->rcvegrbuf_size;
+
+ dma_free_coherent(&dd->pcidev->dev, size,
+ base, rcd->rcvegrbuf_phys[e]);
+ }
+ kfree(rcd->rcvegrbuf);
+ rcd->rcvegrbuf = NULL;
+ kfree(rcd->rcvegrbuf_phys);
+ rcd->rcvegrbuf_phys = NULL;
+ rcd->rcvegrbuf_chunks = 0;
+ }
+
+ kfree(rcd->tid_pg_list);
+ vfree(rcd->user_event_mask);
+ vfree(rcd->subctxt_uregbase);
+ vfree(rcd->subctxt_rcvegrbuf);
+ vfree(rcd->subctxt_rcvhdr_base);
+#ifdef CONFIG_DEBUG_FS
+ kfree(rcd->opstats);
+ rcd->opstats = NULL;
+#endif
+ kfree(rcd);
+}
+
+/*
+ * Perform a PIO buffer bandwidth write test, to verify proper system
+ * configuration. Even when all the setup calls work, occasionally
+ * BIOS or other issues can prevent write combining from working, or
+ * can cause other bandwidth problems to the chip.
+ *
+ * This test simply writes the same buffer over and over again, and
+ * measures close to the peak bandwidth to the chip (not testing
+ * data bandwidth to the wire). On chips that use an address-based
+ * trigger to send packets to the wire, this is easy. On chips that
+ * use a count to trigger, we want to make sure that the packet doesn't
+ * go out on the wire, or trigger flow control checks.
+ */
+static void qib_verify_pioperf(struct qib_devdata *dd)
+{
+ u32 pbnum, cnt, lcnt;
+ u32 __iomem *piobuf;
+ u32 *addr;
+ u64 msecs, emsecs;
+
+ piobuf = dd->f_getsendbuf(dd->pport, 0ULL, &pbnum);
+ if (!piobuf) {
+ qib_devinfo(dd->pcidev,
+ "No PIObufs for checking perf, skipping\n");
+ return;
+ }
+
+ /*
+ * Enough to give us a reasonable test, less than piobuf size, and
+ * likely multiple of store buffer length.
+ */
+ cnt = 1024;
+
+ addr = vmalloc(cnt);
+ if (!addr) {
+ qib_devinfo(dd->pcidev,
+ "Couldn't get memory for checking PIO perf, skipping\n");
+ goto done;
+ }
+
+ preempt_disable(); /* we want reasonably accurate elapsed time */
+ msecs = 1 + jiffies_to_msecs(jiffies);
+ for (lcnt = 0; lcnt < 10000U; lcnt++) {
+ /* wait until we cross msec boundary */
+ if (jiffies_to_msecs(jiffies) >= msecs)
+ break;
+ udelay(1);
+ }
+
+ dd->f_set_armlaunch(dd, 0);
+
+ /*
+ * length 0, no dwords actually sent
+ */
+ writeq(0, piobuf);
+ qib_flush_wc();
+
+ /*
+ * This is only roughly accurate, since even with preempt we
+ * still take interrupts that could take a while. Running for
+ * >= 5 msec seems to get us "close enough" to accurate values.
+ */
+ msecs = jiffies_to_msecs(jiffies);
+ for (emsecs = lcnt = 0; emsecs <= 5UL; lcnt++) {
+ qib_pio_copy(piobuf + 64, addr, cnt >> 2);
+ emsecs = jiffies_to_msecs(jiffies) - msecs;
+ }
+
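+	/*
+	 * Each iteration above copies cnt (1024) bytes, so lcnt/emsecs is
+	 * roughly the achieved bandwidth in MiB/sec; warn if it falls below
+	 * 1024 MiB/sec.
+	 */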
+ /* 1 GiB/sec, slightly over IB SDR line rate */
+ if (lcnt < (emsecs * 1024U))
+ qib_dev_err(dd,
+ "Performance problem: bandwidth to PIO buffers is only %u MiB/sec\n",
+ lcnt / (u32) emsecs);
+
+ preempt_enable();
+
+ vfree(addr);
+
+done:
+ /* disarm piobuf, so it's available again */
+ dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(pbnum));
+ qib_sendbuf_done(dd, pbnum);
+ dd->f_set_armlaunch(dd, 1);
+}
+
+void qib_free_devdata(struct qib_devdata *dd)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&qib_devs_lock, flags);
+ idr_remove(&qib_unit_table, dd->unit);
+ list_del(&dd->list);
+ spin_unlock_irqrestore(&qib_devs_lock, flags);
+
+#ifdef CONFIG_DEBUG_FS
+ qib_dbg_ibdev_exit(&dd->verbs_dev);
+#endif
+ free_percpu(dd->int_counter);
+ ib_dealloc_device(&dd->verbs_dev.ibdev);
+}
+
+u64 qib_int_counter(struct qib_devdata *dd)
+{
+ int cpu;
+ u64 int_counter = 0;
+
+ for_each_possible_cpu(cpu)
+ int_counter += *per_cpu_ptr(dd->int_counter, cpu);
+ return int_counter;
+}
+
+u64 qib_sps_ints(void)
+{
+ unsigned long flags;
+ struct qib_devdata *dd;
+ u64 sps_ints = 0;
+
+ spin_lock_irqsave(&qib_devs_lock, flags);
+ list_for_each_entry(dd, &qib_dev_list, list) {
+ sps_ints += qib_int_counter(dd);
+ }
+ spin_unlock_irqrestore(&qib_devs_lock, flags);
+ return sps_ints;
+}
+
+/*
+ * Allocate our primary per-unit data structure. Must be done via verbs
+ * allocator, because the verbs cleanup process both does cleanup and
+ * free of the data structure.
+ * "extra" is for chip-specific data.
+ *
+ * Use the idr mechanism to get a unit number for this unit.
+ */
+struct qib_devdata *qib_alloc_devdata(struct pci_dev *pdev, size_t extra)
+{
+ unsigned long flags;
+ struct qib_devdata *dd;
+ int ret;
+
+ dd = (struct qib_devdata *) ib_alloc_device(sizeof(*dd) + extra);
+ if (!dd)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&dd->list);
+
+ idr_preload(GFP_KERNEL);
+ spin_lock_irqsave(&qib_devs_lock, flags);
+
+ ret = idr_alloc(&qib_unit_table, dd, 0, 0, GFP_NOWAIT);
+ if (ret >= 0) {
+ dd->unit = ret;
+ list_add(&dd->list, &qib_dev_list);
+ }
+
+ spin_unlock_irqrestore(&qib_devs_lock, flags);
+ idr_preload_end();
+
+ if (ret < 0) {
+ qib_early_err(&pdev->dev,
+ "Could not allocate unit ID: error %d\n", -ret);
+ goto bail;
+ }
+ dd->int_counter = alloc_percpu(u64);
+ if (!dd->int_counter) {
+ ret = -ENOMEM;
+ qib_early_err(&pdev->dev,
+ "Could not allocate per-cpu int_counter\n");
+ goto bail;
+ }
+
+ if (!qib_cpulist_count) {
+ u32 count = num_online_cpus();
+
+ qib_cpulist = kzalloc(BITS_TO_LONGS(count) *
+ sizeof(long), GFP_KERNEL);
+ if (qib_cpulist)
+ qib_cpulist_count = count;
+ else
+ qib_early_err(&pdev->dev,
+ "Could not alloc cpulist info, cpu affinity might be wrong\n");
+ }
+#ifdef CONFIG_DEBUG_FS
+ qib_dbg_ibdev_init(&dd->verbs_dev);
+#endif
+ return dd;
+bail:
+ if (!list_empty(&dd->list))
+ list_del_init(&dd->list);
+ ib_dealloc_device(&dd->verbs_dev.ibdev);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Called from freeze mode handlers, and from PCI error
+ * reporting code. Should be paranoid about state of
+ * system and data structures.
+ */
+void qib_disable_after_error(struct qib_devdata *dd)
+{
+ if (dd->flags & QIB_INITTED) {
+ u32 pidx;
+
+ dd->flags &= ~QIB_INITTED;
+ if (dd->pport)
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ struct qib_pportdata *ppd;
+
+ ppd = dd->pport + pidx;
+ if (dd->flags & QIB_PRESENT) {
+ qib_set_linkstate(ppd,
+ QIB_IB_LINKDOWN_DISABLE);
+ dd->f_setextled(ppd, 0);
+ }
+ *ppd->statusp &= ~QIB_STATUS_IB_READY;
+ }
+ }
+
+ /*
+ * Mark as having had an error for driver, and also
+ * for /sys and status word mapped to user programs.
+ * This marks unit as not usable, until reset.
+ */
+ if (dd->devstatusp)
+ *dd->devstatusp |= QIB_STATUS_HWERROR;
+}
+
+static void qib_remove_one(struct pci_dev *);
+static int qib_init_one(struct pci_dev *, const struct pci_device_id *);
+
+#define DRIVER_LOAD_MSG "Intel " QIB_DRV_NAME " loaded: "
+#define PFX QIB_DRV_NAME ": "
+
+static const struct pci_device_id qib_pci_tbl[] = {
+ { PCI_DEVICE(PCI_VENDOR_ID_PATHSCALE, PCI_DEVICE_ID_QLOGIC_IB_6120) },
+ { PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, PCI_DEVICE_ID_QLOGIC_IB_7220) },
+ { PCI_DEVICE(PCI_VENDOR_ID_QLOGIC, PCI_DEVICE_ID_QLOGIC_IB_7322) },
+ { 0, }
+};
+
+MODULE_DEVICE_TABLE(pci, qib_pci_tbl);
+
+static struct pci_driver qib_driver = {
+ .name = QIB_DRV_NAME,
+ .probe = qib_init_one,
+ .remove = qib_remove_one,
+ .id_table = qib_pci_tbl,
+ .err_handler = &qib_pci_err_handler,
+};
+
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+
+static int qib_notify_dca(struct notifier_block *, unsigned long, void *);
+static struct notifier_block dca_notifier = {
+ .notifier_call = qib_notify_dca,
+ .next = NULL,
+ .priority = 0
+};
+
+static int qib_notify_dca_device(struct device *device, void *data)
+{
+ struct qib_devdata *dd = dev_get_drvdata(device);
+ unsigned long event = *(unsigned long *)data;
+
+ return dd->f_notify_dca(dd, event);
+}
+
+static int qib_notify_dca(struct notifier_block *nb, unsigned long event,
+ void *p)
+{
+ int rval;
+
+ rval = driver_for_each_device(&qib_driver.driver, NULL,
+ &event, qib_notify_dca_device);
+ return rval ? NOTIFY_BAD : NOTIFY_DONE;
+}
+
+#endif
+
+/*
+ * Do all the generic driver unit- and chip-independent memory
+ * allocation and initialization.
+ */
+static int __init qib_ib_init(void)
+{
+ int ret;
+
+ ret = qib_dev_init();
+ if (ret)
+ goto bail;
+
+ /*
+ * These must be called before the driver is registered with
+ * the PCI subsystem.
+ */
+ idr_init(&qib_unit_table);
+
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+ dca_register_notify(&dca_notifier);
+#endif
+#ifdef CONFIG_DEBUG_FS
+ qib_dbg_init();
+#endif
+ ret = pci_register_driver(&qib_driver);
+ if (ret < 0) {
+ pr_err("Unable to register driver: error %d\n", -ret);
+ goto bail_dev;
+ }
+
+ /* not fatal if it doesn't work */
+ if (qib_init_qibfs())
+ pr_err("Unable to register ipathfs\n");
+ goto bail; /* all OK */
+
+bail_dev:
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+ dca_unregister_notify(&dca_notifier);
+#endif
+#ifdef CONFIG_DEBUG_FS
+ qib_dbg_exit();
+#endif
+ idr_destroy(&qib_unit_table);
+ qib_dev_cleanup();
+bail:
+ return ret;
+}
+
+module_init(qib_ib_init);
+
+/*
+ * Do the non-unit driver cleanup, memory free, etc. at unload.
+ */
+static void __exit qib_ib_cleanup(void)
+{
+ int ret;
+
+ ret = qib_exit_qibfs();
+ if (ret)
+ pr_err(
+ "Unable to cleanup counter filesystem: error %d\n",
+ -ret);
+
+#ifdef CONFIG_INFINIBAND_QIB_DCA
+ dca_unregister_notify(&dca_notifier);
+#endif
+ pci_unregister_driver(&qib_driver);
+#ifdef CONFIG_DEBUG_FS
+ qib_dbg_exit();
+#endif
+
+ qib_cpulist_count = 0;
+ kfree(qib_cpulist);
+
+ idr_destroy(&qib_unit_table);
+ qib_dev_cleanup();
+}
+
+module_exit(qib_ib_cleanup);
+
+/* this can only be called after a successful initialization */
+static void cleanup_device_data(struct qib_devdata *dd)
+{
+ int ctxt;
+ int pidx;
+ struct qib_ctxtdata **tmp;
+ unsigned long flags;
+
+ /* users can't do anything more with chip */
+ for (pidx = 0; pidx < dd->num_pports; ++pidx) {
+ if (dd->pport[pidx].statusp)
+ *dd->pport[pidx].statusp &= ~QIB_STATUS_CHIP_PRESENT;
+
+ spin_lock(&dd->pport[pidx].cc_shadow_lock);
+
+ kfree(dd->pport[pidx].congestion_entries);
+ dd->pport[pidx].congestion_entries = NULL;
+ kfree(dd->pport[pidx].ccti_entries);
+ dd->pport[pidx].ccti_entries = NULL;
+ kfree(dd->pport[pidx].ccti_entries_shadow);
+ dd->pport[pidx].ccti_entries_shadow = NULL;
+ kfree(dd->pport[pidx].congestion_entries_shadow);
+ dd->pport[pidx].congestion_entries_shadow = NULL;
+
+ spin_unlock(&dd->pport[pidx].cc_shadow_lock);
+ }
+
+ qib_disable_wc(dd);
+
+ if (dd->pioavailregs_dma) {
+ dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+ (void *) dd->pioavailregs_dma,
+ dd->pioavailregs_phys);
+ dd->pioavailregs_dma = NULL;
+ }
+
+ if (dd->pageshadow) {
+ struct page **tmpp = dd->pageshadow;
+ dma_addr_t *tmpd = dd->physshadow;
+ int i;
+
+ for (ctxt = 0; ctxt < dd->cfgctxts; ctxt++) {
+ int ctxt_tidbase = ctxt * dd->rcvtidcnt;
+ int maxtid = ctxt_tidbase + dd->rcvtidcnt;
+
+ for (i = ctxt_tidbase; i < maxtid; i++) {
+ if (!tmpp[i])
+ continue;
+ pci_unmap_page(dd->pcidev, tmpd[i],
+ PAGE_SIZE, PCI_DMA_FROMDEVICE);
+ qib_release_user_pages(&tmpp[i], 1);
+ tmpp[i] = NULL;
+ }
+ }
+
+ dd->pageshadow = NULL;
+ vfree(tmpp);
+ dd->physshadow = NULL;
+ vfree(tmpd);
+ }
+
+ /*
+ * Free any resources still in use (usually just kernel contexts)
+	 * at unload; we iterate over ctxtcnt, because that's what we allocate.
+ * We acquire lock to be really paranoid that rcd isn't being
+ * accessed from some interrupt-related code (that should not happen,
+ * but best to be sure).
+ */
+ spin_lock_irqsave(&dd->uctxt_lock, flags);
+ tmp = dd->rcd;
+ dd->rcd = NULL;
+ spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+ for (ctxt = 0; tmp && ctxt < dd->ctxtcnt; ctxt++) {
+ struct qib_ctxtdata *rcd = tmp[ctxt];
+
+ tmp[ctxt] = NULL; /* debugging paranoia */
+ qib_free_ctxtdata(dd, rcd);
+ }
+ kfree(tmp);
+ kfree(dd->boardname);
+ qib_cq_exit(dd);
+}
+
+/*
+ * Clean up on unit shutdown, or error during unit load after
+ * successful initialization.
+ */
+static void qib_postinit_cleanup(struct qib_devdata *dd)
+{
+ /*
+ * Clean up chip-specific stuff.
+ * We check for NULL here because this is outside
+ * the kregbase check and must be called after free_irq,
+ * so it's possible that the function pointers were
+ * never initialized.
+ */
+ if (dd->f_cleanup)
+ dd->f_cleanup(dd);
+
+ qib_pcie_ddcleanup(dd);
+
+ cleanup_device_data(dd);
+
+ qib_free_devdata(dd);
+}
+
+static int qib_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+ int ret, j, pidx, initfail;
+ struct qib_devdata *dd = NULL;
+
+ ret = qib_pcie_init(pdev, ent);
+ if (ret)
+ goto bail;
+
+ /*
+ * Do device-specific initialization, function table setup, dd
+ * allocation, etc.
+ */
+ switch (ent->device) {
+ case PCI_DEVICE_ID_QLOGIC_IB_6120:
+#ifdef CONFIG_PCI_MSI
+ dd = qib_init_iba6120_funcs(pdev, ent);
+#else
+ qib_early_err(&pdev->dev,
+ "Intel PCIE device 0x%x cannot work if CONFIG_PCI_MSI is not enabled\n",
+ ent->device);
+ dd = ERR_PTR(-ENODEV);
+#endif
+ break;
+
+ case PCI_DEVICE_ID_QLOGIC_IB_7220:
+ dd = qib_init_iba7220_funcs(pdev, ent);
+ break;
+
+ case PCI_DEVICE_ID_QLOGIC_IB_7322:
+ dd = qib_init_iba7322_funcs(pdev, ent);
+ break;
+
+ default:
+ qib_early_err(&pdev->dev,
+ "Failing on unknown Intel deviceid 0x%x\n",
+ ent->device);
+ ret = -ENODEV;
+ }
+
+ if (IS_ERR(dd))
+ ret = PTR_ERR(dd);
+ if (ret)
+ goto bail; /* error already printed */
+
+ ret = qib_create_workqueues(dd);
+ if (ret)
+ goto bail;
+
+ /* do the generic initialization */
+ initfail = qib_init(dd, 0);
+
+ ret = qib_register_ib_device(dd);
+
+ /*
+ * Now ready for use.  This should be cleared whenever we
+ * detect a reset, or initiate one.  If there was an earlier
+ * failure, we still create devices, so diags, etc. can be
+ * used to determine the cause of the problem.
+ */
+ if (!qib_mini_init && !initfail && !ret)
+ dd->flags |= QIB_INITTED;
+
+ j = qib_device_create(dd);
+ if (j)
+ qib_dev_err(dd, "Failed to create /dev devices: %d\n", -j);
+ j = qibfs_add(dd);
+ if (j)
+ qib_dev_err(dd, "Failed filesystem setup for counters: %d\n",
+ -j);
+
+ if (qib_mini_init || initfail || ret) {
+ qib_stop_timers(dd);
+ flush_workqueue(ib_wq);
+ for (pidx = 0; pidx < dd->num_pports; ++pidx)
+ dd->f_quiet_serdes(dd->pport + pidx);
+ if (qib_mini_init)
+ goto bail;
+ if (!j) {
+ (void) qibfs_remove(dd);
+ qib_device_remove(dd);
+ }
+ if (!ret)
+ qib_unregister_ib_device(dd);
+ qib_postinit_cleanup(dd);
+ if (initfail)
+ ret = initfail;
+ goto bail;
+ }
+
+ ret = qib_enable_wc(dd);
+ if (ret) {
+ qib_dev_err(dd,
+ "Write combining not enabled (err %d): performance may be poor\n",
+ -ret);
+ ret = 0;
+ }
+
+ qib_verify_pioperf(dd);
+bail:
+ return ret;
+}
+
+static void qib_remove_one(struct pci_dev *pdev)
+{
+ struct qib_devdata *dd = pci_get_drvdata(pdev);
+ int ret;
+
+ /* unregister from IB core */
+ qib_unregister_ib_device(dd);
+
+ /*
+ * Disable the IB link, disable interrupts on the device,
+ * clear dma engines, etc.
+ */
+ if (!qib_mini_init)
+ qib_shutdown_device(dd);
+
+ qib_stop_timers(dd);
+
+ /* wait until all of our (qsfp) queue_work() calls complete */
+ flush_workqueue(ib_wq);
+
+ ret = qibfs_remove(dd);
+ if (ret)
+ qib_dev_err(dd, "Failed counters filesystem cleanup: %d\n",
+ -ret);
+
+ qib_device_remove(dd);
+
+ qib_postinit_cleanup(dd);
+}
+
+/**
+ * qib_create_rcvhdrq - create a receive header queue
+ * @dd: the qlogic_ib device
+ * @rcd: the context data
+ *
+ * This must be contiguous memory (from an i/o perspective), and must be
+ * DMA'able (which means for some systems, it will go through an IOMMU,
+ * or be forced into a low address range).
+ */
+int qib_create_rcvhdrq(struct qib_devdata *dd, struct qib_ctxtdata *rcd)
+{
+ unsigned amt;
+ int old_node_id;
+
+ if (!rcd->rcvhdrq) {
+ dma_addr_t phys_hdrqtail;
+ gfp_t gfp_flags;
+
+ amt = ALIGN(dd->rcvhdrcnt * dd->rcvhdrentsize *
+ sizeof(u32), PAGE_SIZE);
+ gfp_flags = (rcd->ctxt >= dd->first_user_ctxt) ?
+ GFP_USER : GFP_KERNEL;
+
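+ /* allocate on the context's NUMA node, then restore the device node */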
+ old_node_id = dev_to_node(&dd->pcidev->dev);
+ set_dev_node(&dd->pcidev->dev, rcd->node_id);
+ rcd->rcvhdrq = dma_alloc_coherent(
+ &dd->pcidev->dev, amt, &rcd->rcvhdrq_phys,
+ gfp_flags | __GFP_COMP);
+ set_dev_node(&dd->pcidev->dev, old_node_id);
+
+ if (!rcd->rcvhdrq) {
+ qib_dev_err(dd,
+ "attempt to allocate %d bytes for ctxt %u rcvhdrq failed\n",
+ amt, rcd->ctxt);
+ goto bail;
+ }
+
+ if (rcd->ctxt >= dd->first_user_ctxt) {
+ rcd->user_event_mask = vmalloc_user(PAGE_SIZE);
+ if (!rcd->user_event_mask)
+ goto bail_free_hdrq;
+ }
+
+ if (!(dd->flags & QIB_NODMA_RTAIL)) {
+ set_dev_node(&dd->pcidev->dev, rcd->node_id);
+ rcd->rcvhdrtail_kvaddr = dma_alloc_coherent(
+ &dd->pcidev->dev, PAGE_SIZE, &phys_hdrqtail,
+ gfp_flags);
+ set_dev_node(&dd->pcidev->dev, old_node_id);
+ if (!rcd->rcvhdrtail_kvaddr)
+ goto bail_free;
+ rcd->rcvhdrqtailaddr_phys = phys_hdrqtail;
+ }
+
+ rcd->rcvhdrq_size = amt;
+ }
+
+ /* clear for security and sanity on each use */
+ memset(rcd->rcvhdrq, 0, rcd->rcvhdrq_size);
+ if (rcd->rcvhdrtail_kvaddr)
+ memset(rcd->rcvhdrtail_kvaddr, 0, PAGE_SIZE);
+ return 0;
+
+bail_free:
+ qib_dev_err(dd,
+ "attempt to allocate 1 page for ctxt %u rcvhdrqtailaddr failed\n",
+ rcd->ctxt);
+ vfree(rcd->user_event_mask);
+ rcd->user_event_mask = NULL;
+bail_free_hdrq:
+ dma_free_coherent(&dd->pcidev->dev, amt, rcd->rcvhdrq,
+ rcd->rcvhdrq_phys);
+ rcd->rcvhdrq = NULL;
+bail:
+ return -ENOMEM;
+}
+
+/**
+ * qib_setup_eagerbufs - allocate eager buffers, both kernel and user contexts
+ * @rcd: the context we are setting up.
+ *
+ * Allocate the eager TID buffers and program them into the chip.
+ * They are no longer completely contiguous; we do multiple allocation
+ * calls, otherwise we would get the OOM code involved by asking for
+ * too much per call, with disastrous results on some kernels.
+ */
+int qib_setup_eagerbufs(struct qib_ctxtdata *rcd)
+{
+ struct qib_devdata *dd = rcd->dd;
+ unsigned e, egrcnt, egrperchunk, chunk, egrsize, egroff;
+ size_t size;
+ gfp_t gfp_flags;
+ int old_node_id;
+
+ /*
+ * GFP_USER, but without GFP_FS, so the buffer cache can be
+ * coalesced (we hope); otherwise, even at order 4, heavy
+ * filesystem activity makes these allocations fail, and we
+ * use compound pages (__GFP_COMP).
+ */
+ gfp_flags = __GFP_WAIT | __GFP_IO | __GFP_COMP;
+
+ egrcnt = rcd->rcvegrcnt;
+ egroff = rcd->rcvegr_tid_base;
+ egrsize = dd->rcvegrbufsize;
+
+ chunk = rcd->rcvegrbuf_chunks;
+ egrperchunk = rcd->rcvegrbufs_perchunk;
+ size = rcd->rcvegrbuf_size;
+ if (!rcd->rcvegrbuf) {
+ rcd->rcvegrbuf =
+ kzalloc_node(chunk * sizeof(rcd->rcvegrbuf[0]),
+ GFP_KERNEL, rcd->node_id);
+ if (!rcd->rcvegrbuf)
+ goto bail;
+ }
+ if (!rcd->rcvegrbuf_phys) {
+ rcd->rcvegrbuf_phys =
+ kmalloc_node(chunk * sizeof(rcd->rcvegrbuf_phys[0]),
+ GFP_KERNEL, rcd->node_id);
+ if (!rcd->rcvegrbuf_phys)
+ goto bail_rcvegrbuf;
+ }
+ for (e = 0; e < rcd->rcvegrbuf_chunks; e++) {
+ if (rcd->rcvegrbuf[e])
+ continue;
+
+ old_node_id = dev_to_node(&dd->pcidev->dev);
+ set_dev_node(&dd->pcidev->dev, rcd->node_id);
+ rcd->rcvegrbuf[e] =
+ dma_alloc_coherent(&dd->pcidev->dev, size,
+ &rcd->rcvegrbuf_phys[e],
+ gfp_flags);
+ set_dev_node(&dd->pcidev->dev, old_node_id);
+ if (!rcd->rcvegrbuf[e])
+ goto bail_rcvegrbuf_phys;
+ }
+
+ rcd->rcvegr_phys = rcd->rcvegrbuf_phys[0];
+
+ for (e = chunk = 0; chunk < rcd->rcvegrbuf_chunks; chunk++) {
+ dma_addr_t pa = rcd->rcvegrbuf_phys[chunk];
+ unsigned i;
+
+ /* clear for security and sanity on each use */
+ memset(rcd->rcvegrbuf[chunk], 0, size);
+
+ for (i = 0; e < egrcnt && i < egrperchunk; e++, i++) {
+ dd->f_put_tid(dd, e + egroff +
+ (u64 __iomem *)
+ ((char __iomem *)
+ dd->kregbase +
+ dd->rcvegrbase),
+ RCVHQ_RCV_TYPE_EAGER, pa);
+ pa += egrsize;
+ }
+ cond_resched(); /* don't hog the cpu */
+ }
+
+ return 0;
+
+bail_rcvegrbuf_phys:
+ for (e = 0; e < rcd->rcvegrbuf_chunks && rcd->rcvegrbuf[e]; e++)
+ dma_free_coherent(&dd->pcidev->dev, size,
+ rcd->rcvegrbuf[e], rcd->rcvegrbuf_phys[e]);
+ kfree(rcd->rcvegrbuf_phys);
+ rcd->rcvegrbuf_phys = NULL;
+bail_rcvegrbuf:
+ kfree(rcd->rcvegrbuf);
+ rcd->rcvegrbuf = NULL;
+bail:
+ return -ENOMEM;
+}
+
+/*
+ * Note: Changes to this routine should be mirrored
+ * for the diagnostics routine qib_remap_ioaddr32().
+ * There is also related code for VL15 buffers in qib_init_7322_variables().
+ * The teardown code that unmaps is in qib_pcie_ddcleanup().
+ */
+int init_chip_wc_pat(struct qib_devdata *dd, u32 vl15buflen)
+{
+ u64 __iomem *qib_kregbase = NULL;
+ void __iomem *qib_piobase = NULL;
+ u64 __iomem *qib_userbase = NULL;
+ u64 qib_kreglen;
+ u64 qib_pio2koffset = dd->piobufbase & 0xffffffff;
+ u64 qib_pio4koffset = dd->piobufbase >> 32;
+ u64 qib_pio2klen = dd->piobcnt2k * dd->palign;
+ u64 qib_pio4klen = dd->piobcnt4k * dd->align4k;
+ u64 qib_physaddr = dd->physaddr;
+ u64 qib_piolen;
+ u64 qib_userlen = 0;
+
+ /*
+ * Free the old mapping because the kernel will try to reuse the
+ * old mapping and not create a new mapping with the
+ * write combining attribute.
+ */
+ iounmap(dd->kregbase);
+ dd->kregbase = NULL;
+
+ /*
+ * Assumes chip address space looks like:
+ * - kregs + sregs + cregs + uregs (in any order)
+ * - piobufs (2K and 4K bufs in either order)
+ * or:
+ * - kregs + sregs + cregs (in any order)
+ * - piobufs (2K and 4K bufs in either order)
+ * - uregs
+ */
+ if (dd->piobcnt4k == 0) {
+ qib_kreglen = qib_pio2koffset;
+ qib_piolen = qib_pio2klen;
+ } else if (qib_pio2koffset < qib_pio4koffset) {
+ qib_kreglen = qib_pio2koffset;
+ qib_piolen = qib_pio4koffset + qib_pio4klen - qib_kreglen;
+ } else {
+ qib_kreglen = qib_pio4koffset;
+ qib_piolen = qib_pio2koffset + qib_pio2klen - qib_kreglen;
+ }
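+ /* extend the write-combining PIO mapping to cover the VL15 buffers */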
+ qib_piolen += vl15buflen;
+ /* Map just the configured contexts (not all hw contexts) */
+ if (dd->uregbase > qib_kreglen)
+ qib_userlen = dd->ureg_align * dd->cfgctxts;
+
+ /* Sanity checks passed, now create the new mappings */
+ qib_kregbase = ioremap_nocache(qib_physaddr, qib_kreglen);
+ if (!qib_kregbase)
+ goto bail;
+
+ qib_piobase = ioremap_wc(qib_physaddr + qib_kreglen, qib_piolen);
+ if (!qib_piobase)
+ goto bail_kregbase;
+
+ if (qib_userlen) {
+ qib_userbase = ioremap_nocache(qib_physaddr + dd->uregbase,
+ qib_userlen);
+ if (!qib_userbase)
+ goto bail_piobase;
+ }
+
+ dd->kregbase = qib_kregbase;
+ dd->kregend = (u64 __iomem *)
+ ((char __iomem *) qib_kregbase + qib_kreglen);
+ dd->piobase = qib_piobase;
+ dd->pio2kbase = (void __iomem *)
+ (((char __iomem *) dd->piobase) +
+ qib_pio2koffset - qib_kreglen);
+ if (dd->piobcnt4k)
+ dd->pio4kbase = (void __iomem *)
+ (((char __iomem *) dd->piobase) +
+ qib_pio4koffset - qib_kreglen);
+ if (qib_userlen)
+ /* ureg will now be accessed relative to dd->userbase */
+ dd->userbase = qib_userbase;
+ return 0;
+
+bail_piobase:
+ iounmap(qib_piobase);
+bail_kregbase:
+ iounmap(qib_kregbase);
+bail:
+ return -ENOMEM;
+}
diff --git a/drivers/infiniband/hw/qib/qib_intr.c b/drivers/infiniband/hw/qib/qib_intr.c
new file mode 100644
index 000000000..086616d07
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_intr.c
@@ -0,0 +1,240 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation.
+ * All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/delay.h>
+
+#include "qib.h"
+#include "qib_common.h"
+
+/**
+ * qib_format_hwmsg - format a single hwerror message
+ * @msg: message buffer
+ * @msgl: length of message buffer
+ * @hwmsg: message to add to message buffer
+ */
+static void qib_format_hwmsg(char *msg, size_t msgl, const char *hwmsg)
+{
+ strlcat(msg, "[", msgl);
+ strlcat(msg, hwmsg, msgl);
+ strlcat(msg, "]", msgl);
+}
+
+/**
+ * qib_format_hwerrors - format hardware error messages for display
+ * @hwerrs: hardware errors bit vector
+ * @hwerrmsgs: hardware error descriptions
+ * @nhwerrmsgs: number of hwerrmsgs
+ * @msg: message buffer
+ * @msgl: message buffer length
+ */
+void qib_format_hwerrors(u64 hwerrs, const struct qib_hwerror_msgs *hwerrmsgs,
+ size_t nhwerrmsgs, char *msg, size_t msgl)
+{
+ int i;
+
+ for (i = 0; i < nhwerrmsgs; i++)
+ if (hwerrs & hwerrmsgs[i].mask)
+ qib_format_hwmsg(msg, msgl, hwerrmsgs[i].msg);
+}
+
+static void signal_ib_event(struct qib_pportdata *ppd, enum ib_event_type ev)
+{
+ struct ib_event event;
+ struct qib_devdata *dd = ppd->dd;
+
+ event.device = &dd->verbs_dev.ibdev;
+ event.element.port_num = ppd->port;
+ event.event = ev;
+ ib_dispatch_event(&event);
+}
+
+void qib_handle_e_ibstatuschanged(struct qib_pportdata *ppd, u64 ibcs)
+{
+ struct qib_devdata *dd = ppd->dd;
+ unsigned long flags;
+ u32 lstate;
+ u8 ltstate;
+ enum ib_event_type ev = 0;
+
+ lstate = dd->f_iblink_state(ibcs); /* linkstate */
+ ltstate = dd->f_ibphys_portstate(ibcs);
+
+ /*
+ * If linkstate transitions into INIT from any of the various down
+ * states, or if it transitions from any of the up (INIT or better)
+ * states into any of the down states (except link recovery), then
+ * call the chip-specific code to take appropriate actions.
+ *
+ * ppd->lflags could be 0 if this is the first time the interrupt
+ * handler has been called but the link is already up.
+ */
+ if (lstate >= IB_PORT_INIT &&
+ (!ppd->lflags || (ppd->lflags & QIBL_LINKDOWN)) &&
+ ltstate == IB_PHYSPORTSTATE_LINKUP) {
+ /* transitioned to UP */
+ if (dd->f_ib_updown(ppd, 1, ibcs))
+ goto skip_ibchange; /* chip-code handled */
+ } else if (ppd->lflags & (QIBL_LINKINIT | QIBL_LINKARMED |
+ QIBL_LINKACTIVE | QIBL_IB_FORCE_NOTIFY)) {
+ if (ltstate != IB_PHYSPORTSTATE_LINKUP &&
+ ltstate <= IB_PHYSPORTSTATE_CFG_TRAIN &&
+ dd->f_ib_updown(ppd, 0, ibcs))
+ goto skip_ibchange; /* chip-code handled */
+ qib_set_uevent_bits(ppd, _QIB_EVENT_LINKDOWN_BIT);
+ }
+
+ if (lstate != IB_PORT_DOWN) {
+ /* lstate is INIT, ARMED, or ACTIVE */
+ if (lstate != IB_PORT_ACTIVE) {
+ *ppd->statusp &= ~QIB_STATUS_IB_READY;
+ if (ppd->lflags & QIBL_LINKACTIVE)
+ ev = IB_EVENT_PORT_ERR;
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ if (lstate == IB_PORT_ARMED) {
+ ppd->lflags |= QIBL_LINKARMED | QIBL_LINKV;
+ ppd->lflags &= ~(QIBL_LINKINIT |
+ QIBL_LINKDOWN | QIBL_LINKACTIVE);
+ } else {
+ ppd->lflags |= QIBL_LINKINIT | QIBL_LINKV;
+ ppd->lflags &= ~(QIBL_LINKARMED |
+ QIBL_LINKDOWN | QIBL_LINKACTIVE);
+ }
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ /* start a 75msec timer to clear symbol errors */
+ mod_timer(&ppd->symerr_clear_timer,
+ msecs_to_jiffies(75));
+ } else if (ltstate == IB_PHYSPORTSTATE_LINKUP &&
+ !(ppd->lflags & QIBL_LINKACTIVE)) {
+ /* active, but not active deferred */
+ qib_hol_up(ppd); /* useful only for 6120 now */
+ *ppd->statusp |=
+ QIB_STATUS_IB_READY | QIB_STATUS_IB_CONF;
+ qib_clear_symerror_on_linkup((unsigned long)ppd);
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags |= QIBL_LINKACTIVE | QIBL_LINKV;
+ ppd->lflags &= ~(QIBL_LINKINIT |
+ QIBL_LINKDOWN | QIBL_LINKARMED);
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ if (dd->flags & QIB_HAS_SEND_DMA)
+ qib_sdma_process_event(ppd,
+ qib_sdma_event_e30_go_running);
+ ev = IB_EVENT_PORT_ACTIVE;
+ dd->f_setextled(ppd, 1);
+ }
+ } else { /* down */
+ if (ppd->lflags & QIBL_LINKACTIVE)
+ ev = IB_EVENT_PORT_ERR;
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags |= QIBL_LINKDOWN | QIBL_LINKV;
+ ppd->lflags &= ~(QIBL_LINKINIT |
+ QIBL_LINKACTIVE | QIBL_LINKARMED);
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ *ppd->statusp &= ~QIB_STATUS_IB_READY;
+ }
+
+skip_ibchange:
+ ppd->lastibcstat = ibcs;
+ if (ev)
+ signal_ib_event(ppd, ev);
+}
+
+void qib_clear_symerror_on_linkup(unsigned long opaque)
+{
+ struct qib_pportdata *ppd = (struct qib_pportdata *)opaque;
+
+ if (ppd->lflags & QIBL_LINKACTIVE)
+ return;
+
+ ppd->ibport_data.z_symbol_error_counter =
+ ppd->dd->f_portcntr(ppd, QIBPORTCNTR_IBSYMBOLERR);
+}
+
+/*
+ * Handle receive interrupts for user ctxts; this means a user
+ * process was waiting for a packet to arrive, and didn't want
+ * to poll.
+ */
+void qib_handle_urcv(struct qib_devdata *dd, u64 ctxtr)
+{
+ struct qib_ctxtdata *rcd;
+ unsigned long flags;
+ int i;
+
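+ /* ctxtr is a bitmask of user contexts with pending receive interrupts */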
+ spin_lock_irqsave(&dd->uctxt_lock, flags);
+ for (i = dd->first_user_ctxt; dd->rcd && i < dd->cfgctxts; i++) {
+ if (!(ctxtr & (1ULL << i)))
+ continue;
+ rcd = dd->rcd[i];
+ if (!rcd || !rcd->cnt)
+ continue;
+
+ if (test_and_clear_bit(QIB_CTXT_WAITING_RCV, &rcd->flag)) {
+ wake_up_interruptible(&rcd->wait);
+ dd->f_rcvctrl(rcd->ppd, QIB_RCVCTRL_INTRAVAIL_DIS,
+ rcd->ctxt);
+ } else if (test_and_clear_bit(QIB_CTXT_WAITING_URG,
+ &rcd->flag)) {
+ rcd->urgent++;
+ wake_up_interruptible(&rcd->wait);
+ }
+ }
+ spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+}
+
+void qib_bad_intrstatus(struct qib_devdata *dd)
+{
+ static int allbits;
+
+ /* separate routine, for better optimization of qib_intr() */
+
+ /*
+ * We print the message and disable interrupts, in hope of
+ * having a better chance of debugging the problem.
+ */
+ qib_dev_err(dd,
+ "Read of chip interrupt status failed, disabling interrupts\n");
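+ /*
+  * Escalate on repeated failures: the second bad read disables
+  * interrupt delivery, the third unregisters the IRQ entirely.
+  */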
+ if (allbits++) {
+ /* disable interrupt delivery, something is very wrong */
+ if (allbits == 2)
+ dd->f_set_intr_state(dd, 0);
+ if (allbits == 3) {
+ qib_dev_err(dd,
+ "2nd bad interrupt status, unregistering interrupts\n");
+ dd->flags |= QIB_BADINTR;
+ dd->flags &= ~QIB_INITTED;
+ dd->f_free_irq(dd);
+ }
+ }
+}
diff --git a/drivers/infiniband/hw/qib/qib_keys.c b/drivers/infiniband/hw/qib/qib_keys.c
new file mode 100644
index 000000000..ad843c786
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_keys.c
@@ -0,0 +1,387 @@
+/*
+ * Copyright (c) 2006, 2007, 2009 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "qib.h"
+
+/**
+ * qib_alloc_lkey - allocate an lkey
+ * @mr: memory region that this lkey protects
+ * @dma_region: 0->normal key, 1->restricted DMA key
+ *
+ * Returns 0 if successful, otherwise returns -errno.
+ *
+ * Increments mr reference count as required.
+ *
+ * Sets the lkey field of mr for non-dma regions.
+ */
+int qib_alloc_lkey(struct qib_mregion *mr, int dma_region)
+{
+ unsigned long flags;
+ u32 r;
+ u32 n;
+ int ret = 0;
+ struct qib_ibdev *dev = to_idev(mr->pd->device);
+ struct qib_lkey_table *rkt = &dev->lk_table;
+
+ spin_lock_irqsave(&rkt->lock, flags);
+
+ /* special case for dma_mr lkey == 0 */
+ if (dma_region) {
+ struct qib_mregion *tmr;
+
+ tmr = rcu_access_pointer(dev->dma_mr);
+ if (!tmr) {
+ qib_get_mr(mr);
+ rcu_assign_pointer(dev->dma_mr, mr);
+ mr->lkey_published = 1;
+ }
+ goto success;
+ }
+
+ /* Find the next available LKEY */
+ r = rkt->next;
+ n = r;
+ for (;;) {
+ if (rkt->table[r] == NULL)
+ break;
+ r = (r + 1) & (rkt->max - 1);
+ if (r == n)
+ goto bail;
+ }
+ rkt->next = (r + 1) & (rkt->max - 1);
+ /*
+ * Make sure lkey is never zero which is reserved to indicate an
+ * unrestricted LKEY.
+ */
+ rkt->gen++;
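+ /*
+  * lkey layout: table index in the upper bits, generation counter
+  * in bits 8 and up, and the low 8 bits left as zero.
+  */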
+ mr->lkey = (r << (32 - ib_qib_lkey_table_size)) |
+ ((((1 << (24 - ib_qib_lkey_table_size)) - 1) & rkt->gen)
+ << 8);
+ if (mr->lkey == 0) {
+ mr->lkey |= 1 << 8;
+ rkt->gen++;
+ }
+ qib_get_mr(mr);
+ rcu_assign_pointer(rkt->table[r], mr);
+ mr->lkey_published = 1;
+success:
+ spin_unlock_irqrestore(&rkt->lock, flags);
+out:
+ return ret;
+bail:
+ spin_unlock_irqrestore(&rkt->lock, flags);
+ ret = -ENOMEM;
+ goto out;
+}
+
+/**
+ * qib_free_lkey - free an lkey
+ * @mr: mr to free from tables
+ */
+void qib_free_lkey(struct qib_mregion *mr)
+{
+ unsigned long flags;
+ u32 lkey = mr->lkey;
+ u32 r;
+ struct qib_ibdev *dev = to_idev(mr->pd->device);
+ struct qib_lkey_table *rkt = &dev->lk_table;
+
+ spin_lock_irqsave(&rkt->lock, flags);
+ if (!mr->lkey_published)
+ goto out;
+ if (lkey == 0)
+ RCU_INIT_POINTER(dev->dma_mr, NULL);
+ else {
+ r = lkey >> (32 - ib_qib_lkey_table_size);
+ RCU_INIT_POINTER(rkt->table[r], NULL);
+ }
+ qib_put_mr(mr);
+ mr->lkey_published = 0;
+out:
+ spin_unlock_irqrestore(&rkt->lock, flags);
+}
+
+/**
+ * qib_lkey_ok - check IB SGE for validity and initialize
+ * @rkt: table containing lkey to check SGE against
+ * @pd: protection domain
+ * @isge: outgoing internal SGE
+ * @sge: SGE to check
+ * @acc: access flags
+ *
+ * Return 1 if valid and successful, otherwise returns 0.
+ *
+ * increments the reference count upon success
+ *
+ * Check the IB SGE for validity and initialize our internal version
+ * of it.
+ */
+int qib_lkey_ok(struct qib_lkey_table *rkt, struct qib_pd *pd,
+ struct qib_sge *isge, struct ib_sge *sge, int acc)
+{
+ struct qib_mregion *mr;
+ unsigned n, m;
+ size_t off;
+
+ /*
+ * We use LKEY == zero for kernel virtual addresses
+ * (see qib_get_dma_mr and qib_dma.c).
+ */
+ rcu_read_lock();
+ if (sge->lkey == 0) {
+ struct qib_ibdev *dev = to_idev(pd->ibpd.device);
+
+ if (pd->user)
+ goto bail;
+ mr = rcu_dereference(dev->dma_mr);
+ if (!mr)
+ goto bail;
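+ /* don't take a reference on an MR whose refcount already hit zero */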
+ if (unlikely(!atomic_inc_not_zero(&mr->refcount)))
+ goto bail;
+ rcu_read_unlock();
+
+ isge->mr = mr;
+ isge->vaddr = (void *) sge->addr;
+ isge->length = sge->length;
+ isge->sge_length = sge->length;
+ isge->m = 0;
+ isge->n = 0;
+ goto ok;
+ }
+ mr = rcu_dereference(
+ rkt->table[(sge->lkey >> (32 - ib_qib_lkey_table_size))]);
+ if (unlikely(!mr || mr->lkey != sge->lkey || mr->pd != &pd->ibpd))
+ goto bail;
+
+ off = sge->addr - mr->user_base;
+ if (unlikely(sge->addr < mr->user_base ||
+ off + sge->length > mr->length ||
+ (mr->access_flags & acc) != acc))
+ goto bail;
+ if (unlikely(!atomic_inc_not_zero(&mr->refcount)))
+ goto bail;
+ rcu_read_unlock();
+
+ off += mr->offset;
+ if (mr->page_shift) {
+ /*
+  * Page sizes are a uniform power of 2, so no loop is necessary;
+  * entries_spanned_by_off is the number of times the loop below
+  * would have executed.
+  */
+ size_t entries_spanned_by_off;
+
+ entries_spanned_by_off = off >> mr->page_shift;
+ off -= (entries_spanned_by_off << mr->page_shift);
+ m = entries_spanned_by_off/QIB_SEGSZ;
+ n = entries_spanned_by_off%QIB_SEGSZ;
+ } else {
+ m = 0;
+ n = 0;
+ while (off >= mr->map[m]->segs[n].length) {
+ off -= mr->map[m]->segs[n].length;
+ n++;
+ if (n >= QIB_SEGSZ) {
+ m++;
+ n = 0;
+ }
+ }
+ }
+ isge->mr = mr;
+ isge->vaddr = mr->map[m]->segs[n].vaddr + off;
+ isge->length = mr->map[m]->segs[n].length - off;
+ isge->sge_length = sge->length;
+ isge->m = m;
+ isge->n = n;
+ok:
+ return 1;
+bail:
+ rcu_read_unlock();
+ return 0;
+}
+
+/**
+ * qib_rkey_ok - check the IB virtual address, length, and RKEY
+ * @qp: qp for validation
+ * @sge: SGE state
+ * @len: length of data
+ * @vaddr: virtual address to place data
+ * @rkey: rkey to check
+ * @acc: access flags
+ *
+ * Return 1 if successful, otherwise 0.
+ *
+ * increments the reference count upon success
+ */
+int qib_rkey_ok(struct qib_qp *qp, struct qib_sge *sge,
+ u32 len, u64 vaddr, u32 rkey, int acc)
+{
+ struct qib_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table;
+ struct qib_mregion *mr;
+ unsigned n, m;
+ size_t off;
+
+ /*
+ * We use RKEY == zero for kernel virtual addresses
+ * (see qib_get_dma_mr and qib_dma.c).
+ */
+ rcu_read_lock();
+ if (rkey == 0) {
+ struct qib_pd *pd = to_ipd(qp->ibqp.pd);
+ struct qib_ibdev *dev = to_idev(pd->ibpd.device);
+
+ if (pd->user)
+ goto bail;
+ mr = rcu_dereference(dev->dma_mr);
+ if (!mr)
+ goto bail;
+ if (unlikely(!atomic_inc_not_zero(&mr->refcount)))
+ goto bail;
+ rcu_read_unlock();
+
+ sge->mr = mr;
+ sge->vaddr = (void *) vaddr;
+ sge->length = len;
+ sge->sge_length = len;
+ sge->m = 0;
+ sge->n = 0;
+ goto ok;
+ }
+
+ mr = rcu_dereference(
+ rkt->table[(rkey >> (32 - ib_qib_lkey_table_size))]);
+ if (unlikely(!mr || mr->lkey != rkey || qp->ibqp.pd != mr->pd))
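+ /* unmap and release any user pages still in the expected-TID arrays */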
+ goto bail;
+
+ off = vaddr - mr->iova;
+ if (unlikely(vaddr < mr->iova || off + len > mr->length ||
+ (mr->access_flags & acc) == 0))
+ goto bail;
+ if (unlikely(!atomic_inc_not_zero(&mr->refcount)))
+ goto bail;
+ rcu_read_unlock();
+
+ off += mr->offset;
+ if (mr->page_shift) {
+ /*
+  * Page sizes are a uniform power of 2, so no loop is necessary;
+  * entries_spanned_by_off is the number of times the loop below
+  * would have executed.
+  */
+ size_t entries_spanned_by_off;
+
+ entries_spanned_by_off = off >> mr->page_shift;
+ off -= (entries_spanned_by_off << mr->page_shift);
+ m = entries_spanned_by_off/QIB_SEGSZ;
+ n = entries_spanned_by_off%QIB_SEGSZ;
+ } else {
+ m = 0;
+ n = 0;
+ while (off >= mr->map[m]->segs[n].length) {
+ off -= mr->map[m]->segs[n].length;
+ n++;
+ if (n >= QIB_SEGSZ) {
+ m++;
+ n = 0;
+ }
+ }
+ }
+ sge->mr = mr;
+ sge->vaddr = mr->map[m]->segs[n].vaddr + off;
+ sge->length = mr->map[m]->segs[n].length - off;
+ sge->sge_length = len;
+ sge->m = m;
+ sge->n = n;
+ok:
+ return 1;
+bail:
+ rcu_read_unlock();
+ return 0;
+}
+
+/*
+ * Initialize the memory region specified by the work request.
+ */
+int qib_fast_reg_mr(struct qib_qp *qp, struct ib_send_wr *wr)
+{
+ struct qib_lkey_table *rkt = &to_idev(qp->ibqp.device)->lk_table;
+ struct qib_pd *pd = to_ipd(qp->ibqp.pd);
+ struct qib_mregion *mr;
+ u32 rkey = wr->wr.fast_reg.rkey;
+ unsigned i, n, m;
+ int ret = -EINVAL;
+ unsigned long flags;
+ u64 *page_list;
+ size_t ps;
+
+ spin_lock_irqsave(&rkt->lock, flags);
+ if (pd->user || rkey == 0)
+ goto bail;
+
+ mr = rcu_dereference_protected(
+ rkt->table[(rkey >> (32 - ib_qib_lkey_table_size))],
+ lockdep_is_held(&rkt->lock));
+ if (unlikely(mr == NULL || qp->ibqp.pd != mr->pd))
+ goto bail;
+
+ if (wr->wr.fast_reg.page_list_len > mr->max_segs)
+ goto bail;
+
+ ps = 1UL << wr->wr.fast_reg.page_shift;
+ if (wr->wr.fast_reg.length > ps * wr->wr.fast_reg.page_list_len)
+ goto bail;
+
+ mr->user_base = wr->wr.fast_reg.iova_start;
+ mr->iova = wr->wr.fast_reg.iova_start;
+ mr->lkey = rkey;
+ mr->length = wr->wr.fast_reg.length;
+ mr->access_flags = wr->wr.fast_reg.access_flags;
+ page_list = wr->wr.fast_reg.page_list->page_list;
+ m = 0;
+ n = 0;
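+ /* copy the page list into the MR's segment arrays, ps bytes per entry */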
+ for (i = 0; i < wr->wr.fast_reg.page_list_len; i++) {
+ mr->map[m]->segs[n].vaddr = (void *) page_list[i];
+ mr->map[m]->segs[n].length = ps;
+ if (++n == QIB_SEGSZ) {
+ m++;
+ n = 0;
+ }
+ }
+
+ ret = 0;
+bail:
+ spin_unlock_irqrestore(&rkt->lock, flags);
+ return ret;
+}
diff --git a/drivers/infiniband/hw/qib/qib_mad.c b/drivers/infiniband/hw/qib/qib_mad.c
new file mode 100644
index 000000000..395f4046d
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_mad.c
@@ -0,0 +1,2533 @@
+/*
+ * Copyright (c) 2012 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_smi.h>
+
+#include "qib.h"
+#include "qib_mad.h"
+
+static int reply(struct ib_smp *smp)
+{
+ /*
+ * The verbs framework will handle the directed/LID route
+ * packet changes.
+ */
+ smp->method = IB_MGMT_METHOD_GET_RESP;
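+ /* directed-route responses also need the direction bit set */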
+ if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ smp->status |= IB_SMP_DIRECTION;
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_REPLY;
+}
+
+static int reply_failure(struct ib_smp *smp)
+{
+ /*
+ * The verbs framework will handle the directed/LID route
+ * packet changes.
+ */
+ smp->method = IB_MGMT_METHOD_GET_RESP;
+ if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ smp->status |= IB_SMP_DIRECTION;
+ return IB_MAD_RESULT_FAILURE | IB_MAD_RESULT_REPLY;
+}
+
+static void qib_send_trap(struct qib_ibport *ibp, void *data, unsigned len)
+{
+ struct ib_mad_send_buf *send_buf;
+ struct ib_mad_agent *agent;
+ struct ib_smp *smp;
+ int ret;
+ unsigned long flags;
+ unsigned long timeout;
+
+ agent = ibp->send_agent;
+ if (!agent)
+ return;
+
+ /* o14-3.2.1 */
+ if (!(ppd_from_ibp(ibp)->lflags & QIBL_LINKACTIVE))
+ return;
+
+ /* o14-2 */
+ if (ibp->trap_timeout && time_before(jiffies, ibp->trap_timeout))
+ return;
+
+ send_buf = ib_create_send_mad(agent, 0, 0, 0, IB_MGMT_MAD_HDR,
+ IB_MGMT_MAD_DATA, GFP_ATOMIC);
+ if (IS_ERR(send_buf))
+ return;
+
+ smp = send_buf->mad;
+ smp->base_version = IB_MGMT_BASE_VERSION;
+ smp->mgmt_class = IB_MGMT_CLASS_SUBN_LID_ROUTED;
+ smp->class_version = 1;
+ smp->method = IB_MGMT_METHOD_TRAP;
+ ibp->tid++;
+ smp->tid = cpu_to_be64(ibp->tid);
+ smp->attr_id = IB_SMP_ATTR_NOTICE;
+ /* o14-1: smp->mkey = 0; */
+ memcpy(smp->data, data, len);
+
+ spin_lock_irqsave(&ibp->lock, flags);
+ if (!ibp->sm_ah) {
+ if (ibp->sm_lid != be16_to_cpu(IB_LID_PERMISSIVE)) {
+ struct ib_ah *ah;
+
+ ah = qib_create_qp0_ah(ibp, ibp->sm_lid);
+ if (IS_ERR(ah))
+ ret = PTR_ERR(ah);
+ else {
+ send_buf->ah = ah;
+ ibp->sm_ah = to_iah(ah);
+ ret = 0;
+ }
+ } else
+ ret = -EINVAL;
+ } else {
+ send_buf->ah = &ibp->sm_ah->ibah;
+ ret = 0;
+ }
+ spin_unlock_irqrestore(&ibp->lock, flags);
+
+ if (!ret)
+ ret = ib_post_send_mad(send_buf, NULL);
+ if (!ret) {
+ /* 4.096 usec. */
+ timeout = (4096 * (1UL << ibp->subnet_timeout)) / 1000;
+ ibp->trap_timeout = jiffies + usecs_to_jiffies(timeout);
+ } else {
+ ib_free_send_mad(send_buf);
+ ibp->trap_timeout = 0;
+ }
+}
+
+/*
+ * Send a bad [PQ]_Key trap (ch. 14.3.8).
+ */
+void qib_bad_pqkey(struct qib_ibport *ibp, __be16 trap_num, u32 key, u32 sl,
+ u32 qp1, u32 qp2, __be16 lid1, __be16 lid2)
+{
+ struct ib_mad_notice_attr data;
+
+ if (trap_num == IB_NOTICE_TRAP_BAD_PKEY)
+ ibp->pkey_violations++;
+ else
+ ibp->qkey_violations++;
+ ibp->n_pkt_drops++;
+
+ /* Send violation trap */
+ data.generic_type = IB_NOTICE_TYPE_SECURITY;
+ data.prod_type_msb = 0;
+ data.prod_type_lsb = IB_NOTICE_PROD_CA;
+ data.trap_num = trap_num;
+ data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
+ data.toggle_count = 0;
+ memset(&data.details, 0, sizeof(data.details));
+ data.details.ntc_257_258.lid1 = lid1;
+ data.details.ntc_257_258.lid2 = lid2;
+ data.details.ntc_257_258.key = cpu_to_be32(key);
+ data.details.ntc_257_258.sl_qp1 = cpu_to_be32((sl << 28) | qp1);
+ data.details.ntc_257_258.qp2 = cpu_to_be32(qp2);
+
+ qib_send_trap(ibp, &data, sizeof(data));
+}
+
+/*
+ * Send a bad M_Key trap (ch. 14.3.9).
+ */
+static void qib_bad_mkey(struct qib_ibport *ibp, struct ib_smp *smp)
+{
+ struct ib_mad_notice_attr data;
+
+ /* Send violation trap */
+ data.generic_type = IB_NOTICE_TYPE_SECURITY;
+ data.prod_type_msb = 0;
+ data.prod_type_lsb = IB_NOTICE_PROD_CA;
+ data.trap_num = IB_NOTICE_TRAP_BAD_MKEY;
+ data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
+ data.toggle_count = 0;
+ memset(&data.details, 0, sizeof(data.details));
+ data.details.ntc_256.lid = data.issuer_lid;
+ data.details.ntc_256.method = smp->method;
+ data.details.ntc_256.attr_id = smp->attr_id;
+ data.details.ntc_256.attr_mod = smp->attr_mod;
+ data.details.ntc_256.mkey = smp->mkey;
+ if (smp->mgmt_class == IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE) {
+ u8 hop_cnt;
+
+ data.details.ntc_256.dr_slid = smp->dr_slid;
+ data.details.ntc_256.dr_trunc_hop = IB_NOTICE_TRAP_DR_NOTICE;
+ hop_cnt = smp->hop_cnt;
+ if (hop_cnt > ARRAY_SIZE(data.details.ntc_256.dr_rtn_path)) {
+ data.details.ntc_256.dr_trunc_hop |=
+ IB_NOTICE_TRAP_DR_TRUNC;
+ hop_cnt = ARRAY_SIZE(data.details.ntc_256.dr_rtn_path);
+ }
+ data.details.ntc_256.dr_trunc_hop |= hop_cnt;
+ memcpy(data.details.ntc_256.dr_rtn_path, smp->return_path,
+ hop_cnt);
+ }
+
+ qib_send_trap(ibp, &data, sizeof(data));
+}
+
+/*
+ * Send a Port Capability Mask Changed trap (ch. 14.3.11).
+ */
+void qib_cap_mask_chg(struct qib_ibport *ibp)
+{
+ struct ib_mad_notice_attr data;
+
+ data.generic_type = IB_NOTICE_TYPE_INFO;
+ data.prod_type_msb = 0;
+ data.prod_type_lsb = IB_NOTICE_PROD_CA;
+ data.trap_num = IB_NOTICE_TRAP_CAP_MASK_CHG;
+ data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
+ data.toggle_count = 0;
+ memset(&data.details, 0, sizeof(data.details));
+ data.details.ntc_144.lid = data.issuer_lid;
+ data.details.ntc_144.new_cap_mask = cpu_to_be32(ibp->port_cap_flags);
+
+ qib_send_trap(ibp, &data, sizeof(data));
+}
+
+/*
+ * Send a System Image GUID Changed trap (ch. 14.3.12).
+ */
+void qib_sys_guid_chg(struct qib_ibport *ibp)
+{
+ struct ib_mad_notice_attr data;
+
+ data.generic_type = IB_NOTICE_TYPE_INFO;
+ data.prod_type_msb = 0;
+ data.prod_type_lsb = IB_NOTICE_PROD_CA;
+ data.trap_num = IB_NOTICE_TRAP_SYS_GUID_CHG;
+ data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
+ data.toggle_count = 0;
+ memset(&data.details, 0, sizeof(data.details));
+ data.details.ntc_145.lid = data.issuer_lid;
+ data.details.ntc_145.new_sys_guid = ib_qib_sys_image_guid;
+
+ qib_send_trap(ibp, &data, sizeof(data));
+}
+
+/*
+ * Send a Node Description Changed trap (ch. 14.3.13).
+ */
+void qib_node_desc_chg(struct qib_ibport *ibp)
+{
+ struct ib_mad_notice_attr data;
+
+ data.generic_type = IB_NOTICE_TYPE_INFO;
+ data.prod_type_msb = 0;
+ data.prod_type_lsb = IB_NOTICE_PROD_CA;
+ data.trap_num = IB_NOTICE_TRAP_CAP_MASK_CHG;
+ data.issuer_lid = cpu_to_be16(ppd_from_ibp(ibp)->lid);
+ data.toggle_count = 0;
+ memset(&data.details, 0, sizeof(data.details));
+ data.details.ntc_144.lid = data.issuer_lid;
+ data.details.ntc_144.local_changes = 1;
+ data.details.ntc_144.change_flags = IB_NOTICE_TRAP_NODE_DESC_CHG;
+
+ qib_send_trap(ibp, &data, sizeof(data));
+}
+
+static int subn_get_nodedescription(struct ib_smp *smp,
+ struct ib_device *ibdev)
+{
+ if (smp->attr_mod)
+ smp->status |= IB_SMP_INVALID_FIELD;
+
+ memcpy(smp->data, ibdev->node_desc, sizeof(smp->data));
+
+ return reply(smp);
+}
+
+static int subn_get_nodeinfo(struct ib_smp *smp, struct ib_device *ibdev,
+ u8 port)
+{
+ struct ib_node_info *nip = (struct ib_node_info *)&smp->data;
+ struct qib_devdata *dd = dd_from_ibdev(ibdev);
+ u32 vendor, majrev, minrev;
+ unsigned pidx = port - 1; /* IB numbers ports from 1, hdw from 0 */
+
+ /* GUID 0 is illegal */
+ if (smp->attr_mod || pidx >= dd->num_pports ||
+ dd->pport[pidx].guid == 0)
+ smp->status |= IB_SMP_INVALID_FIELD;
+ else
+ nip->port_guid = dd->pport[pidx].guid;
+
+ nip->base_version = 1;
+ nip->class_version = 1;
+ nip->node_type = 1; /* channel adapter */
+ nip->num_ports = ibdev->phys_port_cnt;
+ /* This is already in network order */
+ nip->sys_guid = ib_qib_sys_image_guid;
+ nip->node_guid = dd->pport->guid; /* Use first-port GUID as node */
+ nip->partition_cap = cpu_to_be16(qib_get_npkeys(dd));
+ nip->device_id = cpu_to_be16(dd->deviceid);
+ majrev = dd->majrev;
+ minrev = dd->minrev;
+ nip->revision = cpu_to_be32((majrev << 16) | minrev);
+ nip->local_port_num = port;
+ vendor = dd->vendorid;
+ nip->vendor_id[0] = QIB_SRC_OUI_1;
+ nip->vendor_id[1] = QIB_SRC_OUI_2;
+ nip->vendor_id[2] = QIB_SRC_OUI_3;
+
+ return reply(smp);
+}
+
+static int subn_get_guidinfo(struct ib_smp *smp, struct ib_device *ibdev,
+ u8 port)
+{
+ struct qib_devdata *dd = dd_from_ibdev(ibdev);
+ u32 startgx = 8 * be32_to_cpu(smp->attr_mod);
+ __be64 *p = (__be64 *) smp->data;
+ unsigned pidx = port - 1; /* IB numbers ports from 1, hdw from 0 */
+
+ /* 32 blocks of 8 64-bit GUIDs per block */
+
+ memset(smp->data, 0, sizeof(smp->data));
+
+ if (startgx == 0 && pidx < dd->num_pports) {
+ struct qib_pportdata *ppd = dd->pport + pidx;
+ struct qib_ibport *ibp = &ppd->ibport_data;
+ __be64 g = ppd->guid;
+ unsigned i;
+
+ /* GUID 0 is illegal */
+ if (g == 0)
+ smp->status |= IB_SMP_INVALID_FIELD;
+ else {
+ /* The first is a copy of the read-only HW GUID. */
+ p[0] = g;
+ for (i = 1; i < QIB_GUIDS_PER_PORT; i++)
+ p[i] = ibp->guids[i - 1];
+ }
+ } else
+ smp->status |= IB_SMP_INVALID_FIELD;
+
+ return reply(smp);
+}
+
+static void set_link_width_enabled(struct qib_pportdata *ppd, u32 w)
+{
+ (void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LWID_ENB, w);
+}
+
+static void set_link_speed_enabled(struct qib_pportdata *ppd, u32 s)
+{
+ (void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_SPD_ENB, s);
+}
+
+static int get_overrunthreshold(struct qib_pportdata *ppd)
+{
+ return ppd->dd->f_get_ib_cfg(ppd, QIB_IB_CFG_OVERRUN_THRESH);
+}
+
+/**
+ * set_overrunthreshold - set the overrun threshold
+ * @ppd: the physical port data
+ * @n: the new threshold
+ *
+ * Note that this will only take effect when the link state changes.
+ */
+static int set_overrunthreshold(struct qib_pportdata *ppd, unsigned n)
+{
+ (void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_OVERRUN_THRESH,
+ (u32)n);
+ return 0;
+}
+
+static int get_phyerrthreshold(struct qib_pportdata *ppd)
+{
+ return ppd->dd->f_get_ib_cfg(ppd, QIB_IB_CFG_PHYERR_THRESH);
+}
+
+/**
+ * set_phyerrthreshold - set the physical error threshold
+ * @ppd: the physical port data
+ * @n: the new threshold
+ *
+ * Note that this will only take effect when the link state changes.
+ */
+static int set_phyerrthreshold(struct qib_pportdata *ppd, unsigned n)
+{
+ (void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PHYERR_THRESH,
+ (u32)n);
+ return 0;
+}
+
+/**
+ * get_linkdowndefaultstate - get the default linkdown state
+ * @ppd: the physical port data
+ *
+ * Returns zero if the default is POLL, 1 if the default is SLEEP.
+ */
+static int get_linkdowndefaultstate(struct qib_pportdata *ppd)
+{
+ return ppd->dd->f_get_ib_cfg(ppd, QIB_IB_CFG_LINKDEFAULT) ==
+ IB_LINKINITCMD_SLEEP;
+}
+
+static int check_mkey(struct qib_ibport *ibp, struct ib_smp *smp, int mad_flags)
+{
+ int valid_mkey = 0;
+ int ret = 0;
+
+ /* Is the mkey in the process of expiring? */
+ if (ibp->mkey_lease_timeout &&
+ time_after_eq(jiffies, ibp->mkey_lease_timeout)) {
+ /* Clear timeout and mkey protection field. */
+ ibp->mkey_lease_timeout = 0;
+ ibp->mkeyprot = 0;
+ }
+
+ if ((mad_flags & IB_MAD_IGNORE_MKEY) || ibp->mkey == 0 ||
+ ibp->mkey == smp->mkey)
+ valid_mkey = 1;
+
+ /* Unset lease timeout on any valid Get/Set/TrapRepress */
+ if (valid_mkey && ibp->mkey_lease_timeout &&
+ (smp->method == IB_MGMT_METHOD_GET ||
+ smp->method == IB_MGMT_METHOD_SET ||
+ smp->method == IB_MGMT_METHOD_TRAP_REPRESS))
+ ibp->mkey_lease_timeout = 0;
+
+ if (!valid_mkey) {
+ switch (smp->method) {
+ case IB_MGMT_METHOD_GET:
+ /* Bad mkey not a violation below level 2 */
+ if (ibp->mkeyprot < 2)
+ break;
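+ /* FALLTHROUGH */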
+ case IB_MGMT_METHOD_SET:
+ case IB_MGMT_METHOD_TRAP_REPRESS:
+ if (ibp->mkey_violations != 0xFFFF)
+ ++ibp->mkey_violations;
+ if (!ibp->mkey_lease_timeout && ibp->mkey_lease_period)
+ ibp->mkey_lease_timeout = jiffies +
+ ibp->mkey_lease_period * HZ;
+ /* Generate a trap notice. */
+ qib_bad_mkey(ibp, smp);
+ ret = 1;
+ }
+ }
+
+ return ret;
+}
+
+static int subn_get_portinfo(struct ib_smp *smp, struct ib_device *ibdev,
+ u8 port)
+{
+ struct qib_devdata *dd;
+ struct qib_pportdata *ppd;
+ struct qib_ibport *ibp;
+ struct ib_port_info *pip = (struct ib_port_info *)smp->data;
+ u8 mtu;
+ int ret;
+ u32 state;
+ u32 port_num = be32_to_cpu(smp->attr_mod);
+
+ if (port_num == 0)
+ port_num = port;
+ else {
+ if (port_num > ibdev->phys_port_cnt) {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ ret = reply(smp);
+ goto bail;
+ }
+ if (port_num != port) {
+ ibp = to_iport(ibdev, port_num);
+ ret = check_mkey(ibp, smp, 0);
+ if (ret) {
+ ret = IB_MAD_RESULT_FAILURE;
+ goto bail;
+ }
+ }
+ }
+
+ dd = dd_from_ibdev(ibdev);
+ /* IB numbers ports from 1, hdw from 0 */
+ ppd = dd->pport + (port_num - 1);
+ ibp = &ppd->ibport_data;
+
+ /* Clear all fields. Only set the non-zero fields. */
+ memset(smp->data, 0, sizeof(smp->data));
+
+ /* Only return the mkey if the protection field allows it. */
+ if (!(smp->method == IB_MGMT_METHOD_GET &&
+ ibp->mkey != smp->mkey &&
+ ibp->mkeyprot == 1))
+ pip->mkey = ibp->mkey;
+ pip->gid_prefix = ibp->gid_prefix;
+ pip->lid = cpu_to_be16(ppd->lid);
+ pip->sm_lid = cpu_to_be16(ibp->sm_lid);
+ pip->cap_mask = cpu_to_be32(ibp->port_cap_flags);
+ /* pip->diag_code; */
+ pip->mkey_lease_period = cpu_to_be16(ibp->mkey_lease_period);
+ pip->local_port_num = port;
+ pip->link_width_enabled = ppd->link_width_enabled;
+ pip->link_width_supported = ppd->link_width_supported;
+ pip->link_width_active = ppd->link_width_active;
+ state = dd->f_iblink_state(ppd->lastibcstat);
+ pip->linkspeed_portstate = ppd->link_speed_supported << 4 | state;
+
+ pip->portphysstate_linkdown =
+ (dd->f_ibphys_portstate(ppd->lastibcstat) << 4) |
+ (get_linkdowndefaultstate(ppd) ? 1 : 2);
+ pip->mkeyprot_resv_lmc = (ibp->mkeyprot << 6) | ppd->lmc;
+ pip->linkspeedactive_enabled = (ppd->link_speed_active << 4) |
+ ppd->link_speed_enabled;
+ switch (ppd->ibmtu) {
+ default: /* something is wrong; fall through */
+ case 4096:
+ mtu = IB_MTU_4096;
+ break;
+ case 2048:
+ mtu = IB_MTU_2048;
+ break;
+ case 1024:
+ mtu = IB_MTU_1024;
+ break;
+ case 512:
+ mtu = IB_MTU_512;
+ break;
+ case 256:
+ mtu = IB_MTU_256;
+ break;
+ }
+ pip->neighbormtu_mastersmsl = (mtu << 4) | ibp->sm_sl;
+ pip->vlcap_inittype = ppd->vls_supported << 4; /* InitType = 0 */
+ pip->vl_high_limit = ibp->vl_high_limit;
+ pip->vl_arb_high_cap =
+ dd->f_get_ib_cfg(ppd, QIB_IB_CFG_VL_HIGH_CAP);
+ pip->vl_arb_low_cap =
+ dd->f_get_ib_cfg(ppd, QIB_IB_CFG_VL_LOW_CAP);
+ /* InitTypeReply = 0 */
+ pip->inittypereply_mtucap = qib_ibmtu ? qib_ibmtu : IB_MTU_4096;
+ /* HCAs ignore VLStallCount and HOQLife */
+ /* pip->vlstallcnt_hoqlife; */
+ pip->operationalvl_pei_peo_fpi_fpo =
+ dd->f_get_ib_cfg(ppd, QIB_IB_CFG_OP_VLS) << 4;
+ pip->mkey_violations = cpu_to_be16(ibp->mkey_violations);
+ /* P_KeyViolations are counted by hardware. */
+ pip->pkey_violations = cpu_to_be16(ibp->pkey_violations);
+ pip->qkey_violations = cpu_to_be16(ibp->qkey_violations);
+ /* Only the hardware GUID is supported for now */
+ pip->guid_cap = QIB_GUIDS_PER_PORT;
+ pip->clientrereg_resv_subnetto = ibp->subnet_timeout;
+ /* 32.768 usec. response time (guessing) */
+ pip->resv_resptimevalue = 3;
+ pip->localphyerrors_overrunerrors =
+ (get_phyerrthreshold(ppd) << 4) |
+ get_overrunthreshold(ppd);
+ /* pip->max_credit_hint; */
+ if (ibp->port_cap_flags & IB_PORT_LINK_LATENCY_SUP) {
+ u32 v;
+
+ v = dd->f_get_ib_cfg(ppd, QIB_IB_CFG_LINKLATENCY);
+ pip->link_roundtrip_latency[0] = v >> 16;
+ pip->link_roundtrip_latency[1] = v >> 8;
+ pip->link_roundtrip_latency[2] = v;
+ }
+
+ ret = reply(smp);
+
+bail:
+ return ret;
+}
+
+/**
+ * get_pkeys - return the PKEY table
+ * @dd: the qlogic_ib device
+ * @port: the IB port number
+ * @pkeys: the pkey table is placed here
+ */
+static int get_pkeys(struct qib_devdata *dd, u8 port, u16 *pkeys)
+{
+ struct qib_pportdata *ppd = dd->pport + port - 1;
+ /*
+ * always a kernel context, no locking needed.
+ * If we get here with ppd setup, no need to check
+ * that rcd is valid.
+ */
+ struct qib_ctxtdata *rcd = dd->rcd[ppd->hw_pidx];
+
+ memcpy(pkeys, rcd->pkeys, sizeof(rcd->pkeys));
+
+ return 0;
+}
+
+static int subn_get_pkeytable(struct ib_smp *smp, struct ib_device *ibdev,
+ u8 port)
+{
+ u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff);
+ u16 *p = (u16 *) smp->data;
+ __be16 *q = (__be16 *) smp->data;
+
+ /* 64 blocks of 32 16-bit P_Key entries */
+
+ memset(smp->data, 0, sizeof(smp->data));
+ if (startpx == 0) {
+ struct qib_devdata *dd = dd_from_ibdev(ibdev);
+ unsigned i, n = qib_get_npkeys(dd);
+
+ get_pkeys(dd, port, p);
+
+ for (i = 0; i < n; i++)
+ q[i] = cpu_to_be16(p[i]);
+ } else
+ smp->status |= IB_SMP_INVALID_FIELD;
+
+ return reply(smp);
+}
+
+static int subn_set_guidinfo(struct ib_smp *smp, struct ib_device *ibdev,
+ u8 port)
+{
+ struct qib_devdata *dd = dd_from_ibdev(ibdev);
+ u32 startgx = 8 * be32_to_cpu(smp->attr_mod);
+ __be64 *p = (__be64 *) smp->data;
+ unsigned pidx = port - 1; /* IB numbers ports from 1, hdw from 0 */
+
+ /* 32 blocks of 8 64-bit GUIDs per block */
+
+ if (startgx == 0 && pidx < dd->num_pports) {
+ struct qib_pportdata *ppd = dd->pport + pidx;
+ struct qib_ibport *ibp = &ppd->ibport_data;
+ unsigned i;
+
+ /* The first entry is read-only. */
+ for (i = 1; i < QIB_GUIDS_PER_PORT; i++)
+ ibp->guids[i - 1] = p[i];
+ } else
+ smp->status |= IB_SMP_INVALID_FIELD;
+
+ /* The only GUID we support is the first read-only entry. */
+ return subn_get_guidinfo(smp, ibdev, port);
+}
+
+/**
+ * subn_set_portinfo - set port information
+ * @smp: the incoming SM packet
+ * @ibdev: the infiniband device
+ * @port: the port on the device
+ *
+ * Set Portinfo (see ch. 14.2.5.6).
+ */
+static int subn_set_portinfo(struct ib_smp *smp, struct ib_device *ibdev,
+ u8 port)
+{
+ struct ib_port_info *pip = (struct ib_port_info *)smp->data;
+ struct ib_event event;
+ struct qib_devdata *dd;
+ struct qib_pportdata *ppd;
+ struct qib_ibport *ibp;
+ u8 clientrereg = (pip->clientrereg_resv_subnetto & 0x80);
+ unsigned long flags;
+ u16 lid, smlid;
+ u8 lwe;
+ u8 lse;
+ u8 state;
+ u8 vls;
+ u8 msl;
+ u16 lstate;
+ int ret, ore, mtu;
+ u32 port_num = be32_to_cpu(smp->attr_mod);
+
+ if (port_num == 0)
+ port_num = port;
+ else {
+ if (port_num > ibdev->phys_port_cnt)
+ goto err;
+ /* Port attributes can only be set on the receiving port */
+ if (port_num != port)
+ goto get_only;
+ }
+
+ dd = dd_from_ibdev(ibdev);
+ /* IB numbers ports from 1, hdw from 0 */
+ ppd = dd->pport + (port_num - 1);
+ ibp = &ppd->ibport_data;
+ event.device = ibdev;
+ event.element.port_num = port;
+
+ ibp->mkey = pip->mkey;
+ ibp->gid_prefix = pip->gid_prefix;
+ ibp->mkey_lease_period = be16_to_cpu(pip->mkey_lease_period);
+
+ lid = be16_to_cpu(pip->lid);
+ /* Must be a valid unicast LID address. */
+ if (lid == 0 || lid >= QIB_MULTICAST_LID_BASE)
+ smp->status |= IB_SMP_INVALID_FIELD;
+ else if (ppd->lid != lid || ppd->lmc != (pip->mkeyprot_resv_lmc & 7)) {
+ if (ppd->lid != lid)
+ qib_set_uevent_bits(ppd, _QIB_EVENT_LID_CHANGE_BIT);
+ if (ppd->lmc != (pip->mkeyprot_resv_lmc & 7))
+ qib_set_uevent_bits(ppd, _QIB_EVENT_LMC_CHANGE_BIT);
+ qib_set_lid(ppd, lid, pip->mkeyprot_resv_lmc & 7);
+ event.event = IB_EVENT_LID_CHANGE;
+ ib_dispatch_event(&event);
+ }
+
+ smlid = be16_to_cpu(pip->sm_lid);
+ msl = pip->neighbormtu_mastersmsl & 0xF;
+ /* Must be a valid unicast LID address. */
+ if (smlid == 0 || smlid >= QIB_MULTICAST_LID_BASE)
+ smp->status |= IB_SMP_INVALID_FIELD;
+ else if (smlid != ibp->sm_lid || msl != ibp->sm_sl) {
+ spin_lock_irqsave(&ibp->lock, flags);
+ if (ibp->sm_ah) {
+ if (smlid != ibp->sm_lid)
+ ibp->sm_ah->attr.dlid = smlid;
+ if (msl != ibp->sm_sl)
+ ibp->sm_ah->attr.sl = msl;
+ }
+ spin_unlock_irqrestore(&ibp->lock, flags);
+ if (smlid != ibp->sm_lid)
+ ibp->sm_lid = smlid;
+ if (msl != ibp->sm_sl)
+ ibp->sm_sl = msl;
+ event.event = IB_EVENT_SM_CHANGE;
+ ib_dispatch_event(&event);
+ }
+
+ /* Allow 1x or 4x to be set (see 14.2.6.6). */
+ lwe = pip->link_width_enabled;
+ if (lwe) {
+ if (lwe == 0xFF)
+ set_link_width_enabled(ppd, ppd->link_width_supported);
+ else if (lwe >= 16 || (lwe & ~ppd->link_width_supported))
+ smp->status |= IB_SMP_INVALID_FIELD;
+ else if (lwe != ppd->link_width_enabled)
+ set_link_width_enabled(ppd, lwe);
+ }
+
+ lse = pip->linkspeedactive_enabled & 0xF;
+ if (lse) {
+ /*
+ * The IB 1.2 spec only allows link speed values
+ * 1, 3, 5, 7, 15.  IB 1.2.1 extended this to allow
+ * specific speeds.
+ */
+ if (lse == 15)
+ set_link_speed_enabled(ppd,
+ ppd->link_speed_supported);
+ else if (lse >= 8 || (lse & ~ppd->link_speed_supported))
+ smp->status |= IB_SMP_INVALID_FIELD;
+ else if (lse != ppd->link_speed_enabled)
+ set_link_speed_enabled(ppd, lse);
+ }
+
+ /* Set link down default state. */
+ switch (pip->portphysstate_linkdown & 0xF) {
+ case 0: /* NOP */
+ break;
+ case 1: /* SLEEP */
+ (void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LINKDEFAULT,
+ IB_LINKINITCMD_SLEEP);
+ break;
+ case 2: /* POLL */
+ (void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_LINKDEFAULT,
+ IB_LINKINITCMD_POLL);
+ break;
+ default:
+ smp->status |= IB_SMP_INVALID_FIELD;
+ }
+
+ ibp->mkeyprot = pip->mkeyprot_resv_lmc >> 6;
+ ibp->vl_high_limit = pip->vl_high_limit;
+ (void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_VL_HIGH_LIMIT,
+ ibp->vl_high_limit);
+
+ mtu = ib_mtu_enum_to_int((pip->neighbormtu_mastersmsl >> 4) & 0xF);
+ if (mtu == -1)
+ smp->status |= IB_SMP_INVALID_FIELD;
+ else
+ qib_set_mtu(ppd, mtu);
+
+ /* Set operational VLs */
+ vls = (pip->operationalvl_pei_peo_fpi_fpo >> 4) & 0xF;
+ if (vls) {
+ if (vls > ppd->vls_supported)
+ smp->status |= IB_SMP_INVALID_FIELD;
+ else
+ (void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_OP_VLS, vls);
+ }
+
+ if (pip->mkey_violations == 0)
+ ibp->mkey_violations = 0;
+
+ if (pip->pkey_violations == 0)
+ ibp->pkey_violations = 0;
+
+ if (pip->qkey_violations == 0)
+ ibp->qkey_violations = 0;
+
+ ore = pip->localphyerrors_overrunerrors;
+ if (set_phyerrthreshold(ppd, (ore >> 4) & 0xF))
+ smp->status |= IB_SMP_INVALID_FIELD;
+
+ if (set_overrunthreshold(ppd, (ore & 0xF)))
+ smp->status |= IB_SMP_INVALID_FIELD;
+
+ ibp->subnet_timeout = pip->clientrereg_resv_subnetto & 0x1F;
+
+ /*
+ * Do the port state change now that the other link parameters
+ * have been set.
+ * Changing the port physical state only makes sense if the link
+ * is down or is being set to down.
+ */
+ state = pip->linkspeed_portstate & 0xF;
+ lstate = (pip->portphysstate_linkdown >> 4) & 0xF;
+ if (lstate && !(state == IB_PORT_DOWN || state == IB_PORT_NOP))
+ smp->status |= IB_SMP_INVALID_FIELD;
+
+ /*
+ * Only state changes of DOWN, ARM, and ACTIVE are valid
+ * and must be in the correct state to take effect (see 7.2.6).
+ */
+ switch (state) {
+ case IB_PORT_NOP:
+ if (lstate == 0)
+ break;
+ /* FALLTHROUGH */
+ case IB_PORT_DOWN:
+ if (lstate == 0)
+ lstate = QIB_IB_LINKDOWN_ONLY;
+ else if (lstate == 1)
+ lstate = QIB_IB_LINKDOWN_SLEEP;
+ else if (lstate == 2)
+ lstate = QIB_IB_LINKDOWN;
+ else if (lstate == 3)
+ lstate = QIB_IB_LINKDOWN_DISABLE;
+ else {
+ smp->status |= IB_SMP_INVALID_FIELD;
+ break;
+ }
+ spin_lock_irqsave(&ppd->lflags_lock, flags);
+ ppd->lflags &= ~QIBL_LINKV;
+ spin_unlock_irqrestore(&ppd->lflags_lock, flags);
+ qib_set_linkstate(ppd, lstate);
+ /*
+ * Don't send a reply if the response would be sent
+ * through the disabled port.
+ */
+ if (lstate == QIB_IB_LINKDOWN_DISABLE && smp->hop_cnt) {
+ ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+ goto done;
+ }
+ qib_wait_linkstate(ppd, QIBL_LINKV, 10);
+ break;
+ case IB_PORT_ARMED:
+ qib_set_linkstate(ppd, QIB_IB_LINKARM);
+ break;
+ case IB_PORT_ACTIVE:
+ qib_set_linkstate(ppd, QIB_IB_LINKACTIVE);
+ break;
+ default:
+ smp->status |= IB_SMP_INVALID_FIELD;
+ }
+
+ if (clientrereg) {
+ event.event = IB_EVENT_CLIENT_REREGISTER;
+ ib_dispatch_event(&event);
+ }
+
+ ret = subn_get_portinfo(smp, ibdev, port);
+
+ /* restore re-reg bit per o14-12.2.1 */
+ pip->clientrereg_resv_subnetto |= clientrereg;
+
+ goto get_only;
+
+err:
+ smp->status |= IB_SMP_INVALID_FIELD;
+get_only:
+ ret = subn_get_portinfo(smp, ibdev, port);
+done:
+ return ret;
+}
+
+/**
+ * rm_pkey - decrement the reference count for the given PKEY
+ * @ppd: the physical port data
+ * @key: the PKEY to remove
+ *
+ * Return true if this was the last reference and the hardware table entry
+ * needs to be changed.
+ */
+static int rm_pkey(struct qib_pportdata *ppd, u16 key)
+{
+ int i;
+ int ret;
+
+ for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {
+ if (ppd->pkeys[i] != key)
+ continue;
+ if (atomic_dec_and_test(&ppd->pkeyrefs[i])) {
+ ppd->pkeys[i] = 0;
+ ret = 1;
+ goto bail;
+ }
+ break;
+ }
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * add_pkey - add the given PKEY to the hardware table
+ * @ppd: the physical port data
+ * @key: the PKEY
+ *
+ * Return an error code if unable to add the entry, zero if no change,
+ * or 1 if the hardware PKEY register needs to be updated.
+ */
+static int add_pkey(struct qib_pportdata *ppd, u16 key)
+{
+ int i;
+ u16 lkey = key & 0x7FFF;
+ int any = 0;
+ int ret;
+
+ if (lkey == 0x7FFF) {
+ ret = 0;
+ goto bail;
+ }
+
+ /* Look for an empty slot or a matching PKEY. */
+ for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {
+ if (!ppd->pkeys[i]) {
+ any++;
+ continue;
+ }
+ /* If it matches exactly, try to increment the ref count */
+ if (ppd->pkeys[i] == key) {
+ if (atomic_inc_return(&ppd->pkeyrefs[i]) > 1) {
+ ret = 0;
+ goto bail;
+ }
+ /* Lost the race. Look for an empty slot below. */
+ atomic_dec(&ppd->pkeyrefs[i]);
+ any++;
+ }
+ /*
+ * It makes no sense to have both the limited and unlimited
+ * PKEY set at the same time since the unlimited one will
+ * disable the limited one.
+ */
+ if ((ppd->pkeys[i] & 0x7FFF) == lkey) {
+ ret = -EEXIST;
+ goto bail;
+ }
+ }
+ if (!any) {
+ ret = -EBUSY;
+ goto bail;
+ }
+ for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {
+ if (!ppd->pkeys[i] &&
+ atomic_inc_return(&ppd->pkeyrefs[i]) == 1) {
+ /* for qibstats, etc. */
+ ppd->pkeys[i] = key;
+ ret = 1;
+ goto bail;
+ }
+ }
+ ret = -EBUSY;
+
+bail:
+ return ret;
+}
+
+/**
+ * set_pkeys - set the PKEY table for the port's kernel context
+ * @dd: the qlogic_ib device
+ * @port: the IB port number
+ * @pkeys: the PKEY table
+ */
+static int set_pkeys(struct qib_devdata *dd, u8 port, u16 *pkeys)
+{
+ struct qib_pportdata *ppd;
+ struct qib_ctxtdata *rcd;
+ int i;
+ int changed = 0;
+
+ /*
+ * IB port one/two always maps to context zero/one,
+ * always a kernel context, no locking needed.
+ * If we get here with ppd setup, no need to check
+ * that rcd is valid.
+ */
+ ppd = dd->pport + (port - 1);
+ rcd = dd->rcd[ppd->hw_pidx];
+
+ for (i = 0; i < ARRAY_SIZE(rcd->pkeys); i++) {
+ u16 key = pkeys[i];
+ u16 okey = rcd->pkeys[i];
+
+ if (key == okey)
+ continue;
+ /*
+ * The value of this PKEY table entry is changing.
+ * Remove the old entry in the hardware's array of PKEYs.
+ */
+ if (okey & 0x7FFF)
+ changed |= rm_pkey(ppd, okey);
+ if (key & 0x7FFF) {
+ int ret = add_pkey(ppd, key);
+
+ if (ret < 0)
+ key = 0;
+ else
+ changed |= ret;
+ }
+ rcd->pkeys[i] = key;
+ }
+ if (changed) {
+ struct ib_event event;
+
+ (void) dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PKEYS, 0);
+
+ event.event = IB_EVENT_PKEY_CHANGE;
+ event.device = &dd->verbs_dev.ibdev;
+ event.element.port_num = port;
+ ib_dispatch_event(&event);
+ }
+ return 0;
+}
+
+static int subn_set_pkeytable(struct ib_smp *smp, struct ib_device *ibdev,
+ u8 port)
+{
+ u32 startpx = 32 * (be32_to_cpu(smp->attr_mod) & 0xffff);
+ __be16 *p = (__be16 *) smp->data;
+ u16 *q = (u16 *) smp->data;
+ struct qib_devdata *dd = dd_from_ibdev(ibdev);
+ unsigned i, n = qib_get_npkeys(dd);
+
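+	/* Convert the table from network to host byte order in place. */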
+ for (i = 0; i < n; i++)
+ q[i] = be16_to_cpu(p[i]);
+
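+	/* Only PKEY block 0 (attr_mod 0) is supported. */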
+ if (startpx != 0 || set_pkeys(dd, port, q) != 0)
+ smp->status |= IB_SMP_INVALID_FIELD;
+
+ return subn_get_pkeytable(smp, ibdev, port);
+}
+
+static int subn_get_sl_to_vl(struct ib_smp *smp, struct ib_device *ibdev,
+ u8 port)
+{
+ struct qib_ibport *ibp = to_iport(ibdev, port);
+ u8 *p = (u8 *) smp->data;
+ unsigned i;
+
+ memset(smp->data, 0, sizeof(smp->data));
+
+ if (!(ibp->port_cap_flags & IB_PORT_SL_MAP_SUP))
+ smp->status |= IB_SMP_UNSUP_METHOD;
+ else
+ for (i = 0; i < ARRAY_SIZE(ibp->sl_to_vl); i += 2)
+ *p++ = (ibp->sl_to_vl[i] << 4) | ibp->sl_to_vl[i + 1];
+
+ return reply(smp);
+}
+
+static int subn_set_sl_to_vl(struct ib_smp *smp, struct ib_device *ibdev,
+ u8 port)
+{
+ struct qib_ibport *ibp = to_iport(ibdev, port);
+ u8 *p = (u8 *) smp->data;
+ unsigned i;
+
+ if (!(ibp->port_cap_flags & IB_PORT_SL_MAP_SUP)) {
+ smp->status |= IB_SMP_UNSUP_METHOD;
+ return reply(smp);
+ }
+
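+	/*
+	 * Each data byte holds two SL-to-VL entries: the even SL in the
+	 * high nibble and the odd SL in the low nibble.
+	 */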
+ for (i = 0; i < ARRAY_SIZE(ibp->sl_to_vl); i += 2, p++) {
+ ibp->sl_to_vl[i] = *p >> 4;
+ ibp->sl_to_vl[i + 1] = *p & 0xF;
+ }
+ qib_set_uevent_bits(ppd_from_ibp(to_iport(ibdev, port)),
+ _QIB_EVENT_SL2VL_CHANGE_BIT);
+
+ return subn_get_sl_to_vl(smp, ibdev, port);
+}
+
+static int subn_get_vl_arb(struct ib_smp *smp, struct ib_device *ibdev,
+ u8 port)
+{
+ unsigned which = be32_to_cpu(smp->attr_mod) >> 16;
+ struct qib_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
+
+ memset(smp->data, 0, sizeof(smp->data));
+
+ if (ppd->vls_supported == IB_VL_VL0)
+ smp->status |= IB_SMP_UNSUP_METHOD;
+ else if (which == IB_VLARB_LOWPRI_0_31)
+ (void) ppd->dd->f_get_ib_table(ppd, QIB_IB_TBL_VL_LOW_ARB,
+ smp->data);
+ else if (which == IB_VLARB_HIGHPRI_0_31)
+ (void) ppd->dd->f_get_ib_table(ppd, QIB_IB_TBL_VL_HIGH_ARB,
+ smp->data);
+ else
+ smp->status |= IB_SMP_INVALID_FIELD;
+
+ return reply(smp);
+}
+
+static int subn_set_vl_arb(struct ib_smp *smp, struct ib_device *ibdev,
+ u8 port)
+{
+ unsigned which = be32_to_cpu(smp->attr_mod) >> 16;
+ struct qib_pportdata *ppd = ppd_from_ibp(to_iport(ibdev, port));
+
+ if (ppd->vls_supported == IB_VL_VL0)
+ smp->status |= IB_SMP_UNSUP_METHOD;
+ else if (which == IB_VLARB_LOWPRI_0_31)
+ (void) ppd->dd->f_set_ib_table(ppd, QIB_IB_TBL_VL_LOW_ARB,
+ smp->data);
+ else if (which == IB_VLARB_HIGHPRI_0_31)
+ (void) ppd->dd->f_set_ib_table(ppd, QIB_IB_TBL_VL_HIGH_ARB,
+ smp->data);
+ else
+ smp->status |= IB_SMP_INVALID_FIELD;
+
+ return subn_get_vl_arb(smp, ibdev, port);
+}
+
+static int subn_trap_repress(struct ib_smp *smp, struct ib_device *ibdev,
+ u8 port)
+{
+ /*
+ * For now, we only send the trap once so no need to process this.
+ * o13-6, o13-7,
+ * o14-3.a4 The SMA shall not send any message in response to a valid
+ * SubnTrapRepress() message.
+ */
+ return IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+}
+
+static int pma_get_classportinfo(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev)
+{
+ struct ib_class_port_info *p =
+ (struct ib_class_port_info *)pmp->data;
+ struct qib_devdata *dd = dd_from_ibdev(ibdev);
+
+ memset(pmp->data, 0, sizeof(pmp->data));
+
+ if (pmp->mad_hdr.attr_mod != 0)
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+
+ /* Note that AllPortSelect is not valid */
+ p->base_version = 1;
+ p->class_version = 1;
+ p->capability_mask = IB_PMA_CLASS_CAP_EXT_WIDTH;
+ /*
+ * Set the most significant bit of CM2 to indicate support for
+ * congestion statistics
+ */
+ p->reserved[0] = dd->psxmitwait_supported << 7;
+ /*
+ * Expected response time is 4.096 usec. * 2^18 == 1.073741824 sec.
+ */
+ p->resp_time_value = 18;
+
+ return reply((struct ib_smp *) pmp);
+}
+
+static int pma_get_portsamplescontrol(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ib_pma_portsamplescontrol *p =
+ (struct ib_pma_portsamplescontrol *)pmp->data;
+ struct qib_ibdev *dev = to_idev(ibdev);
+ struct qib_devdata *dd = dd_from_dev(dev);
+ struct qib_ibport *ibp = to_iport(ibdev, port);
+ struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+ unsigned long flags;
+ u8 port_select = p->port_select;
+
+ memset(pmp->data, 0, sizeof(pmp->data));
+
+ p->port_select = port_select;
+ if (pmp->mad_hdr.attr_mod != 0 || port_select != port) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ goto bail;
+ }
+ spin_lock_irqsave(&ibp->lock, flags);
+ p->tick = dd->f_get_ib_cfg(ppd, QIB_IB_CFG_PMA_TICKS);
+ p->sample_status = dd->f_portcntr(ppd, QIBPORTCNTR_PSSTAT);
+ p->counter_width = 4; /* 32 bit counters */
+ p->counter_mask0_9 = COUNTER_MASK0_9;
+ p->sample_start = cpu_to_be32(ibp->pma_sample_start);
+ p->sample_interval = cpu_to_be32(ibp->pma_sample_interval);
+ p->tag = cpu_to_be16(ibp->pma_tag);
+ p->counter_select[0] = ibp->pma_counter_select[0];
+ p->counter_select[1] = ibp->pma_counter_select[1];
+ p->counter_select[2] = ibp->pma_counter_select[2];
+ p->counter_select[3] = ibp->pma_counter_select[3];
+ p->counter_select[4] = ibp->pma_counter_select[4];
+ spin_unlock_irqrestore(&ibp->lock, flags);
+
+bail:
+ return reply((struct ib_smp *) pmp);
+}
+
+static int pma_set_portsamplescontrol(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ib_pma_portsamplescontrol *p =
+ (struct ib_pma_portsamplescontrol *)pmp->data;
+ struct qib_ibdev *dev = to_idev(ibdev);
+ struct qib_devdata *dd = dd_from_dev(dev);
+ struct qib_ibport *ibp = to_iport(ibdev, port);
+ struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+ unsigned long flags;
+ u8 status, xmit_flags;
+ int ret;
+
+ if (pmp->mad_hdr.attr_mod != 0 || p->port_select != port) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ ret = reply((struct ib_smp *) pmp);
+ goto bail;
+ }
+
+ spin_lock_irqsave(&ibp->lock, flags);
+
+ /* Port Sampling code owns the PS* HW counters */
+ xmit_flags = ppd->cong_stats.flags;
+ ppd->cong_stats.flags = IB_PMA_CONG_HW_CONTROL_SAMPLE;
+ status = dd->f_portcntr(ppd, QIBPORTCNTR_PSSTAT);
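+	/*
+	 * Only start a new sample if the previous one has completed, or
+	 * if the hardware is "running" only on behalf of the congestion
+	 * timer rather than an active PMA sample.
+	 */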
+ if (status == IB_PMA_SAMPLE_STATUS_DONE ||
+ (status == IB_PMA_SAMPLE_STATUS_RUNNING &&
+ xmit_flags == IB_PMA_CONG_HW_CONTROL_TIMER)) {
+ ibp->pma_sample_start = be32_to_cpu(p->sample_start);
+ ibp->pma_sample_interval = be32_to_cpu(p->sample_interval);
+ ibp->pma_tag = be16_to_cpu(p->tag);
+ ibp->pma_counter_select[0] = p->counter_select[0];
+ ibp->pma_counter_select[1] = p->counter_select[1];
+ ibp->pma_counter_select[2] = p->counter_select[2];
+ ibp->pma_counter_select[3] = p->counter_select[3];
+ ibp->pma_counter_select[4] = p->counter_select[4];
+ dd->f_set_cntr_sample(ppd, ibp->pma_sample_interval,
+ ibp->pma_sample_start);
+ }
+ spin_unlock_irqrestore(&ibp->lock, flags);
+
+ ret = pma_get_portsamplescontrol(pmp, ibdev, port);
+
+bail:
+ return ret;
+}
+
+static u64 get_counter(struct qib_ibport *ibp, struct qib_pportdata *ppd,
+ __be16 sel)
+{
+ u64 ret;
+
+ switch (sel) {
+ case IB_PMA_PORT_XMIT_DATA:
+ ret = ppd->dd->f_portcntr(ppd, QIBPORTCNTR_PSXMITDATA);
+ break;
+ case IB_PMA_PORT_RCV_DATA:
+ ret = ppd->dd->f_portcntr(ppd, QIBPORTCNTR_PSRCVDATA);
+ break;
+ case IB_PMA_PORT_XMIT_PKTS:
+ ret = ppd->dd->f_portcntr(ppd, QIBPORTCNTR_PSXMITPKTS);
+ break;
+ case IB_PMA_PORT_RCV_PKTS:
+ ret = ppd->dd->f_portcntr(ppd, QIBPORTCNTR_PSRCVPKTS);
+ break;
+ case IB_PMA_PORT_XMIT_WAIT:
+ ret = ppd->dd->f_portcntr(ppd, QIBPORTCNTR_PSXMITWAIT);
+ break;
+ default:
+ ret = 0;
+ }
+
+ return ret;
+}
+
+/* This function assumes that the xmit_wait lock is already held */
+static u64 xmit_wait_get_value_delta(struct qib_pportdata *ppd)
+{
+ u32 delta;
+
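+	/* Running software total plus the current hardware sample value. */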
+ delta = get_counter(&ppd->ibport_data, ppd,
+ IB_PMA_PORT_XMIT_WAIT);
+ return ppd->cong_stats.counter + delta;
+}
+
+static void cache_hw_sample_counters(struct qib_pportdata *ppd)
+{
+ struct qib_ibport *ibp = &ppd->ibport_data;
+
+ ppd->cong_stats.counter_cache.psxmitdata =
+ get_counter(ibp, ppd, IB_PMA_PORT_XMIT_DATA);
+ ppd->cong_stats.counter_cache.psrcvdata =
+ get_counter(ibp, ppd, IB_PMA_PORT_RCV_DATA);
+ ppd->cong_stats.counter_cache.psxmitpkts =
+ get_counter(ibp, ppd, IB_PMA_PORT_XMIT_PKTS);
+ ppd->cong_stats.counter_cache.psrcvpkts =
+ get_counter(ibp, ppd, IB_PMA_PORT_RCV_PKTS);
+ ppd->cong_stats.counter_cache.psxmitwait =
+ get_counter(ibp, ppd, IB_PMA_PORT_XMIT_WAIT);
+}
+
+static u64 get_cache_hw_sample_counters(struct qib_pportdata *ppd,
+ __be16 sel)
+{
+ u64 ret;
+
+ switch (sel) {
+ case IB_PMA_PORT_XMIT_DATA:
+ ret = ppd->cong_stats.counter_cache.psxmitdata;
+ break;
+ case IB_PMA_PORT_RCV_DATA:
+ ret = ppd->cong_stats.counter_cache.psrcvdata;
+ break;
+ case IB_PMA_PORT_XMIT_PKTS:
+ ret = ppd->cong_stats.counter_cache.psxmitpkts;
+ break;
+ case IB_PMA_PORT_RCV_PKTS:
+ ret = ppd->cong_stats.counter_cache.psrcvpkts;
+ break;
+ case IB_PMA_PORT_XMIT_WAIT:
+ ret = ppd->cong_stats.counter_cache.psxmitwait;
+ break;
+ default:
+ ret = 0;
+ }
+
+ return ret;
+}
+
+static int pma_get_portsamplesresult(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ib_pma_portsamplesresult *p =
+ (struct ib_pma_portsamplesresult *)pmp->data;
+ struct qib_ibdev *dev = to_idev(ibdev);
+ struct qib_devdata *dd = dd_from_dev(dev);
+ struct qib_ibport *ibp = to_iport(ibdev, port);
+ struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+ unsigned long flags;
+ u8 status;
+ int i;
+
+ memset(pmp->data, 0, sizeof(pmp->data));
+ spin_lock_irqsave(&ibp->lock, flags);
+ p->tag = cpu_to_be16(ibp->pma_tag);
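+	/*
+	 * If the congestion timer currently owns the PS* counters, report
+	 * the sample as done; otherwise read the real status and, once the
+	 * sample completes, hand the counters back to the timer.
+	 */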
+ if (ppd->cong_stats.flags == IB_PMA_CONG_HW_CONTROL_TIMER)
+ p->sample_status = IB_PMA_SAMPLE_STATUS_DONE;
+ else {
+ status = dd->f_portcntr(ppd, QIBPORTCNTR_PSSTAT);
+ p->sample_status = cpu_to_be16(status);
+ if (status == IB_PMA_SAMPLE_STATUS_DONE) {
+ cache_hw_sample_counters(ppd);
+ ppd->cong_stats.counter =
+ xmit_wait_get_value_delta(ppd);
+ dd->f_set_cntr_sample(ppd,
+ QIB_CONG_TIMER_PSINTERVAL, 0);
+ ppd->cong_stats.flags = IB_PMA_CONG_HW_CONTROL_TIMER;
+ }
+ }
+ for (i = 0; i < ARRAY_SIZE(ibp->pma_counter_select); i++)
+ p->counter[i] = cpu_to_be32(
+ get_cache_hw_sample_counters(
+ ppd, ibp->pma_counter_select[i]));
+ spin_unlock_irqrestore(&ibp->lock, flags);
+
+ return reply((struct ib_smp *) pmp);
+}
+
+static int pma_get_portsamplesresult_ext(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ib_pma_portsamplesresult_ext *p =
+ (struct ib_pma_portsamplesresult_ext *)pmp->data;
+ struct qib_ibdev *dev = to_idev(ibdev);
+ struct qib_devdata *dd = dd_from_dev(dev);
+ struct qib_ibport *ibp = to_iport(ibdev, port);
+ struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+ unsigned long flags;
+ u8 status;
+ int i;
+
+ /* Port Sampling code owns the PS* HW counters */
+ memset(pmp->data, 0, sizeof(pmp->data));
+ spin_lock_irqsave(&ibp->lock, flags);
+ p->tag = cpu_to_be16(ibp->pma_tag);
+ if (ppd->cong_stats.flags == IB_PMA_CONG_HW_CONTROL_TIMER)
+ p->sample_status = IB_PMA_SAMPLE_STATUS_DONE;
+ else {
+ status = dd->f_portcntr(ppd, QIBPORTCNTR_PSSTAT);
+ p->sample_status = cpu_to_be16(status);
+ /* 64 bits */
+ p->extended_width = cpu_to_be32(0x80000000);
+ if (status == IB_PMA_SAMPLE_STATUS_DONE) {
+ cache_hw_sample_counters(ppd);
+ ppd->cong_stats.counter =
+ xmit_wait_get_value_delta(ppd);
+ dd->f_set_cntr_sample(ppd,
+ QIB_CONG_TIMER_PSINTERVAL, 0);
+ ppd->cong_stats.flags = IB_PMA_CONG_HW_CONTROL_TIMER;
+ }
+ }
+ for (i = 0; i < ARRAY_SIZE(ibp->pma_counter_select); i++)
+ p->counter[i] = cpu_to_be64(
+ get_cache_hw_sample_counters(
+ ppd, ibp->pma_counter_select[i]));
+ spin_unlock_irqrestore(&ibp->lock, flags);
+
+ return reply((struct ib_smp *) pmp);
+}
+
+static int pma_get_portcounters(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
+ pmp->data;
+ struct qib_ibport *ibp = to_iport(ibdev, port);
+ struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+ struct qib_verbs_counters cntrs;
+ u8 port_select = p->port_select;
+
+ qib_get_counters(ppd, &cntrs);
+
+ /* Adjust counters for any resets done. */
+ cntrs.symbol_error_counter -= ibp->z_symbol_error_counter;
+ cntrs.link_error_recovery_counter -=
+ ibp->z_link_error_recovery_counter;
+ cntrs.link_downed_counter -= ibp->z_link_downed_counter;
+ cntrs.port_rcv_errors -= ibp->z_port_rcv_errors;
+ cntrs.port_rcv_remphys_errors -= ibp->z_port_rcv_remphys_errors;
+ cntrs.port_xmit_discards -= ibp->z_port_xmit_discards;
+ cntrs.port_xmit_data -= ibp->z_port_xmit_data;
+ cntrs.port_rcv_data -= ibp->z_port_rcv_data;
+ cntrs.port_xmit_packets -= ibp->z_port_xmit_packets;
+ cntrs.port_rcv_packets -= ibp->z_port_rcv_packets;
+ cntrs.local_link_integrity_errors -=
+ ibp->z_local_link_integrity_errors;
+ cntrs.excessive_buffer_overrun_errors -=
+ ibp->z_excessive_buffer_overrun_errors;
+ cntrs.vl15_dropped -= ibp->z_vl15_dropped;
+ cntrs.vl15_dropped += ibp->n_vl15_dropped;
+
+ memset(pmp->data, 0, sizeof(pmp->data));
+
+ p->port_select = port_select;
+ if (pmp->mad_hdr.attr_mod != 0 || port_select != port)
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+
+ if (cntrs.symbol_error_counter > 0xFFFFUL)
+ p->symbol_error_counter = cpu_to_be16(0xFFFF);
+ else
+ p->symbol_error_counter =
+ cpu_to_be16((u16)cntrs.symbol_error_counter);
+ if (cntrs.link_error_recovery_counter > 0xFFUL)
+ p->link_error_recovery_counter = 0xFF;
+ else
+ p->link_error_recovery_counter =
+ (u8)cntrs.link_error_recovery_counter;
+ if (cntrs.link_downed_counter > 0xFFUL)
+ p->link_downed_counter = 0xFF;
+ else
+ p->link_downed_counter = (u8)cntrs.link_downed_counter;
+ if (cntrs.port_rcv_errors > 0xFFFFUL)
+ p->port_rcv_errors = cpu_to_be16(0xFFFF);
+ else
+ p->port_rcv_errors =
+ cpu_to_be16((u16) cntrs.port_rcv_errors);
+ if (cntrs.port_rcv_remphys_errors > 0xFFFFUL)
+ p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF);
+ else
+ p->port_rcv_remphys_errors =
+ cpu_to_be16((u16)cntrs.port_rcv_remphys_errors);
+ if (cntrs.port_xmit_discards > 0xFFFFUL)
+ p->port_xmit_discards = cpu_to_be16(0xFFFF);
+ else
+ p->port_xmit_discards =
+ cpu_to_be16((u16)cntrs.port_xmit_discards);
+ if (cntrs.local_link_integrity_errors > 0xFUL)
+ cntrs.local_link_integrity_errors = 0xFUL;
+ if (cntrs.excessive_buffer_overrun_errors > 0xFUL)
+ cntrs.excessive_buffer_overrun_errors = 0xFUL;
+ p->link_overrun_errors = (cntrs.local_link_integrity_errors << 4) |
+ cntrs.excessive_buffer_overrun_errors;
+ if (cntrs.vl15_dropped > 0xFFFFUL)
+ p->vl15_dropped = cpu_to_be16(0xFFFF);
+ else
+ p->vl15_dropped = cpu_to_be16((u16)cntrs.vl15_dropped);
+ if (cntrs.port_xmit_data > 0xFFFFFFFFUL)
+ p->port_xmit_data = cpu_to_be32(0xFFFFFFFF);
+ else
+ p->port_xmit_data = cpu_to_be32((u32)cntrs.port_xmit_data);
+ if (cntrs.port_rcv_data > 0xFFFFFFFFUL)
+ p->port_rcv_data = cpu_to_be32(0xFFFFFFFF);
+ else
+ p->port_rcv_data = cpu_to_be32((u32)cntrs.port_rcv_data);
+ if (cntrs.port_xmit_packets > 0xFFFFFFFFUL)
+ p->port_xmit_packets = cpu_to_be32(0xFFFFFFFF);
+ else
+ p->port_xmit_packets =
+ cpu_to_be32((u32)cntrs.port_xmit_packets);
+ if (cntrs.port_rcv_packets > 0xFFFFFFFFUL)
+ p->port_rcv_packets = cpu_to_be32(0xFFFFFFFF);
+ else
+ p->port_rcv_packets =
+ cpu_to_be32((u32) cntrs.port_rcv_packets);
+
+ return reply((struct ib_smp *) pmp);
+}
+
+static int pma_get_portcounters_cong(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port)
+{
+	/* Congestion PMA packets start at offset 24, not 64 */
+ struct ib_pma_portcounters_cong *p =
+ (struct ib_pma_portcounters_cong *)pmp->reserved;
+ struct qib_verbs_counters cntrs;
+ struct qib_ibport *ibp = to_iport(ibdev, port);
+ struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+ struct qib_devdata *dd = dd_from_ppd(ppd);
+ u32 port_select = be32_to_cpu(pmp->mad_hdr.attr_mod) & 0xFF;
+ u64 xmit_wait_counter;
+ unsigned long flags;
+
+ /*
+ * This check is performed only in the GET method because the
+ * SET method ends up calling this anyway.
+ */
+ if (!dd->psxmitwait_supported)
+ pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+ if (port_select != port)
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+
+ qib_get_counters(ppd, &cntrs);
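+	/*
+	 * The accumulated xmit-wait count is shared with the sampling
+	 * timer, so read it under the port lock.
+	 */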
+ spin_lock_irqsave(&ppd->ibport_data.lock, flags);
+ xmit_wait_counter = xmit_wait_get_value_delta(ppd);
+ spin_unlock_irqrestore(&ppd->ibport_data.lock, flags);
+
+ /* Adjust counters for any resets done. */
+ cntrs.symbol_error_counter -= ibp->z_symbol_error_counter;
+ cntrs.link_error_recovery_counter -=
+ ibp->z_link_error_recovery_counter;
+ cntrs.link_downed_counter -= ibp->z_link_downed_counter;
+ cntrs.port_rcv_errors -= ibp->z_port_rcv_errors;
+ cntrs.port_rcv_remphys_errors -=
+ ibp->z_port_rcv_remphys_errors;
+ cntrs.port_xmit_discards -= ibp->z_port_xmit_discards;
+ cntrs.local_link_integrity_errors -=
+ ibp->z_local_link_integrity_errors;
+ cntrs.excessive_buffer_overrun_errors -=
+ ibp->z_excessive_buffer_overrun_errors;
+ cntrs.vl15_dropped -= ibp->z_vl15_dropped;
+ cntrs.vl15_dropped += ibp->n_vl15_dropped;
+ cntrs.port_xmit_data -= ibp->z_port_xmit_data;
+ cntrs.port_rcv_data -= ibp->z_port_rcv_data;
+ cntrs.port_xmit_packets -= ibp->z_port_xmit_packets;
+ cntrs.port_rcv_packets -= ibp->z_port_rcv_packets;
+
+ memset(pmp->reserved, 0, sizeof(pmp->reserved) +
+ sizeof(pmp->data));
+
+ /*
+ * Set top 3 bits to indicate interval in picoseconds in
+ * remaining bits.
+ */
+ p->port_check_rate =
+ cpu_to_be16((QIB_XMIT_RATE_PICO << 13) |
+ (dd->psxmitwait_check_rate &
+ ~(QIB_XMIT_RATE_PICO << 13)));
+ p->port_adr_events = cpu_to_be64(0);
+ p->port_xmit_wait = cpu_to_be64(xmit_wait_counter);
+ p->port_xmit_data = cpu_to_be64(cntrs.port_xmit_data);
+ p->port_rcv_data = cpu_to_be64(cntrs.port_rcv_data);
+ p->port_xmit_packets =
+ cpu_to_be64(cntrs.port_xmit_packets);
+ p->port_rcv_packets =
+ cpu_to_be64(cntrs.port_rcv_packets);
+ if (cntrs.symbol_error_counter > 0xFFFFUL)
+ p->symbol_error_counter = cpu_to_be16(0xFFFF);
+ else
+ p->symbol_error_counter =
+ cpu_to_be16(
+ (u16)cntrs.symbol_error_counter);
+ if (cntrs.link_error_recovery_counter > 0xFFUL)
+ p->link_error_recovery_counter = 0xFF;
+ else
+ p->link_error_recovery_counter =
+ (u8)cntrs.link_error_recovery_counter;
+ if (cntrs.link_downed_counter > 0xFFUL)
+ p->link_downed_counter = 0xFF;
+ else
+ p->link_downed_counter =
+ (u8)cntrs.link_downed_counter;
+ if (cntrs.port_rcv_errors > 0xFFFFUL)
+ p->port_rcv_errors = cpu_to_be16(0xFFFF);
+ else
+ p->port_rcv_errors =
+ cpu_to_be16((u16) cntrs.port_rcv_errors);
+ if (cntrs.port_rcv_remphys_errors > 0xFFFFUL)
+ p->port_rcv_remphys_errors = cpu_to_be16(0xFFFF);
+ else
+ p->port_rcv_remphys_errors =
+ cpu_to_be16(
+ (u16)cntrs.port_rcv_remphys_errors);
+ if (cntrs.port_xmit_discards > 0xFFFFUL)
+ p->port_xmit_discards = cpu_to_be16(0xFFFF);
+ else
+ p->port_xmit_discards =
+ cpu_to_be16((u16)cntrs.port_xmit_discards);
+ if (cntrs.local_link_integrity_errors > 0xFUL)
+ cntrs.local_link_integrity_errors = 0xFUL;
+ if (cntrs.excessive_buffer_overrun_errors > 0xFUL)
+ cntrs.excessive_buffer_overrun_errors = 0xFUL;
+ p->link_overrun_errors = (cntrs.local_link_integrity_errors << 4) |
+ cntrs.excessive_buffer_overrun_errors;
+ if (cntrs.vl15_dropped > 0xFFFFUL)
+ p->vl15_dropped = cpu_to_be16(0xFFFF);
+ else
+ p->vl15_dropped = cpu_to_be16((u16)cntrs.vl15_dropped);
+
+ return reply((struct ib_smp *)pmp);
+}
+
+static void qib_snapshot_pmacounters(
+ struct qib_ibport *ibp,
+ struct qib_pma_counters *pmacounters)
+{
+ struct qib_pma_counters *p;
+ int cpu;
+
+ memset(pmacounters, 0, sizeof(*pmacounters));
+ for_each_possible_cpu(cpu) {
+ p = per_cpu_ptr(ibp->pmastats, cpu);
+ pmacounters->n_unicast_xmit += p->n_unicast_xmit;
+ pmacounters->n_unicast_rcv += p->n_unicast_rcv;
+ pmacounters->n_multicast_xmit += p->n_multicast_xmit;
+ pmacounters->n_multicast_rcv += p->n_multicast_rcv;
+ }
+}
+
+static int pma_get_portcounters_ext(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ib_pma_portcounters_ext *p =
+ (struct ib_pma_portcounters_ext *)pmp->data;
+ struct qib_ibport *ibp = to_iport(ibdev, port);
+ struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+ u64 swords, rwords, spkts, rpkts, xwait;
+ struct qib_pma_counters pma;
+ u8 port_select = p->port_select;
+
+ memset(pmp->data, 0, sizeof(pmp->data));
+
+ p->port_select = port_select;
+ if (pmp->mad_hdr.attr_mod != 0 || port_select != port) {
+ pmp->mad_hdr.status |= IB_SMP_INVALID_FIELD;
+ goto bail;
+ }
+
+ qib_snapshot_counters(ppd, &swords, &rwords, &spkts, &rpkts, &xwait);
+
+ /* Adjust counters for any resets done. */
+ swords -= ibp->z_port_xmit_data;
+ rwords -= ibp->z_port_rcv_data;
+ spkts -= ibp->z_port_xmit_packets;
+ rpkts -= ibp->z_port_rcv_packets;
+
+ p->port_xmit_data = cpu_to_be64(swords);
+ p->port_rcv_data = cpu_to_be64(rwords);
+ p->port_xmit_packets = cpu_to_be64(spkts);
+ p->port_rcv_packets = cpu_to_be64(rpkts);
+
+ qib_snapshot_pmacounters(ibp, &pma);
+
+ p->port_unicast_xmit_packets = cpu_to_be64(pma.n_unicast_xmit
+ - ibp->z_unicast_xmit);
+ p->port_unicast_rcv_packets = cpu_to_be64(pma.n_unicast_rcv
+ - ibp->z_unicast_rcv);
+ p->port_multicast_xmit_packets = cpu_to_be64(pma.n_multicast_xmit
+ - ibp->z_multicast_xmit);
+ p->port_multicast_rcv_packets = cpu_to_be64(pma.n_multicast_rcv
+ - ibp->z_multicast_rcv);
+
+bail:
+ return reply((struct ib_smp *) pmp);
+}
+
+static int pma_set_portcounters(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
+ pmp->data;
+ struct qib_ibport *ibp = to_iport(ibdev, port);
+ struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+ struct qib_verbs_counters cntrs;
+
+ /*
+ * Since the HW doesn't support clearing counters, we save the
+ * current count and subtract it from future responses.
+ */
+ qib_get_counters(ppd, &cntrs);
+
+ if (p->counter_select & IB_PMA_SEL_SYMBOL_ERROR)
+ ibp->z_symbol_error_counter = cntrs.symbol_error_counter;
+
+ if (p->counter_select & IB_PMA_SEL_LINK_ERROR_RECOVERY)
+ ibp->z_link_error_recovery_counter =
+ cntrs.link_error_recovery_counter;
+
+ if (p->counter_select & IB_PMA_SEL_LINK_DOWNED)
+ ibp->z_link_downed_counter = cntrs.link_downed_counter;
+
+ if (p->counter_select & IB_PMA_SEL_PORT_RCV_ERRORS)
+ ibp->z_port_rcv_errors = cntrs.port_rcv_errors;
+
+ if (p->counter_select & IB_PMA_SEL_PORT_RCV_REMPHYS_ERRORS)
+ ibp->z_port_rcv_remphys_errors =
+ cntrs.port_rcv_remphys_errors;
+
+ if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DISCARDS)
+ ibp->z_port_xmit_discards = cntrs.port_xmit_discards;
+
+ if (p->counter_select & IB_PMA_SEL_LOCAL_LINK_INTEGRITY_ERRORS)
+ ibp->z_local_link_integrity_errors =
+ cntrs.local_link_integrity_errors;
+
+ if (p->counter_select & IB_PMA_SEL_EXCESSIVE_BUFFER_OVERRUNS)
+ ibp->z_excessive_buffer_overrun_errors =
+ cntrs.excessive_buffer_overrun_errors;
+
+ if (p->counter_select & IB_PMA_SEL_PORT_VL15_DROPPED) {
+ ibp->n_vl15_dropped = 0;
+ ibp->z_vl15_dropped = cntrs.vl15_dropped;
+ }
+
+ if (p->counter_select & IB_PMA_SEL_PORT_XMIT_DATA)
+ ibp->z_port_xmit_data = cntrs.port_xmit_data;
+
+ if (p->counter_select & IB_PMA_SEL_PORT_RCV_DATA)
+ ibp->z_port_rcv_data = cntrs.port_rcv_data;
+
+ if (p->counter_select & IB_PMA_SEL_PORT_XMIT_PACKETS)
+ ibp->z_port_xmit_packets = cntrs.port_xmit_packets;
+
+ if (p->counter_select & IB_PMA_SEL_PORT_RCV_PACKETS)
+ ibp->z_port_rcv_packets = cntrs.port_rcv_packets;
+
+ return pma_get_portcounters(pmp, ibdev, port);
+}
+
+static int pma_set_portcounters_cong(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct qib_ibport *ibp = to_iport(ibdev, port);
+ struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+ struct qib_devdata *dd = dd_from_ppd(ppd);
+ struct qib_verbs_counters cntrs;
+ u32 counter_select = (be32_to_cpu(pmp->mad_hdr.attr_mod) >> 24) & 0xFF;
+ int ret = 0;
+ unsigned long flags;
+
+ qib_get_counters(ppd, &cntrs);
+ /* Get counter values before we save them */
+ ret = pma_get_portcounters_cong(pmp, ibdev, port);
+
+ if (counter_select & IB_PMA_SEL_CONG_XMIT) {
+ spin_lock_irqsave(&ppd->ibport_data.lock, flags);
+ ppd->cong_stats.counter = 0;
+ dd->f_set_cntr_sample(ppd, QIB_CONG_TIMER_PSINTERVAL,
+ 0x0);
+ spin_unlock_irqrestore(&ppd->ibport_data.lock, flags);
+ }
+ if (counter_select & IB_PMA_SEL_CONG_PORT_DATA) {
+ ibp->z_port_xmit_data = cntrs.port_xmit_data;
+ ibp->z_port_rcv_data = cntrs.port_rcv_data;
+ ibp->z_port_xmit_packets = cntrs.port_xmit_packets;
+ ibp->z_port_rcv_packets = cntrs.port_rcv_packets;
+ }
+ if (counter_select & IB_PMA_SEL_CONG_ALL) {
+ ibp->z_symbol_error_counter =
+ cntrs.symbol_error_counter;
+ ibp->z_link_error_recovery_counter =
+ cntrs.link_error_recovery_counter;
+ ibp->z_link_downed_counter =
+ cntrs.link_downed_counter;
+ ibp->z_port_rcv_errors = cntrs.port_rcv_errors;
+ ibp->z_port_rcv_remphys_errors =
+ cntrs.port_rcv_remphys_errors;
+ ibp->z_port_xmit_discards =
+ cntrs.port_xmit_discards;
+ ibp->z_local_link_integrity_errors =
+ cntrs.local_link_integrity_errors;
+ ibp->z_excessive_buffer_overrun_errors =
+ cntrs.excessive_buffer_overrun_errors;
+ ibp->n_vl15_dropped = 0;
+ ibp->z_vl15_dropped = cntrs.vl15_dropped;
+ }
+
+ return ret;
+}
+
+static int pma_set_portcounters_ext(struct ib_pma_mad *pmp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ib_pma_portcounters *p = (struct ib_pma_portcounters *)
+ pmp->data;
+ struct qib_ibport *ibp = to_iport(ibdev, port);
+ struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+ u64 swords, rwords, spkts, rpkts, xwait;
+ struct qib_pma_counters pma;
+
+ qib_snapshot_counters(ppd, &swords, &rwords, &spkts, &rpkts, &xwait);
+
+ if (p->counter_select & IB_PMA_SELX_PORT_XMIT_DATA)
+ ibp->z_port_xmit_data = swords;
+
+ if (p->counter_select & IB_PMA_SELX_PORT_RCV_DATA)
+ ibp->z_port_rcv_data = rwords;
+
+ if (p->counter_select & IB_PMA_SELX_PORT_XMIT_PACKETS)
+ ibp->z_port_xmit_packets = spkts;
+
+ if (p->counter_select & IB_PMA_SELX_PORT_RCV_PACKETS)
+ ibp->z_port_rcv_packets = rpkts;
+
+ qib_snapshot_pmacounters(ibp, &pma);
+
+ if (p->counter_select & IB_PMA_SELX_PORT_UNI_XMIT_PACKETS)
+ ibp->z_unicast_xmit = pma.n_unicast_xmit;
+
+ if (p->counter_select & IB_PMA_SELX_PORT_UNI_RCV_PACKETS)
+ ibp->z_unicast_rcv = pma.n_unicast_rcv;
+
+ if (p->counter_select & IB_PMA_SELX_PORT_MULTI_XMIT_PACKETS)
+ ibp->z_multicast_xmit = pma.n_multicast_xmit;
+
+ if (p->counter_select & IB_PMA_SELX_PORT_MULTI_RCV_PACKETS)
+ ibp->z_multicast_rcv = pma.n_multicast_rcv;
+
+ return pma_get_portcounters_ext(pmp, ibdev, port);
+}
+
+static int process_subn(struct ib_device *ibdev, int mad_flags,
+ u8 port, struct ib_mad *in_mad,
+ struct ib_mad *out_mad)
+{
+ struct ib_smp *smp = (struct ib_smp *)out_mad;
+ struct qib_ibport *ibp = to_iport(ibdev, port);
+ struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+ int ret;
+
+ *out_mad = *in_mad;
+ if (smp->class_version != 1) {
+ smp->status |= IB_SMP_UNSUP_VERSION;
+ ret = reply(smp);
+ goto bail;
+ }
+
+ ret = check_mkey(ibp, smp, mad_flags);
+ if (ret) {
+ u32 port_num = be32_to_cpu(smp->attr_mod);
+
+ /*
+ * If this is a get/set portinfo, we already check the
+ * M_Key if the MAD is for another port and the M_Key
+ * is OK on the receiving port. This check is needed
+ * to increment the error counters when the M_Key
+ * fails to match on *both* ports.
+ */
+ if (in_mad->mad_hdr.attr_id == IB_SMP_ATTR_PORT_INFO &&
+ (smp->method == IB_MGMT_METHOD_GET ||
+ smp->method == IB_MGMT_METHOD_SET) &&
+ port_num && port_num <= ibdev->phys_port_cnt &&
+ port != port_num)
+ (void) check_mkey(to_iport(ibdev, port_num), smp, 0);
+ ret = IB_MAD_RESULT_FAILURE;
+ goto bail;
+ }
+
+ switch (smp->method) {
+ case IB_MGMT_METHOD_GET:
+ switch (smp->attr_id) {
+ case IB_SMP_ATTR_NODE_DESC:
+ ret = subn_get_nodedescription(smp, ibdev);
+ goto bail;
+ case IB_SMP_ATTR_NODE_INFO:
+ ret = subn_get_nodeinfo(smp, ibdev, port);
+ goto bail;
+ case IB_SMP_ATTR_GUID_INFO:
+ ret = subn_get_guidinfo(smp, ibdev, port);
+ goto bail;
+ case IB_SMP_ATTR_PORT_INFO:
+ ret = subn_get_portinfo(smp, ibdev, port);
+ goto bail;
+ case IB_SMP_ATTR_PKEY_TABLE:
+ ret = subn_get_pkeytable(smp, ibdev, port);
+ goto bail;
+ case IB_SMP_ATTR_SL_TO_VL_TABLE:
+ ret = subn_get_sl_to_vl(smp, ibdev, port);
+ goto bail;
+ case IB_SMP_ATTR_VL_ARB_TABLE:
+ ret = subn_get_vl_arb(smp, ibdev, port);
+ goto bail;
+ case IB_SMP_ATTR_SM_INFO:
+ if (ibp->port_cap_flags & IB_PORT_SM_DISABLED) {
+ ret = IB_MAD_RESULT_SUCCESS |
+ IB_MAD_RESULT_CONSUMED;
+ goto bail;
+ }
+ if (ibp->port_cap_flags & IB_PORT_SM) {
+ ret = IB_MAD_RESULT_SUCCESS;
+ goto bail;
+ }
+ /* FALLTHROUGH */
+ default:
+ smp->status |= IB_SMP_UNSUP_METH_ATTR;
+ ret = reply(smp);
+ goto bail;
+ }
+
+ case IB_MGMT_METHOD_SET:
+ switch (smp->attr_id) {
+ case IB_SMP_ATTR_GUID_INFO:
+ ret = subn_set_guidinfo(smp, ibdev, port);
+ goto bail;
+ case IB_SMP_ATTR_PORT_INFO:
+ ret = subn_set_portinfo(smp, ibdev, port);
+ goto bail;
+ case IB_SMP_ATTR_PKEY_TABLE:
+ ret = subn_set_pkeytable(smp, ibdev, port);
+ goto bail;
+ case IB_SMP_ATTR_SL_TO_VL_TABLE:
+ ret = subn_set_sl_to_vl(smp, ibdev, port);
+ goto bail;
+ case IB_SMP_ATTR_VL_ARB_TABLE:
+ ret = subn_set_vl_arb(smp, ibdev, port);
+ goto bail;
+ case IB_SMP_ATTR_SM_INFO:
+ if (ibp->port_cap_flags & IB_PORT_SM_DISABLED) {
+ ret = IB_MAD_RESULT_SUCCESS |
+ IB_MAD_RESULT_CONSUMED;
+ goto bail;
+ }
+ if (ibp->port_cap_flags & IB_PORT_SM) {
+ ret = IB_MAD_RESULT_SUCCESS;
+ goto bail;
+ }
+ /* FALLTHROUGH */
+ default:
+ smp->status |= IB_SMP_UNSUP_METH_ATTR;
+ ret = reply(smp);
+ goto bail;
+ }
+
+ case IB_MGMT_METHOD_TRAP_REPRESS:
+ if (smp->attr_id == IB_SMP_ATTR_NOTICE)
+ ret = subn_trap_repress(smp, ibdev, port);
+ else {
+ smp->status |= IB_SMP_UNSUP_METH_ATTR;
+ ret = reply(smp);
+ }
+ goto bail;
+
+ case IB_MGMT_METHOD_TRAP:
+ case IB_MGMT_METHOD_REPORT:
+ case IB_MGMT_METHOD_REPORT_RESP:
+ case IB_MGMT_METHOD_GET_RESP:
+ /*
+ * The ib_mad module will call us to process responses
+ * before checking for other consumers.
+ * Just tell the caller to process it normally.
+ */
+ ret = IB_MAD_RESULT_SUCCESS;
+ goto bail;
+
+ case IB_MGMT_METHOD_SEND:
+ if (ib_get_smp_direction(smp) &&
+ smp->attr_id == QIB_VENDOR_IPG) {
+ ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PORT,
+ smp->data[0]);
+ ret = IB_MAD_RESULT_SUCCESS | IB_MAD_RESULT_CONSUMED;
+ } else
+ ret = IB_MAD_RESULT_SUCCESS;
+ goto bail;
+
+ default:
+ smp->status |= IB_SMP_UNSUP_METHOD;
+ ret = reply(smp);
+ }
+
+bail:
+ return ret;
+}
+
+static int process_perf(struct ib_device *ibdev, u8 port,
+ struct ib_mad *in_mad,
+ struct ib_mad *out_mad)
+{
+ struct ib_pma_mad *pmp = (struct ib_pma_mad *)out_mad;
+ int ret;
+
+ *out_mad = *in_mad;
+ if (pmp->mad_hdr.class_version != 1) {
+ pmp->mad_hdr.status |= IB_SMP_UNSUP_VERSION;
+ ret = reply((struct ib_smp *) pmp);
+ goto bail;
+ }
+
+ switch (pmp->mad_hdr.method) {
+ case IB_MGMT_METHOD_GET:
+ switch (pmp->mad_hdr.attr_id) {
+ case IB_PMA_CLASS_PORT_INFO:
+ ret = pma_get_classportinfo(pmp, ibdev);
+ goto bail;
+ case IB_PMA_PORT_SAMPLES_CONTROL:
+ ret = pma_get_portsamplescontrol(pmp, ibdev, port);
+ goto bail;
+ case IB_PMA_PORT_SAMPLES_RESULT:
+ ret = pma_get_portsamplesresult(pmp, ibdev, port);
+ goto bail;
+ case IB_PMA_PORT_SAMPLES_RESULT_EXT:
+ ret = pma_get_portsamplesresult_ext(pmp, ibdev, port);
+ goto bail;
+ case IB_PMA_PORT_COUNTERS:
+ ret = pma_get_portcounters(pmp, ibdev, port);
+ goto bail;
+ case IB_PMA_PORT_COUNTERS_EXT:
+ ret = pma_get_portcounters_ext(pmp, ibdev, port);
+ goto bail;
+ case IB_PMA_PORT_COUNTERS_CONG:
+ ret = pma_get_portcounters_cong(pmp, ibdev, port);
+ goto bail;
+ default:
+ pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+ ret = reply((struct ib_smp *) pmp);
+ goto bail;
+ }
+
+ case IB_MGMT_METHOD_SET:
+ switch (pmp->mad_hdr.attr_id) {
+ case IB_PMA_PORT_SAMPLES_CONTROL:
+ ret = pma_set_portsamplescontrol(pmp, ibdev, port);
+ goto bail;
+ case IB_PMA_PORT_COUNTERS:
+ ret = pma_set_portcounters(pmp, ibdev, port);
+ goto bail;
+ case IB_PMA_PORT_COUNTERS_EXT:
+ ret = pma_set_portcounters_ext(pmp, ibdev, port);
+ goto bail;
+ case IB_PMA_PORT_COUNTERS_CONG:
+ ret = pma_set_portcounters_cong(pmp, ibdev, port);
+ goto bail;
+ default:
+ pmp->mad_hdr.status |= IB_SMP_UNSUP_METH_ATTR;
+ ret = reply((struct ib_smp *) pmp);
+ goto bail;
+ }
+
+ case IB_MGMT_METHOD_TRAP:
+ case IB_MGMT_METHOD_GET_RESP:
+ /*
+ * The ib_mad module will call us to process responses
+ * before checking for other consumers.
+ * Just tell the caller to process it normally.
+ */
+ ret = IB_MAD_RESULT_SUCCESS;
+ goto bail;
+
+ default:
+ pmp->mad_hdr.status |= IB_SMP_UNSUP_METHOD;
+ ret = reply((struct ib_smp *) pmp);
+ }
+
+bail:
+ return ret;
+}
+
+static int cc_get_classportinfo(struct ib_cc_mad *ccp,
+ struct ib_device *ibdev)
+{
+ struct ib_cc_classportinfo_attr *p =
+ (struct ib_cc_classportinfo_attr *)ccp->mgmt_data;
+
+ memset(ccp->mgmt_data, 0, sizeof(ccp->mgmt_data));
+
+ p->base_version = 1;
+ p->class_version = 1;
+ p->cap_mask = 0;
+
+ /*
+ * Expected response time is 4.096 usec. * 2^18 == 1.073741824 sec.
+ */
+ p->resp_time_value = 18;
+
+ return reply((struct ib_smp *) ccp);
+}
+
+static int cc_get_congestion_info(struct ib_cc_mad *ccp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ib_cc_info_attr *p =
+ (struct ib_cc_info_attr *)ccp->mgmt_data;
+ struct qib_ibport *ibp = to_iport(ibdev, port);
+ struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+
+ memset(ccp->mgmt_data, 0, sizeof(ccp->mgmt_data));
+
+ p->congestion_info = 0;
+ p->control_table_cap = ppd->cc_max_table_entries;
+
+ return reply((struct ib_smp *) ccp);
+}
+
+static int cc_get_congestion_setting(struct ib_cc_mad *ccp,
+ struct ib_device *ibdev, u8 port)
+{
+ int i;
+ struct ib_cc_congestion_setting_attr *p =
+ (struct ib_cc_congestion_setting_attr *)ccp->mgmt_data;
+ struct qib_ibport *ibp = to_iport(ibdev, port);
+ struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+ struct ib_cc_congestion_entry_shadow *entries;
+
+ memset(ccp->mgmt_data, 0, sizeof(ccp->mgmt_data));
+
+ spin_lock(&ppd->cc_shadow_lock);
+
+ entries = ppd->congestion_entries_shadow->entries;
+ p->port_control = cpu_to_be16(
+ ppd->congestion_entries_shadow->port_control);
+ p->control_map = cpu_to_be16(
+ ppd->congestion_entries_shadow->control_map);
+ for (i = 0; i < IB_CC_CCS_ENTRIES; i++) {
+ p->entries[i].ccti_increase = entries[i].ccti_increase;
+ p->entries[i].ccti_timer = cpu_to_be16(entries[i].ccti_timer);
+ p->entries[i].trigger_threshold = entries[i].trigger_threshold;
+ p->entries[i].ccti_min = entries[i].ccti_min;
+ }
+
+ spin_unlock(&ppd->cc_shadow_lock);
+
+ return reply((struct ib_smp *) ccp);
+}
+
+static int cc_get_congestion_control_table(struct ib_cc_mad *ccp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ib_cc_table_attr *p =
+ (struct ib_cc_table_attr *)ccp->mgmt_data;
+ struct qib_ibport *ibp = to_iport(ibdev, port);
+ struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+ u32 cct_block_index = be32_to_cpu(ccp->attr_mod);
+ u32 max_cct_block;
+ u32 cct_entry;
+ struct ib_cc_table_entry_shadow *entries;
+ int i;
+
+ /* Is the table index more than what is supported? */
+ if (cct_block_index > IB_CC_TABLE_CAP_DEFAULT - 1)
+ goto bail;
+
+ memset(ccp->mgmt_data, 0, sizeof(ccp->mgmt_data));
+
+ spin_lock(&ppd->cc_shadow_lock);
+
+ max_cct_block =
+ (ppd->ccti_entries_shadow->ccti_last_entry + 1)/IB_CCT_ENTRIES;
+ max_cct_block = max_cct_block ? max_cct_block - 1 : 0;
+
+ if (cct_block_index > max_cct_block) {
+ spin_unlock(&ppd->cc_shadow_lock);
+ goto bail;
+ }
+
+ ccp->attr_mod = cpu_to_be32(cct_block_index);
+
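+	/*
+	 * ccti_limit reports the highest CCTI index covered by this
+	 * 64-entry block; cct_entry is then reduced to an in-block
+	 * offset for the copy loop below.
+	 */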
+ cct_entry = IB_CCT_ENTRIES * (cct_block_index + 1);
+
+ cct_entry--;
+
+ p->ccti_limit = cpu_to_be16(cct_entry);
+
+ entries = &ppd->ccti_entries_shadow->
+ entries[IB_CCT_ENTRIES * cct_block_index];
+ cct_entry %= IB_CCT_ENTRIES;
+
+ for (i = 0; i <= cct_entry; i++)
+ p->ccti_entries[i].entry = cpu_to_be16(entries[i].entry);
+
+ spin_unlock(&ppd->cc_shadow_lock);
+
+ return reply((struct ib_smp *) ccp);
+
+bail:
+ return reply_failure((struct ib_smp *) ccp);
+}
+
+static int cc_set_congestion_setting(struct ib_cc_mad *ccp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ib_cc_congestion_setting_attr *p =
+ (struct ib_cc_congestion_setting_attr *)ccp->mgmt_data;
+ struct qib_ibport *ibp = to_iport(ibdev, port);
+ struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+ int i;
+
+ ppd->cc_sl_control_map = be16_to_cpu(p->control_map);
+
+ for (i = 0; i < IB_CC_CCS_ENTRIES; i++) {
+ ppd->congestion_entries[i].ccti_increase =
+ p->entries[i].ccti_increase;
+
+ ppd->congestion_entries[i].ccti_timer =
+ be16_to_cpu(p->entries[i].ccti_timer);
+
+ ppd->congestion_entries[i].trigger_threshold =
+ p->entries[i].trigger_threshold;
+
+ ppd->congestion_entries[i].ccti_min =
+ p->entries[i].ccti_min;
+ }
+
+ return reply((struct ib_smp *) ccp);
+}
+
+static int cc_set_congestion_control_table(struct ib_cc_mad *ccp,
+ struct ib_device *ibdev, u8 port)
+{
+ struct ib_cc_table_attr *p =
+ (struct ib_cc_table_attr *)ccp->mgmt_data;
+ struct qib_ibport *ibp = to_iport(ibdev, port);
+ struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+ u32 cct_block_index = be32_to_cpu(ccp->attr_mod);
+ u32 cct_entry;
+ struct ib_cc_table_entry_shadow *entries;
+ int i;
+
+ /* Is the table index more than what is supported? */
+ if (cct_block_index > IB_CC_TABLE_CAP_DEFAULT - 1)
+ goto bail;
+
+ /* If this packet is the first in the sequence then
+ * zero the total table entry count.
+ */
+ if (be16_to_cpu(p->ccti_limit) < IB_CCT_ENTRIES)
+ ppd->total_cct_entry = 0;
+
+ cct_entry = (be16_to_cpu(p->ccti_limit))%IB_CCT_ENTRIES;
+
+	/* cct_entry is the in-block index of the last entry (0 to 63) */
+ ppd->total_cct_entry += (cct_entry + 1);
+
+ if (ppd->total_cct_entry > ppd->cc_supported_table_entries)
+ goto bail;
+
+ ppd->ccti_limit = be16_to_cpu(p->ccti_limit);
+
+ entries = ppd->ccti_entries + (IB_CCT_ENTRIES * cct_block_index);
+
+ for (i = 0; i <= cct_entry; i++)
+ entries[i].entry = be16_to_cpu(p->ccti_entries[i].entry);
+
+ spin_lock(&ppd->cc_shadow_lock);
+
+ ppd->ccti_entries_shadow->ccti_last_entry = ppd->total_cct_entry - 1;
+ memcpy(ppd->ccti_entries_shadow->entries, ppd->ccti_entries,
+ (ppd->total_cct_entry * sizeof(struct ib_cc_table_entry)));
+
+ ppd->congestion_entries_shadow->port_control = IB_CC_CCS_PC_SL_BASED;
+ ppd->congestion_entries_shadow->control_map = ppd->cc_sl_control_map;
+ memcpy(ppd->congestion_entries_shadow->entries, ppd->congestion_entries,
+ IB_CC_CCS_ENTRIES * sizeof(struct ib_cc_congestion_entry));
+
+ spin_unlock(&ppd->cc_shadow_lock);
+
+ return reply((struct ib_smp *) ccp);
+
+bail:
+ return reply_failure((struct ib_smp *) ccp);
+}
+
+static int check_cc_key(struct qib_ibport *ibp,
+ struct ib_cc_mad *ccp, int mad_flags)
+{
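+	/* CC_Key checking is not implemented; all keys are accepted. */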
+ return 0;
+}
+
+static int process_cc(struct ib_device *ibdev, int mad_flags,
+ u8 port, struct ib_mad *in_mad,
+ struct ib_mad *out_mad)
+{
+ struct ib_cc_mad *ccp = (struct ib_cc_mad *)out_mad;
+ struct qib_ibport *ibp = to_iport(ibdev, port);
+ int ret;
+
+ *out_mad = *in_mad;
+
+ if (ccp->class_version != 2) {
+ ccp->status |= IB_SMP_UNSUP_VERSION;
+ ret = reply((struct ib_smp *)ccp);
+ goto bail;
+ }
+
+ ret = check_cc_key(ibp, ccp, mad_flags);
+ if (ret)
+ goto bail;
+
+ switch (ccp->method) {
+ case IB_MGMT_METHOD_GET:
+ switch (ccp->attr_id) {
+ case IB_CC_ATTR_CLASSPORTINFO:
+ ret = cc_get_classportinfo(ccp, ibdev);
+ goto bail;
+
+ case IB_CC_ATTR_CONGESTION_INFO:
+ ret = cc_get_congestion_info(ccp, ibdev, port);
+ goto bail;
+
+ case IB_CC_ATTR_CA_CONGESTION_SETTING:
+ ret = cc_get_congestion_setting(ccp, ibdev, port);
+ goto bail;
+
+ case IB_CC_ATTR_CONGESTION_CONTROL_TABLE:
+ ret = cc_get_congestion_control_table(ccp, ibdev, port);
+ goto bail;
+
+ default:
+ ccp->status |= IB_SMP_UNSUP_METH_ATTR;
+ ret = reply((struct ib_smp *) ccp);
+ goto bail;
+ }
+
+ case IB_MGMT_METHOD_SET:
+ switch (ccp->attr_id) {
+ case IB_CC_ATTR_CA_CONGESTION_SETTING:
+ ret = cc_set_congestion_setting(ccp, ibdev, port);
+ goto bail;
+
+ case IB_CC_ATTR_CONGESTION_CONTROL_TABLE:
+ ret = cc_set_congestion_control_table(ccp, ibdev, port);
+ goto bail;
+
+ default:
+ ccp->status |= IB_SMP_UNSUP_METH_ATTR;
+ ret = reply((struct ib_smp *) ccp);
+ goto bail;
+ }
+
+ case IB_MGMT_METHOD_GET_RESP:
+ /*
+ * The ib_mad module will call us to process responses
+ * before checking for other consumers.
+ * Just tell the caller to process it normally.
+ */
+ ret = IB_MAD_RESULT_SUCCESS;
+ goto bail;
+
+ case IB_MGMT_METHOD_TRAP:
+ default:
+ ccp->status |= IB_SMP_UNSUP_METHOD;
+ ret = reply((struct ib_smp *) ccp);
+ }
+
+bail:
+ return ret;
+}
+
+/**
+ * qib_process_mad - process an incoming MAD packet
+ * @ibdev: the infiniband device this packet came in on
+ * @mad_flags: MAD flags
+ * @port: the port number this packet came in on
+ * @in_wc: the work completion entry for this packet
+ * @in_grh: the global route header for this packet
+ * @in_mad: the incoming MAD
+ * @out_mad: any outgoing MAD reply
+ *
+ * Returns IB_MAD_RESULT_SUCCESS if this is a MAD that we are not
+ * interested in processing.
+ *
+ * Note that the verbs framework has already done the MAD sanity checks,
+ * and hop count/pointer updating for IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE
+ * MADs.
+ *
+ * This is called by the ib_mad module.
+ */
+int qib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port,
+ struct ib_wc *in_wc, struct ib_grh *in_grh,
+ struct ib_mad *in_mad, struct ib_mad *out_mad)
+{
+ int ret;
+ struct qib_ibport *ibp = to_iport(ibdev, port);
+ struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+
+ switch (in_mad->mad_hdr.mgmt_class) {
+ case IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE:
+ case IB_MGMT_CLASS_SUBN_LID_ROUTED:
+ ret = process_subn(ibdev, mad_flags, port, in_mad, out_mad);
+ goto bail;
+
+ case IB_MGMT_CLASS_PERF_MGMT:
+ ret = process_perf(ibdev, port, in_mad, out_mad);
+ goto bail;
+
+ case IB_MGMT_CLASS_CONG_MGMT:
+ if (!ppd->congestion_entries_shadow ||
+ !qib_cc_table_size) {
+ ret = IB_MAD_RESULT_SUCCESS;
+ goto bail;
+ }
+ ret = process_cc(ibdev, mad_flags, port, in_mad, out_mad);
+ goto bail;
+
+ default:
+ ret = IB_MAD_RESULT_SUCCESS;
+ }
+
+bail:
+ return ret;
+}
+
+static void send_handler(struct ib_mad_agent *agent,
+ struct ib_mad_send_wc *mad_send_wc)
+{
+ ib_free_send_mad(mad_send_wc->send_buf);
+}
+
+static void xmit_wait_timer_func(unsigned long opaque)
+{
+ struct qib_pportdata *ppd = (struct qib_pportdata *)opaque;
+ struct qib_devdata *dd = dd_from_ppd(ppd);
+ unsigned long flags;
+ u8 status;
+
+ spin_lock_irqsave(&ppd->ibport_data.lock, flags);
+ if (ppd->cong_stats.flags == IB_PMA_CONG_HW_CONTROL_SAMPLE) {
+ status = dd->f_portcntr(ppd, QIBPORTCNTR_PSSTAT);
+ if (status == IB_PMA_SAMPLE_STATUS_DONE) {
+ /* save counter cache */
+ cache_hw_sample_counters(ppd);
+ ppd->cong_stats.flags = IB_PMA_CONG_HW_CONTROL_TIMER;
+ } else
+ goto done;
+ }
+ ppd->cong_stats.counter = xmit_wait_get_value_delta(ppd);
+ dd->f_set_cntr_sample(ppd, QIB_CONG_TIMER_PSINTERVAL, 0x0);
+done:
+ spin_unlock_irqrestore(&ppd->ibport_data.lock, flags);
+ mod_timer(&ppd->cong_stats.timer, jiffies + HZ);
+}
+
+int qib_create_agents(struct qib_ibdev *dev)
+{
+ struct qib_devdata *dd = dd_from_dev(dev);
+ struct ib_mad_agent *agent;
+ struct qib_ibport *ibp;
+ int p;
+ int ret;
+
+ for (p = 0; p < dd->num_pports; p++) {
+ ibp = &dd->pport[p].ibport_data;
+ agent = ib_register_mad_agent(&dev->ibdev, p + 1, IB_QPT_SMI,
+ NULL, 0, send_handler,
+ NULL, NULL, 0);
+ if (IS_ERR(agent)) {
+ ret = PTR_ERR(agent);
+ goto err;
+ }
+
+ /* Initialize xmit_wait structure */
+ dd->pport[p].cong_stats.counter = 0;
+ init_timer(&dd->pport[p].cong_stats.timer);
+ dd->pport[p].cong_stats.timer.function = xmit_wait_timer_func;
+ dd->pport[p].cong_stats.timer.data =
+ (unsigned long)(&dd->pport[p]);
+ dd->pport[p].cong_stats.timer.expires = 0;
+ add_timer(&dd->pport[p].cong_stats.timer);
+
+ ibp->send_agent = agent;
+ }
+
+ return 0;
+
+err:
+ for (p = 0; p < dd->num_pports; p++) {
+ ibp = &dd->pport[p].ibport_data;
+ if (ibp->send_agent) {
+ agent = ibp->send_agent;
+ ibp->send_agent = NULL;
+ ib_unregister_mad_agent(agent);
+ }
+ }
+
+ return ret;
+}
+
+void qib_free_agents(struct qib_ibdev *dev)
+{
+ struct qib_devdata *dd = dd_from_dev(dev);
+ struct ib_mad_agent *agent;
+ struct qib_ibport *ibp;
+ int p;
+
+ for (p = 0; p < dd->num_pports; p++) {
+ ibp = &dd->pport[p].ibport_data;
+ if (ibp->send_agent) {
+ agent = ibp->send_agent;
+ ibp->send_agent = NULL;
+ ib_unregister_mad_agent(agent);
+ }
+ if (ibp->sm_ah) {
+ ib_destroy_ah(&ibp->sm_ah->ibah);
+ ibp->sm_ah = NULL;
+ }
+ if (dd->pport[p].cong_stats.timer.data)
+ del_timer_sync(&dd->pport[p].cong_stats.timer);
+ }
+}
diff --git a/drivers/infiniband/hw/qib/qib_mad.h b/drivers/infiniband/hw/qib/qib_mad.h
new file mode 100644
index 000000000..941d4d50d
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_mad.h
@@ -0,0 +1,431 @@
+/*
+ * Copyright (c) 2012 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _QIB_MAD_H
+#define _QIB_MAD_H
+
+#include <rdma/ib_pma.h>
+
+#define IB_SMP_UNSUP_VERSION cpu_to_be16(0x0004)
+#define IB_SMP_UNSUP_METHOD cpu_to_be16(0x0008)
+#define IB_SMP_UNSUP_METH_ATTR cpu_to_be16(0x000C)
+#define IB_SMP_INVALID_FIELD cpu_to_be16(0x001C)
+
+struct ib_node_info {
+ u8 base_version;
+ u8 class_version;
+ u8 node_type;
+ u8 num_ports;
+ __be64 sys_guid;
+ __be64 node_guid;
+ __be64 port_guid;
+ __be16 partition_cap;
+ __be16 device_id;
+ __be32 revision;
+ u8 local_port_num;
+ u8 vendor_id[3];
+} __packed;
+
+struct ib_mad_notice_attr {
+ u8 generic_type;
+ u8 prod_type_msb;
+ __be16 prod_type_lsb;
+ __be16 trap_num;
+ __be16 issuer_lid;
+ __be16 toggle_count;
+
+ union {
+ struct {
+ u8 details[54];
+ } raw_data;
+
+ struct {
+ __be16 reserved;
+ __be16 lid; /* where violation happened */
+ u8 port_num; /* where violation happened */
+ } __packed ntc_129_131;
+
+ struct {
+ __be16 reserved;
+ __be16 lid; /* LID where change occurred */
+ u8 reserved2;
+ u8 local_changes; /* low bit - local changes */
+ __be32 new_cap_mask; /* new capability mask */
+ u8 reserved3;
+ u8 change_flags; /* low 3 bits only */
+ } __packed ntc_144;
+
+ struct {
+ __be16 reserved;
+ __be16 lid; /* lid where sys guid changed */
+ __be16 reserved2;
+ __be64 new_sys_guid;
+ } __packed ntc_145;
+
+ struct {
+ __be16 reserved;
+ __be16 lid;
+ __be16 dr_slid;
+ u8 method;
+ u8 reserved2;
+ __be16 attr_id;
+ __be32 attr_mod;
+ __be64 mkey;
+ u8 reserved3;
+ u8 dr_trunc_hop;
+ u8 dr_rtn_path[30];
+ } __packed ntc_256;
+
+ struct {
+ __be16 reserved;
+ __be16 lid1;
+ __be16 lid2;
+ __be32 key;
+ __be32 sl_qp1; /* SL: high 4 bits */
+ __be32 qp2; /* high 8 bits reserved */
+ union ib_gid gid1;
+ union ib_gid gid2;
+ } __packed ntc_257_258;
+
+ } details;
+};
+
+/*
+ * Generic trap/notice types
+ */
+#define IB_NOTICE_TYPE_FATAL 0x80
+#define IB_NOTICE_TYPE_URGENT 0x81
+#define IB_NOTICE_TYPE_SECURITY 0x82
+#define IB_NOTICE_TYPE_SM 0x83
+#define IB_NOTICE_TYPE_INFO 0x84
+
+/*
+ * Generic trap/notice producers
+ */
+#define IB_NOTICE_PROD_CA cpu_to_be16(1)
+#define IB_NOTICE_PROD_SWITCH cpu_to_be16(2)
+#define IB_NOTICE_PROD_ROUTER cpu_to_be16(3)
+#define IB_NOTICE_PROD_CLASS_MGR cpu_to_be16(4)
+
+/*
+ * Generic trap/notice numbers
+ */
+#define IB_NOTICE_TRAP_LLI_THRESH cpu_to_be16(129)
+#define IB_NOTICE_TRAP_EBO_THRESH cpu_to_be16(130)
+#define IB_NOTICE_TRAP_FLOW_UPDATE cpu_to_be16(131)
+#define IB_NOTICE_TRAP_CAP_MASK_CHG cpu_to_be16(144)
+#define IB_NOTICE_TRAP_SYS_GUID_CHG cpu_to_be16(145)
+#define IB_NOTICE_TRAP_BAD_MKEY cpu_to_be16(256)
+#define IB_NOTICE_TRAP_BAD_PKEY cpu_to_be16(257)
+#define IB_NOTICE_TRAP_BAD_QKEY cpu_to_be16(258)
+
+/*
+ * Repress trap/notice flags
+ */
+#define IB_NOTICE_REPRESS_LLI_THRESH (1 << 0)
+#define IB_NOTICE_REPRESS_EBO_THRESH (1 << 1)
+#define IB_NOTICE_REPRESS_FLOW_UPDATE (1 << 2)
+#define IB_NOTICE_REPRESS_CAP_MASK_CHG (1 << 3)
+#define IB_NOTICE_REPRESS_SYS_GUID_CHG (1 << 4)
+#define IB_NOTICE_REPRESS_BAD_MKEY (1 << 5)
+#define IB_NOTICE_REPRESS_BAD_PKEY (1 << 6)
+#define IB_NOTICE_REPRESS_BAD_QKEY (1 << 7)
+
+/*
+ * Generic trap/notice other local changes flags (trap 144).
+ */
+#define IB_NOTICE_TRAP_LSE_CHG 0x04 /* Link Speed Enable changed */
+#define IB_NOTICE_TRAP_LWE_CHG 0x02 /* Link Width Enable changed */
+#define IB_NOTICE_TRAP_NODE_DESC_CHG 0x01
+
+/*
+ * Generic trap/notice M_Key violation flags in dr_trunc_hop (trap 256).
+ */
+#define IB_NOTICE_TRAP_DR_NOTICE 0x80
+#define IB_NOTICE_TRAP_DR_TRUNC 0x40
+
+struct ib_vl_weight_elem {
+ u8 vl; /* Only low 4 bits, upper 4 bits reserved */
+ u8 weight;
+};
+
+#define IB_VLARB_LOWPRI_0_31 1
+#define IB_VLARB_LOWPRI_32_63 2
+#define IB_VLARB_HIGHPRI_0_31 3
+#define IB_VLARB_HIGHPRI_32_63 4
+
+#define IB_PMA_PORT_COUNTERS_CONG cpu_to_be16(0xFF00)
+
+struct ib_pma_portcounters_cong {
+ u8 reserved;
+ u8 reserved1;
+ __be16 port_check_rate;
+ __be16 symbol_error_counter;
+ u8 link_error_recovery_counter;
+ u8 link_downed_counter;
+ __be16 port_rcv_errors;
+ __be16 port_rcv_remphys_errors;
+ __be16 port_rcv_switch_relay_errors;
+ __be16 port_xmit_discards;
+ u8 port_xmit_constraint_errors;
+ u8 port_rcv_constraint_errors;
+ u8 reserved2;
+ u8 link_overrun_errors; /* LocalLink: 7:4, BufferOverrun: 3:0 */
+ __be16 reserved3;
+ __be16 vl15_dropped;
+ __be64 port_xmit_data;
+ __be64 port_rcv_data;
+ __be64 port_xmit_packets;
+ __be64 port_rcv_packets;
+ __be64 port_xmit_wait;
+ __be64 port_adr_events;
+} __packed;
+
+#define IB_PMA_CONG_HW_CONTROL_TIMER 0x00
+#define IB_PMA_CONG_HW_CONTROL_SAMPLE 0x01
+
+#define QIB_XMIT_RATE_UNSUPPORTED 0x0
+#define QIB_XMIT_RATE_PICO 0x7
+/* number of 4nsec cycles equaling 2secs */
+#define QIB_CONG_TIMER_PSINTERVAL 0x1DCD64EC
+
+#define IB_PMA_SEL_CONG_ALL 0x01
+#define IB_PMA_SEL_CONG_PORT_DATA 0x02
+#define IB_PMA_SEL_CONG_XMIT 0x04
+#define IB_PMA_SEL_CONG_ROUTING 0x08
+
+/*
+ * Congestion control class attributes
+ */
+#define IB_CC_ATTR_CLASSPORTINFO cpu_to_be16(0x0001)
+#define IB_CC_ATTR_NOTICE cpu_to_be16(0x0002)
+#define IB_CC_ATTR_CONGESTION_INFO cpu_to_be16(0x0011)
+#define IB_CC_ATTR_CONGESTION_KEY_INFO cpu_to_be16(0x0012)
+#define IB_CC_ATTR_CONGESTION_LOG cpu_to_be16(0x0013)
+#define IB_CC_ATTR_SWITCH_CONGESTION_SETTING cpu_to_be16(0x0014)
+#define IB_CC_ATTR_SWITCH_PORT_CONGESTION_SETTING cpu_to_be16(0x0015)
+#define IB_CC_ATTR_CA_CONGESTION_SETTING cpu_to_be16(0x0016)
+#define IB_CC_ATTR_CONGESTION_CONTROL_TABLE cpu_to_be16(0x0017)
+#define IB_CC_ATTR_TIME_STAMP cpu_to_be16(0x0018)
+
+/* generalizations for threshold values */
+#define IB_CC_THRESHOLD_NONE 0x0
+#define IB_CC_THRESHOLD_MIN 0x1
+#define IB_CC_THRESHOLD_MAX 0xf
+
+/* CCA MAD header constants */
+#define IB_CC_MAD_LOGDATA_LEN 32
+#define IB_CC_MAD_MGMTDATA_LEN 192
+
+struct ib_cc_mad {
+ u8 base_version;
+ u8 mgmt_class;
+ u8 class_version;
+ u8 method;
+ __be16 status;
+ __be16 class_specific;
+ __be64 tid;
+ __be16 attr_id;
+ __be16 resv;
+ __be32 attr_mod;
+ __be64 cckey;
+
+ /* For CongestionLog attribute only */
+ u8 log_data[IB_CC_MAD_LOGDATA_LEN];
+
+ u8 mgmt_data[IB_CC_MAD_MGMTDATA_LEN];
+} __packed;
+
+/*
+ * Congestion Control class portinfo capability mask bits
+ */
+#define IB_CC_CPI_CM_TRAP_GEN cpu_to_be16(1 << 0)
+#define IB_CC_CPI_CM_GET_SET_NOTICE cpu_to_be16(1 << 1)
+#define IB_CC_CPI_CM_CAP2 cpu_to_be16(1 << 2)
+#define IB_CC_CPI_CM_ENHANCEDPORT0_CC cpu_to_be16(1 << 8)
+
+struct ib_cc_classportinfo_attr {
+ u8 base_version;
+ u8 class_version;
+ __be16 cap_mask;
+ u8 reserved[3];
+ u8 resp_time_value; /* only lower 5 bits */
+ union ib_gid redirect_gid;
+ __be32 redirect_tc_sl_fl; /* 8, 4, 20 bits respectively */
+ __be16 redirect_lid;
+ __be16 redirect_pkey;
+ __be32 redirect_qp; /* only lower 24 bits */
+ __be32 redirect_qkey;
+ union ib_gid trap_gid;
+ __be32 trap_tc_sl_fl; /* 8, 4, 20 bits respectively */
+ __be16 trap_lid;
+ __be16 trap_pkey;
+ __be32 trap_hl_qp; /* 8, 24 bits respectively */
+ __be32 trap_qkey;
+} __packed;
+
+/* Congestion control traps */
+#define IB_CC_TRAP_KEY_VIOLATION 0x0000
+
+struct ib_cc_trap_key_violation_attr {
+ __be16 source_lid;
+ u8 method;
+ u8 reserved1;
+ __be16 attrib_id;
+ __be32 attrib_mod;
+ __be32 qp;
+ __be64 cckey;
+ u8 sgid[16];
+ u8 padding[24];
+} __packed;
+
+/* Congestion info flags */
+#define IB_CC_CI_FLAGS_CREDIT_STARVATION 0x1
+#define IB_CC_TABLE_CAP_DEFAULT 31
+
+struct ib_cc_info_attr {
+ __be16 congestion_info;
+ u8 control_table_cap; /* Multiple of 64 entry unit CCTs */
+} __packed;
+
+struct ib_cc_key_info_attr {
+ __be64 cckey;
+ u8 protect;
+ __be16 lease_period;
+ __be16 violations;
+} __packed;
+
+#define IB_CC_CL_CA_LOGEVENTS_LEN 208
+
+struct ib_cc_log_attr {
+ u8 log_type;
+ u8 congestion_flags;
+ __be16 threshold_event_counter;
+ __be16 threshold_congestion_event_map;
+ __be16 current_time_stamp;
+ u8 log_events[IB_CC_CL_CA_LOGEVENTS_LEN];
+} __packed;
+
+#define IB_CC_CLEC_SERVICETYPE_RC 0x0
+#define IB_CC_CLEC_SERVICETYPE_UC 0x1
+#define IB_CC_CLEC_SERVICETYPE_RD 0x2
+#define IB_CC_CLEC_SERVICETYPE_UD 0x3
+
+struct ib_cc_log_event {
+ u8 local_qp_cn_entry;
+ u8 remote_qp_number_cn_entry[3];
+ u8 sl_cn_entry:4;
+ u8 service_type_cn_entry:4;
+ __be32 remote_lid_cn_entry;
+ __be32 timestamp_cn_entry;
+} __packed;
+
+/* Sixteen congestion entries */
+#define IB_CC_CCS_ENTRIES 16
+
+/* Port control flags */
+#define IB_CC_CCS_PC_SL_BASED 0x01
+
+struct ib_cc_congestion_entry {
+ u8 ccti_increase;
+ __be16 ccti_timer;
+ u8 trigger_threshold;
+ u8 ccti_min; /* min CCTI for cc table */
+} __packed;
+
+struct ib_cc_congestion_entry_shadow {
+ u8 ccti_increase;
+ u16 ccti_timer;
+ u8 trigger_threshold;
+ u8 ccti_min; /* min CCTI for cc table */
+} __packed;
+
+struct ib_cc_congestion_setting_attr {
+ __be16 port_control;
+ __be16 control_map;
+ struct ib_cc_congestion_entry entries[IB_CC_CCS_ENTRIES];
+} __packed;
+
+struct ib_cc_congestion_setting_attr_shadow {
+ u16 port_control;
+ u16 control_map;
+ struct ib_cc_congestion_entry_shadow entries[IB_CC_CCS_ENTRIES];
+} __packed;
+
+#define IB_CC_TABLE_ENTRY_INCREASE_DEFAULT 1
+#define IB_CC_TABLE_ENTRY_TIMER_DEFAULT 1
+
+/* 64 Congestion Control table entries in a single MAD */
+#define IB_CCT_ENTRIES 64
+#define IB_CCT_MIN_ENTRIES (IB_CCT_ENTRIES * 2)
+
+struct ib_cc_table_entry {
+ __be16 entry; /* shift:2, multiplier:14 */
+};
+
+struct ib_cc_table_entry_shadow {
+ u16 entry; /* shift:2, multiplier:14 */
+};
+
+struct ib_cc_table_attr {
+ __be16 ccti_limit; /* max CCTI for cc table */
+ struct ib_cc_table_entry ccti_entries[IB_CCT_ENTRIES];
+} __packed;
+
+struct ib_cc_table_attr_shadow {
+ u16 ccti_limit; /* max CCTI for cc table */
+ struct ib_cc_table_entry_shadow ccti_entries[IB_CCT_ENTRIES];
+} __packed;
+
+#define CC_TABLE_SHADOW_MAX \
+ (IB_CC_TABLE_CAP_DEFAULT * IB_CCT_ENTRIES)
+
+struct cc_table_shadow {
+ u16 ccti_last_entry;
+ struct ib_cc_table_entry_shadow entries[CC_TABLE_SHADOW_MAX];
+} __packed;
+
+/*
+ * The PortSamplesControl.CounterMasks field is an array of 3-bit fields
+ * which specify the N'th counter's capabilities. See ch. 16.1.3.2.
+ * We support 5 counters which only count the mandatory quantities.
+ */
+#define COUNTER_MASK(q, n) (q << ((9 - n) * 3))
+#define COUNTER_MASK0_9 \
+ cpu_to_be32(COUNTER_MASK(1, 0) | \
+ COUNTER_MASK(1, 1) | \
+ COUNTER_MASK(1, 2) | \
+ COUNTER_MASK(1, 3) | \
+ COUNTER_MASK(1, 4))
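+
+/*
+ * Worked example (follows directly from the macro above): with q = 1 for
+ * each of the five mandatory counters, COUNTER_MASK(1, n) sets bit
+ * (9 - n) * 3, so COUNTER_MASK0_9 expands to
+ *
+ * cpu_to_be32((1 << 27) | (1 << 24) | (1 << 21) | (1 << 18) | (1 << 15))
+ * == cpu_to_be32(0x09248000)
+ */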
+
+#endif /* _QIB_MAD_H */
diff --git a/drivers/infiniband/hw/qib/qib_mmap.c b/drivers/infiniband/hw/qib/qib_mmap.c
new file mode 100644
index 000000000..146cf29a2
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_mmap.c
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/errno.h>
+#include <asm/pgtable.h>
+
+#include "qib_verbs.h"
+
+/**
+ * qib_release_mmap_info - free mmap info structure
+ * @ref: a pointer to the kref within struct qib_mmap_info
+ */
+void qib_release_mmap_info(struct kref *ref)
+{
+ struct qib_mmap_info *ip =
+ container_of(ref, struct qib_mmap_info, ref);
+ struct qib_ibdev *dev = to_idev(ip->context->device);
+
+ spin_lock_irq(&dev->pending_lock);
+ list_del(&ip->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+
+ vfree(ip->obj);
+ kfree(ip);
+}
+
+/*
+ * open and close keep track of how many times the memory is mapped,
+ * so it isn't released while a mapping is still in use.
+ */
+static void qib_vma_open(struct vm_area_struct *vma)
+{
+ struct qib_mmap_info *ip = vma->vm_private_data;
+
+ kref_get(&ip->ref);
+}
+
+static void qib_vma_close(struct vm_area_struct *vma)
+{
+ struct qib_mmap_info *ip = vma->vm_private_data;
+
+ kref_put(&ip->ref, qib_release_mmap_info);
+}
+
+static struct vm_operations_struct qib_vm_ops = {
+ .open = qib_vma_open,
+ .close = qib_vma_close,
+};
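+
+/*
+ * Lifetime sketch (illustrative only, not additional driver code): the
+ * creator holds the initial reference from kref_init(), every extra
+ * mapping of the VMA (e.g. across fork()) takes one in .open, every unmap
+ * drops one in .close, and qib_release_mmap_info() only runs when the
+ * last reference goes away:
+ *
+ * kref_init(&ip->ref);                          creator's reference
+ * kref_get(&ip->ref);                           qib_vma_open()
+ * kref_put(&ip->ref, qib_release_mmap_info);    qib_vma_close()
+ */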
+
+/**
+ * qib_mmap - create a new mmap region
+ * @context: the IB user context of the process making the mmap() call
+ * @vma: the VMA to be initialized
+ * Return zero if the mmap is OK. Otherwise, return an errno.
+ */
+int qib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
+{
+ struct qib_ibdev *dev = to_idev(context->device);
+ unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
+ unsigned long size = vma->vm_end - vma->vm_start;
+ struct qib_mmap_info *ip, *pp;
+ int ret = -EINVAL;
+
+ /*
+ * Search the device's list of objects waiting for a mmap call.
+ * Normally, this list is very short since a call to create a
+ * CQ, QP, or SRQ is soon followed by a call to mmap().
+ */
+ spin_lock_irq(&dev->pending_lock);
+ list_for_each_entry_safe(ip, pp, &dev->pending_mmaps,
+ pending_mmaps) {
+ /* Only the creator is allowed to mmap the object */
+ if (context != ip->context || (__u64) offset != ip->offset)
+ continue;
+ /* Don't allow a mmap larger than the object. */
+ if (size > ip->size)
+ break;
+
+ list_del_init(&ip->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+
+ ret = remap_vmalloc_range(vma, ip->obj, 0);
+ if (ret)
+ goto done;
+ vma->vm_ops = &qib_vm_ops;
+ vma->vm_private_data = ip;
+ qib_vma_open(vma);
+ goto done;
+ }
+ spin_unlock_irq(&dev->pending_lock);
+done:
+ return ret;
+}
+
+/*
+ * Allocate information for qib_mmap
+ */
+struct qib_mmap_info *qib_create_mmap_info(struct qib_ibdev *dev,
+ u32 size,
+ struct ib_ucontext *context,
+ void *obj)
+{
+ struct qib_mmap_info *ip;
+
+ ip = kmalloc(sizeof(*ip), GFP_KERNEL);
+ if (!ip)
+ goto bail;
+
+ size = PAGE_ALIGN(size);
+
+ spin_lock_irq(&dev->mmap_offset_lock);
+ if (dev->mmap_offset == 0)
+ dev->mmap_offset = PAGE_SIZE;
+ ip->offset = dev->mmap_offset;
+ dev->mmap_offset += size;
+ spin_unlock_irq(&dev->mmap_offset_lock);
+
+ INIT_LIST_HEAD(&ip->pending_mmaps);
+ ip->size = size;
+ ip->context = context;
+ ip->obj = obj;
+ kref_init(&ip->ref);
+
+bail:
+ return ip;
+}
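+
+/*
+ * Usage sketch (an assumption based on how qib_mmap() above scans
+ * dev->pending_mmaps; "resp" and "wq" are placeholder names): a creator of
+ * a CQ, QP, or SRQ that wants to export its work queue to user space wraps
+ * the vmalloc'ed object, reports ip->offset in its create response, and
+ * queues ip so the subsequent mmap() call can find it:
+ *
+ * ip = qib_create_mmap_info(dev, size, context, wq);
+ * if (!ip)
+ *         return -ENOMEM;
+ * resp.offset = ip->offset;
+ * spin_lock_irq(&dev->pending_lock);
+ * list_add(&ip->pending_mmaps, &dev->pending_mmaps);
+ * spin_unlock_irq(&dev->pending_lock);
+ */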
+
+void qib_update_mmap_info(struct qib_ibdev *dev, struct qib_mmap_info *ip,
+ u32 size, void *obj)
+{
+ size = PAGE_ALIGN(size);
+
+ spin_lock_irq(&dev->mmap_offset_lock);
+ if (dev->mmap_offset == 0)
+ dev->mmap_offset = PAGE_SIZE;
+ ip->offset = dev->mmap_offset;
+ dev->mmap_offset += size;
+ spin_unlock_irq(&dev->mmap_offset_lock);
+
+ ip->size = size;
+ ip->obj = obj;
+}
diff --git a/drivers/infiniband/hw/qib/qib_mr.c b/drivers/infiniband/hw/qib/qib_mr.c
new file mode 100644
index 000000000..c4473db46
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_mr.c
@@ -0,0 +1,532 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_umem.h>
+#include <rdma/ib_smi.h>
+
+#include "qib.h"
+
+/* Fast memory region */
+struct qib_fmr {
+ struct ib_fmr ibfmr;
+ struct qib_mregion mr; /* must be last */
+};
+
+static inline struct qib_fmr *to_ifmr(struct ib_fmr *ibfmr)
+{
+ return container_of(ibfmr, struct qib_fmr, ibfmr);
+}
+
+static int init_qib_mregion(struct qib_mregion *mr, struct ib_pd *pd,
+ int count)
+{
+ int m, i = 0;
+ int rval = 0;
+
+ m = (count + QIB_SEGSZ - 1) / QIB_SEGSZ;
+ for (; i < m; i++) {
+ mr->map[i] = kzalloc(sizeof(*mr->map[0]), GFP_KERNEL);
+ if (!mr->map[i])
+ goto bail;
+ }
+ mr->mapsz = m;
+ init_completion(&mr->comp);
+ /* hold a reference for the pointer returned to the caller */
+ atomic_set(&mr->refcount, 1);
+ mr->pd = pd;
+ mr->max_segs = count;
+out:
+ return rval;
+bail:
+ while (i)
+ kfree(mr->map[--i]);
+ rval = -ENOMEM;
+ goto out;
+}
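+
+/*
+ * Sizing sketch (QIB_SEGSZ is defined elsewhere in the driver; its exact
+ * value is not assumed here): the map array is sized by ceiling division,
+ *
+ * m = (count + QIB_SEGSZ - 1) / QIB_SEGSZ;
+ *
+ * so count == 0 gives m == 0 (no map pages at all, as used by
+ * qib_get_dma_mr() below) and count == QIB_SEGSZ + 1 gives m == 2.
+ */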
+
+static void deinit_qib_mregion(struct qib_mregion *mr)
+{
+ int i = mr->mapsz;
+
+ mr->mapsz = 0;
+ while (i)
+ kfree(mr->map[--i]);
+}
+
+
+/**
+ * qib_get_dma_mr - get a DMA memory region
+ * @pd: protection domain for this memory region
+ * @acc: access flags
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ * Note that all DMA addresses should be created via the
+ * struct ib_dma_mapping_ops functions (see qib_dma.c).
+ */
+struct ib_mr *qib_get_dma_mr(struct ib_pd *pd, int acc)
+{
+ struct qib_mr *mr = NULL;
+ struct ib_mr *ret;
+ int rval;
+
+ if (to_ipd(pd)->user) {
+ ret = ERR_PTR(-EPERM);
+ goto bail;
+ }
+
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (!mr) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+
+ rval = init_qib_mregion(&mr->mr, pd, 0);
+ if (rval) {
+ ret = ERR_PTR(rval);
+ goto bail;
+ }
+
+
+ rval = qib_alloc_lkey(&mr->mr, 1);
+ if (rval) {
+ ret = ERR_PTR(rval);
+ goto bail_mregion;
+ }
+
+ mr->mr.access_flags = acc;
+ ret = &mr->ibmr;
+done:
+ return ret;
+
+bail_mregion:
+ deinit_qib_mregion(&mr->mr);
+bail:
+ kfree(mr);
+ goto done;
+}
+
+static struct qib_mr *alloc_mr(int count, struct ib_pd *pd)
+{
+ struct qib_mr *mr;
+ int rval = -ENOMEM;
+ int m;
+
+ /* Allocate struct plus pointers to first level page tables. */
+ m = (count + QIB_SEGSZ - 1) / QIB_SEGSZ;
+ mr = kzalloc(sizeof(*mr) + m * sizeof(mr->mr.map[0]), GFP_KERNEL);
+ if (!mr)
+ goto bail;
+
+ rval = init_qib_mregion(&mr->mr, pd, count);
+ if (rval)
+ goto bail;
+ /*
+ * ib_reg_phys_mr() will initialize mr->ibmr except for
+ * lkey and rkey.
+ */
+ rval = qib_alloc_lkey(&mr->mr, 0);
+ if (rval)
+ goto bail_mregion;
+ mr->ibmr.lkey = mr->mr.lkey;
+ mr->ibmr.rkey = mr->mr.lkey;
+done:
+ return mr;
+
+bail_mregion:
+ deinit_qib_mregion(&mr->mr);
+bail:
+ kfree(mr);
+ mr = ERR_PTR(rval);
+ goto done;
+}
+
+/**
+ * qib_reg_phys_mr - register a physical memory region
+ * @pd: protection domain for this memory region
+ * @buffer_list: pointer to the list of physical buffers to register
+ * @num_phys_buf: the number of physical buffers to register
+ * @acc: access flags for this memory region
+ * @iova_start: the starting address passed over IB which maps to this MR
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_mr *qib_reg_phys_mr(struct ib_pd *pd,
+ struct ib_phys_buf *buffer_list,
+ int num_phys_buf, int acc, u64 *iova_start)
+{
+ struct qib_mr *mr;
+ int n, m, i;
+ struct ib_mr *ret;
+
+ mr = alloc_mr(num_phys_buf, pd);
+ if (IS_ERR(mr)) {
+ ret = (struct ib_mr *)mr;
+ goto bail;
+ }
+
+ mr->mr.user_base = *iova_start;
+ mr->mr.iova = *iova_start;
+ mr->mr.access_flags = acc;
+
+ m = 0;
+ n = 0;
+ for (i = 0; i < num_phys_buf; i++) {
+ mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr;
+ mr->mr.map[m]->segs[n].length = buffer_list[i].size;
+ mr->mr.length += buffer_list[i].size;
+ n++;
+ if (n == QIB_SEGSZ) {
+ m++;
+ n = 0;
+ }
+ }
+
+ ret = &mr->ibmr;
+
+bail:
+ return ret;
+}
+
+/**
+ * qib_reg_user_mr - register a userspace memory region
+ * @pd: protection domain for this memory region
+ * @start: starting userspace address
+ * @length: length of region to register
+ * @virt_addr: the IB virtual address which maps to this region
+ * @mr_access_flags: access flags for this memory region
+ * @udata: unused by the QLogic_IB driver
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt_addr, int mr_access_flags,
+ struct ib_udata *udata)
+{
+ struct qib_mr *mr;
+ struct ib_umem *umem;
+ struct scatterlist *sg;
+ int n, m, entry;
+ struct ib_mr *ret;
+
+ if (length == 0) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+
+ umem = ib_umem_get(pd->uobject->context, start, length,
+ mr_access_flags, 0);
+ if (IS_ERR(umem))
+ return (void *) umem;
+
+ n = umem->nmap;
+
+ mr = alloc_mr(n, pd);
+ if (IS_ERR(mr)) {
+ ret = (struct ib_mr *)mr;
+ ib_umem_release(umem);
+ goto bail;
+ }
+
+ mr->mr.user_base = start;
+ mr->mr.iova = virt_addr;
+ mr->mr.length = length;
+ mr->mr.offset = ib_umem_offset(umem);
+ mr->mr.access_flags = mr_access_flags;
+ mr->umem = umem;
+
+ if (is_power_of_2(umem->page_size))
+ mr->mr.page_shift = ilog2(umem->page_size);
+ m = 0;
+ n = 0;
+ for_each_sg(umem->sg_head.sgl, sg, umem->nmap, entry) {
+ void *vaddr;
+
+ vaddr = page_address(sg_page(sg));
+ if (!vaddr) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+ mr->mr.map[m]->segs[n].vaddr = vaddr;
+ mr->mr.map[m]->segs[n].length = umem->page_size;
+ n++;
+ if (n == QIB_SEGSZ) {
+ m++;
+ n = 0;
+ }
+ }
+ ret = &mr->ibmr;
+
+bail:
+ return ret;
+}
+
+/**
+ * qib_dereg_mr - unregister and free a memory region
+ * @ibmr: the memory region to free
+ *
+ * Returns 0 on success.
+ *
+ * Note that this is called to free MRs created by qib_get_dma_mr()
+ * or qib_reg_user_mr().
+ */
+int qib_dereg_mr(struct ib_mr *ibmr)
+{
+ struct qib_mr *mr = to_imr(ibmr);
+ int ret = 0;
+ unsigned long timeout;
+
+ qib_free_lkey(&mr->mr);
+
+ qib_put_mr(&mr->mr); /* will set completion if last */
+ timeout = wait_for_completion_timeout(&mr->mr.comp,
+ 5 * HZ);
+ if (!timeout) {
+ qib_get_mr(&mr->mr);
+ ret = -EBUSY;
+ goto out;
+ }
+ deinit_qib_mregion(&mr->mr);
+ if (mr->umem)
+ ib_umem_release(mr->umem);
+ kfree(mr);
+out:
+ return ret;
+}
+
+/*
+ * Allocate a memory region usable with the
+ * IB_WR_FAST_REG_MR send work request.
+ *
+ * Return the memory region on success, otherwise return an errno.
+ */
+struct ib_mr *qib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len)
+{
+ struct qib_mr *mr;
+
+ mr = alloc_mr(max_page_list_len, pd);
+ if (IS_ERR(mr))
+ return (struct ib_mr *)mr;
+
+ return &mr->ibmr;
+}
+
+struct ib_fast_reg_page_list *
+qib_alloc_fast_reg_page_list(struct ib_device *ibdev, int page_list_len)
+{
+ unsigned size = page_list_len * sizeof(u64);
+ struct ib_fast_reg_page_list *pl;
+
+ if (size > PAGE_SIZE)
+ return ERR_PTR(-EINVAL);
+
+ pl = kzalloc(sizeof(*pl), GFP_KERNEL);
+ if (!pl)
+ return ERR_PTR(-ENOMEM);
+
+ pl->page_list = kzalloc(size, GFP_KERNEL);
+ if (!pl->page_list)
+ goto err_free;
+
+ return pl;
+
+err_free:
+ kfree(pl);
+ return ERR_PTR(-ENOMEM);
+}
+
+void qib_free_fast_reg_page_list(struct ib_fast_reg_page_list *pl)
+{
+ kfree(pl->page_list);
+ kfree(pl);
+}
+
+/**
+ * qib_alloc_fmr - allocate a fast memory region
+ * @pd: the protection domain for this memory region
+ * @mr_access_flags: access flags for this memory region
+ * @fmr_attr: fast memory region attributes
+ *
+ * Returns the memory region on success, otherwise returns an errno.
+ */
+struct ib_fmr *qib_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr)
+{
+ struct qib_fmr *fmr;
+ int m;
+ struct ib_fmr *ret;
+ int rval = -ENOMEM;
+
+ /* Allocate struct plus pointers to first level page tables. */
+ m = (fmr_attr->max_pages + QIB_SEGSZ - 1) / QIB_SEGSZ;
+ fmr = kzalloc(sizeof(*fmr) + m * sizeof(fmr->mr.map[0]), GFP_KERNEL);
+ if (!fmr)
+ goto bail;
+
+ rval = init_qib_mregion(&fmr->mr, pd, fmr_attr->max_pages);
+ if (rval)
+ goto bail;
+
+ /*
+ * ib_alloc_fmr() will initialize fmr->ibfmr except for lkey &
+ * rkey.
+ */
+ rval = qib_alloc_lkey(&fmr->mr, 0);
+ if (rval)
+ goto bail_mregion;
+ fmr->ibfmr.rkey = fmr->mr.lkey;
+ fmr->ibfmr.lkey = fmr->mr.lkey;
+ /*
+ * Resources are allocated but no valid mapping (RKEY can't be
+ * used).
+ */
+ fmr->mr.access_flags = mr_access_flags;
+ fmr->mr.max_segs = fmr_attr->max_pages;
+ fmr->mr.page_shift = fmr_attr->page_shift;
+
+ ret = &fmr->ibfmr;
+done:
+ return ret;
+
+bail_mregion:
+ deinit_qib_mregion(&fmr->mr);
+bail:
+ kfree(fmr);
+ ret = ERR_PTR(rval);
+ goto done;
+}
+
+/**
+ * qib_map_phys_fmr - set up a fast memory region
+ * @ibfmr: the fast memory region to set up
+ * @page_list: the list of pages to associate with the fast memory region
+ * @list_len: the number of pages to associate with the fast memory region
+ * @iova: the virtual address of the start of the fast memory region
+ *
+ * This may be called from interrupt context.
+ */
+
+int qib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+ int list_len, u64 iova)
+{
+ struct qib_fmr *fmr = to_ifmr(ibfmr);
+ struct qib_lkey_table *rkt;
+ unsigned long flags;
+ int m, n, i;
+ u32 ps;
+ int ret;
+
+ i = atomic_read(&fmr->mr.refcount);
+ if (i > 2)
+ return -EBUSY;
+
+ if (list_len > fmr->mr.max_segs) {
+ ret = -EINVAL;
+ goto bail;
+ }
+ rkt = &to_idev(ibfmr->device)->lk_table;
+ spin_lock_irqsave(&rkt->lock, flags);
+ fmr->mr.user_base = iova;
+ fmr->mr.iova = iova;
+ ps = 1 << fmr->mr.page_shift;
+ fmr->mr.length = list_len * ps;
+ m = 0;
+ n = 0;
+ for (i = 0; i < list_len; i++) {
+ fmr->mr.map[m]->segs[n].vaddr = (void *) page_list[i];
+ fmr->mr.map[m]->segs[n].length = ps;
+ if (++n == QIB_SEGSZ) {
+ m++;
+ n = 0;
+ }
+ }
+ spin_unlock_irqrestore(&rkt->lock, flags);
+ ret = 0;
+
+bail:
+ return ret;
+}
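+
+/*
+ * Worked example of the length arithmetic above (values illustrative):
+ * with fmr->mr.page_shift == 12 and list_len == 8, ps == 1 << 12 == 4096
+ * and fmr->mr.length == 8 * 4096 == 32768 bytes, one segment per entry in
+ * page_list[].
+ */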
+
+/**
+ * qib_unmap_fmr - unmap fast memory regions
+ * @fmr_list: the list of fast memory regions to unmap
+ *
+ * Returns 0 on success.
+ */
+int qib_unmap_fmr(struct list_head *fmr_list)
+{
+ struct qib_fmr *fmr;
+ struct qib_lkey_table *rkt;
+ unsigned long flags;
+
+ list_for_each_entry(fmr, fmr_list, ibfmr.list) {
+ rkt = &to_idev(fmr->ibfmr.device)->lk_table;
+ spin_lock_irqsave(&rkt->lock, flags);
+ fmr->mr.user_base = 0;
+ fmr->mr.iova = 0;
+ fmr->mr.length = 0;
+ spin_unlock_irqrestore(&rkt->lock, flags);
+ }
+ return 0;
+}
+
+/**
+ * qib_dealloc_fmr - deallocate a fast memory region
+ * @ibfmr: the fast memory region to deallocate
+ *
+ * Returns 0 on success.
+ */
+int qib_dealloc_fmr(struct ib_fmr *ibfmr)
+{
+ struct qib_fmr *fmr = to_ifmr(ibfmr);
+ int ret = 0;
+ unsigned long timeout;
+
+ qib_free_lkey(&fmr->mr);
+ qib_put_mr(&fmr->mr); /* will set completion if last */
+ timeout = wait_for_completion_timeout(&fmr->mr.comp,
+ 5 * HZ);
+ if (!timeout) {
+ qib_get_mr(&fmr->mr);
+ ret = -EBUSY;
+ goto out;
+ }
+ deinit_qib_mregion(&fmr->mr);
+ kfree(fmr);
+out:
+ return ret;
+}
+
+void mr_rcu_callback(struct rcu_head *list)
+{
+ struct qib_mregion *mr = container_of(list, struct qib_mregion, list);
+
+ complete(&mr->comp);
+}
diff --git a/drivers/infiniband/hw/qib/qib_pcie.c b/drivers/infiniband/hw/qib/qib_pcie.c
new file mode 100644
index 000000000..4758a3801
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_pcie.c
@@ -0,0 +1,719 @@
+/*
+ * Copyright (c) 2008, 2009 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/vmalloc.h>
+#include <linux/aer.h>
+#include <linux/module.h>
+
+#include "qib.h"
+
+/*
+ * This file contains PCIe utility routines that are common to the
+ * various QLogic InfiniPath adapters
+ */
+
+/*
+ * Code to adjust PCIe capabilities.
+ * To minimize the change footprint, we call it
+ * from qib_pcie_params, which every chip-specific
+ * file calls, even though this violates some
+ * expectations of harmlessness.
+ */
+static void qib_tune_pcie_caps(struct qib_devdata *);
+static void qib_tune_pcie_coalesce(struct qib_devdata *);
+
+/*
+ * Do all the common PCIe setup and initialization.
+ * devdata is not yet allocated, and is not allocated until after this
+ * routine returns success. Therefore qib_dev_err() can't be used for error
+ * printing.
+ */
+int qib_pcie_init(struct pci_dev *pdev, const struct pci_device_id *ent)
+{
+ int ret;
+
+ ret = pci_enable_device(pdev);
+ if (ret) {
+ /*
+ * This can happen (in theory) iff:
+ * We did a chip reset, and then failed to reprogram the
+ * BAR, or the chip reset due to an internal error. We then
+ * unloaded the driver and reloaded it.
+ *
+ * Both reset cases set the BAR back to initial state. For
+ * the latter case, the AER sticky error bit at offset 0x718
+ * should be set, but the Linux kernel doesn't yet know
+ * about that, it appears. If the original BAR was retained
+ * in the kernel data structures, this may be OK.
+ */
+ qib_early_err(&pdev->dev, "pci enable failed: error %d\n",
+ -ret);
+ goto done;
+ }
+
+ ret = pci_request_regions(pdev, QIB_DRV_NAME);
+ if (ret) {
+ qib_devinfo(pdev, "pci_request_regions fails: err %d\n", -ret);
+ goto bail;
+ }
+
+ ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64));
+ if (ret) {
+ /*
+ * If the 64 bit setup fails, try 32 bit. Some systems
+ * with 2GB or less memory installed do not set up
+ * 64 bit maps.
+ */
+ ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32));
+ if (ret) {
+ qib_devinfo(pdev, "Unable to set DMA mask: %d\n", ret);
+ goto bail;
+ }
+ ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32));
+ } else
+ ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64));
+ if (ret) {
+ qib_early_err(&pdev->dev,
+ "Unable to set DMA consistent mask: %d\n", ret);
+ goto bail;
+ }
+
+ pci_set_master(pdev);
+ ret = pci_enable_pcie_error_reporting(pdev);
+ if (ret) {
+ qib_early_err(&pdev->dev,
+ "Unable to enable pcie error reporting: %d\n",
+ ret);
+ ret = 0;
+ }
+ goto done;
+
+bail:
+ pci_disable_device(pdev);
+ pci_release_regions(pdev);
+done:
+ return ret;
+}
+
+/*
+ * Do remaining PCIe setup, once dd is allocated, and save away
+ * fields required to re-initialize after a chip reset, or for
+ * various other purposes
+ */
+int qib_pcie_ddinit(struct qib_devdata *dd, struct pci_dev *pdev,
+ const struct pci_device_id *ent)
+{
+ unsigned long len;
+ resource_size_t addr;
+
+ dd->pcidev = pdev;
+ pci_set_drvdata(pdev, dd);
+
+ addr = pci_resource_start(pdev, 0);
+ len = pci_resource_len(pdev, 0);
+
+#if defined(__powerpc__)
+ /* There isn't a generic way to specify writethrough mappings */
+ dd->kregbase = __ioremap(addr, len, _PAGE_NO_CACHE | _PAGE_WRITETHRU);
+#else
+ dd->kregbase = ioremap_nocache(addr, len);
+#endif
+
+ if (!dd->kregbase)
+ return -ENOMEM;
+
+ dd->kregend = (u64 __iomem *)((void __iomem *) dd->kregbase + len);
+ dd->physaddr = addr; /* used for io_remap, etc. */
+
+ /*
+ * Save BARs to rewrite after device reset. Save all 64 bits of
+ * BAR, just in case.
+ */
+ dd->pcibar0 = addr;
+ dd->pcibar1 = addr >> 32;
+ dd->deviceid = ent->device; /* save for later use */
+ dd->vendorid = ent->vendor;
+
+ return 0;
+}
+
+/*
+ * Do PCIe cleanup, after chip-specific cleanup, etc. Just prior
+ * to releasing the dd memory.
+ * void because all of the core pcie cleanup calls return void
+ */
+void qib_pcie_ddcleanup(struct qib_devdata *dd)
+{
+ u64 __iomem *base = (void __iomem *) dd->kregbase;
+
+ dd->kregbase = NULL;
+ iounmap(base);
+ if (dd->piobase)
+ iounmap(dd->piobase);
+ if (dd->userbase)
+ iounmap(dd->userbase);
+ if (dd->piovl15base)
+ iounmap(dd->piovl15base);
+
+ pci_disable_device(dd->pcidev);
+ pci_release_regions(dd->pcidev);
+
+ pci_set_drvdata(dd->pcidev, NULL);
+}
+
+static void qib_msix_setup(struct qib_devdata *dd, int pos, u32 *msixcnt,
+ struct qib_msix_entry *qib_msix_entry)
+{
+ int ret;
+ int nvec = *msixcnt;
+ struct msix_entry *msix_entry;
+ int i;
+
+ ret = pci_msix_vec_count(dd->pcidev);
+ if (ret < 0)
+ goto do_intx;
+
+ nvec = min(nvec, ret);
+
+ /* We can't pass the qib_msix_entry array to pci_enable_msix_range()
+ * so use a dummy msix_entry array and copy the allocated
+ * irqs back into the qib_msix_entry array. */
+ msix_entry = kcalloc(nvec, sizeof(*msix_entry), GFP_KERNEL);
+ if (!msix_entry)
+ goto do_intx;
+
+ for (i = 0; i < nvec; i++)
+ msix_entry[i] = qib_msix_entry[i].msix;
+
+ ret = pci_enable_msix_range(dd->pcidev, msix_entry, 1, nvec);
+ if (ret < 0)
+ goto free_msix_entry;
+ else
+ nvec = ret;
+
+ for (i = 0; i < nvec; i++)
+ qib_msix_entry[i].msix = msix_entry[i];
+
+ kfree(msix_entry);
+ *msixcnt = nvec;
+ return;
+
+free_msix_entry:
+ kfree(msix_entry);
+
+do_intx:
+ qib_dev_err(
+ dd,
+ "pci_enable_msix_range %d vectors failed: %d, falling back to INTx\n",
+ nvec, ret);
+ *msixcnt = 0;
+ qib_enable_intx(dd->pcidev);
+}
+
+/**
+ * We save the msi lo and hi values, so we can restore them after
+ * chip reset (the kernel PCI infrastructure doesn't yet handle that
+ * correctly).
+ */
+static int qib_msi_setup(struct qib_devdata *dd, int pos)
+{
+ struct pci_dev *pdev = dd->pcidev;
+ u16 control;
+ int ret;
+
+ ret = pci_enable_msi(pdev);
+ if (ret)
+ qib_dev_err(dd,
+ "pci_enable_msi failed: %d, interrupts may not work\n",
+ ret);
+ /* continue even if it fails, we may still be OK... */
+
+ pci_read_config_dword(pdev, pos + PCI_MSI_ADDRESS_LO,
+ &dd->msi_lo);
+ pci_read_config_dword(pdev, pos + PCI_MSI_ADDRESS_HI,
+ &dd->msi_hi);
+ pci_read_config_word(pdev, pos + PCI_MSI_FLAGS, &control);
+ /* now save the data (vector) info */
+ pci_read_config_word(pdev, pos + ((control & PCI_MSI_FLAGS_64BIT)
+ ? 12 : 8),
+ &dd->msi_data);
+ return ret;
+}
+
+int qib_pcie_params(struct qib_devdata *dd, u32 minw, u32 *nent,
+ struct qib_msix_entry *entry)
+{
+ u16 linkstat, speed;
+ int pos = 0, ret = 1;
+
+ if (!pci_is_pcie(dd->pcidev)) {
+ qib_dev_err(dd, "Can't find PCI Express capability!\n");
+ /* set up something... */
+ dd->lbus_width = 1;
+ dd->lbus_speed = 2500; /* Gen1, 2.5GHz */
+ goto bail;
+ }
+
+ pos = dd->pcidev->msix_cap;
+ if (nent && *nent && pos) {
+ qib_msix_setup(dd, pos, nent, entry);
+ ret = 0; /* did it, either MSIx or INTx */
+ } else {
+ pos = dd->pcidev->msi_cap;
+ if (pos)
+ ret = qib_msi_setup(dd, pos);
+ else
+ qib_dev_err(dd, "No PCI MSI or MSIx capability!\n");
+ }
+ if (!pos)
+ qib_enable_intx(dd->pcidev);
+
+ pcie_capability_read_word(dd->pcidev, PCI_EXP_LNKSTA, &linkstat);
+ /*
+ * speed is bits 0-3, linkwidth is bits 4-8
+ * no defines for them in headers
+ */
+ speed = linkstat & 0xf;
+ linkstat >>= 4;
+ linkstat &= 0x1f;
+ dd->lbus_width = linkstat;
+
+ switch (speed) {
+ case 1:
+ dd->lbus_speed = 2500; /* Gen1, 2.5GHz */
+ break;
+ case 2:
+ dd->lbus_speed = 5000; /* Gen2, 5GHz */
+ break;
+ default: /* not defined, assume gen1 */
+ dd->lbus_speed = 2500;
+ break;
+ }
+
+ /*
+ * Check against expected pcie width and complain if "wrong"
+ * on first initialization, not afterwards (i.e., reset).
+ */
+ if (minw && linkstat < minw)
+ qib_dev_err(dd,
+ "PCIe width %u (x%u HCA), performance reduced\n",
+ linkstat, minw);
+
+ qib_tune_pcie_caps(dd);
+
+ qib_tune_pcie_coalesce(dd);
+
+bail:
+ /* fill in string, even on errors */
+ snprintf(dd->lbus_info, sizeof(dd->lbus_info),
+ "PCIe,%uMHz,x%u\n", dd->lbus_speed, dd->lbus_width);
+ return ret;
+}
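+
+/*
+ * Worked example of the link-status decode above (raw value illustrative):
+ * a LNKSTA reading of 0x0081 has speed field 0x1 and width field 0x8, so
+ * this sets lbus_speed = 2500, lbus_width = 8, and lbus_info to
+ * "PCIe,2500MHz,x8".
+ */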
+
+/*
+ * Set up pcie interrupt stuff again after a reset. I'd like to just call
+ * pci_enable_msi() again for msi, but when I do that,
+ * the MSI enable bit doesn't get set in the command word, and
+ * we switch to a different interrupt vector, which is confusing,
+ * so I instead just do it all inline. Perhaps we can somehow tie this
+ * into the PCIe hotplug support at some point.
+ */
+int qib_reinit_intr(struct qib_devdata *dd)
+{
+ int pos;
+ u16 control;
+ int ret = 0;
+
+ /* If we aren't using MSI, don't restore it */
+ if (!dd->msi_lo)
+ goto bail;
+
+ pos = dd->pcidev->msi_cap;
+ if (!pos) {
+ qib_dev_err(dd,
+ "Can't find MSI capability, can't restore MSI settings\n");
+ ret = 0;
+ /* nothing special for MSIx, just MSI */
+ goto bail;
+ }
+ pci_write_config_dword(dd->pcidev, pos + PCI_MSI_ADDRESS_LO,
+ dd->msi_lo);
+ pci_write_config_dword(dd->pcidev, pos + PCI_MSI_ADDRESS_HI,
+ dd->msi_hi);
+ pci_read_config_word(dd->pcidev, pos + PCI_MSI_FLAGS, &control);
+ if (!(control & PCI_MSI_FLAGS_ENABLE)) {
+ control |= PCI_MSI_FLAGS_ENABLE;
+ pci_write_config_word(dd->pcidev, pos + PCI_MSI_FLAGS,
+ control);
+ }
+ /* now rewrite the data (vector) info */
+ pci_write_config_word(dd->pcidev, pos +
+ ((control & PCI_MSI_FLAGS_64BIT) ? 12 : 8),
+ dd->msi_data);
+ ret = 1;
+bail:
+ if (!ret && (dd->flags & QIB_HAS_INTX)) {
+ qib_enable_intx(dd->pcidev);
+ ret = 1;
+ }
+
+ /* and now set the pci master bit again */
+ pci_set_master(dd->pcidev);
+
+ return ret;
+}
+
+/*
+ * Disable msi interrupt if enabled, and clear msi_lo.
+ * This is used primarily for the fallback to INTx, but
+ * is also used in reinit after reset, and during cleanup.
+ */
+void qib_nomsi(struct qib_devdata *dd)
+{
+ dd->msi_lo = 0;
+ pci_disable_msi(dd->pcidev);
+}
+
+/*
+ * Same as qib_nomsi, but for MSIx.
+ */
+void qib_nomsix(struct qib_devdata *dd)
+{
+ pci_disable_msix(dd->pcidev);
+}
+
+/*
+ * Similar to pci_intx(pdev, 1), except that we make sure
+ * msi(x) is off.
+ */
+void qib_enable_intx(struct pci_dev *pdev)
+{
+ u16 cw, new;
+ int pos;
+
+ /* first, turn on INTx */
+ pci_read_config_word(pdev, PCI_COMMAND, &cw);
+ new = cw & ~PCI_COMMAND_INTX_DISABLE;
+ if (new != cw)
+ pci_write_config_word(pdev, PCI_COMMAND, new);
+
+ pos = pdev->msi_cap;
+ if (pos) {
+ /* then turn off MSI */
+ pci_read_config_word(pdev, pos + PCI_MSI_FLAGS, &cw);
+ new = cw & ~PCI_MSI_FLAGS_ENABLE;
+ if (new != cw)
+ pci_write_config_word(pdev, pos + PCI_MSI_FLAGS, new);
+ }
+ pos = pdev->msix_cap;
+ if (pos) {
+ /* then turn off MSIx */
+ pci_read_config_word(pdev, pos + PCI_MSIX_FLAGS, &cw);
+ new = cw & ~PCI_MSIX_FLAGS_ENABLE;
+ if (new != cw)
+ pci_write_config_word(pdev, pos + PCI_MSIX_FLAGS, new);
+ }
+}
+
+/*
+ * These two routines are helper routines for the device reset code
+ * to move all the pcie code out of the chip-specific driver code.
+ */
+void qib_pcie_getcmd(struct qib_devdata *dd, u16 *cmd, u8 *iline, u8 *cline)
+{
+ pci_read_config_word(dd->pcidev, PCI_COMMAND, cmd);
+ pci_read_config_byte(dd->pcidev, PCI_INTERRUPT_LINE, iline);
+ pci_read_config_byte(dd->pcidev, PCI_CACHE_LINE_SIZE, cline);
+}
+
+void qib_pcie_reenable(struct qib_devdata *dd, u16 cmd, u8 iline, u8 cline)
+{
+ int r;
+
+ r = pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_0,
+ dd->pcibar0);
+ if (r)
+ qib_dev_err(dd, "rewrite of BAR0 failed: %d\n", r);
+ r = pci_write_config_dword(dd->pcidev, PCI_BASE_ADDRESS_1,
+ dd->pcibar1);
+ if (r)
+ qib_dev_err(dd, "rewrite of BAR1 failed: %d\n", r);
+ /* now re-enable memory access, and restore cosmetic settings */
+ pci_write_config_word(dd->pcidev, PCI_COMMAND, cmd);
+ pci_write_config_byte(dd->pcidev, PCI_INTERRUPT_LINE, iline);
+ pci_write_config_byte(dd->pcidev, PCI_CACHE_LINE_SIZE, cline);
+ r = pci_enable_device(dd->pcidev);
+ if (r)
+ qib_dev_err(dd,
+ "pci_enable_device failed after reset: %d\n", r);
+}
+
+
+static int qib_pcie_coalesce;
+module_param_named(pcie_coalesce, qib_pcie_coalesce, int, S_IRUGO);
+MODULE_PARM_DESC(pcie_coalesce, "tune PCIe coalescing on some Intel chipsets");
+
+/*
+ * Enable PCIe completion and data coalescing, on Intel 5x00 and 7300
+ * chipsets. This is known to be unsafe for some revisions of some
+ * of these chipsets, with some BIOS settings, and enabling it on those
+ * systems may result in the system crashing, and/or data corruption.
+ */
+static void qib_tune_pcie_coalesce(struct qib_devdata *dd)
+{
+ int r;
+ struct pci_dev *parent;
+ u16 devid;
+ u32 mask, bits, val;
+
+ if (!qib_pcie_coalesce)
+ return;
+
+ /* Find out supported and configured values for parent (root) */
+ parent = dd->pcidev->bus->self;
+ if (parent->bus->parent) {
+ qib_devinfo(dd->pcidev, "Parent not root\n");
+ return;
+ }
+ if (!pci_is_pcie(parent))
+ return;
+ if (parent->vendor != 0x8086)
+ return;
+
+ /*
+ * - bit 12: Max_rdcmp_Imt_EN: need to set to 1
+ * - bit 11: COALESCE_FORCE: need to set to 0
+ * - bit 10: COALESCE_EN: need to set to 1
+ * (but with limitations on some chipsets)
+ *
+ * On the Intel 5000, 5100, and 7300 chipsets, there is also:
+ * - bit 25:24: COALESCE_MODE, need to set to 0
+ */
+ devid = parent->device;
+ if (devid >= 0x25e2 && devid <= 0x25fa) {
+ /* 5000 P/V/X/Z */
+ if (parent->revision <= 0xb2)
+ bits = 1U << 10;
+ else
+ bits = 7U << 10;
+ mask = (3U << 24) | (7U << 10);
+ } else if (devid >= 0x65e2 && devid <= 0x65fa) {
+ /* 5100 */
+ bits = 1U << 10;
+ mask = (3U << 24) | (7U << 10);
+ } else if (devid >= 0x4021 && devid <= 0x402e) {
+ /* 5400 */
+ bits = 7U << 10;
+ mask = 7U << 10;
+ } else if (devid >= 0x3604 && devid <= 0x360a) {
+ /* 7300 */
+ bits = 7U << 10;
+ mask = (3U << 24) | (7U << 10);
+ } else {
+ /* not one of the chipsets that we know about */
+ return;
+ }
+ pci_read_config_dword(parent, 0x48, &val);
+ val &= ~mask;
+ val |= bits;
+ r = pci_write_config_dword(parent, 0x48, val);
+}
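+
+/*
+ * Worked example of the read-modify-write above (register value
+ * illustrative): on a 5100, bits = 1 << 10 and mask = (3 << 24) | (7 << 10)
+ * = 0x03001c00, so a value of 0x03000c00 read from offset 0x48 is written
+ * back as (0x03000c00 & ~0x03001c00) | (1 << 10) = 0x00000400.
+ */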
+
+/*
+ * BIOS may not set PCIe bus-utilization parameters for best performance.
+ * Check and optionally adjust them to maximize our throughput.
+ */
+static int qib_pcie_caps;
+module_param_named(pcie_caps, qib_pcie_caps, int, S_IRUGO);
+MODULE_PARM_DESC(pcie_caps, "Max PCIe tuning: Payload (0..3), ReadReq (4..7)");
+
+static void qib_tune_pcie_caps(struct qib_devdata *dd)
+{
+ struct pci_dev *parent;
+ u16 rc_mpss, rc_mps, ep_mpss, ep_mps;
+ u16 rc_mrrs, ep_mrrs, max_mrrs;
+
+ /* Find out supported and configured values for parent (root) */
+ parent = dd->pcidev->bus->self;
+ if (!pci_is_root_bus(parent->bus)) {
+ qib_devinfo(dd->pcidev, "Parent not root\n");
+ return;
+ }
+
+ if (!pci_is_pcie(parent) || !pci_is_pcie(dd->pcidev))
+ return;
+
+ rc_mpss = parent->pcie_mpss;
+ rc_mps = ffs(pcie_get_mps(parent)) - 8;
+ /* Find out supported and configured values for endpoint (us) */
+ ep_mpss = dd->pcidev->pcie_mpss;
+ ep_mps = ffs(pcie_get_mps(dd->pcidev)) - 8;
+
+ /* Find max payload supported by root, endpoint */
+ if (rc_mpss > ep_mpss)
+ rc_mpss = ep_mpss;
+
+ /* If Supported greater than limit in module param, limit it */
+ if (rc_mpss > (qib_pcie_caps & 7))
+ rc_mpss = qib_pcie_caps & 7;
+ /* If less than (allowed, supported), bump root payload */
+ if (rc_mpss > rc_mps) {
+ rc_mps = rc_mpss;
+ pcie_set_mps(parent, 128 << rc_mps);
+ }
+ /* If less than (allowed, supported), bump endpoint payload */
+ if (rc_mpss > ep_mps) {
+ ep_mps = rc_mpss;
+ pcie_set_mps(dd->pcidev, 128 << ep_mps);
+ }
+
+ /*
+ * Now the Read Request size.
+ * No field for max supported, but PCIe spec limits it to 4096,
+ * which is code '5' (log2(4096) - 7)
+ */
+ max_mrrs = 5;
+ if (max_mrrs > ((qib_pcie_caps >> 4) & 7))
+ max_mrrs = (qib_pcie_caps >> 4) & 7;
+
+ max_mrrs = 128 << max_mrrs;
+ rc_mrrs = pcie_get_readrq(parent);
+ ep_mrrs = pcie_get_readrq(dd->pcidev);
+
+ if (max_mrrs > rc_mrrs) {
+ rc_mrrs = max_mrrs;
+ pcie_set_readrq(parent, rc_mrrs);
+ }
+ if (max_mrrs > ep_mrrs) {
+ ep_mrrs = max_mrrs;
+ pcie_set_readrq(dd->pcidev, ep_mrrs);
+ }
+}
+/* End of PCIe capability tuning */
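+
+/*
+ * Worked example of the encodings above (parameter values illustrative):
+ * pcie_get_mps()/pcie_set_mps() work in bytes, so a 256 byte payload is
+ * code ffs(256) - 8 = 1 and decodes back as 128 << 1 = 256.  The spec
+ * maximum read request is code 5 (128 << 5 = 4096 bytes).  Since the
+ * routine only ever raises MPS/MRRS, the default pcie_caps of 0 leaves
+ * the BIOS values alone, while pcie_caps=0x51 permits raising the payload
+ * to 256 bytes (code 1) and the read request to 4096 bytes (code 5).
+ */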
+
+/*
+ * From here through qib_pci_err_handler definition is invoked via
+ * PCI error infrastructure, registered via pci
+ */
+static pci_ers_result_t
+qib_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
+{
+ struct qib_devdata *dd = pci_get_drvdata(pdev);
+ pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED;
+
+ switch (state) {
+ case pci_channel_io_normal:
+ qib_devinfo(pdev, "State Normal, ignoring\n");
+ break;
+
+ case pci_channel_io_frozen:
+ qib_devinfo(pdev, "State Frozen, requesting reset\n");
+ pci_disable_device(pdev);
+ ret = PCI_ERS_RESULT_NEED_RESET;
+ break;
+
+ case pci_channel_io_perm_failure:
+ qib_devinfo(pdev, "State Permanent Failure, disabling\n");
+ if (dd) {
+ /* no more register accesses! */
+ dd->flags &= ~QIB_PRESENT;
+ qib_disable_after_error(dd);
+ }
+ /* else early, or other problem */
+ ret = PCI_ERS_RESULT_DISCONNECT;
+ break;
+
+ default: /* shouldn't happen */
+ qib_devinfo(pdev, "QIB PCI errors detected (state %d)\n",
+ state);
+ break;
+ }
+ return ret;
+}
+
+static pci_ers_result_t
+qib_pci_mmio_enabled(struct pci_dev *pdev)
+{
+ u64 words = 0U;
+ struct qib_devdata *dd = pci_get_drvdata(pdev);
+ pci_ers_result_t ret = PCI_ERS_RESULT_RECOVERED;
+
+ if (dd && dd->pport) {
+ words = dd->f_portcntr(dd->pport, QIBPORTCNTR_WORDRCV);
+ if (words == ~0ULL)
+ ret = PCI_ERS_RESULT_NEED_RESET;
+ }
+ qib_devinfo(pdev,
+ "QIB mmio_enabled function called, read wordscntr %Lx, returning %d\n",
+ words, ret);
+ return ret;
+}
+
+static pci_ers_result_t
+qib_pci_slot_reset(struct pci_dev *pdev)
+{
+ qib_devinfo(pdev, "QIB slot_reset function called, ignored\n");
+ return PCI_ERS_RESULT_CAN_RECOVER;
+}
+
+static pci_ers_result_t
+qib_pci_link_reset(struct pci_dev *pdev)
+{
+ qib_devinfo(pdev, "QIB link_reset function called, ignored\n");
+ return PCI_ERS_RESULT_CAN_RECOVER;
+}
+
+static void
+qib_pci_resume(struct pci_dev *pdev)
+{
+ struct qib_devdata *dd = pci_get_drvdata(pdev);
+
+ qib_devinfo(pdev, "QIB resume function called\n");
+ pci_cleanup_aer_uncorrect_error_status(pdev);
+ /*
+ * Running jobs will fail, since it's asynchronous
+ * unlike sysfs-requested reset. Better than
+ * doing nothing.
+ */
+ qib_init(dd, 1); /* same as re-init after reset */
+}
+
+const struct pci_error_handlers qib_pci_err_handler = {
+ .error_detected = qib_pci_error_detected,
+ .mmio_enabled = qib_pci_mmio_enabled,
+ .link_reset = qib_pci_link_reset,
+ .slot_reset = qib_pci_slot_reset,
+ .resume = qib_pci_resume,
+};
diff --git a/drivers/infiniband/hw/qib/qib_pio_copy.c b/drivers/infiniband/hw/qib/qib_pio_copy.c
new file mode 100644
index 000000000..10b8c444d
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_pio_copy.c
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2009 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "qib.h"
+
+/**
+ * qib_pio_copy - copy data to MMIO space, in multiples of 32-bits
+ * @to: destination, in MMIO space (must be 64-bit aligned)
+ * @from: source (must be 64-bit aligned)
+ * @count: number of 32-bit quantities to copy
+ *
+ * Copy data from kernel space to MMIO space, in multiples of 32 bits at a
+ * time. Order of access is not guaranteed, nor is a memory barrier
+ * performed afterwards.
+ */
+void qib_pio_copy(void __iomem *to, const void *from, size_t count)
+{
+#ifdef CONFIG_64BIT
+ u64 __iomem *dst = to;
+ const u64 *src = from;
+ const u64 *end = src + (count >> 1);
+
+ while (src < end)
+ __raw_writeq(*src++, dst++);
+ if (count & 1)
+ __raw_writel(*(const u32 *)src, dst);
+#else
+ u32 __iomem *dst = to;
+ const u32 *src = from;
+ const u32 *end = src + count;
+
+ while (src < end)
+ __raw_writel(*src++, dst++);
+#endif
+}
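+
+/*
+ * Usage sketch (illustrative, buffer names assumed): source and
+ * destination must both be 64-bit aligned and the count is in 32-bit
+ * words; the caller supplies whatever ordering/flush it needs afterwards:
+ *
+ * u64 hdr[7];                     64-bit aligned scratch header
+ * u32 __iomem *piobuf;            64-bit aligned PIO send buffer
+ *
+ * qib_pio_copy(piobuf, hdr, 14);  copy 14 dwords
+ * wmb();                          caller's choice of barrier
+ */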
diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c
new file mode 100644
index 000000000..4fa88ba29
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_qp.c
@@ -0,0 +1,1376 @@
+/*
+ * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. * All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/vmalloc.h>
+#include <linux/jhash.h>
+#ifdef CONFIG_DEBUG_FS
+#include <linux/seq_file.h>
+#endif
+
+#include "qib.h"
+
+#define BITS_PER_PAGE (PAGE_SIZE*BITS_PER_BYTE)
+#define BITS_PER_PAGE_MASK (BITS_PER_PAGE-1)
+
+static inline unsigned mk_qpn(struct qib_qpn_table *qpt,
+ struct qpn_map *map, unsigned off)
+{
+ return (map - qpt->map) * BITS_PER_PAGE + off;
+}
+
+static inline unsigned find_next_offset(struct qib_qpn_table *qpt,
+ struct qpn_map *map, unsigned off,
+ unsigned n)
+{
+ if (qpt->mask) {
+ off++;
+ if (((off & qpt->mask) >> 1) >= n)
+ off = (off | qpt->mask) + 2;
+ } else
+ off = find_next_zero_bit(map->page, BITS_PER_PAGE, off);
+ return off;
+}
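+
+/*
+ * Worked example of the QPN <-> bitmap mapping (assuming 4 KiB pages, so
+ * BITS_PER_PAGE == 32768): QPN 40000 lives in qpt->map[40000 / 32768] ==
+ * map[1] at bit 40000 & BITS_PER_PAGE_MASK == 7232, and
+ * mk_qpn(qpt, &qpt->map[1], 7232) recovers 1 * 32768 + 7232 == 40000.
+ */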
+
+/*
+ * Convert the AETH credit code into the number of credits.
+ */
+static u32 credit_table[31] = {
+ 0, /* 0 */
+ 1, /* 1 */
+ 2, /* 2 */
+ 3, /* 3 */
+ 4, /* 4 */
+ 6, /* 5 */
+ 8, /* 6 */
+ 12, /* 7 */
+ 16, /* 8 */
+ 24, /* 9 */
+ 32, /* A */
+ 48, /* B */
+ 64, /* C */
+ 96, /* D */
+ 128, /* E */
+ 192, /* F */
+ 256, /* 10 */
+ 384, /* 11 */
+ 512, /* 12 */
+ 768, /* 13 */
+ 1024, /* 14 */
+ 1536, /* 15 */
+ 2048, /* 16 */
+ 3072, /* 17 */
+ 4096, /* 18 */
+ 6144, /* 19 */
+ 8192, /* 1A */
+ 12288, /* 1B */
+ 16384, /* 1C */
+ 24576, /* 1D */
+ 32768 /* 1E */
+};
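+
+/*
+ * Example (read straight from the table above): an AETH credit code of
+ * 0x14 indexes credit_table[20] and grants 1024 credits.
+ */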
+
+static void get_map_page(struct qib_qpn_table *qpt, struct qpn_map *map)
+{
+ unsigned long page = get_zeroed_page(GFP_KERNEL);
+
+ /*
+ * Free the page if someone raced with us installing it.
+ */
+
+ spin_lock(&qpt->lock);
+ if (map->page)
+ free_page(page);
+ else
+ map->page = (void *)page;
+ spin_unlock(&qpt->lock);
+}
+
+/*
+ * Allocate the next available QPN or
+ * zero/one for QP type IB_QPT_SMI/IB_QPT_GSI.
+ */
+static int alloc_qpn(struct qib_devdata *dd, struct qib_qpn_table *qpt,
+ enum ib_qp_type type, u8 port)
+{
+ u32 i, offset, max_scan, qpn;
+ struct qpn_map *map;
+ u32 ret;
+
+ if (type == IB_QPT_SMI || type == IB_QPT_GSI) {
+ unsigned n;
+
+ ret = type == IB_QPT_GSI;
+ n = 1 << (ret + 2 * (port - 1));
+ spin_lock(&qpt->lock);
+ if (qpt->flags & n)
+ ret = -EINVAL;
+ else
+ qpt->flags |= n;
+ spin_unlock(&qpt->lock);
+ goto bail;
+ }
+
+ qpn = qpt->last + 2;
+ if (qpn >= QPN_MAX)
+ qpn = 2;
+ if (qpt->mask && ((qpn & qpt->mask) >> 1) >= dd->n_krcv_queues)
+ qpn = (qpn | qpt->mask) + 2;
+ offset = qpn & BITS_PER_PAGE_MASK;
+ map = &qpt->map[qpn / BITS_PER_PAGE];
+ max_scan = qpt->nmaps - !offset;
+ for (i = 0;;) {
+ if (unlikely(!map->page)) {
+ get_map_page(qpt, map);
+ if (unlikely(!map->page))
+ break;
+ }
+ do {
+ if (!test_and_set_bit(offset, map->page)) {
+ qpt->last = qpn;
+ ret = qpn;
+ goto bail;
+ }
+ offset = find_next_offset(qpt, map, offset,
+ dd->n_krcv_queues);
+ qpn = mk_qpn(qpt, map, offset);
+ /*
+ * This test differs from alloc_pidmap().
+ * If find_next_offset() does find a zero
+ * bit, we don't need to check for QPN
+ * wrapping around past our starting QPN.
+ * We just need to be sure we don't loop
+ * forever.
+ */
+ } while (offset < BITS_PER_PAGE && qpn < QPN_MAX);
+ /*
+ * In order to keep the number of pages allocated to a
+ * minimum, we scan all the existing pages before increasing
+ * the size of the bitmap table.
+ */
+ if (++i > max_scan) {
+ if (qpt->nmaps == QPNMAP_ENTRIES)
+ break;
+ map = &qpt->map[qpt->nmaps++];
+ offset = 0;
+ } else if (map < &qpt->map[qpt->nmaps]) {
+ ++map;
+ offset = 0;
+ } else {
+ map = &qpt->map[0];
+ offset = 2;
+ }
+ qpn = mk_qpn(qpt, map, offset);
+ }
+
+ ret = -ENOMEM;
+
+bail:
+ return ret;
+}
+
+static void free_qpn(struct qib_qpn_table *qpt, u32 qpn)
+{
+ struct qpn_map *map;
+
+ map = qpt->map + qpn / BITS_PER_PAGE;
+ if (map->page)
+ clear_bit(qpn & BITS_PER_PAGE_MASK, map->page);
+}
+
+static inline unsigned qpn_hash(struct qib_ibdev *dev, u32 qpn)
+{
+ return jhash_1word(qpn, dev->qp_rnd) &
+ (dev->qp_table_size - 1);
+}
+
+
+/*
+ * Put the QP into the hash table.
+ * The hash table holds a reference to the QP.
+ */
+static void insert_qp(struct qib_ibdev *dev, struct qib_qp *qp)
+{
+ struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ unsigned long flags;
+ unsigned n = qpn_hash(dev, qp->ibqp.qp_num);
+
+ atomic_inc(&qp->refcount);
+ spin_lock_irqsave(&dev->qpt_lock, flags);
+
+ if (qp->ibqp.qp_num == 0)
+ rcu_assign_pointer(ibp->qp0, qp);
+ else if (qp->ibqp.qp_num == 1)
+ rcu_assign_pointer(ibp->qp1, qp);
+ else {
+ qp->next = dev->qp_table[n];
+ rcu_assign_pointer(dev->qp_table[n], qp);
+ }
+
+ spin_unlock_irqrestore(&dev->qpt_lock, flags);
+}
+
+/*
+ * Remove the QP from the table so it can't be found asynchronously by
+ * the receive interrupt routine.
+ */
+static void remove_qp(struct qib_ibdev *dev, struct qib_qp *qp)
+{
+ struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ unsigned n = qpn_hash(dev, qp->ibqp.qp_num);
+ unsigned long flags;
+ int removed = 1;
+
+ spin_lock_irqsave(&dev->qpt_lock, flags);
+
+ if (rcu_dereference_protected(ibp->qp0,
+ lockdep_is_held(&dev->qpt_lock)) == qp) {
+ RCU_INIT_POINTER(ibp->qp0, NULL);
+ } else if (rcu_dereference_protected(ibp->qp1,
+ lockdep_is_held(&dev->qpt_lock)) == qp) {
+ RCU_INIT_POINTER(ibp->qp1, NULL);
+ } else {
+ struct qib_qp *q;
+ struct qib_qp __rcu **qpp;
+
+ removed = 0;
+ qpp = &dev->qp_table[n];
+ for (; (q = rcu_dereference_protected(*qpp,
+ lockdep_is_held(&dev->qpt_lock))) != NULL;
+ qpp = &q->next)
+ if (q == qp) {
+ RCU_INIT_POINTER(*qpp,
+ rcu_dereference_protected(qp->next,
+ lockdep_is_held(&dev->qpt_lock)));
+ removed = 1;
+ break;
+ }
+ }
+
+ spin_unlock_irqrestore(&dev->qpt_lock, flags);
+ if (removed) {
+ synchronize_rcu();
+ atomic_dec(&qp->refcount);
+ }
+}
+
+/**
+ * qib_free_all_qps - check for QPs still in use
+ * @dd: the qlogic_ib device whose QP tables to check and empty
+ *
+ * There should not be any QPs still in use.
+ * Free memory for table.
+ */
+unsigned qib_free_all_qps(struct qib_devdata *dd)
+{
+ struct qib_ibdev *dev = &dd->verbs_dev;
+ unsigned long flags;
+ struct qib_qp *qp;
+ unsigned n, qp_inuse = 0;
+
+ for (n = 0; n < dd->num_pports; n++) {
+ struct qib_ibport *ibp = &dd->pport[n].ibport_data;
+
+ if (!qib_mcast_tree_empty(ibp))
+ qp_inuse++;
+ rcu_read_lock();
+ if (rcu_dereference(ibp->qp0))
+ qp_inuse++;
+ if (rcu_dereference(ibp->qp1))
+ qp_inuse++;
+ rcu_read_unlock();
+ }
+
+ spin_lock_irqsave(&dev->qpt_lock, flags);
+ for (n = 0; n < dev->qp_table_size; n++) {
+ qp = rcu_dereference_protected(dev->qp_table[n],
+ lockdep_is_held(&dev->qpt_lock));
+ RCU_INIT_POINTER(dev->qp_table[n], NULL);
+
+ for (; qp; qp = rcu_dereference_protected(qp->next,
+ lockdep_is_held(&dev->qpt_lock)))
+ qp_inuse++;
+ }
+ spin_unlock_irqrestore(&dev->qpt_lock, flags);
+ synchronize_rcu();
+
+ return qp_inuse;
+}
+
+/**
+ * qib_lookup_qpn - return the QP with the given QPN
+ * @ibp: the IB port the QP number belongs to
+ * @qpn: the QP number to look up
+ *
+ * The caller is responsible for decrementing the QP reference count
+ * when done.
+ */
+struct qib_qp *qib_lookup_qpn(struct qib_ibport *ibp, u32 qpn)
+{
+ struct qib_qp *qp = NULL;
+
+ rcu_read_lock();
+ if (unlikely(qpn <= 1)) {
+ if (qpn == 0)
+ qp = rcu_dereference(ibp->qp0);
+ else
+ qp = rcu_dereference(ibp->qp1);
+ if (qp)
+ atomic_inc(&qp->refcount);
+ } else {
+ struct qib_ibdev *dev = &ppd_from_ibp(ibp)->dd->verbs_dev;
+ unsigned n = qpn_hash(dev, qpn);
+
+ for (qp = rcu_dereference(dev->qp_table[n]); qp;
+ qp = rcu_dereference(qp->next))
+ if (qp->ibqp.qp_num == qpn) {
+ atomic_inc(&qp->refcount);
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return qp;
+}
+
+/**
+ * qib_reset_qp - initialize the QP state to the reset state
+ * @qp: the QP to reset
+ * @type: the QP type
+ */
+static void qib_reset_qp(struct qib_qp *qp, enum ib_qp_type type)
+{
+ qp->remote_qpn = 0;
+ qp->qkey = 0;
+ qp->qp_access_flags = 0;
+ atomic_set(&qp->s_dma_busy, 0);
+ qp->s_flags &= QIB_S_SIGNAL_REQ_WR;
+ qp->s_hdrwords = 0;
+ qp->s_wqe = NULL;
+ qp->s_draining = 0;
+ qp->s_next_psn = 0;
+ qp->s_last_psn = 0;
+ qp->s_sending_psn = 0;
+ qp->s_sending_hpsn = 0;
+ qp->s_psn = 0;
+ qp->r_psn = 0;
+ qp->r_msn = 0;
+ if (type == IB_QPT_RC) {
+ qp->s_state = IB_OPCODE_RC_SEND_LAST;
+ qp->r_state = IB_OPCODE_RC_SEND_LAST;
+ } else {
+ qp->s_state = IB_OPCODE_UC_SEND_LAST;
+ qp->r_state = IB_OPCODE_UC_SEND_LAST;
+ }
+ qp->s_ack_state = IB_OPCODE_RC_ACKNOWLEDGE;
+ qp->r_nak_state = 0;
+ qp->r_aflags = 0;
+ qp->r_flags = 0;
+ qp->s_head = 0;
+ qp->s_tail = 0;
+ qp->s_cur = 0;
+ qp->s_acked = 0;
+ qp->s_last = 0;
+ qp->s_ssn = 1;
+ qp->s_lsn = 0;
+ qp->s_mig_state = IB_MIG_MIGRATED;
+ memset(qp->s_ack_queue, 0, sizeof(qp->s_ack_queue));
+ qp->r_head_ack_queue = 0;
+ qp->s_tail_ack_queue = 0;
+ qp->s_num_rd_atomic = 0;
+ if (qp->r_rq.wq) {
+ qp->r_rq.wq->head = 0;
+ qp->r_rq.wq->tail = 0;
+ }
+ qp->r_sge.num_sge = 0;
+}
+
+static void clear_mr_refs(struct qib_qp *qp, int clr_sends)
+{
+ unsigned n;
+
+ if (test_and_clear_bit(QIB_R_REWIND_SGE, &qp->r_aflags))
+ qib_put_ss(&qp->s_rdma_read_sge);
+
+ qib_put_ss(&qp->r_sge);
+
+ if (clr_sends) {
+ while (qp->s_last != qp->s_head) {
+ struct qib_swqe *wqe = get_swqe_ptr(qp, qp->s_last);
+ unsigned i;
+
+ for (i = 0; i < wqe->wr.num_sge; i++) {
+ struct qib_sge *sge = &wqe->sg_list[i];
+
+ qib_put_mr(sge->mr);
+ }
+ if (qp->ibqp.qp_type == IB_QPT_UD ||
+ qp->ibqp.qp_type == IB_QPT_SMI ||
+ qp->ibqp.qp_type == IB_QPT_GSI)
+ atomic_dec(&to_iah(wqe->wr.wr.ud.ah)->refcount);
+ if (++qp->s_last >= qp->s_size)
+ qp->s_last = 0;
+ }
+ if (qp->s_rdma_mr) {
+ qib_put_mr(qp->s_rdma_mr);
+ qp->s_rdma_mr = NULL;
+ }
+ }
+
+ if (qp->ibqp.qp_type != IB_QPT_RC)
+ return;
+
+ for (n = 0; n < ARRAY_SIZE(qp->s_ack_queue); n++) {
+ struct qib_ack_entry *e = &qp->s_ack_queue[n];
+
+ if (e->opcode == IB_OPCODE_RC_RDMA_READ_REQUEST &&
+ e->rdma_sge.mr) {
+ qib_put_mr(e->rdma_sge.mr);
+ e->rdma_sge.mr = NULL;
+ }
+ }
+}
+
+/**
+ * qib_error_qp - put a QP into the error state
+ * @qp: the QP to put into the error state
+ * @err: the receive completion error to signal if a RWQE is active
+ *
+ * Flushes both send and receive work queues.
+ * Returns true if last WQE event should be generated.
+ * The QP r_lock and s_lock should be held and interrupts disabled.
+ * If we are already in error state, just return.
+ */
+int qib_error_qp(struct qib_qp *qp, enum ib_wc_status err)
+{
+ struct qib_ibdev *dev = to_idev(qp->ibqp.device);
+ struct ib_wc wc;
+ int ret = 0;
+
+ if (qp->state == IB_QPS_ERR || qp->state == IB_QPS_RESET)
+ goto bail;
+
+ qp->state = IB_QPS_ERR;
+
+ if (qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR)) {
+ qp->s_flags &= ~(QIB_S_TIMER | QIB_S_WAIT_RNR);
+ del_timer(&qp->s_timer);
+ }
+
+ if (qp->s_flags & QIB_S_ANY_WAIT_SEND)
+ qp->s_flags &= ~QIB_S_ANY_WAIT_SEND;
+
+ spin_lock(&dev->pending_lock);
+ if (!list_empty(&qp->iowait) && !(qp->s_flags & QIB_S_BUSY)) {
+ qp->s_flags &= ~QIB_S_ANY_WAIT_IO;
+ list_del_init(&qp->iowait);
+ }
+ spin_unlock(&dev->pending_lock);
+
+ if (!(qp->s_flags & QIB_S_BUSY)) {
+ qp->s_hdrwords = 0;
+ if (qp->s_rdma_mr) {
+ qib_put_mr(qp->s_rdma_mr);
+ qp->s_rdma_mr = NULL;
+ }
+ if (qp->s_tx) {
+ qib_put_txreq(qp->s_tx);
+ qp->s_tx = NULL;
+ }
+ }
+
+ /* Schedule the sending tasklet to drain the send work queue. */
+ if (qp->s_last != qp->s_head)
+ qib_schedule_send(qp);
+
+ clear_mr_refs(qp, 0);
+
+ memset(&wc, 0, sizeof(wc));
+ wc.qp = &qp->ibqp;
+ wc.opcode = IB_WC_RECV;
+
+ if (test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags)) {
+ wc.wr_id = qp->r_wr_id;
+ wc.status = err;
+ qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+ }
+ wc.status = IB_WC_WR_FLUSH_ERR;
+
+ if (qp->r_rq.wq) {
+ struct qib_rwq *wq;
+ u32 head;
+ u32 tail;
+
+ spin_lock(&qp->r_rq.lock);
+
+ /* sanity check pointers before trusting them */
+ wq = qp->r_rq.wq;
+ head = wq->head;
+ if (head >= qp->r_rq.size)
+ head = 0;
+ tail = wq->tail;
+ if (tail >= qp->r_rq.size)
+ tail = 0;
+ while (tail != head) {
+ wc.wr_id = get_rwqe_ptr(&qp->r_rq, tail)->wr_id;
+ if (++tail >= qp->r_rq.size)
+ tail = 0;
+ qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+ }
+ wq->tail = tail;
+
+ spin_unlock(&qp->r_rq.lock);
+ } else if (qp->ibqp.event_handler)
+ ret = 1;
+
+bail:
+ return ret;
+}
+
+/**
+ * qib_modify_qp - modify the attributes of a queue pair
+ * @ibqp: the queue pair whose attributes we're modifying
+ * @attr: the new attributes
+ * @attr_mask: the mask of attributes to modify
+ * @udata: user data for libibverbs.so
+ *
+ * Returns 0 on success, otherwise returns an errno.
+ */
+int qib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ struct qib_ibdev *dev = to_idev(ibqp->device);
+ struct qib_qp *qp = to_iqp(ibqp);
+ enum ib_qp_state cur_state, new_state;
+ struct ib_event ev;
+ int lastwqe = 0;
+ int mig = 0;
+ int ret;
+ u32 pmtu = 0; /* for gcc warning only */
+
+ spin_lock_irq(&qp->r_lock);
+ spin_lock(&qp->s_lock);
+
+ cur_state = attr_mask & IB_QP_CUR_STATE ?
+ attr->cur_qp_state : qp->state;
+ new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
+
+ if (!ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type,
+ attr_mask, IB_LINK_LAYER_UNSPECIFIED))
+ goto inval;
+
+ if (attr_mask & IB_QP_AV) {
+ if (attr->ah_attr.dlid >= QIB_MULTICAST_LID_BASE)
+ goto inval;
+ if (qib_check_ah(qp->ibqp.device, &attr->ah_attr))
+ goto inval;
+ }
+
+ if (attr_mask & IB_QP_ALT_PATH) {
+ if (attr->alt_ah_attr.dlid >= QIB_MULTICAST_LID_BASE)
+ goto inval;
+ if (qib_check_ah(qp->ibqp.device, &attr->alt_ah_attr))
+ goto inval;
+ if (attr->alt_pkey_index >= qib_get_npkeys(dd_from_dev(dev)))
+ goto inval;
+ }
+
+ if (attr_mask & IB_QP_PKEY_INDEX)
+ if (attr->pkey_index >= qib_get_npkeys(dd_from_dev(dev)))
+ goto inval;
+
+ if (attr_mask & IB_QP_MIN_RNR_TIMER)
+ if (attr->min_rnr_timer > 31)
+ goto inval;
+
+ if (attr_mask & IB_QP_PORT)
+ if (qp->ibqp.qp_type == IB_QPT_SMI ||
+ qp->ibqp.qp_type == IB_QPT_GSI ||
+ attr->port_num == 0 ||
+ attr->port_num > ibqp->device->phys_port_cnt)
+ goto inval;
+
+ if (attr_mask & IB_QP_DEST_QPN)
+ if (attr->dest_qp_num > QIB_QPN_MASK)
+ goto inval;
+
+ if (attr_mask & IB_QP_RETRY_CNT)
+ if (attr->retry_cnt > 7)
+ goto inval;
+
+ if (attr_mask & IB_QP_RNR_RETRY)
+ if (attr->rnr_retry > 7)
+ goto inval;
+
+ /*
+ * Don't allow invalid path_mtu values. OK to set greater
+ * than the active mtu (or even the max_cap, if we have tuned
+ * that to a small mtu). We'll set qp->path_mtu
+ * to the lesser of requested attribute mtu and active,
+ * for packetizing messages.
+ * Note that the QP port has to be set in INIT and MTU in RTR.
+ */
+ if (attr_mask & IB_QP_PATH_MTU) {
+ struct qib_devdata *dd = dd_from_dev(dev);
+ int mtu, pidx = qp->port_num - 1;
+
+ mtu = ib_mtu_enum_to_int(attr->path_mtu);
+ if (mtu == -1)
+ goto inval;
+ if (mtu > dd->pport[pidx].ibmtu) {
+ switch (dd->pport[pidx].ibmtu) {
+ case 4096:
+ pmtu = IB_MTU_4096;
+ break;
+ case 2048:
+ pmtu = IB_MTU_2048;
+ break;
+ case 1024:
+ pmtu = IB_MTU_1024;
+ break;
+ case 512:
+ pmtu = IB_MTU_512;
+ break;
+ case 256:
+ pmtu = IB_MTU_256;
+ break;
+ default:
+ pmtu = IB_MTU_2048;
+ }
+ } else
+ pmtu = attr->path_mtu;
+ }
+
+ if (attr_mask & IB_QP_PATH_MIG_STATE) {
+ if (attr->path_mig_state == IB_MIG_REARM) {
+ if (qp->s_mig_state == IB_MIG_ARMED)
+ goto inval;
+ if (new_state != IB_QPS_RTS)
+ goto inval;
+ } else if (attr->path_mig_state == IB_MIG_MIGRATED) {
+ if (qp->s_mig_state == IB_MIG_REARM)
+ goto inval;
+ if (new_state != IB_QPS_RTS && new_state != IB_QPS_SQD)
+ goto inval;
+ if (qp->s_mig_state == IB_MIG_ARMED)
+ mig = 1;
+ } else
+ goto inval;
+ }
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+ if (attr->max_dest_rd_atomic > QIB_MAX_RDMA_ATOMIC)
+ goto inval;
+
+ switch (new_state) {
+ case IB_QPS_RESET:
+ if (qp->state != IB_QPS_RESET) {
+ qp->state = IB_QPS_RESET;
+ spin_lock(&dev->pending_lock);
+ if (!list_empty(&qp->iowait))
+ list_del_init(&qp->iowait);
+ spin_unlock(&dev->pending_lock);
+ qp->s_flags &= ~(QIB_S_TIMER | QIB_S_ANY_WAIT);
+ spin_unlock(&qp->s_lock);
+ spin_unlock_irq(&qp->r_lock);
+ /* Stop the sending work queue and retry timer */
+ cancel_work_sync(&qp->s_work);
+ del_timer_sync(&qp->s_timer);
+ wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy));
+ if (qp->s_tx) {
+ qib_put_txreq(qp->s_tx);
+ qp->s_tx = NULL;
+ }
+ remove_qp(dev, qp);
+ wait_event(qp->wait, !atomic_read(&qp->refcount));
+ spin_lock_irq(&qp->r_lock);
+ spin_lock(&qp->s_lock);
+ clear_mr_refs(qp, 1);
+ qib_reset_qp(qp, ibqp->qp_type);
+ }
+ break;
+
+ case IB_QPS_RTR:
+ /* Allow event to retrigger if QP set to RTR more than once */
+ qp->r_flags &= ~QIB_R_COMM_EST;
+ qp->state = new_state;
+ break;
+
+ case IB_QPS_SQD:
+ qp->s_draining = qp->s_last != qp->s_cur;
+ qp->state = new_state;
+ break;
+
+ case IB_QPS_SQE:
+ if (qp->ibqp.qp_type == IB_QPT_RC)
+ goto inval;
+ qp->state = new_state;
+ break;
+
+ case IB_QPS_ERR:
+ lastwqe = qib_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ break;
+
+ default:
+ qp->state = new_state;
+ break;
+ }
+
+ if (attr_mask & IB_QP_PKEY_INDEX)
+ qp->s_pkey_index = attr->pkey_index;
+
+ if (attr_mask & IB_QP_PORT)
+ qp->port_num = attr->port_num;
+
+ if (attr_mask & IB_QP_DEST_QPN)
+ qp->remote_qpn = attr->dest_qp_num;
+
+ if (attr_mask & IB_QP_SQ_PSN) {
+ qp->s_next_psn = attr->sq_psn & QIB_PSN_MASK;
+ qp->s_psn = qp->s_next_psn;
+ qp->s_sending_psn = qp->s_next_psn;
+ qp->s_last_psn = qp->s_next_psn - 1;
+ qp->s_sending_hpsn = qp->s_last_psn;
+ }
+
+ if (attr_mask & IB_QP_RQ_PSN)
+ qp->r_psn = attr->rq_psn & QIB_PSN_MASK;
+
+ if (attr_mask & IB_QP_ACCESS_FLAGS)
+ qp->qp_access_flags = attr->qp_access_flags;
+
+ if (attr_mask & IB_QP_AV) {
+ qp->remote_ah_attr = attr->ah_attr;
+ qp->s_srate = attr->ah_attr.static_rate;
+ }
+
+ if (attr_mask & IB_QP_ALT_PATH) {
+ qp->alt_ah_attr = attr->alt_ah_attr;
+ qp->s_alt_pkey_index = attr->alt_pkey_index;
+ }
+
+ if (attr_mask & IB_QP_PATH_MIG_STATE) {
+ qp->s_mig_state = attr->path_mig_state;
+ if (mig) {
+ qp->remote_ah_attr = qp->alt_ah_attr;
+ qp->port_num = qp->alt_ah_attr.port_num;
+ qp->s_pkey_index = qp->s_alt_pkey_index;
+ }
+ }
+
+ if (attr_mask & IB_QP_PATH_MTU) {
+ qp->path_mtu = pmtu;
+ qp->pmtu = ib_mtu_enum_to_int(pmtu);
+ }
+
+ if (attr_mask & IB_QP_RETRY_CNT) {
+ qp->s_retry_cnt = attr->retry_cnt;
+ qp->s_retry = attr->retry_cnt;
+ }
+
+ if (attr_mask & IB_QP_RNR_RETRY) {
+ qp->s_rnr_retry_cnt = attr->rnr_retry;
+ qp->s_rnr_retry = attr->rnr_retry;
+ }
+
+ if (attr_mask & IB_QP_MIN_RNR_TIMER)
+ qp->r_min_rnr_timer = attr->min_rnr_timer;
+
+ if (attr_mask & IB_QP_TIMEOUT) {
+ qp->timeout = attr->timeout;
+ qp->timeout_jiffies =
+ usecs_to_jiffies((4096UL * (1UL << qp->timeout)) /
+ 1000UL);
+ }
+
+ if (attr_mask & IB_QP_QKEY)
+ qp->qkey = attr->qkey;
+
+ if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
+ qp->r_max_rd_atomic = attr->max_dest_rd_atomic;
+
+ if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC)
+ qp->s_max_rd_atomic = attr->max_rd_atomic;
+
+ spin_unlock(&qp->s_lock);
+ spin_unlock_irq(&qp->r_lock);
+
+ if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
+ insert_qp(dev, qp);
+
+ if (lastwqe) {
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+ qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+ }
+ if (mig) {
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_PATH_MIG;
+ qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+ }
+ ret = 0;
+ goto bail;
+
+inval:
+ spin_unlock(&qp->s_lock);
+ spin_unlock_irq(&qp->r_lock);
+ ret = -EINVAL;
+
+bail:
+ return ret;
+}
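
The IB_QP_TIMEOUT block above applies the InfiniBand Local ACK Timeout formula, 4.096 usec * 2^timeout, before handing the result to usecs_to_jiffies(). A minimal standalone sketch of that arithmetic for a few representative timeout codes (plain userspace C, not driver code):

#include <stdio.h>

int main(void)
{
	unsigned int t;

	for (t = 0; t <= 28; t += 7) {
		/* same arithmetic as qp->timeout_jiffies above, before
		 * the usecs_to_jiffies() conversion */
		unsigned long long usec = (4096ULL * (1ULL << t)) / 1000ULL;

		printf("timeout=%2u -> %llu usec (~%llu ms)\n",
		       t, usec, usec / 1000);
	}
	return 0;
}
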
+
+int qib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_qp_init_attr *init_attr)
+{
+ struct qib_qp *qp = to_iqp(ibqp);
+
+ attr->qp_state = qp->state;
+ attr->cur_qp_state = attr->qp_state;
+ attr->path_mtu = qp->path_mtu;
+ attr->path_mig_state = qp->s_mig_state;
+ attr->qkey = qp->qkey;
+ attr->rq_psn = qp->r_psn & QIB_PSN_MASK;
+ attr->sq_psn = qp->s_next_psn & QIB_PSN_MASK;
+ attr->dest_qp_num = qp->remote_qpn;
+ attr->qp_access_flags = qp->qp_access_flags;
+ attr->cap.max_send_wr = qp->s_size - 1;
+ attr->cap.max_recv_wr = qp->ibqp.srq ? 0 : qp->r_rq.size - 1;
+ attr->cap.max_send_sge = qp->s_max_sge;
+ attr->cap.max_recv_sge = qp->r_rq.max_sge;
+ attr->cap.max_inline_data = 0;
+ attr->ah_attr = qp->remote_ah_attr;
+ attr->alt_ah_attr = qp->alt_ah_attr;
+ attr->pkey_index = qp->s_pkey_index;
+ attr->alt_pkey_index = qp->s_alt_pkey_index;
+ attr->en_sqd_async_notify = 0;
+ attr->sq_draining = qp->s_draining;
+ attr->max_rd_atomic = qp->s_max_rd_atomic;
+ attr->max_dest_rd_atomic = qp->r_max_rd_atomic;
+ attr->min_rnr_timer = qp->r_min_rnr_timer;
+ attr->port_num = qp->port_num;
+ attr->timeout = qp->timeout;
+ attr->retry_cnt = qp->s_retry_cnt;
+ attr->rnr_retry = qp->s_rnr_retry_cnt;
+ attr->alt_port_num = qp->alt_ah_attr.port_num;
+ attr->alt_timeout = qp->alt_timeout;
+
+ init_attr->event_handler = qp->ibqp.event_handler;
+ init_attr->qp_context = qp->ibqp.qp_context;
+ init_attr->send_cq = qp->ibqp.send_cq;
+ init_attr->recv_cq = qp->ibqp.recv_cq;
+ init_attr->srq = qp->ibqp.srq;
+ init_attr->cap = attr->cap;
+ if (qp->s_flags & QIB_S_SIGNAL_REQ_WR)
+ init_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+ else
+ init_attr->sq_sig_type = IB_SIGNAL_ALL_WR;
+ init_attr->qp_type = qp->ibqp.qp_type;
+ init_attr->port_num = qp->port_num;
+ return 0;
+}
+
+/**
+ * qib_compute_aeth - compute the AETH (syndrome + MSN)
+ * @qp: the queue pair to compute the AETH for
+ *
+ * Returns the AETH.
+ */
+__be32 qib_compute_aeth(struct qib_qp *qp)
+{
+ u32 aeth = qp->r_msn & QIB_MSN_MASK;
+
+ if (qp->ibqp.srq) {
+ /*
+ * Shared receive queues don't generate credits.
+ * Set the credit field to the invalid value.
+ */
+ aeth |= QIB_AETH_CREDIT_INVAL << QIB_AETH_CREDIT_SHIFT;
+ } else {
+ u32 min, max, x;
+ u32 credits;
+ struct qib_rwq *wq = qp->r_rq.wq;
+ u32 head;
+ u32 tail;
+
+ /* sanity check pointers before trusting them */
+ head = wq->head;
+ if (head >= qp->r_rq.size)
+ head = 0;
+ tail = wq->tail;
+ if (tail >= qp->r_rq.size)
+ tail = 0;
+ /*
+ * Compute the number of credits available (RWQEs).
+ * XXX Not holding the r_rq.lock here so there is a small
+ * chance that the pair of reads are not atomic.
+ */
+ credits = head - tail;
+ if ((int)credits < 0)
+ credits += qp->r_rq.size;
+ /*
+ * Binary search the credit table to find the code to
+ * use.
+ */
+ min = 0;
+ max = 31;
+ for (;;) {
+ x = (min + max) / 2;
+ if (credit_table[x] == credits)
+ break;
+ if (credit_table[x] > credits)
+ max = x;
+ else if (min == x)
+ break;
+ else
+ min = x;
+ }
+ aeth |= x << QIB_AETH_CREDIT_SHIFT;
+ }
+ return cpu_to_be32(aeth);
+}
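
qib_compute_aeth() packs the low-order MSN bits with a 5-bit credit code obtained by binary-searching credit_table[], which is defined elsewhere in qib_qp.c and not visible in this hunk. The sketch below mirrors the encode step in plain C; the table contents and the AETH_CREDIT_SHIFT/MSN_MASK values are illustrative assumptions standing in for the driver's QIB_* definitions, not the real ones.

#include <stdint.h>
#include <stdio.h>

#define AETH_CREDIT_SHIFT 24	/* assumed stand-in for QIB_AETH_CREDIT_SHIFT */
#define MSN_MASK 0xFFFFFFu	/* assumed stand-in for QIB_MSN_MASK */

/* hypothetical monotonic table mapping a 5-bit code to an RWQE count */
static const uint32_t credit_table[32] = {
	0, 1, 2, 3, 4, 6, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192,
	256, 384, 512, 768, 1024, 1536, 2048, 3072, 4096, 6144,
	8192, 12288, 16384, 24576, 32768, 49152
};

static uint32_t encode_aeth(uint32_t msn, uint32_t head, uint32_t tail,
			    uint32_t size)
{
	uint32_t credits = head - tail;
	uint32_t min = 0, max = 31, x;

	if ((int32_t)credits < 0)
		credits += size;	/* receive queue wraps around */

	/* same binary-search shape as qib_compute_aeth() above */
	for (;;) {
		x = (min + max) / 2;
		if (credit_table[x] == credits)
			break;
		if (credit_table[x] > credits)
			max = x;
		else if (min == x)
			break;
		else
			min = x;
	}
	return (msn & MSN_MASK) | (x << AETH_CREDIT_SHIFT);
}

int main(void)
{
	/* 100 free RWQEs encode to the largest code whose table entry <= 100 */
	printf("aeth = 0x%08x\n",
	       (unsigned int)encode_aeth(5, 120, 20, 256));
	return 0;
}
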
+
+/**
+ * qib_create_qp - create a queue pair for a device
+ * @ibpd: the protection domain whose device we create the queue pair for
+ * @init_attr: the attributes of the queue pair
+ * @udata: user data for libibverbs.so
+ *
+ * Returns the queue pair on success, otherwise returns an errno.
+ *
+ * Called by the ib_create_qp() core verbs function.
+ */
+struct ib_qp *qib_create_qp(struct ib_pd *ibpd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ struct qib_qp *qp;
+ int err;
+ struct qib_swqe *swq = NULL;
+ struct qib_ibdev *dev;
+ struct qib_devdata *dd;
+ size_t sz;
+ size_t sg_list_sz;
+ struct ib_qp *ret;
+
+ if (init_attr->cap.max_send_sge > ib_qib_max_sges ||
+ init_attr->cap.max_send_wr > ib_qib_max_qp_wrs ||
+ init_attr->create_flags) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+
+ /* Check receive queue parameters if no SRQ is specified. */
+ if (!init_attr->srq) {
+ if (init_attr->cap.max_recv_sge > ib_qib_max_sges ||
+ init_attr->cap.max_recv_wr > ib_qib_max_qp_wrs) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+ if (init_attr->cap.max_send_sge +
+ init_attr->cap.max_send_wr +
+ init_attr->cap.max_recv_sge +
+ init_attr->cap.max_recv_wr == 0) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+ }
+
+ switch (init_attr->qp_type) {
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ if (init_attr->port_num == 0 ||
+ init_attr->port_num > ibpd->device->phys_port_cnt) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
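+ /* FALLTHROUGH */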
+ case IB_QPT_UC:
+ case IB_QPT_RC:
+ case IB_QPT_UD:
+ sz = sizeof(struct qib_sge) *
+ init_attr->cap.max_send_sge +
+ sizeof(struct qib_swqe);
+ swq = vmalloc((init_attr->cap.max_send_wr + 1) * sz);
+ if (swq == NULL) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+ sz = sizeof(*qp);
+ sg_list_sz = 0;
+ if (init_attr->srq) {
+ struct qib_srq *srq = to_isrq(init_attr->srq);
+
+ if (srq->rq.max_sge > 1)
+ sg_list_sz = sizeof(*qp->r_sg_list) *
+ (srq->rq.max_sge - 1);
+ } else if (init_attr->cap.max_recv_sge > 1)
+ sg_list_sz = sizeof(*qp->r_sg_list) *
+ (init_attr->cap.max_recv_sge - 1);
+ qp = kzalloc(sz + sg_list_sz, GFP_KERNEL);
+ if (!qp) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_swq;
+ }
+ RCU_INIT_POINTER(qp->next, NULL);
+ qp->s_hdr = kzalloc(sizeof(*qp->s_hdr), GFP_KERNEL);
+ if (!qp->s_hdr) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_qp;
+ }
+ qp->timeout_jiffies =
+ usecs_to_jiffies((4096UL * (1UL << qp->timeout)) /
+ 1000UL);
+ if (init_attr->srq)
+ sz = 0;
+ else {
+ qp->r_rq.size = init_attr->cap.max_recv_wr + 1;
+ qp->r_rq.max_sge = init_attr->cap.max_recv_sge;
+ sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) +
+ sizeof(struct qib_rwqe);
+ qp->r_rq.wq = vmalloc_user(sizeof(struct qib_rwq) +
+ qp->r_rq.size * sz);
+ if (!qp->r_rq.wq) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_qp;
+ }
+ }
+
+ /*
+ * ib_create_qp() will initialize qp->ibqp
+ * except for qp->ibqp.qp_num.
+ */
+ spin_lock_init(&qp->r_lock);
+ spin_lock_init(&qp->s_lock);
+ spin_lock_init(&qp->r_rq.lock);
+ atomic_set(&qp->refcount, 0);
+ init_waitqueue_head(&qp->wait);
+ init_waitqueue_head(&qp->wait_dma);
+ init_timer(&qp->s_timer);
+ qp->s_timer.data = (unsigned long)qp;
+ INIT_WORK(&qp->s_work, qib_do_send);
+ INIT_LIST_HEAD(&qp->iowait);
+ INIT_LIST_HEAD(&qp->rspwait);
+ qp->state = IB_QPS_RESET;
+ qp->s_wq = swq;
+ qp->s_size = init_attr->cap.max_send_wr + 1;
+ qp->s_max_sge = init_attr->cap.max_send_sge;
+ if (init_attr->sq_sig_type == IB_SIGNAL_REQ_WR)
+ qp->s_flags = QIB_S_SIGNAL_REQ_WR;
+ dev = to_idev(ibpd->device);
+ dd = dd_from_dev(dev);
+ err = alloc_qpn(dd, &dev->qpn_table, init_attr->qp_type,
+ init_attr->port_num);
+ if (err < 0) {
+ ret = ERR_PTR(err);
+ vfree(qp->r_rq.wq);
+ goto bail_qp;
+ }
+ qp->ibqp.qp_num = err;
+ qp->port_num = init_attr->port_num;
+ qib_reset_qp(qp, init_attr->qp_type);
+ break;
+
+ default:
+ /* Don't support raw QPs */
+ ret = ERR_PTR(-ENOSYS);
+ goto bail;
+ }
+
+ init_attr->cap.max_inline_data = 0;
+
+ /*
+ * Return the address of the RWQ as the offset to mmap.
+ * See qib_mmap() for details.
+ */
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ if (!qp->r_rq.wq) {
+ __u64 offset = 0;
+
+ err = ib_copy_to_udata(udata, &offset,
+ sizeof(offset));
+ if (err) {
+ ret = ERR_PTR(err);
+ goto bail_ip;
+ }
+ } else {
+ u32 s = sizeof(struct qib_rwq) + qp->r_rq.size * sz;
+
+ qp->ip = qib_create_mmap_info(dev, s,
+ ibpd->uobject->context,
+ qp->r_rq.wq);
+ if (!qp->ip) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_ip;
+ }
+
+ err = ib_copy_to_udata(udata, &(qp->ip->offset),
+ sizeof(qp->ip->offset));
+ if (err) {
+ ret = ERR_PTR(err);
+ goto bail_ip;
+ }
+ }
+ }
+
+ spin_lock(&dev->n_qps_lock);
+ if (dev->n_qps_allocated == ib_qib_max_qps) {
+ spin_unlock(&dev->n_qps_lock);
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_ip;
+ }
+
+ dev->n_qps_allocated++;
+ spin_unlock(&dev->n_qps_lock);
+
+ if (qp->ip) {
+ spin_lock_irq(&dev->pending_lock);
+ list_add(&qp->ip->pending_mmaps, &dev->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+ }
+
+ ret = &qp->ibqp;
+ goto bail;
+
+bail_ip:
+ if (qp->ip)
+ kref_put(&qp->ip->ref, qib_release_mmap_info);
+ else
+ vfree(qp->r_rq.wq);
+ free_qpn(&dev->qpn_table, qp->ibqp.qp_num);
+bail_qp:
+ kfree(qp->s_hdr);
+ kfree(qp);
+bail_swq:
+ vfree(swq);
+bail:
+ return ret;
+}
+
+/**
+ * qib_destroy_qp - destroy a queue pair
+ * @ibqp: the queue pair to destroy
+ *
+ * Returns 0 on success.
+ *
+ * Note that this can be called while the QP is actively sending or
+ * receiving!
+ */
+int qib_destroy_qp(struct ib_qp *ibqp)
+{
+ struct qib_qp *qp = to_iqp(ibqp);
+ struct qib_ibdev *dev = to_idev(ibqp->device);
+
+ /* Make sure HW and driver activity is stopped. */
+ spin_lock_irq(&qp->s_lock);
+ if (qp->state != IB_QPS_RESET) {
+ qp->state = IB_QPS_RESET;
+ spin_lock(&dev->pending_lock);
+ if (!list_empty(&qp->iowait))
+ list_del_init(&qp->iowait);
+ spin_unlock(&dev->pending_lock);
+ qp->s_flags &= ~(QIB_S_TIMER | QIB_S_ANY_WAIT);
+ spin_unlock_irq(&qp->s_lock);
+ cancel_work_sync(&qp->s_work);
+ del_timer_sync(&qp->s_timer);
+ wait_event(qp->wait_dma, !atomic_read(&qp->s_dma_busy));
+ if (qp->s_tx) {
+ qib_put_txreq(qp->s_tx);
+ qp->s_tx = NULL;
+ }
+ remove_qp(dev, qp);
+ wait_event(qp->wait, !atomic_read(&qp->refcount));
+ clear_mr_refs(qp, 1);
+ } else
+ spin_unlock_irq(&qp->s_lock);
+
+ /* all users of the QP are cleaned up, so mark its QPN available again */
+ free_qpn(&dev->qpn_table, qp->ibqp.qp_num);
+ spin_lock(&dev->n_qps_lock);
+ dev->n_qps_allocated--;
+ spin_unlock(&dev->n_qps_lock);
+
+ if (qp->ip)
+ kref_put(&qp->ip->ref, qib_release_mmap_info);
+ else
+ vfree(qp->r_rq.wq);
+ vfree(qp->s_wq);
+ kfree(qp->s_hdr);
+ kfree(qp);
+ return 0;
+}
+
+/**
+ * qib_init_qpn_table - initialize the QP number table for a device
+ * @dd: the qlogic_ib device
+ * @qpt: the QPN table
+ */
+void qib_init_qpn_table(struct qib_devdata *dd, struct qib_qpn_table *qpt)
+{
+ spin_lock_init(&qpt->lock);
+ qpt->last = 1; /* start with QPN 2 */
+ qpt->nmaps = 1;
+ qpt->mask = dd->qpn_mask;
+}
+
+/**
+ * qib_free_qpn_table - free the QP number table for a device
+ * @qpt: the QPN table
+ */
+void qib_free_qpn_table(struct qib_qpn_table *qpt)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(qpt->map); i++)
+ if (qpt->map[i].page)
+ free_page((unsigned long) qpt->map[i].page);
+}
+
+/**
+ * qib_get_credit - handle a credit update in an incoming AETH
+ * @qp: the QP whose send work queue may be restarted
+ * @aeth: the Acknowledge Extended Transport Header
+ *
+ * The QP s_lock should be held.
+ */
+void qib_get_credit(struct qib_qp *qp, u32 aeth)
+{
+ u32 credit = (aeth >> QIB_AETH_CREDIT_SHIFT) & QIB_AETH_CREDIT_MASK;
+
+ /*
+ * If the credit is invalid, we can send
+ * as many packets as we like. Otherwise, we have to
+ * honor the credit field.
+ */
+ if (credit == QIB_AETH_CREDIT_INVAL) {
+ if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT)) {
+ qp->s_flags |= QIB_S_UNLIMITED_CREDIT;
+ if (qp->s_flags & QIB_S_WAIT_SSN_CREDIT) {
+ qp->s_flags &= ~QIB_S_WAIT_SSN_CREDIT;
+ qib_schedule_send(qp);
+ }
+ }
+ } else if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT)) {
+ /* Compute new LSN (i.e., MSN + credit) */
+ credit = (aeth + credit_table[credit]) & QIB_MSN_MASK;
+ if (qib_cmp24(credit, qp->s_lsn) > 0) {
+ qp->s_lsn = credit;
+ if (qp->s_flags & QIB_S_WAIT_SSN_CREDIT) {
+ qp->s_flags &= ~QIB_S_WAIT_SSN_CREDIT;
+ qib_schedule_send(qp);
+ }
+ }
+ }
+}
+
+#ifdef CONFIG_DEBUG_FS
+
+struct qib_qp_iter {
+ struct qib_ibdev *dev;
+ struct qib_qp *qp;
+ int n;
+};
+
+struct qib_qp_iter *qib_qp_iter_init(struct qib_ibdev *dev)
+{
+ struct qib_qp_iter *iter;
+
+ iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return NULL;
+
+ iter->dev = dev;
+ if (qib_qp_iter_next(iter)) {
+ kfree(iter);
+ return NULL;
+ }
+
+ return iter;
+}
+
+int qib_qp_iter_next(struct qib_qp_iter *iter)
+{
+ struct qib_ibdev *dev = iter->dev;
+ int n = iter->n;
+ int ret = 1;
+ struct qib_qp *pqp = iter->qp;
+ struct qib_qp *qp;
+
+ for (; n < dev->qp_table_size; n++) {
+ if (pqp)
+ qp = rcu_dereference(pqp->next);
+ else
+ qp = rcu_dereference(dev->qp_table[n]);
+ pqp = qp;
+ if (qp) {
+ iter->qp = qp;
+ iter->n = n;
+ return 0;
+ }
+ }
+ return ret;
+}
+
+static const char * const qp_type_str[] = {
+ "SMI", "GSI", "RC", "UC", "UD",
+};
+
+void qib_qp_iter_print(struct seq_file *s, struct qib_qp_iter *iter)
+{
+ struct qib_swqe *wqe;
+ struct qib_qp *qp = iter->qp;
+
+ wqe = get_swqe_ptr(qp, qp->s_last);
+ seq_printf(s,
+ "N %d QP%u %s %u %u %u f=%x %u %u %u %u %u PSN %x %x %x %x %x (%u %u %u %u %u %u) QP%u LID %x\n",
+ iter->n,
+ qp->ibqp.qp_num,
+ qp_type_str[qp->ibqp.qp_type],
+ qp->state,
+ wqe->wr.opcode,
+ qp->s_hdrwords,
+ qp->s_flags,
+ atomic_read(&qp->s_dma_busy),
+ !list_empty(&qp->iowait),
+ qp->timeout,
+ wqe->ssn,
+ qp->s_lsn,
+ qp->s_last_psn,
+ qp->s_psn, qp->s_next_psn,
+ qp->s_sending_psn, qp->s_sending_hpsn,
+ qp->s_last, qp->s_acked, qp->s_cur,
+ qp->s_tail, qp->s_head, qp->s_size,
+ qp->remote_qpn,
+ qp->remote_ah_attr.dlid);
+}
+
+#endif
diff --git a/drivers/infiniband/hw/qib/qib_qsfp.c b/drivers/infiniband/hw/qib/qib_qsfp.c
new file mode 100644
index 000000000..5e27f7680
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_qsfp.c
@@ -0,0 +1,559 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+
+#include "qib.h"
+#include "qib_qsfp.h"
+
+/*
+ * QSFP support for ib_qib driver, using "Two Wire Serial Interface" driver
+ * in qib_twsi.c
+ */
+#define QSFP_MAX_RETRY 4
+
+static int qsfp_read(struct qib_pportdata *ppd, int addr, void *bp, int len)
+{
+ struct qib_devdata *dd = ppd->dd;
+ u32 out, mask;
+ int ret, cnt, pass = 0;
+ int stuck = 0;
+ u8 *buff = bp;
+
+ ret = mutex_lock_interruptible(&dd->eep_lock);
+ if (ret)
+ goto no_unlock;
+
+ if (dd->twsi_eeprom_dev == QIB_TWSI_NO_DEV) {
+ ret = -ENXIO;
+ goto bail;
+ }
+
+ /*
+ * We presume, if we are called at all, that this board has
+ * QSFP. This is on the same i2c chain as the legacy parts,
+ * but only responds if the module is selected via GPIO pins.
+ * Further, there are very long setup and hold requirements
+ * on MODSEL.
+ */
+ mask = QSFP_GPIO_MOD_SEL_N | QSFP_GPIO_MOD_RST_N | QSFP_GPIO_LP_MODE;
+ out = QSFP_GPIO_MOD_RST_N | QSFP_GPIO_LP_MODE;
+ if (ppd->hw_pidx) {
+ mask <<= QSFP_GPIO_PORT2_SHIFT;
+ out <<= QSFP_GPIO_PORT2_SHIFT;
+ }
+
+ dd->f_gpio_mod(dd, out, mask, mask);
+
+ /*
+ * Module could take up to 2 Msec to respond to MOD_SEL, and there
+ * is no way to tell if it is ready, so we must wait.
+ */
+ msleep(20);
+
+ /* Make sure TWSI bus is in sane state. */
+ ret = qib_twsi_reset(dd);
+ if (ret) {
+ qib_dev_porterr(dd, ppd->port,
+ "QSFP interface Reset for read failed\n");
+ ret = -EIO;
+ stuck = 1;
+ goto deselect;
+ }
+
+ /* All QSFP modules are at A0 */
+
+ cnt = 0;
+ while (cnt < len) {
+ unsigned in_page;
+ int wlen = len - cnt;
+
+ in_page = addr % QSFP_PAGESIZE;
+ if ((in_page + wlen) > QSFP_PAGESIZE)
+ wlen = QSFP_PAGESIZE - in_page;
+ ret = qib_twsi_blk_rd(dd, QSFP_DEV, addr, buff + cnt, wlen);
+ /* Some QSFPs fail the first try; retry up to QSFP_MAX_RETRY times */
+ if (ret && cnt == 0 && ++pass < QSFP_MAX_RETRY)
+ continue;
+ if (ret) {
+ /* qib_twsi_blk_rd() 1 for error, else 0 */
+ ret = -EIO;
+ goto deselect;
+ }
+ addr += wlen;
+ cnt += wlen;
+ }
+ ret = cnt;
+
+deselect:
+ /*
+ * Module could take up to 10 uSec after transfer before
+ * ready to respond to MOD_SEL negation, and there is no way
+ * to tell if it is ready, so we must wait.
+ */
+ udelay(10);
+ /* set QSFP MODSEL, RST, LP all high */
+ dd->f_gpio_mod(dd, mask, mask, mask);
+
+ /*
+ * Module could take up to 2 msec to respond to MOD_SEL
+ * going away, and there is no way to tell if it is ready,
+ * so we must wait.
+ */
+ if (stuck)
+ qib_dev_err(dd, "QSFP interface bus stuck non-idle\n");
+
+ if (pass >= QSFP_MAX_RETRY && ret)
+ qib_dev_porterr(dd, ppd->port, "QSFP failed even retrying\n");
+ else if (pass)
+ qib_dev_porterr(dd, ppd->port, "QSFP retries: %d\n", pass);
+
+ msleep(20);
+
+bail:
+ mutex_unlock(&dd->eep_lock);
+
+no_unlock:
+ return ret;
+}
+
+/*
+ * qib_qsfp_write
+ * We do not ordinarily write the QSFP, but this is needed to select
+ * the page on non-flat QSFPs, and possibly for other unusual cases later.
+ */
+static int qib_qsfp_write(struct qib_pportdata *ppd, int addr, void *bp,
+ int len)
+{
+ struct qib_devdata *dd = ppd->dd;
+ u32 out, mask;
+ int ret, cnt;
+ u8 *buff = bp;
+
+ ret = mutex_lock_interruptible(&dd->eep_lock);
+ if (ret)
+ goto no_unlock;
+
+ if (dd->twsi_eeprom_dev == QIB_TWSI_NO_DEV) {
+ ret = -ENXIO;
+ goto bail;
+ }
+
+ /*
+ * We presume, if we are called at all, that this board has
+ * QSFP. This is on the same i2c chain as the legacy parts,
+ * but only responds if the module is selected via GPIO pins.
+ * Further, there are very long setup and hold requirements
+ * on MODSEL.
+ */
+ mask = QSFP_GPIO_MOD_SEL_N | QSFP_GPIO_MOD_RST_N | QSFP_GPIO_LP_MODE;
+ out = QSFP_GPIO_MOD_RST_N | QSFP_GPIO_LP_MODE;
+ if (ppd->hw_pidx) {
+ mask <<= QSFP_GPIO_PORT2_SHIFT;
+ out <<= QSFP_GPIO_PORT2_SHIFT;
+ }
+ dd->f_gpio_mod(dd, out, mask, mask);
+
+ /*
+ * Module could take up to 2 Msec to respond to MOD_SEL,
+ * and there is no way to tell if it is ready, so we must wait.
+ */
+ msleep(20);
+
+ /* Make sure TWSI bus is in sane state. */
+ ret = qib_twsi_reset(dd);
+ if (ret) {
+ qib_dev_porterr(dd, ppd->port,
+ "QSFP interface Reset for write failed\n");
+ ret = -EIO;
+ goto deselect;
+ }
+
+ /* All QSFP modules are at A0 */
+
+ cnt = 0;
+ while (cnt < len) {
+ unsigned in_page;
+ int wlen = len - cnt;
+
+ in_page = addr % QSFP_PAGESIZE;
+ if ((in_page + wlen) > QSFP_PAGESIZE)
+ wlen = QSFP_PAGESIZE - in_page;
+ ret = qib_twsi_blk_wr(dd, QSFP_DEV, addr, buff + cnt, wlen);
+ if (ret) {
+ /* qib_twsi_blk_wr() 1 for error, else 0 */
+ ret = -EIO;
+ goto deselect;
+ }
+ addr += wlen;
+ cnt += wlen;
+ }
+ ret = cnt;
+
+deselect:
+ /*
+ * Module could take up to 10 uSec after transfer before
+ * ready to respond to MOD_SEL negation, and there is no way
+ * to tell if it is ready, so we must wait.
+ */
+ udelay(10);
+ /* set QSFP MODSEL, RST, LP high */
+ dd->f_gpio_mod(dd, mask, mask, mask);
+ /*
+ * Module could take up to 2 msec to respond to MOD_SEL
+ * going away, and there is no way to tell if it is ready,
+ * so we must wait.
+ */
+ msleep(20);
+
+bail:
+ mutex_unlock(&dd->eep_lock);
+
+no_unlock:
+ return ret;
+}
+
+/*
+ * For validation, we want to check the checksums, even of the
+ * fields we do not otherwise use. This function reads the bytes from
+ * <first> to <next-1> and returns the 8 LSBs of the sum, or < 0 on error.
+ */
+static int qsfp_cks(struct qib_pportdata *ppd, int first, int next)
+{
+ int ret;
+ u16 cks;
+ u8 bval;
+
+ cks = 0;
+ while (first < next) {
+ ret = qsfp_read(ppd, first, &bval, 1);
+ if (ret < 0)
+ goto bail;
+ cks += bval;
+ ++first;
+ }
+ ret = cks & 0xFF;
+bail:
+ return ret;
+}
+
+int qib_refresh_qsfp_cache(struct qib_pportdata *ppd, struct qib_qsfp_cache *cp)
+{
+ int ret;
+ int idx;
+ u16 cks;
+ u8 peek[4];
+
+ /* ensure sane contents on invalid reads, for cable swaps */
+ memset(cp, 0, sizeof(*cp));
+
+ if (!qib_qsfp_mod_present(ppd)) {
+ ret = -ENODEV;
+ goto bail;
+ }
+
+ ret = qsfp_read(ppd, 0, peek, 3);
+ if (ret < 0)
+ goto bail;
+ if ((peek[0] & 0xFE) != 0x0C)
+ qib_dev_porterr(ppd->dd, ppd->port,
+ "QSFP byte0 is 0x%02X, S/B 0x0C/D\n", peek[0]);
+
+ if ((peek[2] & 2) == 0) {
+ /*
+ * If cable is paged, rather than "flat memory", we need to
+ * set the page to zero, even if it already appears to be zero.
+ */
+ u8 poke = 0;
+
+ ret = qib_qsfp_write(ppd, 127, &poke, 1);
+ udelay(50);
+ if (ret != 1) {
+ qib_dev_porterr(ppd->dd, ppd->port,
+ "Failed QSFP Page set\n");
+ goto bail;
+ }
+ }
+
+ ret = qsfp_read(ppd, QSFP_MOD_ID_OFFS, &cp->id, 1);
+ if (ret < 0)
+ goto bail;
+ if ((cp->id & 0xFE) != 0x0C)
+ qib_dev_porterr(ppd->dd, ppd->port,
+ "QSFP ID byte is 0x%02X, S/B 0x0C/D\n", cp->id);
+ cks = cp->id;
+
+ ret = qsfp_read(ppd, QSFP_MOD_PWR_OFFS, &cp->pwr, 1);
+ if (ret < 0)
+ goto bail;
+ cks += cp->pwr;
+
+ ret = qsfp_cks(ppd, QSFP_MOD_PWR_OFFS + 1, QSFP_MOD_LEN_OFFS);
+ if (ret < 0)
+ goto bail;
+ cks += ret;
+
+ ret = qsfp_read(ppd, QSFP_MOD_LEN_OFFS, &cp->len, 1);
+ if (ret < 0)
+ goto bail;
+ cks += cp->len;
+
+ ret = qsfp_read(ppd, QSFP_MOD_TECH_OFFS, &cp->tech, 1);
+ if (ret < 0)
+ goto bail;
+ cks += cp->tech;
+
+ ret = qsfp_read(ppd, QSFP_VEND_OFFS, &cp->vendor, QSFP_VEND_LEN);
+ if (ret < 0)
+ goto bail;
+ for (idx = 0; idx < QSFP_VEND_LEN; ++idx)
+ cks += cp->vendor[idx];
+
+ ret = qsfp_read(ppd, QSFP_IBXCV_OFFS, &cp->xt_xcv, 1);
+ if (ret < 0)
+ goto bail;
+ cks += cp->xt_xcv;
+
+ ret = qsfp_read(ppd, QSFP_VOUI_OFFS, &cp->oui, QSFP_VOUI_LEN);
+ if (ret < 0)
+ goto bail;
+ for (idx = 0; idx < QSFP_VOUI_LEN; ++idx)
+ cks += cp->oui[idx];
+
+ ret = qsfp_read(ppd, QSFP_PN_OFFS, &cp->partnum, QSFP_PN_LEN);
+ if (ret < 0)
+ goto bail;
+ for (idx = 0; idx < QSFP_PN_LEN; ++idx)
+ cks += cp->partnum[idx];
+
+ ret = qsfp_read(ppd, QSFP_REV_OFFS, &cp->rev, QSFP_REV_LEN);
+ if (ret < 0)
+ goto bail;
+ for (idx = 0; idx < QSFP_REV_LEN; ++idx)
+ cks += cp->rev[idx];
+
+ ret = qsfp_read(ppd, QSFP_ATTEN_OFFS, &cp->atten, QSFP_ATTEN_LEN);
+ if (ret < 0)
+ goto bail;
+ for (idx = 0; idx < QSFP_ATTEN_LEN; ++idx)
+ cks += cp->atten[idx];
+
+ ret = qsfp_cks(ppd, QSFP_ATTEN_OFFS + QSFP_ATTEN_LEN, QSFP_CC_OFFS);
+ if (ret < 0)
+ goto bail;
+ cks += ret;
+
+ cks &= 0xFF;
+ ret = qsfp_read(ppd, QSFP_CC_OFFS, &cp->cks1, 1);
+ if (ret < 0)
+ goto bail;
+ if (cks != cp->cks1)
+ qib_dev_porterr(ppd->dd, ppd->port,
+ "QSFP cks1 is %02X, computed %02X\n", cp->cks1,
+ cks);
+
+ /* Second checksum covers bytes 192..222 (serial, date, lot) */
+ ret = qsfp_cks(ppd, QSFP_CC_OFFS + 1, QSFP_SN_OFFS);
+ if (ret < 0)
+ goto bail;
+ cks = ret;
+
+ ret = qsfp_read(ppd, QSFP_SN_OFFS, &cp->serial, QSFP_SN_LEN);
+ if (ret < 0)
+ goto bail;
+ for (idx = 0; idx < QSFP_SN_LEN; ++idx)
+ cks += cp->serial[idx];
+
+ ret = qsfp_read(ppd, QSFP_DATE_OFFS, &cp->date, QSFP_DATE_LEN);
+ if (ret < 0)
+ goto bail;
+ for (idx = 0; idx < QSFP_DATE_LEN; ++idx)
+ cks += cp->date[idx];
+
+ ret = qsfp_read(ppd, QSFP_LOT_OFFS, &cp->lot, QSFP_LOT_LEN);
+ if (ret < 0)
+ goto bail;
+ for (idx = 0; idx < QSFP_LOT_LEN; ++idx)
+ cks += cp->lot[idx];
+
+ ret = qsfp_cks(ppd, QSFP_LOT_OFFS + QSFP_LOT_LEN, QSFP_CC_EXT_OFFS);
+ if (ret < 0)
+ goto bail;
+ cks += ret;
+
+ ret = qsfp_read(ppd, QSFP_CC_EXT_OFFS, &cp->cks2, 1);
+ if (ret < 0)
+ goto bail;
+ cks &= 0xFF;
+ if (cks != cp->cks2)
+ qib_dev_porterr(ppd->dd, ppd->port,
+ "QSFP cks2 is %02X, computed %02X\n", cp->cks2,
+ cks);
+ return 0;
+
+bail:
+ cp->id = 0;
+ return ret;
+}
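
The reads above enforce the two checksum rules documented in qib_qsfp.h: byte 191 (CC) holds the low 8 bits of the sum of bytes 128..190, and byte 223 (CC_EXT) the low 8 bits of the sum of bytes 192..222. A small self-contained sketch of the same check over a raw copy of the upper page; the flat 'page' buffer layout is an assumption of the sketch, not the driver's interface:

#include <stdint.h>

/* 'page' holds module bytes 128..255; returns 1 if both checksums match */
static int qsfp_cc_ok(const uint8_t *page)
{
	unsigned int sum = 0;
	int i;

	for (i = 128; i <= 190; i++)		/* covered by CC (byte 191) */
		sum += page[i - 128];
	if ((sum & 0xFF) != page[191 - 128])
		return 0;

	sum = 0;
	for (i = 192; i <= 222; i++)		/* covered by CC_EXT (byte 223) */
		sum += page[i - 128];
	return (sum & 0xFF) == page[223 - 128];
}

int main(void)
{
	uint8_t page[128] = { 0 };	/* an all-zero page trivially passes */

	return qsfp_cc_ok(page) ? 0 : 1;
}
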
+
+const char * const qib_qsfp_devtech[16] = {
+ "850nm VCSEL", "1310nm VCSEL", "1550nm VCSEL", "1310nm FP",
+ "1310nm DFB", "1550nm DFB", "1310nm EML", "1550nm EML",
+ "Cu Misc", "1490nm DFB", "Cu NoEq", "Cu Eq",
+ "Undef", "Cu Active BothEq", "Cu FarEq", "Cu NearEq"
+};
+
+#define QSFP_DUMP_CHUNK 16 /* Holds longest string */
+#define QSFP_DEFAULT_HDR_CNT 224
+
+static const char *pwr_codes = "1.5W2.0W2.5W3.5W";
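+/*
+ * QSFP_PWR() selects one of the four 4-byte slices above; qib_qsfp_dump()
+ * prints three characters of the slice plus a trailing 'W'.
+ */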
+
+int qib_qsfp_mod_present(struct qib_pportdata *ppd)
+{
+ u32 mask;
+ int ret;
+
+ mask = QSFP_GPIO_MOD_PRS_N <<
+ (ppd->hw_pidx * QSFP_GPIO_PORT2_SHIFT);
+ ret = ppd->dd->f_gpio_mod(ppd->dd, 0, 0, 0);
+
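+ /* MOD_PRS_N is active low; the shift isolates its GPIO bit for this port */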
+ return !((ret & mask) >>
+ ((ppd->hw_pidx * QSFP_GPIO_PORT2_SHIFT) + 3));
+}
+
+/*
+ * Initialize structures that control access to QSFP. Called once per port
+ * on cards that support QSFP.
+ */
+void qib_qsfp_init(struct qib_qsfp_data *qd,
+ void (*fevent)(struct work_struct *))
+{
+ u32 mask, highs;
+
+ struct qib_devdata *dd = qd->ppd->dd;
+
+ /* Initialize work struct for later QSFP events */
+ INIT_WORK(&qd->work, fevent);
+
+ /*
+ * Later, we may want more validation. For now, just set up pins and
+ * blip reset. If module is present, call qib_refresh_qsfp_cache(),
+ * to do further init.
+ */
+ mask = QSFP_GPIO_MOD_SEL_N | QSFP_GPIO_MOD_RST_N | QSFP_GPIO_LP_MODE;
+ highs = mask - QSFP_GPIO_MOD_RST_N;
+ if (qd->ppd->hw_pidx) {
+ mask <<= QSFP_GPIO_PORT2_SHIFT;
+ highs <<= QSFP_GPIO_PORT2_SHIFT;
+ }
+ dd->f_gpio_mod(dd, highs, mask, mask);
+ udelay(20); /* Generous RST dwell */
+
+ dd->f_gpio_mod(dd, mask, mask, mask);
+}
+
+void qib_qsfp_deinit(struct qib_qsfp_data *qd)
+{
+ /*
+ * There is nothing to do here for now. Our work is scheduled
+ * with queue_work(), and flush_workqueue() from remove_one
+ * will block until all work setup with queue_work()
+ * completes.
+ */
+}
+
+int qib_qsfp_dump(struct qib_pportdata *ppd, char *buf, int len)
+{
+ struct qib_qsfp_cache cd;
+ u8 bin_buff[QSFP_DUMP_CHUNK];
+ char lenstr[6];
+ int sofar, ret;
+ int bidx = 0;
+
+ sofar = 0;
+ ret = qib_refresh_qsfp_cache(ppd, &cd);
+ if (ret < 0)
+ goto bail;
+
+ lenstr[0] = ' ';
+ lenstr[1] = '\0';
+ if (QSFP_IS_CU(cd.tech))
+ sprintf(lenstr, "%dM ", cd.len);
+
+ sofar += scnprintf(buf + sofar, len - sofar, "PWR:%.3sW\n", pwr_codes +
+ (QSFP_PWR(cd.pwr) * 4));
+
+ sofar += scnprintf(buf + sofar, len - sofar, "TECH:%s%s\n", lenstr,
+ qib_qsfp_devtech[cd.tech >> 4]);
+
+ sofar += scnprintf(buf + sofar, len - sofar, "Vendor:%.*s\n",
+ QSFP_VEND_LEN, cd.vendor);
+
+ sofar += scnprintf(buf + sofar, len - sofar, "OUI:%06X\n",
+ QSFP_OUI(cd.oui));
+
+ sofar += scnprintf(buf + sofar, len - sofar, "Part#:%.*s\n",
+ QSFP_PN_LEN, cd.partnum);
+ sofar += scnprintf(buf + sofar, len - sofar, "Rev:%.*s\n",
+ QSFP_REV_LEN, cd.rev);
+ if (QSFP_IS_CU(cd.tech))
+ sofar += scnprintf(buf + sofar, len - sofar, "Atten:%d, %d\n",
+ QSFP_ATTEN_SDR(cd.atten),
+ QSFP_ATTEN_DDR(cd.atten));
+ sofar += scnprintf(buf + sofar, len - sofar, "Serial:%.*s\n",
+ QSFP_SN_LEN, cd.serial);
+ sofar += scnprintf(buf + sofar, len - sofar, "Date:%.*s\n",
+ QSFP_DATE_LEN, cd.date);
+ sofar += scnprintf(buf + sofar, len - sofar, "Lot:%.*s\n",
+ QSFP_LOT_LEN, cd.lot);
+
+ while (bidx < QSFP_DEFAULT_HDR_CNT) {
+ int iidx;
+
+ ret = qsfp_read(ppd, bidx, bin_buff, QSFP_DUMP_CHUNK);
+ if (ret < 0)
+ goto bail;
+ for (iidx = 0; iidx < ret; ++iidx) {
+ sofar += scnprintf(buf + sofar, len-sofar, " %02X",
+ bin_buff[iidx]);
+ }
+ sofar += scnprintf(buf + sofar, len - sofar, "\n");
+ bidx += QSFP_DUMP_CHUNK;
+ }
+ ret = sofar;
+bail:
+ return ret;
+}
diff --git a/drivers/infiniband/hw/qib/qib_qsfp.h b/drivers/infiniband/hw/qib/qib_qsfp.h
new file mode 100644
index 000000000..91908f533
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_qsfp.h
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/* QSFP support common definitions, for ib_qib driver */
+
+#define QSFP_DEV 0xA0
+#define QSFP_PWR_LAG_MSEC 2000
+#define QSFP_MODPRS_LAG_MSEC 20
+
+/*
+ * Below are masks for various QSFP signals, for Port 1.
+ * Port2 equivalents are shifted by QSFP_GPIO_PORT2_SHIFT.
+ * _N means asserted low
+ */
+#define QSFP_GPIO_MOD_SEL_N (4)
+#define QSFP_GPIO_MOD_PRS_N (8)
+#define QSFP_GPIO_INT_N (0x10)
+#define QSFP_GPIO_MOD_RST_N (0x20)
+#define QSFP_GPIO_LP_MODE (0x40)
+#define QSFP_GPIO_PORT2_SHIFT 5
+
+#define QSFP_PAGESIZE 128
+/* Defined fields that QLogic requires of qualified cables */
+/* Byte 0 is Identifier, not checked */
+/* Byte 1 is reserved "status MSB" */
+/* Byte 2 is "status LSB" We only care that D2 "Flat Mem" is set. */
+/*
+ * Rest of first 128 not used, although 127 is reserved for page select
+ * if module is not "Flat memory".
+ */
+/* Byte 128 is Identifier: must be 0x0c for QSFP, or 0x0d for QSFP+ */
+#define QSFP_MOD_ID_OFFS 128
+/*
+ * Byte 129 is "Extended Identifier". We only care about D7,D6: Power class
+ * 0:1.5W, 1:2.0W, 2:2.5W, 3:3.5W
+ */
+#define QSFP_MOD_PWR_OFFS 129
+/* Byte 130 is Connector type. Not QLogic req'd */
+/* Bytes 131..138 are Transceiver types, bit maps for various tech, none IB */
+/* Byte 139 is encoding. code 0x01 is 8b10b. Not QLogic req'd */
+/* Byte 140 is nominal bit-rate, in units of 100 Mbits/sec. Not QLogic req'd */
+/* Byte 141 is Extended Rate Select. Not QLogic req'd */
+/* Bytes 142..145 are lengths for various fiber types. Not QLogic req'd */
+/* Byte 146 is length for Copper. Units of 1 meter */
+#define QSFP_MOD_LEN_OFFS 146
+/*
+ * Byte 147 is Device technology. D0..3 not QLogic req'd
+ * D4..7 select from 15 choices, translated by table:
+ */
+#define QSFP_MOD_TECH_OFFS 147
+extern const char *const qib_qsfp_devtech[16];
+/* Active Equalization includes fiber, copper full EQ, and copper near Eq */
+#define QSFP_IS_ACTIVE(tech) ((0xA2FF >> ((tech) >> 4)) & 1)
+/* Active Equalization includes fiber, copper full EQ, and copper far Eq */
+#define QSFP_IS_ACTIVE_FAR(tech) ((0x32FF >> ((tech) >> 4)) & 1)
+/* Attenuation should be valid for copper other than full/near Eq */
+#define QSFP_HAS_ATTEN(tech) ((0x4D00 >> ((tech) >> 4)) & 1)
+/* Length is only valid if technology is "copper" */
+#define QSFP_IS_CU(tech) ((0xED00 >> ((tech) >> 4)) & 1)
+#define QSFP_TECH_1490 9
+
+#define QSFP_OUI(oui) (((unsigned)oui[0] << 16) | ((unsigned)oui[1] << 8) | \
+ oui[2])
+#define QSFP_OUI_AMPHENOL 0x415048
+#define QSFP_OUI_FINISAR 0x009065
+#define QSFP_OUI_GORE 0x002177
+
+/* Bytes 148..163 are Vendor Name, Left-justified Blank-filled */
+#define QSFP_VEND_OFFS 148
+#define QSFP_VEND_LEN 16
+/* Byte 164 is IB Extended transceiver codes. Bits D0..3 are SDR,DDR,QDR,EDR */
+#define QSFP_IBXCV_OFFS 164
+/* Bytes 165..167 are Vendor OUI number */
+#define QSFP_VOUI_OFFS 165
+#define QSFP_VOUI_LEN 3
+/* Bytes 168..183 are Vendor Part Number, string */
+#define QSFP_PN_OFFS 168
+#define QSFP_PN_LEN 16
+/* Bytes 184,185 are Vendor Rev. Left Justified, Blank-filled */
+#define QSFP_REV_OFFS 184
+#define QSFP_REV_LEN 2
+/*
+ * Bytes 186,187 are Wavelength, if Optical. Not Qlogic req'd
+ * If copper, they are attenuation in dB:
+ * Byte 186 is at 2.5Gb/sec (SDR), Byte 187 at 5.0Gb/sec (DDR)
+ */
+#define QSFP_ATTEN_OFFS 186
+#define QSFP_ATTEN_LEN 2
+/* Bytes 188,189 are Wavelength tolerance, not QLogic req'd */
+/* Byte 190 is Max Case Temp. Not QLogic req'd */
+/* Byte 191 is LSB of sum of bytes 128..190. Not QLogic req'd */
+#define QSFP_CC_OFFS 191
+/* Bytes 192..195 are Options implemented in qsfp. Not Qlogic req'd */
+/* Bytes 196..211 are Serial Number, String */
+#define QSFP_SN_OFFS 196
+#define QSFP_SN_LEN 16
+/* Bytes 212..217 are the date code, YYMMDD (MM==1 for Jan) */
+#define QSFP_DATE_OFFS 212
+#define QSFP_DATE_LEN 6
+/* Bytes 218,219 are optional lot-code, string */
+#define QSFP_LOT_OFFS 218
+#define QSFP_LOT_LEN 2
+/* Bytes 220, 221 indicate monitoring options. Not QLogic req'd */
+/* Byte 223 is LSB of sum of bytes 192..222 */
+#define QSFP_CC_EXT_OFFS 223
+
+/*
+ * struct qib_qsfp_data encapsulates state of QSFP device for one port.
+ * it will be part of port-chip-specific data if a board supports QSFP.
+ *
+ * Since multiple board-types use QSFP, and their pport_data structs
+ * differ (in the chip-specific section), we need a pointer to its head.
+ *
+ * Avoiding premature optimization, we will have one work_struct per port,
+ * and let the (increasingly inaccurately named) eep_lock arbitrate
+ * access to common resources.
+ *
+ */
+
+/*
+ * Hold the parts of the onboard EEPROM that we care about, so we aren't
+ * constantly bit-banging the TWSI interface.
+ */
+struct qib_qsfp_cache {
+ u8 id; /* must be 0x0C or 0x0D; 0 indicates invalid EEPROM read */
+ u8 pwr; /* in D6,7 */
+ u8 len; /* in meters, Cu only */
+ u8 tech;
+ char vendor[QSFP_VEND_LEN];
+ u8 xt_xcv; /* Ext. transceiver codes, 4 lsbs are IB speed supported */
+ u8 oui[QSFP_VOUI_LEN];
+ u8 partnum[QSFP_PN_LEN];
+ u8 rev[QSFP_REV_LEN];
+ u8 atten[QSFP_ATTEN_LEN];
+ u8 cks1; /* Checksum of bytes 128..190 */
+ u8 serial[QSFP_SN_LEN];
+ u8 date[QSFP_DATE_LEN];
+ u8 lot[QSFP_LOT_LEN];
+ u8 cks2; /* Checksum of bytes 192..222 */
+};
+
+#define QSFP_PWR(pbyte) (((pbyte) >> 6) & 3)
+#define QSFP_ATTEN_SDR(attenarray) (attenarray[0])
+#define QSFP_ATTEN_DDR(attenarray) (attenarray[1])
+
+struct qib_qsfp_data {
+ /* Helps to find our way */
+ struct qib_pportdata *ppd;
+ struct work_struct work;
+ struct qib_qsfp_cache cache;
+ unsigned long t_insert;
+ u8 modpresent;
+};
+
+extern int qib_refresh_qsfp_cache(struct qib_pportdata *ppd,
+ struct qib_qsfp_cache *cp);
+extern int qib_qsfp_mod_present(struct qib_pportdata *ppd);
+extern void qib_qsfp_init(struct qib_qsfp_data *qd,
+ void (*fevent)(struct work_struct *));
+extern void qib_qsfp_deinit(struct qib_qsfp_data *qd);
diff --git a/drivers/infiniband/hw/qib/qib_rc.c b/drivers/infiniband/hw/qib/qib_rc.c
new file mode 100644
index 000000000..4544d6f88
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_rc.c
@@ -0,0 +1,2290 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/io.h>
+
+#include "qib.h"
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_RC_##x
+
+static void rc_timeout(unsigned long arg);
+
+static u32 restart_sge(struct qib_sge_state *ss, struct qib_swqe *wqe,
+ u32 psn, u32 pmtu)
+{
+ u32 len;
+
+ len = ((psn - wqe->psn) & QIB_PSN_MASK) * pmtu;
+ ss->sge = wqe->sg_list[0];
+ ss->sg_list = wqe->sg_list + 1;
+ ss->num_sge = wqe->wr.num_sge;
+ ss->total_len = wqe->length;
+ qib_skip_sge(ss, len, 0);
+ return wqe->length - len;
+}
+
+static void start_timer(struct qib_qp *qp)
+{
+ qp->s_flags |= QIB_S_TIMER;
+ qp->s_timer.function = rc_timeout;
+ /* 4.096 usec. * (1 << qp->timeout) */
+ qp->s_timer.expires = jiffies + qp->timeout_jiffies;
+ add_timer(&qp->s_timer);
+}
+
+/**
+ * qib_make_rc_ack - construct a response packet (ACK, NAK, or RDMA read)
+ * @dev: the device for this QP
+ * @qp: a pointer to the QP
+ * @ohdr: a pointer to the IB header being constructed
+ * @pmtu: the path MTU
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ * Note that we are in the responder's side of the QP context.
+ * Note the QP s_lock must be held.
+ */
+static int qib_make_rc_ack(struct qib_ibdev *dev, struct qib_qp *qp,
+ struct qib_other_headers *ohdr, u32 pmtu)
+{
+ struct qib_ack_entry *e;
+ u32 hwords;
+ u32 len;
+ u32 bth0;
+ u32 bth2;
+
+ /* Don't send an ACK if we aren't supposed to. */
+ if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK))
+ goto bail;
+
+ /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+ hwords = 5;
+
+ switch (qp->s_ack_state) {
+ case OP(RDMA_READ_RESPONSE_LAST):
+ case OP(RDMA_READ_RESPONSE_ONLY):
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ if (e->rdma_sge.mr) {
+ qib_put_mr(e->rdma_sge.mr);
+ e->rdma_sge.mr = NULL;
+ }
+ /* FALLTHROUGH */
+ case OP(ATOMIC_ACKNOWLEDGE):
+ /*
+ * We can increment the tail pointer now that the last
+ * response has been sent instead of only being
+ * constructed.
+ */
+ if (++qp->s_tail_ack_queue > QIB_MAX_RDMA_ATOMIC)
+ qp->s_tail_ack_queue = 0;
+ /* FALLTHROUGH */
+ case OP(SEND_ONLY):
+ case OP(ACKNOWLEDGE):
+ /* Check for no next entry in the queue. */
+ if (qp->r_head_ack_queue == qp->s_tail_ack_queue) {
+ if (qp->s_flags & QIB_S_ACK_PENDING)
+ goto normal;
+ goto bail;
+ }
+
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ if (e->opcode == OP(RDMA_READ_REQUEST)) {
+ /*
+ * If an RDMA read response is being resent and
+ * we haven't seen the duplicate request yet,
+ * then stop sending the remaining responses the
+ * responder has seen until the requester resends it.
+ */
+ len = e->rdma_sge.sge_length;
+ if (len && !e->rdma_sge.mr) {
+ qp->s_tail_ack_queue = qp->r_head_ack_queue;
+ goto bail;
+ }
+ /* Copy SGE state in case we need to resend */
+ qp->s_rdma_mr = e->rdma_sge.mr;
+ if (qp->s_rdma_mr)
+ qib_get_mr(qp->s_rdma_mr);
+ qp->s_ack_rdma_sge.sge = e->rdma_sge;
+ qp->s_ack_rdma_sge.num_sge = 1;
+ qp->s_cur_sge = &qp->s_ack_rdma_sge;
+ if (len > pmtu) {
+ len = pmtu;
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_FIRST);
+ } else {
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_ONLY);
+ e->sent = 1;
+ }
+ ohdr->u.aeth = qib_compute_aeth(qp);
+ hwords++;
+ qp->s_ack_rdma_psn = e->psn;
+ bth2 = qp->s_ack_rdma_psn++ & QIB_PSN_MASK;
+ } else {
+ /* COMPARE_SWAP or FETCH_ADD */
+ qp->s_cur_sge = NULL;
+ len = 0;
+ qp->s_ack_state = OP(ATOMIC_ACKNOWLEDGE);
+ ohdr->u.at.aeth = qib_compute_aeth(qp);
+ ohdr->u.at.atomic_ack_eth[0] =
+ cpu_to_be32(e->atomic_data >> 32);
+ ohdr->u.at.atomic_ack_eth[1] =
+ cpu_to_be32(e->atomic_data);
+ hwords += sizeof(ohdr->u.at) / sizeof(u32);
+ bth2 = e->psn & QIB_PSN_MASK;
+ e->sent = 1;
+ }
+ bth0 = qp->s_ack_state << 24;
+ break;
+
+ case OP(RDMA_READ_RESPONSE_FIRST):
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_MIDDLE);
+ /* FALLTHROUGH */
+ case OP(RDMA_READ_RESPONSE_MIDDLE):
+ qp->s_cur_sge = &qp->s_ack_rdma_sge;
+ qp->s_rdma_mr = qp->s_ack_rdma_sge.sge.mr;
+ if (qp->s_rdma_mr)
+ qib_get_mr(qp->s_rdma_mr);
+ len = qp->s_ack_rdma_sge.sge.sge_length;
+ if (len > pmtu)
+ len = pmtu;
+ else {
+ ohdr->u.aeth = qib_compute_aeth(qp);
+ hwords++;
+ qp->s_ack_state = OP(RDMA_READ_RESPONSE_LAST);
+ e = &qp->s_ack_queue[qp->s_tail_ack_queue];
+ e->sent = 1;
+ }
+ bth0 = qp->s_ack_state << 24;
+ bth2 = qp->s_ack_rdma_psn++ & QIB_PSN_MASK;
+ break;
+
+ default:
+normal:
+ /*
+ * Send a regular ACK.
+ * Set the s_ack_state so we wait until after sending
+ * the ACK before setting s_ack_state to ACKNOWLEDGE
+ * (see above).
+ */
+ qp->s_ack_state = OP(SEND_ONLY);
+ qp->s_flags &= ~QIB_S_ACK_PENDING;
+ qp->s_cur_sge = NULL;
+ if (qp->s_nak_state)
+ ohdr->u.aeth =
+ cpu_to_be32((qp->r_msn & QIB_MSN_MASK) |
+ (qp->s_nak_state <<
+ QIB_AETH_CREDIT_SHIFT));
+ else
+ ohdr->u.aeth = qib_compute_aeth(qp);
+ hwords++;
+ len = 0;
+ bth0 = OP(ACKNOWLEDGE) << 24;
+ bth2 = qp->s_ack_psn & QIB_PSN_MASK;
+ }
+ qp->s_rdma_ack_cnt++;
+ qp->s_hdrwords = hwords;
+ qp->s_cur_size = len;
+ qib_make_ruc_header(qp, ohdr, bth0, bth2);
+ return 1;
+
+bail:
+ qp->s_ack_state = OP(ACKNOWLEDGE);
+ qp->s_flags &= ~(QIB_S_RESP_PENDING | QIB_S_ACK_PENDING);
+ return 0;
+}
+
+/**
+ * qib_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
+ * @qp: a pointer to the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int qib_make_rc_req(struct qib_qp *qp)
+{
+ struct qib_ibdev *dev = to_idev(qp->ibqp.device);
+ struct qib_other_headers *ohdr;
+ struct qib_sge_state *ss;
+ struct qib_swqe *wqe;
+ u32 hwords;
+ u32 len;
+ u32 bth0;
+ u32 bth2;
+ u32 pmtu = qp->pmtu;
+ char newreq;
+ unsigned long flags;
+ int ret = 0;
+ int delta;
+
+ ohdr = &qp->s_hdr->u.oth;
+ if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+ ohdr = &qp->s_hdr->u.l.oth;
+
+ /*
+ * The lock is needed to synchronize between the sending tasklet,
+ * the receive interrupt handler, and timeout resends.
+ */
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ /* Sending responses has higher priority than sending requests. */
+ if ((qp->s_flags & QIB_S_RESP_PENDING) &&
+ qib_make_rc_ack(dev, qp, ohdr, pmtu))
+ goto done;
+
+ if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_SEND_OK)) {
+ if (!(ib_qib_state_ops[qp->state] & QIB_FLUSH_SEND))
+ goto bail;
+ /* We are in the error state, flush the work request. */
+ if (qp->s_last == qp->s_head)
+ goto bail;
+ /* If DMAs are in progress, we can't flush immediately. */
+ if (atomic_read(&qp->s_dma_busy)) {
+ qp->s_flags |= QIB_S_WAIT_DMA;
+ goto bail;
+ }
+ wqe = get_swqe_ptr(qp, qp->s_last);
+ qib_send_complete(qp, wqe, qp->s_last != qp->s_acked ?
+ IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR);
+ /* will get called again */
+ goto done;
+ }
+
+ if (qp->s_flags & (QIB_S_WAIT_RNR | QIB_S_WAIT_ACK))
+ goto bail;
+
+ if (qib_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) {
+ if (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0) {
+ qp->s_flags |= QIB_S_WAIT_PSN;
+ goto bail;
+ }
+ qp->s_sending_psn = qp->s_psn;
+ qp->s_sending_hpsn = qp->s_psn - 1;
+ }
+
+ /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+ hwords = 5;
+ bth0 = 0;
+
+ /* Send a request. */
+ wqe = get_swqe_ptr(qp, qp->s_cur);
+ switch (qp->s_state) {
+ default:
+ if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_NEXT_SEND_OK))
+ goto bail;
+ /*
+ * Resend an old request or start a new one.
+ *
+ * We keep track of the current SWQE so that
+ * we don't reset the "furthest progress" state
+ * if we need to back up.
+ */
+ newreq = 0;
+ if (qp->s_cur == qp->s_tail) {
+ /* Check if send work queue is empty. */
+ if (qp->s_tail == qp->s_head)
+ goto bail;
+ /*
+ * If a fence is requested, wait for previous
+ * RDMA read and atomic operations to finish.
+ */
+ if ((wqe->wr.send_flags & IB_SEND_FENCE) &&
+ qp->s_num_rd_atomic) {
+ qp->s_flags |= QIB_S_WAIT_FENCE;
+ goto bail;
+ }
+ wqe->psn = qp->s_next_psn;
+ newreq = 1;
+ }
+ /*
+ * Note that we have to be careful not to modify the
+ * original work request since we may need to resend
+ * it.
+ */
+ len = wqe->length;
+ ss = &qp->s_sge;
+ bth2 = qp->s_psn & QIB_PSN_MASK;
+ switch (wqe->wr.opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_IMM:
+ /* If no credit, return. */
+ if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT) &&
+ qib_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
+ qp->s_flags |= QIB_S_WAIT_SSN_CREDIT;
+ goto bail;
+ }
+ wqe->lpsn = wqe->psn;
+ if (len > pmtu) {
+ wqe->lpsn += (len - 1) / pmtu;
+ qp->s_state = OP(SEND_FIRST);
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND)
+ qp->s_state = OP(SEND_ONLY);
+ else {
+ qp->s_state = OP(SEND_ONLY_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ }
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= IB_BTH_SOLICITED;
+ bth2 |= IB_BTH_REQ_ACK;
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case IB_WR_RDMA_WRITE:
+ if (newreq && !(qp->s_flags & QIB_S_UNLIMITED_CREDIT))
+ qp->s_lsn++;
+ /* FALLTHROUGH */
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ /* If no credit, return. */
+ if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT) &&
+ qib_cmp24(wqe->ssn, qp->s_lsn + 1) > 0) {
+ qp->s_flags |= QIB_S_WAIT_SSN_CREDIT;
+ goto bail;
+ }
+ ohdr->u.rc.reth.vaddr =
+ cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+ ohdr->u.rc.reth.rkey =
+ cpu_to_be32(wqe->wr.wr.rdma.rkey);
+ ohdr->u.rc.reth.length = cpu_to_be32(len);
+ hwords += sizeof(struct ib_reth) / sizeof(u32);
+ wqe->lpsn = wqe->psn;
+ if (len > pmtu) {
+ wqe->lpsn += (len - 1) / pmtu;
+ qp->s_state = OP(RDMA_WRITE_FIRST);
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+ qp->s_state = OP(RDMA_WRITE_ONLY);
+ else {
+ qp->s_state =
+ OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
+ /* Immediate data comes after RETH */
+ ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= IB_BTH_SOLICITED;
+ }
+ bth2 |= IB_BTH_REQ_ACK;
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case IB_WR_RDMA_READ:
+ /*
+ * Don't allow more operations to be started
+ * than the QP limits allow.
+ */
+ if (newreq) {
+ if (qp->s_num_rd_atomic >=
+ qp->s_max_rd_atomic) {
+ qp->s_flags |= QIB_S_WAIT_RDMAR;
+ goto bail;
+ }
+ qp->s_num_rd_atomic++;
+ if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT))
+ qp->s_lsn++;
+ /*
+ * Adjust s_next_psn to count the
+ * expected number of responses.
+ */
+ if (len > pmtu)
+ qp->s_next_psn += (len - 1) / pmtu;
+ wqe->lpsn = qp->s_next_psn++;
+ }
+ ohdr->u.rc.reth.vaddr =
+ cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+ ohdr->u.rc.reth.rkey =
+ cpu_to_be32(wqe->wr.wr.rdma.rkey);
+ ohdr->u.rc.reth.length = cpu_to_be32(len);
+ qp->s_state = OP(RDMA_READ_REQUEST);
+ hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
+ ss = NULL;
+ len = 0;
+ bth2 |= IB_BTH_REQ_ACK;
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ /*
+ * Don't allow more operations to be started
+ * than the QP limits allow.
+ */
+ if (newreq) {
+ if (qp->s_num_rd_atomic >=
+ qp->s_max_rd_atomic) {
+ qp->s_flags |= QIB_S_WAIT_RDMAR;
+ goto bail;
+ }
+ qp->s_num_rd_atomic++;
+ if (!(qp->s_flags & QIB_S_UNLIMITED_CREDIT))
+ qp->s_lsn++;
+ wqe->lpsn = wqe->psn;
+ }
+ if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP) {
+ qp->s_state = OP(COMPARE_SWAP);
+ ohdr->u.atomic_eth.swap_data = cpu_to_be64(
+ wqe->wr.wr.atomic.swap);
+ ohdr->u.atomic_eth.compare_data = cpu_to_be64(
+ wqe->wr.wr.atomic.compare_add);
+ } else {
+ qp->s_state = OP(FETCH_ADD);
+ ohdr->u.atomic_eth.swap_data = cpu_to_be64(
+ wqe->wr.wr.atomic.compare_add);
+ ohdr->u.atomic_eth.compare_data = 0;
+ }
+ ohdr->u.atomic_eth.vaddr[0] = cpu_to_be32(
+ wqe->wr.wr.atomic.remote_addr >> 32);
+ ohdr->u.atomic_eth.vaddr[1] = cpu_to_be32(
+ wqe->wr.wr.atomic.remote_addr);
+ ohdr->u.atomic_eth.rkey = cpu_to_be32(
+ wqe->wr.wr.atomic.rkey);
+ hwords += sizeof(struct ib_atomic_eth) / sizeof(u32);
+ ss = NULL;
+ len = 0;
+ bth2 |= IB_BTH_REQ_ACK;
+ if (++qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ default:
+ goto bail;
+ }
+ qp->s_sge.sge = wqe->sg_list[0];
+ qp->s_sge.sg_list = wqe->sg_list + 1;
+ qp->s_sge.num_sge = wqe->wr.num_sge;
+ qp->s_sge.total_len = wqe->length;
+ qp->s_len = wqe->length;
+ if (newreq) {
+ qp->s_tail++;
+ if (qp->s_tail >= qp->s_size)
+ qp->s_tail = 0;
+ }
+ if (wqe->wr.opcode == IB_WR_RDMA_READ)
+ qp->s_psn = wqe->lpsn + 1;
+ else {
+ qp->s_psn++;
+ if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0)
+ qp->s_next_psn = qp->s_psn;
+ }
+ break;
+
+ case OP(RDMA_READ_RESPONSE_FIRST):
+ /*
+ * qp->s_state is normally set to the opcode of the
+ * last packet constructed for new requests and therefore
+ * is never set to RDMA read response.
+ * RDMA_READ_RESPONSE_FIRST is used by the ACK processing
+ * thread to indicate a SEND needs to be restarted from an
+ * earlier PSN without interfering with the sending thread.
+ * See qib_restart_rc().
+ */
+ qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
+ /* FALLTHROUGH */
+ case OP(SEND_FIRST):
+ qp->s_state = OP(SEND_MIDDLE);
+ /* FALLTHROUGH */
+ case OP(SEND_MIDDLE):
+ bth2 = qp->s_psn++ & QIB_PSN_MASK;
+ if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0)
+ qp->s_next_psn = qp->s_psn;
+ ss = &qp->s_sge;
+ len = qp->s_len;
+ if (len > pmtu) {
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND)
+ qp->s_state = OP(SEND_LAST);
+ else {
+ qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ }
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= IB_BTH_SOLICITED;
+ bth2 |= IB_BTH_REQ_ACK;
+ qp->s_cur++;
+ if (qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case OP(RDMA_READ_RESPONSE_LAST):
+ /*
+ * qp->s_state is normally set to the opcode of the
+ * last packet constructed for new requests and therefore
+ * is never set to RDMA read response.
+ * RDMA_READ_RESPONSE_LAST is used by the ACK processing
+ * thread to indicate an RDMA write needs to be restarted from
+ * an earlier PSN without interfering with the sending thread.
+ * See qib_restart_rc().
+ */
+ qp->s_len = restart_sge(&qp->s_sge, wqe, qp->s_psn, pmtu);
+ /* FALLTHROUGH */
+ case OP(RDMA_WRITE_FIRST):
+ qp->s_state = OP(RDMA_WRITE_MIDDLE);
+ /* FALLTHROUGH */
+ case OP(RDMA_WRITE_MIDDLE):
+ bth2 = qp->s_psn++ & QIB_PSN_MASK;
+ if (qib_cmp24(qp->s_psn, qp->s_next_psn) > 0)
+ qp->s_next_psn = qp->s_psn;
+ ss = &qp->s_sge;
+ len = qp->s_len;
+ if (len > pmtu) {
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+ qp->s_state = OP(RDMA_WRITE_LAST);
+ else {
+ qp->s_state = OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= IB_BTH_SOLICITED;
+ }
+ bth2 |= IB_BTH_REQ_ACK;
+ qp->s_cur++;
+ if (qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case OP(RDMA_READ_RESPONSE_MIDDLE):
+ /*
+ * qp->s_state is normally set to the opcode of the
+ * last packet constructed for new requests and therefore
+ * is never set to RDMA read response.
+ * RDMA_READ_RESPONSE_MIDDLE is used by the ACK processing
+ * thread to indicate an RDMA read needs to be restarted from
+ * an earlier PSN without interfering with the sending thread.
+ * See qib_restart_rc().
+ */
+ len = ((qp->s_psn - wqe->psn) & QIB_PSN_MASK) * pmtu;
+ ohdr->u.rc.reth.vaddr =
+ cpu_to_be64(wqe->wr.wr.rdma.remote_addr + len);
+ ohdr->u.rc.reth.rkey =
+ cpu_to_be32(wqe->wr.wr.rdma.rkey);
+ ohdr->u.rc.reth.length = cpu_to_be32(wqe->length - len);
+ qp->s_state = OP(RDMA_READ_REQUEST);
+ hwords += sizeof(ohdr->u.rc.reth) / sizeof(u32);
+ bth2 = (qp->s_psn & QIB_PSN_MASK) | IB_BTH_REQ_ACK;
+ qp->s_psn = wqe->lpsn + 1;
+ ss = NULL;
+ len = 0;
+ qp->s_cur++;
+ if (qp->s_cur == qp->s_size)
+ qp->s_cur = 0;
+ break;
+ }
+ qp->s_sending_hpsn = bth2;
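+	/*
+	 * Sign-extend the 24-bit PSN delta so the test below can
+	 * request an ACK every QIB_PSN_CREDIT packets.
+	 */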
+ delta = (((int) bth2 - (int) wqe->psn) << 8) >> 8;
+ if (delta && delta % QIB_PSN_CREDIT == 0)
+ bth2 |= IB_BTH_REQ_ACK;
+ if (qp->s_flags & QIB_S_SEND_ONE) {
+ qp->s_flags &= ~QIB_S_SEND_ONE;
+ qp->s_flags |= QIB_S_WAIT_ACK;
+ bth2 |= IB_BTH_REQ_ACK;
+ }
+ qp->s_len -= len;
+ qp->s_hdrwords = hwords;
+ qp->s_cur_sge = ss;
+ qp->s_cur_size = len;
+ qib_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24), bth2);
+done:
+ ret = 1;
+ goto unlock;
+
+bail:
+ qp->s_flags &= ~QIB_S_BUSY;
+unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return ret;
+}
+
+/**
+ * qib_send_rc_ack - Construct an ACK packet and send it
+ * @qp: a pointer to the QP
+ *
+ * This is called from qib_rc_rcv() and qib_kreceive().
+ * Note that RDMA reads and atomics are handled in the
+ * send side QP state and tasklet.
+ */
+void qib_send_rc_ack(struct qib_qp *qp)
+{
+ struct qib_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+ struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+ u64 pbc;
+ u16 lrh0;
+ u32 bth0;
+ u32 hwords;
+ u32 pbufn;
+ u32 __iomem *piobuf;
+ struct qib_ib_header hdr;
+ struct qib_other_headers *ohdr;
+ u32 control;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK))
+ goto unlock;
+
+ /* Don't send ACK or NAK if a RDMA read or atomic is pending. */
+ if ((qp->s_flags & QIB_S_RESP_PENDING) || qp->s_rdma_ack_cnt)
+ goto queue_ack;
+
+ /* Construct the header with s_lock held so APM doesn't change it. */
+ ohdr = &hdr.u.oth;
+ lrh0 = QIB_LRH_BTH;
+ /* header size in 32-bit words LRH+BTH+AETH = (8+12+4)/4. */
+ hwords = 6;
+ if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+ hwords += qib_make_grh(ibp, &hdr.u.l.grh,
+ &qp->remote_ah_attr.grh, hwords, 0);
+ ohdr = &hdr.u.l.oth;
+ lrh0 = QIB_LRH_GRH;
+ }
+	/* read pkey_index w/o lock (it's atomic) */
+ bth0 = qib_get_pkey(ibp, qp->s_pkey_index) | (OP(ACKNOWLEDGE) << 24);
+ if (qp->s_mig_state == IB_MIG_MIGRATED)
+ bth0 |= IB_BTH_MIG_REQ;
+ if (qp->r_nak_state)
+ ohdr->u.aeth = cpu_to_be32((qp->r_msn & QIB_MSN_MASK) |
+ (qp->r_nak_state <<
+ QIB_AETH_CREDIT_SHIFT));
+ else
+ ohdr->u.aeth = qib_compute_aeth(qp);
+ lrh0 |= ibp->sl_to_vl[qp->remote_ah_attr.sl] << 12 |
+ qp->remote_ah_attr.sl << 4;
+ hdr.lrh[0] = cpu_to_be16(lrh0);
+ hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+ hdr.lrh[2] = cpu_to_be16(hwords + SIZE_OF_CRC);
+ hdr.lrh[3] = cpu_to_be16(ppd->lid | qp->remote_ah_attr.src_path_bits);
+ ohdr->bth[0] = cpu_to_be32(bth0);
+ ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
+ ohdr->bth[2] = cpu_to_be32(qp->r_ack_psn & QIB_PSN_MASK);
+
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+ /* Don't try to send ACKs if the link isn't ACTIVE */
+ if (!(ppd->lflags & QIBL_LINKACTIVE))
+ goto done;
+
+ control = dd->f_setpbc_control(ppd, hwords + SIZE_OF_CRC,
+ qp->s_srate, lrh0 >> 12);
+ /* length is + 1 for the control dword */
+ pbc = ((u64) control << 32) | (hwords + 1);
+
+ piobuf = dd->f_getsendbuf(ppd, pbc, &pbufn);
+ if (!piobuf) {
+ /*
+ * We are out of PIO buffers at the moment.
+ * Pass responsibility for sending the ACK to the
+ * send tasklet so that when a PIO buffer becomes
+ * available, the ACK is sent ahead of other outgoing
+ * packets.
+ */
+ spin_lock_irqsave(&qp->s_lock, flags);
+ goto queue_ack;
+ }
+
+ /*
+ * Write the pbc.
+	 * We have to flush after the PBC for correctness on some CPUs,
+	 * or the WC buffer can be written out of order.
+ */
+ writeq(pbc, piobuf);
+
+ if (dd->flags & QIB_PIO_FLUSH_WC) {
+ u32 *hdrp = (u32 *) &hdr;
+
+ qib_flush_wc();
+ qib_pio_copy(piobuf + 2, hdrp, hwords - 1);
+ qib_flush_wc();
+ __raw_writel(hdrp[hwords - 1], piobuf + hwords + 1);
+ } else
+ qib_pio_copy(piobuf + 2, (u32 *) &hdr, hwords);
+
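+	/*
+	 * Chips that need the special send trigger get the magic word
+	 * written at the trigger offset for this buffer size.
+	 */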
+ if (dd->flags & QIB_USE_SPCL_TRIG) {
+ u32 spcl_off = (pbufn >= dd->piobcnt2k) ? 2047 : 1023;
+
+ qib_flush_wc();
+ __raw_writel(0xaebecede, piobuf + spcl_off);
+ }
+
+ qib_flush_wc();
+ qib_sendbuf_done(dd, pbufn);
+
+ this_cpu_inc(ibp->pmastats->n_unicast_xmit);
+ goto done;
+
+queue_ack:
+ if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) {
+ ibp->n_rc_qacks++;
+ qp->s_flags |= QIB_S_ACK_PENDING | QIB_S_RESP_PENDING;
+ qp->s_nak_state = qp->r_nak_state;
+ qp->s_ack_psn = qp->r_ack_psn;
+
+ /* Schedule the send tasklet. */
+ qib_schedule_send(qp);
+ }
+unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+ return;
+}
+
+/**
+ * reset_psn - reset the QP state to send starting from PSN
+ * @qp: the QP
+ * @psn: the packet sequence number to restart at
+ *
+ * This is called from do_rc_ack() and qib_restart_rc() to set the
+ * QP's send state so that it resumes from the given PSN.
+ * Called at interrupt level with the QP s_lock held.
+ */
+static void reset_psn(struct qib_qp *qp, u32 psn)
+{
+ u32 n = qp->s_acked;
+ struct qib_swqe *wqe = get_swqe_ptr(qp, n);
+ u32 opcode;
+
+ qp->s_cur = n;
+
+ /*
+ * If we are starting the request from the beginning,
+ * let the normal send code handle initialization.
+ */
+ if (qib_cmp24(psn, wqe->psn) <= 0) {
+ qp->s_state = OP(SEND_LAST);
+ goto done;
+ }
+
+ /* Find the work request opcode corresponding to the given PSN. */
+ opcode = wqe->wr.opcode;
+ for (;;) {
+ int diff;
+
+ if (++n == qp->s_size)
+ n = 0;
+ if (n == qp->s_tail)
+ break;
+ wqe = get_swqe_ptr(qp, n);
+ diff = qib_cmp24(psn, wqe->psn);
+ if (diff < 0)
+ break;
+ qp->s_cur = n;
+ /*
+ * If we are starting the request from the beginning,
+ * let the normal send code handle initialization.
+ */
+ if (diff == 0) {
+ qp->s_state = OP(SEND_LAST);
+ goto done;
+ }
+ opcode = wqe->wr.opcode;
+ }
+
+ /*
+ * Set the state to restart in the middle of a request.
+ * Don't change the s_sge, s_cur_sge, or s_cur_size.
+ * See qib_make_rc_req().
+ */
+ switch (opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_IMM:
+ qp->s_state = OP(RDMA_READ_RESPONSE_FIRST);
+ break;
+
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ qp->s_state = OP(RDMA_READ_RESPONSE_LAST);
+ break;
+
+ case IB_WR_RDMA_READ:
+ qp->s_state = OP(RDMA_READ_RESPONSE_MIDDLE);
+ break;
+
+ default:
+ /*
+		 * This case shouldn't happen since it's only
+		 * one PSN per request.
+ */
+ qp->s_state = OP(SEND_LAST);
+ }
+done:
+ qp->s_psn = psn;
+ /*
+ * Set QIB_S_WAIT_PSN as qib_rc_complete() may start the timer
+ * asynchronously before the send tasklet can get scheduled.
+ * Doing it in qib_make_rc_req() is too late.
+ */
+ if ((qib_cmp24(qp->s_psn, qp->s_sending_hpsn) <= 0) &&
+ (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0))
+ qp->s_flags |= QIB_S_WAIT_PSN;
+}
+
+/*
+ * Back up requester to resend the last un-ACKed request.
+ * The QP r_lock and s_lock should be held and interrupts disabled.
+ */
+static void qib_restart_rc(struct qib_qp *qp, u32 psn, int wait)
+{
+ struct qib_swqe *wqe = get_swqe_ptr(qp, qp->s_acked);
+ struct qib_ibport *ibp;
+
+ if (qp->s_retry == 0) {
+ if (qp->s_mig_state == IB_MIG_ARMED) {
+ qib_migrate_qp(qp);
+ qp->s_retry = qp->s_retry_cnt;
+ } else if (qp->s_last == qp->s_acked) {
+ qib_send_complete(qp, wqe, IB_WC_RETRY_EXC_ERR);
+ qib_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ return;
+ } else /* XXX need to handle delayed completion */
+ return;
+ } else
+ qp->s_retry--;
+
+ ibp = to_iport(qp->ibqp.device, qp->port_num);
+ if (wqe->wr.opcode == IB_WR_RDMA_READ)
+ ibp->n_rc_resends++;
+ else
+ ibp->n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK;
+
+ qp->s_flags &= ~(QIB_S_WAIT_FENCE | QIB_S_WAIT_RDMAR |
+ QIB_S_WAIT_SSN_CREDIT | QIB_S_WAIT_PSN |
+ QIB_S_WAIT_ACK);
+ if (wait)
+ qp->s_flags |= QIB_S_SEND_ONE;
+ reset_psn(qp, psn);
+}
+
+/*
+ * This is called from s_timer for missing responses.
+ */
+static void rc_timeout(unsigned long arg)
+{
+ struct qib_qp *qp = (struct qib_qp *)arg;
+ struct qib_ibport *ibp;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->r_lock, flags);
+ spin_lock(&qp->s_lock);
+ if (qp->s_flags & QIB_S_TIMER) {
+ ibp = to_iport(qp->ibqp.device, qp->port_num);
+ ibp->n_rc_timeouts++;
+ qp->s_flags &= ~QIB_S_TIMER;
+ del_timer(&qp->s_timer);
+ qib_restart_rc(qp, qp->s_last_psn + 1, 1);
+ qib_schedule_send(qp);
+ }
+ spin_unlock(&qp->s_lock);
+ spin_unlock_irqrestore(&qp->r_lock, flags);
+}
+
+/*
+ * This is called from s_timer for RNR timeouts.
+ */
+void qib_rc_rnr_retry(unsigned long arg)
+{
+ struct qib_qp *qp = (struct qib_qp *)arg;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (qp->s_flags & QIB_S_WAIT_RNR) {
+ qp->s_flags &= ~QIB_S_WAIT_RNR;
+ del_timer(&qp->s_timer);
+ qib_schedule_send(qp);
+ }
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+}
+
+/*
+ * Set qp->s_sending_psn to the next PSN after the given one.
+ * This would be psn+1 except when RDMA reads are present.
+ */
+static void reset_sending_psn(struct qib_qp *qp, u32 psn)
+{
+ struct qib_swqe *wqe;
+ u32 n = qp->s_last;
+
+ /* Find the work request corresponding to the given PSN. */
+ for (;;) {
+ wqe = get_swqe_ptr(qp, n);
+ if (qib_cmp24(psn, wqe->lpsn) <= 0) {
+ if (wqe->wr.opcode == IB_WR_RDMA_READ)
+ qp->s_sending_psn = wqe->lpsn + 1;
+ else
+ qp->s_sending_psn = psn + 1;
+ break;
+ }
+ if (++n == qp->s_size)
+ n = 0;
+ if (n == qp->s_tail)
+ break;
+ }
+}
+
+/*
+ * This should be called with the QP s_lock held and interrupts disabled.
+ */
+void qib_rc_send_complete(struct qib_qp *qp, struct qib_ib_header *hdr)
+{
+ struct qib_other_headers *ohdr;
+ struct qib_swqe *wqe;
+ struct ib_wc wc;
+ unsigned i;
+ u32 opcode;
+ u32 psn;
+
+ if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_OR_FLUSH_SEND))
+ return;
+
+ /* Find out where the BTH is */
+ if ((be16_to_cpu(hdr->lrh[0]) & 3) == QIB_LRH_BTH)
+ ohdr = &hdr->u.oth;
+ else
+ ohdr = &hdr->u.l.oth;
+
+ opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+ if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+ opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+ WARN_ON(!qp->s_rdma_ack_cnt);
+ qp->s_rdma_ack_cnt--;
+ return;
+ }
+
+ psn = be32_to_cpu(ohdr->bth[2]);
+ reset_sending_psn(qp, psn);
+
+ /*
+ * Start timer after a packet requesting an ACK has been sent and
+ * there are still requests that haven't been acked.
+ */
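+	/* psn still holds the raw bth[2], including the ACK-request bit. */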
+ if ((psn & IB_BTH_REQ_ACK) && qp->s_acked != qp->s_tail &&
+ !(qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR | QIB_S_WAIT_PSN)) &&
+ (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK))
+ start_timer(qp);
+
+ while (qp->s_last != qp->s_acked) {
+ wqe = get_swqe_ptr(qp, qp->s_last);
+ if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) >= 0 &&
+ qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)
+ break;
+ for (i = 0; i < wqe->wr.num_sge; i++) {
+ struct qib_sge *sge = &wqe->sg_list[i];
+
+ qib_put_mr(sge->mr);
+ }
+ /* Post a send completion queue entry if requested. */
+ if (!(qp->s_flags & QIB_S_SIGNAL_REQ_WR) ||
+ (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+ memset(&wc, 0, sizeof(wc));
+ wc.wr_id = wqe->wr.wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.opcode = ib_qib_wc_opcode[wqe->wr.opcode];
+ wc.byte_len = wqe->length;
+ wc.qp = &qp->ibqp;
+ qib_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
+ }
+ if (++qp->s_last >= qp->s_size)
+ qp->s_last = 0;
+ }
+ /*
+ * If we were waiting for sends to complete before resending,
+ * and they are now complete, restart sending.
+ */
+ if (qp->s_flags & QIB_S_WAIT_PSN &&
+ qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
+ qp->s_flags &= ~QIB_S_WAIT_PSN;
+ qp->s_sending_psn = qp->s_psn;
+ qp->s_sending_hpsn = qp->s_psn - 1;
+ qib_schedule_send(qp);
+ }
+}
+
+static inline void update_last_psn(struct qib_qp *qp, u32 psn)
+{
+ qp->s_last_psn = psn;
+}
+
+/*
+ * Generate a SWQE completion.
+ * This is similar to qib_send_complete but has to check to be sure
+ * that the SGEs are not being referenced if the SWQE is being resent.
+ */
+static struct qib_swqe *do_rc_completion(struct qib_qp *qp,
+ struct qib_swqe *wqe,
+ struct qib_ibport *ibp)
+{
+ struct ib_wc wc;
+ unsigned i;
+
+ /*
+ * Don't decrement refcount and don't generate a
+ * completion if the SWQE is being resent until the send
+ * is finished.
+ */
+ if (qib_cmp24(wqe->lpsn, qp->s_sending_psn) < 0 ||
+ qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) > 0) {
+ for (i = 0; i < wqe->wr.num_sge; i++) {
+ struct qib_sge *sge = &wqe->sg_list[i];
+
+ qib_put_mr(sge->mr);
+ }
+ /* Post a send completion queue entry if requested. */
+ if (!(qp->s_flags & QIB_S_SIGNAL_REQ_WR) ||
+ (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
+ memset(&wc, 0, sizeof(wc));
+ wc.wr_id = wqe->wr.wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.opcode = ib_qib_wc_opcode[wqe->wr.opcode];
+ wc.byte_len = wqe->length;
+ wc.qp = &qp->ibqp;
+ qib_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
+ }
+ if (++qp->s_last >= qp->s_size)
+ qp->s_last = 0;
+ } else
+ ibp->n_rc_delayed_comp++;
+
+ qp->s_retry = qp->s_retry_cnt;
+ update_last_psn(qp, wqe->lpsn);
+
+ /*
+ * If we are completing a request which is in the process of
+ * being resent, we can stop resending it since we know the
+ * responder has already seen it.
+ */
+ if (qp->s_acked == qp->s_cur) {
+ if (++qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ qp->s_acked = qp->s_cur;
+ wqe = get_swqe_ptr(qp, qp->s_cur);
+ if (qp->s_acked != qp->s_tail) {
+ qp->s_state = OP(SEND_LAST);
+ qp->s_psn = wqe->psn;
+ }
+ } else {
+ if (++qp->s_acked >= qp->s_size)
+ qp->s_acked = 0;
+ if (qp->state == IB_QPS_SQD && qp->s_acked == qp->s_cur)
+ qp->s_draining = 0;
+ wqe = get_swqe_ptr(qp, qp->s_acked);
+ }
+ return wqe;
+}
+
+/**
+ * do_rc_ack - process an incoming RC ACK
+ * @qp: the QP the ACK came in on
+ * @aeth: the AETH value from the ACK packet
+ * @psn: the packet sequence number of the ACK
+ * @opcode: the opcode of the request that resulted in the ACK
+ * @val: the value returned by a remote atomic operation, if any
+ * @rcd: the context pointer
+ *
+ * This is called from qib_rc_rcv_resp() to process an incoming RC ACK
+ * for the given QP.
+ * Called at interrupt level with the QP s_lock held.
+ * Returns 1 if OK, 0 if current operation should be aborted (NAK).
+ */
+static int do_rc_ack(struct qib_qp *qp, u32 aeth, u32 psn, int opcode,
+ u64 val, struct qib_ctxtdata *rcd)
+{
+ struct qib_ibport *ibp;
+ enum ib_wc_status status;
+ struct qib_swqe *wqe;
+ int ret = 0;
+ u32 ack_psn;
+ int diff;
+
+ /* Remove QP from retry timer */
+ if (qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR)) {
+ qp->s_flags &= ~(QIB_S_TIMER | QIB_S_WAIT_RNR);
+ del_timer(&qp->s_timer);
+ }
+
+ /*
+ * Note that NAKs implicitly ACK outstanding SEND and RDMA write
+ * requests and implicitly NAK RDMA read and atomic requests issued
+ * before the NAK'ed request. The MSN won't include the NAK'ed
+	 * request but will include any ACK'ed request(s).
+ */
+ ack_psn = psn;
+ if (aeth >> 29)
+ ack_psn--;
+ wqe = get_swqe_ptr(qp, qp->s_acked);
+ ibp = to_iport(qp->ibqp.device, qp->port_num);
+
+ /*
+ * The MSN might be for a later WQE than the PSN indicates so
+ * only complete WQEs that the PSN finishes.
+ */
+ while ((diff = qib_cmp24(ack_psn, wqe->lpsn)) >= 0) {
+ /*
+ * RDMA_READ_RESPONSE_ONLY is a special case since
+ * we want to generate completion events for everything
+ * before the RDMA read, copy the data, then generate
+ * the completion for the read.
+ */
+ if (wqe->wr.opcode == IB_WR_RDMA_READ &&
+ opcode == OP(RDMA_READ_RESPONSE_ONLY) &&
+ diff == 0) {
+ ret = 1;
+ goto bail;
+ }
+ /*
+ * If this request is a RDMA read or atomic, and the ACK is
+ * for a later operation, this ACK NAKs the RDMA read or
+ * atomic. In other words, only a RDMA_READ_LAST or ONLY
+ * can ACK a RDMA read and likewise for atomic ops. Note
+ * that the NAK case can only happen if relaxed ordering is
+ * used and requests are sent after an RDMA read or atomic
+ * is sent but before the response is received.
+ */
+ if ((wqe->wr.opcode == IB_WR_RDMA_READ &&
+ (opcode != OP(RDMA_READ_RESPONSE_LAST) || diff != 0)) ||
+ ((wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) &&
+ (opcode != OP(ATOMIC_ACKNOWLEDGE) || diff != 0))) {
+ /* Retry this request. */
+ if (!(qp->r_flags & QIB_R_RDMAR_SEQ)) {
+ qp->r_flags |= QIB_R_RDMAR_SEQ;
+ qib_restart_rc(qp, qp->s_last_psn + 1, 0);
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= QIB_R_RSP_SEND;
+ atomic_inc(&qp->refcount);
+ list_add_tail(&qp->rspwait,
+ &rcd->qp_wait_list);
+ }
+ }
+ /*
+ * No need to process the ACK/NAK since we are
+ * restarting an earlier request.
+ */
+ goto bail;
+ }
+ if (wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) {
+ u64 *vaddr = wqe->sg_list[0].vaddr;
+ *vaddr = val;
+ }
+ if (qp->s_num_rd_atomic &&
+ (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)) {
+ qp->s_num_rd_atomic--;
+ /* Restart sending task if fence is complete */
+ if ((qp->s_flags & QIB_S_WAIT_FENCE) &&
+ !qp->s_num_rd_atomic) {
+ qp->s_flags &= ~(QIB_S_WAIT_FENCE |
+ QIB_S_WAIT_ACK);
+ qib_schedule_send(qp);
+ } else if (qp->s_flags & QIB_S_WAIT_RDMAR) {
+ qp->s_flags &= ~(QIB_S_WAIT_RDMAR |
+ QIB_S_WAIT_ACK);
+ qib_schedule_send(qp);
+ }
+ }
+ wqe = do_rc_completion(qp, wqe, ibp);
+ if (qp->s_acked == qp->s_tail)
+ break;
+ }
+
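+	/*
+	 * The top three bits of the AETH encode the type:
+	 * 0 is an ACK, 1 an RNR NAK, 3 a NAK; 2 is reserved.
+	 */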
+ switch (aeth >> 29) {
+ case 0: /* ACK */
+ ibp->n_rc_acks++;
+ if (qp->s_acked != qp->s_tail) {
+ /*
+ * We are expecting more ACKs so
+ * reset the retransmit timer.
+ */
+ start_timer(qp);
+ /*
+ * We can stop resending the earlier packets and
+ * continue with the next packet the receiver wants.
+ */
+ if (qib_cmp24(qp->s_psn, psn) <= 0)
+ reset_psn(qp, psn + 1);
+ } else if (qib_cmp24(qp->s_psn, psn) <= 0) {
+ qp->s_state = OP(SEND_LAST);
+ qp->s_psn = psn + 1;
+ }
+ if (qp->s_flags & QIB_S_WAIT_ACK) {
+ qp->s_flags &= ~QIB_S_WAIT_ACK;
+ qib_schedule_send(qp);
+ }
+ qib_get_credit(qp, aeth);
+ qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+ qp->s_retry = qp->s_retry_cnt;
+ update_last_psn(qp, psn);
+ ret = 1;
+ goto bail;
+
+ case 1: /* RNR NAK */
+ ibp->n_rnr_naks++;
+ if (qp->s_acked == qp->s_tail)
+ goto bail;
+ if (qp->s_flags & QIB_S_WAIT_RNR)
+ goto bail;
+ if (qp->s_rnr_retry == 0) {
+ status = IB_WC_RNR_RETRY_EXC_ERR;
+ goto class_b;
+ }
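+		/* An RNR retry count of 7 means retry forever. */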
+ if (qp->s_rnr_retry_cnt < 7)
+ qp->s_rnr_retry--;
+
+ /* The last valid PSN is the previous PSN. */
+ update_last_psn(qp, psn - 1);
+
+ ibp->n_rc_resends += (qp->s_psn - psn) & QIB_PSN_MASK;
+
+ reset_psn(qp, psn);
+
+ qp->s_flags &= ~(QIB_S_WAIT_SSN_CREDIT | QIB_S_WAIT_ACK);
+ qp->s_flags |= QIB_S_WAIT_RNR;
+ qp->s_timer.function = qib_rc_rnr_retry;
+ qp->s_timer.expires = jiffies + usecs_to_jiffies(
+ ib_qib_rnr_table[(aeth >> QIB_AETH_CREDIT_SHIFT) &
+ QIB_AETH_CREDIT_MASK]);
+ add_timer(&qp->s_timer);
+ goto bail;
+
+ case 3: /* NAK */
+ if (qp->s_acked == qp->s_tail)
+ goto bail;
+ /* The last valid PSN is the previous PSN. */
+ update_last_psn(qp, psn - 1);
+ switch ((aeth >> QIB_AETH_CREDIT_SHIFT) &
+ QIB_AETH_CREDIT_MASK) {
+ case 0: /* PSN sequence error */
+ ibp->n_seq_naks++;
+ /*
+ * Back up to the responder's expected PSN.
+ * Note that we might get a NAK in the middle of an
+ * RDMA READ response which terminates the RDMA
+ * READ.
+ */
+ qib_restart_rc(qp, psn, 0);
+ qib_schedule_send(qp);
+ break;
+
+ case 1: /* Invalid Request */
+ status = IB_WC_REM_INV_REQ_ERR;
+ ibp->n_other_naks++;
+ goto class_b;
+
+ case 2: /* Remote Access Error */
+ status = IB_WC_REM_ACCESS_ERR;
+ ibp->n_other_naks++;
+ goto class_b;
+
+ case 3: /* Remote Operation Error */
+ status = IB_WC_REM_OP_ERR;
+ ibp->n_other_naks++;
+class_b:
+ if (qp->s_last == qp->s_acked) {
+ qib_send_complete(qp, wqe, status);
+ qib_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ }
+ break;
+
+ default:
+ /* Ignore other reserved NAK error codes */
+ goto reserved;
+ }
+ qp->s_retry = qp->s_retry_cnt;
+ qp->s_rnr_retry = qp->s_rnr_retry_cnt;
+ goto bail;
+
+ default: /* 2: reserved */
+reserved:
+ /* Ignore reserved NAK codes. */
+ goto bail;
+ }
+
+bail:
+ return ret;
+}
+
+/*
+ * We have seen an out of sequence RDMA read middle or last packet.
+ * This ACKs SENDs and RDMA writes up to the first RDMA read or atomic SWQE.
+ */
+static void rdma_seq_err(struct qib_qp *qp, struct qib_ibport *ibp, u32 psn,
+ struct qib_ctxtdata *rcd)
+{
+ struct qib_swqe *wqe;
+
+ /* Remove QP from retry timer */
+ if (qp->s_flags & (QIB_S_TIMER | QIB_S_WAIT_RNR)) {
+ qp->s_flags &= ~(QIB_S_TIMER | QIB_S_WAIT_RNR);
+ del_timer(&qp->s_timer);
+ }
+
+ wqe = get_swqe_ptr(qp, qp->s_acked);
+
+ while (qib_cmp24(psn, wqe->lpsn) > 0) {
+ if (wqe->wr.opcode == IB_WR_RDMA_READ ||
+ wqe->wr.opcode == IB_WR_ATOMIC_CMP_AND_SWP ||
+ wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD)
+ break;
+ wqe = do_rc_completion(qp, wqe, ibp);
+ }
+
+ ibp->n_rdma_seq++;
+ qp->r_flags |= QIB_R_RDMAR_SEQ;
+ qib_restart_rc(qp, qp->s_last_psn + 1, 0);
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= QIB_R_RSP_SEND;
+ atomic_inc(&qp->refcount);
+ list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+ }
+}
+
+/**
+ * qib_rc_rcv_resp - process an incoming RC response packet
+ * @ibp: the port this packet came in on
+ * @ohdr: the other headers for this packet
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP for this packet
+ * @opcode: the opcode for this packet
+ * @psn: the packet sequence number for this packet
+ * @hdrsize: the header length
+ * @pmtu: the path MTU
+ * @rcd: the context pointer
+ *
+ * This is called from qib_rc_rcv() to process an incoming RC response
+ * packet for the given QP.
+ * Called at interrupt level.
+ */
+static void qib_rc_rcv_resp(struct qib_ibport *ibp,
+ struct qib_other_headers *ohdr,
+ void *data, u32 tlen,
+ struct qib_qp *qp,
+ u32 opcode,
+ u32 psn, u32 hdrsize, u32 pmtu,
+ struct qib_ctxtdata *rcd)
+{
+ struct qib_swqe *wqe;
+ struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+ enum ib_wc_status status;
+ unsigned long flags;
+ int diff;
+ u32 pad;
+ u32 aeth;
+ u64 val;
+
+ if (opcode != OP(RDMA_READ_RESPONSE_MIDDLE)) {
+ /*
+ * If ACK'd PSN on SDMA busy list try to make progress to
+ * reclaim SDMA credits.
+ */
+ if ((qib_cmp24(psn, qp->s_sending_psn) >= 0) &&
+ (qib_cmp24(qp->s_sending_psn, qp->s_sending_hpsn) <= 0)) {
+
+ /*
+ * If send tasklet not running attempt to progress
+ * SDMA queue.
+ */
+ if (!(qp->s_flags & QIB_S_BUSY)) {
+ /* Acquire SDMA Lock */
+ spin_lock_irqsave(&ppd->sdma_lock, flags);
+ /* Invoke sdma make progress */
+ qib_sdma_make_progress(ppd);
+ /* Release SDMA Lock */
+ spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+ }
+ }
+ }
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK))
+ goto ack_done;
+
+ /* Ignore invalid responses. */
+ if (qib_cmp24(psn, qp->s_next_psn) >= 0)
+ goto ack_done;
+
+ /* Ignore duplicate responses. */
+ diff = qib_cmp24(psn, qp->s_last_psn);
+ if (unlikely(diff <= 0)) {
+ /* Update credits for "ghost" ACKs */
+ if (diff == 0 && opcode == OP(ACKNOWLEDGE)) {
+ aeth = be32_to_cpu(ohdr->u.aeth);
+ if ((aeth >> 29) == 0)
+ qib_get_credit(qp, aeth);
+ }
+ goto ack_done;
+ }
+
+ /*
+ * Skip everything other than the PSN we expect, if we are waiting
+ * for a reply to a restarted RDMA read or atomic op.
+ */
+ if (qp->r_flags & QIB_R_RDMAR_SEQ) {
+ if (qib_cmp24(psn, qp->s_last_psn + 1) != 0)
+ goto ack_done;
+ qp->r_flags &= ~QIB_R_RDMAR_SEQ;
+ }
+
+ if (unlikely(qp->s_acked == qp->s_tail))
+ goto ack_done;
+ wqe = get_swqe_ptr(qp, qp->s_acked);
+ status = IB_WC_SUCCESS;
+
+ switch (opcode) {
+ case OP(ACKNOWLEDGE):
+ case OP(ATOMIC_ACKNOWLEDGE):
+ case OP(RDMA_READ_RESPONSE_FIRST):
+ aeth = be32_to_cpu(ohdr->u.aeth);
+ if (opcode == OP(ATOMIC_ACKNOWLEDGE)) {
+ __be32 *p = ohdr->u.at.atomic_ack_eth;
+
+ val = ((u64) be32_to_cpu(p[0]) << 32) |
+ be32_to_cpu(p[1]);
+ } else
+ val = 0;
+ if (!do_rc_ack(qp, aeth, psn, opcode, val, rcd) ||
+ opcode != OP(RDMA_READ_RESPONSE_FIRST))
+ goto ack_done;
+ hdrsize += 4;
+ wqe = get_swqe_ptr(qp, qp->s_acked);
+ if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+ goto ack_op_err;
+ /*
+ * If this is a response to a resent RDMA read, we
+ * have to be careful to copy the data to the right
+ * location.
+ */
+ qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
+ wqe, psn, pmtu);
+ goto read_middle;
+
+ case OP(RDMA_READ_RESPONSE_MIDDLE):
+ /* no AETH, no ACK */
+ if (unlikely(qib_cmp24(psn, qp->s_last_psn + 1)))
+ goto ack_seq_err;
+ if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+ goto ack_op_err;
+read_middle:
+ if (unlikely(tlen != (hdrsize + pmtu + 4)))
+ goto ack_len_err;
+ if (unlikely(pmtu >= qp->s_rdma_read_len))
+ goto ack_len_err;
+
+ /*
+ * We got a response so update the timeout.
+ * 4.096 usec. * (1 << qp->timeout)
+ */
+ qp->s_flags |= QIB_S_TIMER;
+ mod_timer(&qp->s_timer, jiffies + qp->timeout_jiffies);
+ if (qp->s_flags & QIB_S_WAIT_ACK) {
+ qp->s_flags &= ~QIB_S_WAIT_ACK;
+ qib_schedule_send(qp);
+ }
+
+ if (opcode == OP(RDMA_READ_RESPONSE_MIDDLE))
+ qp->s_retry = qp->s_retry_cnt;
+
+ /*
+ * Update the RDMA receive state but do the copy w/o
+ * holding the locks and blocking interrupts.
+ */
+ qp->s_rdma_read_len -= pmtu;
+ update_last_psn(qp, psn);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ qib_copy_sge(&qp->s_rdma_read_sge, data, pmtu, 0);
+ goto bail;
+
+ case OP(RDMA_READ_RESPONSE_ONLY):
+ aeth = be32_to_cpu(ohdr->u.aeth);
+ if (!do_rc_ack(qp, aeth, psn, opcode, 0, rcd))
+ goto ack_done;
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ /*
+ * Check that the data size is >= 0 && <= pmtu.
+ * Remember to account for the AETH header (4) and
+ * ICRC (4).
+ */
+ if (unlikely(tlen < (hdrsize + pad + 8)))
+ goto ack_len_err;
+ /*
+ * If this is a response to a resent RDMA read, we
+ * have to be careful to copy the data to the right
+ * location.
+ */
+ wqe = get_swqe_ptr(qp, qp->s_acked);
+ qp->s_rdma_read_len = restart_sge(&qp->s_rdma_read_sge,
+ wqe, psn, pmtu);
+ goto read_last;
+
+ case OP(RDMA_READ_RESPONSE_LAST):
+ /* ACKs READ req. */
+ if (unlikely(qib_cmp24(psn, qp->s_last_psn + 1)))
+ goto ack_seq_err;
+ if (unlikely(wqe->wr.opcode != IB_WR_RDMA_READ))
+ goto ack_op_err;
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ /*
+ * Check that the data size is >= 1 && <= pmtu.
+ * Remember to account for the AETH header (4) and
+ * ICRC (4).
+ */
+ if (unlikely(tlen <= (hdrsize + pad + 8)))
+ goto ack_len_err;
+read_last:
+ tlen -= hdrsize + pad + 8;
+ if (unlikely(tlen != qp->s_rdma_read_len))
+ goto ack_len_err;
+ aeth = be32_to_cpu(ohdr->u.aeth);
+ qib_copy_sge(&qp->s_rdma_read_sge, data, tlen, 0);
+ WARN_ON(qp->s_rdma_read_sge.num_sge);
+ (void) do_rc_ack(qp, aeth, psn,
+ OP(RDMA_READ_RESPONSE_LAST), 0, rcd);
+ goto ack_done;
+ }
+
+ack_op_err:
+ status = IB_WC_LOC_QP_OP_ERR;
+ goto ack_err;
+
+ack_seq_err:
+ rdma_seq_err(qp, ibp, psn, rcd);
+ goto ack_done;
+
+ack_len_err:
+ status = IB_WC_LOC_LEN_ERR;
+ack_err:
+ if (qp->s_last == qp->s_acked) {
+ qib_send_complete(qp, wqe, status);
+ qib_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+ }
+ack_done:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+bail:
+ return;
+}
+
+/**
+ * qib_rc_rcv_error - process an incoming duplicate or error RC packet
+ * @ohdr: the other headers for this packet
+ * @data: the packet data
+ * @qp: the QP for this packet
+ * @opcode: the opcode for this packet
+ * @psn: the packet sequence number for this packet
+ * @diff: the difference between the PSN and the expected PSN
+ * @rcd: the context pointer
+ *
+ * This is called from qib_rc_rcv() to process an unexpected
+ * incoming RC packet for the given QP.
+ * Called at interrupt level.
+ * Return 1 if no more processing is needed; otherwise return 0 to
+ * schedule a response to be sent.
+ */
+static int qib_rc_rcv_error(struct qib_other_headers *ohdr,
+ void *data,
+ struct qib_qp *qp,
+ u32 opcode,
+ u32 psn,
+ int diff,
+ struct qib_ctxtdata *rcd)
+{
+ struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct qib_ack_entry *e;
+ unsigned long flags;
+ u8 i, prev;
+ int old_req;
+
+ if (diff > 0) {
+ /*
+ * Packet sequence error.
+ * A NAK will ACK earlier sends and RDMA writes.
+ * Don't queue the NAK if we already sent one.
+ */
+ if (!qp->r_nak_state) {
+ ibp->n_rc_seqnak++;
+ qp->r_nak_state = IB_NAK_PSN_ERROR;
+ /* Use the expected PSN. */
+ qp->r_ack_psn = qp->r_psn;
+ /*
+ * Wait to send the sequence NAK until all packets
+ * in the receive queue have been processed.
+ * Otherwise, we end up propagating congestion.
+ */
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= QIB_R_RSP_NAK;
+ atomic_inc(&qp->refcount);
+ list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+ }
+ }
+ goto done;
+ }
+
+ /*
+ * Handle a duplicate request. Don't re-execute SEND, RDMA
+ * write or atomic op. Don't NAK errors, just silently drop
+ * the duplicate request. Note that r_sge, r_len, and
+ * r_rcv_len may be in use so don't modify them.
+ *
+ * We are supposed to ACK the earliest duplicate PSN but we
+ * can coalesce an outstanding duplicate ACK. We have to
+ * send the earliest so that RDMA reads can be restarted at
+ * the requester's expected PSN.
+ *
+ * First, find where this duplicate PSN falls within the
+ * ACKs previously sent.
+ * old_req is true if there is an older response that is scheduled
+ * to be sent before sending this one.
+ */
+ e = NULL;
+ old_req = 1;
+ ibp->n_rc_dupreq++;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ for (i = qp->r_head_ack_queue; ; i = prev) {
+ if (i == qp->s_tail_ack_queue)
+ old_req = 0;
+ if (i)
+ prev = i - 1;
+ else
+ prev = QIB_MAX_RDMA_ATOMIC;
+ if (prev == qp->r_head_ack_queue) {
+ e = NULL;
+ break;
+ }
+ e = &qp->s_ack_queue[prev];
+ if (!e->opcode) {
+ e = NULL;
+ break;
+ }
+ if (qib_cmp24(psn, e->psn) >= 0) {
+ if (prev == qp->s_tail_ack_queue &&
+ qib_cmp24(psn, e->lpsn) <= 0)
+ old_req = 0;
+ break;
+ }
+ }
+ switch (opcode) {
+ case OP(RDMA_READ_REQUEST): {
+ struct ib_reth *reth;
+ u32 offset;
+ u32 len;
+
+ /*
+ * If we didn't find the RDMA read request in the ack queue,
+ * we can ignore this request.
+ */
+ if (!e || e->opcode != OP(RDMA_READ_REQUEST))
+ goto unlock_done;
+ /* RETH comes after BTH */
+ reth = &ohdr->u.rc.reth;
+ /*
+ * Address range must be a subset of the original
+ * request and start on pmtu boundaries.
+ * We reuse the old ack_queue slot since the requester
+ * should not back up and request an earlier PSN for the
+ * same request.
+ */
+ offset = ((psn - e->psn) & QIB_PSN_MASK) *
+ qp->pmtu;
+ len = be32_to_cpu(reth->length);
+ if (unlikely(offset + len != e->rdma_sge.sge_length))
+ goto unlock_done;
+ if (e->rdma_sge.mr) {
+ qib_put_mr(e->rdma_sge.mr);
+ e->rdma_sge.mr = NULL;
+ }
+ if (len != 0) {
+ u32 rkey = be32_to_cpu(reth->rkey);
+ u64 vaddr = be64_to_cpu(reth->vaddr);
+ int ok;
+
+ ok = qib_rkey_ok(qp, &e->rdma_sge, len, vaddr, rkey,
+ IB_ACCESS_REMOTE_READ);
+ if (unlikely(!ok))
+ goto unlock_done;
+ } else {
+ e->rdma_sge.vaddr = NULL;
+ e->rdma_sge.length = 0;
+ e->rdma_sge.sge_length = 0;
+ }
+ e->psn = psn;
+ if (old_req)
+ goto unlock_done;
+ qp->s_tail_ack_queue = prev;
+ break;
+ }
+
+ case OP(COMPARE_SWAP):
+ case OP(FETCH_ADD): {
+ /*
+ * If we didn't find the atomic request in the ack queue
+ * or the send tasklet is already backed up to send an
+ * earlier entry, we can ignore this request.
+ */
+ if (!e || e->opcode != (u8) opcode || old_req)
+ goto unlock_done;
+ qp->s_tail_ack_queue = prev;
+ break;
+ }
+
+ default:
+ /*
+ * Ignore this operation if it doesn't request an ACK
+ * or an earlier RDMA read or atomic is going to be resent.
+ */
+ if (!(psn & IB_BTH_REQ_ACK) || old_req)
+ goto unlock_done;
+ /*
+ * Resend the most recent ACK if this request is
+ * after all the previous RDMA reads and atomics.
+ */
+ if (i == qp->r_head_ack_queue) {
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ qp->r_nak_state = 0;
+ qp->r_ack_psn = qp->r_psn - 1;
+ goto send_ack;
+ }
+ /*
+ * Try to send a simple ACK to work around a Mellanox bug
+ * which doesn't accept a RDMA read response or atomic
+ * response as an ACK for earlier SENDs or RDMA writes.
+ */
+ if (!(qp->s_flags & QIB_S_RESP_PENDING)) {
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ qp->r_nak_state = 0;
+ qp->r_ack_psn = qp->s_ack_queue[i].psn - 1;
+ goto send_ack;
+ }
+ /*
+ * Resend the RDMA read or atomic op which
+ * ACKs this duplicate request.
+ */
+ qp->s_tail_ack_queue = i;
+ break;
+ }
+ qp->s_ack_state = OP(ACKNOWLEDGE);
+ qp->s_flags |= QIB_S_RESP_PENDING;
+ qp->r_nak_state = 0;
+ qib_schedule_send(qp);
+
+unlock_done:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+done:
+ return 1;
+
+send_ack:
+ return 0;
+}
+
+void qib_rc_error(struct qib_qp *qp, enum ib_wc_status err)
+{
+ unsigned long flags;
+ int lastwqe;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ lastwqe = qib_error_qp(qp, err);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+ if (lastwqe) {
+ struct ib_event ev;
+
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+ qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+ }
+}
+
+static inline void qib_update_ack_queue(struct qib_qp *qp, unsigned n)
+{
+ unsigned next;
+
+ next = n + 1;
+ if (next > QIB_MAX_RDMA_ATOMIC)
+ next = 0;
+ qp->s_tail_ack_queue = next;
+ qp->s_ack_state = OP(ACKNOWLEDGE);
+}
+
+/**
+ * qib_rc_rcv - process an incoming RC packet
+ * @rcd: the context pointer
+ * @hdr: the header of this packet
+ * @has_grh: true if the header has a GRH
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP for this packet
+ *
+ * This is called from qib_qp_rcv() to process an incoming RC packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void qib_rc_rcv(struct qib_ctxtdata *rcd, struct qib_ib_header *hdr,
+ int has_grh, void *data, u32 tlen, struct qib_qp *qp)
+{
+ struct qib_ibport *ibp = &rcd->ppd->ibport_data;
+ struct qib_other_headers *ohdr;
+ u32 opcode;
+ u32 hdrsize;
+ u32 psn;
+ u32 pad;
+ struct ib_wc wc;
+ u32 pmtu = qp->pmtu;
+ int diff;
+ struct ib_reth *reth;
+ unsigned long flags;
+ int ret;
+
+ /* Check for GRH */
+ if (!has_grh) {
+ ohdr = &hdr->u.oth;
+ hdrsize = 8 + 12; /* LRH + BTH */
+ } else {
+ ohdr = &hdr->u.l.oth;
+ hdrsize = 8 + 40 + 12; /* LRH + GRH + BTH */
+ }
+
+ opcode = be32_to_cpu(ohdr->bth[0]);
+ if (qib_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode))
+ return;
+
+ psn = be32_to_cpu(ohdr->bth[2]);
+ opcode >>= 24;
+
+ /*
+ * Process responses (ACKs) before anything else. Note that the
+ * packet sequence number will be for something in the send work
+ * queue rather than the expected receive packet sequence number.
+ * In other words, this QP is the requester.
+ */
+ if (opcode >= OP(RDMA_READ_RESPONSE_FIRST) &&
+ opcode <= OP(ATOMIC_ACKNOWLEDGE)) {
+ qib_rc_rcv_resp(ibp, ohdr, data, tlen, qp, opcode, psn,
+ hdrsize, pmtu, rcd);
+ return;
+ }
+
+ /* Compute 24 bits worth of difference. */
+ diff = qib_cmp24(psn, qp->r_psn);
+ if (unlikely(diff)) {
+ if (qib_rc_rcv_error(ohdr, data, qp, opcode, psn, diff, rcd))
+ return;
+ goto send_ack;
+ }
+
+ /* Check for opcode sequence errors. */
+ switch (qp->r_state) {
+ case OP(SEND_FIRST):
+ case OP(SEND_MIDDLE):
+ if (opcode == OP(SEND_MIDDLE) ||
+ opcode == OP(SEND_LAST) ||
+ opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+ break;
+ goto nack_inv;
+
+ case OP(RDMA_WRITE_FIRST):
+ case OP(RDMA_WRITE_MIDDLE):
+ if (opcode == OP(RDMA_WRITE_MIDDLE) ||
+ opcode == OP(RDMA_WRITE_LAST) ||
+ opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+ break;
+ goto nack_inv;
+
+ default:
+ if (opcode == OP(SEND_MIDDLE) ||
+ opcode == OP(SEND_LAST) ||
+ opcode == OP(SEND_LAST_WITH_IMMEDIATE) ||
+ opcode == OP(RDMA_WRITE_MIDDLE) ||
+ opcode == OP(RDMA_WRITE_LAST) ||
+ opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+ goto nack_inv;
+ /*
+ * Note that it is up to the requester to not send a new
+ * RDMA read or atomic operation before receiving an ACK
+ * for the previous operation.
+ */
+ break;
+ }
+
+ if (qp->state == IB_QPS_RTR && !(qp->r_flags & QIB_R_COMM_EST)) {
+ qp->r_flags |= QIB_R_COMM_EST;
+ if (qp->ibqp.event_handler) {
+ struct ib_event ev;
+
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_COMM_EST;
+ qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+ }
+ }
+
+ /* OK, process the packet. */
+ switch (opcode) {
+ case OP(SEND_FIRST):
+ ret = qib_get_rwqe(qp, 0);
+ if (ret < 0)
+ goto nack_op_err;
+ if (!ret)
+ goto rnr_nak;
+ qp->r_rcv_len = 0;
+ /* FALLTHROUGH */
+ case OP(SEND_MIDDLE):
+ case OP(RDMA_WRITE_MIDDLE):
+send_middle:
+ /* Check for invalid length PMTU or posted rwqe len. */
+ if (unlikely(tlen != (hdrsize + pmtu + 4)))
+ goto nack_inv;
+ qp->r_rcv_len += pmtu;
+ if (unlikely(qp->r_rcv_len > qp->r_len))
+ goto nack_inv;
+ qib_copy_sge(&qp->r_sge, data, pmtu, 1);
+ break;
+
+ case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
+ /* consume RWQE */
+ ret = qib_get_rwqe(qp, 1);
+ if (ret < 0)
+ goto nack_op_err;
+ if (!ret)
+ goto rnr_nak;
+ goto send_last_imm;
+
+ case OP(SEND_ONLY):
+ case OP(SEND_ONLY_WITH_IMMEDIATE):
+ ret = qib_get_rwqe(qp, 0);
+ if (ret < 0)
+ goto nack_op_err;
+ if (!ret)
+ goto rnr_nak;
+ qp->r_rcv_len = 0;
+ if (opcode == OP(SEND_ONLY))
+ goto no_immediate_data;
+ /* FALLTHROUGH for SEND_ONLY_WITH_IMMEDIATE */
+ case OP(SEND_LAST_WITH_IMMEDIATE):
+send_last_imm:
+ wc.ex.imm_data = ohdr->u.imm_data;
+ hdrsize += 4;
+ wc.wc_flags = IB_WC_WITH_IMM;
+ goto send_last;
+ case OP(SEND_LAST):
+ case OP(RDMA_WRITE_LAST):
+no_immediate_data:
+ wc.wc_flags = 0;
+ wc.ex.imm_data = 0;
+send_last:
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ /* Check for invalid length. */
+ /* XXX LAST len should be >= 1 */
+ if (unlikely(tlen < (hdrsize + pad + 4)))
+ goto nack_inv;
+ /* Don't count the CRC. */
+ tlen -= (hdrsize + pad + 4);
+ wc.byte_len = tlen + qp->r_rcv_len;
+ if (unlikely(wc.byte_len > qp->r_len))
+ goto nack_inv;
+ qib_copy_sge(&qp->r_sge, data, tlen, 1);
+ qib_put_ss(&qp->r_sge);
+ qp->r_msn++;
+ if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags))
+ break;
+ wc.wr_id = qp->r_wr_id;
+ wc.status = IB_WC_SUCCESS;
+ if (opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE) ||
+ opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+ wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ else
+ wc.opcode = IB_WC_RECV;
+ wc.qp = &qp->ibqp;
+ wc.src_qp = qp->remote_qpn;
+ wc.slid = qp->remote_ah_attr.dlid;
+ wc.sl = qp->remote_ah_attr.sl;
+ /* zero fields that are N/A */
+ wc.vendor_err = 0;
+ wc.pkey_index = 0;
+ wc.dlid_path_bits = 0;
+ wc.port_num = 0;
+ /* Signal completion event if the solicited bit is set. */
+ qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+ (ohdr->bth[0] &
+ cpu_to_be32(IB_BTH_SOLICITED)) != 0);
+ break;
+
+ case OP(RDMA_WRITE_FIRST):
+ case OP(RDMA_WRITE_ONLY):
+ case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+ goto nack_inv;
+ /* consume RWQE */
+ reth = &ohdr->u.rc.reth;
+ hdrsize += sizeof(*reth);
+ qp->r_len = be32_to_cpu(reth->length);
+ qp->r_rcv_len = 0;
+ qp->r_sge.sg_list = NULL;
+ if (qp->r_len != 0) {
+ u32 rkey = be32_to_cpu(reth->rkey);
+ u64 vaddr = be64_to_cpu(reth->vaddr);
+ int ok;
+
+ /* Check rkey & NAK */
+ ok = qib_rkey_ok(qp, &qp->r_sge.sge, qp->r_len, vaddr,
+ rkey, IB_ACCESS_REMOTE_WRITE);
+ if (unlikely(!ok))
+ goto nack_acc;
+ qp->r_sge.num_sge = 1;
+ } else {
+ qp->r_sge.num_sge = 0;
+ qp->r_sge.sge.mr = NULL;
+ qp->r_sge.sge.vaddr = NULL;
+ qp->r_sge.sge.length = 0;
+ qp->r_sge.sge.sge_length = 0;
+ }
+ if (opcode == OP(RDMA_WRITE_FIRST))
+ goto send_middle;
+ else if (opcode == OP(RDMA_WRITE_ONLY))
+ goto no_immediate_data;
+ ret = qib_get_rwqe(qp, 1);
+ if (ret < 0)
+ goto nack_op_err;
+ if (!ret)
+ goto rnr_nak;
+ wc.ex.imm_data = ohdr->u.rc.imm_data;
+ hdrsize += 4;
+ wc.wc_flags = IB_WC_WITH_IMM;
+ goto send_last;
+
+ case OP(RDMA_READ_REQUEST): {
+ struct qib_ack_entry *e;
+ u32 len;
+ u8 next;
+
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+ goto nack_inv;
+ next = qp->r_head_ack_queue + 1;
+ /* s_ack_queue is size QIB_MAX_RDMA_ATOMIC+1 so use > not >= */
+ if (next > QIB_MAX_RDMA_ATOMIC)
+ next = 0;
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (unlikely(next == qp->s_tail_ack_queue)) {
+ if (!qp->s_ack_queue[next].sent)
+ goto nack_inv_unlck;
+ qib_update_ack_queue(qp, next);
+ }
+ e = &qp->s_ack_queue[qp->r_head_ack_queue];
+ if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
+ qib_put_mr(e->rdma_sge.mr);
+ e->rdma_sge.mr = NULL;
+ }
+ reth = &ohdr->u.rc.reth;
+ len = be32_to_cpu(reth->length);
+ if (len) {
+ u32 rkey = be32_to_cpu(reth->rkey);
+ u64 vaddr = be64_to_cpu(reth->vaddr);
+ int ok;
+
+ /* Check rkey & NAK */
+ ok = qib_rkey_ok(qp, &e->rdma_sge, len, vaddr,
+ rkey, IB_ACCESS_REMOTE_READ);
+ if (unlikely(!ok))
+ goto nack_acc_unlck;
+ /*
+ * Update the next expected PSN. We add 1 later
+ * below, so only add the remainder here.
+ */
+ if (len > pmtu)
+ qp->r_psn += (len - 1) / pmtu;
+ } else {
+ e->rdma_sge.mr = NULL;
+ e->rdma_sge.vaddr = NULL;
+ e->rdma_sge.length = 0;
+ e->rdma_sge.sge_length = 0;
+ }
+ e->opcode = opcode;
+ e->sent = 0;
+ e->psn = psn;
+ e->lpsn = qp->r_psn;
+ /*
+ * We need to increment the MSN here instead of when we
+ * finish sending the result since a duplicate request would
+ * increment it more than once.
+ */
+ qp->r_msn++;
+ qp->r_psn++;
+ qp->r_state = opcode;
+ qp->r_nak_state = 0;
+ qp->r_head_ack_queue = next;
+
+ /* Schedule the send tasklet. */
+ qp->s_flags |= QIB_S_RESP_PENDING;
+ qib_schedule_send(qp);
+
+ goto sunlock;
+ }
+
+ case OP(COMPARE_SWAP):
+ case OP(FETCH_ADD): {
+ struct ib_atomic_eth *ateth;
+ struct qib_ack_entry *e;
+ u64 vaddr;
+ atomic64_t *maddr;
+ u64 sdata;
+ u32 rkey;
+ u8 next;
+
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+ goto nack_inv;
+ next = qp->r_head_ack_queue + 1;
+ if (next > QIB_MAX_RDMA_ATOMIC)
+ next = 0;
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (unlikely(next == qp->s_tail_ack_queue)) {
+ if (!qp->s_ack_queue[next].sent)
+ goto nack_inv_unlck;
+ qib_update_ack_queue(qp, next);
+ }
+ e = &qp->s_ack_queue[qp->r_head_ack_queue];
+ if (e->opcode == OP(RDMA_READ_REQUEST) && e->rdma_sge.mr) {
+ qib_put_mr(e->rdma_sge.mr);
+ e->rdma_sge.mr = NULL;
+ }
+ ateth = &ohdr->u.atomic_eth;
+ vaddr = ((u64) be32_to_cpu(ateth->vaddr[0]) << 32) |
+ be32_to_cpu(ateth->vaddr[1]);
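+		/* The atomic target address must be 8-byte aligned. */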
+ if (unlikely(vaddr & (sizeof(u64) - 1)))
+ goto nack_inv_unlck;
+ rkey = be32_to_cpu(ateth->rkey);
+ /* Check rkey & NAK */
+ if (unlikely(!qib_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
+ vaddr, rkey,
+ IB_ACCESS_REMOTE_ATOMIC)))
+ goto nack_acc_unlck;
+ /* Perform atomic OP and save result. */
+ maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
+ sdata = be64_to_cpu(ateth->swap_data);
+ e->atomic_data = (opcode == OP(FETCH_ADD)) ?
+ (u64) atomic64_add_return(sdata, maddr) - sdata :
+ (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
+ be64_to_cpu(ateth->compare_data),
+ sdata);
+ qib_put_mr(qp->r_sge.sge.mr);
+ qp->r_sge.num_sge = 0;
+ e->opcode = opcode;
+ e->sent = 0;
+ e->psn = psn;
+ e->lpsn = psn;
+ qp->r_msn++;
+ qp->r_psn++;
+ qp->r_state = opcode;
+ qp->r_nak_state = 0;
+ qp->r_head_ack_queue = next;
+
+ /* Schedule the send tasklet. */
+ qp->s_flags |= QIB_S_RESP_PENDING;
+ qib_schedule_send(qp);
+
+ goto sunlock;
+ }
+
+ default:
+ /* NAK unknown opcodes. */
+ goto nack_inv;
+ }
+ qp->r_psn++;
+ qp->r_state = opcode;
+ qp->r_ack_psn = psn;
+ qp->r_nak_state = 0;
+ /* Send an ACK if requested or required. */
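+	/* Bit 31 of bth[2] is the ACK-request bit. */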
+ if (psn & (1 << 31))
+ goto send_ack;
+ return;
+
+rnr_nak:
+ qp->r_nak_state = IB_RNR_NAK | qp->r_min_rnr_timer;
+ qp->r_ack_psn = qp->r_psn;
+ /* Queue RNR NAK for later */
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= QIB_R_RSP_NAK;
+ atomic_inc(&qp->refcount);
+ list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+ }
+ return;
+
+nack_op_err:
+ qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ qp->r_nak_state = IB_NAK_REMOTE_OPERATIONAL_ERROR;
+ qp->r_ack_psn = qp->r_psn;
+ /* Queue NAK for later */
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= QIB_R_RSP_NAK;
+ atomic_inc(&qp->refcount);
+ list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+ }
+ return;
+
+nack_inv_unlck:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_inv:
+ qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ qp->r_nak_state = IB_NAK_INVALID_REQUEST;
+ qp->r_ack_psn = qp->r_psn;
+ /* Queue NAK for later */
+ if (list_empty(&qp->rspwait)) {
+ qp->r_flags |= QIB_R_RSP_NAK;
+ atomic_inc(&qp->refcount);
+ list_add_tail(&qp->rspwait, &rcd->qp_wait_list);
+ }
+ return;
+
+nack_acc_unlck:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+nack_acc:
+ qib_rc_error(qp, IB_WC_LOC_PROT_ERR);
+ qp->r_nak_state = IB_NAK_REMOTE_ACCESS_ERROR;
+ qp->r_ack_psn = qp->r_psn;
+send_ack:
+ qib_send_rc_ack(qp);
+ return;
+
+sunlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+}
diff --git a/drivers/infiniband/hw/qib/qib_ruc.c b/drivers/infiniband/hw/qib/qib_ruc.c
new file mode 100644
index 000000000..f42bd0f47
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_ruc.c
@@ -0,0 +1,819 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/spinlock.h>
+
+#include "qib.h"
+#include "qib_mad.h"
+
+/*
+ * Convert the AETH RNR timeout code into the number of microseconds.
+ */
+const u32 ib_qib_rnr_table[32] = {
+ 655360, /* 00: 655.36 */
+ 10, /* 01: .01 */
+	20,	/* 02: .02 */
+ 30, /* 03: .03 */
+ 40, /* 04: .04 */
+ 60, /* 05: .06 */
+ 80, /* 06: .08 */
+ 120, /* 07: .12 */
+ 160, /* 08: .16 */
+ 240, /* 09: .24 */
+ 320, /* 0A: .32 */
+ 480, /* 0B: .48 */
+ 640, /* 0C: .64 */
+ 960, /* 0D: .96 */
+ 1280, /* 0E: 1.28 */
+ 1920, /* 0F: 1.92 */
+ 2560, /* 10: 2.56 */
+ 3840, /* 11: 3.84 */
+ 5120, /* 12: 5.12 */
+ 7680, /* 13: 7.68 */
+ 10240, /* 14: 10.24 */
+ 15360, /* 15: 15.36 */
+ 20480, /* 16: 20.48 */
+ 30720, /* 17: 30.72 */
+ 40960, /* 18: 40.96 */
+ 61440, /* 19: 61.44 */
+ 81920, /* 1A: 81.92 */
+ 122880, /* 1B: 122.88 */
+ 163840, /* 1C: 163.84 */
+ 245760, /* 1D: 245.76 */
+ 327680, /* 1E: 327.68 */
+ 491520 /* 1F: 491.52 */
+};
+
+/*
+ * Validate a RWQE and fill in the SGE state.
+ * Return 1 if OK.
+ */
+static int qib_init_sge(struct qib_qp *qp, struct qib_rwqe *wqe)
+{
+ int i, j, ret;
+ struct ib_wc wc;
+ struct qib_lkey_table *rkt;
+ struct qib_pd *pd;
+ struct qib_sge_state *ss;
+
+ rkt = &to_idev(qp->ibqp.device)->lk_table;
+ pd = to_ipd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd);
+ ss = &qp->r_sge;
+ ss->sg_list = qp->r_sg_list;
+ qp->r_len = 0;
+ for (i = j = 0; i < wqe->num_sge; i++) {
+ if (wqe->sg_list[i].length == 0)
+ continue;
+ /* Check LKEY */
+ if (!qib_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge,
+ &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE))
+ goto bad_lkey;
+ qp->r_len += wqe->sg_list[i].length;
+ j++;
+ }
+ ss->num_sge = j;
+ ss->total_len = qp->r_len;
+ ret = 1;
+ goto bail;
+
+bad_lkey:
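+	/* Drop the MR references taken so far; the first SGE is ss->sge. */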
+ while (j) {
+ struct qib_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge;
+
+ qib_put_mr(sge->mr);
+ }
+ ss->num_sge = 0;
+ memset(&wc, 0, sizeof(wc));
+ wc.wr_id = wqe->wr_id;
+ wc.status = IB_WC_LOC_PROT_ERR;
+ wc.opcode = IB_WC_RECV;
+ wc.qp = &qp->ibqp;
+ /* Signal solicited completion event. */
+ qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
+ ret = 0;
+bail:
+ return ret;
+}
+
+/**
+ * qib_get_rwqe - copy the next RWQE into the QP's RWQE
+ * @qp: the QP
+ * @wr_id_only: update qp->r_wr_id only, not qp->r_sge
+ *
+ * Return -1 if there is a local error, 0 if no RWQE is available,
+ * otherwise return 1.
+ *
+ * Can be called from interrupt level.
+ */
+int qib_get_rwqe(struct qib_qp *qp, int wr_id_only)
+{
+ unsigned long flags;
+ struct qib_rq *rq;
+ struct qib_rwq *wq;
+ struct qib_srq *srq;
+ struct qib_rwqe *wqe;
+ void (*handler)(struct ib_event *, void *);
+ u32 tail;
+ int ret;
+
+ if (qp->ibqp.srq) {
+ srq = to_isrq(qp->ibqp.srq);
+ handler = srq->ibsrq.event_handler;
+ rq = &srq->rq;
+ } else {
+ srq = NULL;
+ handler = NULL;
+ rq = &qp->r_rq;
+ }
+
+ spin_lock_irqsave(&rq->lock, flags);
+ if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) {
+ ret = 0;
+ goto unlock;
+ }
+
+ wq = rq->wq;
+ tail = wq->tail;
+ /* Validate tail before using it since it is user writable. */
+ if (tail >= rq->size)
+ tail = 0;
+ if (unlikely(tail == wq->head)) {
+ ret = 0;
+ goto unlock;
+ }
+ /* Make sure entry is read after head index is read. */
+ smp_rmb();
+ wqe = get_rwqe_ptr(rq, tail);
+ /*
+ * Even though we update the tail index in memory, the verbs
+ * consumer is not supposed to post more entries until a
+ * completion is generated.
+ */
+ if (++tail >= rq->size)
+ tail = 0;
+ wq->tail = tail;
+ if (!wr_id_only && !qib_init_sge(qp, wqe)) {
+ ret = -1;
+ goto unlock;
+ }
+ qp->r_wr_id = wqe->wr_id;
+
+ ret = 1;
+ set_bit(QIB_R_WRID_VALID, &qp->r_aflags);
+ if (handler) {
+ u32 n;
+
+ /*
+ * Validate head pointer value and compute
+ * the number of remaining WQEs.
+ */
+ n = wq->head;
+ if (n >= rq->size)
+ n = 0;
+ if (n < tail)
+ n += rq->size - tail;
+ else
+ n -= tail;
+ if (n < srq->limit) {
+ struct ib_event ev;
+
+ srq->limit = 0;
+ spin_unlock_irqrestore(&rq->lock, flags);
+ ev.device = qp->ibqp.device;
+ ev.element.srq = qp->ibqp.srq;
+ ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
+ handler(&ev, srq->ibsrq.srq_context);
+ goto bail;
+ }
+ }
+unlock:
+ spin_unlock_irqrestore(&rq->lock, flags);
+bail:
+ return ret;
+}
+
+/*
+ * Switch to alternate path.
+ * The QP s_lock should be held and interrupts disabled.
+ */
+void qib_migrate_qp(struct qib_qp *qp)
+{
+ struct ib_event ev;
+
+ qp->s_mig_state = IB_MIG_MIGRATED;
+ qp->remote_ah_attr = qp->alt_ah_attr;
+ qp->port_num = qp->alt_ah_attr.port_num;
+ qp->s_pkey_index = qp->s_alt_pkey_index;
+
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_PATH_MIG;
+ qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+}
+
+static __be64 get_sguid(struct qib_ibport *ibp, unsigned index)
+{
+ if (!index) {
+ struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+
+ return ppd->guid;
+ }
+ return ibp->guids[index - 1];
+}
+
+static int gid_ok(union ib_gid *gid, __be64 gid_prefix, __be64 id)
+{
+ return (gid->global.interface_id == id &&
+ (gid->global.subnet_prefix == gid_prefix ||
+ gid->global.subnet_prefix == IB_DEFAULT_GID_PREFIX));
+}
+
+/*
+ * This should be called with the QP r_lock held.
+ *
+ * The s_lock will be acquired around the qib_migrate_qp() call.
+ */
+int qib_ruc_check_hdr(struct qib_ibport *ibp, struct qib_ib_header *hdr,
+ int has_grh, struct qib_qp *qp, u32 bth0)
+{
+ __be64 guid;
+ unsigned long flags;
+
+ if (qp->s_mig_state == IB_MIG_ARMED && (bth0 & IB_BTH_MIG_REQ)) {
+ if (!has_grh) {
+ if (qp->alt_ah_attr.ah_flags & IB_AH_GRH)
+ goto err;
+ } else {
+ if (!(qp->alt_ah_attr.ah_flags & IB_AH_GRH))
+ goto err;
+ guid = get_sguid(ibp, qp->alt_ah_attr.grh.sgid_index);
+ if (!gid_ok(&hdr->u.l.grh.dgid, ibp->gid_prefix, guid))
+ goto err;
+ if (!gid_ok(&hdr->u.l.grh.sgid,
+ qp->alt_ah_attr.grh.dgid.global.subnet_prefix,
+ qp->alt_ah_attr.grh.dgid.global.interface_id))
+ goto err;
+ }
+ if (!qib_pkey_ok((u16)bth0,
+ qib_get_pkey(ibp, qp->s_alt_pkey_index))) {
+ qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY,
+ (u16)bth0,
+ (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
+ 0, qp->ibqp.qp_num,
+ hdr->lrh[3], hdr->lrh[1]);
+ goto err;
+ }
+ /* Validate the SLID. See Ch. 9.6.1.5 and 17.2.8 */
+ if (be16_to_cpu(hdr->lrh[3]) != qp->alt_ah_attr.dlid ||
+ ppd_from_ibp(ibp)->port != qp->alt_ah_attr.port_num)
+ goto err;
+ spin_lock_irqsave(&qp->s_lock, flags);
+ qib_migrate_qp(qp);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ } else {
+ if (!has_grh) {
+ if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+ goto err;
+ } else {
+ if (!(qp->remote_ah_attr.ah_flags & IB_AH_GRH))
+ goto err;
+ guid = get_sguid(ibp,
+ qp->remote_ah_attr.grh.sgid_index);
+ if (!gid_ok(&hdr->u.l.grh.dgid, ibp->gid_prefix, guid))
+ goto err;
+ if (!gid_ok(&hdr->u.l.grh.sgid,
+ qp->remote_ah_attr.grh.dgid.global.subnet_prefix,
+ qp->remote_ah_attr.grh.dgid.global.interface_id))
+ goto err;
+ }
+ if (!qib_pkey_ok((u16)bth0,
+ qib_get_pkey(ibp, qp->s_pkey_index))) {
+ qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY,
+ (u16)bth0,
+ (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
+ 0, qp->ibqp.qp_num,
+ hdr->lrh[3], hdr->lrh[1]);
+ goto err;
+ }
+ /* Validate the SLID. See Ch. 9.6.1.5 */
+ if (be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid ||
+ ppd_from_ibp(ibp)->port != qp->port_num)
+ goto err;
+ if (qp->s_mig_state == IB_MIG_REARM &&
+ !(bth0 & IB_BTH_MIG_REQ))
+ qp->s_mig_state = IB_MIG_ARMED;
+ }
+
+ return 0;
+
+err:
+ return 1;
+}
+
+/**
+ * qib_ruc_loopback - handle UC and RC loopback requests
+ * @sqp: the sending QP
+ *
+ * This is called from qib_do_send() to
+ * forward a WQE addressed to the same HCA.
+ * Note that although we are single threaded due to the tasklet, we still
+ * have to protect against post_send(). We don't have to worry about
+ * receive interrupts since this is a connected protocol and all packets
+ * will pass through here.
+ */
+static void qib_ruc_loopback(struct qib_qp *sqp)
+{
+ struct qib_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num);
+ struct qib_qp *qp;
+ struct qib_swqe *wqe;
+ struct qib_sge *sge;
+ unsigned long flags;
+ struct ib_wc wc;
+ u64 sdata;
+ atomic64_t *maddr;
+ enum ib_wc_status send_status;
+ int release;
+ int ret;
+
+ /*
+ * Note that we check the responder QP state after
+ * checking the requester's state.
+ */
+ qp = qib_lookup_qpn(ibp, sqp->remote_qpn);
+
+ spin_lock_irqsave(&sqp->s_lock, flags);
+
+ /* Return if we are already busy processing a work request. */
+ if ((sqp->s_flags & (QIB_S_BUSY | QIB_S_ANY_WAIT)) ||
+ !(ib_qib_state_ops[sqp->state] & QIB_PROCESS_OR_FLUSH_SEND))
+ goto unlock;
+
+ sqp->s_flags |= QIB_S_BUSY;
+
+again:
+ if (sqp->s_last == sqp->s_head)
+ goto clr_busy;
+ wqe = get_swqe_ptr(sqp, sqp->s_last);
+
+	/* Return if it is not OK to start a new work request. */
+ if (!(ib_qib_state_ops[sqp->state] & QIB_PROCESS_NEXT_SEND_OK)) {
+ if (!(ib_qib_state_ops[sqp->state] & QIB_FLUSH_SEND))
+ goto clr_busy;
+ /* We are in the error state, flush the work request. */
+ send_status = IB_WC_WR_FLUSH_ERR;
+ goto flush_send;
+ }
+
+ /*
+ * We can rely on the entry not changing without the s_lock
+ * being held until we update s_last.
+ * We increment s_cur to indicate s_last is in progress.
+ */
+ if (sqp->s_last == sqp->s_cur) {
+ if (++sqp->s_cur >= sqp->s_size)
+ sqp->s_cur = 0;
+ }
+ spin_unlock_irqrestore(&sqp->s_lock, flags);
+
+ if (!qp || !(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) ||
+ qp->ibqp.qp_type != sqp->ibqp.qp_type) {
+ ibp->n_pkt_drops++;
+ /*
+ * For RC, the requester would timeout and retry so
+ * shortcut the timeouts and just signal too many retries.
+ */
+ if (sqp->ibqp.qp_type == IB_QPT_RC)
+ send_status = IB_WC_RETRY_EXC_ERR;
+ else
+ send_status = IB_WC_SUCCESS;
+ goto serr;
+ }
+
+ memset(&wc, 0, sizeof(wc));
+ send_status = IB_WC_SUCCESS;
+
+ release = 1;
+ sqp->s_sge.sge = wqe->sg_list[0];
+ sqp->s_sge.sg_list = wqe->sg_list + 1;
+ sqp->s_sge.num_sge = wqe->wr.num_sge;
+ sqp->s_len = wqe->length;
+ switch (wqe->wr.opcode) {
+ case IB_WR_SEND_WITH_IMM:
+ wc.wc_flags = IB_WC_WITH_IMM;
+ wc.ex.imm_data = wqe->wr.ex.imm_data;
+ /* FALLTHROUGH */
+ case IB_WR_SEND:
+ ret = qib_get_rwqe(qp, 0);
+ if (ret < 0)
+ goto op_err;
+ if (!ret)
+ goto rnr_nak;
+ break;
+
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+ goto inv_err;
+ wc.wc_flags = IB_WC_WITH_IMM;
+ wc.ex.imm_data = wqe->wr.ex.imm_data;
+ ret = qib_get_rwqe(qp, 1);
+ if (ret < 0)
+ goto op_err;
+ if (!ret)
+ goto rnr_nak;
+ /* FALLTHROUGH */
+ case IB_WR_RDMA_WRITE:
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
+ goto inv_err;
+ if (wqe->length == 0)
+ break;
+ if (unlikely(!qib_rkey_ok(qp, &qp->r_sge.sge, wqe->length,
+ wqe->wr.wr.rdma.remote_addr,
+ wqe->wr.wr.rdma.rkey,
+ IB_ACCESS_REMOTE_WRITE)))
+ goto acc_err;
+ qp->r_sge.sg_list = NULL;
+ qp->r_sge.num_sge = 1;
+ qp->r_sge.total_len = wqe->length;
+ break;
+
+ case IB_WR_RDMA_READ:
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
+ goto inv_err;
+ if (unlikely(!qib_rkey_ok(qp, &sqp->s_sge.sge, wqe->length,
+ wqe->wr.wr.rdma.remote_addr,
+ wqe->wr.wr.rdma.rkey,
+ IB_ACCESS_REMOTE_READ)))
+ goto acc_err;
+ release = 0;
+ sqp->s_sge.sg_list = NULL;
+ sqp->s_sge.num_sge = 1;
+ qp->r_sge.sge = wqe->sg_list[0];
+ qp->r_sge.sg_list = wqe->sg_list + 1;
+ qp->r_sge.num_sge = wqe->wr.num_sge;
+ qp->r_sge.total_len = wqe->length;
+ break;
+
+ case IB_WR_ATOMIC_CMP_AND_SWP:
+ case IB_WR_ATOMIC_FETCH_AND_ADD:
+ if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
+ goto inv_err;
+ if (unlikely(!qib_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
+ wqe->wr.wr.atomic.remote_addr,
+ wqe->wr.wr.atomic.rkey,
+ IB_ACCESS_REMOTE_ATOMIC)))
+ goto acc_err;
+ /* Perform atomic OP and save result. */
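+ /*
+ * Both variants hand the original value at the target back to the
+ * requester: atomic64_add_return() minus the addend recovers the
+ * pre-add value, and cmpxchg() returns the old value directly.
+ */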
+ maddr = (atomic64_t *) qp->r_sge.sge.vaddr;
+ sdata = wqe->wr.wr.atomic.compare_add;
+ *(u64 *) sqp->s_sge.sge.vaddr =
+ (wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
+ (u64) atomic64_add_return(sdata, maddr) - sdata :
+ (u64) cmpxchg((u64 *) qp->r_sge.sge.vaddr,
+ sdata, wqe->wr.wr.atomic.swap);
+ qib_put_mr(qp->r_sge.sge.mr);
+ qp->r_sge.num_sge = 0;
+ goto send_comp;
+
+ default:
+ send_status = IB_WC_LOC_QP_OP_ERR;
+ goto serr;
+ }
+
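+ /*
+ * Copy the payload one SGE segment at a time from the sender's
+ * SGE list into the receiver's r_sge.
+ */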
+ sge = &sqp->s_sge.sge;
+ while (sqp->s_len) {
+ u32 len = sqp->s_len;
+
+ if (len > sge->length)
+ len = sge->length;
+ if (len > sge->sge_length)
+ len = sge->sge_length;
+ BUG_ON(len == 0);
+ qib_copy_sge(&qp->r_sge, sge->vaddr, len, release);
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (!release)
+ qib_put_mr(sge->mr);
+ if (--sqp->s_sge.num_sge)
+ *sge = *sqp->s_sge.sg_list++;
+ } else if (sge->length == 0 && sge->mr->lkey) {
+ if (++sge->n >= QIB_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr =
+ sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length =
+ sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ sqp->s_len -= len;
+ }
+ if (release)
+ qib_put_ss(&qp->r_sge);
+
+ if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags))
+ goto send_comp;
+
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
+ wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ else
+ wc.opcode = IB_WC_RECV;
+ wc.wr_id = qp->r_wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.byte_len = wqe->length;
+ wc.qp = &qp->ibqp;
+ wc.src_qp = qp->remote_qpn;
+ wc.slid = qp->remote_ah_attr.dlid;
+ wc.sl = qp->remote_ah_attr.sl;
+ wc.port_num = 1;
+ /* Signal completion event if the solicited bit is set. */
+ qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+ wqe->wr.send_flags & IB_SEND_SOLICITED);
+
+send_comp:
+ spin_lock_irqsave(&sqp->s_lock, flags);
+ ibp->n_loop_pkts++;
+flush_send:
+ sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
+ qib_send_complete(sqp, wqe, send_status);
+ goto again;
+
+rnr_nak:
+ /* Handle RNR NAK */
+ if (qp->ibqp.qp_type == IB_QPT_UC)
+ goto send_comp;
+ ibp->n_rnr_naks++;
+ /*
+ * Note: we don't need the s_lock held since the BUSY flag
+ * makes this single threaded.
+ */
+ if (sqp->s_rnr_retry == 0) {
+ send_status = IB_WC_RNR_RETRY_EXC_ERR;
+ goto serr;
+ }
+ if (sqp->s_rnr_retry_cnt < 7)
+ sqp->s_rnr_retry--;
+ spin_lock_irqsave(&sqp->s_lock, flags);
+ if (!(ib_qib_state_ops[sqp->state] & QIB_PROCESS_RECV_OK))
+ goto clr_busy;
+ sqp->s_flags |= QIB_S_WAIT_RNR;
+ sqp->s_timer.function = qib_rc_rnr_retry;
+ sqp->s_timer.expires = jiffies +
+ usecs_to_jiffies(ib_qib_rnr_table[qp->r_min_rnr_timer]);
+ add_timer(&sqp->s_timer);
+ goto clr_busy;
+
+op_err:
+ send_status = IB_WC_REM_OP_ERR;
+ wc.status = IB_WC_LOC_QP_OP_ERR;
+ goto err;
+
+inv_err:
+ send_status = IB_WC_REM_INV_REQ_ERR;
+ wc.status = IB_WC_LOC_QP_OP_ERR;
+ goto err;
+
+acc_err:
+ send_status = IB_WC_REM_ACCESS_ERR;
+ wc.status = IB_WC_LOC_PROT_ERR;
+err:
+ /* responder goes to error state */
+ qib_rc_error(qp, wc.status);
+
+serr:
+ spin_lock_irqsave(&sqp->s_lock, flags);
+ qib_send_complete(sqp, wqe, send_status);
+ if (sqp->ibqp.qp_type == IB_QPT_RC) {
+ int lastwqe = qib_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
+
+ sqp->s_flags &= ~QIB_S_BUSY;
+ spin_unlock_irqrestore(&sqp->s_lock, flags);
+ if (lastwqe) {
+ struct ib_event ev;
+
+ ev.device = sqp->ibqp.device;
+ ev.element.qp = &sqp->ibqp;
+ ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+ sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
+ }
+ goto done;
+ }
+clr_busy:
+ sqp->s_flags &= ~QIB_S_BUSY;
+unlock:
+ spin_unlock_irqrestore(&sqp->s_lock, flags);
+done:
+ if (qp && atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+}
+
+/**
+ * qib_make_grh - construct a GRH header
+ * @ibp: a pointer to the IB port
+ * @hdr: a pointer to the GRH header being constructed
+ * @grh: the global route address to send to
+ * @hwords: the number of 32 bit words of header being sent
+ * @nwords: the number of 32 bit words of data being sent
+ *
+ * Return the size of the header in 32 bit words.
+ */
+u32 qib_make_grh(struct qib_ibport *ibp, struct ib_grh *hdr,
+ struct ib_global_route *grh, u32 hwords, u32 nwords)
+{
+ hdr->version_tclass_flow =
+ cpu_to_be32((IB_GRH_VERSION << IB_GRH_VERSION_SHIFT) |
+ (grh->traffic_class << IB_GRH_TCLASS_SHIFT) |
+ (grh->flow_label << IB_GRH_FLOW_SHIFT));
+ hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2);
+ /* next_hdr is defined by C8-7 in ch. 8.4.1 */
+ hdr->next_hdr = IB_GRH_NEXT_HDR;
+ hdr->hop_limit = grh->hop_limit;
+ /* The SGID is 32-bit aligned. */
+ hdr->sgid.global.subnet_prefix = ibp->gid_prefix;
+ hdr->sgid.global.interface_id = grh->sgid_index ?
+ ibp->guids[grh->sgid_index - 1] : ppd_from_ibp(ibp)->guid;
+ hdr->dgid = grh->dgid;
+
+ /* GRH header size in 32-bit words. */
+ return sizeof(struct ib_grh) / sizeof(u32);
+}
+
+void qib_make_ruc_header(struct qib_qp *qp, struct qib_other_headers *ohdr,
+ u32 bth0, u32 bth2)
+{
+ struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ u16 lrh0;
+ u32 nwords;
+ u32 extra_bytes;
+
+ /* Construct the header. */
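+ /* Pad the payload to a 4-byte boundary; nwords is the padded
+ * length in 32-bit words.
+ */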
+ extra_bytes = -qp->s_cur_size & 3;
+ nwords = (qp->s_cur_size + extra_bytes) >> 2;
+ lrh0 = QIB_LRH_BTH;
+ if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+ qp->s_hdrwords += qib_make_grh(ibp, &qp->s_hdr->u.l.grh,
+ &qp->remote_ah_attr.grh,
+ qp->s_hdrwords, nwords);
+ lrh0 = QIB_LRH_GRH;
+ }
+ lrh0 |= ibp->sl_to_vl[qp->remote_ah_attr.sl] << 12 |
+ qp->remote_ah_attr.sl << 4;
+ qp->s_hdr->lrh[0] = cpu_to_be16(lrh0);
+ qp->s_hdr->lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+ qp->s_hdr->lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
+ qp->s_hdr->lrh[3] = cpu_to_be16(ppd_from_ibp(ibp)->lid |
+ qp->remote_ah_attr.src_path_bits);
+ bth0 |= qib_get_pkey(ibp, qp->s_pkey_index);
+ bth0 |= extra_bytes << 20;
+ if (qp->s_mig_state == IB_MIG_MIGRATED)
+ bth0 |= IB_BTH_MIG_REQ;
+ ohdr->bth[0] = cpu_to_be32(bth0);
+ ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
+ ohdr->bth[2] = cpu_to_be32(bth2);
+ this_cpu_inc(ibp->pmastats->n_unicast_xmit);
+}
+
+/**
+ * qib_do_send - perform a send on a QP
+ * @work: contains a pointer to the QP
+ *
+ * Process entries in the send work queue until credit or queue is
+ * exhausted. Only allow one CPU to send a packet per QP (tasklet).
+ * Otherwise, two threads could send packets out of order.
+ */
+void qib_do_send(struct work_struct *work)
+{
+ struct qib_qp *qp = container_of(work, struct qib_qp, s_work);
+ struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+ int (*make_req)(struct qib_qp *qp);
+ unsigned long flags;
+
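+ /*
+ * Loop RC/UC traffic back locally when the destination LID, with
+ * the LMC path bits masked off, matches this port's own LID.
+ */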
+ if ((qp->ibqp.qp_type == IB_QPT_RC ||
+ qp->ibqp.qp_type == IB_QPT_UC) &&
+ (qp->remote_ah_attr.dlid & ~((1 << ppd->lmc) - 1)) == ppd->lid) {
+ qib_ruc_loopback(qp);
+ return;
+ }
+
+ if (qp->ibqp.qp_type == IB_QPT_RC)
+ make_req = qib_make_rc_req;
+ else if (qp->ibqp.qp_type == IB_QPT_UC)
+ make_req = qib_make_uc_req;
+ else
+ make_req = qib_make_ud_req;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ /* Return if we are already busy processing a work request. */
+ if (!qib_send_ok(qp)) {
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return;
+ }
+
+ qp->s_flags |= QIB_S_BUSY;
+
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+ do {
+ /* Check for a constructed packet to be sent. */
+ if (qp->s_hdrwords != 0) {
+ /*
+ * If the packet cannot be sent now, return and
+ * the send tasklet will be woken up later.
+ */
+ if (qib_verbs_send(qp, qp->s_hdr, qp->s_hdrwords,
+ qp->s_cur_sge, qp->s_cur_size))
+ break;
+ /* Record that s_hdr is empty. */
+ qp->s_hdrwords = 0;
+ }
+ } while (make_req(qp));
+}
+
+/*
+ * This should be called with s_lock held.
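+ * Retire the work request: release its SGE and AH references, generate
+ * a completion entry when required, and advance the s_last/s_acked/
+ * s_cur/s_tail indices.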
+ */
+void qib_send_complete(struct qib_qp *qp, struct qib_swqe *wqe,
+ enum ib_wc_status status)
+{
+ u32 old_last, last;
+ unsigned i;
+
+ if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_OR_FLUSH_SEND))
+ return;
+
+ for (i = 0; i < wqe->wr.num_sge; i++) {
+ struct qib_sge *sge = &wqe->sg_list[i];
+
+ qib_put_mr(sge->mr);
+ }
+ if (qp->ibqp.qp_type == IB_QPT_UD ||
+ qp->ibqp.qp_type == IB_QPT_SMI ||
+ qp->ibqp.qp_type == IB_QPT_GSI)
+ atomic_dec(&to_iah(wqe->wr.wr.ud.ah)->refcount);
+
+ /* See ch. 11.2.4.1 and 10.7.3.1 */
+ if (!(qp->s_flags & QIB_S_SIGNAL_REQ_WR) ||
+ (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
+ status != IB_WC_SUCCESS) {
+ struct ib_wc wc;
+
+ memset(&wc, 0, sizeof(wc));
+ wc.wr_id = wqe->wr.wr_id;
+ wc.status = status;
+ wc.opcode = ib_qib_wc_opcode[wqe->wr.opcode];
+ wc.qp = &qp->ibqp;
+ if (status == IB_WC_SUCCESS)
+ wc.byte_len = wqe->length;
+ qib_cq_enter(to_icq(qp->ibqp.send_cq), &wc,
+ status != IB_WC_SUCCESS);
+ }
+
+ last = qp->s_last;
+ old_last = last;
+ if (++last >= qp->s_size)
+ last = 0;
+ qp->s_last = last;
+ if (qp->s_acked == old_last)
+ qp->s_acked = last;
+ if (qp->s_cur == old_last)
+ qp->s_cur = last;
+ if (qp->s_tail == old_last)
+ qp->s_tail = last;
+ if (qp->state == IB_QPS_SQD && last == qp->s_cur)
+ qp->s_draining = 0;
+}
diff --git a/drivers/infiniband/hw/qib/qib_sd7220.c b/drivers/infiniband/hw/qib/qib_sd7220.c
new file mode 100644
index 000000000..ca7d1419b
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_sd7220.c
@@ -0,0 +1,1454 @@
+/*
+ * Copyright (c) 2013 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+/*
+ * This file contains all of the code that is specific to the SerDes
+ * on the QLogic_IB 7220 chip.
+ */
+
+#include <linux/pci.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/firmware.h>
+
+#include "qib.h"
+#include "qib_7220.h"
+
+#define SD7220_FW_NAME "/*(DEBLOBBED)*/"
+/*(DEBLOBBED)*/
+
+/*
+ * Same as in qib_iba7220.c, but just the registers needed here.
+ * Could move whole set to qib_7220.h, but decided better to keep
+ * local.
+ */
+#define KREG_IDX(regname) (QIB_7220_##regname##_OFFS / sizeof(u64))
+#define kr_hwerrclear KREG_IDX(HwErrClear)
+#define kr_hwerrmask KREG_IDX(HwErrMask)
+#define kr_hwerrstatus KREG_IDX(HwErrStatus)
+#define kr_ibcstatus KREG_IDX(IBCStatus)
+#define kr_ibserdesctrl KREG_IDX(IBSerDesCtrl)
+#define kr_scratch KREG_IDX(Scratch)
+#define kr_xgxs_cfg KREG_IDX(XGXSCfg)
+/* these are used only here, not in qib_iba7220.c */
+#define kr_ibsd_epb_access_ctrl KREG_IDX(ibsd_epb_access_ctrl)
+#define kr_ibsd_epb_transaction_reg KREG_IDX(ibsd_epb_transaction_reg)
+#define kr_pciesd_epb_transaction_reg KREG_IDX(pciesd_epb_transaction_reg)
+#define kr_pciesd_epb_access_ctrl KREG_IDX(pciesd_epb_access_ctrl)
+#define kr_serdes_ddsrxeq0 KREG_IDX(SerDes_DDSRXEQ0)
+
+/*
+ * The IBSerDesMappTable is a memory that holds values to be stored in
+ * various SerDes registers by IBC.
+ */
+#define kr_serdes_maptable KREG_IDX(IBSerDesMappTable)
+
+/*
+ * Below used for sdnum parameter, selecting one of the two sections
+ * used for PCIe, or the single SerDes used for IB.
+ */
+#define PCIE_SERDES0 0
+#define PCIE_SERDES1 1
+
+/*
+ * The EPB requires addressing in a particular form. EPB_LOC() is intended
+ * to make #definitions a little more readable.
+ */
+#define EPB_ADDR_SHF 8
+#define EPB_LOC(chn, elt, reg) \
+ (((elt & 0xf) | ((chn & 7) << 4) | ((reg & 0x3f) << 9)) << \
+ EPB_ADDR_SHF)
+#define EPB_IB_QUAD0_CS_SHF (25)
+#define EPB_IB_QUAD0_CS (1U << EPB_IB_QUAD0_CS_SHF)
+#define EPB_IB_UC_CS_SHF (26)
+#define EPB_PCIE_UC_CS_SHF (27)
+#define EPB_GLOBAL_WR (1U << (EPB_ADDR_SHF + 8))
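+/*
+ * After the EPB_ADDR_SHF shift, the element lands in bits 11:8, the
+ * channel in bits 14:12 and the register in bits 22:17; the chip-select
+ * bits (25-27) and the gang-write bit (16) are OR'd in above that.
+ */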
+
+/* Forward declarations. */
+static int qib_sd7220_reg_mod(struct qib_devdata *dd, int sdnum, u32 loc,
+ u32 data, u32 mask);
+static int ibsd_mod_allchnls(struct qib_devdata *dd, int loc, int val,
+ int mask);
+static int qib_sd_trimdone_poll(struct qib_devdata *dd);
+static void qib_sd_trimdone_monitor(struct qib_devdata *dd, const char *where);
+static int qib_sd_setvals(struct qib_devdata *dd);
+static int qib_sd_early(struct qib_devdata *dd);
+static int qib_sd_dactrim(struct qib_devdata *dd);
+static int qib_internal_presets(struct qib_devdata *dd);
+/* Tweak the register (CMUCTRL5) that contains the TRIMSELF controls */
+static int qib_sd_trimself(struct qib_devdata *dd, int val);
+static int epb_access(struct qib_devdata *dd, int sdnum, int claim);
+static int qib_sd7220_ib_load(struct qib_devdata *dd,
+ const struct firmware *fw);
+static int qib_sd7220_ib_vfy(struct qib_devdata *dd,
+ const struct firmware *fw);
+
+/*
+ * Below keeps track of whether the "once per power-on" initialization has
+ * been done, because uC code Version 1.32.17 or higher allows the uC to
+ * be reset at will, and Automatic Equalization may require it. So the
+ * state of the reset "pin" is no longer valid. Instead, we check for the
+ * actual uC code having been loaded.
+ */
+static int qib_ibsd_ucode_loaded(struct qib_pportdata *ppd,
+ const struct firmware *fw)
+{
+ struct qib_devdata *dd = ppd->dd;
+
+ if (!dd->cspec->serdes_first_init_done &&
+ qib_sd7220_ib_vfy(dd, fw) > 0)
+ dd->cspec->serdes_first_init_done = 1;
+ return dd->cspec->serdes_first_init_done;
+}
+
+/* repeat #define for local use. "Real" #define is in qib_iba7220.c */
+#define QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR 0x0000004000000000ULL
+#define IB_MPREG5 (EPB_LOC(6, 0, 0xE) | (1L << EPB_IB_UC_CS_SHF))
+#define IB_MPREG6 (EPB_LOC(6, 0, 0xF) | (1U << EPB_IB_UC_CS_SHF))
+#define UC_PAR_CLR_D 8
+#define UC_PAR_CLR_M 0xC
+#define IB_CTRL2(chn) (EPB_LOC(chn, 7, 3) | EPB_IB_QUAD0_CS)
+#define START_EQ1(chan) EPB_LOC(chan, 7, 0x27)
+
+void qib_sd7220_clr_ibpar(struct qib_devdata *dd)
+{
+ int ret;
+
+ /* clear, then re-enable parity errs */
+ ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_MPREG6,
+ UC_PAR_CLR_D, UC_PAR_CLR_M);
+ if (ret < 0) {
+ qib_dev_err(dd, "Failed clearing IBSerDes Parity err\n");
+ goto bail;
+ }
+ ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_MPREG6, 0,
+ UC_PAR_CLR_M);
+
+ qib_read_kreg32(dd, kr_scratch);
+ udelay(4);
+ qib_write_kreg(dd, kr_hwerrclear,
+ QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR);
+ qib_read_kreg32(dd, kr_scratch);
+bail:
+ return;
+}
+
+/*
+ * After a reset or other unusual event, the epb interface may need
+ * to be re-synchronized, between the host and the uC.
+ * returns <0 for failure to resync within IBSD_RESYNC_TRIES (not expected)
+ */
+#define IBSD_RESYNC_TRIES 3
+#define IB_PGUDP(chn) (EPB_LOC((chn), 2, 1) | EPB_IB_QUAD0_CS)
+#define IB_CMUDONE(chn) (EPB_LOC((chn), 7, 0xF) | EPB_IB_QUAD0_CS)
+
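+/*
+ * The resync walks all four channels, bouncing an alternating 0xF0/0x55
+ * pattern off IB_PGUDP and checking IB_CMUDONE before moving on.
+ */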
+static int qib_resync_ibepb(struct qib_devdata *dd)
+{
+ int ret, pat, tries, chn;
+ u32 loc;
+
+ ret = -1;
+ chn = 0;
+ for (tries = 0; tries < (4 * IBSD_RESYNC_TRIES); ++tries) {
+ loc = IB_PGUDP(chn);
+ ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, 0, 0);
+ if (ret < 0) {
+ qib_dev_err(dd, "Failed read in resync\n");
+ continue;
+ }
+ if (ret != 0xF0 && ret != 0x55 && tries == 0)
+ qib_dev_err(dd, "unexpected pattern in resync\n");
+ pat = ret ^ 0xA5; /* alternate F0 and 55 */
+ ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, pat, 0xFF);
+ if (ret < 0) {
+ qib_dev_err(dd, "Failed write in resync\n");
+ continue;
+ }
+ ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, 0, 0);
+ if (ret < 0) {
+ qib_dev_err(dd, "Failed re-read in resync\n");
+ continue;
+ }
+ if (ret != pat) {
+ qib_dev_err(dd, "Failed compare1 in resync\n");
+ continue;
+ }
+ loc = IB_CMUDONE(chn);
+ ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, 0, 0);
+ if (ret < 0) {
+ qib_dev_err(dd, "Failed CMUDONE rd in resync\n");
+ continue;
+ }
+ if ((ret & 0x70) != ((chn << 4) | 0x40)) {
+ qib_dev_err(dd, "Bad CMUDONE value %02X, chn %d\n",
+ ret, chn);
+ continue;
+ }
+ if (++chn == 4)
+ break; /* Success */
+ }
+ return (ret > 0) ? 0 : ret;
+}
+
+/*
+ * Localize the stuff that should be done to change IB uC reset.
+ * Returns <0 for errors.
+ */
+static int qib_ibsd_reset(struct qib_devdata *dd, int assert_rst)
+{
+ u64 rst_val;
+ int ret = 0;
+ unsigned long flags;
+
+ rst_val = qib_read_kreg64(dd, kr_ibserdesctrl);
+ if (assert_rst) {
+ /*
+ * Vendor recommends "interrupting" uC before reset, to
+ * minimize possible glitches.
+ */
+ spin_lock_irqsave(&dd->cspec->sdepb_lock, flags);
+ epb_access(dd, IB_7220_SERDES, 1);
+ rst_val |= 1ULL;
+ /* Squelch possible parity error from _asserting_ reset */
+ qib_write_kreg(dd, kr_hwerrmask,
+ dd->cspec->hwerrmask &
+ ~QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR);
+ qib_write_kreg(dd, kr_ibserdesctrl, rst_val);
+ /* flush write, delay to ensure it took effect */
+ qib_read_kreg32(dd, kr_scratch);
+ udelay(2);
+ /* once it's reset, can remove interrupt */
+ epb_access(dd, IB_7220_SERDES, -1);
+ spin_unlock_irqrestore(&dd->cspec->sdepb_lock, flags);
+ } else {
+ /*
+ * Before we de-assert reset, we need to deal with
+ * possible glitch on the Parity-error line.
+ * Suppress it around the reset, both in chip-level
+ * hwerrmask and in IB uC control reg. uC will allow
+ * it again during startup.
+ */
+ u64 val;
+
+ rst_val &= ~(1ULL);
+ qib_write_kreg(dd, kr_hwerrmask,
+ dd->cspec->hwerrmask &
+ ~QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR);
+
+ ret = qib_resync_ibepb(dd);
+ if (ret < 0)
+ qib_dev_err(dd, "unable to re-sync IB EPB\n");
+
+ /* set uC control regs to suppress parity errs */
+ ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_MPREG5, 1, 1);
+ if (ret < 0)
+ goto bail;
+ /* IB uC code past Version 1.32.17 allows suppression of wdog */
+ ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_MPREG6, 0x80,
+ 0x80);
+ if (ret < 0) {
+ qib_dev_err(dd, "Failed to set WDOG disable\n");
+ goto bail;
+ }
+ qib_write_kreg(dd, kr_ibserdesctrl, rst_val);
+ /* flush write, delay for startup */
+ qib_read_kreg32(dd, kr_scratch);
+ udelay(1);
+ /* clear, then re-enable parity errs */
+ qib_sd7220_clr_ibpar(dd);
+ val = qib_read_kreg64(dd, kr_hwerrstatus);
+ if (val & QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR) {
+ qib_dev_err(dd, "IBUC Parity still set after RST\n");
+ dd->cspec->hwerrmask &=
+ ~QLOGIC_IB_HWE_IB_UC_MEMORYPARITYERR;
+ }
+ qib_write_kreg(dd, kr_hwerrmask,
+ dd->cspec->hwerrmask);
+ }
+
+bail:
+ return ret;
+}
+
+static void qib_sd_trimdone_monitor(struct qib_devdata *dd,
+ const char *where)
+{
+ int ret, chn, baduns;
+ u64 val;
+
+ if (!where)
+ where = "?";
+
+ /* give time for reset to settle out in EPB */
+ udelay(2);
+
+ ret = qib_resync_ibepb(dd);
+ if (ret < 0)
+ qib_dev_err(dd, "not able to re-sync IB EPB (%s)\n", where);
+
+ /* Do "sacrificial read" to get EPB in sane state after reset */
+ ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_CTRL2(0), 0, 0);
+ if (ret < 0)
+ qib_dev_err(dd, "Failed TRIMDONE 1st read, (%s)\n", where);
+
+ /* Check/show "summary" Trim-done bit in IBCStatus */
+ val = qib_read_kreg64(dd, kr_ibcstatus);
+ if (!(val & (1ULL << 11)))
+ qib_dev_err(dd, "IBCS TRIMDONE clear (%s)\n", where);
+ /*
+ * Do "dummy read/mod/wr" to get EPB in sane state after reset
+ * The default value for MPREG6 is 0.
+ */
+ udelay(2);
+
+ ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, IB_MPREG6, 0x80, 0x80);
+ if (ret < 0)
+ qib_dev_err(dd, "Failed Dummy RMW, (%s)\n", where);
+ udelay(10);
+
+ baduns = 0;
+
+ for (chn = 3; chn >= 0; --chn) {
+ /* Read CTRL reg for each channel to check TRIMDONE */
+ ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES,
+ IB_CTRL2(chn), 0, 0);
+ if (ret < 0)
+ qib_dev_err(dd,
+ "Failed checking TRIMDONE, chn %d (%s)\n",
+ chn, where);
+
+ if (!(ret & 0x10)) {
+ int probe;
+
+ baduns |= (1 << chn);
+ qib_dev_err(dd,
+ "TRIMDONE cleared on chn %d (%02X). (%s)\n",
+ chn, ret, where);
+ probe = qib_sd7220_reg_mod(dd, IB_7220_SERDES,
+ IB_PGUDP(0), 0, 0);
+ qib_dev_err(dd, "probe is %d (%02X)\n",
+ probe, probe);
+ probe = qib_sd7220_reg_mod(dd, IB_7220_SERDES,
+ IB_CTRL2(chn), 0, 0);
+ qib_dev_err(dd, "re-read: %d (%02X)\n",
+ probe, probe);
+ ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES,
+ IB_CTRL2(chn), 0x10, 0x10);
+ if (ret < 0)
+ qib_dev_err(dd,
+ "Err on TRIMDONE rewrite1\n");
+ }
+ }
+ for (chn = 3; chn >= 0; --chn) {
+ /* Read CTRL reg for each channel to check TRIMDONE */
+ if (baduns & (1 << chn)) {
+ qib_dev_err(dd,
+ "Resetting TRIMDONE on chn %d (%s)\n",
+ chn, where);
+ ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES,
+ IB_CTRL2(chn), 0x10, 0x10);
+ if (ret < 0)
+ qib_dev_err(dd,
+ "Failed re-setting TRIMDONE, chn %d (%s)\n",
+ chn, where);
+ }
+ }
+}
+
+/*
+ * Below is the portion of IBA7220-specific bringup_serdes() that actually
+ * deals with registers and memory within the SerDes itself.
+ * Post IB uC code version 1.32.17, was_reset being 1 is not really
+ * informative, so we double-check.
+ */
+int qib_sd7220_init(struct qib_devdata *dd)
+{
+ const struct firmware *fw;
+ int ret = 1; /* default to failure */
+ int first_reset, was_reset;
+
+ /* SERDES MPU reset recorded in D0 */
+ was_reset = (qib_read_kreg64(dd, kr_ibserdesctrl) & 1);
+ if (!was_reset) {
+ /* entered with reset not asserted, we need to do it */
+ qib_ibsd_reset(dd, 1);
+ qib_sd_trimdone_monitor(dd, "Driver-reload");
+ }
+
+ ret = reject_firmware(&fw, SD7220_FW_NAME, &dd->pcidev->dev);
+ if (ret) {
+ qib_dev_err(dd, "Failed to load IB SERDES image\n");
+ goto done;
+ }
+
+ /* Substitute our deduced value for was_reset */
+ ret = qib_ibsd_ucode_loaded(dd->pport, fw);
+ if (ret < 0)
+ goto bail;
+
+ first_reset = !ret; /* First reset if IBSD uCode not yet loaded */
+ /*
+ * Alter some regs per the vendor's latest doc; reset defaults
+ * are not right for IB.
+ */
+ ret = qib_sd_early(dd);
+ if (ret < 0) {
+ qib_dev_err(dd, "Failed to set IB SERDES early defaults\n");
+ goto bail;
+ }
+ /*
+ * Set DAC manual trim IB.
+ * We only do this once after chip has been reset (usually
+ * same as once per system boot).
+ */
+ if (first_reset) {
+ ret = qib_sd_dactrim(dd);
+ if (ret < 0) {
+ qib_dev_err(dd, "Failed IB SERDES DAC trim\n");
+ goto bail;
+ }
+ }
+ /*
+ * Set various registers (DDS and RXEQ) that will be
+ * controlled by IBC (in 1.2 mode) to reasonable preset values.
+ * Calling the "internal" version avoids the "check for needed"
+ * and "trimdone monitor" that might be counter-productive.
+ */
+ ret = qib_internal_presets(dd);
+ if (ret < 0) {
+ qib_dev_err(dd, "Failed to set IB SERDES presets\n");
+ goto bail;
+ }
+ ret = qib_sd_trimself(dd, 0x80);
+ if (ret < 0) {
+ qib_dev_err(dd, "Failed to set IB SERDES TRIMSELF\n");
+ goto bail;
+ }
+
+ /* Load image, then try to verify */
+ ret = 0; /* Assume success */
+ if (first_reset) {
+ int vfy;
+ int trim_done;
+
+ ret = qib_sd7220_ib_load(dd, fw);
+ if (ret < 0) {
+ qib_dev_err(dd, "Failed to load IB SERDES image\n");
+ goto bail;
+ } else {
+ /* Loaded image, try to verify */
+ vfy = qib_sd7220_ib_vfy(dd, fw);
+ if (vfy != ret) {
+ qib_dev_err(dd, "SERDES PRAM VFY failed\n");
+ goto bail;
+ } /* end if verified */
+ } /* end if loaded */
+
+ /*
+ * Loaded and verified. Almost good...
+ * hold "success" in ret
+ */
+ ret = 0;
+ /*
+ * Prev steps all worked, continue bringup.
+ * De-assert RESET to uC, only in first reset, to allow
+ * trimming.
+ *
+ * Since our default setup sets START_EQ1 to
+ * PRESET, we need to clear that for this very first run.
+ */
+ ret = ibsd_mod_allchnls(dd, START_EQ1(0), 0, 0x38);
+ if (ret < 0) {
+ qib_dev_err(dd, "Failed clearing START_EQ1\n");
+ goto bail;
+ }
+
+ qib_ibsd_reset(dd, 0);
+ /*
+ * If this is not the first reset, trimdone should be set
+ * already. We may need to verify that.
+ */
+ trim_done = qib_sd_trimdone_poll(dd);
+ /*
+ * Whether or not trimdone succeeded, we need to put the
+ * uC back into reset to avoid a possible fight with the
+ * IBC state-machine.
+ */
+ qib_ibsd_reset(dd, 1);
+
+ if (!trim_done) {
+ qib_dev_err(dd, "No TRIMDONE seen\n");
+ goto bail;
+ }
+ /*
+ * DEBUG: check each time we reset if trimdone bits have
+ * gotten cleared, and re-set them.
+ */
+ qib_sd_trimdone_monitor(dd, "First-reset");
+ /* Remember so we do not re-do the load, dactrim, etc. */
+ dd->cspec->serdes_first_init_done = 1;
+ }
+ /*
+ * Set up for channel training and load values for
+ * RxEq and DDS in tables used by IBC in IB1.2 mode.
+ */
+ ret = 0;
+ if (qib_sd_setvals(dd) >= 0)
+ goto done;
+bail:
+ ret = 1;
+done:
+ /* start relock timer regardless, but start at 1 second */
+ set_7220_relock_poll(dd, -1);
+
+ release_firmware(fw);
+ return ret;
+}
+
+#define EPB_ACC_REQ 1
+#define EPB_ACC_GNT 0x100
+#define EPB_DATA_MASK 0xFF
+#define EPB_RD (1ULL << 24)
+#define EPB_TRANS_RDY (1ULL << 31)
+#define EPB_TRANS_ERR (1ULL << 30)
+#define EPB_TRANS_TRIES 5
+
+/*
+ * query, claim, release ownership of the EPB (External Parallel Bus)
+ * for a specified SERDES.
+ * the "claim" parameter is >0 to claim, <0 to release, 0 to query.
+ * Returns <0 for errors, >0 if we had ownership, else 0.
+ */
+static int epb_access(struct qib_devdata *dd, int sdnum, int claim)
+{
+ u16 acc;
+ u64 accval;
+ int owned = 0;
+ u64 oct_sel = 0;
+
+ switch (sdnum) {
+ case IB_7220_SERDES:
+ /*
+ * The IB SERDES "ownership" is fairly simple: a single
+ * request/grant pair.
+ */
+ acc = kr_ibsd_epb_access_ctrl;
+ break;
+
+ case PCIE_SERDES0:
+ case PCIE_SERDES1:
+ /* PCIe SERDES has two "octants", need to select which */
+ acc = kr_pciesd_epb_access_ctrl;
+ oct_sel = (2 << (sdnum - PCIE_SERDES0));
+ break;
+
+ default:
+ return 0;
+ }
+
+ /* Make sure any outstanding transaction was seen */
+ qib_read_kreg32(dd, kr_scratch);
+ udelay(15);
+
+ accval = qib_read_kreg32(dd, acc);
+
+ owned = !!(accval & EPB_ACC_GNT);
+ if (claim < 0) {
+ /* Need to release */
+ u64 pollval;
+ /*
+ * The only writeable bits are the request and CS.
+ * Both should be clear.
+ */
+ u64 newval = 0;
+
+ qib_write_kreg(dd, acc, newval);
+ /* First read after write is not trustworthy */
+ pollval = qib_read_kreg32(dd, acc);
+ udelay(5);
+ pollval = qib_read_kreg32(dd, acc);
+ if (pollval & EPB_ACC_GNT)
+ owned = -1;
+ } else if (claim > 0) {
+ /* Need to claim */
+ u64 pollval;
+ u64 newval = EPB_ACC_REQ | oct_sel;
+
+ qib_write_kreg(dd, acc, newval);
+ /* First read after write is not trustworthy */
+ pollval = qib_read_kreg32(dd, acc);
+ udelay(5);
+ pollval = qib_read_kreg32(dd, acc);
+ if (!(pollval & EPB_ACC_GNT))
+ owned = -1;
+ }
+ return owned;
+}
+
+/*
+ * Lemma to deal with race condition of write..read to epb regs
+ */
+static int epb_trans(struct qib_devdata *dd, u16 reg, u64 i_val, u64 *o_vp)
+{
+ int tries;
+ u64 transval;
+
+ qib_write_kreg(dd, reg, i_val);
+ /* Throw away first read, as RDY bit may be stale */
+ transval = qib_read_kreg64(dd, reg);
+
+ for (tries = EPB_TRANS_TRIES; tries; --tries) {
+ transval = qib_read_kreg32(dd, reg);
+ if (transval & EPB_TRANS_RDY)
+ break;
+ udelay(5);
+ }
+ if (transval & EPB_TRANS_ERR)
+ return -1;
+ if (tries > 0 && o_vp)
+ *o_vp = transval;
+ return tries;
+}
+
+/**
+ * qib_sd7220_reg_mod - modify SERDES register
+ * @dd: the qlogic_ib device
+ * @sdnum: which SERDES to access
+ * @loc: location - channel, element, register, as packed by EPB_LOC() macro.
+ * @wd: Write Data - value to set in register
+ * @mask: ones where data should be spliced into reg.
+ *
+ * Basic register read/modify/write, with unneeded accesses elided. That is,
+ * a mask of zero will prevent write, while a mask of 0xFF will prevent read.
+ * returns current (presumed, if a write was done) contents of selected
+ * register, or <0 if errors.
+ */
+static int qib_sd7220_reg_mod(struct qib_devdata *dd, int sdnum, u32 loc,
+ u32 wd, u32 mask)
+{
+ u16 trans;
+ u64 transval;
+ int owned;
+ int tries, ret;
+ unsigned long flags;
+
+ switch (sdnum) {
+ case IB_7220_SERDES:
+ trans = kr_ibsd_epb_transaction_reg;
+ break;
+
+ case PCIE_SERDES0:
+ case PCIE_SERDES1:
+ trans = kr_pciesd_epb_transaction_reg;
+ break;
+
+ default:
+ return -1;
+ }
+
+ /*
+ * All access is locked in software (vs other host threads) and
+ * hardware (vs uC access).
+ */
+ spin_lock_irqsave(&dd->cspec->sdepb_lock, flags);
+
+ owned = epb_access(dd, sdnum, 1);
+ if (owned < 0) {
+ spin_unlock_irqrestore(&dd->cspec->sdepb_lock, flags);
+ return -1;
+ }
+ ret = 0;
+ for (tries = EPB_TRANS_TRIES; tries; --tries) {
+ transval = qib_read_kreg32(dd, trans);
+ if (transval & EPB_TRANS_RDY)
+ break;
+ udelay(5);
+ }
+
+ if (tries > 0) {
+ tries = 1; /* to make read-skip work */
+ if (mask != 0xFF) {
+ /*
+ * Not a pure write, so need to read.
+ * loc encodes chip-select as well as address
+ */
+ transval = loc | EPB_RD;
+ tries = epb_trans(dd, trans, transval, &transval);
+ }
+ if (tries > 0 && mask != 0) {
+ /*
+ * Not a pure read, so need to write.
+ */
+ wd = (wd & mask) | (transval & ~mask);
+ transval = loc | (wd & EPB_DATA_MASK);
+ tries = epb_trans(dd, trans, transval, &transval);
+ }
+ }
+ /* else, failed to see ready, what error-handling? */
+
+ /*
+ * Release bus. Failure is an error.
+ */
+ if (epb_access(dd, sdnum, -1) < 0)
+ ret = -1;
+ else
+ ret = transval & EPB_DATA_MASK;
+
+ spin_unlock_irqrestore(&dd->cspec->sdepb_lock, flags);
+ if (tries <= 0)
+ ret = -1;
+ return ret;
+}
+
+#define EPB_ROM_R (2)
+#define EPB_ROM_W (1)
+/*
+ * Below, all uC-related, use appropriate UC_CS, depending
+ * on which SerDes is used.
+ */
+#define EPB_UC_CTL EPB_LOC(6, 0, 0)
+#define EPB_MADDRL EPB_LOC(6, 0, 2)
+#define EPB_MADDRH EPB_LOC(6, 0, 3)
+#define EPB_ROMDATA EPB_LOC(6, 0, 4)
+#define EPB_RAMDATA EPB_LOC(6, 0, 5)
+
+/* Transfer data to/from uC Program RAM of IB or PCIe SerDes */
+static int qib_sd7220_ram_xfer(struct qib_devdata *dd, int sdnum, u32 loc,
+ u8 *buf, int cnt, int rd_notwr)
+{
+ u16 trans;
+ u64 transval;
+ u64 csbit;
+ int owned;
+ int tries;
+ int sofar;
+ int addr;
+ int ret;
+ unsigned long flags;
+ const char *op;
+
+ /* Pick appropriate transaction reg and "Chip select" for this serdes */
+ switch (sdnum) {
+ case IB_7220_SERDES:
+ csbit = 1ULL << EPB_IB_UC_CS_SHF;
+ trans = kr_ibsd_epb_transaction_reg;
+ break;
+
+ case PCIE_SERDES0:
+ case PCIE_SERDES1:
+ /* PCIe SERDES has uC "chip select" in different bit, too */
+ csbit = 1ULL << EPB_PCIE_UC_CS_SHF;
+ trans = kr_pciesd_epb_transaction_reg;
+ break;
+
+ default:
+ return -1;
+ }
+
+ op = rd_notwr ? "Rd" : "Wr";
+ spin_lock_irqsave(&dd->cspec->sdepb_lock, flags);
+
+ owned = epb_access(dd, sdnum, 1);
+ if (owned < 0) {
+ spin_unlock_irqrestore(&dd->cspec->sdepb_lock, flags);
+ return -1;
+ }
+
+ /*
+ * In future code, we may need to distinguish several address ranges,
+ * and select various memories based on this. For now, just trim
+ * "loc" (location including address and memory select) to
+ * "addr" (address within memory). we will only support PRAM
+ * The memory is 8KB.
+ */
+ addr = loc & 0x1FFF;
+ for (tries = EPB_TRANS_TRIES; tries; --tries) {
+ transval = qib_read_kreg32(dd, trans);
+ if (transval & EPB_TRANS_RDY)
+ break;
+ udelay(5);
+ }
+
+ sofar = 0;
+ if (tries > 0) {
+ /*
+ * Every "memory" access is doubly-indirect.
+ * We set two bytes of address, then read/write
+ * one or more bytes of data.
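+ * The address registers are only written at the start of each
+ * chunk; successive ROMDATA accesses walk through consecutive
+ * PRAM locations.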
+ */
+
+ /* First, we set control to "Read" or "Write" */
+ transval = csbit | EPB_UC_CTL |
+ (rd_notwr ? EPB_ROM_R : EPB_ROM_W);
+ tries = epb_trans(dd, trans, transval, &transval);
+ while (tries > 0 && sofar < cnt) {
+ if (!sofar) {
+ /* Only set address at start of chunk */
+ int addrbyte = (addr + sofar) >> 8;
+
+ transval = csbit | EPB_MADDRH | addrbyte;
+ tries = epb_trans(dd, trans, transval,
+ &transval);
+ if (tries <= 0)
+ break;
+ addrbyte = (addr + sofar) & 0xFF;
+ transval = csbit | EPB_MADDRL | addrbyte;
+ tries = epb_trans(dd, trans, transval,
+ &transval);
+ if (tries <= 0)
+ break;
+ }
+
+ if (rd_notwr)
+ transval = csbit | EPB_ROMDATA | EPB_RD;
+ else
+ transval = csbit | EPB_ROMDATA | buf[sofar];
+ tries = epb_trans(dd, trans, transval, &transval);
+ if (tries <= 0)
+ break;
+ if (rd_notwr)
+ buf[sofar] = transval & EPB_DATA_MASK;
+ ++sofar;
+ }
+ /* Finally, clear control-bit for Read or Write */
+ transval = csbit | EPB_UC_CTL;
+ tries = epb_trans(dd, trans, transval, &transval);
+ }
+
+ ret = sofar;
+ /* Release bus. Failure is an error */
+ if (epb_access(dd, sdnum, -1) < 0)
+ ret = -1;
+
+ spin_unlock_irqrestore(&dd->cspec->sdepb_lock, flags);
+ if (tries <= 0)
+ ret = -1;
+ return ret;
+}
+
+#define PROG_CHUNK 64
+
+static int qib_sd7220_prog_ld(struct qib_devdata *dd, int sdnum,
+ const u8 *img, int len, int offset)
+{
+ int cnt, sofar, req;
+
+ sofar = 0;
+ while (sofar < len) {
+ req = len - sofar;
+ if (req > PROG_CHUNK)
+ req = PROG_CHUNK;
+ cnt = qib_sd7220_ram_xfer(dd, sdnum, offset + sofar,
+ (u8 *)img + sofar, req, 0);
+ if (cnt < req) {
+ sofar = -1;
+ break;
+ }
+ sofar += req;
+ }
+ return sofar;
+}
+
+#define VFY_CHUNK 64
+#define SD_PRAM_ERROR_LIMIT 42
+
+static int qib_sd7220_prog_vfy(struct qib_devdata *dd, int sdnum,
+ const u8 *img, int len, int offset)
+{
+ int cnt, sofar, req, idx, errors;
+ unsigned char readback[VFY_CHUNK];
+
+ errors = 0;
+ sofar = 0;
+ while (sofar < len) {
+ req = len - sofar;
+ if (req > VFY_CHUNK)
+ req = VFY_CHUNK;
+ cnt = qib_sd7220_ram_xfer(dd, sdnum, sofar + offset,
+ readback, req, 1);
+ if (cnt < req) {
+ /* failed in read itself */
+ sofar = -1;
+ break;
+ }
+ for (idx = 0; idx < cnt; ++idx) {
+ if (readback[idx] != img[idx+sofar])
+ ++errors;
+ }
+ sofar += cnt;
+ }
+ return errors ? -errors : sofar;
+}
+
+static int
+qib_sd7220_ib_load(struct qib_devdata *dd, const struct firmware *fw)
+{
+ return qib_sd7220_prog_ld(dd, IB_7220_SERDES, fw->data, fw->size, 0);
+}
+
+static int
+qib_sd7220_ib_vfy(struct qib_devdata *dd, const struct firmware *fw)
+{
+ return qib_sd7220_prog_vfy(dd, IB_7220_SERDES, fw->data, fw->size, 0);
+}
+
+/*
+ * IRQ not set up at this point in init, so we poll.
+ */
+#define IB_SERDES_TRIM_DONE (1ULL << 11)
+#define TRIM_TMO (15)
+
+static int qib_sd_trimdone_poll(struct qib_devdata *dd)
+{
+ int trim_tmo, ret;
+ uint64_t val;
+
+ /*
+ * Default to failure, so IBC will not start
+ * without IB_SERDES_TRIM_DONE.
+ */
+ ret = 0;
+ for (trim_tmo = 0; trim_tmo < TRIM_TMO; ++trim_tmo) {
+ val = qib_read_kreg64(dd, kr_ibcstatus);
+ if (val & IB_SERDES_TRIM_DONE) {
+ ret = 1;
+ break;
+ }
+ msleep(20);
+ }
+ if (trim_tmo >= TRIM_TMO) {
+ qib_dev_err(dd, "No TRIMDONE in %d tries\n", trim_tmo);
+ ret = 0;
+ }
+ return ret;
+}
+
+#define TX_FAST_ELT (9)
+
+/*
+ * Set the "negotiation" values for SERDES. These are used by the IB1.2
+ * link negotiation. Macros below are an attempt to keep the values a
+ * little more human-editable.
+ * First, values related to Drive De-emphasis Settings.
+ */
+
+#define NUM_DDS_REGS 6
+#define DDS_REG_MAP 0x76A910 /* LSB-first list of regs (in elt 9) to mod */
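+/* Consumed a nibble at a time, low nibble first: regs 0, 1, 9, 0xA, 6, 7 */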
+
+#define DDS_VAL(amp_d, main_d, ipst_d, ipre_d, amp_s, main_s, ipst_s, ipre_s) \
+ { { ((amp_d & 0x1F) << 1) | 1, ((amp_s & 0x1F) << 1) | 1, \
+ (main_d << 3) | 4 | (ipre_d >> 2), \
+ (main_s << 3) | 4 | (ipre_s >> 2), \
+ ((ipst_d & 0xF) << 1) | ((ipre_d & 3) << 6) | 0x21, \
+ ((ipst_s & 0xF) << 1) | ((ipre_s & 3) << 6) | 0x21 } }
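+/*
+ * Each entry packs the DDR (_d) and SDR (_s) amplitude, main tap, and
+ * post/pre-cursor settings into the six bytes written out through
+ * DDS_REG_MAP.
+ */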
+
+static struct dds_init {
+ uint8_t reg_vals[NUM_DDS_REGS];
+} dds_init_vals[] = {
+ /* DDR(FDR) SDR(HDR) */
+ /* Vendor recommends below for 3m cable */
+#define DDS_3M 0
+ DDS_VAL(31, 19, 12, 0, 29, 22, 9, 0),
+ DDS_VAL(31, 12, 15, 4, 31, 15, 15, 1),
+ DDS_VAL(31, 13, 15, 3, 31, 16, 15, 0),
+ DDS_VAL(31, 14, 15, 2, 31, 17, 14, 0),
+ DDS_VAL(31, 15, 15, 1, 31, 18, 13, 0),
+ DDS_VAL(31, 16, 15, 0, 31, 19, 12, 0),
+ DDS_VAL(31, 17, 14, 0, 31, 20, 11, 0),
+ DDS_VAL(31, 18, 13, 0, 30, 21, 10, 0),
+ DDS_VAL(31, 20, 11, 0, 28, 23, 8, 0),
+ DDS_VAL(31, 21, 10, 0, 27, 24, 7, 0),
+ DDS_VAL(31, 22, 9, 0, 26, 25, 6, 0),
+ DDS_VAL(30, 23, 8, 0, 25, 26, 5, 0),
+ DDS_VAL(29, 24, 7, 0, 23, 27, 4, 0),
+ /* Vendor recommends below for 1m cable */
+#define DDS_1M 13
+ DDS_VAL(28, 25, 6, 0, 21, 28, 3, 0),
+ DDS_VAL(27, 26, 5, 0, 19, 29, 2, 0),
+ DDS_VAL(25, 27, 4, 0, 17, 30, 1, 0)
+};
+
+/*
+ * Now the RXEQ section of the table.
+ */
+/* Hardware packs an element number and register address thus: */
+#define RXEQ_INIT_RDESC(elt, addr) (((elt) & 0xF) | ((addr) << 4))
+#define RXEQ_VAL(elt, adr, val0, val1, val2, val3) \
+ {RXEQ_INIT_RDESC((elt), (adr)), {(val0), (val1), (val2), (val3)} }
+
+#define RXEQ_VAL_ALL(elt, adr, val) \
+ {RXEQ_INIT_RDESC((elt), (adr)), {(val), (val), (val), (val)} }
+
+#define RXEQ_SDR_DFELTH 0
+#define RXEQ_SDR_TLTH 0
+#define RXEQ_SDR_G1CNT_Z1CNT 0x11
+#define RXEQ_SDR_ZCNT 23
+
+static struct rxeq_init {
+ u16 rdesc; /* in form used in SerDesDDSRXEQ */
+ u8 rdata[4];
+} rxeq_init_vals[] = {
+ /* Set Rcv Eq. to Preset node */
+ RXEQ_VAL_ALL(7, 0x27, 0x10),
+ /* Set DFELTHFDR/HDR thresholds */
+ RXEQ_VAL(7, 8, 0, 0, 0, 0), /* FDR, was 0, 1, 2, 3 */
+ RXEQ_VAL(7, 0x21, 0, 0, 0, 0), /* HDR */
+ /* Set TLTHFDR/HDR threshold */
+ RXEQ_VAL(7, 9, 2, 2, 2, 2), /* FDR, was 0, 2, 4, 6 */
+ RXEQ_VAL(7, 0x23, 2, 2, 2, 2), /* HDR, was 0, 1, 2, 3 */
+ /* Set Preamp setting 2 (ZFR/ZCNT) */
+ RXEQ_VAL(7, 0x1B, 12, 12, 12, 12), /* FDR, was 12, 16, 20, 24 */
+ RXEQ_VAL(7, 0x1C, 12, 12, 12, 12), /* HDR, was 12, 16, 20, 24 */
+ /* Set Preamp DC gain and Setting 1 (GFR/GHR) */
+ RXEQ_VAL(7, 0x1E, 16, 16, 16, 16), /* FDR, was 16, 17, 18, 20 */
+ RXEQ_VAL(7, 0x1F, 16, 16, 16, 16), /* HDR, was 16, 17, 18, 20 */
+ /* Toggle RELOCK (in VCDL_CTRL0) to lock to data */
+ RXEQ_VAL_ALL(6, 6, 0x20), /* Set D5 High */
+ RXEQ_VAL_ALL(6, 6, 0), /* Set D5 Low */
+};
+
+/* There are 17 values from vendor, but IBC only accesses the first 16 */
+#define DDS_ROWS (16)
+#define RXEQ_ROWS ARRAY_SIZE(rxeq_init_vals)
+
+static int qib_sd_setvals(struct qib_devdata *dd)
+{
+ int idx, midx;
+ int min_idx; /* Minimum index for this portion of table */
+ uint32_t dds_reg_map;
+ u64 __iomem *taddr, *iaddr;
+ uint64_t data;
+ uint64_t sdctl;
+
+ taddr = dd->kregbase + kr_serdes_maptable;
+ iaddr = dd->kregbase + kr_serdes_ddsrxeq0;
+
+ /*
+ * Init the DDS section of the table.
+ * Each "row" of the table provokes NUM_DDS_REG writes, to the
+ * registers indicated in DDS_REG_MAP.
+ */
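+ /* DDS register count goes in bits 12:8, RXEQ row count in bits 17:13 */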
+ sdctl = qib_read_kreg64(dd, kr_ibserdesctrl);
+ sdctl = (sdctl & ~(0x1f << 8)) | (NUM_DDS_REGS << 8);
+ sdctl = (sdctl & ~(0x1f << 13)) | (RXEQ_ROWS << 13);
+ qib_write_kreg(dd, kr_ibserdesctrl, sdctl);
+
+ /*
+ * Iterate down table within loop for each register to store.
+ */
+ dds_reg_map = DDS_REG_MAP;
+ for (idx = 0; idx < NUM_DDS_REGS; ++idx) {
+ data = ((dds_reg_map & 0xF) << 4) | TX_FAST_ELT;
+ writeq(data, iaddr + idx);
+ mmiowb();
+ qib_read_kreg32(dd, kr_scratch);
+ dds_reg_map >>= 4;
+ for (midx = 0; midx < DDS_ROWS; ++midx) {
+ u64 __iomem *daddr = taddr + ((midx << 4) + idx);
+
+ data = dds_init_vals[midx].reg_vals[idx];
+ writeq(data, daddr);
+ mmiowb();
+ qib_read_kreg32(dd, kr_scratch);
+ } /* End inner for (vals for this reg, each row) */
+ } /* end outer for (regs to be stored) */
+
+ /*
+ * Init the RXEQ section of the table.
+ * This runs in a different order, as the pattern of
+ * register references is more complex, but there are only
+ * four "data" values per register.
+ */
+ min_idx = idx; /* RXEQ indices pick up where DDS left off */
+ taddr += 0x100; /* RXEQ data is in second half of table */
+ /* Iterate through RXEQ register addresses */
+ for (idx = 0; idx < RXEQ_ROWS; ++idx) {
+ int didx; /* "destination" */
+ int vidx;
+
+ /* didx is offset by min_idx to address RXEQ range of regs */
+ didx = idx + min_idx;
+ /* Store the next RXEQ register address */
+ writeq(rxeq_init_vals[idx].rdesc, iaddr + didx);
+ mmiowb();
+ qib_read_kreg32(dd, kr_scratch);
+ /* Iterate through RXEQ values */
+ for (vidx = 0; vidx < 4; vidx++) {
+ data = rxeq_init_vals[idx].rdata[vidx];
+ writeq(data, taddr + (vidx << 6) + idx);
+ mmiowb();
+ qib_read_kreg32(dd, kr_scratch);
+ }
+ } /* end outer for (Reg-writes for RXEQ) */
+ return 0;
+}
+
+#define CMUCTRL5 EPB_LOC(7, 0, 0x15)
+#define RXHSCTRL0(chan) EPB_LOC(chan, 6, 0)
+#define VCDL_DAC2(chan) EPB_LOC(chan, 6, 5)
+#define VCDL_CTRL0(chan) EPB_LOC(chan, 6, 6)
+#define VCDL_CTRL2(chan) EPB_LOC(chan, 6, 8)
+#define START_EQ2(chan) EPB_LOC(chan, 7, 0x28)
+
+/*
+ * Repeat a "store" across all channels of the IB SerDes.
+ * Although nominally it inherits the "read value" of the last
+ * channel it modified, the only really useful return is <0 for
+ * failure, >= 0 for success. The parameter 'loc' is assumed to
+ * be the location in some channel of the register to be modified.
+ * The caller can specify use of the "gang write" option of EPB,
+ * in which case we use the specified channel data for any fields
+ * not explicitly written.
+ */
+static int ibsd_mod_allchnls(struct qib_devdata *dd, int loc, int val,
+ int mask)
+{
+ int ret = -1;
+ int chnl;
+
+ if (loc & EPB_GLOBAL_WR) {
+ /*
+ * Our caller has assured us that we can set all four
+ * channels at once. Trust that. If mask is not 0xFF,
+ * we will read the _specified_ channel for our starting
+ * value.
+ */
+ loc |= (1U << EPB_IB_QUAD0_CS_SHF);
+ chnl = (loc >> (4 + EPB_ADDR_SHF)) & 7;
+ if (mask != 0xFF) {
+ ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES,
+ loc & ~EPB_GLOBAL_WR, 0, 0);
+ if (ret < 0) {
+ int sloc = loc >> EPB_ADDR_SHF;
+
+ qib_dev_err(dd,
+ "pre-read failed: elt %d, addr 0x%X, chnl %d\n",
+ (sloc & 0xF),
+ (sloc >> 9) & 0x3f, chnl);
+ return ret;
+ }
+ val = (ret & ~mask) | (val & mask);
+ }
+ loc &= ~(7 << (4+EPB_ADDR_SHF));
+ ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, val, 0xFF);
+ if (ret < 0) {
+ int sloc = loc >> EPB_ADDR_SHF;
+
+ qib_dev_err(dd,
+ "Global WR failed: elt %d, addr 0x%X, val %02X\n",
+ (sloc & 0xF), (sloc >> 9) & 0x3f, val);
+ }
+ return ret;
+ }
+ /* Clear "channel" and set CS so we can simply iterate */
+ loc &= ~(7 << (4+EPB_ADDR_SHF));
+ loc |= (1U << EPB_IB_QUAD0_CS_SHF);
+ for (chnl = 0; chnl < 4; ++chnl) {
+ int cloc = loc | (chnl << (4+EPB_ADDR_SHF));
+
+ ret = qib_sd7220_reg_mod(dd, IB_7220_SERDES, cloc, val, mask);
+ if (ret < 0) {
+ int sloc = loc >> EPB_ADDR_SHF;
+
+ qib_dev_err(dd,
+ "Write failed: elt %d, addr 0x%X, chnl %d, val 0x%02X, mask 0x%02X\n",
+ (sloc & 0xF), (sloc >> 9) & 0x3f, chnl,
+ val & 0xFF, mask & 0xFF);
+ break;
+ }
+ }
+ return ret;
+}
+
+/*
+ * Set the Tx values normally modified by IBC in IB1.2 mode to default
+ * values, as gotten from first row of init table.
+ */
+static int set_dds_vals(struct qib_devdata *dd, struct dds_init *ddi)
+{
+ int ret;
+ int idx, reg, data;
+ uint32_t regmap;
+
+ regmap = DDS_REG_MAP;
+ for (idx = 0; idx < NUM_DDS_REGS; ++idx) {
+ reg = (regmap & 0xF);
+ regmap >>= 4;
+ data = ddi->reg_vals[idx];
+ /* Vendor says RMW not needed for these regs, use 0xFF mask */
+ ret = ibsd_mod_allchnls(dd, EPB_LOC(0, 9, reg), data, 0xFF);
+ if (ret < 0)
+ break;
+ }
+ return ret;
+}
+
+/*
+ * Set the Rx values normally modified by IBC in IB1.2 mode to default
+ * values, as gotten from selected column of init table.
+ */
+static int set_rxeq_vals(struct qib_devdata *dd, int vsel)
+{
+ int ret;
+ int ridx;
+ int cnt = ARRAY_SIZE(rxeq_init_vals);
+
+ for (ridx = 0; ridx < cnt; ++ridx) {
+ int elt, reg, val, loc;
+
+ elt = rxeq_init_vals[ridx].rdesc & 0xF;
+ reg = rxeq_init_vals[ridx].rdesc >> 4;
+ loc = EPB_LOC(0, elt, reg);
+ val = rxeq_init_vals[ridx].rdata[vsel];
+ /* mask of 0xFF, because hardware does full-byte store. */
+ ret = ibsd_mod_allchnls(dd, loc, val, 0xFF);
+ if (ret < 0)
+ break;
+ }
+ return ret;
+}
+
+/*
+ * Set the default values (row 0) for DDR Driver De-emphasis.
+ * We do this initially and whenever we turn off IB-1.2.
+ *
+ * The "default" values for Rx equalization are also stored to
+ * SerDes registers. Formerly (and still default), we used set 2.
+ * For experimenting with cables and link-partners, we allow changing
+ * that via a module parameter.
+ */
+static unsigned qib_rxeq_set = 2;
+module_param_named(rxeq_default_set, qib_rxeq_set, uint,
+ S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(rxeq_default_set,
+ "Which set [0..3] of Rx Equalization values is default");
+
+static int qib_internal_presets(struct qib_devdata *dd)
+{
+ int ret = 0;
+
+ ret = set_dds_vals(dd, dds_init_vals + DDS_3M);
+
+ if (ret < 0)
+ qib_dev_err(dd, "Failed to set default DDS values\n");
+ ret = set_rxeq_vals(dd, qib_rxeq_set & 3);
+ if (ret < 0)
+ qib_dev_err(dd, "Failed to set default RXEQ values\n");
+ return ret;
+}
+
+int qib_sd7220_presets(struct qib_devdata *dd)
+{
+ int ret = 0;
+
+ if (!dd->cspec->presets_needed)
+ return ret;
+ dd->cspec->presets_needed = 0;
+ /* Assert uC reset, so we don't clash with it. */
+ qib_ibsd_reset(dd, 1);
+ udelay(2);
+ qib_sd_trimdone_monitor(dd, "link-down");
+
+ ret = qib_internal_presets(dd);
+ return ret;
+}
+
+static int qib_sd_trimself(struct qib_devdata *dd, int val)
+{
+ int loc = CMUCTRL5 | (1U << EPB_IB_QUAD0_CS_SHF);
+
+ return qib_sd7220_reg_mod(dd, IB_7220_SERDES, loc, val, 0xFF);
+}
+
+static int qib_sd_early(struct qib_devdata *dd)
+{
+ int ret;
+
+ ret = ibsd_mod_allchnls(dd, RXHSCTRL0(0) | EPB_GLOBAL_WR, 0xD4, 0xFF);
+ if (ret < 0)
+ goto bail;
+ ret = ibsd_mod_allchnls(dd, START_EQ1(0) | EPB_GLOBAL_WR, 0x10, 0xFF);
+ if (ret < 0)
+ goto bail;
+ ret = ibsd_mod_allchnls(dd, START_EQ2(0) | EPB_GLOBAL_WR, 0x30, 0xFF);
+bail:
+ return ret;
+}
+
+#define BACTRL(chnl) EPB_LOC(chnl, 6, 0x0E)
+#define LDOUTCTRL1(chnl) EPB_LOC(chnl, 7, 6)
+#define RXHSSTATUS(chnl) EPB_LOC(chnl, 6, 0xF)
+
+static int qib_sd_dactrim(struct qib_devdata *dd)
+{
+ int ret;
+
+ ret = ibsd_mod_allchnls(dd, VCDL_DAC2(0) | EPB_GLOBAL_WR, 0x2D, 0xFF);
+ if (ret < 0)
+ goto bail;
+
+ /* more fine-tuning of what will be default */
+ ret = ibsd_mod_allchnls(dd, VCDL_CTRL2(0), 3, 0xF);
+ if (ret < 0)
+ goto bail;
+
+ ret = ibsd_mod_allchnls(dd, BACTRL(0) | EPB_GLOBAL_WR, 0x40, 0xFF);
+ if (ret < 0)
+ goto bail;
+
+ ret = ibsd_mod_allchnls(dd, LDOUTCTRL1(0) | EPB_GLOBAL_WR, 0x04, 0xFF);
+ if (ret < 0)
+ goto bail;
+
+ ret = ibsd_mod_allchnls(dd, RXHSSTATUS(0) | EPB_GLOBAL_WR, 0x04, 0xFF);
+ if (ret < 0)
+ goto bail;
+
+ /*
+ * Delay for max possible number of steps, with slop.
+ * Each step is about 4usec.
+ */
+ udelay(415);
+
+ ret = ibsd_mod_allchnls(dd, LDOUTCTRL1(0) | EPB_GLOBAL_WR, 0x00, 0xFF);
+
+bail:
+ return ret;
+}
+
+#define RELOCK_FIRST_MS 3
+#define RXLSPPM(chan) EPB_LOC(chan, 0, 2)
+void toggle_7220_rclkrls(struct qib_devdata *dd)
+{
+ int loc = RXLSPPM(0) | EPB_GLOBAL_WR;
+ int ret;
+
+ ret = ibsd_mod_allchnls(dd, loc, 0, 0x80);
+ if (ret < 0)
+ qib_dev_err(dd, "RCLKRLS failed to clear D7\n");
+ else {
+ udelay(1);
+ ibsd_mod_allchnls(dd, loc, 0x80, 0x80);
+ }
+ /* And again for good measure */
+ udelay(1);
+ ret = ibsd_mod_allchnls(dd, loc, 0, 0x80);
+ if (ret < 0)
+ qib_dev_err(dd, "RCLKRLS failed to clear D7\n");
+ else {
+ udelay(1);
+ ibsd_mod_allchnls(dd, loc, 0x80, 0x80);
+ }
+ /* Now reset xgxs and IBC to complete the recovery */
+ dd->f_xgxs_reset(dd->pport);
+}
+
+/*
+ * Shut down the timer that polls for relock occasions, if needed.
+ * This is "hooked" from qib_7220_quiet_serdes(), which is called
+ * just before qib_shutdown_device() in qib_driver.c shuts down all
+ * the other timers.
+ */
+void shutdown_7220_relock_poll(struct qib_devdata *dd)
+{
+ if (dd->cspec->relock_timer_active)
+ del_timer_sync(&dd->cspec->relock_timer);
+}
+
+static unsigned qib_relock_by_timer = 1;
+module_param_named(relock_by_timer, qib_relock_by_timer, uint,
+ S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(relock_by_timer, "Allow relock attempt if link not up");
+
+static void qib_run_relock(unsigned long opaque)
+{
+ struct qib_devdata *dd = (struct qib_devdata *)opaque;
+ struct qib_pportdata *ppd = dd->pport;
+ struct qib_chip_specific *cs = dd->cspec;
+ int timeoff;
+
+ /*
+ * Check link-training state for "stuck" state, when down.
+ * If found, try relock and schedule another try at an
+ * exponentially growing delay, maxed at one second.
+ * If not stuck, our work is done.
+ */
+ if ((dd->flags & QIB_INITTED) && !(ppd->lflags &
+ (QIBL_IB_AUTONEG_INPROG | QIBL_LINKINIT | QIBL_LINKARMED |
+ QIBL_LINKACTIVE))) {
+ if (qib_relock_by_timer) {
+ if (!(ppd->lflags & QIBL_IB_LINK_DISABLED))
+ toggle_7220_rclkrls(dd);
+ }
+ /* re-set timer for next check */
+ timeoff = cs->relock_interval << 1;
+ if (timeoff > HZ)
+ timeoff = HZ;
+ cs->relock_interval = timeoff;
+ } else
+ timeoff = HZ;
+ mod_timer(&cs->relock_timer, jiffies + timeoff);
+}
+
+void set_7220_relock_poll(struct qib_devdata *dd, int ibup)
+{
+ struct qib_chip_specific *cs = dd->cspec;
+
+ if (ibup) {
+ /* We are now up, relax timer to 1 second interval */
+ if (cs->relock_timer_active) {
+ cs->relock_interval = HZ;
+ mod_timer(&cs->relock_timer, jiffies + HZ);
+ }
+ } else {
+ /* Transition to down, (re-)set timer to short interval. */
+ unsigned int timeout;
+
+ timeout = msecs_to_jiffies(RELOCK_FIRST_MS);
+ if (timeout == 0)
+ timeout = 1;
+ /* If timer has not yet been started, do so. */
+ if (!cs->relock_timer_active) {
+ cs->relock_timer_active = 1;
+ init_timer(&cs->relock_timer);
+ cs->relock_timer.function = qib_run_relock;
+ cs->relock_timer.data = (unsigned long) dd;
+ cs->relock_interval = timeout;
+ cs->relock_timer.expires = jiffies + timeout;
+ add_timer(&cs->relock_timer);
+ } else {
+ cs->relock_interval = timeout;
+ mod_timer(&cs->relock_timer, jiffies + timeout);
+ }
+ }
+}
diff --git a/drivers/infiniband/hw/qib/qib_sdma.c b/drivers/infiniband/hw/qib/qib_sdma.c
new file mode 100644
index 000000000..c6d6a54d2
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_sdma.c
@@ -0,0 +1,1039 @@
+/*
+ * Copyright (c) 2012 Intel Corporation. All rights reserved.
+ * Copyright (c) 2007 - 2012 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/netdevice.h>
+#include <linux/moduleparam.h>
+
+#include "qib.h"
+#include "qib_common.h"
+
+/* default pio off, sdma on */
+static ushort sdma_descq_cnt = 256;
+module_param_named(sdma_descq_cnt, sdma_descq_cnt, ushort, S_IRUGO);
+MODULE_PARM_DESC(sdma_descq_cnt, "Number of SDMA descq entries");
+
+/*
+ * Bits defined in the send DMA descriptor.
+ */
+#define SDMA_DESC_LAST (1ULL << 11)
+#define SDMA_DESC_FIRST (1ULL << 12)
+#define SDMA_DESC_DMA_HEAD (1ULL << 13)
+#define SDMA_DESC_USE_LARGE_BUF (1ULL << 14)
+#define SDMA_DESC_INTR (1ULL << 15)
+#define SDMA_DESC_COUNT_LSB 16
+#define SDMA_DESC_GEN_LSB 30
+
+char *qib_sdma_state_names[] = {
+ [qib_sdma_state_s00_hw_down] = "s00_HwDown",
+ [qib_sdma_state_s10_hw_start_up_wait] = "s10_HwStartUpWait",
+ [qib_sdma_state_s20_idle] = "s20_Idle",
+ [qib_sdma_state_s30_sw_clean_up_wait] = "s30_SwCleanUpWait",
+ [qib_sdma_state_s40_hw_clean_up_wait] = "s40_HwCleanUpWait",
+ [qib_sdma_state_s50_hw_halt_wait] = "s50_HwHaltWait",
+ [qib_sdma_state_s99_running] = "s99_Running",
+};
+
+char *qib_sdma_event_names[] = {
+ [qib_sdma_event_e00_go_hw_down] = "e00_GoHwDown",
+ [qib_sdma_event_e10_go_hw_start] = "e10_GoHwStart",
+ [qib_sdma_event_e20_hw_started] = "e20_HwStarted",
+ [qib_sdma_event_e30_go_running] = "e30_GoRunning",
+ [qib_sdma_event_e40_sw_cleaned] = "e40_SwCleaned",
+ [qib_sdma_event_e50_hw_cleaned] = "e50_HwCleaned",
+ [qib_sdma_event_e60_hw_halted] = "e60_HwHalted",
+ [qib_sdma_event_e70_go_idle] = "e70_GoIdle",
+ [qib_sdma_event_e7220_err_halted] = "e7220_ErrHalted",
+ [qib_sdma_event_e7322_err_halted] = "e7322_ErrHalted",
+ [qib_sdma_event_e90_timer_tick] = "e90_TimerTick",
+};
+
+/* declare all statics here rather than keep sorting */
+static int alloc_sdma(struct qib_pportdata *);
+static void sdma_complete(struct kref *);
+static void sdma_finalput(struct qib_sdma_state *);
+static void sdma_get(struct qib_sdma_state *);
+static void sdma_put(struct qib_sdma_state *);
+static void sdma_set_state(struct qib_pportdata *, enum qib_sdma_states);
+static void sdma_start_sw_clean_up(struct qib_pportdata *);
+static void sdma_sw_clean_up_task(unsigned long);
+static void unmap_desc(struct qib_pportdata *, unsigned);
+
+static void sdma_get(struct qib_sdma_state *ss)
+{
+ kref_get(&ss->kref);
+}
+
+static void sdma_complete(struct kref *kref)
+{
+ struct qib_sdma_state *ss =
+ container_of(kref, struct qib_sdma_state, kref);
+
+ complete(&ss->comp);
+}
+
+static void sdma_put(struct qib_sdma_state *ss)
+{
+ kref_put(&ss->kref, sdma_complete);
+}
+
+static void sdma_finalput(struct qib_sdma_state *ss)
+{
+ sdma_put(ss);
+ wait_for_completion(&ss->comp);
+}
+
+/*
+ * Complete all the sdma requests on the active list, in the correct
+ * order, and with appropriate processing. Called when cleaning up
+ * after sdma shutdown, and when new sdma requests are submitted for
+ * a link that is down. This matches what is done for requests
+ * that complete normally; it's just the full list.
+ *
+ * Must be called with sdma_lock held
+ */
+static void clear_sdma_activelist(struct qib_pportdata *ppd)
+{
+ struct qib_sdma_txreq *txp, *txp_next;
+
+ list_for_each_entry_safe(txp, txp_next, &ppd->sdma_activelist, list) {
+ list_del_init(&txp->list);
+ if (txp->flags & QIB_SDMA_TXREQ_F_FREEDESC) {
+ unsigned idx;
+
+ idx = txp->start_idx;
+ while (idx != txp->next_descq_idx) {
+ unmap_desc(ppd, idx);
+ if (++idx == ppd->sdma_descq_cnt)
+ idx = 0;
+ }
+ }
+ if (txp->callback)
+ (*txp->callback)(txp, QIB_SDMA_TXREQ_S_ABORTED);
+ }
+}
+
+static void sdma_sw_clean_up_task(unsigned long opaque)
+{
+ struct qib_pportdata *ppd = (struct qib_pportdata *) opaque;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ppd->sdma_lock, flags);
+
+ /*
+ * At this point, the following should always be true:
+ * - We are halted, so no more descriptors are getting retired.
+ * - We are not running, so no one is submitting new work.
+ * - Only we can send the e40_sw_cleaned, so we can't start
+ * running again until we say so. So, the active list and
+ * descq are ours to play with.
+ */
+
+ /* Process all retired requests. */
+ qib_sdma_make_progress(ppd);
+
+ clear_sdma_activelist(ppd);
+
+ /*
+ * Resync count of added and removed. It is VERY important that
+ * sdma_descq_removed NEVER decrement - user_sdma depends on it.
+ */
+ ppd->sdma_descq_removed = ppd->sdma_descq_added;
+
+ /*
+ * Reset our notion of head and tail.
+ * Note that the HW registers will be reset when switching states
+ * due to calling __qib_sdma_process_event() below.
+ */
+ ppd->sdma_descq_tail = 0;
+ ppd->sdma_descq_head = 0;
+ ppd->sdma_head_dma[0] = 0;
+ ppd->sdma_generation = 0;
+
+ __qib_sdma_process_event(ppd, qib_sdma_event_e40_sw_cleaned);
+
+ spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+}
+
+/*
+ * This is called when changing to state qib_sdma_state_s10_hw_start_up_wait
+ * as a result of send buffer errors or send DMA descriptor errors.
+ * We want to disarm the buffers in these cases.
+ */
+static void sdma_hw_start_up(struct qib_pportdata *ppd)
+{
+ struct qib_sdma_state *ss = &ppd->sdma_state;
+ unsigned bufno;
+
+ for (bufno = ss->first_sendbuf; bufno < ss->last_sendbuf; ++bufno)
+ ppd->dd->f_sendctrl(ppd, QIB_SENDCTRL_DISARM_BUF(bufno));
+
+ ppd->dd->f_sdma_hw_start_up(ppd);
+}
+
+static void sdma_sw_tear_down(struct qib_pportdata *ppd)
+{
+ struct qib_sdma_state *ss = &ppd->sdma_state;
+
+ /* Releasing this reference means the state machine has stopped. */
+ sdma_put(ss);
+}
+
+static void sdma_start_sw_clean_up(struct qib_pportdata *ppd)
+{
+ tasklet_hi_schedule(&ppd->sdma_sw_clean_up_task);
+}
+
+static void sdma_set_state(struct qib_pportdata *ppd,
+ enum qib_sdma_states next_state)
+{
+ struct qib_sdma_state *ss = &ppd->sdma_state;
+ struct sdma_set_state_action *action = ss->set_state_action;
+ unsigned op = 0;
+
+ /* debugging bookkeeping */
+ ss->previous_state = ss->current_state;
+ ss->previous_op = ss->current_op;
+
+ ss->current_state = next_state;
+
+ if (action[next_state].op_enable)
+ op |= QIB_SDMA_SENDCTRL_OP_ENABLE;
+
+ if (action[next_state].op_intenable)
+ op |= QIB_SDMA_SENDCTRL_OP_INTENABLE;
+
+ if (action[next_state].op_halt)
+ op |= QIB_SDMA_SENDCTRL_OP_HALT;
+
+ if (action[next_state].op_drain)
+ op |= QIB_SDMA_SENDCTRL_OP_DRAIN;
+
+ if (action[next_state].go_s99_running_tofalse)
+ ss->go_s99_running = 0;
+
+ if (action[next_state].go_s99_running_totrue)
+ ss->go_s99_running = 1;
+
+ ss->current_op = op;
+
+ ppd->dd->f_sdma_sendctrl(ppd, ss->current_op);
+}
+
+static void unmap_desc(struct qib_pportdata *ppd, unsigned head)
+{
+ __le64 *descqp = &ppd->sdma_descq[head].qw[0];
+ u64 desc[2];
+ dma_addr_t addr;
+ size_t len;
+
+ desc[0] = le64_to_cpu(descqp[0]);
+ desc[1] = le64_to_cpu(descqp[1]);
+
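+ /*
+ * Recover the DMA address (upper bits from desc[1], low 32 bits from
+ * desc[0][63:32]) and the mapped length in bytes (the dword count at
+ * SDMA_DESC_COUNT_LSB, shifted so it is already multiplied by 4).
+ */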
+ addr = (desc[1] << 32) | (desc[0] >> 32);
+ len = (desc[0] >> 14) & (0x7ffULL << 2);
+ dma_unmap_single(&ppd->dd->pcidev->dev, addr, len, DMA_TO_DEVICE);
+}
+
+static int alloc_sdma(struct qib_pportdata *ppd)
+{
+ ppd->sdma_descq_cnt = sdma_descq_cnt;
+ if (!ppd->sdma_descq_cnt)
+ ppd->sdma_descq_cnt = 256;
+
+ /* Allocate memory for SendDMA descriptor FIFO */
+ ppd->sdma_descq = dma_alloc_coherent(&ppd->dd->pcidev->dev,
+ ppd->sdma_descq_cnt * sizeof(u64[2]), &ppd->sdma_descq_phys,
+ GFP_KERNEL);
+
+ if (!ppd->sdma_descq) {
+ qib_dev_err(ppd->dd,
+ "failed to allocate SendDMA descriptor FIFO memory\n");
+ goto bail;
+ }
+
+ /* Allocate memory for DMA of head register to memory */
+ ppd->sdma_head_dma = dma_alloc_coherent(&ppd->dd->pcidev->dev,
+ PAGE_SIZE, &ppd->sdma_head_phys, GFP_KERNEL);
+ if (!ppd->sdma_head_dma) {
+ qib_dev_err(ppd->dd,
+ "failed to allocate SendDMA head memory\n");
+ goto cleanup_descq;
+ }
+ ppd->sdma_head_dma[0] = 0;
+ return 0;
+
+cleanup_descq:
+ dma_free_coherent(&ppd->dd->pcidev->dev,
+ ppd->sdma_descq_cnt * sizeof(u64[2]), (void *)ppd->sdma_descq,
+ ppd->sdma_descq_phys);
+ ppd->sdma_descq = NULL;
+ ppd->sdma_descq_phys = 0;
+bail:
+ ppd->sdma_descq_cnt = 0;
+ return -ENOMEM;
+}
+
+static void free_sdma(struct qib_pportdata *ppd)
+{
+ struct qib_devdata *dd = ppd->dd;
+
+ if (ppd->sdma_head_dma) {
+ dma_free_coherent(&dd->pcidev->dev, PAGE_SIZE,
+ (void *)ppd->sdma_head_dma,
+ ppd->sdma_head_phys);
+ ppd->sdma_head_dma = NULL;
+ ppd->sdma_head_phys = 0;
+ }
+
+ if (ppd->sdma_descq) {
+ dma_free_coherent(&dd->pcidev->dev,
+ ppd->sdma_descq_cnt * sizeof(u64[2]),
+ ppd->sdma_descq, ppd->sdma_descq_phys);
+ ppd->sdma_descq = NULL;
+ ppd->sdma_descq_phys = 0;
+ }
+}
+
+static inline void make_sdma_desc(struct qib_pportdata *ppd,
+ u64 *sdmadesc, u64 addr, u64 dwlen,
+ u64 dwoffset)
+{
+
+ WARN_ON(addr & 3);
+ /* SDmaPhyAddr[47:32] */
+ sdmadesc[1] = addr >> 32;
+ /* SDmaPhyAddr[31:0] */
+ sdmadesc[0] = (addr & 0xfffffffcULL) << 32;
+ /* SDmaGeneration[1:0] */
+ sdmadesc[0] |= (ppd->sdma_generation & 3ULL) <<
+ SDMA_DESC_GEN_LSB;
+ /* SDmaDwordCount[10:0] */
+ sdmadesc[0] |= (dwlen & 0x7ffULL) << SDMA_DESC_COUNT_LSB;
+ /* SDmaBufOffset[12:2] */
+ sdmadesc[0] |= dwoffset & 0x7ffULL;
+}
+
+/* sdma_lock must be held */
+int qib_sdma_make_progress(struct qib_pportdata *ppd)
+{
+ struct list_head *lp = NULL;
+ struct qib_sdma_txreq *txp = NULL;
+ struct qib_devdata *dd = ppd->dd;
+ int progress = 0;
+ u16 hwhead;
+ u16 idx = 0;
+
+ hwhead = dd->f_sdma_gethead(ppd);
+
+ /* The reason for some of the complexity of this code is that
+ * not all descriptors have corresponding txps. So, we have to
+ * be able to skip over descs until we wander into the range of
+ * the next txp on the list.
+ */
+
+ if (!list_empty(&ppd->sdma_activelist)) {
+ lp = ppd->sdma_activelist.next;
+ txp = list_entry(lp, struct qib_sdma_txreq, list);
+ idx = txp->start_idx;
+ }
+
+ while (ppd->sdma_descq_head != hwhead) {
+ /* if desc is part of this txp, unmap if needed */
+ if (txp && (txp->flags & QIB_SDMA_TXREQ_F_FREEDESC) &&
+ (idx == ppd->sdma_descq_head)) {
+ unmap_desc(ppd, ppd->sdma_descq_head);
+ if (++idx == ppd->sdma_descq_cnt)
+ idx = 0;
+ }
+
+ /* increment dequeued desc count */
+ ppd->sdma_descq_removed++;
+
+ /* advance head, wrap if needed */
+ if (++ppd->sdma_descq_head == ppd->sdma_descq_cnt)
+ ppd->sdma_descq_head = 0;
+
+ /* if now past this txp's descs, do the callback */
+ if (txp && txp->next_descq_idx == ppd->sdma_descq_head) {
+ /* remove from active list */
+ list_del_init(&txp->list);
+ if (txp->callback)
+ (*txp->callback)(txp, QIB_SDMA_TXREQ_S_OK);
+ /* see if there is another txp */
+ if (list_empty(&ppd->sdma_activelist))
+ txp = NULL;
+ else {
+ lp = ppd->sdma_activelist.next;
+ txp = list_entry(lp, struct qib_sdma_txreq,
+ list);
+ idx = txp->start_idx;
+ }
+ }
+ progress = 1;
+ }
+ if (progress)
+ qib_verbs_sdma_desc_avail(ppd, qib_sdma_descq_freecnt(ppd));
+ return progress;
+}
+
+/*
+ * This is called from interrupt context.
+ */
+void qib_sdma_intr(struct qib_pportdata *ppd)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ppd->sdma_lock, flags);
+
+ __qib_sdma_intr(ppd);
+
+ spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+}
+
+void __qib_sdma_intr(struct qib_pportdata *ppd)
+{
+ if (__qib_sdma_running(ppd)) {
+ qib_sdma_make_progress(ppd);
+ if (!list_empty(&ppd->sdma_userpending))
+ qib_user_sdma_send_desc(ppd, &ppd->sdma_userpending);
+ }
+}
+
+int qib_setup_sdma(struct qib_pportdata *ppd)
+{
+ struct qib_devdata *dd = ppd->dd;
+ unsigned long flags;
+ int ret = 0;
+
+ ret = alloc_sdma(ppd);
+ if (ret)
+ goto bail;
+
+ /* set consistent sdma state */
+ ppd->dd->f_sdma_init_early(ppd);
+ spin_lock_irqsave(&ppd->sdma_lock, flags);
+ sdma_set_state(ppd, qib_sdma_state_s00_hw_down);
+ spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+
+ /* set up reference counting */
+ kref_init(&ppd->sdma_state.kref);
+ init_completion(&ppd->sdma_state.comp);
+
+ ppd->sdma_generation = 0;
+ ppd->sdma_descq_head = 0;
+ ppd->sdma_descq_removed = 0;
+ ppd->sdma_descq_added = 0;
+
+ ppd->sdma_intrequest = 0;
+ INIT_LIST_HEAD(&ppd->sdma_userpending);
+
+ INIT_LIST_HEAD(&ppd->sdma_activelist);
+
+ tasklet_init(&ppd->sdma_sw_clean_up_task, sdma_sw_clean_up_task,
+ (unsigned long)ppd);
+
+ ret = dd->f_init_sdma_regs(ppd);
+ if (ret)
+ goto bail_alloc;
+
+ qib_sdma_process_event(ppd, qib_sdma_event_e10_go_hw_start);
+
+ return 0;
+
+bail_alloc:
+ qib_teardown_sdma(ppd);
+bail:
+ return ret;
+}
+
+void qib_teardown_sdma(struct qib_pportdata *ppd)
+{
+ qib_sdma_process_event(ppd, qib_sdma_event_e00_go_hw_down);
+
+ /*
+ * This waits for the state machine to exit so it is not
+ * necessary to kill the sdma_sw_clean_up_task to make sure
+ * it is not running.
+ */
+ sdma_finalput(&ppd->sdma_state);
+
+ free_sdma(ppd);
+}
+
+int qib_sdma_running(struct qib_pportdata *ppd)
+{
+ unsigned long flags;
+ int ret;
+
+ spin_lock_irqsave(&ppd->sdma_lock, flags);
+ ret = __qib_sdma_running(ppd);
+ spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+
+ return ret;
+}
+
+/*
+ * Complete a request when sdma is not running; it is likely the only
+ * request, but to simplify the code, always queue it, then process the
+ * full activelist. We process the entire list to ensure that this
+ * particular request gets its callback, and in the correct order.
+ * Must be called with sdma_lock held.
+ */
+static void complete_sdma_err_req(struct qib_pportdata *ppd,
+ struct qib_verbs_txreq *tx)
+{
+ atomic_inc(&tx->qp->s_dma_busy);
+ /* no sdma descriptors, so no unmap_desc */
+ tx->txreq.start_idx = 0;
+ tx->txreq.next_descq_idx = 0;
+ list_add_tail(&tx->txreq.list, &ppd->sdma_activelist);
+ clear_sdma_activelist(ppd);
+}
+
+/*
+ * This function queues one IB packet onto the send DMA queue per call.
+ * The caller is responsible for checking:
+ * 1) The number of send DMA descriptor entries is less than the size of
+ * the descriptor queue.
+ * 2) The IB SGE addresses and lengths are 32-bit aligned
+ * (except possibly the last SGE's length)
+ * 3) The SGE addresses are suitable for passing to dma_map_single().
+ */
+int qib_sdma_verbs_send(struct qib_pportdata *ppd,
+ struct qib_sge_state *ss, u32 dwords,
+ struct qib_verbs_txreq *tx)
+{
+ unsigned long flags;
+ struct qib_sge *sge;
+ struct qib_qp *qp;
+ int ret = 0;
+ u16 tail;
+ __le64 *descqp;
+ u64 sdmadesc[2];
+ u32 dwoffset;
+ dma_addr_t addr;
+
+ spin_lock_irqsave(&ppd->sdma_lock, flags);
+
+retry:
+ if (unlikely(!__qib_sdma_running(ppd))) {
+ complete_sdma_err_req(ppd, tx);
+ goto unlock;
+ }
+
+ if (tx->txreq.sg_count > qib_sdma_descq_freecnt(ppd)) {
+ if (qib_sdma_make_progress(ppd))
+ goto retry;
+ if (ppd->dd->flags & QIB_HAS_SDMA_TIMEOUT)
+ ppd->dd->f_sdma_set_desc_cnt(ppd,
+ ppd->sdma_descq_cnt / 2);
+ goto busy;
+ }
+
+ dwoffset = tx->hdr_dwords;
+ make_sdma_desc(ppd, sdmadesc, (u64) tx->txreq.addr, dwoffset, 0);
+
+ sdmadesc[0] |= SDMA_DESC_FIRST;
+ if (tx->txreq.flags & QIB_SDMA_TXREQ_F_USELARGEBUF)
+ sdmadesc[0] |= SDMA_DESC_USE_LARGE_BUF;
+
+ /* write to the descq */
+ tail = ppd->sdma_descq_tail;
+ descqp = &ppd->sdma_descq[tail].qw[0];
+ *descqp++ = cpu_to_le64(sdmadesc[0]);
+ *descqp++ = cpu_to_le64(sdmadesc[1]);
+
+ /* increment the tail */
+ if (++tail == ppd->sdma_descq_cnt) {
+ tail = 0;
+ descqp = &ppd->sdma_descq[0].qw[0];
+ ++ppd->sdma_generation;
+ }
+
+ tx->txreq.start_idx = tail;
+
+ sge = &ss->sge;
+ while (dwords) {
+ u32 dw;
+ u32 len;
+
+ len = dwords << 2;
+ if (len > sge->length)
+ len = sge->length;
+ if (len > sge->sge_length)
+ len = sge->sge_length;
+ BUG_ON(len == 0);
+ dw = (len + 3) >> 2;
+ addr = dma_map_single(&ppd->dd->pcidev->dev, sge->vaddr,
+ dw << 2, DMA_TO_DEVICE);
+ if (dma_mapping_error(&ppd->dd->pcidev->dev, addr))
+ goto unmap;
+ sdmadesc[0] = 0;
+ make_sdma_desc(ppd, sdmadesc, (u64) addr, dw, dwoffset);
+ /* SDmaUseLargeBuf has to be set in every descriptor */
+ if (tx->txreq.flags & QIB_SDMA_TXREQ_F_USELARGEBUF)
+ sdmadesc[0] |= SDMA_DESC_USE_LARGE_BUF;
+ /* write to the descq */
+ *descqp++ = cpu_to_le64(sdmadesc[0]);
+ *descqp++ = cpu_to_le64(sdmadesc[1]);
+
+ /* increment the tail */
+ if (++tail == ppd->sdma_descq_cnt) {
+ tail = 0;
+ descqp = &ppd->sdma_descq[0].qw[0];
+ ++ppd->sdma_generation;
+ }
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (--ss->num_sge)
+ *sge = *ss->sg_list++;
+ } else if (sge->length == 0 && sge->mr->lkey) {
+ if (++sge->n >= QIB_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr =
+ sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length =
+ sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+
+ dwoffset += dw;
+ dwords -= dw;
+ }
+
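+ /*
+ * descqp points just past the last descriptor written; if the tail
+ * wrapped to 0 above, point past the end of the ring so descqp -= 2
+ * below selects that last descriptor before tagging it.
+ */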
+ if (!tail)
+ descqp = &ppd->sdma_descq[ppd->sdma_descq_cnt].qw[0];
+ descqp -= 2;
+ descqp[0] |= cpu_to_le64(SDMA_DESC_LAST);
+ if (tx->txreq.flags & QIB_SDMA_TXREQ_F_HEADTOHOST)
+ descqp[0] |= cpu_to_le64(SDMA_DESC_DMA_HEAD);
+ if (tx->txreq.flags & QIB_SDMA_TXREQ_F_INTREQ)
+ descqp[0] |= cpu_to_le64(SDMA_DESC_INTR);
+
+ atomic_inc(&tx->qp->s_dma_busy);
+ tx->txreq.next_descq_idx = tail;
+ ppd->dd->f_sdma_update_tail(ppd, tail);
+ ppd->sdma_descq_added += tx->txreq.sg_count;
+ list_add_tail(&tx->txreq.list, &ppd->sdma_activelist);
+ goto unlock;
+
+unmap:
+ for (;;) {
+ if (!tail)
+ tail = ppd->sdma_descq_cnt - 1;
+ else
+ tail--;
+ if (tail == ppd->sdma_descq_tail)
+ break;
+ unmap_desc(ppd, tail);
+ }
+ qp = tx->qp;
+ qib_put_txreq(tx);
+ spin_lock(&qp->r_lock);
+ spin_lock(&qp->s_lock);
+ if (qp->ibqp.qp_type == IB_QPT_RC) {
+ /* XXX what about error sending RDMA read responses? */
+ if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)
+ qib_error_qp(qp, IB_WC_GENERAL_ERR);
+ } else if (qp->s_wqe)
+ qib_send_complete(qp, qp->s_wqe, IB_WC_GENERAL_ERR);
+ spin_unlock(&qp->s_lock);
+ spin_unlock(&qp->r_lock);
+ /* return zero to process the next send work request */
+ goto unlock;
+
+busy:
+ qp = tx->qp;
+ spin_lock(&qp->s_lock);
+ if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) {
+ struct qib_ibdev *dev;
+
+ /*
+ * If we couldn't queue the DMA request, save the info
+ * and try again later rather than destroying the
+ * buffer and undoing the side effects of the copy.
+ */
+ tx->ss = ss;
+ tx->dwords = dwords;
+ qp->s_tx = tx;
+ dev = &ppd->dd->verbs_dev;
+ spin_lock(&dev->pending_lock);
+ if (list_empty(&qp->iowait)) {
+ struct qib_ibport *ibp;
+
+ ibp = &ppd->ibport_data;
+ ibp->n_dmawait++;
+ qp->s_flags |= QIB_S_WAIT_DMA_DESC;
+ list_add_tail(&qp->iowait, &dev->dmawait);
+ }
+ spin_unlock(&dev->pending_lock);
+ qp->s_flags &= ~QIB_S_BUSY;
+ spin_unlock(&qp->s_lock);
+ ret = -EBUSY;
+ } else {
+ spin_unlock(&qp->s_lock);
+ qib_put_txreq(tx);
+ }
+unlock:
+ spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+ return ret;
+}
+
+/*
+ * sdma_lock should be acquired before calling this routine
+ */
+void dump_sdma_state(struct qib_pportdata *ppd)
+{
+ struct qib_sdma_desc *descq;
+ struct qib_sdma_txreq *txp, *txpnext;
+ __le64 *descqp;
+ u64 desc[2];
+ u64 addr;
+ u16 gen, dwlen, dwoffset;
+ u16 head, tail, cnt;
+
+ head = ppd->sdma_descq_head;
+ tail = ppd->sdma_descq_tail;
+ cnt = qib_sdma_descq_freecnt(ppd);
+ descq = ppd->sdma_descq;
+
+ qib_dev_porterr(ppd->dd, ppd->port,
+ "SDMA ppd->sdma_descq_head: %u\n", head);
+ qib_dev_porterr(ppd->dd, ppd->port,
+ "SDMA ppd->sdma_descq_tail: %u\n", tail);
+ qib_dev_porterr(ppd->dd, ppd->port,
+ "SDMA sdma_descq_freecnt: %u\n", cnt);
+
+ /* print info for each entry in the descriptor queue */
+ while (head != tail) {
+ char flags[6] = { 'x', 'x', 'x', 'x', 'x', 0 };
+
+ descqp = &descq[head].qw[0];
+ desc[0] = le64_to_cpu(descqp[0]);
+ desc[1] = le64_to_cpu(descqp[1]);
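+ /* decode SDMA_DESC_* bits: Intr, Large/Small buf, dma Head, First, Last */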
+ flags[0] = (desc[0] & 1<<15) ? 'I' : '-';
+ flags[1] = (desc[0] & 1<<14) ? 'L' : 'S';
+ flags[2] = (desc[0] & 1<<13) ? 'H' : '-';
+ flags[3] = (desc[0] & 1<<12) ? 'F' : '-';
+ flags[4] = (desc[0] & 1<<11) ? 'L' : '-';
+ addr = (desc[1] << 32) | ((desc[0] >> 32) & 0xfffffffcULL);
+ gen = (desc[0] >> 30) & 3ULL;
+ dwlen = (desc[0] >> 14) & (0x7ffULL << 2);
+ dwoffset = (desc[0] & 0x7ffULL) << 2;
+ qib_dev_porterr(ppd->dd, ppd->port,
+ "SDMA sdmadesc[%u]: flags:%s addr:0x%016llx gen:%u len:%u bytes offset:%u bytes\n",
+ head, flags, addr, gen, dwlen, dwoffset);
+ if (++head == ppd->sdma_descq_cnt)
+ head = 0;
+ }
+
+ /* print dma descriptor indices from the TX requests */
+ list_for_each_entry_safe(txp, txpnext, &ppd->sdma_activelist,
+ list)
+ qib_dev_porterr(ppd->dd, ppd->port,
+ "SDMA txp->start_idx: %u txp->next_descq_idx: %u\n",
+ txp->start_idx, txp->next_descq_idx);
+}
+
+void qib_sdma_process_event(struct qib_pportdata *ppd,
+ enum qib_sdma_events event)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ppd->sdma_lock, flags);
+
+ __qib_sdma_process_event(ppd, event);
+
+ if (ppd->sdma_state.current_state == qib_sdma_state_s99_running)
+ qib_verbs_sdma_desc_avail(ppd, qib_sdma_descq_freecnt(ppd));
+
+ spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+}
+
+void __qib_sdma_process_event(struct qib_pportdata *ppd,
+ enum qib_sdma_events event)
+{
+ struct qib_sdma_state *ss = &ppd->sdma_state;
+
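+ /*
+ * Dispatch on the current state; each case handles only the events
+ * that are meaningful in that state and ignores the rest.
+ */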
+ switch (ss->current_state) {
+ case qib_sdma_state_s00_hw_down:
+ switch (event) {
+ case qib_sdma_event_e00_go_hw_down:
+ break;
+ case qib_sdma_event_e30_go_running:
+ /*
+ * If down, but running requested (usually the result
+ * of link up), then we need to start up.
+ * This can happen when hw down is requested while
+ * bringing the link up with traffic active on the
+ * 7220, for example.
+ */
+ ss->go_s99_running = 1;
+ /* fall through and start dma engine */
+ case qib_sdma_event_e10_go_hw_start:
+ /* This reference means the state machine is started */
+ sdma_get(&ppd->sdma_state);
+ sdma_set_state(ppd,
+ qib_sdma_state_s10_hw_start_up_wait);
+ break;
+ case qib_sdma_event_e20_hw_started:
+ break;
+ case qib_sdma_event_e40_sw_cleaned:
+ sdma_sw_tear_down(ppd);
+ break;
+ case qib_sdma_event_e50_hw_cleaned:
+ break;
+ case qib_sdma_event_e60_hw_halted:
+ break;
+ case qib_sdma_event_e70_go_idle:
+ break;
+ case qib_sdma_event_e7220_err_halted:
+ break;
+ case qib_sdma_event_e7322_err_halted:
+ break;
+ case qib_sdma_event_e90_timer_tick:
+ break;
+ }
+ break;
+
+ case qib_sdma_state_s10_hw_start_up_wait:
+ switch (event) {
+ case qib_sdma_event_e00_go_hw_down:
+ sdma_set_state(ppd, qib_sdma_state_s00_hw_down);
+ sdma_sw_tear_down(ppd);
+ break;
+ case qib_sdma_event_e10_go_hw_start:
+ break;
+ case qib_sdma_event_e20_hw_started:
+ sdma_set_state(ppd, ss->go_s99_running ?
+ qib_sdma_state_s99_running :
+ qib_sdma_state_s20_idle);
+ break;
+ case qib_sdma_event_e30_go_running:
+ ss->go_s99_running = 1;
+ break;
+ case qib_sdma_event_e40_sw_cleaned:
+ break;
+ case qib_sdma_event_e50_hw_cleaned:
+ break;
+ case qib_sdma_event_e60_hw_halted:
+ break;
+ case qib_sdma_event_e70_go_idle:
+ ss->go_s99_running = 0;
+ break;
+ case qib_sdma_event_e7220_err_halted:
+ break;
+ case qib_sdma_event_e7322_err_halted:
+ break;
+ case qib_sdma_event_e90_timer_tick:
+ break;
+ }
+ break;
+
+ case qib_sdma_state_s20_idle:
+ switch (event) {
+ case qib_sdma_event_e00_go_hw_down:
+ sdma_set_state(ppd, qib_sdma_state_s00_hw_down);
+ sdma_sw_tear_down(ppd);
+ break;
+ case qib_sdma_event_e10_go_hw_start:
+ break;
+ case qib_sdma_event_e20_hw_started:
+ break;
+ case qib_sdma_event_e30_go_running:
+ sdma_set_state(ppd, qib_sdma_state_s99_running);
+ ss->go_s99_running = 1;
+ break;
+ case qib_sdma_event_e40_sw_cleaned:
+ break;
+ case qib_sdma_event_e50_hw_cleaned:
+ break;
+ case qib_sdma_event_e60_hw_halted:
+ break;
+ case qib_sdma_event_e70_go_idle:
+ break;
+ case qib_sdma_event_e7220_err_halted:
+ break;
+ case qib_sdma_event_e7322_err_halted:
+ break;
+ case qib_sdma_event_e90_timer_tick:
+ break;
+ }
+ break;
+
+ case qib_sdma_state_s30_sw_clean_up_wait:
+ switch (event) {
+ case qib_sdma_event_e00_go_hw_down:
+ sdma_set_state(ppd, qib_sdma_state_s00_hw_down);
+ break;
+ case qib_sdma_event_e10_go_hw_start:
+ break;
+ case qib_sdma_event_e20_hw_started:
+ break;
+ case qib_sdma_event_e30_go_running:
+ ss->go_s99_running = 1;
+ break;
+ case qib_sdma_event_e40_sw_cleaned:
+ sdma_set_state(ppd,
+ qib_sdma_state_s10_hw_start_up_wait);
+ sdma_hw_start_up(ppd);
+ break;
+ case qib_sdma_event_e50_hw_cleaned:
+ break;
+ case qib_sdma_event_e60_hw_halted:
+ break;
+ case qib_sdma_event_e70_go_idle:
+ ss->go_s99_running = 0;
+ break;
+ case qib_sdma_event_e7220_err_halted:
+ break;
+ case qib_sdma_event_e7322_err_halted:
+ break;
+ case qib_sdma_event_e90_timer_tick:
+ break;
+ }
+ break;
+
+ case qib_sdma_state_s40_hw_clean_up_wait:
+ switch (event) {
+ case qib_sdma_event_e00_go_hw_down:
+ sdma_set_state(ppd, qib_sdma_state_s00_hw_down);
+ sdma_start_sw_clean_up(ppd);
+ break;
+ case qib_sdma_event_e10_go_hw_start:
+ break;
+ case qib_sdma_event_e20_hw_started:
+ break;
+ case qib_sdma_event_e30_go_running:
+ ss->go_s99_running = 1;
+ break;
+ case qib_sdma_event_e40_sw_cleaned:
+ break;
+ case qib_sdma_event_e50_hw_cleaned:
+ sdma_set_state(ppd,
+ qib_sdma_state_s30_sw_clean_up_wait);
+ sdma_start_sw_clean_up(ppd);
+ break;
+ case qib_sdma_event_e60_hw_halted:
+ break;
+ case qib_sdma_event_e70_go_idle:
+ ss->go_s99_running = 0;
+ break;
+ case qib_sdma_event_e7220_err_halted:
+ break;
+ case qib_sdma_event_e7322_err_halted:
+ break;
+ case qib_sdma_event_e90_timer_tick:
+ break;
+ }
+ break;
+
+ case qib_sdma_state_s50_hw_halt_wait:
+ switch (event) {
+ case qib_sdma_event_e00_go_hw_down:
+ sdma_set_state(ppd, qib_sdma_state_s00_hw_down);
+ sdma_start_sw_clean_up(ppd);
+ break;
+ case qib_sdma_event_e10_go_hw_start:
+ break;
+ case qib_sdma_event_e20_hw_started:
+ break;
+ case qib_sdma_event_e30_go_running:
+ ss->go_s99_running = 1;
+ break;
+ case qib_sdma_event_e40_sw_cleaned:
+ break;
+ case qib_sdma_event_e50_hw_cleaned:
+ break;
+ case qib_sdma_event_e60_hw_halted:
+ sdma_set_state(ppd,
+ qib_sdma_state_s40_hw_clean_up_wait);
+ ppd->dd->f_sdma_hw_clean_up(ppd);
+ break;
+ case qib_sdma_event_e70_go_idle:
+ ss->go_s99_running = 0;
+ break;
+ case qib_sdma_event_e7220_err_halted:
+ break;
+ case qib_sdma_event_e7322_err_halted:
+ break;
+ case qib_sdma_event_e90_timer_tick:
+ break;
+ }
+ break;
+
+ case qib_sdma_state_s99_running:
+ switch (event) {
+ case qib_sdma_event_e00_go_hw_down:
+ sdma_set_state(ppd, qib_sdma_state_s00_hw_down);
+ sdma_start_sw_clean_up(ppd);
+ break;
+ case qib_sdma_event_e10_go_hw_start:
+ break;
+ case qib_sdma_event_e20_hw_started:
+ break;
+ case qib_sdma_event_e30_go_running:
+ break;
+ case qib_sdma_event_e40_sw_cleaned:
+ break;
+ case qib_sdma_event_e50_hw_cleaned:
+ break;
+ case qib_sdma_event_e60_hw_halted:
+ sdma_set_state(ppd,
+ qib_sdma_state_s30_sw_clean_up_wait);
+ sdma_start_sw_clean_up(ppd);
+ break;
+ case qib_sdma_event_e70_go_idle:
+ sdma_set_state(ppd, qib_sdma_state_s50_hw_halt_wait);
+ ss->go_s99_running = 0;
+ break;
+ case qib_sdma_event_e7220_err_halted:
+ sdma_set_state(ppd,
+ qib_sdma_state_s30_sw_clean_up_wait);
+ sdma_start_sw_clean_up(ppd);
+ break;
+ case qib_sdma_event_e7322_err_halted:
+ sdma_set_state(ppd, qib_sdma_state_s50_hw_halt_wait);
+ break;
+ case qib_sdma_event_e90_timer_tick:
+ break;
+ }
+ break;
+ }
+
+ ss->last_event = event;
+}
diff --git a/drivers/infiniband/hw/qib/qib_srq.c b/drivers/infiniband/hw/qib/qib_srq.c
new file mode 100644
index 000000000..d6235931a
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_srq.c
@@ -0,0 +1,380 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include "qib_verbs.h"
+
+/**
+ * qib_post_srq_receive - post a receive on a shared receive queue
+ * @ibsrq: the SRQ to post the receive on
+ * @wr: the list of work requests to post
+ * @bad_wr: A pointer to the first WR to cause a problem is put here
+ *
+ * This may be called from interrupt context.
+ */
+int qib_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct qib_srq *srq = to_isrq(ibsrq);
+ struct qib_rwq *wq;
+ unsigned long flags;
+ int ret;
+
+ for (; wr; wr = wr->next) {
+ struct qib_rwqe *wqe;
+ u32 next;
+ int i;
+
+ if ((unsigned) wr->num_sge > srq->rq.max_sge) {
+ *bad_wr = wr;
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ spin_lock_irqsave(&srq->rq.lock, flags);
+ wq = srq->rq.wq;
+ next = wq->head + 1;
+ if (next >= srq->rq.size)
+ next = 0;
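+ /* the ring is full if advancing head would catch up with tail */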
+ if (next == wq->tail) {
+ spin_unlock_irqrestore(&srq->rq.lock, flags);
+ *bad_wr = wr;
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ wqe = get_rwqe_ptr(&srq->rq, wq->head);
+ wqe->wr_id = wr->wr_id;
+ wqe->num_sge = wr->num_sge;
+ for (i = 0; i < wr->num_sge; i++)
+ wqe->sg_list[i] = wr->sg_list[i];
+ /* Make sure queue entry is written before the head index. */
+ smp_wmb();
+ wq->head = next;
+ spin_unlock_irqrestore(&srq->rq.lock, flags);
+ }
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * qib_create_srq - create a shared receive queue
+ * @ibpd: the protection domain of the SRQ to create
+ * @srq_init_attr: the attributes of the SRQ
+ * @udata: data from libibverbs when creating a user SRQ
+ */
+struct ib_srq *qib_create_srq(struct ib_pd *ibpd,
+ struct ib_srq_init_attr *srq_init_attr,
+ struct ib_udata *udata)
+{
+ struct qib_ibdev *dev = to_idev(ibpd->device);
+ struct qib_srq *srq;
+ u32 sz;
+ struct ib_srq *ret;
+
+ if (srq_init_attr->srq_type != IB_SRQT_BASIC) {
+ ret = ERR_PTR(-ENOSYS);
+ goto done;
+ }
+
+ if (srq_init_attr->attr.max_sge == 0 ||
+ srq_init_attr->attr.max_sge > ib_qib_max_srq_sges ||
+ srq_init_attr->attr.max_wr == 0 ||
+ srq_init_attr->attr.max_wr > ib_qib_max_srq_wrs) {
+ ret = ERR_PTR(-EINVAL);
+ goto done;
+ }
+
+ srq = kmalloc(sizeof(*srq), GFP_KERNEL);
+ if (!srq) {
+ ret = ERR_PTR(-ENOMEM);
+ goto done;
+ }
+
+ /*
+ * Need to use vmalloc() if we want to support large #s of entries.
+ */
+ srq->rq.size = srq_init_attr->attr.max_wr + 1;
+ srq->rq.max_sge = srq_init_attr->attr.max_sge;
+ sz = sizeof(struct ib_sge) * srq->rq.max_sge +
+ sizeof(struct qib_rwqe);
+ srq->rq.wq = vmalloc_user(sizeof(struct qib_rwq) + srq->rq.size * sz);
+ if (!srq->rq.wq) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_srq;
+ }
+
+ /*
+ * Return the address of the RWQ as the offset to mmap.
+ * See qib_mmap() for details.
+ */
+ if (udata && udata->outlen >= sizeof(__u64)) {
+ int err;
+ u32 s = sizeof(struct qib_rwq) + srq->rq.size * sz;
+
+ srq->ip =
+ qib_create_mmap_info(dev, s, ibpd->uobject->context,
+ srq->rq.wq);
+ if (!srq->ip) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_wq;
+ }
+
+ err = ib_copy_to_udata(udata, &srq->ip->offset,
+ sizeof(srq->ip->offset));
+ if (err) {
+ ret = ERR_PTR(err);
+ goto bail_ip;
+ }
+ } else
+ srq->ip = NULL;
+
+ /*
+ * ib_create_srq() will initialize srq->ibsrq.
+ */
+ spin_lock_init(&srq->rq.lock);
+ srq->rq.wq->head = 0;
+ srq->rq.wq->tail = 0;
+ srq->limit = srq_init_attr->attr.srq_limit;
+
+ spin_lock(&dev->n_srqs_lock);
+ if (dev->n_srqs_allocated == ib_qib_max_srqs) {
+ spin_unlock(&dev->n_srqs_lock);
+ ret = ERR_PTR(-ENOMEM);
+ goto bail_ip;
+ }
+
+ dev->n_srqs_allocated++;
+ spin_unlock(&dev->n_srqs_lock);
+
+ if (srq->ip) {
+ spin_lock_irq(&dev->pending_lock);
+ list_add(&srq->ip->pending_mmaps, &dev->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+ }
+
+ ret = &srq->ibsrq;
+ goto done;
+
+bail_ip:
+ kfree(srq->ip);
+bail_wq:
+ vfree(srq->rq.wq);
+bail_srq:
+ kfree(srq);
+done:
+ return ret;
+}
+
+/**
+ * qib_modify_srq - modify a shared receive queue
+ * @ibsrq: the SRQ to modify
+ * @attr: the new attributes of the SRQ
+ * @attr_mask: indicates which attributes to modify
+ * @udata: user data for libibverbs.so
+ */
+int qib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask,
+ struct ib_udata *udata)
+{
+ struct qib_srq *srq = to_isrq(ibsrq);
+ struct qib_rwq *wq;
+ int ret = 0;
+
+ if (attr_mask & IB_SRQ_MAX_WR) {
+ struct qib_rwq *owq;
+ struct qib_rwqe *p;
+ u32 sz, size, n, head, tail;
+
+ /* Check that the requested sizes are below the limits. */
+ if ((attr->max_wr > ib_qib_max_srq_wrs) ||
+ ((attr_mask & IB_SRQ_LIMIT) ?
+ attr->srq_limit : srq->limit) > attr->max_wr) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ sz = sizeof(struct qib_rwqe) +
+ srq->rq.max_sge * sizeof(struct ib_sge);
+ size = attr->max_wr + 1;
+ wq = vmalloc_user(sizeof(struct qib_rwq) + size * sz);
+ if (!wq) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ /* Check that we can write the offset to mmap. */
+ if (udata && udata->inlen >= sizeof(__u64)) {
+ __u64 offset_addr;
+ __u64 offset = 0;
+
+ ret = ib_copy_from_udata(&offset_addr, udata,
+ sizeof(offset_addr));
+ if (ret)
+ goto bail_free;
+ udata->outbuf =
+ (void __user *) (unsigned long) offset_addr;
+ ret = ib_copy_to_udata(udata, &offset,
+ sizeof(offset));
+ if (ret)
+ goto bail_free;
+ }
+
+ spin_lock_irq(&srq->rq.lock);
+ /*
+ * validate head and tail pointer values and compute
+ * the number of remaining WQEs.
+ */
+ owq = srq->rq.wq;
+ head = owq->head;
+ tail = owq->tail;
+ if (head >= srq->rq.size || tail >= srq->rq.size) {
+ ret = -EINVAL;
+ goto bail_unlock;
+ }
+ n = head;
+ if (n < tail)
+ n += srq->rq.size - tail;
+ else
+ n -= tail;
+ if (size <= n) {
+ ret = -EINVAL;
+ goto bail_unlock;
+ }
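+ /* copy the outstanding WQEs from the old ring into the new one */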
+ n = 0;
+ p = wq->wq;
+ while (tail != head) {
+ struct qib_rwqe *wqe;
+ int i;
+
+ wqe = get_rwqe_ptr(&srq->rq, tail);
+ p->wr_id = wqe->wr_id;
+ p->num_sge = wqe->num_sge;
+ for (i = 0; i < wqe->num_sge; i++)
+ p->sg_list[i] = wqe->sg_list[i];
+ n++;
+ p = (struct qib_rwqe *)((char *) p + sz);
+ if (++tail >= srq->rq.size)
+ tail = 0;
+ }
+ srq->rq.wq = wq;
+ srq->rq.size = size;
+ wq->head = n;
+ wq->tail = 0;
+ if (attr_mask & IB_SRQ_LIMIT)
+ srq->limit = attr->srq_limit;
+ spin_unlock_irq(&srq->rq.lock);
+
+ vfree(owq);
+
+ if (srq->ip) {
+ struct qib_mmap_info *ip = srq->ip;
+ struct qib_ibdev *dev = to_idev(srq->ibsrq.device);
+ u32 s = sizeof(struct qib_rwq) + size * sz;
+
+ qib_update_mmap_info(dev, ip, s, wq);
+
+ /*
+ * Return the offset to mmap.
+ * See qib_mmap() for details.
+ */
+ if (udata && udata->inlen >= sizeof(__u64)) {
+ ret = ib_copy_to_udata(udata, &ip->offset,
+ sizeof(ip->offset));
+ if (ret)
+ goto bail;
+ }
+
+ /*
+ * Put user mapping info onto the pending list
+ * unless it already is on the list.
+ */
+ spin_lock_irq(&dev->pending_lock);
+ if (list_empty(&ip->pending_mmaps))
+ list_add(&ip->pending_mmaps,
+ &dev->pending_mmaps);
+ spin_unlock_irq(&dev->pending_lock);
+ }
+ } else if (attr_mask & IB_SRQ_LIMIT) {
+ spin_lock_irq(&srq->rq.lock);
+ if (attr->srq_limit >= srq->rq.size)
+ ret = -EINVAL;
+ else
+ srq->limit = attr->srq_limit;
+ spin_unlock_irq(&srq->rq.lock);
+ }
+ goto bail;
+
+bail_unlock:
+ spin_unlock_irq(&srq->rq.lock);
+bail_free:
+ vfree(wq);
+bail:
+ return ret;
+}
+
+int qib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr)
+{
+ struct qib_srq *srq = to_isrq(ibsrq);
+
+ attr->max_wr = srq->rq.size - 1;
+ attr->max_sge = srq->rq.max_sge;
+ attr->srq_limit = srq->limit;
+ return 0;
+}
+
+/**
+ * qib_destroy_srq - destroy a shared receive queue
+ * @ibsrq: the SRQ to destroy
+ */
+int qib_destroy_srq(struct ib_srq *ibsrq)
+{
+ struct qib_srq *srq = to_isrq(ibsrq);
+ struct qib_ibdev *dev = to_idev(ibsrq->device);
+
+ spin_lock(&dev->n_srqs_lock);
+ dev->n_srqs_allocated--;
+ spin_unlock(&dev->n_srqs_lock);
+ if (srq->ip)
+ kref_put(&srq->ip->ref, qib_release_mmap_info);
+ else
+ vfree(srq->rq.wq);
+ kfree(srq);
+
+ return 0;
+}
diff --git a/drivers/infiniband/hw/qib/qib_sysfs.c b/drivers/infiniband/hw/qib/qib_sysfs.c
new file mode 100644
index 000000000..81f56cdff
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_sysfs.c
@@ -0,0 +1,818 @@
+/*
+ * Copyright (c) 2012 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/ctype.h>
+
+#include "qib.h"
+#include "qib_mad.h"
+
+/* start of per-port functions */
+/*
+ * Get/Set heartbeat enable. OR of 1=enabled, 2=auto
+ */
+static ssize_t show_hrtbt_enb(struct qib_pportdata *ppd, char *buf)
+{
+ struct qib_devdata *dd = ppd->dd;
+ int ret;
+
+ ret = dd->f_get_ib_cfg(ppd, QIB_IB_CFG_HRTBT);
+ ret = scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+ return ret;
+}
+
+static ssize_t store_hrtbt_enb(struct qib_pportdata *ppd, const char *buf,
+ size_t count)
+{
+ struct qib_devdata *dd = ppd->dd;
+ int ret;
+ u16 val;
+
+ ret = kstrtou16(buf, 0, &val);
+ if (ret) {
+ qib_dev_err(dd, "attempt to set invalid Heartbeat enable\n");
+ return ret;
+ }
+
+ /*
+ * Set the "intentional" heartbeat enable per either of
+ * "Enable" and "Auto", as these are normally set together.
+ * This bit is consulted when leaving loopback mode,
+ * because entering loopback mode overrides it and automatically
+ * disables heartbeat.
+ */
+ ret = dd->f_set_ib_cfg(ppd, QIB_IB_CFG_HRTBT, val);
+ return ret < 0 ? ret : count;
+}
+
+static ssize_t store_loopback(struct qib_pportdata *ppd, const char *buf,
+ size_t count)
+{
+ struct qib_devdata *dd = ppd->dd;
+ int ret = count, r;
+
+ r = dd->f_set_ib_loopback(ppd, buf);
+ if (r < 0)
+ ret = r;
+
+ return ret;
+}
+
+static ssize_t store_led_override(struct qib_pportdata *ppd, const char *buf,
+ size_t count)
+{
+ struct qib_devdata *dd = ppd->dd;
+ int ret;
+ u16 val;
+
+ ret = kstrtou16(buf, 0, &val);
+ if (ret) {
+ qib_dev_err(dd, "attempt to set invalid LED override\n");
+ return ret;
+ }
+
+ qib_set_led_override(ppd, val);
+ return count;
+}
+
+static ssize_t show_status(struct qib_pportdata *ppd, char *buf)
+{
+ ssize_t ret;
+
+ if (!ppd->statusp)
+ ret = -EINVAL;
+ else
+ ret = scnprintf(buf, PAGE_SIZE, "0x%llx\n",
+ (unsigned long long) *(ppd->statusp));
+ return ret;
+}
+
+/*
+ * For userland compatibility, these offsets must remain fixed.
+ * They are strings for QIB_STATUS_*
+ */
+static const char * const qib_status_str[] = {
+ "Initted",
+ "",
+ "",
+ "",
+ "",
+ "Present",
+ "IB_link_up",
+ "IB_configured",
+ "",
+ "Fatal_Hardware_Error",
+ NULL,
+};
+
+static ssize_t show_status_str(struct qib_pportdata *ppd, char *buf)
+{
+ int i, any;
+ u64 s;
+ ssize_t ret;
+
+ if (!ppd->statusp) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ s = *(ppd->statusp);
+ *buf = '\0';
+ for (any = i = 0; s && qib_status_str[i]; i++) {
+ if (s & 1) {
+ /* if overflow */
+ if (any && strlcat(buf, " ", PAGE_SIZE) >= PAGE_SIZE)
+ break;
+ if (strlcat(buf, qib_status_str[i], PAGE_SIZE) >=
+ PAGE_SIZE)
+ break;
+ any = 1;
+ }
+ s >>= 1;
+ }
+ if (any)
+ strlcat(buf, "\n", PAGE_SIZE);
+
+ ret = strlen(buf);
+
+bail:
+ return ret;
+}
+
+/* end of per-port functions */
+
+/*
+ * Start of per-port file structures and support code.
+ * Because we are fitting into other infrastructure, we have to supply the
+ * full set of kobject/sysfs_ops structures and routines.
+ */
+#define QIB_PORT_ATTR(name, mode, show, store) \
+ static struct qib_port_attr qib_port_attr_##name = \
+ __ATTR(name, mode, show, store)
+
+struct qib_port_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct qib_pportdata *, char *);
+ ssize_t (*store)(struct qib_pportdata *, const char *, size_t);
+};
+
+QIB_PORT_ATTR(loopback, S_IWUSR, NULL, store_loopback);
+QIB_PORT_ATTR(led_override, S_IWUSR, NULL, store_led_override);
+QIB_PORT_ATTR(hrtbt_enable, S_IWUSR | S_IRUGO, show_hrtbt_enb,
+ store_hrtbt_enb);
+QIB_PORT_ATTR(status, S_IRUGO, show_status, NULL);
+QIB_PORT_ATTR(status_str, S_IRUGO, show_status_str, NULL);
+
+static struct attribute *port_default_attributes[] = {
+ &qib_port_attr_loopback.attr,
+ &qib_port_attr_led_override.attr,
+ &qib_port_attr_hrtbt_enable.attr,
+ &qib_port_attr_status.attr,
+ &qib_port_attr_status_str.attr,
+ NULL
+};
+
+/*
+ * Start of per-port congestion control structures and support code
+ */
+
+/*
+ * Congestion control table size followed by table entries
+ */
+static ssize_t read_cc_table_bin(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t pos, size_t count)
+{
+ int ret;
+ struct qib_pportdata *ppd =
+ container_of(kobj, struct qib_pportdata, pport_cc_kobj);
+
+ if (!qib_cc_table_size || !ppd->ccti_entries_shadow)
+ return -EINVAL;
+
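+ /* total bytes: the congestion table size (__be16) followed by the entries */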
+ ret = ppd->total_cct_entry * sizeof(struct ib_cc_table_entry_shadow)
+ + sizeof(__be16);
+
+ if (pos > ret)
+ return -EINVAL;
+
+ if (count > ret - pos)
+ count = ret - pos;
+
+ if (!count)
+ return count;
+
+ spin_lock(&ppd->cc_shadow_lock);
+ memcpy(buf, ppd->ccti_entries_shadow, count);
+ spin_unlock(&ppd->cc_shadow_lock);
+
+ return count;
+}
+
+static void qib_port_release(struct kobject *kobj)
+{
+ /* nothing to do since memory is freed by qib_free_devdata() */
+}
+
+static struct kobj_type qib_port_cc_ktype = {
+ .release = qib_port_release,
+};
+
+static struct bin_attribute cc_table_bin_attr = {
+ .attr = {.name = "cc_table_bin", .mode = 0444},
+ .read = read_cc_table_bin,
+ .size = PAGE_SIZE,
+};
+
+/*
+ * Congestion settings: port control, control map and an array of 16
+ * entries for the congestion entries - increase, timer, event log
+ * trigger threshold and the minimum injection rate delay.
+ */
+static ssize_t read_cc_setting_bin(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *bin_attr,
+ char *buf, loff_t pos, size_t count)
+{
+ int ret;
+ struct qib_pportdata *ppd =
+ container_of(kobj, struct qib_pportdata, pport_cc_kobj);
+
+ if (!qib_cc_table_size || !ppd->congestion_entries_shadow)
+ return -EINVAL;
+
+ ret = sizeof(struct ib_cc_congestion_setting_attr_shadow);
+
+ if (pos > ret)
+ return -EINVAL;
+ if (count > ret - pos)
+ count = ret - pos;
+
+ if (!count)
+ return count;
+
+ spin_lock(&ppd->cc_shadow_lock);
+ memcpy(buf, ppd->congestion_entries_shadow, count);
+ spin_unlock(&ppd->cc_shadow_lock);
+
+ return count;
+}
+
+static struct bin_attribute cc_setting_bin_attr = {
+ .attr = {.name = "cc_settings_bin", .mode = 0444},
+ .read = read_cc_setting_bin,
+ .size = PAGE_SIZE,
+};
+
+
+static ssize_t qib_portattr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct qib_port_attr *pattr =
+ container_of(attr, struct qib_port_attr, attr);
+ struct qib_pportdata *ppd =
+ container_of(kobj, struct qib_pportdata, pport_kobj);
+
+ return pattr->show(ppd, buf);
+}
+
+static ssize_t qib_portattr_store(struct kobject *kobj,
+ struct attribute *attr, const char *buf, size_t len)
+{
+ struct qib_port_attr *pattr =
+ container_of(attr, struct qib_port_attr, attr);
+ struct qib_pportdata *ppd =
+ container_of(kobj, struct qib_pportdata, pport_kobj);
+
+ return pattr->store(ppd, buf, len);
+}
+
+
+static const struct sysfs_ops qib_port_ops = {
+ .show = qib_portattr_show,
+ .store = qib_portattr_store,
+};
+
+static struct kobj_type qib_port_ktype = {
+ .release = qib_port_release,
+ .sysfs_ops = &qib_port_ops,
+ .default_attrs = port_default_attributes
+};
+
+/* Start sl2vl */
+
+#define QIB_SL2VL_ATTR(N) \
+ static struct qib_sl2vl_attr qib_sl2vl_attr_##N = { \
+ .attr = { .name = __stringify(N), .mode = 0444 }, \
+ .sl = N \
+ }
+
+struct qib_sl2vl_attr {
+ struct attribute attr;
+ int sl;
+};
+
+QIB_SL2VL_ATTR(0);
+QIB_SL2VL_ATTR(1);
+QIB_SL2VL_ATTR(2);
+QIB_SL2VL_ATTR(3);
+QIB_SL2VL_ATTR(4);
+QIB_SL2VL_ATTR(5);
+QIB_SL2VL_ATTR(6);
+QIB_SL2VL_ATTR(7);
+QIB_SL2VL_ATTR(8);
+QIB_SL2VL_ATTR(9);
+QIB_SL2VL_ATTR(10);
+QIB_SL2VL_ATTR(11);
+QIB_SL2VL_ATTR(12);
+QIB_SL2VL_ATTR(13);
+QIB_SL2VL_ATTR(14);
+QIB_SL2VL_ATTR(15);
+
+static struct attribute *sl2vl_default_attributes[] = {
+ &qib_sl2vl_attr_0.attr,
+ &qib_sl2vl_attr_1.attr,
+ &qib_sl2vl_attr_2.attr,
+ &qib_sl2vl_attr_3.attr,
+ &qib_sl2vl_attr_4.attr,
+ &qib_sl2vl_attr_5.attr,
+ &qib_sl2vl_attr_6.attr,
+ &qib_sl2vl_attr_7.attr,
+ &qib_sl2vl_attr_8.attr,
+ &qib_sl2vl_attr_9.attr,
+ &qib_sl2vl_attr_10.attr,
+ &qib_sl2vl_attr_11.attr,
+ &qib_sl2vl_attr_12.attr,
+ &qib_sl2vl_attr_13.attr,
+ &qib_sl2vl_attr_14.attr,
+ &qib_sl2vl_attr_15.attr,
+ NULL
+};
+
+static ssize_t sl2vl_attr_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct qib_sl2vl_attr *sattr =
+ container_of(attr, struct qib_sl2vl_attr, attr);
+ struct qib_pportdata *ppd =
+ container_of(kobj, struct qib_pportdata, sl2vl_kobj);
+ struct qib_ibport *qibp = &ppd->ibport_data;
+
+ return sprintf(buf, "%u\n", qibp->sl_to_vl[sattr->sl]);
+}
+
+static const struct sysfs_ops qib_sl2vl_ops = {
+ .show = sl2vl_attr_show,
+};
+
+static struct kobj_type qib_sl2vl_ktype = {
+ .release = qib_port_release,
+ .sysfs_ops = &qib_sl2vl_ops,
+ .default_attrs = sl2vl_default_attributes
+};
+
+/* End sl2vl */
+
+/* Start diag_counters */
+
+#define QIB_DIAGC_ATTR(N) \
+ static struct qib_diagc_attr qib_diagc_attr_##N = { \
+ .attr = { .name = __stringify(N), .mode = 0664 }, \
+ .counter = offsetof(struct qib_ibport, n_##N) \
+ }
+
+struct qib_diagc_attr {
+ struct attribute attr;
+ size_t counter;
+};
+
+QIB_DIAGC_ATTR(rc_resends);
+QIB_DIAGC_ATTR(rc_acks);
+QIB_DIAGC_ATTR(rc_qacks);
+QIB_DIAGC_ATTR(rc_delayed_comp);
+QIB_DIAGC_ATTR(seq_naks);
+QIB_DIAGC_ATTR(rdma_seq);
+QIB_DIAGC_ATTR(rnr_naks);
+QIB_DIAGC_ATTR(other_naks);
+QIB_DIAGC_ATTR(rc_timeouts);
+QIB_DIAGC_ATTR(loop_pkts);
+QIB_DIAGC_ATTR(pkt_drops);
+QIB_DIAGC_ATTR(dmawait);
+QIB_DIAGC_ATTR(unaligned);
+QIB_DIAGC_ATTR(rc_dupreq);
+QIB_DIAGC_ATTR(rc_seqnak);
+
+static struct attribute *diagc_default_attributes[] = {
+ &qib_diagc_attr_rc_resends.attr,
+ &qib_diagc_attr_rc_acks.attr,
+ &qib_diagc_attr_rc_qacks.attr,
+ &qib_diagc_attr_rc_delayed_comp.attr,
+ &qib_diagc_attr_seq_naks.attr,
+ &qib_diagc_attr_rdma_seq.attr,
+ &qib_diagc_attr_rnr_naks.attr,
+ &qib_diagc_attr_other_naks.attr,
+ &qib_diagc_attr_rc_timeouts.attr,
+ &qib_diagc_attr_loop_pkts.attr,
+ &qib_diagc_attr_pkt_drops.attr,
+ &qib_diagc_attr_dmawait.attr,
+ &qib_diagc_attr_unaligned.attr,
+ &qib_diagc_attr_rc_dupreq.attr,
+ &qib_diagc_attr_rc_seqnak.attr,
+ NULL
+};
+
+static ssize_t diagc_attr_show(struct kobject *kobj, struct attribute *attr,
+ char *buf)
+{
+ struct qib_diagc_attr *dattr =
+ container_of(attr, struct qib_diagc_attr, attr);
+ struct qib_pportdata *ppd =
+ container_of(kobj, struct qib_pportdata, diagc_kobj);
+ struct qib_ibport *qibp = &ppd->ibport_data;
+
+ return sprintf(buf, "%u\n", *(u32 *)((char *)qibp + dattr->counter));
+}
+
+static ssize_t diagc_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t size)
+{
+ struct qib_diagc_attr *dattr =
+ container_of(attr, struct qib_diagc_attr, attr);
+ struct qib_pportdata *ppd =
+ container_of(kobj, struct qib_pportdata, diagc_kobj);
+ struct qib_ibport *qibp = &ppd->ibport_data;
+ u32 val;
+ int ret;
+
+ ret = kstrtou32(buf, 0, &val);
+ if (ret)
+ return ret;
+ *(u32 *)((char *) qibp + dattr->counter) = val;
+ return size;
+}
+
+static const struct sysfs_ops qib_diagc_ops = {
+ .show = diagc_attr_show,
+ .store = diagc_attr_store,
+};
+
+static struct kobj_type qib_diagc_ktype = {
+ .release = qib_port_release,
+ .sysfs_ops = &qib_diagc_ops,
+ .default_attrs = diagc_default_attributes
+};
+
+/* End diag_counters */
+
+/* end of per-port file structures and support code */
+
+/*
+ * Start of per-unit (or driver, in some cases, but replicated
+ * per unit) functions (these get a device *)
+ */
+static ssize_t show_rev(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct qib_ibdev *dev =
+ container_of(device, struct qib_ibdev, ibdev.dev);
+
+ return sprintf(buf, "%x\n", dd_from_dev(dev)->minrev);
+}
+
+static ssize_t show_hca(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct qib_ibdev *dev =
+ container_of(device, struct qib_ibdev, ibdev.dev);
+ struct qib_devdata *dd = dd_from_dev(dev);
+ int ret;
+
+ if (!dd->boardname)
+ ret = -EINVAL;
+ else
+ ret = scnprintf(buf, PAGE_SIZE, "%s\n", dd->boardname);
+ return ret;
+}
+
+static ssize_t show_version(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ /* The string printed here is already newline-terminated. */
+ return scnprintf(buf, PAGE_SIZE, "%s", (char *)ib_qib_version);
+}
+
+static ssize_t show_boardversion(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct qib_ibdev *dev =
+ container_of(device, struct qib_ibdev, ibdev.dev);
+ struct qib_devdata *dd = dd_from_dev(dev);
+
+ /* The string printed here is already newline-terminated. */
+ return scnprintf(buf, PAGE_SIZE, "%s", dd->boardversion);
+}
+
+
+static ssize_t show_localbus_info(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct qib_ibdev *dev =
+ container_of(device, struct qib_ibdev, ibdev.dev);
+ struct qib_devdata *dd = dd_from_dev(dev);
+
+ /* The string printed here is already newline-terminated. */
+ return scnprintf(buf, PAGE_SIZE, "%s", dd->lbus_info);
+}
+
+
+static ssize_t show_nctxts(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct qib_ibdev *dev =
+ container_of(device, struct qib_ibdev, ibdev.dev);
+ struct qib_devdata *dd = dd_from_dev(dev);
+
+ /* Return the number of user ports (contexts) available. */
+ /* The calculation below deals with a special case where
+ * cfgctxts is set to 1 on a single-port board. */
+ return scnprintf(buf, PAGE_SIZE, "%u\n",
+ (dd->first_user_ctxt > dd->cfgctxts) ? 0 :
+ (dd->cfgctxts - dd->first_user_ctxt));
+}
+
+static ssize_t show_nfreectxts(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct qib_ibdev *dev =
+ container_of(device, struct qib_ibdev, ibdev.dev);
+ struct qib_devdata *dd = dd_from_dev(dev);
+
+ /* Return the number of free user ports (contexts) available. */
+ return scnprintf(buf, PAGE_SIZE, "%u\n", dd->freectxts);
+}
+
+static ssize_t show_serial(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct qib_ibdev *dev =
+ container_of(device, struct qib_ibdev, ibdev.dev);
+ struct qib_devdata *dd = dd_from_dev(dev);
+
+ buf[sizeof(dd->serial)] = '\0';
+ memcpy(buf, dd->serial, sizeof(dd->serial));
+ strcat(buf, "\n");
+ return strlen(buf);
+}
+
+static ssize_t store_chip_reset(struct device *device,
+ struct device_attribute *attr, const char *buf,
+ size_t count)
+{
+ struct qib_ibdev *dev =
+ container_of(device, struct qib_ibdev, ibdev.dev);
+ struct qib_devdata *dd = dd_from_dev(dev);
+ int ret;
+
+ if (count < 5 || memcmp(buf, "reset", 5) || !dd->diag_client) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ ret = qib_reset_device(dd->unit);
+bail:
+ return ret < 0 ? ret : count;
+}
+
+/*
+ * Dump the tempsense registers in decimal, to ease shell-scripting.
+ */
+static ssize_t show_tempsense(struct device *device,
+ struct device_attribute *attr, char *buf)
+{
+ struct qib_ibdev *dev =
+ container_of(device, struct qib_ibdev, ibdev.dev);
+ struct qib_devdata *dd = dd_from_dev(dev);
+ int ret;
+ int idx;
+ u8 regvals[8];
+
+ ret = -ENXIO;
+ for (idx = 0; idx < 8; ++idx) {
+ if (idx == 6)
+ continue;
+ ret = dd->f_tempsense_rd(dd, idx);
+ if (ret < 0)
+ break;
+ regvals[idx] = ret;
+ }
+ if (idx == 8)
+ ret = scnprintf(buf, PAGE_SIZE, "%d %d %02X %02X %d %d\n",
+ *(signed char *)(regvals),
+ *(signed char *)(regvals + 1),
+ regvals[2], regvals[3],
+ *(signed char *)(regvals + 5),
+ *(signed char *)(regvals + 7));
+ return ret;
+}
+
+/*
+ * end of per-unit (or driver, in some cases, but replicated
+ * per unit) functions
+ */
+
+/* start of per-unit file structures and support code */
+static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL);
+static DEVICE_ATTR(hca_type, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, show_hca, NULL);
+static DEVICE_ATTR(version, S_IRUGO, show_version, NULL);
+static DEVICE_ATTR(nctxts, S_IRUGO, show_nctxts, NULL);
+static DEVICE_ATTR(nfreectxts, S_IRUGO, show_nfreectxts, NULL);
+static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL);
+static DEVICE_ATTR(boardversion, S_IRUGO, show_boardversion, NULL);
+static DEVICE_ATTR(tempsense, S_IRUGO, show_tempsense, NULL);
+static DEVICE_ATTR(localbus_info, S_IRUGO, show_localbus_info, NULL);
+static DEVICE_ATTR(chip_reset, S_IWUSR, NULL, store_chip_reset);
+
+static struct device_attribute *qib_attributes[] = {
+ &dev_attr_hw_rev,
+ &dev_attr_hca_type,
+ &dev_attr_board_id,
+ &dev_attr_version,
+ &dev_attr_nctxts,
+ &dev_attr_nfreectxts,
+ &dev_attr_serial,
+ &dev_attr_boardversion,
+ &dev_attr_tempsense,
+ &dev_attr_localbus_info,
+ &dev_attr_chip_reset,
+};
+
+int qib_create_port_files(struct ib_device *ibdev, u8 port_num,
+ struct kobject *kobj)
+{
+ struct qib_pportdata *ppd;
+ struct qib_devdata *dd = dd_from_ibdev(ibdev);
+ int ret;
+
+ if (!port_num || port_num > dd->num_pports) {
+ qib_dev_err(dd,
+ "Skipping infiniband class with invalid port %u\n",
+ port_num);
+ ret = -ENODEV;
+ goto bail;
+ }
+ ppd = &dd->pport[port_num - 1];
+
+ ret = kobject_init_and_add(&ppd->pport_kobj, &qib_port_ktype, kobj,
+ "linkcontrol");
+ if (ret) {
+ qib_dev_err(dd,
+ "Skipping linkcontrol sysfs info, (err %d) port %u\n",
+ ret, port_num);
+ goto bail;
+ }
+ kobject_uevent(&ppd->pport_kobj, KOBJ_ADD);
+
+ ret = kobject_init_and_add(&ppd->sl2vl_kobj, &qib_sl2vl_ktype, kobj,
+ "sl2vl");
+ if (ret) {
+ qib_dev_err(dd,
+ "Skipping sl2vl sysfs info, (err %d) port %u\n",
+ ret, port_num);
+ goto bail_link;
+ }
+ kobject_uevent(&ppd->sl2vl_kobj, KOBJ_ADD);
+
+ ret = kobject_init_and_add(&ppd->diagc_kobj, &qib_diagc_ktype, kobj,
+ "diag_counters");
+ if (ret) {
+ qib_dev_err(dd,
+ "Skipping diag_counters sysfs info, (err %d) port %u\n",
+ ret, port_num);
+ goto bail_sl;
+ }
+ kobject_uevent(&ppd->diagc_kobj, KOBJ_ADD);
+
+ if (!qib_cc_table_size || !ppd->congestion_entries_shadow)
+ return 0;
+
+ ret = kobject_init_and_add(&ppd->pport_cc_kobj, &qib_port_cc_ktype,
+ kobj, "CCMgtA");
+ if (ret) {
+ qib_dev_err(dd,
+ "Skipping Congestion Control sysfs info, (err %d) port %u\n",
+ ret, port_num);
+ goto bail_diagc;
+ }
+
+ kobject_uevent(&ppd->pport_cc_kobj, KOBJ_ADD);
+
+ ret = sysfs_create_bin_file(&ppd->pport_cc_kobj,
+ &cc_setting_bin_attr);
+ if (ret) {
+ qib_dev_err(dd,
+ "Skipping Congestion Control setting sysfs info, (err %d) port %u\n",
+ ret, port_num);
+ goto bail_cc;
+ }
+
+ ret = sysfs_create_bin_file(&ppd->pport_cc_kobj,
+ &cc_table_bin_attr);
+ if (ret) {
+ qib_dev_err(dd,
+ "Skipping Congestion Control table sysfs info, (err %d) port %u\n",
+ ret, port_num);
+ goto bail_cc_entry_bin;
+ }
+
+ qib_devinfo(dd->pcidev,
+ "IB%u: Congestion Control Agent enabled for port %d\n",
+ dd->unit, port_num);
+
+ return 0;
+
+bail_cc_entry_bin:
+ sysfs_remove_bin_file(&ppd->pport_cc_kobj, &cc_setting_bin_attr);
+bail_cc:
+ kobject_put(&ppd->pport_cc_kobj);
+bail_diagc:
+ kobject_put(&ppd->diagc_kobj);
+bail_sl:
+ kobject_put(&ppd->sl2vl_kobj);
+bail_link:
+ kobject_put(&ppd->pport_kobj);
+bail:
+ return ret;
+}
+
+/*
+ * Register and create our files in /sys/class/infiniband.
+ */
+int qib_verbs_register_sysfs(struct qib_devdata *dd)
+{
+ struct ib_device *dev = &dd->verbs_dev.ibdev;
+ int i, ret;
+
+ for (i = 0; i < ARRAY_SIZE(qib_attributes); ++i) {
+ ret = device_create_file(&dev->dev, qib_attributes[i]);
+ if (ret)
+ goto bail;
+ }
+
+ return 0;
+bail:
+ /* unwind only the attributes that were successfully created */
+ while (--i >= 0)
+ device_remove_file(&dev->dev, qib_attributes[i]);
+ return ret;
+}
+
+/*
+ * Unregister and remove our files in /sys/class/infiniband.
+ */
+void qib_verbs_unregister_sysfs(struct qib_devdata *dd)
+{
+ struct qib_pportdata *ppd;
+ int i;
+
+ for (i = 0; i < dd->num_pports; i++) {
+ ppd = &dd->pport[i];
+ if (qib_cc_table_size &&
+ ppd->congestion_entries_shadow) {
+ sysfs_remove_bin_file(&ppd->pport_cc_kobj,
+ &cc_setting_bin_attr);
+ sysfs_remove_bin_file(&ppd->pport_cc_kobj,
+ &cc_table_bin_attr);
+ kobject_put(&ppd->pport_cc_kobj);
+ }
+ kobject_put(&ppd->sl2vl_kobj);
+ kobject_put(&ppd->pport_kobj);
+ }
+}
diff --git a/drivers/infiniband/hw/qib/qib_twsi.c b/drivers/infiniband/hw/qib/qib_twsi.c
new file mode 100644
index 000000000..f56986644
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_twsi.c
@@ -0,0 +1,501 @@
+/*
+ * Copyright (c) 2012 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/delay.h>
+#include <linux/pci.h>
+#include <linux/vmalloc.h>
+
+#include "qib.h"
+
+/*
+ * QLogic_IB "Two Wire Serial Interface" driver.
+ * Originally written for a not-quite-i2c serial eeprom, which is
+ * still used on some supported boards. Later boards have added a
+ * variety of other uses, most board-specific, so the bit-boffing
+ * part has been split off to this file, while the other parts
+ * have been moved to chip-specific files.
+ *
+ * We have also dropped all pretense of a fully generic interface
+ * (e.g. pretending we don't know whether '1' is the higher voltage),
+ * as the restrictions of the generic i2c interface (e.g. no access
+ * from the driver itself) make it unsuitable for this use.
+ */
+
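+/*
+ * READ_CMD/WRITE_CMD are OR'd into the low bit of the device address
+ * byte (or, for the legacy QIB_TWSI_NO_DEV part, into the shifted
+ * offset) to select the transfer direction, as in standard I2C.
+ */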
+#define READ_CMD 1
+#define WRITE_CMD 0
+
+/**
+ * i2c_wait_for_writes - wait for a write
+ * @dd: the qlogic_ib device
+ *
+ * We use this instead of udelay directly, so we can make sure
+ * that previous register writes have been flushed all the way
+ * to the chip. Since we are delaying anyway, the cost doesn't
+ * hurt, and it makes the bit twiddling more regular.
+ */
+static void i2c_wait_for_writes(struct qib_devdata *dd)
+{
+ /*
+ * implicit read of EXTStatus is as good as explicit
+ * read of scratch, if all we want to do is flush
+ * writes.
+ */
+ dd->f_gpio_mod(dd, 0, 0, 0);
+ rmb(); /* inlined, so prevent compiler reordering */
+}
+
+/*
+ * QSFP modules are allowed to hold SCL low for 500uSec. Allow twice that
+ * for "almost compliant" modules
+ */
+#define SCL_WAIT_USEC 1000
+
+/* TWSI_BUF_WAIT_USEC is the time the bus must be free between a STOP or ACK
+ * and the next START. Should be 20, but some chips need more.
+ */
+#define TWSI_BUF_WAIT_USEC 60
+
+static void scl_out(struct qib_devdata *dd, u8 bit)
+{
+ u32 mask;
+
+ udelay(1);
+
+ mask = 1UL << dd->gpio_scl_num;
+
+ /* SCL is meant to be open-drain, so never set "OUT", just DIR */
+ dd->f_gpio_mod(dd, 0, bit ? 0 : mask, mask);
+
+ /*
+ * Allow for slow slaves by simple
+ * delay for falling edge, sampling on rise.
+ */
+ if (!bit)
+ udelay(2);
+ else {
+ int rise_usec;
+
+ for (rise_usec = SCL_WAIT_USEC; rise_usec > 0; rise_usec -= 2) {
+ if (mask & dd->f_gpio_mod(dd, 0, 0, 0))
+ break;
+ udelay(2);
+ }
+ if (rise_usec <= 0)
+ qib_dev_err(dd, "SCL interface stuck low > %d uSec\n",
+ SCL_WAIT_USEC);
+ }
+ i2c_wait_for_writes(dd);
+}
+
+static void sda_out(struct qib_devdata *dd, u8 bit)
+{
+ u32 mask;
+
+ mask = 1UL << dd->gpio_sda_num;
+
+ /* SDA is meant to be open-drain, so never set "OUT", just DIR */
+ dd->f_gpio_mod(dd, 0, bit ? 0 : mask, mask);
+
+ i2c_wait_for_writes(dd);
+ udelay(2);
+}
+
+static u8 sda_in(struct qib_devdata *dd, int wait)
+{
+ int bnum;
+ u32 read_val, mask;
+
+ bnum = dd->gpio_sda_num;
+ mask = (1UL << bnum);
+ /* SDA is meant to be open-drain, so never set "OUT", just DIR */
+ dd->f_gpio_mod(dd, 0, 0, mask);
+ read_val = dd->f_gpio_mod(dd, 0, 0, 0);
+ if (wait)
+ i2c_wait_for_writes(dd);
+ return (read_val & mask) >> bnum;
+}
+
+/**
+ * i2c_ackrcv - see if ack following write is true
+ * @dd: the qlogic_ib device
+ */
+static int i2c_ackrcv(struct qib_devdata *dd)
+{
+ u8 ack_received;
+
+ /* AT ENTRY SCL = LOW */
+ /* change direction, ignore data */
+ ack_received = sda_in(dd, 1);
+ scl_out(dd, 1);
+ ack_received = sda_in(dd, 1) == 0;
+ scl_out(dd, 0);
+ return ack_received;
+}
+
+static void stop_cmd(struct qib_devdata *dd);
+
+/**
+ * rd_byte - read a byte, sending STOP on last, else ACK
+ * @dd: the qlogic_ib device
+ * @last: nonzero to end the read with a STOP, else ACK to keep reading
+ *
+ * Returns byte shifted out of device
+ */
+static int rd_byte(struct qib_devdata *dd, int last)
+{
+ int bit_cntr, data;
+
+ data = 0;
+
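+ /*
+ * Clock in 8 bits, MSB first: raise SCL, sample SDA while SCL is
+ * high, then drop SCL again.
+ */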
+ for (bit_cntr = 7; bit_cntr >= 0; --bit_cntr) {
+ data <<= 1;
+ scl_out(dd, 1);
+ data |= sda_in(dd, 0);
+ scl_out(dd, 0);
+ }
+ if (last) {
+ scl_out(dd, 1);
+ stop_cmd(dd);
+ } else {
+ sda_out(dd, 0);
+ scl_out(dd, 1);
+ scl_out(dd, 0);
+ sda_out(dd, 1);
+ }
+ return data;
+}
+
+/**
+ * wr_byte - write a byte, one bit at a time
+ * @dd: the qlogic_ib device
+ * @data: the byte to write
+ *
+ * Returns 0 if we got the following ack, otherwise 1
+ */
+static int wr_byte(struct qib_devdata *dd, u8 data)
+{
+ int bit_cntr;
+ u8 bit;
+
+ for (bit_cntr = 7; bit_cntr >= 0; bit_cntr--) {
+ bit = (data >> bit_cntr) & 1;
+ sda_out(dd, bit);
+ scl_out(dd, 1);
+ scl_out(dd, 0);
+ }
+ return (!i2c_ackrcv(dd)) ? 1 : 0;
+}
+
+/*
+ * issue TWSI start sequence:
+ * (both clock/data high, clock high, data low while clock is high)
+ */
+static void start_seq(struct qib_devdata *dd)
+{
+ sda_out(dd, 1);
+ scl_out(dd, 1);
+ sda_out(dd, 0);
+ udelay(1);
+ scl_out(dd, 0);
+}
+
+/**
+ * stop_seq - transmit the stop sequence
+ * @dd: the qlogic_ib device
+ *
+ * (both clock/data low, clock high, data high while clock is high)
+ */
+static void stop_seq(struct qib_devdata *dd)
+{
+ scl_out(dd, 0);
+ sda_out(dd, 0);
+ scl_out(dd, 1);
+ sda_out(dd, 1);
+}
+
+/**
+ * stop_cmd - transmit the stop condition
+ * @dd: the qlogic_ib device
+ *
+ * (both clock/data low, clock high, data high while clock is high)
+ */
+static void stop_cmd(struct qib_devdata *dd)
+{
+ stop_seq(dd);
+ udelay(TWSI_BUF_WAIT_USEC);
+}
+
+/**
+ * qib_twsi_reset - reset I2C communication
+ * @dd: the qlogic_ib device
+ */
+
+int qib_twsi_reset(struct qib_devdata *dd)
+{
+ int clock_cycles_left = 9;
+ int was_high = 0;
+ u32 pins, mask;
+
+ /* Both SCL and SDA should be high. If not, there
+ * is something wrong.
+ */
+ mask = (1UL << dd->gpio_scl_num) | (1UL << dd->gpio_sda_num);
+
+ /*
+ * Force pins to desired innocuous state.
+ * This is the default power-on state with out=0 and dir=0,
+ * so the pins are tri-stated and should float high (barring HW problems).
+ */
+ dd->f_gpio_mod(dd, 0, 0, mask);
+
+ /*
+ * Clock nine times to get all listeners into a sane state.
+ * If SDA does not go high at any point, we are wedged.
+ * One vendor recommends then issuing START followed by STOP.
+ * We cannot use our "normal" functions to do that, because
+ * if SCL drops between them, another vendor's part will
+ * wedge, dropping SDA and keeping it low forever, at the end of
+ * the next transaction (even if it was not the device addressed).
+ * So our START and STOP take place with SCL held high.
+ */
+ while (clock_cycles_left--) {
+ scl_out(dd, 0);
+ scl_out(dd, 1);
+ /* Note if SDA is high, but keep clocking to sync slave */
+ was_high |= sda_in(dd, 0);
+ }
+
+ if (was_high) {
+ /*
+ * We saw a high, which we hope means the slave is sync'd.
+ * Issue START, STOP, pause for T_BUF.
+ */
+
+ pins = dd->f_gpio_mod(dd, 0, 0, 0);
+ if ((pins & mask) != mask)
+ qib_dev_err(dd, "GPIO pins not at rest: %d\n",
+ pins & mask);
+ /* Drop SDA to issue START */
+ udelay(1); /* Guarantee .6 uSec setup */
+ sda_out(dd, 0);
+ udelay(1); /* Guarantee .6 uSec hold */
+ /* At this point, SCL is high, SDA low. Raise SDA for STOP */
+ sda_out(dd, 1);
+ udelay(TWSI_BUF_WAIT_USEC);
+ }
+
+ return !was_high;
+}
+
+#define QIB_TWSI_START 0x100
+#define QIB_TWSI_STOP 0x200
+
+/* Write byte to TWSI, optionally prefixed with START or suffixed with
+ * STOP.
+ * returns 0 if OK (ACK received), else != 0
+ */
+static int qib_twsi_wr(struct qib_devdata *dd, int data, int flags)
+{
+ int ret = 1;
+
+ if (flags & QIB_TWSI_START)
+ start_seq(dd);
+
+ ret = wr_byte(dd, data); /* Leaves SCL low (from i2c_ackrcv()) */
+
+ if (flags & QIB_TWSI_STOP)
+ stop_cmd(dd);
+ return ret;
+}
+
+/* Added functionality for IBA7220-based cards */
+#define QIB_TEMP_DEV 0x98
+
+/*
+ * qib_twsi_blk_rd
+ * Formerly called qib_eeprom_internal_read, and only used for eeprom,
+ * but now the general interface for data transfer from twsi devices.
+ * One vestige of its former role is that it recognizes a device
+ * QIB_TWSI_NO_DEV and does the correct operation for the legacy part,
+ * which responded to all TWSI device codes, interpreting them as
+ * address within device. On all other devices found on boards handled by
+ * this driver, the device is followed by a one-byte "address" which selects
+ * the "register" or "offset" within the device from which data should
+ * be read.
+ */
+int qib_twsi_blk_rd(struct qib_devdata *dd, int dev, int addr,
+ void *buffer, int len)
+{
+ int ret;
+ u8 *bp = buffer;
+
+ ret = 1;
+
+ if (dev == QIB_TWSI_NO_DEV) {
+ /* legacy not-really-I2C */
+ addr = (addr << 1) | READ_CMD;
+ ret = qib_twsi_wr(dd, addr, QIB_TWSI_START);
+ } else {
+ /* Actual I2C */
+ ret = qib_twsi_wr(dd, dev | WRITE_CMD, QIB_TWSI_START);
+ if (ret) {
+ stop_cmd(dd);
+ ret = 1;
+ goto bail;
+ }
+ /*
+ * SFF spec claims we do _not_ stop after the addr
+ * but simply issue a start with the "read" dev-addr.
+ * Since we are implicitly waiting for ACK here,
+ * we need t_buf (nominally 20uSec) before that start,
+ * and cannot rely on the delay built into the STOP.
+ */
+ ret = qib_twsi_wr(dd, addr, 0);
+ udelay(TWSI_BUF_WAIT_USEC);
+
+ if (ret) {
+ qib_dev_err(dd,
+ "Failed to write interface read addr %02X\n",
+ addr);
+ ret = 1;
+ goto bail;
+ }
+ ret = qib_twsi_wr(dd, dev | READ_CMD, QIB_TWSI_START);
+ }
+ if (ret) {
+ stop_cmd(dd);
+ ret = 1;
+ goto bail;
+ }
+
+ /*
+ * Block devices keep clocking data out as long as we ACK,
+ * automatically incrementing the address. Some have "pages"
+ * whose boundaries will not be crossed, but the handling
+ * of these is left to the caller, who is in a better
+ * position to know.
+ */
+ while (len-- > 0) {
+ /*
+ * Get and store data, sending ACK if length remaining,
+ * else STOP
+ */
+ *bp++ = rd_byte(dd, !len);
+ }
+
+ ret = 0;
+
+bail:
+ return ret;
+}
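+
+/*
+ * Illustrative use only (the device address and length here are just
+ * examples, not values the driver requires):
+ *
+ * u8 buf[16];
+ *
+ * if (qib_twsi_blk_rd(dd, 0xA0, 0, buf, sizeof(buf)))
+ * qib_dev_err(dd, "TWSI block read failed\n");
+ */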
+
+/*
+ * qib_twsi_blk_wr
+ * Formerly called qib_eeprom_internal_write, and only used for eeprom,
+ * but now the general interface for data transfer to twsi devices.
+ * One vestige of its former role is that it recognizes a device
+ * QIB_TWSI_NO_DEV and does the correct operation for the legacy part,
+ * which responded to all TWSI device codes, interpreting them as
+ * address within device. On all other devices found on boards handled by
+ * this driver, the device is followed by a one-byte "address" which selects
+ * the "register" or "offset" within the device to which data should
+ * be written.
+ */
+int qib_twsi_blk_wr(struct qib_devdata *dd, int dev, int addr,
+ const void *buffer, int len)
+{
+ int sub_len;
+ const u8 *bp = buffer;
+ int max_wait_time, i;
+ int ret = 1;
+
+ while (len > 0) {
+ if (dev == QIB_TWSI_NO_DEV) {
+ if (qib_twsi_wr(dd, (addr << 1) | WRITE_CMD,
+ QIB_TWSI_START)) {
+ goto failed_write;
+ }
+ } else {
+ /* Real I2C */
+ if (qib_twsi_wr(dd, dev | WRITE_CMD, QIB_TWSI_START))
+ goto failed_write;
+ ret = qib_twsi_wr(dd, addr, 0);
+ if (ret) {
+ qib_dev_err(dd,
+ "Failed to write interface write addr %02X\n",
+ addr);
+ goto failed_write;
+ }
+ }
+
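+ /*
+ * Write at most 4 bytes per transaction, presumably to stay within
+ * the smallest write-page size of the parts being driven.
+ */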
+ sub_len = min(len, 4);
+ addr += sub_len;
+ len -= sub_len;
+
+ for (i = 0; i < sub_len; i++)
+ if (qib_twsi_wr(dd, *bp++, 0))
+ goto failed_write;
+
+ stop_cmd(dd);
+
+ /*
+ * Wait for the write to complete by waiting for a successful
+ * read (the chip replies with a zero after the write
+ * cmd completes, and before it writes to the eeprom).
+ * The startcmd for the read will fail the ack until
+ * the writes have completed. We do this inline to avoid
+ * the debug prints that are in the real read routine
+ * if the startcmd fails.
+ * We also use the proper device address, so it doesn't matter
+ * whether we have real eeprom_dev. Legacy likes any address.
+ */
+ max_wait_time = 100;
+ while (qib_twsi_wr(dd, dev | READ_CMD, QIB_TWSI_START)) {
+ stop_cmd(dd);
+ if (!--max_wait_time)
+ goto failed_write;
+ }
+ /* now read (and ignore) the resulting byte */
+ rd_byte(dd, 1);
+ }
+
+ ret = 0;
+ goto bail;
+
+failed_write:
+ stop_cmd(dd);
+ ret = 1;
+
+bail:
+ return ret;
+}
diff --git a/drivers/infiniband/hw/qib/qib_tx.c b/drivers/infiniband/hw/qib/qib_tx.c
new file mode 100644
index 000000000..eface3b3d
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_tx.c
@@ -0,0 +1,572 @@
+/*
+ * Copyright (c) 2008, 2009, 2010 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/pci.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+#include <linux/netdevice.h>
+#include <linux/vmalloc.h>
+#include <linux/moduleparam.h>
+
+#include "qib.h"
+
+static unsigned qib_hol_timeout_ms = 3000;
+module_param_named(hol_timeout_ms, qib_hol_timeout_ms, uint, S_IRUGO);
+MODULE_PARM_DESC(hol_timeout_ms,
+ "duration of user app suspension after link failure");
+
+unsigned qib_sdma_fetch_arb = 1;
+module_param_named(fetch_arb, qib_sdma_fetch_arb, uint, S_IRUGO);
+MODULE_PARM_DESC(fetch_arb, "IBA7220: change SDMA descriptor arbitration");
+
+/**
+ * qib_disarm_piobufs - cancel a range of PIO buffers
+ * @dd: the qlogic_ib device
+ * @first: the first PIO buffer to cancel
+ * @cnt: the number of PIO buffers to cancel
+ *
+ * Cancel a range of PIO buffers. Used at user process close,
+ * in case it died while writing to a PIO buffer.
+ */
+void qib_disarm_piobufs(struct qib_devdata *dd, unsigned first, unsigned cnt)
+{
+ unsigned long flags;
+ unsigned i;
+ unsigned last;
+
+ last = first + cnt;
+ spin_lock_irqsave(&dd->pioavail_lock, flags);
+ for (i = first; i < last; i++) {
+ __clear_bit(i, dd->pio_need_disarm);
+ dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(i));
+ }
+ spin_unlock_irqrestore(&dd->pioavail_lock, flags);
+}
+
+/*
+ * This is called by a user process when it sees the DISARM_BUFS event
+ * bit is set.
+ */
+int qib_disarm_piobufs_ifneeded(struct qib_ctxtdata *rcd)
+{
+ struct qib_devdata *dd = rcd->dd;
+ unsigned i;
+ unsigned last;
+ unsigned n = 0;
+
+ last = rcd->pio_base + rcd->piocnt;
+ /*
+ * Don't need uctxt_lock here, since user has called in to us.
+ * Clear at start in case more interrupts set bits while we
+ * are disarming
+ */
+ if (rcd->user_event_mask) {
+ /*
+ * subctxt_cnt is 0 if not shared, so do base
+ * separately, first, then remaining subctxt, if any
+ */
+ clear_bit(_QIB_EVENT_DISARM_BUFS_BIT, &rcd->user_event_mask[0]);
+ for (i = 1; i < rcd->subctxt_cnt; i++)
+ clear_bit(_QIB_EVENT_DISARM_BUFS_BIT,
+ &rcd->user_event_mask[i]);
+ }
+ spin_lock_irq(&dd->pioavail_lock);
+ for (i = rcd->pio_base; i < last; i++) {
+ if (__test_and_clear_bit(i, dd->pio_need_disarm)) {
+ n++;
+ dd->f_sendctrl(rcd->ppd, QIB_SENDCTRL_DISARM_BUF(i));
+ }
+ }
+ spin_unlock_irq(&dd->pioavail_lock);
+ return 0;
+}
+
+static struct qib_pportdata *is_sdma_buf(struct qib_devdata *dd, unsigned i)
+{
+ struct qib_pportdata *ppd;
+ unsigned pidx;
+
+ for (pidx = 0; pidx < dd->num_pports; pidx++) {
+ ppd = dd->pport + pidx;
+ if (i >= ppd->sdma_state.first_sendbuf &&
+ i < ppd->sdma_state.last_sendbuf)
+ return ppd;
+ }
+ return NULL;
+}
+
+/*
+ * Return true if send buffer is being used by a user context.
+ * Sets _QIB_EVENT_DISARM_BUFS_BIT in user_event_mask as a side effect
+ */
+static int find_ctxt(struct qib_devdata *dd, unsigned bufn)
+{
+ struct qib_ctxtdata *rcd;
+ unsigned ctxt;
+ int ret = 0;
+
+ spin_lock(&dd->uctxt_lock);
+ for (ctxt = dd->first_user_ctxt; ctxt < dd->cfgctxts; ctxt++) {
+ rcd = dd->rcd[ctxt];
+ if (!rcd || bufn < rcd->pio_base ||
+ bufn >= rcd->pio_base + rcd->piocnt)
+ continue;
+ if (rcd->user_event_mask) {
+ int i;
+ /*
+ * subctxt_cnt is 0 if not shared, so do base
+ * separately, first, then remaining subctxt, if any
+ */
+ set_bit(_QIB_EVENT_DISARM_BUFS_BIT,
+ &rcd->user_event_mask[0]);
+ for (i = 1; i < rcd->subctxt_cnt; i++)
+ set_bit(_QIB_EVENT_DISARM_BUFS_BIT,
+ &rcd->user_event_mask[i]);
+ }
+ ret = 1;
+ break;
+ }
+ spin_unlock(&dd->uctxt_lock);
+
+ return ret;
+}
+
+/*
+ * Disarm a set of send buffers. If the buffer might be actively being
+ * written to, mark the buffer to be disarmed later when it is not being
+ * written to.
+ *
+ * This should only be called from the IRQ error handler.
+ */
+void qib_disarm_piobufs_set(struct qib_devdata *dd, unsigned long *mask,
+ unsigned cnt)
+{
+ struct qib_pportdata *ppd, *pppd[QIB_MAX_IB_PORTS];
+ unsigned i;
+ unsigned long flags;
+
+ for (i = 0; i < dd->num_pports; i++)
+ pppd[i] = NULL;
+
+ for (i = 0; i < cnt; i++) {
+ int which;
+
+ if (!test_bit(i, mask))
+ continue;
+ /*
+ * If the buffer is owned by the DMA hardware,
+ * reset the DMA engine.
+ */
+ ppd = is_sdma_buf(dd, i);
+ if (ppd) {
+ pppd[ppd->port] = ppd;
+ continue;
+ }
+ /*
+ * If the kernel is writing the buffer or the buffer is
+ * owned by a user process, we can't clear it yet.
+ */
+ spin_lock_irqsave(&dd->pioavail_lock, flags);
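+ /*
+ * pioavailkernel uses the same two-bits-per-buffer indexing as
+ * the shadow, so kernel ownership of buffer i is tested at
+ * bit i << 1.
+ */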
+ if (test_bit(i, dd->pio_writing) ||
+ (!test_bit(i << 1, dd->pioavailkernel) &&
+ find_ctxt(dd, i))) {
+ __set_bit(i, dd->pio_need_disarm);
+ which = 0;
+ } else {
+ which = 1;
+ dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(i));
+ }
+ spin_unlock_irqrestore(&dd->pioavail_lock, flags);
+ }
+
+ /* do cancel_sends once per port that had sdma piobufs in error */
+ for (i = 0; i < dd->num_pports; i++)
+ if (pppd[i])
+ qib_cancel_sends(pppd[i]);
+}
+
+/**
+ * update_send_bufs - update shadow copy of the PIO availability map
+ * @dd: the qlogic_ib device
+ *
+ * called whenever our local copy indicates we have run out of send buffers
+ */
+static void update_send_bufs(struct qib_devdata *dd)
+{
+ unsigned long flags;
+ unsigned i;
+ const unsigned piobregs = dd->pioavregs;
+
+ /*
+ * If the generation (check) bits have changed, then we update the
+ * busy bit for the corresponding PIO buffer. This algorithm will
+ * modify positions to the value they already have in some cases
+ * (i.e., no change), but it's faster than changing only the bits
+ * that have changed.
+ *
+ * We would like to do this atomically, to avoid spinlocks in the
+ * critical send path, but that's not really possible, given the
+ * type of changes, and that this routine could be called on
+ * multiple CPUs simultaneously, so we lock in this routine only,
+ * to avoid conflicting updates; all we change is the shadow, and
+ * it's a single 64 bit memory location, so by definition the update
+ * is atomic in terms of what other CPUs can see in testing the
+ * bits. The spin_lock overhead isn't too bad, since it only
+ * happens when all buffers are in use, so only CPU overhead, not
+ * latency or bandwidth is affected.
+ */
+ if (!dd->pioavailregs_dma)
+ return;
+ spin_lock_irqsave(&dd->pioavail_lock, flags);
+ for (i = 0; i < piobregs; i++) {
+ u64 pchbusy, pchg, piov, pnew;
+
+ piov = le64_to_cpu(dd->pioavailregs_dma[i]);
+ pchg = dd->pioavailkernel[i] &
+ ~(dd->pioavailshadow[i] ^ piov);
+ pchbusy = pchg << QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT;
+ if (pchg && (pchbusy & dd->pioavailshadow[i])) {
+ pnew = dd->pioavailshadow[i] & ~pchbusy;
+ pnew |= piov & pchbusy;
+ dd->pioavailshadow[i] = pnew;
+ }
+ }
+ spin_unlock_irqrestore(&dd->pioavail_lock, flags);
+}
+
+/*
+ * Debugging code and stats updates if no pio buffers available.
+ */
+static noinline void no_send_bufs(struct qib_devdata *dd)
+{
+ dd->upd_pio_shadow = 1;
+
+ /* not atomic, but if we lose a stat count in a while, that's OK */
+ qib_stats.sps_nopiobufs++;
+}
+
+/*
+ * Common code for normal driver send buffer allocation, and reserved
+ * allocation.
+ *
+ * Do appropriate marking as busy, etc.
+ * Returns buffer pointer if one is found, otherwise NULL.
+ */
+u32 __iomem *qib_getsendbuf_range(struct qib_devdata *dd, u32 *pbufnum,
+ u32 first, u32 last)
+{
+ unsigned i, j, updated = 0;
+ unsigned nbufs;
+ unsigned long flags;
+ unsigned long *shadow = dd->pioavailshadow;
+ u32 __iomem *buf;
+
+ if (!(dd->flags & QIB_PRESENT))
+ return NULL;
+
+ nbufs = last - first + 1; /* number in range to check */
+ if (dd->upd_pio_shadow) {
+update_shadow:
+ /*
+ * Minor optimization. If we had no buffers on last call,
+ * start out by doing the update; continue and do scan even
+ * if no buffers were updated, to be paranoid.
+ */
+ update_send_bufs(dd);
+ updated++;
+ }
+ i = first;
+ /*
+ * While test_and_set_bit() is atomic, we do that and then the
+ * change_bit(), and the pair is not. See if this is the cause
+ * of the remaining armlaunch errors.
+ */
+ spin_lock_irqsave(&dd->pioavail_lock, flags);
+ if (dd->last_pio >= first && dd->last_pio <= last)
+ i = dd->last_pio + 1;
+ if (!first)
+ /* adjust to min possible */
+ nbufs = last - dd->min_kernel_pio + 1;
+ for (j = 0; j < nbufs; j++, i++) {
+ if (i > last)
+ i = !first ? dd->min_kernel_pio : first;
+ if (__test_and_set_bit((2 * i) + 1, shadow))
+ continue;
+ /* flip generation bit */
+ __change_bit(2 * i, shadow);
+ /* remember that the buffer can be written to now */
+ __set_bit(i, dd->pio_writing);
+ if (!first && first != last) /* first == last on VL15, avoid */
+ dd->last_pio = i;
+ break;
+ }
+ spin_unlock_irqrestore(&dd->pioavail_lock, flags);
+
+ if (j == nbufs) {
+ if (!updated)
+ /*
+ * First time through; shadow exhausted, but there may be
+ * buffers available, so try an update and then rescan.
+ */
+ goto update_shadow;
+ no_send_bufs(dd);
+ buf = NULL;
+ } else {
+ if (i < dd->piobcnt2k)
+ buf = (u32 __iomem *)(dd->pio2kbase +
+ i * dd->palign);
+ else if (i < dd->piobcnt2k + dd->piobcnt4k || !dd->piovl15base)
+ buf = (u32 __iomem *)(dd->pio4kbase +
+ (i - dd->piobcnt2k) * dd->align4k);
+ else
+ buf = (u32 __iomem *)(dd->piovl15base +
+ (i - (dd->piobcnt2k + dd->piobcnt4k)) *
+ dd->align4k);
+ if (pbufnum)
+ *pbufnum = i;
+ dd->upd_pio_shadow = 0;
+ }
+
+ return buf;
+}
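+
+/*
+ * Typical use by a send path (sketch only): obtain a buffer, copy the
+ * packet into it, then mark the write finished so any pending disarm
+ * can be applied:
+ *
+ * u32 bufn;
+ * u32 __iomem *piobuf = qib_getsendbuf_range(dd, &bufn, first, last);
+ *
+ * if (piobuf) {
+ * ... copy header and payload into piobuf ...
+ * qib_sendbuf_done(dd, bufn);
+ * }
+ */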
+
+/*
+ * Record that the caller is finished writing to the buffer so we don't
+ * disarm it while it is being written and disarm it now if needed.
+ */
+void qib_sendbuf_done(struct qib_devdata *dd, unsigned n)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&dd->pioavail_lock, flags);
+ __clear_bit(n, dd->pio_writing);
+ if (__test_and_clear_bit(n, dd->pio_need_disarm))
+ dd->f_sendctrl(dd->pport, QIB_SENDCTRL_DISARM_BUF(n));
+ spin_unlock_irqrestore(&dd->pioavail_lock, flags);
+}
+
+/**
+ * qib_chg_pioavailkernel - change which send buffers are available for kernel
+ * @dd: the qlogic_ib device
+ * @start: the starting send buffer number
+ * @len: the number of send buffers
+ * @avail: true if the buffers are available for kernel use, false otherwise
+ * @rcd: the context data, passed through to the chip-specific txchk handler
+ */
+void qib_chg_pioavailkernel(struct qib_devdata *dd, unsigned start,
+ unsigned len, u32 avail, struct qib_ctxtdata *rcd)
+{
+ unsigned long flags;
+ unsigned end;
+ unsigned ostart = start;
+
+ /* There are two bits per send buffer (busy and generation) */
+ start *= 2;
+ end = start + len * 2;
+
+ spin_lock_irqsave(&dd->pioavail_lock, flags);
+ /* Set or clear the busy bit in the shadow. */
+ while (start < end) {
+ if (avail) {
+ unsigned long dma;
+ int i;
+
+ /*
+ * The BUSY bit will never be set, because we disarm
+ * the user buffers before we hand them back to the
+ * kernel. We do have to make sure the generation
+ * bit is set correctly in shadow, since it could
+ * have changed many times while allocated to user.
+ * We can't use the bitmap functions on the full
+ * dma array because it is always little-endian, so
+ * we have to flip to host-order first.
+ * BITS_PER_LONG is slightly wrong, since it's
+ * always 64 bits per register in chip...
+ * We only work on 64 bit kernels, so that's OK.
+ */
+ i = start / BITS_PER_LONG;
+ __clear_bit(QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT + start,
+ dd->pioavailshadow);
+ dma = (unsigned long)
+ le64_to_cpu(dd->pioavailregs_dma[i]);
+ if (test_bit((QLOGIC_IB_SENDPIOAVAIL_CHECK_SHIFT +
+ start) % BITS_PER_LONG, &dma))
+ __set_bit(QLOGIC_IB_SENDPIOAVAIL_CHECK_SHIFT +
+ start, dd->pioavailshadow);
+ else
+ __clear_bit(QLOGIC_IB_SENDPIOAVAIL_CHECK_SHIFT
+ + start, dd->pioavailshadow);
+ __set_bit(start, dd->pioavailkernel);
+ if ((start >> 1) < dd->min_kernel_pio)
+ dd->min_kernel_pio = start >> 1;
+ } else {
+ __set_bit(start + QLOGIC_IB_SENDPIOAVAIL_BUSY_SHIFT,
+ dd->pioavailshadow);
+ __clear_bit(start, dd->pioavailkernel);
+ if ((start >> 1) > dd->min_kernel_pio)
+ dd->min_kernel_pio = start >> 1;
+ }
+ start += 2;
+ }
+
+ if (dd->min_kernel_pio > 0 && dd->last_pio < dd->min_kernel_pio - 1)
+ dd->last_pio = dd->min_kernel_pio - 1;
+ spin_unlock_irqrestore(&dd->pioavail_lock, flags);
+
+ dd->f_txchk_change(dd, ostart, len, avail, rcd);
+}
+
+/*
+ * Flush all sends that might be in the ready to send state, as well as any
+ * that are in the process of being sent. Used whenever we need to be
+ * sure the send side is idle. Cleans up all buffer state by canceling
+ * all pio buffers, and issuing an abort, which cleans up anything in the
+ * launch fifo. The cancel is superfluous on some chip versions, but
+ * it's safer to always do it.
+ * PIOAvail bits are updated by the chip as if a normal send had happened.
+ */
+void qib_cancel_sends(struct qib_pportdata *ppd)
+{
+ struct qib_devdata *dd = ppd->dd;
+ struct qib_ctxtdata *rcd;
+ unsigned long flags;
+ unsigned ctxt;
+ unsigned i;
+ unsigned last;
+
+ /*
+ * Tell PSM to disarm buffers again before trying to reuse them.
+ * We need to be sure the rcd doesn't change out from under us
+ * while we do so. We hold the two locks sequentially. We might
+ * needlessly set some need_disarm bits as a result, if the
+ * context is closed after we release the uctxt_lock, but that's
+ * fairly benign, and safer than nesting the locks.
+ */
+ for (ctxt = dd->first_user_ctxt; ctxt < dd->cfgctxts; ctxt++) {
+ spin_lock_irqsave(&dd->uctxt_lock, flags);
+ rcd = dd->rcd[ctxt];
+ if (rcd && rcd->ppd == ppd) {
+ last = rcd->pio_base + rcd->piocnt;
+ if (rcd->user_event_mask) {
+ /*
+ * subctxt_cnt is 0 if not shared, so do base
+ * separately, first, then remaining subctxt,
+ * if any
+ */
+ set_bit(_QIB_EVENT_DISARM_BUFS_BIT,
+ &rcd->user_event_mask[0]);
+ for (i = 1; i < rcd->subctxt_cnt; i++)
+ set_bit(_QIB_EVENT_DISARM_BUFS_BIT,
+ &rcd->user_event_mask[i]);
+ }
+ i = rcd->pio_base;
+ spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+ spin_lock_irqsave(&dd->pioavail_lock, flags);
+ for (; i < last; i++)
+ __set_bit(i, dd->pio_need_disarm);
+ spin_unlock_irqrestore(&dd->pioavail_lock, flags);
+ } else
+ spin_unlock_irqrestore(&dd->uctxt_lock, flags);
+ }
+
+ if (!(dd->flags & QIB_HAS_SEND_DMA))
+ dd->f_sendctrl(ppd, QIB_SENDCTRL_DISARM_ALL |
+ QIB_SENDCTRL_FLUSH);
+}
+
+/*
+ * Force an update of in-memory copy of the pioavail registers, when
+ * needed for any of a variety of reasons.
+ * If already off, this routine is a nop, on the assumption that the
+ * caller (or set of callers) will "do the right thing".
+ * This is a per-device operation, so just the first port.
+ */
+void qib_force_pio_avail_update(struct qib_devdata *dd)
+{
+ dd->f_sendctrl(dd->pport, QIB_SENDCTRL_AVAIL_BLIP);
+}
+
+void qib_hol_down(struct qib_pportdata *ppd)
+{
+ /*
+ * Cancel sends when the link goes DOWN so that we aren't doing it
+ * at INIT when we might be trying to send SMI packets.
+ */
+ if (!(ppd->lflags & QIBL_IB_AUTONEG_INPROG))
+ qib_cancel_sends(ppd);
+}
+
+/*
+ * Link is at INIT.
+ * We start the HoL timer so we can detect stuck packets blocking SMP replies.
+ * Timer may already be running, so use mod_timer, not add_timer.
+ */
+void qib_hol_init(struct qib_pportdata *ppd)
+{
+ if (ppd->hol_state != QIB_HOL_INIT) {
+ ppd->hol_state = QIB_HOL_INIT;
+ mod_timer(&ppd->hol_timer,
+ jiffies + msecs_to_jiffies(qib_hol_timeout_ms));
+ }
+}
+
+/*
+ * Link is up, continue any user processes, and ensure timer
+ * is a nop, if running. Let timer keep running, if set; it
+ * will nop when it sees the link is up.
+ */
+void qib_hol_up(struct qib_pportdata *ppd)
+{
+ ppd->hol_state = QIB_HOL_UP;
+}
+
+/*
+ * This is only called via the timer.
+ */
+void qib_hol_event(unsigned long opaque)
+{
+ struct qib_pportdata *ppd = (struct qib_pportdata *)opaque;
+
+ /* If hardware error, etc, skip. */
+ if (!(ppd->dd->flags & QIB_INITTED))
+ return;
+
+ if (ppd->hol_state != QIB_HOL_UP) {
+ /*
+ * Try to flush sends in case a stuck packet is blocking
+ * SMP replies.
+ */
+ qib_hol_down(ppd);
+ mod_timer(&ppd->hol_timer,
+ jiffies + msecs_to_jiffies(qib_hol_timeout_ms));
+ }
+}
diff --git a/drivers/infiniband/hw/qib/qib_uc.c b/drivers/infiniband/hw/qib/qib_uc.c
new file mode 100644
index 000000000..aa3a8035b
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_uc.c
@@ -0,0 +1,536 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation.
+ * All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "qib.h"
+
+/* cut down ridiculously long IB macro names */
+#define OP(x) IB_OPCODE_UC_##x
+
+/**
+ * qib_make_uc_req - construct a request packet (SEND, RDMA write)
+ * @qp: a pointer to the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int qib_make_uc_req(struct qib_qp *qp)
+{
+ struct qib_other_headers *ohdr;
+ struct qib_swqe *wqe;
+ unsigned long flags;
+ u32 hwords;
+ u32 bth0;
+ u32 len;
+ u32 pmtu = qp->pmtu;
+ int ret = 0;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_SEND_OK)) {
+ if (!(ib_qib_state_ops[qp->state] & QIB_FLUSH_SEND))
+ goto bail;
+ /* We are in the error state, flush the work request. */
+ if (qp->s_last == qp->s_head)
+ goto bail;
+ /* If DMAs are in progress, we can't flush immediately. */
+ if (atomic_read(&qp->s_dma_busy)) {
+ qp->s_flags |= QIB_S_WAIT_DMA;
+ goto bail;
+ }
+ wqe = get_swqe_ptr(qp, qp->s_last);
+ qib_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+ goto done;
+ }
+
+ ohdr = &qp->s_hdr->u.oth;
+ if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+ ohdr = &qp->s_hdr->u.l.oth;
+
+ /* header size in 32-bit words LRH+BTH = (8+12)/4. */
+ hwords = 5;
+ bth0 = 0;
+
+ /* Get the next send request. */
+ wqe = get_swqe_ptr(qp, qp->s_cur);
+ qp->s_wqe = NULL;
+ switch (qp->s_state) {
+ default:
+ if (!(ib_qib_state_ops[qp->state] &
+ QIB_PROCESS_NEXT_SEND_OK))
+ goto bail;
+ /* Check if send work queue is empty. */
+ if (qp->s_cur == qp->s_head)
+ goto bail;
+ /*
+ * Start a new request.
+ */
+ wqe->psn = qp->s_next_psn;
+ qp->s_psn = qp->s_next_psn;
+ qp->s_sge.sge = wqe->sg_list[0];
+ qp->s_sge.sg_list = wqe->sg_list + 1;
+ qp->s_sge.num_sge = wqe->wr.num_sge;
+ qp->s_sge.total_len = wqe->length;
+ len = wqe->length;
+ qp->s_len = len;
+ switch (wqe->wr.opcode) {
+ case IB_WR_SEND:
+ case IB_WR_SEND_WITH_IMM:
+ if (len > pmtu) {
+ qp->s_state = OP(SEND_FIRST);
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND)
+ qp->s_state = OP(SEND_ONLY);
+ else {
+ qp->s_state =
+ OP(SEND_ONLY_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ }
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= IB_BTH_SOLICITED;
+ qp->s_wqe = wqe;
+ if (++qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case IB_WR_RDMA_WRITE:
+ case IB_WR_RDMA_WRITE_WITH_IMM:
+ ohdr->u.rc.reth.vaddr =
+ cpu_to_be64(wqe->wr.wr.rdma.remote_addr);
+ ohdr->u.rc.reth.rkey =
+ cpu_to_be32(wqe->wr.wr.rdma.rkey);
+ ohdr->u.rc.reth.length = cpu_to_be32(len);
+ hwords += sizeof(struct ib_reth) / 4;
+ if (len > pmtu) {
+ qp->s_state = OP(RDMA_WRITE_FIRST);
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+ qp->s_state = OP(RDMA_WRITE_ONLY);
+ else {
+ qp->s_state =
+ OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE);
+ /* Immediate data comes after the RETH */
+ ohdr->u.rc.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= IB_BTH_SOLICITED;
+ }
+ qp->s_wqe = wqe;
+ if (++qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ default:
+ goto bail;
+ }
+ break;
+
+ case OP(SEND_FIRST):
+ qp->s_state = OP(SEND_MIDDLE);
+ /* FALLTHROUGH */
+ case OP(SEND_MIDDLE):
+ len = qp->s_len;
+ if (len > pmtu) {
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND)
+ qp->s_state = OP(SEND_LAST);
+ else {
+ qp->s_state = OP(SEND_LAST_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ }
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= IB_BTH_SOLICITED;
+ qp->s_wqe = wqe;
+ if (++qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+
+ case OP(RDMA_WRITE_FIRST):
+ qp->s_state = OP(RDMA_WRITE_MIDDLE);
+ /* FALLTHROUGH */
+ case OP(RDMA_WRITE_MIDDLE):
+ len = qp->s_len;
+ if (len > pmtu) {
+ len = pmtu;
+ break;
+ }
+ if (wqe->wr.opcode == IB_WR_RDMA_WRITE)
+ qp->s_state = OP(RDMA_WRITE_LAST);
+ else {
+ qp->s_state =
+ OP(RDMA_WRITE_LAST_WITH_IMMEDIATE);
+ /* Immediate data comes after the BTH */
+ ohdr->u.imm_data = wqe->wr.ex.imm_data;
+ hwords += 1;
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= IB_BTH_SOLICITED;
+ }
+ qp->s_wqe = wqe;
+ if (++qp->s_cur >= qp->s_size)
+ qp->s_cur = 0;
+ break;
+ }
+ qp->s_len -= len;
+ qp->s_hdrwords = hwords;
+ qp->s_cur_sge = &qp->s_sge;
+ qp->s_cur_size = len;
+ qib_make_ruc_header(qp, ohdr, bth0 | (qp->s_state << 24),
+ qp->s_next_psn++ & QIB_PSN_MASK);
+done:
+ ret = 1;
+ goto unlock;
+
+bail:
+ qp->s_flags &= ~QIB_S_BUSY;
+unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return ret;
+}
+
+/**
+ * qib_uc_rcv - handle an incoming UC packet
+ * @ibp: the port the packet came in on
+ * @hdr: the header of the packet
+ * @has_grh: true if the packet has a GRH
+ * @data: the packet data
+ * @tlen: the length of the packet
+ * @qp: the QP for this packet.
+ *
+ * This is called from qib_qp_rcv() to process an incoming UC packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void qib_uc_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr,
+ int has_grh, void *data, u32 tlen, struct qib_qp *qp)
+{
+ struct qib_other_headers *ohdr;
+ u32 opcode;
+ u32 hdrsize;
+ u32 psn;
+ u32 pad;
+ struct ib_wc wc;
+ u32 pmtu = qp->pmtu;
+ struct ib_reth *reth;
+ int ret;
+
+ /* Check for GRH */
+ if (!has_grh) {
+ ohdr = &hdr->u.oth;
+ hdrsize = 8 + 12; /* LRH + BTH */
+ } else {
+ ohdr = &hdr->u.l.oth;
+ hdrsize = 8 + 40 + 12; /* LRH + GRH + BTH */
+ }
+
+ opcode = be32_to_cpu(ohdr->bth[0]);
+ if (qib_ruc_check_hdr(ibp, hdr, has_grh, qp, opcode))
+ return;
+
+ psn = be32_to_cpu(ohdr->bth[2]);
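+ /* The opcode is the top byte of bth[0] once in host order */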
+ opcode >>= 24;
+
+ /* Compare the PSN against the expected PSN. */
+ if (unlikely(qib_cmp24(psn, qp->r_psn) != 0)) {
+ /*
+ * Handle a sequence error.
+ * Silently drop any current message.
+ */
+ qp->r_psn = psn;
+inv:
+ if (qp->r_state == OP(SEND_FIRST) ||
+ qp->r_state == OP(SEND_MIDDLE)) {
+ set_bit(QIB_R_REWIND_SGE, &qp->r_aflags);
+ qp->r_sge.num_sge = 0;
+ } else
+ qib_put_ss(&qp->r_sge);
+ qp->r_state = OP(SEND_LAST);
+ switch (opcode) {
+ case OP(SEND_FIRST):
+ case OP(SEND_ONLY):
+ case OP(SEND_ONLY_WITH_IMMEDIATE):
+ goto send_first;
+
+ case OP(RDMA_WRITE_FIRST):
+ case OP(RDMA_WRITE_ONLY):
+ case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE):
+ goto rdma_first;
+
+ default:
+ goto drop;
+ }
+ }
+
+ /* Check for opcode sequence errors. */
+ switch (qp->r_state) {
+ case OP(SEND_FIRST):
+ case OP(SEND_MIDDLE):
+ if (opcode == OP(SEND_MIDDLE) ||
+ opcode == OP(SEND_LAST) ||
+ opcode == OP(SEND_LAST_WITH_IMMEDIATE))
+ break;
+ goto inv;
+
+ case OP(RDMA_WRITE_FIRST):
+ case OP(RDMA_WRITE_MIDDLE):
+ if (opcode == OP(RDMA_WRITE_MIDDLE) ||
+ opcode == OP(RDMA_WRITE_LAST) ||
+ opcode == OP(RDMA_WRITE_LAST_WITH_IMMEDIATE))
+ break;
+ goto inv;
+
+ default:
+ if (opcode == OP(SEND_FIRST) ||
+ opcode == OP(SEND_ONLY) ||
+ opcode == OP(SEND_ONLY_WITH_IMMEDIATE) ||
+ opcode == OP(RDMA_WRITE_FIRST) ||
+ opcode == OP(RDMA_WRITE_ONLY) ||
+ opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE))
+ break;
+ goto inv;
+ }
+
+ if (qp->state == IB_QPS_RTR && !(qp->r_flags & QIB_R_COMM_EST)) {
+ qp->r_flags |= QIB_R_COMM_EST;
+ if (qp->ibqp.event_handler) {
+ struct ib_event ev;
+
+ ev.device = qp->ibqp.device;
+ ev.element.qp = &qp->ibqp;
+ ev.event = IB_EVENT_COMM_EST;
+ qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+ }
+ }
+
+ /* OK, process the packet. */
+ switch (opcode) {
+ case OP(SEND_FIRST):
+ case OP(SEND_ONLY):
+ case OP(SEND_ONLY_WITH_IMMEDIATE):
+send_first:
+ if (test_and_clear_bit(QIB_R_REWIND_SGE, &qp->r_aflags))
+ qp->r_sge = qp->s_rdma_read_sge;
+ else {
+ ret = qib_get_rwqe(qp, 0);
+ if (ret < 0)
+ goto op_err;
+ if (!ret)
+ goto drop;
+ /*
+ * qp->s_rdma_read_sge will be the owner
+ * of the mr references.
+ */
+ qp->s_rdma_read_sge = qp->r_sge;
+ }
+ qp->r_rcv_len = 0;
+ if (opcode == OP(SEND_ONLY))
+ goto no_immediate_data;
+ else if (opcode == OP(SEND_ONLY_WITH_IMMEDIATE))
+ goto send_last_imm;
+ /* FALLTHROUGH */
+ case OP(SEND_MIDDLE):
+ /* Check for invalid length PMTU or posted rwqe len. */
+ if (unlikely(tlen != (hdrsize + pmtu + 4)))
+ goto rewind;
+ qp->r_rcv_len += pmtu;
+ if (unlikely(qp->r_rcv_len > qp->r_len))
+ goto rewind;
+ qib_copy_sge(&qp->r_sge, data, pmtu, 0);
+ break;
+
+ case OP(SEND_LAST_WITH_IMMEDIATE):
+send_last_imm:
+ wc.ex.imm_data = ohdr->u.imm_data;
+ hdrsize += 4;
+ wc.wc_flags = IB_WC_WITH_IMM;
+ goto send_last;
+ case OP(SEND_LAST):
+no_immediate_data:
+ wc.ex.imm_data = 0;
+ wc.wc_flags = 0;
+send_last:
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ /* Check for invalid length. */
+ /* XXX LAST len should be >= 1 */
+ if (unlikely(tlen < (hdrsize + pad + 4)))
+ goto rewind;
+ /* Don't count the CRC. */
+ tlen -= (hdrsize + pad + 4);
+ wc.byte_len = tlen + qp->r_rcv_len;
+ if (unlikely(wc.byte_len > qp->r_len))
+ goto rewind;
+ wc.opcode = IB_WC_RECV;
+ qib_copy_sge(&qp->r_sge, data, tlen, 0);
+ qib_put_ss(&qp->s_rdma_read_sge);
+last_imm:
+ wc.wr_id = qp->r_wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.qp = &qp->ibqp;
+ wc.src_qp = qp->remote_qpn;
+ wc.slid = qp->remote_ah_attr.dlid;
+ wc.sl = qp->remote_ah_attr.sl;
+ /* zero fields that are N/A */
+ wc.vendor_err = 0;
+ wc.pkey_index = 0;
+ wc.dlid_path_bits = 0;
+ wc.port_num = 0;
+ /* Signal completion event if the solicited bit is set. */
+ qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+ (ohdr->bth[0] &
+ cpu_to_be32(IB_BTH_SOLICITED)) != 0);
+ break;
+
+ case OP(RDMA_WRITE_FIRST):
+ case OP(RDMA_WRITE_ONLY):
+ case OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE): /* consume RWQE */
+rdma_first:
+ if (unlikely(!(qp->qp_access_flags &
+ IB_ACCESS_REMOTE_WRITE))) {
+ goto drop;
+ }
+ reth = &ohdr->u.rc.reth;
+ hdrsize += sizeof(*reth);
+ qp->r_len = be32_to_cpu(reth->length);
+ qp->r_rcv_len = 0;
+ qp->r_sge.sg_list = NULL;
+ if (qp->r_len != 0) {
+ u32 rkey = be32_to_cpu(reth->rkey);
+ u64 vaddr = be64_to_cpu(reth->vaddr);
+ int ok;
+
+ /* Check rkey */
+ ok = qib_rkey_ok(qp, &qp->r_sge.sge, qp->r_len,
+ vaddr, rkey, IB_ACCESS_REMOTE_WRITE);
+ if (unlikely(!ok))
+ goto drop;
+ qp->r_sge.num_sge = 1;
+ } else {
+ qp->r_sge.num_sge = 0;
+ qp->r_sge.sge.mr = NULL;
+ qp->r_sge.sge.vaddr = NULL;
+ qp->r_sge.sge.length = 0;
+ qp->r_sge.sge.sge_length = 0;
+ }
+ if (opcode == OP(RDMA_WRITE_ONLY))
+ goto rdma_last;
+ else if (opcode == OP(RDMA_WRITE_ONLY_WITH_IMMEDIATE)) {
+ wc.ex.imm_data = ohdr->u.rc.imm_data;
+ goto rdma_last_imm;
+ }
+ /* FALLTHROUGH */
+ case OP(RDMA_WRITE_MIDDLE):
+ /* Check for invalid length PMTU or posted rwqe len. */
+ if (unlikely(tlen != (hdrsize + pmtu + 4)))
+ goto drop;
+ qp->r_rcv_len += pmtu;
+ if (unlikely(qp->r_rcv_len > qp->r_len))
+ goto drop;
+ qib_copy_sge(&qp->r_sge, data, pmtu, 1);
+ break;
+
+ case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
+ wc.ex.imm_data = ohdr->u.imm_data;
+rdma_last_imm:
+ hdrsize += 4;
+ wc.wc_flags = IB_WC_WITH_IMM;
+
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ /* Check for invalid length. */
+ /* XXX LAST len should be >= 1 */
+ if (unlikely(tlen < (hdrsize + pad + 4)))
+ goto drop;
+ /* Don't count the CRC. */
+ tlen -= (hdrsize + pad + 4);
+ if (unlikely(tlen + qp->r_rcv_len != qp->r_len))
+ goto drop;
+ if (test_and_clear_bit(QIB_R_REWIND_SGE, &qp->r_aflags))
+ qib_put_ss(&qp->s_rdma_read_sge);
+ else {
+ ret = qib_get_rwqe(qp, 1);
+ if (ret < 0)
+ goto op_err;
+ if (!ret)
+ goto drop;
+ }
+ wc.byte_len = qp->r_len;
+ wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
+ qib_copy_sge(&qp->r_sge, data, tlen, 1);
+ qib_put_ss(&qp->r_sge);
+ goto last_imm;
+
+ case OP(RDMA_WRITE_LAST):
+rdma_last:
+ /* Get the number of bytes the message was padded by. */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ /* Check for invalid length. */
+ /* XXX LAST len should be >= 1 */
+ if (unlikely(tlen < (hdrsize + pad + 4)))
+ goto drop;
+ /* Don't count the CRC. */
+ tlen -= (hdrsize + pad + 4);
+ if (unlikely(tlen + qp->r_rcv_len != qp->r_len))
+ goto drop;
+ qib_copy_sge(&qp->r_sge, data, tlen, 1);
+ qib_put_ss(&qp->r_sge);
+ break;
+
+ default:
+ /* Drop packet for unknown opcodes. */
+ goto drop;
+ }
+ qp->r_psn++;
+ qp->r_state = opcode;
+ return;
+
+rewind:
+ set_bit(QIB_R_REWIND_SGE, &qp->r_aflags);
+ qp->r_sge.num_sge = 0;
+drop:
+ ibp->n_pkt_drops++;
+ return;
+
+op_err:
+ qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ return;
+
+}
diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c
new file mode 100644
index 000000000..26243b722
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_ud.c
@@ -0,0 +1,590 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_smi.h>
+
+#include "qib.h"
+#include "qib_mad.h"
+
+/**
+ * qib_ud_loopback - handle send on loopback QPs
+ * @sqp: the sending QP
+ * @swqe: the send work request
+ *
+ * This is called from qib_make_ud_req() to forward a WQE addressed
+ * to the same HCA.
+ * Note that the receive interrupt handler may be calling qib_ud_rcv()
+ * while this is being called.
+ */
+static void qib_ud_loopback(struct qib_qp *sqp, struct qib_swqe *swqe)
+{
+ struct qib_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num);
+ struct qib_pportdata *ppd;
+ struct qib_qp *qp;
+ struct ib_ah_attr *ah_attr;
+ unsigned long flags;
+ struct qib_sge_state ssge;
+ struct qib_sge *sge;
+ struct ib_wc wc;
+ u32 length;
+ enum ib_qp_type sqptype, dqptype;
+
+ qp = qib_lookup_qpn(ibp, swqe->wr.wr.ud.remote_qpn);
+ if (!qp) {
+ ibp->n_pkt_drops++;
+ return;
+ }
+
+ sqptype = sqp->ibqp.qp_type == IB_QPT_GSI ?
+ IB_QPT_UD : sqp->ibqp.qp_type;
+ dqptype = qp->ibqp.qp_type == IB_QPT_GSI ?
+ IB_QPT_UD : qp->ibqp.qp_type;
+
+ if (dqptype != sqptype ||
+ !(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) {
+ ibp->n_pkt_drops++;
+ goto drop;
+ }
+
+ ah_attr = &to_iah(swqe->wr.wr.ud.ah)->attr;
+ ppd = ppd_from_ibp(ibp);
+
+ if (qp->ibqp.qp_num > 1) {
+ u16 pkey1;
+ u16 pkey2;
+ u16 lid;
+
+ pkey1 = qib_get_pkey(ibp, sqp->s_pkey_index);
+ pkey2 = qib_get_pkey(ibp, qp->s_pkey_index);
+ if (unlikely(!qib_pkey_ok(pkey1, pkey2))) {
+ lid = ppd->lid | (ah_attr->src_path_bits &
+ ((1 << ppd->lmc) - 1));
+ qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY, pkey1,
+ ah_attr->sl,
+ sqp->ibqp.qp_num, qp->ibqp.qp_num,
+ cpu_to_be16(lid),
+ cpu_to_be16(ah_attr->dlid));
+ goto drop;
+ }
+ }
+
+ /*
+ * Check that the qkey matches (except for QP0, see 9.6.1.4.1).
+ * Qkeys with the high order bit set mean use the
+ * qkey from the QP context instead of the WR (see 10.2.5).
+ */
+ if (qp->ibqp.qp_num) {
+ u32 qkey;
+
+ qkey = (int)swqe->wr.wr.ud.remote_qkey < 0 ?
+ sqp->qkey : swqe->wr.wr.ud.remote_qkey;
+ if (unlikely(qkey != qp->qkey)) {
+ u16 lid;
+
+ lid = ppd->lid | (ah_attr->src_path_bits &
+ ((1 << ppd->lmc) - 1));
+ qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_QKEY, qkey,
+ ah_attr->sl,
+ sqp->ibqp.qp_num, qp->ibqp.qp_num,
+ cpu_to_be16(lid),
+ cpu_to_be16(ah_attr->dlid));
+ goto drop;
+ }
+ }
+
+ /*
+ * A GRH is expected to precede the data even if not
+ * present on the wire.
+ */
+ length = swqe->length;
+ memset(&wc, 0, sizeof(wc));
+ wc.byte_len = length + sizeof(struct ib_grh);
+
+ if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+ wc.wc_flags = IB_WC_WITH_IMM;
+ wc.ex.imm_data = swqe->wr.ex.imm_data;
+ }
+
+ spin_lock_irqsave(&qp->r_lock, flags);
+
+ /*
+ * Get the next work request entry to find where to put the data.
+ */
+ if (qp->r_flags & QIB_R_REUSE_SGE)
+ qp->r_flags &= ~QIB_R_REUSE_SGE;
+ else {
+ int ret;
+
+ ret = qib_get_rwqe(qp, 0);
+ if (ret < 0) {
+ qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ goto bail_unlock;
+ }
+ if (!ret) {
+ if (qp->ibqp.qp_num == 0)
+ ibp->n_vl15_dropped++;
+ goto bail_unlock;
+ }
+ }
+ /* Silently drop packets which are too big. */
+ if (unlikely(wc.byte_len > qp->r_len)) {
+ qp->r_flags |= QIB_R_REUSE_SGE;
+ ibp->n_pkt_drops++;
+ goto bail_unlock;
+ }
+
+ if (ah_attr->ah_flags & IB_AH_GRH) {
+ qib_copy_sge(&qp->r_sge, &ah_attr->grh,
+ sizeof(struct ib_grh), 1);
+ wc.wc_flags |= IB_WC_GRH;
+ } else
+ qib_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1);
+ ssge.sg_list = swqe->sg_list + 1;
+ ssge.sge = *swqe->sg_list;
+ ssge.num_sge = swqe->wr.num_sge;
+ sge = &ssge.sge;
+ while (length) {
+ u32 len = sge->length;
+
+ if (len > length)
+ len = length;
+ if (len > sge->sge_length)
+ len = sge->sge_length;
+ BUG_ON(len == 0);
+ qib_copy_sge(&qp->r_sge, sge->vaddr, len, 1);
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (--ssge.num_sge)
+ *sge = *ssge.sg_list++;
+ } else if (sge->length == 0 && sge->mr->lkey) {
+ if (++sge->n >= QIB_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr =
+ sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length =
+ sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ length -= len;
+ }
+ qib_put_ss(&qp->r_sge);
+ if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags))
+ goto bail_unlock;
+ wc.wr_id = qp->r_wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.opcode = IB_WC_RECV;
+ wc.qp = &qp->ibqp;
+ wc.src_qp = sqp->ibqp.qp_num;
+ wc.pkey_index = qp->ibqp.qp_type == IB_QPT_GSI ?
+ swqe->wr.wr.ud.pkey_index : 0;
+ wc.slid = ppd->lid | (ah_attr->src_path_bits & ((1 << ppd->lmc) - 1));
+ wc.sl = ah_attr->sl;
+ wc.dlid_path_bits = ah_attr->dlid & ((1 << ppd->lmc) - 1);
+ wc.port_num = qp->port_num;
+ /* Signal completion event if the solicited bit is set. */
+ qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+ swqe->wr.send_flags & IB_SEND_SOLICITED);
+ ibp->n_loop_pkts++;
+bail_unlock:
+ spin_unlock_irqrestore(&qp->r_lock, flags);
+drop:
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+}
+
+/**
+ * qib_make_ud_req - construct a UD request packet
+ * @qp: the QP
+ *
+ * Return 1 if constructed; otherwise, return 0.
+ */
+int qib_make_ud_req(struct qib_qp *qp)
+{
+ struct qib_other_headers *ohdr;
+ struct ib_ah_attr *ah_attr;
+ struct qib_pportdata *ppd;
+ struct qib_ibport *ibp;
+ struct qib_swqe *wqe;
+ unsigned long flags;
+ u32 nwords;
+ u32 extra_bytes;
+ u32 bth0;
+ u16 lrh0;
+ u16 lid;
+ int ret = 0;
+ int next_cur;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_NEXT_SEND_OK)) {
+ if (!(ib_qib_state_ops[qp->state] & QIB_FLUSH_SEND))
+ goto bail;
+ /* We are in the error state, flush the work request. */
+ if (qp->s_last == qp->s_head)
+ goto bail;
+ /* If DMAs are in progress, we can't flush immediately. */
+ if (atomic_read(&qp->s_dma_busy)) {
+ qp->s_flags |= QIB_S_WAIT_DMA;
+ goto bail;
+ }
+ wqe = get_swqe_ptr(qp, qp->s_last);
+ qib_send_complete(qp, wqe, IB_WC_WR_FLUSH_ERR);
+ goto done;
+ }
+
+ if (qp->s_cur == qp->s_head)
+ goto bail;
+
+ wqe = get_swqe_ptr(qp, qp->s_cur);
+ next_cur = qp->s_cur + 1;
+ if (next_cur >= qp->s_size)
+ next_cur = 0;
+
+ /* Construct the header. */
+ ibp = to_iport(qp->ibqp.device, qp->port_num);
+ ppd = ppd_from_ibp(ibp);
+ ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr;
+ if (ah_attr->dlid >= QIB_MULTICAST_LID_BASE) {
+ if (ah_attr->dlid != QIB_PERMISSIVE_LID)
+ this_cpu_inc(ibp->pmastats->n_multicast_xmit);
+ else
+ this_cpu_inc(ibp->pmastats->n_unicast_xmit);
+ } else {
+ this_cpu_inc(ibp->pmastats->n_unicast_xmit);
+ lid = ah_attr->dlid & ~((1 << ppd->lmc) - 1);
+ if (unlikely(lid == ppd->lid)) {
+ /*
+ * If DMAs are in progress, we can't generate
+ * a completion for the loopback packet since
+ * it would be out of order.
+ * XXX Instead of waiting, we could queue a
+ * zero length descriptor so we get a callback.
+ */
+ if (atomic_read(&qp->s_dma_busy)) {
+ qp->s_flags |= QIB_S_WAIT_DMA;
+ goto bail;
+ }
+ qp->s_cur = next_cur;
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ qib_ud_loopback(qp, wqe);
+ spin_lock_irqsave(&qp->s_lock, flags);
+ qib_send_complete(qp, wqe, IB_WC_SUCCESS);
+ goto done;
+ }
+ }
+
+ qp->s_cur = next_cur;
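+ /* Pad the payload to a 4-byte boundary; nwords is its length in dwords */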
+ extra_bytes = -wqe->length & 3;
+ nwords = (wqe->length + extra_bytes) >> 2;
+
+ /* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */
+ qp->s_hdrwords = 7;
+ qp->s_cur_size = wqe->length;
+ qp->s_cur_sge = &qp->s_sge;
+ qp->s_srate = ah_attr->static_rate;
+ qp->s_wqe = wqe;
+ qp->s_sge.sge = wqe->sg_list[0];
+ qp->s_sge.sg_list = wqe->sg_list + 1;
+ qp->s_sge.num_sge = wqe->wr.num_sge;
+ qp->s_sge.total_len = wqe->length;
+
+ if (ah_attr->ah_flags & IB_AH_GRH) {
+ /* Header size in 32-bit words. */
+ qp->s_hdrwords += qib_make_grh(ibp, &qp->s_hdr->u.l.grh,
+ &ah_attr->grh,
+ qp->s_hdrwords, nwords);
+ lrh0 = QIB_LRH_GRH;
+ ohdr = &qp->s_hdr->u.l.oth;
+ /*
+ * Don't worry about sending to locally attached multicast
+ * QPs; the spec leaves the behavior unspecified.
+ */
+ } else {
+ /* Header size in 32-bit words. */
+ lrh0 = QIB_LRH_BTH;
+ ohdr = &qp->s_hdr->u.oth;
+ }
+ if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+ qp->s_hdrwords++;
+ ohdr->u.ud.imm_data = wqe->wr.ex.imm_data;
+ bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24;
+ } else
+ bth0 = IB_OPCODE_UD_SEND_ONLY << 24;
+ lrh0 |= ah_attr->sl << 4;
+ if (qp->ibqp.qp_type == IB_QPT_SMI)
+ lrh0 |= 0xF000; /* Set VL (see ch. 13.5.3.1) */
+ else
+ lrh0 |= ibp->sl_to_vl[ah_attr->sl] << 12;
+ qp->s_hdr->lrh[0] = cpu_to_be16(lrh0);
+ qp->s_hdr->lrh[1] = cpu_to_be16(ah_attr->dlid); /* DEST LID */
+ qp->s_hdr->lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
+ lid = ppd->lid;
+ if (lid) {
+ lid |= ah_attr->src_path_bits & ((1 << ppd->lmc) - 1);
+ qp->s_hdr->lrh[3] = cpu_to_be16(lid);
+ } else
+ qp->s_hdr->lrh[3] = IB_LID_PERMISSIVE;
+ if (wqe->wr.send_flags & IB_SEND_SOLICITED)
+ bth0 |= IB_BTH_SOLICITED;
+ bth0 |= extra_bytes << 20;
+ bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? QIB_DEFAULT_P_KEY :
+ qib_get_pkey(ibp, qp->ibqp.qp_type == IB_QPT_GSI ?
+ wqe->wr.wr.ud.pkey_index : qp->s_pkey_index);
+ ohdr->bth[0] = cpu_to_be32(bth0);
+ /*
+ * Use the multicast QP if the destination LID is a multicast LID.
+ */
+ ohdr->bth[1] = ah_attr->dlid >= QIB_MULTICAST_LID_BASE &&
+ ah_attr->dlid != QIB_PERMISSIVE_LID ?
+ cpu_to_be32(QIB_MULTICAST_QPN) :
+ cpu_to_be32(wqe->wr.wr.ud.remote_qpn);
+ ohdr->bth[2] = cpu_to_be32(qp->s_next_psn++ & QIB_PSN_MASK);
+ /*
+ * Qkeys with the high order bit set mean use the
+ * qkey from the QP context instead of the WR (see 10.2.5).
+ */
+ ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->wr.wr.ud.remote_qkey < 0 ?
+ qp->qkey : wqe->wr.wr.ud.remote_qkey);
+ ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
+
+done:
+ ret = 1;
+ goto unlock;
+
+bail:
+ qp->s_flags &= ~QIB_S_BUSY;
+unlock:
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return ret;
+}
+
+static unsigned qib_lookup_pkey(struct qib_ibport *ibp, u16 pkey)
+{
+ struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+ struct qib_devdata *dd = ppd->dd;
+ unsigned ctxt = ppd->hw_pidx;
+ unsigned i;
+
+ pkey &= 0x7fff; /* remove limited/full membership bit */
+
+ for (i = 0; i < ARRAY_SIZE(dd->rcd[ctxt]->pkeys); ++i)
+ if ((dd->rcd[ctxt]->pkeys[i] & 0x7fff) == pkey)
+ return i;
+
+ /*
+ * Should not get here, this means hardware failed to validate pkeys.
+ * Punt and return index 0.
+ */
+ return 0;
+}
+
+/**
+ * qib_ud_rcv - receive an incoming UD packet
+ * @ibp: the port the packet came in on
+ * @hdr: the packet header
+ * @has_grh: true if the packet has a GRH
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP the packet came on
+ *
+ * This is called from qib_qp_rcv() to process an incoming UD packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+void qib_ud_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr,
+ int has_grh, void *data, u32 tlen, struct qib_qp *qp)
+{
+ struct qib_other_headers *ohdr;
+ int opcode;
+ u32 hdrsize;
+ u32 pad;
+ struct ib_wc wc;
+ u32 qkey;
+ u32 src_qp;
+ u16 dlid;
+
+ /* Check for GRH */
+ if (!has_grh) {
+ ohdr = &hdr->u.oth;
+ hdrsize = 8 + 12 + 8; /* LRH + BTH + DETH */
+ } else {
+ ohdr = &hdr->u.l.oth;
+ hdrsize = 8 + 40 + 12 + 8; /* LRH + GRH + BTH + DETH */
+ }
+ qkey = be32_to_cpu(ohdr->u.ud.deth[0]);
+ src_qp = be32_to_cpu(ohdr->u.ud.deth[1]) & QIB_QPN_MASK;
+
+ /*
+ * Get the number of bytes the message was padded by
+ * and drop incomplete packets.
+ */
+ pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
+ if (unlikely(tlen < (hdrsize + pad + 4)))
+ goto drop;
+
+ tlen -= hdrsize + pad + 4;
+
+ /*
+ * Check that the permissive LID is only used on QP0
+ * and the QKEY matches (see 9.6.1.4.1 and 9.6.1.5.1).
+ */
+ if (qp->ibqp.qp_num) {
+ if (unlikely(hdr->lrh[1] == IB_LID_PERMISSIVE ||
+ hdr->lrh[3] == IB_LID_PERMISSIVE))
+ goto drop;
+ if (qp->ibqp.qp_num > 1) {
+ u16 pkey1, pkey2;
+
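+ /* The P_Key is carried in the low 16 bits of bth[0] */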
+ pkey1 = be32_to_cpu(ohdr->bth[0]);
+ pkey2 = qib_get_pkey(ibp, qp->s_pkey_index);
+ if (unlikely(!qib_pkey_ok(pkey1, pkey2))) {
+ qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_PKEY,
+ pkey1,
+ (be16_to_cpu(hdr->lrh[0]) >> 4) &
+ 0xF,
+ src_qp, qp->ibqp.qp_num,
+ hdr->lrh[3], hdr->lrh[1]);
+ return;
+ }
+ }
+ if (unlikely(qkey != qp->qkey)) {
+ qib_bad_pqkey(ibp, IB_NOTICE_TRAP_BAD_QKEY, qkey,
+ (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
+ src_qp, qp->ibqp.qp_num,
+ hdr->lrh[3], hdr->lrh[1]);
+ return;
+ }
+ /* Drop invalid MAD packets (see 13.5.3.1). */
+ if (unlikely(qp->ibqp.qp_num == 1 &&
+ (tlen != 256 ||
+ (be16_to_cpu(hdr->lrh[0]) >> 12) == 15)))
+ goto drop;
+ } else {
+ struct ib_smp *smp;
+
+ /* Drop invalid MAD packets (see 13.5.3.1). */
+ if (tlen != 256 || (be16_to_cpu(hdr->lrh[0]) >> 12) != 15)
+ goto drop;
+ smp = (struct ib_smp *) data;
+ if ((hdr->lrh[1] == IB_LID_PERMISSIVE ||
+ hdr->lrh[3] == IB_LID_PERMISSIVE) &&
+ smp->mgmt_class != IB_MGMT_CLASS_SUBN_DIRECTED_ROUTE)
+ goto drop;
+ }
+
+ /*
+ * The opcode is in the low byte when it's in network order
+ * (top byte when in host order).
+ */
+ opcode = be32_to_cpu(ohdr->bth[0]) >> 24;
+ if (qp->ibqp.qp_num > 1 &&
+ opcode == IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE) {
+ wc.ex.imm_data = ohdr->u.ud.imm_data;
+ wc.wc_flags = IB_WC_WITH_IMM;
+ tlen -= sizeof(u32);
+ } else if (opcode == IB_OPCODE_UD_SEND_ONLY) {
+ wc.ex.imm_data = 0;
+ wc.wc_flags = 0;
+ } else
+ goto drop;
+
+ /*
+ * A GRH is expected to precede the data even if not
+ * present on the wire.
+ */
+ wc.byte_len = tlen + sizeof(struct ib_grh);
+
+ /*
+ * Get the next work request entry to find where to put the data.
+ */
+ if (qp->r_flags & QIB_R_REUSE_SGE)
+ qp->r_flags &= ~QIB_R_REUSE_SGE;
+ else {
+ int ret;
+
+ ret = qib_get_rwqe(qp, 0);
+ if (ret < 0) {
+ qib_rc_error(qp, IB_WC_LOC_QP_OP_ERR);
+ return;
+ }
+ if (!ret) {
+ if (qp->ibqp.qp_num == 0)
+ ibp->n_vl15_dropped++;
+ return;
+ }
+ }
+ /* Silently drop packets which are too big. */
+ if (unlikely(wc.byte_len > qp->r_len)) {
+ qp->r_flags |= QIB_R_REUSE_SGE;
+ goto drop;
+ }
+ if (has_grh) {
+ qib_copy_sge(&qp->r_sge, &hdr->u.l.grh,
+ sizeof(struct ib_grh), 1);
+ wc.wc_flags |= IB_WC_GRH;
+ } else
+ qib_skip_sge(&qp->r_sge, sizeof(struct ib_grh), 1);
+ qib_copy_sge(&qp->r_sge, data, wc.byte_len - sizeof(struct ib_grh), 1);
+ qib_put_ss(&qp->r_sge);
+ if (!test_and_clear_bit(QIB_R_WRID_VALID, &qp->r_aflags))
+ return;
+ wc.wr_id = qp->r_wr_id;
+ wc.status = IB_WC_SUCCESS;
+ wc.opcode = IB_WC_RECV;
+ wc.vendor_err = 0;
+ wc.qp = &qp->ibqp;
+ wc.src_qp = src_qp;
+ wc.pkey_index = qp->ibqp.qp_type == IB_QPT_GSI ?
+ qib_lookup_pkey(ibp, be32_to_cpu(ohdr->bth[0])) : 0;
+ wc.slid = be16_to_cpu(hdr->lrh[3]);
+ wc.sl = (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF;
+ dlid = be16_to_cpu(hdr->lrh[1]);
+ /*
+ * Save the LMC lower bits if the destination LID is a unicast LID.
+ */
+ wc.dlid_path_bits = dlid >= QIB_MULTICAST_LID_BASE ? 0 :
+ dlid & ((1 << ppd_from_ibp(ibp)->lmc) - 1);
+ wc.port_num = qp->port_num;
+ /* Signal completion event if the solicited bit is set. */
+ qib_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+ (ohdr->bth[0] &
+ cpu_to_be32(IB_BTH_SOLICITED)) != 0);
+ return;
+
+drop:
+ ibp->n_pkt_drops++;
+}
diff --git a/drivers/infiniband/hw/qib/qib_user_pages.c b/drivers/infiniband/hw/qib/qib_user_pages.c
new file mode 100644
index 000000000..74f90b261
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_user_pages.c
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mm.h>
+#include <linux/device.h>
+
+#include "qib.h"
+
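+/*
+ * Unpin pages previously pinned with get_user_pages(), optionally
+ * marking them dirty before the reference is dropped.
+ */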
+static void __qib_release_user_pages(struct page **p, size_t num_pages,
+ int dirty)
+{
+ size_t i;
+
+ for (i = 0; i < num_pages; i++) {
+ if (dirty)
+ set_page_dirty_lock(p[i]);
+ put_page(p[i]);
+ }
+}
+
+/*
+ * Call with current->mm->mmap_sem held.
+ */
+static int __qib_get_user_pages(unsigned long start_page, size_t num_pages,
+ struct page **p)
+{
+ unsigned long lock_limit;
+ size_t got;
+ int ret;
+
+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+ if (num_pages > lock_limit && !capable(CAP_IPC_LOCK)) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ for (got = 0; got < num_pages; got += ret) {
+ ret = get_user_pages(current, current->mm,
+ start_page + got * PAGE_SIZE,
+ num_pages - got, 1, 1,
+ p + got, NULL);
+ if (ret < 0)
+ goto bail_release;
+ }
+
+ current->mm->pinned_vm += num_pages;
+
+ ret = 0;
+ goto bail;
+
+bail_release:
+ __qib_release_user_pages(p, got, 0);
+bail:
+ return ret;
+}
+
+/**
+ * qib_map_page - a safety wrapper around pci_map_page()
+ *
+ * A dma_addr of all 0's is interpreted by the chip as "disabled".
+ * Unfortunately, it can also be a valid dma_addr returned on some
+ * architectures.
+ *
+ * The powerpc iommu assigns dma_addrs in ascending order, so we don't
+ * have to bother with retries or mapping a dummy page to ensure we
+ * don't just get the same mapping again.
+ *
+ * I'm sure we won't be so lucky with other IOMMUs, so FIXME.
+ */
+dma_addr_t qib_map_page(struct pci_dev *hwdev, struct page *page,
+ unsigned long offset, size_t size, int direction)
+{
+ dma_addr_t phys;
+
+ phys = pci_map_page(hwdev, page, offset, size, direction);
+
+ if (phys == 0) {
+ pci_unmap_page(hwdev, phys, size, direction);
+ phys = pci_map_page(hwdev, page, offset, size, direction);
+ /*
+ * FIXME: If we get 0 again, we should keep this page,
+ * map another, then free the 0 page.
+ */
+ }
+
+ return phys;
+}
+
+/**
+ * qib_get_user_pages - lock user pages into memory
+ * @start_page: the start page
+ * @num_pages: the number of pages
+ * @p: the output page structures
+ *
+ * This function takes a given start page (page aligned user virtual
+ * address) and pins it and the following specified number of pages. For
+ * now, num_pages is always 1, but that will probably change at some point
+ * (because caller is doing expected sends on a single virtually contiguous
+ * buffer, so we can do all pages at once).
+ */
+int qib_get_user_pages(unsigned long start_page, size_t num_pages,
+ struct page **p)
+{
+ int ret;
+
+ down_write(&current->mm->mmap_sem);
+
+ ret = __qib_get_user_pages(start_page, num_pages, p);
+
+ up_write(&current->mm->mmap_sem);
+
+ return ret;
+}
+
+void qib_release_user_pages(struct page **p, size_t num_pages)
+{
+ if (current->mm) /* during close after signal, mm can be NULL */
+ down_write(&current->mm->mmap_sem);
+
+ __qib_release_user_pages(p, num_pages, 1);
+
+ if (current->mm) {
+ current->mm->pinned_vm -= num_pages;
+ up_write(&current->mm->mmap_sem);
+ }
+}
diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.c b/drivers/infiniband/hw/qib/qib_user_sdma.c
new file mode 100644
index 000000000..3e0677c51
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_user_sdma.c
@@ -0,0 +1,1465 @@
+/*
+ * Copyright (c) 2007, 2008, 2009 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/mm.h>
+#include <linux/types.h>
+#include <linux/device.h>
+#include <linux/dmapool.h>
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/highmem.h>
+#include <linux/io.h>
+#include <linux/uio.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+
+#include "qib.h"
+#include "qib_user_sdma.h"
+
+/* minimum size of header */
+#define QIB_USER_SDMA_MIN_HEADER_LENGTH 64
+/* expected size of headers (for dma_pool) */
+#define QIB_USER_SDMA_EXP_HEADER_LENGTH 64
+/* attempt to drain the queue for 5 secs */
+#define QIB_USER_SDMA_DRAIN_TIMEOUT 250
+
+/*
+ * Track how many times a process opens this driver.
+ */
+static struct rb_root qib_user_sdma_rb_root = RB_ROOT;
+
+struct qib_user_sdma_rb_node {
+ struct rb_node node;
+ int refcount;
+ pid_t pid;
+};
+
+struct qib_user_sdma_pkt {
+ struct list_head list; /* list element */
+
+ u8 tiddma; /* if this is NEW tid-sdma */
+ u8 largepkt; /* this is large pkt from kmalloc */
+ u16 frag_size; /* frag size used by PSM */
+ u16 index; /* last header index or push index */
+ u16 naddr; /* dimension of addr (1..3) ... */
+ u16 addrlimit; /* addr array size */
+ u16 tidsmidx; /* current tidsm index */
+ u16 tidsmcount; /* tidsm array item count */
+ u16 payload_size; /* payload size so far for header */
+ u32 bytes_togo; /* bytes for processing */
+ u32 counter; /* sdma pkts queued counter for this entry */
+ struct qib_tid_session_member *tidsm; /* tid session member array */
+ struct qib_user_sdma_queue *pq; /* which pq this pkt belongs to */
+ u64 added; /* global descq number of entries */
+
+ struct {
+ u16 offset; /* offset for kvaddr, addr */
+ u16 length; /* length in page */
+ u16 first_desc; /* first desc */
+ u16 last_desc; /* last desc */
+ u16 put_page; /* should we put_page? */
+ u16 dma_mapped; /* is page dma_mapped? */
+ u16 dma_length; /* for dma_unmap_page() */
+ u16 padding;
+ struct page *page; /* may be NULL (coherent mem) */
+ void *kvaddr; /* FIXME: only for pio hack */
+ dma_addr_t addr;
+ } addr[4]; /* max pages, any more and we coalesce */
+};
+
+struct qib_user_sdma_queue {
+ /*
+ * Packets sent to the dma engine are queued on this list head;
+ * the elements of this list are struct qib_user_sdma_pkt...
+ */
+ struct list_head sent;
+
+ /*
+ * Because the above list is accessed from both process context and
+ * the interrupt path, it needs a spinlock.
+ */
+ spinlock_t sent_lock ____cacheline_aligned_in_smp;
+
+ /* headers with expected length are allocated from here... */
+ char header_cache_name[64];
+ struct dma_pool *header_cache;
+
+ /* packets are allocated from the slab cache... */
+ char pkt_slab_name[64];
+ struct kmem_cache *pkt_slab;
+
+ /* as packets go on the queued queue, they are counted... */
+ u32 counter;
+ u32 sent_counter;
+ /* pending packets, not sending yet */
+ u32 num_pending;
+ /* sending packets, not complete yet */
+ u32 num_sending;
+ /* global descq number of entry of last sending packet */
+ u64 added;
+
+ /* dma page table */
+ struct rb_root dma_pages_root;
+
+ struct qib_user_sdma_rb_node *sdma_rb_node;
+
+ /* protect everything above... */
+ struct mutex lock;
+};
+
+static struct qib_user_sdma_rb_node *
+qib_user_sdma_rb_search(struct rb_root *root, pid_t pid)
+{
+ struct qib_user_sdma_rb_node *sdma_rb_node;
+ struct rb_node *node = root->rb_node;
+
+ while (node) {
+ sdma_rb_node = container_of(node,
+ struct qib_user_sdma_rb_node, node);
+ if (pid < sdma_rb_node->pid)
+ node = node->rb_left;
+ else if (pid > sdma_rb_node->pid)
+ node = node->rb_right;
+ else
+ return sdma_rb_node;
+ }
+ return NULL;
+}
+
+static int
+qib_user_sdma_rb_insert(struct rb_root *root, struct qib_user_sdma_rb_node *new)
+{
+ struct rb_node **node = &(root->rb_node);
+ struct rb_node *parent = NULL;
+ struct qib_user_sdma_rb_node *got;
+
+ while (*node) {
+ got = container_of(*node, struct qib_user_sdma_rb_node, node);
+ parent = *node;
+ if (new->pid < got->pid)
+ node = &((*node)->rb_left);
+ else if (new->pid > got->pid)
+ node = &((*node)->rb_right);
+ else
+ return 0;
+ }
+
+ rb_link_node(&new->node, parent, node);
+ rb_insert_color(&new->node, root);
+ return 1;
+}
+
+struct qib_user_sdma_queue *
+qib_user_sdma_queue_create(struct device *dev, int unit, int ctxt, int sctxt)
+{
+ struct qib_user_sdma_queue *pq =
+ kmalloc(sizeof(struct qib_user_sdma_queue), GFP_KERNEL);
+ struct qib_user_sdma_rb_node *sdma_rb_node;
+
+ if (!pq)
+ goto done;
+
+ pq->counter = 0;
+ pq->sent_counter = 0;
+ pq->num_pending = 0;
+ pq->num_sending = 0;
+ pq->added = 0;
+ pq->sdma_rb_node = NULL;
+
+ INIT_LIST_HEAD(&pq->sent);
+ spin_lock_init(&pq->sent_lock);
+ mutex_init(&pq->lock);
+
+ snprintf(pq->pkt_slab_name, sizeof(pq->pkt_slab_name),
+ "qib-user-sdma-pkts-%u-%02u.%02u", unit, ctxt, sctxt);
+ pq->pkt_slab = kmem_cache_create(pq->pkt_slab_name,
+ sizeof(struct qib_user_sdma_pkt),
+ 0, 0, NULL);
+
+ if (!pq->pkt_slab)
+ goto err_kfree;
+
+ snprintf(pq->header_cache_name, sizeof(pq->header_cache_name),
+ "qib-user-sdma-headers-%u-%02u.%02u", unit, ctxt, sctxt);
+ pq->header_cache = dma_pool_create(pq->header_cache_name,
+ dev,
+ QIB_USER_SDMA_EXP_HEADER_LENGTH,
+ 4, 0);
+ if (!pq->header_cache)
+ goto err_slab;
+
+ pq->dma_pages_root = RB_ROOT;
+
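+ /*
+ * Each pid gets a single refcounted rb node shared by all of its
+ * queues; qib_user_sdma_push_pkts() later uses refcount > 1 to
+ * detect that this process has opened more than one queue.
+ */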
+ sdma_rb_node = qib_user_sdma_rb_search(&qib_user_sdma_rb_root,
+ current->pid);
+ if (sdma_rb_node) {
+ sdma_rb_node->refcount++;
+ } else {
+ int ret;
+
+ sdma_rb_node = kmalloc(sizeof(
+ struct qib_user_sdma_rb_node), GFP_KERNEL);
+ if (!sdma_rb_node)
+ goto err_rb;
+
+ sdma_rb_node->refcount = 1;
+ sdma_rb_node->pid = current->pid;
+
+ ret = qib_user_sdma_rb_insert(&qib_user_sdma_rb_root,
+ sdma_rb_node);
+ BUG_ON(ret == 0);
+ }
+ pq->sdma_rb_node = sdma_rb_node;
+
+ goto done;
+
+err_rb:
+ dma_pool_destroy(pq->header_cache);
+err_slab:
+ kmem_cache_destroy(pq->pkt_slab);
+err_kfree:
+ kfree(pq);
+ pq = NULL;
+
+done:
+ return pq;
+}
+
+static void qib_user_sdma_init_frag(struct qib_user_sdma_pkt *pkt,
+ int i, u16 offset, u16 len,
+ u16 first_desc, u16 last_desc,
+ u16 put_page, u16 dma_mapped,
+ struct page *page, void *kvaddr,
+ dma_addr_t dma_addr, u16 dma_length)
+{
+ pkt->addr[i].offset = offset;
+ pkt->addr[i].length = len;
+ pkt->addr[i].first_desc = first_desc;
+ pkt->addr[i].last_desc = last_desc;
+ pkt->addr[i].put_page = put_page;
+ pkt->addr[i].dma_mapped = dma_mapped;
+ pkt->addr[i].page = page;
+ pkt->addr[i].kvaddr = kvaddr;
+ pkt->addr[i].addr = dma_addr;
+ pkt->addr[i].dma_length = dma_length;
+}
+
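+/*
+ * Headers of the expected size come from the dma pool and are already
+ * dma mapped; other sizes fall back to kmalloc with *dma_addr set to 0
+ * so the caller knows to map the buffer later.
+ */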
+static void *qib_user_sdma_alloc_header(struct qib_user_sdma_queue *pq,
+ size_t len, dma_addr_t *dma_addr)
+{
+ void *hdr;
+
+ if (len == QIB_USER_SDMA_EXP_HEADER_LENGTH)
+ hdr = dma_pool_alloc(pq->header_cache, GFP_KERNEL,
+ dma_addr);
+ else
+ hdr = NULL;
+
+ if (!hdr) {
+ hdr = kmalloc(len, GFP_KERNEL);
+ if (!hdr)
+ return NULL;
+
+ *dma_addr = 0;
+ }
+
+ return hdr;
+}
+
+static int qib_user_sdma_page_to_frags(const struct qib_devdata *dd,
+ struct qib_user_sdma_queue *pq,
+ struct qib_user_sdma_pkt *pkt,
+ struct page *page, u16 put,
+ u16 offset, u16 len, void *kvaddr)
+{
+ __le16 *pbc16;
+ void *pbcvaddr;
+ struct qib_message_header *hdr;
+ u16 newlen, pbclen, lastdesc, dma_mapped;
+ u32 vcto;
+ union qib_seqnum seqnum;
+ dma_addr_t pbcdaddr;
+ dma_addr_t dma_addr =
+ dma_map_page(&dd->pcidev->dev,
+ page, offset, len, DMA_TO_DEVICE);
+ int ret = 0;
+
+ if (dma_mapping_error(&dd->pcidev->dev, dma_addr)) {
+ /*
+ * dma mapping error: the pkt does not own this page yet,
+ * so release the page here and let the caller ignore it.
+ */
+ if (put) {
+ put_page(page);
+ } else {
+ /* coalesce case */
+ kunmap(page);
+ __free_page(page);
+ }
+ ret = -ENOMEM;
+ goto done;
+ }
+ offset = 0;
+ dma_mapped = 1;
+
+
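+ /*
+ * Each pass through this label carves one descriptor-sized fragment
+ * out of the current page, bounded by the remaining tid page length
+ * (for tid-sdma), the frag size, and the bytes still to go.
+ */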
+next_fragment:
+
+ /*
+ * In tid-sdma, the transfer length is restricted by
+ * receiver side current tid page length.
+ */
+ if (pkt->tiddma && len > pkt->tidsm[pkt->tidsmidx].length)
+ newlen = pkt->tidsm[pkt->tidsmidx].length;
+ else
+ newlen = len;
+
+ /*
+ * Then the transfer length is restricted by the MTU.
+ * The last descriptor flag is set when:
+ * 1. the current packet reaches its frag size,
+ * 2. the current tid page is done (tid-sdma), or
+ * 3. there are no more bytes to go (plain sdma).
+ */
+ lastdesc = 0;
+ if ((pkt->payload_size + newlen) >= pkt->frag_size) {
+ newlen = pkt->frag_size - pkt->payload_size;
+ lastdesc = 1;
+ } else if (pkt->tiddma) {
+ if (newlen == pkt->tidsm[pkt->tidsmidx].length)
+ lastdesc = 1;
+ } else {
+ if (newlen == pkt->bytes_togo)
+ lastdesc = 1;
+ }
+
+ /* fill the next fragment in this page */
+ qib_user_sdma_init_frag(pkt, pkt->naddr, /* index */
+ offset, newlen, /* offset, len */
+ 0, lastdesc, /* first last desc */
+ put, dma_mapped, /* put page, dma mapped */
+ page, kvaddr, /* struct page, virt addr */
+ dma_addr, len); /* dma addr, dma length */
+ pkt->bytes_togo -= newlen;
+ pkt->payload_size += newlen;
+ pkt->naddr++;
+ if (pkt->naddr == pkt->addrlimit) {
+ ret = -EFAULT;
+ goto done;
+ }
+
+ /* If there are no more bytes to go (lastdesc == 1). */
+ if (pkt->bytes_togo == 0) {
+ /* The packet is done; the header is not dma mapped yet,
+ * so it must be from kmalloc. */
+ if (!pkt->addr[pkt->index].addr) {
+ pkt->addr[pkt->index].addr =
+ dma_map_single(&dd->pcidev->dev,
+ pkt->addr[pkt->index].kvaddr,
+ pkt->addr[pkt->index].dma_length,
+ DMA_TO_DEVICE);
+ if (dma_mapping_error(&dd->pcidev->dev,
+ pkt->addr[pkt->index].addr)) {
+ ret = -ENOMEM;
+ goto done;
+ }
+ pkt->addr[pkt->index].dma_mapped = 1;
+ }
+
+ goto done;
+ }
+
+ /* If tid-sdma, advance tid info. */
+ if (pkt->tiddma) {
+ pkt->tidsm[pkt->tidsmidx].length -= newlen;
+ if (pkt->tidsm[pkt->tidsmidx].length) {
+ pkt->tidsm[pkt->tidsmidx].offset += newlen;
+ } else {
+ pkt->tidsmidx++;
+ if (pkt->tidsmidx == pkt->tidsmcount) {
+ ret = -EFAULT;
+ goto done;
+ }
+ }
+ }
+
+ /*
+ * If this is NOT the last descriptor (newlen == len), the
+ * current packet is not done yet, but the current send-side
+ * page is done.
+ */
+ if (lastdesc == 0)
+ goto done;
+
+ /*
+ * When running this driver under PSM with a message size that
+ * fits into one transfer unit, it is not possible to reach this
+ * point; otherwise it is a bug.
+ */
+
+ /*
+ * Since the current packet is done and there are more bytes to
+ * go, create a new sdma header by copying the previous one,
+ * then modify both.
+ */
+ pbclen = pkt->addr[pkt->index].length;
+ pbcvaddr = qib_user_sdma_alloc_header(pq, pbclen, &pbcdaddr);
+ if (!pbcvaddr) {
+ ret = -ENOMEM;
+ goto done;
+ }
+ /* Copy the previous sdma header to new sdma header */
+ pbc16 = (__le16 *)pkt->addr[pkt->index].kvaddr;
+ memcpy(pbcvaddr, pbc16, pbclen);
+
+ /* Modify the previous sdma header */
+ hdr = (struct qib_message_header *)&pbc16[4];
+
+ /* New pbc length */
+ pbc16[0] = cpu_to_le16(le16_to_cpu(pbc16[0])-(pkt->bytes_togo>>2));
+
+ /* New packet length */
+ hdr->lrh[2] = cpu_to_be16(le16_to_cpu(pbc16[0]));
+
+ if (pkt->tiddma) {
+ /* turn on the header suppression */
+ hdr->iph.pkt_flags =
+ cpu_to_le16(le16_to_cpu(hdr->iph.pkt_flags)|0x2);
+ /* turn off ACK_REQ: 0x04 and EXPECTED_DONE: 0x20 */
+ hdr->flags &= ~(0x04|0x20);
+ } else {
+ /* turn off extra bytes: 20-21 bits */
+ hdr->bth[0] = cpu_to_be32(be32_to_cpu(hdr->bth[0])&0xFFCFFFFF);
+ /* turn off ACK_REQ: 0x04 */
+ hdr->flags &= ~(0x04);
+ }
+
+ /* New kdeth checksum */
+ vcto = le32_to_cpu(hdr->iph.ver_ctxt_tid_offset);
+ hdr->iph.chksum = cpu_to_le16(QIB_LRH_BTH +
+ be16_to_cpu(hdr->lrh[2]) -
+ ((vcto>>16)&0xFFFF) - (vcto&0xFFFF) -
+ le16_to_cpu(hdr->iph.pkt_flags));
+
+ /* The packet is done; the header is not dma mapped yet,
+ * so it must be from kmalloc. */
+ if (!pkt->addr[pkt->index].addr) {
+ pkt->addr[pkt->index].addr =
+ dma_map_single(&dd->pcidev->dev,
+ pkt->addr[pkt->index].kvaddr,
+ pkt->addr[pkt->index].dma_length,
+ DMA_TO_DEVICE);
+ if (dma_mapping_error(&dd->pcidev->dev,
+ pkt->addr[pkt->index].addr)) {
+ ret = -ENOMEM;
+ goto done;
+ }
+ pkt->addr[pkt->index].dma_mapped = 1;
+ }
+
+ /* Modify the new sdma header */
+ pbc16 = (__le16 *)pbcvaddr;
+ hdr = (struct qib_message_header *)&pbc16[4];
+
+ /* New pbc length */
+ pbc16[0] = cpu_to_le16(le16_to_cpu(pbc16[0])-(pkt->payload_size>>2));
+
+ /* New packet length */
+ hdr->lrh[2] = cpu_to_be16(le16_to_cpu(pbc16[0]));
+
+ if (pkt->tiddma) {
+ /* Set new tid and offset for new sdma header */
+ hdr->iph.ver_ctxt_tid_offset = cpu_to_le32(
+ (le32_to_cpu(hdr->iph.ver_ctxt_tid_offset)&0xFF000000) +
+ (pkt->tidsm[pkt->tidsmidx].tid<<QLOGIC_IB_I_TID_SHIFT) +
+ (pkt->tidsm[pkt->tidsmidx].offset>>2));
+ } else {
+ /* Middle protocol new packet offset */
+ hdr->uwords[2] += pkt->payload_size;
+ }
+
+ /* New kdeth checksum */
+ vcto = le32_to_cpu(hdr->iph.ver_ctxt_tid_offset);
+ hdr->iph.chksum = cpu_to_le16(QIB_LRH_BTH +
+ be16_to_cpu(hdr->lrh[2]) -
+ ((vcto>>16)&0xFFFF) - (vcto&0xFFFF) -
+ le16_to_cpu(hdr->iph.pkt_flags));
+
+ /* Next sequence number in new sdma header */
+ seqnum.val = be32_to_cpu(hdr->bth[2]);
+ if (pkt->tiddma)
+ seqnum.seq++;
+ else
+ seqnum.pkt++;
+ hdr->bth[2] = cpu_to_be32(seqnum.val);
+
+ /* Init new sdma header. */
+ qib_user_sdma_init_frag(pkt, pkt->naddr, /* index */
+ 0, pbclen, /* offset, len */
+ 1, 0, /* first last desc */
+ 0, 0, /* put page, dma mapped */
+ NULL, pbcvaddr, /* struct page, virt addr */
+ pbcdaddr, pbclen); /* dma addr, dma length */
+ pkt->index = pkt->naddr;
+ pkt->payload_size = 0;
+ pkt->naddr++;
+ if (pkt->naddr == pkt->addrlimit) {
+ ret = -EFAULT;
+ goto done;
+ }
+
+ /* Prepare for next fragment in this page */
+ if (newlen != len) {
+ if (dma_mapped) {
+ put = 0;
+ dma_mapped = 0;
+ page = NULL;
+ kvaddr = NULL;
+ }
+ len -= newlen;
+ offset += newlen;
+
+ goto next_fragment;
+ }
+
+done:
+ return ret;
+}
+
+/* We have too many pages in the iovec; coalesce them into a single page. */
+static int qib_user_sdma_coalesce(const struct qib_devdata *dd,
+ struct qib_user_sdma_queue *pq,
+ struct qib_user_sdma_pkt *pkt,
+ const struct iovec *iov,
+ unsigned long niov)
+{
+ int ret = 0;
+ struct page *page = alloc_page(GFP_KERNEL);
+ void *mpage_save;
+ char *mpage;
+ int i;
+ int len = 0;
+
+ if (!page) {
+ ret = -ENOMEM;
+ goto done;
+ }
+
+ mpage = kmap(page);
+ mpage_save = mpage;
+ for (i = 0; i < niov; i++) {
+ int cfur;
+
+ cfur = copy_from_user(mpage,
+ iov[i].iov_base, iov[i].iov_len);
+ if (cfur) {
+ ret = -EFAULT;
+ goto free_unmap;
+ }
+
+ mpage += iov[i].iov_len;
+ len += iov[i].iov_len;
+ }
+
+ ret = qib_user_sdma_page_to_frags(dd, pq, pkt,
+ page, 0, 0, len, mpage_save);
+ goto done;
+
+free_unmap:
+ kunmap(page);
+ __free_page(page);
+done:
+ return ret;
+}
+
+/*
+ * How many pages in this iovec element?
+ */
+static int qib_user_sdma_num_pages(const struct iovec *iov)
+{
+ const unsigned long addr = (unsigned long) iov->iov_base;
+ const unsigned long len = iov->iov_len;
+ const unsigned long spage = addr & PAGE_MASK;
+ const unsigned long epage = (addr + len - 1) & PAGE_MASK;
+
+ return 1 + ((epage - spage) >> PAGE_SHIFT);
+}
+
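+/*
+ * Release everything held by one fragment of a packet: dma unmap it if
+ * mapped, then put or free the user page, or free the header buffer
+ * (dma pool or kmalloc) for header fragments.
+ */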
+static void qib_user_sdma_free_pkt_frag(struct device *dev,
+ struct qib_user_sdma_queue *pq,
+ struct qib_user_sdma_pkt *pkt,
+ int frag)
+{
+ const int i = frag;
+
+ if (pkt->addr[i].page) {
+ /* only user data has page */
+ if (pkt->addr[i].dma_mapped)
+ dma_unmap_page(dev,
+ pkt->addr[i].addr,
+ pkt->addr[i].dma_length,
+ DMA_TO_DEVICE);
+
+ if (pkt->addr[i].kvaddr)
+ kunmap(pkt->addr[i].page);
+
+ if (pkt->addr[i].put_page)
+ put_page(pkt->addr[i].page);
+ else
+ __free_page(pkt->addr[i].page);
+ } else if (pkt->addr[i].kvaddr) {
+ /* for headers */
+ if (pkt->addr[i].dma_mapped) {
+ /* from kmalloc & dma mapped */
+ dma_unmap_single(dev,
+ pkt->addr[i].addr,
+ pkt->addr[i].dma_length,
+ DMA_TO_DEVICE);
+ kfree(pkt->addr[i].kvaddr);
+ } else if (pkt->addr[i].addr) {
+ /* free coherent mem from cache... */
+ dma_pool_free(pq->header_cache,
+ pkt->addr[i].kvaddr, pkt->addr[i].addr);
+ } else {
+ /* from kmalloc but not dma mapped */
+ kfree(pkt->addr[i].kvaddr);
+ }
+ }
+}
+
+/* Pin the user pages for this packet; returns 0 on success or a negative errno. */
+static int qib_user_sdma_pin_pages(const struct qib_devdata *dd,
+ struct qib_user_sdma_queue *pq,
+ struct qib_user_sdma_pkt *pkt,
+ unsigned long addr, int tlen, int npages)
+{
+ struct page *pages[8];
+ int i, j;
+ int ret = 0;
+
+ while (npages) {
+ if (npages > 8)
+ j = 8;
+ else
+ j = npages;
+
+ ret = get_user_pages_fast(addr, j, 0, pages);
+ if (ret != j) {
+ i = 0;
+ j = ret;
+ ret = -ENOMEM;
+ goto free_pages;
+ }
+
+ for (i = 0; i < j; i++) {
+ /* map the pages... */
+ unsigned long fofs = addr & ~PAGE_MASK;
+ int flen = ((fofs + tlen) > PAGE_SIZE) ?
+ (PAGE_SIZE - fofs) : tlen;
+
+ ret = qib_user_sdma_page_to_frags(dd, pq, pkt,
+ pages[i], 1, fofs, flen, NULL);
+ if (ret < 0) {
+ /* The current page has already been
+ * taken care of inside the above call.
+ */
+ i++;
+ goto free_pages;
+ }
+
+ addr += flen;
+ tlen -= flen;
+ }
+
+ npages -= j;
+ }
+
+ goto done;
+
+ /* On error, release all pages not yet managed by pkt. */
+free_pages:
+ while (i < j)
+ put_page(pages[i++]);
+
+done:
+ return ret;
+}
+
+static int qib_user_sdma_pin_pkt(const struct qib_devdata *dd,
+ struct qib_user_sdma_queue *pq,
+ struct qib_user_sdma_pkt *pkt,
+ const struct iovec *iov,
+ unsigned long niov)
+{
+ int ret = 0;
+ unsigned long idx;
+
+ for (idx = 0; idx < niov; idx++) {
+ const int npages = qib_user_sdma_num_pages(iov + idx);
+ const unsigned long addr = (unsigned long) iov[idx].iov_base;
+
+ ret = qib_user_sdma_pin_pages(dd, pq, pkt, addr,
+ iov[idx].iov_len, npages);
+ if (ret < 0)
+ goto free_pkt;
+ }
+
+ goto done;
+
+free_pkt:
+ /* we need to ignore the first entry here */
+ for (idx = 1; idx < pkt->naddr; idx++)
+ qib_user_sdma_free_pkt_frag(&dd->pcidev->dev, pq, pkt, idx);
+
+ /* We need to dma unmap the first entry to restore it to its
+ * original state so that the caller can free the memory on
+ * error; the caller does not know whether it was dma mapped. */
+ if (pkt->addr[0].dma_mapped) {
+ dma_unmap_single(&dd->pcidev->dev,
+ pkt->addr[0].addr,
+ pkt->addr[0].dma_length,
+ DMA_TO_DEVICE);
+ pkt->addr[0].addr = 0;
+ pkt->addr[0].dma_mapped = 0;
+ }
+
+done:
+ return ret;
+}
+
+static int qib_user_sdma_init_payload(const struct qib_devdata *dd,
+ struct qib_user_sdma_queue *pq,
+ struct qib_user_sdma_pkt *pkt,
+ const struct iovec *iov,
+ unsigned long niov, int npages)
+{
+ int ret = 0;
+
+ if (pkt->frag_size == pkt->bytes_togo &&
+ npages >= ARRAY_SIZE(pkt->addr))
+ ret = qib_user_sdma_coalesce(dd, pq, pkt, iov, niov);
+ else
+ ret = qib_user_sdma_pin_pkt(dd, pq, pkt, iov, niov);
+
+ return ret;
+}
+
+/* Free a packet list and all resources held by its packets. */
+static void qib_user_sdma_free_pkt_list(struct device *dev,
+ struct qib_user_sdma_queue *pq,
+ struct list_head *list)
+{
+ struct qib_user_sdma_pkt *pkt, *pkt_next;
+
+ list_for_each_entry_safe(pkt, pkt_next, list, list) {
+ int i;
+
+ for (i = 0; i < pkt->naddr; i++)
+ qib_user_sdma_free_pkt_frag(dev, pq, pkt, i);
+
+ if (pkt->largepkt)
+ kfree(pkt);
+ else
+ kmem_cache_free(pq->pkt_slab, pkt);
+ }
+ INIT_LIST_HEAD(list);
+}
+
+/*
+ * copy headers, coalesce etc -- pq->lock must be held
+ *
+ * We queue all the packets on 'list' and return the number of iovec
+ * entries consumed.  'list' must be empty initially, since we clean
+ * it on error...
+ */
+static int qib_user_sdma_queue_pkts(const struct qib_devdata *dd,
+ struct qib_pportdata *ppd,
+ struct qib_user_sdma_queue *pq,
+ const struct iovec *iov,
+ unsigned long niov,
+ struct list_head *list,
+ int *maxpkts, int *ndesc)
+{
+ unsigned long idx = 0;
+ int ret = 0;
+ int npkts = 0;
+ __le32 *pbc;
+ dma_addr_t dma_addr;
+ struct qib_user_sdma_pkt *pkt = NULL;
+ size_t len;
+ size_t nw;
+ u32 counter = pq->counter;
+ u16 frag_size;
+
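+ /*
+ * Each iteration consumes one header iovec plus the payload iovecs
+ * that the PBC word count says belong to it, and builds one
+ * qib_user_sdma_pkt for them.
+ */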
+ while (idx < niov && npkts < *maxpkts) {
+ const unsigned long addr = (unsigned long) iov[idx].iov_base;
+ const unsigned long idx_save = idx;
+ unsigned pktnw;
+ unsigned pktnwc;
+ int nfrags = 0;
+ int npages = 0;
+ int bytes_togo = 0;
+ int tiddma = 0;
+ int cfur;
+
+ len = iov[idx].iov_len;
+ nw = len >> 2;
+
+ if (len < QIB_USER_SDMA_MIN_HEADER_LENGTH ||
+ len > PAGE_SIZE || len & 3 || addr & 3) {
+ ret = -EINVAL;
+ goto free_list;
+ }
+
+ pbc = qib_user_sdma_alloc_header(pq, len, &dma_addr);
+ if (!pbc) {
+ ret = -ENOMEM;
+ goto free_list;
+ }
+
+ cfur = copy_from_user(pbc, iov[idx].iov_base, len);
+ if (cfur) {
+ ret = -EFAULT;
+ goto free_pbc;
+ }
+
+ /*
+ * This assignment looks a bit strange because the pbc counts
+ * the number of 32 bit words in the full packet _except_ the
+ * first word of the pbc itself...
+ */
+ pktnwc = nw - 1;
+
+ /*
+ * The pktnw computation yields the number of 32 bit words
+ * that the caller has indicated in the PBC.  Note that
+ * this is one less than the total number of words that
+ * go to the send DMA engine, as the first 32 bit word
+ * of the PBC itself is not counted.  Armed with this count,
+ * we can verify that the packet is consistent with the
+ * iovec lengths.
+ */
+ pktnw = le32_to_cpu(*pbc) & 0xFFFF;
+ if (pktnw < pktnwc) {
+ ret = -EINVAL;
+ goto free_pbc;
+ }
+
+ idx++;
+ while (pktnwc < pktnw && idx < niov) {
+ const size_t slen = iov[idx].iov_len;
+ const unsigned long faddr =
+ (unsigned long) iov[idx].iov_base;
+
+ if (slen & 3 || faddr & 3 || !slen) {
+ ret = -EINVAL;
+ goto free_pbc;
+ }
+
+ npages += qib_user_sdma_num_pages(&iov[idx]);
+
+ bytes_togo += slen;
+ pktnwc += slen >> 2;
+ idx++;
+ nfrags++;
+ }
+
+ if (pktnwc != pktnw) {
+ ret = -EINVAL;
+ goto free_pbc;
+ }
+
+ frag_size = ((le32_to_cpu(*pbc))>>16) & 0xFFFF;
+ if (((frag_size ? frag_size : bytes_togo) + len) >
+ ppd->ibmaxlen) {
+ ret = -EINVAL;
+ goto free_pbc;
+ }
+
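+ /*
+ * A non-zero frag_size means the payload will be split into
+ * multiple hardware packets of at most frag_size bytes, so a
+ * variable-size pkt is kmalloc'ed with enough addr[] slots (plus
+ * room for the tid session member array when doing tid-sdma);
+ * otherwise a small fixed-size pkt comes from the slab cache.
+ */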
+ if (frag_size) {
+ int pktsize, tidsmsize, n;
+
+ n = npages*((2*PAGE_SIZE/frag_size)+1);
+ pktsize = sizeof(*pkt) + sizeof(pkt->addr[0])*n;
+
+ /*
+ * Determine if this is tid-sdma or just sdma.
+ */
+ tiddma = (((le32_to_cpu(pbc[7])>>
+ QLOGIC_IB_I_TID_SHIFT)&
+ QLOGIC_IB_I_TID_MASK) !=
+ QLOGIC_IB_I_TID_MASK);
+
+ if (tiddma)
+ tidsmsize = iov[idx].iov_len;
+ else
+ tidsmsize = 0;
+
+ pkt = kmalloc(pktsize+tidsmsize, GFP_KERNEL);
+ if (!pkt) {
+ ret = -ENOMEM;
+ goto free_pbc;
+ }
+ pkt->largepkt = 1;
+ pkt->frag_size = frag_size;
+ pkt->addrlimit = n + ARRAY_SIZE(pkt->addr);
+
+ if (tiddma) {
+ char *tidsm = (char *)pkt + pktsize;
+
+ cfur = copy_from_user(tidsm,
+ iov[idx].iov_base, tidsmsize);
+ if (cfur) {
+ ret = -EFAULT;
+ goto free_pkt;
+ }
+ pkt->tidsm =
+ (struct qib_tid_session_member *)tidsm;
+ pkt->tidsmcount = tidsmsize/
+ sizeof(struct qib_tid_session_member);
+ pkt->tidsmidx = 0;
+ idx++;
+ }
+
+ /*
+ * The pbc 'fill1' field is borrowed to pass the frag size; we
+ * need to clear it after picking up the frag size, since the
+ * hardware requires this field to be zero.
+ */
+ *pbc = cpu_to_le32(le32_to_cpu(*pbc) & 0x0000FFFF);
+ } else {
+ pkt = kmem_cache_alloc(pq->pkt_slab, GFP_KERNEL);
+ if (!pkt) {
+ ret = -ENOMEM;
+ goto free_pbc;
+ }
+ pkt->largepkt = 0;
+ pkt->frag_size = bytes_togo;
+ pkt->addrlimit = ARRAY_SIZE(pkt->addr);
+ }
+ pkt->bytes_togo = bytes_togo;
+ pkt->payload_size = 0;
+ pkt->counter = counter;
+ pkt->tiddma = tiddma;
+
+ /* setup the first header */
+ qib_user_sdma_init_frag(pkt, 0, /* index */
+ 0, len, /* offset, len */
+ 1, 0, /* first last desc */
+ 0, 0, /* put page, dma mapped */
+ NULL, pbc, /* struct page, virt addr */
+ dma_addr, len); /* dma addr, dma length */
+ pkt->index = 0;
+ pkt->naddr = 1;
+
+ if (nfrags) {
+ ret = qib_user_sdma_init_payload(dd, pq, pkt,
+ iov + idx_save + 1,
+ nfrags, npages);
+ if (ret < 0)
+ goto free_pkt;
+ } else {
+ /* since there is no payload, mark the
+ * header as the last desc. */
+ pkt->addr[0].last_desc = 1;
+
+ if (dma_addr == 0) {
+ /*
+ * The header is not dma mapped yet;
+ * it must be from kmalloc.
+ */
+ dma_addr = dma_map_single(&dd->pcidev->dev,
+ pbc, len, DMA_TO_DEVICE);
+ if (dma_mapping_error(&dd->pcidev->dev,
+ dma_addr)) {
+ ret = -ENOMEM;
+ goto free_pkt;
+ }
+ pkt->addr[0].addr = dma_addr;
+ pkt->addr[0].dma_mapped = 1;
+ }
+ }
+
+ counter++;
+ npkts++;
+ pkt->pq = pq;
+ pkt->index = 0; /* reset index for push on hw */
+ *ndesc += pkt->naddr;
+
+ list_add_tail(&pkt->list, list);
+ }
+
+ *maxpkts = npkts;
+ ret = idx;
+ goto done;
+
+free_pkt:
+ if (pkt->largepkt)
+ kfree(pkt);
+ else
+ kmem_cache_free(pq->pkt_slab, pkt);
+free_pbc:
+ if (dma_addr)
+ dma_pool_free(pq->header_cache, pbc, dma_addr);
+ else
+ kfree(pbc);
+free_list:
+ qib_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, list);
+done:
+ return ret;
+}
+
+static void qib_user_sdma_set_complete_counter(struct qib_user_sdma_queue *pq,
+ u32 c)
+{
+ pq->sent_counter = c;
+}
+
+/* try to clean out queue -- needs pq->lock */
+static int qib_user_sdma_queue_clean(struct qib_pportdata *ppd,
+ struct qib_user_sdma_queue *pq)
+{
+ struct qib_devdata *dd = ppd->dd;
+ struct list_head free_list;
+ struct qib_user_sdma_pkt *pkt;
+ struct qib_user_sdma_pkt *pkt_prev;
+ unsigned long flags;
+ int ret = 0;
+
+ if (!pq->num_sending)
+ return 0;
+
+ INIT_LIST_HEAD(&free_list);
+
+ /*
+ * We need this spinlock here because the interrupt handler may
+ * modify this list in qib_user_sdma_send_desc(); we must also
+ * not be interrupted while holding it, or we would deadlock.
+ */
+ spin_lock_irqsave(&pq->sent_lock, flags);
+ list_for_each_entry_safe(pkt, pkt_prev, &pq->sent, list) {
+ s64 descd = ppd->sdma_descq_removed - pkt->added;
+
+ if (descd < 0)
+ break;
+
+ list_move_tail(&pkt->list, &free_list);
+
+ /* one more packet cleaned */
+ ret++;
+ pq->num_sending--;
+ }
+ spin_unlock_irqrestore(&pq->sent_lock, flags);
+
+ if (!list_empty(&free_list)) {
+ u32 counter;
+
+ pkt = list_entry(free_list.prev,
+ struct qib_user_sdma_pkt, list);
+ counter = pkt->counter;
+
+ qib_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list);
+ qib_user_sdma_set_complete_counter(pq, counter);
+ }
+
+ return ret;
+}
+
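+/*
+ * Tear down a user sdma queue: drop the per-pid rb node reference
+ * (freeing the node when it hits zero), then destroy the header dma
+ * pool, the packet slab cache and the queue itself.
+ */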
+void qib_user_sdma_queue_destroy(struct qib_user_sdma_queue *pq)
+{
+ if (!pq)
+ return;
+
+ pq->sdma_rb_node->refcount--;
+ if (pq->sdma_rb_node->refcount == 0) {
+ rb_erase(&pq->sdma_rb_node->node, &qib_user_sdma_rb_root);
+ kfree(pq->sdma_rb_node);
+ }
+ dma_pool_destroy(pq->header_cache);
+ kmem_cache_destroy(pq->pkt_slab);
+ kfree(pq);
+}
+
+/* clean descriptor queue, returns > 0 if some elements cleaned */
+static int qib_user_sdma_hwqueue_clean(struct qib_pportdata *ppd)
+{
+ int ret;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ppd->sdma_lock, flags);
+ ret = qib_sdma_make_progress(ppd);
+ spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+
+ return ret;
+}
+
+/* We're in close; drain packets so that we can clean up successfully... */
+void qib_user_sdma_queue_drain(struct qib_pportdata *ppd,
+ struct qib_user_sdma_queue *pq)
+{
+ struct qib_devdata *dd = ppd->dd;
+ unsigned long flags;
+ int i;
+
+ if (!pq)
+ return;
+
+ for (i = 0; i < QIB_USER_SDMA_DRAIN_TIMEOUT; i++) {
+ mutex_lock(&pq->lock);
+ if (!pq->num_pending && !pq->num_sending) {
+ mutex_unlock(&pq->lock);
+ break;
+ }
+ qib_user_sdma_hwqueue_clean(ppd);
+ qib_user_sdma_queue_clean(ppd, pq);
+ mutex_unlock(&pq->lock);
+ msleep(20);
+ }
+
+ if (pq->num_pending || pq->num_sending) {
+ struct qib_user_sdma_pkt *pkt;
+ struct qib_user_sdma_pkt *pkt_prev;
+ struct list_head free_list;
+
+ mutex_lock(&pq->lock);
+ spin_lock_irqsave(&ppd->sdma_lock, flags);
+ /*
+ * Since we hold sdma_lock, it is safe without sent_lock.
+ */
+ if (pq->num_pending) {
+ list_for_each_entry_safe(pkt, pkt_prev,
+ &ppd->sdma_userpending, list) {
+ if (pkt->pq == pq) {
+ list_move_tail(&pkt->list, &pq->sent);
+ pq->num_pending--;
+ pq->num_sending++;
+ }
+ }
+ }
+ spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+
+ qib_dev_err(dd, "user sdma lists not empty: forcing!\n");
+ INIT_LIST_HEAD(&free_list);
+ list_splice_init(&pq->sent, &free_list);
+ pq->num_sending = 0;
+ qib_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &free_list);
+ mutex_unlock(&pq->lock);
+ }
+}
+
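+/*
+ * Descriptor word 0 packs the low 32 bits of the DMA address, the
+ * generation, the dword count and the buffer offset; the helpers below
+ * set the first-descriptor (bit 12) and last-descriptor/head (bits 11
+ * and 13) flags on top of it.
+ */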
+static inline __le64 qib_sdma_make_desc0(u8 gen,
+ u64 addr, u64 dwlen, u64 dwoffset)
+{
+ return cpu_to_le64(/* SDmaPhyAddr[31:0] */
+ ((addr & 0xfffffffcULL) << 32) |
+ /* SDmaGeneration[1:0] */
+ ((gen & 3ULL) << 30) |
+ /* SDmaDwordCount[10:0] */
+ ((dwlen & 0x7ffULL) << 16) |
+ /* SDmaBufOffset[12:2] */
+ (dwoffset & 0x7ffULL));
+}
+
+static inline __le64 qib_sdma_make_first_desc0(__le64 descq)
+{
+ return descq | cpu_to_le64(1ULL << 12);
+}
+
+static inline __le64 qib_sdma_make_last_desc0(__le64 descq)
+{
+ /* last */ /* dma head */
+ return descq | cpu_to_le64(1ULL << 11 | 1ULL << 13);
+}
+
+static inline __le64 qib_sdma_make_desc1(u64 addr)
+{
+ /* SDmaPhyAddr[47:32] */
+ return cpu_to_le64(addr >> 32);
+}
+
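+/*
+ * Write the two 64-bit descriptor words for one fragment of a packet
+ * into the SDMA descriptor queue slot at 'tail', requesting an
+ * interrupt on the last descriptor if one is pending.
+ */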
+static void qib_user_sdma_send_frag(struct qib_pportdata *ppd,
+ struct qib_user_sdma_pkt *pkt, int idx,
+ unsigned ofs, u16 tail, u8 gen)
+{
+ const u64 addr = (u64) pkt->addr[idx].addr +
+ (u64) pkt->addr[idx].offset;
+ const u64 dwlen = (u64) pkt->addr[idx].length / 4;
+ __le64 *descqp;
+ __le64 descq0;
+
+ descqp = &ppd->sdma_descq[tail].qw[0];
+
+ descq0 = qib_sdma_make_desc0(gen, addr, dwlen, ofs);
+ if (pkt->addr[idx].first_desc)
+ descq0 = qib_sdma_make_first_desc0(descq0);
+ if (pkt->addr[idx].last_desc) {
+ descq0 = qib_sdma_make_last_desc0(descq0);
+ if (ppd->sdma_intrequest) {
+ descq0 |= cpu_to_le64(1ULL << 15);
+ ppd->sdma_intrequest = 0;
+ }
+ }
+
+ descqp[0] = descq0;
+ descqp[1] = qib_sdma_make_desc1(addr);
+}
+
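+/*
+ * Fill hardware descriptors from the packet list for as long as free
+ * descriptor space allows, moving fully-submitted packets to their
+ * queue's sent list and advancing the chip tail pointer when done.
+ */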
+void qib_user_sdma_send_desc(struct qib_pportdata *ppd,
+ struct list_head *pktlist)
+{
+ struct qib_devdata *dd = ppd->dd;
+ u16 nfree, nsent;
+ u16 tail, tail_c;
+ u8 gen, gen_c;
+
+ nfree = qib_sdma_descq_freecnt(ppd);
+ if (!nfree)
+ return;
+
+retry:
+ nsent = 0;
+ tail_c = tail = ppd->sdma_descq_tail;
+ gen_c = gen = ppd->sdma_generation;
+ while (!list_empty(pktlist)) {
+ struct qib_user_sdma_pkt *pkt =
+ list_entry(pktlist->next, struct qib_user_sdma_pkt,
+ list);
+ int i, j, c = 0;
+ unsigned ofs = 0;
+ u16 dtail = tail;
+
+ for (i = pkt->index; i < pkt->naddr && nfree; i++) {
+ qib_user_sdma_send_frag(ppd, pkt, i, ofs, tail, gen);
+ ofs += pkt->addr[i].length >> 2;
+
+ if (++tail == ppd->sdma_descq_cnt) {
+ tail = 0;
+ ++gen;
+ ppd->sdma_intrequest = 1;
+ } else if (tail == (ppd->sdma_descq_cnt>>1)) {
+ ppd->sdma_intrequest = 1;
+ }
+ nfree--;
+ if (pkt->addr[i].last_desc == 0)
+ continue;
+
+ /*
+ * If the packet is >= 2KB mtu equivalent, we
+ * have to use the large buffers, and have to
+ * mark each descriptor as part of a large
+ * buffer packet.
+ */
+ if (ofs > dd->piosize2kmax_dwords) {
+ for (j = pkt->index; j <= i; j++) {
+ ppd->sdma_descq[dtail].qw[0] |=
+ cpu_to_le64(1ULL << 14);
+ if (++dtail == ppd->sdma_descq_cnt)
+ dtail = 0;
+ }
+ }
+ c += i + 1 - pkt->index;
+ pkt->index = i + 1; /* index for next first */
+ tail_c = dtail = tail;
+ gen_c = gen;
+ ofs = 0; /* reset for next packet */
+ }
+
+ ppd->sdma_descq_added += c;
+ nsent += c;
+ if (pkt->index == pkt->naddr) {
+ pkt->added = ppd->sdma_descq_added;
+ pkt->pq->added = pkt->added;
+ pkt->pq->num_pending--;
+ spin_lock(&pkt->pq->sent_lock);
+ pkt->pq->num_sending++;
+ list_move_tail(&pkt->list, &pkt->pq->sent);
+ spin_unlock(&pkt->pq->sent_lock);
+ }
+ if (!nfree || (nsent<<2) > ppd->sdma_descq_cnt)
+ break;
+ }
+
+ /* advance the tail on the chip if necessary */
+ if (ppd->sdma_descq_tail != tail_c) {
+ ppd->sdma_generation = gen_c;
+ dd->f_sdma_update_tail(ppd, tail_c);
+ }
+
+ if (nfree && !list_empty(pktlist))
+ goto retry;
+}
+
+/* pq->lock must be held, get packets on the wire... */
+static int qib_user_sdma_push_pkts(struct qib_pportdata *ppd,
+ struct qib_user_sdma_queue *pq,
+ struct list_head *pktlist, int count)
+{
+ unsigned long flags;
+
+ if (unlikely(!(ppd->lflags & QIBL_LINKACTIVE)))
+ return -ECOMM;
+
+ /* non-blocking mode */
+ if (pq->sdma_rb_node->refcount > 1) {
+ spin_lock_irqsave(&ppd->sdma_lock, flags);
+ if (unlikely(!__qib_sdma_running(ppd))) {
+ spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+ return -ECOMM;
+ }
+ pq->num_pending += count;
+ list_splice_tail_init(pktlist, &ppd->sdma_userpending);
+ qib_user_sdma_send_desc(ppd, &ppd->sdma_userpending);
+ spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+ return 0;
+ }
+
+ /* In this case, descriptors from this process are not linked to
+ * the ppd pending queue and the interrupt handler won't update
+ * this process, so it is OK to modify directly without taking
+ * the sdma lock.
+ */
+
+
+ pq->num_pending += count;
+ /*
+ * Blocking mode for a single-rail process: we must release and
+ * regain sdma_lock to give other processes a chance to make
+ * progress.  This is important for performance.
+ */
+ do {
+ spin_lock_irqsave(&ppd->sdma_lock, flags);
+ if (unlikely(!__qib_sdma_running(ppd))) {
+ spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+ return -ECOMM;
+ }
+ qib_user_sdma_send_desc(ppd, pktlist);
+ if (!list_empty(pktlist))
+ qib_sdma_make_progress(ppd);
+ spin_unlock_irqrestore(&ppd->sdma_lock, flags);
+ } while (!list_empty(pktlist));
+
+ return 0;
+}
+
+int qib_user_sdma_writev(struct qib_ctxtdata *rcd,
+ struct qib_user_sdma_queue *pq,
+ const struct iovec *iov,
+ unsigned long dim)
+{
+ struct qib_devdata *dd = rcd->dd;
+ struct qib_pportdata *ppd = rcd->ppd;
+ int ret = 0;
+ struct list_head list;
+ int npkts = 0;
+
+ INIT_LIST_HEAD(&list);
+
+ mutex_lock(&pq->lock);
+
+ /* why not -ECOMM like qib_user_sdma_push_pkts() below? */
+ if (!qib_sdma_running(ppd))
+ goto done_unlock;
+
+ /* if I have packets not complete yet */
+ if (pq->added > ppd->sdma_descq_removed)
+ qib_user_sdma_hwqueue_clean(ppd);
+ /* if I have complete packets to be freed */
+ if (pq->num_sending)
+ qib_user_sdma_queue_clean(ppd, pq);
+
+ while (dim) {
+ int mxp = 1;
+ int ndesc = 0;
+
+ ret = qib_user_sdma_queue_pkts(dd, ppd, pq,
+ iov, dim, &list, &mxp, &ndesc);
+ if (ret < 0)
+ goto done_unlock;
+ else {
+ dim -= ret;
+ iov += ret;
+ }
+
+ /* force packets onto the sdma hw queue... */
+ if (!list_empty(&list)) {
+ /*
+ * Lazily clean hw queue.
+ */
+ if (qib_sdma_descq_freecnt(ppd) < ndesc) {
+ qib_user_sdma_hwqueue_clean(ppd);
+ if (pq->num_sending)
+ qib_user_sdma_queue_clean(ppd, pq);
+ }
+
+ ret = qib_user_sdma_push_pkts(ppd, pq, &list, mxp);
+ if (ret < 0)
+ goto done_unlock;
+ else {
+ npkts += mxp;
+ pq->counter += mxp;
+ }
+ }
+ }
+
+done_unlock:
+ if (!list_empty(&list))
+ qib_user_sdma_free_pkt_list(&dd->pcidev->dev, pq, &list);
+ mutex_unlock(&pq->lock);
+
+ return (ret < 0) ? ret : npkts;
+}
+
+int qib_user_sdma_make_progress(struct qib_pportdata *ppd,
+ struct qib_user_sdma_queue *pq)
+{
+ int ret = 0;
+
+ mutex_lock(&pq->lock);
+ qib_user_sdma_hwqueue_clean(ppd);
+ ret = qib_user_sdma_queue_clean(ppd, pq);
+ mutex_unlock(&pq->lock);
+
+ return ret;
+}
+
+u32 qib_user_sdma_complete_counter(const struct qib_user_sdma_queue *pq)
+{
+ return pq ? pq->sent_counter : 0;
+}
+
+u32 qib_user_sdma_inflight_counter(struct qib_user_sdma_queue *pq)
+{
+ return pq ? pq->counter : 0;
+}
diff --git a/drivers/infiniband/hw/qib/qib_user_sdma.h b/drivers/infiniband/hw/qib/qib_user_sdma.h
new file mode 100644
index 000000000..ce8cbaf6a
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_user_sdma.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2007, 2008 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/device.h>
+
+struct qib_user_sdma_queue;
+
+struct qib_user_sdma_queue *
+qib_user_sdma_queue_create(struct device *dev, int unit, int port, int sport);
+void qib_user_sdma_queue_destroy(struct qib_user_sdma_queue *pq);
+
+int qib_user_sdma_writev(struct qib_ctxtdata *pd,
+ struct qib_user_sdma_queue *pq,
+ const struct iovec *iov,
+ unsigned long dim);
+
+int qib_user_sdma_make_progress(struct qib_pportdata *ppd,
+ struct qib_user_sdma_queue *pq);
+
+void qib_user_sdma_queue_drain(struct qib_pportdata *ppd,
+ struct qib_user_sdma_queue *pq);
+
+u32 qib_user_sdma_complete_counter(const struct qib_user_sdma_queue *pq);
+u32 qib_user_sdma_inflight_counter(struct qib_user_sdma_queue *pq);
diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c
new file mode 100644
index 000000000..4a3599890
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_verbs.c
@@ -0,0 +1,2339 @@
+/*
+ * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_mad.h>
+#include <rdma/ib_user_verbs.h>
+#include <linux/io.h>
+#include <linux/module.h>
+#include <linux/utsname.h>
+#include <linux/rculist.h>
+#include <linux/mm.h>
+#include <linux/random.h>
+
+#include "qib.h"
+#include "qib_common.h"
+
+static unsigned int ib_qib_qp_table_size = 256;
+module_param_named(qp_table_size, ib_qib_qp_table_size, uint, S_IRUGO);
+MODULE_PARM_DESC(qp_table_size, "QP table size");
+
+unsigned int ib_qib_lkey_table_size = 16;
+module_param_named(lkey_table_size, ib_qib_lkey_table_size, uint,
+ S_IRUGO);
+MODULE_PARM_DESC(lkey_table_size,
+ "LKEY table size in bits (2^n, 1 <= n <= 23)");
+
+static unsigned int ib_qib_max_pds = 0xFFFF;
+module_param_named(max_pds, ib_qib_max_pds, uint, S_IRUGO);
+MODULE_PARM_DESC(max_pds,
+ "Maximum number of protection domains to support");
+
+static unsigned int ib_qib_max_ahs = 0xFFFF;
+module_param_named(max_ahs, ib_qib_max_ahs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_ahs, "Maximum number of address handles to support");
+
+unsigned int ib_qib_max_cqes = 0x2FFFF;
+module_param_named(max_cqes, ib_qib_max_cqes, uint, S_IRUGO);
+MODULE_PARM_DESC(max_cqes,
+ "Maximum number of completion queue entries to support");
+
+unsigned int ib_qib_max_cqs = 0x1FFFF;
+module_param_named(max_cqs, ib_qib_max_cqs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_cqs, "Maximum number of completion queues to support");
+
+unsigned int ib_qib_max_qp_wrs = 0x3FFF;
+module_param_named(max_qp_wrs, ib_qib_max_qp_wrs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_qp_wrs, "Maximum number of QP WRs to support");
+
+unsigned int ib_qib_max_qps = 16384;
+module_param_named(max_qps, ib_qib_max_qps, uint, S_IRUGO);
+MODULE_PARM_DESC(max_qps, "Maximum number of QPs to support");
+
+unsigned int ib_qib_max_sges = 0x60;
+module_param_named(max_sges, ib_qib_max_sges, uint, S_IRUGO);
+MODULE_PARM_DESC(max_sges, "Maximum number of SGEs to support");
+
+unsigned int ib_qib_max_mcast_grps = 16384;
+module_param_named(max_mcast_grps, ib_qib_max_mcast_grps, uint, S_IRUGO);
+MODULE_PARM_DESC(max_mcast_grps,
+ "Maximum number of multicast groups to support");
+
+unsigned int ib_qib_max_mcast_qp_attached = 16;
+module_param_named(max_mcast_qp_attached, ib_qib_max_mcast_qp_attached,
+ uint, S_IRUGO);
+MODULE_PARM_DESC(max_mcast_qp_attached,
+ "Maximum number of attached QPs to support");
+
+unsigned int ib_qib_max_srqs = 1024;
+module_param_named(max_srqs, ib_qib_max_srqs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_srqs, "Maximum number of SRQs to support");
+
+unsigned int ib_qib_max_srq_sges = 128;
+module_param_named(max_srq_sges, ib_qib_max_srq_sges, uint, S_IRUGO);
+MODULE_PARM_DESC(max_srq_sges, "Maximum number of SRQ SGEs to support");
+
+unsigned int ib_qib_max_srq_wrs = 0x1FFFF;
+module_param_named(max_srq_wrs, ib_qib_max_srq_wrs, uint, S_IRUGO);
+MODULE_PARM_DESC(max_srq_wrs, "Maximum number of SRQ WRs support");
+
+static unsigned int ib_qib_disable_sma;
+module_param_named(disable_sma, ib_qib_disable_sma, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(disable_sma, "Disable the SMA");
+
+/*
+ * Note that it is OK to post send work requests in the SQE and ERR
+ * states; qib_do_send() will process them and generate error
+ * completions as per IB 1.2 C10-96.
+ */
+const int ib_qib_state_ops[IB_QPS_ERR + 1] = {
+ [IB_QPS_RESET] = 0,
+ [IB_QPS_INIT] = QIB_POST_RECV_OK,
+ [IB_QPS_RTR] = QIB_POST_RECV_OK | QIB_PROCESS_RECV_OK,
+ [IB_QPS_RTS] = QIB_POST_RECV_OK | QIB_PROCESS_RECV_OK |
+ QIB_POST_SEND_OK | QIB_PROCESS_SEND_OK |
+ QIB_PROCESS_NEXT_SEND_OK,
+ [IB_QPS_SQD] = QIB_POST_RECV_OK | QIB_PROCESS_RECV_OK |
+ QIB_POST_SEND_OK | QIB_PROCESS_SEND_OK,
+ [IB_QPS_SQE] = QIB_POST_RECV_OK | QIB_PROCESS_RECV_OK |
+ QIB_POST_SEND_OK | QIB_FLUSH_SEND,
+ [IB_QPS_ERR] = QIB_POST_RECV_OK | QIB_FLUSH_RECV |
+ QIB_POST_SEND_OK | QIB_FLUSH_SEND,
+};
+
+struct qib_ucontext {
+ struct ib_ucontext ibucontext;
+};
+
+static inline struct qib_ucontext *to_iucontext(struct ib_ucontext
+ *ibucontext)
+{
+ return container_of(ibucontext, struct qib_ucontext, ibucontext);
+}
+
+/*
+ * Translate ib_wr_opcode into ib_wc_opcode.
+ */
+const enum ib_wc_opcode ib_qib_wc_opcode[] = {
+ [IB_WR_RDMA_WRITE] = IB_WC_RDMA_WRITE,
+ [IB_WR_RDMA_WRITE_WITH_IMM] = IB_WC_RDMA_WRITE,
+ [IB_WR_SEND] = IB_WC_SEND,
+ [IB_WR_SEND_WITH_IMM] = IB_WC_SEND,
+ [IB_WR_RDMA_READ] = IB_WC_RDMA_READ,
+ [IB_WR_ATOMIC_CMP_AND_SWP] = IB_WC_COMP_SWAP,
+ [IB_WR_ATOMIC_FETCH_AND_ADD] = IB_WC_FETCH_ADD
+};
+
+/*
+ * System image GUID.
+ */
+__be64 ib_qib_sys_image_guid;
+
+/**
+ * qib_copy_sge - copy data to SGE memory
+ * @ss: the SGE state
+ * @data: the data to copy
+ * @length: the length of the data
+ */
+void qib_copy_sge(struct qib_sge_state *ss, void *data, u32 length, int release)
+{
+ struct qib_sge *sge = &ss->sge;
+
+ while (length) {
+ u32 len = sge->length;
+
+ if (len > length)
+ len = length;
+ if (len > sge->sge_length)
+ len = sge->sge_length;
+ BUG_ON(len == 0);
+ memcpy(sge->vaddr, data, len);
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (release)
+ qib_put_mr(sge->mr);
+ if (--ss->num_sge)
+ *sge = *ss->sg_list++;
+ } else if (sge->length == 0 && sge->mr->lkey) {
+ if (++sge->n >= QIB_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr =
+ sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length =
+ sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ data += len;
+ length -= len;
+ }
+}
+
+/**
+ * qib_skip_sge - skip over SGE memory - XXX almost dup of prev func
+ * @ss: the SGE state
+ * @length: the number of bytes to skip
+ */
+void qib_skip_sge(struct qib_sge_state *ss, u32 length, int release)
+{
+ struct qib_sge *sge = &ss->sge;
+
+ while (length) {
+ u32 len = sge->length;
+
+ if (len > length)
+ len = length;
+ if (len > sge->sge_length)
+ len = sge->sge_length;
+ BUG_ON(len == 0);
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (release)
+ qib_put_mr(sge->mr);
+ if (--ss->num_sge)
+ *sge = *ss->sg_list++;
+ } else if (sge->length == 0 && sge->mr->lkey) {
+ if (++sge->n >= QIB_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr =
+ sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length =
+ sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ length -= len;
+ }
+}
+
+/*
+ * Count the number of DMA descriptors needed to send length bytes of data.
+ * Don't modify the qib_sge_state to get the count.
+ * Return zero if any of the segments is not aligned.
+ */
+static u32 qib_count_sge(struct qib_sge_state *ss, u32 length)
+{
+ struct qib_sge *sg_list = ss->sg_list;
+ struct qib_sge sge = ss->sge;
+ u8 num_sge = ss->num_sge;
+ u32 ndesc = 1; /* count the header */
+
+ while (length) {
+ u32 len = sge.length;
+
+ if (len > length)
+ len = length;
+ if (len > sge.sge_length)
+ len = sge.sge_length;
+ BUG_ON(len == 0);
+ if (((long) sge.vaddr & (sizeof(u32) - 1)) ||
+ (len != length && (len & (sizeof(u32) - 1)))) {
+ ndesc = 0;
+ break;
+ }
+ ndesc++;
+ sge.vaddr += len;
+ sge.length -= len;
+ sge.sge_length -= len;
+ if (sge.sge_length == 0) {
+ if (--num_sge)
+ sge = *sg_list++;
+ } else if (sge.length == 0 && sge.mr->lkey) {
+ if (++sge.n >= QIB_SEGSZ) {
+ if (++sge.m >= sge.mr->mapsz)
+ break;
+ sge.n = 0;
+ }
+ sge.vaddr =
+ sge.mr->map[sge.m]->segs[sge.n].vaddr;
+ sge.length =
+ sge.mr->map[sge.m]->segs[sge.n].length;
+ }
+ length -= len;
+ }
+ return ndesc;
+}
+
+/*
+ * Copy from the SGEs to the data buffer.
+ */
+static void qib_copy_from_sge(void *data, struct qib_sge_state *ss, u32 length)
+{
+ struct qib_sge *sge = &ss->sge;
+
+ while (length) {
+ u32 len = sge->length;
+
+ if (len > length)
+ len = length;
+ if (len > sge->sge_length)
+ len = sge->sge_length;
+ BUG_ON(len == 0);
+ memcpy(data, sge->vaddr, len);
+ sge->vaddr += len;
+ sge->length -= len;
+ sge->sge_length -= len;
+ if (sge->sge_length == 0) {
+ if (--ss->num_sge)
+ *sge = *ss->sg_list++;
+ } else if (sge->length == 0 && sge->mr->lkey) {
+ if (++sge->n >= QIB_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ break;
+ sge->n = 0;
+ }
+ sge->vaddr =
+ sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length =
+ sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+ data += len;
+ length -= len;
+ }
+}
+
+/**
+ * qib_post_one_send - post one RC, UC, or UD send work request
+ * @qp: the QP to post on
+ * @wr: the work request to send
+ */
+static int qib_post_one_send(struct qib_qp *qp, struct ib_send_wr *wr,
+ int *scheduled)
+{
+ struct qib_swqe *wqe;
+ u32 next;
+ int i;
+ int j;
+ int acc;
+ int ret;
+ unsigned long flags;
+ struct qib_lkey_table *rkt;
+ struct qib_pd *pd;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+
+ /* Check that state is OK to post send. */
+ if (unlikely(!(ib_qib_state_ops[qp->state] & QIB_POST_SEND_OK)))
+ goto bail_inval;
+
+ /* IB spec says that num_sge == 0 is OK. */
+ if (wr->num_sge > qp->s_max_sge)
+ goto bail_inval;
+
+ /*
+ * Don't allow RDMA reads or atomic operations on UC QPs, or
+ * undefined opcodes on any QP type.
+ * For atomics, make sure the buffer is large enough and properly
+ * aligned to hold the result.
+ */
+ if (wr->opcode == IB_WR_FAST_REG_MR) {
+ if (qib_fast_reg_mr(qp, wr))
+ goto bail_inval;
+ } else if (qp->ibqp.qp_type == IB_QPT_UC) {
+ if ((unsigned) wr->opcode >= IB_WR_RDMA_READ)
+ goto bail_inval;
+ } else if (qp->ibqp.qp_type != IB_QPT_RC) {
+ /* Check IB_QPT_SMI, IB_QPT_GSI, IB_QPT_UD opcode */
+ if (wr->opcode != IB_WR_SEND &&
+ wr->opcode != IB_WR_SEND_WITH_IMM)
+ goto bail_inval;
+ /* Check UD destination address PD */
+ if (qp->ibqp.pd != wr->wr.ud.ah->pd)
+ goto bail_inval;
+ } else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD)
+ goto bail_inval;
+ else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP &&
+ (wr->num_sge == 0 ||
+ wr->sg_list[0].length < sizeof(u64) ||
+ wr->sg_list[0].addr & (sizeof(u64) - 1)))
+ goto bail_inval;
+ else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic)
+ goto bail_inval;
+
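+ /*
+ * The send queue is a ring; one slot is always left unused so a
+ * full ring (next == s_last) can be distinguished from an empty one.
+ */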
+ next = qp->s_head + 1;
+ if (next >= qp->s_size)
+ next = 0;
+ if (next == qp->s_last) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ rkt = &to_idev(qp->ibqp.device)->lk_table;
+ pd = to_ipd(qp->ibqp.pd);
+ wqe = get_swqe_ptr(qp, qp->s_head);
+ wqe->wr = *wr;
+ wqe->length = 0;
+ j = 0;
+ if (wr->num_sge) {
+ acc = wr->opcode >= IB_WR_RDMA_READ ?
+ IB_ACCESS_LOCAL_WRITE : 0;
+ for (i = 0; i < wr->num_sge; i++) {
+ u32 length = wr->sg_list[i].length;
+ int ok;
+
+ if (length == 0)
+ continue;
+ ok = qib_lkey_ok(rkt, pd, &wqe->sg_list[j],
+ &wr->sg_list[i], acc);
+ if (!ok)
+ goto bail_inval_free;
+ wqe->length += length;
+ j++;
+ }
+ wqe->wr.num_sge = j;
+ }
+ if (qp->ibqp.qp_type == IB_QPT_UC ||
+ qp->ibqp.qp_type == IB_QPT_RC) {
+ if (wqe->length > 0x80000000U)
+ goto bail_inval_free;
+ } else if (wqe->length > (dd_from_ibdev(qp->ibqp.device)->pport +
+ qp->port_num - 1)->ibmtu)
+ goto bail_inval_free;
+ else
+ atomic_inc(&to_iah(wr->wr.ud.ah)->refcount);
+ wqe->ssn = qp->s_ssn++;
+ qp->s_head = next;
+
+ ret = 0;
+ goto bail;
+
+bail_inval_free:
+ while (j) {
+ struct qib_sge *sge = &wqe->sg_list[--j];
+
+ qib_put_mr(sge->mr);
+ }
+bail_inval:
+ ret = -EINVAL;
+bail:
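+ /*
+ * If this was the last WR in the chain and the SDMA engine is
+ * already busy, hand the work to the send engine now; otherwise
+ * the caller will try to do the send in its own context.
+ */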
+ if (!ret && !wr->next &&
+ !qib_sdma_empty(
+ dd_from_ibdev(qp->ibqp.device)->pport + qp->port_num - 1)) {
+ qib_schedule_send(qp);
+ *scheduled = 1;
+ }
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return ret;
+}
+
+/**
+ * qib_post_send - post a send on a QP
+ * @ibqp: the QP to post the send on
+ * @wr: the list of work requests to post
+ * @bad_wr: the first bad WR is put here
+ *
+ * This may be called from interrupt context.
+ */
+static int qib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr)
+{
+ struct qib_qp *qp = to_iqp(ibqp);
+ int err = 0;
+ int scheduled = 0;
+
+ for (; wr; wr = wr->next) {
+ err = qib_post_one_send(qp, wr, &scheduled);
+ if (err) {
+ *bad_wr = wr;
+ goto bail;
+ }
+ }
+
+ /* Try to do the send work in the caller's context. */
+ if (!scheduled)
+ qib_do_send(&qp->s_work);
+
+bail:
+ return err;
+}
+
+/**
+ * qib_post_receive - post a receive on a QP
+ * @ibqp: the QP to post the receive on
+ * @wr: the WR to post
+ * @bad_wr: the first bad WR is put here
+ *
+ * This may be called from interrupt context.
+ */
+static int qib_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ struct qib_qp *qp = to_iqp(ibqp);
+ struct qib_rwq *wq = qp->r_rq.wq;
+ unsigned long flags;
+ int ret;
+
+ /* Check that state is OK to post receive. */
+ if (!(ib_qib_state_ops[qp->state] & QIB_POST_RECV_OK) || !wq) {
+ *bad_wr = wr;
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ for (; wr; wr = wr->next) {
+ struct qib_rwqe *wqe;
+ u32 next;
+ int i;
+
+ if ((unsigned) wr->num_sge > qp->r_rq.max_sge) {
+ *bad_wr = wr;
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ spin_lock_irqsave(&qp->r_rq.lock, flags);
+ next = wq->head + 1;
+ if (next >= qp->r_rq.size)
+ next = 0;
+ if (next == wq->tail) {
+ spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+ *bad_wr = wr;
+ ret = -ENOMEM;
+ goto bail;
+ }
+
+ wqe = get_rwqe_ptr(&qp->r_rq, wq->head);
+ wqe->wr_id = wr->wr_id;
+ wqe->num_sge = wr->num_sge;
+ for (i = 0; i < wr->num_sge; i++)
+ wqe->sg_list[i] = wr->sg_list[i];
+ /* Make sure queue entry is written before the head index. */
+ smp_wmb();
+ wq->head = next;
+ spin_unlock_irqrestore(&qp->r_rq.lock, flags);
+ }
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * qib_qp_rcv - process an incoming packet on a QP
+ * @rcd: the context pointer
+ * @hdr: the packet header
+ * @has_grh: true if the packet has a GRH
+ * @data: the packet data
+ * @tlen: the packet length
+ * @qp: the QP the packet came on
+ *
+ * This is called from qib_ib_rcv() to process an incoming packet
+ * for the given QP.
+ * Called at interrupt level.
+ */
+static void qib_qp_rcv(struct qib_ctxtdata *rcd, struct qib_ib_header *hdr,
+ int has_grh, void *data, u32 tlen, struct qib_qp *qp)
+{
+ struct qib_ibport *ibp = &rcd->ppd->ibport_data;
+
+ spin_lock(&qp->r_lock);
+
+ /* Check for valid receive state. */
+ if (!(ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK)) {
+ ibp->n_pkt_drops++;
+ goto unlock;
+ }
+
+ switch (qp->ibqp.qp_type) {
+ case IB_QPT_SMI:
+ case IB_QPT_GSI:
+ if (ib_qib_disable_sma)
+ break;
+ /* FALLTHROUGH */
+ case IB_QPT_UD:
+ qib_ud_rcv(ibp, hdr, has_grh, data, tlen, qp);
+ break;
+
+ case IB_QPT_RC:
+ qib_rc_rcv(rcd, hdr, has_grh, data, tlen, qp);
+ break;
+
+ case IB_QPT_UC:
+ qib_uc_rcv(ibp, hdr, has_grh, data, tlen, qp);
+ break;
+
+ default:
+ break;
+ }
+
+unlock:
+ spin_unlock(&qp->r_lock);
+}
+
+/**
+ * qib_ib_rcv - process an incoming packet
+ * @rcd: the context pointer
+ * @rhdr: the header of the packet
+ * @data: the packet payload
+ * @tlen: the packet length
+ *
+ * This is called from qib_kreceive() to process an incoming packet at
+ * interrupt level. Tlen is the length of the header + data + CRC in bytes.
+ */
+void qib_ib_rcv(struct qib_ctxtdata *rcd, void *rhdr, void *data, u32 tlen)
+{
+ struct qib_pportdata *ppd = rcd->ppd;
+ struct qib_ibport *ibp = &ppd->ibport_data;
+ struct qib_ib_header *hdr = rhdr;
+ struct qib_other_headers *ohdr;
+ struct qib_qp *qp;
+ u32 qp_num;
+ int lnh;
+ u8 opcode;
+ u16 lid;
+
+ /* 24 == LRH+BTH+CRC */
+ if (unlikely(tlen < 24))
+ goto drop;
+
+ /* Check for a valid destination LID (see ch. 7.11.1). */
+ lid = be16_to_cpu(hdr->lrh[1]);
+ if (lid < QIB_MULTICAST_LID_BASE) {
+ lid &= ~((1 << ppd->lmc) - 1);
+ if (unlikely(lid != ppd->lid))
+ goto drop;
+ }
+
+ /* Check for GRH */
+ lnh = be16_to_cpu(hdr->lrh[0]) & 3;
+ if (lnh == QIB_LRH_BTH)
+ ohdr = &hdr->u.oth;
+ else if (lnh == QIB_LRH_GRH) {
+ u32 vtf;
+
+ ohdr = &hdr->u.l.oth;
+ if (hdr->u.l.grh.next_hdr != IB_GRH_NEXT_HDR)
+ goto drop;
+ vtf = be32_to_cpu(hdr->u.l.grh.version_tclass_flow);
+ if ((vtf >> IB_GRH_VERSION_SHIFT) != IB_GRH_VERSION)
+ goto drop;
+ } else
+ goto drop;
+
+ opcode = (be32_to_cpu(ohdr->bth[0]) >> 24) & 0x7f;
+#ifdef CONFIG_DEBUG_FS
+ rcd->opstats->stats[opcode].n_bytes += tlen;
+ rcd->opstats->stats[opcode].n_packets++;
+#endif
+
+ /* Get the destination QP number. */
+ qp_num = be32_to_cpu(ohdr->bth[1]) & QIB_QPN_MASK;
+ if (qp_num == QIB_MULTICAST_QPN) {
+ struct qib_mcast *mcast;
+ struct qib_mcast_qp *p;
+
+ if (lnh != QIB_LRH_GRH)
+ goto drop;
+ mcast = qib_mcast_find(ibp, &hdr->u.l.grh.dgid);
+ if (mcast == NULL)
+ goto drop;
+ this_cpu_inc(ibp->pmastats->n_multicast_rcv);
+ list_for_each_entry_rcu(p, &mcast->qp_list, list)
+ qib_qp_rcv(rcd, hdr, 1, data, tlen, p->qp);
+ /*
+ * Notify qib_multicast_detach() if it is waiting for us
+ * to finish.
+ */
+ if (atomic_dec_return(&mcast->refcount) <= 1)
+ wake_up(&mcast->wait);
+ } else {
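+ /*
+ * Unicast: cache the last QP looked up (the "lookaside" QP) so
+ * back-to-back packets for the same QP avoid a QPN table lookup.
+ */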
+ if (rcd->lookaside_qp) {
+ if (rcd->lookaside_qpn != qp_num) {
+ if (atomic_dec_and_test(
+ &rcd->lookaside_qp->refcount))
+ wake_up(
+ &rcd->lookaside_qp->wait);
+ rcd->lookaside_qp = NULL;
+ }
+ }
+ if (!rcd->lookaside_qp) {
+ qp = qib_lookup_qpn(ibp, qp_num);
+ if (!qp)
+ goto drop;
+ rcd->lookaside_qp = qp;
+ rcd->lookaside_qpn = qp_num;
+ } else
+ qp = rcd->lookaside_qp;
+ this_cpu_inc(ibp->pmastats->n_unicast_rcv);
+ qib_qp_rcv(rcd, hdr, lnh == QIB_LRH_GRH, data, tlen, qp);
+ }
+ return;
+
+drop:
+ ibp->n_pkt_drops++;
+}
+
+/*
+ * This is called from a timer to check for QPs
+ * which need kernel memory in order to send a packet.
+ */
+static void mem_timer(unsigned long data)
+{
+ struct qib_ibdev *dev = (struct qib_ibdev *) data;
+ struct list_head *list = &dev->memwait;
+ struct qib_qp *qp = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->pending_lock, flags);
+ if (!list_empty(list)) {
+ qp = list_entry(list->next, struct qib_qp, iowait);
+ list_del_init(&qp->iowait);
+ atomic_inc(&qp->refcount);
+ if (!list_empty(list))
+ mod_timer(&dev->mem_timer, jiffies + 1);
+ }
+ spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+ if (qp) {
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (qp->s_flags & QIB_S_WAIT_KMEM) {
+ qp->s_flags &= ~QIB_S_WAIT_KMEM;
+ qib_schedule_send(qp);
+ }
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ }
+}
+
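+/*
+ * Advance the SGE state by length bytes.  The caller guarantees that
+ * length does not extend past the current SGE, so only the bookkeeping
+ * for stepping to the next SGE or MR segment is done here.
+ */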
+static void update_sge(struct qib_sge_state *ss, u32 length)
+{
+ struct qib_sge *sge = &ss->sge;
+
+ sge->vaddr += length;
+ sge->length -= length;
+ sge->sge_length -= length;
+ if (sge->sge_length == 0) {
+ if (--ss->num_sge)
+ *sge = *ss->sg_list++;
+ } else if (sge->length == 0 && sge->mr->lkey) {
+ if (++sge->n >= QIB_SEGSZ) {
+ if (++sge->m >= sge->mr->mapsz)
+ return;
+ sge->n = 0;
+ }
+ sge->vaddr = sge->mr->map[sge->m]->segs[sge->n].vaddr;
+ sge->length = sge->mr->map[sge->m]->segs[sge->n].length;
+ }
+}
+
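+/*
+ * Helpers for assembling dwords from byte-unaligned source data.  The
+ * little- and big-endian variants shift in opposite directions so that
+ * the bytes end up in the same wire order when written to the PIO buffer.
+ */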
+#ifdef __LITTLE_ENDIAN
+static inline u32 get_upper_bits(u32 data, u32 shift)
+{
+ return data >> shift;
+}
+
+static inline u32 set_upper_bits(u32 data, u32 shift)
+{
+ return data << shift;
+}
+
+static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
+{
+ data <<= ((sizeof(u32) - n) * BITS_PER_BYTE);
+ data >>= ((sizeof(u32) - n - off) * BITS_PER_BYTE);
+ return data;
+}
+#else
+static inline u32 get_upper_bits(u32 data, u32 shift)
+{
+ return data << shift;
+}
+
+static inline u32 set_upper_bits(u32 data, u32 shift)
+{
+ return data >> shift;
+}
+
+static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
+{
+ data >>= ((sizeof(u32) - n) * BITS_PER_BYTE);
+ data <<= ((sizeof(u32) - n - off) * BITS_PER_BYTE);
+ return data;
+}
+#endif
+
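+/*
+ * Copy payload from the SGE list to a PIO buffer a dword at a time,
+ * handling byte-unaligned source addresses.  The last dword is kept in
+ * 'last' and written after the optional write-combining flush so that
+ * the trigger word is the final store.
+ */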
+static void copy_io(u32 __iomem *piobuf, struct qib_sge_state *ss,
+ u32 length, unsigned flush_wc)
+{
+ u32 extra = 0;
+ u32 data = 0;
+ u32 last;
+
+ while (1) {
+ u32 len = ss->sge.length;
+ u32 off;
+
+ if (len > length)
+ len = length;
+ if (len > ss->sge.sge_length)
+ len = ss->sge.sge_length;
+ BUG_ON(len == 0);
+ /* If the source address is not aligned, try to align it. */
+ off = (unsigned long)ss->sge.vaddr & (sizeof(u32) - 1);
+ if (off) {
+ u32 *addr = (u32 *)((unsigned long)ss->sge.vaddr &
+ ~(sizeof(u32) - 1));
+ u32 v = get_upper_bits(*addr, off * BITS_PER_BYTE);
+ u32 y;
+
+ y = sizeof(u32) - off;
+ if (len > y)
+ len = y;
+ if (len + extra >= sizeof(u32)) {
+ data |= set_upper_bits(v, extra *
+ BITS_PER_BYTE);
+ len = sizeof(u32) - extra;
+ if (len == length) {
+ last = data;
+ break;
+ }
+ __raw_writel(data, piobuf);
+ piobuf++;
+ extra = 0;
+ data = 0;
+ } else {
+ /* Clear unused upper bytes */
+ data |= clear_upper_bytes(v, len, extra);
+ if (len == length) {
+ last = data;
+ break;
+ }
+ extra += len;
+ }
+ } else if (extra) {
+ /* Source address is aligned. */
+ u32 *addr = (u32 *) ss->sge.vaddr;
+ int shift = extra * BITS_PER_BYTE;
+ int ushift = 32 - shift;
+ u32 l = len;
+
+ while (l >= sizeof(u32)) {
+ u32 v = *addr;
+
+ data |= set_upper_bits(v, shift);
+ __raw_writel(data, piobuf);
+ data = get_upper_bits(v, ushift);
+ piobuf++;
+ addr++;
+ l -= sizeof(u32);
+ }
+ /*
+ * A partial dword (l bytes) is left in this SGE, in addition to
+ * the 'extra' bytes already buffered in 'data'.
+ */
+ if (l) {
+ u32 v = *addr;
+
+ if (l + extra >= sizeof(u32)) {
+ data |= set_upper_bits(v, shift);
+ len -= l + extra - sizeof(u32);
+ if (len == length) {
+ last = data;
+ break;
+ }
+ __raw_writel(data, piobuf);
+ piobuf++;
+ extra = 0;
+ data = 0;
+ } else {
+ /* Clear unused upper bytes */
+ data |= clear_upper_bytes(v, l, extra);
+ if (len == length) {
+ last = data;
+ break;
+ }
+ extra += l;
+ }
+ } else if (len == length) {
+ last = data;
+ break;
+ }
+ } else if (len == length) {
+ u32 w;
+
+ /*
+ * Need to round up for the last dword in the
+ * packet.
+ */
+ w = (len + 3) >> 2;
+ qib_pio_copy(piobuf, ss->sge.vaddr, w - 1);
+ piobuf += w - 1;
+ last = ((u32 *) ss->sge.vaddr)[w - 1];
+ break;
+ } else {
+ u32 w = len >> 2;
+
+ qib_pio_copy(piobuf, ss->sge.vaddr, w);
+ piobuf += w;
+
+ extra = len & (sizeof(u32) - 1);
+ if (extra) {
+ u32 v = ((u32 *) ss->sge.vaddr)[w];
+
+ /* Clear unused upper bytes */
+ data = clear_upper_bytes(v, extra, 0);
+ }
+ }
+ update_sge(ss, len);
+ length -= len;
+ }
+ /* Update address before sending packet. */
+ update_sge(ss, length);
+ if (flush_wc) {
+ /* must flush everything written so far before the trigger word */
+ qib_flush_wc();
+ __raw_writel(last, piobuf);
+ /* be sure trigger word is written */
+ qib_flush_wc();
+ } else
+ __raw_writel(last, piobuf);
+}
+
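+/*
+ * Slow path for allocating a verbs tx request: take qp->s_lock as well
+ * as the pending lock and recheck the free list.  If it is still empty,
+ * put the QP on the txwait list and return ERR_PTR(-EBUSY) so the send
+ * is retried when a request is freed.
+ */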
+static noinline struct qib_verbs_txreq *__get_txreq(struct qib_ibdev *dev,
+ struct qib_qp *qp)
+{
+ struct qib_verbs_txreq *tx;
+ unsigned long flags;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ spin_lock(&dev->pending_lock);
+
+ if (!list_empty(&dev->txreq_free)) {
+ struct list_head *l = dev->txreq_free.next;
+
+ list_del(l);
+ spin_unlock(&dev->pending_lock);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ tx = list_entry(l, struct qib_verbs_txreq, txreq.list);
+ } else {
+ if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK &&
+ list_empty(&qp->iowait)) {
+ dev->n_txwait++;
+ qp->s_flags |= QIB_S_WAIT_TX;
+ list_add_tail(&qp->iowait, &dev->txwait);
+ }
+ qp->s_flags &= ~QIB_S_BUSY;
+ spin_unlock(&dev->pending_lock);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ tx = ERR_PTR(-EBUSY);
+ }
+ return tx;
+}
+
+static inline struct qib_verbs_txreq *get_txreq(struct qib_ibdev *dev,
+ struct qib_qp *qp)
+{
+ struct qib_verbs_txreq *tx;
+ unsigned long flags;
+
+ spin_lock_irqsave(&dev->pending_lock, flags);
+ /* fast path: the free list is usually non-empty */
+ if (likely(!list_empty(&dev->txreq_free))) {
+ struct list_head *l = dev->txreq_free.next;
+
+ list_del(l);
+ spin_unlock_irqrestore(&dev->pending_lock, flags);
+ tx = list_entry(l, struct qib_verbs_txreq, txreq.list);
+ } else {
+ /* call slow path to get the extra lock */
+ spin_unlock_irqrestore(&dev->pending_lock, flags);
+ tx = __get_txreq(dev, qp);
+ }
+ return tx;
+}
+
+void qib_put_txreq(struct qib_verbs_txreq *tx)
+{
+ struct qib_ibdev *dev;
+ struct qib_qp *qp;
+ unsigned long flags;
+
+ qp = tx->qp;
+ dev = to_idev(qp->ibqp.device);
+
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ if (tx->mr) {
+ qib_put_mr(tx->mr);
+ tx->mr = NULL;
+ }
+ if (tx->txreq.flags & QIB_SDMA_TXREQ_F_FREEBUF) {
+ tx->txreq.flags &= ~QIB_SDMA_TXREQ_F_FREEBUF;
+ dma_unmap_single(&dd_from_dev(dev)->pcidev->dev,
+ tx->txreq.addr, tx->hdr_dwords << 2,
+ DMA_TO_DEVICE);
+ kfree(tx->align_buf);
+ }
+
+ spin_lock_irqsave(&dev->pending_lock, flags);
+
+ /* Put struct back on free list */
+ list_add(&tx->txreq.list, &dev->txreq_free);
+
+ if (!list_empty(&dev->txwait)) {
+ /* Wake up first QP wanting a free struct */
+ qp = list_entry(dev->txwait.next, struct qib_qp, iowait);
+ list_del_init(&qp->iowait);
+ atomic_inc(&qp->refcount);
+ spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (qp->s_flags & QIB_S_WAIT_TX) {
+ qp->s_flags &= ~QIB_S_WAIT_TX;
+ qib_schedule_send(qp);
+ }
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ } else
+ spin_unlock_irqrestore(&dev->pending_lock, flags);
+}
+
+/*
+ * This is called when there are send DMA descriptors that might be
+ * available.
+ *
+ * This is called with ppd->sdma_lock held.
+ */
+void qib_verbs_sdma_desc_avail(struct qib_pportdata *ppd, unsigned avail)
+{
+ struct qib_qp *qp, *nqp;
+ struct qib_qp *qps[20];
+ struct qib_ibdev *dev;
+ unsigned i, n;
+
+ n = 0;
+ dev = &ppd->dd->verbs_dev;
+ spin_lock(&dev->pending_lock);
+
+ /* Search wait list for first QP wanting DMA descriptors. */
+ list_for_each_entry_safe(qp, nqp, &dev->dmawait, iowait) {
+ if (qp->port_num != ppd->port)
+ continue;
+ if (n == ARRAY_SIZE(qps))
+ break;
+ if (qp->s_tx->txreq.sg_count > avail)
+ break;
+ avail -= qp->s_tx->txreq.sg_count;
+ list_del_init(&qp->iowait);
+ atomic_inc(&qp->refcount);
+ qps[n++] = qp;
+ }
+
+ spin_unlock(&dev->pending_lock);
+
+ for (i = 0; i < n; i++) {
+ qp = qps[i];
+ spin_lock(&qp->s_lock);
+ if (qp->s_flags & QIB_S_WAIT_DMA_DESC) {
+ qp->s_flags &= ~QIB_S_WAIT_DMA_DESC;
+ qib_schedule_send(qp);
+ }
+ spin_unlock(&qp->s_lock);
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ }
+}
+
+/*
+ * This is called with ppd->sdma_lock held.
+ */
+static void sdma_complete(struct qib_sdma_txreq *cookie, int status)
+{
+ struct qib_verbs_txreq *tx =
+ container_of(cookie, struct qib_verbs_txreq, txreq);
+ struct qib_qp *qp = tx->qp;
+
+ spin_lock(&qp->s_lock);
+ if (tx->wqe)
+ qib_send_complete(qp, tx->wqe, IB_WC_SUCCESS);
+ else if (qp->ibqp.qp_type == IB_QPT_RC) {
+ struct qib_ib_header *hdr;
+
+ if (tx->txreq.flags & QIB_SDMA_TXREQ_F_FREEBUF)
+ hdr = &tx->align_buf->hdr;
+ else {
+ struct qib_ibdev *dev = to_idev(qp->ibqp.device);
+
+ hdr = &dev->pio_hdrs[tx->hdr_inx].hdr;
+ }
+ qib_rc_send_complete(qp, hdr);
+ }
+ if (atomic_dec_and_test(&qp->s_dma_busy)) {
+ if (qp->state == IB_QPS_RESET)
+ wake_up(&qp->wait_dma);
+ else if (qp->s_flags & QIB_S_WAIT_DMA) {
+ qp->s_flags &= ~QIB_S_WAIT_DMA;
+ qib_schedule_send(qp);
+ }
+ }
+ spin_unlock(&qp->s_lock);
+
+ qib_put_txreq(tx);
+}
+
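+/*
+ * No kernel memory was available for the tx buffer: unless the QP has
+ * been moved to an error state, put it on the memwait list (arming
+ * mem_timer if the list was empty) and return -EBUSY so the send is
+ * retried once memory frees up.
+ */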
+static int wait_kmem(struct qib_ibdev *dev, struct qib_qp *qp)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) {
+ spin_lock(&dev->pending_lock);
+ if (list_empty(&qp->iowait)) {
+ if (list_empty(&dev->memwait))
+ mod_timer(&dev->mem_timer, jiffies + 1);
+ qp->s_flags |= QIB_S_WAIT_KMEM;
+ list_add_tail(&qp->iowait, &dev->memwait);
+ }
+ spin_unlock(&dev->pending_lock);
+ qp->s_flags &= ~QIB_S_BUSY;
+ ret = -EBUSY;
+ }
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+ return ret;
+}
+
+static int qib_verbs_send_dma(struct qib_qp *qp, struct qib_ib_header *hdr,
+ u32 hdrwords, struct qib_sge_state *ss, u32 len,
+ u32 plen, u32 dwords)
+{
+ struct qib_ibdev *dev = to_idev(qp->ibqp.device);
+ struct qib_devdata *dd = dd_from_dev(dev);
+ struct qib_ibport *ibp = to_iport(qp->ibqp.device, qp->port_num);
+ struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+ struct qib_verbs_txreq *tx;
+ struct qib_pio_header *phdr;
+ u32 control;
+ u32 ndesc;
+ int ret;
+
+ tx = qp->s_tx;
+ if (tx) {
+ qp->s_tx = NULL;
+ /* resend previously constructed packet */
+ ret = qib_sdma_verbs_send(ppd, tx->ss, tx->dwords, tx);
+ goto bail;
+ }
+
+ tx = get_txreq(dev, qp);
+ if (IS_ERR(tx))
+ goto bail_tx;
+
+ control = dd->f_setpbc_control(ppd, plen, qp->s_srate,
+ be16_to_cpu(hdr->lrh[0]) >> 12);
+ tx->qp = qp;
+ atomic_inc(&qp->refcount);
+ tx->wqe = qp->s_wqe;
+ tx->mr = qp->s_rdma_mr;
+ if (qp->s_rdma_mr)
+ qp->s_rdma_mr = NULL;
+ tx->txreq.callback = sdma_complete;
+ if (dd->flags & QIB_HAS_SDMA_TIMEOUT)
+ tx->txreq.flags = QIB_SDMA_TXREQ_F_HEADTOHOST;
+ else
+ tx->txreq.flags = QIB_SDMA_TXREQ_F_INTREQ;
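+ /* Packets too large for a 2K send buffer must use the large buffers. */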
+ if (plen + 1 > dd->piosize2kmax_dwords)
+ tx->txreq.flags |= QIB_SDMA_TXREQ_F_USELARGEBUF;
+
+ if (len) {
+ /*
+ * Don't try to DMA if it takes more descriptors than
+ * the queue holds.
+ */
+ ndesc = qib_count_sge(ss, len);
+ if (ndesc >= ppd->sdma_descq_cnt)
+ ndesc = 0;
+ } else
+ ndesc = 1;
+ if (ndesc) {
+ phdr = &dev->pio_hdrs[tx->hdr_inx];
+ phdr->pbc[0] = cpu_to_le32(plen);
+ phdr->pbc[1] = cpu_to_le32(control);
+ memcpy(&phdr->hdr, hdr, hdrwords << 2);
+ tx->txreq.flags |= QIB_SDMA_TXREQ_F_FREEDESC;
+ tx->txreq.sg_count = ndesc;
+ tx->txreq.addr = dev->pio_hdrs_phys +
+ tx->hdr_inx * sizeof(struct qib_pio_header);
+ tx->hdr_dwords = hdrwords + 2; /* add PBC length */
+ ret = qib_sdma_verbs_send(ppd, ss, dwords, tx);
+ goto bail;
+ }
+
+ /* Allocate a buffer and copy the header and payload to it. */
+ tx->hdr_dwords = plen + 1;
+ phdr = kmalloc(tx->hdr_dwords << 2, GFP_ATOMIC);
+ if (!phdr)
+ goto err_tx;
+ phdr->pbc[0] = cpu_to_le32(plen);
+ phdr->pbc[1] = cpu_to_le32(control);
+ memcpy(&phdr->hdr, hdr, hdrwords << 2);
+ qib_copy_from_sge((u32 *) &phdr->hdr + hdrwords, ss, len);
+
+ tx->txreq.addr = dma_map_single(&dd->pcidev->dev, phdr,
+ tx->hdr_dwords << 2, DMA_TO_DEVICE);
+ if (dma_mapping_error(&dd->pcidev->dev, tx->txreq.addr))
+ goto map_err;
+ tx->align_buf = phdr;
+ tx->txreq.flags |= QIB_SDMA_TXREQ_F_FREEBUF;
+ tx->txreq.sg_count = 1;
+ ret = qib_sdma_verbs_send(ppd, NULL, 0, tx);
+ goto unaligned;
+
+map_err:
+ kfree(phdr);
+err_tx:
+ qib_put_txreq(tx);
+ ret = wait_kmem(dev, qp);
+unaligned:
+ ibp->n_unaligned++;
+bail:
+ return ret;
+bail_tx:
+ ret = PTR_ERR(tx);
+ goto bail;
+}
+
+/*
+ * No PIO buffer was available.  If the QP has since moved to the error
+ * state, return zero so the send work request is flushed; otherwise
+ * queue the QP to wait for a buffer and return -EBUSY.
+ */
+static int no_bufs_available(struct qib_qp *qp)
+{
+ struct qib_ibdev *dev = to_idev(qp->ibqp.device);
+ struct qib_devdata *dd;
+ unsigned long flags;
+ int ret = 0;
+
+ /*
+ * Note that as soon as want_buffer() is called and
+ * possibly before it returns, qib_ib_piobufavail()
+ * could be called. Therefore, put QP on the I/O wait list before
+ * enabling the PIO avail interrupt.
+ */
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (ib_qib_state_ops[qp->state] & QIB_PROCESS_RECV_OK) {
+ spin_lock(&dev->pending_lock);
+ if (list_empty(&qp->iowait)) {
+ dev->n_piowait++;
+ qp->s_flags |= QIB_S_WAIT_PIO;
+ list_add_tail(&qp->iowait, &dev->piowait);
+ dd = dd_from_dev(dev);
+ dd->f_wantpiobuf_intr(dd, 1);
+ }
+ spin_unlock(&dev->pending_lock);
+ qp->s_flags &= ~QIB_S_BUSY;
+ ret = -EBUSY;
+ }
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ return ret;
+}
+
+static int qib_verbs_send_pio(struct qib_qp *qp, struct qib_ib_header *ibhdr,
+ u32 hdrwords, struct qib_sge_state *ss, u32 len,
+ u32 plen, u32 dwords)
+{
+ struct qib_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+ struct qib_pportdata *ppd = dd->pport + qp->port_num - 1;
+ u32 *hdr = (u32 *) ibhdr;
+ u32 __iomem *piobuf_orig;
+ u32 __iomem *piobuf;
+ u64 pbc;
+ unsigned long flags;
+ unsigned flush_wc;
+ u32 control;
+ u32 pbufn;
+
+ control = dd->f_setpbc_control(ppd, plen, qp->s_srate,
+ be16_to_cpu(ibhdr->lrh[0]) >> 12);
+ pbc = ((u64) control << 32) | plen;
+ piobuf = dd->f_getsendbuf(ppd, pbc, &pbufn);
+ if (unlikely(piobuf == NULL))
+ return no_bufs_available(qp);
+
+ /*
+ * Write the pbc.
+ * We have to flush after the PBC for correctness on some cpus
+ * or the WC buffer may be written out of order.
+ */
+ writeq(pbc, piobuf);
+ piobuf_orig = piobuf;
+ piobuf += 2;
+
+ flush_wc = dd->flags & QIB_PIO_FLUSH_WC;
+ if (len == 0) {
+ /*
+ * If there is just the header portion, we must flush before
+ * writing the last word of the header for correctness, and again
+ * after the last header word (the trigger word).
+ */
+ if (flush_wc) {
+ qib_flush_wc();
+ qib_pio_copy(piobuf, hdr, hdrwords - 1);
+ qib_flush_wc();
+ __raw_writel(hdr[hdrwords - 1], piobuf + hdrwords - 1);
+ qib_flush_wc();
+ } else
+ qib_pio_copy(piobuf, hdr, hdrwords);
+ goto done;
+ }
+
+ if (flush_wc)
+ qib_flush_wc();
+ qib_pio_copy(piobuf, hdr, hdrwords);
+ piobuf += hdrwords;
+
+ /* The common case is aligned and contained in one segment. */
+ if (likely(ss->num_sge == 1 && len <= ss->sge.length &&
+ !((unsigned long)ss->sge.vaddr & (sizeof(u32) - 1)))) {
+ u32 *addr = (u32 *) ss->sge.vaddr;
+
+ /* Update address before sending packet. */
+ update_sge(ss, len);
+ if (flush_wc) {
+ qib_pio_copy(piobuf, addr, dwords - 1);
+ /* must flush everything written so far before the trigger word */
+ qib_flush_wc();
+ __raw_writel(addr[dwords - 1], piobuf + dwords - 1);
+ /* be sure trigger word is written */
+ qib_flush_wc();
+ } else
+ qib_pio_copy(piobuf, addr, dwords);
+ goto done;
+ }
+ copy_io(piobuf, ss, len, flush_wc);
+done:
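+ /* On chips that use a special trigger, write the trigger word to start the send. */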
+ if (dd->flags & QIB_USE_SPCL_TRIG) {
+ u32 spcl_off = (pbufn >= dd->piobcnt2k) ? 2047 : 1023;
+
+ qib_flush_wc();
+ __raw_writel(0xaebecede, piobuf_orig + spcl_off);
+ }
+ qib_sendbuf_done(dd, pbufn);
+ if (qp->s_rdma_mr) {
+ qib_put_mr(qp->s_rdma_mr);
+ qp->s_rdma_mr = NULL;
+ }
+ if (qp->s_wqe) {
+ spin_lock_irqsave(&qp->s_lock, flags);
+ qib_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ } else if (qp->ibqp.qp_type == IB_QPT_RC) {
+ spin_lock_irqsave(&qp->s_lock, flags);
+ qib_rc_send_complete(qp, ibhdr);
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+ }
+ return 0;
+}
+
+/**
+ * qib_verbs_send - send a packet
+ * @qp: the QP to send on
+ * @hdr: the packet header
+ * @hdrwords: the number of 32-bit words in the header
+ * @ss: the SGE to send
+ * @len: the length of the packet in bytes
+ *
+ * Return zero if the packet was sent or queued OK.
+ * Return non-zero and clear the QIB_S_BUSY flag in qp->s_flags otherwise.
+ */
+int qib_verbs_send(struct qib_qp *qp, struct qib_ib_header *hdr,
+ u32 hdrwords, struct qib_sge_state *ss, u32 len)
+{
+ struct qib_devdata *dd = dd_from_ibdev(qp->ibqp.device);
+ u32 plen;
+ int ret;
+ u32 dwords = (len + 3) >> 2;
+
+ /*
+ * Calculate the send buffer trigger address.
+ * The +1 counts for the pbc control dword following the pbc length.
+ */
+ plen = hdrwords + dwords + 1;
+
+ /*
+ * VL15 packets (IB_QPT_SMI) will always use PIO, so we
+ * can defer SDMA restart until link goes ACTIVE without
+ * worrying about just how we got there.
+ */
+ if (qp->ibqp.qp_type == IB_QPT_SMI ||
+ !(dd->flags & QIB_HAS_SEND_DMA))
+ ret = qib_verbs_send_pio(qp, hdr, hdrwords, ss, len,
+ plen, dwords);
+ else
+ ret = qib_verbs_send_dma(qp, hdr, hdrwords, ss, len,
+ plen, dwords);
+
+ return ret;
+}
+
+int qib_snapshot_counters(struct qib_pportdata *ppd, u64 *swords,
+ u64 *rwords, u64 *spkts, u64 *rpkts,
+ u64 *xmit_wait)
+{
+ int ret;
+ struct qib_devdata *dd = ppd->dd;
+
+ if (!(dd->flags & QIB_PRESENT)) {
+ /* no hardware, freeze, etc. */
+ ret = -EINVAL;
+ goto bail;
+ }
+ *swords = dd->f_portcntr(ppd, QIBPORTCNTR_WORDSEND);
+ *rwords = dd->f_portcntr(ppd, QIBPORTCNTR_WORDRCV);
+ *spkts = dd->f_portcntr(ppd, QIBPORTCNTR_PKTSEND);
+ *rpkts = dd->f_portcntr(ppd, QIBPORTCNTR_PKTRCV);
+ *xmit_wait = dd->f_portcntr(ppd, QIBPORTCNTR_SENDSTALL);
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * qib_get_counters - get various chip counters
+ * @ppd: the physical port of the qlogic_ib device
+ * @cntrs: counters are placed here
+ *
+ * Return the counters needed by recv_pma_get_portcounters().
+ */
+int qib_get_counters(struct qib_pportdata *ppd,
+ struct qib_verbs_counters *cntrs)
+{
+ int ret;
+
+ if (!(ppd->dd->flags & QIB_PRESENT)) {
+ /* no hardware, freeze, etc. */
+ ret = -EINVAL;
+ goto bail;
+ }
+ cntrs->symbol_error_counter =
+ ppd->dd->f_portcntr(ppd, QIBPORTCNTR_IBSYMBOLERR);
+ cntrs->link_error_recovery_counter =
+ ppd->dd->f_portcntr(ppd, QIBPORTCNTR_IBLINKERRRECOV);
+ /*
+ * The link downed counter counts when the other side downs the
+ * connection. We add in the number of times we downed the link
+ * due to local link integrity errors to compensate.
+ */
+ cntrs->link_downed_counter =
+ ppd->dd->f_portcntr(ppd, QIBPORTCNTR_IBLINKDOWN);
+ cntrs->port_rcv_errors =
+ ppd->dd->f_portcntr(ppd, QIBPORTCNTR_RXDROPPKT) +
+ ppd->dd->f_portcntr(ppd, QIBPORTCNTR_RCVOVFL) +
+ ppd->dd->f_portcntr(ppd, QIBPORTCNTR_ERR_RLEN) +
+ ppd->dd->f_portcntr(ppd, QIBPORTCNTR_INVALIDRLEN) +
+ ppd->dd->f_portcntr(ppd, QIBPORTCNTR_ERRLINK) +
+ ppd->dd->f_portcntr(ppd, QIBPORTCNTR_ERRICRC) +
+ ppd->dd->f_portcntr(ppd, QIBPORTCNTR_ERRVCRC) +
+ ppd->dd->f_portcntr(ppd, QIBPORTCNTR_ERRLPCRC) +
+ ppd->dd->f_portcntr(ppd, QIBPORTCNTR_BADFORMAT);
+ cntrs->port_rcv_errors +=
+ ppd->dd->f_portcntr(ppd, QIBPORTCNTR_RXLOCALPHYERR);
+ cntrs->port_rcv_errors +=
+ ppd->dd->f_portcntr(ppd, QIBPORTCNTR_RXVLERR);
+ cntrs->port_rcv_remphys_errors =
+ ppd->dd->f_portcntr(ppd, QIBPORTCNTR_RCVEBP);
+ cntrs->port_xmit_discards =
+ ppd->dd->f_portcntr(ppd, QIBPORTCNTR_UNSUPVL);
+ cntrs->port_xmit_data = ppd->dd->f_portcntr(ppd,
+ QIBPORTCNTR_WORDSEND);
+ cntrs->port_rcv_data = ppd->dd->f_portcntr(ppd,
+ QIBPORTCNTR_WORDRCV);
+ cntrs->port_xmit_packets = ppd->dd->f_portcntr(ppd,
+ QIBPORTCNTR_PKTSEND);
+ cntrs->port_rcv_packets = ppd->dd->f_portcntr(ppd,
+ QIBPORTCNTR_PKTRCV);
+ cntrs->local_link_integrity_errors =
+ ppd->dd->f_portcntr(ppd, QIBPORTCNTR_LLI);
+ cntrs->excessive_buffer_overrun_errors =
+ ppd->dd->f_portcntr(ppd, QIBPORTCNTR_EXCESSBUFOVFL);
+ cntrs->vl15_dropped =
+ ppd->dd->f_portcntr(ppd, QIBPORTCNTR_VL15PKTDROP);
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * qib_ib_piobufavail - callback when a PIO buffer is available
+ * @dd: the device pointer
+ *
+ * This is called from qib_intr() at interrupt level when a PIO buffer is
+ * available after qib_verbs_send() returned an error that no buffers were
+ * available. Disable the interrupt if there are no more QPs waiting.
+ */
+void qib_ib_piobufavail(struct qib_devdata *dd)
+{
+ struct qib_ibdev *dev = &dd->verbs_dev;
+ struct list_head *list;
+ struct qib_qp *qps[5];
+ struct qib_qp *qp;
+ unsigned long flags;
+ unsigned i, n;
+
+ list = &dev->piowait;
+ n = 0;
+
+ /*
+ * Note: checking that the piowait list is empty and clearing
+ * the buffer available interrupt needs to be atomic or we
+ * could end up with QPs on the wait list with the interrupt
+ * disabled.
+ */
+ spin_lock_irqsave(&dev->pending_lock, flags);
+ while (!list_empty(list)) {
+ if (n == ARRAY_SIZE(qps))
+ goto full;
+ qp = list_entry(list->next, struct qib_qp, iowait);
+ list_del_init(&qp->iowait);
+ atomic_inc(&qp->refcount);
+ qps[n++] = qp;
+ }
+ dd->f_wantpiobuf_intr(dd, 0);
+full:
+ spin_unlock_irqrestore(&dev->pending_lock, flags);
+
+ for (i = 0; i < n; i++) {
+ qp = qps[i];
+
+ spin_lock_irqsave(&qp->s_lock, flags);
+ if (qp->s_flags & QIB_S_WAIT_PIO) {
+ qp->s_flags &= ~QIB_S_WAIT_PIO;
+ qib_schedule_send(qp);
+ }
+ spin_unlock_irqrestore(&qp->s_lock, flags);
+
+ /* Notify qib_destroy_qp() if it is waiting. */
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+ }
+}
+
+static int qib_query_device(struct ib_device *ibdev,
+ struct ib_device_attr *props)
+{
+ struct qib_devdata *dd = dd_from_ibdev(ibdev);
+ struct qib_ibdev *dev = to_idev(ibdev);
+
+ memset(props, 0, sizeof(*props));
+
+ props->device_cap_flags = IB_DEVICE_BAD_PKEY_CNTR |
+ IB_DEVICE_BAD_QKEY_CNTR | IB_DEVICE_SHUTDOWN_PORT |
+ IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_RC_RNR_NAK_GEN |
+ IB_DEVICE_PORT_ACTIVE_EVENT | IB_DEVICE_SRQ_RESIZE;
+ props->page_size_cap = PAGE_SIZE;
+ props->vendor_id =
+ QIB_SRC_OUI_1 << 16 | QIB_SRC_OUI_2 << 8 | QIB_SRC_OUI_3;
+ props->vendor_part_id = dd->deviceid;
+ props->hw_ver = dd->minrev;
+ props->sys_image_guid = ib_qib_sys_image_guid;
+ props->max_mr_size = ~0ULL;
+ props->max_qp = ib_qib_max_qps;
+ props->max_qp_wr = ib_qib_max_qp_wrs;
+ props->max_sge = ib_qib_max_sges;
+ props->max_cq = ib_qib_max_cqs;
+ props->max_ah = ib_qib_max_ahs;
+ props->max_cqe = ib_qib_max_cqes;
+ props->max_mr = dev->lk_table.max;
+ props->max_fmr = dev->lk_table.max;
+ props->max_map_per_fmr = 32767;
+ props->max_pd = ib_qib_max_pds;
+ props->max_qp_rd_atom = QIB_MAX_RDMA_ATOMIC;
+ props->max_qp_init_rd_atom = 255;
+ /* props->max_res_rd_atom */
+ props->max_srq = ib_qib_max_srqs;
+ props->max_srq_wr = ib_qib_max_srq_wrs;
+ props->max_srq_sge = ib_qib_max_srq_sges;
+ /* props->local_ca_ack_delay */
+ props->atomic_cap = IB_ATOMIC_GLOB;
+ props->max_pkeys = qib_get_npkeys(dd);
+ props->max_mcast_grp = ib_qib_max_mcast_grps;
+ props->max_mcast_qp_attach = ib_qib_max_mcast_qp_attached;
+ props->max_total_mcast_qp_attach = props->max_mcast_qp_attach *
+ props->max_mcast_grp;
+
+ return 0;
+}
+
+static int qib_query_port(struct ib_device *ibdev, u8 port,
+ struct ib_port_attr *props)
+{
+ struct qib_devdata *dd = dd_from_ibdev(ibdev);
+ struct qib_ibport *ibp = to_iport(ibdev, port);
+ struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+ enum ib_mtu mtu;
+ u16 lid = ppd->lid;
+
+ memset(props, 0, sizeof(*props));
+ props->lid = lid ? lid : be16_to_cpu(IB_LID_PERMISSIVE);
+ props->lmc = ppd->lmc;
+ props->sm_lid = ibp->sm_lid;
+ props->sm_sl = ibp->sm_sl;
+ props->state = dd->f_iblink_state(ppd->lastibcstat);
+ props->phys_state = dd->f_ibphys_portstate(ppd->lastibcstat);
+ props->port_cap_flags = ibp->port_cap_flags;
+ props->gid_tbl_len = QIB_GUIDS_PER_PORT;
+ props->max_msg_sz = 0x80000000;
+ props->pkey_tbl_len = qib_get_npkeys(dd);
+ props->bad_pkey_cntr = ibp->pkey_violations;
+ props->qkey_viol_cntr = ibp->qkey_violations;
+ props->active_width = ppd->link_width_active;
+ /* See rate_show() */
+ props->active_speed = ppd->link_speed_active;
+ props->max_vl_num = qib_num_vls(ppd->vls_supported);
+ props->init_type_reply = 0;
+
+ props->max_mtu = qib_ibmtu ? qib_ibmtu : IB_MTU_4096;
+ switch (ppd->ibmtu) {
+ case 4096:
+ mtu = IB_MTU_4096;
+ break;
+ case 2048:
+ mtu = IB_MTU_2048;
+ break;
+ case 1024:
+ mtu = IB_MTU_1024;
+ break;
+ case 512:
+ mtu = IB_MTU_512;
+ break;
+ case 256:
+ mtu = IB_MTU_256;
+ break;
+ default:
+ mtu = IB_MTU_2048;
+ }
+ props->active_mtu = mtu;
+ props->subnet_timeout = ibp->subnet_timeout;
+
+ return 0;
+}
+
+static int qib_modify_device(struct ib_device *device,
+ int device_modify_mask,
+ struct ib_device_modify *device_modify)
+{
+ struct qib_devdata *dd = dd_from_ibdev(device);
+ unsigned i;
+ int ret;
+
+ if (device_modify_mask & ~(IB_DEVICE_MODIFY_SYS_IMAGE_GUID |
+ IB_DEVICE_MODIFY_NODE_DESC)) {
+ ret = -EOPNOTSUPP;
+ goto bail;
+ }
+
+ if (device_modify_mask & IB_DEVICE_MODIFY_NODE_DESC) {
+ memcpy(device->node_desc, device_modify->node_desc, 64);
+ for (i = 0; i < dd->num_pports; i++) {
+ struct qib_ibport *ibp = &dd->pport[i].ibport_data;
+
+ qib_node_desc_chg(ibp);
+ }
+ }
+
+ if (device_modify_mask & IB_DEVICE_MODIFY_SYS_IMAGE_GUID) {
+ ib_qib_sys_image_guid =
+ cpu_to_be64(device_modify->sys_image_guid);
+ for (i = 0; i < dd->num_pports; i++) {
+ struct qib_ibport *ibp = &dd->pport[i].ibport_data;
+
+ qib_sys_guid_chg(ibp);
+ }
+ }
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+static int qib_modify_port(struct ib_device *ibdev, u8 port,
+ int port_modify_mask, struct ib_port_modify *props)
+{
+ struct qib_ibport *ibp = to_iport(ibdev, port);
+ struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+
+ ibp->port_cap_flags |= props->set_port_cap_mask;
+ ibp->port_cap_flags &= ~props->clr_port_cap_mask;
+ if (props->set_port_cap_mask || props->clr_port_cap_mask)
+ qib_cap_mask_chg(ibp);
+ if (port_modify_mask & IB_PORT_SHUTDOWN)
+ qib_set_linkstate(ppd, QIB_IB_LINKDOWN);
+ if (port_modify_mask & IB_PORT_RESET_QKEY_CNTR)
+ ibp->qkey_violations = 0;
+ return 0;
+}
+
+static int qib_query_gid(struct ib_device *ibdev, u8 port,
+ int index, union ib_gid *gid)
+{
+ struct qib_devdata *dd = dd_from_ibdev(ibdev);
+ int ret = 0;
+
+ if (!port || port > dd->num_pports)
+ ret = -EINVAL;
+ else {
+ struct qib_ibport *ibp = to_iport(ibdev, port);
+ struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+
+ gid->global.subnet_prefix = ibp->gid_prefix;
+ if (index == 0)
+ gid->global.interface_id = ppd->guid;
+ else if (index < QIB_GUIDS_PER_PORT)
+ gid->global.interface_id = ibp->guids[index - 1];
+ else
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static struct ib_pd *qib_alloc_pd(struct ib_device *ibdev,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct qib_ibdev *dev = to_idev(ibdev);
+ struct qib_pd *pd;
+ struct ib_pd *ret;
+
+ /*
+ * This limit is essentially arbitrary.  Some correctness tests
+ * assume there is a maximum number of PDs that can be allocated,
+ * so even though the hardware imposes no such limit, we fail the
+ * allocation once it exceeds the value we report.
+ */
+
+ pd = kmalloc(sizeof(*pd), GFP_KERNEL);
+ if (!pd) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+
+ spin_lock(&dev->n_pds_lock);
+ if (dev->n_pds_allocated == ib_qib_max_pds) {
+ spin_unlock(&dev->n_pds_lock);
+ kfree(pd);
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+
+ dev->n_pds_allocated++;
+ spin_unlock(&dev->n_pds_lock);
+
+ /* ib_alloc_pd() will initialize pd->ibpd. */
+ pd->user = udata != NULL;
+
+ ret = &pd->ibpd;
+
+bail:
+ return ret;
+}
+
+static int qib_dealloc_pd(struct ib_pd *ibpd)
+{
+ struct qib_pd *pd = to_ipd(ibpd);
+ struct qib_ibdev *dev = to_idev(ibpd->device);
+
+ spin_lock(&dev->n_pds_lock);
+ dev->n_pds_allocated--;
+ spin_unlock(&dev->n_pds_lock);
+
+ kfree(pd);
+
+ return 0;
+}
+
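+/*
+ * Validate address handle attributes: DLID, the GRH requirement for
+ * multicast, SGID index, port number, static rate and SL.  Returns 0
+ * if the attributes are usable, -EINVAL otherwise.
+ */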
+int qib_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr)
+{
+ /* A multicast address requires a GRH (see ch. 8.4.1). */
+ if (ah_attr->dlid >= QIB_MULTICAST_LID_BASE &&
+ ah_attr->dlid != QIB_PERMISSIVE_LID &&
+ !(ah_attr->ah_flags & IB_AH_GRH))
+ goto bail;
+ if ((ah_attr->ah_flags & IB_AH_GRH) &&
+ ah_attr->grh.sgid_index >= QIB_GUIDS_PER_PORT)
+ goto bail;
+ if (ah_attr->dlid == 0)
+ goto bail;
+ if (ah_attr->port_num < 1 ||
+ ah_attr->port_num > ibdev->phys_port_cnt)
+ goto bail;
+ if (ah_attr->static_rate != IB_RATE_PORT_CURRENT &&
+ ib_rate_to_mult(ah_attr->static_rate) < 0)
+ goto bail;
+ if (ah_attr->sl > 15)
+ goto bail;
+ return 0;
+bail:
+ return -EINVAL;
+}
+
+/**
+ * qib_create_ah - create an address handle
+ * @pd: the protection domain
+ * @ah_attr: the attributes of the AH
+ *
+ * This may be called from interrupt context.
+ */
+static struct ib_ah *qib_create_ah(struct ib_pd *pd,
+ struct ib_ah_attr *ah_attr)
+{
+ struct qib_ah *ah;
+ struct ib_ah *ret;
+ struct qib_ibdev *dev = to_idev(pd->device);
+ unsigned long flags;
+
+ if (qib_check_ah(pd->device, ah_attr)) {
+ ret = ERR_PTR(-EINVAL);
+ goto bail;
+ }
+
+ ah = kmalloc(sizeof(*ah), GFP_ATOMIC);
+ if (!ah) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+
+ spin_lock_irqsave(&dev->n_ahs_lock, flags);
+ if (dev->n_ahs_allocated == ib_qib_max_ahs) {
+ spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+ kfree(ah);
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+
+ dev->n_ahs_allocated++;
+ spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+
+ /* ib_create_ah() will initialize ah->ibah. */
+ ah->attr = *ah_attr;
+ atomic_set(&ah->refcount, 0);
+
+ ret = &ah->ibah;
+
+bail:
+ return ret;
+}
+
+struct ib_ah *qib_create_qp0_ah(struct qib_ibport *ibp, u16 dlid)
+{
+ struct ib_ah_attr attr;
+ struct ib_ah *ah = ERR_PTR(-EINVAL);
+ struct qib_qp *qp0;
+
+ memset(&attr, 0, sizeof(attr));
+ attr.dlid = dlid;
+ attr.port_num = ppd_from_ibp(ibp)->port;
+ rcu_read_lock();
+ qp0 = rcu_dereference(ibp->qp0);
+ if (qp0)
+ ah = ib_create_ah(qp0->ibqp.pd, &attr);
+ rcu_read_unlock();
+ return ah;
+}
+
+/**
+ * qib_destroy_ah - destroy an address handle
+ * @ibah: the AH to destroy
+ *
+ * This may be called from interrupt context.
+ */
+static int qib_destroy_ah(struct ib_ah *ibah)
+{
+ struct qib_ibdev *dev = to_idev(ibah->device);
+ struct qib_ah *ah = to_iah(ibah);
+ unsigned long flags;
+
+ if (atomic_read(&ah->refcount) != 0)
+ return -EBUSY;
+
+ spin_lock_irqsave(&dev->n_ahs_lock, flags);
+ dev->n_ahs_allocated--;
+ spin_unlock_irqrestore(&dev->n_ahs_lock, flags);
+
+ kfree(ah);
+
+ return 0;
+}
+
+static int qib_modify_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
+{
+ struct qib_ah *ah = to_iah(ibah);
+
+ if (qib_check_ah(ibah->device, ah_attr))
+ return -EINVAL;
+
+ ah->attr = *ah_attr;
+
+ return 0;
+}
+
+static int qib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr)
+{
+ struct qib_ah *ah = to_iah(ibah);
+
+ *ah_attr = ah->attr;
+
+ return 0;
+}
+
+/**
+ * qib_get_npkeys - return the size of the PKEY table for context 0
+ * @dd: the qlogic_ib device
+ */
+unsigned qib_get_npkeys(struct qib_devdata *dd)
+{
+ return ARRAY_SIZE(dd->rcd[0]->pkeys);
+}
+
+/*
+ * Return the indexed PKEY from the port PKEY table.
+ * No need to validate rcd[ctxt]; the port is set up if we get here.
+ */
+unsigned qib_get_pkey(struct qib_ibport *ibp, unsigned index)
+{
+ struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+ struct qib_devdata *dd = ppd->dd;
+ unsigned ctxt = ppd->hw_pidx;
+ unsigned ret;
+
+ /* dd->rcd null if mini_init or some init failures */
+ if (!dd->rcd || index >= ARRAY_SIZE(dd->rcd[ctxt]->pkeys))
+ ret = 0;
+ else
+ ret = dd->rcd[ctxt]->pkeys[index];
+
+ return ret;
+}
+
+static int qib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+ u16 *pkey)
+{
+ struct qib_devdata *dd = dd_from_ibdev(ibdev);
+ int ret;
+
+ if (index >= qib_get_npkeys(dd)) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ *pkey = qib_get_pkey(to_iport(ibdev, port), index);
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+/**
+ * qib_alloc_ucontext - allocate a ucontext
+ * @ibdev: the infiniband device
+ * @udata: not used by the QLogic_IB driver
+ */
+
+static struct ib_ucontext *qib_alloc_ucontext(struct ib_device *ibdev,
+ struct ib_udata *udata)
+{
+ struct qib_ucontext *context;
+ struct ib_ucontext *ret;
+
+ context = kmalloc(sizeof(*context), GFP_KERNEL);
+ if (!context) {
+ ret = ERR_PTR(-ENOMEM);
+ goto bail;
+ }
+
+ ret = &context->ibucontext;
+
+bail:
+ return ret;
+}
+
+static int qib_dealloc_ucontext(struct ib_ucontext *context)
+{
+ kfree(to_iucontext(context));
+ return 0;
+}
+
+static void init_ibport(struct qib_pportdata *ppd)
+{
+ struct qib_verbs_counters cntrs;
+ struct qib_ibport *ibp = &ppd->ibport_data;
+
+ spin_lock_init(&ibp->lock);
+ /* Set the prefix to the default value (see ch. 4.1.1) */
+ ibp->gid_prefix = IB_DEFAULT_GID_PREFIX;
+ ibp->sm_lid = be16_to_cpu(IB_LID_PERMISSIVE);
+ ibp->port_cap_flags = IB_PORT_SYS_IMAGE_GUID_SUP |
+ IB_PORT_CLIENT_REG_SUP | IB_PORT_SL_MAP_SUP |
+ IB_PORT_TRAP_SUP | IB_PORT_AUTO_MIGR_SUP |
+ IB_PORT_DR_NOTICE_SUP | IB_PORT_CAP_MASK_NOTICE_SUP |
+ IB_PORT_OTHER_LOCAL_CHANGES_SUP;
+ if (ppd->dd->flags & QIB_HAS_LINK_LATENCY)
+ ibp->port_cap_flags |= IB_PORT_LINK_LATENCY_SUP;
+ ibp->pma_counter_select[0] = IB_PMA_PORT_XMIT_DATA;
+ ibp->pma_counter_select[1] = IB_PMA_PORT_RCV_DATA;
+ ibp->pma_counter_select[2] = IB_PMA_PORT_XMIT_PKTS;
+ ibp->pma_counter_select[3] = IB_PMA_PORT_RCV_PKTS;
+ ibp->pma_counter_select[4] = IB_PMA_PORT_XMIT_WAIT;
+
+ /* Snapshot current HW counters to "clear" them. */
+ qib_get_counters(ppd, &cntrs);
+ ibp->z_symbol_error_counter = cntrs.symbol_error_counter;
+ ibp->z_link_error_recovery_counter =
+ cntrs.link_error_recovery_counter;
+ ibp->z_link_downed_counter = cntrs.link_downed_counter;
+ ibp->z_port_rcv_errors = cntrs.port_rcv_errors;
+ ibp->z_port_rcv_remphys_errors = cntrs.port_rcv_remphys_errors;
+ ibp->z_port_xmit_discards = cntrs.port_xmit_discards;
+ ibp->z_port_xmit_data = cntrs.port_xmit_data;
+ ibp->z_port_rcv_data = cntrs.port_rcv_data;
+ ibp->z_port_xmit_packets = cntrs.port_xmit_packets;
+ ibp->z_port_rcv_packets = cntrs.port_rcv_packets;
+ ibp->z_local_link_integrity_errors =
+ cntrs.local_link_integrity_errors;
+ ibp->z_excessive_buffer_overrun_errors =
+ cntrs.excessive_buffer_overrun_errors;
+ ibp->z_vl15_dropped = cntrs.vl15_dropped;
+ RCU_INIT_POINTER(ibp->qp0, NULL);
+ RCU_INIT_POINTER(ibp->qp1, NULL);
+}
+
+/**
+ * qib_register_ib_device - register our device with the infiniband core
+ * @dd: the device data structure
+ * Return 0 on success or a negative errno on failure.
+ */
+int qib_register_ib_device(struct qib_devdata *dd)
+{
+ struct qib_ibdev *dev = &dd->verbs_dev;
+ struct ib_device *ibdev = &dev->ibdev;
+ struct qib_pportdata *ppd = dd->pport;
+ unsigned i, lk_tab_size;
+ int ret;
+
+ dev->qp_table_size = ib_qib_qp_table_size;
+ get_random_bytes(&dev->qp_rnd, sizeof(dev->qp_rnd));
+ dev->qp_table = kmalloc_array(
+ dev->qp_table_size,
+ sizeof(*dev->qp_table),
+ GFP_KERNEL);
+ if (!dev->qp_table) {
+ ret = -ENOMEM;
+ goto err_qpt;
+ }
+ for (i = 0; i < dev->qp_table_size; i++)
+ RCU_INIT_POINTER(dev->qp_table[i], NULL);
+
+ for (i = 0; i < dd->num_pports; i++)
+ init_ibport(ppd + i);
+
+ /* Only fields that need a non-zero initial value are set up here. */
+ spin_lock_init(&dev->qpt_lock);
+ spin_lock_init(&dev->n_pds_lock);
+ spin_lock_init(&dev->n_ahs_lock);
+ spin_lock_init(&dev->n_cqs_lock);
+ spin_lock_init(&dev->n_qps_lock);
+ spin_lock_init(&dev->n_srqs_lock);
+ spin_lock_init(&dev->n_mcast_grps_lock);
+ init_timer(&dev->mem_timer);
+ dev->mem_timer.function = mem_timer;
+ dev->mem_timer.data = (unsigned long) dev;
+
+ qib_init_qpn_table(dd, &dev->qpn_table);
+
+ /*
+ * The top ib_qib_lkey_table_size bits are used to index the
+ * table. The lower 8 bits can be owned by the user (copied from
+ * the LKEY). The remaining bits act as a generation number or tag.
+ */
+ spin_lock_init(&dev->lk_table.lock);
+ dev->lk_table.max = 1 << ib_qib_lkey_table_size;
+ lk_tab_size = dev->lk_table.max * sizeof(*dev->lk_table.table);
+ dev->lk_table.table = (struct qib_mregion __rcu **)
+ __get_free_pages(GFP_KERNEL, get_order(lk_tab_size));
+ if (dev->lk_table.table == NULL) {
+ ret = -ENOMEM;
+ goto err_lk;
+ }
+ RCU_INIT_POINTER(dev->dma_mr, NULL);
+ for (i = 0; i < dev->lk_table.max; i++)
+ RCU_INIT_POINTER(dev->lk_table.table[i], NULL);
+ INIT_LIST_HEAD(&dev->pending_mmaps);
+ spin_lock_init(&dev->pending_lock);
+ dev->mmap_offset = PAGE_SIZE;
+ spin_lock_init(&dev->mmap_offset_lock);
+ INIT_LIST_HEAD(&dev->piowait);
+ INIT_LIST_HEAD(&dev->dmawait);
+ INIT_LIST_HEAD(&dev->txwait);
+ INIT_LIST_HEAD(&dev->memwait);
+ INIT_LIST_HEAD(&dev->txreq_free);
+
+ if (ppd->sdma_descq_cnt) {
+ dev->pio_hdrs = dma_alloc_coherent(&dd->pcidev->dev,
+ ppd->sdma_descq_cnt *
+ sizeof(struct qib_pio_header),
+ &dev->pio_hdrs_phys,
+ GFP_KERNEL);
+ if (!dev->pio_hdrs) {
+ ret = -ENOMEM;
+ goto err_hdrs;
+ }
+ }
+
+ for (i = 0; i < ppd->sdma_descq_cnt; i++) {
+ struct qib_verbs_txreq *tx;
+
+ tx = kzalloc(sizeof(*tx), GFP_KERNEL);
+ if (!tx) {
+ ret = -ENOMEM;
+ goto err_tx;
+ }
+ tx->hdr_inx = i;
+ list_add(&tx->txreq.list, &dev->txreq_free);
+ }
+
+ /*
+ * The system image GUID is supposed to be the same for all
+ * IB HCAs in a single system but since there can be other
+ * device types in the system, we can't be sure this is unique.
+ */
+ if (!ib_qib_sys_image_guid)
+ ib_qib_sys_image_guid = ppd->guid;
+
+ strlcpy(ibdev->name, "qib%d", IB_DEVICE_NAME_MAX);
+ ibdev->owner = THIS_MODULE;
+ ibdev->node_guid = ppd->guid;
+ ibdev->uverbs_abi_ver = QIB_UVERBS_ABI_VERSION;
+ ibdev->uverbs_cmd_mask =
+ (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+ (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_AH) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_AH) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_AH) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_AH) |
+ (1ull << IB_USER_VERBS_CMD_REG_MR) |
+ (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_RESIZE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_POLL_CQ) |
+ (1ull << IB_USER_VERBS_CMD_REQ_NOTIFY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_QP) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+ (1ull << IB_USER_VERBS_CMD_POST_SEND) |
+ (1ull << IB_USER_VERBS_CMD_POST_RECV) |
+ (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) |
+ (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_SRQ) |
+ (1ull << IB_USER_VERBS_CMD_POST_SRQ_RECV);
+ ibdev->node_type = RDMA_NODE_IB_CA;
+ ibdev->phys_port_cnt = dd->num_pports;
+ ibdev->num_comp_vectors = 1;
+ ibdev->dma_device = &dd->pcidev->dev;
+ ibdev->query_device = qib_query_device;
+ ibdev->modify_device = qib_modify_device;
+ ibdev->query_port = qib_query_port;
+ ibdev->modify_port = qib_modify_port;
+ ibdev->query_pkey = qib_query_pkey;
+ ibdev->query_gid = qib_query_gid;
+ ibdev->alloc_ucontext = qib_alloc_ucontext;
+ ibdev->dealloc_ucontext = qib_dealloc_ucontext;
+ ibdev->alloc_pd = qib_alloc_pd;
+ ibdev->dealloc_pd = qib_dealloc_pd;
+ ibdev->create_ah = qib_create_ah;
+ ibdev->destroy_ah = qib_destroy_ah;
+ ibdev->modify_ah = qib_modify_ah;
+ ibdev->query_ah = qib_query_ah;
+ ibdev->create_srq = qib_create_srq;
+ ibdev->modify_srq = qib_modify_srq;
+ ibdev->query_srq = qib_query_srq;
+ ibdev->destroy_srq = qib_destroy_srq;
+ ibdev->create_qp = qib_create_qp;
+ ibdev->modify_qp = qib_modify_qp;
+ ibdev->query_qp = qib_query_qp;
+ ibdev->destroy_qp = qib_destroy_qp;
+ ibdev->post_send = qib_post_send;
+ ibdev->post_recv = qib_post_receive;
+ ibdev->post_srq_recv = qib_post_srq_receive;
+ ibdev->create_cq = qib_create_cq;
+ ibdev->destroy_cq = qib_destroy_cq;
+ ibdev->resize_cq = qib_resize_cq;
+ ibdev->poll_cq = qib_poll_cq;
+ ibdev->req_notify_cq = qib_req_notify_cq;
+ ibdev->get_dma_mr = qib_get_dma_mr;
+ ibdev->reg_phys_mr = qib_reg_phys_mr;
+ ibdev->reg_user_mr = qib_reg_user_mr;
+ ibdev->dereg_mr = qib_dereg_mr;
+ ibdev->alloc_fast_reg_mr = qib_alloc_fast_reg_mr;
+ ibdev->alloc_fast_reg_page_list = qib_alloc_fast_reg_page_list;
+ ibdev->free_fast_reg_page_list = qib_free_fast_reg_page_list;
+ ibdev->alloc_fmr = qib_alloc_fmr;
+ ibdev->map_phys_fmr = qib_map_phys_fmr;
+ ibdev->unmap_fmr = qib_unmap_fmr;
+ ibdev->dealloc_fmr = qib_dealloc_fmr;
+ ibdev->attach_mcast = qib_multicast_attach;
+ ibdev->detach_mcast = qib_multicast_detach;
+ ibdev->process_mad = qib_process_mad;
+ ibdev->mmap = qib_mmap;
+ ibdev->dma_ops = &qib_dma_mapping_ops;
+
+ snprintf(ibdev->node_desc, sizeof(ibdev->node_desc),
+ "Intel Infiniband HCA %s", init_utsname()->nodename);
+
+ ret = ib_register_device(ibdev, qib_create_port_files);
+ if (ret)
+ goto err_reg;
+
+ ret = qib_create_agents(dev);
+ if (ret)
+ goto err_agents;
+
+ ret = qib_verbs_register_sysfs(dd);
+ if (ret)
+ goto err_class;
+
+ goto bail;
+
+err_class:
+ qib_free_agents(dev);
+err_agents:
+ ib_unregister_device(ibdev);
+err_reg:
+err_tx:
+ while (!list_empty(&dev->txreq_free)) {
+ struct list_head *l = dev->txreq_free.next;
+ struct qib_verbs_txreq *tx;
+
+ list_del(l);
+ tx = list_entry(l, struct qib_verbs_txreq, txreq.list);
+ kfree(tx);
+ }
+ if (ppd->sdma_descq_cnt)
+ dma_free_coherent(&dd->pcidev->dev,
+ ppd->sdma_descq_cnt *
+ sizeof(struct qib_pio_header),
+ dev->pio_hdrs, dev->pio_hdrs_phys);
+err_hdrs:
+ free_pages((unsigned long) dev->lk_table.table, get_order(lk_tab_size));
+err_lk:
+ kfree(dev->qp_table);
+err_qpt:
+ qib_dev_err(dd, "cannot register verbs: %d!\n", -ret);
+bail:
+ return ret;
+}
+
+void qib_unregister_ib_device(struct qib_devdata *dd)
+{
+ struct qib_ibdev *dev = &dd->verbs_dev;
+ struct ib_device *ibdev = &dev->ibdev;
+ u32 qps_inuse;
+ unsigned lk_tab_size;
+
+ qib_verbs_unregister_sysfs(dd);
+
+ qib_free_agents(dev);
+
+ ib_unregister_device(ibdev);
+
+ if (!list_empty(&dev->piowait))
+ qib_dev_err(dd, "piowait list not empty!\n");
+ if (!list_empty(&dev->dmawait))
+ qib_dev_err(dd, "dmawait list not empty!\n");
+ if (!list_empty(&dev->txwait))
+ qib_dev_err(dd, "txwait list not empty!\n");
+ if (!list_empty(&dev->memwait))
+ qib_dev_err(dd, "memwait list not empty!\n");
+ if (dev->dma_mr)
+ qib_dev_err(dd, "DMA MR not NULL!\n");
+
+ qps_inuse = qib_free_all_qps(dd);
+ if (qps_inuse)
+ qib_dev_err(dd, "QP memory leak! %u still in use\n",
+ qps_inuse);
+
+ del_timer_sync(&dev->mem_timer);
+ qib_free_qpn_table(&dev->qpn_table);
+ while (!list_empty(&dev->txreq_free)) {
+ struct list_head *l = dev->txreq_free.next;
+ struct qib_verbs_txreq *tx;
+
+ list_del(l);
+ tx = list_entry(l, struct qib_verbs_txreq, txreq.list);
+ kfree(tx);
+ }
+ if (dd->pport->sdma_descq_cnt)
+ dma_free_coherent(&dd->pcidev->dev,
+ dd->pport->sdma_descq_cnt *
+ sizeof(struct qib_pio_header),
+ dev->pio_hdrs, dev->pio_hdrs_phys);
+ lk_tab_size = dev->lk_table.max * sizeof(*dev->lk_table.table);
+ free_pages((unsigned long) dev->lk_table.table,
+ get_order(lk_tab_size));
+ kfree(dev->qp_table);
+}
+
+/*
+ * This must be called with s_lock held.
+ */
+void qib_schedule_send(struct qib_qp *qp)
+{
+ if (qib_send_ok(qp)) {
+ struct qib_ibport *ibp =
+ to_iport(qp->ibqp.device, qp->port_num);
+ struct qib_pportdata *ppd = ppd_from_ibp(ibp);
+
+ queue_work(ppd->qib_wq, &qp->s_work);
+ }
+}
diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h
new file mode 100644
index 000000000..bfc8948fd
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_verbs.h
@@ -0,0 +1,1173 @@
+/*
+ * Copyright (c) 2012, 2013 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef QIB_VERBS_H
+#define QIB_VERBS_H
+
+#include <linux/types.h>
+#include <linux/spinlock.h>
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/kref.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <linux/completion.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_user_verbs.h>
+
+struct qib_ctxtdata;
+struct qib_pportdata;
+struct qib_devdata;
+struct qib_verbs_txreq;
+
+#define QIB_MAX_RDMA_ATOMIC 16
+#define QIB_GUIDS_PER_PORT 5
+
+#define QPN_MAX (1 << 24)
+#define QPNMAP_ENTRIES (QPN_MAX / PAGE_SIZE / BITS_PER_BYTE)
+
+/*
+ * Increment this value if any changes that break userspace ABI
+ * compatibility are made.
+ */
+#define QIB_UVERBS_ABI_VERSION 2
+
+/*
+ * Define an ib_cq_notify value that is not valid so we know when CQ
+ * notifications are armed.
+ */
+#define IB_CQ_NONE (IB_CQ_NEXT_COMP + 1)
+
+#define IB_SEQ_NAK (3 << 29)
+
+/* AETH NAK opcode values */
+#define IB_RNR_NAK 0x20
+#define IB_NAK_PSN_ERROR 0x60
+#define IB_NAK_INVALID_REQUEST 0x61
+#define IB_NAK_REMOTE_ACCESS_ERROR 0x62
+#define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63
+#define IB_NAK_INVALID_RD_REQUEST 0x64
+
+/* Flags for checking QP state (see ib_qib_state_ops[]) */
+#define QIB_POST_SEND_OK 0x01
+#define QIB_POST_RECV_OK 0x02
+#define QIB_PROCESS_RECV_OK 0x04
+#define QIB_PROCESS_SEND_OK 0x08
+#define QIB_PROCESS_NEXT_SEND_OK 0x10
+#define QIB_FLUSH_SEND 0x20
+#define QIB_FLUSH_RECV 0x40
+#define QIB_PROCESS_OR_FLUSH_SEND \
+ (QIB_PROCESS_SEND_OK | QIB_FLUSH_SEND)
+
+/* IB Performance Manager status values */
+#define IB_PMA_SAMPLE_STATUS_DONE 0x00
+#define IB_PMA_SAMPLE_STATUS_STARTED 0x01
+#define IB_PMA_SAMPLE_STATUS_RUNNING 0x02
+
+/* Mandatory IB performance counter select values. */
+#define IB_PMA_PORT_XMIT_DATA cpu_to_be16(0x0001)
+#define IB_PMA_PORT_RCV_DATA cpu_to_be16(0x0002)
+#define IB_PMA_PORT_XMIT_PKTS cpu_to_be16(0x0003)
+#define IB_PMA_PORT_RCV_PKTS cpu_to_be16(0x0004)
+#define IB_PMA_PORT_XMIT_WAIT cpu_to_be16(0x0005)
+
+#define QIB_VENDOR_IPG cpu_to_be16(0xFFA0)
+
+#define IB_BTH_REQ_ACK (1 << 31)
+#define IB_BTH_SOLICITED (1 << 23)
+#define IB_BTH_MIG_REQ (1 << 22)
+
+/* XXX Should be defined in ib_verbs.h enum ib_port_cap_flags */
+#define IB_PORT_OTHER_LOCAL_CHANGES_SUP (1 << 26)
+
+#define IB_GRH_VERSION 6
+#define IB_GRH_VERSION_MASK 0xF
+#define IB_GRH_VERSION_SHIFT 28
+#define IB_GRH_TCLASS_MASK 0xFF
+#define IB_GRH_TCLASS_SHIFT 20
+#define IB_GRH_FLOW_MASK 0xFFFFF
+#define IB_GRH_FLOW_SHIFT 0
+#define IB_GRH_NEXT_HDR 0x1B
+
+#define IB_DEFAULT_GID_PREFIX cpu_to_be64(0xfe80000000000000ULL)
+
+/* Values for set/get portinfo VLCap OperationalVLs */
+#define IB_VL_VL0 1
+#define IB_VL_VL0_1 2
+#define IB_VL_VL0_3 3
+#define IB_VL_VL0_7 4
+#define IB_VL_VL0_14 5
+
+static inline int qib_num_vls(int vls)
+{
+ switch (vls) {
+ default:
+ case IB_VL_VL0:
+ return 1;
+ case IB_VL_VL0_1:
+ return 2;
+ case IB_VL_VL0_3:
+ return 4;
+ case IB_VL_VL0_7:
+ return 8;
+ case IB_VL_VL0_14:
+ return 15;
+ }
+}
+
+struct ib_reth {
+ __be64 vaddr;
+ __be32 rkey;
+ __be32 length;
+} __packed;
+
+struct ib_atomic_eth {
+ __be32 vaddr[2]; /* unaligned so access as 2 32-bit words */
+ __be32 rkey;
+ __be64 swap_data;
+ __be64 compare_data;
+} __packed;
+
+struct qib_other_headers {
+ __be32 bth[3];
+ union {
+ struct {
+ __be32 deth[2];
+ __be32 imm_data;
+ } ud;
+ struct {
+ struct ib_reth reth;
+ __be32 imm_data;
+ } rc;
+ struct {
+ __be32 aeth;
+ __be32 atomic_ack_eth[2];
+ } at;
+ __be32 imm_data;
+ __be32 aeth;
+ struct ib_atomic_eth atomic_eth;
+ } u;
+} __packed;
+
+/*
+ * Note that UD packets with a GRH header are 8+40+12+8 = 68 bytes
+ * long (72 w/ imm_data). Only the first 56 bytes of the IB header
+ * will be in the eager header buffer. The remaining 12 or 16 bytes
+ * are in the data buffer.
+ */
+struct qib_ib_header {
+ __be16 lrh[4];
+ union {
+ struct {
+ struct ib_grh grh;
+ struct qib_other_headers oth;
+ } l;
+ struct qib_other_headers oth;
+ } u;
+} __packed;
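+
+/*
+ * For reference, the 68-byte figure above follows directly from the
+ * field sizes in these structures: LRH (4 x __be16) = 8, GRH
+ * (struct ib_grh) = 40, BTH (3 x __be32) = 12 and DETH (2 x __be32) = 8
+ * bytes, giving 8 + 40 + 12 + 8 = 68, or 72 with the 4-byte imm_data.
+ */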
+
+struct qib_pio_header {
+ __le32 pbc[2];
+ struct qib_ib_header hdr;
+} __packed;
+
+/*
+ * There is one struct qib_mcast for each multicast GID.
+ * All attached QPs are then stored as a list of
+ * struct qib_mcast_qp.
+ */
+struct qib_mcast_qp {
+ struct list_head list;
+ struct qib_qp *qp;
+};
+
+struct qib_mcast {
+ struct rb_node rb_node;
+ union ib_gid mgid;
+ struct list_head qp_list;
+ wait_queue_head_t wait;
+ atomic_t refcount;
+ int n_attached;
+};
+
+/* Protection domain */
+struct qib_pd {
+ struct ib_pd ibpd;
+ int user; /* non-zero if created from user space */
+};
+
+/* Address Handle */
+struct qib_ah {
+ struct ib_ah ibah;
+ struct ib_ah_attr attr;
+ atomic_t refcount;
+};
+
+/*
+ * This structure is used by qib_mmap() to validate an offset
+ * when an mmap() request is made. The vm_area_struct then uses
+ * this as its vm_private_data.
+ */
+struct qib_mmap_info {
+ struct list_head pending_mmaps;
+ struct ib_ucontext *context;
+ void *obj;
+ __u64 offset;
+ struct kref ref;
+ unsigned size;
+};
+
+/*
+ * This structure is used to contain the head pointer, tail pointer,
+ * and completion queue entries as a single memory allocation so
+ * it can be mmap'ed into user space.
+ */
+struct qib_cq_wc {
+ u32 head; /* index of next entry to fill */
+ u32 tail; /* index of next ib_poll_cq() entry */
+ union {
+ /* these are actually size ibcq.cqe + 1 */
+ struct ib_uverbs_wc uqueue[0];
+ struct ib_wc kqueue[0];
+ };
+};
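+
+/*
+ * The extra entry noted above (ibcq.cqe + 1) is the usual ring-buffer
+ * trick: with one more slot than requested CQEs, head == tail can
+ * unambiguously mean "empty" while a full queue still leaves one slot
+ * unused.
+ */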
+
+/*
+ * The completion queue structure.
+ */
+struct qib_cq {
+ struct ib_cq ibcq;
+ struct kthread_work comptask;
+ struct qib_devdata *dd;
+ spinlock_t lock; /* protect changes in this struct */
+ u8 notify;
+ u8 triggered;
+ struct qib_cq_wc *queue;
+ struct qib_mmap_info *ip;
+};
+
+/*
+ * A segment is a linear region of low physical memory.
+ * XXX Maybe we should use phys addr here and kmap()/kunmap().
+ * Used by the verbs layer.
+ */
+struct qib_seg {
+ void *vaddr;
+ size_t length;
+};
+
+/* The number of qib_segs that fit in a page. */
+#define QIB_SEGSZ (PAGE_SIZE / sizeof(struct qib_seg))
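+/*
+ * On a typical 64-bit build (8-byte pointer and 8-byte size_t, so a
+ * 16-byte struct qib_seg) with 4 KiB pages, this works out to
+ * 4096 / 16 = 256 segments per map page.
+ */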
+
+struct qib_segarray {
+ struct qib_seg segs[QIB_SEGSZ];
+};
+
+struct qib_mregion {
+ struct ib_pd *pd; /* shares refcnt of ibmr.pd */
+ u64 user_base; /* User's address for this region */
+ u64 iova; /* IB start address of this region */
+ size_t length;
+ u32 lkey;
+ u32 offset; /* offset (bytes) to start of region */
+ int access_flags;
+ u32 max_segs; /* number of qib_segs in all the arrays */
+ u32 mapsz; /* size of the map array */
+ u8 page_shift; /* 0 - non-uniform/non power-of-2 sizes */
+ u8 lkey_published; /* in global table */
+ struct completion comp; /* complete when refcount goes to zero */
+ struct rcu_head list;
+ atomic_t refcount;
+ struct qib_segarray *map[0]; /* the segments */
+};
+
+/*
+ * These keep track of the copy progress within a memory region.
+ * Used by the verbs layer.
+ */
+struct qib_sge {
+ struct qib_mregion *mr;
+ void *vaddr; /* kernel virtual address of segment */
+ u32 sge_length; /* length of the SGE */
+ u32 length; /* remaining length of the segment */
+ u16 m; /* current index: mr->map[m] */
+ u16 n; /* current index: mr->map[m]->segs[n] */
+};
+
+/* Memory region */
+struct qib_mr {
+ struct ib_mr ibmr;
+ struct ib_umem *umem;
+ struct qib_mregion mr; /* must be last */
+};
+
+/*
+ * Send work request queue entry.
+ * The size of the sg_list is determined when the QP is created and stored
+ * in qp->s_max_sge.
+ */
+struct qib_swqe {
+ struct ib_send_wr wr; /* don't use wr.sg_list */
+ u32 psn; /* first packet sequence number */
+ u32 lpsn; /* last packet sequence number */
+ u32 ssn; /* send sequence number */
+ u32 length; /* total length of data in sg_list */
+ struct qib_sge sg_list[0];
+};
+
+/*
+ * Receive work request queue entry.
+ * The size of the sg_list is determined when the QP (or SRQ) is created
+ * and stored in qp->r_rq.max_sge (or srq->rq.max_sge).
+ */
+struct qib_rwqe {
+ u64 wr_id;
+ u8 num_sge;
+ struct ib_sge sg_list[0];
+};
+
+/*
+ * This structure is used to contain the head pointer, tail pointer,
+ * and receive work queue entries as a single memory allocation so
+ * it can be mmap'ed into user space.
+ * Note that the wq array elements are variable size so you can't
+ * just index into the array to get the N'th element;
+ * use get_rwqe_ptr() instead.
+ */
+struct qib_rwq {
+ u32 head; /* new work requests posted to the head */
+ u32 tail; /* receives pull requests from here. */
+ struct qib_rwqe wq[0];
+};
+
+struct qib_rq {
+ struct qib_rwq *wq;
+ u32 size; /* size of RWQE array */
+ u8 max_sge;
+ spinlock_t lock /* protect changes in this struct */
+ ____cacheline_aligned_in_smp;
+};
+
+struct qib_srq {
+ struct ib_srq ibsrq;
+ struct qib_rq rq;
+ struct qib_mmap_info *ip;
+ /* send signal when number of RWQEs < limit */
+ u32 limit;
+};
+
+struct qib_sge_state {
+ struct qib_sge *sg_list; /* next SGE to be used if any */
+ struct qib_sge sge; /* progress state for the current SGE */
+ u32 total_len;
+ u8 num_sge;
+};
+
+/*
+ * This structure holds the information that the send tasklet needs
+ * to send an RDMA read response or atomic operation.
+ */
+struct qib_ack_entry {
+ u8 opcode;
+ u8 sent;
+ u32 psn;
+ u32 lpsn;
+ union {
+ struct qib_sge rdma_sge;
+ u64 atomic_data;
+ };
+};
+
+/*
+ * Variables prefixed with s_ are for the requester (sender).
+ * Variables prefixed with r_ are for the responder (receiver).
+ * Variables prefixed with ack_ are for responder replies.
+ *
+ * Common variables are protected by taking both r_rq.lock and s_lock,
+ * in that order; this only happens in modify_qp() or when changing the
+ * QP 'state'.
+ */
+struct qib_qp {
+ struct ib_qp ibqp;
+ /* read mostly fields above and below */
+ struct ib_ah_attr remote_ah_attr;
+ struct ib_ah_attr alt_ah_attr;
+ struct qib_qp __rcu *next; /* link list for QPN hash table */
+ struct qib_swqe *s_wq; /* send work queue */
+ struct qib_mmap_info *ip;
+ struct qib_ib_header *s_hdr; /* next packet header to send */
+ unsigned long timeout_jiffies; /* computed from timeout */
+
+ enum ib_mtu path_mtu;
+ u32 remote_qpn;
+ u32 pmtu; /* decoded from path_mtu */
+ u32 qkey; /* QKEY for this QP (for UD or RD) */
+ u32 s_size; /* send work queue size */
+ u32 s_rnr_timeout; /* number of milliseconds for RNR timeout */
+
+ u8 state; /* QP state */
+ u8 qp_access_flags;
+ u8 alt_timeout; /* Alternate path timeout for this QP */
+ u8 timeout; /* Timeout for this QP */
+ u8 s_srate;
+ u8 s_mig_state;
+ u8 port_num;
+ u8 s_pkey_index; /* PKEY index to use */
+ u8 s_alt_pkey_index; /* Alternate path PKEY index to use */
+ u8 r_max_rd_atomic; /* max number of RDMA read/atomic to receive */
+ u8 s_max_rd_atomic; /* max number of RDMA read/atomic to send */
+ u8 s_retry_cnt; /* number of times to retry */
+ u8 s_rnr_retry_cnt;
+ u8 r_min_rnr_timer; /* retry timeout value for RNR NAKs */
+ u8 s_max_sge; /* size of s_wq->sg_list */
+ u8 s_draining;
+
+ /* start of read/write fields */
+
+ atomic_t refcount ____cacheline_aligned_in_smp;
+ wait_queue_head_t wait;
+
+
+ struct qib_ack_entry s_ack_queue[QIB_MAX_RDMA_ATOMIC + 1]
+ ____cacheline_aligned_in_smp;
+ struct qib_sge_state s_rdma_read_sge;
+
+ spinlock_t r_lock ____cacheline_aligned_in_smp; /* used for APM */
+ unsigned long r_aflags;
+ u64 r_wr_id; /* ID for current receive WQE */
+ u32 r_ack_psn; /* PSN for next ACK or atomic ACK */
+ u32 r_len; /* total length of r_sge */
+ u32 r_rcv_len; /* receive data len processed */
+ u32 r_psn; /* expected rcv packet sequence number */
+ u32 r_msn; /* message sequence number */
+
+ u8 r_state; /* opcode of last packet received */
+ u8 r_flags;
+ u8 r_head_ack_queue; /* index into s_ack_queue[] */
+
+ struct list_head rspwait; /* link for waiting to respond */
+
+ struct qib_sge_state r_sge; /* current receive data */
+ struct qib_rq r_rq; /* receive work queue */
+
+ spinlock_t s_lock ____cacheline_aligned_in_smp;
+ struct qib_sge_state *s_cur_sge;
+ u32 s_flags;
+ struct qib_verbs_txreq *s_tx;
+ struct qib_swqe *s_wqe;
+ struct qib_sge_state s_sge; /* current send request data */
+ struct qib_mregion *s_rdma_mr;
+ atomic_t s_dma_busy;
+ u32 s_cur_size; /* size of send packet in bytes */
+ u32 s_len; /* total length of s_sge */
+ u32 s_rdma_read_len; /* total length of s_rdma_read_sge */
+ u32 s_next_psn; /* PSN for next request */
+ u32 s_last_psn; /* last response PSN processed */
+ u32 s_sending_psn; /* lowest PSN that is being sent */
+ u32 s_sending_hpsn; /* highest PSN that is being sent */
+ u32 s_psn; /* current packet sequence number */
+ u32 s_ack_rdma_psn; /* PSN for sending RDMA read responses */
+ u32 s_ack_psn; /* PSN for acking sends and RDMA writes */
+ u32 s_head; /* new entries added here */
+ u32 s_tail; /* next entry to process */
+ u32 s_cur; /* current work queue entry */
+ u32 s_acked; /* last un-ACK'ed entry */
+ u32 s_last; /* last completed entry */
+ u32 s_ssn; /* SSN of tail entry */
+ u32 s_lsn; /* limit sequence number (credit) */
+ u16 s_hdrwords; /* size of s_hdr in 32 bit words */
+ u16 s_rdma_ack_cnt;
+ u8 s_state; /* opcode of last packet sent */
+ u8 s_ack_state; /* opcode of packet to ACK */
+ u8 s_nak_state; /* non-zero if NAK is pending */
+ u8 r_nak_state; /* non-zero if NAK is pending */
+ u8 s_retry; /* requester retry counter */
+ u8 s_rnr_retry; /* requester RNR retry counter */
+ u8 s_num_rd_atomic; /* number of RDMA read/atomic pending */
+ u8 s_tail_ack_queue; /* index into s_ack_queue[] */
+
+ struct qib_sge_state s_ack_rdma_sge;
+ struct timer_list s_timer;
+ struct list_head iowait; /* link for wait PIO buf */
+
+ struct work_struct s_work;
+
+ wait_queue_head_t wait_dma;
+
+ struct qib_sge r_sg_list[0] /* verified SGEs */
+ ____cacheline_aligned_in_smp;
+};
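+
+/*
+ * The field layout above is deliberate: the read-mostly fields come
+ * first, and the ____cacheline_aligned_in_smp annotations start new
+ * cache lines for the receive-side (r_lock) and send-side (s_lock)
+ * groups so that the two paths do not false-share cache lines on SMP.
+ */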
+
+/*
+ * Atomic bit definitions for r_aflags.
+ */
+#define QIB_R_WRID_VALID 0
+#define QIB_R_REWIND_SGE 1
+
+/*
+ * Bit definitions for r_flags.
+ */
+#define QIB_R_REUSE_SGE 0x01
+#define QIB_R_RDMAR_SEQ 0x02
+#define QIB_R_RSP_NAK 0x04
+#define QIB_R_RSP_SEND 0x08
+#define QIB_R_COMM_EST 0x10
+
+/*
+ * Bit definitions for s_flags.
+ *
+ * QIB_S_SIGNAL_REQ_WR - set if send WRs generate completions only when
+ * explicitly signaled
+ * QIB_S_BUSY - send tasklet is processing the QP
+ * QIB_S_TIMER - the RC retry timer is active
+ * QIB_S_RESP_PENDING - an RC response is pending
+ * QIB_S_ACK_PENDING - an ACK is waiting to be sent after RDMA read/atomics
+ * QIB_S_WAIT_FENCE - waiting for all prior RDMA read or atomic SWQEs
+ * before processing the next SWQE
+ * QIB_S_WAIT_RDMAR - waiting for an RDMA read or atomic SWQE to complete
+ * before processing the next SWQE
+ * QIB_S_WAIT_RNR - waiting for RNR timeout
+ * QIB_S_WAIT_SSN_CREDIT - waiting for RC credits to process next SWQE
+ * QIB_S_WAIT_DMA - waiting for send DMA queue to drain before generating
+ * next send completion entry not via send DMA
+ * QIB_S_WAIT_PIO - waiting for a send buffer to be available
+ * QIB_S_WAIT_TX - waiting for a struct qib_verbs_txreq to be available
+ * QIB_S_WAIT_DMA_DESC - waiting for DMA descriptors to be available
+ * QIB_S_WAIT_KMEM - waiting for kernel memory to be available
+ * QIB_S_WAIT_PSN - waiting for a packet to exit the send DMA queue
+ * QIB_S_WAIT_ACK - waiting for an ACK packet before sending more requests
+ * QIB_S_SEND_ONE - send one packet, request ACK, then wait for ACK
+ */
+#define QIB_S_SIGNAL_REQ_WR 0x0001
+#define QIB_S_BUSY 0x0002
+#define QIB_S_TIMER 0x0004
+#define QIB_S_RESP_PENDING 0x0008
+#define QIB_S_ACK_PENDING 0x0010
+#define QIB_S_WAIT_FENCE 0x0020
+#define QIB_S_WAIT_RDMAR 0x0040
+#define QIB_S_WAIT_RNR 0x0080
+#define QIB_S_WAIT_SSN_CREDIT 0x0100
+#define QIB_S_WAIT_DMA 0x0200
+#define QIB_S_WAIT_PIO 0x0400
+#define QIB_S_WAIT_TX 0x0800
+#define QIB_S_WAIT_DMA_DESC 0x1000
+#define QIB_S_WAIT_KMEM 0x2000
+#define QIB_S_WAIT_PSN 0x4000
+#define QIB_S_WAIT_ACK 0x8000
+#define QIB_S_SEND_ONE 0x10000
+#define QIB_S_UNLIMITED_CREDIT 0x20000
+
+/*
+ * Wait flags that would prevent any packet type from being sent.
+ */
+#define QIB_S_ANY_WAIT_IO (QIB_S_WAIT_PIO | QIB_S_WAIT_TX | \
+ QIB_S_WAIT_DMA_DESC | QIB_S_WAIT_KMEM)
+
+/*
+ * Wait flags that would prevent send work requests from making progress.
+ */
+#define QIB_S_ANY_WAIT_SEND (QIB_S_WAIT_FENCE | QIB_S_WAIT_RDMAR | \
+ QIB_S_WAIT_RNR | QIB_S_WAIT_SSN_CREDIT | QIB_S_WAIT_DMA | \
+ QIB_S_WAIT_PSN | QIB_S_WAIT_ACK)
+
+#define QIB_S_ANY_WAIT (QIB_S_ANY_WAIT_IO | QIB_S_ANY_WAIT_SEND)
+
+#define QIB_PSN_CREDIT 16
+
+/*
+ * Since struct qib_swqe is not a fixed size, we can't simply index into
+ * struct qib_qp.s_wq. This function does the array index computation.
+ */
+static inline struct qib_swqe *get_swqe_ptr(struct qib_qp *qp,
+ unsigned n)
+{
+ return (struct qib_swqe *)((char *)qp->s_wq +
+ (sizeof(struct qib_swqe) +
+ qp->s_max_sge *
+ sizeof(struct qib_sge)) * n);
+}
+
+/*
+ * Since struct qib_rwqe is not a fixed size, we can't simply index into
+ * struct qib_rwq.wq. This function does the array index computation.
+ */
+static inline struct qib_rwqe *get_rwqe_ptr(struct qib_rq *rq, unsigned n)
+{
+ return (struct qib_rwqe *)
+ ((char *) rq->wq->wq +
+ (sizeof(struct qib_rwqe) +
+ rq->max_sge * sizeof(struct ib_sge)) * n);
+}
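+
+/*
+ * Both helpers above exist because struct qib_swqe and struct qib_rwqe
+ * end in zero-length sg_list[] arrays: the real per-entry stride is the
+ * structure size plus max_sge scatter/gather elements, so plain array
+ * indexing on s_wq or wq->wq would use the wrong stride.
+ */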
+
+/*
+ * QPN-map pages start out as NULL; they are allocated upon first use
+ * and are never deallocated. This way, large bitmaps are not allocated
+ * unless large numbers of QPs are used.
+ */
+struct qpn_map {
+ void *page;
+};
+
+struct qib_qpn_table {
+ spinlock_t lock; /* protect changes in this struct */
+ unsigned flags; /* flags for QP0/1 allocated for each port */
+ u32 last; /* last QP number allocated */
+ u32 nmaps; /* size of the map table */
+ u16 limit;
+ u16 mask;
+ /* bit map of free QP numbers other than 0/1 */
+ struct qpn_map map[QPNMAP_ENTRIES];
+};
+
+struct qib_lkey_table {
+ spinlock_t lock; /* protect changes in this struct */
+ u32 next; /* next unused index (speeds search) */
+ u32 gen; /* generation count */
+ u32 max; /* size of the table */
+ struct qib_mregion __rcu **table;
+};
+
+struct qib_opcode_stats {
+ u64 n_packets; /* number of packets */
+ u64 n_bytes; /* total number of bytes */
+};
+
+struct qib_opcode_stats_perctx {
+ struct qib_opcode_stats stats[128];
+};
+
+struct qib_pma_counters {
+ u64 n_unicast_xmit; /* total unicast packets sent */
+ u64 n_unicast_rcv; /* total unicast packets received */
+ u64 n_multicast_xmit; /* total multicast packets sent */
+ u64 n_multicast_rcv; /* total multicast packets received */
+};
+
+struct qib_ibport {
+ struct qib_qp __rcu *qp0;
+ struct qib_qp __rcu *qp1;
+ struct ib_mad_agent *send_agent; /* agent for SMI (traps) */
+ struct qib_ah *sm_ah;
+ struct qib_ah *smi_ah;
+ struct rb_root mcast_tree;
+ spinlock_t lock; /* protect changes in this struct */
+
+ /* non-zero when timer is set */
+ unsigned long mkey_lease_timeout;
+ unsigned long trap_timeout;
+ __be64 gid_prefix; /* in network order */
+ __be64 mkey;
+ __be64 guids[QIB_GUIDS_PER_PORT - 1]; /* writable GUIDs */
+ u64 tid; /* TID for traps */
+ struct qib_pma_counters __percpu *pmastats;
+ u64 z_unicast_xmit; /* starting count for PMA */
+ u64 z_unicast_rcv; /* starting count for PMA */
+ u64 z_multicast_xmit; /* starting count for PMA */
+ u64 z_multicast_rcv; /* starting count for PMA */
+ u64 z_symbol_error_counter; /* starting count for PMA */
+ u64 z_link_error_recovery_counter; /* starting count for PMA */
+ u64 z_link_downed_counter; /* starting count for PMA */
+ u64 z_port_rcv_errors; /* starting count for PMA */
+ u64 z_port_rcv_remphys_errors; /* starting count for PMA */
+ u64 z_port_xmit_discards; /* starting count for PMA */
+ u64 z_port_xmit_data; /* starting count for PMA */
+ u64 z_port_rcv_data; /* starting count for PMA */
+ u64 z_port_xmit_packets; /* starting count for PMA */
+ u64 z_port_rcv_packets; /* starting count for PMA */
+ u32 z_local_link_integrity_errors; /* starting count for PMA */
+ u32 z_excessive_buffer_overrun_errors; /* starting count for PMA */
+ u32 z_vl15_dropped; /* starting count for PMA */
+ u32 n_rc_resends;
+ u32 n_rc_acks;
+ u32 n_rc_qacks;
+ u32 n_rc_delayed_comp;
+ u32 n_seq_naks;
+ u32 n_rdma_seq;
+ u32 n_rnr_naks;
+ u32 n_other_naks;
+ u32 n_loop_pkts;
+ u32 n_pkt_drops;
+ u32 n_vl15_dropped;
+ u32 n_rc_timeouts;
+ u32 n_dmawait;
+ u32 n_unaligned;
+ u32 n_rc_dupreq;
+ u32 n_rc_seqnak;
+ u32 port_cap_flags;
+ u32 pma_sample_start;
+ u32 pma_sample_interval;
+ __be16 pma_counter_select[5];
+ u16 pma_tag;
+ u16 pkey_violations;
+ u16 qkey_violations;
+ u16 mkey_violations;
+ u16 mkey_lease_period;
+ u16 sm_lid;
+ u16 repress_traps;
+ u8 sm_sl;
+ u8 mkeyprot;
+ u8 subnet_timeout;
+ u8 vl_high_limit;
+ u8 sl_to_vl[16];
+
+};
+
+
+struct qib_ibdev {
+ struct ib_device ibdev;
+ struct list_head pending_mmaps;
+ spinlock_t mmap_offset_lock; /* protect mmap_offset */
+ u32 mmap_offset;
+ struct qib_mregion __rcu *dma_mr;
+
+ /* QP numbers are shared by all IB ports */
+ struct qib_qpn_table qpn_table;
+ struct qib_lkey_table lk_table;
+ struct list_head piowait; /* list for wait PIO buf */
+ struct list_head dmawait; /* list for wait DMA */
+ struct list_head txwait; /* list for wait qib_verbs_txreq */
+ struct list_head memwait; /* list for wait kernel memory */
+ struct list_head txreq_free;
+ struct timer_list mem_timer;
+ struct qib_qp __rcu **qp_table;
+ struct qib_pio_header *pio_hdrs;
+ dma_addr_t pio_hdrs_phys;
+ /* list of QPs waiting for RNR timer */
+ spinlock_t pending_lock; /* protect wait lists, PMA counters, etc. */
+ u32 qp_table_size; /* size of the hash table */
+ u32 qp_rnd; /* random bytes for hash */
+ spinlock_t qpt_lock;
+
+ u32 n_piowait;
+ u32 n_txwait;
+
+ u32 n_pds_allocated; /* number of PDs allocated for device */
+ spinlock_t n_pds_lock;
+ u32 n_ahs_allocated; /* number of AHs allocated for device */
+ spinlock_t n_ahs_lock;
+ u32 n_cqs_allocated; /* number of CQs allocated for device */
+ spinlock_t n_cqs_lock;
+ u32 n_qps_allocated; /* number of QPs allocated for device */
+ spinlock_t n_qps_lock;
+ u32 n_srqs_allocated; /* number of SRQs allocated for device */
+ spinlock_t n_srqs_lock;
+ u32 n_mcast_grps_allocated; /* number of mcast groups allocated */
+ spinlock_t n_mcast_grps_lock;
+#ifdef CONFIG_DEBUG_FS
+ /* per HCA debugfs */
+ struct dentry *qib_ibdev_dbg;
+#endif
+};
+
+struct qib_verbs_counters {
+ u64 symbol_error_counter;
+ u64 link_error_recovery_counter;
+ u64 link_downed_counter;
+ u64 port_rcv_errors;
+ u64 port_rcv_remphys_errors;
+ u64 port_xmit_discards;
+ u64 port_xmit_data;
+ u64 port_rcv_data;
+ u64 port_xmit_packets;
+ u64 port_rcv_packets;
+ u32 local_link_integrity_errors;
+ u32 excessive_buffer_overrun_errors;
+ u32 vl15_dropped;
+};
+
+static inline struct qib_mr *to_imr(struct ib_mr *ibmr)
+{
+ return container_of(ibmr, struct qib_mr, ibmr);
+}
+
+static inline struct qib_pd *to_ipd(struct ib_pd *ibpd)
+{
+ return container_of(ibpd, struct qib_pd, ibpd);
+}
+
+static inline struct qib_ah *to_iah(struct ib_ah *ibah)
+{
+ return container_of(ibah, struct qib_ah, ibah);
+}
+
+static inline struct qib_cq *to_icq(struct ib_cq *ibcq)
+{
+ return container_of(ibcq, struct qib_cq, ibcq);
+}
+
+static inline struct qib_srq *to_isrq(struct ib_srq *ibsrq)
+{
+ return container_of(ibsrq, struct qib_srq, ibsrq);
+}
+
+static inline struct qib_qp *to_iqp(struct ib_qp *ibqp)
+{
+ return container_of(ibqp, struct qib_qp, ibqp);
+}
+
+static inline struct qib_ibdev *to_idev(struct ib_device *ibdev)
+{
+ return container_of(ibdev, struct qib_ibdev, ibdev);
+}
+
+/*
+ * Return true if it is OK to send: the QP is not busy or waiting for
+ * I/O, and either an RC response is pending or send work requests can
+ * be processed.
+ */
+static inline int qib_send_ok(struct qib_qp *qp)
+{
+ return !(qp->s_flags & (QIB_S_BUSY | QIB_S_ANY_WAIT_IO)) &&
+ (qp->s_hdrwords || (qp->s_flags & QIB_S_RESP_PENDING) ||
+ !(qp->s_flags & QIB_S_ANY_WAIT_SEND));
+}
+
+/*
+ * This must be called with s_lock held.
+ */
+void qib_schedule_send(struct qib_qp *qp);
+
+static inline int qib_pkey_ok(u16 pkey1, u16 pkey2)
+{
+ u16 p1 = pkey1 & 0x7FFF;
+ u16 p2 = pkey2 & 0x7FFF;
+
+ /*
+ * Low 15 bits must be non-zero and match, and
+ * one of the two must be a full member.
+ */
+ return p1 && p1 == p2 && ((__s16)pkey1 < 0 || (__s16)pkey2 < 0);
+}
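+
+/*
+ * For example, PKeys 0x8001 and 0x0001 are accepted by the check above
+ * (the low 15 bits match and 0x8001 is a full member), whereas 0x0001
+ * and 0x0001 are rejected because both are limited members.
+ */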
+
+void qib_bad_pqkey(struct qib_ibport *ibp, __be16 trap_num, u32 key, u32 sl,
+ u32 qp1, u32 qp2, __be16 lid1, __be16 lid2);
+void qib_cap_mask_chg(struct qib_ibport *ibp);
+void qib_sys_guid_chg(struct qib_ibport *ibp);
+void qib_node_desc_chg(struct qib_ibport *ibp);
+int qib_process_mad(struct ib_device *ibdev, int mad_flags, u8 port_num,
+ struct ib_wc *in_wc, struct ib_grh *in_grh,
+ struct ib_mad *in_mad, struct ib_mad *out_mad);
+int qib_create_agents(struct qib_ibdev *dev);
+void qib_free_agents(struct qib_ibdev *dev);
+
+/*
+ * Compare the lower 24 bits of the two values.
+ * Returns an integer less than, equal to, or greater than zero.
+ */
+static inline int qib_cmp24(u32 a, u32 b)
+{
+ return (((int) a) - ((int) b)) << 8;
+}
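+
+/*
+ * Example of the wraparound behaviour: with a = 1 and b = 0xFFFFFF,
+ * (int)(a - b) is 0xFF000002 on a two's-complement machine, and the
+ * shift by 8 leaves 0x00000200 (> 0), so PSN 1 correctly compares as
+ * being after PSN 0xFFFFFF across the 24-bit wrap.
+ */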
+
+struct qib_mcast *qib_mcast_find(struct qib_ibport *ibp, union ib_gid *mgid);
+
+int qib_snapshot_counters(struct qib_pportdata *ppd, u64 *swords,
+ u64 *rwords, u64 *spkts, u64 *rpkts,
+ u64 *xmit_wait);
+
+int qib_get_counters(struct qib_pportdata *ppd,
+ struct qib_verbs_counters *cntrs);
+
+int qib_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid);
+
+int qib_mcast_tree_empty(struct qib_ibport *ibp);
+
+__be32 qib_compute_aeth(struct qib_qp *qp);
+
+struct qib_qp *qib_lookup_qpn(struct qib_ibport *ibp, u32 qpn);
+
+struct ib_qp *qib_create_qp(struct ib_pd *ibpd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata);
+
+int qib_destroy_qp(struct ib_qp *ibqp);
+
+int qib_error_qp(struct qib_qp *qp, enum ib_wc_status err);
+
+int qib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata);
+
+int qib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_qp_init_attr *init_attr);
+
+unsigned qib_free_all_qps(struct qib_devdata *dd);
+
+void qib_init_qpn_table(struct qib_devdata *dd, struct qib_qpn_table *qpt);
+
+void qib_free_qpn_table(struct qib_qpn_table *qpt);
+
+#ifdef CONFIG_DEBUG_FS
+
+struct qib_qp_iter;
+
+struct qib_qp_iter *qib_qp_iter_init(struct qib_ibdev *dev);
+
+int qib_qp_iter_next(struct qib_qp_iter *iter);
+
+void qib_qp_iter_print(struct seq_file *s, struct qib_qp_iter *iter);
+
+#endif
+
+void qib_get_credit(struct qib_qp *qp, u32 aeth);
+
+unsigned qib_pkt_delay(u32 plen, u8 snd_mult, u8 rcv_mult);
+
+void qib_verbs_sdma_desc_avail(struct qib_pportdata *ppd, unsigned avail);
+
+void qib_put_txreq(struct qib_verbs_txreq *tx);
+
+int qib_verbs_send(struct qib_qp *qp, struct qib_ib_header *hdr,
+ u32 hdrwords, struct qib_sge_state *ss, u32 len);
+
+void qib_copy_sge(struct qib_sge_state *ss, void *data, u32 length,
+ int release);
+
+void qib_skip_sge(struct qib_sge_state *ss, u32 length, int release);
+
+void qib_uc_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr,
+ int has_grh, void *data, u32 tlen, struct qib_qp *qp);
+
+void qib_rc_rcv(struct qib_ctxtdata *rcd, struct qib_ib_header *hdr,
+ int has_grh, void *data, u32 tlen, struct qib_qp *qp);
+
+int qib_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr);
+
+struct ib_ah *qib_create_qp0_ah(struct qib_ibport *ibp, u16 dlid);
+
+void qib_rc_rnr_retry(unsigned long arg);
+
+void qib_rc_send_complete(struct qib_qp *qp, struct qib_ib_header *hdr);
+
+void qib_rc_error(struct qib_qp *qp, enum ib_wc_status err);
+
+int qib_post_ud_send(struct qib_qp *qp, struct ib_send_wr *wr);
+
+void qib_ud_rcv(struct qib_ibport *ibp, struct qib_ib_header *hdr,
+ int has_grh, void *data, u32 tlen, struct qib_qp *qp);
+
+int qib_alloc_lkey(struct qib_mregion *mr, int dma_region);
+
+void qib_free_lkey(struct qib_mregion *mr);
+
+int qib_lkey_ok(struct qib_lkey_table *rkt, struct qib_pd *pd,
+ struct qib_sge *isge, struct ib_sge *sge, int acc);
+
+int qib_rkey_ok(struct qib_qp *qp, struct qib_sge *sge,
+ u32 len, u64 vaddr, u32 rkey, int acc);
+
+int qib_post_srq_receive(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr);
+
+struct ib_srq *qib_create_srq(struct ib_pd *ibpd,
+ struct ib_srq_init_attr *srq_init_attr,
+ struct ib_udata *udata);
+
+int qib_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
+ enum ib_srq_attr_mask attr_mask,
+ struct ib_udata *udata);
+
+int qib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr);
+
+int qib_destroy_srq(struct ib_srq *ibsrq);
+
+int qib_cq_init(struct qib_devdata *dd);
+
+void qib_cq_exit(struct qib_devdata *dd);
+
+void qib_cq_enter(struct qib_cq *cq, struct ib_wc *entry, int sig);
+
+int qib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
+
+struct ib_cq *qib_create_cq(struct ib_device *ibdev, int entries,
+ int comp_vector, struct ib_ucontext *context,
+ struct ib_udata *udata);
+
+int qib_destroy_cq(struct ib_cq *ibcq);
+
+int qib_req_notify_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags notify_flags);
+
+int qib_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata);
+
+struct ib_mr *qib_get_dma_mr(struct ib_pd *pd, int acc);
+
+struct ib_mr *qib_reg_phys_mr(struct ib_pd *pd,
+ struct ib_phys_buf *buffer_list,
+ int num_phys_buf, int acc, u64 *iova_start);
+
+struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt_addr, int mr_access_flags,
+ struct ib_udata *udata);
+
+int qib_dereg_mr(struct ib_mr *ibmr);
+
+struct ib_mr *qib_alloc_fast_reg_mr(struct ib_pd *pd, int max_page_list_len);
+
+struct ib_fast_reg_page_list *qib_alloc_fast_reg_page_list(
+ struct ib_device *ibdev, int page_list_len);
+
+void qib_free_fast_reg_page_list(struct ib_fast_reg_page_list *pl);
+
+int qib_fast_reg_mr(struct qib_qp *qp, struct ib_send_wr *wr);
+
+struct ib_fmr *qib_alloc_fmr(struct ib_pd *pd, int mr_access_flags,
+ struct ib_fmr_attr *fmr_attr);
+
+int qib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+ int list_len, u64 iova);
+
+int qib_unmap_fmr(struct list_head *fmr_list);
+
+int qib_dealloc_fmr(struct ib_fmr *ibfmr);
+
+static inline void qib_get_mr(struct qib_mregion *mr)
+{
+ atomic_inc(&mr->refcount);
+}
+
+void mr_rcu_callback(struct rcu_head *list);
+
+static inline void qib_put_mr(struct qib_mregion *mr)
+{
+ if (unlikely(atomic_dec_and_test(&mr->refcount)))
+ call_rcu(&mr->list, mr_rcu_callback);
+}
+
+static inline void qib_put_ss(struct qib_sge_state *ss)
+{
+ while (ss->num_sge) {
+ qib_put_mr(ss->sge.mr);
+ if (--ss->num_sge)
+ ss->sge = *ss->sg_list++;
+ }
+}
+
+
+void qib_release_mmap_info(struct kref *ref);
+
+struct qib_mmap_info *qib_create_mmap_info(struct qib_ibdev *dev, u32 size,
+ struct ib_ucontext *context,
+ void *obj);
+
+void qib_update_mmap_info(struct qib_ibdev *dev, struct qib_mmap_info *ip,
+ u32 size, void *obj);
+
+int qib_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
+
+int qib_get_rwqe(struct qib_qp *qp, int wr_id_only);
+
+void qib_migrate_qp(struct qib_qp *qp);
+
+int qib_ruc_check_hdr(struct qib_ibport *ibp, struct qib_ib_header *hdr,
+ int has_grh, struct qib_qp *qp, u32 bth0);
+
+u32 qib_make_grh(struct qib_ibport *ibp, struct ib_grh *hdr,
+ struct ib_global_route *grh, u32 hwords, u32 nwords);
+
+void qib_make_ruc_header(struct qib_qp *qp, struct qib_other_headers *ohdr,
+ u32 bth0, u32 bth2);
+
+void qib_do_send(struct work_struct *work);
+
+void qib_send_complete(struct qib_qp *qp, struct qib_swqe *wqe,
+ enum ib_wc_status status);
+
+void qib_send_rc_ack(struct qib_qp *qp);
+
+int qib_make_rc_req(struct qib_qp *qp);
+
+int qib_make_uc_req(struct qib_qp *qp);
+
+int qib_make_ud_req(struct qib_qp *qp);
+
+int qib_register_ib_device(struct qib_devdata *);
+
+void qib_unregister_ib_device(struct qib_devdata *);
+
+void qib_ib_rcv(struct qib_ctxtdata *, void *, void *, u32);
+
+void qib_ib_piobufavail(struct qib_devdata *);
+
+unsigned qib_get_npkeys(struct qib_devdata *);
+
+unsigned qib_get_pkey(struct qib_ibport *, unsigned);
+
+extern const enum ib_wc_opcode ib_qib_wc_opcode[];
+
+/*
+ * Below are the HCA-independent IB PhysPortState values, as returned
+ * by the f_ibphys_portstate() routine.
+ */
+#define IB_PHYSPORTSTATE_SLEEP 1
+#define IB_PHYSPORTSTATE_POLL 2
+#define IB_PHYSPORTSTATE_DISABLED 3
+#define IB_PHYSPORTSTATE_CFG_TRAIN 4
+#define IB_PHYSPORTSTATE_LINKUP 5
+#define IB_PHYSPORTSTATE_LINK_ERR_RECOVER 6
+#define IB_PHYSPORTSTATE_CFG_DEBOUNCE 8
+#define IB_PHYSPORTSTATE_CFG_IDLE 0xB
+#define IB_PHYSPORTSTATE_RECOVERY_RETRAIN 0xC
+#define IB_PHYSPORTSTATE_RECOVERY_WAITRMT 0xE
+#define IB_PHYSPORTSTATE_RECOVERY_IDLE 0xF
+#define IB_PHYSPORTSTATE_CFG_ENH 0x10
+#define IB_PHYSPORTSTATE_CFG_WAIT_ENH 0x13
+
+extern const int ib_qib_state_ops[];
+
+extern __be64 ib_qib_sys_image_guid; /* in network order */
+
+extern unsigned int ib_qib_lkey_table_size;
+
+extern unsigned int ib_qib_max_cqes;
+
+extern unsigned int ib_qib_max_cqs;
+
+extern unsigned int ib_qib_max_qp_wrs;
+
+extern unsigned int ib_qib_max_qps;
+
+extern unsigned int ib_qib_max_sges;
+
+extern unsigned int ib_qib_max_mcast_grps;
+
+extern unsigned int ib_qib_max_mcast_qp_attached;
+
+extern unsigned int ib_qib_max_srqs;
+
+extern unsigned int ib_qib_max_srq_sges;
+
+extern unsigned int ib_qib_max_srq_wrs;
+
+extern const u32 ib_qib_rnr_table[];
+
+extern struct ib_dma_mapping_ops qib_dma_mapping_ops;
+
+#endif /* QIB_VERBS_H */
diff --git a/drivers/infiniband/hw/qib/qib_verbs_mcast.c b/drivers/infiniband/hw/qib/qib_verbs_mcast.c
new file mode 100644
index 000000000..f8ea069a3
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_verbs_mcast.c
@@ -0,0 +1,368 @@
+/*
+ * Copyright (c) 2006, 2007, 2008, 2009 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/rculist.h>
+
+#include "qib.h"
+
+/**
+ * qib_mcast_qp_alloc - alloc a struct to link a QP to mcast GID struct
+ * @qp: the QP to link
+ */
+static struct qib_mcast_qp *qib_mcast_qp_alloc(struct qib_qp *qp)
+{
+ struct qib_mcast_qp *mqp;
+
+ mqp = kmalloc(sizeof(*mqp), GFP_KERNEL);
+ if (!mqp)
+ goto bail;
+
+ mqp->qp = qp;
+ atomic_inc(&qp->refcount);
+
+bail:
+ return mqp;
+}
+
+static void qib_mcast_qp_free(struct qib_mcast_qp *mqp)
+{
+ struct qib_qp *qp = mqp->qp;
+
+ /* Notify qib_destroy_qp() if it is waiting. */
+ if (atomic_dec_and_test(&qp->refcount))
+ wake_up(&qp->wait);
+
+ kfree(mqp);
+}
+
+/**
+ * qib_mcast_alloc - allocate the multicast GID structure
+ * @mgid: the multicast GID
+ *
+ * A list of QPs will be attached to this structure.
+ */
+static struct qib_mcast *qib_mcast_alloc(union ib_gid *mgid)
+{
+ struct qib_mcast *mcast;
+
+ mcast = kmalloc(sizeof(*mcast), GFP_KERNEL);
+ if (!mcast)
+ goto bail;
+
+ mcast->mgid = *mgid;
+ INIT_LIST_HEAD(&mcast->qp_list);
+ init_waitqueue_head(&mcast->wait);
+ atomic_set(&mcast->refcount, 0);
+ mcast->n_attached = 0;
+
+bail:
+ return mcast;
+}
+
+static void qib_mcast_free(struct qib_mcast *mcast)
+{
+ struct qib_mcast_qp *p, *tmp;
+
+ list_for_each_entry_safe(p, tmp, &mcast->qp_list, list)
+ qib_mcast_qp_free(p);
+
+ kfree(mcast);
+}
+
+/**
+ * qib_mcast_find - search the global table for the given multicast GID
+ * @ibp: the IB port structure
+ * @mgid: the multicast GID to search for
+ *
+ * Returns NULL if not found.
+ *
+ * The caller is responsible for decrementing the reference count if found.
+ */
+struct qib_mcast *qib_mcast_find(struct qib_ibport *ibp, union ib_gid *mgid)
+{
+ struct rb_node *n;
+ unsigned long flags;
+ struct qib_mcast *mcast;
+
+ spin_lock_irqsave(&ibp->lock, flags);
+ n = ibp->mcast_tree.rb_node;
+ while (n) {
+ int ret;
+
+ mcast = rb_entry(n, struct qib_mcast, rb_node);
+
+ ret = memcmp(mgid->raw, mcast->mgid.raw,
+ sizeof(union ib_gid));
+ if (ret < 0)
+ n = n->rb_left;
+ else if (ret > 0)
+ n = n->rb_right;
+ else {
+ atomic_inc(&mcast->refcount);
+ spin_unlock_irqrestore(&ibp->lock, flags);
+ goto bail;
+ }
+ }
+ spin_unlock_irqrestore(&ibp->lock, flags);
+
+ mcast = NULL;
+
+bail:
+ return mcast;
+}
+
+/**
+ * qib_mcast_add - insert mcast GID into table and attach QP struct
+ * @mcast: the mcast GID table
+ * @mqp: the QP to attach
+ *
+ * Return zero if both were added. Return EEXIST if the GID was already in
+ * the table but the QP was added. Return ESRCH if the QP was already
+ * attached and neither structure was added. Return ENOMEM if either the
+ * per-group attach limit or the device multicast group limit was reached.
+ */
+static int qib_mcast_add(struct qib_ibdev *dev, struct qib_ibport *ibp,
+ struct qib_mcast *mcast, struct qib_mcast_qp *mqp)
+{
+ struct rb_node **n = &ibp->mcast_tree.rb_node;
+ struct rb_node *pn = NULL;
+ int ret;
+
+ spin_lock_irq(&ibp->lock);
+
+ while (*n) {
+ struct qib_mcast *tmcast;
+ struct qib_mcast_qp *p;
+
+ pn = *n;
+ tmcast = rb_entry(pn, struct qib_mcast, rb_node);
+
+ ret = memcmp(mcast->mgid.raw, tmcast->mgid.raw,
+ sizeof(union ib_gid));
+ if (ret < 0) {
+ n = &pn->rb_left;
+ continue;
+ }
+ if (ret > 0) {
+ n = &pn->rb_right;
+ continue;
+ }
+
+ /* Search the QP list to see if this is already there. */
+ list_for_each_entry_rcu(p, &tmcast->qp_list, list) {
+ if (p->qp == mqp->qp) {
+ ret = ESRCH;
+ goto bail;
+ }
+ }
+ if (tmcast->n_attached == ib_qib_max_mcast_qp_attached) {
+ ret = ENOMEM;
+ goto bail;
+ }
+
+ tmcast->n_attached++;
+
+ list_add_tail_rcu(&mqp->list, &tmcast->qp_list);
+ ret = EEXIST;
+ goto bail;
+ }
+
+ spin_lock(&dev->n_mcast_grps_lock);
+ if (dev->n_mcast_grps_allocated == ib_qib_max_mcast_grps) {
+ spin_unlock(&dev->n_mcast_grps_lock);
+ ret = ENOMEM;
+ goto bail;
+ }
+
+ dev->n_mcast_grps_allocated++;
+ spin_unlock(&dev->n_mcast_grps_lock);
+
+ mcast->n_attached++;
+
+ list_add_tail_rcu(&mqp->list, &mcast->qp_list);
+
+ atomic_inc(&mcast->refcount);
+ rb_link_node(&mcast->rb_node, pn, n);
+ rb_insert_color(&mcast->rb_node, &ibp->mcast_tree);
+
+ ret = 0;
+
+bail:
+ spin_unlock_irq(&ibp->lock);
+
+ return ret;
+}
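+
+/*
+ * Note that the positive EEXIST/ESRCH/ENOMEM values returned above are
+ * internal status codes consumed by qib_multicast_attach() below; only
+ * the ENOMEM case is propagated to the caller, as -ENOMEM.
+ */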
+
+int qib_multicast_attach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ struct qib_qp *qp = to_iqp(ibqp);
+ struct qib_ibdev *dev = to_idev(ibqp->device);
+ struct qib_ibport *ibp;
+ struct qib_mcast *mcast;
+ struct qib_mcast_qp *mqp;
+ int ret;
+
+ if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ /*
+ * Allocate the data structures now, since it is better to do this
+ * outside of the spin locks and they will most likely be needed.
+ */
+ mcast = qib_mcast_alloc(gid);
+ if (mcast == NULL) {
+ ret = -ENOMEM;
+ goto bail;
+ }
+ mqp = qib_mcast_qp_alloc(qp);
+ if (mqp == NULL) {
+ qib_mcast_free(mcast);
+ ret = -ENOMEM;
+ goto bail;
+ }
+ ibp = to_iport(ibqp->device, qp->port_num);
+ switch (qib_mcast_add(dev, ibp, mcast, mqp)) {
+ case ESRCH:
+ /* Neither was used: OK to attach the same QP twice. */
+ qib_mcast_qp_free(mqp);
+ qib_mcast_free(mcast);
+ break;
+
+ case EEXIST: /* The mcast wasn't used */
+ qib_mcast_free(mcast);
+ break;
+
+ case ENOMEM:
+ /* Exceeded the maximum number of mcast groups. */
+ qib_mcast_qp_free(mqp);
+ qib_mcast_free(mcast);
+ ret = -ENOMEM;
+ goto bail;
+
+ default:
+ break;
+ }
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
+{
+ struct qib_qp *qp = to_iqp(ibqp);
+ struct qib_ibdev *dev = to_idev(ibqp->device);
+ struct qib_ibport *ibp = to_iport(ibqp->device, qp->port_num);
+ struct qib_mcast *mcast = NULL;
+ struct qib_mcast_qp *p, *tmp;
+ struct rb_node *n;
+ int last = 0;
+ int ret;
+
+ if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) {
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ spin_lock_irq(&ibp->lock);
+
+ /* Find the GID in the mcast table. */
+ n = ibp->mcast_tree.rb_node;
+ while (1) {
+ if (n == NULL) {
+ spin_unlock_irq(&ibp->lock);
+ ret = -EINVAL;
+ goto bail;
+ }
+
+ mcast = rb_entry(n, struct qib_mcast, rb_node);
+ ret = memcmp(gid->raw, mcast->mgid.raw,
+ sizeof(union ib_gid));
+ if (ret < 0)
+ n = n->rb_left;
+ else if (ret > 0)
+ n = n->rb_right;
+ else
+ break;
+ }
+
+ /* Search the QP list. */
+ list_for_each_entry_safe(p, tmp, &mcast->qp_list, list) {
+ if (p->qp != qp)
+ continue;
+ /*
+ * We found it, so remove it, but don't poison the forward
+ * link until we are sure there are no list walkers.
+ */
+ list_del_rcu(&p->list);
+ mcast->n_attached--;
+
+ /* If this was the last attached QP, remove the GID too. */
+ if (list_empty(&mcast->qp_list)) {
+ rb_erase(&mcast->rb_node, &ibp->mcast_tree);
+ last = 1;
+ }
+ break;
+ }
+
+ spin_unlock_irq(&ibp->lock);
+
+ if (p) {
+ /*
+ * Wait for any list walkers to finish before freeing the
+ * list element.
+ */
+ wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1);
+ qib_mcast_qp_free(p);
+ }
+ if (last) {
+ atomic_dec(&mcast->refcount);
+ wait_event(mcast->wait, !atomic_read(&mcast->refcount));
+ qib_mcast_free(mcast);
+ spin_lock_irq(&dev->n_mcast_grps_lock);
+ dev->n_mcast_grps_allocated--;
+ spin_unlock_irq(&dev->n_mcast_grps_lock);
+ }
+
+ ret = 0;
+
+bail:
+ return ret;
+}
+
+int qib_mcast_tree_empty(struct qib_ibport *ibp)
+{
+ return ibp->mcast_tree.rb_node == NULL;
+}
diff --git a/drivers/infiniband/hw/qib/qib_wc_ppc64.c b/drivers/infiniband/hw/qib/qib_wc_ppc64.c
new file mode 100644
index 000000000..673cf4c22
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_wc_ppc64.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2006, 2007, 2008 QLogic Corporation. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file is conditionally built on PowerPC only. Otherwise weak symbol
+ * versions of the functions exported from here are used.
+ */
+
+#include "qib.h"
+
+/**
+ * qib_enable_wc - enable write combining for MMIO writes to the device
+ * @dd: qlogic_ib device
+ *
+ * Nothing to do on PowerPC, so just return without error.
+ */
+int qib_enable_wc(struct qib_devdata *dd)
+{
+ return 0;
+}
+
+/**
+ * qib_unordered_wc - indicate whether write combining is unordered
+ *
+ * Because our performance depends on our ability to do write
+ * combining mmio writes in the most efficient way, we need to
+ * know if we are on a processor that may reorder stores when
+ * write combining.
+ */
+int qib_unordered_wc(void)
+{
+ return 1;
+}
diff --git a/drivers/infiniband/hw/qib/qib_wc_x86_64.c b/drivers/infiniband/hw/qib/qib_wc_x86_64.c
new file mode 100644
index 000000000..edd0ddbd4
--- /dev/null
+++ b/drivers/infiniband/hw/qib/qib_wc_x86_64.c
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2012 Intel Corporation. All rights reserved.
+ * Copyright (c) 2006 - 2012 QLogic Corporation. All rights reserved.
+ * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/*
+ * This file is conditionally built on x86_64 only. Otherwise weak symbol
+ * versions of the functions exported from here are used.
+ */
+
+#include <linux/pci.h>
+#include <asm/mtrr.h>
+#include <asm/processor.h>
+
+#include "qib.h"
+
+/**
+ * qib_enable_wc - enable write combining for MMIO writes to the device
+ * @dd: qlogic_ib device
+ *
+ * This routine is x86_64-specific; it requests write combining for the
+ * device's PIO buffer region via arch_phys_wc_add().
+ */
+int qib_enable_wc(struct qib_devdata *dd)
+{
+ int ret = 0;
+ u64 pioaddr, piolen;
+ unsigned bits;
+ const unsigned long addr = pci_resource_start(dd->pcidev, 0);
+ const size_t len = pci_resource_len(dd->pcidev, 0);
+
+ /*
+ * Set the PIO buffers to be WCCOMB, so we get HT bursts to the
+ * chip. Linux (possibly the hardware) requires it to be on a power
+ * of 2 address matching the length (which has to be a power of 2).
+ * For rev1, that means the base address; for rev2, it will be just
+ * the PIO buffers themselves.
+ * For chips with two sets of buffers, the calculations are
+ * somewhat more complicated; we need to sum, and the piobufbase
+ * register has both offsets, 2K in low 32 bits, 4K in high 32 bits.
+ * The buffers are still packed, so a single range covers both.
+ */
+ if (dd->piobcnt2k && dd->piobcnt4k) {
+ /* 2 sizes for chip */
+ unsigned long pio2kbase, pio4kbase;
+
+ pio2kbase = dd->piobufbase & 0xffffffffUL;
+ pio4kbase = (dd->piobufbase >> 32) & 0xffffffffUL;
+ if (pio2kbase < pio4kbase) {
+ /* all current chips */
+ pioaddr = addr + pio2kbase;
+ piolen = pio4kbase - pio2kbase +
+ dd->piobcnt4k * dd->align4k;
+ } else {
+ pioaddr = addr + pio4kbase;
+ piolen = pio2kbase - pio4kbase +
+ dd->piobcnt2k * dd->palign;
+ }
+ } else { /* single buffer size (2K, currently) */
+ pioaddr = addr + dd->piobufbase;
+ piolen = dd->piobcnt2k * dd->palign +
+ dd->piobcnt4k * dd->align4k;
+ }
+
+ for (bits = 0; !(piolen & (1ULL << bits)); bits++)
+ ; /* do nothing */
+
+ if (piolen != (1ULL << bits)) {
+ piolen >>= bits;
+ while (piolen >>= 1)
+ bits++;
+ piolen = 1ULL << (bits + 1);
+ }
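+ /*
+ * For example, a 24 KiB span (0x6000) has its lowest set bit at
+ * bit 13; since 0x6000 != 0x2000, the block above walks the
+ * remaining bits and rounds the length up to 1ULL << 15 (32 KiB),
+ * the next power of two large enough to cover it.
+ */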
+ if (pioaddr & (piolen - 1)) {
+ u64 atmp = pioaddr & ~(piolen - 1);
+
+ if (atmp < addr || (atmp + piolen) > (addr + len)) {
+ qib_dev_err(dd,
+ "No way to align address/size (%llx/%llx), no WC mtrr\n",
+ (unsigned long long) atmp,
+ (unsigned long long) piolen << 1);
+ ret = -ENODEV;
+ } else {
+ pioaddr = atmp;
+ piolen <<= 1;
+ }
+ }
+
+ if (!ret) {
+ dd->wc_cookie = arch_phys_wc_add(pioaddr, piolen);
+ if (dd->wc_cookie < 0)
+ /* use error from routine */
+ ret = dd->wc_cookie;
+ }
+
+ return ret;
+}
+
+/**
+ * qib_disable_wc - disable write combining for MMIO writes to the device
+ * @dd: qlogic_ib device
+ */
+void qib_disable_wc(struct qib_devdata *dd)
+{
+ arch_phys_wc_del(dd->wc_cookie);
+}
+
+/**
+ * qib_unordered_wc - indicate whether write combining is unordered
+ *
+ * Because our performance depends on our ability to do write combining mmio
+ * writes in the most efficient way, we need to know if we are on an Intel
+ * or AMD x86_64 processor. AMD x86_64 processors flush WC buffers out in
+ * the order completed, and so no special flushing is required to get
+ * correct ordering. Intel processors, however, will flush write buffers
+ * out in "random" orders, and so explicit ordering is needed at times.
+ */
+int qib_unordered_wc(void)
+{
+ return boot_cpu_data.x86_vendor != X86_VENDOR_AMD;
+}
diff --git a/drivers/infiniband/hw/usnic/Kconfig b/drivers/infiniband/hw/usnic/Kconfig
new file mode 100644
index 000000000..29ab11c34
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/Kconfig
@@ -0,0 +1,10 @@
+config INFINIBAND_USNIC
+ tristate "Verbs support for Cisco VIC"
+ depends on NETDEVICES && ETHERNET && INET && PCI && INTEL_IOMMU
+ select ENIC
+ select NET_VENDOR_CISCO
+ select PCI_IOV
+ select INFINIBAND_USER_ACCESS
+ ---help---
+ This is a low-level driver for Cisco's Virtual Interface
+ Cards (VICs), including the VIC 1240 and 1280 cards.
diff --git a/drivers/infiniband/hw/usnic/Makefile b/drivers/infiniband/hw/usnic/Makefile
new file mode 100644
index 000000000..99fb2db47
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/Makefile
@@ -0,0 +1,15 @@
+ccflags-y := -Idrivers/net/ethernet/cisco/enic
+
+obj-$(CONFIG_INFINIBAND_USNIC) += usnic_verbs.o
+
+usnic_verbs-y=\
+usnic_fwd.o \
+usnic_transport.o \
+usnic_uiom.o \
+usnic_uiom_interval_tree.o \
+usnic_vnic.o \
+usnic_ib_main.o \
+usnic_ib_qp_grp.o \
+usnic_ib_sysfs.o \
+usnic_ib_verbs.o \
+usnic_debugfs.o \
diff --git a/drivers/infiniband/hw/usnic/usnic.h b/drivers/infiniband/hw/usnic/usnic.h
new file mode 100644
index 000000000..5be13d899
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef USNIC_H_
+#define USNIC_H_
+
+#define DRV_NAME "usnic_verbs"
+
+#define PCI_DEVICE_ID_CISCO_VIC_USPACE_NIC 0x00cf /* User space NIC */
+
+#define DRV_VERSION "1.0.3"
+#define DRV_RELDATE "December 19, 2013"
+
+#endif /* USNIC_H_ */
diff --git a/drivers/infiniband/hw/usnic/usnic_abi.h b/drivers/infiniband/hw/usnic/usnic_abi.h
new file mode 100644
index 000000000..04a662295
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_abi.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+
+#ifndef USNIC_ABI_H
+#define USNIC_ABI_H
+
+/* ABI between userspace and kernel */
+#define USNIC_UVERBS_ABI_VERSION 4
+
+#define USNIC_QP_GRP_MAX_WQS 8
+#define USNIC_QP_GRP_MAX_RQS 8
+#define USNIC_QP_GRP_MAX_CQS 16
+
+enum usnic_transport_type {
+ USNIC_TRANSPORT_UNKNOWN = 0,
+ USNIC_TRANSPORT_ROCE_CUSTOM = 1,
+ USNIC_TRANSPORT_IPV4_UDP = 2,
+ USNIC_TRANSPORT_MAX = 3,
+};
+
+struct usnic_transport_spec {
+ enum usnic_transport_type trans_type;
+ union {
+ struct {
+ uint16_t port_num;
+ } usnic_roce;
+ struct {
+ uint32_t sock_fd;
+ } udp;
+ };
+};
+
+struct usnic_ib_create_qp_cmd {
+ struct usnic_transport_spec spec;
+};
+
+/*TODO: Future - usnic_modify_qp needs to pass in generic filters */
+struct usnic_ib_create_qp_resp {
+ u32 vfid;
+ u32 qp_grp_id;
+ u64 bar_bus_addr;
+ u32 bar_len;
+/*
+ * WQ, RQ and CQ are explicitly specified because exposing a generic
+ * resources interface would expand the scope of the ABI to many files.
+ */
+ u32 wq_cnt;
+ u32 rq_cnt;
+ u32 cq_cnt;
+ u32 wq_idx[USNIC_QP_GRP_MAX_WQS];
+ u32 rq_idx[USNIC_QP_GRP_MAX_RQS];
+ u32 cq_idx[USNIC_QP_GRP_MAX_CQS];
+ u32 transport;
+ u32 reserved[9];
+};
+
+#endif /* USNIC_ABI_H */
diff --git a/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h b/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h
new file mode 100644
index 000000000..393567266
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_common_pkt_hdr.h
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef USNIC_CMN_PKT_HDR_H
+#define USNIC_CMN_PKT_HDR_H
+
+#define USNIC_ROCE_ETHERTYPE (0x8915)
+#define USNIC_ROCE_GRH_VER (8)
+#define USNIC_PROTO_VER (1)
+#define USNIC_ROCE_GRH_VER_SHIFT (4)
+
+#endif /* USNIC_CMN_PKT_HDR_H */
diff --git a/drivers/infiniband/hw/usnic/usnic_common_util.h b/drivers/infiniband/hw/usnic/usnic_common_util.h
new file mode 100644
index 000000000..9d737ed5e
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_common_util.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef USNIC_CMN_UTIL_H
+#define USNIC_CMN_UTIL_H
+
+static inline void
+usnic_mac_to_gid(const char *const mac, char *raw_gid)
+{
+ raw_gid[0] = 0xfe;
+ raw_gid[1] = 0x80;
+ memset(&raw_gid[2], 0, 6);
+ raw_gid[8] = mac[0]^2;
+ raw_gid[9] = mac[1];
+ raw_gid[10] = mac[2];
+ raw_gid[11] = 0xff;
+ raw_gid[12] = 0xfe;
+ raw_gid[13] = mac[3];
+ raw_gid[14] = mac[4];
+ raw_gid[15] = mac[5];
+}
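+
+/*
+ * The helpers in this file build a link-local GID from a MAC address
+ * using the usual modified EUI-64 construction: flip the
+ * universal/local bit of the first octet (mac[0] ^ 2) and splice
+ * 0xff, 0xfe between the upper and lower three octets of the MAC.
+ */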
+
+static inline void
+usnic_mac_ip_to_gid(const char *const mac, const __be32 inaddr, char *raw_gid)
+{
+ raw_gid[0] = 0xfe;
+ raw_gid[1] = 0x80;
+ memset(&raw_gid[2], 0, 2);
+ memcpy(&raw_gid[4], &inaddr, 4);
+ raw_gid[8] = mac[0]^2;
+ raw_gid[9] = mac[1];
+ raw_gid[10] = mac[2];
+ raw_gid[11] = 0xff;
+ raw_gid[12] = 0xfe;
+ raw_gid[13] = mac[3];
+ raw_gid[14] = mac[4];
+ raw_gid[15] = mac[5];
+}
+
+static inline void
+usnic_write_gid_if_id_from_mac(char *mac, char *raw_gid)
+{
+ raw_gid[8] = mac[0]^2;
+ raw_gid[9] = mac[1];
+ raw_gid[10] = mac[2];
+ raw_gid[11] = 0xff;
+ raw_gid[12] = 0xfe;
+ raw_gid[13] = mac[3];
+ raw_gid[14] = mac[4];
+ raw_gid[15] = mac[5];
+}
+
+#endif /* USNIC_CMN_UTIL_H */
diff --git a/drivers/infiniband/hw/usnic/usnic_debugfs.c b/drivers/infiniband/hw/usnic/usnic_debugfs.c
new file mode 100644
index 000000000..5d1386016
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_debugfs.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/debugfs.h>
+#include <linux/module.h>
+
+#include "usnic.h"
+#include "usnic_log.h"
+#include "usnic_debugfs.h"
+#include "usnic_ib_qp_grp.h"
+#include "usnic_transport.h"
+
+static struct dentry *debugfs_root;
+static struct dentry *flows_dentry;
+
+static ssize_t usnic_debugfs_buildinfo_read(struct file *f, char __user *data,
+ size_t count, loff_t *ppos)
+{
+ char buf[500];
+ int res;
+
+ if (*ppos > 0)
+ return 0;
+
+ res = scnprintf(buf, sizeof(buf),
+ "version: %s\n"
+ "build date: %s\n",
+ DRV_VERSION, DRV_RELDATE);
+
+ return simple_read_from_buffer(data, count, ppos, buf, res);
+}
+
+static const struct file_operations usnic_debugfs_buildinfo_ops = {
+ .owner = THIS_MODULE,
+ .open = simple_open,
+ .read = usnic_debugfs_buildinfo_read
+};
+
+static ssize_t flowinfo_read(struct file *f, char __user *data,
+ size_t count, loff_t *ppos)
+{
+ struct usnic_ib_qp_grp_flow *qp_flow;
+ int n;
+ int left;
+ char *ptr;
+ char buf[512];
+
+ qp_flow = f->private_data;
+ ptr = buf;
+ left = count;
+
+ if (*ppos > 0)
+ return 0;
+
+ spin_lock(&qp_flow->qp_grp->lock);
+ n = scnprintf(ptr, left,
+ "QP Grp ID: %d Transport: %s ",
+ qp_flow->qp_grp->grp_id,
+ usnic_transport_to_str(qp_flow->trans_type));
+ UPDATE_PTR_LEFT(n, ptr, left);
+ if (qp_flow->trans_type == USNIC_TRANSPORT_ROCE_CUSTOM) {
+ n = scnprintf(ptr, left, "Port_Num:%hu\n",
+ qp_flow->usnic_roce.port_num);
+ UPDATE_PTR_LEFT(n, ptr, left);
+ } else if (qp_flow->trans_type == USNIC_TRANSPORT_IPV4_UDP) {
+ n = usnic_transport_sock_to_str(ptr, left,
+ qp_flow->udp.sock);
+ UPDATE_PTR_LEFT(n, ptr, left);
+ n = scnprintf(ptr, left, "\n");
+ UPDATE_PTR_LEFT(n, ptr, left);
+ }
+ spin_unlock(&qp_flow->qp_grp->lock);
+
+ return simple_read_from_buffer(data, count, ppos, buf, ptr - buf);
+}
+
+static const struct file_operations flowinfo_ops = {
+ .owner = THIS_MODULE,
+ .open = simple_open,
+ .read = flowinfo_read,
+};
+
+void usnic_debugfs_init(void)
+{
+ debugfs_root = debugfs_create_dir(DRV_NAME, NULL);
+ if (IS_ERR(debugfs_root)) {
+ usnic_err("Failed to create debugfs root dir, check if debugfs is enabled in kernel configuration\n");
+ goto out_clear_root;
+ }
+
+ flows_dentry = debugfs_create_dir("flows", debugfs_root);
+ if (IS_ERR_OR_NULL(flows_dentry)) {
+ usnic_err("Failed to create debugfs flow dir with err %ld\n",
+ PTR_ERR(flows_dentry));
+ goto out_free_root;
+ }
+
+ debugfs_create_file("build-info", S_IRUGO, debugfs_root,
+ NULL, &usnic_debugfs_buildinfo_ops);
+ return;
+
+out_free_root:
+ debugfs_remove_recursive(debugfs_root);
+out_clear_root:
+ debugfs_root = NULL;
+}
+
+void usnic_debugfs_exit(void)
+{
+ if (!debugfs_root)
+ return;
+
+ debugfs_remove_recursive(debugfs_root);
+ debugfs_root = NULL;
+}
+
+void usnic_debugfs_flow_add(struct usnic_ib_qp_grp_flow *qp_flow)
+{
+ if (IS_ERR_OR_NULL(flows_dentry))
+ return;
+
+ scnprintf(qp_flow->dentry_name, sizeof(qp_flow->dentry_name),
+ "%u", qp_flow->flow->flow_id);
+ qp_flow->dbgfs_dentry = debugfs_create_file(qp_flow->dentry_name,
+ S_IRUGO,
+ flows_dentry,
+ qp_flow,
+ &flowinfo_ops);
+ if (IS_ERR_OR_NULL(qp_flow->dbgfs_dentry)) {
+ usnic_err("Failed to create dbg fs entry for flow %u\n",
+ qp_flow->flow->flow_id);
+ }
+}
+
+void usnic_debugfs_flow_remove(struct usnic_ib_qp_grp_flow *qp_flow)
+{
+ if (!IS_ERR_OR_NULL(qp_flow->dbgfs_dentry))
+ debugfs_remove(qp_flow->dbgfs_dentry);
+}
diff --git a/drivers/infiniband/hw/usnic/usnic_debugfs.h b/drivers/infiniband/hw/usnic/usnic_debugfs.h
new file mode 100644
index 000000000..4087d24a8
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_debugfs.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#ifndef USNIC_DEBUGFS_H_
+#define USNIC_DEBUGFS_H_
+
+#include "usnic_ib_qp_grp.h"
+
+void usnic_debugfs_init(void);
+
+void usnic_debugfs_exit(void);
+void usnic_debugfs_flow_add(struct usnic_ib_qp_grp_flow *qp_flow);
+void usnic_debugfs_flow_remove(struct usnic_ib_qp_grp_flow *qp_flow);
+
+#endif /* !USNIC_DEBUGFS_H_ */
diff --git a/drivers/infiniband/hw/usnic/usnic_fwd.c b/drivers/infiniband/hw/usnic/usnic_fwd.c
new file mode 100644
index 000000000..e3c9bd9d3
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_fwd.c
@@ -0,0 +1,350 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/netdevice.h>
+#include <linux/pci.h>
+
+#include "enic_api.h"
+#include "usnic_common_pkt_hdr.h"
+#include "usnic_fwd.h"
+#include "usnic_log.h"
+
+static int usnic_fwd_devcmd_locked(struct usnic_fwd_dev *ufdev, int vnic_idx,
+ enum vnic_devcmd_cmd cmd, u64 *a0,
+ u64 *a1)
+{
+ int status;
+ struct net_device *netdev = ufdev->netdev;
+
+ lockdep_assert_held(&ufdev->lock);
+
+ status = enic_api_devcmd_proxy_by_index(netdev,
+ vnic_idx,
+ cmd,
+ a0, a1,
+ 1000);
+ if (status) {
+ if (status == ERR_EINVAL && cmd == CMD_DEL_FILTER) {
+ usnic_dbg("Dev %s vnic idx %u cmd %u already deleted",
+ ufdev->name, vnic_idx, cmd);
+ } else {
+ usnic_err("Dev %s vnic idx %u cmd %u failed with status %d\n",
+ ufdev->name, vnic_idx, cmd,
+ status);
+ }
+ } else {
+ usnic_dbg("Dev %s vnic idx %u cmd %u success",
+ ufdev->name, vnic_idx, cmd);
+ }
+
+ return status;
+}
+
+static int usnic_fwd_devcmd(struct usnic_fwd_dev *ufdev, int vnic_idx,
+ enum vnic_devcmd_cmd cmd, u64 *a0, u64 *a1)
+{
+ int status;
+
+ spin_lock(&ufdev->lock);
+ status = usnic_fwd_devcmd_locked(ufdev, vnic_idx, cmd, a0, a1);
+ spin_unlock(&ufdev->lock);
+
+ return status;
+}
+
+struct usnic_fwd_dev *usnic_fwd_dev_alloc(struct pci_dev *pdev)
+{
+ struct usnic_fwd_dev *ufdev;
+
+ ufdev = kzalloc(sizeof(*ufdev), GFP_KERNEL);
+ if (!ufdev)
+ return NULL;
+
+ ufdev->pdev = pdev;
+ ufdev->netdev = pci_get_drvdata(pdev);
+ spin_lock_init(&ufdev->lock);
+ strncpy(ufdev->name, netdev_name(ufdev->netdev),
+ sizeof(ufdev->name) - 1);
+
+ return ufdev;
+}
+
+void usnic_fwd_dev_free(struct usnic_fwd_dev *ufdev)
+{
+ kfree(ufdev);
+}
+
+void usnic_fwd_set_mac(struct usnic_fwd_dev *ufdev, char mac[ETH_ALEN])
+{
+ spin_lock(&ufdev->lock);
+ memcpy(&ufdev->mac, mac, sizeof(ufdev->mac));
+ spin_unlock(&ufdev->lock);
+}
+
+int usnic_fwd_add_ipaddr(struct usnic_fwd_dev *ufdev, __be32 inaddr)
+{
+ int status;
+
+ spin_lock(&ufdev->lock);
+ if (ufdev->inaddr == 0) {
+ ufdev->inaddr = inaddr;
+ status = 0;
+ } else {
+ status = -EFAULT;
+ }
+ spin_unlock(&ufdev->lock);
+
+ return status;
+}
+
+void usnic_fwd_del_ipaddr(struct usnic_fwd_dev *ufdev)
+{
+ spin_lock(&ufdev->lock);
+ ufdev->inaddr = 0;
+ spin_unlock(&ufdev->lock);
+}
+
+void usnic_fwd_carrier_up(struct usnic_fwd_dev *ufdev)
+{
+ spin_lock(&ufdev->lock);
+ ufdev->link_up = 1;
+ spin_unlock(&ufdev->lock);
+}
+
+void usnic_fwd_carrier_down(struct usnic_fwd_dev *ufdev)
+{
+ spin_lock(&ufdev->lock);
+ ufdev->link_up = 0;
+ spin_unlock(&ufdev->lock);
+}
+
+void usnic_fwd_set_mtu(struct usnic_fwd_dev *ufdev, unsigned int mtu)
+{
+ spin_lock(&ufdev->lock);
+ ufdev->mtu = mtu;
+ spin_unlock(&ufdev->lock);
+}
+
+static int usnic_fwd_dev_ready_locked(struct usnic_fwd_dev *ufdev)
+{
+ lockdep_assert_held(&ufdev->lock);
+
+ if (!ufdev->link_up)
+ return -EPERM;
+
+ return 0;
+}
+
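+/*
+ * For IPv4 5-tuple filters, require that destination address and port
+ * are both specified and that the address matches the IP currently
+ * configured on the forwarding device.
+ */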
+static int validate_filter_locked(struct usnic_fwd_dev *ufdev,
+ struct filter *filter)
+{
+ lockdep_assert_held(&ufdev->lock);
+
+ if (filter->type == FILTER_IPV4_5TUPLE) {
+ if (!(filter->u.ipv4.flags & FILTER_FIELD_5TUP_DST_AD))
+ return -EACCES;
+ if (!(filter->u.ipv4.flags & FILTER_FIELD_5TUP_DST_PT))
+ return -EBUSY;
+ else if (ufdev->inaddr == 0)
+ return -EINVAL;
+ else if (filter->u.ipv4.dst_port == 0)
+ return -ERANGE;
+ else if (ntohl(ufdev->inaddr) != filter->u.ipv4.dst_addr)
+ return -EFAULT;
+ else
+ return 0;
+ }
+
+ return 0;
+}
+
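+/* Pack the filter and its action as two consecutive TLVs in the buffer. */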
+static void fill_tlv(struct filter_tlv *tlv, struct filter *filter,
+ struct filter_action *action)
+{
+ tlv->type = CLSF_TLV_FILTER;
+ tlv->length = sizeof(struct filter);
+ *((struct filter *)&tlv->val) = *filter;
+
+ tlv = (struct filter_tlv *)((char *)tlv + sizeof(struct filter_tlv) +
+ sizeof(struct filter));
+ tlv->type = CLSF_TLV_ACTION;
+ tlv->length = sizeof(struct filter_action);
+ *((struct filter_action *)&tlv->val) = *action;
+}
+
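+/*
+ * Program a classifier flow on the device: a filter + action TLV pair
+ * is built in DMA-coherent memory and handed to firmware via
+ * CMD_ADD_FILTER; on success the firmware returns the filter ID in a0,
+ * which becomes the flow_id of the returned handle.
+ */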
+struct usnic_fwd_flow*
+usnic_fwd_alloc_flow(struct usnic_fwd_dev *ufdev, struct filter *filter,
+ struct usnic_filter_action *uaction)
+{
+ struct filter_tlv *tlv;
+ struct pci_dev *pdev;
+ struct usnic_fwd_flow *flow;
+ uint64_t a0, a1;
+ uint64_t tlv_size;
+ dma_addr_t tlv_pa;
+ int status;
+
+ pdev = ufdev->pdev;
+ tlv_size = (2*sizeof(struct filter_tlv) + sizeof(struct filter) +
+ sizeof(struct filter_action));
+
+ flow = kzalloc(sizeof(*flow), GFP_ATOMIC);
+ if (!flow)
+ return ERR_PTR(-ENOMEM);
+
+ tlv = pci_alloc_consistent(pdev, tlv_size, &tlv_pa);
+ if (!tlv) {
+ usnic_err("Failed to allocate memory\n");
+ status = -ENOMEM;
+ goto out_free_flow;
+ }
+
+ fill_tlv(tlv, filter, &uaction->action);
+
+ spin_lock(&ufdev->lock);
+ status = usnic_fwd_dev_ready_locked(ufdev);
+ if (status) {
+ usnic_err("Forwarding dev %s not ready with status %d\n",
+ ufdev->name, status);
+ goto out_free_tlv;
+ }
+
+ status = validate_filter_locked(ufdev, filter);
+ if (status) {
+ usnic_err("Failed to validate filter with status %d\n",
+ status);
+ goto out_free_tlv;
+ }
+
+ /* Issue Devcmd */
+ a0 = tlv_pa;
+ a1 = tlv_size;
+ status = usnic_fwd_devcmd_locked(ufdev, uaction->vnic_idx,
+ CMD_ADD_FILTER, &a0, &a1);
+ if (status) {
+ usnic_err("VF %s Filter add failed with status:%d",
+ ufdev->name, status);
+ status = -EFAULT;
+ goto out_free_tlv;
+ } else {
+ usnic_dbg("VF %s FILTER ID:%llu", ufdev->name, a0);
+ }
+
+ flow->flow_id = (uint32_t) a0;
+ flow->vnic_idx = uaction->vnic_idx;
+ flow->ufdev = ufdev;
+
+out_free_tlv:
+ spin_unlock(&ufdev->lock);
+ pci_free_consistent(pdev, tlv_size, tlv, tlv_pa);
+ if (!status)
+ return flow;
+out_free_flow:
+ kfree(flow);
+ return ERR_PTR(status);
+}
+
+int usnic_fwd_dealloc_flow(struct usnic_fwd_flow *flow)
+{
+ int status;
+ u64 a0, a1;
+
+ a0 = flow->flow_id;
+
+ status = usnic_fwd_devcmd(flow->ufdev, flow->vnic_idx,
+ CMD_DEL_FILTER, &a0, &a1);
+ if (status) {
+ if (status == ERR_EINVAL) {
+ usnic_dbg("Filter %u already deleted for VF Idx %u pf: %s status: %d",
+ flow->flow_id, flow->vnic_idx,
+ flow->ufdev->name, status);
+ } else {
+ usnic_err("PF %s VF Idx %u Filter: %u FILTER DELETE failed with status %d",
+ flow->ufdev->name, flow->vnic_idx,
+ flow->flow_id, status);
+ }
+		/*
+		 * Log the error and fake success to the caller because if
+		 * a flow fails to be deleted in the firmware, it is an
+		 * unrecoverable error.
+		 */
+		status = 0;
+ } else {
+ usnic_dbg("PF %s VF Idx %u Filter: %u FILTER DELETED",
+ flow->ufdev->name, flow->vnic_idx,
+ flow->flow_id);
+ }
+
+ kfree(flow);
+ return status;
+}
+
+int usnic_fwd_enable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx)
+{
+ int status;
+ struct net_device *pf_netdev;
+ u64 a0, a1;
+
+ pf_netdev = ufdev->netdev;
+ a0 = qp_idx;
+ a1 = CMD_QP_RQWQ;
+
+ status = usnic_fwd_devcmd(ufdev, vnic_idx, CMD_QP_ENABLE,
+ &a0, &a1);
+ if (status) {
+ usnic_err("PF %s VNIC Index %u RQ Index: %u ENABLE Failed with status %d",
+ netdev_name(pf_netdev),
+ vnic_idx,
+ qp_idx,
+ status);
+ } else {
+ usnic_dbg("PF %s VNIC Index %u RQ Index: %u ENABLED",
+ netdev_name(pf_netdev),
+ vnic_idx, qp_idx);
+ }
+
+ return status;
+}
+
+int usnic_fwd_disable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx)
+{
+ int status;
+ u64 a0, a1;
+ struct net_device *pf_netdev;
+
+ pf_netdev = ufdev->netdev;
+ a0 = qp_idx;
+ a1 = CMD_QP_RQWQ;
+
+ status = usnic_fwd_devcmd(ufdev, vnic_idx, CMD_QP_DISABLE,
+ &a0, &a1);
+ if (status) {
+ usnic_err("PF %s VNIC Index %u RQ Index: %u DISABLE Failed with status %d",
+ netdev_name(pf_netdev),
+ vnic_idx,
+ qp_idx,
+ status);
+ } else {
+ usnic_dbg("PF %s VNIC Index %u RQ Index: %u DISABLED",
+ netdev_name(pf_netdev),
+ vnic_idx,
+ qp_idx);
+ }
+
+ return status;
+}
diff --git a/drivers/infiniband/hw/usnic/usnic_fwd.h b/drivers/infiniband/hw/usnic/usnic_fwd.h
new file mode 100644
index 000000000..93713a223
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_fwd.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef USNIC_FWD_H_
+#define USNIC_FWD_H_
+
+#include <linux/if.h>
+#include <linux/netdevice.h>
+#include <linux/pci.h>
+#include <linux/in.h>
+
+#include "usnic_abi.h"
+#include "usnic_common_pkt_hdr.h"
+#include "vnic_devcmd.h"
+
+struct usnic_fwd_dev {
+ struct pci_dev *pdev;
+ struct net_device *netdev;
+ spinlock_t lock;
+ /*
+ * The following fields can be read directly off the device.
+	 * However, they should be set by an accessor function, except name,
+ * which cannot be changed.
+ */
+ bool link_up;
+ char mac[ETH_ALEN];
+ unsigned int mtu;
+ __be32 inaddr;
+ char name[IFNAMSIZ+1];
+};
+
+struct usnic_fwd_flow {
+ uint32_t flow_id;
+ struct usnic_fwd_dev *ufdev;
+ unsigned int vnic_idx;
+};
+
+struct usnic_filter_action {
+ int vnic_idx;
+ struct filter_action action;
+};
+
+struct usnic_fwd_dev *usnic_fwd_dev_alloc(struct pci_dev *pdev);
+void usnic_fwd_dev_free(struct usnic_fwd_dev *ufdev);
+
+void usnic_fwd_set_mac(struct usnic_fwd_dev *ufdev, char mac[ETH_ALEN]);
+int usnic_fwd_add_ipaddr(struct usnic_fwd_dev *ufdev, __be32 inaddr);
+void usnic_fwd_del_ipaddr(struct usnic_fwd_dev *ufdev);
+void usnic_fwd_carrier_up(struct usnic_fwd_dev *ufdev);
+void usnic_fwd_carrier_down(struct usnic_fwd_dev *ufdev);
+void usnic_fwd_set_mtu(struct usnic_fwd_dev *ufdev, unsigned int mtu);
+
+/*
+ * Allocate a flow on this forwarding device. Whoever calls this function
+ * must monitor netdev events on ufdev's netdevice. If NETDEV_REBOOT or
+ * NETDEV_DOWN is seen, the flow will no longer function and must be
+ * freed immediately by calling usnic_fwd_dealloc_flow().
+ */
+struct usnic_fwd_flow*
+usnic_fwd_alloc_flow(struct usnic_fwd_dev *ufdev, struct filter *filter,
+ struct usnic_filter_action *action);
+int usnic_fwd_dealloc_flow(struct usnic_fwd_flow *flow);
+int usnic_fwd_enable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx);
+int usnic_fwd_disable_qp(struct usnic_fwd_dev *ufdev, int vnic_idx, int qp_idx);
+
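+/*
+ * Initialize a usNIC-ID filter that matches the custom RoCE ethertype,
+ * protocol version and the given usnic_id.
+ */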
+static inline void usnic_fwd_init_usnic_filter(struct filter *filter,
+ uint32_t usnic_id)
+{
+ filter->type = FILTER_USNIC_ID;
+ filter->u.usnic.ethtype = USNIC_ROCE_ETHERTYPE;
+ filter->u.usnic.flags = FILTER_FIELD_USNIC_ETHTYPE |
+ FILTER_FIELD_USNIC_ID |
+ FILTER_FIELD_USNIC_PROTO;
+ filter->u.usnic.proto_version = (USNIC_ROCE_GRH_VER <<
+ USNIC_ROCE_GRH_VER_SHIFT) |
+ USNIC_PROTO_VER;
+ filter->u.usnic.usnic_id = usnic_id;
+}
+
+static inline void usnic_fwd_init_udp_filter(struct filter *filter,
+ uint32_t daddr, uint16_t dport)
+{
+ filter->type = FILTER_IPV4_5TUPLE;
+ filter->u.ipv4.flags = FILTER_FIELD_5TUP_PROTO;
+ filter->u.ipv4.protocol = PROTO_UDP;
+
+ if (daddr) {
+ filter->u.ipv4.flags |= FILTER_FIELD_5TUP_DST_AD;
+ filter->u.ipv4.dst_addr = daddr;
+ }
+
+ if (dport) {
+ filter->u.ipv4.flags |= FILTER_FIELD_5TUP_DST_PT;
+ filter->u.ipv4.dst_port = dport;
+ }
+}
+
+#endif /* !USNIC_FWD_H_ */
diff --git a/drivers/infiniband/hw/usnic/usnic_ib.h b/drivers/infiniband/hw/usnic/usnic_ib.h
new file mode 100644
index 000000000..e5a9297dd
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_ib.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef USNIC_IB_H_
+#define USNIC_IB_H_
+
+#include <linux/iommu.h>
+#include <linux/netdevice.h>
+
+#include <rdma/ib_verbs.h>
+
+
+#include "usnic.h"
+#include "usnic_abi.h"
+#include "usnic_vnic.h"
+
+#define USNIC_IB_PORT_CNT 1
+#define USNIC_IB_NUM_COMP_VECTORS 1
+
+extern unsigned int usnic_ib_share_vf;
+
+struct usnic_ib_ucontext {
+ struct ib_ucontext ibucontext;
+ /* Protected by usnic_ib_dev->usdev_lock */
+ struct list_head qp_grp_list;
+ struct list_head link;
+};
+
+struct usnic_ib_pd {
+ struct ib_pd ibpd;
+ struct usnic_uiom_pd *umem_pd;
+};
+
+struct usnic_ib_mr {
+ struct ib_mr ibmr;
+ struct usnic_uiom_reg *umem;
+};
+
+struct usnic_ib_dev {
+ struct ib_device ib_dev;
+ struct pci_dev *pdev;
+ struct net_device *netdev;
+ struct usnic_fwd_dev *ufdev;
+ struct list_head ib_dev_link;
+ struct list_head vf_dev_list;
+ struct list_head ctx_list;
+ struct mutex usdev_lock;
+
+ /* provisioning information */
+ struct kref vf_cnt;
+ unsigned int vf_res_cnt[USNIC_VNIC_RES_TYPE_MAX];
+
+ /* sysfs vars for QPN reporting */
+ struct kobject *qpn_kobj;
+};
+
+struct usnic_ib_vf {
+ struct usnic_ib_dev *pf;
+ spinlock_t lock;
+ struct usnic_vnic *vnic;
+ unsigned int qp_grp_ref_cnt;
+ struct usnic_ib_pd *pd;
+ struct list_head link;
+};
+
+static inline
+struct usnic_ib_dev *to_usdev(struct ib_device *ibdev)
+{
+ return container_of(ibdev, struct usnic_ib_dev, ib_dev);
+}
+
+static inline
+struct usnic_ib_ucontext *to_ucontext(struct ib_ucontext *ibucontext)
+{
+ return container_of(ibucontext, struct usnic_ib_ucontext, ibucontext);
+}
+
+static inline
+struct usnic_ib_pd *to_upd(struct ib_pd *ibpd)
+{
+ return container_of(ibpd, struct usnic_ib_pd, ibpd);
+}
+
+static inline
+struct usnic_ib_ucontext *to_uucontext(struct ib_ucontext *ibucontext)
+{
+ return container_of(ibucontext, struct usnic_ib_ucontext, ibucontext);
+}
+
+static inline
+struct usnic_ib_mr *to_umr(struct ib_mr *ibmr)
+{
+ return container_of(ibmr, struct usnic_ib_mr, ibmr);
+}
+void usnic_ib_log_vf(struct usnic_ib_vf *vf);
+
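+/*
+ * Advance the output pointer P and shrink the remaining length L by the
+ * N bytes just written (typically the return value of scnprintf()).
+ */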
+#define UPDATE_PTR_LEFT(N, P, L) \
+do { \
+ L -= (N); \
+ P += (N); \
+} while (0)
+
+#endif /* USNIC_IB_H_ */
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_main.c b/drivers/infiniband/hw/usnic/usnic_ib_main.c
new file mode 100644
index 000000000..0d0f98695
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_ib_main.c
@@ -0,0 +1,682 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Author: Upinder Malhi <umalhi@cisco.com>
+ * Author: Anant Deepak <anadeepa@cisco.com>
+ * Author: Cesare Cantu' <cantuc@cisco.com>
+ * Author: Jeff Squyres <jsquyres@cisco.com>
+ * Author: Kiran Thirumalai <kithirum@cisco.com>
+ * Author: Xuyang Wang <xuywang@cisco.com>
+ * Author: Reese Faucette <rfaucett@cisco.com>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/inetdevice.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+#include <linux/pci.h>
+#include <linux/netdevice.h>
+
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_addr.h>
+
+#include "usnic_abi.h"
+#include "usnic_common_util.h"
+#include "usnic_ib.h"
+#include "usnic_ib_qp_grp.h"
+#include "usnic_log.h"
+#include "usnic_fwd.h"
+#include "usnic_debugfs.h"
+#include "usnic_ib_verbs.h"
+#include "usnic_transport.h"
+#include "usnic_uiom.h"
+#include "usnic_ib_sysfs.h"
+
+unsigned int usnic_log_lvl = USNIC_LOG_LVL_ERR;
+unsigned int usnic_ib_share_vf = 1;
+
+static const char usnic_version[] =
+ DRV_NAME ": Cisco VIC (USNIC) Verbs Driver v"
+ DRV_VERSION " (" DRV_RELDATE ")\n";
+
+static DEFINE_MUTEX(usnic_ib_ibdev_list_lock);
+static LIST_HEAD(usnic_ib_ibdev_list);
+
+/* Callback dump funcs */
+static int usnic_ib_dump_vf_hdr(void *obj, char *buf, int buf_sz)
+{
+ struct usnic_ib_vf *vf = obj;
+ return scnprintf(buf, buf_sz, "PF: %s ", vf->pf->ib_dev.name);
+}
+/* End callback dump funcs */
+
+static void usnic_ib_dump_vf(struct usnic_ib_vf *vf, char *buf, int buf_sz)
+{
+ usnic_vnic_dump(vf->vnic, buf, buf_sz, vf,
+ usnic_ib_dump_vf_hdr,
+ usnic_ib_qp_grp_dump_hdr, usnic_ib_qp_grp_dump_rows);
+}
+
+void usnic_ib_log_vf(struct usnic_ib_vf *vf)
+{
+ char buf[1000];
+ usnic_ib_dump_vf(vf, buf, sizeof(buf));
+ usnic_dbg("%s\n", buf);
+}
+
+/* Start of netdev section */
+static inline const char *usnic_ib_netdev_event_to_string(unsigned long event)
+{
+ const char *event2str[] = {"NETDEV_NONE", "NETDEV_UP", "NETDEV_DOWN",
+ "NETDEV_REBOOT", "NETDEV_CHANGE",
+ "NETDEV_REGISTER", "NETDEV_UNREGISTER", "NETDEV_CHANGEMTU",
+ "NETDEV_CHANGEADDR", "NETDEV_GOING_DOWN", "NETDEV_FEAT_CHANGE",
+ "NETDEV_BONDING_FAILOVER", "NETDEV_PRE_UP",
+ "NETDEV_PRE_TYPE_CHANGE", "NETDEV_POST_TYPE_CHANGE",
+ "NETDEV_POST_INT", "NETDEV_UNREGISTER_FINAL", "NETDEV_RELEASE",
+ "NETDEV_NOTIFY_PEERS", "NETDEV_JOIN"
+ };
+
+ if (event >= ARRAY_SIZE(event2str))
+ return "UNKNOWN_NETDEV_EVENT";
+ else
+ return event2str[event];
+}
+
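+/*
+ * Move every QP group in INIT, RTR or RTS belonging to any ucontext of
+ * this device into the ERR state; used when the underlying netdev
+ * reboots, loses link or changes address/MTU.
+ */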
+static void usnic_ib_qp_grp_modify_active_to_err(struct usnic_ib_dev *us_ibdev)
+{
+ struct usnic_ib_ucontext *ctx;
+ struct usnic_ib_qp_grp *qp_grp;
+ enum ib_qp_state cur_state;
+ int status;
+
+ BUG_ON(!mutex_is_locked(&us_ibdev->usdev_lock));
+
+ list_for_each_entry(ctx, &us_ibdev->ctx_list, link) {
+ list_for_each_entry(qp_grp, &ctx->qp_grp_list, link) {
+ cur_state = qp_grp->state;
+ if (cur_state == IB_QPS_INIT ||
+ cur_state == IB_QPS_RTR ||
+ cur_state == IB_QPS_RTS) {
+ status = usnic_ib_qp_grp_modify(qp_grp,
+ IB_QPS_ERR,
+ NULL);
+ if (status) {
+					usnic_err("Failed to transition qp grp %u from %s to %s\n",
+ qp_grp->grp_id,
+ usnic_ib_qp_grp_state_to_string
+ (cur_state),
+ usnic_ib_qp_grp_state_to_string
+ (IB_QPS_ERR));
+ }
+ }
+ }
+ }
+}
+
+static void usnic_ib_handle_usdev_event(struct usnic_ib_dev *us_ibdev,
+ unsigned long event)
+{
+ struct net_device *netdev;
+ struct ib_event ib_event;
+
+ memset(&ib_event, 0, sizeof(ib_event));
+
+ mutex_lock(&us_ibdev->usdev_lock);
+ netdev = us_ibdev->netdev;
+ switch (event) {
+ case NETDEV_REBOOT:
+ usnic_info("PF Reset on %s\n", us_ibdev->ib_dev.name);
+ usnic_ib_qp_grp_modify_active_to_err(us_ibdev);
+ ib_event.event = IB_EVENT_PORT_ERR;
+ ib_event.device = &us_ibdev->ib_dev;
+ ib_event.element.port_num = 1;
+ ib_dispatch_event(&ib_event);
+ break;
+ case NETDEV_UP:
+ case NETDEV_DOWN:
+ case NETDEV_CHANGE:
+ if (!us_ibdev->ufdev->link_up &&
+ netif_carrier_ok(netdev)) {
+ usnic_fwd_carrier_up(us_ibdev->ufdev);
+ usnic_info("Link UP on %s\n", us_ibdev->ib_dev.name);
+ ib_event.event = IB_EVENT_PORT_ACTIVE;
+ ib_event.device = &us_ibdev->ib_dev;
+ ib_event.element.port_num = 1;
+ ib_dispatch_event(&ib_event);
+ } else if (us_ibdev->ufdev->link_up &&
+ !netif_carrier_ok(netdev)) {
+ usnic_fwd_carrier_down(us_ibdev->ufdev);
+ usnic_info("Link DOWN on %s\n", us_ibdev->ib_dev.name);
+ usnic_ib_qp_grp_modify_active_to_err(us_ibdev);
+ ib_event.event = IB_EVENT_PORT_ERR;
+ ib_event.device = &us_ibdev->ib_dev;
+ ib_event.element.port_num = 1;
+ ib_dispatch_event(&ib_event);
+ } else {
+ usnic_dbg("Ignoring %s on %s\n",
+ usnic_ib_netdev_event_to_string(event),
+ us_ibdev->ib_dev.name);
+ }
+ break;
+ case NETDEV_CHANGEADDR:
+ if (!memcmp(us_ibdev->ufdev->mac, netdev->dev_addr,
+ sizeof(us_ibdev->ufdev->mac))) {
+ usnic_dbg("Ignoring addr change on %s\n",
+ us_ibdev->ib_dev.name);
+ } else {
+ usnic_info(" %s old mac: %pM new mac: %pM\n",
+ us_ibdev->ib_dev.name,
+ us_ibdev->ufdev->mac,
+ netdev->dev_addr);
+ usnic_fwd_set_mac(us_ibdev->ufdev, netdev->dev_addr);
+ usnic_ib_qp_grp_modify_active_to_err(us_ibdev);
+ ib_event.event = IB_EVENT_GID_CHANGE;
+ ib_event.device = &us_ibdev->ib_dev;
+ ib_event.element.port_num = 1;
+ ib_dispatch_event(&ib_event);
+ }
+
+ break;
+ case NETDEV_CHANGEMTU:
+ if (us_ibdev->ufdev->mtu != netdev->mtu) {
+ usnic_info("MTU Change on %s old: %u new: %u\n",
+ us_ibdev->ib_dev.name,
+ us_ibdev->ufdev->mtu, netdev->mtu);
+ usnic_fwd_set_mtu(us_ibdev->ufdev, netdev->mtu);
+ usnic_ib_qp_grp_modify_active_to_err(us_ibdev);
+ } else {
+ usnic_dbg("Ignoring MTU change on %s\n",
+ us_ibdev->ib_dev.name);
+ }
+ break;
+ default:
+ usnic_dbg("Ignoring event %s on %s",
+ usnic_ib_netdev_event_to_string(event),
+ us_ibdev->ib_dev.name);
+ }
+ mutex_unlock(&us_ibdev->usdev_lock);
+}
+
+static int usnic_ib_netdevice_event(struct notifier_block *notifier,
+ unsigned long event, void *ptr)
+{
+ struct usnic_ib_dev *us_ibdev;
+
+ struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
+
+ mutex_lock(&usnic_ib_ibdev_list_lock);
+ list_for_each_entry(us_ibdev, &usnic_ib_ibdev_list, ib_dev_link) {
+ if (us_ibdev->netdev == netdev) {
+ usnic_ib_handle_usdev_event(us_ibdev, event);
+ break;
+ }
+ }
+ mutex_unlock(&usnic_ib_ibdev_list_lock);
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block usnic_ib_netdevice_notifier = {
+ .notifier_call = usnic_ib_netdevice_event
+};
+/* End of netdev section */
+
+/* Start of inet section */
+static int usnic_ib_handle_inet_event(struct usnic_ib_dev *us_ibdev,
+ unsigned long event, void *ptr)
+{
+ struct in_ifaddr *ifa = ptr;
+ struct ib_event ib_event;
+
+ mutex_lock(&us_ibdev->usdev_lock);
+
+ switch (event) {
+ case NETDEV_DOWN:
+ usnic_info("%s via ip notifiers",
+ usnic_ib_netdev_event_to_string(event));
+ usnic_fwd_del_ipaddr(us_ibdev->ufdev);
+ usnic_ib_qp_grp_modify_active_to_err(us_ibdev);
+ ib_event.event = IB_EVENT_GID_CHANGE;
+ ib_event.device = &us_ibdev->ib_dev;
+ ib_event.element.port_num = 1;
+ ib_dispatch_event(&ib_event);
+ break;
+ case NETDEV_UP:
+ usnic_fwd_add_ipaddr(us_ibdev->ufdev, ifa->ifa_address);
+ usnic_info("%s via ip notifiers: ip %pI4",
+ usnic_ib_netdev_event_to_string(event),
+ &us_ibdev->ufdev->inaddr);
+ ib_event.event = IB_EVENT_GID_CHANGE;
+ ib_event.device = &us_ibdev->ib_dev;
+ ib_event.element.port_num = 1;
+ ib_dispatch_event(&ib_event);
+ break;
+ default:
+ usnic_info("Ignoring event %s on %s",
+ usnic_ib_netdev_event_to_string(event),
+ us_ibdev->ib_dev.name);
+ }
+ mutex_unlock(&us_ibdev->usdev_lock);
+
+ return NOTIFY_DONE;
+}
+
+static int usnic_ib_inetaddr_event(struct notifier_block *notifier,
+ unsigned long event, void *ptr)
+{
+ struct usnic_ib_dev *us_ibdev;
+ struct in_ifaddr *ifa = ptr;
+ struct net_device *netdev = ifa->ifa_dev->dev;
+
+ mutex_lock(&usnic_ib_ibdev_list_lock);
+ list_for_each_entry(us_ibdev, &usnic_ib_ibdev_list, ib_dev_link) {
+ if (us_ibdev->netdev == netdev) {
+ usnic_ib_handle_inet_event(us_ibdev, event, ptr);
+ break;
+ }
+ }
+ mutex_unlock(&usnic_ib_ibdev_list_lock);
+
+ return NOTIFY_DONE;
+}
+static struct notifier_block usnic_ib_inetaddr_notifier = {
+ .notifier_call = usnic_ib_inetaddr_event
+};
+/* End of inet section */
+
+/* Start of PF discovery section */
+static void *usnic_ib_device_add(struct pci_dev *dev)
+{
+ struct usnic_ib_dev *us_ibdev;
+ union ib_gid gid;
+ struct in_ifaddr *in;
+ struct net_device *netdev;
+
+ usnic_dbg("\n");
+ netdev = pci_get_drvdata(dev);
+
+ us_ibdev = (struct usnic_ib_dev *)ib_alloc_device(sizeof(*us_ibdev));
+ if (IS_ERR_OR_NULL(us_ibdev)) {
+ usnic_err("Device %s context alloc failed\n",
+ netdev_name(pci_get_drvdata(dev)));
+ return ERR_PTR(us_ibdev ? PTR_ERR(us_ibdev) : -EFAULT);
+ }
+
+ us_ibdev->ufdev = usnic_fwd_dev_alloc(dev);
+ if (IS_ERR_OR_NULL(us_ibdev->ufdev)) {
+ usnic_err("Failed to alloc ufdev for %s with err %ld\n",
+ pci_name(dev), PTR_ERR(us_ibdev->ufdev));
+ goto err_dealloc;
+ }
+
+ mutex_init(&us_ibdev->usdev_lock);
+ INIT_LIST_HEAD(&us_ibdev->vf_dev_list);
+ INIT_LIST_HEAD(&us_ibdev->ctx_list);
+
+ us_ibdev->pdev = dev;
+ us_ibdev->netdev = pci_get_drvdata(dev);
+ us_ibdev->ib_dev.owner = THIS_MODULE;
+ us_ibdev->ib_dev.node_type = RDMA_NODE_USNIC_UDP;
+ us_ibdev->ib_dev.phys_port_cnt = USNIC_IB_PORT_CNT;
+ us_ibdev->ib_dev.num_comp_vectors = USNIC_IB_NUM_COMP_VECTORS;
+ us_ibdev->ib_dev.dma_device = &dev->dev;
+ us_ibdev->ib_dev.uverbs_abi_ver = USNIC_UVERBS_ABI_VERSION;
+ strlcpy(us_ibdev->ib_dev.name, "usnic_%d", IB_DEVICE_NAME_MAX);
+
+ us_ibdev->ib_dev.uverbs_cmd_mask =
+ (1ull << IB_USER_VERBS_CMD_GET_CONTEXT) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_DEVICE) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_PORT) |
+ (1ull << IB_USER_VERBS_CMD_ALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_DEALLOC_PD) |
+ (1ull << IB_USER_VERBS_CMD_REG_MR) |
+ (1ull << IB_USER_VERBS_CMD_DEREG_MR) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_COMP_CHANNEL) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_CQ) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_CQ) |
+ (1ull << IB_USER_VERBS_CMD_CREATE_QP) |
+ (1ull << IB_USER_VERBS_CMD_MODIFY_QP) |
+ (1ull << IB_USER_VERBS_CMD_QUERY_QP) |
+ (1ull << IB_USER_VERBS_CMD_DESTROY_QP) |
+ (1ull << IB_USER_VERBS_CMD_ATTACH_MCAST) |
+ (1ull << IB_USER_VERBS_CMD_DETACH_MCAST) |
+ (1ull << IB_USER_VERBS_CMD_OPEN_QP);
+
+ us_ibdev->ib_dev.query_device = usnic_ib_query_device;
+ us_ibdev->ib_dev.query_port = usnic_ib_query_port;
+ us_ibdev->ib_dev.query_pkey = usnic_ib_query_pkey;
+ us_ibdev->ib_dev.query_gid = usnic_ib_query_gid;
+ us_ibdev->ib_dev.get_link_layer = usnic_ib_port_link_layer;
+ us_ibdev->ib_dev.alloc_pd = usnic_ib_alloc_pd;
+ us_ibdev->ib_dev.dealloc_pd = usnic_ib_dealloc_pd;
+ us_ibdev->ib_dev.create_qp = usnic_ib_create_qp;
+ us_ibdev->ib_dev.modify_qp = usnic_ib_modify_qp;
+ us_ibdev->ib_dev.query_qp = usnic_ib_query_qp;
+ us_ibdev->ib_dev.destroy_qp = usnic_ib_destroy_qp;
+ us_ibdev->ib_dev.create_cq = usnic_ib_create_cq;
+ us_ibdev->ib_dev.destroy_cq = usnic_ib_destroy_cq;
+ us_ibdev->ib_dev.reg_user_mr = usnic_ib_reg_mr;
+ us_ibdev->ib_dev.dereg_mr = usnic_ib_dereg_mr;
+ us_ibdev->ib_dev.alloc_ucontext = usnic_ib_alloc_ucontext;
+ us_ibdev->ib_dev.dealloc_ucontext = usnic_ib_dealloc_ucontext;
+ us_ibdev->ib_dev.mmap = usnic_ib_mmap;
+ us_ibdev->ib_dev.create_ah = usnic_ib_create_ah;
+ us_ibdev->ib_dev.destroy_ah = usnic_ib_destroy_ah;
+ us_ibdev->ib_dev.post_send = usnic_ib_post_send;
+ us_ibdev->ib_dev.post_recv = usnic_ib_post_recv;
+ us_ibdev->ib_dev.poll_cq = usnic_ib_poll_cq;
+ us_ibdev->ib_dev.req_notify_cq = usnic_ib_req_notify_cq;
+ us_ibdev->ib_dev.get_dma_mr = usnic_ib_get_dma_mr;
+
+ if (ib_register_device(&us_ibdev->ib_dev, NULL))
+ goto err_fwd_dealloc;
+
+ usnic_fwd_set_mtu(us_ibdev->ufdev, us_ibdev->netdev->mtu);
+ usnic_fwd_set_mac(us_ibdev->ufdev, us_ibdev->netdev->dev_addr);
+ if (netif_carrier_ok(us_ibdev->netdev))
+ usnic_fwd_carrier_up(us_ibdev->ufdev);
+
+ in = ((struct in_device *)(netdev->ip_ptr))->ifa_list;
+ if (in != NULL)
+ usnic_fwd_add_ipaddr(us_ibdev->ufdev, in->ifa_address);
+
+ usnic_mac_ip_to_gid(us_ibdev->netdev->perm_addr,
+ us_ibdev->ufdev->inaddr, &gid.raw[0]);
+ memcpy(&us_ibdev->ib_dev.node_guid, &gid.global.interface_id,
+ sizeof(gid.global.interface_id));
+ kref_init(&us_ibdev->vf_cnt);
+
+ usnic_info("Added ibdev: %s netdev: %s with mac %pM Link: %u MTU: %u\n",
+ us_ibdev->ib_dev.name, netdev_name(us_ibdev->netdev),
+ us_ibdev->ufdev->mac, us_ibdev->ufdev->link_up,
+ us_ibdev->ufdev->mtu);
+ return us_ibdev;
+
+err_fwd_dealloc:
+ usnic_fwd_dev_free(us_ibdev->ufdev);
+err_dealloc:
+ usnic_err("failed -- deallocing device\n");
+ ib_dealloc_device(&us_ibdev->ib_dev);
+ return NULL;
+}
+
+static void usnic_ib_device_remove(struct usnic_ib_dev *us_ibdev)
+{
+ usnic_info("Unregistering %s\n", us_ibdev->ib_dev.name);
+ usnic_ib_sysfs_unregister_usdev(us_ibdev);
+ usnic_fwd_dev_free(us_ibdev->ufdev);
+ ib_unregister_device(&us_ibdev->ib_dev);
+ ib_dealloc_device(&us_ibdev->ib_dev);
+}
+
+static void usnic_ib_undiscover_pf(struct kref *kref)
+{
+ struct usnic_ib_dev *us_ibdev, *tmp;
+ struct pci_dev *dev;
+ bool found = false;
+
+ dev = container_of(kref, struct usnic_ib_dev, vf_cnt)->pdev;
+ mutex_lock(&usnic_ib_ibdev_list_lock);
+ list_for_each_entry_safe(us_ibdev, tmp,
+ &usnic_ib_ibdev_list, ib_dev_link) {
+ if (us_ibdev->pdev == dev) {
+ list_del(&us_ibdev->ib_dev_link);
+ usnic_ib_device_remove(us_ibdev);
+ found = true;
+ break;
+ }
+ }
+
+ WARN(!found, "Failed to remove PF %s\n", pci_name(dev));
+
+ mutex_unlock(&usnic_ib_ibdev_list_lock);
+}
+
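+/*
+ * Find the IB device registered for the PF that owns this VF's vnic,
+ * creating and registering it on first use; each VF holds a vf_cnt
+ * reference that is dropped in usnic_ib_pci_remove().
+ */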
+static struct usnic_ib_dev *usnic_ib_discover_pf(struct usnic_vnic *vnic)
+{
+ struct usnic_ib_dev *us_ibdev;
+ struct pci_dev *parent_pci, *vf_pci;
+ int err;
+
+ vf_pci = usnic_vnic_get_pdev(vnic);
+ parent_pci = pci_physfn(vf_pci);
+
+ BUG_ON(!parent_pci);
+
+ mutex_lock(&usnic_ib_ibdev_list_lock);
+ list_for_each_entry(us_ibdev, &usnic_ib_ibdev_list, ib_dev_link) {
+ if (us_ibdev->pdev == parent_pci) {
+ kref_get(&us_ibdev->vf_cnt);
+ goto out;
+ }
+ }
+
+ us_ibdev = usnic_ib_device_add(parent_pci);
+ if (IS_ERR_OR_NULL(us_ibdev)) {
+ us_ibdev = us_ibdev ? us_ibdev : ERR_PTR(-EFAULT);
+ goto out;
+ }
+
+ err = usnic_ib_sysfs_register_usdev(us_ibdev);
+ if (err) {
+ usnic_ib_device_remove(us_ibdev);
+ us_ibdev = ERR_PTR(err);
+ goto out;
+ }
+
+ list_add(&us_ibdev->ib_dev_link, &usnic_ib_ibdev_list);
+out:
+ mutex_unlock(&usnic_ib_ibdev_list_lock);
+ return us_ibdev;
+}
+/* End of PF discovery section */
+
+/* Start of PCI section */
+
+static const struct pci_device_id usnic_ib_pci_ids[] = {
+ {PCI_DEVICE(PCI_VENDOR_ID_CISCO, PCI_DEVICE_ID_CISCO_VIC_USPACE_NIC)},
+ {0,}
+};
+
+static int usnic_ib_pci_probe(struct pci_dev *pdev,
+ const struct pci_device_id *id)
+{
+ int err;
+ struct usnic_ib_dev *pf;
+ struct usnic_ib_vf *vf;
+ enum usnic_vnic_res_type res_type;
+
+ vf = kzalloc(sizeof(*vf), GFP_KERNEL);
+ if (!vf)
+ return -ENOMEM;
+
+ err = pci_enable_device(pdev);
+ if (err) {
+ usnic_err("Failed to enable %s with err %d\n",
+ pci_name(pdev), err);
+ goto out_clean_vf;
+ }
+
+ err = pci_request_regions(pdev, DRV_NAME);
+ if (err) {
+ usnic_err("Failed to request region for %s with err %d\n",
+ pci_name(pdev), err);
+ goto out_disable_device;
+ }
+
+ pci_set_master(pdev);
+ pci_set_drvdata(pdev, vf);
+
+ vf->vnic = usnic_vnic_alloc(pdev);
+ if (IS_ERR_OR_NULL(vf->vnic)) {
+ err = vf->vnic ? PTR_ERR(vf->vnic) : -ENOMEM;
+ usnic_err("Failed to alloc vnic for %s with err %d\n",
+ pci_name(pdev), err);
+ goto out_release_regions;
+ }
+
+ pf = usnic_ib_discover_pf(vf->vnic);
+ if (IS_ERR_OR_NULL(pf)) {
+ usnic_err("Failed to discover pf of vnic %s with err%ld\n",
+ pci_name(pdev), PTR_ERR(pf));
+ err = pf ? PTR_ERR(pf) : -EFAULT;
+ goto out_clean_vnic;
+ }
+
+ vf->pf = pf;
+ spin_lock_init(&vf->lock);
+ mutex_lock(&pf->usdev_lock);
+ list_add_tail(&vf->link, &pf->vf_dev_list);
+ /*
+	 * Save max settings (they will be the same for each VF; it is easier
+	 * to re-write them than to say "if (!set) { set_values(); set = 1; }").
+ */
+ for (res_type = USNIC_VNIC_RES_TYPE_EOL+1;
+ res_type < USNIC_VNIC_RES_TYPE_MAX;
+ res_type++) {
+ pf->vf_res_cnt[res_type] = usnic_vnic_res_cnt(vf->vnic,
+ res_type);
+ }
+
+ mutex_unlock(&pf->usdev_lock);
+
+ usnic_info("Registering usnic VF %s into PF %s\n", pci_name(pdev),
+ pf->ib_dev.name);
+ usnic_ib_log_vf(vf);
+ return 0;
+
+out_clean_vnic:
+ usnic_vnic_free(vf->vnic);
+out_release_regions:
+ pci_set_drvdata(pdev, NULL);
+ pci_clear_master(pdev);
+ pci_release_regions(pdev);
+out_disable_device:
+ pci_disable_device(pdev);
+out_clean_vf:
+ kfree(vf);
+ return err;
+}
+
+static void usnic_ib_pci_remove(struct pci_dev *pdev)
+{
+ struct usnic_ib_vf *vf = pci_get_drvdata(pdev);
+ struct usnic_ib_dev *pf = vf->pf;
+
+ mutex_lock(&pf->usdev_lock);
+ list_del(&vf->link);
+ mutex_unlock(&pf->usdev_lock);
+
+ kref_put(&pf->vf_cnt, usnic_ib_undiscover_pf);
+ usnic_vnic_free(vf->vnic);
+ pci_set_drvdata(pdev, NULL);
+ pci_clear_master(pdev);
+ pci_release_regions(pdev);
+ pci_disable_device(pdev);
+ kfree(vf);
+
+ usnic_info("Removed VF %s\n", pci_name(pdev));
+}
+
+/* PCI driver entry points */
+static struct pci_driver usnic_ib_pci_driver = {
+ .name = DRV_NAME,
+ .id_table = usnic_ib_pci_ids,
+ .probe = usnic_ib_pci_probe,
+ .remove = usnic_ib_pci_remove,
+};
+/* End of PCI section */
+
+/* Start of module section */
+static int __init usnic_ib_init(void)
+{
+ int err;
+
+ printk_once(KERN_INFO "%s", usnic_version);
+
+ err = usnic_uiom_init(DRV_NAME);
+ if (err) {
+		usnic_err("Unable to initialize umem with err %d\n", err);
+ return err;
+ }
+
+ if (pci_register_driver(&usnic_ib_pci_driver)) {
+ usnic_err("Unable to register with PCI\n");
+ goto out_umem_fini;
+ }
+
+ err = register_netdevice_notifier(&usnic_ib_netdevice_notifier);
+ if (err) {
+ usnic_err("Failed to register netdev notifier\n");
+ goto out_pci_unreg;
+ }
+
+ err = register_inetaddr_notifier(&usnic_ib_inetaddr_notifier);
+ if (err) {
+ usnic_err("Failed to register inet addr notifier\n");
+ goto out_unreg_netdev_notifier;
+ }
+
+ err = usnic_transport_init();
+ if (err) {
+ usnic_err("Failed to initialize transport\n");
+ goto out_unreg_inetaddr_notifier;
+ }
+
+ usnic_debugfs_init();
+
+ return 0;
+
+out_unreg_inetaddr_notifier:
+ unregister_inetaddr_notifier(&usnic_ib_inetaddr_notifier);
+out_unreg_netdev_notifier:
+ unregister_netdevice_notifier(&usnic_ib_netdevice_notifier);
+out_pci_unreg:
+ pci_unregister_driver(&usnic_ib_pci_driver);
+out_umem_fini:
+ usnic_uiom_fini();
+
+ return err;
+}
+
+static void __exit usnic_ib_destroy(void)
+{
+ usnic_dbg("\n");
+ usnic_debugfs_exit();
+ usnic_transport_fini();
+ unregister_inetaddr_notifier(&usnic_ib_inetaddr_notifier);
+ unregister_netdevice_notifier(&usnic_ib_netdevice_notifier);
+ pci_unregister_driver(&usnic_ib_pci_driver);
+ usnic_uiom_fini();
+}
+
+MODULE_DESCRIPTION("Cisco VIC (usNIC) Verbs Driver");
+MODULE_AUTHOR("Upinder Malhi <umalhi@cisco.com>");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+module_param(usnic_log_lvl, uint, S_IRUGO | S_IWUSR);
+module_param(usnic_ib_share_vf, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(usnic_log_lvl, " Off=0, Err=1, Info=2, Debug=3");
+MODULE_PARM_DESC(usnic_ib_share_vf, "Off=0, On=1 VF sharing amongst QPs");
+MODULE_DEVICE_TABLE(pci, usnic_ib_pci_ids);
+
+module_init(usnic_ib_init);
+module_exit(usnic_ib_destroy);
+/* End of module section */
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c
new file mode 100644
index 000000000..db3588df3
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c
@@ -0,0 +1,761 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/bug.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+
+#include "usnic_log.h"
+#include "usnic_vnic.h"
+#include "usnic_fwd.h"
+#include "usnic_uiom.h"
+#include "usnic_debugfs.h"
+#include "usnic_ib_qp_grp.h"
+#include "usnic_ib_sysfs.h"
+#include "usnic_transport.h"
+
+#define DFLT_RQ_IDX 0
+
+const char *usnic_ib_qp_grp_state_to_string(enum ib_qp_state state)
+{
+ switch (state) {
+ case IB_QPS_RESET:
+ return "Rst";
+ case IB_QPS_INIT:
+ return "Init";
+ case IB_QPS_RTR:
+ return "RTR";
+ case IB_QPS_RTS:
+ return "RTS";
+ case IB_QPS_SQD:
+ return "SQD";
+ case IB_QPS_SQE:
+ return "SQE";
+ case IB_QPS_ERR:
+ return "ERR";
+	default:
+		return "UNKNOWN STATE";
+	}
+}
+
+int usnic_ib_qp_grp_dump_hdr(char *buf, int buf_sz)
+{
+ return scnprintf(buf, buf_sz, "|QPN\t|State\t|PID\t|VF Idx\t|Fil ID");
+}
+
+int usnic_ib_qp_grp_dump_rows(void *obj, char *buf, int buf_sz)
+{
+ struct usnic_ib_qp_grp *qp_grp = obj;
+ struct usnic_ib_qp_grp_flow *default_flow;
+ if (obj) {
+ default_flow = list_first_entry(&qp_grp->flows_lst,
+ struct usnic_ib_qp_grp_flow, link);
+ return scnprintf(buf, buf_sz, "|%d\t|%s\t|%d\t|%hu\t|%d",
+ qp_grp->ibqp.qp_num,
+ usnic_ib_qp_grp_state_to_string(
+ qp_grp->state),
+ qp_grp->owner_pid,
+ usnic_vnic_get_index(qp_grp->vf->vnic),
+ default_flow->flow->flow_id);
+ } else {
+ return scnprintf(buf, buf_sz, "|N/A\t|N/A\t|N/A\t|N/A\t|N/A");
+ }
+}
+
+static struct usnic_vnic_res_chunk *
+get_qp_res_chunk(struct usnic_ib_qp_grp *qp_grp)
+{
+ lockdep_assert_held(&qp_grp->lock);
+ /*
+ * The QP res chunk, used to derive qp indices,
+ * are just indices of the RQs
+ */
+ return usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_RQ);
+}
+
+static int enable_qp_grp(struct usnic_ib_qp_grp *qp_grp)
+{
+ int status;
+ int i, vnic_idx;
+ struct usnic_vnic_res_chunk *res_chunk;
+ struct usnic_vnic_res *res;
+
+ lockdep_assert_held(&qp_grp->lock);
+
+ vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic);
+
+ res_chunk = get_qp_res_chunk(qp_grp);
+ if (IS_ERR_OR_NULL(res_chunk)) {
+ usnic_err("Unable to get qp res with err %ld\n",
+ PTR_ERR(res_chunk));
+ return res_chunk ? PTR_ERR(res_chunk) : -ENOMEM;
+ }
+
+ for (i = 0; i < res_chunk->cnt; i++) {
+ res = res_chunk->res[i];
+ status = usnic_fwd_enable_qp(qp_grp->ufdev, vnic_idx,
+ res->vnic_idx);
+ if (status) {
+			usnic_err("Failed to enable qp %d of %s:%d with err %d\n",
+ res->vnic_idx, qp_grp->ufdev->name,
+ vnic_idx, status);
+ goto out_err;
+ }
+ }
+
+ return 0;
+
+out_err:
+ for (i--; i >= 0; i--) {
+ res = res_chunk->res[i];
+ usnic_fwd_disable_qp(qp_grp->ufdev, vnic_idx,
+ res->vnic_idx);
+ }
+
+ return status;
+}
+
+static int disable_qp_grp(struct usnic_ib_qp_grp *qp_grp)
+{
+ int i, vnic_idx;
+ struct usnic_vnic_res_chunk *res_chunk;
+ struct usnic_vnic_res *res;
+ int status = 0;
+
+ lockdep_assert_held(&qp_grp->lock);
+ vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic);
+
+ res_chunk = get_qp_res_chunk(qp_grp);
+ if (IS_ERR_OR_NULL(res_chunk)) {
+ usnic_err("Unable to get qp res with err %ld\n",
+ PTR_ERR(res_chunk));
+ return res_chunk ? PTR_ERR(res_chunk) : -ENOMEM;
+ }
+
+ for (i = 0; i < res_chunk->cnt; i++) {
+ res = res_chunk->res[i];
+ status = usnic_fwd_disable_qp(qp_grp->ufdev, vnic_idx,
+ res->vnic_idx);
+ if (status) {
+			usnic_err("Failed to disable rq %d of %s:%d with err %d\n",
+ res->vnic_idx,
+ qp_grp->ufdev->name,
+ vnic_idx, status);
+ }
+ }
+
+ return status;
+}
+
+static int init_filter_action(struct usnic_ib_qp_grp *qp_grp,
+ struct usnic_filter_action *uaction)
+{
+ struct usnic_vnic_res_chunk *res_chunk;
+
+ res_chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_RQ);
+ if (IS_ERR_OR_NULL(res_chunk)) {
+ usnic_err("Unable to get %s with err %ld\n",
+ usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_RQ),
+ PTR_ERR(res_chunk));
+ return res_chunk ? PTR_ERR(res_chunk) : -ENOMEM;
+ }
+
+ uaction->vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic);
+ uaction->action.type = FILTER_ACTION_RQ_STEERING;
+ uaction->action.u.rq_idx = res_chunk->res[DFLT_RQ_IDX]->vnic_idx;
+
+ return 0;
+}
+
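+/*
+ * Reserve the transport port requested in trans_spec, program a
+ * usNIC-ID filter that steers matching traffic to the default RQ, and
+ * wrap both in a qp_flow handle.
+ */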
+static struct usnic_ib_qp_grp_flow*
+create_roce_custom_flow(struct usnic_ib_qp_grp *qp_grp,
+ struct usnic_transport_spec *trans_spec)
+{
+ uint16_t port_num;
+ int err;
+ struct filter filter;
+ struct usnic_filter_action uaction;
+ struct usnic_ib_qp_grp_flow *qp_flow;
+ struct usnic_fwd_flow *flow;
+ enum usnic_transport_type trans_type;
+
+ trans_type = trans_spec->trans_type;
+ port_num = trans_spec->usnic_roce.port_num;
+
+ /* Reserve Port */
+ port_num = usnic_transport_rsrv_port(trans_type, port_num);
+ if (port_num == 0)
+ return ERR_PTR(-EINVAL);
+
+ /* Create Flow */
+ usnic_fwd_init_usnic_filter(&filter, port_num);
+ err = init_filter_action(qp_grp, &uaction);
+ if (err)
+ goto out_unreserve_port;
+
+ flow = usnic_fwd_alloc_flow(qp_grp->ufdev, &filter, &uaction);
+ if (IS_ERR_OR_NULL(flow)) {
+ usnic_err("Unable to alloc flow failed with err %ld\n",
+ PTR_ERR(flow));
+ err = flow ? PTR_ERR(flow) : -EFAULT;
+ goto out_unreserve_port;
+ }
+
+ /* Create Flow Handle */
+ qp_flow = kzalloc(sizeof(*qp_flow), GFP_ATOMIC);
+ if (IS_ERR_OR_NULL(qp_flow)) {
+ err = qp_flow ? PTR_ERR(qp_flow) : -ENOMEM;
+ goto out_dealloc_flow;
+ }
+ qp_flow->flow = flow;
+ qp_flow->trans_type = trans_type;
+ qp_flow->usnic_roce.port_num = port_num;
+ qp_flow->qp_grp = qp_grp;
+ return qp_flow;
+
+out_dealloc_flow:
+ usnic_fwd_dealloc_flow(flow);
+out_unreserve_port:
+ usnic_transport_unrsrv_port(trans_type, port_num);
+ return ERR_PTR(err);
+}
+
+static void release_roce_custom_flow(struct usnic_ib_qp_grp_flow *qp_flow)
+{
+ usnic_fwd_dealloc_flow(qp_flow->flow);
+ usnic_transport_unrsrv_port(qp_flow->trans_type,
+ qp_flow->usnic_roce.port_num);
+ kfree(qp_flow);
+}
+
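+/*
+ * Build a UDP flow from a caller-supplied socket fd: verify the socket
+ * is UDP, read back its bound address and port, and program a matching
+ * IPv4 5-tuple filter that steers to the default RQ. The socket
+ * reference is held for the lifetime of the flow.
+ */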
+static struct usnic_ib_qp_grp_flow*
+create_udp_flow(struct usnic_ib_qp_grp *qp_grp,
+ struct usnic_transport_spec *trans_spec)
+{
+ struct socket *sock;
+ int sock_fd;
+ int err;
+ struct filter filter;
+ struct usnic_filter_action uaction;
+ struct usnic_ib_qp_grp_flow *qp_flow;
+ struct usnic_fwd_flow *flow;
+ enum usnic_transport_type trans_type;
+ uint32_t addr;
+ uint16_t port_num;
+ int proto;
+
+ trans_type = trans_spec->trans_type;
+ sock_fd = trans_spec->udp.sock_fd;
+
+ /* Get and check socket */
+ sock = usnic_transport_get_socket(sock_fd);
+ if (IS_ERR_OR_NULL(sock))
+ return ERR_CAST(sock);
+
+ err = usnic_transport_sock_get_addr(sock, &proto, &addr, &port_num);
+ if (err)
+ goto out_put_sock;
+
+ if (proto != IPPROTO_UDP) {
+ usnic_err("Protocol for fd %d is not UDP", sock_fd);
+ err = -EPERM;
+ goto out_put_sock;
+ }
+
+ /* Create flow */
+ usnic_fwd_init_udp_filter(&filter, addr, port_num);
+ err = init_filter_action(qp_grp, &uaction);
+ if (err)
+ goto out_put_sock;
+
+ flow = usnic_fwd_alloc_flow(qp_grp->ufdev, &filter, &uaction);
+ if (IS_ERR_OR_NULL(flow)) {
+ usnic_err("Unable to alloc flow failed with err %ld\n",
+ PTR_ERR(flow));
+ err = flow ? PTR_ERR(flow) : -EFAULT;
+ goto out_put_sock;
+ }
+
+ /* Create qp_flow */
+ qp_flow = kzalloc(sizeof(*qp_flow), GFP_ATOMIC);
+ if (IS_ERR_OR_NULL(qp_flow)) {
+ err = qp_flow ? PTR_ERR(qp_flow) : -ENOMEM;
+ goto out_dealloc_flow;
+ }
+ qp_flow->flow = flow;
+ qp_flow->trans_type = trans_type;
+ qp_flow->udp.sock = sock;
+ qp_flow->qp_grp = qp_grp;
+ return qp_flow;
+
+out_dealloc_flow:
+ usnic_fwd_dealloc_flow(flow);
+out_put_sock:
+ usnic_transport_put_socket(sock);
+ return ERR_PTR(err);
+}
+
+static void release_udp_flow(struct usnic_ib_qp_grp_flow *qp_flow)
+{
+ usnic_fwd_dealloc_flow(qp_flow->flow);
+ usnic_transport_put_socket(qp_flow->udp.sock);
+ kfree(qp_flow);
+}
+
+static struct usnic_ib_qp_grp_flow*
+create_and_add_flow(struct usnic_ib_qp_grp *qp_grp,
+ struct usnic_transport_spec *trans_spec)
+{
+ struct usnic_ib_qp_grp_flow *qp_flow;
+ enum usnic_transport_type trans_type;
+
+ trans_type = trans_spec->trans_type;
+ switch (trans_type) {
+ case USNIC_TRANSPORT_ROCE_CUSTOM:
+ qp_flow = create_roce_custom_flow(qp_grp, trans_spec);
+ break;
+ case USNIC_TRANSPORT_IPV4_UDP:
+ qp_flow = create_udp_flow(qp_grp, trans_spec);
+ break;
+ default:
+ usnic_err("Unsupported transport %u\n",
+ trans_spec->trans_type);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (!IS_ERR_OR_NULL(qp_flow)) {
+ list_add_tail(&qp_flow->link, &qp_grp->flows_lst);
+ usnic_debugfs_flow_add(qp_flow);
+ }
+
+ return qp_flow;
+}
+
+static void release_and_remove_flow(struct usnic_ib_qp_grp_flow *qp_flow)
+{
+ usnic_debugfs_flow_remove(qp_flow);
+ list_del(&qp_flow->link);
+
+ switch (qp_flow->trans_type) {
+ case USNIC_TRANSPORT_ROCE_CUSTOM:
+ release_roce_custom_flow(qp_flow);
+ break;
+ case USNIC_TRANSPORT_IPV4_UDP:
+ release_udp_flow(qp_flow);
+ break;
+ default:
+ WARN(1, "Unsupported transport %u\n",
+ qp_flow->trans_type);
+ break;
+ }
+}
+
+static void release_and_remove_all_flows(struct usnic_ib_qp_grp *qp_grp)
+{
+ struct usnic_ib_qp_grp_flow *qp_flow, *tmp;
+ list_for_each_entry_safe(qp_flow, tmp, &qp_grp->flows_lst, link)
+ release_and_remove_flow(qp_flow);
+}
+
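+/*
+ * QP group state machine: RESET<->INIT adds or removes classifier
+ * flows (data, when set, is a struct usnic_transport_spec), INIT->RTR
+ * enables the RQs, and moving into ERR disables/releases whatever is
+ * active and reports IB_EVENT_QP_FATAL to the consumer.
+ */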
+int usnic_ib_qp_grp_modify(struct usnic_ib_qp_grp *qp_grp,
+ enum ib_qp_state new_state,
+ void *data)
+{
+ int status = 0;
+ int vnic_idx;
+ struct ib_event ib_event;
+ enum ib_qp_state old_state;
+ struct usnic_transport_spec *trans_spec;
+ struct usnic_ib_qp_grp_flow *qp_flow;
+
+ old_state = qp_grp->state;
+ vnic_idx = usnic_vnic_get_index(qp_grp->vf->vnic);
+ trans_spec = (struct usnic_transport_spec *) data;
+
+ spin_lock(&qp_grp->lock);
+ switch (new_state) {
+ case IB_QPS_RESET:
+ switch (old_state) {
+ case IB_QPS_RESET:
+ /* NO-OP */
+ break;
+ case IB_QPS_INIT:
+ release_and_remove_all_flows(qp_grp);
+ status = 0;
+ break;
+ case IB_QPS_RTR:
+ case IB_QPS_RTS:
+ case IB_QPS_ERR:
+ status = disable_qp_grp(qp_grp);
+ release_and_remove_all_flows(qp_grp);
+ break;
+ default:
+ status = -EINVAL;
+ }
+ break;
+ case IB_QPS_INIT:
+ switch (old_state) {
+ case IB_QPS_RESET:
+ if (trans_spec) {
+ qp_flow = create_and_add_flow(qp_grp,
+ trans_spec);
+ if (IS_ERR_OR_NULL(qp_flow)) {
+ status = qp_flow ? PTR_ERR(qp_flow) : -EFAULT;
+ break;
+ }
+ } else {
+ /*
+ * Optional to specify filters.
+ */
+ status = 0;
+ }
+ break;
+ case IB_QPS_INIT:
+ if (trans_spec) {
+ qp_flow = create_and_add_flow(qp_grp,
+ trans_spec);
+ if (IS_ERR_OR_NULL(qp_flow)) {
+ status = qp_flow ? PTR_ERR(qp_flow) : -EFAULT;
+ break;
+ }
+ } else {
+ /*
+ * Doesn't make sense to go into INIT state
+ * from INIT state w/o adding filters.
+ */
+ status = -EINVAL;
+ }
+ break;
+ case IB_QPS_RTR:
+ status = disable_qp_grp(qp_grp);
+ break;
+ case IB_QPS_RTS:
+ status = disable_qp_grp(qp_grp);
+ break;
+ default:
+ status = -EINVAL;
+ }
+ break;
+ case IB_QPS_RTR:
+ switch (old_state) {
+ case IB_QPS_INIT:
+ status = enable_qp_grp(qp_grp);
+ break;
+ default:
+ status = -EINVAL;
+ }
+ break;
+ case IB_QPS_RTS:
+ switch (old_state) {
+ case IB_QPS_RTR:
+ /* NO-OP FOR NOW */
+ break;
+ default:
+ status = -EINVAL;
+ }
+ break;
+ case IB_QPS_ERR:
+ ib_event.device = &qp_grp->vf->pf->ib_dev;
+ ib_event.element.qp = &qp_grp->ibqp;
+ ib_event.event = IB_EVENT_QP_FATAL;
+
+ switch (old_state) {
+ case IB_QPS_RESET:
+ qp_grp->ibqp.event_handler(&ib_event,
+ qp_grp->ibqp.qp_context);
+ break;
+ case IB_QPS_INIT:
+ release_and_remove_all_flows(qp_grp);
+ qp_grp->ibqp.event_handler(&ib_event,
+ qp_grp->ibqp.qp_context);
+ break;
+ case IB_QPS_RTR:
+ case IB_QPS_RTS:
+ status = disable_qp_grp(qp_grp);
+ release_and_remove_all_flows(qp_grp);
+ qp_grp->ibqp.event_handler(&ib_event,
+ qp_grp->ibqp.qp_context);
+ break;
+ default:
+ status = -EINVAL;
+ }
+ break;
+ default:
+ status = -EINVAL;
+ }
+ spin_unlock(&qp_grp->lock);
+
+ if (!status) {
+ qp_grp->state = new_state;
+		usnic_info("Transitioned %u from %s to %s",
+ qp_grp->grp_id,
+ usnic_ib_qp_grp_state_to_string(old_state),
+ usnic_ib_qp_grp_state_to_string(new_state));
+ } else {
+ usnic_err("Failed to transition %u from %s to %s",
+ qp_grp->grp_id,
+ usnic_ib_qp_grp_state_to_string(old_state),
+ usnic_ib_qp_grp_state_to_string(new_state));
+ }
+
+ return status;
+}
+
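+/*
+ * Walk the EOL-terminated resource spec and grab each requested chunk
+ * of vnic resources, returning a NULL-terminated array of chunks (the
+ * extra trailing slot of the allocation stays zeroed).
+ */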
+static struct usnic_vnic_res_chunk**
+alloc_res_chunk_list(struct usnic_vnic *vnic,
+ struct usnic_vnic_res_spec *res_spec, void *owner_obj)
+{
+ enum usnic_vnic_res_type res_type;
+ struct usnic_vnic_res_chunk **res_chunk_list;
+ int err, i, res_cnt, res_lst_sz;
+
+ for (res_lst_sz = 0;
+ res_spec->resources[res_lst_sz].type != USNIC_VNIC_RES_TYPE_EOL;
+ res_lst_sz++) {
+ /* Do Nothing */
+ }
+
+ res_chunk_list = kzalloc(sizeof(*res_chunk_list)*(res_lst_sz+1),
+ GFP_ATOMIC);
+ if (!res_chunk_list)
+ return ERR_PTR(-ENOMEM);
+
+ for (i = 0; res_spec->resources[i].type != USNIC_VNIC_RES_TYPE_EOL;
+ i++) {
+ res_type = res_spec->resources[i].type;
+ res_cnt = res_spec->resources[i].cnt;
+
+ res_chunk_list[i] = usnic_vnic_get_resources(vnic, res_type,
+ res_cnt, owner_obj);
+ if (IS_ERR_OR_NULL(res_chunk_list[i])) {
+ err = res_chunk_list[i] ?
+ PTR_ERR(res_chunk_list[i]) : -ENOMEM;
+ usnic_err("Failed to get %s from %s with err %d\n",
+ usnic_vnic_res_type_to_str(res_type),
+ usnic_vnic_pci_name(vnic),
+ err);
+ goto out_free_res;
+ }
+ }
+
+ return res_chunk_list;
+
+out_free_res:
+	/* unwind every chunk acquired so far, including index 0 */
+	for (i--; i >= 0; i--)
+ usnic_vnic_put_resources(res_chunk_list[i]);
+ kfree(res_chunk_list);
+ return ERR_PTR(err);
+}
+
+static void free_qp_grp_res(struct usnic_vnic_res_chunk **res_chunk_list)
+{
+ int i;
+ for (i = 0; res_chunk_list[i]; i++)
+ usnic_vnic_put_resources(res_chunk_list[i]);
+ kfree(res_chunk_list);
+}
+
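+/*
+ * The first QP group on a VF attaches the VF's PCI device to the PD's
+ * IOMMU domain; later groups only bump qp_grp_ref_cnt and must use the
+ * same PD.
+ */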
+static int qp_grp_and_vf_bind(struct usnic_ib_vf *vf,
+ struct usnic_ib_pd *pd,
+ struct usnic_ib_qp_grp *qp_grp)
+{
+ int err;
+ struct pci_dev *pdev;
+
+ lockdep_assert_held(&vf->lock);
+
+ pdev = usnic_vnic_get_pdev(vf->vnic);
+ if (vf->qp_grp_ref_cnt == 0) {
+ err = usnic_uiom_attach_dev_to_pd(pd->umem_pd, &pdev->dev);
+ if (err) {
+ usnic_err("Failed to attach %s to domain\n",
+ pci_name(pdev));
+ return err;
+ }
+ vf->pd = pd;
+ }
+ vf->qp_grp_ref_cnt++;
+
+ WARN_ON(vf->pd != pd);
+ qp_grp->vf = vf;
+
+ return 0;
+}
+
+static void qp_grp_and_vf_unbind(struct usnic_ib_qp_grp *qp_grp)
+{
+ struct pci_dev *pdev;
+ struct usnic_ib_pd *pd;
+
+ lockdep_assert_held(&qp_grp->vf->lock);
+
+ pd = qp_grp->vf->pd;
+ pdev = usnic_vnic_get_pdev(qp_grp->vf->vnic);
+ if (--qp_grp->vf->qp_grp_ref_cnt == 0) {
+ qp_grp->vf->pd = NULL;
+ usnic_uiom_detach_dev_from_pd(pd->umem_pd, &pdev->dev);
+ }
+ qp_grp->vf = NULL;
+}
+
+static void log_spec(struct usnic_vnic_res_spec *res_spec)
+{
+ char buf[512];
+ usnic_vnic_spec_dump(buf, sizeof(buf), res_spec);
+ usnic_dbg("%s\n", buf);
+}
+
+static int qp_grp_id_from_flow(struct usnic_ib_qp_grp_flow *qp_flow,
+ uint32_t *id)
+{
+ enum usnic_transport_type trans_type = qp_flow->trans_type;
+ int err;
+ uint16_t port_num = 0;
+
+ switch (trans_type) {
+ case USNIC_TRANSPORT_ROCE_CUSTOM:
+ *id = qp_flow->usnic_roce.port_num;
+ break;
+ case USNIC_TRANSPORT_IPV4_UDP:
+ err = usnic_transport_sock_get_addr(qp_flow->udp.sock,
+ NULL, NULL,
+ &port_num);
+ if (err)
+ return err;
+ /*
+ * Copy port_num to stack first and then to *id,
+ * so that the short to int cast works for little
+ * and big endian systems.
+ */
+ *id = port_num;
+ break;
+ default:
+ usnic_err("Unsupported transport %u\n", trans_type);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+struct usnic_ib_qp_grp *
+usnic_ib_qp_grp_create(struct usnic_fwd_dev *ufdev, struct usnic_ib_vf *vf,
+ struct usnic_ib_pd *pd,
+ struct usnic_vnic_res_spec *res_spec,
+ struct usnic_transport_spec *transport_spec)
+{
+ struct usnic_ib_qp_grp *qp_grp;
+ int err;
+ enum usnic_transport_type transport = transport_spec->trans_type;
+ struct usnic_ib_qp_grp_flow *qp_flow;
+
+ lockdep_assert_held(&vf->lock);
+
+ err = usnic_vnic_res_spec_satisfied(&min_transport_spec[transport],
+ res_spec);
+ if (err) {
+		usnic_err("Spec does not meet minimum req for transport %d\n",
+ transport);
+ log_spec(res_spec);
+ return ERR_PTR(err);
+ }
+
+ qp_grp = kzalloc(sizeof(*qp_grp), GFP_ATOMIC);
+ if (!qp_grp) {
+ usnic_err("Unable to alloc qp_grp - Out of memory\n");
+ return NULL;
+ }
+
+ qp_grp->res_chunk_list = alloc_res_chunk_list(vf->vnic, res_spec,
+ qp_grp);
+ if (IS_ERR_OR_NULL(qp_grp->res_chunk_list)) {
+ err = qp_grp->res_chunk_list ?
+ PTR_ERR(qp_grp->res_chunk_list) : -ENOMEM;
+ usnic_err("Unable to alloc res for %d with err %d\n",
+ qp_grp->grp_id, err);
+ goto out_free_qp_grp;
+ }
+
+ err = qp_grp_and_vf_bind(vf, pd, qp_grp);
+ if (err)
+ goto out_free_res;
+
+ INIT_LIST_HEAD(&qp_grp->flows_lst);
+ spin_lock_init(&qp_grp->lock);
+ qp_grp->ufdev = ufdev;
+ qp_grp->state = IB_QPS_RESET;
+ qp_grp->owner_pid = current->pid;
+
+ qp_flow = create_and_add_flow(qp_grp, transport_spec);
+ if (IS_ERR_OR_NULL(qp_flow)) {
+ usnic_err("Unable to create and add flow with err %ld\n",
+ PTR_ERR(qp_flow));
+ err = qp_flow ? PTR_ERR(qp_flow) : -EFAULT;
+ goto out_qp_grp_vf_unbind;
+ }
+
+ err = qp_grp_id_from_flow(qp_flow, &qp_grp->grp_id);
+ if (err)
+ goto out_release_flow;
+ qp_grp->ibqp.qp_num = qp_grp->grp_id;
+
+ usnic_ib_sysfs_qpn_add(qp_grp);
+
+ return qp_grp;
+
+out_release_flow:
+ release_and_remove_flow(qp_flow);
+out_qp_grp_vf_unbind:
+ qp_grp_and_vf_unbind(qp_grp);
+out_free_res:
+ free_qp_grp_res(qp_grp->res_chunk_list);
+out_free_qp_grp:
+ kfree(qp_grp);
+
+ return ERR_PTR(err);
+}
+
+void usnic_ib_qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp)
+{
+
+ WARN_ON(qp_grp->state != IB_QPS_RESET);
+ lockdep_assert_held(&qp_grp->vf->lock);
+
+ release_and_remove_all_flows(qp_grp);
+ usnic_ib_sysfs_qpn_remove(qp_grp);
+ qp_grp_and_vf_unbind(qp_grp);
+ free_qp_grp_res(qp_grp->res_chunk_list);
+ kfree(qp_grp);
+}
+
+struct usnic_vnic_res_chunk*
+usnic_ib_qp_grp_get_chunk(struct usnic_ib_qp_grp *qp_grp,
+ enum usnic_vnic_res_type res_type)
+{
+ int i;
+
+ for (i = 0; qp_grp->res_chunk_list[i]; i++) {
+ if (qp_grp->res_chunk_list[i]->type == res_type)
+ return qp_grp->res_chunk_list[i];
+ }
+
+ return ERR_PTR(-EINVAL);
+}
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h
new file mode 100644
index 000000000..b0aafe8db
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef USNIC_IB_QP_GRP_H_
+#define USNIC_IB_QP_GRP_H_
+
+#include <linux/debugfs.h>
+#include <rdma/ib_verbs.h>
+
+#include "usnic_ib.h"
+#include "usnic_abi.h"
+#include "usnic_fwd.h"
+#include "usnic_vnic.h"
+
+/*
+ * The qp group struct represents all the hw resources needed to present an ib_qp.
+ */
+struct usnic_ib_qp_grp {
+ struct ib_qp ibqp;
+ enum ib_qp_state state;
+ int grp_id;
+
+ struct usnic_fwd_dev *ufdev;
+ struct usnic_ib_ucontext *ctx;
+ struct list_head flows_lst;
+
+ struct usnic_vnic_res_chunk **res_chunk_list;
+
+ pid_t owner_pid;
+ struct usnic_ib_vf *vf;
+ struct list_head link;
+
+ spinlock_t lock;
+
+ struct kobject kobj;
+};
+
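+/*
+ * A single forwarding flow attached to a qp group. The union holds the
+ * transport-specific identity: a custom ROCE port or a UDP socket.
+ */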
+struct usnic_ib_qp_grp_flow {
+ struct usnic_fwd_flow *flow;
+ enum usnic_transport_type trans_type;
+ union {
+ struct {
+ uint16_t port_num;
+ } usnic_roce;
+ struct {
+ struct socket *sock;
+ } udp;
+ };
+ struct usnic_ib_qp_grp *qp_grp;
+ struct list_head link;
+
+ /* Debug FS */
+ struct dentry *dbgfs_dentry;
+ char dentry_name[32];
+};
+
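+/*
+ * Minimum vNIC resources (WQ/RQ/CQ) required per transport type; each
+ * list is terminated by an EOL entry.
+ */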
+static const struct
+usnic_vnic_res_spec min_transport_spec[USNIC_TRANSPORT_MAX] = {
+ { /*USNIC_TRANSPORT_UNKNOWN*/
+ .resources = {
+ {.type = USNIC_VNIC_RES_TYPE_EOL, .cnt = 0,},
+ },
+ },
+ { /*USNIC_TRANSPORT_ROCE_CUSTOM*/
+ .resources = {
+ {.type = USNIC_VNIC_RES_TYPE_WQ, .cnt = 1,},
+ {.type = USNIC_VNIC_RES_TYPE_RQ, .cnt = 1,},
+ {.type = USNIC_VNIC_RES_TYPE_CQ, .cnt = 1,},
+ {.type = USNIC_VNIC_RES_TYPE_EOL, .cnt = 0,},
+ },
+ },
+ { /*USNIC_TRANSPORT_IPV4_UDP*/
+ .resources = {
+ {.type = USNIC_VNIC_RES_TYPE_WQ, .cnt = 1,},
+ {.type = USNIC_VNIC_RES_TYPE_RQ, .cnt = 1,},
+ {.type = USNIC_VNIC_RES_TYPE_CQ, .cnt = 1,},
+ {.type = USNIC_VNIC_RES_TYPE_EOL, .cnt = 0,},
+ },
+ },
+};
+
+const char *usnic_ib_qp_grp_state_to_string(enum ib_qp_state state);
+int usnic_ib_qp_grp_dump_hdr(char *buf, int buf_sz);
+int usnic_ib_qp_grp_dump_rows(void *obj, char *buf, int buf_sz);
+struct usnic_ib_qp_grp *
+usnic_ib_qp_grp_create(struct usnic_fwd_dev *ufdev, struct usnic_ib_vf *vf,
+ struct usnic_ib_pd *pd,
+ struct usnic_vnic_res_spec *res_spec,
+ struct usnic_transport_spec *trans_spec);
+void usnic_ib_qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp);
+int usnic_ib_qp_grp_modify(struct usnic_ib_qp_grp *qp_grp,
+ enum ib_qp_state new_state,
+ void *data);
+struct usnic_vnic_res_chunk
+*usnic_ib_qp_grp_get_chunk(struct usnic_ib_qp_grp *qp_grp,
+ enum usnic_vnic_res_type type);
+static inline
+struct usnic_ib_qp_grp *to_uqp_grp(struct ib_qp *ibqp)
+{
+ return container_of(ibqp, struct usnic_ib_qp_grp, ibqp);
+}
+#endif /* USNIC_IB_QP_GRP_H_ */
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
new file mode 100644
index 000000000..27dc67c16
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.c
@@ -0,0 +1,341 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_addr.h>
+
+#include "usnic_common_util.h"
+#include "usnic_ib.h"
+#include "usnic_ib_qp_grp.h"
+#include "usnic_vnic.h"
+#include "usnic_ib_verbs.h"
+#include "usnic_log.h"
+
+static ssize_t usnic_ib_show_fw_ver(struct device *device,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct usnic_ib_dev *us_ibdev =
+ container_of(device, struct usnic_ib_dev, ib_dev.dev);
+ struct ethtool_drvinfo info;
+
+ mutex_lock(&us_ibdev->usdev_lock);
+ us_ibdev->netdev->ethtool_ops->get_drvinfo(us_ibdev->netdev, &info);
+ mutex_unlock(&us_ibdev->usdev_lock);
+
+ return scnprintf(buf, PAGE_SIZE, "%s\n", info.fw_version);
+}
+
+static ssize_t usnic_ib_show_board(struct device *device,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct usnic_ib_dev *us_ibdev =
+ container_of(device, struct usnic_ib_dev, ib_dev.dev);
+ unsigned short subsystem_device_id;
+
+ mutex_lock(&us_ibdev->usdev_lock);
+ subsystem_device_id = us_ibdev->pdev->subsystem_device;
+ mutex_unlock(&us_ibdev->usdev_lock);
+
+ return scnprintf(buf, PAGE_SIZE, "%hu\n", subsystem_device_id);
+}
+
+/*
+ * Report the configuration for this PF
+ */
+static ssize_t
+usnic_ib_show_config(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct usnic_ib_dev *us_ibdev;
+ char *ptr;
+ unsigned left;
+ unsigned n;
+ enum usnic_vnic_res_type res_type;
+
+ us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
+
+ /* Buffer space limit is 1 page */
+ ptr = buf;
+ left = PAGE_SIZE;
+
+ mutex_lock(&us_ibdev->usdev_lock);
+ if (atomic_read(&us_ibdev->vf_cnt.refcount) > 0) {
+ char *busname;
+
+ /*
+ * The bus name seems to come with an annoying prefix.
+ * Remove it if it is predictable.
+ */
+ busname = us_ibdev->pdev->bus->name;
+ if (strncmp(busname, "PCI Bus ", 8) == 0)
+ busname += 8;
+
+ n = scnprintf(ptr, left,
+ "%s: %s:%d.%d, %s, %pM, %u VFs\n Per VF:",
+ us_ibdev->ib_dev.name,
+ busname,
+ PCI_SLOT(us_ibdev->pdev->devfn),
+ PCI_FUNC(us_ibdev->pdev->devfn),
+ netdev_name(us_ibdev->netdev),
+ us_ibdev->ufdev->mac,
+ atomic_read(&us_ibdev->vf_cnt.refcount));
+ UPDATE_PTR_LEFT(n, ptr, left);
+
+ for (res_type = USNIC_VNIC_RES_TYPE_EOL;
+ res_type < USNIC_VNIC_RES_TYPE_MAX;
+ res_type++) {
+ if (us_ibdev->vf_res_cnt[res_type] == 0)
+ continue;
+ n = scnprintf(ptr, left, " %d %s%s",
+ us_ibdev->vf_res_cnt[res_type],
+ usnic_vnic_res_type_to_str(res_type),
+ (res_type < (USNIC_VNIC_RES_TYPE_MAX - 1)) ?
+ "," : "");
+ UPDATE_PTR_LEFT(n, ptr, left);
+ }
+ n = scnprintf(ptr, left, "\n");
+ UPDATE_PTR_LEFT(n, ptr, left);
+ } else {
+ n = scnprintf(ptr, left, "%s: no VFs\n",
+ us_ibdev->ib_dev.name);
+ UPDATE_PTR_LEFT(n, ptr, left);
+ }
+ mutex_unlock(&us_ibdev->usdev_lock);
+
+ return ptr - buf;
+}
+
+static ssize_t
+usnic_ib_show_iface(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct usnic_ib_dev *us_ibdev;
+
+ us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%s\n",
+ netdev_name(us_ibdev->netdev));
+}
+
+static ssize_t
+usnic_ib_show_max_vf(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct usnic_ib_dev *us_ibdev;
+
+ us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%u\n",
+ atomic_read(&us_ibdev->vf_cnt.refcount));
+}
+
+static ssize_t
+usnic_ib_show_qp_per_vf(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct usnic_ib_dev *us_ibdev;
+ int qp_per_vf;
+
+ us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
+ qp_per_vf = max(us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_WQ],
+ us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_RQ]);
+
+ return scnprintf(buf, PAGE_SIZE,
+ "%d\n", qp_per_vf);
+}
+
+static ssize_t
+usnic_ib_show_cq_per_vf(struct device *device, struct device_attribute *attr,
+ char *buf)
+{
+ struct usnic_ib_dev *us_ibdev;
+
+ us_ibdev = container_of(device, struct usnic_ib_dev, ib_dev.dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n",
+ us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ]);
+}
+
+static DEVICE_ATTR(fw_ver, S_IRUGO, usnic_ib_show_fw_ver, NULL);
+static DEVICE_ATTR(board_id, S_IRUGO, usnic_ib_show_board, NULL);
+static DEVICE_ATTR(config, S_IRUGO, usnic_ib_show_config, NULL);
+static DEVICE_ATTR(iface, S_IRUGO, usnic_ib_show_iface, NULL);
+static DEVICE_ATTR(max_vf, S_IRUGO, usnic_ib_show_max_vf, NULL);
+static DEVICE_ATTR(qp_per_vf, S_IRUGO, usnic_ib_show_qp_per_vf, NULL);
+static DEVICE_ATTR(cq_per_vf, S_IRUGO, usnic_ib_show_cq_per_vf, NULL);
+
+static struct device_attribute *usnic_class_attributes[] = {
+ &dev_attr_fw_ver,
+ &dev_attr_board_id,
+ &dev_attr_config,
+ &dev_attr_iface,
+ &dev_attr_max_vf,
+ &dev_attr_qp_per_vf,
+ &dev_attr_cq_per_vf,
+};
+
+struct qpn_attribute {
+ struct attribute attr;
+ ssize_t (*show)(struct usnic_ib_qp_grp *, char *buf);
+};
+
+/*
+ * Definitions for supporting QPN entries in sysfs
+ */
+static ssize_t
+usnic_ib_qpn_attr_show(struct kobject *kobj, struct attribute *attr, char *buf)
+{
+ struct usnic_ib_qp_grp *qp_grp;
+ struct qpn_attribute *qpn_attr;
+
+ qp_grp = container_of(kobj, struct usnic_ib_qp_grp, kobj);
+ qpn_attr = container_of(attr, struct qpn_attribute, attr);
+
+ return qpn_attr->show(qp_grp, buf);
+}
+
+static const struct sysfs_ops usnic_ib_qpn_sysfs_ops = {
+ .show = usnic_ib_qpn_attr_show
+};
+
+#define QPN_ATTR_RO(NAME) \
+struct qpn_attribute qpn_attr_##NAME = __ATTR_RO(NAME)
+
+static ssize_t context_show(struct usnic_ib_qp_grp *qp_grp, char *buf)
+{
+ return scnprintf(buf, PAGE_SIZE, "0x%p\n", qp_grp->ctx);
+}
+
+static ssize_t summary_show(struct usnic_ib_qp_grp *qp_grp, char *buf)
+{
+ int i, j, n;
+ int left;
+ char *ptr;
+ struct usnic_vnic_res_chunk *res_chunk;
+ struct usnic_vnic_res *vnic_res;
+
+ left = PAGE_SIZE;
+ ptr = buf;
+
+ n = scnprintf(ptr, left,
+ "QPN: %d State: (%s) PID: %u VF Idx: %hu ",
+ qp_grp->ibqp.qp_num,
+ usnic_ib_qp_grp_state_to_string(qp_grp->state),
+ qp_grp->owner_pid,
+ usnic_vnic_get_index(qp_grp->vf->vnic));
+ UPDATE_PTR_LEFT(n, ptr, left);
+
+ for (i = 0; qp_grp->res_chunk_list[i]; i++) {
+ res_chunk = qp_grp->res_chunk_list[i];
+ for (j = 0; j < res_chunk->cnt; j++) {
+ vnic_res = res_chunk->res[j];
+ n = scnprintf(ptr, left, "%s[%d] ",
+ usnic_vnic_res_type_to_str(vnic_res->type),
+ vnic_res->vnic_idx);
+ UPDATE_PTR_LEFT(n, ptr, left);
+ }
+ }
+
+ n = scnprintf(ptr, left, "\n");
+ UPDATE_PTR_LEFT(n, ptr, left);
+
+ return ptr - buf;
+}
+
+static QPN_ATTR_RO(context);
+static QPN_ATTR_RO(summary);
+
+static struct attribute *usnic_ib_qpn_default_attrs[] = {
+ &qpn_attr_context.attr,
+ &qpn_attr_summary.attr,
+ NULL
+};
+
+static struct kobj_type usnic_ib_qpn_type = {
+ .sysfs_ops = &usnic_ib_qpn_sysfs_ops,
+ .default_attrs = usnic_ib_qpn_default_attrs
+};
+
+int usnic_ib_sysfs_register_usdev(struct usnic_ib_dev *us_ibdev)
+{
+ int i;
+ int err;
+ for (i = 0; i < ARRAY_SIZE(usnic_class_attributes); ++i) {
+ err = device_create_file(&us_ibdev->ib_dev.dev,
+ usnic_class_attributes[i]);
+ if (err) {
+ usnic_err("Failed to create device file %d for %s eith err %d",
+ i, us_ibdev->ib_dev.name, err);
+ return -EINVAL;
+ }
+ }
+
+ /* create kernel object for looking at individual QPs */
+ kobject_get(&us_ibdev->ib_dev.dev.kobj);
+ us_ibdev->qpn_kobj = kobject_create_and_add("qpn",
+ &us_ibdev->ib_dev.dev.kobj);
+ if (us_ibdev->qpn_kobj == NULL) {
+ kobject_put(&us_ibdev->ib_dev.dev.kobj);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+void usnic_ib_sysfs_unregister_usdev(struct usnic_ib_dev *us_ibdev)
+{
+ int i;
+ for (i = 0; i < ARRAY_SIZE(usnic_class_attributes); ++i) {
+ device_remove_file(&us_ibdev->ib_dev.dev,
+ usnic_class_attributes[i]);
+ }
+
+ kobject_put(us_ibdev->qpn_kobj);
+}
+
+void usnic_ib_sysfs_qpn_add(struct usnic_ib_qp_grp *qp_grp)
+{
+ struct usnic_ib_dev *us_ibdev;
+ int err;
+
+ us_ibdev = qp_grp->vf->pf;
+
+ err = kobject_init_and_add(&qp_grp->kobj, &usnic_ib_qpn_type,
+ kobject_get(us_ibdev->qpn_kobj),
+ "%d", qp_grp->grp_id);
+ if (err) {
+ kobject_put(us_ibdev->qpn_kobj);
+ return;
+ }
+}
+
+void usnic_ib_sysfs_qpn_remove(struct usnic_ib_qp_grp *qp_grp)
+{
+ struct usnic_ib_dev *us_ibdev;
+
+ us_ibdev = qp_grp->vf->pf;
+
+ kobject_put(&qp_grp->kobj);
+ kobject_put(us_ibdev->qpn_kobj);
+}
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h
new file mode 100644
index 000000000..0d09b493c
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_ib_sysfs.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef USNIC_IB_SYSFS_H_
+#define USNIC_IB_SYSFS_H_
+
+#include "usnic_ib.h"
+
+int usnic_ib_sysfs_register_usdev(struct usnic_ib_dev *us_ibdev);
+void usnic_ib_sysfs_unregister_usdev(struct usnic_ib_dev *us_ibdev);
+void usnic_ib_sysfs_qpn_add(struct usnic_ib_qp_grp *qp_grp);
+void usnic_ib_sysfs_qpn_remove(struct usnic_ib_qp_grp *qp_grp);
+
+#endif /* !USNIC_IB_SYSFS_H_ */
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
new file mode 100644
index 000000000..53bd6a2d9
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c
@@ -0,0 +1,768 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/errno.h>
+
+#include <rdma/ib_user_verbs.h>
+#include <rdma/ib_addr.h>
+
+#include "usnic_abi.h"
+#include "usnic_ib.h"
+#include "usnic_common_util.h"
+#include "usnic_ib_qp_grp.h"
+#include "usnic_fwd.h"
+#include "usnic_log.h"
+#include "usnic_uiom.h"
+#include "usnic_transport.h"
+
+#define USNIC_DEFAULT_TRANSPORT USNIC_TRANSPORT_ROCE_CUSTOM
+
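+/* Note: only the first byte of the firmware version string is encoded. */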
+static void usnic_ib_fw_string_to_u64(char *fw_ver_str, u64 *fw_ver)
+{
+ *fw_ver = (u64) *fw_ver_str;
+}
+
+static int usnic_ib_fill_create_qp_resp(struct usnic_ib_qp_grp *qp_grp,
+ struct ib_udata *udata)
+{
+ struct usnic_ib_dev *us_ibdev;
+ struct usnic_ib_create_qp_resp resp;
+ struct pci_dev *pdev;
+ struct vnic_dev_bar *bar;
+ struct usnic_vnic_res_chunk *chunk;
+ struct usnic_ib_qp_grp_flow *default_flow;
+ int i, err;
+
+ memset(&resp, 0, sizeof(resp));
+
+ us_ibdev = qp_grp->vf->pf;
+ pdev = usnic_vnic_get_pdev(qp_grp->vf->vnic);
+ if (!pdev) {
+ usnic_err("Failed to get pdev of qp_grp %d\n",
+ qp_grp->grp_id);
+ return -EFAULT;
+ }
+
+ bar = usnic_vnic_get_bar(qp_grp->vf->vnic, 0);
+ if (!bar) {
+ usnic_err("Failed to get bar0 of qp_grp %d vf %s",
+ qp_grp->grp_id, pci_name(pdev));
+ return -EFAULT;
+ }
+
+ resp.vfid = usnic_vnic_get_index(qp_grp->vf->vnic);
+ resp.bar_bus_addr = bar->bus_addr;
+ resp.bar_len = bar->len;
+
+ chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_RQ);
+ if (IS_ERR_OR_NULL(chunk)) {
+ usnic_err("Failed to get chunk %s for qp_grp %d with err %ld\n",
+ usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_RQ),
+ qp_grp->grp_id,
+ PTR_ERR(chunk));
+ return chunk ? PTR_ERR(chunk) : -ENOMEM;
+ }
+
+ WARN_ON(chunk->type != USNIC_VNIC_RES_TYPE_RQ);
+ resp.rq_cnt = chunk->cnt;
+ for (i = 0; i < chunk->cnt; i++)
+ resp.rq_idx[i] = chunk->res[i]->vnic_idx;
+
+ chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_WQ);
+ if (IS_ERR_OR_NULL(chunk)) {
+ usnic_err("Failed to get chunk %s for qp_grp %d with err %ld\n",
+ usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_WQ),
+ qp_grp->grp_id,
+ PTR_ERR(chunk));
+ return chunk ? PTR_ERR(chunk) : -ENOMEM;
+ }
+
+ WARN_ON(chunk->type != USNIC_VNIC_RES_TYPE_WQ);
+ resp.wq_cnt = chunk->cnt;
+ for (i = 0; i < chunk->cnt; i++)
+ resp.wq_idx[i] = chunk->res[i]->vnic_idx;
+
+ chunk = usnic_ib_qp_grp_get_chunk(qp_grp, USNIC_VNIC_RES_TYPE_CQ);
+ if (IS_ERR_OR_NULL(chunk)) {
+ usnic_err("Failed to get chunk %s for qp_grp %d with err %ld\n",
+ usnic_vnic_res_type_to_str(USNIC_VNIC_RES_TYPE_CQ),
+ qp_grp->grp_id,
+ PTR_ERR(chunk));
+ return chunk ? PTR_ERR(chunk) : -ENOMEM;
+ }
+
+ WARN_ON(chunk->type != USNIC_VNIC_RES_TYPE_CQ);
+ resp.cq_cnt = chunk->cnt;
+ for (i = 0; i < chunk->cnt; i++)
+ resp.cq_idx[i] = chunk->res[i]->vnic_idx;
+
+ default_flow = list_first_entry(&qp_grp->flows_lst,
+ struct usnic_ib_qp_grp_flow, link);
+ resp.transport = default_flow->trans_type;
+
+ err = ib_copy_to_udata(udata, &resp, sizeof(resp));
+ if (err) {
+ usnic_err("Failed to copy udata for %s", us_ibdev->ib_dev.name);
+ return err;
+ }
+
+ return 0;
+}
+
+static struct usnic_ib_qp_grp*
+find_free_vf_and_create_qp_grp(struct usnic_ib_dev *us_ibdev,
+ struct usnic_ib_pd *pd,
+ struct usnic_transport_spec *trans_spec,
+ struct usnic_vnic_res_spec *res_spec)
+{
+ struct usnic_ib_vf *vf;
+ struct usnic_vnic *vnic;
+ struct usnic_ib_qp_grp *qp_grp;
+ struct device *dev, **dev_list;
+ int i, found = 0;
+
+ BUG_ON(!mutex_is_locked(&us_ibdev->usdev_lock));
+
+ if (list_empty(&us_ibdev->vf_dev_list)) {
+ usnic_info("No vfs to allocate\n");
+ return NULL;
+ }
+
+ if (usnic_ib_share_vf) {
+ /* Try to find resources on a used vf that is in the pd */
+ dev_list = usnic_uiom_get_dev_list(pd->umem_pd);
+ for (i = 0; dev_list[i]; i++) {
+ dev = dev_list[i];
+ vf = pci_get_drvdata(to_pci_dev(dev));
+ spin_lock(&vf->lock);
+ vnic = vf->vnic;
+ if (!usnic_vnic_check_room(vnic, res_spec)) {
+ usnic_dbg("Found used vnic %s from %s\n",
+ us_ibdev->ib_dev.name,
+ pci_name(usnic_vnic_get_pdev(
+ vnic)));
+ found = 1;
+ break;
+ }
+ spin_unlock(&vf->lock);
+
+ }
+ usnic_uiom_free_dev_list(dev_list);
+ }
+
+ if (!found) {
+ /* Try to find resources on an unused vf */
+ list_for_each_entry(vf, &us_ibdev->vf_dev_list, link) {
+ spin_lock(&vf->lock);
+ vnic = vf->vnic;
+ if (vf->qp_grp_ref_cnt == 0 &&
+ usnic_vnic_check_room(vnic, res_spec) == 0) {
+ found = 1;
+ break;
+ }
+ spin_unlock(&vf->lock);
+ }
+ }
+
+ if (!found) {
+ usnic_info("No free qp grp found on %s\n",
+ us_ibdev->ib_dev.name);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ qp_grp = usnic_ib_qp_grp_create(us_ibdev->ufdev, vf, pd, res_spec,
+ trans_spec);
+ spin_unlock(&vf->lock);
+ if (IS_ERR_OR_NULL(qp_grp)) {
+ usnic_err("Failed to allocate qp_grp\n");
+ return ERR_PTR(qp_grp ? PTR_ERR(qp_grp) : -ENOMEM);
+ }
+
+ return qp_grp;
+}
+
+static void qp_grp_destroy(struct usnic_ib_qp_grp *qp_grp)
+{
+ struct usnic_ib_vf *vf = qp_grp->vf;
+
+ WARN_ON(qp_grp->state != IB_QPS_RESET);
+
+ spin_lock(&vf->lock);
+ usnic_ib_qp_grp_destroy(qp_grp);
+ spin_unlock(&vf->lock);
+}
+
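+/*
+ * Map the ethtool link speed (in Mb/s) to the closest IB
+ * (active_width, active_speed) pair for port reporting.
+ */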
+static void eth_speed_to_ib_speed(int speed, u8 *active_speed,
+ u8 *active_width)
+{
+ if (speed <= 10000) {
+ *active_width = IB_WIDTH_1X;
+ *active_speed = IB_SPEED_FDR10;
+ } else if (speed <= 20000) {
+ *active_width = IB_WIDTH_4X;
+ *active_speed = IB_SPEED_DDR;
+ } else if (speed <= 30000) {
+ *active_width = IB_WIDTH_4X;
+ *active_speed = IB_SPEED_QDR;
+ } else if (speed <= 40000) {
+ *active_width = IB_WIDTH_4X;
+ *active_speed = IB_SPEED_FDR10;
+ } else {
+ *active_width = IB_WIDTH_4X;
+ *active_speed = IB_SPEED_EDR;
+ }
+}
+
+static int create_qp_validate_user_data(struct usnic_ib_create_qp_cmd cmd)
+{
+ if (cmd.spec.trans_type <= USNIC_TRANSPORT_UNKNOWN ||
+ cmd.spec.trans_type >= USNIC_TRANSPORT_MAX)
+ return -EINVAL;
+
+ return 0;
+}
+
+/* Start of ib callback functions */
+
+enum rdma_link_layer usnic_ib_port_link_layer(struct ib_device *device,
+ u8 port_num)
+{
+ return IB_LINK_LAYER_ETHERNET;
+}
+
+int usnic_ib_query_device(struct ib_device *ibdev,
+ struct ib_device_attr *props)
+{
+ struct usnic_ib_dev *us_ibdev = to_usdev(ibdev);
+ union ib_gid gid;
+ struct ethtool_drvinfo info;
+ struct ethtool_cmd cmd;
+ int qp_per_vf;
+
+ usnic_dbg("\n");
+ mutex_lock(&us_ibdev->usdev_lock);
+ us_ibdev->netdev->ethtool_ops->get_drvinfo(us_ibdev->netdev, &info);
+ us_ibdev->netdev->ethtool_ops->get_settings(us_ibdev->netdev, &cmd);
+ memset(props, 0, sizeof(*props));
+ usnic_mac_ip_to_gid(us_ibdev->ufdev->mac, us_ibdev->ufdev->inaddr,
+ &gid.raw[0]);
+ memcpy(&props->sys_image_guid, &gid.global.interface_id,
+ sizeof(gid.global.interface_id));
+ usnic_ib_fw_string_to_u64(&info.fw_version[0], &props->fw_ver);
+ props->max_mr_size = USNIC_UIOM_MAX_MR_SIZE;
+ props->page_size_cap = USNIC_UIOM_PAGE_SIZE;
+ props->vendor_id = PCI_VENDOR_ID_CISCO;
+ props->vendor_part_id = PCI_DEVICE_ID_CISCO_VIC_USPACE_NIC;
+ props->hw_ver = us_ibdev->pdev->subsystem_device;
+ qp_per_vf = max(us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_WQ],
+ us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_RQ]);
+ props->max_qp = qp_per_vf *
+ atomic_read(&us_ibdev->vf_cnt.refcount);
+ props->device_cap_flags = IB_DEVICE_PORT_ACTIVE_EVENT |
+ IB_DEVICE_SYS_IMAGE_GUID | IB_DEVICE_BLOCK_MULTICAST_LOOPBACK;
+ props->max_cq = us_ibdev->vf_res_cnt[USNIC_VNIC_RES_TYPE_CQ] *
+ atomic_read(&us_ibdev->vf_cnt.refcount);
+ props->max_pd = USNIC_UIOM_MAX_PD_CNT;
+ props->max_mr = USNIC_UIOM_MAX_MR_CNT;
+ props->local_ca_ack_delay = 0;
+ props->max_pkeys = 0;
+ props->atomic_cap = IB_ATOMIC_NONE;
+ props->masked_atomic_cap = props->atomic_cap;
+ props->max_qp_rd_atom = 0;
+ props->max_qp_init_rd_atom = 0;
+ props->max_res_rd_atom = 0;
+ props->max_srq = 0;
+ props->max_srq_wr = 0;
+ props->max_srq_sge = 0;
+ props->max_fast_reg_page_list_len = 0;
+ props->max_mcast_grp = 0;
+ props->max_mcast_qp_attach = 0;
+ props->max_total_mcast_qp_attach = 0;
+ props->max_map_per_fmr = 0;
+ /*
+ * Owned by userspace:
+ * max_qp_wr, max_sge, max_sge_rd, max_cqe
+ */
+ mutex_unlock(&us_ibdev->usdev_lock);
+
+ return 0;
+}
+
+int usnic_ib_query_port(struct ib_device *ibdev, u8 port,
+ struct ib_port_attr *props)
+{
+ struct usnic_ib_dev *us_ibdev = to_usdev(ibdev);
+ struct ethtool_cmd cmd;
+
+ usnic_dbg("\n");
+
+ mutex_lock(&us_ibdev->usdev_lock);
+ us_ibdev->netdev->ethtool_ops->get_settings(us_ibdev->netdev, &cmd);
+ memset(props, 0, sizeof(*props));
+
+ props->lid = 0;
+ props->lmc = 1;
+ props->sm_lid = 0;
+ props->sm_sl = 0;
+
+ if (!us_ibdev->ufdev->link_up) {
+ props->state = IB_PORT_DOWN;
+ props->phys_state = 3;
+ } else if (!us_ibdev->ufdev->inaddr) {
+ props->state = IB_PORT_INIT;
+ props->phys_state = 4;
+ } else {
+ props->state = IB_PORT_ACTIVE;
+ props->phys_state = 5;
+ }
+
+ props->port_cap_flags = 0;
+ props->gid_tbl_len = 1;
+ props->pkey_tbl_len = 1;
+ props->bad_pkey_cntr = 0;
+ props->qkey_viol_cntr = 0;
+ eth_speed_to_ib_speed(cmd.speed, &props->active_speed,
+ &props->active_width);
+ props->max_mtu = IB_MTU_4096;
+ props->active_mtu = iboe_get_mtu(us_ibdev->ufdev->mtu);
+ /* Userspace will adjust for hdrs */
+ props->max_msg_sz = us_ibdev->ufdev->mtu;
+ props->max_vl_num = 1;
+ mutex_unlock(&us_ibdev->usdev_lock);
+
+ return 0;
+}
+
+int usnic_ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
+ int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr)
+{
+ struct usnic_ib_qp_grp *qp_grp;
+ struct usnic_ib_vf *vf;
+ int err;
+
+ usnic_dbg("\n");
+
+ memset(qp_attr, 0, sizeof(*qp_attr));
+ memset(qp_init_attr, 0, sizeof(*qp_init_attr));
+
+ qp_grp = to_uqp_grp(qp);
+ vf = qp_grp->vf;
+ mutex_lock(&vf->pf->usdev_lock);
+ usnic_dbg("\n");
+ qp_attr->qp_state = qp_grp->state;
+ qp_attr->cur_qp_state = qp_grp->state;
+
+ switch (qp_grp->ibqp.qp_type) {
+ case IB_QPT_UD:
+ qp_attr->qkey = 0;
+ break;
+ default:
+ usnic_err("Unexpected qp_type %d\n", qp_grp->ibqp.qp_type);
+ err = -EINVAL;
+ goto err_out;
+ }
+
+ mutex_unlock(&vf->pf->usdev_lock);
+ return 0;
+
+err_out:
+ mutex_unlock(&vf->pf->usdev_lock);
+ return err;
+}
+
+int usnic_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+ union ib_gid *gid)
+{
+
+ struct usnic_ib_dev *us_ibdev = to_usdev(ibdev);
+ usnic_dbg("\n");
+
+ if (index > 1)
+ return -EINVAL;
+
+ mutex_lock(&us_ibdev->usdev_lock);
+ memset(&(gid->raw[0]), 0, sizeof(gid->raw));
+ usnic_mac_ip_to_gid(us_ibdev->ufdev->mac, us_ibdev->ufdev->inaddr,
+ &gid->raw[0]);
+ mutex_unlock(&us_ibdev->usdev_lock);
+
+ return 0;
+}
+
+int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+ u16 *pkey)
+{
+ if (index > 1)
+ return -EINVAL;
+
+ *pkey = 0xffff;
+ return 0;
+}
+
+struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev,
+ struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct usnic_ib_pd *pd;
+ void *umem_pd;
+
+ usnic_dbg("\n");
+
+ pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+ if (!pd)
+ return ERR_PTR(-ENOMEM);
+
+ umem_pd = pd->umem_pd = usnic_uiom_alloc_pd();
+ if (IS_ERR_OR_NULL(umem_pd)) {
+ kfree(pd);
+ return ERR_PTR(umem_pd ? PTR_ERR(umem_pd) : -ENOMEM);
+ }
+
+ usnic_info("domain 0x%p allocated for context 0x%p and device %s\n",
+ pd, context, ibdev->name);
+ return &pd->ibpd;
+}
+
+int usnic_ib_dealloc_pd(struct ib_pd *pd)
+{
+ usnic_info("freeing domain 0x%p\n", pd);
+
+ usnic_uiom_dealloc_pd((to_upd(pd))->umem_pd);
+ kfree(pd);
+ return 0;
+}
+
+struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata)
+{
+ int err;
+ struct usnic_ib_dev *us_ibdev;
+ struct usnic_ib_qp_grp *qp_grp;
+ struct usnic_ib_ucontext *ucontext;
+ int cq_cnt;
+ struct usnic_vnic_res_spec res_spec;
+ struct usnic_ib_create_qp_cmd cmd;
+ struct usnic_transport_spec trans_spec;
+
+ usnic_dbg("\n");
+
+ ucontext = to_uucontext(pd->uobject->context);
+ us_ibdev = to_usdev(pd->device);
+
+ if (init_attr->create_flags)
+ return ERR_PTR(-EINVAL);
+
+ err = ib_copy_from_udata(&cmd, udata, sizeof(cmd));
+ if (err) {
+ usnic_err("%s: cannot copy udata for create_qp\n",
+ us_ibdev->ib_dev.name);
+ return ERR_PTR(-EINVAL);
+ }
+
+ err = create_qp_validate_user_data(cmd);
+ if (err) {
+ usnic_err("%s: Failed to validate user data\n",
+ us_ibdev->ib_dev.name);
+ return ERR_PTR(-EINVAL);
+ }
+
+ if (init_attr->qp_type != IB_QPT_UD) {
+ usnic_err("%s asked to make a non-UD QP: %d\n",
+ us_ibdev->ib_dev.name, init_attr->qp_type);
+ return ERR_PTR(-EINVAL);
+ }
+
+ trans_spec = cmd.spec;
+ mutex_lock(&us_ibdev->usdev_lock);
+ cq_cnt = (init_attr->send_cq == init_attr->recv_cq) ? 1 : 2;
+ res_spec = min_transport_spec[trans_spec.trans_type];
+ usnic_vnic_res_spec_update(&res_spec, USNIC_VNIC_RES_TYPE_CQ, cq_cnt);
+ qp_grp = find_free_vf_and_create_qp_grp(us_ibdev, to_upd(pd),
+ &trans_spec,
+ &res_spec);
+ if (IS_ERR_OR_NULL(qp_grp)) {
+ err = qp_grp ? PTR_ERR(qp_grp) : -ENOMEM;
+ goto out_release_mutex;
+ }
+
+ err = usnic_ib_fill_create_qp_resp(qp_grp, udata);
+ if (err) {
+ err = -EBUSY;
+ goto out_release_qp_grp;
+ }
+
+ qp_grp->ctx = ucontext;
+ list_add_tail(&qp_grp->link, &ucontext->qp_grp_list);
+ usnic_ib_log_vf(qp_grp->vf);
+ mutex_unlock(&us_ibdev->usdev_lock);
+ return &qp_grp->ibqp;
+
+out_release_qp_grp:
+ qp_grp_destroy(qp_grp);
+out_release_mutex:
+ mutex_unlock(&us_ibdev->usdev_lock);
+ return ERR_PTR(err);
+}
+
+int usnic_ib_destroy_qp(struct ib_qp *qp)
+{
+ struct usnic_ib_qp_grp *qp_grp;
+ struct usnic_ib_vf *vf;
+
+ usnic_dbg("\n");
+
+ qp_grp = to_uqp_grp(qp);
+ vf = qp_grp->vf;
+ mutex_lock(&vf->pf->usdev_lock);
+ if (usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RESET, NULL)) {
+ usnic_err("Failed to move qp grp %u to reset\n",
+ qp_grp->grp_id);
+ }
+
+ list_del(&qp_grp->link);
+ qp_grp_destroy(qp_grp);
+ mutex_unlock(&vf->pf->usdev_lock);
+
+ return 0;
+}
+
+int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata)
+{
+ struct usnic_ib_qp_grp *qp_grp;
+ int status;
+ usnic_dbg("\n");
+
+ qp_grp = to_uqp_grp(ibqp);
+
+ /* TODO: Future Support All States */
+ mutex_lock(&qp_grp->vf->pf->usdev_lock);
+ if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_INIT) {
+ status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_INIT, NULL);
+ } else if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_RTR) {
+ status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RTR, NULL);
+ } else if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_RTS) {
+ status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RTS, NULL);
+ } else {
+ usnic_err("Unexpected combination mask: %u state: %u\n",
+ attr_mask & IB_QP_STATE, attr->qp_state);
+ status = -EINVAL;
+ }
+
+ mutex_unlock(&qp_grp->vf->pf->usdev_lock);
+ return status;
+}
+
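+/*
+ * Completion handling is done in userspace, so only a placeholder
+ * ib_cq object is allocated here.
+ */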
+struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev, int entries,
+ int vector, struct ib_ucontext *context,
+ struct ib_udata *udata)
+{
+ struct ib_cq *cq;
+
+ usnic_dbg("\n");
+ cq = kzalloc(sizeof(*cq), GFP_KERNEL);
+ if (!cq)
+ return ERR_PTR(-EBUSY);
+
+ return cq;
+}
+
+int usnic_ib_destroy_cq(struct ib_cq *cq)
+{
+ usnic_dbg("\n");
+ kfree(cq);
+ return 0;
+}
+
+struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt_addr, int access_flags,
+ struct ib_udata *udata)
+{
+ struct usnic_ib_mr *mr;
+ int err;
+
+ usnic_dbg("start 0x%llx va 0x%llx length 0x%llx\n", start,
+ virt_addr, length);
+
+ mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+ if (IS_ERR_OR_NULL(mr))
+ return ERR_PTR(mr ? PTR_ERR(mr) : -ENOMEM);
+
+ mr->umem = usnic_uiom_reg_get(to_upd(pd)->umem_pd, start, length,
+ access_flags, 0);
+ if (IS_ERR_OR_NULL(mr->umem)) {
+ err = mr->umem ? PTR_ERR(mr->umem) : -EFAULT;
+ goto err_free;
+ }
+
+ mr->ibmr.lkey = mr->ibmr.rkey = 0;
+ return &mr->ibmr;
+
+err_free:
+ kfree(mr);
+ return ERR_PTR(err);
+}
+
+int usnic_ib_dereg_mr(struct ib_mr *ibmr)
+{
+ struct usnic_ib_mr *mr = to_umr(ibmr);
+
+ usnic_dbg("va 0x%lx length 0x%zx\n", mr->umem->va, mr->umem->length);
+
+ usnic_uiom_reg_release(mr->umem, ibmr->pd->uobject->context->closing);
+ kfree(mr);
+ return 0;
+}
+
+struct ib_ucontext *usnic_ib_alloc_ucontext(struct ib_device *ibdev,
+ struct ib_udata *udata)
+{
+ struct usnic_ib_ucontext *context;
+ struct usnic_ib_dev *us_ibdev = to_usdev(ibdev);
+ usnic_dbg("\n");
+
+ context = kmalloc(sizeof(*context), GFP_KERNEL);
+ if (!context)
+ return ERR_PTR(-ENOMEM);
+
+ INIT_LIST_HEAD(&context->qp_grp_list);
+ mutex_lock(&us_ibdev->usdev_lock);
+ list_add_tail(&context->link, &us_ibdev->ctx_list);
+ mutex_unlock(&us_ibdev->usdev_lock);
+
+ return &context->ibucontext;
+}
+
+int usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext)
+{
+ struct usnic_ib_ucontext *context = to_uucontext(ibcontext);
+ struct usnic_ib_dev *us_ibdev = to_usdev(ibcontext->device);
+ usnic_dbg("\n");
+
+ mutex_lock(&us_ibdev->usdev_lock);
+ BUG_ON(!list_empty(&context->qp_grp_list));
+ list_del(&context->link);
+ mutex_unlock(&us_ibdev->usdev_lock);
+ kfree(context);
+ return 0;
+}
+
+int usnic_ib_mmap(struct ib_ucontext *context,
+ struct vm_area_struct *vma)
+{
+ struct usnic_ib_ucontext *uctx = to_ucontext(context);
+ struct usnic_ib_dev *us_ibdev;
+ struct usnic_ib_qp_grp *qp_grp;
+ struct usnic_ib_vf *vf;
+ struct vnic_dev_bar *bar;
+ dma_addr_t bus_addr;
+ unsigned int len;
+ unsigned int vfid;
+
+ usnic_dbg("\n");
+
+ us_ibdev = to_usdev(context->device);
+ vma->vm_flags |= VM_IO;
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
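+ /* Userspace encodes the target VF index in the mmap page offset. */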
+ vfid = vma->vm_pgoff;
+ usnic_dbg("Page Offset %lu PAGE_SHIFT %u VFID %u\n",
+ vma->vm_pgoff, PAGE_SHIFT, vfid);
+
+ mutex_lock(&us_ibdev->usdev_lock);
+ list_for_each_entry(qp_grp, &uctx->qp_grp_list, link) {
+ vf = qp_grp->vf;
+ if (usnic_vnic_get_index(vf->vnic) == vfid) {
+ bar = usnic_vnic_get_bar(vf->vnic, 0);
+ if ((vma->vm_end - vma->vm_start) != bar->len) {
+ usnic_err("Bar0 Len %lu - Request map %lu\n",
+ bar->len,
+ vma->vm_end - vma->vm_start);
+ mutex_unlock(&us_ibdev->usdev_lock);
+ return -EINVAL;
+ }
+ bus_addr = bar->bus_addr;
+ len = bar->len;
+ usnic_dbg("bus: %pa vaddr: %p size: %ld\n",
+ &bus_addr, bar->vaddr, bar->len);
+ mutex_unlock(&us_ibdev->usdev_lock);
+
+ return remap_pfn_range(vma,
+ vma->vm_start,
+ bus_addr >> PAGE_SHIFT,
+ len, vma->vm_page_prot);
+ }
+ }
+
+ mutex_unlock(&us_ibdev->usdev_lock);
+ usnic_err("No VF %u found\n", vfid);
+ return -EINVAL;
+}
+
+/* In ib callbacks section - Start of stub funcs */
+struct ib_ah *usnic_ib_create_ah(struct ib_pd *pd,
+ struct ib_ah_attr *ah_attr)
+{
+ usnic_dbg("\n");
+ return ERR_PTR(-EPERM);
+}
+
+int usnic_ib_destroy_ah(struct ib_ah *ah)
+{
+ usnic_dbg("\n");
+ return -EINVAL;
+}
+
+int usnic_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr)
+{
+ usnic_dbg("\n");
+ return -EINVAL;
+}
+
+int usnic_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr)
+{
+ usnic_dbg("\n");
+ return -EINVAL;
+}
+
+int usnic_ib_poll_cq(struct ib_cq *ibcq, int num_entries,
+ struct ib_wc *wc)
+{
+ usnic_dbg("\n");
+ return -EINVAL;
+}
+
+int usnic_ib_req_notify_cq(struct ib_cq *cq,
+ enum ib_cq_notify_flags flags)
+{
+ usnic_dbg("\n");
+ return -EINVAL;
+}
+
+struct ib_mr *usnic_ib_get_dma_mr(struct ib_pd *pd, int acc)
+{
+ usnic_dbg("\n");
+ return ERR_PTR(-ENOMEM);
+}
+
+
+/* In ib callbacks section - End of stub funcs */
+/* End of ib callbacks section */
diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
new file mode 100644
index 000000000..bb864f5ae
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef USNIC_IB_VERBS_H_
+#define USNIC_IB_VERBS_H_
+
+#include "usnic_ib.h"
+
+enum rdma_link_layer usnic_ib_port_link_layer(struct ib_device *device,
+ u8 port_num);
+int usnic_ib_query_device(struct ib_device *ibdev,
+ struct ib_device_attr *props);
+int usnic_ib_query_port(struct ib_device *ibdev, u8 port,
+ struct ib_port_attr *props);
+int usnic_ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr,
+ int qp_attr_mask,
+ struct ib_qp_init_attr *qp_init_attr);
+int usnic_ib_query_gid(struct ib_device *ibdev, u8 port, int index,
+ union ib_gid *gid);
+int usnic_ib_query_pkey(struct ib_device *ibdev, u8 port, u16 index,
+ u16 *pkey);
+struct ib_pd *usnic_ib_alloc_pd(struct ib_device *ibdev,
+ struct ib_ucontext *context,
+ struct ib_udata *udata);
+int usnic_ib_dealloc_pd(struct ib_pd *pd);
+struct ib_qp *usnic_ib_create_qp(struct ib_pd *pd,
+ struct ib_qp_init_attr *init_attr,
+ struct ib_udata *udata);
+int usnic_ib_destroy_qp(struct ib_qp *qp);
+int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
+ int attr_mask, struct ib_udata *udata);
+struct ib_cq *usnic_ib_create_cq(struct ib_device *ibdev, int entries,
+ int vector, struct ib_ucontext *context,
+ struct ib_udata *udata);
+int usnic_ib_destroy_cq(struct ib_cq *cq);
+struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 start, u64 length,
+ u64 virt_addr, int access_flags,
+ struct ib_udata *udata);
+int usnic_ib_dereg_mr(struct ib_mr *ibmr);
+struct ib_ucontext *usnic_ib_alloc_ucontext(struct ib_device *ibdev,
+ struct ib_udata *udata);
+int usnic_ib_dealloc_ucontext(struct ib_ucontext *ibcontext);
+int usnic_ib_mmap(struct ib_ucontext *context,
+ struct vm_area_struct *vma);
+struct ib_ah *usnic_ib_create_ah(struct ib_pd *pd,
+ struct ib_ah_attr *ah_attr);
+int usnic_ib_destroy_ah(struct ib_ah *ah);
+int usnic_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
+ struct ib_send_wr **bad_wr);
+int usnic_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
+ struct ib_recv_wr **bad_wr);
+int usnic_ib_poll_cq(struct ib_cq *ibcq, int num_entries,
+ struct ib_wc *wc);
+int usnic_ib_req_notify_cq(struct ib_cq *cq,
+ enum ib_cq_notify_flags flags);
+struct ib_mr *usnic_ib_get_dma_mr(struct ib_pd *pd, int acc);
+#endif /* !USNIC_IB_VERBS_H_ */
diff --git a/drivers/infiniband/hw/usnic/usnic_log.h b/drivers/infiniband/hw/usnic/usnic_log.h
new file mode 100644
index 000000000..75777a66c
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_log.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef USNIC_LOG_H_
+#define USNIC_LOG_H_
+
+#include "usnic.h"
+
+extern unsigned int usnic_log_lvl;
+
+#define USNIC_LOG_LVL_NONE (0)
+#define USNIC_LOG_LVL_ERR (1)
+#define USNIC_LOG_LVL_INFO (2)
+#define USNIC_LOG_LVL_DBG (3)
+
+#define usnic_printk(lvl, args...) \
+ do { \
+ printk(lvl "%s:%s:%d: ", DRV_NAME, __func__, \
+ __LINE__); \
+ printk(args); \
+ } while (0)
+
+#define usnic_dbg(args...) \
+ do { \
+ if (unlikely(usnic_log_lvl >= USNIC_LOG_LVL_DBG)) { \
+ usnic_printk(KERN_INFO, args); \
+ } \
+} while (0)
+
+#define usnic_info(args...) \
+do { \
+ if (usnic_log_lvl >= USNIC_LOG_LVL_INFO) { \
+ usnic_printk(KERN_INFO, args); \
+ } \
+} while (0)
+
+#define usnic_err(args...) \
+ do { \
+ if (usnic_log_lvl >= USNIC_LOG_LVL_ERR) { \
+ usnic_printk(KERN_ERR, args); \
+ } \
+ } while (0)
+#endif /* !USNIC_LOG_H_ */
diff --git a/drivers/infiniband/hw/usnic/usnic_transport.c b/drivers/infiniband/hw/usnic/usnic_transport.c
new file mode 100644
index 000000000..ddef6f77a
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_transport.c
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/bitmap.h>
+#include <linux/file.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <net/inet_sock.h>
+
+#include "usnic_transport.h"
+#include "usnic_log.h"
+
+/* ROCE custom port space: one bit per possible 16-bit port number */
+static unsigned long *roce_bitmap;
+static u16 roce_next_port = 1;
+#define ROCE_BITMAP_SZ ((1 << (8 /*CHAR_BIT*/ * sizeof(u16)))/8 /*CHAR BIT*/)
+static DEFINE_SPINLOCK(roce_bitmap_lock);
+
+const char *usnic_transport_to_str(enum usnic_transport_type type)
+{
+ switch (type) {
+ case USNIC_TRANSPORT_UNKNOWN:
+ return "Unknown";
+ case USNIC_TRANSPORT_ROCE_CUSTOM:
+ return "roce custom";
+ case USNIC_TRANSPORT_IPV4_UDP:
+ return "IPv4 UDP";
+ case USNIC_TRANSPORT_MAX:
+ return "Max?";
+ default:
+ return "Not known";
+ }
+}
+
+int usnic_transport_sock_to_str(char *buf, int buf_sz,
+ struct socket *sock)
+{
+ int err;
+ uint32_t addr;
+ uint16_t port;
+ int proto;
+
+ memset(buf, 0, buf_sz);
+ err = usnic_transport_sock_get_addr(sock, &proto, &addr, &port);
+ if (err)
+ return 0;
+
+ return scnprintf(buf, buf_sz, "Proto:%u Addr:%pI4h Port:%hu",
+ proto, &addr, port);
+}
+
+/*
+ * Reserve a port number. If "0" is specified, we will try to pick one
+ * starting at roce_next_port. roce_next_port will take on the values
+ * 1..4096.
+ */
+u16 usnic_transport_rsrv_port(enum usnic_transport_type type, u16 port_num)
+{
+ if (type == USNIC_TRANSPORT_ROCE_CUSTOM) {
+ spin_lock(&roce_bitmap_lock);
+ if (!port_num) {
+ port_num = bitmap_find_next_zero_area(roce_bitmap,
+ ROCE_BITMAP_SZ,
+ roce_next_port /* start */,
+ 1 /* nr */,
+ 0 /* align */);
+ roce_next_port = (port_num & 4095) + 1;
+ } else if (test_bit(port_num, roce_bitmap)) {
+ usnic_err("Failed to allocate port for %s\n",
+ usnic_transport_to_str(type));
+ spin_unlock(&roce_bitmap_lock);
+ goto out_fail;
+ }
+ bitmap_set(roce_bitmap, port_num, 1);
+ spin_unlock(&roce_bitmap_lock);
+ } else {
+ usnic_err("Failed to allocate port - transport %s unsupported\n",
+ usnic_transport_to_str(type));
+ goto out_fail;
+ }
+
+ usnic_dbg("Allocating port %hu for %s\n", port_num,
+ usnic_transport_to_str(type));
+ return port_num;
+
+out_fail:
+ return 0;
+}
+
+void usnic_transport_unrsrv_port(enum usnic_transport_type type, u16 port_num)
+{
+ if (type == USNIC_TRANSPORT_ROCE_CUSTOM) {
+ spin_lock(&roce_bitmap_lock);
+ if (!port_num) {
+ usnic_err("Unreserved unvalid port num 0 for %s\n",
+ usnic_transport_to_str(type));
+ goto out_roce_custom;
+ }
+
+ if (!test_bit(port_num, roce_bitmap)) {
+ usnic_err("Unreserving invalid %hu for %s\n",
+ port_num,
+ usnic_transport_to_str(type));
+ goto out_roce_custom;
+ }
+ bitmap_clear(roce_bitmap, port_num, 1);
+ usnic_dbg("Freeing port %hu for %s\n", port_num,
+ usnic_transport_to_str(type));
+out_roce_custom:
+ spin_unlock(&roce_bitmap_lock);
+ } else {
+ usnic_err("Freeing invalid port %hu for %d\n", port_num, type);
+ }
+}
+
+struct socket *usnic_transport_get_socket(int sock_fd)
+{
+ struct socket *sock;
+ int err;
+ char buf[25];
+
+ /* sockfd_lookup will internally do a fget */
+ sock = sockfd_lookup(sock_fd, &err);
+ if (!sock) {
+ usnic_err("Unable to lookup socket for fd %d with err %d\n",
+ sock_fd, err);
+ return ERR_PTR(-ENOENT);
+ }
+
+ usnic_transport_sock_to_str(buf, sizeof(buf), sock);
+ usnic_dbg("Get sock %s\n", buf);
+
+ return sock;
+}
+
+void usnic_transport_put_socket(struct socket *sock)
+{
+ char buf[100];
+
+ usnic_transport_sock_to_str(buf, sizeof(buf), sock);
+ usnic_dbg("Put sock %s\n", buf);
+ sockfd_put(sock);
+}
+
+int usnic_transport_sock_get_addr(struct socket *sock, int *proto,
+ uint32_t *addr, uint16_t *port)
+{
+ int len;
+ int err;
+ struct sockaddr_in sock_addr;
+
+ err = sock->ops->getname(sock,
+ (struct sockaddr *)&sock_addr,
+ &len, 0);
+ if (err)
+ return err;
+
+ if (sock_addr.sin_family != AF_INET)
+ return -EINVAL;
+
+ if (proto)
+ *proto = sock->sk->sk_protocol;
+ if (port)
+ *port = ntohs(((struct sockaddr_in *)&sock_addr)->sin_port);
+ if (addr)
+ *addr = ntohl(((struct sockaddr_in *)
+ &sock_addr)->sin_addr.s_addr);
+
+ return 0;
+}
+
+int usnic_transport_init(void)
+{
+ roce_bitmap = kzalloc(ROCE_BITMAP_SZ, GFP_KERNEL);
+ if (!roce_bitmap) {
+ usnic_err("Failed to allocate bit map");
+ return -ENOMEM;
+ }
+
+ /* Do not ever allocate bit 0, hence set it here */
+ bitmap_set(roce_bitmap, 0, 1);
+ return 0;
+}
+
+void usnic_transport_fini(void)
+{
+ kfree(roce_bitmap);
+}
diff --git a/drivers/infiniband/hw/usnic/usnic_transport.h b/drivers/infiniband/hw/usnic/usnic_transport.h
new file mode 100644
index 000000000..7e5dc6d9f
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_transport.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef USNIC_TRANSPORT_H_
+#define USNIC_TRANSPORT_H_
+
+#include "usnic_abi.h"
+
+const char *usnic_transport_to_str(enum usnic_transport_type trans_type);
+/*
+ * Returns number of bytes written, excluding null terminator. If
+ * nothing was written, the function returns 0.
+ */
+int usnic_transport_sock_to_str(char *buf, int buf_sz,
+ struct socket *sock);
+/*
+ * Reserve a port. If "port_num" is set, then the function will try
+ * to reserve that particular port.
+ */
+u16 usnic_transport_rsrv_port(enum usnic_transport_type type, u16 port_num);
+void usnic_transport_unrsrv_port(enum usnic_transport_type type, u16 port_num);
+/*
+ * Do a fget on the socket referred to by sock_fd and return the socket.
+ * Socket will not be destroyed before usnic_transport_put_socket has
+ * been called.
+ */
+struct socket *usnic_transport_get_socket(int sock_fd);
+void usnic_transport_put_socket(struct socket *sock);
+/*
+ * Call usnic_transport_get_socket before calling *_sock_get_addr
+ */
+int usnic_transport_sock_get_addr(struct socket *sock, int *proto,
+ uint32_t *addr, uint16_t *port);
+int usnic_transport_init(void);
+void usnic_transport_fini(void);
+#endif /* !USNIC_TRANSPORT_H_ */
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.c b/drivers/infiniband/hw/usnic/usnic_uiom.c
new file mode 100644
index 000000000..417de1f32
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.c
@@ -0,0 +1,604 @@
+/*
+ * Copyright (c) 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2013 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/mm.h>
+#include <linux/dma-mapping.h>
+#include <linux/sched.h>
+#include <linux/hugetlb.h>
+#include <linux/dma-attrs.h>
+#include <linux/iommu.h>
+#include <linux/workqueue.h>
+#include <linux/list.h>
+#include <linux/pci.h>
+
+#include "usnic_log.h"
+#include "usnic_uiom.h"
+#include "usnic_uiom_interval_tree.h"
+
+static struct workqueue_struct *usnic_uiom_wq;
+
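+/*
+ * Maximum number of scatterlist entries that fit in a page-sized
+ * usnic_uiom_chunk allocation.
+ */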
+#define USNIC_UIOM_PAGE_CHUNK \
+ ((PAGE_SIZE - offsetof(struct usnic_uiom_chunk, page_list)) /\
+ ((void *) &((struct usnic_uiom_chunk *) 0)->page_list[1] - \
+ (void *) &((struct usnic_uiom_chunk *) 0)->page_list[0]))
+
+static void usnic_uiom_reg_account(struct work_struct *work)
+{
+ struct usnic_uiom_reg *umem = container_of(work,
+ struct usnic_uiom_reg, work);
+
+ down_write(&umem->mm->mmap_sem);
+ umem->mm->locked_vm -= umem->diff;
+ up_write(&umem->mm->mmap_sem);
+ mmput(umem->mm);
+ kfree(umem);
+}
+
+static int usnic_uiom_dma_fault(struct iommu_domain *domain,
+ struct device *dev,
+ unsigned long iova, int flags,
+ void *token)
+{
+ usnic_err("Device %s iommu fault domain 0x%pK va 0x%lx flags 0x%x\n",
+ dev_name(dev),
+ domain, iova, flags);
+ return -ENOSYS;
+}
+
+static void usnic_uiom_put_pages(struct list_head *chunk_list, int dirty)
+{
+ struct usnic_uiom_chunk *chunk, *tmp;
+ struct page *page;
+ struct scatterlist *sg;
+ int i;
+ dma_addr_t pa;
+
+ list_for_each_entry_safe(chunk, tmp, chunk_list, list) {
+ for_each_sg(chunk->page_list, sg, chunk->nents, i) {
+ page = sg_page(sg);
+ pa = sg_phys(sg);
+ if (dirty)
+ set_page_dirty_lock(page);
+ put_page(page);
+ usnic_dbg("pa: %pa\n", &pa);
+ }
+ kfree(chunk);
+ }
+}
+
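+/*
+ * Pin the user pages backing [addr, addr + size), charge them against
+ * RLIMIT_MEMLOCK, and collect them into scatterlist chunks on chunk_list.
+ */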
+static int usnic_uiom_get_pages(unsigned long addr, size_t size, int writable,
+ int dmasync, struct list_head *chunk_list)
+{
+ struct page **page_list;
+ struct scatterlist *sg;
+ struct usnic_uiom_chunk *chunk;
+ unsigned long locked;
+ unsigned long lock_limit;
+ unsigned long cur_base;
+ unsigned long npages;
+ int ret;
+ int off;
+ int i;
+ int flags;
+ dma_addr_t pa;
+ DEFINE_DMA_ATTRS(attrs);
+
+ if (dmasync)
+ dma_set_attr(DMA_ATTR_WRITE_BARRIER, &attrs);
+
+ if (!can_do_mlock())
+ return -EPERM;
+
+ INIT_LIST_HEAD(chunk_list);
+
+ page_list = (struct page **) __get_free_page(GFP_KERNEL);
+ if (!page_list)
+ return -ENOMEM;
+
+ npages = PAGE_ALIGN(size + (addr & ~PAGE_MASK)) >> PAGE_SHIFT;
+
+ down_write(&current->mm->mmap_sem);
+
+ locked = npages + current->mm->locked_vm;
+ lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+
+ if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ flags = IOMMU_READ | IOMMU_CACHE;
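+ /* IB phys_state: 3 = Disabled, 4 = PortConfigurationTraining, 5 = LinkUp */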
+ flags |= (writable) ? IOMMU_WRITE : 0;
+ cur_base = addr & PAGE_MASK;
+ ret = 0;
+
+ while (npages) {
+ ret = get_user_pages(current, current->mm, cur_base,
+ min_t(unsigned long, npages,
+ PAGE_SIZE / sizeof(struct page *)),
+ 1, !writable, page_list, NULL);
+
+ if (ret < 0)
+ goto out;
+
+ npages -= ret;
+ off = 0;
+
+ while (ret) {
+ chunk = kmalloc(sizeof(*chunk) +
+ sizeof(struct scatterlist) *
+ min_t(int, ret, USNIC_UIOM_PAGE_CHUNK),
+ GFP_KERNEL);
+ if (!chunk) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ chunk->nents = min_t(int, ret, USNIC_UIOM_PAGE_CHUNK);
+ sg_init_table(chunk->page_list, chunk->nents);
+ for_each_sg(chunk->page_list, sg, chunk->nents, i) {
+ sg_set_page(sg, page_list[i + off],
+ PAGE_SIZE, 0);
+ pa = sg_phys(sg);
+ usnic_dbg("va: 0x%lx pa: %pa\n",
+ cur_base + i*PAGE_SIZE, &pa);
+ }
+ cur_base += chunk->nents * PAGE_SIZE;
+ ret -= chunk->nents;
+ off += chunk->nents;
+ list_add_tail(&chunk->list, chunk_list);
+ }
+
+ ret = 0;
+ }
+
+out:
+ if (ret < 0)
+ usnic_uiom_put_pages(chunk_list, 0);
+ else
+ current->mm->locked_vm = locked;
+
+ up_write(&current->mm->mmap_sem);
+ free_page((unsigned long) page_list);
+ return ret;
+}
+
+static void usnic_uiom_unmap_sorted_intervals(struct list_head *intervals,
+ struct usnic_uiom_pd *pd)
+{
+ struct usnic_uiom_interval_node *interval, *tmp;
+ long unsigned va, size;
+
+ list_for_each_entry_safe(interval, tmp, intervals, link) {
+ va = interval->start << PAGE_SHIFT;
+ size = ((interval->last - interval->start) + 1) << PAGE_SHIFT;
+ while (size > 0) {
+ /* Workaround for RH 970401 */
+ usnic_dbg("va 0x%lx size 0x%lx", va, PAGE_SIZE);
+ iommu_unmap(pd->domain, va, PAGE_SIZE);
+ va += PAGE_SIZE;
+ size -= PAGE_SIZE;
+ }
+ }
+}
+
+static void __usnic_uiom_reg_release(struct usnic_uiom_pd *pd,
+ struct usnic_uiom_reg *uiomr,
+ int dirty)
+{
+ int npages;
+ unsigned long vpn_start, vpn_last;
+ struct usnic_uiom_interval_node *interval, *tmp;
+ int writable = 0;
+ LIST_HEAD(rm_intervals);
+
+ npages = PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT;
+ vpn_start = (uiomr->va & PAGE_MASK) >> PAGE_SHIFT;
+ vpn_last = vpn_start + npages - 1;
+
+ spin_lock(&pd->lock);
+ usnic_uiom_remove_interval(&pd->rb_root, vpn_start,
+ vpn_last, &rm_intervals);
+ usnic_uiom_unmap_sorted_intervals(&rm_intervals, pd);
+
+ list_for_each_entry_safe(interval, tmp, &rm_intervals, link) {
+ if (interval->flags & IOMMU_WRITE)
+ writable = 1;
+ list_del(&interval->link);
+ kfree(interval);
+ }
+
+ usnic_uiom_put_pages(&uiomr->chunk_list, dirty & writable);
+ spin_unlock(&pd->lock);
+}
+
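+/*
+ * Walk the pinned page chunks and program the IOMMU for each interval,
+ * coalescing physically contiguous pages into a single iommu_map() call.
+ */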
+static int usnic_uiom_map_sorted_intervals(struct list_head *intervals,
+ struct usnic_uiom_reg *uiomr)
+{
+ int i, err;
+ size_t size;
+ struct usnic_uiom_chunk *chunk;
+ struct usnic_uiom_interval_node *interval_node;
+ dma_addr_t pa;
+ dma_addr_t pa_start = 0;
+ dma_addr_t pa_end = 0;
+ long int va_start = -EINVAL;
+ struct usnic_uiom_pd *pd = uiomr->pd;
+ long int va = uiomr->va & PAGE_MASK;
+ int flags = IOMMU_READ | IOMMU_CACHE;
+
+ flags |= (uiomr->writable) ? IOMMU_WRITE : 0;
+ chunk = list_first_entry(&uiomr->chunk_list, struct usnic_uiom_chunk,
+ list);
+ list_for_each_entry(interval_node, intervals, link) {
+iter_chunk:
+ for (i = 0; i < chunk->nents; i++, va += PAGE_SIZE) {
+ pa = sg_phys(&chunk->page_list[i]);
+ if ((va >> PAGE_SHIFT) < interval_node->start)
+ continue;
+
+ if ((va >> PAGE_SHIFT) == interval_node->start) {
+ /* First page of the interval */
+ va_start = va;
+ pa_start = pa;
+ pa_end = pa;
+ }
+
+ WARN_ON(va_start == -EINVAL);
+
+ if ((pa_end + PAGE_SIZE != pa) &&
+ (pa != pa_start)) {
+ /* PAs are not contiguous */
+ size = pa_end - pa_start + PAGE_SIZE;
+ usnic_dbg("va 0x%lx pa %pa size 0x%zx flags 0x%x",
+ va_start, &pa_start, size, flags);
+ err = iommu_map(pd->domain, va_start, pa_start,
+ size, flags);
+ if (err) {
+ usnic_err("Failed to map va 0x%lx pa %pa size 0x%zx with err %d\n",
+ va_start, &pa_start, size, err);
+ goto err_out;
+ }
+ va_start = va;
+ pa_start = pa;
+ pa_end = pa;
+ }
+
+ if ((va >> PAGE_SHIFT) == interval_node->last) {
+ /* Last page of the interval */
+ size = pa - pa_start + PAGE_SIZE;
+ usnic_dbg("va 0x%lx pa %pa size 0x%zx flags 0x%x\n",
+ va_start, &pa_start, size, flags);
+ err = iommu_map(pd->domain, va_start, pa_start,
+ size, flags);
+ if (err) {
+ usnic_err("Failed to map va 0x%lx pa %pa size 0x%zx with err %d\n",
+ va_start, &pa_start, size, err);
+ goto err_out;
+ }
+ break;
+ }
+
+ if (pa != pa_start)
+ pa_end += PAGE_SIZE;
+ }
+
+ if (i == chunk->nents) {
+ /*
+			 * Hit the last entry of the chunk,
+			 * so advance to the next chunk.
+ */
+ chunk = list_first_entry(&chunk->list,
+ struct usnic_uiom_chunk,
+ list);
+ goto iter_chunk;
+ }
+ }
+
+ return 0;
+
+err_out:
+ usnic_uiom_unmap_sorted_intervals(intervals, pd);
+ return err;
+}
+
+struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd,
+ unsigned long addr, size_t size,
+ int writable, int dmasync)
+{
+ struct usnic_uiom_reg *uiomr;
+ unsigned long va_base, vpn_start, vpn_last;
+ unsigned long npages;
+ int offset, err;
+ LIST_HEAD(sorted_diff_intervals);
+
+ /*
+	 * The Intel IOMMU map code returns an error if a translation
+	 * entry is changed from read to write.  This module cannot simply
+	 * unmap and then remap the entry with the corrected permission,
+	 * because that opens a small window in which HW DMA may page
+	 * fault.  Hence, make all entries writable.
+ */
+ writable = 1;
+
+ va_base = addr & PAGE_MASK;
+ offset = addr & ~PAGE_MASK;
+ npages = PAGE_ALIGN(size + offset) >> PAGE_SHIFT;
+ vpn_start = (addr & PAGE_MASK) >> PAGE_SHIFT;
+ vpn_last = vpn_start + npages - 1;
+
+ uiomr = kmalloc(sizeof(*uiomr), GFP_KERNEL);
+ if (!uiomr)
+ return ERR_PTR(-ENOMEM);
+
+ uiomr->va = va_base;
+ uiomr->offset = offset;
+ uiomr->length = size;
+ uiomr->writable = writable;
+ uiomr->pd = pd;
+
+ err = usnic_uiom_get_pages(addr, size, writable, dmasync,
+ &uiomr->chunk_list);
+ if (err) {
+ usnic_err("Failed get_pages vpn [0x%lx,0x%lx] err %d\n",
+ vpn_start, vpn_last, err);
+ goto out_free_uiomr;
+ }
+
+ spin_lock(&pd->lock);
+ err = usnic_uiom_get_intervals_diff(vpn_start, vpn_last,
+ (writable) ? IOMMU_WRITE : 0,
+ IOMMU_WRITE,
+ &pd->rb_root,
+ &sorted_diff_intervals);
+ if (err) {
+ usnic_err("Failed disjoint interval vpn [0x%lx,0x%lx] err %d\n",
+ vpn_start, vpn_last, err);
+ goto out_put_pages;
+ }
+
+ err = usnic_uiom_map_sorted_intervals(&sorted_diff_intervals, uiomr);
+ if (err) {
+ usnic_err("Failed map interval vpn [0x%lx,0x%lx] err %d\n",
+ vpn_start, vpn_last, err);
+ goto out_put_intervals;
+
+ }
+
+ err = usnic_uiom_insert_interval(&pd->rb_root, vpn_start, vpn_last,
+ (writable) ? IOMMU_WRITE : 0);
+ if (err) {
+ usnic_err("Failed insert interval vpn [0x%lx,0x%lx] err %d\n",
+ vpn_start, vpn_last, err);
+ goto out_unmap_intervals;
+ }
+
+ usnic_uiom_put_interval_set(&sorted_diff_intervals);
+ spin_unlock(&pd->lock);
+
+ return uiomr;
+
+out_unmap_intervals:
+ usnic_uiom_unmap_sorted_intervals(&sorted_diff_intervals, pd);
+out_put_intervals:
+ usnic_uiom_put_interval_set(&sorted_diff_intervals);
+out_put_pages:
+ usnic_uiom_put_pages(&uiomr->chunk_list, 0);
+ spin_unlock(&pd->lock);
+out_free_uiomr:
+ kfree(uiomr);
+ return ERR_PTR(err);
+}
+
+void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, int closing)
+{
+ struct mm_struct *mm;
+ unsigned long diff;
+
+ __usnic_uiom_reg_release(uiomr->pd, uiomr, 1);
+
+ mm = get_task_mm(current);
+ if (!mm) {
+ kfree(uiomr);
+ return;
+ }
+
+ diff = PAGE_ALIGN(uiomr->length + uiomr->offset) >> PAGE_SHIFT;
+
+ /*
+ * We may be called with the mm's mmap_sem already held. This
+ * can happen when a userspace munmap() is the call that drops
+ * the last reference to our file and calls our release
+ * method. If there are memory regions to destroy, we'll end
+ * up here and not be able to take the mmap_sem. In that case
+ * we defer the vm_locked accounting to the system workqueue.
+ */
+ if (closing) {
+ if (!down_write_trylock(&mm->mmap_sem)) {
+ INIT_WORK(&uiomr->work, usnic_uiom_reg_account);
+ uiomr->mm = mm;
+ uiomr->diff = diff;
+
+ queue_work(usnic_uiom_wq, &uiomr->work);
+ return;
+ }
+ } else
+ down_write(&mm->mmap_sem);
+
+ current->mm->locked_vm -= diff;
+ up_write(&mm->mmap_sem);
+ mmput(mm);
+ kfree(uiomr);
+}
+
+struct usnic_uiom_pd *usnic_uiom_alloc_pd(void)
+{
+ struct usnic_uiom_pd *pd;
+ void *domain;
+
+ pd = kzalloc(sizeof(*pd), GFP_KERNEL);
+ if (!pd)
+ return ERR_PTR(-ENOMEM);
+
+ pd->domain = domain = iommu_domain_alloc(&pci_bus_type);
+ if (IS_ERR_OR_NULL(domain)) {
+ usnic_err("Failed to allocate IOMMU domain with err %ld\n",
+ PTR_ERR(pd->domain));
+ kfree(pd);
+ return ERR_PTR(domain ? PTR_ERR(domain) : -ENOMEM);
+ }
+
+ iommu_set_fault_handler(pd->domain, usnic_uiom_dma_fault, NULL);
+
+ spin_lock_init(&pd->lock);
+ INIT_LIST_HEAD(&pd->devs);
+
+ return pd;
+}
+
+void usnic_uiom_dealloc_pd(struct usnic_uiom_pd *pd)
+{
+ iommu_domain_free(pd->domain);
+ kfree(pd);
+}
+
+int usnic_uiom_attach_dev_to_pd(struct usnic_uiom_pd *pd, struct device *dev)
+{
+ struct usnic_uiom_dev *uiom_dev;
+ int err;
+
+ uiom_dev = kzalloc(sizeof(*uiom_dev), GFP_ATOMIC);
+ if (!uiom_dev)
+ return -ENOMEM;
+ uiom_dev->dev = dev;
+
+ err = iommu_attach_device(pd->domain, dev);
+ if (err)
+ goto out_free_dev;
+
+ if (!iommu_capable(dev->bus, IOMMU_CAP_CACHE_COHERENCY)) {
+ usnic_err("IOMMU of %s does not support cache coherency\n",
+ dev_name(dev));
+ err = -EINVAL;
+ goto out_detach_device;
+ }
+
+ spin_lock(&pd->lock);
+ list_add_tail(&uiom_dev->link, &pd->devs);
+ pd->dev_cnt++;
+ spin_unlock(&pd->lock);
+
+ return 0;
+
+out_detach_device:
+ iommu_detach_device(pd->domain, dev);
+out_free_dev:
+ kfree(uiom_dev);
+ return err;
+}
+
+void usnic_uiom_detach_dev_from_pd(struct usnic_uiom_pd *pd, struct device *dev)
+{
+ struct usnic_uiom_dev *uiom_dev;
+ int found = 0;
+
+ spin_lock(&pd->lock);
+ list_for_each_entry(uiom_dev, &pd->devs, link) {
+ if (uiom_dev->dev == dev) {
+ found = 1;
+ break;
+ }
+ }
+
+ if (!found) {
+ usnic_err("Unable to free dev %s - not found\n",
+ dev_name(dev));
+ spin_unlock(&pd->lock);
+ return;
+ }
+
+ list_del(&uiom_dev->link);
+ pd->dev_cnt--;
+ spin_unlock(&pd->lock);
+
+ return iommu_detach_device(pd->domain, dev);
+}
+
+struct device **usnic_uiom_get_dev_list(struct usnic_uiom_pd *pd)
+{
+ struct usnic_uiom_dev *uiom_dev;
+ struct device **devs;
+ int i = 0;
+
+ spin_lock(&pd->lock);
+ devs = kcalloc(pd->dev_cnt + 1, sizeof(*devs), GFP_ATOMIC);
+ if (!devs) {
+ devs = ERR_PTR(-ENOMEM);
+ goto out;
+ }
+
+ list_for_each_entry(uiom_dev, &pd->devs, link) {
+ devs[i++] = uiom_dev->dev;
+ }
+out:
+ spin_unlock(&pd->lock);
+ return devs;
+}
+
+void usnic_uiom_free_dev_list(struct device **devs)
+{
+ kfree(devs);
+}
+
+int usnic_uiom_init(char *drv_name)
+{
+ if (!iommu_present(&pci_bus_type)) {
+		usnic_err("IOMMU required but not present or enabled. USNIC QPs will not function without an enabled IOMMU\n");
+ return -EPERM;
+ }
+
+ usnic_uiom_wq = create_workqueue(drv_name);
+ if (!usnic_uiom_wq) {
+ usnic_err("Unable to alloc wq for drv %s\n", drv_name);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+void usnic_uiom_fini(void)
+{
+ flush_workqueue(usnic_uiom_wq);
+ destroy_workqueue(usnic_uiom_wq);
+}
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom.h b/drivers/infiniband/hw/usnic/usnic_uiom.h
new file mode 100644
index 000000000..70440996e
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_uiom.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef USNIC_UIOM_H_
+#define USNIC_UIOM_H_
+
+#include <linux/list.h>
+#include <linux/scatterlist.h>
+
+#include "usnic_uiom_interval_tree.h"
+
+#define USNIC_UIOM_READ (1)
+#define USNIC_UIOM_WRITE (2)
+
+#define USNIC_UIOM_MAX_PD_CNT (1000)
+#define USNIC_UIOM_MAX_MR_CNT (1000000)
+#define USNIC_UIOM_MAX_MR_SIZE (~0UL)
+#define USNIC_UIOM_PAGE_SIZE (PAGE_SIZE)
+
+struct usnic_uiom_dev {
+ struct device *dev;
+ struct list_head link;
+};
+
+struct usnic_uiom_pd {
+ struct iommu_domain *domain;
+ spinlock_t lock;
+ struct rb_root rb_root;
+ struct list_head devs;
+ int dev_cnt;
+};
+
+struct usnic_uiom_reg {
+ struct usnic_uiom_pd *pd;
+ unsigned long va;
+ size_t length;
+ int offset;
+ int page_size;
+ int writable;
+ struct list_head chunk_list;
+ struct work_struct work;
+ struct mm_struct *mm;
+ unsigned long diff;
+};
+
+struct usnic_uiom_chunk {
+ struct list_head list;
+ int nents;
+ struct scatterlist page_list[0];
+};
+
+struct usnic_uiom_pd *usnic_uiom_alloc_pd(void);
+void usnic_uiom_dealloc_pd(struct usnic_uiom_pd *pd);
+int usnic_uiom_attach_dev_to_pd(struct usnic_uiom_pd *pd, struct device *dev);
+void usnic_uiom_detach_dev_from_pd(struct usnic_uiom_pd *pd,
+ struct device *dev);
+struct device **usnic_uiom_get_dev_list(struct usnic_uiom_pd *pd);
+void usnic_uiom_free_dev_list(struct device **devs);
+struct usnic_uiom_reg *usnic_uiom_reg_get(struct usnic_uiom_pd *pd,
+ unsigned long addr, size_t size,
+ int access, int dmasync);
+void usnic_uiom_reg_release(struct usnic_uiom_reg *uiomr, int closing);
+int usnic_uiom_init(char *drv_name);
+void usnic_uiom_fini(void);
+#endif /* USNIC_UIOM_H_ */
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c
new file mode 100644
index 000000000..3a4288e0f
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.c
@@ -0,0 +1,254 @@
+/*
+ * Copyright (c) 2014, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/list_sort.h>
+
+#include <linux/interval_tree_generic.h>
+#include "usnic_uiom_interval_tree.h"
+
+#define START(node) ((node)->start)
+#define LAST(node) ((node)->last)
+
+#define MAKE_NODE(node, start, end, ref_cnt, flags, err, err_out) \
+ do { \
+ node = usnic_uiom_interval_node_alloc(start, \
+ end, ref_cnt, flags); \
+ if (!node) { \
+ err = -ENOMEM; \
+ goto err_out; \
+ } \
+ } while (0)
+
+#define MARK_FOR_ADD(node, list) (list_add_tail(&node->link, list))
+
+#define MAKE_NODE_AND_APPEND(node, start, end, ref_cnt, flags, err, \
+ err_out, list) \
+ do { \
+ MAKE_NODE(node, start, end, \
+ ref_cnt, flags, err, \
+ err_out); \
+ MARK_FOR_ADD(node, list); \
+ } while (0)
+
+#define FLAGS_EQUAL(flags1, flags2, mask) \
+ (((flags1) & (mask)) == ((flags2) & (mask)))
+
+static struct usnic_uiom_interval_node*
+usnic_uiom_interval_node_alloc(long int start, long int last, int ref_cnt,
+ int flags)
+{
+ struct usnic_uiom_interval_node *interval = kzalloc(sizeof(*interval),
+ GFP_ATOMIC);
+ if (!interval)
+ return NULL;
+
+ interval->start = start;
+ interval->last = last;
+ interval->flags = flags;
+ interval->ref_cnt = ref_cnt;
+
+ return interval;
+}
+
+static int interval_cmp(void *priv, struct list_head *a, struct list_head *b)
+{
+ struct usnic_uiom_interval_node *node_a, *node_b;
+
+ node_a = list_entry(a, struct usnic_uiom_interval_node, link);
+ node_b = list_entry(b, struct usnic_uiom_interval_node, link);
+
+	/* Compare explicitly rather than subtracting, to avoid truncating a long difference to int */
+ if (node_a->start < node_b->start)
+ return -1;
+ else if (node_a->start > node_b->start)
+ return 1;
+
+ return 0;
+}
+
+static void
+find_intervals_intersection_sorted(struct rb_root *root, unsigned long start,
+ unsigned long last,
+ struct list_head *list)
+{
+ struct usnic_uiom_interval_node *node;
+
+ INIT_LIST_HEAD(list);
+
+ for (node = usnic_uiom_interval_tree_iter_first(root, start, last);
+ node;
+ node = usnic_uiom_interval_tree_iter_next(node, start, last))
+ list_add_tail(&node->link, list);
+
+ list_sort(NULL, list, interval_cmp);
+}
+
+int usnic_uiom_get_intervals_diff(unsigned long start, unsigned long last,
+ int flags, int flag_mask,
+ struct rb_root *root,
+ struct list_head *diff_set)
+{
+ struct usnic_uiom_interval_node *interval, *tmp;
+ int err = 0;
+ long int pivot = start;
+ LIST_HEAD(intersection_set);
+
+ INIT_LIST_HEAD(diff_set);
+
+ find_intervals_intersection_sorted(root, start, last,
+ &intersection_set);
+
+ list_for_each_entry(interval, &intersection_set, link) {
+ if (pivot < interval->start) {
+ MAKE_NODE_AND_APPEND(tmp, pivot, interval->start - 1,
+ 1, flags, err, err_out,
+ diff_set);
+ pivot = interval->start;
+ }
+
+ /*
+ * Invariant: Set [start, pivot] is either in diff_set or root,
+ * but not in both.
+ */
+
+ if (pivot > interval->last) {
+ continue;
+ } else if (pivot <= interval->last &&
+ FLAGS_EQUAL(interval->flags, flags,
+ flag_mask)) {
+ pivot = interval->last + 1;
+ }
+ }
+
+ if (pivot <= last)
+ MAKE_NODE_AND_APPEND(tmp, pivot, last, 1, flags, err, err_out,
+ diff_set);
+
+ return 0;
+
+err_out:
+ list_for_each_entry_safe(interval, tmp, diff_set, link) {
+ list_del(&interval->link);
+ kfree(interval);
+ }
+
+ return err;
+}
+
+void usnic_uiom_put_interval_set(struct list_head *intervals)
+{
+ struct usnic_uiom_interval_node *interval, *tmp;
+ list_for_each_entry_safe(interval, tmp, intervals, link)
+ kfree(interval);
+}
+
+int usnic_uiom_insert_interval(struct rb_root *root, unsigned long start,
+ unsigned long last, int flags)
+{
+ struct usnic_uiom_interval_node *interval, *tmp;
+ unsigned long istart, ilast;
+ int iref_cnt, iflags;
+ unsigned long lpivot = start;
+ int err = 0;
+ LIST_HEAD(to_add);
+ LIST_HEAD(intersection_set);
+
+ find_intervals_intersection_sorted(root, start, last,
+ &intersection_set);
+
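+	/*
+	 * Rebuild the overlapping region: portions of existing nodes that
+	 * overlap [start, last] get their ref_cnt incremented and the new
+	 * flags OR-ed in, while non-overlapping portions and gaps become
+	 * separate nodes.  The original overlapping nodes are then removed
+	 * and the rebuilt set is inserted.
+	 */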
+ list_for_each_entry(interval, &intersection_set, link) {
+ /*
+		 * Invariant: lpivot is the left edge of the next interval to
+		 * be inserted.
+ */
+ istart = interval->start;
+ ilast = interval->last;
+ iref_cnt = interval->ref_cnt;
+ iflags = interval->flags;
+
+ if (istart < lpivot) {
+ MAKE_NODE_AND_APPEND(tmp, istart, lpivot - 1, iref_cnt,
+ iflags, err, err_out, &to_add);
+ } else if (istart > lpivot) {
+ MAKE_NODE_AND_APPEND(tmp, lpivot, istart - 1, 1, flags,
+ err, err_out, &to_add);
+ lpivot = istart;
+ } else {
+ lpivot = istart;
+ }
+
+ if (ilast > last) {
+ MAKE_NODE_AND_APPEND(tmp, lpivot, last, iref_cnt + 1,
+ iflags | flags, err, err_out,
+ &to_add);
+ MAKE_NODE_AND_APPEND(tmp, last + 1, ilast, iref_cnt,
+ iflags, err, err_out, &to_add);
+ } else {
+ MAKE_NODE_AND_APPEND(tmp, lpivot, ilast, iref_cnt + 1,
+ iflags | flags, err, err_out,
+ &to_add);
+ }
+
+ lpivot = ilast + 1;
+ }
+
+ if (lpivot <= last)
+ MAKE_NODE_AND_APPEND(tmp, lpivot, last, 1, flags, err, err_out,
+ &to_add);
+
+ list_for_each_entry_safe(interval, tmp, &intersection_set, link) {
+ usnic_uiom_interval_tree_remove(interval, root);
+ kfree(interval);
+ }
+
+ list_for_each_entry(interval, &to_add, link)
+ usnic_uiom_interval_tree_insert(interval, root);
+
+ return 0;
+
+err_out:
+ list_for_each_entry_safe(interval, tmp, &to_add, link)
+ kfree(interval);
+
+ return err;
+}
+
+void usnic_uiom_remove_interval(struct rb_root *root, unsigned long start,
+ unsigned long last, struct list_head *removed)
+{
+ struct usnic_uiom_interval_node *interval;
+
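+	/*
+	 * Drop one reference from every node that overlaps [start, last].
+	 * Nodes whose ref_cnt reaches zero are unlinked from the tree and
+	 * returned to the caller on 'removed'.
+	 */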
+ for (interval = usnic_uiom_interval_tree_iter_first(root, start, last);
+ interval;
+ interval = usnic_uiom_interval_tree_iter_next(interval,
+ start,
+ last)) {
+ if (--interval->ref_cnt == 0)
+ list_add_tail(&interval->link, removed);
+ }
+
+ list_for_each_entry(interval, removed, link)
+ usnic_uiom_interval_tree_remove(interval, root);
+}
+
+INTERVAL_TREE_DEFINE(struct usnic_uiom_interval_node, rb,
+ unsigned long, __subtree_last,
+ START, LAST, , usnic_uiom_interval_tree)
diff --git a/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h
new file mode 100644
index 000000000..d4f752e25
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_uiom_interval_tree.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef USNIC_UIOM_INTERVAL_TREE_H_
+#define USNIC_UIOM_INTERVAL_TREE_H_
+
+#include <linux/rbtree.h>
+
+struct usnic_uiom_interval_node {
+ struct rb_node rb;
+ struct list_head link;
+ unsigned long start;
+ unsigned long last;
+ unsigned long __subtree_last;
+ unsigned int ref_cnt;
+ int flags;
+};
+
+extern void
+usnic_uiom_interval_tree_insert(struct usnic_uiom_interval_node *node,
+ struct rb_root *root);
+extern void
+usnic_uiom_interval_tree_remove(struct usnic_uiom_interval_node *node,
+ struct rb_root *root);
+extern struct usnic_uiom_interval_node *
+usnic_uiom_interval_tree_iter_first(struct rb_root *root,
+ unsigned long start,
+ unsigned long last);
+extern struct usnic_uiom_interval_node *
+usnic_uiom_interval_tree_iter_next(struct usnic_uiom_interval_node *node,
+ unsigned long start, unsigned long last);
+/*
+ * Inserts {start...last} into {root}. If there are overlaps, the existing
+ * nodes are split and merged as needed.
+ */
+int usnic_uiom_insert_interval(struct rb_root *root,
+ unsigned long start, unsigned long last,
+ int flags);
+/*
+ * Removes {start...last} from {root}. The nodes removed are returned in
+ * 'removed'. The caller is responsible for freeing the memory of the nodes
+ * in 'removed'.
+ */
+void usnic_uiom_remove_interval(struct rb_root *root,
+ unsigned long start, unsigned long last,
+ struct list_head *removed);
+/*
+ * Returns {start...last} - {root} (the relative complement of {start...last}
+ * in {root}) in diff_set, sorted in ascending order
+ */
+int usnic_uiom_get_intervals_diff(unsigned long start,
+ unsigned long last, int flags,
+ int flag_mask,
+ struct rb_root *root,
+ struct list_head *diff_set);
+/* Call this to free diff_set returned by usnic_uiom_get_intervals_diff */
+void usnic_uiom_put_interval_set(struct list_head *intervals);
+#endif /* USNIC_UIOM_INTERVAL_TREE_H_ */
diff --git a/drivers/infiniband/hw/usnic/usnic_vnic.c b/drivers/infiniband/hw/usnic/usnic_vnic.c
new file mode 100644
index 000000000..656b88c39
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_vnic.c
@@ -0,0 +1,467 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+
+#include "usnic_ib.h"
+#include "vnic_resource.h"
+#include "usnic_log.h"
+#include "usnic_vnic.h"
+
+struct usnic_vnic {
+ struct vnic_dev *vdev;
+ struct vnic_dev_bar bar[PCI_NUM_RESOURCES];
+ struct usnic_vnic_res_chunk chunks[USNIC_VNIC_RES_TYPE_MAX];
+ spinlock_t res_lock;
+};
+
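+/*
+ * Translate a usnic resource type to the underlying vnic resource type.
+ * The lookup table is generated by expanding the USNIC_VNIC_RES_TYPES
+ * X-macro list with temporary DEFINE_USNIC_VNIC_RES* definitions.
+ */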
+static enum vnic_res_type _to_vnic_res_type(enum usnic_vnic_res_type res_type)
+{
+#define DEFINE_USNIC_VNIC_RES_AT(usnic_vnic_res_t, vnic_res_type, desc, val) \
+ vnic_res_type,
+#define DEFINE_USNIC_VNIC_RES(usnic_vnic_res_t, vnic_res_type, desc) \
+ vnic_res_type,
+ static enum vnic_res_type usnic_vnic_type_2_vnic_type[] = {
+ USNIC_VNIC_RES_TYPES};
+#undef DEFINE_USNIC_VNIC_RES
+#undef DEFINE_USNIC_VNIC_RES_AT
+
+ if (res_type >= USNIC_VNIC_RES_TYPE_MAX)
+ return RES_TYPE_MAX;
+
+ return usnic_vnic_type_2_vnic_type[res_type];
+}
+
+const char *usnic_vnic_res_type_to_str(enum usnic_vnic_res_type res_type)
+{
+#define DEFINE_USNIC_VNIC_RES_AT(usnic_vnic_res_t, vnic_res_type, desc, val) \
+ desc,
+#define DEFINE_USNIC_VNIC_RES(usnic_vnic_res_t, vnic_res_type, desc) \
+ desc,
+ static const char * const usnic_vnic_res_type_desc[] = {
+ USNIC_VNIC_RES_TYPES};
+#undef DEFINE_USNIC_VNIC_RES
+#undef DEFINE_USNIC_VNIC_RES_AT
+
+ if (res_type >= USNIC_VNIC_RES_TYPE_MAX)
+ return "unknown";
+
+ return usnic_vnic_res_type_desc[res_type];
+
+}
+
+const char *usnic_vnic_pci_name(struct usnic_vnic *vnic)
+{
+ return pci_name(usnic_vnic_get_pdev(vnic));
+}
+
+int usnic_vnic_dump(struct usnic_vnic *vnic, char *buf,
+ int buf_sz,
+ void *hdr_obj,
+ int (*printtitle)(void *, char*, int),
+ int (*printcols)(char *, int),
+ int (*printrow)(void *, char *, int))
+{
+ struct usnic_vnic_res_chunk *chunk;
+ struct usnic_vnic_res *res;
+ struct vnic_dev_bar *bar0;
+ int i, j, offset;
+
+ offset = 0;
+ bar0 = usnic_vnic_get_bar(vnic, 0);
+ offset += scnprintf(buf + offset, buf_sz - offset,
+ "VF:%hu BAR0 bus_addr=%pa vaddr=0x%p size=%ld ",
+ usnic_vnic_get_index(vnic),
+ &bar0->bus_addr,
+ bar0->vaddr, bar0->len);
+ if (printtitle)
+ offset += printtitle(hdr_obj, buf + offset, buf_sz - offset);
+ offset += scnprintf(buf + offset, buf_sz - offset, "\n");
+ offset += scnprintf(buf + offset, buf_sz - offset,
+ "|RES\t|CTRL_PIN\t\t|IN_USE\t");
+ if (printcols)
+ offset += printcols(buf + offset, buf_sz - offset);
+ offset += scnprintf(buf + offset, buf_sz - offset, "\n");
+
+ spin_lock(&vnic->res_lock);
+ for (i = 0; i < ARRAY_SIZE(vnic->chunks); i++) {
+ chunk = &vnic->chunks[i];
+ for (j = 0; j < chunk->cnt; j++) {
+ res = chunk->res[j];
+ offset += scnprintf(buf + offset, buf_sz - offset,
+ "|%s[%u]\t|0x%p\t|%u\t",
+ usnic_vnic_res_type_to_str(res->type),
+ res->vnic_idx, res->ctrl, !!res->owner);
+ if (printrow) {
+ offset += printrow(res->owner, buf + offset,
+ buf_sz - offset);
+ }
+ offset += scnprintf(buf + offset, buf_sz - offset,
+ "\n");
+ }
+ }
+ spin_unlock(&vnic->res_lock);
+ return offset;
+}
+
+void usnic_vnic_res_spec_update(struct usnic_vnic_res_spec *spec,
+ enum usnic_vnic_res_type trgt_type,
+ u16 cnt)
+{
+ int i;
+
+ for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) {
+ if (spec->resources[i].type == trgt_type) {
+ spec->resources[i].cnt = cnt;
+ return;
+ }
+ }
+
+ WARN_ON(1);
+}
+
+int usnic_vnic_res_spec_satisfied(const struct usnic_vnic_res_spec *min_spec,
+ struct usnic_vnic_res_spec *res_spec)
+{
+ int found, i, j;
+
+ for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) {
+ found = 0;
+
+ for (j = 0; j < USNIC_VNIC_RES_TYPE_MAX; j++) {
+ if (res_spec->resources[i].type !=
+ min_spec->resources[i].type)
+ continue;
+ found = 1;
+ if (min_spec->resources[i].cnt >
+ res_spec->resources[i].cnt)
+ return -EINVAL;
+ break;
+ }
+
+ if (!found)
+ return -EINVAL;
+ }
+ return 0;
+}
+
+int usnic_vnic_spec_dump(char *buf, int buf_sz,
+ struct usnic_vnic_res_spec *res_spec)
+{
+ enum usnic_vnic_res_type res_type;
+ int res_cnt;
+ int i;
+ int offset = 0;
+
+ for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) {
+ res_type = res_spec->resources[i].type;
+ res_cnt = res_spec->resources[i].cnt;
+ offset += scnprintf(buf + offset, buf_sz - offset,
+ "Res: %s Cnt: %d ",
+ usnic_vnic_res_type_to_str(res_type),
+ res_cnt);
+ }
+
+ return offset;
+}
+
+int usnic_vnic_check_room(struct usnic_vnic *vnic,
+ struct usnic_vnic_res_spec *res_spec)
+{
+ int i;
+ enum usnic_vnic_res_type res_type;
+ int res_cnt;
+
+ for (i = 0; i < USNIC_VNIC_RES_TYPE_MAX; i++) {
+ res_type = res_spec->resources[i].type;
+ res_cnt = res_spec->resources[i].cnt;
+
+ if (res_type == USNIC_VNIC_RES_TYPE_EOL)
+ break;
+
+ if (res_cnt > usnic_vnic_res_free_cnt(vnic, res_type))
+ return -EBUSY;
+ }
+
+ return 0;
+}
+
+int usnic_vnic_res_cnt(struct usnic_vnic *vnic,
+ enum usnic_vnic_res_type type)
+{
+ return vnic->chunks[type].cnt;
+}
+
+int usnic_vnic_res_free_cnt(struct usnic_vnic *vnic,
+ enum usnic_vnic_res_type type)
+{
+ return vnic->chunks[type].free_cnt;
+}
+
+struct usnic_vnic_res_chunk *
+usnic_vnic_get_resources(struct usnic_vnic *vnic, enum usnic_vnic_res_type type,
+ int cnt, void *owner)
+{
+ struct usnic_vnic_res_chunk *src, *ret;
+ struct usnic_vnic_res *res;
+ int i;
+
+ if (usnic_vnic_res_free_cnt(vnic, type) < cnt || cnt < 1 || !owner)
+ return ERR_PTR(-EINVAL);
+
+ ret = kzalloc(sizeof(*ret), GFP_ATOMIC);
+ if (!ret) {
+ usnic_err("Failed to allocate chunk for %s - Out of memory\n",
+ usnic_vnic_pci_name(vnic));
+ return ERR_PTR(-ENOMEM);
+ }
+
+ ret->res = kzalloc(sizeof(*(ret->res))*cnt, GFP_ATOMIC);
+ if (!ret->res) {
+ usnic_err("Failed to allocate resources for %s. Out of memory\n",
+ usnic_vnic_pci_name(vnic));
+ kfree(ret);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ spin_lock(&vnic->res_lock);
+ src = &vnic->chunks[type];
+ for (i = 0; i < src->cnt && ret->cnt < cnt; i++) {
+ res = src->res[i];
+ if (!res->owner) {
+ src->free_cnt--;
+ res->owner = owner;
+ ret->res[ret->cnt++] = res;
+ }
+ }
+
+ spin_unlock(&vnic->res_lock);
+ ret->type = type;
+ ret->vnic = vnic;
+ WARN_ON(ret->cnt != cnt);
+
+ return ret;
+}
+
+void usnic_vnic_put_resources(struct usnic_vnic_res_chunk *chunk)
+{
+ struct usnic_vnic_res *res;
+ int i;
+ struct usnic_vnic *vnic = chunk->vnic;
+
+ spin_lock(&vnic->res_lock);
+ while ((i = --chunk->cnt) >= 0) {
+ res = chunk->res[i];
+ chunk->res[i] = NULL;
+ res->owner = NULL;
+ vnic->chunks[res->type].free_cnt++;
+ }
+ spin_unlock(&vnic->res_lock);
+
+ kfree(chunk->res);
+ kfree(chunk);
+}
+
+u16 usnic_vnic_get_index(struct usnic_vnic *vnic)
+{
+ return usnic_vnic_get_pdev(vnic)->devfn - 1;
+}
+
+static int usnic_vnic_alloc_res_chunk(struct usnic_vnic *vnic,
+ enum usnic_vnic_res_type type,
+ struct usnic_vnic_res_chunk *chunk)
+{
+ int cnt, err, i;
+ struct usnic_vnic_res *res;
+
+ cnt = vnic_dev_get_res_count(vnic->vdev, _to_vnic_res_type(type));
+ if (cnt < 1)
+ return -EINVAL;
+
+ chunk->cnt = chunk->free_cnt = cnt;
+ chunk->res = kzalloc(sizeof(*(chunk->res))*cnt, GFP_KERNEL);
+ if (!chunk->res)
+ return -ENOMEM;
+
+ for (i = 0; i < cnt; i++) {
+ res = kzalloc(sizeof(*res), GFP_KERNEL);
+ if (!res) {
+ err = -ENOMEM;
+ goto fail;
+ }
+ res->type = type;
+ res->vnic_idx = i;
+ res->vnic = vnic;
+ res->ctrl = vnic_dev_get_res(vnic->vdev,
+ _to_vnic_res_type(type), i);
+ chunk->res[i] = res;
+ }
+
+ chunk->vnic = vnic;
+ return 0;
+fail:
+ for (i--; i >= 0; i--)
+ kfree(chunk->res[i]);
+ kfree(chunk->res);
+ return err;
+}
+
+static void usnic_vnic_free_res_chunk(struct usnic_vnic_res_chunk *chunk)
+{
+ int i;
+ for (i = 0; i < chunk->cnt; i++)
+ kfree(chunk->res[i]);
+ kfree(chunk->res);
+}
+
+static int usnic_vnic_discover_resources(struct pci_dev *pdev,
+ struct usnic_vnic *vnic)
+{
+ enum usnic_vnic_res_type res_type;
+ int i;
+ int err = 0;
+
+ for (i = 0; i < ARRAY_SIZE(vnic->bar); i++) {
+ if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM))
+ continue;
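+	/*
+	 * Under the PD lock: find the parts of [vpn_start, vpn_last] that
+	 * are not yet mapped with the required permission, IOMMU-map only
+	 * those, and then record the full interval in the PD's tree.
+	 */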
+ vnic->bar[i].len = pci_resource_len(pdev, i);
+ vnic->bar[i].vaddr = pci_iomap(pdev, i, vnic->bar[i].len);
+ if (!vnic->bar[i].vaddr) {
+ usnic_err("Cannot memory-map BAR %d, aborting\n",
+ i);
+ err = -ENODEV;
+ goto out_clean_bar;
+ }
+ vnic->bar[i].bus_addr = pci_resource_start(pdev, i);
+ }
+
+ vnic->vdev = vnic_dev_register(NULL, pdev, pdev, vnic->bar,
+ ARRAY_SIZE(vnic->bar));
+ if (!vnic->vdev) {
+ usnic_err("Failed to register device %s\n",
+ pci_name(pdev));
+ err = -EINVAL;
+ goto out_clean_bar;
+ }
+
+ for (res_type = USNIC_VNIC_RES_TYPE_EOL + 1;
+ res_type < USNIC_VNIC_RES_TYPE_MAX; res_type++) {
+ err = usnic_vnic_alloc_res_chunk(vnic, res_type,
+ &vnic->chunks[res_type]);
+ if (err) {
+ usnic_err("Failed to alloc res %s with err %d\n",
+ usnic_vnic_res_type_to_str(res_type),
+ err);
+ goto out_clean_chunks;
+ }
+ }
+
+ return 0;
+
+out_clean_chunks:
+ for (res_type--; res_type > USNIC_VNIC_RES_TYPE_EOL; res_type--)
+ usnic_vnic_free_res_chunk(&vnic->chunks[res_type]);
+ vnic_dev_unregister(vnic->vdev);
+out_clean_bar:
+ for (i = 0; i < ARRAY_SIZE(vnic->bar); i++) {
+ if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM))
+ continue;
+ if (!vnic->bar[i].vaddr)
+ break;
+
+ iounmap(vnic->bar[i].vaddr);
+ }
+
+ return err;
+}
+
+struct pci_dev *usnic_vnic_get_pdev(struct usnic_vnic *vnic)
+{
+ return vnic_dev_get_pdev(vnic->vdev);
+}
+
+struct vnic_dev_bar *usnic_vnic_get_bar(struct usnic_vnic *vnic,
+ int bar_num)
+{
+ return (bar_num < ARRAY_SIZE(vnic->bar)) ? &vnic->bar[bar_num] : NULL;
+}
+
+static void usnic_vnic_release_resources(struct usnic_vnic *vnic)
+{
+ int i;
+ struct pci_dev *pdev;
+ enum usnic_vnic_res_type res_type;
+
+ pdev = usnic_vnic_get_pdev(vnic);
+
+ for (res_type = USNIC_VNIC_RES_TYPE_EOL + 1;
+ res_type < USNIC_VNIC_RES_TYPE_MAX; res_type++)
+ usnic_vnic_free_res_chunk(&vnic->chunks[res_type]);
+
+ vnic_dev_unregister(vnic->vdev);
+
+ for (i = 0; i < ARRAY_SIZE(vnic->bar); i++) {
+ if (!(pci_resource_flags(pdev, i) & IORESOURCE_MEM))
+ continue;
+ iounmap(vnic->bar[i].vaddr);
+ }
+}
+
+struct usnic_vnic *usnic_vnic_alloc(struct pci_dev *pdev)
+{
+ struct usnic_vnic *vnic;
+ int err = 0;
+
+ if (!pci_is_enabled(pdev)) {
+ usnic_err("PCI dev %s is disabled\n", pci_name(pdev));
+ return ERR_PTR(-EINVAL);
+ }
+
+ vnic = kzalloc(sizeof(*vnic), GFP_KERNEL);
+ if (!vnic) {
+ usnic_err("Failed to alloc vnic for %s - out of memory\n",
+ pci_name(pdev));
+ return ERR_PTR(-ENOMEM);
+ }
+
+ spin_lock_init(&vnic->res_lock);
+
+ err = usnic_vnic_discover_resources(pdev, vnic);
+ if (err) {
+ usnic_err("Failed to discover %s resources with err %d\n",
+ pci_name(pdev), err);
+ goto out_free_vnic;
+ }
+
+ usnic_dbg("Allocated vnic for %s\n", usnic_vnic_pci_name(vnic));
+
+ return vnic;
+
+out_free_vnic:
+ kfree(vnic);
+
+ return ERR_PTR(err);
+}
+
+void usnic_vnic_free(struct usnic_vnic *vnic)
+{
+ usnic_vnic_release_resources(vnic);
+ kfree(vnic);
+}
diff --git a/drivers/infiniband/hw/usnic/usnic_vnic.h b/drivers/infiniband/hw/usnic/usnic_vnic.h
new file mode 100644
index 000000000..14d931a88
--- /dev/null
+++ b/drivers/infiniband/hw/usnic/usnic_vnic.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (c) 2013, Cisco Systems, Inc. All rights reserved.
+ *
+ * This program is free software; you may redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef USNIC_VNIC_H_
+#define USNIC_VNIC_H_
+
+#include <linux/pci.h>
+
+#include "vnic_dev.h"
+
+/* =USNIC_VNIC_RES_TYPE= =VNIC_RES= =DESC= */
+#define USNIC_VNIC_RES_TYPES \
+ DEFINE_USNIC_VNIC_RES_AT(EOL, RES_TYPE_EOL, "EOL", 0) \
+ DEFINE_USNIC_VNIC_RES(WQ, RES_TYPE_WQ, "WQ") \
+ DEFINE_USNIC_VNIC_RES(RQ, RES_TYPE_RQ, "RQ") \
+ DEFINE_USNIC_VNIC_RES(CQ, RES_TYPE_CQ, "CQ") \
+ DEFINE_USNIC_VNIC_RES(INTR, RES_TYPE_INTR_CTRL, "INT") \
+ DEFINE_USNIC_VNIC_RES(MAX, RES_TYPE_MAX, "MAX")\
+
+#define DEFINE_USNIC_VNIC_RES_AT(usnic_vnic_res_t, vnic_res_type, desc, val) \
+ USNIC_VNIC_RES_TYPE_##usnic_vnic_res_t = val,
+#define DEFINE_USNIC_VNIC_RES(usnic_vnic_res_t, vnic_res_type, desc) \
+ USNIC_VNIC_RES_TYPE_##usnic_vnic_res_t,
+enum usnic_vnic_res_type {
+ USNIC_VNIC_RES_TYPES
+};
+#undef DEFINE_USNIC_VNIC_RES
+#undef DEFINE_USNIC_VNIC_RES_AT
+
+struct usnic_vnic_res {
+ enum usnic_vnic_res_type type;
+ unsigned int vnic_idx;
+ struct usnic_vnic *vnic;
+ void __iomem *ctrl;
+ void *owner;
+};
+
+struct usnic_vnic_res_chunk {
+ enum usnic_vnic_res_type type;
+ int cnt;
+ int free_cnt;
+ struct usnic_vnic_res **res;
+ struct usnic_vnic *vnic;
+};
+
+struct usnic_vnic_res_desc {
+ enum usnic_vnic_res_type type;
+ uint16_t cnt;
+};
+
+struct usnic_vnic_res_spec {
+ struct usnic_vnic_res_desc resources[USNIC_VNIC_RES_TYPE_MAX];
+};
+
+const char *usnic_vnic_res_type_to_str(enum usnic_vnic_res_type res_type);
+const char *usnic_vnic_pci_name(struct usnic_vnic *vnic);
+int usnic_vnic_dump(struct usnic_vnic *vnic, char *buf, int buf_sz,
+ void *hdr_obj,
+ int (*printtitle)(void *, char*, int),
+ int (*printcols)(char *, int),
+ int (*printrow)(void *, char *, int));
+void usnic_vnic_res_spec_update(struct usnic_vnic_res_spec *spec,
+ enum usnic_vnic_res_type trgt_type,
+ u16 cnt);
+int usnic_vnic_res_spec_satisfied(const struct usnic_vnic_res_spec *min_spec,
+ struct usnic_vnic_res_spec *res_spec);
+int usnic_vnic_spec_dump(char *buf, int buf_sz,
+ struct usnic_vnic_res_spec *res_spec);
+int usnic_vnic_check_room(struct usnic_vnic *vnic,
+ struct usnic_vnic_res_spec *res_spec);
+int usnic_vnic_res_cnt(struct usnic_vnic *vnic,
+ enum usnic_vnic_res_type type);
+int usnic_vnic_res_free_cnt(struct usnic_vnic *vnic,
+ enum usnic_vnic_res_type type);
+struct usnic_vnic_res_chunk *
+usnic_vnic_get_resources(struct usnic_vnic *vnic,
+ enum usnic_vnic_res_type type,
+ int cnt,
+ void *owner);
+void usnic_vnic_put_resources(struct usnic_vnic_res_chunk *chunk);
+struct pci_dev *usnic_vnic_get_pdev(struct usnic_vnic *vnic);
+struct vnic_dev_bar *usnic_vnic_get_bar(struct usnic_vnic *vnic,
+ int bar_num);
+struct usnic_vnic *usnic_vnic_alloc(struct pci_dev *pdev);
+void usnic_vnic_free(struct usnic_vnic *vnic);
+u16 usnic_vnic_get_index(struct usnic_vnic *vnic);
+
+#endif /*!USNIC_VNIC_H_*/
diff --git a/drivers/infiniband/ulp/Makefile b/drivers/infiniband/ulp/Makefile
new file mode 100644
index 000000000..f3c7dcf03
--- /dev/null
+++ b/drivers/infiniband/ulp/Makefile
@@ -0,0 +1,5 @@
+obj-$(CONFIG_INFINIBAND_IPOIB) += ipoib/
+obj-$(CONFIG_INFINIBAND_SRP) += srp/
+obj-$(CONFIG_INFINIBAND_SRPT) += srpt/
+obj-$(CONFIG_INFINIBAND_ISER) += iser/
+obj-$(CONFIG_INFINIBAND_ISERT) += isert/
diff --git a/drivers/infiniband/ulp/ipoib/Kconfig b/drivers/infiniband/ulp/ipoib/Kconfig
new file mode 100644
index 000000000..cda8eac55
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/Kconfig
@@ -0,0 +1,49 @@
+config INFINIBAND_IPOIB
+ tristate "IP-over-InfiniBand"
+ depends on NETDEVICES && INET && (IPV6 || IPV6=n)
+ ---help---
+ Support for the IP-over-InfiniBand protocol (IPoIB). This
+ transports IP packets over InfiniBand so you can use your IB
+ device as a fancy NIC.
+
+ See Documentation/infiniband/ipoib.txt for more information
+
+config INFINIBAND_IPOIB_CM
+ bool "IP-over-InfiniBand Connected Mode support"
+ depends on INFINIBAND_IPOIB
+ default n
+ ---help---
+ This option enables support for IPoIB connected mode. After
+ enabling this option, you need to switch to connected mode
+ through /sys/class/net/ibXXX/mode to actually create
+ connections, and then increase the interface MTU with
+ e.g. ifconfig ib0 mtu 65520.
+
+ WARNING: Enabling connected mode will trigger some packet
+ drops for multicast and UD mode traffic from this interface,
+ unless you limit mtu for these destinations to 2044.
+
+config INFINIBAND_IPOIB_DEBUG
+ bool "IP-over-InfiniBand debugging" if EXPERT
+ depends on INFINIBAND_IPOIB
+ default y
+ ---help---
+ This option causes debugging code to be compiled into the
+ IPoIB driver. The output can be turned on via the
+ debug_level and mcast_debug_level module parameters (which
+ can also be set after the driver is loaded through sysfs).
+
+ This option also creates a directory tree under ipoib/ in
+ debugfs, which contains files that expose debugging
+ information about IB multicast groups used by the IPoIB
+ driver.
+
+config INFINIBAND_IPOIB_DEBUG_DATA
+ bool "IP-over-InfiniBand data path debugging"
+ depends on INFINIBAND_IPOIB_DEBUG
+ ---help---
+ This option compiles debugging code into the data path
+ of the IPoIB driver. The output can be turned on via the
+ data_debug_level module parameter; however, even with output
+ turned off, this debugging code will have some performance
+ impact.
diff --git a/drivers/infiniband/ulp/ipoib/Makefile b/drivers/infiniband/ulp/ipoib/Makefile
new file mode 100644
index 000000000..e5430dd50
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/Makefile
@@ -0,0 +1,12 @@
+obj-$(CONFIG_INFINIBAND_IPOIB) += ib_ipoib.o
+
+ib_ipoib-y := ipoib_main.o \
+ ipoib_ib.o \
+ ipoib_multicast.o \
+ ipoib_verbs.o \
+ ipoib_vlan.o \
+ ipoib_ethtool.o \
+ ipoib_netlink.o
+ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_CM) += ipoib_cm.o
+ib_ipoib-$(CONFIG_INFINIBAND_IPOIB_DEBUG) += ipoib_fs.o
+
diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h
new file mode 100644
index 000000000..bd94b0a6e
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib.h
@@ -0,0 +1,780 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _IPOIB_H
+#define _IPOIB_H
+
+#include <linux/list.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/workqueue.h>
+#include <linux/kref.h>
+#include <linux/if_infiniband.h>
+#include <linux/mutex.h>
+
+#include <net/neighbour.h>
+#include <net/sch_generic.h>
+
+#include <linux/atomic.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_pack.h>
+#include <rdma/ib_sa.h>
+#include <linux/sched.h>
+
+/* constants */
+
+enum ipoib_flush_level {
+ IPOIB_FLUSH_LIGHT,
+ IPOIB_FLUSH_NORMAL,
+ IPOIB_FLUSH_HEAVY
+};
+
+enum {
+ IPOIB_ENCAP_LEN = 4,
+
+ IPOIB_UD_HEAD_SIZE = IB_GRH_BYTES + IPOIB_ENCAP_LEN,
+ IPOIB_UD_RX_SG = 2, /* max buffer needed for 4K mtu */
+
+ IPOIB_CM_MTU = 0x10000 - 0x10, /* padding to align header to 16 */
+ IPOIB_CM_BUF_SIZE = IPOIB_CM_MTU + IPOIB_ENCAP_LEN,
+ IPOIB_CM_HEAD_SIZE = IPOIB_CM_BUF_SIZE % PAGE_SIZE,
+ IPOIB_CM_RX_SG = ALIGN(IPOIB_CM_BUF_SIZE, PAGE_SIZE) / PAGE_SIZE,
+ IPOIB_RX_RING_SIZE = 256,
+ IPOIB_TX_RING_SIZE = 128,
+ IPOIB_MAX_QUEUE_SIZE = 8192,
+ IPOIB_MIN_QUEUE_SIZE = 2,
+ IPOIB_CM_MAX_CONN_QP = 4096,
+
+ IPOIB_NUM_WC = 4,
+
+ IPOIB_MAX_PATH_REC_QUEUE = 3,
+ IPOIB_MAX_MCAST_QUEUE = 3,
+
+ IPOIB_FLAG_OPER_UP = 0,
+ IPOIB_FLAG_INITIALIZED = 1,
+ IPOIB_FLAG_ADMIN_UP = 2,
+ IPOIB_PKEY_ASSIGNED = 3,
+ IPOIB_FLAG_SUBINTERFACE = 5,
+ IPOIB_STOP_REAPER = 7,
+ IPOIB_FLAG_ADMIN_CM = 9,
+ IPOIB_FLAG_UMCAST = 10,
+ IPOIB_STOP_NEIGH_GC = 11,
+ IPOIB_NEIGH_TBL_FLUSH = 12,
+
+ IPOIB_MAX_BACKOFF_SECONDS = 16,
+
+ IPOIB_MCAST_FLAG_FOUND = 0, /* used in set_multicast_list */
+ IPOIB_MCAST_FLAG_SENDONLY = 1,
+ /*
+	 * IPOIB_MCAST_FLAG_BUSY:
+	 * - when set, a join is in flight and mcast->mc is unreliable
+	 * - when clear and mcast->mc is IS_ERR_OR_NULL, the join needs to
+	 *   be restarted or has not started yet
+	 * - when clear and mcast->mc is a valid pointer, the join succeeded
+ */
+ IPOIB_MCAST_FLAG_BUSY = 2,
+ IPOIB_MCAST_FLAG_ATTACHED = 3,
+
+ MAX_SEND_CQE = 16,
+ IPOIB_CM_COPYBREAK = 256,
+
+ IPOIB_NON_CHILD = 0,
+ IPOIB_LEGACY_CHILD = 1,
+ IPOIB_RTNL_CHILD = 2,
+};
+
+#define IPOIB_OP_RECV (1ul << 31)
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+#define IPOIB_OP_CM (1ul << 30)
+#else
+#define IPOIB_OP_CM (0)
+#endif
+
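+/*
+ * The high byte of the 4-byte IPoIB address prefix carries flags; the low
+ * three bytes hold the 24-bit QPN, which this mask selects from the
+ * big-endian word.
+ */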
+#define IPOIB_QPN_MASK ((__force u32) cpu_to_be32(0xFFFFFF))
+
+/* structs */
+
+struct ipoib_header {
+ __be16 proto;
+ u16 reserved;
+};
+
+struct ipoib_cb {
+ struct qdisc_skb_cb qdisc_cb;
+ u8 hwaddr[INFINIBAND_ALEN];
+};
+
+static inline struct ipoib_cb *ipoib_skb_cb(const struct sk_buff *skb)
+{
+ BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct ipoib_cb));
+ return (struct ipoib_cb *)skb->cb;
+}
+
+/* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */
+struct ipoib_mcast {
+ struct ib_sa_mcmember_rec mcmember;
+ struct ib_sa_multicast *mc;
+ struct ipoib_ah *ah;
+
+ struct rb_node rb_node;
+ struct list_head list;
+
+ unsigned long created;
+ unsigned long backoff;
+ unsigned long delay_until;
+
+ unsigned long flags;
+ unsigned char logcount;
+
+ struct list_head neigh_list;
+
+ struct sk_buff_head pkt_queue;
+
+ struct net_device *dev;
+ struct completion done;
+};
+
+struct ipoib_rx_buf {
+ struct sk_buff *skb;
+ u64 mapping[IPOIB_UD_RX_SG];
+};
+
+struct ipoib_tx_buf {
+ struct sk_buff *skb;
+ u64 mapping[MAX_SKB_FRAGS + 1];
+};
+
+struct ipoib_cm_tx_buf {
+ struct sk_buff *skb;
+ u64 mapping;
+};
+
+struct ib_cm_id;
+
+struct ipoib_cm_data {
+ __be32 qpn; /* High byte MUST be ignored on receive */
+ __be32 mtu;
+};
+
+/*
+ * Quoting 10.3.1 Queue Pair and EE Context States:
+ *
+ * Note, for QPs that are associated with an SRQ, the Consumer should take the
+ * QP through the Error State before invoking a Destroy QP or a Modify QP to the
+ * Reset State. The Consumer may invoke the Destroy QP without first performing
+ * a Modify QP to the Error State and waiting for the Affiliated Asynchronous
+ * Last WQE Reached Event. However, if the Consumer does not wait for the
+ * Affiliated Asynchronous Last WQE Reached Event, then WQE and Data Segment
+ * leakage may occur. Therefore, it is good programming practice to tear down a
+ * QP that is associated with an SRQ by using the following process:
+ *
+ * - Put the QP in the Error State
+ * - Wait for the Affiliated Asynchronous Last WQE Reached Event;
+ * - either:
+ * drain the CQ by invoking the Poll CQ verb and either wait for CQ
+ * to be empty or the number of Poll CQ operations has exceeded
+ * CQ capacity size;
+ * - or
+ * post another WR that completes on the same CQ and wait for this
+ * WR to return as a WC;
+ * - and then invoke a Destroy QP or Reset QP.
+ *
+ * We use the second option and wait for a completion on the
+ * same CQ before destroying QPs attached to our SRQ.
+ */
+
+enum ipoib_cm_state {
+ IPOIB_CM_RX_LIVE,
+ IPOIB_CM_RX_ERROR, /* Ignored by stale task */
+ IPOIB_CM_RX_FLUSH /* Last WQE Reached event observed */
+};
+
+struct ipoib_cm_rx {
+ struct ib_cm_id *id;
+ struct ib_qp *qp;
+ struct ipoib_cm_rx_buf *rx_ring;
+ struct list_head list;
+ struct net_device *dev;
+ unsigned long jiffies;
+ enum ipoib_cm_state state;
+ int recv_count;
+};
+
+struct ipoib_cm_tx {
+ struct ib_cm_id *id;
+ struct ib_qp *qp;
+ struct list_head list;
+ struct net_device *dev;
+ struct ipoib_neigh *neigh;
+ struct ipoib_path *path;
+ struct ipoib_cm_tx_buf *tx_ring;
+ unsigned tx_head;
+ unsigned tx_tail;
+ unsigned long flags;
+ u32 mtu;
+};
+
+struct ipoib_cm_rx_buf {
+ struct sk_buff *skb;
+ u64 mapping[IPOIB_CM_RX_SG];
+};
+
+struct ipoib_cm_dev_priv {
+ struct ib_srq *srq;
+ struct ipoib_cm_rx_buf *srq_ring;
+ struct ib_cm_id *id;
+ struct list_head passive_ids; /* state: LIVE */
+ struct list_head rx_error_list; /* state: ERROR */
+ struct list_head rx_flush_list; /* state: FLUSH, drain not started */
+ struct list_head rx_drain_list; /* state: FLUSH, drain started */
+ struct list_head rx_reap_list; /* state: FLUSH, drain done */
+ struct work_struct start_task;
+ struct work_struct reap_task;
+ struct work_struct skb_task;
+ struct work_struct rx_reap_task;
+ struct delayed_work stale_task;
+ struct sk_buff_head skb_queue;
+ struct list_head start_list;
+ struct list_head reap_list;
+ struct ib_wc ibwc[IPOIB_NUM_WC];
+ struct ib_sge rx_sge[IPOIB_CM_RX_SG];
+ struct ib_recv_wr rx_wr;
+ int nonsrq_conn_qp;
+ int max_cm_mtu;
+ int num_frags;
+};
+
+struct ipoib_ethtool_st {
+ u16 coalesce_usecs;
+ u16 max_coalesced_frames;
+};
+
+struct ipoib_neigh_table;
+
+struct ipoib_neigh_hash {
+ struct ipoib_neigh_table *ntbl;
+ struct ipoib_neigh __rcu **buckets;
+ struct rcu_head rcu;
+ u32 mask;
+ u32 size;
+};
+
+struct ipoib_neigh_table {
+ struct ipoib_neigh_hash __rcu *htbl;
+ atomic_t entries;
+ struct completion flushed;
+ struct completion deleted;
+};
+
+struct ipoib_qp_state_validate {
+ struct work_struct work;
+ struct ipoib_dev_priv *priv;
+};
+
+/*
+ * Device private locking: network stack tx_lock protects members used
+ * in TX fast path, lock protects everything else. lock nests inside
+ * of tx_lock (ie tx_lock must be acquired first if needed).
+ */
+struct ipoib_dev_priv {
+ spinlock_t lock;
+
+ struct net_device *dev;
+
+ struct napi_struct napi;
+
+ unsigned long flags;
+
+ struct rw_semaphore vlan_rwsem;
+
+ struct rb_root path_tree;
+ struct list_head path_list;
+
+ struct ipoib_neigh_table ntbl;
+
+ struct ipoib_mcast *broadcast;
+ struct list_head multicast_list;
+ struct rb_root multicast_tree;
+
+ struct workqueue_struct *wq;
+ struct delayed_work mcast_task;
+ struct work_struct carrier_on_task;
+ struct work_struct flush_light;
+ struct work_struct flush_normal;
+ struct work_struct flush_heavy;
+ struct work_struct restart_task;
+ struct delayed_work ah_reap_task;
+ struct delayed_work neigh_reap_task;
+ struct ib_device *ca;
+ u8 port;
+ u16 pkey;
+ u16 pkey_index;
+ struct ib_pd *pd;
+ struct ib_mr *mr;
+ struct ib_cq *recv_cq;
+ struct ib_cq *send_cq;
+ struct ib_qp *qp;
+ u32 qkey;
+
+ union ib_gid local_gid;
+ u16 local_lid;
+
+ unsigned int admin_mtu;
+ unsigned int mcast_mtu;
+ unsigned int max_ib_mtu;
+
+ struct ipoib_rx_buf *rx_ring;
+
+ struct ipoib_tx_buf *tx_ring;
+ unsigned tx_head;
+ unsigned tx_tail;
+ struct ib_sge tx_sge[MAX_SKB_FRAGS + 1];
+ struct ib_send_wr tx_wr;
+ unsigned tx_outstanding;
+ struct ib_wc send_wc[MAX_SEND_CQE];
+
+ struct ib_recv_wr rx_wr;
+ struct ib_sge rx_sge[IPOIB_UD_RX_SG];
+
+ struct ib_wc ibwc[IPOIB_NUM_WC];
+
+ struct list_head dead_ahs;
+
+ struct ib_event_handler event_handler;
+
+ struct net_device *parent;
+ struct list_head child_intfs;
+ struct list_head list;
+ int child_type;
+
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+ struct ipoib_cm_dev_priv cm;
+#endif
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+ struct list_head fs_list;
+ struct dentry *mcg_dentry;
+ struct dentry *path_dentry;
+#endif
+ int hca_caps;
+ struct ipoib_ethtool_st ethtool;
+ struct timer_list poll_timer;
+};
+
+struct ipoib_ah {
+ struct net_device *dev;
+ struct ib_ah *ah;
+ struct list_head list;
+ struct kref ref;
+ unsigned last_send;
+};
+
+struct ipoib_path {
+ struct net_device *dev;
+ struct ib_sa_path_rec pathrec;
+ struct ipoib_ah *ah;
+ struct sk_buff_head queue;
+
+ struct list_head neigh_list;
+
+ int query_id;
+ struct ib_sa_query *query;
+ struct completion done;
+
+ struct rb_node rb_node;
+ struct list_head list;
+ int valid;
+};
+
+struct ipoib_neigh {
+ struct ipoib_ah *ah;
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+ struct ipoib_cm_tx *cm;
+#endif
+ u8 daddr[INFINIBAND_ALEN];
+ struct sk_buff_head queue;
+
+ struct net_device *dev;
+
+ struct list_head list;
+ struct ipoib_neigh __rcu *hnext;
+ struct rcu_head rcu;
+ atomic_t refcnt;
+ unsigned long alive;
+};
+
+#define IPOIB_UD_MTU(ib_mtu) (ib_mtu - IPOIB_ENCAP_LEN)
+#define IPOIB_UD_BUF_SIZE(ib_mtu) (ib_mtu + IB_GRH_BYTES)
+
+void ipoib_neigh_dtor(struct ipoib_neigh *neigh);
+static inline void ipoib_neigh_put(struct ipoib_neigh *neigh)
+{
+ if (atomic_dec_and_test(&neigh->refcnt))
+ ipoib_neigh_dtor(neigh);
+}
+struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr);
+struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr,
+ struct net_device *dev);
+void ipoib_neigh_free(struct ipoib_neigh *neigh);
+void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid);
+
+extern struct workqueue_struct *ipoib_workqueue;
+
+/* functions */
+
+int ipoib_poll(struct napi_struct *napi, int budget);
+void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr);
+void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr);
+
+struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
+ struct ib_pd *pd, struct ib_ah_attr *attr);
+void ipoib_free_ah(struct kref *kref);
+static inline void ipoib_put_ah(struct ipoib_ah *ah)
+{
+ kref_put(&ah->ref, ipoib_free_ah);
+}
+int ipoib_open(struct net_device *dev);
+int ipoib_add_pkey_attr(struct net_device *dev);
+int ipoib_add_umcast_attr(struct net_device *dev);
+
+void ipoib_send(struct net_device *dev, struct sk_buff *skb,
+ struct ipoib_ah *address, u32 qpn);
+void ipoib_reap_ah(struct work_struct *work);
+
+void ipoib_mark_paths_invalid(struct net_device *dev);
+void ipoib_flush_paths(struct net_device *dev);
+struct ipoib_dev_priv *ipoib_intf_alloc(const char *format);
+
+int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
+void ipoib_ib_dev_flush_light(struct work_struct *work);
+void ipoib_ib_dev_flush_normal(struct work_struct *work);
+void ipoib_ib_dev_flush_heavy(struct work_struct *work);
+void ipoib_pkey_event(struct work_struct *work);
+void ipoib_ib_dev_cleanup(struct net_device *dev);
+
+int ipoib_ib_dev_open(struct net_device *dev);
+int ipoib_ib_dev_up(struct net_device *dev);
+int ipoib_ib_dev_down(struct net_device *dev);
+int ipoib_ib_dev_stop(struct net_device *dev);
+void ipoib_pkey_dev_check_presence(struct net_device *dev);
+
+int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port);
+void ipoib_dev_cleanup(struct net_device *dev);
+
+void ipoib_mcast_join_task(struct work_struct *work);
+void ipoib_mcast_carrier_on_task(struct work_struct *work);
+void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb);
+
+void ipoib_mcast_restart_task(struct work_struct *work);
+int ipoib_mcast_start_thread(struct net_device *dev);
+int ipoib_mcast_stop_thread(struct net_device *dev);
+
+void ipoib_mcast_dev_down(struct net_device *dev);
+void ipoib_mcast_dev_flush(struct net_device *dev);
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev);
+int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter);
+void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter,
+ union ib_gid *gid,
+ unsigned long *created,
+ unsigned int *queuelen,
+ unsigned int *complete,
+ unsigned int *send_only);
+
+struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev);
+int ipoib_path_iter_next(struct ipoib_path_iter *iter);
+void ipoib_path_iter_read(struct ipoib_path_iter *iter,
+ struct ipoib_path *path);
+#endif
+
+int ipoib_mcast_attach(struct net_device *dev, u16 mlid,
+ union ib_gid *mgid, int set_qkey);
+
+int ipoib_init_qp(struct net_device *dev);
+int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca);
+void ipoib_transport_dev_cleanup(struct net_device *dev);
+
+void ipoib_event(struct ib_event_handler *handler,
+ struct ib_event *record);
+
+int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey);
+int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey);
+
+int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv,
+ u16 pkey, int child_type);
+
+int __init ipoib_netlink_init(void);
+void __exit ipoib_netlink_fini(void);
+
+void ipoib_set_umcast(struct net_device *ndev, int umcast_val);
+int ipoib_set_mode(struct net_device *dev, const char *buf);
+
+void ipoib_setup(struct net_device *dev);
+
+void ipoib_pkey_open(struct ipoib_dev_priv *priv);
+void ipoib_drain_cq(struct net_device *dev);
+
+void ipoib_set_ethtool_ops(struct net_device *dev);
+int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca);
+
+#define IPOIB_FLAGS_RC 0x80
+#define IPOIB_FLAGS_UC 0x40
+
+/* We don't support UC connections at the moment */
+#define IPOIB_CM_SUPPORTED(ha) (ha[0] & (IPOIB_FLAGS_RC))
+
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+
+extern int ipoib_max_conn_qp;
+
+static inline int ipoib_cm_admin_enabled(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ return IPOIB_CM_SUPPORTED(dev->dev_addr) &&
+ test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
+}
+
+static inline int ipoib_cm_enabled(struct net_device *dev, u8 *hwaddr)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ return IPOIB_CM_SUPPORTED(hwaddr) &&
+ test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
+}
+
+static inline int ipoib_cm_up(struct ipoib_neigh *neigh)
+{
+ return test_bit(IPOIB_FLAG_OPER_UP, &neigh->cm->flags);
+}
+
+static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh)
+{
+ return neigh->cm;
+}
+
+static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx)
+{
+ neigh->cm = tx;
+}
+
+static inline int ipoib_cm_has_srq(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ return !!priv->cm.srq;
+}
+
+static inline unsigned int ipoib_cm_max_mtu(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ return priv->cm.max_cm_mtu;
+}
+
+void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx);
+int ipoib_cm_dev_open(struct net_device *dev);
+void ipoib_cm_dev_stop(struct net_device *dev);
+int ipoib_cm_dev_init(struct net_device *dev);
+int ipoib_cm_add_mode_attr(struct net_device *dev);
+void ipoib_cm_dev_cleanup(struct net_device *dev);
+struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
+ struct ipoib_neigh *neigh);
+void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx);
+void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
+ unsigned int mtu);
+void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc);
+void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc);
+#else
+
+struct ipoib_cm_tx;
+
+#define ipoib_max_conn_qp 0
+
+static inline int ipoib_cm_admin_enabled(struct net_device *dev)
+{
+ return 0;
+}
+static inline int ipoib_cm_enabled(struct net_device *dev, u8 *hwaddr)
+{
+ return 0;
+}
+
+static inline int ipoib_cm_up(struct ipoib_neigh *neigh)
+{
+ return 0;
+}
+
+static inline struct ipoib_cm_tx *ipoib_cm_get(struct ipoib_neigh *neigh)
+{
+ return NULL;
+}
+
+static inline void ipoib_cm_set(struct ipoib_neigh *neigh, struct ipoib_cm_tx *tx)
+{
+}
+
+static inline int ipoib_cm_has_srq(struct net_device *dev)
+{
+ return 0;
+}
+
+static inline unsigned int ipoib_cm_max_mtu(struct net_device *dev)
+{
+ return 0;
+}
+
+static inline
+void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
+{
+ return;
+}
+
+static inline
+int ipoib_cm_dev_open(struct net_device *dev)
+{
+ return 0;
+}
+
+static inline
+void ipoib_cm_dev_stop(struct net_device *dev)
+{
+ return;
+}
+
+static inline
+int ipoib_cm_dev_init(struct net_device *dev)
+{
+ return -ENOSYS;
+}
+
+static inline
+void ipoib_cm_dev_cleanup(struct net_device *dev)
+{
+ return;
+}
+
+static inline
+struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
+ struct ipoib_neigh *neigh)
+{
+ return NULL;
+}
+
+static inline
+void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
+{
+ return;
+}
+
+static inline
+int ipoib_cm_add_mode_attr(struct net_device *dev)
+{
+ return 0;
+}
+
+static inline void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
+ unsigned int mtu)
+{
+ dev_kfree_skb_any(skb);
+}
+
+static inline void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
+{
+}
+
+static inline void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
+{
+}
+#endif
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+void ipoib_create_debug_files(struct net_device *dev);
+void ipoib_delete_debug_files(struct net_device *dev);
+int ipoib_register_debugfs(void);
+void ipoib_unregister_debugfs(void);
+#else
+static inline void ipoib_create_debug_files(struct net_device *dev) { }
+static inline void ipoib_delete_debug_files(struct net_device *dev) { }
+static inline int ipoib_register_debugfs(void) { return 0; }
+static inline void ipoib_unregister_debugfs(void) { }
+#endif
+
+#define ipoib_printk(level, priv, format, arg...) \
+ printk(level "%s: " format, ((struct ipoib_dev_priv *) priv)->dev->name , ## arg)
+#define ipoib_warn(priv, format, arg...) \
+ ipoib_printk(KERN_WARNING, priv, format , ## arg)
+
+extern int ipoib_sendq_size;
+extern int ipoib_recvq_size;
+
+extern struct ib_sa_client ipoib_sa_client;
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+extern int ipoib_debug_level;
+
+#define ipoib_dbg(priv, format, arg...) \
+ do { \
+ if (ipoib_debug_level > 0) \
+ ipoib_printk(KERN_DEBUG, priv, format , ## arg); \
+ } while (0)
+#define ipoib_dbg_mcast(priv, format, arg...) \
+ do { \
+ if (mcast_debug_level > 0) \
+ ipoib_printk(KERN_DEBUG, priv, format , ## arg); \
+ } while (0)
+#else /* CONFIG_INFINIBAND_IPOIB_DEBUG */
+#define ipoib_dbg(priv, format, arg...) \
+ do { (void) (priv); } while (0)
+#define ipoib_dbg_mcast(priv, format, arg...) \
+ do { (void) (priv); } while (0)
+#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
+#define ipoib_dbg_data(priv, format, arg...) \
+ do { \
+ if (data_debug_level > 0) \
+ ipoib_printk(KERN_DEBUG, priv, format , ## arg); \
+ } while (0)
+#else /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */
+#define ipoib_dbg_data(priv, format, arg...) \
+ do { (void) (priv); } while (0)
+#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG_DATA */
+
+#define IPOIB_QPN(ha) (be32_to_cpup((__be32 *) ha) & 0xffffff)
+
+extern const char ipoib_driver_version[];
+
+#endif /* _IPOIB_H */
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
new file mode 100644
index 000000000..cf32a778e
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c
@@ -0,0 +1,1615 @@
+/*
+ * Copyright (c) 2006 Mellanox Technologies. All rights reserved
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <rdma/ib_cm.h>
+#include <net/dst.h>
+#include <net/icmp.h>
+#include <linux/icmpv6.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/moduleparam.h>
+
+#include "ipoib.h"
+
+int ipoib_max_conn_qp = 128;
+
+module_param_named(max_nonsrq_conn_qp, ipoib_max_conn_qp, int, 0444);
+MODULE_PARM_DESC(max_nonsrq_conn_qp,
+ "Max number of connected-mode QPs per interface "
+ "(applied only if shared receive queue is not available)");
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
+static int data_debug_level;
+
+module_param_named(cm_data_debug_level, data_debug_level, int, 0644);
+MODULE_PARM_DESC(cm_data_debug_level,
+ "Enable data path debug tracing for connected mode if > 0");
+#endif
+
+#define IPOIB_CM_IETF_ID 0x1000000000000000ULL
+
+#define IPOIB_CM_RX_UPDATE_TIME (256 * HZ)
+#define IPOIB_CM_RX_TIMEOUT (2 * 256 * HZ)
+#define IPOIB_CM_RX_DELAY (3 * 256 * HZ)
+#define IPOIB_CM_RX_UPDATE_MASK (0x3)
+
+static struct ib_qp_attr ipoib_cm_err_attr = {
+ .qp_state = IB_QPS_ERR
+};
+
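+/* Sentinel wr_id for the drain WR; ipoib_cm_handle_rx_wc() uses it to detect drain completion */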
+#define IPOIB_CM_RX_DRAIN_WRID 0xffffffff
+
+static struct ib_send_wr ipoib_cm_rx_drain_wr = {
+ .wr_id = IPOIB_CM_RX_DRAIN_WRID,
+ .opcode = IB_WR_SEND,
+};
+
+static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
+ struct ib_cm_event *event);
+
+static void ipoib_cm_dma_unmap_rx(struct ipoib_dev_priv *priv, int frags,
+ u64 mapping[IPOIB_CM_RX_SG])
+{
+ int i;
+
+ ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);
+
+ for (i = 0; i < frags; ++i)
+ ib_dma_unmap_page(priv->ca, mapping[i + 1], PAGE_SIZE, DMA_FROM_DEVICE);
+}
+
+static int ipoib_cm_post_receive_srq(struct net_device *dev, int id)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ib_recv_wr *bad_wr;
+ int i, ret;
+
+ priv->cm.rx_wr.wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
+
+ for (i = 0; i < priv->cm.num_frags; ++i)
+ priv->cm.rx_sge[i].addr = priv->cm.srq_ring[id].mapping[i];
+
+ ret = ib_post_srq_recv(priv->cm.srq, &priv->cm.rx_wr, &bad_wr);
+ if (unlikely(ret)) {
+ ipoib_warn(priv, "post srq failed for buf %d (%d)\n", id, ret);
+ ipoib_cm_dma_unmap_rx(priv, priv->cm.num_frags - 1,
+ priv->cm.srq_ring[id].mapping);
+ dev_kfree_skb_any(priv->cm.srq_ring[id].skb);
+ priv->cm.srq_ring[id].skb = NULL;
+ }
+
+ return ret;
+}
+
+static int ipoib_cm_post_receive_nonsrq(struct net_device *dev,
+ struct ipoib_cm_rx *rx,
+ struct ib_recv_wr *wr,
+ struct ib_sge *sge, int id)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ib_recv_wr *bad_wr;
+ int i, ret;
+
+ wr->wr_id = id | IPOIB_OP_CM | IPOIB_OP_RECV;
+
+ for (i = 0; i < IPOIB_CM_RX_SG; ++i)
+ sge[i].addr = rx->rx_ring[id].mapping[i];
+
+ ret = ib_post_recv(rx->qp, wr, &bad_wr);
+ if (unlikely(ret)) {
+ ipoib_warn(priv, "post recv failed for buf %d (%d)\n", id, ret);
+ ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
+ rx->rx_ring[id].mapping);
+ dev_kfree_skb_any(rx->rx_ring[id].skb);
+ rx->rx_ring[id].skb = NULL;
+ }
+
+ return ret;
+}
+
+static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev,
+ struct ipoib_cm_rx_buf *rx_ring,
+ int id, int frags,
+ u64 mapping[IPOIB_CM_RX_SG],
+ gfp_t gfp)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct sk_buff *skb;
+ int i;
+
+ skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12);
+ if (unlikely(!skb))
+ return NULL;
+
+ /*
+ * IPoIB adds a 4 byte header. So we need 12 more bytes to align the
+ * IP header to a multiple of 16.
+ */
+ skb_reserve(skb, 12);
+
+ mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE,
+ DMA_FROM_DEVICE);
+ if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0]))) {
+ dev_kfree_skb_any(skb);
+ return NULL;
+ }
+
+ for (i = 0; i < frags; i++) {
+ struct page *page = alloc_page(gfp);
+
+ if (!page)
+ goto partial_error;
+ skb_fill_page_desc(skb, i, page, 0, PAGE_SIZE);
+
+ mapping[i + 1] = ib_dma_map_page(priv->ca, page,
+ 0, PAGE_SIZE, DMA_FROM_DEVICE);
+ if (unlikely(ib_dma_mapping_error(priv->ca, mapping[i + 1])))
+ goto partial_error;
+ }
+
+ rx_ring[id].skb = skb;
+ return skb;
+
+partial_error:
+
+ ib_dma_unmap_single(priv->ca, mapping[0], IPOIB_CM_HEAD_SIZE, DMA_FROM_DEVICE);
+
+ for (; i > 0; --i)
+ ib_dma_unmap_page(priv->ca, mapping[i], PAGE_SIZE, DMA_FROM_DEVICE);
+
+ dev_kfree_skb_any(skb);
+ return NULL;
+}
+
+static void ipoib_cm_free_rx_ring(struct net_device *dev,
+ struct ipoib_cm_rx_buf *rx_ring)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int i;
+
+ for (i = 0; i < ipoib_recvq_size; ++i)
+ if (rx_ring[i].skb) {
+ ipoib_cm_dma_unmap_rx(priv, IPOIB_CM_RX_SG - 1,
+ rx_ring[i].mapping);
+ dev_kfree_skb_any(rx_ring[i].skb);
+ }
+
+ vfree(rx_ring);
+}
+
+static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv)
+{
+ struct ib_send_wr *bad_wr;
+ struct ipoib_cm_rx *p;
+
+ /* We only reserved 1 extra slot in CQ for drain WRs, so
+ * make sure we have at most 1 outstanding WR. */
+ if (list_empty(&priv->cm.rx_flush_list) ||
+ !list_empty(&priv->cm.rx_drain_list))
+ return;
+
+ /*
+ * QPs on the flush list are in error state. This way, a "flush
+ * error" WC will be immediately generated for each WR we post.
+ */
+ p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list);
+ if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr))
+ ipoib_warn(priv, "failed to post drain wr\n");
+
+ list_splice_init(&priv->cm.rx_flush_list, &priv->cm.rx_drain_list);
+}
+
+static void ipoib_cm_rx_event_handler(struct ib_event *event, void *ctx)
+{
+ struct ipoib_cm_rx *p = ctx;
+ struct ipoib_dev_priv *priv = netdev_priv(p->dev);
+ unsigned long flags;
+
+ if (event->event != IB_EVENT_QP_LAST_WQE_REACHED)
+ return;
+
+ spin_lock_irqsave(&priv->lock, flags);
+ list_move(&p->list, &priv->cm.rx_flush_list);
+ p->state = IPOIB_CM_RX_FLUSH;
+ ipoib_cm_start_rx_drain(priv);
+ spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+static struct ib_qp *ipoib_cm_create_rx_qp(struct net_device *dev,
+ struct ipoib_cm_rx *p)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ib_qp_init_attr attr = {
+ .event_handler = ipoib_cm_rx_event_handler,
+ .send_cq = priv->recv_cq, /* For drain WR */
+ .recv_cq = priv->recv_cq,
+ .srq = priv->cm.srq,
+ .cap.max_send_wr = 1, /* For drain WR */
+ .cap.max_send_sge = 1, /* FIXME: 0 Seems not to work */
+ .sq_sig_type = IB_SIGNAL_ALL_WR,
+ .qp_type = IB_QPT_RC,
+ .qp_context = p,
+ };
+
+ if (!ipoib_cm_has_srq(dev)) {
+ attr.cap.max_recv_wr = ipoib_recvq_size;
+ attr.cap.max_recv_sge = IPOIB_CM_RX_SG;
+ }
+
+ return ib_create_qp(priv->pd, &attr);
+}
+
+static int ipoib_cm_modify_rx_qp(struct net_device *dev,
+ struct ib_cm_id *cm_id, struct ib_qp *qp,
+ unsigned psn)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+
+ qp_attr.qp_state = IB_QPS_INIT;
+ ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to init QP attr for INIT: %d\n", ret);
+ return ret;
+ }
+ ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify QP to INIT: %d\n", ret);
+ return ret;
+ }
+ qp_attr.qp_state = IB_QPS_RTR;
+ ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
+ return ret;
+ }
+ qp_attr.rq_psn = psn;
+ ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
+ return ret;
+ }
+
+ /*
+ * Current Mellanox HCA firmware won't generate completions
+ * with error for drain WRs unless the QP has been moved to
+ * RTS first. This work-around leaves a window where a QP has
+ * moved to error asynchronously, but this will eventually get
+ * fixed in firmware, so let's not error out if modify QP
+ * fails.
+ */
+ qp_attr.qp_state = IB_QPS_RTS;
+ ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
+ return 0;
+ }
+ ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
+ return 0;
+ }
+
+ return 0;
+}
+
+static void ipoib_cm_init_rx_wr(struct net_device *dev,
+ struct ib_recv_wr *wr,
+ struct ib_sge *sge)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int i;
+
+ for (i = 0; i < priv->cm.num_frags; ++i)
+ sge[i].lkey = priv->mr->lkey;
+
+ sge[0].length = IPOIB_CM_HEAD_SIZE;
+ for (i = 1; i < priv->cm.num_frags; ++i)
+ sge[i].length = PAGE_SIZE;
+
+ wr->next = NULL;
+ wr->sg_list = sge;
+ wr->num_sge = priv->cm.num_frags;
+}
+
+static int ipoib_cm_nonsrq_init_rx(struct net_device *dev, struct ib_cm_id *cm_id,
+ struct ipoib_cm_rx *rx)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct {
+ struct ib_recv_wr wr;
+ struct ib_sge sge[IPOIB_CM_RX_SG];
+ } *t;
+ int ret;
+ int i;
+
+ rx->rx_ring = vzalloc(ipoib_recvq_size * sizeof *rx->rx_ring);
+ if (!rx->rx_ring) {
+ printk(KERN_WARNING "%s: failed to allocate CM non-SRQ ring (%d entries)\n",
+ priv->ca->name, ipoib_recvq_size);
+ return -ENOMEM;
+ }
+
+ t = kmalloc(sizeof *t, GFP_KERNEL);
+ if (!t) {
+ ret = -ENOMEM;
+ goto err_free;
+ }
+
+ ipoib_cm_init_rx_wr(dev, &t->wr, t->sge);
+
+ spin_lock_irq(&priv->lock);
+
+ if (priv->cm.nonsrq_conn_qp >= ipoib_max_conn_qp) {
+ spin_unlock_irq(&priv->lock);
+ ib_send_cm_rej(cm_id, IB_CM_REJ_NO_QP, NULL, 0, NULL, 0);
+ ret = -EINVAL;
+ goto err_free;
+ } else
+ ++priv->cm.nonsrq_conn_qp;
+
+ spin_unlock_irq(&priv->lock);
+
+ for (i = 0; i < ipoib_recvq_size; ++i) {
+ if (!ipoib_cm_alloc_rx_skb(dev, rx->rx_ring, i, IPOIB_CM_RX_SG - 1,
+ rx->rx_ring[i].mapping,
+ GFP_KERNEL)) {
+ ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
+ ret = -ENOMEM;
+ goto err_count;
+ }
+ ret = ipoib_cm_post_receive_nonsrq(dev, rx, &t->wr, t->sge, i);
+ if (ret) {
+ ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq "
+ "failed for buf %d\n", i);
+ ret = -EIO;
+ goto err_count;
+ }
+ }
+
+ rx->recv_count = ipoib_recvq_size;
+
+ kfree(t);
+
+ return 0;
+
+err_count:
+ spin_lock_irq(&priv->lock);
+ --priv->cm.nonsrq_conn_qp;
+ spin_unlock_irq(&priv->lock);
+
+err_free:
+ kfree(t);
+ ipoib_cm_free_rx_ring(dev, rx->rx_ring);
+
+ return ret;
+}
+
+static int ipoib_cm_send_rep(struct net_device *dev, struct ib_cm_id *cm_id,
+ struct ib_qp *qp, struct ib_cm_req_event_param *req,
+ unsigned psn)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_cm_data data = {};
+ struct ib_cm_rep_param rep = {};
+
+ data.qpn = cpu_to_be32(priv->qp->qp_num);
+ data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);
+
+ rep.private_data = &data;
+ rep.private_data_len = sizeof data;
+ rep.flow_control = 0;
+ rep.rnr_retry_count = req->rnr_retry_count;
+ rep.srq = ipoib_cm_has_srq(dev);
+ rep.qp_num = qp->qp_num;
+ rep.starting_psn = psn;
+ return ib_send_cm_rep(cm_id, &rep);
+}
+
+static int ipoib_cm_req_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
+{
+ struct net_device *dev = cm_id->context;
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_cm_rx *p;
+ unsigned psn;
+ int ret;
+
+ ipoib_dbg(priv, "REQ arrived\n");
+ p = kzalloc(sizeof *p, GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+ p->dev = dev;
+ p->id = cm_id;
+ cm_id->context = p;
+ p->state = IPOIB_CM_RX_LIVE;
+ p->jiffies = jiffies;
+ INIT_LIST_HEAD(&p->list);
+
+ p->qp = ipoib_cm_create_rx_qp(dev, p);
+ if (IS_ERR(p->qp)) {
+ ret = PTR_ERR(p->qp);
+ goto err_qp;
+ }
+
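+ /* The starting PSN is a 24-bit field, so mask a random value down to that range */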
+ psn = prandom_u32() & 0xffffff;
+ ret = ipoib_cm_modify_rx_qp(dev, cm_id, p->qp, psn);
+ if (ret)
+ goto err_modify;
+
+ if (!ipoib_cm_has_srq(dev)) {
+ ret = ipoib_cm_nonsrq_init_rx(dev, cm_id, p);
+ if (ret)
+ goto err_modify;
+ }
+
+ spin_lock_irq(&priv->lock);
+ queue_delayed_work(priv->wq,
+ &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
+ /* Add this entry to passive ids list head, but do not re-add it
+ * if IB_EVENT_QP_LAST_WQE_REACHED has moved it to flush list. */
+ p->jiffies = jiffies;
+ if (p->state == IPOIB_CM_RX_LIVE)
+ list_move(&p->list, &priv->cm.passive_ids);
+ spin_unlock_irq(&priv->lock);
+
+ ret = ipoib_cm_send_rep(dev, cm_id, p->qp, &event->param.req_rcvd, psn);
+ if (ret) {
+ ipoib_warn(priv, "failed to send REP: %d\n", ret);
+ if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
+ ipoib_warn(priv, "unable to move qp to error state\n");
+ }
+ return 0;
+
+err_modify:
+ ib_destroy_qp(p->qp);
+err_qp:
+ kfree(p);
+ return ret;
+}
+
+static int ipoib_cm_rx_handler(struct ib_cm_id *cm_id,
+ struct ib_cm_event *event)
+{
+ struct ipoib_cm_rx *p;
+ struct ipoib_dev_priv *priv;
+
+ switch (event->event) {
+ case IB_CM_REQ_RECEIVED:
+ return ipoib_cm_req_handler(cm_id, event);
+ case IB_CM_DREQ_RECEIVED:
+ p = cm_id->context;
+ ib_send_cm_drep(cm_id, NULL, 0);
+ /* Fall through */
+ case IB_CM_REJ_RECEIVED:
+ p = cm_id->context;
+ priv = netdev_priv(p->dev);
+ if (ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE))
+ ipoib_warn(priv, "unable to move qp to error state\n");
+ /* Fall through */
+ default:
+ return 0;
+ }
+}
+/* Adjust length of skb with fragments to match received data */
+static void skb_put_frags(struct sk_buff *skb, unsigned int hdr_space,
+ unsigned int length, struct sk_buff *toskb)
+{
+ int i, num_frags;
+ unsigned int size;
+
+ /* put header into skb */
+ size = min(length, hdr_space);
+ skb->tail += size;
+ skb->len += size;
+ length -= size;
+
+ num_frags = skb_shinfo(skb)->nr_frags;
+ for (i = 0; i < num_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ if (length == 0) {
+ /* don't need this page */
+ skb_fill_page_desc(toskb, i, skb_frag_page(frag),
+ 0, PAGE_SIZE);
+ --skb_shinfo(skb)->nr_frags;
+ } else {
+ size = min(length, (unsigned) PAGE_SIZE);
+
+ skb_frag_size_set(frag, size);
+ skb->data_len += size;
+ skb->truesize += size;
+ skb->len += size;
+ length -= size;
+ }
+ }
+}
+
+void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_cm_rx_buf *rx_ring;
+ unsigned int wr_id = wc->wr_id & ~(IPOIB_OP_CM | IPOIB_OP_RECV);
+ struct sk_buff *skb, *newskb;
+ struct ipoib_cm_rx *p;
+ unsigned long flags;
+ u64 mapping[IPOIB_CM_RX_SG];
+ int frags;
+ int has_srq;
+ struct sk_buff *small_skb;
+
+ ipoib_dbg_data(priv, "cm recv completion: id %d, status: %d\n",
+ wr_id, wc->status);
+
+ if (unlikely(wr_id >= ipoib_recvq_size)) {
+ if (wr_id == (IPOIB_CM_RX_DRAIN_WRID & ~(IPOIB_OP_CM | IPOIB_OP_RECV))) {
+ spin_lock_irqsave(&priv->lock, flags);
+ list_splice_init(&priv->cm.rx_drain_list, &priv->cm.rx_reap_list);
+ ipoib_cm_start_rx_drain(priv);
+ queue_work(priv->wq, &priv->cm.rx_reap_task);
+ spin_unlock_irqrestore(&priv->lock, flags);
+ } else
+ ipoib_warn(priv, "cm recv completion event with wrid %d (> %d)\n",
+ wr_id, ipoib_recvq_size);
+ return;
+ }
+
+ p = wc->qp->qp_context;
+
+ has_srq = ipoib_cm_has_srq(dev);
+ rx_ring = has_srq ? priv->cm.srq_ring : p->rx_ring;
+
+ skb = rx_ring[wr_id].skb;
+
+ if (unlikely(wc->status != IB_WC_SUCCESS)) {
+ ipoib_dbg(priv, "cm recv error "
+ "(status=%d, wrid=%d vend_err %x)\n",
+ wc->status, wr_id, wc->vendor_err);
+ ++dev->stats.rx_dropped;
+ if (has_srq)
+ goto repost;
+ else {
+ if (!--p->recv_count) {
+ spin_lock_irqsave(&priv->lock, flags);
+ list_move(&p->list, &priv->cm.rx_reap_list);
+ spin_unlock_irqrestore(&priv->lock, flags);
+ queue_work(priv->wq, &priv->cm.rx_reap_task);
+ }
+ return;
+ }
+ }
+
+ if (unlikely(!(wr_id & IPOIB_CM_RX_UPDATE_MASK))) {
+ if (p && time_after_eq(jiffies, p->jiffies + IPOIB_CM_RX_UPDATE_TIME)) {
+ spin_lock_irqsave(&priv->lock, flags);
+ p->jiffies = jiffies;
+ /* Move this entry to list head, but do not re-add it
+ * if it has been moved out of list. */
+ if (p->state == IPOIB_CM_RX_LIVE)
+ list_move(&p->list, &priv->cm.passive_ids);
+ spin_unlock_irqrestore(&priv->lock, flags);
+ }
+ }
+
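+ /* Small packets are copied into a fresh skb so the original receive buffer can simply be reposted */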
+ if (wc->byte_len < IPOIB_CM_COPYBREAK) {
+ int dlen = wc->byte_len;
+
+ small_skb = dev_alloc_skb(dlen + 12);
+ if (small_skb) {
+ skb_reserve(small_skb, 12);
+ ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0],
+ dlen, DMA_FROM_DEVICE);
+ skb_copy_from_linear_data(skb, small_skb->data, dlen);
+ ib_dma_sync_single_for_device(priv->ca, rx_ring[wr_id].mapping[0],
+ dlen, DMA_FROM_DEVICE);
+ skb_put(small_skb, dlen);
+ skb = small_skb;
+ goto copied;
+ }
+ }
+
+ frags = PAGE_ALIGN(wc->byte_len - min(wc->byte_len,
+ (unsigned)IPOIB_CM_HEAD_SIZE)) / PAGE_SIZE;
+
+ newskb = ipoib_cm_alloc_rx_skb(dev, rx_ring, wr_id, frags,
+ mapping, GFP_ATOMIC);
+ if (unlikely(!newskb)) {
+ /*
+ * If we can't allocate a new RX buffer, dump
+ * this packet and reuse the old buffer.
+ */
+ ipoib_dbg(priv, "failed to allocate receive buffer %d\n", wr_id);
+ ++dev->stats.rx_dropped;
+ goto repost;
+ }
+
+ ipoib_cm_dma_unmap_rx(priv, frags, rx_ring[wr_id].mapping);
+ memcpy(rx_ring[wr_id].mapping, mapping, (frags + 1) * sizeof *mapping);
+
+ ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
+ wc->byte_len, wc->slid);
+
+ skb_put_frags(skb, IPOIB_CM_HEAD_SIZE, wc->byte_len, newskb);
+
+copied:
+ skb->protocol = ((struct ipoib_header *) skb->data)->proto;
+ skb_reset_mac_header(skb);
+ skb_pull(skb, IPOIB_ENCAP_LEN);
+
+ ++dev->stats.rx_packets;
+ dev->stats.rx_bytes += skb->len;
+
+ skb->dev = dev;
+ /* XXX get correct PACKET_ type here */
+ skb->pkt_type = PACKET_HOST;
+ netif_receive_skb(skb);
+
+repost:
+ if (has_srq) {
+ if (unlikely(ipoib_cm_post_receive_srq(dev, wr_id)))
+ ipoib_warn(priv, "ipoib_cm_post_receive_srq failed "
+ "for buf %d\n", wr_id);
+ } else {
+ if (unlikely(ipoib_cm_post_receive_nonsrq(dev, p,
+ &priv->cm.rx_wr,
+ priv->cm.rx_sge,
+ wr_id))) {
+ --p->recv_count;
+ ipoib_warn(priv, "ipoib_cm_post_receive_nonsrq failed "
+ "for buf %d\n", wr_id);
+ }
+ }
+}
+
+static inline int post_send(struct ipoib_dev_priv *priv,
+ struct ipoib_cm_tx *tx,
+ unsigned int wr_id,
+ u64 addr, int len)
+{
+ struct ib_send_wr *bad_wr;
+
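+ /* The connected-mode TX path assumes a linear skb and posts it as a single SGE */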
+ priv->tx_sge[0].addr = addr;
+ priv->tx_sge[0].length = len;
+
+ priv->tx_wr.num_sge = 1;
+ priv->tx_wr.wr_id = wr_id | IPOIB_OP_CM;
+
+ return ib_post_send(tx->qp, &priv->tx_wr, &bad_wr);
+}
+
+void ipoib_cm_send(struct net_device *dev, struct sk_buff *skb, struct ipoib_cm_tx *tx)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_cm_tx_buf *tx_req;
+ u64 addr;
+ int rc;
+
+ if (unlikely(skb->len > tx->mtu)) {
+ ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
+ skb->len, tx->mtu);
+ ++dev->stats.tx_dropped;
+ ++dev->stats.tx_errors;
+ ipoib_cm_skb_too_long(dev, skb, tx->mtu - IPOIB_ENCAP_LEN);
+ return;
+ }
+
+ ipoib_dbg_data(priv, "sending packet: head 0x%x length %d connection 0x%x\n",
+ tx->tx_head, skb->len, tx->qp->qp_num);
+
+ /*
+ * We put the skb into the tx_ring _before_ we call post_send()
+ * because it's entirely possible that the completion handler will
+ * run before we execute anything after the post_send(). That
+ * means we have to make sure everything is properly recorded and
+ * our state is consistent before we call post_send().
+ */
+ tx_req = &tx->tx_ring[tx->tx_head & (ipoib_sendq_size - 1)];
+ tx_req->skb = skb;
+ addr = ib_dma_map_single(priv->ca, skb->data, skb->len, DMA_TO_DEVICE);
+ if (unlikely(ib_dma_mapping_error(priv->ca, addr))) {
+ ++dev->stats.tx_errors;
+ dev_kfree_skb_any(skb);
+ return;
+ }
+
+ tx_req->mapping = addr;
+
+ skb_orphan(skb);
+ skb_dst_drop(skb);
+
+ rc = post_send(priv, tx, tx->tx_head & (ipoib_sendq_size - 1),
+ addr, skb->len);
+ if (unlikely(rc)) {
+ ipoib_warn(priv, "post_send failed, error %d\n", rc);
+ ++dev->stats.tx_errors;
+ ib_dma_unmap_single(priv->ca, addr, skb->len, DMA_TO_DEVICE);
+ dev_kfree_skb_any(skb);
+ } else {
+ dev->trans_start = jiffies;
+ ++tx->tx_head;
+
+ if (++priv->tx_outstanding == ipoib_sendq_size) {
+ ipoib_dbg(priv, "TX ring 0x%x full, stopping kernel net queue\n",
+ tx->qp->qp_num);
+ netif_stop_queue(dev);
+ rc = ib_req_notify_cq(priv->send_cq,
+ IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
+ if (rc < 0)
+ ipoib_warn(priv, "request notify on send CQ failed\n");
+ else if (rc)
+ ipoib_send_comp_handler(priv->send_cq, dev);
+ }
+ }
+}
+
+void ipoib_cm_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_cm_tx *tx = wc->qp->qp_context;
+ unsigned int wr_id = wc->wr_id & ~IPOIB_OP_CM;
+ struct ipoib_cm_tx_buf *tx_req;
+ unsigned long flags;
+
+ ipoib_dbg_data(priv, "cm send completion: id %d, status: %d\n",
+ wr_id, wc->status);
+
+ if (unlikely(wr_id >= ipoib_sendq_size)) {
+ ipoib_warn(priv, "cm send completion event with wrid %d (> %d)\n",
+ wr_id, ipoib_sendq_size);
+ return;
+ }
+
+ tx_req = &tx->tx_ring[wr_id];
+
+ ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len, DMA_TO_DEVICE);
+
+ /* FIXME: is this right? Shouldn't we only increment on success? */
+ ++dev->stats.tx_packets;
+ dev->stats.tx_bytes += tx_req->skb->len;
+
+ dev_kfree_skb_any(tx_req->skb);
+
+ netif_tx_lock(dev);
+
+ ++tx->tx_tail;
+ if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
+ netif_queue_stopped(dev) &&
+ test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+ netif_wake_queue(dev);
+
+ if (wc->status != IB_WC_SUCCESS &&
+ wc->status != IB_WC_WR_FLUSH_ERR) {
+ struct ipoib_neigh *neigh;
+
+ ipoib_dbg(priv, "failed cm send event "
+ "(status=%d, wrid=%d vend_err %x)\n",
+ wc->status, wr_id, wc->vendor_err);
+
+ spin_lock_irqsave(&priv->lock, flags);
+ neigh = tx->neigh;
+
+ if (neigh) {
+ neigh->cm = NULL;
+ ipoib_neigh_free(neigh);
+
+ tx->neigh = NULL;
+ }
+
+ if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
+ list_move(&tx->list, &priv->cm.reap_list);
+ queue_work(priv->wq, &priv->cm.reap_task);
+ }
+
+ clear_bit(IPOIB_FLAG_OPER_UP, &tx->flags);
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+ }
+
+ netif_tx_unlock(dev);
+}
+
+int ipoib_cm_dev_open(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int ret;
+
+ if (!IPOIB_CM_SUPPORTED(dev->dev_addr))
+ return 0;
+
+ priv->cm.id = ib_create_cm_id(priv->ca, ipoib_cm_rx_handler, dev);
+ if (IS_ERR(priv->cm.id)) {
+ printk(KERN_WARNING "%s: failed to create CM ID\n", priv->ca->name);
+ ret = PTR_ERR(priv->cm.id);
+ goto err_cm;
+ }
+
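+ /* Listen on the connected-mode service ID derived from this interface's QP number */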
+ ret = ib_cm_listen(priv->cm.id, cpu_to_be64(IPOIB_CM_IETF_ID | priv->qp->qp_num),
+ 0, NULL);
+ if (ret) {
+ printk(KERN_WARNING "%s: failed to listen on ID 0x%llx\n", priv->ca->name,
+ IPOIB_CM_IETF_ID | priv->qp->qp_num);
+ goto err_listen;
+ }
+
+ return 0;
+
+err_listen:
+ ib_destroy_cm_id(priv->cm.id);
+err_cm:
+ priv->cm.id = NULL;
+ return ret;
+}
+
+static void ipoib_cm_free_rx_reap_list(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_cm_rx *rx, *n;
+ LIST_HEAD(list);
+
+ spin_lock_irq(&priv->lock);
+ list_splice_init(&priv->cm.rx_reap_list, &list);
+ spin_unlock_irq(&priv->lock);
+
+ list_for_each_entry_safe(rx, n, &list, list) {
+ ib_destroy_cm_id(rx->id);
+ ib_destroy_qp(rx->qp);
+ if (!ipoib_cm_has_srq(dev)) {
+ ipoib_cm_free_rx_ring(priv->dev, rx->rx_ring);
+ spin_lock_irq(&priv->lock);
+ --priv->cm.nonsrq_conn_qp;
+ spin_unlock_irq(&priv->lock);
+ }
+ kfree(rx);
+ }
+}
+
+void ipoib_cm_dev_stop(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_cm_rx *p;
+ unsigned long begin;
+ int ret;
+
+ if (!IPOIB_CM_SUPPORTED(dev->dev_addr) || !priv->cm.id)
+ return;
+
+ ib_destroy_cm_id(priv->cm.id);
+ priv->cm.id = NULL;
+
+ spin_lock_irq(&priv->lock);
+ while (!list_empty(&priv->cm.passive_ids)) {
+ p = list_entry(priv->cm.passive_ids.next, typeof(*p), list);
+ list_move(&p->list, &priv->cm.rx_error_list);
+ p->state = IPOIB_CM_RX_ERROR;
+ spin_unlock_irq(&priv->lock);
+ ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
+ if (ret)
+ ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
+ spin_lock_irq(&priv->lock);
+ }
+
+ /* Wait for all RX to be drained */
+ begin = jiffies;
+
+ while (!list_empty(&priv->cm.rx_error_list) ||
+ !list_empty(&priv->cm.rx_flush_list) ||
+ !list_empty(&priv->cm.rx_drain_list)) {
+ if (time_after(jiffies, begin + 5 * HZ)) {
+ ipoib_warn(priv, "RX drain timing out\n");
+
+ /*
+ * assume the HW is wedged and just free up everything.
+ */
+ list_splice_init(&priv->cm.rx_flush_list,
+ &priv->cm.rx_reap_list);
+ list_splice_init(&priv->cm.rx_error_list,
+ &priv->cm.rx_reap_list);
+ list_splice_init(&priv->cm.rx_drain_list,
+ &priv->cm.rx_reap_list);
+ break;
+ }
+ spin_unlock_irq(&priv->lock);
+ msleep(1);
+ ipoib_drain_cq(dev);
+ spin_lock_irq(&priv->lock);
+ }
+
+ spin_unlock_irq(&priv->lock);
+
+ ipoib_cm_free_rx_reap_list(dev);
+
+ cancel_delayed_work(&priv->cm.stale_task);
+}
+
+static int ipoib_cm_rep_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
+{
+ struct ipoib_cm_tx *p = cm_id->context;
+ struct ipoib_dev_priv *priv = netdev_priv(p->dev);
+ struct ipoib_cm_data *data = event->private_data;
+ struct sk_buff_head skqueue;
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+ struct sk_buff *skb;
+
+ p->mtu = be32_to_cpu(data->mtu);
+
+ if (p->mtu <= IPOIB_ENCAP_LEN) {
+ ipoib_warn(priv, "Rejecting connection: mtu %d <= %d\n",
+ p->mtu, IPOIB_ENCAP_LEN);
+ return -EINVAL;
+ }
+
+ qp_attr.qp_state = IB_QPS_RTR;
+ ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to init QP attr for RTR: %d\n", ret);
+ return ret;
+ }
+
+ qp_attr.rq_psn = 0 /* FIXME */;
+ ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify QP to RTR: %d\n", ret);
+ return ret;
+ }
+
+ qp_attr.qp_state = IB_QPS_RTS;
+ ret = ib_cm_init_qp_attr(cm_id, &qp_attr, &qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to init QP attr for RTS: %d\n", ret);
+ return ret;
+ }
+ ret = ib_modify_qp(p->qp, &qp_attr, qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify QP to RTS: %d\n", ret);
+ return ret;
+ }
+
+ skb_queue_head_init(&skqueue);
+
+ spin_lock_irq(&priv->lock);
+ set_bit(IPOIB_FLAG_OPER_UP, &p->flags);
+ if (p->neigh)
+ while ((skb = __skb_dequeue(&p->neigh->queue)))
+ __skb_queue_tail(&skqueue, skb);
+ spin_unlock_irq(&priv->lock);
+
+ while ((skb = __skb_dequeue(&skqueue))) {
+ skb->dev = p->dev;
+ if (dev_queue_xmit(skb))
+ ipoib_warn(priv, "dev_queue_xmit failed "
+ "to requeue packet\n");
+ }
+
+ ret = ib_send_cm_rtu(cm_id, NULL, 0);
+ if (ret) {
+ ipoib_warn(priv, "failed to send RTU: %d\n", ret);
+ return ret;
+ }
+ return 0;
+}
+
+static struct ib_qp *ipoib_cm_create_tx_qp(struct net_device *dev, struct ipoib_cm_tx *tx)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ib_qp_init_attr attr = {
+ .send_cq = priv->recv_cq,
+ .recv_cq = priv->recv_cq,
+ .srq = priv->cm.srq,
+ .cap.max_send_wr = ipoib_sendq_size,
+ .cap.max_send_sge = 1,
+ .sq_sig_type = IB_SIGNAL_ALL_WR,
+ .qp_type = IB_QPT_RC,
+ .qp_context = tx,
+ .create_flags = IB_QP_CREATE_USE_GFP_NOIO
+ };
+
+ struct ib_qp *tx_qp;
+
+ tx_qp = ib_create_qp(priv->pd, &attr);
+ if (PTR_ERR(tx_qp) == -EINVAL) {
+ ipoib_warn(priv, "can't use GFP_NOIO for QPs on device %s, using GFP_KERNEL\n",
+ priv->ca->name);
+ attr.create_flags &= ~IB_QP_CREATE_USE_GFP_NOIO;
+ tx_qp = ib_create_qp(priv->pd, &attr);
+ }
+ return tx_qp;
+}
+
+static int ipoib_cm_send_req(struct net_device *dev,
+ struct ib_cm_id *id, struct ib_qp *qp,
+ u32 qpn,
+ struct ib_sa_path_rec *pathrec)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_cm_data data = {};
+ struct ib_cm_req_param req = {};
+
+ data.qpn = cpu_to_be32(priv->qp->qp_num);
+ data.mtu = cpu_to_be32(IPOIB_CM_BUF_SIZE);
+
+ req.primary_path = pathrec;
+ req.alternate_path = NULL;
+ req.service_id = cpu_to_be64(IPOIB_CM_IETF_ID | qpn);
+ req.qp_num = qp->qp_num;
+ req.qp_type = qp->qp_type;
+ req.private_data = &data;
+ req.private_data_len = sizeof data;
+ req.flow_control = 0;
+
+ req.starting_psn = 0; /* FIXME */
+
+ /*
+ * Pick some arbitrary defaults here; we could make these
+ * module parameters if anyone cared about setting them.
+ */
+ req.responder_resources = 4;
+ req.remote_cm_response_timeout = 20;
+ req.local_cm_response_timeout = 20;
+ req.retry_count = 0; /* RFC draft warns against retries */
+ req.rnr_retry_count = 0; /* RFC draft warns against retries */
+ req.max_cm_retries = 15;
+ req.srq = ipoib_cm_has_srq(dev);
+ return ib_send_cm_req(id, &req);
+}
+
+static int ipoib_cm_modify_tx_init(struct net_device *dev,
+ struct ib_cm_id *cm_id, struct ib_qp *qp)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ib_qp_attr qp_attr;
+ int qp_attr_mask, ret;
+ ret = ib_find_pkey(priv->ca, priv->port, priv->pkey, &qp_attr.pkey_index);
+ if (ret) {
+ ipoib_warn(priv, "pkey 0x%x not found: %d\n", priv->pkey, ret);
+ return ret;
+ }
+
+ qp_attr.qp_state = IB_QPS_INIT;
+ qp_attr.qp_access_flags = IB_ACCESS_LOCAL_WRITE;
+ qp_attr.port_num = priv->port;
+ qp_attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX | IB_QP_PORT;
+
+ ret = ib_modify_qp(qp, &qp_attr, qp_attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify tx QP to INIT: %d\n", ret);
+ return ret;
+ }
+ return 0;
+}
+
+static int ipoib_cm_tx_init(struct ipoib_cm_tx *p, u32 qpn,
+ struct ib_sa_path_rec *pathrec)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(p->dev);
+ int ret;
+
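+ /* Use GFP_NOIO so this allocation cannot recurse into I/O during memory reclaim */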
+ p->tx_ring = __vmalloc(ipoib_sendq_size * sizeof *p->tx_ring,
+ GFP_NOIO, PAGE_KERNEL);
+ if (!p->tx_ring) {
+ ipoib_warn(priv, "failed to allocate tx ring\n");
+ ret = -ENOMEM;
+ goto err_tx;
+ }
+ memset(p->tx_ring, 0, ipoib_sendq_size * sizeof *p->tx_ring);
+
+ p->qp = ipoib_cm_create_tx_qp(p->dev, p);
+ if (IS_ERR(p->qp)) {
+ ret = PTR_ERR(p->qp);
+ ipoib_warn(priv, "failed to allocate tx qp: %d\n", ret);
+ goto err_qp;
+ }
+
+ p->id = ib_create_cm_id(priv->ca, ipoib_cm_tx_handler, p);
+ if (IS_ERR(p->id)) {
+ ret = PTR_ERR(p->id);
+ ipoib_warn(priv, "failed to create tx cm id: %d\n", ret);
+ goto err_id;
+ }
+
+ ret = ipoib_cm_modify_tx_init(p->dev, p->id, p->qp);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify tx qp to rtr: %d\n", ret);
+ goto err_modify;
+ }
+
+ ret = ipoib_cm_send_req(p->dev, p->id, p->qp, qpn, pathrec);
+ if (ret) {
+ ipoib_warn(priv, "failed to send cm req: %d\n", ret);
+ goto err_send_cm;
+ }
+
+ ipoib_dbg(priv, "Request connection 0x%x for gid %pI6 qpn 0x%x\n",
+ p->qp->qp_num, pathrec->dgid.raw, qpn);
+
+ return 0;
+
+err_send_cm:
+err_modify:
+ ib_destroy_cm_id(p->id);
+err_id:
+ p->id = NULL;
+ ib_destroy_qp(p->qp);
+err_qp:
+ p->qp = NULL;
+ vfree(p->tx_ring);
+err_tx:
+ return ret;
+}
+
+static void ipoib_cm_tx_destroy(struct ipoib_cm_tx *p)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(p->dev);
+ struct ipoib_cm_tx_buf *tx_req;
+ unsigned long begin;
+
+ ipoib_dbg(priv, "Destroy active connection 0x%x head 0x%x tail 0x%x\n",
+ p->qp ? p->qp->qp_num : 0, p->tx_head, p->tx_tail);
+
+ if (p->id)
+ ib_destroy_cm_id(p->id);
+
+ if (p->tx_ring) {
+ /* Wait for all sends to complete */
+ begin = jiffies;
+ while ((int) p->tx_tail - (int) p->tx_head < 0) {
+ if (time_after(jiffies, begin + 5 * HZ)) {
+ ipoib_warn(priv, "timing out; %d sends not completed\n",
+ p->tx_head - p->tx_tail);
+ goto timeout;
+ }
+
+ msleep(1);
+ }
+ }
+
+timeout:
+
+ while ((int) p->tx_tail - (int) p->tx_head < 0) {
+ tx_req = &p->tx_ring[p->tx_tail & (ipoib_sendq_size - 1)];
+ ib_dma_unmap_single(priv->ca, tx_req->mapping, tx_req->skb->len,
+ DMA_TO_DEVICE);
+ dev_kfree_skb_any(tx_req->skb);
+ ++p->tx_tail;
+ netif_tx_lock_bh(p->dev);
+ if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
+ netif_queue_stopped(p->dev) &&
+ test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+ netif_wake_queue(p->dev);
+ netif_tx_unlock_bh(p->dev);
+ }
+
+ if (p->qp)
+ ib_destroy_qp(p->qp);
+
+ vfree(p->tx_ring);
+ kfree(p);
+}
+
+static int ipoib_cm_tx_handler(struct ib_cm_id *cm_id,
+ struct ib_cm_event *event)
+{
+ struct ipoib_cm_tx *tx = cm_id->context;
+ struct ipoib_dev_priv *priv = netdev_priv(tx->dev);
+ struct net_device *dev = priv->dev;
+ struct ipoib_neigh *neigh;
+ unsigned long flags;
+ int ret;
+
+ switch (event->event) {
+ case IB_CM_DREQ_RECEIVED:
+ ipoib_dbg(priv, "DREQ received.\n");
+ ib_send_cm_drep(cm_id, NULL, 0);
+ break;
+ case IB_CM_REP_RECEIVED:
+ ipoib_dbg(priv, "REP received.\n");
+ ret = ipoib_cm_rep_handler(cm_id, event);
+ if (ret)
+ ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED,
+ NULL, 0, NULL, 0);
+ break;
+ case IB_CM_REQ_ERROR:
+ case IB_CM_REJ_RECEIVED:
+ case IB_CM_TIMEWAIT_EXIT:
+ ipoib_dbg(priv, "CM error %d.\n", event->event);
+ netif_tx_lock_bh(dev);
+ spin_lock_irqsave(&priv->lock, flags);
+ neigh = tx->neigh;
+
+ if (neigh) {
+ neigh->cm = NULL;
+ ipoib_neigh_free(neigh);
+
+ tx->neigh = NULL;
+ }
+
+ if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
+ list_move(&tx->list, &priv->cm.reap_list);
+ queue_work(priv->wq, &priv->cm.reap_task);
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+ netif_tx_unlock_bh(dev);
+ break;
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+struct ipoib_cm_tx *ipoib_cm_create_tx(struct net_device *dev, struct ipoib_path *path,
+ struct ipoib_neigh *neigh)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_cm_tx *tx;
+
+ tx = kzalloc(sizeof *tx, GFP_ATOMIC);
+ if (!tx)
+ return NULL;
+
+ neigh->cm = tx;
+ tx->neigh = neigh;
+ tx->path = path;
+ tx->dev = dev;
+ list_add(&tx->list, &priv->cm.start_list);
+ set_bit(IPOIB_FLAG_INITIALIZED, &tx->flags);
+ queue_work(priv->wq, &priv->cm.start_task);
+ return tx;
+}
+
+void ipoib_cm_destroy_tx(struct ipoib_cm_tx *tx)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(tx->dev);
+ unsigned long flags;
+ if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &tx->flags)) {
+ spin_lock_irqsave(&priv->lock, flags);
+ list_move(&tx->list, &priv->cm.reap_list);
+ queue_work(priv->wq, &priv->cm.reap_task);
+ ipoib_dbg(priv, "Reap connection for gid %pI6\n",
+ tx->neigh->daddr + 4);
+ tx->neigh = NULL;
+ spin_unlock_irqrestore(&priv->lock, flags);
+ }
+}
+
+static void ipoib_cm_tx_start(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+ cm.start_task);
+ struct net_device *dev = priv->dev;
+ struct ipoib_neigh *neigh;
+ struct ipoib_cm_tx *p;
+ unsigned long flags;
+ int ret;
+
+ struct ib_sa_path_rec pathrec;
+ u32 qpn;
+
+ netif_tx_lock_bh(dev);
+ spin_lock_irqsave(&priv->lock, flags);
+
+ while (!list_empty(&priv->cm.start_list)) {
+ p = list_entry(priv->cm.start_list.next, typeof(*p), list);
+ list_del_init(&p->list);
+ neigh = p->neigh;
+ qpn = IPOIB_QPN(neigh->daddr);
+ memcpy(&pathrec, &p->path->pathrec, sizeof pathrec);
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+ netif_tx_unlock_bh(dev);
+
+ ret = ipoib_cm_tx_init(p, qpn, &pathrec);
+
+ netif_tx_lock_bh(dev);
+ spin_lock_irqsave(&priv->lock, flags);
+
+ if (ret) {
+ neigh = p->neigh;
+ if (neigh) {
+ neigh->cm = NULL;
+ ipoib_neigh_free(neigh);
+ }
+ list_del(&p->list);
+ kfree(p);
+ }
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+ netif_tx_unlock_bh(dev);
+}
+
+static void ipoib_cm_tx_reap(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+ cm.reap_task);
+ struct net_device *dev = priv->dev;
+ struct ipoib_cm_tx *p;
+ unsigned long flags;
+
+ netif_tx_lock_bh(dev);
+ spin_lock_irqsave(&priv->lock, flags);
+
+ while (!list_empty(&priv->cm.reap_list)) {
+ p = list_entry(priv->cm.reap_list.next, typeof(*p), list);
+ list_del(&p->list);
+ spin_unlock_irqrestore(&priv->lock, flags);
+ netif_tx_unlock_bh(dev);
+ ipoib_cm_tx_destroy(p);
+ netif_tx_lock_bh(dev);
+ spin_lock_irqsave(&priv->lock, flags);
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+ netif_tx_unlock_bh(dev);
+}
+
+static void ipoib_cm_skb_reap(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+ cm.skb_task);
+ struct net_device *dev = priv->dev;
+ struct sk_buff *skb;
+ unsigned long flags;
+ unsigned mtu = priv->mcast_mtu;
+
+ netif_tx_lock_bh(dev);
+ spin_lock_irqsave(&priv->lock, flags);
+
+ while ((skb = skb_dequeue(&priv->cm.skb_queue))) {
+ spin_unlock_irqrestore(&priv->lock, flags);
+ netif_tx_unlock_bh(dev);
+
+ if (skb->protocol == htons(ETH_P_IP))
+ icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (skb->protocol == htons(ETH_P_IPV6))
+ icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+#endif
+ dev_kfree_skb_any(skb);
+
+ netif_tx_lock_bh(dev);
+ spin_lock_irqsave(&priv->lock, flags);
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+ netif_tx_unlock_bh(dev);
+}
+
+void ipoib_cm_skb_too_long(struct net_device *dev, struct sk_buff *skb,
+ unsigned int mtu)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int e = skb_queue_empty(&priv->cm.skb_queue);
+
+ if (skb_dst(skb))
+ skb_dst(skb)->ops->update_pmtu(skb_dst(skb), NULL, skb, mtu);
+
+ skb_queue_tail(&priv->cm.skb_queue, skb);
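+ /* Schedule the ICMP-reply work only on the empty -> non-empty transition */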
+ if (e)
+ queue_work(priv->wq, &priv->cm.skb_task);
+}
+
+static void ipoib_cm_rx_reap(struct work_struct *work)
+{
+ ipoib_cm_free_rx_reap_list(container_of(work, struct ipoib_dev_priv,
+ cm.rx_reap_task)->dev);
+}
+
+static void ipoib_cm_stale_task(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+ cm.stale_task.work);
+ struct ipoib_cm_rx *p;
+ int ret;
+
+ spin_lock_irq(&priv->lock);
+ while (!list_empty(&priv->cm.passive_ids)) {
+ /* List is sorted by LRU, start from tail,
+ * stop when we see a recently used entry */
+ p = list_entry(priv->cm.passive_ids.prev, typeof(*p), list);
+ if (time_before_eq(jiffies, p->jiffies + IPOIB_CM_RX_TIMEOUT))
+ break;
+ list_move(&p->list, &priv->cm.rx_error_list);
+ p->state = IPOIB_CM_RX_ERROR;
+ spin_unlock_irq(&priv->lock);
+ ret = ib_modify_qp(p->qp, &ipoib_cm_err_attr, IB_QP_STATE);
+ if (ret)
+ ipoib_warn(priv, "unable to move qp to error state: %d\n", ret);
+ spin_lock_irq(&priv->lock);
+ }
+
+ if (!list_empty(&priv->cm.passive_ids))
+ queue_delayed_work(priv->wq,
+ &priv->cm.stale_task, IPOIB_CM_RX_DELAY);
+ spin_unlock_irq(&priv->lock);
+}
+
+static ssize_t show_mode(struct device *d, struct device_attribute *attr,
+ char *buf)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(d));
+
+ if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
+ return sprintf(buf, "connected\n");
+ else
+ return sprintf(buf, "datagram\n");
+}
+
+static ssize_t set_mode(struct device *d, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct net_device *dev = to_net_dev(d);
+ int ret;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ ret = ipoib_set_mode(dev, buf);
+
+ rtnl_unlock();
+
+ if (!ret)
+ return count;
+
+ return ret;
+}
+
+static DEVICE_ATTR(mode, S_IWUSR | S_IRUGO, show_mode, set_mode);
+
+int ipoib_cm_add_mode_attr(struct net_device *dev)
+{
+ return device_create_file(&dev->dev, &dev_attr_mode);
+}
+
+static void ipoib_cm_create_srq(struct net_device *dev, int max_sge)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ib_srq_init_attr srq_init_attr = {
+ .srq_type = IB_SRQT_BASIC,
+ .attr = {
+ .max_wr = ipoib_recvq_size,
+ .max_sge = max_sge
+ }
+ };
+
+ priv->cm.srq = ib_create_srq(priv->pd, &srq_init_attr);
+ if (IS_ERR(priv->cm.srq)) {
+ if (PTR_ERR(priv->cm.srq) != -ENOSYS)
+ printk(KERN_WARNING "%s: failed to allocate SRQ, error %ld\n",
+ priv->ca->name, PTR_ERR(priv->cm.srq));
+ priv->cm.srq = NULL;
+ return;
+ }
+
+ priv->cm.srq_ring = vzalloc(ipoib_recvq_size * sizeof *priv->cm.srq_ring);
+ if (!priv->cm.srq_ring) {
+ printk(KERN_WARNING "%s: failed to allocate CM SRQ ring (%d entries)\n",
+ priv->ca->name, ipoib_recvq_size);
+ ib_destroy_srq(priv->cm.srq);
+ priv->cm.srq = NULL;
+ return;
+ }
+}
+
+int ipoib_cm_dev_init(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int i, ret;
+ struct ib_device_attr attr;
+
+ INIT_LIST_HEAD(&priv->cm.passive_ids);
+ INIT_LIST_HEAD(&priv->cm.reap_list);
+ INIT_LIST_HEAD(&priv->cm.start_list);
+ INIT_LIST_HEAD(&priv->cm.rx_error_list);
+ INIT_LIST_HEAD(&priv->cm.rx_flush_list);
+ INIT_LIST_HEAD(&priv->cm.rx_drain_list);
+ INIT_LIST_HEAD(&priv->cm.rx_reap_list);
+ INIT_WORK(&priv->cm.start_task, ipoib_cm_tx_start);
+ INIT_WORK(&priv->cm.reap_task, ipoib_cm_tx_reap);
+ INIT_WORK(&priv->cm.skb_task, ipoib_cm_skb_reap);
+ INIT_WORK(&priv->cm.rx_reap_task, ipoib_cm_rx_reap);
+ INIT_DELAYED_WORK(&priv->cm.stale_task, ipoib_cm_stale_task);
+
+ skb_queue_head_init(&priv->cm.skb_queue);
+
+ ret = ib_query_device(priv->ca, &attr);
+ if (ret) {
+ printk(KERN_WARNING "ib_query_device() failed with %d\n", ret);
+ return ret;
+ }
+
+ ipoib_dbg(priv, "max_srq_sge=%d\n", attr.max_srq_sge);
+
+ attr.max_srq_sge = min_t(int, IPOIB_CM_RX_SG, attr.max_srq_sge);
+ ipoib_cm_create_srq(dev, attr.max_srq_sge);
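+ /* With an SRQ, the connected-mode MTU is bounded by how many SGEs one receive can carry;
+ * otherwise fall back to the fixed IPOIB_CM_MTU. */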
+ if (ipoib_cm_has_srq(dev)) {
+ priv->cm.max_cm_mtu = attr.max_srq_sge * PAGE_SIZE - 0x10;
+ priv->cm.num_frags = attr.max_srq_sge;
+ ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n",
+ priv->cm.max_cm_mtu, priv->cm.num_frags);
+ } else {
+ priv->cm.max_cm_mtu = IPOIB_CM_MTU;
+ priv->cm.num_frags = IPOIB_CM_RX_SG;
+ }
+
+ ipoib_cm_init_rx_wr(dev, &priv->cm.rx_wr, priv->cm.rx_sge);
+
+ if (ipoib_cm_has_srq(dev)) {
+ for (i = 0; i < ipoib_recvq_size; ++i) {
+ if (!ipoib_cm_alloc_rx_skb(dev, priv->cm.srq_ring, i,
+ priv->cm.num_frags - 1,
+ priv->cm.srq_ring[i].mapping,
+ GFP_KERNEL)) {
+ ipoib_warn(priv, "failed to allocate "
+ "receive buffer %d\n", i);
+ ipoib_cm_dev_cleanup(dev);
+ return -ENOMEM;
+ }
+
+ if (ipoib_cm_post_receive_srq(dev, i)) {
+ ipoib_warn(priv, "ipoib_cm_post_receive_srq "
+ "failed for buf %d\n", i);
+ ipoib_cm_dev_cleanup(dev);
+ return -EIO;
+ }
+ }
+ }
+
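+ /* Advertise connected-mode (RC) support in the hardware address; see IPOIB_CM_SUPPORTED() */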
+ priv->dev->dev_addr[0] = IPOIB_FLAGS_RC;
+ return 0;
+}
+
+void ipoib_cm_dev_cleanup(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int ret;
+
+ if (!priv->cm.srq)
+ return;
+
+ ipoib_dbg(priv, "Cleanup ipoib connected mode.\n");
+
+ ret = ib_destroy_srq(priv->cm.srq);
+ if (ret)
+ ipoib_warn(priv, "ib_destroy_srq failed: %d\n", ret);
+
+ priv->cm.srq = NULL;
+ if (!priv->cm.srq_ring)
+ return;
+
+ ipoib_cm_free_rx_ring(dev, priv->cm.srq_ring);
+ priv->cm.srq_ring = NULL;
+}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
new file mode 100644
index 000000000..078cadd6c
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2007 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/kernel.h>
+#include <linux/ethtool.h>
+#include <linux/netdevice.h>
+
+#include "ipoib.h"
+
+static void ipoib_get_drvinfo(struct net_device *netdev,
+ struct ethtool_drvinfo *drvinfo)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(netdev);
+ struct ib_device_attr *attr;
+
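+ /* Best effort: report the HCA firmware version (major.minor.sub-minor) as the firmware string */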
+ attr = kmalloc(sizeof(*attr), GFP_KERNEL);
+ if (attr && !ib_query_device(priv->ca, attr))
+ snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
+ "%d.%d.%d", (int)(attr->fw_ver >> 32),
+ (int)(attr->fw_ver >> 16) & 0xffff,
+ (int)attr->fw_ver & 0xffff);
+ kfree(attr);
+
+ strlcpy(drvinfo->bus_info, dev_name(priv->ca->dma_device),
+ sizeof(drvinfo->bus_info));
+
+ strlcpy(drvinfo->version, ipoib_driver_version,
+ sizeof(drvinfo->version));
+
+ strlcpy(drvinfo->driver, "ib_ipoib", sizeof(drvinfo->driver));
+}
+
+static int ipoib_get_coalesce(struct net_device *dev,
+ struct ethtool_coalesce *coal)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ coal->rx_coalesce_usecs = priv->ethtool.coalesce_usecs;
+ coal->rx_max_coalesced_frames = priv->ethtool.max_coalesced_frames;
+
+ return 0;
+}
+
+static int ipoib_set_coalesce(struct net_device *dev,
+ struct ethtool_coalesce *coal)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int ret;
+
+ /*
+ * These values are saved in the private data and returned
+ * when ipoib_get_coalesce() is called
+ */
+ if (coal->rx_coalesce_usecs > 0xffff ||
+ coal->rx_max_coalesced_frames > 0xffff)
+ return -EINVAL;
+
+ ret = ib_modify_cq(priv->recv_cq, coal->rx_max_coalesced_frames,
+ coal->rx_coalesce_usecs);
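+ /* -ENOSYS means the HCA does not support CQ moderation; remember the requested values anyway */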
+ if (ret && ret != -ENOSYS) {
+ ipoib_warn(priv, "failed modifying CQ (%d)\n", ret);
+ return ret;
+ }
+
+ priv->ethtool.coalesce_usecs = coal->rx_coalesce_usecs;
+ priv->ethtool.max_coalesced_frames = coal->rx_max_coalesced_frames;
+
+ return 0;
+}
+
+static const struct ethtool_ops ipoib_ethtool_ops = {
+ .get_drvinfo = ipoib_get_drvinfo,
+ .get_coalesce = ipoib_get_coalesce,
+ .set_coalesce = ipoib_set_coalesce,
+};
+
+void ipoib_set_ethtool_ops(struct net_device *dev)
+{
+ dev->ethtool_ops = &ipoib_ethtool_ops;
+}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_fs.c b/drivers/infiniband/ulp/ipoib/ipoib_fs.c
new file mode 100644
index 000000000..6bd5740e2
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_fs.c
@@ -0,0 +1,297 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/err.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+
+struct file_operations;
+
+#include <linux/debugfs.h>
+#include <linux/export.h>
+
+#include "ipoib.h"
+
+static struct dentry *ipoib_root;
+
+static void format_gid(union ib_gid *gid, char *buf)
+{
+ int i, n;
+
+ for (n = 0, i = 0; i < 8; ++i) {
+ n += sprintf(buf + n, "%x",
+ be16_to_cpu(((__be16 *) gid->raw)[i]));
+ if (i < 7)
+ buf[n++] = ':';
+ }
+}
+
+static void *ipoib_mcg_seq_start(struct seq_file *file, loff_t *pos)
+{
+ struct ipoib_mcast_iter *iter;
+ loff_t n = *pos;
+
+ iter = ipoib_mcast_iter_init(file->private);
+ if (!iter)
+ return NULL;
+
+ while (n--) {
+ if (ipoib_mcast_iter_next(iter)) {
+ kfree(iter);
+ return NULL;
+ }
+ }
+
+ return iter;
+}
+
+static void *ipoib_mcg_seq_next(struct seq_file *file, void *iter_ptr,
+ loff_t *pos)
+{
+ struct ipoib_mcast_iter *iter = iter_ptr;
+
+ (*pos)++;
+
+ if (ipoib_mcast_iter_next(iter)) {
+ kfree(iter);
+ return NULL;
+ }
+
+ return iter;
+}
+
+static void ipoib_mcg_seq_stop(struct seq_file *file, void *iter_ptr)
+{
+ /* nothing for now */
+}
+
+static int ipoib_mcg_seq_show(struct seq_file *file, void *iter_ptr)
+{
+ struct ipoib_mcast_iter *iter = iter_ptr;
+ char gid_buf[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"];
+ union ib_gid mgid;
+ unsigned long created;
+ unsigned int queuelen, complete, send_only;
+
+ if (!iter)
+ return 0;
+
+ ipoib_mcast_iter_read(iter, &mgid, &created, &queuelen,
+ &complete, &send_only);
+
+ format_gid(&mgid, gid_buf);
+
+ seq_printf(file,
+ "GID: %s\n"
+ " created: %10ld\n"
+ " queuelen: %9d\n"
+ " complete: %9s\n"
+ " send_only: %8s\n"
+ "\n",
+ gid_buf, created, queuelen,
+ complete ? "yes" : "no",
+ send_only ? "yes" : "no");
+
+ return 0;
+}
+
+static const struct seq_operations ipoib_mcg_seq_ops = {
+ .start = ipoib_mcg_seq_start,
+ .next = ipoib_mcg_seq_next,
+ .stop = ipoib_mcg_seq_stop,
+ .show = ipoib_mcg_seq_show,
+};
+
+static int ipoib_mcg_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int ret;
+
+ ret = seq_open(file, &ipoib_mcg_seq_ops);
+ if (ret)
+ return ret;
+
+ seq = file->private_data;
+ seq->private = inode->i_private;
+
+ return 0;
+}
+
+static const struct file_operations ipoib_mcg_fops = {
+ .owner = THIS_MODULE,
+ .open = ipoib_mcg_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
+static void *ipoib_path_seq_start(struct seq_file *file, loff_t *pos)
+{
+ struct ipoib_path_iter *iter;
+ loff_t n = *pos;
+
+ iter = ipoib_path_iter_init(file->private);
+ if (!iter)
+ return NULL;
+
+ while (n--) {
+ if (ipoib_path_iter_next(iter)) {
+ kfree(iter);
+ return NULL;
+ }
+ }
+
+ return iter;
+}
+
+static void *ipoib_path_seq_next(struct seq_file *file, void *iter_ptr,
+ loff_t *pos)
+{
+ struct ipoib_path_iter *iter = iter_ptr;
+
+ (*pos)++;
+
+ if (ipoib_path_iter_next(iter)) {
+ kfree(iter);
+ return NULL;
+ }
+
+ return iter;
+}
+
+static void ipoib_path_seq_stop(struct seq_file *file, void *iter_ptr)
+{
+ /* nothing for now */
+}
+
+static int ipoib_path_seq_show(struct seq_file *file, void *iter_ptr)
+{
+ struct ipoib_path_iter *iter = iter_ptr;
+ char gid_buf[sizeof "ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"];
+ struct ipoib_path path;
+ int rate;
+
+ if (!iter)
+ return 0;
+
+ ipoib_path_iter_read(iter, &path);
+
+ format_gid(&path.pathrec.dgid, gid_buf);
+
+ seq_printf(file,
+ "GID: %s\n"
+ " complete: %6s\n",
+ gid_buf, path.pathrec.dlid ? "yes" : "no");
+
+ if (path.pathrec.dlid) {
+ rate = ib_rate_to_mbps(path.pathrec.rate);
+
+ seq_printf(file,
+ " DLID: 0x%04x\n"
+ " SL: %12d\n"
+ " rate: %8d.%d Gb/sec\n",
+ be16_to_cpu(path.pathrec.dlid),
+ path.pathrec.sl,
+ rate / 1000, rate % 1000);
+ }
+
+ seq_putc(file, '\n');
+
+ return 0;
+}
+
+static const struct seq_operations ipoib_path_seq_ops = {
+ .start = ipoib_path_seq_start,
+ .next = ipoib_path_seq_next,
+ .stop = ipoib_path_seq_stop,
+ .show = ipoib_path_seq_show,
+};
+
+static int ipoib_path_open(struct inode *inode, struct file *file)
+{
+ struct seq_file *seq;
+ int ret;
+
+ ret = seq_open(file, &ipoib_path_seq_ops);
+ if (ret)
+ return ret;
+
+ seq = file->private_data;
+ seq->private = inode->i_private;
+
+ return 0;
+}
+
+static const struct file_operations ipoib_path_fops = {
+ .owner = THIS_MODULE,
+ .open = ipoib_path_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release
+};
+
+void ipoib_create_debug_files(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ char name[IFNAMSIZ + sizeof "_path"];
+
+ snprintf(name, sizeof name, "%s_mcg", dev->name);
+ priv->mcg_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO,
+ ipoib_root, dev, &ipoib_mcg_fops);
+ if (!priv->mcg_dentry)
+ ipoib_warn(priv, "failed to create mcg debug file\n");
+
+ snprintf(name, sizeof name, "%s_path", dev->name);
+ priv->path_dentry = debugfs_create_file(name, S_IFREG | S_IRUGO,
+ ipoib_root, dev, &ipoib_path_fops);
+ if (!priv->path_dentry)
+ ipoib_warn(priv, "failed to create path debug file\n");
+}
+
+void ipoib_delete_debug_files(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ debugfs_remove(priv->mcg_dentry);
+ debugfs_remove(priv->path_dentry);
+}
+
+int ipoib_register_debugfs(void)
+{
+ ipoib_root = debugfs_create_dir("ipoib", NULL);
+ return ipoib_root ? 0 : -ENOMEM;
+}
+
+void ipoib_unregister_debugfs(void)
+{
+ debugfs_remove(ipoib_root);
+}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ib.c b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
new file mode 100644
index 000000000..63b92cbb2
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_ib.c
@@ -0,0 +1,1123 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2004, 2005 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/delay.h>
+#include <linux/moduleparam.h>
+#include <linux/dma-mapping.h>
+#include <linux/slab.h>
+
+#include <linux/ip.h>
+#include <linux/tcp.h>
+
+#include "ipoib.h"
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG_DATA
+static int data_debug_level;
+
+module_param(data_debug_level, int, 0644);
+MODULE_PARM_DESC(data_debug_level,
+ "Enable data path debug tracing if > 0");
+#endif
+
+static DEFINE_MUTEX(pkey_mutex);
+
+struct ipoib_ah *ipoib_create_ah(struct net_device *dev,
+ struct ib_pd *pd, struct ib_ah_attr *attr)
+{
+ struct ipoib_ah *ah;
+ struct ib_ah *vah;
+
+ ah = kmalloc(sizeof *ah, GFP_KERNEL);
+ if (!ah)
+ return ERR_PTR(-ENOMEM);
+
+ ah->dev = dev;
+ ah->last_send = 0;
+ kref_init(&ah->ref);
+
+ vah = ib_create_ah(pd, attr);
+ if (IS_ERR(vah)) {
+ kfree(ah);
+ ah = (struct ipoib_ah *)vah;
+ } else {
+ ah->ah = vah;
+ ipoib_dbg(netdev_priv(dev), "Created ah %p\n", ah->ah);
+ }
+
+ return ah;
+}
+
+void ipoib_free_ah(struct kref *kref)
+{
+ struct ipoib_ah *ah = container_of(kref, struct ipoib_ah, ref);
+ struct ipoib_dev_priv *priv = netdev_priv(ah->dev);
+
+ unsigned long flags;
+
+ spin_lock_irqsave(&priv->lock, flags);
+ list_add_tail(&ah->list, &priv->dead_ahs);
+ spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+static void ipoib_ud_dma_unmap_rx(struct ipoib_dev_priv *priv,
+ u64 mapping[IPOIB_UD_RX_SG])
+{
+ ib_dma_unmap_single(priv->ca, mapping[0],
+ IPOIB_UD_BUF_SIZE(priv->max_ib_mtu),
+ DMA_FROM_DEVICE);
+}
+
+static int ipoib_ib_post_receive(struct net_device *dev, int id)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ib_recv_wr *bad_wr;
+ int ret;
+
+ priv->rx_wr.wr_id = id | IPOIB_OP_RECV;
+ priv->rx_sge[0].addr = priv->rx_ring[id].mapping[0];
+ priv->rx_sge[1].addr = priv->rx_ring[id].mapping[1];
+
+ ret = ib_post_recv(priv->qp, &priv->rx_wr, &bad_wr);
+ if (unlikely(ret)) {
+ ipoib_warn(priv, "receive failed for buf %d (%d)\n", id, ret);
+ ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[id].mapping);
+ dev_kfree_skb_any(priv->rx_ring[id].skb);
+ priv->rx_ring[id].skb = NULL;
+ }
+
+ return ret;
+}
+
+static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct sk_buff *skb;
+ int buf_size;
+ u64 *mapping;
+
+ buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
+
+ skb = dev_alloc_skb(buf_size + IPOIB_ENCAP_LEN);
+ if (unlikely(!skb))
+ return NULL;
+
+ /*
+ * IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte
+ * header. So we need 4 more bytes to get to 48 and align the
+ * IP header to a multiple of 16.
+ */
+ skb_reserve(skb, 4);
+
+ mapping = priv->rx_ring[id].mapping;
+ mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size,
+ DMA_FROM_DEVICE);
+ if (unlikely(ib_dma_mapping_error(priv->ca, mapping[0])))
+ goto error;
+
+ priv->rx_ring[id].skb = skb;
+ return skb;
+error:
+ dev_kfree_skb_any(skb);
+ return NULL;
+}
+
+static int ipoib_ib_post_receives(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int i;
+
+ for (i = 0; i < ipoib_recvq_size; ++i) {
+ if (!ipoib_alloc_rx_skb(dev, i)) {
+ ipoib_warn(priv, "failed to allocate receive buffer %d\n", i);
+ return -ENOMEM;
+ }
+ if (ipoib_ib_post_receive(dev, i)) {
+ ipoib_warn(priv, "ipoib_ib_post_receive failed for buf %d\n", i);
+ return -EIO;
+ }
+ }
+
+ return 0;
+}
+
+static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ unsigned int wr_id = wc->wr_id & ~IPOIB_OP_RECV;
+ struct sk_buff *skb;
+ u64 mapping[IPOIB_UD_RX_SG];
+ union ib_gid *dgid;
+
+ ipoib_dbg_data(priv, "recv completion: id %d, status: %d\n",
+ wr_id, wc->status);
+
+ if (unlikely(wr_id >= ipoib_recvq_size)) {
+ ipoib_warn(priv, "recv completion event with wrid %d (> %d)\n",
+ wr_id, ipoib_recvq_size);
+ return;
+ }
+
+ skb = priv->rx_ring[wr_id].skb;
+
+ if (unlikely(wc->status != IB_WC_SUCCESS)) {
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ ipoib_warn(priv, "failed recv event "
+ "(status=%d, wrid=%d vend_err %x)\n",
+ wc->status, wr_id, wc->vendor_err);
+ ipoib_ud_dma_unmap_rx(priv, priv->rx_ring[wr_id].mapping);
+ dev_kfree_skb_any(skb);
+ priv->rx_ring[wr_id].skb = NULL;
+ return;
+ }
+
+ /*
+ * Drop packets that this interface sent, ie multicast packets
+ * that the HCA has replicated.
+ */
+ if (wc->slid == priv->local_lid && wc->src_qp == priv->qp->qp_num)
+ goto repost;
+
+ memcpy(mapping, priv->rx_ring[wr_id].mapping,
+ IPOIB_UD_RX_SG * sizeof *mapping);
+
+ /*
+ * If we can't allocate a new RX buffer, dump
+ * this packet and reuse the old buffer.
+ */
+ if (unlikely(!ipoib_alloc_rx_skb(dev, wr_id))) {
+ ++dev->stats.rx_dropped;
+ goto repost;
+ }
+
+ ipoib_dbg_data(priv, "received %d bytes, SLID 0x%04x\n",
+ wc->byte_len, wc->slid);
+
+ ipoib_ud_dma_unmap_rx(priv, mapping);
+
+ skb_put(skb, wc->byte_len);
+
+ /* First byte of dgid signals multicast when 0xff */
+ dgid = &((struct ib_grh *)skb->data)->dgid;
+
+ if (!(wc->wc_flags & IB_WC_GRH) || dgid->raw[0] != 0xff)
+ skb->pkt_type = PACKET_HOST;
+ else if (memcmp(dgid, dev->broadcast + 4, sizeof(union ib_gid)) == 0)
+ skb->pkt_type = PACKET_BROADCAST;
+ else
+ skb->pkt_type = PACKET_MULTICAST;
+
+ skb_pull(skb, IB_GRH_BYTES);
+
+ skb->protocol = ((struct ipoib_header *) skb->data)->proto;
+ skb_reset_mac_header(skb);
+ skb_pull(skb, IPOIB_ENCAP_LEN);
+
+ skb->truesize = SKB_TRUESIZE(skb->len);
+
+ ++dev->stats.rx_packets;
+ dev->stats.rx_bytes += skb->len;
+
+ skb->dev = dev;
+ if ((dev->features & NETIF_F_RXCSUM) &&
+ likely(wc->wc_flags & IB_WC_IP_CSUM_OK))
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+ napi_gro_receive(&priv->napi, skb);
+
+repost:
+ if (unlikely(ipoib_ib_post_receive(dev, wr_id)))
+ ipoib_warn(priv, "ipoib_ib_post_receive failed "
+ "for buf %d\n", wr_id);
+}
+
+static int ipoib_dma_map_tx(struct ib_device *ca,
+ struct ipoib_tx_buf *tx_req)
+{
+ struct sk_buff *skb = tx_req->skb;
+ u64 *mapping = tx_req->mapping;
+ int i;
+ int off;
+
+ if (skb_headlen(skb)) {
+ mapping[0] = ib_dma_map_single(ca, skb->data, skb_headlen(skb),
+ DMA_TO_DEVICE);
+ if (unlikely(ib_dma_mapping_error(ca, mapping[0])))
+ return -EIO;
+
+ off = 1;
+ } else
+ off = 0;
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ mapping[i + off] = ib_dma_map_page(ca,
+ skb_frag_page(frag),
+ frag->page_offset, skb_frag_size(frag),
+ DMA_TO_DEVICE);
+ if (unlikely(ib_dma_mapping_error(ca, mapping[i + off])))
+ goto partial_error;
+ }
+ return 0;
+
+partial_error:
+ for (; i > 0; --i) {
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i - 1];
+
+ ib_dma_unmap_page(ca, mapping[i - !off], skb_frag_size(frag), DMA_TO_DEVICE);
+ }
+
+ if (off)
+ ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);
+
+ return -EIO;
+}
+
+static void ipoib_dma_unmap_tx(struct ib_device *ca,
+ struct ipoib_tx_buf *tx_req)
+{
+ struct sk_buff *skb = tx_req->skb;
+ u64 *mapping = tx_req->mapping;
+ int i;
+ int off;
+
+ if (skb_headlen(skb)) {
+ ib_dma_unmap_single(ca, mapping[0], skb_headlen(skb), DMA_TO_DEVICE);
+ off = 1;
+ } else
+ off = 0;
+
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; ++i) {
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+ ib_dma_unmap_page(ca, mapping[i + off], skb_frag_size(frag),
+ DMA_TO_DEVICE);
+ }
+}
+
+/*
+ * As a result of a completion error the QP can be transferred to the SQE
+ * state. This function checks whether the (send) QP is in the SQE state
+ * and, if so, moves it back to RTS so that it is functional again.
+ */
+static void ipoib_qp_state_validate_work(struct work_struct *work)
+{
+ struct ipoib_qp_state_validate *qp_work =
+ container_of(work, struct ipoib_qp_state_validate, work);
+
+ struct ipoib_dev_priv *priv = qp_work->priv;
+ struct ib_qp_attr qp_attr;
+ struct ib_qp_init_attr query_init_attr;
+ int ret;
+
+ ret = ib_query_qp(priv->qp, &qp_attr, IB_QP_STATE, &query_init_attr);
+ if (ret) {
+ ipoib_warn(priv, "%s: Failed to query QP ret: %d\n",
+ __func__, ret);
+ goto free_res;
+ }
+ pr_info("%s: QP: 0x%x is in state: %d\n",
+ __func__, priv->qp->qp_num, qp_attr.qp_state);
+
+ /* currently we only support the SQE->RTS transition */
+ if (qp_attr.qp_state == IB_QPS_SQE) {
+ qp_attr.qp_state = IB_QPS_RTS;
+
+ ret = ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE);
+ if (ret) {
+ pr_warn("failed(%d) modify QP:0x%x SQE->RTS\n",
+ ret, priv->qp->qp_num);
+ goto free_res;
+ }
+ pr_info("%s: QP: 0x%x moved from IB_QPS_SQE to IB_QPS_RTS\n",
+ __func__, priv->qp->qp_num);
+ } else {
+ pr_warn("QP (%d) will stay in state: %d\n",
+ priv->qp->qp_num, qp_attr.qp_state);
+ }
+
+free_res:
+ kfree(qp_work);
+}
+
+static void ipoib_ib_handle_tx_wc(struct net_device *dev, struct ib_wc *wc)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ unsigned int wr_id = wc->wr_id;
+ struct ipoib_tx_buf *tx_req;
+
+ ipoib_dbg_data(priv, "send completion: id %d, status: %d\n",
+ wr_id, wc->status);
+
+ if (unlikely(wr_id >= ipoib_sendq_size)) {
+ ipoib_warn(priv, "send completion event with wrid %d (> %d)\n",
+ wr_id, ipoib_sendq_size);
+ return;
+ }
+
+ tx_req = &priv->tx_ring[wr_id];
+
+ ipoib_dma_unmap_tx(priv->ca, tx_req);
+
+ ++dev->stats.tx_packets;
+ dev->stats.tx_bytes += tx_req->skb->len;
+
+ dev_kfree_skb_any(tx_req->skb);
+
+ ++priv->tx_tail;
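+ /* wake the net queue once outstanding sends drop to half the ring size */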
+ if (unlikely(--priv->tx_outstanding == ipoib_sendq_size >> 1) &&
+ netif_queue_stopped(dev) &&
+ test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags))
+ netif_wake_queue(dev);
+
+ if (wc->status != IB_WC_SUCCESS &&
+ wc->status != IB_WC_WR_FLUSH_ERR) {
+ struct ipoib_qp_state_validate *qp_work;
+ ipoib_warn(priv, "failed send event "
+ "(status=%d, wrid=%d vend_err %x)\n",
+ wc->status, wr_id, wc->vendor_err);
+ qp_work = kzalloc(sizeof(*qp_work), GFP_ATOMIC);
+ if (!qp_work) {
+ ipoib_warn(priv, "%s Failed alloc ipoib_qp_state_validate for qp: 0x%x\n",
+ __func__, priv->qp->qp_num);
+ return;
+ }
+
+ INIT_WORK(&qp_work->work, ipoib_qp_state_validate_work);
+ qp_work->priv = priv;
+ queue_work(priv->wq, &qp_work->work);
+ }
+}
+
+static int poll_tx(struct ipoib_dev_priv *priv)
+{
+ int n, i;
+
+ n = ib_poll_cq(priv->send_cq, MAX_SEND_CQE, priv->send_wc);
+ for (i = 0; i < n; ++i)
+ ipoib_ib_handle_tx_wc(priv->dev, priv->send_wc + i);
+
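+ /* a full batch means the send CQ may still hold more completions */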
+ return n == MAX_SEND_CQE;
+}
+
+int ipoib_poll(struct napi_struct *napi, int budget)
+{
+ struct ipoib_dev_priv *priv = container_of(napi, struct ipoib_dev_priv, napi);
+ struct net_device *dev = priv->dev;
+ int done;
+ int t;
+ int n, i;
+
+ done = 0;
+
+poll_more:
+ while (done < budget) {
+ int max = (budget - done);
+
+ t = min(IPOIB_NUM_WC, max);
+ n = ib_poll_cq(priv->recv_cq, t, priv->ibwc);
+
+ for (i = 0; i < n; i++) {
+ struct ib_wc *wc = priv->ibwc + i;
+
+ if (wc->wr_id & IPOIB_OP_RECV) {
+ ++done;
+ if (wc->wr_id & IPOIB_OP_CM)
+ ipoib_cm_handle_rx_wc(dev, wc);
+ else
+ ipoib_ib_handle_rx_wc(dev, wc);
+ } else
+ ipoib_cm_handle_tx_wc(priv->dev, wc);
+ }
+
+ if (n != t)
+ break;
+ }
+
+ if (done < budget) {
+ napi_complete(napi);
+ if (unlikely(ib_req_notify_cq(priv->recv_cq,
+ IB_CQ_NEXT_COMP |
+ IB_CQ_REPORT_MISSED_EVENTS)) &&
+ napi_reschedule(napi))
+ goto poll_more;
+ }
+
+ return done;
+}
+
+void ipoib_ib_completion(struct ib_cq *cq, void *dev_ptr)
+{
+ struct net_device *dev = dev_ptr;
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ napi_schedule(&priv->napi);
+}
+
+static void drain_tx_cq(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ netif_tx_lock(dev);
+ while (poll_tx(priv))
+ ; /* nothing */
+
+ if (netif_queue_stopped(dev))
+ mod_timer(&priv->poll_timer, jiffies + 1);
+
+ netif_tx_unlock(dev);
+}
+
+void ipoib_send_comp_handler(struct ib_cq *cq, void *dev_ptr)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev_ptr);
+
+ mod_timer(&priv->poll_timer, jiffies);
+}
+
+static inline int post_send(struct ipoib_dev_priv *priv,
+ unsigned int wr_id,
+ struct ib_ah *address, u32 qpn,
+ struct ipoib_tx_buf *tx_req,
+ void *head, int hlen)
+{
+ struct ib_send_wr *bad_wr;
+ int i, off;
+ struct sk_buff *skb = tx_req->skb;
+ skb_frag_t *frags = skb_shinfo(skb)->frags;
+ int nr_frags = skb_shinfo(skb)->nr_frags;
+ u64 *mapping = tx_req->mapping;
+
+ if (skb_headlen(skb)) {
+ priv->tx_sge[0].addr = mapping[0];
+ priv->tx_sge[0].length = skb_headlen(skb);
+ off = 1;
+ } else
+ off = 0;
+
+ for (i = 0; i < nr_frags; ++i) {
+ priv->tx_sge[i + off].addr = mapping[i + off];
+ priv->tx_sge[i + off].length = skb_frag_size(&frags[i]);
+ }
+ priv->tx_wr.num_sge = nr_frags + off;
+ priv->tx_wr.wr_id = wr_id;
+ priv->tx_wr.wr.ud.remote_qpn = qpn;
+ priv->tx_wr.wr.ud.ah = address;
+
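+ /* GSO packets carry a prepared header and go out as an LSO work request */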
+ if (head) {
+ priv->tx_wr.wr.ud.mss = skb_shinfo(skb)->gso_size;
+ priv->tx_wr.wr.ud.header = head;
+ priv->tx_wr.wr.ud.hlen = hlen;
+ priv->tx_wr.opcode = IB_WR_LSO;
+ } else
+ priv->tx_wr.opcode = IB_WR_SEND;
+
+ return ib_post_send(priv->qp, &priv->tx_wr, &bad_wr);
+}
+
+void ipoib_send(struct net_device *dev, struct sk_buff *skb,
+ struct ipoib_ah *address, u32 qpn)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_tx_buf *tx_req;
+ int hlen, rc;
+ void *phead;
+
+ if (skb_is_gso(skb)) {
+ hlen = skb_transport_offset(skb) + tcp_hdrlen(skb);
+ phead = skb->data;
+ if (unlikely(!skb_pull(skb, hlen))) {
+ ipoib_warn(priv, "linear data too small\n");
+ ++dev->stats.tx_dropped;
+ ++dev->stats.tx_errors;
+ dev_kfree_skb_any(skb);
+ return;
+ }
+ } else {
+ if (unlikely(skb->len > priv->mcast_mtu + IPOIB_ENCAP_LEN)) {
+ ipoib_warn(priv, "packet len %d (> %d) too long to send, dropping\n",
+ skb->len, priv->mcast_mtu + IPOIB_ENCAP_LEN);
+ ++dev->stats.tx_dropped;
+ ++dev->stats.tx_errors;
+ ipoib_cm_skb_too_long(dev, skb, priv->mcast_mtu);
+ return;
+ }
+ phead = NULL;
+ hlen = 0;
+ }
+
+ ipoib_dbg_data(priv, "sending packet, length=%d address=%p qpn=0x%06x\n",
+ skb->len, address, qpn);
+
+ /*
+ * We put the skb into the tx_ring _before_ we call post_send()
+ * because it's entirely possible that the completion handler will
+ * run before we execute anything after the post_send(). That
+ * means we have to make sure everything is properly recorded and
+ * our state is consistent before we call post_send().
+ */
+ tx_req = &priv->tx_ring[priv->tx_head & (ipoib_sendq_size - 1)];
+ tx_req->skb = skb;
+ if (unlikely(ipoib_dma_map_tx(priv->ca, tx_req))) {
+ ++dev->stats.tx_errors;
+ dev_kfree_skb_any(skb);
+ return;
+ }
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ priv->tx_wr.send_flags |= IB_SEND_IP_CSUM;
+ else
+ priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
+
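+ /* stop the net queue when the send ring becomes full */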
+ if (++priv->tx_outstanding == ipoib_sendq_size) {
+ ipoib_dbg(priv, "TX ring full, stopping kernel net queue\n");
+ if (ib_req_notify_cq(priv->send_cq, IB_CQ_NEXT_COMP))
+ ipoib_warn(priv, "request notify on send CQ failed\n");
+ netif_stop_queue(dev);
+ }
+
+ skb_orphan(skb);
+ skb_dst_drop(skb);
+
+ rc = post_send(priv, priv->tx_head & (ipoib_sendq_size - 1),
+ address->ah, qpn, tx_req, phead, hlen);
+ if (unlikely(rc)) {
+ ipoib_warn(priv, "post_send failed, error %d\n", rc);
+ ++dev->stats.tx_errors;
+ --priv->tx_outstanding;
+ ipoib_dma_unmap_tx(priv->ca, tx_req);
+ dev_kfree_skb_any(skb);
+ if (netif_queue_stopped(dev))
+ netif_wake_queue(dev);
+ } else {
+ dev->trans_start = jiffies;
+
+ address->last_send = priv->tx_head;
+ ++priv->tx_head;
+ }
+
+ if (unlikely(priv->tx_outstanding > MAX_SEND_CQE))
+ while (poll_tx(priv))
+ ; /* nothing */
+}
+
+static void __ipoib_reap_ah(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_ah *ah, *tah;
+ LIST_HEAD(remove_list);
+ unsigned long flags;
+
+ netif_tx_lock_bh(dev);
+ spin_lock_irqsave(&priv->lock, flags);
+
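+ /* destroy only those AHs whose last posted send has already completed */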
+ list_for_each_entry_safe(ah, tah, &priv->dead_ahs, list)
+ if ((int) priv->tx_tail - (int) ah->last_send >= 0) {
+ list_del(&ah->list);
+ ib_destroy_ah(ah->ah);
+ kfree(ah);
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+ netif_tx_unlock_bh(dev);
+}
+
+void ipoib_reap_ah(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv =
+ container_of(work, struct ipoib_dev_priv, ah_reap_task.work);
+ struct net_device *dev = priv->dev;
+
+ __ipoib_reap_ah(dev);
+
+ if (!test_bit(IPOIB_STOP_REAPER, &priv->flags))
+ queue_delayed_work(priv->wq, &priv->ah_reap_task,
+ round_jiffies_relative(HZ));
+}
+
+static void ipoib_flush_ah(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ cancel_delayed_work(&priv->ah_reap_task);
+ flush_workqueue(priv->wq);
+ ipoib_reap_ah(&priv->ah_reap_task.work);
+}
+
+static void ipoib_stop_ah(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ set_bit(IPOIB_STOP_REAPER, &priv->flags);
+ ipoib_flush_ah(dev);
+}
+
+static void ipoib_ib_tx_timer_func(unsigned long ctx)
+{
+ drain_tx_cq((struct net_device *)ctx);
+}
+
+int ipoib_ib_dev_open(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int ret;
+
+ ipoib_pkey_dev_check_presence(dev);
+
+ if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
+ ipoib_warn(priv, "P_Key 0x%04x is %s\n", priv->pkey,
+ (!(priv->pkey & 0x7fff) ? "Invalid" : "not found"));
+ return -1;
+ }
+
+ ret = ipoib_init_qp(dev);
+ if (ret) {
+ ipoib_warn(priv, "ipoib_init_qp returned %d\n", ret);
+ return -1;
+ }
+
+ ret = ipoib_ib_post_receives(dev);
+ if (ret) {
+ ipoib_warn(priv, "ipoib_ib_post_receives returned %d\n", ret);
+ goto dev_stop;
+ }
+
+ ret = ipoib_cm_dev_open(dev);
+ if (ret) {
+ ipoib_warn(priv, "ipoib_cm_dev_open returned %d\n", ret);
+ goto dev_stop;
+ }
+
+ clear_bit(IPOIB_STOP_REAPER, &priv->flags);
+ queue_delayed_work(priv->wq, &priv->ah_reap_task,
+ round_jiffies_relative(HZ));
+
+ if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
+ napi_enable(&priv->napi);
+
+ return 0;
+dev_stop:
+ if (!test_and_set_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
+ napi_enable(&priv->napi);
+ ipoib_ib_dev_stop(dev);
+ return -1;
+}
+
+void ipoib_pkey_dev_check_presence(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ if (!(priv->pkey & 0x7fff) ||
+ ib_find_pkey(priv->ca, priv->port, priv->pkey,
+ &priv->pkey_index))
+ clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+ else
+ set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+}
+
+int ipoib_ib_dev_up(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ ipoib_pkey_dev_check_presence(dev);
+
+ if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags)) {
+ ipoib_dbg(priv, "PKEY is not assigned.\n");
+ return 0;
+ }
+
+ set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
+
+ return ipoib_mcast_start_thread(dev);
+}
+
+int ipoib_ib_dev_down(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ ipoib_dbg(priv, "downing ib_dev\n");
+
+ clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
+ netif_carrier_off(dev);
+
+ ipoib_mcast_stop_thread(dev);
+ ipoib_mcast_dev_flush(dev);
+
+ ipoib_flush_paths(dev);
+
+ return 0;
+}
+
+static int recvs_pending(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int pending = 0;
+ int i;
+
+ for (i = 0; i < ipoib_recvq_size; ++i)
+ if (priv->rx_ring[i].skb)
+ ++pending;
+
+ return pending;
+}
+
+void ipoib_drain_cq(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int i, n;
+
+ /*
+ * We call completion handling routines that expect to be
+ * called from the BH-disabled NAPI poll context, so disable
+ * BHs here too.
+ */
+ local_bh_disable();
+
+ do {
+ n = ib_poll_cq(priv->recv_cq, IPOIB_NUM_WC, priv->ibwc);
+ for (i = 0; i < n; ++i) {
+ /*
+ * Convert any successful completions to flush
+ * errors to avoid passing packets up the
+ * stack after bringing the device down.
+ */
+ if (priv->ibwc[i].status == IB_WC_SUCCESS)
+ priv->ibwc[i].status = IB_WC_WR_FLUSH_ERR;
+
+ if (priv->ibwc[i].wr_id & IPOIB_OP_RECV) {
+ if (priv->ibwc[i].wr_id & IPOIB_OP_CM)
+ ipoib_cm_handle_rx_wc(dev, priv->ibwc + i);
+ else
+ ipoib_ib_handle_rx_wc(dev, priv->ibwc + i);
+ } else
+ ipoib_cm_handle_tx_wc(dev, priv->ibwc + i);
+ }
+ } while (n == IPOIB_NUM_WC);
+
+ while (poll_tx(priv))
+ ; /* nothing */
+
+ local_bh_enable();
+}
+
+int ipoib_ib_dev_stop(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ib_qp_attr qp_attr;
+ unsigned long begin;
+ struct ipoib_tx_buf *tx_req;
+ int i;
+
+ if (test_and_clear_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
+ napi_disable(&priv->napi);
+
+ ipoib_cm_dev_stop(dev);
+
+ /*
+ * Move our QP to the error state and then reinitialize it once
+ * all work requests have completed or have been flushed.
+ */
+ qp_attr.qp_state = IB_QPS_ERR;
+ if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
+ ipoib_warn(priv, "Failed to modify QP to ERROR state\n");
+
+ /* Wait for all sends and receives to complete */
+ begin = jiffies;
+
+ while (priv->tx_head != priv->tx_tail || recvs_pending(dev)) {
+ if (time_after(jiffies, begin + 5 * HZ)) {
+ ipoib_warn(priv, "timing out; %d sends %d receives not completed\n",
+ priv->tx_head - priv->tx_tail, recvs_pending(dev));
+
+ /*
+ * assume the HW is wedged and just free up
+ * all our pending work requests.
+ */
+ while ((int) priv->tx_tail - (int) priv->tx_head < 0) {
+ tx_req = &priv->tx_ring[priv->tx_tail &
+ (ipoib_sendq_size - 1)];
+ ipoib_dma_unmap_tx(priv->ca, tx_req);
+ dev_kfree_skb_any(tx_req->skb);
+ ++priv->tx_tail;
+ --priv->tx_outstanding;
+ }
+
+ for (i = 0; i < ipoib_recvq_size; ++i) {
+ struct ipoib_rx_buf *rx_req;
+
+ rx_req = &priv->rx_ring[i];
+ if (!rx_req->skb)
+ continue;
+ ipoib_ud_dma_unmap_rx(priv,
+ priv->rx_ring[i].mapping);
+ dev_kfree_skb_any(rx_req->skb);
+ rx_req->skb = NULL;
+ }
+
+ goto timeout;
+ }
+
+ ipoib_drain_cq(dev);
+
+ msleep(1);
+ }
+
+ ipoib_dbg(priv, "All sends and receives done.\n");
+
+timeout:
+ del_timer_sync(&priv->poll_timer);
+ qp_attr.qp_state = IB_QPS_RESET;
+ if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
+ ipoib_warn(priv, "Failed to modify QP to RESET state\n");
+
+ ipoib_flush_ah(dev);
+
+ ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP);
+
+ return 0;
+}
+
+int ipoib_ib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ priv->ca = ca;
+ priv->port = port;
+ priv->qp = NULL;
+
+ if (ipoib_transport_dev_init(dev, ca)) {
+ printk(KERN_WARNING "%s: ipoib_transport_dev_init failed\n", ca->name);
+ return -ENODEV;
+ }
+
+ setup_timer(&priv->poll_timer, ipoib_ib_tx_timer_func,
+ (unsigned long) dev);
+
+ if (dev->flags & IFF_UP) {
+ if (ipoib_ib_dev_open(dev)) {
+ ipoib_transport_dev_cleanup(dev);
+ return -ENODEV;
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Takes whatever value is in pkey index 0 and updates priv->pkey;
+ * returns 0 if the pkey value was changed.
+ */
+static inline int update_parent_pkey(struct ipoib_dev_priv *priv)
+{
+ int result;
+ u16 prev_pkey;
+
+ prev_pkey = priv->pkey;
+ result = ib_query_pkey(priv->ca, priv->port, 0, &priv->pkey);
+ if (result) {
+ ipoib_warn(priv, "ib_query_pkey port %d failed (ret = %d)\n",
+ priv->port, result);
+ return result;
+ }
+
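+ /* set the full membership bit in the P_Key we just read */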
+ priv->pkey |= 0x8000;
+
+ if (prev_pkey != priv->pkey) {
+ ipoib_dbg(priv, "pkey changed from 0x%x to 0x%x\n",
+ prev_pkey, priv->pkey);
+ /*
+ * Update the pkey in the broadcast address, while making sure to set
+ * the full membership bit, so that we join the right broadcast group.
+ */
+ priv->dev->broadcast[8] = priv->pkey >> 8;
+ priv->dev->broadcast[9] = priv->pkey & 0xff;
+ return 0;
+ }
+
+ return 1;
+}
+/*
+ * returns 0 if pkey value was found in a different slot.
+ */
+static inline int update_child_pkey(struct ipoib_dev_priv *priv)
+{
+ u16 old_index = priv->pkey_index;
+
+ priv->pkey_index = 0;
+ ipoib_pkey_dev_check_presence(priv->dev);
+
+ if (test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags) &&
+ (old_index == priv->pkey_index))
+ return 1;
+ return 0;
+}
+
+static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
+ enum ipoib_flush_level level)
+{
+ struct ipoib_dev_priv *cpriv;
+ struct net_device *dev = priv->dev;
+ int result;
+
+ down_read(&priv->vlan_rwsem);
+
+ /*
+ * Flush any child interfaces too -- they might be up even if
+ * the parent is down.
+ */
+ list_for_each_entry(cpriv, &priv->child_intfs, list)
+ __ipoib_ib_dev_flush(cpriv, level);
+
+ up_read(&priv->vlan_rwsem);
+
+ if (!test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags) &&
+ level != IPOIB_FLUSH_HEAVY) {
+ ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_INITIALIZED not set.\n");
+ return;
+ }
+
+ if (!test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
+ /* interface is down. update pkey and leave. */
+ if (level == IPOIB_FLUSH_HEAVY) {
+ if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags))
+ update_parent_pkey(priv);
+ else
+ update_child_pkey(priv);
+ }
+ ipoib_dbg(priv, "Not flushing - IPOIB_FLAG_ADMIN_UP not set.\n");
+ return;
+ }
+
+ if (level == IPOIB_FLUSH_HEAVY) {
+ /* child devices chase their origin pkey value, while non-child
+ * (parent) devices should always take whatever is present in pkey index 0
+ */
+ if (test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
+ result = update_child_pkey(priv);
+ if (result) {
+ /* restart QP only if P_Key index is changed */
+ ipoib_dbg(priv, "Not flushing - P_Key index not changed.\n");
+ return;
+ }
+
+ } else {
+ result = update_parent_pkey(priv);
+ /* restart QP only if P_Key value changed */
+ if (result) {
+ ipoib_dbg(priv, "Not flushing - P_Key value not changed.\n");
+ return;
+ }
+ }
+ }
+
+ if (level == IPOIB_FLUSH_LIGHT) {
+ ipoib_mark_paths_invalid(dev);
+ ipoib_mcast_dev_flush(dev);
+ ipoib_flush_ah(dev);
+ }
+
+ if (level >= IPOIB_FLUSH_NORMAL)
+ ipoib_ib_dev_down(dev);
+
+ if (level == IPOIB_FLUSH_HEAVY) {
+ if (test_bit(IPOIB_FLAG_INITIALIZED, &priv->flags))
+ ipoib_ib_dev_stop(dev);
+ if (ipoib_ib_dev_open(dev) != 0)
+ return;
+ if (netif_queue_stopped(dev))
+ netif_start_queue(dev);
+ }
+
+ /*
+ * The device could have been brought down between the start of the
+ * flush and when we get here; don't bring it back up if it's not
+ * configured up.
+ */
+ if (test_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags)) {
+ if (level >= IPOIB_FLUSH_NORMAL)
+ ipoib_ib_dev_up(dev);
+ ipoib_mcast_restart_task(&priv->restart_task);
+ }
+}
+
+void ipoib_ib_dev_flush_light(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv =
+ container_of(work, struct ipoib_dev_priv, flush_light);
+
+ __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_LIGHT);
+}
+
+void ipoib_ib_dev_flush_normal(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv =
+ container_of(work, struct ipoib_dev_priv, flush_normal);
+
+ __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_NORMAL);
+}
+
+void ipoib_ib_dev_flush_heavy(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv =
+ container_of(work, struct ipoib_dev_priv, flush_heavy);
+
+ __ipoib_ib_dev_flush(priv, IPOIB_FLUSH_HEAVY);
+}
+
+void ipoib_ib_dev_cleanup(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ ipoib_dbg(priv, "cleaning up ib_dev\n");
+ /*
+ * We must make sure there are no more (path) completions
+ * that may wish to touch priv fields that are no longer valid
+ */
+ ipoib_flush_paths(dev);
+
+ ipoib_mcast_stop_thread(dev);
+ ipoib_mcast_dev_flush(dev);
+
+ /*
+ * Not all of our ah references are released until after
+ * ipoib_mcast_dev_flush(), ipoib_flush_paths() and the neighbor
+ * garbage collection have been stopped and reaped.
+ * That should all be done now, so make a final ah flush.
+ */
+ ipoib_stop_ah(dev);
+
+ ipoib_transport_dev_cleanup(dev);
+}
+
+
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c
new file mode 100644
index 000000000..9e1b203d7
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
@@ -0,0 +1,1827 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ipoib.h"
+
+#include <linux/module.h>
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+
+#include <linux/if_arp.h> /* For ARPHRD_xxx */
+
+#include <linux/ip.h>
+#include <linux/in.h>
+
+#include <linux/jhash.h>
+#include <net/arp.h>
+
+#define DRV_VERSION "1.0.0"
+
+const char ipoib_driver_version[] = DRV_VERSION;
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("IP-over-InfiniBand net driver");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_VERSION(DRV_VERSION);
+
+int ipoib_sendq_size __read_mostly = IPOIB_TX_RING_SIZE;
+int ipoib_recvq_size __read_mostly = IPOIB_RX_RING_SIZE;
+
+module_param_named(send_queue_size, ipoib_sendq_size, int, 0444);
+MODULE_PARM_DESC(send_queue_size, "Number of descriptors in send queue");
+module_param_named(recv_queue_size, ipoib_recvq_size, int, 0444);
+MODULE_PARM_DESC(recv_queue_size, "Number of descriptors in receive queue");
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+int ipoib_debug_level;
+
+module_param_named(debug_level, ipoib_debug_level, int, 0644);
+MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
+#endif
+
+struct ipoib_path_iter {
+ struct net_device *dev;
+ struct ipoib_path path;
+};
+
+static const u8 ipv4_bcast_addr[] = {
+ 0x00, 0xff, 0xff, 0xff,
+ 0xff, 0x12, 0x40, 0x1b, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff
+};
+
+struct workqueue_struct *ipoib_workqueue;
+
+struct ib_sa_client ipoib_sa_client;
+
+static void ipoib_add_one(struct ib_device *device);
+static void ipoib_remove_one(struct ib_device *device);
+static void ipoib_neigh_reclaim(struct rcu_head *rp);
+
+static struct ib_client ipoib_client = {
+ .name = "ipoib",
+ .add = ipoib_add_one,
+ .remove = ipoib_remove_one
+};
+
+int ipoib_open(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ ipoib_dbg(priv, "bringing up interface\n");
+
+ netif_carrier_off(dev);
+
+ set_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
+
+ if (ipoib_ib_dev_open(dev)) {
+ if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
+ return 0;
+ goto err_disable;
+ }
+
+ if (ipoib_ib_dev_up(dev))
+ goto err_stop;
+
+ if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
+ struct ipoib_dev_priv *cpriv;
+
+ /* Bring up any child interfaces too */
+ down_read(&priv->vlan_rwsem);
+ list_for_each_entry(cpriv, &priv->child_intfs, list) {
+ int flags;
+
+ flags = cpriv->dev->flags;
+ if (flags & IFF_UP)
+ continue;
+
+ dev_change_flags(cpriv->dev, flags | IFF_UP);
+ }
+ up_read(&priv->vlan_rwsem);
+ }
+
+ netif_start_queue(dev);
+
+ return 0;
+
+err_stop:
+ ipoib_ib_dev_stop(dev);
+
+err_disable:
+ clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
+
+ return -EINVAL;
+}
+
+static int ipoib_stop(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ ipoib_dbg(priv, "stopping interface\n");
+
+ clear_bit(IPOIB_FLAG_ADMIN_UP, &priv->flags);
+
+ netif_stop_queue(dev);
+
+ ipoib_ib_dev_down(dev);
+ ipoib_ib_dev_stop(dev);
+
+ if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags)) {
+ struct ipoib_dev_priv *cpriv;
+
+ /* Bring down any child interfaces too */
+ down_read(&priv->vlan_rwsem);
+ list_for_each_entry(cpriv, &priv->child_intfs, list) {
+ int flags;
+
+ flags = cpriv->dev->flags;
+ if (!(flags & IFF_UP))
+ continue;
+
+ dev_change_flags(cpriv->dev, flags & ~IFF_UP);
+ }
+ up_read(&priv->vlan_rwsem);
+ }
+
+ return 0;
+}
+
+static void ipoib_uninit(struct net_device *dev)
+{
+ ipoib_dev_cleanup(dev);
+}
+
+static netdev_features_t ipoib_fix_features(struct net_device *dev, netdev_features_t features)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ if (test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags))
+ features &= ~(NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO);
+
+ return features;
+}
+
+static int ipoib_change_mtu(struct net_device *dev, int new_mtu)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ /* dev->mtu > 2K ==> connected mode */
+ if (ipoib_cm_admin_enabled(dev)) {
+ if (new_mtu > ipoib_cm_max_mtu(dev))
+ return -EINVAL;
+
+ if (new_mtu > priv->mcast_mtu)
+ ipoib_warn(priv, "mtu > %d will cause multicast packet drops.\n",
+ priv->mcast_mtu);
+
+ dev->mtu = new_mtu;
+ return 0;
+ }
+
+ if (new_mtu > IPOIB_UD_MTU(priv->max_ib_mtu))
+ return -EINVAL;
+
+ priv->admin_mtu = new_mtu;
+
+ dev->mtu = min(priv->mcast_mtu, priv->admin_mtu);
+
+ return 0;
+}
+
+int ipoib_set_mode(struct net_device *dev, const char *buf)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ /* flush paths if we switch modes so that connections are restarted */
+ if (IPOIB_CM_SUPPORTED(dev->dev_addr) && !strcmp(buf, "connected\n")) {
+ set_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
+ ipoib_warn(priv, "enabling connected mode "
+ "will cause multicast packet drops\n");
+ netdev_update_features(dev);
+ rtnl_unlock();
+ priv->tx_wr.send_flags &= ~IB_SEND_IP_CSUM;
+
+ ipoib_flush_paths(dev);
+ rtnl_lock();
+ return 0;
+ }
+
+ if (!strcmp(buf, "datagram\n")) {
+ clear_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
+ netdev_update_features(dev);
+ dev_set_mtu(dev, min(priv->mcast_mtu, dev->mtu));
+ rtnl_unlock();
+ ipoib_flush_paths(dev);
+ rtnl_lock();
+ return 0;
+ }
+
+ return -EINVAL;
+}
+
+static struct ipoib_path *__path_find(struct net_device *dev, void *gid)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct rb_node *n = priv->path_tree.rb_node;
+ struct ipoib_path *path;
+ int ret;
+
+ while (n) {
+ path = rb_entry(n, struct ipoib_path, rb_node);
+
+ ret = memcmp(gid, path->pathrec.dgid.raw,
+ sizeof (union ib_gid));
+
+ if (ret < 0)
+ n = n->rb_left;
+ else if (ret > 0)
+ n = n->rb_right;
+ else
+ return path;
+ }
+
+ return NULL;
+}
+
+static int __path_add(struct net_device *dev, struct ipoib_path *path)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct rb_node **n = &priv->path_tree.rb_node;
+ struct rb_node *pn = NULL;
+ struct ipoib_path *tpath;
+ int ret;
+
+ while (*n) {
+ pn = *n;
+ tpath = rb_entry(pn, struct ipoib_path, rb_node);
+
+ ret = memcmp(path->pathrec.dgid.raw, tpath->pathrec.dgid.raw,
+ sizeof (union ib_gid));
+ if (ret < 0)
+ n = &pn->rb_left;
+ else if (ret > 0)
+ n = &pn->rb_right;
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(&path->rb_node, pn, n);
+ rb_insert_color(&path->rb_node, &priv->path_tree);
+
+ list_add_tail(&path->list, &priv->path_list);
+
+ return 0;
+}
+
+static void path_free(struct net_device *dev, struct ipoib_path *path)
+{
+ struct sk_buff *skb;
+
+ while ((skb = __skb_dequeue(&path->queue)))
+ dev_kfree_skb_irq(skb);
+
+ ipoib_dbg(netdev_priv(dev), "path_free\n");
+
+ /* remove all neigh connected to this path */
+ ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);
+
+ if (path->ah)
+ ipoib_put_ah(path->ah);
+
+ kfree(path);
+}
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+
+struct ipoib_path_iter *ipoib_path_iter_init(struct net_device *dev)
+{
+ struct ipoib_path_iter *iter;
+
+ iter = kmalloc(sizeof *iter, GFP_KERNEL);
+ if (!iter)
+ return NULL;
+
+ iter->dev = dev;
+ memset(iter->path.pathrec.dgid.raw, 0, 16);
+
+ if (ipoib_path_iter_next(iter)) {
+ kfree(iter);
+ return NULL;
+ }
+
+ return iter;
+}
+
+int ipoib_path_iter_next(struct ipoib_path_iter *iter)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(iter->dev);
+ struct rb_node *n;
+ struct ipoib_path *path;
+ int ret = 1;
+
+ spin_lock_irq(&priv->lock);
+
+ n = rb_first(&priv->path_tree);
+
+ while (n) {
+ path = rb_entry(n, struct ipoib_path, rb_node);
+
+ if (memcmp(iter->path.pathrec.dgid.raw, path->pathrec.dgid.raw,
+ sizeof (union ib_gid)) < 0) {
+ iter->path = *path;
+ ret = 0;
+ break;
+ }
+
+ n = rb_next(n);
+ }
+
+ spin_unlock_irq(&priv->lock);
+
+ return ret;
+}
+
+void ipoib_path_iter_read(struct ipoib_path_iter *iter,
+ struct ipoib_path *path)
+{
+ *path = iter->path;
+}
+
+#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
+
+void ipoib_mark_paths_invalid(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_path *path, *tp;
+
+ spin_lock_irq(&priv->lock);
+
+ list_for_each_entry_safe(path, tp, &priv->path_list, list) {
+ ipoib_dbg(priv, "mark path LID 0x%04x GID %pI6 invalid\n",
+ be16_to_cpu(path->pathrec.dlid),
+ path->pathrec.dgid.raw);
+ path->valid = 0;
+ }
+
+ spin_unlock_irq(&priv->lock);
+}
+
+void ipoib_flush_paths(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_path *path, *tp;
+ LIST_HEAD(remove_list);
+ unsigned long flags;
+
+ netif_tx_lock_bh(dev);
+ spin_lock_irqsave(&priv->lock, flags);
+
+ list_splice_init(&priv->path_list, &remove_list);
+
+ list_for_each_entry(path, &remove_list, list)
+ rb_erase(&path->rb_node, &priv->path_tree);
+
+ list_for_each_entry_safe(path, tp, &remove_list, list) {
+ if (path->query)
+ ib_sa_cancel_query(path->query_id, path->query);
+ spin_unlock_irqrestore(&priv->lock, flags);
+ netif_tx_unlock_bh(dev);
+ wait_for_completion(&path->done);
+ path_free(dev, path);
+ netif_tx_lock_bh(dev);
+ spin_lock_irqsave(&priv->lock, flags);
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+ netif_tx_unlock_bh(dev);
+}
+
+static void path_rec_completion(int status,
+ struct ib_sa_path_rec *pathrec,
+ void *path_ptr)
+{
+ struct ipoib_path *path = path_ptr;
+ struct net_device *dev = path->dev;
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_ah *ah = NULL;
+ struct ipoib_ah *old_ah = NULL;
+ struct ipoib_neigh *neigh, *tn;
+ struct sk_buff_head skqueue;
+ struct sk_buff *skb;
+ unsigned long flags;
+
+ if (!status)
+ ipoib_dbg(priv, "PathRec LID 0x%04x for GID %pI6\n",
+ be16_to_cpu(pathrec->dlid), pathrec->dgid.raw);
+ else
+ ipoib_dbg(priv, "PathRec status %d for GID %pI6\n",
+ status, path->pathrec.dgid.raw);
+
+ skb_queue_head_init(&skqueue);
+
+ if (!status) {
+ struct ib_ah_attr av;
+
+ if (!ib_init_ah_from_path(priv->ca, priv->port, pathrec, &av))
+ ah = ipoib_create_ah(dev, priv->pd, &av);
+ }
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ if (!IS_ERR_OR_NULL(ah)) {
+ path->pathrec = *pathrec;
+
+ old_ah = path->ah;
+ path->ah = ah;
+
+ ipoib_dbg(priv, "created address handle %p for LID 0x%04x, SL %d\n",
+ ah, be16_to_cpu(pathrec->dlid), pathrec->sl);
+
+ while ((skb = __skb_dequeue(&path->queue)))
+ __skb_queue_tail(&skqueue, skb);
+
+ list_for_each_entry_safe(neigh, tn, &path->neigh_list, list) {
+ if (neigh->ah) {
+ WARN_ON(neigh->ah != old_ah);
+ /*
+ * Dropping the ah reference inside
+ * priv->lock is safe here, because we
+ * will hold one more reference from
+ * the original value of path->ah (ie
+ * old_ah).
+ */
+ ipoib_put_ah(neigh->ah);
+ }
+ kref_get(&path->ah->ref);
+ neigh->ah = path->ah;
+
+ if (ipoib_cm_enabled(dev, neigh->daddr)) {
+ if (!ipoib_cm_get(neigh))
+ ipoib_cm_set(neigh, ipoib_cm_create_tx(dev,
+ path,
+ neigh));
+ if (!ipoib_cm_get(neigh)) {
+ ipoib_neigh_free(neigh);
+ continue;
+ }
+ }
+
+ while ((skb = __skb_dequeue(&neigh->queue)))
+ __skb_queue_tail(&skqueue, skb);
+ }
+ path->valid = 1;
+ }
+
+ path->query = NULL;
+ complete(&path->done);
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ if (IS_ERR_OR_NULL(ah))
+ ipoib_del_neighs_by_gid(dev, path->pathrec.dgid.raw);
+
+ if (old_ah)
+ ipoib_put_ah(old_ah);
+
+ while ((skb = __skb_dequeue(&skqueue))) {
+ skb->dev = dev;
+ if (dev_queue_xmit(skb))
+ ipoib_warn(priv, "dev_queue_xmit failed "
+ "to requeue packet\n");
+ }
+}
+
+static struct ipoib_path *path_rec_create(struct net_device *dev, void *gid)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_path *path;
+
+ if (!priv->broadcast)
+ return NULL;
+
+ path = kzalloc(sizeof *path, GFP_ATOMIC);
+ if (!path)
+ return NULL;
+
+ path->dev = dev;
+
+ skb_queue_head_init(&path->queue);
+
+ INIT_LIST_HEAD(&path->neigh_list);
+
+ memcpy(path->pathrec.dgid.raw, gid, sizeof (union ib_gid));
+ path->pathrec.sgid = priv->local_gid;
+ path->pathrec.pkey = cpu_to_be16(priv->pkey);
+ path->pathrec.numb_path = 1;
+ path->pathrec.traffic_class = priv->broadcast->mcmember.traffic_class;
+
+ return path;
+}
+
+static int path_rec_start(struct net_device *dev,
+ struct ipoib_path *path)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ ipoib_dbg(priv, "Start path record lookup for %pI6\n",
+ path->pathrec.dgid.raw);
+
+ init_completion(&path->done);
+
+ path->query_id =
+ ib_sa_path_rec_get(&ipoib_sa_client, priv->ca, priv->port,
+ &path->pathrec,
+ IB_SA_PATH_REC_DGID |
+ IB_SA_PATH_REC_SGID |
+ IB_SA_PATH_REC_NUMB_PATH |
+ IB_SA_PATH_REC_TRAFFIC_CLASS |
+ IB_SA_PATH_REC_PKEY,
+ 1000, GFP_ATOMIC,
+ path_rec_completion,
+ path, &path->query);
+ if (path->query_id < 0) {
+ ipoib_warn(priv, "ib_sa_path_rec_get failed: %d\n", path->query_id);
+ path->query = NULL;
+ complete(&path->done);
+ return path->query_id;
+ }
+
+ return 0;
+}
+
+static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
+ struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_path *path;
+ struct ipoib_neigh *neigh;
+ unsigned long flags;
+
+ spin_lock_irqsave(&priv->lock, flags);
+ neigh = ipoib_neigh_alloc(daddr, dev);
+ if (!neigh) {
+ spin_unlock_irqrestore(&priv->lock, flags);
+ ++dev->stats.tx_dropped;
+ dev_kfree_skb_any(skb);
+ return;
+ }
+
+ path = __path_find(dev, daddr + 4);
+ if (!path) {
+ path = path_rec_create(dev, daddr + 4);
+ if (!path)
+ goto err_path;
+
+ __path_add(dev, path);
+ }
+
+ list_add_tail(&neigh->list, &path->neigh_list);
+
+ if (path->ah) {
+ kref_get(&path->ah->ref);
+ neigh->ah = path->ah;
+
+ if (ipoib_cm_enabled(dev, neigh->daddr)) {
+ if (!ipoib_cm_get(neigh))
+ ipoib_cm_set(neigh, ipoib_cm_create_tx(dev, path, neigh));
+ if (!ipoib_cm_get(neigh)) {
+ ipoib_neigh_free(neigh);
+ goto err_drop;
+ }
+ if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
+ __skb_queue_tail(&neigh->queue, skb);
+ else {
+ ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
+ skb_queue_len(&neigh->queue));
+ goto err_drop;
+ }
+ } else {
+ spin_unlock_irqrestore(&priv->lock, flags);
+ ipoib_send(dev, skb, path->ah, IPOIB_QPN(daddr));
+ ipoib_neigh_put(neigh);
+ return;
+ }
+ } else {
+ neigh->ah = NULL;
+
+ if (!path->query && path_rec_start(dev, path))
+ goto err_path;
+ if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
+ __skb_queue_tail(&neigh->queue, skb);
+ else
+ goto err_drop;
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+ ipoib_neigh_put(neigh);
+ return;
+
+err_path:
+ ipoib_neigh_free(neigh);
+err_drop:
+ ++dev->stats.tx_dropped;
+ dev_kfree_skb_any(skb);
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+ ipoib_neigh_put(neigh);
+}
+
+static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
+ struct ipoib_cb *cb)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_path *path;
+ unsigned long flags;
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ path = __path_find(dev, cb->hwaddr + 4);
+ if (!path || !path->valid) {
+ int new_path = 0;
+
+ if (!path) {
+ path = path_rec_create(dev, cb->hwaddr + 4);
+ new_path = 1;
+ }
+ if (path) {
+ if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+ __skb_queue_tail(&path->queue, skb);
+ } else {
+ ++dev->stats.tx_dropped;
+ dev_kfree_skb_any(skb);
+ }
+
+ if (!path->query && path_rec_start(dev, path)) {
+ spin_unlock_irqrestore(&priv->lock, flags);
+ if (new_path)
+ path_free(dev, path);
+ return;
+ } else
+ __path_add(dev, path);
+ } else {
+ ++dev->stats.tx_dropped;
+ dev_kfree_skb_any(skb);
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+ return;
+ }
+
+ if (path->ah) {
+ ipoib_dbg(priv, "Send unicast ARP to %04x\n",
+ be16_to_cpu(path->pathrec.dlid));
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+ ipoib_send(dev, skb, path->ah, IPOIB_QPN(cb->hwaddr));
+ return;
+ } else if ((path->query || !path_rec_start(dev, path)) &&
+ skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+ __skb_queue_tail(&path->queue, skb);
+ } else {
+ ++dev->stats.tx_dropped;
+ dev_kfree_skb_any(skb);
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_neigh *neigh;
+ struct ipoib_cb *cb = ipoib_skb_cb(skb);
+ struct ipoib_header *header;
+ unsigned long flags;
+
+ header = (struct ipoib_header *) skb->data;
+
+ if (unlikely(cb->hwaddr[4] == 0xff)) {
+ /* multicast, arrange "if" according to probability */
+ if ((header->proto != htons(ETH_P_IP)) &&
+ (header->proto != htons(ETH_P_IPV6)) &&
+ (header->proto != htons(ETH_P_ARP)) &&
+ (header->proto != htons(ETH_P_RARP)) &&
+ (header->proto != htons(ETH_P_TIPC))) {
+ /* ethertype not supported by IPoIB */
+ ++dev->stats.tx_dropped;
+ dev_kfree_skb_any(skb);
+ return NETDEV_TX_OK;
+ }
+ /* Add in the P_Key for multicast */
+ cb->hwaddr[8] = (priv->pkey >> 8) & 0xff;
+ cb->hwaddr[9] = priv->pkey & 0xff;
+
+ neigh = ipoib_neigh_get(dev, cb->hwaddr);
+ if (likely(neigh))
+ goto send_using_neigh;
+ ipoib_mcast_send(dev, cb->hwaddr, skb);
+ return NETDEV_TX_OK;
+ }
+
+ /* unicast, arrange "switch" according to probability */
+ switch (header->proto) {
+ case htons(ETH_P_IP):
+ case htons(ETH_P_IPV6):
+ case htons(ETH_P_TIPC):
+ neigh = ipoib_neigh_get(dev, cb->hwaddr);
+ if (unlikely(!neigh)) {
+ neigh_add_path(skb, cb->hwaddr, dev);
+ return NETDEV_TX_OK;
+ }
+ break;
+ case htons(ETH_P_ARP):
+ case htons(ETH_P_RARP):
+ /* for unicast ARP and RARP we should always perform a path lookup */
+ unicast_arp_send(skb, dev, cb);
+ return NETDEV_TX_OK;
+ default:
+ /* ethertype not supported by IPoIB */
+ ++dev->stats.tx_dropped;
+ dev_kfree_skb_any(skb);
+ return NETDEV_TX_OK;
+ }
+
+send_using_neigh:
+ /* note we now hold a ref to neigh */
+ if (ipoib_cm_get(neigh)) {
+ if (ipoib_cm_up(neigh)) {
+ ipoib_cm_send(dev, skb, ipoib_cm_get(neigh));
+ goto unref;
+ }
+ } else if (neigh->ah) {
+ ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(cb->hwaddr));
+ goto unref;
+ }
+
+ if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
+ spin_lock_irqsave(&priv->lock, flags);
+ __skb_queue_tail(&neigh->queue, skb);
+ spin_unlock_irqrestore(&priv->lock, flags);
+ } else {
+ ++dev->stats.tx_dropped;
+ dev_kfree_skb_any(skb);
+ }
+
+unref:
+ ipoib_neigh_put(neigh);
+
+ return NETDEV_TX_OK;
+}
+
+static void ipoib_timeout(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ ipoib_warn(priv, "transmit timeout: latency %d msecs\n",
+ jiffies_to_msecs(jiffies - dev->trans_start));
+ ipoib_warn(priv, "queue stopped %d, tx_head %u, tx_tail %u\n",
+ netif_queue_stopped(dev),
+ priv->tx_head, priv->tx_tail);
+ /* XXX reset QP, etc. */
+}
+
+static int ipoib_hard_header(struct sk_buff *skb,
+ struct net_device *dev,
+ unsigned short type,
+ const void *daddr, const void *saddr, unsigned len)
+{
+ struct ipoib_header *header;
+ struct ipoib_cb *cb = ipoib_skb_cb(skb);
+
+ header = (struct ipoib_header *) skb_push(skb, sizeof *header);
+
+ header->proto = htons(type);
+ header->reserved = 0;
+
+ /*
+ * we don't rely on the dst_entry structure; always stuff the
+ * destination address into skb->cb so we can figure out where
+ * to send the packet later.
+ */
+ memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN);
+
+ return sizeof *header;
+}
+
+static void ipoib_set_mcast_list(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
+ ipoib_dbg(priv, "IPOIB_FLAG_OPER_UP not set");
+ return;
+ }
+
+ queue_work(priv->wq, &priv->restart_task);
+}
+
+static int ipoib_get_iflink(const struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ /* parent interface */
+ if (!test_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags))
+ return dev->ifindex;
+
+ /* child/vlan interface */
+ return priv->parent->ifindex;
+}
+
+static u32 ipoib_addr_hash(struct ipoib_neigh_hash *htbl, u8 *daddr)
+{
+ /*
+ * Use only the address parts that contribute to spreading.
+ * The subnet prefix is not used, as one cannot connect to the
+ * same remote port (GUID) using the same remote QPN via two
+ * different subnets.
+ */
+ /* qpn octets[1:4) & port GUID octets[12:20) */
+ u32 *d32 = (u32 *) daddr;
+ u32 hv;
+
+ hv = jhash_3words(d32[3], d32[4], IPOIB_QPN_MASK & d32[0], 0);
+ return hv & htbl->mask;
+}
+
+struct ipoib_neigh *ipoib_neigh_get(struct net_device *dev, u8 *daddr)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_neigh_table *ntbl = &priv->ntbl;
+ struct ipoib_neigh_hash *htbl;
+ struct ipoib_neigh *neigh = NULL;
+ u32 hash_val;
+
+ rcu_read_lock_bh();
+
+ htbl = rcu_dereference_bh(ntbl->htbl);
+
+ if (!htbl)
+ goto out_unlock;
+
+ hash_val = ipoib_addr_hash(htbl, daddr);
+ for (neigh = rcu_dereference_bh(htbl->buckets[hash_val]);
+ neigh != NULL;
+ neigh = rcu_dereference_bh(neigh->hnext)) {
+ if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
+ /* found, take one ref on behalf of the caller */
+ if (!atomic_inc_not_zero(&neigh->refcnt)) {
+ /* deleted */
+ neigh = NULL;
+ goto out_unlock;
+ }
+ neigh->alive = jiffies;
+ goto out_unlock;
+ }
+ }
+
+out_unlock:
+ rcu_read_unlock_bh();
+ return neigh;
+}
+
+static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv)
+{
+ struct ipoib_neigh_table *ntbl = &priv->ntbl;
+ struct ipoib_neigh_hash *htbl;
+ unsigned long neigh_obsolete;
+ unsigned long dt;
+ unsigned long flags;
+ int i;
+
+ if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
+ return;
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ htbl = rcu_dereference_protected(ntbl->htbl,
+ lockdep_is_held(&priv->lock));
+
+ if (!htbl)
+ goto out_unlock;
+
+ /* neigh is obsolete if it was idle for two GC periods */
+ dt = 2 * arp_tbl.gc_interval;
+ neigh_obsolete = jiffies - dt;
+ /* handle possible race condition */
+ if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
+ goto out_unlock;
+
+ for (i = 0; i < htbl->size; i++) {
+ struct ipoib_neigh *neigh;
+ struct ipoib_neigh __rcu **np = &htbl->buckets[i];
+
+ while ((neigh = rcu_dereference_protected(*np,
+ lockdep_is_held(&priv->lock))) != NULL) {
+ /* was the neigh idle for two GC periods */
+ if (time_after(neigh_obsolete, neigh->alive)) {
+ rcu_assign_pointer(*np,
+ rcu_dereference_protected(neigh->hnext,
+ lockdep_is_held(&priv->lock)));
+ /* remove from path/mc list */
+ list_del(&neigh->list);
+ call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
+ } else {
+ np = &neigh->hnext;
+ }
+
+ }
+ }
+
+out_unlock:
+ spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+static void ipoib_reap_neigh(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv =
+ container_of(work, struct ipoib_dev_priv, neigh_reap_task.work);
+
+ __ipoib_reap_neigh(priv);
+
+ if (!test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags))
+ queue_delayed_work(priv->wq, &priv->neigh_reap_task,
+ arp_tbl.gc_interval);
+}
+
+
+static struct ipoib_neigh *ipoib_neigh_ctor(u8 *daddr,
+ struct net_device *dev)
+{
+ struct ipoib_neigh *neigh;
+
+ neigh = kzalloc(sizeof *neigh, GFP_ATOMIC);
+ if (!neigh)
+ return NULL;
+
+ neigh->dev = dev;
+ memcpy(&neigh->daddr, daddr, sizeof(neigh->daddr));
+ skb_queue_head_init(&neigh->queue);
+ INIT_LIST_HEAD(&neigh->list);
+ ipoib_cm_set(neigh, NULL);
+ /* one ref on behalf of the caller */
+ atomic_set(&neigh->refcnt, 1);
+
+ return neigh;
+}
+
+struct ipoib_neigh *ipoib_neigh_alloc(u8 *daddr,
+ struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_neigh_table *ntbl = &priv->ntbl;
+ struct ipoib_neigh_hash *htbl;
+ struct ipoib_neigh *neigh;
+ u32 hash_val;
+
+ htbl = rcu_dereference_protected(ntbl->htbl,
+ lockdep_is_held(&priv->lock));
+ if (!htbl) {
+ neigh = NULL;
+ goto out_unlock;
+ }
+
+ /* We need to add a new neigh, but maybe some other thread succeeded?
+ * Recalculate the hash (a resize may have taken place) and search again.
+ */
+ hash_val = ipoib_addr_hash(htbl, daddr);
+ for (neigh = rcu_dereference_protected(htbl->buckets[hash_val],
+ lockdep_is_held(&priv->lock));
+ neigh != NULL;
+ neigh = rcu_dereference_protected(neigh->hnext,
+ lockdep_is_held(&priv->lock))) {
+ if (memcmp(daddr, neigh->daddr, INFINIBAND_ALEN) == 0) {
+ /* found, take one ref on behalf of the caller */
+ if (!atomic_inc_not_zero(&neigh->refcnt)) {
+ /* deleted */
+ neigh = NULL;
+ break;
+ }
+ neigh->alive = jiffies;
+ goto out_unlock;
+ }
+ }
+
+ neigh = ipoib_neigh_ctor(daddr, dev);
+ if (!neigh)
+ goto out_unlock;
+
+ /* one ref on behalf of the hash table */
+ atomic_inc(&neigh->refcnt);
+ neigh->alive = jiffies;
+ /* put in hash */
+ rcu_assign_pointer(neigh->hnext,
+ rcu_dereference_protected(htbl->buckets[hash_val],
+ lockdep_is_held(&priv->lock)));
+ rcu_assign_pointer(htbl->buckets[hash_val], neigh);
+ atomic_inc(&ntbl->entries);
+
+out_unlock:
+
+ return neigh;
+}
+
+void ipoib_neigh_dtor(struct ipoib_neigh *neigh)
+{
+ /* neigh reference count was dropped to zero */
+ struct net_device *dev = neigh->dev;
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct sk_buff *skb;
+ if (neigh->ah)
+ ipoib_put_ah(neigh->ah);
+ while ((skb = __skb_dequeue(&neigh->queue))) {
+ ++dev->stats.tx_dropped;
+ dev_kfree_skb_any(skb);
+ }
+ if (ipoib_cm_get(neigh))
+ ipoib_cm_destroy_tx(ipoib_cm_get(neigh));
+ ipoib_dbg(netdev_priv(dev),
+ "neigh free for %06x %pI6\n",
+ IPOIB_QPN(neigh->daddr),
+ neigh->daddr + 4);
+ kfree(neigh);
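+ /* if this was the last entry and a table flush is in progress, wake up ipoib_flush_neighs() */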
+ if (atomic_dec_and_test(&priv->ntbl.entries)) {
+ if (test_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags))
+ complete(&priv->ntbl.flushed);
+ }
+}
+
+static void ipoib_neigh_reclaim(struct rcu_head *rp)
+{
+ /* Called as a result of removal from hash table */
+ struct ipoib_neigh *neigh = container_of(rp, struct ipoib_neigh, rcu);
+ /* note TX context may hold another ref */
+ ipoib_neigh_put(neigh);
+}
+
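+/* unlink a neigh from the hash table and drop its table reference; caller must hold priv->lock */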
+void ipoib_neigh_free(struct ipoib_neigh *neigh)
+{
+ struct net_device *dev = neigh->dev;
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_neigh_table *ntbl = &priv->ntbl;
+ struct ipoib_neigh_hash *htbl;
+ struct ipoib_neigh __rcu **np;
+ struct ipoib_neigh *n;
+ u32 hash_val;
+
+ htbl = rcu_dereference_protected(ntbl->htbl,
+ lockdep_is_held(&priv->lock));
+ if (!htbl)
+ return;
+
+ hash_val = ipoib_addr_hash(htbl, neigh->daddr);
+ np = &htbl->buckets[hash_val];
+ for (n = rcu_dereference_protected(*np,
+ lockdep_is_held(&priv->lock));
+ n != NULL;
+ n = rcu_dereference_protected(*np,
+ lockdep_is_held(&priv->lock))) {
+ if (n == neigh) {
+ /* found */
+ rcu_assign_pointer(*np,
+ rcu_dereference_protected(neigh->hnext,
+ lockdep_is_held(&priv->lock)));
+ /* remove from parent list */
+ list_del(&neigh->list);
+ call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
+ return;
+ } else {
+ np = &n->hnext;
+ }
+ }
+}
+
+static int ipoib_neigh_hash_init(struct ipoib_dev_priv *priv)
+{
+ struct ipoib_neigh_table *ntbl = &priv->ntbl;
+ struct ipoib_neigh_hash *htbl;
+ struct ipoib_neigh **buckets;
+ u32 size;
+
+ clear_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
+ ntbl->htbl = NULL;
+ htbl = kzalloc(sizeof(*htbl), GFP_KERNEL);
+ if (!htbl)
+ return -ENOMEM;
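+ /* keep the neigh reaper from running while the table is being set up */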
+ set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
+ size = roundup_pow_of_two(arp_tbl.gc_thresh3);
+ buckets = kzalloc(size * sizeof(*buckets), GFP_KERNEL);
+ if (!buckets) {
+ kfree(htbl);
+ return -ENOMEM;
+ }
+ htbl->size = size;
+ htbl->mask = (size - 1);
+ htbl->buckets = buckets;
+ ntbl->htbl = htbl;
+ htbl->ntbl = ntbl;
+ atomic_set(&ntbl->entries, 0);
+
+ /* start garbage collection */
+ clear_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
+ queue_delayed_work(priv->wq, &priv->neigh_reap_task,
+ arp_tbl.gc_interval);
+
+ return 0;
+}
+
+static void neigh_hash_free_rcu(struct rcu_head *head)
+{
+ struct ipoib_neigh_hash *htbl = container_of(head,
+ struct ipoib_neigh_hash,
+ rcu);
+ struct ipoib_neigh __rcu **buckets = htbl->buckets;
+ struct ipoib_neigh_table *ntbl = htbl->ntbl;
+
+ kfree(buckets);
+ kfree(htbl);
+ complete(&ntbl->deleted);
+}
+
+void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_neigh_table *ntbl = &priv->ntbl;
+ struct ipoib_neigh_hash *htbl;
+ unsigned long flags;
+ int i;
+
+ /* remove all neighs connected to a given path or mcast */
+ spin_lock_irqsave(&priv->lock, flags);
+
+ htbl = rcu_dereference_protected(ntbl->htbl,
+ lockdep_is_held(&priv->lock));
+
+ if (!htbl)
+ goto out_unlock;
+
+ for (i = 0; i < htbl->size; i++) {
+ struct ipoib_neigh *neigh;
+ struct ipoib_neigh __rcu **np = &htbl->buckets[i];
+
+ while ((neigh = rcu_dereference_protected(*np,
+ lockdep_is_held(&priv->lock))) != NULL) {
+ /* delete neighs belonging to this parent */
+ if (!memcmp(gid, neigh->daddr + 4, sizeof (union ib_gid))) {
+ rcu_assign_pointer(*np,
+ rcu_dereference_protected(neigh->hnext,
+ lockdep_is_held(&priv->lock)));
+ /* remove from parent list */
+ list_del(&neigh->list);
+ call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
+ } else {
+ np = &neigh->hnext;
+ }
+
+ }
+ }
+out_unlock:
+ spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+static void ipoib_flush_neighs(struct ipoib_dev_priv *priv)
+{
+ struct ipoib_neigh_table *ntbl = &priv->ntbl;
+ struct ipoib_neigh_hash *htbl;
+ unsigned long flags;
+ int i, wait_flushed = 0;
+
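+ /* ntbl.flushed is completed by the last ipoib_neigh_dtor() once all entries are freed */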
+ init_completion(&priv->ntbl.flushed);
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ htbl = rcu_dereference_protected(ntbl->htbl,
+ lockdep_is_held(&priv->lock));
+ if (!htbl)
+ goto out_unlock;
+
+ wait_flushed = atomic_read(&priv->ntbl.entries);
+ if (!wait_flushed)
+ goto free_htbl;
+
+ for (i = 0; i < htbl->size; i++) {
+ struct ipoib_neigh *neigh;
+ struct ipoib_neigh __rcu **np = &htbl->buckets[i];
+
+ while ((neigh = rcu_dereference_protected(*np,
+ lockdep_is_held(&priv->lock))) != NULL) {
+ rcu_assign_pointer(*np,
+ rcu_dereference_protected(neigh->hnext,
+ lockdep_is_held(&priv->lock)));
+ /* remove from path/mc list */
+ list_del(&neigh->list);
+ call_rcu(&neigh->rcu, ipoib_neigh_reclaim);
+ }
+ }
+
+free_htbl:
+ rcu_assign_pointer(ntbl->htbl, NULL);
+ call_rcu(&htbl->rcu, neigh_hash_free_rcu);
+
+out_unlock:
+ spin_unlock_irqrestore(&priv->lock, flags);
+ if (wait_flushed)
+ wait_for_completion(&priv->ntbl.flushed);
+}
+
+static void ipoib_neigh_hash_uninit(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int stopped;
+
+ ipoib_dbg(priv, "ipoib_neigh_hash_uninit\n");
+ init_completion(&priv->ntbl.deleted);
+ set_bit(IPOIB_NEIGH_TBL_FLUSH, &priv->flags);
+
+ /* Stop GC; if we were called from an init failure path the reap work still needs to be cancelled */
+ stopped = test_and_set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
+ if (!stopped)
+ cancel_delayed_work(&priv->neigh_reap_task);
+
+ ipoib_flush_neighs(priv);
+
+ wait_for_completion(&priv->ntbl.deleted);
+}
+
+
+int ipoib_dev_init(struct net_device *dev, struct ib_device *ca, int port)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ /* Allocate RX/TX "rings" to hold queued skbs */
+ priv->rx_ring = kzalloc(ipoib_recvq_size * sizeof *priv->rx_ring,
+ GFP_KERNEL);
+ if (!priv->rx_ring) {
+ printk(KERN_WARNING "%s: failed to allocate RX ring (%d entries)\n",
+ ca->name, ipoib_recvq_size);
+ goto out;
+ }
+
+ priv->tx_ring = vzalloc(ipoib_sendq_size * sizeof *priv->tx_ring);
+ if (!priv->tx_ring) {
+ printk(KERN_WARNING "%s: failed to allocate TX ring (%d entries)\n",
+ ca->name, ipoib_sendq_size);
+ goto out_rx_ring_cleanup;
+ }
+
+ /* priv->tx_head, tx_tail & tx_outstanding are already 0 */
+
+ if (ipoib_ib_dev_init(dev, ca, port))
+ goto out_tx_ring_cleanup;
+
+ /*
+ * Must be after ipoib_ib_dev_init so we can allocate a
+ * per-device wq there and use it here
+ */
+ if (ipoib_neigh_hash_init(priv) < 0)
+ goto out_dev_uninit;
+
+ return 0;
+
+out_dev_uninit:
+ ipoib_ib_dev_cleanup(dev);
+
+out_tx_ring_cleanup:
+ vfree(priv->tx_ring);
+
+out_rx_ring_cleanup:
+ kfree(priv->rx_ring);
+
+out:
+ return -ENOMEM;
+}
+
+void ipoib_dev_cleanup(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev), *cpriv, *tcpriv;
+ LIST_HEAD(head);
+
+ ASSERT_RTNL();
+
+ ipoib_delete_debug_files(dev);
+
+ /* Delete any child interfaces first */
+ list_for_each_entry_safe(cpriv, tcpriv, &priv->child_intfs, list) {
+ /* Stop GC on child */
+ set_bit(IPOIB_STOP_NEIGH_GC, &cpriv->flags);
+ cancel_delayed_work(&cpriv->neigh_reap_task);
+ unregister_netdevice_queue(cpriv->dev, &head);
+ }
+ unregister_netdevice_many(&head);
+
+ /*
+ * Must be before ipoib_ib_dev_cleanup or we delete an in-use
+ * work queue
+ */
+ ipoib_neigh_hash_uninit(dev);
+
+ ipoib_ib_dev_cleanup(dev);
+
+ kfree(priv->rx_ring);
+ vfree(priv->tx_ring);
+
+ priv->rx_ring = NULL;
+ priv->tx_ring = NULL;
+}
+
+static const struct header_ops ipoib_header_ops = {
+ .create = ipoib_hard_header,
+};
+
+static const struct net_device_ops ipoib_netdev_ops = {
+ .ndo_uninit = ipoib_uninit,
+ .ndo_open = ipoib_open,
+ .ndo_stop = ipoib_stop,
+ .ndo_change_mtu = ipoib_change_mtu,
+ .ndo_fix_features = ipoib_fix_features,
+ .ndo_start_xmit = ipoib_start_xmit,
+ .ndo_tx_timeout = ipoib_timeout,
+ .ndo_set_rx_mode = ipoib_set_mcast_list,
+ .ndo_get_iflink = ipoib_get_iflink,
+};
+
+void ipoib_setup(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ dev->netdev_ops = &ipoib_netdev_ops;
+ dev->header_ops = &ipoib_header_ops;
+
+ ipoib_set_ethtool_ops(dev);
+
+ netif_napi_add(dev, &priv->napi, ipoib_poll, NAPI_POLL_WEIGHT);
+
+ dev->watchdog_timeo = HZ;
+
+ dev->flags |= IFF_BROADCAST | IFF_MULTICAST;
+
+ dev->hard_header_len = IPOIB_ENCAP_LEN;
+ dev->addr_len = INFINIBAND_ALEN;
+ dev->type = ARPHRD_INFINIBAND;
+ dev->tx_queue_len = ipoib_sendq_size * 2;
+ dev->features = (NETIF_F_VLAN_CHALLENGED |
+ NETIF_F_HIGHDMA);
+ netif_keep_dst(dev);
+
+ memcpy(dev->broadcast, ipv4_bcast_addr, INFINIBAND_ALEN);
+
+ priv->dev = dev;
+
+ spin_lock_init(&priv->lock);
+
+ init_rwsem(&priv->vlan_rwsem);
+
+ INIT_LIST_HEAD(&priv->path_list);
+ INIT_LIST_HEAD(&priv->child_intfs);
+ INIT_LIST_HEAD(&priv->dead_ahs);
+ INIT_LIST_HEAD(&priv->multicast_list);
+
+ INIT_DELAYED_WORK(&priv->mcast_task, ipoib_mcast_join_task);
+ INIT_WORK(&priv->carrier_on_task, ipoib_mcast_carrier_on_task);
+ INIT_WORK(&priv->flush_light, ipoib_ib_dev_flush_light);
+ INIT_WORK(&priv->flush_normal, ipoib_ib_dev_flush_normal);
+ INIT_WORK(&priv->flush_heavy, ipoib_ib_dev_flush_heavy);
+ INIT_WORK(&priv->restart_task, ipoib_mcast_restart_task);
+ INIT_DELAYED_WORK(&priv->ah_reap_task, ipoib_reap_ah);
+ INIT_DELAYED_WORK(&priv->neigh_reap_task, ipoib_reap_neigh);
+}
+
+struct ipoib_dev_priv *ipoib_intf_alloc(const char *name)
+{
+ struct net_device *dev;
+
+ dev = alloc_netdev((int)sizeof(struct ipoib_dev_priv), name,
+ NET_NAME_UNKNOWN, ipoib_setup);
+ if (!dev)
+ return NULL;
+
+ return netdev_priv(dev);
+}
+
+static ssize_t show_pkey(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
+
+ return sprintf(buf, "0x%04x\n", priv->pkey);
+}
+static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL);
+
+static ssize_t show_umcast(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(to_net_dev(dev));
+
+ return sprintf(buf, "%d\n", test_bit(IPOIB_FLAG_UMCAST, &priv->flags));
+}
+
+void ipoib_set_umcast(struct net_device *ndev, int umcast_val)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(ndev);
+
+ if (umcast_val > 0) {
+ set_bit(IPOIB_FLAG_UMCAST, &priv->flags);
+ ipoib_warn(priv, "ignoring multicast groups joined directly "
+ "by userspace\n");
+ } else
+ clear_bit(IPOIB_FLAG_UMCAST, &priv->flags);
+}
+
+static ssize_t set_umcast(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned long umcast_val = simple_strtoul(buf, NULL, 0);
+
+ ipoib_set_umcast(to_net_dev(dev), umcast_val);
+
+ return count;
+}
+static DEVICE_ATTR(umcast, S_IWUSR | S_IRUGO, show_umcast, set_umcast);
+
+int ipoib_add_umcast_attr(struct net_device *dev)
+{
+ return device_create_file(&dev->dev, &dev_attr_umcast);
+}
+
+static ssize_t create_child(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ int pkey;
+ int ret;
+
+ if (sscanf(buf, "%i", &pkey) != 1)
+ return -EINVAL;
+
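+ /* reject out-of-range values and 0x8000, whose base pkey of 0 is invalid */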
+ if (pkey <= 0 || pkey > 0xffff || pkey == 0x8000)
+ return -EINVAL;
+
+ /*
+ * Set the full membership bit, so that we join the right
+ * broadcast group, etc.
+ */
+ pkey |= 0x8000;
+
+ ret = ipoib_vlan_add(to_net_dev(dev), pkey);
+
+ return ret ? ret : count;
+}
+static DEVICE_ATTR(create_child, S_IWUSR, NULL, create_child);
+
+static ssize_t delete_child(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ int pkey;
+ int ret;
+
+ if (sscanf(buf, "%i", &pkey) != 1)
+ return -EINVAL;
+
+ if (pkey < 0 || pkey > 0xffff)
+ return -EINVAL;
+
+ ret = ipoib_vlan_delete(to_net_dev(dev), pkey);
+
+ return ret ? ret : count;
+
+}
+static DEVICE_ATTR(delete_child, S_IWUSR, NULL, delete_child);
+
+int ipoib_add_pkey_attr(struct net_device *dev)
+{
+ return device_create_file(&dev->dev, &dev_attr_pkey);
+}
+
+int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca)
+{
+ struct ib_device_attr *device_attr;
+ int result = -ENOMEM;
+
+ device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL);
+ if (!device_attr) {
+ printk(KERN_WARNING "%s: allocation of %zu bytes failed\n",
+ hca->name, sizeof *device_attr);
+ return result;
+ }
+
+ result = ib_query_device(hca, device_attr);
+ if (result) {
+ printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n",
+ hca->name, result);
+ kfree(device_attr);
+ return result;
+ }
+ priv->hca_caps = device_attr->device_cap_flags;
+
+ kfree(device_attr);
+
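+ /* enable stateless offloads only if the HCA advertises the matching capabilities */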
+ if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) {
+ priv->dev->hw_features = NETIF_F_SG |
+ NETIF_F_IP_CSUM | NETIF_F_RXCSUM;
+
+ if (priv->hca_caps & IB_DEVICE_UD_TSO)
+ priv->dev->hw_features |= NETIF_F_TSO;
+
+ priv->dev->features |= priv->dev->hw_features;
+ }
+
+ return 0;
+}
+
+static struct net_device *ipoib_add_port(const char *format,
+ struct ib_device *hca, u8 port)
+{
+ struct ipoib_dev_priv *priv;
+ struct ib_port_attr attr;
+ int result = -ENOMEM;
+
+ priv = ipoib_intf_alloc(format);
+ if (!priv)
+ goto alloc_mem_failed;
+
+ SET_NETDEV_DEV(priv->dev, hca->dma_device);
+ priv->dev->dev_id = port - 1;
+
+ if (!ib_query_port(hca, port, &attr))
+ priv->max_ib_mtu = ib_mtu_enum_to_int(attr.max_mtu);
+ else {
+ printk(KERN_WARNING "%s: ib_query_port %d failed\n",
+ hca->name, port);
+ goto device_init_failed;
+ }
+
+ /* MTU will be reset when mcast join happens */
+ priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu);
+ priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu;
+
+ priv->dev->neigh_priv_len = sizeof(struct ipoib_neigh);
+
+ result = ib_query_pkey(hca, port, 0, &priv->pkey);
+ if (result) {
+ printk(KERN_WARNING "%s: ib_query_pkey port %d failed (ret = %d)\n",
+ hca->name, port, result);
+ goto device_init_failed;
+ }
+
+ if (ipoib_set_dev_features(priv, hca))
+ goto device_init_failed;
+
+ /*
+ * Set the full membership bit, so that we join the right
+ * broadcast group, etc.
+ */
+ priv->pkey |= 0x8000;
+
+ priv->dev->broadcast[8] = priv->pkey >> 8;
+ priv->dev->broadcast[9] = priv->pkey & 0xff;
+
+ result = ib_query_gid(hca, port, 0, &priv->local_gid);
+ if (result) {
+ printk(KERN_WARNING "%s: ib_query_gid port %d failed (ret = %d)\n",
+ hca->name, port, result);
+ goto device_init_failed;
+ } else
+ memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
+
+ result = ipoib_dev_init(priv->dev, hca, port);
+ if (result < 0) {
+ printk(KERN_WARNING "%s: failed to initialize port %d (ret = %d)\n",
+ hca->name, port, result);
+ goto device_init_failed;
+ }
+
+ INIT_IB_EVENT_HANDLER(&priv->event_handler,
+ priv->ca, ipoib_event);
+ result = ib_register_event_handler(&priv->event_handler);
+ if (result < 0) {
+ printk(KERN_WARNING "%s: ib_register_event_handler failed for "
+ "port %d (ret = %d)\n",
+ hca->name, port, result);
+ goto event_failed;
+ }
+
+ result = register_netdev(priv->dev);
+ if (result) {
+ printk(KERN_WARNING "%s: couldn't register ipoib port %d; error %d\n",
+ hca->name, port, result);
+ goto register_failed;
+ }
+
+ ipoib_create_debug_files(priv->dev);
+
+ if (ipoib_cm_add_mode_attr(priv->dev))
+ goto sysfs_failed;
+ if (ipoib_add_pkey_attr(priv->dev))
+ goto sysfs_failed;
+ if (ipoib_add_umcast_attr(priv->dev))
+ goto sysfs_failed;
+ if (device_create_file(&priv->dev->dev, &dev_attr_create_child))
+ goto sysfs_failed;
+ if (device_create_file(&priv->dev->dev, &dev_attr_delete_child))
+ goto sysfs_failed;
+
+ return priv->dev;
+
+sysfs_failed:
+ ipoib_delete_debug_files(priv->dev);
+ unregister_netdev(priv->dev);
+
+register_failed:
+ ib_unregister_event_handler(&priv->event_handler);
+ flush_workqueue(ipoib_workqueue);
+ /* Stop GC if started before flush */
+ set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
+ cancel_delayed_work(&priv->neigh_reap_task);
+ flush_workqueue(priv->wq);
+
+event_failed:
+ ipoib_dev_cleanup(priv->dev);
+
+device_init_failed:
+ free_netdev(priv->dev);
+
+alloc_mem_failed:
+ return ERR_PTR(result);
+}
+
+static void ipoib_add_one(struct ib_device *device)
+{
+ struct list_head *dev_list;
+ struct net_device *dev;
+ struct ipoib_dev_priv *priv;
+ int s, e, p;
+
+ if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+ return;
+
+ dev_list = kmalloc(sizeof *dev_list, GFP_KERNEL);
+ if (!dev_list)
+ return;
+
+ INIT_LIST_HEAD(dev_list);
+
+ if (device->node_type == RDMA_NODE_IB_SWITCH) {
+ s = 0;
+ e = 0;
+ } else {
+ s = 1;
+ e = device->phys_port_cnt;
+ }
+
+ for (p = s; p <= e; ++p) {
+ if (rdma_port_get_link_layer(device, p) != IB_LINK_LAYER_INFINIBAND)
+ continue;
+ dev = ipoib_add_port("ib%d", device, p);
+ if (!IS_ERR(dev)) {
+ priv = netdev_priv(dev);
+ list_add_tail(&priv->list, dev_list);
+ }
+ }
+
+ ib_set_client_data(device, &ipoib_client, dev_list);
+}
+
+static void ipoib_remove_one(struct ib_device *device)
+{
+ struct ipoib_dev_priv *priv, *tmp;
+ struct list_head *dev_list;
+
+ if (rdma_node_get_transport(device->node_type) != RDMA_TRANSPORT_IB)
+ return;
+
+ dev_list = ib_get_client_data(device, &ipoib_client);
+ if (!dev_list)
+ return;
+
+ list_for_each_entry_safe(priv, tmp, dev_list, list) {
+ ib_unregister_event_handler(&priv->event_handler);
+ flush_workqueue(ipoib_workqueue);
+
+ rtnl_lock();
+ dev_change_flags(priv->dev, priv->dev->flags & ~IFF_UP);
+ rtnl_unlock();
+
+ /* Stop GC */
+ set_bit(IPOIB_STOP_NEIGH_GC, &priv->flags);
+ cancel_delayed_work(&priv->neigh_reap_task);
+ flush_workqueue(priv->wq);
+
+ unregister_netdev(priv->dev);
+ free_netdev(priv->dev);
+ }
+
+ kfree(dev_list);
+}
+
+static int __init ipoib_init_module(void)
+{
+ int ret;
+
+ ipoib_recvq_size = roundup_pow_of_two(ipoib_recvq_size);
+ ipoib_recvq_size = min(ipoib_recvq_size, IPOIB_MAX_QUEUE_SIZE);
+ ipoib_recvq_size = max(ipoib_recvq_size, IPOIB_MIN_QUEUE_SIZE);
+
+ ipoib_sendq_size = roundup_pow_of_two(ipoib_sendq_size);
+ ipoib_sendq_size = min(ipoib_sendq_size, IPOIB_MAX_QUEUE_SIZE);
+ ipoib_sendq_size = max3(ipoib_sendq_size, 2 * MAX_SEND_CQE, IPOIB_MIN_QUEUE_SIZE);
+#ifdef CONFIG_INFINIBAND_IPOIB_CM
+ ipoib_max_conn_qp = min(ipoib_max_conn_qp, IPOIB_CM_MAX_CONN_QP);
+#endif
+
+ /*
+ * When copying small received packets, we only copy from the
+ * linear data part of the SKB, so we rely on this condition.
+ */
+ BUILD_BUG_ON(IPOIB_CM_COPYBREAK > IPOIB_CM_HEAD_SIZE);
+
+ ret = ipoib_register_debugfs();
+ if (ret)
+ return ret;
+
+ /*
+ * We create a global workqueue here that is used for all flush
+ * operations. However, if you attempt to flush a workqueue
+ * from a task on that same workqueue, it deadlocks the system.
+ * We want to be able to flush the tasks associated with a
+ * specific net device, so we also create a workqueue for each
+ * netdevice. We queue up the tasks for that device only on
+ * its private workqueue, and we only queue up flush events
+ * on our global flush workqueue. This avoids the deadlocks.
+ */
+ ipoib_workqueue = create_singlethread_workqueue("ipoib_flush");
+ if (!ipoib_workqueue) {
+ ret = -ENOMEM;
+ goto err_fs;
+ }
+
+ ib_sa_register_client(&ipoib_sa_client);
+
+ ret = ib_register_client(&ipoib_client);
+ if (ret)
+ goto err_sa;
+
+ ret = ipoib_netlink_init();
+ if (ret)
+ goto err_client;
+
+ return 0;
+
+err_client:
+ ib_unregister_client(&ipoib_client);
+
+err_sa:
+ ib_sa_unregister_client(&ipoib_sa_client);
+ destroy_workqueue(ipoib_workqueue);
+
+err_fs:
+ ipoib_unregister_debugfs();
+
+ return ret;
+}
+
+static void __exit ipoib_cleanup_module(void)
+{
+ ipoib_netlink_fini();
+ ib_unregister_client(&ipoib_client);
+ ib_sa_unregister_client(&ipoib_sa_client);
+ ipoib_unregister_debugfs();
+ destroy_workqueue(ipoib_workqueue);
+}
+
+module_init(ipoib_init_module);
+module_exit(ipoib_cleanup_module);
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
new file mode 100644
index 000000000..0d23e0568
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c
@@ -0,0 +1,1001 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright (c) 2004 Voltaire, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/moduleparam.h>
+#include <linux/ip.h>
+#include <linux/in.h>
+#include <linux/igmp.h>
+#include <linux/inetdevice.h>
+#include <linux/delay.h>
+#include <linux/completion.h>
+#include <linux/slab.h>
+
+#include <net/dst.h>
+
+#include "ipoib.h"
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+static int mcast_debug_level;
+
+module_param(mcast_debug_level, int, 0644);
+MODULE_PARM_DESC(mcast_debug_level,
+ "Enable multicast debug tracing if > 0");
+#endif
+
+struct ipoib_mcast_iter {
+ struct net_device *dev;
+ union ib_gid mgid;
+ unsigned long created;
+ unsigned int queuelen;
+ unsigned int complete;
+ unsigned int send_only;
+};
+
+/*
+ * This should be called with the priv->lock held
+ */
+static void __ipoib_mcast_schedule_join_thread(struct ipoib_dev_priv *priv,
+ struct ipoib_mcast *mcast,
+ bool delay)
+{
+ if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
+ return;
+
+ /*
+ * We will be scheduling *something*, so cancel whatever is
+ * currently scheduled first
+ */
+ cancel_delayed_work(&priv->mcast_task);
+ if (mcast && delay) {
+ /*
+ * We had a failure and want to schedule a retry later
+ */
+ mcast->backoff *= 2;
+ if (mcast->backoff > IPOIB_MAX_BACKOFF_SECONDS)
+ mcast->backoff = IPOIB_MAX_BACKOFF_SECONDS;
+ mcast->delay_until = jiffies + (mcast->backoff * HZ);
+ /*
+ * Mark this mcast for its delay, but restart the
+ * task immediately. The join task will make sure to
+ * clear out all entries without delays, and then
+ * schedule itself to run again when the earliest
+ * delay expires
+ */
+ queue_delayed_work(priv->wq, &priv->mcast_task, 0);
+ } else if (delay) {
+ /*
+ * Special case of retrying after a failure to
+ * allocate the broadcast multicast group, wait
+ * 1 second and try again
+ */
+ queue_delayed_work(priv->wq, &priv->mcast_task, HZ);
+ } else
+ queue_delayed_work(priv->wq, &priv->mcast_task, 0);
+}
+
+static void ipoib_mcast_free(struct ipoib_mcast *mcast)
+{
+ struct net_device *dev = mcast->dev;
+ int tx_dropped = 0;
+
+ ipoib_dbg_mcast(netdev_priv(dev), "deleting multicast group %pI6\n",
+ mcast->mcmember.mgid.raw);
+
+ /* remove all neigh connected to this mcast */
+ ipoib_del_neighs_by_gid(dev, mcast->mcmember.mgid.raw);
+
+ if (mcast->ah)
+ ipoib_put_ah(mcast->ah);
+
+ while (!skb_queue_empty(&mcast->pkt_queue)) {
+ ++tx_dropped;
+ dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue));
+ }
+
+ netif_tx_lock_bh(dev);
+ dev->stats.tx_dropped += tx_dropped;
+ netif_tx_unlock_bh(dev);
+
+ kfree(mcast);
+}
+
+static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev,
+ int can_sleep)
+{
+ struct ipoib_mcast *mcast;
+
+ mcast = kzalloc(sizeof *mcast, can_sleep ? GFP_KERNEL : GFP_ATOMIC);
+ if (!mcast)
+ return NULL;
+
+ mcast->dev = dev;
+ mcast->created = jiffies;
+ mcast->delay_until = jiffies;
+ mcast->backoff = 1;
+
+ INIT_LIST_HEAD(&mcast->list);
+ INIT_LIST_HEAD(&mcast->neigh_list);
+ skb_queue_head_init(&mcast->pkt_queue);
+
+ return mcast;
+}
+
+static struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct rb_node *n = priv->multicast_tree.rb_node;
+
+ while (n) {
+ struct ipoib_mcast *mcast;
+ int ret;
+
+ mcast = rb_entry(n, struct ipoib_mcast, rb_node);
+
+ ret = memcmp(mgid, mcast->mcmember.mgid.raw,
+ sizeof (union ib_gid));
+ if (ret < 0)
+ n = n->rb_left;
+ else if (ret > 0)
+ n = n->rb_right;
+ else
+ return mcast;
+ }
+
+ return NULL;
+}
+
+static int __ipoib_mcast_add(struct net_device *dev, struct ipoib_mcast *mcast)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct rb_node **n = &priv->multicast_tree.rb_node, *pn = NULL;
+
+ while (*n) {
+ struct ipoib_mcast *tmcast;
+ int ret;
+
+ pn = *n;
+ tmcast = rb_entry(pn, struct ipoib_mcast, rb_node);
+
+ ret = memcmp(mcast->mcmember.mgid.raw, tmcast->mcmember.mgid.raw,
+ sizeof (union ib_gid));
+ if (ret < 0)
+ n = &pn->rb_left;
+ else if (ret > 0)
+ n = &pn->rb_right;
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(&mcast->rb_node, pn, n);
+ rb_insert_color(&mcast->rb_node, &priv->multicast_tree);
+
+ return 0;
+}
+
+static int ipoib_mcast_join_finish(struct ipoib_mcast *mcast,
+ struct ib_sa_mcmember_rec *mcmember)
+{
+ struct net_device *dev = mcast->dev;
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_ah *ah;
+ int ret;
+ int set_qkey = 0;
+
+ mcast->mcmember = *mcmember;
+
+ /* Set the multicast MTU and cached Q_Key before we attach if it's
+ * the broadcast group.
+ */
+ if (!memcmp(mcast->mcmember.mgid.raw, priv->dev->broadcast + 4,
+ sizeof (union ib_gid))) {
+ spin_lock_irq(&priv->lock);
+ if (!priv->broadcast) {
+ spin_unlock_irq(&priv->lock);
+ return -EAGAIN;
+ }
+ /* update priv members according to the new mcast */
+ priv->broadcast->mcmember.qkey = mcmember->qkey;
+ priv->broadcast->mcmember.mtu = mcmember->mtu;
+ priv->broadcast->mcmember.traffic_class = mcmember->traffic_class;
+ priv->broadcast->mcmember.rate = mcmember->rate;
+ priv->broadcast->mcmember.sl = mcmember->sl;
+ priv->broadcast->mcmember.flow_label = mcmember->flow_label;
+ priv->broadcast->mcmember.hop_limit = mcmember->hop_limit;
+ /* if the admin MTU still matches the mcast MTU, assume both should follow the new value */
+ if (priv->mcast_mtu == priv->admin_mtu)
+ priv->admin_mtu =
+ priv->mcast_mtu =
+ IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
+ else
+ priv->mcast_mtu =
+ IPOIB_UD_MTU(ib_mtu_enum_to_int(priv->broadcast->mcmember.mtu));
+
+ priv->qkey = be32_to_cpu(priv->broadcast->mcmember.qkey);
+ spin_unlock_irq(&priv->lock);
+ priv->tx_wr.wr.ud.remote_qkey = priv->qkey;
+ set_qkey = 1;
+ }
+
+ if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
+ if (test_and_set_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
+ ipoib_warn(priv, "multicast group %pI6 already attached\n",
+ mcast->mcmember.mgid.raw);
+
+ return 0;
+ }
+
+ ret = ipoib_mcast_attach(dev, be16_to_cpu(mcast->mcmember.mlid),
+ &mcast->mcmember.mgid, set_qkey);
+ if (ret < 0) {
+ ipoib_warn(priv, "couldn't attach QP to multicast group %pI6\n",
+ mcast->mcmember.mgid.raw);
+
+ clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags);
+ return ret;
+ }
+ }
+
+ {
+ struct ib_ah_attr av = {
+ .dlid = be16_to_cpu(mcast->mcmember.mlid),
+ .port_num = priv->port,
+ .sl = mcast->mcmember.sl,
+ .ah_flags = IB_AH_GRH,
+ .static_rate = mcast->mcmember.rate,
+ .grh = {
+ .flow_label = be32_to_cpu(mcast->mcmember.flow_label),
+ .hop_limit = mcast->mcmember.hop_limit,
+ .sgid_index = 0,
+ .traffic_class = mcast->mcmember.traffic_class
+ }
+ };
+ av.grh.dgid = mcast->mcmember.mgid;
+
+ ah = ipoib_create_ah(dev, priv->pd, &av);
+ if (IS_ERR(ah)) {
+ ipoib_warn(priv, "ib_address_create failed %ld\n",
+ -PTR_ERR(ah));
+ /* use original error */
+ return PTR_ERR(ah);
+ } else {
+ spin_lock_irq(&priv->lock);
+ mcast->ah = ah;
+ spin_unlock_irq(&priv->lock);
+
+ ipoib_dbg_mcast(priv, "MGID %pI6 AV %p, LID 0x%04x, SL %d\n",
+ mcast->mcmember.mgid.raw,
+ mcast->ah->ah,
+ be16_to_cpu(mcast->mcmember.mlid),
+ mcast->mcmember.sl);
+ }
+ }
+
+ /* actually send any queued packets */
+ netif_tx_lock_bh(dev);
+ while (!skb_queue_empty(&mcast->pkt_queue)) {
+ struct sk_buff *skb = skb_dequeue(&mcast->pkt_queue);
+
+ netif_tx_unlock_bh(dev);
+
+ skb->dev = dev;
+ if (dev_queue_xmit(skb))
+ ipoib_warn(priv, "dev_queue_xmit failed to requeue packet\n");
+
+ netif_tx_lock_bh(dev);
+ }
+ netif_tx_unlock_bh(dev);
+
+ return 0;
+}
+
+void ipoib_mcast_carrier_on_task(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv = container_of(work, struct ipoib_dev_priv,
+ carrier_on_task);
+ struct ib_port_attr attr;
+
+ if (ib_query_port(priv->ca, priv->port, &attr) ||
+ attr.state != IB_PORT_ACTIVE) {
+ ipoib_dbg(priv, "Keeping carrier off until IB port is active\n");
+ return;
+ }
+
+ /*
+ * Take rtnl_lock to avoid racing with ipoib_stop() and
+ * turning the carrier back on while a device is being
+ * removed. However, ipoib_stop() will attempt to flush
+ * the workqueue while holding the rtnl lock, so loop
+ * on trylock until either we get the lock or we see
+ * FLAG_OPER_UP go away as that signals that we are bailing
+ * and can safely ignore the carrier on work.
+ */
+ while (!rtnl_trylock()) {
+ if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
+ return;
+ else
+ msleep(20);
+ }
+ if (!ipoib_cm_admin_enabled(priv->dev))
+ dev_set_mtu(priv->dev, min(priv->mcast_mtu, priv->admin_mtu));
+ netif_carrier_on(priv->dev);
+ rtnl_unlock();
+}
+
+static int ipoib_mcast_join_complete(int status,
+ struct ib_sa_multicast *multicast)
+{
+ struct ipoib_mcast *mcast = multicast->context;
+ struct net_device *dev = mcast->dev;
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ ipoib_dbg_mcast(priv, "%sjoin completion for %pI6 (status %d)\n",
+ test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ?
+ "sendonly " : "",
+ mcast->mcmember.mgid.raw, status);
+
+ /* We trap for port events ourselves. */
+ if (status == -ENETRESET) {
+ status = 0;
+ goto out;
+ }
+
+ if (!status)
+ status = ipoib_mcast_join_finish(mcast, &multicast->rec);
+
+ if (!status) {
+ mcast->backoff = 1;
+ mcast->delay_until = jiffies;
+
+ /*
+ * Defer carrier on work to priv->wq to avoid a
+ * deadlock on rtnl_lock here. Requeue our multicast
+ * work too, which will end up happening right after
+ * our carrier on task work and will allow us to
+ * send out all of the non-broadcast joins
+ */
+ if (mcast == priv->broadcast) {
+ spin_lock_irq(&priv->lock);
+ queue_work(priv->wq, &priv->carrier_on_task);
+ __ipoib_mcast_schedule_join_thread(priv, NULL, 0);
+ goto out_locked;
+ }
+ } else {
+ if (mcast->logcount++ < 20) {
+ if (status == -ETIMEDOUT || status == -EAGAIN) {
+ ipoib_dbg_mcast(priv, "%smulticast join failed for %pI6, status %d\n",
+ test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? "sendonly " : "",
+ mcast->mcmember.mgid.raw, status);
+ } else {
+ ipoib_warn(priv, "%smulticast join failed for %pI6, status %d\n",
+ test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ? "sendonly " : "",
+ mcast->mcmember.mgid.raw, status);
+ }
+ }
+
+ if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) &&
+ mcast->backoff >= 2) {
+ /*
+ * We only retry sendonly joins once before we drop
+ * the packet and quit trying to deal with the
+ * group. However, we leave the group in the
+ * mcast list as an unjoined group. If we want to
+ * try joining again, we simply queue up a packet
+ * and restart the join thread. The empty queue
+ * is why the join thread ignores this group.
+ */
+ mcast->backoff = 1;
+ netif_tx_lock_bh(dev);
+ while (!skb_queue_empty(&mcast->pkt_queue)) {
+ ++dev->stats.tx_dropped;
+ dev_kfree_skb_any(skb_dequeue(&mcast->pkt_queue));
+ }
+ netif_tx_unlock_bh(dev);
+ } else {
+ spin_lock_irq(&priv->lock);
+ /* Requeue this join task with a backoff delay */
+ __ipoib_mcast_schedule_join_thread(priv, mcast, 1);
+ goto out_locked;
+ }
+ }
+out:
+ spin_lock_irq(&priv->lock);
+out_locked:
+ /*
+ * Make sure to set mcast->mc before we clear the busy flag to avoid
+ * racing with code that checks for BUSY before checking mcast->mc
+ */
+ if (status)
+ mcast->mc = NULL;
+ else
+ mcast->mc = multicast;
+ clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+ spin_unlock_irq(&priv->lock);
+ complete(&mcast->done);
+
+ return status;
+}
+
+static void ipoib_mcast_join(struct net_device *dev, struct ipoib_mcast *mcast,
+ int create)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ib_sa_multicast *multicast;
+ struct ib_sa_mcmember_rec rec = {
+ .join_state = 1
+ };
+ ib_sa_comp_mask comp_mask;
+ int ret = 0;
+
+ ipoib_dbg_mcast(priv, "joining MGID %pI6\n", mcast->mcmember.mgid.raw);
+
+ rec.mgid = mcast->mcmember.mgid;
+ rec.port_gid = priv->local_gid;
+ rec.pkey = cpu_to_be16(priv->pkey);
+
+ comp_mask =
+ IB_SA_MCMEMBER_REC_MGID |
+ IB_SA_MCMEMBER_REC_PORT_GID |
+ IB_SA_MCMEMBER_REC_PKEY |
+ IB_SA_MCMEMBER_REC_JOIN_STATE;
+
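+ /* for a full/create join, supply the complete group parameters, copied from the broadcast group */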
+ if (create) {
+ comp_mask |=
+ IB_SA_MCMEMBER_REC_QKEY |
+ IB_SA_MCMEMBER_REC_MTU_SELECTOR |
+ IB_SA_MCMEMBER_REC_MTU |
+ IB_SA_MCMEMBER_REC_TRAFFIC_CLASS |
+ IB_SA_MCMEMBER_REC_RATE_SELECTOR |
+ IB_SA_MCMEMBER_REC_RATE |
+ IB_SA_MCMEMBER_REC_SL |
+ IB_SA_MCMEMBER_REC_FLOW_LABEL |
+ IB_SA_MCMEMBER_REC_HOP_LIMIT;
+
+ rec.qkey = priv->broadcast->mcmember.qkey;
+ rec.mtu_selector = IB_SA_EQ;
+ rec.mtu = priv->broadcast->mcmember.mtu;
+ rec.traffic_class = priv->broadcast->mcmember.traffic_class;
+ rec.rate_selector = IB_SA_EQ;
+ rec.rate = priv->broadcast->mcmember.rate;
+ rec.sl = priv->broadcast->mcmember.sl;
+ rec.flow_label = priv->broadcast->mcmember.flow_label;
+ rec.hop_limit = priv->broadcast->mcmember.hop_limit;
+ }
+
+ multicast = ib_sa_join_multicast(&ipoib_sa_client, priv->ca, priv->port,
+ &rec, comp_mask, GFP_KERNEL,
+ ipoib_mcast_join_complete, mcast);
+ if (IS_ERR(multicast)) {
+ ret = PTR_ERR(multicast);
+ ipoib_warn(priv, "ib_sa_join_multicast failed, status %d\n", ret);
+ spin_lock_irq(&priv->lock);
+ /* Requeue this join task with a backoff delay */
+ __ipoib_mcast_schedule_join_thread(priv, mcast, 1);
+ clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+ spin_unlock_irq(&priv->lock);
+ complete(&mcast->done);
+ }
+}
+
+void ipoib_mcast_join_task(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv =
+ container_of(work, struct ipoib_dev_priv, mcast_task.work);
+ struct net_device *dev = priv->dev;
+ struct ib_port_attr port_attr;
+ unsigned long delay_until = 0;
+ struct ipoib_mcast *mcast = NULL;
+ int create = 1;
+
+ if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
+ return;
+
+ if (ib_query_port(priv->ca, priv->port, &port_attr) ||
+ port_attr.state != IB_PORT_ACTIVE) {
+ ipoib_dbg(priv, "port state is not ACTIVE (state = %d) suspending join task\n",
+ port_attr.state);
+ return;
+ }
+ priv->local_lid = port_attr.lid;
+
+ if (ib_query_gid(priv->ca, priv->port, 0, &priv->local_gid))
+ ipoib_warn(priv, "ib_query_gid() failed\n");
+ else
+ memcpy(priv->dev->dev_addr + 4, priv->local_gid.raw, sizeof (union ib_gid));
+
+ spin_lock_irq(&priv->lock);
+ if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
+ goto out;
+
+ if (!priv->broadcast) {
+ struct ipoib_mcast *broadcast;
+
+ broadcast = ipoib_mcast_alloc(dev, 0);
+ if (!broadcast) {
+ ipoib_warn(priv, "failed to allocate broadcast group\n");
+ /*
+ * Restart us after a 1 second delay to retry
+ * creating our broadcast group and attaching to
+ * it. Until this succeeds, this ipoib dev is
+ * completely stalled (multicast wise).
+ */
+ __ipoib_mcast_schedule_join_thread(priv, NULL, 1);
+ goto out;
+ }
+
+ memcpy(broadcast->mcmember.mgid.raw, priv->dev->broadcast + 4,
+ sizeof (union ib_gid));
+ priv->broadcast = broadcast;
+
+ __ipoib_mcast_add(dev, priv->broadcast);
+ }
+
+ if (!test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
+ if (IS_ERR_OR_NULL(priv->broadcast->mc) &&
+ !test_bit(IPOIB_MCAST_FLAG_BUSY, &priv->broadcast->flags)) {
+ mcast = priv->broadcast;
+ create = 0;
+ if (mcast->backoff > 1 &&
+ time_before(jiffies, mcast->delay_until)) {
+ delay_until = mcast->delay_until;
+ mcast = NULL;
+ }
+ }
+ goto out;
+ }
+
+ /*
+ * We'll never get here until the broadcast group is both allocated
+ * and attached
+ */
+ list_for_each_entry(mcast, &priv->multicast_list, list) {
+ if (IS_ERR_OR_NULL(mcast->mc) &&
+ !test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags) &&
+ (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags) ||
+ !skb_queue_empty(&mcast->pkt_queue))) {
+ if (mcast->backoff == 1 ||
+ time_after_eq(jiffies, mcast->delay_until)) {
+ /* Found the next unjoined group */
+ init_completion(&mcast->done);
+ set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+ if (test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
+ create = 0;
+ else
+ create = 1;
+ spin_unlock_irq(&priv->lock);
+ ipoib_mcast_join(dev, mcast, create);
+ spin_lock_irq(&priv->lock);
+ } else if (!delay_until ||
+ time_before(mcast->delay_until, delay_until))
+ delay_until = mcast->delay_until;
+ }
+ }
+
+ mcast = NULL;
+ ipoib_dbg_mcast(priv, "successfully started all multicast joins\n");
+
+out:
+ if (delay_until) {
+ cancel_delayed_work(&priv->mcast_task);
+ queue_delayed_work(priv->wq, &priv->mcast_task,
+ delay_until - jiffies);
+ }
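+ /* mark the selected group busy while still holding the lock, then issue the join with the lock dropped */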
+ if (mcast) {
+ init_completion(&mcast->done);
+ set_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags);
+ }
+ spin_unlock_irq(&priv->lock);
+ if (mcast)
+ ipoib_mcast_join(dev, mcast, create);
+}
+
+int ipoib_mcast_start_thread(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ unsigned long flags;
+
+ ipoib_dbg_mcast(priv, "starting multicast thread\n");
+
+ spin_lock_irqsave(&priv->lock, flags);
+ __ipoib_mcast_schedule_join_thread(priv, NULL, 0);
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ return 0;
+}
+
+int ipoib_mcast_stop_thread(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ unsigned long flags;
+
+ ipoib_dbg_mcast(priv, "stopping multicast thread\n");
+
+ spin_lock_irqsave(&priv->lock, flags);
+ cancel_delayed_work(&priv->mcast_task);
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ flush_workqueue(priv->wq);
+
+ return 0;
+}
+
+static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int ret = 0;
+
+ if (test_and_clear_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
+ ipoib_warn(priv, "ipoib_mcast_leave on an in-flight join\n");
+
+ if (!IS_ERR_OR_NULL(mcast->mc))
+ ib_sa_free_multicast(mcast->mc);
+
+ if (test_and_clear_bit(IPOIB_MCAST_FLAG_ATTACHED, &mcast->flags)) {
+ ipoib_dbg_mcast(priv, "leaving MGID %pI6\n",
+ mcast->mcmember.mgid.raw);
+
+ /* Remove ourselves from the multicast group */
+ ret = ib_detach_mcast(priv->qp, &mcast->mcmember.mgid,
+ be16_to_cpu(mcast->mcmember.mlid));
+ if (ret)
+ ipoib_warn(priv, "ib_detach_mcast failed (result = %d)\n", ret);
+ } else if (!test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags))
+ ipoib_dbg(priv, "leaving with no mcmember but not a "
+ "SENDONLY join\n");
+
+ return 0;
+}
+
+void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ipoib_mcast *mcast;
+ unsigned long flags;
+ void *mgid = daddr + 4;
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags) ||
+ !priv->broadcast ||
+ !test_bit(IPOIB_MCAST_FLAG_ATTACHED, &priv->broadcast->flags)) {
+ ++dev->stats.tx_dropped;
+ dev_kfree_skb_any(skb);
+ goto unlock;
+ }
+
+ mcast = __ipoib_mcast_find(dev, mgid);
+ if (!mcast || !mcast->ah) {
+ if (!mcast) {
+ /* Let's create a new send only group now */
+ ipoib_dbg_mcast(priv, "setting up send only multicast group for %pI6\n",
+ mgid);
+
+ mcast = ipoib_mcast_alloc(dev, 0);
+ if (!mcast) {
+ ipoib_warn(priv, "unable to allocate memory "
+ "for multicast structure\n");
+ ++dev->stats.tx_dropped;
+ dev_kfree_skb_any(skb);
+ goto unlock;
+ }
+
+ set_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags);
+ memcpy(mcast->mcmember.mgid.raw, mgid,
+ sizeof (union ib_gid));
+ __ipoib_mcast_add(dev, mcast);
+ list_add_tail(&mcast->list, &priv->multicast_list);
+ }
+ if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE)
+ skb_queue_tail(&mcast->pkt_queue, skb);
+ else {
+ ++dev->stats.tx_dropped;
+ dev_kfree_skb_any(skb);
+ }
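+ /* schedule the join thread unless a join for this group is already in flight */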
+ if (!test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) {
+ __ipoib_mcast_schedule_join_thread(priv, NULL, 0);
+ }
+ } else {
+ struct ipoib_neigh *neigh;
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+ neigh = ipoib_neigh_get(dev, daddr);
+ spin_lock_irqsave(&priv->lock, flags);
+ if (!neigh) {
+ neigh = ipoib_neigh_alloc(daddr, dev);
+ if (neigh) {
+ kref_get(&mcast->ah->ref);
+ neigh->ah = mcast->ah;
+ list_add_tail(&neigh->list, &mcast->neigh_list);
+ }
+ }
+ spin_unlock_irqrestore(&priv->lock, flags);
+ ipoib_send(dev, skb, mcast->ah, IB_MULTICAST_QPN);
+ if (neigh)
+ ipoib_neigh_put(neigh);
+ return;
+ }
+
+unlock:
+ spin_unlock_irqrestore(&priv->lock, flags);
+}
+
+void ipoib_mcast_dev_flush(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ LIST_HEAD(remove_list);
+ struct ipoib_mcast *mcast, *tmcast;
+ unsigned long flags;
+
+ ipoib_dbg_mcast(priv, "flushing multicast list\n");
+
+ spin_lock_irqsave(&priv->lock, flags);
+
+ list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
+ list_del(&mcast->list);
+ rb_erase(&mcast->rb_node, &priv->multicast_tree);
+ list_add_tail(&mcast->list, &remove_list);
+ }
+
+ if (priv->broadcast) {
+ rb_erase(&priv->broadcast->rb_node, &priv->multicast_tree);
+ list_add_tail(&priv->broadcast->list, &remove_list);
+ priv->broadcast = NULL;
+ }
+
+ spin_unlock_irqrestore(&priv->lock, flags);
+
+ /*
+ * make sure the in-flight joins have finished before we attempt
+ * to leave
+ */
+ list_for_each_entry_safe(mcast, tmcast, &remove_list, list)
+ if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
+ wait_for_completion(&mcast->done);
+
+ list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
+ ipoib_mcast_leave(dev, mcast);
+ ipoib_mcast_free(mcast);
+ }
+}
+
+static int ipoib_mcast_addr_is_valid(const u8 *addr, const u8 *broadcast)
+{
+ /* reserved QPN, prefix, scope */
+ if (memcmp(addr, broadcast, 6))
+ return 0;
+ /* signature lower, pkey */
+ if (memcmp(addr + 7, broadcast + 7, 3))
+ return 0;
+ return 1;
+}
+
+void ipoib_mcast_restart_task(struct work_struct *work)
+{
+ struct ipoib_dev_priv *priv =
+ container_of(work, struct ipoib_dev_priv, restart_task);
+ struct net_device *dev = priv->dev;
+ struct netdev_hw_addr *ha;
+ struct ipoib_mcast *mcast, *tmcast;
+ LIST_HEAD(remove_list);
+ unsigned long flags;
+ struct ib_sa_mcmember_rec rec;
+
+ if (!test_bit(IPOIB_FLAG_OPER_UP, &priv->flags))
+ /*
+ * shortcut: on shutdown the flush is called next, just
+ * let it do all the work
+ */
+ return;
+
+ ipoib_dbg_mcast(priv, "restarting multicast task\n");
+
+ local_irq_save(flags);
+ netif_addr_lock(dev);
+ spin_lock(&priv->lock);
+
+ /*
+ * Unfortunately, the networking core only gives us a list of all of
+ * the multicast hardware addresses. We need to figure out which ones
+ * are new and which ones have been removed
+ */
+
+ /* Clear out the found flag */
+ list_for_each_entry(mcast, &priv->multicast_list, list)
+ clear_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags);
+
+ /* Mark the entries we find, and create new ones for those that don't exist */
+ netdev_for_each_mc_addr(ha, dev) {
+ union ib_gid mgid;
+
+ if (!ipoib_mcast_addr_is_valid(ha->addr, dev->broadcast))
+ continue;
+
+ memcpy(mgid.raw, ha->addr + 4, sizeof mgid);
+
+ mcast = __ipoib_mcast_find(dev, &mgid);
+ if (!mcast || test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
+ struct ipoib_mcast *nmcast;
+
+ /* ignore groups that are joined directly by userspace */
+ if (test_bit(IPOIB_FLAG_UMCAST, &priv->flags) &&
+ !ib_sa_get_mcmember_rec(priv->ca, priv->port, &mgid, &rec)) {
+ ipoib_dbg_mcast(priv, "ignoring multicast entry for mgid %pI6\n",
+ mgid.raw);
+ continue;
+ }
+
+ /* Not found or send-only group, let's add a new entry */
+ ipoib_dbg_mcast(priv, "adding multicast entry for mgid %pI6\n",
+ mgid.raw);
+
+ nmcast = ipoib_mcast_alloc(dev, 0);
+ if (!nmcast) {
+ ipoib_warn(priv, "unable to allocate memory for multicast structure\n");
+ continue;
+ }
+
+ set_bit(IPOIB_MCAST_FLAG_FOUND, &nmcast->flags);
+
+ nmcast->mcmember.mgid = mgid;
+
+ if (mcast) {
+ /* Destroy the send only entry */
+ list_move_tail(&mcast->list, &remove_list);
+
+ rb_replace_node(&mcast->rb_node,
+ &nmcast->rb_node,
+ &priv->multicast_tree);
+ } else
+ __ipoib_mcast_add(dev, nmcast);
+
+ list_add_tail(&nmcast->list, &priv->multicast_list);
+ }
+
+ if (mcast)
+ set_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags);
+ }
+
+ /* Remove all of the entries that don't exist anymore */
+ list_for_each_entry_safe(mcast, tmcast, &priv->multicast_list, list) {
+ if (!test_bit(IPOIB_MCAST_FLAG_FOUND, &mcast->flags) &&
+ !test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) {
+ ipoib_dbg_mcast(priv, "deleting multicast group %pI6\n",
+ mcast->mcmember.mgid.raw);
+
+ rb_erase(&mcast->rb_node, &priv->multicast_tree);
+
+ /* Move to the remove list */
+ list_move_tail(&mcast->list, &remove_list);
+ }
+ }
+
+ spin_unlock(&priv->lock);
+ netif_addr_unlock(dev);
+ local_irq_restore(flags);
+
+ /*
+ * make sure the in-flight joins have finished before we attempt
+ * to leave
+ */
+ list_for_each_entry_safe(mcast, tmcast, &remove_list, list)
+ if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags))
+ wait_for_completion(&mcast->done);
+
+ list_for_each_entry_safe(mcast, tmcast, &remove_list, list) {
+ ipoib_mcast_leave(mcast->dev, mcast);
+ ipoib_mcast_free(mcast);
+ }
+
+ /*
+ * Double check that we are still up
+ */
+ if (test_bit(IPOIB_FLAG_OPER_UP, &priv->flags)) {
+ spin_lock_irqsave(&priv->lock, flags);
+ __ipoib_mcast_schedule_join_thread(priv, NULL, 0);
+ spin_unlock_irqrestore(&priv->lock, flags);
+ }
+}
+
+#ifdef CONFIG_INFINIBAND_IPOIB_DEBUG
+
+struct ipoib_mcast_iter *ipoib_mcast_iter_init(struct net_device *dev)
+{
+ struct ipoib_mcast_iter *iter;
+
+ iter = kmalloc(sizeof *iter, GFP_KERNEL);
+ if (!iter)
+ return NULL;
+
+ iter->dev = dev;
+ memset(iter->mgid.raw, 0, 16);
+
+ if (ipoib_mcast_iter_next(iter)) {
+ kfree(iter);
+ return NULL;
+ }
+
+ return iter;
+}
+
+int ipoib_mcast_iter_next(struct ipoib_mcast_iter *iter)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(iter->dev);
+ struct rb_node *n;
+ struct ipoib_mcast *mcast;
+ int ret = 1;
+
+ spin_lock_irq(&priv->lock);
+
+ n = rb_first(&priv->multicast_tree);
+
+ while (n) {
+ mcast = rb_entry(n, struct ipoib_mcast, rb_node);
+
+ if (memcmp(iter->mgid.raw, mcast->mcmember.mgid.raw,
+ sizeof (union ib_gid)) < 0) {
+ iter->mgid = mcast->mcmember.mgid;
+ iter->created = mcast->created;
+ iter->queuelen = skb_queue_len(&mcast->pkt_queue);
+ iter->complete = !!mcast->ah;
+ iter->send_only = !!(mcast->flags & (1 << IPOIB_MCAST_FLAG_SENDONLY));
+
+ ret = 0;
+
+ break;
+ }
+
+ n = rb_next(n);
+ }
+
+ spin_unlock_irq(&priv->lock);
+
+ return ret;
+}
+
+void ipoib_mcast_iter_read(struct ipoib_mcast_iter *iter,
+ union ib_gid *mgid,
+ unsigned long *created,
+ unsigned int *queuelen,
+ unsigned int *complete,
+ unsigned int *send_only)
+{
+ *mgid = iter->mgid;
+ *created = iter->created;
+ *queuelen = iter->queuelen;
+ *complete = iter->complete;
+ *send_only = iter->send_only;
+}
+
+#endif /* CONFIG_INFINIBAND_IPOIB_DEBUG */
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_netlink.c b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c
new file mode 100644
index 000000000..cdc7df4fd
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_netlink.c
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2012 Mellanox Technologies. - All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/netdevice.h>
+#include <linux/if_arp.h> /* For ARPHRD_xxx */
+#include <linux/module.h>
+#include <net/rtnetlink.h>
+#include "ipoib.h"
+
+static const struct nla_policy ipoib_policy[IFLA_IPOIB_MAX + 1] = {
+ [IFLA_IPOIB_PKEY] = { .type = NLA_U16 },
+ [IFLA_IPOIB_MODE] = { .type = NLA_U16 },
+ [IFLA_IPOIB_UMCAST] = { .type = NLA_U16 },
+};
+
+static int ipoib_fill_info(struct sk_buff *skb, const struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ u16 val;
+
+ if (nla_put_u16(skb, IFLA_IPOIB_PKEY, priv->pkey))
+ goto nla_put_failure;
+
+ val = test_bit(IPOIB_FLAG_ADMIN_CM, &priv->flags);
+ if (nla_put_u16(skb, IFLA_IPOIB_MODE, val))
+ goto nla_put_failure;
+
+ val = test_bit(IPOIB_FLAG_UMCAST, &priv->flags);
+ if (nla_put_u16(skb, IFLA_IPOIB_UMCAST, val))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -EMSGSIZE;
+}
+
+static int ipoib_changelink(struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ u16 mode, umcast;
+ int ret = 0;
+
+ if (data[IFLA_IPOIB_MODE]) {
+ mode = nla_get_u16(data[IFLA_IPOIB_MODE]);
+ if (mode == IPOIB_MODE_DATAGRAM)
+ ret = ipoib_set_mode(dev, "datagram\n");
+ else if (mode == IPOIB_MODE_CONNECTED)
+ ret = ipoib_set_mode(dev, "connected\n");
+ else
+ ret = -EINVAL;
+
+ if (ret < 0)
+ goto out_err;
+ }
+
+ if (data[IFLA_IPOIB_UMCAST]) {
+ umcast = nla_get_u16(data[IFLA_IPOIB_UMCAST]);
+ ipoib_set_umcast(dev, umcast);
+ }
+
+out_err:
+ return ret;
+}
+
+static int ipoib_new_child_link(struct net *src_net, struct net_device *dev,
+ struct nlattr *tb[], struct nlattr *data[])
+{
+ struct net_device *pdev;
+ struct ipoib_dev_priv *ppriv;
+ u16 child_pkey;
+ int err;
+
+ if (!tb[IFLA_LINK])
+ return -EINVAL;
+
+ pdev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK]));
+ if (!pdev || pdev->type != ARPHRD_INFINIBAND)
+ return -ENODEV;
+
+ ppriv = netdev_priv(pdev);
+
+ if (test_bit(IPOIB_FLAG_SUBINTERFACE, &ppriv->flags)) {
+ ipoib_warn(ppriv, "child creation disallowed for child devices\n");
+ return -EINVAL;
+ }
+
+ if (!data || !data[IFLA_IPOIB_PKEY]) {
+ ipoib_dbg(ppriv, "no pkey specified, using parent pkey\n");
+ child_pkey = ppriv->pkey;
+ } else
+ child_pkey = nla_get_u16(data[IFLA_IPOIB_PKEY]);
+
+ if (child_pkey == 0 || child_pkey == 0x8000)
+ return -EINVAL;
+
+ /*
+ * Set the full membership bit, so that we join the right
+ * broadcast group, etc.
+ */
+ child_pkey |= 0x8000;
+
+ err = __ipoib_vlan_add(ppriv, netdev_priv(dev), child_pkey, IPOIB_RTNL_CHILD);
+
+ if (!err && data)
+ err = ipoib_changelink(dev, tb, data);
+ return err;
+}
+
+static void ipoib_unregister_child_dev(struct net_device *dev, struct list_head *head)
+{
+ struct ipoib_dev_priv *priv, *ppriv;
+
+ priv = netdev_priv(dev);
+ ppriv = netdev_priv(priv->parent);
+
+ down_write(&ppriv->vlan_rwsem);
+ unregister_netdevice_queue(dev, head);
+ list_del(&priv->list);
+ up_write(&ppriv->vlan_rwsem);
+}
+
+static size_t ipoib_get_size(const struct net_device *dev)
+{
+ return nla_total_size(2) + /* IFLA_IPOIB_PKEY */
+ nla_total_size(2) + /* IFLA_IPOIB_MODE */
+ nla_total_size(2); /* IFLA_IPOIB_UMCAST */
+}
+
+static struct rtnl_link_ops ipoib_link_ops __read_mostly = {
+ .kind = "ipoib",
+ .maxtype = IFLA_IPOIB_MAX,
+ .policy = ipoib_policy,
+ .priv_size = sizeof(struct ipoib_dev_priv),
+ .setup = ipoib_setup,
+ .newlink = ipoib_new_child_link,
+ .changelink = ipoib_changelink,
+ .dellink = ipoib_unregister_child_dev,
+ .get_size = ipoib_get_size,
+ .fill_info = ipoib_fill_info,
+};
+
+int __init ipoib_netlink_init(void)
+{
+ return rtnl_link_register(&ipoib_link_ops);
+}
+
+void __exit ipoib_netlink_fini(void)
+{
+ rtnl_link_unregister(&ipoib_link_ops);
+}
+
+MODULE_ALIAS_RTNL_LINK("ipoib");
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_verbs.c b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
new file mode 100644
index 000000000..e5cc43074
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_verbs.c
@@ -0,0 +1,317 @@
+/*
+ * Copyright (c) 2004, 2005 Topspin Communications. All rights reserved.
+ * Copyright (c) 2005 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/slab.h>
+
+#include "ipoib.h"
+
+int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid, int set_qkey)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ib_qp_attr *qp_attr = NULL;
+ int ret;
+ u16 pkey_index;
+
+ if (ib_find_pkey(priv->ca, priv->port, priv->pkey, &pkey_index)) {
+ clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+ ret = -ENXIO;
+ goto out;
+ }
+ set_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+
+ if (set_qkey) {
+ ret = -ENOMEM;
+ qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL);
+ if (!qp_attr)
+ goto out;
+
+ /* set correct QKey for QP */
+ qp_attr->qkey = priv->qkey;
+ ret = ib_modify_qp(priv->qp, qp_attr, IB_QP_QKEY);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify QP, ret = %d\n", ret);
+ goto out;
+ }
+ }
+
+ /* attach QP to multicast group */
+ ret = ib_attach_mcast(priv->qp, mgid, mlid);
+ if (ret)
+ ipoib_warn(priv, "failed to attach to multicast group, ret = %d\n", ret);
+
+out:
+ kfree(qp_attr);
+ return ret;
+}
+
+int ipoib_init_qp(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ int ret;
+ struct ib_qp_attr qp_attr;
+ int attr_mask;
+
+ if (!test_bit(IPOIB_PKEY_ASSIGNED, &priv->flags))
+ return -1;
+
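+ /* walk the UD QP through the INIT -> RTR -> RTS state transitions */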
+ qp_attr.qp_state = IB_QPS_INIT;
+ qp_attr.qkey = 0;
+ qp_attr.port_num = priv->port;
+ qp_attr.pkey_index = priv->pkey_index;
+ attr_mask =
+ IB_QP_QKEY |
+ IB_QP_PORT |
+ IB_QP_PKEY_INDEX |
+ IB_QP_STATE;
+ ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify QP to init, ret = %d\n", ret);
+ goto out_fail;
+ }
+
+ qp_attr.qp_state = IB_QPS_RTR;
+ /* Can't set this in an INIT->RTR transition */
+ attr_mask &= ~IB_QP_PORT;
+ ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify QP to RTR, ret = %d\n", ret);
+ goto out_fail;
+ }
+
+ qp_attr.qp_state = IB_QPS_RTS;
+ qp_attr.sq_psn = 0;
+ attr_mask |= IB_QP_SQ_PSN;
+ attr_mask &= ~IB_QP_PKEY_INDEX;
+ ret = ib_modify_qp(priv->qp, &qp_attr, attr_mask);
+ if (ret) {
+ ipoib_warn(priv, "failed to modify QP to RTS, ret = %d\n", ret);
+ goto out_fail;
+ }
+
+ return 0;
+
+out_fail:
+ qp_attr.qp_state = IB_QPS_RESET;
+ if (ib_modify_qp(priv->qp, &qp_attr, IB_QP_STATE))
+ ipoib_warn(priv, "Failed to modify QP to RESET state\n");
+
+ return ret;
+}
+
+int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+ struct ib_qp_init_attr init_attr = {
+ .cap = {
+ .max_send_wr = ipoib_sendq_size,
+ .max_recv_wr = ipoib_recvq_size,
+ .max_send_sge = 1,
+ .max_recv_sge = IPOIB_UD_RX_SG
+ },
+ .sq_sig_type = IB_SIGNAL_ALL_WR,
+ .qp_type = IB_QPT_UD
+ };
+
+ int ret, size;
+ int i;
+
+ priv->pd = ib_alloc_pd(priv->ca);
+ if (IS_ERR(priv->pd)) {
+ printk(KERN_WARNING "%s: failed to allocate PD\n", ca->name);
+ return -ENODEV;
+ }
+
+ priv->mr = ib_get_dma_mr(priv->pd, IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(priv->mr)) {
+ printk(KERN_WARNING "%s: ib_get_dma_mr failed\n", ca->name);
+ goto out_free_pd;
+ }
+
+ /*
+ * the various IPoIB tasks assume they will never race against
+ * themselves, so always use a single thread workqueue
+ */
+ priv->wq = create_singlethread_workqueue("ipoib_wq");
+ if (!priv->wq) {
+ printk(KERN_WARNING "ipoib: failed to allocate device WQ\n");
+ goto out_free_mr;
+ }
+
+ size = ipoib_recvq_size + 1;
+ ret = ipoib_cm_dev_init(dev);
+ if (!ret) {
+ size += ipoib_sendq_size;
+ if (ipoib_cm_has_srq(dev))
+ size += ipoib_recvq_size + 1; /* 1 extra for rx_drain_qp */
+ else
+ size += ipoib_recvq_size * ipoib_max_conn_qp;
+ } else
+ goto out_free_wq;
+
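+ /*
+ * The receive CQ is sized for the UD receive ring plus, when connected
+ * mode initialized successfully above, additional room for the send ring
+ * and either the shared SRQ receive ring or one receive ring per
+ * connected QP.
+ */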
+ priv->recv_cq = ib_create_cq(priv->ca, ipoib_ib_completion, NULL, dev, size, 0);
+ if (IS_ERR(priv->recv_cq)) {
+ printk(KERN_WARNING "%s: failed to create receive CQ\n", ca->name);
+ goto out_cm_dev_cleanup;
+ }
+
+ priv->send_cq = ib_create_cq(priv->ca, ipoib_send_comp_handler, NULL,
+ dev, ipoib_sendq_size, 0);
+ if (IS_ERR(priv->send_cq)) {
+ printk(KERN_WARNING "%s: failed to create send CQ\n", ca->name);
+ goto out_free_recv_cq;
+ }
+
+ if (ib_req_notify_cq(priv->recv_cq, IB_CQ_NEXT_COMP))
+ goto out_free_send_cq;
+
+ init_attr.send_cq = priv->send_cq;
+ init_attr.recv_cq = priv->recv_cq;
+
+ if (priv->hca_caps & IB_DEVICE_UD_TSO)
+ init_attr.create_flags |= IB_QP_CREATE_IPOIB_UD_LSO;
+
+ if (priv->hca_caps & IB_DEVICE_BLOCK_MULTICAST_LOOPBACK)
+ init_attr.create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
+
+ if (priv->hca_caps & IB_DEVICE_MANAGED_FLOW_STEERING)
+ init_attr.create_flags |= IB_QP_CREATE_NETIF_QP;
+
+ if (dev->features & NETIF_F_SG)
+ init_attr.cap.max_send_sge = MAX_SKB_FRAGS + 1;
+
+ priv->qp = ib_create_qp(priv->pd, &init_attr);
+ if (IS_ERR(priv->qp)) {
+ printk(KERN_WARNING "%s: failed to create QP\n", ca->name);
+ goto out_free_send_cq;
+ }
+
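+ /*
+ * Bytes 1-3 of the 20-byte IPoIB hardware address carry the 24-bit UD
+ * QP number; the remaining 16 bytes hold the port GID, which is filled
+ * in elsewhere during device initialization.
+ */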
+ priv->dev->dev_addr[1] = (priv->qp->qp_num >> 16) & 0xff;
+ priv->dev->dev_addr[2] = (priv->qp->qp_num >> 8) & 0xff;
+ priv->dev->dev_addr[3] = (priv->qp->qp_num ) & 0xff;
+
+ for (i = 0; i < MAX_SKB_FRAGS + 1; ++i)
+ priv->tx_sge[i].lkey = priv->mr->lkey;
+
+ priv->tx_wr.opcode = IB_WR_SEND;
+ priv->tx_wr.sg_list = priv->tx_sge;
+ priv->tx_wr.send_flags = IB_SEND_SIGNALED;
+
+ priv->rx_sge[0].lkey = priv->mr->lkey;
+
+ priv->rx_sge[0].length = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
+ priv->rx_wr.num_sge = 1;
+
+ priv->rx_wr.next = NULL;
+ priv->rx_wr.sg_list = priv->rx_sge;
+
+ return 0;
+
+out_free_send_cq:
+ ib_destroy_cq(priv->send_cq);
+
+out_free_recv_cq:
+ ib_destroy_cq(priv->recv_cq);
+
+out_cm_dev_cleanup:
+ ipoib_cm_dev_cleanup(dev);
+
+out_free_wq:
+ destroy_workqueue(priv->wq);
+ priv->wq = NULL;
+
+out_free_mr:
+ ib_dereg_mr(priv->mr);
+
+out_free_pd:
+ ib_dealloc_pd(priv->pd);
+
+ return -ENODEV;
+}
+
+void ipoib_transport_dev_cleanup(struct net_device *dev)
+{
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ if (priv->qp) {
+ if (ib_destroy_qp(priv->qp))
+ ipoib_warn(priv, "ib_qp_destroy failed\n");
+
+ priv->qp = NULL;
+ clear_bit(IPOIB_PKEY_ASSIGNED, &priv->flags);
+ }
+
+ if (ib_destroy_cq(priv->send_cq))
+ ipoib_warn(priv, "ib_cq_destroy (send) failed\n");
+
+ if (ib_destroy_cq(priv->recv_cq))
+ ipoib_warn(priv, "ib_cq_destroy (recv) failed\n");
+
+ ipoib_cm_dev_cleanup(dev);
+
+ if (priv->wq) {
+ flush_workqueue(priv->wq);
+ destroy_workqueue(priv->wq);
+ priv->wq = NULL;
+ }
+
+ if (ib_dereg_mr(priv->mr))
+ ipoib_warn(priv, "ib_dereg_mr failed\n");
+
+ if (ib_dealloc_pd(priv->pd))
+ ipoib_warn(priv, "ib_dealloc_pd failed\n");
+}
+
+void ipoib_event(struct ib_event_handler *handler,
+ struct ib_event *record)
+{
+ struct ipoib_dev_priv *priv =
+ container_of(handler, struct ipoib_dev_priv, event_handler);
+
+ if (record->element.port_num != priv->port)
+ return;
+
+ ipoib_dbg(priv, "Event %d on device %s port %d\n", record->event,
+ record->device->name, record->element.port_num);
+
+ if (record->event == IB_EVENT_SM_CHANGE ||
+ record->event == IB_EVENT_CLIENT_REREGISTER) {
+ queue_work(ipoib_workqueue, &priv->flush_light);
+ } else if (record->event == IB_EVENT_PORT_ERR ||
+ record->event == IB_EVENT_PORT_ACTIVE ||
+ record->event == IB_EVENT_LID_CHANGE) {
+ queue_work(ipoib_workqueue, &priv->flush_normal);
+ } else if (record->event == IB_EVENT_PKEY_CHANGE) {
+ queue_work(ipoib_workqueue, &priv->flush_heavy);
+ }
+}
diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
new file mode 100644
index 000000000..fca1a882d
--- /dev/null
+++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2004 Topspin Communications. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <linux/module.h>
+
+#include <linux/init.h>
+#include <linux/seq_file.h>
+
+#include <asm/uaccess.h>
+
+#include "ipoib.h"
+
+static ssize_t show_parent(struct device *d, struct device_attribute *attr,
+ char *buf)
+{
+ struct net_device *dev = to_net_dev(d);
+ struct ipoib_dev_priv *priv = netdev_priv(dev);
+
+ return sprintf(buf, "%s\n", priv->parent->name);
+}
+static DEVICE_ATTR(parent, S_IRUGO, show_parent, NULL);
+
+int __ipoib_vlan_add(struct ipoib_dev_priv *ppriv, struct ipoib_dev_priv *priv,
+ u16 pkey, int type)
+{
+ int result;
+
+ priv->max_ib_mtu = ppriv->max_ib_mtu;
+ /* MTU will be reset when mcast join happens */
+ priv->dev->mtu = IPOIB_UD_MTU(priv->max_ib_mtu);
+ priv->mcast_mtu = priv->admin_mtu = priv->dev->mtu;
+ priv->parent = ppriv->dev;
+ set_bit(IPOIB_FLAG_SUBINTERFACE, &priv->flags);
+
+ result = ipoib_set_dev_features(priv, ppriv->ca);
+ if (result)
+ goto err;
+
+ priv->pkey = pkey;
+
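+ /*
+ * Clone the parent's hardware address, then patch the child's P_Key
+ * into bytes 8-9 of the broadcast address, which map to the P_Key
+ * field of the IPoIB broadcast MGID.
+ */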
+ memcpy(priv->dev->dev_addr, ppriv->dev->dev_addr, INFINIBAND_ALEN);
+ priv->dev->broadcast[8] = pkey >> 8;
+ priv->dev->broadcast[9] = pkey & 0xff;
+
+ result = ipoib_dev_init(priv->dev, ppriv->ca, ppriv->port);
+ if (result < 0) {
+ ipoib_warn(ppriv, "failed to initialize subinterface: "
+ "device %s, port %d",
+ ppriv->ca->name, ppriv->port);
+ goto err;
+ }
+
+ result = register_netdevice(priv->dev);
+ if (result) {
+ ipoib_warn(priv, "failed to initialize; error %i", result);
+ goto register_failed;
+ }
+
+ ipoib_create_debug_files(priv->dev);
+
+ /* RTNL children don't need proprietary sysfs entries */
+ if (type == IPOIB_LEGACY_CHILD) {
+ if (ipoib_cm_add_mode_attr(priv->dev))
+ goto sysfs_failed;
+ if (ipoib_add_pkey_attr(priv->dev))
+ goto sysfs_failed;
+ if (ipoib_add_umcast_attr(priv->dev))
+ goto sysfs_failed;
+
+ if (device_create_file(&priv->dev->dev, &dev_attr_parent))
+ goto sysfs_failed;
+ }
+
+ priv->child_type = type;
+ list_add_tail(&priv->list, &ppriv->child_intfs);
+
+ return 0;
+
+sysfs_failed:
+ result = -ENOMEM;
+ ipoib_delete_debug_files(priv->dev);
+ unregister_netdevice(priv->dev);
+
+register_failed:
+ ipoib_dev_cleanup(priv->dev);
+
+err:
+ return result;
+}
+
+int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey)
+{
+ struct ipoib_dev_priv *ppriv, *priv;
+ char intf_name[IFNAMSIZ];
+ struct ipoib_dev_priv *tpriv;
+ int result;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ ppriv = netdev_priv(pdev);
+
+ snprintf(intf_name, sizeof intf_name, "%s.%04x",
+ ppriv->dev->name, pkey);
+ priv = ipoib_intf_alloc(intf_name);
+ if (!priv)
+ return -ENOMEM;
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ down_write(&ppriv->vlan_rwsem);
+
+ /*
+ * First ensure this isn't a duplicate. We check the parent device and
+ * then all of the legacy child interfaces to make sure the Pkey
+ * doesn't match.
+ */
+ if (ppriv->pkey == pkey) {
+ result = -ENOTUNIQ;
+ goto out;
+ }
+
+ list_for_each_entry(tpriv, &ppriv->child_intfs, list) {
+ if (tpriv->pkey == pkey &&
+ tpriv->child_type == IPOIB_LEGACY_CHILD) {
+ result = -ENOTUNIQ;
+ goto out;
+ }
+ }
+
+ result = __ipoib_vlan_add(ppriv, priv, pkey, IPOIB_LEGACY_CHILD);
+
+out:
+ up_write(&ppriv->vlan_rwsem);
+
+ if (result)
+ free_netdev(priv->dev);
+
+ rtnl_unlock();
+
+ return result;
+}
+
+int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey)
+{
+ struct ipoib_dev_priv *ppriv, *priv, *tpriv;
+ struct net_device *dev = NULL;
+
+ if (!capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ ppriv = netdev_priv(pdev);
+
+ if (!rtnl_trylock())
+ return restart_syscall();
+
+ down_write(&ppriv->vlan_rwsem);
+ list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) {
+ if (priv->pkey == pkey &&
+ priv->child_type == IPOIB_LEGACY_CHILD) {
+ unregister_netdevice(priv->dev);
+ list_del(&priv->list);
+ dev = priv->dev;
+ break;
+ }
+ }
+ up_write(&ppriv->vlan_rwsem);
+
+ rtnl_unlock();
+
+ if (dev) {
+ free_netdev(dev);
+ return 0;
+ }
+
+ return -ENODEV;
+}
diff --git a/drivers/infiniband/ulp/iser/Kconfig b/drivers/infiniband/ulp/iser/Kconfig
new file mode 100644
index 000000000..d00af71a2
--- /dev/null
+++ b/drivers/infiniband/ulp/iser/Kconfig
@@ -0,0 +1,12 @@
+config INFINIBAND_ISER
+ tristate "iSCSI Extensions for RDMA (iSER)"
+ depends on SCSI && INET && INFINIBAND_ADDR_TRANS
+ select SCSI_ISCSI_ATTRS
+ ---help---
+ Support for the iSCSI Extensions for RDMA (iSER) Protocol
+ over InfiniBand. This allows you to access storage devices
+ that speak iSCSI over iSER over InfiniBand.
+
+ The iSER protocol is defined by IETF.
+ See <http://www.ietf.org/rfc/rfc5046.txt>
+ and <http://members.infinibandta.org/kwspub/spec/Annex_iSER.PDF>
diff --git a/drivers/infiniband/ulp/iser/Makefile b/drivers/infiniband/ulp/iser/Makefile
new file mode 100644
index 000000000..fe6cd15f2
--- /dev/null
+++ b/drivers/infiniband/ulp/iser/Makefile
@@ -0,0 +1,4 @@
+obj-$(CONFIG_INFINIBAND_ISER) += ib_iser.o
+
+ib_iser-y := iser_verbs.o iser_initiator.o iser_memory.o \
+ iscsi_iser.o
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c
new file mode 100644
index 000000000..6a594aac2
--- /dev/null
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.c
@@ -0,0 +1,1089 @@
+/*
+ * iSCSI Initiator over iSER Data-Path
+ *
+ * Copyright (C) 2004 Dmitry Yusupov
+ * Copyright (C) 2004 Alex Aizman
+ * Copyright (C) 2005 Mike Christie
+ * Copyright (c) 2005, 2006 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved.
+ * maintained by openib-general@openib.org
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Credits:
+ * Christoph Hellwig
+ * FUJITA Tomonori
+ * Arne Redlich
+ * Zhenyu Wang
+ * Modified by:
+ * Erez Zilber
+ */
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/hardirq.h>
+#include <linux/kfifo.h>
+#include <linux/blkdev.h>
+#include <linux/init.h>
+#include <linux/ioctl.h>
+#include <linux/cdev.h>
+#include <linux/in.h>
+#include <linux/net.h>
+#include <linux/scatterlist.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+
+#include <net/sock.h>
+
+#include <asm/uaccess.h>
+
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi_eh.h>
+#include <scsi/scsi_tcq.h>
+#include <scsi/scsi_host.h>
+#include <scsi/scsi.h>
+#include <scsi/scsi_transport_iscsi.h>
+
+#include "iscsi_iser.h"
+
+static struct scsi_host_template iscsi_iser_sht;
+static struct iscsi_transport iscsi_iser_transport;
+static struct scsi_transport_template *iscsi_iser_scsi_transport;
+
+static unsigned int iscsi_max_lun = 512;
+module_param_named(max_lun, iscsi_max_lun, uint, S_IRUGO);
+
+int iser_debug_level = 0;
+bool iser_pi_enable = false;
+int iser_pi_guard = 1;
+
+MODULE_DESCRIPTION("iSER (iSCSI Extensions for RDMA) Datamover");
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Alex Nezhinsky, Dan Bar Dov, Or Gerlitz");
+MODULE_VERSION(DRV_VER);
+
+module_param_named(debug_level, iser_debug_level, int, 0644);
+MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:disabled)");
+
+module_param_named(pi_enable, iser_pi_enable, bool, 0644);
+MODULE_PARM_DESC(pi_enable, "Enable T10-PI offload support (default:disabled)");
+
+module_param_named(pi_guard, iser_pi_guard, int, 0644);
+MODULE_PARM_DESC(pi_guard, "T10-PI guard_type [deprecated]");
+
+static struct workqueue_struct *release_wq;
+struct iser_global ig;
+
+/**
+ * iscsi_iser_recv() - Process a successful recv completion
+ * @conn: iscsi connection
+ * @hdr: iscsi header
+ * @rx_data: buffer containing receive data payload
+ * @rx_data_len: length of rx_data
+ *
+ * Notes: In case of data length errors or iscsi PDU completion failures
+ * this routine will signal the iscsi layer of a connection failure.
+ */
+void
+iscsi_iser_recv(struct iscsi_conn *conn, struct iscsi_hdr *hdr,
+ char *rx_data, int rx_data_len)
+{
+ int rc = 0;
+ int datalen;
+ int ahslen;
+
+ /* verify PDU length */
+ datalen = ntoh24(hdr->dlength);
+ if (datalen > rx_data_len || (datalen + 4) < rx_data_len) {
+ iser_err("wrong datalen %d (hdr), %d (IB)\n",
+ datalen, rx_data_len);
+ rc = ISCSI_ERR_DATALEN;
+ goto error;
+ }
+
+ if (datalen != rx_data_len)
+ iser_dbg("aligned datalen (%d) hdr, %d (IB)\n",
+ datalen, rx_data_len);
+
+ /* read AHS */
+ ahslen = hdr->hlength * 4;
+
+ rc = iscsi_complete_pdu(conn, hdr, rx_data, rx_data_len);
+ if (rc && rc != ISCSI_ERR_NO_SCSI_CMD)
+ goto error;
+
+ return;
+error:
+ iscsi_conn_failure(conn, rc);
+}
+
+/**
+ * iscsi_iser_pdu_alloc() - allocate an iscsi-iser PDU
+ * @task: iscsi task
+ * @opcode: iscsi command opcode
+ *
+ * Notes: This routine can't fail; it just assigns the iscsi task
+ * hdr and max hdr size.
+ */
+static int
+iscsi_iser_pdu_alloc(struct iscsi_task *task, uint8_t opcode)
+{
+ struct iscsi_iser_task *iser_task = task->dd_data;
+
+ task->hdr = (struct iscsi_hdr *)&iser_task->desc.iscsi_header;
+ task->hdr_max = sizeof(iser_task->desc.iscsi_header);
+
+ return 0;
+}
+
+/**
+ * iser_initialize_task_headers() - Initialize task headers
+ * @task: iscsi task
+ * @tx_desc: iser tx descriptor
+ *
+ * Notes:
+ * This routine may race with iser teardown flow for scsi
+ * error handling TMFs. So for TMF we should acquire the
+ * state mutex to avoid dereferencing the IB device which
+ * may have already been terminated.
+ */
+int
+iser_initialize_task_headers(struct iscsi_task *task,
+ struct iser_tx_desc *tx_desc)
+{
+ struct iser_conn *iser_conn = task->conn->dd_data;
+ struct iser_device *device = iser_conn->ib_conn.device;
+ struct iscsi_iser_task *iser_task = task->dd_data;
+ u64 dma_addr;
+ const bool mgmt_task = !task->sc && !in_interrupt();
+ int ret = 0;
+
+ if (unlikely(mgmt_task))
+ mutex_lock(&iser_conn->state_mutex);
+
+ if (unlikely(iser_conn->state != ISER_CONN_UP)) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ dma_addr = ib_dma_map_single(device->ib_device, (void *)tx_desc,
+ ISER_HEADERS_LEN, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(device->ib_device, dma_addr)) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ tx_desc->dma_addr = dma_addr;
+ tx_desc->tx_sg[0].addr = tx_desc->dma_addr;
+ tx_desc->tx_sg[0].length = ISER_HEADERS_LEN;
+ tx_desc->tx_sg[0].lkey = device->mr->lkey;
+
+ iser_task->iser_conn = iser_conn;
+out:
+ if (unlikely(mgmt_task))
+ mutex_unlock(&iser_conn->state_mutex);
+
+ return ret;
+}
+
+/**
+ * iscsi_iser_task_init() - Initialize iscsi-iser task
+ * @task: iscsi task
+ *
+ * Initialize the task for the scsi command or mgmt command.
+ *
+ * Return: zero on success or -ENOMEM when task header
+ * initialization fails (dma mapping error).
+ */
+static int
+iscsi_iser_task_init(struct iscsi_task *task)
+{
+ struct iscsi_iser_task *iser_task = task->dd_data;
+ int ret;
+
+ ret = iser_initialize_task_headers(task, &iser_task->desc);
+ if (ret) {
+ iser_err("Failed to init task %p, err = %d\n",
+ iser_task, ret);
+ return ret;
+ }
+
+ /* mgmt task */
+ if (!task->sc)
+ return 0;
+
+ iser_task->command_sent = 0;
+ iser_task_rdma_init(iser_task);
+ iser_task->sc = task->sc;
+
+ return 0;
+}
+
+/**
+ * iscsi_iser_mtask_xmit() - xmit management (immediate) task
+ * @conn: iscsi connection
+ * @task: task management task
+ *
+ * Notes:
+ * The function can return -EAGAIN, in which case the caller must
+ * call it again later or recover. A '0' return code means a successful
+ * xmit.
+ *
+ **/
+static int
+iscsi_iser_mtask_xmit(struct iscsi_conn *conn, struct iscsi_task *task)
+{
+ int error = 0;
+
+ iser_dbg("mtask xmit [cid %d itt 0x%x]\n", conn->id, task->itt);
+
+ error = iser_send_control(conn, task);
+
+ /* since iser xmits control with zero copy, tasks can not be recycled
+ * right after sending them.
+ * The recycling scheme is based on whether a response is expected
+ * - if yes, the task is recycled at iscsi_complete_pdu
+ * - if no, the task is recycled at iser_snd_completion
+ */
+ return error;
+}
+
+static int
+iscsi_iser_task_xmit_unsol_data(struct iscsi_conn *conn,
+ struct iscsi_task *task)
+{
+ struct iscsi_r2t_info *r2t = &task->unsol_r2t;
+ struct iscsi_data hdr;
+ int error = 0;
+
+ /* Send data-out PDUs while there's still unsolicited data to send */
+ while (iscsi_task_has_unsol_data(task)) {
+ iscsi_prep_data_out_pdu(task, r2t, &hdr);
+ iser_dbg("Sending data-out: itt 0x%x, data count %d\n",
+ hdr.itt, r2t->data_count);
+
+ /* the buffer description has been passed with the command */
+ /* Send the command */
+ error = iser_send_data_out(conn, task, &hdr);
+ if (error) {
+ r2t->datasn--;
+ goto iscsi_iser_task_xmit_unsol_data_exit;
+ }
+ r2t->sent += r2t->data_count;
+ iser_dbg("Need to send %d more as data-out PDUs\n",
+ r2t->data_length - r2t->sent);
+ }
+
+iscsi_iser_task_xmit_unsol_data_exit:
+ return error;
+}
+
+/**
+ * iscsi_iser_task_xmit() - xmit iscsi-iser task
+ * @task: iscsi task
+ *
+ * Return: zero on success or the propagated error code on failure.
+ */
+static int
+iscsi_iser_task_xmit(struct iscsi_task *task)
+{
+ struct iscsi_conn *conn = task->conn;
+ struct iscsi_iser_task *iser_task = task->dd_data;
+ int error = 0;
+
+ if (!task->sc)
+ return iscsi_iser_mtask_xmit(conn, task);
+
+ if (task->sc->sc_data_direction == DMA_TO_DEVICE) {
+ BUG_ON(scsi_bufflen(task->sc) == 0);
+
+ iser_dbg("cmd [itt %x total %d imm %d unsol_data %d\n",
+ task->itt, scsi_bufflen(task->sc),
+ task->imm_count, task->unsol_r2t.data_length);
+ }
+
+ iser_dbg("ctask xmit [cid %d itt 0x%x]\n",
+ conn->id, task->itt);
+
+ /* Send the cmd PDU */
+ if (!iser_task->command_sent) {
+ error = iser_send_command(conn, task);
+ if (error)
+ goto iscsi_iser_task_xmit_exit;
+ iser_task->command_sent = 1;
+ }
+
+ /* Send unsolicited data-out PDU(s) if necessary */
+ if (iscsi_task_has_unsol_data(task))
+ error = iscsi_iser_task_xmit_unsol_data(conn, task);
+
+ iscsi_iser_task_xmit_exit:
+ return error;
+}
+
+/**
+ * iscsi_iser_cleanup_task() - cleanup an iscsi-iser task
+ * @task: iscsi task
+ *
+ * Notes: In case the RDMA device is already NULL (it might have
+ * been removed by a DEVICE_REMOVAL CM event), this routine bails out
+ * without doing the dma unmapping.
+ */
+static void iscsi_iser_cleanup_task(struct iscsi_task *task)
+{
+ struct iscsi_iser_task *iser_task = task->dd_data;
+ struct iser_tx_desc *tx_desc = &iser_task->desc;
+ struct iser_conn *iser_conn = task->conn->dd_data;
+ struct iser_device *device = iser_conn->ib_conn.device;
+
+ /* DEVICE_REMOVAL event might have already released the device */
+ if (!device)
+ return;
+
+ ib_dma_unmap_single(device->ib_device,
+ tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE);
+
+ /* mgmt tasks do not need special cleanup */
+ if (!task->sc)
+ return;
+
+ if (iser_task->status == ISER_TASK_STATUS_STARTED) {
+ iser_task->status = ISER_TASK_STATUS_COMPLETED;
+ iser_task_rdma_finalize(iser_task);
+ }
+}
+
+/**
+ * iscsi_iser_check_protection() - check protection information status of task.
+ * @task: iscsi task
+ * @sector: error sector if it exists (output)
+ *
+ * Return: zero if no data-integrity errors have occurred
+ * 0x1: data-integrity error occurred in the guard-block
+ * 0x2: data-integrity error occurred in the reference tag
+ * 0x3: data-integrity error occurred in the application tag
+ *
+ * In addition the error sector is marked.
+ */
+static u8
+iscsi_iser_check_protection(struct iscsi_task *task, sector_t *sector)
+{
+ struct iscsi_iser_task *iser_task = task->dd_data;
+
+ if (iser_task->dir[ISER_DIR_IN])
+ return iser_check_task_pi_status(iser_task, ISER_DIR_IN,
+ sector);
+ else
+ return iser_check_task_pi_status(iser_task, ISER_DIR_OUT,
+ sector);
+}
+
+/**
+ * iscsi_iser_conn_create() - create a new iscsi-iser connection
+ * @cls_session: iscsi class session
+ * @conn_idx: connection index within the session (for MCS)
+ *
+ * Return: iscsi_cls_conn when iscsi_conn_setup succeeds or NULL
+ * otherwise.
+ */
+static struct iscsi_cls_conn *
+iscsi_iser_conn_create(struct iscsi_cls_session *cls_session,
+ uint32_t conn_idx)
+{
+ struct iscsi_conn *conn;
+ struct iscsi_cls_conn *cls_conn;
+
+ cls_conn = iscsi_conn_setup(cls_session, 0, conn_idx);
+ if (!cls_conn)
+ return NULL;
+ conn = cls_conn->dd_data;
+
+ /*
+ * due to issues with the login code w.r.t. iser semantics,
+ * this is not set in iscsi_conn_setup - FIXME
+ */
+ conn->max_recv_dlength = ISER_RECV_DATA_SEG_LEN;
+
+ return cls_conn;
+}
+
+/**
+ * iscsi_iser_conn_bind() - bind iscsi and iser connection structures
+ * @cls_session: iscsi class session
+ * @cls_conn: iscsi class connection
+ * @transport_eph: transport end-point handle
+ * @is_leading: indicate if this is the session leading connection (MCS)
+ *
+ * Return: zero on success, the iscsi_conn_bind error code if it fails, and
+ * -EINVAL in case the end-point doesn't exist anymore or the iser connection
+ * state is not UP (teardown already started).
+ */
+static int
+iscsi_iser_conn_bind(struct iscsi_cls_session *cls_session,
+ struct iscsi_cls_conn *cls_conn,
+ uint64_t transport_eph,
+ int is_leading)
+{
+ struct iscsi_conn *conn = cls_conn->dd_data;
+ struct iser_conn *iser_conn;
+ struct iscsi_endpoint *ep;
+ int error;
+
+ error = iscsi_conn_bind(cls_session, cls_conn, is_leading);
+ if (error)
+ return error;
+
+ /* the transport ep handle comes from user space so it must be
+ * verified against the global ib connections list */
+ ep = iscsi_lookup_endpoint(transport_eph);
+ if (!ep) {
+ iser_err("can't bind eph %llx\n",
+ (unsigned long long)transport_eph);
+ return -EINVAL;
+ }
+ iser_conn = ep->dd_data;
+
+ mutex_lock(&iser_conn->state_mutex);
+ if (iser_conn->state != ISER_CONN_UP) {
+ error = -EINVAL;
+ iser_err("iser_conn %p state is %d, teardown started\n",
+ iser_conn, iser_conn->state);
+ goto out;
+ }
+
+ error = iser_alloc_rx_descriptors(iser_conn, conn->session);
+ if (error)
+ goto out;
+
+ /* binds the iSER connection retrieved from the previously
+ * connected ep_handle to the iSCSI layer connection. exchanges
+ * connection pointers */
+ iser_info("binding iscsi conn %p to iser_conn %p\n", conn, iser_conn);
+
+ conn->dd_data = iser_conn;
+ iser_conn->iscsi_conn = conn;
+
+out:
+ mutex_unlock(&iser_conn->state_mutex);
+ return error;
+}
+
+/**
+ * iscsi_iser_conn_start() - start iscsi-iser connection
+ * @cls_conn: iscsi class connection
+ *
+ * Notes: Here iser initializes (or re-initializes) stop_completion, as
+ * from this point iscsi must call conn_stop in session/connection
+ * teardown, so the iser transport must wait for it.
+ */
+static int
+iscsi_iser_conn_start(struct iscsi_cls_conn *cls_conn)
+{
+ struct iscsi_conn *iscsi_conn;
+ struct iser_conn *iser_conn;
+
+ iscsi_conn = cls_conn->dd_data;
+ iser_conn = iscsi_conn->dd_data;
+ reinit_completion(&iser_conn->stop_completion);
+
+ return iscsi_conn_start(cls_conn);
+}
+
+/**
+ * iscsi_iser_conn_stop() - stop iscsi-iser connection
+ * @cls_conn: iscsi class connection
+ * @flag: indicate if recover or terminate (passed as is)
+ *
+ * Notes: Calling iscsi_conn_stop might theoretically race with
+ * DEVICE_REMOVAL event and dereference a previously freed RDMA device
+ * handle, so we call it under the iser state lock to protect against
+ * this kind of race.
+ */
+static void
+iscsi_iser_conn_stop(struct iscsi_cls_conn *cls_conn, int flag)
+{
+ struct iscsi_conn *conn = cls_conn->dd_data;
+ struct iser_conn *iser_conn = conn->dd_data;
+
+ iser_info("stopping iscsi_conn: %p, iser_conn: %p\n", conn, iser_conn);
+
+ /*
+ * Userspace may have goofed up and not bound the connection or
+ * might have only partially set up the connection.
+ */
+ if (iser_conn) {
+ mutex_lock(&iser_conn->state_mutex);
+ iser_conn_terminate(iser_conn);
+ iscsi_conn_stop(cls_conn, flag);
+
+ /* unbind */
+ iser_conn->iscsi_conn = NULL;
+ conn->dd_data = NULL;
+
+ complete(&iser_conn->stop_completion);
+ mutex_unlock(&iser_conn->state_mutex);
+ } else {
+ iscsi_conn_stop(cls_conn, flag);
+ }
+}
+
+/**
+ * iscsi_iser_session_destroy() - destroy iscsi-iser session
+ * @cls_session: iscsi class session
+ *
+ * Removes and frees the iscsi host.
+ */
+static void
+iscsi_iser_session_destroy(struct iscsi_cls_session *cls_session)
+{
+ struct Scsi_Host *shost = iscsi_session_to_shost(cls_session);
+
+ iscsi_session_teardown(cls_session);
+ iscsi_host_remove(shost);
+ iscsi_host_free(shost);
+}
+
+static inline unsigned int
+iser_dif_prot_caps(int prot_caps)
+{
+ return ((prot_caps & IB_PROT_T10DIF_TYPE_1) ?
+ SHOST_DIF_TYPE1_PROTECTION | SHOST_DIX_TYPE0_PROTECTION |
+ SHOST_DIX_TYPE1_PROTECTION : 0) |
+ ((prot_caps & IB_PROT_T10DIF_TYPE_2) ?
+ SHOST_DIF_TYPE2_PROTECTION | SHOST_DIX_TYPE2_PROTECTION : 0) |
+ ((prot_caps & IB_PROT_T10DIF_TYPE_3) ?
+ SHOST_DIF_TYPE3_PROTECTION | SHOST_DIX_TYPE3_PROTECTION : 0);
+}
+
+/**
+ * iscsi_iser_session_create() - create an iscsi-iser session
+ * @ep: iscsi end-point handle
+ * @cmds_max: maximum commands in this session
+ * @qdepth: session command queue depth
+ * @initial_cmdsn: initiator command sequence number
+ *
+ * Allocates and adds a scsi host, exposes DIF support if it
+ * exists, and sets up an iscsi session.
+ */
+static struct iscsi_cls_session *
+iscsi_iser_session_create(struct iscsi_endpoint *ep,
+ uint16_t cmds_max, uint16_t qdepth,
+ uint32_t initial_cmdsn)
+{
+ struct iscsi_cls_session *cls_session;
+ struct iscsi_session *session;
+ struct Scsi_Host *shost;
+ struct iser_conn *iser_conn = NULL;
+ struct ib_conn *ib_conn;
+ u16 max_cmds;
+
+ shost = iscsi_host_alloc(&iscsi_iser_sht, 0, 0);
+ if (!shost)
+ return NULL;
+ shost->transportt = iscsi_iser_scsi_transport;
+ shost->cmd_per_lun = qdepth;
+ shost->max_lun = iscsi_max_lun;
+ shost->max_id = 0;
+ shost->max_channel = 0;
+ shost->max_cmd_len = 16;
+
+ /*
+ * older userspace tools (before 2.0-870) did not pass us
+ * the leading conn's ep so this will be NULL;
+ */
+ if (ep) {
+ iser_conn = ep->dd_data;
+ max_cmds = iser_conn->max_cmds;
+
+ mutex_lock(&iser_conn->state_mutex);
+ if (iser_conn->state != ISER_CONN_UP) {
+ iser_err("iser conn %p already started teardown\n",
+ iser_conn);
+ mutex_unlock(&iser_conn->state_mutex);
+ goto free_host;
+ }
+
+ ib_conn = &iser_conn->ib_conn;
+ if (ib_conn->pi_support) {
+ u32 sig_caps = ib_conn->device->dev_attr.sig_prot_cap;
+
+ scsi_host_set_prot(shost, iser_dif_prot_caps(sig_caps));
+ scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP |
+ SHOST_DIX_GUARD_CRC);
+ }
+
+ if (iscsi_host_add(shost,
+ ib_conn->device->ib_device->dma_device)) {
+ mutex_unlock(&iser_conn->state_mutex);
+ goto free_host;
+ }
+ mutex_unlock(&iser_conn->state_mutex);
+ } else {
+ max_cmds = ISER_DEF_XMIT_CMDS_MAX;
+ if (iscsi_host_add(shost, NULL))
+ goto free_host;
+ }
+
+ if (cmds_max > max_cmds) {
+ iser_info("cmds_max changed from %u to %u\n",
+ cmds_max, max_cmds);
+ cmds_max = max_cmds;
+ }
+
+ cls_session = iscsi_session_setup(&iscsi_iser_transport, shost,
+ cmds_max, 0,
+ sizeof(struct iscsi_iser_task),
+ initial_cmdsn, 0);
+ if (!cls_session)
+ goto remove_host;
+ session = cls_session->dd_data;
+
+ shost->can_queue = session->scsi_cmds_max;
+ return cls_session;
+
+remove_host:
+ iscsi_host_remove(shost);
+free_host:
+ iscsi_host_free(shost);
+ return NULL;
+}
+
+static int
+iscsi_iser_set_param(struct iscsi_cls_conn *cls_conn,
+ enum iscsi_param param, char *buf, int buflen)
+{
+ int value;
+
+ switch (param) {
+ case ISCSI_PARAM_MAX_RECV_DLENGTH:
+ /* TBD */
+ break;
+ case ISCSI_PARAM_HDRDGST_EN:
+ sscanf(buf, "%d", &value);
+ if (value) {
+ iser_err("DataDigest wasn't negotiated to None\n");
+ return -EPROTO;
+ }
+ break;
+ case ISCSI_PARAM_DATADGST_EN:
+ sscanf(buf, "%d", &value);
+ if (value) {
+ iser_err("DataDigest wasn't negotiated to None\n");
+ return -EPROTO;
+ }
+ break;
+ case ISCSI_PARAM_IFMARKER_EN:
+ sscanf(buf, "%d", &value);
+ if (value) {
+ iser_err("IFMarker wasn't negotiated to No\n");
+ return -EPROTO;
+ }
+ break;
+ case ISCSI_PARAM_OFMARKER_EN:
+ sscanf(buf, "%d", &value);
+ if (value) {
+ iser_err("OFMarker wasn't negotiated to No\n");
+ return -EPROTO;
+ }
+ break;
+ default:
+ return iscsi_set_param(cls_conn, param, buf, buflen);
+ }
+
+ return 0;
+}
+
+/**
+ * iscsi_iser_conn_get_stats() - output iscsi connection statistics
+ * @cls_conn: iscsi class connection
+ * @stats: iscsi stats to output
+ *
+ * Output connection statistics.
+ */
+static void
+iscsi_iser_conn_get_stats(struct iscsi_cls_conn *cls_conn, struct iscsi_stats *stats)
+{
+ struct iscsi_conn *conn = cls_conn->dd_data;
+
+ stats->txdata_octets = conn->txdata_octets;
+ stats->rxdata_octets = conn->rxdata_octets;
+ stats->scsicmd_pdus = conn->scsicmd_pdus_cnt;
+ stats->dataout_pdus = conn->dataout_pdus_cnt;
+ stats->scsirsp_pdus = conn->scsirsp_pdus_cnt;
+ stats->datain_pdus = conn->datain_pdus_cnt; /* always 0 */
+ stats->r2t_pdus = conn->r2t_pdus_cnt; /* always 0 */
+ stats->tmfcmd_pdus = conn->tmfcmd_pdus_cnt;
+ stats->tmfrsp_pdus = conn->tmfrsp_pdus_cnt;
+ stats->custom_length = 4;
+ strcpy(stats->custom[0].desc, "qp_tx_queue_full");
+ stats->custom[0].value = 0; /* TB iser_conn->qp_tx_queue_full; */
+ strcpy(stats->custom[1].desc, "fmr_map_not_avail");
+ stats->custom[1].value = 0; /* TB iser_conn->fmr_map_not_avail */
+ strcpy(stats->custom[2].desc, "eh_abort_cnt");
+ stats->custom[2].value = conn->eh_abort_cnt;
+ strcpy(stats->custom[3].desc, "fmr_unalign_cnt");
+ stats->custom[3].value = conn->fmr_unalign_cnt;
+}
+
+static int iscsi_iser_get_ep_param(struct iscsi_endpoint *ep,
+ enum iscsi_param param, char *buf)
+{
+ struct iser_conn *iser_conn = ep->dd_data;
+ int len;
+
+ switch (param) {
+ case ISCSI_PARAM_CONN_PORT:
+ case ISCSI_PARAM_CONN_ADDRESS:
+ if (!iser_conn || !iser_conn->ib_conn.cma_id)
+ return -ENOTCONN;
+
+ return iscsi_conn_get_addr_param((struct sockaddr_storage *)
+ &iser_conn->ib_conn.cma_id->route.addr.dst_addr,
+ param, buf);
+ break;
+ default:
+ return -ENOSYS;
+ }
+
+ return len;
+}
+
+/**
+ * iscsi_iser_ep_connect() - Initiate iSER connection establishment
+ * @shost: scsi_host
+ * @dst_addr: destination address
+ * @non_blocking: indicate whether the routine may block
+ *
+ * Allocate an iscsi endpoint, an iser_conn structure and bind them.
+ * After that start RDMA connection establishment via rdma_cm. We
+ * don't allocate iser_conn embedded in iscsi_endpoint since in teardown
+ * the endpoint will be destroyed at ep_disconnect while iser_conn will
+ * clean up its resources asynchronously.
+ *
+ * Return: iscsi_endpoint created by iscsi layer or ERR_PTR(error)
+ * on failure.
+ */
+static struct iscsi_endpoint *
+iscsi_iser_ep_connect(struct Scsi_Host *shost, struct sockaddr *dst_addr,
+ int non_blocking)
+{
+ int err;
+ struct iser_conn *iser_conn;
+ struct iscsi_endpoint *ep;
+
+ ep = iscsi_create_endpoint(0);
+ if (!ep)
+ return ERR_PTR(-ENOMEM);
+
+ iser_conn = kzalloc(sizeof(*iser_conn), GFP_KERNEL);
+ if (!iser_conn) {
+ err = -ENOMEM;
+ goto failure;
+ }
+
+ ep->dd_data = iser_conn;
+ iser_conn->ep = ep;
+ iser_conn_init(iser_conn);
+
+ err = iser_connect(iser_conn, NULL, dst_addr, non_blocking);
+ if (err)
+ goto failure;
+
+ return ep;
+failure:
+ iscsi_destroy_endpoint(ep);
+ return ERR_PTR(err);
+}
+
+/**
+ * iscsi_iser_ep_poll() - poll for iser connection establishment to complete
+ * @ep: iscsi endpoint (created at ep_connect)
+ * @timeout_ms: polling timeout allowed in ms.
+ *
+ * This routine boils down to waiting for up_completion signaling
+ * that the cma_id got a CONNECTED event.
+ *
+ * Return: 1 if connection establishment succeeded, 0 if the timeout expired
+ * (libiscsi retry will kick in), or -1 if interrupted by a signal or,
+ * more likely, if the iser connection state transitioned to TERMINATING or
+ * DOWN during the wait period.
+ */
+static int
+iscsi_iser_ep_poll(struct iscsi_endpoint *ep, int timeout_ms)
+{
+ struct iser_conn *iser_conn;
+ int rc;
+
+ iser_conn = ep->dd_data;
+ rc = wait_for_completion_interruptible_timeout(&iser_conn->up_completion,
+ msecs_to_jiffies(timeout_ms));
+ /* if conn establishment failed, return error code to iscsi */
+ if (rc == 0) {
+ mutex_lock(&iser_conn->state_mutex);
+ if (iser_conn->state == ISER_CONN_TERMINATING ||
+ iser_conn->state == ISER_CONN_DOWN)
+ rc = -1;
+ mutex_unlock(&iser_conn->state_mutex);
+ }
+
+ iser_info("ib conn %p rc = %d\n", iser_conn, rc);
+
+ if (rc > 0)
+ return 1; /* success, this is the equivalent of POLLOUT */
+ else if (!rc)
+ return 0; /* timeout */
+ else
+ return rc; /* signal */
+}
+
+/**
+ * iscsi_iser_ep_disconnect() - Initiate connection teardown process
+ * @ep: iscsi endpoint handle
+ *
+ * This routine does not block on completion of the iser and RDMA
+ * termination process: we either queue deferred work for iser/RDMA
+ * destruction and cleanup, or call it immediately in case we didn't pass
+ * the iscsi conn bind/start stage, thus it is safe.
+ */
+static void
+iscsi_iser_ep_disconnect(struct iscsi_endpoint *ep)
+{
+ struct iser_conn *iser_conn;
+
+ iser_conn = ep->dd_data;
+ iser_info("ep %p iser conn %p state %d\n",
+ ep, iser_conn, iser_conn->state);
+
+ mutex_lock(&iser_conn->state_mutex);
+ iser_conn_terminate(iser_conn);
+
+ /*
+ * if iser_conn and iscsi_conn are bound, we must wait for
+ * iscsi_conn_stop and flush errors completion before freeing
+ * the iser resources. Otherwise we are safe to free resources
+ * immediately.
+ */
+ if (iser_conn->iscsi_conn) {
+ INIT_WORK(&iser_conn->release_work, iser_release_work);
+ queue_work(release_wq, &iser_conn->release_work);
+ mutex_unlock(&iser_conn->state_mutex);
+ } else {
+ iser_conn->state = ISER_CONN_DOWN;
+ mutex_unlock(&iser_conn->state_mutex);
+ iser_conn_release(iser_conn);
+ }
+ iscsi_destroy_endpoint(ep);
+}
+
+static umode_t iser_attr_is_visible(int param_type, int param)
+{
+ switch (param_type) {
+ case ISCSI_HOST_PARAM:
+ switch (param) {
+ case ISCSI_HOST_PARAM_NETDEV_NAME:
+ case ISCSI_HOST_PARAM_HWADDRESS:
+ case ISCSI_HOST_PARAM_INITIATOR_NAME:
+ return S_IRUGO;
+ default:
+ return 0;
+ }
+ case ISCSI_PARAM:
+ switch (param) {
+ case ISCSI_PARAM_MAX_RECV_DLENGTH:
+ case ISCSI_PARAM_MAX_XMIT_DLENGTH:
+ case ISCSI_PARAM_HDRDGST_EN:
+ case ISCSI_PARAM_DATADGST_EN:
+ case ISCSI_PARAM_CONN_ADDRESS:
+ case ISCSI_PARAM_CONN_PORT:
+ case ISCSI_PARAM_EXP_STATSN:
+ case ISCSI_PARAM_PERSISTENT_ADDRESS:
+ case ISCSI_PARAM_PERSISTENT_PORT:
+ case ISCSI_PARAM_PING_TMO:
+ case ISCSI_PARAM_RECV_TMO:
+ case ISCSI_PARAM_INITIAL_R2T_EN:
+ case ISCSI_PARAM_MAX_R2T:
+ case ISCSI_PARAM_IMM_DATA_EN:
+ case ISCSI_PARAM_FIRST_BURST:
+ case ISCSI_PARAM_MAX_BURST:
+ case ISCSI_PARAM_PDU_INORDER_EN:
+ case ISCSI_PARAM_DATASEQ_INORDER_EN:
+ case ISCSI_PARAM_TARGET_NAME:
+ case ISCSI_PARAM_TPGT:
+ case ISCSI_PARAM_USERNAME:
+ case ISCSI_PARAM_PASSWORD:
+ case ISCSI_PARAM_USERNAME_IN:
+ case ISCSI_PARAM_PASSWORD_IN:
+ case ISCSI_PARAM_FAST_ABORT:
+ case ISCSI_PARAM_ABORT_TMO:
+ case ISCSI_PARAM_LU_RESET_TMO:
+ case ISCSI_PARAM_TGT_RESET_TMO:
+ case ISCSI_PARAM_IFACE_NAME:
+ case ISCSI_PARAM_INITIATOR_NAME:
+ case ISCSI_PARAM_DISCOVERY_SESS:
+ return S_IRUGO;
+ default:
+ return 0;
+ }
+ }
+
+ return 0;
+}
+
+static struct scsi_host_template iscsi_iser_sht = {
+ .module = THIS_MODULE,
+ .name = "iSCSI Initiator over iSER",
+ .queuecommand = iscsi_queuecommand,
+ .change_queue_depth = scsi_change_queue_depth,
+ .sg_tablesize = ISCSI_ISER_SG_TABLESIZE,
+ .max_sectors = 1024,
+ .cmd_per_lun = ISER_DEF_CMD_PER_LUN,
+ .eh_abort_handler = iscsi_eh_abort,
+ .eh_device_reset_handler= iscsi_eh_device_reset,
+ .eh_target_reset_handler = iscsi_eh_recover_target,
+ .target_alloc = iscsi_target_alloc,
+ .use_clustering = DISABLE_CLUSTERING,
+ .proc_name = "iscsi_iser",
+ .this_id = -1,
+ .track_queue_depth = 1,
+};
+
+static struct iscsi_transport iscsi_iser_transport = {
+ .owner = THIS_MODULE,
+ .name = "iser",
+ .caps = CAP_RECOVERY_L0 | CAP_MULTI_R2T | CAP_TEXT_NEGO,
+ /* session management */
+ .create_session = iscsi_iser_session_create,
+ .destroy_session = iscsi_iser_session_destroy,
+ /* connection management */
+ .create_conn = iscsi_iser_conn_create,
+ .bind_conn = iscsi_iser_conn_bind,
+ .destroy_conn = iscsi_conn_teardown,
+ .attr_is_visible = iser_attr_is_visible,
+ .set_param = iscsi_iser_set_param,
+ .get_conn_param = iscsi_conn_get_param,
+ .get_ep_param = iscsi_iser_get_ep_param,
+ .get_session_param = iscsi_session_get_param,
+ .start_conn = iscsi_iser_conn_start,
+ .stop_conn = iscsi_iser_conn_stop,
+ /* iscsi host params */
+ .get_host_param = iscsi_host_get_param,
+ .set_host_param = iscsi_host_set_param,
+ /* IO */
+ .send_pdu = iscsi_conn_send_pdu,
+ .get_stats = iscsi_iser_conn_get_stats,
+ .init_task = iscsi_iser_task_init,
+ .xmit_task = iscsi_iser_task_xmit,
+ .cleanup_task = iscsi_iser_cleanup_task,
+ .alloc_pdu = iscsi_iser_pdu_alloc,
+ .check_protection = iscsi_iser_check_protection,
+ /* recovery */
+ .session_recovery_timedout = iscsi_session_recovery_timedout,
+
+ .ep_connect = iscsi_iser_ep_connect,
+ .ep_poll = iscsi_iser_ep_poll,
+ .ep_disconnect = iscsi_iser_ep_disconnect
+};
+
+static int __init iser_init(void)
+{
+ int err;
+
+ iser_dbg("Starting iSER datamover...\n");
+
+ if (iscsi_max_lun < 1) {
+ iser_err("Invalid max_lun value of %u\n", iscsi_max_lun);
+ return -EINVAL;
+ }
+
+ memset(&ig, 0, sizeof(struct iser_global));
+
+ ig.desc_cache = kmem_cache_create("iser_descriptors",
+ sizeof(struct iser_tx_desc),
+ 0, SLAB_HWCACHE_ALIGN,
+ NULL);
+ if (ig.desc_cache == NULL)
+ return -ENOMEM;
+
+ /* device init is called only after the first addr resolution */
+ mutex_init(&ig.device_list_mutex);
+ INIT_LIST_HEAD(&ig.device_list);
+ mutex_init(&ig.connlist_mutex);
+ INIT_LIST_HEAD(&ig.connlist);
+
+ release_wq = alloc_workqueue("release workqueue", 0, 0);
+ if (!release_wq) {
+ iser_err("failed to allocate release workqueue\n");
+ return -ENOMEM;
+ }
+
+ iscsi_iser_scsi_transport = iscsi_register_transport(
+ &iscsi_iser_transport);
+ if (!iscsi_iser_scsi_transport) {
+ iser_err("iscsi_register_transport failed\n");
+ err = -EINVAL;
+ goto register_transport_failure;
+ }
+
+ return 0;
+
+register_transport_failure:
+ kmem_cache_destroy(ig.desc_cache);
+
+ return err;
+}
+
+static void __exit iser_exit(void)
+{
+ struct iser_conn *iser_conn, *n;
+ int connlist_empty;
+
+ iser_dbg("Removing iSER datamover...\n");
+ destroy_workqueue(release_wq);
+
+ mutex_lock(&ig.connlist_mutex);
+ connlist_empty = list_empty(&ig.connlist);
+ mutex_unlock(&ig.connlist_mutex);
+
+ if (!connlist_empty) {
+ iser_err("Error cleanup stage completed but we still have iser "
+ "connections, destroying them anyway.\n");
+ list_for_each_entry_safe(iser_conn, n, &ig.connlist,
+ conn_list) {
+ iser_conn_release(iser_conn);
+ }
+ }
+
+ iscsi_unregister_transport(&iscsi_iser_transport);
+ kmem_cache_destroy(ig.desc_cache);
+}
+
+module_init(iser_init);
+module_exit(iser_exit);
diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h
new file mode 100644
index 000000000..262ba1f8e
--- /dev/null
+++ b/drivers/infiniband/ulp/iser/iscsi_iser.h
@@ -0,0 +1,644 @@
+/*
+ * iSER transport for the Open iSCSI Initiator & iSER transport internals
+ *
+ * Copyright (C) 2004 Dmitry Yusupov
+ * Copyright (C) 2004 Alex Aizman
+ * Copyright (C) 2005 Mike Christie
+ * based on code maintained by open-iscsi@googlegroups.com
+ *
+ * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ISCSI_ISER_H__
+#define __ISCSI_ISER_H__
+
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/printk.h>
+#include <scsi/libiscsi.h>
+#include <scsi/scsi_transport_iscsi.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_device.h>
+
+#include <linux/interrupt.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/dma-mapping.h>
+#include <linux/mutex.h>
+#include <linux/mempool.h>
+#include <linux/uio.h>
+
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_fmr_pool.h>
+#include <rdma/rdma_cm.h>
+
+#define DRV_NAME "iser"
+#define PFX DRV_NAME ": "
+#define DRV_VER "1.6"
+
+#define iser_dbg(fmt, arg...) \
+ do { \
+ if (unlikely(iser_debug_level > 2)) \
+ printk(KERN_DEBUG PFX "%s: " fmt,\
+ __func__ , ## arg); \
+ } while (0)
+
+#define iser_warn(fmt, arg...) \
+ do { \
+ if (unlikely(iser_debug_level > 0)) \
+ pr_warn(PFX "%s: " fmt, \
+ __func__ , ## arg); \
+ } while (0)
+
+#define iser_info(fmt, arg...) \
+ do { \
+ if (unlikely(iser_debug_level > 1)) \
+ pr_info(PFX "%s: " fmt, \
+ __func__ , ## arg); \
+ } while (0)
+
+#define iser_err(fmt, arg...) \
+ pr_err(PFX "%s: " fmt, __func__ , ## arg)
+
+#define SHIFT_4K 12
+#define SIZE_4K (1ULL << SHIFT_4K)
+#define MASK_4K (~(SIZE_4K-1))
+ /* support up to 512KB in one RDMA */
+#define ISCSI_ISER_SG_TABLESIZE (0x80000 >> SHIFT_4K)
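+/* i.e. 0x80000 bytes = 512KB split into 4KB pages gives 128 SG entries */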
+#define ISER_DEF_XMIT_CMDS_DEFAULT 512
+#if ISCSI_DEF_XMIT_CMDS_MAX > ISER_DEF_XMIT_CMDS_DEFAULT
+ #define ISER_DEF_XMIT_CMDS_MAX ISCSI_DEF_XMIT_CMDS_MAX
+#else
+ #define ISER_DEF_XMIT_CMDS_MAX ISER_DEF_XMIT_CMDS_DEFAULT
+#endif
+#define ISER_DEF_CMD_PER_LUN ISER_DEF_XMIT_CMDS_MAX
+
+/* QP settings */
+/* Maximal bounds on received asynchronous PDUs */
+#define ISER_MAX_RX_MISC_PDUS 4 /* NOOP_IN(2) , ASYNC_EVENT(2) */
+
+#define ISER_MAX_TX_MISC_PDUS 6 /* NOOP_OUT(2), TEXT(1), *
+ * SCSI_TMFUNC(2), LOGOUT(1) */
+
+#define ISER_QP_MAX_RECV_DTOS (ISER_DEF_XMIT_CMDS_MAX)
+
+#define ISER_MIN_POSTED_RX (ISER_DEF_XMIT_CMDS_MAX >> 2)
+
+/* the max TX (send) WR supported by the iSER QP is defined by *
+ * max_send_wr = T * (1 + D) + C ; D is how many inflight dataouts we expect *
+ * to have at max per SCSI command. The tx posting & completion handling code *
+ * supports an -EAGAIN scheme where tx is suspended till the QP has room for more *
+ * send WR. D=8 comes from 64K/8K */
+
+#define ISER_INFLIGHT_DATAOUTS 8
+
+#define ISER_QP_MAX_REQ_DTOS (ISER_DEF_XMIT_CMDS_MAX * \
+ (1 + ISER_INFLIGHT_DATAOUTS) + \
+ ISER_MAX_TX_MISC_PDUS + \
+ ISER_MAX_RX_MISC_PDUS)
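+/* With the default ISER_DEF_XMIT_CMDS_MAX of 512 this evaluates to
+ * 512 * (1 + 8) + 6 + 4 = 4618 send work requests on the QP. */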
+
+/* Max registration work requests per command */
+#define ISER_MAX_REG_WR_PER_CMD 5
+
+/* For Signature we don't support DATAOUTs so no need to make room for them */
+#define ISER_QP_SIG_MAX_REQ_DTOS (ISER_DEF_XMIT_CMDS_MAX * \
+ (1 + ISER_MAX_REG_WR_PER_CMD) + \
+ ISER_MAX_TX_MISC_PDUS + \
+ ISER_MAX_RX_MISC_PDUS)
+
+#define ISER_GET_MAX_XMIT_CMDS(send_wr) ((send_wr \
+ - ISER_MAX_TX_MISC_PDUS \
+ - ISER_MAX_RX_MISC_PDUS) / \
+ (1 + ISER_INFLIGHT_DATAOUTS))
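+/* The inverse of the sizing above: given the send WR budget the device
+ * actually grants, recover how many SCSI commands can stay in flight. */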
+
+#define ISER_WC_BATCH_COUNT 16
+#define ISER_SIGNAL_CMD_COUNT 32
+
+#define ISER_VER 0x10
+#define ISER_WSV 0x08
+#define ISER_RSV 0x04
+
+#define ISER_FASTREG_LI_WRID 0xffffffffffffffffULL
+#define ISER_BEACON_WRID 0xfffffffffffffffeULL
+
+/**
+ * struct iser_hdr - iSER header
+ *
+ * @flags: flags support (zbva, remote_inv)
+ * @rsvd: reserved
+ * @write_stag: write rkey
+ * @write_va: write virtual address
+ * @read_stag: read rkey
+ * @read_va: read virtual address
+ */
+struct iser_hdr {
+ u8 flags;
+ u8 rsvd[3];
+ __be32 write_stag;
+ __be64 write_va;
+ __be32 read_stag;
+ __be64 read_va;
+} __attribute__((packed));
+
+
+#define ISER_ZBVA_NOT_SUPPORTED 0x80
+#define ISER_SEND_W_INV_NOT_SUPPORTED 0x40
+
+struct iser_cm_hdr {
+ u8 flags;
+ u8 rsvd[3];
+} __packed;
+
+/* Constant PDU lengths calculations */
+#define ISER_HEADERS_LEN (sizeof(struct iser_hdr) + sizeof(struct iscsi_hdr))
+
+#define ISER_RECV_DATA_SEG_LEN 128
+#define ISER_RX_PAYLOAD_SIZE (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN)
+#define ISER_RX_LOGIN_SIZE (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN)
+
+/* Length of an object name string */
+#define ISER_OBJECT_NAME_SIZE 64
+
+enum iser_conn_state {
+ ISER_CONN_INIT, /* descriptor allocd, no conn */
+ ISER_CONN_PENDING, /* in the process of being established */
+ ISER_CONN_UP, /* up and running */
+ ISER_CONN_TERMINATING, /* in the process of being terminated */
+ ISER_CONN_DOWN, /* shut down */
+ ISER_CONN_STATES_NUM
+};
+
+enum iser_task_status {
+ ISER_TASK_STATUS_INIT = 0,
+ ISER_TASK_STATUS_STARTED,
+ ISER_TASK_STATUS_COMPLETED
+};
+
+enum iser_data_dir {
+ ISER_DIR_IN = 0, /* to initiator */
+ ISER_DIR_OUT, /* from initiator */
+ ISER_DIRS_NUM
+};
+
+/**
+ * struct iser_data_buf - iSER data buffer
+ *
+ * @sg: pointer to the sg list
+ * @size: num entries of this sg
+ * @data_len: total buffer byte len
+ * @dma_nents: returned by dma_map_sg
+ * @orig_sg: pointer to the original sg list (in case
+ * we used a copy)
+ * @orig_size: num entries of orig sg list
+ */
+struct iser_data_buf {
+ struct scatterlist *sg;
+ unsigned int size;
+ unsigned long data_len;
+ unsigned int dma_nents;
+ struct scatterlist *orig_sg;
+ unsigned int orig_size;
+};
+
+/* fwd declarations */
+struct iser_device;
+struct iscsi_iser_task;
+struct iscsi_endpoint;
+
+/**
+ * struct iser_mem_reg - iSER memory registration info
+ *
+ * @sge: memory region sg element
+ * @rkey: memory region remote key
+ * @mem_h: pointer to registration context (FMR/Fastreg)
+ */
+struct iser_mem_reg {
+ struct ib_sge sge;
+ u32 rkey;
+ void *mem_h;
+};
+
+enum iser_desc_type {
+ ISCSI_TX_CONTROL,
+ ISCSI_TX_SCSI_COMMAND,
+ ISCSI_TX_DATAOUT
+};
+
+/**
+ * struct iser_tx_desc - iSER TX descriptor (for send wr_id)
+ *
+ * @iser_header: iser header
+ * @iscsi_header: iscsi header
+ * @type: command/control/dataout
+ * @dma_addr: header buffer dma address
+ * @tx_sg: sg[0] points to iser/iscsi headers
+ * sg[1] optionally points to either immediate data,
+ * unsolicited data-out or control
+ * @num_sge: number of sges used on this TX task
+ */
+struct iser_tx_desc {
+ struct iser_hdr iser_header;
+ struct iscsi_hdr iscsi_header;
+ enum iser_desc_type type;
+ u64 dma_addr;
+ struct ib_sge tx_sg[2];
+ int num_sge;
+};
+
+#define ISER_RX_PAD_SIZE (256 - (ISER_RX_PAYLOAD_SIZE + \
+ sizeof(u64) + sizeof(struct ib_sge)))
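+/* sized so that the packed struct iser_rx_desc below comes out to exactly 256 bytes */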
+/**
+ * struct iser_rx_desc - iSER RX descriptor (for recv wr_id)
+ *
+ * @iser_header: iser header
+ * @iscsi_header: iscsi header
+ * @data: received data segment
+ * @dma_addr: receive buffer dma address
+ * @rx_sg: ib_sge of receive buffer
+ * @pad: for sense data TODO: Modify to maximum sense length supported
+ */
+struct iser_rx_desc {
+ struct iser_hdr iser_header;
+ struct iscsi_hdr iscsi_header;
+ char data[ISER_RECV_DATA_SEG_LEN];
+ u64 dma_addr;
+ struct ib_sge rx_sg;
+ char pad[ISER_RX_PAD_SIZE];
+} __attribute__((packed));
+
+struct iser_conn;
+struct ib_conn;
+struct iscsi_iser_task;
+
+/**
+ * struct iser_comp - iSER completion context
+ *
+ * @device: pointer to device handle
+ * @cq: completion queue
+ * @wcs: work completion array
+ * @tasklet: Tasklet handle
+ * @active_qps: Number of active QPs attached
+ * to completion context
+ */
+struct iser_comp {
+ struct iser_device *device;
+ struct ib_cq *cq;
+ struct ib_wc wcs[ISER_WC_BATCH_COUNT];
+ struct tasklet_struct tasklet;
+ int active_qps;
+};
+
+/**
+ * struct iser_device - iSER device handle
+ *
+ * @ib_device: RDMA device
+ * @pd: Protection Domain for this device
+ * @dev_attr: Device attributes container
+ * @mr: Global DMA memory region
+ * @event_handler: IB events handle routine
+ * @ig_list: entry in devices list
+ * @refcount: Reference counter, dominated by open iser connections
+ * @comps_used: Number of completion contexts used, the minimum of online
+ * cpus and device max completion vectors
+ * @comps: Dynamically allocated array of completion contexts
+ * Memory registration pool function pointers (FMR or Fastreg):
+ * @iser_alloc_rdma_reg_res: Allocation of memory regions pool
+ * @iser_free_rdma_reg_res: Free of memory regions pool
+ * @iser_reg_rdma_mem: Memory registration routine
+ * @iser_unreg_rdma_mem: Memory deregistration routine
+ */
+struct iser_device {
+ struct ib_device *ib_device;
+ struct ib_pd *pd;
+ struct ib_device_attr dev_attr;
+ struct ib_mr *mr;
+ struct ib_event_handler event_handler;
+ struct list_head ig_list;
+ int refcount;
+ int comps_used;
+ struct iser_comp *comps;
+ int (*iser_alloc_rdma_reg_res)(struct ib_conn *ib_conn,
+ unsigned cmds_max);
+ void (*iser_free_rdma_reg_res)(struct ib_conn *ib_conn);
+ int (*iser_reg_rdma_mem)(struct iscsi_iser_task *iser_task,
+ enum iser_data_dir cmd_dir);
+ void (*iser_unreg_rdma_mem)(struct iscsi_iser_task *iser_task,
+ enum iser_data_dir cmd_dir);
+};
+
+#define ISER_CHECK_GUARD 0xc0
+#define ISER_CHECK_REFTAG 0x0f
+#define ISER_CHECK_APPTAG 0x30
+
+enum iser_reg_indicator {
+ ISER_DATA_KEY_VALID = 1 << 0,
+ ISER_PROT_KEY_VALID = 1 << 1,
+ ISER_SIG_KEY_VALID = 1 << 2,
+ ISER_FASTREG_PROTECTED = 1 << 3,
+};
+
+/**
+ * struct iser_pi_context - Protection information context
+ *
+ * @prot_mr: protection memory region
+ * @prot_frpl: protection fastreg page list
+ * @sig_mr: signature feature enabled memory region
+ */
+struct iser_pi_context {
+ struct ib_mr *prot_mr;
+ struct ib_fast_reg_page_list *prot_frpl;
+ struct ib_mr *sig_mr;
+};
+
+/**
+ * struct fast_reg_descriptor - Fast registration descriptor
+ *
+ * @list: entry in connection fastreg pool
+ * @data_mr: data memory region
+ * @data_frpl: data fastreg page list
+ * @pi_ctx: protection information context
+ * @reg_indicators: fast registration indicators
+ */
+struct fast_reg_descriptor {
+ struct list_head list;
+ struct ib_mr *data_mr;
+ struct ib_fast_reg_page_list *data_frpl;
+ struct iser_pi_context *pi_ctx;
+ u8 reg_indicators;
+};
+
+/**
+ * struct ib_conn - Infiniband related objects
+ *
+ * @cma_id: rdma_cm connection manager handle
+ * @qp: Connection Queue-pair
+ * @post_recv_buf_count: post receive counter
+ * @sig_count: send work request signal count
+ * @rx_wr: receive work request for batch posts
+ * @device: reference to iser device
+ * @comp: iser completion context
+ * @pi_support: Indicate device T10-PI support
+ * @beacon: beacon send wr to signal all flush errors were drained
+ * @flush_comp: completes when all connection completions consumed
+ * @lock: protects fmr/fastreg pool
+ * @union.fmr:
+ * @pool: FMR pool for fast registrations
+ * @page_vec: page vector to hold mapped commands pages
+ * used for registration
+ * @union.fastreg:
+ * @pool: Fast registration descriptors pool for fast
+ * registrations
+ * @pool_size: Size of pool
+ */
+struct ib_conn {
+ struct rdma_cm_id *cma_id;
+ struct ib_qp *qp;
+ int post_recv_buf_count;
+ u8 sig_count;
+ struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX];
+ struct iser_device *device;
+ struct iser_comp *comp;
+ bool pi_support;
+ struct ib_send_wr beacon;
+ struct completion flush_comp;
+ spinlock_t lock;
+ union {
+ struct {
+ struct ib_fmr_pool *pool;
+ struct iser_page_vec *page_vec;
+ } fmr;
+ struct {
+ struct list_head pool;
+ int pool_size;
+ } fastreg;
+ };
+};
+
+/**
+ * struct iser_conn - iSER connection context
+ *
+ * @ib_conn: connection RDMA resources
+ * @iscsi_conn: link to matching iscsi connection
+ * @ep: transport handle
+ * @state: connection logical state
+ * @qp_max_recv_dtos: maximum number of data outs, corresponds
+ * to max number of post recvs
+ * @qp_max_recv_dtos_mask: (qp_max_recv_dtos - 1)
+ * @min_posted_rx: (qp_max_recv_dtos >> 2)
+ * @max_cmds: maximum cmds allowed for this connection
+ * @name: connection peer portal
+ * @release_work: deferred work for release job
+ * @state_mutex: protects iser connection state
+ * @stop_completion: conn_stop completion
+ * @ib_completion: RDMA cleanup completion
+ * @up_completion: connection establishment completed
+ * (state is ISER_CONN_UP)
+ * @conn_list: entry in ig conn list
+ * @login_buf: login data buffer (stores login parameters)
+ * @login_req_buf: login request buffer
+ * @login_req_dma: login request buffer dma address
+ * @login_resp_buf: login response buffer
+ * @login_resp_dma: login response buffer dma address
+ * @rx_desc_head: head of rx_descs cyclic buffer
+ * @rx_descs: rx buffers array (cyclic buffer)
+ * @num_rx_descs: number of rx descriptors
+ */
+struct iser_conn {
+ struct ib_conn ib_conn;
+ struct iscsi_conn *iscsi_conn;
+ struct iscsi_endpoint *ep;
+ enum iser_conn_state state;
+ unsigned qp_max_recv_dtos;
+ unsigned qp_max_recv_dtos_mask;
+ unsigned min_posted_rx;
+ u16 max_cmds;
+ char name[ISER_OBJECT_NAME_SIZE];
+ struct work_struct release_work;
+ struct mutex state_mutex;
+ struct completion stop_completion;
+ struct completion ib_completion;
+ struct completion up_completion;
+ struct list_head conn_list;
+
+ char *login_buf;
+ char *login_req_buf, *login_resp_buf;
+ u64 login_req_dma, login_resp_dma;
+ unsigned int rx_desc_head;
+ struct iser_rx_desc *rx_descs;
+ u32 num_rx_descs;
+};
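+
+/*
+ * Illustrative example (hypothetical numbers): the receive bookkeeping
+ * above is driven by session->cmds_max, which must be a power of two.
+ * For a cmds_max of 128 the fields work out as qp_max_recv_dtos = 128,
+ * qp_max_recv_dtos_mask = 127 (used to wrap the cyclic rx_descs index)
+ * and min_posted_rx = 32, i.e. receive buffers are replenished in
+ * batches of a quarter of the queue depth.
+ */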
+
+/**
+ * struct iscsi_iser_task - iser task context
+ *
+ * @desc: TX descriptor
+ * @iser_conn: link to iser connection
+ * @status: current task status
+ * @sc: link to scsi command
+ * @command_sent: indicate if command was sent
+ * @dir: iser data direction
+ * @rdma_reg: task rdma registration desc
+ * @data: iser data buffer desc
+ * @prot: iser protection buffer desc
+ */
+struct iscsi_iser_task {
+ struct iser_tx_desc desc;
+ struct iser_conn *iser_conn;
+ enum iser_task_status status;
+ struct scsi_cmnd *sc;
+ int command_sent;
+ int dir[ISER_DIRS_NUM];
+ struct iser_mem_reg rdma_reg[ISER_DIRS_NUM];
+ struct iser_data_buf data[ISER_DIRS_NUM];
+ struct iser_data_buf prot[ISER_DIRS_NUM];
+};
+
+struct iser_page_vec {
+ u64 *pages;
+ int length;
+ int offset;
+ int data_size;
+};
+
+/**
+ * struct iser_global: iSER global context
+ *
+ * @device_list_mutex: protects device_list
+ * @device_list: iser devices global list
+ * @connlist_mutex: protects connlist
+ * @connlist: iser connections global list
+ * @desc_cache: kmem cache for tx dataout
+ */
+struct iser_global {
+ struct mutex device_list_mutex;
+ struct list_head device_list;
+ struct mutex connlist_mutex;
+ struct list_head connlist;
+ struct kmem_cache *desc_cache;
+};
+
+extern struct iser_global ig;
+extern int iser_debug_level;
+extern bool iser_pi_enable;
+extern int iser_pi_guard;
+
+int iser_send_control(struct iscsi_conn *conn,
+ struct iscsi_task *task);
+
+int iser_send_command(struct iscsi_conn *conn,
+ struct iscsi_task *task);
+
+int iser_send_data_out(struct iscsi_conn *conn,
+ struct iscsi_task *task,
+ struct iscsi_data *hdr);
+
+void iscsi_iser_recv(struct iscsi_conn *conn,
+ struct iscsi_hdr *hdr,
+ char *rx_data,
+ int rx_data_len);
+
+void iser_conn_init(struct iser_conn *iser_conn);
+
+void iser_conn_release(struct iser_conn *iser_conn);
+
+int iser_conn_terminate(struct iser_conn *iser_conn);
+
+void iser_release_work(struct work_struct *work);
+
+void iser_rcv_completion(struct iser_rx_desc *desc,
+ unsigned long dto_xfer_len,
+ struct ib_conn *ib_conn);
+
+void iser_snd_completion(struct iser_tx_desc *desc,
+ struct ib_conn *ib_conn);
+
+void iser_task_rdma_init(struct iscsi_iser_task *task);
+
+void iser_task_rdma_finalize(struct iscsi_iser_task *task);
+
+void iser_free_rx_descriptors(struct iser_conn *iser_conn);
+
+void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
+ struct iser_data_buf *mem,
+ enum iser_data_dir cmd_dir);
+
+int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *task,
+ enum iser_data_dir cmd_dir);
+int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *task,
+ enum iser_data_dir cmd_dir);
+
+int iser_connect(struct iser_conn *iser_conn,
+ struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ int non_blocking);
+
+void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task,
+ enum iser_data_dir cmd_dir);
+void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task,
+ enum iser_data_dir cmd_dir);
+
+int iser_post_recvl(struct iser_conn *iser_conn);
+int iser_post_recvm(struct iser_conn *iser_conn, int count);
+int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
+ bool signal);
+
+int iser_dma_map_task_data(struct iscsi_iser_task *iser_task,
+ struct iser_data_buf *data,
+ enum iser_data_dir iser_dir,
+ enum dma_data_direction dma_dir);
+
+void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task,
+ struct iser_data_buf *data,
+ enum dma_data_direction dir);
+
+int iser_initialize_task_headers(struct iscsi_task *task,
+ struct iser_tx_desc *tx_desc);
+int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
+ struct iscsi_session *session);
+int iser_create_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max);
+void iser_free_fmr_pool(struct ib_conn *ib_conn);
+int iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max);
+void iser_free_fastreg_pool(struct ib_conn *ib_conn);
+u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task,
+ enum iser_data_dir cmd_dir, sector_t *sector);
+struct fast_reg_descriptor *
+iser_reg_desc_get(struct ib_conn *ib_conn);
+void
+iser_reg_desc_put(struct ib_conn *ib_conn,
+ struct fast_reg_descriptor *desc);
+#endif
diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c
new file mode 100644
index 000000000..3e2118e8e
--- /dev/null
+++ b/drivers/infiniband/ulp/iser/iser_initiator.c
@@ -0,0 +1,728 @@
+/*
+ * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/scatterlist.h>
+#include <linux/kfifo.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_host.h>
+
+#include "iscsi_iser.h"
+
+/* Register user buffer memory and initialize passive rdma
+ * dto descriptor. Data size is stored in
+ * task->data[ISER_DIR_IN].data_len; protection size
+ * is stored in task->prot[ISER_DIR_IN].data_len
+ */
+static int iser_prepare_read_cmd(struct iscsi_task *task)
+{
+ struct iscsi_iser_task *iser_task = task->dd_data;
+ struct iser_device *device = iser_task->iser_conn->ib_conn.device;
+ struct iser_mem_reg *mem_reg;
+ int err;
+ struct iser_hdr *hdr = &iser_task->desc.iser_header;
+ struct iser_data_buf *buf_in = &iser_task->data[ISER_DIR_IN];
+
+ err = iser_dma_map_task_data(iser_task,
+ buf_in,
+ ISER_DIR_IN,
+ DMA_FROM_DEVICE);
+ if (err)
+ return err;
+
+ if (scsi_prot_sg_count(iser_task->sc)) {
+ struct iser_data_buf *pbuf_in = &iser_task->prot[ISER_DIR_IN];
+
+ err = iser_dma_map_task_data(iser_task,
+ pbuf_in,
+ ISER_DIR_IN,
+ DMA_FROM_DEVICE);
+ if (err)
+ return err;
+ }
+
+ err = device->iser_reg_rdma_mem(iser_task, ISER_DIR_IN);
+ if (err) {
+ iser_err("Failed to set up Data-IN RDMA\n");
+ return err;
+ }
+ mem_reg = &iser_task->rdma_reg[ISER_DIR_IN];
+
+ hdr->flags |= ISER_RSV;
+ hdr->read_stag = cpu_to_be32(mem_reg->rkey);
+ hdr->read_va = cpu_to_be64(mem_reg->sge.addr);
+
+ iser_dbg("Cmd itt:%d READ tags RKEY:%#.4X VA:%#llX\n",
+ task->itt, mem_reg->rkey,
+ (unsigned long long)mem_reg->sge.addr);
+
+ return 0;
+}
+
+/* Register user buffer memory and initialize passive rdma
+ * dto descriptor. Data size is stored in
+ * task->data[ISER_DIR_OUT].data_len; protection size
+ * is stored in task->prot[ISER_DIR_OUT].data_len
+ */
+static int
+iser_prepare_write_cmd(struct iscsi_task *task,
+ unsigned int imm_sz,
+ unsigned int unsol_sz,
+ unsigned int edtl)
+{
+ struct iscsi_iser_task *iser_task = task->dd_data;
+ struct iser_device *device = iser_task->iser_conn->ib_conn.device;
+ struct iser_mem_reg *mem_reg;
+ int err;
+ struct iser_hdr *hdr = &iser_task->desc.iser_header;
+ struct iser_data_buf *buf_out = &iser_task->data[ISER_DIR_OUT];
+ struct ib_sge *tx_dsg = &iser_task->desc.tx_sg[1];
+
+ err = iser_dma_map_task_data(iser_task,
+ buf_out,
+ ISER_DIR_OUT,
+ DMA_TO_DEVICE);
+ if (err)
+ return err;
+
+ if (scsi_prot_sg_count(iser_task->sc)) {
+ struct iser_data_buf *pbuf_out = &iser_task->prot[ISER_DIR_OUT];
+
+ err = iser_dma_map_task_data(iser_task,
+ pbuf_out,
+ ISER_DIR_OUT,
+ DMA_TO_DEVICE);
+ if (err)
+ return err;
+ }
+
+ err = device->iser_reg_rdma_mem(iser_task, ISER_DIR_OUT);
+ if (err != 0) {
+ iser_err("Failed to register write cmd RDMA mem\n");
+ return err;
+ }
+
+ mem_reg = &iser_task->rdma_reg[ISER_DIR_OUT];
+
+ if (unsol_sz < edtl) {
+ hdr->flags |= ISER_WSV;
+ hdr->write_stag = cpu_to_be32(mem_reg->rkey);
+ hdr->write_va = cpu_to_be64(mem_reg->sge.addr + unsol_sz);
+
+ iser_dbg("Cmd itt:%d, WRITE tags, RKEY:%#.4X "
+ "VA:%#llX + unsol:%d\n",
+ task->itt, mem_reg->rkey,
+ (unsigned long long)mem_reg->sge.addr, unsol_sz);
+ }
+
+ if (imm_sz > 0) {
+ iser_dbg("Cmd itt:%d, WRITE, adding imm.data sz: %d\n",
+ task->itt, imm_sz);
+ tx_dsg->addr = mem_reg->sge.addr;
+ tx_dsg->length = imm_sz;
+ tx_dsg->lkey = mem_reg->sge.lkey;
+ iser_task->desc.num_sge = 2;
+ }
+
+ return 0;
+}
+
+/* creates a new tx descriptor and adds header regd buffer */
+static void iser_create_send_desc(struct iser_conn *iser_conn,
+ struct iser_tx_desc *tx_desc)
+{
+ struct iser_device *device = iser_conn->ib_conn.device;
+
+ ib_dma_sync_single_for_cpu(device->ib_device,
+ tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE);
+
+ memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr));
+ tx_desc->iser_header.flags = ISER_VER;
+
+ tx_desc->num_sge = 1;
+
+ if (tx_desc->tx_sg[0].lkey != device->mr->lkey) {
+ tx_desc->tx_sg[0].lkey = device->mr->lkey;
+ iser_dbg("sdesc %p lkey mismatch, fixing\n", tx_desc);
+ }
+}
+
+static void iser_free_login_buf(struct iser_conn *iser_conn)
+{
+ struct iser_device *device = iser_conn->ib_conn.device;
+
+ if (!iser_conn->login_buf)
+ return;
+
+ if (iser_conn->login_req_dma)
+ ib_dma_unmap_single(device->ib_device,
+ iser_conn->login_req_dma,
+ ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE);
+
+ if (iser_conn->login_resp_dma)
+ ib_dma_unmap_single(device->ib_device,
+ iser_conn->login_resp_dma,
+ ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE);
+
+ kfree(iser_conn->login_buf);
+
+ /* make sure we never redo any unmapping */
+ iser_conn->login_req_dma = 0;
+ iser_conn->login_resp_dma = 0;
+ iser_conn->login_buf = NULL;
+}
+
+static int iser_alloc_login_buf(struct iser_conn *iser_conn)
+{
+ struct iser_device *device = iser_conn->ib_conn.device;
+ int req_err, resp_err;
+
+ BUG_ON(device == NULL);
+
+ iser_conn->login_buf = kmalloc(ISCSI_DEF_MAX_RECV_SEG_LEN +
+ ISER_RX_LOGIN_SIZE, GFP_KERNEL);
+ if (!iser_conn->login_buf)
+ goto out_err;
+
+ iser_conn->login_req_buf = iser_conn->login_buf;
+ iser_conn->login_resp_buf = iser_conn->login_buf +
+ ISCSI_DEF_MAX_RECV_SEG_LEN;
+
+ iser_conn->login_req_dma = ib_dma_map_single(device->ib_device,
+ iser_conn->login_req_buf,
+ ISCSI_DEF_MAX_RECV_SEG_LEN,
+ DMA_TO_DEVICE);
+
+ iser_conn->login_resp_dma = ib_dma_map_single(device->ib_device,
+ iser_conn->login_resp_buf,
+ ISER_RX_LOGIN_SIZE,
+ DMA_FROM_DEVICE);
+
+ req_err = ib_dma_mapping_error(device->ib_device,
+ iser_conn->login_req_dma);
+ resp_err = ib_dma_mapping_error(device->ib_device,
+ iser_conn->login_resp_dma);
+
+ if (req_err || resp_err) {
+ if (req_err)
+ iser_conn->login_req_dma = 0;
+ if (resp_err)
+ iser_conn->login_resp_dma = 0;
+ goto free_login_buf;
+ }
+ return 0;
+
+free_login_buf:
+ iser_free_login_buf(iser_conn);
+
+out_err:
+ iser_err("unable to alloc or map login buf\n");
+ return -ENOMEM;
+}
+
+int iser_alloc_rx_descriptors(struct iser_conn *iser_conn,
+ struct iscsi_session *session)
+{
+ int i, j;
+ u64 dma_addr;
+ struct iser_rx_desc *rx_desc;
+ struct ib_sge *rx_sg;
+ struct ib_conn *ib_conn = &iser_conn->ib_conn;
+ struct iser_device *device = ib_conn->device;
+
+ iser_conn->qp_max_recv_dtos = session->cmds_max;
+ iser_conn->qp_max_recv_dtos_mask = session->cmds_max - 1; /* cmds_max is 2^N */
+ iser_conn->min_posted_rx = iser_conn->qp_max_recv_dtos >> 2;
+
+ if (device->iser_alloc_rdma_reg_res(ib_conn, session->scsi_cmds_max))
+ goto create_rdma_reg_res_failed;
+
+ if (iser_alloc_login_buf(iser_conn))
+ goto alloc_login_buf_fail;
+
+ iser_conn->num_rx_descs = session->cmds_max;
+ iser_conn->rx_descs = kmalloc(iser_conn->num_rx_descs *
+ sizeof(struct iser_rx_desc), GFP_KERNEL);
+ if (!iser_conn->rx_descs)
+ goto rx_desc_alloc_fail;
+
+ rx_desc = iser_conn->rx_descs;
+
+ for (i = 0; i < iser_conn->qp_max_recv_dtos; i++, rx_desc++) {
+ dma_addr = ib_dma_map_single(device->ib_device, (void *)rx_desc,
+ ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(device->ib_device, dma_addr))
+ goto rx_desc_dma_map_failed;
+
+ rx_desc->dma_addr = dma_addr;
+
+ rx_sg = &rx_desc->rx_sg;
+ rx_sg->addr = rx_desc->dma_addr;
+ rx_sg->length = ISER_RX_PAYLOAD_SIZE;
+ rx_sg->lkey = device->mr->lkey;
+ }
+
+ iser_conn->rx_desc_head = 0;
+ return 0;
+
+rx_desc_dma_map_failed:
+ rx_desc = iser_conn->rx_descs;
+ for (j = 0; j < i; j++, rx_desc++)
+ ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr,
+ ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+ kfree(iser_conn->rx_descs);
+ iser_conn->rx_descs = NULL;
+rx_desc_alloc_fail:
+ iser_free_login_buf(iser_conn);
+alloc_login_buf_fail:
+ device->iser_free_rdma_reg_res(ib_conn);
+create_rdma_reg_res_failed:
+ iser_err("failed allocating rx descriptors / data buffers\n");
+ return -ENOMEM;
+}
+
+void iser_free_rx_descriptors(struct iser_conn *iser_conn)
+{
+ int i;
+ struct iser_rx_desc *rx_desc;
+ struct ib_conn *ib_conn = &iser_conn->ib_conn;
+ struct iser_device *device = ib_conn->device;
+
+ if (device->iser_free_rdma_reg_res)
+ device->iser_free_rdma_reg_res(ib_conn);
+
+ rx_desc = iser_conn->rx_descs;
+ for (i = 0; i < iser_conn->qp_max_recv_dtos; i++, rx_desc++)
+ ib_dma_unmap_single(device->ib_device, rx_desc->dma_addr,
+ ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+ kfree(iser_conn->rx_descs);
+ /* make sure we never redo any unmapping */
+ iser_conn->rx_descs = NULL;
+
+ iser_free_login_buf(iser_conn);
+}
+
+static int iser_post_rx_bufs(struct iscsi_conn *conn, struct iscsi_hdr *req)
+{
+ struct iser_conn *iser_conn = conn->dd_data;
+ struct ib_conn *ib_conn = &iser_conn->ib_conn;
+ struct iscsi_session *session = conn->session;
+
+ iser_dbg("req op %x flags %x\n", req->opcode, req->flags);
+ /* check if this is the last login - going to full feature phase */
+ if ((req->flags & ISCSI_FULL_FEATURE_PHASE) != ISCSI_FULL_FEATURE_PHASE)
+ return 0;
+
+ /*
+ * Check that there is one posted recv buffer
+ * (for the last login response).
+ */
+ WARN_ON(ib_conn->post_recv_buf_count != 1);
+
+ if (session->discovery_sess) {
+ iser_info("Discovery session, re-using login RX buffer\n");
+ return 0;
+ }
+
+ iser_info("Normal session, posting batch of RX %d buffers\n",
+ iser_conn->min_posted_rx);
+
+ /* Initial post receive buffers */
+ if (iser_post_recvm(iser_conn, iser_conn->min_posted_rx))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static inline bool iser_signal_comp(u8 sig_count)
+{
+ return ((sig_count % ISER_SIGNAL_CMD_COUNT) == 0);
+}
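+
+/*
+ * Note: sig_count is a u8 incremented once per command in
+ * iser_send_command() below; only every ISER_SIGNAL_CMD_COUNT-th
+ * command is posted as a signaled send, which amortizes TX completion
+ * processing.
+ */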
+
+/**
+ * iser_send_command - send command PDU
+ */
+int iser_send_command(struct iscsi_conn *conn,
+ struct iscsi_task *task)
+{
+ struct iser_conn *iser_conn = conn->dd_data;
+ struct iscsi_iser_task *iser_task = task->dd_data;
+ unsigned long edtl;
+ int err;
+ struct iser_data_buf *data_buf, *prot_buf;
+ struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)task->hdr;
+ struct scsi_cmnd *sc = task->sc;
+ struct iser_tx_desc *tx_desc = &iser_task->desc;
+ u8 sig_count = ++iser_conn->ib_conn.sig_count;
+
+ edtl = ntohl(hdr->data_length);
+
+ /* build the tx desc regd header and add it to the tx desc dto */
+ tx_desc->type = ISCSI_TX_SCSI_COMMAND;
+ iser_create_send_desc(iser_conn, tx_desc);
+
+ if (hdr->flags & ISCSI_FLAG_CMD_READ) {
+ data_buf = &iser_task->data[ISER_DIR_IN];
+ prot_buf = &iser_task->prot[ISER_DIR_IN];
+ } else {
+ data_buf = &iser_task->data[ISER_DIR_OUT];
+ prot_buf = &iser_task->prot[ISER_DIR_OUT];
+ }
+
+ if (scsi_sg_count(sc)) { /* using a scatter list */
+ data_buf->sg = scsi_sglist(sc);
+ data_buf->size = scsi_sg_count(sc);
+ }
+ data_buf->data_len = scsi_bufflen(sc);
+
+ if (scsi_prot_sg_count(sc)) {
+ prot_buf->sg = scsi_prot_sglist(sc);
+ prot_buf->size = scsi_prot_sg_count(sc);
+ prot_buf->data_len = (data_buf->data_len >>
+ ilog2(sc->device->sector_size)) * 8;
+ }
+
+ if (hdr->flags & ISCSI_FLAG_CMD_READ) {
+ err = iser_prepare_read_cmd(task);
+ if (err)
+ goto send_command_error;
+ }
+ if (hdr->flags & ISCSI_FLAG_CMD_WRITE) {
+ err = iser_prepare_write_cmd(task,
+ task->imm_count,
+ task->imm_count +
+ task->unsol_r2t.data_length,
+ edtl);
+ if (err)
+ goto send_command_error;
+ }
+
+ iser_task->status = ISER_TASK_STATUS_STARTED;
+
+ err = iser_post_send(&iser_conn->ib_conn, tx_desc,
+ iser_signal_comp(sig_count));
+ if (!err)
+ return 0;
+
+send_command_error:
+ iser_err("conn %p failed task->itt %d err %d\n", conn, task->itt, err);
+ return err;
+}
+
+/**
+ * iser_send_data_out - send data out PDU
+ */
+int iser_send_data_out(struct iscsi_conn *conn,
+ struct iscsi_task *task,
+ struct iscsi_data *hdr)
+{
+ struct iser_conn *iser_conn = conn->dd_data;
+ struct iscsi_iser_task *iser_task = task->dd_data;
+ struct iser_tx_desc *tx_desc = NULL;
+ struct iser_mem_reg *mem_reg;
+ unsigned long buf_offset;
+ unsigned long data_seg_len;
+ uint32_t itt;
+ int err = 0;
+ struct ib_sge *tx_dsg;
+
+ itt = (__force uint32_t)hdr->itt;
+ data_seg_len = ntoh24(hdr->dlength);
+ buf_offset = ntohl(hdr->offset);
+
+ iser_dbg("%s itt %d dseg_len %d offset %d\n",
+ __func__, (int)itt, (int)data_seg_len, (int)buf_offset);
+
+ tx_desc = kmem_cache_zalloc(ig.desc_cache, GFP_ATOMIC);
+ if (tx_desc == NULL) {
+ iser_err("Failed to alloc desc for post dataout\n");
+ return -ENOMEM;
+ }
+
+ tx_desc->type = ISCSI_TX_DATAOUT;
+ tx_desc->iser_header.flags = ISER_VER;
+ memcpy(&tx_desc->iscsi_header, hdr, sizeof(struct iscsi_hdr));
+
+ /* build the tx desc */
+ iser_initialize_task_headers(task, tx_desc);
+
+ mem_reg = &iser_task->rdma_reg[ISER_DIR_OUT];
+ tx_dsg = &tx_desc->tx_sg[1];
+ tx_dsg->addr = mem_reg->sge.addr + buf_offset;
+ tx_dsg->length = data_seg_len;
+ tx_dsg->lkey = mem_reg->sge.lkey;
+ tx_desc->num_sge = 2;
+
+ if (buf_offset + data_seg_len > iser_task->data[ISER_DIR_OUT].data_len) {
+ iser_err("Offset:%ld & DSL:%ld in Data-Out "
+ "inconsistent with total len:%ld, itt:%d\n",
+ buf_offset, data_seg_len,
+ iser_task->data[ISER_DIR_OUT].data_len, itt);
+ err = -EINVAL;
+ goto send_data_out_error;
+ }
+ iser_dbg("data-out itt: %d, offset: %ld, sz: %ld\n",
+ itt, buf_offset, data_seg_len);
+
+
+ err = iser_post_send(&iser_conn->ib_conn, tx_desc, true);
+ if (!err)
+ return 0;
+
+send_data_out_error:
+ kmem_cache_free(ig.desc_cache, tx_desc);
+ iser_err("conn %p failed err %d\n", conn, err);
+ return err;
+}
+
+int iser_send_control(struct iscsi_conn *conn,
+ struct iscsi_task *task)
+{
+ struct iser_conn *iser_conn = conn->dd_data;
+ struct iscsi_iser_task *iser_task = task->dd_data;
+ struct iser_tx_desc *mdesc = &iser_task->desc;
+ unsigned long data_seg_len;
+ int err = 0;
+ struct iser_device *device;
+
+ /* build the tx desc regd header and add it to the tx desc dto */
+ mdesc->type = ISCSI_TX_CONTROL;
+ iser_create_send_desc(iser_conn, mdesc);
+
+ device = iser_conn->ib_conn.device;
+
+ data_seg_len = ntoh24(task->hdr->dlength);
+
+ if (data_seg_len > 0) {
+ struct ib_sge *tx_dsg = &mdesc->tx_sg[1];
+ if (task != conn->login_task) {
+ iser_err("data present on non-login task\n");
+ goto send_control_error;
+ }
+
+ ib_dma_sync_single_for_cpu(device->ib_device,
+ iser_conn->login_req_dma, task->data_count,
+ DMA_TO_DEVICE);
+
+ memcpy(iser_conn->login_req_buf, task->data, task->data_count);
+
+ ib_dma_sync_single_for_device(device->ib_device,
+ iser_conn->login_req_dma, task->data_count,
+ DMA_TO_DEVICE);
+
+ tx_dsg->addr = iser_conn->login_req_dma;
+ tx_dsg->length = task->data_count;
+ tx_dsg->lkey = device->mr->lkey;
+ mdesc->num_sge = 2;
+ }
+
+ if (task == conn->login_task) {
+ iser_dbg("op %x dsl %lx, posting login rx buffer\n",
+ task->hdr->opcode, data_seg_len);
+ err = iser_post_recvl(iser_conn);
+ if (err)
+ goto send_control_error;
+ err = iser_post_rx_bufs(conn, task->hdr);
+ if (err)
+ goto send_control_error;
+ }
+
+ err = iser_post_send(&iser_conn->ib_conn, mdesc, true);
+ if (!err)
+ return 0;
+
+send_control_error:
+ iser_err("conn %p failed err %d\n", conn, err);
+ return err;
+}
+
+/**
+ * iser_rcv_completion - recv DTO completion
+ */
+void iser_rcv_completion(struct iser_rx_desc *rx_desc,
+ unsigned long rx_xfer_len,
+ struct ib_conn *ib_conn)
+{
+ struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
+ ib_conn);
+ struct iscsi_hdr *hdr;
+ u64 rx_dma;
+ int rx_buflen, outstanding, count, err;
+
+ /* differentiate between login to all other PDUs */
+ if ((char *)rx_desc == iser_conn->login_resp_buf) {
+ rx_dma = iser_conn->login_resp_dma;
+ rx_buflen = ISER_RX_LOGIN_SIZE;
+ } else {
+ rx_dma = rx_desc->dma_addr;
+ rx_buflen = ISER_RX_PAYLOAD_SIZE;
+ }
+
+ ib_dma_sync_single_for_cpu(ib_conn->device->ib_device, rx_dma,
+ rx_buflen, DMA_FROM_DEVICE);
+
+ hdr = &rx_desc->iscsi_header;
+
+ iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode,
+ hdr->itt, (int)(rx_xfer_len - ISER_HEADERS_LEN));
+
+ iscsi_iser_recv(iser_conn->iscsi_conn, hdr, rx_desc->data,
+ rx_xfer_len - ISER_HEADERS_LEN);
+
+ ib_dma_sync_single_for_device(ib_conn->device->ib_device, rx_dma,
+ rx_buflen, DMA_FROM_DEVICE);
+
+ /*
+ * Decrementing conn->post_recv_buf_count only --after-- freeing the
+ * task eliminates the need to worry about tasks which are completed
+ * in parallel to the execution of iser_conn_term. So the code that
+ * waits for the posted rx bufs refcount to become zero handles
+ * everything.
+ */
+ ib_conn->post_recv_buf_count--;
+
+ if (rx_dma == iser_conn->login_resp_dma)
+ return;
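+
+ /*
+ * Replenish the receive queue in min_posted_rx sized batches. As an
+ * illustration (hypothetical numbers): with qp_max_recv_dtos = 128
+ * and min_posted_rx = 32, new receive buffers are posted only once
+ * no more than 96 are still outstanding, and at most 32 at a time.
+ */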
+
+ outstanding = ib_conn->post_recv_buf_count;
+ if (outstanding + iser_conn->min_posted_rx <= iser_conn->qp_max_recv_dtos) {
+ count = min(iser_conn->qp_max_recv_dtos - outstanding,
+ iser_conn->min_posted_rx);
+ err = iser_post_recvm(iser_conn, count);
+ if (err)
+ iser_err("posting %d rx bufs err %d\n", count, err);
+ }
+}
+
+void iser_snd_completion(struct iser_tx_desc *tx_desc,
+ struct ib_conn *ib_conn)
+{
+ struct iscsi_task *task;
+ struct iser_device *device = ib_conn->device;
+
+ if (tx_desc->type == ISCSI_TX_DATAOUT) {
+ ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr,
+ ISER_HEADERS_LEN, DMA_TO_DEVICE);
+ kmem_cache_free(ig.desc_cache, tx_desc);
+ tx_desc = NULL;
+ }
+
+ if (tx_desc && tx_desc->type == ISCSI_TX_CONTROL) {
+ /* this arithmetic relies on the libiscsi dd_data allocation layout */
+ task = (void *) ((long)(void *)tx_desc -
+ sizeof(struct iscsi_task));
+ if (task->hdr->itt == RESERVED_ITT)
+ iscsi_put_task(task);
+ }
+}
+
+void iser_task_rdma_init(struct iscsi_iser_task *iser_task)
+{
+ iser_task->status = ISER_TASK_STATUS_INIT;
+
+ iser_task->dir[ISER_DIR_IN] = 0;
+ iser_task->dir[ISER_DIR_OUT] = 0;
+
+ iser_task->data[ISER_DIR_IN].data_len = 0;
+ iser_task->data[ISER_DIR_OUT].data_len = 0;
+
+ iser_task->prot[ISER_DIR_IN].data_len = 0;
+ iser_task->prot[ISER_DIR_OUT].data_len = 0;
+
+ memset(&iser_task->rdma_reg[ISER_DIR_IN], 0,
+ sizeof(struct iser_mem_reg));
+ memset(&iser_task->rdma_reg[ISER_DIR_OUT], 0,
+ sizeof(struct iser_mem_reg));
+}
+
+void iser_task_rdma_finalize(struct iscsi_iser_task *iser_task)
+{
+ struct iser_device *device = iser_task->iser_conn->ib_conn.device;
+ int is_rdma_data_aligned = 1;
+ int is_rdma_prot_aligned = 1;
+ int prot_count = scsi_prot_sg_count(iser_task->sc);
+
+ /* if we were reading, copy back to the unaligned sglist;
+ * in any case dma_unmap and free the copy
+ */
+ if (iser_task->data[ISER_DIR_IN].orig_sg) {
+ is_rdma_data_aligned = 0;
+ iser_finalize_rdma_unaligned_sg(iser_task,
+ &iser_task->data[ISER_DIR_IN],
+ ISER_DIR_IN);
+ }
+
+ if (iser_task->data[ISER_DIR_OUT].orig_sg) {
+ is_rdma_data_aligned = 0;
+ iser_finalize_rdma_unaligned_sg(iser_task,
+ &iser_task->data[ISER_DIR_OUT],
+ ISER_DIR_OUT);
+ }
+
+ if (iser_task->prot[ISER_DIR_IN].orig_sg) {
+ is_rdma_prot_aligned = 0;
+ iser_finalize_rdma_unaligned_sg(iser_task,
+ &iser_task->prot[ISER_DIR_IN],
+ ISER_DIR_IN);
+ }
+
+ if (iser_task->prot[ISER_DIR_OUT].orig_sg) {
+ is_rdma_prot_aligned = 0;
+ iser_finalize_rdma_unaligned_sg(iser_task,
+ &iser_task->prot[ISER_DIR_OUT],
+ ISER_DIR_OUT);
+ }
+
+ if (iser_task->dir[ISER_DIR_IN]) {
+ device->iser_unreg_rdma_mem(iser_task, ISER_DIR_IN);
+ if (is_rdma_data_aligned)
+ iser_dma_unmap_task_data(iser_task,
+ &iser_task->data[ISER_DIR_IN],
+ DMA_FROM_DEVICE);
+ if (prot_count && is_rdma_prot_aligned)
+ iser_dma_unmap_task_data(iser_task,
+ &iser_task->prot[ISER_DIR_IN],
+ DMA_FROM_DEVICE);
+ }
+
+ if (iser_task->dir[ISER_DIR_OUT]) {
+ device->iser_unreg_rdma_mem(iser_task, ISER_DIR_OUT);
+ if (is_rdma_data_aligned)
+ iser_dma_unmap_task_data(iser_task,
+ &iser_task->data[ISER_DIR_OUT],
+ DMA_TO_DEVICE);
+ if (prot_count && is_rdma_prot_aligned)
+ iser_dma_unmap_task_data(iser_task,
+ &iser_task->prot[ISER_DIR_OUT],
+ DMA_TO_DEVICE);
+ }
+}
diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c
new file mode 100644
index 000000000..f0cdc961e
--- /dev/null
+++ b/drivers/infiniband/ulp/iser/iser_memory.c
@@ -0,0 +1,859 @@
+/*
+ * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/scatterlist.h>
+
+#include "iscsi_iser.h"
+
+static void
+iser_free_bounce_sg(struct iser_data_buf *data)
+{
+ struct scatterlist *sg;
+ int count;
+
+ for_each_sg(data->sg, sg, data->size, count)
+ __free_page(sg_page(sg));
+
+ kfree(data->sg);
+
+ data->sg = data->orig_sg;
+ data->size = data->orig_size;
+ data->orig_sg = NULL;
+ data->orig_size = 0;
+}
+
+static int
+iser_alloc_bounce_sg(struct iser_data_buf *data)
+{
+ struct scatterlist *sg;
+ struct page *page;
+ unsigned long length = data->data_len;
+ int i = 0, nents = DIV_ROUND_UP(length, PAGE_SIZE);
+
+ sg = kcalloc(nents, sizeof(*sg), GFP_ATOMIC);
+ if (!sg)
+ goto err;
+
+ sg_init_table(sg, nents);
+ while (length) {
+ u32 page_len = min_t(u32, length, PAGE_SIZE);
+
+ page = alloc_page(GFP_ATOMIC);
+ if (!page)
+ goto err;
+
+ sg_set_page(&sg[i], page, page_len, 0);
+ length -= page_len;
+ i++;
+ }
+
+ data->orig_sg = data->sg;
+ data->orig_size = data->size;
+ data->sg = sg;
+ data->size = nents;
+
+ return 0;
+
+err:
+ for (; i > 0; i--)
+ __free_page(sg_page(&sg[i - 1]));
+ kfree(sg);
+
+ return -ENOMEM;
+}
+
+static void
+iser_copy_bounce(struct iser_data_buf *data, bool to_buffer)
+{
+ struct scatterlist *osg, *bsg = data->sg;
+ void *oaddr, *baddr;
+ unsigned int left = data->data_len;
+ unsigned int bsg_off = 0;
+ int i;
+
+ for_each_sg(data->orig_sg, osg, data->orig_size, i) {
+ unsigned int copy_len, osg_off = 0;
+
+ oaddr = kmap_atomic(sg_page(osg)) + osg->offset;
+ copy_len = min(left, osg->length);
+ while (copy_len) {
+ unsigned int len = min(copy_len, bsg->length - bsg_off);
+
+ baddr = kmap_atomic(sg_page(bsg)) + bsg->offset;
+ if (to_buffer)
+ memcpy(baddr + bsg_off, oaddr + osg_off, len);
+ else
+ memcpy(oaddr + osg_off, baddr + bsg_off, len);
+
+ kunmap_atomic(baddr - bsg->offset);
+ osg_off += len;
+ bsg_off += len;
+ copy_len -= len;
+
+ if (bsg_off >= bsg->length) {
+ bsg = sg_next(bsg);
+ bsg_off = 0;
+ }
+ }
+ kunmap_atomic(oaddr - osg->offset);
+ left -= osg_off;
+ }
+}
+
+static inline void
+iser_copy_from_bounce(struct iser_data_buf *data)
+{
+ iser_copy_bounce(data, false);
+}
+
+static inline void
+iser_copy_to_bounce(struct iser_data_buf *data)
+{
+ iser_copy_bounce(data, true);
+}
+
+struct fast_reg_descriptor *
+iser_reg_desc_get(struct ib_conn *ib_conn)
+{
+ struct fast_reg_descriptor *desc;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ib_conn->lock, flags);
+ desc = list_first_entry(&ib_conn->fastreg.pool,
+ struct fast_reg_descriptor, list);
+ list_del(&desc->list);
+ spin_unlock_irqrestore(&ib_conn->lock, flags);
+
+ return desc;
+}
+
+void
+iser_reg_desc_put(struct ib_conn *ib_conn,
+ struct fast_reg_descriptor *desc)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ib_conn->lock, flags);
+ list_add(&desc->list, &ib_conn->fastreg.pool);
+ spin_unlock_irqrestore(&ib_conn->lock, flags);
+}
+
+/**
+ * iser_start_rdma_unaligned_sg
+ */
+static int iser_start_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
+ struct iser_data_buf *data,
+ enum iser_data_dir cmd_dir)
+{
+ struct ib_device *dev = iser_task->iser_conn->ib_conn.device->ib_device;
+ int rc;
+
+ rc = iser_alloc_bounce_sg(data);
+ if (rc) {
+ iser_err("Failed to allocate bounce for data len %lu\n",
+ data->data_len);
+ return rc;
+ }
+
+ if (cmd_dir == ISER_DIR_OUT)
+ iser_copy_to_bounce(data);
+
+ data->dma_nents = ib_dma_map_sg(dev, data->sg, data->size,
+ (cmd_dir == ISER_DIR_OUT) ?
+ DMA_TO_DEVICE : DMA_FROM_DEVICE);
+ if (!data->dma_nents) {
+ iser_err("Got dma_nents %d, something went wrong...\n",
+ data->dma_nents);
+ rc = -ENOMEM;
+ goto err;
+ }
+
+ return 0;
+err:
+ iser_free_bounce_sg(data);
+ return rc;
+}
+
+/**
+ * iser_finalize_rdma_unaligned_sg
+ */
+
+void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task,
+ struct iser_data_buf *data,
+ enum iser_data_dir cmd_dir)
+{
+ struct ib_device *dev = iser_task->iser_conn->ib_conn.device->ib_device;
+
+ ib_dma_unmap_sg(dev, data->sg, data->size,
+ (cmd_dir == ISER_DIR_OUT) ?
+ DMA_TO_DEVICE : DMA_FROM_DEVICE);
+
+ if (cmd_dir == ISER_DIR_IN)
+ iser_copy_from_bounce(data);
+
+ iser_free_bounce_sg(data);
+}
+
+#define IS_4K_ALIGNED(addr) ((((unsigned long)addr) & ~MASK_4K) == 0)
+
+/**
+ * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses
+ * and returns the length of resulting physical address array (may be less than
+ * the original due to possible compaction).
+ *
+ * We build a "page vec" under the assumption that the SG meets the RDMA
+ * alignment requirements. Other than the first and last SG elements, all
+ * the "internal" elements can be compacted into a list whose elements are
+ * dma addresses of physical pages. The code also supports the case
+ * where a few fragments of the same page are present in the SG as
+ * consecutive elements, and it handles a single-entry SG.
+ */
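+/*
+ * Illustrative example (hypothetical addresses): two consecutive SG
+ * entries covering DMA ranges 0x1000-0x17ff and 0x1800-0x1fff are
+ * fragments of the same 4K page; the chunk is kept open across the
+ * unaligned boundary and a single page address, 0x1000, is emitted,
+ * so the resulting page vector can be shorter than the SG list.
+ */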
+
+static int iser_sg_to_page_vec(struct iser_data_buf *data,
+ struct ib_device *ibdev, u64 *pages,
+ int *offset, int *data_size)
+{
+ struct scatterlist *sg, *sgl = data->sg;
+ u64 start_addr, end_addr, page, chunk_start = 0;
+ unsigned long total_sz = 0;
+ unsigned int dma_len;
+ int i, new_chunk, cur_page, last_ent = data->dma_nents - 1;
+
+ /* compute the offset of first element */
+ *offset = (u64) sgl[0].offset & ~MASK_4K;
+
+ new_chunk = 1;
+ cur_page = 0;
+ for_each_sg(sgl, sg, data->dma_nents, i) {
+ start_addr = ib_sg_dma_address(ibdev, sg);
+ if (new_chunk)
+ chunk_start = start_addr;
+ dma_len = ib_sg_dma_len(ibdev, sg);
+ end_addr = start_addr + dma_len;
+ total_sz += dma_len;
+
+ /* collect page fragments until aligned or end of SG list */
+ if (!IS_4K_ALIGNED(end_addr) && i < last_ent) {
+ new_chunk = 0;
+ continue;
+ }
+ new_chunk = 1;
+
+ /* address of the first page in the contiguous chunk;
+ masking relevant for the very first SG entry,
+ which might be unaligned */
+ page = chunk_start & MASK_4K;
+ do {
+ pages[cur_page++] = page;
+ page += SIZE_4K;
+ } while (page < end_addr);
+ }
+
+ *data_size = total_sz;
+ iser_dbg("page_vec->data_size:%d cur_page %d\n",
+ *data_size, cur_page);
+ return cur_page;
+}
+
+
+/**
+ * iser_data_buf_aligned_len - Tries to determine the maximal sub-list of a
+ * scatter-gather list of memory buffers that is correctly aligned for RDMA,
+ * and returns the number of correctly aligned entries. Supports the case
+ * where consecutive SG elements are actually fragments of the same physical
+ * page.
+ */
+static int iser_data_buf_aligned_len(struct iser_data_buf *data,
+ struct ib_device *ibdev)
+{
+ struct scatterlist *sg, *sgl, *next_sg = NULL;
+ u64 start_addr, end_addr;
+ int i, ret_len, start_check = 0;
+
+ if (data->dma_nents == 1)
+ return 1;
+
+ sgl = data->sg;
+ start_addr = ib_sg_dma_address(ibdev, sgl);
+
+ for_each_sg(sgl, sg, data->dma_nents, i) {
+ if (start_check && !IS_4K_ALIGNED(start_addr))
+ break;
+
+ next_sg = sg_next(sg);
+ if (!next_sg)
+ break;
+
+ end_addr = start_addr + ib_sg_dma_len(ibdev, sg);
+ start_addr = ib_sg_dma_address(ibdev, next_sg);
+
+ if (end_addr == start_addr) {
+ start_check = 0;
+ continue;
+ } else
+ start_check = 1;
+
+ if (!IS_4K_ALIGNED(end_addr))
+ break;
+ }
+ ret_len = (next_sg) ? i : i+1;
+ iser_dbg("Found %d aligned entries out of %d in sg:0x%p\n",
+ ret_len, data->dma_nents, data);
+ return ret_len;
+}
+
+static void iser_data_buf_dump(struct iser_data_buf *data,
+ struct ib_device *ibdev)
+{
+ struct scatterlist *sg;
+ int i;
+
+ for_each_sg(data->sg, sg, data->dma_nents, i)
+ iser_dbg("sg[%d] dma_addr:0x%lX page:0x%p "
+ "off:0x%x sz:0x%x dma_len:0x%x\n",
+ i, (unsigned long)ib_sg_dma_address(ibdev, sg),
+ sg_page(sg), sg->offset,
+ sg->length, ib_sg_dma_len(ibdev, sg));
+}
+
+static void iser_dump_page_vec(struct iser_page_vec *page_vec)
+{
+ int i;
+
+ iser_err("page vec length %d data size %d\n",
+ page_vec->length, page_vec->data_size);
+ for (i = 0; i < page_vec->length; i++)
+ iser_err("%d %lx\n", i, (unsigned long)page_vec->pages[i]);
+}
+
+int iser_dma_map_task_data(struct iscsi_iser_task *iser_task,
+ struct iser_data_buf *data,
+ enum iser_data_dir iser_dir,
+ enum dma_data_direction dma_dir)
+{
+ struct ib_device *dev;
+
+ iser_task->dir[iser_dir] = 1;
+ dev = iser_task->iser_conn->ib_conn.device->ib_device;
+
+ data->dma_nents = ib_dma_map_sg(dev, data->sg, data->size, dma_dir);
+ if (data->dma_nents == 0) {
+ iser_err("dma_map_sg failed!!!\n");
+ return -EINVAL;
+ }
+ return 0;
+}
+
+void iser_dma_unmap_task_data(struct iscsi_iser_task *iser_task,
+ struct iser_data_buf *data,
+ enum dma_data_direction dir)
+{
+ struct ib_device *dev;
+
+ dev = iser_task->iser_conn->ib_conn.device->ib_device;
+ ib_dma_unmap_sg(dev, data->sg, data->size, dir);
+}
+
+static int
+iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem,
+ struct iser_mem_reg *reg)
+{
+ struct scatterlist *sg = mem->sg;
+
+ reg->sge.lkey = device->mr->lkey;
+ reg->rkey = device->mr->rkey;
+ reg->sge.addr = ib_sg_dma_address(device->ib_device, &sg[0]);
+ reg->sge.length = ib_sg_dma_len(device->ib_device, &sg[0]);
+
+ iser_dbg("Single DMA entry: lkey=0x%x, rkey=0x%x, addr=0x%llx,"
+ " length=0x%x\n", reg->sge.lkey, reg->rkey,
+ reg->sge.addr, reg->sge.length);
+
+ return 0;
+}
+
+static int fall_to_bounce_buf(struct iscsi_iser_task *iser_task,
+ struct iser_data_buf *mem,
+ enum iser_data_dir cmd_dir,
+ int aligned_len)
+{
+ struct iscsi_conn *iscsi_conn = iser_task->iser_conn->iscsi_conn;
+ struct iser_device *device = iser_task->iser_conn->ib_conn.device;
+
+ iscsi_conn->fmr_unalign_cnt++;
+ iser_warn("rdma alignment violation (%d/%d aligned) or FMR not supported\n",
+ aligned_len, mem->size);
+
+ if (iser_debug_level > 0)
+ iser_data_buf_dump(mem, device->ib_device);
+
+ /* unmap the command data before accessing it */
+ iser_dma_unmap_task_data(iser_task, mem,
+ (cmd_dir == ISER_DIR_OUT) ?
+ DMA_TO_DEVICE : DMA_FROM_DEVICE);
+
+ /* allocate a copy buffer; if we are writing, copy the
+ * unaligned scatterlist into it, then dma map the copy */
+ if (iser_start_rdma_unaligned_sg(iser_task, mem, cmd_dir) != 0)
+ return -ENOMEM;
+
+ return 0;
+}
+
+/**
+ * iser_reg_page_vec - Register physical memory
+ *
+ * returns: 0 on success, errno code on failure
+ */
+static
+int iser_reg_page_vec(struct iscsi_iser_task *iser_task,
+ struct iser_data_buf *mem,
+ struct iser_page_vec *page_vec,
+ struct iser_mem_reg *mem_reg)
+{
+ struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
+ struct iser_device *device = ib_conn->device;
+ struct ib_pool_fmr *fmr;
+ int ret, plen;
+
+ plen = iser_sg_to_page_vec(mem, device->ib_device,
+ page_vec->pages,
+ &page_vec->offset,
+ &page_vec->data_size);
+ page_vec->length = plen;
+ if (plen * SIZE_4K < page_vec->data_size) {
+ iser_err("page vec too short to hold this SG\n");
+ iser_data_buf_dump(mem, device->ib_device);
+ iser_dump_page_vec(page_vec);
+ return -EINVAL;
+ }
+
+ fmr = ib_fmr_pool_map_phys(ib_conn->fmr.pool,
+ page_vec->pages,
+ page_vec->length,
+ page_vec->pages[0]);
+ if (IS_ERR(fmr)) {
+ ret = PTR_ERR(fmr);
+ iser_err("ib_fmr_pool_map_phys failed: %d\n", ret);
+ return ret;
+ }
+
+ mem_reg->sge.lkey = fmr->fmr->lkey;
+ mem_reg->rkey = fmr->fmr->rkey;
+ mem_reg->sge.addr = page_vec->pages[0] + page_vec->offset;
+ mem_reg->sge.length = page_vec->data_size;
+ mem_reg->mem_h = fmr;
+
+ return 0;
+}
+
+/**
+ * Unregister memory previously registered using FMR.
+ * If the memory was not registered via FMR, does nothing.
+ */
+void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task,
+ enum iser_data_dir cmd_dir)
+{
+ struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir];
+ int ret;
+
+ if (!reg->mem_h)
+ return;
+
+ iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n", reg->mem_h);
+
+ ret = ib_fmr_pool_unmap((struct ib_pool_fmr *)reg->mem_h);
+ if (ret)
+ iser_err("ib_fmr_pool_unmap failed %d\n", ret);
+
+ reg->mem_h = NULL;
+}
+
+void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task,
+ enum iser_data_dir cmd_dir)
+{
+ struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir];
+
+ if (!reg->mem_h)
+ return;
+
+ iser_reg_desc_put(&iser_task->iser_conn->ib_conn,
+ reg->mem_h);
+ reg->mem_h = NULL;
+}
+
+/**
+ * iser_reg_rdma_mem_fmr - Registers memory intended for RDMA,
+ * using FMR (if possible) obtaining rkey and va
+ *
+ * returns 0 on success, errno code on failure
+ */
+int iser_reg_rdma_mem_fmr(struct iscsi_iser_task *iser_task,
+ enum iser_data_dir cmd_dir)
+{
+ struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
+ struct iser_device *device = ib_conn->device;
+ struct ib_device *ibdev = device->ib_device;
+ struct iser_data_buf *mem = &iser_task->data[cmd_dir];
+ struct iser_mem_reg *mem_reg;
+ int aligned_len;
+ int err;
+ int i;
+
+ mem_reg = &iser_task->rdma_reg[cmd_dir];
+
+ aligned_len = iser_data_buf_aligned_len(mem, ibdev);
+ if (aligned_len != mem->dma_nents) {
+ err = fall_to_bounce_buf(iser_task, mem,
+ cmd_dir, aligned_len);
+ if (err) {
+ iser_err("failed to allocate bounce buffer\n");
+ return err;
+ }
+ }
+
+ /* if there is a single dma entry, FMR is not needed */
+ if (mem->dma_nents == 1) {
+ return iser_reg_dma(device, mem, mem_reg);
+ } else { /* use FMR for multiple dma entries */
+ err = iser_reg_page_vec(iser_task, mem, ib_conn->fmr.page_vec,
+ mem_reg);
+ if (err && err != -EAGAIN) {
+ iser_data_buf_dump(mem, ibdev);
+ iser_err("mem->dma_nents = %d (dlength = 0x%x)\n",
+ mem->dma_nents,
+ ntoh24(iser_task->desc.iscsi_header.dlength));
+ iser_err("page_vec: data_size = 0x%x, length = %d, offset = 0x%x\n",
+ ib_conn->fmr.page_vec->data_size,
+ ib_conn->fmr.page_vec->length,
+ ib_conn->fmr.page_vec->offset);
+ for (i = 0; i < ib_conn->fmr.page_vec->length; i++)
+ iser_err("page_vec[%d] = 0x%llx\n", i,
+ (unsigned long long)ib_conn->fmr.page_vec->pages[i]);
+ }
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static void
+iser_set_dif_domain(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs,
+ struct ib_sig_domain *domain)
+{
+ domain->sig_type = IB_SIG_TYPE_T10_DIF;
+ domain->sig.dif.pi_interval = scsi_prot_interval(sc);
+ domain->sig.dif.ref_tag = scsi_prot_ref_tag(sc);
+ /*
+ * At the moment we hard code those, but in the future
+ * we will take them from sc.
+ */
+ domain->sig.dif.apptag_check_mask = 0xffff;
+ domain->sig.dif.app_escape = true;
+ domain->sig.dif.ref_escape = true;
+ if (sc->prot_flags & SCSI_PROT_REF_INCREMENT)
+ domain->sig.dif.ref_remap = true;
+}
+
+static int
+iser_set_sig_attrs(struct scsi_cmnd *sc, struct ib_sig_attrs *sig_attrs)
+{
+ switch (scsi_get_prot_op(sc)) {
+ case SCSI_PROT_WRITE_INSERT:
+ case SCSI_PROT_READ_STRIP:
+ sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE;
+ iser_set_dif_domain(sc, sig_attrs, &sig_attrs->wire);
+ sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC;
+ break;
+ case SCSI_PROT_READ_INSERT:
+ case SCSI_PROT_WRITE_STRIP:
+ sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE;
+ iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem);
+ sig_attrs->mem.sig.dif.bg_type = sc->prot_flags & SCSI_PROT_IP_CHECKSUM ?
+ IB_T10DIF_CSUM : IB_T10DIF_CRC;
+ break;
+ case SCSI_PROT_READ_PASS:
+ case SCSI_PROT_WRITE_PASS:
+ iser_set_dif_domain(sc, sig_attrs, &sig_attrs->wire);
+ sig_attrs->wire.sig.dif.bg_type = IB_T10DIF_CRC;
+ iser_set_dif_domain(sc, sig_attrs, &sig_attrs->mem);
+ sig_attrs->mem.sig.dif.bg_type = sc->prot_flags & SCSI_PROT_IP_CHECKSUM ?
+ IB_T10DIF_CSUM : IB_T10DIF_CRC;
+ break;
+ default:
+ iser_err("Unsupported PI operation %d\n",
+ scsi_get_prot_op(sc));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static inline void
+iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask)
+{
+ *mask = 0;
+ if (sc->prot_flags & SCSI_PROT_REF_CHECK)
+ *mask |= ISER_CHECK_REFTAG;
+ if (sc->prot_flags & SCSI_PROT_GUARD_CHECK)
+ *mask |= ISER_CHECK_GUARD;
+}
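+
+/*
+ * Illustrative note: with both SCSI_PROT_REF_CHECK and
+ * SCSI_PROT_GUARD_CHECK set, the mask built above is
+ * ISER_CHECK_REFTAG | ISER_CHECK_GUARD = 0x0f | 0xc0 = 0xcf; the
+ * application tag check (ISER_CHECK_APPTAG) is never requested here.
+ */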
+
+static void
+iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr)
+{
+ u32 rkey;
+
+ memset(inv_wr, 0, sizeof(*inv_wr));
+ inv_wr->opcode = IB_WR_LOCAL_INV;
+ inv_wr->wr_id = ISER_FASTREG_LI_WRID;
+ inv_wr->ex.invalidate_rkey = mr->rkey;
+
+ rkey = ib_inc_rkey(mr->rkey);
+ ib_update_fast_reg_key(mr, rkey);
+}
+
+static int
+iser_reg_sig_mr(struct iscsi_iser_task *iser_task,
+ struct fast_reg_descriptor *desc,
+ struct iser_mem_reg *data_reg,
+ struct iser_mem_reg *prot_reg,
+ struct iser_mem_reg *sig_reg)
+{
+ struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
+ struct iser_pi_context *pi_ctx = desc->pi_ctx;
+ struct ib_send_wr sig_wr, inv_wr;
+ struct ib_send_wr *bad_wr, *wr = NULL;
+ struct ib_sig_attrs sig_attrs;
+ int ret;
+
+ memset(&sig_attrs, 0, sizeof(sig_attrs));
+ ret = iser_set_sig_attrs(iser_task->sc, &sig_attrs);
+ if (ret)
+ goto err;
+
+ iser_set_prot_checks(iser_task->sc, &sig_attrs.check_mask);
+
+ if (!(desc->reg_indicators & ISER_SIG_KEY_VALID)) {
+ iser_inv_rkey(&inv_wr, pi_ctx->sig_mr);
+ wr = &inv_wr;
+ }
+
+ memset(&sig_wr, 0, sizeof(sig_wr));
+ sig_wr.opcode = IB_WR_REG_SIG_MR;
+ sig_wr.wr_id = ISER_FASTREG_LI_WRID;
+ sig_wr.sg_list = &data_reg->sge;
+ sig_wr.num_sge = 1;
+ sig_wr.wr.sig_handover.sig_attrs = &sig_attrs;
+ sig_wr.wr.sig_handover.sig_mr = pi_ctx->sig_mr;
+ if (scsi_prot_sg_count(iser_task->sc))
+ sig_wr.wr.sig_handover.prot = &prot_reg->sge;
+ sig_wr.wr.sig_handover.access_flags = IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE;
+
+ if (!wr)
+ wr = &sig_wr;
+ else
+ wr->next = &sig_wr;
+
+ ret = ib_post_send(ib_conn->qp, wr, &bad_wr);
+ if (ret) {
+ iser_err("reg_sig_mr failed, ret:%d\n", ret);
+ goto err;
+ }
+ desc->reg_indicators &= ~ISER_SIG_KEY_VALID;
+
+ sig_reg->sge.lkey = pi_ctx->sig_mr->lkey;
+ sig_reg->rkey = pi_ctx->sig_mr->rkey;
+ sig_reg->sge.addr = 0;
+ sig_reg->sge.length = scsi_transfer_length(iser_task->sc);
+
+ iser_dbg("sig_sge: lkey: 0x%x, rkey: 0x%x, addr: 0x%llx, length: %u\n",
+ sig_reg->sge.lkey, sig_reg->rkey, sig_reg->sge.addr,
+ sig_reg->sge.length);
+err:
+ return ret;
+}
+
+static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task,
+ struct iser_data_buf *mem,
+ struct fast_reg_descriptor *desc,
+ enum iser_reg_indicator ind,
+ struct iser_mem_reg *reg)
+{
+ struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
+ struct iser_device *device = ib_conn->device;
+ struct ib_mr *mr;
+ struct ib_fast_reg_page_list *frpl;
+ struct ib_send_wr fastreg_wr, inv_wr;
+ struct ib_send_wr *bad_wr, *wr = NULL;
+ int ret, offset, size, plen;
+
+ /* if there is a single dma entry, the dma mr suffices */
+ if (mem->dma_nents == 1)
+ return iser_reg_dma(device, mem, reg);
+
+ if (ind == ISER_DATA_KEY_VALID) {
+ mr = desc->data_mr;
+ frpl = desc->data_frpl;
+ } else {
+ mr = desc->pi_ctx->prot_mr;
+ frpl = desc->pi_ctx->prot_frpl;
+ }
+
+ plen = iser_sg_to_page_vec(mem, device->ib_device, frpl->page_list,
+ &offset, &size);
+ if (plen * SIZE_4K < size) {
+ iser_err("fast reg page_list too short to hold this SG\n");
+ return -EINVAL;
+ }
+
+ if (!(desc->reg_indicators & ind)) {
+ iser_inv_rkey(&inv_wr, mr);
+ wr = &inv_wr;
+ }
+
+ /* Prepare FASTREG WR */
+ memset(&fastreg_wr, 0, sizeof(fastreg_wr));
+ fastreg_wr.wr_id = ISER_FASTREG_LI_WRID;
+ fastreg_wr.opcode = IB_WR_FAST_REG_MR;
+ fastreg_wr.wr.fast_reg.iova_start = frpl->page_list[0] + offset;
+ fastreg_wr.wr.fast_reg.page_list = frpl;
+ fastreg_wr.wr.fast_reg.page_list_len = plen;
+ fastreg_wr.wr.fast_reg.page_shift = SHIFT_4K;
+ fastreg_wr.wr.fast_reg.length = size;
+ fastreg_wr.wr.fast_reg.rkey = mr->rkey;
+ fastreg_wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_READ);
+
+ if (!wr)
+ wr = &fastreg_wr;
+ else
+ wr->next = &fastreg_wr;
+
+ ret = ib_post_send(ib_conn->qp, wr, &bad_wr);
+ if (ret) {
+ iser_err("fast registration failed, ret:%d\n", ret);
+ return ret;
+ }
+ desc->reg_indicators &= ~ind;
+
+ reg->sge.lkey = mr->lkey;
+ reg->rkey = mr->rkey;
+ reg->sge.addr = frpl->page_list[0] + offset;
+ reg->sge.length = size;
+
+ return ret;
+}
+
+/**
+ * iser_reg_rdma_mem_fastreg - Registers memory intended for RDMA,
+ * using Fast Registration WR (if possible) obtaining rkey and va
+ *
+ * returns 0 on success, errno code on failure
+ */
+int iser_reg_rdma_mem_fastreg(struct iscsi_iser_task *iser_task,
+ enum iser_data_dir cmd_dir)
+{
+ struct ib_conn *ib_conn = &iser_task->iser_conn->ib_conn;
+ struct iser_device *device = ib_conn->device;
+ struct ib_device *ibdev = device->ib_device;
+ struct iser_data_buf *mem = &iser_task->data[cmd_dir];
+ struct iser_mem_reg *mem_reg = &iser_task->rdma_reg[cmd_dir];
+ struct fast_reg_descriptor *desc = NULL;
+ int err, aligned_len;
+
+ aligned_len = iser_data_buf_aligned_len(mem, ibdev);
+ if (aligned_len != mem->dma_nents) {
+ err = fall_to_bounce_buf(iser_task, mem,
+ cmd_dir, aligned_len);
+ if (err) {
+ iser_err("failed to allocate bounce buffer\n");
+ return err;
+ }
+ }
+
+ if (mem->dma_nents != 1 ||
+ scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) {
+ desc = iser_reg_desc_get(ib_conn);
+ mem_reg->mem_h = desc;
+ }
+
+ err = iser_fast_reg_mr(iser_task, mem, desc,
+ ISER_DATA_KEY_VALID, mem_reg);
+ if (err)
+ goto err_reg;
+
+ if (scsi_get_prot_op(iser_task->sc) != SCSI_PROT_NORMAL) {
+ struct iser_mem_reg prot_reg;
+
+ memset(&prot_reg, 0, sizeof(prot_reg));
+ if (scsi_prot_sg_count(iser_task->sc)) {
+ mem = &iser_task->prot[cmd_dir];
+ aligned_len = iser_data_buf_aligned_len(mem, ibdev);
+ if (aligned_len != mem->dma_nents) {
+ err = fall_to_bounce_buf(iser_task, mem,
+ cmd_dir, aligned_len);
+ if (err) {
+ iser_err("failed to allocate bounce buffer\n");
+ return err;
+ }
+ }
+
+ err = iser_fast_reg_mr(iser_task, mem, desc,
+ ISER_PROT_KEY_VALID, &prot_reg);
+ if (err)
+ goto err_reg;
+ }
+
+ err = iser_reg_sig_mr(iser_task, desc, mem_reg,
+ &prot_reg, mem_reg);
+ if (err) {
+ iser_err("Failed to register signature mr\n");
+ return err;
+ }
+ desc->reg_indicators |= ISER_FASTREG_PROTECTED;
+ }
+
+ return 0;
+err_reg:
+ if (desc)
+ iser_reg_desc_put(ib_conn, desc);
+
+ return err;
+}
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
new file mode 100644
index 000000000..cc2dd35ff
--- /dev/null
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -0,0 +1,1274 @@
+/*
+ * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved.
+ * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
+ * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/delay.h>
+
+#include "iscsi_iser.h"
+
+#define ISCSI_ISER_MAX_CONN 8
+#define ISER_MAX_RX_LEN (ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN)
+#define ISER_MAX_TX_LEN (ISER_QP_MAX_REQ_DTOS * ISCSI_ISER_MAX_CONN)
+#define ISER_MAX_CQ_LEN (ISER_MAX_RX_LEN + ISER_MAX_TX_LEN + \
+ ISCSI_ISER_MAX_CONN)
+
+static int iser_cq_poll_limit = 512;
+
+static void iser_cq_tasklet_fn(unsigned long data);
+static void iser_cq_callback(struct ib_cq *cq, void *cq_context);
+
+static void iser_cq_event_callback(struct ib_event *cause, void *context)
+{
+ iser_err("got cq event %d\n", cause->event);
+}
+
+static void iser_qp_event_callback(struct ib_event *cause, void *context)
+{
+ iser_err("got qp event %d\n", cause->event);
+}
+
+static void iser_event_handler(struct ib_event_handler *handler,
+ struct ib_event *event)
+{
+ iser_err("async event %d on device %s port %d\n", event->event,
+ event->device->name, event->element.port_num);
+}
+
+/**
+ * iser_create_device_ib_res - creates Protection Domain (PD), Completion
+ * Queue (CQ), DMA Memory Region (DMA MR) with the device associated with
+ * the adapter.
+ *
+ * returns 0 on success, -1 on failure
+ */
+static int iser_create_device_ib_res(struct iser_device *device)
+{
+ struct ib_device_attr *dev_attr = &device->dev_attr;
+ int ret, i, max_cqe;
+
+ ret = ib_query_device(device->ib_device, dev_attr);
+ if (ret) {
+ pr_warn("Query device failed for %s\n", device->ib_device->name);
+ return ret;
+ }
+
+ /* Assign function handles - based on FMR support */
+ if (device->ib_device->alloc_fmr && device->ib_device->dealloc_fmr &&
+ device->ib_device->map_phys_fmr && device->ib_device->unmap_fmr) {
+ iser_info("FMR supported, using FMR for registration\n");
+ device->iser_alloc_rdma_reg_res = iser_create_fmr_pool;
+ device->iser_free_rdma_reg_res = iser_free_fmr_pool;
+ device->iser_reg_rdma_mem = iser_reg_rdma_mem_fmr;
+ device->iser_unreg_rdma_mem = iser_unreg_mem_fmr;
+ } else if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
+ iser_info("FastReg supported, using FastReg for registration\n");
+ device->iser_alloc_rdma_reg_res = iser_create_fastreg_pool;
+ device->iser_free_rdma_reg_res = iser_free_fastreg_pool;
+ device->iser_reg_rdma_mem = iser_reg_rdma_mem_fastreg;
+ device->iser_unreg_rdma_mem = iser_unreg_mem_fastreg;
+ } else {
+ iser_err("IB device does not support FMRs nor FastRegs, can't register memory\n");
+ return -1;
+ }
+
+ device->comps_used = min_t(int, num_online_cpus(),
+ device->ib_device->num_comp_vectors);
+
+ device->comps = kcalloc(device->comps_used, sizeof(*device->comps),
+ GFP_KERNEL);
+ if (!device->comps)
+ goto comps_err;
+
+ max_cqe = min(ISER_MAX_CQ_LEN, dev_attr->max_cqe);
+
+ iser_info("using %d CQs, device %s supports %d vectors max_cqe %d\n",
+ device->comps_used, device->ib_device->name,
+ device->ib_device->num_comp_vectors, max_cqe);
+
+ device->pd = ib_alloc_pd(device->ib_device);
+ if (IS_ERR(device->pd))
+ goto pd_err;
+
+ for (i = 0; i < device->comps_used; i++) {
+ struct iser_comp *comp = &device->comps[i];
+
+ comp->device = device;
+ comp->cq = ib_create_cq(device->ib_device,
+ iser_cq_callback,
+ iser_cq_event_callback,
+ (void *)comp,
+ max_cqe, i);
+ if (IS_ERR(comp->cq)) {
+ comp->cq = NULL;
+ goto cq_err;
+ }
+
+ if (ib_req_notify_cq(comp->cq, IB_CQ_NEXT_COMP))
+ goto cq_err;
+
+ tasklet_init(&comp->tasklet, iser_cq_tasklet_fn,
+ (unsigned long)comp);
+ }
+
+ device->mr = ib_get_dma_mr(device->pd, IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_READ);
+ if (IS_ERR(device->mr))
+ goto dma_mr_err;
+
+ INIT_IB_EVENT_HANDLER(&device->event_handler, device->ib_device,
+ iser_event_handler);
+ if (ib_register_event_handler(&device->event_handler))
+ goto handler_err;
+
+ return 0;
+
+handler_err:
+ ib_dereg_mr(device->mr);
+dma_mr_err:
+ for (i = 0; i < device->comps_used; i++)
+ tasklet_kill(&device->comps[i].tasklet);
+cq_err:
+ for (i = 0; i < device->comps_used; i++) {
+ struct iser_comp *comp = &device->comps[i];
+
+ if (comp->cq)
+ ib_destroy_cq(comp->cq);
+ }
+ ib_dealloc_pd(device->pd);
+pd_err:
+ kfree(device->comps);
+comps_err:
+ iser_err("failed to allocate an IB resource\n");
+ return -1;
+}
+
+/**
+ * iser_free_device_ib_res - destroy/dealloc/dereg the DMA MR,
+ * CQ and PD created with the device associated with the adapter.
+ */
+static void iser_free_device_ib_res(struct iser_device *device)
+{
+ int i;
+ BUG_ON(device->mr == NULL);
+
+ for (i = 0; i < device->comps_used; i++) {
+ struct iser_comp *comp = &device->comps[i];
+
+ tasklet_kill(&comp->tasklet);
+ ib_destroy_cq(comp->cq);
+ comp->cq = NULL;
+ }
+
+ (void)ib_unregister_event_handler(&device->event_handler);
+ (void)ib_dereg_mr(device->mr);
+ (void)ib_dealloc_pd(device->pd);
+
+ kfree(device->comps);
+ device->comps = NULL;
+
+ device->mr = NULL;
+ device->pd = NULL;
+}
+
+/**
+ * iser_create_fmr_pool - Creates FMR pool and page_vector
+ *
+ * returns 0 on success, or errno code on failure
+ */
+int iser_create_fmr_pool(struct ib_conn *ib_conn, unsigned cmds_max)
+{
+ struct iser_device *device = ib_conn->device;
+ struct ib_fmr_pool_param params;
+ int ret = -ENOMEM;
+
+ ib_conn->fmr.page_vec = kmalloc(sizeof(*ib_conn->fmr.page_vec) +
+ (sizeof(u64)*(ISCSI_ISER_SG_TABLESIZE + 1)),
+ GFP_KERNEL);
+ if (!ib_conn->fmr.page_vec)
+ return ret;
+
+ ib_conn->fmr.page_vec->pages = (u64 *)(ib_conn->fmr.page_vec + 1);
+
+ params.page_shift = SHIFT_4K;
+ /* when the first/last SG elements are not start/end page
+ * aligned, the map would span N + 1 pages (see the sketch
+ * after this function) */
+ params.max_pages_per_fmr = ISCSI_ISER_SG_TABLESIZE + 1;
+ /* make the pool size twice the max number of SCSI commands
+ * the SCSI midlayer is expected to queue; watermark for
+ * unmap at 50% */
+ params.pool_size = cmds_max * 2;
+ params.dirty_watermark = cmds_max;
+ params.cache = 0;
+ params.flush_function = NULL;
+ params.access = (IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_READ);
+
+ ib_conn->fmr.pool = ib_create_fmr_pool(device->pd, &params);
+ if (!IS_ERR(ib_conn->fmr.pool))
+ return 0;
+
+ /* no FMR => no need for page_vec */
+ kfree(ib_conn->fmr.page_vec);
+ ib_conn->fmr.page_vec = NULL;
+
+ ret = PTR_ERR(ib_conn->fmr.pool);
+ ib_conn->fmr.pool = NULL;
+ if (ret != -ENOSYS) {
+ iser_err("FMR allocation failed, err %d\n", ret);
+ return ret;
+ } else {
+ iser_warn("FMRs are not supported, using unaligned mode\n");
+ return 0;
+ }
+}
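+
+/*
+ * A minimal sketch of the page-count arithmetic referenced in
+ * iser_create_fmr_pool() above (hypothetical helper, not used by the
+ * driver; assumes the SHIFT_4K/SIZE_4K/MASK_4K definitions from
+ * iscsi_iser.h): a buffer whose first/last SG elements are not page
+ * aligned can straddle one extra page, hence the "+ 1" above.
+ */
+static inline unsigned int iser_fmr_npages_sketch(u64 offset, u64 size)
+{
+ u64 first = offset & MASK_4K; /* round down to a 4K boundary */
+ u64 last = ALIGN(offset + size, SIZE_4K); /* round up to a 4K boundary */
+
+ return (last - first) >> SHIFT_4K; /* worst case: N + 1 pages */
+}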
+
+/**
+ * iser_free_fmr_pool - releases the FMR pool and page vec
+ */
+void iser_free_fmr_pool(struct ib_conn *ib_conn)
+{
+ iser_info("freeing conn %p fmr pool %p\n",
+ ib_conn, ib_conn->fmr.pool);
+
+ if (ib_conn->fmr.pool != NULL)
+ ib_destroy_fmr_pool(ib_conn->fmr.pool);
+
+ ib_conn->fmr.pool = NULL;
+
+ kfree(ib_conn->fmr.page_vec);
+ ib_conn->fmr.page_vec = NULL;
+}
+
+static int
+iser_alloc_pi_ctx(struct ib_device *ib_device, struct ib_pd *pd,
+ struct fast_reg_descriptor *desc)
+{
+ struct iser_pi_context *pi_ctx = NULL;
+ struct ib_mr_init_attr mr_init_attr = {.max_reg_descriptors = 2,
+ .flags = IB_MR_SIGNATURE_EN};
+ int ret = 0;
+
+ desc->pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL);
+ if (!desc->pi_ctx)
+ return -ENOMEM;
+
+ pi_ctx = desc->pi_ctx;
+
+ pi_ctx->prot_frpl = ib_alloc_fast_reg_page_list(ib_device,
+ ISCSI_ISER_SG_TABLESIZE);
+ if (IS_ERR(pi_ctx->prot_frpl)) {
+ ret = PTR_ERR(pi_ctx->prot_frpl);
+ goto prot_frpl_failure;
+ }
+
+ pi_ctx->prot_mr = ib_alloc_fast_reg_mr(pd,
+ ISCSI_ISER_SG_TABLESIZE + 1);
+ if (IS_ERR(pi_ctx->prot_mr)) {
+ ret = PTR_ERR(pi_ctx->prot_mr);
+ goto prot_mr_failure;
+ }
+ desc->reg_indicators |= ISER_PROT_KEY_VALID;
+
+ pi_ctx->sig_mr = ib_create_mr(pd, &mr_init_attr);
+ if (IS_ERR(pi_ctx->sig_mr)) {
+ ret = PTR_ERR(pi_ctx->sig_mr);
+ goto sig_mr_failure;
+ }
+ desc->reg_indicators |= ISER_SIG_KEY_VALID;
+ desc->reg_indicators &= ~ISER_FASTREG_PROTECTED;
+
+ return 0;
+
+sig_mr_failure:
+ ib_dereg_mr(desc->pi_ctx->prot_mr);
+prot_mr_failure:
+ ib_free_fast_reg_page_list(desc->pi_ctx->prot_frpl);
+prot_frpl_failure:
+ kfree(desc->pi_ctx);
+
+ return ret;
+}
+
+static void
+iser_free_pi_ctx(struct iser_pi_context *pi_ctx)
+{
+ ib_free_fast_reg_page_list(pi_ctx->prot_frpl);
+ ib_dereg_mr(pi_ctx->prot_mr);
+ ib_destroy_mr(pi_ctx->sig_mr);
+ kfree(pi_ctx);
+}
+
+static int
+iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd,
+ bool pi_enable, struct fast_reg_descriptor *desc)
+{
+ int ret;
+
+ desc->data_frpl = ib_alloc_fast_reg_page_list(ib_device,
+ ISCSI_ISER_SG_TABLESIZE + 1);
+ if (IS_ERR(desc->data_frpl)) {
+ ret = PTR_ERR(desc->data_frpl);
+ iser_err("Failed to allocate ib_fast_reg_page_list err=%d\n",
+ ret);
+ return ret;
+ }
+
+ desc->data_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE + 1);
+ if (IS_ERR(desc->data_mr)) {
+ ret = PTR_ERR(desc->data_mr);
+ iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret);
+ goto fast_reg_mr_failure;
+ }
+ desc->reg_indicators |= ISER_DATA_KEY_VALID;
+
+ if (pi_enable) {
+ ret = iser_alloc_pi_ctx(ib_device, pd, desc);
+ if (ret)
+ goto pi_ctx_alloc_failure;
+ }
+
+ return 0;
+pi_ctx_alloc_failure:
+ ib_dereg_mr(desc->data_mr);
+fast_reg_mr_failure:
+ ib_free_fast_reg_page_list(desc->data_frpl);
+
+ return ret;
+}
+
+/**
+ * iser_create_fastreg_pool - Creates pool of fast_reg descriptors
+ * for fast registration work requests.
+ * returns 0 on success, or errno code on failure
+ */
+int iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max)
+{
+ struct iser_device *device = ib_conn->device;
+ struct fast_reg_descriptor *desc;
+ int i, ret;
+
+ INIT_LIST_HEAD(&ib_conn->fastreg.pool);
+ ib_conn->fastreg.pool_size = 0;
+ for (i = 0; i < cmds_max; i++) {
+ desc = kzalloc(sizeof(*desc), GFP_KERNEL);
+ if (!desc) {
+ iser_err("Failed to allocate a new fast_reg descriptor\n");
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ret = iser_create_fastreg_desc(device->ib_device, device->pd,
+ ib_conn->pi_support, desc);
+ if (ret) {
+ iser_err("Failed to create fastreg descriptor err=%d\n",
+ ret);
+ kfree(desc);
+ goto err;
+ }
+
+ list_add_tail(&desc->list, &ib_conn->fastreg.pool);
+ ib_conn->fastreg.pool_size++;
+ }
+
+ return 0;
+
+err:
+ iser_free_fastreg_pool(ib_conn);
+ return ret;
+}
+
+/**
+ * iser_free_fastreg_pool - releases the pool of fast_reg descriptors
+ */
+void iser_free_fastreg_pool(struct ib_conn *ib_conn)
+{
+ struct fast_reg_descriptor *desc, *tmp;
+ int i = 0;
+
+ if (list_empty(&ib_conn->fastreg.pool))
+ return;
+
+ iser_info("freeing conn %p fr pool\n", ib_conn);
+
+ list_for_each_entry_safe(desc, tmp, &ib_conn->fastreg.pool, list) {
+ list_del(&desc->list);
+ ib_free_fast_reg_page_list(desc->data_frpl);
+ ib_dereg_mr(desc->data_mr);
+ if (desc->pi_ctx)
+ iser_free_pi_ctx(desc->pi_ctx);
+ kfree(desc);
+ ++i;
+ }
+
+ if (i < ib_conn->fastreg.pool_size)
+ iser_warn("pool still has %d regions registered\n",
+ ib_conn->fastreg.pool_size - i);
+}
+
+/**
+ * iser_create_ib_conn_res - creates the Queue-Pair (QP)
+ *
+ * returns 0 on success, errno code on failure
+ */
+static int iser_create_ib_conn_res(struct ib_conn *ib_conn)
+{
+ struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
+ ib_conn);
+ struct iser_device *device;
+ struct ib_device_attr *dev_attr;
+ struct ib_qp_init_attr init_attr;
+ int ret = -ENOMEM;
+ int index, min_index = 0;
+
+ BUG_ON(ib_conn->device == NULL);
+
+ device = ib_conn->device;
+ dev_attr = &device->dev_attr;
+
+ memset(&init_attr, 0, sizeof init_attr);
+
+ mutex_lock(&ig.connlist_mutex);
+ /*
+ * Select the completion context with the fewest active QPs
+ * (a standalone sketch of this scan follows this function).
+ */
+ for (index = 0; index < device->comps_used; index++) {
+ if (device->comps[index].active_qps <
+ device->comps[min_index].active_qps)
+ min_index = index;
+ }
+ ib_conn->comp = &device->comps[min_index];
+ ib_conn->comp->active_qps++;
+ mutex_unlock(&ig.connlist_mutex);
+ iser_info("cq index %d used for ib_conn %p\n", min_index, ib_conn);
+
+ init_attr.event_handler = iser_qp_event_callback;
+ init_attr.qp_context = (void *)ib_conn;
+ init_attr.send_cq = ib_conn->comp->cq;
+ init_attr.recv_cq = ib_conn->comp->cq;
+ init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS;
+ init_attr.cap.max_send_sge = 2;
+ init_attr.cap.max_recv_sge = 1;
+ init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+ init_attr.qp_type = IB_QPT_RC;
+ if (ib_conn->pi_support) {
+ init_attr.cap.max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS + 1;
+ init_attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN;
+ iser_conn->max_cmds =
+ ISER_GET_MAX_XMIT_CMDS(ISER_QP_SIG_MAX_REQ_DTOS);
+ } else {
+ if (dev_attr->max_qp_wr > ISER_QP_MAX_REQ_DTOS) {
+ init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS + 1;
+ iser_conn->max_cmds =
+ ISER_GET_MAX_XMIT_CMDS(ISER_QP_MAX_REQ_DTOS);
+ } else {
+ init_attr.cap.max_send_wr = dev_attr->max_qp_wr;
+ iser_conn->max_cmds =
+ ISER_GET_MAX_XMIT_CMDS(dev_attr->max_qp_wr);
+ iser_dbg("device %s supports max_send_wr %d\n",
+ device->ib_device->name, dev_attr->max_qp_wr);
+ }
+ }
+
+ ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr);
+ if (ret)
+ goto out_err;
+
+ ib_conn->qp = ib_conn->cma_id->qp;
+ iser_info("setting conn %p cma_id %p qp %p\n",
+ ib_conn, ib_conn->cma_id,
+ ib_conn->cma_id->qp);
+ return ret;
+
+out_err:
+ mutex_lock(&ig.connlist_mutex);
+ ib_conn->comp->active_qps--;
+ mutex_unlock(&ig.connlist_mutex);
+ iser_err("unable to alloc mem or create resource, err %d\n", ret);
+
+ return ret;
+}
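+
+/*
+ * A standalone sketch of the "least used completion context" scan done
+ * in iser_create_ib_conn_res() above (hypothetical helper; the driver
+ * open-codes the same loop under ig.connlist_mutex):
+ */
+static inline struct iser_comp *
+iser_least_used_comp_sketch(struct iser_device *device)
+{
+ int index, min_index = 0;
+
+ for (index = 0; index < device->comps_used; index++)
+ if (device->comps[index].active_qps <
+ device->comps[min_index].active_qps)
+ min_index = index;
+
+ return &device->comps[min_index];
+}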
+
+/**
+ * Based on the resolved device node GUID, check whether an iser device
+ * has already been allocated for it. If there is no such device, create one.
+ */
+static
+struct iser_device *iser_device_find_by_ib_device(struct rdma_cm_id *cma_id)
+{
+ struct iser_device *device;
+
+ mutex_lock(&ig.device_list_mutex);
+
+ list_for_each_entry(device, &ig.device_list, ig_list)
+ /* find if there's a match using the node GUID */
+ if (device->ib_device->node_guid == cma_id->device->node_guid)
+ goto inc_refcnt;
+
+ device = kzalloc(sizeof *device, GFP_KERNEL);
+ if (device == NULL)
+ goto out;
+
+ /* associate the new iser device with the IB device */
+ device->ib_device = cma_id->device;
+ /* init the device and link it into ig device list */
+ if (iser_create_device_ib_res(device)) {
+ kfree(device);
+ device = NULL;
+ goto out;
+ }
+ list_add(&device->ig_list, &ig.device_list);
+
+inc_refcnt:
+ device->refcount++;
+out:
+ mutex_unlock(&ig.device_list_mutex);
+ return device;
+}
+
+/* if there's no demand for this device, release it */
+static void iser_device_try_release(struct iser_device *device)
+{
+ mutex_lock(&ig.device_list_mutex);
+ device->refcount--;
+ iser_info("device %p refcount %d\n", device, device->refcount);
+ if (!device->refcount) {
+ iser_free_device_ib_res(device);
+ list_del(&device->ig_list);
+ kfree(device);
+ }
+ mutex_unlock(&ig.device_list_mutex);
+}
+
+/**
+ * Called with state mutex held
+ **/
+static int iser_conn_state_comp_exch(struct iser_conn *iser_conn,
+ enum iser_conn_state comp,
+ enum iser_conn_state exch)
+{
+ int ret;
+
+ ret = (iser_conn->state == comp);
+ if (ret)
+ iser_conn->state = exch;
+
+ return ret;
+}
+
+void iser_release_work(struct work_struct *work)
+{
+ struct iser_conn *iser_conn;
+
+ iser_conn = container_of(work, struct iser_conn, release_work);
+
+ /* Wait for conn_stop to complete */
+ wait_for_completion(&iser_conn->stop_completion);
+ /* Wait for IB resources cleanup to complete */
+ wait_for_completion(&iser_conn->ib_completion);
+
+ mutex_lock(&iser_conn->state_mutex);
+ iser_conn->state = ISER_CONN_DOWN;
+ mutex_unlock(&iser_conn->state_mutex);
+
+ iser_conn_release(iser_conn);
+}
+
+/**
+ * iser_free_ib_conn_res - release IB related resources
+ * @iser_conn: iser connection struct
+ * @destroy: indicator if we need to try to release the
+ * iser device and memory regions pool (only iscsi
+ * shutdown and DEVICE_REMOVAL will use this).
+ *
+ * This routine is called with the iser state mutex held
+ * so the cm_id removal is handled outside of it. It is safe to
+ * be invoked multiple times.
+ */
+static void iser_free_ib_conn_res(struct iser_conn *iser_conn,
+ bool destroy)
+{
+ struct ib_conn *ib_conn = &iser_conn->ib_conn;
+ struct iser_device *device = ib_conn->device;
+
+ iser_info("freeing conn %p cma_id %p qp %p\n",
+ iser_conn, ib_conn->cma_id, ib_conn->qp);
+
+ if (ib_conn->qp != NULL) {
+ ib_conn->comp->active_qps--;
+ rdma_destroy_qp(ib_conn->cma_id);
+ ib_conn->qp = NULL;
+ }
+
+ if (destroy) {
+ if (iser_conn->rx_descs)
+ iser_free_rx_descriptors(iser_conn);
+
+ if (device != NULL) {
+ iser_device_try_release(device);
+ ib_conn->device = NULL;
+ }
+ }
+}
+
+/**
+ * Frees all conn objects and deallocates the conn descriptor
+ */
+void iser_conn_release(struct iser_conn *iser_conn)
+{
+ struct ib_conn *ib_conn = &iser_conn->ib_conn;
+
+ mutex_lock(&ig.connlist_mutex);
+ list_del(&iser_conn->conn_list);
+ mutex_unlock(&ig.connlist_mutex);
+
+ mutex_lock(&iser_conn->state_mutex);
+ /* In case we end up here without ep_disconnect being invoked. */
+ if (iser_conn->state != ISER_CONN_DOWN) {
+ iser_warn("iser conn %p state %d, expected state down.\n",
+ iser_conn, iser_conn->state);
+ iscsi_destroy_endpoint(iser_conn->ep);
+ iser_conn->state = ISER_CONN_DOWN;
+ }
+ /*
+ * In case we never got to bind stage, we still need to
+ * release IB resources (which is safe to call more than once).
+ */
+ iser_free_ib_conn_res(iser_conn, true);
+ mutex_unlock(&iser_conn->state_mutex);
+
+ if (ib_conn->cma_id != NULL) {
+ rdma_destroy_id(ib_conn->cma_id);
+ ib_conn->cma_id = NULL;
+ }
+
+ kfree(iser_conn);
+}
+
+/**
+ * triggers the start of the disconnect procedures and waits for them to be done
+ * Called with state mutex held
+ */
+int iser_conn_terminate(struct iser_conn *iser_conn)
+{
+ struct ib_conn *ib_conn = &iser_conn->ib_conn;
+ struct ib_send_wr *bad_wr;
+ int err = 0;
+
+ /* terminate the iser conn only if the conn state is UP */
+ if (!iser_conn_state_comp_exch(iser_conn, ISER_CONN_UP,
+ ISER_CONN_TERMINATING))
+ return 0;
+
+ iser_info("iser_conn %p state %d\n", iser_conn, iser_conn->state);
+
+ /* suspend queuing of new iscsi commands */
+ if (iser_conn->iscsi_conn)
+ iscsi_suspend_queue(iser_conn->iscsi_conn);
+
+ /*
+ * In case we didn't already clean up the cma_id (peer initiated
+ * a disconnection), we need to cause the CMA to change the QP
+ * state to ERROR.
+ */
+ if (ib_conn->cma_id) {
+ err = rdma_disconnect(ib_conn->cma_id);
+ if (err)
+ iser_err("Failed to disconnect, conn: 0x%p err %d\n",
+ iser_conn, err);
+
+ /* post an indication that all flush errors were consumed */
+ err = ib_post_send(ib_conn->qp, &ib_conn->beacon, &bad_wr);
+ if (err) {
+ iser_err("conn %p failed to post beacon", ib_conn);
+ return 1;
+ }
+
+ wait_for_completion(&ib_conn->flush_comp);
+ }
+
+ return 1;
+}
+
+/**
+ * Called with state mutex held
+ **/
+static void iser_connect_error(struct rdma_cm_id *cma_id)
+{
+ struct iser_conn *iser_conn;
+
+ iser_conn = (struct iser_conn *)cma_id->context;
+ iser_conn->state = ISER_CONN_TERMINATING;
+}
+
+/**
+ * Called with state mutex held
+ **/
+static void iser_addr_handler(struct rdma_cm_id *cma_id)
+{
+ struct iser_device *device;
+ struct iser_conn *iser_conn;
+ struct ib_conn *ib_conn;
+ int ret;
+
+ iser_conn = (struct iser_conn *)cma_id->context;
+ if (iser_conn->state != ISER_CONN_PENDING)
+ /* bailout */
+ return;
+
+ ib_conn = &iser_conn->ib_conn;
+ device = iser_device_find_by_ib_device(cma_id);
+ if (!device) {
+ iser_err("device lookup/creation failed\n");
+ iser_connect_error(cma_id);
+ return;
+ }
+
+ ib_conn->device = device;
+
+ /* connection T10-PI support */
+ if (iser_pi_enable) {
+ if (!(device->dev_attr.device_cap_flags &
+ IB_DEVICE_SIGNATURE_HANDOVER)) {
+ iser_warn("T10-PI requested but not supported on %s, "
+ "continue without T10-PI\n",
+ ib_conn->device->ib_device->name);
+ ib_conn->pi_support = false;
+ } else {
+ ib_conn->pi_support = true;
+ }
+ }
+
+ ret = rdma_resolve_route(cma_id, 1000);
+ if (ret) {
+ iser_err("resolve route failed: %d\n", ret);
+ iser_connect_error(cma_id);
+ return;
+ }
+}
+
+/**
+ * Called with state mutex held
+ **/
+static void iser_route_handler(struct rdma_cm_id *cma_id)
+{
+ struct rdma_conn_param conn_param;
+ int ret;
+ struct iser_cm_hdr req_hdr;
+ struct iser_conn *iser_conn = (struct iser_conn *)cma_id->context;
+ struct ib_conn *ib_conn = &iser_conn->ib_conn;
+ struct iser_device *device = ib_conn->device;
+
+ if (iser_conn->state != ISER_CONN_PENDING)
+ /* bailout */
+ return;
+
+ ret = iser_create_ib_conn_res(ib_conn);
+ if (ret)
+ goto failure;
+
+ memset(&conn_param, 0, sizeof conn_param);
+ conn_param.responder_resources = device->dev_attr.max_qp_rd_atom;
+ conn_param.initiator_depth = 1;
+ conn_param.retry_count = 7;
+ conn_param.rnr_retry_count = 6;
+
+ memset(&req_hdr, 0, sizeof(req_hdr));
+ req_hdr.flags = (ISER_ZBVA_NOT_SUPPORTED |
+ ISER_SEND_W_INV_NOT_SUPPORTED);
+ conn_param.private_data = (void *)&req_hdr;
+ conn_param.private_data_len = sizeof(struct iser_cm_hdr);
+
+ ret = rdma_connect(cma_id, &conn_param);
+ if (ret) {
+ iser_err("failure connecting: %d\n", ret);
+ goto failure;
+ }
+
+ return;
+failure:
+ iser_connect_error(cma_id);
+}
+
+static void iser_connected_handler(struct rdma_cm_id *cma_id)
+{
+ struct iser_conn *iser_conn;
+ struct ib_qp_attr attr;
+ struct ib_qp_init_attr init_attr;
+
+ iser_conn = (struct iser_conn *)cma_id->context;
+ if (iser_conn->state != ISER_CONN_PENDING)
+ /* bailout */
+ return;
+
+ (void)ib_query_qp(cma_id->qp, &attr, ~0, &init_attr);
+ iser_info("remote qpn:%x my qpn:%x\n", attr.dest_qp_num, cma_id->qp->qp_num);
+
+ iser_conn->state = ISER_CONN_UP;
+ complete(&iser_conn->up_completion);
+}
+
+static void iser_disconnected_handler(struct rdma_cm_id *cma_id)
+{
+ struct iser_conn *iser_conn = (struct iser_conn *)cma_id->context;
+
+ if (iser_conn_terminate(iser_conn)) {
+ if (iser_conn->iscsi_conn)
+ iscsi_conn_failure(iser_conn->iscsi_conn,
+ ISCSI_ERR_CONN_FAILED);
+ else
+ iser_err("iscsi_iser connection isn't bound\n");
+ }
+}
+
+static void iser_cleanup_handler(struct rdma_cm_id *cma_id,
+ bool destroy)
+{
+ struct iser_conn *iser_conn = (struct iser_conn *)cma_id->context;
+
+ /*
+ * We are not guaranteed that we visited disconnected_handler
+ * by now, call it here to be safe that we handle CM drep
+ * and flush errors.
+ */
+ iser_disconnected_handler(cma_id);
+ iser_free_ib_conn_res(iser_conn, destroy);
+ complete(&iser_conn->ib_completion);
+};
+
+static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
+{
+ struct iser_conn *iser_conn;
+ int ret = 0;
+
+ iser_conn = (struct iser_conn *)cma_id->context;
+ iser_info("event %d status %d conn %p id %p\n",
+ event->event, event->status, cma_id->context, cma_id);
+
+ mutex_lock(&iser_conn->state_mutex);
+ switch (event->event) {
+ case RDMA_CM_EVENT_ADDR_RESOLVED:
+ iser_addr_handler(cma_id);
+ break;
+ case RDMA_CM_EVENT_ROUTE_RESOLVED:
+ iser_route_handler(cma_id);
+ break;
+ case RDMA_CM_EVENT_ESTABLISHED:
+ iser_connected_handler(cma_id);
+ break;
+ case RDMA_CM_EVENT_ADDR_ERROR:
+ case RDMA_CM_EVENT_ROUTE_ERROR:
+ case RDMA_CM_EVENT_CONNECT_ERROR:
+ case RDMA_CM_EVENT_UNREACHABLE:
+ case RDMA_CM_EVENT_REJECTED:
+ iser_connect_error(cma_id);
+ break;
+ case RDMA_CM_EVENT_DISCONNECTED:
+ case RDMA_CM_EVENT_ADDR_CHANGE:
+ case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+ iser_cleanup_handler(cma_id, false);
+ break;
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+ /*
+ * we *must* destroy the device as we cannot rely
+ * on iscsid to be around to initiate error handling.
+ * also if we are not in state DOWN implicitly destroy
+ * the cma_id.
+ */
+ iser_cleanup_handler(cma_id, true);
+ if (iser_conn->state != ISER_CONN_DOWN) {
+ iser_conn->ib_conn.cma_id = NULL;
+ ret = 1;
+ }
+ break;
+ default:
+ iser_err("Unexpected RDMA CM event (%d)\n", event->event);
+ break;
+ }
+ mutex_unlock(&iser_conn->state_mutex);
+
+ return ret;
+}
+
+void iser_conn_init(struct iser_conn *iser_conn)
+{
+ iser_conn->state = ISER_CONN_INIT;
+ iser_conn->ib_conn.post_recv_buf_count = 0;
+ init_completion(&iser_conn->ib_conn.flush_comp);
+ init_completion(&iser_conn->stop_completion);
+ init_completion(&iser_conn->ib_completion);
+ init_completion(&iser_conn->up_completion);
+ INIT_LIST_HEAD(&iser_conn->conn_list);
+ spin_lock_init(&iser_conn->ib_conn.lock);
+ mutex_init(&iser_conn->state_mutex);
+}
+
+/**
+ * starts the process of connecting to the target;
+ * sleeps until the connection is established or rejected
+ */
+int iser_connect(struct iser_conn *iser_conn,
+ struct sockaddr *src_addr,
+ struct sockaddr *dst_addr,
+ int non_blocking)
+{
+ struct ib_conn *ib_conn = &iser_conn->ib_conn;
+ int err = 0;
+
+ mutex_lock(&iser_conn->state_mutex);
+
+ sprintf(iser_conn->name, "%pISp", dst_addr);
+
+ iser_info("connecting to: %s\n", iser_conn->name);
+
+ /* the device is known only --after-- address resolution */
+ ib_conn->device = NULL;
+
+ iser_conn->state = ISER_CONN_PENDING;
+
+ ib_conn->beacon.wr_id = ISER_BEACON_WRID;
+ ib_conn->beacon.opcode = IB_WR_SEND;
+
+ ib_conn->cma_id = rdma_create_id(iser_cma_handler,
+ (void *)iser_conn,
+ RDMA_PS_TCP, IB_QPT_RC);
+ if (IS_ERR(ib_conn->cma_id)) {
+ err = PTR_ERR(ib_conn->cma_id);
+ iser_err("rdma_create_id failed: %d\n", err);
+ goto id_failure;
+ }
+
+ err = rdma_resolve_addr(ib_conn->cma_id, src_addr, dst_addr, 1000);
+ if (err) {
+ iser_err("rdma_resolve_addr failed: %d\n", err);
+ goto addr_failure;
+ }
+
+ if (!non_blocking) {
+ wait_for_completion_interruptible(&iser_conn->up_completion);
+
+ if (iser_conn->state != ISER_CONN_UP) {
+ err = -EIO;
+ goto connect_failure;
+ }
+ }
+ mutex_unlock(&iser_conn->state_mutex);
+
+ mutex_lock(&ig.connlist_mutex);
+ list_add(&iser_conn->conn_list, &ig.connlist);
+ mutex_unlock(&ig.connlist_mutex);
+ return 0;
+
+id_failure:
+ ib_conn->cma_id = NULL;
+addr_failure:
+ iser_conn->state = ISER_CONN_DOWN;
+connect_failure:
+ mutex_unlock(&iser_conn->state_mutex);
+ iser_conn_release(iser_conn);
+ return err;
+}
+
+int iser_post_recvl(struct iser_conn *iser_conn)
+{
+ struct ib_recv_wr rx_wr, *rx_wr_failed;
+ struct ib_conn *ib_conn = &iser_conn->ib_conn;
+ struct ib_sge sge;
+ int ib_ret;
+
+ sge.addr = iser_conn->login_resp_dma;
+ sge.length = ISER_RX_LOGIN_SIZE;
+ sge.lkey = ib_conn->device->mr->lkey;
+
+ rx_wr.wr_id = (uintptr_t)iser_conn->login_resp_buf;
+ rx_wr.sg_list = &sge;
+ rx_wr.num_sge = 1;
+ rx_wr.next = NULL;
+
+ ib_conn->post_recv_buf_count++;
+ ib_ret = ib_post_recv(ib_conn->qp, &rx_wr, &rx_wr_failed);
+ if (ib_ret) {
+ iser_err("ib_post_recv failed ret=%d\n", ib_ret);
+ ib_conn->post_recv_buf_count--;
+ }
+ return ib_ret;
+}
+
+int iser_post_recvm(struct iser_conn *iser_conn, int count)
+{
+ struct ib_recv_wr *rx_wr, *rx_wr_failed;
+ int i, ib_ret;
+ struct ib_conn *ib_conn = &iser_conn->ib_conn;
+ unsigned int my_rx_head = iser_conn->rx_desc_head;
+ struct iser_rx_desc *rx_desc;
+
+ for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) {
+ rx_desc = &iser_conn->rx_descs[my_rx_head];
+ rx_wr->wr_id = (uintptr_t)rx_desc;
+ rx_wr->sg_list = &rx_desc->rx_sg;
+ rx_wr->num_sge = 1;
+ rx_wr->next = rx_wr + 1;
+ my_rx_head = (my_rx_head + 1) & iser_conn->qp_max_recv_dtos_mask;
+ }
+
+ rx_wr--;
+ rx_wr->next = NULL; /* mark end of work requests list */
+
+ ib_conn->post_recv_buf_count += count;
+ ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &rx_wr_failed);
+ if (ib_ret) {
+ iser_err("ib_post_recv failed ret=%d\n", ib_ret);
+ ib_conn->post_recv_buf_count -= count;
+ } else
+ iser_conn->rx_desc_head = my_rx_head;
+ return ib_ret;
+}
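+
+/*
+ * Note on the head advance in iser_post_recvm() above: the receive ring
+ * size is assumed to be a power of two, so masking with
+ * qp_max_recv_dtos_mask is equivalent to a modulo on the ring size.
+ * A minimal sketch (hypothetical helper):
+ */
+static inline unsigned int iser_ring_next_sketch(unsigned int head,
+ unsigned int mask)
+{
+ return (head + 1) & mask; /* e.g. mask 7: ... -> 6 -> 7 -> 0 */
+}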
+
+
+/**
+ * iser_post_send - Initiate a Send DTO operation
+ *
+ * returns 0 on success, errno code on failure
+ */
+int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
+ bool signal)
+{
+ int ib_ret;
+ struct ib_send_wr send_wr, *send_wr_failed;
+
+ ib_dma_sync_single_for_device(ib_conn->device->ib_device,
+ tx_desc->dma_addr, ISER_HEADERS_LEN,
+ DMA_TO_DEVICE);
+
+ send_wr.next = NULL;
+ send_wr.wr_id = (uintptr_t)tx_desc;
+ send_wr.sg_list = tx_desc->tx_sg;
+ send_wr.num_sge = tx_desc->num_sge;
+ send_wr.opcode = IB_WR_SEND;
+ send_wr.send_flags = signal ? IB_SEND_SIGNALED : 0;
+
+ ib_ret = ib_post_send(ib_conn->qp, &send_wr, &send_wr_failed);
+ if (ib_ret)
+ iser_err("ib_post_send failed, ret:%d\n", ib_ret);
+
+ return ib_ret;
+}
+
+/**
+ * is_iser_tx_desc - Indicate if the completion wr_id
+ * is a TX descriptor or not.
+ * @iser_conn: iser connection
+ * @wr_id: completion WR identifier
+ *
+ * Since we cannot rely on wc opcode in FLUSH errors
+ * we must work around it by checking if the wr_id address
+ * falls in the iser connection rx_descs buffer. If so,
+ * it is an RX descriptor; otherwise it is a TX descriptor.
+ */
+static inline bool
+is_iser_tx_desc(struct iser_conn *iser_conn, void *wr_id)
+{
+ void *start = iser_conn->rx_descs;
+ int len = iser_conn->num_rx_descs * sizeof(*iser_conn->rx_descs);
+
+ if (wr_id >= start && wr_id < start + len)
+ return false;
+
+ return true;
+}
+
+/**
+ * iser_handle_comp_error() - Handle error completion
+ * @ib_conn: connection RDMA resources
+ * @wc: work completion
+ *
+ * Notes: We may handle a FLUSH error completion and in this case
+ * we only clean up if the TX type was DATAOUT. For a non-FLUSH
+ * error completion we should also notify the iscsi layer that
+ * the connection has failed (in case we passed the bind stage).
+ */
+static void
+iser_handle_comp_error(struct ib_conn *ib_conn,
+ struct ib_wc *wc)
+{
+ void *wr_id = (void *)(uintptr_t)wc->wr_id;
+ struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
+ ib_conn);
+
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ if (iser_conn->iscsi_conn)
+ iscsi_conn_failure(iser_conn->iscsi_conn,
+ ISCSI_ERR_CONN_FAILED);
+
+ if (wc->wr_id == ISER_FASTREG_LI_WRID)
+ return;
+
+ if (is_iser_tx_desc(iser_conn, wr_id)) {
+ struct iser_tx_desc *desc = wr_id;
+
+ if (desc->type == ISCSI_TX_DATAOUT)
+ kmem_cache_free(ig.desc_cache, desc);
+ } else {
+ ib_conn->post_recv_buf_count--;
+ }
+}
+
+/**
+ * iser_handle_wc - handle a single work completion
+ * @wc: work completion
+ *
+ * Soft-IRQ context, work completion can be either
+ * SEND or RECV, and can turn out successful or
+ * with error (or flush error).
+ */
+static void iser_handle_wc(struct ib_wc *wc)
+{
+ struct ib_conn *ib_conn;
+ struct iser_tx_desc *tx_desc;
+ struct iser_rx_desc *rx_desc;
+
+ ib_conn = wc->qp->qp_context;
+ if (likely(wc->status == IB_WC_SUCCESS)) {
+ if (wc->opcode == IB_WC_RECV) {
+ rx_desc = (struct iser_rx_desc *)(uintptr_t)wc->wr_id;
+ iser_rcv_completion(rx_desc, wc->byte_len,
+ ib_conn);
+ } else if (wc->opcode == IB_WC_SEND) {
+ tx_desc = (struct iser_tx_desc *)(uintptr_t)wc->wr_id;
+ iser_snd_completion(tx_desc, ib_conn);
+ } else {
+ iser_err("Unknown wc opcode %d\n", wc->opcode);
+ }
+ } else {
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ iser_err("wr id %llx status %d vend_err %x\n",
+ wc->wr_id, wc->status, wc->vendor_err);
+ else
+ iser_dbg("flush error: wr id %llx\n", wc->wr_id);
+
+ if (wc->wr_id == ISER_BEACON_WRID)
+ /* all flush errors were consumed */
+ complete(&ib_conn->flush_comp);
+ else
+ iser_handle_comp_error(ib_conn, wc);
+ }
+}
+
+/**
+ * iser_cq_tasklet_fn - iSER completion polling loop
+ * @data: iSER completion context
+ *
+ * Soft-IRQ context; polls the connection CQ until
+ * either the CQ is empty or the polling budget is exhausted
+ */
+static void iser_cq_tasklet_fn(unsigned long data)
+{
+ struct iser_comp *comp = (struct iser_comp *)data;
+ struct ib_cq *cq = comp->cq;
+ struct ib_wc *const wcs = comp->wcs;
+ int i, n, completed = 0;
+
+ while ((n = ib_poll_cq(cq, ARRAY_SIZE(comp->wcs), wcs)) > 0) {
+ for (i = 0; i < n; i++)
+ iser_handle_wc(&wcs[i]);
+
+ completed += n;
+ if (completed >= iser_cq_poll_limit)
+ break;
+ }
+
+ /*
+ * It is assumed here that arming the CQ only once it is empty
+ * does not cause interrupts to be missed (an alternative rearm
+ * sketch follows this function).
+ */
+ ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+
+ iser_dbg("got %d completions\n", completed);
+}
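+
+/*
+ * An alternative to the "arm only when empty" assumption above (sketch
+ * only, not what this driver does): rearming with
+ * IB_CQ_REPORT_MISSED_EVENTS makes ib_req_notify_cq() return > 0 when
+ * completions arrived between the last poll and the rearm, in which
+ * case the tasklet can simply be rescheduled.
+ */
+static inline void iser_cq_rearm_sketch(struct iser_comp *comp)
+{
+ if (ib_req_notify_cq(comp->cq, IB_CQ_NEXT_COMP |
+ IB_CQ_REPORT_MISSED_EVENTS) > 0)
+ tasklet_schedule(&comp->tasklet);
+}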
+
+static void iser_cq_callback(struct ib_cq *cq, void *cq_context)
+{
+ struct iser_comp *comp = cq_context;
+
+ tasklet_schedule(&comp->tasklet);
+}
+
+u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task,
+ enum iser_data_dir cmd_dir, sector_t *sector)
+{
+ struct iser_mem_reg *reg = &iser_task->rdma_reg[cmd_dir];
+ struct fast_reg_descriptor *desc = reg->mem_h;
+ unsigned long sector_size = iser_task->sc->device->sector_size;
+ struct ib_mr_status mr_status;
+ int ret;
+
+ if (desc && desc->reg_indicators & ISER_FASTREG_PROTECTED) {
+ desc->reg_indicators &= ~ISER_FASTREG_PROTECTED;
+ ret = ib_check_mr_status(desc->pi_ctx->sig_mr,
+ IB_MR_CHECK_SIG_STATUS, &mr_status);
+ if (ret) {
+ pr_err("ib_check_mr_status failed, ret %d\n", ret);
+ goto err;
+ }
+
+ if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) {
+ sector_t sector_off = mr_status.sig_err.sig_err_offset;
+
+ do_div(sector_off, sector_size + 8);
+ *sector = scsi_get_lba(iser_task->sc) + sector_off;
+
+ pr_err("PI error found type %d at sector %llx "
+ "expected %x vs actual %x\n",
+ mr_status.sig_err.err_type,
+ (unsigned long long)*sector,
+ mr_status.sig_err.expected,
+ mr_status.sig_err.actual);
+
+ switch (mr_status.sig_err.err_type) {
+ case IB_SIG_BAD_GUARD:
+ return 0x1;
+ case IB_SIG_BAD_REFTAG:
+ return 0x3;
+ case IB_SIG_BAD_APPTAG:
+ return 0x2;
+ }
+ }
+ }
+
+ return 0;
+err:
+ /* Not a lot we can do here, return an ambiguous guard error */
+ return 0x1;
+}
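+
+/*
+ * A short illustration of the sector math in iser_check_task_pi_status()
+ * above (hypothetical helper, not used by the driver): with T10-PI each
+ * sector carries an extra 8-byte DIF tuple, so a byte offset into the
+ * interleaved stream maps to a sector index by dividing by
+ * (sector_size + 8); e.g. with 512-byte sectors an error offset of 5200
+ * falls in sector 10.
+ */
+static inline sector_t iser_pi_err_sector_sketch(u64 sig_err_offset,
+ unsigned long sector_size)
+{
+ do_div(sig_err_offset, sector_size + 8);
+ return sig_err_offset;
+}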
diff --git a/drivers/infiniband/ulp/isert/Kconfig b/drivers/infiniband/ulp/isert/Kconfig
new file mode 100644
index 000000000..02f9759eb
--- /dev/null
+++ b/drivers/infiniband/ulp/isert/Kconfig
@@ -0,0 +1,5 @@
+config INFINIBAND_ISERT
+ tristate "iSCSI Extensions for RDMA (iSER) target support"
+ depends on INET && INFINIBAND_ADDR_TRANS && TARGET_CORE && ISCSI_TARGET
+ ---help---
+ Support for iSCSI Extensions for RDMA (iSER) Target on InfiniBand fabrics.
diff --git a/drivers/infiniband/ulp/isert/Makefile b/drivers/infiniband/ulp/isert/Makefile
new file mode 100644
index 000000000..c8bf2421f
--- /dev/null
+++ b/drivers/infiniband/ulp/isert/Makefile
@@ -0,0 +1,2 @@
+ccflags-y := -Idrivers/target -Idrivers/target/iscsi
+obj-$(CONFIG_INFINIBAND_ISERT) += ib_isert.o
diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c
new file mode 100644
index 000000000..575a072d7
--- /dev/null
+++ b/drivers/infiniband/ulp/isert/ib_isert.c
@@ -0,0 +1,3466 @@
+/*******************************************************************************
+ * This file contains iSCSI extensions for RDMA (iSER) Verbs
+ *
+ * (c) Copyright 2013 Datera, Inc.
+ *
+ * Nicholas A. Bellinger <nab@linux-iscsi.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ ****************************************************************************/
+
+#include <linux/string.h>
+#include <linux/module.h>
+#include <linux/scatterlist.h>
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+#include <target/target_core_base.h>
+#include <target/target_core_fabric.h>
+#include <target/iscsi/iscsi_transport.h>
+#include <linux/semaphore.h>
+
+#include "isert_proto.h"
+#include "ib_isert.h"
+
+#define ISERT_MAX_CONN 8
+#define ISER_MAX_RX_CQ_LEN (ISERT_QP_MAX_RECV_DTOS * ISERT_MAX_CONN)
+#define ISER_MAX_TX_CQ_LEN (ISERT_QP_MAX_REQ_DTOS * ISERT_MAX_CONN)
+#define ISER_MAX_CQ_LEN (ISER_MAX_RX_CQ_LEN + ISER_MAX_TX_CQ_LEN + \
+ ISERT_MAX_CONN)
+
+static int isert_debug_level;
+module_param_named(debug_level, isert_debug_level, int, 0644);
+MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0 (default:0)");
+
+static DEFINE_MUTEX(device_list_mutex);
+static LIST_HEAD(device_list);
+static struct workqueue_struct *isert_comp_wq;
+static struct workqueue_struct *isert_release_wq;
+
+static void
+isert_unmap_cmd(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn);
+static int
+isert_map_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd,
+ struct isert_rdma_wr *wr);
+static void
+isert_unreg_rdma(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn);
+static int
+isert_reg_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd,
+ struct isert_rdma_wr *wr);
+static int
+isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd);
+static int
+isert_rdma_post_recvl(struct isert_conn *isert_conn);
+static int
+isert_rdma_accept(struct isert_conn *isert_conn);
+struct rdma_cm_id *isert_setup_id(struct isert_np *isert_np);
+
+static void isert_release_work(struct work_struct *work);
+
+static inline bool
+isert_prot_cmd(struct isert_conn *conn, struct se_cmd *cmd)
+{
+ return (conn->pi_support &&
+ cmd->prot_op != TARGET_PROT_NORMAL);
+}
+
+
+static void
+isert_qp_event_callback(struct ib_event *e, void *context)
+{
+ struct isert_conn *isert_conn = context;
+
+ isert_err("conn %p event: %d\n", isert_conn, e->event);
+ switch (e->event) {
+ case IB_EVENT_COMM_EST:
+ rdma_notify(isert_conn->cm_id, IB_EVENT_COMM_EST);
+ break;
+ case IB_EVENT_QP_LAST_WQE_REACHED:
+ isert_warn("Reached TX IB_EVENT_QP_LAST_WQE_REACHED\n");
+ break;
+ default:
+ break;
+ }
+}
+
+static int
+isert_query_device(struct ib_device *ib_dev, struct ib_device_attr *devattr)
+{
+ int ret;
+
+ ret = ib_query_device(ib_dev, devattr);
+ if (ret) {
+ isert_err("ib_query_device() failed: %d\n", ret);
+ return ret;
+ }
+ isert_dbg("devattr->max_sge: %d\n", devattr->max_sge);
+ isert_dbg("devattr->max_sge_rd: %d\n", devattr->max_sge_rd);
+
+ return 0;
+}
+
+static struct isert_comp *
+isert_comp_get(struct isert_conn *isert_conn)
+{
+ struct isert_device *device = isert_conn->device;
+ struct isert_comp *comp;
+ int i, min = 0;
+
+ mutex_lock(&device_list_mutex);
+ for (i = 0; i < device->comps_used; i++)
+ if (device->comps[i].active_qps <
+ device->comps[min].active_qps)
+ min = i;
+ comp = &device->comps[min];
+ comp->active_qps++;
+ mutex_unlock(&device_list_mutex);
+
+ isert_info("conn %p, using comp %p min_index: %d\n",
+ isert_conn, comp, min);
+
+ return comp;
+}
+
+static void
+isert_comp_put(struct isert_comp *comp)
+{
+ mutex_lock(&device_list_mutex);
+ comp->active_qps--;
+ mutex_unlock(&device_list_mutex);
+}
+
+static struct ib_qp *
+isert_create_qp(struct isert_conn *isert_conn,
+ struct isert_comp *comp,
+ struct rdma_cm_id *cma_id)
+{
+ struct isert_device *device = isert_conn->device;
+ struct ib_qp_init_attr attr;
+ int ret;
+
+ memset(&attr, 0, sizeof(struct ib_qp_init_attr));
+ attr.event_handler = isert_qp_event_callback;
+ attr.qp_context = isert_conn;
+ attr.send_cq = comp->cq;
+ attr.recv_cq = comp->cq;
+ attr.cap.max_send_wr = ISERT_QP_MAX_REQ_DTOS;
+ attr.cap.max_recv_wr = ISERT_QP_MAX_RECV_DTOS + 1;
+ /*
+ * FIXME: Use devattr.max_sge - 2 for max_send_sge as
+ * work-around for RDMA_READs with ConnectX-2.
+ *
+ * Also, still make sure to have at least two SGEs for
+ * outgoing control PDU responses.
+ */
+ attr.cap.max_send_sge = max(2, device->dev_attr.max_sge - 2);
+ isert_conn->max_sge = attr.cap.max_send_sge;
+
+ attr.cap.max_recv_sge = 1;
+ attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+ attr.qp_type = IB_QPT_RC;
+ if (device->pi_capable)
+ attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN;
+
+ ret = rdma_create_qp(cma_id, device->pd, &attr);
+ if (ret) {
+ isert_err("rdma_create_qp failed for cma_id %d\n", ret);
+ return ERR_PTR(ret);
+ }
+
+ return cma_id->qp;
+}
+
+static int
+isert_conn_setup_qp(struct isert_conn *isert_conn, struct rdma_cm_id *cma_id)
+{
+ struct isert_comp *comp;
+ int ret;
+
+ comp = isert_comp_get(isert_conn);
+ isert_conn->qp = isert_create_qp(isert_conn, comp, cma_id);
+ if (IS_ERR(isert_conn->qp)) {
+ ret = PTR_ERR(isert_conn->qp);
+ goto err;
+ }
+
+ return 0;
+err:
+ isert_comp_put(comp);
+ return ret;
+}
+
+static void
+isert_cq_event_callback(struct ib_event *e, void *context)
+{
+ isert_dbg("event: %d\n", e->event);
+}
+
+static int
+isert_alloc_rx_descriptors(struct isert_conn *isert_conn)
+{
+ struct isert_device *device = isert_conn->device;
+ struct ib_device *ib_dev = device->ib_device;
+ struct iser_rx_desc *rx_desc;
+ struct ib_sge *rx_sg;
+ u64 dma_addr;
+ int i, j;
+
+ isert_conn->rx_descs = kzalloc(ISERT_QP_MAX_RECV_DTOS *
+ sizeof(struct iser_rx_desc), GFP_KERNEL);
+ if (!isert_conn->rx_descs)
+ goto fail;
+
+ rx_desc = isert_conn->rx_descs;
+
+ for (i = 0; i < ISERT_QP_MAX_RECV_DTOS; i++, rx_desc++) {
+ dma_addr = ib_dma_map_single(ib_dev, (void *)rx_desc,
+ ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+ if (ib_dma_mapping_error(ib_dev, dma_addr))
+ goto dma_map_fail;
+
+ rx_desc->dma_addr = dma_addr;
+
+ rx_sg = &rx_desc->rx_sg;
+ rx_sg->addr = rx_desc->dma_addr;
+ rx_sg->length = ISER_RX_PAYLOAD_SIZE;
+ rx_sg->lkey = device->mr->lkey;
+ }
+
+ isert_conn->rx_desc_head = 0;
+
+ return 0;
+
+dma_map_fail:
+ rx_desc = isert_conn->rx_descs;
+ for (j = 0; j < i; j++, rx_desc++) {
+ ib_dma_unmap_single(ib_dev, rx_desc->dma_addr,
+ ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+ }
+ kfree(isert_conn->rx_descs);
+ isert_conn->rx_descs = NULL;
+fail:
+ isert_err("conn %p failed to allocate rx descriptors\n", isert_conn);
+
+ return -ENOMEM;
+}
+
+static void
+isert_free_rx_descriptors(struct isert_conn *isert_conn)
+{
+ struct ib_device *ib_dev = isert_conn->device->ib_device;
+ struct iser_rx_desc *rx_desc;
+ int i;
+
+ if (!isert_conn->rx_descs)
+ return;
+
+ rx_desc = isert_conn->rx_descs;
+ for (i = 0; i < ISERT_QP_MAX_RECV_DTOS; i++, rx_desc++) {
+ ib_dma_unmap_single(ib_dev, rx_desc->dma_addr,
+ ISER_RX_PAYLOAD_SIZE, DMA_FROM_DEVICE);
+ }
+
+ kfree(isert_conn->rx_descs);
+ isert_conn->rx_descs = NULL;
+}
+
+static void isert_cq_work(struct work_struct *);
+static void isert_cq_callback(struct ib_cq *, void *);
+
+static void
+isert_free_comps(struct isert_device *device)
+{
+ int i;
+
+ for (i = 0; i < device->comps_used; i++) {
+ struct isert_comp *comp = &device->comps[i];
+
+ if (comp->cq) {
+ cancel_work_sync(&comp->work);
+ ib_destroy_cq(comp->cq);
+ }
+ }
+ kfree(device->comps);
+}
+
+static int
+isert_alloc_comps(struct isert_device *device,
+ struct ib_device_attr *attr)
+{
+ int i, max_cqe, ret = 0;
+
+ device->comps_used = min(ISERT_MAX_CQ, min_t(int, num_online_cpus(),
+ device->ib_device->num_comp_vectors));
+
+ isert_info("Using %d CQs, %s supports %d vectors support "
+ "Fast registration %d pi_capable %d\n",
+ device->comps_used, device->ib_device->name,
+ device->ib_device->num_comp_vectors, device->use_fastreg,
+ device->pi_capable);
+
+ device->comps = kcalloc(device->comps_used, sizeof(struct isert_comp),
+ GFP_KERNEL);
+ if (!device->comps) {
+ isert_err("Unable to allocate completion contexts\n");
+ return -ENOMEM;
+ }
+
+ max_cqe = min(ISER_MAX_CQ_LEN, attr->max_cqe);
+
+ for (i = 0; i < device->comps_used; i++) {
+ struct isert_comp *comp = &device->comps[i];
+
+ comp->device = device;
+ INIT_WORK(&comp->work, isert_cq_work);
+ comp->cq = ib_create_cq(device->ib_device,
+ isert_cq_callback,
+ isert_cq_event_callback,
+ (void *)comp,
+ max_cqe, i);
+ if (IS_ERR(comp->cq)) {
+ isert_err("Unable to allocate cq\n");
+ ret = PTR_ERR(comp->cq);
+ comp->cq = NULL;
+ goto out_cq;
+ }
+
+ ret = ib_req_notify_cq(comp->cq, IB_CQ_NEXT_COMP);
+ if (ret)
+ goto out_cq;
+ }
+
+ return 0;
+out_cq:
+ isert_free_comps(device);
+ return ret;
+}
+
+static int
+isert_create_device_ib_res(struct isert_device *device)
+{
+ struct ib_device_attr *dev_attr;
+ int ret;
+
+ dev_attr = &device->dev_attr;
+ ret = isert_query_device(device->ib_device, dev_attr);
+ if (ret)
+ return ret;
+
+ /* assign function handlers */
+ if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS &&
+ dev_attr->device_cap_flags & IB_DEVICE_SIGNATURE_HANDOVER) {
+ device->use_fastreg = 1;
+ device->reg_rdma_mem = isert_reg_rdma;
+ device->unreg_rdma_mem = isert_unreg_rdma;
+ } else {
+ device->use_fastreg = 0;
+ device->reg_rdma_mem = isert_map_rdma;
+ device->unreg_rdma_mem = isert_unmap_cmd;
+ }
+
+ ret = isert_alloc_comps(device, dev_attr);
+ if (ret)
+ return ret;
+
+ device->pd = ib_alloc_pd(device->ib_device);
+ if (IS_ERR(device->pd)) {
+ ret = PTR_ERR(device->pd);
+ isert_err("failed to allocate pd, device %p, ret=%d\n",
+ device, ret);
+ goto out_cq;
+ }
+
+ device->mr = ib_get_dma_mr(device->pd, IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(device->mr)) {
+ ret = PTR_ERR(device->mr);
+ isert_err("failed to create dma mr, device %p, ret=%d\n",
+ device, ret);
+ goto out_mr;
+ }
+
+ /* Check signature cap */
+ device->pi_capable = dev_attr->device_cap_flags &
+ IB_DEVICE_SIGNATURE_HANDOVER ? true : false;
+
+ return 0;
+
+out_mr:
+ ib_dealloc_pd(device->pd);
+out_cq:
+ isert_free_comps(device);
+ return ret;
+}
+
+static void
+isert_free_device_ib_res(struct isert_device *device)
+{
+ isert_info("device %p\n", device);
+
+ ib_dereg_mr(device->mr);
+ ib_dealloc_pd(device->pd);
+ isert_free_comps(device);
+}
+
+static void
+isert_device_put(struct isert_device *device)
+{
+ mutex_lock(&device_list_mutex);
+ device->refcount--;
+ isert_info("device %p refcount %d\n", device, device->refcount);
+ if (!device->refcount) {
+ isert_free_device_ib_res(device);
+ list_del(&device->dev_node);
+ kfree(device);
+ }
+ mutex_unlock(&device_list_mutex);
+}
+
+static struct isert_device *
+isert_device_get(struct rdma_cm_id *cma_id)
+{
+ struct isert_device *device;
+ int ret;
+
+ mutex_lock(&device_list_mutex);
+ list_for_each_entry(device, &device_list, dev_node) {
+ if (device->ib_device->node_guid == cma_id->device->node_guid) {
+ device->refcount++;
+ isert_info("Found iser device %p refcount %d\n",
+ device, device->refcount);
+ mutex_unlock(&device_list_mutex);
+ return device;
+ }
+ }
+
+ device = kzalloc(sizeof(struct isert_device), GFP_KERNEL);
+ if (!device) {
+ mutex_unlock(&device_list_mutex);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ INIT_LIST_HEAD(&device->dev_node);
+
+ device->ib_device = cma_id->device;
+ ret = isert_create_device_ib_res(device);
+ if (ret) {
+ kfree(device);
+ mutex_unlock(&device_list_mutex);
+ return ERR_PTR(ret);
+ }
+
+ device->refcount++;
+ list_add_tail(&device->dev_node, &device_list);
+ isert_info("Created a new iser device %p refcount %d\n",
+ device, device->refcount);
+ mutex_unlock(&device_list_mutex);
+
+ return device;
+}
+
+static void
+isert_conn_free_fastreg_pool(struct isert_conn *isert_conn)
+{
+ struct fast_reg_descriptor *fr_desc, *tmp;
+ int i = 0;
+
+ if (list_empty(&isert_conn->fr_pool))
+ return;
+
+ isert_info("Freeing conn %p fastreg pool", isert_conn);
+
+ list_for_each_entry_safe(fr_desc, tmp,
+ &isert_conn->fr_pool, list) {
+ list_del(&fr_desc->list);
+ ib_free_fast_reg_page_list(fr_desc->data_frpl);
+ ib_dereg_mr(fr_desc->data_mr);
+ if (fr_desc->pi_ctx) {
+ ib_free_fast_reg_page_list(fr_desc->pi_ctx->prot_frpl);
+ ib_dereg_mr(fr_desc->pi_ctx->prot_mr);
+ ib_destroy_mr(fr_desc->pi_ctx->sig_mr);
+ kfree(fr_desc->pi_ctx);
+ }
+ kfree(fr_desc);
+ ++i;
+ }
+
+ if (i < isert_conn->fr_pool_size)
+ isert_warn("Pool still has %d regions registered\n",
+ isert_conn->fr_pool_size - i);
+}
+
+static int
+isert_create_pi_ctx(struct fast_reg_descriptor *desc,
+ struct ib_device *device,
+ struct ib_pd *pd)
+{
+ struct ib_mr_init_attr mr_init_attr;
+ struct pi_context *pi_ctx;
+ int ret;
+
+ pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL);
+ if (!pi_ctx) {
+ isert_err("Failed to allocate pi context\n");
+ return -ENOMEM;
+ }
+
+ pi_ctx->prot_frpl = ib_alloc_fast_reg_page_list(device,
+ ISCSI_ISER_SG_TABLESIZE);
+ if (IS_ERR(pi_ctx->prot_frpl)) {
+ isert_err("Failed to allocate prot frpl err=%ld\n",
+ PTR_ERR(pi_ctx->prot_frpl));
+ ret = PTR_ERR(pi_ctx->prot_frpl);
+ goto err_pi_ctx;
+ }
+
+ pi_ctx->prot_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE);
+ if (IS_ERR(pi_ctx->prot_mr)) {
+ isert_err("Failed to allocate prot frmr err=%ld\n",
+ PTR_ERR(pi_ctx->prot_mr));
+ ret = PTR_ERR(pi_ctx->prot_mr);
+ goto err_prot_frpl;
+ }
+ desc->ind |= ISERT_PROT_KEY_VALID;
+
+ memset(&mr_init_attr, 0, sizeof(mr_init_attr));
+ mr_init_attr.max_reg_descriptors = 2;
+ mr_init_attr.flags |= IB_MR_SIGNATURE_EN;
+ pi_ctx->sig_mr = ib_create_mr(pd, &mr_init_attr);
+ if (IS_ERR(pi_ctx->sig_mr)) {
+ isert_err("Failed to allocate signature enabled mr err=%ld\n",
+ PTR_ERR(pi_ctx->sig_mr));
+ ret = PTR_ERR(pi_ctx->sig_mr);
+ goto err_prot_mr;
+ }
+
+ desc->pi_ctx = pi_ctx;
+ desc->ind |= ISERT_SIG_KEY_VALID;
+ desc->ind &= ~ISERT_PROTECTED;
+
+ return 0;
+
+err_prot_mr:
+ ib_dereg_mr(pi_ctx->prot_mr);
+err_prot_frpl:
+ ib_free_fast_reg_page_list(pi_ctx->prot_frpl);
+err_pi_ctx:
+ kfree(pi_ctx);
+
+ return ret;
+}
+
+static int
+isert_create_fr_desc(struct ib_device *ib_device, struct ib_pd *pd,
+ struct fast_reg_descriptor *fr_desc)
+{
+ int ret;
+
+ fr_desc->data_frpl = ib_alloc_fast_reg_page_list(ib_device,
+ ISCSI_ISER_SG_TABLESIZE);
+ if (IS_ERR(fr_desc->data_frpl)) {
+ isert_err("Failed to allocate data frpl err=%ld\n",
+ PTR_ERR(fr_desc->data_frpl));
+ return PTR_ERR(fr_desc->data_frpl);
+ }
+
+ fr_desc->data_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE);
+ if (IS_ERR(fr_desc->data_mr)) {
+ isert_err("Failed to allocate data frmr err=%ld\n",
+ PTR_ERR(fr_desc->data_mr));
+ ret = PTR_ERR(fr_desc->data_mr);
+ goto err_data_frpl;
+ }
+ fr_desc->ind |= ISERT_DATA_KEY_VALID;
+
+ isert_dbg("Created fr_desc %p\n", fr_desc);
+
+ return 0;
+
+err_data_frpl:
+ ib_free_fast_reg_page_list(fr_desc->data_frpl);
+
+ return ret;
+}
+
+static int
+isert_conn_create_fastreg_pool(struct isert_conn *isert_conn)
+{
+ struct fast_reg_descriptor *fr_desc;
+ struct isert_device *device = isert_conn->device;
+ struct se_session *se_sess = isert_conn->conn->sess->se_sess;
+ struct se_node_acl *se_nacl = se_sess->se_node_acl;
+ int i, ret, tag_num;
+ /*
+ * Setup the number of FRMRs based upon the number of tags
+ * available to the session in iscsi_target_locate_portal()
+ * (a sizing sketch follows this function).
+ */
+ tag_num = max_t(u32, ISCSIT_MIN_TAGS, se_nacl->queue_depth);
+ tag_num = (tag_num * 2) + ISCSIT_EXTRA_TAGS;
+
+ isert_conn->fr_pool_size = 0;
+ for (i = 0; i < tag_num; i++) {
+ fr_desc = kzalloc(sizeof(*fr_desc), GFP_KERNEL);
+ if (!fr_desc) {
+ isert_err("Failed to allocate fast_reg descriptor\n");
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ret = isert_create_fr_desc(device->ib_device,
+ device->pd, fr_desc);
+ if (ret) {
+ isert_err("Failed to create fastreg descriptor err=%d\n",
+ ret);
+ kfree(fr_desc);
+ goto err;
+ }
+
+ list_add_tail(&fr_desc->list, &isert_conn->fr_pool);
+ isert_conn->fr_pool_size++;
+ }
+
+ isert_dbg("Creating conn %p fastreg pool size=%d",
+ isert_conn, isert_conn->fr_pool_size);
+
+ return 0;
+
+err:
+ isert_conn_free_fastreg_pool(isert_conn);
+ return ret;
+}
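+
+/*
+ * A minimal sketch of the tag-based sizing used above (hypothetical
+ * helper; ISCSIT_MIN_TAGS and ISCSIT_EXTRA_TAGS come from the iscsi
+ * target core): the pool mirrors the tag allocation performed in
+ * iscsi_target_locate_portal(), doubled, plus extra slack.
+ */
+static inline int isert_fr_pool_size_sketch(u32 queue_depth)
+{
+ int tag_num = max_t(u32, ISCSIT_MIN_TAGS, queue_depth);
+
+ return (tag_num * 2) + ISCSIT_EXTRA_TAGS;
+}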
+
+static void
+isert_init_conn(struct isert_conn *isert_conn)
+{
+ isert_conn->state = ISER_CONN_INIT;
+ INIT_LIST_HEAD(&isert_conn->accept_node);
+ init_completion(&isert_conn->login_comp);
+ init_completion(&isert_conn->login_req_comp);
+ init_completion(&isert_conn->wait);
+ kref_init(&isert_conn->kref);
+ mutex_init(&isert_conn->mutex);
+ spin_lock_init(&isert_conn->pool_lock);
+ INIT_LIST_HEAD(&isert_conn->fr_pool);
+ INIT_WORK(&isert_conn->release_work, isert_release_work);
+}
+
+static void
+isert_free_login_buf(struct isert_conn *isert_conn)
+{
+ struct ib_device *ib_dev = isert_conn->device->ib_device;
+
+ ib_dma_unmap_single(ib_dev, isert_conn->login_rsp_dma,
+ ISER_RX_LOGIN_SIZE, DMA_TO_DEVICE);
+ ib_dma_unmap_single(ib_dev, isert_conn->login_req_dma,
+ ISCSI_DEF_MAX_RECV_SEG_LEN,
+ DMA_FROM_DEVICE);
+ kfree(isert_conn->login_buf);
+}
+
+static int
+isert_alloc_login_buf(struct isert_conn *isert_conn,
+ struct ib_device *ib_dev)
+{
+ int ret;
+
+ isert_conn->login_buf = kzalloc(ISCSI_DEF_MAX_RECV_SEG_LEN +
+ ISER_RX_LOGIN_SIZE, GFP_KERNEL);
+ if (!isert_conn->login_buf) {
+ isert_err("Unable to allocate isert_conn->login_buf\n");
+ return -ENOMEM;
+ }
+
+ isert_conn->login_req_buf = isert_conn->login_buf;
+ isert_conn->login_rsp_buf = isert_conn->login_buf +
+ ISCSI_DEF_MAX_RECV_SEG_LEN;
+
+ isert_dbg("Set login_buf: %p login_req_buf: %p login_rsp_buf: %p\n",
+ isert_conn->login_buf, isert_conn->login_req_buf,
+ isert_conn->login_rsp_buf);
+
+ isert_conn->login_req_dma = ib_dma_map_single(ib_dev,
+ (void *)isert_conn->login_req_buf,
+ ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_FROM_DEVICE);
+
+ ret = ib_dma_mapping_error(ib_dev, isert_conn->login_req_dma);
+ if (ret) {
+ isert_err("login_req_dma mapping error: %d\n", ret);
+ isert_conn->login_req_dma = 0;
+ goto out_login_buf;
+ }
+
+ isert_conn->login_rsp_dma = ib_dma_map_single(ib_dev,
+ (void *)isert_conn->login_rsp_buf,
+ ISER_RX_LOGIN_SIZE, DMA_TO_DEVICE);
+
+ ret = ib_dma_mapping_error(ib_dev, isert_conn->login_rsp_dma);
+ if (ret) {
+ isert_err("login_rsp_dma mapping error: %d\n", ret);
+ isert_conn->login_rsp_dma = 0;
+ goto out_req_dma_map;
+ }
+
+ return 0;
+
+out_req_dma_map:
+ ib_dma_unmap_single(ib_dev, isert_conn->login_req_dma,
+ ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_FROM_DEVICE);
+out_login_buf:
+ kfree(isert_conn->login_buf);
+ return ret;
+}
+
+static int
+isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
+{
+ struct isert_np *isert_np = cma_id->context;
+ struct iscsi_np *np = isert_np->np;
+ struct isert_conn *isert_conn;
+ struct isert_device *device;
+ int ret = 0;
+
+ spin_lock_bh(&np->np_thread_lock);
+ if (!np->enabled) {
+ spin_unlock_bh(&np->np_thread_lock);
+ isert_dbg("iscsi_np is not enabled, reject connect request\n");
+ return rdma_reject(cma_id, NULL, 0);
+ }
+ spin_unlock_bh(&np->np_thread_lock);
+
+ isert_dbg("cma_id: %p, portal: %p\n",
+ cma_id, cma_id->context);
+
+ isert_conn = kzalloc(sizeof(struct isert_conn), GFP_KERNEL);
+ if (!isert_conn)
+ return -ENOMEM;
+
+ isert_init_conn(isert_conn);
+ isert_conn->cm_id = cma_id;
+
+ ret = isert_alloc_login_buf(isert_conn, cma_id->device);
+ if (ret)
+ goto out;
+
+ device = isert_device_get(cma_id);
+ if (IS_ERR(device)) {
+ ret = PTR_ERR(device);
+ goto out_rsp_dma_map;
+ }
+ isert_conn->device = device;
+
+ /* Set max inflight RDMA READ requests */
+ isert_conn->initiator_depth = min_t(u8,
+ event->param.conn.initiator_depth,
+ device->dev_attr.max_qp_init_rd_atom);
+ isert_dbg("Using initiator_depth: %u\n", isert_conn->initiator_depth);
+
+ ret = isert_conn_setup_qp(isert_conn, cma_id);
+ if (ret)
+ goto out_conn_dev;
+
+ ret = isert_rdma_post_recvl(isert_conn);
+ if (ret)
+ goto out_conn_dev;
+
+ ret = isert_rdma_accept(isert_conn);
+ if (ret)
+ goto out_conn_dev;
+
+ mutex_lock(&isert_np->np_accept_mutex);
+ list_add_tail(&isert_conn->accept_node, &isert_np->np_accept_list);
+ mutex_unlock(&isert_np->np_accept_mutex);
+
+ isert_info("np %p: Allow accept_np to continue\n", np);
+ up(&isert_np->np_sem);
+ return 0;
+
+out_conn_dev:
+ isert_device_put(device);
+out_rsp_dma_map:
+ isert_free_login_buf(isert_conn);
+out:
+ kfree(isert_conn);
+ rdma_reject(cma_id, NULL, 0);
+ return ret;
+}
+
+static void
+isert_connect_release(struct isert_conn *isert_conn)
+{
+ struct isert_device *device = isert_conn->device;
+
+ isert_dbg("conn %p\n", isert_conn);
+
+ BUG_ON(!device);
+
+ if (device->use_fastreg)
+ isert_conn_free_fastreg_pool(isert_conn);
+
+ isert_free_rx_descriptors(isert_conn);
+ if (isert_conn->cm_id)
+ rdma_destroy_id(isert_conn->cm_id);
+
+ if (isert_conn->qp) {
+ struct isert_comp *comp = isert_conn->qp->recv_cq->cq_context;
+
+ isert_comp_put(comp);
+ ib_destroy_qp(isert_conn->qp);
+ }
+
+ if (isert_conn->login_buf)
+ isert_free_login_buf(isert_conn);
+
+ isert_device_put(device);
+
+ kfree(isert_conn);
+}
+
+static void
+isert_connected_handler(struct rdma_cm_id *cma_id)
+{
+ struct isert_conn *isert_conn = cma_id->qp->qp_context;
+
+ isert_info("conn %p\n", isert_conn);
+
+ if (!kref_get_unless_zero(&isert_conn->kref)) {
+ isert_warn("conn %p connect_release is running\n", isert_conn);
+ return;
+ }
+
+ mutex_lock(&isert_conn->mutex);
+ if (isert_conn->state != ISER_CONN_FULL_FEATURE)
+ isert_conn->state = ISER_CONN_UP;
+ mutex_unlock(&isert_conn->mutex);
+}
+
+static void
+isert_release_kref(struct kref *kref)
+{
+ struct isert_conn *isert_conn = container_of(kref,
+ struct isert_conn, kref);
+
+ isert_info("conn %p final kref %s/%d\n", isert_conn, current->comm,
+ current->pid);
+
+ isert_connect_release(isert_conn);
+}
+
+static void
+isert_put_conn(struct isert_conn *isert_conn)
+{
+ kref_put(&isert_conn->kref, isert_release_kref);
+}
+
+/**
+ * isert_conn_terminate() - Initiate connection termination
+ * @isert_conn: isert connection struct
+ *
+ * Notes:
+ * In case the connection state is FULL_FEATURE, move state
+ * to TERMINATING and start the teardown sequence (rdma_disconnect).
+ * In case the connection state is UP, complete flush as well.
+ *
+ * This routine must be called with mutex held. Thus it is
+ * safe to call multiple times.
+ */
+static void
+isert_conn_terminate(struct isert_conn *isert_conn)
+{
+ int err;
+
+ switch (isert_conn->state) {
+ case ISER_CONN_TERMINATING:
+ break;
+ case ISER_CONN_UP:
+ case ISER_CONN_FULL_FEATURE: /* FALLTHRU */
+ isert_info("Terminating conn %p state %d\n",
+ isert_conn, isert_conn->state);
+ isert_conn->state = ISER_CONN_TERMINATING;
+ err = rdma_disconnect(isert_conn->cm_id);
+ if (err)
+ isert_warn("Failed rdma_disconnect isert_conn %p\n",
+ isert_conn);
+ break;
+ default:
+ isert_warn("conn %p teminating in state %d\n",
+ isert_conn, isert_conn->state);
+ }
+}
+
+static int
+isert_np_cma_handler(struct isert_np *isert_np,
+ enum rdma_cm_event_type event)
+{
+ isert_dbg("isert np %p, handling event %d\n", isert_np, event);
+
+ switch (event) {
+ case RDMA_CM_EVENT_DEVICE_REMOVAL:
+ isert_np->np_cm_id = NULL;
+ break;
+ case RDMA_CM_EVENT_ADDR_CHANGE:
+ isert_np->np_cm_id = isert_setup_id(isert_np);
+ if (IS_ERR(isert_np->np_cm_id)) {
+ isert_err("isert np %p setup id failed: %ld\n",
+ isert_np, PTR_ERR(isert_np->np_cm_id));
+ isert_np->np_cm_id = NULL;
+ }
+ break;
+ default:
+ isert_err("isert np %p Unexpected event %d\n",
+ isert_np, event);
+ }
+
+ return -1;
+}
+
+static int
+isert_disconnected_handler(struct rdma_cm_id *cma_id,
+ enum rdma_cm_event_type event)
+{
+ struct isert_np *isert_np = cma_id->context;
+ struct isert_conn *isert_conn;
+ bool terminating = false;
+
+ if (isert_np->np_cm_id == cma_id)
+ return isert_np_cma_handler(cma_id->context, event);
+
+ isert_conn = cma_id->qp->qp_context;
+
+ mutex_lock(&isert_conn->mutex);
+ terminating = (isert_conn->state == ISER_CONN_TERMINATING);
+ isert_conn_terminate(isert_conn);
+ mutex_unlock(&isert_conn->mutex);
+
+ isert_info("conn %p completing wait\n", isert_conn);
+ complete(&isert_conn->wait);
+
+ if (terminating)
+ goto out;
+
+ mutex_lock(&isert_np->np_accept_mutex);
+ if (!list_empty(&isert_conn->accept_node)) {
+ list_del_init(&isert_conn->accept_node);
+ isert_put_conn(isert_conn);
+ queue_work(isert_release_wq, &isert_conn->release_work);
+ }
+ mutex_unlock(&isert_np->np_accept_mutex);
+
+out:
+ return 0;
+}
+
+static int
+isert_connect_error(struct rdma_cm_id *cma_id)
+{
+ struct isert_conn *isert_conn = cma_id->qp->qp_context;
+
+ isert_conn->cm_id = NULL;
+ isert_put_conn(isert_conn);
+
+ return -1;
+}
+
+static int
+isert_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
+{
+ int ret = 0;
+
+ isert_info("event %d status %d id %p np %p\n", event->event,
+ event->status, cma_id, cma_id->context);
+
+ switch (event->event) {
+ case RDMA_CM_EVENT_CONNECT_REQUEST:
+ ret = isert_connect_request(cma_id, event);
+ if (ret)
+ isert_err("failed handle connect request %d\n", ret);
+ break;
+ case RDMA_CM_EVENT_ESTABLISHED:
+ isert_connected_handler(cma_id);
+ break;
+ case RDMA_CM_EVENT_ADDR_CHANGE: /* FALLTHRU */
+ case RDMA_CM_EVENT_DISCONNECTED: /* FALLTHRU */
+ case RDMA_CM_EVENT_DEVICE_REMOVAL: /* FALLTHRU */
+ case RDMA_CM_EVENT_TIMEWAIT_EXIT: /* FALLTHRU */
+ ret = isert_disconnected_handler(cma_id, event->event);
+ break;
+ case RDMA_CM_EVENT_REJECTED: /* FALLTHRU */
+ case RDMA_CM_EVENT_UNREACHABLE: /* FALLTHRU */
+ case RDMA_CM_EVENT_CONNECT_ERROR:
+ ret = isert_connect_error(cma_id);
+ break;
+ default:
+ isert_err("Unhandled RDMA CMA event: %d\n", event->event);
+ break;
+ }
+
+ return ret;
+}
+
+static int
+isert_post_recv(struct isert_conn *isert_conn, u32 count)
+{
+ struct ib_recv_wr *rx_wr, *rx_wr_failed;
+ int i, ret;
+ unsigned int rx_head = isert_conn->rx_desc_head;
+ struct iser_rx_desc *rx_desc;
+
+ for (rx_wr = isert_conn->rx_wr, i = 0; i < count; i++, rx_wr++) {
+ rx_desc = &isert_conn->rx_descs[rx_head];
+ rx_wr->wr_id = (uintptr_t)rx_desc;
+ rx_wr->sg_list = &rx_desc->rx_sg;
+ rx_wr->num_sge = 1;
+ rx_wr->next = rx_wr + 1;
+ rx_head = (rx_head + 1) & (ISERT_QP_MAX_RECV_DTOS - 1);
+ }
+
+ rx_wr--;
+ rx_wr->next = NULL; /* mark end of work requests list */
+
+ isert_conn->post_recv_buf_count += count;
+ ret = ib_post_recv(isert_conn->qp, isert_conn->rx_wr,
+ &rx_wr_failed);
+ if (ret) {
+ isert_err("ib_post_recv() failed with ret: %d\n", ret);
+ isert_conn->post_recv_buf_count -= count;
+ } else {
+ isert_dbg("Posted %d RX buffers\n", count);
+ isert_conn->rx_desc_head = rx_head;
+ }
+ return ret;
+}
+
+static int
+isert_post_send(struct isert_conn *isert_conn, struct iser_tx_desc *tx_desc)
+{
+ struct ib_device *ib_dev = isert_conn->cm_id->device;
+ struct ib_send_wr send_wr, *send_wr_failed;
+ int ret;
+
+ ib_dma_sync_single_for_device(ib_dev, tx_desc->dma_addr,
+ ISER_HEADERS_LEN, DMA_TO_DEVICE);
+
+ send_wr.next = NULL;
+ send_wr.wr_id = (uintptr_t)tx_desc;
+ send_wr.sg_list = tx_desc->tx_sg;
+ send_wr.num_sge = tx_desc->num_sge;
+ send_wr.opcode = IB_WR_SEND;
+ send_wr.send_flags = IB_SEND_SIGNALED;
+
+ ret = ib_post_send(isert_conn->qp, &send_wr, &send_wr_failed);
+ if (ret)
+ isert_err("ib_post_send() failed, ret: %d\n", ret);
+
+ return ret;
+}
+
+static void
+isert_create_send_desc(struct isert_conn *isert_conn,
+ struct isert_cmd *isert_cmd,
+ struct iser_tx_desc *tx_desc)
+{
+ struct isert_device *device = isert_conn->device;
+ struct ib_device *ib_dev = device->ib_device;
+
+ ib_dma_sync_single_for_cpu(ib_dev, tx_desc->dma_addr,
+ ISER_HEADERS_LEN, DMA_TO_DEVICE);
+
+ memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr));
+ tx_desc->iser_header.flags = ISER_VER;
+
+ tx_desc->num_sge = 1;
+ tx_desc->isert_cmd = isert_cmd;
+
+ if (tx_desc->tx_sg[0].lkey != device->mr->lkey) {
+ tx_desc->tx_sg[0].lkey = device->mr->lkey;
+ isert_dbg("tx_desc %p lkey mismatch, fixing\n", tx_desc);
+ }
+}
+
+static int
+isert_init_tx_hdrs(struct isert_conn *isert_conn,
+ struct iser_tx_desc *tx_desc)
+{
+ struct isert_device *device = isert_conn->device;
+ struct ib_device *ib_dev = device->ib_device;
+ u64 dma_addr;
+
+ dma_addr = ib_dma_map_single(ib_dev, (void *)tx_desc,
+ ISER_HEADERS_LEN, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(ib_dev, dma_addr)) {
+ isert_err("ib_dma_mapping_error() failed\n");
+ return -ENOMEM;
+ }
+
+ tx_desc->dma_addr = dma_addr;
+ tx_desc->tx_sg[0].addr = tx_desc->dma_addr;
+ tx_desc->tx_sg[0].length = ISER_HEADERS_LEN;
+ tx_desc->tx_sg[0].lkey = device->mr->lkey;
+
+ isert_dbg("Setup tx_sg[0].addr: 0x%llx length: %u lkey: 0x%x\n",
+ tx_desc->tx_sg[0].addr, tx_desc->tx_sg[0].length,
+ tx_desc->tx_sg[0].lkey);
+
+ return 0;
+}
+
+static void
+isert_init_send_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd,
+ struct ib_send_wr *send_wr)
+{
+ struct iser_tx_desc *tx_desc = &isert_cmd->tx_desc;
+
+ isert_cmd->rdma_wr.iser_ib_op = ISER_IB_SEND;
+ send_wr->wr_id = (uintptr_t)&isert_cmd->tx_desc;
+ send_wr->opcode = IB_WR_SEND;
+ send_wr->sg_list = &tx_desc->tx_sg[0];
+ send_wr->num_sge = isert_cmd->tx_desc.num_sge;
+ send_wr->send_flags = IB_SEND_SIGNALED;
+}
+
+static int
+isert_rdma_post_recvl(struct isert_conn *isert_conn)
+{
+ struct ib_recv_wr rx_wr, *rx_wr_fail;
+ struct ib_sge sge;
+ int ret;
+
+ memset(&sge, 0, sizeof(struct ib_sge));
+ sge.addr = isert_conn->login_req_dma;
+ sge.length = ISER_RX_LOGIN_SIZE;
+ sge.lkey = isert_conn->device->mr->lkey;
+
+ isert_dbg("Setup sge: addr: %llx length: %d 0x%08x\n",
+ sge.addr, sge.length, sge.lkey);
+
+ memset(&rx_wr, 0, sizeof(struct ib_recv_wr));
+ rx_wr.wr_id = (uintptr_t)isert_conn->login_req_buf;
+ rx_wr.sg_list = &sge;
+ rx_wr.num_sge = 1;
+
+ isert_conn->post_recv_buf_count++;
+ ret = ib_post_recv(isert_conn->qp, &rx_wr, &rx_wr_fail);
+ if (ret) {
+ isert_err("ib_post_recv() failed: %d\n", ret);
+ isert_conn->post_recv_buf_count--;
+ }
+
+ return ret;
+}
+
+static int
+isert_put_login_tx(struct iscsi_conn *conn, struct iscsi_login *login,
+ u32 length)
+{
+ struct isert_conn *isert_conn = conn->context;
+ struct isert_device *device = isert_conn->device;
+ struct ib_device *ib_dev = device->ib_device;
+ struct iser_tx_desc *tx_desc = &isert_conn->login_tx_desc;
+ int ret;
+
+ isert_create_send_desc(isert_conn, NULL, tx_desc);
+
+ memcpy(&tx_desc->iscsi_header, &login->rsp[0],
+ sizeof(struct iscsi_hdr));
+
+ isert_init_tx_hdrs(isert_conn, tx_desc);
+
+ if (length > 0) {
+ struct ib_sge *tx_dsg = &tx_desc->tx_sg[1];
+
+ ib_dma_sync_single_for_cpu(ib_dev, isert_conn->login_rsp_dma,
+ length, DMA_TO_DEVICE);
+
+ memcpy(isert_conn->login_rsp_buf, login->rsp_buf, length);
+
+ ib_dma_sync_single_for_device(ib_dev, isert_conn->login_rsp_dma,
+ length, DMA_TO_DEVICE);
+
+ tx_dsg->addr = isert_conn->login_rsp_dma;
+ tx_dsg->length = length;
+ tx_dsg->lkey = isert_conn->device->mr->lkey;
+ tx_desc->num_sge = 2;
+ }
+ if (!login->login_failed) {
+ if (login->login_complete) {
+ if (!conn->sess->sess_ops->SessionType &&
+ isert_conn->device->use_fastreg) {
+ ret = isert_conn_create_fastreg_pool(isert_conn);
+ if (ret) {
+ isert_err("Conn: %p failed to create"
+ " fastreg pool\n", isert_conn);
+ return ret;
+ }
+ }
+
+ ret = isert_alloc_rx_descriptors(isert_conn);
+ if (ret)
+ return ret;
+
+ ret = isert_post_recv(isert_conn, ISERT_MIN_POSTED_RX);
+ if (ret)
+ return ret;
+
+ /* Now we are in FULL_FEATURE phase */
+ mutex_lock(&isert_conn->mutex);
+ isert_conn->state = ISER_CONN_FULL_FEATURE;
+ mutex_unlock(&isert_conn->mutex);
+ goto post_send;
+ }
+
+ ret = isert_rdma_post_recvl(isert_conn);
+ if (ret)
+ return ret;
+ }
+post_send:
+ ret = isert_post_send(isert_conn, tx_desc);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+static void
+isert_rx_login_req(struct isert_conn *isert_conn)
+{
+ struct iser_rx_desc *rx_desc = (void *)isert_conn->login_req_buf;
+ int rx_buflen = isert_conn->login_req_len;
+ struct iscsi_conn *conn = isert_conn->conn;
+ struct iscsi_login *login = conn->conn_login;
+ int size;
+
+ isert_info("conn %p\n", isert_conn);
+
+ WARN_ON_ONCE(!login);
+
+ if (login->first_request) {
+ struct iscsi_login_req *login_req =
+ (struct iscsi_login_req *)&rx_desc->iscsi_header;
+ /*
+ * Setup the initial iscsi_login values from the leading
+ * login request PDU.
+ */
+ login->leading_connection = (!login_req->tsih) ? 1 : 0;
+ login->current_stage =
+ (login_req->flags & ISCSI_FLAG_LOGIN_CURRENT_STAGE_MASK)
+ >> 2;
+ login->version_min = login_req->min_version;
+ login->version_max = login_req->max_version;
+ memcpy(login->isid, login_req->isid, 6);
+ login->cmd_sn = be32_to_cpu(login_req->cmdsn);
+ login->init_task_tag = login_req->itt;
+ login->initial_exp_statsn = be32_to_cpu(login_req->exp_statsn);
+ login->cid = be16_to_cpu(login_req->cid);
+ login->tsih = be16_to_cpu(login_req->tsih);
+ }
+
+ memcpy(&login->req[0], (void *)&rx_desc->iscsi_header, ISCSI_HDR_LEN);
+
+ size = min(rx_buflen, MAX_KEY_VALUE_PAIRS);
+ isert_dbg("Using login payload size: %d, rx_buflen: %d "
+ "MAX_KEY_VALUE_PAIRS: %d\n", size, rx_buflen,
+ MAX_KEY_VALUE_PAIRS);
+ memcpy(login->req_buf, &rx_desc->data[0], size);
+
+ if (login->first_request) {
+ complete(&isert_conn->login_comp);
+ return;
+ }
+ schedule_delayed_work(&conn->login_work, 0);
+}
+
+static struct iscsi_cmd *
+isert_allocate_cmd(struct iscsi_conn *conn)
+{
+ struct isert_conn *isert_conn = conn->context;
+ struct isert_cmd *isert_cmd;
+ struct iscsi_cmd *cmd;
+
+ cmd = iscsit_allocate_cmd(conn, TASK_INTERRUPTIBLE);
+ if (!cmd) {
+ isert_err("Unable to allocate iscsi_cmd + isert_cmd\n");
+ return NULL;
+ }
+ isert_cmd = iscsit_priv_cmd(cmd);
+ isert_cmd->conn = isert_conn;
+ isert_cmd->iscsi_cmd = cmd;
+
+ return cmd;
+}
+
+static int
+isert_handle_scsi_cmd(struct isert_conn *isert_conn,
+ struct isert_cmd *isert_cmd, struct iscsi_cmd *cmd,
+ struct iser_rx_desc *rx_desc, unsigned char *buf)
+{
+ struct iscsi_conn *conn = isert_conn->conn;
+ struct iscsi_scsi_req *hdr = (struct iscsi_scsi_req *)buf;
+ struct scatterlist *sg;
+ int imm_data, imm_data_len, unsol_data, sg_nents, rc;
+ bool dump_payload = false;
+
+ rc = iscsit_setup_scsi_cmd(conn, cmd, buf);
+ if (rc < 0)
+ return rc;
+
+ imm_data = cmd->immediate_data;
+ imm_data_len = cmd->first_burst_len;
+ unsol_data = cmd->unsolicited_data;
+
+ rc = iscsit_process_scsi_cmd(conn, cmd, hdr);
+ if (rc < 0) {
+ return 0;
+ } else if (rc > 0) {
+ dump_payload = true;
+ goto sequence_cmd;
+ }
+
+ if (!imm_data)
+ return 0;
+
+ sg = &cmd->se_cmd.t_data_sg[0];
+ sg_nents = max(1UL, DIV_ROUND_UP(imm_data_len, PAGE_SIZE));
+
+ isert_dbg("Copying Immediate SG: %p sg_nents: %u from %p imm_data_len: %d\n",
+ sg, sg_nents, &rx_desc->data[0], imm_data_len);
+
+ sg_copy_from_buffer(sg, sg_nents, &rx_desc->data[0], imm_data_len);
+
+ cmd->write_data_done += imm_data_len;
+
+ if (cmd->write_data_done == cmd->se_cmd.data_length) {
+ spin_lock_bh(&cmd->istate_lock);
+ cmd->cmd_flags |= ICF_GOT_LAST_DATAOUT;
+ cmd->i_state = ISTATE_RECEIVED_LAST_DATAOUT;
+ spin_unlock_bh(&cmd->istate_lock);
+ }
+
+sequence_cmd:
+ rc = iscsit_sequence_cmd(conn, cmd, buf, hdr->cmdsn);
+
+ if (!rc && dump_payload == false && unsol_data)
+ iscsit_set_unsoliticed_dataout(cmd);
+ else if (dump_payload && imm_data)
+ target_put_sess_cmd(conn->sess->se_sess, &cmd->se_cmd);
+
+ return 0;
+}
+
+static int
+isert_handle_iscsi_dataout(struct isert_conn *isert_conn,
+ struct iser_rx_desc *rx_desc, unsigned char *buf)
+{
+ struct scatterlist *sg_start;
+ struct iscsi_conn *conn = isert_conn->conn;
+ struct iscsi_cmd *cmd = NULL;
+ struct iscsi_data *hdr = (struct iscsi_data *)buf;
+ u32 unsol_data_len = ntoh24(hdr->dlength);
+ int rc, sg_nents, sg_off, page_off;
+
+ rc = iscsit_check_dataout_hdr(conn, buf, &cmd);
+ if (rc < 0)
+ return rc;
+ else if (!cmd)
+ return 0;
+ /*
+ * FIXME: Unexpected unsolicited_data out
+ */
+ if (!cmd->unsolicited_data) {
+ isert_err("Received unexpected solicited data payload\n");
+ dump_stack();
+ return -1;
+ }
+
+ isert_dbg("Unsolicited DataOut unsol_data_len: %u, "
+ "write_data_done: %u, data_length: %u\n",
+ unsol_data_len, cmd->write_data_done,
+ cmd->se_cmd.data_length);
+
+ sg_off = cmd->write_data_done / PAGE_SIZE;
+ sg_start = &cmd->se_cmd.t_data_sg[sg_off];
+ sg_nents = max(1UL, DIV_ROUND_UP(unsol_data_len, PAGE_SIZE));
+ page_off = cmd->write_data_done % PAGE_SIZE;
+ /*
+ * FIXME: Non page-aligned unsolicited_data out
+ */
+ if (page_off) {
+ isert_err("unexpected non-page aligned data payload\n");
+ dump_stack();
+ return -1;
+ }
+ isert_dbg("Copying DataOut: sg_start: %p, sg_off: %u "
+ "sg_nents: %u from %p %u\n", sg_start, sg_off,
+ sg_nents, &rx_desc->data[0], unsol_data_len);
+
+ sg_copy_from_buffer(sg_start, sg_nents, &rx_desc->data[0],
+ unsol_data_len);
+
+ rc = iscsit_check_dataout_payload(cmd, hdr, false);
+ if (rc < 0)
+ return rc;
+
+ return 0;
+}
+
+static int
+isert_handle_nop_out(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd,
+ struct iscsi_cmd *cmd, struct iser_rx_desc *rx_desc,
+ unsigned char *buf)
+{
+ struct iscsi_conn *conn = isert_conn->conn;
+ struct iscsi_nopout *hdr = (struct iscsi_nopout *)buf;
+ int rc;
+
+ rc = iscsit_setup_nop_out(conn, cmd, hdr);
+ if (rc < 0)
+ return rc;
+ /*
+ * FIXME: Add support for NOPOUT payload using unsolicited RDMA payload
+ */
+
+ return iscsit_process_nop_out(conn, cmd, hdr);
+}
+
+static int
+isert_handle_text_cmd(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd,
+ struct iscsi_cmd *cmd, struct iser_rx_desc *rx_desc,
+ struct iscsi_text *hdr)
+{
+ struct iscsi_conn *conn = isert_conn->conn;
+ u32 payload_length = ntoh24(hdr->dlength);
+ int rc;
+ unsigned char *text_in = NULL;
+
+ rc = iscsit_setup_text_cmd(conn, cmd, hdr);
+ if (rc < 0)
+ return rc;
+
+ if (payload_length) {
+ text_in = kzalloc(payload_length, GFP_KERNEL);
+ if (!text_in) {
+ isert_err("Unable to allocate text_in of payload_length: %u\n",
+ payload_length);
+ return -ENOMEM;
+ }
+ }
+ cmd->text_in_ptr = text_in;
+
+ if (payload_length)
+ memcpy(cmd->text_in_ptr, &rx_desc->data[0], payload_length);
+
+ return iscsit_process_text_cmd(conn, cmd, hdr);
+}
+
+static int
+isert_rx_opcode(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc,
+ uint32_t read_stag, uint64_t read_va,
+ uint32_t write_stag, uint64_t write_va)
+{
+ struct iscsi_hdr *hdr = &rx_desc->iscsi_header;
+ struct iscsi_conn *conn = isert_conn->conn;
+ struct iscsi_cmd *cmd;
+ struct isert_cmd *isert_cmd;
+ int ret = -EINVAL;
+ u8 opcode = (hdr->opcode & ISCSI_OPCODE_MASK);
+
+ if (conn->sess->sess_ops->SessionType &&
+ (!(opcode & ISCSI_OP_TEXT) || !(opcode & ISCSI_OP_LOGOUT))) {
+ isert_err("Got illegal opcode: 0x%02x in SessionType=Discovery,"
+ " ignoring\n", opcode);
+ return 0;
+ }
+
+ switch (opcode) {
+ case ISCSI_OP_SCSI_CMD:
+ cmd = isert_allocate_cmd(conn);
+ if (!cmd)
+ break;
+
+ isert_cmd = iscsit_priv_cmd(cmd);
+ isert_cmd->read_stag = read_stag;
+ isert_cmd->read_va = read_va;
+ isert_cmd->write_stag = write_stag;
+ isert_cmd->write_va = write_va;
+
+ ret = isert_handle_scsi_cmd(isert_conn, isert_cmd, cmd,
+ rx_desc, (unsigned char *)hdr);
+ break;
+ case ISCSI_OP_NOOP_OUT:
+ cmd = isert_allocate_cmd(conn);
+ if (!cmd)
+ break;
+
+ isert_cmd = iscsit_priv_cmd(cmd);
+ ret = isert_handle_nop_out(isert_conn, isert_cmd, cmd,
+ rx_desc, (unsigned char *)hdr);
+ break;
+ case ISCSI_OP_SCSI_DATA_OUT:
+ ret = isert_handle_iscsi_dataout(isert_conn, rx_desc,
+ (unsigned char *)hdr);
+ break;
+ case ISCSI_OP_SCSI_TMFUNC:
+ cmd = isert_allocate_cmd(conn);
+ if (!cmd)
+ break;
+
+ ret = iscsit_handle_task_mgt_cmd(conn, cmd,
+ (unsigned char *)hdr);
+ break;
+ case ISCSI_OP_LOGOUT:
+ cmd = isert_allocate_cmd(conn);
+ if (!cmd)
+ break;
+
+ ret = iscsit_handle_logout_cmd(conn, cmd, (unsigned char *)hdr);
+ break;
+ case ISCSI_OP_TEXT:
+ if (be32_to_cpu(hdr->ttt) != 0xFFFFFFFF) {
+ cmd = iscsit_find_cmd_from_itt(conn, hdr->itt);
+ if (!cmd)
+ break;
+ } else {
+ cmd = isert_allocate_cmd(conn);
+ if (!cmd)
+ break;
+ }
+
+ isert_cmd = iscsit_priv_cmd(cmd);
+ ret = isert_handle_text_cmd(isert_conn, isert_cmd, cmd,
+ rx_desc, (struct iscsi_text *)hdr);
+ break;
+ default:
+ isert_err("Got unknown iSCSI OpCode: 0x%02x\n", opcode);
+ dump_stack();
+ break;
+ }
+
+ return ret;
+}
+
+static void
+isert_rx_do_work(struct iser_rx_desc *rx_desc, struct isert_conn *isert_conn)
+{
+ struct iser_hdr *iser_hdr = &rx_desc->iser_header;
+ uint64_t read_va = 0, write_va = 0;
+ uint32_t read_stag = 0, write_stag = 0;
+ int rc;
+
+ switch (iser_hdr->flags & 0xF0) {
+ case ISCSI_CTRL:
+ if (iser_hdr->flags & ISER_RSV) {
+ read_stag = be32_to_cpu(iser_hdr->read_stag);
+ read_va = be64_to_cpu(iser_hdr->read_va);
+ isert_dbg("ISER_RSV: read_stag: 0x%x read_va: 0x%llx\n",
+ read_stag, (unsigned long long)read_va);
+ }
+ if (iser_hdr->flags & ISER_WSV) {
+ write_stag = be32_to_cpu(iser_hdr->write_stag);
+ write_va = be64_to_cpu(iser_hdr->write_va);
+ isert_dbg("ISER_WSV: write_stag: 0x%x write_va: 0x%llx\n",
+ write_stag, (unsigned long long)write_va);
+ }
+
+ isert_dbg("ISER ISCSI_CTRL PDU\n");
+ break;
+ case ISER_HELLO:
+ isert_err("iSER Hello message\n");
+ break;
+ default:
+ isert_warn("Unknown iSER hdr flags: 0x%02x\n", iser_hdr->flags);
+ break;
+ }
+
+ rc = isert_rx_opcode(isert_conn, rx_desc,
+ read_stag, read_va, write_stag, write_va);
+}
+
+static void
+isert_rcv_completion(struct iser_rx_desc *desc,
+ struct isert_conn *isert_conn,
+ u32 xfer_len)
+{
+ struct ib_device *ib_dev = isert_conn->cm_id->device;
+ struct iscsi_hdr *hdr;
+ u64 rx_dma;
+ int rx_buflen, outstanding;
+
+ if ((char *)desc == isert_conn->login_req_buf) {
+ rx_dma = isert_conn->login_req_dma;
+ rx_buflen = ISER_RX_LOGIN_SIZE;
+ isert_dbg("login_buf: Using rx_dma: 0x%llx, rx_buflen: %d\n",
+ rx_dma, rx_buflen);
+ } else {
+ rx_dma = desc->dma_addr;
+ rx_buflen = ISER_RX_PAYLOAD_SIZE;
+ isert_dbg("req_buf: Using rx_dma: 0x%llx, rx_buflen: %d\n",
+ rx_dma, rx_buflen);
+ }
+
+ ib_dma_sync_single_for_cpu(ib_dev, rx_dma, rx_buflen, DMA_FROM_DEVICE);
+
+ hdr = &desc->iscsi_header;
+ isert_dbg("iSCSI opcode: 0x%02x, ITT: 0x%08x, flags: 0x%02x dlen: %d\n",
+ hdr->opcode, hdr->itt, hdr->flags,
+ (int)(xfer_len - ISER_HEADERS_LEN));
+
+ if ((char *)desc == isert_conn->login_req_buf) {
+ isert_conn->login_req_len = xfer_len - ISER_HEADERS_LEN;
+ if (isert_conn->conn) {
+ struct iscsi_login *login = isert_conn->conn->conn_login;
+
+ if (login && !login->first_request)
+ isert_rx_login_req(isert_conn);
+ }
+ mutex_lock(&isert_conn->mutex);
+ complete(&isert_conn->login_req_comp);
+ mutex_unlock(&isert_conn->mutex);
+ } else {
+ isert_rx_do_work(desc, isert_conn);
+ }
+
+ ib_dma_sync_single_for_device(ib_dev, rx_dma, rx_buflen,
+ DMA_FROM_DEVICE);
+
+ isert_conn->post_recv_buf_count--;
+ isert_dbg("Decremented post_recv_buf_count: %d\n",
+ isert_conn->post_recv_buf_count);
+
+ if ((char *)desc == isert_conn->login_req_buf)
+ return;
+
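+ /*
+  * Replenish the receive queue in batches of ISERT_MIN_POSTED_RX
+  * once enough posted buffers have been consumed.
+  */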
+ outstanding = isert_conn->post_recv_buf_count;
+ if (outstanding + ISERT_MIN_POSTED_RX <= ISERT_QP_MAX_RECV_DTOS) {
+ int err, count = min(ISERT_QP_MAX_RECV_DTOS - outstanding,
+ ISERT_MIN_POSTED_RX);
+ err = isert_post_recv(isert_conn, count);
+ if (err) {
+ isert_err("isert_post_recv() count: %d failed, %d\n",
+ count, err);
+ }
+ }
+}
+
+static int
+isert_map_data_buf(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd,
+ struct scatterlist *sg, u32 nents, u32 length, u32 offset,
+ enum iser_ib_op_code op, struct isert_data_buf *data)
+{
+ struct ib_device *ib_dev = isert_conn->cm_id->device;
+
+ data->dma_dir = op == ISER_IB_RDMA_WRITE ?
+ DMA_TO_DEVICE : DMA_FROM_DEVICE;
+
+ data->len = length - offset;
+ data->offset = offset;
+ data->sg_off = data->offset / PAGE_SIZE;
+
+ data->sg = &sg[data->sg_off];
+ data->nents = min_t(unsigned int, nents - data->sg_off,
+ ISCSI_ISER_SG_TABLESIZE);
+ data->len = min_t(unsigned int, data->len, ISCSI_ISER_SG_TABLESIZE *
+ PAGE_SIZE);
+
+ data->dma_nents = ib_dma_map_sg(ib_dev, data->sg, data->nents,
+ data->dma_dir);
+ if (unlikely(!data->dma_nents)) {
+ isert_err("Cmd: unable to dma map SGs %p\n", sg);
+ return -EINVAL;
+ }
+
+ isert_dbg("Mapped cmd: %p count: %u sg: %p sg_nents: %u rdma_len %d\n",
+ isert_cmd, data->dma_nents, data->sg, data->nents, data->len);
+
+ return 0;
+}
+
+static void
+isert_unmap_data_buf(struct isert_conn *isert_conn, struct isert_data_buf *data)
+{
+ struct ib_device *ib_dev = isert_conn->cm_id->device;
+
+ ib_dma_unmap_sg(ib_dev, data->sg, data->nents, data->dma_dir);
+ memset(data, 0, sizeof(*data));
+}
+
+static void
+isert_unmap_cmd(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn)
+{
+ struct isert_rdma_wr *wr = &isert_cmd->rdma_wr;
+
+ isert_dbg("Cmd %p\n", isert_cmd);
+
+ if (wr->data.sg) {
+ isert_dbg("Cmd %p unmap_sg op\n", isert_cmd);
+ isert_unmap_data_buf(isert_conn, &wr->data);
+ }
+
+ if (wr->send_wr) {
+ isert_dbg("Cmd %p free send_wr\n", isert_cmd);
+ kfree(wr->send_wr);
+ wr->send_wr = NULL;
+ }
+
+ if (wr->ib_sge) {
+ isert_dbg("Cmd %p free ib_sge\n", isert_cmd);
+ kfree(wr->ib_sge);
+ wr->ib_sge = NULL;
+ }
+}
+
+static void
+isert_unreg_rdma(struct isert_cmd *isert_cmd, struct isert_conn *isert_conn)
+{
+ struct isert_rdma_wr *wr = &isert_cmd->rdma_wr;
+
+ isert_dbg("Cmd %p\n", isert_cmd);
+
+ if (wr->fr_desc) {
+ isert_dbg("Cmd %p free fr_desc %p\n", isert_cmd, wr->fr_desc);
+ if (wr->fr_desc->ind & ISERT_PROTECTED) {
+ isert_unmap_data_buf(isert_conn, &wr->prot);
+ wr->fr_desc->ind &= ~ISERT_PROTECTED;
+ }
+ spin_lock_bh(&isert_conn->pool_lock);
+ list_add_tail(&wr->fr_desc->list, &isert_conn->fr_pool);
+ spin_unlock_bh(&isert_conn->pool_lock);
+ wr->fr_desc = NULL;
+ }
+
+ if (wr->data.sg) {
+ isert_dbg("Cmd %p unmap_sg op\n", isert_cmd);
+ isert_unmap_data_buf(isert_conn, &wr->data);
+ }
+
+ wr->ib_sge = NULL;
+ wr->send_wr = NULL;
+}
+
+static void
+isert_put_cmd(struct isert_cmd *isert_cmd, bool comp_err)
+{
+ struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd;
+ struct isert_conn *isert_conn = isert_cmd->conn;
+ struct iscsi_conn *conn = isert_conn->conn;
+ struct isert_device *device = isert_conn->device;
+ struct iscsi_text_rsp *hdr;
+
+ isert_dbg("Cmd %p\n", isert_cmd);
+
+ switch (cmd->iscsi_opcode) {
+ case ISCSI_OP_SCSI_CMD:
+ spin_lock_bh(&conn->cmd_lock);
+ if (!list_empty(&cmd->i_conn_node))
+ list_del_init(&cmd->i_conn_node);
+ spin_unlock_bh(&conn->cmd_lock);
+
+ if (cmd->data_direction == DMA_TO_DEVICE) {
+ iscsit_stop_dataout_timer(cmd);
+ /*
+ * Check for special case during comp_err where
+ * WRITE_PENDING has been handed off from core,
+ * but requires an extra target_put_sess_cmd()
+ * before transport_generic_free_cmd() below.
+ */
+ if (comp_err &&
+ cmd->se_cmd.t_state == TRANSPORT_WRITE_PENDING) {
+ struct se_cmd *se_cmd = &cmd->se_cmd;
+
+ target_put_sess_cmd(se_cmd->se_sess, se_cmd);
+ }
+ }
+
+ device->unreg_rdma_mem(isert_cmd, isert_conn);
+ transport_generic_free_cmd(&cmd->se_cmd, 0);
+ break;
+ case ISCSI_OP_SCSI_TMFUNC:
+ spin_lock_bh(&conn->cmd_lock);
+ if (!list_empty(&cmd->i_conn_node))
+ list_del_init(&cmd->i_conn_node);
+ spin_unlock_bh(&conn->cmd_lock);
+
+ transport_generic_free_cmd(&cmd->se_cmd, 0);
+ break;
+ case ISCSI_OP_REJECT:
+ case ISCSI_OP_NOOP_OUT:
+ case ISCSI_OP_TEXT:
+ hdr = (struct iscsi_text_rsp *)&isert_cmd->tx_desc.iscsi_header;
+ /* If the continue bit is on, keep the command alive */
+ if (hdr->flags & ISCSI_FLAG_TEXT_CONTINUE)
+ break;
+
+ spin_lock_bh(&conn->cmd_lock);
+ if (!list_empty(&cmd->i_conn_node))
+ list_del_init(&cmd->i_conn_node);
+ spin_unlock_bh(&conn->cmd_lock);
+
+ /*
+ * Handle special case for REJECT when iscsi_add_reject*() has
+ * overwritten the original iscsi_opcode assignment, and the
+ * associated cmd->se_cmd needs to be released.
+ */
+ if (cmd->se_cmd.se_tfo != NULL) {
+ isert_dbg("Calling transport_generic_free_cmd for 0x%02x\n",
+ cmd->iscsi_opcode);
+ transport_generic_free_cmd(&cmd->se_cmd, 0);
+ break;
+ }
+ /*
+ * Fall-through
+ */
+ default:
+ iscsit_release_cmd(cmd);
+ break;
+ }
+}
+
+static void
+isert_unmap_tx_desc(struct iser_tx_desc *tx_desc, struct ib_device *ib_dev)
+{
+ if (tx_desc->dma_addr != 0) {
+ isert_dbg("unmap single for tx_desc->dma_addr\n");
+ ib_dma_unmap_single(ib_dev, tx_desc->dma_addr,
+ ISER_HEADERS_LEN, DMA_TO_DEVICE);
+ tx_desc->dma_addr = 0;
+ }
+}
+
+static void
+isert_completion_put(struct iser_tx_desc *tx_desc, struct isert_cmd *isert_cmd,
+ struct ib_device *ib_dev, bool comp_err)
+{
+ if (isert_cmd->pdu_buf_dma != 0) {
+ isert_dbg("unmap single for isert_cmd->pdu_buf_dma\n");
+ ib_dma_unmap_single(ib_dev, isert_cmd->pdu_buf_dma,
+ isert_cmd->pdu_buf_len, DMA_TO_DEVICE);
+ isert_cmd->pdu_buf_dma = 0;
+ }
+
+ isert_unmap_tx_desc(tx_desc, ib_dev);
+ isert_put_cmd(isert_cmd, comp_err);
+}
+
+static int
+isert_check_pi_status(struct se_cmd *se_cmd, struct ib_mr *sig_mr)
+{
+ struct ib_mr_status mr_status;
+ int ret;
+
+ ret = ib_check_mr_status(sig_mr, IB_MR_CHECK_SIG_STATUS, &mr_status);
+ if (ret) {
+ isert_err("ib_check_mr_status failed, ret %d\n", ret);
+ goto fail_mr_status;
+ }
+
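+ /*
+  * On a signature error, convert the reported byte offset back into
+  * a logical block number; the wire block size includes the 8-byte
+  * T10-PI tuple appended to each block.
+  */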
+ if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) {
+ u64 sec_offset_err;
+ u32 block_size = se_cmd->se_dev->dev_attrib.block_size + 8;
+
+ switch (mr_status.sig_err.err_type) {
+ case IB_SIG_BAD_GUARD:
+ se_cmd->pi_err = TCM_LOGICAL_BLOCK_GUARD_CHECK_FAILED;
+ break;
+ case IB_SIG_BAD_REFTAG:
+ se_cmd->pi_err = TCM_LOGICAL_BLOCK_REF_TAG_CHECK_FAILED;
+ break;
+ case IB_SIG_BAD_APPTAG:
+ se_cmd->pi_err = TCM_LOGICAL_BLOCK_APP_TAG_CHECK_FAILED;
+ break;
+ }
+ sec_offset_err = mr_status.sig_err.sig_err_offset;
+ do_div(sec_offset_err, block_size);
+ se_cmd->bad_sector = sec_offset_err + se_cmd->t_task_lba;
+
+ isert_err("PI error found type %d at sector 0x%llx "
+ "expected 0x%x vs actual 0x%x\n",
+ mr_status.sig_err.err_type,
+ (unsigned long long)se_cmd->bad_sector,
+ mr_status.sig_err.expected,
+ mr_status.sig_err.actual);
+ ret = 1;
+ }
+
+fail_mr_status:
+ return ret;
+}
+
+static void
+isert_completion_rdma_write(struct iser_tx_desc *tx_desc,
+ struct isert_cmd *isert_cmd)
+{
+ struct isert_rdma_wr *wr = &isert_cmd->rdma_wr;
+ struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd;
+ struct se_cmd *se_cmd = &cmd->se_cmd;
+ struct isert_conn *isert_conn = isert_cmd->conn;
+ struct isert_device *device = isert_conn->device;
+ int ret = 0;
+
+ if (wr->fr_desc && wr->fr_desc->ind & ISERT_PROTECTED) {
+ ret = isert_check_pi_status(se_cmd,
+ wr->fr_desc->pi_ctx->sig_mr);
+ wr->fr_desc->ind &= ~ISERT_PROTECTED;
+ }
+
+ device->unreg_rdma_mem(isert_cmd, isert_conn);
+ wr->send_wr_num = 0;
+ if (ret)
+ transport_send_check_condition_and_sense(se_cmd,
+ se_cmd->pi_err, 0);
+ else
+ isert_put_response(isert_conn->conn, cmd);
+}
+
+static void
+isert_completion_rdma_read(struct iser_tx_desc *tx_desc,
+ struct isert_cmd *isert_cmd)
+{
+ struct isert_rdma_wr *wr = &isert_cmd->rdma_wr;
+ struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd;
+ struct se_cmd *se_cmd = &cmd->se_cmd;
+ struct isert_conn *isert_conn = isert_cmd->conn;
+ struct isert_device *device = isert_conn->device;
+ int ret = 0;
+
+ if (wr->fr_desc && wr->fr_desc->ind & ISERT_PROTECTED) {
+ ret = isert_check_pi_status(se_cmd,
+ wr->fr_desc->pi_ctx->sig_mr);
+ wr->fr_desc->ind &= ~ISERT_PROTECTED;
+ }
+
+ iscsit_stop_dataout_timer(cmd);
+ device->unreg_rdma_mem(isert_cmd, isert_conn);
+ cmd->write_data_done = wr->data.len;
+ wr->send_wr_num = 0;
+
+ isert_dbg("Cmd: %p RDMA_READ comp calling execute_cmd\n", isert_cmd);
+ spin_lock_bh(&cmd->istate_lock);
+ cmd->cmd_flags |= ICF_GOT_LAST_DATAOUT;
+ cmd->i_state = ISTATE_RECEIVED_LAST_DATAOUT;
+ spin_unlock_bh(&cmd->istate_lock);
+
+ if (ret) {
+ target_put_sess_cmd(se_cmd->se_sess, se_cmd);
+ transport_send_check_condition_and_sense(se_cmd,
+ se_cmd->pi_err, 0);
+ } else {
+ target_execute_cmd(se_cmd);
+ }
+}
+
+static void
+isert_do_control_comp(struct work_struct *work)
+{
+ struct isert_cmd *isert_cmd = container_of(work,
+ struct isert_cmd, comp_work);
+ struct isert_conn *isert_conn = isert_cmd->conn;
+ struct ib_device *ib_dev = isert_conn->cm_id->device;
+ struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd;
+
+ isert_dbg("Cmd %p i_state %d\n", isert_cmd, cmd->i_state);
+
+ switch (cmd->i_state) {
+ case ISTATE_SEND_TASKMGTRSP:
+ iscsit_tmr_post_handler(cmd, cmd->conn);
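+ /* FALLTHRU */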
+ case ISTATE_SEND_REJECT: /* FALLTHRU */
+ case ISTATE_SEND_TEXTRSP: /* FALLTHRU */
+ cmd->i_state = ISTATE_SENT_STATUS;
+ isert_completion_put(&isert_cmd->tx_desc, isert_cmd,
+ ib_dev, false);
+ break;
+ case ISTATE_SEND_LOGOUTRSP:
+ iscsit_logout_post_handler(cmd, cmd->conn);
+ break;
+ default:
+ isert_err("Unknown i_state %d\n", cmd->i_state);
+ dump_stack();
+ break;
+ }
+}
+
+static void
+isert_response_completion(struct iser_tx_desc *tx_desc,
+ struct isert_cmd *isert_cmd,
+ struct isert_conn *isert_conn,
+ struct ib_device *ib_dev)
+{
+ struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd;
+
+ if (cmd->i_state == ISTATE_SEND_TASKMGTRSP ||
+ cmd->i_state == ISTATE_SEND_LOGOUTRSP ||
+ cmd->i_state == ISTATE_SEND_REJECT ||
+ cmd->i_state == ISTATE_SEND_TEXTRSP) {
+ isert_unmap_tx_desc(tx_desc, ib_dev);
+
+ INIT_WORK(&isert_cmd->comp_work, isert_do_control_comp);
+ queue_work(isert_comp_wq, &isert_cmd->comp_work);
+ return;
+ }
+
+ cmd->i_state = ISTATE_SENT_STATUS;
+ isert_completion_put(tx_desc, isert_cmd, ib_dev, false);
+}
+
+static void
+isert_snd_completion(struct iser_tx_desc *tx_desc,
+ struct isert_conn *isert_conn)
+{
+ struct ib_device *ib_dev = isert_conn->cm_id->device;
+ struct isert_cmd *isert_cmd = tx_desc->isert_cmd;
+ struct isert_rdma_wr *wr;
+
+ if (!isert_cmd) {
+ isert_unmap_tx_desc(tx_desc, ib_dev);
+ return;
+ }
+ wr = &isert_cmd->rdma_wr;
+
+ isert_dbg("Cmd %p iser_ib_op %d\n", isert_cmd, wr->iser_ib_op);
+
+ switch (wr->iser_ib_op) {
+ case ISER_IB_SEND:
+ isert_response_completion(tx_desc, isert_cmd,
+ isert_conn, ib_dev);
+ break;
+ case ISER_IB_RDMA_WRITE:
+ isert_completion_rdma_write(tx_desc, isert_cmd);
+ break;
+ case ISER_IB_RDMA_READ:
+ isert_completion_rdma_read(tx_desc, isert_cmd);
+ break;
+ default:
+ isert_err("Unknown wr->iser_ib_op: 0x%x\n", wr->iser_ib_op);
+ dump_stack();
+ break;
+ }
+}
+
+/**
+ * is_isert_tx_desc() - Indicate if the completion wr_id
+ * is a TX descriptor or not.
+ * @isert_conn: iser connection
+ * @wr_id: completion WR identifier
+ *
+ * Since we cannot rely on wc opcode in FLUSH errors
+ * we must work around it by checking if the wr_id address
+ * falls in the iser connection rx_descs buffer. If so
+ * it is an RX descriptor, otherwize it is a TX.
+ */
+static inline bool
+is_isert_tx_desc(struct isert_conn *isert_conn, void *wr_id)
+{
+ void *start = isert_conn->rx_descs;
+ int len = ISERT_QP_MAX_RECV_DTOS * sizeof(*isert_conn->rx_descs);
+
+ if (wr_id >= start && wr_id < start + len)
+ return false;
+
+ return true;
+}
+
+static void
+isert_cq_comp_err(struct isert_conn *isert_conn, struct ib_wc *wc)
+{
+ if (wc->wr_id == ISER_BEACON_WRID) {
+ isert_info("conn %p completing wait_comp_err\n",
+ isert_conn);
+ complete(&isert_conn->wait_comp_err);
+ } else if (is_isert_tx_desc(isert_conn, (void *)(uintptr_t)wc->wr_id)) {
+ struct ib_device *ib_dev = isert_conn->cm_id->device;
+ struct isert_cmd *isert_cmd;
+ struct iser_tx_desc *desc;
+
+ desc = (struct iser_tx_desc *)(uintptr_t)wc->wr_id;
+ isert_cmd = desc->isert_cmd;
+ if (!isert_cmd)
+ isert_unmap_tx_desc(desc, ib_dev);
+ else
+ isert_completion_put(desc, isert_cmd, ib_dev, true);
+ } else {
+ isert_conn->post_recv_buf_count--;
+ if (!isert_conn->post_recv_buf_count)
+ iscsit_cause_connection_reinstatement(isert_conn->conn, 0);
+ }
+}
+
+static void
+isert_handle_wc(struct ib_wc *wc)
+{
+ struct isert_conn *isert_conn;
+ struct iser_tx_desc *tx_desc;
+ struct iser_rx_desc *rx_desc;
+
+ isert_conn = wc->qp->qp_context;
+ if (likely(wc->status == IB_WC_SUCCESS)) {
+ if (wc->opcode == IB_WC_RECV) {
+ rx_desc = (struct iser_rx_desc *)(uintptr_t)wc->wr_id;
+ isert_rcv_completion(rx_desc, isert_conn, wc->byte_len);
+ } else {
+ tx_desc = (struct iser_tx_desc *)(uintptr_t)wc->wr_id;
+ isert_snd_completion(tx_desc, isert_conn);
+ }
+ } else {
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ isert_err("wr id %llx status %d vend_err %x\n",
+ wc->wr_id, wc->status, wc->vendor_err);
+ else
+ isert_dbg("flush error: wr id %llx\n", wc->wr_id);
+
+ if (wc->wr_id != ISER_FASTREG_LI_WRID)
+ isert_cq_comp_err(isert_conn, wc);
+ }
+}
+
+static void
+isert_cq_work(struct work_struct *work)
+{
+ enum { isert_poll_budget = 65536 };
+ struct isert_comp *comp = container_of(work, struct isert_comp,
+ work);
+ struct ib_wc *const wcs = comp->wcs;
+ int i, n, completed = 0;
+
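+ /*
+  * Drain the CQ in ARRAY_SIZE(comp->wcs) chunks, bounded by the
+  * poll budget; completion notification is re-armed once the loop
+  * exits.
+  */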
+ while ((n = ib_poll_cq(comp->cq, ARRAY_SIZE(comp->wcs), wcs)) > 0) {
+ for (i = 0; i < n; i++)
+ isert_handle_wc(&wcs[i]);
+
+ completed += n;
+ if (completed >= isert_poll_budget)
+ break;
+ }
+
+ ib_req_notify_cq(comp->cq, IB_CQ_NEXT_COMP);
+}
+
+static void
+isert_cq_callback(struct ib_cq *cq, void *context)
+{
+ struct isert_comp *comp = context;
+
+ queue_work(isert_comp_wq, &comp->work);
+}
+
+static int
+isert_post_response(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd)
+{
+ struct ib_send_wr *wr_failed;
+ int ret;
+
+ ret = ib_post_send(isert_conn->qp, &isert_cmd->tx_desc.send_wr,
+ &wr_failed);
+ if (ret) {
+ isert_err("ib_post_send failed with %d\n", ret);
+ return ret;
+ }
+ return ret;
+}
+
+static int
+isert_put_response(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
+{
+ struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
+ struct isert_conn *isert_conn = conn->context;
+ struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr;
+ struct iscsi_scsi_rsp *hdr = (struct iscsi_scsi_rsp *)
+ &isert_cmd->tx_desc.iscsi_header;
+
+ isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc);
+ iscsit_build_rsp_pdu(cmd, conn, true, hdr);
+ isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc);
+ /*
+ * Attach SENSE DATA payload to iSCSI Response PDU
+ */
+ if (cmd->se_cmd.sense_buffer &&
+ ((cmd->se_cmd.se_cmd_flags & SCF_TRANSPORT_TASK_SENSE) ||
+ (cmd->se_cmd.se_cmd_flags & SCF_EMULATED_TASK_SENSE))) {
+ struct isert_device *device = isert_conn->device;
+ struct ib_device *ib_dev = device->ib_device;
+ struct ib_sge *tx_dsg = &isert_cmd->tx_desc.tx_sg[1];
+ u32 padding, pdu_len;
+
+ put_unaligned_be16(cmd->se_cmd.scsi_sense_length,
+ cmd->sense_buffer);
+ cmd->se_cmd.scsi_sense_length += sizeof(__be16);
+
+ padding = -(cmd->se_cmd.scsi_sense_length) & 3;
+ hton24(hdr->dlength, (u32)cmd->se_cmd.scsi_sense_length);
+ pdu_len = cmd->se_cmd.scsi_sense_length + padding;
+
+ isert_cmd->pdu_buf_dma = ib_dma_map_single(ib_dev,
+ (void *)cmd->sense_buffer, pdu_len,
+ DMA_TO_DEVICE);
+
+ isert_cmd->pdu_buf_len = pdu_len;
+ tx_dsg->addr = isert_cmd->pdu_buf_dma;
+ tx_dsg->length = pdu_len;
+ tx_dsg->lkey = device->mr->lkey;
+ isert_cmd->tx_desc.num_sge = 2;
+ }
+
+ isert_init_send_wr(isert_conn, isert_cmd, send_wr);
+
+ isert_dbg("Posting SCSI Response\n");
+
+ return isert_post_response(isert_conn, isert_cmd);
+}
+
+static void
+isert_aborted_task(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
+{
+ struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
+ struct isert_conn *isert_conn = conn->context;
+ struct isert_device *device = isert_conn->device;
+
+ spin_lock_bh(&conn->cmd_lock);
+ if (!list_empty(&cmd->i_conn_node))
+ list_del_init(&cmd->i_conn_node);
+ spin_unlock_bh(&conn->cmd_lock);
+
+ if (cmd->data_direction == DMA_TO_DEVICE)
+ iscsit_stop_dataout_timer(cmd);
+
+ device->unreg_rdma_mem(isert_cmd, isert_conn);
+}
+
+static enum target_prot_op
+isert_get_sup_prot_ops(struct iscsi_conn *conn)
+{
+ struct isert_conn *isert_conn = conn->context;
+ struct isert_device *device = isert_conn->device;
+
+ if (conn->tpg->tpg_attrib.t10_pi) {
+ if (device->pi_capable) {
+ isert_info("conn %p PI offload enabled\n", isert_conn);
+ isert_conn->pi_support = true;
+ return TARGET_PROT_ALL;
+ }
+ }
+
+ isert_info("conn %p PI offload disabled\n", isert_conn);
+ isert_conn->pi_support = false;
+
+ return TARGET_PROT_NORMAL;
+}
+
+static int
+isert_put_nopin(struct iscsi_cmd *cmd, struct iscsi_conn *conn,
+ bool nopout_response)
+{
+ struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
+ struct isert_conn *isert_conn = conn->context;
+ struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr;
+
+ isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc);
+ iscsit_build_nopin_rsp(cmd, conn, (struct iscsi_nopin *)
+ &isert_cmd->tx_desc.iscsi_header,
+ nopout_response);
+ isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc);
+ isert_init_send_wr(isert_conn, isert_cmd, send_wr);
+
+ isert_dbg("conn %p Posting NOPIN Response\n", isert_conn);
+
+ return isert_post_response(isert_conn, isert_cmd);
+}
+
+static int
+isert_put_logout_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
+{
+ struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
+ struct isert_conn *isert_conn = conn->context;
+ struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr;
+
+ isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc);
+ iscsit_build_logout_rsp(cmd, conn, (struct iscsi_logout_rsp *)
+ &isert_cmd->tx_desc.iscsi_header);
+ isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc);
+ isert_init_send_wr(isert_conn, isert_cmd, send_wr);
+
+ isert_dbg("conn %p Posting Logout Response\n", isert_conn);
+
+ return isert_post_response(isert_conn, isert_cmd);
+}
+
+static int
+isert_put_tm_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
+{
+ struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
+ struct isert_conn *isert_conn = conn->context;
+ struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr;
+
+ isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc);
+ iscsit_build_task_mgt_rsp(cmd, conn, (struct iscsi_tm_rsp *)
+ &isert_cmd->tx_desc.iscsi_header);
+ isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc);
+ isert_init_send_wr(isert_conn, isert_cmd, send_wr);
+
+ isert_dbg("conn %p Posting Task Management Response\n", isert_conn);
+
+ return isert_post_response(isert_conn, isert_cmd);
+}
+
+static int
+isert_put_reject(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
+{
+ struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
+ struct isert_conn *isert_conn = conn->context;
+ struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr;
+ struct isert_device *device = isert_conn->device;
+ struct ib_device *ib_dev = device->ib_device;
+ struct ib_sge *tx_dsg = &isert_cmd->tx_desc.tx_sg[1];
+ struct iscsi_reject *hdr =
+ (struct iscsi_reject *)&isert_cmd->tx_desc.iscsi_header;
+
+ isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc);
+ iscsit_build_reject(cmd, conn, hdr);
+ isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc);
+
+ hton24(hdr->dlength, ISCSI_HDR_LEN);
+ isert_cmd->pdu_buf_dma = ib_dma_map_single(ib_dev,
+ (void *)cmd->buf_ptr, ISCSI_HDR_LEN,
+ DMA_TO_DEVICE);
+ isert_cmd->pdu_buf_len = ISCSI_HDR_LEN;
+ tx_dsg->addr = isert_cmd->pdu_buf_dma;
+ tx_dsg->length = ISCSI_HDR_LEN;
+ tx_dsg->lkey = device->mr->lkey;
+ isert_cmd->tx_desc.num_sge = 2;
+
+ isert_init_send_wr(isert_conn, isert_cmd, send_wr);
+
+ isert_dbg("conn %p Posting Reject\n", isert_conn);
+
+ return isert_post_response(isert_conn, isert_cmd);
+}
+
+static int
+isert_put_text_rsp(struct iscsi_cmd *cmd, struct iscsi_conn *conn)
+{
+ struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
+ struct isert_conn *isert_conn = conn->context;
+ struct ib_send_wr *send_wr = &isert_cmd->tx_desc.send_wr;
+ struct iscsi_text_rsp *hdr =
+ (struct iscsi_text_rsp *)&isert_cmd->tx_desc.iscsi_header;
+ u32 txt_rsp_len;
+ int rc;
+
+ isert_create_send_desc(isert_conn, isert_cmd, &isert_cmd->tx_desc);
+ rc = iscsit_build_text_rsp(cmd, conn, hdr, ISCSI_INFINIBAND);
+ if (rc < 0)
+ return rc;
+
+ txt_rsp_len = rc;
+ isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc);
+
+ if (txt_rsp_len) {
+ struct isert_device *device = isert_conn->device;
+ struct ib_device *ib_dev = device->ib_device;
+ struct ib_sge *tx_dsg = &isert_cmd->tx_desc.tx_sg[1];
+ void *txt_rsp_buf = cmd->buf_ptr;
+
+ isert_cmd->pdu_buf_dma = ib_dma_map_single(ib_dev,
+ txt_rsp_buf, txt_rsp_len, DMA_TO_DEVICE);
+
+ isert_cmd->pdu_buf_len = txt_rsp_len;
+ tx_dsg->addr = isert_cmd->pdu_buf_dma;
+ tx_dsg->length = txt_rsp_len;
+ tx_dsg->lkey = device->mr->lkey;
+ isert_cmd->tx_desc.num_sge = 2;
+ }
+ isert_init_send_wr(isert_conn, isert_cmd, send_wr);
+
+ isert_dbg("conn %p Text Response\n", isert_conn);
+
+ return isert_post_response(isert_conn, isert_cmd);
+}
+
+static int
+isert_build_rdma_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd,
+ struct ib_sge *ib_sge, struct ib_send_wr *send_wr,
+ u32 data_left, u32 offset)
+{
+ struct iscsi_cmd *cmd = isert_cmd->iscsi_cmd;
+ struct scatterlist *sg_start, *tmp_sg;
+ struct isert_device *device = isert_conn->device;
+ struct ib_device *ib_dev = device->ib_device;
+ u32 sg_off, page_off;
+ int i = 0, sg_nents;
+
+ sg_off = offset / PAGE_SIZE;
+ sg_start = &cmd->se_cmd.t_data_sg[sg_off];
+ sg_nents = min(cmd->se_cmd.t_data_nents - sg_off, isert_conn->max_sge);
+ page_off = offset % PAGE_SIZE;
+
+ send_wr->sg_list = ib_sge;
+ send_wr->wr_id = (uintptr_t)&isert_cmd->tx_desc;
+ /*
+ * Map the TCM scatterlist memory into ib_sge entries using each
+ * SG element's DMA address.
+ */
+ for_each_sg(sg_start, tmp_sg, sg_nents, i) {
+ isert_dbg("RDMA from SGL dma_addr: 0x%llx dma_len: %u, "
+ "page_off: %u\n",
+ (unsigned long long)tmp_sg->dma_address,
+ tmp_sg->length, page_off);
+
+ ib_sge->addr = ib_sg_dma_address(ib_dev, tmp_sg) + page_off;
+ ib_sge->length = min_t(u32, data_left,
+ ib_sg_dma_len(ib_dev, tmp_sg) - page_off);
+ ib_sge->lkey = device->mr->lkey;
+
+ isert_dbg("RDMA ib_sge: addr: 0x%llx length: %u lkey: %x\n",
+ ib_sge->addr, ib_sge->length, ib_sge->lkey);
+ page_off = 0;
+ data_left -= ib_sge->length;
+ if (!data_left)
+ break;
+ ib_sge++;
+ isert_dbg("Incrementing ib_sge pointer to %p\n", ib_sge);
+ }
+
+ send_wr->num_sge = ++i;
+ isert_dbg("Set outgoing sg_list: %p num_sg: %u from TCM SGLs\n",
+ send_wr->sg_list, send_wr->num_sge);
+
+ return send_wr->num_sge;
+}
+
+static int
+isert_map_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd,
+ struct isert_rdma_wr *wr)
+{
+ struct se_cmd *se_cmd = &cmd->se_cmd;
+ struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
+ struct isert_conn *isert_conn = conn->context;
+ struct isert_data_buf *data = &wr->data;
+ struct ib_send_wr *send_wr;
+ struct ib_sge *ib_sge;
+ u32 offset, data_len, data_left, rdma_write_max, va_offset = 0;
+ int ret = 0, i, ib_sge_cnt;
+
+ isert_cmd->tx_desc.isert_cmd = isert_cmd;
+
+ offset = wr->iser_ib_op == ISER_IB_RDMA_READ ? cmd->write_data_done : 0;
+ ret = isert_map_data_buf(isert_conn, isert_cmd, se_cmd->t_data_sg,
+ se_cmd->t_data_nents, se_cmd->data_length,
+ offset, wr->iser_ib_op, &wr->data);
+ if (ret)
+ return ret;
+
+ data_left = data->len;
+ offset = data->offset;
+
+ ib_sge = kzalloc(sizeof(struct ib_sge) * data->nents, GFP_KERNEL);
+ if (!ib_sge) {
+ isert_warn("Unable to allocate ib_sge\n");
+ ret = -ENOMEM;
+ goto unmap_cmd;
+ }
+ wr->ib_sge = ib_sge;
+
+ wr->send_wr_num = DIV_ROUND_UP(data->nents, isert_conn->max_sge);
+ wr->send_wr = kzalloc(sizeof(struct ib_send_wr) * wr->send_wr_num,
+ GFP_KERNEL);
+ if (!wr->send_wr) {
+ isert_dbg("Unable to allocate wr->send_wr\n");
+ ret = -ENOMEM;
+ goto unmap_cmd;
+ }
+
+ wr->isert_cmd = isert_cmd;
+ rdma_write_max = isert_conn->max_sge * PAGE_SIZE;
+
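+ /*
+  * Split the transfer into send_wr_num RDMA work requests, each
+  * covering at most max_sge SGEs (rdma_write_max bytes). For WRITEs
+  * the last WR chains into the iSCSI response send WR; for READs
+  * the last WR is signaled.
+  */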
+ for (i = 0; i < wr->send_wr_num; i++) {
+ send_wr = &isert_cmd->rdma_wr.send_wr[i];
+ data_len = min(data_left, rdma_write_max);
+
+ send_wr->send_flags = 0;
+ if (wr->iser_ib_op == ISER_IB_RDMA_WRITE) {
+ send_wr->opcode = IB_WR_RDMA_WRITE;
+ send_wr->wr.rdma.remote_addr = isert_cmd->read_va + offset;
+ send_wr->wr.rdma.rkey = isert_cmd->read_stag;
+ if (i + 1 == wr->send_wr_num)
+ send_wr->next = &isert_cmd->tx_desc.send_wr;
+ else
+ send_wr->next = &wr->send_wr[i + 1];
+ } else {
+ send_wr->opcode = IB_WR_RDMA_READ;
+ send_wr->wr.rdma.remote_addr = isert_cmd->write_va + va_offset;
+ send_wr->wr.rdma.rkey = isert_cmd->write_stag;
+ if (i + 1 == wr->send_wr_num)
+ send_wr->send_flags = IB_SEND_SIGNALED;
+ else
+ send_wr->next = &wr->send_wr[i + 1];
+ }
+
+ ib_sge_cnt = isert_build_rdma_wr(isert_conn, isert_cmd, ib_sge,
+ send_wr, data_len, offset);
+ ib_sge += ib_sge_cnt;
+
+ offset += data_len;
+ va_offset += data_len;
+ data_left -= data_len;
+ }
+
+ return 0;
+unmap_cmd:
+ isert_unmap_data_buf(isert_conn, data);
+
+ return ret;
+}
+
+static int
+isert_map_fr_pagelist(struct ib_device *ib_dev,
+ struct scatterlist *sg_start, int sg_nents, u64 *fr_pl)
+{
+ u64 start_addr, end_addr, page, chunk_start = 0;
+ struct scatterlist *tmp_sg;
+ int i = 0, new_chunk, last_ent, n_pages;
+
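+ /*
+  * Collapse the SG list into page-aligned entries for the fast
+  * registration page list; elements that do not end on a page
+  * boundary are merged with the following ones into a single chunk.
+  */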
+ n_pages = 0;
+ new_chunk = 1;
+ last_ent = sg_nents - 1;
+ for_each_sg(sg_start, tmp_sg, sg_nents, i) {
+ start_addr = ib_sg_dma_address(ib_dev, tmp_sg);
+ if (new_chunk)
+ chunk_start = start_addr;
+ end_addr = start_addr + ib_sg_dma_len(ib_dev, tmp_sg);
+
+ isert_dbg("SGL[%d] dma_addr: 0x%llx len: %u\n",
+ i, (unsigned long long)tmp_sg->dma_address,
+ tmp_sg->length);
+
+ if ((end_addr & ~PAGE_MASK) && i < last_ent) {
+ new_chunk = 0;
+ continue;
+ }
+ new_chunk = 1;
+
+ page = chunk_start & PAGE_MASK;
+ do {
+ fr_pl[n_pages++] = page;
+ isert_dbg("Mapped page_list[%d] page_addr: 0x%llx\n",
+ n_pages - 1, page);
+ page += PAGE_SIZE;
+ } while (page < end_addr);
+ }
+
+ return n_pages;
+}
+
+static inline void
+isert_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr)
+{
+ u32 rkey;
+
+ memset(inv_wr, 0, sizeof(*inv_wr));
+ inv_wr->wr_id = ISER_FASTREG_LI_WRID;
+ inv_wr->opcode = IB_WR_LOCAL_INV;
+ inv_wr->ex.invalidate_rkey = mr->rkey;
+
+ /* Bump the key */
+ rkey = ib_inc_rkey(mr->rkey);
+ ib_update_fast_reg_key(mr, rkey);
+}
+
+static int
+isert_fast_reg_mr(struct isert_conn *isert_conn,
+ struct fast_reg_descriptor *fr_desc,
+ struct isert_data_buf *mem,
+ enum isert_indicator ind,
+ struct ib_sge *sge)
+{
+ struct isert_device *device = isert_conn->device;
+ struct ib_device *ib_dev = device->ib_device;
+ struct ib_mr *mr;
+ struct ib_fast_reg_page_list *frpl;
+ struct ib_send_wr fr_wr, inv_wr;
+ struct ib_send_wr *bad_wr, *wr = NULL;
+ int ret, pagelist_len;
+ u32 page_off;
+
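+ /*
+  * A single DMA segment can be described directly with the global
+  * DMA MR lkey; no fast registration work request is needed.
+  */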
+ if (mem->dma_nents == 1) {
+ sge->lkey = device->mr->lkey;
+ sge->addr = ib_sg_dma_address(ib_dev, &mem->sg[0]);
+ sge->length = ib_sg_dma_len(ib_dev, &mem->sg[0]);
+ isert_dbg("sge: addr: 0x%llx length: %u lkey: %x\n",
+ sge->addr, sge->length, sge->lkey);
+ return 0;
+ }
+
+ if (ind == ISERT_DATA_KEY_VALID) {
+ /* Registering data buffer */
+ mr = fr_desc->data_mr;
+ frpl = fr_desc->data_frpl;
+ } else {
+ /* Registering protection buffer */
+ mr = fr_desc->pi_ctx->prot_mr;
+ frpl = fr_desc->pi_ctx->prot_frpl;
+ }
+
+ page_off = mem->offset % PAGE_SIZE;
+
+ isert_dbg("Use fr_desc %p sg_nents %d offset %u\n",
+ fr_desc, mem->nents, mem->offset);
+
+ pagelist_len = isert_map_fr_pagelist(ib_dev, mem->sg, mem->nents,
+ &frpl->page_list[0]);
+
+ if (!(fr_desc->ind & ind)) {
+ isert_inv_rkey(&inv_wr, mr);
+ wr = &inv_wr;
+ }
+
+ /* Prepare FASTREG WR */
+ memset(&fr_wr, 0, sizeof(fr_wr));
+ fr_wr.wr_id = ISER_FASTREG_LI_WRID;
+ fr_wr.opcode = IB_WR_FAST_REG_MR;
+ fr_wr.wr.fast_reg.iova_start = frpl->page_list[0] + page_off;
+ fr_wr.wr.fast_reg.page_list = frpl;
+ fr_wr.wr.fast_reg.page_list_len = pagelist_len;
+ fr_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
+ fr_wr.wr.fast_reg.length = mem->len;
+ fr_wr.wr.fast_reg.rkey = mr->rkey;
+ fr_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE;
+
+ if (!wr)
+ wr = &fr_wr;
+ else
+ wr->next = &fr_wr;
+
+ ret = ib_post_send(isert_conn->qp, wr, &bad_wr);
+ if (ret) {
+ isert_err("fast registration failed, ret:%d\n", ret);
+ return ret;
+ }
+ fr_desc->ind &= ~ind;
+
+ sge->lkey = mr->lkey;
+ sge->addr = frpl->page_list[0] + page_off;
+ sge->length = mem->len;
+
+ isert_dbg("sge: addr: 0x%llx length: %u lkey: %x\n",
+ sge->addr, sge->length, sge->lkey);
+
+ return ret;
+}
+
+static inline void
+isert_set_dif_domain(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs,
+ struct ib_sig_domain *domain)
+{
+ domain->sig_type = IB_SIG_TYPE_T10_DIF;
+ domain->sig.dif.bg_type = IB_T10DIF_CRC;
+ domain->sig.dif.pi_interval = se_cmd->se_dev->dev_attrib.block_size;
+ domain->sig.dif.ref_tag = se_cmd->reftag_seed;
+ /*
+ * These values are hard-coded for now; if the target core starts
+ * supplying them in the future, take them from se_cmd instead.
+ */
+ domain->sig.dif.apptag_check_mask = 0xffff;
+ domain->sig.dif.app_escape = true;
+ domain->sig.dif.ref_escape = true;
+ if (se_cmd->prot_type == TARGET_DIF_TYPE1_PROT ||
+ se_cmd->prot_type == TARGET_DIF_TYPE2_PROT)
+ domain->sig.dif.ref_remap = true;
+}
+
+static int
+isert_set_sig_attrs(struct se_cmd *se_cmd, struct ib_sig_attrs *sig_attrs)
+{
+ switch (se_cmd->prot_op) {
+ case TARGET_PROT_DIN_INSERT:
+ case TARGET_PROT_DOUT_STRIP:
+ sig_attrs->mem.sig_type = IB_SIG_TYPE_NONE;
+ isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->wire);
+ break;
+ case TARGET_PROT_DOUT_INSERT:
+ case TARGET_PROT_DIN_STRIP:
+ sig_attrs->wire.sig_type = IB_SIG_TYPE_NONE;
+ isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->mem);
+ break;
+ case TARGET_PROT_DIN_PASS:
+ case TARGET_PROT_DOUT_PASS:
+ isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->wire);
+ isert_set_dif_domain(se_cmd, sig_attrs, &sig_attrs->mem);
+ break;
+ default:
+ isert_err("Unsupported PI operation %d\n", se_cmd->prot_op);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static inline u8
+isert_set_prot_checks(u8 prot_checks)
+{
+ return (prot_checks & TARGET_DIF_CHECK_GUARD ? 0xc0 : 0) |
+ (prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x30 : 0) |
+ (prot_checks & TARGET_DIF_CHECK_REFTAG ? 0x0f : 0);
+}
+
+static int
+isert_reg_sig_mr(struct isert_conn *isert_conn,
+ struct se_cmd *se_cmd,
+ struct isert_rdma_wr *rdma_wr,
+ struct fast_reg_descriptor *fr_desc)
+{
+ struct ib_send_wr sig_wr, inv_wr;
+ struct ib_send_wr *bad_wr, *wr = NULL;
+ struct pi_context *pi_ctx = fr_desc->pi_ctx;
+ struct ib_sig_attrs sig_attrs;
+ int ret;
+
+ memset(&sig_attrs, 0, sizeof(sig_attrs));
+ ret = isert_set_sig_attrs(se_cmd, &sig_attrs);
+ if (ret)
+ goto err;
+
+ sig_attrs.check_mask = isert_set_prot_checks(se_cmd->prot_checks);
+
+ if (!(fr_desc->ind & ISERT_SIG_KEY_VALID)) {
+ isert_inv_rkey(&inv_wr, pi_ctx->sig_mr);
+ wr = &inv_wr;
+ }
+
+ memset(&sig_wr, 0, sizeof(sig_wr));
+ sig_wr.opcode = IB_WR_REG_SIG_MR;
+ sig_wr.wr_id = ISER_FASTREG_LI_WRID;
+ sig_wr.sg_list = &rdma_wr->ib_sg[DATA];
+ sig_wr.num_sge = 1;
+ sig_wr.wr.sig_handover.access_flags = IB_ACCESS_LOCAL_WRITE;
+ sig_wr.wr.sig_handover.sig_attrs = &sig_attrs;
+ sig_wr.wr.sig_handover.sig_mr = pi_ctx->sig_mr;
+ if (se_cmd->t_prot_sg)
+ sig_wr.wr.sig_handover.prot = &rdma_wr->ib_sg[PROT];
+
+ if (!wr)
+ wr = &sig_wr;
+ else
+ wr->next = &sig_wr;
+
+ ret = ib_post_send(isert_conn->qp, wr, &bad_wr);
+ if (ret) {
+ isert_err("fast registration failed, ret:%d\n", ret);
+ goto err;
+ }
+ fr_desc->ind &= ~ISERT_SIG_KEY_VALID;
+
+ rdma_wr->ib_sg[SIG].lkey = pi_ctx->sig_mr->lkey;
+ rdma_wr->ib_sg[SIG].addr = 0;
+ rdma_wr->ib_sg[SIG].length = se_cmd->data_length;
+ if (se_cmd->prot_op != TARGET_PROT_DIN_STRIP &&
+ se_cmd->prot_op != TARGET_PROT_DOUT_INSERT)
+ /*
+ * We have protection guards on the wire,
+ * so we need to set a larger transfer length.
+ */
+ rdma_wr->ib_sg[SIG].length += se_cmd->prot_length;
+
+ isert_dbg("sig_sge: addr: 0x%llx length: %u lkey: %x\n",
+ rdma_wr->ib_sg[SIG].addr, rdma_wr->ib_sg[SIG].length,
+ rdma_wr->ib_sg[SIG].lkey);
+err:
+ return ret;
+}
+
+static int
+isert_handle_prot_cmd(struct isert_conn *isert_conn,
+ struct isert_cmd *isert_cmd,
+ struct isert_rdma_wr *wr)
+{
+ struct isert_device *device = isert_conn->device;
+ struct se_cmd *se_cmd = &isert_cmd->iscsi_cmd->se_cmd;
+ int ret;
+
+ if (!wr->fr_desc->pi_ctx) {
+ ret = isert_create_pi_ctx(wr->fr_desc,
+ device->ib_device,
+ device->pd);
+ if (ret) {
+ isert_err("conn %p failed to allocate pi_ctx\n",
+ isert_conn);
+ return ret;
+ }
+ }
+
+ if (se_cmd->t_prot_sg) {
+ ret = isert_map_data_buf(isert_conn, isert_cmd,
+ se_cmd->t_prot_sg,
+ se_cmd->t_prot_nents,
+ se_cmd->prot_length,
+ 0, wr->iser_ib_op, &wr->prot);
+ if (ret) {
+ isert_err("conn %p failed to map protection buffer\n",
+ isert_conn);
+ return ret;
+ }
+
+ memset(&wr->ib_sg[PROT], 0, sizeof(wr->ib_sg[PROT]));
+ ret = isert_fast_reg_mr(isert_conn, wr->fr_desc, &wr->prot,
+ ISERT_PROT_KEY_VALID, &wr->ib_sg[PROT]);
+ if (ret) {
+ isert_err("conn %p failed to fast reg mr\n",
+ isert_conn);
+ goto unmap_prot_cmd;
+ }
+ }
+
+ ret = isert_reg_sig_mr(isert_conn, se_cmd, wr, wr->fr_desc);
+ if (ret) {
+ isert_err("conn %p failed to fast reg mr\n",
+ isert_conn);
+ goto unmap_prot_cmd;
+ }
+ wr->fr_desc->ind |= ISERT_PROTECTED;
+
+ return 0;
+
+unmap_prot_cmd:
+ if (se_cmd->t_prot_sg)
+ isert_unmap_data_buf(isert_conn, &wr->prot);
+
+ return ret;
+}
+
+static int
+isert_reg_rdma(struct iscsi_conn *conn, struct iscsi_cmd *cmd,
+ struct isert_rdma_wr *wr)
+{
+ struct se_cmd *se_cmd = &cmd->se_cmd;
+ struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
+ struct isert_conn *isert_conn = conn->context;
+ struct fast_reg_descriptor *fr_desc = NULL;
+ struct ib_send_wr *send_wr;
+ struct ib_sge *ib_sg;
+ u32 offset;
+ int ret = 0;
+ unsigned long flags;
+
+ isert_cmd->tx_desc.isert_cmd = isert_cmd;
+
+ offset = wr->iser_ib_op == ISER_IB_RDMA_READ ? cmd->write_data_done : 0;
+ ret = isert_map_data_buf(isert_conn, isert_cmd, se_cmd->t_data_sg,
+ se_cmd->t_data_nents, se_cmd->data_length,
+ offset, wr->iser_ib_op, &wr->data);
+ if (ret)
+ return ret;
+
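+ /*
+  * Take a fast registration descriptor from the pool whenever the
+  * data spans more than one DMA segment or protection information
+  * offload is in use.
+  */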
+ if (wr->data.dma_nents != 1 || isert_prot_cmd(isert_conn, se_cmd)) {
+ spin_lock_irqsave(&isert_conn->pool_lock, flags);
+ fr_desc = list_first_entry(&isert_conn->fr_pool,
+ struct fast_reg_descriptor, list);
+ list_del(&fr_desc->list);
+ spin_unlock_irqrestore(&isert_conn->pool_lock, flags);
+ wr->fr_desc = fr_desc;
+ }
+
+ ret = isert_fast_reg_mr(isert_conn, fr_desc, &wr->data,
+ ISERT_DATA_KEY_VALID, &wr->ib_sg[DATA]);
+ if (ret)
+ goto unmap_cmd;
+
+ if (isert_prot_cmd(isert_conn, se_cmd)) {
+ ret = isert_handle_prot_cmd(isert_conn, isert_cmd, wr);
+ if (ret)
+ goto unmap_cmd;
+
+ ib_sg = &wr->ib_sg[SIG];
+ } else {
+ ib_sg = &wr->ib_sg[DATA];
+ }
+
+ memcpy(&wr->s_ib_sge, ib_sg, sizeof(*ib_sg));
+ wr->ib_sge = &wr->s_ib_sge;
+ wr->send_wr_num = 1;
+ memset(&wr->s_send_wr, 0, sizeof(*send_wr));
+ wr->send_wr = &wr->s_send_wr;
+ wr->isert_cmd = isert_cmd;
+
+ send_wr = &isert_cmd->rdma_wr.s_send_wr;
+ send_wr->sg_list = &wr->s_ib_sge;
+ send_wr->num_sge = 1;
+ send_wr->wr_id = (uintptr_t)&isert_cmd->tx_desc;
+ if (wr->iser_ib_op == ISER_IB_RDMA_WRITE) {
+ send_wr->opcode = IB_WR_RDMA_WRITE;
+ send_wr->wr.rdma.remote_addr = isert_cmd->read_va;
+ send_wr->wr.rdma.rkey = isert_cmd->read_stag;
+ send_wr->send_flags = !isert_prot_cmd(isert_conn, se_cmd) ?
+ 0 : IB_SEND_SIGNALED;
+ } else {
+ send_wr->opcode = IB_WR_RDMA_READ;
+ send_wr->wr.rdma.remote_addr = isert_cmd->write_va;
+ send_wr->wr.rdma.rkey = isert_cmd->write_stag;
+ send_wr->send_flags = IB_SEND_SIGNALED;
+ }
+
+ return 0;
+
+unmap_cmd:
+ if (fr_desc) {
+ spin_lock_irqsave(&isert_conn->pool_lock, flags);
+ list_add_tail(&fr_desc->list, &isert_conn->fr_pool);
+ spin_unlock_irqrestore(&isert_conn->pool_lock, flags);
+ }
+ isert_unmap_data_buf(isert_conn, &wr->data);
+
+ return ret;
+}
+
+static int
+isert_put_datain(struct iscsi_conn *conn, struct iscsi_cmd *cmd)
+{
+ struct se_cmd *se_cmd = &cmd->se_cmd;
+ struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
+ struct isert_rdma_wr *wr = &isert_cmd->rdma_wr;
+ struct isert_conn *isert_conn = conn->context;
+ struct isert_device *device = isert_conn->device;
+ struct ib_send_wr *wr_failed;
+ int rc;
+
+ isert_dbg("Cmd: %p RDMA_WRITE data_length: %u\n",
+ isert_cmd, se_cmd->data_length);
+
+ wr->iser_ib_op = ISER_IB_RDMA_WRITE;
+ rc = device->reg_rdma_mem(conn, cmd, wr);
+ if (rc) {
+ isert_err("Cmd: %p failed to prepare RDMA res\n", isert_cmd);
+ return rc;
+ }
+
+ if (!isert_prot_cmd(isert_conn, se_cmd)) {
+ /*
+ * Build isert_cmd->tx_desc for the iSCSI response PDU and attach
+ * it to the end of the RDMA_WRITE work request chain.
+ */
+ isert_create_send_desc(isert_conn, isert_cmd,
+ &isert_cmd->tx_desc);
+ iscsit_build_rsp_pdu(cmd, conn, true, (struct iscsi_scsi_rsp *)
+ &isert_cmd->tx_desc.iscsi_header);
+ isert_init_tx_hdrs(isert_conn, &isert_cmd->tx_desc);
+ isert_init_send_wr(isert_conn, isert_cmd,
+ &isert_cmd->tx_desc.send_wr);
+ isert_cmd->rdma_wr.s_send_wr.next = &isert_cmd->tx_desc.send_wr;
+ wr->send_wr_num += 1;
+ }
+
+ rc = ib_post_send(isert_conn->qp, wr->send_wr, &wr_failed);
+ if (rc)
+ isert_warn("ib_post_send() failed for IB_WR_RDMA_WRITE\n");
+
+ if (!isert_prot_cmd(isert_conn, se_cmd))
+ isert_dbg("Cmd: %p posted RDMA_WRITE + Response for iSER Data "
+ "READ\n", isert_cmd);
+ else
+ isert_dbg("Cmd: %p posted RDMA_WRITE for iSER Data READ\n",
+ isert_cmd);
+
+ return 1;
+}
+
+static int
+isert_get_dataout(struct iscsi_conn *conn, struct iscsi_cmd *cmd, bool recovery)
+{
+ struct se_cmd *se_cmd = &cmd->se_cmd;
+ struct isert_cmd *isert_cmd = iscsit_priv_cmd(cmd);
+ struct isert_rdma_wr *wr = &isert_cmd->rdma_wr;
+ struct isert_conn *isert_conn = conn->context;
+ struct isert_device *device = isert_conn->device;
+ struct ib_send_wr *wr_failed;
+ int rc;
+
+ isert_dbg("Cmd: %p RDMA_READ data_length: %u write_data_done: %u\n",
+ isert_cmd, se_cmd->data_length, cmd->write_data_done);
+ wr->iser_ib_op = ISER_IB_RDMA_READ;
+ rc = device->reg_rdma_mem(conn, cmd, wr);
+ if (rc) {
+ isert_err("Cmd: %p failed to prepare RDMA res\n", isert_cmd);
+ return rc;
+ }
+
+ rc = ib_post_send(isert_conn->qp, wr->send_wr, &wr_failed);
+ if (rc)
+ isert_warn("ib_post_send() failed for IB_WR_RDMA_READ\n");
+
+ isert_dbg("Cmd: %p posted RDMA_READ memory for ISER Data WRITE\n",
+ isert_cmd);
+
+ return 0;
+}
+
+static int
+isert_immediate_queue(struct iscsi_conn *conn, struct iscsi_cmd *cmd, int state)
+{
+ int ret;
+
+ switch (state) {
+ case ISTATE_SEND_NOPIN_WANT_RESPONSE:
+ ret = isert_put_nopin(cmd, conn, false);
+ break;
+ default:
+ isert_err("Unknown immediate state: 0x%02x\n", state);
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+static int
+isert_response_queue(struct iscsi_conn *conn, struct iscsi_cmd *cmd, int state)
+{
+ struct isert_conn *isert_conn = conn->context;
+ int ret;
+
+ switch (state) {
+ case ISTATE_SEND_LOGOUTRSP:
+ ret = isert_put_logout_rsp(cmd, conn);
+ if (!ret)
+ isert_conn->logout_posted = true;
+ break;
+ case ISTATE_SEND_NOPIN:
+ ret = isert_put_nopin(cmd, conn, true);
+ break;
+ case ISTATE_SEND_TASKMGTRSP:
+ ret = isert_put_tm_rsp(cmd, conn);
+ break;
+ case ISTATE_SEND_REJECT:
+ ret = isert_put_reject(cmd, conn);
+ break;
+ case ISTATE_SEND_TEXTRSP:
+ ret = isert_put_text_rsp(cmd, conn);
+ break;
+ case ISTATE_SEND_STATUS:
+ /*
+ * Special case for sending a non-GOOD SCSI status from TX thread
+ * context when a failure occurs before se_cmd execution.
+ */
+ ret = isert_put_response(conn, cmd);
+ break;
+ default:
+ isert_err("Unknown response state: 0x%02x\n", state);
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+struct rdma_cm_id *
+isert_setup_id(struct isert_np *isert_np)
+{
+ struct iscsi_np *np = isert_np->np;
+ struct rdma_cm_id *id;
+ struct sockaddr *sa;
+ int ret;
+
+ sa = (struct sockaddr *)&np->np_sockaddr;
+ isert_dbg("ksockaddr: %p, sa: %p\n", &np->np_sockaddr, sa);
+
+ id = rdma_create_id(isert_cma_handler, isert_np,
+ RDMA_PS_TCP, IB_QPT_RC);
+ if (IS_ERR(id)) {
+ isert_err("rdma_create_id() failed: %ld\n", PTR_ERR(id));
+ ret = PTR_ERR(id);
+ goto out;
+ }
+ isert_dbg("id %p context %p\n", id, id->context);
+
+ ret = rdma_bind_addr(id, sa);
+ if (ret) {
+ isert_err("rdma_bind_addr() failed: %d\n", ret);
+ goto out_id;
+ }
+
+ ret = rdma_listen(id, 0);
+ if (ret) {
+ isert_err("rdma_listen() failed: %d\n", ret);
+ goto out_id;
+ }
+
+ return id;
+out_id:
+ rdma_destroy_id(id);
+out:
+ return ERR_PTR(ret);
+}
+
+static int
+isert_setup_np(struct iscsi_np *np,
+ struct __kernel_sockaddr_storage *ksockaddr)
+{
+ struct isert_np *isert_np;
+ struct rdma_cm_id *isert_lid;
+ int ret;
+
+ isert_np = kzalloc(sizeof(struct isert_np), GFP_KERNEL);
+ if (!isert_np) {
+ isert_err("Unable to allocate struct isert_np\n");
+ return -ENOMEM;
+ }
+ sema_init(&isert_np->np_sem, 0);
+ mutex_init(&isert_np->np_accept_mutex);
+ INIT_LIST_HEAD(&isert_np->np_accept_list);
+ init_completion(&isert_np->np_login_comp);
+ isert_np->np = np;
+
+ /*
+	 * Set up np->np_sockaddr from the sockaddr passed in from the
+	 * iscsi_target_configfs.c code.
+ */
+ memcpy(&np->np_sockaddr, ksockaddr,
+ sizeof(struct __kernel_sockaddr_storage));
+
+ isert_lid = isert_setup_id(isert_np);
+ if (IS_ERR(isert_lid)) {
+ ret = PTR_ERR(isert_lid);
+ goto out;
+ }
+
+ isert_np->np_cm_id = isert_lid;
+ np->np_context = isert_np;
+
+ return 0;
+
+out:
+ kfree(isert_np);
+
+ return ret;
+}
+
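+/*
+ * Accept the pending RDMA/CM connection request. Note that an
+ * rnr_retry_count of 7 asks the HCA to retry indefinitely on
+ * receiver-not-ready NAKs, per the IB specification.
+ */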
+static int
+isert_rdma_accept(struct isert_conn *isert_conn)
+{
+ struct rdma_cm_id *cm_id = isert_conn->cm_id;
+ struct rdma_conn_param cp;
+ int ret;
+
+ memset(&cp, 0, sizeof(struct rdma_conn_param));
+ cp.initiator_depth = isert_conn->initiator_depth;
+ cp.retry_count = 7;
+ cp.rnr_retry_count = 7;
+
+ ret = rdma_accept(cm_id, &cp);
+ if (ret) {
+ isert_err("rdma_accept() failed with: %d\n", ret);
+ return ret;
+ }
+
+ return 0;
+}
+
+static int
+isert_get_login_rx(struct iscsi_conn *conn, struct iscsi_login *login)
+{
+ struct isert_conn *isert_conn = conn->context;
+ int ret;
+
+ isert_info("before login_req comp conn: %p\n", isert_conn);
+ ret = wait_for_completion_interruptible(&isert_conn->login_req_comp);
+ if (ret) {
+		isert_err("isert_conn %p interrupted before login request arrived\n",
+ isert_conn);
+ return ret;
+ }
+ reinit_completion(&isert_conn->login_req_comp);
+
+ /*
+ * For login requests after the first PDU, isert_rx_login_req() will
+ * kick schedule_delayed_work(&conn->login_work) as the packet is
+ * received, which turns this callback from iscsi_target_do_login_rx()
+ * into a NOP.
+ */
+ if (!login->first_request)
+ return 0;
+
+ isert_rx_login_req(isert_conn);
+
+ isert_info("before login_comp conn: %p\n", conn);
+ ret = wait_for_completion_interruptible(&isert_conn->login_comp);
+ if (ret)
+ return ret;
+
+ isert_info("processing login->req: %p\n", login->req);
+
+ return 0;
+}
+
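+/*
+ * Fill in the iSCSI connection's login/local address and port fields
+ * from the resolved rdma_cm route, for both IPv4 and IPv6 listeners.
+ */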
+static void
+isert_set_conn_info(struct iscsi_np *np, struct iscsi_conn *conn,
+ struct isert_conn *isert_conn)
+{
+ struct rdma_cm_id *cm_id = isert_conn->cm_id;
+ struct rdma_route *cm_route = &cm_id->route;
+ struct sockaddr_in *sock_in;
+ struct sockaddr_in6 *sock_in6;
+
+ conn->login_family = np->np_sockaddr.ss_family;
+
+ if (np->np_sockaddr.ss_family == AF_INET6) {
+ sock_in6 = (struct sockaddr_in6 *)&cm_route->addr.dst_addr;
+ snprintf(conn->login_ip, sizeof(conn->login_ip), "%pI6c",
+ &sock_in6->sin6_addr.in6_u);
+ conn->login_port = ntohs(sock_in6->sin6_port);
+
+ sock_in6 = (struct sockaddr_in6 *)&cm_route->addr.src_addr;
+ snprintf(conn->local_ip, sizeof(conn->local_ip), "%pI6c",
+ &sock_in6->sin6_addr.in6_u);
+ conn->local_port = ntohs(sock_in6->sin6_port);
+ } else {
+ sock_in = (struct sockaddr_in *)&cm_route->addr.dst_addr;
+ sprintf(conn->login_ip, "%pI4",
+ &sock_in->sin_addr.s_addr);
+ conn->login_port = ntohs(sock_in->sin_port);
+
+ sock_in = (struct sockaddr_in *)&cm_route->addr.src_addr;
+ sprintf(conn->local_ip, "%pI4",
+ &sock_in->sin_addr.s_addr);
+ conn->local_port = ntohs(sock_in->sin_port);
+ }
+}
+
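+/*
+ * iscsit_transport ->iscsit_accept_np callback: sleep on np_sem until a
+ * new iSER connection has been queued on np_accept_list (or the np
+ * thread is being reset), then hand the connection over to the iSCSI
+ * login code.
+ */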
+static int
+isert_accept_np(struct iscsi_np *np, struct iscsi_conn *conn)
+{
+ struct isert_np *isert_np = np->np_context;
+ struct isert_conn *isert_conn;
+ int ret;
+
+accept_wait:
+ ret = down_interruptible(&isert_np->np_sem);
+ if (ret)
+ return -ENODEV;
+
+ spin_lock_bh(&np->np_thread_lock);
+ if (np->np_thread_state >= ISCSI_NP_THREAD_RESET) {
+ spin_unlock_bh(&np->np_thread_lock);
+ isert_dbg("np_thread_state %d\n",
+ np->np_thread_state);
+		/*
+		 * No point in stalling here when np_thread
+		 * is in state RESET/SHUTDOWN/EXIT - bail
+		 */
+ return -ENODEV;
+ }
+ spin_unlock_bh(&np->np_thread_lock);
+
+ mutex_lock(&isert_np->np_accept_mutex);
+ if (list_empty(&isert_np->np_accept_list)) {
+ mutex_unlock(&isert_np->np_accept_mutex);
+ goto accept_wait;
+ }
+ isert_conn = list_first_entry(&isert_np->np_accept_list,
+ struct isert_conn, accept_node);
+ list_del_init(&isert_conn->accept_node);
+ mutex_unlock(&isert_np->np_accept_mutex);
+
+ conn->context = isert_conn;
+ isert_conn->conn = conn;
+
+ isert_set_conn_info(np, conn, isert_conn);
+
+ isert_dbg("Processing isert_conn: %p\n", isert_conn);
+
+ return 0;
+}
+
+static void
+isert_free_np(struct iscsi_np *np)
+{
+ struct isert_np *isert_np = np->np_context;
+ struct isert_conn *isert_conn, *n;
+
+ if (isert_np->np_cm_id)
+ rdma_destroy_id(isert_np->np_cm_id);
+
+ /*
+	 * FIXME: At this point we don't have a good way to ensure that
+	 * there are no hanging connections that completed RDMA
+	 * establishment but didn't start the iscsi login process. So
+	 * work around this by cleaning up whatever piled up in
+	 * np_accept_list.
+ */
+ mutex_lock(&isert_np->np_accept_mutex);
+ if (!list_empty(&isert_np->np_accept_list)) {
+ isert_info("Still have isert connections, cleaning up...\n");
+ list_for_each_entry_safe(isert_conn, n,
+ &isert_np->np_accept_list,
+ accept_node) {
+ isert_info("cleaning isert_conn %p state (%d)\n",
+ isert_conn, isert_conn->state);
+ isert_connect_release(isert_conn);
+ }
+ }
+ mutex_unlock(&isert_np->np_accept_mutex);
+
+ np->np_context = NULL;
+ kfree(isert_np);
+}
+
+static void isert_release_work(struct work_struct *work)
+{
+ struct isert_conn *isert_conn = container_of(work,
+ struct isert_conn,
+ release_work);
+
+ isert_info("Starting release conn %p\n", isert_conn);
+
+ wait_for_completion(&isert_conn->wait);
+
+ mutex_lock(&isert_conn->mutex);
+ isert_conn->state = ISER_CONN_DOWN;
+ mutex_unlock(&isert_conn->mutex);
+
+ isert_info("Destroying conn %p\n", isert_conn);
+ isert_put_conn(isert_conn);
+}
+
+static void
+isert_wait4logout(struct isert_conn *isert_conn)
+{
+ struct iscsi_conn *conn = isert_conn->conn;
+
+ isert_info("conn %p\n", isert_conn);
+
+ if (isert_conn->logout_posted) {
+ isert_info("conn %p wait for conn_logout_comp\n", isert_conn);
+ wait_for_completion_timeout(&conn->conn_logout_comp,
+ SECONDS_FOR_LOGOUT_COMP * HZ);
+ }
+}
+
+static void
+isert_wait4cmds(struct iscsi_conn *conn)
+{
+ isert_info("iscsi_conn %p\n", conn);
+
+ if (conn->sess) {
+ target_sess_cmd_list_set_waiting(conn->sess->se_sess);
+ target_wait_for_sess_cmds(conn->sess->se_sess);
+ }
+}
+
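+/*
+ * Drain the receive queue by posting a "beacon" work request with a
+ * sentinel wr_id; once its flush completion is observed, all earlier
+ * flush errors are known to have been consumed.
+ */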
+static void
+isert_wait4flush(struct isert_conn *isert_conn)
+{
+ struct ib_recv_wr *bad_wr;
+
+ isert_info("conn %p\n", isert_conn);
+
+ init_completion(&isert_conn->wait_comp_err);
+ isert_conn->beacon.wr_id = ISER_BEACON_WRID;
+ /* post an indication that all flush errors were consumed */
+ if (ib_post_recv(isert_conn->qp, &isert_conn->beacon, &bad_wr)) {
+ isert_err("conn %p failed to post beacon", isert_conn);
+ return;
+ }
+
+ wait_for_completion(&isert_conn->wait_comp_err);
+}
+
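+/*
+ * iscsit_transport ->iscsit_wait_conn callback: terminate the RDMA/CM
+ * connection, wait for outstanding se_cmds, flushed work requests and
+ * an optional logout response, then defer the final teardown to
+ * isert_release_work.
+ */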
+static void isert_wait_conn(struct iscsi_conn *conn)
+{
+ struct isert_conn *isert_conn = conn->context;
+
+ isert_info("Starting conn %p\n", isert_conn);
+
+ mutex_lock(&isert_conn->mutex);
+ /*
+ * Only wait for wait_comp_err if the isert_conn made it
+	 * into full feature phase.
+ */
+ if (isert_conn->state == ISER_CONN_INIT) {
+ mutex_unlock(&isert_conn->mutex);
+ return;
+ }
+ isert_conn_terminate(isert_conn);
+ mutex_unlock(&isert_conn->mutex);
+
+ isert_wait4cmds(conn);
+ isert_wait4flush(isert_conn);
+ isert_wait4logout(isert_conn);
+
+ queue_work(isert_release_wq, &isert_conn->release_work);
+}
+
+static void isert_free_conn(struct iscsi_conn *conn)
+{
+ struct isert_conn *isert_conn = conn->context;
+
+ isert_wait4flush(isert_conn);
+ isert_put_conn(isert_conn);
+}
+
+static struct iscsit_transport iser_target_transport = {
+ .name = "IB/iSER",
+ .transport_type = ISCSI_INFINIBAND,
+ .priv_size = sizeof(struct isert_cmd),
+ .owner = THIS_MODULE,
+ .iscsit_setup_np = isert_setup_np,
+ .iscsit_accept_np = isert_accept_np,
+ .iscsit_free_np = isert_free_np,
+ .iscsit_wait_conn = isert_wait_conn,
+ .iscsit_free_conn = isert_free_conn,
+ .iscsit_get_login_rx = isert_get_login_rx,
+ .iscsit_put_login_tx = isert_put_login_tx,
+ .iscsit_immediate_queue = isert_immediate_queue,
+ .iscsit_response_queue = isert_response_queue,
+ .iscsit_get_dataout = isert_get_dataout,
+ .iscsit_queue_data_in = isert_put_datain,
+ .iscsit_queue_status = isert_put_response,
+ .iscsit_aborted_task = isert_aborted_task,
+ .iscsit_get_sup_prot_ops = isert_get_sup_prot_ops,
+};
+
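+/*
+ * Module init: allocate the completion and release workqueues, then
+ * register the iSER transport with the iSCSI target core.
+ */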
+static int __init isert_init(void)
+{
+ int ret;
+
+ isert_comp_wq = alloc_workqueue("isert_comp_wq",
+ WQ_UNBOUND | WQ_HIGHPRI, 0);
+ if (!isert_comp_wq) {
+ isert_err("Unable to allocate isert_comp_wq\n");
+		return -ENOMEM;
+ }
+
+ isert_release_wq = alloc_workqueue("isert_release_wq", WQ_UNBOUND,
+ WQ_UNBOUND_MAX_ACTIVE);
+ if (!isert_release_wq) {
+ isert_err("Unable to allocate isert_release_wq\n");
+ ret = -ENOMEM;
+ goto destroy_comp_wq;
+ }
+
+ iscsit_register_transport(&iser_target_transport);
+ isert_info("iSER_TARGET[0] - Loaded iser_target_transport\n");
+
+ return 0;
+
+destroy_comp_wq:
+ destroy_workqueue(isert_comp_wq);
+
+ return ret;
+}
+
+static void __exit isert_exit(void)
+{
+ flush_scheduled_work();
+ destroy_workqueue(isert_release_wq);
+ destroy_workqueue(isert_comp_wq);
+ iscsit_unregister_transport(&iser_target_transport);
+ isert_info("iSER_TARGET[0] - Released iser_target_transport\n");
+}
+
+MODULE_DESCRIPTION("iSER-Target for mainline target infrastructure");
+MODULE_VERSION("1.0");
+MODULE_AUTHOR("nab@Linux-iSCSI.org");
+MODULE_LICENSE("GPL");
+
+module_init(isert_init);
+module_exit(isert_exit);
diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h
new file mode 100644
index 000000000..9ec23a786
--- /dev/null
+++ b/drivers/infiniband/ulp/isert/ib_isert.h
@@ -0,0 +1,231 @@
+#include <linux/socket.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <rdma/ib_verbs.h>
+#include <rdma/rdma_cm.h>
+
+#define DRV_NAME "isert"
+#define PFX DRV_NAME ": "
+
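+/*
+ * isert_debug_level controls verbosity: isert_err() always prints,
+ * isert_warn() requires a level above 0, isert_info() above 1 and
+ * isert_dbg() above 2.
+ */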
+#define isert_dbg(fmt, arg...) \
+ do { \
+ if (unlikely(isert_debug_level > 2)) \
+ printk(KERN_DEBUG PFX "%s: " fmt,\
+ __func__ , ## arg); \
+ } while (0)
+
+#define isert_warn(fmt, arg...) \
+ do { \
+ if (unlikely(isert_debug_level > 0)) \
+ pr_warn(PFX "%s: " fmt, \
+ __func__ , ## arg); \
+ } while (0)
+
+#define isert_info(fmt, arg...) \
+ do { \
+ if (unlikely(isert_debug_level > 1)) \
+ pr_info(PFX "%s: " fmt, \
+ __func__ , ## arg); \
+ } while (0)
+
+#define isert_err(fmt, arg...) \
+ pr_err(PFX "%s: " fmt, __func__ , ## arg)
+
+#define ISCSI_ISER_SG_TABLESIZE 256
+#define ISER_FASTREG_LI_WRID 0xffffffffffffffffULL
+#define ISER_BEACON_WRID 0xfffffffffffffffeULL
+
+enum isert_desc_type {
+ ISCSI_TX_CONTROL,
+ ISCSI_TX_DATAIN
+};
+
+enum iser_ib_op_code {
+ ISER_IB_RECV,
+ ISER_IB_SEND,
+ ISER_IB_RDMA_WRITE,
+ ISER_IB_RDMA_READ,
+};
+
+enum iser_conn_state {
+ ISER_CONN_INIT,
+ ISER_CONN_UP,
+ ISER_CONN_FULL_FEATURE,
+ ISER_CONN_TERMINATING,
+ ISER_CONN_DOWN,
+};
+
+struct iser_rx_desc {
+ struct iser_hdr iser_header;
+ struct iscsi_hdr iscsi_header;
+ char data[ISER_RECV_DATA_SEG_LEN];
+ u64 dma_addr;
+ struct ib_sge rx_sg;
+ char pad[ISER_RX_PAD_SIZE];
+} __packed;
+
+struct iser_tx_desc {
+ struct iser_hdr iser_header;
+ struct iscsi_hdr iscsi_header;
+ enum isert_desc_type type;
+ u64 dma_addr;
+ struct ib_sge tx_sg[2];
+ int num_sge;
+ struct isert_cmd *isert_cmd;
+ struct ib_send_wr send_wr;
+} __packed;
+
+enum isert_indicator {
+ ISERT_PROTECTED = 1 << 0,
+ ISERT_DATA_KEY_VALID = 1 << 1,
+ ISERT_PROT_KEY_VALID = 1 << 2,
+ ISERT_SIG_KEY_VALID = 1 << 3,
+};
+
+struct pi_context {
+ struct ib_mr *prot_mr;
+ struct ib_fast_reg_page_list *prot_frpl;
+ struct ib_mr *sig_mr;
+};
+
+struct fast_reg_descriptor {
+ struct list_head list;
+ struct ib_mr *data_mr;
+ struct ib_fast_reg_page_list *data_frpl;
+ u8 ind;
+ struct pi_context *pi_ctx;
+};
+
+struct isert_data_buf {
+ struct scatterlist *sg;
+ int nents;
+ u32 sg_off;
+ u32 len; /* cur_rdma_length */
+ u32 offset;
+ unsigned int dma_nents;
+ enum dma_data_direction dma_dir;
+};
+
+enum {
+ DATA = 0,
+ PROT = 1,
+ SIG = 2,
+};
+
+struct isert_rdma_wr {
+ struct list_head wr_list;
+ struct isert_cmd *isert_cmd;
+ enum iser_ib_op_code iser_ib_op;
+ struct ib_sge *ib_sge;
+ struct ib_sge s_ib_sge;
+ int send_wr_num;
+ struct ib_send_wr *send_wr;
+ struct ib_send_wr s_send_wr;
+ struct ib_sge ib_sg[3];
+ struct isert_data_buf data;
+ struct isert_data_buf prot;
+ struct fast_reg_descriptor *fr_desc;
+};
+
+struct isert_cmd {
+ uint32_t read_stag;
+ uint32_t write_stag;
+ uint64_t read_va;
+ uint64_t write_va;
+ u64 pdu_buf_dma;
+ u32 pdu_buf_len;
+ u32 read_va_off;
+ u32 write_va_off;
+ u32 rdma_wr_num;
+ struct isert_conn *conn;
+ struct iscsi_cmd *iscsi_cmd;
+ struct iser_tx_desc tx_desc;
+ struct isert_rdma_wr rdma_wr;
+ struct work_struct comp_work;
+};
+
+struct isert_device;
+
+struct isert_conn {
+ enum iser_conn_state state;
+ int post_recv_buf_count;
+ u32 responder_resources;
+ u32 initiator_depth;
+ bool pi_support;
+ u32 max_sge;
+ char *login_buf;
+ char *login_req_buf;
+ char *login_rsp_buf;
+ u64 login_req_dma;
+ int login_req_len;
+ u64 login_rsp_dma;
+ unsigned int rx_desc_head;
+ struct iser_rx_desc *rx_descs;
+ struct ib_recv_wr rx_wr[ISERT_MIN_POSTED_RX];
+ struct iscsi_conn *conn;
+ struct list_head accept_node;
+ struct completion login_comp;
+ struct completion login_req_comp;
+ struct iser_tx_desc login_tx_desc;
+ struct rdma_cm_id *cm_id;
+ struct ib_qp *qp;
+ struct isert_device *device;
+ struct mutex mutex;
+ struct completion wait;
+ struct completion wait_comp_err;
+ struct kref kref;
+ struct list_head fr_pool;
+ int fr_pool_size;
+ /* lock to protect fastreg pool */
+ spinlock_t pool_lock;
+ struct work_struct release_work;
+ struct ib_recv_wr beacon;
+ bool logout_posted;
+};
+
+#define ISERT_MAX_CQ 64
+
+/**
+ * struct isert_comp - iSER completion context
+ *
+ * @device: pointer to device handle
+ * @cq: completion queue
+ * @wcs: work completion array
+ * @active_qps: Number of active QPs attached
+ * to completion context
+ * @work: completion work handle
+ */
+struct isert_comp {
+ struct isert_device *device;
+ struct ib_cq *cq;
+ struct ib_wc wcs[16];
+ int active_qps;
+ struct work_struct work;
+};
+
+struct isert_device {
+ int use_fastreg;
+ bool pi_capable;
+ int refcount;
+ struct ib_device *ib_device;
+ struct ib_pd *pd;
+ struct ib_mr *mr;
+ struct isert_comp *comps;
+ int comps_used;
+ struct list_head dev_node;
+ struct ib_device_attr dev_attr;
+ int (*reg_rdma_mem)(struct iscsi_conn *conn,
+ struct iscsi_cmd *cmd,
+ struct isert_rdma_wr *wr);
+ void (*unreg_rdma_mem)(struct isert_cmd *isert_cmd,
+ struct isert_conn *isert_conn);
+};
+
+struct isert_np {
+ struct iscsi_np *np;
+ struct semaphore np_sem;
+ struct rdma_cm_id *np_cm_id;
+ struct mutex np_accept_mutex;
+ struct list_head np_accept_list;
+ struct completion np_login_comp;
+};
diff --git a/drivers/infiniband/ulp/isert/isert_proto.h b/drivers/infiniband/ulp/isert/isert_proto.h
new file mode 100644
index 000000000..4dccd313b
--- /dev/null
+++ b/drivers/infiniband/ulp/isert/isert_proto.h
@@ -0,0 +1,47 @@
+/* From iscsi_iser.h */
+
+struct iser_hdr {
+ u8 flags;
+ u8 rsvd[3];
+ __be32 write_stag; /* write rkey */
+ __be64 write_va;
+ __be32 read_stag; /* read rkey */
+ __be64 read_va;
+} __packed;
+
+/* Constant PDU length calculations */
+#define ISER_HEADERS_LEN (sizeof(struct iser_hdr) + sizeof(struct iscsi_hdr))
+
+#define ISER_RECV_DATA_SEG_LEN 8192
+#define ISER_RX_PAYLOAD_SIZE (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN)
+#define ISER_RX_LOGIN_SIZE (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN)
+
+/* QP settings */
+/* Maximal bounds on received asynchronous PDUs */
+#define ISERT_MAX_TX_MISC_PDUS	4 /* NOOP_IN(2), ASYNC_EVENT(2) */
+
+#define ISERT_MAX_RX_MISC_PDUS 6 /* NOOP_OUT(2), TEXT(1), *
+ * SCSI_TMFUNC(2), LOGOUT(1) */
+
+#define ISCSI_DEF_XMIT_CMDS_MAX 128 /* from libiscsi.h, must be power of 2 */
+
+#define ISERT_QP_MAX_RECV_DTOS (ISCSI_DEF_XMIT_CMDS_MAX)
+
+#define ISERT_MIN_POSTED_RX (ISCSI_DEF_XMIT_CMDS_MAX >> 2)
+
+#define ISERT_INFLIGHT_DATAOUTS 8
+
+#define ISERT_QP_MAX_REQ_DTOS (ISCSI_DEF_XMIT_CMDS_MAX * \
+ (1 + ISERT_INFLIGHT_DATAOUTS) + \
+ ISERT_MAX_TX_MISC_PDUS + \
+ ISERT_MAX_RX_MISC_PDUS)
+
+#define ISER_RX_PAD_SIZE (ISER_RECV_DATA_SEG_LEN + 4096 - \
+ (ISER_RX_PAYLOAD_SIZE + sizeof(u64) + sizeof(struct ib_sge)))
+
+#define ISER_VER 0x10
+#define ISER_WSV 0x08
+#define ISER_RSV 0x04
+#define ISCSI_CTRL 0x10
+#define ISER_HELLO 0x20
+#define ISER_HELLORPLY 0x30
diff --git a/drivers/infiniband/ulp/srp/Kbuild b/drivers/infiniband/ulp/srp/Kbuild
new file mode 100644
index 000000000..a16c73c66
--- /dev/null
+++ b/drivers/infiniband/ulp/srp/Kbuild
@@ -0,0 +1 @@
+obj-$(CONFIG_INFINIBAND_SRP) += ib_srp.o
diff --git a/drivers/infiniband/ulp/srp/Kconfig b/drivers/infiniband/ulp/srp/Kconfig
new file mode 100644
index 000000000..c74ee9633
--- /dev/null
+++ b/drivers/infiniband/ulp/srp/Kconfig
@@ -0,0 +1,12 @@
+config INFINIBAND_SRP
+ tristate "InfiniBand SCSI RDMA Protocol"
+ depends on SCSI
+ select SCSI_SRP_ATTRS
+ ---help---
+ Support for the SCSI RDMA Protocol over InfiniBand. This
+ allows you to access storage devices that speak SRP over
+ InfiniBand.
+
+ The SRP protocol is defined by the INCITS T10 technical
+ committee. See <http://www.t10.org/>.
+
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
new file mode 100644
index 000000000..75c01b27b
--- /dev/null
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -0,0 +1,3606 @@
+/*
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/string.h>
+#include <linux/parser.h>
+#include <linux/random.h>
+#include <linux/jiffies.h>
+#include <rdma/ib_cache.h>
+
+#include <linux/atomic.h>
+
+#include <scsi/scsi.h>
+#include <scsi/scsi_device.h>
+#include <scsi/scsi_dbg.h>
+#include <scsi/scsi_tcq.h>
+#include <scsi/srp.h>
+#include <scsi/scsi_transport_srp.h>
+
+#include "ib_srp.h"
+
+#define DRV_NAME "ib_srp"
+#define PFX DRV_NAME ": "
+#define DRV_VERSION "1.0"
+#define DRV_RELDATE "July 1, 2013"
+
+MODULE_AUTHOR("Roland Dreier");
+MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol initiator "
+ "v" DRV_VERSION " (" DRV_RELDATE ")");
+MODULE_LICENSE("Dual BSD/GPL");
+
+static unsigned int srp_sg_tablesize;
+static unsigned int cmd_sg_entries;
+static unsigned int indirect_sg_entries;
+static bool allow_ext_sg;
+static bool prefer_fr;
+static bool register_always;
+static int topspin_workarounds = 1;
+
+module_param(srp_sg_tablesize, uint, 0444);
+MODULE_PARM_DESC(srp_sg_tablesize, "Deprecated name for cmd_sg_entries");
+
+module_param(cmd_sg_entries, uint, 0444);
+MODULE_PARM_DESC(cmd_sg_entries,
+ "Default number of gather/scatter entries in the SRP command (default is 12, max 255)");
+
+module_param(indirect_sg_entries, uint, 0444);
+MODULE_PARM_DESC(indirect_sg_entries,
+ "Default max number of gather/scatter entries (default is 12, max is " __stringify(SCSI_MAX_SG_CHAIN_SEGMENTS) ")");
+
+module_param(allow_ext_sg, bool, 0444);
+MODULE_PARM_DESC(allow_ext_sg,
+ "Default behavior when there are more than cmd_sg_entries S/G entries after mapping; fails the request when false (default false)");
+
+module_param(topspin_workarounds, int, 0444);
+MODULE_PARM_DESC(topspin_workarounds,
+ "Enable workarounds for Topspin/Cisco SRP target bugs if != 0");
+
+module_param(prefer_fr, bool, 0444);
+MODULE_PARM_DESC(prefer_fr,
+"Whether to use fast registration if both FMR and fast registration are supported");
+
+module_param(register_always, bool, 0444);
+MODULE_PARM_DESC(register_always,
+ "Use memory registration even for contiguous memory regions");
+
+static struct kernel_param_ops srp_tmo_ops;
+
+static int srp_reconnect_delay = 10;
+module_param_cb(reconnect_delay, &srp_tmo_ops, &srp_reconnect_delay,
+ S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(reconnect_delay, "Time between successive reconnect attempts");
+
+static int srp_fast_io_fail_tmo = 15;
+module_param_cb(fast_io_fail_tmo, &srp_tmo_ops, &srp_fast_io_fail_tmo,
+ S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(fast_io_fail_tmo,
+ "Number of seconds between the observation of a transport"
+ " layer error and failing all I/O. \"off\" means that this"
+ " functionality is disabled.");
+
+static int srp_dev_loss_tmo = 600;
+module_param_cb(dev_loss_tmo, &srp_tmo_ops, &srp_dev_loss_tmo,
+ S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(dev_loss_tmo,
+		 " insulate against transport layer errors. After this time has been"
+ " insulate transport layer errors. After this time has been"
+ " exceeded the SCSI host is removed. Should be"
+ " between 1 and " __stringify(SCSI_DEVICE_BLOCK_MAX_TIMEOUT)
+ " if fast_io_fail_tmo has not been set. \"off\" means that"
+ " this functionality is disabled.");
+
+static unsigned ch_count;
+module_param(ch_count, uint, 0444);
+MODULE_PARM_DESC(ch_count,
+ "Number of RDMA channels to use for communication with an SRP target. Using more than one channel improves performance if the HCA supports multiple completion vectors. The default value is the minimum of four times the number of online CPU sockets and the number of completion vectors supported by the HCA.");
+
+static void srp_add_one(struct ib_device *device);
+static void srp_remove_one(struct ib_device *device);
+static void srp_recv_completion(struct ib_cq *cq, void *ch_ptr);
+static void srp_send_completion(struct ib_cq *cq, void *ch_ptr);
+static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event);
+
+static struct scsi_transport_template *ib_srp_transport_template;
+static struct workqueue_struct *srp_remove_wq;
+
+static struct ib_client srp_client = {
+ .name = "srp",
+ .add = srp_add_one,
+ .remove = srp_remove_one
+};
+
+static struct ib_sa_client srp_sa_client;
+
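+/*
+ * get/set handlers for the reconnect_delay, fast_io_fail_tmo and
+ * dev_loss_tmo module parameters; a negative stored value is reported
+ * and accepted as "off".
+ */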
+static int srp_tmo_get(char *buffer, const struct kernel_param *kp)
+{
+ int tmo = *(int *)kp->arg;
+
+ if (tmo >= 0)
+ return sprintf(buffer, "%d", tmo);
+ else
+ return sprintf(buffer, "off");
+}
+
+static int srp_tmo_set(const char *val, const struct kernel_param *kp)
+{
+ int tmo, res;
+
+ if (strncmp(val, "off", 3) != 0) {
+ res = kstrtoint(val, 0, &tmo);
+ if (res)
+ goto out;
+ } else {
+ tmo = -1;
+ }
+ if (kp->arg == &srp_reconnect_delay)
+ res = srp_tmo_valid(tmo, srp_fast_io_fail_tmo,
+ srp_dev_loss_tmo);
+ else if (kp->arg == &srp_fast_io_fail_tmo)
+ res = srp_tmo_valid(srp_reconnect_delay, tmo, srp_dev_loss_tmo);
+ else
+ res = srp_tmo_valid(srp_reconnect_delay, srp_fast_io_fail_tmo,
+ tmo);
+ if (res)
+ goto out;
+ *(int *)kp->arg = tmo;
+
+out:
+ return res;
+}
+
+static struct kernel_param_ops srp_tmo_ops = {
+ .get = srp_tmo_get,
+ .set = srp_tmo_set,
+};
+
+static inline struct srp_target_port *host_to_target(struct Scsi_Host *host)
+{
+ return (struct srp_target_port *) host->hostdata;
+}
+
+static const char *srp_target_info(struct Scsi_Host *host)
+{
+ return host_to_target(host)->target_name;
+}
+
+static int srp_target_is_topspin(struct srp_target_port *target)
+{
+ static const u8 topspin_oui[3] = { 0x00, 0x05, 0xad };
+ static const u8 cisco_oui[3] = { 0x00, 0x1b, 0x0d };
+
+ return topspin_workarounds &&
+ (!memcmp(&target->ioc_guid, topspin_oui, sizeof topspin_oui) ||
+ !memcmp(&target->ioc_guid, cisco_oui, sizeof cisco_oui));
+}
+
+static struct srp_iu *srp_alloc_iu(struct srp_host *host, size_t size,
+ gfp_t gfp_mask,
+ enum dma_data_direction direction)
+{
+ struct srp_iu *iu;
+
+ iu = kmalloc(sizeof *iu, gfp_mask);
+ if (!iu)
+ goto out;
+
+ iu->buf = kzalloc(size, gfp_mask);
+ if (!iu->buf)
+ goto out_free_iu;
+
+ iu->dma = ib_dma_map_single(host->srp_dev->dev, iu->buf, size,
+ direction);
+ if (ib_dma_mapping_error(host->srp_dev->dev, iu->dma))
+ goto out_free_buf;
+
+ iu->size = size;
+ iu->direction = direction;
+
+ return iu;
+
+out_free_buf:
+ kfree(iu->buf);
+out_free_iu:
+ kfree(iu);
+out:
+ return NULL;
+}
+
+static void srp_free_iu(struct srp_host *host, struct srp_iu *iu)
+{
+ if (!iu)
+ return;
+
+ ib_dma_unmap_single(host->srp_dev->dev, iu->dma, iu->size,
+ iu->direction);
+ kfree(iu->buf);
+ kfree(iu);
+}
+
+static void srp_qp_event(struct ib_event *event, void *context)
+{
+ pr_debug("QP event %d\n", event->event);
+}
+
+static int srp_init_qp(struct srp_target_port *target,
+ struct ib_qp *qp)
+{
+ struct ib_qp_attr *attr;
+ int ret;
+
+ attr = kmalloc(sizeof *attr, GFP_KERNEL);
+ if (!attr)
+ return -ENOMEM;
+
+ ret = ib_find_cached_pkey(target->srp_host->srp_dev->dev,
+ target->srp_host->port,
+ be16_to_cpu(target->pkey),
+ &attr->pkey_index);
+ if (ret)
+ goto out;
+
+ attr->qp_state = IB_QPS_INIT;
+ attr->qp_access_flags = (IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE);
+ attr->port_num = target->srp_host->port;
+
+ ret = ib_modify_qp(qp, attr,
+ IB_QP_STATE |
+ IB_QP_PKEY_INDEX |
+ IB_QP_ACCESS_FLAGS |
+ IB_QP_PORT);
+
+out:
+ kfree(attr);
+ return ret;
+}
+
+static int srp_new_cm_id(struct srp_rdma_ch *ch)
+{
+ struct srp_target_port *target = ch->target;
+ struct ib_cm_id *new_cm_id;
+
+ new_cm_id = ib_create_cm_id(target->srp_host->srp_dev->dev,
+ srp_cm_handler, ch);
+ if (IS_ERR(new_cm_id))
+ return PTR_ERR(new_cm_id);
+
+ if (ch->cm_id)
+ ib_destroy_cm_id(ch->cm_id);
+ ch->cm_id = new_cm_id;
+ ch->path.sgid = target->sgid;
+ ch->path.dgid = target->orig_dgid;
+ ch->path.pkey = target->pkey;
+ ch->path.service_id = target->service_id;
+
+ return 0;
+}
+
+static struct ib_fmr_pool *srp_alloc_fmr_pool(struct srp_target_port *target)
+{
+ struct srp_device *dev = target->srp_host->srp_dev;
+ struct ib_fmr_pool_param fmr_param;
+
+ memset(&fmr_param, 0, sizeof(fmr_param));
+ fmr_param.pool_size = target->scsi_host->can_queue;
+ fmr_param.dirty_watermark = fmr_param.pool_size / 4;
+ fmr_param.cache = 1;
+ fmr_param.max_pages_per_fmr = dev->max_pages_per_mr;
+ fmr_param.page_shift = ilog2(dev->mr_page_size);
+ fmr_param.access = (IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_READ);
+
+ return ib_create_fmr_pool(dev->pd, &fmr_param);
+}
+
+/**
+ * srp_destroy_fr_pool() - free the resources owned by a pool
+ * @pool: Fast registration pool to be destroyed.
+ */
+static void srp_destroy_fr_pool(struct srp_fr_pool *pool)
+{
+ int i;
+ struct srp_fr_desc *d;
+
+ if (!pool)
+ return;
+
+ for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) {
+ if (d->frpl)
+ ib_free_fast_reg_page_list(d->frpl);
+ if (d->mr)
+ ib_dereg_mr(d->mr);
+ }
+ kfree(pool);
+}
+
+/**
+ * srp_create_fr_pool() - allocate and initialize a pool for fast registration
+ * @device: IB device to allocate fast registration descriptors for.
+ * @pd: Protection domain associated with the FR descriptors.
+ * @pool_size: Number of descriptors to allocate.
+ * @max_page_list_len: Maximum fast registration work request page list length.
+ */
+static struct srp_fr_pool *srp_create_fr_pool(struct ib_device *device,
+ struct ib_pd *pd, int pool_size,
+ int max_page_list_len)
+{
+ struct srp_fr_pool *pool;
+ struct srp_fr_desc *d;
+ struct ib_mr *mr;
+ struct ib_fast_reg_page_list *frpl;
+ int i, ret = -EINVAL;
+
+ if (pool_size <= 0)
+ goto err;
+ ret = -ENOMEM;
+ pool = kzalloc(sizeof(struct srp_fr_pool) +
+ pool_size * sizeof(struct srp_fr_desc), GFP_KERNEL);
+ if (!pool)
+ goto err;
+ pool->size = pool_size;
+ pool->max_page_list_len = max_page_list_len;
+ spin_lock_init(&pool->lock);
+ INIT_LIST_HEAD(&pool->free_list);
+
+ for (i = 0, d = &pool->desc[0]; i < pool->size; i++, d++) {
+ mr = ib_alloc_fast_reg_mr(pd, max_page_list_len);
+ if (IS_ERR(mr)) {
+ ret = PTR_ERR(mr);
+ goto destroy_pool;
+ }
+ d->mr = mr;
+ frpl = ib_alloc_fast_reg_page_list(device, max_page_list_len);
+ if (IS_ERR(frpl)) {
+ ret = PTR_ERR(frpl);
+ goto destroy_pool;
+ }
+ d->frpl = frpl;
+ list_add_tail(&d->entry, &pool->free_list);
+ }
+
+out:
+ return pool;
+
+destroy_pool:
+ srp_destroy_fr_pool(pool);
+
+err:
+ pool = ERR_PTR(ret);
+ goto out;
+}
+
+/**
+ * srp_fr_pool_get() - obtain a descriptor suitable for fast registration
+ * @pool: Pool to obtain descriptor from.
+ */
+static struct srp_fr_desc *srp_fr_pool_get(struct srp_fr_pool *pool)
+{
+ struct srp_fr_desc *d = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pool->lock, flags);
+ if (!list_empty(&pool->free_list)) {
+ d = list_first_entry(&pool->free_list, typeof(*d), entry);
+ list_del(&d->entry);
+ }
+ spin_unlock_irqrestore(&pool->lock, flags);
+
+ return d;
+}
+
+/**
+ * srp_fr_pool_put() - put an FR descriptor back in the free list
+ * @pool: Pool the descriptor was allocated from.
+ * @desc: Pointer to an array of fast registration descriptor pointers.
+ * @n: Number of descriptors to put back.
+ *
+ * Note: The caller must already have queued an invalidation request for
+ * desc->mr->rkey before calling this function.
+ */
+static void srp_fr_pool_put(struct srp_fr_pool *pool, struct srp_fr_desc **desc,
+ int n)
+{
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&pool->lock, flags);
+ for (i = 0; i < n; i++)
+ list_add(&desc[i]->entry, &pool->free_list);
+ spin_unlock_irqrestore(&pool->lock, flags);
+}
+
+static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target)
+{
+ struct srp_device *dev = target->srp_host->srp_dev;
+
+ return srp_create_fr_pool(dev->dev, dev->pd,
+ target->scsi_host->can_queue,
+ dev->max_pages_per_mr);
+}
+
+/**
+ * srp_destroy_qp() - destroy an RDMA queue pair
+ * @ch: SRP RDMA channel.
+ *
+ * Change a queue pair into the error state and wait until all receive
+ * completions have been processed before destroying it. This prevents
+ * the receive completion handler from accessing the queue pair while it
+ * is being destroyed.
+ */
+static void srp_destroy_qp(struct srp_rdma_ch *ch)
+{
+ static struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR };
+ static struct ib_recv_wr wr = { .wr_id = SRP_LAST_WR_ID };
+ struct ib_recv_wr *bad_wr;
+ int ret;
+
+ /* Destroying a QP and reusing ch->done is only safe if not connected */
+ WARN_ON_ONCE(ch->connected);
+
+ ret = ib_modify_qp(ch->qp, &attr, IB_QP_STATE);
+	WARN_ONCE(ret, "ib_modify_qp() returned %d\n", ret);
+ if (ret)
+ goto out;
+
+ init_completion(&ch->done);
+ ret = ib_post_recv(ch->qp, &wr, &bad_wr);
+ WARN_ONCE(ret, "ib_post_recv() returned %d\n", ret);
+ if (ret == 0)
+ wait_for_completion(&ch->done);
+
+out:
+ ib_destroy_qp(ch->qp);
+}
+
+static int srp_create_ch_ib(struct srp_rdma_ch *ch)
+{
+ struct srp_target_port *target = ch->target;
+ struct srp_device *dev = target->srp_host->srp_dev;
+ struct ib_qp_init_attr *init_attr;
+ struct ib_cq *recv_cq, *send_cq;
+ struct ib_qp *qp;
+ struct ib_fmr_pool *fmr_pool = NULL;
+ struct srp_fr_pool *fr_pool = NULL;
+ const int m = 1 + dev->use_fast_reg;
+ int ret;
+
+ init_attr = kzalloc(sizeof *init_attr, GFP_KERNEL);
+ if (!init_attr)
+ return -ENOMEM;
+
+ /* + 1 for SRP_LAST_WR_ID */
+ recv_cq = ib_create_cq(dev->dev, srp_recv_completion, NULL, ch,
+ target->queue_size + 1, ch->comp_vector);
+ if (IS_ERR(recv_cq)) {
+ ret = PTR_ERR(recv_cq);
+ goto err;
+ }
+
+ send_cq = ib_create_cq(dev->dev, srp_send_completion, NULL, ch,
+ m * target->queue_size, ch->comp_vector);
+ if (IS_ERR(send_cq)) {
+ ret = PTR_ERR(send_cq);
+ goto err_recv_cq;
+ }
+
+ ib_req_notify_cq(recv_cq, IB_CQ_NEXT_COMP);
+
+ init_attr->event_handler = srp_qp_event;
+ init_attr->cap.max_send_wr = m * target->queue_size;
+ init_attr->cap.max_recv_wr = target->queue_size + 1;
+ init_attr->cap.max_recv_sge = 1;
+ init_attr->cap.max_send_sge = 1;
+ init_attr->sq_sig_type = IB_SIGNAL_REQ_WR;
+ init_attr->qp_type = IB_QPT_RC;
+ init_attr->send_cq = send_cq;
+ init_attr->recv_cq = recv_cq;
+
+ qp = ib_create_qp(dev->pd, init_attr);
+ if (IS_ERR(qp)) {
+ ret = PTR_ERR(qp);
+ goto err_send_cq;
+ }
+
+ ret = srp_init_qp(target, qp);
+ if (ret)
+ goto err_qp;
+
+ if (dev->use_fast_reg && dev->has_fr) {
+ fr_pool = srp_alloc_fr_pool(target);
+ if (IS_ERR(fr_pool)) {
+ ret = PTR_ERR(fr_pool);
+ shost_printk(KERN_WARNING, target->scsi_host, PFX
+ "FR pool allocation failed (%d)\n", ret);
+ goto err_qp;
+ }
+ if (ch->fr_pool)
+ srp_destroy_fr_pool(ch->fr_pool);
+ ch->fr_pool = fr_pool;
+ } else if (!dev->use_fast_reg && dev->has_fmr) {
+ fmr_pool = srp_alloc_fmr_pool(target);
+ if (IS_ERR(fmr_pool)) {
+ ret = PTR_ERR(fmr_pool);
+ shost_printk(KERN_WARNING, target->scsi_host, PFX
+ "FMR pool allocation failed (%d)\n", ret);
+ goto err_qp;
+ }
+ if (ch->fmr_pool)
+ ib_destroy_fmr_pool(ch->fmr_pool);
+ ch->fmr_pool = fmr_pool;
+ }
+
+ if (ch->qp)
+ srp_destroy_qp(ch);
+ if (ch->recv_cq)
+ ib_destroy_cq(ch->recv_cq);
+ if (ch->send_cq)
+ ib_destroy_cq(ch->send_cq);
+
+ ch->qp = qp;
+ ch->recv_cq = recv_cq;
+ ch->send_cq = send_cq;
+
+ kfree(init_attr);
+ return 0;
+
+err_qp:
+ ib_destroy_qp(qp);
+
+err_send_cq:
+ ib_destroy_cq(send_cq);
+
+err_recv_cq:
+ ib_destroy_cq(recv_cq);
+
+err:
+ kfree(init_attr);
+ return ret;
+}
+
+/*
+ * Note: this function may be called without srp_alloc_iu_bufs() having been
+ * invoked. Hence the ch->[rt]x_ring checks.
+ */
+static void srp_free_ch_ib(struct srp_target_port *target,
+ struct srp_rdma_ch *ch)
+{
+ struct srp_device *dev = target->srp_host->srp_dev;
+ int i;
+
+ if (!ch->target)
+ return;
+
+ if (ch->cm_id) {
+ ib_destroy_cm_id(ch->cm_id);
+ ch->cm_id = NULL;
+ }
+
+ /* If srp_new_cm_id() succeeded but srp_create_ch_ib() not, return. */
+ if (!ch->qp)
+ return;
+
+ if (dev->use_fast_reg) {
+ if (ch->fr_pool)
+ srp_destroy_fr_pool(ch->fr_pool);
+ } else {
+ if (ch->fmr_pool)
+ ib_destroy_fmr_pool(ch->fmr_pool);
+ }
+ srp_destroy_qp(ch);
+ ib_destroy_cq(ch->send_cq);
+ ib_destroy_cq(ch->recv_cq);
+
+ /*
+	 * Prevent the SCSI error handler from using this channel after it
+	 * has been freed: the error handler may continue trying to perform
+	 * recovery actions after scsi_remove_host() has returned.
+ */
+ ch->target = NULL;
+
+ ch->qp = NULL;
+ ch->send_cq = ch->recv_cq = NULL;
+
+ if (ch->rx_ring) {
+ for (i = 0; i < target->queue_size; ++i)
+ srp_free_iu(target->srp_host, ch->rx_ring[i]);
+ kfree(ch->rx_ring);
+ ch->rx_ring = NULL;
+ }
+ if (ch->tx_ring) {
+ for (i = 0; i < target->queue_size; ++i)
+ srp_free_iu(target->srp_host, ch->tx_ring[i]);
+ kfree(ch->tx_ring);
+ ch->tx_ring = NULL;
+ }
+}
+
+static void srp_path_rec_completion(int status,
+ struct ib_sa_path_rec *pathrec,
+ void *ch_ptr)
+{
+ struct srp_rdma_ch *ch = ch_ptr;
+ struct srp_target_port *target = ch->target;
+
+ ch->status = status;
+ if (status)
+ shost_printk(KERN_ERR, target->scsi_host,
+ PFX "Got failed path rec status %d\n", status);
+ else
+ ch->path = *pathrec;
+ complete(&ch->done);
+}
+
+static int srp_lookup_path(struct srp_rdma_ch *ch)
+{
+ struct srp_target_port *target = ch->target;
+ int ret;
+
+ ch->path.numb_path = 1;
+
+ init_completion(&ch->done);
+
+ ch->path_query_id = ib_sa_path_rec_get(&srp_sa_client,
+ target->srp_host->srp_dev->dev,
+ target->srp_host->port,
+ &ch->path,
+ IB_SA_PATH_REC_SERVICE_ID |
+ IB_SA_PATH_REC_DGID |
+ IB_SA_PATH_REC_SGID |
+ IB_SA_PATH_REC_NUMB_PATH |
+ IB_SA_PATH_REC_PKEY,
+ SRP_PATH_REC_TIMEOUT_MS,
+ GFP_KERNEL,
+ srp_path_rec_completion,
+ ch, &ch->path_query);
+ if (ch->path_query_id < 0)
+ return ch->path_query_id;
+
+ ret = wait_for_completion_interruptible(&ch->done);
+ if (ret < 0)
+ return ret;
+
+ if (ch->status < 0)
+ shost_printk(KERN_WARNING, target->scsi_host,
+ PFX "Path record query failed\n");
+
+ return ch->status;
+}
+
+static int srp_send_req(struct srp_rdma_ch *ch, bool multich)
+{
+ struct srp_target_port *target = ch->target;
+ struct {
+ struct ib_cm_req_param param;
+ struct srp_login_req priv;
+ } *req = NULL;
+ int status;
+
+ req = kzalloc(sizeof *req, GFP_KERNEL);
+ if (!req)
+ return -ENOMEM;
+
+ req->param.primary_path = &ch->path;
+ req->param.alternate_path = NULL;
+ req->param.service_id = target->service_id;
+ req->param.qp_num = ch->qp->qp_num;
+ req->param.qp_type = ch->qp->qp_type;
+ req->param.private_data = &req->priv;
+ req->param.private_data_len = sizeof req->priv;
+ req->param.flow_control = 1;
+
+ get_random_bytes(&req->param.starting_psn, 4);
+ req->param.starting_psn &= 0xffffff;
+
+ /*
+ * Pick some arbitrary defaults here; we could make these
+ * module parameters if anyone cared about setting them.
+ */
+ req->param.responder_resources = 4;
+ req->param.remote_cm_response_timeout = 20;
+ req->param.local_cm_response_timeout = 20;
+ req->param.retry_count = target->tl_retry_count;
+ req->param.rnr_retry_count = 7;
+ req->param.max_cm_retries = 15;
+
+ req->priv.opcode = SRP_LOGIN_REQ;
+ req->priv.tag = 0;
+ req->priv.req_it_iu_len = cpu_to_be32(target->max_iu_len);
+ req->priv.req_buf_fmt = cpu_to_be16(SRP_BUF_FORMAT_DIRECT |
+ SRP_BUF_FORMAT_INDIRECT);
+ req->priv.req_flags = (multich ? SRP_MULTICHAN_MULTI :
+ SRP_MULTICHAN_SINGLE);
+ /*
+ * In the published SRP specification (draft rev. 16a), the
+ * port identifier format is 8 bytes of ID extension followed
+ * by 8 bytes of GUID. Older drafts put the two halves in the
+ * opposite order, so that the GUID comes first.
+ *
+ * Targets conforming to these obsolete drafts can be
+ * recognized by the I/O Class they report.
+ */
+ if (target->io_class == SRP_REV10_IB_IO_CLASS) {
+ memcpy(req->priv.initiator_port_id,
+ &target->sgid.global.interface_id, 8);
+ memcpy(req->priv.initiator_port_id + 8,
+ &target->initiator_ext, 8);
+ memcpy(req->priv.target_port_id, &target->ioc_guid, 8);
+ memcpy(req->priv.target_port_id + 8, &target->id_ext, 8);
+ } else {
+ memcpy(req->priv.initiator_port_id,
+ &target->initiator_ext, 8);
+ memcpy(req->priv.initiator_port_id + 8,
+ &target->sgid.global.interface_id, 8);
+ memcpy(req->priv.target_port_id, &target->id_ext, 8);
+ memcpy(req->priv.target_port_id + 8, &target->ioc_guid, 8);
+ }
+
+ /*
+ * Topspin/Cisco SRP targets will reject our login unless we
+ * zero out the first 8 bytes of our initiator port ID and set
+ * the second 8 bytes to the local node GUID.
+ */
+ if (srp_target_is_topspin(target)) {
+ shost_printk(KERN_DEBUG, target->scsi_host,
+ PFX "Topspin/Cisco initiator port ID workaround "
+ "activated for target GUID %016llx\n",
+ (unsigned long long) be64_to_cpu(target->ioc_guid));
+ memset(req->priv.initiator_port_id, 0, 8);
+ memcpy(req->priv.initiator_port_id + 8,
+ &target->srp_host->srp_dev->dev->node_guid, 8);
+ }
+
+ status = ib_send_cm_req(ch->cm_id, &req->param);
+
+ kfree(req);
+
+ return status;
+}
+
+static bool srp_queue_remove_work(struct srp_target_port *target)
+{
+ bool changed = false;
+
+ spin_lock_irq(&target->lock);
+ if (target->state != SRP_TARGET_REMOVED) {
+ target->state = SRP_TARGET_REMOVED;
+ changed = true;
+ }
+ spin_unlock_irq(&target->lock);
+
+ if (changed)
+ queue_work(srp_remove_wq, &target->remove_work);
+
+ return changed;
+}
+
+static void srp_disconnect_target(struct srp_target_port *target)
+{
+ struct srp_rdma_ch *ch;
+ int i;
+
+ /* XXX should send SRP_I_LOGOUT request */
+
+ for (i = 0; i < target->ch_count; i++) {
+ ch = &target->ch[i];
+ ch->connected = false;
+ if (ch->cm_id && ib_send_cm_dreq(ch->cm_id, NULL, 0)) {
+ shost_printk(KERN_DEBUG, target->scsi_host,
+ PFX "Sending CM DREQ failed\n");
+ }
+ }
+}
+
+static void srp_free_req_data(struct srp_target_port *target,
+ struct srp_rdma_ch *ch)
+{
+ struct srp_device *dev = target->srp_host->srp_dev;
+ struct ib_device *ibdev = dev->dev;
+ struct srp_request *req;
+ int i;
+
+ if (!ch->target || !ch->req_ring)
+ return;
+
+ for (i = 0; i < target->req_ring_size; ++i) {
+ req = &ch->req_ring[i];
+ if (dev->use_fast_reg)
+ kfree(req->fr_list);
+ else
+ kfree(req->fmr_list);
+ kfree(req->map_page);
+ if (req->indirect_dma_addr) {
+ ib_dma_unmap_single(ibdev, req->indirect_dma_addr,
+ target->indirect_size,
+ DMA_TO_DEVICE);
+ }
+ kfree(req->indirect_desc);
+ }
+
+ kfree(ch->req_ring);
+ ch->req_ring = NULL;
+}
+
+static int srp_alloc_req_data(struct srp_rdma_ch *ch)
+{
+ struct srp_target_port *target = ch->target;
+ struct srp_device *srp_dev = target->srp_host->srp_dev;
+ struct ib_device *ibdev = srp_dev->dev;
+ struct srp_request *req;
+ void *mr_list;
+ dma_addr_t dma_addr;
+ int i, ret = -ENOMEM;
+
+ ch->req_ring = kcalloc(target->req_ring_size, sizeof(*ch->req_ring),
+ GFP_KERNEL);
+ if (!ch->req_ring)
+ goto out;
+
+ for (i = 0; i < target->req_ring_size; ++i) {
+ req = &ch->req_ring[i];
+ mr_list = kmalloc(target->cmd_sg_cnt * sizeof(void *),
+ GFP_KERNEL);
+ if (!mr_list)
+ goto out;
+ if (srp_dev->use_fast_reg)
+ req->fr_list = mr_list;
+ else
+ req->fmr_list = mr_list;
+ req->map_page = kmalloc(srp_dev->max_pages_per_mr *
+ sizeof(void *), GFP_KERNEL);
+ if (!req->map_page)
+ goto out;
+ req->indirect_desc = kmalloc(target->indirect_size, GFP_KERNEL);
+ if (!req->indirect_desc)
+ goto out;
+
+ dma_addr = ib_dma_map_single(ibdev, req->indirect_desc,
+ target->indirect_size,
+ DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(ibdev, dma_addr))
+ goto out;
+
+ req->indirect_dma_addr = dma_addr;
+ }
+ ret = 0;
+
+out:
+ return ret;
+}
+
+/**
+ * srp_del_scsi_host_attr() - Remove attributes defined in the host template.
+ * @shost: SCSI host whose attributes to remove from sysfs.
+ *
+ * Note: Any attributes defined in the host template that did not exist
+ * before this function was invoked will be ignored.
+ */
+static void srp_del_scsi_host_attr(struct Scsi_Host *shost)
+{
+ struct device_attribute **attr;
+
+ for (attr = shost->hostt->shost_attrs; attr && *attr; ++attr)
+ device_remove_file(&shost->shost_dev, *attr);
+}
+
+static void srp_remove_target(struct srp_target_port *target)
+{
+ struct srp_rdma_ch *ch;
+ int i;
+
+ WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED);
+
+ srp_del_scsi_host_attr(target->scsi_host);
+ srp_rport_get(target->rport);
+ srp_remove_host(target->scsi_host);
+ scsi_remove_host(target->scsi_host);
+ srp_stop_rport_timers(target->rport);
+ srp_disconnect_target(target);
+ for (i = 0; i < target->ch_count; i++) {
+ ch = &target->ch[i];
+ srp_free_ch_ib(target, ch);
+ }
+ cancel_work_sync(&target->tl_err_work);
+ srp_rport_put(target->rport);
+ for (i = 0; i < target->ch_count; i++) {
+ ch = &target->ch[i];
+ srp_free_req_data(target, ch);
+ }
+ kfree(target->ch);
+ target->ch = NULL;
+
+ spin_lock(&target->srp_host->target_lock);
+ list_del(&target->list);
+ spin_unlock(&target->srp_host->target_lock);
+
+ scsi_host_put(target->scsi_host);
+}
+
+static void srp_remove_work(struct work_struct *work)
+{
+ struct srp_target_port *target =
+ container_of(work, struct srp_target_port, remove_work);
+
+ WARN_ON_ONCE(target->state != SRP_TARGET_REMOVED);
+
+ srp_remove_target(target);
+}
+
+static void srp_rport_delete(struct srp_rport *rport)
+{
+ struct srp_target_port *target = rport->lld_data;
+
+ srp_queue_remove_work(target);
+}
+
+/**
+ * srp_connected_ch() - number of connected channels
+ * @target: SRP target port.
+ */
+static int srp_connected_ch(struct srp_target_port *target)
+{
+ int i, c = 0;
+
+ for (i = 0; i < target->ch_count; i++)
+ c += target->ch[i].connected;
+
+ return c;
+}
+
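+/*
+ * Resolve the IB path and keep sending SRP_LOGIN_REQ until the CM
+ * handler reports success, a redirect that can be followed, or a fatal
+ * status.
+ */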
+static int srp_connect_ch(struct srp_rdma_ch *ch, bool multich)
+{
+ struct srp_target_port *target = ch->target;
+ int ret;
+
+ WARN_ON_ONCE(!multich && srp_connected_ch(target) > 0);
+
+ ret = srp_lookup_path(ch);
+ if (ret)
+ return ret;
+
+ while (1) {
+ init_completion(&ch->done);
+ ret = srp_send_req(ch, multich);
+ if (ret)
+ return ret;
+ ret = wait_for_completion_interruptible(&ch->done);
+ if (ret < 0)
+ return ret;
+
+ /*
+ * The CM event handling code will set status to
+ * SRP_PORT_REDIRECT if we get a port redirect REJ
+ * back, or SRP_DLID_REDIRECT if we get a lid/qp
+ * redirect REJ back.
+ */
+ switch (ch->status) {
+ case 0:
+ ch->connected = true;
+ return 0;
+
+ case SRP_PORT_REDIRECT:
+ ret = srp_lookup_path(ch);
+ if (ret)
+ return ret;
+ break;
+
+ case SRP_DLID_REDIRECT:
+ break;
+
+ case SRP_STALE_CONN:
+ shost_printk(KERN_ERR, target->scsi_host, PFX
+ "giving up on stale connection\n");
+ ch->status = -ECONNRESET;
+ return ch->status;
+
+ default:
+ return ch->status;
+ }
+ }
+}
+
+static int srp_inv_rkey(struct srp_rdma_ch *ch, u32 rkey)
+{
+ struct ib_send_wr *bad_wr;
+ struct ib_send_wr wr = {
+ .opcode = IB_WR_LOCAL_INV,
+ .wr_id = LOCAL_INV_WR_ID_MASK,
+ .next = NULL,
+ .num_sge = 0,
+ .send_flags = 0,
+ .ex.invalidate_rkey = rkey,
+ };
+
+ return ib_post_send(ch->qp, &wr, &bad_wr);
+}
+
+static void srp_unmap_data(struct scsi_cmnd *scmnd,
+ struct srp_rdma_ch *ch,
+ struct srp_request *req)
+{
+ struct srp_target_port *target = ch->target;
+ struct srp_device *dev = target->srp_host->srp_dev;
+ struct ib_device *ibdev = dev->dev;
+ int i, res;
+
+ if (!scsi_sglist(scmnd) ||
+ (scmnd->sc_data_direction != DMA_TO_DEVICE &&
+ scmnd->sc_data_direction != DMA_FROM_DEVICE))
+ return;
+
+ if (dev->use_fast_reg) {
+ struct srp_fr_desc **pfr;
+
+ for (i = req->nmdesc, pfr = req->fr_list; i > 0; i--, pfr++) {
+ res = srp_inv_rkey(ch, (*pfr)->mr->rkey);
+ if (res < 0) {
+ shost_printk(KERN_ERR, target->scsi_host, PFX
+ "Queueing INV WR for rkey %#x failed (%d)\n",
+ (*pfr)->mr->rkey, res);
+ queue_work(system_long_wq,
+ &target->tl_err_work);
+ }
+ }
+ if (req->nmdesc)
+ srp_fr_pool_put(ch->fr_pool, req->fr_list,
+ req->nmdesc);
+ } else {
+ struct ib_pool_fmr **pfmr;
+
+ for (i = req->nmdesc, pfmr = req->fmr_list; i > 0; i--, pfmr++)
+ ib_fmr_pool_unmap(*pfmr);
+ }
+
+ ib_dma_unmap_sg(ibdev, scsi_sglist(scmnd), scsi_sg_count(scmnd),
+ scmnd->sc_data_direction);
+}
+
+/**
+ * srp_claim_req - Take ownership of the scmnd associated with a request.
+ * @ch: SRP RDMA channel.
+ * @req: SRP request.
+ * @sdev: If not NULL, only take ownership for this SCSI device.
+ * @scmnd: If NULL, take ownership of @req->scmnd. If not NULL, only take
+ * ownership of @req->scmnd if it equals @scmnd.
+ *
+ * Return value:
+ * Either NULL or a pointer to the SCSI command the caller became owner of.
+ */
+static struct scsi_cmnd *srp_claim_req(struct srp_rdma_ch *ch,
+ struct srp_request *req,
+ struct scsi_device *sdev,
+ struct scsi_cmnd *scmnd)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ch->lock, flags);
+ if (req->scmnd &&
+ (!sdev || req->scmnd->device == sdev) &&
+ (!scmnd || req->scmnd == scmnd)) {
+ scmnd = req->scmnd;
+ req->scmnd = NULL;
+ } else {
+ scmnd = NULL;
+ }
+ spin_unlock_irqrestore(&ch->lock, flags);
+
+ return scmnd;
+}
+
+/**
+ * srp_free_req() - Unmap data and add request to the free request list.
+ * @ch: SRP RDMA channel.
+ * @req: Request to be freed.
+ * @scmnd: SCSI command associated with @req.
+ * @req_lim_delta: Amount to be added to @target->req_lim.
+ */
+static void srp_free_req(struct srp_rdma_ch *ch, struct srp_request *req,
+ struct scsi_cmnd *scmnd, s32 req_lim_delta)
+{
+ unsigned long flags;
+
+ srp_unmap_data(scmnd, ch, req);
+
+ spin_lock_irqsave(&ch->lock, flags);
+ ch->req_lim += req_lim_delta;
+ spin_unlock_irqrestore(&ch->lock, flags);
+}
+
+static void srp_finish_req(struct srp_rdma_ch *ch, struct srp_request *req,
+ struct scsi_device *sdev, int result)
+{
+ struct scsi_cmnd *scmnd = srp_claim_req(ch, req, sdev, NULL);
+
+ if (scmnd) {
+ srp_free_req(ch, req, scmnd, 0);
+ scmnd->result = result;
+ scmnd->scsi_done(scmnd);
+ }
+}
+
+static void srp_terminate_io(struct srp_rport *rport)
+{
+ struct srp_target_port *target = rport->lld_data;
+ struct srp_rdma_ch *ch;
+ struct Scsi_Host *shost = target->scsi_host;
+ struct scsi_device *sdev;
+ int i, j;
+
+ /*
+ * Invoking srp_terminate_io() while srp_queuecommand() is running
+ * is not safe. Hence the warning statement below.
+ */
+ shost_for_each_device(sdev, shost)
+ WARN_ON_ONCE(sdev->request_queue->request_fn_active);
+
+ for (i = 0; i < target->ch_count; i++) {
+ ch = &target->ch[i];
+
+ for (j = 0; j < target->req_ring_size; ++j) {
+ struct srp_request *req = &ch->req_ring[j];
+
+ srp_finish_req(ch, req, NULL,
+ DID_TRANSPORT_FAILFAST << 16);
+ }
+ }
+}
+
+/*
+ * It is up to the caller to ensure that srp_rport_reconnect() calls are
+ * serialized and that no concurrent srp_queuecommand(), srp_abort(),
+ * srp_reset_device() or srp_reset_host() calls will occur while this function
+ * is in progress. One way to achieve that is not to call this function
+ * directly but to call srp_reconnect_rport() instead, since that function
+ * serializes calls to this function via rport->mutex and also blocks
+ * srp_queuecommand() calls before invoking this function.
+ */
+static int srp_rport_reconnect(struct srp_rport *rport)
+{
+ struct srp_target_port *target = rport->lld_data;
+ struct srp_rdma_ch *ch;
+ int i, j, ret = 0;
+ bool multich = false;
+
+ srp_disconnect_target(target);
+
+ if (target->state == SRP_TARGET_SCANNING)
+ return -ENODEV;
+
+ /*
+ * Now get a new local CM ID so that we avoid confusing the target in
+ * case things are really fouled up. Doing so also ensures that all CM
+ * callbacks will have finished before a new QP is allocated.
+ */
+ for (i = 0; i < target->ch_count; i++) {
+ ch = &target->ch[i];
+ if (!ch->target)
+ break;
+ ret += srp_new_cm_id(ch);
+ }
+ for (i = 0; i < target->ch_count; i++) {
+ ch = &target->ch[i];
+ if (!ch->target)
+ break;
+ for (j = 0; j < target->req_ring_size; ++j) {
+ struct srp_request *req = &ch->req_ring[j];
+
+ srp_finish_req(ch, req, NULL, DID_RESET << 16);
+ }
+ }
+ for (i = 0; i < target->ch_count; i++) {
+ ch = &target->ch[i];
+ if (!ch->target)
+ break;
+ /*
+ * Whether or not creating a new CM ID succeeded, create a new
+ * QP. This guarantees that all completion callback function
+ * invocations have finished before request resetting starts.
+ */
+ ret += srp_create_ch_ib(ch);
+
+ INIT_LIST_HEAD(&ch->free_tx);
+ for (j = 0; j < target->queue_size; ++j)
+ list_add(&ch->tx_ring[j]->list, &ch->free_tx);
+ }
+
+ target->qp_in_error = false;
+
+ for (i = 0; i < target->ch_count; i++) {
+ ch = &target->ch[i];
+ if (ret || !ch->target)
+ break;
+ ret = srp_connect_ch(ch, multich);
+ multich = true;
+ }
+
+ if (ret == 0)
+ shost_printk(KERN_INFO, target->scsi_host,
+ PFX "reconnect succeeded\n");
+
+ return ret;
+}
+
+static void srp_map_desc(struct srp_map_state *state, dma_addr_t dma_addr,
+ unsigned int dma_len, u32 rkey)
+{
+ struct srp_direct_buf *desc = state->desc;
+
+ desc->va = cpu_to_be64(dma_addr);
+ desc->key = cpu_to_be32(rkey);
+ desc->len = cpu_to_be32(dma_len);
+
+ state->total_len += dma_len;
+ state->desc++;
+ state->ndesc++;
+}
+
+static int srp_map_finish_fmr(struct srp_map_state *state,
+ struct srp_rdma_ch *ch)
+{
+ struct ib_pool_fmr *fmr;
+ u64 io_addr = 0;
+
+ fmr = ib_fmr_pool_map_phys(ch->fmr_pool, state->pages,
+ state->npages, io_addr);
+ if (IS_ERR(fmr))
+ return PTR_ERR(fmr);
+
+ *state->next_fmr++ = fmr;
+ state->nmdesc++;
+
+ srp_map_desc(state, 0, state->dma_len, fmr->fmr->rkey);
+
+ return 0;
+}
+
+static int srp_map_finish_fr(struct srp_map_state *state,
+ struct srp_rdma_ch *ch)
+{
+ struct srp_target_port *target = ch->target;
+ struct srp_device *dev = target->srp_host->srp_dev;
+ struct ib_send_wr *bad_wr;
+ struct ib_send_wr wr;
+ struct srp_fr_desc *desc;
+ u32 rkey;
+
+ desc = srp_fr_pool_get(ch->fr_pool);
+ if (!desc)
+ return -ENOMEM;
+
+ rkey = ib_inc_rkey(desc->mr->rkey);
+ ib_update_fast_reg_key(desc->mr, rkey);
+
+ memcpy(desc->frpl->page_list, state->pages,
+ sizeof(state->pages[0]) * state->npages);
+
+ memset(&wr, 0, sizeof(wr));
+ wr.opcode = IB_WR_FAST_REG_MR;
+ wr.wr_id = FAST_REG_WR_ID_MASK;
+ wr.wr.fast_reg.iova_start = state->base_dma_addr;
+ wr.wr.fast_reg.page_list = desc->frpl;
+ wr.wr.fast_reg.page_list_len = state->npages;
+ wr.wr.fast_reg.page_shift = ilog2(dev->mr_page_size);
+ wr.wr.fast_reg.length = state->dma_len;
+ wr.wr.fast_reg.access_flags = (IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE);
+ wr.wr.fast_reg.rkey = desc->mr->lkey;
+
+ *state->next_fr++ = desc;
+ state->nmdesc++;
+
+ srp_map_desc(state, state->base_dma_addr, state->dma_len,
+ desc->mr->rkey);
+
+ return ib_post_send(ch->qp, &wr, &bad_wr);
+}
+
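+/*
+ * Flush the pages accumulated in @state: either emit a direct
+ * descriptor (single page and registration not forced) or register the
+ * pages through the FR/FMR pool and emit a descriptor for the new rkey.
+ */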
+static int srp_finish_mapping(struct srp_map_state *state,
+ struct srp_rdma_ch *ch)
+{
+ struct srp_target_port *target = ch->target;
+ int ret = 0;
+
+ if (state->npages == 0)
+ return 0;
+
+ if (state->npages == 1 && !register_always)
+ srp_map_desc(state, state->base_dma_addr, state->dma_len,
+ target->rkey);
+ else
+ ret = target->srp_host->srp_dev->use_fast_reg ?
+ srp_map_finish_fr(state, ch) :
+ srp_map_finish_fmr(state, ch);
+
+ if (ret == 0) {
+ state->npages = 0;
+ state->dma_len = 0;
+ }
+
+ return ret;
+}
+
+static void srp_map_update_start(struct srp_map_state *state,
+ struct scatterlist *sg, int sg_index,
+ dma_addr_t dma_addr)
+{
+ state->unmapped_sg = sg;
+ state->unmapped_index = sg_index;
+ state->unmapped_addr = dma_addr;
+}
+
+static int srp_map_sg_entry(struct srp_map_state *state,
+ struct srp_rdma_ch *ch,
+ struct scatterlist *sg, int sg_index,
+ bool use_mr)
+{
+ struct srp_target_port *target = ch->target;
+ struct srp_device *dev = target->srp_host->srp_dev;
+ struct ib_device *ibdev = dev->dev;
+ dma_addr_t dma_addr = ib_sg_dma_address(ibdev, sg);
+ unsigned int dma_len = ib_sg_dma_len(ibdev, sg);
+ unsigned int len;
+ int ret;
+
+ if (!dma_len)
+ return 0;
+
+ if (!use_mr) {
+ /*
+ * Once we're in direct map mode for a request, we don't
+ * go back to FMR or FR mode, so no need to update anything
+ * other than the descriptor.
+ */
+ srp_map_desc(state, dma_addr, dma_len, target->rkey);
+ return 0;
+ }
+
+ /*
+ * Since not all RDMA HW drivers support non-zero page offsets for
+ * FMR, if we start at an offset into a page, don't merge into the
+ * current FMR mapping. Finish it out, and use the kernel's MR for
+ * this sg entry.
+ */
+ if ((!dev->use_fast_reg && dma_addr & ~dev->mr_page_mask) ||
+ dma_len > dev->mr_max_size) {
+ ret = srp_finish_mapping(state, ch);
+ if (ret)
+ return ret;
+
+ srp_map_desc(state, dma_addr, dma_len, target->rkey);
+ srp_map_update_start(state, NULL, 0, 0);
+ return 0;
+ }
+
+ /*
+ * If this is the first sg that will be mapped via FMR or via FR, save
+ * our position. We need to know the first unmapped entry, its index,
+ * and the first unmapped address within that entry to be able to
+ * restart mapping after an error.
+ */
+ if (!state->unmapped_sg)
+ srp_map_update_start(state, sg, sg_index, dma_addr);
+
+ while (dma_len) {
+ unsigned offset = dma_addr & ~dev->mr_page_mask;
+ if (state->npages == dev->max_pages_per_mr || offset != 0) {
+ ret = srp_finish_mapping(state, ch);
+ if (ret)
+ return ret;
+
+ srp_map_update_start(state, sg, sg_index, dma_addr);
+ }
+
+ len = min_t(unsigned int, dma_len, dev->mr_page_size - offset);
+
+ if (!state->npages)
+ state->base_dma_addr = dma_addr;
+ state->pages[state->npages++] = dma_addr & dev->mr_page_mask;
+ state->dma_len += len;
+ dma_addr += len;
+ dma_len -= len;
+ }
+
+ /*
+ * If the last entry of the MR wasn't a full page, then we need to
+ * close it out and start a new one -- we can only merge at page
+	 * boundaries.
+ */
+ ret = 0;
+ if (len != dev->mr_page_size) {
+ ret = srp_finish_mapping(state, ch);
+ if (!ret)
+ srp_map_update_start(state, NULL, 0, 0);
+ }
+ return ret;
+}
+
+static int srp_map_sg(struct srp_map_state *state, struct srp_rdma_ch *ch,
+ struct srp_request *req, struct scatterlist *scat,
+ int count)
+{
+ struct srp_target_port *target = ch->target;
+ struct srp_device *dev = target->srp_host->srp_dev;
+ struct ib_device *ibdev = dev->dev;
+ struct scatterlist *sg;
+ int i;
+ bool use_mr;
+
+ state->desc = req->indirect_desc;
+ state->pages = req->map_page;
+ if (dev->use_fast_reg) {
+ state->next_fr = req->fr_list;
+ use_mr = !!ch->fr_pool;
+ } else {
+ state->next_fmr = req->fmr_list;
+ use_mr = !!ch->fmr_pool;
+ }
+
+ for_each_sg(scat, sg, count, i) {
+ if (srp_map_sg_entry(state, ch, sg, i, use_mr)) {
+ /*
+ * Memory registration failed, so backtrack to the
+ * first unmapped entry and continue on without using
+ * memory registration.
+ */
+ dma_addr_t dma_addr;
+ unsigned int dma_len;
+
+backtrack:
+ sg = state->unmapped_sg;
+ i = state->unmapped_index;
+
+ dma_addr = ib_sg_dma_address(ibdev, sg);
+ dma_len = ib_sg_dma_len(ibdev, sg);
+ dma_len -= (state->unmapped_addr - dma_addr);
+ dma_addr = state->unmapped_addr;
+ use_mr = false;
+ srp_map_desc(state, dma_addr, dma_len, target->rkey);
+ }
+ }
+
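+	/*
+	 * If registering the final, partially filled memory region fails,
+	 * fall back to direct mapping for the remaining entries as well.
+	 */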
+ if (use_mr && srp_finish_mapping(state, ch))
+ goto backtrack;
+
+ req->nmdesc = state->nmdesc;
+
+ return 0;
+}
+
+static int srp_map_data(struct scsi_cmnd *scmnd, struct srp_rdma_ch *ch,
+ struct srp_request *req)
+{
+ struct srp_target_port *target = ch->target;
+ struct scatterlist *scat;
+ struct srp_cmd *cmd = req->cmd->buf;
+ int len, nents, count;
+ struct srp_device *dev;
+ struct ib_device *ibdev;
+ struct srp_map_state state;
+ struct srp_indirect_buf *indirect_hdr;
+ u32 table_len;
+ u8 fmt;
+
+ if (!scsi_sglist(scmnd) || scmnd->sc_data_direction == DMA_NONE)
+ return sizeof (struct srp_cmd);
+
+ if (scmnd->sc_data_direction != DMA_FROM_DEVICE &&
+ scmnd->sc_data_direction != DMA_TO_DEVICE) {
+ shost_printk(KERN_WARNING, target->scsi_host,
+ PFX "Unhandled data direction %d\n",
+ scmnd->sc_data_direction);
+ return -EINVAL;
+ }
+
+ nents = scsi_sg_count(scmnd);
+ scat = scsi_sglist(scmnd);
+
+ dev = target->srp_host->srp_dev;
+ ibdev = dev->dev;
+
+ count = ib_dma_map_sg(ibdev, scat, nents, scmnd->sc_data_direction);
+ if (unlikely(count == 0))
+ return -EIO;
+
+ fmt = SRP_DATA_DESC_DIRECT;
+ len = sizeof (struct srp_cmd) + sizeof (struct srp_direct_buf);
+
+ if (count == 1 && !register_always) {
+ /*
+ * The midlayer only generated a single gather/scatter
+ * entry, or DMA mapping coalesced everything to a
+ * single entry. So a direct descriptor along with
+ * the DMA MR suffices.
+ */
+ struct srp_direct_buf *buf = (void *) cmd->add_data;
+
+ buf->va = cpu_to_be64(ib_sg_dma_address(ibdev, scat));
+ buf->key = cpu_to_be32(target->rkey);
+ buf->len = cpu_to_be32(ib_sg_dma_len(ibdev, scat));
+
+ req->nmdesc = 0;
+ goto map_complete;
+ }
+
+ /*
+ * We have more than one scatter/gather entry, so build our indirect
+ * descriptor table, trying to merge as many entries as we can.
+ */
+ indirect_hdr = (void *) cmd->add_data;
+
+ ib_dma_sync_single_for_cpu(ibdev, req->indirect_dma_addr,
+ target->indirect_size, DMA_TO_DEVICE);
+
+ memset(&state, 0, sizeof(state));
+ srp_map_sg(&state, ch, req, scat, count);
+
+	/*
+	 * We've mapped the request, now pull as much of the indirect
+ * descriptor table as we can into the command buffer. If this
+ * target is not using an external indirect table, we are
+ * guaranteed to fit into the command, as the SCSI layer won't
+ * give us more S/G entries than we allow.
+ */
+ if (state.ndesc == 1) {
+ /*
+ * Memory registration collapsed the sg-list into one entry,
+ * so use a direct descriptor.
+ */
+ struct srp_direct_buf *buf = (void *) cmd->add_data;
+
+ *buf = req->indirect_desc[0];
+ goto map_complete;
+ }
+
+ if (unlikely(target->cmd_sg_cnt < state.ndesc &&
+ !target->allow_ext_sg)) {
+ shost_printk(KERN_ERR, target->scsi_host,
+ "Could not fit S/G list into SRP_CMD\n");
+ return -EIO;
+ }
+
+ count = min(state.ndesc, target->cmd_sg_cnt);
+ table_len = state.ndesc * sizeof (struct srp_direct_buf);
+
+ fmt = SRP_DATA_DESC_INDIRECT;
+ len = sizeof(struct srp_cmd) + sizeof (struct srp_indirect_buf);
+ len += count * sizeof (struct srp_direct_buf);
+
+ memcpy(indirect_hdr->desc_list, req->indirect_desc,
+ count * sizeof (struct srp_direct_buf));
+
+ indirect_hdr->table_desc.va = cpu_to_be64(req->indirect_dma_addr);
+ indirect_hdr->table_desc.key = cpu_to_be32(target->rkey);
+ indirect_hdr->table_desc.len = cpu_to_be32(table_len);
+ indirect_hdr->len = cpu_to_be32(state.total_len);
+
+ if (scmnd->sc_data_direction == DMA_TO_DEVICE)
+ cmd->data_out_desc_cnt = count;
+ else
+ cmd->data_in_desc_cnt = count;
+
+ ib_dma_sync_single_for_device(ibdev, req->indirect_dma_addr, table_len,
+ DMA_TO_DEVICE);
+
+map_complete:
+ if (scmnd->sc_data_direction == DMA_TO_DEVICE)
+ cmd->buf_fmt = fmt << 4;
+ else
+ cmd->buf_fmt = fmt;
+
+ return len;
+}
+
+/*
+ * Return an IU to the free pool and, unless it carried a response, return a credit
+ */
+static void srp_put_tx_iu(struct srp_rdma_ch *ch, struct srp_iu *iu,
+ enum srp_iu_type iu_type)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&ch->lock, flags);
+ list_add(&iu->list, &ch->free_tx);
+ if (iu_type != SRP_IU_RSP)
+ ++ch->req_lim;
+ spin_unlock_irqrestore(&ch->lock, flags);
+}
+
+/*
+ * Must be called with ch->lock held to protect req_lim and free_tx.
+ * If the IU is not sent, it must be returned using srp_put_tx_iu().
+ *
+ * Note:
+ * An upper limit for the number of allocated information units for each
+ * request type is:
+ * - SRP_IU_CMD: SRP_CMD_SQ_SIZE, since the SCSI mid-layer never queues
+ * more than Scsi_Host.can_queue requests.
+ * - SRP_IU_TSK_MGMT: SRP_TSK_MGMT_SQ_SIZE.
+ * - SRP_IU_RSP: 1, since a conforming SRP target never sends more than
+ * one unanswered SRP request to an initiator.
+ */
+static struct srp_iu *__srp_get_tx_iu(struct srp_rdma_ch *ch,
+ enum srp_iu_type iu_type)
+{
+ struct srp_target_port *target = ch->target;
+ s32 rsv = (iu_type == SRP_IU_TSK_MGMT) ? 0 : SRP_TSK_MGMT_SQ_SIZE;
+ struct srp_iu *iu;
+
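+	/* Reclaim IUs whose send has completed before checking free_tx. */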
+ srp_send_completion(ch->send_cq, ch);
+
+ if (list_empty(&ch->free_tx))
+ return NULL;
+
+ /* Initiator responses to target requests do not consume credits */
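+	/*
+	 * Keep SRP_TSK_MGMT_SQ_SIZE credits in reserve for regular commands
+	 * so that a task management IU can always be sent.
+	 */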
+ if (iu_type != SRP_IU_RSP) {
+ if (ch->req_lim <= rsv) {
+ ++target->zero_req_lim;
+ return NULL;
+ }
+
+ --ch->req_lim;
+ }
+
+ iu = list_first_entry(&ch->free_tx, struct srp_iu, list);
+ list_del(&iu->list);
+ return iu;
+}
+
+static int srp_post_send(struct srp_rdma_ch *ch, struct srp_iu *iu, int len)
+{
+ struct srp_target_port *target = ch->target;
+ struct ib_sge list;
+ struct ib_send_wr wr, *bad_wr;
+
+ list.addr = iu->dma;
+ list.length = len;
+ list.lkey = target->lkey;
+
+ wr.next = NULL;
+ wr.wr_id = (uintptr_t) iu;
+ wr.sg_list = &list;
+ wr.num_sge = 1;
+ wr.opcode = IB_WR_SEND;
+ wr.send_flags = IB_SEND_SIGNALED;
+
+ return ib_post_send(ch->qp, &wr, &bad_wr);
+}
+
+static int srp_post_recv(struct srp_rdma_ch *ch, struct srp_iu *iu)
+{
+ struct srp_target_port *target = ch->target;
+ struct ib_recv_wr wr, *bad_wr;
+ struct ib_sge list;
+
+ list.addr = iu->dma;
+ list.length = iu->size;
+ list.lkey = target->lkey;
+
+ wr.next = NULL;
+ wr.wr_id = (uintptr_t) iu;
+ wr.sg_list = &list;
+ wr.num_sge = 1;
+
+ return ib_post_recv(ch->qp, &wr, &bad_wr);
+}
+
+static void srp_process_rsp(struct srp_rdma_ch *ch, struct srp_rsp *rsp)
+{
+ struct srp_target_port *target = ch->target;
+ struct srp_request *req;
+ struct scsi_cmnd *scmnd;
+ unsigned long flags;
+
+ if (unlikely(rsp->tag & SRP_TAG_TSK_MGMT)) {
+ spin_lock_irqsave(&ch->lock, flags);
+ ch->req_lim += be32_to_cpu(rsp->req_lim_delta);
+ spin_unlock_irqrestore(&ch->lock, flags);
+
+ ch->tsk_mgmt_status = -1;
+ if (be32_to_cpu(rsp->resp_data_len) >= 4)
+ ch->tsk_mgmt_status = rsp->data[3];
+ complete(&ch->tsk_mgmt_done);
+ } else {
+ scmnd = scsi_host_find_tag(target->scsi_host, rsp->tag);
+ if (scmnd) {
+ req = (void *)scmnd->host_scribble;
+ scmnd = srp_claim_req(ch, req, NULL, scmnd);
+ }
+ if (!scmnd) {
+ shost_printk(KERN_ERR, target->scsi_host,
+ "Null scmnd for RSP w/tag %#016llx received on ch %td / QP %#x\n",
+ rsp->tag, ch - target->ch, ch->qp->qp_num);
+
+ spin_lock_irqsave(&ch->lock, flags);
+ ch->req_lim += be32_to_cpu(rsp->req_lim_delta);
+ spin_unlock_irqrestore(&ch->lock, flags);
+
+ return;
+ }
+ scmnd->result = rsp->status;
+
+ if (rsp->flags & SRP_RSP_FLAG_SNSVALID) {
+ memcpy(scmnd->sense_buffer, rsp->data +
+ be32_to_cpu(rsp->resp_data_len),
+ min_t(int, be32_to_cpu(rsp->sense_data_len),
+ SCSI_SENSE_BUFFERSIZE));
+ }
+
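+		/*
+		 * Report an underflow as a positive residual and an overflow
+		 * as a negative residual.
+		 */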
+ if (unlikely(rsp->flags & SRP_RSP_FLAG_DIUNDER))
+ scsi_set_resid(scmnd, be32_to_cpu(rsp->data_in_res_cnt));
+ else if (unlikely(rsp->flags & SRP_RSP_FLAG_DIOVER))
+ scsi_set_resid(scmnd, -be32_to_cpu(rsp->data_in_res_cnt));
+ else if (unlikely(rsp->flags & SRP_RSP_FLAG_DOUNDER))
+ scsi_set_resid(scmnd, be32_to_cpu(rsp->data_out_res_cnt));
+ else if (unlikely(rsp->flags & SRP_RSP_FLAG_DOOVER))
+ scsi_set_resid(scmnd, -be32_to_cpu(rsp->data_out_res_cnt));
+
+ srp_free_req(ch, req, scmnd,
+ be32_to_cpu(rsp->req_lim_delta));
+
+ scmnd->host_scribble = NULL;
+ scmnd->scsi_done(scmnd);
+ }
+}
+
+static int srp_response_common(struct srp_rdma_ch *ch, s32 req_delta,
+ void *rsp, int len)
+{
+ struct srp_target_port *target = ch->target;
+ struct ib_device *dev = target->srp_host->srp_dev->dev;
+ unsigned long flags;
+ struct srp_iu *iu;
+ int err;
+
+ spin_lock_irqsave(&ch->lock, flags);
+ ch->req_lim += req_delta;
+ iu = __srp_get_tx_iu(ch, SRP_IU_RSP);
+ spin_unlock_irqrestore(&ch->lock, flags);
+
+ if (!iu) {
+ shost_printk(KERN_ERR, target->scsi_host, PFX
+ "no IU available to send response\n");
+ return 1;
+ }
+
+ ib_dma_sync_single_for_cpu(dev, iu->dma, len, DMA_TO_DEVICE);
+ memcpy(iu->buf, rsp, len);
+ ib_dma_sync_single_for_device(dev, iu->dma, len, DMA_TO_DEVICE);
+
+ err = srp_post_send(ch, iu, len);
+ if (err) {
+ shost_printk(KERN_ERR, target->scsi_host, PFX
+ "unable to post response: %d\n", err);
+ srp_put_tx_iu(ch, iu, SRP_IU_RSP);
+ }
+
+ return err;
+}
+
+static void srp_process_cred_req(struct srp_rdma_ch *ch,
+ struct srp_cred_req *req)
+{
+ struct srp_cred_rsp rsp = {
+ .opcode = SRP_CRED_RSP,
+ .tag = req->tag,
+ };
+ s32 delta = be32_to_cpu(req->req_lim_delta);
+
+ if (srp_response_common(ch, delta, &rsp, sizeof(rsp)))
+ shost_printk(KERN_ERR, ch->target->scsi_host, PFX
+ "problems processing SRP_CRED_REQ\n");
+}
+
+static void srp_process_aer_req(struct srp_rdma_ch *ch,
+ struct srp_aer_req *req)
+{
+ struct srp_target_port *target = ch->target;
+ struct srp_aer_rsp rsp = {
+ .opcode = SRP_AER_RSP,
+ .tag = req->tag,
+ };
+ s32 delta = be32_to_cpu(req->req_lim_delta);
+
+ shost_printk(KERN_ERR, target->scsi_host, PFX
+ "ignoring AER for LUN %llu\n", be64_to_cpu(req->lun));
+
+ if (srp_response_common(ch, delta, &rsp, sizeof(rsp)))
+ shost_printk(KERN_ERR, target->scsi_host, PFX
+ "problems processing SRP_AER_REQ\n");
+}
+
+static void srp_handle_recv(struct srp_rdma_ch *ch, struct ib_wc *wc)
+{
+ struct srp_target_port *target = ch->target;
+ struct ib_device *dev = target->srp_host->srp_dev->dev;
+ struct srp_iu *iu = (struct srp_iu *) (uintptr_t) wc->wr_id;
+ int res;
+ u8 opcode;
+
+ ib_dma_sync_single_for_cpu(dev, iu->dma, ch->max_ti_iu_len,
+ DMA_FROM_DEVICE);
+
+ opcode = *(u8 *) iu->buf;
+
+ if (0) {
+ shost_printk(KERN_ERR, target->scsi_host,
+ PFX "recv completion, opcode 0x%02x\n", opcode);
+ print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 8, 1,
+ iu->buf, wc->byte_len, true);
+ }
+
+ switch (opcode) {
+ case SRP_RSP:
+ srp_process_rsp(ch, iu->buf);
+ break;
+
+ case SRP_CRED_REQ:
+ srp_process_cred_req(ch, iu->buf);
+ break;
+
+ case SRP_AER_REQ:
+ srp_process_aer_req(ch, iu->buf);
+ break;
+
+ case SRP_T_LOGOUT:
+ /* XXX Handle target logout */
+ shost_printk(KERN_WARNING, target->scsi_host,
+ PFX "Got target logout request\n");
+ break;
+
+ default:
+ shost_printk(KERN_WARNING, target->scsi_host,
+ PFX "Unhandled SRP opcode 0x%02x\n", opcode);
+ break;
+ }
+
+ ib_dma_sync_single_for_device(dev, iu->dma, ch->max_ti_iu_len,
+ DMA_FROM_DEVICE);
+
+ res = srp_post_recv(ch, iu);
+ if (res != 0)
+ shost_printk(KERN_ERR, target->scsi_host,
+ PFX "Recv failed with error code %d\n", res);
+}
+
+/**
+ * srp_tl_err_work() - handle a transport layer error
+ * @work: Work structure embedded in an SRP target port.
+ *
+ * Note: This function may get invoked before the rport has been created,
+ * hence the target->rport test.
+ */
+static void srp_tl_err_work(struct work_struct *work)
+{
+ struct srp_target_port *target;
+
+ target = container_of(work, struct srp_target_port, tl_err_work);
+ if (target->rport)
+ srp_start_tl_fail_timers(target->rport);
+}
+
+static void srp_handle_qp_err(u64 wr_id, enum ib_wc_status wc_status,
+ bool send_err, struct srp_rdma_ch *ch)
+{
+ struct srp_target_port *target = ch->target;
+
+ if (wr_id == SRP_LAST_WR_ID) {
+ complete(&ch->done);
+ return;
+ }
+
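+	/*
+	 * The wr_id of a failed work request either carries an IU pointer or
+	 * has one of the LOCAL_INV / FAST_REG marker bits set by the memory
+	 * registration code.
+	 */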
+ if (ch->connected && !target->qp_in_error) {
+ if (wr_id & LOCAL_INV_WR_ID_MASK) {
+ shost_printk(KERN_ERR, target->scsi_host, PFX
+ "LOCAL_INV failed with status %d\n",
+ wc_status);
+ } else if (wr_id & FAST_REG_WR_ID_MASK) {
+ shost_printk(KERN_ERR, target->scsi_host, PFX
+ "FAST_REG_MR failed status %d\n",
+ wc_status);
+ } else {
+ shost_printk(KERN_ERR, target->scsi_host,
+ PFX "failed %s status %d for iu %p\n",
+ send_err ? "send" : "receive",
+ wc_status, (void *)(uintptr_t)wr_id);
+ }
+ queue_work(system_long_wq, &target->tl_err_work);
+ }
+ target->qp_in_error = true;
+}
+
+static void srp_recv_completion(struct ib_cq *cq, void *ch_ptr)
+{
+ struct srp_rdma_ch *ch = ch_ptr;
+ struct ib_wc wc;
+
+ ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+ while (ib_poll_cq(cq, 1, &wc) > 0) {
+ if (likely(wc.status == IB_WC_SUCCESS)) {
+ srp_handle_recv(ch, &wc);
+ } else {
+ srp_handle_qp_err(wc.wr_id, wc.status, false, ch);
+ }
+ }
+}
+
+static void srp_send_completion(struct ib_cq *cq, void *ch_ptr)
+{
+ struct srp_rdma_ch *ch = ch_ptr;
+ struct ib_wc wc;
+ struct srp_iu *iu;
+
+ while (ib_poll_cq(cq, 1, &wc) > 0) {
+ if (likely(wc.status == IB_WC_SUCCESS)) {
+ iu = (struct srp_iu *) (uintptr_t) wc.wr_id;
+ list_add(&iu->list, &ch->free_tx);
+ } else {
+ srp_handle_qp_err(wc.wr_id, wc.status, true, ch);
+ }
+ }
+}
+
+static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd)
+{
+ struct srp_target_port *target = host_to_target(shost);
+ struct srp_rport *rport = target->rport;
+ struct srp_rdma_ch *ch;
+ struct srp_request *req;
+ struct srp_iu *iu;
+ struct srp_cmd *cmd;
+ struct ib_device *dev;
+ unsigned long flags;
+ u32 tag;
+ u16 idx;
+ int len, ret;
+ const bool in_scsi_eh = !in_interrupt() && current == shost->ehandler;
+
+ /*
+ * The SCSI EH thread is the only context from which srp_queuecommand()
+ * can get invoked for blocked devices (SDEV_BLOCK /
+ * SDEV_CREATED_BLOCK). Avoid racing with srp_reconnect_rport() by
+ * locking the rport mutex if invoked from inside the SCSI EH.
+ */
+ if (in_scsi_eh)
+ mutex_lock(&rport->mutex);
+
+ scmnd->result = srp_chkready(target->rport);
+ if (unlikely(scmnd->result))
+ goto err;
+
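+	/*
+	 * The block layer tag encodes both the hardware queue index and the
+	 * per-queue tag; use the former to select the RDMA channel and the
+	 * latter as the index into the request ring.
+	 */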
+ WARN_ON_ONCE(scmnd->request->tag < 0);
+ tag = blk_mq_unique_tag(scmnd->request);
+ ch = &target->ch[blk_mq_unique_tag_to_hwq(tag)];
+ idx = blk_mq_unique_tag_to_tag(tag);
+ WARN_ONCE(idx >= target->req_ring_size, "%s: tag %#x: idx %d >= %d\n",
+ dev_name(&shost->shost_gendev), tag, idx,
+ target->req_ring_size);
+
+ spin_lock_irqsave(&ch->lock, flags);
+ iu = __srp_get_tx_iu(ch, SRP_IU_CMD);
+ spin_unlock_irqrestore(&ch->lock, flags);
+
+ if (!iu)
+ goto err;
+
+ req = &ch->req_ring[idx];
+ dev = target->srp_host->srp_dev->dev;
+ ib_dma_sync_single_for_cpu(dev, iu->dma, target->max_iu_len,
+ DMA_TO_DEVICE);
+
+ scmnd->host_scribble = (void *) req;
+
+ cmd = iu->buf;
+ memset(cmd, 0, sizeof *cmd);
+
+ cmd->opcode = SRP_CMD;
+ cmd->lun = cpu_to_be64((u64) scmnd->device->lun << 48);
+ cmd->tag = tag;
+ memcpy(cmd->cdb, scmnd->cmnd, scmnd->cmd_len);
+
+ req->scmnd = scmnd;
+ req->cmd = iu;
+
+ len = srp_map_data(scmnd, ch, req);
+ if (len < 0) {
+ shost_printk(KERN_ERR, target->scsi_host,
+ PFX "Failed to map data (%d)\n", len);
+ /*
+ * If we ran out of memory descriptors (-ENOMEM) because an
+ * application is queuing many requests with more than
+ * max_pages_per_mr sg-list elements, tell the SCSI mid-layer
+ * to reduce queue depth temporarily.
+ */
+ scmnd->result = len == -ENOMEM ?
+ DID_OK << 16 | QUEUE_FULL << 1 : DID_ERROR << 16;
+ goto err_iu;
+ }
+
+ ib_dma_sync_single_for_device(dev, iu->dma, target->max_iu_len,
+ DMA_TO_DEVICE);
+
+ if (srp_post_send(ch, iu, len)) {
+ shost_printk(KERN_ERR, target->scsi_host, PFX "Send failed\n");
+ goto err_unmap;
+ }
+
+ ret = 0;
+
+unlock_rport:
+ if (in_scsi_eh)
+ mutex_unlock(&rport->mutex);
+
+ return ret;
+
+err_unmap:
+ srp_unmap_data(scmnd, ch, req);
+
+err_iu:
+ srp_put_tx_iu(ch, iu, SRP_IU_CMD);
+
+ /*
+	 * Prevent the loops that iterate over the request ring from
+	 * encountering a dangling SCSI command pointer.
+ */
+ req->scmnd = NULL;
+
+err:
+ if (scmnd->result) {
+ scmnd->scsi_done(scmnd);
+ ret = 0;
+ } else {
+ ret = SCSI_MLQUEUE_HOST_BUSY;
+ }
+
+ goto unlock_rport;
+}
+
+/*
+ * Note: the resources allocated in this function are freed in
+ * srp_free_ch_ib().
+ */
+static int srp_alloc_iu_bufs(struct srp_rdma_ch *ch)
+{
+ struct srp_target_port *target = ch->target;
+ int i;
+
+ ch->rx_ring = kcalloc(target->queue_size, sizeof(*ch->rx_ring),
+ GFP_KERNEL);
+ if (!ch->rx_ring)
+ goto err_no_ring;
+ ch->tx_ring = kcalloc(target->queue_size, sizeof(*ch->tx_ring),
+ GFP_KERNEL);
+ if (!ch->tx_ring)
+ goto err_no_ring;
+
+ for (i = 0; i < target->queue_size; ++i) {
+ ch->rx_ring[i] = srp_alloc_iu(target->srp_host,
+ ch->max_ti_iu_len,
+ GFP_KERNEL, DMA_FROM_DEVICE);
+ if (!ch->rx_ring[i])
+ goto err;
+ }
+
+ for (i = 0; i < target->queue_size; ++i) {
+ ch->tx_ring[i] = srp_alloc_iu(target->srp_host,
+ target->max_iu_len,
+ GFP_KERNEL, DMA_TO_DEVICE);
+ if (!ch->tx_ring[i])
+ goto err;
+
+ list_add(&ch->tx_ring[i]->list, &ch->free_tx);
+ }
+
+ return 0;
+
+err:
+ for (i = 0; i < target->queue_size; ++i) {
+ srp_free_iu(target->srp_host, ch->rx_ring[i]);
+ srp_free_iu(target->srp_host, ch->tx_ring[i]);
+ }
+
+err_no_ring:
+ kfree(ch->tx_ring);
+ ch->tx_ring = NULL;
+ kfree(ch->rx_ring);
+ ch->rx_ring = NULL;
+
+ return -ENOMEM;
+}
+
+static uint32_t srp_compute_rq_tmo(struct ib_qp_attr *qp_attr, int attr_mask)
+{
+ uint64_t T_tr_ns, max_compl_time_ms;
+ uint32_t rq_tmo_jiffies;
+
+ /*
+ * According to section 11.2.4.2 in the IBTA spec (Modify Queue Pair,
+ * table 91), both the QP timeout and the retry count have to be set
+ * for RC QP's during the RTR to RTS transition.
+ */
+ WARN_ON_ONCE((attr_mask & (IB_QP_TIMEOUT | IB_QP_RETRY_CNT)) !=
+ (IB_QP_TIMEOUT | IB_QP_RETRY_CNT));
+
+ /*
+ * Set target->rq_tmo_jiffies to one second more than the largest time
+ * it can take before an error completion is generated. See also
+ * C9-140..142 in the IBTA spec for more information about how to
+ * convert the QP Local ACK Timeout value to nanoseconds.
+ */
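+	/*
+	 * As a hypothetical example, timeout = 19 and retry_cnt = 7 yield
+	 * T_tr_ns = 4096 * 2^19 ns ~= 2.15 s, a maximum completion time of
+	 * about 60 s and hence an rq_tmo_jiffies of roughly 61 seconds.
+	 */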
+ T_tr_ns = 4096 * (1ULL << qp_attr->timeout);
+ max_compl_time_ms = qp_attr->retry_cnt * 4 * T_tr_ns;
+ do_div(max_compl_time_ms, NSEC_PER_MSEC);
+ rq_tmo_jiffies = msecs_to_jiffies(max_compl_time_ms + 1000);
+
+ return rq_tmo_jiffies;
+}
+
+static void srp_cm_rep_handler(struct ib_cm_id *cm_id,
+ struct srp_login_rsp *lrsp,
+ struct srp_rdma_ch *ch)
+{
+ struct srp_target_port *target = ch->target;
+ struct ib_qp_attr *qp_attr = NULL;
+ int attr_mask = 0;
+ int ret;
+ int i;
+
+ if (lrsp->opcode == SRP_LOGIN_RSP) {
+ ch->max_ti_iu_len = be32_to_cpu(lrsp->max_ti_iu_len);
+ ch->req_lim = be32_to_cpu(lrsp->req_lim_delta);
+
+ /*
+ * Reserve credits for task management so we don't
+ * bounce requests back to the SCSI mid-layer.
+ */
+ target->scsi_host->can_queue
+ = min(ch->req_lim - SRP_TSK_MGMT_SQ_SIZE,
+ target->scsi_host->can_queue);
+ target->scsi_host->cmd_per_lun
+ = min_t(int, target->scsi_host->can_queue,
+ target->scsi_host->cmd_per_lun);
+ } else {
+ shost_printk(KERN_WARNING, target->scsi_host,
+ PFX "Unhandled RSP opcode %#x\n", lrsp->opcode);
+ ret = -ECONNRESET;
+ goto error;
+ }
+
+ if (!ch->rx_ring) {
+ ret = srp_alloc_iu_bufs(ch);
+ if (ret)
+ goto error;
+ }
+
+ ret = -ENOMEM;
+ qp_attr = kmalloc(sizeof *qp_attr, GFP_KERNEL);
+ if (!qp_attr)
+ goto error;
+
+ qp_attr->qp_state = IB_QPS_RTR;
+ ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask);
+ if (ret)
+ goto error_free;
+
+ ret = ib_modify_qp(ch->qp, qp_attr, attr_mask);
+ if (ret)
+ goto error_free;
+
+ for (i = 0; i < target->queue_size; i++) {
+ struct srp_iu *iu = ch->rx_ring[i];
+
+ ret = srp_post_recv(ch, iu);
+ if (ret)
+ goto error_free;
+ }
+
+ qp_attr->qp_state = IB_QPS_RTS;
+ ret = ib_cm_init_qp_attr(cm_id, qp_attr, &attr_mask);
+ if (ret)
+ goto error_free;
+
+ target->rq_tmo_jiffies = srp_compute_rq_tmo(qp_attr, attr_mask);
+
+ ret = ib_modify_qp(ch->qp, qp_attr, attr_mask);
+ if (ret)
+ goto error_free;
+
+ ret = ib_send_cm_rtu(cm_id, NULL, 0);
+
+error_free:
+ kfree(qp_attr);
+
+error:
+ ch->status = ret;
+}
+
+static void srp_cm_rej_handler(struct ib_cm_id *cm_id,
+ struct ib_cm_event *event,
+ struct srp_rdma_ch *ch)
+{
+ struct srp_target_port *target = ch->target;
+ struct Scsi_Host *shost = target->scsi_host;
+ struct ib_class_port_info *cpi;
+ int opcode;
+
+ switch (event->param.rej_rcvd.reason) {
+ case IB_CM_REJ_PORT_CM_REDIRECT:
+ cpi = event->param.rej_rcvd.ari;
+ ch->path.dlid = cpi->redirect_lid;
+ ch->path.pkey = cpi->redirect_pkey;
+ cm_id->remote_cm_qpn = be32_to_cpu(cpi->redirect_qp) & 0x00ffffff;
+ memcpy(ch->path.dgid.raw, cpi->redirect_gid, 16);
+
+ ch->status = ch->path.dlid ?
+ SRP_DLID_REDIRECT : SRP_PORT_REDIRECT;
+ break;
+
+ case IB_CM_REJ_PORT_REDIRECT:
+ if (srp_target_is_topspin(target)) {
+ /*
+ * Topspin/Cisco SRP gateways incorrectly send
+ * reject reason code 25 when they mean 24
+ * (port redirect).
+ */
+ memcpy(ch->path.dgid.raw,
+ event->param.rej_rcvd.ari, 16);
+
+ shost_printk(KERN_DEBUG, shost,
+ PFX "Topspin/Cisco redirect to target port GID %016llx%016llx\n",
+ be64_to_cpu(ch->path.dgid.global.subnet_prefix),
+ be64_to_cpu(ch->path.dgid.global.interface_id));
+
+ ch->status = SRP_PORT_REDIRECT;
+ } else {
+ shost_printk(KERN_WARNING, shost,
+ " REJ reason: IB_CM_REJ_PORT_REDIRECT\n");
+ ch->status = -ECONNRESET;
+ }
+ break;
+
+ case IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID:
+ shost_printk(KERN_WARNING, shost,
+ " REJ reason: IB_CM_REJ_DUPLICATE_LOCAL_COMM_ID\n");
+ ch->status = -ECONNRESET;
+ break;
+
+ case IB_CM_REJ_CONSUMER_DEFINED:
+ opcode = *(u8 *) event->private_data;
+ if (opcode == SRP_LOGIN_REJ) {
+ struct srp_login_rej *rej = event->private_data;
+ u32 reason = be32_to_cpu(rej->reason);
+
+ if (reason == SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE)
+ shost_printk(KERN_WARNING, shost,
+ PFX "SRP_LOGIN_REJ: requested max_it_iu_len too large\n");
+ else
+ shost_printk(KERN_WARNING, shost, PFX
+ "SRP LOGIN from %pI6 to %pI6 REJECTED, reason 0x%08x\n",
+ target->sgid.raw,
+ target->orig_dgid.raw, reason);
+ } else
+ shost_printk(KERN_WARNING, shost,
+ " REJ reason: IB_CM_REJ_CONSUMER_DEFINED,"
+ " opcode 0x%02x\n", opcode);
+ ch->status = -ECONNRESET;
+ break;
+
+ case IB_CM_REJ_STALE_CONN:
+ shost_printk(KERN_WARNING, shost, " REJ reason: stale connection\n");
+ ch->status = SRP_STALE_CONN;
+ break;
+
+ default:
+ shost_printk(KERN_WARNING, shost, " REJ reason 0x%x\n",
+ event->param.rej_rcvd.reason);
+ ch->status = -ECONNRESET;
+ }
+}
+
+static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
+{
+ struct srp_rdma_ch *ch = cm_id->context;
+ struct srp_target_port *target = ch->target;
+ int comp = 0;
+
+ switch (event->event) {
+ case IB_CM_REQ_ERROR:
+ shost_printk(KERN_DEBUG, target->scsi_host,
+ PFX "Sending CM REQ failed\n");
+ comp = 1;
+ ch->status = -ECONNRESET;
+ break;
+
+ case IB_CM_REP_RECEIVED:
+ comp = 1;
+ srp_cm_rep_handler(cm_id, event->private_data, ch);
+ break;
+
+ case IB_CM_REJ_RECEIVED:
+ shost_printk(KERN_DEBUG, target->scsi_host, PFX "REJ received\n");
+ comp = 1;
+
+ srp_cm_rej_handler(cm_id, event, ch);
+ break;
+
+ case IB_CM_DREQ_RECEIVED:
+ shost_printk(KERN_WARNING, target->scsi_host,
+ PFX "DREQ received - connection closed\n");
+ ch->connected = false;
+ if (ib_send_cm_drep(cm_id, NULL, 0))
+ shost_printk(KERN_ERR, target->scsi_host,
+ PFX "Sending CM DREP failed\n");
+ queue_work(system_long_wq, &target->tl_err_work);
+ break;
+
+ case IB_CM_TIMEWAIT_EXIT:
+ shost_printk(KERN_ERR, target->scsi_host,
+ PFX "connection closed\n");
+ comp = 1;
+
+ ch->status = 0;
+ break;
+
+ case IB_CM_MRA_RECEIVED:
+ case IB_CM_DREQ_ERROR:
+ case IB_CM_DREP_RECEIVED:
+ break;
+
+ default:
+ shost_printk(KERN_WARNING, target->scsi_host,
+ PFX "Unhandled CM event %d\n", event->event);
+ break;
+ }
+
+ if (comp)
+ complete(&ch->done);
+
+ return 0;
+}
+
+/**
+ * srp_change_queue_depth - set the device queue depth
+ * @sdev: scsi device struct
+ * @qdepth: requested queue depth
+ *
+ * Returns queue depth.
+ */
+static int
+srp_change_queue_depth(struct scsi_device *sdev, int qdepth)
+{
+ if (!sdev->tagged_supported)
+ qdepth = 1;
+ return scsi_change_queue_depth(sdev, qdepth);
+}
+
+static int srp_send_tsk_mgmt(struct srp_rdma_ch *ch, u64 req_tag,
+ unsigned int lun, u8 func)
+{
+ struct srp_target_port *target = ch->target;
+ struct srp_rport *rport = target->rport;
+ struct ib_device *dev = target->srp_host->srp_dev->dev;
+ struct srp_iu *iu;
+ struct srp_tsk_mgmt *tsk_mgmt;
+
+ if (!ch->connected || target->qp_in_error)
+ return -1;
+
+ init_completion(&ch->tsk_mgmt_done);
+
+ /*
+	 * Lock the rport mutex to prevent srp_create_ch_ib() from being
+	 * invoked while a task management function is being sent.
+ */
+ mutex_lock(&rport->mutex);
+ spin_lock_irq(&ch->lock);
+ iu = __srp_get_tx_iu(ch, SRP_IU_TSK_MGMT);
+ spin_unlock_irq(&ch->lock);
+
+ if (!iu) {
+ mutex_unlock(&rport->mutex);
+
+ return -1;
+ }
+
+ ib_dma_sync_single_for_cpu(dev, iu->dma, sizeof *tsk_mgmt,
+ DMA_TO_DEVICE);
+ tsk_mgmt = iu->buf;
+ memset(tsk_mgmt, 0, sizeof *tsk_mgmt);
+
+ tsk_mgmt->opcode = SRP_TSK_MGMT;
+ tsk_mgmt->lun = cpu_to_be64((u64) lun << 48);
+ tsk_mgmt->tag = req_tag | SRP_TAG_TSK_MGMT;
+ tsk_mgmt->tsk_mgmt_func = func;
+ tsk_mgmt->task_tag = req_tag;
+
+ ib_dma_sync_single_for_device(dev, iu->dma, sizeof *tsk_mgmt,
+ DMA_TO_DEVICE);
+ if (srp_post_send(ch, iu, sizeof(*tsk_mgmt))) {
+ srp_put_tx_iu(ch, iu, SRP_IU_TSK_MGMT);
+ mutex_unlock(&rport->mutex);
+
+ return -1;
+ }
+ mutex_unlock(&rport->mutex);
+
+ if (!wait_for_completion_timeout(&ch->tsk_mgmt_done,
+ msecs_to_jiffies(SRP_ABORT_TIMEOUT_MS)))
+ return -1;
+
+ return 0;
+}
+
+static int srp_abort(struct scsi_cmnd *scmnd)
+{
+ struct srp_target_port *target = host_to_target(scmnd->device->host);
+ struct srp_request *req = (struct srp_request *) scmnd->host_scribble;
+ u32 tag;
+ u16 ch_idx;
+ struct srp_rdma_ch *ch;
+ int ret;
+
+ shost_printk(KERN_ERR, target->scsi_host, "SRP abort called\n");
+
+ if (!req)
+ return SUCCESS;
+ tag = blk_mq_unique_tag(scmnd->request);
+ ch_idx = blk_mq_unique_tag_to_hwq(tag);
+ if (WARN_ON_ONCE(ch_idx >= target->ch_count))
+ return SUCCESS;
+ ch = &target->ch[ch_idx];
+ if (!srp_claim_req(ch, req, NULL, scmnd))
+ return SUCCESS;
+ shost_printk(KERN_ERR, target->scsi_host,
+ "Sending SRP abort for tag %#x\n", tag);
+ if (srp_send_tsk_mgmt(ch, tag, scmnd->device->lun,
+ SRP_TSK_ABORT_TASK) == 0)
+ ret = SUCCESS;
+ else if (target->rport->state == SRP_RPORT_LOST)
+ ret = FAST_IO_FAIL;
+ else
+ ret = FAILED;
+ srp_free_req(ch, req, scmnd, 0);
+ scmnd->result = DID_ABORT << 16;
+ scmnd->scsi_done(scmnd);
+
+ return ret;
+}
+
+static int srp_reset_device(struct scsi_cmnd *scmnd)
+{
+ struct srp_target_port *target = host_to_target(scmnd->device->host);
+ struct srp_rdma_ch *ch;
+	int i, j;
+
+ shost_printk(KERN_ERR, target->scsi_host, "SRP reset_device called\n");
+
+ ch = &target->ch[0];
+ if (srp_send_tsk_mgmt(ch, SRP_TAG_NO_REQ, scmnd->device->lun,
+ SRP_TSK_LUN_RESET))
+ return FAILED;
+ if (ch->tsk_mgmt_status)
+ return FAILED;
+
+ for (i = 0; i < target->ch_count; i++) {
+ ch = &target->ch[i];
+		for (j = 0; j < target->req_ring_size; ++j) {
+			struct srp_request *req = &ch->req_ring[j];
+
+ srp_finish_req(ch, req, scmnd->device, DID_RESET << 16);
+ }
+ }
+
+ return SUCCESS;
+}
+
+static int srp_reset_host(struct scsi_cmnd *scmnd)
+{
+ struct srp_target_port *target = host_to_target(scmnd->device->host);
+
+ shost_printk(KERN_ERR, target->scsi_host, PFX "SRP reset_host called\n");
+
+ return srp_reconnect_rport(target->rport) == 0 ? SUCCESS : FAILED;
+}
+
+static int srp_slave_configure(struct scsi_device *sdev)
+{
+ struct Scsi_Host *shost = sdev->host;
+ struct srp_target_port *target = host_to_target(shost);
+ struct request_queue *q = sdev->request_queue;
+ unsigned long timeout;
+
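+	/*
+	 * Give disk requests at least 30 seconds and at least as long as the
+	 * QP retry logic may need before they time out.
+	 */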
+ if (sdev->type == TYPE_DISK) {
+ timeout = max_t(unsigned, 30 * HZ, target->rq_tmo_jiffies);
+ blk_queue_rq_timeout(q, timeout);
+ }
+
+ return 0;
+}
+
+static ssize_t show_id_ext(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+ return sprintf(buf, "0x%016llx\n",
+ (unsigned long long) be64_to_cpu(target->id_ext));
+}
+
+static ssize_t show_ioc_guid(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+ return sprintf(buf, "0x%016llx\n",
+ (unsigned long long) be64_to_cpu(target->ioc_guid));
+}
+
+static ssize_t show_service_id(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+ return sprintf(buf, "0x%016llx\n",
+ (unsigned long long) be64_to_cpu(target->service_id));
+}
+
+static ssize_t show_pkey(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+ return sprintf(buf, "0x%04x\n", be16_to_cpu(target->pkey));
+}
+
+static ssize_t show_sgid(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+ return sprintf(buf, "%pI6\n", target->sgid.raw);
+}
+
+static ssize_t show_dgid(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct srp_target_port *target = host_to_target(class_to_shost(dev));
+ struct srp_rdma_ch *ch = &target->ch[0];
+
+ return sprintf(buf, "%pI6\n", ch->path.dgid.raw);
+}
+
+static ssize_t show_orig_dgid(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+ return sprintf(buf, "%pI6\n", target->orig_dgid.raw);
+}
+
+static ssize_t show_req_lim(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct srp_target_port *target = host_to_target(class_to_shost(dev));
+ struct srp_rdma_ch *ch;
+ int i, req_lim = INT_MAX;
+
+ for (i = 0; i < target->ch_count; i++) {
+ ch = &target->ch[i];
+ req_lim = min(req_lim, ch->req_lim);
+ }
+ return sprintf(buf, "%d\n", req_lim);
+}
+
+static ssize_t show_zero_req_lim(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+ return sprintf(buf, "%d\n", target->zero_req_lim);
+}
+
+static ssize_t show_local_ib_port(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+ return sprintf(buf, "%d\n", target->srp_host->port);
+}
+
+static ssize_t show_local_ib_device(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+ return sprintf(buf, "%s\n", target->srp_host->srp_dev->dev->name);
+}
+
+static ssize_t show_ch_count(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+ return sprintf(buf, "%d\n", target->ch_count);
+}
+
+static ssize_t show_comp_vector(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+ return sprintf(buf, "%d\n", target->comp_vector);
+}
+
+static ssize_t show_tl_retry_count(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+ return sprintf(buf, "%d\n", target->tl_retry_count);
+}
+
+static ssize_t show_cmd_sg_entries(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+ return sprintf(buf, "%u\n", target->cmd_sg_cnt);
+}
+
+static ssize_t show_allow_ext_sg(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct srp_target_port *target = host_to_target(class_to_shost(dev));
+
+ return sprintf(buf, "%s\n", target->allow_ext_sg ? "true" : "false");
+}
+
+static DEVICE_ATTR(id_ext, S_IRUGO, show_id_ext, NULL);
+static DEVICE_ATTR(ioc_guid, S_IRUGO, show_ioc_guid, NULL);
+static DEVICE_ATTR(service_id, S_IRUGO, show_service_id, NULL);
+static DEVICE_ATTR(pkey, S_IRUGO, show_pkey, NULL);
+static DEVICE_ATTR(sgid, S_IRUGO, show_sgid, NULL);
+static DEVICE_ATTR(dgid, S_IRUGO, show_dgid, NULL);
+static DEVICE_ATTR(orig_dgid, S_IRUGO, show_orig_dgid, NULL);
+static DEVICE_ATTR(req_lim, S_IRUGO, show_req_lim, NULL);
+static DEVICE_ATTR(zero_req_lim, S_IRUGO, show_zero_req_lim, NULL);
+static DEVICE_ATTR(local_ib_port, S_IRUGO, show_local_ib_port, NULL);
+static DEVICE_ATTR(local_ib_device, S_IRUGO, show_local_ib_device, NULL);
+static DEVICE_ATTR(ch_count, S_IRUGO, show_ch_count, NULL);
+static DEVICE_ATTR(comp_vector, S_IRUGO, show_comp_vector, NULL);
+static DEVICE_ATTR(tl_retry_count, S_IRUGO, show_tl_retry_count, NULL);
+static DEVICE_ATTR(cmd_sg_entries, S_IRUGO, show_cmd_sg_entries, NULL);
+static DEVICE_ATTR(allow_ext_sg, S_IRUGO, show_allow_ext_sg, NULL);
+
+static struct device_attribute *srp_host_attrs[] = {
+ &dev_attr_id_ext,
+ &dev_attr_ioc_guid,
+ &dev_attr_service_id,
+ &dev_attr_pkey,
+ &dev_attr_sgid,
+ &dev_attr_dgid,
+ &dev_attr_orig_dgid,
+ &dev_attr_req_lim,
+ &dev_attr_zero_req_lim,
+ &dev_attr_local_ib_port,
+ &dev_attr_local_ib_device,
+ &dev_attr_ch_count,
+ &dev_attr_comp_vector,
+ &dev_attr_tl_retry_count,
+ &dev_attr_cmd_sg_entries,
+ &dev_attr_allow_ext_sg,
+ NULL
+};
+
+static struct scsi_host_template srp_template = {
+ .module = THIS_MODULE,
+ .name = "InfiniBand SRP initiator",
+ .proc_name = DRV_NAME,
+ .slave_configure = srp_slave_configure,
+ .info = srp_target_info,
+ .queuecommand = srp_queuecommand,
+ .change_queue_depth = srp_change_queue_depth,
+ .eh_abort_handler = srp_abort,
+ .eh_device_reset_handler = srp_reset_device,
+ .eh_host_reset_handler = srp_reset_host,
+ .skip_settle_delay = true,
+ .sg_tablesize = SRP_DEF_SG_TABLESIZE,
+ .can_queue = SRP_DEFAULT_CMD_SQ_SIZE,
+ .this_id = -1,
+ .cmd_per_lun = SRP_DEFAULT_CMD_SQ_SIZE,
+ .use_clustering = ENABLE_CLUSTERING,
+ .shost_attrs = srp_host_attrs,
+ .use_blk_tags = 1,
+ .track_queue_depth = 1,
+};
+
+static int srp_sdev_count(struct Scsi_Host *host)
+{
+ struct scsi_device *sdev;
+ int c = 0;
+
+ shost_for_each_device(sdev, host)
+ c++;
+
+ return c;
+}
+
+static int srp_add_target(struct srp_host *host, struct srp_target_port *target)
+{
+ struct srp_rport_identifiers ids;
+ struct srp_rport *rport;
+
+ target->state = SRP_TARGET_SCANNING;
+ sprintf(target->target_name, "SRP.T10:%016llX",
+ (unsigned long long) be64_to_cpu(target->id_ext));
+
+ if (scsi_add_host(target->scsi_host, host->srp_dev->dev->dma_device))
+ return -ENODEV;
+
+ memcpy(ids.port_id, &target->id_ext, 8);
+ memcpy(ids.port_id + 8, &target->ioc_guid, 8);
+ ids.roles = SRP_RPORT_ROLE_TARGET;
+ rport = srp_rport_add(target->scsi_host, &ids);
+ if (IS_ERR(rport)) {
+ scsi_remove_host(target->scsi_host);
+ return PTR_ERR(rport);
+ }
+
+ rport->lld_data = target;
+ target->rport = rport;
+
+ spin_lock(&host->target_lock);
+ list_add_tail(&target->list, &host->target_list);
+ spin_unlock(&host->target_lock);
+
+ scsi_scan_target(&target->scsi_host->shost_gendev,
+ 0, target->scsi_id, SCAN_WILD_CARD, 0);
+
+ if (srp_connected_ch(target) < target->ch_count ||
+ target->qp_in_error) {
+ shost_printk(KERN_INFO, target->scsi_host,
+ PFX "SCSI scan failed - removing SCSI host\n");
+ srp_queue_remove_work(target);
+ goto out;
+ }
+
+ pr_debug(PFX "%s: SCSI scan succeeded - detected %d LUNs\n",
+ dev_name(&target->scsi_host->shost_gendev),
+ srp_sdev_count(target->scsi_host));
+
+ spin_lock_irq(&target->lock);
+ if (target->state == SRP_TARGET_SCANNING)
+ target->state = SRP_TARGET_LIVE;
+ spin_unlock_irq(&target->lock);
+
+out:
+ return 0;
+}
+
+static void srp_release_dev(struct device *dev)
+{
+ struct srp_host *host =
+ container_of(dev, struct srp_host, dev);
+
+ complete(&host->released);
+}
+
+static struct class srp_class = {
+ .name = "infiniband_srp",
+ .dev_release = srp_release_dev
+};
+
+/**
+ * srp_conn_unique() - check whether the connection to a target is unique
+ * @host: SRP host.
+ * @target: SRP target port.
+ */
+static bool srp_conn_unique(struct srp_host *host,
+ struct srp_target_port *target)
+{
+ struct srp_target_port *t;
+ bool ret = false;
+
+ if (target->state == SRP_TARGET_REMOVED)
+ goto out;
+
+ ret = true;
+
+ spin_lock(&host->target_lock);
+ list_for_each_entry(t, &host->target_list, list) {
+ if (t != target &&
+ target->id_ext == t->id_ext &&
+ target->ioc_guid == t->ioc_guid &&
+ target->initiator_ext == t->initiator_ext) {
+ ret = false;
+ break;
+ }
+ }
+ spin_unlock(&host->target_lock);
+
+out:
+ return ret;
+}
+
+/*
+ * Target ports are added by writing
+ *
+ * id_ext=<SRP ID ext>,ioc_guid=<SRP IOC GUID>,dgid=<dest GID>,
+ * pkey=<P_Key>,service_id=<service ID>
+ *
+ * to the add_target sysfs attribute.
+ */
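+
+/*
+ * Example (all values are hypothetical):
+ *
+ * echo id_ext=200100e08b000000,ioc_guid=0002c90200402bd4,\
+ * dgid=fe800000000000000002c90200402bd5,pkey=ffff,\
+ * service_id=0002c90200402bd4 > \
+ * /sys/class/infiniband_srp/srp-mlx4_0-1/add_target
+ */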
+enum {
+ SRP_OPT_ERR = 0,
+ SRP_OPT_ID_EXT = 1 << 0,
+ SRP_OPT_IOC_GUID = 1 << 1,
+ SRP_OPT_DGID = 1 << 2,
+ SRP_OPT_PKEY = 1 << 3,
+ SRP_OPT_SERVICE_ID = 1 << 4,
+ SRP_OPT_MAX_SECT = 1 << 5,
+ SRP_OPT_MAX_CMD_PER_LUN = 1 << 6,
+ SRP_OPT_IO_CLASS = 1 << 7,
+ SRP_OPT_INITIATOR_EXT = 1 << 8,
+ SRP_OPT_CMD_SG_ENTRIES = 1 << 9,
+ SRP_OPT_ALLOW_EXT_SG = 1 << 10,
+ SRP_OPT_SG_TABLESIZE = 1 << 11,
+ SRP_OPT_COMP_VECTOR = 1 << 12,
+ SRP_OPT_TL_RETRY_COUNT = 1 << 13,
+ SRP_OPT_QUEUE_SIZE = 1 << 14,
+ SRP_OPT_ALL = (SRP_OPT_ID_EXT |
+ SRP_OPT_IOC_GUID |
+ SRP_OPT_DGID |
+ SRP_OPT_PKEY |
+ SRP_OPT_SERVICE_ID),
+};
+
+static const match_table_t srp_opt_tokens = {
+ { SRP_OPT_ID_EXT, "id_ext=%s" },
+ { SRP_OPT_IOC_GUID, "ioc_guid=%s" },
+ { SRP_OPT_DGID, "dgid=%s" },
+ { SRP_OPT_PKEY, "pkey=%x" },
+ { SRP_OPT_SERVICE_ID, "service_id=%s" },
+ { SRP_OPT_MAX_SECT, "max_sect=%d" },
+ { SRP_OPT_MAX_CMD_PER_LUN, "max_cmd_per_lun=%d" },
+ { SRP_OPT_IO_CLASS, "io_class=%x" },
+ { SRP_OPT_INITIATOR_EXT, "initiator_ext=%s" },
+ { SRP_OPT_CMD_SG_ENTRIES, "cmd_sg_entries=%u" },
+ { SRP_OPT_ALLOW_EXT_SG, "allow_ext_sg=%u" },
+ { SRP_OPT_SG_TABLESIZE, "sg_tablesize=%u" },
+ { SRP_OPT_COMP_VECTOR, "comp_vector=%u" },
+ { SRP_OPT_TL_RETRY_COUNT, "tl_retry_count=%u" },
+ { SRP_OPT_QUEUE_SIZE, "queue_size=%d" },
+ { SRP_OPT_ERR, NULL }
+};
+
+static int srp_parse_options(const char *buf, struct srp_target_port *target)
+{
+ char *options, *sep_opt;
+ char *p;
+ char dgid[3];
+ substring_t args[MAX_OPT_ARGS];
+ int opt_mask = 0;
+ int token;
+ int ret = -EINVAL;
+ int i;
+
+ options = kstrdup(buf, GFP_KERNEL);
+ if (!options)
+ return -ENOMEM;
+
+ sep_opt = options;
+ while ((p = strsep(&sep_opt, ",\n")) != NULL) {
+ if (!*p)
+ continue;
+
+ token = match_token(p, srp_opt_tokens, args);
+ opt_mask |= token;
+
+ switch (token) {
+ case SRP_OPT_ID_EXT:
+ p = match_strdup(args);
+ if (!p) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ target->id_ext = cpu_to_be64(simple_strtoull(p, NULL, 16));
+ kfree(p);
+ break;
+
+ case SRP_OPT_IOC_GUID:
+ p = match_strdup(args);
+ if (!p) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ target->ioc_guid = cpu_to_be64(simple_strtoull(p, NULL, 16));
+ kfree(p);
+ break;
+
+ case SRP_OPT_DGID:
+ p = match_strdup(args);
+ if (!p) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ if (strlen(p) != 32) {
+ pr_warn("bad dest GID parameter '%s'\n", p);
+ kfree(p);
+ goto out;
+ }
+
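+			/*
+			 * Convert the 32 hex characters of the destination
+			 * GID into raw bytes, two characters at a time.
+			 */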
+ for (i = 0; i < 16; ++i) {
+ strlcpy(dgid, p + i * 2, sizeof(dgid));
+ if (sscanf(dgid, "%hhx",
+ &target->orig_dgid.raw[i]) < 1) {
+ ret = -EINVAL;
+ kfree(p);
+ goto out;
+ }
+ }
+ kfree(p);
+ break;
+
+ case SRP_OPT_PKEY:
+ if (match_hex(args, &token)) {
+ pr_warn("bad P_Key parameter '%s'\n", p);
+ goto out;
+ }
+ target->pkey = cpu_to_be16(token);
+ break;
+
+ case SRP_OPT_SERVICE_ID:
+ p = match_strdup(args);
+ if (!p) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ target->service_id = cpu_to_be64(simple_strtoull(p, NULL, 16));
+ kfree(p);
+ break;
+
+ case SRP_OPT_MAX_SECT:
+ if (match_int(args, &token)) {
+ pr_warn("bad max sect parameter '%s'\n", p);
+ goto out;
+ }
+ target->scsi_host->max_sectors = token;
+ break;
+
+ case SRP_OPT_QUEUE_SIZE:
+ if (match_int(args, &token) || token < 1) {
+ pr_warn("bad queue_size parameter '%s'\n", p);
+ goto out;
+ }
+ target->scsi_host->can_queue = token;
+ target->queue_size = token + SRP_RSP_SQ_SIZE +
+ SRP_TSK_MGMT_SQ_SIZE;
+ if (!(opt_mask & SRP_OPT_MAX_CMD_PER_LUN))
+ target->scsi_host->cmd_per_lun = token;
+ break;
+
+ case SRP_OPT_MAX_CMD_PER_LUN:
+ if (match_int(args, &token) || token < 1) {
+ pr_warn("bad max cmd_per_lun parameter '%s'\n",
+ p);
+ goto out;
+ }
+ target->scsi_host->cmd_per_lun = token;
+ break;
+
+ case SRP_OPT_IO_CLASS:
+ if (match_hex(args, &token)) {
+ pr_warn("bad IO class parameter '%s'\n", p);
+ goto out;
+ }
+ if (token != SRP_REV10_IB_IO_CLASS &&
+ token != SRP_REV16A_IB_IO_CLASS) {
+ pr_warn("unknown IO class parameter value %x specified (use %x or %x).\n",
+ token, SRP_REV10_IB_IO_CLASS,
+ SRP_REV16A_IB_IO_CLASS);
+ goto out;
+ }
+ target->io_class = token;
+ break;
+
+ case SRP_OPT_INITIATOR_EXT:
+ p = match_strdup(args);
+ if (!p) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ target->initiator_ext = cpu_to_be64(simple_strtoull(p, NULL, 16));
+ kfree(p);
+ break;
+
+ case SRP_OPT_CMD_SG_ENTRIES:
+ if (match_int(args, &token) || token < 1 || token > 255) {
+ pr_warn("bad max cmd_sg_entries parameter '%s'\n",
+ p);
+ goto out;
+ }
+ target->cmd_sg_cnt = token;
+ break;
+
+ case SRP_OPT_ALLOW_EXT_SG:
+ if (match_int(args, &token)) {
+ pr_warn("bad allow_ext_sg parameter '%s'\n", p);
+ goto out;
+ }
+ target->allow_ext_sg = !!token;
+ break;
+
+ case SRP_OPT_SG_TABLESIZE:
+ if (match_int(args, &token) || token < 1 ||
+ token > SCSI_MAX_SG_CHAIN_SEGMENTS) {
+ pr_warn("bad max sg_tablesize parameter '%s'\n",
+ p);
+ goto out;
+ }
+ target->sg_tablesize = token;
+ break;
+
+ case SRP_OPT_COMP_VECTOR:
+ if (match_int(args, &token) || token < 0) {
+ pr_warn("bad comp_vector parameter '%s'\n", p);
+ goto out;
+ }
+ target->comp_vector = token;
+ break;
+
+ case SRP_OPT_TL_RETRY_COUNT:
+ if (match_int(args, &token) || token < 2 || token > 7) {
+ pr_warn("bad tl_retry_count parameter '%s' (must be a number between 2 and 7)\n",
+ p);
+ goto out;
+ }
+ target->tl_retry_count = token;
+ break;
+
+ default:
+ pr_warn("unknown parameter or missing value '%s' in target creation request\n",
+ p);
+ goto out;
+ }
+ }
+
+ if ((opt_mask & SRP_OPT_ALL) == SRP_OPT_ALL)
+ ret = 0;
+ else
+ for (i = 0; i < ARRAY_SIZE(srp_opt_tokens); ++i)
+ if ((srp_opt_tokens[i].token & SRP_OPT_ALL) &&
+ !(srp_opt_tokens[i].token & opt_mask))
+ pr_warn("target creation request is missing parameter '%s'\n",
+ srp_opt_tokens[i].pattern);
+
+ if (target->scsi_host->cmd_per_lun > target->scsi_host->can_queue
+ && (opt_mask & SRP_OPT_MAX_CMD_PER_LUN))
+ pr_warn("cmd_per_lun = %d > queue_size = %d\n",
+ target->scsi_host->cmd_per_lun,
+ target->scsi_host->can_queue);
+
+out:
+ kfree(options);
+ return ret;
+}
+
+static ssize_t srp_create_target(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct srp_host *host =
+ container_of(dev, struct srp_host, dev);
+ struct Scsi_Host *target_host;
+ struct srp_target_port *target;
+ struct srp_rdma_ch *ch;
+ struct srp_device *srp_dev = host->srp_dev;
+ struct ib_device *ibdev = srp_dev->dev;
+ int ret, node_idx, node, cpu, i;
+ bool multich = false;
+
+ target_host = scsi_host_alloc(&srp_template,
+ sizeof (struct srp_target_port));
+ if (!target_host)
+ return -ENOMEM;
+
+ target_host->transportt = ib_srp_transport_template;
+ target_host->max_channel = 0;
+ target_host->max_id = 1;
+ target_host->max_lun = SRP_MAX_LUN;
+ target_host->max_cmd_len = sizeof ((struct srp_cmd *) (void *) 0L)->cdb;
+
+ target = host_to_target(target_host);
+
+ target->io_class = SRP_REV16A_IB_IO_CLASS;
+ target->scsi_host = target_host;
+ target->srp_host = host;
+ target->lkey = host->srp_dev->mr->lkey;
+ target->rkey = host->srp_dev->mr->rkey;
+ target->cmd_sg_cnt = cmd_sg_entries;
+ target->sg_tablesize = indirect_sg_entries ? : cmd_sg_entries;
+ target->allow_ext_sg = allow_ext_sg;
+ target->tl_retry_count = 7;
+ target->queue_size = SRP_DEFAULT_QUEUE_SIZE;
+
+ /*
+	 * Prevent the SCSI host from being removed by srp_remove_target()
+	 * before this function returns.
+ */
+ scsi_host_get(target->scsi_host);
+
+ mutex_lock(&host->add_target_mutex);
+
+ ret = srp_parse_options(buf, target);
+ if (ret)
+ goto out;
+
+ ret = scsi_init_shared_tag_map(target_host, target_host->can_queue);
+ if (ret)
+ goto out;
+
+ target->req_ring_size = target->queue_size - SRP_TSK_MGMT_SQ_SIZE;
+
+ if (!srp_conn_unique(target->srp_host, target)) {
+ shost_printk(KERN_INFO, target->scsi_host,
+ PFX "Already connected to target port with id_ext=%016llx;ioc_guid=%016llx;initiator_ext=%016llx\n",
+ be64_to_cpu(target->id_ext),
+ be64_to_cpu(target->ioc_guid),
+ be64_to_cpu(target->initiator_ext));
+ ret = -EEXIST;
+ goto out;
+ }
+
+ if (!srp_dev->has_fmr && !srp_dev->has_fr && !target->allow_ext_sg &&
+ target->cmd_sg_cnt < target->sg_tablesize) {
+ pr_warn("No MR pool and no external indirect descriptors, limiting sg_tablesize to cmd_sg_cnt\n");
+ target->sg_tablesize = target->cmd_sg_cnt;
+ }
+
+ target_host->sg_tablesize = target->sg_tablesize;
+ target->indirect_size = target->sg_tablesize *
+ sizeof (struct srp_direct_buf);
+ target->max_iu_len = sizeof (struct srp_cmd) +
+ sizeof (struct srp_indirect_buf) +
+ target->cmd_sg_cnt * sizeof (struct srp_direct_buf);
+
+ INIT_WORK(&target->tl_err_work, srp_tl_err_work);
+ INIT_WORK(&target->remove_work, srp_remove_work);
+ spin_lock_init(&target->lock);
+ ret = ib_query_gid(ibdev, host->port, 0, &target->sgid);
+ if (ret)
+ goto out;
+
+ ret = -ENOMEM;
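+	/*
+	 * By default use min(4 * #NUMA nodes, #completion vectors,
+	 * #online CPUs) channels, but at least one channel per NUMA node.
+	 * A non-zero ch_count module parameter overrides the first two of
+	 * these bounds.
+	 */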
+ target->ch_count = max_t(unsigned, num_online_nodes(),
+ min(ch_count ? :
+ min(4 * num_online_nodes(),
+ ibdev->num_comp_vectors),
+ num_online_cpus()));
+ target->ch = kcalloc(target->ch_count, sizeof(*target->ch),
+ GFP_KERNEL);
+ if (!target->ch)
+ goto out;
+
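+	/*
+	 * Distribute the channels evenly over the online NUMA nodes and
+	 * derive each node's completion vector range from its share of the
+	 * device's completion vectors.
+	 */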
+ node_idx = 0;
+ for_each_online_node(node) {
+ const int ch_start = (node_idx * target->ch_count /
+ num_online_nodes());
+ const int ch_end = ((node_idx + 1) * target->ch_count /
+ num_online_nodes());
+ const int cv_start = (node_idx * ibdev->num_comp_vectors /
+ num_online_nodes() + target->comp_vector)
+ % ibdev->num_comp_vectors;
+ const int cv_end = ((node_idx + 1) * ibdev->num_comp_vectors /
+ num_online_nodes() + target->comp_vector)
+ % ibdev->num_comp_vectors;
+ int cpu_idx = 0;
+
+ for_each_online_cpu(cpu) {
+ if (cpu_to_node(cpu) != node)
+ continue;
+ if (ch_start + cpu_idx >= ch_end)
+ continue;
+ ch = &target->ch[ch_start + cpu_idx];
+ ch->target = target;
+ ch->comp_vector = cv_start == cv_end ? cv_start :
+ cv_start + cpu_idx % (cv_end - cv_start);
+ spin_lock_init(&ch->lock);
+ INIT_LIST_HEAD(&ch->free_tx);
+ ret = srp_new_cm_id(ch);
+ if (ret)
+ goto err_disconnect;
+
+ ret = srp_create_ch_ib(ch);
+ if (ret)
+ goto err_disconnect;
+
+ ret = srp_alloc_req_data(ch);
+ if (ret)
+ goto err_disconnect;
+
+ ret = srp_connect_ch(ch, multich);
+ if (ret) {
+ shost_printk(KERN_ERR, target->scsi_host,
+ PFX "Connection %d/%d failed\n",
+ ch_start + cpu_idx,
+ target->ch_count);
+ if (node_idx == 0 && cpu_idx == 0) {
+ goto err_disconnect;
+ } else {
+ srp_free_ch_ib(target, ch);
+ srp_free_req_data(target, ch);
+ target->ch_count = ch - target->ch;
+ break;
+ }
+ }
+
+ multich = true;
+ cpu_idx++;
+ }
+ node_idx++;
+ }
+
+ target->scsi_host->nr_hw_queues = target->ch_count;
+
+ ret = srp_add_target(host, target);
+ if (ret)
+ goto err_disconnect;
+
+ if (target->state != SRP_TARGET_REMOVED) {
+ shost_printk(KERN_DEBUG, target->scsi_host, PFX
+ "new target: id_ext %016llx ioc_guid %016llx pkey %04x service_id %016llx sgid %pI6 dgid %pI6\n",
+ be64_to_cpu(target->id_ext),
+ be64_to_cpu(target->ioc_guid),
+ be16_to_cpu(target->pkey),
+ be64_to_cpu(target->service_id),
+ target->sgid.raw, target->orig_dgid.raw);
+ }
+
+ ret = count;
+
+out:
+ mutex_unlock(&host->add_target_mutex);
+
+ scsi_host_put(target->scsi_host);
+
+ return ret;
+
+err_disconnect:
+ srp_disconnect_target(target);
+
+ for (i = 0; i < target->ch_count; i++) {
+ ch = &target->ch[i];
+ srp_free_ch_ib(target, ch);
+ srp_free_req_data(target, ch);
+ }
+
+ kfree(target->ch);
+ goto out;
+}
+
+static DEVICE_ATTR(add_target, S_IWUSR, NULL, srp_create_target);
+
+static ssize_t show_ibdev(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct srp_host *host = container_of(dev, struct srp_host, dev);
+
+ return sprintf(buf, "%s\n", host->srp_dev->dev->name);
+}
+
+static DEVICE_ATTR(ibdev, S_IRUGO, show_ibdev, NULL);
+
+static ssize_t show_port(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct srp_host *host = container_of(dev, struct srp_host, dev);
+
+ return sprintf(buf, "%d\n", host->port);
+}
+
+static DEVICE_ATTR(port, S_IRUGO, show_port, NULL);
+
+static struct srp_host *srp_add_port(struct srp_device *device, u8 port)
+{
+ struct srp_host *host;
+
+ host = kzalloc(sizeof *host, GFP_KERNEL);
+ if (!host)
+ return NULL;
+
+ INIT_LIST_HEAD(&host->target_list);
+ spin_lock_init(&host->target_lock);
+ init_completion(&host->released);
+ mutex_init(&host->add_target_mutex);
+ host->srp_dev = device;
+ host->port = port;
+
+ host->dev.class = &srp_class;
+ host->dev.parent = device->dev->dma_device;
+ dev_set_name(&host->dev, "srp-%s-%d", device->dev->name, port);
+
+ if (device_register(&host->dev))
+ goto free_host;
+ if (device_create_file(&host->dev, &dev_attr_add_target))
+ goto err_class;
+ if (device_create_file(&host->dev, &dev_attr_ibdev))
+ goto err_class;
+ if (device_create_file(&host->dev, &dev_attr_port))
+ goto err_class;
+
+ return host;
+
+err_class:
+ device_unregister(&host->dev);
+
+free_host:
+ kfree(host);
+
+ return NULL;
+}
+
+static void srp_add_one(struct ib_device *device)
+{
+ struct srp_device *srp_dev;
+ struct ib_device_attr *dev_attr;
+ struct srp_host *host;
+ int mr_page_shift, s, e, p;
+ u64 max_pages_per_mr;
+
+ dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
+ if (!dev_attr)
+ return;
+
+ if (ib_query_device(device, dev_attr)) {
+ pr_warn("Query device failed for %s\n", device->name);
+ goto free_attr;
+ }
+
+ srp_dev = kmalloc(sizeof *srp_dev, GFP_KERNEL);
+ if (!srp_dev)
+ goto free_attr;
+
+ srp_dev->has_fmr = (device->alloc_fmr && device->dealloc_fmr &&
+ device->map_phys_fmr && device->unmap_fmr);
+ srp_dev->has_fr = (dev_attr->device_cap_flags &
+ IB_DEVICE_MEM_MGT_EXTENSIONS);
+ if (!srp_dev->has_fmr && !srp_dev->has_fr)
+ dev_warn(&device->dev, "neither FMR nor FR is supported\n");
+
+ srp_dev->use_fast_reg = (srp_dev->has_fr &&
+ (!srp_dev->has_fmr || prefer_fr));
+
+ /*
+ * Use the smallest page size supported by the HCA, down to a
+ * minimum of 4096 bytes. We're unlikely to build large sglists
+ * out of smaller entries.
+ */
+ mr_page_shift = max(12, ffs(dev_attr->page_size_cap) - 1);
+ srp_dev->mr_page_size = 1 << mr_page_shift;
+ srp_dev->mr_page_mask = ~((u64) srp_dev->mr_page_size - 1);
+ max_pages_per_mr = dev_attr->max_mr_size;
+ do_div(max_pages_per_mr, srp_dev->mr_page_size);
+ srp_dev->max_pages_per_mr = min_t(u64, SRP_MAX_PAGES_PER_MR,
+ max_pages_per_mr);
+ if (srp_dev->use_fast_reg) {
+ srp_dev->max_pages_per_mr =
+ min_t(u32, srp_dev->max_pages_per_mr,
+ dev_attr->max_fast_reg_page_list_len);
+ }
+ srp_dev->mr_max_size = srp_dev->mr_page_size *
+ srp_dev->max_pages_per_mr;
+ pr_debug("%s: mr_page_shift = %d, dev_attr->max_mr_size = %#llx, dev_attr->max_fast_reg_page_list_len = %u, max_pages_per_mr = %d, mr_max_size = %#x\n",
+ device->name, mr_page_shift, dev_attr->max_mr_size,
+ dev_attr->max_fast_reg_page_list_len,
+ srp_dev->max_pages_per_mr, srp_dev->mr_max_size);
+
+ INIT_LIST_HEAD(&srp_dev->dev_list);
+
+ srp_dev->dev = device;
+ srp_dev->pd = ib_alloc_pd(device);
+ if (IS_ERR(srp_dev->pd))
+ goto free_dev;
+
+ srp_dev->mr = ib_get_dma_mr(srp_dev->pd,
+ IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE);
+ if (IS_ERR(srp_dev->mr))
+ goto err_pd;
+
+ if (device->node_type == RDMA_NODE_IB_SWITCH) {
+ s = 0;
+ e = 0;
+ } else {
+ s = 1;
+ e = device->phys_port_cnt;
+ }
+
+ for (p = s; p <= e; ++p) {
+ host = srp_add_port(srp_dev, p);
+ if (host)
+ list_add_tail(&host->list, &srp_dev->dev_list);
+ }
+
+ ib_set_client_data(device, &srp_client, srp_dev);
+
+ goto free_attr;
+
+err_pd:
+ ib_dealloc_pd(srp_dev->pd);
+
+free_dev:
+ kfree(srp_dev);
+
+free_attr:
+ kfree(dev_attr);
+}
+
+static void srp_remove_one(struct ib_device *device)
+{
+ struct srp_device *srp_dev;
+ struct srp_host *host, *tmp_host;
+ struct srp_target_port *target;
+
+ srp_dev = ib_get_client_data(device, &srp_client);
+ if (!srp_dev)
+ return;
+
+ list_for_each_entry_safe(host, tmp_host, &srp_dev->dev_list, list) {
+ device_unregister(&host->dev);
+ /*
+ * Wait for the sysfs entry to go away, so that no new
+ * target ports can be created.
+ */
+ wait_for_completion(&host->released);
+
+ /*
+ * Remove all target ports.
+ */
+ spin_lock(&host->target_lock);
+ list_for_each_entry(target, &host->target_list, list)
+ srp_queue_remove_work(target);
+ spin_unlock(&host->target_lock);
+
+ /*
+ * Wait for tl_err and target port removal tasks.
+ */
+ flush_workqueue(system_long_wq);
+ flush_workqueue(srp_remove_wq);
+
+ kfree(host);
+ }
+
+ ib_dereg_mr(srp_dev->mr);
+ ib_dealloc_pd(srp_dev->pd);
+
+ kfree(srp_dev);
+}
+
+static struct srp_function_template ib_srp_transport_functions = {
+ .has_rport_state = true,
+ .reset_timer_if_blocked = true,
+ .reconnect_delay = &srp_reconnect_delay,
+ .fast_io_fail_tmo = &srp_fast_io_fail_tmo,
+ .dev_loss_tmo = &srp_dev_loss_tmo,
+ .reconnect = srp_rport_reconnect,
+ .rport_delete = srp_rport_delete,
+ .terminate_rport_io = srp_terminate_io,
+};
+
+static int __init srp_init_module(void)
+{
+ int ret;
+
+ BUILD_BUG_ON(FIELD_SIZEOF(struct ib_wc, wr_id) < sizeof(void *));
+
+ if (srp_sg_tablesize) {
+ pr_warn("srp_sg_tablesize is deprecated, please use cmd_sg_entries\n");
+ if (!cmd_sg_entries)
+ cmd_sg_entries = srp_sg_tablesize;
+ }
+
+ if (!cmd_sg_entries)
+ cmd_sg_entries = SRP_DEF_SG_TABLESIZE;
+
+ if (cmd_sg_entries > 255) {
+ pr_warn("Clamping cmd_sg_entries to 255\n");
+ cmd_sg_entries = 255;
+ }
+
+ if (!indirect_sg_entries)
+ indirect_sg_entries = cmd_sg_entries;
+ else if (indirect_sg_entries < cmd_sg_entries) {
+ pr_warn("Bumping up indirect_sg_entries to match cmd_sg_entries (%u)\n",
+ cmd_sg_entries);
+ indirect_sg_entries = cmd_sg_entries;
+ }
+
+ srp_remove_wq = create_workqueue("srp_remove");
+ if (!srp_remove_wq) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = -ENOMEM;
+ ib_srp_transport_template =
+ srp_attach_transport(&ib_srp_transport_functions);
+ if (!ib_srp_transport_template)
+ goto destroy_wq;
+
+ ret = class_register(&srp_class);
+ if (ret) {
+ pr_err("couldn't register class infiniband_srp\n");
+ goto release_tr;
+ }
+
+ ib_sa_register_client(&srp_sa_client);
+
+ ret = ib_register_client(&srp_client);
+ if (ret) {
+ pr_err("couldn't register IB client\n");
+ goto unreg_sa;
+ }
+
+out:
+ return ret;
+
+unreg_sa:
+ ib_sa_unregister_client(&srp_sa_client);
+ class_unregister(&srp_class);
+
+release_tr:
+ srp_release_transport(ib_srp_transport_template);
+
+destroy_wq:
+ destroy_workqueue(srp_remove_wq);
+ goto out;
+}
+
+static void __exit srp_cleanup_module(void)
+{
+ ib_unregister_client(&srp_client);
+ ib_sa_unregister_client(&srp_sa_client);
+ class_unregister(&srp_class);
+ srp_release_transport(ib_srp_transport_template);
+ destroy_workqueue(srp_remove_wq);
+}
+
+module_init(srp_init_module);
+module_exit(srp_cleanup_module);
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
new file mode 100644
index 000000000..e690847a4
--- /dev/null
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2005 Cisco Systems. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef IB_SRP_H
+#define IB_SRP_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/scatterlist.h>
+
+#include <scsi/scsi_host.h>
+#include <scsi/scsi_cmnd.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_sa.h>
+#include <rdma/ib_cm.h>
+#include <rdma/ib_fmr_pool.h>
+
+enum {
+ SRP_PATH_REC_TIMEOUT_MS = 1000,
+ SRP_ABORT_TIMEOUT_MS = 5000,
+
+ SRP_PORT_REDIRECT = 1,
+ SRP_DLID_REDIRECT = 2,
+ SRP_STALE_CONN = 3,
+
+ SRP_MAX_LUN = 512,
+ SRP_DEF_SG_TABLESIZE = 12,
+
+ SRP_DEFAULT_QUEUE_SIZE = 1 << 6,
+ SRP_RSP_SQ_SIZE = 1,
+ SRP_TSK_MGMT_SQ_SIZE = 1,
+ SRP_DEFAULT_CMD_SQ_SIZE = SRP_DEFAULT_QUEUE_SIZE - SRP_RSP_SQ_SIZE -
+ SRP_TSK_MGMT_SQ_SIZE,
+
+ SRP_TAG_NO_REQ = ~0U,
+ SRP_TAG_TSK_MGMT = 1U << 31,
+
+ SRP_MAX_PAGES_PER_MR = 512,
+
+ LOCAL_INV_WR_ID_MASK = 1,
+ FAST_REG_WR_ID_MASK = 2,
+
+ SRP_LAST_WR_ID = 0xfffffffcU,
+};
+
+enum srp_target_state {
+ SRP_TARGET_SCANNING,
+ SRP_TARGET_LIVE,
+ SRP_TARGET_REMOVED,
+};
+
+enum srp_iu_type {
+ SRP_IU_CMD,
+ SRP_IU_TSK_MGMT,
+ SRP_IU_RSP,
+};
+
+/*
+ * @mr_page_mask: HCA memory registration page mask.
+ * @mr_page_size: HCA memory registration page size.
+ * @mr_max_size: Maximum size in bytes of a single FMR / FR registration
+ * request.
+ */
+struct srp_device {
+ struct list_head dev_list;
+ struct ib_device *dev;
+ struct ib_pd *pd;
+ struct ib_mr *mr;
+ u64 mr_page_mask;
+ int mr_page_size;
+ int mr_max_size;
+ int max_pages_per_mr;
+ bool has_fmr;
+ bool has_fr;
+ bool use_fast_reg;
+};
+
+struct srp_host {
+ struct srp_device *srp_dev;
+ u8 port;
+ struct device dev;
+ struct list_head target_list;
+ spinlock_t target_lock;
+ struct completion released;
+ struct list_head list;
+ struct mutex add_target_mutex;
+};
+
+struct srp_request {
+ struct scsi_cmnd *scmnd;
+ struct srp_iu *cmd;
+ union {
+ struct ib_pool_fmr **fmr_list;
+ struct srp_fr_desc **fr_list;
+ };
+ u64 *map_page;
+ struct srp_direct_buf *indirect_desc;
+ dma_addr_t indirect_dma_addr;
+ short nmdesc;
+};
+
+/**
+ * struct srp_rdma_ch
+ * @comp_vector: Completion vector used by this RDMA channel.
+ */
+struct srp_rdma_ch {
+ /* These are RW in the hot path, and commonly used together */
+ struct list_head free_tx;
+ spinlock_t lock;
+ s32 req_lim;
+
+ /* These are read-only in the hot path */
+ struct srp_target_port *target ____cacheline_aligned_in_smp;
+ struct ib_cq *send_cq;
+ struct ib_cq *recv_cq;
+ struct ib_qp *qp;
+ union {
+ struct ib_fmr_pool *fmr_pool;
+ struct srp_fr_pool *fr_pool;
+ };
+
+ /* Everything above this point is used in the hot path of
+ * command processing. Try to keep them packed into cachelines.
+ */
+
+ struct completion done;
+ int status;
+
+ struct ib_sa_path_rec path;
+ struct ib_sa_query *path_query;
+ int path_query_id;
+
+ struct ib_cm_id *cm_id;
+ struct srp_iu **tx_ring;
+ struct srp_iu **rx_ring;
+ struct srp_request *req_ring;
+ int max_ti_iu_len;
+ int comp_vector;
+
+ struct completion tsk_mgmt_done;
+ u8 tsk_mgmt_status;
+ bool connected;
+};
+
+/**
+ * struct srp_target_port
+ * @comp_vector: Completion vector used by the first RDMA channel created for
+ * this target port.
+ */
+struct srp_target_port {
+ /* read and written in the hot path */
+ spinlock_t lock;
+
+ /* read only in the hot path */
+ struct srp_rdma_ch *ch;
+ u32 ch_count;
+ u32 lkey;
+ u32 rkey;
+ enum srp_target_state state;
+ unsigned int max_iu_len;
+ unsigned int cmd_sg_cnt;
+ unsigned int indirect_size;
+ bool allow_ext_sg;
+
+ /* other member variables */
+ union ib_gid sgid;
+ __be64 id_ext;
+ __be64 ioc_guid;
+ __be64 service_id;
+ __be64 initiator_ext;
+ u16 io_class;
+ struct srp_host *srp_host;
+ struct Scsi_Host *scsi_host;
+ struct srp_rport *rport;
+ char target_name[32];
+ unsigned int scsi_id;
+ unsigned int sg_tablesize;
+ int queue_size;
+ int req_ring_size;
+ int comp_vector;
+ int tl_retry_count;
+
+ union ib_gid orig_dgid;
+ __be16 pkey;
+
+ u32 rq_tmo_jiffies;
+
+ int zero_req_lim;
+
+ struct work_struct tl_err_work;
+ struct work_struct remove_work;
+
+ struct list_head list;
+ bool qp_in_error;
+};
+
+struct srp_iu {
+ struct list_head list;
+ u64 dma;
+ void *buf;
+ size_t size;
+ enum dma_data_direction direction;
+};
+
+/**
+ * struct srp_fr_desc - fast registration work request arguments
+ * @entry: Entry in srp_fr_pool.free_list.
+ * @mr: Memory region.
+ * @frpl: Fast registration page list.
+ */
+struct srp_fr_desc {
+ struct list_head entry;
+ struct ib_mr *mr;
+ struct ib_fast_reg_page_list *frpl;
+};
+
+/**
+ * struct srp_fr_pool - pool of fast registration descriptors
+ *
+ * An entry is available for allocation if and only if it occurs in @free_list.
+ *
+ * @size: Number of descriptors in this pool.
+ * @max_page_list_len: Maximum fast registration work request page list length.
+ * @lock: Protects free_list.
+ * @free_list: List of free descriptors.
+ * @desc: Fast registration descriptor pool.
+ */
+struct srp_fr_pool {
+ int size;
+ int max_page_list_len;
+ spinlock_t lock;
+ struct list_head free_list;
+ struct srp_fr_desc desc[0];
+};
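+
+/*
+ * Illustrative note: because @desc is a zero-length array, a pool holding
+ * pool_size descriptors is typically allocated in a single block by the pool
+ * constructor (in ib_srp.c, not shown here), e.g.
+ *
+ *   kzalloc(sizeof(struct srp_fr_pool) +
+ *           pool_size * sizeof(struct srp_fr_desc), GFP_KERNEL);
+ *
+ * so that desc[] provides the descriptors in-line after the pool header.
+ */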
+
+/**
+ * struct srp_map_state - per-request DMA memory mapping state
+ * @desc: Pointer to the element of the SRP buffer descriptor array
+ * that is being filled in.
+ * @pages: Array with DMA addresses of pages being considered for
+ * memory registration.
+ * @base_dma_addr: DMA address of the first page that has not yet been mapped.
+ * @dma_len: Number of bytes that will be registered with the next
+ * FMR or FR memory registration call.
+ * @total_len: Total number of bytes in the sg-list being mapped.
+ * @npages: Number of page addresses in the pages[] array.
+ * @nmdesc: Number of FMR or FR memory descriptors used for mapping.
+ * @ndesc: Number of SRP buffer descriptors that have been filled in.
+ * @unmapped_sg: First element of the sg-list that is mapped via FMR or FR.
+ * @unmapped_index: Index of the first element mapped via FMR or FR.
+ * @unmapped_addr: DMA address of the first element mapped via FMR or FR.
+ */
+struct srp_map_state {
+ union {
+ struct ib_pool_fmr **next_fmr;
+ struct srp_fr_desc **next_fr;
+ };
+ struct srp_direct_buf *desc;
+ u64 *pages;
+ dma_addr_t base_dma_addr;
+ u32 dma_len;
+ u32 total_len;
+ unsigned int npages;
+ unsigned int nmdesc;
+ unsigned int ndesc;
+ struct scatterlist *unmapped_sg;
+ int unmapped_index;
+ dma_addr_t unmapped_addr;
+};
+
+#endif /* IB_SRP_H */
diff --git a/drivers/infiniband/ulp/srpt/Kconfig b/drivers/infiniband/ulp/srpt/Kconfig
new file mode 100644
index 000000000..31ee83d52
--- /dev/null
+++ b/drivers/infiniband/ulp/srpt/Kconfig
@@ -0,0 +1,12 @@
+config INFINIBAND_SRPT
+ tristate "InfiniBand SCSI RDMA Protocol target support"
+ depends on INFINIBAND && TARGET_CORE
+ ---help---
+
+ Support for the SCSI RDMA Protocol (SRP) Target driver. The
+	  SRP protocol allows an initiator to access a block storage
+	  device on another host (target) over a network that supports
+	  the RDMA protocol. Currently the RDMA protocol is
+ supported by InfiniBand and by iWarp network hardware. More
+ information about the SRP protocol can be found on the website
+ of the INCITS T10 technical committee (http://www.t10.org/).
diff --git a/drivers/infiniband/ulp/srpt/Makefile b/drivers/infiniband/ulp/srpt/Makefile
new file mode 100644
index 000000000..e3ee4bdff
--- /dev/null
+++ b/drivers/infiniband/ulp/srpt/Makefile
@@ -0,0 +1,2 @@
+ccflags-y := -Idrivers/target
+obj-$(CONFIG_INFINIBAND_SRPT) += ib_srpt.o
diff --git a/drivers/infiniband/ulp/srpt/ib_dm_mad.h b/drivers/infiniband/ulp/srpt/ib_dm_mad.h
new file mode 100644
index 000000000..fb1de1f6f
--- /dev/null
+++ b/drivers/infiniband/ulp/srpt/ib_dm_mad.h
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2006 - 2009 Mellanox Technology Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef IB_DM_MAD_H
+#define IB_DM_MAD_H
+
+#include <linux/types.h>
+
+#include <rdma/ib_mad.h>
+
+enum {
+ /*
+ * See also section 13.4.7 Status Field, table 115 MAD Common Status
+ * Field Bit Values and also section 16.3.1.1 Status Field in the
+ * InfiniBand Architecture Specification.
+ */
+ DM_MAD_STATUS_UNSUP_METHOD = 0x0008,
+ DM_MAD_STATUS_UNSUP_METHOD_ATTR = 0x000c,
+ DM_MAD_STATUS_INVALID_FIELD = 0x001c,
+ DM_MAD_STATUS_NO_IOC = 0x0100,
+
+ /*
+ * See also the Device Management chapter, section 16.3.3 Attributes,
+ * table 279 Device Management Attributes in the InfiniBand
+ * Architecture Specification.
+ */
+ DM_ATTR_CLASS_PORT_INFO = 0x01,
+ DM_ATTR_IOU_INFO = 0x10,
+ DM_ATTR_IOC_PROFILE = 0x11,
+ DM_ATTR_SVC_ENTRIES = 0x12
+};
+
+struct ib_dm_hdr {
+ u8 reserved[28];
+};
+
+/*
+ * Structure of management datagram sent by the SRP target implementation.
+ * Contains a management datagram header, reliable multi-packet transaction
+ * protocol (RMPP) header and ib_dm_hdr. Notes:
+ * - The SRP target implementation does not use RMPP or ib_dm_hdr when sending
+ * management datagrams.
+ * - The header size must be exactly 64 bytes (IB_MGMT_DEVICE_HDR), since this
+ * is the header size that is passed to ib_create_send_mad() in ib_srpt.c.
+ * - The maximum supported size for a management datagram when not using RMPP
+ * is 256 bytes -- 64 bytes header and 192 (IB_MGMT_DEVICE_DATA) bytes data.
+ */
+struct ib_dm_mad {
+ struct ib_mad_hdr mad_hdr;
+ struct ib_rmpp_hdr rmpp_hdr;
+ struct ib_dm_hdr dm_hdr;
+ u8 data[IB_MGMT_DEVICE_DATA];
+};
+
+/*
+ * IOUnitInfo as defined in section 16.3.3.3 IOUnitInfo of the InfiniBand
+ * Architecture Specification.
+ */
+struct ib_dm_iou_info {
+ __be16 change_id;
+ u8 max_controllers;
+ u8 op_rom;
+ u8 controller_list[128];
+};
+
+/*
+ * IOControllerProfile as defined in section 16.3.3.4 IOControllerProfile of
+ * the InfiniBand Architecture Specification.
+ */
+struct ib_dm_ioc_profile {
+ __be64 guid;
+ __be32 vendor_id;
+ __be32 device_id;
+ __be16 device_version;
+ __be16 reserved1;
+ __be32 subsys_vendor_id;
+ __be32 subsys_device_id;
+ __be16 io_class;
+ __be16 io_subclass;
+ __be16 protocol;
+ __be16 protocol_version;
+ __be16 service_conn;
+ __be16 initiators_supported;
+ __be16 send_queue_depth;
+ u8 reserved2;
+ u8 rdma_read_depth;
+ __be32 send_size;
+ __be32 rdma_size;
+ u8 op_cap_mask;
+ u8 svc_cap_mask;
+ u8 num_svc_entries;
+ u8 reserved3[9];
+ u8 id_string[64];
+};
+
+struct ib_dm_svc_entry {
+ u8 name[40];
+ __be64 id;
+};
+
+/*
+ * See also section 16.3.3.5 ServiceEntries in the InfiniBand Architecture
+ * Specification. See also section B.7, table B.8 in the T10 SRP r16a document.
+ */
+struct ib_dm_svc_entries {
+ struct ib_dm_svc_entry service_entries[4];
+};
+
+#endif
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c
new file mode 100644
index 000000000..9b84b4c0a
--- /dev/null
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.c
@@ -0,0 +1,4022 @@
+/*
+ * Copyright (c) 2006 - 2009 Mellanox Technology Inc. All rights reserved.
+ * Copyright (C) 2008 - 2011 Bart Van Assche <bvanassche@acm.org>.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/ctype.h>
+#include <linux/kthread.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+#include <linux/atomic.h>
+#include <scsi/scsi_tcq.h>
+#include <target/configfs_macros.h>
+#include <target/target_core_base.h>
+#include <target/target_core_fabric_configfs.h>
+#include <target/target_core_fabric.h>
+#include <target/target_core_configfs.h>
+#include "ib_srpt.h"
+
+/* Name of this kernel module. */
+#define DRV_NAME "ib_srpt"
+#define DRV_VERSION "2.0.0"
+#define DRV_RELDATE "2011-02-14"
+
+#define SRPT_ID_STRING "Linux SRP target"
+
+#undef pr_fmt
+#define pr_fmt(fmt) DRV_NAME " " fmt
+
+MODULE_AUTHOR("Vu Pham and Bart Van Assche");
+MODULE_DESCRIPTION("InfiniBand SCSI RDMA Protocol target "
+ "v" DRV_VERSION " (" DRV_RELDATE ")");
+MODULE_LICENSE("Dual BSD/GPL");
+
+/*
+ * Global Variables
+ */
+
+static u64 srpt_service_guid;
+static DEFINE_SPINLOCK(srpt_dev_lock); /* Protects srpt_dev_list. */
+static LIST_HEAD(srpt_dev_list); /* List of srpt_device structures. */
+
+static unsigned srp_max_req_size = DEFAULT_MAX_REQ_SIZE;
+module_param(srp_max_req_size, int, 0444);
+MODULE_PARM_DESC(srp_max_req_size,
+ "Maximum size of SRP request messages in bytes.");
+
+static int srpt_srq_size = DEFAULT_SRPT_SRQ_SIZE;
+module_param(srpt_srq_size, int, 0444);
+MODULE_PARM_DESC(srpt_srq_size,
+ "Shared receive queue (SRQ) size.");
+
+static int srpt_get_u64_x(char *buffer, struct kernel_param *kp)
+{
+ return sprintf(buffer, "0x%016llx", *(u64 *)kp->arg);
+}
+module_param_call(srpt_service_guid, NULL, srpt_get_u64_x, &srpt_service_guid,
+ 0444);
+MODULE_PARM_DESC(srpt_service_guid,
+ "Using this value for ioc_guid, id_ext, and cm_listen_id"
+ " instead of using the node_guid of the first HCA.");
+
+static struct ib_client srpt_client;
+static const struct target_core_fabric_ops srpt_template;
+static void srpt_release_channel(struct srpt_rdma_ch *ch);
+static int srpt_queue_status(struct se_cmd *cmd);
+
+/**
+ * opposite_dma_dir() - Swap DMA_TO_DEVICE and DMA_FROM_DEVICE.
+ */
+static inline
+enum dma_data_direction opposite_dma_dir(enum dma_data_direction dir)
+{
+ switch (dir) {
+ case DMA_TO_DEVICE: return DMA_FROM_DEVICE;
+ case DMA_FROM_DEVICE: return DMA_TO_DEVICE;
+ default: return dir;
+ }
+}
+
+/**
+ * srpt_sdev_name() - Return the name associated with the HCA.
+ *
+ * Examples are ib0, ib1, ...
+ */
+static inline const char *srpt_sdev_name(struct srpt_device *sdev)
+{
+ return sdev->device->name;
+}
+
+static enum rdma_ch_state srpt_get_ch_state(struct srpt_rdma_ch *ch)
+{
+ unsigned long flags;
+ enum rdma_ch_state state;
+
+ spin_lock_irqsave(&ch->spinlock, flags);
+ state = ch->state;
+ spin_unlock_irqrestore(&ch->spinlock, flags);
+ return state;
+}
+
+static enum rdma_ch_state
+srpt_set_ch_state(struct srpt_rdma_ch *ch, enum rdma_ch_state new_state)
+{
+ unsigned long flags;
+ enum rdma_ch_state prev;
+
+ spin_lock_irqsave(&ch->spinlock, flags);
+ prev = ch->state;
+ ch->state = new_state;
+ spin_unlock_irqrestore(&ch->spinlock, flags);
+ return prev;
+}
+
+/**
+ * srpt_test_and_set_ch_state() - Test and set the channel state.
+ *
+ * Returns true if and only if the channel state has been set to the new state.
+ */
+static bool
+srpt_test_and_set_ch_state(struct srpt_rdma_ch *ch, enum rdma_ch_state old,
+ enum rdma_ch_state new)
+{
+ unsigned long flags;
+ enum rdma_ch_state prev;
+
+ spin_lock_irqsave(&ch->spinlock, flags);
+ prev = ch->state;
+ if (prev == old)
+ ch->state = new;
+ spin_unlock_irqrestore(&ch->spinlock, flags);
+ return prev == old;
+}
+
+/**
+ * srpt_event_handler() - Asynchronous IB event callback function.
+ *
+ * Callback function called by the InfiniBand core when an asynchronous IB
+ * event occurs. This callback may occur in interrupt context. See also
+ * section 11.5.2, Set Asynchronous Event Handler in the InfiniBand
+ * Architecture Specification.
+ */
+static void srpt_event_handler(struct ib_event_handler *handler,
+ struct ib_event *event)
+{
+ struct srpt_device *sdev;
+ struct srpt_port *sport;
+
+ sdev = ib_get_client_data(event->device, &srpt_client);
+ if (!sdev || sdev->device != event->device)
+ return;
+
+ pr_debug("ASYNC event= %d on device= %s\n", event->event,
+ srpt_sdev_name(sdev));
+
+ switch (event->event) {
+ case IB_EVENT_PORT_ERR:
+ if (event->element.port_num <= sdev->device->phys_port_cnt) {
+ sport = &sdev->port[event->element.port_num - 1];
+ sport->lid = 0;
+ sport->sm_lid = 0;
+ }
+ break;
+ case IB_EVENT_PORT_ACTIVE:
+ case IB_EVENT_LID_CHANGE:
+ case IB_EVENT_PKEY_CHANGE:
+ case IB_EVENT_SM_CHANGE:
+ case IB_EVENT_CLIENT_REREGISTER:
+ case IB_EVENT_GID_CHANGE:
+ /* Refresh port data asynchronously. */
+ if (event->element.port_num <= sdev->device->phys_port_cnt) {
+ sport = &sdev->port[event->element.port_num - 1];
+ if (!sport->lid && !sport->sm_lid)
+ schedule_work(&sport->work);
+ }
+ break;
+ default:
+ pr_err("received unrecognized IB event %d\n",
+ event->event);
+ break;
+ }
+}
+
+/**
+ * srpt_srq_event() - SRQ event callback function.
+ */
+static void srpt_srq_event(struct ib_event *event, void *ctx)
+{
+ pr_info("SRQ event %d\n", event->event);
+}
+
+/**
+ * srpt_qp_event() - QP event callback function.
+ */
+static void srpt_qp_event(struct ib_event *event, struct srpt_rdma_ch *ch)
+{
+ pr_debug("QP event %d on cm_id=%p sess_name=%s state=%d\n",
+ event->event, ch->cm_id, ch->sess_name, srpt_get_ch_state(ch));
+
+ switch (event->event) {
+ case IB_EVENT_COMM_EST:
+ ib_cm_notify(ch->cm_id, event->event);
+ break;
+ case IB_EVENT_QP_LAST_WQE_REACHED:
+ if (srpt_test_and_set_ch_state(ch, CH_DRAINING,
+ CH_RELEASING))
+ srpt_release_channel(ch);
+ else
+ pr_debug("%s: state %d - ignored LAST_WQE.\n",
+ ch->sess_name, srpt_get_ch_state(ch));
+ break;
+ default:
+ pr_err("received unrecognized IB QP event %d\n", event->event);
+ break;
+ }
+}
+
+/**
+ * srpt_set_ioc() - Helper function for initializing an IOUnitInfo structure.
+ *
+ * @slot: one-based slot number.
+ * @value: four-bit value.
+ *
+ * Copies the lowest four bits of @value into element @slot of the array of
+ * four-bit elements called c_list (controller list). The index @slot is
+ * one-based.
+ */
+static void srpt_set_ioc(u8 *c_list, u32 slot, u8 value)
+{
+ u16 id;
+ u8 tmp;
+
+ id = (slot - 1) / 2;
+ if (slot & 0x1) {
+ tmp = c_list[id] & 0xf;
+ c_list[id] = (value << 4) | tmp;
+ } else {
+ tmp = c_list[id] & 0xf0;
+ c_list[id] = (value & 0xf) | tmp;
+ }
+}
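+
+/*
+ * For illustration: slot 1 maps to the high nibble of c_list[0] and slot 2 to
+ * its low nibble, slot 3 to the high nibble of c_list[1], and so on; the 16
+ * one-based slots used by srpt_get_iou() therefore occupy c_list[0..7], two
+ * controllers per byte.
+ */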
+
+/**
+ * srpt_get_class_port_info() - Copy ClassPortInfo to a management datagram.
+ *
+ * See also section 16.3.3.1 ClassPortInfo in the InfiniBand Architecture
+ * Specification.
+ */
+static void srpt_get_class_port_info(struct ib_dm_mad *mad)
+{
+ struct ib_class_port_info *cif;
+
+ cif = (struct ib_class_port_info *)mad->data;
+ memset(cif, 0, sizeof *cif);
+ cif->base_version = 1;
+ cif->class_version = 1;
+ cif->resp_time_value = 20;
+
+ mad->mad_hdr.status = 0;
+}
+
+/**
+ * srpt_get_iou() - Write IOUnitInfo to a management datagram.
+ *
+ * See also section 16.3.3.3 IOUnitInfo in the InfiniBand Architecture
+ * Specification. See also section B.7, table B.6 in the SRP r16a document.
+ */
+static void srpt_get_iou(struct ib_dm_mad *mad)
+{
+ struct ib_dm_iou_info *ioui;
+ u8 slot;
+ int i;
+
+ ioui = (struct ib_dm_iou_info *)mad->data;
+ ioui->change_id = __constant_cpu_to_be16(1);
+ ioui->max_controllers = 16;
+
+ /* set present for slot 1 and empty for the rest */
+ srpt_set_ioc(ioui->controller_list, 1, 1);
+ for (i = 1, slot = 2; i < 16; i++, slot++)
+ srpt_set_ioc(ioui->controller_list, slot, 0);
+
+ mad->mad_hdr.status = 0;
+}
+
+/**
+ * srpt_get_ioc() - Write IOControllerProfile to a management datagram.
+ *
+ * See also section 16.3.3.4 IOControllerProfile in the InfiniBand
+ * Architecture Specification. See also section B.7, table B.7 in the SRP
+ * r16a document.
+ */
+static void srpt_get_ioc(struct srpt_port *sport, u32 slot,
+ struct ib_dm_mad *mad)
+{
+ struct srpt_device *sdev = sport->sdev;
+ struct ib_dm_ioc_profile *iocp;
+
+ iocp = (struct ib_dm_ioc_profile *)mad->data;
+
+ if (!slot || slot > 16) {
+ mad->mad_hdr.status
+ = __constant_cpu_to_be16(DM_MAD_STATUS_INVALID_FIELD);
+ return;
+ }
+
+ if (slot > 2) {
+ mad->mad_hdr.status
+ = __constant_cpu_to_be16(DM_MAD_STATUS_NO_IOC);
+ return;
+ }
+
+ memset(iocp, 0, sizeof *iocp);
+ strcpy(iocp->id_string, SRPT_ID_STRING);
+ iocp->guid = cpu_to_be64(srpt_service_guid);
+ iocp->vendor_id = cpu_to_be32(sdev->dev_attr.vendor_id);
+ iocp->device_id = cpu_to_be32(sdev->dev_attr.vendor_part_id);
+ iocp->device_version = cpu_to_be16(sdev->dev_attr.hw_ver);
+ iocp->subsys_vendor_id = cpu_to_be32(sdev->dev_attr.vendor_id);
+ iocp->subsys_device_id = 0x0;
+ iocp->io_class = __constant_cpu_to_be16(SRP_REV16A_IB_IO_CLASS);
+ iocp->io_subclass = __constant_cpu_to_be16(SRP_IO_SUBCLASS);
+ iocp->protocol = __constant_cpu_to_be16(SRP_PROTOCOL);
+ iocp->protocol_version = __constant_cpu_to_be16(SRP_PROTOCOL_VERSION);
+ iocp->send_queue_depth = cpu_to_be16(sdev->srq_size);
+ iocp->rdma_read_depth = 4;
+ iocp->send_size = cpu_to_be32(srp_max_req_size);
+ iocp->rdma_size = cpu_to_be32(min(sport->port_attrib.srp_max_rdma_size,
+ 1U << 24));
+ iocp->num_svc_entries = 1;
+ iocp->op_cap_mask = SRP_SEND_TO_IOC | SRP_SEND_FROM_IOC |
+ SRP_RDMA_READ_FROM_IOC | SRP_RDMA_WRITE_FROM_IOC;
+
+ mad->mad_hdr.status = 0;
+}
+
+/**
+ * srpt_get_svc_entries() - Write ServiceEntries to a management datagram.
+ *
+ * See also section 16.3.3.5 ServiceEntries in the InfiniBand Architecture
+ * Specification. See also section B.7, table B.8 in the SRP r16a document.
+ */
+static void srpt_get_svc_entries(u64 ioc_guid,
+ u16 slot, u8 hi, u8 lo, struct ib_dm_mad *mad)
+{
+ struct ib_dm_svc_entries *svc_entries;
+
+ WARN_ON(!ioc_guid);
+
+ if (!slot || slot > 16) {
+ mad->mad_hdr.status
+ = __constant_cpu_to_be16(DM_MAD_STATUS_INVALID_FIELD);
+ return;
+ }
+
+ if (slot > 2 || lo > hi || hi > 1) {
+ mad->mad_hdr.status
+ = __constant_cpu_to_be16(DM_MAD_STATUS_NO_IOC);
+ return;
+ }
+
+ svc_entries = (struct ib_dm_svc_entries *)mad->data;
+ memset(svc_entries, 0, sizeof *svc_entries);
+ svc_entries->service_entries[0].id = cpu_to_be64(ioc_guid);
+ snprintf(svc_entries->service_entries[0].name,
+ sizeof(svc_entries->service_entries[0].name),
+ "%s%016llx",
+ SRP_SERVICE_NAME_PREFIX,
+ ioc_guid);
+
+ mad->mad_hdr.status = 0;
+}
+
+/**
+ * srpt_mgmt_method_get() - Process a received management datagram.
+ * @sp: source port through which the MAD has been received.
+ * @rq_mad: received MAD.
+ * @rsp_mad: response MAD.
+ */
+static void srpt_mgmt_method_get(struct srpt_port *sp, struct ib_mad *rq_mad,
+ struct ib_dm_mad *rsp_mad)
+{
+ u16 attr_id;
+ u32 slot;
+ u8 hi, lo;
+
+ attr_id = be16_to_cpu(rq_mad->mad_hdr.attr_id);
+ switch (attr_id) {
+ case DM_ATTR_CLASS_PORT_INFO:
+ srpt_get_class_port_info(rsp_mad);
+ break;
+ case DM_ATTR_IOU_INFO:
+ srpt_get_iou(rsp_mad);
+ break;
+ case DM_ATTR_IOC_PROFILE:
+ slot = be32_to_cpu(rq_mad->mad_hdr.attr_mod);
+ srpt_get_ioc(sp, slot, rsp_mad);
+ break;
+ case DM_ATTR_SVC_ENTRIES:
+ slot = be32_to_cpu(rq_mad->mad_hdr.attr_mod);
+ hi = (u8) ((slot >> 8) & 0xff);
+ lo = (u8) (slot & 0xff);
+ slot = (u16) ((slot >> 16) & 0xffff);
+ srpt_get_svc_entries(srpt_service_guid,
+ slot, hi, lo, rsp_mad);
+ break;
+ default:
+ rsp_mad->mad_hdr.status =
+ __constant_cpu_to_be16(DM_MAD_STATUS_UNSUP_METHOD_ATTR);
+ break;
+ }
+}
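+
+/*
+ * For illustration: for DM_ATTR_SVC_ENTRIES the 32-bit attribute modifier is
+ * split as above into slot (bits 16-31), hi (bits 8-15) and lo (bits 0-7);
+ * e.g. attr_mod 0x00010100 requests slot 1 with hi = 1 and lo = 0.
+ */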
+
+/**
+ * srpt_mad_send_handler() - Post MAD-send callback function.
+ */
+static void srpt_mad_send_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_send_wc *mad_wc)
+{
+ ib_destroy_ah(mad_wc->send_buf->ah);
+ ib_free_send_mad(mad_wc->send_buf);
+}
+
+/**
+ * srpt_mad_recv_handler() - MAD reception callback function.
+ */
+static void srpt_mad_recv_handler(struct ib_mad_agent *mad_agent,
+ struct ib_mad_recv_wc *mad_wc)
+{
+ struct srpt_port *sport = (struct srpt_port *)mad_agent->context;
+ struct ib_ah *ah;
+ struct ib_mad_send_buf *rsp;
+ struct ib_dm_mad *dm_mad;
+
+ if (!mad_wc || !mad_wc->recv_buf.mad)
+ return;
+
+ ah = ib_create_ah_from_wc(mad_agent->qp->pd, mad_wc->wc,
+ mad_wc->recv_buf.grh, mad_agent->port_num);
+ if (IS_ERR(ah))
+ goto err;
+
+ BUILD_BUG_ON(offsetof(struct ib_dm_mad, data) != IB_MGMT_DEVICE_HDR);
+
+ rsp = ib_create_send_mad(mad_agent, mad_wc->wc->src_qp,
+ mad_wc->wc->pkey_index, 0,
+ IB_MGMT_DEVICE_HDR, IB_MGMT_DEVICE_DATA,
+ GFP_KERNEL);
+ if (IS_ERR(rsp))
+ goto err_rsp;
+
+ rsp->ah = ah;
+
+ dm_mad = rsp->mad;
+ memcpy(dm_mad, mad_wc->recv_buf.mad, sizeof *dm_mad);
+ dm_mad->mad_hdr.method = IB_MGMT_METHOD_GET_RESP;
+ dm_mad->mad_hdr.status = 0;
+
+ switch (mad_wc->recv_buf.mad->mad_hdr.method) {
+ case IB_MGMT_METHOD_GET:
+ srpt_mgmt_method_get(sport, mad_wc->recv_buf.mad, dm_mad);
+ break;
+ case IB_MGMT_METHOD_SET:
+ dm_mad->mad_hdr.status =
+ __constant_cpu_to_be16(DM_MAD_STATUS_UNSUP_METHOD_ATTR);
+ break;
+ default:
+ dm_mad->mad_hdr.status =
+ __constant_cpu_to_be16(DM_MAD_STATUS_UNSUP_METHOD);
+ break;
+ }
+
+ if (!ib_post_send_mad(rsp, NULL)) {
+ ib_free_recv_mad(mad_wc);
+ /* will destroy_ah & free_send_mad in send completion */
+ return;
+ }
+
+ ib_free_send_mad(rsp);
+
+err_rsp:
+ ib_destroy_ah(ah);
+err:
+ ib_free_recv_mad(mad_wc);
+}
+
+/**
+ * srpt_refresh_port() - Configure a HCA port.
+ *
+ * Enable InfiniBand management datagram processing, update the cached sm_lid,
+ * lid and gid values, and register a callback function for processing MADs
+ * on the specified port.
+ *
+ * Note: It is safe to call this function more than once for the same port.
+ */
+static int srpt_refresh_port(struct srpt_port *sport)
+{
+ struct ib_mad_reg_req reg_req;
+ struct ib_port_modify port_modify;
+ struct ib_port_attr port_attr;
+ int ret;
+
+ memset(&port_modify, 0, sizeof port_modify);
+ port_modify.set_port_cap_mask = IB_PORT_DEVICE_MGMT_SUP;
+ port_modify.clr_port_cap_mask = 0;
+
+ ret = ib_modify_port(sport->sdev->device, sport->port, 0, &port_modify);
+ if (ret)
+ goto err_mod_port;
+
+ ret = ib_query_port(sport->sdev->device, sport->port, &port_attr);
+ if (ret)
+ goto err_query_port;
+
+ sport->sm_lid = port_attr.sm_lid;
+ sport->lid = port_attr.lid;
+
+ ret = ib_query_gid(sport->sdev->device, sport->port, 0, &sport->gid);
+ if (ret)
+ goto err_query_port;
+
+ if (!sport->mad_agent) {
+ memset(&reg_req, 0, sizeof reg_req);
+ reg_req.mgmt_class = IB_MGMT_CLASS_DEVICE_MGMT;
+ reg_req.mgmt_class_version = IB_MGMT_BASE_VERSION;
+ set_bit(IB_MGMT_METHOD_GET, reg_req.method_mask);
+ set_bit(IB_MGMT_METHOD_SET, reg_req.method_mask);
+
+ sport->mad_agent = ib_register_mad_agent(sport->sdev->device,
+ sport->port,
+ IB_QPT_GSI,
+ &reg_req, 0,
+ srpt_mad_send_handler,
+ srpt_mad_recv_handler,
+ sport, 0);
+ if (IS_ERR(sport->mad_agent)) {
+ ret = PTR_ERR(sport->mad_agent);
+ sport->mad_agent = NULL;
+ goto err_query_port;
+ }
+ }
+
+ return 0;
+
+err_query_port:
+
+ port_modify.set_port_cap_mask = 0;
+ port_modify.clr_port_cap_mask = IB_PORT_DEVICE_MGMT_SUP;
+ ib_modify_port(sport->sdev->device, sport->port, 0, &port_modify);
+
+err_mod_port:
+
+ return ret;
+}
+
+/**
+ * srpt_unregister_mad_agent() - Unregister MAD callback functions.
+ *
+ * Note: It is safe to call this function more than once for the same device.
+ */
+static void srpt_unregister_mad_agent(struct srpt_device *sdev)
+{
+ struct ib_port_modify port_modify = {
+ .clr_port_cap_mask = IB_PORT_DEVICE_MGMT_SUP,
+ };
+ struct srpt_port *sport;
+ int i;
+
+ for (i = 1; i <= sdev->device->phys_port_cnt; i++) {
+ sport = &sdev->port[i - 1];
+ WARN_ON(sport->port != i);
+ if (ib_modify_port(sdev->device, i, 0, &port_modify) < 0)
+ pr_err("disabling MAD processing failed.\n");
+ if (sport->mad_agent) {
+ ib_unregister_mad_agent(sport->mad_agent);
+ sport->mad_agent = NULL;
+ }
+ }
+}
+
+/**
+ * srpt_alloc_ioctx() - Allocate an SRPT I/O context structure.
+ */
+static struct srpt_ioctx *srpt_alloc_ioctx(struct srpt_device *sdev,
+ int ioctx_size, int dma_size,
+ enum dma_data_direction dir)
+{
+ struct srpt_ioctx *ioctx;
+
+ ioctx = kmalloc(ioctx_size, GFP_KERNEL);
+ if (!ioctx)
+ goto err;
+
+ ioctx->buf = kmalloc(dma_size, GFP_KERNEL);
+ if (!ioctx->buf)
+ goto err_free_ioctx;
+
+ ioctx->dma = ib_dma_map_single(sdev->device, ioctx->buf, dma_size, dir);
+ if (ib_dma_mapping_error(sdev->device, ioctx->dma))
+ goto err_free_buf;
+
+ return ioctx;
+
+err_free_buf:
+ kfree(ioctx->buf);
+err_free_ioctx:
+ kfree(ioctx);
+err:
+ return NULL;
+}
+
+/**
+ * srpt_free_ioctx() - Free an SRPT I/O context structure.
+ */
+static void srpt_free_ioctx(struct srpt_device *sdev, struct srpt_ioctx *ioctx,
+ int dma_size, enum dma_data_direction dir)
+{
+ if (!ioctx)
+ return;
+
+ ib_dma_unmap_single(sdev->device, ioctx->dma, dma_size, dir);
+ kfree(ioctx->buf);
+ kfree(ioctx);
+}
+
+/**
+ * srpt_alloc_ioctx_ring() - Allocate a ring of SRPT I/O context structures.
+ * @sdev: Device to allocate the I/O context ring for.
+ * @ring_size: Number of elements in the I/O context ring.
+ * @ioctx_size: I/O context size.
+ * @dma_size: DMA buffer size.
+ * @dir: DMA data direction.
+ */
+static struct srpt_ioctx **srpt_alloc_ioctx_ring(struct srpt_device *sdev,
+ int ring_size, int ioctx_size,
+ int dma_size, enum dma_data_direction dir)
+{
+ struct srpt_ioctx **ring;
+ int i;
+
+ WARN_ON(ioctx_size != sizeof(struct srpt_recv_ioctx)
+ && ioctx_size != sizeof(struct srpt_send_ioctx));
+
+ ring = kmalloc(ring_size * sizeof(ring[0]), GFP_KERNEL);
+ if (!ring)
+ goto out;
+ for (i = 0; i < ring_size; ++i) {
+ ring[i] = srpt_alloc_ioctx(sdev, ioctx_size, dma_size, dir);
+ if (!ring[i])
+ goto err;
+ ring[i]->index = i;
+ }
+ goto out;
+
+err:
+ while (--i >= 0)
+ srpt_free_ioctx(sdev, ring[i], dma_size, dir);
+ kfree(ring);
+ ring = NULL;
+out:
+ return ring;
+}
+
+/**
+ * srpt_free_ioctx_ring() - Free the ring of SRPT I/O context structures.
+ */
+static void srpt_free_ioctx_ring(struct srpt_ioctx **ioctx_ring,
+ struct srpt_device *sdev, int ring_size,
+ int dma_size, enum dma_data_direction dir)
+{
+ int i;
+
+ for (i = 0; i < ring_size; ++i)
+ srpt_free_ioctx(sdev, ioctx_ring[i], dma_size, dir);
+ kfree(ioctx_ring);
+}
+
+/**
+ * srpt_get_cmd_state() - Get the state of a SCSI command.
+ */
+static enum srpt_command_state srpt_get_cmd_state(struct srpt_send_ioctx *ioctx)
+{
+ enum srpt_command_state state;
+ unsigned long flags;
+
+ BUG_ON(!ioctx);
+
+ spin_lock_irqsave(&ioctx->spinlock, flags);
+ state = ioctx->state;
+ spin_unlock_irqrestore(&ioctx->spinlock, flags);
+ return state;
+}
+
+/**
+ * srpt_set_cmd_state() - Set the state of a SCSI command.
+ *
+ * Does not modify the state of aborted commands. Returns the previous command
+ * state.
+ */
+static enum srpt_command_state srpt_set_cmd_state(struct srpt_send_ioctx *ioctx,
+ enum srpt_command_state new)
+{
+ enum srpt_command_state previous;
+ unsigned long flags;
+
+ BUG_ON(!ioctx);
+
+ spin_lock_irqsave(&ioctx->spinlock, flags);
+ previous = ioctx->state;
+ if (previous != SRPT_STATE_DONE)
+ ioctx->state = new;
+ spin_unlock_irqrestore(&ioctx->spinlock, flags);
+
+ return previous;
+}
+
+/**
+ * srpt_test_and_set_cmd_state() - Test and set the state of a command.
+ *
+ * Returns true if and only if the previous command state was equal to 'old'.
+ */
+static bool srpt_test_and_set_cmd_state(struct srpt_send_ioctx *ioctx,
+ enum srpt_command_state old,
+ enum srpt_command_state new)
+{
+ enum srpt_command_state previous;
+ unsigned long flags;
+
+ WARN_ON(!ioctx);
+ WARN_ON(old == SRPT_STATE_DONE);
+ WARN_ON(new == SRPT_STATE_NEW);
+
+ spin_lock_irqsave(&ioctx->spinlock, flags);
+ previous = ioctx->state;
+ if (previous == old)
+ ioctx->state = new;
+ spin_unlock_irqrestore(&ioctx->spinlock, flags);
+ return previous == old;
+}
+
+/**
+ * srpt_post_recv() - Post an IB receive request.
+ */
+static int srpt_post_recv(struct srpt_device *sdev,
+ struct srpt_recv_ioctx *ioctx)
+{
+ struct ib_sge list;
+ struct ib_recv_wr wr, *bad_wr;
+
+ BUG_ON(!sdev);
+ wr.wr_id = encode_wr_id(SRPT_RECV, ioctx->ioctx.index);
+
+ list.addr = ioctx->ioctx.dma;
+ list.length = srp_max_req_size;
+ list.lkey = sdev->mr->lkey;
+
+ wr.next = NULL;
+ wr.sg_list = &list;
+ wr.num_sge = 1;
+
+ return ib_post_srq_recv(sdev->srq, &wr, &bad_wr);
+}
+
+/**
+ * srpt_post_send() - Post an IB send request.
+ *
+ * Returns zero upon success and a non-zero value upon failure.
+ */
+static int srpt_post_send(struct srpt_rdma_ch *ch,
+ struct srpt_send_ioctx *ioctx, int len)
+{
+ struct ib_sge list;
+ struct ib_send_wr wr, *bad_wr;
+ struct srpt_device *sdev = ch->sport->sdev;
+ int ret;
+
+ atomic_inc(&ch->req_lim);
+
+ ret = -ENOMEM;
+ if (unlikely(atomic_dec_return(&ch->sq_wr_avail) < 0)) {
+ pr_warn("IB send queue full (needed 1)\n");
+ goto out;
+ }
+
+ ib_dma_sync_single_for_device(sdev->device, ioctx->ioctx.dma, len,
+ DMA_TO_DEVICE);
+
+ list.addr = ioctx->ioctx.dma;
+ list.length = len;
+ list.lkey = sdev->mr->lkey;
+
+ wr.next = NULL;
+ wr.wr_id = encode_wr_id(SRPT_SEND, ioctx->ioctx.index);
+ wr.sg_list = &list;
+ wr.num_sge = 1;
+ wr.opcode = IB_WR_SEND;
+ wr.send_flags = IB_SEND_SIGNALED;
+
+ ret = ib_post_send(ch->qp, &wr, &bad_wr);
+
+out:
+ if (ret < 0) {
+ atomic_inc(&ch->sq_wr_avail);
+ atomic_dec(&ch->req_lim);
+ }
+ return ret;
+}
+
+/**
+ * srpt_get_desc_tbl() - Parse the data descriptors of an SRP_CMD request.
+ * @ioctx: Pointer to the I/O context associated with the request.
+ * @srp_cmd: Pointer to the SRP_CMD request data.
+ * @dir: Pointer to the variable to which the transfer direction will be
+ * written.
+ * @data_len: Pointer to the variable to which the total data length of all
+ * descriptors in the SRP_CMD request will be written.
+ *
+ * This function initializes ioctx->n_rbuf and ioctx->rbufs.
+ *
+ * Returns -EINVAL when the SRP_CMD request contains inconsistent descriptors;
+ * -ENOMEM when memory allocation fails and zero upon success.
+ */
+static int srpt_get_desc_tbl(struct srpt_send_ioctx *ioctx,
+ struct srp_cmd *srp_cmd,
+ enum dma_data_direction *dir, u64 *data_len)
+{
+ struct srp_indirect_buf *idb;
+ struct srp_direct_buf *db;
+ unsigned add_cdb_offset;
+ int ret;
+
+ /*
+ * The pointer computations below will only be compiled correctly
+ * if srp_cmd::add_data is declared as s8*, u8*, s8[] or u8[], so check
+ * whether srp_cmd::add_data has been declared as a byte pointer.
+ */
+ BUILD_BUG_ON(!__same_type(srp_cmd->add_data[0], (s8)0)
+ && !__same_type(srp_cmd->add_data[0], (u8)0));
+
+ BUG_ON(!dir);
+ BUG_ON(!data_len);
+
+ ret = 0;
+ *data_len = 0;
+
+ /*
+ * The lower four bits of the buffer format field contain the DATA-IN
+ * buffer descriptor format, and the highest four bits contain the
+ * DATA-OUT buffer descriptor format.
+ */
+ *dir = DMA_NONE;
+ if (srp_cmd->buf_fmt & 0xf)
+ /* DATA-IN: transfer data from target to initiator (read). */
+ *dir = DMA_FROM_DEVICE;
+ else if (srp_cmd->buf_fmt >> 4)
+ /* DATA-OUT: transfer data from initiator to target (write). */
+ *dir = DMA_TO_DEVICE;
+
+ /*
+ * According to the SRP spec, the lower two bits of the 'ADDITIONAL
+ * CDB LENGTH' field are reserved and the size in bytes of this field
+ * is four times the value specified in bits 3..7. Hence the "& ~3".
+ */
+ add_cdb_offset = srp_cmd->add_cdb_len & ~3;
+ if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_DIRECT) ||
+ ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_DIRECT)) {
+ ioctx->n_rbuf = 1;
+ ioctx->rbufs = &ioctx->single_rbuf;
+
+ db = (struct srp_direct_buf *)(srp_cmd->add_data
+ + add_cdb_offset);
+ memcpy(ioctx->rbufs, db, sizeof *db);
+ *data_len = be32_to_cpu(db->len);
+ } else if (((srp_cmd->buf_fmt & 0xf) == SRP_DATA_DESC_INDIRECT) ||
+ ((srp_cmd->buf_fmt >> 4) == SRP_DATA_DESC_INDIRECT)) {
+ idb = (struct srp_indirect_buf *)(srp_cmd->add_data
+ + add_cdb_offset);
+
+ ioctx->n_rbuf = be32_to_cpu(idb->table_desc.len) / sizeof *db;
+
+ if (ioctx->n_rbuf >
+ (srp_cmd->data_out_desc_cnt + srp_cmd->data_in_desc_cnt)) {
+ pr_err("received unsupported SRP_CMD request"
+ " type (%u out + %u in != %u / %zu)\n",
+ srp_cmd->data_out_desc_cnt,
+ srp_cmd->data_in_desc_cnt,
+ be32_to_cpu(idb->table_desc.len),
+ sizeof(*db));
+ ioctx->n_rbuf = 0;
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (ioctx->n_rbuf == 1)
+ ioctx->rbufs = &ioctx->single_rbuf;
+ else {
+ ioctx->rbufs =
+ kmalloc(ioctx->n_rbuf * sizeof *db, GFP_ATOMIC);
+ if (!ioctx->rbufs) {
+ ioctx->n_rbuf = 0;
+ ret = -ENOMEM;
+ goto out;
+ }
+ }
+
+ db = idb->desc_list;
+ memcpy(ioctx->rbufs, db, ioctx->n_rbuf * sizeof *db);
+ *data_len = be32_to_cpu(idb->len);
+ }
+out:
+ return ret;
+}
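+
+/*
+ * For illustration: an SRP_CMD with buf_fmt == 0x01 carries a single direct
+ * DATA-IN descriptor, so srpt_get_desc_tbl() sets *dir to DMA_FROM_DEVICE and
+ * takes *data_len from that descriptor's len field; a buf_fmt of 0x20 would
+ * instead describe an indirect DATA-OUT table (assuming the SRP_DATA_DESC_*
+ * values from <scsi/srp.h>).
+ */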
+
+/**
+ * srpt_init_ch_qp() - Initialize queue pair attributes.
+ *
+ * Initializes the attributes of queue pair 'qp' by allowing local write,
+ * remote read and remote write. Also transitions 'qp' to state IB_QPS_INIT.
+ */
+static int srpt_init_ch_qp(struct srpt_rdma_ch *ch, struct ib_qp *qp)
+{
+ struct ib_qp_attr *attr;
+ int ret;
+
+ attr = kzalloc(sizeof *attr, GFP_KERNEL);
+ if (!attr)
+ return -ENOMEM;
+
+ attr->qp_state = IB_QPS_INIT;
+ attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE;
+ attr->port_num = ch->sport->port;
+ attr->pkey_index = 0;
+
+ ret = ib_modify_qp(qp, attr,
+ IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PORT |
+ IB_QP_PKEY_INDEX);
+
+ kfree(attr);
+ return ret;
+}
+
+/**
+ * srpt_ch_qp_rtr() - Change the state of a channel to 'ready to receive' (RTR).
+ * @ch: channel of the queue pair.
+ * @qp: queue pair to change the state of.
+ *
+ * Returns zero upon success and a negative value upon failure.
+ *
+ * Note: currently a struct ib_qp_attr takes 136 bytes on a 64-bit system.
+ * If this structure ever becomes larger, it might be necessary to allocate
+ * it dynamically instead of on the stack.
+ */
+static int srpt_ch_qp_rtr(struct srpt_rdma_ch *ch, struct ib_qp *qp)
+{
+ struct ib_qp_attr qp_attr;
+ int attr_mask;
+ int ret;
+
+ qp_attr.qp_state = IB_QPS_RTR;
+ ret = ib_cm_init_qp_attr(ch->cm_id, &qp_attr, &attr_mask);
+ if (ret)
+ goto out;
+
+ qp_attr.max_dest_rd_atomic = 4;
+
+ ret = ib_modify_qp(qp, &qp_attr, attr_mask);
+
+out:
+ return ret;
+}
+
+/**
+ * srpt_ch_qp_rts() - Change the state of a channel to 'ready to send' (RTS).
+ * @ch: channel of the queue pair.
+ * @qp: queue pair to change the state of.
+ *
+ * Returns zero upon success and a negative value upon failure.
+ *
+ * Note: currently a struct ib_qp_attr takes 136 bytes on a 64-bit system.
+ * If this structure ever becomes larger, it might be necessary to allocate
+ * it dynamically instead of on the stack.
+ */
+static int srpt_ch_qp_rts(struct srpt_rdma_ch *ch, struct ib_qp *qp)
+{
+ struct ib_qp_attr qp_attr;
+ int attr_mask;
+ int ret;
+
+ qp_attr.qp_state = IB_QPS_RTS;
+ ret = ib_cm_init_qp_attr(ch->cm_id, &qp_attr, &attr_mask);
+ if (ret)
+ goto out;
+
+ qp_attr.max_rd_atomic = 4;
+
+ ret = ib_modify_qp(qp, &qp_attr, attr_mask);
+
+out:
+ return ret;
+}
+
+/**
+ * srpt_ch_qp_err() - Set the channel queue pair state to 'error'.
+ */
+static int srpt_ch_qp_err(struct srpt_rdma_ch *ch)
+{
+ struct ib_qp_attr qp_attr;
+
+ qp_attr.qp_state = IB_QPS_ERR;
+ return ib_modify_qp(ch->qp, &qp_attr, IB_QP_STATE);
+}
+
+/**
+ * srpt_unmap_sg_to_ib_sge() - Unmap an IB SGE list.
+ */
+static void srpt_unmap_sg_to_ib_sge(struct srpt_rdma_ch *ch,
+ struct srpt_send_ioctx *ioctx)
+{
+ struct scatterlist *sg;
+ enum dma_data_direction dir;
+
+ BUG_ON(!ch);
+ BUG_ON(!ioctx);
+ BUG_ON(ioctx->n_rdma && !ioctx->rdma_ius);
+
+ while (ioctx->n_rdma)
+ kfree(ioctx->rdma_ius[--ioctx->n_rdma].sge);
+
+ kfree(ioctx->rdma_ius);
+ ioctx->rdma_ius = NULL;
+
+ if (ioctx->mapped_sg_count) {
+ sg = ioctx->sg;
+ WARN_ON(!sg);
+ dir = ioctx->cmd.data_direction;
+ BUG_ON(dir == DMA_NONE);
+ ib_dma_unmap_sg(ch->sport->sdev->device, sg, ioctx->sg_cnt,
+ opposite_dma_dir(dir));
+ ioctx->mapped_sg_count = 0;
+ }
+}
+
+/**
+ * srpt_map_sg_to_ib_sge() - Map an SG list to an IB SGE list.
+ */
+static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch,
+ struct srpt_send_ioctx *ioctx)
+{
+ struct ib_device *dev = ch->sport->sdev->device;
+ struct se_cmd *cmd;
+ struct scatterlist *sg, *sg_orig;
+ int sg_cnt;
+ enum dma_data_direction dir;
+ struct rdma_iu *riu;
+ struct srp_direct_buf *db;
+ dma_addr_t dma_addr;
+ struct ib_sge *sge;
+ u64 raddr;
+ u32 rsize;
+ u32 tsize;
+ u32 dma_len;
+ int count, nrdma;
+ int i, j, k;
+
+ BUG_ON(!ch);
+ BUG_ON(!ioctx);
+ cmd = &ioctx->cmd;
+ dir = cmd->data_direction;
+ BUG_ON(dir == DMA_NONE);
+
+ ioctx->sg = sg = sg_orig = cmd->t_data_sg;
+ ioctx->sg_cnt = sg_cnt = cmd->t_data_nents;
+
+ count = ib_dma_map_sg(ch->sport->sdev->device, sg, sg_cnt,
+ opposite_dma_dir(dir));
+ if (unlikely(!count))
+ return -EAGAIN;
+
+ ioctx->mapped_sg_count = count;
+
+ if (ioctx->rdma_ius && ioctx->n_rdma_ius)
+ nrdma = ioctx->n_rdma_ius;
+ else {
+ nrdma = (count + SRPT_DEF_SG_PER_WQE - 1) / SRPT_DEF_SG_PER_WQE
+ + ioctx->n_rbuf;
+
+ ioctx->rdma_ius = kzalloc(nrdma * sizeof *riu, GFP_KERNEL);
+ if (!ioctx->rdma_ius)
+ goto free_mem;
+
+ ioctx->n_rdma_ius = nrdma;
+ }
+
+ db = ioctx->rbufs;
+ tsize = cmd->data_length;
+ dma_len = ib_sg_dma_len(dev, &sg[0]);
+ riu = ioctx->rdma_ius;
+
+ /*
+	 * For each remote descriptor, calculate the number of ib_sge entries
+	 * needed. If that number fits within SRPT_DEF_SG_PER_WQE, a single
+	 * rdma_iu (one RDMA work request) per remote descriptor suffices;
+	 * otherwise extra rdma_iu entries are allocated to carry the
+	 * additional ib_sge entries in further RDMA work requests.
+ */
+ for (i = 0, j = 0;
+ j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) {
+ rsize = be32_to_cpu(db->len);
+ raddr = be64_to_cpu(db->va);
+ riu->raddr = raddr;
+ riu->rkey = be32_to_cpu(db->key);
+ riu->sge_cnt = 0;
+
+ /* calculate how many sge required for this remote_buf */
+ while (rsize > 0 && tsize > 0) {
+
+ if (rsize >= dma_len) {
+ tsize -= dma_len;
+ rsize -= dma_len;
+ raddr += dma_len;
+
+ if (tsize > 0) {
+ ++j;
+ if (j < count) {
+ sg = sg_next(sg);
+ dma_len = ib_sg_dma_len(
+ dev, sg);
+ }
+ }
+ } else {
+ tsize -= rsize;
+ dma_len -= rsize;
+ rsize = 0;
+ }
+
+ ++riu->sge_cnt;
+
+ if (rsize > 0 && riu->sge_cnt == SRPT_DEF_SG_PER_WQE) {
+ ++ioctx->n_rdma;
+ riu->sge =
+ kmalloc(riu->sge_cnt * sizeof *riu->sge,
+ GFP_KERNEL);
+ if (!riu->sge)
+ goto free_mem;
+
+ ++riu;
+ riu->sge_cnt = 0;
+ riu->raddr = raddr;
+ riu->rkey = be32_to_cpu(db->key);
+ }
+ }
+
+ ++ioctx->n_rdma;
+ riu->sge = kmalloc(riu->sge_cnt * sizeof *riu->sge,
+ GFP_KERNEL);
+ if (!riu->sge)
+ goto free_mem;
+ }
+
+ db = ioctx->rbufs;
+ tsize = cmd->data_length;
+ riu = ioctx->rdma_ius;
+ sg = sg_orig;
+ dma_len = ib_sg_dma_len(dev, &sg[0]);
+ dma_addr = ib_sg_dma_address(dev, &sg[0]);
+
+	/* This second loop actually maps the sg addresses to rdma_iu->ib_sge. */
+ for (i = 0, j = 0;
+ j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) {
+ rsize = be32_to_cpu(db->len);
+ sge = riu->sge;
+ k = 0;
+
+ while (rsize > 0 && tsize > 0) {
+ sge->addr = dma_addr;
+ sge->lkey = ch->sport->sdev->mr->lkey;
+
+ if (rsize >= dma_len) {
+ sge->length =
+ (tsize < dma_len) ? tsize : dma_len;
+ tsize -= dma_len;
+ rsize -= dma_len;
+
+ if (tsize > 0) {
+ ++j;
+ if (j < count) {
+ sg = sg_next(sg);
+ dma_len = ib_sg_dma_len(
+ dev, sg);
+ dma_addr = ib_sg_dma_address(
+ dev, sg);
+ }
+ }
+ } else {
+ sge->length = (tsize < rsize) ? tsize : rsize;
+ tsize -= rsize;
+ dma_len -= rsize;
+ dma_addr += rsize;
+ rsize = 0;
+ }
+
+ ++k;
+ if (k == riu->sge_cnt && rsize > 0 && tsize > 0) {
+ ++riu;
+ sge = riu->sge;
+ k = 0;
+ } else if (rsize > 0 && tsize > 0)
+ ++sge;
+ }
+ }
+
+ return 0;
+
+free_mem:
+ srpt_unmap_sg_to_ib_sge(ch, ioctx);
+
+ return -ENOMEM;
+}
+
+/**
+ * srpt_get_send_ioctx() - Obtain an I/O context for sending to the initiator.
+ */
+static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch)
+{
+ struct srpt_send_ioctx *ioctx;
+ unsigned long flags;
+
+ BUG_ON(!ch);
+
+ ioctx = NULL;
+ spin_lock_irqsave(&ch->spinlock, flags);
+ if (!list_empty(&ch->free_list)) {
+ ioctx = list_first_entry(&ch->free_list,
+ struct srpt_send_ioctx, free_list);
+ list_del(&ioctx->free_list);
+ }
+ spin_unlock_irqrestore(&ch->spinlock, flags);
+
+ if (!ioctx)
+ return ioctx;
+
+ BUG_ON(ioctx->ch != ch);
+ spin_lock_init(&ioctx->spinlock);
+ ioctx->state = SRPT_STATE_NEW;
+ ioctx->n_rbuf = 0;
+ ioctx->rbufs = NULL;
+ ioctx->n_rdma = 0;
+ ioctx->n_rdma_ius = 0;
+ ioctx->rdma_ius = NULL;
+ ioctx->mapped_sg_count = 0;
+ init_completion(&ioctx->tx_done);
+ ioctx->queue_status_only = false;
+ /*
+ * transport_init_se_cmd() does not initialize all fields, so do it
+ * here.
+ */
+ memset(&ioctx->cmd, 0, sizeof(ioctx->cmd));
+ memset(&ioctx->sense_data, 0, sizeof(ioctx->sense_data));
+
+ return ioctx;
+}
+
+/**
+ * srpt_abort_cmd() - Abort a SCSI command.
+ * @ioctx: I/O context associated with the SCSI command.
+ */
+static int srpt_abort_cmd(struct srpt_send_ioctx *ioctx)
+{
+ enum srpt_command_state state;
+ unsigned long flags;
+
+ BUG_ON(!ioctx);
+
+ /*
+ * If the command is in a state where the target core is waiting for
+ * the ib_srpt driver, change the state to the next state. Changing
+ * the state of the command from SRPT_STATE_NEED_DATA to
+ * SRPT_STATE_DATA_IN ensures that srpt_xmit_response() will call this
+ * function a second time.
+ */
+
+ spin_lock_irqsave(&ioctx->spinlock, flags);
+ state = ioctx->state;
+ switch (state) {
+ case SRPT_STATE_NEED_DATA:
+ ioctx->state = SRPT_STATE_DATA_IN;
+ break;
+ case SRPT_STATE_DATA_IN:
+ case SRPT_STATE_CMD_RSP_SENT:
+ case SRPT_STATE_MGMT_RSP_SENT:
+ ioctx->state = SRPT_STATE_DONE;
+ break;
+ default:
+ break;
+ }
+ spin_unlock_irqrestore(&ioctx->spinlock, flags);
+
+ if (state == SRPT_STATE_DONE) {
+ struct srpt_rdma_ch *ch = ioctx->ch;
+
+ BUG_ON(ch->sess == NULL);
+
+ target_put_sess_cmd(ch->sess, &ioctx->cmd);
+ goto out;
+ }
+
+ pr_debug("Aborting cmd with state %d and tag %lld\n", state,
+ ioctx->tag);
+
+ switch (state) {
+ case SRPT_STATE_NEW:
+ case SRPT_STATE_DATA_IN:
+ case SRPT_STATE_MGMT:
+ /*
+ * Do nothing - defer abort processing until
+ * srpt_queue_response() is invoked.
+ */
+ WARN_ON(!transport_check_aborted_status(&ioctx->cmd, false));
+ break;
+ case SRPT_STATE_NEED_DATA:
+ /* DMA_TO_DEVICE (write) - RDMA read error. */
+
+ /* XXX(hch): this is a horrible layering violation.. */
+ spin_lock_irqsave(&ioctx->cmd.t_state_lock, flags);
+ ioctx->cmd.transport_state &= ~CMD_T_ACTIVE;
+ spin_unlock_irqrestore(&ioctx->cmd.t_state_lock, flags);
+ break;
+ case SRPT_STATE_CMD_RSP_SENT:
+ /*
+ * SRP_RSP sending failed or the SRP_RSP send completion has
+ * not been received in time.
+ */
+ srpt_unmap_sg_to_ib_sge(ioctx->ch, ioctx);
+ target_put_sess_cmd(ioctx->ch->sess, &ioctx->cmd);
+ break;
+ case SRPT_STATE_MGMT_RSP_SENT:
+ srpt_set_cmd_state(ioctx, SRPT_STATE_DONE);
+ target_put_sess_cmd(ioctx->ch->sess, &ioctx->cmd);
+ break;
+ default:
+ WARN(1, "Unexpected command state (%d)", state);
+ break;
+ }
+
+out:
+ return state;
+}
+
+/**
+ * srpt_handle_send_err_comp() - Process an IB_WC_SEND error completion.
+ */
+static void srpt_handle_send_err_comp(struct srpt_rdma_ch *ch, u64 wr_id)
+{
+ struct srpt_send_ioctx *ioctx;
+ enum srpt_command_state state;
+ struct se_cmd *cmd;
+ u32 index;
+
+ atomic_inc(&ch->sq_wr_avail);
+
+ index = idx_from_wr_id(wr_id);
+ ioctx = ch->ioctx_ring[index];
+ state = srpt_get_cmd_state(ioctx);
+ cmd = &ioctx->cmd;
+
+ WARN_ON(state != SRPT_STATE_CMD_RSP_SENT
+ && state != SRPT_STATE_MGMT_RSP_SENT
+ && state != SRPT_STATE_NEED_DATA
+ && state != SRPT_STATE_DONE);
+
+ /* If SRP_RSP sending failed, undo the ch->req_lim change. */
+ if (state == SRPT_STATE_CMD_RSP_SENT
+ || state == SRPT_STATE_MGMT_RSP_SENT)
+ atomic_dec(&ch->req_lim);
+
+ srpt_abort_cmd(ioctx);
+}
+
+/**
+ * srpt_handle_send_comp() - Process an IB send completion notification.
+ */
+static void srpt_handle_send_comp(struct srpt_rdma_ch *ch,
+ struct srpt_send_ioctx *ioctx)
+{
+ enum srpt_command_state state;
+
+ atomic_inc(&ch->sq_wr_avail);
+
+ state = srpt_set_cmd_state(ioctx, SRPT_STATE_DONE);
+
+ if (WARN_ON(state != SRPT_STATE_CMD_RSP_SENT
+ && state != SRPT_STATE_MGMT_RSP_SENT
+ && state != SRPT_STATE_DONE))
+ pr_debug("state = %d\n", state);
+
+ if (state != SRPT_STATE_DONE) {
+ srpt_unmap_sg_to_ib_sge(ch, ioctx);
+ transport_generic_free_cmd(&ioctx->cmd, 0);
+ } else {
+ pr_err("IB completion has been received too late for"
+ " wr_id = %u.\n", ioctx->ioctx.index);
+ }
+}
+
+/**
+ * srpt_handle_rdma_comp() - Process an IB RDMA completion notification.
+ *
+ * XXX: what is now target_execute_cmd used to be asynchronous, and unmapping
+ * the data that has been transferred via IB RDMA had to be postponed until the
+ * check_stop_free() callback. None of this is necessary anymore and needs to
+ * be cleaned up.
+ */
+static void srpt_handle_rdma_comp(struct srpt_rdma_ch *ch,
+ struct srpt_send_ioctx *ioctx,
+ enum srpt_opcode opcode)
+{
+ WARN_ON(ioctx->n_rdma <= 0);
+ atomic_add(ioctx->n_rdma, &ch->sq_wr_avail);
+
+ if (opcode == SRPT_RDMA_READ_LAST) {
+ if (srpt_test_and_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA,
+ SRPT_STATE_DATA_IN))
+ target_execute_cmd(&ioctx->cmd);
+ else
+ pr_err("%s[%d]: wrong state = %d\n", __func__,
+ __LINE__, srpt_get_cmd_state(ioctx));
+ } else if (opcode == SRPT_RDMA_ABORT) {
+ ioctx->rdma_aborted = true;
+ } else {
+ WARN(true, "unexpected opcode %d\n", opcode);
+ }
+}
+
+/**
+ * srpt_handle_rdma_err_comp() - Process an IB RDMA error completion.
+ */
+static void srpt_handle_rdma_err_comp(struct srpt_rdma_ch *ch,
+ struct srpt_send_ioctx *ioctx,
+ enum srpt_opcode opcode)
+{
+ struct se_cmd *cmd;
+ enum srpt_command_state state;
+
+ cmd = &ioctx->cmd;
+ state = srpt_get_cmd_state(ioctx);
+ switch (opcode) {
+ case SRPT_RDMA_READ_LAST:
+ if (ioctx->n_rdma <= 0) {
+ pr_err("Received invalid RDMA read"
+ " error completion with idx %d\n",
+ ioctx->ioctx.index);
+ break;
+ }
+ atomic_add(ioctx->n_rdma, &ch->sq_wr_avail);
+ if (state == SRPT_STATE_NEED_DATA)
+ srpt_abort_cmd(ioctx);
+ else
+ pr_err("%s[%d]: wrong state = %d\n",
+ __func__, __LINE__, state);
+ break;
+ case SRPT_RDMA_WRITE_LAST:
+ break;
+ default:
+ pr_err("%s[%d]: opcode = %u\n", __func__, __LINE__, opcode);
+ break;
+ }
+}
+
+/**
+ * srpt_build_cmd_rsp() - Build an SRP_RSP response.
+ * @ch: RDMA channel through which the request has been received.
+ * @ioctx: I/O context associated with the SRP_CMD request. The response will
+ * be built in the buffer ioctx->buf points at and hence this function will
+ * overwrite the request data.
+ * @tag: tag of the request for which this response is being generated.
+ * @status: value for the STATUS field of the SRP_RSP information unit.
+ *
+ * Returns the size in bytes of the SRP_RSP response.
+ *
+ * An SRP_RSP response contains a SCSI status or service response. See also
+ * section 6.9 in the SRP r16a document for the format of an SRP_RSP
+ * response. See also SPC-2 for more information about sense data.
+ */
+static int srpt_build_cmd_rsp(struct srpt_rdma_ch *ch,
+ struct srpt_send_ioctx *ioctx, u64 tag,
+ int status)
+{
+ struct srp_rsp *srp_rsp;
+ const u8 *sense_data;
+ int sense_data_len, max_sense_len;
+
+ /*
+ * The lowest bit of all SAM-3 status codes is zero (see also
+ * paragraph 5.3 in SAM-3).
+ */
+ WARN_ON(status & 1);
+
+ srp_rsp = ioctx->ioctx.buf;
+ BUG_ON(!srp_rsp);
+
+ sense_data = ioctx->sense_data;
+ sense_data_len = ioctx->cmd.scsi_sense_length;
+ WARN_ON(sense_data_len > sizeof(ioctx->sense_data));
+
+ memset(srp_rsp, 0, sizeof *srp_rsp);
+ srp_rsp->opcode = SRP_RSP;
+ srp_rsp->req_lim_delta =
+ __constant_cpu_to_be32(1 + atomic_xchg(&ch->req_lim_delta, 0));
+ srp_rsp->tag = tag;
+ srp_rsp->status = status;
+
+ if (sense_data_len) {
+ BUILD_BUG_ON(MIN_MAX_RSP_SIZE <= sizeof(*srp_rsp));
+ max_sense_len = ch->max_ti_iu_len - sizeof(*srp_rsp);
+ if (sense_data_len > max_sense_len) {
+ pr_warn("truncated sense data from %d to %d"
+ " bytes\n", sense_data_len, max_sense_len);
+ sense_data_len = max_sense_len;
+ }
+
+ srp_rsp->flags |= SRP_RSP_FLAG_SNSVALID;
+ srp_rsp->sense_data_len = cpu_to_be32(sense_data_len);
+ memcpy(srp_rsp + 1, sense_data, sense_data_len);
+ }
+
+ return sizeof(*srp_rsp) + sense_data_len;
+}
+
+/**
+ * srpt_build_tskmgmt_rsp() - Build a task management response.
+ * @ch: RDMA channel through which the request has been received.
+ * @ioctx: I/O context in which the SRP_RSP response will be built.
+ * @rsp_code: RSP_CODE that will be stored in the response.
+ * @tag: Tag of the request for which this response is being generated.
+ *
+ * Returns the size in bytes of the SRP_RSP response.
+ *
+ * An SRP_RSP response contains a SCSI status or service response. See also
+ * section 6.9 in the SRP r16a document for the format of an SRP_RSP
+ * response.
+ */
+static int srpt_build_tskmgmt_rsp(struct srpt_rdma_ch *ch,
+ struct srpt_send_ioctx *ioctx,
+ u8 rsp_code, u64 tag)
+{
+ struct srp_rsp *srp_rsp;
+ int resp_data_len;
+ int resp_len;
+
+ resp_data_len = 4;
+ resp_len = sizeof(*srp_rsp) + resp_data_len;
+
+ srp_rsp = ioctx->ioctx.buf;
+ BUG_ON(!srp_rsp);
+ memset(srp_rsp, 0, sizeof *srp_rsp);
+
+ srp_rsp->opcode = SRP_RSP;
+ srp_rsp->req_lim_delta = __constant_cpu_to_be32(1
+ + atomic_xchg(&ch->req_lim_delta, 0));
+ srp_rsp->tag = tag;
+
+ srp_rsp->flags |= SRP_RSP_FLAG_RSPVALID;
+ srp_rsp->resp_data_len = cpu_to_be32(resp_data_len);
+ srp_rsp->data[3] = rsp_code;
+
+ return resp_len;
+}
+
+#define NO_SUCH_LUN ((uint64_t)-1LL)
+
+/*
+ * SCSI LUN addressing method. See also SAM-2 and the section about
+ * eight byte LUNs.
+ */
+enum scsi_lun_addr_method {
+ SCSI_LUN_ADDR_METHOD_PERIPHERAL = 0,
+ SCSI_LUN_ADDR_METHOD_FLAT = 1,
+ SCSI_LUN_ADDR_METHOD_LUN = 2,
+ SCSI_LUN_ADDR_METHOD_EXTENDED_LUN = 3,
+};
+
+/*
+ * srpt_unpack_lun() - Convert from network LUN to linear LUN.
+ *
+ * Convert a 2-byte, 4-byte, 6-byte or 8-byte LUN structure in network byte
+ * order (big endian) to a linear LUN. Supports three LUN addressing methods:
+ * peripheral, flat and logical unit. See also SAM-2, section 4.9.4 (page 40).
+ */
+static uint64_t srpt_unpack_lun(const uint8_t *lun, int len)
+{
+ uint64_t res = NO_SUCH_LUN;
+ int addressing_method;
+
+ if (unlikely(len < 2)) {
+ pr_err("Illegal LUN length %d, expected 2 bytes or more\n",
+ len);
+ goto out;
+ }
+
+ switch (len) {
+ case 8:
+ if ((*((__be64 *)lun) &
+ __constant_cpu_to_be64(0x0000FFFFFFFFFFFFLL)) != 0)
+ goto out_err;
+ break;
+ case 4:
+ if (*((__be16 *)&lun[2]) != 0)
+ goto out_err;
+ break;
+ case 6:
+ if (*((__be32 *)&lun[2]) != 0)
+ goto out_err;
+ break;
+ case 2:
+ break;
+ default:
+ goto out_err;
+ }
+
+ addressing_method = (*lun) >> 6; /* highest two bits of byte 0 */
+ switch (addressing_method) {
+ case SCSI_LUN_ADDR_METHOD_PERIPHERAL:
+ case SCSI_LUN_ADDR_METHOD_FLAT:
+ case SCSI_LUN_ADDR_METHOD_LUN:
+ res = *(lun + 1) | (((*lun) & 0x3f) << 8);
+ break;
+
+ case SCSI_LUN_ADDR_METHOD_EXTENDED_LUN:
+ default:
+ pr_err("Unimplemented LUN addressing method %u\n",
+ addressing_method);
+ break;
+ }
+
+out:
+ return res;
+
+out_err:
+ pr_err("Support for multi-level LUNs has not yet been implemented\n");
+ goto out;
+}
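+
+/*
+ * Illustrative sketch only (editor's example, not part of the driver): with
+ * flat addressing the two highest bits of byte 0 hold the addressing method
+ * (01b) and the remaining 14 bits hold the LUN number, so LUN 5 corresponds
+ * to the big-endian byte sequence 40 05 00 00 00 00 00 00 and
+ * srpt_unpack_lun() converts it back to 5.
+ */
+static uint64_t __maybe_unused srpt_unpack_lun_example(void)
+{
+	static const uint8_t flat_lun_5[8] = { 0x40, 0x05, 0, 0, 0, 0, 0, 0 };
+
+	return srpt_unpack_lun(flat_lun_5, sizeof(flat_lun_5));	/* == 5 */
+}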
+
+static int srpt_check_stop_free(struct se_cmd *cmd)
+{
+ struct srpt_send_ioctx *ioctx = container_of(cmd,
+ struct srpt_send_ioctx, cmd);
+
+ return target_put_sess_cmd(ioctx->ch->sess, &ioctx->cmd);
+}
+
+/**
+ * srpt_handle_cmd() - Process SRP_CMD.
+ */
+static int srpt_handle_cmd(struct srpt_rdma_ch *ch,
+ struct srpt_recv_ioctx *recv_ioctx,
+ struct srpt_send_ioctx *send_ioctx)
+{
+ struct se_cmd *cmd;
+ struct srp_cmd *srp_cmd;
+ uint64_t unpacked_lun;
+ u64 data_len;
+ enum dma_data_direction dir;
+ sense_reason_t ret;
+ int rc;
+
+ BUG_ON(!send_ioctx);
+
+ srp_cmd = recv_ioctx->ioctx.buf;
+ cmd = &send_ioctx->cmd;
+ send_ioctx->tag = srp_cmd->tag;
+
+ switch (srp_cmd->task_attr) {
+ case SRP_CMD_SIMPLE_Q:
+ cmd->sam_task_attr = TCM_SIMPLE_TAG;
+ break;
+ case SRP_CMD_ORDERED_Q:
+ default:
+ cmd->sam_task_attr = TCM_ORDERED_TAG;
+ break;
+ case SRP_CMD_HEAD_OF_Q:
+ cmd->sam_task_attr = TCM_HEAD_TAG;
+ break;
+ case SRP_CMD_ACA:
+ cmd->sam_task_attr = TCM_ACA_TAG;
+ break;
+ }
+
+ if (srpt_get_desc_tbl(send_ioctx, srp_cmd, &dir, &data_len)) {
+ pr_err("0x%llx: parsing SRP descriptor table failed.\n",
+ srp_cmd->tag);
+ ret = TCM_INVALID_CDB_FIELD;
+ goto send_sense;
+ }
+
+ unpacked_lun = srpt_unpack_lun((uint8_t *)&srp_cmd->lun,
+ sizeof(srp_cmd->lun));
+ rc = target_submit_cmd(cmd, ch->sess, srp_cmd->cdb,
+ &send_ioctx->sense_data[0], unpacked_lun, data_len,
+ TCM_SIMPLE_TAG, dir, TARGET_SCF_ACK_KREF);
+ if (rc != 0) {
+ ret = TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
+ goto send_sense;
+ }
+ return 0;
+
+send_sense:
+ transport_send_check_condition_and_sense(cmd, ret, 0);
+ return -1;
+}
+
+/**
+ * srpt_rx_mgmt_fn_tag() - Look up the command a tag-based TMF refers to.
+ * @ioctx: I/O context of the SRP task management request.
+ * @tag: Tag of the SRP command that has to be aborted.
+ *
+ * Returns zero if a command with the given tag is still outstanding on the
+ * same LUN and hence can be aborted by the target core, or -EINVAL otherwise.
+ *
+ * Note: It is assumed that the initiator serializes tag-based task management
+ * requests.
+ */
+static int srpt_rx_mgmt_fn_tag(struct srpt_send_ioctx *ioctx, u64 tag)
+{
+ struct srpt_device *sdev;
+ struct srpt_rdma_ch *ch;
+ struct srpt_send_ioctx *target;
+ int ret, i;
+
+ ret = -EINVAL;
+ ch = ioctx->ch;
+ BUG_ON(!ch);
+ BUG_ON(!ch->sport);
+ sdev = ch->sport->sdev;
+ BUG_ON(!sdev);
+ spin_lock_irq(&sdev->spinlock);
+ for (i = 0; i < ch->rq_size; ++i) {
+ target = ch->ioctx_ring[i];
+ if (target->cmd.se_lun == ioctx->cmd.se_lun &&
+ target->tag == tag &&
+ srpt_get_cmd_state(target) != SRPT_STATE_DONE) {
+ ret = 0;
+ /* now let the target core abort &target->cmd; */
+ break;
+ }
+ }
+ spin_unlock_irq(&sdev->spinlock);
+ return ret;
+}
+
+static int srp_tmr_to_tcm(int fn)
+{
+ switch (fn) {
+ case SRP_TSK_ABORT_TASK:
+ return TMR_ABORT_TASK;
+ case SRP_TSK_ABORT_TASK_SET:
+ return TMR_ABORT_TASK_SET;
+ case SRP_TSK_CLEAR_TASK_SET:
+ return TMR_CLEAR_TASK_SET;
+ case SRP_TSK_LUN_RESET:
+ return TMR_LUN_RESET;
+ case SRP_TSK_CLEAR_ACA:
+ return TMR_CLEAR_ACA;
+ default:
+ return -1;
+ }
+}
+
+/**
+ * srpt_handle_tsk_mgmt() - Process an SRP_TSK_MGMT information unit.
+ *
+ * Either hands the task management request over to the target core or, if
+ * that is not possible, sends a response with the appropriate failure code
+ * back to the initiator.
+ *
+ * For more information about SRP_TSK_MGMT information units, see also section
+ * 6.7 in the SRP r16a document.
+ */
+static void srpt_handle_tsk_mgmt(struct srpt_rdma_ch *ch,
+ struct srpt_recv_ioctx *recv_ioctx,
+ struct srpt_send_ioctx *send_ioctx)
+{
+ struct srp_tsk_mgmt *srp_tsk;
+ struct se_cmd *cmd;
+ struct se_session *sess = ch->sess;
+ uint64_t unpacked_lun;
+ uint32_t tag = 0;
+ int tcm_tmr;
+ int rc;
+
+ BUG_ON(!send_ioctx);
+
+ srp_tsk = recv_ioctx->ioctx.buf;
+ cmd = &send_ioctx->cmd;
+
+ pr_debug("recv tsk_mgmt fn %d for task_tag %lld and cmd tag %lld"
+ " cm_id %p sess %p\n", srp_tsk->tsk_mgmt_func,
+ srp_tsk->task_tag, srp_tsk->tag, ch->cm_id, ch->sess);
+
+ srpt_set_cmd_state(send_ioctx, SRPT_STATE_MGMT);
+ send_ioctx->tag = srp_tsk->tag;
+ tcm_tmr = srp_tmr_to_tcm(srp_tsk->tsk_mgmt_func);
+ if (tcm_tmr < 0) {
+ send_ioctx->cmd.se_tmr_req->response =
+ TMR_TASK_MGMT_FUNCTION_NOT_SUPPORTED;
+ goto fail;
+ }
+ unpacked_lun = srpt_unpack_lun((uint8_t *)&srp_tsk->lun,
+ sizeof(srp_tsk->lun));
+
+ if (srp_tsk->tsk_mgmt_func == SRP_TSK_ABORT_TASK) {
+ rc = srpt_rx_mgmt_fn_tag(send_ioctx, srp_tsk->task_tag);
+ if (rc < 0) {
+ send_ioctx->cmd.se_tmr_req->response =
+ TMR_TASK_DOES_NOT_EXIST;
+ goto fail;
+ }
+ tag = srp_tsk->task_tag;
+ }
+ rc = target_submit_tmr(&send_ioctx->cmd, sess, NULL, unpacked_lun,
+ srp_tsk, tcm_tmr, GFP_KERNEL, tag,
+ TARGET_SCF_ACK_KREF);
+ if (rc != 0) {
+ send_ioctx->cmd.se_tmr_req->response = TMR_FUNCTION_REJECTED;
+ goto fail;
+ }
+ return;
+fail:
+ transport_send_check_condition_and_sense(cmd, 0, 0); // XXX:
+}
+
+/**
+ * srpt_handle_new_iu() - Process a newly received information unit.
+ * @ch: RDMA channel through which the information unit has been received.
+ * @recv_ioctx: Receive I/O context associated with the information unit.
+ * @send_ioctx: Send I/O context to use for the response, or NULL to let this
+ * function allocate one from the channel's free list.
+ */
+static void srpt_handle_new_iu(struct srpt_rdma_ch *ch,
+ struct srpt_recv_ioctx *recv_ioctx,
+ struct srpt_send_ioctx *send_ioctx)
+{
+ struct srp_cmd *srp_cmd;
+ enum rdma_ch_state ch_state;
+
+ BUG_ON(!ch);
+ BUG_ON(!recv_ioctx);
+
+ ib_dma_sync_single_for_cpu(ch->sport->sdev->device,
+ recv_ioctx->ioctx.dma, srp_max_req_size,
+ DMA_FROM_DEVICE);
+
+ ch_state = srpt_get_ch_state(ch);
+ if (unlikely(ch_state == CH_CONNECTING)) {
+ list_add_tail(&recv_ioctx->wait_list, &ch->cmd_wait_list);
+ goto out;
+ }
+
+ if (unlikely(ch_state != CH_LIVE))
+ goto out;
+
+ srp_cmd = recv_ioctx->ioctx.buf;
+ if (srp_cmd->opcode == SRP_CMD || srp_cmd->opcode == SRP_TSK_MGMT) {
+ if (!send_ioctx)
+ send_ioctx = srpt_get_send_ioctx(ch);
+ if (unlikely(!send_ioctx)) {
+ list_add_tail(&recv_ioctx->wait_list,
+ &ch->cmd_wait_list);
+ goto out;
+ }
+ }
+
+ switch (srp_cmd->opcode) {
+ case SRP_CMD:
+ srpt_handle_cmd(ch, recv_ioctx, send_ioctx);
+ break;
+ case SRP_TSK_MGMT:
+ srpt_handle_tsk_mgmt(ch, recv_ioctx, send_ioctx);
+ break;
+ case SRP_I_LOGOUT:
+ pr_err("Not yet implemented: SRP_I_LOGOUT\n");
+ break;
+ case SRP_CRED_RSP:
+ pr_debug("received SRP_CRED_RSP\n");
+ break;
+ case SRP_AER_RSP:
+ pr_debug("received SRP_AER_RSP\n");
+ break;
+ case SRP_RSP:
+ pr_err("Received SRP_RSP\n");
+ break;
+ default:
+ pr_err("received IU with unknown opcode 0x%x\n",
+ srp_cmd->opcode);
+ break;
+ }
+
+ srpt_post_recv(ch->sport->sdev, recv_ioctx);
+out:
+ return;
+}
+
+static void srpt_process_rcv_completion(struct ib_cq *cq,
+ struct srpt_rdma_ch *ch,
+ struct ib_wc *wc)
+{
+ struct srpt_device *sdev = ch->sport->sdev;
+ struct srpt_recv_ioctx *ioctx;
+ u32 index;
+
+ index = idx_from_wr_id(wc->wr_id);
+ if (wc->status == IB_WC_SUCCESS) {
+ int req_lim;
+
+ req_lim = atomic_dec_return(&ch->req_lim);
+ if (unlikely(req_lim < 0))
+ pr_err("req_lim = %d < 0\n", req_lim);
+ ioctx = sdev->ioctx_ring[index];
+ srpt_handle_new_iu(ch, ioctx, NULL);
+ } else {
+ pr_info("receiving failed for idx %u with status %d\n",
+ index, wc->status);
+ }
+}
+
+/**
+ * srpt_process_send_completion() - Process an IB send completion.
+ *
+ * Note: Although this has not yet been observed during tests, at least in
+ * theory it is possible that the srpt_get_send_ioctx() call invoked by
+ * srpt_handle_new_iu() fails. This is possible because the req_lim_delta
+ * value in each response is set to one, and it is possible that this response
+ * makes the initiator send a new request before the send completion for that
+ * response has been processed. This could e.g. happen if the call to
+ * srpt_put_send_ioctx() is delayed because of a higher priority interrupt or
+ * if IB retransmission causes generation of the send completion to be
+ * delayed. Incoming information units for which srpt_get_send_ioctx() fails
+ * are queued on cmd_wait_list. The code below processes these delayed
+ * requests one at a time.
+ */
+static void srpt_process_send_completion(struct ib_cq *cq,
+ struct srpt_rdma_ch *ch,
+ struct ib_wc *wc)
+{
+ struct srpt_send_ioctx *send_ioctx;
+ uint32_t index;
+ enum srpt_opcode opcode;
+
+ index = idx_from_wr_id(wc->wr_id);
+ opcode = opcode_from_wr_id(wc->wr_id);
+ send_ioctx = ch->ioctx_ring[index];
+ if (wc->status == IB_WC_SUCCESS) {
+ if (opcode == SRPT_SEND)
+ srpt_handle_send_comp(ch, send_ioctx);
+ else {
+ WARN_ON(opcode != SRPT_RDMA_ABORT &&
+ wc->opcode != IB_WC_RDMA_READ);
+ srpt_handle_rdma_comp(ch, send_ioctx, opcode);
+ }
+ } else {
+ if (opcode == SRPT_SEND) {
+ pr_info("sending response for idx %u failed"
+ " with status %d\n", index, wc->status);
+ srpt_handle_send_err_comp(ch, wc->wr_id);
+ } else if (opcode != SRPT_RDMA_MID) {
+ pr_info("RDMA t %d for idx %u failed with"
+ " status %d\n", opcode, index, wc->status);
+ srpt_handle_rdma_err_comp(ch, send_ioctx, opcode);
+ }
+ }
+
+ while (unlikely(opcode == SRPT_SEND
+ && !list_empty(&ch->cmd_wait_list)
+ && srpt_get_ch_state(ch) == CH_LIVE
+ && (send_ioctx = srpt_get_send_ioctx(ch)) != NULL)) {
+ struct srpt_recv_ioctx *recv_ioctx;
+
+ recv_ioctx = list_first_entry(&ch->cmd_wait_list,
+ struct srpt_recv_ioctx,
+ wait_list);
+ list_del(&recv_ioctx->wait_list);
+ srpt_handle_new_iu(ch, recv_ioctx, send_ioctx);
+ }
+}
+
+static void srpt_process_completion(struct ib_cq *cq, struct srpt_rdma_ch *ch)
+{
+ struct ib_wc *const wc = ch->wc;
+ int i, n;
+
+ WARN_ON(cq != ch->cq);
+
+ ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
+ while ((n = ib_poll_cq(cq, ARRAY_SIZE(ch->wc), wc)) > 0) {
+ for (i = 0; i < n; i++) {
+ if (opcode_from_wr_id(wc[i].wr_id) == SRPT_RECV)
+ srpt_process_rcv_completion(cq, ch, &wc[i]);
+ else
+ srpt_process_send_completion(cq, ch, &wc[i]);
+ }
+ }
+}
+
+/**
+ * srpt_completion() - IB completion queue callback function.
+ *
+ * Notes:
+ * - It is guaranteed that a completion handler will never be invoked
+ * concurrently on two different CPUs for the same completion queue. See also
+ * Documentation/infiniband/core_locking.txt and the implementation of
+ * handle_edge_irq() in kernel/irq/chip.c.
+ * - When threaded IRQs are enabled, completion handlers are invoked in thread
+ * context instead of interrupt context.
+ */
+static void srpt_completion(struct ib_cq *cq, void *ctx)
+{
+ struct srpt_rdma_ch *ch = ctx;
+
+ wake_up_interruptible(&ch->wait_queue);
+}
+
+static int srpt_compl_thread(void *arg)
+{
+ struct srpt_rdma_ch *ch;
+
+ /* Hibernation / freezing of the SRPT kernel thread is not supported. */
+ current->flags |= PF_NOFREEZE;
+
+ ch = arg;
+ BUG_ON(!ch);
+ pr_info("Session %s: kernel thread %s (PID %d) started\n",
+ ch->sess_name, ch->thread->comm, current->pid);
+ while (!kthread_should_stop()) {
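+		/*
+		 * The comma expression below polls the completion queue on
+		 * every wakeup and only terminates the wait once
+		 * kthread_should_stop() becomes true.
+		 */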
+ wait_event_interruptible(ch->wait_queue,
+ (srpt_process_completion(ch->cq, ch),
+ kthread_should_stop()));
+ }
+ pr_info("Session %s: kernel thread %s (PID %d) stopped\n",
+ ch->sess_name, ch->thread->comm, current->pid);
+ return 0;
+}
+
+/**
+ * srpt_create_ch_ib() - Create the CQ, QP and completion thread of a channel.
+ */
+static int srpt_create_ch_ib(struct srpt_rdma_ch *ch)
+{
+ struct ib_qp_init_attr *qp_init;
+ struct srpt_port *sport = ch->sport;
+ struct srpt_device *sdev = sport->sdev;
+ u32 srp_sq_size = sport->port_attrib.srp_sq_size;
+ int ret;
+
+ WARN_ON(ch->rq_size < 1);
+
+ ret = -ENOMEM;
+ qp_init = kzalloc(sizeof *qp_init, GFP_KERNEL);
+ if (!qp_init)
+ goto out;
+
+retry:
+ ch->cq = ib_create_cq(sdev->device, srpt_completion, NULL, ch,
+ ch->rq_size + srp_sq_size, 0);
+ if (IS_ERR(ch->cq)) {
+ ret = PTR_ERR(ch->cq);
+ pr_err("failed to create CQ cqe= %d ret= %d\n",
+ ch->rq_size + srp_sq_size, ret);
+ goto out;
+ }
+
+ qp_init->qp_context = (void *)ch;
+ qp_init->event_handler
+ = (void(*)(struct ib_event *, void*))srpt_qp_event;
+ qp_init->send_cq = ch->cq;
+ qp_init->recv_cq = ch->cq;
+ qp_init->srq = sdev->srq;
+ qp_init->sq_sig_type = IB_SIGNAL_REQ_WR;
+ qp_init->qp_type = IB_QPT_RC;
+ qp_init->cap.max_send_wr = srp_sq_size;
+ qp_init->cap.max_send_sge = SRPT_DEF_SG_PER_WQE;
+
+ ch->qp = ib_create_qp(sdev->pd, qp_init);
+ if (IS_ERR(ch->qp)) {
+ ret = PTR_ERR(ch->qp);
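+		/*
+		 * Treat -ENOMEM from ib_create_qp() as a send queue size
+		 * limit of the HCA: halve srp_sq_size and retry the CQ and QP
+		 * creation until MIN_SRPT_SQ_SIZE is reached.
+		 */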
+ if (ret == -ENOMEM) {
+ srp_sq_size /= 2;
+ if (srp_sq_size >= MIN_SRPT_SQ_SIZE) {
+ ib_destroy_cq(ch->cq);
+ goto retry;
+ }
+ }
+ pr_err("failed to create_qp ret= %d\n", ret);
+ goto err_destroy_cq;
+ }
+
+ atomic_set(&ch->sq_wr_avail, qp_init->cap.max_send_wr);
+
+ pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n",
+ __func__, ch->cq->cqe, qp_init->cap.max_send_sge,
+ qp_init->cap.max_send_wr, ch->cm_id);
+
+ ret = srpt_init_ch_qp(ch, ch->qp);
+ if (ret)
+ goto err_destroy_qp;
+
+ init_waitqueue_head(&ch->wait_queue);
+
+ pr_debug("creating thread for session %s\n", ch->sess_name);
+
+ ch->thread = kthread_run(srpt_compl_thread, ch, "ib_srpt_compl");
+ if (IS_ERR(ch->thread)) {
+ pr_err("failed to create kernel thread %ld\n",
+ PTR_ERR(ch->thread));
+ ch->thread = NULL;
+ goto err_destroy_qp;
+ }
+
+out:
+ kfree(qp_init);
+ return ret;
+
+err_destroy_qp:
+ ib_destroy_qp(ch->qp);
+err_destroy_cq:
+ ib_destroy_cq(ch->cq);
+ goto out;
+}
+
+static void srpt_destroy_ch_ib(struct srpt_rdma_ch *ch)
+{
+ if (ch->thread)
+ kthread_stop(ch->thread);
+
+ ib_destroy_qp(ch->qp);
+ ib_destroy_cq(ch->cq);
+}
+
+/**
+ * __srpt_close_ch() - Close an RDMA channel by setting the QP error state.
+ *
+ * Reset the QP and make sure all resources associated with the channel will
+ * be deallocated at an appropriate time.
+ *
+ * Note: The caller must hold ch->sport->sdev->spinlock.
+ */
+static void __srpt_close_ch(struct srpt_rdma_ch *ch)
+{
+ struct srpt_device *sdev;
+ enum rdma_ch_state prev_state;
+ unsigned long flags;
+
+ sdev = ch->sport->sdev;
+
+ spin_lock_irqsave(&ch->spinlock, flags);
+ prev_state = ch->state;
+ switch (prev_state) {
+ case CH_CONNECTING:
+ case CH_LIVE:
+ ch->state = CH_DISCONNECTING;
+ break;
+ default:
+ break;
+ }
+ spin_unlock_irqrestore(&ch->spinlock, flags);
+
+ switch (prev_state) {
+ case CH_CONNECTING:
+ ib_send_cm_rej(ch->cm_id, IB_CM_REJ_NO_RESOURCES, NULL, 0,
+ NULL, 0);
+ /* fall through */
+ case CH_LIVE:
+ if (ib_send_cm_dreq(ch->cm_id, NULL, 0) < 0)
+ pr_err("sending CM DREQ failed.\n");
+ break;
+ case CH_DISCONNECTING:
+ break;
+ case CH_DRAINING:
+ case CH_RELEASING:
+ break;
+ }
+}
+
+/**
+ * srpt_close_ch() - Close an RDMA channel.
+ */
+static void srpt_close_ch(struct srpt_rdma_ch *ch)
+{
+ struct srpt_device *sdev;
+
+ sdev = ch->sport->sdev;
+ spin_lock_irq(&sdev->spinlock);
+ __srpt_close_ch(ch);
+ spin_unlock_irq(&sdev->spinlock);
+}
+
+/**
+ * srpt_shutdown_session() - Whether or not a session may be shut down.
+ */
+static int srpt_shutdown_session(struct se_session *se_sess)
+{
+ struct srpt_rdma_ch *ch = se_sess->fabric_sess_ptr;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ch->spinlock, flags);
+ if (ch->in_shutdown) {
+ spin_unlock_irqrestore(&ch->spinlock, flags);
+ return true;
+ }
+
+ ch->in_shutdown = true;
+ target_sess_cmd_list_set_waiting(se_sess);
+ spin_unlock_irqrestore(&ch->spinlock, flags);
+
+ return true;
+}
+
+/**
+ * srpt_drain_channel() - Drain a channel by resetting the IB queue pair.
+ * @cm_id: Pointer to the CM ID of the channel to be drained.
+ *
+ * Note: Must be called from inside srpt_cm_handler to avoid a race between
+ * accessing sdev->spinlock and the call to kfree(sdev) in srpt_remove_one()
+ * (the caller of srpt_cm_handler holds the cm_id spinlock; srpt_remove_one()
+ * waits until all target sessions for the associated IB device have been
+ * unregistered and target session registration involves a call to
+ * ib_destroy_cm_id(), which locks the cm_id spinlock and hence waits until
+ * this function has finished).
+ */
+static void srpt_drain_channel(struct ib_cm_id *cm_id)
+{
+ struct srpt_device *sdev;
+ struct srpt_rdma_ch *ch;
+ int ret;
+ bool do_reset = false;
+
+ WARN_ON_ONCE(irqs_disabled());
+
+ sdev = cm_id->context;
+ BUG_ON(!sdev);
+ spin_lock_irq(&sdev->spinlock);
+ list_for_each_entry(ch, &sdev->rch_list, list) {
+ if (ch->cm_id == cm_id) {
+ do_reset = srpt_test_and_set_ch_state(ch,
+ CH_CONNECTING, CH_DRAINING) ||
+ srpt_test_and_set_ch_state(ch,
+ CH_LIVE, CH_DRAINING) ||
+ srpt_test_and_set_ch_state(ch,
+ CH_DISCONNECTING, CH_DRAINING);
+ break;
+ }
+ }
+ spin_unlock_irq(&sdev->spinlock);
+
+ if (do_reset) {
+ if (ch->sess)
+ srpt_shutdown_session(ch->sess);
+
+ ret = srpt_ch_qp_err(ch);
+ if (ret < 0)
+ pr_err("Setting queue pair in error state"
+ " failed: %d\n", ret);
+ }
+}
+
+/**
+ * srpt_find_channel() - Look up an RDMA channel.
+ * @cm_id: Pointer to the CM ID of the channel to be looked up.
+ *
+ * Return NULL if no matching RDMA channel has been found.
+ */
+static struct srpt_rdma_ch *srpt_find_channel(struct srpt_device *sdev,
+ struct ib_cm_id *cm_id)
+{
+ struct srpt_rdma_ch *ch;
+ bool found;
+
+ WARN_ON_ONCE(irqs_disabled());
+ BUG_ON(!sdev);
+
+ found = false;
+ spin_lock_irq(&sdev->spinlock);
+ list_for_each_entry(ch, &sdev->rch_list, list) {
+ if (ch->cm_id == cm_id) {
+ found = true;
+ break;
+ }
+ }
+ spin_unlock_irq(&sdev->spinlock);
+
+ return found ? ch : NULL;
+}
+
+/**
+ * srpt_release_channel() - Release channel resources.
+ *
+ * Schedules the actual release because:
+ * - Calling ib_destroy_cm_id() from inside an IB CM callback would trigger
+ *   a deadlock.
+ * - It is not safe to call TCM transport_* functions from interrupt context.
+ */
+static void srpt_release_channel(struct srpt_rdma_ch *ch)
+{
+ schedule_work(&ch->release_work);
+}
+
+static void srpt_release_channel_work(struct work_struct *w)
+{
+ struct srpt_rdma_ch *ch;
+ struct srpt_device *sdev;
+ struct se_session *se_sess;
+
+ ch = container_of(w, struct srpt_rdma_ch, release_work);
+ pr_debug("ch = %p; ch->sess = %p; release_done = %p\n", ch, ch->sess,
+ ch->release_done);
+
+ sdev = ch->sport->sdev;
+ BUG_ON(!sdev);
+
+ se_sess = ch->sess;
+ BUG_ON(!se_sess);
+
+ target_wait_for_sess_cmds(se_sess);
+
+ transport_deregister_session_configfs(se_sess);
+ transport_deregister_session(se_sess);
+ ch->sess = NULL;
+
+ ib_destroy_cm_id(ch->cm_id);
+
+ srpt_destroy_ch_ib(ch);
+
+ srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_ring,
+ ch->sport->sdev, ch->rq_size,
+ ch->rsp_size, DMA_TO_DEVICE);
+
+ spin_lock_irq(&sdev->spinlock);
+ list_del(&ch->list);
+ spin_unlock_irq(&sdev->spinlock);
+
+ if (ch->release_done)
+ complete(ch->release_done);
+
+ wake_up(&sdev->ch_releaseQ);
+
+ kfree(ch);
+}
+
+static struct srpt_node_acl *__srpt_lookup_acl(struct srpt_port *sport,
+ u8 i_port_id[16])
+{
+ struct srpt_node_acl *nacl;
+
+ list_for_each_entry(nacl, &sport->port_acl_list, list)
+ if (memcmp(nacl->i_port_id, i_port_id,
+ sizeof(nacl->i_port_id)) == 0)
+ return nacl;
+
+ return NULL;
+}
+
+static struct srpt_node_acl *srpt_lookup_acl(struct srpt_port *sport,
+ u8 i_port_id[16])
+{
+ struct srpt_node_acl *nacl;
+
+ spin_lock_irq(&sport->port_acl_lock);
+ nacl = __srpt_lookup_acl(sport, i_port_id);
+ spin_unlock_irq(&sport->port_acl_lock);
+
+ return nacl;
+}
+
+/**
+ * srpt_cm_req_recv() - Process the event IB_CM_REQ_RECEIVED.
+ *
+ * Ownership of the cm_id is transferred to the target session if this
+ * function returns zero. Otherwise the caller remains the owner of cm_id.
+ */
+static int srpt_cm_req_recv(struct ib_cm_id *cm_id,
+ struct ib_cm_req_event_param *param,
+ void *private_data)
+{
+ struct srpt_device *sdev = cm_id->context;
+ struct srpt_port *sport = &sdev->port[param->port - 1];
+ struct srp_login_req *req;
+ struct srp_login_rsp *rsp;
+ struct srp_login_rej *rej;
+ struct ib_cm_rep_param *rep_param;
+ struct srpt_rdma_ch *ch, *tmp_ch;
+ struct srpt_node_acl *nacl;
+ u32 it_iu_len;
+ int i;
+ int ret = 0;
+
+ WARN_ON_ONCE(irqs_disabled());
+
+ if (WARN_ON(!sdev || !private_data))
+ return -EINVAL;
+
+ req = (struct srp_login_req *)private_data;
+
+ it_iu_len = be32_to_cpu(req->req_it_iu_len);
+
+ pr_info("Received SRP_LOGIN_REQ with i_port_id 0x%llx:0x%llx,"
+ " t_port_id 0x%llx:0x%llx and it_iu_len %d on port %d"
+ " (guid=0x%llx:0x%llx)\n",
+ be64_to_cpu(*(__be64 *)&req->initiator_port_id[0]),
+ be64_to_cpu(*(__be64 *)&req->initiator_port_id[8]),
+ be64_to_cpu(*(__be64 *)&req->target_port_id[0]),
+ be64_to_cpu(*(__be64 *)&req->target_port_id[8]),
+ it_iu_len,
+ param->port,
+ be64_to_cpu(*(__be64 *)&sdev->port[param->port - 1].gid.raw[0]),
+ be64_to_cpu(*(__be64 *)&sdev->port[param->port - 1].gid.raw[8]));
+
+ rsp = kzalloc(sizeof *rsp, GFP_KERNEL);
+ rej = kzalloc(sizeof *rej, GFP_KERNEL);
+ rep_param = kzalloc(sizeof *rep_param, GFP_KERNEL);
+
+ if (!rsp || !rej || !rep_param) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (it_iu_len > srp_max_req_size || it_iu_len < 64) {
+ rej->reason = __constant_cpu_to_be32(
+ SRP_LOGIN_REJ_REQ_IT_IU_LENGTH_TOO_LARGE);
+ ret = -EINVAL;
+ pr_err("rejected SRP_LOGIN_REQ because its"
+ " length (%d bytes) is out of range (%d .. %d)\n",
+ it_iu_len, 64, srp_max_req_size);
+ goto reject;
+ }
+
+ if (!sport->enabled) {
+ rej->reason = __constant_cpu_to_be32(
+ SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES);
+ ret = -EINVAL;
+ pr_err("rejected SRP_LOGIN_REQ because the target port"
+ " has not yet been enabled\n");
+ goto reject;
+ }
+
+ if ((req->req_flags & SRP_MTCH_ACTION) == SRP_MULTICHAN_SINGLE) {
+ rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_NO_CHAN;
+
+ spin_lock_irq(&sdev->spinlock);
+
+ list_for_each_entry_safe(ch, tmp_ch, &sdev->rch_list, list) {
+ if (!memcmp(ch->i_port_id, req->initiator_port_id, 16)
+ && !memcmp(ch->t_port_id, req->target_port_id, 16)
+ && param->port == ch->sport->port
+ && param->listen_id == ch->sport->sdev->cm_id
+ && ch->cm_id) {
+ enum rdma_ch_state ch_state;
+
+ ch_state = srpt_get_ch_state(ch);
+ if (ch_state != CH_CONNECTING
+ && ch_state != CH_LIVE)
+ continue;
+
+ /* found an existing channel */
+ pr_debug("Found existing channel %s"
+ " cm_id= %p state= %d\n",
+ ch->sess_name, ch->cm_id, ch_state);
+
+ __srpt_close_ch(ch);
+
+ rsp->rsp_flags =
+ SRP_LOGIN_RSP_MULTICHAN_TERMINATED;
+ }
+ }
+
+ spin_unlock_irq(&sdev->spinlock);
+
+ } else
+ rsp->rsp_flags = SRP_LOGIN_RSP_MULTICHAN_MAINTAINED;
+
+ if (*(__be64 *)req->target_port_id != cpu_to_be64(srpt_service_guid)
+ || *(__be64 *)(req->target_port_id + 8) !=
+ cpu_to_be64(srpt_service_guid)) {
+ rej->reason = __constant_cpu_to_be32(
+ SRP_LOGIN_REJ_UNABLE_ASSOCIATE_CHANNEL);
+ ret = -ENOMEM;
+ pr_err("rejected SRP_LOGIN_REQ because it"
+ " has an invalid target port identifier.\n");
+ goto reject;
+ }
+
+ ch = kzalloc(sizeof *ch, GFP_KERNEL);
+ if (!ch) {
+ rej->reason = __constant_cpu_to_be32(
+ SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES);
+		pr_err("rejected SRP_LOGIN_REQ because out of memory.\n");
+ ret = -ENOMEM;
+ goto reject;
+ }
+
+ INIT_WORK(&ch->release_work, srpt_release_channel_work);
+ memcpy(ch->i_port_id, req->initiator_port_id, 16);
+ memcpy(ch->t_port_id, req->target_port_id, 16);
+ ch->sport = &sdev->port[param->port - 1];
+ ch->cm_id = cm_id;
+ /*
+ * Avoid QUEUE_FULL conditions by limiting the number of buffers used
+ * for the SRP protocol to the command queue size.
+ */
+ ch->rq_size = SRPT_RQ_SIZE;
+ spin_lock_init(&ch->spinlock);
+ ch->state = CH_CONNECTING;
+ INIT_LIST_HEAD(&ch->cmd_wait_list);
+ ch->rsp_size = ch->sport->port_attrib.srp_max_rsp_size;
+
+ ch->ioctx_ring = (struct srpt_send_ioctx **)
+ srpt_alloc_ioctx_ring(ch->sport->sdev, ch->rq_size,
+ sizeof(*ch->ioctx_ring[0]),
+ ch->rsp_size, DMA_TO_DEVICE);
+ if (!ch->ioctx_ring)
+ goto free_ch;
+
+ INIT_LIST_HEAD(&ch->free_list);
+ for (i = 0; i < ch->rq_size; i++) {
+ ch->ioctx_ring[i]->ch = ch;
+ list_add_tail(&ch->ioctx_ring[i]->free_list, &ch->free_list);
+ }
+
+ ret = srpt_create_ch_ib(ch);
+ if (ret) {
+ rej->reason = __constant_cpu_to_be32(
+ SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES);
+ pr_err("rejected SRP_LOGIN_REQ because creating"
+ " a new RDMA channel failed.\n");
+ goto free_ring;
+ }
+
+ ret = srpt_ch_qp_rtr(ch, ch->qp);
+ if (ret) {
+ rej->reason = __constant_cpu_to_be32(
+ SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES);
+ pr_err("rejected SRP_LOGIN_REQ because enabling"
+ " RTR failed (error code = %d)\n", ret);
+ goto destroy_ib;
+ }
+ /*
+	 * Use the initiator port identifier as the session name.
+ */
+ snprintf(ch->sess_name, sizeof(ch->sess_name), "0x%016llx%016llx",
+ be64_to_cpu(*(__be64 *)ch->i_port_id),
+ be64_to_cpu(*(__be64 *)(ch->i_port_id + 8)));
+
+ pr_debug("registering session %s\n", ch->sess_name);
+
+ nacl = srpt_lookup_acl(sport, ch->i_port_id);
+ if (!nacl) {
+ pr_info("Rejected login because no ACL has been"
+ " configured yet for initiator %s.\n", ch->sess_name);
+ rej->reason = __constant_cpu_to_be32(
+ SRP_LOGIN_REJ_CHANNEL_LIMIT_REACHED);
+ goto destroy_ib;
+ }
+
+ ch->sess = transport_init_session(TARGET_PROT_NORMAL);
+ if (IS_ERR(ch->sess)) {
+ rej->reason = __constant_cpu_to_be32(
+ SRP_LOGIN_REJ_INSUFFICIENT_RESOURCES);
+ pr_debug("Failed to create session\n");
+ goto deregister_session;
+ }
+ ch->sess->se_node_acl = &nacl->nacl;
+ transport_register_session(&sport->port_tpg_1, &nacl->nacl, ch->sess, ch);
+
+ pr_debug("Establish connection sess=%p name=%s cm_id=%p\n", ch->sess,
+ ch->sess_name, ch->cm_id);
+
+ /* create srp_login_response */
+ rsp->opcode = SRP_LOGIN_RSP;
+ rsp->tag = req->tag;
+ rsp->max_it_iu_len = req->req_it_iu_len;
+ rsp->max_ti_iu_len = req->req_it_iu_len;
+ ch->max_ti_iu_len = it_iu_len;
+ rsp->buf_fmt = __constant_cpu_to_be16(SRP_BUF_FORMAT_DIRECT
+ | SRP_BUF_FORMAT_INDIRECT);
+ rsp->req_lim_delta = cpu_to_be32(ch->rq_size);
+ atomic_set(&ch->req_lim, ch->rq_size);
+ atomic_set(&ch->req_lim_delta, 0);
+
+ /* create cm reply */
+ rep_param->qp_num = ch->qp->qp_num;
+ rep_param->private_data = (void *)rsp;
+ rep_param->private_data_len = sizeof *rsp;
+ rep_param->rnr_retry_count = 7;
+ rep_param->flow_control = 1;
+ rep_param->failover_accepted = 0;
+ rep_param->srq = 1;
+ rep_param->responder_resources = 4;
+ rep_param->initiator_depth = 4;
+
+ ret = ib_send_cm_rep(cm_id, rep_param);
+ if (ret) {
+ pr_err("sending SRP_LOGIN_REQ response failed"
+ " (error code = %d)\n", ret);
+ goto release_channel;
+ }
+
+ spin_lock_irq(&sdev->spinlock);
+ list_add_tail(&ch->list, &sdev->rch_list);
+ spin_unlock_irq(&sdev->spinlock);
+
+ goto out;
+
+release_channel:
+ srpt_set_ch_state(ch, CH_RELEASING);
+ transport_deregister_session_configfs(ch->sess);
+
+deregister_session:
+ transport_deregister_session(ch->sess);
+ ch->sess = NULL;
+
+destroy_ib:
+ srpt_destroy_ch_ib(ch);
+
+free_ring:
+ srpt_free_ioctx_ring((struct srpt_ioctx **)ch->ioctx_ring,
+ ch->sport->sdev, ch->rq_size,
+ ch->rsp_size, DMA_TO_DEVICE);
+free_ch:
+ kfree(ch);
+
+reject:
+ rej->opcode = SRP_LOGIN_REJ;
+ rej->tag = req->tag;
+ rej->buf_fmt = __constant_cpu_to_be16(SRP_BUF_FORMAT_DIRECT
+ | SRP_BUF_FORMAT_INDIRECT);
+
+ ib_send_cm_rej(cm_id, IB_CM_REJ_CONSUMER_DEFINED, NULL, 0,
+ (void *)rej, sizeof *rej);
+
+out:
+ kfree(rep_param);
+ kfree(rsp);
+ kfree(rej);
+
+ return ret;
+}
+
+static void srpt_cm_rej_recv(struct ib_cm_id *cm_id)
+{
+ pr_info("Received IB REJ for cm_id %p.\n", cm_id);
+ srpt_drain_channel(cm_id);
+}
+
+/**
+ * srpt_cm_rtu_recv() - Process an IB_CM_RTU_RECEIVED or USER_ESTABLISHED event.
+ *
+ * An IB_CM_RTU_RECEIVED message indicates that the connection is established
+ * and that the recipient may begin transmitting (RTU = ready to use).
+ */
+static void srpt_cm_rtu_recv(struct ib_cm_id *cm_id)
+{
+ struct srpt_rdma_ch *ch;
+ int ret;
+
+ ch = srpt_find_channel(cm_id->context, cm_id);
+ BUG_ON(!ch);
+
+ if (srpt_test_and_set_ch_state(ch, CH_CONNECTING, CH_LIVE)) {
+ struct srpt_recv_ioctx *ioctx, *ioctx_tmp;
+
+ ret = srpt_ch_qp_rts(ch, ch->qp);
+
+ list_for_each_entry_safe(ioctx, ioctx_tmp, &ch->cmd_wait_list,
+ wait_list) {
+ list_del(&ioctx->wait_list);
+ srpt_handle_new_iu(ch, ioctx, NULL);
+ }
+ if (ret)
+ srpt_close_ch(ch);
+ }
+}
+
+static void srpt_cm_timewait_exit(struct ib_cm_id *cm_id)
+{
+ pr_info("Received IB TimeWait exit for cm_id %p.\n", cm_id);
+ srpt_drain_channel(cm_id);
+}
+
+static void srpt_cm_rep_error(struct ib_cm_id *cm_id)
+{
+ pr_info("Received IB REP error for cm_id %p.\n", cm_id);
+ srpt_drain_channel(cm_id);
+}
+
+/**
+ * srpt_cm_dreq_recv() - Process reception of a DREQ message.
+ */
+static void srpt_cm_dreq_recv(struct ib_cm_id *cm_id)
+{
+ struct srpt_rdma_ch *ch;
+ unsigned long flags;
+ bool send_drep = false;
+
+ ch = srpt_find_channel(cm_id->context, cm_id);
+ BUG_ON(!ch);
+
+ pr_debug("cm_id= %p ch->state= %d\n", cm_id, srpt_get_ch_state(ch));
+
+ spin_lock_irqsave(&ch->spinlock, flags);
+ switch (ch->state) {
+ case CH_CONNECTING:
+ case CH_LIVE:
+ send_drep = true;
+ ch->state = CH_DISCONNECTING;
+ break;
+ case CH_DISCONNECTING:
+ case CH_DRAINING:
+ case CH_RELEASING:
+ WARN(true, "unexpected channel state %d\n", ch->state);
+ break;
+ }
+ spin_unlock_irqrestore(&ch->spinlock, flags);
+
+ if (send_drep) {
+ if (ib_send_cm_drep(ch->cm_id, NULL, 0) < 0)
+ pr_err("Sending IB DREP failed.\n");
+ pr_info("Received DREQ and sent DREP for session %s.\n",
+ ch->sess_name);
+ }
+}
+
+/**
+ * srpt_cm_drep_recv() - Process reception of a DREP message.
+ */
+static void srpt_cm_drep_recv(struct ib_cm_id *cm_id)
+{
+ pr_info("Received InfiniBand DREP message for cm_id %p.\n", cm_id);
+ srpt_drain_channel(cm_id);
+}
+
+/**
+ * srpt_cm_handler() - IB connection manager callback function.
+ *
+ * A non-zero return value will cause the caller to destroy the CM ID.
+ *
+ * Note: srpt_cm_handler() must only return a non-zero value when the transfer
+ * of cm_id ownership to a channel by srpt_cm_req_recv() has failed. Returning
+ * a non-zero value in any other case will trigger a race with the
+ * ib_destroy_cm_id() call in srpt_release_channel().
+ */
+static int srpt_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event)
+{
+ int ret;
+
+ ret = 0;
+ switch (event->event) {
+ case IB_CM_REQ_RECEIVED:
+ ret = srpt_cm_req_recv(cm_id, &event->param.req_rcvd,
+ event->private_data);
+ break;
+ case IB_CM_REJ_RECEIVED:
+ srpt_cm_rej_recv(cm_id);
+ break;
+ case IB_CM_RTU_RECEIVED:
+ case IB_CM_USER_ESTABLISHED:
+ srpt_cm_rtu_recv(cm_id);
+ break;
+ case IB_CM_DREQ_RECEIVED:
+ srpt_cm_dreq_recv(cm_id);
+ break;
+ case IB_CM_DREP_RECEIVED:
+ srpt_cm_drep_recv(cm_id);
+ break;
+ case IB_CM_TIMEWAIT_EXIT:
+ srpt_cm_timewait_exit(cm_id);
+ break;
+ case IB_CM_REP_ERROR:
+ srpt_cm_rep_error(cm_id);
+ break;
+ case IB_CM_DREQ_ERROR:
+ pr_info("Received IB DREQ ERROR event.\n");
+ break;
+ case IB_CM_MRA_RECEIVED:
+ pr_info("Received IB MRA event\n");
+ break;
+ default:
+ pr_err("received unrecognized IB CM event %d\n", event->event);
+ break;
+ }
+
+ return ret;
+}
+
+/**
+ * srpt_perform_rdmas() - Perform IB RDMA.
+ *
+ * Returns zero upon success or a negative number upon failure.
+ */
+static int srpt_perform_rdmas(struct srpt_rdma_ch *ch,
+ struct srpt_send_ioctx *ioctx)
+{
+ struct ib_send_wr wr;
+ struct ib_send_wr *bad_wr;
+ struct rdma_iu *riu;
+ int i;
+ int ret;
+ int sq_wr_avail;
+ enum dma_data_direction dir;
+ const int n_rdma = ioctx->n_rdma;
+
+ dir = ioctx->cmd.data_direction;
+ if (dir == DMA_TO_DEVICE) {
+ /* write */
+ ret = -ENOMEM;
+ sq_wr_avail = atomic_sub_return(n_rdma, &ch->sq_wr_avail);
+ if (sq_wr_avail < 0) {
+ pr_warn("IB send queue full (needed %d)\n",
+ n_rdma);
+ goto out;
+ }
+ }
+
+ ioctx->rdma_aborted = false;
+ ret = 0;
+ riu = ioctx->rdma_ius;
+ memset(&wr, 0, sizeof wr);
+
+ for (i = 0; i < n_rdma; ++i, ++riu) {
+ if (dir == DMA_FROM_DEVICE) {
+ wr.opcode = IB_WR_RDMA_WRITE;
+ wr.wr_id = encode_wr_id(i == n_rdma - 1 ?
+ SRPT_RDMA_WRITE_LAST :
+ SRPT_RDMA_MID,
+ ioctx->ioctx.index);
+ } else {
+ wr.opcode = IB_WR_RDMA_READ;
+ wr.wr_id = encode_wr_id(i == n_rdma - 1 ?
+ SRPT_RDMA_READ_LAST :
+ SRPT_RDMA_MID,
+ ioctx->ioctx.index);
+ }
+ wr.next = NULL;
+ wr.wr.rdma.remote_addr = riu->raddr;
+ wr.wr.rdma.rkey = riu->rkey;
+ wr.num_sge = riu->sge_cnt;
+ wr.sg_list = riu->sge;
+
+ /* only get completion event for the last rdma write */
+ if (i == (n_rdma - 1) && dir == DMA_TO_DEVICE)
+ wr.send_flags = IB_SEND_SIGNALED;
+
+ ret = ib_post_send(ch->qp, &wr, &bad_wr);
+ if (ret)
+ break;
+ }
+
+ if (ret)
+ pr_err("%s[%d]: ib_post_send() returned %d for %d/%d\n",
+ __func__, __LINE__, ret, i, n_rdma);
+ if (ret && i > 0) {
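+		/*
+		 * Part of the RDMA work requests have already been posted.
+		 * Post one more, signaled, zero-length work request so that a
+		 * completion with opcode SRPT_RDMA_ABORT is generated once
+		 * the already posted transfers have drained, and wait for
+		 * that completion below.
+		 */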
+ wr.num_sge = 0;
+ wr.wr_id = encode_wr_id(SRPT_RDMA_ABORT, ioctx->ioctx.index);
+ wr.send_flags = IB_SEND_SIGNALED;
+ while (ch->state == CH_LIVE &&
+ ib_post_send(ch->qp, &wr, &bad_wr) != 0) {
+ pr_info("Trying to abort failed RDMA transfer [%d]\n",
+ ioctx->ioctx.index);
+ msleep(1000);
+ }
+ while (ch->state != CH_RELEASING && !ioctx->rdma_aborted) {
+ pr_info("Waiting until RDMA abort finished [%d]\n",
+ ioctx->ioctx.index);
+ msleep(1000);
+ }
+ }
+out:
+ if (unlikely(dir == DMA_TO_DEVICE && ret < 0))
+ atomic_add(n_rdma, &ch->sq_wr_avail);
+ return ret;
+}
+
+/**
+ * srpt_xfer_data() - Start the RDMA data transfer for a SCSI command.
+ */
+static int srpt_xfer_data(struct srpt_rdma_ch *ch,
+ struct srpt_send_ioctx *ioctx)
+{
+ int ret;
+
+ ret = srpt_map_sg_to_ib_sge(ch, ioctx);
+ if (ret) {
+ pr_err("%s[%d] ret=%d\n", __func__, __LINE__, ret);
+ goto out;
+ }
+
+ ret = srpt_perform_rdmas(ch, ioctx);
+ if (ret) {
+ if (ret == -EAGAIN || ret == -ENOMEM)
+ pr_info("%s[%d] queue full -- ret=%d\n",
+ __func__, __LINE__, ret);
+ else
+ pr_err("%s[%d] fatal error -- ret=%d\n",
+ __func__, __LINE__, ret);
+ goto out_unmap;
+ }
+
+out:
+ return ret;
+out_unmap:
+ srpt_unmap_sg_to_ib_sge(ch, ioctx);
+ goto out;
+}
+
+static int srpt_write_pending_status(struct se_cmd *se_cmd)
+{
+ struct srpt_send_ioctx *ioctx;
+
+ ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd);
+ return srpt_get_cmd_state(ioctx) == SRPT_STATE_NEED_DATA;
+}
+
+/*
+ * srpt_write_pending() - Start data transfer from initiator to target (write).
+ */
+static int srpt_write_pending(struct se_cmd *se_cmd)
+{
+ struct srpt_rdma_ch *ch;
+ struct srpt_send_ioctx *ioctx;
+ enum srpt_command_state new_state;
+ enum rdma_ch_state ch_state;
+ int ret;
+
+ ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd);
+
+ new_state = srpt_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA);
+ WARN_ON(new_state == SRPT_STATE_DONE);
+
+ ch = ioctx->ch;
+ BUG_ON(!ch);
+
+ ch_state = srpt_get_ch_state(ch);
+ switch (ch_state) {
+ case CH_CONNECTING:
+ WARN(true, "unexpected channel state %d\n", ch_state);
+ ret = -EINVAL;
+ goto out;
+ case CH_LIVE:
+ break;
+ case CH_DISCONNECTING:
+ case CH_DRAINING:
+ case CH_RELEASING:
+ pr_debug("cmd with tag %lld: channel disconnecting\n",
+ ioctx->tag);
+ srpt_set_cmd_state(ioctx, SRPT_STATE_DATA_IN);
+ ret = -EINVAL;
+ goto out;
+ }
+ ret = srpt_xfer_data(ch, ioctx);
+
+out:
+ return ret;
+}
+
+static u8 tcm_to_srp_tsk_mgmt_status(const int tcm_mgmt_status)
+{
+ switch (tcm_mgmt_status) {
+ case TMR_FUNCTION_COMPLETE:
+ return SRP_TSK_MGMT_SUCCESS;
+ case TMR_FUNCTION_REJECTED:
+ return SRP_TSK_MGMT_FUNC_NOT_SUPP;
+ }
+ return SRP_TSK_MGMT_FAILED;
+}
+
+/**
+ * srpt_queue_response() - Transmit the response to a SCSI command.
+ *
+ * Callback function called by the TCM core. Must not block since it can be
+ * invoked in the context of the IB completion handler.
+ */
+static void srpt_queue_response(struct se_cmd *cmd)
+{
+ struct srpt_rdma_ch *ch;
+ struct srpt_send_ioctx *ioctx;
+ enum srpt_command_state state;
+ unsigned long flags;
+ int ret;
+ enum dma_data_direction dir;
+ int resp_len;
+ u8 srp_tm_status;
+
+ ioctx = container_of(cmd, struct srpt_send_ioctx, cmd);
+ ch = ioctx->ch;
+ BUG_ON(!ch);
+
+ spin_lock_irqsave(&ioctx->spinlock, flags);
+ state = ioctx->state;
+ switch (state) {
+ case SRPT_STATE_NEW:
+ case SRPT_STATE_DATA_IN:
+ ioctx->state = SRPT_STATE_CMD_RSP_SENT;
+ break;
+ case SRPT_STATE_MGMT:
+ ioctx->state = SRPT_STATE_MGMT_RSP_SENT;
+ break;
+ default:
+ WARN(true, "ch %p; cmd %d: unexpected command state %d\n",
+ ch, ioctx->ioctx.index, ioctx->state);
+ break;
+ }
+ spin_unlock_irqrestore(&ioctx->spinlock, flags);
+
+ if (unlikely(transport_check_aborted_status(&ioctx->cmd, false)
+ || WARN_ON_ONCE(state == SRPT_STATE_CMD_RSP_SENT))) {
+ atomic_inc(&ch->req_lim_delta);
+ srpt_abort_cmd(ioctx);
+ return;
+ }
+
+ dir = ioctx->cmd.data_direction;
+
+ /* For read commands, transfer the data to the initiator. */
+ if (dir == DMA_FROM_DEVICE && ioctx->cmd.data_length &&
+ !ioctx->queue_status_only) {
+ ret = srpt_xfer_data(ch, ioctx);
+ if (ret) {
+ pr_err("xfer_data failed for tag %llu\n",
+ ioctx->tag);
+ return;
+ }
+ }
+
+ if (state != SRPT_STATE_MGMT)
+ resp_len = srpt_build_cmd_rsp(ch, ioctx, ioctx->tag,
+ cmd->scsi_status);
+ else {
+ srp_tm_status
+ = tcm_to_srp_tsk_mgmt_status(cmd->se_tmr_req->response);
+ resp_len = srpt_build_tskmgmt_rsp(ch, ioctx, srp_tm_status,
+ ioctx->tag);
+ }
+ ret = srpt_post_send(ch, ioctx, resp_len);
+ if (ret) {
+ pr_err("sending cmd response failed for tag %llu\n",
+ ioctx->tag);
+ srpt_unmap_sg_to_ib_sge(ch, ioctx);
+ srpt_set_cmd_state(ioctx, SRPT_STATE_DONE);
+ target_put_sess_cmd(ioctx->ch->sess, &ioctx->cmd);
+ }
+}
+
+static int srpt_queue_data_in(struct se_cmd *cmd)
+{
+ srpt_queue_response(cmd);
+ return 0;
+}
+
+static void srpt_queue_tm_rsp(struct se_cmd *cmd)
+{
+ srpt_queue_response(cmd);
+}
+
+static void srpt_aborted_task(struct se_cmd *cmd)
+{
+ struct srpt_send_ioctx *ioctx = container_of(cmd,
+ struct srpt_send_ioctx, cmd);
+
+ srpt_unmap_sg_to_ib_sge(ioctx->ch, ioctx);
+}
+
+static int srpt_queue_status(struct se_cmd *cmd)
+{
+ struct srpt_send_ioctx *ioctx;
+
+ ioctx = container_of(cmd, struct srpt_send_ioctx, cmd);
+ BUG_ON(ioctx->sense_data != cmd->sense_buffer);
+ if (cmd->se_cmd_flags &
+ (SCF_TRANSPORT_TASK_SENSE | SCF_EMULATED_TASK_SENSE))
+ WARN_ON(cmd->scsi_status != SAM_STAT_CHECK_CONDITION);
+ ioctx->queue_status_only = true;
+ srpt_queue_response(cmd);
+ return 0;
+}
+
+static void srpt_refresh_port_work(struct work_struct *work)
+{
+ struct srpt_port *sport = container_of(work, struct srpt_port, work);
+
+ srpt_refresh_port(sport);
+}
+
+static int srpt_ch_list_empty(struct srpt_device *sdev)
+{
+ int res;
+
+ spin_lock_irq(&sdev->spinlock);
+ res = list_empty(&sdev->rch_list);
+ spin_unlock_irq(&sdev->spinlock);
+
+ return res;
+}
+
+/**
+ * srpt_release_sdev() - Free the channel resources associated with a target.
+ */
+static int srpt_release_sdev(struct srpt_device *sdev)
+{
+ struct srpt_rdma_ch *ch, *tmp_ch;
+ int res;
+
+ WARN_ON_ONCE(irqs_disabled());
+
+ BUG_ON(!sdev);
+
+ spin_lock_irq(&sdev->spinlock);
+ list_for_each_entry_safe(ch, tmp_ch, &sdev->rch_list, list)
+ __srpt_close_ch(ch);
+ spin_unlock_irq(&sdev->spinlock);
+
+ res = wait_event_interruptible(sdev->ch_releaseQ,
+ srpt_ch_list_empty(sdev));
+ if (res)
+ pr_err("%s: interrupted.\n", __func__);
+
+ return 0;
+}
+
+static struct srpt_port *__srpt_lookup_port(const char *name)
+{
+ struct ib_device *dev;
+ struct srpt_device *sdev;
+ struct srpt_port *sport;
+ int i;
+
+ list_for_each_entry(sdev, &srpt_dev_list, list) {
+ dev = sdev->device;
+ if (!dev)
+ continue;
+
+ for (i = 0; i < dev->phys_port_cnt; i++) {
+ sport = &sdev->port[i];
+
+ if (!strcmp(sport->port_guid, name))
+ return sport;
+ }
+ }
+
+ return NULL;
+}
+
+static struct srpt_port *srpt_lookup_port(const char *name)
+{
+ struct srpt_port *sport;
+
+ spin_lock(&srpt_dev_lock);
+ sport = __srpt_lookup_port(name);
+ spin_unlock(&srpt_dev_lock);
+
+ return sport;
+}
+
+/**
+ * srpt_add_one() - InfiniBand device addition callback function.
+ */
+static void srpt_add_one(struct ib_device *device)
+{
+ struct srpt_device *sdev;
+ struct srpt_port *sport;
+ struct ib_srq_init_attr srq_attr;
+ int i;
+
+ pr_debug("device = %p, device->dma_ops = %p\n", device,
+ device->dma_ops);
+
+ sdev = kzalloc(sizeof *sdev, GFP_KERNEL);
+ if (!sdev)
+ goto err;
+
+ sdev->device = device;
+ INIT_LIST_HEAD(&sdev->rch_list);
+ init_waitqueue_head(&sdev->ch_releaseQ);
+ spin_lock_init(&sdev->spinlock);
+
+ if (ib_query_device(device, &sdev->dev_attr))
+ goto free_dev;
+
+ sdev->pd = ib_alloc_pd(device);
+ if (IS_ERR(sdev->pd))
+ goto free_dev;
+
+ sdev->mr = ib_get_dma_mr(sdev->pd, IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(sdev->mr))
+ goto err_pd;
+
+ sdev->srq_size = min(srpt_srq_size, sdev->dev_attr.max_srq_wr);
+
+ srq_attr.event_handler = srpt_srq_event;
+ srq_attr.srq_context = (void *)sdev;
+ srq_attr.attr.max_wr = sdev->srq_size;
+ srq_attr.attr.max_sge = 1;
+ srq_attr.attr.srq_limit = 0;
+ srq_attr.srq_type = IB_SRQT_BASIC;
+
+ sdev->srq = ib_create_srq(sdev->pd, &srq_attr);
+ if (IS_ERR(sdev->srq))
+ goto err_mr;
+
+ pr_debug("%s: create SRQ #wr= %d max_allow=%d dev= %s\n",
+ __func__, sdev->srq_size, sdev->dev_attr.max_srq_wr,
+ device->name);
+
+ if (!srpt_service_guid)
+ srpt_service_guid = be64_to_cpu(device->node_guid);
+
+ sdev->cm_id = ib_create_cm_id(device, srpt_cm_handler, sdev);
+ if (IS_ERR(sdev->cm_id))
+ goto err_srq;
+
+ /* print out target login information */
+ pr_debug("Target login info: id_ext=%016llx,ioc_guid=%016llx,"
+ "pkey=ffff,service_id=%016llx\n", srpt_service_guid,
+ srpt_service_guid, srpt_service_guid);
+
+ /*
+	 * We do not have a consistent service_id (i.e. also id_ext of
+	 * target_id) to identify this target. We currently use the GUID of
+	 * the first HCA in the system as service_id; therefore, the target_id
+	 * will change if this HCA goes bad and is replaced by a different HCA.
+ */
+ if (ib_cm_listen(sdev->cm_id, cpu_to_be64(srpt_service_guid), 0, NULL))
+ goto err_cm;
+
+ INIT_IB_EVENT_HANDLER(&sdev->event_handler, sdev->device,
+ srpt_event_handler);
+ if (ib_register_event_handler(&sdev->event_handler))
+ goto err_cm;
+
+ sdev->ioctx_ring = (struct srpt_recv_ioctx **)
+ srpt_alloc_ioctx_ring(sdev, sdev->srq_size,
+ sizeof(*sdev->ioctx_ring[0]),
+ srp_max_req_size, DMA_FROM_DEVICE);
+ if (!sdev->ioctx_ring)
+ goto err_event;
+
+ for (i = 0; i < sdev->srq_size; ++i)
+ srpt_post_recv(sdev, sdev->ioctx_ring[i]);
+
+ WARN_ON(sdev->device->phys_port_cnt > ARRAY_SIZE(sdev->port));
+
+ for (i = 1; i <= sdev->device->phys_port_cnt; i++) {
+ sport = &sdev->port[i - 1];
+ sport->sdev = sdev;
+ sport->port = i;
+ sport->port_attrib.srp_max_rdma_size = DEFAULT_MAX_RDMA_SIZE;
+ sport->port_attrib.srp_max_rsp_size = DEFAULT_MAX_RSP_SIZE;
+ sport->port_attrib.srp_sq_size = DEF_SRPT_SQ_SIZE;
+ INIT_WORK(&sport->work, srpt_refresh_port_work);
+ INIT_LIST_HEAD(&sport->port_acl_list);
+ spin_lock_init(&sport->port_acl_lock);
+
+ if (srpt_refresh_port(sport)) {
+ pr_err("MAD registration failed for %s-%d.\n",
+ srpt_sdev_name(sdev), i);
+ goto err_ring;
+ }
+ snprintf(sport->port_guid, sizeof(sport->port_guid),
+ "0x%016llx%016llx",
+ be64_to_cpu(sport->gid.global.subnet_prefix),
+ be64_to_cpu(sport->gid.global.interface_id));
+ }
+
+ spin_lock(&srpt_dev_lock);
+ list_add_tail(&sdev->list, &srpt_dev_list);
+ spin_unlock(&srpt_dev_lock);
+
+out:
+ ib_set_client_data(device, &srpt_client, sdev);
+ pr_debug("added %s.\n", device->name);
+ return;
+
+err_ring:
+ srpt_free_ioctx_ring((struct srpt_ioctx **)sdev->ioctx_ring, sdev,
+ sdev->srq_size, srp_max_req_size,
+ DMA_FROM_DEVICE);
+err_event:
+ ib_unregister_event_handler(&sdev->event_handler);
+err_cm:
+ ib_destroy_cm_id(sdev->cm_id);
+err_srq:
+ ib_destroy_srq(sdev->srq);
+err_mr:
+ ib_dereg_mr(sdev->mr);
+err_pd:
+ ib_dealloc_pd(sdev->pd);
+free_dev:
+ kfree(sdev);
+err:
+ sdev = NULL;
+ pr_info("%s(%s) failed.\n", __func__, device->name);
+ goto out;
+}
+
+/**
+ * srpt_remove_one() - InfiniBand device removal callback function.
+ */
+static void srpt_remove_one(struct ib_device *device)
+{
+ struct srpt_device *sdev;
+ int i;
+
+ sdev = ib_get_client_data(device, &srpt_client);
+ if (!sdev) {
+ pr_info("%s(%s): nothing to do.\n", __func__, device->name);
+ return;
+ }
+
+ srpt_unregister_mad_agent(sdev);
+
+ ib_unregister_event_handler(&sdev->event_handler);
+
+ /* Cancel any work queued by the just unregistered IB event handler. */
+ for (i = 0; i < sdev->device->phys_port_cnt; i++)
+ cancel_work_sync(&sdev->port[i].work);
+
+ ib_destroy_cm_id(sdev->cm_id);
+
+ /*
+ * Unregistering a target must happen after destroying sdev->cm_id
+ * such that no new SRP_LOGIN_REQ information units can arrive while
+ * destroying the target.
+ */
+ spin_lock(&srpt_dev_lock);
+ list_del(&sdev->list);
+ spin_unlock(&srpt_dev_lock);
+ srpt_release_sdev(sdev);
+
+ ib_destroy_srq(sdev->srq);
+ ib_dereg_mr(sdev->mr);
+ ib_dealloc_pd(sdev->pd);
+
+ srpt_free_ioctx_ring((struct srpt_ioctx **)sdev->ioctx_ring, sdev,
+ sdev->srq_size, srp_max_req_size, DMA_FROM_DEVICE);
+ sdev->ioctx_ring = NULL;
+ kfree(sdev);
+}
+
+static struct ib_client srpt_client = {
+ .name = DRV_NAME,
+ .add = srpt_add_one,
+ .remove = srpt_remove_one
+};
+
+static int srpt_check_true(struct se_portal_group *se_tpg)
+{
+ return 1;
+}
+
+static int srpt_check_false(struct se_portal_group *se_tpg)
+{
+ return 0;
+}
+
+static char *srpt_get_fabric_name(void)
+{
+ return "srpt";
+}
+
+static u8 srpt_get_fabric_proto_ident(struct se_portal_group *se_tpg)
+{
+ return SCSI_TRANSPORTID_PROTOCOLID_SRP;
+}
+
+static char *srpt_get_fabric_wwn(struct se_portal_group *tpg)
+{
+ struct srpt_port *sport = container_of(tpg, struct srpt_port, port_tpg_1);
+
+ return sport->port_guid;
+}
+
+static u16 srpt_get_tag(struct se_portal_group *tpg)
+{
+ return 1;
+}
+
+static u32 srpt_get_default_depth(struct se_portal_group *se_tpg)
+{
+ return 1;
+}
+
+static u32 srpt_get_pr_transport_id(struct se_portal_group *se_tpg,
+ struct se_node_acl *se_nacl,
+ struct t10_pr_registration *pr_reg,
+ int *format_code, unsigned char *buf)
+{
+ struct srpt_node_acl *nacl;
+ struct spc_rdma_transport_id *tr_id;
+
+ nacl = container_of(se_nacl, struct srpt_node_acl, nacl);
+ tr_id = (void *)buf;
+ tr_id->protocol_identifier = SCSI_TRANSPORTID_PROTOCOLID_SRP;
+ memcpy(tr_id->i_port_id, nacl->i_port_id, sizeof(tr_id->i_port_id));
+ return sizeof(*tr_id);
+}
+
+static u32 srpt_get_pr_transport_id_len(struct se_portal_group *se_tpg,
+ struct se_node_acl *se_nacl,
+ struct t10_pr_registration *pr_reg,
+ int *format_code)
+{
+ *format_code = 0;
+ return sizeof(struct spc_rdma_transport_id);
+}
+
+static char *srpt_parse_pr_out_transport_id(struct se_portal_group *se_tpg,
+ const char *buf, u32 *out_tid_len,
+ char **port_nexus_ptr)
+{
+ struct spc_rdma_transport_id *tr_id;
+
+ *port_nexus_ptr = NULL;
+ *out_tid_len = sizeof(struct spc_rdma_transport_id);
+ tr_id = (void *)buf;
+ return (char *)tr_id->i_port_id;
+}
+
+static struct se_node_acl *srpt_alloc_fabric_acl(struct se_portal_group *se_tpg)
+{
+ struct srpt_node_acl *nacl;
+
+ nacl = kzalloc(sizeof(struct srpt_node_acl), GFP_KERNEL);
+ if (!nacl) {
+ pr_err("Unable to allocate struct srpt_node_acl\n");
+ return NULL;
+ }
+
+ return &nacl->nacl;
+}
+
+static void srpt_release_fabric_acl(struct se_portal_group *se_tpg,
+ struct se_node_acl *se_nacl)
+{
+ struct srpt_node_acl *nacl;
+
+ nacl = container_of(se_nacl, struct srpt_node_acl, nacl);
+ kfree(nacl);
+}
+
+static u32 srpt_tpg_get_inst_index(struct se_portal_group *se_tpg)
+{
+ return 1;
+}
+
+static void srpt_release_cmd(struct se_cmd *se_cmd)
+{
+ struct srpt_send_ioctx *ioctx = container_of(se_cmd,
+ struct srpt_send_ioctx, cmd);
+ struct srpt_rdma_ch *ch = ioctx->ch;
+ unsigned long flags;
+
+ WARN_ON(ioctx->state != SRPT_STATE_DONE);
+ WARN_ON(ioctx->mapped_sg_count != 0);
+
+ if (ioctx->n_rbuf > 1) {
+ kfree(ioctx->rbufs);
+ ioctx->rbufs = NULL;
+ ioctx->n_rbuf = 0;
+ }
+
+ spin_lock_irqsave(&ch->spinlock, flags);
+ list_add(&ioctx->free_list, &ch->free_list);
+ spin_unlock_irqrestore(&ch->spinlock, flags);
+}
+
+/**
+ * srpt_close_session() - Forcibly close a session.
+ *
+ * Callback function invoked by the TCM core to clean up sessions associated
+ * with a node ACL when the user invokes
+ * rmdir /sys/kernel/config/target/$driver/$port/$tpg/acls/$i_port_id
+ */
+static void srpt_close_session(struct se_session *se_sess)
+{
+ DECLARE_COMPLETION_ONSTACK(release_done);
+ struct srpt_rdma_ch *ch;
+ struct srpt_device *sdev;
+ unsigned long res;
+
+ ch = se_sess->fabric_sess_ptr;
+ WARN_ON(ch->sess != se_sess);
+
+ pr_debug("ch %p state %d\n", ch, srpt_get_ch_state(ch));
+
+ sdev = ch->sport->sdev;
+ spin_lock_irq(&sdev->spinlock);
+ BUG_ON(ch->release_done);
+ ch->release_done = &release_done;
+ __srpt_close_ch(ch);
+ spin_unlock_irq(&sdev->spinlock);
+
+ res = wait_for_completion_timeout(&release_done, 60 * HZ);
+ WARN_ON(res == 0);
+}
+
+/**
+ * srpt_sess_get_index() - Return the value of scsiAttIntrPortIndex (SCSI-MIB).
+ *
+ * A quote from RFC 4455 (SCSI-MIB) about this MIB object:
+ * This object represents an arbitrary integer used to uniquely identify a
+ * particular attached remote initiator port to a particular SCSI target port
+ * within a particular SCSI target device within a particular SCSI instance.
+ */
+static u32 srpt_sess_get_index(struct se_session *se_sess)
+{
+ return 0;
+}
+
+static void srpt_set_default_node_attrs(struct se_node_acl *nacl)
+{
+}
+
+static u32 srpt_get_task_tag(struct se_cmd *se_cmd)
+{
+ struct srpt_send_ioctx *ioctx;
+
+ ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd);
+ return ioctx->tag;
+}
+
+/* Note: only used from inside debug printk's by the TCM core. */
+static int srpt_get_tcm_cmd_state(struct se_cmd *se_cmd)
+{
+ struct srpt_send_ioctx *ioctx;
+
+ ioctx = container_of(se_cmd, struct srpt_send_ioctx, cmd);
+ return srpt_get_cmd_state(ioctx);
+}
+
+/**
+ * srpt_parse_i_port_id() - Parse an initiator port ID.
+ * @name: ASCII representation of a 128-bit initiator port ID.
+ * @i_port_id: Binary 128-bit port ID.
+ */
+static int srpt_parse_i_port_id(u8 i_port_id[16], const char *name)
+{
+ const char *p;
+ unsigned len, count, leading_zero_bytes;
+ int ret, rc;
+
+ p = name;
+ if (strncasecmp(p, "0x", 2) == 0)
+ p += 2;
+ ret = -EINVAL;
+ len = strlen(p);
+ if (len % 2)
+ goto out;
+ count = min(len / 2, 16U);
+ leading_zero_bytes = 16 - count;
+ memset(i_port_id, 0, leading_zero_bytes);
+ rc = hex2bin(i_port_id + leading_zero_bytes, p, count);
+ if (rc < 0)
+ pr_debug("hex2bin failed for srpt_parse_i_port_id: %d\n", rc);
+ ret = 0;
+out:
+ return ret;
+}
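+
+/*
+ * Illustrative sketch only (editor's example, not part of the driver): how
+ * srpt_make_nodeacl() below uses srpt_parse_i_port_id() to turn the configfs
+ * ACL directory name, a hexadecimal string with an optional "0x" prefix,
+ * into a binary 128-bit initiator port ID; shorter strings are padded with
+ * leading zero bytes. The port ID used here is an arbitrary example value.
+ */
+static void __maybe_unused srpt_parse_i_port_id_example(void)
+{
+	u8 i_port_id[16];
+
+	if (srpt_parse_i_port_id(i_port_id,
+				 "0x00000000000000000002c903000e8acd") == 0)
+		pr_debug("parsed initiator port ID %16phN\n", i_port_id);
+}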
+
+/*
+ * configfs callback function invoked for
+ * mkdir /sys/kernel/config/target/$driver/$port/$tpg/acls/$i_port_id
+ */
+static struct se_node_acl *srpt_make_nodeacl(struct se_portal_group *tpg,
+ struct config_group *group,
+ const char *name)
+{
+ struct srpt_port *sport = container_of(tpg, struct srpt_port, port_tpg_1);
+ struct se_node_acl *se_nacl, *se_nacl_new;
+ struct srpt_node_acl *nacl;
+ int ret = 0;
+ u32 nexus_depth = 1;
+ u8 i_port_id[16];
+
+ if (srpt_parse_i_port_id(i_port_id, name) < 0) {
+ pr_err("invalid initiator port ID %s\n", name);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ se_nacl_new = srpt_alloc_fabric_acl(tpg);
+ if (!se_nacl_new) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ /*
+	 * se_nacl_new may be released by core_tpg_add_initiator_node_acl()
+	 * when converting a node ACL from demo mode to explicit mode.
+ */
+ se_nacl = core_tpg_add_initiator_node_acl(tpg, se_nacl_new, name,
+ nexus_depth);
+ if (IS_ERR(se_nacl)) {
+ ret = PTR_ERR(se_nacl);
+ goto err;
+ }
+ /* Locate our struct srpt_node_acl and set sdev and i_port_id. */
+ nacl = container_of(se_nacl, struct srpt_node_acl, nacl);
+ memcpy(&nacl->i_port_id[0], &i_port_id[0], 16);
+ nacl->sport = sport;
+
+ spin_lock_irq(&sport->port_acl_lock);
+ list_add_tail(&nacl->list, &sport->port_acl_list);
+ spin_unlock_irq(&sport->port_acl_lock);
+
+ return se_nacl;
+err:
+ return ERR_PTR(ret);
+}
+
+/*
+ * configfs callback function invoked for
+ * rmdir /sys/kernel/config/target/$driver/$port/$tpg/acls/$i_port_id
+ */
+static void srpt_drop_nodeacl(struct se_node_acl *se_nacl)
+{
+ struct srpt_node_acl *nacl;
+ struct srpt_device *sdev;
+ struct srpt_port *sport;
+
+ nacl = container_of(se_nacl, struct srpt_node_acl, nacl);
+ sport = nacl->sport;
+ sdev = sport->sdev;
+ spin_lock_irq(&sport->port_acl_lock);
+ list_del(&nacl->list);
+ spin_unlock_irq(&sport->port_acl_lock);
+ core_tpg_del_initiator_node_acl(&sport->port_tpg_1, se_nacl, 1);
+ srpt_release_fabric_acl(NULL, se_nacl);
+}
+
+static ssize_t srpt_tpg_attrib_show_srp_max_rdma_size(
+ struct se_portal_group *se_tpg,
+ char *page)
+{
+ struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1);
+
+ return sprintf(page, "%u\n", sport->port_attrib.srp_max_rdma_size);
+}
+
+static ssize_t srpt_tpg_attrib_store_srp_max_rdma_size(
+ struct se_portal_group *se_tpg,
+ const char *page,
+ size_t count)
+{
+ struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1);
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul(page, 0, &val);
+ if (ret < 0) {
+ pr_err("kstrtoul() failed with ret: %d\n", ret);
+ return -EINVAL;
+ }
+ if (val > MAX_SRPT_RDMA_SIZE) {
+ pr_err("val: %lu exceeds MAX_SRPT_RDMA_SIZE: %d\n", val,
+ MAX_SRPT_RDMA_SIZE);
+ return -EINVAL;
+ }
+ if (val < DEFAULT_MAX_RDMA_SIZE) {
+ pr_err("val: %lu smaller than DEFAULT_MAX_RDMA_SIZE: %d\n",
+ val, DEFAULT_MAX_RDMA_SIZE);
+ return -EINVAL;
+ }
+ sport->port_attrib.srp_max_rdma_size = val;
+
+ return count;
+}
+
+TF_TPG_ATTRIB_ATTR(srpt, srp_max_rdma_size, S_IRUGO | S_IWUSR);
+
+static ssize_t srpt_tpg_attrib_show_srp_max_rsp_size(
+ struct se_portal_group *se_tpg,
+ char *page)
+{
+ struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1);
+
+ return sprintf(page, "%u\n", sport->port_attrib.srp_max_rsp_size);
+}
+
+static ssize_t srpt_tpg_attrib_store_srp_max_rsp_size(
+ struct se_portal_group *se_tpg,
+ const char *page,
+ size_t count)
+{
+ struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1);
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul(page, 0, &val);
+ if (ret < 0) {
+ pr_err("kstrtoul() failed with ret: %d\n", ret);
+ return -EINVAL;
+ }
+ if (val > MAX_SRPT_RSP_SIZE) {
+ pr_err("val: %lu exceeds MAX_SRPT_RSP_SIZE: %d\n", val,
+ MAX_SRPT_RSP_SIZE);
+ return -EINVAL;
+ }
+ if (val < MIN_MAX_RSP_SIZE) {
+ pr_err("val: %lu smaller than MIN_MAX_RSP_SIZE: %d\n", val,
+ MIN_MAX_RSP_SIZE);
+ return -EINVAL;
+ }
+ sport->port_attrib.srp_max_rsp_size = val;
+
+ return count;
+}
+
+TF_TPG_ATTRIB_ATTR(srpt, srp_max_rsp_size, S_IRUGO | S_IWUSR);
+
+static ssize_t srpt_tpg_attrib_show_srp_sq_size(
+ struct se_portal_group *se_tpg,
+ char *page)
+{
+ struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1);
+
+ return sprintf(page, "%u\n", sport->port_attrib.srp_sq_size);
+}
+
+static ssize_t srpt_tpg_attrib_store_srp_sq_size(
+ struct se_portal_group *se_tpg,
+ const char *page,
+ size_t count)
+{
+ struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1);
+ unsigned long val;
+ int ret;
+
+ ret = kstrtoul(page, 0, &val);
+ if (ret < 0) {
+ pr_err("kstrtoul() failed with ret: %d\n", ret);
+ return -EINVAL;
+ }
+ if (val > MAX_SRPT_SRQ_SIZE) {
+ pr_err("val: %lu exceeds MAX_SRPT_SRQ_SIZE: %d\n", val,
+ MAX_SRPT_SRQ_SIZE);
+ return -EINVAL;
+ }
+ if (val < MIN_SRPT_SRQ_SIZE) {
+ pr_err("val: %lu smaller than MIN_SRPT_SRQ_SIZE: %d\n", val,
+ MIN_SRPT_SRQ_SIZE);
+ return -EINVAL;
+ }
+ sport->port_attrib.srp_sq_size = val;
+
+ return count;
+}
+
+TF_TPG_ATTRIB_ATTR(srpt, srp_sq_size, S_IRUGO | S_IWUSR);
+
+static struct configfs_attribute *srpt_tpg_attrib_attrs[] = {
+ &srpt_tpg_attrib_srp_max_rdma_size.attr,
+ &srpt_tpg_attrib_srp_max_rsp_size.attr,
+ &srpt_tpg_attrib_srp_sq_size.attr,
+ NULL,
+};
+
+static ssize_t srpt_tpg_show_enable(
+ struct se_portal_group *se_tpg,
+ char *page)
+{
+ struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1);
+
+	return snprintf(page, PAGE_SIZE, "%d\n", sport->enabled ? 1 : 0);
+}
+
+static ssize_t srpt_tpg_store_enable(
+ struct se_portal_group *se_tpg,
+ const char *page,
+ size_t count)
+{
+ struct srpt_port *sport = container_of(se_tpg, struct srpt_port, port_tpg_1);
+ unsigned long tmp;
+ int ret;
+
+ ret = kstrtoul(page, 0, &tmp);
+ if (ret < 0) {
+		pr_err("kstrtoul() failed with ret: %d\n", ret);
+ return -EINVAL;
+ }
+
+ if ((tmp != 0) && (tmp != 1)) {
+ pr_err("Illegal value for srpt_tpg_store_enable: %lu\n", tmp);
+ return -EINVAL;
+ }
+ if (tmp == 1)
+ sport->enabled = true;
+ else
+ sport->enabled = false;
+
+ return count;
+}
+
+TF_TPG_BASE_ATTR(srpt, enable, S_IRUGO | S_IWUSR);
+
+static struct configfs_attribute *srpt_tpg_attrs[] = {
+ &srpt_tpg_enable.attr,
+ NULL,
+};
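+
+/*
+ * Illustration only: with the directory layout described for srpt_make_tpg()
+ * below, a target port group is enabled or disabled via its "enable"
+ * attribute, e.g.:
+ *
+ *   echo 1 > /sys/kernel/config/target/$driver/$port/$tpg/enable
+ */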
+
+/**
+ * configfs callback invoked for
+ * mkdir /sys/kernel/config/target/$driver/$port/$tpg
+ */
+static struct se_portal_group *srpt_make_tpg(struct se_wwn *wwn,
+ struct config_group *group,
+ const char *name)
+{
+ struct srpt_port *sport = container_of(wwn, struct srpt_port, port_wwn);
+ int res;
+
+ /* Initialize sport->port_wwn and sport->port_tpg_1 */
+ res = core_tpg_register(&srpt_template, &sport->port_wwn,
+ &sport->port_tpg_1, sport, TRANSPORT_TPG_TYPE_NORMAL);
+ if (res)
+ return ERR_PTR(res);
+
+ return &sport->port_tpg_1;
+}
+
+/**
+ * configfs callback invoked for
+ * rmdir /sys/kernel/config/target/$driver/$port/$tpg
+ */
+static void srpt_drop_tpg(struct se_portal_group *tpg)
+{
+ struct srpt_port *sport = container_of(tpg,
+ struct srpt_port, port_tpg_1);
+
+ sport->enabled = false;
+ core_tpg_deregister(&sport->port_tpg_1);
+}
+
+/**
+ * configfs callback invoked for
+ * mkdir /sys/kernel/config/target/$driver/$port
+ */
+static struct se_wwn *srpt_make_tport(struct target_fabric_configfs *tf,
+ struct config_group *group,
+ const char *name)
+{
+ struct srpt_port *sport;
+ int ret;
+
+ sport = srpt_lookup_port(name);
+ pr_debug("make_tport(%s)\n", name);
+ ret = -EINVAL;
+ if (!sport)
+ goto err;
+
+ return &sport->port_wwn;
+
+err:
+ return ERR_PTR(ret);
+}
+
+/**
+ * configfs callback invoked for
+ * rmdir /sys/kernel/config/target/$driver/$port
+ */
+static void srpt_drop_tport(struct se_wwn *wwn)
+{
+ struct srpt_port *sport = container_of(wwn, struct srpt_port, port_wwn);
+
+	pr_debug("drop_tport(%s)\n", config_item_name(&sport->port_wwn.wwn_group.cg_item));
+}
+
+static ssize_t srpt_wwn_show_attr_version(struct target_fabric_configfs *tf,
+ char *buf)
+{
+ return scnprintf(buf, PAGE_SIZE, "%s\n", DRV_VERSION);
+}
+
+TF_WWN_ATTR_RO(srpt, version);
+
+static struct configfs_attribute *srpt_wwn_attrs[] = {
+ &srpt_wwn_version.attr,
+ NULL,
+};
+
+static const struct target_core_fabric_ops srpt_template = {
+ .module = THIS_MODULE,
+ .name = "srpt",
+ .get_fabric_name = srpt_get_fabric_name,
+ .get_fabric_proto_ident = srpt_get_fabric_proto_ident,
+ .tpg_get_wwn = srpt_get_fabric_wwn,
+ .tpg_get_tag = srpt_get_tag,
+ .tpg_get_default_depth = srpt_get_default_depth,
+ .tpg_get_pr_transport_id = srpt_get_pr_transport_id,
+ .tpg_get_pr_transport_id_len = srpt_get_pr_transport_id_len,
+ .tpg_parse_pr_out_transport_id = srpt_parse_pr_out_transport_id,
+ .tpg_check_demo_mode = srpt_check_false,
+ .tpg_check_demo_mode_cache = srpt_check_true,
+ .tpg_check_demo_mode_write_protect = srpt_check_true,
+ .tpg_check_prod_mode_write_protect = srpt_check_false,
+ .tpg_alloc_fabric_acl = srpt_alloc_fabric_acl,
+ .tpg_release_fabric_acl = srpt_release_fabric_acl,
+ .tpg_get_inst_index = srpt_tpg_get_inst_index,
+ .release_cmd = srpt_release_cmd,
+ .check_stop_free = srpt_check_stop_free,
+ .shutdown_session = srpt_shutdown_session,
+ .close_session = srpt_close_session,
+ .sess_get_index = srpt_sess_get_index,
+ .sess_get_initiator_sid = NULL,
+ .write_pending = srpt_write_pending,
+ .write_pending_status = srpt_write_pending_status,
+ .set_default_node_attributes = srpt_set_default_node_attrs,
+ .get_task_tag = srpt_get_task_tag,
+ .get_cmd_state = srpt_get_tcm_cmd_state,
+ .queue_data_in = srpt_queue_data_in,
+ .queue_status = srpt_queue_status,
+ .queue_tm_rsp = srpt_queue_tm_rsp,
+ .aborted_task = srpt_aborted_task,
+ /*
+ * Setup function pointers for generic logic in
+ * target_core_fabric_configfs.c
+ */
+ .fabric_make_wwn = srpt_make_tport,
+ .fabric_drop_wwn = srpt_drop_tport,
+ .fabric_make_tpg = srpt_make_tpg,
+ .fabric_drop_tpg = srpt_drop_tpg,
+ .fabric_post_link = NULL,
+ .fabric_pre_unlink = NULL,
+ .fabric_make_np = NULL,
+ .fabric_drop_np = NULL,
+ .fabric_make_nodeacl = srpt_make_nodeacl,
+ .fabric_drop_nodeacl = srpt_drop_nodeacl,
+
+ .tfc_wwn_attrs = srpt_wwn_attrs,
+ .tfc_tpg_base_attrs = srpt_tpg_attrs,
+ .tfc_tpg_attrib_attrs = srpt_tpg_attrib_attrs,
+};
+
+/**
+ * srpt_init_module() - Kernel module initialization.
+ *
+ * Note: Since ib_register_client() registers callback functions, and since at
+ * least one of these callback functions (srpt_add_one()) calls target core
+ * functions, this driver must be registered with the target core before
+ * ib_register_client() is called.
+ */
+static int __init srpt_init_module(void)
+{
+ int ret;
+
+ ret = -EINVAL;
+ if (srp_max_req_size < MIN_MAX_REQ_SIZE) {
+ pr_err("invalid value %d for kernel module parameter"
+ " srp_max_req_size -- must be at least %d.\n",
+ srp_max_req_size, MIN_MAX_REQ_SIZE);
+ goto out;
+ }
+
+ if (srpt_srq_size < MIN_SRPT_SRQ_SIZE
+ || srpt_srq_size > MAX_SRPT_SRQ_SIZE) {
+ pr_err("invalid value %d for kernel module parameter"
+ " srpt_srq_size -- must be in the range [%d..%d].\n",
+ srpt_srq_size, MIN_SRPT_SRQ_SIZE, MAX_SRPT_SRQ_SIZE);
+ goto out;
+ }
+
+ ret = target_register_template(&srpt_template);
+ if (ret)
+ goto out;
+
+ ret = ib_register_client(&srpt_client);
+ if (ret) {
+ pr_err("couldn't register IB client\n");
+ goto out_unregister_target;
+ }
+
+ return 0;
+
+out_unregister_target:
+ target_unregister_template(&srpt_template);
+out:
+ return ret;
+}
+
+static void __exit srpt_cleanup_module(void)
+{
+ ib_unregister_client(&srpt_client);
+ target_unregister_template(&srpt_template);
+}
+
+module_init(srpt_init_module);
+module_exit(srpt_cleanup_module);
diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h
new file mode 100644
index 000000000..3dae15690
--- /dev/null
+++ b/drivers/infiniband/ulp/srpt/ib_srpt.h
@@ -0,0 +1,443 @@
+/*
+ * Copyright (c) 2006 - 2009 Mellanox Technology Inc. All rights reserved.
+ * Copyright (C) 2009 - 2010 Bart Van Assche <bvanassche@acm.org>.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef IB_SRPT_H
+#define IB_SRPT_H
+
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/wait.h>
+
+#include <rdma/ib_verbs.h>
+#include <rdma/ib_sa.h>
+#include <rdma/ib_cm.h>
+
+#include <scsi/srp.h>
+
+#include "ib_dm_mad.h"
+
+/*
+ * The prefix the ServiceName field must start with in the device management
+ * ServiceEntries attribute pair. See also the SRP specification.
+ */
+#define SRP_SERVICE_NAME_PREFIX "SRP.T10:"
+
+enum {
+ /*
+ * SRP IOControllerProfile attributes for SRP target ports that have
+ * not been defined in <scsi/srp.h>. Source: section B.7, table B.7
+ * in the SRP specification.
+ */
+ SRP_PROTOCOL = 0x0108,
+ SRP_PROTOCOL_VERSION = 0x0001,
+ SRP_IO_SUBCLASS = 0x609e,
+ SRP_SEND_TO_IOC = 0x01,
+ SRP_SEND_FROM_IOC = 0x02,
+ SRP_RDMA_READ_FROM_IOC = 0x08,
+ SRP_RDMA_WRITE_FROM_IOC = 0x20,
+
+ /*
+ * srp_login_cmd.req_flags bitmasks. See also table 9 in the SRP
+ * specification.
+ */
+ SRP_MTCH_ACTION = 0x03, /* MULTI-CHANNEL ACTION */
+ SRP_LOSOLNT = 0x10, /* logout solicited notification */
+ SRP_CRSOLNT = 0x20, /* credit request solicited notification */
+ SRP_AESOLNT = 0x40, /* asynchronous event solicited notification */
+
+ /*
+ * srp_cmd.sol_nt / srp_tsk_mgmt.sol_not bitmasks. See also tables
+ * 18 and 20 in the SRP specification.
+ */
+ SRP_SCSOLNT = 0x02, /* SCSOLNT = successful solicited notification */
+ SRP_UCSOLNT = 0x04, /* UCSOLNT = unsuccessful solicited notification */
+
+ /*
+ * srp_rsp.sol_not / srp_t_logout.sol_not bitmasks. See also tables
+ * 16 and 22 in the SRP specification.
+ */
+ SRP_SOLNT = 0x01, /* SOLNT = solicited notification */
+
+ /* See also table 24 in the SRP specification. */
+ SRP_TSK_MGMT_SUCCESS = 0x00,
+ SRP_TSK_MGMT_FUNC_NOT_SUPP = 0x04,
+ SRP_TSK_MGMT_FAILED = 0x05,
+
+ /* See also table 21 in the SRP specification. */
+ SRP_CMD_SIMPLE_Q = 0x0,
+ SRP_CMD_HEAD_OF_Q = 0x1,
+ SRP_CMD_ORDERED_Q = 0x2,
+ SRP_CMD_ACA = 0x4,
+
+ SRP_LOGIN_RSP_MULTICHAN_NO_CHAN = 0x0,
+ SRP_LOGIN_RSP_MULTICHAN_TERMINATED = 0x1,
+ SRP_LOGIN_RSP_MULTICHAN_MAINTAINED = 0x2,
+
+ SRPT_DEF_SG_TABLESIZE = 128,
+ SRPT_DEF_SG_PER_WQE = 16,
+
+ MIN_SRPT_SQ_SIZE = 16,
+ DEF_SRPT_SQ_SIZE = 4096,
+ SRPT_RQ_SIZE = 128,
+ MIN_SRPT_SRQ_SIZE = 4,
+ DEFAULT_SRPT_SRQ_SIZE = 4095,
+ MAX_SRPT_SRQ_SIZE = 65535,
+ MAX_SRPT_RDMA_SIZE = 1U << 24,
+ MAX_SRPT_RSP_SIZE = 1024,
+
+ MIN_MAX_REQ_SIZE = 996,
+ DEFAULT_MAX_REQ_SIZE
+ = sizeof(struct srp_cmd)/*48*/
+ + sizeof(struct srp_indirect_buf)/*20*/
+ + 128 * sizeof(struct srp_direct_buf)/*16*/,
+
+ MIN_MAX_RSP_SIZE = sizeof(struct srp_rsp)/*36*/ + 4,
+ DEFAULT_MAX_RSP_SIZE = 256, /* leaves 220 bytes for sense data */
+
+ DEFAULT_MAX_RDMA_SIZE = 65536,
+};
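+
+/*
+ * Note: with the structure sizes recorded in the comments above (48, 20 and
+ * 16 bytes respectively), DEFAULT_MAX_REQ_SIZE evaluates to
+ * 48 + 20 + 128 * 16 = 2116 bytes.
+ */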
+
+enum srpt_opcode {
+ SRPT_RECV,
+ SRPT_SEND,
+ SRPT_RDMA_MID,
+ SRPT_RDMA_ABORT,
+ SRPT_RDMA_READ_LAST,
+ SRPT_RDMA_WRITE_LAST,
+};
+
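+/*
+ * The helpers below pack an SRPT opcode and an I/O context index into the
+ * 64-bit work request ID passed to the IB verbs layer: the opcode occupies
+ * the upper 32 bits and the ring index the lower 32 bits, so a work
+ * completion can be mapped back to both its type and its I/O context.
+ */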
+static inline u64 encode_wr_id(u8 opcode, u32 idx)
+{
+ return ((u64)opcode << 32) | idx;
+}
+static inline enum srpt_opcode opcode_from_wr_id(u64 wr_id)
+{
+ return wr_id >> 32;
+}
+static inline u32 idx_from_wr_id(u64 wr_id)
+{
+ return (u32)wr_id;
+}
+
+struct rdma_iu {
+ u64 raddr;
+ u32 rkey;
+ struct ib_sge *sge;
+ u32 sge_cnt;
+ int mem_id;
+};
+
+/**
+ * enum srpt_command_state - SCSI command state managed by SRPT.
+ * @SRPT_STATE_NEW: New command arrived and is being processed.
+ * @SRPT_STATE_NEED_DATA: Processing a write or bidir command and waiting
+ * for data arrival.
+ * @SRPT_STATE_DATA_IN: Data for the write or bidir command arrived and is
+ * being processed.
+ * @SRPT_STATE_CMD_RSP_SENT: SRP_RSP for SRP_CMD has been sent.
+ * @SRPT_STATE_MGMT: Processing a SCSI task management command.
+ * @SRPT_STATE_MGMT_RSP_SENT: SRP_RSP for SRP_TSK_MGMT has been sent.
+ * @SRPT_STATE_DONE: Command processing finished successfully, command
+ * processing has been aborted or command processing
+ * failed.
+ */
+enum srpt_command_state {
+ SRPT_STATE_NEW = 0,
+ SRPT_STATE_NEED_DATA = 1,
+ SRPT_STATE_DATA_IN = 2,
+ SRPT_STATE_CMD_RSP_SENT = 3,
+ SRPT_STATE_MGMT = 4,
+ SRPT_STATE_MGMT_RSP_SENT = 5,
+ SRPT_STATE_DONE = 6,
+};
+
+/**
+ * struct srpt_ioctx - Shared SRPT I/O context information.
+ * @buf: Pointer to the buffer.
+ * @dma: DMA address of the buffer.
+ * @index: Index of the I/O context in its ioctx_ring array.
+ */
+struct srpt_ioctx {
+ void *buf;
+ dma_addr_t dma;
+ uint32_t index;
+};
+
+/**
+ * struct srpt_recv_ioctx - SRPT receive I/O context.
+ * @ioctx: See above.
+ * @wait_list: Node for insertion in srpt_rdma_ch.cmd_wait_list.
+ */
+struct srpt_recv_ioctx {
+ struct srpt_ioctx ioctx;
+ struct list_head wait_list;
+};
+
+/**
+ * struct srpt_send_ioctx - SRPT send I/O context.
+ * @ioctx: See above.
+ * @ch: Channel pointer.
+ * @free_list: Node in srpt_rdma_ch.free_list.
+ * @n_rbuf: Number of data buffers in the received SRP command.
+ * @rbufs: Pointer to SRP data buffer array.
+ * @single_rbuf: SRP data buffer if the command has only a single buffer.
+ * @sg: Pointer to sg-list associated with this I/O context.
+ * @sg_cnt: SG-list size.
+ * @mapped_sg_count: ib_dma_map_sg() return value.
+ * @n_rdma_ius: Number of elements in the rdma_ius array.
+ * @rdma_ius: Array with information about the RDMA mapping.
+ * @tag: Tag of the received SRP information unit.
+ * @spinlock: Protects 'state'.
+ * @state: I/O context state.
+ * @rdma_aborted: If initiating a multipart RDMA transfer failed, whether
+ * the already initiated transfers have finished.
+ * @cmd: Target core command data structure.
+ * @sense_data: SCSI sense data.
+ */
+struct srpt_send_ioctx {
+ struct srpt_ioctx ioctx;
+ struct srpt_rdma_ch *ch;
+ struct rdma_iu *rdma_ius;
+ struct srp_direct_buf *rbufs;
+ struct srp_direct_buf single_rbuf;
+ struct scatterlist *sg;
+ struct list_head free_list;
+ spinlock_t spinlock;
+ enum srpt_command_state state;
+ bool rdma_aborted;
+ struct se_cmd cmd;
+ struct completion tx_done;
+ u64 tag;
+ int sg_cnt;
+ int mapped_sg_count;
+ u16 n_rdma_ius;
+ u8 n_rdma;
+ u8 n_rbuf;
+ bool queue_status_only;
+ u8 sense_data[SCSI_SENSE_BUFFERSIZE];
+};
+
+/**
+ * enum rdma_ch_state - SRP channel state.
+ * @CH_CONNECTING: QP is in RTR state; waiting for RTU.
+ * @CH_LIVE: QP is in RTS state.
+ * @CH_DISCONNECTING: DREQ has been received or sent; waiting for DREP.
+ * @CH_DRAINING: QP is in ERR state; waiting for last WQE event.
+ * @CH_RELEASING: Last WQE event has been received; releasing resources.
+ */
+enum rdma_ch_state {
+ CH_CONNECTING,
+ CH_LIVE,
+ CH_DISCONNECTING,
+ CH_DRAINING,
+ CH_RELEASING
+};
+
+/**
+ * struct srpt_rdma_ch - RDMA channel.
+ * @wait_queue: Allows the kernel thread to wait for more work.
+ * @thread: Kernel thread that processes the IB queues associated with
+ * the channel.
+ * @cm_id: IB CM ID associated with the channel.
+ * @qp: IB queue pair used for communicating over this channel.
+ * @cq: IB completion queue for this channel.
+ * @rq_size: IB receive queue size.
+ * @rsp_size: IB response message size in bytes.
+ * @sq_wr_avail: number of work requests available in the send queue.
+ * @sport: pointer to the information of the HCA port used by this
+ * channel.
+ * @i_port_id: 128-bit initiator port identifier copied from SRP_LOGIN_REQ.
+ * @t_port_id: 128-bit target port identifier copied from SRP_LOGIN_REQ.
+ * @max_ti_iu_len: maximum target-to-initiator information unit length.
+ * @req_lim: request limit: maximum number of requests that may be sent
+ * by the initiator without having received a response.
+ * @req_lim_delta: Number of credits not yet sent back to the initiator.
+ * @spinlock: Protects free_list and state.
+ * @free_list: Head of list with free send I/O contexts.
+ * @state: channel state. See also enum rdma_ch_state.
+ * @ioctx_ring: Send ring.
+ * @wc: IB work completion array for srpt_process_completion().
+ * @list: Node for insertion in the srpt_device.rch_list list.
+ * @cmd_wait_list: List of SCSI commands that arrived before the RTU event. This
+ * list contains struct srpt_ioctx elements and is protected
+ * against concurrent modification by the cm_id spinlock.
+ * @sess: Session information associated with this SRP channel.
+ * @sess_name: Session name.
+ * @release_work: Allows scheduling of srpt_release_channel().
+ * @release_done: Enables waiting for srpt_release_channel() completion.
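+ * @in_shutdown: Whether or not channel shutdown has started.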
+ */
+struct srpt_rdma_ch {
+ wait_queue_head_t wait_queue;
+ struct task_struct *thread;
+ struct ib_cm_id *cm_id;
+ struct ib_qp *qp;
+ struct ib_cq *cq;
+ int rq_size;
+ u32 rsp_size;
+ atomic_t sq_wr_avail;
+ struct srpt_port *sport;
+ u8 i_port_id[16];
+ u8 t_port_id[16];
+ int max_ti_iu_len;
+ atomic_t req_lim;
+ atomic_t req_lim_delta;
+ spinlock_t spinlock;
+ struct list_head free_list;
+ enum rdma_ch_state state;
+ struct srpt_send_ioctx **ioctx_ring;
+ struct ib_wc wc[16];
+ struct list_head list;
+ struct list_head cmd_wait_list;
+ struct se_session *sess;
+ u8 sess_name[36];
+ struct work_struct release_work;
+ struct completion *release_done;
+ bool in_shutdown;
+};
+
+/**
+ * struct srpt_port_attrib - Attributes for SRPT port.
+ * @srp_max_rdma_size: Maximum size of SRP RDMA transfers for new connections.
+ * @srp_max_rsp_size: Maximum size of SRP response messages in bytes.
+ * @srp_sq_size: Shared receive queue (SRQ) size.
+ */
+struct srpt_port_attrib {
+ u32 srp_max_rdma_size;
+ u32 srp_max_rsp_size;
+ u32 srp_sq_size;
+};
+
+/**
+ * struct srpt_port - Information associated by SRPT with a single IB port.
+ * @sdev: backpointer to the HCA information.
+ * @mad_agent: per-port management datagram processing information.
+ * @enabled: Whether or not this target port is enabled.
+ * @port_guid: ASCII representation of Port GUID
+ * @port: one-based port number.
+ * @sm_lid: cached value of the port's sm_lid.
+ * @lid: cached value of the port's lid.
+ * @gid: cached value of the port's gid.
+ * @port_acl_lock: Spinlock protecting port_acl_list.
+ * @work: work structure for refreshing the aforementioned cached values.
+ * @port_tpg_1: Target portal group = 1 data.
+ * @port_wwn: Target core WWN data.
+ * @port_acl_list: Head of the list with all node ACLs for this port.
+ * @port_attrib: Port attributes that can be set through configfs.
+ */
+struct srpt_port {
+ struct srpt_device *sdev;
+ struct ib_mad_agent *mad_agent;
+ bool enabled;
+ u8 port_guid[64];
+ u8 port;
+ u16 sm_lid;
+ u16 lid;
+ union ib_gid gid;
+ spinlock_t port_acl_lock;
+ struct work_struct work;
+ struct se_portal_group port_tpg_1;
+ struct se_wwn port_wwn;
+ struct list_head port_acl_list;
+ struct srpt_port_attrib port_attrib;
+};
+
+/**
+ * struct srpt_device - Information associated by SRPT with a single HCA.
+ * @device: Backpointer to the struct ib_device managed by the IB core.
+ * @pd: IB protection domain.
+ * @mr: L_Key (local key) with write access to all local memory.
+ * @srq: Per-HCA SRQ (shared receive queue).
+ * @cm_id: Connection identifier.
+ * @dev_attr: Attributes of the InfiniBand device as obtained during the
+ * ib_client.add() callback.
+ * @srq_size: SRQ size.
+ * @ioctx_ring: Ring of receive I/O contexts posted on the per-HCA SRQ.
+ * @rch_list: Per-device channel list -- see also srpt_rdma_ch.list.
+ * @ch_releaseQ: Enables waiting for removal from rch_list.
+ * @spinlock: Protects rch_list.
+ * @port: Information about the ports owned by this HCA.
+ * @event_handler: Per-HCA asynchronous IB event handler.
+ * @list: Node in srpt_dev_list.
+ */
+struct srpt_device {
+ struct ib_device *device;
+ struct ib_pd *pd;
+ struct ib_mr *mr;
+ struct ib_srq *srq;
+ struct ib_cm_id *cm_id;
+ struct ib_device_attr dev_attr;
+ int srq_size;
+ struct srpt_recv_ioctx **ioctx_ring;
+ struct list_head rch_list;
+ wait_queue_head_t ch_releaseQ;
+ spinlock_t spinlock;
+ struct srpt_port port[2];
+ struct ib_event_handler event_handler;
+ struct list_head list;
+};
+
+/**
+ * struct srpt_node_acl - Per-initiator ACL data (managed via configfs).
+ * @i_port_id: 128-bit SRP initiator port ID.
+ * @sport: port information.
+ * @nacl: Target core node ACL information.
+ * @list: Element of the per-HCA ACL list.
+ */
+struct srpt_node_acl {
+ u8 i_port_id[16];
+ struct srpt_port *sport;
+ struct se_node_acl nacl;
+ struct list_head list;
+};
+
+/*
+ * SRP-related SCSI persistent reservation definitions.
+ *
+ * See also SPC4r28, section 7.6.1 (Protocol specific parameters introduction).
+ * See also SPC4r28, section 7.6.4.5 (TransportID for initiator ports using
+ * SCSI over an RDMA interface).
+ */
+
+enum {
+ SCSI_TRANSPORTID_PROTOCOLID_SRP = 4,
+};
+
+struct spc_rdma_transport_id {
+ uint8_t protocol_identifier;
+ uint8_t reserved[7];
+ uint8_t i_port_id[16];
+};
+
+#endif /* IB_SRPT_H */